diff --git a/sys/compat/linux/linux_netlink.c b/sys/compat/linux/linux_netlink.c index a6846035c22e..807cdc7a14bc 100644 --- a/sys/compat/linux/linux_netlink.c +++ b/sys/compat/linux/linux_netlink.c @@ -1,620 +1,619 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_inet.h" #include "opt_inet6.h" -#include "opt_netlink.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define DEBUG_MOD_NAME nl_linux #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include _DECLARE_DEBUG(LOG_INFO); static bool valid_rta_size(const struct rtattr *rta, int sz) { return (NL_RTA_DATA_LEN(rta) == sz); } static bool valid_rta_u32(const struct rtattr *rta) { return (valid_rta_size(rta, sizeof(uint32_t))); } static uint32_t _rta_get_uint32(const struct rtattr *rta) { return (*((const uint32_t *)NL_RTA_DATA_CONST(rta))); } static struct nlmsghdr * rtnl_neigh_from_linux(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct ndmsg *ndm = (struct ndmsg *)(hdr + 1); if (hdr->nlmsg_len >= sizeof(struct nlmsghdr) + sizeof(struct ndmsg)) ndm->ndm_family = linux_to_bsd_domain(ndm->ndm_family); return (hdr); } static struct nlmsghdr * rtnl_ifaddr_from_linux(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct ifaddrmsg *ifam = (struct ifaddrmsg *)(hdr + 1); if (hdr->nlmsg_len >= sizeof(struct nlmsghdr) + sizeof(struct ifaddrmsg)) ifam->ifa_family = linux_to_bsd_domain(ifam->ifa_family); return (hdr); } static struct nlmsghdr * rtnl_route_from_linux(struct nlmsghdr *hdr, struct nl_pstate *npt) { /* Tweak address families and default fib only */ struct rtmsg *rtm = (struct rtmsg *)(hdr + 1); struct nlattr *nla, *nla_head; int attrs_len; rtm->rtm_family = linux_to_bsd_domain(rtm->rtm_family); if (rtm->rtm_table == 254) rtm->rtm_table = 0; attrs_len = hdr->nlmsg_len - sizeof(struct nlmsghdr); attrs_len -= NETLINK_ALIGN(sizeof(struct rtmsg)); nla_head = (struct nlattr *)((char *)rtm + NETLINK_ALIGN(sizeof(struct rtmsg))); NLA_FOREACH(nla, nla_head, attrs_len) { RT_LOG(LOG_DEBUG3, "GOT type %d len %d total %d", nla->nla_type, nla->nla_len, attrs_len); struct rtattr *rta = (struct rtattr *)nla; if (rta->rta_len < sizeof(struct rtattr)) { break; } switch (rta->rta_type) { case NL_RTA_TABLE: if (!valid_rta_u32(rta)) goto done; rtm->rtm_table = 0; uint32_t fibnum = _rta_get_uint32(rta); RT_LOG(LOG_DEBUG3, "GET RTABLE: %u", fibnum); if (fibnum == 254) { *((uint32_t *)NL_RTA_DATA(rta)) = 0; } break; } } done: return (hdr); } static struct nlmsghdr * rtnl_from_linux(struct nlmsghdr *hdr, struct nl_pstate *npt) { switch (hdr->nlmsg_type) { case NL_RTM_GETROUTE: case NL_RTM_NEWROUTE: case NL_RTM_DELROUTE: return (rtnl_route_from_linux(hdr, npt)); case NL_RTM_GETNEIGH: return (rtnl_neigh_from_linux(hdr, npt)); case NL_RTM_GETADDR: return (rtnl_ifaddr_from_linux(hdr, npt)); /* Silence warning for the messages where no translation is required */ case NL_RTM_NEWLINK: case NL_RTM_DELLINK: case NL_RTM_GETLINK: break; default: RT_LOG(LOG_DEBUG, "Passing message type %d untranslated", hdr->nlmsg_type); } return (hdr); } static struct nlmsghdr * nlmsg_from_linux(int netlink_family, struct nlmsghdr *hdr, struct nl_pstate *npt) { switch (netlink_family) { case NETLINK_ROUTE: return (rtnl_from_linux(hdr, npt)); } return (hdr); } /************************************************************ * Kernel -> Linux ************************************************************/ static bool handle_default_out(struct nlmsghdr *hdr, struct nl_writer *nw) { char *out_hdr; out_hdr = nlmsg_reserve_data(nw, NLMSG_ALIGN(hdr->nlmsg_len), char); if (out_hdr != NULL) { memcpy(out_hdr, hdr, hdr->nlmsg_len); return (true); } return (false); } static bool nlmsg_copy_header(struct nlmsghdr *hdr, struct nl_writer *nw) { return (nlmsg_add(nw, hdr->nlmsg_pid, hdr->nlmsg_seq, hdr->nlmsg_type, hdr->nlmsg_flags, 0)); } static void * _nlmsg_copy_next_header(struct nlmsghdr *hdr, struct nl_writer *nw, int sz) { void *next_hdr = nlmsg_reserve_data(nw, sz, void); memcpy(next_hdr, hdr + 1, NLMSG_ALIGN(sz)); return (next_hdr); } #define nlmsg_copy_next_header(_hdr, _ns, _t) \ ((_t *)(_nlmsg_copy_next_header(_hdr, _ns, sizeof(_t)))) static bool nlmsg_copy_nla(const struct nlattr *nla_orig, struct nl_writer *nw) { struct nlattr *nla = nlmsg_reserve_data(nw, nla_orig->nla_len, struct nlattr); if (nla != NULL) { memcpy(nla, nla_orig, nla_orig->nla_len); return (true); } return (false); } /* * Translate a FreeBSD interface name to a Linux interface name. */ static bool nlmsg_translate_ifname_nla(struct nlattr *nla, struct nl_writer *nw) { char ifname[LINUX_IFNAMSIZ]; if (ifname_bsd_to_linux_name((char *)(nla + 1), ifname, sizeof(ifname)) <= 0) return (false); return (nlattr_add_string(nw, IFLA_IFNAME, ifname)); } #define LINUX_NLA_UNHANDLED -1 /* * Translate a FreeBSD attribute to a Linux attribute. * Returns LINUX_NLA_UNHANDLED when the attribute is not processed * and the caller must take care of it, otherwise the result is returned. */ static int nlmsg_translate_all_nla(struct nlmsghdr *hdr, struct nlattr *nla, struct nl_writer *nw) { switch (hdr->nlmsg_type) { case NL_RTM_NEWLINK: case NL_RTM_DELLINK: case NL_RTM_GETLINK: switch (nla->nla_type) { case IFLA_IFNAME: return (nlmsg_translate_ifname_nla(nla, nw)); default: break; } default: break; } return (LINUX_NLA_UNHANDLED); } static bool nlmsg_copy_all_nla(struct nlmsghdr *hdr, int raw_hdrlen, struct nl_writer *nw) { struct nlattr *nla; int ret; int hdrlen = NETLINK_ALIGN(raw_hdrlen); int attrs_len = hdr->nlmsg_len - sizeof(struct nlmsghdr) - hdrlen; struct nlattr *nla_head = (struct nlattr *)((char *)(hdr + 1) + hdrlen); NLA_FOREACH(nla, nla_head, attrs_len) { RT_LOG(LOG_DEBUG3, "reading attr %d len %d", nla->nla_type, nla->nla_len); if (nla->nla_len < sizeof(struct nlattr)) { return (false); } ret = nlmsg_translate_all_nla(hdr, nla, nw); if (ret == LINUX_NLA_UNHANDLED) ret = nlmsg_copy_nla(nla, nw); if (!ret) return (false); } return (true); } #undef LINUX_NLA_UNHANDLED static unsigned int rtnl_if_flags_to_linux(unsigned int if_flags) { unsigned int result = 0; for (int i = 0; i < 31; i++) { unsigned int flag = 1 << i; if (!(flag & if_flags)) continue; switch (flag) { case IFF_UP: case IFF_BROADCAST: case IFF_DEBUG: case IFF_LOOPBACK: case IFF_POINTOPOINT: case IFF_DRV_RUNNING: case IFF_NOARP: case IFF_PROMISC: case IFF_ALLMULTI: result |= flag; break; case IFF_NEEDSEPOCH: case IFF_DRV_OACTIVE: case IFF_SIMPLEX: case IFF_LINK0: case IFF_LINK1: case IFF_LINK2: case IFF_CANTCONFIG: case IFF_PPROMISC: case IFF_MONITOR: case IFF_STATICARP: case IFF_STICKYARP: case IFF_DYING: case IFF_RENAMING: /* No Linux analogue */ break; case IFF_MULTICAST: result |= 1 << 12; } } return (result); } static bool rtnl_newlink_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_writer *nw) { if (!nlmsg_copy_header(hdr, nw)) return (false); struct ifinfomsg *ifinfo; ifinfo = nlmsg_copy_next_header(hdr, nw, struct ifinfomsg); ifinfo->ifi_family = bsd_to_linux_domain(ifinfo->ifi_family); /* Convert interface type */ switch (ifinfo->ifi_type) { case IFT_ETHER: ifinfo->ifi_type = LINUX_ARPHRD_ETHER; break; } ifinfo->ifi_flags = rtnl_if_flags_to_linux(ifinfo->ifi_flags); /* Copy attributes unchanged */ if (!nlmsg_copy_all_nla(hdr, sizeof(struct ifinfomsg), nw)) return (false); /* make ip(8) happy */ if (!nlattr_add_string(nw, IFLA_QDISC, "noqueue")) return (false); if (!nlattr_add_u32(nw, IFLA_TXQLEN, 1000)) return (false); nlmsg_end(nw); RT_LOG(LOG_DEBUG2, "done processing nw %p", nw); return (true); } static bool rtnl_newaddr_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_writer *nw) { if (!nlmsg_copy_header(hdr, nw)) return (false); struct ifaddrmsg *ifamsg; ifamsg = nlmsg_copy_next_header(hdr, nw, struct ifaddrmsg); ifamsg->ifa_family = bsd_to_linux_domain(ifamsg->ifa_family); /* XXX: fake ifa_flags? */ /* Copy attributes unchanged */ if (!nlmsg_copy_all_nla(hdr, sizeof(struct ifaddrmsg), nw)) return (false); nlmsg_end(nw); RT_LOG(LOG_DEBUG2, "done processing nw %p", nw); return (true); } static bool rtnl_newneigh_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_writer *nw) { if (!nlmsg_copy_header(hdr, nw)) return (false); struct ndmsg *ndm; ndm = nlmsg_copy_next_header(hdr, nw, struct ndmsg); ndm->ndm_family = bsd_to_linux_domain(ndm->ndm_family); /* Copy attributes unchanged */ if (!nlmsg_copy_all_nla(hdr, sizeof(struct ndmsg), nw)) return (false); nlmsg_end(nw); RT_LOG(LOG_DEBUG2, "done processing nw %p", nw); return (true); } static bool rtnl_newroute_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_writer *nw) { if (!nlmsg_copy_header(hdr, nw)) return (false); struct rtmsg *rtm; rtm = nlmsg_copy_next_header(hdr, nw, struct rtmsg); rtm->rtm_family = bsd_to_linux_domain(rtm->rtm_family); struct nlattr *nla; int hdrlen = NETLINK_ALIGN(sizeof(struct rtmsg)); int attrs_len = hdr->nlmsg_len - sizeof(struct nlmsghdr) - hdrlen; struct nlattr *nla_head = (struct nlattr *)((char *)(hdr + 1) + hdrlen); NLA_FOREACH(nla, nla_head, attrs_len) { struct rtattr *rta = (struct rtattr *)nla; //RT_LOG(LOG_DEBUG, "READING attr %d len %d", nla->nla_type, nla->nla_len); if (rta->rta_len < sizeof(struct rtattr)) { break; } switch (rta->rta_type) { case NL_RTA_TABLE: { uint32_t fibnum; fibnum = _rta_get_uint32(rta); if (fibnum == 0) fibnum = 254; RT_LOG(LOG_DEBUG3, "XFIBNUM %u", fibnum); if (!nlattr_add_u32(nw, NL_RTA_TABLE, fibnum)) return (false); } break; default: if (!nlmsg_copy_nla(nla, nw)) return (false); break; } } nlmsg_end(nw); RT_LOG(LOG_DEBUG2, "done processing nw %p", nw); return (true); } static bool rtnl_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_writer *nw) { RT_LOG(LOG_DEBUG2, "Got message type %d", hdr->nlmsg_type); switch (hdr->nlmsg_type) { case NL_RTM_NEWLINK: case NL_RTM_DELLINK: case NL_RTM_GETLINK: return (rtnl_newlink_to_linux(hdr, nlp, nw)); case NL_RTM_NEWADDR: case NL_RTM_DELADDR: return (rtnl_newaddr_to_linux(hdr, nlp, nw)); case NL_RTM_NEWROUTE: case NL_RTM_DELROUTE: return (rtnl_newroute_to_linux(hdr, nlp, nw)); case NL_RTM_NEWNEIGH: case NL_RTM_DELNEIGH: case NL_RTM_GETNEIGH: return (rtnl_newneigh_to_linux(hdr, nlp, nw)); default: RT_LOG(LOG_DEBUG, "[WARN] Passing message type %d untranslated", hdr->nlmsg_type); return (handle_default_out(hdr, nw)); } } static bool nlmsg_error_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_writer *nw) { if (!nlmsg_copy_header(hdr, nw)) return (false); struct nlmsgerr *nlerr; nlerr = nlmsg_copy_next_header(hdr, nw, struct nlmsgerr); nlerr->error = bsd_to_linux_errno(nlerr->error); int copied_len = sizeof(struct nlmsghdr) + sizeof(struct nlmsgerr); if (hdr->nlmsg_len == copied_len) { nlmsg_end(nw); return (true); } /* * CAP_ACK was not set. Original request needs to be translated. * XXX: implement translation of the original message */ RT_LOG(LOG_DEBUG, "[WARN] Passing ack message type %d untranslated", nlerr->msg.nlmsg_type); char *dst_payload, *src_payload; int copy_len = hdr->nlmsg_len - copied_len; dst_payload = nlmsg_reserve_data(nw, NLMSG_ALIGN(copy_len), char); src_payload = (char *)hdr + copied_len; memcpy(dst_payload, src_payload, copy_len); nlmsg_end(nw); return (true); } static bool nlmsg_to_linux(int netlink_family, struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_writer *nw) { if (hdr->nlmsg_type < NLMSG_MIN_TYPE) { switch (hdr->nlmsg_type) { case NLMSG_ERROR: return (nlmsg_error_to_linux(hdr, nlp, nw)); case NLMSG_NOOP: case NLMSG_DONE: case NLMSG_OVERRUN: return (handle_default_out(hdr, nw)); default: RT_LOG(LOG_DEBUG, "[WARN] Passing message type %d untranslated", hdr->nlmsg_type); return (handle_default_out(hdr, nw)); } } switch (netlink_family) { case NETLINK_ROUTE: return (rtnl_to_linux(hdr, nlp, nw)); default: return (handle_default_out(hdr, nw)); } } static struct mbuf * nlmsgs_to_linux(int netlink_family, char *buf, int data_length, struct nlpcb *nlp) { RT_LOG(LOG_DEBUG3, "LINUX: get %p size %d", buf, data_length); struct nl_writer nw = {}; struct mbuf *m = NULL; if (!nlmsg_get_chain_writer(&nw, data_length, &m)) { RT_LOG(LOG_DEBUG, "unable to setup chain writer for size %d", data_length); return (NULL); } /* Assume correct headers. Buffer IS mutable */ int count = 0; for (int offset = 0; offset + sizeof(struct nlmsghdr) <= data_length;) { struct nlmsghdr *hdr = (struct nlmsghdr *)&buf[offset]; int msglen = NLMSG_ALIGN(hdr->nlmsg_len); count++; if (!nlmsg_to_linux(netlink_family, hdr, nlp, &nw)) { RT_LOG(LOG_DEBUG, "failed to process msg type %d", hdr->nlmsg_type); m_freem(m); return (NULL); } offset += msglen; } nlmsg_flush(&nw); RT_LOG(LOG_DEBUG3, "Processed %d messages, chain size %d", count, m ? m_length(m, NULL) : 0); return (m); } static struct mbuf * mbufs_to_linux(int netlink_family, struct mbuf *m, struct nlpcb *nlp) { /* XXX: easiest solution, not optimized for performance */ int data_length = m_length(m, NULL); char *buf = malloc(data_length, M_LINUX, M_NOWAIT); if (buf == NULL) { RT_LOG(LOG_DEBUG, "unable to allocate %d bytes, dropping message", data_length); m_freem(m); return (NULL); } m_copydata(m, 0, data_length, buf); m_freem(m); m = nlmsgs_to_linux(netlink_family, buf, data_length, nlp); free(buf, M_LINUX); return (m); } static struct linux_netlink_provider linux_netlink_v1 = { .mbufs_to_linux = mbufs_to_linux, .msgs_to_linux = nlmsgs_to_linux, .msg_from_linux = nlmsg_from_linux, }; void linux_netlink_register(void) { linux_netlink_p = &linux_netlink_v1; } void linux_netlink_deregister(void) { linux_netlink_p = NULL; } diff --git a/sys/conf/options b/sys/conf/options index b6cce3b17ebb..4e74c4ab3a70 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -1,1023 +1,1023 @@ # # On the handling of kernel options # # All kernel options should be listed in NOTES, with suitable # descriptions. Negative options (options that make some code not # compile) should be commented out; LINT (generated from NOTES) should # compile as much code as possible. Try to structure option-using # code so that a single option only switch code on, or only switch # code off, to make it possible to have a full compile-test. If # necessary, you can check for COMPILING_LINT to get maximum code # coverage. # # All new options shall also be listed in either "conf/options" or # "conf/options.". Options that affect a single source-file # .[c|s] should be directed into "opt_.h", while options # that affect multiple files should either go in "opt_global.h" if # this is a kernel-wide option (used just about everywhere), or in # "opt_.h" if it affects only some files. # Note that the effect of listing only an option without a # header-file-name in conf/options (and cousins) is that the last # convention is followed. # # This handling scheme is not yet fully implemented. # # # Format of this file: # Option name filename # # If filename is missing, the default is # opt_.h AAC_DEBUG opt_aac.h AACRAID_DEBUG opt_aacraid.h AHC_ALLOW_MEMIO opt_aic7xxx.h AHC_TMODE_ENABLE opt_aic7xxx.h AHC_DUMP_EEPROM opt_aic7xxx.h AHC_DEBUG opt_aic7xxx.h AHC_DEBUG_OPTS opt_aic7xxx.h AHC_REG_PRETTY_PRINT opt_aic7xxx.h AHD_DEBUG opt_aic79xx.h AHD_DEBUG_OPTS opt_aic79xx.h AHD_TMODE_ENABLE opt_aic79xx.h AHD_REG_PRETTY_PRINT opt_aic79xx.h # Debugging options. ALT_BREAK_TO_DEBUGGER opt_kdb.h BREAK_TO_DEBUGGER opt_kdb.h BUF_TRACKING opt_global.h DDB DDB_BUFR_SIZE opt_ddb.h DDB_CAPTURE_DEFAULTBUFSIZE opt_ddb.h DDB_CAPTURE_MAXBUFSIZE opt_ddb.h DDB_CTF opt_ddb.h DDB_NUMSYM opt_ddb.h EARLY_PRINTF opt_global.h FULL_BUF_TRACKING opt_global.h GDB KDB opt_global.h KDB_TRACE opt_kdb.h KDB_UNATTENDED opt_kdb.h KLD_DEBUG opt_kld.h NUM_CORE_FILES opt_global.h QUEUE_MACRO_DEBUG_TRACE opt_global.h QUEUE_MACRO_DEBUG_TRASH opt_global.h SYSCTL_DEBUG opt_sysctl.h TEXTDUMP_PREFERRED opt_ddb.h TEXTDUMP_VERBOSE opt_ddb.h TSLOG opt_global.h TSLOG_PAGEZERO opt_global.h TSLOGSIZE opt_global.h # Miscellaneous options. ALQ ALTERA_SDCARD_FAST_SIM opt_altera_sdcard.h ATSE_CFI_HACK opt_cfi.h AUDIT opt_global.h BOOTHOWTO opt_global.h BOOTVERBOSE opt_global.h CALLOUT_PROFILING CAPABILITIES opt_capsicum.h CAPABILITY_MODE opt_capsicum.h CC_CDG opt_global.h CC_CHD opt_global.h CC_CUBIC opt_global.h CC_DEFAULT opt_cc.h CC_DCTCP opt_global.h CC_HD opt_global.h CC_HTCP opt_global.h CC_NEWRENO opt_global.h CC_VEGAS opt_global.h COMPAT_43 opt_global.h COMPAT_43TTY opt_global.h COMPAT_FREEBSD4 opt_global.h COMPAT_FREEBSD5 opt_global.h COMPAT_FREEBSD6 opt_global.h COMPAT_FREEBSD7 opt_global.h COMPAT_FREEBSD9 opt_global.h COMPAT_FREEBSD10 opt_global.h COMPAT_FREEBSD11 opt_global.h COMPAT_FREEBSD12 opt_global.h COMPAT_FREEBSD13 opt_global.h COMPAT_FREEBSD14 opt_global.h COMPAT_LINUXKPI opt_dontuse.h COMPILING_LINT opt_global.h CY_PCI_FASTINTR DEADLKRES opt_watchdog.h EXPERIMENTAL opt_global.h DIRECTIO FILEMON opt_dontuse.h FFCLOCK FULL_PREEMPTION opt_sched.h GZIO opt_gzio.h IMGACT_BINMISC opt_dontuse.h IPI_PREEMPTION opt_sched.h GEOM_BDE opt_geom.h GEOM_CACHE opt_geom.h GEOM_CONCAT opt_geom.h GEOM_ELI opt_geom.h GEOM_GATE opt_geom.h GEOM_JOURNAL opt_geom.h GEOM_LABEL opt_geom.h GEOM_LABEL_GPT opt_geom.h GEOM_LINUX_LVM opt_geom.h GEOM_MAP opt_geom.h GEOM_MIRROR opt_geom.h GEOM_MOUNTVER opt_geom.h GEOM_MULTIPATH opt_geom.h GEOM_NOP opt_geom.h GEOM_PART_APM opt_geom.h GEOM_PART_BSD opt_geom.h GEOM_PART_BSD64 opt_geom.h GEOM_PART_EBR opt_geom.h GEOM_PART_GPT opt_geom.h GEOM_PART_LDM opt_geom.h GEOM_PART_MBR opt_geom.h GEOM_RAID opt_geom.h GEOM_RAID3 opt_geom.h GEOM_SHSEC opt_geom.h GEOM_STRIPE opt_geom.h GEOM_UZIP opt_geom.h GEOM_UZIP_DEBUG opt_geom.h GEOM_VINUM opt_geom.h GEOM_VIRSTOR opt_geom.h GEOM_ZERO opt_geom.h IFLIB opt_iflib.h KDTRACE_HOOKS opt_global.h KDTRACE_FRAME opt_kdtrace.h KN_HASHSIZE opt_kqueue.h KSTACK_MAX_PAGES KSTACK_PAGES KSTACK_USAGE_PROF KTRACE KTRACE_REQUEST_POOL opt_ktrace.h LIBICONV MAC opt_global.h MAC_BIBA opt_dontuse.h MAC_BSDEXTENDED opt_dontuse.h MAC_DDB opt_dontuse.h MAC_IFOFF opt_dontuse.h MAC_IPACL opt_dontuse.h MAC_LOMAC opt_dontuse.h MAC_MLS opt_dontuse.h MAC_NONE opt_dontuse.h MAC_NTPD opt_dontuse.h MAC_PARTITION opt_dontuse.h MAC_PORTACL opt_dontuse.h MAC_PRIORITY opt_dontuse.h MAC_SEEOTHERUIDS opt_dontuse.h MAC_STATIC opt_mac.h MAC_STUB opt_dontuse.h MAC_TEST opt_dontuse.h MAC_GRANTBYLABEL opt_dontuse.h MAC_VERIEXEC opt_dontuse.h MAC_VERIEXEC_SHA1 opt_dontuse.h MAC_VERIEXEC_SHA256 opt_dontuse.h MAC_VERIEXEC_SHA384 opt_dontuse.h MAC_VERIEXEC_SHA512 opt_dontuse.h MD_ROOT opt_md.h MD_ROOT_FSTYPE opt_md.h MD_ROOT_READONLY opt_md.h MD_ROOT_SIZE opt_md.h MD_ROOT_MEM opt_md.h MFI_DEBUG opt_mfi.h MFI_DECODE_LOG opt_mfi.h MPROF_BUFFERS opt_mprof.h MPROF_HASH_SIZE opt_mprof.h NEW_PCIB opt_global.h NO_ADAPTIVE_MUTEXES opt_adaptive_mutexes.h NO_ADAPTIVE_RWLOCKS NO_ADAPTIVE_SX NO_OBSOLETE_CODE opt_global.h NO_SYSCTL_DESCR opt_global.h NSWBUF_MIN opt_param.h MBUF_PACKET_ZONE_DISABLE opt_global.h PANIC_REBOOT_WAIT_TIME opt_panic.h PCI_HP opt_pci.h PCI_IOV opt_global.h PPC_DEBUG opt_ppc.h PPC_PROBE_CHIPSET opt_ppc.h PPS_SYNC opt_ntp.h PREEMPTION opt_sched.h QUOTA SCHED_4BSD opt_sched.h SCHED_STATS opt_sched.h SCHED_ULE opt_sched.h SLEEPQUEUE_PROFILING SLHCI_DEBUG opt_slhci.h STACK opt_stack.h SUIDDIR MSGMNB opt_sysvipc.h MSGMNI opt_sysvipc.h MSGSEG opt_sysvipc.h MSGSSZ opt_sysvipc.h MSGTQL opt_sysvipc.h SEMMNI opt_sysvipc.h SEMMNS opt_sysvipc.h SEMMNU opt_sysvipc.h SEMMSL opt_sysvipc.h SEMOPM opt_sysvipc.h SEMUME opt_sysvipc.h SHMALL opt_sysvipc.h SHMMAX opt_sysvipc.h SHMMAXPGS opt_sysvipc.h SHMMIN opt_sysvipc.h SHMMNI opt_sysvipc.h SHMSEG opt_sysvipc.h SYSVMSG opt_sysvipc.h SYSVSEM opt_sysvipc.h SYSVSHM opt_sysvipc.h SW_WATCHDOG opt_watchdog.h TCPHPTS opt_inet.h TCP_REQUEST_TRK opt_global.h TCP_ACCOUNTING opt_global.h # # TCP SaD Detection is an experimental Sack attack Detection (SaD) # algorithm that uses "normal" behaviour with SACK's to detect # a possible attack. It is strictly experimental at this point. # TCP_SAD_DETECTION opt_inet.h TURNSTILE_PROFILING UMTX_PROFILING UMTX_CHAINS opt_global.h VERBOSE_SYSINIT ZSTDIO opt_zstdio.h # Sanitizers COVERAGE opt_global.h KASAN opt_global.h KCOV KCSAN opt_global.h KMSAN opt_global.h KUBSAN opt_global.h # POSIX kernel options P1003_1B_MQUEUE opt_posix.h P1003_1B_SEMAPHORES opt_posix.h _KPOSIX_PRIORITY_SCHEDULING opt_posix.h # Do we want the config file compiled into the kernel? INCLUDE_CONFIG_FILE opt_config.h # Options for static filesystems. These should only be used at config # time, since the corresponding lkms cannot work if there are any static # dependencies. Unusability is enforced by hiding the defines for the # options in a never-included header. AUTOFS opt_dontuse.h CD9660 opt_dontuse.h EXT2FS opt_dontuse.h FDESCFS opt_dontuse.h FFS opt_dontuse.h FUSEFS opt_dontuse.h MSDOSFS opt_dontuse.h NULLFS opt_dontuse.h PROCFS opt_dontuse.h PSEUDOFS opt_dontuse.h SMBFS opt_dontuse.h TARFS opt_dontuse.h TMPFS opt_dontuse.h UDF opt_dontuse.h UNIONFS opt_dontuse.h ZFS opt_dontuse.h # Pseudofs debugging PSEUDOFS_TRACE opt_pseudofs.h # Tarfs debugging TARFS_DEBUG opt_tarfs.h # In-kernel GSS-API KGSSAPI opt_kgssapi.h KGSSAPI_DEBUG opt_kgssapi.h # These static filesystems have one slightly bogus static dependency in # sys/i386/i386/autoconf.c. If any of these filesystems are # statically compiled into the kernel, code for mounting them as root # filesystems will be enabled - but look below. # NFSCL - client # NFSD - server NFSCL opt_nfs.h NFSD opt_nfs.h # filesystems and libiconv bridge CD9660_ICONV opt_dontuse.h MSDOSFS_ICONV opt_dontuse.h UDF_ICONV opt_dontuse.h # If you are following the conditions in the copyright, # you can enable soft-updates which will speed up a lot of thigs # and make the system safer from crashes at the same time. # otherwise a STUB module will be compiled in. SOFTUPDATES opt_ffs.h # On small, embedded systems, it can be useful to turn off support for # snapshots. It saves about 30-40k for a feature that would be lightly # used, if it is used at all. NO_FFS_SNAPSHOT opt_ffs.h # Enabling this option turns on support for Access Control Lists in UFS, # which can be used to support high security configurations. Depends on # UFS_EXTATTR. UFS_ACL opt_ufs.h # Enabling this option turns on support for extended attributes in UFS-based # filesystems, which can be used to support high security configurations # as well as new filesystem features. UFS_EXTATTR opt_ufs.h UFS_EXTATTR_AUTOSTART opt_ufs.h # Enable fast hash lookups for large directories on UFS-based filesystems. UFS_DIRHASH opt_ufs.h # Enable gjournal-based UFS journal. UFS_GJOURNAL opt_ufs.h # The below sentence is not in English, and neither is this one. # We plan to remove the static dependences above, with a # _ROOT option to control if it usable as root. This list # allows these options to be present in config files already (though # they won't make any difference yet). NFS_ROOT opt_nfsroot.h # SMB/CIFS requester NETSMB opt_netsmb.h # Enable debugnet(4) networking support. DEBUGNET opt_global.h # Enable netdump(4) client support. NETDUMP opt_global.h # Enable netgdb(4) support. NETGDB opt_global.h # Options used only in subr_param.c. HZ opt_param.h MAXFILES opt_param.h NBUF opt_param.h NSFBUFS opt_param.h VM_BCACHE_SIZE_MAX opt_param.h VM_SWZONE_SIZE_MAX opt_param.h MAXUSERS DFLDSIZ opt_param.h MAXDSIZ opt_param.h MAXSSIZ opt_param.h # Generic SCSI options. CAM_MAX_HIGHPOWER opt_cam.h CAMDEBUG opt_cam.h CAM_DEBUG_COMPILE opt_cam.h CAM_DEBUG_DELAY opt_cam.h CAM_DEBUG_BUS opt_cam.h CAM_DEBUG_TARGET opt_cam.h CAM_DEBUG_LUN opt_cam.h CAM_DEBUG_FLAGS opt_cam.h CAM_BOOT_DELAY opt_cam.h CAM_IOSCHED_DYNAMIC opt_cam.h CAM_IO_STATS opt_cam.h CAM_TEST_FAILURE opt_cam.h SCSI_DELAY opt_scsi.h SCSI_NO_SENSE_STRINGS opt_scsi.h SCSI_NO_OP_STRINGS opt_scsi.h # Options used only in cam/ata/ata_da.c ATA_STATIC_ID opt_ada.h # Options used only in cam/scsi/scsi_cd.c CHANGER_MIN_BUSY_SECONDS opt_cd.h CHANGER_MAX_BUSY_SECONDS opt_cd.h # Options used only in cam/scsi/scsi_da.c DA_TRACK_REFS opt_da.h # Options used only in cam/scsi/scsi_sa.c. SA_IO_TIMEOUT opt_sa.h SA_SPACE_TIMEOUT opt_sa.h SA_REWIND_TIMEOUT opt_sa.h SA_ERASE_TIMEOUT opt_sa.h SA_1FM_AT_EOD opt_sa.h # Options used only in cam/scsi/scsi_pt.c SCSI_PT_DEFAULT_TIMEOUT opt_pt.h # Options used only in cam/scsi/scsi_ses.c SES_ENABLE_PASSTHROUGH opt_ses.h # Options used in dev/sym/ (Symbios SCSI driver). SYM_SETUP_SCSI_DIFF opt_sym.h #-HVD support for 825a, 875, 885 # disabled:0 (default), enabled:1 SYM_SETUP_PCI_PARITY opt_sym.h #-PCI parity checking # disabled:0, enabled:1 (default) SYM_SETUP_MAX_LUN opt_sym.h #-Number of LUNs supported # default:8, range:[1..64] # Options used only in dev/isp/* ISP_TARGET_MODE opt_isp.h ISP_FW_CRASH_DUMP opt_isp.h ISP_DEFAULT_ROLES opt_isp.h ISP_INTERNAL_TARGET opt_isp.h ISP_FCTAPE_OFF opt_isp.h # Options used only in dev/iscsi ISCSI_INITIATOR_DEBUG opt_iscsi_initiator.h # Net stuff. ACCEPT_FILTER_DATA ACCEPT_FILTER_DNS ACCEPT_FILTER_HTTP ALTQ opt_global.h ALTQ_CBQ opt_altq.h ALTQ_CDNR opt_altq.h ALTQ_CODEL opt_altq.h ALTQ_DEBUG opt_altq.h ALTQ_HFSC opt_altq.h ALTQ_FAIRQ opt_altq.h ALTQ_NOPCC opt_altq.h ALTQ_PRIQ opt_altq.h ALTQ_RED opt_altq.h ALTQ_RIO opt_altq.h BOOTP opt_bootp.h BOOTP_BLOCKSIZE opt_bootp.h BOOTP_COMPAT opt_bootp.h BOOTP_NFSROOT opt_bootp.h BOOTP_NFSV3 opt_bootp.h BOOTP_WIRED_TO opt_bootp.h DEVICE_POLLING DUMMYNET opt_ipdn.h RATELIMIT opt_ratelimit.h RATELIMIT_DEBUG opt_ratelimit.h INET opt_inet.h INET6 opt_inet6.h STATS opt_global.h IPDIVERT IPFILTER opt_ipfilter.h IPFILTER_DEFAULT_BLOCK opt_ipfilter.h IPFILTER_LOG opt_ipfilter.h IPFILTER_LOOKUP opt_ipfilter.h IPFIREWALL opt_ipfw.h IPFIREWALL_DEFAULT_TO_ACCEPT opt_ipfw.h IPFIREWALL_NAT opt_ipfw.h IPFIREWALL_NAT64 opt_ipfw.h IPFIREWALL_NPTV6 opt_ipfw.h IPFIREWALL_VERBOSE opt_ipfw.h IPFIREWALL_VERBOSE_LIMIT opt_ipfw.h IPFIREWALL_PMOD opt_ipfw.h IPSEC opt_ipsec.h IPSEC_DEBUG opt_ipsec.h IPSEC_SUPPORT opt_ipsec.h IPSTEALTH KERN_TLS KRPC LIBALIAS LIBMCHAIN MBUF_PROFILING MBUF_STRESS_TEST MROUTING opt_mrouting.h NFSLOCKD -NETLINK opt_netlink.h +NETLINK opt_global.h PF_DEFAULT_TO_DROP opt_pf.h ROUTE_MPATH opt_route.h ROUTETABLES opt_route.h FIB_ALGO opt_route.h RSS opt_rss.h SLIP_IFF_OPTS opt_slip.h TCPPCAP opt_global.h SIFTR TCP_BLACKBOX opt_global.h TCP_HHOOK opt_global.h TCP_OFFLOAD opt_inet.h # Enable code to dispatch TCP offloading TCP_RFC7413 opt_inet.h TCP_RFC7413_MAX_KEYS opt_inet.h TCP_RFC7413_MAX_PSKS opt_inet.h TCP_SIGNATURE opt_ipsec.h VLAN_ARRAY opt_vlan.h XDR XBONEHACK # # SCTP # SCTP opt_sctp.h SCTP_SUPPORT opt_sctp.h SCTP_DEBUG opt_sctp.h # Enable debug printfs SCTP_LOCK_LOGGING opt_sctp.h # Log to KTR lock activity SCTP_MBUF_LOGGING opt_sctp.h # Log to KTR general mbuf aloc/free SCTP_MBCNT_LOGGING opt_sctp.h # Log to KTR mbcnt activity SCTP_PACKET_LOGGING opt_sctp.h # Log to a packet buffer last N packets SCTP_LTRACE_CHUNKS opt_sctp.h # Log to KTR chunks processed SCTP_LTRACE_ERRORS opt_sctp.h # Log to KTR error returns. SCTP_USE_PERCPU_STAT opt_sctp.h # Use per cpu stats. SCTP_MCORE_INPUT opt_sctp.h # Have multiple input threads for input mbufs SCTP_LOCAL_TRACE_BUF opt_sctp.h # Use tracebuffer exported via sysctl SCTP_DETAILED_STR_STATS opt_sctp.h # Use per PR-SCTP policy stream stats # # # # Netgraph(4). Use option NETGRAPH to enable the base netgraph code. # Each netgraph node type can be either be compiled into the kernel # or loaded dynamically. To get the former, include the corresponding # option below. Each type has its own man page, e.g. ng_async(4). NETGRAPH NETGRAPH_DEBUG opt_netgraph.h NETGRAPH_ASYNC opt_netgraph.h NETGRAPH_BLUETOOTH opt_netgraph.h NETGRAPH_BLUETOOTH_BT3C opt_netgraph.h NETGRAPH_BLUETOOTH_H4 opt_netgraph.h NETGRAPH_BLUETOOTH_HCI opt_netgraph.h NETGRAPH_BLUETOOTH_L2CAP opt_netgraph.h NETGRAPH_BLUETOOTH_SOCKET opt_netgraph.h NETGRAPH_BLUETOOTH_UBT opt_netgraph.h NETGRAPH_BLUETOOTH_UBTBCMFW opt_netgraph.h NETGRAPH_BPF opt_netgraph.h NETGRAPH_BRIDGE opt_netgraph.h NETGRAPH_CAR opt_netgraph.h NETGRAPH_CHECKSUM opt_netgraph.h NETGRAPH_CISCO opt_netgraph.h NETGRAPH_DEFLATE opt_netgraph.h NETGRAPH_DEVICE opt_netgraph.h NETGRAPH_ECHO opt_netgraph.h NETGRAPH_EIFACE opt_netgraph.h NETGRAPH_ETHER opt_netgraph.h NETGRAPH_ETHER_ECHO opt_netgraph.h NETGRAPH_FEC opt_netgraph.h NETGRAPH_FRAME_RELAY opt_netgraph.h NETGRAPH_GIF opt_netgraph.h NETGRAPH_GIF_DEMUX opt_netgraph.h NETGRAPH_HOLE opt_netgraph.h NETGRAPH_IFACE opt_netgraph.h NETGRAPH_IP_INPUT opt_netgraph.h NETGRAPH_IPFW opt_netgraph.h NETGRAPH_KSOCKET opt_netgraph.h NETGRAPH_L2TP opt_netgraph.h NETGRAPH_LMI opt_netgraph.h NETGRAPH_MPPC_COMPRESSION opt_netgraph.h NETGRAPH_MPPC_ENCRYPTION opt_netgraph.h NETGRAPH_NAT opt_netgraph.h NETGRAPH_NETFLOW opt_netgraph.h NETGRAPH_ONE2MANY opt_netgraph.h NETGRAPH_PATCH opt_netgraph.h NETGRAPH_PIPE opt_netgraph.h NETGRAPH_PPP opt_netgraph.h NETGRAPH_PPPOE opt_netgraph.h NETGRAPH_PPTPGRE opt_netgraph.h NETGRAPH_PRED1 opt_netgraph.h NETGRAPH_RFC1490 opt_netgraph.h NETGRAPH_SOCKET opt_netgraph.h NETGRAPH_SPLIT opt_netgraph.h NETGRAPH_SPPP opt_netgraph.h NETGRAPH_TAG opt_netgraph.h NETGRAPH_TCPMSS opt_netgraph.h NETGRAPH_TEE opt_netgraph.h NETGRAPH_TTY opt_netgraph.h NETGRAPH_UI opt_netgraph.h NETGRAPH_VJC opt_netgraph.h NETGRAPH_VLAN opt_netgraph.h # DRM options DRM_DEBUG opt_drm.h TI_SF_BUF_JUMBO opt_ti.h TI_JUMBO_HDRSPLIT opt_ti.h # Misc debug flags. Most of these should probably be replaced with # 'DEBUG', and then let people recompile just the interesting modules # with 'make CC="cc -DDEBUG"'. DEBUG_1284 opt_ppb_1284.h LPT_DEBUG opt_lpt.h PLIP_DEBUG opt_plip.h LOCKF_DEBUG opt_debug_lockf.h SI_DEBUG opt_debug_si.h IFMEDIA_DEBUG opt_ifmedia.h # Fb options FB_DEBUG opt_fb.h # ppbus related options PERIPH_1284 opt_ppb_1284.h DONTPROBE_1284 opt_ppb_1284.h # smbus related options ENABLE_ALART opt_intpm.h # These cause changes all over the kernel BLKDEV_IOSIZE opt_global.h BURN_BRIDGES opt_global.h DEBUG opt_global.h DEBUG_LOCKS opt_global.h DEBUG_VFS_LOCKS opt_global.h DFLTPHYS opt_global.h DIAGNOSTIC opt_global.h INVARIANT_SUPPORT opt_global.h INVARIANTS opt_global.h KASSERT_PANIC_OPTIONAL opt_global.h MAXCPU opt_global.h MAXMEMDOM opt_global.h MAXPHYS opt_maxphys.h MCLSHIFT opt_global.h MUTEX_NOINLINE opt_global.h LOCK_PROFILING opt_global.h MSIZE opt_global.h REGRESSION opt_global.h RWLOCK_NOINLINE opt_global.h SX_NOINLINE opt_global.h VFS_BIO_DEBUG opt_global.h # These are VM related options VM_KMEM_SIZE opt_vm.h VM_KMEM_SIZE_SCALE opt_vm.h VM_KMEM_SIZE_MAX opt_vm.h VM_NRESERVLEVEL opt_vm.h VM_LEVEL_0_ORDER opt_vm.h NO_SWAPPING opt_vm.h MALLOC_MAKE_FAILURES opt_vm.h MALLOC_PROFILE opt_vm.h MALLOC_DEBUG_MAXZONES opt_vm.h # The MemGuard replacement allocator used for tamper-after-free detection DEBUG_MEMGUARD opt_vm.h # The RedZone malloc(9) protection DEBUG_REDZONE opt_vm.h # Standard SMP options EARLY_AP_STARTUP opt_global.h SMP opt_global.h NUMA opt_global.h # Size of the kernel message buffer MSGBUF_SIZE opt_msgbuf.h # NFS options NFS_MINATTRTIMO opt_nfs.h NFS_MAXATTRTIMO opt_nfs.h NFS_MINDIRATTRTIMO opt_nfs.h NFS_MAXDIRATTRTIMO opt_nfs.h NFS_DEBUG opt_nfs.h # TMPFS options TMPFS_PAGES_MINRESERVED opt_tmpfs.h # Options for uart(4) UART_PPS_ON_CTS opt_uart.h UART_POLL_FREQ opt_uart.h UART_DEV_TOLERANCE_PCT opt_uart.h # options for bus/device framework BUS_DEBUG opt_bus.h # options for USB support USB_DEBUG opt_usb.h USB_HOST_ALIGN opt_usb.h USB_REQ_DEBUG opt_usb.h USB_TEMPLATE opt_usb.h USB_VERBOSE opt_usb.h USB_DMA_SINGLE_ALLOC opt_usb.h USB_EHCI_BIG_ENDIAN_DESC opt_usb.h U3G_DEBUG opt_u3g.h UKBD_DFLT_KEYMAP opt_ukbd.h UPLCOM_INTR_INTERVAL opt_uplcom.h UVSCOM_DEFAULT_OPKTSIZE opt_uvscom.h UVSCOM_INTR_INTERVAL opt_uvscom.h # options for the Realtek rtwn driver RTWN_DEBUG opt_rtwn.h RTWN_WITHOUT_UCODE opt_rtwn.h # Embedded system options INIT_PATH ROOTDEVNAME FDC_DEBUG opt_fdc.h PCFCLOCK_VERBOSE opt_pcfclock.h PCFCLOCK_MAX_RETRIES opt_pcfclock.h KTR opt_global.h KTR_ALQ opt_ktr.h KTR_MASK opt_ktr.h KTR_CPUMASK opt_ktr.h KTR_COMPILE opt_global.h KTR_BOOT_ENTRIES opt_global.h KTR_ENTRIES opt_global.h KTR_VERBOSE opt_ktr.h WITNESS opt_global.h WITNESS_KDB opt_witness.h WITNESS_NO_VNODE opt_witness.h WITNESS_SKIPSPIN opt_witness.h WITNESS_COUNT opt_witness.h OPENSOLARIS_WITNESS opt_global.h EPOCH_TRACE opt_global.h # options for ACPI support ACPI_DEBUG opt_acpi.h ACPI_MAX_TASKS opt_acpi.h ACPI_MAX_THREADS opt_acpi.h DEV_ACPI opt_acpi.h ACPI_EARLY_EPYC_WAR opt_acpi.h # options for IOMMU support IOMMU opt_iommu.h # ISA support DEV_ISA opt_isa.h ISAPNP opt_dontuse.h # various 'device presence' options. DEV_BPF opt_bpf.h DEV_CARP opt_carp.h DEV_NETMAP opt_global.h DEV_PCI opt_pci.h DEV_PF opt_pf.h DEV_PFLOG opt_pf.h DEV_PFSYNC opt_pf.h DEV_SPLASH opt_splash.h DEV_VLAN opt_vlan.h # bce driver BCE_DEBUG opt_bce.h BCE_NVRAM_WRITE_SUPPORT opt_bce.h SOCKBUF_DEBUG opt_global.h # options for hifn driver HIFN_DEBUG opt_hifn.h HIFN_RNDTEST opt_hifn.h # options for safenet driver SAFE_DEBUG opt_safe.h SAFE_NO_RNG opt_safe.h SAFE_RNDTEST opt_safe.h # syscons/vt options MAXCONS opt_syscons.h SC_ALT_MOUSE_IMAGE opt_syscons.h SC_CUT_SPACES2TABS opt_syscons.h SC_CUT_SEPCHARS opt_syscons.h SC_DEBUG_LEVEL opt_syscons.h SC_DFLT_FONT opt_syscons.h SC_DFLT_TERM opt_syscons.h SC_DISABLE_KDBKEY opt_syscons.h SC_DISABLE_REBOOT opt_syscons.h SC_HISTORY_SIZE opt_syscons.h SC_KERNEL_CONS_ATTR opt_syscons.h SC_KERNEL_CONS_ATTRS opt_syscons.h SC_KERNEL_CONS_REV_ATTR opt_syscons.h SC_MOUSE_CHAR opt_syscons.h SC_NO_CUTPASTE opt_syscons.h SC_NO_FONT_LOADING opt_syscons.h SC_NO_HISTORY opt_syscons.h SC_NO_MODE_CHANGE opt_syscons.h SC_NO_SUSPEND_VTYSWITCH opt_syscons.h SC_NO_SYSMOUSE opt_syscons.h SC_NO_TERM_DUMB opt_syscons.h SC_NO_TERM_SC opt_syscons.h SC_NO_TERM_TEKEN opt_syscons.h SC_NORM_ATTR opt_syscons.h SC_NORM_REV_ATTR opt_syscons.h SC_PIXEL_MODE opt_syscons.h SC_RENDER_DEBUG opt_syscons.h SC_TWOBUTTON_MOUSE opt_syscons.h VT_ALT_TO_ESC_HACK opt_syscons.h VT_FB_MAX_WIDTH opt_syscons.h VT_FB_MAX_HEIGHT opt_syscons.h VT_MAXWINDOWS opt_syscons.h VT_TWOBUTTON_MOUSE opt_syscons.h DEV_SC opt_syscons.h DEV_VT opt_syscons.h # teken terminal emulator options TEKEN_CONS25 opt_teken.h TEKEN_UTF8 opt_teken.h TERMINAL_KERN_ATTR opt_teken.h TERMINAL_NORM_ATTR opt_teken.h # options for printf PRINTF_BUFR_SIZE opt_printf.h BOOT_TAG opt_printf.h BOOT_TAG_SZ opt_printf.h # kbd options KBD_DISABLE_KEYMAP_LOAD opt_kbd.h KBD_INSTALL_CDEV opt_kbd.h KBD_MAXRETRY opt_kbd.h KBD_MAXWAIT opt_kbd.h KBD_RESETDELAY opt_kbd.h KBD_DELAY1 opt_kbd.h KBD_DELAY2 opt_kbd.h KBDIO_DEBUG opt_kbd.h KBDMUX_DFLT_KEYMAP opt_kbdmux.h # options for the Atheros driver ATH_DEBUG opt_ath.h ATH_TXBUF opt_ath.h ATH_RXBUF opt_ath.h ATH_DIAGAPI opt_ath.h ATH_TX99_DIAG opt_ath.h ATH_ENABLE_11N opt_ath.h ATH_ENABLE_DFS opt_ath.h ATH_EEPROM_FIRMWARE opt_ath.h ATH_ENABLE_RADIOTAP_VENDOR_EXT opt_ath.h ATH_DEBUG_ALQ opt_ath.h ATH_KTR_INTR_DEBUG opt_ath.h AH_DEBUG opt_ah.h AH_ASSERT opt_ah.h AH_DEBUG_ALQ opt_ah.h AH_REGOPS_FUNC opt_ah.h AH_WRITE_REGDOMAIN opt_ah.h AH_DEBUG_COUNTRY opt_ah.h AH_WRITE_EEPROM opt_ah.h AH_PRIVATE_DIAG opt_ah.h AH_NEED_DESC_SWAP opt_ah.h AH_USE_INIPDGAIN opt_ah.h AH_MAXCHAN opt_ah.h AH_RXCFG_SDMAMW_4BYTES opt_ah.h AH_INTERRUPT_DEBUGGING opt_ah.h # AR5416 and later interrupt mitigation # XXX do not use this for AR9130 AH_AR5416_INTERRUPT_MITIGATION opt_ah.h # options for the Altera mSGDMA driver (altera_msgdma) ALTERA_MSGDMA_DESC_STD opt_altera_msgdma.h ALTERA_MSGDMA_DESC_EXT opt_altera_msgdma.h ALTERA_MSGDMA_DESC_PF_STD opt_altera_msgdma.h ALTERA_MSGDMA_DESC_PF_EXT opt_altera_msgdma.h # options for the Broadcom BCM43xx driver (bwi) BWI_DEBUG opt_bwi.h BWI_DEBUG_VERBOSE opt_bwi.h # options for the Brodacom BCM43xx driver (bwn) BWN_DEBUG opt_bwn.h BWN_GPL_PHY opt_bwn.h BWN_USE_SIBA opt_bwn.h # Options for the SIBA driver SIBA_DEBUG opt_siba.h # options for the Marvell 8335 wireless driver MALO_DEBUG opt_malo.h MALO_TXBUF opt_malo.h MALO_RXBUF opt_malo.h # options for the Marvell wireless driver MWL_DEBUG opt_mwl.h MWL_TXBUF opt_mwl.h MWL_RXBUF opt_mwl.h MWL_DIAGAPI opt_mwl.h MWL_AGGR_SIZE opt_mwl.h MWL_TX_NODROP opt_mwl.h # Options for the Marvell NETA driver MVNETA_MULTIQUEUE opt_mvneta.h MVNETA_KTR opt_mvneta.h # Options for the Intel 802.11ac wireless driver IWM_DEBUG opt_iwm.h # Options for the Intel 802.11n wireless driver IWN_DEBUG opt_iwn.h # Options for the Intel 3945ABG wireless driver WPI_DEBUG opt_wpi.h # dcons options DCONS_BUF_SIZE opt_dcons.h DCONS_POLL_HZ opt_dcons.h DCONS_FORCE_CONSOLE opt_dcons.h DCONS_FORCE_GDB opt_dcons.h # HWPMC options HWPMC_DEBUG opt_global.h HWPMC_HOOKS # 802.11 support layer IEEE80211_DEBUG opt_wlan.h IEEE80211_DEBUG_REFCNT opt_wlan.h IEEE80211_SUPPORT_MESH opt_wlan.h IEEE80211_SUPPORT_SUPERG opt_wlan.h IEEE80211_SUPPORT_TDMA opt_wlan.h IEEE80211_ALQ opt_wlan.h IEEE80211_DFS_DEBUG opt_wlan.h # 802.11 TDMA support TDMA_SLOTLEN_DEFAULT opt_tdma.h TDMA_SLOTCNT_DEFAULT opt_tdma.h TDMA_BINTVAL_DEFAULT opt_tdma.h TDMA_TXRATE_11B_DEFAULT opt_tdma.h TDMA_TXRATE_11G_DEFAULT opt_tdma.h TDMA_TXRATE_11A_DEFAULT opt_tdma.h TDMA_TXRATE_TURBO_DEFAULT opt_tdma.h TDMA_TXRATE_HALF_DEFAULT opt_tdma.h TDMA_TXRATE_QUARTER_DEFAULT opt_tdma.h TDMA_TXRATE_11NA_DEFAULT opt_tdma.h TDMA_TXRATE_11NG_DEFAULT opt_tdma.h # VideoMode PICKMODE_DEBUG opt_videomode.h # Network stack virtualization options VIMAGE opt_global.h VNET_DEBUG opt_global.h # Common Flash Interface (CFI) options CFI_SUPPORT_STRATAFLASH opt_cfi.h CFI_ARMEDANDDANGEROUS opt_cfi.h CFI_HARDWAREBYTESWAP opt_cfi.h # Sound options SND_DEBUG opt_snd.h SND_DIAGNOSTIC opt_snd.h SND_FEEDER_MULTIFORMAT opt_snd.h SND_FEEDER_FULL_MULTIFORMAT opt_snd.h SND_FEEDER_RATE_HP opt_snd.h SND_PCM_64 opt_snd.h SND_OLDSTEREO opt_snd.h X86BIOS # Flattened device tree options FDT opt_platform.h FDT_DTB_STATIC opt_platform.h # OFED Infiniband stack OFED opt_ofed.h OFED_DEBUG_INIT opt_ofed.h SDP opt_ofed.h SDP_DEBUG opt_ofed.h IPOIB opt_ofed.h IPOIB_DEBUG opt_ofed.h IPOIB_CM opt_ofed.h # Resource Accounting RACCT opt_global.h RACCT_DEFAULT_TO_DISABLED opt_global.h # Resource Limits RCTL opt_global.h # Random number generator(s) # Alternative RNG algorithm. RANDOM_FENESTRASX opt_global.h # With this, no entropy processor is loaded, but the entropy # harvesting infrastructure is present. This means an entropy # processor may be loaded as a module. RANDOM_LOADABLE opt_global.h # This turns on high-rate and potentially expensive harvesting in # the uma slab allocator. RANDOM_ENABLE_UMA opt_global.h RANDOM_ENABLE_ETHER opt_global.h # This options turns TPM into entropy source. TPM_HARVEST opt_tpm.h # BHND(4) driver BHND_LOGLEVEL opt_global.h # GPIO and child devices GPIO_SPI_DEBUG opt_gpio.h # SPI devices SPIGEN_LEGACY_CDEVNAME opt_spi.h # etherswitch(4) driver RTL8366_SOFT_RESET opt_etherswitch.h # evdev protocol support EVDEV_SUPPORT opt_evdev.h EVDEV_DEBUG opt_evdev.h UINPUT_DEBUG opt_evdev.h # Hyper-V network driver HN_DEBUG opt_hn.h # CAM-based MMC stack MMCCAM # Encrypted kernel crash dumps EKCD opt_ekcd.h # NVME options NVME_USE_NVD opt_nvme.h # amdsbwd options AMDSBWD_DEBUG opt_amdsbwd.h # gcov support GCOV opt_global.h LINDEBUGFS # options for HID support HID_DEBUG opt_hid.h IICHID_DEBUG opt_hid.h IICHID_SAMPLING opt_hid.h HKBD_DFLT_KEYMAP opt_hkbd.h HIDRAW_MAKE_UHID_ALIAS opt_hid.h # kenv options # The early kernel environment (loader environment, config(8)-provided static) # is typically cleared after the dynamic environment comes up to ensure that # we're not inadvertently holding on to 'secret' values in these stale envs. # This option is insecure except in controlled environments where the static # environment's contents are known to be safe. PRESERVE_EARLY_KENV opt_global.h diff --git a/sys/modules/carp/Makefile b/sys/modules/carp/Makefile index 5506d1aaec23..d7a85043539f 100644 --- a/sys/modules/carp/Makefile +++ b/sys/modules/carp/Makefile @@ -1,10 +1,10 @@ .PATH: ${SRCTOP}/sys/netinet .PATH: ${SRCTOP}/sys/crypto KMOD= carp SRCS= ip_carp.c sha1.c SRCS+= device_if.h bus_if.h vnode_if.h -SRCS+= opt_carp.h opt_bpf.h opt_inet.h opt_inet6.h opt_ofed.h opt_netlink.h +SRCS+= opt_carp.h opt_bpf.h opt_inet.h opt_inet6.h opt_ofed.h .include diff --git a/sys/modules/ktest/ktest/Makefile b/sys/modules/ktest/ktest/Makefile index f72aa5d820db..264eb7b9f443 100644 --- a/sys/modules/ktest/ktest/Makefile +++ b/sys/modules/ktest/ktest/Makefile @@ -1,13 +1,12 @@ PACKAGE= tests SYSDIR?=${SRCTOP}/sys .include "${SYSDIR}/conf/kern.opts.mk" .PATH: ${SYSDIR}/tests KMOD= ktest SRCS= ktest.c -SRCS+= opt_netlink.h .include diff --git a/sys/modules/ktest/ktest_example/Makefile b/sys/modules/ktest/ktest_example/Makefile index 5b0c9b41f9c8..a5fe6d6d7843 100644 --- a/sys/modules/ktest/ktest_example/Makefile +++ b/sys/modules/ktest/ktest_example/Makefile @@ -1,13 +1,12 @@ PACKAGE= tests SYSDIR?=${SRCTOP}/sys .include "${SYSDIR}/conf/kern.opts.mk" .PATH: ${SYSDIR}/tests KMOD= ktest_example SRCS= ktest_example.c -SRCS+= opt_netlink.h .include diff --git a/sys/modules/ktest/ktest_netlink_message_writer/Makefile b/sys/modules/ktest/ktest_netlink_message_writer/Makefile index cdcd1d2c190f..2f84b2fecd72 100644 --- a/sys/modules/ktest/ktest_netlink_message_writer/Makefile +++ b/sys/modules/ktest/ktest_netlink_message_writer/Makefile @@ -1,14 +1,13 @@ PACKAGE= tests SYSDIR?=${SRCTOP}/sys .include "${SYSDIR}/conf/kern.opts.mk" .PATH: ${SYSDIR}/netlink KMOD= ktest_netlink_message_writer SRCS= ktest_netlink_message_writer.c -SRCS+= opt_netlink.h .include diff --git a/sys/modules/linux/Makefile b/sys/modules/linux/Makefile index 51b434d91b00..d1035445ccfb 100644 --- a/sys/modules/linux/Makefile +++ b/sys/modules/linux/Makefile @@ -1,147 +1,146 @@ .if ${MACHINE_CPUARCH} == "amd64" SFX= 32 CFLAGS+=-DCOMPAT_FREEBSD32 -DCOMPAT_LINUX32 .endif .PATH: ${SRCTOP}/sys/compat/linux ${SRCTOP}/sys/${MACHINE_CPUARCH}/linux${SFX} .if ${MACHINE_CPUARCH} == "i386" || ${MACHINE_CPUARCH} == "amd64" .PATH: ${SRCTOP}/sys/x86/linux .endif KMOD= linux SRCS= linux${SFX}_dummy_machdep.c \ linux_elf32.c \ linux_event.c \ linux_file.c \ linux_fork.c \ linux_futex.c \ linux_getcwd.c \ linux_ioctl.c \ linux_ipc.c \ linux${SFX}_machdep.c \ linux_misc.c \ linux_rseq.c \ linux_signal.c \ linux_socket.c \ linux_stats.c \ linux${SFX}_syscalls.c \ linux_sysctl.c \ linux${SFX}_sysent.c \ linux${SFX}_sysvec.c \ linux_time.c \ linux_timer.c \ linux_uid16.c \ linux_vdso.c \ linux_xattr.c \ opt_inet6.h \ opt_ktrace.h \ opt_posix.h \ bus_if.h \ device_if.h \ vnode_if.h .if ${MACHINE_CPUARCH} == "i386" || ${MACHINE_CPUARCH} == "amd64" SRCS+= linux_dummy_x86.c VDSODEPS=linux_vdso_gettc_x86.inc .endif .if ${MACHINE_CPUARCH} == "amd64" SRCS+= linux${SFX}_support.S .endif DPSRCS= assym.inc linux${SFX}_genassym.c # XXX: for assym.inc SRCS+= opt_kstack_pages.h opt_nfs.h opt_hwpmc_hooks.h .if ${MACHINE_CPUARCH} == "i386" SRCS+= opt_apic.h opt_cpu.h .endif OBJS= linux${SFX}_vdso.so .if ${MACHINE_CPUARCH} == "i386" SRCS+= imgact_linux.c \ linux.c \ linux_dummy.c \ linux_emul.c \ linux_errno.c \ linux_mib.c \ linux_mmap.c \ linux_ptrace_machdep.c \ linux_util.c \ linux_vdso_selector_x86.c \ linux_x86.c \ linux_copyout.c \ - linux_netlink.c \ - opt_netlink.h + linux_netlink.c .endif .if ${MACHINE_CPUARCH} == "i386" EXPORT_SYMS= EXPORT_SYMS+= linux_get_osname EXPORT_SYMS+= linux_get_osrelease EXPORT_SYMS+= linux_ioctl_register_handler EXPORT_SYMS+= linux_ioctl_unregister_handler .endif CLEANFILES= linux${SFX}_assym.h linux${SFX}_genassym.o linux${SFX}_locore.o \ genassym.o linux${SFX}_vdso_gtod.o linux${SFX}_vdso.so.o linux${SFX}_assym.h: linux${SFX}_genassym.o sh ${SYSDIR}/kern/genassym.sh linux${SFX}_genassym.o > ${.TARGET} .if ${MACHINE_CPUARCH} == "amd64" VDSOFLAGS=-DCOMPAT_FREEBSD32 -DCOMPAT_LINUX32 -m32 .else VDSOFLAGS=-mregparm=0 .endif linux${SFX}_locore.o: linux${SFX}_assym.h assym.inc ${CC} -c -x assembler-with-cpp -DLOCORE -fPIC -pipe -O2 -Werror \ -msoft-float \ -fno-common -nostdinc -fasynchronous-unwind-tables \ -fno-omit-frame-pointer -foptimize-sibling-calls ${VDSOFLAGS} \ -fno-stack-protector -I. -I${SYSDIR} -I${SRCTOP}/include \ ${.IMPSRC} -o ${.TARGET} linux${SFX}_vdso_gtod.o: linux_vdso_gtod.inc ${VDSODEPS} ${CC} -c -fPIC -pipe -O2 -Werror -msoft-float \ -fno-common -nostdinc -fasynchronous-unwind-tables \ -fno-omit-frame-pointer -foptimize-sibling-calls ${VDSOFLAGS} \ -fno-stack-protector -I. -I${SYSDIR} -I${SRCTOP}/include \ ${.IMPSRC} -o ${.TARGET} linux${SFX}_vdso.so.o: linux${SFX}_locore.o linux${SFX}_vdso_gtod.o ${LD} -m elf_i386 --shared --eh-frame-hdr -soname=linux-gate.so.1 \ --no-undefined --hash-style=both -warn-common -nostdlib \ --strip-debug -s --build-id=sha1 --Bsymbolic \ -T${SRCTOP}/sys/${MACHINE}/linux${SFX}/linux${SFX}_vdso.lds.s \ -o ${.TARGET} ${.ALLSRC:M*.o} .if ${MACHINE_CPUARCH} == "amd64" OBJCOPY_TARGET=--output-target elf64-x86-64-freebsd --binary-architecture i386 .elif ${MACHINE_CPUARCH} == "i386" OBJCOPY_TARGET=--output-target elf32-i386-freebsd --binary-architecture i386 .else .error ${MACHINE_CPUARCH} not yet supported by linux .endif linux${SFX}_vdso.so: linux${SFX}_vdso.so.o ${OBJCOPY} --input-target binary ${OBJCOPY_TARGET} \ linux${SFX}_vdso.so.o ${.TARGET} ${STRIPBIN} -N _binary_linux${SFX}_vdso_so_o_size ${.TARGET} .if ${MACHINE_CPUARCH} == "amd64" linux${SFX}_support.o: linux${SFX}_support.S linux${SFX}_assym.h assym.inc ${CC} -c -x assembler-with-cpp -DLOCORE ${CFLAGS} \ ${.ALLSRC:M*.S:u} -o ${.TARGET} .endif linux${SFX}_genassym.o: offset.inc ${CC} -c ${CFLAGS:N-flto:N-fno-common} -fcommon ${.IMPSRC} .if !defined(KERNBUILDDIR) .warning Building Linuxulator outside of a kernel does not make sense .endif EXPORT_SYMS= YES .include diff --git a/sys/modules/linux_common/Makefile b/sys/modules/linux_common/Makefile index 07df4555d1ee..374bc32215a2 100644 --- a/sys/modules/linux_common/Makefile +++ b/sys/modules/linux_common/Makefile @@ -1,28 +1,27 @@ .PATH: ${SRCTOP}/sys/compat/linux .if ${MACHINE_CPUARCH} == "amd64" .PATH: ${SRCTOP}/sys/x86/linux .endif KMOD= linux_common SRCS= linux_common.c linux_mib.c linux_mmap.c linux_util.c linux_emul.c \ linux_dummy.c linux_errno.c linux_netlink.c \ - linux.c device_if.h vnode_if.h bus_if.h opt_inet6.h opt_inet.h \ - opt_netlink.h + linux.c device_if.h vnode_if.h bus_if.h opt_inet6.h opt_inet.h .if ${MACHINE_CPUARCH} == "amd64" SRCS+= linux_x86.c linux_vdso_selector_x86.c .endif EXPORT_SYMS= EXPORT_SYMS+= linux_get_osname EXPORT_SYMS+= linux_get_osrelease EXPORT_SYMS+= linux_use_real_ifname .if !defined(KERNBUILDDIR) .warning Building Linuxulator outside of a kernel does not make sense .endif EXPORT_SYMS= YES .include diff --git a/sys/modules/netlink/Makefile b/sys/modules/netlink/Makefile index 667c0b2475fe..6835be6e7bd7 100644 --- a/sys/modules/netlink/Makefile +++ b/sys/modules/netlink/Makefile @@ -1,20 +1,20 @@ .PATH: ${SRCTOP}/sys/netlink KMOD= netlink SRCS = netlink_module.c netlink_domain.c netlink_io.c \ netlink_message_writer.c netlink_generic.c \ netlink_route.c route/iface.c route/iface_drivers.c route/neigh.c \ route/nexthop.c route/rt.c -SRCS+= opt_inet.h opt_inet6.h opt_route.h opt_netlink.h +SRCS+= opt_inet.h opt_inet6.h opt_route.h CFLAGS+= -DNETLINK_MODULE EXPORT_SYMS= EXPORT_SYMS+= nlmsg_get_chain_writer EXPORT_SYMS+= nlmsg_refill_buffer EXPORT_SYMS+= nlmsg_end EXPORT_SYMS+= nlmsg_flush EXPORT_SYMS= YES .include diff --git a/sys/net/if_clone.c b/sys/net/if_clone.c index 1a2cea94ad35..5345d613f9d6 100644 --- a/sys/net/if_clone.c +++ b/sys/net/if_clone.c @@ -1,956 +1,954 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2012 Gleb Smirnoff * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if.c 8.5 (Berkeley) 1/9/95 */ -#include "opt_netlink.h" - #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Current IF_MAXUNIT expands maximum to 5 characters. */ #define IFCLOSIZ (IFNAMSIZ - 5) /* * Structure describing a `cloning' interface. * * List of locks * (c) const until freeing * (d) driver specific data, may need external protection. * (e) locked by if_cloners_mtx * (i) locked by ifc_mtx mtx */ struct if_clone { char ifc_name[IFCLOSIZ]; /* (c) Name of device, e.g. `gif' */ struct unrhdr *ifc_unrhdr; /* (c) alloc_unr(9) header */ int ifc_maxunit; /* (c) maximum unit number */ int ifc_flags; long ifc_refcnt; /* (i) Reference count. */ LIST_HEAD(, ifnet) ifc_iflist; /* (i) List of cloned interfaces */ struct mtx ifc_mtx; /* Mutex to protect members. */ ifc_match_f *ifc_match; /* (c) Matcher function */ ifc_create_f *ifc_create; /* (c) Creates new interface */ ifc_destroy_f *ifc_destroy; /* (c) Destroys cloned interface */ ifc_create_nl_f *create_nl; /* (c) Netlink creation handler */ ifc_modify_nl_f *modify_nl; /* (c) Netlink modification handler */ ifc_dump_nl_f *dump_nl; /* (c) Netlink dump handler */ #ifdef CLONE_COMPAT_13 /* (c) Driver specific cloning functions. Called with no locks held. */ union { struct { /* advanced cloner */ ifc_create_t *_ifc_create; ifc_destroy_t *_ifc_destroy; } A; struct { /* simple cloner */ ifcs_create_t *_ifcs_create; ifcs_destroy_t *_ifcs_destroy; int _ifcs_minifs; /* minimum ifs */ } S; } U; #define ifca_create U.A._ifc_create #define ifca_destroy U.A._ifc_destroy #define ifcs_create U.S._ifcs_create #define ifcs_destroy U.S._ifcs_destroy #define ifcs_minifs U.S._ifcs_minifs #endif LIST_ENTRY(if_clone) ifc_list; /* (e) On list of cloners */ }; static void if_clone_free(struct if_clone *ifc); static int if_clone_createif_nl(struct if_clone *ifc, const char *name, struct ifc_data_nl *ifd); static int ifc_simple_match(struct if_clone *ifc, const char *name); static int ifc_handle_unit(struct if_clone *ifc, char *name, size_t len, int *punit); static struct if_clone *ifc_find_cloner(const char *name); static struct if_clone *ifc_find_cloner_match(const char *name); #ifdef CLONE_COMPAT_13 static int ifc_simple_create_wrapper(struct if_clone *ifc, char *name, size_t maxlen, struct ifc_data *ifc_data, struct ifnet **ifpp); static int ifc_advanced_create_wrapper(struct if_clone *ifc, char *name, size_t maxlen, struct ifc_data *ifc_data, struct ifnet **ifpp); #endif static struct mtx if_cloners_mtx; MTX_SYSINIT(if_cloners_lock, &if_cloners_mtx, "if_cloners lock", MTX_DEF); VNET_DEFINE_STATIC(int, if_cloners_count); VNET_DEFINE(LIST_HEAD(, if_clone), if_cloners); #define V_if_cloners_count VNET(if_cloners_count) #define V_if_cloners VNET(if_cloners) #define IF_CLONERS_LOCK_ASSERT() mtx_assert(&if_cloners_mtx, MA_OWNED) #define IF_CLONERS_LOCK() mtx_lock(&if_cloners_mtx) #define IF_CLONERS_UNLOCK() mtx_unlock(&if_cloners_mtx) #define IF_CLONE_LOCK_INIT(ifc) \ mtx_init(&(ifc)->ifc_mtx, "if_clone lock", NULL, MTX_DEF) #define IF_CLONE_LOCK_DESTROY(ifc) mtx_destroy(&(ifc)->ifc_mtx) #define IF_CLONE_LOCK_ASSERT(ifc) mtx_assert(&(ifc)->ifc_mtx, MA_OWNED) #define IF_CLONE_LOCK(ifc) mtx_lock(&(ifc)->ifc_mtx) #define IF_CLONE_UNLOCK(ifc) mtx_unlock(&(ifc)->ifc_mtx) #define IF_CLONE_ADDREF(ifc) \ do { \ IF_CLONE_LOCK(ifc); \ IF_CLONE_ADDREF_LOCKED(ifc); \ IF_CLONE_UNLOCK(ifc); \ } while (0) #define IF_CLONE_ADDREF_LOCKED(ifc) \ do { \ IF_CLONE_LOCK_ASSERT(ifc); \ KASSERT((ifc)->ifc_refcnt >= 0, \ ("negative refcnt %ld", (ifc)->ifc_refcnt)); \ (ifc)->ifc_refcnt++; \ } while (0) #define IF_CLONE_REMREF(ifc) \ do { \ IF_CLONE_LOCK(ifc); \ IF_CLONE_REMREF_LOCKED(ifc); \ } while (0) #define IF_CLONE_REMREF_LOCKED(ifc) \ do { \ IF_CLONE_LOCK_ASSERT(ifc); \ KASSERT((ifc)->ifc_refcnt > 0, \ ("bogus refcnt %ld", (ifc)->ifc_refcnt)); \ if (--(ifc)->ifc_refcnt == 0) { \ IF_CLONE_UNLOCK(ifc); \ if_clone_free(ifc); \ } else { \ /* silently free the lock */ \ IF_CLONE_UNLOCK(ifc); \ } \ } while (0) #define IFC_IFLIST_INSERT(_ifc, _ifp) \ LIST_INSERT_HEAD(&_ifc->ifc_iflist, _ifp, if_clones) #define IFC_IFLIST_REMOVE(_ifc, _ifp) \ LIST_REMOVE(_ifp, if_clones) static MALLOC_DEFINE(M_CLONE, "clone", "interface cloning framework"); void vnet_if_clone_init(void) { LIST_INIT(&V_if_cloners); } /* * Lookup and create a clone network interface. */ int ifc_create_ifp(const char *name, struct ifc_data *ifd, struct ifnet **ifpp) { struct if_clone *ifc = ifc_find_cloner_match(name); if (ifc == NULL) return (EINVAL); struct ifc_data_nl ifd_new = { .flags = ifd->flags, .unit = ifd->unit, .params = ifd->params, }; int error = if_clone_createif_nl(ifc, name, &ifd_new); if (ifpp != NULL) *ifpp = ifd_new.ifp; return (error); } bool ifc_create_ifp_nl(const char *name, struct ifc_data_nl *ifd) { struct if_clone *ifc = ifc_find_cloner_match(name); if (ifc == NULL) { ifd->error = EINVAL; return (false); } ifd->error = if_clone_createif_nl(ifc, name, ifd); return (true); } int if_clone_create(char *name, size_t len, caddr_t params) { struct ifc_data ifd = { .params = params }; struct ifnet *ifp; int error = ifc_create_ifp(name, &ifd, &ifp); if (error == 0) strlcpy(name, if_name(ifp), len); return (error); } bool ifc_modify_ifp_nl(struct ifnet *ifp, struct ifc_data_nl *ifd) { struct if_clone *ifc = ifc_find_cloner(ifp->if_dname); if (ifc == NULL) { ifd->error = EINVAL; return (false); } ifd->error = (*ifc->modify_nl)(ifp, ifd); return (true); } bool ifc_dump_ifp_nl(struct ifnet *ifp, struct nl_writer *nw) { struct if_clone *ifc = ifc_find_cloner(ifp->if_dname); if (ifc == NULL) return (false); (*ifc->dump_nl)(ifp, nw); return (true); } static int ifc_create_ifp_nl_default(struct if_clone *ifc, char *name, size_t len, struct ifc_data_nl *ifd) { struct ifc_data ifd_new = { .flags = ifd->flags, .unit = ifd->unit, .params = ifd->params, }; return ((*ifc->ifc_create)(ifc, name, len, &ifd_new, &ifd->ifp)); } static int ifc_modify_ifp_nl_default(struct ifnet *ifp, struct ifc_data_nl *ifd) { if (ifd->lattrs != NULL) return (nl_modify_ifp_generic(ifp, ifd->lattrs, ifd->bm, ifd->npt)); return (0); } static void ifc_dump_ifp_nl_default(struct ifnet *ifp, struct nl_writer *nw) { int off = nlattr_add_nested(nw, IFLA_LINKINFO); if (off != 0) { nlattr_add_string(nw, IFLA_INFO_KIND, ifp->if_dname); nlattr_set_len(nw, off); } } void ifc_link_ifp(struct if_clone *ifc, struct ifnet *ifp) { if_addgroup(ifp, ifc->ifc_name); IF_CLONE_LOCK(ifc); IFC_IFLIST_INSERT(ifc, ifp); IF_CLONE_UNLOCK(ifc); } void if_clone_addif(struct if_clone *ifc, struct ifnet *ifp) { ifc_link_ifp(ifc, ifp); } bool ifc_unlink_ifp(struct if_clone *ifc, struct ifnet *ifp) { struct ifnet *ifcifp; IF_CLONE_LOCK(ifc); LIST_FOREACH(ifcifp, &ifc->ifc_iflist, if_clones) { if (ifcifp == ifp) { IFC_IFLIST_REMOVE(ifc, ifp); break; } } IF_CLONE_UNLOCK(ifc); if (ifcifp != NULL) if_delgroup(ifp, ifc->ifc_name); return (ifcifp != NULL); } static struct if_clone * ifc_find_cloner_match(const char *name) { struct if_clone *ifc; IF_CLONERS_LOCK(); LIST_FOREACH(ifc, &V_if_cloners, ifc_list) { if (ifc->ifc_match(ifc, name)) break; } IF_CLONERS_UNLOCK(); return (ifc); } static struct if_clone * ifc_find_cloner(const char *name) { struct if_clone *ifc; IF_CLONERS_LOCK(); LIST_FOREACH(ifc, &V_if_cloners, ifc_list) { if (strcmp(ifc->ifc_name, name) == 0) { break; } } IF_CLONERS_UNLOCK(); return (ifc); } static struct if_clone * ifc_find_cloner_in_vnet(const char *name, struct vnet *vnet) { CURVNET_SET_QUIET(vnet); struct if_clone *ifc = ifc_find_cloner(name); CURVNET_RESTORE(); return (ifc); } /* * Create a clone network interface. */ static int if_clone_createif_nl(struct if_clone *ifc, const char *ifname, struct ifc_data_nl *ifd) { char name[IFNAMSIZ]; int error; strlcpy(name, ifname, sizeof(name)); if (ifunit(name) != NULL) return (EEXIST); if (ifc->ifc_flags & IFC_F_AUTOUNIT) { if ((error = ifc_handle_unit(ifc, name, sizeof(name), &ifd->unit)) != 0) return (error); } if (ifd->lattrs != NULL) error = (*ifc->create_nl)(ifc, name, sizeof(name), ifd); else error = ifc_create_ifp_nl_default(ifc, name, sizeof(name), ifd); if (error != 0) { if (ifc->ifc_flags & IFC_F_AUTOUNIT) ifc_free_unit(ifc, ifd->unit); return (error); } MPASS(ifd->ifp != NULL); if_clone_addif(ifc, ifd->ifp); if (ifd->lattrs != NULL) error = (*ifc->modify_nl)(ifd->ifp, ifd); return (error); } /* * Lookup and destroy a clone network interface. */ int if_clone_destroy(const char *name) { int err; struct if_clone *ifc; struct ifnet *ifp; ifp = ifunit_ref(name); if (ifp == NULL) return (ENXIO); ifc = ifc_find_cloner_in_vnet(ifp->if_dname, ifp->if_home_vnet); if (ifc == NULL) { if_rele(ifp); return (EINVAL); } err = if_clone_destroyif(ifc, ifp); if_rele(ifp); return err; } /* * Destroy a clone network interface. */ static int if_clone_destroyif_flags(struct if_clone *ifc, struct ifnet *ifp, uint32_t flags) { int err; /* * Given that the cloned ifnet might be attached to a different * vnet from where its cloner was registered, we have to * switch to the vnet context of the target vnet. */ CURVNET_SET_QUIET(ifp->if_vnet); if (!ifc_unlink_ifp(ifc, ifp)) { CURVNET_RESTORE(); return (ENXIO); /* ifp is not on the list. */ } int unit = ifp->if_dunit; err = (*ifc->ifc_destroy)(ifc, ifp, flags); if (err != 0) ifc_link_ifp(ifc, ifp); else if (ifc->ifc_flags & IFC_F_AUTOUNIT) ifc_free_unit(ifc, unit); CURVNET_RESTORE(); return (err); } int if_clone_destroyif(struct if_clone *ifc, struct ifnet *ifp) { return (if_clone_destroyif_flags(ifc, ifp, 0)); } static struct if_clone * if_clone_alloc(const char *name, int maxunit) { struct if_clone *ifc; KASSERT(name != NULL, ("%s: no name\n", __func__)); ifc = malloc(sizeof(struct if_clone), M_CLONE, M_WAITOK | M_ZERO); strncpy(ifc->ifc_name, name, IFCLOSIZ-1); IF_CLONE_LOCK_INIT(ifc); IF_CLONE_ADDREF(ifc); ifc->ifc_maxunit = maxunit ? maxunit : IF_MAXUNIT; ifc->ifc_unrhdr = new_unrhdr(0, ifc->ifc_maxunit, &ifc->ifc_mtx); LIST_INIT(&ifc->ifc_iflist); ifc->create_nl = ifc_create_ifp_nl_default; ifc->modify_nl = ifc_modify_ifp_nl_default; ifc->dump_nl = ifc_dump_ifp_nl_default; return (ifc); } static int if_clone_attach(struct if_clone *ifc) { struct if_clone *ifc1; IF_CLONERS_LOCK(); LIST_FOREACH(ifc1, &V_if_cloners, ifc_list) if (strcmp(ifc->ifc_name, ifc1->ifc_name) == 0) { IF_CLONERS_UNLOCK(); IF_CLONE_REMREF(ifc); return (EEXIST); } LIST_INSERT_HEAD(&V_if_cloners, ifc, ifc_list); V_if_cloners_count++; IF_CLONERS_UNLOCK(); return (0); } struct if_clone * ifc_attach_cloner(const char *name, struct if_clone_addreq *req) { if (req->create_f == NULL || req->destroy_f == NULL) return (NULL); if (strnlen(name, IFCLOSIZ) >= (IFCLOSIZ - 1)) return (NULL); struct if_clone *ifc = if_clone_alloc(name, req->maxunit); ifc->ifc_match = req->match_f != NULL ? req->match_f : ifc_simple_match; ifc->ifc_create = req->create_f; ifc->ifc_destroy = req->destroy_f; ifc->ifc_flags = (req->flags & IFC_F_AUTOUNIT); if (req->version == 2) { struct if_clone_addreq_v2 *req2 = (struct if_clone_addreq_v2 *)req; ifc->create_nl = req2->create_nl_f; ifc->modify_nl = req2->modify_nl_f; ifc->dump_nl = req2->dump_nl_f; } ifc->dump_nl = ifc_dump_ifp_nl_default; if (if_clone_attach(ifc) != 0) return (NULL); EVENTHANDLER_INVOKE(if_clone_event, ifc); return (ifc); } void ifc_detach_cloner(struct if_clone *ifc) { if_clone_detach(ifc); } #ifdef CLONE_COMPAT_13 static int ifc_advanced_create_wrapper(struct if_clone *ifc, char *name, size_t maxlen, struct ifc_data *ifc_data, struct ifnet **ifpp) { int error = ifc->ifca_create(ifc, name, maxlen, ifc_data->params); if (error == 0) *ifpp = ifunit(name); return (error); } static int ifc_advanced_destroy_wrapper(struct if_clone *ifc, struct ifnet *ifp, uint32_t flags) { if (ifc->ifca_destroy == NULL) return (ENOTSUP); return (ifc->ifca_destroy(ifc, ifp)); } struct if_clone * if_clone_advanced(const char *name, u_int maxunit, ifc_match_t match, ifc_create_t create, ifc_destroy_t destroy) { struct if_clone *ifc; ifc = if_clone_alloc(name, maxunit); ifc->ifc_match = match; ifc->ifc_create = ifc_advanced_create_wrapper; ifc->ifc_destroy = ifc_advanced_destroy_wrapper; ifc->ifca_destroy = destroy; ifc->ifca_create = create; if (if_clone_attach(ifc) != 0) return (NULL); EVENTHANDLER_INVOKE(if_clone_event, ifc); return (ifc); } static int ifc_simple_create_wrapper(struct if_clone *ifc, char *name, size_t maxlen, struct ifc_data *ifc_data, struct ifnet **ifpp) { int unit = 0; ifc_name2unit(name, &unit); int error = ifc->ifcs_create(ifc, unit, ifc_data->params); if (error == 0) *ifpp = ifunit(name); return (error); } static int ifc_simple_destroy_wrapper(struct if_clone *ifc, struct ifnet *ifp, uint32_t flags) { if (ifp->if_dunit < ifc->ifcs_minifs && (flags & IFC_F_FORCE) == 0) return (EINVAL); ifc->ifcs_destroy(ifp); return (0); } struct if_clone * if_clone_simple(const char *name, ifcs_create_t create, ifcs_destroy_t destroy, u_int minifs) { struct if_clone *ifc; u_int unit; ifc = if_clone_alloc(name, 0); ifc->ifc_match = ifc_simple_match; ifc->ifc_create = ifc_simple_create_wrapper; ifc->ifc_destroy = ifc_simple_destroy_wrapper; ifc->ifcs_create = create; ifc->ifcs_destroy = destroy; ifc->ifcs_minifs = minifs; ifc->ifc_flags = IFC_F_AUTOUNIT; if (if_clone_attach(ifc) != 0) return (NULL); for (unit = 0; unit < minifs; unit++) { char name[IFNAMSIZ]; int error __unused; struct ifc_data_nl ifd = {}; snprintf(name, IFNAMSIZ, "%s%d", ifc->ifc_name, unit); error = if_clone_createif_nl(ifc, name, &ifd); KASSERT(error == 0, ("%s: failed to create required interface %s", __func__, name)); } EVENTHANDLER_INVOKE(if_clone_event, ifc); return (ifc); } #endif /* * Unregister a network interface cloner. */ void if_clone_detach(struct if_clone *ifc) { IF_CLONERS_LOCK(); LIST_REMOVE(ifc, ifc_list); V_if_cloners_count--; IF_CLONERS_UNLOCK(); /* destroy all interfaces for this cloner */ while (!LIST_EMPTY(&ifc->ifc_iflist)) if_clone_destroyif_flags(ifc, LIST_FIRST(&ifc->ifc_iflist), IFC_F_FORCE); IF_CLONE_REMREF(ifc); } static void if_clone_free(struct if_clone *ifc) { KASSERT(LIST_EMPTY(&ifc->ifc_iflist), ("%s: ifc_iflist not empty", __func__)); IF_CLONE_LOCK_DESTROY(ifc); delete_unrhdr(ifc->ifc_unrhdr); free(ifc, M_CLONE); } /* * Provide list of interface cloners to userspace. */ int if_clone_list(struct if_clonereq *ifcr) { char *buf, *dst, *outbuf = NULL; struct if_clone *ifc; int buf_count, count, err = 0; if (ifcr->ifcr_count < 0) return (EINVAL); IF_CLONERS_LOCK(); /* * Set our internal output buffer size. We could end up not * reporting a cloner that is added between the unlock and lock * below, but that's not a major problem. Not caping our * allocation to the number of cloners actually in the system * could be because that would let arbitrary users cause us to * allocate arbitrary amounts of kernel memory. */ buf_count = (V_if_cloners_count < ifcr->ifcr_count) ? V_if_cloners_count : ifcr->ifcr_count; IF_CLONERS_UNLOCK(); outbuf = malloc(IFNAMSIZ*buf_count, M_CLONE, M_WAITOK | M_ZERO); IF_CLONERS_LOCK(); ifcr->ifcr_total = V_if_cloners_count; if ((dst = ifcr->ifcr_buffer) == NULL) { /* Just asking how many there are. */ goto done; } count = (V_if_cloners_count < buf_count) ? V_if_cloners_count : buf_count; for (ifc = LIST_FIRST(&V_if_cloners), buf = outbuf; ifc != NULL && count != 0; ifc = LIST_NEXT(ifc, ifc_list), count--, buf += IFNAMSIZ) { strlcpy(buf, ifc->ifc_name, IFNAMSIZ); } done: IF_CLONERS_UNLOCK(); if (err == 0 && dst != NULL) err = copyout(outbuf, dst, buf_count*IFNAMSIZ); if (outbuf != NULL) free(outbuf, M_CLONE); return (err); } #ifdef VIMAGE /* * if_clone_restoregroup() is used in context of if_vmove(). * * Since if_detach_internal() has removed the interface from ALL groups, we * need to "restore" interface membership in the cloner's group. Note that * interface belongs to cloner in its home vnet, so we first find the original * cloner, and then we confirm that cloner with the same name exists in the * current vnet. */ void if_clone_restoregroup(struct ifnet *ifp) { struct if_clone *ifc; struct ifnet *ifcifp; char ifc_name[IFCLOSIZ] = { [0] = '\0' }; CURVNET_SET_QUIET(ifp->if_home_vnet); IF_CLONERS_LOCK(); LIST_FOREACH(ifc, &V_if_cloners, ifc_list) { IF_CLONE_LOCK(ifc); LIST_FOREACH(ifcifp, &ifc->ifc_iflist, if_clones) { if (ifp == ifcifp) { strncpy(ifc_name, ifc->ifc_name, IFCLOSIZ-1); break; } } IF_CLONE_UNLOCK(ifc); if (ifc_name[0] != '\0') break; } CURVNET_RESTORE(); LIST_FOREACH(ifc, &V_if_cloners, ifc_list) if (strcmp(ifc->ifc_name, ifc_name) == 0) break; IF_CLONERS_UNLOCK(); if (ifc != NULL) if_addgroup(ifp, ifc_name); } #endif /* * A utility function to extract unit numbers from interface names of * the form name###. * * Returns 0 on success and an error on failure. */ int ifc_name2unit(const char *name, int *unit) { const char *cp; int cutoff = INT_MAX / 10; int cutlim = INT_MAX % 10; for (cp = name; *cp != '\0' && (*cp < '0' || *cp > '9'); cp++) ; if (*cp == '\0') { *unit = -1; } else if (cp[0] == '0' && cp[1] != '\0') { /* Disallow leading zeroes. */ return (EINVAL); } else { for (*unit = 0; *cp != '\0'; cp++) { if (*cp < '0' || *cp > '9') { /* Bogus unit number. */ return (EINVAL); } if (*unit > cutoff || (*unit == cutoff && *cp - '0' > cutlim)) return (EINVAL); *unit = (*unit * 10) + (*cp - '0'); } } return (0); } static int ifc_alloc_unit_specific(struct if_clone *ifc, int *unit) { char name[IFNAMSIZ]; if (*unit > ifc->ifc_maxunit) return (ENOSPC); if (alloc_unr_specific(ifc->ifc_unrhdr, *unit) == -1) return (EEXIST); snprintf(name, IFNAMSIZ, "%s%d", ifc->ifc_name, *unit); if (ifunit(name) != NULL) { free_unr(ifc->ifc_unrhdr, *unit); return (EEXIST); } IF_CLONE_ADDREF(ifc); return (0); } static int ifc_alloc_unit_next(struct if_clone *ifc, int *unit) { int error; *unit = alloc_unr(ifc->ifc_unrhdr); if (*unit == -1) return (ENOSPC); free_unr(ifc->ifc_unrhdr, *unit); for (;;) { error = ifc_alloc_unit_specific(ifc, unit); if (error != EEXIST) break; (*unit)++; } return (error); } int ifc_alloc_unit(struct if_clone *ifc, int *unit) { if (*unit < 0) return (ifc_alloc_unit_next(ifc, unit)); else return (ifc_alloc_unit_specific(ifc, unit)); } void ifc_free_unit(struct if_clone *ifc, int unit) { free_unr(ifc->ifc_unrhdr, unit); IF_CLONE_REMREF(ifc); } static int ifc_simple_match(struct if_clone *ifc, const char *name) { const char *cp; int i; /* Match the name */ for (cp = name, i = 0; i < strlen(ifc->ifc_name); i++, cp++) { if (ifc->ifc_name[i] != *cp) return (0); } /* Make sure there's a unit number or nothing after the name */ for (; *cp != '\0'; cp++) { if (*cp < '0' || *cp > '9') return (0); } return (1); } static int ifc_handle_unit(struct if_clone *ifc, char *name, size_t len, int *punit) { char *dp; int wildcard; int unit; int err; err = ifc_name2unit(name, &unit); if (err != 0) return (err); wildcard = (unit < 0); err = ifc_alloc_unit(ifc, &unit); if (err != 0) return (err); /* In the wildcard case, we need to update the name. */ if (wildcard) { for (dp = name; *dp != '\0'; dp++); if (snprintf(dp, len - (dp-name), "%d", unit) > len - (dp-name) - 1) { /* * This can only be a programmer error and * there's no straightforward way to recover if * it happens. */ panic("if_clone_create(): interface name too long"); } } *punit = unit; return (0); } int ifc_copyin(const struct ifc_data *ifd, void *target, size_t len) { if (ifd->params == NULL) return (EINVAL); if (ifd->flags & IFC_F_SYSSPACE) { memcpy(target, ifd->params, len); return (0); } else return (copyin(ifd->params, target, len)); } diff --git a/sys/net/if_vlan.c b/sys/net/if_vlan.c index e7a290564722..b69d8107e30d 100644 --- a/sys/net/if_vlan.c +++ b/sys/net/if_vlan.c @@ -1,2513 +1,2512 @@ /*- * Copyright 1998 Massachusetts Institute of Technology * Copyright 2012 ADARA Networks, Inc. * Copyright 2017 Dell EMC Isilon * * Portions of this software were developed by Robert N. M. Watson under * contract to ADARA Networks, Inc. * * Permission to use, copy, modify, and distribute this software and * its documentation for any purpose and without fee is hereby * granted, provided that both the above copyright notice and this * permission notice appear in all copies, that both the above * copyright notice and this permission notice appear in all * supporting documentation, and that the name of M.I.T. not be used * in advertising or publicity pertaining to distribution of the * software without specific, written prior permission. M.I.T. makes * no representations about the suitability of this software for any * purpose. It is provided "as is" without express or implied * warranty. * * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * if_vlan.c - pseudo-device driver for IEEE 802.1Q virtual LANs. * This is sort of sneaky in the implementation, since * we need to pretend to be enough of an Ethernet implementation * to make arp work. The way we do this is by telling everyone * that we are an Ethernet, and then catch the packets that * ether_output() sends to us via if_transmit(), rewrite them for * use by the real outgoing interface, and ask it to send them. */ #include #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" -#include "opt_netlink.h" #include "opt_vlan.h" #include "opt_ratelimit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif #include #include #include #include #define VLAN_DEF_HWIDTH 4 #define VLAN_IFFLAGS (IFF_BROADCAST | IFF_MULTICAST) #define UP_AND_RUNNING(ifp) \ ((ifp)->if_flags & IFF_UP && (ifp)->if_drv_flags & IFF_DRV_RUNNING) CK_SLIST_HEAD(ifvlanhead, ifvlan); struct ifvlantrunk { struct ifnet *parent; /* parent interface of this trunk */ struct mtx lock; #ifdef VLAN_ARRAY #define VLAN_ARRAY_SIZE (EVL_VLID_MASK + 1) struct ifvlan *vlans[VLAN_ARRAY_SIZE]; /* static table */ #else struct ifvlanhead *hash; /* dynamic hash-list table */ uint16_t hmask; uint16_t hwidth; #endif int refcnt; }; #if defined(KERN_TLS) || defined(RATELIMIT) struct vlan_snd_tag { struct m_snd_tag com; struct m_snd_tag *tag; }; static inline struct vlan_snd_tag * mst_to_vst(struct m_snd_tag *mst) { return (__containerof(mst, struct vlan_snd_tag, com)); } #endif /* * This macro provides a facility to iterate over every vlan on a trunk with * the assumption that none will be added/removed during iteration. */ #ifdef VLAN_ARRAY #define VLAN_FOREACH(_ifv, _trunk) \ size_t _i; \ for (_i = 0; _i < VLAN_ARRAY_SIZE; _i++) \ if (((_ifv) = (_trunk)->vlans[_i]) != NULL) #else /* VLAN_ARRAY */ #define VLAN_FOREACH(_ifv, _trunk) \ struct ifvlan *_next; \ size_t _i; \ for (_i = 0; _i < (1 << (_trunk)->hwidth); _i++) \ CK_SLIST_FOREACH_SAFE((_ifv), &(_trunk)->hash[_i], ifv_list, _next) #endif /* VLAN_ARRAY */ /* * This macro provides a facility to iterate over every vlan on a trunk while * also modifying the number of vlans on the trunk. The iteration continues * until some condition is met or there are no more vlans on the trunk. */ #ifdef VLAN_ARRAY /* The VLAN_ARRAY case is simple -- just a for loop using the condition. */ #define VLAN_FOREACH_UNTIL_SAFE(_ifv, _trunk, _cond) \ size_t _i; \ for (_i = 0; !(_cond) && _i < VLAN_ARRAY_SIZE; _i++) \ if (((_ifv) = (_trunk)->vlans[_i])) #else /* VLAN_ARRAY */ /* * The hash table case is more complicated. We allow for the hash table to be * modified (i.e. vlans removed) while we are iterating over it. To allow for * this we must restart the iteration every time we "touch" something during * the iteration, since removal will resize the hash table and invalidate our * current position. If acting on the touched element causes the trunk to be * emptied, then iteration also stops. */ #define VLAN_FOREACH_UNTIL_SAFE(_ifv, _trunk, _cond) \ size_t _i; \ bool _touch = false; \ for (_i = 0; \ !(_cond) && _i < (1 << (_trunk)->hwidth); \ _i = (_touch && ((_trunk) != NULL) ? 0 : _i + 1), _touch = false) \ if (((_ifv) = CK_SLIST_FIRST(&(_trunk)->hash[_i])) != NULL && \ (_touch = true)) #endif /* VLAN_ARRAY */ struct vlan_mc_entry { struct sockaddr_dl mc_addr; CK_SLIST_ENTRY(vlan_mc_entry) mc_entries; struct epoch_context mc_epoch_ctx; }; struct ifvlan { struct ifvlantrunk *ifv_trunk; struct ifnet *ifv_ifp; #define TRUNK(ifv) ((ifv)->ifv_trunk) #define PARENT(ifv) (TRUNK(ifv)->parent) void *ifv_cookie; int ifv_pflags; /* special flags we have set on parent */ int ifv_capenable; int ifv_encaplen; /* encapsulation length */ int ifv_mtufudge; /* MTU fudged by this much */ int ifv_mintu; /* min transmission unit */ struct ether_8021q_tag ifv_qtag; #define ifv_proto ifv_qtag.proto #define ifv_vid ifv_qtag.vid #define ifv_pcp ifv_qtag.pcp struct task lladdr_task; CK_SLIST_HEAD(, vlan_mc_entry) vlan_mc_listhead; #ifndef VLAN_ARRAY CK_SLIST_ENTRY(ifvlan) ifv_list; #endif }; /* Special flags we should propagate to parent. */ static struct { int flag; int (*func)(struct ifnet *, int); } vlan_pflags[] = { {IFF_PROMISC, ifpromisc}, {IFF_ALLMULTI, if_allmulti}, {0, NULL} }; VNET_DECLARE(int, vlan_mtag_pcp); #define V_vlan_mtag_pcp VNET(vlan_mtag_pcp) static const char vlanname[] = "vlan"; static MALLOC_DEFINE(M_VLAN, vlanname, "802.1Q Virtual LAN Interface"); static eventhandler_tag ifdetach_tag; static eventhandler_tag iflladdr_tag; static eventhandler_tag ifevent_tag; /* * if_vlan uses two module-level synchronizations primitives to allow concurrent * modification of vlan interfaces and (mostly) allow for vlans to be destroyed * while they are being used for tx/rx. To accomplish this in a way that has * acceptable performance and cooperation with other parts of the network stack * there is a non-sleepable epoch(9) and an sx(9). * * The performance-sensitive paths that warrant using the epoch(9) are * vlan_transmit and vlan_input. Both have to check for the vlan interface's * existence using if_vlantrunk, and being in the network tx/rx paths the use * of an epoch(9) gives a measureable improvement in performance. * * The reason for having an sx(9) is mostly because there are still areas that * must be sleepable and also have safe concurrent access to a vlan interface. * Since the sx(9) exists, it is used by default in most paths unless sleeping * is not permitted, or if it is not clear whether sleeping is permitted. * */ #define _VLAN_SX_ID ifv_sx static struct sx _VLAN_SX_ID; #define VLAN_LOCKING_INIT() \ sx_init_flags(&_VLAN_SX_ID, "vlan_sx", SX_RECURSE) #define VLAN_LOCKING_DESTROY() \ sx_destroy(&_VLAN_SX_ID) #define VLAN_SLOCK() sx_slock(&_VLAN_SX_ID) #define VLAN_SUNLOCK() sx_sunlock(&_VLAN_SX_ID) #define VLAN_XLOCK() sx_xlock(&_VLAN_SX_ID) #define VLAN_XUNLOCK() sx_xunlock(&_VLAN_SX_ID) #define VLAN_SLOCK_ASSERT() sx_assert(&_VLAN_SX_ID, SA_SLOCKED) #define VLAN_XLOCK_ASSERT() sx_assert(&_VLAN_SX_ID, SA_XLOCKED) #define VLAN_SXLOCK_ASSERT() sx_assert(&_VLAN_SX_ID, SA_LOCKED) /* * We also have a per-trunk mutex that should be acquired when changing * its state. */ #define TRUNK_LOCK_INIT(trunk) mtx_init(&(trunk)->lock, vlanname, NULL, MTX_DEF) #define TRUNK_LOCK_DESTROY(trunk) mtx_destroy(&(trunk)->lock) #define TRUNK_WLOCK(trunk) mtx_lock(&(trunk)->lock) #define TRUNK_WUNLOCK(trunk) mtx_unlock(&(trunk)->lock) #define TRUNK_WLOCK_ASSERT(trunk) mtx_assert(&(trunk)->lock, MA_OWNED); /* * The VLAN_ARRAY substitutes the dynamic hash with a static array * with 4096 entries. In theory this can give a boost in processing, * however in practice it does not. Probably this is because the array * is too big to fit into CPU cache. */ #ifndef VLAN_ARRAY static void vlan_inithash(struct ifvlantrunk *trunk); static void vlan_freehash(struct ifvlantrunk *trunk); static int vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv); static int vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv); static void vlan_growhash(struct ifvlantrunk *trunk, int howmuch); static __inline struct ifvlan * vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid); #endif static void trunk_destroy(struct ifvlantrunk *trunk); static void vlan_init(void *foo); static void vlan_input(struct ifnet *ifp, struct mbuf *m); static int vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr); #if defined(KERN_TLS) || defined(RATELIMIT) static int vlan_snd_tag_alloc(struct ifnet *, union if_snd_tag_alloc_params *, struct m_snd_tag **); static int vlan_snd_tag_modify(struct m_snd_tag *, union if_snd_tag_modify_params *); static int vlan_snd_tag_query(struct m_snd_tag *, union if_snd_tag_query_params *); static void vlan_snd_tag_free(struct m_snd_tag *); static struct m_snd_tag *vlan_next_snd_tag(struct m_snd_tag *); static void vlan_ratelimit_query(struct ifnet *, struct if_ratelimit_query_results *); #endif static void vlan_qflush(struct ifnet *ifp); static int vlan_setflag(struct ifnet *ifp, int flag, int status, int (*func)(struct ifnet *, int)); static int vlan_setflags(struct ifnet *ifp, int status); static int vlan_setmulti(struct ifnet *ifp); static int vlan_transmit(struct ifnet *ifp, struct mbuf *m); #ifdef ALTQ static void vlan_altq_start(struct ifnet *ifp); static int vlan_altq_transmit(struct ifnet *ifp, struct mbuf *m); #endif static int vlan_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro); static void vlan_unconfig(struct ifnet *ifp); static void vlan_unconfig_locked(struct ifnet *ifp, int departing); static int vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t tag, uint16_t proto); static void vlan_link_state(struct ifnet *ifp); static void vlan_capabilities(struct ifvlan *ifv); static void vlan_trunk_capabilities(struct ifnet *ifp); static struct ifnet *vlan_clone_match_ethervid(const char *, int *); static int vlan_clone_match(struct if_clone *, const char *); static int vlan_clone_create(struct if_clone *, char *, size_t, struct ifc_data *, struct ifnet **); static int vlan_clone_destroy(struct if_clone *, struct ifnet *, uint32_t); static int vlan_clone_create_nl(struct if_clone *ifc, char *name, size_t len, struct ifc_data_nl *ifd); static int vlan_clone_modify_nl(struct ifnet *ifp, struct ifc_data_nl *ifd); static void vlan_clone_dump_nl(struct ifnet *ifp, struct nl_writer *nw); static void vlan_ifdetach(void *arg, struct ifnet *ifp); static void vlan_iflladdr(void *arg, struct ifnet *ifp); static void vlan_ifevent(void *arg, struct ifnet *ifp, int event); static void vlan_lladdr_fn(void *arg, int pending); static struct if_clone *vlan_cloner; #ifdef VIMAGE VNET_DEFINE_STATIC(struct if_clone *, vlan_cloner); #define V_vlan_cloner VNET(vlan_cloner) #endif #ifdef RATELIMIT static const struct if_snd_tag_sw vlan_snd_tag_ul_sw = { .snd_tag_modify = vlan_snd_tag_modify, .snd_tag_query = vlan_snd_tag_query, .snd_tag_free = vlan_snd_tag_free, .next_snd_tag = vlan_next_snd_tag, .type = IF_SND_TAG_TYPE_UNLIMITED }; static const struct if_snd_tag_sw vlan_snd_tag_rl_sw = { .snd_tag_modify = vlan_snd_tag_modify, .snd_tag_query = vlan_snd_tag_query, .snd_tag_free = vlan_snd_tag_free, .next_snd_tag = vlan_next_snd_tag, .type = IF_SND_TAG_TYPE_RATE_LIMIT }; #endif #ifdef KERN_TLS static const struct if_snd_tag_sw vlan_snd_tag_tls_sw = { .snd_tag_modify = vlan_snd_tag_modify, .snd_tag_query = vlan_snd_tag_query, .snd_tag_free = vlan_snd_tag_free, .next_snd_tag = vlan_next_snd_tag, .type = IF_SND_TAG_TYPE_TLS }; #ifdef RATELIMIT static const struct if_snd_tag_sw vlan_snd_tag_tls_rl_sw = { .snd_tag_modify = vlan_snd_tag_modify, .snd_tag_query = vlan_snd_tag_query, .snd_tag_free = vlan_snd_tag_free, .next_snd_tag = vlan_next_snd_tag, .type = IF_SND_TAG_TYPE_TLS_RATE_LIMIT }; #endif #endif static void vlan_mc_free(struct epoch_context *ctx) { struct vlan_mc_entry *mc = __containerof(ctx, struct vlan_mc_entry, mc_epoch_ctx); free(mc, M_VLAN); } #ifndef VLAN_ARRAY #define HASH(n, m) ((((n) >> 8) ^ ((n) >> 4) ^ (n)) & (m)) static void vlan_inithash(struct ifvlantrunk *trunk) { int i, n; /* * The trunk must not be locked here since we call malloc(M_WAITOK). * It is OK in case this function is called before the trunk struct * gets hooked up and becomes visible from other threads. */ KASSERT(trunk->hwidth == 0 && trunk->hash == NULL, ("%s: hash already initialized", __func__)); trunk->hwidth = VLAN_DEF_HWIDTH; n = 1 << trunk->hwidth; trunk->hmask = n - 1; trunk->hash = malloc(sizeof(struct ifvlanhead) * n, M_VLAN, M_WAITOK); for (i = 0; i < n; i++) CK_SLIST_INIT(&trunk->hash[i]); } static void vlan_freehash(struct ifvlantrunk *trunk) { #ifdef INVARIANTS int i; KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); for (i = 0; i < (1 << trunk->hwidth); i++) KASSERT(CK_SLIST_EMPTY(&trunk->hash[i]), ("%s: hash table not empty", __func__)); #endif free(trunk->hash, M_VLAN); trunk->hash = NULL; trunk->hwidth = trunk->hmask = 0; } static int vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv) { int i, b; struct ifvlan *ifv2; VLAN_XLOCK_ASSERT(); KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); b = 1 << trunk->hwidth; i = HASH(ifv->ifv_vid, trunk->hmask); CK_SLIST_FOREACH(ifv2, &trunk->hash[i], ifv_list) if (ifv->ifv_vid == ifv2->ifv_vid) return (EEXIST); /* * Grow the hash when the number of vlans exceeds half of the number of * hash buckets squared. This will make the average linked-list length * buckets/2. */ if (trunk->refcnt > (b * b) / 2) { vlan_growhash(trunk, 1); i = HASH(ifv->ifv_vid, trunk->hmask); } CK_SLIST_INSERT_HEAD(&trunk->hash[i], ifv, ifv_list); trunk->refcnt++; return (0); } static int vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv) { int i, b; struct ifvlan *ifv2; VLAN_XLOCK_ASSERT(); KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); b = 1 << (trunk->hwidth - 1); i = HASH(ifv->ifv_vid, trunk->hmask); CK_SLIST_FOREACH(ifv2, &trunk->hash[i], ifv_list) if (ifv2 == ifv) { trunk->refcnt--; CK_SLIST_REMOVE(&trunk->hash[i], ifv2, ifvlan, ifv_list); if (trunk->refcnt < (b * b) / 2) vlan_growhash(trunk, -1); return (0); } panic("%s: vlan not found\n", __func__); return (ENOENT); /*NOTREACHED*/ } /* * Grow the hash larger or smaller if memory permits. */ static void vlan_growhash(struct ifvlantrunk *trunk, int howmuch) { struct ifvlan *ifv; struct ifvlanhead *hash2; int hwidth2, i, j, n, n2; VLAN_XLOCK_ASSERT(); KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); if (howmuch == 0) { /* Harmless yet obvious coding error */ printf("%s: howmuch is 0\n", __func__); return; } hwidth2 = trunk->hwidth + howmuch; n = 1 << trunk->hwidth; n2 = 1 << hwidth2; /* Do not shrink the table below the default */ if (hwidth2 < VLAN_DEF_HWIDTH) return; hash2 = malloc(sizeof(struct ifvlanhead) * n2, M_VLAN, M_WAITOK); if (hash2 == NULL) { printf("%s: out of memory -- hash size not changed\n", __func__); return; /* We can live with the old hash table */ } for (j = 0; j < n2; j++) CK_SLIST_INIT(&hash2[j]); for (i = 0; i < n; i++) while ((ifv = CK_SLIST_FIRST(&trunk->hash[i])) != NULL) { CK_SLIST_REMOVE(&trunk->hash[i], ifv, ifvlan, ifv_list); j = HASH(ifv->ifv_vid, n2 - 1); CK_SLIST_INSERT_HEAD(&hash2[j], ifv, ifv_list); } NET_EPOCH_WAIT(); free(trunk->hash, M_VLAN); trunk->hash = hash2; trunk->hwidth = hwidth2; trunk->hmask = n2 - 1; if (bootverbose) if_printf(trunk->parent, "VLAN hash table resized from %d to %d buckets\n", n, n2); } static __inline struct ifvlan * vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid) { struct ifvlan *ifv; NET_EPOCH_ASSERT(); CK_SLIST_FOREACH(ifv, &trunk->hash[HASH(vid, trunk->hmask)], ifv_list) if (ifv->ifv_vid == vid) return (ifv); return (NULL); } #if 0 /* Debugging code to view the hashtables. */ static void vlan_dumphash(struct ifvlantrunk *trunk) { int i; struct ifvlan *ifv; for (i = 0; i < (1 << trunk->hwidth); i++) { printf("%d: ", i); CK_SLIST_FOREACH(ifv, &trunk->hash[i], ifv_list) printf("%s ", ifv->ifv_ifp->if_xname); printf("\n"); } } #endif /* 0 */ #else static __inline struct ifvlan * vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid) { return trunk->vlans[vid]; } static __inline int vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv) { if (trunk->vlans[ifv->ifv_vid] != NULL) return EEXIST; trunk->vlans[ifv->ifv_vid] = ifv; trunk->refcnt++; return (0); } static __inline int vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv) { trunk->vlans[ifv->ifv_vid] = NULL; trunk->refcnt--; return (0); } static __inline void vlan_freehash(struct ifvlantrunk *trunk) { } static __inline void vlan_inithash(struct ifvlantrunk *trunk) { } #endif /* !VLAN_ARRAY */ static void trunk_destroy(struct ifvlantrunk *trunk) { VLAN_XLOCK_ASSERT(); vlan_freehash(trunk); trunk->parent->if_vlantrunk = NULL; TRUNK_LOCK_DESTROY(trunk); if_rele(trunk->parent); free(trunk, M_VLAN); } /* * Program our multicast filter. What we're actually doing is * programming the multicast filter of the parent. This has the * side effect of causing the parent interface to receive multicast * traffic that it doesn't really want, which ends up being discarded * later by the upper protocol layers. Unfortunately, there's no way * to avoid this: there really is only one physical interface. */ static int vlan_setmulti(struct ifnet *ifp) { struct ifnet *ifp_p; struct ifmultiaddr *ifma; struct ifvlan *sc; struct vlan_mc_entry *mc; int error; VLAN_XLOCK_ASSERT(); /* Find the parent. */ sc = ifp->if_softc; ifp_p = PARENT(sc); CURVNET_SET_QUIET(ifp_p->if_vnet); /* First, remove any existing filter entries. */ while ((mc = CK_SLIST_FIRST(&sc->vlan_mc_listhead)) != NULL) { CK_SLIST_REMOVE_HEAD(&sc->vlan_mc_listhead, mc_entries); (void)if_delmulti(ifp_p, (struct sockaddr *)&mc->mc_addr); NET_EPOCH_CALL(vlan_mc_free, &mc->mc_epoch_ctx); } /* Now program new ones. */ IF_ADDR_WLOCK(ifp); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; mc = malloc(sizeof(struct vlan_mc_entry), M_VLAN, M_NOWAIT); if (mc == NULL) { IF_ADDR_WUNLOCK(ifp); CURVNET_RESTORE(); return (ENOMEM); } bcopy(ifma->ifma_addr, &mc->mc_addr, ifma->ifma_addr->sa_len); mc->mc_addr.sdl_index = ifp_p->if_index; CK_SLIST_INSERT_HEAD(&sc->vlan_mc_listhead, mc, mc_entries); } IF_ADDR_WUNLOCK(ifp); CK_SLIST_FOREACH (mc, &sc->vlan_mc_listhead, mc_entries) { error = if_addmulti(ifp_p, (struct sockaddr *)&mc->mc_addr, NULL); if (error) { CURVNET_RESTORE(); return (error); } } CURVNET_RESTORE(); return (0); } /* * A handler for interface ifnet events. */ static void vlan_ifevent(void *arg __unused, struct ifnet *ifp, int event) { struct epoch_tracker et; struct ifvlan *ifv; struct ifvlantrunk *trunk; if (event != IFNET_EVENT_UPDATE_BAUDRATE) return; NET_EPOCH_ENTER(et); trunk = ifp->if_vlantrunk; if (trunk == NULL) { NET_EPOCH_EXIT(et); return; } TRUNK_WLOCK(trunk); VLAN_FOREACH(ifv, trunk) { ifv->ifv_ifp->if_baudrate = ifp->if_baudrate; } TRUNK_WUNLOCK(trunk); NET_EPOCH_EXIT(et); } /* * A handler for parent interface link layer address changes. * If the parent interface link layer address is changed we * should also change it on all children vlans. */ static void vlan_iflladdr(void *arg __unused, struct ifnet *ifp) { struct epoch_tracker et; struct ifvlan *ifv; struct ifnet *ifv_ifp; struct ifvlantrunk *trunk; struct sockaddr_dl *sdl; /* Need the epoch since this is run on taskqueue_swi. */ NET_EPOCH_ENTER(et); trunk = ifp->if_vlantrunk; if (trunk == NULL) { NET_EPOCH_EXIT(et); return; } /* * OK, it's a trunk. Loop over and change all vlan's lladdrs on it. * We need an exclusive lock here to prevent concurrent SIOCSIFLLADDR * ioctl calls on the parent garbling the lladdr of the child vlan. */ TRUNK_WLOCK(trunk); VLAN_FOREACH(ifv, trunk) { /* * Copy new new lladdr into the ifv_ifp, enqueue a task * to actually call if_setlladdr. if_setlladdr needs to * be deferred to a taskqueue because it will call into * the if_vlan ioctl path and try to acquire the global * lock. */ ifv_ifp = ifv->ifv_ifp; bcopy(IF_LLADDR(ifp), IF_LLADDR(ifv_ifp), ifp->if_addrlen); sdl = (struct sockaddr_dl *)ifv_ifp->if_addr->ifa_addr; sdl->sdl_alen = ifp->if_addrlen; taskqueue_enqueue(taskqueue_thread, &ifv->lladdr_task); } TRUNK_WUNLOCK(trunk); NET_EPOCH_EXIT(et); } /* * A handler for network interface departure events. * Track departure of trunks here so that we don't access invalid * pointers or whatever if a trunk is ripped from under us, e.g., * by ejecting its hot-plug card. However, if an ifnet is simply * being renamed, then there's no need to tear down the state. */ static void vlan_ifdetach(void *arg __unused, struct ifnet *ifp) { struct ifvlan *ifv; struct ifvlantrunk *trunk; /* If the ifnet is just being renamed, don't do anything. */ if (ifp->if_flags & IFF_RENAMING) return; VLAN_XLOCK(); trunk = ifp->if_vlantrunk; if (trunk == NULL) { VLAN_XUNLOCK(); return; } /* * OK, it's a trunk. Loop over and detach all vlan's on it. * Check trunk pointer after each vlan_unconfig() as it will * free it and set to NULL after the last vlan was detached. */ VLAN_FOREACH_UNTIL_SAFE(ifv, ifp->if_vlantrunk, ifp->if_vlantrunk == NULL) vlan_unconfig_locked(ifv->ifv_ifp, 1); /* Trunk should have been destroyed in vlan_unconfig(). */ KASSERT(ifp->if_vlantrunk == NULL, ("%s: purge failed", __func__)); VLAN_XUNLOCK(); } /* * Return the trunk device for a virtual interface. */ static struct ifnet * vlan_trunkdev(struct ifnet *ifp) { struct ifvlan *ifv; NET_EPOCH_ASSERT(); if (ifp->if_type != IFT_L2VLAN) return (NULL); ifv = ifp->if_softc; ifp = NULL; if (ifv->ifv_trunk) ifp = PARENT(ifv); return (ifp); } /* * Return the 12-bit VLAN VID for this interface, for use by external * components such as Infiniband. * * XXXRW: Note that the function name here is historical; it should be named * vlan_vid(). */ static int vlan_tag(struct ifnet *ifp, uint16_t *vidp) { struct ifvlan *ifv; if (ifp->if_type != IFT_L2VLAN) return (EINVAL); ifv = ifp->if_softc; *vidp = ifv->ifv_vid; return (0); } static int vlan_pcp(struct ifnet *ifp, uint16_t *pcpp) { struct ifvlan *ifv; if (ifp->if_type != IFT_L2VLAN) return (EINVAL); ifv = ifp->if_softc; *pcpp = ifv->ifv_pcp; return (0); } /* * Return a driver specific cookie for this interface. Synchronization * with setcookie must be provided by the driver. */ static void * vlan_cookie(struct ifnet *ifp) { struct ifvlan *ifv; if (ifp->if_type != IFT_L2VLAN) return (NULL); ifv = ifp->if_softc; return (ifv->ifv_cookie); } /* * Store a cookie in our softc that drivers can use to store driver * private per-instance data in. */ static int vlan_setcookie(struct ifnet *ifp, void *cookie) { struct ifvlan *ifv; if (ifp->if_type != IFT_L2VLAN) return (EINVAL); ifv = ifp->if_softc; ifv->ifv_cookie = cookie; return (0); } /* * Return the vlan device present at the specific VID. */ static struct ifnet * vlan_devat(struct ifnet *ifp, uint16_t vid) { struct ifvlantrunk *trunk; struct ifvlan *ifv; NET_EPOCH_ASSERT(); trunk = ifp->if_vlantrunk; if (trunk == NULL) return (NULL); ifp = NULL; ifv = vlan_gethash(trunk, vid); if (ifv) ifp = ifv->ifv_ifp; return (ifp); } /* * VLAN support can be loaded as a module. The only place in the * system that's intimately aware of this is ether_input. We hook * into this code through vlan_input_p which is defined there and * set here. No one else in the system should be aware of this so * we use an explicit reference here. */ extern void (*vlan_input_p)(struct ifnet *, struct mbuf *); /* For if_link_state_change() eyes only... */ extern void (*vlan_link_state_p)(struct ifnet *); static struct if_clone_addreq_v2 vlan_addreq = { .version = 2, .match_f = vlan_clone_match, .create_f = vlan_clone_create, .destroy_f = vlan_clone_destroy, .create_nl_f = vlan_clone_create_nl, .modify_nl_f = vlan_clone_modify_nl, .dump_nl_f = vlan_clone_dump_nl, }; static int vlan_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: ifdetach_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, vlan_ifdetach, NULL, EVENTHANDLER_PRI_ANY); if (ifdetach_tag == NULL) return (ENOMEM); iflladdr_tag = EVENTHANDLER_REGISTER(iflladdr_event, vlan_iflladdr, NULL, EVENTHANDLER_PRI_ANY); if (iflladdr_tag == NULL) return (ENOMEM); ifevent_tag = EVENTHANDLER_REGISTER(ifnet_event, vlan_ifevent, NULL, EVENTHANDLER_PRI_ANY); if (ifevent_tag == NULL) return (ENOMEM); VLAN_LOCKING_INIT(); vlan_input_p = vlan_input; vlan_link_state_p = vlan_link_state; vlan_trunk_cap_p = vlan_trunk_capabilities; vlan_trunkdev_p = vlan_trunkdev; vlan_cookie_p = vlan_cookie; vlan_setcookie_p = vlan_setcookie; vlan_tag_p = vlan_tag; vlan_pcp_p = vlan_pcp; vlan_devat_p = vlan_devat; #ifndef VIMAGE vlan_cloner = ifc_attach_cloner(vlanname, (struct if_clone_addreq *)&vlan_addreq); #endif if (bootverbose) printf("vlan: initialized, using " #ifdef VLAN_ARRAY "full-size arrays" #else "hash tables with chaining" #endif "\n"); break; case MOD_UNLOAD: #ifndef VIMAGE ifc_detach_cloner(vlan_cloner); #endif EVENTHANDLER_DEREGISTER(ifnet_departure_event, ifdetach_tag); EVENTHANDLER_DEREGISTER(iflladdr_event, iflladdr_tag); EVENTHANDLER_DEREGISTER(ifnet_event, ifevent_tag); vlan_input_p = NULL; vlan_link_state_p = NULL; vlan_trunk_cap_p = NULL; vlan_trunkdev_p = NULL; vlan_tag_p = NULL; vlan_cookie_p = NULL; vlan_setcookie_p = NULL; vlan_devat_p = NULL; VLAN_LOCKING_DESTROY(); if (bootverbose) printf("vlan: unloaded\n"); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t vlan_mod = { "if_vlan", vlan_modevent, 0 }; DECLARE_MODULE(if_vlan, vlan_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_vlan, 3); #ifdef VIMAGE static void vnet_vlan_init(const void *unused __unused) { vlan_cloner = ifc_attach_cloner(vlanname, (struct if_clone_addreq *)&vlan_addreq); V_vlan_cloner = vlan_cloner; } VNET_SYSINIT(vnet_vlan_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_vlan_init, NULL); static void vnet_vlan_uninit(const void *unused __unused) { ifc_detach_cloner(V_vlan_cloner); } VNET_SYSUNINIT(vnet_vlan_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY, vnet_vlan_uninit, NULL); #endif /* * Check for .[. ...] style interface names. */ static struct ifnet * vlan_clone_match_ethervid(const char *name, int *vidp) { char ifname[IFNAMSIZ]; char *cp; struct ifnet *ifp; int vid; strlcpy(ifname, name, IFNAMSIZ); if ((cp = strrchr(ifname, '.')) == NULL) return (NULL); *cp = '\0'; if ((ifp = ifunit_ref(ifname)) == NULL) return (NULL); /* Parse VID. */ if (*++cp == '\0') { if_rele(ifp); return (NULL); } vid = 0; for(; *cp >= '0' && *cp <= '9'; cp++) vid = (vid * 10) + (*cp - '0'); if (*cp != '\0') { if_rele(ifp); return (NULL); } if (vidp != NULL) *vidp = vid; return (ifp); } static int vlan_clone_match(struct if_clone *ifc, const char *name) { struct ifnet *ifp; const char *cp; ifp = vlan_clone_match_ethervid(name, NULL); if (ifp != NULL) { if_rele(ifp); return (1); } if (strncmp(vlanname, name, strlen(vlanname)) != 0) return (0); for (cp = name + 4; *cp != '\0'; cp++) { if (*cp < '0' || *cp > '9') return (0); } return (1); } static int vlan_clone_create(struct if_clone *ifc, char *name, size_t len, struct ifc_data *ifd, struct ifnet **ifpp) { char *dp; bool wildcard = false; bool subinterface = false; int unit; int error; int vid = 0; uint16_t proto = ETHERTYPE_VLAN; struct ifvlan *ifv; struct ifnet *ifp; struct ifnet *p = NULL; struct ifaddr *ifa; struct sockaddr_dl *sdl; struct vlanreq vlr; static const u_char eaddr[ETHER_ADDR_LEN]; /* 00:00:00:00:00:00 */ /* * There are three ways to specify the cloned device: * o pass a parameter block with the clone request. * o specify parameters in the text of the clone device name * o specify no parameters and get an unattached device that * must be configured separately. * The first technique is preferred; the latter two are supported * for backwards compatibility. * * XXXRW: Note historic use of the word "tag" here. New ioctls may be * called for. */ if (ifd->params != NULL) { error = ifc_copyin(ifd, &vlr, sizeof(vlr)); if (error) return error; vid = vlr.vlr_tag; proto = vlr.vlr_proto; if (proto == 0) proto = ETHERTYPE_VLAN; p = ifunit_ref(vlr.vlr_parent); if (p == NULL) return (ENXIO); } if ((error = ifc_name2unit(name, &unit)) == 0) { /* * vlanX interface. Set wildcard to true if the unit number * is not fixed (-1) */ wildcard = (unit < 0); } else { struct ifnet *p_tmp = vlan_clone_match_ethervid(name, &vid); if (p_tmp != NULL) { error = 0; subinterface = true; unit = IF_DUNIT_NONE; wildcard = false; if (p != NULL) { if_rele(p_tmp); if (p != p_tmp) error = EINVAL; } else p = p_tmp; } else error = ENXIO; } if (error != 0) { if (p != NULL) if_rele(p); return (error); } if (!subinterface) { /* vlanX interface, mark X as busy or allocate new unit # */ error = ifc_alloc_unit(ifc, &unit); if (error != 0) { if (p != NULL) if_rele(p); return (error); } } /* In the wildcard case, we need to update the name. */ if (wildcard) { for (dp = name; *dp != '\0'; dp++); if (snprintf(dp, len - (dp-name), "%d", unit) > len - (dp-name) - 1) { panic("%s: interface name too long", __func__); } } ifv = malloc(sizeof(struct ifvlan), M_VLAN, M_WAITOK | M_ZERO); ifp = ifv->ifv_ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { if (!subinterface) ifc_free_unit(ifc, unit); free(ifv, M_VLAN); if (p != NULL) if_rele(p); return (ENOSPC); } CK_SLIST_INIT(&ifv->vlan_mc_listhead); ifp->if_softc = ifv; /* * Set the name manually rather than using if_initname because * we don't conform to the default naming convention for interfaces. */ strlcpy(ifp->if_xname, name, IFNAMSIZ); ifp->if_dname = vlanname; ifp->if_dunit = unit; ifp->if_init = vlan_init; #ifdef ALTQ ifp->if_start = vlan_altq_start; ifp->if_transmit = vlan_altq_transmit; IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); ifp->if_snd.ifq_drv_maxlen = 0; IFQ_SET_READY(&ifp->if_snd); #else ifp->if_transmit = vlan_transmit; #endif ifp->if_qflush = vlan_qflush; ifp->if_ioctl = vlan_ioctl; #if defined(KERN_TLS) || defined(RATELIMIT) ifp->if_snd_tag_alloc = vlan_snd_tag_alloc; ifp->if_ratelimit_query = vlan_ratelimit_query; #endif ifp->if_flags = VLAN_IFFLAGS; ether_ifattach(ifp, eaddr); /* Now undo some of the damage... */ ifp->if_baudrate = 0; ifp->if_type = IFT_L2VLAN; ifp->if_hdrlen = ETHER_VLAN_ENCAP_LEN; ifa = ifp->if_addr; sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_type = IFT_L2VLAN; if (p != NULL) { error = vlan_config(ifv, p, vid, proto); if_rele(p); if (error != 0) { /* * Since we've partially failed, we need to back * out all the way, otherwise userland could get * confused. Thus, we destroy the interface. */ ether_ifdetach(ifp); vlan_unconfig(ifp); if_free(ifp); if (!subinterface) ifc_free_unit(ifc, unit); free(ifv, M_VLAN); return (error); } } *ifpp = ifp; return (0); } /* * * Parsers of IFLA_INFO_DATA inside IFLA_LINKINFO of RTM_NEWLINK * {{nla_len=8, nla_type=IFLA_LINK}, 2}, * {{nla_len=12, nla_type=IFLA_IFNAME}, "xvlan22"}, * {{nla_len=24, nla_type=IFLA_LINKINFO}, * [ * {{nla_len=8, nla_type=IFLA_INFO_KIND}, "vlan"...}, * {{nla_len=12, nla_type=IFLA_INFO_DATA}, "\x06\x00\x01\x00\x16\x00\x00\x00"}]} */ struct nl_parsed_vlan { uint16_t vlan_id; uint16_t vlan_proto; struct ifla_vlan_flags vlan_flags; }; #define _OUT(_field) offsetof(struct nl_parsed_vlan, _field) static const struct nlattr_parser nla_p_vlan[] = { { .type = IFLA_VLAN_ID, .off = _OUT(vlan_id), .cb = nlattr_get_uint16 }, { .type = IFLA_VLAN_FLAGS, .off = _OUT(vlan_flags), .cb = nlattr_get_nla }, { .type = IFLA_VLAN_PROTOCOL, .off = _OUT(vlan_proto), .cb = nlattr_get_uint16 }, }; #undef _OUT NL_DECLARE_ATTR_PARSER(vlan_parser, nla_p_vlan); static int vlan_clone_create_nl(struct if_clone *ifc, char *name, size_t len, struct ifc_data_nl *ifd) { struct epoch_tracker et; struct ifnet *ifp_parent; struct nl_pstate *npt = ifd->npt; struct nl_parsed_link *lattrs = ifd->lattrs; int error; /* * lattrs.ifla_ifname is the new interface name * lattrs.ifi_index contains parent interface index * lattrs.ifla_idata contains un-parsed vlan data */ struct nl_parsed_vlan attrs = { .vlan_id = 0xFEFE, .vlan_proto = ETHERTYPE_VLAN }; if (lattrs->ifla_idata == NULL) { nlmsg_report_err_msg(npt, "vlan id is required, guessing not supported"); return (ENOTSUP); } error = nl_parse_nested(lattrs->ifla_idata, &vlan_parser, npt, &attrs); if (error != 0) return (error); if (attrs.vlan_id > 4095) { nlmsg_report_err_msg(npt, "Invalid VID: %d", attrs.vlan_id); return (EINVAL); } if (attrs.vlan_proto != ETHERTYPE_VLAN && attrs.vlan_proto != ETHERTYPE_QINQ) { nlmsg_report_err_msg(npt, "Unsupported ethertype: 0x%04X", attrs.vlan_proto); return (ENOTSUP); } struct vlanreq params = { .vlr_tag = attrs.vlan_id, .vlr_proto = attrs.vlan_proto, }; struct ifc_data ifd_new = { .flags = IFC_F_SYSSPACE, .unit = ifd->unit, .params = ¶ms }; NET_EPOCH_ENTER(et); ifp_parent = ifnet_byindex(lattrs->ifi_index); if (ifp_parent != NULL) strlcpy(params.vlr_parent, if_name(ifp_parent), sizeof(params.vlr_parent)); NET_EPOCH_EXIT(et); if (ifp_parent == NULL) { nlmsg_report_err_msg(npt, "unable to find parent interface %u", lattrs->ifi_index); return (ENOENT); } error = vlan_clone_create(ifc, name, len, &ifd_new, &ifd->ifp); return (error); } static int vlan_clone_modify_nl(struct ifnet *ifp, struct ifc_data_nl *ifd) { struct nl_parsed_link *lattrs = ifd->lattrs; if ((lattrs->ifla_idata != NULL) && ((ifd->flags & IFC_F_CREATE) == 0)) { struct epoch_tracker et; struct nl_parsed_vlan attrs = { .vlan_proto = ETHERTYPE_VLAN, }; int error; error = nl_parse_nested(lattrs->ifla_idata, &vlan_parser, ifd->npt, &attrs); if (error != 0) return (error); NET_EPOCH_ENTER(et); struct ifnet *ifp_parent = ifnet_byindex_ref(lattrs->ifla_link); NET_EPOCH_EXIT(et); if (ifp_parent == NULL) { nlmsg_report_err_msg(ifd->npt, "unable to find parent interface %u", lattrs->ifla_link); return (ENOENT); } struct ifvlan *ifv = ifp->if_softc; error = vlan_config(ifv, ifp_parent, attrs.vlan_id, attrs.vlan_proto); if_rele(ifp_parent); if (error != 0) return (error); } return (nl_modify_ifp_generic(ifp, ifd->lattrs, ifd->bm, ifd->npt)); } /* * {{nla_len=24, nla_type=IFLA_LINKINFO}, * [ * {{nla_len=8, nla_type=IFLA_INFO_KIND}, "vlan"...}, * {{nla_len=12, nla_type=IFLA_INFO_DATA}, "\x06\x00\x01\x00\x16\x00\x00\x00"}]} */ static void vlan_clone_dump_nl(struct ifnet *ifp, struct nl_writer *nw) { uint32_t parent_index = 0; uint16_t vlan_id = 0; uint16_t vlan_proto = 0; VLAN_SLOCK(); struct ifvlan *ifv = ifp->if_softc; if (TRUNK(ifv) != NULL) parent_index = PARENT(ifv)->if_index; vlan_id = ifv->ifv_vid; vlan_proto = ifv->ifv_proto; VLAN_SUNLOCK(); if (parent_index != 0) nlattr_add_u32(nw, IFLA_LINK, parent_index); int off = nlattr_add_nested(nw, IFLA_LINKINFO); if (off != 0) { nlattr_add_string(nw, IFLA_INFO_KIND, "vlan"); int off2 = nlattr_add_nested(nw, IFLA_INFO_DATA); if (off2 != 0) { nlattr_add_u16(nw, IFLA_VLAN_ID, vlan_id); nlattr_add_u16(nw, IFLA_VLAN_PROTOCOL, vlan_proto); nlattr_set_len(nw, off2); } nlattr_set_len(nw, off); } } static int vlan_clone_destroy(struct if_clone *ifc, struct ifnet *ifp, uint32_t flags) { struct ifvlan *ifv = ifp->if_softc; int unit = ifp->if_dunit; if (ifp->if_vlantrunk) return (EBUSY); #ifdef ALTQ IFQ_PURGE(&ifp->if_snd); #endif ether_ifdetach(ifp); /* first, remove it from system-wide lists */ vlan_unconfig(ifp); /* now it can be unconfigured and freed */ /* * We should have the only reference to the ifv now, so we can now * drain any remaining lladdr task before freeing the ifnet and the * ifvlan. */ taskqueue_drain(taskqueue_thread, &ifv->lladdr_task); NET_EPOCH_WAIT(); if_free(ifp); free(ifv, M_VLAN); if (unit != IF_DUNIT_NONE) ifc_free_unit(ifc, unit); return (0); } /* * The ifp->if_init entry point for vlan(4) is a no-op. */ static void vlan_init(void *foo __unused) { } /* * The if_transmit method for vlan(4) interface. */ static int vlan_transmit(struct ifnet *ifp, struct mbuf *m) { struct ifvlan *ifv; struct ifnet *p; int error, len, mcast; NET_EPOCH_ASSERT(); ifv = ifp->if_softc; if (TRUNK(ifv) == NULL) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); m_freem(m); return (ENETDOWN); } p = PARENT(ifv); len = m->m_pkthdr.len; mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0; BPF_MTAP(ifp, m); #if defined(KERN_TLS) || defined(RATELIMIT) if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) { struct vlan_snd_tag *vst; struct m_snd_tag *mst; MPASS(m->m_pkthdr.snd_tag->ifp == ifp); mst = m->m_pkthdr.snd_tag; vst = mst_to_vst(mst); if (vst->tag->ifp != p) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); m_freem(m); return (EAGAIN); } m->m_pkthdr.snd_tag = m_snd_tag_ref(vst->tag); m_snd_tag_rele(mst); } #endif /* * Do not run parent's if_transmit() if the parent is not up, * or parent's driver will cause a system crash. */ if (!UP_AND_RUNNING(p)) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); m_freem(m); return (ENETDOWN); } if (!ether_8021q_frame(&m, ifp, p, &ifv->ifv_qtag)) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (0); } /* * Send it, precisely as ether_output() would have. */ error = (p->if_transmit)(p, m); if (error == 0) { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, len); if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast); } else if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (error); } static int vlan_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { struct ifvlan *ifv; struct ifnet *p; NET_EPOCH_ASSERT(); /* * Find the first non-VLAN parent interface. */ ifv = ifp->if_softc; do { if (TRUNK(ifv) == NULL) { m_freem(m); return (ENETDOWN); } p = PARENT(ifv); ifv = p->if_softc; } while (p->if_type == IFT_L2VLAN); return p->if_output(ifp, m, dst, ro); } #ifdef ALTQ static void vlan_altq_start(if_t ifp) { struct ifaltq *ifq = &ifp->if_snd; struct mbuf *m; IFQ_LOCK(ifq); IFQ_DEQUEUE_NOLOCK(ifq, m); while (m != NULL) { vlan_transmit(ifp, m); IFQ_DEQUEUE_NOLOCK(ifq, m); } IFQ_UNLOCK(ifq); } static int vlan_altq_transmit(if_t ifp, struct mbuf *m) { int err; if (ALTQ_IS_ENABLED(&ifp->if_snd)) { IFQ_ENQUEUE(&ifp->if_snd, m, err); if (err == 0) vlan_altq_start(ifp); } else err = vlan_transmit(ifp, m); return (err); } #endif /* ALTQ */ /* * The ifp->if_qflush entry point for vlan(4) is a no-op. */ static void vlan_qflush(struct ifnet *ifp __unused) { } static void vlan_input(struct ifnet *ifp, struct mbuf *m) { struct ifvlantrunk *trunk; struct ifvlan *ifv; struct m_tag *mtag; uint16_t vid, tag; NET_EPOCH_ASSERT(); trunk = ifp->if_vlantrunk; if (trunk == NULL) { m_freem(m); return; } if (m->m_flags & M_VLANTAG) { /* * Packet is tagged, but m contains a normal * Ethernet frame; the tag is stored out-of-band. */ tag = m->m_pkthdr.ether_vtag; m->m_flags &= ~M_VLANTAG; } else { struct ether_vlan_header *evl; /* * Packet is tagged in-band as specified by 802.1q. */ switch (ifp->if_type) { case IFT_ETHER: if (m->m_len < sizeof(*evl) && (m = m_pullup(m, sizeof(*evl))) == NULL) { if_printf(ifp, "cannot pullup VLAN header\n"); return; } evl = mtod(m, struct ether_vlan_header *); tag = ntohs(evl->evl_tag); /* * Remove the 802.1q header by copying the Ethernet * addresses over it and adjusting the beginning of * the data in the mbuf. The encapsulated Ethernet * type field is already in place. */ bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN, ETHER_HDR_LEN - ETHER_TYPE_LEN); m_adj(m, ETHER_VLAN_ENCAP_LEN); break; default: #ifdef INVARIANTS panic("%s: %s has unsupported if_type %u", __func__, ifp->if_xname, ifp->if_type); #endif if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); m_freem(m); return; } } vid = EVL_VLANOFTAG(tag); ifv = vlan_gethash(trunk, vid); if (ifv == NULL || !UP_AND_RUNNING(ifv->ifv_ifp)) { if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); m_freem(m); return; } if (V_vlan_mtag_pcp) { /* * While uncommon, it is possible that we will find a 802.1q * packet encapsulated inside another packet that also had an * 802.1q header. For example, ethernet tunneled over IPSEC * arriving over ethernet. In that case, we replace the * existing 802.1q PCP m_tag value. */ mtag = m_tag_locate(m, MTAG_8021Q, MTAG_8021Q_PCP_IN, NULL); if (mtag == NULL) { mtag = m_tag_alloc(MTAG_8021Q, MTAG_8021Q_PCP_IN, sizeof(uint8_t), M_NOWAIT); if (mtag == NULL) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return; } m_tag_prepend(m, mtag); } *(uint8_t *)(mtag + 1) = EVL_PRIOFTAG(tag); } m->m_pkthdr.rcvif = ifv->ifv_ifp; if_inc_counter(ifv->ifv_ifp, IFCOUNTER_IPACKETS, 1); /* Pass it back through the parent's input routine. */ (*ifv->ifv_ifp->if_input)(ifv->ifv_ifp, m); } static void vlan_lladdr_fn(void *arg, int pending __unused) { struct ifvlan *ifv; struct ifnet *ifp; ifv = (struct ifvlan *)arg; ifp = ifv->ifv_ifp; CURVNET_SET(ifp->if_vnet); /* The ifv_ifp already has the lladdr copied in. */ if_setlladdr(ifp, IF_LLADDR(ifp), ifp->if_addrlen); CURVNET_RESTORE(); } static int vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t vid, uint16_t proto) { struct epoch_tracker et; struct ifvlantrunk *trunk; struct ifnet *ifp; int error = 0; /* * We can handle non-ethernet hardware types as long as * they handle the tagging and headers themselves. */ if (p->if_type != IFT_ETHER && p->if_type != IFT_L2VLAN && (p->if_capenable & IFCAP_VLAN_HWTAGGING) == 0) return (EPROTONOSUPPORT); if ((p->if_flags & VLAN_IFFLAGS) != VLAN_IFFLAGS) return (EPROTONOSUPPORT); /* * Don't let the caller set up a VLAN VID with * anything except VLID bits. * VID numbers 0x0 and 0xFFF are reserved. */ if (vid == 0 || vid == 0xFFF || (vid & ~EVL_VLID_MASK)) return (EINVAL); if (ifv->ifv_trunk) { trunk = ifv->ifv_trunk; if (trunk->parent != p) return (EBUSY); VLAN_XLOCK(); ifv->ifv_proto = proto; if (ifv->ifv_vid != vid) { /* Re-hash */ vlan_remhash(trunk, ifv); ifv->ifv_vid = vid; error = vlan_inshash(trunk, ifv); } /* Will unlock */ goto done; } VLAN_XLOCK(); if (p->if_vlantrunk == NULL) { trunk = malloc(sizeof(struct ifvlantrunk), M_VLAN, M_WAITOK | M_ZERO); vlan_inithash(trunk); TRUNK_LOCK_INIT(trunk); TRUNK_WLOCK(trunk); p->if_vlantrunk = trunk; trunk->parent = p; if_ref(trunk->parent); TRUNK_WUNLOCK(trunk); } else { trunk = p->if_vlantrunk; } ifv->ifv_vid = vid; /* must set this before vlan_inshash() */ ifv->ifv_pcp = 0; /* Default: best effort delivery. */ error = vlan_inshash(trunk, ifv); if (error) goto done; ifv->ifv_proto = proto; ifv->ifv_encaplen = ETHER_VLAN_ENCAP_LEN; ifv->ifv_mintu = ETHERMIN; ifv->ifv_pflags = 0; ifv->ifv_capenable = -1; /* * If the parent supports the VLAN_MTU capability, * i.e. can Tx/Rx larger than ETHER_MAX_LEN frames, * use it. */ if (p->if_capenable & IFCAP_VLAN_MTU) { /* * No need to fudge the MTU since the parent can * handle extended frames. */ ifv->ifv_mtufudge = 0; } else { /* * Fudge the MTU by the encapsulation size. This * makes us incompatible with strictly compliant * 802.1Q implementations, but allows us to use * the feature with other NetBSD implementations, * which might still be useful. */ ifv->ifv_mtufudge = ifv->ifv_encaplen; } ifv->ifv_trunk = trunk; ifp = ifv->ifv_ifp; /* * Initialize fields from our parent. This duplicates some * work with ether_ifattach() but allows for non-ethernet * interfaces to also work. */ ifp->if_mtu = p->if_mtu - ifv->ifv_mtufudge; ifp->if_baudrate = p->if_baudrate; ifp->if_input = p->if_input; ifp->if_resolvemulti = p->if_resolvemulti; ifp->if_addrlen = p->if_addrlen; ifp->if_broadcastaddr = p->if_broadcastaddr; ifp->if_pcp = ifv->ifv_pcp; /* * We wrap the parent's if_output using vlan_output to ensure that it * can't become stale. */ ifp->if_output = vlan_output; /* * Copy only a selected subset of flags from the parent. * Other flags are none of our business. */ #define VLAN_COPY_FLAGS (IFF_SIMPLEX) ifp->if_flags &= ~VLAN_COPY_FLAGS; ifp->if_flags |= p->if_flags & VLAN_COPY_FLAGS; #undef VLAN_COPY_FLAGS ifp->if_link_state = p->if_link_state; NET_EPOCH_ENTER(et); vlan_capabilities(ifv); NET_EPOCH_EXIT(et); /* * Set up our interface address to reflect the underlying * physical interface's. */ TASK_INIT(&ifv->lladdr_task, 0, vlan_lladdr_fn, ifv); ((struct sockaddr_dl *)ifp->if_addr->ifa_addr)->sdl_alen = p->if_addrlen; /* * Do not schedule link address update if it was the same * as previous parent's. This helps avoid updating for each * associated llentry. */ if (memcmp(IF_LLADDR(p), IF_LLADDR(ifp), p->if_addrlen) != 0) { bcopy(IF_LLADDR(p), IF_LLADDR(ifp), p->if_addrlen); taskqueue_enqueue(taskqueue_thread, &ifv->lladdr_task); } /* We are ready for operation now. */ ifp->if_drv_flags |= IFF_DRV_RUNNING; /* Update flags on the parent, if necessary. */ vlan_setflags(ifp, 1); /* * Configure multicast addresses that may already be * joined on the vlan device. */ (void)vlan_setmulti(ifp); done: if (error == 0) EVENTHANDLER_INVOKE(vlan_config, p, ifv->ifv_vid); VLAN_XUNLOCK(); return (error); } static void vlan_unconfig(struct ifnet *ifp) { VLAN_XLOCK(); vlan_unconfig_locked(ifp, 0); VLAN_XUNLOCK(); } static void vlan_unconfig_locked(struct ifnet *ifp, int departing) { struct ifvlantrunk *trunk; struct vlan_mc_entry *mc; struct ifvlan *ifv; struct ifnet *parent; int error; VLAN_XLOCK_ASSERT(); ifv = ifp->if_softc; trunk = ifv->ifv_trunk; parent = NULL; if (trunk != NULL) { parent = trunk->parent; /* * Since the interface is being unconfigured, we need to * empty the list of multicast groups that we may have joined * while we were alive from the parent's list. */ while ((mc = CK_SLIST_FIRST(&ifv->vlan_mc_listhead)) != NULL) { /* * If the parent interface is being detached, * all its multicast addresses have already * been removed. Warn about errors if * if_delmulti() does fail, but don't abort as * all callers expect vlan destruction to * succeed. */ if (!departing) { error = if_delmulti(parent, (struct sockaddr *)&mc->mc_addr); if (error) if_printf(ifp, "Failed to delete multicast address from parent: %d\n", error); } CK_SLIST_REMOVE_HEAD(&ifv->vlan_mc_listhead, mc_entries); NET_EPOCH_CALL(vlan_mc_free, &mc->mc_epoch_ctx); } vlan_setflags(ifp, 0); /* clear special flags on parent */ vlan_remhash(trunk, ifv); ifv->ifv_trunk = NULL; /* * Check if we were the last. */ if (trunk->refcnt == 0) { parent->if_vlantrunk = NULL; NET_EPOCH_WAIT(); trunk_destroy(trunk); } } /* Disconnect from parent. */ if (ifv->ifv_pflags) if_printf(ifp, "%s: ifv_pflags unclean\n", __func__); ifp->if_mtu = ETHERMTU; ifp->if_link_state = LINK_STATE_UNKNOWN; ifp->if_drv_flags &= ~IFF_DRV_RUNNING; /* * Only dispatch an event if vlan was * attached, otherwise there is nothing * to cleanup anyway. */ if (parent != NULL) EVENTHANDLER_INVOKE(vlan_unconfig, parent, ifv->ifv_vid); } /* Handle a reference counted flag that should be set on the parent as well */ static int vlan_setflag(struct ifnet *ifp, int flag, int status, int (*func)(struct ifnet *, int)) { struct ifvlan *ifv; int error; VLAN_SXLOCK_ASSERT(); ifv = ifp->if_softc; status = status ? (ifp->if_flags & flag) : 0; /* Now "status" contains the flag value or 0 */ /* * See if recorded parent's status is different from what * we want it to be. If it is, flip it. We record parent's * status in ifv_pflags so that we won't clear parent's flag * we haven't set. In fact, we don't clear or set parent's * flags directly, but get or release references to them. * That's why we can be sure that recorded flags still are * in accord with actual parent's flags. */ if (status != (ifv->ifv_pflags & flag)) { error = (*func)(PARENT(ifv), status); if (error) return (error); ifv->ifv_pflags &= ~flag; ifv->ifv_pflags |= status; } return (0); } /* * Handle IFF_* flags that require certain changes on the parent: * if "status" is true, update parent's flags respective to our if_flags; * if "status" is false, forcedly clear the flags set on parent. */ static int vlan_setflags(struct ifnet *ifp, int status) { int error, i; for (i = 0; vlan_pflags[i].flag; i++) { error = vlan_setflag(ifp, vlan_pflags[i].flag, status, vlan_pflags[i].func); if (error) return (error); } return (0); } /* Inform all vlans that their parent has changed link state */ static void vlan_link_state(struct ifnet *ifp) { struct epoch_tracker et; struct ifvlantrunk *trunk; struct ifvlan *ifv; NET_EPOCH_ENTER(et); trunk = ifp->if_vlantrunk; if (trunk == NULL) { NET_EPOCH_EXIT(et); return; } TRUNK_WLOCK(trunk); VLAN_FOREACH(ifv, trunk) { ifv->ifv_ifp->if_baudrate = trunk->parent->if_baudrate; if_link_state_change(ifv->ifv_ifp, trunk->parent->if_link_state); } TRUNK_WUNLOCK(trunk); NET_EPOCH_EXIT(et); } static void vlan_capabilities(struct ifvlan *ifv) { struct ifnet *p; struct ifnet *ifp; struct ifnet_hw_tsomax hw_tsomax; int cap = 0, ena = 0, mena; u_long hwa = 0; NET_EPOCH_ASSERT(); VLAN_SXLOCK_ASSERT(); p = PARENT(ifv); ifp = ifv->ifv_ifp; /* Mask parent interface enabled capabilities disabled by user. */ mena = p->if_capenable & ifv->ifv_capenable; /* * If the parent interface can do checksum offloading * on VLANs, then propagate its hardware-assisted * checksumming flags. Also assert that checksum * offloading requires hardware VLAN tagging. */ if (p->if_capabilities & IFCAP_VLAN_HWCSUM) cap |= p->if_capabilities & (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6); if (p->if_capenable & IFCAP_VLAN_HWCSUM && p->if_capenable & IFCAP_VLAN_HWTAGGING) { ena |= mena & (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6); if (ena & IFCAP_TXCSUM) hwa |= p->if_hwassist & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP); if (ena & IFCAP_TXCSUM_IPV6) hwa |= p->if_hwassist & (CSUM_TCP_IPV6 | CSUM_UDP_IPV6 | CSUM_SCTP_IPV6); } /* * If the parent interface can do TSO on VLANs then * propagate the hardware-assisted flag. TSO on VLANs * does not necessarily require hardware VLAN tagging. */ memset(&hw_tsomax, 0, sizeof(hw_tsomax)); if_hw_tsomax_common(p, &hw_tsomax); if_hw_tsomax_update(ifp, &hw_tsomax); if (p->if_capabilities & IFCAP_VLAN_HWTSO) cap |= p->if_capabilities & IFCAP_TSO; if (p->if_capenable & IFCAP_VLAN_HWTSO) { ena |= mena & IFCAP_TSO; if (ena & IFCAP_TSO) hwa |= p->if_hwassist & CSUM_TSO; } /* * If the parent interface can do LRO and checksum offloading on * VLANs, then guess it may do LRO on VLANs. False positive here * cost nothing, while false negative may lead to some confusions. */ if (p->if_capabilities & IFCAP_VLAN_HWCSUM) cap |= p->if_capabilities & IFCAP_LRO; if (p->if_capenable & IFCAP_VLAN_HWCSUM) ena |= mena & IFCAP_LRO; /* * If the parent interface can offload TCP connections over VLANs then * propagate its TOE capability to the VLAN interface. * * All TOE drivers in the tree today can deal with VLANs. If this * changes then IFCAP_VLAN_TOE should be promoted to a full capability * with its own bit. */ #define IFCAP_VLAN_TOE IFCAP_TOE if (p->if_capabilities & IFCAP_VLAN_TOE) cap |= p->if_capabilities & IFCAP_TOE; if (p->if_capenable & IFCAP_VLAN_TOE) { SETTOEDEV(ifp, TOEDEV(p)); ena |= mena & IFCAP_TOE; } /* * If the parent interface supports dynamic link state, so does the * VLAN interface. */ cap |= (p->if_capabilities & IFCAP_LINKSTATE); ena |= (mena & IFCAP_LINKSTATE); #ifdef RATELIMIT /* * If the parent interface supports ratelimiting, so does the * VLAN interface. */ cap |= (p->if_capabilities & IFCAP_TXRTLMT); ena |= (mena & IFCAP_TXRTLMT); #endif /* * If the parent interface supports unmapped mbufs, so does * the VLAN interface. Note that this should be fine even for * interfaces that don't support hardware tagging as headers * are prepended in normal mbufs to unmapped mbufs holding * payload data. */ cap |= (p->if_capabilities & IFCAP_MEXTPG); ena |= (mena & IFCAP_MEXTPG); /* * If the parent interface can offload encryption and segmentation * of TLS records over TCP, propagate it's capability to the VLAN * interface. * * All TLS drivers in the tree today can deal with VLANs. If * this ever changes, then a new IFCAP_VLAN_TXTLS can be * defined. */ if (p->if_capabilities & (IFCAP_TXTLS | IFCAP_TXTLS_RTLMT)) cap |= p->if_capabilities & (IFCAP_TXTLS | IFCAP_TXTLS_RTLMT); if (p->if_capenable & (IFCAP_TXTLS | IFCAP_TXTLS_RTLMT)) ena |= mena & (IFCAP_TXTLS | IFCAP_TXTLS_RTLMT); ifp->if_capabilities = cap; ifp->if_capenable = ena; ifp->if_hwassist = hwa; } static void vlan_trunk_capabilities(struct ifnet *ifp) { struct epoch_tracker et; struct ifvlantrunk *trunk; struct ifvlan *ifv; VLAN_SLOCK(); trunk = ifp->if_vlantrunk; if (trunk == NULL) { VLAN_SUNLOCK(); return; } NET_EPOCH_ENTER(et); VLAN_FOREACH(ifv, trunk) vlan_capabilities(ifv); NET_EPOCH_EXIT(et); VLAN_SUNLOCK(); } static int vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifnet *p; struct ifreq *ifr; #ifdef INET struct ifaddr *ifa; #endif struct ifvlan *ifv; struct ifvlantrunk *trunk; struct vlanreq vlr; int error = 0, oldmtu; ifr = (struct ifreq *)data; #ifdef INET ifa = (struct ifaddr *) data; #endif ifv = ifp->if_softc; switch (cmd) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; #ifdef INET if (ifa->ifa_addr->sa_family == AF_INET) arp_ifinit(ifp, ifa); #endif break; case SIOCGIFADDR: bcopy(IF_LLADDR(ifp), &ifr->ifr_addr.sa_data[0], ifp->if_addrlen); break; case SIOCGIFMEDIA: VLAN_SLOCK(); if (TRUNK(ifv) != NULL) { p = PARENT(ifv); if_ref(p); error = (*p->if_ioctl)(p, SIOCGIFMEDIA, data); if_rele(p); /* Limit the result to the parent's current config. */ if (error == 0) { struct ifmediareq *ifmr; ifmr = (struct ifmediareq *)data; if (ifmr->ifm_count >= 1 && ifmr->ifm_ulist) { ifmr->ifm_count = 1; error = copyout(&ifmr->ifm_current, ifmr->ifm_ulist, sizeof(int)); } } } else { error = EINVAL; } VLAN_SUNLOCK(); break; case SIOCSIFMEDIA: error = EINVAL; break; case SIOCSIFMTU: /* * Set the interface MTU. */ VLAN_SLOCK(); trunk = TRUNK(ifv); if (trunk != NULL) { TRUNK_WLOCK(trunk); if (ifr->ifr_mtu > (PARENT(ifv)->if_mtu - ifv->ifv_mtufudge) || ifr->ifr_mtu < (ifv->ifv_mintu - ifv->ifv_mtufudge)) error = EINVAL; else ifp->if_mtu = ifr->ifr_mtu; TRUNK_WUNLOCK(trunk); } else error = EINVAL; VLAN_SUNLOCK(); break; case SIOCSETVLAN: #ifdef VIMAGE /* * XXXRW/XXXBZ: The goal in these checks is to allow a VLAN * interface to be delegated to a jail without allowing the * jail to change what underlying interface/VID it is * associated with. We are not entirely convinced that this * is the right way to accomplish that policy goal. */ if (ifp->if_vnet != ifp->if_home_vnet) { error = EPERM; break; } #endif error = copyin(ifr_data_get_ptr(ifr), &vlr, sizeof(vlr)); if (error) break; if (vlr.vlr_parent[0] == '\0') { vlan_unconfig(ifp); break; } p = ifunit_ref(vlr.vlr_parent); if (p == NULL) { error = ENOENT; break; } if (vlr.vlr_proto == 0) vlr.vlr_proto = ETHERTYPE_VLAN; oldmtu = ifp->if_mtu; error = vlan_config(ifv, p, vlr.vlr_tag, vlr.vlr_proto); if_rele(p); /* * VLAN MTU may change during addition of the vlandev. * If it did, do network layer specific procedure. */ if (ifp->if_mtu != oldmtu) if_notifymtu(ifp); break; case SIOCGETVLAN: #ifdef VIMAGE if (ifp->if_vnet != ifp->if_home_vnet) { error = EPERM; break; } #endif bzero(&vlr, sizeof(vlr)); VLAN_SLOCK(); if (TRUNK(ifv) != NULL) { strlcpy(vlr.vlr_parent, PARENT(ifv)->if_xname, sizeof(vlr.vlr_parent)); vlr.vlr_tag = ifv->ifv_vid; vlr.vlr_proto = ifv->ifv_proto; } VLAN_SUNLOCK(); error = copyout(&vlr, ifr_data_get_ptr(ifr), sizeof(vlr)); break; case SIOCSIFFLAGS: /* * We should propagate selected flags to the parent, * e.g., promiscuous mode. */ VLAN_SLOCK(); if (TRUNK(ifv) != NULL) error = vlan_setflags(ifp, 1); VLAN_SUNLOCK(); break; case SIOCADDMULTI: case SIOCDELMULTI: /* * If we don't have a parent, just remember the membership for * when we do. * * XXX We need the rmlock here to avoid sleeping while * holding in6_multi_mtx. */ VLAN_XLOCK(); trunk = TRUNK(ifv); if (trunk != NULL) error = vlan_setmulti(ifp); VLAN_XUNLOCK(); break; case SIOCGVLANPCP: #ifdef VIMAGE if (ifp->if_vnet != ifp->if_home_vnet) { error = EPERM; break; } #endif ifr->ifr_vlan_pcp = ifv->ifv_pcp; break; case SIOCSVLANPCP: #ifdef VIMAGE if (ifp->if_vnet != ifp->if_home_vnet) { error = EPERM; break; } #endif error = priv_check(curthread, PRIV_NET_SETVLANPCP); if (error) break; if (ifr->ifr_vlan_pcp > VLAN_PCP_MAX) { error = EINVAL; break; } ifv->ifv_pcp = ifr->ifr_vlan_pcp; ifp->if_pcp = ifv->ifv_pcp; /* broadcast event about PCP change */ EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_PCP); break; case SIOCSIFCAP: VLAN_SLOCK(); ifv->ifv_capenable = ifr->ifr_reqcap; trunk = TRUNK(ifv); if (trunk != NULL) { struct epoch_tracker et; NET_EPOCH_ENTER(et); vlan_capabilities(ifv); NET_EPOCH_EXIT(et); } VLAN_SUNLOCK(); break; default: error = EINVAL; break; } return (error); } #if defined(KERN_TLS) || defined(RATELIMIT) static int vlan_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params, struct m_snd_tag **ppmt) { struct epoch_tracker et; const struct if_snd_tag_sw *sw; struct vlan_snd_tag *vst; struct ifvlan *ifv; struct ifnet *parent; struct m_snd_tag *mst; int error; NET_EPOCH_ENTER(et); ifv = ifp->if_softc; switch (params->hdr.type) { #ifdef RATELIMIT case IF_SND_TAG_TYPE_UNLIMITED: sw = &vlan_snd_tag_ul_sw; break; case IF_SND_TAG_TYPE_RATE_LIMIT: sw = &vlan_snd_tag_rl_sw; break; #endif #ifdef KERN_TLS case IF_SND_TAG_TYPE_TLS: sw = &vlan_snd_tag_tls_sw; break; case IF_SND_TAG_TYPE_TLS_RX: sw = NULL; if (params->tls_rx.vlan_id != 0) goto failure; params->tls_rx.vlan_id = ifv->ifv_vid; break; #ifdef RATELIMIT case IF_SND_TAG_TYPE_TLS_RATE_LIMIT: sw = &vlan_snd_tag_tls_rl_sw; break; #endif #endif default: goto failure; } if (ifv->ifv_trunk != NULL) parent = PARENT(ifv); else parent = NULL; if (parent == NULL) goto failure; if_ref(parent); NET_EPOCH_EXIT(et); if (sw != NULL) { vst = malloc(sizeof(*vst), M_VLAN, M_NOWAIT); if (vst == NULL) { if_rele(parent); return (ENOMEM); } } else vst = NULL; error = m_snd_tag_alloc(parent, params, &mst); if_rele(parent); if (error) { free(vst, M_VLAN); return (error); } if (sw != NULL) { m_snd_tag_init(&vst->com, ifp, sw); vst->tag = mst; *ppmt = &vst->com; } else *ppmt = mst; return (0); failure: NET_EPOCH_EXIT(et); return (EOPNOTSUPP); } static struct m_snd_tag * vlan_next_snd_tag(struct m_snd_tag *mst) { struct vlan_snd_tag *vst; vst = mst_to_vst(mst); return (vst->tag); } static int vlan_snd_tag_modify(struct m_snd_tag *mst, union if_snd_tag_modify_params *params) { struct vlan_snd_tag *vst; vst = mst_to_vst(mst); return (vst->tag->sw->snd_tag_modify(vst->tag, params)); } static int vlan_snd_tag_query(struct m_snd_tag *mst, union if_snd_tag_query_params *params) { struct vlan_snd_tag *vst; vst = mst_to_vst(mst); return (vst->tag->sw->snd_tag_query(vst->tag, params)); } static void vlan_snd_tag_free(struct m_snd_tag *mst) { struct vlan_snd_tag *vst; vst = mst_to_vst(mst); m_snd_tag_rele(vst->tag); free(vst, M_VLAN); } static void vlan_ratelimit_query(struct ifnet *ifp __unused, struct if_ratelimit_query_results *q) { /* * For vlan, we have an indirect * interface. The caller needs to * get a ratelimit tag on the actual * interface the flow will go on. */ q->rate_table = NULL; q->flags = RT_IS_INDIRECT; q->max_flows = 0; q->number_of_rates = 0; } #endif diff --git a/sys/netinet/ip_carp.c b/sys/netinet/ip_carp.c index e8ed6afd3853..66ec28e49af3 100644 --- a/sys/netinet/ip_carp.c +++ b/sys/netinet/ip_carp.c @@ -1,2595 +1,2593 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2002 Michael Shalayeff. * Copyright (c) 2003 Ryan McBride. * Copyright (c) 2011 Gleb Smirnoff * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ -#include "opt_netlink.h" - #include #include "opt_bpf.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #include #endif #ifdef INET #include #include #endif #ifdef INET6 #include #include #include #include #include #include #endif #include #include #include #include #include static MALLOC_DEFINE(M_CARP, "CARP", "CARP addresses"); struct carp_softc { struct ifnet *sc_carpdev; /* Pointer to parent ifnet. */ struct ifaddr **sc_ifas; /* Our ifaddrs. */ struct sockaddr_dl sc_addr; /* Our link level address. */ struct callout sc_ad_tmo; /* Advertising timeout. */ #ifdef INET struct callout sc_md_tmo; /* Master down timeout. */ #endif #ifdef INET6 struct callout sc_md6_tmo; /* XXX: Master down timeout. */ #endif struct mtx sc_mtx; int sc_vhid; int sc_advskew; int sc_advbase; struct in_addr sc_carpaddr; struct in6_addr sc_carpaddr6; int sc_naddrs; int sc_naddrs6; int sc_ifasiz; enum { INIT = 0, BACKUP, MASTER } sc_state; int sc_suppress; int sc_sendad_errors; #define CARP_SENDAD_MAX_ERRORS 3 int sc_sendad_success; #define CARP_SENDAD_MIN_SUCCESS 3 int sc_init_counter; uint64_t sc_counter; /* authentication */ #define CARP_HMAC_PAD 64 unsigned char sc_key[CARP_KEY_LEN]; unsigned char sc_pad[CARP_HMAC_PAD]; SHA1_CTX sc_sha1; TAILQ_ENTRY(carp_softc) sc_list; /* On the carp_if list. */ LIST_ENTRY(carp_softc) sc_next; /* On the global list. */ }; struct carp_if { #ifdef INET int cif_naddrs; #endif #ifdef INET6 int cif_naddrs6; #endif TAILQ_HEAD(, carp_softc) cif_vrs; #ifdef INET struct ip_moptions cif_imo; #endif #ifdef INET6 struct ip6_moptions cif_im6o; #endif struct ifnet *cif_ifp; struct mtx cif_mtx; uint32_t cif_flags; #define CIF_PROMISC 0x00000001 }; /* Kernel equivalent of struct carpreq, but with more fields for new features. * */ struct carpkreq { int carpr_count; int carpr_vhid; int carpr_state; int carpr_advskew; int carpr_advbase; unsigned char carpr_key[CARP_KEY_LEN]; /* Everything above this is identical to carpreq */ struct in_addr carpr_addr; struct in6_addr carpr_addr6; }; /* * Brief design of carp(4). * * Any carp-capable ifnet may have a list of carp softcs hanging off * its ifp->if_carp pointer. Each softc represents one unique virtual * host id, or vhid. The softc has a back pointer to the ifnet. All * softcs are joined in a global list, which has quite limited use. * * Any interface address that takes part in CARP negotiation has a * pointer to the softc of its vhid, ifa->ifa_carp. That could be either * AF_INET or AF_INET6 address. * * Although, one can get the softc's backpointer to ifnet and traverse * through its ifp->if_addrhead queue to find all interface addresses * involved in CARP, we keep a growable array of ifaddr pointers. This * allows us to avoid grabbing the IF_ADDR_LOCK() in many traversals that * do calls into the network stack, thus avoiding LORs. * * Locking: * * Each softc has a lock sc_mtx. It is used to synchronise carp_input_c(), * callout-driven events and ioctl()s. * * To traverse the list of softcs on an ifnet we use CIF_LOCK() or carp_sx. * To traverse the global list we use the mutex carp_mtx. * * Known issues with locking: * * - Sending ad, we put the pointer to the softc in an mtag, and no reference * counting is done on the softc. * - On module unload we may race (?) with packet processing thread * dereferencing our function pointers. */ /* Accept incoming CARP packets. */ VNET_DEFINE_STATIC(int, carp_allow) = 1; #define V_carp_allow VNET(carp_allow) /* Set DSCP in outgoing CARP packets. */ VNET_DEFINE_STATIC(int, carp_dscp) = 56; #define V_carp_dscp VNET(carp_dscp) /* Preempt slower nodes. */ VNET_DEFINE_STATIC(int, carp_preempt) = 0; #define V_carp_preempt VNET(carp_preempt) /* Log level. */ VNET_DEFINE_STATIC(int, carp_log) = 1; #define V_carp_log VNET(carp_log) /* Global advskew demotion. */ VNET_DEFINE_STATIC(int, carp_demotion) = 0; #define V_carp_demotion VNET(carp_demotion) /* Send error demotion factor. */ VNET_DEFINE_STATIC(int, carp_senderr_adj) = CARP_MAXSKEW; #define V_carp_senderr_adj VNET(carp_senderr_adj) /* Iface down demotion factor. */ VNET_DEFINE_STATIC(int, carp_ifdown_adj) = CARP_MAXSKEW; #define V_carp_ifdown_adj VNET(carp_ifdown_adj) static int carp_allow_sysctl(SYSCTL_HANDLER_ARGS); static int carp_dscp_sysctl(SYSCTL_HANDLER_ARGS); static int carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS); SYSCTL_NODE(_net_inet, IPPROTO_CARP, carp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "CARP"); SYSCTL_PROC(_net_inet_carp, OID_AUTO, allow, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, &VNET_NAME(carp_allow), 0, carp_allow_sysctl, "I", "Accept incoming CARP packets"); SYSCTL_PROC(_net_inet_carp, OID_AUTO, dscp, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, carp_dscp_sysctl, "I", "DSCP value for carp packets"); SYSCTL_INT(_net_inet_carp, OID_AUTO, preempt, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(carp_preempt), 0, "High-priority backup preemption mode"); SYSCTL_INT(_net_inet_carp, OID_AUTO, log, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(carp_log), 0, "CARP log level"); SYSCTL_PROC(_net_inet_carp, OID_AUTO, demotion, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, carp_demote_adj_sysctl, "I", "Adjust demotion factor (skew of advskew)"); SYSCTL_INT(_net_inet_carp, OID_AUTO, senderr_demotion_factor, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(carp_senderr_adj), 0, "Send error demotion factor adjustment"); SYSCTL_INT(_net_inet_carp, OID_AUTO, ifdown_demotion_factor, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(carp_ifdown_adj), 0, "Interface down demotion factor adjustment"); VNET_PCPUSTAT_DEFINE(struct carpstats, carpstats); VNET_PCPUSTAT_SYSINIT(carpstats); VNET_PCPUSTAT_SYSUNINIT(carpstats); #define CARPSTATS_ADD(name, val) \ counter_u64_add(VNET(carpstats)[offsetof(struct carpstats, name) / \ sizeof(uint64_t)], (val)) #define CARPSTATS_INC(name) CARPSTATS_ADD(name, 1) SYSCTL_VNET_PCPUSTAT(_net_inet_carp, OID_AUTO, stats, struct carpstats, carpstats, "CARP statistics (struct carpstats, netinet/ip_carp.h)"); #define CARP_LOCK_INIT(sc) mtx_init(&(sc)->sc_mtx, "carp_softc", \ NULL, MTX_DEF) #define CARP_LOCK_DESTROY(sc) mtx_destroy(&(sc)->sc_mtx) #define CARP_LOCK_ASSERT(sc) mtx_assert(&(sc)->sc_mtx, MA_OWNED) #define CARP_LOCK(sc) mtx_lock(&(sc)->sc_mtx) #define CARP_UNLOCK(sc) mtx_unlock(&(sc)->sc_mtx) #define CIF_LOCK_INIT(cif) mtx_init(&(cif)->cif_mtx, "carp_if", \ NULL, MTX_DEF) #define CIF_LOCK_DESTROY(cif) mtx_destroy(&(cif)->cif_mtx) #define CIF_LOCK_ASSERT(cif) mtx_assert(&(cif)->cif_mtx, MA_OWNED) #define CIF_LOCK(cif) mtx_lock(&(cif)->cif_mtx) #define CIF_UNLOCK(cif) mtx_unlock(&(cif)->cif_mtx) #define CIF_FREE(cif) do { \ CIF_LOCK(cif); \ if (TAILQ_EMPTY(&(cif)->cif_vrs)) \ carp_free_if(cif); \ else \ CIF_UNLOCK(cif); \ } while (0) #define CARP_LOG(...) do { \ if (V_carp_log > 0) \ log(LOG_INFO, "carp: " __VA_ARGS__); \ } while (0) #define CARP_DEBUG(...) do { \ if (V_carp_log > 1) \ log(LOG_DEBUG, __VA_ARGS__); \ } while (0) #define IFNET_FOREACH_IFA(ifp, ifa) \ CK_STAILQ_FOREACH((ifa), &(ifp)->if_addrhead, ifa_link) \ if ((ifa)->ifa_carp != NULL) #define CARP_FOREACH_IFA(sc, ifa) \ CARP_LOCK_ASSERT(sc); \ for (int _i = 0; \ _i < (sc)->sc_naddrs + (sc)->sc_naddrs6 && \ ((ifa) = sc->sc_ifas[_i]) != NULL; \ ++_i) #define IFNET_FOREACH_CARP(ifp, sc) \ KASSERT(mtx_owned(&ifp->if_carp->cif_mtx) || \ sx_xlocked(&carp_sx), ("cif_vrs not locked")); \ TAILQ_FOREACH((sc), &(ifp)->if_carp->cif_vrs, sc_list) #define DEMOTE_ADVSKEW(sc) \ (((sc)->sc_advskew + V_carp_demotion > CARP_MAXSKEW) ? \ CARP_MAXSKEW : \ (((sc)->sc_advskew + V_carp_demotion < 0) ? \ 0 : ((sc)->sc_advskew + V_carp_demotion))) static void carp_input_c(struct mbuf *, struct carp_header *, sa_family_t, int); static struct carp_softc *carp_alloc(struct ifnet *); static void carp_destroy(struct carp_softc *); static struct carp_if *carp_alloc_if(struct ifnet *); static void carp_free_if(struct carp_if *); static void carp_set_state(struct carp_softc *, int, const char* reason); static void carp_sc_state(struct carp_softc *); static void carp_setrun(struct carp_softc *, sa_family_t); static void carp_master_down(void *); static void carp_master_down_locked(struct carp_softc *, const char* reason); static void carp_send_ad(void *); static void carp_send_ad_locked(struct carp_softc *); static void carp_addroute(struct carp_softc *); static void carp_ifa_addroute(struct ifaddr *); static void carp_delroute(struct carp_softc *); static void carp_ifa_delroute(struct ifaddr *); static void carp_send_ad_all(void *, int); static void carp_demote_adj(int, char *); static LIST_HEAD(, carp_softc) carp_list; static struct mtx carp_mtx; static struct sx carp_sx; static struct task carp_sendall_task = TASK_INITIALIZER(0, carp_send_ad_all, NULL); static int carp_is_supported_if(if_t ifp) { if (ifp == NULL) return (ENXIO); switch (ifp->if_type) { case IFT_ETHER: case IFT_L2VLAN: case IFT_BRIDGE: break; default: return (EOPNOTSUPP); } return (0); } static void carp_hmac_prepare(struct carp_softc *sc) { uint8_t version = CARP_VERSION, type = CARP_ADVERTISEMENT; uint8_t vhid = sc->sc_vhid & 0xff; struct ifaddr *ifa; int i, found; #ifdef INET struct in_addr last, cur, in; #endif #ifdef INET6 struct in6_addr last6, cur6, in6; #endif CARP_LOCK_ASSERT(sc); /* Compute ipad from key. */ bzero(sc->sc_pad, sizeof(sc->sc_pad)); bcopy(sc->sc_key, sc->sc_pad, sizeof(sc->sc_key)); for (i = 0; i < sizeof(sc->sc_pad); i++) sc->sc_pad[i] ^= 0x36; /* Precompute first part of inner hash. */ SHA1Init(&sc->sc_sha1); SHA1Update(&sc->sc_sha1, sc->sc_pad, sizeof(sc->sc_pad)); SHA1Update(&sc->sc_sha1, (void *)&version, sizeof(version)); SHA1Update(&sc->sc_sha1, (void *)&type, sizeof(type)); SHA1Update(&sc->sc_sha1, (void *)&vhid, sizeof(vhid)); #ifdef INET cur.s_addr = 0; do { found = 0; last = cur; cur.s_addr = 0xffffffff; CARP_FOREACH_IFA(sc, ifa) { in.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr; if (ifa->ifa_addr->sa_family == AF_INET && ntohl(in.s_addr) > ntohl(last.s_addr) && ntohl(in.s_addr) < ntohl(cur.s_addr)) { cur.s_addr = in.s_addr; found++; } } if (found) SHA1Update(&sc->sc_sha1, (void *)&cur, sizeof(cur)); } while (found); #endif /* INET */ #ifdef INET6 memset(&cur6, 0, sizeof(cur6)); do { found = 0; last6 = cur6; memset(&cur6, 0xff, sizeof(cur6)); CARP_FOREACH_IFA(sc, ifa) { in6 = ifatoia6(ifa)->ia_addr.sin6_addr; if (IN6_IS_SCOPE_EMBED(&in6)) in6.s6_addr16[1] = 0; if (ifa->ifa_addr->sa_family == AF_INET6 && memcmp(&in6, &last6, sizeof(in6)) > 0 && memcmp(&in6, &cur6, sizeof(in6)) < 0) { cur6 = in6; found++; } } if (found) SHA1Update(&sc->sc_sha1, (void *)&cur6, sizeof(cur6)); } while (found); #endif /* INET6 */ /* convert ipad to opad */ for (i = 0; i < sizeof(sc->sc_pad); i++) sc->sc_pad[i] ^= 0x36 ^ 0x5c; } static void carp_hmac_generate(struct carp_softc *sc, uint32_t counter[2], unsigned char md[20]) { SHA1_CTX sha1ctx; CARP_LOCK_ASSERT(sc); /* fetch first half of inner hash */ bcopy(&sc->sc_sha1, &sha1ctx, sizeof(sha1ctx)); SHA1Update(&sha1ctx, (void *)counter, sizeof(sc->sc_counter)); SHA1Final(md, &sha1ctx); /* outer hash */ SHA1Init(&sha1ctx); SHA1Update(&sha1ctx, sc->sc_pad, sizeof(sc->sc_pad)); SHA1Update(&sha1ctx, md, 20); SHA1Final(md, &sha1ctx); } static int carp_hmac_verify(struct carp_softc *sc, uint32_t counter[2], unsigned char md[20]) { unsigned char md2[20]; CARP_LOCK_ASSERT(sc); carp_hmac_generate(sc, counter, md2); return (bcmp(md, md2, sizeof(md2))); } /* * process input packet. * we have rearranged checks order compared to the rfc, * but it seems more efficient this way or not possible otherwise. */ #ifdef INET static int carp_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct ip *ip = mtod(m, struct ip *); struct carp_header *ch; int iplen, len; iplen = *offp; *mp = NULL; CARPSTATS_INC(carps_ipackets); if (!V_carp_allow) { m_freem(m); return (IPPROTO_DONE); } iplen = ip->ip_hl << 2; if (m->m_pkthdr.len < iplen + sizeof(*ch)) { CARPSTATS_INC(carps_badlen); CARP_DEBUG("%s: received len %zd < sizeof(struct carp_header) " "on %s\n", __func__, m->m_len - sizeof(struct ip), if_name(m->m_pkthdr.rcvif)); m_freem(m); return (IPPROTO_DONE); } if (iplen + sizeof(*ch) < m->m_len) { if ((m = m_pullup(m, iplen + sizeof(*ch))) == NULL) { CARPSTATS_INC(carps_hdrops); CARP_DEBUG("%s: pullup failed\n", __func__); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); } ch = (struct carp_header *)((char *)ip + iplen); /* * verify that the received packet length is * equal to the CARP header */ len = iplen + sizeof(*ch); if (len > m->m_pkthdr.len) { CARPSTATS_INC(carps_badlen); CARP_DEBUG("%s: packet too short %d on %s\n", __func__, m->m_pkthdr.len, if_name(m->m_pkthdr.rcvif)); m_freem(m); return (IPPROTO_DONE); } if ((m = m_pullup(m, len)) == NULL) { CARPSTATS_INC(carps_hdrops); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); ch = (struct carp_header *)((char *)ip + iplen); /* verify the CARP checksum */ m->m_data += iplen; if (in_cksum(m, len - iplen)) { CARPSTATS_INC(carps_badsum); CARP_DEBUG("%s: checksum failed on %s\n", __func__, if_name(m->m_pkthdr.rcvif)); m_freem(m); return (IPPROTO_DONE); } m->m_data -= iplen; carp_input_c(m, ch, AF_INET, ip->ip_ttl); return (IPPROTO_DONE); } #endif #ifdef INET6 static int carp6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct carp_header *ch; u_int len; CARPSTATS_INC(carps_ipackets6); if (!V_carp_allow) { m_freem(m); return (IPPROTO_DONE); } /* check if received on a valid carp interface */ if (m->m_pkthdr.rcvif->if_carp == NULL) { CARPSTATS_INC(carps_badif); CARP_DEBUG("%s: packet received on non-carp interface: %s\n", __func__, if_name(m->m_pkthdr.rcvif)); m_freem(m); return (IPPROTO_DONE); } /* verify that we have a complete carp packet */ if (m->m_len < *offp + sizeof(*ch)) { len = m->m_len; m = m_pullup(m, *offp + sizeof(*ch)); if (m == NULL) { CARPSTATS_INC(carps_badlen); CARP_DEBUG("%s: packet size %u too small\n", __func__, len); return (IPPROTO_DONE); } ip6 = mtod(m, struct ip6_hdr *); } ch = (struct carp_header *)(mtod(m, char *) + *offp); /* verify the CARP checksum */ m->m_data += *offp; if (in_cksum(m, sizeof(*ch))) { CARPSTATS_INC(carps_badsum); CARP_DEBUG("%s: checksum failed, on %s\n", __func__, if_name(m->m_pkthdr.rcvif)); m_freem(m); return (IPPROTO_DONE); } m->m_data -= *offp; carp_input_c(m, ch, AF_INET6, ip6->ip6_hlim); return (IPPROTO_DONE); } #endif /* INET6 */ /* * This routine should not be necessary at all, but some switches * (VMWare ESX vswitches) can echo our own packets back at us, * and we must ignore them or they will cause us to drop out of * MASTER mode. * * We cannot catch all cases of network loops. Instead, what we * do here is catch any packet that arrives with a carp header * with a VHID of 0, that comes from an address that is our own. * These packets are by definition "from us" (even if they are from * a misconfigured host that is pretending to be us). * * The VHID test is outside this mini-function. */ static int carp_source_is_self(struct mbuf *m, struct ifaddr *ifa, sa_family_t af) { #ifdef INET struct ip *ip4; struct in_addr in4; #endif #ifdef INET6 struct ip6_hdr *ip6; struct in6_addr in6; #endif switch (af) { #ifdef INET case AF_INET: ip4 = mtod(m, struct ip *); in4 = ifatoia(ifa)->ia_addr.sin_addr; return (in4.s_addr == ip4->ip_src.s_addr); #endif #ifdef INET6 case AF_INET6: ip6 = mtod(m, struct ip6_hdr *); in6 = ifatoia6(ifa)->ia_addr.sin6_addr; return (memcmp(&in6, &ip6->ip6_src, sizeof(in6)) == 0); #endif default: break; } return (0); } static void carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af, int ttl) { struct ifnet *ifp = m->m_pkthdr.rcvif; struct ifaddr *ifa, *match; struct carp_softc *sc; uint64_t tmp_counter; struct timeval sc_tv, ch_tv; int error; bool multicast = false; NET_EPOCH_ASSERT(); /* * Verify that the VHID is valid on the receiving interface. * * There should be just one match. If there are none * the VHID is not valid and we drop the packet. If * there are multiple VHID matches, take just the first * one, for compatibility with previous code. While we're * scanning, check for obvious loops in the network topology * (these should never happen, and as noted above, we may * miss real loops; this is just a double-check). */ error = 0; match = NULL; IFNET_FOREACH_IFA(ifp, ifa) { if (match == NULL && ifa->ifa_carp != NULL && ifa->ifa_addr->sa_family == af && ifa->ifa_carp->sc_vhid == ch->carp_vhid) match = ifa; if (ch->carp_vhid == 0 && carp_source_is_self(m, ifa, af)) error = ELOOP; } ifa = error ? NULL : match; if (ifa != NULL) ifa_ref(ifa); if (ifa == NULL) { if (error == ELOOP) { CARP_DEBUG("dropping looped packet on interface %s\n", if_name(ifp)); CARPSTATS_INC(carps_badif); /* ??? */ } else { CARPSTATS_INC(carps_badvhid); } m_freem(m); return; } /* verify the CARP version. */ if (ch->carp_version != CARP_VERSION) { CARPSTATS_INC(carps_badver); CARP_DEBUG("%s: invalid version %d\n", if_name(ifp), ch->carp_version); ifa_free(ifa); m_freem(m); return; } sc = ifa->ifa_carp; CARP_LOCK(sc); if (ifa->ifa_addr->sa_family == AF_INET) { multicast = IN_MULTICAST(sc->sc_carpaddr.s_addr); } else { multicast = IN6_IS_ADDR_MULTICAST(&sc->sc_carpaddr6); } ifa_free(ifa); /* verify that the IP TTL is 255, but only if we're not in unicast mode. */ if (multicast && ttl != CARP_DFLTTL) { CARPSTATS_INC(carps_badttl); CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__, ttl, if_name(m->m_pkthdr.rcvif)); goto out; } if (carp_hmac_verify(sc, ch->carp_counter, ch->carp_md)) { CARPSTATS_INC(carps_badauth); CARP_DEBUG("%s: incorrect hash for VHID %u@%s\n", __func__, sc->sc_vhid, if_name(ifp)); goto out; } tmp_counter = ntohl(ch->carp_counter[0]); tmp_counter = tmp_counter<<32; tmp_counter += ntohl(ch->carp_counter[1]); /* XXX Replay protection goes here */ sc->sc_init_counter = 0; sc->sc_counter = tmp_counter; sc_tv.tv_sec = sc->sc_advbase; sc_tv.tv_usec = DEMOTE_ADVSKEW(sc) * 1000000 / 256; ch_tv.tv_sec = ch->carp_advbase; ch_tv.tv_usec = ch->carp_advskew * 1000000 / 256; switch (sc->sc_state) { case INIT: break; case MASTER: /* * If we receive an advertisement from a master who's going to * be more frequent than us, go into BACKUP state. */ if (timevalcmp(&sc_tv, &ch_tv, >) || timevalcmp(&sc_tv, &ch_tv, ==)) { callout_stop(&sc->sc_ad_tmo); carp_set_state(sc, BACKUP, "more frequent advertisement received"); carp_setrun(sc, 0); carp_delroute(sc); } break; case BACKUP: /* * If we're pre-empting masters who advertise slower than us, * and this one claims to be slower, treat him as down. */ if (V_carp_preempt && timevalcmp(&sc_tv, &ch_tv, <)) { carp_master_down_locked(sc, "preempting a slower master"); break; } /* * If the master is going to advertise at such a low frequency * that he's guaranteed to time out, we'd might as well just * treat him as timed out now. */ sc_tv.tv_sec = sc->sc_advbase * 3; if (timevalcmp(&sc_tv, &ch_tv, <)) { carp_master_down_locked(sc, "master will time out"); break; } /* * Otherwise, we reset the counter and wait for the next * advertisement. */ carp_setrun(sc, af); break; } out: CARP_UNLOCK(sc); m_freem(m); } static int carp_prepare_ad(struct mbuf *m, struct carp_softc *sc, struct carp_header *ch) { struct m_tag *mtag; if (sc->sc_init_counter) { /* this could also be seconds since unix epoch */ sc->sc_counter = arc4random(); sc->sc_counter = sc->sc_counter << 32; sc->sc_counter += arc4random(); } else sc->sc_counter++; ch->carp_counter[0] = htonl((sc->sc_counter>>32)&0xffffffff); ch->carp_counter[1] = htonl(sc->sc_counter&0xffffffff); carp_hmac_generate(sc, ch->carp_counter, ch->carp_md); /* Tag packet for carp_output */ if ((mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct carp_softc *), M_NOWAIT)) == NULL) { m_freem(m); CARPSTATS_INC(carps_onomem); return (ENOMEM); } bcopy(&sc, mtag + 1, sizeof(sc)); m_tag_prepend(m, mtag); return (0); } /* * To avoid LORs and possible recursions this function shouldn't * be called directly, but scheduled via taskqueue. */ static void carp_send_ad_all(void *ctx __unused, int pending __unused) { struct carp_softc *sc; struct epoch_tracker et; NET_EPOCH_ENTER(et); mtx_lock(&carp_mtx); LIST_FOREACH(sc, &carp_list, sc_next) if (sc->sc_state == MASTER) { CARP_LOCK(sc); CURVNET_SET(sc->sc_carpdev->if_vnet); carp_send_ad_locked(sc); CURVNET_RESTORE(); CARP_UNLOCK(sc); } mtx_unlock(&carp_mtx); NET_EPOCH_EXIT(et); } /* Send a periodic advertisement, executed in callout context. */ static void carp_send_ad(void *v) { struct carp_softc *sc = v; struct epoch_tracker et; NET_EPOCH_ENTER(et); CARP_LOCK_ASSERT(sc); CURVNET_SET(sc->sc_carpdev->if_vnet); carp_send_ad_locked(sc); CURVNET_RESTORE(); CARP_UNLOCK(sc); NET_EPOCH_EXIT(et); } static void carp_send_ad_error(struct carp_softc *sc, int error) { /* * We track errors and successfull sends with this logic: * - Any error resets success counter to 0. * - MAX_ERRORS triggers demotion. * - MIN_SUCCESS successes resets error counter to 0. * - MIN_SUCCESS reverts demotion, if it was triggered before. */ if (error) { if (sc->sc_sendad_errors < INT_MAX) sc->sc_sendad_errors++; if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) { static const char fmt[] = "send error %d on %s"; char msg[sizeof(fmt) + IFNAMSIZ]; sprintf(msg, fmt, error, if_name(sc->sc_carpdev)); carp_demote_adj(V_carp_senderr_adj, msg); } sc->sc_sendad_success = 0; } else if (sc->sc_sendad_errors > 0) { if (++sc->sc_sendad_success >= CARP_SENDAD_MIN_SUCCESS) { if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) { static const char fmt[] = "send ok on %s"; char msg[sizeof(fmt) + IFNAMSIZ]; sprintf(msg, fmt, if_name(sc->sc_carpdev)); carp_demote_adj(-V_carp_senderr_adj, msg); } sc->sc_sendad_errors = 0; } } } /* * Pick the best ifaddr on the given ifp for sending CARP * advertisements. * * "Best" here is defined by ifa_preferred(). This function is much * much like ifaof_ifpforaddr() except that we just use ifa_preferred(). * * (This could be simplified to return the actual address, except that * it has a different format in AF_INET and AF_INET6.) */ static struct ifaddr * carp_best_ifa(int af, struct ifnet *ifp) { struct ifaddr *ifa, *best; NET_EPOCH_ASSERT(); if (af >= AF_MAX) return (NULL); best = NULL; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == af && (best == NULL || ifa_preferred(best, ifa))) best = ifa; } if (best != NULL) ifa_ref(best); return (best); } static void carp_send_ad_locked(struct carp_softc *sc) { struct carp_header ch; struct timeval tv; struct ifaddr *ifa; struct carp_header *ch_ptr; struct mbuf *m; int len, advskew; NET_EPOCH_ASSERT(); CARP_LOCK_ASSERT(sc); advskew = DEMOTE_ADVSKEW(sc); tv.tv_sec = sc->sc_advbase; tv.tv_usec = advskew * 1000000 / 256; ch.carp_version = CARP_VERSION; ch.carp_type = CARP_ADVERTISEMENT; ch.carp_vhid = sc->sc_vhid; ch.carp_advbase = sc->sc_advbase; ch.carp_advskew = advskew; ch.carp_authlen = 7; /* XXX DEFINE */ ch.carp_pad1 = 0; /* must be zero */ ch.carp_cksum = 0; /* XXXGL: OpenBSD picks first ifaddr with needed family. */ #ifdef INET if (sc->sc_naddrs) { struct ip *ip; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { CARPSTATS_INC(carps_onomem); goto resched; } len = sizeof(*ip) + sizeof(ch); m->m_pkthdr.len = len; m->m_pkthdr.rcvif = NULL; m->m_len = len; M_ALIGN(m, m->m_len); if (IN_MULTICAST(sc->sc_carpaddr.s_addr)) m->m_flags |= M_MCAST; ip = mtod(m, struct ip *); ip->ip_v = IPVERSION; ip->ip_hl = sizeof(*ip) >> 2; ip->ip_tos = V_carp_dscp << IPTOS_DSCP_OFFSET; ip->ip_len = htons(len); ip->ip_off = htons(IP_DF); ip->ip_ttl = CARP_DFLTTL; ip->ip_p = IPPROTO_CARP; ip->ip_sum = 0; ip_fillid(ip); ifa = carp_best_ifa(AF_INET, sc->sc_carpdev); if (ifa != NULL) { ip->ip_src.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr; ifa_free(ifa); } else ip->ip_src.s_addr = 0; ip->ip_dst = sc->sc_carpaddr; ch_ptr = (struct carp_header *)(&ip[1]); bcopy(&ch, ch_ptr, sizeof(ch)); if (carp_prepare_ad(m, sc, ch_ptr)) goto resched; m->m_data += sizeof(*ip); ch_ptr->carp_cksum = in_cksum(m, len - sizeof(*ip)); m->m_data -= sizeof(*ip); CARPSTATS_INC(carps_opackets); carp_send_ad_error(sc, ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_carpdev->if_carp->cif_imo, NULL)); } #endif /* INET */ #ifdef INET6 if (sc->sc_naddrs6) { struct ip6_hdr *ip6; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { CARPSTATS_INC(carps_onomem); goto resched; } len = sizeof(*ip6) + sizeof(ch); m->m_pkthdr.len = len; m->m_pkthdr.rcvif = NULL; m->m_len = len; M_ALIGN(m, m->m_len); ip6 = mtod(m, struct ip6_hdr *); bzero(ip6, sizeof(*ip6)); ip6->ip6_vfc |= IPV6_VERSION; /* Traffic class isn't defined in ip6 struct instead * it gets offset into flowid field */ ip6->ip6_flow |= htonl(V_carp_dscp << (IPV6_FLOWLABEL_LEN + IPTOS_DSCP_OFFSET)); ip6->ip6_hlim = CARP_DFLTTL; ip6->ip6_nxt = IPPROTO_CARP; /* set the source address */ ifa = carp_best_ifa(AF_INET6, sc->sc_carpdev); if (ifa != NULL) { bcopy(IFA_IN6(ifa), &ip6->ip6_src, sizeof(struct in6_addr)); ifa_free(ifa); } else /* This should never happen with IPv6. */ bzero(&ip6->ip6_src, sizeof(struct in6_addr)); /* Set the multicast destination. */ memcpy(&ip6->ip6_dst, &sc->sc_carpaddr6, sizeof(ip6->ip6_dst)); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_dst)) { if (in6_setscope(&ip6->ip6_dst, sc->sc_carpdev, NULL) != 0) { m_freem(m); CARP_DEBUG("%s: in6_setscope failed\n", __func__); goto resched; } } ch_ptr = (struct carp_header *)(&ip6[1]); bcopy(&ch, ch_ptr, sizeof(ch)); if (carp_prepare_ad(m, sc, ch_ptr)) goto resched; m->m_data += sizeof(*ip6); ch_ptr->carp_cksum = in_cksum(m, len - sizeof(*ip6)); m->m_data -= sizeof(*ip6); CARPSTATS_INC(carps_opackets6); carp_send_ad_error(sc, ip6_output(m, NULL, NULL, 0, &sc->sc_carpdev->if_carp->cif_im6o, NULL, NULL)); } #endif /* INET6 */ resched: callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), carp_send_ad, sc); } static void carp_addroute(struct carp_softc *sc) { struct ifaddr *ifa; CARP_FOREACH_IFA(sc, ifa) carp_ifa_addroute(ifa); } static void carp_ifa_addroute(struct ifaddr *ifa) { switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: in_addprefix(ifatoia(ifa)); ifa_add_loopback_route(ifa, (struct sockaddr *)&ifatoia(ifa)->ia_addr); break; #endif #ifdef INET6 case AF_INET6: ifa_add_loopback_route(ifa, (struct sockaddr *)&ifatoia6(ifa)->ia_addr); nd6_add_ifa_lle(ifatoia6(ifa)); break; #endif } } static void carp_delroute(struct carp_softc *sc) { struct ifaddr *ifa; CARP_FOREACH_IFA(sc, ifa) carp_ifa_delroute(ifa); } static void carp_ifa_delroute(struct ifaddr *ifa) { switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: ifa_del_loopback_route(ifa, (struct sockaddr *)&ifatoia(ifa)->ia_addr); in_scrubprefix(ifatoia(ifa), LLE_STATIC); break; #endif #ifdef INET6 case AF_INET6: ifa_del_loopback_route(ifa, (struct sockaddr *)&ifatoia6(ifa)->ia_addr); nd6_rem_ifa_lle(ifatoia6(ifa), 1); break; #endif } } int carp_master(struct ifaddr *ifa) { struct carp_softc *sc = ifa->ifa_carp; return (sc->sc_state == MASTER); } #ifdef INET /* * Broadcast a gratuitous ARP request containing * the virtual router MAC address for each IP address * associated with the virtual router. */ static void carp_send_arp(struct carp_softc *sc) { struct ifaddr *ifa; struct in_addr addr; NET_EPOCH_ASSERT(); CARP_FOREACH_IFA(sc, ifa) { if (ifa->ifa_addr->sa_family != AF_INET) continue; addr = ((struct sockaddr_in *)ifa->ifa_addr)->sin_addr; arp_announce_ifaddr(sc->sc_carpdev, addr, LLADDR(&sc->sc_addr)); } } int carp_iamatch(struct ifaddr *ifa, uint8_t **enaddr) { struct carp_softc *sc = ifa->ifa_carp; if (sc->sc_state == MASTER) { *enaddr = LLADDR(&sc->sc_addr); return (1); } return (0); } #endif #ifdef INET6 static void carp_send_na(struct carp_softc *sc) { static struct in6_addr mcast = IN6ADDR_LINKLOCAL_ALLNODES_INIT; struct ifaddr *ifa; struct in6_addr *in6; CARP_FOREACH_IFA(sc, ifa) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; in6 = IFA_IN6(ifa); nd6_na_output(sc->sc_carpdev, &mcast, in6, ND_NA_FLAG_OVERRIDE, 1, NULL); DELAY(1000); /* XXX */ } } /* * Returns ifa in case it's a carp address and it is MASTER, or if the address * matches and is not a carp address. Returns NULL otherwise. */ struct ifaddr * carp_iamatch6(struct ifnet *ifp, struct in6_addr *taddr) { struct ifaddr *ifa; NET_EPOCH_ASSERT(); ifa = NULL; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (!IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa))) continue; if (ifa->ifa_carp && ifa->ifa_carp->sc_state != MASTER) ifa = NULL; else ifa_ref(ifa); break; } return (ifa); } char * carp_macmatch6(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr) { struct ifaddr *ifa; NET_EPOCH_ASSERT(); IFNET_FOREACH_IFA(ifp, ifa) if (ifa->ifa_addr->sa_family == AF_INET6 && IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa))) { struct carp_softc *sc = ifa->ifa_carp; struct m_tag *mtag; mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct carp_softc *), M_NOWAIT); if (mtag == NULL) /* Better a bit than nothing. */ return (LLADDR(&sc->sc_addr)); bcopy(&sc, mtag + 1, sizeof(sc)); m_tag_prepend(m, mtag); return (LLADDR(&sc->sc_addr)); } return (NULL); } #endif /* INET6 */ int carp_forus(struct ifnet *ifp, u_char *dhost) { struct carp_softc *sc; uint8_t *ena = dhost; if (ena[0] || ena[1] || ena[2] != 0x5e || ena[3] || ena[4] != 1) return (0); CIF_LOCK(ifp->if_carp); IFNET_FOREACH_CARP(ifp, sc) { /* * CARP_LOCK() is not here, since would protect nothing, but * cause deadlock with if_bridge, calling this under its lock. */ if (sc->sc_state == MASTER && !bcmp(dhost, LLADDR(&sc->sc_addr), ETHER_ADDR_LEN)) { CIF_UNLOCK(ifp->if_carp); return (1); } } CIF_UNLOCK(ifp->if_carp); return (0); } /* Master down timeout event, executed in callout context. */ static void carp_master_down(void *v) { struct carp_softc *sc = v; struct epoch_tracker et; NET_EPOCH_ENTER(et); CARP_LOCK_ASSERT(sc); CURVNET_SET(sc->sc_carpdev->if_vnet); if (sc->sc_state == BACKUP) { carp_master_down_locked(sc, "master timed out"); } CURVNET_RESTORE(); CARP_UNLOCK(sc); NET_EPOCH_EXIT(et); } static void carp_master_down_locked(struct carp_softc *sc, const char *reason) { NET_EPOCH_ASSERT(); CARP_LOCK_ASSERT(sc); switch (sc->sc_state) { case BACKUP: carp_set_state(sc, MASTER, reason); carp_send_ad_locked(sc); #ifdef INET carp_send_arp(sc); #endif #ifdef INET6 carp_send_na(sc); #endif carp_setrun(sc, 0); carp_addroute(sc); break; case INIT: case MASTER: #ifdef INVARIANTS panic("carp: VHID %u@%s: master_down event in %s state\n", sc->sc_vhid, if_name(sc->sc_carpdev), sc->sc_state ? "MASTER" : "INIT"); #endif break; } } /* * When in backup state, af indicates whether to reset the master down timer * for v4 or v6. If it's set to zero, reset the ones which are already pending. */ static void carp_setrun(struct carp_softc *sc, sa_family_t af) { struct timeval tv; CARP_LOCK_ASSERT(sc); if ((sc->sc_carpdev->if_flags & IFF_UP) == 0 || sc->sc_carpdev->if_link_state != LINK_STATE_UP || (sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0) || !V_carp_allow) return; switch (sc->sc_state) { case INIT: carp_set_state(sc, BACKUP, "initialization complete"); carp_setrun(sc, 0); break; case BACKUP: callout_stop(&sc->sc_ad_tmo); tv.tv_sec = 3 * sc->sc_advbase; tv.tv_usec = sc->sc_advskew * 1000000 / 256; switch (af) { #ifdef INET case AF_INET: callout_reset(&sc->sc_md_tmo, tvtohz(&tv), carp_master_down, sc); break; #endif #ifdef INET6 case AF_INET6: callout_reset(&sc->sc_md6_tmo, tvtohz(&tv), carp_master_down, sc); break; #endif default: #ifdef INET if (sc->sc_naddrs) callout_reset(&sc->sc_md_tmo, tvtohz(&tv), carp_master_down, sc); #endif #ifdef INET6 if (sc->sc_naddrs6) callout_reset(&sc->sc_md6_tmo, tvtohz(&tv), carp_master_down, sc); #endif break; } break; case MASTER: tv.tv_sec = sc->sc_advbase; tv.tv_usec = sc->sc_advskew * 1000000 / 256; callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), carp_send_ad, sc); break; } } /* * Setup multicast structures. */ static int carp_multicast_setup(struct carp_if *cif, sa_family_t sa) { struct ifnet *ifp = cif->cif_ifp; int error = 0; switch (sa) { #ifdef INET case AF_INET: { struct ip_moptions *imo = &cif->cif_imo; struct in_mfilter *imf; struct in_addr addr; if (ip_mfilter_first(&imo->imo_head) != NULL) return (0); imf = ip_mfilter_alloc(M_WAITOK, 0, 0); ip_mfilter_init(&imo->imo_head); imo->imo_multicast_vif = -1; addr.s_addr = htonl(INADDR_CARP_GROUP); if ((error = in_joingroup(ifp, &addr, NULL, &imf->imf_inm)) != 0) { ip_mfilter_free(imf); break; } ip_mfilter_insert(&imo->imo_head, imf); imo->imo_multicast_ifp = ifp; imo->imo_multicast_ttl = CARP_DFLTTL; imo->imo_multicast_loop = 0; break; } #endif #ifdef INET6 case AF_INET6: { struct ip6_moptions *im6o = &cif->cif_im6o; struct in6_mfilter *im6f[2]; struct in6_addr in6; if (ip6_mfilter_first(&im6o->im6o_head)) return (0); im6f[0] = ip6_mfilter_alloc(M_WAITOK, 0, 0); im6f[1] = ip6_mfilter_alloc(M_WAITOK, 0, 0); ip6_mfilter_init(&im6o->im6o_head); im6o->im6o_multicast_hlim = CARP_DFLTTL; im6o->im6o_multicast_ifp = ifp; /* Join IPv6 CARP multicast group. */ bzero(&in6, sizeof(in6)); in6.s6_addr16[0] = htons(0xff02); in6.s6_addr8[15] = 0x12; if ((error = in6_setscope(&in6, ifp, NULL)) != 0) { ip6_mfilter_free(im6f[0]); ip6_mfilter_free(im6f[1]); break; } if ((error = in6_joingroup(ifp, &in6, NULL, &im6f[0]->im6f_in6m, 0)) != 0) { ip6_mfilter_free(im6f[0]); ip6_mfilter_free(im6f[1]); break; } /* Join solicited multicast address. */ bzero(&in6, sizeof(in6)); in6.s6_addr16[0] = htons(0xff02); in6.s6_addr32[1] = 0; in6.s6_addr32[2] = htonl(1); in6.s6_addr32[3] = 0; in6.s6_addr8[12] = 0xff; if ((error = in6_setscope(&in6, ifp, NULL)) != 0) { ip6_mfilter_free(im6f[0]); ip6_mfilter_free(im6f[1]); break; } if ((error = in6_joingroup(ifp, &in6, NULL, &im6f[1]->im6f_in6m, 0)) != 0) { in6_leavegroup(im6f[0]->im6f_in6m, NULL); ip6_mfilter_free(im6f[0]); ip6_mfilter_free(im6f[1]); break; } ip6_mfilter_insert(&im6o->im6o_head, im6f[0]); ip6_mfilter_insert(&im6o->im6o_head, im6f[1]); break; } #endif } return (error); } /* * Free multicast structures. */ static void carp_multicast_cleanup(struct carp_if *cif, sa_family_t sa) { #ifdef INET struct ip_moptions *imo = &cif->cif_imo; struct in_mfilter *imf; #endif #ifdef INET6 struct ip6_moptions *im6o = &cif->cif_im6o; struct in6_mfilter *im6f; #endif sx_assert(&carp_sx, SA_XLOCKED); switch (sa) { #ifdef INET case AF_INET: if (cif->cif_naddrs != 0) break; while ((imf = ip_mfilter_first(&imo->imo_head)) != NULL) { ip_mfilter_remove(&imo->imo_head, imf); in_leavegroup(imf->imf_inm, NULL); ip_mfilter_free(imf); } break; #endif #ifdef INET6 case AF_INET6: if (cif->cif_naddrs6 != 0) break; while ((im6f = ip6_mfilter_first(&im6o->im6o_head)) != NULL) { ip6_mfilter_remove(&im6o->im6o_head, im6f); in6_leavegroup(im6f->im6f_in6m, NULL); ip6_mfilter_free(im6f); } break; #endif } } int carp_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa) { struct m_tag *mtag; struct carp_softc *sc; if (!sa) return (0); switch (sa->sa_family) { #ifdef INET case AF_INET: break; #endif #ifdef INET6 case AF_INET6: break; #endif default: return (0); } mtag = m_tag_find(m, PACKET_TAG_CARP, NULL); if (mtag == NULL) return (0); bcopy(mtag + 1, &sc, sizeof(sc)); switch (sa->sa_family) { case AF_INET: if (! IN_MULTICAST(ntohl(sc->sc_carpaddr.s_addr))) return (0); break; case AF_INET6: if (! IN6_IS_ADDR_MULTICAST(&sc->sc_carpaddr6)) return (0); break; default: panic("Unknown af"); } /* Set the source MAC address to the Virtual Router MAC Address. */ switch (ifp->if_type) { case IFT_ETHER: case IFT_BRIDGE: case IFT_L2VLAN: { struct ether_header *eh; eh = mtod(m, struct ether_header *); eh->ether_shost[0] = 0; eh->ether_shost[1] = 0; eh->ether_shost[2] = 0x5e; eh->ether_shost[3] = 0; eh->ether_shost[4] = 1; eh->ether_shost[5] = sc->sc_vhid; } break; default: printf("%s: carp is not supported for the %d interface type\n", if_name(ifp), ifp->if_type); return (EOPNOTSUPP); } return (0); } static struct carp_softc* carp_alloc(struct ifnet *ifp) { struct carp_softc *sc; struct carp_if *cif; sx_assert(&carp_sx, SA_XLOCKED); if ((cif = ifp->if_carp) == NULL) cif = carp_alloc_if(ifp); sc = malloc(sizeof(*sc), M_CARP, M_WAITOK|M_ZERO); sc->sc_advbase = CARP_DFLTINTV; sc->sc_vhid = -1; /* required setting */ sc->sc_init_counter = 1; sc->sc_state = INIT; sc->sc_ifasiz = sizeof(struct ifaddr *); sc->sc_ifas = malloc(sc->sc_ifasiz, M_CARP, M_WAITOK|M_ZERO); sc->sc_carpdev = ifp; sc->sc_carpaddr.s_addr = htonl(INADDR_CARP_GROUP); sc->sc_carpaddr6.s6_addr16[0] = IPV6_ADDR_INT16_MLL; sc->sc_carpaddr6.s6_addr8[15] = 0x12; CARP_LOCK_INIT(sc); #ifdef INET callout_init_mtx(&sc->sc_md_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED); #endif #ifdef INET6 callout_init_mtx(&sc->sc_md6_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED); #endif callout_init_mtx(&sc->sc_ad_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED); CIF_LOCK(cif); TAILQ_INSERT_TAIL(&cif->cif_vrs, sc, sc_list); CIF_UNLOCK(cif); mtx_lock(&carp_mtx); LIST_INSERT_HEAD(&carp_list, sc, sc_next); mtx_unlock(&carp_mtx); return (sc); } static void carp_grow_ifas(struct carp_softc *sc) { struct ifaddr **new; new = malloc(sc->sc_ifasiz * 2, M_CARP, M_WAITOK | M_ZERO); CARP_LOCK(sc); bcopy(sc->sc_ifas, new, sc->sc_ifasiz); free(sc->sc_ifas, M_CARP); sc->sc_ifas = new; sc->sc_ifasiz *= 2; CARP_UNLOCK(sc); } static void carp_destroy(struct carp_softc *sc) { struct ifnet *ifp = sc->sc_carpdev; struct carp_if *cif = ifp->if_carp; sx_assert(&carp_sx, SA_XLOCKED); if (sc->sc_suppress) carp_demote_adj(-V_carp_ifdown_adj, "vhid removed"); CARP_UNLOCK(sc); CIF_LOCK(cif); TAILQ_REMOVE(&cif->cif_vrs, sc, sc_list); CIF_UNLOCK(cif); mtx_lock(&carp_mtx); LIST_REMOVE(sc, sc_next); mtx_unlock(&carp_mtx); callout_drain(&sc->sc_ad_tmo); #ifdef INET callout_drain(&sc->sc_md_tmo); #endif #ifdef INET6 callout_drain(&sc->sc_md6_tmo); #endif CARP_LOCK_DESTROY(sc); free(sc->sc_ifas, M_CARP); free(sc, M_CARP); } static struct carp_if* carp_alloc_if(struct ifnet *ifp) { struct carp_if *cif; int error; cif = malloc(sizeof(*cif), M_CARP, M_WAITOK|M_ZERO); if ((error = ifpromisc(ifp, 1)) != 0) printf("%s: ifpromisc(%s) failed: %d\n", __func__, if_name(ifp), error); else cif->cif_flags |= CIF_PROMISC; CIF_LOCK_INIT(cif); cif->cif_ifp = ifp; TAILQ_INIT(&cif->cif_vrs); IF_ADDR_WLOCK(ifp); ifp->if_carp = cif; if_ref(ifp); IF_ADDR_WUNLOCK(ifp); return (cif); } static void carp_free_if(struct carp_if *cif) { struct ifnet *ifp = cif->cif_ifp; CIF_LOCK_ASSERT(cif); KASSERT(TAILQ_EMPTY(&cif->cif_vrs), ("%s: softc list not empty", __func__)); IF_ADDR_WLOCK(ifp); ifp->if_carp = NULL; IF_ADDR_WUNLOCK(ifp); CIF_LOCK_DESTROY(cif); if (cif->cif_flags & CIF_PROMISC) ifpromisc(ifp, 0); if_rele(ifp); free(cif, M_CARP); } static bool carp_carprcp(void *arg, struct carp_softc *sc, int priv) { struct carpreq *carpr = arg; CARP_LOCK(sc); carpr->carpr_state = sc->sc_state; carpr->carpr_vhid = sc->sc_vhid; carpr->carpr_advbase = sc->sc_advbase; carpr->carpr_advskew = sc->sc_advskew; if (priv) bcopy(sc->sc_key, carpr->carpr_key, sizeof(carpr->carpr_key)); else bzero(carpr->carpr_key, sizeof(carpr->carpr_key)); CARP_UNLOCK(sc); return (true); } static int carp_ioctl_set(if_t ifp, struct carpkreq *carpr) { struct epoch_tracker et; struct carp_softc *sc = NULL; int error = 0; if (carpr->carpr_vhid <= 0 || carpr->carpr_vhid > CARP_MAXVHID || carpr->carpr_advbase < 0 || carpr->carpr_advskew < 0) { return (EINVAL); } if (ifp->if_carp) { IFNET_FOREACH_CARP(ifp, sc) if (sc->sc_vhid == carpr->carpr_vhid) break; } if (sc == NULL) { sc = carp_alloc(ifp); CARP_LOCK(sc); sc->sc_vhid = carpr->carpr_vhid; LLADDR(&sc->sc_addr)[0] = 0; LLADDR(&sc->sc_addr)[1] = 0; LLADDR(&sc->sc_addr)[2] = 0x5e; LLADDR(&sc->sc_addr)[3] = 0; LLADDR(&sc->sc_addr)[4] = 1; LLADDR(&sc->sc_addr)[5] = sc->sc_vhid; } else CARP_LOCK(sc); if (carpr->carpr_advbase > 0) { if (carpr->carpr_advbase > 255 || carpr->carpr_advbase < CARP_DFLTINTV) { error = EINVAL; goto out; } sc->sc_advbase = carpr->carpr_advbase; } if (carpr->carpr_advskew >= 255) { error = EINVAL; goto out; } sc->sc_advskew = carpr->carpr_advskew; if (carpr->carpr_addr.s_addr != INADDR_ANY) sc->sc_carpaddr = carpr->carpr_addr; if (! IN6_IS_ADDR_UNSPECIFIED(&carpr->carpr_addr6)) { memcpy(&sc->sc_carpaddr6, &carpr->carpr_addr6, sizeof(sc->sc_carpaddr6)); } if (carpr->carpr_key[0] != '\0') { bcopy(carpr->carpr_key, sc->sc_key, sizeof(sc->sc_key)); carp_hmac_prepare(sc); } if (sc->sc_state != INIT && carpr->carpr_state != sc->sc_state) { switch (carpr->carpr_state) { case BACKUP: callout_stop(&sc->sc_ad_tmo); carp_set_state(sc, BACKUP, "user requested via ifconfig"); carp_setrun(sc, 0); carp_delroute(sc); break; case MASTER: NET_EPOCH_ENTER(et); carp_master_down_locked(sc, "user requested via ifconfig"); NET_EPOCH_EXIT(et); break; default: break; } } out: CARP_UNLOCK(sc); return (error); } static int carp_ioctl_get(if_t ifp, struct ucred *cred, struct carpreq *carpr, bool (*outfn)(void *, struct carp_softc *, int), void *arg) { int priveleged; struct carp_softc *sc; if (carpr->carpr_vhid < 0 || carpr->carpr_vhid > CARP_MAXVHID) return (EINVAL); if (carpr->carpr_count < 1) return (EMSGSIZE); if (ifp->if_carp == NULL) return (ENOENT); priveleged = (priv_check_cred(cred, PRIV_NETINET_CARP) == 0); if (carpr->carpr_vhid != 0) { IFNET_FOREACH_CARP(ifp, sc) if (sc->sc_vhid == carpr->carpr_vhid) break; if (sc == NULL) return (ENOENT); if (! outfn(arg, sc, priveleged)) return (ENOMEM); carpr->carpr_count = 1; } else { int count; count = 0; IFNET_FOREACH_CARP(ifp, sc) count++; if (count > carpr->carpr_count) return (EMSGSIZE); IFNET_FOREACH_CARP(ifp, sc) { if (! outfn(arg, sc, priveleged)) return (ENOMEM); carpr->carpr_count = count; } } return (0); } int carp_ioctl(struct ifreq *ifr, u_long cmd, struct thread *td) { struct carpreq carpr; struct carpkreq carprk = { }; struct ifnet *ifp; int error = 0; if ((error = copyin(ifr_data_get_ptr(ifr), &carpr, sizeof carpr))) return (error); ifp = ifunit_ref(ifr->ifr_name); if ((error = carp_is_supported_if(ifp)) != 0) goto out; if ((ifp->if_flags & IFF_MULTICAST) == 0) { error = EADDRNOTAVAIL; goto out; } sx_xlock(&carp_sx); switch (cmd) { case SIOCSVH: if ((error = priv_check(td, PRIV_NETINET_CARP))) break; memcpy(&carprk, &carpr, sizeof(carpr)); error = carp_ioctl_set(ifp, &carprk); break; case SIOCGVH: error = carp_ioctl_get(ifp, td->td_ucred, &carpr, carp_carprcp, &carpr); if (error == 0) { error = copyout(&carpr, (char *)ifr_data_get_ptr(ifr), carpr.carpr_count * sizeof(carpr)); } break; default: error = EINVAL; } sx_xunlock(&carp_sx); out: if (ifp != NULL) if_rele(ifp); return (error); } static int carp_get_vhid(struct ifaddr *ifa) { if (ifa == NULL || ifa->ifa_carp == NULL) return (0); return (ifa->ifa_carp->sc_vhid); } int carp_attach(struct ifaddr *ifa, int vhid) { struct ifnet *ifp = ifa->ifa_ifp; struct carp_if *cif = ifp->if_carp; struct carp_softc *sc; int index, error; KASSERT(ifa->ifa_carp == NULL, ("%s: ifa %p attached", __func__, ifa)); switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: #endif #ifdef INET6 case AF_INET6: #endif break; default: return (EPROTOTYPE); } sx_xlock(&carp_sx); if (ifp->if_carp == NULL) { sx_xunlock(&carp_sx); return (ENOPROTOOPT); } IFNET_FOREACH_CARP(ifp, sc) if (sc->sc_vhid == vhid) break; if (sc == NULL) { sx_xunlock(&carp_sx); return (ENOENT); } error = carp_multicast_setup(cif, ifa->ifa_addr->sa_family); if (error) { CIF_FREE(cif); sx_xunlock(&carp_sx); return (error); } index = sc->sc_naddrs + sc->sc_naddrs6 + 1; if (index > sc->sc_ifasiz / sizeof(struct ifaddr *)) carp_grow_ifas(sc); switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: cif->cif_naddrs++; sc->sc_naddrs++; break; #endif #ifdef INET6 case AF_INET6: cif->cif_naddrs6++; sc->sc_naddrs6++; break; #endif } ifa_ref(ifa); CARP_LOCK(sc); sc->sc_ifas[index - 1] = ifa; ifa->ifa_carp = sc; carp_hmac_prepare(sc); carp_sc_state(sc); CARP_UNLOCK(sc); sx_xunlock(&carp_sx); return (0); } void carp_detach(struct ifaddr *ifa, bool keep_cif) { struct ifnet *ifp = ifa->ifa_ifp; struct carp_if *cif = ifp->if_carp; struct carp_softc *sc = ifa->ifa_carp; int i, index; KASSERT(sc != NULL, ("%s: %p not attached", __func__, ifa)); sx_xlock(&carp_sx); CARP_LOCK(sc); /* Shift array. */ index = sc->sc_naddrs + sc->sc_naddrs6; for (i = 0; i < index; i++) if (sc->sc_ifas[i] == ifa) break; KASSERT(i < index, ("%s: %p no backref", __func__, ifa)); for (; i < index - 1; i++) sc->sc_ifas[i] = sc->sc_ifas[i+1]; sc->sc_ifas[index - 1] = NULL; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: cif->cif_naddrs--; sc->sc_naddrs--; break; #endif #ifdef INET6 case AF_INET6: cif->cif_naddrs6--; sc->sc_naddrs6--; break; #endif } carp_ifa_delroute(ifa); carp_multicast_cleanup(cif, ifa->ifa_addr->sa_family); ifa->ifa_carp = NULL; ifa_free(ifa); carp_hmac_prepare(sc); carp_sc_state(sc); if (!keep_cif && sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0) carp_destroy(sc); else CARP_UNLOCK(sc); if (!keep_cif) CIF_FREE(cif); sx_xunlock(&carp_sx); } static void carp_set_state(struct carp_softc *sc, int state, const char *reason) { CARP_LOCK_ASSERT(sc); if (sc->sc_state != state) { const char *carp_states[] = { CARP_STATES }; char subsys[IFNAMSIZ+5]; snprintf(subsys, IFNAMSIZ+5, "%u@%s", sc->sc_vhid, if_name(sc->sc_carpdev)); CARP_LOG("%s: %s -> %s (%s)\n", subsys, carp_states[sc->sc_state], carp_states[state], reason); sc->sc_state = state; devctl_notify("CARP", subsys, carp_states[state], NULL); } } static void carp_linkstate(struct ifnet *ifp) { struct carp_softc *sc; CIF_LOCK(ifp->if_carp); IFNET_FOREACH_CARP(ifp, sc) { CARP_LOCK(sc); carp_sc_state(sc); CARP_UNLOCK(sc); } CIF_UNLOCK(ifp->if_carp); } static void carp_sc_state(struct carp_softc *sc) { CARP_LOCK_ASSERT(sc); if (sc->sc_carpdev->if_link_state != LINK_STATE_UP || !(sc->sc_carpdev->if_flags & IFF_UP) || !V_carp_allow) { callout_stop(&sc->sc_ad_tmo); #ifdef INET callout_stop(&sc->sc_md_tmo); #endif #ifdef INET6 callout_stop(&sc->sc_md6_tmo); #endif carp_set_state(sc, INIT, "hardware interface down"); carp_setrun(sc, 0); carp_delroute(sc); if (!sc->sc_suppress) carp_demote_adj(V_carp_ifdown_adj, "interface down"); sc->sc_suppress = 1; } else { carp_set_state(sc, INIT, "hardware interface up"); carp_setrun(sc, 0); if (sc->sc_suppress) carp_demote_adj(-V_carp_ifdown_adj, "interface up"); sc->sc_suppress = 0; } } static void carp_demote_adj(int adj, char *reason) { atomic_add_int(&V_carp_demotion, adj); CARP_LOG("demoted by %d to %d (%s)\n", adj, V_carp_demotion, reason); taskqueue_enqueue(taskqueue_swi, &carp_sendall_task); } static int carp_allow_sysctl(SYSCTL_HANDLER_ARGS) { int new, error; struct carp_softc *sc; new = V_carp_allow; error = sysctl_handle_int(oidp, &new, 0, req); if (error || !req->newptr) return (error); if (V_carp_allow != new) { V_carp_allow = new; mtx_lock(&carp_mtx); LIST_FOREACH(sc, &carp_list, sc_next) { CARP_LOCK(sc); if (curvnet == sc->sc_carpdev->if_vnet) carp_sc_state(sc); CARP_UNLOCK(sc); } mtx_unlock(&carp_mtx); } return (0); } static int carp_dscp_sysctl(SYSCTL_HANDLER_ARGS) { int new, error; new = V_carp_dscp; error = sysctl_handle_int(oidp, &new, 0, req); if (error || !req->newptr) return (error); if (new < 0 || new > 63) return (EINVAL); V_carp_dscp = new; return (0); } static int carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS) { int new, error; new = V_carp_demotion; error = sysctl_handle_int(oidp, &new, 0, req); if (error || !req->newptr) return (error); carp_demote_adj(new, "sysctl"); return (0); } static int nlattr_get_carp_key(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) { if (__predict_false(NLA_DATA_LEN(nla) > CARP_KEY_LEN)) return (EINVAL); memcpy(target, NLA_DATA_CONST(nla), NLA_DATA_LEN(nla)); return (0); } struct carp_nl_send_args { struct nlmsghdr *hdr; struct nl_pstate *npt; }; static bool carp_nl_send(void *arg, struct carp_softc *sc, int priv) { struct carp_nl_send_args *nlsa = arg; struct nlmsghdr *hdr = nlsa->hdr; struct nl_pstate *npt = nlsa->npt; struct nl_writer *nw = npt->nw; struct genlmsghdr *ghdr_new; if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr))) { nlmsg_abort(nw); return (false); } ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr); if (ghdr_new == NULL) { nlmsg_abort(nw); return (false); } ghdr_new->cmd = CARP_NL_CMD_GET; ghdr_new->version = 0; ghdr_new->reserved = 0; CARP_LOCK(sc); nlattr_add_u32(nw, CARP_NL_VHID, sc->sc_vhid); nlattr_add_u32(nw, CARP_NL_STATE, sc->sc_state); nlattr_add_s32(nw, CARP_NL_ADVBASE, sc->sc_advbase); nlattr_add_s32(nw, CARP_NL_ADVSKEW, sc->sc_advskew); nlattr_add_in_addr(nw, CARP_NL_ADDR, &sc->sc_carpaddr); nlattr_add_in6_addr(nw, CARP_NL_ADDR6, &sc->sc_carpaddr6); if (priv) nlattr_add(nw, CARP_NL_KEY, sizeof(sc->sc_key), sc->sc_key); CARP_UNLOCK(sc); if (! nlmsg_end(nw)) { nlmsg_abort(nw); return (false); } return (true); } struct nl_carp_parsed { unsigned int ifindex; char *ifname; uint32_t state; uint32_t vhid; int32_t advbase; int32_t advskew; char key[CARP_KEY_LEN]; struct in_addr addr; struct in6_addr addr6; }; #define _IN(_field) offsetof(struct genlmsghdr, _field) #define _OUT(_field) offsetof(struct nl_carp_parsed, _field) static const struct nlattr_parser nla_p_set[] = { { .type = CARP_NL_VHID, .off = _OUT(vhid), .cb = nlattr_get_uint32 }, { .type = CARP_NL_STATE, .off = _OUT(state), .cb = nlattr_get_uint32 }, { .type = CARP_NL_ADVBASE, .off = _OUT(advbase), .cb = nlattr_get_uint32 }, { .type = CARP_NL_ADVSKEW, .off = _OUT(advskew), .cb = nlattr_get_uint32 }, { .type = CARP_NL_KEY, .off = _OUT(key), .cb = nlattr_get_carp_key }, { .type = CARP_NL_IFINDEX, .off = _OUT(ifindex), .cb = nlattr_get_uint32 }, { .type = CARP_NL_ADDR, .off = _OUT(addr), .cb = nlattr_get_in_addr }, { .type = CARP_NL_ADDR6, .off = _OUT(addr6), .cb = nlattr_get_in6_addr }, { .type = CARP_NL_IFNAME, .off = _OUT(ifname), .cb = nlattr_get_string }, }; static const struct nlfield_parser nlf_p_set[] = { }; NL_DECLARE_PARSER(carp_parser, struct genlmsghdr, nlf_p_set, nla_p_set); #undef _IN #undef _OUT static int carp_nl_get(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct nl_carp_parsed attrs = { }; struct carp_nl_send_args args; struct carpreq carpr = { }; struct epoch_tracker et; if_t ifp = NULL; int error; error = nl_parse_nlmsg(hdr, &carp_parser, npt, &attrs); if (error != 0) return (error); NET_EPOCH_ENTER(et); if (attrs.ifname != NULL) ifp = ifunit_ref(attrs.ifname); else if (attrs.ifindex != 0) ifp = ifnet_byindex_ref(attrs.ifindex); NET_EPOCH_EXIT(et); if ((error = carp_is_supported_if(ifp)) != 0) goto out; hdr->nlmsg_flags |= NLM_F_MULTI; args.hdr = hdr; args.npt = npt; carpr.carpr_vhid = attrs.vhid; carpr.carpr_count = CARP_MAXVHID; sx_xlock(&carp_sx); error = carp_ioctl_get(ifp, nlp_get_cred(npt->nlp), &carpr, carp_nl_send, &args); sx_xunlock(&carp_sx); if (! nlmsg_end_dump(npt->nw, error, hdr)) error = ENOMEM; out: if (ifp != NULL) if_rele(ifp); return (error); } static int carp_nl_set(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct nl_carp_parsed attrs = { }; struct carpkreq carpr; struct epoch_tracker et; if_t ifp = NULL; int error; error = nl_parse_nlmsg(hdr, &carp_parser, npt, &attrs); if (error != 0) return (error); if (attrs.vhid <= 0 || attrs.vhid > CARP_MAXVHID) return (EINVAL); if (attrs.state > CARP_MAXSTATE) return (EINVAL); if (attrs.advbase < 0 || attrs.advskew < 0) return (EINVAL); if (attrs.advbase > 255) return (EINVAL); if (attrs.advskew >= 255) return (EINVAL); NET_EPOCH_ENTER(et); if (attrs.ifname != NULL) ifp = ifunit_ref(attrs.ifname); else if (attrs.ifindex != 0) ifp = ifnet_byindex_ref(attrs.ifindex); NET_EPOCH_EXIT(et); if ((error = carp_is_supported_if(ifp)) != 0) goto out; if ((ifp->if_flags & IFF_MULTICAST) == 0) { error = EADDRNOTAVAIL; goto out; } carpr.carpr_count = 1; carpr.carpr_vhid = attrs.vhid; carpr.carpr_state = attrs.state; carpr.carpr_advbase = attrs.advbase; carpr.carpr_advskew = attrs.advskew; carpr.carpr_addr = attrs.addr; carpr.carpr_addr6 = attrs.addr6; memcpy(&carpr.carpr_key, &attrs.key, sizeof(attrs.key)); sx_xlock(&carp_sx); error = carp_ioctl_set(ifp, &carpr); sx_xunlock(&carp_sx); out: if (ifp != NULL) if_rele(ifp); return (error); } static const struct nlhdr_parser *all_parsers[] = { &carp_parser }; static const struct genl_cmd carp_cmds[] = { { .cmd_num = CARP_NL_CMD_GET, .cmd_name = "SIOCGVH", .cmd_cb = carp_nl_get, .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP | GENL_CMD_CAP_HASPOL, }, { .cmd_num = CARP_NL_CMD_SET, .cmd_name = "SIOCSVH", .cmd_cb = carp_nl_set, .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_NETINET_CARP, }, }; static void carp_nl_register(void) { bool ret __diagused; int family_id __diagused; NL_VERIFY_PARSERS(all_parsers); family_id = genl_register_family(CARP_NL_FAMILY_NAME, 0, 2, CARP_NL_CMD_MAX); MPASS(family_id != 0); ret = genl_register_cmds(CARP_NL_FAMILY_NAME, carp_cmds, NL_ARRAY_LEN(carp_cmds)); MPASS(ret); } static void carp_nl_unregister(void) { genl_unregister_family(CARP_NL_FAMILY_NAME); } static void carp_mod_cleanup(void) { carp_nl_unregister(); #ifdef INET (void)ipproto_unregister(IPPROTO_CARP); carp_iamatch_p = NULL; #endif #ifdef INET6 (void)ip6proto_unregister(IPPROTO_CARP); carp_iamatch6_p = NULL; carp_macmatch6_p = NULL; #endif carp_ioctl_p = NULL; carp_attach_p = NULL; carp_detach_p = NULL; carp_get_vhid_p = NULL; carp_linkstate_p = NULL; carp_forus_p = NULL; carp_output_p = NULL; carp_demote_adj_p = NULL; carp_master_p = NULL; mtx_unlock(&carp_mtx); taskqueue_drain(taskqueue_swi, &carp_sendall_task); mtx_destroy(&carp_mtx); sx_destroy(&carp_sx); } static void ipcarp_sysinit(void) { /* Load allow as tunable so to postpone carp start after module load */ TUNABLE_INT_FETCH("net.inet.carp.allow", &V_carp_allow); } VNET_SYSINIT(ip_carp, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, ipcarp_sysinit, NULL); static int carp_mod_load(void) { int err; mtx_init(&carp_mtx, "carp_mtx", NULL, MTX_DEF); sx_init(&carp_sx, "carp_sx"); LIST_INIT(&carp_list); carp_get_vhid_p = carp_get_vhid; carp_forus_p = carp_forus; carp_output_p = carp_output; carp_linkstate_p = carp_linkstate; carp_ioctl_p = carp_ioctl; carp_attach_p = carp_attach; carp_detach_p = carp_detach; carp_demote_adj_p = carp_demote_adj; carp_master_p = carp_master; #ifdef INET6 carp_iamatch6_p = carp_iamatch6; carp_macmatch6_p = carp_macmatch6; err = ip6proto_register(IPPROTO_CARP, carp6_input, NULL); if (err) { printf("carp: error %d registering with INET6\n", err); carp_mod_cleanup(); return (err); } #endif #ifdef INET carp_iamatch_p = carp_iamatch; err = ipproto_register(IPPROTO_CARP, carp_input, NULL); if (err) { printf("carp: error %d registering with INET\n", err); carp_mod_cleanup(); return (err); } #endif carp_nl_register(); return (0); } static int carp_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: return carp_mod_load(); /* NOTREACHED */ case MOD_UNLOAD: mtx_lock(&carp_mtx); if (LIST_EMPTY(&carp_list)) carp_mod_cleanup(); else { mtx_unlock(&carp_mtx); return (EBUSY); } break; default: return (EINVAL); } return (0); } static moduledata_t carp_mod = { "carp", carp_modevent, 0 }; DECLARE_MODULE(carp, carp_mod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); diff --git a/sys/netlink/ktest_netlink_message_writer.c b/sys/netlink/ktest_netlink_message_writer.c index c13a25e05a70..e46065dd4bd2 100644 --- a/sys/netlink/ktest_netlink_message_writer.c +++ b/sys/netlink/ktest_netlink_message_writer.c @@ -1,169 +1,167 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2023 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#include "opt_netlink.h" - #include #include #include #include #include #include #include #include #define KTEST_CALLER #include #ifdef INVARIANTS struct test_mbuf_attrs { uint32_t size; uint32_t expected_avail; uint32_t expected_count; uint32_t wtype; int waitok; }; #define _OUT(_field) offsetof(struct test_mbuf_attrs, _field) static const struct nlattr_parser nla_p_mbuf_w[] = { { .type = 1, .off = _OUT(size), .cb = nlattr_get_uint32 }, { .type = 2, .off = _OUT(expected_avail), .cb = nlattr_get_uint32 }, { .type = 3, .off = _OUT(expected_count), .cb = nlattr_get_uint32 }, { .type = 4, .off = _OUT(wtype), .cb = nlattr_get_uint32 }, { .type = 5, .off = _OUT(waitok), .cb = nlattr_get_uint32 }, }; #undef _OUT NL_DECLARE_ATTR_PARSER(mbuf_w_parser, nla_p_mbuf_w); static int test_mbuf_parser(struct ktest_test_context *ctx, struct nlattr *nla) { struct test_mbuf_attrs *attrs = npt_alloc(ctx->npt, sizeof(*attrs)); ctx->arg = attrs; if (attrs != NULL) return (nl_parse_nested(nla, &mbuf_w_parser, ctx->npt, attrs)); return (ENOMEM); } static int test_mbuf_writer_allocation(struct ktest_test_context *ctx) { struct test_mbuf_attrs *attrs = ctx->arg; bool ret; struct nl_writer nw = {}; ret = nlmsg_get_buf_type_wrapper(&nw, attrs->size, attrs->wtype, attrs->waitok); if (!ret) return (EINVAL); int alloc_len = nw.alloc_len; KTEST_LOG(ctx, "requested %u, allocated %d", attrs->size, alloc_len); /* Set cleanup callback */ nw.writer_target = NS_WRITER_TARGET_SOCKET; nlmsg_set_callback_wrapper(&nw); /* Mark enomem to avoid reallocation */ nw.enomem = true; if (nlmsg_reserve_data(&nw, alloc_len, void *) == NULL) { KTEST_LOG(ctx, "unable to get %d bytes from the writer", alloc_len); return (EINVAL); } /* Mark as empty to free the storage */ nw.offset = 0; nlmsg_flush(&nw); if (alloc_len < attrs->expected_avail) { KTEST_LOG(ctx, "alloc_len %d, expected %u", alloc_len, attrs->expected_avail); return (EINVAL); } return (0); } static int test_mbuf_chain_allocation(struct ktest_test_context *ctx) { struct test_mbuf_attrs *attrs = ctx->arg; int mflags = attrs->waitok ? M_WAITOK : M_NOWAIT; struct mbuf *chain = nl_get_mbuf_chain_wrapper(attrs->size, mflags); if (chain == NULL) { KTEST_LOG(ctx, "nl_get_mbuf_chain(%u) returned NULL", attrs->size); return (EINVAL); } /* Iterate and check number of mbufs and space */ uint32_t allocated_count = 0, allocated_size = 0; for (struct mbuf *m = chain; m != NULL; m = m->m_next) { allocated_count++; allocated_size += M_SIZE(m); } m_freem(chain); if (attrs->expected_avail > allocated_size) { KTEST_LOG(ctx, "expected/allocated avail(bytes) %u/%u" " expected/allocated count %u/%u", attrs->expected_avail, allocated_size, attrs->expected_count, allocated_count); return (EINVAL); } if (attrs->expected_count > 0 && (attrs->expected_count != allocated_count)) { KTEST_LOG(ctx, "expected/allocated avail(bytes) %u/%u" " expected/allocated count %u/%u", attrs->expected_avail, allocated_size, attrs->expected_count, allocated_count); return (EINVAL); } return (0); } #endif static const struct ktest_test_info tests[] = { #ifdef INVARIANTS { .name = "test_mbuf_writer_allocation", .desc = "test different mbuf sizes in the mbuf writer", .func = &test_mbuf_writer_allocation, .parse = &test_mbuf_parser, }, { .name = "test_mbuf_chain_allocation", .desc = "verify allocation different chain sizes", .func = &test_mbuf_chain_allocation, .parse = &test_mbuf_parser, }, #endif }; KTEST_MODULE_DECLARE(ktest_netlink_message_writer, tests); diff --git a/sys/netlink/netlink_domain.c b/sys/netlink/netlink_domain.c index 262361773fbb..7b2bbd39447d 100644 --- a/sys/netlink/netlink_domain.c +++ b/sys/netlink/netlink_domain.c @@ -1,809 +1,808 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2021 Ng Peng Nam Sean * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This file contains socket and protocol bindings for netlink. */ -#include "opt_netlink.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* priv_check */ #include #include #include #define DEBUG_MOD_NAME nl_domain #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include _DECLARE_DEBUG(LOG_INFO); _Static_assert((NLP_MAX_GROUPS % 64) == 0, "NLP_MAX_GROUPS has to be multiple of 64"); _Static_assert(NLP_MAX_GROUPS >= 64, "NLP_MAX_GROUPS has to be at least 64"); #define NLCTL_TRACKER struct rm_priotracker nl_tracker #define NLCTL_RLOCK(_ctl) rm_rlock(&((_ctl)->ctl_lock), &nl_tracker) #define NLCTL_RUNLOCK(_ctl) rm_runlock(&((_ctl)->ctl_lock), &nl_tracker) #define NLCTL_WLOCK(_ctl) rm_wlock(&((_ctl)->ctl_lock)) #define NLCTL_WUNLOCK(_ctl) rm_wunlock(&((_ctl)->ctl_lock)) static u_long nl_sendspace = NLSNDQ; SYSCTL_ULONG(_net_netlink, OID_AUTO, sendspace, CTLFLAG_RW, &nl_sendspace, 0, "Default netlink socket send space"); static u_long nl_recvspace = NLSNDQ; SYSCTL_ULONG(_net_netlink, OID_AUTO, recvspace, CTLFLAG_RW, &nl_recvspace, 0, "Default netlink socket receive space"); extern u_long sb_max_adj; static u_long nl_maxsockbuf = 512 * 1024 * 1024; /* 512M, XXX: init based on physmem */ static int sysctl_handle_nl_maxsockbuf(SYSCTL_HANDLER_ARGS); SYSCTL_OID(_net_netlink, OID_AUTO, nl_maxsockbuf, CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, &nl_maxsockbuf, 0, sysctl_handle_nl_maxsockbuf, "LU", "Maximum Netlink socket buffer size"); static unsigned int osd_slot_id = 0; void nl_osd_register(void) { osd_slot_id = osd_register(OSD_THREAD, NULL, NULL); } void nl_osd_unregister(void) { osd_deregister(OSD_THREAD, osd_slot_id); } struct nlpcb * _nl_get_thread_nlp(struct thread *td) { return (osd_get(OSD_THREAD, &td->td_osd, osd_slot_id)); } void nl_set_thread_nlp(struct thread *td, struct nlpcb *nlp) { NLP_LOG(LOG_DEBUG2, nlp, "Set thread %p nlp to %p (slot %u)", td, nlp, osd_slot_id); if (osd_set(OSD_THREAD, &td->td_osd, osd_slot_id, nlp) == 0) return; /* Failed, need to realloc */ void **rsv = osd_reserve(osd_slot_id); osd_set_reserved(OSD_THREAD, &td->td_osd, osd_slot_id, rsv, nlp); } /* * Looks up a nlpcb struct based on the @portid. Need to claim nlsock_mtx. * Returns nlpcb pointer if present else NULL */ static struct nlpcb * nl_port_lookup(uint32_t port_id) { struct nlpcb *nlp; CK_LIST_FOREACH(nlp, &V_nl_ctl->ctl_port_head, nl_port_next) { if (nlp->nl_port == port_id) return (nlp); } return (NULL); } static void nl_add_group_locked(struct nlpcb *nlp, unsigned int group_id) { MPASS(group_id <= NLP_MAX_GROUPS); --group_id; /* TODO: add family handler callback */ if (!nlp_unconstrained_vnet(nlp)) return; nlp->nl_groups[group_id / 64] |= (uint64_t)1 << (group_id % 64); } static void nl_del_group_locked(struct nlpcb *nlp, unsigned int group_id) { MPASS(group_id <= NLP_MAX_GROUPS); --group_id; nlp->nl_groups[group_id / 64] &= ~((uint64_t)1 << (group_id % 64)); } static bool nl_isset_group_locked(struct nlpcb *nlp, unsigned int group_id) { MPASS(group_id <= NLP_MAX_GROUPS); --group_id; return (nlp->nl_groups[group_id / 64] & ((uint64_t)1 << (group_id % 64))); } static uint32_t nl_get_groups_compat(struct nlpcb *nlp) { uint32_t groups_mask = 0; for (int i = 0; i < 32; i++) { if (nl_isset_group_locked(nlp, i + 1)) groups_mask |= (1 << i); } return (groups_mask); } static void nl_send_one_group(struct mbuf *m, struct nlpcb *nlp, int num_messages, int io_flags) { if (__predict_false(nlp->nl_flags & NLF_MSG_INFO)) nl_add_msg_info(m); nl_send_one(m, nlp, num_messages, io_flags); } /* * Broadcasts message @m to the protocol @proto group specified by @group_id */ void nl_send_group(struct mbuf *m, int num_messages, int proto, int group_id) { struct nlpcb *nlp_last = NULL; struct nlpcb *nlp; NLCTL_TRACKER; IF_DEBUG_LEVEL(LOG_DEBUG2) { struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *); NL_LOG(LOG_DEBUG2, "MCAST mbuf len %u msg type %d len %u to group %d/%d", m->m_len, hdr->nlmsg_type, hdr->nlmsg_len, proto, group_id); } struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); if (__predict_false(ctl == NULL)) { /* * Can be the case when notification is sent within VNET * which doesn't have any netlink sockets. */ m_freem(m); return; } NLCTL_RLOCK(ctl); int io_flags = NL_IOF_UNTRANSLATED; CK_LIST_FOREACH(nlp, &ctl->ctl_pcb_head, nl_next) { if (nl_isset_group_locked(nlp, group_id) && nlp->nl_proto == proto) { if (nlp_last != NULL) { struct mbuf *m_copy; m_copy = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (m_copy != NULL) nl_send_one_group(m_copy, nlp_last, num_messages, io_flags); else { NLP_LOCK(nlp_last); if (nlp_last->nl_socket != NULL) sorwakeup(nlp_last->nl_socket); NLP_UNLOCK(nlp_last); } } nlp_last = nlp; } } if (nlp_last != NULL) nl_send_one_group(m, nlp_last, num_messages, io_flags); else m_freem(m); NLCTL_RUNLOCK(ctl); } bool nl_has_listeners(int netlink_family, uint32_t groups_mask) { return (V_nl_ctl != NULL); } static uint32_t nl_find_port(void) { /* * app can open multiple netlink sockets. * Start with current pid, if already taken, * try random numbers in 65k..256k+65k space, * avoiding clash with pids. */ if (nl_port_lookup(curproc->p_pid) == NULL) return (curproc->p_pid); for (int i = 0; i < 16; i++) { uint32_t nl_port = (arc4random() % 65536) + 65536 * 4; if (nl_port_lookup(nl_port) == 0) return (nl_port); NL_LOG(LOG_DEBUG3, "tried %u\n", nl_port); } return (curproc->p_pid); } static int nl_bind_locked(struct nlpcb *nlp, struct sockaddr_nl *snl) { if (nlp->nl_bound) { if (nlp->nl_port != snl->nl_pid) { NL_LOG(LOG_DEBUG, "bind() failed: program pid %d " "is different from provided pid %d", nlp->nl_port, snl->nl_pid); return (EINVAL); // XXX: better error } } else { if (snl->nl_pid == 0) snl->nl_pid = nl_find_port(); if (nl_port_lookup(snl->nl_pid) != NULL) return (EADDRINUSE); nlp->nl_port = snl->nl_pid; nlp->nl_bound = true; CK_LIST_INSERT_HEAD(&V_nl_ctl->ctl_port_head, nlp, nl_port_next); } for (int i = 0; i < 32; i++) { if (snl->nl_groups & ((uint32_t)1 << i)) nl_add_group_locked(nlp, i + 1); else nl_del_group_locked(nlp, i + 1); } return (0); } static int nl_pru_attach(struct socket *so, int proto, struct thread *td) { struct nlpcb *nlp; int error; if (__predict_false(netlink_unloading != 0)) return (EAFNOSUPPORT); error = nl_verify_proto(proto); if (error != 0) return (error); bool is_linux = SV_PROC_ABI(td->td_proc) == SV_ABI_LINUX; NL_LOG(LOG_DEBUG2, "socket %p, %sPID %d: attaching socket to %s", so, is_linux ? "(linux) " : "", curproc->p_pid, nl_get_proto_name(proto)); /* Create per-VNET state on first socket init */ struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); if (ctl == NULL) ctl = vnet_nl_ctl_init(); KASSERT(V_nl_ctl != NULL, ("nl_attach: vnet_sock_init() failed")); MPASS(sotonlpcb(so) == NULL); nlp = malloc(sizeof(struct nlpcb), M_PCB, M_WAITOK | M_ZERO); error = soreserve(so, nl_sendspace, nl_recvspace); if (error != 0) { free(nlp, M_PCB); return (error); } so->so_pcb = nlp; nlp->nl_socket = so; /* Copy so_cred to avoid having socket_var.h in every header */ nlp->nl_cred = so->so_cred; nlp->nl_proto = proto; nlp->nl_process_id = curproc->p_pid; nlp->nl_linux = is_linux; nlp->nl_active = true; nlp->nl_unconstrained_vnet = !jailed_without_vnet(so->so_cred); nlp->nl_need_thread_setup = true; NLP_LOCK_INIT(nlp); refcount_init(&nlp->nl_refcount, 1); nl_init_io(nlp); nlp->nl_taskqueue = taskqueue_create("netlink_socket", M_WAITOK, taskqueue_thread_enqueue, &nlp->nl_taskqueue); TASK_INIT(&nlp->nl_task, 0, nl_taskqueue_handler, nlp); taskqueue_start_threads(&nlp->nl_taskqueue, 1, PWAIT, "netlink_socket (PID %u)", nlp->nl_process_id); NLCTL_WLOCK(ctl); /* XXX: check ctl is still alive */ CK_LIST_INSERT_HEAD(&ctl->ctl_pcb_head, nlp, nl_next); NLCTL_WUNLOCK(ctl); soisconnected(so); return (0); } static void nl_pru_abort(struct socket *so) { NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); MPASS(sotonlpcb(so) != NULL); soisdisconnected(so); } static int nl_pru_bind(struct socket *so, struct sockaddr *sa, struct thread *td) { struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); struct nlpcb *nlp = sotonlpcb(so); struct sockaddr_nl *snl = (struct sockaddr_nl *)sa; int error; NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); if (snl->nl_len != sizeof(*snl)) { NL_LOG(LOG_DEBUG, "socket %p, wrong sizeof(), ignoring bind()", so); return (EINVAL); } NLCTL_WLOCK(ctl); NLP_LOCK(nlp); error = nl_bind_locked(nlp, snl); NLP_UNLOCK(nlp); NLCTL_WUNLOCK(ctl); NL_LOG(LOG_DEBUG2, "socket %p, bind() to %u, groups %u, error %d", so, snl->nl_pid, snl->nl_groups, error); return (error); } static int nl_assign_port(struct nlpcb *nlp, uint32_t port_id) { struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); struct sockaddr_nl snl = { .nl_pid = port_id, }; int error; NLCTL_WLOCK(ctl); NLP_LOCK(nlp); snl.nl_groups = nl_get_groups_compat(nlp); error = nl_bind_locked(nlp, &snl); NLP_UNLOCK(nlp); NLCTL_WUNLOCK(ctl); NL_LOG(LOG_DEBUG3, "socket %p, port assign: %d, error: %d", nlp->nl_socket, port_id, error); return (error); } /* * nl_autobind_port binds a unused portid to @nlp * @nlp: pcb data for the netlink socket * @candidate_id: first id to consider */ static int nl_autobind_port(struct nlpcb *nlp, uint32_t candidate_id) { struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); uint32_t port_id = candidate_id; NLCTL_TRACKER; bool exist; int error = EADDRINUSE; for (int i = 0; i < 10; i++) { NL_LOG(LOG_DEBUG3, "socket %p, trying to assign port %d", nlp->nl_socket, port_id); NLCTL_RLOCK(ctl); exist = nl_port_lookup(port_id) != 0; NLCTL_RUNLOCK(ctl); if (!exist) { error = nl_assign_port(nlp, port_id); if (error != EADDRINUSE) break; } port_id++; } NL_LOG(LOG_DEBUG3, "socket %p, autobind to %d, error: %d", nlp->nl_socket, port_id, error); return (error); } static int nl_pru_connect(struct socket *so, struct sockaddr *sa, struct thread *td) { struct sockaddr_nl *snl = (struct sockaddr_nl *)sa; struct nlpcb *nlp; NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); if (snl->nl_len != sizeof(*snl)) { NL_LOG(LOG_DEBUG, "socket %p, wrong sizeof(), ignoring bind()", so); return (EINVAL); } nlp = sotonlpcb(so); if (!nlp->nl_bound) { int error = nl_autobind_port(nlp, td->td_proc->p_pid); if (error != 0) { NL_LOG(LOG_DEBUG, "socket %p, nl_autobind() failed: %d", so, error); return (error); } } /* XXX: Handle socket flags & multicast */ soisconnected(so); NL_LOG(LOG_DEBUG2, "socket %p, connect to %u", so, snl->nl_pid); return (0); } static void destroy_nlpcb(struct nlpcb *nlp) { NLP_LOCK(nlp); nl_free_io(nlp); NLP_LOCK_DESTROY(nlp); free(nlp, M_PCB); } static void destroy_nlpcb_epoch(epoch_context_t ctx) { struct nlpcb *nlp; nlp = __containerof(ctx, struct nlpcb, nl_epoch_ctx); destroy_nlpcb(nlp); } static void nl_pru_detach(struct socket *so) { struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); MPASS(sotonlpcb(so) != NULL); struct nlpcb *nlp; NL_LOG(LOG_DEBUG2, "detaching socket %p, PID %d", so, curproc->p_pid); nlp = sotonlpcb(so); /* Mark as inactive so no new work can be enqueued */ NLP_LOCK(nlp); bool was_bound = nlp->nl_bound; nlp->nl_active = false; NLP_UNLOCK(nlp); /* Wait till all scheduled work has been completed */ taskqueue_drain_all(nlp->nl_taskqueue); taskqueue_free(nlp->nl_taskqueue); NLCTL_WLOCK(ctl); NLP_LOCK(nlp); if (was_bound) { CK_LIST_REMOVE(nlp, nl_port_next); NL_LOG(LOG_DEBUG3, "socket %p, unlinking bound pid %u", so, nlp->nl_port); } CK_LIST_REMOVE(nlp, nl_next); nlp->nl_socket = NULL; NLP_UNLOCK(nlp); NLCTL_WUNLOCK(ctl); so->so_pcb = NULL; NL_LOG(LOG_DEBUG3, "socket %p, detached", so); /* XXX: is delayed free needed? */ NET_EPOCH_CALL(destroy_nlpcb_epoch, &nlp->nl_epoch_ctx); } static int nl_pru_disconnect(struct socket *so) { NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); MPASS(sotonlpcb(so) != NULL); return (ENOTCONN); } static int nl_pru_peeraddr(struct socket *so, struct sockaddr **sa) { NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); MPASS(sotonlpcb(so) != NULL); return (ENOTCONN); } static int nl_pru_shutdown(struct socket *so) { NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); MPASS(sotonlpcb(so) != NULL); socantsendmore(so); return (0); } static int nl_pru_sockaddr(struct socket *so, struct sockaddr **sa) { struct sockaddr_nl *snl; snl = malloc(sizeof(struct sockaddr_nl), M_SONAME, M_WAITOK | M_ZERO); /* TODO: set other fields */ snl->nl_len = sizeof(struct sockaddr_nl); snl->nl_family = AF_NETLINK; snl->nl_pid = sotonlpcb(so)->nl_port; *sa = (struct sockaddr *)snl; return (0); } static void nl_pru_close(struct socket *so) { NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); MPASS(sotonlpcb(so) != NULL); soisdisconnected(so); } static int nl_pru_output(struct mbuf *m, struct socket *so, ...) { if (__predict_false(m == NULL || ((m->m_len < sizeof(struct nlmsghdr)) && (m = m_pullup(m, sizeof(struct nlmsghdr))) == NULL))) return (ENOBUFS); MPASS((m->m_flags & M_PKTHDR) != 0); NL_LOG(LOG_DEBUG3, "sending message to kernel async processing"); nl_receive_async(m, so); return (0); } static int nl_pru_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *sa, struct mbuf *control, struct thread *td) { NL_LOG(LOG_DEBUG2, "sending message to kernel"); if (__predict_false(control != NULL)) { if (control->m_len) { m_freem(control); return (EINVAL); } m_freem(control); } return (nl_pru_output(m, so)); } static int nl_pru_rcvd(struct socket *so, int flags) { NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); MPASS(sotonlpcb(so) != NULL); nl_on_transmit(sotonlpcb(so)); return (0); } static int nl_getoptflag(int sopt_name) { switch (sopt_name) { case NETLINK_CAP_ACK: return (NLF_CAP_ACK); case NETLINK_EXT_ACK: return (NLF_EXT_ACK); case NETLINK_GET_STRICT_CHK: return (NLF_STRICT); case NETLINK_MSG_INFO: return (NLF_MSG_INFO); } return (0); } static int nl_ctloutput(struct socket *so, struct sockopt *sopt) { struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); struct nlpcb *nlp = sotonlpcb(so); uint32_t flag; int optval, error = 0; NLCTL_TRACKER; NL_LOG(LOG_DEBUG2, "%ssockopt(%p, %d)", (sopt->sopt_dir) ? "set" : "get", so, sopt->sopt_name); switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { case NETLINK_ADD_MEMBERSHIP: case NETLINK_DROP_MEMBERSHIP: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) break; if (optval <= 0 || optval >= NLP_MAX_GROUPS) { error = ERANGE; break; } NL_LOG(LOG_DEBUG2, "ADD/DEL group %d", (uint32_t)optval); NLCTL_WLOCK(ctl); if (sopt->sopt_name == NETLINK_ADD_MEMBERSHIP) nl_add_group_locked(nlp, optval); else nl_del_group_locked(nlp, optval); NLCTL_WUNLOCK(ctl); break; case NETLINK_CAP_ACK: case NETLINK_EXT_ACK: case NETLINK_GET_STRICT_CHK: case NETLINK_MSG_INFO: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) break; flag = nl_getoptflag(sopt->sopt_name); if ((flag == NLF_MSG_INFO) && nlp->nl_linux) { error = EINVAL; break; } NLCTL_WLOCK(ctl); if (optval != 0) nlp->nl_flags |= flag; else nlp->nl_flags &= ~flag; NLCTL_WUNLOCK(ctl); break; default: error = ENOPROTOOPT; } break; case SOPT_GET: switch (sopt->sopt_name) { case NETLINK_LIST_MEMBERSHIPS: NLCTL_RLOCK(ctl); optval = nl_get_groups_compat(nlp); NLCTL_RUNLOCK(ctl); error = sooptcopyout(sopt, &optval, sizeof(optval)); break; case NETLINK_CAP_ACK: case NETLINK_EXT_ACK: case NETLINK_GET_STRICT_CHK: case NETLINK_MSG_INFO: NLCTL_RLOCK(ctl); optval = (nlp->nl_flags & nl_getoptflag(sopt->sopt_name)) != 0; NLCTL_RUNLOCK(ctl); error = sooptcopyout(sopt, &optval, sizeof(optval)); break; default: error = ENOPROTOOPT; } break; default: error = ENOPROTOOPT; } return (error); } static int sysctl_handle_nl_maxsockbuf(SYSCTL_HANDLER_ARGS) { int error = 0; u_long tmp_maxsockbuf = nl_maxsockbuf; error = sysctl_handle_long(oidp, &tmp_maxsockbuf, arg2, req); if (error || !req->newptr) return (error); if (tmp_maxsockbuf < MSIZE + MCLBYTES) return (EINVAL); nl_maxsockbuf = tmp_maxsockbuf; return (0); } static int nl_setsbopt(struct socket *so, struct sockopt *sopt) { int error, optval; bool result; if (sopt->sopt_name != SO_RCVBUF) return (sbsetopt(so, sopt)); /* Allow to override max buffer size in certain conditions */ error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error != 0) return (error); NL_LOG(LOG_DEBUG2, "socket %p, PID %d, SO_RCVBUF=%d", so, curproc->p_pid, optval); if (optval > sb_max_adj) { if (priv_check(curthread, PRIV_NET_ROUTE) != 0) return (EPERM); } SOCK_RECVBUF_LOCK(so); result = sbreserve_locked_limit(so, SO_RCV, optval, nl_maxsockbuf, curthread); SOCK_RECVBUF_UNLOCK(so); return (result ? 0 : ENOBUFS); } #define NETLINK_PROTOSW \ .pr_flags = PR_ATOMIC | PR_ADDR | PR_WANTRCVD, \ .pr_ctloutput = nl_ctloutput, \ .pr_setsbopt = nl_setsbopt, \ .pr_abort = nl_pru_abort, \ .pr_attach = nl_pru_attach, \ .pr_bind = nl_pru_bind, \ .pr_connect = nl_pru_connect, \ .pr_detach = nl_pru_detach, \ .pr_disconnect = nl_pru_disconnect, \ .pr_peeraddr = nl_pru_peeraddr, \ .pr_send = nl_pru_send, \ .pr_rcvd = nl_pru_rcvd, \ .pr_shutdown = nl_pru_shutdown, \ .pr_sockaddr = nl_pru_sockaddr, \ .pr_close = nl_pru_close static struct protosw netlink_raw_sw = { .pr_type = SOCK_RAW, NETLINK_PROTOSW }; static struct protosw netlink_dgram_sw = { .pr_type = SOCK_DGRAM, NETLINK_PROTOSW }; static struct domain netlinkdomain = { .dom_family = PF_NETLINK, .dom_name = "netlink", .dom_flags = DOMF_UNLOADABLE, .dom_nprotosw = 2, .dom_protosw = { &netlink_raw_sw, &netlink_dgram_sw }, }; DOMAIN_SET(netlink); diff --git a/sys/netlink/netlink_generic.c b/sys/netlink/netlink_generic.c index b28b206a7ace..b7600b2b6b71 100644 --- a/sys/netlink/netlink_generic.c +++ b/sys/netlink/netlink_generic.c @@ -1,304 +1,302 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#include "opt_netlink.h" - #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define DEBUG_MOD_NAME nl_generic #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include _DECLARE_DEBUG(LOG_INFO); static int dump_family(struct nlmsghdr *hdr, struct genlmsghdr *ghdr, const struct genl_family *gf, struct nl_writer *nw); /* * Handler called by netlink subsystem when matching netlink message is received */ static int genl_handle_message(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct nlpcb *nlp = npt->nlp; struct genl_family *gf = NULL; int error = 0; int family_id = (int)hdr->nlmsg_type - GENL_MIN_ID; if (__predict_false(family_id < 0 || (gf = genl_get_family(family_id)) == NULL)) { NLP_LOG(LOG_DEBUG, nlp, "invalid message type: %d", hdr->nlmsg_type); return (ENOTSUP); } if (__predict_false(hdr->nlmsg_len < sizeof(hdr) + GENL_HDRLEN)) { NLP_LOG(LOG_DEBUG, nlp, "invalid message size: %d", hdr->nlmsg_len); return (EINVAL); } struct genlmsghdr *ghdr = (struct genlmsghdr *)(hdr + 1); if (ghdr->cmd >= gf->family_cmd_size || gf->family_cmds[ghdr->cmd].cmd_cb == NULL) { NLP_LOG(LOG_DEBUG, nlp, "family %s: invalid cmd %d", gf->family_name, ghdr->cmd); return (ENOTSUP); } struct genl_cmd *cmd = &gf->family_cmds[ghdr->cmd]; if (cmd->cmd_priv != 0 && !nlp_has_priv(nlp, cmd->cmd_priv)) { NLP_LOG(LOG_DEBUG, nlp, "family %s: cmd %d priv_check() failed", gf->family_name, ghdr->cmd); return (EPERM); } NLP_LOG(LOG_DEBUG2, nlp, "received family %s cmd %s(%d) len %d", gf->family_name, cmd->cmd_name, ghdr->cmd, hdr->nlmsg_len); error = cmd->cmd_cb(hdr, npt); return (error); } static uint32_t get_cmd_flags(const struct genl_cmd *cmd) { uint32_t flags = cmd->cmd_flags; if (cmd->cmd_priv != 0) flags |= GENL_ADMIN_PERM; return (flags); } static int dump_family(struct nlmsghdr *hdr, struct genlmsghdr *ghdr, const struct genl_family *gf, struct nl_writer *nw) { if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr))) goto enomem; struct genlmsghdr *ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr); ghdr_new->cmd = ghdr->cmd; ghdr_new->version = gf->family_version; ghdr_new->reserved = 0; nlattr_add_string(nw, CTRL_ATTR_FAMILY_NAME, gf->family_name); nlattr_add_u16(nw, CTRL_ATTR_FAMILY_ID, gf->family_id); nlattr_add_u32(nw, CTRL_ATTR_VERSION, gf->family_version); nlattr_add_u32(nw, CTRL_ATTR_HDRSIZE, gf->family_hdrsize); nlattr_add_u32(nw, CTRL_ATTR_MAXATTR, gf->family_attr_max); if (gf->family_cmd_size > 0) { int off = nlattr_add_nested(nw, CTRL_ATTR_OPS); if (off == 0) goto enomem; for (int i = 0, cnt=0; i < gf->family_cmd_size; i++) { struct genl_cmd *cmd = &gf->family_cmds[i]; if (cmd->cmd_cb == NULL) continue; int cmd_off = nlattr_add_nested(nw, ++cnt); if (cmd_off == 0) goto enomem; nlattr_add_u32(nw, CTRL_ATTR_OP_ID, cmd->cmd_num); nlattr_add_u32(nw, CTRL_ATTR_OP_FLAGS, get_cmd_flags(cmd)); nlattr_set_len(nw, cmd_off); } nlattr_set_len(nw, off); } if (gf->family_num_groups > 0) { int off = nlattr_add_nested(nw, CTRL_ATTR_MCAST_GROUPS); if (off == 0) goto enomem; for (int i = 0, cnt = 0; i < MAX_GROUPS; i++) { struct genl_group *gg = genl_get_group(i); if (gg == NULL || gg->group_family != gf) continue; int cmd_off = nlattr_add_nested(nw, ++cnt); if (cmd_off == 0) goto enomem; nlattr_add_u32(nw, CTRL_ATTR_MCAST_GRP_ID, i + MIN_GROUP_NUM); nlattr_add_string(nw, CTRL_ATTR_MCAST_GRP_NAME, gg->group_name); nlattr_set_len(nw, cmd_off); } nlattr_set_len(nw, off); } if (nlmsg_end(nw)) return (0); enomem: NL_LOG(LOG_DEBUG, "unable to dump family %s state (ENOMEM)", gf->family_name); nlmsg_abort(nw); return (ENOMEM); } /* Declare ourself as a user */ static void nlctrl_notify(void *arg, const struct genl_family *gf, int action); static eventhandler_tag family_event_tag; static uint32_t ctrl_family_id; static uint32_t ctrl_group_id; struct nl_parsed_family { uint32_t family_id; char *family_name; uint8_t version; }; #define _IN(_field) offsetof(struct genlmsghdr, _field) #define _OUT(_field) offsetof(struct nl_parsed_family, _field) static const struct nlfield_parser nlf_p_generic[] = { { .off_in = _IN(version), .off_out = _OUT(version), .cb = nlf_get_u8 }, }; static struct nlattr_parser nla_p_generic[] = { { .type = CTRL_ATTR_FAMILY_ID , .off = _OUT(family_id), .cb = nlattr_get_uint16 }, { .type = CTRL_ATTR_FAMILY_NAME , .off = _OUT(family_name), .cb = nlattr_get_string }, }; #undef _IN #undef _OUT NL_DECLARE_PARSER(genl_parser, struct genlmsghdr, nlf_p_generic, nla_p_generic); static bool match_family(const struct genl_family *gf, const struct nl_parsed_family *attrs) { if (gf->family_name == NULL) return (false); if (attrs->family_id != 0 && attrs->family_id != gf->family_id) return (false); if (attrs->family_name != NULL && strcmp(attrs->family_name, gf->family_name)) return (false); return (true); } static int nlctrl_handle_getfamily(struct nlmsghdr *hdr, struct nl_pstate *npt) { int error = 0; struct nl_parsed_family attrs = {}; error = nl_parse_nlmsg(hdr, &genl_parser, npt, &attrs); if (error != 0) return (error); struct genlmsghdr ghdr = { .cmd = CTRL_CMD_NEWFAMILY, }; if (attrs.family_id != 0 || attrs.family_name != NULL) { /* Resolve request */ for (int i = 0; i < MAX_FAMILIES; i++) { struct genl_family *gf = genl_get_family(i); if (gf != NULL && match_family(gf, &attrs)) { error = dump_family(hdr, &ghdr, gf, npt->nw); return (error); } } return (ENOENT); } hdr->nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI; for (int i = 0; i < MAX_FAMILIES; i++) { struct genl_family *gf = genl_get_family(i); if (gf != NULL && match_family(gf, &attrs)) { error = dump_family(hdr, &ghdr, gf, npt->nw); if (error != 0) break; } } if (!nlmsg_end_dump(npt->nw, error, hdr)) { NL_LOG(LOG_DEBUG, "Unable to finalize the dump"); return (ENOMEM); } return (error); } static void nlctrl_notify(void *arg __unused, const struct genl_family *gf, int cmd) { struct nlmsghdr hdr = {.nlmsg_type = NETLINK_GENERIC }; struct genlmsghdr ghdr = { .cmd = cmd }; struct nl_writer nw = {}; if (nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_GENERIC, ctrl_group_id)) { dump_family(&hdr, &ghdr, gf, &nw); nlmsg_flush(&nw); return; } NL_LOG(LOG_DEBUG, "error allocating group writer"); } static const struct genl_cmd nlctrl_cmds[] = { { .cmd_num = CTRL_CMD_GETFAMILY, .cmd_name = "GETFAMILY", .cmd_cb = nlctrl_handle_getfamily, .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP | GENL_CMD_CAP_HASPOL, }, }; static const struct nlhdr_parser *all_parsers[] = { &genl_parser }; static void genl_load_all(void *u __unused) { NL_VERIFY_PARSERS(all_parsers); ctrl_family_id = genl_register_family(CTRL_FAMILY_NAME, 0, 2, CTRL_ATTR_MAX); genl_register_cmds(CTRL_FAMILY_NAME, nlctrl_cmds, NL_ARRAY_LEN(nlctrl_cmds)); ctrl_group_id = genl_register_group(CTRL_FAMILY_NAME, "notify"); family_event_tag = EVENTHANDLER_REGISTER(genl_family_event, nlctrl_notify, NULL, EVENTHANDLER_PRI_ANY); netlink_register_proto(NETLINK_GENERIC, "NETLINK_GENERIC", genl_handle_message); } SYSINIT(genl_load_all, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, genl_load_all, NULL); static void genl_unload(void *u __unused) { netlink_unregister_proto(NETLINK_GENERIC); EVENTHANDLER_DEREGISTER(genl_family_event, family_event_tag); genl_unregister_family(CTRL_FAMILY_NAME); NET_EPOCH_WAIT(); } SYSUNINIT(genl_unload, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, genl_unload, NULL); diff --git a/sys/netlink/netlink_glue.c b/sys/netlink/netlink_glue.c index 29eac82aaffd..e7649c6b13dc 100644 --- a/sys/netlink/netlink_glue.c +++ b/sys/netlink/netlink_glue.c @@ -1,309 +1,307 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2023 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#include "opt_netlink.h" - #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* priv_check */ #include #include #include #include #include #include /* Standard bits: built-in the kernel */ SYSCTL_NODE(_net, OID_AUTO, netlink, CTLFLAG_RD, 0, ""); SYSCTL_NODE(_net_netlink, OID_AUTO, debug, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); MALLOC_DEFINE(M_NETLINK, "netlink", "Memory used for netlink packets"); /* Netlink-related callbacks needed to glue rtsock, netlink and linuxolator */ static void ignore_route_event(uint32_t fibnum, const struct rib_cmd_info *rc) { } static void ignore_ifmsg_event(struct ifnet *ifp, int if_flags_mask) { } static struct rtbridge ignore_cb = { .route_f = ignore_route_event, .ifmsg_f = ignore_ifmsg_event, }; void *linux_netlink_p = NULL; /* Callback pointer for Linux translator functions */ struct rtbridge *rtsock_callback_p = &ignore_cb; struct rtbridge *netlink_callback_p = &ignore_cb; /* * nlp accessors. * TODO: move to a separate file once the number grows. */ bool nlp_has_priv(struct nlpcb *nlp, int priv) { return (priv_check_cred(nlp->nl_cred, priv) == 0); } struct ucred * nlp_get_cred(struct nlpcb *nlp) { return (nlp->nl_cred); } uint32_t nlp_get_pid(const struct nlpcb *nlp) { return (nlp->nl_process_id); } bool nlp_unconstrained_vnet(const struct nlpcb *nlp) { return (nlp->nl_unconstrained_vnet); } #ifndef NETLINK /* Stub implementations for the loadable functions */ static bool get_stub_writer(struct nl_writer *nw) { bzero(nw, sizeof(*nw)); nw->writer_type = NS_WRITER_TYPE_STUB; nw->enomem = true; return (false); } static bool nlmsg_get_unicast_writer_stub(struct nl_writer *nw, int size, struct nlpcb *nlp) { return (get_stub_writer(nw)); } static bool nlmsg_get_group_writer_stub(struct nl_writer *nw, int size, int protocol, int group_id) { return (get_stub_writer(nw)); } static bool nlmsg_get_chain_writer_stub(struct nl_writer *nw, int size, struct mbuf **pm) { return (get_stub_writer(nw)); } static bool nlmsg_flush_stub(struct nl_writer *nw __unused) { return (false); } static void nlmsg_ignore_limit_stub(struct nl_writer *nw __unused) { } static bool nlmsg_refill_buffer_stub(struct nl_writer *nw __unused, int required_len __unused) { return (false); } static bool nlmsg_add_stub(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type, uint16_t flags, uint32_t len) { return (false); } static bool nlmsg_end_stub(struct nl_writer *nw __unused) { return (false); } static void nlmsg_abort_stub(struct nl_writer *nw __unused) { } static bool nlmsg_end_dump_stub(struct nl_writer *nw, int error, struct nlmsghdr *hdr) { return (false); } static int nl_modify_ifp_generic_stub(struct ifnet *ifp __unused, struct nl_parsed_link *lattrs __unused, const struct nlattr_bmask *bm __unused, struct nl_pstate *npt __unused) { return (ENOTSUP); } static void nl_store_ifp_cookie_stub(struct nl_pstate *npt __unused, struct ifnet *ifp __unused) { } static struct nlpcb * nl_get_thread_nlp_stub(struct thread *td __unused) { return (NULL); } const static struct nl_function_wrapper nl_stub = { .nlmsg_add = nlmsg_add_stub, .nlmsg_refill_buffer = nlmsg_refill_buffer_stub, .nlmsg_flush = nlmsg_flush_stub, .nlmsg_end = nlmsg_end_stub, .nlmsg_abort = nlmsg_abort_stub, .nlmsg_ignore_limit = nlmsg_ignore_limit_stub, .nlmsg_get_unicast_writer = nlmsg_get_unicast_writer_stub, .nlmsg_get_group_writer = nlmsg_get_group_writer_stub, .nlmsg_get_chain_writer = nlmsg_get_chain_writer_stub, .nlmsg_end_dump = nlmsg_end_dump_stub, .nl_modify_ifp_generic = nl_modify_ifp_generic_stub, .nl_store_ifp_cookie = nl_store_ifp_cookie_stub, .nl_get_thread_nlp = nl_get_thread_nlp_stub, }; /* * If the kernel is compiled with netlink as a module, * provide a way to introduce non-stub functioms */ static const struct nl_function_wrapper *_nl = &nl_stub; void nl_set_functions(const struct nl_function_wrapper *nl) { _nl = (nl != NULL) ? nl : &nl_stub; } /* Function wrappers */ bool nlmsg_get_unicast_writer(struct nl_writer *nw, int size, struct nlpcb *nlp) { return (_nl->nlmsg_get_unicast_writer(nw, size, nlp)); } bool nlmsg_get_group_writer(struct nl_writer *nw, int size, int protocol, int group_id) { return (_nl->nlmsg_get_group_writer(nw, size, protocol, group_id)); } bool nlmsg_get_chain_writer(struct nl_writer *nw, int size, struct mbuf **pm) { return (_nl->nlmsg_get_chain_writer(nw, size, pm)); } bool nlmsg_flush(struct nl_writer *nw) { return (_nl->nlmsg_flush(nw)); } void nlmsg_ignore_limit(struct nl_writer *nw) { _nl->nlmsg_ignore_limit(nw); } bool nlmsg_refill_buffer(struct nl_writer *nw, int required_len) { return (_nl->nlmsg_refill_buffer(nw, required_len)); } bool nlmsg_add(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type, uint16_t flags, uint32_t len) { return (_nl->nlmsg_add(nw, portid, seq, type, flags, len)); } bool nlmsg_end(struct nl_writer *nw) { return (_nl->nlmsg_end(nw)); } void nlmsg_abort(struct nl_writer *nw) { _nl->nlmsg_abort(nw); } bool nlmsg_end_dump(struct nl_writer *nw, int error, struct nlmsghdr *hdr) { return (_nl->nlmsg_end_dump(nw, error, hdr)); } int nl_modify_ifp_generic(struct ifnet *ifp, struct nl_parsed_link *lattrs, const struct nlattr_bmask *bm , struct nl_pstate *npt) { return (_nl->nl_modify_ifp_generic(ifp, lattrs, bm, npt)); } void nl_store_ifp_cookie(struct nl_pstate *npt, struct ifnet *ifp) { return (_nl->nl_store_ifp_cookie(npt, ifp)); } struct nlpcb * nl_get_thread_nlp(struct thread *td) { return (_nl->nl_get_thread_nlp(td)); } #endif /* !NETLINK */ diff --git a/sys/netlink/netlink_io.c b/sys/netlink/netlink_io.c index 0745f7fdcb77..0ca71fb6219b 100644 --- a/sys/netlink/netlink_io.c +++ b/sys/netlink/netlink_io.c @@ -1,601 +1,599 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2021 Ng Peng Nam Sean * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#include "opt_netlink.h" - #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define DEBUG_MOD_NAME nl_io #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include _DECLARE_DEBUG(LOG_INFO); /* * The logic below provide a p2p interface for receiving and * sending netlink data between the kernel and userland. */ static const struct sockaddr_nl _nl_empty_src = { .nl_len = sizeof(struct sockaddr_nl), .nl_family = PF_NETLINK, .nl_pid = 0 /* comes from the kernel */ }; static const struct sockaddr *nl_empty_src = (const struct sockaddr *)&_nl_empty_src; static struct mbuf *nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp); static void queue_push(struct nl_io_queue *q, struct mbuf *mq) { while (mq != NULL) { struct mbuf *m = mq; mq = mq->m_nextpkt; m->m_nextpkt = NULL; q->length += m_length(m, NULL); STAILQ_INSERT_TAIL(&q->head, m, m_stailqpkt); } } static void queue_push_head(struct nl_io_queue *q, struct mbuf *m) { MPASS(m->m_nextpkt == NULL); q->length += m_length(m, NULL); STAILQ_INSERT_HEAD(&q->head, m, m_stailqpkt); } static struct mbuf * queue_pop(struct nl_io_queue *q) { if (!STAILQ_EMPTY(&q->head)) { struct mbuf *m = STAILQ_FIRST(&q->head); STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt); m->m_nextpkt = NULL; q->length -= m_length(m, NULL); return (m); } return (NULL); } static struct mbuf * queue_head(const struct nl_io_queue *q) { return (STAILQ_FIRST(&q->head)); } static inline bool queue_empty(const struct nl_io_queue *q) { return (q->length == 0); } static void queue_free(struct nl_io_queue *q) { while (!STAILQ_EMPTY(&q->head)) { struct mbuf *m = STAILQ_FIRST(&q->head); STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt); m->m_nextpkt = NULL; m_freem(m); } q->length = 0; } void nl_add_msg_info(struct mbuf *m) { struct nlpcb *nlp = nl_get_thread_nlp(curthread); NL_LOG(LOG_DEBUG2, "Trying to recover nlp from thread %p: %p", curthread, nlp); if (nlp == NULL) return; /* Prepare what we want to encode - PID, socket PID & msg seq */ struct { struct nlattr nla; uint32_t val; } data[] = { { .nla.nla_len = sizeof(struct nlattr) + sizeof(uint32_t), .nla.nla_type = NLMSGINFO_ATTR_PROCESS_ID, .val = nlp->nl_process_id, }, { .nla.nla_len = sizeof(struct nlattr) + sizeof(uint32_t), .nla.nla_type = NLMSGINFO_ATTR_PORT_ID, .val = nlp->nl_port, }, }; while (m->m_next != NULL) m = m->m_next; m->m_next = sbcreatecontrol(data, sizeof(data), NETLINK_MSG_INFO, SOL_NETLINK, M_NOWAIT); NL_LOG(LOG_DEBUG2, "Storing %u bytes of data, ctl: %p", (unsigned)sizeof(data), m->m_next); } static __noinline struct mbuf * extract_msg_info(struct mbuf *m) { while (m->m_next != NULL) { if (m->m_next->m_type == MT_CONTROL) { struct mbuf *ctl = m->m_next; m->m_next = NULL; return (ctl); } m = m->m_next; } return (NULL); } static void nl_schedule_taskqueue(struct nlpcb *nlp) { if (!nlp->nl_task_pending) { nlp->nl_task_pending = true; taskqueue_enqueue(nlp->nl_taskqueue, &nlp->nl_task); NL_LOG(LOG_DEBUG3, "taskqueue scheduled"); } else { NL_LOG(LOG_DEBUG3, "taskqueue schedule skipped"); } } int nl_receive_async(struct mbuf *m, struct socket *so) { struct nlpcb *nlp = sotonlpcb(so); int error = 0; m->m_nextpkt = NULL; NLP_LOCK(nlp); if ((__predict_true(nlp->nl_active))) { sbappend(&so->so_snd, m, 0); NL_LOG(LOG_DEBUG3, "enqueue %u bytes", m_length(m, NULL)); nl_schedule_taskqueue(nlp); } else { NL_LOG(LOG_DEBUG, "ignoring %u bytes on non-active socket", m_length(m, NULL)); m_free(m); error = EINVAL; } NLP_UNLOCK(nlp); return (error); } static bool tx_check_locked(struct nlpcb *nlp) { if (queue_empty(&nlp->tx_queue)) return (true); /* * Check if something can be moved from the internal TX queue * to the socket queue. */ bool appended = false; struct sockbuf *sb = &nlp->nl_socket->so_rcv; SOCKBUF_LOCK(sb); while (true) { struct mbuf *m = queue_head(&nlp->tx_queue); if (m != NULL) { struct mbuf *ctl = NULL; if (__predict_false(m->m_next != NULL)) ctl = extract_msg_info(m); if (sbappendaddr_locked(sb, nl_empty_src, m, ctl) != 0) { /* appended successfully */ queue_pop(&nlp->tx_queue); appended = true; } else break; } else break; } SOCKBUF_UNLOCK(sb); if (appended) sorwakeup(nlp->nl_socket); return (queue_empty(&nlp->tx_queue)); } static bool nl_process_received_one(struct nlpcb *nlp) { bool reschedule = false; NLP_LOCK(nlp); nlp->nl_task_pending = false; if (!tx_check_locked(nlp)) { /* TX overflow queue still not empty, ignore RX */ NLP_UNLOCK(nlp); return (false); } if (queue_empty(&nlp->rx_queue)) { /* * Grab all data we have from the socket TX queue * and store it the internal queue, so it can be worked on * w/o holding socket lock. */ struct sockbuf *sb = &nlp->nl_socket->so_snd; SOCKBUF_LOCK(sb); unsigned int avail = sbavail(sb); if (avail > 0) { NL_LOG(LOG_DEBUG3, "grabbed %u bytes", avail); queue_push(&nlp->rx_queue, sbcut_locked(sb, avail)); } SOCKBUF_UNLOCK(sb); } else { /* Schedule another pass to read from the socket queue */ reschedule = true; } int prev_hiwat = nlp->tx_queue.hiwat; NLP_UNLOCK(nlp); while (!queue_empty(&nlp->rx_queue)) { struct mbuf *m = queue_pop(&nlp->rx_queue); m = nl_process_mbuf(m, nlp); if (m != NULL) { queue_push_head(&nlp->rx_queue, m); reschedule = false; break; } } if (nlp->tx_queue.hiwat > prev_hiwat) { NLP_LOG(LOG_DEBUG, nlp, "TX override peaked to %d", nlp->tx_queue.hiwat); } return (reschedule); } static void nl_process_received(struct nlpcb *nlp) { NL_LOG(LOG_DEBUG3, "taskqueue called"); if (__predict_false(nlp->nl_need_thread_setup)) { nl_set_thread_nlp(curthread, nlp); NLP_LOCK(nlp); nlp->nl_need_thread_setup = false; NLP_UNLOCK(nlp); } while (nl_process_received_one(nlp)) ; } void nl_init_io(struct nlpcb *nlp) { STAILQ_INIT(&nlp->rx_queue.head); STAILQ_INIT(&nlp->tx_queue.head); } void nl_free_io(struct nlpcb *nlp) { queue_free(&nlp->rx_queue); queue_free(&nlp->tx_queue); } /* * Called after some data have been read from the socket. */ void nl_on_transmit(struct nlpcb *nlp) { NLP_LOCK(nlp); struct socket *so = nlp->nl_socket; if (__predict_false(nlp->nl_dropped_bytes > 0 && so != NULL)) { unsigned long dropped_bytes = nlp->nl_dropped_bytes; unsigned long dropped_messages = nlp->nl_dropped_messages; nlp->nl_dropped_bytes = 0; nlp->nl_dropped_messages = 0; struct sockbuf *sb = &so->so_rcv; NLP_LOG(LOG_DEBUG, nlp, "socket RX overflowed, %lu messages (%lu bytes) dropped. " "bytes: [%u/%u] mbufs: [%u/%u]", dropped_messages, dropped_bytes, sb->sb_ccc, sb->sb_hiwat, sb->sb_mbcnt, sb->sb_mbmax); /* TODO: send netlink message */ } nl_schedule_taskqueue(nlp); NLP_UNLOCK(nlp); } void nl_taskqueue_handler(void *_arg, int pending) { struct nlpcb *nlp = (struct nlpcb *)_arg; CURVNET_SET(nlp->nl_socket->so_vnet); nl_process_received(nlp); CURVNET_RESTORE(); } static __noinline void queue_push_tx(struct nlpcb *nlp, struct mbuf *m) { queue_push(&nlp->tx_queue, m); nlp->nl_tx_blocked = true; if (nlp->tx_queue.length > nlp->tx_queue.hiwat) nlp->tx_queue.hiwat = nlp->tx_queue.length; } /* * Tries to send @m to the socket @nlp. * * @m: mbuf(s) to send to. Consumed in any case. * @nlp: socket to send to * @cnt: number of messages in @m * @io_flags: combination of NL_IOF_* flags * * Returns true on success. * If no queue overrunes happened, wakes up socket owner. */ bool nl_send_one(struct mbuf *m, struct nlpcb *nlp, int num_messages, int io_flags) { bool untranslated = io_flags & NL_IOF_UNTRANSLATED; bool ignore_limits = io_flags & NL_IOF_IGNORE_LIMIT; bool result = true; IF_DEBUG_LEVEL(LOG_DEBUG2) { struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *); NLP_LOG(LOG_DEBUG2, nlp, "TX mbuf len %u msgs %u msg type %d first hdrlen %u io_flags %X", m_length(m, NULL), num_messages, hdr->nlmsg_type, hdr->nlmsg_len, io_flags); } if (__predict_false(nlp->nl_linux && linux_netlink_p != NULL && untranslated)) { m = linux_netlink_p->mbufs_to_linux(nlp->nl_proto, m, nlp); if (m == NULL) return (false); } NLP_LOCK(nlp); if (__predict_false(nlp->nl_socket == NULL)) { NLP_UNLOCK(nlp); m_freem(m); return (false); } if (!queue_empty(&nlp->tx_queue)) { if (ignore_limits) { queue_push_tx(nlp, m); } else { m_free(m); result = false; } NLP_UNLOCK(nlp); return (result); } struct socket *so = nlp->nl_socket; struct mbuf *ctl = NULL; if (__predict_false(m->m_next != NULL)) ctl = extract_msg_info(m); if (sbappendaddr(&so->so_rcv, nl_empty_src, m, ctl) != 0) { sorwakeup(so); NLP_LOG(LOG_DEBUG3, nlp, "appended data & woken up"); } else { if (ignore_limits) { queue_push_tx(nlp, m); } else { /* * Store dropped data so it can be reported * on the next read */ nlp->nl_dropped_bytes += m_length(m, NULL); nlp->nl_dropped_messages += num_messages; NLP_LOG(LOG_DEBUG2, nlp, "RX oveflow: %lu m (+%d), %lu b (+%d)", (unsigned long)nlp->nl_dropped_messages, num_messages, (unsigned long)nlp->nl_dropped_bytes, m_length(m, NULL)); soroverflow(so); m_freem(m); result = false; } } NLP_UNLOCK(nlp); return (result); } static int nl_receive_message(struct nlmsghdr *hdr, int remaining_length, struct nlpcb *nlp, struct nl_pstate *npt) { nl_handler_f handler = nl_handlers[nlp->nl_proto].cb; int error = 0; NLP_LOG(LOG_DEBUG2, nlp, "msg len: %u type: %d: flags: 0x%X seq: %u pid: %u", hdr->nlmsg_len, hdr->nlmsg_type, hdr->nlmsg_flags, hdr->nlmsg_seq, hdr->nlmsg_pid); if (__predict_false(hdr->nlmsg_len > remaining_length)) { NLP_LOG(LOG_DEBUG, nlp, "message is not entirely present: want %d got %d", hdr->nlmsg_len, remaining_length); return (EINVAL); } else if (__predict_false(hdr->nlmsg_len < sizeof(*hdr))) { NL_LOG(LOG_DEBUG, "message too short: %d", hdr->nlmsg_len); return (EINVAL); } /* Stamp each message with sender pid */ hdr->nlmsg_pid = nlp->nl_port; npt->hdr = hdr; if (hdr->nlmsg_flags & NLM_F_REQUEST && hdr->nlmsg_type >= NLMSG_MIN_TYPE) { NL_LOG(LOG_DEBUG2, "handling message with msg type: %d", hdr->nlmsg_type); if (nlp->nl_linux && linux_netlink_p != NULL) { struct nlmsghdr *hdr_orig = hdr; hdr = linux_netlink_p->msg_from_linux(nlp->nl_proto, hdr, npt); if (hdr == NULL) { /* Failed to translate to kernel format. Report an error back */ hdr = hdr_orig; npt->hdr = hdr; if (hdr->nlmsg_flags & NLM_F_ACK) nlmsg_ack(nlp, EOPNOTSUPP, hdr, npt); return (0); } } error = handler(hdr, npt); NL_LOG(LOG_DEBUG2, "retcode: %d", error); } if ((hdr->nlmsg_flags & NLM_F_ACK) || (error != 0 && error != EINTR)) { if (!npt->nw->suppress_ack) { NL_LOG(LOG_DEBUG3, "ack"); nlmsg_ack(nlp, error, hdr, npt); } } return (0); } static void npt_clear(struct nl_pstate *npt) { lb_clear(&npt->lb); npt->error = 0; npt->err_msg = NULL; npt->err_off = 0; npt->hdr = NULL; npt->nw->suppress_ack = false; } /* * Processes an incoming packet, which can contain multiple netlink messages */ static struct mbuf * nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp) { int offset, buffer_length; struct nlmsghdr *hdr; char *buffer; int error; NL_LOG(LOG_DEBUG3, "RX netlink mbuf %p on %p", m, nlp->nl_socket); struct nl_writer nw = {}; if (!nlmsg_get_unicast_writer(&nw, NLMSG_SMALL, nlp)) { m_freem(m); NL_LOG(LOG_DEBUG, "error allocating socket writer"); return (NULL); } nlmsg_ignore_limit(&nw); /* TODO: alloc this buf once for nlp */ int data_length = m_length(m, NULL); buffer_length = roundup2(data_length, 8) + SCRATCH_BUFFER_SIZE; if (nlp->nl_linux) buffer_length += roundup2(data_length, 8); buffer = malloc(buffer_length, M_NETLINK, M_NOWAIT | M_ZERO); if (buffer == NULL) { m_freem(m); nlmsg_flush(&nw); NL_LOG(LOG_DEBUG, "Unable to allocate %d bytes of memory", buffer_length); return (NULL); } m_copydata(m, 0, data_length, buffer); struct nl_pstate npt = { .nlp = nlp, .lb.base = &buffer[roundup2(data_length, 8)], .lb.size = buffer_length - roundup2(data_length, 8), .nw = &nw, .strict = nlp->nl_flags & NLF_STRICT, }; for (offset = 0; offset + sizeof(struct nlmsghdr) <= data_length;) { hdr = (struct nlmsghdr *)&buffer[offset]; /* Save length prior to calling handler */ int msglen = NLMSG_ALIGN(hdr->nlmsg_len); NL_LOG(LOG_DEBUG3, "parsing offset %d/%d", offset, data_length); npt_clear(&npt); error = nl_receive_message(hdr, data_length - offset, nlp, &npt); offset += msglen; if (__predict_false(error != 0 || nlp->nl_tx_blocked)) break; } NL_LOG(LOG_DEBUG3, "packet parsing done"); free(buffer, M_NETLINK); nlmsg_flush(&nw); if (nlp->nl_tx_blocked) { NLP_LOCK(nlp); nlp->nl_tx_blocked = false; NLP_UNLOCK(nlp); m_adj(m, offset); return (m); } else { m_freem(m); return (NULL); } } diff --git a/sys/netlink/netlink_message_writer.c b/sys/netlink/netlink_message_writer.c index 9fc0eddc78ec..8443cc59d9e0 100644 --- a/sys/netlink/netlink_message_writer.c +++ b/sys/netlink/netlink_message_writer.c @@ -1,789 +1,787 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#include "opt_netlink.h" - #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define DEBUG_MOD_NAME nl_writer #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include _DECLARE_DEBUG(LOG_INFO); /* * The goal of this file is to provide convenient message writing KPI on top of * different storage methods (mbufs, uio, temporary memory chunks). * * The main KPI guarantee is that the (last) message always resides in the contiguous * memory buffer, so one is able to update the header after writing the entire message. * * This guarantee comes with a side effect of potentially reallocating underlying * buffer, so one needs to update the desired pointers after something is added * to the header. * * Messaging layer contains hooks performing transparent Linux translation for the messages. * * There are 3 types of supported targets: * * socket (adds mbufs to the socket buffer, used for message replies) * * group (sends mbuf/chain to the specified groups, used for the notifications) * * chain (returns mbuf chain, used in Linux message translation code) * * There are 3 types of storage: * * NS_WRITER_TYPE_MBUF (mbuf-based, most efficient, used when a single message * fits in NLMBUFSIZE) * * NS_WRITER_TYPE_BUF (fallback, malloc-based, used when a single message needs * to be larger than one supported by NS_WRITER_TYPE_MBUF) * * NS_WRITER_TYPE_LBUF (malloc-based, similar to NS_WRITER_TYPE_BUF, used for * Linux sockets, calls translation hook prior to sending messages to the socket). * * Internally, KPI switches between different types of storage when memory requirements * change. It happens transparently to the caller. */ /* * Uma zone for the mbuf-based Netlink storage */ static uma_zone_t nlmsg_zone; static void nl_free_mbuf_storage(struct mbuf *m) { uma_zfree(nlmsg_zone, m->m_ext.ext_buf); } static int nl_setup_mbuf_storage(void *mem, int size, void *arg, int how __unused) { struct mbuf *m = (struct mbuf *)arg; if (m != NULL) m_extadd(m, mem, size, nl_free_mbuf_storage, NULL, NULL, 0, EXT_MOD_TYPE); return (0); } static struct mbuf * nl_get_mbuf_flags(int size, int malloc_flags, int mbuf_flags) { struct mbuf *m, *m_storage; if (size <= MHLEN) return (m_get2(size, malloc_flags, MT_DATA, mbuf_flags)); if (__predict_false(size > NLMBUFSIZE)) return (NULL); m = m_gethdr(malloc_flags, MT_DATA); if (m == NULL) return (NULL); m_storage = uma_zalloc_arg(nlmsg_zone, m, malloc_flags); if (m_storage == NULL) { m_free_raw(m); return (NULL); } return (m); } static struct mbuf * nl_get_mbuf(int size, int malloc_flags) { return (nl_get_mbuf_flags(size, malloc_flags, M_PKTHDR)); } /* * Gets a chain of Netlink mbufs. * This is strip-down version of m_getm2() */ static struct mbuf * nl_get_mbuf_chain(int len, int malloc_flags) { struct mbuf *m_chain = NULL, *m_tail = NULL; int mbuf_flags = M_PKTHDR; while (len > 0) { int sz = len > NLMBUFSIZE ? NLMBUFSIZE: len; struct mbuf *m = nl_get_mbuf_flags(sz, malloc_flags, mbuf_flags); if (m == NULL) { m_freem(m_chain); return (NULL); } /* Book keeping. */ len -= M_SIZE(m); if (m_tail != NULL) m_tail->m_next = m; else m_chain = m; m_tail = m; mbuf_flags &= ~M_PKTHDR; /* Only valid on the first mbuf. */ } return (m_chain); } void nl_init_msg_zone(void) { nlmsg_zone = uma_zcreate("netlink", NLMBUFSIZE, nl_setup_mbuf_storage, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } void nl_destroy_msg_zone(void) { uma_zdestroy(nlmsg_zone); } typedef bool nlwriter_op_init(struct nl_writer *nw, int size, bool waitok); typedef bool nlwriter_op_write(struct nl_writer *nw, void *buf, int buflen, int cnt); struct nlwriter_ops { nlwriter_op_init *init; nlwriter_op_write *write_socket; nlwriter_op_write *write_group; nlwriter_op_write *write_chain; }; /* * NS_WRITER_TYPE_BUF * Writes message to a temporary memory buffer, * flushing to the socket/group when buffer size limit is reached */ static bool nlmsg_get_ns_buf(struct nl_writer *nw, int size, bool waitok) { int mflag = waitok ? M_WAITOK : M_NOWAIT; nw->_storage = malloc(size, M_NETLINK, mflag | M_ZERO); if (__predict_false(nw->_storage == NULL)) return (false); nw->alloc_len = size; nw->offset = 0; nw->hdr = NULL; nw->data = nw->_storage; nw->writer_type = NS_WRITER_TYPE_BUF; nw->malloc_flag = mflag; nw->num_messages = 0; nw->enomem = false; return (true); } static bool nlmsg_write_socket_buf(struct nl_writer *nw, void *buf, int datalen, int cnt) { NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr); if (__predict_false(datalen == 0)) { free(buf, M_NETLINK); return (true); } struct mbuf *m = nl_get_mbuf_chain(datalen, nw->malloc_flag); if (__predict_false(m == NULL)) { /* XXX: should we set sorcverr? */ free(buf, M_NETLINK); return (false); } m_append(m, datalen, buf); free(buf, M_NETLINK); int io_flags = (nw->ignore_limit) ? NL_IOF_IGNORE_LIMIT : 0; return (nl_send_one(m, (struct nlpcb *)(nw->arg.ptr), cnt, io_flags)); } static bool nlmsg_write_group_buf(struct nl_writer *nw, void *buf, int datalen, int cnt) { NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d proto: %d id: %d", buf, datalen, nw->arg.group.proto, nw->arg.group.id); if (__predict_false(datalen == 0)) { free(buf, M_NETLINK); return (true); } struct mbuf *m = nl_get_mbuf_chain(datalen, nw->malloc_flag); if (__predict_false(m == NULL)) { free(buf, M_NETLINK); return (false); } bool success = m_append(m, datalen, buf) != 0; free(buf, M_NETLINK); if (!success) return (false); nl_send_group(m, cnt, nw->arg.group.proto, nw->arg.group.id); return (true); } static bool nlmsg_write_chain_buf(struct nl_writer *nw, void *buf, int datalen, int cnt) { struct mbuf **m0 = (struct mbuf **)(nw->arg.ptr); NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr); if (__predict_false(datalen == 0)) { free(buf, M_NETLINK); return (true); } if (*m0 == NULL) { struct mbuf *m = nl_get_mbuf_chain(datalen, nw->malloc_flag); if (__predict_false(m == NULL)) { free(buf, M_NETLINK); return (false); } *m0 = m; } if (__predict_false(m_append(*m0, datalen, buf) == 0)) { free(buf, M_NETLINK); return (false); } return (true); } /* * NS_WRITER_TYPE_MBUF * Writes message to the allocated mbuf, * flushing to socket/group when mbuf size limit is reached. * This is the most efficient mechanism as it avoids double-copying. * * Allocates a single mbuf suitable to store up to @size bytes of data. * If size < MHLEN (around 160 bytes), allocates mbuf with pkghdr. * If the size <= NLMBUFSIZE (2k), allocate mbuf+storage out of nlmsg_zone. * Returns NULL on greater size or the allocation failure. */ static bool nlmsg_get_ns_mbuf(struct nl_writer *nw, int size, bool waitok) { int mflag = waitok ? M_WAITOK : M_NOWAIT; struct mbuf *m = nl_get_mbuf(size, mflag); if (__predict_false(m == NULL)) return (false); nw->alloc_len = M_TRAILINGSPACE(m); nw->offset = 0; nw->hdr = NULL; nw->_storage = (void *)m; nw->data = mtod(m, void *); nw->writer_type = NS_WRITER_TYPE_MBUF; nw->malloc_flag = mflag; nw->num_messages = 0; nw->enomem = false; memset(nw->data, 0, size); NL_LOG(LOG_DEBUG2, "alloc mbuf %p req_len %d alloc_len %d data_ptr %p", m, size, nw->alloc_len, nw->data); return (true); } static bool nlmsg_write_socket_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt) { struct mbuf *m = (struct mbuf *)buf; NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr); if (__predict_false(datalen == 0)) { m_freem(m); return (true); } m->m_pkthdr.len = datalen; m->m_len = datalen; int io_flags = (nw->ignore_limit) ? NL_IOF_IGNORE_LIMIT : 0; return (nl_send_one(m, (struct nlpcb *)(nw->arg.ptr), cnt, io_flags)); } static bool nlmsg_write_group_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt) { struct mbuf *m = (struct mbuf *)buf; NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d proto: %d id: %d", buf, datalen, nw->arg.group.proto, nw->arg.group.id); if (__predict_false(datalen == 0)) { m_freem(m); return (true); } m->m_pkthdr.len = datalen; m->m_len = datalen; nl_send_group(m, cnt, nw->arg.group.proto, nw->arg.group.id); return (true); } static bool nlmsg_write_chain_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt) { struct mbuf *m_new = (struct mbuf *)buf; struct mbuf **m0 = (struct mbuf **)(nw->arg.ptr); NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr); if (__predict_false(datalen == 0)) { m_freem(m_new); return (true); } m_new->m_pkthdr.len = datalen; m_new->m_len = datalen; if (*m0 == NULL) { *m0 = m_new; } else { struct mbuf *m_last; for (m_last = *m0; m_last->m_next != NULL; m_last = m_last->m_next) ; m_last->m_next = m_new; (*m0)->m_pkthdr.len += datalen; } return (true); } /* * NS_WRITER_TYPE_LBUF * Writes message to the allocated memory buffer, * flushing to socket/group when mbuf size limit is reached. * Calls linux handler to rewrite messages before sending to the socket. */ static bool nlmsg_get_ns_lbuf(struct nl_writer *nw, int size, bool waitok) { int mflag = waitok ? M_WAITOK : M_NOWAIT; size = roundup2(size, sizeof(void *)); int add_size = sizeof(struct linear_buffer) + SCRATCH_BUFFER_SIZE; char *buf = malloc(add_size + size * 2, M_NETLINK, mflag | M_ZERO); if (__predict_false(buf == NULL)) return (false); /* Fill buffer header first */ struct linear_buffer *lb = (struct linear_buffer *)buf; lb->base = &buf[sizeof(struct linear_buffer) + size]; lb->size = size + SCRATCH_BUFFER_SIZE; nw->alloc_len = size; nw->offset = 0; nw->hdr = NULL; nw->_storage = buf; nw->data = (char *)(lb + 1); nw->malloc_flag = mflag; nw->writer_type = NS_WRITER_TYPE_LBUF; nw->num_messages = 0; nw->enomem = false; return (true); } static bool nlmsg_write_socket_lbuf(struct nl_writer *nw, void *buf, int datalen, int cnt) { struct linear_buffer *lb = (struct linear_buffer *)buf; char *data = (char *)(lb + 1); struct nlpcb *nlp = (struct nlpcb *)(nw->arg.ptr); if (__predict_false(datalen == 0)) { free(buf, M_NETLINK); return (true); } struct mbuf *m = NULL; if (linux_netlink_p != NULL) m = linux_netlink_p->msgs_to_linux(nlp->nl_proto, data, datalen, nlp); free(buf, M_NETLINK); if (__predict_false(m == NULL)) { /* XXX: should we set sorcverr? */ return (false); } int io_flags = (nw->ignore_limit) ? NL_IOF_IGNORE_LIMIT : 0; return (nl_send_one(m, nlp, cnt, io_flags)); } /* Shouldn't be called (maybe except Linux code originating message) */ static bool nlmsg_write_group_lbuf(struct nl_writer *nw, void *buf, int datalen, int cnt) { struct linear_buffer *lb = (struct linear_buffer *)buf; char *data = (char *)(lb + 1); if (__predict_false(datalen == 0)) { free(buf, M_NETLINK); return (true); } struct mbuf *m = nl_get_mbuf_chain(datalen, nw->malloc_flag); if (__predict_false(m == NULL)) { free(buf, M_NETLINK); return (false); } m_append(m, datalen, data); free(buf, M_NETLINK); nl_send_group(m, cnt, nw->arg.group.proto, nw->arg.group.id); return (true); } static const struct nlwriter_ops nlmsg_writers[] = { /* NS_WRITER_TYPE_MBUF */ { .init = nlmsg_get_ns_mbuf, .write_socket = nlmsg_write_socket_mbuf, .write_group = nlmsg_write_group_mbuf, .write_chain = nlmsg_write_chain_mbuf, }, /* NS_WRITER_TYPE_BUF */ { .init = nlmsg_get_ns_buf, .write_socket = nlmsg_write_socket_buf, .write_group = nlmsg_write_group_buf, .write_chain = nlmsg_write_chain_buf, }, /* NS_WRITER_TYPE_LBUF */ { .init = nlmsg_get_ns_lbuf, .write_socket = nlmsg_write_socket_lbuf, .write_group = nlmsg_write_group_lbuf, }, }; static void nlmsg_set_callback(struct nl_writer *nw) { const struct nlwriter_ops *pops = &nlmsg_writers[nw->writer_type]; switch (nw->writer_target) { case NS_WRITER_TARGET_SOCKET: nw->cb = pops->write_socket; break; case NS_WRITER_TARGET_GROUP: nw->cb = pops->write_group; break; case NS_WRITER_TARGET_CHAIN: nw->cb = pops->write_chain; break; default: panic("not implemented"); } } static bool nlmsg_get_buf_type(struct nl_writer *nw, int size, int type, bool waitok) { MPASS(type + 1 <= sizeof(nlmsg_writers) / sizeof(nlmsg_writers[0])); NL_LOG(LOG_DEBUG3, "Setting up nw %p size %d type %d", nw, size, type); return (nlmsg_writers[type].init(nw, size, waitok)); } static bool nlmsg_get_buf(struct nl_writer *nw, int size, bool waitok, bool is_linux) { int type; if (!is_linux) { if (__predict_true(size <= NLMBUFSIZE)) type = NS_WRITER_TYPE_MBUF; else type = NS_WRITER_TYPE_BUF; } else type = NS_WRITER_TYPE_LBUF; return (nlmsg_get_buf_type(nw, size, type, waitok)); } bool _nlmsg_get_unicast_writer(struct nl_writer *nw, int size, struct nlpcb *nlp) { if (!nlmsg_get_buf(nw, size, false, nlp->nl_linux)) return (false); nw->arg.ptr = (void *)nlp; nw->writer_target = NS_WRITER_TARGET_SOCKET; nlmsg_set_callback(nw); return (true); } bool _nlmsg_get_group_writer(struct nl_writer *nw, int size, int protocol, int group_id) { if (!nlmsg_get_buf(nw, size, false, false)) return (false); nw->arg.group.proto = protocol; nw->arg.group.id = group_id; nw->writer_target = NS_WRITER_TARGET_GROUP; nlmsg_set_callback(nw); return (true); } bool _nlmsg_get_chain_writer(struct nl_writer *nw, int size, struct mbuf **pm) { if (!nlmsg_get_buf(nw, size, false, false)) return (false); *pm = NULL; nw->arg.ptr = (void *)pm; nw->writer_target = NS_WRITER_TARGET_CHAIN; nlmsg_set_callback(nw); NL_LOG(LOG_DEBUG3, "setup cb %p (need %p)", nw->cb, &nlmsg_write_chain_mbuf); return (true); } void _nlmsg_ignore_limit(struct nl_writer *nw) { nw->ignore_limit = true; } bool _nlmsg_flush(struct nl_writer *nw) { if (__predict_false(nw->hdr != NULL)) { /* Last message has not been completed, skip it. */ int completed_len = (char *)nw->hdr - nw->data; /* Send completed messages */ nw->offset -= nw->offset - completed_len; nw->hdr = NULL; } NL_LOG(LOG_DEBUG2, "OUT"); bool result = nw->cb(nw, nw->_storage, nw->offset, nw->num_messages); nw->_storage = NULL; if (!result) { NL_LOG(LOG_DEBUG, "nw %p offset %d: flush with %p() failed", nw, nw->offset, nw->cb); } return (result); } /* * Flushes previous data and allocates new underlying storage * sufficient for holding at least @required_len bytes. * Return true on success. */ bool _nlmsg_refill_buffer(struct nl_writer *nw, int required_len) { struct nl_writer ns_new = {}; int completed_len, new_len; if (nw->enomem) return (false); NL_LOG(LOG_DEBUG3, "no space at offset %d/%d (want %d), trying to reclaim", nw->offset, nw->alloc_len, required_len); /* Calculated new buffer size and allocate it s*/ completed_len = (nw->hdr != NULL) ? (char *)nw->hdr - nw->data : nw->offset; if (completed_len > 0 && required_len < NLMBUFSIZE) { /* We already ran out of space, use the largest effective size */ new_len = max(nw->alloc_len, NLMBUFSIZE); } else { if (nw->alloc_len < NLMBUFSIZE) new_len = NLMBUFSIZE; else new_len = nw->alloc_len * 2; while (new_len < required_len) new_len *= 2; } bool waitok = (nw->malloc_flag == M_WAITOK); bool is_linux = (nw->writer_type == NS_WRITER_TYPE_LBUF); if (!nlmsg_get_buf(&ns_new, new_len, waitok, is_linux)) { nw->enomem = true; NL_LOG(LOG_DEBUG, "getting new buf failed, setting ENOMEM"); return (false); } if (nw->ignore_limit) nlmsg_ignore_limit(&ns_new); /* Update callback data */ ns_new.writer_target = nw->writer_target; nlmsg_set_callback(&ns_new); ns_new.arg = nw->arg; /* Copy last (unfinished) header to the new storage */ int last_len = nw->offset - completed_len; if (last_len > 0) { memcpy(ns_new.data, nw->hdr, last_len); ns_new.hdr = (struct nlmsghdr *)ns_new.data; ns_new.offset = last_len; } NL_LOG(LOG_DEBUG2, "completed: %d bytes, copied: %d bytes", completed_len, last_len); /* Flush completed headers & switch to the new nw */ nlmsg_flush(nw); memcpy(nw, &ns_new, sizeof(struct nl_writer)); NL_LOG(LOG_DEBUG2, "switched buffer: used %d/%d bytes", nw->offset, nw->alloc_len); return (true); } bool _nlmsg_add(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type, uint16_t flags, uint32_t len) { struct nlmsghdr *hdr; MPASS(nw->hdr == NULL); int required_len = NETLINK_ALIGN(len + sizeof(struct nlmsghdr)); if (__predict_false(nw->offset + required_len > nw->alloc_len)) { if (!nlmsg_refill_buffer(nw, required_len)) return (false); } hdr = (struct nlmsghdr *)(&nw->data[nw->offset]); hdr->nlmsg_len = len; hdr->nlmsg_type = type; hdr->nlmsg_flags = flags; hdr->nlmsg_seq = seq; hdr->nlmsg_pid = portid; nw->hdr = hdr; nw->offset += sizeof(struct nlmsghdr); return (true); } bool _nlmsg_end(struct nl_writer *nw) { MPASS(nw->hdr != NULL); if (nw->enomem) { NL_LOG(LOG_DEBUG, "ENOMEM when dumping message"); nlmsg_abort(nw); return (false); } nw->hdr->nlmsg_len = (uint32_t)(nw->data + nw->offset - (char *)nw->hdr); NL_LOG(LOG_DEBUG2, "wrote msg len: %u type: %d: flags: 0x%X seq: %u pid: %u", nw->hdr->nlmsg_len, nw->hdr->nlmsg_type, nw->hdr->nlmsg_flags, nw->hdr->nlmsg_seq, nw->hdr->nlmsg_pid); nw->hdr = NULL; nw->num_messages++; return (true); } void _nlmsg_abort(struct nl_writer *nw) { if (nw->hdr != NULL) { nw->offset = (uint32_t)((char *)nw->hdr - nw->data); nw->hdr = NULL; } } void nlmsg_ack(struct nlpcb *nlp, int error, struct nlmsghdr *hdr, struct nl_pstate *npt) { struct nlmsgerr *errmsg; int payload_len; uint32_t flags = nlp->nl_flags; struct nl_writer *nw = npt->nw; bool cap_ack; payload_len = sizeof(struct nlmsgerr); /* * The only case when we send the full message in the * reply is when there is an error and NETLINK_CAP_ACK * is not set. */ cap_ack = (error == 0) || (flags & NLF_CAP_ACK); if (!cap_ack) payload_len += hdr->nlmsg_len - sizeof(struct nlmsghdr); payload_len = NETLINK_ALIGN(payload_len); uint16_t nl_flags = cap_ack ? NLM_F_CAPPED : 0; if ((npt->err_msg || npt->err_off) && nlp->nl_flags & NLF_EXT_ACK) nl_flags |= NLM_F_ACK_TLVS; NL_LOG(LOG_DEBUG3, "acknowledging message type %d seq %d", hdr->nlmsg_type, hdr->nlmsg_seq); if (!nlmsg_add(nw, nlp->nl_port, hdr->nlmsg_seq, NLMSG_ERROR, nl_flags, payload_len)) goto enomem; errmsg = nlmsg_reserve_data(nw, payload_len, struct nlmsgerr); errmsg->error = error; /* In case of error copy the whole message, else just the header */ memcpy(&errmsg->msg, hdr, cap_ack ? sizeof(*hdr) : hdr->nlmsg_len); if (npt->err_msg != NULL && nlp->nl_flags & NLF_EXT_ACK) nlattr_add_string(nw, NLMSGERR_ATTR_MSG, npt->err_msg); if (npt->err_off != 0 && nlp->nl_flags & NLF_EXT_ACK) nlattr_add_u32(nw, NLMSGERR_ATTR_OFFS, npt->err_off); if (npt->cookie != NULL) nlattr_add_raw(nw, npt->cookie); if (nlmsg_end(nw)) return; enomem: NLP_LOG(LOG_DEBUG, nlp, "error allocating ack data for message %d seq %u", hdr->nlmsg_type, hdr->nlmsg_seq); nlmsg_abort(nw); } bool _nlmsg_end_dump(struct nl_writer *nw, int error, struct nlmsghdr *hdr) { if (!nlmsg_add(nw, hdr->nlmsg_pid, hdr->nlmsg_seq, NLMSG_DONE, 0, sizeof(int))) { NL_LOG(LOG_DEBUG, "Error finalizing table dump"); return (false); } /* Save operation result */ int *perror = nlmsg_reserve_object(nw, int); NL_LOG(LOG_DEBUG2, "record error=%d at off %d (%p)", error, nw->offset, perror); *perror = error; nlmsg_end(nw); nw->suppress_ack = true; return (true); } #include diff --git a/sys/netlink/netlink_module.c b/sys/netlink/netlink_module.c index ba56ac6d6a17..b4073721fdc4 100644 --- a/sys/netlink/netlink_module.c +++ b/sys/netlink/netlink_module.c @@ -1,256 +1,254 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2021 Ng Peng Nam Sean * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#include "opt_netlink.h" - #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(netlink, "Netlink support"); #define DEBUG_MOD_NAME nl_mod #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include _DECLARE_DEBUG(LOG_INFO); #define NL_MAX_HANDLERS 20 struct nl_proto_handler _nl_handlers[NL_MAX_HANDLERS]; struct nl_proto_handler *nl_handlers = _nl_handlers; CK_LIST_HEAD(nl_control_head, nl_control); static struct nl_control_head vnets_head = CK_LIST_HEAD_INITIALIZER(); VNET_DEFINE(struct nl_control *, nl_ctl) = NULL; struct mtx nl_global_mtx; MTX_SYSINIT(nl_global_mtx, &nl_global_mtx, "global netlink lock", MTX_DEF); #define NL_GLOBAL_LOCK() mtx_lock(&nl_global_mtx) #define NL_GLOBAL_UNLOCK() mtx_unlock(&nl_global_mtx) int netlink_unloading = 0; static void free_nl_ctl(struct nl_control *ctl) { rm_destroy(&ctl->ctl_lock); free(ctl, M_NETLINK); } struct nl_control * vnet_nl_ctl_init(void) { struct nl_control *ctl; ctl = malloc(sizeof(struct nl_control), M_NETLINK, M_WAITOK | M_ZERO); rm_init(&ctl->ctl_lock, "netlink lock"); CK_LIST_INIT(&ctl->ctl_port_head); CK_LIST_INIT(&ctl->ctl_pcb_head); NL_GLOBAL_LOCK(); struct nl_control *tmp = atomic_load_ptr(&V_nl_ctl); if (tmp == NULL) { atomic_store_ptr(&V_nl_ctl, ctl); CK_LIST_INSERT_HEAD(&vnets_head, ctl, ctl_next); NL_LOG(LOG_DEBUG2, "VNET %p init done, inserted %p into global list", curvnet, ctl); } else { NL_LOG(LOG_DEBUG, "per-VNET init clash, dropping this instance"); free_nl_ctl(ctl); ctl = tmp; } NL_GLOBAL_UNLOCK(); return (ctl); } static void vnet_nl_ctl_destroy(const void *unused __unused) { struct nl_control *ctl; /* Assume at the time all of the processes / sockets are dead */ NL_GLOBAL_LOCK(); ctl = atomic_load_ptr(&V_nl_ctl); atomic_store_ptr(&V_nl_ctl, NULL); if (ctl != NULL) { NL_LOG(LOG_DEBUG2, "Removing %p from global list", ctl); CK_LIST_REMOVE(ctl, ctl_next); } NL_GLOBAL_UNLOCK(); if (ctl != NULL) free_nl_ctl(ctl); } VNET_SYSUNINIT(vnet_nl_ctl_destroy, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_nl_ctl_destroy, NULL); int nl_verify_proto(int proto) { if (proto < 0 || proto >= NL_MAX_HANDLERS) { return (EINVAL); } int handler_defined = nl_handlers[proto].cb != NULL; return (handler_defined ? 0 : EPROTONOSUPPORT); } const char * nl_get_proto_name(int proto) { return (nl_handlers[proto].proto_name); } bool netlink_register_proto(int proto, const char *proto_name, nl_handler_f handler) { if ((proto < 0) || (proto >= NL_MAX_HANDLERS)) return (false); NL_GLOBAL_LOCK(); KASSERT((nl_handlers[proto].cb == NULL), ("netlink handler %d is already set", proto)); nl_handlers[proto].cb = handler; nl_handlers[proto].proto_name = proto_name; NL_GLOBAL_UNLOCK(); NL_LOG(LOG_DEBUG2, "Registered netlink %s(%d) handler", proto_name, proto); return (true); } bool netlink_unregister_proto(int proto) { if ((proto < 0) || (proto >= NL_MAX_HANDLERS)) return (false); NL_GLOBAL_LOCK(); KASSERT((nl_handlers[proto].cb != NULL), ("netlink handler %d is not set", proto)); nl_handlers[proto].cb = NULL; nl_handlers[proto].proto_name = NULL; NL_GLOBAL_UNLOCK(); NL_LOG(LOG_DEBUG2, "Unregistered netlink proto %d handler", proto); return (true); } #if !defined(NETLINK) && defined(NETLINK_MODULE) /* Non-stub function provider */ const static struct nl_function_wrapper nl_module = { .nlmsg_add = _nlmsg_add, .nlmsg_refill_buffer = _nlmsg_refill_buffer, .nlmsg_flush = _nlmsg_flush, .nlmsg_end = _nlmsg_end, .nlmsg_abort = _nlmsg_abort, .nlmsg_get_unicast_writer = _nlmsg_get_unicast_writer, .nlmsg_get_group_writer = _nlmsg_get_group_writer, .nlmsg_get_chain_writer = _nlmsg_get_chain_writer, .nlmsg_end_dump = _nlmsg_end_dump, .nl_modify_ifp_generic = _nl_modify_ifp_generic, .nl_store_ifp_cookie = _nl_store_ifp_cookie, .nl_get_thread_nlp = _nl_get_thread_nlp, }; #endif static bool can_unload(void) { struct nl_control *ctl; bool result = true; NL_GLOBAL_LOCK(); CK_LIST_FOREACH(ctl, &vnets_head, ctl_next) { NL_LOG(LOG_DEBUG2, "Iterating VNET head %p", ctl); if (!CK_LIST_EMPTY(&ctl->ctl_pcb_head)) { NL_LOG(LOG_NOTICE, "non-empty socket list in ctl %p", ctl); result = false; break; } } NL_GLOBAL_UNLOCK(); return (result); } static int netlink_modevent(module_t mod __unused, int what, void *priv __unused) { int ret = 0; switch (what) { case MOD_LOAD: NL_LOG(LOG_DEBUG2, "Loading"); nl_init_msg_zone(); nl_osd_register(); #if !defined(NETLINK) && defined(NETLINK_MODULE) nl_set_functions(&nl_module); #endif break; case MOD_UNLOAD: NL_LOG(LOG_DEBUG2, "Unload called"); if (can_unload()) { NL_LOG(LOG_WARNING, "unloading"); netlink_unloading = 1; #if !defined(NETLINK) && defined(NETLINK_MODULE) nl_set_functions(NULL); #endif nl_osd_unregister(); nl_destroy_msg_zone(); } else ret = EBUSY; break; default: ret = EOPNOTSUPP; break; } return (ret); } static moduledata_t netlink_mod = { "netlink", netlink_modevent, NULL }; DECLARE_MODULE(netlink, netlink_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(netlink, 1); diff --git a/sys/netlink/route/iface.c b/sys/netlink/route/iface.c index b6e120933f83..0587b478431e 100644 --- a/sys/netlink/route/iface.c +++ b/sys/netlink/route/iface.c @@ -1,1534 +1,1532 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#include "opt_netlink.h" - #include #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* scope deembedding */ #include #include #include #include #define DEBUG_MOD_NAME nl_iface #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include _DECLARE_DEBUG(LOG_INFO); struct netlink_walkargs { struct nl_writer *nw; struct nlmsghdr hdr; struct nlpcb *so; struct ucred *cred; uint32_t fibnum; int family; int error; int count; int dumped; }; static eventhandler_tag ifdetach_event, ifattach_event, iflink_event, ifaddr_event; static SLIST_HEAD(, nl_cloner) nl_cloners = SLIST_HEAD_INITIALIZER(nl_cloners); static struct sx rtnl_cloner_lock; SX_SYSINIT(rtnl_cloner_lock, &rtnl_cloner_lock, "rtnl cloner lock"); /* These are external hooks for CARP. */ extern int (*carp_get_vhid_p)(struct ifaddr *); /* * RTM_GETLINK request * sendto(3, {{len=32, type=RTM_GETLINK, flags=NLM_F_REQUEST|NLM_F_DUMP, seq=1641940952, pid=0}, * {ifi_family=AF_INET, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}}, 32, 0, NULL, 0) = 32 * * Reply: * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_ETHER, ifi_index=if_nametoindex("enp0s31f6"), ifi_flags=IFF_UP|IFF_BROADCAST|IFF_RUNNING|IFF_MULTICAST|IFF_LOWER_UP, ifi_change=0}, {{nla_len=10, nla_type=IFLA_ADDRESS}, "\xfe\x54\x00\x52\x3e\x90"} [ {{nla_len=14, nla_type=IFLA_IFNAME}, "enp0s31f6"}, {{nla_len=8, nla_type=IFLA_TXQLEN}, 1000}, {{nla_len=5, nla_type=IFLA_OPERSTATE}, 6}, {{nla_len=5, nla_type=IFLA_LINKMODE}, 0}, {{nla_len=8, nla_type=IFLA_MTU}, 1500}, {{nla_len=8, nla_type=IFLA_MIN_MTU}, 68}, {{nla_len=8, nla_type=IFLA_MAX_MTU}, 9000}, {{nla_len=8, nla_type=IFLA_GROUP}, 0}, {{nla_len=8, nla_type=IFLA_PROMISCUITY}, 0}, {{nla_len=8, nla_type=IFLA_NUM_TX_QUEUES}, 1}, {{nla_len=8, nla_type=IFLA_GSO_MAX_SEGS}, 65535}, {{nla_len=8, nla_type=IFLA_GSO_MAX_SIZE}, 65536}, {{nla_len=8, nla_type=IFLA_NUM_RX_QUEUES}, 1}, {{nla_len=5, nla_type=IFLA_CARRIER}, 1}, {{nla_len=13, nla_type=IFLA_QDISC}, "fq_codel"}, {{nla_len=8, nla_type=IFLA_CARRIER_CHANGES}, 2}, {{nla_len=5, nla_type=IFLA_PROTO_DOWN}, 0}, {{nla_len=8, nla_type=IFLA_CARRIER_UP_COUNT}, 1}, {{nla_len=8, nla_type=IFLA_CARRIER_DOWN_COUNT}, 1}, */ struct if_state { uint8_t ifla_operstate; uint8_t ifla_carrier; }; static void get_operstate_ether(if_t ifp, struct if_state *pstate) { struct ifmediareq ifmr = {}; int error; error = if_ioctl(ifp, SIOCGIFMEDIA, (void *)&ifmr); if (error != 0) { NL_LOG(LOG_DEBUG, "error calling SIOCGIFMEDIA on %s: %d", if_name(ifp), error); return; } switch (IFM_TYPE(ifmr.ifm_active)) { case IFM_ETHER: if (ifmr.ifm_status & IFM_ACTIVE) { pstate->ifla_carrier = 1; if (if_getflags(ifp) & IFF_MONITOR) pstate->ifla_operstate = IF_OPER_DORMANT; else pstate->ifla_operstate = IF_OPER_UP; } else pstate->ifla_operstate = IF_OPER_DOWN; } } static bool get_stats(struct nl_writer *nw, if_t ifp) { struct rtnl_link_stats64 *stats; int nla_len = sizeof(struct nlattr) + sizeof(*stats); struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr); if (nla == NULL) return (false); nla->nla_type = IFLA_STATS64; nla->nla_len = nla_len; stats = (struct rtnl_link_stats64 *)(nla + 1); stats->rx_packets = if_getcounter(ifp, IFCOUNTER_IPACKETS); stats->tx_packets = if_getcounter(ifp, IFCOUNTER_OPACKETS); stats->rx_bytes = if_getcounter(ifp, IFCOUNTER_IBYTES); stats->tx_bytes = if_getcounter(ifp, IFCOUNTER_OBYTES); stats->rx_errors = if_getcounter(ifp, IFCOUNTER_IERRORS); stats->tx_errors = if_getcounter(ifp, IFCOUNTER_OERRORS); stats->rx_dropped = if_getcounter(ifp, IFCOUNTER_IQDROPS); stats->tx_dropped = if_getcounter(ifp, IFCOUNTER_OQDROPS); stats->multicast = if_getcounter(ifp, IFCOUNTER_IMCASTS); stats->rx_nohandler = if_getcounter(ifp, IFCOUNTER_NOPROTO); return (true); } static void get_operstate(if_t ifp, struct if_state *pstate) { pstate->ifla_operstate = IF_OPER_UNKNOWN; pstate->ifla_carrier = 0; /* no carrier */ switch (if_gettype(ifp)) { case IFT_ETHER: case IFT_L2VLAN: get_operstate_ether(ifp, pstate); break; default: /* Map admin state to the operstate */ if (if_getflags(ifp) & IFF_UP) { pstate->ifla_operstate = IF_OPER_UP; pstate->ifla_carrier = 1; } else pstate->ifla_operstate = IF_OPER_DOWN; break; } } static void get_hwaddr(struct nl_writer *nw, if_t ifp) { struct ifreq ifr = {}; if (if_gethwaddr(ifp, &ifr) == 0) { nlattr_add(nw, IFLAF_ORIG_HWADDR, if_getaddrlen(ifp), ifr.ifr_addr.sa_data); } } static unsigned ifp_flags_to_netlink(const if_t ifp) { return (if_getflags(ifp) | if_getdrvflags(ifp)); } #define LLADDR_CONST(s) ((const void *)((s)->sdl_data + (s)->sdl_nlen)) static bool dump_sa(struct nl_writer *nw, int attr, const struct sockaddr *sa) { uint32_t addr_len = 0; const void *addr_data = NULL; #ifdef INET6 struct in6_addr addr6; #endif if (sa == NULL) return (true); switch (sa->sa_family) { #ifdef INET case AF_INET: addr_len = sizeof(struct in_addr); addr_data = &((const struct sockaddr_in *)sa)->sin_addr; break; #endif #ifdef INET6 case AF_INET6: in6_splitscope(&((const struct sockaddr_in6 *)sa)->sin6_addr, &addr6, &addr_len); addr_len = sizeof(struct in6_addr); addr_data = &addr6; break; #endif case AF_LINK: addr_len = ((const struct sockaddr_dl *)sa)->sdl_alen; addr_data = LLADDR_CONST((const struct sockaddr_dl *)sa); break; case AF_UNSPEC: /* Ignore empty SAs without warning */ return (true); default: NL_LOG(LOG_DEBUG2, "unsupported family: %d, skipping", sa->sa_family); return (true); } return (nlattr_add(nw, attr, addr_len, addr_data)); } static bool dump_iface_caps(struct nl_writer *nw, struct ifnet *ifp) { int off = nlattr_add_nested(nw, IFLAF_CAPS); uint32_t active_caps[roundup2(IFCAP_B_SIZE, 32) / 32] = {}; uint32_t all_caps[roundup2(IFCAP_B_SIZE, 32) / 32] = {}; MPASS(sizeof(active_caps) >= 8); MPASS(sizeof(all_caps) >= 8); if (off == 0) return (false); active_caps[0] = (uint32_t)if_getcapabilities(ifp); all_caps[0] = (uint32_t)if_getcapenable(ifp); active_caps[1] = (uint32_t)if_getcapabilities2(ifp); all_caps[1] = (uint32_t)if_getcapenable2(ifp); nlattr_add_u32(nw, NLA_BITSET_SIZE, IFCAP_B_SIZE); nlattr_add(nw, NLA_BITSET_MASK, sizeof(all_caps), all_caps); nlattr_add(nw, NLA_BITSET_VALUE, sizeof(active_caps), active_caps); nlattr_set_len(nw, off); return (true); } /* * Dumps interface state, properties and metrics. * @nw: message writer * @ifp: target interface * @hdr: template header * @if_flags_mask: changed if_[drv]_flags bitmask * * This function is called without epoch and MAY sleep. */ static bool dump_iface(struct nl_writer *nw, if_t ifp, const struct nlmsghdr *hdr, int if_flags_mask) { struct epoch_tracker et; struct ifinfomsg *ifinfo; NL_LOG(LOG_DEBUG3, "dumping interface %s data", if_name(ifp)); if (!nlmsg_reply(nw, hdr, sizeof(struct ifinfomsg))) goto enomem; ifinfo = nlmsg_reserve_object(nw, struct ifinfomsg); ifinfo->ifi_family = AF_UNSPEC; ifinfo->__ifi_pad = 0; ifinfo->ifi_type = if_gettype(ifp); ifinfo->ifi_index = if_getindex(ifp); ifinfo->ifi_flags = ifp_flags_to_netlink(ifp); ifinfo->ifi_change = if_flags_mask; struct if_state ifs = {}; get_operstate(ifp, &ifs); if (ifs.ifla_operstate == IF_OPER_UP) ifinfo->ifi_flags |= IFF_LOWER_UP; nlattr_add_string(nw, IFLA_IFNAME, if_name(ifp)); nlattr_add_u8(nw, IFLA_OPERSTATE, ifs.ifla_operstate); nlattr_add_u8(nw, IFLA_CARRIER, ifs.ifla_carrier); /* nlattr_add_u8(nw, IFLA_PROTO_DOWN, val); nlattr_add_u8(nw, IFLA_LINKMODE, val); */ if (if_getaddrlen(ifp) != 0) { struct ifaddr *ifa; NET_EPOCH_ENTER(et); ifa = CK_STAILQ_FIRST(&ifp->if_addrhead); if (ifa != NULL) dump_sa(nw, IFLA_ADDRESS, ifa->ifa_addr); NET_EPOCH_EXIT(et); } if ((if_getbroadcastaddr(ifp) != NULL)) { nlattr_add(nw, IFLA_BROADCAST, if_getaddrlen(ifp), if_getbroadcastaddr(ifp)); } nlattr_add_u32(nw, IFLA_MTU, if_getmtu(ifp)); /* nlattr_add_u32(nw, IFLA_MIN_MTU, 60); nlattr_add_u32(nw, IFLA_MAX_MTU, 9000); nlattr_add_u32(nw, IFLA_GROUP, 0); */ if (if_getdescr(ifp) != NULL) nlattr_add_string(nw, IFLA_IFALIAS, if_getdescr(ifp)); /* Store FreeBSD-specific attributes */ int off = nlattr_add_nested(nw, IFLA_FREEBSD); if (off != 0) { get_hwaddr(nw, ifp); dump_iface_caps(nw, ifp); nlattr_set_len(nw, off); } get_stats(nw, ifp); uint32_t val = (if_getflags(ifp) & IFF_PROMISC) != 0; nlattr_add_u32(nw, IFLA_PROMISCUITY, val); ifc_dump_ifp_nl(ifp, nw); if (nlmsg_end(nw)) return (true); enomem: NL_LOG(LOG_DEBUG, "unable to dump interface %s state (ENOMEM)", if_name(ifp)); nlmsg_abort(nw); return (false); } static bool check_ifmsg(void *hdr, struct nl_pstate *npt) { struct ifinfomsg *ifm = hdr; if (ifm->__ifi_pad != 0 || ifm->ifi_type != 0 || ifm->ifi_flags != 0 || ifm->ifi_change != 0) { nlmsg_report_err_msg(npt, "strict checking: non-zero values in ifinfomsg header"); return (false); } return (true); } #define _IN(_field) offsetof(struct ifinfomsg, _field) #define _OUT(_field) offsetof(struct nl_parsed_link, _field) static const struct nlfield_parser nlf_p_if[] = { { .off_in = _IN(ifi_type), .off_out = _OUT(ifi_type), .cb = nlf_get_u16 }, { .off_in = _IN(ifi_index), .off_out = _OUT(ifi_index), .cb = nlf_get_u32 }, { .off_in = _IN(ifi_flags), .off_out = _OUT(ifi_flags), .cb = nlf_get_u32 }, { .off_in = _IN(ifi_change), .off_out = _OUT(ifi_change), .cb = nlf_get_u32 }, }; static const struct nlattr_parser nla_p_linfo[] = { { .type = IFLA_INFO_KIND, .off = _OUT(ifla_cloner), .cb = nlattr_get_stringn }, { .type = IFLA_INFO_DATA, .off = _OUT(ifla_idata), .cb = nlattr_get_nla }, }; NL_DECLARE_ATTR_PARSER(linfo_parser, nla_p_linfo); static const struct nlattr_parser nla_p_if[] = { { .type = IFLA_IFNAME, .off = _OUT(ifla_ifname), .cb = nlattr_get_string }, { .type = IFLA_MTU, .off = _OUT(ifla_mtu), .cb = nlattr_get_uint32 }, { .type = IFLA_LINK, .off = _OUT(ifla_link), .cb = nlattr_get_uint32 }, { .type = IFLA_LINKINFO, .arg = &linfo_parser, .cb = nlattr_get_nested }, { .type = IFLA_IFALIAS, .off = _OUT(ifla_ifalias), .cb = nlattr_get_string }, { .type = IFLA_GROUP, .off = _OUT(ifla_group), .cb = nlattr_get_string }, { .type = IFLA_ALT_IFNAME, .off = _OUT(ifla_ifname), .cb = nlattr_get_string }, }; #undef _IN #undef _OUT NL_DECLARE_STRICT_PARSER(ifmsg_parser, struct ifinfomsg, check_ifmsg, nlf_p_if, nla_p_if); static bool match_iface(if_t ifp, void *_arg) { struct nl_parsed_link *attrs = (struct nl_parsed_link *)_arg; if (attrs->ifi_index != 0 && attrs->ifi_index != if_getindex(ifp)) return (false); if (attrs->ifi_type != 0 && attrs->ifi_index != if_gettype(ifp)) return (false); if (attrs->ifla_ifname != NULL && strcmp(attrs->ifla_ifname, if_name(ifp))) return (false); /* TODO: add group match */ return (true); } static int dump_cb(if_t ifp, void *_arg) { struct netlink_walkargs *wa = (struct netlink_walkargs *)_arg; if (!dump_iface(wa->nw, ifp, &wa->hdr, 0)) return (ENOMEM); return (0); } /* * {nlmsg_len=52, nlmsg_type=RTM_GETLINK, nlmsg_flags=NLM_F_REQUEST, nlmsg_seq=1662842818, nlmsg_pid=0}, * {ifi_family=AF_PACKET, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}, * [ * [{nla_len=10, nla_type=IFLA_IFNAME}, "vnet9"], * [{nla_len=8, nla_type=IFLA_EXT_MASK}, RTEXT_FILTER_VF] * ] */ static int rtnl_handle_getlink(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct epoch_tracker et; if_t ifp; int error = 0; struct nl_parsed_link attrs = {}; error = nl_parse_nlmsg(hdr, &ifmsg_parser, npt, &attrs); if (error != 0) return (error); struct netlink_walkargs wa = { .so = nlp, .nw = npt->nw, .hdr.nlmsg_pid = hdr->nlmsg_pid, .hdr.nlmsg_seq = hdr->nlmsg_seq, .hdr.nlmsg_flags = hdr->nlmsg_flags, .hdr.nlmsg_type = NL_RTM_NEWLINK, }; /* Fast track for an interface w/ explicit name or index match */ if ((attrs.ifi_index != 0) || (attrs.ifla_ifname != NULL)) { if (attrs.ifi_index != 0) { NLP_LOG(LOG_DEBUG3, nlp, "fast track -> searching index %u", attrs.ifi_index); NET_EPOCH_ENTER(et); ifp = ifnet_byindex_ref(attrs.ifi_index); NET_EPOCH_EXIT(et); } else { NLP_LOG(LOG_DEBUG3, nlp, "fast track -> searching name %s", attrs.ifla_ifname); ifp = ifunit_ref(attrs.ifla_ifname); } if (ifp != NULL) { if (match_iface(ifp, &attrs)) { if (!dump_iface(wa.nw, ifp, &wa.hdr, 0)) error = ENOMEM; } else error = ENODEV; if_rele(ifp); } else error = ENODEV; return (error); } /* Always treat non-direct-match as a multipart message */ wa.hdr.nlmsg_flags |= NLM_F_MULTI; /* * Fetching some link properties require performing ioctl's that may be blocking. * Address it by saving referenced pointers of the matching links, * exiting from epoch and going through the list one-by-one. */ NL_LOG(LOG_DEBUG2, "Start dump"); if_foreach_sleep(match_iface, &attrs, dump_cb, &wa); NL_LOG(LOG_DEBUG2, "End dump, iterated %d dumped %d", wa.count, wa.dumped); if (!nlmsg_end_dump(wa.nw, error, &wa.hdr)) { NL_LOG(LOG_DEBUG, "Unable to finalize the dump"); return (ENOMEM); } return (error); } /* * sendmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[ * {nlmsg_len=60, nlmsg_type=RTM_NEWLINK, nlmsg_flags=NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE, nlmsg_seq=1662715618, nlmsg_pid=0}, * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}, * {nla_len=11, nla_type=IFLA_IFNAME}, "dummy0"], * [ * {nla_len=16, nla_type=IFLA_LINKINFO}, * [ * {nla_len=9, nla_type=IFLA_INFO_KIND}, "dummy"... * ] * ] */ static int rtnl_handle_dellink(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct epoch_tracker et; if_t ifp; int error; struct nl_parsed_link attrs = {}; error = nl_parse_nlmsg(hdr, &ifmsg_parser, npt, &attrs); if (error != 0) return (error); NET_EPOCH_ENTER(et); ifp = ifnet_byindex_ref(attrs.ifi_index); NET_EPOCH_EXIT(et); if (ifp == NULL) { NLP_LOG(LOG_DEBUG, nlp, "unable to find interface %u", attrs.ifi_index); return (ENOENT); } NLP_LOG(LOG_DEBUG3, nlp, "mapped ifindex %u to %s", attrs.ifi_index, if_name(ifp)); sx_xlock(&ifnet_detach_sxlock); error = if_clone_destroy(if_name(ifp)); sx_xunlock(&ifnet_detach_sxlock); NLP_LOG(LOG_DEBUG2, nlp, "deleting interface %s returned %d", if_name(ifp), error); if_rele(ifp); return (error); } /* * New link: * type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE, seq=1668185590, pid=0}, * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0} * [ * {{nla_len=8, nla_type=IFLA_MTU}, 123}, * {{nla_len=10, nla_type=IFLA_IFNAME}, "vlan1"}, * {{nla_len=24, nla_type=IFLA_LINKINFO}, * [ * {{nla_len=8, nla_type=IFLA_INFO_KIND}, "vlan"...}, * {{nla_len=12, nla_type=IFLA_INFO_DATA}, "\x06\x00\x01\x00\x7b\x00\x00\x00"}]}]} * * Update link: * type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK, seq=1668185923, pid=0}, * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=if_nametoindex("lo"), ifi_flags=0, ifi_change=0}, * {{nla_len=8, nla_type=IFLA_MTU}, 123}} * * * Check command availability: * type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK, seq=0, pid=0}, * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0} */ static int create_link(struct nlmsghdr *hdr, struct nl_parsed_link *lattrs, struct nlattr_bmask *bm, struct nlpcb *nlp, struct nl_pstate *npt) { if (lattrs->ifla_ifname == NULL || strlen(lattrs->ifla_ifname) == 0) { NLMSG_REPORT_ERR_MSG(npt, "empty IFLA_IFNAME attribute"); return (EINVAL); } if (lattrs->ifla_cloner == NULL || strlen(lattrs->ifla_cloner) == 0) { NLMSG_REPORT_ERR_MSG(npt, "empty IFLA_INFO_KIND attribute"); return (EINVAL); } struct ifc_data_nl ifd = { .flags = IFC_F_CREATE, .lattrs = lattrs, .bm = bm, .npt = npt, }; if (ifc_create_ifp_nl(lattrs->ifla_ifname, &ifd) && ifd.error == 0) nl_store_ifp_cookie(npt, ifd.ifp); return (ifd.error); } static int modify_link(struct nlmsghdr *hdr, struct nl_parsed_link *lattrs, struct nlattr_bmask *bm, struct nlpcb *nlp, struct nl_pstate *npt) { if_t ifp = NULL; struct epoch_tracker et; if (lattrs->ifi_index == 0 && lattrs->ifla_ifname == NULL) { /* * Applications like ip(8) verify RTM_NEWLINK command * existence by calling it with empty arguments. Always * return "innocent" error in that case. */ NLMSG_REPORT_ERR_MSG(npt, "empty ifi_index field"); return (EPERM); } if (lattrs->ifi_index != 0) { NET_EPOCH_ENTER(et); ifp = ifnet_byindex_ref(lattrs->ifi_index); NET_EPOCH_EXIT(et); if (ifp == NULL) { NLMSG_REPORT_ERR_MSG(npt, "unable to find interface #%u", lattrs->ifi_index); return (ENOENT); } } if (ifp == NULL && lattrs->ifla_ifname != NULL) { ifp = ifunit_ref(lattrs->ifla_ifname); if (ifp == NULL) { NLMSG_REPORT_ERR_MSG(npt, "unable to find interface %s", lattrs->ifla_ifname); return (ENOENT); } } MPASS(ifp != NULL); /* * Modification request can address either * 1) cloned interface, in which case we call the cloner-specific * modification routine * or * 2) non-cloned (e.g. "physical") interface, in which case we call * generic modification routine */ struct ifc_data_nl ifd = { .lattrs = lattrs, .bm = bm, .npt = npt }; if (!ifc_modify_ifp_nl(ifp, &ifd)) ifd.error = nl_modify_ifp_generic(ifp, lattrs, bm, npt); if_rele(ifp); return (ifd.error); } static int rtnl_handle_newlink(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct nlattr_bmask bm; int error; struct nl_parsed_link attrs = {}; error = nl_parse_nlmsg(hdr, &ifmsg_parser, npt, &attrs); if (error != 0) return (error); nl_get_attrs_bmask_nlmsg(hdr, &ifmsg_parser, &bm); if (hdr->nlmsg_flags & NLM_F_CREATE) return (create_link(hdr, &attrs, &bm, nlp, npt)); else return (modify_link(hdr, &attrs, &bm, nlp, npt)); } static void set_scope6(struct sockaddr *sa, uint32_t ifindex) { #ifdef INET6 if (sa != NULL && sa->sa_family == AF_INET6) { struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_LINKLOCAL(&sa6->sin6_addr)) in6_set_unicast_scopeid(&sa6->sin6_addr, ifindex); } #endif } static bool check_sa_family(const struct sockaddr *sa, int family, const char *attr_name, struct nl_pstate *npt) { if (sa == NULL || sa->sa_family == family) return (true); nlmsg_report_err_msg(npt, "wrong family for %s attribute: %d != %d", attr_name, family, sa->sa_family); return (false); } struct nl_parsed_ifa { uint8_t ifa_family; uint8_t ifa_prefixlen; uint8_t ifa_scope; uint32_t ifa_index; uint32_t ifa_flags; uint32_t ifaf_vhid; uint32_t ifaf_flags; struct sockaddr *ifa_address; struct sockaddr *ifa_local; struct sockaddr *ifa_broadcast; struct ifa_cacheinfo *ifa_cacheinfo; struct sockaddr *f_ifa_addr; struct sockaddr *f_ifa_dst; }; static int nlattr_get_cinfo(struct nlattr *nla, struct nl_pstate *npt, const void *arg __unused, void *target) { if (__predict_false(NLA_DATA_LEN(nla) != sizeof(struct ifa_cacheinfo))) { NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not ifa_cacheinfo", nla->nla_type, NLA_DATA_LEN(nla)); return (EINVAL); } *((struct ifa_cacheinfo **)target) = (struct ifa_cacheinfo *)NL_RTA_DATA(nla); return (0); } #define _IN(_field) offsetof(struct ifaddrmsg, _field) #define _OUT(_field) offsetof(struct nl_parsed_ifa, _field) static const struct nlfield_parser nlf_p_ifa[] = { { .off_in = _IN(ifa_family), .off_out = _OUT(ifa_family), .cb = nlf_get_u8 }, { .off_in = _IN(ifa_prefixlen), .off_out = _OUT(ifa_prefixlen), .cb = nlf_get_u8 }, { .off_in = _IN(ifa_scope), .off_out = _OUT(ifa_scope), .cb = nlf_get_u8 }, { .off_in = _IN(ifa_flags), .off_out = _OUT(ifa_flags), .cb = nlf_get_u8_u32 }, { .off_in = _IN(ifa_index), .off_out = _OUT(ifa_index), .cb = nlf_get_u32 }, }; static const struct nlattr_parser nla_p_ifa_fbsd[] = { { .type = IFAF_VHID, .off = _OUT(ifaf_vhid), .cb = nlattr_get_uint32 }, { .type = IFAF_FLAGS, .off = _OUT(ifaf_flags), .cb = nlattr_get_uint32 }, }; NL_DECLARE_ATTR_PARSER(ifa_fbsd_parser, nla_p_ifa_fbsd); static const struct nlattr_parser nla_p_ifa[] = { { .type = IFA_ADDRESS, .off = _OUT(ifa_address), .cb = nlattr_get_ip }, { .type = IFA_LOCAL, .off = _OUT(ifa_local), .cb = nlattr_get_ip }, { .type = IFA_BROADCAST, .off = _OUT(ifa_broadcast), .cb = nlattr_get_ip }, { .type = IFA_CACHEINFO, .off = _OUT(ifa_cacheinfo), .cb = nlattr_get_cinfo }, { .type = IFA_FLAGS, .off = _OUT(ifa_flags), .cb = nlattr_get_uint32 }, { .type = IFA_FREEBSD, .arg = &ifa_fbsd_parser, .cb = nlattr_get_nested }, }; #undef _IN #undef _OUT static bool post_p_ifa(void *_attrs, struct nl_pstate *npt) { struct nl_parsed_ifa *attrs = (struct nl_parsed_ifa *)_attrs; if (!check_sa_family(attrs->ifa_address, attrs->ifa_family, "IFA_ADDRESS", npt)) return (false); if (!check_sa_family(attrs->ifa_local, attrs->ifa_family, "IFA_LOCAL", npt)) return (false); if (!check_sa_family(attrs->ifa_broadcast, attrs->ifa_family, "IFA_BROADADDR", npt)) return (false); set_scope6(attrs->ifa_address, attrs->ifa_index); set_scope6(attrs->ifa_local, attrs->ifa_index); return (true); } NL_DECLARE_PARSER_EXT(ifa_parser, struct ifaddrmsg, NULL, nlf_p_ifa, nla_p_ifa, post_p_ifa); /* {ifa_family=AF_INET, ifa_prefixlen=8, ifa_flags=IFA_F_PERMANENT, ifa_scope=RT_SCOPE_HOST, ifa_index=if_nametoindex("lo")}, [ {{nla_len=8, nla_type=IFA_ADDRESS}, inet_addr("127.0.0.1")}, {{nla_len=8, nla_type=IFA_LOCAL}, inet_addr("127.0.0.1")}, {{nla_len=7, nla_type=IFA_LABEL}, "lo"}, {{nla_len=8, nla_type=IFA_FLAGS}, IFA_F_PERMANENT}, {{nla_len=20, nla_type=IFA_CACHEINFO}, {ifa_prefered=4294967295, ifa_valid=4294967295, cstamp=3619, tstamp=3619}}]}, --- {{len=72, type=RTM_NEWADDR, flags=NLM_F_MULTI, seq=1642191126, pid=566735}, {ifa_family=AF_INET6, ifa_prefixlen=96, ifa_flags=IFA_F_PERMANENT, ifa_scope=RT_SCOPE_UNIVERSE, ifa_index=if_nametoindex("virbr0")}, [ {{nla_len=20, nla_type=IFA_ADDRESS}, inet_pton(AF_INET6, "2a01:4f8:13a:70c:ffff::1")}, {{nla_len=20, nla_type=IFA_CACHEINFO}, {ifa_prefered=4294967295, ifa_valid=4294967295, cstamp=4283, tstamp=4283}}, {{nla_len=8, nla_type=IFA_FLAGS}, IFA_F_PERMANENT}]}, */ static uint8_t ifa_get_scope(const struct ifaddr *ifa) { const struct sockaddr *sa; uint8_t addr_scope = RT_SCOPE_UNIVERSE; sa = ifa->ifa_addr; switch (sa->sa_family) { #ifdef INET case AF_INET: { struct in_addr addr; addr = ((const struct sockaddr_in *)sa)->sin_addr; if (IN_LOOPBACK(addr.s_addr)) addr_scope = RT_SCOPE_HOST; else if (IN_LINKLOCAL(addr.s_addr)) addr_scope = RT_SCOPE_LINK; break; } #endif #ifdef INET6 case AF_INET6: { const struct in6_addr *addr; addr = &((const struct sockaddr_in6 *)sa)->sin6_addr; if (IN6_IS_ADDR_LOOPBACK(addr)) addr_scope = RT_SCOPE_HOST; else if (IN6_IS_ADDR_LINKLOCAL(addr)) addr_scope = RT_SCOPE_LINK; break; } #endif } return (addr_scope); } #ifdef INET6 static uint8_t inet6_get_plen(const struct in6_addr *addr) { return (bitcount32(addr->s6_addr32[0]) + bitcount32(addr->s6_addr32[1]) + bitcount32(addr->s6_addr32[2]) + bitcount32(addr->s6_addr32[3])); } #endif static uint8_t get_sa_plen(const struct sockaddr *sa) { #ifdef INET const struct in_addr *paddr; #endif #ifdef INET6 const struct in6_addr *paddr6; #endif switch (sa->sa_family) { #ifdef INET case AF_INET: paddr = &(((const struct sockaddr_in *)sa)->sin_addr); return bitcount32(paddr->s_addr);; #endif #ifdef INET6 case AF_INET6: paddr6 = &(((const struct sockaddr_in6 *)sa)->sin6_addr); return inet6_get_plen(paddr6); #endif } return (0); } #ifdef INET6 static uint32_t in6_flags_to_nl(uint32_t flags) { uint32_t nl_flags = 0; if (flags & IN6_IFF_TEMPORARY) nl_flags |= IFA_F_TEMPORARY; if (flags & IN6_IFF_NODAD) nl_flags |= IFA_F_NODAD; if (flags & IN6_IFF_DEPRECATED) nl_flags |= IFA_F_DEPRECATED; if (flags & IN6_IFF_TENTATIVE) nl_flags |= IFA_F_TENTATIVE; if ((flags & (IN6_IFF_AUTOCONF|IN6_IFF_TEMPORARY)) == 0) flags |= IFA_F_PERMANENT; if (flags & IN6_IFF_DUPLICATED) flags |= IFA_F_DADFAILED; return (nl_flags); } static uint32_t nl_flags_to_in6(uint32_t flags) { uint32_t in6_flags = 0; if (flags & IFA_F_TEMPORARY) in6_flags |= IN6_IFF_TEMPORARY; if (flags & IFA_F_NODAD) in6_flags |= IN6_IFF_NODAD; if (flags & IFA_F_DEPRECATED) in6_flags |= IN6_IFF_DEPRECATED; if (flags & IFA_F_TENTATIVE) in6_flags |= IN6_IFF_TENTATIVE; if (flags & IFA_F_DADFAILED) in6_flags |= IN6_IFF_DUPLICATED; return (in6_flags); } static void export_cache_info6(struct nl_writer *nw, const struct in6_ifaddr *ia) { struct ifa_cacheinfo ci = { .cstamp = ia->ia6_createtime * 1000, .tstamp = ia->ia6_updatetime * 1000, .ifa_prefered = ia->ia6_lifetime.ia6t_pltime, .ifa_valid = ia->ia6_lifetime.ia6t_vltime, }; nlattr_add(nw, IFA_CACHEINFO, sizeof(ci), &ci); } #endif static void export_cache_info(struct nl_writer *nw, struct ifaddr *ifa) { switch (ifa->ifa_addr->sa_family) { #ifdef INET6 case AF_INET6: export_cache_info6(nw, (struct in6_ifaddr *)ifa); break; #endif } } /* * {'attrs': [('IFA_ADDRESS', '12.0.0.1'), ('IFA_LOCAL', '12.0.0.1'), ('IFA_LABEL', 'eth10'), ('IFA_FLAGS', 128), ('IFA_CACHEINFO', {'ifa_preferred': 4294967295, 'ifa_valid': 4294967295, 'cstamp': 63745746, 'tstamp': 63745746})], */ static bool dump_iface_addr(struct nl_writer *nw, if_t ifp, struct ifaddr *ifa, const struct nlmsghdr *hdr) { struct ifaddrmsg *ifamsg; struct sockaddr *sa = ifa->ifa_addr; struct sockaddr *sa_dst = ifa->ifa_dstaddr; NL_LOG(LOG_DEBUG3, "dumping ifa %p type %s(%d) for interface %s", ifa, rib_print_family(sa->sa_family), sa->sa_family, if_name(ifp)); if (!nlmsg_reply(nw, hdr, sizeof(struct ifaddrmsg))) goto enomem; ifamsg = nlmsg_reserve_object(nw, struct ifaddrmsg); ifamsg->ifa_family = sa->sa_family; ifamsg->ifa_prefixlen = get_sa_plen(ifa->ifa_netmask); ifamsg->ifa_flags = 0; // ifa_flags is useless ifamsg->ifa_scope = ifa_get_scope(ifa); ifamsg->ifa_index = if_getindex(ifp); if ((if_getflags(ifp) & IFF_POINTOPOINT) && sa_dst != NULL && sa_dst->sa_family != 0) { /* P2P interface may have IPv6 LL with no dst address */ dump_sa(nw, IFA_ADDRESS, sa_dst); dump_sa(nw, IFA_LOCAL, sa); } else { dump_sa(nw, IFA_ADDRESS, sa); #ifdef INET /* * In most cases, IFA_ADDRESS == IFA_LOCAL * Skip IFA_LOCAL for anything except INET */ if (sa->sa_family == AF_INET) dump_sa(nw, IFA_LOCAL, sa); #endif } if (if_getflags(ifp) & IFF_BROADCAST) dump_sa(nw, IFA_BROADCAST, ifa->ifa_broadaddr); nlattr_add_string(nw, IFA_LABEL, if_name(ifp)); uint32_t nl_ifa_flags = 0; #ifdef INET6 if (sa->sa_family == AF_INET6) { struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; nl_ifa_flags = in6_flags_to_nl(ia->ia6_flags); } #endif nlattr_add_u32(nw, IFA_FLAGS, nl_ifa_flags); export_cache_info(nw, ifa); /* Store FreeBSD-specific attributes */ int off = nlattr_add_nested(nw, IFA_FREEBSD); if (off != 0) { if (ifa->ifa_carp != NULL && carp_get_vhid_p != NULL) { uint32_t vhid = (uint32_t)(*carp_get_vhid_p)(ifa); nlattr_add_u32(nw, IFAF_VHID, vhid); } #ifdef INET6 if (sa->sa_family == AF_INET6) { uint32_t ifa_flags = ((struct in6_ifaddr *)ifa)->ia6_flags; nlattr_add_u32(nw, IFAF_FLAGS, ifa_flags); } #endif nlattr_set_len(nw, off); } if (nlmsg_end(nw)) return (true); enomem: NL_LOG(LOG_DEBUG, "Failed to dump ifa type %s(%d) for interface %s", rib_print_family(sa->sa_family), sa->sa_family, if_name(ifp)); nlmsg_abort(nw); return (false); } static int dump_iface_addrs(struct netlink_walkargs *wa, if_t ifp) { struct ifaddr *ifa; struct ifa_iter it; int error = 0; for (ifa = ifa_iter_start(ifp, &it); ifa != NULL; ifa = ifa_iter_next(&it)) { if (wa->family != 0 && wa->family != ifa->ifa_addr->sa_family) continue; if (ifa->ifa_addr->sa_family == AF_LINK) continue; if (prison_if(wa->cred, ifa->ifa_addr) != 0) continue; wa->count++; if (!dump_iface_addr(wa->nw, ifp, ifa, &wa->hdr)) { error = ENOMEM; break; } wa->dumped++; } ifa_iter_finish(&it); return (error); } static int rtnl_handle_getaddr(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { if_t ifp; int error = 0; struct nl_parsed_ifa attrs = {}; error = nl_parse_nlmsg(hdr, &ifa_parser, npt, &attrs); if (error != 0) return (error); struct netlink_walkargs wa = { .so = nlp, .nw = npt->nw, .cred = nlp_get_cred(nlp), .family = attrs.ifa_family, .hdr.nlmsg_pid = hdr->nlmsg_pid, .hdr.nlmsg_seq = hdr->nlmsg_seq, .hdr.nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI, .hdr.nlmsg_type = NL_RTM_NEWADDR, }; NL_LOG(LOG_DEBUG2, "Start dump"); if (attrs.ifa_index != 0) { ifp = ifnet_byindex(attrs.ifa_index); if (ifp == NULL) error = ENOENT; else error = dump_iface_addrs(&wa, ifp); } else { struct if_iter it; for (ifp = if_iter_start(&it); ifp != NULL; ifp = if_iter_next(&it)) { error = dump_iface_addrs(&wa, ifp); if (error != 0) break; } if_iter_finish(&it); } NL_LOG(LOG_DEBUG2, "End dump, iterated %d dumped %d", wa.count, wa.dumped); if (!nlmsg_end_dump(wa.nw, error, &wa.hdr)) { NL_LOG(LOG_DEBUG, "Unable to finalize the dump"); return (ENOMEM); } return (error); } #ifdef INET static int handle_newaddr_inet(struct nlmsghdr *hdr, struct nl_parsed_ifa *attrs, if_t ifp, struct nlpcb *nlp, struct nl_pstate *npt) { int plen = attrs->ifa_prefixlen; int if_flags = if_getflags(ifp); struct sockaddr_in *addr, *dst; if (plen > 32) { nlmsg_report_err_msg(npt, "invalid ifa_prefixlen"); return (EINVAL); }; if (if_flags & IFF_POINTOPOINT) { /* * Only P2P IFAs are allowed by the implementation. */ if (attrs->ifa_address == NULL || attrs->ifa_local == NULL) { nlmsg_report_err_msg(npt, "Empty IFA_LOCAL/IFA_ADDRESS"); return (EINVAL); } addr = (struct sockaddr_in *)attrs->ifa_local; dst = (struct sockaddr_in *)attrs->ifa_address; } else { /* * Map the Netlink attributes to FreeBSD ifa layout. * If only IFA_ADDRESS or IFA_LOCAL is set OR * both are set to the same value => ifa is not p2p * and the attribute value contains interface address. * * Otherwise (both IFA_ADDRESS and IFA_LOCAL are set and * different), IFA_LOCAL contains an interface address and * IFA_ADDRESS contains peer address. */ addr = (struct sockaddr_in *)attrs->ifa_local; if (addr == NULL) addr = (struct sockaddr_in *)attrs->ifa_address; if (addr == NULL) { nlmsg_report_err_msg(npt, "Empty IFA_LOCAL/IFA_ADDRESS"); return (EINVAL); } /* Generate broadcast address if not set */ if ((if_flags & IFF_BROADCAST) && attrs->ifa_broadcast == NULL) { uint32_t s_baddr; struct sockaddr_in *sin_brd; if (plen == 31) s_baddr = INADDR_BROADCAST; /* RFC 3021 */ else { uint32_t s_mask; s_mask = htonl(plen ? ~((1 << (32 - plen)) - 1) : 0); s_baddr = addr->sin_addr.s_addr | ~s_mask; } sin_brd = (struct sockaddr_in *)npt_alloc(npt, sizeof(*sin_brd)); if (sin_brd == NULL) return (ENOMEM); sin_brd->sin_family = AF_INET; sin_brd->sin_len = sizeof(*sin_brd); sin_brd->sin_addr.s_addr = s_baddr; attrs->ifa_broadcast = (struct sockaddr *)sin_brd; } dst = (struct sockaddr_in *)attrs->ifa_broadcast; } struct sockaddr_in mask = { .sin_len = sizeof(struct sockaddr_in), .sin_family = AF_INET, .sin_addr.s_addr = htonl(plen ? ~((1 << (32 - plen)) - 1) : 0), }; struct in_aliasreq req = { .ifra_addr = *addr, .ifra_mask = mask, .ifra_vhid = attrs->ifaf_vhid, }; if (dst != NULL) req.ifra_dstaddr = *dst; return (in_control_ioctl(SIOCAIFADDR, &req, ifp, nlp_get_cred(nlp))); } static int handle_deladdr_inet(struct nlmsghdr *hdr, struct nl_parsed_ifa *attrs, if_t ifp, struct nlpcb *nlp, struct nl_pstate *npt) { struct sockaddr_in *addr = (struct sockaddr_in *)attrs->ifa_local; if (addr == NULL) addr = (struct sockaddr_in *)attrs->ifa_address; if (addr == NULL) { nlmsg_report_err_msg(npt, "empty IFA_ADDRESS/IFA_LOCAL"); return (EINVAL); } struct in_aliasreq req = { .ifra_addr = *addr }; return (in_control_ioctl(SIOCDIFADDR, &req, ifp, nlp_get_cred(nlp))); } #endif #ifdef INET6 static int handle_newaddr_inet6(struct nlmsghdr *hdr, struct nl_parsed_ifa *attrs, if_t ifp, struct nlpcb *nlp, struct nl_pstate *npt) { struct sockaddr_in6 *addr, *dst; if (attrs->ifa_prefixlen > 128) { nlmsg_report_err_msg(npt, "invalid ifa_prefixlen"); return (EINVAL); } /* * In IPv6 implementation, adding non-P2P address to the P2P interface * is allowed. */ addr = (struct sockaddr_in6 *)(attrs->ifa_local); dst = (struct sockaddr_in6 *)(attrs->ifa_address); if (addr == NULL) { addr = dst; dst = NULL; } else if (dst != NULL) { if (IN6_ARE_ADDR_EQUAL(&addr->sin6_addr, &dst->sin6_addr)) { /* * Sometimes Netlink users fills in both attributes * with the same address. It still means "non-p2p". */ dst = NULL; } } if (addr == NULL) { nlmsg_report_err_msg(npt, "Empty IFA_LOCAL/IFA_ADDRESS"); return (EINVAL); } uint32_t flags = nl_flags_to_in6(attrs->ifa_flags) | attrs->ifaf_flags; uint32_t pltime = 0, vltime = 0; if (attrs->ifa_cacheinfo != 0) { pltime = attrs->ifa_cacheinfo->ifa_prefered; vltime = attrs->ifa_cacheinfo->ifa_valid; } struct sockaddr_in6 mask = { .sin6_len = sizeof(struct sockaddr_in6), .sin6_family = AF_INET6, }; ip6_writemask(&mask.sin6_addr, attrs->ifa_prefixlen); struct in6_aliasreq req = { .ifra_addr = *addr, .ifra_prefixmask = mask, .ifra_flags = flags, .ifra_lifetime = { .ia6t_vltime = vltime, .ia6t_pltime = pltime }, .ifra_vhid = attrs->ifaf_vhid, }; if (dst != NULL) req.ifra_dstaddr = *dst; return (in6_control_ioctl(SIOCAIFADDR_IN6, &req, ifp, nlp_get_cred(nlp))); } static int handle_deladdr_inet6(struct nlmsghdr *hdr, struct nl_parsed_ifa *attrs, if_t ifp, struct nlpcb *nlp, struct nl_pstate *npt) { struct sockaddr_in6 *addr = (struct sockaddr_in6 *)attrs->ifa_local; if (addr == NULL) addr = (struct sockaddr_in6 *)(attrs->ifa_address); if (addr == NULL) { nlmsg_report_err_msg(npt, "Empty IFA_LOCAL/IFA_ADDRESS"); return (EINVAL); } struct in6_aliasreq req = { .ifra_addr = *addr }; return (in6_control_ioctl(SIOCDIFADDR_IN6, &req, ifp, nlp_get_cred(nlp))); } #endif static int rtnl_handle_addr(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct epoch_tracker et; int error; struct nl_parsed_ifa attrs = {}; error = nl_parse_nlmsg(hdr, &ifa_parser, npt, &attrs); if (error != 0) return (error); NET_EPOCH_ENTER(et); if_t ifp = ifnet_byindex_ref(attrs.ifa_index); NET_EPOCH_EXIT(et); if (ifp == NULL) { nlmsg_report_err_msg(npt, "Unable to find interface with index %u", attrs.ifa_index); return (ENOENT); } int if_flags = if_getflags(ifp); #if defined(INET) || defined(INET6) bool new = hdr->nlmsg_type == NL_RTM_NEWADDR; #endif /* * TODO: Properly handle NLM_F_CREATE / NLM_F_EXCL. * The current ioctl-based KPI always does an implicit create-or-replace. * It is not possible to specify fine-grained options. */ switch (attrs.ifa_family) { #ifdef INET case AF_INET: if (new) error = handle_newaddr_inet(hdr, &attrs, ifp, nlp, npt); else error = handle_deladdr_inet(hdr, &attrs, ifp, nlp, npt); break; #endif #ifdef INET6 case AF_INET6: if (new) error = handle_newaddr_inet6(hdr, &attrs, ifp, nlp, npt); else error = handle_deladdr_inet6(hdr, &attrs, ifp, nlp, npt); break; #endif default: error = EAFNOSUPPORT; } if (error == 0 && !(if_flags & IFF_UP) && (if_getflags(ifp) & IFF_UP)) if_up(ifp); if_rele(ifp); return (error); } static void rtnl_handle_ifaddr(void *arg __unused, struct ifaddr *ifa, int cmd) { struct nlmsghdr hdr = {}; struct nl_writer nw = {}; uint32_t group = 0; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: group = RTNLGRP_IPV4_IFADDR; break; #endif #ifdef INET6 case AF_INET6: group = RTNLGRP_IPV6_IFADDR; break; #endif default: NL_LOG(LOG_DEBUG2, "ifa notification for unknown AF: %d", ifa->ifa_addr->sa_family); return; } if (!nl_has_listeners(NETLINK_ROUTE, group)) return; if (!nlmsg_get_group_writer(&nw, NLMSG_LARGE, NETLINK_ROUTE, group)) { NL_LOG(LOG_DEBUG, "error allocating group writer"); return; } hdr.nlmsg_type = (cmd == RTM_DELETE) ? NL_RTM_DELADDR : NL_RTM_NEWADDR; dump_iface_addr(&nw, ifa->ifa_ifp, ifa, &hdr); nlmsg_flush(&nw); } static void rtnl_handle_ifevent(if_t ifp, int nlmsg_type, int if_flags_mask) { struct nlmsghdr hdr = { .nlmsg_type = nlmsg_type }; struct nl_writer nw = {}; if (!nl_has_listeners(NETLINK_ROUTE, RTNLGRP_LINK)) return; if (!nlmsg_get_group_writer(&nw, NLMSG_LARGE, NETLINK_ROUTE, RTNLGRP_LINK)) { NL_LOG(LOG_DEBUG, "error allocating mbuf"); return; } dump_iface(&nw, ifp, &hdr, if_flags_mask); nlmsg_flush(&nw); } static void rtnl_handle_ifattach(void *arg, if_t ifp) { NL_LOG(LOG_DEBUG2, "ifnet %s", if_name(ifp)); rtnl_handle_ifevent(ifp, NL_RTM_NEWLINK, 0); } static void rtnl_handle_ifdetach(void *arg, if_t ifp) { NL_LOG(LOG_DEBUG2, "ifnet %s", if_name(ifp)); rtnl_handle_ifevent(ifp, NL_RTM_DELLINK, 0); } static void rtnl_handle_iflink(void *arg, if_t ifp) { NL_LOG(LOG_DEBUG2, "ifnet %s", if_name(ifp)); rtnl_handle_ifevent(ifp, NL_RTM_NEWLINK, 0); } void rtnl_handle_ifnet_event(if_t ifp, int if_flags_mask) { NL_LOG(LOG_DEBUG2, "ifnet %s", if_name(ifp)); rtnl_handle_ifevent(ifp, NL_RTM_NEWLINK, if_flags_mask); } static const struct rtnl_cmd_handler cmd_handlers[] = { { .cmd = NL_RTM_GETLINK, .name = "RTM_GETLINK", .cb = &rtnl_handle_getlink, .flags = RTNL_F_NOEPOCH | RTNL_F_ALLOW_NONVNET_JAIL, }, { .cmd = NL_RTM_DELLINK, .name = "RTM_DELLINK", .cb = &rtnl_handle_dellink, .priv = PRIV_NET_IFDESTROY, .flags = RTNL_F_NOEPOCH, }, { .cmd = NL_RTM_NEWLINK, .name = "RTM_NEWLINK", .cb = &rtnl_handle_newlink, .priv = PRIV_NET_IFCREATE, .flags = RTNL_F_NOEPOCH, }, { .cmd = NL_RTM_GETADDR, .name = "RTM_GETADDR", .cb = &rtnl_handle_getaddr, .flags = RTNL_F_ALLOW_NONVNET_JAIL, }, { .cmd = NL_RTM_NEWADDR, .name = "RTM_NEWADDR", .cb = &rtnl_handle_addr, .priv = PRIV_NET_ADDIFADDR, .flags = RTNL_F_NOEPOCH, }, { .cmd = NL_RTM_DELADDR, .name = "RTM_DELADDR", .cb = &rtnl_handle_addr, .priv = PRIV_NET_DELIFADDR, .flags = RTNL_F_NOEPOCH, }, }; static const struct nlhdr_parser *all_parsers[] = { &ifmsg_parser, &ifa_parser, &ifa_fbsd_parser, }; void rtnl_iface_add_cloner(struct nl_cloner *cloner) { sx_xlock(&rtnl_cloner_lock); SLIST_INSERT_HEAD(&nl_cloners, cloner, next); sx_xunlock(&rtnl_cloner_lock); } void rtnl_iface_del_cloner(struct nl_cloner *cloner) { sx_xlock(&rtnl_cloner_lock); SLIST_REMOVE(&nl_cloners, cloner, nl_cloner, next); sx_xunlock(&rtnl_cloner_lock); } void rtnl_ifaces_init(void) { ifattach_event = EVENTHANDLER_REGISTER( ifnet_arrival_event, rtnl_handle_ifattach, NULL, EVENTHANDLER_PRI_ANY); ifdetach_event = EVENTHANDLER_REGISTER( ifnet_departure_event, rtnl_handle_ifdetach, NULL, EVENTHANDLER_PRI_ANY); ifaddr_event = EVENTHANDLER_REGISTER( rt_addrmsg, rtnl_handle_ifaddr, NULL, EVENTHANDLER_PRI_ANY); iflink_event = EVENTHANDLER_REGISTER( ifnet_link_event, rtnl_handle_iflink, NULL, EVENTHANDLER_PRI_ANY); NL_VERIFY_PARSERS(all_parsers); rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers)); } void rtnl_ifaces_destroy(void) { EVENTHANDLER_DEREGISTER(ifnet_arrival_event, ifattach_event); EVENTHANDLER_DEREGISTER(ifnet_departure_event, ifdetach_event); EVENTHANDLER_DEREGISTER(rt_addrmsg, ifaddr_event); EVENTHANDLER_DEREGISTER(ifnet_link_event, iflink_event); } diff --git a/sys/netlink/route/iface_drivers.c b/sys/netlink/route/iface_drivers.c index 1a7aaffa7dd7..4bf913d9c978 100644 --- a/sys/netlink/route/iface_drivers.c +++ b/sys/netlink/route/iface_drivers.c @@ -1,146 +1,145 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#include "opt_netlink.h" #include #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* scope deembedding */ #define DEBUG_MOD_NAME nl_iface_drivers #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include _DECLARE_DEBUG(LOG_INFO); /* * Generic modification interface handler. * Responsible for changing network stack interface attributes * such as state, mtu or description. */ int _nl_modify_ifp_generic(struct ifnet *ifp, struct nl_parsed_link *lattrs, const struct nlattr_bmask *bm, struct nl_pstate *npt) { int error; if (lattrs->ifla_ifalias != NULL) { if (nlp_has_priv(npt->nlp, PRIV_NET_SETIFDESCR)) { int len = strlen(lattrs->ifla_ifalias) + 1; char *buf = if_allocdescr(len, M_WAITOK); memcpy(buf, lattrs->ifla_ifalias, len); if_setdescr(ifp, buf); if_setlastchange(ifp); } else { nlmsg_report_err_msg(npt, "Not enough privileges to set descr"); return (EPERM); } } if ((lattrs->ifi_change & IFF_UP) && (lattrs->ifi_flags & IFF_UP) == 0) { /* Request to down the interface */ if_down(ifp); } if (lattrs->ifla_mtu > 0) { if (nlp_has_priv(npt->nlp, PRIV_NET_SETIFMTU)) { struct ifreq ifr = { .ifr_mtu = lattrs->ifla_mtu }; error = ifhwioctl(SIOCSIFMTU, ifp, (char *)&ifr, curthread); } else { nlmsg_report_err_msg(npt, "Not enough privileges to set mtu"); return (EPERM); } } if (lattrs->ifi_change & IFF_PROMISC) { error = ifpromisc(ifp, lattrs->ifi_flags & IFF_PROMISC); if (error != 0) { nlmsg_report_err_msg(npt, "unable to set promisc"); return (error); } } return (0); } /* * Saves the resulting ifindex and ifname to report them * to userland along with the operation result. * NLA format: * NLMSGERR_ATTR_COOKIE(nested) * IFLA_NEW_IFINDEX(u32) * IFLA_IFNAME(string) */ void _nl_store_ifp_cookie(struct nl_pstate *npt, struct ifnet *ifp) { int ifname_len = strlen(if_name(ifp)); uint32_t ifindex = (uint32_t)if_getindex(ifp); int nla_len = sizeof(struct nlattr) * 3 + sizeof(ifindex) + NL_ITEM_ALIGN(ifname_len + 1); struct nlattr *nla_cookie = npt_alloc(npt, nla_len); /* Nested TLV */ nla_cookie->nla_len = nla_len; nla_cookie->nla_type = NLMSGERR_ATTR_COOKIE; struct nlattr *nla = nla_cookie + 1; nla->nla_len = sizeof(struct nlattr) + sizeof(ifindex); nla->nla_type = IFLA_NEW_IFINDEX; memcpy(NLA_DATA(nla), &ifindex, sizeof(ifindex)); nla = NLA_NEXT(nla); nla->nla_len = sizeof(struct nlattr) + ifname_len + 1; nla->nla_type = IFLA_IFNAME; strlcpy(NLA_DATA(nla), if_name(ifp), ifname_len + 1); nlmsg_report_cookie(npt, nla_cookie); } diff --git a/sys/netlink/route/neigh.c b/sys/netlink/route/neigh.c index 4168b5936ddc..9914e7febf57 100644 --- a/sys/netlink/route/neigh.c +++ b/sys/netlink/route/neigh.c @@ -1,604 +1,602 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#include "opt_netlink.h" - #include #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* nd6.h requires this */ #include /* nd6 state machine */ #include /* scope deembedding */ #define DEBUG_MOD_NAME nl_neigh #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include _DECLARE_DEBUG(LOG_INFO); static int lle_families[] = { AF_INET, AF_INET6 }; static eventhandler_tag lle_event_p; struct netlink_walkargs { struct nl_writer *nw; struct nlmsghdr hdr; struct nlpcb *so; if_t ifp; int family; int error; int count; int dumped; }; static int lle_state_to_nl_state(int family, struct llentry *lle) { int state = lle->ln_state; switch (family) { case AF_INET: if (lle->la_flags & (LLE_STATIC | LLE_IFADDR)) state = 1; switch (state) { case 0: /* ARP_LLINFO_INCOMPLETE */ return (NUD_INCOMPLETE); case 1: /* ARP_LLINFO_REACHABLE */ return (NUD_REACHABLE); case 2: /* ARP_LLINFO_VERIFY */ return (NUD_PROBE); } break; case AF_INET6: switch (state) { case ND6_LLINFO_INCOMPLETE: return (NUD_INCOMPLETE); case ND6_LLINFO_REACHABLE: return (NUD_REACHABLE); case ND6_LLINFO_STALE: return (NUD_STALE); case ND6_LLINFO_DELAY: return (NUD_DELAY); case ND6_LLINFO_PROBE: return (NUD_PROBE); } break; } return (NUD_NONE); } static uint32_t lle_flags_to_nl_flags(const struct llentry *lle) { uint32_t nl_flags = 0; if (lle->la_flags & LLE_IFADDR) nl_flags |= NTF_SELF; if (lle->la_flags & LLE_PUB) nl_flags |= NTF_PROXY; if (lle->la_flags & LLE_STATIC) nl_flags |= NTF_STICKY; if (lle->ln_router != 0) nl_flags |= NTF_ROUTER; return (nl_flags); } static uint32_t get_lle_next_ts(const struct llentry *lle) { if (lle->la_expire == 0) return (0); return (lle->la_expire + lle->lle_remtime / hz + time_second - time_uptime); } static int dump_lle_locked(struct llentry *lle, void *arg) { struct netlink_walkargs *wa = (struct netlink_walkargs *)arg; struct nlmsghdr *hdr = &wa->hdr; struct nl_writer *nw = wa->nw; struct ndmsg *ndm; #if defined(INET) || defined(INET6) union { struct in_addr in; struct in6_addr in6; } addr; #endif IF_DEBUG_LEVEL(LOG_DEBUG2) { char llebuf[NHOP_PRINT_BUFSIZE]; llentry_print_buf_lltable(lle, llebuf, sizeof(llebuf)); NL_LOG(LOG_DEBUG2, "dumping %s", llebuf); } if (!nlmsg_reply(nw, hdr, sizeof(struct ndmsg))) goto enomem; ndm = nlmsg_reserve_object(nw, struct ndmsg); ndm->ndm_family = wa->family; ndm->ndm_ifindex = if_getindex(wa->ifp); ndm->ndm_state = lle_state_to_nl_state(wa->family, lle); ndm->ndm_flags = lle_flags_to_nl_flags(lle); switch (wa->family) { #ifdef INET case AF_INET: addr.in = lle->r_l3addr.addr4; nlattr_add(nw, NDA_DST, 4, &addr); break; #endif #ifdef INET6 case AF_INET6: addr.in6 = lle->r_l3addr.addr6; in6_clearscope(&addr.in6); nlattr_add(nw, NDA_DST, 16, &addr); break; #endif } if (lle->r_flags & RLLE_VALID) { /* Has L2 */ int addrlen = if_getaddrlen(wa->ifp); nlattr_add(nw, NDA_LLADDR, addrlen, lle->ll_addr); } nlattr_add_u32(nw, NDA_PROBES, lle->la_asked); struct nda_cacheinfo *cache; cache = nlmsg_reserve_attr(nw, NDA_CACHEINFO, struct nda_cacheinfo); if (cache == NULL) goto enomem; /* TODO: provide confirmed/updated */ cache->ndm_refcnt = lle->lle_refcnt; int off = nlattr_add_nested(nw, NDA_FREEBSD); if (off != 0) { nlattr_add_u32(nw, NDAF_NEXT_STATE_TS, get_lle_next_ts(lle)); nlattr_set_len(nw, off); } if (nlmsg_end(nw)) return (0); enomem: NL_LOG(LOG_DEBUG, "unable to dump lle state (ENOMEM)"); nlmsg_abort(nw); return (ENOMEM); } static int dump_lle(struct lltable *llt, struct llentry *lle, void *arg) { int error; LLE_RLOCK(lle); error = dump_lle_locked(lle, arg); LLE_RUNLOCK(lle); return (error); } static bool dump_llt(struct lltable *llt, struct netlink_walkargs *wa) { lltable_foreach_lle(llt, dump_lle, wa); return (true); } static int dump_llts_iface(struct netlink_walkargs *wa, if_t ifp, int family) { int error = 0; wa->ifp = ifp; for (int i = 0; i < sizeof(lle_families) / sizeof(int); i++) { int fam = lle_families[i]; struct lltable *llt = lltable_get(ifp, fam); if (llt != NULL && (family == 0 || family == fam)) { wa->count++; wa->family = fam; if (!dump_llt(llt, wa)) { error = ENOMEM; break; } wa->dumped++; } } return (error); } static int dump_llts(struct netlink_walkargs *wa, if_t ifp, int family) { NL_LOG(LOG_DEBUG2, "Start dump ifp=%s family=%d", ifp ? if_name(ifp) : "NULL", family); wa->hdr.nlmsg_flags |= NLM_F_MULTI; if (ifp != NULL) { dump_llts_iface(wa, ifp, family); } else { struct if_iter it; for (ifp = if_iter_start(&it); ifp != NULL; ifp = if_iter_next(&it)) { dump_llts_iface(wa, ifp, family); } if_iter_finish(&it); } NL_LOG(LOG_DEBUG2, "End dump, iterated %d dumped %d", wa->count, wa->dumped); if (!nlmsg_end_dump(wa->nw, wa->error, &wa->hdr)) { NL_LOG(LOG_DEBUG, "Unable to add new message"); return (ENOMEM); } return (0); } static int get_lle(struct netlink_walkargs *wa, if_t ifp, int family, struct sockaddr *dst) { struct lltable *llt = lltable_get(ifp, family); if (llt == NULL) return (ESRCH); struct llentry *lle = lla_lookup(llt, LLE_UNLOCKED, dst); if (lle == NULL) return (ESRCH); wa->ifp = ifp; wa->family = family; return (dump_lle(llt, lle, wa)); } static void set_scope6(struct sockaddr *sa, if_t ifp) { #ifdef INET6 if (sa != NULL && sa->sa_family == AF_INET6 && ifp != NULL) { struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_LINKLOCAL(&sa6->sin6_addr)) in6_set_unicast_scopeid(&sa6->sin6_addr, if_getindex(ifp)); } #endif } struct nl_parsed_neigh { struct sockaddr *nda_dst; struct ifnet *nda_ifp; struct nlattr *nda_lladdr; uint32_t ndaf_next_ts; uint32_t ndm_flags; uint16_t ndm_state; uint8_t ndm_family; }; #define _IN(_field) offsetof(struct ndmsg, _field) #define _OUT(_field) offsetof(struct nl_parsed_neigh, _field) static const struct nlattr_parser nla_p_neigh_fbsd[] = { { .type = NDAF_NEXT_STATE_TS, .off = _OUT(ndaf_next_ts), .cb = nlattr_get_uint32 }, }; NL_DECLARE_ATTR_PARSER(neigh_fbsd_parser, nla_p_neigh_fbsd); static const struct nlfield_parser nlf_p_neigh[] = { { .off_in = _IN(ndm_family), .off_out = _OUT(ndm_family), .cb = nlf_get_u8 }, { .off_in = _IN(ndm_flags), .off_out = _OUT(ndm_flags), .cb = nlf_get_u8_u32 }, { .off_in = _IN(ndm_state), .off_out = _OUT(ndm_state), .cb = nlf_get_u16 }, { .off_in = _IN(ndm_ifindex), .off_out = _OUT(nda_ifp), .cb = nlf_get_ifpz }, }; static const struct nlattr_parser nla_p_neigh[] = { { .type = NDA_DST, .off = _OUT(nda_dst), .cb = nlattr_get_ip }, { .type = NDA_LLADDR, .off = _OUT(nda_lladdr), .cb = nlattr_get_nla }, { .type = NDA_IFINDEX, .off = _OUT(nda_ifp), .cb = nlattr_get_ifp }, { .type = NDA_FLAGS_EXT, .off = _OUT(ndm_flags), .cb = nlattr_get_uint32 }, { .type = NDA_FREEBSD, .arg = &neigh_fbsd_parser, .cb = nlattr_get_nested }, }; #undef _IN #undef _OUT static bool post_p_neigh(void *_attrs, struct nl_pstate *npt __unused) { struct nl_parsed_neigh *attrs = (struct nl_parsed_neigh *)_attrs; set_scope6(attrs->nda_dst, attrs->nda_ifp); return (true); } NL_DECLARE_PARSER_EXT(ndmsg_parser, struct ndmsg, NULL, nlf_p_neigh, nla_p_neigh, post_p_neigh); /* * type=RTM_NEWNEIGH, flags=NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE, seq=1661941473, pid=0}, * {ndm_family=AF_INET6, ndm_ifindex=if_nametoindex("enp0s31f6"), ndm_state=NUD_PERMANENT, ndm_flags=0, ndm_type=RTN_UNSPEC}, * [ * {{nla_len=20, nla_type=NDA_DST}, inet_pton(AF_INET6, "2a01:4f8:13a:70c::3")}, * {{nla_len=10, nla_type=NDA_LLADDR}, 20:4e:71:62:ae:f2}]}, iov_len=60} */ static int rtnl_handle_newneigh(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { int error; struct nl_parsed_neigh attrs = {}; error = nl_parse_nlmsg(hdr, &ndmsg_parser, npt, &attrs); if (error != 0) return (error); if (attrs.nda_ifp == NULL || attrs.nda_dst == NULL || attrs.nda_lladdr == NULL) { if (attrs.nda_ifp == NULL) NLMSG_REPORT_ERR_MSG(npt, "NDA_IFINDEX / ndm_ifindex not set"); if (attrs.nda_dst == NULL) NLMSG_REPORT_ERR_MSG(npt, "NDA_DST not set"); if (attrs.nda_lladdr == NULL) NLMSG_REPORT_ERR_MSG(npt, "NDA_LLADDR not set"); return (EINVAL); } if (attrs.nda_dst->sa_family != attrs.ndm_family) { NLMSG_REPORT_ERR_MSG(npt, "NDA_DST family (%d) is different from ndm_family (%d)", attrs.nda_dst->sa_family, attrs.ndm_family); return (EINVAL); } int addrlen = if_getaddrlen(attrs.nda_ifp); if (attrs.nda_lladdr->nla_len != sizeof(struct nlattr) + addrlen) { NLMSG_REPORT_ERR_MSG(npt, "NDA_LLADDR address length (%d) is different from expected (%d)", (int)attrs.nda_lladdr->nla_len - (int)sizeof(struct nlattr), addrlen); return (EINVAL); } const uint16_t supported_flags = NTF_PROXY | NTF_STICKY; if ((attrs.ndm_flags & supported_flags) != attrs.ndm_flags) { NLMSG_REPORT_ERR_MSG(npt, "ndm_flags %X not supported", attrs.ndm_flags &~ supported_flags); return (ENOTSUP); } /* Replacement requires new entry creation anyway */ if ((hdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_REPLACE)) == 0) return (ENOTSUP); struct lltable *llt = lltable_get(attrs.nda_ifp, attrs.ndm_family); if (llt == NULL) return (EAFNOSUPPORT); uint8_t linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize = sizeof(linkhdr); int lladdr_off = 0; if (lltable_calc_llheader(attrs.nda_ifp, attrs.ndm_family, (char *)(attrs.nda_lladdr + 1), linkhdr, &linkhdrsize, &lladdr_off) != 0) { NLMSG_REPORT_ERR_MSG(npt, "unable to calculate lle prepend data"); return (EINVAL); } int lle_flags = (attrs.ndm_flags & NTF_PROXY) ? LLE_PUB : 0; if (attrs.ndm_flags & NTF_STICKY) lle_flags |= LLE_STATIC; struct llentry *lle = lltable_alloc_entry(llt, lle_flags, attrs.nda_dst); if (lle == NULL) return (ENOMEM); lltable_set_entry_addr(attrs.nda_ifp, lle, linkhdr, linkhdrsize, lladdr_off); if (attrs.ndm_flags & NTF_STICKY) lle->la_expire = 0; else lle->la_expire = attrs.ndaf_next_ts - time_second + time_uptime; /* llentry created, try to insert or update */ IF_AFDATA_WLOCK(attrs.nda_ifp); LLE_WLOCK(lle); struct llentry *lle_tmp = lla_lookup(llt, LLE_EXCLUSIVE, attrs.nda_dst); if (lle_tmp != NULL) { error = EEXIST; if (hdr->nlmsg_flags & NLM_F_EXCL) { LLE_WUNLOCK(lle_tmp); lle_tmp = NULL; } else if (hdr->nlmsg_flags & NLM_F_REPLACE) { if ((lle_tmp->la_flags & LLE_IFADDR) == 0) { lltable_unlink_entry(llt, lle_tmp); lltable_link_entry(llt, lle); error = 0; } else error = EPERM; } } else { if (hdr->nlmsg_flags & NLM_F_CREATE) lltable_link_entry(llt, lle); else error = ENOENT; } IF_AFDATA_WUNLOCK(attrs.nda_ifp); if (error != 0) { if (lle != NULL) llentry_free(lle); return (error); } if (lle_tmp != NULL) llentry_free(lle_tmp); /* XXX: We're inside epoch */ EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_RESOLVED); LLE_WUNLOCK(lle); llt->llt_post_resolved(llt, lle); return (0); } static int rtnl_handle_delneigh(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { int error; struct nl_parsed_neigh attrs = {}; error = nl_parse_nlmsg(hdr, &ndmsg_parser, npt, &attrs); if (error != 0) return (error); if (attrs.nda_dst == NULL) { NLMSG_REPORT_ERR_MSG(npt, "NDA_DST not set"); return (EINVAL); } if (attrs.nda_ifp == NULL) { NLMSG_REPORT_ERR_MSG(npt, "no ifindex provided"); return (EINVAL); } struct lltable *llt = lltable_get(attrs.nda_ifp, attrs.ndm_family); if (llt == NULL) return (EAFNOSUPPORT); return (lltable_delete_addr(llt, 0, attrs.nda_dst)); } static int rtnl_handle_getneigh(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { int error; struct nl_parsed_neigh attrs = {}; error = nl_parse_nlmsg(hdr, &ndmsg_parser, npt, &attrs); if (error != 0) return (error); if (attrs.nda_dst != NULL && attrs.nda_ifp == NULL) { NLMSG_REPORT_ERR_MSG(npt, "has NDA_DST but no ifindex provided"); return (EINVAL); } struct netlink_walkargs wa = { .so = nlp, .nw = npt->nw, .hdr.nlmsg_pid = hdr->nlmsg_pid, .hdr.nlmsg_seq = hdr->nlmsg_seq, .hdr.nlmsg_flags = hdr->nlmsg_flags, .hdr.nlmsg_type = NL_RTM_NEWNEIGH, }; if (attrs.nda_dst == NULL) error = dump_llts(&wa, attrs.nda_ifp, attrs.ndm_family); else error = get_lle(&wa, attrs.nda_ifp, attrs.ndm_family, attrs.nda_dst); return (error); } static const struct rtnl_cmd_handler cmd_handlers[] = { { .cmd = NL_RTM_NEWNEIGH, .name = "RTM_NEWNEIGH", .cb = &rtnl_handle_newneigh, .priv = PRIV_NET_ROUTE, }, { .cmd = NL_RTM_DELNEIGH, .name = "RTM_DELNEIGH", .cb = &rtnl_handle_delneigh, .priv = PRIV_NET_ROUTE, }, { .cmd = NL_RTM_GETNEIGH, .name = "RTM_GETNEIGH", .cb = &rtnl_handle_getneigh, } }; static void rtnl_lle_event(void *arg __unused, struct llentry *lle, int evt) { if_t ifp; int family; LLE_WLOCK_ASSERT(lle); ifp = lltable_get_ifp(lle->lle_tbl); family = lltable_get_af(lle->lle_tbl); if (family != AF_INET && family != AF_INET6) return; int nlmsgs_type = evt == LLENTRY_RESOLVED ? NL_RTM_NEWNEIGH : NL_RTM_DELNEIGH; struct nl_writer nw = {}; if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEIGH)) { NL_LOG(LOG_DEBUG, "error allocating group writer"); return; } struct netlink_walkargs wa = { .hdr.nlmsg_type = nlmsgs_type, .nw = &nw, .ifp = ifp, .family = family, }; dump_lle_locked(lle, &wa); nlmsg_flush(&nw); } static const struct nlhdr_parser *all_parsers[] = { &ndmsg_parser, &neigh_fbsd_parser }; void rtnl_neighs_init(void) { NL_VERIFY_PARSERS(all_parsers); rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers)); lle_event_p = EVENTHANDLER_REGISTER(lle_event, rtnl_lle_event, NULL, EVENTHANDLER_PRI_ANY); } void rtnl_neighs_destroy(void) { EVENTHANDLER_DEREGISTER(lle_event, lle_event_p); } diff --git a/sys/netlink/route/nexthop.c b/sys/netlink/route/nexthop.c index 722dc5fdee4e..0d6bb5c9ec84 100644 --- a/sys/netlink/route/nexthop.c +++ b/sys/netlink/route/nexthop.c @@ -1,1124 +1,1122 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#include "opt_netlink.h" - #include #include "opt_inet.h" #include "opt_inet6.h" #include "opt_route.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define DEBUG_MOD_NAME nl_nhop #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include _DECLARE_DEBUG(LOG_INFO); /* * This file contains the logic to maintain kernel nexthops and * nexhop groups based om the data provided by the user. * * Kernel stores (nearly) all of the routing data in the nexthops, * including the prefix-specific flags (NHF_HOST and NHF_DEFAULT). * * Netlink API provides higher-level abstraction for the user. Each * user-created nexthop may map to multiple kernel nexthops. * * The following variations require separate kernel nexthop to be * created: * * prefix flags (NHF_HOST, NHF_DEFAULT) * * using IPv6 gateway for IPv4 routes * * different fibnum * * These kernel nexthops have the lifetime bound to the lifetime of * the user_nhop object. They are not collected until user requests * to delete the created user_nhop. * */ struct user_nhop { uint32_t un_idx; /* Userland-provided index */ uint32_t un_fibfam; /* fibnum+af(as highest byte) */ uint8_t un_protocol; /* protocol that install the record */ struct nhop_object *un_nhop; /* "production" nexthop */ struct nhop_object *un_nhop_src; /* nexthop to copy from */ struct weightened_nhop *un_nhgrp_src; /* nexthops for nhg */ uint32_t un_nhgrp_count; /* number of nexthops */ struct user_nhop *un_next; /* next item in hash chain */ struct user_nhop *un_nextchild; /* master -> children */ struct epoch_context un_epoch_ctx; /* epoch ctl helper */ }; /* produce hash value for an object */ #define unhop_hash_obj(_obj) (hash_unhop(_obj)) /* compare two objects */ #define unhop_cmp(_one, _two) (cmp_unhop(_one, _two)) /* next object accessor */ #define unhop_next(_obj) (_obj)->un_next CHT_SLIST_DEFINE(unhop, struct user_nhop); struct unhop_ctl { struct unhop_head un_head; struct rmlock un_lock; }; #define UN_LOCK_INIT(_ctl) rm_init(&(_ctl)->un_lock, "unhop_ctl") #define UN_TRACKER struct rm_priotracker un_tracker #define UN_RLOCK(_ctl) rm_rlock(&((_ctl)->un_lock), &un_tracker) #define UN_RUNLOCK(_ctl) rm_runlock(&((_ctl)->un_lock), &un_tracker) #define UN_WLOCK(_ctl) rm_wlock(&(_ctl)->un_lock); #define UN_WUNLOCK(_ctl) rm_wunlock(&(_ctl)->un_lock); VNET_DEFINE_STATIC(struct unhop_ctl *, un_ctl) = NULL; #define V_un_ctl VNET(un_ctl) static void consider_resize(struct unhop_ctl *ctl, uint32_t new_size); static int cmp_unhop(const struct user_nhop *a, const struct user_nhop *b); static unsigned int hash_unhop(const struct user_nhop *obj); static void destroy_unhop(struct user_nhop *unhop); static struct nhop_object *clone_unhop(const struct user_nhop *unhop, uint32_t fibnum, int family, int nh_flags); static int cmp_unhop(const struct user_nhop *a, const struct user_nhop *b) { return (a->un_idx == b->un_idx && a->un_fibfam == b->un_fibfam); } /* * Hash callback: calculate hash of an object */ static unsigned int hash_unhop(const struct user_nhop *obj) { return (obj->un_idx ^ obj->un_fibfam); } #define UNHOP_IS_MASTER(_unhop) ((_unhop)->un_fibfam == 0) /* * Factory interface for creating matching kernel nexthops/nexthop groups * * @uidx: userland nexhop index used to create the nexthop * @fibnum: fibnum nexthop will be used in * @family: upper family nexthop will be used in * @nh_flags: desired nexthop prefix flags * @perror: pointer to store error to * * Returns referenced nexthop linked to @fibnum/@family rib on success. */ struct nhop_object * nl_find_nhop(uint32_t fibnum, int family, uint32_t uidx, int nh_flags, int *perror) { struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); UN_TRACKER; if (__predict_false(ctl == NULL)) return (NULL); struct user_nhop key= { .un_idx = uidx, .un_fibfam = fibnum | ((uint32_t)family) << 24, }; struct user_nhop *unhop; nh_flags = nh_flags & (NHF_HOST | NHF_DEFAULT); if (__predict_false(family == 0)) return (NULL); UN_RLOCK(ctl); CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); if (unhop != NULL) { struct nhop_object *nh = unhop->un_nhop; UN_RLOCK(ctl); *perror = 0; nhop_ref_any(nh); return (nh); } /* * Exact nexthop not found. Search for template nexthop to clone from. */ key.un_fibfam = 0; CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); if (unhop == NULL) { UN_RUNLOCK(ctl); *perror = ESRCH; return (NULL); } UN_RUNLOCK(ctl); /* Create entry to insert first */ struct user_nhop *un_new, *un_tmp; un_new = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO); if (un_new == NULL) { *perror = ENOMEM; return (NULL); } un_new->un_idx = uidx; un_new->un_fibfam = fibnum | ((uint32_t)family) << 24; /* Relying on epoch to protect unhop here */ un_new->un_nhop = clone_unhop(unhop, fibnum, family, nh_flags); if (un_new->un_nhop == NULL) { free(un_new, M_NETLINK); *perror = ENOMEM; return (NULL); } /* Insert back and report */ UN_WLOCK(ctl); /* First, find template record once again */ CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); if (unhop == NULL) { /* Someone deleted the nexthop during the call */ UN_WUNLOCK(ctl); *perror = ESRCH; destroy_unhop(un_new); return (NULL); } /* Second, check the direct match */ CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, un_new, un_tmp); struct nhop_object *nh; if (un_tmp != NULL) { /* Another thread already created the desired nextop, use it */ nh = un_tmp->un_nhop; } else { /* Finally, insert the new nexthop and link it to the primary */ nh = un_new->un_nhop; CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, un_new); un_new->un_nextchild = unhop->un_nextchild; unhop->un_nextchild = un_new; un_new = NULL; NL_LOG(LOG_DEBUG2, "linked cloned nexthop %p", nh); } UN_WUNLOCK(ctl); if (un_new != NULL) destroy_unhop(un_new); *perror = 0; nhop_ref_any(nh); return (nh); } static struct user_nhop * nl_find_base_unhop(struct unhop_ctl *ctl, uint32_t uidx) { struct user_nhop key= { .un_idx = uidx }; struct user_nhop *unhop = NULL; UN_TRACKER; UN_RLOCK(ctl); CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); UN_RUNLOCK(ctl); return (unhop); } #define MAX_STACK_NHOPS 4 static struct nhop_object * clone_unhop(const struct user_nhop *unhop, uint32_t fibnum, int family, int nh_flags) { #ifdef ROUTE_MPATH const struct weightened_nhop *wn; struct weightened_nhop *wn_new, wn_base[MAX_STACK_NHOPS]; uint32_t num_nhops; #endif struct nhop_object *nh = NULL; int error; if (unhop->un_nhop_src != NULL) { IF_DEBUG_LEVEL(LOG_DEBUG2) { char nhbuf[NHOP_PRINT_BUFSIZE]; nhop_print_buf_any(unhop->un_nhop_src, nhbuf, sizeof(nhbuf)); FIB_NH_LOG(LOG_DEBUG2, unhop->un_nhop_src, "cloning nhop %s -> %u.%u flags 0x%X", nhbuf, fibnum, family, nh_flags); } struct nhop_object *nh; nh = nhop_alloc(fibnum, AF_UNSPEC); if (nh == NULL) return (NULL); nhop_copy(nh, unhop->un_nhop_src); /* Check that nexthop gateway is compatible with the new family */ if (!nhop_set_upper_family(nh, family)) { nhop_free(nh); return (NULL); } nhop_set_uidx(nh, unhop->un_idx); nhop_set_pxtype_flag(nh, nh_flags); return (nhop_get_nhop(nh, &error)); } #ifdef ROUTE_MPATH wn = unhop->un_nhgrp_src; num_nhops = unhop->un_nhgrp_count; if (num_nhops > MAX_STACK_NHOPS) { wn_new = malloc(num_nhops * sizeof(struct weightened_nhop), M_TEMP, M_NOWAIT); if (wn_new == NULL) return (NULL); } else wn_new = wn_base; for (int i = 0; i < num_nhops; i++) { uint32_t uidx = nhop_get_uidx(wn[i].nh); MPASS(uidx != 0); wn_new[i].nh = nl_find_nhop(fibnum, family, uidx, nh_flags, &error); if (error != 0) break; wn_new[i].weight = wn[i].weight; } if (error == 0) { struct rib_head *rh = nhop_get_rh(wn_new[0].nh); struct nhgrp_object *nhg; error = nhgrp_get_group(rh, wn_new, num_nhops, unhop->un_idx, &nhg); nh = (struct nhop_object *)nhg; } if (wn_new != wn_base) free(wn_new, M_TEMP); #endif return (nh); } static void destroy_unhop(struct user_nhop *unhop) { if (unhop->un_nhop != NULL) nhop_free_any(unhop->un_nhop); if (unhop->un_nhop_src != NULL) nhop_free_any(unhop->un_nhop_src); free(unhop, M_NETLINK); } static void destroy_unhop_epoch(epoch_context_t ctx) { struct user_nhop *unhop; unhop = __containerof(ctx, struct user_nhop, un_epoch_ctx); destroy_unhop(unhop); } static uint32_t find_spare_uidx(struct unhop_ctl *ctl) { struct user_nhop *unhop, key = {}; uint32_t uidx = 0; UN_TRACKER; UN_RLOCK(ctl); /* This should return spare uid with 75% of 65k used in ~99/100 cases */ for (int i = 0; i < 16; i++) { key.un_idx = (arc4random() % 65536) + 65536 * 4; CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); if (unhop == NULL) { uidx = key.un_idx; break; } } UN_RUNLOCK(ctl); return (uidx); } /* * Actual netlink code */ struct netlink_walkargs { struct nl_writer *nw; struct nlmsghdr hdr; struct nlpcb *so; int family; int error; int count; int dumped; }; #define ENOMEM_IF_NULL(_v) if ((_v) == NULL) goto enomem static bool dump_nhgrp(const struct user_nhop *unhop, struct nlmsghdr *hdr, struct nl_writer *nw) { if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg))) goto enomem; struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg); nhm->nh_family = AF_UNSPEC; nhm->nh_scope = 0; nhm->nh_protocol = unhop->un_protocol; nhm->nh_flags = 0; nlattr_add_u32(nw, NHA_ID, unhop->un_idx); nlattr_add_u16(nw, NHA_GROUP_TYPE, NEXTHOP_GRP_TYPE_MPATH); struct weightened_nhop *wn = unhop->un_nhgrp_src; uint32_t num_nhops = unhop->un_nhgrp_count; /* TODO: a better API? */ int nla_len = sizeof(struct nlattr); nla_len += NETLINK_ALIGN(num_nhops * sizeof(struct nexthop_grp)); struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr); if (nla == NULL) goto enomem; nla->nla_type = NHA_GROUP; nla->nla_len = nla_len; for (int i = 0; i < num_nhops; i++) { struct nexthop_grp *grp = &((struct nexthop_grp *)(nla + 1))[i]; grp->id = nhop_get_uidx(wn[i].nh); grp->weight = wn[i].weight; grp->resvd1 = 0; grp->resvd2 = 0; } if (nlmsg_end(nw)) return (true); enomem: NL_LOG(LOG_DEBUG, "error: unable to allocate attribute memory"); nlmsg_abort(nw); return (false); } static bool dump_nhop(const struct nhop_object *nh, uint32_t uidx, struct nlmsghdr *hdr, struct nl_writer *nw) { if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg))) goto enomem; struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg); ENOMEM_IF_NULL(nhm); nhm->nh_family = nhop_get_neigh_family(nh); nhm->nh_scope = 0; // XXX: what's that? nhm->nh_protocol = nhop_get_origin(nh); nhm->nh_flags = 0; if (uidx != 0) nlattr_add_u32(nw, NHA_ID, uidx); if (nh->nh_flags & NHF_BLACKHOLE) { nlattr_add_flag(nw, NHA_BLACKHOLE); goto done; } nlattr_add_u32(nw, NHA_OIF, if_getindex(nh->nh_ifp)); switch (nh->gw_sa.sa_family) { #ifdef INET case AF_INET: nlattr_add(nw, NHA_GATEWAY, 4, &nh->gw4_sa.sin_addr); break; #endif #ifdef INET6 case AF_INET6: { struct in6_addr addr = nh->gw6_sa.sin6_addr; in6_clearscope(&addr); nlattr_add(nw, NHA_GATEWAY, 16, &addr); break; } #endif } int off = nlattr_add_nested(nw, NHA_FREEBSD); if (off != 0) { nlattr_add_u32(nw, NHAF_AIF, if_getindex(nh->nh_aifp)); if (uidx == 0) { nlattr_add_u32(nw, NHAF_KID, nhop_get_idx(nh)); nlattr_add_u32(nw, NHAF_FAMILY, nhop_get_upper_family(nh)); nlattr_add_u32(nw, NHAF_TABLE, nhop_get_fibnum(nh)); } nlattr_set_len(nw, off); } done: if (nlmsg_end(nw)) return (true); enomem: nlmsg_abort(nw); return (false); } static void dump_unhop(const struct user_nhop *unhop, struct nlmsghdr *hdr, struct nl_writer *nw) { if (unhop->un_nhop_src != NULL) dump_nhop(unhop->un_nhop_src, unhop->un_idx, hdr, nw); else dump_nhgrp(unhop, hdr, nw); } static int delete_unhop(struct unhop_ctl *ctl, struct nlmsghdr *hdr, uint32_t uidx) { struct user_nhop *unhop_ret, *unhop_base, *unhop_chain; struct user_nhop key = { .un_idx = uidx }; UN_WLOCK(ctl); CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop_base); if (unhop_base != NULL) { CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_base, unhop_ret); IF_DEBUG_LEVEL(LOG_DEBUG2) { char nhbuf[NHOP_PRINT_BUFSIZE]; nhop_print_buf_any(unhop_base->un_nhop, nhbuf, sizeof(nhbuf)); FIB_NH_LOG(LOG_DEBUG3, unhop_base->un_nhop, "removed base nhop %u: %s", uidx, nhbuf); } /* Unlink all child nexhops as well, keeping the chain intact */ unhop_chain = unhop_base->un_nextchild; while (unhop_chain != NULL) { CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_chain, unhop_ret); MPASS(unhop_chain == unhop_ret); IF_DEBUG_LEVEL(LOG_DEBUG3) { char nhbuf[NHOP_PRINT_BUFSIZE]; nhop_print_buf_any(unhop_chain->un_nhop, nhbuf, sizeof(nhbuf)); FIB_NH_LOG(LOG_DEBUG3, unhop_chain->un_nhop, "removed child nhop %u: %s", uidx, nhbuf); } unhop_chain = unhop_chain->un_nextchild; } } UN_WUNLOCK(ctl); if (unhop_base == NULL) { NL_LOG(LOG_DEBUG, "unable to find unhop %u", uidx); return (ENOENT); } /* Report nexthop deletion */ struct netlink_walkargs wa = { .hdr.nlmsg_pid = hdr->nlmsg_pid, .hdr.nlmsg_seq = hdr->nlmsg_seq, .hdr.nlmsg_flags = hdr->nlmsg_flags, .hdr.nlmsg_type = NL_RTM_DELNEXTHOP, }; struct nl_writer nw = {}; if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) { NL_LOG(LOG_DEBUG, "error allocating message writer"); return (ENOMEM); } dump_unhop(unhop_base, &wa.hdr, &nw); nlmsg_flush(&nw); while (unhop_base != NULL) { unhop_chain = unhop_base->un_nextchild; NET_EPOCH_CALL(destroy_unhop_epoch, &unhop_base->un_epoch_ctx); unhop_base = unhop_chain; } return (0); } static void consider_resize(struct unhop_ctl *ctl, uint32_t new_size) { void *new_ptr = NULL; size_t alloc_size; if (new_size == 0) return; if (new_size != 0) { alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_size); new_ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO); if (new_ptr == NULL) return; } NL_LOG(LOG_DEBUG, "resizing hash: %u -> %u", ctl->un_head.hash_size, new_size); UN_WLOCK(ctl); if (new_ptr != NULL) { CHT_SLIST_RESIZE(&ctl->un_head, unhop, new_ptr, new_size); } UN_WUNLOCK(ctl); if (new_ptr != NULL) free(new_ptr, M_NETLINK); } static bool __noinline vnet_init_unhops(void) { uint32_t num_buckets = 16; size_t alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets); struct unhop_ctl *ctl = malloc(sizeof(struct unhop_ctl), M_NETLINK, M_NOWAIT | M_ZERO); if (ctl == NULL) return (false); void *ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO); if (ptr == NULL) { free(ctl, M_NETLINK); return (false); } CHT_SLIST_INIT(&ctl->un_head, ptr, num_buckets); UN_LOCK_INIT(ctl); if (!atomic_cmpset_ptr((uintptr_t *)&V_un_ctl, (uintptr_t)NULL, (uintptr_t)ctl)) { free(ptr, M_NETLINK); free(ctl, M_NETLINK); } if (atomic_load_ptr(&V_un_ctl) == NULL) return (false); NL_LOG(LOG_NOTICE, "UNHOPS init done"); return (true); } static void vnet_destroy_unhops(const void *unused __unused) { struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); struct user_nhop *unhop, *tmp; if (ctl == NULL) return; V_un_ctl = NULL; /* Wait till all unhop users finish their reads */ NET_EPOCH_WAIT(); UN_WLOCK(ctl); CHT_SLIST_FOREACH_SAFE(&ctl->un_head, unhop, unhop, tmp) { destroy_unhop(unhop); } CHT_SLIST_FOREACH_SAFE_END; UN_WUNLOCK(ctl); free(ctl->un_head.ptr, M_NETLINK); free(ctl, M_NETLINK); } VNET_SYSUNINIT(vnet_destroy_unhops, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_destroy_unhops, NULL); static int nlattr_get_nhg(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) { int error = 0; /* Verify attribute correctness */ struct nexthop_grp *grp = NLA_DATA(nla); int data_len = NLA_DATA_LEN(nla); int count = data_len / sizeof(*grp); if (count == 0 || (count * sizeof(*grp) != data_len)) { NL_LOG(LOG_DEBUG, "Invalid length for RTA_GROUP: %d", data_len); return (EINVAL); } *((struct nlattr **)target) = nla; return (error); } static void set_scope6(struct sockaddr *sa, if_t ifp) { #ifdef INET6 if (sa != NULL && sa->sa_family == AF_INET6 && ifp != NULL) { struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_LINKLOCAL(&sa6->sin6_addr)) in6_set_unicast_scopeid(&sa6->sin6_addr, if_getindex(ifp)); } #endif } struct nl_parsed_nhop { uint32_t nha_id; uint8_t nha_blackhole; uint8_t nha_groups; uint8_t nhaf_knhops; uint8_t nhaf_family; struct ifnet *nha_oif; struct sockaddr *nha_gw; struct nlattr *nha_group; uint8_t nh_family; uint8_t nh_protocol; uint32_t nhaf_table; uint32_t nhaf_kid; uint32_t nhaf_aif; }; #define _IN(_field) offsetof(struct nhmsg, _field) #define _OUT(_field) offsetof(struct nl_parsed_nhop, _field) static struct nlattr_parser nla_p_nh_fbsd[] = { { .type = NHAF_KNHOPS, .off = _OUT(nhaf_knhops), .cb = nlattr_get_flag }, { .type = NHAF_TABLE, .off = _OUT(nhaf_table), .cb = nlattr_get_uint32 }, { .type = NHAF_FAMILY, .off = _OUT(nhaf_family), .cb = nlattr_get_uint8 }, { .type = NHAF_KID, .off = _OUT(nhaf_kid), .cb = nlattr_get_uint32 }, { .type = NHAF_AIF, .off = _OUT(nhaf_aif), .cb = nlattr_get_uint32 }, }; NL_DECLARE_ATTR_PARSER(nh_fbsd_parser, nla_p_nh_fbsd); static const struct nlfield_parser nlf_p_nh[] = { { .off_in = _IN(nh_family), .off_out = _OUT(nh_family), .cb = nlf_get_u8 }, { .off_in = _IN(nh_protocol), .off_out = _OUT(nh_protocol), .cb = nlf_get_u8 }, }; static const struct nlattr_parser nla_p_nh[] = { { .type = NHA_ID, .off = _OUT(nha_id), .cb = nlattr_get_uint32 }, { .type = NHA_GROUP, .off = _OUT(nha_group), .cb = nlattr_get_nhg }, { .type = NHA_BLACKHOLE, .off = _OUT(nha_blackhole), .cb = nlattr_get_flag }, { .type = NHA_OIF, .off = _OUT(nha_oif), .cb = nlattr_get_ifp }, { .type = NHA_GATEWAY, .off = _OUT(nha_gw), .cb = nlattr_get_ip }, { .type = NHA_GROUPS, .off = _OUT(nha_groups), .cb = nlattr_get_flag }, { .type = NHA_FREEBSD, .arg = &nh_fbsd_parser, .cb = nlattr_get_nested }, }; #undef _IN #undef _OUT static bool post_p_nh(void *_attrs, struct nl_pstate *npt) { struct nl_parsed_nhop *attrs = (struct nl_parsed_nhop *)_attrs; set_scope6(attrs->nha_gw, attrs->nha_oif); return (true); } NL_DECLARE_PARSER_EXT(nhmsg_parser, struct nhmsg, NULL, nlf_p_nh, nla_p_nh, post_p_nh); static bool eligible_nhg(const struct nhop_object *nh) { return (nh->nh_flags & NHF_GATEWAY); } static int newnhg(struct unhop_ctl *ctl, struct nl_parsed_nhop *attrs, struct user_nhop *unhop) { struct nexthop_grp *grp = NLA_DATA(attrs->nha_group); int count = NLA_DATA_LEN(attrs->nha_group) / sizeof(*grp); struct weightened_nhop *wn; wn = malloc(sizeof(*wn) * count, M_NETLINK, M_NOWAIT | M_ZERO); if (wn == NULL) return (ENOMEM); for (int i = 0; i < count; i++) { struct user_nhop *unhop; unhop = nl_find_base_unhop(ctl, grp[i].id); if (unhop == NULL) { NL_LOG(LOG_DEBUG, "unable to find uidx %u", grp[i].id); free(wn, M_NETLINK); return (ESRCH); } else if (unhop->un_nhop_src == NULL) { NL_LOG(LOG_DEBUG, "uidx %u is a group, nested group unsupported", grp[i].id); free(wn, M_NETLINK); return (ENOTSUP); } else if (!eligible_nhg(unhop->un_nhop_src)) { NL_LOG(LOG_DEBUG, "uidx %u nhop is not mpath-eligible", grp[i].id); free(wn, M_NETLINK); return (ENOTSUP); } /* * TODO: consider more rigid eligibility checks: * restrict nexthops with the same gateway */ wn[i].nh = unhop->un_nhop_src; wn[i].weight = grp[i].weight; } unhop->un_nhgrp_src = wn; unhop->un_nhgrp_count = count; return (0); } /* * Sets nexthop @nh gateway specified by @gw. * If gateway is IPv6 link-local, alters @gw to include scopeid equal to * @ifp ifindex. * Returns 0 on success or errno. */ int nl_set_nexthop_gw(struct nhop_object *nh, struct sockaddr *gw, if_t ifp, struct nl_pstate *npt) { #ifdef INET6 if (gw->sa_family == AF_INET6) { struct sockaddr_in6 *gw6 = (struct sockaddr_in6 *)gw; if (IN6_IS_ADDR_LINKLOCAL(&gw6->sin6_addr)) { if (ifp == NULL) { NLMSG_REPORT_ERR_MSG(npt, "interface not set"); return (EINVAL); } in6_set_unicast_scopeid(&gw6->sin6_addr, if_getindex(ifp)); } } #endif nhop_set_gw(nh, gw, true); return (0); } static int newnhop(struct nl_parsed_nhop *attrs, struct user_nhop *unhop, struct nl_pstate *npt) { struct ifaddr *ifa = NULL; struct nhop_object *nh; int error; if (!attrs->nha_blackhole) { if (attrs->nha_gw == NULL) { NLMSG_REPORT_ERR_MSG(npt, "missing NHA_GATEWAY"); return (EINVAL); } if (attrs->nha_oif == NULL) { NLMSG_REPORT_ERR_MSG(npt, "missing NHA_OIF"); return (EINVAL); } if (ifa == NULL) ifa = ifaof_ifpforaddr(attrs->nha_gw, attrs->nha_oif); if (ifa == NULL) { NLMSG_REPORT_ERR_MSG(npt, "Unable to determine default source IP"); return (EINVAL); } } int family = attrs->nha_gw != NULL ? attrs->nha_gw->sa_family : attrs->nh_family; nh = nhop_alloc(RT_DEFAULT_FIB, family); if (nh == NULL) { NL_LOG(LOG_DEBUG, "Unable to allocate nexthop"); return (ENOMEM); } nhop_set_uidx(nh, attrs->nha_id); nhop_set_origin(nh, attrs->nh_protocol); if (attrs->nha_blackhole) nhop_set_blackhole(nh, NHF_BLACKHOLE); else { error = nl_set_nexthop_gw(nh, attrs->nha_gw, attrs->nha_oif, npt); if (error != 0) { nhop_free(nh); return (error); } nhop_set_transmit_ifp(nh, attrs->nha_oif); nhop_set_src(nh, ifa); } error = nhop_get_unlinked(nh); if (error != 0) { NL_LOG(LOG_DEBUG, "unable to finalize nexthop"); return (error); } IF_DEBUG_LEVEL(LOG_DEBUG2) { char nhbuf[NHOP_PRINT_BUFSIZE]; nhop_print_buf(nh, nhbuf, sizeof(nhbuf)); NL_LOG(LOG_DEBUG2, "Adding unhop %u: %s", attrs->nha_id, nhbuf); } unhop->un_nhop_src = nh; return (0); } static int rtnl_handle_newnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct user_nhop *unhop; int error; if ((__predict_false(V_un_ctl == NULL)) && (!vnet_init_unhops())) return (ENOMEM); struct unhop_ctl *ctl = V_un_ctl; struct nl_parsed_nhop attrs = {}; error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs); if (error != 0) return (error); /* * Get valid nha_id. Treat nha_id == 0 (auto-assignment) as a second-class * citizen. */ if (attrs.nha_id == 0) { attrs.nha_id = find_spare_uidx(ctl); if (attrs.nha_id == 0) { NL_LOG(LOG_DEBUG, "Unable to get spare uidx"); return (ENOSPC); } } NL_LOG(LOG_DEBUG, "IFINDEX %d", attrs.nha_oif ? if_getindex(attrs.nha_oif) : 0); unhop = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO); if (unhop == NULL) { NL_LOG(LOG_DEBUG, "Unable to allocate user_nhop"); return (ENOMEM); } unhop->un_idx = attrs.nha_id; unhop->un_protocol = attrs.nh_protocol; if (attrs.nha_group) error = newnhg(ctl, &attrs, unhop); else error = newnhop(&attrs, unhop, npt); if (error != 0) { free(unhop, M_NETLINK); return (error); } UN_WLOCK(ctl); /* Check if uidx already exists */ struct user_nhop *tmp = NULL; CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, unhop, tmp); if (tmp != NULL) { UN_WUNLOCK(ctl); NL_LOG(LOG_DEBUG, "nhop idx %u already exists", attrs.nha_id); destroy_unhop(unhop); return (EEXIST); } CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, unhop); uint32_t num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->un_head); UN_WUNLOCK(ctl); /* Report addition of the next nexhop */ struct netlink_walkargs wa = { .hdr.nlmsg_pid = hdr->nlmsg_pid, .hdr.nlmsg_seq = hdr->nlmsg_seq, .hdr.nlmsg_flags = hdr->nlmsg_flags, .hdr.nlmsg_type = NL_RTM_NEWNEXTHOP, }; struct nl_writer nw = {}; if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) { NL_LOG(LOG_DEBUG, "error allocating message writer"); return (ENOMEM); } dump_unhop(unhop, &wa.hdr, &nw); nlmsg_flush(&nw); consider_resize(ctl, num_buckets_new); return (0); } static int rtnl_handle_delnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); int error; if (__predict_false(ctl == NULL)) return (ESRCH); struct nl_parsed_nhop attrs = {}; error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs); if (error != 0) return (error); if (attrs.nha_id == 0) { NL_LOG(LOG_DEBUG, "NHA_ID not set"); return (EINVAL); } error = delete_unhop(ctl, hdr, attrs.nha_id); return (error); } static bool match_unhop(const struct nl_parsed_nhop *attrs, struct user_nhop *unhop) { if (attrs->nha_id != 0 && unhop->un_idx != attrs->nha_id) return (false); if (attrs->nha_groups != 0 && unhop->un_nhgrp_src == NULL) return (false); if (attrs->nha_oif != NULL && (unhop->un_nhop_src == NULL || unhop->un_nhop_src->nh_ifp != attrs->nha_oif)) return (false); return (true); } static int rtnl_handle_getnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct user_nhop *unhop; UN_TRACKER; int error; struct nl_parsed_nhop attrs = {}; error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs); if (error != 0) return (error); struct netlink_walkargs wa = { .nw = npt->nw, .hdr.nlmsg_pid = hdr->nlmsg_pid, .hdr.nlmsg_seq = hdr->nlmsg_seq, .hdr.nlmsg_flags = hdr->nlmsg_flags, .hdr.nlmsg_type = NL_RTM_NEWNEXTHOP, }; if (attrs.nha_id != 0) { struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); struct user_nhop key = { .un_idx = attrs.nha_id }; if (__predict_false(ctl == NULL)) return (ESRCH); NL_LOG(LOG_DEBUG2, "searching for uidx %u", attrs.nha_id); UN_RLOCK(ctl); CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); UN_RUNLOCK(ctl); if (unhop == NULL) return (ESRCH); dump_unhop(unhop, &wa.hdr, wa.nw); return (0); } else if (attrs.nhaf_kid != 0) { struct nhop_iter iter = { .fibnum = attrs.nhaf_table, .family = attrs.nhaf_family, }; int error = ESRCH; NL_LOG(LOG_DEBUG2, "START table %u family %d", attrs.nhaf_table, attrs.nhaf_family); for (struct nhop_object *nh = nhops_iter_start(&iter); nh; nh = nhops_iter_next(&iter)) { NL_LOG(LOG_DEBUG3, "get %u", nhop_get_idx(nh)); if (nhop_get_idx(nh) == attrs.nhaf_kid) { dump_nhop(nh, 0, &wa.hdr, wa.nw); error = 0; break; } } nhops_iter_stop(&iter); return (error); } else if (attrs.nhaf_knhops) { struct nhop_iter iter = { .fibnum = attrs.nhaf_table, .family = attrs.nhaf_family, }; NL_LOG(LOG_DEBUG2, "DUMP table %u family %d", attrs.nhaf_table, attrs.nhaf_family); wa.hdr.nlmsg_flags |= NLM_F_MULTI; for (struct nhop_object *nh = nhops_iter_start(&iter); nh; nh = nhops_iter_next(&iter)) { dump_nhop(nh, 0, &wa.hdr, wa.nw); } nhops_iter_stop(&iter); } else { struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); if (__predict_false(ctl == NULL)) return (ESRCH); NL_LOG(LOG_DEBUG2, "DUMP unhops"); UN_RLOCK(ctl); wa.hdr.nlmsg_flags |= NLM_F_MULTI; CHT_SLIST_FOREACH(&ctl->un_head, unhop, unhop) { if (UNHOP_IS_MASTER(unhop) && match_unhop(&attrs, unhop)) dump_unhop(unhop, &wa.hdr, wa.nw); } CHT_SLIST_FOREACH_END; UN_RUNLOCK(ctl); } if (wa.error == 0) { if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr)) return (ENOMEM); } return (0); } static const struct rtnl_cmd_handler cmd_handlers[] = { { .cmd = NL_RTM_NEWNEXTHOP, .name = "RTM_NEWNEXTHOP", .cb = &rtnl_handle_newnhop, .priv = PRIV_NET_ROUTE, }, { .cmd = NL_RTM_DELNEXTHOP, .name = "RTM_DELNEXTHOP", .cb = &rtnl_handle_delnhop, .priv = PRIV_NET_ROUTE, }, { .cmd = NL_RTM_GETNEXTHOP, .name = "RTM_GETNEXTHOP", .cb = &rtnl_handle_getnhop, } }; static const struct nlhdr_parser *all_parsers[] = { &nhmsg_parser, &nh_fbsd_parser }; void rtnl_nexthops_init(void) { NL_VERIFY_PARSERS(all_parsers); rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers)); } diff --git a/sys/netlink/route/rt.c b/sys/netlink/route/rt.c index 2a6a0bc54e85..cfaa2167b0d2 100644 --- a/sys/netlink/route/rt.c +++ b/sys/netlink/route/rt.c @@ -1,1116 +1,1114 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2021 Ng Peng Nam Sean * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#include "opt_netlink.h" - #include #include "opt_inet.h" #include "opt_inet6.h" #include "opt_route.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define DEBUG_MOD_NAME nl_route #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include _DECLARE_DEBUG(LOG_INFO); static unsigned char get_rtm_type(const struct nhop_object *nh) { int nh_flags = nh->nh_flags; /* Use the fact that nhg runtime flags are only NHF_MULTIPATH */ if (nh_flags & NHF_BLACKHOLE) return (RTN_BLACKHOLE); else if (nh_flags & NHF_REJECT) return (RTN_PROHIBIT); return (RTN_UNICAST); } static uint8_t nl_get_rtm_protocol(const struct nhop_object *nh) { #ifdef ROUTE_MPATH if (NH_IS_NHGRP(nh)) { const struct nhgrp_object *nhg = (const struct nhgrp_object *)nh; uint8_t origin = nhgrp_get_origin(nhg); if (origin != RTPROT_UNSPEC) return (origin); nh = nhg->nhops[0]; } #endif uint8_t origin = nhop_get_origin(nh); if (origin != RTPROT_UNSPEC) return (origin); /* TODO: remove guesswork once all kernel users fill in origin */ int rt_flags = nhop_get_rtflags(nh); if (rt_flags & RTF_PROTO1) return (RTPROT_ZEBRA); if (rt_flags & RTF_STATIC) return (RTPROT_STATIC); return (RTPROT_KERNEL); } static int get_rtmsg_type_from_rtsock(int cmd) { switch (cmd) { case RTM_ADD: case RTM_CHANGE: case RTM_GET: return NL_RTM_NEWROUTE; case RTM_DELETE: return NL_RTM_DELROUTE; } return (0); } /* * fibnum heuristics * * if (dump && rtm_table == 0 && !rta_table) RT_ALL_FIBS * msg rtm_table RTA_TABLE result * RTM_GETROUTE/dump 0 - RT_ALL_FIBS * RTM_GETROUTE/dump 1 - 1 * RTM_GETROUTE/get 0 - 0 * */ static struct nhop_object * rc_get_nhop(const struct rib_cmd_info *rc) { return ((rc->rc_cmd == RTM_DELETE) ? rc->rc_nh_old : rc->rc_nh_new); } static void dump_rc_nhop_gw(struct nl_writer *nw, const struct nhop_object *nh) { #ifdef INET6 int upper_family; #endif switch (nhop_get_neigh_family(nh)) { case AF_LINK: /* onlink prefix, skip */ break; case AF_INET: nlattr_add(nw, NL_RTA_GATEWAY, 4, &nh->gw4_sa.sin_addr); break; #ifdef INET6 case AF_INET6: upper_family = nhop_get_upper_family(nh); if (upper_family == AF_INET6) { struct in6_addr gw6 = nh->gw6_sa.sin6_addr; in6_clearscope(&gw6); nlattr_add(nw, NL_RTA_GATEWAY, 16, &gw6); } else if (upper_family == AF_INET) { /* IPv4 over IPv6 */ struct in6_addr gw6 = nh->gw6_sa.sin6_addr; in6_clearscope(&gw6); char buf[20]; struct rtvia *via = (struct rtvia *)&buf[0]; via->rtvia_family = AF_INET6; memcpy(via->rtvia_addr, &gw6, 16); nlattr_add(nw, NL_RTA_VIA, 17, via); } break; #endif } } static void dump_rc_nhop_mtu(struct nl_writer *nw, const struct nhop_object *nh) { int nla_len = sizeof(struct nlattr) * 2 + sizeof(uint32_t); struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr); if (nla == NULL) return; nla->nla_type = NL_RTA_METRICS; nla->nla_len = nla_len; nla++; nla->nla_type = NL_RTAX_MTU; nla->nla_len = sizeof(struct nlattr) + sizeof(uint32_t); *((uint32_t *)(nla + 1)) = nh->nh_mtu; } #ifdef ROUTE_MPATH static void dump_rc_nhg(struct nl_writer *nw, const struct nhgrp_object *nhg, struct rtmsg *rtm) { uint32_t uidx = nhgrp_get_uidx(nhg); uint32_t num_nhops; const struct weightened_nhop *wn = nhgrp_get_nhops(nhg, &num_nhops); uint32_t base_rtflags = nhop_get_rtflags(wn[0].nh); if (uidx != 0) nlattr_add_u32(nw, NL_RTA_NH_ID, uidx); nlattr_add_u32(nw, NL_RTA_KNH_ID, nhgrp_get_idx(nhg)); nlattr_add_u32(nw, NL_RTA_RTFLAGS, base_rtflags); int off = nlattr_add_nested(nw, NL_RTA_MULTIPATH); if (off == 0) return; for (int i = 0; i < num_nhops; i++) { int nh_off = nlattr_save_offset(nw); struct rtnexthop *rtnh = nlmsg_reserve_object(nw, struct rtnexthop); if (rtnh == NULL) return; rtnh->rtnh_flags = 0; rtnh->rtnh_ifindex = if_getindex(wn[i].nh->nh_ifp); rtnh->rtnh_hops = wn[i].weight; dump_rc_nhop_gw(nw, wn[i].nh); uint32_t rtflags = nhop_get_rtflags(wn[i].nh); if (rtflags != base_rtflags) nlattr_add_u32(nw, NL_RTA_RTFLAGS, rtflags); if (rtflags & RTF_FIXEDMTU) dump_rc_nhop_mtu(nw, wn[i].nh); rtnh = nlattr_restore_offset(nw, nh_off, struct rtnexthop); /* * nlattr_add() allocates 4-byte aligned storage, no need to aligh * length here * */ rtnh->rtnh_len = nlattr_save_offset(nw) - nh_off; } nlattr_set_len(nw, off); } #endif static void dump_rc_nhop(struct nl_writer *nw, const struct route_nhop_data *rnd, struct rtmsg *rtm) { #ifdef ROUTE_MPATH if (NH_IS_NHGRP(rnd->rnd_nhop)) { dump_rc_nhg(nw, rnd->rnd_nhgrp, rtm); return; } #endif const struct nhop_object *nh = rnd->rnd_nhop; uint32_t rtflags = nhop_get_rtflags(nh); /* * IPv4 over IPv6 * ('RTA_VIA', {'family': 10, 'addr': 'fe80::20c:29ff:fe67:2dd'}), ('RTA_OIF', 2), * IPv4 w/ gw * ('RTA_GATEWAY', '172.16.107.131'), ('RTA_OIF', 2)], * Direct route: * ('RTA_OIF', 2) */ if (nh->nh_flags & NHF_GATEWAY) dump_rc_nhop_gw(nw, nh); uint32_t uidx = nhop_get_uidx(nh); if (uidx != 0) nlattr_add_u32(nw, NL_RTA_NH_ID, uidx); nlattr_add_u32(nw, NL_RTA_KNH_ID, nhop_get_idx(nh)); nlattr_add_u32(nw, NL_RTA_RTFLAGS, rtflags); if (rtflags & RTF_FIXEDMTU) dump_rc_nhop_mtu(nw, nh); uint32_t nh_expire = nhop_get_expire(nh); if (nh_expire > 0) nlattr_add_u32(nw, NL_RTA_EXPIRES, nh_expire - time_uptime); /* In any case, fill outgoing interface */ nlattr_add_u32(nw, NL_RTA_OIF, if_getindex(nh->nh_ifp)); if (rnd->rnd_weight != RT_DEFAULT_WEIGHT) nlattr_add_u32(nw, NL_RTA_WEIGHT, rnd->rnd_weight); } /* * Dumps output from a rib command into an rtmsg */ static int dump_px(uint32_t fibnum, const struct nlmsghdr *hdr, const struct rtentry *rt, struct route_nhop_data *rnd, struct nl_writer *nw) { struct rtmsg *rtm; int error = 0; NET_EPOCH_ASSERT(); if (!nlmsg_reply(nw, hdr, sizeof(struct rtmsg))) goto enomem; int family = rt_get_family(rt); int rtm_off = nlattr_save_offset(nw); rtm = nlmsg_reserve_object(nw, struct rtmsg); rtm->rtm_family = family; rtm->rtm_dst_len = 0; rtm->rtm_src_len = 0; rtm->rtm_tos = 0; if (fibnum < 255) rtm->rtm_table = (unsigned char)fibnum; rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_protocol = nl_get_rtm_protocol(rnd->rnd_nhop); rtm->rtm_type = get_rtm_type(rnd->rnd_nhop); nlattr_add_u32(nw, NL_RTA_TABLE, fibnum); int plen = 0; #if defined(INET) || defined(INET6) uint32_t scopeid; #endif switch (family) { #ifdef INET case AF_INET: { struct in_addr addr; rt_get_inet_prefix_plen(rt, &addr, &plen, &scopeid); nlattr_add(nw, NL_RTA_DST, 4, &addr); break; } #endif #ifdef INET6 case AF_INET6: { struct in6_addr addr; rt_get_inet6_prefix_plen(rt, &addr, &plen, &scopeid); nlattr_add(nw, NL_RTA_DST, 16, &addr); break; } #endif default: FIB_LOG(LOG_NOTICE, fibnum, family, "unsupported rt family: %d", family); error = EAFNOSUPPORT; goto flush; } rtm = nlattr_restore_offset(nw, rtm_off, struct rtmsg); if (plen > 0) rtm->rtm_dst_len = plen; dump_rc_nhop(nw, rnd, rtm); if (nlmsg_end(nw)) return (0); enomem: error = ENOMEM; flush: nlmsg_abort(nw); return (error); } static int family_to_group(int family) { switch (family) { case AF_INET: return (RTNLGRP_IPV4_ROUTE); case AF_INET6: return (RTNLGRP_IPV6_ROUTE); } return (0); } static void report_operation(uint32_t fibnum, struct rib_cmd_info *rc, struct nlpcb *nlp, struct nlmsghdr *hdr) { struct nl_writer nw = {}; uint32_t group_id = family_to_group(rt_get_family(rc->rc_rt)); if (nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, group_id)) { struct route_nhop_data rnd = { .rnd_nhop = rc_get_nhop(rc), .rnd_weight = rc->rc_nh_weight, }; hdr->nlmsg_flags &= ~(NLM_F_REPLACE | NLM_F_CREATE); hdr->nlmsg_flags &= ~(NLM_F_EXCL | NLM_F_APPEND); switch (rc->rc_cmd) { case RTM_ADD: hdr->nlmsg_type = NL_RTM_NEWROUTE; hdr->nlmsg_flags |= NLM_F_CREATE | NLM_F_EXCL; break; case RTM_CHANGE: hdr->nlmsg_type = NL_RTM_NEWROUTE; hdr->nlmsg_flags |= NLM_F_REPLACE; break; case RTM_DELETE: hdr->nlmsg_type = NL_RTM_DELROUTE; break; } dump_px(fibnum, hdr, rc->rc_rt, &rnd, &nw); nlmsg_flush(&nw); } rtsock_callback_p->route_f(fibnum, rc); } static void set_scope6(struct sockaddr *sa, struct ifnet *ifp) { #ifdef INET6 if (sa != NULL && sa->sa_family == AF_INET6 && ifp != NULL) { struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_LINKLOCAL(&sa6->sin6_addr)) in6_set_unicast_scopeid(&sa6->sin6_addr, if_getindex(ifp)); } #endif } struct rta_mpath_nh { struct sockaddr *gw; struct ifnet *ifp; uint8_t rtnh_flags; uint8_t rtnh_weight; }; #define _IN(_field) offsetof(struct rtnexthop, _field) #define _OUT(_field) offsetof(struct rta_mpath_nh, _field) const static struct nlattr_parser nla_p_rtnh[] = { { .type = NL_RTA_GATEWAY, .off = _OUT(gw), .cb = nlattr_get_ip }, { .type = NL_RTA_VIA, .off = _OUT(gw), .cb = nlattr_get_ipvia }, }; const static struct nlfield_parser nlf_p_rtnh[] = { { .off_in = _IN(rtnh_flags), .off_out = _OUT(rtnh_flags), .cb = nlf_get_u8 }, { .off_in = _IN(rtnh_hops), .off_out = _OUT(rtnh_weight), .cb = nlf_get_u8 }, { .off_in = _IN(rtnh_ifindex), .off_out = _OUT(ifp), .cb = nlf_get_ifpz }, }; #undef _IN #undef _OUT static bool post_p_rtnh(void *_attrs, struct nl_pstate *npt __unused) { struct rta_mpath_nh *attrs = (struct rta_mpath_nh *)_attrs; set_scope6(attrs->gw, attrs->ifp); return (true); } NL_DECLARE_PARSER_EXT(mpath_parser, struct rtnexthop, NULL, nlf_p_rtnh, nla_p_rtnh, post_p_rtnh); struct rta_mpath { int num_nhops; struct rta_mpath_nh nhops[0]; }; static int nlattr_get_multipath(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) { int data_len = nla->nla_len - sizeof(struct nlattr); struct rtnexthop *rtnh; int max_nhops = data_len / sizeof(struct rtnexthop); struct rta_mpath *mp = npt_alloc(npt, (max_nhops + 2) * sizeof(struct rta_mpath_nh)); mp->num_nhops = 0; for (rtnh = (struct rtnexthop *)(nla + 1); data_len > 0; ) { struct rta_mpath_nh *mpnh = &mp->nhops[mp->num_nhops++]; int error = nl_parse_header(rtnh, rtnh->rtnh_len, &mpath_parser, npt, mpnh); if (error != 0) { NLMSG_REPORT_ERR_MSG(npt, "RTA_MULTIPATH: nexhop %d: parse failed", mp->num_nhops - 1); return (error); } int len = NL_ITEM_ALIGN(rtnh->rtnh_len); data_len -= len; rtnh = (struct rtnexthop *)((char *)rtnh + len); } if (data_len != 0 || mp->num_nhops == 0) { NLMSG_REPORT_ERR_MSG(npt, "invalid RTA_MULTIPATH attr"); return (EINVAL); } *((struct rta_mpath **)target) = mp; return (0); } struct nl_parsed_route { struct sockaddr *rta_dst; struct sockaddr *rta_gw; struct ifnet *rta_oif; struct rta_mpath *rta_multipath; uint32_t rta_table; uint32_t rta_rtflags; uint32_t rta_nh_id; uint32_t rta_weight; uint32_t rtax_mtu; uint8_t rtm_family; uint8_t rtm_dst_len; uint8_t rtm_protocol; uint8_t rtm_type; uint32_t rtm_flags; }; #define _IN(_field) offsetof(struct rtmsg, _field) #define _OUT(_field) offsetof(struct nl_parsed_route, _field) static struct nlattr_parser nla_p_rtmetrics[] = { { .type = NL_RTAX_MTU, .off = _OUT(rtax_mtu), .cb = nlattr_get_uint32 }, }; NL_DECLARE_ATTR_PARSER(metrics_parser, nla_p_rtmetrics); static const struct nlattr_parser nla_p_rtmsg[] = { { .type = NL_RTA_DST, .off = _OUT(rta_dst), .cb = nlattr_get_ip }, { .type = NL_RTA_OIF, .off = _OUT(rta_oif), .cb = nlattr_get_ifp }, { .type = NL_RTA_GATEWAY, .off = _OUT(rta_gw), .cb = nlattr_get_ip }, { .type = NL_RTA_METRICS, .arg = &metrics_parser, .cb = nlattr_get_nested }, { .type = NL_RTA_MULTIPATH, .off = _OUT(rta_multipath), .cb = nlattr_get_multipath }, { .type = NL_RTA_WEIGHT, .off = _OUT(rta_weight), .cb = nlattr_get_uint32 }, { .type = NL_RTA_RTFLAGS, .off = _OUT(rta_rtflags), .cb = nlattr_get_uint32 }, { .type = NL_RTA_TABLE, .off = _OUT(rta_table), .cb = nlattr_get_uint32 }, { .type = NL_RTA_VIA, .off = _OUT(rta_gw), .cb = nlattr_get_ipvia }, { .type = NL_RTA_NH_ID, .off = _OUT(rta_nh_id), .cb = nlattr_get_uint32 }, }; static const struct nlfield_parser nlf_p_rtmsg[] = { { .off_in = _IN(rtm_family), .off_out = _OUT(rtm_family), .cb = nlf_get_u8 }, { .off_in = _IN(rtm_dst_len), .off_out = _OUT(rtm_dst_len), .cb = nlf_get_u8 }, { .off_in = _IN(rtm_protocol), .off_out = _OUT(rtm_protocol), .cb = nlf_get_u8 }, { .off_in = _IN(rtm_type), .off_out = _OUT(rtm_type), .cb = nlf_get_u8 }, { .off_in = _IN(rtm_flags), .off_out = _OUT(rtm_flags), .cb = nlf_get_u32 }, }; #undef _IN #undef _OUT static bool post_p_rtmsg(void *_attrs, struct nl_pstate *npt __unused) { struct nl_parsed_route *attrs = (struct nl_parsed_route *)_attrs; set_scope6(attrs->rta_dst, attrs->rta_oif); set_scope6(attrs->rta_gw, attrs->rta_oif); return (true); } NL_DECLARE_PARSER_EXT(rtm_parser, struct rtmsg, NULL, nlf_p_rtmsg, nla_p_rtmsg, post_p_rtmsg); struct netlink_walkargs { struct nl_writer *nw; struct route_nhop_data rnd; struct nlmsghdr hdr; struct nlpcb *nlp; uint32_t fibnum; int family; int error; int count; int dumped; int dumped_tables; }; static int dump_rtentry(struct rtentry *rt, void *_arg) { struct netlink_walkargs *wa = (struct netlink_walkargs *)_arg; int error; wa->count++; if (wa->error != 0) return (0); if (!rt_is_exportable(rt, nlp_get_cred(wa->nlp))) return (0); wa->dumped++; rt_get_rnd(rt, &wa->rnd); error = dump_px(wa->fibnum, &wa->hdr, rt, &wa->rnd, wa->nw); IF_DEBUG_LEVEL(LOG_DEBUG3) { char rtbuf[INET6_ADDRSTRLEN + 5]; FIB_LOG(LOG_DEBUG3, wa->fibnum, wa->family, "Dump %s, offset %u, error %d", rt_print_buf(rt, rtbuf, sizeof(rtbuf)), wa->nw->offset, error); } wa->error = error; return (0); } static void dump_rtable_one(struct netlink_walkargs *wa, uint32_t fibnum, int family) { FIB_LOG(LOG_DEBUG2, fibnum, family, "Start dump"); wa->count = 0; wa->dumped = 0; rib_walk(fibnum, family, false, dump_rtentry, wa); wa->dumped_tables++; FIB_LOG(LOG_DEBUG2, fibnum, family, "End dump, iterated %d dumped %d", wa->count, wa->dumped); NL_LOG(LOG_DEBUG2, "Current offset: %d", wa->nw->offset); } static int dump_rtable_fib(struct netlink_walkargs *wa, uint32_t fibnum, int family) { wa->fibnum = fibnum; if (family == AF_UNSPEC) { for (int i = 0; i < AF_MAX; i++) { if (rt_tables_get_rnh(fibnum, i) != 0) { wa->family = i; dump_rtable_one(wa, fibnum, i); if (wa->error != 0) break; } } } else { if (rt_tables_get_rnh(fibnum, family) != 0) { wa->family = family; dump_rtable_one(wa, fibnum, family); } } return (wa->error); } static int handle_rtm_getroute(struct nlpcb *nlp, struct nl_parsed_route *attrs, struct nlmsghdr *hdr, struct nl_pstate *npt) { RIB_RLOCK_TRACKER; struct rib_head *rnh; const struct rtentry *rt; struct route_nhop_data rnd; uint32_t fibnum = attrs->rta_table; sa_family_t family = attrs->rtm_family; if (attrs->rta_dst == NULL) { NLMSG_REPORT_ERR_MSG(npt, "No RTA_DST supplied"); return (EINVAL); } rnh = rt_tables_get_rnh(fibnum, family); if (rnh == NULL) return (EAFNOSUPPORT); RIB_RLOCK(rnh); struct sockaddr *dst = attrs->rta_dst; if (attrs->rtm_flags & RTM_F_PREFIX) rt = rib_lookup_prefix_plen(rnh, dst, attrs->rtm_dst_len, &rnd); else rt = (const struct rtentry *)rnh->rnh_matchaddr(dst, &rnh->head); if (rt == NULL) { RIB_RUNLOCK(rnh); return (ESRCH); } rt_get_rnd(rt, &rnd); rnd.rnd_nhop = nhop_select_func(rnd.rnd_nhop, 0); RIB_RUNLOCK(rnh); if (!rt_is_exportable(rt, nlp_get_cred(nlp))) return (ESRCH); IF_DEBUG_LEVEL(LOG_DEBUG2) { char rtbuf[NHOP_PRINT_BUFSIZE] __unused, nhbuf[NHOP_PRINT_BUFSIZE] __unused; FIB_LOG(LOG_DEBUG2, fibnum, family, "getroute completed: got %s for %s", nhop_print_buf_any(rnd.rnd_nhop, nhbuf, sizeof(nhbuf)), rt_print_buf(rt, rtbuf, sizeof(rtbuf))); } hdr->nlmsg_type = NL_RTM_NEWROUTE; dump_px(fibnum, hdr, rt, &rnd, npt->nw); return (0); } static int handle_rtm_dump(struct nlpcb *nlp, uint32_t fibnum, int family, struct nlmsghdr *hdr, struct nl_writer *nw) { struct netlink_walkargs wa = { .nlp = nlp, .nw = nw, .hdr.nlmsg_pid = hdr->nlmsg_pid, .hdr.nlmsg_seq = hdr->nlmsg_seq, .hdr.nlmsg_type = NL_RTM_NEWROUTE, .hdr.nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI, }; if (fibnum == RT_TABLE_UNSPEC) { for (int i = 0; i < V_rt_numfibs; i++) { dump_rtable_fib(&wa, fibnum, family); if (wa.error != 0) break; } } else dump_rtable_fib(&wa, fibnum, family); if (wa.error == 0 && wa.dumped_tables == 0) { FIB_LOG(LOG_DEBUG, fibnum, family, "incorrect fibnum/family"); wa.error = ESRCH; // How do we propagate it? } if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr)) { NL_LOG(LOG_DEBUG, "Unable to finalize the dump"); return (ENOMEM); } return (wa.error); } static struct nhop_object * finalize_nhop(struct nhop_object *nh, const struct sockaddr *dst, int *perror) { /* * The following MUST be filled: * nh_ifp, nh_ifa, nh_gw */ if (nh->gw_sa.sa_family == 0) { /* * Empty gateway. Can be direct route with RTA_OIF set. */ if (nh->nh_ifp != NULL) nhop_set_direct_gw(nh, nh->nh_ifp); else { NL_LOG(LOG_DEBUG, "empty gateway and interface, skipping"); *perror = EINVAL; return (NULL); } /* Both nh_ifp and gateway are set */ } else { /* Gateway is set up, we can derive ifp if not set */ if (nh->nh_ifp == NULL) { uint32_t fibnum = nhop_get_fibnum(nh); uint32_t flags = 0; if (nh->nh_flags & NHF_GATEWAY) flags = RTF_GATEWAY; else if (nh->nh_flags & NHF_HOST) flags = RTF_HOST; struct ifaddr *ifa = ifa_ifwithroute(flags, dst, &nh->gw_sa, fibnum); if (ifa == NULL) { NL_LOG(LOG_DEBUG, "Unable to determine ifp, skipping"); *perror = EINVAL; return (NULL); } nhop_set_transmit_ifp(nh, ifa->ifa_ifp); } } /* Both nh_ifp and gateway are set */ if (nh->nh_ifa == NULL) { const struct sockaddr *gw_sa = &nh->gw_sa; if (gw_sa->sa_family != dst->sa_family) { /* * Use dst as the target for determining the default * preferred ifa IF * 1) the gateway is link-level (e.g. direct route) * 2) the gateway family is different (e.g. IPv4 over IPv6). */ gw_sa = dst; } struct ifaddr *ifa = ifaof_ifpforaddr(gw_sa, nh->nh_ifp); if (ifa == NULL) { NL_LOG(LOG_DEBUG, "Unable to determine ifa, skipping"); *perror = EINVAL; return (NULL); } nhop_set_src(nh, ifa); } return (nhop_get_nhop(nh, perror)); } static int get_pxflag(const struct nl_parsed_route *attrs) { int pxflag = 0; switch (attrs->rtm_family) { case AF_INET: if (attrs->rtm_dst_len == 32) pxflag = NHF_HOST; else if (attrs->rtm_dst_len == 0) pxflag = NHF_DEFAULT; break; case AF_INET6: if (attrs->rtm_dst_len == 128) pxflag = NHF_HOST; else if (attrs->rtm_dst_len == 0) pxflag = NHF_DEFAULT; break; } return (pxflag); } static int get_op_flags(int nlm_flags) { int op_flags = 0; op_flags |= (nlm_flags & NLM_F_REPLACE) ? RTM_F_REPLACE : 0; op_flags |= (nlm_flags & NLM_F_EXCL) ? RTM_F_EXCL : 0; op_flags |= (nlm_flags & NLM_F_CREATE) ? RTM_F_CREATE : 0; op_flags |= (nlm_flags & NLM_F_APPEND) ? RTM_F_APPEND : 0; return (op_flags); } #ifdef ROUTE_MPATH static int create_nexthop_one(struct nl_parsed_route *attrs, struct rta_mpath_nh *mpnh, struct nl_pstate *npt, struct nhop_object **pnh) { int error; if (mpnh->gw == NULL) return (EINVAL); struct nhop_object *nh = nhop_alloc(attrs->rta_table, attrs->rtm_family); if (nh == NULL) return (ENOMEM); error = nl_set_nexthop_gw(nh, mpnh->gw, mpnh->ifp, npt); if (error != 0) { nhop_free(nh); return (error); } if (mpnh->ifp != NULL) nhop_set_transmit_ifp(nh, mpnh->ifp); nhop_set_pxtype_flag(nh, get_pxflag(attrs)); nhop_set_rtflags(nh, attrs->rta_rtflags); if (attrs->rtm_protocol > RTPROT_STATIC) nhop_set_origin(nh, attrs->rtm_protocol); *pnh = finalize_nhop(nh, attrs->rta_dst, &error); return (error); } #endif static struct nhop_object * create_nexthop_from_attrs(struct nl_parsed_route *attrs, struct nl_pstate *npt, int *perror) { struct nhop_object *nh = NULL; int error = 0; if (attrs->rta_multipath != NULL) { #ifdef ROUTE_MPATH /* Multipath w/o explicit nexthops */ int num_nhops = attrs->rta_multipath->num_nhops; struct weightened_nhop *wn = npt_alloc(npt, sizeof(*wn) * num_nhops); for (int i = 0; i < num_nhops; i++) { struct rta_mpath_nh *mpnh = &attrs->rta_multipath->nhops[i]; error = create_nexthop_one(attrs, mpnh, npt, &wn[i].nh); if (error != 0) { for (int j = 0; j < i; j++) nhop_free(wn[j].nh); break; } wn[i].weight = mpnh->rtnh_weight > 0 ? mpnh->rtnh_weight : 1; } if (error == 0) { struct rib_head *rh = nhop_get_rh(wn[0].nh); struct nhgrp_object *nhg; nhg = nhgrp_alloc(rh->rib_fibnum, rh->rib_family, wn, num_nhops, perror); if (nhg != NULL) { if (attrs->rtm_protocol > RTPROT_STATIC) nhgrp_set_origin(nhg, attrs->rtm_protocol); nhg = nhgrp_get_nhgrp(nhg, perror); } for (int i = 0; i < num_nhops; i++) nhop_free(wn[i].nh); if (nhg != NULL) return ((struct nhop_object *)nhg); error = *perror; } #else error = ENOTSUP; #endif *perror = error; } else { nh = nhop_alloc(attrs->rta_table, attrs->rtm_family); if (nh == NULL) { *perror = ENOMEM; return (NULL); } if (attrs->rta_gw != NULL) { *perror = nl_set_nexthop_gw(nh, attrs->rta_gw, attrs->rta_oif, npt); if (*perror != 0) { nhop_free(nh); return (NULL); } } if (attrs->rta_oif != NULL) nhop_set_transmit_ifp(nh, attrs->rta_oif); if (attrs->rtax_mtu != 0) nhop_set_mtu(nh, attrs->rtax_mtu, true); if (attrs->rta_rtflags & RTF_BROADCAST) nhop_set_broadcast(nh, true); if (attrs->rtm_protocol > RTPROT_STATIC) nhop_set_origin(nh, attrs->rtm_protocol); nhop_set_pxtype_flag(nh, get_pxflag(attrs)); nhop_set_rtflags(nh, attrs->rta_rtflags); switch (attrs->rtm_type) { case RTN_UNICAST: break; case RTN_BLACKHOLE: nhop_set_blackhole(nh, RTF_BLACKHOLE); break; case RTN_PROHIBIT: case RTN_UNREACHABLE: nhop_set_blackhole(nh, RTF_REJECT); break; /* TODO: return ENOTSUP for other types if strict option is set */ } nh = finalize_nhop(nh, attrs->rta_dst, perror); } return (nh); } static int rtnl_handle_newroute(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct rib_cmd_info rc = {}; struct nhop_object *nh = NULL; int error; struct nl_parsed_route attrs = {}; error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs); if (error != 0) return (error); /* Check if we have enough data */ if (attrs.rta_dst == NULL) { NL_LOG(LOG_DEBUG, "missing RTA_DST"); return (EINVAL); } if (attrs.rta_table >= V_rt_numfibs) { NLMSG_REPORT_ERR_MSG(npt, "invalid fib"); return (EINVAL); } if (attrs.rta_nh_id != 0) { /* Referenced uindex */ int pxflag = get_pxflag(&attrs); nh = nl_find_nhop(attrs.rta_table, attrs.rtm_family, attrs.rta_nh_id, pxflag, &error); if (error != 0) return (error); } else { nh = create_nexthop_from_attrs(&attrs, npt, &error); if (error != 0) { NL_LOG(LOG_DEBUG, "Error creating nexthop"); return (error); } } if (!NH_IS_NHGRP(nh) && attrs.rta_weight == 0) attrs.rta_weight = RT_DEFAULT_WEIGHT; struct route_nhop_data rnd = { .rnd_nhop = nh, .rnd_weight = attrs.rta_weight }; int op_flags = get_op_flags(hdr->nlmsg_flags); error = rib_add_route_px(attrs.rta_table, attrs.rta_dst, attrs.rtm_dst_len, &rnd, op_flags, &rc); if (error == 0) report_operation(attrs.rta_table, &rc, nlp, hdr); return (error); } static int path_match_func(const struct rtentry *rt, const struct nhop_object *nh, void *_data) { struct nl_parsed_route *attrs = (struct nl_parsed_route *)_data; if ((attrs->rta_gw != NULL) && !rib_match_gw(rt, nh, attrs->rta_gw)) return (0); if ((attrs->rta_oif != NULL) && (attrs->rta_oif != nh->nh_ifp)) return (0); return (1); } static int rtnl_handle_delroute(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct rib_cmd_info rc; int error; struct nl_parsed_route attrs = {}; error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs); if (error != 0) return (error); if (attrs.rta_dst == NULL) { NLMSG_REPORT_ERR_MSG(npt, "RTA_DST is not set"); return (ESRCH); } if (attrs.rta_table >= V_rt_numfibs) { NLMSG_REPORT_ERR_MSG(npt, "invalid fib"); return (EINVAL); } error = rib_del_route_px(attrs.rta_table, attrs.rta_dst, attrs.rtm_dst_len, path_match_func, &attrs, 0, &rc); if (error == 0) report_operation(attrs.rta_table, &rc, nlp, hdr); return (error); } static int rtnl_handle_getroute(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { int error; struct nl_parsed_route attrs = {}; error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs); if (error != 0) return (error); if (attrs.rta_table >= V_rt_numfibs) { NLMSG_REPORT_ERR_MSG(npt, "invalid fib"); return (EINVAL); } if (hdr->nlmsg_flags & NLM_F_DUMP) error = handle_rtm_dump(nlp, attrs.rta_table, attrs.rtm_family, hdr, npt->nw); else error = handle_rtm_getroute(nlp, &attrs, hdr, npt); return (error); } void rtnl_handle_route_event(uint32_t fibnum, const struct rib_cmd_info *rc) { struct nl_writer nw = {}; int family, nlm_flags = 0; family = rt_get_family(rc->rc_rt); /* XXX: check if there are active listeners first */ /* TODO: consider passing PID/type/seq */ switch (rc->rc_cmd) { case RTM_ADD: nlm_flags = NLM_F_EXCL | NLM_F_CREATE; break; case RTM_CHANGE: nlm_flags = NLM_F_REPLACE; break; case RTM_DELETE: nlm_flags = 0; break; } IF_DEBUG_LEVEL(LOG_DEBUG2) { char rtbuf[NHOP_PRINT_BUFSIZE] __unused; FIB_LOG(LOG_DEBUG2, fibnum, family, "received event %s for %s / nlm_flags=%X", rib_print_cmd(rc->rc_cmd), rt_print_buf(rc->rc_rt, rtbuf, sizeof(rtbuf)), nlm_flags); } struct nlmsghdr hdr = { .nlmsg_flags = nlm_flags, .nlmsg_type = get_rtmsg_type_from_rtsock(rc->rc_cmd), }; struct route_nhop_data rnd = { .rnd_nhop = rc_get_nhop(rc), .rnd_weight = rc->rc_nh_weight, }; uint32_t group_id = family_to_group(family); if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, group_id)) { NL_LOG(LOG_DEBUG, "error allocating event buffer"); return; } dump_px(fibnum, &hdr, rc->rc_rt, &rnd, &nw); nlmsg_flush(&nw); } static const struct rtnl_cmd_handler cmd_handlers[] = { { .cmd = NL_RTM_GETROUTE, .name = "RTM_GETROUTE", .cb = &rtnl_handle_getroute, .flags = RTNL_F_ALLOW_NONVNET_JAIL, }, { .cmd = NL_RTM_DELROUTE, .name = "RTM_DELROUTE", .cb = &rtnl_handle_delroute, .priv = PRIV_NET_ROUTE, }, { .cmd = NL_RTM_NEWROUTE, .name = "RTM_NEWROUTE", .cb = &rtnl_handle_newroute, .priv = PRIV_NET_ROUTE, } }; static const struct nlhdr_parser *all_parsers[] = {&mpath_parser, &metrics_parser, &rtm_parser}; void rtnl_routes_init(void) { NL_VERIFY_PARSERS(all_parsers); rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers)); } diff --git a/sys/netpfil/pf/pf_nl.c b/sys/netpfil/pf/pf_nl.c index bb50b3b2b321..459a5dc6507e 100644 --- a/sys/netpfil/pf/pf_nl.c +++ b/sys/netpfil/pf/pf_nl.c @@ -1,372 +1,370 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2023 Alexander V. Chernikov * Copyright (c) 2023 Rubicon Communications, LLC (Netgate) * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ -#include "opt_netlink.h" - #include #include #include #include #include #include #include #include #include #include #include #define DEBUG_MOD_NAME nl_pf #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include _DECLARE_DEBUG(LOG_DEBUG); struct nl_parsed_state { uint8_t version; uint32_t id; uint32_t creatorid; }; #define _IN(_field) offsetof(struct genlmsghdr, _field) #define _OUT(_field) offsetof(struct nl_parsed_state, _field) static const struct nlattr_parser nla_p_state[] = { { .type = PF_ST_ID, .off = _OUT(id), .cb = nlattr_get_uint32 }, { .type = PF_ST_CREATORID, .off = _OUT(creatorid), .cb = nlattr_get_uint32 }, }; static const struct nlfield_parser nlf_p_generic[] = { { .off_in = _IN(version), .off_out = _OUT(version), .cb = nlf_get_u8 }, }; #undef _IN #undef _OUT NL_DECLARE_PARSER(state_parser, struct genlmsghdr, nlf_p_generic, nla_p_state); static void dump_addr(struct nl_writer *nw, int attr, const struct pf_addr *addr, int af) { switch (af) { case AF_INET: nlattr_add(nw, attr, 4, &addr->v4); break; case AF_INET6: nlattr_add(nw, attr, 16, &addr->v6); break; }; } static bool dump_state_peer(struct nl_writer *nw, int attr, const struct pf_state_peer *peer) { int off = nlattr_add_nested(nw, attr); if (off == 0) return (false); nlattr_add_u32(nw, PF_STP_SEQLO, peer->seqlo); nlattr_add_u32(nw, PF_STP_SEQHI, peer->seqhi); nlattr_add_u32(nw, PF_STP_SEQDIFF, peer->seqdiff); nlattr_add_u16(nw, PF_STP_MAX_WIN, peer->max_win); nlattr_add_u16(nw, PF_STP_MSS, peer->mss); nlattr_add_u8(nw, PF_STP_STATE, peer->state); nlattr_add_u8(nw, PF_STP_WSCALE, peer->wscale); if (peer->scrub != NULL) { struct pf_state_scrub *sc = peer->scrub; uint16_t pfss_flags = sc->pfss_flags & PFSS_TIMESTAMP; nlattr_add_u16(nw, PF_STP_PFSS_FLAGS, pfss_flags); nlattr_add_u32(nw, PF_STP_PFSS_TS_MOD, sc->pfss_ts_mod); nlattr_add_u8(nw, PF_STP_PFSS_TTL, sc->pfss_ttl); nlattr_add_u8(nw, PF_STP_SCRUB_FLAG, PFSYNC_SCRUB_FLAG_VALID); } nlattr_set_len(nw, off); return (true); } static bool dump_state_key(struct nl_writer *nw, int attr, const struct pf_state_key *key) { int off = nlattr_add_nested(nw, attr); if (off == 0) return (false); dump_addr(nw, PF_STK_ADDR0, &key->addr[0], key->af); dump_addr(nw, PF_STK_ADDR1, &key->addr[1], key->af); nlattr_add_u16(nw, PF_STK_PORT0, key->port[0]); nlattr_add_u16(nw, PF_STK_PORT1, key->port[1]); nlattr_set_len(nw, off); return (true); } static int dump_state(struct nlpcb *nlp, const struct nlmsghdr *hdr, struct pf_kstate *s, struct nl_pstate *npt) { struct nl_writer *nw = npt->nw; int error = 0; int af; struct pf_state_key *key; if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr))) goto enomem; struct genlmsghdr *ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr); ghdr_new->cmd = PFNL_CMD_GETSTATES; ghdr_new->version = 0; ghdr_new->reserved = 0; nlattr_add_u64(nw, PF_ST_VERSION, PF_STATE_VERSION); key = s->key[PF_SK_WIRE]; if (!dump_state_key(nw, PF_ST_KEY_WIRE, key)) goto enomem; key = s->key[PF_SK_STACK]; if (!dump_state_key(nw, PF_ST_KEY_STACK, key)) goto enomem; af = s->key[PF_SK_WIRE]->af; nlattr_add_u8(nw, PF_ST_PROTO, s->key[PF_SK_WIRE]->proto); nlattr_add_u8(nw, PF_ST_AF, af); nlattr_add_string(nw, PF_ST_IFNAME, s->kif->pfik_name); nlattr_add_string(nw, PF_ST_ORIG_IFNAME, s->orig_kif->pfik_name); dump_addr(nw, PF_ST_RT_ADDR, &s->rt_addr, af); nlattr_add_u32(nw, PF_ST_CREATION, time_uptime - s->creation); uint32_t expire = pf_state_expires(s); if (expire > time_uptime) expire = expire - time_uptime; nlattr_add_u32(nw, PF_ST_EXPIRE, expire); nlattr_add_u8(nw, PF_ST_DIRECTION, s->direction); nlattr_add_u8(nw, PF_ST_LOG, s->act.log); nlattr_add_u8(nw, PF_ST_TIMEOUT, s->timeout); nlattr_add_u16(nw, PF_ST_STATE_FLAGS, s->state_flags); uint8_t sync_flags = 0; if (s->src_node) sync_flags |= PFSYNC_FLAG_SRCNODE; if (s->nat_src_node) sync_flags |= PFSYNC_FLAG_NATSRCNODE; nlattr_add_u8(nw, PF_ST_SYNC_FLAGS, sync_flags); nlattr_add_u64(nw, PF_ST_ID, s->id); nlattr_add_u32(nw, PF_ST_CREATORID, htonl(s->creatorid)); nlattr_add_u32(nw, PF_ST_RULE, s->rule.ptr ? s->rule.ptr->nr : -1); nlattr_add_u32(nw, PF_ST_ANCHOR, s->anchor.ptr ? s->anchor.ptr->nr : -1); nlattr_add_u32(nw, PF_ST_NAT_RULE, s->nat_rule.ptr ? s->nat_rule.ptr->nr : -1); nlattr_add_u64(nw, PF_ST_PACKETS0, s->packets[0]); nlattr_add_u64(nw, PF_ST_PACKETS1, s->packets[1]); nlattr_add_u64(nw, PF_ST_BYTES0, s->bytes[0]); nlattr_add_u64(nw, PF_ST_BYTES1, s->bytes[1]); if (!dump_state_peer(nw, PF_ST_PEER_SRC, &s->src)) goto enomem; if (!dump_state_peer(nw, PF_ST_PEER_DST, &s->dst)) goto enomem; if (nlmsg_end(nw)) return (0); enomem: error = ENOMEM; nlmsg_abort(nw); return (error); } static int handle_dumpstates(struct nlpcb *nlp, struct nl_parsed_state *attrs, struct nlmsghdr *hdr, struct nl_pstate *npt) { int error = 0; hdr->nlmsg_flags |= NLM_F_MULTI; for (int i = 0; i <= pf_hashmask; i++) { struct pf_idhash *ih = &V_pf_idhash[i]; struct pf_kstate *s; if (LIST_EMPTY(&ih->states)) continue; PF_HASHROW_LOCK(ih); LIST_FOREACH(s, &ih->states, entry) { if (s->timeout != PFTM_UNLINKED) { error = dump_state(nlp, hdr, s, npt); if (error != 0) break; } } PF_HASHROW_UNLOCK(ih); } if (!nlmsg_end_dump(npt->nw, error, hdr)) { NL_LOG(LOG_DEBUG, "Unable to finalize the dump"); return (ENOMEM); } return (error); } static int handle_getstate(struct nlpcb *nlp, struct nl_parsed_state *attrs, struct nlmsghdr *hdr, struct nl_pstate *npt) { struct pf_kstate *s = pf_find_state_byid(attrs->id, attrs->creatorid); if (s == NULL) return (ENOENT); return (dump_state(nlp, hdr, s, npt)); } static int dump_creatorid(struct nlpcb *nlp, const struct nlmsghdr *hdr, uint32_t creator, struct nl_pstate *npt) { struct nl_writer *nw = npt->nw; if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr))) goto enomem; struct genlmsghdr *ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr); ghdr_new->cmd = PFNL_CMD_GETCREATORS; ghdr_new->version = 0; ghdr_new->reserved = 0; nlattr_add_u32(nw, PF_ST_CREATORID, htonl(creator)); if (nlmsg_end(nw)) return (0); enomem: nlmsg_abort(nw); return (ENOMEM); } static int pf_handle_getstates(struct nlmsghdr *hdr, struct nl_pstate *npt) { int error; struct nl_parsed_state attrs = {}; error = nl_parse_nlmsg(hdr, &state_parser, npt, &attrs); if (error != 0) return (error); if (attrs.id != 0) error = handle_getstate(npt->nlp, &attrs, hdr, npt); else error = handle_dumpstates(npt->nlp, &attrs, hdr, npt); return (error); } static int pf_handle_getcreators(struct nlmsghdr *hdr, struct nl_pstate *npt) { uint32_t creators[16]; int error = 0; bzero(creators, sizeof(creators)); for (int i = 0; i < pf_hashmask; i++) { struct pf_idhash *ih = &V_pf_idhash[i]; struct pf_kstate *s; if (LIST_EMPTY(&ih->states)) continue; PF_HASHROW_LOCK(ih); LIST_FOREACH(s, &ih->states, entry) { int j; if (s->timeout == PFTM_UNLINKED) continue; for (j = 0; j < nitems(creators); j++) { if (creators[j] == s->creatorid) break; if (creators[j] == 0) { creators[j] = s->creatorid; break; } } if (j == nitems(creators)) printf("Warning: too many creators!\n"); } PF_HASHROW_UNLOCK(ih); } hdr->nlmsg_flags |= NLM_F_MULTI; for (int i = 0; i < nitems(creators); i++) { if (creators[i] == 0) break; error = dump_creatorid(npt->nlp, hdr, creators[i], npt); } if (!nlmsg_end_dump(npt->nw, error, hdr)) { NL_LOG(LOG_DEBUG, "Unable to finalize the dump"); return (ENOMEM); } return (error); } static const struct nlhdr_parser *all_parsers[] = { &state_parser }; static int family_id; static const struct genl_cmd pf_cmds[] = { { .cmd_num = PFNL_CMD_GETSTATES, .cmd_name = "GETSTATES", .cmd_cb = pf_handle_getstates, .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP | GENL_CMD_CAP_HASPOL, }, { .cmd_num = PFNL_CMD_GETCREATORS, .cmd_name = "GETCREATORS", .cmd_cb = pf_handle_getcreators, .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP | GENL_CMD_CAP_HASPOL, }, }; void pf_nl_register(void) { NL_VERIFY_PARSERS(all_parsers); family_id = genl_register_family(PFNL_FAMILY_NAME, 0, 2, PFNL_CMD_MAX); genl_register_cmds(PFNL_FAMILY_NAME, pf_cmds, NL_ARRAY_LEN(pf_cmds)); } void pf_nl_unregister(void) { genl_unregister_family(PFNL_FAMILY_NAME); } diff --git a/sys/tests/ktest.c b/sys/tests/ktest.c index 495fedf95dde..cd83a6aaaa3f 100644 --- a/sys/tests/ktest.c +++ b/sys/tests/ktest.c @@ -1,415 +1,413 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2023 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ -#include "opt_netlink.h" - #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct mtx ktest_mtx; #define KTEST_LOCK() mtx_lock(&ktest_mtx) #define KTEST_UNLOCK() mtx_unlock(&ktest_mtx) #define KTEST_LOCK_ASSERT() mtx_assert(&ktest_mtx, MA_OWNED) MTX_SYSINIT(ktest_mtx, &ktest_mtx, "ktest mutex", MTX_DEF); struct ktest_module { struct ktest_module_info *info; volatile u_int refcount; TAILQ_ENTRY(ktest_module) entries; }; static TAILQ_HEAD(, ktest_module) module_list = TAILQ_HEAD_INITIALIZER(module_list); struct nl_ktest_parsed { char *mod_name; char *test_name; struct nlattr *test_meta; }; #define _IN(_field) offsetof(struct genlmsghdr, _field) #define _OUT(_field) offsetof(struct nl_ktest_parsed, _field) static const struct nlattr_parser nla_p_get[] = { { .type = KTEST_ATTR_MOD_NAME, .off = _OUT(mod_name), .cb = nlattr_get_string }, { .type = KTEST_ATTR_TEST_NAME, .off = _OUT(test_name), .cb = nlattr_get_string }, { .type = KTEST_ATTR_TEST_META, .off = _OUT(test_meta), .cb = nlattr_get_nla }, }; static const struct nlfield_parser nlf_p_get[] = { }; NL_DECLARE_PARSER(ktest_parser, struct genlmsghdr, nlf_p_get, nla_p_get); #undef _IN #undef _OUT static bool create_reply(struct nl_writer *nw, struct nlmsghdr *hdr, int cmd) { if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr))) return (false); struct genlmsghdr *ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr); ghdr_new->cmd = cmd; ghdr_new->version = 0; ghdr_new->reserved = 0; return (true); } static int dump_mod_test(struct nlmsghdr *hdr, struct nl_pstate *npt, struct ktest_module *mod, const struct ktest_test_info *test_info) { struct nl_writer *nw = npt->nw; if (!create_reply(nw, hdr, KTEST_CMD_NEWTEST)) goto enomem; nlattr_add_string(nw, KTEST_ATTR_MOD_NAME, mod->info->name); nlattr_add_string(nw, KTEST_ATTR_TEST_NAME, test_info->name); nlattr_add_string(nw, KTEST_ATTR_TEST_DESCR, test_info->desc); if (nlmsg_end(nw)) return (0); enomem: nlmsg_abort(nw); return (ENOMEM); } static int dump_mod_tests(struct nlmsghdr *hdr, struct nl_pstate *npt, struct ktest_module *mod, struct nl_ktest_parsed *attrs) { for (int i = 0; i < mod->info->num_tests; i++) { const struct ktest_test_info *test_info = &mod->info->tests[i]; if (attrs->test_name != NULL && strcmp(attrs->test_name, test_info->name)) continue; int error = dump_mod_test(hdr, npt, mod, test_info); if (error != 0) return (error); } return (0); } static int dump_tests(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct nl_ktest_parsed attrs = { }; struct ktest_module *mod; int error; error = nl_parse_nlmsg(hdr, &ktest_parser, npt, &attrs); if (error != 0) return (error); hdr->nlmsg_flags |= NLM_F_MULTI; KTEST_LOCK(); TAILQ_FOREACH(mod, &module_list, entries) { if (attrs.mod_name && strcmp(attrs.mod_name, mod->info->name)) continue; error = dump_mod_tests(hdr, npt, mod, &attrs); if (error != 0) break; } KTEST_UNLOCK(); if (!nlmsg_end_dump(npt->nw, error, hdr)) { //NL_LOG(LOG_DEBUG, "Unable to finalize the dump"); return (ENOMEM); } return (error); } static int run_test(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct nl_ktest_parsed attrs = { }; struct ktest_module *mod; int error; error = nl_parse_nlmsg(hdr, &ktest_parser, npt, &attrs); if (error != 0) return (error); if (attrs.mod_name == NULL) { nlmsg_report_err_msg(npt, "KTEST_ATTR_MOD_NAME not set"); return (EINVAL); } if (attrs.test_name == NULL) { nlmsg_report_err_msg(npt, "KTEST_ATTR_TEST_NAME not set"); return (EINVAL); } const struct ktest_test_info *test = NULL; KTEST_LOCK(); TAILQ_FOREACH(mod, &module_list, entries) { if (strcmp(attrs.mod_name, mod->info->name)) continue; const struct ktest_module_info *info = mod->info; for (int i = 0; i < info->num_tests; i++) { const struct ktest_test_info *test_info = &info->tests[i]; if (!strcmp(attrs.test_name, test_info->name)) { test = test_info; break; } } break; } if (test != NULL) refcount_acquire(&mod->refcount); KTEST_UNLOCK(); if (test == NULL) return (ESRCH); /* Run the test */ struct ktest_test_context ctx = { .npt = npt, .hdr = hdr, .buf = npt_alloc(npt, KTEST_MAX_BUF), .bufsize = KTEST_MAX_BUF, }; if (ctx.buf == NULL) { //NL_LOG(LOG_DEBUG, "unable to allocate temporary buffer"); return (ENOMEM); } if (test->parse != NULL && attrs.test_meta != NULL) { error = test->parse(&ctx, attrs.test_meta); if (error != 0) return (error); } hdr->nlmsg_flags |= NLM_F_MULTI; KTEST_LOG_LEVEL(&ctx, LOG_INFO, "start running %s", test->name); error = test->func(&ctx); KTEST_LOG_LEVEL(&ctx, LOG_INFO, "end running %s", test->name); refcount_release(&mod->refcount); if (!nlmsg_end_dump(npt->nw, error, hdr)) { //NL_LOG(LOG_DEBUG, "Unable to finalize the dump"); return (ENOMEM); } return (error); } /* USER API */ static void register_test_module(struct ktest_module_info *info) { struct ktest_module *mod = malloc(sizeof(*mod), M_TEMP, M_WAITOK | M_ZERO); mod->info = info; info->module_ptr = mod; KTEST_LOCK(); TAILQ_INSERT_TAIL(&module_list, mod, entries); KTEST_UNLOCK(); } static void unregister_test_module(struct ktest_module_info *info) { struct ktest_module *mod = info->module_ptr; info->module_ptr = NULL; KTEST_LOCK(); TAILQ_REMOVE(&module_list, mod, entries); KTEST_UNLOCK(); free(mod, M_TEMP); } static bool can_unregister(struct ktest_module_info *info) { struct ktest_module *mod = info->module_ptr; return (refcount_load(&mod->refcount) == 0); } int ktest_default_modevent(module_t mod, int type, void *arg) { struct ktest_module_info *info = (struct ktest_module_info *)arg; int error = 0; switch (type) { case MOD_LOAD: register_test_module(info); break; case MOD_UNLOAD: if (!can_unregister(info)) return (EBUSY); unregister_test_module(info); break; default: error = EOPNOTSUPP; break; } return (error); } bool ktest_start_msg(struct ktest_test_context *ctx) { return (create_reply(ctx->npt->nw, ctx->hdr, KTEST_CMD_NEWMESSAGE)); } void ktest_add_msg_meta(struct ktest_test_context *ctx, const char *func, const char *fname, int line) { struct nl_writer *nw = ctx->npt->nw; struct timespec ts; nanouptime(&ts); nlattr_add(nw, KTEST_MSG_ATTR_TS, sizeof(ts), &ts); nlattr_add_string(nw, KTEST_MSG_ATTR_FUNC, func); nlattr_add_string(nw, KTEST_MSG_ATTR_FILE, fname); nlattr_add_u32(nw, KTEST_MSG_ATTR_LINE, line); } void ktest_add_msg_text(struct ktest_test_context *ctx, int msg_level, const char *fmt, ...) { va_list ap; va_start(ap, fmt); vsnprintf(ctx->buf, ctx->bufsize, fmt, ap); va_end(ap); nlattr_add_u8(ctx->npt->nw, KTEST_MSG_ATTR_LEVEL, msg_level); nlattr_add_string(ctx->npt->nw, KTEST_MSG_ATTR_TEXT, ctx->buf); } void ktest_end_msg(struct ktest_test_context *ctx) { nlmsg_end(ctx->npt->nw); } /* Module glue */ static const struct nlhdr_parser *all_parsers[] = { &ktest_parser }; static const struct genl_cmd ktest_cmds[] = { { .cmd_num = KTEST_CMD_LIST, .cmd_name = "KTEST_CMD_LIST", .cmd_cb = dump_tests, .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP | GENL_CMD_CAP_HASPOL, }, { .cmd_num = KTEST_CMD_RUN, .cmd_name = "KTEST_CMD_RUN", .cmd_cb = run_test, .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_KLD_LOAD, }, }; static void ktest_nl_register(void) { bool ret __diagused; int family_id __diagused; NL_VERIFY_PARSERS(all_parsers); family_id = genl_register_family(KTEST_FAMILY_NAME, 0, 1, KTEST_CMD_MAX); MPASS(family_id != 0); ret = genl_register_cmds(KTEST_FAMILY_NAME, ktest_cmds, NL_ARRAY_LEN(ktest_cmds)); MPASS(ret); } static void ktest_nl_unregister(void) { MPASS(TAILQ_EMPTY(&module_list)); genl_unregister_family(KTEST_FAMILY_NAME); } static int ktest_modevent(module_t mod, int type, void *unused) { int error = 0; switch (type) { case MOD_LOAD: ktest_nl_register(); break; case MOD_UNLOAD: ktest_nl_unregister(); break; default: error = EOPNOTSUPP; break; } return (error); } static moduledata_t ktestmod = { "ktest", ktest_modevent, 0 }; DECLARE_MODULE(ktestmod, ktestmod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(ktestmod, 1); MODULE_DEPEND(ktestmod, netlink, 1, 1, 1);