diff --git a/sys/netinet/ip_carp.c b/sys/netinet/ip_carp.c index 871638adb048..5bec8fce3fcb 100644 --- a/sys/netinet/ip_carp.c +++ b/sys/netinet/ip_carp.c @@ -1,3105 +1,3104 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2002 Michael Shalayeff. * Copyright (c) 2003 Ryan McBride. * Copyright (c) 2011 Gleb Smirnoff * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include #include "opt_bpf.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #include #endif #ifdef INET #include #include #endif #ifdef INET6 #include #include #include #include #include #include #endif #include #include #include #include #include static MALLOC_DEFINE(M_CARP, "CARP", "CARP addresses"); struct carp_softc { struct ifnet *sc_carpdev; /* Pointer to parent ifnet. */ struct ifaddr **sc_ifas; /* Our ifaddrs. */ carp_version_t sc_version; /* carp or VRRPv3 */ uint8_t sc_addr[ETHER_ADDR_LEN]; /* Our link level address. */ struct callout sc_ad_tmo; /* Advertising timeout. */ #ifdef INET struct callout sc_md_tmo; /* Master down timeout. */ #endif #ifdef INET6 struct callout sc_md6_tmo; /* XXX: Master down timeout. */ #endif struct mtx sc_mtx; int sc_vhid; union { struct { /* sc_version == CARP_VERSION_CARP */ int sc_advskew; int sc_advbase; struct in_addr sc_carpaddr; struct in6_addr sc_carpaddr6; uint64_t sc_counter; bool sc_init_counter; #define CARP_HMAC_PAD 64 unsigned char sc_key[CARP_KEY_LEN]; unsigned char sc_pad[CARP_HMAC_PAD]; SHA1_CTX sc_sha1; }; struct { /* sc_version == CARP_VERSION_VRRPv3 */ uint8_t sc_vrrp_prio; uint16_t sc_vrrp_adv_inter; uint16_t sc_vrrp_master_inter; }; }; int sc_naddrs; int sc_naddrs6; int sc_ifasiz; enum { INIT = 0, BACKUP, MASTER } sc_state; int sc_suppress; int sc_sendad_errors; #define CARP_SENDAD_MAX_ERRORS 3 int sc_sendad_success; #define CARP_SENDAD_MIN_SUCCESS 3 TAILQ_ENTRY(carp_softc) sc_list; /* On the carp_if list. */ LIST_ENTRY(carp_softc) sc_next; /* On the global list. 
*/ }; struct carp_if { #ifdef INET int cif_naddrs; #endif #ifdef INET6 int cif_naddrs6; #endif TAILQ_HEAD(, carp_softc) cif_vrs; #ifdef INET struct ip_moptions cif_imo; #endif #ifdef INET6 struct ip6_moptions cif_im6o; #endif struct ifnet *cif_ifp; struct mtx cif_mtx; uint32_t cif_flags; #define CIF_PROMISC 0x00000001 }; /* Kernel equivalent of struct carpreq, but with more fields for new features. * */ struct carpkreq { int carpr_count; int carpr_vhid; int carpr_state; int carpr_advskew; int carpr_advbase; unsigned char carpr_key[CARP_KEY_LEN]; /* Everything above this is identical to carpreq */ struct in_addr carpr_addr; struct in6_addr carpr_addr6; carp_version_t carpr_version; uint8_t carpr_vrrp_priority; uint16_t carpr_vrrp_adv_inter; }; /* * Brief design of carp(4). * * Any carp-capable ifnet may have a list of carp softcs hanging off * its ifp->if_carp pointer. Each softc represents one unique virtual * host id, or vhid. The softc has a back pointer to the ifnet. All * softcs are joined in a global list, which has quite limited use. * * Any interface address that takes part in CARP negotiation has a * pointer to the softc of its vhid, ifa->ifa_carp. That could be either * AF_INET or AF_INET6 address. * * Although, one can get the softc's backpointer to ifnet and traverse * through its ifp->if_addrhead queue to find all interface addresses * involved in CARP, we keep a growable array of ifaddr pointers. This * allows us to avoid grabbing the IF_ADDR_LOCK() in many traversals that * do calls into the network stack, thus avoiding LORs. * * Locking: * * Each softc has a lock sc_mtx. It is used to synchronise carp_input_c(), * callout-driven events and ioctl()s. * * To traverse the list of softcs on an ifnet we use CIF_LOCK() or carp_sx. * To traverse the global list we use the mutex carp_mtx. * * Known issues with locking: * * - Sending ad, we put the pointer to the softc in an mtag, and no reference * counting is done on the softc. 
* - On module unload we may race (?) with packet processing thread * dereferencing our function pointers. */ /* Accept incoming CARP packets. */ VNET_DEFINE_STATIC(int, carp_allow) = 1; #define V_carp_allow VNET(carp_allow) /* Set DSCP in outgoing CARP packets. */ VNET_DEFINE_STATIC(int, carp_dscp) = 56; #define V_carp_dscp VNET(carp_dscp) /* Preempt slower nodes. */ VNET_DEFINE_STATIC(int, carp_preempt) = 0; #define V_carp_preempt VNET(carp_preempt) /* Log level. */ VNET_DEFINE_STATIC(int, carp_log) = 1; #define V_carp_log VNET(carp_log) /* Global advskew demotion. */ VNET_DEFINE_STATIC(int, carp_demotion) = 0; #define V_carp_demotion VNET(carp_demotion) /* Send error demotion factor. */ VNET_DEFINE_STATIC(int, carp_senderr_adj) = CARP_MAXSKEW; #define V_carp_senderr_adj VNET(carp_senderr_adj) /* Iface down demotion factor. */ VNET_DEFINE_STATIC(int, carp_ifdown_adj) = CARP_MAXSKEW; #define V_carp_ifdown_adj VNET(carp_ifdown_adj) static int carp_allow_sysctl(SYSCTL_HANDLER_ARGS); static int carp_dscp_sysctl(SYSCTL_HANDLER_ARGS); static int carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS); SYSCTL_NODE(_net_inet, IPPROTO_CARP, carp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "CARP"); SYSCTL_PROC(_net_inet_carp, OID_AUTO, allow, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, &VNET_NAME(carp_allow), 0, carp_allow_sysctl, "I", "Accept incoming CARP packets"); SYSCTL_PROC(_net_inet_carp, OID_AUTO, dscp, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, carp_dscp_sysctl, "I", "DSCP value for carp packets"); SYSCTL_INT(_net_inet_carp, OID_AUTO, preempt, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(carp_preempt), 0, "High-priority backup preemption mode"); SYSCTL_INT(_net_inet_carp, OID_AUTO, log, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(carp_log), 0, "CARP log level"); SYSCTL_PROC(_net_inet_carp, OID_AUTO, demotion, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, carp_demote_adj_sysctl, "I", "Adjust demotion factor (skew of advskew)"); 
SYSCTL_INT(_net_inet_carp, OID_AUTO, senderr_demotion_factor,
    CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(carp_senderr_adj), 0, "Send error demotion factor adjustment");
SYSCTL_INT(_net_inet_carp, OID_AUTO, ifdown_demotion_factor,
    CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(carp_ifdown_adj), 0,
    "Interface down demotion factor adjustment");

VNET_PCPUSTAT_DEFINE(struct carpstats, carpstats);
VNET_PCPUSTAT_SYSINIT(carpstats);
VNET_PCPUSTAT_SYSUNINIT(carpstats);

/*
 * Add `val' to the counter that corresponds to field `name' of
 * struct carpstats; the field offset is turned into an array index.
 */
#define	CARPSTATS_ADD(name, val)	\
    counter_u64_add(VNET(carpstats)[offsetof(struct carpstats, name) / \
	sizeof(uint64_t)], (val))
#define	CARPSTATS_INC(name)		CARPSTATS_ADD(name, 1)

SYSCTL_VNET_PCPUSTAT(_net_inet_carp, OID_AUTO, stats, struct carpstats,
    carpstats, "CARP statistics (struct carpstats, netinet/ip_carp.h)");

/* Per-softc mutex (sc_mtx) helpers. */
#define	CARP_LOCK_INIT(sc)	mtx_init(&(sc)->sc_mtx, "carp_softc",   \
	NULL, MTX_DEF)
#define	CARP_LOCK_DESTROY(sc)	mtx_destroy(&(sc)->sc_mtx)
#define	CARP_LOCK_ASSERT(sc)	mtx_assert(&(sc)->sc_mtx, MA_OWNED)
#define	CARP_LOCK(sc)		mtx_lock(&(sc)->sc_mtx)
#define	CARP_UNLOCK(sc)		mtx_unlock(&(sc)->sc_mtx)
/* Per-interface carp_if mutex (cif_mtx) helpers. */
#define	CIF_LOCK_INIT(cif)	mtx_init(&(cif)->cif_mtx, "carp_if",   \
	NULL, MTX_DEF)
#define	CIF_LOCK_DESTROY(cif)	mtx_destroy(&(cif)->cif_mtx)
#define	CIF_LOCK_ASSERT(cif)	mtx_assert(&(cif)->cif_mtx, MA_OWNED)
#define	CIF_LOCK(cif)		mtx_lock(&(cif)->cif_mtx)
#define	CIF_UNLOCK(cif)		mtx_unlock(&(cif)->cif_mtx)
/*
 * Take the carp_if lock; if no softc is left on cif_vrs hand the (still
 * locked) carp_if to carp_free_if(), otherwise just drop the lock again.
 */
#define	CIF_FREE(cif)	do {				\
		CIF_LOCK(cif);				\
		if (TAILQ_EMPTY(&(cif)->cif_vrs))	\
			carp_free_if(cif);		\
		else					\
			CIF_UNLOCK(cif);		\
} while (0)

/* Log at LOG_INFO with a "carp: " prefix when the log level is above 0. */
#define	CARP_LOG(...)	do {				\
	if (V_carp_log > 0)				\
		log(LOG_INFO, "carp: " __VA_ARGS__);	\
} while (0)

/* Log at LOG_DEBUG (no prefix) when the log level is above 1. */
#define	CARP_DEBUG(...)	do {				\
	if (V_carp_log > 1)				\
		log(LOG_DEBUG, __VA_ARGS__);		\
} while (0)

/* Walk the addresses of ifp, visiting only those with ifa_carp set. */
#define	IFNET_FOREACH_IFA(ifp, ifa)					\
	CK_STAILQ_FOREACH((ifa), &(ifp)->if_addrhead, ifa_link)	\
		if ((ifa)->ifa_carp != NULL)

/*
 * Walk the softc's own ifaddr array (sc_ifas), stopping at the first NULL
 * slot or after sc_naddrs + sc_naddrs6 entries.  Expands to an assertion
 * statement followed by a `for', so it must not be used as the sole body
 * of an unbraced `if'/`else'.
 */
#define	CARP_FOREACH_IFA(sc, ifa)					\
	CARP_LOCK_ASSERT(sc);						\
	for (int _i = 0;						\
		_i < (sc)->sc_naddrs + (sc)->sc_naddrs6 &&		\
		((ifa) = sc->sc_ifas[_i]) != NULL;			\
		++_i)

/*
 * Walk the softcs attached to ifp.  Asserts that either the carp_if mutex
 * is owned or carp_sx is exclusively locked.
 */
#define	IFNET_FOREACH_CARP(ifp, sc)					\
	KASSERT(mtx_owned(&ifp->if_carp->cif_mtx) ||			\
	    sx_xlocked(&carp_sx), ("cif_vrs not locked"));		\
	TAILQ_FOREACH((sc), &(ifp)->if_carp->cif_vrs, sc_list)

/* sc_advskew plus the global demotion, clamped to [0, CARP_MAXSKEW]. */
#define	DEMOTE_ADVSKEW(sc)					\
    (((sc)->sc_advskew + V_carp_demotion > CARP_MAXSKEW) ?	\
    CARP_MAXSKEW :						\
        (((sc)->sc_advskew + V_carp_demotion < 0) ?		\
        0 : ((sc)->sc_advskew + V_carp_demotion)))

static void	carp_input_c(struct mbuf *, struct carp_header *, sa_family_t, int);
static void	vrrp_input_c(struct mbuf *, int, sa_family_t, int, int, uint16_t);
static struct carp_softc
		*carp_alloc(struct ifnet *, carp_version_t, int);
static void	carp_destroy(struct carp_softc *);
static struct carp_if
		*carp_alloc_if(struct ifnet *);
static void	carp_free_if(struct carp_if *);
static void	carp_set_state(struct carp_softc *, int, const char* reason);
static void	carp_sc_state(struct carp_softc *);
static void	carp_setrun(struct carp_softc *, sa_family_t);
static void	carp_master_down(void *);
static void	carp_master_down_locked(struct carp_softc *,
    		    const char* reason);
static void	carp_send_ad_locked(struct carp_softc *);
static void	vrrp_send_ad_locked(struct carp_softc *);
static void	carp_addroute(struct carp_softc *);
static void	carp_ifa_addroute(struct ifaddr *);
static void	carp_delroute(struct carp_softc *);
static void	carp_ifa_delroute(struct ifaddr *);
static void	carp_send_ad_all(void *, int);
static void	carp_demote_adj(int, char *);

/* Global list of softcs, protected by carp_mtx (see the design notes). */
static LIST_HEAD(, carp_softc) carp_list;
static struct mtx carp_mtx;
static struct sx carp_sx;
/* Task that runs carp_send_ad_all(). */
static struct task carp_sendall_task =
    TASK_INITIALIZER(0, carp_send_ad_all, NULL);

static int
carp_is_supported_if(if_t ifp)
{
	/*
	 * Returns 0 for Ethernet, L2 VLAN and bridge interfaces, ENXIO for
	 * a NULL ifp and EOPNOTSUPP for every other interface type.
	 */
	if (ifp == NULL)
		return (ENXIO);

	switch (ifp->if_type) {
	case IFT_ETHER:
	case IFT_L2VLAN:
	case IFT_BRIDGE:
		break;
	default:
		return (EOPNOTSUPP);
	}

	return (0);
}

/*
 * Precompute the part of the CARP HMAC-SHA1 that does not depend on the
 * advertisement counter and store it in sc->sc_sha1.
 *
 * The inner hash covers, in this order: the key XORed with 0x36 (ipad),
 * the protocol version, the advertisement type, the low 8 bits of the
 * vhid, every AF_INET address of the softc in ascending order and then
 * every AF_INET6 address in ascending order.  On return sc->sc_pad holds
 * the key XORed with 0x5c (opad), ready for the outer hash.
 *
 * Must be called with the softc lock held on a CARP (not VRRPv3) softc.
 */
static void
carp_hmac_prepare(struct carp_softc *sc)
{
	uint8_t version = CARP_VERSION_CARP, type = CARP_ADVERTISEMENT;
	uint8_t vhid = sc->sc_vhid & 0xff;
	struct ifaddr *ifa;
	int i, found;
#ifdef INET
	struct in_addr last, cur, in;
#endif
#ifdef INET6
	struct in6_addr last6, cur6, in6;
#endif

	CARP_LOCK_ASSERT(sc);
	MPASS(sc->sc_version == CARP_VERSION_CARP);

	/* Compute ipad from key. */
	bzero(sc->sc_pad, sizeof(sc->sc_pad));
	bcopy(sc->sc_key, sc->sc_pad, sizeof(sc->sc_key));
	for (i = 0; i < sizeof(sc->sc_pad); i++)
		sc->sc_pad[i] ^= 0x36;

	/* Precompute first part of inner hash. */
	SHA1Init(&sc->sc_sha1);
	SHA1Update(&sc->sc_sha1, sc->sc_pad, sizeof(sc->sc_pad));
	SHA1Update(&sc->sc_sha1, (void *)&version, sizeof(version));
	SHA1Update(&sc->sc_sha1, (void *)&type, sizeof(type));
	SHA1Update(&sc->sc_sha1, (void *)&vhid, sizeof(vhid));
#ifdef INET
	/*
	 * Selection sort without storage: each pass picks the smallest
	 * address (compared in host byte order) that is still greater
	 * than the one hashed in the previous pass.
	 */
	cur.s_addr = 0;
	do {
		found = 0;
		last = cur;
		cur.s_addr = 0xffffffff;
		CARP_FOREACH_IFA(sc, ifa) {
			in.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr;
			if (ifa->ifa_addr->sa_family == AF_INET &&
			    ntohl(in.s_addr) > ntohl(last.s_addr) &&
			    ntohl(in.s_addr) < ntohl(cur.s_addr)) {
				cur.s_addr = in.s_addr;
				found++;
			}
		}
		if (found)
			SHA1Update(&sc->sc_sha1, (void *)&cur, sizeof(cur));
	} while (found);
#endif /* INET */
#ifdef INET6
	/* Same walk for IPv6, ordering addresses bytewise with memcmp(). */
	memset(&cur6, 0, sizeof(cur6));
	do {
		found = 0;
		last6 = cur6;
		memset(&cur6, 0xff, sizeof(cur6));
		CARP_FOREACH_IFA(sc, ifa) {
			in6 = ifatoia6(ifa)->ia_addr.sin6_addr;
			/* Clear the second 16-bit word of scoped addresses. */
			if (IN6_IS_SCOPE_EMBED(&in6))
				in6.s6_addr16[1] = 0;
			if (ifa->ifa_addr->sa_family == AF_INET6 &&
			    memcmp(&in6, &last6, sizeof(in6)) > 0 &&
			    memcmp(&in6, &cur6, sizeof(in6)) < 0) {
				cur6 = in6;
				found++;
			}
		}
		if (found)
			SHA1Update(&sc->sc_sha1, (void *)&cur6, sizeof(cur6));
	} while (found);
#endif /* INET6 */

	/* convert ipad to opad */
	for (i = 0; i < sizeof(sc->sc_pad); i++)
		sc->sc_pad[i] ^= 0x36 ^ 0x5c;
}

/*
 * Finish the HMAC for one advertisement: continue a copy of the
 * precomputed inner hash with the 64-bit counter, then run the outer hash
 * over sc_pad (opad) and the inner digest.  The 20-byte result is written
 * to md.  Requires the softc lock.
 */
static void
carp_hmac_generate(struct carp_softc *sc, uint32_t counter[2],
    unsigned char md[20])
{
	SHA1_CTX sha1ctx;

	CARP_LOCK_ASSERT(sc);

	/* fetch first half of inner hash */
	bcopy(&sc->sc_sha1, &sha1ctx, sizeof(sha1ctx));

	SHA1Update(&sha1ctx, (void *)counter, sizeof(sc->sc_counter));
	SHA1Final(md, &sha1ctx);

	/* outer hash */
	SHA1Init(&sha1ctx);
	SHA1Update(&sha1ctx, sc->sc_pad, sizeof(sc->sc_pad));
	SHA1Update(&sha1ctx, md, 20);
	SHA1Final(md, &sha1ctx);
}

/*
 * Recompute the HMAC for `counter' and compare it with the received
 * digest `md'.  Returns 0 when they are equal, non-zero otherwise.
 *
 * NOTE(review): bcmp() is not a constant-time comparison -- consider
 * whether a timing-safe compare is warranted here.
 */
static int
carp_hmac_verify(struct carp_softc *sc, uint32_t counter[2],
    unsigned char md[20])
{
	unsigned char md2[20];

	CARP_LOCK_ASSERT(sc);

	carp_hmac_generate(sc, counter, md2);

	return (bcmp(md, md2, sizeof(md2)));
}

/*
 * Checksum `len' bytes of the packet starting at offset `off' and subtract
 * the caller-supplied pseudo-header value.  Returns 0 when the two agree.
 */
static int
vrrp_checksum_verify(struct mbuf *m, int off, int len, uint16_t phdrcksum)
{
	uint16_t cksum;

	/*
	 * Note that VRRPv3 checksums are different from CARP checksums.
	 * Carp just calculates the checksum over the packet.
	 * VRRPv3 includes the pseudo-header checksum as well.
	 */
	cksum = in_cksum_skip(m, off + len, off);
	cksum -= phdrcksum;

	return (cksum);
}

/*
 * process input packet.
 * we have rearranged checks order compared to the rfc,
 * but it seems more efficient this way or not possible otherwise.
 */
#ifdef INET
static int
carp_input(struct mbuf **mp, int *offp, int proto)
{
	struct mbuf *m = *mp;
	struct ip *ip;
	struct vrrpv3_header *vh;
	int iplen;
	int minlen;
	int totlen;

	/* *offp is used as the length of the IPv4 header. */
	iplen = *offp;
	*mp = NULL;

	CARPSTATS_INC(carps_ipackets);

	if (!V_carp_allow) {
		m_freem(m);
		return (IPPROTO_DONE);
	}

	/* Ensure we have enough header to figure out the version.
*/
	if (m->m_pkthdr.len < iplen + sizeof(*vh)) {
		CARPSTATS_INC(carps_badlen);
		CARP_DEBUG("%s: received len %zd < sizeof(struct vrrpv3_header) "
		    "on %s\n", __func__, m->m_len - sizeof(struct ip),
		    if_name(m->m_pkthdr.rcvif));
		m_freem(m);
		return (IPPROTO_DONE);
	}

	/* Make the IP header plus the shortest header contiguous. */
	if (m->m_len < iplen + sizeof(*vh)) {
		if ((m = m_pullup(m, iplen + sizeof(*vh))) == NULL) {
			CARPSTATS_INC(carps_hdrops);
			CARP_DEBUG("%s():%d: pullup failed\n", __func__,
			    __LINE__);
			return (IPPROTO_DONE);
		}
	}
	ip = mtod(m, struct ip *);
	totlen = ntohs(ip->ip_len);
	vh = (struct vrrpv3_header *)((char *)ip + iplen);

	/* The version field selects which header layout must be present. */
	switch (vh->vrrp_version) {
	case CARP_VERSION_CARP:
		minlen = sizeof(struct carp_header);
		break;
	case CARP_VERSION_VRRPv3:
		minlen = sizeof(struct vrrpv3_header);
		break;
	default:
		CARPSTATS_INC(carps_badver);
		CARP_DEBUG("%s: unsupported version %d on %s\n", __func__,
		    vh->vrrp_version, if_name(m->m_pkthdr.rcvif));
		m_freem(m);
		return (IPPROTO_DONE);
	}

	/* And now check the length again but with the real minimal length. */
	if (m->m_pkthdr.len < iplen + minlen) {
		CARPSTATS_INC(carps_badlen);
		CARP_DEBUG("%s: received len %zd < %d "
		    "on %s\n", __func__, m->m_len - sizeof(struct ip),
		    iplen + minlen,
		    if_name(m->m_pkthdr.rcvif));
		m_freem(m);
		return (IPPROTO_DONE);
	}

	if (m->m_len < iplen + minlen) {
		if ((m = m_pullup(m, iplen + minlen)) == NULL) {
			CARPSTATS_INC(carps_hdrops);
			CARP_DEBUG("%s():%d: pullup failed\n", __func__,
			    __LINE__);
			return (IPPROTO_DONE);
		}
		/* m_pullup() may return a different mbuf; refresh pointers. */
		ip = mtod(m, struct ip *);
		vh = (struct vrrpv3_header *)((char *)ip + iplen);
	}

	/* Both handlers below take ownership of (and free) the mbuf. */
	switch (vh->vrrp_version) {
	case CARP_VERSION_CARP: {
		struct carp_header *ch;

		/* verify the CARP checksum */
		if (in_cksum_skip(m, totlen, iplen)) {
			CARPSTATS_INC(carps_badsum);
			CARP_DEBUG("%s: checksum failed on %s\n", __func__,
			    if_name(m->m_pkthdr.rcvif));
			m_freem(m);
			break;
		}
		ch = (struct carp_header *)((char *)ip + iplen);
		carp_input_c(m, ch, AF_INET, ip->ip_ttl);
		break;
	}
	case CARP_VERSION_VRRPv3: {
		uint16_t phdrcksum;

		/* Pseudo-header sum; checked later in vrrp_input_c(). */
		phdrcksum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
		    htonl((u_short)(totlen - iplen) + ip->ip_p));
		vrrp_input_c(m, iplen, AF_INET, ip->ip_ttl, totlen - iplen,
		    phdrcksum);
		break;
	}
	default:
		KASSERT(false, ("Unsupported version %d", vh->vrrp_version));
	}

	return (IPPROTO_DONE);
}
#endif

#ifdef INET6
/*
 * IPv6 counterpart of carp_input(): validate lengths, determine the
 * protocol version from the first header byte(s) and dispatch to
 * carp_input_c() or vrrp_input_c().  Always returns IPPROTO_DONE.
 *
 * NOTE(review): the first pullup and the CARP checksum use *offp as the
 * header offset, while the second length check, the second pullup and the
 * header pointers use sizeof(*ip6).  These agree only when *offp equals
 * sizeof(struct ip6_hdr) -- confirm that is guaranteed by the caller.
 */
static int
carp6_input(struct mbuf **mp, int *offp, int proto)
{
	struct mbuf *m = *mp;
	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
	struct vrrpv3_header *vh;
	u_int len, minlen;

	CARPSTATS_INC(carps_ipackets6);

	if (!V_carp_allow) {
		m_freem(m);
		return (IPPROTO_DONE);
	}

	/* check if received on a valid carp interface */
	if (m->m_pkthdr.rcvif->if_carp == NULL) {
		CARPSTATS_INC(carps_badif);
		CARP_DEBUG("%s: packet received on non-carp interface: %s\n",
		    __func__, if_name(m->m_pkthdr.rcvif));
		m_freem(m);
		return (IPPROTO_DONE);
	}

	if (m->m_len < *offp + sizeof(*vh)) {
		/* Remember the length for the message; m is gone on failure. */
		len = m->m_len;
		m = m_pullup(m, *offp + sizeof(*vh));
		if (m == NULL) {
			CARPSTATS_INC(carps_badlen);
			CARP_DEBUG("%s: packet size %u too small\n", __func__, len);
			return (IPPROTO_DONE);
		}
		ip6 = mtod(m, struct ip6_hdr *);
	}
	vh = (struct vrrpv3_header *)(mtod(m, char *) + *offp);

	switch (vh->vrrp_version) {
	case CARP_VERSION_CARP:
		minlen = sizeof(struct carp_header);
		break;
	case CARP_VERSION_VRRPv3:
		minlen = sizeof(struct vrrpv3_header);
		break;
	default:
		CARPSTATS_INC(carps_badver);
		CARP_DEBUG("%s: unsupported version %d on %s\n", __func__,
		    vh->vrrp_version, if_name(m->m_pkthdr.rcvif));
		m_freem(m);
		return (IPPROTO_DONE);
	}

	/* And now check the length again but with the real minimal length. */
	if (m->m_pkthdr.len < sizeof(*ip6) + minlen) {
		CARPSTATS_INC(carps_badlen);
		/*
		 * NOTE(review): the logged length subtracts sizeof(struct ip)
		 * although this is the IPv6 path -- looks like a copy from
		 * carp_input(); it only affects the debug message.
		 */
		CARP_DEBUG("%s: received len %zd < %zd "
		    "on %s\n", __func__, m->m_len - sizeof(struct ip),
		    sizeof(*ip6) + minlen,
		    if_name(m->m_pkthdr.rcvif));
		m_freem(m);
		return (IPPROTO_DONE);
	}

	if (m->m_len < sizeof(*ip6) + minlen) {
		if ((m = m_pullup(m, sizeof(*ip6) + minlen)) == NULL) {
			CARPSTATS_INC(carps_hdrops);
			CARP_DEBUG("%s():%d: pullup failed\n", __func__,
			    __LINE__);
			return (IPPROTO_DONE);
		}
		/* m_pullup() may return a different mbuf; refresh pointers. */
		ip6 = mtod(m, struct ip6_hdr *);
		vh = (struct vrrpv3_header *)mtodo(m, sizeof(*ip6));
	}

	switch (vh->vrrp_version) {
	case CARP_VERSION_CARP: {
		struct carp_header *ch;

		/* verify the CARP checksum */
		if (in_cksum_skip(m, *offp + sizeof(struct carp_header),
		    *offp)) {
			CARPSTATS_INC(carps_badsum);
			CARP_DEBUG("%s: checksum failed, on %s\n", __func__,
			    if_name(m->m_pkthdr.rcvif));
			m_freem(m);
			break;
		}
		ch = (struct carp_header *)((char *)ip6 + sizeof(*ip6));
		carp_input_c(m, ch, AF_INET6, ip6->ip6_hlim);
		break;
	}
	case CARP_VERSION_VRRPv3: {
		uint16_t phdrcksum;

		/* Pseudo-header sum; checked later in vrrp_input_c(). */
		phdrcksum = in6_cksum_pseudo(ip6, ntohs(ip6->ip6_plen),
		    ip6->ip6_nxt, 0);
		vrrp_input_c(m, sizeof(*ip6), AF_INET6, ip6->ip6_hlim,
		    ntohs(ip6->ip6_plen), phdrcksum);
		break;
	}
	default:
		KASSERT(false, ("Unsupported version %d", vh->vrrp_version));
	}
	return (IPPROTO_DONE);
}
#endif /* INET6 */

/*
 * This routine should not be necessary at all, but some switches
 * (VMWare ESX vswitches) can echo our own packets back at us,
 * and we must ignore them or they will cause us to drop out of
 * MASTER mode.
 *
 * We cannot catch all cases of network loops.  Instead, what we
 * do here is catch any packet that arrives with a carp header
 * with a VHID of 0, that comes from an address that is our own.
 * These packets are by definition "from us" (even if they are from
 * a misconfigured host that is pretending to be us).
 *
 * The VHID test is outside this mini-function.
*/ static int carp_source_is_self(const struct mbuf *m, struct ifaddr *ifa, sa_family_t af) { #ifdef INET struct ip *ip4; struct in_addr in4; #endif #ifdef INET6 struct ip6_hdr *ip6; struct in6_addr in6; #endif switch (af) { #ifdef INET case AF_INET: ip4 = mtod(m, struct ip *); in4 = ifatoia(ifa)->ia_addr.sin_addr; return (in4.s_addr == ip4->ip_src.s_addr); #endif #ifdef INET6 case AF_INET6: ip6 = mtod(m, struct ip6_hdr *); in6 = ifatoia6(ifa)->ia_addr.sin6_addr; return (memcmp(&in6, &ip6->ip6_src, sizeof(in6)) == 0); #endif default: break; } return (0); } static struct ifaddr * carp_find_ifa(const struct mbuf *m, sa_family_t af, uint8_t vhid) { struct ifnet *ifp = m->m_pkthdr.rcvif; struct ifaddr *ifa, *match; int error; NET_EPOCH_ASSERT(); /* * Verify that the VHID is valid on the receiving interface. * * There should be just one match. If there are none * the VHID is not valid and we drop the packet. If * there are multiple VHID matches, take just the first * one, for compatibility with previous code. While we're * scanning, check for obvious loops in the network topology * (these should never happen, and as noted above, we may * miss real loops; this is just a double-check). */ error = 0; match = NULL; IFNET_FOREACH_IFA(ifp, ifa) { if (match == NULL && ifa->ifa_carp != NULL && ifa->ifa_addr->sa_family == af && ifa->ifa_carp->sc_vhid == vhid) match = ifa; if (vhid == 0 && carp_source_is_self(m, ifa, af)) error = ELOOP; } ifa = error ? NULL : match; if (ifa != NULL) ifa_ref(ifa); if (ifa == NULL) { if (error == ELOOP) { CARP_DEBUG("dropping looped packet on interface %s\n", if_name(ifp)); CARPSTATS_INC(carps_badif); /* ??? 
*/ } else { CARPSTATS_INC(carps_badvhid); } } return (ifa); } static void carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af, int ttl) { struct ifnet *ifp = m->m_pkthdr.rcvif; struct ifaddr *ifa; struct carp_softc *sc; uint64_t tmp_counter; struct timeval sc_tv, ch_tv; bool multicast = false; NET_EPOCH_ASSERT(); MPASS(ch->carp_version == CARP_VERSION_CARP); ifa = carp_find_ifa(m, af, ch->carp_vhid); if (ifa == NULL) { m_freem(m); return; } sc = ifa->ifa_carp; CARP_LOCK(sc); /* verify the CARP version. */ if (sc->sc_version != CARP_VERSION_CARP) { CARP_UNLOCK(sc); CARPSTATS_INC(carps_badver); CARP_DEBUG("%s: invalid version %d\n", if_name(ifp), ch->carp_version); ifa_free(ifa); m_freem(m); return; } if (ifa->ifa_addr->sa_family == AF_INET) { multicast = IN_MULTICAST(sc->sc_carpaddr.s_addr); } else { multicast = IN6_IS_ADDR_MULTICAST(&sc->sc_carpaddr6); } ifa_free(ifa); /* verify that the IP TTL is 255, but only if we're not in unicast mode. */ if (multicast && ttl != CARP_DFLTTL) { CARPSTATS_INC(carps_badttl); CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__, ttl, if_name(m->m_pkthdr.rcvif)); goto out; } if (carp_hmac_verify(sc, ch->carp_counter, ch->carp_md)) { CARPSTATS_INC(carps_badauth); CARP_DEBUG("%s: incorrect hash for VHID %u@%s\n", __func__, sc->sc_vhid, if_name(ifp)); goto out; } tmp_counter = ntohl(ch->carp_counter[0]); tmp_counter = tmp_counter<<32; tmp_counter += ntohl(ch->carp_counter[1]); /* XXX Replay protection goes here */ sc->sc_init_counter = false; sc->sc_counter = tmp_counter; sc_tv.tv_sec = sc->sc_advbase; sc_tv.tv_usec = DEMOTE_ADVSKEW(sc) * 1000000 / 256; ch_tv.tv_sec = ch->carp_advbase; ch_tv.tv_usec = ch->carp_advskew * 1000000 / 256; switch (sc->sc_state) { case INIT: break; case MASTER: /* * If we receive an advertisement from a master who's going to * be more frequent than us, go into BACKUP state. 
*/
		/* An equal interval also makes us step down. */
		if (timevalcmp(&sc_tv, &ch_tv, >) ||
		    timevalcmp(&sc_tv, &ch_tv, ==)) {
			callout_stop(&sc->sc_ad_tmo);
			carp_set_state(sc, BACKUP,
			    "more frequent advertisement received");
			carp_setrun(sc, 0);
			carp_delroute(sc);
		}
		break;
	case BACKUP:
		/*
		 * If we're pre-empting masters who advertise slower than us,
		 * and this one claims to be slower, treat him as down.
		 */
		if (V_carp_preempt && timevalcmp(&sc_tv, &ch_tv, <)) {
			carp_master_down_locked(sc,
			    "preempting a slower master");
			break;
		}

		/*
		 *  If the master is going to advertise at such a low frequency
		 *  that he's guaranteed to time out, we'd might as well just
		 *  treat him as timed out now.
		 */
		sc_tv.tv_sec = sc->sc_advbase * 3;
		if (timevalcmp(&sc_tv, &ch_tv, <)) {
			carp_master_down_locked(sc, "master will time out");
			break;
		}

		/*
		 * Otherwise, we reset the counter and wait for the next
		 * advertisement.
		 */
		carp_setrun(sc, af);
		break;
	}

out:
	CARP_UNLOCK(sc);
	m_freem(m);
}

/*
 * Process a received VRRPv3 advertisement.
 *
 * m         - the packet; always consumed (freed) by this function.
 * off       - offset of the VRRP header inside m.
 * af        - address family the packet arrived on.
 * ttl       - IPv4 TTL / IPv6 hop limit of the packet.
 * len       - number of bytes covered by the VRRP checksum.
 * phdrcksum - pseudo-header checksum computed by the caller.
 */
static void
vrrp_input_c(struct mbuf *m, int off, sa_family_t af, int ttl,
    int len, uint16_t phdrcksum)
{
	struct vrrpv3_header *vh = mtodo(m, off);
	struct ifnet *ifp = m->m_pkthdr.rcvif;
	struct ifaddr *ifa;
	struct carp_softc *sc;

	NET_EPOCH_ASSERT();
	MPASS(vh->vrrp_version == CARP_VERSION_VRRPv3);

	ifa = carp_find_ifa(m, af, vh->vrrp_vrtid);
	if (ifa == NULL) {
		m_freem(m);
		return;
	}

	sc = ifa->ifa_carp;
	CARP_LOCK(sc);

	/* Only the softc is needed from here on; drop the ifaddr reference. */
	ifa_free(ifa);

	/* verify the CARP version. */
	if (sc->sc_version != CARP_VERSION_VRRPv3) {
		CARP_UNLOCK(sc);

		CARPSTATS_INC(carps_badver);
		CARP_DEBUG("%s: invalid version %d\n", if_name(ifp),
		    vh->vrrp_version);
		m_freem(m);
		return;
	}

	/* verify that the IP TTL is 255. */
	if (ttl != CARP_DFLTTL) {
		CARPSTATS_INC(carps_badttl);
		CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__,
		    ttl, if_name(m->m_pkthdr.rcvif));
		goto out;
	}

	if (vrrp_checksum_verify(m, off, len, phdrcksum)) {
		CARPSTATS_INC(carps_badsum);
		CARP_DEBUG("%s: incorrect checksum for VRID %u@%s\n", __func__,
		    sc->sc_vhid, if_name(ifp));
		goto out;
	}

	/* RFC9568, 7.1 Receiving VRRP packets. */
	if (sc->sc_vrrp_prio == 255) {
		CARP_DEBUG("%s: our priority is 255. Ignore peer announcement.\n",
		    __func__);
		goto out;
	}

	/* XXX TODO Check IP address payload. */

	/* Remember the interval the peer advertises. */
	sc->sc_vrrp_master_inter = ntohs(vh->vrrp_max_adver_int);

	switch (sc->sc_state) {
	case INIT:
		break;
	case MASTER:
		/*
		 * If we receive an advertisement from a master who's going to
		 * be more frequent than us, go into BACKUP state.
		 * Same if the peer has a higher priority than us.
		 */
		if (ntohs(vh->vrrp_max_adver_int) < sc->sc_vrrp_adv_inter ||
		    vh->vrrp_priority > sc->sc_vrrp_prio) {
			callout_stop(&sc->sc_ad_tmo);
			carp_set_state(sc, BACKUP,
			    "more frequent advertisement received");
			carp_setrun(sc, 0);
			carp_delroute(sc);
		}
		break;
	case BACKUP:
		/*
		 * If we're pre-empting masters who advertise slower than us,
		 * and this one claims to be slower, treat him as down.
		 */
		if (V_carp_preempt &&
		    (ntohs(vh->vrrp_max_adver_int) > sc->sc_vrrp_adv_inter ||
		    vh->vrrp_priority < sc->sc_vrrp_prio)) {
			carp_master_down_locked(sc,
			    "preempting a slower master");
			break;
		}

		/*
		 * Otherwise, we reset the counter and wait for the next
		 * advertisement.
		 */
		carp_setrun(sc, af);
		break;
	}

out:
	CARP_UNLOCK(sc);
	m_freem(m);
}

/*
 * Attach a PACKET_TAG_CARP mbuf tag carrying the softc's vhid to m.
 * Returns 0 on success.  On allocation failure the mbuf is freed here
 * and ENOMEM is returned, so the caller must not touch m afterwards.
 */
static int
carp_tag(struct carp_softc *sc, struct mbuf *m)
{
	struct m_tag *mtag;

	/* Tag packet for carp_output */
	if ((mtag = m_tag_get(PACKET_TAG_CARP, sizeof(sc->sc_vhid),
	    M_NOWAIT)) == NULL) {
		m_freem(m);
		CARPSTATS_INC(carps_onomem);

		return (ENOMEM);
	}
	/* The tag payload follows the m_tag header. */
	bcopy(&sc->sc_vhid, mtag + 1, sizeof(sc->sc_vhid));
	m_tag_prepend(m, mtag);

	return (0);
}

/*
 * Fill in the counter and HMAC of an outgoing CARP header.  The counter
 * is seeded from arc4random() while sc_init_counter is set and simply
 * incremented otherwise.  The mbuf argument is not used here.
 */
static void
carp_prepare_ad(struct mbuf *m, struct carp_softc *sc, struct carp_header *ch)
{

	MPASS(sc->sc_version == CARP_VERSION_CARP);

	if (sc->sc_init_counter) {
		/* this could also be seconds since unix epoch */
		sc->sc_counter = arc4random();
		sc->sc_counter = sc->sc_counter << 32;
		sc->sc_counter += arc4random();
	} else
		sc->sc_counter++;

	/* The counter goes on the wire as two big-endian 32-bit halves. */
	ch->carp_counter[0] = htonl((sc->sc_counter>>32)&0xffffffff);
	ch->carp_counter[1] = htonl(sc->sc_counter&0xffffffff);

	carp_hmac_generate(sc, ch->carp_counter, ch->carp_md);
}

/* Send one advertisement using the sender for the softc's version. */
static inline void
send_ad_locked(struct carp_softc *sc)
{
	switch (sc->sc_version) {
	case CARP_VERSION_CARP:
		carp_send_ad_locked(sc);
		break;
	case CARP_VERSION_VRRPv3:
		vrrp_send_ad_locked(sc);
		break;
	}
}

/*
 * To avoid LORs and possible recursions this function shouldn't
 * be called directly, but scheduled via taskqueue.
 */
static void
carp_send_ad_all(void *ctx __unused, int pending __unused)
{
	struct carp_softc *sc;
	struct epoch_tracker et;

	NET_EPOCH_ENTER(et);
	mtx_lock(&carp_mtx);
	/* Advertise for every softc on the global list that is MASTER. */
	LIST_FOREACH(sc, &carp_list, sc_next)
		if (sc->sc_state == MASTER) {
			CARP_LOCK(sc);
			CURVNET_SET(sc->sc_carpdev->if_vnet);
			send_ad_locked(sc);
			CURVNET_RESTORE();
			CARP_UNLOCK(sc);
		}
	mtx_unlock(&carp_mtx);
	NET_EPOCH_EXIT(et);
}

/* Send a periodic advertisement, executed in callout context.
*/ static void carp_callout(void *v) { struct carp_softc *sc = v; struct epoch_tracker et; NET_EPOCH_ENTER(et); CARP_LOCK_ASSERT(sc); CURVNET_SET(sc->sc_carpdev->if_vnet); send_ad_locked(sc); CURVNET_RESTORE(); CARP_UNLOCK(sc); NET_EPOCH_EXIT(et); } static void carp_send_ad_error(struct carp_softc *sc, int error) { /* * We track errors and successful sends with this logic: * - Any error resets success counter to 0. * - MAX_ERRORS triggers demotion. * - MIN_SUCCESS successes resets error counter to 0. * - MIN_SUCCESS reverts demotion, if it was triggered before. */ if (error) { if (sc->sc_sendad_errors < INT_MAX) sc->sc_sendad_errors++; if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) { static const char fmt[] = "send error %d on %s"; char msg[sizeof(fmt) + IFNAMSIZ]; sprintf(msg, fmt, error, if_name(sc->sc_carpdev)); carp_demote_adj(V_carp_senderr_adj, msg); } sc->sc_sendad_success = 0; } else if (sc->sc_sendad_errors > 0) { if (++sc->sc_sendad_success >= CARP_SENDAD_MIN_SUCCESS) { if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) { static const char fmt[] = "send ok on %s"; char msg[sizeof(fmt) + IFNAMSIZ]; sprintf(msg, fmt, if_name(sc->sc_carpdev)); carp_demote_adj(-V_carp_senderr_adj, msg); } sc->sc_sendad_errors = 0; } } } /* * Pick the best ifaddr on the given ifp for sending CARP * advertisements. * * "Best" here is defined by ifa_preferred(). This function is much * much like ifaof_ifpforaddr() except that we just use ifa_preferred(). * * (This could be simplified to return the actual address, except that * it has a different format in AF_INET and AF_INET6.) 
*/ static struct ifaddr * carp_best_ifa(int af, struct ifnet *ifp) { struct ifaddr *ifa, *best; NET_EPOCH_ASSERT(); if (af >= AF_MAX) return (NULL); best = NULL; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == af && (best == NULL || ifa_preferred(best, ifa))) best = ifa; } if (best != NULL) ifa_ref(best); return (best); } static void carp_send_ad_locked(struct carp_softc *sc) { struct carp_header ch; struct timeval tv; struct ifaddr *ifa; struct carp_header *ch_ptr; struct mbuf *m; int len, advskew; NET_EPOCH_ASSERT(); CARP_LOCK_ASSERT(sc); MPASS(sc->sc_version == CARP_VERSION_CARP); advskew = DEMOTE_ADVSKEW(sc); tv.tv_sec = sc->sc_advbase; tv.tv_usec = advskew * 1000000 / 256; ch.carp_version = CARP_VERSION_CARP; ch.carp_type = CARP_ADVERTISEMENT; ch.carp_vhid = sc->sc_vhid; ch.carp_advbase = sc->sc_advbase; ch.carp_advskew = advskew; ch.carp_authlen = 7; /* XXX DEFINE */ ch.carp_pad1 = 0; /* must be zero */ ch.carp_cksum = 0; /* XXXGL: OpenBSD picks first ifaddr with needed family. 
*/ #ifdef INET if (sc->sc_naddrs) { struct ip *ip; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { CARPSTATS_INC(carps_onomem); goto resched; } len = sizeof(*ip) + sizeof(ch); m->m_pkthdr.len = len; m->m_pkthdr.rcvif = NULL; m->m_len = len; M_ALIGN(m, m->m_len); if (IN_MULTICAST(sc->sc_carpaddr.s_addr)) m->m_flags |= M_MCAST; ip = mtod(m, struct ip *); ip->ip_v = IPVERSION; ip->ip_hl = sizeof(*ip) >> 2; ip->ip_tos = V_carp_dscp << IPTOS_DSCP_OFFSET; ip->ip_len = htons(len); ip->ip_off = htons(IP_DF); ip->ip_ttl = CARP_DFLTTL; ip->ip_p = IPPROTO_CARP; ip->ip_sum = 0; ip_fillid(ip); ifa = carp_best_ifa(AF_INET, sc->sc_carpdev); if (ifa != NULL) { ip->ip_src.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr; ifa_free(ifa); } else ip->ip_src.s_addr = 0; ip->ip_dst = sc->sc_carpaddr; ch_ptr = (struct carp_header *)(&ip[1]); bcopy(&ch, ch_ptr, sizeof(ch)); carp_prepare_ad(m, sc, ch_ptr); if (IN_MULTICAST(ntohl(sc->sc_carpaddr.s_addr)) && carp_tag(sc, m) != 0) goto resched; m->m_data += sizeof(*ip); ch_ptr->carp_cksum = in_cksum(m, len - sizeof(*ip)); m->m_data -= sizeof(*ip); CARPSTATS_INC(carps_opackets); carp_send_ad_error(sc, ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_carpdev->if_carp->cif_imo, NULL)); } #endif /* INET */ #ifdef INET6 if (sc->sc_naddrs6) { struct ip6_hdr *ip6; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { CARPSTATS_INC(carps_onomem); goto resched; } len = sizeof(*ip6) + sizeof(ch); m->m_pkthdr.len = len; m->m_pkthdr.rcvif = NULL; m->m_len = len; M_ALIGN(m, m->m_len); ip6 = mtod(m, struct ip6_hdr *); bzero(ip6, sizeof(*ip6)); ip6->ip6_vfc |= IPV6_VERSION; /* Traffic class isn't defined in ip6 struct instead * it gets offset into flowid field */ ip6->ip6_flow |= htonl(V_carp_dscp << (IPV6_FLOWLABEL_LEN + IPTOS_DSCP_OFFSET)); ip6->ip6_hlim = CARP_DFLTTL; ip6->ip6_nxt = IPPROTO_CARP; /* set the source address */ ifa = carp_best_ifa(AF_INET6, sc->sc_carpdev); if (ifa != NULL) { bcopy(IFA_IN6(ifa), &ip6->ip6_src, sizeof(struct in6_addr)); 
ifa_free(ifa); } else /* This should never happen with IPv6. */ bzero(&ip6->ip6_src, sizeof(struct in6_addr)); /* Set the multicast destination. */ memcpy(&ip6->ip6_dst, &sc->sc_carpaddr6, sizeof(ip6->ip6_dst)); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_dst)) { if (in6_setscope(&ip6->ip6_dst, sc->sc_carpdev, NULL) != 0) { m_freem(m); CARP_DEBUG("%s: in6_setscope failed\n", __func__); goto resched; } } ch_ptr = (struct carp_header *)(&ip6[1]); bcopy(&ch, ch_ptr, sizeof(ch)); carp_prepare_ad(m, sc, ch_ptr); if (IN6_IS_ADDR_MULTICAST(&sc->sc_carpaddr6) && carp_tag(sc, m) != 0) goto resched; m->m_data += sizeof(*ip6); ch_ptr->carp_cksum = in_cksum(m, len - sizeof(*ip6)); m->m_data -= sizeof(*ip6); CARPSTATS_INC(carps_opackets6); carp_send_ad_error(sc, ip6_output(m, NULL, NULL, 0, &sc->sc_carpdev->if_carp->cif_im6o, NULL, NULL)); } #endif /* INET6 */ resched: callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), carp_callout, sc); } static void vrrp_send_ad_locked(struct carp_softc *sc) { struct vrrpv3_header *vh_ptr; struct ifaddr *ifa; struct mbuf *m; int len; struct vrrpv3_header vh = { .vrrp_version = CARP_VERSION_VRRPv3, .vrrp_type = VRRP_TYPE_ADVERTISEMENT, .vrrp_vrtid = sc->sc_vhid, .vrrp_priority = sc->sc_vrrp_prio, .vrrp_count_addr = 0, .vrrp_max_adver_int = htons(sc->sc_vrrp_adv_inter), .vrrp_checksum = 0, }; NET_EPOCH_ASSERT(); CARP_LOCK_ASSERT(sc); MPASS(sc->sc_version == CARP_VERSION_VRRPv3); #ifdef INET if (sc->sc_naddrs) { struct ip *ip; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { CARPSTATS_INC(carps_onomem); goto resched; } len = sizeof(*ip) + sizeof(vh); m->m_pkthdr.len = len; m->m_pkthdr.rcvif = NULL; m->m_len = len; M_ALIGN(m, m->m_len); m->m_flags |= M_MCAST; ip = mtod(m, struct ip *); ip->ip_v = IPVERSION; ip->ip_hl = sizeof(*ip) >> 2; ip->ip_tos = V_carp_dscp << IPTOS_DSCP_OFFSET; ip->ip_off = htons(IP_DF); ip->ip_ttl = CARP_DFLTTL; ip->ip_p = IPPROTO_CARP; ip->ip_sum = 0; ip_fillid(ip); ifa = carp_best_ifa(AF_INET, 
sc->sc_carpdev); if (ifa != NULL) { ip->ip_src.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr; ifa_free(ifa); } else ip->ip_src.s_addr = 0; ip->ip_dst.s_addr = htonl(INADDR_CARP_GROUP); /* Include the IP addresses in the announcement. */ for (int i = 0; i < (sc->sc_naddrs + sc->sc_naddrs6); i++) { struct sockaddr_in *in; MPASS(sc->sc_ifas[i] != NULL); if (sc->sc_ifas[i]->ifa_addr->sa_family != AF_INET) continue; in = (struct sockaddr_in *)sc->sc_ifas[i]->ifa_addr; if (m_append(m, sizeof(in->sin_addr), (caddr_t)&in->sin_addr) != 1) { m_freem(m); goto resched; } vh.vrrp_count_addr++; len += sizeof(in->sin_addr); } ip->ip_len = htons(len); vh_ptr = (struct vrrpv3_header *)mtodo(m, sizeof(*ip)); bcopy(&vh, vh_ptr, sizeof(vh)); vh_ptr->vrrp_checksum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl((uint16_t)(len - sizeof(*ip)) + ip->ip_p)); vh_ptr->vrrp_checksum = in_cksum_skip(m, len, sizeof(*ip)); if (carp_tag(sc, m)) goto resched; CARPSTATS_INC(carps_opackets); carp_send_ad_error(sc, ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_carpdev->if_carp->cif_imo, NULL)); } #endif #ifdef INET6 if (sc->sc_naddrs6) { struct ip6_hdr *ip6; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { CARPSTATS_INC(carps_onomem); goto resched; } len = sizeof(*ip6) + sizeof(vh); m->m_pkthdr.len = len; m->m_pkthdr.rcvif = NULL; m->m_len = len; M_ALIGN(m, m->m_len); m->m_flags |= M_MCAST; ip6 = mtod(m, struct ip6_hdr *); bzero(ip6, sizeof(*ip6)); ip6->ip6_vfc |= IPV6_VERSION; /* Traffic class isn't defined in ip6 struct instead * it gets offset into flowid field */ ip6->ip6_flow |= htonl(V_carp_dscp << (IPV6_FLOWLABEL_LEN + IPTOS_DSCP_OFFSET)); ip6->ip6_hlim = CARP_DFLTTL; ip6->ip6_nxt = IPPROTO_CARP; /* set the source address */ ifa = carp_best_ifa(AF_INET6, sc->sc_carpdev); if (ifa != NULL) { bcopy(IFA_IN6(ifa), &ip6->ip6_src, sizeof(struct in6_addr)); ifa_free(ifa); } else /* This should never happen with IPv6. 
*/ bzero(&ip6->ip6_src, sizeof(struct in6_addr)); /* Set the multicast destination. */ bzero(&ip6->ip6_dst, sizeof(ip6->ip6_dst)); ip6->ip6_dst.s6_addr16[0] = IPV6_ADDR_INT16_MLL; ip6->ip6_dst.s6_addr8[15] = 0x12; /* Include the IP addresses in the announcement. */ len = sizeof(vh); for (int i = 0; i < (sc->sc_naddrs + sc->sc_naddrs6); i++) { struct sockaddr_in6 *in6; MPASS(sc->sc_ifas[i] != NULL); if (sc->sc_ifas[i]->ifa_addr->sa_family != AF_INET6) continue; in6 = (struct sockaddr_in6 *)sc->sc_ifas[i]->ifa_addr; if (m_append(m, sizeof(in6->sin6_addr), (char *)&in6->sin6_addr) != 1) { m_freem(m); goto resched; } vh.vrrp_count_addr++; len += sizeof(in6->sin6_addr); } ip6->ip6_plen = htonl(len); vh_ptr = (struct vrrpv3_header *)mtodo(m, sizeof(*ip6)); bcopy(&vh, vh_ptr, sizeof(vh)); vh_ptr->vrrp_checksum = in6_cksum_pseudo(ip6, len, ip6->ip6_nxt, 0); vh_ptr->vrrp_checksum = in_cksum_skip(m, len + sizeof(*ip6), sizeof(*ip6)); if (in6_setscope(&ip6->ip6_dst, sc->sc_carpdev, NULL) != 0) { m_freem(m); CARP_DEBUG("%s: in6_setscope failed\n", __func__); goto resched; } if (carp_tag(sc, m)) goto resched; CARPSTATS_INC(carps_opackets6); carp_send_ad_error(sc, ip6_output(m, NULL, NULL, 0, &sc->sc_carpdev->if_carp->cif_im6o, NULL, NULL)); } #endif resched: callout_reset(&sc->sc_ad_tmo, sc->sc_vrrp_adv_inter * hz / 100, carp_callout, sc); } static void carp_addroute(struct carp_softc *sc) { struct ifaddr *ifa; CARP_FOREACH_IFA(sc, ifa) carp_ifa_addroute(ifa); } static void carp_ifa_addroute(struct ifaddr *ifa) { switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: in_addprefix(ifatoia(ifa)); ifa_add_loopback_route(ifa, (struct sockaddr *)&ifatoia(ifa)->ia_addr); break; #endif #ifdef INET6 case AF_INET6: ifa_add_loopback_route(ifa, (struct sockaddr *)&ifatoia6(ifa)->ia_addr); nd6_add_ifa_lle(ifatoia6(ifa)); break; #endif } } static void carp_delroute(struct carp_softc *sc) { struct ifaddr *ifa; CARP_FOREACH_IFA(sc, ifa) carp_ifa_delroute(ifa); } static void 
carp_ifa_delroute(struct ifaddr *ifa) { switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: ifa_del_loopback_route(ifa, (struct sockaddr *)&ifatoia(ifa)->ia_addr); in_scrubprefix(ifatoia(ifa), LLE_STATIC); break; #endif #ifdef INET6 case AF_INET6: ifa_del_loopback_route(ifa, (struct sockaddr *)&ifatoia6(ifa)->ia_addr); nd6_rem_ifa_lle(ifatoia6(ifa), 1); break; #endif } } int carp_master(struct ifaddr *ifa) { struct carp_softc *sc = ifa->ifa_carp; return (sc->sc_state == MASTER); } #ifdef INET /* * Broadcast a gratuitous ARP request containing * the virtual router MAC address for each IP address * associated with the virtual router. */ static void carp_send_arp(struct carp_softc *sc) { struct ifaddr *ifa; struct in_addr addr; NET_EPOCH_ASSERT(); CARP_FOREACH_IFA(sc, ifa) { if (ifa->ifa_addr->sa_family != AF_INET) continue; addr = ((struct sockaddr_in *)ifa->ifa_addr)->sin_addr; arp_announce_ifaddr(sc->sc_carpdev, addr, sc->sc_addr); } } int carp_iamatch(struct ifaddr *ifa, uint8_t **enaddr) { struct carp_softc *sc = ifa->ifa_carp; if (sc->sc_state == MASTER) { *enaddr = sc->sc_addr; return (1); } return (0); } #endif #ifdef INET6 static void carp_send_na(struct carp_softc *sc) { static struct in6_addr mcast = IN6ADDR_LINKLOCAL_ALLNODES_INIT; struct ifaddr *ifa; struct in6_addr *in6; CARP_FOREACH_IFA(sc, ifa) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; in6 = IFA_IN6(ifa); nd6_na_output(sc->sc_carpdev, &mcast, in6, ND_NA_FLAG_OVERRIDE, 1, NULL); DELAY(1000); /* XXX */ } } /* * Returns ifa in case it's a carp address and it is MASTER, or if the address * matches and is not a carp address. Returns NULL otherwise. 
*/ struct ifaddr * carp_iamatch6(struct ifnet *ifp, struct in6_addr *taddr) { struct ifaddr *ifa; NET_EPOCH_ASSERT(); ifa = NULL; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (!IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa))) continue; if (ifa->ifa_carp && ifa->ifa_carp->sc_state != MASTER) ifa = NULL; else ifa_ref(ifa); break; } return (ifa); } char * carp_macmatch6(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr) { struct ifaddr *ifa; NET_EPOCH_ASSERT(); IFNET_FOREACH_IFA(ifp, ifa) if (ifa->ifa_addr->sa_family == AF_INET6 && IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa))) { struct carp_softc *sc = ifa->ifa_carp; struct m_tag *mtag; mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct carp_softc *), M_NOWAIT); if (mtag == NULL) /* Better a bit than nothing. */ return (sc->sc_addr); bcopy(&sc, mtag + 1, sizeof(sc)); m_tag_prepend(m, mtag); return (sc->sc_addr); } return (NULL); } #endif /* INET6 */ int carp_forus(struct ifnet *ifp, u_char *dhost) { struct carp_softc *sc; uint8_t *ena = dhost; if (ena[0] || ena[1] || ena[2] != 0x5e || ena[3] || ena[4] != 1) return (0); CIF_LOCK(ifp->if_carp); IFNET_FOREACH_CARP(ifp, sc) { /* * CARP_LOCK() is not here, since would protect nothing, but * cause deadlock with if_bridge, calling this under its lock. */ if (sc->sc_state == MASTER && !bcmp(dhost, sc->sc_addr, ETHER_ADDR_LEN)) { CIF_UNLOCK(ifp->if_carp); return (1); } } CIF_UNLOCK(ifp->if_carp); return (0); } /* Master down timeout event, executed in callout context. 
*/ static void carp_master_down(void *v) { struct carp_softc *sc = v; struct epoch_tracker et; NET_EPOCH_ENTER(et); CARP_LOCK_ASSERT(sc); CURVNET_SET(sc->sc_carpdev->if_vnet); if (sc->sc_state == BACKUP) { carp_master_down_locked(sc, "master timed out"); } CURVNET_RESTORE(); CARP_UNLOCK(sc); NET_EPOCH_EXIT(et); } static void carp_master_down_locked(struct carp_softc *sc, const char *reason) { NET_EPOCH_ASSERT(); CARP_LOCK_ASSERT(sc); switch (sc->sc_state) { case BACKUP: carp_set_state(sc, MASTER, reason); send_ad_locked(sc); #ifdef INET carp_send_arp(sc); #endif #ifdef INET6 carp_send_na(sc); #endif carp_setrun(sc, 0); carp_addroute(sc); break; case INIT: case MASTER: #ifdef INVARIANTS panic("carp: VHID %u@%s: master_down event in %s state\n", sc->sc_vhid, if_name(sc->sc_carpdev), sc->sc_state ? "MASTER" : "INIT"); #endif break; } } /* * When in backup state, af indicates whether to reset the master down timer * for v4 or v6. If it's set to zero, reset the ones which are already pending. 
*/ static void carp_setrun(struct carp_softc *sc, sa_family_t af) { struct timeval tv; int timeout; CARP_LOCK_ASSERT(sc); if ((sc->sc_carpdev->if_flags & IFF_UP) == 0 || sc->sc_carpdev->if_link_state != LINK_STATE_UP || (sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0) || !V_carp_allow) return; switch (sc->sc_state) { case INIT: carp_set_state(sc, BACKUP, "initialization complete"); carp_setrun(sc, 0); break; case BACKUP: callout_stop(&sc->sc_ad_tmo); switch (sc->sc_version) { case CARP_VERSION_CARP: tv.tv_sec = 3 * sc->sc_advbase; tv.tv_usec = sc->sc_advskew * 1000000 / 256; timeout = tvtohz(&tv); break; case CARP_VERSION_VRRPv3: /* skew time */ timeout = (256 - sc->sc_vrrp_prio) * sc->sc_vrrp_master_inter / 256; timeout += (3 * sc->sc_vrrp_master_inter); timeout *= hz; timeout /= 100; /* master interval is in centiseconds */ break; } switch (af) { #ifdef INET case AF_INET: callout_reset(&sc->sc_md_tmo, timeout, carp_master_down, sc); break; #endif #ifdef INET6 case AF_INET6: callout_reset(&sc->sc_md6_tmo, timeout, carp_master_down, sc); break; #endif default: #ifdef INET if (sc->sc_naddrs) callout_reset(&sc->sc_md_tmo, timeout, carp_master_down, sc); #endif #ifdef INET6 if (sc->sc_naddrs6) callout_reset(&sc->sc_md6_tmo, timeout, carp_master_down, sc); #endif break; } break; case MASTER: switch (sc->sc_version) { case CARP_VERSION_CARP: tv.tv_sec = sc->sc_advbase; tv.tv_usec = sc->sc_advskew * 1000000 / 256; callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), carp_callout, sc); break; case CARP_VERSION_VRRPv3: callout_reset(&sc->sc_ad_tmo, sc->sc_vrrp_adv_inter * hz / 100, carp_callout, sc); break; } break; } } /* * Setup multicast structures. 
*/ static int carp_multicast_setup(struct carp_if *cif, sa_family_t sa) { struct ifnet *ifp = cif->cif_ifp; int error = 0; switch (sa) { #ifdef INET case AF_INET: { struct ip_moptions *imo = &cif->cif_imo; struct in_mfilter *imf; struct in_addr addr; if (ip_mfilter_first(&imo->imo_head) != NULL) return (0); imf = ip_mfilter_alloc(M_WAITOK, 0, 0); ip_mfilter_init(&imo->imo_head); imo->imo_multicast_vif = -1; addr.s_addr = htonl(INADDR_CARP_GROUP); if ((error = in_joingroup(ifp, &addr, NULL, &imf->imf_inm)) != 0) { ip_mfilter_free(imf); break; } ip_mfilter_insert(&imo->imo_head, imf); imo->imo_multicast_ifp = ifp; imo->imo_multicast_ttl = CARP_DFLTTL; imo->imo_multicast_loop = 0; break; } #endif #ifdef INET6 case AF_INET6: { struct ip6_moptions *im6o = &cif->cif_im6o; struct in6_mfilter *im6f[2]; struct in6_addr in6; if (ip6_mfilter_first(&im6o->im6o_head)) return (0); im6f[0] = ip6_mfilter_alloc(M_WAITOK, 0, 0); im6f[1] = ip6_mfilter_alloc(M_WAITOK, 0, 0); ip6_mfilter_init(&im6o->im6o_head); im6o->im6o_multicast_hlim = CARP_DFLTTL; im6o->im6o_multicast_ifp = ifp; /* Join IPv6 CARP multicast group. */ bzero(&in6, sizeof(in6)); in6.s6_addr16[0] = htons(0xff02); in6.s6_addr8[15] = 0x12; if ((error = in6_setscope(&in6, ifp, NULL)) != 0) { ip6_mfilter_free(im6f[0]); ip6_mfilter_free(im6f[1]); break; } if ((error = in6_joingroup(ifp, &in6, NULL, &im6f[0]->im6f_in6m, 0)) != 0) { ip6_mfilter_free(im6f[0]); ip6_mfilter_free(im6f[1]); break; } /* Join solicited multicast address. 
*/ bzero(&in6, sizeof(in6)); in6.s6_addr16[0] = htons(0xff02); in6.s6_addr32[1] = 0; in6.s6_addr32[2] = htonl(1); in6.s6_addr32[3] = 0; in6.s6_addr8[12] = 0xff; if ((error = in6_setscope(&in6, ifp, NULL)) != 0) { ip6_mfilter_free(im6f[0]); ip6_mfilter_free(im6f[1]); break; } if ((error = in6_joingroup(ifp, &in6, NULL, &im6f[1]->im6f_in6m, 0)) != 0) { in6_leavegroup(im6f[0]->im6f_in6m, NULL); ip6_mfilter_free(im6f[0]); ip6_mfilter_free(im6f[1]); break; } ip6_mfilter_insert(&im6o->im6o_head, im6f[0]); ip6_mfilter_insert(&im6o->im6o_head, im6f[1]); break; } #endif } return (error); } /* * Free multicast structures. */ static void carp_multicast_cleanup(struct carp_if *cif, sa_family_t sa) { #ifdef INET struct ip_moptions *imo = &cif->cif_imo; struct in_mfilter *imf; #endif #ifdef INET6 struct ip6_moptions *im6o = &cif->cif_im6o; struct in6_mfilter *im6f; #endif sx_assert(&carp_sx, SA_XLOCKED); switch (sa) { #ifdef INET case AF_INET: if (cif->cif_naddrs != 0) break; while ((imf = ip_mfilter_first(&imo->imo_head)) != NULL) { ip_mfilter_remove(&imo->imo_head, imf); in_leavegroup(imf->imf_inm, NULL); ip_mfilter_free(imf); } break; #endif #ifdef INET6 case AF_INET6: if (cif->cif_naddrs6 != 0) break; while ((im6f = ip6_mfilter_first(&im6o->im6o_head)) != NULL) { ip6_mfilter_remove(&im6o->im6o_head, im6f); in6_leavegroup(im6f->im6f_in6m, NULL); ip6_mfilter_free(im6f); } break; #endif } } int carp_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa) { struct m_tag *mtag; int vhid; if (!sa) return (0); switch (sa->sa_family) { #ifdef INET case AF_INET: break; #endif #ifdef INET6 case AF_INET6: break; #endif default: return (0); } mtag = m_tag_find(m, PACKET_TAG_CARP, NULL); if (mtag == NULL) return (0); bcopy(mtag + 1, &vhid, sizeof(vhid)); /* Set the source MAC address to the Virtual Router MAC Address. 
*/ switch (ifp->if_type) { case IFT_ETHER: case IFT_BRIDGE: case IFT_L2VLAN: { struct ether_header *eh; eh = mtod(m, struct ether_header *); eh->ether_shost[0] = 0; eh->ether_shost[1] = 0; eh->ether_shost[2] = 0x5e; eh->ether_shost[3] = 0; eh->ether_shost[4] = 1; eh->ether_shost[5] = vhid; } break; default: printf("%s: carp is not supported for the %d interface type\n", if_name(ifp), ifp->if_type); return (EOPNOTSUPP); } return (0); } static struct carp_softc* carp_alloc(struct ifnet *ifp, carp_version_t version, int vhid) { struct carp_softc *sc; struct carp_if *cif; sx_assert(&carp_sx, SA_XLOCKED); if ((cif = ifp->if_carp) == NULL) cif = carp_alloc_if(ifp); sc = malloc(sizeof(*sc), M_CARP, M_WAITOK); *sc = (struct carp_softc ){ .sc_vhid = vhid, .sc_version = version, .sc_state = INIT, .sc_carpdev = ifp, .sc_ifasiz = sizeof(struct ifaddr *), .sc_addr = { 0, 0, 0x5e, 0, 1, vhid }, }; sc->sc_ifas = malloc(sc->sc_ifasiz, M_CARP, M_WAITOK|M_ZERO); switch (version) { case CARP_VERSION_CARP: sc->sc_advbase = CARP_DFLTINTV; sc->sc_init_counter = true; sc->sc_carpaddr.s_addr = htonl(INADDR_CARP_GROUP); sc->sc_carpaddr6.s6_addr16[0] = IPV6_ADDR_INT16_MLL; sc->sc_carpaddr6.s6_addr8[15] = 0x12; break; case CARP_VERSION_VRRPv3: sc->sc_vrrp_adv_inter = 100; sc->sc_vrrp_master_inter = sc->sc_vrrp_adv_inter; sc->sc_vrrp_prio = 100; break; } CARP_LOCK_INIT(sc); #ifdef INET callout_init_mtx(&sc->sc_md_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED); #endif #ifdef INET6 callout_init_mtx(&sc->sc_md6_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED); #endif callout_init_mtx(&sc->sc_ad_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED); CIF_LOCK(cif); TAILQ_INSERT_TAIL(&cif->cif_vrs, sc, sc_list); CIF_UNLOCK(cif); mtx_lock(&carp_mtx); LIST_INSERT_HEAD(&carp_list, sc, sc_next); mtx_unlock(&carp_mtx); return (sc); } static void carp_grow_ifas(struct carp_softc *sc) { struct ifaddr **new; new = malloc(sc->sc_ifasiz * 2, M_CARP, M_WAITOK | M_ZERO); CARP_LOCK(sc); bcopy(sc->sc_ifas, new, sc->sc_ifasiz); 
free(sc->sc_ifas, M_CARP); sc->sc_ifas = new; sc->sc_ifasiz *= 2; CARP_UNLOCK(sc); } static void carp_destroy(struct carp_softc *sc) { struct ifnet *ifp = sc->sc_carpdev; struct carp_if *cif = ifp->if_carp; sx_assert(&carp_sx, SA_XLOCKED); if (sc->sc_suppress) carp_demote_adj(-V_carp_ifdown_adj, "vhid removed"); CARP_UNLOCK(sc); CIF_LOCK(cif); TAILQ_REMOVE(&cif->cif_vrs, sc, sc_list); CIF_UNLOCK(cif); mtx_lock(&carp_mtx); LIST_REMOVE(sc, sc_next); mtx_unlock(&carp_mtx); callout_drain(&sc->sc_ad_tmo); #ifdef INET callout_drain(&sc->sc_md_tmo); #endif #ifdef INET6 callout_drain(&sc->sc_md6_tmo); #endif CARP_LOCK_DESTROY(sc); free(sc->sc_ifas, M_CARP); free(sc, M_CARP); } static struct carp_if* carp_alloc_if(struct ifnet *ifp) { struct carp_if *cif; int error; cif = malloc(sizeof(*cif), M_CARP, M_WAITOK|M_ZERO); if ((error = ifpromisc(ifp, 1)) != 0) printf("%s: ifpromisc(%s) failed: %d\n", __func__, if_name(ifp), error); else cif->cif_flags |= CIF_PROMISC; CIF_LOCK_INIT(cif); cif->cif_ifp = ifp; TAILQ_INIT(&cif->cif_vrs); IF_ADDR_WLOCK(ifp); ifp->if_carp = cif; if_ref(ifp); IF_ADDR_WUNLOCK(ifp); return (cif); } static void carp_free_if(struct carp_if *cif) { struct ifnet *ifp = cif->cif_ifp; CIF_LOCK_ASSERT(cif); KASSERT(TAILQ_EMPTY(&cif->cif_vrs), ("%s: softc list not empty", __func__)); IF_ADDR_WLOCK(ifp); ifp->if_carp = NULL; IF_ADDR_WUNLOCK(ifp); CIF_LOCK_DESTROY(cif); if (cif->cif_flags & CIF_PROMISC) ifpromisc(ifp, 0); if_rele(ifp); free(cif, M_CARP); } static bool carp_carprcp(void *arg, struct carp_softc *sc, int priv) { struct carpreq *carpr = arg; CARP_LOCK(sc); carpr->carpr_state = sc->sc_state; carpr->carpr_vhid = sc->sc_vhid; switch (sc->sc_version) { case CARP_VERSION_CARP: carpr->carpr_advbase = sc->sc_advbase; carpr->carpr_advskew = sc->sc_advskew; if (priv) bcopy(sc->sc_key, carpr->carpr_key, sizeof(carpr->carpr_key)); else bzero(carpr->carpr_key, sizeof(carpr->carpr_key)); break; case CARP_VERSION_VRRPv3: break; } CARP_UNLOCK(sc); return (true); } 
/*
 * Create or reconfigure the vhid described by 'carpr' on 'ifp'.
 * Rejects an out-of-range vhid, out-of-range CARP advbase/advskew and an
 * unknown version with EINVAL.  An existing softc with the same vhid is
 * reused (EINVAL if its version differs); otherwise one is allocated with
 * carp_alloc().  Settings are applied under the softc lock, and a requested
 * BACKUP/MASTER state is acted upon only when the softc has left INIT.
 */
static int carp_ioctl_set(if_t ifp, struct carpkreq *carpr) { struct epoch_tracker et; struct carp_softc *sc = NULL; int error = 0; if (carpr->carpr_vhid <= 0 || carpr->carpr_vhid > CARP_MAXVHID) return (EINVAL); switch (carpr->carpr_version) { case CARP_VERSION_CARP: if (carpr->carpr_advbase != 0 && (carpr->carpr_advbase > 255 || carpr->carpr_advbase < CARP_DFLTINTV)) return (EINVAL); if (carpr->carpr_advskew < 0 || carpr->carpr_advskew >= 255) return (EINVAL); break; case CARP_VERSION_VRRPv3: /* XXXGL: shouldn't we check anything? */ break; default: return (EINVAL); } if (ifp->if_carp) { IFNET_FOREACH_CARP(ifp, sc) if (sc->sc_vhid == carpr->carpr_vhid) break; } if (sc == NULL) sc = carp_alloc(ifp, carpr->carpr_version, carpr->carpr_vhid); else if (sc->sc_version != carpr->carpr_version) return (EINVAL); CARP_LOCK(sc); switch (sc->sc_version) { case CARP_VERSION_CARP: if (carpr->carpr_advbase != 0) sc->sc_advbase = carpr->carpr_advbase; sc->sc_advskew = carpr->carpr_advskew; if (carpr->carpr_addr.s_addr != INADDR_ANY) sc->sc_carpaddr = carpr->carpr_addr; if (!IN6_IS_ADDR_UNSPECIFIED(&carpr->carpr_addr6)) { memcpy(&sc->sc_carpaddr6, &carpr->carpr_addr6, sizeof(sc->sc_carpaddr6)); } if (carpr->carpr_key[0] != '\0') { bcopy(carpr->carpr_key, sc->sc_key, sizeof(sc->sc_key)); carp_hmac_prepare(sc); } break; case CARP_VERSION_VRRPv3: if (carpr->carpr_vrrp_priority != 0) sc->sc_vrrp_prio = carpr->carpr_vrrp_priority; if (carpr->carpr_vrrp_adv_inter) sc->sc_vrrp_adv_inter = carpr->carpr_vrrp_adv_inter; break; } if (sc->sc_state != INIT && carpr->carpr_state != sc->sc_state) { switch (carpr->carpr_state) { case BACKUP: callout_stop(&sc->sc_ad_tmo); carp_set_state(sc, BACKUP, "user requested via ifconfig"); carp_setrun(sc, 0); carp_delroute(sc); break; case MASTER: NET_EPOCH_ENTER(et); carp_master_down_locked(sc, "user requested via ifconfig"); NET_EPOCH_EXIT(et); break; default: break; } } CARP_UNLOCK(sc); return (error); }
/*
 * Report vhids of 'ifp' through the 'outfn' callback: the single vhid named
 * by carpr_vhid, or every vhid when carpr_vhid is 0.  carpr_count is updated
 * with the number of entries reported.  Returns EINVAL for a bad vhid,
 * EMSGSIZE when carpr_count is too small, ENOENT when the interface has no
 * carp state or the vhid is unknown, and ENOMEM when 'outfn' fails.
 */
static int carp_ioctl_get(if_t ifp, struct ucred
*cred, struct carpreq *carpr, bool (*outfn)(void *, struct carp_softc *, int), void *arg) { int priveleged; struct carp_softc *sc; if (carpr->carpr_vhid < 0 || carpr->carpr_vhid > CARP_MAXVHID) return (EINVAL); if (carpr->carpr_count < 1) return (EMSGSIZE); if (ifp->if_carp == NULL) return (ENOENT); priveleged = (priv_check_cred(cred, PRIV_NETINET_CARP) == 0); if (carpr->carpr_vhid != 0) { IFNET_FOREACH_CARP(ifp, sc) if (sc->sc_vhid == carpr->carpr_vhid) break; if (sc == NULL) return (ENOENT); if (! outfn(arg, sc, priveleged)) return (ENOMEM); carpr->carpr_count = 1; } else { int count; count = 0; IFNET_FOREACH_CARP(ifp, sc) count++; if (count > carpr->carpr_count) return (EMSGSIZE); IFNET_FOREACH_CARP(ifp, sc) { if (! outfn(arg, sc, priveleged)) return (ENOMEM); carpr->carpr_count = count; } } return (0); }
/*
 * SIOCSVH/SIOCGVH ioctl entry point.  Copies the request in, looks up the
 * interface by name (reference dropped at 'out'), requires a supported,
 * multicast-capable interface and serializes on carp_sx.
 * NOTE(review): SIOCGVH hands the single on-stack 'carpr' to carp_carprcp()
 * as the output buffer yet copies out carpr_count * sizeof(carpr) bytes;
 * with more than one vhid this looks like it reads past 'carpr' -- confirm.
 */
int carp_ioctl(struct ifreq *ifr, u_long cmd, struct thread *td) { struct carpreq carpr; struct carpkreq carprk = { .carpr_version = CARP_VERSION_CARP, }; struct ifnet *ifp; int error = 0; if ((error = copyin(ifr_data_get_ptr(ifr), &carpr, sizeof carpr))) return (error); ifp = ifunit_ref(ifr->ifr_name); if ((error = carp_is_supported_if(ifp)) != 0) goto out; if ((ifp->if_flags & IFF_MULTICAST) == 0) { error = EADDRNOTAVAIL; goto out; } sx_xlock(&carp_sx); switch (cmd) { case SIOCSVH: if ((error = priv_check(td, PRIV_NETINET_CARP))) break; memcpy(&carprk, &carpr, sizeof(carpr)); error = carp_ioctl_set(ifp, &carprk); break; case SIOCGVH: error = carp_ioctl_get(ifp, td->td_ucred, &carpr, carp_carprcp, &carpr); if (error == 0) { error = copyout(&carpr, (char *)ifr_data_get_ptr(ifr), carpr.carpr_count * sizeof(carpr)); } break; default: error = EINVAL; } sx_xunlock(&carp_sx); out: if (ifp != NULL) if_rele(ifp); return (error); }
/* Return the vhid 'ifa' is attached to, or 0 for NULL / non-carp addresses. */
static int carp_get_vhid(struct ifaddr *ifa) { if (ifa == NULL || ifa->ifa_carp == NULL) return (0); return (ifa->ifa_carp->sc_vhid); }
/*
 * Attach address 'ifa' to the existing vhid 'vhid' of its interface.
 * Returns EPROTOTYPE for an unsupported family, ENOPROTOOPT when the
 * interface has no carp state, ENOENT when the vhid does not exist, or the
 * multicast setup error.  On success the softc holds a reference on 'ifa'.
 */
int carp_attach(struct ifaddr *ifa, int vhid) { struct ifnet *ifp = ifa->ifa_ifp; struct carp_if *cif =
ifp->if_carp; struct carp_softc *sc; int index, error; KASSERT(ifa->ifa_carp == NULL, ("%s: ifa %p attached", __func__, ifa)); switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: #endif #ifdef INET6 case AF_INET6: #endif break; default: return (EPROTOTYPE); } sx_xlock(&carp_sx); if (ifp->if_carp == NULL) { sx_xunlock(&carp_sx); return (ENOPROTOOPT); } IFNET_FOREACH_CARP(ifp, sc) if (sc->sc_vhid == vhid) break; if (sc == NULL) { sx_xunlock(&carp_sx); return (ENOENT); } error = carp_multicast_setup(cif, ifa->ifa_addr->sa_family); if (error) { CIF_FREE(cif); sx_xunlock(&carp_sx); return (error); } index = sc->sc_naddrs + sc->sc_naddrs6 + 1; if (index > sc->sc_ifasiz / sizeof(struct ifaddr *)) carp_grow_ifas(sc); switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: cif->cif_naddrs++; sc->sc_naddrs++; break; #endif #ifdef INET6 case AF_INET6: cif->cif_naddrs6++; sc->sc_naddrs6++; break; #endif } ifa_ref(ifa); CARP_LOCK(sc); sc->sc_ifas[index - 1] = ifa; ifa->ifa_carp = sc; if (sc->sc_version == CARP_VERSION_CARP) carp_hmac_prepare(sc); carp_sc_state(sc); CARP_UNLOCK(sc); sx_xunlock(&carp_sx); return (0); }
/*
 * Detach 'ifa' from its vhid: remove it from the softc's address array,
 * drop its routes and multicast membership, and release the reference taken
 * by carp_attach().  Unless 'keep_cif' is set, a softc left without
 * addresses is destroyed and CIF_FREE() is applied to the interface state.
 */
void carp_detach(struct ifaddr *ifa, bool keep_cif) { struct ifnet *ifp = ifa->ifa_ifp; struct carp_if *cif = ifp->if_carp; struct carp_softc *sc = ifa->ifa_carp; int i, index; KASSERT(sc != NULL, ("%s: %p not attached", __func__, ifa)); sx_xlock(&carp_sx); CARP_LOCK(sc); /* Shift array.
*/ index = sc->sc_naddrs + sc->sc_naddrs6; for (i = 0; i < index; i++) if (sc->sc_ifas[i] == ifa) break; KASSERT(i < index, ("%s: %p no backref", __func__, ifa)); for (; i < index - 1; i++) sc->sc_ifas[i] = sc->sc_ifas[i+1]; sc->sc_ifas[index - 1] = NULL; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: cif->cif_naddrs--; sc->sc_naddrs--; break; #endif #ifdef INET6 case AF_INET6: cif->cif_naddrs6--; sc->sc_naddrs6--; break; #endif } carp_ifa_delroute(ifa); carp_multicast_cleanup(cif, ifa->ifa_addr->sa_family); ifa->ifa_carp = NULL; ifa_free(ifa); if (sc->sc_version == CARP_VERSION_CARP) carp_hmac_prepare(sc); carp_sc_state(sc); if (!keep_cif && sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0) carp_destroy(sc); else CARP_UNLOCK(sc); if (!keep_cif) CIF_FREE(cif); sx_xunlock(&carp_sx); }
/*
 * Switch the softc to 'state' if it differs from the current one, logging
 * the transition with 'reason' and emitting a devctl "CARP" notification
 * whose subsystem is "<vhid>@<ifname>".  Requires the softc lock.
 */
static void carp_set_state(struct carp_softc *sc, int state, const char *reason) { CARP_LOCK_ASSERT(sc); if (sc->sc_state != state) { const char *carp_states[] = { CARP_STATES }; char subsys[IFNAMSIZ+5]; snprintf(subsys, IFNAMSIZ+5, "%u@%s", sc->sc_vhid, if_name(sc->sc_carpdev)); CARP_LOG("%s: %s -> %s (%s)\n", subsys, carp_states[sc->sc_state], carp_states[state], reason); sc->sc_state = state; devctl_notify("CARP", subsys, carp_states[state], NULL); } }
/* Link state hook: re-evaluate every vhid of 'ifp' under the cif lock. */
static void carp_linkstate(struct ifnet *ifp) { struct carp_softc *sc; CIF_LOCK(ifp->if_carp); IFNET_FOREACH_CARP(ifp, sc) { CARP_LOCK(sc); carp_sc_state(sc); CARP_UNLOCK(sc); } CIF_UNLOCK(ifp->if_carp); }
/*
 * Re-evaluate a softc after a change of interface state or of the carp
 * "allow" setting.  When the interface is unusable or carp is disallowed,
 * all timers are stopped, the softc goes to INIT, its routes are removed and
 * the demotion counter is raised once (tracked by sc_suppress).  Otherwise
 * the softc is restarted from INIT and a previous demotion is undone.
 * Requires the softc lock.
 */
static void carp_sc_state(struct carp_softc *sc) { CARP_LOCK_ASSERT(sc); if (sc->sc_carpdev->if_link_state != LINK_STATE_UP || !(sc->sc_carpdev->if_flags & IFF_UP) || !V_carp_allow) { callout_stop(&sc->sc_ad_tmo); #ifdef INET callout_stop(&sc->sc_md_tmo); #endif #ifdef INET6 callout_stop(&sc->sc_md6_tmo); #endif carp_set_state(sc, INIT, "hardware interface down"); carp_setrun(sc, 0); carp_delroute(sc); if (!sc->sc_suppress) carp_demote_adj(V_carp_ifdown_adj, "interface down"); sc->sc_suppress = 1; } else { carp_set_state(sc,
INIT, "hardware interface up"); carp_setrun(sc, 0); if (sc->sc_suppress) carp_demote_adj(-V_carp_ifdown_adj, "interface up"); sc->sc_suppress = 0; } }
/*
 * Atomically add 'adj' to the demotion counter, log it, and queue the
 * carp_sendall_task on the swi taskqueue.
 */
static void carp_demote_adj(int adj, char *reason) { atomic_add_int(&V_carp_demotion, adj); CARP_LOG("demoted by %d to %d (%s)\n", adj, V_carp_demotion, reason); taskqueue_enqueue(taskqueue_swi, &carp_sendall_task); }
/*
 * Sysctl handler for the "allow" knob: on a change, re-evaluate every softc
 * that belongs to the current vnet.
 */
static int carp_allow_sysctl(SYSCTL_HANDLER_ARGS) { int new, error; struct carp_softc *sc; new = V_carp_allow; error = sysctl_handle_int(oidp, &new, 0, req); if (error || !req->newptr) return (error); if (V_carp_allow != new) { V_carp_allow = new; mtx_lock(&carp_mtx); LIST_FOREACH(sc, &carp_list, sc_next) { CARP_LOCK(sc); if (curvnet == sc->sc_carpdev->if_vnet) carp_sc_state(sc); CARP_UNLOCK(sc); } mtx_unlock(&carp_mtx); } return (0); }
/* Sysctl handler for the DSCP value; accepts 0..63, EINVAL otherwise. */
static int carp_dscp_sysctl(SYSCTL_HANDLER_ARGS) { int new, error; new = V_carp_dscp; error = sysctl_handle_int(oidp, &new, 0, req); if (error || !req->newptr) return (error); if (new < 0 || new > 63) return (EINVAL); V_carp_dscp = new; return (0); }
/*
 * Sysctl handler for the demotion counter.  The written value is passed to
 * carp_demote_adj() as an adjustment, i.e. it is added to the counter rather
 * than replacing it.
 */
static int carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS) { int new, error; new = V_carp_demotion; error = sysctl_handle_int(oidp, &new, 0, req); if (error || !req->newptr) return (error); carp_demote_adj(new, "sysctl"); return (0); }
/*
 * Netlink attribute callback for the CARP key: copies at most CARP_KEY_LEN
 * bytes into 'target' and rejects longer attributes with EINVAL.
 */
static int nlattr_get_carp_key(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) { if (__predict_false(NLA_DATA_LEN(nla) > CARP_KEY_LEN)) return (EINVAL); memcpy(target, NLA_DATA_CONST(nla), NLA_DATA_LEN(nla)); return (0); }
/* Context handed to carp_nl_send() through carp_ioctl_get()'s 'arg'. */
struct carp_nl_send_args { struct nlmsghdr *hdr; struct nl_pstate *npt; };
/*
 * carp_ioctl_get() output callback for netlink: emit one CARP_NL_CMD_GET
 * reply message describing 'sc'.  The key attribute is added only for
 * privileged requesters.  Returns false (after aborting the message) when
 * the writer runs out of space.
 */
static bool carp_nl_send(void *arg, struct carp_softc *sc, int priv) { struct carp_nl_send_args *nlsa = arg; struct nlmsghdr *hdr = nlsa->hdr; struct nl_pstate *npt = nlsa->npt; struct nl_writer *nw = npt->nw; struct genlmsghdr *ghdr_new; if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr))) { nlmsg_abort(nw); return (false); } ghdr_new = nlmsg_reserve_object(nw, struct
genlmsghdr); if (ghdr_new == NULL) { nlmsg_abort(nw); return (false); } ghdr_new->cmd = CARP_NL_CMD_GET; ghdr_new->version = 0; ghdr_new->reserved = 0; CARP_LOCK(sc); nlattr_add_u32(nw, CARP_NL_VHID, sc->sc_vhid); nlattr_add_u32(nw, CARP_NL_STATE, sc->sc_state); nlattr_add_u8(nw, CARP_NL_VERSION, sc->sc_version); switch (sc->sc_version) { case CARP_VERSION_CARP: nlattr_add_s32(nw, CARP_NL_ADVBASE, sc->sc_advbase); nlattr_add_s32(nw, CARP_NL_ADVSKEW, sc->sc_advskew); nlattr_add_in_addr(nw, CARP_NL_ADDR, &sc->sc_carpaddr); nlattr_add_in6_addr(nw, CARP_NL_ADDR6, &sc->sc_carpaddr6); if (priv) nlattr_add(nw, CARP_NL_KEY, sizeof(sc->sc_key), sc->sc_key); break; case CARP_VERSION_VRRPv3: nlattr_add_u8(nw, CARP_NL_VRRP_PRIORITY, sc->sc_vrrp_prio); nlattr_add_u16(nw, CARP_NL_VRRP_ADV_INTER, sc->sc_vrrp_adv_inter); break; } CARP_UNLOCK(sc); if (! nlmsg_end(nw)) { nlmsg_abort(nw); return (false); } return (true); }
/* Parsed form of the netlink attributes accepted by the get/set commands. */
struct nl_carp_parsed { unsigned int ifindex; char *ifname; uint32_t state; uint32_t vhid; int32_t advbase; int32_t advskew; char key[CARP_KEY_LEN]; struct in_addr addr; struct in6_addr addr6; carp_version_t version; uint8_t vrrp_prio; uint16_t vrrp_adv_inter; }; #define _OUT(_field) offsetof(struct nl_carp_parsed, _field) static const struct nlattr_parser nla_p_set[] = { { .type = CARP_NL_VHID, .off = _OUT(vhid), .cb = nlattr_get_uint32 }, { .type = CARP_NL_STATE, .off = _OUT(state), .cb = nlattr_get_uint32 }, { .type = CARP_NL_ADVBASE, .off = _OUT(advbase), .cb = nlattr_get_uint32 }, { .type = CARP_NL_ADVSKEW, .off = _OUT(advskew), .cb = nlattr_get_uint32 }, { .type = CARP_NL_KEY, .off = _OUT(key), .cb = nlattr_get_carp_key }, { .type = CARP_NL_IFINDEX, .off = _OUT(ifindex), .cb = nlattr_get_uint32 }, { .type = CARP_NL_ADDR, .off = _OUT(addr), .cb = nlattr_get_in_addr }, { .type = CARP_NL_ADDR6, .off = _OUT(addr6), .cb = nlattr_get_in6_addr }, { .type = CARP_NL_IFNAME, .off = _OUT(ifname), .cb = nlattr_get_string }, { .type = CARP_NL_VERSION, .off =
_OUT(version), .cb = nlattr_get_uint8 }, { .type = CARP_NL_VRRP_PRIORITY, .off = _OUT(vrrp_prio), .cb = nlattr_get_uint8 }, { .type = CARP_NL_VRRP_ADV_INTER, .off = _OUT(vrrp_adv_inter), .cb = nlattr_get_uint16 }, }; NL_DECLARE_PARSER(carp_parser, struct genlmsghdr, nlf_p_empty, nla_p_set); #undef _OUT
/*
 * Netlink CARP_NL_CMD_GET handler: resolve the interface by name or index,
 * then dump the requested vhid (or all of them when vhid is 0) as a
 * multipart reply using carp_ioctl_get() with carp_nl_send().
 */
static int carp_nl_get(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct nl_carp_parsed attrs = { }; struct carp_nl_send_args args; struct carpreq carpr = { }; struct epoch_tracker et; if_t ifp = NULL; int error; error = nl_parse_nlmsg(hdr, &carp_parser, npt, &attrs); if (error != 0) return (error); NET_EPOCH_ENTER(et); if (attrs.ifname != NULL) ifp = ifunit_ref(attrs.ifname); else if (attrs.ifindex != 0) ifp = ifnet_byindex_ref(attrs.ifindex); NET_EPOCH_EXIT(et); if ((error = carp_is_supported_if(ifp)) != 0) goto out; hdr->nlmsg_flags |= NLM_F_MULTI; args.hdr = hdr; args.npt = npt; carpr.carpr_vhid = attrs.vhid; carpr.carpr_count = CARP_MAXVHID; sx_xlock(&carp_sx); error = carp_ioctl_get(ifp, nlp_get_cred(npt->nlp), &carpr, carp_nl_send, &args); sx_xunlock(&carp_sx); if (!
nlmsg_end_dump(npt->nw, error, hdr)) error = ENOMEM; out: if (ifp != NULL) if_rele(ifp); return (error); }
/*
 * Netlink CARP_NL_CMD_SET handler: validate the parsed attributes (a missing
 * version means CARP for compatibility), resolve the interface, translate
 * the attributes into a struct carpkreq and apply it with carp_ioctl_set()
 * under carp_sx.
 */
static int carp_nl_set(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct nl_carp_parsed attrs = { }; struct carpkreq carpr; struct epoch_tracker et; if_t ifp = NULL; int error; error = nl_parse_nlmsg(hdr, &carp_parser, npt, &attrs); if (error != 0) return (error); if (attrs.vhid <= 0 || attrs.vhid > CARP_MAXVHID) return (EINVAL); if (attrs.state > CARP_MAXSTATE) return (EINVAL); if (attrs.version == 0) /* compat with pre-VRRPv3 */ attrs.version = CARP_VERSION_CARP; switch (attrs.version) { case CARP_VERSION_CARP: if (attrs.advbase < 0 || attrs.advskew < 0) return (EINVAL); if (attrs.advbase > 255) return (EINVAL); if (attrs.advskew >= 255) return (EINVAL); break; case CARP_VERSION_VRRPv3: if (attrs.vrrp_adv_inter > VRRP_MAX_INTERVAL) return (EINVAL); break; default: return (EINVAL); } NET_EPOCH_ENTER(et); if (attrs.ifname != NULL) ifp = ifunit_ref(attrs.ifname); else if (attrs.ifindex != 0) ifp = ifnet_byindex_ref(attrs.ifindex); NET_EPOCH_EXIT(et); if ((error = carp_is_supported_if(ifp)) != 0) goto out; if ((ifp->if_flags & IFF_MULTICAST) == 0) { error = EADDRNOTAVAIL; goto out; } carpr.carpr_count = 1; carpr.carpr_vhid = attrs.vhid; carpr.carpr_state = attrs.state; carpr.carpr_version = attrs.version; switch (attrs.version) { case CARP_VERSION_CARP: carpr.carpr_advbase = attrs.advbase; carpr.carpr_advskew = attrs.advskew; carpr.carpr_addr = attrs.addr; carpr.carpr_addr6 = attrs.addr6; memcpy(&carpr.carpr_key, &attrs.key, sizeof(attrs.key)); break; case CARP_VERSION_VRRPv3: carpr.carpr_vrrp_priority = attrs.vrrp_prio; carpr.carpr_vrrp_adv_inter = attrs.vrrp_adv_inter; break; } sx_xlock(&carp_sx); error = carp_ioctl_set(ifp, &carpr); sx_xunlock(&carp_sx); out: if (ifp != NULL) if_rele(ifp); return (error); }
/* Netlink parser list and generic netlink command table for the CARP family. */
static const struct nlhdr_parser *all_parsers[] = { &carp_parser }; static const struct genl_cmd carp_cmds[] = { { .cmd_num = CARP_NL_CMD_GET,
.cmd_name = "SIOCGVH", .cmd_cb = carp_nl_get, .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP | GENL_CMD_CAP_HASPOL, }, { .cmd_num = CARP_NL_CMD_SET, .cmd_name = "SIOCSVH", .cmd_cb = carp_nl_set, .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_NETINET_CARP, }, }; +static uint16_t carp_family_id; static void carp_nl_register(void) { bool ret __diagused; - int family_id __diagused; NL_VERIFY_PARSERS(all_parsers); - family_id = genl_register_family(CARP_NL_FAMILY_NAME, 0, 2, + carp_family_id = genl_register_family(CARP_NL_FAMILY_NAME, 0, 2, CARP_NL_CMD_MAX); - MPASS(family_id != 0); + MPASS(carp_family_id != 0); - ret = genl_register_cmds(CARP_NL_FAMILY_NAME, carp_cmds, - nitems(carp_cmds)); + ret = genl_register_cmds(carp_family_id, carp_cmds, nitems(carp_cmds)); MPASS(ret); } static void carp_nl_unregister(void) { - genl_unregister_family(CARP_NL_FAMILY_NAME); + genl_unregister_family(carp_family_id); }
/*
 * Undo carp_mod_load(): unregister netlink and the protocol handlers, clear
 * the exported hook pointers, drain the sendall task and destroy the global
 * locks.  carp_mtx is unlocked here, so it is entered with that mutex held.
 * NOTE(review): the error paths of carp_mod_load() call this without taking
 * carp_mtx first -- confirm that is intended.
 */
static void carp_mod_cleanup(void) { carp_nl_unregister(); #ifdef INET (void)ipproto_unregister(IPPROTO_CARP); carp_iamatch_p = NULL; #endif #ifdef INET6 (void)ip6proto_unregister(IPPROTO_CARP); carp_iamatch6_p = NULL; carp_macmatch6_p = NULL; #endif carp_ioctl_p = NULL; carp_attach_p = NULL; carp_detach_p = NULL; carp_get_vhid_p = NULL; carp_linkstate_p = NULL; carp_forus_p = NULL; carp_output_p = NULL; carp_demote_adj_p = NULL; carp_master_p = NULL; mtx_unlock(&carp_mtx); taskqueue_drain(taskqueue_swi, &carp_sendall_task); mtx_destroy(&carp_mtx); sx_destroy(&carp_sx); }
/* Per-vnet initialization: fetch the "allow" tunable. */
static void ipcarp_sysinit(void) { /* Load allow as tunable so to postpone carp start after module load */ TUNABLE_INT_FETCH("net.inet.carp.allow", &V_carp_allow); } VNET_SYSINIT(ip_carp, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, ipcarp_sysinit, NULL);
/*
 * Module load: initialize the global locks and list, publish the hook
 * pointers, register the protocol input handlers and the netlink family.
 * A protocol registration failure cleans up and returns the error.
 */
static int carp_mod_load(void) { int err; mtx_init(&carp_mtx, "carp_mtx", NULL, MTX_DEF); sx_init(&carp_sx, "carp_sx"); LIST_INIT(&carp_list); carp_get_vhid_p = carp_get_vhid; carp_forus_p = carp_forus; carp_output_p = carp_output;
carp_linkstate_p = carp_linkstate; carp_ioctl_p = carp_ioctl; carp_attach_p = carp_attach; carp_detach_p = carp_detach; carp_demote_adj_p = carp_demote_adj; carp_master_p = carp_master; #ifdef INET6 carp_iamatch6_p = carp_iamatch6; carp_macmatch6_p = carp_macmatch6; err = ip6proto_register(IPPROTO_CARP, carp6_input, NULL); if (err) { printf("carp: error %d registering with INET6\n", err); carp_mod_cleanup(); return (err); } #endif #ifdef INET carp_iamatch_p = carp_iamatch; err = ipproto_register(IPPROTO_CARP, carp_input, NULL); if (err) { printf("carp: error %d registering with INET\n", err); carp_mod_cleanup(); return (err); } #endif carp_nl_register(); return (0); }
/*
 * Module event handler.  Unload is refused with EBUSY while any softc is
 * still on the global list; otherwise carp_mod_cleanup() runs with carp_mtx
 * held.
 */
static int carp_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: return carp_mod_load(); /* NOTREACHED */ case MOD_UNLOAD: mtx_lock(&carp_mtx); if (LIST_EMPTY(&carp_list)) carp_mod_cleanup(); else { mtx_unlock(&carp_mtx); return (EBUSY); } break; default: return (EINVAL); } return (0); } static moduledata_t carp_mod = { "carp", carp_modevent, 0 }; DECLARE_MODULE(carp, carp_mod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); diff --git a/sys/netlink/netlink_ctl.h b/sys/netlink/netlink_ctl.h index 895f70322a29..e7566552ea32 100644 --- a/sys/netlink/netlink_ctl.h +++ b/sys/netlink/netlink_ctl.h @@ -1,126 +1,123 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution.
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _NETLINK_NETLINK_CTL_H_ #define _NETLINK_NETLINK_CTL_H_ #ifdef _KERNEL /* * This file provides headers for the public KPI of the netlink * subsystem */ #include MALLOC_DECLARE(M_NETLINK); /* * Macro for handling attribute TLVs */ #define _roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) #define NETLINK_ALIGN_SIZE sizeof(uint32_t) #define NETLINK_ALIGN(_len) _roundup2(_len, NETLINK_ALIGN_SIZE) #define NLA_ALIGN_SIZE sizeof(uint32_t) #define NLA_ALIGN(_len) _roundup2(_len, NLA_ALIGN_SIZE) #define NLA_HDRLEN ((uint16_t)sizeof(struct nlattr)) #define NLA_DATA_LEN(_nla) ((_nla)->nla_len - NLA_HDRLEN) #define NLA_DATA(_nla) NL_ITEM_DATA(_nla, NLA_HDRLEN) #define NLA_DATA_CONST(_nla) NL_ITEM_DATA_CONST(_nla, NLA_HDRLEN) #define NLA_TYPE(_nla) ((_nla)->nla_type & 0x3FFF) #ifndef typeof #define typeof __typeof #endif #define NLA_NEXT(_attr) (struct nlattr *)((char *)_attr + NLA_ALIGN(_attr->nla_len)) #define _NLA_END(_start, _len) ((char *)(_start) + (_len)) #define NLA_FOREACH(_attr, _start, _len) \ for (typeof(_attr) _end = (typeof(_attr))_NLA_END(_start, _len), _attr = (_start); \ ((char *)_attr < (char *)_end) && \ ((char *)NLA_NEXT(_attr) <= (char *)_end); \ _attr = (_len -= NLA_ALIGN(_attr->nla_len), NLA_NEXT(_attr))) 
#include #include /* Protocol handlers */ struct nl_pstate; typedef int (*nl_handler_f)(struct nlmsghdr *hdr, struct nl_pstate *npt); bool netlink_register_proto(int proto, const char *proto_name, nl_handler_f handler); bool netlink_unregister_proto(int proto); /* Common helpers */ bool nlp_has_priv(struct nlpcb *nlp, int priv); struct ucred *nlp_get_cred(struct nlpcb *nlp); uint32_t nlp_get_pid(const struct nlpcb *nlp); bool nlp_unconstrained_vnet(const struct nlpcb *nlp); /* netlink_generic.c */ struct genl_cmd { const char *cmd_name; nl_handler_f cmd_cb; uint32_t cmd_flags; uint32_t cmd_priv; uint32_t cmd_num; }; uint16_t genl_register_family(const char *family_name, size_t hdrsize, uint16_t family_version, uint16_t max_attr_idx); -bool genl_unregister_family(const char *family_name); -bool genl_register_cmds(const char *family_name, const struct genl_cmd *cmds, - int count); -uint32_t genl_register_group(const char *family_name, const char *group_name); +void genl_unregister_family(uint16_t family); +bool genl_register_cmds(uint16_t family, const struct genl_cmd *cmds, + u_int count); +uint32_t genl_register_group(uint16_t family, const char *group_name); -struct genl_family; -const char *genl_get_family_name(const struct genl_family *gf); -uint16_t genl_get_family_id(const struct genl_family *gf); - -typedef void (*genl_family_event_handler_t)(void *arg, const struct genl_family *gf, int action); +typedef void (*genl_family_event_handler_t)(void *arg, const char *family_name, + uint16_t family_id, u_int action); EVENTHANDLER_DECLARE(genl_family_event, genl_family_event_handler_t); struct thread; #if defined(NETLINK) || defined(NETLINK_MODULE) /* Provide optimized calls to the functions inside the same linking unit */ struct nlpcb *_nl_get_thread_nlp(struct thread *td); static inline struct nlpcb * nl_get_thread_nlp(struct thread *td) { return (_nl_get_thread_nlp(td)); } #else /* Provide access to the functions via netlink_glue.c */ struct nlpcb 
*nl_get_thread_nlp(struct thread *td); #endif /* defined(NETLINK) || defined(NETLINK_MODULE) */ #endif #endif diff --git a/sys/netlink/netlink_generic.c b/sys/netlink/netlink_generic.c index 30c73133134b..0714f22382cb 100644 --- a/sys/netlink/netlink_generic.c +++ b/sys/netlink/netlink_generic.c @@ -1,561 +1,499 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/

 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include

 #define	DEBUG_MOD_NAME	nl_generic
 #define	DEBUG_MAX_LEVEL	LOG_DEBUG3
 #include
 _DECLARE_DEBUG(LOG_INFO);

 static int nlctrl_handle_getfamily(struct nlmsghdr *, struct nl_pstate *);

 static struct genl_cmd nlctrl_cmds[] = {
 	[CTRL_CMD_GETFAMILY] = {
 		.cmd_num = CTRL_CMD_GETFAMILY,
 		.cmd_name = "GETFAMILY",
 		.cmd_cb = nlctrl_handle_getfamily,
 		.cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP |
 		    GENL_CMD_CAP_HASPOL,
 	},
 };

 static struct genl_family {
 	const char	*family_name;
 	uint16_t	family_hdrsize;
 	uint16_t	family_version;
 	uint16_t	family_attr_max;
 	uint16_t	family_cmd_size;
 	uint16_t	family_num_groups;
 	struct genl_cmd	*family_cmds;
 } families[MAX_FAMILIES] = {
 	[CTRL_FAMILY_ID] = {
 		.family_name = CTRL_FAMILY_NAME,
 		.family_hdrsize = 0,
 		.family_version = 2,
 		.family_attr_max = CTRL_ATTR_MAX,
 		.family_cmd_size = CTRL_CMD_GETFAMILY + 1,
 		.family_cmds = nlctrl_cmds,
 		.family_num_groups = 1,
 	},
 };

 static struct genl_group {
 	struct genl_family	*group_family;
 	const char		*group_name;
 } groups[MAX_GROUPS] = {
 	[CTRL_GROUP_ID] = {
 		.group_family = &families[CTRL_FAMILY_ID],
 		.group_name = CTRL_GROUP_NAME,
 	},
 };

+/*
+ * Translate a public family id (slot index + GENL_MIN_ID, see
+ * genl_family_id()) into its families[] slot.  The id must name a
+ * registered family; this is asserted, not reported to the caller.
+ */
+static inline struct genl_family *
+genl_family(uint16_t family_id)
+{
+	uint16_t idx;
+
+	/*
+	 * Subtract in uint16_t: an id below GENL_MIN_ID then wraps to a large
+	 * index and fails the range check, instead of going negative after
+	 * integer promotion and slipping past it.  The range is checked
+	 * before families[] is indexed.
+	 */
+	idx = family_id - GENL_MIN_ID;
+	KASSERT(idx < MAX_FAMILIES && families[idx].family_name != NULL,
+	    ("family %u does not exist", family_id));
+	return (&families[idx]);
+}
+
+static inline uint16_t
+genl_family_id(const struct genl_family *gf)
+{
+	MPASS(gf >= &families[0] && gf < &families[MAX_FAMILIES]);
+	return ((uint16_t)(gf - &families[0]) + GENL_MIN_ID);
+}
+
 /*
  * Handler called by netlink subsystem when matching netlink message is received
  */
 static int
 genl_handle_message(struct nlmsghdr *hdr, struct nl_pstate *npt)
 {
 	struct nlpcb *nlp = npt->nlp;
-	struct genl_family *gf = NULL;
+	struct genl_family *gf;
+	uint16_t family_id;
 	int error = 0;
-	int family_id = (int)hdr->nlmsg_type - GENL_MIN_ID;
-
-	if (__predict_false(family_id < 0 || (gf = genl_get_family(family_id)) == NULL)) {
-		NLP_LOG(LOG_DEBUG, nlp, "invalid message type: %d", hdr->nlmsg_type);
-		return (ENOTSUP);
-	}
-
 	if (__predict_false(hdr->nlmsg_len < sizeof(struct nlmsghdr) + GENL_HDRLEN)) {
-		NLP_LOG(LOG_DEBUG, nlp, "invalid message size: %d", hdr->nlmsg_len);
+		NLP_LOG(LOG_DEBUG, nlp, "invalid message size: %d",
+		    hdr->nlmsg_len);
 		return (EINVAL);
 	}

+	/*
+	 * nlmsg_type comes from the sender: validate the slot index before
+	 * families[] is touched.  A type below GENL_MIN_ID wraps in uint16_t
+	 * and is rejected by the range check.
+	 */
+	family_id = hdr->nlmsg_type - GENL_MIN_ID;
+	if (__predict_false(family_id >= MAX_FAMILIES ||
+	    families[family_id].family_name == NULL)) {
+		NLP_LOG(LOG_DEBUG, nlp, "invalid message type: %d",
+		    hdr->nlmsg_type);
+		return (ENOTSUP);
+	}
+	gf = &families[family_id];
+
 	struct genlmsghdr *ghdr = (struct genlmsghdr *)(hdr + 1);

 	if (ghdr->cmd >= gf->family_cmd_size || gf->family_cmds[ghdr->cmd].cmd_cb == NULL) {
 		NLP_LOG(LOG_DEBUG, nlp, "family %s: invalid cmd %d",
 		    gf->family_name, ghdr->cmd);
 		return (ENOTSUP);
 	}

 	struct genl_cmd *cmd = &gf->family_cmds[ghdr->cmd];

 	if (cmd->cmd_priv != 0 && !nlp_has_priv(nlp, cmd->cmd_priv)) {
 		NLP_LOG(LOG_DEBUG, nlp, "family %s: cmd %d priv_check() failed",
 		    gf->family_name, ghdr->cmd);
 		return (EPERM);
 	}

 	NLP_LOG(LOG_DEBUG2, nlp, "received family %s cmd %s(%d) len %d",
 	    gf->family_name, cmd->cmd_name, ghdr->cmd, hdr->nlmsg_len);

 	error = cmd->cmd_cb(hdr, npt);

 	return (error);
 }

 static uint32_t
 get_cmd_flags(const struct genl_cmd *cmd)
 {
 	uint32_t flags = cmd->cmd_flags;
 	if (cmd->cmd_priv != 0)
 		flags |= GENL_ADMIN_PERM;
 	return (flags);
 }

 static int
 dump_family(struct nlmsghdr *hdr, struct genlmsghdr *ghdr,
     const struct genl_family *gf, struct nl_writer *nw)
 {
 	if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr)))
 		goto enomem;

 	struct genlmsghdr *ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr);
 	ghdr_new->cmd = ghdr->cmd;
 	ghdr_new->version = gf->family_version;
 	ghdr_new->reserved = 0;

 	nlattr_add_string(nw, CTRL_ATTR_FAMILY_NAME, gf->family_name);
-	nlattr_add_u16(nw,
CTRL_ATTR_FAMILY_ID, genl_get_family_id(gf)); + nlattr_add_u16(nw, CTRL_ATTR_FAMILY_ID, genl_family_id(gf)); nlattr_add_u32(nw, CTRL_ATTR_VERSION, gf->family_version); nlattr_add_u32(nw, CTRL_ATTR_HDRSIZE, gf->family_hdrsize); nlattr_add_u32(nw, CTRL_ATTR_MAXATTR, gf->family_attr_max); if (gf->family_cmd_size > 0) { int off = nlattr_add_nested(nw, CTRL_ATTR_OPS); if (off == 0) goto enomem; for (int i = 0, cnt=0; i < gf->family_cmd_size; i++) { struct genl_cmd *cmd = &gf->family_cmds[i]; if (cmd->cmd_cb == NULL) continue; int cmd_off = nlattr_add_nested(nw, ++cnt); if (cmd_off == 0) goto enomem; nlattr_add_u32(nw, CTRL_ATTR_OP_ID, cmd->cmd_num); nlattr_add_u32(nw, CTRL_ATTR_OP_FLAGS, get_cmd_flags(cmd)); nlattr_set_len(nw, cmd_off); } nlattr_set_len(nw, off); } if (gf->family_num_groups > 0) { int off = nlattr_add_nested(nw, CTRL_ATTR_MCAST_GROUPS); if (off == 0) goto enomem; - for (int i = 0, cnt = 0; i < MAX_GROUPS; i++) { - struct genl_group *gg = genl_get_group(i); - if (gg == NULL || gg->group_family != gf) + for (u_int i = 0, cnt = 0; i < MAX_GROUPS; i++) { + struct genl_group *gg = &groups[i]; + + if (gg->group_family != gf) continue; int cmd_off = nlattr_add_nested(nw, ++cnt); if (cmd_off == 0) goto enomem; nlattr_add_u32(nw, CTRL_ATTR_MCAST_GRP_ID, i + MIN_GROUP_NUM); nlattr_add_string(nw, CTRL_ATTR_MCAST_GRP_NAME, gg->group_name); nlattr_set_len(nw, cmd_off); } nlattr_set_len(nw, off); } if (nlmsg_end(nw)) return (0); enomem: NL_LOG(LOG_DEBUG, "unable to dump family %s state (ENOMEM)", gf->family_name); nlmsg_abort(nw); return (ENOMEM); } - -/* Declare ourself as a user */ -static void nlctrl_notify(void *arg, const struct genl_family *gf, int action); -static eventhandler_tag family_event_tag; - struct nl_parsed_family { - uint32_t family_id; char *family_name; + uint16_t family_id; uint8_t version; }; #define _IN(_field) offsetof(struct genlmsghdr, _field) #define _OUT(_field) offsetof(struct nl_parsed_family, _field) static const struct nlfield_parser 
nlf_p_generic[] = { { .off_in = _IN(version), .off_out = _OUT(version), .cb = nlf_get_u8 }, }; static struct nlattr_parser nla_p_generic[] = { { .type = CTRL_ATTR_FAMILY_ID , .off = _OUT(family_id), .cb = nlattr_get_uint16 }, { .type = CTRL_ATTR_FAMILY_NAME , .off = _OUT(family_name), .cb = nlattr_get_string }, }; #undef _IN #undef _OUT NL_DECLARE_PARSER(genl_parser, struct genlmsghdr, nlf_p_generic, nla_p_generic); -static bool -match_family(const struct genl_family *gf, const struct nl_parsed_family *attrs) -{ - if (gf->family_name == NULL) - return (false); - if (attrs->family_id != 0 && attrs->family_id != genl_get_family_id(gf)) - return (false); - if (attrs->family_name != NULL && strcmp(attrs->family_name, gf->family_name)) - return (false); - return (true); -} - static int nlctrl_handle_getfamily(struct nlmsghdr *hdr, struct nl_pstate *npt) { int error = 0; struct nl_parsed_family attrs = {}; error = nl_parse_nlmsg(hdr, &genl_parser, npt, &attrs); if (error != 0) return (error); struct genlmsghdr ghdr = { .cmd = CTRL_CMD_NEWFAMILY, }; if (attrs.family_id != 0 || attrs.family_name != NULL) { - /* Resolve request */ - for (int i = 0; i < MAX_FAMILIES; i++) { - struct genl_family *gf = genl_get_family(i); - if (gf != NULL && match_family(gf, &attrs)) { - error = dump_family(hdr, &ghdr, gf, npt->nw); - return (error); - } + for (u_int i = 0; i < MAX_FAMILIES; i++) { + struct genl_family *gf = &families[i]; + + if (gf->family_name == NULL) + continue; + if (attrs.family_id != 0 && + attrs.family_id != genl_family_id(gf)) + continue; + if (attrs.family_name != NULL && + strcmp(attrs.family_name, gf->family_name) != 0) + continue; + return (dump_family(hdr, &ghdr, gf, npt->nw)); } return (ENOENT); } hdr->nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI; - for (int i = 0; i < MAX_FAMILIES; i++) { - struct genl_family *gf = genl_get_family(i); - if (gf != NULL && match_family(gf, &attrs)) { + for (u_int i = 0; i < MAX_FAMILIES; i++) { + struct genl_family *gf = 
&families[i]; + + if (gf->family_name != NULL) { error = dump_family(hdr, &ghdr, gf, npt->nw); if (error != 0) break; } } if (!nlmsg_end_dump(npt->nw, error, hdr)) { NL_LOG(LOG_DEBUG, "Unable to finalize the dump"); return (ENOMEM); } return (error); } static void -nlctrl_notify(void *arg __unused, const struct genl_family *gf, int cmd) +nlctrl_notify(void *arg __unused, const char *family_name __unused, + uint16_t family_id, u_int cmd) { struct nlmsghdr hdr = {.nlmsg_type = NETLINK_GENERIC }; struct genlmsghdr ghdr = { .cmd = cmd }; + struct genl_family *gf; struct nl_writer nw; + gf = genl_family(family_id); if (!nl_writer_group(&nw, NLMSG_SMALL, NETLINK_GENERIC, CTRL_GROUP_ID, 0, false)) { NL_LOG(LOG_DEBUG, "error allocating group writer"); return; } dump_family(&hdr, &ghdr, gf, &nw); nlmsg_flush(&nw); } static const struct nlhdr_parser *all_parsers[] = { &genl_parser }; +static eventhandler_tag family_event_tag; static void genl_load_all(void *u __unused) { NL_VERIFY_PARSERS(all_parsers); family_event_tag = EVENTHANDLER_REGISTER(genl_family_event, nlctrl_notify, NULL, EVENTHANDLER_PRI_ANY); netlink_register_proto(NETLINK_GENERIC, "NETLINK_GENERIC", genl_handle_message); } SYSINIT(genl_load_all, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, genl_load_all, NULL); static void genl_unload(void *u __unused) { netlink_unregister_proto(NETLINK_GENERIC); EVENTHANDLER_DEREGISTER(genl_family_event, family_event_tag); NET_EPOCH_WAIT(); } SYSUNINIT(genl_unload, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, genl_unload, NULL); /* * Public KPI for NETLINK_GENERIC families/groups registration logic below. 
*/

 static struct sx sx_lock;
 SX_SYSINIT(genl_lock, &sx_lock, "genetlink lock");
 #define	GENL_LOCK()		sx_xlock(&sx_lock)
 #define	GENL_UNLOCK()		sx_xunlock(&sx_lock)
 #define	GENL_ASSERT_LOCKED()	sx_assert(&sx_lock, SA_LOCKED)
 #define	GENL_ASSERT_XLOCKED()	sx_assert(&sx_lock, SA_XLOCKED)

-static struct genl_family *
-find_family(const char *family_name)
-{
-	GENL_ASSERT_LOCKED();
-	for (u_int i = 0; i < MAX_FAMILIES; i++)
-		if (families[i].family_name != NULL &&
-		    strcmp(families[i].family_name, family_name) == 0)
-			return (&families[i]);
-
-	return (NULL);
-}
-
-static struct genl_family *
-find_empty_family_id(const char *family_name)
-{
-	GENL_ASSERT_LOCKED();
-	/* Microoptimization: index 0 is reserved for the control family */
-	for (u_int i = 1; i < MAX_FAMILIES; i++)
-		if (families[i].family_name == NULL)
-			return (&families[i]);
-
-	return (NULL);
-}
-
+/*
+ * Register a generic netlink family.
+ *
+ * Returns the new family id (slot index + GENL_MIN_ID), or 0 when a family
+ * with the same name is already registered.  The family_name pointer is
+ * stored as is, not copied, so the string must outlive the registration.
+ * Listeners of genl_family_event are notified with CTRL_CMD_NEWFAMILY after
+ * the lock has been dropped.
+ */
 uint16_t
 genl_register_family(const char *family_name, size_t hdrsize,
     uint16_t family_version, uint16_t max_attr_idx)
 {
 	struct genl_family *gf;
 	uint16_t family_id;

+	MPASS(family_name != NULL);
+
 	GENL_LOCK();
-	if (find_family(family_name) != NULL) {
-		GENL_UNLOCK();
-		return (0);
-	}
+	for (u_int i = 0; i < MAX_FAMILIES; i++)
+		if (families[i].family_name != NULL &&
+		    strcmp(families[i].family_name, family_name) == 0) {
+			/* Duplicate name: release the lock before bailing. */
+			GENL_UNLOCK();
+			return (0);
+		}

-	gf = find_empty_family_id(family_name);
+	/* Microoptimization: index 0 is reserved for the control family. */
+	gf = NULL;
+	for (u_int i = 1; i < MAX_FAMILIES; i++)
+		if (families[i].family_name == NULL) {
+			gf = &families[i];
+			break;
+		}
 	KASSERT(gf, ("%s: maximum of %u generic netlink families allocated",
 	    __func__, MAX_FAMILIES));

 	*gf = (struct genl_family) {
 	    .family_name = family_name,
 	    .family_version = family_version,
 	    .family_hdrsize = hdrsize,
 	    .family_attr_max = max_attr_idx,
 	};
-	family_id = genl_get_family_id(gf);
+	family_id = genl_family_id(gf);
 	GENL_UNLOCK();

 	NL_LOG(LOG_DEBUG2, "Registered family %s id %d", gf->family_name,
 	    family_id);
-	EVENTHANDLER_INVOKE(genl_family_event, gf, CTRL_CMD_NEWFAMILY);
+	EVENTHANDLER_INVOKE(genl_family_event, gf->family_name, family_id,
+	    CTRL_CMD_NEWFAMILY);

 	return (family_id);
 }

-static void
-free_family(struct genl_family *gf)
+/*
+ * Unregister a family by the id genl_register_family() returned.  Under the
+ * lock: notify genl_family_event listeners with CTRL_CMD_DELFAMILY, detach
+ * the family's multicast groups, free its command table and clear the slot.
+ */
+void
+genl_unregister_family(uint16_t family_id)
 {
-	if (gf->family_cmds != NULL)
-		free(gf->family_cmds, M_NETLINK);
-}
+	struct genl_family *gf;

-/*
- * unregister groups of a given family
- */
-static void
-unregister_groups(const struct genl_family *gf)
-{
+	GENL_LOCK();
+	gf = genl_family(family_id);
+	EVENTHANDLER_INVOKE(genl_family_event, gf->family_name,
+	    family_id, CTRL_CMD_DELFAMILY);
 	for (u_int i = 0; i < MAX_GROUPS; i++) {
 		struct genl_group *gg = &groups[i];
 		if (gg->group_family == gf && gg->group_name != NULL) {
 			gg->group_family = NULL;
 			gg->group_name = NULL;
 		}
 	}
-}
-
-/*
- * Can sleep, I guess
- */
-bool
-genl_unregister_family(const char *family_name)
-{
-	bool found = false;
-
-	GENL_LOCK();
-	struct genl_family *gf = find_family(family_name);
-
-	if (gf != NULL) {
-		EVENTHANDLER_INVOKE(genl_family_event, gf, CTRL_CMD_DELFAMILY);
-		found = true;
-		unregister_groups(gf);
-		/* TODO: zero pointer first */
-		free_family(gf);
-		bzero(gf, sizeof(*gf));
-	}
+	if (gf->family_cmds != NULL)
+		free(gf->family_cmds, M_NETLINK);
+	bzero(gf, sizeof(*gf));
 	GENL_UNLOCK();
-
-	return (found);
 }

 bool
-genl_register_cmds(const char *family_name, const struct genl_cmd *cmds,
-    int count)
+genl_register_cmds(uint16_t family_id, const struct genl_cmd *cmds, + u_int count) { struct genl_family *gf; uint16_t cmd_size; GENL_LOCK(); - if ((gf = find_family(family_name)) == NULL) { - GENL_UNLOCK(); - return (false); - } + gf = genl_family(family_id); cmd_size = gf->family_cmd_size; for (u_int i = 0; i < count; i++) { MPASS(cmds[i].cmd_cb != NULL); if (cmds[i].cmd_num >= cmd_size) cmd_size = cmds[i].cmd_num + 1; } if (cmd_size > gf->family_cmd_size) { void *old_data; /* need to realloc */ size_t sz = cmd_size * sizeof(struct genl_cmd); void *data = malloc(sz, M_NETLINK, M_WAITOK | M_ZERO); memcpy(data, gf->family_cmds, gf->family_cmd_size * sizeof(struct genl_cmd)); old_data = gf->family_cmds; gf->family_cmds = data; gf->family_cmd_size = cmd_size; free(old_data, M_NETLINK); } for (u_int i = 0; i < count; i++) { const struct genl_cmd *cmd = &cmds[i]; MPASS(gf->family_cmds[cmd->cmd_num].cmd_cb == NULL); gf->family_cmds[cmd->cmd_num] = cmds[i]; NL_LOG(LOG_DEBUG2, "Adding cmd %s(%d) to family %s", cmd->cmd_name, cmd->cmd_num, gf->family_name); } GENL_UNLOCK(); return (true); } -static struct genl_group * -find_group(const struct genl_family *gf, const char *group_name) -{ - for (u_int i = 0; i < MAX_GROUPS; i++) { - struct genl_group *gg = &groups[i]; - if (gg->group_family == gf && - !strcmp(gg->group_name, group_name)) - return (gg); - } - return (NULL); -} - uint32_t -genl_register_group(const char *family_name, const char *group_name) +genl_register_group(uint16_t family_id, const char *group_name) { struct genl_family *gf; uint32_t group_id = 0; - MPASS(family_name != NULL); MPASS(group_name != NULL); GENL_LOCK(); - if ((gf = find_family(family_name)) == NULL || - find_group(gf, group_name) != NULL) { - GENL_UNLOCK(); - return (0); - } + gf = genl_family(family_id); + + for (u_int i = 0; i < MAX_GROUPS; i++) + if (groups[i].group_family == gf && + strcmp(groups[i].group_name, group_name) == 0) { + GENL_UNLOCK(); + return (0); + } /* Microoptimization: 
index 0 is reserved for the control family */ for (u_int i = 1; i < MAX_GROUPS; i++) { struct genl_group *gg = &groups[i]; if (gg->group_family == NULL) { gf->family_num_groups++; gg->group_family = gf; gg->group_name = group_name; group_id = i + MIN_GROUP_NUM; break; } } GENL_UNLOCK(); return (group_id); } - -/* accessors */ -struct genl_family * -genl_get_family(uint16_t family_id) -{ - return ((family_id < MAX_FAMILIES) ? &families[family_id] : NULL); -} - -const char * -genl_get_family_name(const struct genl_family *gf) -{ - return (gf->family_name); -} - -uint16_t -genl_get_family_id(const struct genl_family *gf) -{ - MPASS(gf >= &families[0] && gf < &families[MAX_FAMILIES]); - return ((uint16_t)(gf - &families[0]) + GENL_MIN_ID); -} - -struct genl_group * -genl_get_group(uint32_t group_id) -{ - return ((group_id < MAX_GROUPS) ? &groups[group_id] : NULL); -} diff --git a/sys/netlink/netlink_sysevent.c b/sys/netlink/netlink_sysevent.c index c955ce2e8b45..09e7e50a7409 100644 --- a/sys/netlink/netlink_sysevent.c +++ b/sys/netlink/netlink_sysevent.c @@ -1,204 +1,205 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2023 Baptiste Daroussin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #define DEBUG_MOD_NAME nl_sysevent #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include _DECLARE_DEBUG(LOG_INFO); MALLOC_DEFINE(M_NLSE, "nlsysevent", "Memory used for Netlink sysevent"); #define NLSE_FAMILY_NAME "nlsysevent" -static uint32_t ctrl_family_id; +static uint16_t ctrl_family_id; #define MAX_SYSEVENT_GROUPS 64 static struct sysevent_group { char *name; uint32_t id; } sysevent_groups[MAX_SYSEVENT_GROUPS] = {}; static const char *devctl_systems[] = { "ACPI", "AEON", "CAM", "CARP", "coretemp", "DEVFS", "device", "ETHERNET", "GEOM", "HYPERV_NIC_VF", "IFNET", "INFINIBAND", "KERNEL", "nvme", "PMU", "RCTL", "USB", "VFS", "VT", "ZFS", }; static void sysevent_write(struct sysevent_group *se, const char *subsystem, const char *type, const char *data) { struct nl_writer nw; if (!nl_writer_group(&nw, NLMSG_LARGE, NETLINK_GENERIC, se->id, 0, false)) { NL_LOG(LOG_DEBUG, "error allocating group writer"); return; } struct nlmsghdr hdr = { .nlmsg_type = ctrl_family_id }; if (!nlmsg_reply(&nw, &hdr, sizeof(struct genlmsghdr))) { return; } struct genlmsghdr *ghdr = nlmsg_reserve_object(&nw, struct genlmsghdr); if (ghdr == NULL) { NL_LOG(LOG_DEBUG, "unable to allocate memory"); return; } ghdr->version = 0; ghdr->cmd = NLSE_CMD_NEWEVENT; ghdr->reserved = 0; nlattr_add_string(&nw, NLSE_ATTR_SYSTEM, se->name); nlattr_add_string(&nw, NLSE_ATTR_SUBSYSTEM, subsystem); 
nlattr_add_string(&nw, NLSE_ATTR_TYPE, type); if (data != NULL) nlattr_add_string(&nw, NLSE_ATTR_DATA, data); nlmsg_end(&nw); nlmsg_flush(&nw); } static void sysevent_new_group(size_t index, const char *name) { if (index >= MAX_SYSEVENT_GROUPS) { NL_LOG(LOG_WARNING, "impossible to add the event %s, " "too many event groups\n", name); return; } sysevent_groups[index].name = strdup(name, M_NLSE); - sysevent_groups[index].id = genl_register_group(NLSE_FAMILY_NAME, sysevent_groups[index].name); + sysevent_groups[index].id = genl_register_group(ctrl_family_id, + sysevent_groups[index].name); } static struct sysevent_group * sysevent_get_group(const char *system) { for (size_t i = 0; i < MAX_SYSEVENT_GROUPS; i++) { if (sysevent_groups[i].name == NULL) { sysevent_new_group(i, system); return (&sysevent_groups[i]); } if (strcmp(sysevent_groups[i].name, system) == 0) return (&sysevent_groups[i]); } return (NULL); } static void sysevent_send(const char *system, const char *subsystem, const char *type, const char *data) { struct sysevent_group *se = sysevent_get_group(system); if (se == NULL) { NL_LOG(LOG_WARNING, "impossible to add the event %s, " "too many event groups\n", system); return; } CURVNET_SET(vnet0); sysevent_write(se, subsystem, type, data); CURVNET_RESTORE(); } static void nlsysevent_load(void) { devctl_set_notify_hook(sysevent_send); ctrl_family_id = genl_register_family(NLSE_FAMILY_NAME, 0, 2, NLSE_ATTR_MAX); for (size_t i = 0; i < nitems(devctl_systems); i++) { if (i >= MAX_SYSEVENT_GROUPS) { NL_LOG(LOG_WARNING, "impossible to add the event %s, too many events\n", devctl_systems[i]); continue; } sysevent_new_group(i, devctl_systems[i]); } } static void nlsysevent_unload(void) { devctl_unset_notify_hook(); - genl_unregister_family(NLSE_FAMILY_NAME); + genl_unregister_family(ctrl_family_id); for (size_t i = 0; i < MAX_SYSEVENT_GROUPS; i++) { if (sysevent_groups[i].name == NULL) break; free(sysevent_groups[i].name, M_NLSE); } } static int 
nlsysevent_loader(module_t mod __unused, int what, void *priv __unused) { int err = 0; switch (what) { case MOD_LOAD: nlsysevent_load(); break; case MOD_UNLOAD: nlsysevent_unload(); break; default: err = EOPNOTSUPP; break; } return (err); } static moduledata_t nlsysevent_mod = { "nlsysevent", nlsysevent_loader, NULL}; DECLARE_MODULE(nlsysevent, nlsysevent_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_DEPEND(nlsysevent, netlink, 1, 1, 1); MODULE_VERSION(nlsysevent, 1); diff --git a/sys/netlink/netlink_var.h b/sys/netlink/netlink_var.h index ce10a303f9f7..a59cb2efecd0 100644 --- a/sys/netlink/netlink_var.h +++ b/sys/netlink/netlink_var.h @@ -1,184 +1,180 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2021 Ng Peng Nam Sean * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _NETLINK_NETLINK_VAR_H_ #define _NETLINK_NETLINK_VAR_H_ #ifdef _KERNEL #include #include #include #include #include #define NLSNDQ 65536 /* Default socket sendspace */ #define NLRCVQ 65536 /* Default socket recvspace */ #define NLMBUFSIZE 2048 /* External storage size for Netlink mbufs */ struct ucred; struct nl_buf { TAILQ_ENTRY(nl_buf) tailq; u_int buflen; u_int datalen; u_int offset; char data[]; }; #define NLP_MAX_GROUPS 128 BITSET_DEFINE(nl_groups, NLP_MAX_GROUPS); struct nlpcb { struct socket *nl_socket; struct nl_groups nl_groups; uint32_t nl_port; uint32_t nl_flags; uint32_t nl_process_id; int nl_proto; bool nl_bound; bool nl_task_pending; bool nl_tx_blocked; /* No new requests accepted */ bool nl_linux; /* true if running under compat */ bool nl_unconstrained_vnet; /* true if running under VNET jail (or without jail) */ bool nl_need_thread_setup; struct taskqueue *nl_taskqueue; struct task nl_task; uint64_t nl_dropped_bytes; uint64_t nl_dropped_messages; CK_LIST_ENTRY(nlpcb) nl_next; CK_LIST_ENTRY(nlpcb) nl_port_next; volatile u_int nl_refcount; struct mtx nl_lock; struct epoch_context nl_epoch_ctx; }; #define sotonlpcb(so) ((struct nlpcb *)(so)->so_pcb) #define NLP_LOCK_INIT(_nlp) mtx_init(&((_nlp)->nl_lock), "nlp mtx", NULL, MTX_DEF) #define NLP_LOCK_DESTROY(_nlp) mtx_destroy(&((_nlp)->nl_lock)) #define NLP_LOCK(_nlp) mtx_lock(&((_nlp)->nl_lock)) #define NLP_UNLOCK(_nlp) mtx_unlock(&((_nlp)->nl_lock)) #define ALIGNED_NL_SZ(_data) 
roundup2((((struct nlmsghdr *)(_data))->nlmsg_len), 16) /* nl_flags */ #define NLF_CAP_ACK 0x01 /* Do not send message body with errmsg */ #define NLF_EXT_ACK 0x02 /* Allow including extended TLVs in ack */ #define NLF_STRICT 0x04 /* Perform strict header checks */ #define NLF_MSG_INFO 0x08 /* Send caller info along with the notifications */ SYSCTL_DECL(_net_netlink); SYSCTL_DECL(_net_netlink_debug); struct nl_control { CK_LIST_HEAD(nl_pid_head, nlpcb) ctl_port_head; CK_LIST_HEAD(nlpcb_head, nlpcb) ctl_pcb_head; CK_LIST_ENTRY(nl_control) ctl_next; struct rmlock ctl_lock; }; VNET_DECLARE(struct nl_control, nl_ctl); #define V_nl_ctl VNET(nl_ctl) struct sockaddr_nl; struct sockaddr; struct nlmsghdr; int nl_verify_proto(int proto); const char *nl_get_proto_name(int proto); extern int netlink_unloading; struct nl_proto_handler { nl_handler_f cb; const char *proto_name; }; extern struct nl_proto_handler *nl_handlers; /* netlink_domain.c */ bool nl_send_group(struct nl_writer *); void nl_osd_register(void); void nl_osd_unregister(void); void nl_set_thread_nlp(struct thread *td, struct nlpcb *nlp); /* netlink_io.c */ bool nl_send(struct nl_writer *, struct nlpcb *); void nlmsg_ack(struct nlpcb *nlp, int error, struct nlmsghdr *nlmsg, struct nl_pstate *npt); void nl_on_transmit(struct nlpcb *nlp); void nl_taskqueue_handler(void *_arg, int pending); void nl_schedule_taskqueue(struct nlpcb *nlp); void nl_process_receive_locked(struct nlpcb *nlp); void nl_set_source_metadata(struct mbuf *m, int num_messages); struct nl_buf *nl_buf_alloc(size_t len, int mflag); void nl_buf_free(struct nl_buf *nb); -/* netlink_generic.c */ -struct genl_family *genl_get_family(uint16_t family_id); -struct genl_group *genl_get_group(uint32_t group_id); - #define MAX_FAMILIES 20 #define MAX_GROUPS 64 #define MIN_GROUP_NUM 48 #define CTRL_FAMILY_ID 0 #define CTRL_FAMILY_NAME "nlctrl" #define CTRL_GROUP_ID 0 #define CTRL_GROUP_NAME "notify" struct ifnet; struct nl_parsed_link; struct nlattr_bmask; 
struct nl_pstate; /* Function map */ struct nl_function_wrapper { bool (*nlmsg_add)(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type, uint16_t flags, uint32_t len); bool (*nlmsg_refill_buffer)(struct nl_writer *nw, size_t required_len); bool (*nlmsg_flush)(struct nl_writer *nw); bool (*nlmsg_end)(struct nl_writer *nw); void (*nlmsg_abort)(struct nl_writer *nw); void (*nlmsg_ignore_limit)(struct nl_writer *nw); bool (*nl_writer_unicast)(struct nl_writer *nw, size_t size, struct nlpcb *nlp, bool waitok); bool (*nl_writer_group)(struct nl_writer *nw, size_t size, uint16_t protocol, uint16_t group_id, int priv, bool waitok); bool (*nlmsg_end_dump)(struct nl_writer *nw, int error, struct nlmsghdr *hdr); int (*nl_modify_ifp_generic)(struct ifnet *ifp, struct nl_parsed_link *lattrs, const struct nlattr_bmask *bm, struct nl_pstate *npt); void (*nl_store_ifp_cookie)(struct nl_pstate *npt, struct ifnet *ifp); struct nlpcb * (*nl_get_thread_nlp)(struct thread *td); }; void nl_set_functions(const struct nl_function_wrapper *nl); #endif #endif diff --git a/sys/netpfil/pf/pf_nl.c b/sys/netpfil/pf/pf_nl.c index c0f722b1fd18..737e9cf8cab8 100644 --- a/sys/netpfil/pf/pf_nl.c +++ b/sys/netpfil/pf/pf_nl.c @@ -1,2061 +1,2061 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2023 Alexander V. Chernikov * Copyright (c) 2023 Rubicon Communications, LLC (Netgate) * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #define DEBUG_MOD_NAME nl_pf #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include _DECLARE_DEBUG(LOG_DEBUG); struct nl_parsed_state { uint8_t version; uint32_t id; uint32_t creatorid; char ifname[IFNAMSIZ]; uint16_t proto; sa_family_t af; struct pf_addr addr; struct pf_addr mask; }; #define _IN(_field) offsetof(struct genlmsghdr, _field) #define _OUT(_field) offsetof(struct nl_parsed_state, _field) static const struct nlattr_parser nla_p_state[] = { { .type = PF_ST_ID, .off = _OUT(id), .cb = nlattr_get_uint32 }, { .type = PF_ST_CREATORID, .off = _OUT(creatorid), .cb = nlattr_get_uint32 }, { .type = PF_ST_IFNAME, .arg = (const void *)IFNAMSIZ, .off = _OUT(ifname), .cb = nlattr_get_chara }, { .type = PF_ST_AF, .off = _OUT(af), .cb = nlattr_get_uint8 }, { .type = PF_ST_PROTO, .off = _OUT(proto), .cb = nlattr_get_uint16 }, { .type = PF_ST_FILTER_ADDR, .off = _OUT(addr), .cb = nlattr_get_in6_addr }, { .type = PF_ST_FILTER_MASK, .off = _OUT(mask), .cb = nlattr_get_in6_addr }, }; static const struct nlfield_parser nlf_p_generic[] = { { .off_in = _IN(version), .off_out = 
_OUT(version), .cb = nlf_get_u8 }, }; #undef _IN #undef _OUT NL_DECLARE_PARSER(state_parser, struct genlmsghdr, nlf_p_generic, nla_p_state); static void dump_addr(struct nl_writer *nw, int attr, const struct pf_addr *addr, int af) { switch (af) { case AF_INET: nlattr_add(nw, attr, 4, &addr->v4); break; case AF_INET6: nlattr_add(nw, attr, 16, &addr->v6); break; }; } static bool dump_state_peer(struct nl_writer *nw, int attr, const struct pf_state_peer *peer) { int off = nlattr_add_nested(nw, attr); if (off == 0) return (false); nlattr_add_u32(nw, PF_STP_SEQLO, peer->seqlo); nlattr_add_u32(nw, PF_STP_SEQHI, peer->seqhi); nlattr_add_u32(nw, PF_STP_SEQDIFF, peer->seqdiff); nlattr_add_u16(nw, PF_STP_MAX_WIN, peer->max_win); nlattr_add_u16(nw, PF_STP_MSS, peer->mss); nlattr_add_u8(nw, PF_STP_STATE, peer->state); nlattr_add_u8(nw, PF_STP_WSCALE, peer->wscale); if (peer->scrub != NULL) { struct pf_state_scrub *sc = peer->scrub; uint16_t pfss_flags = sc->pfss_flags & PFSS_TIMESTAMP; nlattr_add_u16(nw, PF_STP_PFSS_FLAGS, pfss_flags); nlattr_add_u32(nw, PF_STP_PFSS_TS_MOD, sc->pfss_ts_mod); nlattr_add_u8(nw, PF_STP_PFSS_TTL, sc->pfss_ttl); nlattr_add_u8(nw, PF_STP_SCRUB_FLAG, PFSYNC_SCRUB_FLAG_VALID); } nlattr_set_len(nw, off); return (true); } static bool dump_state_key(struct nl_writer *nw, int attr, const struct pf_state_key *key) { int off = nlattr_add_nested(nw, attr); if (off == 0) return (false); dump_addr(nw, PF_STK_ADDR0, &key->addr[0], key->af); dump_addr(nw, PF_STK_ADDR1, &key->addr[1], key->af); nlattr_add_u16(nw, PF_STK_PORT0, key->port[0]); nlattr_add_u16(nw, PF_STK_PORT1, key->port[1]); nlattr_add_u8(nw, PF_STK_AF, key->af); nlattr_add_u16(nw, PF_STK_PROTO, key->proto); nlattr_set_len(nw, off); return (true); } static int dump_state(struct nlpcb *nlp, const struct nlmsghdr *hdr, struct pf_kstate *s, struct nl_pstate *npt) { struct nl_writer *nw = npt->nw; int error = 0; int af; struct pf_state_key *key; PF_STATE_LOCK_ASSERT(s); if (!nlmsg_reply(nw, hdr, 
sizeof(struct genlmsghdr))) goto enomem; struct genlmsghdr *ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr); ghdr_new->cmd = PFNL_CMD_GETSTATES; ghdr_new->version = 0; ghdr_new->reserved = 0; nlattr_add_u64(nw, PF_ST_VERSION, PF_STATE_VERSION); key = s->key[PF_SK_WIRE]; if (!dump_state_key(nw, PF_ST_KEY_WIRE, key)) goto enomem; key = s->key[PF_SK_STACK]; if (!dump_state_key(nw, PF_ST_KEY_STACK, key)) goto enomem; af = s->key[PF_SK_WIRE]->af; nlattr_add_u8(nw, PF_ST_PROTO, s->key[PF_SK_WIRE]->proto); nlattr_add_u8(nw, PF_ST_AF, af); nlattr_add_string(nw, PF_ST_IFNAME, s->kif->pfik_name); nlattr_add_string(nw, PF_ST_ORIG_IFNAME, s->orig_kif->pfik_name); dump_addr(nw, PF_ST_RT_ADDR, &s->act.rt_addr, af); nlattr_add_u32(nw, PF_ST_CREATION, time_uptime - (s->creation / 1000)); uint32_t expire = pf_state_expires(s); if (expire > time_uptime) expire = expire - time_uptime; nlattr_add_u32(nw, PF_ST_EXPIRE, expire); nlattr_add_u8(nw, PF_ST_DIRECTION, s->direction); nlattr_add_u8(nw, PF_ST_LOG, s->act.log); nlattr_add_u8(nw, PF_ST_TIMEOUT, s->timeout); nlattr_add_u16(nw, PF_ST_STATE_FLAGS, s->state_flags); uint8_t sync_flags = 0; if (s->src_node) sync_flags |= PFSYNC_FLAG_SRCNODE; if (s->nat_src_node) sync_flags |= PFSYNC_FLAG_NATSRCNODE; nlattr_add_u8(nw, PF_ST_SYNC_FLAGS, sync_flags); nlattr_add_u64(nw, PF_ST_ID, s->id); nlattr_add_u32(nw, PF_ST_CREATORID, htonl(s->creatorid)); nlattr_add_u32(nw, PF_ST_RULE, s->rule ? s->rule->nr : -1); nlattr_add_u32(nw, PF_ST_ANCHOR, s->anchor ? s->anchor->nr : -1); nlattr_add_u32(nw, PF_ST_NAT_RULE, s->nat_rule ? 
s->nat_rule->nr : -1); nlattr_add_u64(nw, PF_ST_PACKETS0, s->packets[0]); nlattr_add_u64(nw, PF_ST_PACKETS1, s->packets[1]); nlattr_add_u64(nw, PF_ST_BYTES0, s->bytes[0]); nlattr_add_u64(nw, PF_ST_BYTES1, s->bytes[1]); nlattr_add_u32(nw, PF_ST_RTABLEID, s->act.rtableid); nlattr_add_u8(nw, PF_ST_MIN_TTL, s->act.min_ttl); nlattr_add_u16(nw, PF_ST_MAX_MSS, s->act.max_mss); nlattr_add_u16(nw, PF_ST_DNPIPE, s->act.dnpipe); nlattr_add_u16(nw, PF_ST_DNRPIPE, s->act.dnrpipe); nlattr_add_u8(nw, PF_ST_RT, s->act.rt); if (s->act.rt_kif != NULL) nlattr_add_string(nw, PF_ST_RT_IFNAME, s->act.rt_kif->pfik_name); if (!dump_state_peer(nw, PF_ST_PEER_SRC, &s->src)) goto enomem; if (!dump_state_peer(nw, PF_ST_PEER_DST, &s->dst)) goto enomem; if (nlmsg_end(nw)) return (0); enomem: error = ENOMEM; nlmsg_abort(nw); return (error); } static int handle_dumpstates(struct nlpcb *nlp, struct nl_parsed_state *attrs, struct nlmsghdr *hdr, struct nl_pstate *npt) { int error = 0; hdr->nlmsg_flags |= NLM_F_MULTI; for (int i = 0; i <= V_pf_hashmask; i++) { struct pf_idhash *ih = &V_pf_idhash[i]; struct pf_kstate *s; if (LIST_EMPTY(&ih->states)) continue; PF_HASHROW_LOCK(ih); LIST_FOREACH(s, &ih->states, entry) { sa_family_t af = s->key[PF_SK_WIRE]->af; if (s->timeout == PFTM_UNLINKED) continue; /* Filter */ if (attrs->creatorid != 0 && s->creatorid != attrs->creatorid) continue; if (attrs->ifname[0] != 0 && strncmp(attrs->ifname, s->kif->pfik_name, IFNAMSIZ) != 0) continue; if (attrs->proto != 0 && s->key[PF_SK_WIRE]->proto != attrs->proto) continue; if (attrs->af != 0 && af != attrs->af) continue; if (pf_match_addr(1, &s->key[PF_SK_WIRE]->addr[0], &attrs->mask, &attrs->addr, af) && pf_match_addr(1, &s->key[PF_SK_WIRE]->addr[1], &attrs->mask, &attrs->addr, af) && pf_match_addr(1, &s->key[PF_SK_STACK]->addr[0], &attrs->mask, &attrs->addr, af) && pf_match_addr(1, &s->key[PF_SK_STACK]->addr[1], &attrs->mask, &attrs->addr, af)) continue; error = dump_state(nlp, hdr, s, npt); if (error != 0) break; } 
PF_HASHROW_UNLOCK(ih); } if (!nlmsg_end_dump(npt->nw, error, hdr)) { NL_LOG(LOG_DEBUG, "Unable to finalize the dump"); return (ENOMEM); } return (error); } static int handle_getstate(struct nlpcb *nlp, struct nl_parsed_state *attrs, struct nlmsghdr *hdr, struct nl_pstate *npt) { struct pf_kstate *s; int ret; s = pf_find_state_byid(attrs->id, attrs->creatorid); if (s == NULL) return (ENOENT); ret = dump_state(nlp, hdr, s, npt); PF_STATE_UNLOCK(s); return (ret); } static int dump_creatorid(struct nlpcb *nlp, const struct nlmsghdr *hdr, uint32_t creator, struct nl_pstate *npt) { struct nl_writer *nw = npt->nw; if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr))) goto enomem; struct genlmsghdr *ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr); ghdr_new->cmd = PFNL_CMD_GETCREATORS; ghdr_new->version = 0; ghdr_new->reserved = 0; nlattr_add_u32(nw, PF_ST_CREATORID, htonl(creator)); if (nlmsg_end(nw)) return (0); enomem: nlmsg_abort(nw); return (ENOMEM); } static int pf_handle_getstates(struct nlmsghdr *hdr, struct nl_pstate *npt) { int error; struct nl_parsed_state attrs = {}; error = nl_parse_nlmsg(hdr, &state_parser, npt, &attrs); if (error != 0) return (error); if (attrs.id != 0) error = handle_getstate(npt->nlp, &attrs, hdr, npt); else error = handle_dumpstates(npt->nlp, &attrs, hdr, npt); return (error); } static int pf_handle_getcreators(struct nlmsghdr *hdr, struct nl_pstate *npt) { uint32_t creators[16]; int error = 0; bzero(creators, sizeof(creators)); for (int i = 0; i < V_pf_hashmask; i++) { struct pf_idhash *ih = &V_pf_idhash[i]; struct pf_kstate *s; if (LIST_EMPTY(&ih->states)) continue; PF_HASHROW_LOCK(ih); LIST_FOREACH(s, &ih->states, entry) { int j; if (s->timeout == PFTM_UNLINKED) continue; for (j = 0; j < nitems(creators); j++) { if (creators[j] == s->creatorid) break; if (creators[j] == 0) { creators[j] = s->creatorid; break; } } if (j == nitems(creators)) printf("Warning: too many creators!\n"); } PF_HASHROW_UNLOCK(ih); } hdr->nlmsg_flags |= 
NLM_F_MULTI; for (int i = 0; i < nitems(creators); i++) { if (creators[i] == 0) break; error = dump_creatorid(npt->nlp, hdr, creators[i], npt); } if (!nlmsg_end_dump(npt->nw, error, hdr)) { NL_LOG(LOG_DEBUG, "Unable to finalize the dump"); return (ENOMEM); } return (error); } static int pf_handle_start(struct nlmsghdr *hdr __unused, struct nl_pstate *npt __unused) { return (pf_start()); } static int pf_handle_stop(struct nlmsghdr *hdr __unused, struct nl_pstate *npt __unused) { return (pf_stop()); } #define _OUT(_field) offsetof(struct pf_addr_wrap, _field) static const struct nlattr_parser nla_p_addr_wrap[] = { { .type = PF_AT_ADDR, .off = _OUT(v.a.addr), .cb = nlattr_get_in6_addr }, { .type = PF_AT_MASK, .off = _OUT(v.a.mask), .cb = nlattr_get_in6_addr }, { .type = PF_AT_IFNAME, .off = _OUT(v.ifname), .arg = (void *)IFNAMSIZ,.cb = nlattr_get_chara }, { .type = PF_AT_TABLENAME, .off = _OUT(v.tblname), .arg = (void *)PF_TABLE_NAME_SIZE, .cb = nlattr_get_chara }, { .type = PF_AT_TYPE, .off = _OUT(type), .cb = nlattr_get_uint8 }, { .type = PF_AT_IFLAGS, .off = _OUT(iflags), .cb = nlattr_get_uint8 }, }; NL_DECLARE_ATTR_PARSER(addr_wrap_parser, nla_p_addr_wrap); #undef _OUT static bool nlattr_add_addr_wrap(struct nl_writer *nw, int attrtype, struct pf_addr_wrap *a) { int off = nlattr_add_nested(nw, attrtype); nlattr_add_in6_addr(nw, PF_AT_ADDR, &a->v.a.addr.v6); nlattr_add_in6_addr(nw, PF_AT_MASK, &a->v.a.mask.v6); nlattr_add_u8(nw, PF_AT_TYPE, a->type); nlattr_add_u8(nw, PF_AT_IFLAGS, a->iflags); if (a->type == PF_ADDR_DYNIFTL) { nlattr_add_string(nw, PF_AT_IFNAME, a->v.ifname); nlattr_add_u32(nw, PF_AT_DYNCNT, a->p.dyncnt); } else if (a->type == PF_ADDR_TABLE) { nlattr_add_string(nw, PF_AT_TABLENAME, a->v.tblname); nlattr_add_u32(nw, PF_AT_TBLCNT, a->p.tblcnt); } nlattr_set_len(nw, off); return (true); } #define _OUT(_field) offsetof(struct pf_rule_addr, _field) static const struct nlattr_parser nla_p_ruleaddr[] = { { .type = PF_RAT_ADDR, .off = _OUT(addr), .arg = 
&addr_wrap_parser, .cb = nlattr_get_nested }, { .type = PF_RAT_SRC_PORT, .off = _OUT(port[0]), .cb = nlattr_get_uint16 }, { .type = PF_RAT_DST_PORT, .off = _OUT(port[1]), .cb = nlattr_get_uint16 }, { .type = PF_RAT_NEG, .off = _OUT(neg), .cb = nlattr_get_uint8 }, { .type = PF_RAT_OP, .off = _OUT(port_op), .cb = nlattr_get_uint8 }, }; NL_DECLARE_ATTR_PARSER(rule_addr_parser, nla_p_ruleaddr); #undef _OUT static bool nlattr_add_rule_addr(struct nl_writer *nw, int attrtype, struct pf_rule_addr *r) { struct pf_addr_wrap aw = {0}; int off = nlattr_add_nested(nw, attrtype); bcopy(&(r->addr), &aw, sizeof(struct pf_addr_wrap)); pf_addr_copyout(&aw); nlattr_add_addr_wrap(nw, PF_RAT_ADDR, &aw); nlattr_add_u16(nw, PF_RAT_SRC_PORT, r->port[0]); nlattr_add_u16(nw, PF_RAT_DST_PORT, r->port[1]); nlattr_add_u8(nw, PF_RAT_NEG, r->neg); nlattr_add_u8(nw, PF_RAT_OP, r->port_op); nlattr_set_len(nw, off); return (true); } #define _OUT(_field) offsetof(struct pf_mape_portset, _field) static const struct nlattr_parser nla_p_mape_portset[] = { { .type = PF_MET_OFFSET, .off = _OUT(offset), .cb = nlattr_get_uint8 }, { .type = PF_MET_PSID_LEN, .off = _OUT(psidlen), .cb = nlattr_get_uint8 }, {. 
type = PF_MET_PSID, .off = _OUT(psid), .cb = nlattr_get_uint16 }, }; NL_DECLARE_ATTR_PARSER(mape_portset_parser, nla_p_mape_portset); #undef _OUT static bool nlattr_add_mape_portset(struct nl_writer *nw, int attrtype, const struct pf_mape_portset *m) { int off = nlattr_add_nested(nw, attrtype); nlattr_add_u8(nw, PF_MET_OFFSET, m->offset); nlattr_add_u8(nw, PF_MET_PSID_LEN, m->psidlen); nlattr_add_u16(nw, PF_MET_PSID, m->psid); nlattr_set_len(nw, off); return (true); } struct nl_parsed_labels { char labels[PF_RULE_MAX_LABEL_COUNT][PF_RULE_LABEL_SIZE]; uint32_t i; }; static int nlattr_get_pf_rule_labels(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) { struct nl_parsed_labels *l = (struct nl_parsed_labels *)target; int ret; if (l->i >= PF_RULE_MAX_LABEL_COUNT) return (E2BIG); ret = nlattr_get_chara(nla, npt, (void *)PF_RULE_LABEL_SIZE, l->labels[l->i]); if (ret == 0) l->i++; return (ret); } #define _OUT(_field) offsetof(struct nl_parsed_labels, _field) static const struct nlattr_parser nla_p_labels[] = { { .type = PF_LT_LABEL, .off = 0, .cb = nlattr_get_pf_rule_labels }, }; NL_DECLARE_ATTR_PARSER(rule_labels_parser, nla_p_labels); #undef _OUT static int nlattr_get_nested_pf_rule_labels(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) { struct nl_parsed_labels parsed_labels = { }; int error; /* Assumes target points to the beginning of the structure */ error = nl_parse_header(NLA_DATA(nla), NLA_DATA_LEN(nla), &rule_labels_parser, npt, &parsed_labels); if (error != 0) return (error); memcpy(target, parsed_labels.labels, sizeof(parsed_labels.labels)); return (0); } static bool nlattr_add_labels(struct nl_writer *nw, int attrtype, const struct pf_krule *r) { int off = nlattr_add_nested(nw, attrtype); int i = 0; while (r->label[i][0] != 0 && i < PF_RULE_MAX_LABEL_COUNT) { nlattr_add_string(nw, PF_LT_LABEL, r->label[i]); i++; } nlattr_set_len(nw, off); return (true); } #define _OUT(_field) offsetof(struct pf_kpool, _field) 
static const struct nlattr_parser nla_p_pool[] = { { .type = PF_PT_KEY, .off = _OUT(key), .arg = (void *)sizeof(struct pf_poolhashkey), .cb = nlattr_get_bytes }, { .type = PF_PT_COUNTER, .off = _OUT(counter), .cb = nlattr_get_in6_addr }, { .type = PF_PT_TBLIDX, .off = _OUT(tblidx), .cb = nlattr_get_uint32 }, { .type = PF_PT_PROXY_SRC_PORT, .off = _OUT(proxy_port[0]), .cb = nlattr_get_uint16 }, { .type = PF_PT_PROXY_DST_PORT, .off = _OUT(proxy_port[1]), .cb = nlattr_get_uint16 }, { .type = PF_PT_OPTS, .off = _OUT(opts), .cb = nlattr_get_uint8 }, { .type = PF_PT_MAPE, .off = _OUT(mape), .arg = &mape_portset_parser, .cb = nlattr_get_nested }, }; NL_DECLARE_ATTR_PARSER(pool_parser, nla_p_pool); #undef _OUT static bool nlattr_add_pool(struct nl_writer *nw, int attrtype, const struct pf_kpool *pool) { int off = nlattr_add_nested(nw, attrtype); nlattr_add(nw, PF_PT_KEY, sizeof(struct pf_poolhashkey), &pool->key); nlattr_add_in6_addr(nw, PF_PT_COUNTER, (const struct in6_addr *)&pool->counter); nlattr_add_u32(nw, PF_PT_TBLIDX, pool->tblidx); nlattr_add_u16(nw, PF_PT_PROXY_SRC_PORT, pool->proxy_port[0]); nlattr_add_u16(nw, PF_PT_PROXY_DST_PORT, pool->proxy_port[1]); nlattr_add_u8(nw, PF_PT_OPTS, pool->opts); nlattr_add_mape_portset(nw, PF_PT_MAPE, &pool->mape); nlattr_set_len(nw, off); return (true); } #define _OUT(_field) offsetof(struct pf_rule_uid, _field) static const struct nlattr_parser nla_p_rule_uid[] = { { .type = PF_RUT_UID_LOW, .off = _OUT(uid[0]), .cb = nlattr_get_uint32 }, { .type = PF_RUT_UID_HIGH, .off = _OUT(uid[1]), .cb = nlattr_get_uint32 }, { .type = PF_RUT_OP, .off = _OUT(op), .cb = nlattr_get_uint8 }, }; NL_DECLARE_ATTR_PARSER(rule_uid_parser, nla_p_rule_uid); #undef _OUT static bool nlattr_add_rule_uid(struct nl_writer *nw, int attrtype, const struct pf_rule_uid *u) { int off = nlattr_add_nested(nw, attrtype); nlattr_add_u32(nw, PF_RUT_UID_LOW, u->uid[0]); nlattr_add_u32(nw, PF_RUT_UID_HIGH, u->uid[1]); nlattr_add_u8(nw, PF_RUT_OP, u->op); 
nlattr_set_len(nw, off); return (true); } struct nl_parsed_timeouts { uint32_t timeouts[PFTM_MAX]; uint32_t i; }; static int nlattr_get_pf_timeout(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) { struct nl_parsed_timeouts *t = (struct nl_parsed_timeouts *)target; int ret; if (t->i >= PFTM_MAX) return (E2BIG); ret = nlattr_get_uint32(nla, npt, NULL, &t->timeouts[t->i]); if (ret == 0) t->i++; return (ret); } #define _OUT(_field) offsetof(struct nl_parsed_timeout, _field) static const struct nlattr_parser nla_p_timeouts[] = { { .type = PF_TT_TIMEOUT, .off = 0, .cb = nlattr_get_pf_timeout }, }; NL_DECLARE_ATTR_PARSER(timeout_parser, nla_p_timeouts); #undef _OUT static int nlattr_get_nested_timeouts(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) { struct nl_parsed_timeouts parsed_timeouts = { }; int error; /* Assumes target points to the beginning of the structure */ error = nl_parse_header(NLA_DATA(nla), NLA_DATA_LEN(nla), &timeout_parser, npt, &parsed_timeouts); if (error != 0) return (error); memcpy(target, parsed_timeouts.timeouts, sizeof(parsed_timeouts.timeouts)); return (0); } static bool nlattr_add_timeout(struct nl_writer *nw, int attrtype, uint32_t *timeout) { int off = nlattr_add_nested(nw, attrtype); for (int i = 0; i < PFTM_MAX; i++) nlattr_add_u32(nw, PF_RT_TIMEOUT, timeout[i]); nlattr_set_len(nw, off); return (true); } #define _OUT(_field) offsetof(struct pf_krule, _field) static const struct nlattr_parser nla_p_rule[] = { { .type = PF_RT_SRC, .off = _OUT(src), .arg = &rule_addr_parser,.cb = nlattr_get_nested }, { .type = PF_RT_DST, .off = _OUT(dst), .arg = &rule_addr_parser,.cb = nlattr_get_nested }, { .type = PF_RT_RIDENTIFIER, .off = _OUT(ridentifier), .cb = nlattr_get_uint32 }, { .type = PF_RT_LABELS, .off = _OUT(label), .arg = &rule_labels_parser,.cb = nlattr_get_nested_pf_rule_labels }, { .type = PF_RT_IFNAME, .off = _OUT(ifname), .arg = (void *)IFNAMSIZ, .cb = nlattr_get_chara }, { .type = 
PF_RT_QNAME, .off = _OUT(qname), .arg = (void *)PF_QNAME_SIZE, .cb = nlattr_get_chara }, { .type = PF_RT_PQNAME, .off = _OUT(pqname), .arg = (void *)PF_QNAME_SIZE, .cb = nlattr_get_chara }, { .type = PF_RT_TAGNAME, .off = _OUT(tagname), .arg = (void *)PF_TAG_NAME_SIZE, .cb = nlattr_get_chara }, { .type = PF_RT_MATCH_TAGNAME, .off = _OUT(match_tagname), .arg = (void *)PF_TAG_NAME_SIZE, .cb = nlattr_get_chara }, { .type = PF_RT_OVERLOAD_TBLNAME, .off = _OUT(overload_tblname), .arg = (void *)PF_TABLE_NAME_SIZE, .cb = nlattr_get_chara }, { .type = PF_RT_RPOOL_RDR, .off = _OUT(rdr), .arg = &pool_parser, .cb = nlattr_get_nested }, { .type = PF_RT_OS_FINGERPRINT, .off = _OUT(os_fingerprint), .cb = nlattr_get_uint32 }, { .type = PF_RT_RTABLEID, .off = _OUT(rtableid), .cb = nlattr_get_uint32 }, { .type = PF_RT_TIMEOUT, .off = _OUT(timeout), .arg = &timeout_parser, .cb = nlattr_get_nested_timeouts }, { .type = PF_RT_MAX_STATES, .off = _OUT(max_states), .cb = nlattr_get_uint32 }, { .type = PF_RT_MAX_SRC_NODES, .off = _OUT(max_src_nodes), .cb = nlattr_get_uint32 }, { .type = PF_RT_MAX_SRC_STATES, .off = _OUT(max_src_states), .cb = nlattr_get_uint32 }, { .type = PF_RT_MAX_SRC_CONN_RATE_LIMIT, .off = _OUT(max_src_conn_rate.limit), .cb = nlattr_get_uint32 }, { .type = PF_RT_MAX_SRC_CONN_RATE_SECS, .off = _OUT(max_src_conn_rate.seconds), .cb = nlattr_get_uint32 }, { .type = PF_RT_DNPIPE, .off = _OUT(dnpipe), .cb = nlattr_get_uint16 }, { .type = PF_RT_DNRPIPE, .off = _OUT(dnrpipe), .cb = nlattr_get_uint16 }, { .type = PF_RT_DNFLAGS, .off = _OUT(free_flags), .cb = nlattr_get_uint32 }, { .type = PF_RT_NR, .off = _OUT(nr), .cb = nlattr_get_uint32 }, { .type = PF_RT_PROB, .off = _OUT(prob), .cb = nlattr_get_uint32 }, { .type = PF_RT_CUID, .off = _OUT(cuid), .cb = nlattr_get_uint32 }, {. 
type = PF_RT_CPID, .off = _OUT(cpid), .cb = nlattr_get_uint32 }, { .type = PF_RT_RETURN_ICMP, .off = _OUT(return_icmp), .cb = nlattr_get_uint16 }, { .type = PF_RT_RETURN_ICMP6, .off = _OUT(return_icmp6), .cb = nlattr_get_uint16 }, { .type = PF_RT_MAX_MSS, .off = _OUT(max_mss), .cb = nlattr_get_uint16 }, { .type = PF_RT_SCRUB_FLAGS, .off = _OUT(scrub_flags), .cb = nlattr_get_uint16 }, { .type = PF_RT_UID, .off = _OUT(uid), .arg = &rule_uid_parser, .cb = nlattr_get_nested }, { .type = PF_RT_GID, .off = _OUT(gid), .arg = &rule_uid_parser, .cb = nlattr_get_nested }, { .type = PF_RT_RULE_FLAG, .off = _OUT(rule_flag), .cb = nlattr_get_uint32 }, { .type = PF_RT_ACTION, .off = _OUT(action), .cb = nlattr_get_uint8 }, { .type = PF_RT_DIRECTION, .off = _OUT(direction), .cb = nlattr_get_uint8 }, { .type = PF_RT_LOG, .off = _OUT(log), .cb = nlattr_get_uint8 }, { .type = PF_RT_LOGIF, .off = _OUT(logif), .cb = nlattr_get_uint8 }, { .type = PF_RT_QUICK, .off = _OUT(quick), .cb = nlattr_get_uint8 }, { .type = PF_RT_IF_NOT, .off = _OUT(ifnot), .cb = nlattr_get_uint8 }, { .type = PF_RT_MATCH_TAG_NOT, .off = _OUT(match_tag_not), .cb = nlattr_get_uint8 }, { .type = PF_RT_NATPASS, .off = _OUT(natpass), .cb = nlattr_get_uint8 }, { .type = PF_RT_KEEP_STATE, .off = _OUT(keep_state), .cb = nlattr_get_uint8 }, { .type = PF_RT_AF, .off = _OUT(af), .cb = nlattr_get_uint8 }, { .type = PF_RT_PROTO, .off = _OUT(proto), .cb = nlattr_get_uint8 }, { .type = PF_RT_TYPE, .off = _OUT(type), .cb = nlattr_get_uint8 }, { .type = PF_RT_CODE, .off = _OUT(code), .cb = nlattr_get_uint8 }, { .type = PF_RT_FLAGS, .off = _OUT(flags), .cb = nlattr_get_uint8 }, { .type = PF_RT_FLAGSET, .off = _OUT(flagset), .cb = nlattr_get_uint8 }, { .type = PF_RT_MIN_TTL, .off = _OUT(min_ttl), .cb = nlattr_get_uint8 }, { .type = PF_RT_ALLOW_OPTS, .off = _OUT(allow_opts), .cb = nlattr_get_uint8 }, { .type = PF_RT_RT, .off = _OUT(rt), .cb = nlattr_get_uint8 }, { .type = PF_RT_RETURN_TTL, .off = _OUT(return_ttl), .cb = 
nlattr_get_uint8 }, { .type = PF_RT_TOS, .off = _OUT(tos), .cb = nlattr_get_uint8 }, { .type = PF_RT_SET_TOS, .off = _OUT(set_tos), .cb = nlattr_get_uint8 }, { .type = PF_RT_ANCHOR_RELATIVE, .off = _OUT(anchor_relative), .cb = nlattr_get_uint8 }, { .type = PF_RT_ANCHOR_WILDCARD, .off = _OUT(anchor_wildcard), .cb = nlattr_get_uint8 }, { .type = PF_RT_FLUSH, .off = _OUT(flush), .cb = nlattr_get_uint8 }, { .type = PF_RT_PRIO, .off = _OUT(prio), .cb = nlattr_get_uint8 }, { .type = PF_RT_SET_PRIO, .off = _OUT(set_prio[0]), .cb = nlattr_get_uint8 }, { .type = PF_RT_SET_PRIO_REPLY, .off = _OUT(set_prio[1]), .cb = nlattr_get_uint8 }, { .type = PF_RT_DIVERT_ADDRESS, .off = _OUT(divert.addr), .cb = nlattr_get_in6_addr }, { .type = PF_RT_DIVERT_PORT, .off = _OUT(divert.port), .cb = nlattr_get_uint16 }, { .type = PF_RT_RCV_IFNAME, .off = _OUT(rcv_ifname), .arg = (void *)IFNAMSIZ, .cb = nlattr_get_chara }, { .type = PF_RT_MAX_SRC_CONN, .off = _OUT(max_src_conn), .cb = nlattr_get_uint32 }, { .type = PF_RT_RPOOL_NAT, .off = _OUT(nat), .arg = &pool_parser, .cb = nlattr_get_nested }, { .type = PF_RT_NAF, .off = _OUT(naf), .cb = nlattr_get_uint8 }, { .type = PF_RT_RPOOL_RT, .off = _OUT(route), .arg = &pool_parser, .cb = nlattr_get_nested }, }; NL_DECLARE_ATTR_PARSER(rule_parser, nla_p_rule); #undef _OUT struct nl_parsed_addrule { struct pf_krule *rule; uint32_t ticket; uint32_t pool_ticket; char *anchor; char *anchor_call; }; #define _OUT(_field) offsetof(struct nl_parsed_addrule, _field) static const struct nlattr_parser nla_p_addrule[] = { { .type = PF_ART_TICKET, .off = _OUT(ticket), .cb = nlattr_get_uint32 }, { .type = PF_ART_POOL_TICKET, .off = _OUT(pool_ticket), .cb = nlattr_get_uint32 }, { .type = PF_ART_ANCHOR, .off = _OUT(anchor), .cb = nlattr_get_string }, { .type = PF_ART_ANCHOR_CALL, .off = _OUT(anchor_call), .cb = nlattr_get_string }, { .type = PF_ART_RULE, .off = _OUT(rule), .arg = &rule_parser, .cb = nlattr_get_nested_ptr } }; #undef _OUT 
NL_DECLARE_PARSER(addrule_parser, struct genlmsghdr, nlf_p_empty, nla_p_addrule); static int pf_handle_addrule(struct nlmsghdr *hdr, struct nl_pstate *npt) { int error; struct nl_parsed_addrule attrs = {}; attrs.rule = pf_krule_alloc(); error = nl_parse_nlmsg(hdr, &addrule_parser, npt, &attrs); if (error != 0) { pf_free_rule(attrs.rule); return (error); } error = pf_ioctl_addrule(attrs.rule, attrs.ticket, attrs.pool_ticket, attrs.anchor, attrs.anchor_call, nlp_get_cred(npt->nlp)->cr_uid, hdr->nlmsg_pid); return (error); } #define _OUT(_field) offsetof(struct pfioc_rule, _field) static const struct nlattr_parser nla_p_getrules[] = { { .type = PF_GR_ANCHOR, .off = _OUT(anchor), .arg = (void *)MAXPATHLEN, .cb = nlattr_get_chara }, { .type = PF_GR_ACTION, .off = _OUT(rule.action), .cb = nlattr_get_uint8 }, }; #undef _OUT NL_DECLARE_PARSER(getrules_parser, struct genlmsghdr, nlf_p_empty, nla_p_getrules); static int pf_handle_getrules(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct pfioc_rule attrs = {}; int error; struct nl_writer *nw = npt->nw; struct genlmsghdr *ghdr_new; error = nl_parse_nlmsg(hdr, &getrules_parser, npt, &attrs); if (error != 0) return (error); if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr))) return (ENOMEM); ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr); ghdr_new->cmd = PFNL_CMD_GETRULES; ghdr_new->version = 0; ghdr_new->reserved = 0; error = pf_ioctl_getrules(&attrs); if (error != 0) goto out; nlattr_add_u32(nw, PF_GR_NR, attrs.nr); nlattr_add_u32(nw, PF_GR_TICKET, attrs.ticket); if (!nlmsg_end(nw)) { error = ENOMEM; goto out; } return (0); out: nlmsg_abort(nw); return (error); } struct nl_parsed_get_rule { char anchor[MAXPATHLEN]; uint8_t action; uint32_t nr; uint32_t ticket; uint8_t clear; }; #define _OUT(_field) offsetof(struct nl_parsed_get_rule, _field) static const struct nlattr_parser nla_p_getrule[] = { { .type = PF_GR_ANCHOR, .off = _OUT(anchor), .arg = (void *)MAXPATHLEN, .cb = nlattr_get_chara }, { .type = 
PF_GR_ACTION, .off = _OUT(action), .cb = nlattr_get_uint8 }, { .type = PF_GR_NR, .off = _OUT(nr), .cb = nlattr_get_uint32 }, { .type = PF_GR_TICKET, .off = _OUT(ticket), .cb = nlattr_get_uint32 }, { .type = PF_GR_CLEAR, .off = _OUT(clear), .cb = nlattr_get_uint8 }, }; #undef _OUT NL_DECLARE_PARSER(getrule_parser, struct genlmsghdr, nlf_p_empty, nla_p_getrule); static int pf_handle_getrule(struct nlmsghdr *hdr, struct nl_pstate *npt) { char anchor_call[MAXPATHLEN]; struct nl_parsed_get_rule attrs = {}; struct nl_writer *nw = npt->nw; struct genlmsghdr *ghdr_new; struct pf_kruleset *ruleset; struct pf_krule *rule; int rs_num; int error; error = nl_parse_nlmsg(hdr, &getrule_parser, npt, &attrs); if (error != 0) return (error); if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr))) return (ENOMEM); ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr); ghdr_new->cmd = PFNL_CMD_GETRULE; ghdr_new->version = 0; ghdr_new->reserved = 0; PF_RULES_WLOCK(); ruleset = pf_find_kruleset(attrs.anchor); if (ruleset == NULL) { PF_RULES_WUNLOCK(); error = ENOENT; goto out; } rs_num = pf_get_ruleset_number(attrs.action); if (rs_num >= PF_RULESET_MAX) { PF_RULES_WUNLOCK(); error = EINVAL; goto out; } if (attrs.ticket != ruleset->rules[rs_num].active.ticket) { PF_RULES_WUNLOCK(); error = EBUSY; goto out; } rule = TAILQ_FIRST(ruleset->rules[rs_num].active.ptr); while ((rule != NULL) && (rule->nr != attrs.nr)) rule = TAILQ_NEXT(rule, entries); if (rule == NULL) { PF_RULES_WUNLOCK(); error = EBUSY; goto out; } nlattr_add_rule_addr(nw, PF_RT_SRC, &rule->src); nlattr_add_rule_addr(nw, PF_RT_DST, &rule->dst); nlattr_add_u32(nw, PF_RT_RIDENTIFIER, rule->ridentifier); nlattr_add_labels(nw, PF_RT_LABELS, rule); nlattr_add_string(nw, PF_RT_IFNAME, rule->ifname); nlattr_add_string(nw, PF_RT_QNAME, rule->qname); nlattr_add_string(nw, PF_RT_PQNAME, rule->pqname); nlattr_add_string(nw, PF_RT_TAGNAME, rule->tagname); nlattr_add_string(nw, PF_RT_MATCH_TAGNAME, rule->match_tagname); 
nlattr_add_string(nw, PF_RT_OVERLOAD_TBLNAME, rule->overload_tblname); nlattr_add_pool(nw, PF_RT_RPOOL_RDR, &rule->rdr); nlattr_add_pool(nw, PF_RT_RPOOL_NAT, &rule->nat); nlattr_add_pool(nw, PF_RT_RPOOL_RT, &rule->route); nlattr_add_u32(nw, PF_RT_OS_FINGERPRINT, rule->os_fingerprint); nlattr_add_u32(nw, PF_RT_RTABLEID, rule->rtableid); nlattr_add_timeout(nw, PF_RT_TIMEOUT, rule->timeout); nlattr_add_u32(nw, PF_RT_MAX_STATES, rule->max_states); nlattr_add_u32(nw, PF_RT_MAX_SRC_NODES, rule->max_src_nodes); nlattr_add_u32(nw, PF_RT_MAX_SRC_STATES, rule->max_src_states); nlattr_add_u32(nw, PF_RT_MAX_SRC_CONN, rule->max_src_conn); nlattr_add_u32(nw, PF_RT_MAX_SRC_CONN_RATE_LIMIT, rule->max_src_conn_rate.limit); nlattr_add_u32(nw, PF_RT_MAX_SRC_CONN_RATE_SECS, rule->max_src_conn_rate.seconds); nlattr_add_u16(nw, PF_RT_DNPIPE, rule->dnpipe); nlattr_add_u16(nw, PF_RT_DNRPIPE, rule->dnrpipe); nlattr_add_u32(nw, PF_RT_DNFLAGS, rule->free_flags); nlattr_add_u32(nw, PF_RT_NR, rule->nr); nlattr_add_u32(nw, PF_RT_PROB, rule->prob); nlattr_add_u32(nw, PF_RT_CUID, rule->cuid); nlattr_add_u32(nw, PF_RT_CPID, rule->cpid); nlattr_add_u16(nw, PF_RT_RETURN_ICMP, rule->return_icmp); nlattr_add_u16(nw, PF_RT_RETURN_ICMP6, rule->return_icmp6); nlattr_add_u16(nw, PF_RT_RETURN_ICMP6, rule->return_icmp6); nlattr_add_u16(nw, PF_RT_MAX_MSS, rule->max_mss); nlattr_add_u16(nw, PF_RT_SCRUB_FLAGS, rule->scrub_flags); nlattr_add_rule_uid(nw, PF_RT_UID, &rule->uid); nlattr_add_rule_uid(nw, PF_RT_GID, (const struct pf_rule_uid *)&rule->gid); nlattr_add_string(nw, PF_RT_RCV_IFNAME, rule->rcv_ifname); nlattr_add_u32(nw, PF_RT_RULE_FLAG, rule->rule_flag); nlattr_add_u8(nw, PF_RT_ACTION, rule->action); nlattr_add_u8(nw, PF_RT_DIRECTION, rule->direction); nlattr_add_u8(nw, PF_RT_LOG, rule->log); nlattr_add_u8(nw, PF_RT_LOGIF, rule->logif); nlattr_add_u8(nw, PF_RT_QUICK, rule->quick); nlattr_add_u8(nw, PF_RT_IF_NOT, rule->ifnot); nlattr_add_u8(nw, PF_RT_MATCH_TAG_NOT, rule->match_tag_not); 
nlattr_add_u8(nw, PF_RT_NATPASS, rule->natpass); nlattr_add_u8(nw, PF_RT_KEEP_STATE, rule->keep_state); nlattr_add_u8(nw, PF_RT_AF, rule->af); nlattr_add_u8(nw, PF_RT_NAF, rule->naf); nlattr_add_u8(nw, PF_RT_PROTO, rule->proto); nlattr_add_u8(nw, PF_RT_TYPE, rule->type); nlattr_add_u8(nw, PF_RT_CODE, rule->code); nlattr_add_u8(nw, PF_RT_FLAGS, rule->flags); nlattr_add_u8(nw, PF_RT_FLAGSET, rule->flagset); nlattr_add_u8(nw, PF_RT_MIN_TTL, rule->min_ttl); nlattr_add_u8(nw, PF_RT_ALLOW_OPTS, rule->allow_opts); nlattr_add_u8(nw, PF_RT_RT, rule->rt); nlattr_add_u8(nw, PF_RT_RETURN_TTL, rule->return_ttl); nlattr_add_u8(nw, PF_RT_TOS, rule->tos); nlattr_add_u8(nw, PF_RT_SET_TOS, rule->set_tos); nlattr_add_u8(nw, PF_RT_ANCHOR_RELATIVE, rule->anchor_relative); nlattr_add_u8(nw, PF_RT_ANCHOR_WILDCARD, rule->anchor_wildcard); nlattr_add_u8(nw, PF_RT_FLUSH, rule->flush); nlattr_add_u8(nw, PF_RT_PRIO, rule->prio); nlattr_add_u8(nw, PF_RT_SET_PRIO, rule->set_prio[0]); nlattr_add_u8(nw, PF_RT_SET_PRIO_REPLY, rule->set_prio[1]); nlattr_add_in6_addr(nw, PF_RT_DIVERT_ADDRESS, &rule->divert.addr.v6); nlattr_add_u16(nw, PF_RT_DIVERT_PORT, rule->divert.port); nlattr_add_u64(nw, PF_RT_PACKETS_IN, pf_counter_u64_fetch(&rule->packets[0])); nlattr_add_u64(nw, PF_RT_PACKETS_OUT, pf_counter_u64_fetch(&rule->packets[1])); nlattr_add_u64(nw, PF_RT_BYTES_IN, pf_counter_u64_fetch(&rule->bytes[0])); nlattr_add_u64(nw, PF_RT_BYTES_OUT, pf_counter_u64_fetch(&rule->bytes[1])); nlattr_add_u64(nw, PF_RT_EVALUATIONS, pf_counter_u64_fetch(&rule->evaluations)); nlattr_add_u64(nw, PF_RT_TIMESTAMP, pf_get_timestamp(rule)); nlattr_add_u64(nw, PF_RT_STATES_CUR, counter_u64_fetch(rule->states_cur)); nlattr_add_u64(nw, PF_RT_STATES_TOTAL, counter_u64_fetch(rule->states_tot)); nlattr_add_u64(nw, PF_RT_SRC_NODES, counter_u64_fetch(rule->src_nodes)); error = pf_kanchor_copyout(ruleset, rule, anchor_call, sizeof(anchor_call)); MPASS(error == 0); nlattr_add_string(nw, PF_RT_ANCHOR_CALL, anchor_call); if 
(attrs.clear) pf_krule_clear_counters(rule); PF_RULES_WUNLOCK(); if (!nlmsg_end(nw)) { error = ENOMEM; goto out; } return (0); out: nlmsg_abort(nw); return (error); } #define _OUT(_field) offsetof(struct pf_kstate_kill, _field) static const struct nlattr_parser nla_p_clear_states[] = { { .type = PF_CS_CMP_ID, .off = _OUT(psk_pfcmp.id), .cb = nlattr_get_uint64 }, { .type = PF_CS_CMP_CREATORID, .off = _OUT(psk_pfcmp.creatorid), .cb = nlattr_get_uint32 }, { .type = PF_CS_CMP_DIR, .off = _OUT(psk_pfcmp.direction), .cb = nlattr_get_uint8 }, { .type = PF_CS_AF, .off = _OUT(psk_af), .cb = nlattr_get_uint8 }, { .type = PF_CS_PROTO, .off = _OUT(psk_proto), .cb = nlattr_get_uint8 }, { .type = PF_CS_SRC, .off = _OUT(psk_src), .arg = &rule_addr_parser, .cb = nlattr_get_nested }, { .type = PF_CS_DST, .off = _OUT(psk_dst), .arg = &rule_addr_parser, .cb = nlattr_get_nested }, { .type = PF_CS_RT_ADDR, .off = _OUT(psk_rt_addr), .arg = &rule_addr_parser, .cb = nlattr_get_nested }, { .type = PF_CS_IFNAME, .off = _OUT(psk_ifname), .arg = (void *)IFNAMSIZ, .cb = nlattr_get_chara }, { .type = PF_CS_LABEL, .off = _OUT(psk_label), .arg = (void *)PF_RULE_LABEL_SIZE, .cb = nlattr_get_chara }, { .type = PF_CS_KILL_MATCH, .off = _OUT(psk_kill_match), .cb = nlattr_get_bool }, { .type = PF_CS_NAT, .off = _OUT(psk_nat), .cb = nlattr_get_bool }, }; #undef _OUT NL_DECLARE_PARSER(clear_states_parser, struct genlmsghdr, nlf_p_empty, nla_p_clear_states); static int pf_handle_killclear_states(struct nlmsghdr *hdr, struct nl_pstate *npt, int cmd) { struct pf_kstate_kill kill = {}; struct epoch_tracker et; struct nl_writer *nw = npt->nw; struct genlmsghdr *ghdr_new; int error; unsigned int killed = 0; error = nl_parse_nlmsg(hdr, &clear_states_parser, npt, &kill); if (error != 0) return (error); if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr))) return (ENOMEM); ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr); ghdr_new->cmd = cmd; ghdr_new->version = 0; ghdr_new->reserved = 0; 
NET_EPOCH_ENTER(et); if (cmd == PFNL_CMD_KILLSTATES) pf_killstates(&kill, &killed); else killed = pf_clear_states(&kill); NET_EPOCH_EXIT(et); nlattr_add_u32(nw, PF_CS_KILLED, killed); if (! nlmsg_end(nw)) { error = ENOMEM; goto out; } return (0); out: nlmsg_abort(nw); return (error); } static int pf_handle_clear_states(struct nlmsghdr *hdr, struct nl_pstate *npt) { return (pf_handle_killclear_states(hdr, npt, PFNL_CMD_CLRSTATES)); } static int pf_handle_kill_states(struct nlmsghdr *hdr, struct nl_pstate *npt) { return (pf_handle_killclear_states(hdr, npt, PFNL_CMD_KILLSTATES)); } struct nl_parsed_set_statusif { char ifname[IFNAMSIZ]; }; #define _OUT(_field) offsetof(struct nl_parsed_set_statusif, _field) static const struct nlattr_parser nla_p_set_statusif[] = { { .type = PF_SS_IFNAME, .off = _OUT(ifname), .arg = (const void *)IFNAMSIZ, .cb = nlattr_get_chara }, }; #undef _OUT NL_DECLARE_PARSER(set_statusif_parser, struct genlmsghdr, nlf_p_empty, nla_p_set_statusif); static int pf_handle_set_statusif(struct nlmsghdr *hdr, struct nl_pstate *npt) { int error; struct nl_parsed_set_statusif attrs = {}; error = nl_parse_nlmsg(hdr, &set_statusif_parser, npt, &attrs); if (error != 0) return (error); PF_RULES_WLOCK(); strlcpy(V_pf_status.ifname, attrs.ifname, IFNAMSIZ); PF_RULES_WUNLOCK(); return (0); } static bool nlattr_add_counters(struct nl_writer *nw, int attr, size_t number, char **names, counter_u64_t *counters) { for (int i = 0; i < number; i++) { int off = nlattr_add_nested(nw, attr); nlattr_add_u32(nw, PF_C_ID, i); nlattr_add_string(nw, PF_C_NAME, names[i]); nlattr_add_u64(nw, PF_C_COUNTER, counter_u64_fetch(counters[i])); nlattr_set_len(nw, off); } return (true); } static bool nlattr_add_fcounters(struct nl_writer *nw, int attr, size_t number, char **names, struct pf_counter_u64 *counters) { for (int i = 0; i < number; i++) { int off = nlattr_add_nested(nw, attr); nlattr_add_u32(nw, PF_C_ID, i); nlattr_add_string(nw, PF_C_NAME, names[i]); nlattr_add_u64(nw, 
PF_C_COUNTER, pf_counter_u64_fetch(&counters[i])); nlattr_set_len(nw, off); } return (true); } static bool nlattr_add_u64_array(struct nl_writer *nw, int attr, size_t number, uint64_t *array) { int off = nlattr_add_nested(nw, attr); for (size_t i = 0; i < number; i++) nlattr_add_u64(nw, 0, array[i]); nlattr_set_len(nw, off); return (true); } static int pf_handle_get_status(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct pf_status s; struct nl_writer *nw = npt->nw; struct genlmsghdr *ghdr_new; char *pf_reasons[PFRES_MAX+1] = PFRES_NAMES; char *pf_lcounter[KLCNT_MAX+1] = KLCNT_NAMES; char *pf_fcounter[FCNT_MAX+1] = FCNT_NAMES; int error; PF_RULES_RLOCK_TRACKER; if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr))) return (ENOMEM); ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr); ghdr_new->cmd = PFNL_CMD_GET_STATUS; ghdr_new->version = 0; ghdr_new->reserved = 0; PF_RULES_RLOCK(); nlattr_add_string(nw, PF_GS_IFNAME, V_pf_status.ifname); nlattr_add_bool(nw, PF_GS_RUNNING, V_pf_status.running); nlattr_add_u32(nw, PF_GS_SINCE, V_pf_status.since); nlattr_add_u32(nw, PF_GS_DEBUG, V_pf_status.debug); nlattr_add_u32(nw, PF_GS_HOSTID, ntohl(V_pf_status.hostid)); nlattr_add_u32(nw, PF_GS_STATES, V_pf_status.states); nlattr_add_u32(nw, PF_GS_SRC_NODES, V_pf_status.src_nodes); nlattr_add_u32(nw, PF_GS_REASSEMBLE, V_pf_status.reass); nlattr_add_u32(nw, PF_GS_SYNCOOKIES_ACTIVE, V_pf_status.syncookies_active); nlattr_add_counters(nw, PF_GS_COUNTERS, PFRES_MAX, pf_reasons, V_pf_status.counters); nlattr_add_counters(nw, PF_GS_LCOUNTERS, KLCNT_MAX, pf_lcounter, V_pf_status.lcounters); nlattr_add_fcounters(nw, PF_GS_FCOUNTERS, FCNT_MAX, pf_fcounter, V_pf_status.fcounters); nlattr_add_counters(nw, PF_GS_SCOUNTERS, SCNT_MAX, pf_fcounter, V_pf_status.scounters); pfi_update_status(V_pf_status.ifname, &s); nlattr_add_u64_array(nw, PF_GS_BCOUNTERS, 2 * 2, (uint64_t *)s.bcounters); nlattr_add_u64_array(nw, PF_GS_PCOUNTERS, 2 * 2 * 2, (uint64_t *)s.pcounters); nlattr_add(nw, 
PF_GS_CHKSUM, PF_MD5_DIGEST_LENGTH, V_pf_status.pf_chksum); PF_RULES_RUNLOCK(); if (!nlmsg_end(nw)) { error = ENOMEM; goto out; } return (0); out: nlmsg_abort(nw); return (error); } static int pf_handle_clear_status(struct nlmsghdr *hdr, struct nl_pstate *npt) { pf_ioctl_clear_status(); return (0); } struct pf_nl_natlook { sa_family_t af; uint8_t direction; uint8_t proto; struct pf_addr src; struct pf_addr dst; uint16_t sport; uint16_t dport; }; #define _OUT(_field) offsetof(struct pf_nl_natlook, _field) static const struct nlattr_parser nla_p_natlook[] = { { .type = PF_NL_AF, .off = _OUT(af), .cb = nlattr_get_uint8 }, { .type = PF_NL_DIRECTION, .off = _OUT(direction), .cb = nlattr_get_uint8 }, { .type = PF_NL_PROTO, .off = _OUT(proto), .cb = nlattr_get_uint8 }, { .type = PF_NL_SRC_ADDR, .off = _OUT(src), .cb = nlattr_get_in6_addr }, { .type = PF_NL_DST_ADDR, .off = _OUT(dst), .cb = nlattr_get_in6_addr }, { .type = PF_NL_SRC_PORT, .off = _OUT(sport), .cb = nlattr_get_uint16 }, { .type = PF_NL_DST_PORT, .off = _OUT(dport), .cb = nlattr_get_uint16 }, }; #undef _OUT NL_DECLARE_PARSER(natlook_parser, struct genlmsghdr, nlf_p_empty, nla_p_natlook); static int pf_handle_natlook(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct pf_nl_natlook attrs = {}; struct pf_state_key_cmp key = {}; struct nl_writer *nw = npt->nw; struct pf_state_key *sk; struct pf_kstate *state; struct genlmsghdr *ghdr_new; int error, m = 0; int sidx, didx; error = nl_parse_nlmsg(hdr, &natlook_parser, npt, &attrs); if (error != 0) return (error); if (attrs.proto == 0 || PF_AZERO(&attrs.src, attrs.af) || PF_AZERO(&attrs.dst, attrs.af) || ((attrs.proto == IPPROTO_TCP || attrs.proto == IPPROTO_UDP) && (attrs.sport == 0 || attrs.dport == 0))) return (EINVAL); /* NATLOOK src and dst are reversed, so reverse sidx/didx */ sidx = (attrs.direction == PF_IN) ? 1 : 0; didx = (attrs.direction == PF_IN) ? 
0 : 1; key.af = attrs.af; key.proto = attrs.proto; PF_ACPY(&key.addr[sidx], &attrs.src, attrs.af); key.port[sidx] = attrs.sport; PF_ACPY(&key.addr[didx], &attrs.dst, attrs.af); key.port[didx] = attrs.dport; state = pf_find_state_all(&key, attrs.direction, &m); if (state == NULL) return (ENOENT); if (m > 1) { PF_STATE_UNLOCK(state); return (E2BIG); } if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr))) { PF_STATE_UNLOCK(state); return (ENOMEM); } ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr); ghdr_new->cmd = PFNL_CMD_NATLOOK; ghdr_new->version = 0; ghdr_new->reserved = 0; sk = state->key[sidx]; nlattr_add_in6_addr(nw, PF_NL_SRC_ADDR, &sk->addr[sidx].v6); nlattr_add_in6_addr(nw, PF_NL_DST_ADDR, &sk->addr[didx].v6); nlattr_add_u16(nw, PF_NL_SRC_PORT, sk->port[sidx]); nlattr_add_u16(nw, PF_NL_DST_PORT, sk->port[didx]); PF_STATE_UNLOCK(state); if (!nlmsg_end(nw)) { nlmsg_abort(nw); return (ENOMEM); } return (0); } struct pf_nl_set_debug { uint32_t level; }; #define _OUT(_field) offsetof(struct pf_nl_set_debug, _field) static const struct nlattr_parser nla_p_set_debug[] = { { .type = PF_SD_LEVEL, .off = _OUT(level), .cb = nlattr_get_uint32 }, }; #undef _OUT NL_DECLARE_PARSER(set_debug_parser, struct genlmsghdr, nlf_p_empty, nla_p_set_debug); static int pf_handle_set_debug(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct pf_nl_set_debug attrs = {}; int error; error = nl_parse_nlmsg(hdr, &set_debug_parser, npt, &attrs); if (error != 0) return (error); PF_RULES_WLOCK(); V_pf_status.debug = attrs.level; PF_RULES_WUNLOCK(); return (0); } struct pf_nl_set_timeout { uint32_t timeout; uint32_t seconds; }; #define _OUT(_field) offsetof(struct pf_nl_set_timeout, _field) static const struct nlattr_parser nla_p_set_timeout[] = { { .type = PF_TO_TIMEOUT, .off = _OUT(timeout), .cb = nlattr_get_uint32 }, { .type = PF_TO_SECONDS, .off = _OUT(seconds), .cb = nlattr_get_uint32 }, }; #undef _OUT NL_DECLARE_PARSER(set_timeout_parser, struct genlmsghdr, nlf_p_empty, 
nla_p_set_timeout); static int pf_handle_set_timeout(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct pf_nl_set_timeout attrs = {}; int error; error = nl_parse_nlmsg(hdr, &set_timeout_parser, npt, &attrs); if (error != 0) return (error); return (pf_ioctl_set_timeout(attrs.timeout, attrs.seconds, NULL)); } static int pf_handle_get_timeout(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct pf_nl_set_timeout attrs = {}; struct nl_writer *nw = npt->nw; struct genlmsghdr *ghdr_new; int error; error = nl_parse_nlmsg(hdr, &set_timeout_parser, npt, &attrs); if (error != 0) return (error); error = pf_ioctl_get_timeout(attrs.timeout, &attrs.seconds); if (error != 0) return (error); if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr))) return (ENOMEM); ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr); ghdr_new->cmd = PFNL_CMD_GET_TIMEOUT; ghdr_new->version = 0; ghdr_new->reserved = 0; nlattr_add_u32(nw, PF_TO_SECONDS, attrs.seconds); if (!nlmsg_end(nw)) { nlmsg_abort(nw); return (ENOMEM); } return (0); } struct pf_nl_set_limit { uint32_t index; uint32_t limit; }; #define _OUT(_field) offsetof(struct pf_nl_set_limit, _field) static const struct nlattr_parser nla_p_set_limit[] = { { .type = PF_LI_INDEX, .off = _OUT(index), .cb = nlattr_get_uint32 }, { .type = PF_LI_LIMIT, .off = _OUT(limit), .cb = nlattr_get_uint32 }, }; #undef _OUT NL_DECLARE_PARSER(set_limit_parser, struct genlmsghdr, nlf_p_empty, nla_p_set_limit); static int pf_handle_set_limit(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct pf_nl_set_limit attrs = {}; int error; error = nl_parse_nlmsg(hdr, &set_limit_parser, npt, &attrs); if (error != 0) return (error); return (pf_ioctl_set_limit(attrs.index, attrs.limit, NULL)); } static int pf_handle_get_limit(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct pf_nl_set_limit attrs = {}; struct nl_writer *nw = npt->nw; struct genlmsghdr *ghdr_new; int error; error = nl_parse_nlmsg(hdr, &set_limit_parser, npt, &attrs); if (error != 0) return (error); 
error = pf_ioctl_get_limit(attrs.index, &attrs.limit); if (error != 0) return (error); if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr))) return (ENOMEM); ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr); ghdr_new->cmd = PFNL_CMD_GET_LIMIT; ghdr_new->version = 0; ghdr_new->reserved = 0; nlattr_add_u32(nw, PF_LI_LIMIT, attrs.limit); if (!nlmsg_end(nw)) { nlmsg_abort(nw); return (ENOMEM); } return (0); } static int pf_handle_begin_addrs(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct nl_writer *nw = npt->nw; struct genlmsghdr *ghdr_new; uint32_t ticket; int error; error = pf_ioctl_begin_addrs(&ticket); if (error != 0) return (error); if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr))) return (ENOMEM); ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr); ghdr_new->cmd = PFNL_CMD_BEGIN_ADDRS; ghdr_new->version = 0; ghdr_new->reserved = 0; nlattr_add_u32(nw, PF_BA_TICKET, ticket); if (!nlmsg_end(nw)) { nlmsg_abort(nw); return (ENOMEM); } return (0); } static bool nlattr_add_pool_addr(struct nl_writer *nw, int attrtype, struct pf_pooladdr *a) { int off; off = nlattr_add_nested(nw, attrtype); nlattr_add_addr_wrap(nw, PF_PA_ADDR, &a->addr); nlattr_add_string(nw, PF_PA_IFNAME, a->ifname); nlattr_set_len(nw, off); return (true); } #define _OUT(_field) offsetof(struct pf_pooladdr, _field) static const struct nlattr_parser nla_p_pool_addr[] = { { .type = PF_PA_ADDR, .off = _OUT(addr), .arg = &addr_wrap_parser, .cb = nlattr_get_nested }, { .type = PF_PA_IFNAME, .off = _OUT(ifname), .arg = (void *)IFNAMSIZ, .cb = nlattr_get_chara }, }; NL_DECLARE_ATTR_PARSER(pool_addr_parser, nla_p_pool_addr); #undef _OUT #define _OUT(_field) offsetof(struct pf_nl_pooladdr, _field) static const struct nlattr_parser nla_p_add_addr[] = { { .type = PF_AA_ACTION, .off = _OUT(action), .cb = nlattr_get_uint32 }, { .type = PF_AA_TICKET, .off = _OUT(ticket), .cb = nlattr_get_uint32 }, { .type = PF_AA_NR, .off = _OUT(nr), .cb = nlattr_get_uint32 }, { .type = PF_AA_R_NUM, .off = 
_OUT(r_num), .cb = nlattr_get_uint32 }, { .type = PF_AA_R_ACTION, .off = _OUT(r_action), .cb = nlattr_get_uint8 }, { .type = PF_AA_R_LAST, .off = _OUT(r_last), .cb = nlattr_get_uint8 }, { .type = PF_AA_AF, .off = _OUT(af), .cb = nlattr_get_uint8 }, { .type = PF_AA_ANCHOR, .off = _OUT(anchor), .arg = (void *)MAXPATHLEN, .cb = nlattr_get_chara }, { .type = PF_AA_ADDR, .off = _OUT(addr), .arg = &pool_addr_parser, .cb = nlattr_get_nested }, { .type = PF_AA_WHICH, .off = _OUT(which), .cb = nlattr_get_uint32 }, }; #undef _OUT NL_DECLARE_PARSER(add_addr_parser, struct genlmsghdr, nlf_p_empty, nla_p_add_addr); static int pf_handle_add_addr(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct pf_nl_pooladdr attrs = { 0 }; int error; error = nl_parse_nlmsg(hdr, &add_addr_parser, npt, &attrs); if (error != 0) return (error); if (attrs.which == 0) attrs.which = PF_RDR; error = pf_ioctl_add_addr(&attrs); return (error); } static int pf_handle_get_addrs(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct pf_nl_pooladdr attrs = { 0 }; struct nl_writer *nw = npt->nw; struct genlmsghdr *ghdr_new; int error; error = nl_parse_nlmsg(hdr, &add_addr_parser, npt, &attrs); if (error != 0) return (error); if (attrs.which == 0) attrs.which = PF_RDR; error = pf_ioctl_get_addrs(&attrs); if (error != 0) return (error); if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr))) return (ENOMEM); ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr); ghdr_new->cmd = PFNL_CMD_GET_ADDRS; ghdr_new->version = 0; ghdr_new->reserved = 0; nlattr_add_u32(nw, PF_AA_NR, attrs.nr); if (!nlmsg_end(nw)) { nlmsg_abort(nw); return (ENOMEM); } return (error); } static int pf_handle_get_addr(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct pf_nl_pooladdr attrs = { 0 }; struct nl_writer *nw = npt->nw; struct genlmsghdr *ghdr_new; int error; error = nl_parse_nlmsg(hdr, &add_addr_parser, npt, &attrs); if (error != 0) return (error); if (attrs.which == 0) attrs.which = PF_RDR; error = pf_ioctl_get_addr(&attrs); 
if (error != 0) return (error); if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr))) return (ENOMEM); ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr); ghdr_new->cmd = PFNL_CMD_GET_ADDR; ghdr_new->version = 0; ghdr_new->reserved = 0; nlattr_add_u32(nw, PF_AA_ACTION, attrs.action); nlattr_add_u32(nw, PF_AA_TICKET, attrs.ticket); nlattr_add_u32(nw, PF_AA_NR, attrs.nr); nlattr_add_u32(nw, PF_AA_R_NUM, attrs.r_num); nlattr_add_u8(nw, PF_AA_R_ACTION, attrs.r_action); nlattr_add_u8(nw, PF_AA_R_LAST, attrs.r_last); nlattr_add_u8(nw, PF_AA_AF, attrs.af); nlattr_add_string(nw, PF_AA_ANCHOR, attrs.anchor); nlattr_add_pool_addr(nw, PF_AA_ADDR, &attrs.addr); if (!nlmsg_end(nw)) { nlmsg_abort(nw); return (ENOMEM); } return (0); } #define _OUT(_field) offsetof(struct pfioc_ruleset, _field) static const struct nlattr_parser nla_p_ruleset[] = { { .type = PF_RS_PATH, .off = _OUT(path), .arg = (void *)MAXPATHLEN, .cb = nlattr_get_chara }, { .type = PF_RS_NR, .off = _OUT(nr), .cb = nlattr_get_uint32 }, }; NL_DECLARE_PARSER(ruleset_parser, struct genlmsghdr, nlf_p_empty, nla_p_ruleset); #undef _OUT static int pf_handle_get_rulesets(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct pfioc_ruleset attrs = { 0 }; struct nl_writer *nw = npt->nw; struct genlmsghdr *ghdr_new; int error; error = nl_parse_nlmsg(hdr, &ruleset_parser, npt, &attrs); if (error != 0) return (error); error = pf_ioctl_get_rulesets(&attrs); if (error != 0) return (error); if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr))) return (ENOMEM); ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr); ghdr_new->cmd = PFNL_CMD_GET_RULESETS; ghdr_new->version = 0; ghdr_new->reserved = 0; nlattr_add_u32(nw, PF_RS_NR, attrs.nr); if (!nlmsg_end(nw)) { nlmsg_abort(nw); return (ENOMEM); } return (0); } static int pf_handle_get_ruleset(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct pfioc_ruleset attrs = { 0 }; struct nl_writer *nw = npt->nw; struct genlmsghdr *ghdr_new; int error; error = nl_parse_nlmsg(hdr, 
&ruleset_parser, npt, &attrs); if (error) return (error); error = pf_ioctl_get_ruleset(&attrs); if (error != 0) return (error); if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr))) return (ENOMEM); ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr); ghdr_new->cmd = PFNL_CMD_GET_RULESET; ghdr_new->version = 0; ghdr_new->reserved = 0; nlattr_add_string(nw, PF_RS_NAME, attrs.name); if (!nlmsg_end(nw)) { nlmsg_abort(nw); return (ENOMEM); } return (0); } static bool nlattr_add_pf_threshold(struct nl_writer *nw, int attrtype, struct pf_threshold *t, int secs) { int off = nlattr_add_nested(nw, attrtype); int diff, conn_rate_count; /* Adjust the connection rate estimate. */ conn_rate_count = t->count; diff = secs - t->last; if (diff >= t->seconds) conn_rate_count = 0; else conn_rate_count -= t->count * diff / t->seconds; nlattr_add_u32(nw, PF_TH_LIMIT, t->limit); nlattr_add_u32(nw, PF_TH_SECONDS, t->seconds); nlattr_add_u32(nw, PF_TH_COUNT, conn_rate_count); nlattr_add_u32(nw, PF_TH_LAST, t->last); nlattr_set_len(nw, off); return (true); } static int pf_handle_get_srcnodes(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct nl_writer *nw = npt->nw; struct genlmsghdr *ghdr_new; struct pf_ksrc_node *n; struct pf_srchash *sh; int i; int secs; hdr->nlmsg_flags |= NLM_F_MULTI; for (i = 0, sh = V_pf_srchash; i <= V_pf_srchashmask; i++, sh++) { /* Avoid locking empty rows. 
*/ if (LIST_EMPTY(&sh->nodes)) continue; PF_HASHROW_LOCK(sh); secs = time_uptime; LIST_FOREACH(n, &sh->nodes, entry) { if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr))) { nlmsg_abort(nw); return (ENOMEM); } ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr); ghdr_new->cmd = PFNL_CMD_GET_SRCNODES; ghdr_new->version = 0; ghdr_new->reserved = 0; nlattr_add_in6_addr(nw, PF_SN_ADDR, &n->addr.v6); nlattr_add_in6_addr(nw, PF_SN_RADDR, &n->raddr.v6); nlattr_add_u32(nw, PF_SN_RULE_NR, n->rule->nr); nlattr_add_u64(nw, PF_SN_BYTES_IN, counter_u64_fetch(n->bytes[0])); nlattr_add_u64(nw, PF_SN_BYTES_OUT, counter_u64_fetch(n->bytes[1])); nlattr_add_u64(nw, PF_SN_PACKETS_IN, counter_u64_fetch(n->packets[0])); nlattr_add_u64(nw, PF_SN_PACKETS_OUT, counter_u64_fetch(n->packets[1])); nlattr_add_u32(nw, PF_SN_STATES, n->states); nlattr_add_u32(nw, PF_SN_CONNECTIONS, n->conn); nlattr_add_u8(nw, PF_SN_AF, n->af); nlattr_add_u8(nw, PF_SN_NAF, n->naf); nlattr_add_u8(nw, PF_SN_RULE_TYPE, n->ruletype); nlattr_add_u64(nw, PF_SN_CREATION, secs - n->creation); if (n->expire > secs) nlattr_add_u64(nw, PF_SN_EXPIRE, n->expire - secs); else nlattr_add_u64(nw, PF_SN_EXPIRE, 0); nlattr_add_pf_threshold(nw, PF_SN_CONNECTION_RATE, &n->conn_rate, secs); if (!nlmsg_end(nw)) { PF_HASHROW_UNLOCK(sh); nlmsg_abort(nw); return (ENOMEM); } } PF_HASHROW_UNLOCK(sh); } return (0); } #define _OUT(_field) offsetof(struct pfioc_table, _field) static const struct nlattr_parser nla_p_table[] = { { .type = PF_T_ANCHOR, .off = _OUT(pfrio_table.pfrt_anchor), .arg = (void *)MAXPATHLEN, .cb = nlattr_get_chara }, { .type = PF_T_NAME, .off = _OUT(pfrio_table.pfrt_name), .arg = (void *)PF_TABLE_NAME_SIZE, .cb = nlattr_get_chara }, { .type = PF_T_TABLE_FLAGS, .off = _OUT(pfrio_table.pfrt_flags), .cb = nlattr_get_uint32 }, { .type = PF_T_FLAGS, .off = _OUT(pfrio_flags), .cb = nlattr_get_uint32 }, }; static const struct nlfield_parser nlf_p_table[] = {}; NL_DECLARE_PARSER(table_parser, struct genlmsghdr, nlf_p_table, 
nla_p_table); #undef _OUT static int pf_handle_clear_tables(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct pfioc_table attrs = { 0 }; struct nl_writer *nw = npt->nw; struct genlmsghdr *ghdr_new; int ndel = 0; int error; error = nl_parse_nlmsg(hdr, &table_parser, npt, &attrs); if (error != 0) return (error); PF_RULES_WLOCK(); error = pfr_clr_tables(&attrs.pfrio_table, &ndel, attrs.pfrio_flags | PFR_FLAG_USERIOCTL); PF_RULES_WUNLOCK(); if (error != 0) return (error); if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr))) return (ENOMEM); ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr); ghdr_new->cmd = PFNL_CMD_CLEAR_TABLES; ghdr_new->version = 0; ghdr_new->reserved = 0; nlattr_add_u32(nw, PF_T_NBR_DELETED, ndel); if (!nlmsg_end(nw)) { nlmsg_abort(nw); return (ENOMEM); } return (0); } static const struct nlhdr_parser *all_parsers[] = { &state_parser, &addrule_parser, &getrules_parser, &clear_states_parser, &set_statusif_parser, &natlook_parser, &set_debug_parser, &set_timeout_parser, &set_limit_parser, &pool_addr_parser, &add_addr_parser, &ruleset_parser, &table_parser, }; -static int family_id; +static uint16_t family_id; static const struct genl_cmd pf_cmds[] = { { .cmd_num = PFNL_CMD_GETSTATES, .cmd_name = "GETSTATES", .cmd_cb = pf_handle_getstates, .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_NETINET_PF, }, { .cmd_num = PFNL_CMD_GETCREATORS, .cmd_name = "GETCREATORS", .cmd_cb = pf_handle_getcreators, .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_NETINET_PF, }, { .cmd_num = PFNL_CMD_START, .cmd_name = "START", .cmd_cb = pf_handle_start, .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_NETINET_PF, }, { .cmd_num = PFNL_CMD_STOP, .cmd_name = "STOP", .cmd_cb = pf_handle_stop, .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_NETINET_PF, }, { .cmd_num = PFNL_CMD_ADDRULE, .cmd_name = "ADDRULE", .cmd_cb = pf_handle_addrule, .cmd_flags 
= GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_NETINET_PF, }, { .cmd_num = PFNL_CMD_GETRULES, .cmd_name = "GETRULES", .cmd_cb = pf_handle_getrules, .cmd_flags = GENL_CMD_CAP_DUMP | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_NETINET_PF, }, { .cmd_num = PFNL_CMD_GETRULE, .cmd_name = "GETRULE", .cmd_cb = pf_handle_getrule, .cmd_flags = GENL_CMD_CAP_DUMP | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_NETINET_PF, }, { .cmd_num = PFNL_CMD_CLRSTATES, .cmd_name = "CLRSTATES", .cmd_cb = pf_handle_clear_states, .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_NETINET_PF, }, { .cmd_num = PFNL_CMD_KILLSTATES, .cmd_name = "KILLSTATES", .cmd_cb = pf_handle_kill_states, .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_NETINET_PF, }, { .cmd_num = PFNL_CMD_SET_STATUSIF, .cmd_name = "SETSTATUSIF", .cmd_cb = pf_handle_set_statusif, .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_NETINET_PF, }, { .cmd_num = PFNL_CMD_GET_STATUS, .cmd_name = "GETSTATUS", .cmd_cb = pf_handle_get_status, .cmd_flags = GENL_CMD_CAP_DUMP | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_NETINET_PF, }, { .cmd_num = PFNL_CMD_CLEAR_STATUS, .cmd_name = "CLEARSTATUS", .cmd_cb = pf_handle_clear_status, .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_NETINET_PF, }, { .cmd_num = PFNL_CMD_NATLOOK, .cmd_name = "NATLOOK", .cmd_cb = pf_handle_natlook, .cmd_flags = GENL_CMD_CAP_DUMP | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_NETINET_PF, }, { .cmd_num = PFNL_CMD_SET_DEBUG, .cmd_name = "SET_DEBUG", .cmd_cb = pf_handle_set_debug, .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_NETINET_PF, }, { .cmd_num = PFNL_CMD_SET_TIMEOUT, .cmd_name = "SET_TIMEOUT", .cmd_cb = pf_handle_set_timeout, .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_NETINET_PF, }, { .cmd_num = PFNL_CMD_GET_TIMEOUT, .cmd_name = "GET_TIMEOUT", .cmd_cb = pf_handle_get_timeout, .cmd_flags = 
GENL_CMD_CAP_DUMP | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_NETINET_PF, }, { .cmd_num = PFNL_CMD_SET_LIMIT, .cmd_name = "SET_LIMIT", .cmd_cb = pf_handle_set_limit, .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_NETINET_PF, }, { .cmd_num = PFNL_CMD_GET_LIMIT, .cmd_name = "GET_LIMIT", .cmd_cb = pf_handle_get_limit, .cmd_flags = GENL_CMD_CAP_DUMP | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_NETINET_PF, }, { .cmd_num = PFNL_CMD_BEGIN_ADDRS, .cmd_name = "BEGIN_ADDRS", .cmd_cb = pf_handle_begin_addrs, .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_NETINET_PF, }, { .cmd_num = PFNL_CMD_ADD_ADDR, .cmd_name = "ADD_ADDR", .cmd_cb = pf_handle_add_addr, .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_NETINET_PF, }, { .cmd_num = PFNL_CMD_GET_ADDRS, .cmd_name = "GET_ADDRS", .cmd_cb = pf_handle_get_addrs, .cmd_flags = GENL_CMD_CAP_DUMP | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_NETINET_PF, }, { .cmd_num = PFNL_CMD_GET_ADDR, .cmd_name = "GET_ADDRS", .cmd_cb = pf_handle_get_addr, .cmd_flags = GENL_CMD_CAP_DUMP | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_NETINET_PF, }, { .cmd_num = PFNL_CMD_GET_RULESETS, .cmd_name = "GET_RULESETS", .cmd_cb = pf_handle_get_rulesets, .cmd_flags = GENL_CMD_CAP_DUMP | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_NETINET_PF, }, { .cmd_num = PFNL_CMD_GET_RULESET, .cmd_name = "GET_RULESET", .cmd_cb = pf_handle_get_ruleset, .cmd_flags = GENL_CMD_CAP_DUMP | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_NETINET_PF, }, { .cmd_num = PFNL_CMD_GET_SRCNODES, .cmd_name = "GET_SRCNODES", .cmd_cb = pf_handle_get_srcnodes, .cmd_flags = GENL_CMD_CAP_DUMP | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_NETINET_PF, }, { .cmd_num = PFNL_CMD_CLEAR_TABLES, .cmd_name = "CLEAR_TABLES", .cmd_cb = pf_handle_clear_tables, .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_NETINET_PF, }, }; void pf_nl_register(void) { NL_VERIFY_PARSERS(all_parsers); family_id = genl_register_family(PFNL_FAMILY_NAME, 0, 2, 
PFNL_CMD_MAX); - genl_register_cmds(PFNL_FAMILY_NAME, pf_cmds, nitems(pf_cmds)); + genl_register_cmds(family_id, pf_cmds, nitems(pf_cmds)); } void pf_nl_unregister(void) { - genl_unregister_family(PFNL_FAMILY_NAME); + genl_unregister_family(family_id); } diff --git a/sys/netpfil/pf/pflow.c b/sys/netpfil/pf/pflow.c index 8741d55b622c..ae9d162bb6bf 100644 --- a/sys/netpfil/pf/pflow.c +++ b/sys/netpfil/pf/pflow.c @@ -1,1842 +1,1842 @@ /* $OpenBSD: if_pflow.c,v 1.100 2023/11/09 08:53:20 mvs Exp $ */ /* * Copyright (c) 2023 Rubicon Communications, LLC (Netgate) * Copyright (c) 2011 Florian Obser * Copyright (c) 2011 Sebastian Benoit * Copyright (c) 2008 Henning Brauer * Copyright (c) 2008 Joerg Goltermann * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER IN * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "net/if_var.h" #define PFLOW_MINMTU \ (sizeof(struct pflow_header) + sizeof(struct pflow_flow)) #ifdef PFLOWDEBUG #define DPRINTF(x) do { printf x ; } while (0) #else #define DPRINTF(x) #endif enum pflow_family_t { PFLOW_INET, PFLOW_INET6, PFLOW_NAT4, }; static void pflow_output_process(void *); static int pflow_create(int); static int pflow_destroy(int, bool); static int pflow_calc_mtu(struct pflow_softc *, int, int); static void pflow_setmtu(struct pflow_softc *, int); static int pflowvalidsockaddr(const struct sockaddr *, int); static struct mbuf *pflow_get_mbuf(struct pflow_softc *, u_int16_t); static void pflow_flush(struct pflow_softc *); static int pflow_sendout_v5(struct pflow_softc *); static int pflow_sendout_ipfix(struct pflow_softc *, enum pflow_family_t); static int pflow_sendout_ipfix_tmpl(struct pflow_softc *); static int pflow_sendout_mbuf(struct pflow_softc *, struct mbuf *); static int sysctl_pflowstats(SYSCTL_HANDLER_ARGS); static void pflow_timeout(void *); static void pflow_timeout6(void *); static void pflow_timeout_tmpl(void *); static void pflow_timeout_nat4(void *); static void copy_flow_data(struct pflow_flow *, struct pflow_flow *, const struct pf_kstate *, struct pf_state_key *, int, int); static void copy_flow_ipfix_4_data(struct pflow_ipfix_flow4 *, struct pflow_ipfix_flow4 *, const struct pf_kstate *, struct pf_state_key *, struct pflow_softc *, int, int); static void copy_flow_ipfix_6_data(struct pflow_ipfix_flow6 *, struct pflow_ipfix_flow6 *, const struct pf_kstate *, struct pf_state_key *, struct pflow_softc *, int, int); static int pflow_pack_flow(const struct pf_kstate *, struct pf_state_key *, 
struct pflow_softc *); static int pflow_pack_flow_ipfix(const struct pf_kstate *, struct pf_state_key *, struct pflow_softc *); static void export_pflow(const struct pf_kstate *); static int export_pflow_if(const struct pf_kstate*, struct pf_state_key *, struct pflow_softc *); static int copy_flow_to_m(struct pflow_flow *flow, struct pflow_softc *sc); static int copy_flow_ipfix_4_to_m(struct pflow_ipfix_flow4 *flow, struct pflow_softc *sc); static int copy_flow_ipfix_6_to_m(struct pflow_ipfix_flow6 *flow, struct pflow_softc *sc); static int copy_nat_ipfix_4_to_m(struct pflow_ipfix_nat4 *, const struct pf_kstate *, struct pflow_softc *, uint8_t, uint64_t); static const char pflowname[] = "pflow"; enum pflowstat_counters { pflow_flows, pflow_packets, pflow_onomem, pflow_oerrors, pflow_ncounters, }; struct pflowstats_ctr { counter_u64_t c[pflow_ncounters]; }; /** * Locking concept * * The list of pflow devices (V_pflowif_list) is managed through epoch. * It is safe to read the list without locking (while in NET_EPOCH). * There may only be one simultaneous modifier, hence we need V_pflow_list_mtx * on every add/delete. * * Each pflow interface protects its own data with the sc_lock mutex. * * We do not require any pf locks, and in fact expect to be called without * hashrow locks held. 
**/ VNET_DEFINE(struct unrhdr *, pflow_unr); #define V_pflow_unr VNET(pflow_unr) VNET_DEFINE(CK_LIST_HEAD(, pflow_softc), pflowif_list); #define V_pflowif_list VNET(pflowif_list) VNET_DEFINE(struct mtx, pflowif_list_mtx); #define V_pflowif_list_mtx VNET(pflowif_list_mtx) VNET_DEFINE(struct pflowstats_ctr, pflowstat); #define V_pflowstats VNET(pflowstat) #define PFLOW_LOCK(_sc) mtx_lock(&(_sc)->sc_lock) #define PFLOW_UNLOCK(_sc) mtx_unlock(&(_sc)->sc_lock) #define PFLOW_ASSERT(_sc) mtx_assert(&(_sc)->sc_lock, MA_OWNED) SYSCTL_NODE(_net, OID_AUTO, pflow, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "PFLOW"); SYSCTL_PROC(_net_pflow, OID_AUTO, stats, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_pflowstats, "S,pflowstats", "PFLOW statistics (struct pflowstats, net/if_pflow.h)"); static inline void pflowstat_inc(enum pflowstat_counters c) { counter_u64_add(V_pflowstats.c[c], 1); } static void vnet_pflowattach(void) { CK_LIST_INIT(&V_pflowif_list); mtx_init(&V_pflowif_list_mtx, "pflow interface list mtx", NULL, MTX_DEF); V_pflow_unr = new_unrhdr(0, PFLOW_MAX_ENTRIES - 1, &V_pflowif_list_mtx); for (int i = 0; i < pflow_ncounters; i++) V_pflowstats.c[i] = counter_u64_alloc(M_WAITOK); } VNET_SYSINIT(vnet_pflowattach, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY, vnet_pflowattach, NULL); static int pflow_jail_remove(void *obj, void *data __unused) { #ifdef VIMAGE const struct prison *pr = obj; #endif struct pflow_softc *sc; CURVNET_SET(pr->pr_vnet); CK_LIST_FOREACH(sc, &V_pflowif_list, sc_next) { pflow_destroy(sc->sc_id, false); } CURVNET_RESTORE(); return (0); } static void vnet_pflowdetach(void) { /* Should have been done by pflow_jail_remove() */ MPASS(CK_LIST_EMPTY(&V_pflowif_list)); delete_unrhdr(V_pflow_unr); mtx_destroy(&V_pflowif_list_mtx); for (int i = 0; i < pflow_ncounters; i++) counter_u64_free(V_pflowstats.c[i]); } VNET_SYSUNINIT(vnet_pflowdetach, SI_SUB_PROTO_FIREWALL, SI_ORDER_FOURTH, vnet_pflowdetach, NULL); static void vnet_pflow_finalise(void) { /* * Ensure we've 
freed all interfaces, and do not have pending * epoch cleanup calls. */ NET_EPOCH_DRAIN_CALLBACKS(); } VNET_SYSUNINIT(vnet_pflow_finalise, SI_SUB_PROTO_FIREWALL, SI_ORDER_THIRD, vnet_pflow_finalise, NULL); static void pflow_output_process(void *arg) { struct mbufq ml; struct pflow_softc *sc = arg; struct mbuf *m; mbufq_init(&ml, 0); PFLOW_LOCK(sc); mbufq_concat(&ml, &sc->sc_outputqueue); PFLOW_UNLOCK(sc); CURVNET_SET(sc->sc_vnet); while ((m = mbufq_dequeue(&ml)) != NULL) { pflow_sendout_mbuf(sc, m); } CURVNET_RESTORE(); } static int pflow_create(int unit) { struct pflow_softc *pflowif; int error; pflowif = malloc(sizeof(*pflowif), M_DEVBUF, M_WAITOK|M_ZERO); mtx_init(&pflowif->sc_lock, "pflowlk", NULL, MTX_DEF); pflowif->sc_version = PFLOW_PROTO_DEFAULT; pflowif->sc_observation_dom = PFLOW_ENGINE_TYPE; /* ipfix template init */ bzero(&pflowif->sc_tmpl_ipfix,sizeof(pflowif->sc_tmpl_ipfix)); pflowif->sc_tmpl_ipfix.set_header.set_id = htons(PFLOW_IPFIX_TMPL_SET_ID); pflowif->sc_tmpl_ipfix.set_header.set_length = htons(sizeof(struct pflow_ipfix_tmpl)); /* ipfix IPv4 template */ pflowif->sc_tmpl_ipfix.ipv4_tmpl.h.tmpl_id = htons(PFLOW_IPFIX_TMPL_IPV4_ID); pflowif->sc_tmpl_ipfix.ipv4_tmpl.h.field_count = htons(PFLOW_IPFIX_TMPL_IPV4_FIELD_COUNT); pflowif->sc_tmpl_ipfix.ipv4_tmpl.src_ip.field_id = htons(PFIX_IE_sourceIPv4Address); pflowif->sc_tmpl_ipfix.ipv4_tmpl.src_ip.len = htons(4); pflowif->sc_tmpl_ipfix.ipv4_tmpl.dest_ip.field_id = htons(PFIX_IE_destinationIPv4Address); pflowif->sc_tmpl_ipfix.ipv4_tmpl.dest_ip.len = htons(4); pflowif->sc_tmpl_ipfix.ipv4_tmpl.if_index_in.field_id = htons(PFIX_IE_ingressInterface); pflowif->sc_tmpl_ipfix.ipv4_tmpl.if_index_in.len = htons(4); pflowif->sc_tmpl_ipfix.ipv4_tmpl.if_index_out.field_id = htons(PFIX_IE_egressInterface); pflowif->sc_tmpl_ipfix.ipv4_tmpl.if_index_out.len = htons(4); pflowif->sc_tmpl_ipfix.ipv4_tmpl.packets.field_id = htons(PFIX_IE_packetDeltaCount); pflowif->sc_tmpl_ipfix.ipv4_tmpl.packets.len = htons(8); 
pflowif->sc_tmpl_ipfix.ipv4_tmpl.octets.field_id = htons(PFIX_IE_octetDeltaCount); pflowif->sc_tmpl_ipfix.ipv4_tmpl.octets.len = htons(8); pflowif->sc_tmpl_ipfix.ipv4_tmpl.start.field_id = htons(PFIX_IE_flowStartMilliseconds); pflowif->sc_tmpl_ipfix.ipv4_tmpl.start.len = htons(8); pflowif->sc_tmpl_ipfix.ipv4_tmpl.finish.field_id = htons(PFIX_IE_flowEndMilliseconds); pflowif->sc_tmpl_ipfix.ipv4_tmpl.finish.len = htons(8); pflowif->sc_tmpl_ipfix.ipv4_tmpl.src_port.field_id = htons(PFIX_IE_sourceTransportPort); pflowif->sc_tmpl_ipfix.ipv4_tmpl.src_port.len = htons(2); pflowif->sc_tmpl_ipfix.ipv4_tmpl.dest_port.field_id = htons(PFIX_IE_destinationTransportPort); pflowif->sc_tmpl_ipfix.ipv4_tmpl.dest_port.len = htons(2); pflowif->sc_tmpl_ipfix.ipv4_tmpl.tos.field_id = htons(PFIX_IE_ipClassOfService); pflowif->sc_tmpl_ipfix.ipv4_tmpl.tos.len = htons(1); pflowif->sc_tmpl_ipfix.ipv4_tmpl.protocol.field_id = htons(PFIX_IE_protocolIdentifier); pflowif->sc_tmpl_ipfix.ipv4_tmpl.protocol.len = htons(1); /* ipfix IPv6 template */ pflowif->sc_tmpl_ipfix.ipv6_tmpl.h.tmpl_id = htons(PFLOW_IPFIX_TMPL_IPV6_ID); pflowif->sc_tmpl_ipfix.ipv6_tmpl.h.field_count = htons(PFLOW_IPFIX_TMPL_IPV6_FIELD_COUNT); pflowif->sc_tmpl_ipfix.ipv6_tmpl.src_ip.field_id = htons(PFIX_IE_sourceIPv6Address); pflowif->sc_tmpl_ipfix.ipv6_tmpl.src_ip.len = htons(16); pflowif->sc_tmpl_ipfix.ipv6_tmpl.dest_ip.field_id = htons(PFIX_IE_destinationIPv6Address); pflowif->sc_tmpl_ipfix.ipv6_tmpl.dest_ip.len = htons(16); pflowif->sc_tmpl_ipfix.ipv6_tmpl.if_index_in.field_id = htons(PFIX_IE_ingressInterface); pflowif->sc_tmpl_ipfix.ipv6_tmpl.if_index_in.len = htons(4); pflowif->sc_tmpl_ipfix.ipv6_tmpl.if_index_out.field_id = htons(PFIX_IE_egressInterface); pflowif->sc_tmpl_ipfix.ipv6_tmpl.if_index_out.len = htons(4); pflowif->sc_tmpl_ipfix.ipv6_tmpl.packets.field_id = htons(PFIX_IE_packetDeltaCount); pflowif->sc_tmpl_ipfix.ipv6_tmpl.packets.len = htons(8); pflowif->sc_tmpl_ipfix.ipv6_tmpl.octets.field_id = 
htons(PFIX_IE_octetDeltaCount); pflowif->sc_tmpl_ipfix.ipv6_tmpl.octets.len = htons(8); pflowif->sc_tmpl_ipfix.ipv6_tmpl.start.field_id = htons(PFIX_IE_flowStartMilliseconds); pflowif->sc_tmpl_ipfix.ipv6_tmpl.start.len = htons(8); pflowif->sc_tmpl_ipfix.ipv6_tmpl.finish.field_id = htons(PFIX_IE_flowEndMilliseconds); pflowif->sc_tmpl_ipfix.ipv6_tmpl.finish.len = htons(8); pflowif->sc_tmpl_ipfix.ipv6_tmpl.src_port.field_id = htons(PFIX_IE_sourceTransportPort); pflowif->sc_tmpl_ipfix.ipv6_tmpl.src_port.len = htons(2); pflowif->sc_tmpl_ipfix.ipv6_tmpl.dest_port.field_id = htons(PFIX_IE_destinationTransportPort); pflowif->sc_tmpl_ipfix.ipv6_tmpl.dest_port.len = htons(2); pflowif->sc_tmpl_ipfix.ipv6_tmpl.tos.field_id = htons(PFIX_IE_ipClassOfService); pflowif->sc_tmpl_ipfix.ipv6_tmpl.tos.len = htons(1); pflowif->sc_tmpl_ipfix.ipv6_tmpl.protocol.field_id = htons(PFIX_IE_protocolIdentifier); pflowif->sc_tmpl_ipfix.ipv6_tmpl.protocol.len = htons(1); /* NAT44 create template */ pflowif->sc_tmpl_ipfix.nat44_tmpl.h.tmpl_id = htons(PFLOW_IPFIX_TMPL_NAT44_ID); pflowif->sc_tmpl_ipfix.nat44_tmpl.h.field_count = htons(PFLOW_IPFIX_TMPL_NAT44_FIELD_COUNT); pflowif->sc_tmpl_ipfix.nat44_tmpl.timestamp.field_id = htons(PFIX_IE_timeStamp); pflowif->sc_tmpl_ipfix.nat44_tmpl.timestamp.len = htons(8); pflowif->sc_tmpl_ipfix.nat44_tmpl.nat_event.field_id = htons(PFIX_IE_natEvent); pflowif->sc_tmpl_ipfix.nat44_tmpl.nat_event.len = htons(1); pflowif->sc_tmpl_ipfix.nat44_tmpl.protocol.field_id = htons(PFIX_IE_protocolIdentifier); pflowif->sc_tmpl_ipfix.nat44_tmpl.protocol.len = htons(1); pflowif->sc_tmpl_ipfix.nat44_tmpl.src_ip.field_id = htons(PFIX_IE_sourceIPv4Address); pflowif->sc_tmpl_ipfix.nat44_tmpl.src_ip.len = htons(4); pflowif->sc_tmpl_ipfix.nat44_tmpl.src_port.field_id = htons(PFIX_IE_sourceTransportPort); pflowif->sc_tmpl_ipfix.nat44_tmpl.src_port.len = htons(2); pflowif->sc_tmpl_ipfix.nat44_tmpl.postnat_src_ip.field_id = htons(PFIX_IE_postNATSourceIPv4Address); 
pflowif->sc_tmpl_ipfix.nat44_tmpl.postnat_src_ip.len = htons(4); pflowif->sc_tmpl_ipfix.nat44_tmpl.postnat_src_port.field_id = htons(PFIX_IE_postNAPTSourceTransportPort); pflowif->sc_tmpl_ipfix.nat44_tmpl.postnat_src_port.len = htons(2); pflowif->sc_tmpl_ipfix.nat44_tmpl.dst_ip.field_id = htons(PFIX_IE_destinationIPv4Address); pflowif->sc_tmpl_ipfix.nat44_tmpl.dst_ip.len = htons(4); pflowif->sc_tmpl_ipfix.nat44_tmpl.dst_port.field_id = htons(PFIX_IE_destinationTransportPort); pflowif->sc_tmpl_ipfix.nat44_tmpl.dst_port.len = htons(2); pflowif->sc_tmpl_ipfix.nat44_tmpl.postnat_dst_ip.field_id = htons(PFIX_IE_postNATDestinationIPv4Address); pflowif->sc_tmpl_ipfix.nat44_tmpl.postnat_dst_ip.len = htons(4); pflowif->sc_tmpl_ipfix.nat44_tmpl.postnat_dst_port.field_id = htons(PFIX_IE_postNAPTDestinationTransportPort); pflowif->sc_tmpl_ipfix.nat44_tmpl.postnat_dst_port.len = htons(2); pflowif->sc_id = unit; pflowif->sc_vnet = curvnet; mbufq_init(&pflowif->sc_outputqueue, 8192); pflow_setmtu(pflowif, ETHERMTU); callout_init_mtx(&pflowif->sc_tmo, &pflowif->sc_lock, 0); callout_init_mtx(&pflowif->sc_tmo6, &pflowif->sc_lock, 0); callout_init_mtx(&pflowif->sc_tmo_nat4, &pflowif->sc_lock, 0); callout_init_mtx(&pflowif->sc_tmo_tmpl, &pflowif->sc_lock, 0); error = swi_add(&pflowif->sc_swi_ie, pflowname, pflow_output_process, pflowif, SWI_NET, INTR_MPSAFE, &pflowif->sc_swi_cookie); if (error) { free(pflowif, M_DEVBUF); return (error); } /* Insert into list of pflows */ mtx_lock(&V_pflowif_list_mtx); CK_LIST_INSERT_HEAD(&V_pflowif_list, pflowif, sc_next); mtx_unlock(&V_pflowif_list_mtx); V_pflow_export_state_ptr = export_pflow; return (0); } static void pflow_free_cb(struct epoch_context *ctx) { struct pflow_softc *sc; sc = __containerof(ctx, struct pflow_softc, sc_epoch_ctx); free(sc, M_DEVBUF); } static int pflow_destroy(int unit, bool drain) { struct pflow_softc *sc; int error __diagused; mtx_lock(&V_pflowif_list_mtx); CK_LIST_FOREACH(sc, &V_pflowif_list, sc_next) { if (sc->sc_id 
== unit)
			break;
	}
	if (sc == NULL) {
		/* The loop ran off the end: no instance carries this id. */
		mtx_unlock(&V_pflowif_list_mtx);
		return (ENOENT);
	}
	CK_LIST_REMOVE(sc, sc_next);
	/* Once the list is empty, clear the export hook. */
	if (CK_LIST_EMPTY(&V_pflowif_list))
		V_pflow_export_state_ptr = NULL;
	mtx_unlock(&V_pflowif_list_mtx);

	sc->sc_dying = 1;

	if (drain) {
		/* Let's be sure no one is using this interface any more. */
		NET_EPOCH_DRAIN_CALLBACKS();
	}

	/* Remove the output software interrupt and its interrupt event. */
	error = swi_remove(sc->sc_swi_cookie);
	MPASS(error == 0);
	error = intr_event_destroy(sc->sc_swi_ie);
	MPASS(error == 0);

	/*
	 * Drain all four callouts before freeing the per-family buffers that
	 * the timeout handlers would otherwise send out.
	 */
	callout_drain(&sc->sc_tmo);
	callout_drain(&sc->sc_tmo6);
	callout_drain(&sc->sc_tmo_nat4);
	callout_drain(&sc->sc_tmo_tmpl);

	/* Drop any partially filled export buffers. */
	m_freem(sc->sc_mbuf);
	m_freem(sc->sc_mbuf6);
	m_freem(sc->sc_mbuf_nat4);

	PFLOW_LOCK(sc);
	/* Discard packets queued for output but not yet sent. */
	mbufq_drain(&sc->sc_outputqueue);
	if (sc->so != NULL) {
		soclose(sc->so);
		sc->so = NULL;
	}
	if (sc->sc_flowdst != NULL)
		free(sc->sc_flowdst, M_DEVBUF);
	if (sc->sc_flowsrc != NULL)
		free(sc->sc_flowsrc, M_DEVBUF);
	PFLOW_UNLOCK(sc);

	mtx_destroy(&sc->sc_lock);

	/* Give the unit number back to the allocator. */
	free_unr(V_pflow_unr, unit);

	/* The softc itself is freed later, from pflow_free_cb(). */
	NET_EPOCH_CALL(pflow_free_cb, &sc->sc_epoch_ctx);

	return (0);
}

/*
 * Check whether sa is usable as a flow endpoint address.
 *
 * Returns non-zero for an AF_INET or AF_INET6 sockaddr whose address is not
 * the unspecified address and, unless ignore_port is non-zero, whose port is
 * not 0.  Returns 0 for a NULL pointer and for any other address family.
 */
static int
pflowvalidsockaddr(const struct sockaddr *sa, int ignore_port)
{
	const struct sockaddr_in6	*sin6;
	const struct sockaddr_in	*sin;

	if (sa == NULL)
		return (0);
	switch(sa->sa_family) {
	case AF_INET:
		sin = (const struct sockaddr_in *)sa;
		return (sin->sin_addr.s_addr != INADDR_ANY &&
		    (ignore_port || sin->sin_port != 0));
	case AF_INET6:
		sin6 = (const struct sockaddr_in6 *)sa;
		return (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) &&
		    (ignore_port || sin6->sin6_port != 0));
	default:
		return (0);
	}
}

/*
 * Derive the per-packet record limits (sc_maxcount4, sc_maxcount6 and
 * sc_maxcount_nat4) from mtu and the protocol header size hdrsz, capping each
 * at PFLOW_MAXFLOWS.
 *
 * Returns hdrsz plus sizeof(struct udpiphdr) plus the smallest of the three
 * resulting payload sizes.
 *
 * NOTE(review): the subtraction mixes int and size_t operands, so an mtu
 * smaller than the headers would presumably wrap rather than go negative;
 * confirm callers never pass such a value.
 */
int
pflow_calc_mtu(struct pflow_softc *sc, int mtu, int hdrsz)
{
	size_t min;

	sc->sc_maxcount4 = (mtu - hdrsz -
	    sizeof(struct udpiphdr)) / sizeof(struct pflow_ipfix_flow4);
	sc->sc_maxcount6 = (mtu - hdrsz -
	    sizeof(struct udpiphdr)) / sizeof(struct pflow_ipfix_flow6);
	sc->sc_maxcount_nat4 = (mtu - hdrsz -
	    sizeof(struct udpiphdr)) / sizeof(struct pflow_ipfix_nat4);
	if (sc->sc_maxcount4 > PFLOW_MAXFLOWS)
		sc->sc_maxcount4 = PFLOW_MAXFLOWS;
	if (sc->sc_maxcount6 >
PFLOW_MAXFLOWS) sc->sc_maxcount6 = PFLOW_MAXFLOWS; if (sc->sc_maxcount_nat4 > PFLOW_MAXFLOWS) sc->sc_maxcount_nat4 = PFLOW_MAXFLOWS; min = MIN(sc->sc_maxcount4 * sizeof(struct pflow_ipfix_flow4), sc->sc_maxcount6 * sizeof(struct pflow_ipfix_flow6)); min = MIN(min, sc->sc_maxcount_nat4 * sizeof(struct pflow_ipfix_nat4)); return (hdrsz + sizeof(struct udpiphdr) + min); } static void pflow_setmtu(struct pflow_softc *sc, int mtu_req) { int mtu; mtu = mtu_req; switch (sc->sc_version) { case PFLOW_PROTO_5: sc->sc_maxcount = (mtu - sizeof(struct pflow_header) - sizeof(struct udpiphdr)) / sizeof(struct pflow_flow); if (sc->sc_maxcount > PFLOW_MAXFLOWS) sc->sc_maxcount = PFLOW_MAXFLOWS; break; case PFLOW_PROTO_10: pflow_calc_mtu(sc, mtu, sizeof(struct pflow_v10_header)); break; default: /* NOTREACHED */ break; } } static struct mbuf * pflow_get_mbuf(struct pflow_softc *sc, u_int16_t set_id) { struct pflow_set_header set_hdr; struct pflow_header h; struct mbuf *m; MGETHDR(m, M_NOWAIT, MT_DATA); if (m == NULL) { pflowstat_inc(pflow_onomem); return (NULL); } MCLGET(m, M_NOWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); pflowstat_inc(pflow_onomem); return (NULL); } m->m_len = m->m_pkthdr.len = 0; if (sc == NULL) /* get only a new empty mbuf */ return (m); switch (sc->sc_version) { case PFLOW_PROTO_5: /* populate pflow_header */ h.reserved1 = 0; h.reserved2 = 0; h.count = 0; h.version = htons(PFLOW_PROTO_5); h.flow_sequence = htonl(sc->sc_gcounter); h.engine_type = PFLOW_ENGINE_TYPE; h.engine_id = PFLOW_ENGINE_ID; m_copyback(m, 0, PFLOW_HDRLEN, (caddr_t)&h); sc->sc_count = 0; callout_reset(&sc->sc_tmo, PFLOW_TIMEOUT * hz, pflow_timeout, sc); break; case PFLOW_PROTO_10: /* populate pflow_set_header */ set_hdr.set_length = 0; set_hdr.set_id = htons(set_id); m_copyback(m, 0, PFLOW_SET_HDRLEN, (caddr_t)&set_hdr); break; default: /* NOTREACHED */ break; } return (m); } static void copy_flow_data(struct pflow_flow *flow1, struct pflow_flow *flow2, const struct pf_kstate *st, struct 
pf_state_key *sk, int src, int dst) { flow1->src_ip = flow2->dest_ip = sk->addr[src].v4.s_addr; flow1->src_port = flow2->dest_port = sk->port[src]; flow1->dest_ip = flow2->src_ip = sk->addr[dst].v4.s_addr; flow1->dest_port = flow2->src_port = sk->port[dst]; flow1->dest_as = flow2->src_as = flow1->src_as = flow2->dest_as = 0; flow1->if_index_in = htons(st->if_index_in); flow1->if_index_out = htons(st->if_index_out); flow2->if_index_in = htons(st->if_index_out); flow2->if_index_out = htons(st->if_index_in); flow1->dest_mask = flow2->src_mask = flow1->src_mask = flow2->dest_mask = 0; flow1->flow_packets = htonl(st->packets[0]); flow2->flow_packets = htonl(st->packets[1]); flow1->flow_octets = htonl(st->bytes[0]); flow2->flow_octets = htonl(st->bytes[1]); /* * Pretend the flow was created or expired when the machine came up * when creation is in the future of the last time a package was seen * or was created / expired before this machine came up due to pfsync. */ flow1->flow_start = flow2->flow_start = st->creation < 0 || st->creation > st->expire ? htonl(0) : htonl(st->creation); flow1->flow_finish = flow2->flow_finish = st->expire < 0 ? 
htonl(0) : htonl(st->expire); flow1->tcp_flags = flow2->tcp_flags = 0; flow1->protocol = flow2->protocol = sk->proto; flow1->tos = flow2->tos = st->rule->tos; } static void copy_flow_ipfix_4_data(struct pflow_ipfix_flow4 *flow1, struct pflow_ipfix_flow4 *flow2, const struct pf_kstate *st, struct pf_state_key *sk, struct pflow_softc *sc, int src, int dst) { flow1->src_ip = flow2->dest_ip = sk->addr[src].v4.s_addr; flow1->src_port = flow2->dest_port = sk->port[src]; flow1->dest_ip = flow2->src_ip = sk->addr[dst].v4.s_addr; flow1->dest_port = flow2->src_port = sk->port[dst]; flow1->if_index_in = htonl(st->if_index_in); flow1->if_index_out = htonl(st->if_index_out); flow2->if_index_in = htonl(st->if_index_out); flow2->if_index_out = htonl(st->if_index_in); flow1->flow_packets = htobe64(st->packets[0]); flow2->flow_packets = htobe64(st->packets[1]); flow1->flow_octets = htobe64(st->bytes[0]); flow2->flow_octets = htobe64(st->bytes[1]); /* * Pretend the flow was created when the machine came up when creation * is in the future of the last time a package was seen due to pfsync. 
*/ if (st->creation > st->expire) flow1->flow_start = flow2->flow_start = htobe64((time_second - time_uptime)*1000); else flow1->flow_start = flow2->flow_start = htobe64((pf_get_time() - (pf_get_uptime() - st->creation))); flow1->flow_finish = flow2->flow_finish = htobe64((pf_get_time() - (pf_get_uptime() - st->expire))); flow1->protocol = flow2->protocol = sk->proto; flow1->tos = flow2->tos = st->rule->tos; } static void copy_flow_ipfix_6_data(struct pflow_ipfix_flow6 *flow1, struct pflow_ipfix_flow6 *flow2, const struct pf_kstate *st, struct pf_state_key *sk, struct pflow_softc *sc, int src, int dst) { bcopy(&sk->addr[src].v6, &flow1->src_ip, sizeof(flow1->src_ip)); bcopy(&sk->addr[src].v6, &flow2->dest_ip, sizeof(flow2->dest_ip)); flow1->src_port = flow2->dest_port = sk->port[src]; bcopy(&sk->addr[dst].v6, &flow1->dest_ip, sizeof(flow1->dest_ip)); bcopy(&sk->addr[dst].v6, &flow2->src_ip, sizeof(flow2->src_ip)); flow1->dest_port = flow2->src_port = sk->port[dst]; flow1->if_index_in = htonl(st->if_index_in); flow1->if_index_out = htonl(st->if_index_out); flow2->if_index_in = htonl(st->if_index_out); flow2->if_index_out = htonl(st->if_index_in); flow1->flow_packets = htobe64(st->packets[0]); flow2->flow_packets = htobe64(st->packets[1]); flow1->flow_octets = htobe64(st->bytes[0]); flow2->flow_octets = htobe64(st->bytes[1]); /* * Pretend the flow was created when the machine came up when creation * is in the future of the last time a package was seen due to pfsync. 
*/ if (st->creation > st->expire) flow1->flow_start = flow2->flow_start = htobe64((time_second - time_uptime)*1000); else flow1->flow_start = flow2->flow_start = htobe64((pf_get_time() - (pf_get_uptime() - st->creation))); flow1->flow_finish = flow2->flow_finish = htobe64((pf_get_time() - (pf_get_uptime() - st->expire))); flow1->protocol = flow2->protocol = sk->proto; flow1->tos = flow2->tos = st->rule->tos; } static void copy_nat_ipfix_4_data(struct pflow_ipfix_nat4 *nat1, struct pflow_ipfix_nat4 *nat2, const struct pf_kstate *st, struct pf_state_key *sk, struct pflow_softc *sc, int src, int dst) { nat1->src_ip = nat2->dest_ip = st->key[PF_SK_STACK]->addr[src].v4.s_addr; nat1->src_port = nat2->dest_port = st->key[PF_SK_STACK]->port[src]; nat1->dest_ip = nat2->src_ip = st->key[PF_SK_STACK]->addr[dst].v4.s_addr; nat1->dest_port = nat2->src_port = st->key[PF_SK_STACK]->port[dst]; nat1->postnat_src_ip = nat2->postnat_dest_ip = st->key[PF_SK_WIRE]->addr[src].v4.s_addr; nat1->postnat_src_port = nat2->postnat_dest_port = st->key[PF_SK_WIRE]->port[src]; nat1->postnat_dest_ip = nat2->postnat_src_ip = st->key[PF_SK_WIRE]->addr[dst].v4.s_addr; nat1->postnat_dest_port = nat2->postnat_src_port = st->key[PF_SK_WIRE]->port[dst]; nat1->protocol = nat2->protocol = sk->proto; /* * Because we have to generate a create and delete event we'll fill out the * timestamp and nat_event fields when we transmit. As opposed to doing this * work a second time. */ } static void export_pflow(const struct pf_kstate *st) { struct pflow_softc *sc = NULL; struct pf_state_key *sk; NET_EPOCH_ASSERT(); /* e.g. if pf_state_key_attach() fails. */ if (st->key[PF_SK_STACK] == NULL || st->key[PF_SK_WIRE] == NULL) return; sk = st->key[st->direction == PF_IN ? 
PF_SK_WIRE : PF_SK_STACK]; CK_LIST_FOREACH(sc, &V_pflowif_list, sc_next) { PFLOW_LOCK(sc); switch (sc->sc_version) { case PFLOW_PROTO_5: if (sk->af == AF_INET) export_pflow_if(st, sk, sc); break; case PFLOW_PROTO_10: if (sk->af == AF_INET || sk->af == AF_INET6) export_pflow_if(st, sk, sc); break; default: /* NOTREACHED */ break; } PFLOW_UNLOCK(sc); } } static int export_pflow_if(const struct pf_kstate *st, struct pf_state_key *sk, struct pflow_softc *sc) { struct pf_kstate pfs_copy; u_int64_t bytes[2]; int ret = 0; if (sc->sc_version == PFLOW_PROTO_10) return (pflow_pack_flow_ipfix(st, sk, sc)); /* PFLOW_PROTO_5 */ if ((st->bytes[0] < (u_int64_t)PFLOW_MAXBYTES) && (st->bytes[1] < (u_int64_t)PFLOW_MAXBYTES)) return (pflow_pack_flow(st, sk, sc)); /* flow > PFLOW_MAXBYTES need special handling */ bcopy(st, &pfs_copy, sizeof(pfs_copy)); bytes[0] = pfs_copy.bytes[0]; bytes[1] = pfs_copy.bytes[1]; while (bytes[0] > PFLOW_MAXBYTES) { pfs_copy.bytes[0] = PFLOW_MAXBYTES; pfs_copy.bytes[1] = 0; if ((ret = pflow_pack_flow(&pfs_copy, sk, sc)) != 0) return (ret); if ((bytes[0] - PFLOW_MAXBYTES) > 0) bytes[0] -= PFLOW_MAXBYTES; } while (bytes[1] > (u_int64_t)PFLOW_MAXBYTES) { pfs_copy.bytes[1] = PFLOW_MAXBYTES; pfs_copy.bytes[0] = 0; if ((ret = pflow_pack_flow(&pfs_copy, sk, sc)) != 0) return (ret); if ((bytes[1] - PFLOW_MAXBYTES) > 0) bytes[1] -= PFLOW_MAXBYTES; } pfs_copy.bytes[0] = bytes[0]; pfs_copy.bytes[1] = bytes[1]; return (pflow_pack_flow(&pfs_copy, sk, sc)); } static int copy_flow_to_m(struct pflow_flow *flow, struct pflow_softc *sc) { int ret = 0; PFLOW_ASSERT(sc); if (sc->sc_mbuf == NULL) { if ((sc->sc_mbuf = pflow_get_mbuf(sc, 0)) == NULL) return (ENOBUFS); } m_copyback(sc->sc_mbuf, PFLOW_HDRLEN + (sc->sc_count * sizeof(struct pflow_flow)), sizeof(struct pflow_flow), (caddr_t)flow); pflowstat_inc(pflow_flows); sc->sc_gcounter++; sc->sc_count++; if (sc->sc_count >= sc->sc_maxcount) ret = pflow_sendout_v5(sc); return(ret); } static int copy_flow_ipfix_4_to_m(struct 
pflow_ipfix_flow4 *flow, struct pflow_softc *sc) { int ret = 0; PFLOW_ASSERT(sc); if (sc->sc_mbuf == NULL) { if ((sc->sc_mbuf = pflow_get_mbuf(sc, PFLOW_IPFIX_TMPL_IPV4_ID)) == NULL) { return (ENOBUFS); } sc->sc_count4 = 0; callout_reset(&sc->sc_tmo, PFLOW_TIMEOUT * hz, pflow_timeout, sc); } m_copyback(sc->sc_mbuf, PFLOW_SET_HDRLEN + (sc->sc_count4 * sizeof(struct pflow_ipfix_flow4)), sizeof(struct pflow_ipfix_flow4), (caddr_t)flow); pflowstat_inc(pflow_flows); sc->sc_gcounter++; sc->sc_count4++; if (sc->sc_count4 >= sc->sc_maxcount4) ret = pflow_sendout_ipfix(sc, PFLOW_INET); return(ret); } static int copy_flow_ipfix_6_to_m(struct pflow_ipfix_flow6 *flow, struct pflow_softc *sc) { int ret = 0; PFLOW_ASSERT(sc); if (sc->sc_mbuf6 == NULL) { if ((sc->sc_mbuf6 = pflow_get_mbuf(sc, PFLOW_IPFIX_TMPL_IPV6_ID)) == NULL) { return (ENOBUFS); } sc->sc_count6 = 0; callout_reset(&sc->sc_tmo6, PFLOW_TIMEOUT * hz, pflow_timeout6, sc); } m_copyback(sc->sc_mbuf6, PFLOW_SET_HDRLEN + (sc->sc_count6 * sizeof(struct pflow_ipfix_flow6)), sizeof(struct pflow_ipfix_flow6), (caddr_t)flow); pflowstat_inc(pflow_flows); sc->sc_gcounter++; sc->sc_count6++; if (sc->sc_count6 >= sc->sc_maxcount6) ret = pflow_sendout_ipfix(sc, PFLOW_INET6); return(ret); } int copy_nat_ipfix_4_to_m(struct pflow_ipfix_nat4 *nat, const struct pf_kstate *st, struct pflow_softc *sc, uint8_t event, uint64_t timestamp) { int ret = 0; PFLOW_ASSERT(sc); if (sc->sc_mbuf_nat4 == NULL) { if ((sc->sc_mbuf_nat4 = pflow_get_mbuf(sc, PFLOW_IPFIX_TMPL_NAT44_ID)) == NULL) { return (ENOBUFS); } sc->sc_count_nat4 = 0; callout_reset(&sc->sc_tmo, PFLOW_TIMEOUT * hz, pflow_timeout_nat4, sc); } nat->nat_event = event; nat->timestamp = htobe64(pf_get_time() - (pf_get_uptime() - timestamp)); m_copyback(sc->sc_mbuf_nat4, PFLOW_SET_HDRLEN + (sc->sc_count_nat4 * sizeof(struct pflow_ipfix_nat4)), sizeof(struct pflow_ipfix_nat4), (caddr_t)nat); sc->sc_count_nat4++; pflowstat_inc(pflow_flows); sc->sc_gcounter++; if (sc->sc_count_nat4 >= 
sc->sc_maxcount_nat4) ret = pflow_sendout_ipfix(sc, PFLOW_NAT4); return (ret); } static int pflow_pack_flow(const struct pf_kstate *st, struct pf_state_key *sk, struct pflow_softc *sc) { struct pflow_flow flow1; struct pflow_flow flow2; int ret = 0; bzero(&flow1, sizeof(flow1)); bzero(&flow2, sizeof(flow2)); if (st->direction == PF_OUT) copy_flow_data(&flow1, &flow2, st, sk, 1, 0); else copy_flow_data(&flow1, &flow2, st, sk, 0, 1); if (st->bytes[0] != 0) /* first flow from state */ ret = copy_flow_to_m(&flow1, sc); if (st->bytes[1] != 0) /* second flow from state */ ret = copy_flow_to_m(&flow2, sc); return (ret); } static bool pflow_is_natd(const struct pf_kstate *st) { /* If ports or addresses are different we've been NAT-ed. */ return (memcmp(st->key[PF_SK_WIRE], st->key[PF_SK_STACK], sizeof(struct pf_addr) * 2 + sizeof(uint16_t) * 2) != 0); } static int pflow_pack_flow_ipfix(const struct pf_kstate *st, struct pf_state_key *sk, struct pflow_softc *sc) { struct pflow_ipfix_flow4 flow4_1, flow4_2; struct pflow_ipfix_nat4 nat4_1, nat4_2; struct pflow_ipfix_flow6 flow6_1, flow6_2; int ret = 0; bool nat = false; switch (sk->af) { case AF_INET: bzero(&flow4_1, sizeof(flow4_1)); bzero(&flow4_2, sizeof(flow4_2)); nat = pflow_is_natd(st); if (st->direction == PF_OUT) copy_flow_ipfix_4_data(&flow4_1, &flow4_2, st, sk, sc, 1, 0); else copy_flow_ipfix_4_data(&flow4_1, &flow4_2, st, sk, sc, 0, 1); if (nat) copy_nat_ipfix_4_data(&nat4_1, &nat4_2, st, sk, sc, 1, 0); if (st->bytes[0] != 0) /* first flow from state */ { ret = copy_flow_ipfix_4_to_m(&flow4_1, sc); if (ret == 0 && nat) { ret = copy_nat_ipfix_4_to_m(&nat4_1, st, sc, PFIX_NAT_EVENT_SESSION_CREATE, st->creation); ret |= copy_nat_ipfix_4_to_m(&nat4_1, st, sc, PFIX_NAT_EVENT_SESSION_DELETE, st->expire); } } if (st->bytes[1] != 0) /* second flow from state */ { ret = copy_flow_ipfix_4_to_m(&flow4_2, sc); if (ret == 0 && nat) { ret = copy_nat_ipfix_4_to_m(&nat4_2, st, sc, PFIX_NAT_EVENT_SESSION_CREATE, st->creation); ret 
|= copy_nat_ipfix_4_to_m(&nat4_2, st, sc, PFIX_NAT_EVENT_SESSION_DELETE, st->expire); } } break; case AF_INET6: bzero(&flow6_1, sizeof(flow6_1)); bzero(&flow6_2, sizeof(flow6_2)); if (st->direction == PF_OUT) copy_flow_ipfix_6_data(&flow6_1, &flow6_2, st, sk, sc, 1, 0); else copy_flow_ipfix_6_data(&flow6_1, &flow6_2, st, sk, sc, 0, 1); if (st->bytes[0] != 0) /* first flow from state */ ret = copy_flow_ipfix_6_to_m(&flow6_1, sc); if (st->bytes[1] != 0) /* second flow from state */ ret = copy_flow_ipfix_6_to_m(&flow6_2, sc); break; } return (ret); } static void pflow_timeout(void *v) { struct pflow_softc *sc = v; PFLOW_ASSERT(sc); CURVNET_SET(sc->sc_vnet); switch (sc->sc_version) { case PFLOW_PROTO_5: pflow_sendout_v5(sc); break; case PFLOW_PROTO_10: pflow_sendout_ipfix(sc, PFLOW_INET); break; default: /* NOTREACHED */ panic("Unsupported version %d", sc->sc_version); break; } CURVNET_RESTORE(); } static void pflow_timeout6(void *v) { struct pflow_softc *sc = v; PFLOW_ASSERT(sc); if (sc->sc_version != PFLOW_PROTO_10) return; CURVNET_SET(sc->sc_vnet); pflow_sendout_ipfix(sc, PFLOW_INET6); CURVNET_RESTORE(); } static void pflow_timeout_tmpl(void *v) { struct pflow_softc *sc = v; PFLOW_ASSERT(sc); if (sc->sc_version != PFLOW_PROTO_10) return; CURVNET_SET(sc->sc_vnet); pflow_sendout_ipfix_tmpl(sc); CURVNET_RESTORE(); } static void pflow_timeout_nat4(void *v) { struct pflow_softc *sc = v; PFLOW_ASSERT(sc); if (sc->sc_version != PFLOW_PROTO_10) return; CURVNET_SET(sc->sc_vnet); pflow_sendout_ipfix(sc, PFLOW_NAT4); CURVNET_RESTORE(); } static void pflow_flush(struct pflow_softc *sc) { PFLOW_ASSERT(sc); switch (sc->sc_version) { case PFLOW_PROTO_5: pflow_sendout_v5(sc); break; case PFLOW_PROTO_10: pflow_sendout_ipfix(sc, PFLOW_INET); pflow_sendout_ipfix(sc, PFLOW_INET6); pflow_sendout_ipfix(sc, PFLOW_NAT4); break; default: /* NOTREACHED */ break; } } static int pflow_sendout_v5(struct pflow_softc *sc) { struct mbuf *m = sc->sc_mbuf; struct pflow_header *h; struct timespec tv; 
PFLOW_ASSERT(sc); if (m == NULL) return (0); sc->sc_mbuf = NULL; pflowstat_inc(pflow_packets); h = mtod(m, struct pflow_header *); h->count = htons(sc->sc_count); /* populate pflow_header */ h->uptime_ms = htonl(time_uptime * 1000); getnanotime(&tv); h->time_sec = htonl(tv.tv_sec); /* XXX 2038 */ h->time_nanosec = htonl(tv.tv_nsec); if (mbufq_enqueue(&sc->sc_outputqueue, m) == 0) swi_sched(sc->sc_swi_cookie, 0); return (0); } static int pflow_sendout_ipfix(struct pflow_softc *sc, enum pflow_family_t af) { struct mbuf *m; struct pflow_v10_header *h10; struct pflow_set_header *set_hdr; u_int32_t count; int set_length; PFLOW_ASSERT(sc); switch (af) { case PFLOW_INET: m = sc->sc_mbuf; callout_stop(&sc->sc_tmo); if (m == NULL) return (0); sc->sc_mbuf = NULL; count = sc->sc_count4; set_length = sizeof(struct pflow_set_header) + sc->sc_count4 * sizeof(struct pflow_ipfix_flow4); break; case PFLOW_INET6: m = sc->sc_mbuf6; callout_stop(&sc->sc_tmo6); if (m == NULL) return (0); sc->sc_mbuf6 = NULL; count = sc->sc_count6; set_length = sizeof(struct pflow_set_header) + sc->sc_count6 * sizeof(struct pflow_ipfix_flow6); break; case PFLOW_NAT4: m = sc->sc_mbuf_nat4; callout_stop(&sc->sc_tmo_nat4); if (m == NULL) return (0); sc->sc_mbuf_nat4 = NULL; count = sc->sc_count_nat4; set_length = sizeof(struct pflow_set_header) + sc->sc_count_nat4 * sizeof(struct pflow_ipfix_nat4); break; default: panic("Unsupported AF %d", af); } pflowstat_inc(pflow_packets); set_hdr = mtod(m, struct pflow_set_header *); set_hdr->set_length = htons(set_length); /* populate pflow_header */ M_PREPEND(m, sizeof(struct pflow_v10_header), M_NOWAIT); if (m == NULL) { pflowstat_inc(pflow_onomem); return (ENOBUFS); } h10 = mtod(m, struct pflow_v10_header *); h10->version = htons(PFLOW_PROTO_10); h10->length = htons(PFLOW_IPFIX_HDRLEN + set_length); h10->time_sec = htonl(time_second); /* XXX 2038 */ h10->flow_sequence = htonl(sc->sc_sequence); sc->sc_sequence += count; h10->observation_dom = 
htonl(sc->sc_observation_dom); if (mbufq_enqueue(&sc->sc_outputqueue, m) == 0) swi_sched(sc->sc_swi_cookie, 0); return (0); } static int pflow_sendout_ipfix_tmpl(struct pflow_softc *sc) { struct mbuf *m; struct pflow_v10_header *h10; PFLOW_ASSERT(sc); m = pflow_get_mbuf(sc, 0); if (m == NULL) return (0); m_copyback(m, 0, sizeof(struct pflow_ipfix_tmpl), (caddr_t)&sc->sc_tmpl_ipfix); pflowstat_inc(pflow_packets); /* populate pflow_header */ M_PREPEND(m, sizeof(struct pflow_v10_header), M_NOWAIT); if (m == NULL) { pflowstat_inc(pflow_onomem); return (ENOBUFS); } h10 = mtod(m, struct pflow_v10_header *); h10->version = htons(PFLOW_PROTO_10); h10->length = htons(PFLOW_IPFIX_HDRLEN + sizeof(struct pflow_ipfix_tmpl)); h10->time_sec = htonl(time_second); /* XXX 2038 */ h10->flow_sequence = htonl(sc->sc_sequence); h10->observation_dom = htonl(sc->sc_observation_dom); callout_reset(&sc->sc_tmo_tmpl, PFLOW_TMPL_TIMEOUT * hz, pflow_timeout_tmpl, sc); if (mbufq_enqueue(&sc->sc_outputqueue, m) == 0) swi_sched(sc->sc_swi_cookie, 0); return (0); } static int pflow_sendout_mbuf(struct pflow_softc *sc, struct mbuf *m) { if (sc->so == NULL) { m_freem(m); return (EINVAL); } return (sosend(sc->so, sc->sc_flowdst, NULL, m, NULL, 0, curthread)); } static int sysctl_pflowstats(SYSCTL_HANDLER_ARGS) { struct pflowstats pflowstats; pflowstats.pflow_flows = counter_u64_fetch(V_pflowstats.c[pflow_flows]); pflowstats.pflow_packets = counter_u64_fetch(V_pflowstats.c[pflow_packets]); pflowstats.pflow_onomem = counter_u64_fetch(V_pflowstats.c[pflow_onomem]); pflowstats.pflow_oerrors = counter_u64_fetch(V_pflowstats.c[pflow_oerrors]); return (sysctl_handle_opaque(oidp, &pflowstats, sizeof(pflowstats), req)); } static int pflow_nl_list(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct epoch_tracker et; struct pflow_softc *sc = NULL; struct nl_writer *nw = npt->nw; int error = 0; hdr->nlmsg_flags |= NLM_F_MULTI; NET_EPOCH_ENTER(et); CK_LIST_FOREACH(sc, &V_pflowif_list, sc_next) { if 
(!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr))) { error = ENOMEM; goto out; } struct genlmsghdr *ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr); ghdr_new->cmd = PFLOWNL_CMD_LIST; ghdr_new->version = 0; ghdr_new->reserved = 0; nlattr_add_u32(nw, PFLOWNL_L_ID, sc->sc_id); if (! nlmsg_end(nw)) { error = ENOMEM; goto out; } } out: NET_EPOCH_EXIT(et); if (error != 0) nlmsg_abort(nw); return (error); } static int pflow_nl_create(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct nl_writer *nw = npt->nw; int error = 0; int unit; if (! nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr))) { return (ENOMEM); } struct genlmsghdr *ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr); ghdr_new->cmd = PFLOWNL_CMD_CREATE; ghdr_new->version = 0; ghdr_new->reserved = 0; unit = alloc_unr(V_pflow_unr); if (unit == -1) { nlmsg_abort(nw); return (ENOMEM); } error = pflow_create(unit); if (error != 0) { free_unr(V_pflow_unr, unit); nlmsg_abort(nw); return (error); } nlattr_add_s32(nw, PFLOWNL_CREATE_ID, unit); if (! 
nlmsg_end(nw)) { pflow_destroy(unit, true); return (ENOMEM); } return (0); } struct pflow_parsed_del { int id; }; #define _OUT(_field) offsetof(struct pflow_parsed_del, _field) static const struct nlattr_parser nla_p_del[] = { { .type = PFLOWNL_DEL_ID, .off = _OUT(id), .cb = nlattr_get_uint32 }, }; #undef _OUT NL_DECLARE_PARSER(del_parser, struct genlmsghdr, nlf_p_empty, nla_p_del); static int pflow_nl_del(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct pflow_parsed_del d = {}; int error; error = nl_parse_nlmsg(hdr, &del_parser, npt, &d); if (error != 0) return (error); error = pflow_destroy(d.id, true); return (error); } struct pflow_parsed_get { int id; }; #define _OUT(_field) offsetof(struct pflow_parsed_get, _field) static const struct nlattr_parser nla_p_get[] = { { .type = PFLOWNL_GET_ID, .off = _OUT(id), .cb = nlattr_get_uint32 }, }; #undef _OUT NL_DECLARE_PARSER(get_parser, struct genlmsghdr, nlf_p_empty, nla_p_get); static bool nlattr_add_sockaddr(struct nl_writer *nw, int attr, const struct sockaddr *s) { int off = nlattr_add_nested(nw, attr); if (off == 0) return (false); nlattr_add_u8(nw, PFLOWNL_ADDR_FAMILY, s->sa_family); switch (s->sa_family) { case AF_INET: { const struct sockaddr_in *in = (const struct sockaddr_in *)s; nlattr_add_u16(nw, PFLOWNL_ADDR_PORT, in->sin_port); nlattr_add_in_addr(nw, PFLOWNL_ADDR_IP, &in->sin_addr); break; } case AF_INET6: { const struct sockaddr_in6 *in6 = (const struct sockaddr_in6 *)s; nlattr_add_u16(nw, PFLOWNL_ADDR_PORT, in6->sin6_port); nlattr_add_in6_addr(nw, PFLOWNL_ADDR_IP6, &in6->sin6_addr); break; } default: panic("Unknown address family %d", s->sa_family); } nlattr_set_len(nw, off); return (true); } static int pflow_nl_get(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct epoch_tracker et; struct pflow_parsed_get g = {}; struct pflow_softc *sc = NULL; struct nl_writer *nw = npt->nw; struct genlmsghdr *ghdr_new; int error; error = nl_parse_nlmsg(hdr, &get_parser, npt, &g); if (error != 0) return 
(error); NET_EPOCH_ENTER(et); CK_LIST_FOREACH(sc, &V_pflowif_list, sc_next) { if (sc->sc_id == g.id) break; } if (sc == NULL) { error = ENOENT; goto out; } if (! nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr))) { nlmsg_abort(nw); error = ENOMEM; goto out; } ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr); if (ghdr_new == NULL) { nlmsg_abort(nw); error = ENOMEM; goto out; } ghdr_new->cmd = PFLOWNL_CMD_GET; ghdr_new->version = 0; ghdr_new->reserved = 0; nlattr_add_u32(nw, PFLOWNL_GET_ID, sc->sc_id); nlattr_add_u16(nw, PFLOWNL_GET_VERSION, sc->sc_version); if (sc->sc_flowsrc) nlattr_add_sockaddr(nw, PFLOWNL_GET_SRC, sc->sc_flowsrc); if (sc->sc_flowdst) nlattr_add_sockaddr(nw, PFLOWNL_GET_DST, sc->sc_flowdst); nlattr_add_u32(nw, PFLOWNL_GET_OBSERVATION_DOMAIN, sc->sc_observation_dom); nlattr_add_u8(nw, PFLOWNL_GET_SOCKET_STATUS, sc->so != NULL); if (! nlmsg_end(nw)) { nlmsg_abort(nw); error = ENOMEM; } out: NET_EPOCH_EXIT(et); return (error); } struct pflow_sockaddr { union { struct sockaddr_in in; struct sockaddr_in6 in6; struct sockaddr_storage storage; }; }; static bool pflow_postparse_sockaddr(void *parsed_args, struct nl_pstate *npt __unused) { struct pflow_sockaddr *s = (struct pflow_sockaddr *)parsed_args; if (s->storage.ss_family == AF_INET) s->storage.ss_len = sizeof(struct sockaddr_in); else if (s->storage.ss_family == AF_INET6) s->storage.ss_len = sizeof(struct sockaddr_in6); else return (false); return (true); } #define _OUT(_field) offsetof(struct pflow_sockaddr, _field) static struct nlattr_parser nla_p_sockaddr[] = { { .type = PFLOWNL_ADDR_FAMILY, .off = _OUT(in.sin_family), .cb = nlattr_get_uint8 }, { .type = PFLOWNL_ADDR_PORT, .off = _OUT(in.sin_port), .cb = nlattr_get_uint16 }, { .type = PFLOWNL_ADDR_IP, .off = _OUT(in.sin_addr), .cb = nlattr_get_in_addr }, { .type = PFLOWNL_ADDR_IP6, .off = _OUT(in6.sin6_addr), .cb = nlattr_get_in6_addr }, }; NL_DECLARE_ATTR_PARSER_EXT(addr_parser, nla_p_sockaddr, pflow_postparse_sockaddr); #undef _OUT struct 
pflow_parsed_set { int id; uint16_t version; struct sockaddr_storage src; struct sockaddr_storage dst; uint32_t observation_dom; }; #define _OUT(_field) offsetof(struct pflow_parsed_set, _field) static const struct nlattr_parser nla_p_set[] = { { .type = PFLOWNL_SET_ID, .off = _OUT(id), .cb = nlattr_get_uint32 }, { .type = PFLOWNL_SET_VERSION, .off = _OUT(version), .cb = nlattr_get_uint16 }, { .type = PFLOWNL_SET_SRC, .off = _OUT(src), .arg = &addr_parser, .cb = nlattr_get_nested }, { .type = PFLOWNL_SET_DST, .off = _OUT(dst), .arg = &addr_parser, .cb = nlattr_get_nested }, { .type = PFLOWNL_SET_OBSERVATION_DOMAIN, .off = _OUT(observation_dom), .cb = nlattr_get_uint32 }, }; #undef _OUT NL_DECLARE_PARSER(set_parser, struct genlmsghdr, nlf_p_empty, nla_p_set); static int pflow_set(struct pflow_softc *sc, const struct pflow_parsed_set *pflowr, struct ucred *cred) { struct thread *td; struct socket *so; int error = 0; td = curthread; PFLOW_ASSERT(sc); if (pflowr->version != 0) { switch(pflowr->version) { case PFLOW_PROTO_5: case PFLOW_PROTO_10: break; default: return(EINVAL); } } pflow_flush(sc); if (pflowr->dst.ss_len != 0) { if (sc->sc_flowdst != NULL && sc->sc_flowdst->sa_family != pflowr->dst.ss_family) { free(sc->sc_flowdst, M_DEVBUF); sc->sc_flowdst = NULL; if (sc->so != NULL) { soclose(sc->so); sc->so = NULL; } } switch (pflowr->dst.ss_family) { case AF_INET: if (sc->sc_flowdst == NULL) { if ((sc->sc_flowdst = malloc( sizeof(struct sockaddr_in), M_DEVBUF, M_NOWAIT)) == NULL) return (ENOMEM); } memcpy(sc->sc_flowdst, &pflowr->dst, sizeof(struct sockaddr_in)); sc->sc_flowdst->sa_len = sizeof(struct sockaddr_in); break; case AF_INET6: if (sc->sc_flowdst == NULL) { if ((sc->sc_flowdst = malloc( sizeof(struct sockaddr_in6), M_DEVBUF, M_NOWAIT)) == NULL) return (ENOMEM); } memcpy(sc->sc_flowdst, &pflowr->dst, sizeof(struct sockaddr_in6)); sc->sc_flowdst->sa_len = sizeof(struct sockaddr_in6); break; default: break; } } if (pflowr->src.ss_len != 0) { if (sc->sc_flowsrc 
!= NULL) free(sc->sc_flowsrc, M_DEVBUF); sc->sc_flowsrc = NULL; if (sc->so != NULL) { soclose(sc->so); sc->so = NULL; } switch(pflowr->src.ss_family) { case AF_INET: if ((sc->sc_flowsrc = malloc( sizeof(struct sockaddr_in), M_DEVBUF, M_NOWAIT)) == NULL) return (ENOMEM); memcpy(sc->sc_flowsrc, &pflowr->src, sizeof(struct sockaddr_in)); sc->sc_flowsrc->sa_len = sizeof(struct sockaddr_in); break; case AF_INET6: if ((sc->sc_flowsrc = malloc( sizeof(struct sockaddr_in6), M_DEVBUF, M_NOWAIT)) == NULL) return (ENOMEM); memcpy(sc->sc_flowsrc, &pflowr->src, sizeof(struct sockaddr_in6)); sc->sc_flowsrc->sa_len = sizeof(struct sockaddr_in6); break; default: break; } } if (sc->so == NULL) { if (pflowvalidsockaddr(sc->sc_flowdst, 0)) { error = socreate(sc->sc_flowdst->sa_family, &so, SOCK_DGRAM, IPPROTO_UDP, cred, td); if (error) return (error); if (pflowvalidsockaddr(sc->sc_flowsrc, 1)) { error = sobind(so, sc->sc_flowsrc, td); if (error) { soclose(so); return (error); } } sc->so = so; } } else if (!pflowvalidsockaddr(sc->sc_flowdst, 0)) { soclose(sc->so); sc->so = NULL; } if (pflowr->observation_dom != 0) sc->sc_observation_dom = pflowr->observation_dom; /* error check is above */ if (pflowr->version != 0) sc->sc_version = pflowr->version; pflow_setmtu(sc, ETHERMTU); switch (sc->sc_version) { case PFLOW_PROTO_5: callout_stop(&sc->sc_tmo6); callout_stop(&sc->sc_tmo_tmpl); break; case PFLOW_PROTO_10: callout_reset(&sc->sc_tmo_tmpl, PFLOW_TMPL_TIMEOUT * hz, pflow_timeout_tmpl, sc); break; default: /* NOTREACHED */ break; } return (0); } static int pflow_nl_set(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct epoch_tracker et; struct pflow_parsed_set s = {}; struct pflow_softc *sc = NULL; int error; error = nl_parse_nlmsg(hdr, &set_parser, npt, &s); if (error != 0) return (error); NET_EPOCH_ENTER(et); CK_LIST_FOREACH(sc, &V_pflowif_list, sc_next) { if (sc->sc_id == s.id) break; } if (sc == NULL) { error = ENOENT; goto out; } PFLOW_LOCK(sc); error = pflow_set(sc, &s, 
nlp_get_cred(npt->nlp)); PFLOW_UNLOCK(sc); out: NET_EPOCH_EXIT(et); return (error); } static const struct genl_cmd pflow_cmds[] = { { .cmd_num = PFLOWNL_CMD_LIST, .cmd_name = "LIST", .cmd_cb = pflow_nl_list, .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_NETINET_PF, }, { .cmd_num = PFLOWNL_CMD_CREATE, .cmd_name = "CREATE", .cmd_cb = pflow_nl_create, .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_NETINET_PF, }, { .cmd_num = PFLOWNL_CMD_DEL, .cmd_name = "DEL", .cmd_cb = pflow_nl_del, .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_NETINET_PF, }, { .cmd_num = PFLOWNL_CMD_GET, .cmd_name = "GET", .cmd_cb = pflow_nl_get, .cmd_flags = GENL_CMD_CAP_DUMP | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_NETINET_PF, }, { .cmd_num = PFLOWNL_CMD_SET, .cmd_name = "SET", .cmd_cb = pflow_nl_set, .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_NETINET_PF, }, }; static const struct nlhdr_parser *all_parsers[] = { &del_parser, &get_parser, &set_parser, }; static unsigned pflow_do_osd_jail_slot; +static uint16_t family_id; static int pflow_init(void) { bool ret; - int family_id __diagused; NL_VERIFY_PARSERS(all_parsers); static osd_method_t methods[PR_MAXMETHOD] = { [PR_METHOD_REMOVE] = pflow_jail_remove, }; pflow_do_osd_jail_slot = osd_jail_register(NULL, methods); - family_id = genl_register_family(PFLOWNL_FAMILY_NAME, 0, 2, PFLOWNL_CMD_MAX); + family_id = genl_register_family(PFLOWNL_FAMILY_NAME, 0, 2, + PFLOWNL_CMD_MAX); MPASS(family_id != 0); - ret = genl_register_cmds(PFLOWNL_FAMILY_NAME, pflow_cmds, - nitems(pflow_cmds)); + ret = genl_register_cmds(family_id, pflow_cmds, nitems(pflow_cmds)); return (ret ? 
0 : ENODEV); } static void pflow_uninit(void) { osd_jail_deregister(pflow_do_osd_jail_slot); - genl_unregister_family(PFLOWNL_FAMILY_NAME); + genl_unregister_family(family_id); } static int pflow_modevent(module_t mod, int type, void *data) { int error = 0; switch (type) { case MOD_LOAD: error = pflow_init(); break; case MOD_UNLOAD: pflow_uninit(); break; default: error = EINVAL; break; } return (error); } static moduledata_t pflow_mod = { pflowname, pflow_modevent, 0 }; DECLARE_MODULE(pflow, pflow_mod, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY); MODULE_VERSION(pflow, 1); MODULE_DEPEND(pflow, pf, PF_MODVER, PF_MODVER, PF_MODVER); diff --git a/sys/rpc/clnt_nl.c b/sys/rpc/clnt_nl.c index 177566232cb5..90d159907d3c 100644 --- a/sys/rpc/clnt_nl.c +++ b/sys/rpc/clnt_nl.c @@ -1,521 +1,520 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2025 Gleb Smirnoff * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Kernel RPC client over netlink(4), where kernel is RPC client and an * application is a server. See svc_nl.c in the libc/rpc as the counterpart. * * The module registers itself within generic netlink families list under name * "rpc". Every new client creates a new multicast group belonging to this * family. When a client starts RPC, the module will multicast the call to * potential netlink listeners and sleep/retry until receiving a result. The * framing of the request: * * [netlink message header, type = "rpc" ID, seq == xid] * [generic netlink header, cmd = RPCNL_REQUEST] * [netlink attribute RPCNL_REQUEST_GROUP] * [group ID] * [netlink attribute RPCNL_REQUEST_BODY] * [XDR encoded payload] * * Note: the generic netlink header and attributes aren't really necessary * for successful communication, since the netlink multicast membership already * guarantees us all needed filtering. The working prototype was putting the * XDR encoded payload right after netlink message header. But we will provide * this framing to allow for any future extensions. 
* * The expected RPC result from the userland shall be framed like this: * * [netlink message header, type = "rpc" ID, seq == xid] * [generic netlink header, cmd = RPCNL_REPLY] * [netlink attribute RPCNL_REPLY_GROUP] * [group ID] * [netlink attribute RPCNL_REPLY_BODY] * [XDR encoded payload] * * Disclaimer: has been designed and tested only for the NFS related kernel * RPC clients: kgssapi, RPC binding for NLM, TLS client and TLS server. * * Caveats: * 1) Now the privilege checking is hardcoded to PRIV_NFS_DAEMON at the netlink * command and multicast layers. If any new client in addition to NFS * service emerges, we may want to rewrite privilege checking at the client * level somehow. * 2) Since we are using netlink attribute for the payload, payload size is * limited to UINT16_MAX. Today RPC_MAXDATASIZE of 9000 is smaller than that. * What if a future RPC wants more? */ static enum clnt_stat clnt_nl_call(CLIENT *, struct rpc_callextra *, rpcproc_t, struct mbuf *, struct mbuf **, struct timeval); static void clnt_nl_close(CLIENT *); static void clnt_nl_destroy(CLIENT *); static bool_t clnt_nl_control(CLIENT *, u_int, void *); static const struct clnt_ops clnt_nl_ops = { .cl_call = clnt_nl_call, .cl_close = clnt_nl_close, .cl_destroy = clnt_nl_destroy, .cl_control = clnt_nl_control, }; static int clnt_nl_reply(struct nlmsghdr *, struct nl_pstate *); static const struct genl_cmd clnt_cmds[] = { { .cmd_num = RPCNL_REPLY, .cmd_name = "request", .cmd_cb = clnt_nl_reply, .cmd_priv = PRIV_NFS_DAEMON, }, }; struct nl_reply_parsed { uint32_t group; struct nlattr *data; }; static const struct nlattr_parser rpcnl_attr_parser[] = { #define OUT(field) offsetof(struct nl_reply_parsed, field) { .type = RPCNL_REPLY_GROUP, .off = OUT(group), .cb = nlattr_get_uint32 }, { .type = RPCNL_REPLY_BODY, .off = OUT(data), .cb = nlattr_get_nla }, #undef OUT }; NL_DECLARE_PARSER(rpcnl_parser, struct genlmsghdr, nlf_p_empty, rpcnl_attr_parser); struct nl_data { struct mtx nl_lock;
RB_ENTRY(nl_data) nl_tree; TAILQ_HEAD(, ct_request) nl_pending; uint32_t nl_xid; u_int nl_mpos; u_int nl_authlen; u_int nl_retries; struct { struct genlmsghdr ghdr; struct nlattr gattr; uint32_t group; } nl_hdr; /* pre-initialized header */ char nl_mcallc[MCALL_MSG_SIZE]; /* marshalled callmsg */ /* msleep(9) arguments */ const char * nl_wchan; int nl_prio; int nl_timo; }; static RB_HEAD(nl_data_t, nl_data) rpcnl_clients; static int32_t nl_data_compare(const struct nl_data *a, const struct nl_data *b) { return ((int32_t)(a->nl_hdr.group - b->nl_hdr.group)); } RB_GENERATE_STATIC(nl_data_t, nl_data, nl_tree, nl_data_compare); static struct rwlock rpcnl_global_lock; static const char rpcnl_family_name[] = "rpc"; static uint16_t rpcnl_family_id; void rpcnl_init(void) { bool rv __diagused; rpcnl_family_id = genl_register_family(rpcnl_family_name, 0, 1, 1); MPASS(rpcnl_family_id != 0); - rv = genl_register_cmds(rpcnl_family_name, clnt_cmds, - nitems(clnt_cmds)); + rv = genl_register_cmds(rpcnl_family_id, clnt_cmds, nitems(clnt_cmds)); MPASS(rv); rw_init(&rpcnl_global_lock, rpcnl_family_name); } CLIENT * client_nl_create(const char *name, const rpcprog_t program, const rpcvers_t version) { CLIENT *cl; struct nl_data *nl; struct timeval now; struct rpc_msg call_msg; XDR xdrs; uint32_t group; bool rv __diagused; - if ((group = genl_register_group(rpcnl_family_name, name)) == 0) + if ((group = genl_register_group(rpcnl_family_id, name)) == 0) return (NULL); nl = malloc(sizeof(*nl), M_RPC, M_WAITOK); *nl = (struct nl_data){ .nl_pending = TAILQ_HEAD_INITIALIZER(nl->nl_pending), .nl_hdr = { .ghdr.cmd = RPCNL_REQUEST, .gattr.nla_type = RPCNL_REQUEST_GROUP, .gattr.nla_len = sizeof(struct nlattr) + sizeof(uint32_t), .group = group, }, .nl_wchan = rpcnl_family_name, .nl_prio = PSOCK | PCATCH, .nl_timo = 60 * hz, .nl_retries = 1, }; mtx_init(&nl->nl_lock, "rpc_clnt_nl", NULL, MTX_DEF); /* * Initialize and pre-serialize the static part of the call message. 
*/ getmicrotime(&now); nl->nl_xid = __RPC_GETXID(&now); call_msg = (struct rpc_msg ){ .rm_xid = nl->nl_xid, .rm_direction = CALL, .rm_call = { .cb_rpcvers = RPC_MSG_VERSION, .cb_prog = (uint32_t)program, .cb_vers = (uint32_t)version, }, }; cl = malloc(sizeof(*cl), M_RPC, M_WAITOK); *cl = (CLIENT){ .cl_refs = 1, .cl_ops = &clnt_nl_ops, .cl_private = nl, .cl_auth = authnone_create(), }; /* * Experimentally learn how many bytes does procedure name plus * authnone header needs. Use nl_mcallc as temporary scratch space. */ xdrmem_create(&xdrs, nl->nl_mcallc, MCALL_MSG_SIZE, XDR_ENCODE); rv = xdr_putint32(&xdrs, &(rpcproc_t){0}); MPASS(rv); rv = AUTH_MARSHALL(cl->cl_auth, 0, &xdrs, NULL); MPASS(rv); nl->nl_authlen = xdr_getpos(&xdrs); xdr_destroy(&xdrs); xdrmem_create(&xdrs, nl->nl_mcallc, MCALL_MSG_SIZE, XDR_ENCODE); rv = xdr_callhdr(&xdrs, &call_msg); MPASS(rv); nl->nl_mpos = xdr_getpos(&xdrs); xdr_destroy(&xdrs); rw_wlock(&rpcnl_global_lock); RB_INSERT(nl_data_t, &rpcnl_clients, nl); rw_wunlock(&rpcnl_global_lock); return (cl); } static enum clnt_stat clnt_nl_call(CLIENT *cl, struct rpc_callextra *ext, rpcproc_t proc, struct mbuf *args, struct mbuf **resultsp, struct timeval utimeout) { struct nl_writer nw; struct nl_data *nl = cl->cl_private; struct ct_request *cr; struct rpc_err *errp, err; enum clnt_stat stat; AUTH *auth; XDR xdrs; void *mem; uint32_t len, xlen; u_int retries = 0; bool rv __diagused; CURVNET_ASSERT_SET(); cr = malloc(sizeof(struct ct_request), M_RPC, M_WAITOK); *cr = (struct ct_request){ .cr_xid = atomic_fetchadd_32(&nl->nl_xid, 1), .cr_error = ETIMEDOUT, #ifdef VIMAGE .cr_vnet = curvnet, #endif }; if (ext) { auth = ext->rc_auth; errp = &ext->rc_err; len = RPC_MAXDATASIZE; /* XXXGL: can be improved */ } else { auth = cl->cl_auth; errp = &err; len = nl->nl_mpos + nl->nl_authlen + m_length(args, NULL); } mem = malloc(len, M_RPC, M_WAITOK); retry: xdrmem_create(&xdrs, mem, len, XDR_ENCODE); rv = xdr_putbytes(&xdrs, nl->nl_mcallc, nl->nl_mpos); 
MPASS(rv); rv = xdr_putint32(&xdrs, &proc); MPASS(rv); if (!AUTH_MARSHALL(auth, cr->cr_xid, &xdrs, args)) { stat = errp->re_status = RPC_CANTENCODEARGS; goto out; } else stat = errp->re_status = RPC_SUCCESS; /* XXX: XID is the first thing in the request. */ *(uint32_t *)mem = htonl(cr->cr_xid); xlen = xdr_getpos(&xdrs); rv = nl_writer_group(&nw, xlen, NETLINK_GENERIC, nl->nl_hdr.group, PRIV_NFS_DAEMON, true); MPASS(rv); rv = nlmsg_add(&nw, 0, cr->cr_xid, rpcnl_family_id, 0, sizeof(nl->nl_hdr) + sizeof(struct nlattr) + xlen); MPASS(rv); memcpy(nlmsg_reserve_data_raw(&nw, sizeof(nl->nl_hdr)), &nl->nl_hdr, sizeof(nl->nl_hdr)); rv = nlattr_add(&nw, RPCNL_REQUEST_BODY, xlen, mem); MPASS(rv); rv = nlmsg_end(&nw); MPASS(rv); mtx_lock(&nl->nl_lock); TAILQ_INSERT_TAIL(&nl->nl_pending, cr, cr_link); mtx_unlock(&nl->nl_lock); nlmsg_flush(&nw); mtx_lock(&nl->nl_lock); if (__predict_true(cr->cr_error == ETIMEDOUT)) (void)msleep(cr, &nl->nl_lock, nl->nl_prio, nl->nl_wchan, (nl->nl_timo ? nl->nl_timo : tvtohz(&utimeout)) / nl->nl_retries); TAILQ_REMOVE(&nl->nl_pending, cr, cr_link); mtx_unlock(&nl->nl_lock); if (__predict_true(cr->cr_error == 0)) { struct rpc_msg reply_msg = { .acpted_rply.ar_verf.oa_base = cr->cr_verf, .acpted_rply.ar_results.proc = (xdrproc_t)xdr_void, }; MPASS(cr->cr_mrep); if (ext && ext->rc_feedback) ext->rc_feedback(FEEDBACK_OK, proc, ext->rc_feedback_arg); xdrmbuf_create(&xdrs, cr->cr_mrep, XDR_DECODE); rv = xdr_replymsg(&xdrs, &reply_msg); if (__predict_false(!rv)) { stat = errp->re_status = RPC_CANTDECODERES; goto out; } if ((reply_msg.rm_reply.rp_stat == MSG_ACCEPTED) && (reply_msg.acpted_rply.ar_stat == SUCCESS)) { struct mbuf *results; stat = errp->re_status = RPC_SUCCESS; results = xdrmbuf_getall(&xdrs); if (__predict_true(AUTH_VALIDATE(auth, cr->cr_xid, &reply_msg.acpted_rply.ar_verf, &results))) { MPASS(results); *resultsp = results; /* end successful completion */ } else { stat = errp->re_status = RPC_AUTHERROR; errp->re_why = AUTH_INVALIDRESP; } 
} else { stat = _seterr_reply(&reply_msg, errp); } xdr_destroy(&xdrs); /* frees cr->cr_mrep */ } else { MPASS(cr->cr_mrep == NULL); errp->re_errno = cr->cr_error; stat = errp->re_status = RPC_CANTRECV; if (cr->cr_error == ETIMEDOUT && ++retries < nl->nl_retries) { cr->cr_xid = atomic_fetchadd_32(&nl->nl_xid, 1); goto retry; } } out: free(cr, M_RPC); free(mem, M_RPC); return (stat); } static int clnt_nl_reply(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct nl_reply_parsed attrs = {}; struct nl_data *nl; struct ct_request *cr; struct mchain mc; int error; CURVNET_ASSERT_SET(); if ((error = nl_parse_nlmsg(hdr, &rpcnl_parser, npt, &attrs)) != 0) return (error); if (attrs.data == NULL) return (EINVAL); error = mc_get(&mc, NLA_DATA_LEN(attrs.data), M_WAITOK, MT_DATA, 0); MPASS(error == 0); m_copyback(mc_first(&mc), 0, NLA_DATA_LEN(attrs.data), NLA_DATA(attrs.data)); rw_rlock(&rpcnl_global_lock); if ((nl = RB_FIND(nl_data_t, &rpcnl_clients, &(struct nl_data){ .nl_hdr.group = attrs.group })) == NULL) { rw_runlock(&rpcnl_global_lock); mc_freem(&mc); return (EPROGUNAVAIL); }; mtx_lock(&nl->nl_lock); rw_runlock(&rpcnl_global_lock); TAILQ_FOREACH(cr, &nl->nl_pending, cr_link) if (cr->cr_xid == hdr->nlmsg_seq #ifdef VIMAGE && cr->cr_vnet == curvnet #endif ) break; if (cr == NULL) { mtx_unlock(&nl->nl_lock); mc_freem(&mc); return (EPROCUNAVAIL); } cr->cr_mrep = mc_first(&mc); cr->cr_error = 0; wakeup(cr); mtx_unlock(&nl->nl_lock); return (0); } static void clnt_nl_close(CLIENT *cl) { struct nl_data *nl = cl->cl_private; struct ct_request *cr; mtx_lock(&nl->nl_lock); TAILQ_FOREACH(cr, &nl->nl_pending, cr_link) { cr->cr_error = ESHUTDOWN; wakeup(cr); } mtx_unlock(&nl->nl_lock); } static void clnt_nl_destroy(CLIENT *cl) { struct nl_data *nl = cl->cl_private; MPASS(TAILQ_EMPTY(&nl->nl_pending)); rw_wlock(&rpcnl_global_lock); RB_REMOVE(nl_data_t, &rpcnl_clients, nl); rw_wlock(&rpcnl_global_lock); mtx_destroy(&nl->nl_lock); free(nl, M_RPC); free(cl, M_RPC); } static bool_t 
clnt_nl_control(CLIENT *cl, u_int request, void *info) { struct nl_data *nl = (struct nl_data *)cl->cl_private; mtx_lock(&nl->nl_lock); switch (request) { case CLSET_TIMEOUT: nl->nl_timo = tvtohz((struct timeval *)info); break; case CLGET_TIMEOUT: *(struct timeval *)info = (struct timeval){.tv_sec = nl->nl_timo / hz}; break; case CLSET_RETRIES: nl->nl_retries = *(u_int *)info; break; case CLSET_WAITCHAN: nl->nl_wchan = (const char *)info; break; case CLGET_WAITCHAN: *(const char **)info = nl->nl_wchan; break; case CLSET_INTERRUPTIBLE: if (*(int *)info) nl->nl_prio |= PCATCH; else nl->nl_prio &= ~PCATCH; break; case CLGET_INTERRUPTIBLE: *(int *)info = (nl->nl_prio & PCATCH) ? TRUE : FALSE; break; default: mtx_unlock(&nl->nl_lock); printf("%s: unsupported request %u\n", __func__, request); return (FALSE); } mtx_unlock(&nl->nl_lock); return (TRUE); } diff --git a/sys/tests/ktest.c b/sys/tests/ktest.c index 694e1f4229b5..640710f2b89e 100644 --- a/sys/tests/ktest.c +++ b/sys/tests/ktest.c @@ -1,414 +1,413 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2023 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct mtx ktest_mtx; #define KTEST_LOCK() mtx_lock(&ktest_mtx) #define KTEST_UNLOCK() mtx_unlock(&ktest_mtx) #define KTEST_LOCK_ASSERT() mtx_assert(&ktest_mtx, MA_OWNED) MTX_SYSINIT(ktest_mtx, &ktest_mtx, "ktest mutex", MTX_DEF); struct ktest_module { struct ktest_module_info *info; volatile u_int refcount; TAILQ_ENTRY(ktest_module) entries; }; static TAILQ_HEAD(, ktest_module) module_list = TAILQ_HEAD_INITIALIZER(module_list); struct nl_ktest_parsed { char *mod_name; char *test_name; struct nlattr *test_meta; }; #define _IN(_field) offsetof(struct genlmsghdr, _field) #define _OUT(_field) offsetof(struct nl_ktest_parsed, _field) static const struct nlattr_parser nla_p_get[] = { { .type = KTEST_ATTR_MOD_NAME, .off = _OUT(mod_name), .cb = nlattr_get_string }, { .type = KTEST_ATTR_TEST_NAME, .off = _OUT(test_name), .cb = nlattr_get_string }, { .type = KTEST_ATTR_TEST_META, .off = _OUT(test_meta), .cb = nlattr_get_nla }, }; static const struct nlfield_parser nlf_p_get[] = { }; NL_DECLARE_PARSER(ktest_parser, struct genlmsghdr, nlf_p_get, nla_p_get); #undef _IN #undef _OUT static bool create_reply(struct nl_writer *nw, struct nlmsghdr *hdr, int cmd) { if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr))) return (false); struct genlmsghdr *ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr); ghdr_new->cmd = cmd; 
ghdr_new->version = 0; ghdr_new->reserved = 0; return (true); } static int dump_mod_test(struct nlmsghdr *hdr, struct nl_pstate *npt, struct ktest_module *mod, const struct ktest_test_info *test_info) { struct nl_writer *nw = npt->nw; if (!create_reply(nw, hdr, KTEST_CMD_NEWTEST)) goto enomem; nlattr_add_string(nw, KTEST_ATTR_MOD_NAME, mod->info->name); nlattr_add_string(nw, KTEST_ATTR_TEST_NAME, test_info->name); nlattr_add_string(nw, KTEST_ATTR_TEST_DESCR, test_info->desc); if (nlmsg_end(nw)) return (0); enomem: nlmsg_abort(nw); return (ENOMEM); } static int dump_mod_tests(struct nlmsghdr *hdr, struct nl_pstate *npt, struct ktest_module *mod, struct nl_ktest_parsed *attrs) { for (int i = 0; i < mod->info->num_tests; i++) { const struct ktest_test_info *test_info = &mod->info->tests[i]; if (attrs->test_name != NULL && strcmp(attrs->test_name, test_info->name)) continue; int error = dump_mod_test(hdr, npt, mod, test_info); if (error != 0) return (error); } return (0); } static int dump_tests(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct nl_ktest_parsed attrs = { }; struct ktest_module *mod; int error; error = nl_parse_nlmsg(hdr, &ktest_parser, npt, &attrs); if (error != 0) return (error); hdr->nlmsg_flags |= NLM_F_MULTI; KTEST_LOCK(); TAILQ_FOREACH(mod, &module_list, entries) { if (attrs.mod_name && strcmp(attrs.mod_name, mod->info->name)) continue; error = dump_mod_tests(hdr, npt, mod, &attrs); if (error != 0) break; } KTEST_UNLOCK(); if (!nlmsg_end_dump(npt->nw, error, hdr)) { //NL_LOG(LOG_DEBUG, "Unable to finalize the dump"); return (ENOMEM); } return (error); } static int run_test(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct nl_ktest_parsed attrs = { }; struct ktest_module *mod; int error; error = nl_parse_nlmsg(hdr, &ktest_parser, npt, &attrs); if (error != 0) return (error); if (attrs.mod_name == NULL) { nlmsg_report_err_msg(npt, "KTEST_ATTR_MOD_NAME not set"); return (EINVAL); } if (attrs.test_name == NULL) { nlmsg_report_err_msg(npt, 
"KTEST_ATTR_TEST_NAME not set"); return (EINVAL); } const struct ktest_test_info *test = NULL; KTEST_LOCK(); TAILQ_FOREACH(mod, &module_list, entries) { if (strcmp(attrs.mod_name, mod->info->name)) continue; const struct ktest_module_info *info = mod->info; for (int i = 0; i < info->num_tests; i++) { const struct ktest_test_info *test_info = &info->tests[i]; if (!strcmp(attrs.test_name, test_info->name)) { test = test_info; break; } } break; } if (test != NULL) refcount_acquire(&mod->refcount); KTEST_UNLOCK(); if (test == NULL) return (ESRCH); /* Run the test */ struct ktest_test_context ctx = { .npt = npt, .hdr = hdr, .buf = npt_alloc(npt, KTEST_MAX_BUF), .bufsize = KTEST_MAX_BUF, }; if (ctx.buf == NULL) { //NL_LOG(LOG_DEBUG, "unable to allocate temporary buffer"); return (ENOMEM); } if (test->parse != NULL && attrs.test_meta != NULL) { error = test->parse(&ctx, attrs.test_meta); if (error != 0) return (error); } hdr->nlmsg_flags |= NLM_F_MULTI; KTEST_LOG_LEVEL(&ctx, LOG_INFO, "start running %s", test->name); error = test->func(&ctx); KTEST_LOG_LEVEL(&ctx, LOG_INFO, "end running %s", test->name); refcount_release(&mod->refcount); if (!nlmsg_end_dump(npt->nw, error, hdr)) { //NL_LOG(LOG_DEBUG, "Unable to finalize the dump"); return (ENOMEM); } return (error); } /* USER API */ static void register_test_module(struct ktest_module_info *info) { struct ktest_module *mod = malloc(sizeof(*mod), M_TEMP, M_WAITOK | M_ZERO); mod->info = info; info->module_ptr = mod; KTEST_LOCK(); TAILQ_INSERT_TAIL(&module_list, mod, entries); KTEST_UNLOCK(); } static void unregister_test_module(struct ktest_module_info *info) { struct ktest_module *mod = info->module_ptr; info->module_ptr = NULL; KTEST_LOCK(); TAILQ_REMOVE(&module_list, mod, entries); KTEST_UNLOCK(); free(mod, M_TEMP); } static bool can_unregister(struct ktest_module_info *info) { struct ktest_module *mod = info->module_ptr; return (refcount_load(&mod->refcount) == 0); } int ktest_default_modevent(module_t mod, int type, 
void *arg) { struct ktest_module_info *info = (struct ktest_module_info *)arg; int error = 0; switch (type) { case MOD_LOAD: register_test_module(info); break; case MOD_UNLOAD: if (!can_unregister(info)) return (EBUSY); unregister_test_module(info); break; default: error = EOPNOTSUPP; break; } return (error); } bool ktest_start_msg(struct ktest_test_context *ctx) { return (create_reply(ctx->npt->nw, ctx->hdr, KTEST_CMD_NEWMESSAGE)); } void ktest_add_msg_meta(struct ktest_test_context *ctx, const char *func, const char *fname, int line) { struct nl_writer *nw = ctx->npt->nw; struct timespec ts; nanouptime(&ts); nlattr_add(nw, KTEST_MSG_ATTR_TS, sizeof(ts), &ts); nlattr_add_string(nw, KTEST_MSG_ATTR_FUNC, func); nlattr_add_string(nw, KTEST_MSG_ATTR_FILE, fname); nlattr_add_u32(nw, KTEST_MSG_ATTR_LINE, line); } void ktest_add_msg_text(struct ktest_test_context *ctx, int msg_level, const char *fmt, ...) { va_list ap; va_start(ap, fmt); vsnprintf(ctx->buf, ctx->bufsize, fmt, ap); va_end(ap); nlattr_add_u8(ctx->npt->nw, KTEST_MSG_ATTR_LEVEL, msg_level); nlattr_add_string(ctx->npt->nw, KTEST_MSG_ATTR_TEXT, ctx->buf); } void ktest_end_msg(struct ktest_test_context *ctx) { nlmsg_end(ctx->npt->nw); } /* Module glue */ static const struct nlhdr_parser *all_parsers[] = { &ktest_parser }; static const struct genl_cmd ktest_cmds[] = { { .cmd_num = KTEST_CMD_LIST, .cmd_name = "KTEST_CMD_LIST", .cmd_cb = dump_tests, .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP | GENL_CMD_CAP_HASPOL, }, { .cmd_num = KTEST_CMD_RUN, .cmd_name = "KTEST_CMD_RUN", .cmd_cb = run_test, .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_HASPOL, .cmd_priv = PRIV_KLD_LOAD, }, }; +static int family_id; static void ktest_nl_register(void) { bool ret __diagused; - int family_id __diagused; NL_VERIFY_PARSERS(all_parsers); family_id = genl_register_family(KTEST_FAMILY_NAME, 0, 1, KTEST_CMD_MAX); MPASS(family_id != 0); - ret = genl_register_cmds(KTEST_FAMILY_NAME, ktest_cmds, - nitems(ktest_cmds)); + ret = 
genl_register_cmds(family_id, ktest_cmds, nitems(ktest_cmds)); MPASS(ret); } static void ktest_nl_unregister(void) { MPASS(TAILQ_EMPTY(&module_list)); - genl_unregister_family(KTEST_FAMILY_NAME); + genl_unregister_family(family_id); } static int ktest_modevent(module_t mod, int type, void *unused) { int error = 0; switch (type) { case MOD_LOAD: ktest_nl_register(); break; case MOD_UNLOAD: ktest_nl_unregister(); break; default: error = EOPNOTSUPP; break; } return (error); } static moduledata_t ktestmod = { "ktest", ktest_modevent, 0 }; DECLARE_MODULE(ktestmod, ktestmod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(ktestmod, 1); MODULE_DEPEND(ktestmod, netlink, 1, 1, 1);