diff --git a/sys/compat/linux/linux_netlink.c b/sys/compat/linux/linux_netlink.c index 775a36994d2d..85c0e6ba33a0 100644 --- a/sys/compat/linux/linux_netlink.c +++ b/sys/compat/linux/linux_netlink.c @@ -1,624 +1,624 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_netlink.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define DEBUG_MOD_NAME nl_linux #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include -_DECLARE_DEBUG(LOG_DEBUG); +_DECLARE_DEBUG(LOG_INFO); static bool valid_rta_size(const struct rtattr *rta, int sz) { return (NL_RTA_DATA_LEN(rta) == sz); } static bool valid_rta_u32(const struct rtattr *rta) { return (valid_rta_size(rta, sizeof(uint32_t))); } static uint32_t _rta_get_uint32(const struct rtattr *rta) { return (*((const uint32_t *)NL_RTA_DATA_CONST(rta))); } static struct nlmsghdr * rtnl_neigh_from_linux(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct ndmsg *ndm = (struct ndmsg *)(hdr + 1); if (hdr->nlmsg_len >= sizeof(struct nlmsghdr) + sizeof(struct ndmsg)) ndm->ndm_family = linux_to_bsd_domain(ndm->ndm_family); return (hdr); } static struct nlmsghdr * rtnl_ifaddr_from_linux(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct ifaddrmsg *ifam = (struct ifaddrmsg *)(hdr + 1); if (hdr->nlmsg_len >= sizeof(struct nlmsghdr) + sizeof(struct ifaddrmsg)) ifam->ifa_family = linux_to_bsd_domain(ifam->ifa_family); return (hdr); } static struct nlmsghdr * rtnl_route_from_linux(struct nlmsghdr *hdr, struct nl_pstate *npt) { /* Tweak address families and default fib only */ struct rtmsg *rtm = (struct rtmsg *)(hdr + 1); struct nlattr *nla, *nla_head; int attrs_len; rtm->rtm_family = linux_to_bsd_domain(rtm->rtm_family); if (rtm->rtm_table == 254) rtm->rtm_table = 0; attrs_len = hdr->nlmsg_len - sizeof(struct nlmsghdr); attrs_len -= NETLINK_ALIGN(sizeof(struct rtmsg)); nla_head = (struct nlattr *)((char *)rtm + NETLINK_ALIGN(sizeof(struct rtmsg))); NLA_FOREACH(nla, nla_head, attrs_len) { RT_LOG(LOG_DEBUG3, "GOT type %d len %d total %d", nla->nla_type, nla->nla_len, attrs_len); struct rtattr *rta = (struct rtattr *)nla; if (rta->rta_len < sizeof(struct rtattr)) { break; } switch (rta->rta_type) { case NL_RTA_TABLE: if (!valid_rta_u32(rta)) goto done; rtm->rtm_table = 0; uint32_t fibnum = _rta_get_uint32(rta); RT_LOG(LOG_DEBUG3, "GET RTABLE: %u", fibnum); if (fibnum == 254) { *((uint32_t *)NL_RTA_DATA(rta)) = 0; } break; } } done: return (hdr); } static struct nlmsghdr * rtnl_from_linux(struct nlmsghdr *hdr, struct nl_pstate *npt) { switch (hdr->nlmsg_type) { case NL_RTM_GETROUTE: case NL_RTM_NEWROUTE: case NL_RTM_DELROUTE: return (rtnl_route_from_linux(hdr, npt)); case NL_RTM_GETNEIGH: return (rtnl_neigh_from_linux(hdr, npt)); case NL_RTM_GETADDR: return (rtnl_ifaddr_from_linux(hdr, npt)); /* Silence warning for the messages where no translation is required */ case NL_RTM_NEWLINK: case NL_RTM_DELLINK: case NL_RTM_GETLINK: break; default: RT_LOG(LOG_DEBUG, "Passing message type %d untranslated", hdr->nlmsg_type); } return (hdr); } static struct nlmsghdr * nlmsg_from_linux(int netlink_family, struct nlmsghdr *hdr, struct nl_pstate *npt) { switch (netlink_family) { case NETLINK_ROUTE: return (rtnl_from_linux(hdr, npt)); } return (hdr); } /************************************************************ * Kernel -> Linux ************************************************************/ static bool handle_default_out(struct nlmsghdr *hdr, struct nl_writer *nw) { char *out_hdr; out_hdr = nlmsg_reserve_data(nw, NLMSG_ALIGN(hdr->nlmsg_len), char); if (out_hdr != NULL) { memcpy(out_hdr, hdr, hdr->nlmsg_len); return (true); } return (false); } static bool nlmsg_copy_header(struct nlmsghdr *hdr, struct nl_writer *nw) { return (nlmsg_add(nw, hdr->nlmsg_pid, hdr->nlmsg_seq, hdr->nlmsg_type, hdr->nlmsg_flags, 0)); } static void * _nlmsg_copy_next_header(struct nlmsghdr *hdr, struct nl_writer *nw, int sz) { void *next_hdr = nlmsg_reserve_data(nw, sz, void); memcpy(next_hdr, hdr + 1, NLMSG_ALIGN(sz)); return (next_hdr); } #define nlmsg_copy_next_header(_hdr, _ns, _t) \ ((_t *)(_nlmsg_copy_next_header(_hdr, _ns, sizeof(_t)))) static bool nlmsg_copy_nla(const struct nlattr *nla_orig, struct nl_writer *nw) { struct nlattr *nla = nlmsg_reserve_data(nw, nla_orig->nla_len, struct nlattr); if (nla != NULL) { memcpy(nla, nla_orig, nla_orig->nla_len); return (true); } return (false); } /* * Translate a FreeBSD interface name to a Linux interface name. */ static bool nlmsg_translate_ifname_nla(struct nlattr *nla, struct nl_writer *nw) { char ifname[LINUX_IFNAMSIZ]; if (ifname_bsd_to_linux_name((char *)(nla + 1), ifname, sizeof(ifname)) <= 0) return (false); return (nlattr_add_string(nw, IFLA_IFNAME, ifname)); } #define LINUX_NLA_UNHANDLED -1 /* * Translate a FreeBSD attribute to a Linux attribute. * Returns LINUX_NLA_UNHANDLED when the attribute is not processed * and the caller must take care of it, otherwise the result is returned. */ static int nlmsg_translate_all_nla(struct nlmsghdr *hdr, struct nlattr *nla, struct nl_writer *nw) { switch (hdr->nlmsg_type) { case NL_RTM_NEWLINK: case NL_RTM_DELLINK: case NL_RTM_GETLINK: switch (nla->nla_type) { case IFLA_IFNAME: return (nlmsg_translate_ifname_nla(nla, nw)); default: break; } default: break; } return (LINUX_NLA_UNHANDLED); } static bool nlmsg_copy_all_nla(struct nlmsghdr *hdr, int raw_hdrlen, struct nl_writer *nw) { struct nlattr *nla; int ret; int hdrlen = NETLINK_ALIGN(raw_hdrlen); int attrs_len = hdr->nlmsg_len - sizeof(struct nlmsghdr) - hdrlen; struct nlattr *nla_head = (struct nlattr *)((char *)(hdr + 1) + hdrlen); NLA_FOREACH(nla, nla_head, attrs_len) { RT_LOG(LOG_DEBUG3, "reading attr %d len %d", nla->nla_type, nla->nla_len); if (nla->nla_len < sizeof(struct nlattr)) { return (false); } ret = nlmsg_translate_all_nla(hdr, nla, nw); if (ret == LINUX_NLA_UNHANDLED) ret = nlmsg_copy_nla(nla, nw); if (!ret) return (false); } return (true); } #undef LINUX_NLA_UNHANDLED static unsigned int rtnl_if_flags_to_linux(unsigned int if_flags) { unsigned int result = 0; for (int i = 0; i < 31; i++) { unsigned int flag = 1 << i; if (!(flag & if_flags)) continue; switch (flag) { case IFF_UP: case IFF_BROADCAST: case IFF_DEBUG: case IFF_LOOPBACK: case IFF_POINTOPOINT: case IFF_DRV_RUNNING: case IFF_NOARP: case IFF_PROMISC: case IFF_ALLMULTI: result |= flag; break; case IFF_NEEDSEPOCH: case IFF_DRV_OACTIVE: case IFF_SIMPLEX: case IFF_LINK0: case IFF_LINK1: case IFF_LINK2: case IFF_CANTCONFIG: case IFF_PPROMISC: case IFF_MONITOR: case IFF_STATICARP: case IFF_STICKYARP: case IFF_DYING: case IFF_RENAMING: case IFF_NOGROUP: /* No Linux analogue */ break; case IFF_MULTICAST: result |= 1 << 12; } } return (result); } static bool rtnl_newlink_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_writer *nw) { if (!nlmsg_copy_header(hdr, nw)) return (false); struct ifinfomsg *ifinfo; ifinfo = nlmsg_copy_next_header(hdr, nw, struct ifinfomsg); ifinfo->ifi_family = bsd_to_linux_domain(ifinfo->ifi_family); /* Convert interface type */ switch (ifinfo->ifi_type) { case IFT_ETHER: ifinfo->ifi_type = LINUX_ARPHRD_ETHER; break; } ifinfo->ifi_flags = rtnl_if_flags_to_linux(ifinfo->ifi_flags); /* Copy attributes unchanged */ if (!nlmsg_copy_all_nla(hdr, sizeof(struct ifinfomsg), nw)) return (false); /* make ip(8) happy */ if (!nlattr_add_string(nw, IFLA_QDISC, "noqueue")) return (false); if (!nlattr_add_u32(nw, IFLA_TXQLEN, 1000)) return (false); nlmsg_end(nw); RT_LOG(LOG_DEBUG2, "done processing nw %p", nw); return (true); } static bool rtnl_newaddr_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_writer *nw) { if (!nlmsg_copy_header(hdr, nw)) return (false); struct ifaddrmsg *ifamsg; ifamsg = nlmsg_copy_next_header(hdr, nw, struct ifaddrmsg); ifamsg->ifa_family = bsd_to_linux_domain(ifamsg->ifa_family); /* XXX: fake ifa_flags? */ /* Copy attributes unchanged */ if (!nlmsg_copy_all_nla(hdr, sizeof(struct ifaddrmsg), nw)) return (false); nlmsg_end(nw); RT_LOG(LOG_DEBUG2, "done processing nw %p", nw); return (true); } static bool rtnl_newneigh_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_writer *nw) { if (!nlmsg_copy_header(hdr, nw)) return (false); struct ndmsg *ndm; ndm = nlmsg_copy_next_header(hdr, nw, struct ndmsg); ndm->ndm_family = bsd_to_linux_domain(ndm->ndm_family); /* Copy attributes unchanged */ if (!nlmsg_copy_all_nla(hdr, sizeof(struct ndmsg), nw)) return (false); nlmsg_end(nw); RT_LOG(LOG_DEBUG2, "done processing nw %p", nw); return (true); } static bool rtnl_newroute_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_writer *nw) { if (!nlmsg_copy_header(hdr, nw)) return (false); struct rtmsg *rtm; rtm = nlmsg_copy_next_header(hdr, nw, struct rtmsg); rtm->rtm_family = bsd_to_linux_domain(rtm->rtm_family); struct nlattr *nla; int hdrlen = NETLINK_ALIGN(sizeof(struct rtmsg)); int attrs_len = hdr->nlmsg_len - sizeof(struct nlmsghdr) - hdrlen; struct nlattr *nla_head = (struct nlattr *)((char *)(hdr + 1) + hdrlen); NLA_FOREACH(nla, nla_head, attrs_len) { struct rtattr *rta = (struct rtattr *)nla; //RT_LOG(LOG_DEBUG, "READING attr %d len %d", nla->nla_type, nla->nla_len); if (rta->rta_len < sizeof(struct rtattr)) { break; } switch (rta->rta_type) { case NL_RTA_TABLE: { uint32_t fibnum; fibnum = _rta_get_uint32(rta); if (fibnum == 0) fibnum = 254; RT_LOG(LOG_DEBUG3, "XFIBNUM %u", fibnum); if (!nlattr_add_u32(nw, NL_RTA_TABLE, fibnum)) return (false); } break; default: if (!nlmsg_copy_nla(nla, nw)) return (false); break; } } nlmsg_end(nw); RT_LOG(LOG_DEBUG2, "done processing nw %p", nw); return (true); } static bool rtnl_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_writer *nw) { RT_LOG(LOG_DEBUG2, "Got message type %d", hdr->nlmsg_type); switch (hdr->nlmsg_type) { case NL_RTM_NEWLINK: case NL_RTM_DELLINK: case NL_RTM_GETLINK: return (rtnl_newlink_to_linux(hdr, nlp, nw)); case NL_RTM_NEWADDR: case NL_RTM_DELADDR: return (rtnl_newaddr_to_linux(hdr, nlp, nw)); case NL_RTM_NEWROUTE: case NL_RTM_DELROUTE: return (rtnl_newroute_to_linux(hdr, nlp, nw)); case NL_RTM_NEWNEIGH: case NL_RTM_DELNEIGH: case NL_RTM_GETNEIGH: return (rtnl_newneigh_to_linux(hdr, nlp, nw)); default: RT_LOG(LOG_DEBUG, "[WARN] Passing message type %d untranslated", hdr->nlmsg_type); return (handle_default_out(hdr, nw)); } } static bool nlmsg_error_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_writer *nw) { if (!nlmsg_copy_header(hdr, nw)) return (false); struct nlmsgerr *nlerr; nlerr = nlmsg_copy_next_header(hdr, nw, struct nlmsgerr); nlerr->error = bsd_to_linux_errno(nlerr->error); int copied_len = sizeof(struct nlmsghdr) + sizeof(struct nlmsgerr); if (hdr->nlmsg_len == copied_len) { nlmsg_end(nw); return (true); } /* * CAP_ACK was not set. Original request needs to be translated. * XXX: implement translation of the original message */ RT_LOG(LOG_DEBUG, "[WARN] Passing ack message type %d untranslated", nlerr->msg.nlmsg_type); char *dst_payload, *src_payload; int copy_len = hdr->nlmsg_len - copied_len; dst_payload = nlmsg_reserve_data(nw, NLMSG_ALIGN(copy_len), char); src_payload = (char *)hdr + copied_len; memcpy(dst_payload, src_payload, copy_len); nlmsg_end(nw); return (true); } static bool nlmsg_to_linux(int netlink_family, struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_writer *nw) { if (hdr->nlmsg_type < NLMSG_MIN_TYPE) { switch (hdr->nlmsg_type) { case NLMSG_ERROR: return (nlmsg_error_to_linux(hdr, nlp, nw)); case NLMSG_NOOP: case NLMSG_DONE: case NLMSG_OVERRUN: return (handle_default_out(hdr, nw)); default: RT_LOG(LOG_DEBUG, "[WARN] Passing message type %d untranslated", hdr->nlmsg_type); return (handle_default_out(hdr, nw)); } } switch (netlink_family) { case NETLINK_ROUTE: return (rtnl_to_linux(hdr, nlp, nw)); default: return (handle_default_out(hdr, nw)); } } static struct mbuf * nlmsgs_to_linux(int netlink_family, char *buf, int data_length, struct nlpcb *nlp) { RT_LOG(LOG_DEBUG3, "LINUX: get %p size %d", buf, data_length); struct nl_writer nw = {}; struct mbuf *m = NULL; if (!nlmsg_get_chain_writer(&nw, data_length, &m)) { RT_LOG(LOG_DEBUG, "unable to setup chain writer for size %d", data_length); return (NULL); } /* Assume correct headers. Buffer IS mutable */ int count = 0; for (int offset = 0; offset + sizeof(struct nlmsghdr) <= data_length;) { struct nlmsghdr *hdr = (struct nlmsghdr *)&buf[offset]; int msglen = NLMSG_ALIGN(hdr->nlmsg_len); count++; if (!nlmsg_to_linux(netlink_family, hdr, nlp, &nw)) { RT_LOG(LOG_DEBUG, "failed to process msg type %d", hdr->nlmsg_type); m_freem(m); return (NULL); } offset += msglen; } nlmsg_flush(&nw); RT_LOG(LOG_DEBUG3, "Processed %d messages, chain size %d", count, m ? m_length(m, NULL) : 0); return (m); } static struct mbuf * mbufs_to_linux(int netlink_family, struct mbuf *m, struct nlpcb *nlp) { /* XXX: easiest solution, not optimized for performance */ int data_length = m_length(m, NULL); char *buf = malloc(data_length, M_LINUX, M_NOWAIT); if (buf == NULL) { RT_LOG(LOG_DEBUG, "unable to allocate %d bytes, dropping message", data_length); m_freem(m); return (NULL); } m_copydata(m, 0, data_length, buf); m_freem(m); m = nlmsgs_to_linux(netlink_family, buf, data_length, nlp); free(buf, M_LINUX); return (m); } static struct linux_netlink_provider linux_netlink_v1 = { .mbufs_to_linux = mbufs_to_linux, .msgs_to_linux = nlmsgs_to_linux, .msg_from_linux = nlmsg_from_linux, }; void linux_netlink_register(void) { linux_netlink_p = &linux_netlink_v1; } void linux_netlink_deregister(void) { linux_netlink_p = NULL; } diff --git a/sys/netlink/netlink_domain.c b/sys/netlink/netlink_domain.c index 7538336bdc45..9138e3417cf6 100644 --- a/sys/netlink/netlink_domain.c +++ b/sys/netlink/netlink_domain.c @@ -1,809 +1,809 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2021 Ng Peng Nam Sean * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This file contains socket and protocol bindings for netlink. */ #include "opt_netlink.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* priv_check */ #include #include #include #define DEBUG_MOD_NAME nl_domain #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include -_DECLARE_DEBUG(LOG_DEBUG); +_DECLARE_DEBUG(LOG_INFO); _Static_assert((NLP_MAX_GROUPS % 64) == 0, "NLP_MAX_GROUPS has to be multiple of 64"); _Static_assert(NLP_MAX_GROUPS >= 64, "NLP_MAX_GROUPS has to be at least 64"); #define NLCTL_TRACKER struct rm_priotracker nl_tracker #define NLCTL_RLOCK(_ctl) rm_rlock(&((_ctl)->ctl_lock), &nl_tracker) #define NLCTL_RUNLOCK(_ctl) rm_runlock(&((_ctl)->ctl_lock), &nl_tracker) #define NLCTL_WLOCK(_ctl) rm_wlock(&((_ctl)->ctl_lock)) #define NLCTL_WUNLOCK(_ctl) rm_wunlock(&((_ctl)->ctl_lock)) static u_long nl_sendspace = NLSNDQ; SYSCTL_ULONG(_net_netlink, OID_AUTO, sendspace, CTLFLAG_RW, &nl_sendspace, 0, "Default netlink socket send space"); static u_long nl_recvspace = NLSNDQ; SYSCTL_ULONG(_net_netlink, OID_AUTO, recvspace, CTLFLAG_RW, &nl_recvspace, 0, "Default netlink socket receive space"); extern u_long sb_max_adj; static u_long nl_maxsockbuf = 512 * 1024 * 1024; /* 512M, XXX: init based on physmem */ static int sysctl_handle_nl_maxsockbuf(SYSCTL_HANDLER_ARGS); SYSCTL_OID(_net_netlink, OID_AUTO, nl_maxsockbuf, CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, &nl_maxsockbuf, 0, sysctl_handle_nl_maxsockbuf, "LU", "Maximum Netlink socket buffer size"); static unsigned int osd_slot_id = 0; void nl_osd_register(void) { osd_slot_id = osd_register(OSD_THREAD, NULL, NULL); } void nl_osd_unregister(void) { osd_deregister(OSD_THREAD, osd_slot_id); } struct nlpcb * _nl_get_thread_nlp(struct thread *td) { return (osd_get(OSD_THREAD, &td->td_osd, osd_slot_id)); } void nl_set_thread_nlp(struct thread *td, struct nlpcb *nlp) { NLP_LOG(LOG_DEBUG2, nlp, "Set thread %p nlp to %p (slot %u)", td, nlp, osd_slot_id); if (osd_set(OSD_THREAD, &td->td_osd, osd_slot_id, nlp) == 0) return; /* Failed, need to realloc */ void **rsv = osd_reserve(osd_slot_id); osd_set_reserved(OSD_THREAD, &td->td_osd, osd_slot_id, rsv, nlp); } /* * Looks up a nlpcb struct based on the @portid. Need to claim nlsock_mtx. * Returns nlpcb pointer if present else NULL */ static struct nlpcb * nl_port_lookup(uint32_t port_id) { struct nlpcb *nlp; CK_LIST_FOREACH(nlp, &V_nl_ctl->ctl_port_head, nl_port_next) { if (nlp->nl_port == port_id) return (nlp); } return (NULL); } static void nl_add_group_locked(struct nlpcb *nlp, unsigned int group_id) { MPASS(group_id <= NLP_MAX_GROUPS); --group_id; /* TODO: add family handler callback */ if (!nlp_unconstrained_vnet(nlp)) return; nlp->nl_groups[group_id / 64] |= (uint64_t)1 << (group_id % 64); } static void nl_del_group_locked(struct nlpcb *nlp, unsigned int group_id) { MPASS(group_id <= NLP_MAX_GROUPS); --group_id; nlp->nl_groups[group_id / 64] &= ~((uint64_t)1 << (group_id % 64)); } static bool nl_isset_group_locked(struct nlpcb *nlp, unsigned int group_id) { MPASS(group_id <= NLP_MAX_GROUPS); --group_id; return (nlp->nl_groups[group_id / 64] & ((uint64_t)1 << (group_id % 64))); } static uint32_t nl_get_groups_compat(struct nlpcb *nlp) { uint32_t groups_mask = 0; for (int i = 0; i < 32; i++) { if (nl_isset_group_locked(nlp, i + 1)) groups_mask |= (1 << i); } return (groups_mask); } static void nl_send_one_group(struct mbuf *m, struct nlpcb *nlp, int num_messages, int io_flags) { if (__predict_false(nlp->nl_flags & NLF_MSG_INFO)) nl_add_msg_info(m); nl_send_one(m, nlp, num_messages, io_flags); } /* * Broadcasts message @m to the protocol @proto group specified by @group_id */ void nl_send_group(struct mbuf *m, int num_messages, int proto, int group_id) { struct nlpcb *nlp_last = NULL; struct nlpcb *nlp; NLCTL_TRACKER; IF_DEBUG_LEVEL(LOG_DEBUG2) { struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *); NL_LOG(LOG_DEBUG2, "MCAST mbuf len %u msg type %d len %u to group %d/%d", m->m_len, hdr->nlmsg_type, hdr->nlmsg_len, proto, group_id); } struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); if (__predict_false(ctl == NULL)) { /* * Can be the case when notification is sent within VNET * which doesn't have any netlink sockets. */ m_freem(m); return; } NLCTL_RLOCK(ctl); int io_flags = NL_IOF_UNTRANSLATED; CK_LIST_FOREACH(nlp, &ctl->ctl_pcb_head, nl_next) { if (nl_isset_group_locked(nlp, group_id) && nlp->nl_proto == proto) { if (nlp_last != NULL) { struct mbuf *m_copy; m_copy = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (m_copy != NULL) nl_send_one_group(m_copy, nlp_last, num_messages, io_flags); else { NLP_LOCK(nlp_last); if (nlp_last->nl_socket != NULL) sorwakeup(nlp_last->nl_socket); NLP_UNLOCK(nlp_last); } } nlp_last = nlp; } } if (nlp_last != NULL) nl_send_one_group(m, nlp_last, num_messages, io_flags); else m_freem(m); NLCTL_RUNLOCK(ctl); } bool nl_has_listeners(int netlink_family, uint32_t groups_mask) { return (V_nl_ctl != NULL); } static uint32_t nl_find_port(void) { /* * app can open multiple netlink sockets. * Start with current pid, if already taken, * try random numbers in 65k..256k+65k space, * avoiding clash with pids. */ if (nl_port_lookup(curproc->p_pid) == NULL) return (curproc->p_pid); for (int i = 0; i < 16; i++) { uint32_t nl_port = (arc4random() % 65536) + 65536 * 4; if (nl_port_lookup(nl_port) == 0) return (nl_port); NL_LOG(LOG_DEBUG3, "tried %u\n", nl_port); } return (curproc->p_pid); } static int nl_bind_locked(struct nlpcb *nlp, struct sockaddr_nl *snl) { if (nlp->nl_bound) { if (nlp->nl_port != snl->nl_pid) { NL_LOG(LOG_DEBUG, "bind() failed: program pid %d " "is different from provided pid %d", nlp->nl_port, snl->nl_pid); return (EINVAL); // XXX: better error } } else { if (snl->nl_pid == 0) snl->nl_pid = nl_find_port(); if (nl_port_lookup(snl->nl_pid) != NULL) return (EADDRINUSE); nlp->nl_port = snl->nl_pid; nlp->nl_bound = true; CK_LIST_INSERT_HEAD(&V_nl_ctl->ctl_port_head, nlp, nl_port_next); } for (int i = 0; i < 32; i++) { if (snl->nl_groups & ((uint32_t)1 << i)) nl_add_group_locked(nlp, i + 1); else nl_del_group_locked(nlp, i + 1); } return (0); } static int nl_pru_attach(struct socket *so, int proto, struct thread *td) { struct nlpcb *nlp; int error; if (__predict_false(netlink_unloading != 0)) return (EAFNOSUPPORT); error = nl_verify_proto(proto); if (error != 0) return (error); bool is_linux = SV_PROC_ABI(td->td_proc) == SV_ABI_LINUX; NL_LOG(LOG_DEBUG2, "socket %p, %sPID %d: attaching socket to %s", so, is_linux ? "(linux) " : "", curproc->p_pid, nl_get_proto_name(proto)); /* Create per-VNET state on first socket init */ struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); if (ctl == NULL) ctl = vnet_nl_ctl_init(); KASSERT(V_nl_ctl != NULL, ("nl_attach: vnet_sock_init() failed")); MPASS(sotonlpcb(so) == NULL); nlp = malloc(sizeof(struct nlpcb), M_PCB, M_WAITOK | M_ZERO); error = soreserve(so, nl_sendspace, nl_recvspace); if (error != 0) { free(nlp, M_PCB); return (error); } so->so_pcb = nlp; nlp->nl_socket = so; /* Copy so_cred to avoid having socket_var.h in every header */ nlp->nl_cred = so->so_cred; nlp->nl_proto = proto; nlp->nl_process_id = curproc->p_pid; nlp->nl_linux = is_linux; nlp->nl_active = true; nlp->nl_unconstrained_vnet = !jailed_without_vnet(so->so_cred); nlp->nl_need_thread_setup = true; NLP_LOCK_INIT(nlp); refcount_init(&nlp->nl_refcount, 1); nl_init_io(nlp); nlp->nl_taskqueue = taskqueue_create("netlink_socket", M_WAITOK, taskqueue_thread_enqueue, &nlp->nl_taskqueue); TASK_INIT(&nlp->nl_task, 0, nl_taskqueue_handler, nlp); taskqueue_start_threads(&nlp->nl_taskqueue, 1, PWAIT, "netlink_socket (PID %u)", nlp->nl_process_id); NLCTL_WLOCK(ctl); /* XXX: check ctl is still alive */ CK_LIST_INSERT_HEAD(&ctl->ctl_pcb_head, nlp, nl_next); NLCTL_WUNLOCK(ctl); soisconnected(so); return (0); } static void nl_pru_abort(struct socket *so) { NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); MPASS(sotonlpcb(so) != NULL); soisdisconnected(so); } static int nl_pru_bind(struct socket *so, struct sockaddr *sa, struct thread *td) { struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); struct nlpcb *nlp = sotonlpcb(so); struct sockaddr_nl *snl = (struct sockaddr_nl *)sa; int error; NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); if (snl->nl_len != sizeof(*snl)) { NL_LOG(LOG_DEBUG, "socket %p, wrong sizeof(), ignoring bind()", so); return (EINVAL); } NLCTL_WLOCK(ctl); NLP_LOCK(nlp); error = nl_bind_locked(nlp, snl); NLP_UNLOCK(nlp); NLCTL_WUNLOCK(ctl); NL_LOG(LOG_DEBUG2, "socket %p, bind() to %u, groups %u, error %d", so, snl->nl_pid, snl->nl_groups, error); return (error); } static int nl_assign_port(struct nlpcb *nlp, uint32_t port_id) { struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); struct sockaddr_nl snl = { .nl_pid = port_id, }; int error; NLCTL_WLOCK(ctl); NLP_LOCK(nlp); snl.nl_groups = nl_get_groups_compat(nlp); error = nl_bind_locked(nlp, &snl); NLP_UNLOCK(nlp); NLCTL_WUNLOCK(ctl); NL_LOG(LOG_DEBUG3, "socket %p, port assign: %d, error: %d", nlp->nl_socket, port_id, error); return (error); } /* * nl_autobind_port binds a unused portid to @nlp * @nlp: pcb data for the netlink socket * @candidate_id: first id to consider */ static int nl_autobind_port(struct nlpcb *nlp, uint32_t candidate_id) { struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); uint32_t port_id = candidate_id; NLCTL_TRACKER; bool exist; int error = EADDRINUSE; for (int i = 0; i < 10; i++) { NL_LOG(LOG_DEBUG3, "socket %p, trying to assign port %d", nlp->nl_socket, port_id); NLCTL_RLOCK(ctl); exist = nl_port_lookup(port_id) != 0; NLCTL_RUNLOCK(ctl); if (!exist) { error = nl_assign_port(nlp, port_id); if (error != EADDRINUSE) break; } port_id++; } NL_LOG(LOG_DEBUG3, "socket %p, autobind to %d, error: %d", nlp->nl_socket, port_id, error); return (error); } static int nl_pru_connect(struct socket *so, struct sockaddr *sa, struct thread *td) { struct sockaddr_nl *snl = (struct sockaddr_nl *)sa; struct nlpcb *nlp; NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); if (snl->nl_len != sizeof(*snl)) { NL_LOG(LOG_DEBUG, "socket %p, wrong sizeof(), ignoring bind()", so); return (EINVAL); } nlp = sotonlpcb(so); if (!nlp->nl_bound) { int error = nl_autobind_port(nlp, td->td_proc->p_pid); if (error != 0) { NL_LOG(LOG_DEBUG, "socket %p, nl_autobind() failed: %d", so, error); return (error); } } /* XXX: Handle socket flags & multicast */ soisconnected(so); NL_LOG(LOG_DEBUG2, "socket %p, connect to %u", so, snl->nl_pid); return (0); } static void destroy_nlpcb(struct nlpcb *nlp) { NLP_LOCK(nlp); nl_free_io(nlp); NLP_LOCK_DESTROY(nlp); free(nlp, M_PCB); } static void destroy_nlpcb_epoch(epoch_context_t ctx) { struct nlpcb *nlp; nlp = __containerof(ctx, struct nlpcb, nl_epoch_ctx); destroy_nlpcb(nlp); } static void nl_pru_detach(struct socket *so) { struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); MPASS(sotonlpcb(so) != NULL); struct nlpcb *nlp; NL_LOG(LOG_DEBUG2, "detaching socket %p, PID %d", so, curproc->p_pid); nlp = sotonlpcb(so); /* Mark as inactive so no new work can be enqueued */ NLP_LOCK(nlp); bool was_bound = nlp->nl_bound; nlp->nl_active = false; NLP_UNLOCK(nlp); /* Wait till all scheduled work has been completed */ taskqueue_drain_all(nlp->nl_taskqueue); taskqueue_free(nlp->nl_taskqueue); NLCTL_WLOCK(ctl); NLP_LOCK(nlp); if (was_bound) { CK_LIST_REMOVE(nlp, nl_port_next); NL_LOG(LOG_DEBUG3, "socket %p, unlinking bound pid %u", so, nlp->nl_port); } CK_LIST_REMOVE(nlp, nl_next); nlp->nl_socket = NULL; NLP_UNLOCK(nlp); NLCTL_WUNLOCK(ctl); so->so_pcb = NULL; NL_LOG(LOG_DEBUG3, "socket %p, detached", so); /* XXX: is delayed free needed? */ NET_EPOCH_CALL(destroy_nlpcb_epoch, &nlp->nl_epoch_ctx); } static int nl_pru_disconnect(struct socket *so) { NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); MPASS(sotonlpcb(so) != NULL); return (ENOTCONN); } static int nl_pru_peeraddr(struct socket *so, struct sockaddr **sa) { NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); MPASS(sotonlpcb(so) != NULL); return (ENOTCONN); } static int nl_pru_shutdown(struct socket *so) { NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); MPASS(sotonlpcb(so) != NULL); socantsendmore(so); return (0); } static int nl_pru_sockaddr(struct socket *so, struct sockaddr **sa) { struct sockaddr_nl *snl; snl = malloc(sizeof(struct sockaddr_nl), M_SONAME, M_WAITOK | M_ZERO); /* TODO: set other fields */ snl->nl_len = sizeof(struct sockaddr_nl); snl->nl_family = AF_NETLINK; snl->nl_pid = sotonlpcb(so)->nl_port; *sa = (struct sockaddr *)snl; return (0); } static void nl_pru_close(struct socket *so) { NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); MPASS(sotonlpcb(so) != NULL); soisdisconnected(so); } static int nl_pru_output(struct mbuf *m, struct socket *so, ...) { if (__predict_false(m == NULL || ((m->m_len < sizeof(struct nlmsghdr)) && (m = m_pullup(m, sizeof(struct nlmsghdr))) == NULL))) return (ENOBUFS); MPASS((m->m_flags & M_PKTHDR) != 0); NL_LOG(LOG_DEBUG3, "sending message to kernel async processing"); nl_receive_async(m, so); return (0); } static int nl_pru_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *sa, struct mbuf *control, struct thread *td) { NL_LOG(LOG_DEBUG2, "sending message to kernel"); if (__predict_false(control != NULL)) { if (control->m_len) { m_freem(control); return (EINVAL); } m_freem(control); } return (nl_pru_output(m, so)); } static int nl_pru_rcvd(struct socket *so, int flags) { NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); MPASS(sotonlpcb(so) != NULL); nl_on_transmit(sotonlpcb(so)); return (0); } static int nl_getoptflag(int sopt_name) { switch (sopt_name) { case NETLINK_CAP_ACK: return (NLF_CAP_ACK); case NETLINK_EXT_ACK: return (NLF_EXT_ACK); case NETLINK_GET_STRICT_CHK: return (NLF_STRICT); case NETLINK_MSG_INFO: return (NLF_MSG_INFO); } return (0); } static int nl_ctloutput(struct socket *so, struct sockopt *sopt) { struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); struct nlpcb *nlp = sotonlpcb(so); uint32_t flag; int optval, error = 0; NLCTL_TRACKER; NL_LOG(LOG_DEBUG2, "%ssockopt(%p, %d)", (sopt->sopt_dir) ? "set" : "get", so, sopt->sopt_name); switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { case NETLINK_ADD_MEMBERSHIP: case NETLINK_DROP_MEMBERSHIP: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) break; if (optval <= 0 || optval >= NLP_MAX_GROUPS) { error = ERANGE; break; } NL_LOG(LOG_DEBUG2, "ADD/DEL group %d", (uint32_t)optval); NLCTL_WLOCK(ctl); if (sopt->sopt_name == NETLINK_ADD_MEMBERSHIP) nl_add_group_locked(nlp, optval); else nl_del_group_locked(nlp, optval); NLCTL_WUNLOCK(ctl); break; case NETLINK_CAP_ACK: case NETLINK_EXT_ACK: case NETLINK_GET_STRICT_CHK: case NETLINK_MSG_INFO: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) break; flag = nl_getoptflag(sopt->sopt_name); if ((flag == NLF_MSG_INFO) && nlp->nl_linux) { error = EINVAL; break; } NLCTL_WLOCK(ctl); if (optval != 0) nlp->nl_flags |= flag; else nlp->nl_flags &= ~flag; NLCTL_WUNLOCK(ctl); break; default: error = ENOPROTOOPT; } break; case SOPT_GET: switch (sopt->sopt_name) { case NETLINK_LIST_MEMBERSHIPS: NLCTL_RLOCK(ctl); optval = nl_get_groups_compat(nlp); NLCTL_RUNLOCK(ctl); error = sooptcopyout(sopt, &optval, sizeof(optval)); break; case NETLINK_CAP_ACK: case NETLINK_EXT_ACK: case NETLINK_GET_STRICT_CHK: case NETLINK_MSG_INFO: NLCTL_RLOCK(ctl); optval = (nlp->nl_flags & nl_getoptflag(sopt->sopt_name)) != 0; NLCTL_RUNLOCK(ctl); error = sooptcopyout(sopt, &optval, sizeof(optval)); break; default: error = ENOPROTOOPT; } break; default: error = ENOPROTOOPT; } return (error); } static int sysctl_handle_nl_maxsockbuf(SYSCTL_HANDLER_ARGS) { int error = 0; u_long tmp_maxsockbuf = nl_maxsockbuf; error = sysctl_handle_long(oidp, &tmp_maxsockbuf, arg2, req); if (error || !req->newptr) return (error); if (tmp_maxsockbuf < MSIZE + MCLBYTES) return (EINVAL); nl_maxsockbuf = tmp_maxsockbuf; return (0); } static int nl_setsbopt(struct socket *so, struct sockopt *sopt) { int error, optval; bool result; if (sopt->sopt_name != SO_RCVBUF) return (sbsetopt(so, sopt)); /* Allow to override max buffer size in certain conditions */ error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error != 0) return (error); NL_LOG(LOG_DEBUG2, "socket %p, PID %d, SO_RCVBUF=%d", so, curproc->p_pid, optval); if (optval > sb_max_adj) { if (priv_check(curthread, PRIV_NET_ROUTE) != 0) return (EPERM); } SOCK_RECVBUF_LOCK(so); result = sbreserve_locked_limit(so, SO_RCV, optval, nl_maxsockbuf, curthread); SOCK_RECVBUF_UNLOCK(so); return (result ? 0 : ENOBUFS); } #define NETLINK_PROTOSW \ .pr_flags = PR_ATOMIC | PR_ADDR | PR_WANTRCVD, \ .pr_ctloutput = nl_ctloutput, \ .pr_setsbopt = nl_setsbopt, \ .pr_abort = nl_pru_abort, \ .pr_attach = nl_pru_attach, \ .pr_bind = nl_pru_bind, \ .pr_connect = nl_pru_connect, \ .pr_detach = nl_pru_detach, \ .pr_disconnect = nl_pru_disconnect, \ .pr_peeraddr = nl_pru_peeraddr, \ .pr_send = nl_pru_send, \ .pr_rcvd = nl_pru_rcvd, \ .pr_shutdown = nl_pru_shutdown, \ .pr_sockaddr = nl_pru_sockaddr, \ .pr_close = nl_pru_close static struct protosw netlink_raw_sw = { .pr_type = SOCK_RAW, NETLINK_PROTOSW }; static struct protosw netlink_dgram_sw = { .pr_type = SOCK_DGRAM, NETLINK_PROTOSW }; static struct domain netlinkdomain = { .dom_family = PF_NETLINK, .dom_name = "netlink", .dom_flags = DOMF_UNLOADABLE, .dom_nprotosw = 2, .dom_protosw = { &netlink_raw_sw, &netlink_dgram_sw }, }; DOMAIN_SET(netlink); diff --git a/sys/netlink/netlink_generic.c b/sys/netlink/netlink_generic.c index 4595b3074d50..32009e74e6a0 100644 --- a/sys/netlink/netlink_generic.c +++ b/sys/netlink/netlink_generic.c @@ -1,304 +1,304 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_netlink.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define DEBUG_MOD_NAME nl_generic #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include -_DECLARE_DEBUG(LOG_DEBUG); +_DECLARE_DEBUG(LOG_INFO); static int dump_family(struct nlmsghdr *hdr, struct genlmsghdr *ghdr, const struct genl_family *gf, struct nl_writer *nw); /* * Handler called by netlink subsystem when matching netlink message is received */ static int genl_handle_message(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct nlpcb *nlp = npt->nlp; struct genl_family *gf = NULL; int error = 0; int family_id = (int)hdr->nlmsg_type - GENL_MIN_ID; if (__predict_false(family_id < 0 || (gf = genl_get_family(family_id)) == NULL)) { NLP_LOG(LOG_DEBUG, nlp, "invalid message type: %d", hdr->nlmsg_type); return (ENOTSUP); } if (__predict_false(hdr->nlmsg_len < sizeof(hdr) + GENL_HDRLEN)) { NLP_LOG(LOG_DEBUG, nlp, "invalid message size: %d", hdr->nlmsg_len); return (EINVAL); } struct genlmsghdr *ghdr = (struct genlmsghdr *)(hdr + 1); if (ghdr->cmd >= gf->family_cmd_size || gf->family_cmds[ghdr->cmd].cmd_cb == NULL) { NLP_LOG(LOG_DEBUG, nlp, "family %s: invalid cmd %d", gf->family_name, ghdr->cmd); return (ENOTSUP); } struct genl_cmd *cmd = &gf->family_cmds[ghdr->cmd]; if (cmd->cmd_priv != 0 && !nlp_has_priv(nlp, cmd->cmd_priv)) { NLP_LOG(LOG_DEBUG, nlp, "family %s: cmd %d priv_check() failed", gf->family_name, ghdr->cmd); return (EPERM); } NLP_LOG(LOG_DEBUG2, nlp, "received family %s cmd %s(%d) len %d", gf->family_name, cmd->cmd_name, ghdr->cmd, hdr->nlmsg_len); error = cmd->cmd_cb(hdr, npt); return (error); } static uint32_t get_cmd_flags(const struct genl_cmd *cmd) { uint32_t flags = cmd->cmd_flags; if (cmd->cmd_priv != 0) flags |= GENL_ADMIN_PERM; return (flags); } static int dump_family(struct nlmsghdr *hdr, struct genlmsghdr *ghdr, const struct genl_family *gf, struct nl_writer *nw) { if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr))) goto enomem; struct genlmsghdr *ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr); ghdr_new->cmd = ghdr->cmd; ghdr_new->version = gf->family_version; ghdr_new->reserved = 0; nlattr_add_string(nw, CTRL_ATTR_FAMILY_NAME, gf->family_name); nlattr_add_u16(nw, CTRL_ATTR_FAMILY_ID, gf->family_id); nlattr_add_u32(nw, CTRL_ATTR_VERSION, gf->family_version); nlattr_add_u32(nw, CTRL_ATTR_HDRSIZE, gf->family_hdrsize); nlattr_add_u32(nw, CTRL_ATTR_MAXATTR, gf->family_attr_max); if (gf->family_cmd_size > 0) { int off = nlattr_add_nested(nw, CTRL_ATTR_OPS); if (off == 0) goto enomem; for (int i = 0, cnt=0; i < gf->family_cmd_size; i++) { struct genl_cmd *cmd = &gf->family_cmds[i]; if (cmd->cmd_cb == NULL) continue; int cmd_off = nlattr_add_nested(nw, ++cnt); if (cmd_off == 0) goto enomem; nlattr_add_u32(nw, CTRL_ATTR_OP_ID, cmd->cmd_num); nlattr_add_u32(nw, CTRL_ATTR_OP_FLAGS, get_cmd_flags(cmd)); nlattr_set_len(nw, cmd_off); } nlattr_set_len(nw, off); } if (gf->family_num_groups > 0) { int off = nlattr_add_nested(nw, CTRL_ATTR_MCAST_GROUPS); if (off == 0) goto enomem; for (int i = 0, cnt = 0; i < MAX_GROUPS; i++) { struct genl_group *gg = genl_get_group(i); if (gg == NULL || gg->group_family != gf) continue; int cmd_off = nlattr_add_nested(nw, ++cnt); if (cmd_off == 0) goto enomem; nlattr_add_u32(nw, CTRL_ATTR_MCAST_GRP_ID, i + MIN_GROUP_NUM); nlattr_add_string(nw, CTRL_ATTR_MCAST_GRP_NAME, gg->group_name); nlattr_set_len(nw, cmd_off); } nlattr_set_len(nw, off); } if (nlmsg_end(nw)) return (0); enomem: NL_LOG(LOG_DEBUG, "unable to dump family %s state (ENOMEM)", gf->family_name); nlmsg_abort(nw); return (ENOMEM); } /* Declare ourself as a user */ static void nlctrl_notify(void *arg, const struct genl_family *gf, int action); static eventhandler_tag family_event_tag; static uint32_t ctrl_family_id; static uint32_t ctrl_group_id; struct nl_parsed_family { uint32_t family_id; char *family_name; uint8_t version; }; #define _IN(_field) offsetof(struct genlmsghdr, _field) #define _OUT(_field) offsetof(struct nl_parsed_family, _field) static const struct nlfield_parser nlf_p_generic[] = { { .off_in = _IN(version), .off_out = _OUT(version), .cb = nlf_get_u8 }, }; static struct nlattr_parser nla_p_generic[] = { { .type = CTRL_ATTR_FAMILY_ID , .off = _OUT(family_id), .cb = nlattr_get_uint16 }, { .type = CTRL_ATTR_FAMILY_NAME , .off = _OUT(family_name), .cb = nlattr_get_string }, }; #undef _IN #undef _OUT NL_DECLARE_PARSER(genl_parser, struct genlmsghdr, nlf_p_generic, nla_p_generic); static bool match_family(const struct genl_family *gf, const struct nl_parsed_family *attrs) { if (gf->family_name == NULL) return (false); if (attrs->family_id != 0 && attrs->family_id != gf->family_id) return (false); if (attrs->family_name != NULL && strcmp(attrs->family_name, gf->family_name)) return (false); return (true); } static int nlctrl_handle_getfamily(struct nlmsghdr *hdr, struct nl_pstate *npt) { int error = 0; struct nl_parsed_family attrs = {}; error = nl_parse_nlmsg(hdr, &genl_parser, npt, &attrs); if (error != 0) return (error); struct genlmsghdr ghdr = { .cmd = CTRL_CMD_NEWFAMILY, }; if (attrs.family_id != 0 || attrs.family_name != NULL) { /* Resolve request */ for (int i = 0; i < MAX_FAMILIES; i++) { struct genl_family *gf = genl_get_family(i); if (gf != NULL && match_family(gf, &attrs)) { error = dump_family(hdr, &ghdr, gf, npt->nw); return (error); } } return (ENOENT); } hdr->nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI; for (int i = 0; i < MAX_FAMILIES; i++) { struct genl_family *gf = genl_get_family(i); if (gf != NULL && match_family(gf, &attrs)) { error = dump_family(hdr, &ghdr, gf, npt->nw); if (error != 0) break; } } if (!nlmsg_end_dump(npt->nw, error, hdr)) { NL_LOG(LOG_DEBUG, "Unable to finalize the dump"); return (ENOMEM); } return (error); } static void nlctrl_notify(void *arg __unused, const struct genl_family *gf, int cmd) { struct nlmsghdr hdr = {.nlmsg_type = NETLINK_GENERIC }; struct genlmsghdr ghdr = { .cmd = cmd }; struct nl_writer nw = {}; if (nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_GENERIC, ctrl_group_id)) { dump_family(&hdr, &ghdr, gf, &nw); nlmsg_flush(&nw); return; } NL_LOG(LOG_DEBUG, "error allocating group writer"); } static const struct genl_cmd nlctrl_cmds[] = { { .cmd_num = CTRL_CMD_GETFAMILY, .cmd_name = "GETFAMILY", .cmd_cb = nlctrl_handle_getfamily, .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP | GENL_CMD_CAP_HASPOL, }, }; static const struct nlhdr_parser *all_parsers[] = { &genl_parser }; static void genl_load_all(void *u __unused) { NL_VERIFY_PARSERS(all_parsers); ctrl_family_id = genl_register_family(CTRL_FAMILY_NAME, 0, 2, CTRL_ATTR_MAX); genl_register_cmds(CTRL_FAMILY_NAME, nlctrl_cmds, NL_ARRAY_LEN(nlctrl_cmds)); ctrl_group_id = genl_register_group(CTRL_FAMILY_NAME, "notify"); family_event_tag = EVENTHANDLER_REGISTER(genl_family_event, nlctrl_notify, NULL, EVENTHANDLER_PRI_ANY); netlink_register_proto(NETLINK_GENERIC, "NETLINK_GENERIC", genl_handle_message); } SYSINIT(genl_load_all, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, genl_load_all, NULL); static void genl_unload(void *u __unused) { EVENTHANDLER_DEREGISTER(genl_family_event, family_event_tag); genl_unregister_family(CTRL_FAMILY_NAME); NET_EPOCH_WAIT(); } SYSUNINIT(genl_unload, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, genl_unload, NULL); diff --git a/sys/netlink/netlink_generic_kpi.c b/sys/netlink/netlink_generic_kpi.c index b64f6bd3f1b6..d64c53b1b3d6 100644 --- a/sys/netlink/netlink_generic_kpi.c +++ b/sys/netlink/netlink_generic_kpi.c @@ -1,279 +1,279 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #define DEBUG_MOD_NAME nl_generic_kpi #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include -_DECLARE_DEBUG(LOG_DEBUG3); +_DECLARE_DEBUG(LOG_INFO); /* * NETLINK_GENERIC families/groups registration logic */ #define GENL_LOCK() sx_xlock(&sx_lock) #define GENL_UNLOCK() sx_xunlock(&sx_lock) static struct sx sx_lock; SX_SYSINIT(genl_lock, &sx_lock, "genetlink lock"); static struct genl_family families[MAX_FAMILIES]; static struct genl_group groups[MAX_GROUPS]; static struct genl_family * find_family(const char *family_name) { for (int i = 0; i < MAX_FAMILIES; i++) { struct genl_family *gf = &families[i]; if (gf->family_name != NULL && !strcmp(gf->family_name, family_name)) return (gf); } return (NULL); } static struct genl_family * find_empty_family_id(const char *family_name) { struct genl_family *gf = NULL; if (!strcmp(family_name, CTRL_FAMILY_NAME)) { gf = &families[0]; gf->family_id = GENL_MIN_ID; } else { /* Index 0 is reserved for the control family */ for (int i = 1; i < MAX_FAMILIES; i++) { gf = &families[i]; if (gf->family_name == NULL) { gf->family_id = GENL_MIN_ID + i; break; } } } return (gf); } uint32_t genl_register_family(const char *family_name, size_t hdrsize, int family_version, int max_attr_idx) { uint32_t family_id = 0; MPASS(family_name != NULL); if (find_family(family_name) != NULL) return (0); GENL_LOCK(); struct genl_family *gf = find_empty_family_id(family_name); MPASS(gf != NULL); gf->family_name = family_name; gf->family_version = family_version; gf->family_hdrsize = hdrsize; gf->family_attr_max = max_attr_idx; NL_LOG(LOG_DEBUG2, "Registered family %s id %d", gf->family_name, gf->family_id); family_id = gf->family_id; EVENTHANDLER_INVOKE(genl_family_event, gf, CTRL_CMD_NEWFAMILY); GENL_UNLOCK(); return (family_id); } static void free_family(struct genl_family *gf) { if (gf->family_cmds != NULL) free(gf->family_cmds, M_NETLINK); } /* * unregister groups of a given family */ static void unregister_groups(const struct genl_family *gf) { for (int i = 0; i < MAX_GROUPS; i++) { struct genl_group *gg = &groups[i]; if (gg->group_family == gf && gg->group_name != NULL) { gg->group_family = NULL; gg->group_name = NULL; } } } /* * Can sleep, I guess */ bool genl_unregister_family(const char *family_name) { bool found = false; GENL_LOCK(); struct genl_family *gf = find_family(family_name); if (gf != NULL) { EVENTHANDLER_INVOKE(genl_family_event, gf, CTRL_CMD_DELFAMILY); found = true; unregister_groups(gf); /* TODO: zero pointer first */ free_family(gf); bzero(gf, sizeof(*gf)); } GENL_UNLOCK(); return (found); } bool genl_register_cmds(const char *family_name, const struct genl_cmd *cmds, int count) { GENL_LOCK(); struct genl_family *gf = find_family(family_name); if (gf == NULL) { GENL_UNLOCK(); return (false); } int cmd_size = gf->family_cmd_size; for (int i = 0; i < count; i++) { MPASS(cmds[i].cmd_cb != NULL); if (cmds[i].cmd_num >= cmd_size) cmd_size = cmds[i].cmd_num + 1; } if (cmd_size > gf->family_cmd_size) { /* need to realloc */ size_t sz = cmd_size * sizeof(struct genl_cmd); void *data = malloc(sz, M_NETLINK, M_WAITOK | M_ZERO); memcpy(data, gf->family_cmds, gf->family_cmd_size * sizeof(struct genl_cmd)); void *old_data = gf->family_cmds; gf->family_cmds = data; gf->family_cmd_size = cmd_size; free(old_data, M_NETLINK); } for (int i = 0; i < count; i++) { const struct genl_cmd *cmd = &cmds[i]; MPASS(gf->family_cmds[cmd->cmd_num].cmd_cb == NULL); gf->family_cmds[cmd->cmd_num] = cmds[i]; NL_LOG(LOG_DEBUG2, "Adding cmd %s(%d) to family %s", cmd->cmd_name, cmd->cmd_num, gf->family_name); } GENL_UNLOCK(); return (true); } static struct genl_group * find_group(const struct genl_family *gf, const char *group_name) { for (int i = 0; i < MAX_GROUPS; i++) { struct genl_group *gg = &groups[i]; if (gg->group_family == gf && !strcmp(gg->group_name, group_name)) return (gg); } return (NULL); } uint32_t genl_register_group(const char *family_name, const char *group_name) { uint32_t group_id = 0; MPASS(family_name != NULL); MPASS(group_name != NULL); GENL_LOCK(); struct genl_family *gf = find_family(family_name); if (gf == NULL || find_group(gf, group_name) != NULL) { GENL_UNLOCK(); return (0); } for (int i = 0; i < MAX_GROUPS; i++) { struct genl_group *gg = &groups[i]; if (gg->group_family == NULL) { gf->family_num_groups++; gg->group_family = gf; gg->group_name = group_name; group_id = i + MIN_GROUP_NUM; break; } } GENL_UNLOCK(); return (group_id); } /* accessors */ struct genl_family * genl_get_family(uint32_t family_id) { return ((family_id < MAX_FAMILIES) ? &families[family_id] : NULL); } const char * genl_get_family_name(const struct genl_family *gf) { return (gf->family_name); } uint32_t genl_get_family_id(const struct genl_family *gf) { return (gf->family_id); } struct genl_group * genl_get_group(uint32_t group_id) { return ((group_id < MAX_GROUPS) ? &groups[group_id] : NULL); } diff --git a/sys/netlink/netlink_io.c b/sys/netlink/netlink_io.c index 9f5ecf63a07d..0961e6ae89d6 100644 --- a/sys/netlink/netlink_io.c +++ b/sys/netlink/netlink_io.c @@ -1,602 +1,602 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2021 Ng Peng Nam Sean * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_netlink.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #define DEBUG_MOD_NAME nl_io #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include -_DECLARE_DEBUG(LOG_DEBUG); +_DECLARE_DEBUG(LOG_INFO); /* * The logic below provide a p2p interface for receiving and * sending netlink data between the kernel and userland. */ static const struct sockaddr_nl _nl_empty_src = { .nl_len = sizeof(struct sockaddr_nl), .nl_family = PF_NETLINK, .nl_pid = 0 /* comes from the kernel */ }; static const struct sockaddr *nl_empty_src = (const struct sockaddr *)&_nl_empty_src; static struct mbuf *nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp); static void queue_push(struct nl_io_queue *q, struct mbuf *mq) { while (mq != NULL) { struct mbuf *m = mq; mq = mq->m_nextpkt; m->m_nextpkt = NULL; q->length += m_length(m, NULL); STAILQ_INSERT_TAIL(&q->head, m, m_stailqpkt); } } static void queue_push_head(struct nl_io_queue *q, struct mbuf *m) { MPASS(m->m_nextpkt == NULL); q->length += m_length(m, NULL); STAILQ_INSERT_HEAD(&q->head, m, m_stailqpkt); } static struct mbuf * queue_pop(struct nl_io_queue *q) { if (!STAILQ_EMPTY(&q->head)) { struct mbuf *m = STAILQ_FIRST(&q->head); STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt); m->m_nextpkt = NULL; q->length -= m_length(m, NULL); return (m); } return (NULL); } static struct mbuf * queue_head(const struct nl_io_queue *q) { return (STAILQ_FIRST(&q->head)); } static inline bool queue_empty(const struct nl_io_queue *q) { return (q->length == 0); } static void queue_free(struct nl_io_queue *q) { while (!STAILQ_EMPTY(&q->head)) { struct mbuf *m = STAILQ_FIRST(&q->head); STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt); m->m_nextpkt = NULL; m_freem(m); } q->length = 0; } void nl_add_msg_info(struct mbuf *m) { struct nlpcb *nlp = nl_get_thread_nlp(curthread); NL_LOG(LOG_DEBUG2, "Trying to recover nlp from thread %p: %p", curthread, nlp); if (nlp == NULL) return; /* Prepare what we want to encode - PID, socket PID & msg seq */ struct { struct nlattr nla; uint32_t val; } data[] = { { .nla.nla_len = sizeof(struct nlattr) + sizeof(uint32_t), .nla.nla_type = NLMSGINFO_ATTR_PROCESS_ID, .val = nlp->nl_process_id, }, { .nla.nla_len = sizeof(struct nlattr) + sizeof(uint32_t), .nla.nla_type = NLMSGINFO_ATTR_PORT_ID, .val = nlp->nl_port, }, }; while (m->m_next != NULL) m = m->m_next; m->m_next = sbcreatecontrol(data, sizeof(data), NETLINK_MSG_INFO, SOL_NETLINK, M_NOWAIT); NL_LOG(LOG_DEBUG2, "Storing %u bytes of data, ctl: %p", (unsigned)sizeof(data), m->m_next); } static __noinline struct mbuf * extract_msg_info(struct mbuf *m) { while (m->m_next != NULL) { if (m->m_next->m_type == MT_CONTROL) { struct mbuf *ctl = m->m_next; m->m_next = NULL; return (ctl); } m = m->m_next; } return (NULL); } static void nl_schedule_taskqueue(struct nlpcb *nlp) { if (!nlp->nl_task_pending) { nlp->nl_task_pending = true; taskqueue_enqueue(nlp->nl_taskqueue, &nlp->nl_task); NL_LOG(LOG_DEBUG3, "taskqueue scheduled"); } else { NL_LOG(LOG_DEBUG3, "taskqueue schedule skipped"); } } int nl_receive_async(struct mbuf *m, struct socket *so) { struct nlpcb *nlp = sotonlpcb(so); int error = 0; m->m_nextpkt = NULL; NLP_LOCK(nlp); if ((__predict_true(nlp->nl_active))) { sbappend(&so->so_snd, m, 0); NL_LOG(LOG_DEBUG3, "enqueue %u bytes", m_length(m, NULL)); nl_schedule_taskqueue(nlp); } else { NL_LOG(LOG_DEBUG, "ignoring %u bytes on non-active socket", m_length(m, NULL)); m_free(m); error = EINVAL; } NLP_UNLOCK(nlp); return (error); } static bool tx_check_locked(struct nlpcb *nlp) { if (queue_empty(&nlp->tx_queue)) return (true); /* * Check if something can be moved from the internal TX queue * to the socket queue. */ bool appended = false; struct sockbuf *sb = &nlp->nl_socket->so_rcv; SOCKBUF_LOCK(sb); while (true) { struct mbuf *m = queue_head(&nlp->tx_queue); if (m != NULL) { struct mbuf *ctl = NULL; if (__predict_false(m->m_next != NULL)) ctl = extract_msg_info(m); if (sbappendaddr_locked(sb, nl_empty_src, m, ctl) != 0) { /* appended successfully */ queue_pop(&nlp->tx_queue); appended = true; } else break; } else break; } SOCKBUF_UNLOCK(sb); if (appended) sorwakeup(nlp->nl_socket); return (queue_empty(&nlp->tx_queue)); } static bool nl_process_received_one(struct nlpcb *nlp) { bool reschedule = false; NLP_LOCK(nlp); nlp->nl_task_pending = false; if (!tx_check_locked(nlp)) { /* TX overflow queue still not empty, ignore RX */ NLP_UNLOCK(nlp); return (false); } if (queue_empty(&nlp->rx_queue)) { /* * Grab all data we have from the socket TX queue * and store it the internal queue, so it can be worked on * w/o holding socket lock. */ struct sockbuf *sb = &nlp->nl_socket->so_snd; SOCKBUF_LOCK(sb); unsigned int avail = sbavail(sb); if (avail > 0) { NL_LOG(LOG_DEBUG3, "grabbed %u bytes", avail); queue_push(&nlp->rx_queue, sbcut_locked(sb, avail)); } SOCKBUF_UNLOCK(sb); } else { /* Schedule another pass to read from the socket queue */ reschedule = true; } int prev_hiwat = nlp->tx_queue.hiwat; NLP_UNLOCK(nlp); while (!queue_empty(&nlp->rx_queue)) { struct mbuf *m = queue_pop(&nlp->rx_queue); m = nl_process_mbuf(m, nlp); if (m != NULL) { queue_push_head(&nlp->rx_queue, m); reschedule = false; break; } } if (nlp->tx_queue.hiwat > prev_hiwat) { NLP_LOG(LOG_DEBUG, nlp, "TX override peaked to %d", nlp->tx_queue.hiwat); } return (reschedule); } static void nl_process_received(struct nlpcb *nlp) { NL_LOG(LOG_DEBUG3, "taskqueue called"); if (__predict_false(nlp->nl_need_thread_setup)) { nl_set_thread_nlp(curthread, nlp); NLP_LOCK(nlp); nlp->nl_need_thread_setup = false; NLP_UNLOCK(nlp); } while (nl_process_received_one(nlp)) ; } void nl_init_io(struct nlpcb *nlp) { STAILQ_INIT(&nlp->rx_queue.head); STAILQ_INIT(&nlp->tx_queue.head); } void nl_free_io(struct nlpcb *nlp) { queue_free(&nlp->rx_queue); queue_free(&nlp->tx_queue); } /* * Called after some data have been read from the socket. */ void nl_on_transmit(struct nlpcb *nlp) { NLP_LOCK(nlp); struct socket *so = nlp->nl_socket; if (__predict_false(nlp->nl_dropped_bytes > 0 && so != NULL)) { unsigned long dropped_bytes = nlp->nl_dropped_bytes; unsigned long dropped_messages = nlp->nl_dropped_messages; nlp->nl_dropped_bytes = 0; nlp->nl_dropped_messages = 0; struct sockbuf *sb = &so->so_rcv; NLP_LOG(LOG_DEBUG, nlp, "socket RX overflowed, %lu messages (%lu bytes) dropped. " "bytes: [%u/%u] mbufs: [%u/%u]", dropped_messages, dropped_bytes, sb->sb_ccc, sb->sb_hiwat, sb->sb_mbcnt, sb->sb_mbmax); /* TODO: send netlink message */ } nl_schedule_taskqueue(nlp); NLP_UNLOCK(nlp); } void nl_taskqueue_handler(void *_arg, int pending) { struct nlpcb *nlp = (struct nlpcb *)_arg; CURVNET_SET(nlp->nl_socket->so_vnet); nl_process_received(nlp); CURVNET_RESTORE(); } static __noinline void queue_push_tx(struct nlpcb *nlp, struct mbuf *m) { queue_push(&nlp->tx_queue, m); nlp->nl_tx_blocked = true; if (nlp->tx_queue.length > nlp->tx_queue.hiwat) nlp->tx_queue.hiwat = nlp->tx_queue.length; } /* * Tries to send @m to the socket @nlp. * * @m: mbuf(s) to send to. Consumed in any case. * @nlp: socket to send to * @cnt: number of messages in @m * @io_flags: combination of NL_IOF_* flags * * Returns true on success. * If no queue overrunes happened, wakes up socket owner. */ bool nl_send_one(struct mbuf *m, struct nlpcb *nlp, int num_messages, int io_flags) { bool untranslated = io_flags & NL_IOF_UNTRANSLATED; bool ignore_limits = io_flags & NL_IOF_IGNORE_LIMIT; bool result = true; IF_DEBUG_LEVEL(LOG_DEBUG2) { struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *); NLP_LOG(LOG_DEBUG2, nlp, "TX mbuf len %u msgs %u msg type %d first hdrlen %u io_flags %X", m_length(m, NULL), num_messages, hdr->nlmsg_type, hdr->nlmsg_len, io_flags); } if (__predict_false(nlp->nl_linux && linux_netlink_p != NULL && untranslated)) { m = linux_netlink_p->mbufs_to_linux(nlp->nl_proto, m, nlp); if (m == NULL) return (false); } NLP_LOCK(nlp); if (__predict_false(nlp->nl_socket == NULL)) { NLP_UNLOCK(nlp); m_freem(m); return (false); } if (!queue_empty(&nlp->tx_queue)) { if (ignore_limits) { queue_push_tx(nlp, m); } else { m_free(m); result = false; } NLP_UNLOCK(nlp); return (result); } struct socket *so = nlp->nl_socket; struct mbuf *ctl = NULL; if (__predict_false(m->m_next != NULL)) ctl = extract_msg_info(m); if (sbappendaddr(&so->so_rcv, nl_empty_src, m, ctl) != 0) { sorwakeup(so); NLP_LOG(LOG_DEBUG3, nlp, "appended data & woken up"); } else { if (ignore_limits) { queue_push_tx(nlp, m); } else { /* * Store dropped data so it can be reported * on the next read */ nlp->nl_dropped_bytes += m_length(m, NULL); nlp->nl_dropped_messages += num_messages; NLP_LOG(LOG_DEBUG2, nlp, "RX oveflow: %lu m (+%d), %lu b (+%d)", (unsigned long)nlp->nl_dropped_messages, num_messages, (unsigned long)nlp->nl_dropped_bytes, m_length(m, NULL)); soroverflow(so); m_freem(m); result = false; } } NLP_UNLOCK(nlp); return (result); } static int nl_receive_message(struct nlmsghdr *hdr, int remaining_length, struct nlpcb *nlp, struct nl_pstate *npt) { nl_handler_f handler = nl_handlers[nlp->nl_proto].cb; int error = 0; NLP_LOG(LOG_DEBUG2, nlp, "msg len: %u type: %d: flags: 0x%X seq: %u pid: %u", hdr->nlmsg_len, hdr->nlmsg_type, hdr->nlmsg_flags, hdr->nlmsg_seq, hdr->nlmsg_pid); if (__predict_false(hdr->nlmsg_len > remaining_length)) { NLP_LOG(LOG_DEBUG, nlp, "message is not entirely present: want %d got %d", hdr->nlmsg_len, remaining_length); return (EINVAL); } else if (__predict_false(hdr->nlmsg_len < sizeof(*hdr))) { NL_LOG(LOG_DEBUG, "message too short: %d", hdr->nlmsg_len); return (EINVAL); } /* Stamp each message with sender pid */ hdr->nlmsg_pid = nlp->nl_port; npt->hdr = hdr; if (hdr->nlmsg_flags & NLM_F_REQUEST && hdr->nlmsg_type >= NLMSG_MIN_TYPE) { NL_LOG(LOG_DEBUG2, "handling message with msg type: %d", hdr->nlmsg_type); if (nlp->nl_linux && linux_netlink_p != NULL) { struct nlmsghdr *hdr_orig = hdr; hdr = linux_netlink_p->msg_from_linux(nlp->nl_proto, hdr, npt); if (hdr == NULL) { /* Failed to translate to kernel format. Report an error back */ hdr = hdr_orig; npt->hdr = hdr; if (hdr->nlmsg_flags & NLM_F_ACK) nlmsg_ack(nlp, EOPNOTSUPP, hdr, npt); return (0); } } error = handler(hdr, npt); NL_LOG(LOG_DEBUG2, "retcode: %d", error); } if ((hdr->nlmsg_flags & NLM_F_ACK) || (error != 0 && error != EINTR)) { if (!npt->nw->suppress_ack) { NL_LOG(LOG_DEBUG3, "ack"); nlmsg_ack(nlp, error, hdr, npt); } } return (0); } static void npt_clear(struct nl_pstate *npt) { lb_clear(&npt->lb); npt->error = 0; npt->err_msg = NULL; npt->err_off = 0; npt->hdr = NULL; npt->nw->suppress_ack = false; } /* * Processes an incoming packet, which can contain multiple netlink messages */ static struct mbuf * nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp) { int offset, buffer_length; struct nlmsghdr *hdr; char *buffer; int error; NL_LOG(LOG_DEBUG3, "RX netlink mbuf %p on %p", m, nlp->nl_socket); struct nl_writer nw = {}; if (!nlmsg_get_unicast_writer(&nw, NLMSG_SMALL, nlp)) { m_freem(m); NL_LOG(LOG_DEBUG, "error allocating socket writer"); return (NULL); } nlmsg_ignore_limit(&nw); /* TODO: alloc this buf once for nlp */ int data_length = m_length(m, NULL); buffer_length = roundup2(data_length, 8) + SCRATCH_BUFFER_SIZE; if (nlp->nl_linux) buffer_length += roundup2(data_length, 8); buffer = malloc(buffer_length, M_NETLINK, M_NOWAIT | M_ZERO); if (buffer == NULL) { m_freem(m); nlmsg_flush(&nw); NL_LOG(LOG_DEBUG, "Unable to allocate %d bytes of memory", buffer_length); return (NULL); } m_copydata(m, 0, data_length, buffer); struct nl_pstate npt = { .nlp = nlp, .lb.base = &buffer[roundup2(data_length, 8)], .lb.size = buffer_length - roundup2(data_length, 8), .nw = &nw, .strict = nlp->nl_flags & NLF_STRICT, }; for (offset = 0; offset + sizeof(struct nlmsghdr) <= data_length;) { hdr = (struct nlmsghdr *)&buffer[offset]; /* Save length prior to calling handler */ int msglen = NLMSG_ALIGN(hdr->nlmsg_len); NL_LOG(LOG_DEBUG3, "parsing offset %d/%d", offset, data_length); npt_clear(&npt); error = nl_receive_message(hdr, data_length - offset, nlp, &npt); offset += msglen; if (__predict_false(error != 0 || nlp->nl_tx_blocked)) break; } NL_LOG(LOG_DEBUG3, "packet parsing done"); free(buffer, M_NETLINK); nlmsg_flush(&nw); if (nlp->nl_tx_blocked) { NLP_LOCK(nlp); nlp->nl_tx_blocked = false; NLP_UNLOCK(nlp); m_adj(m, offset); return (m); } else { m_freem(m); return (NULL); } } diff --git a/sys/netlink/netlink_message_parser.c b/sys/netlink/netlink_message_parser.c index c6cd82260e5b..a1551d3f3a91 100644 --- a/sys/netlink/netlink_message_parser.c +++ b/sys/netlink/netlink_message_parser.c @@ -1,544 +1,544 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define DEBUG_MOD_NAME nl_parser #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include -_DECLARE_DEBUG(LOG_DEBUG); +_DECLARE_DEBUG(LOG_INFO); bool nlmsg_report_err_msg(struct nl_pstate *npt, const char *fmt, ...) { va_list ap; if (npt->err_msg != NULL) return (false); char *buf = npt_alloc(npt, NL_MAX_ERROR_BUF); if (buf == NULL) return (false); va_start(ap, fmt); vsnprintf(buf, NL_MAX_ERROR_BUF, fmt, ap); va_end(ap); npt->err_msg = buf; return (true); } bool nlmsg_report_err_offset(struct nl_pstate *npt, uint32_t off) { if (npt->err_off != 0) return (false); npt->err_off = off; return (true); } void nlmsg_report_cookie(struct nl_pstate *npt, struct nlattr *nla) { MPASS(nla->nla_type == NLMSGERR_ATTR_COOKIE); MPASS(nla->nla_len >= sizeof(struct nlattr)); npt->cookie = nla; } void nlmsg_report_cookie_u32(struct nl_pstate *npt, uint32_t val) { struct nlattr *nla = npt_alloc(npt, sizeof(*nla) + sizeof(uint32_t)); nla->nla_type = NLMSGERR_ATTR_COOKIE; nla->nla_len = sizeof(*nla) + sizeof(uint32_t); memcpy(nla + 1, &val, sizeof(uint32_t)); nlmsg_report_cookie(npt, nla); } static const struct nlattr_parser * search_states(const struct nlattr_parser *ps, int pslen, int key) { int left_i = 0, right_i = pslen - 1; if (key < ps[0].type || key > ps[pslen - 1].type) return (NULL); while (left_i + 1 < right_i) { int mid_i = (left_i + right_i) / 2; if (key < ps[mid_i].type) right_i = mid_i; else if (key > ps[mid_i].type) left_i = mid_i + 1; else return (&ps[mid_i]); } if (ps[left_i].type == key) return (&ps[left_i]); else if (ps[right_i].type == key) return (&ps[right_i]); return (NULL); } int nl_parse_attrs_raw(struct nlattr *nla_head, int len, const struct nlattr_parser *ps, int pslen, struct nl_pstate *npt, void *target) { struct nlattr *nla = NULL; int error = 0; NL_LOG(LOG_DEBUG3, "parse %p remaining_len %d", nla_head, len); int orig_len = len; NLA_FOREACH(nla, nla_head, len) { NL_LOG(LOG_DEBUG3, ">> parsing %p attr_type %d len %d (rem %d)", nla, nla->nla_type, nla->nla_len, len); if (nla->nla_len < sizeof(struct nlattr)) { NLMSG_REPORT_ERR_MSG(npt, "Invalid attr %p type %d len: %d", nla, nla->nla_type, nla->nla_len); uint32_t off = (char *)nla - (char *)npt->hdr; nlmsg_report_err_offset(npt, off); return (EINVAL); } int nla_type = nla->nla_type & NLA_TYPE_MASK; const struct nlattr_parser *s = search_states(ps, pslen, nla_type); if (s != NULL) { void *ptr = (void *)((char *)target + s->off); error = s->cb(nla, npt, s->arg, ptr); if (error != 0) { uint32_t off = (char *)nla - (char *)npt->hdr; nlmsg_report_err_offset(npt, off); NL_LOG(LOG_DEBUG3, "parse failed att offset %u", off); return (error); } } else { /* Ignore non-specified attributes */ NL_LOG(LOG_DEBUG3, "ignoring attr %d", nla->nla_type); } } if (len >= sizeof(struct nlattr)) { nla = (struct nlattr *)((char *)nla_head + (orig_len - len)); NL_LOG(LOG_DEBUG3, " >>> end %p attr_type %d len %d", nla, nla->nla_type, nla->nla_len); } NL_LOG(LOG_DEBUG3, "end parse: %p remaining_len %d", nla, len); return (0); } void nl_get_attrs_bmask_raw(struct nlattr *nla_head, int len, struct nlattr_bmask *bm) { struct nlattr *nla = NULL; BIT_ZERO(NL_ATTR_BMASK_SIZE, bm); NLA_FOREACH(nla, nla_head, len) { if (nla->nla_len < sizeof(struct nlattr)) return; int nla_type = nla->nla_type & NLA_TYPE_MASK; if (nla_type < NL_ATTR_BMASK_SIZE) BIT_SET(NL_ATTR_BMASK_SIZE, nla_type, bm); else NL_LOG(LOG_DEBUG2, "Skipping type %d in the mask: too short", nla_type); } } bool nl_has_attr(const struct nlattr_bmask *bm, unsigned int nla_type) { MPASS(nla_type < NL_ATTR_BMASK_SIZE); return (BIT_ISSET(NL_ATTR_BMASK_SIZE, nla_type, bm)); } int nlattr_get_flag(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) { if (__predict_false(NLA_DATA_LEN(nla) != 0)) { NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not a flag", nla->nla_type, NLA_DATA_LEN(nla)); return (EINVAL); } *((uint8_t *)target) = 1; return (0); } static struct sockaddr * parse_rta_ip4(void *rta_data, struct nl_pstate *npt, int *perror) { struct sockaddr_in *sin; sin = (struct sockaddr_in *)npt_alloc_sockaddr(npt, sizeof(struct sockaddr_in)); if (__predict_false(sin == NULL)) { *perror = ENOBUFS; return (NULL); } sin->sin_len = sizeof(struct sockaddr_in); sin->sin_family = AF_INET; memcpy(&sin->sin_addr, rta_data, sizeof(struct in_addr)); return ((struct sockaddr *)sin); } static struct sockaddr * parse_rta_ip6(void *rta_data, struct nl_pstate *npt, int *perror) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)npt_alloc_sockaddr(npt, sizeof(struct sockaddr_in6)); if (__predict_false(sin6 == NULL)) { *perror = ENOBUFS; return (NULL); } sin6->sin6_len = sizeof(struct sockaddr_in6); sin6->sin6_family = AF_INET6; memcpy(&sin6->sin6_addr, rta_data, sizeof(struct in6_addr)); return ((struct sockaddr *)sin6); } static struct sockaddr * parse_rta_ip(struct rtattr *rta, struct nl_pstate *npt, int *perror) { void *rta_data = NL_RTA_DATA(rta); int rta_len = NL_RTA_DATA_LEN(rta); if (rta_len == sizeof(struct in_addr)) { return (parse_rta_ip4(rta_data, npt, perror)); } else if (rta_len == sizeof(struct in6_addr)) { return (parse_rta_ip6(rta_data, npt, perror)); } else { NLMSG_REPORT_ERR_MSG(npt, "unknown IP len: %d for rta type %d", rta_len, rta->rta_type); *perror = ENOTSUP; return (NULL); } return (NULL); } int nlattr_get_ip(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) { int error = 0; struct sockaddr *sa = parse_rta_ip((struct rtattr *)nla, npt, &error); *((struct sockaddr **)target) = sa; return (error); } static struct sockaddr * parse_rta_via(struct rtattr *rta, struct nl_pstate *npt, int *perror) { struct rtvia *via = NL_RTA_DATA(rta); int data_len = NL_RTA_DATA_LEN(rta); if (__predict_false(data_len) < sizeof(struct rtvia)) { NLMSG_REPORT_ERR_MSG(npt, "undersized RTA_VIA(%d) attr: len %d", rta->rta_type, data_len); *perror = EINVAL; return (NULL); } data_len -= offsetof(struct rtvia, rtvia_addr); switch (via->rtvia_family) { case AF_INET: if (__predict_false(data_len < sizeof(struct in_addr))) { *perror = EINVAL; return (NULL); } return (parse_rta_ip4(via->rtvia_addr, npt, perror)); case AF_INET6: if (__predict_false(data_len < sizeof(struct in6_addr))) { *perror = EINVAL; return (NULL); } return (parse_rta_ip6(via->rtvia_addr, npt, perror)); default: *perror = ENOTSUP; return (NULL); } } int nlattr_get_ipvia(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) { int error = 0; struct sockaddr *sa = parse_rta_via((struct rtattr *)nla, npt, &error); *((struct sockaddr **)target) = sa; return (error); } int nlattr_get_uint8(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) { if (__predict_false(NLA_DATA_LEN(nla) != sizeof(uint8_t))) { NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not uint8", nla->nla_type, NLA_DATA_LEN(nla)); return (EINVAL); } *((uint16_t *)target) = *((const uint16_t *)NL_RTA_DATA_CONST(nla)); return (0); } int nlattr_get_uint16(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) { if (__predict_false(NLA_DATA_LEN(nla) != sizeof(uint16_t))) { NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not uint16", nla->nla_type, NLA_DATA_LEN(nla)); return (EINVAL); } *((uint16_t *)target) = *((const uint16_t *)NL_RTA_DATA_CONST(nla)); return (0); } int nlattr_get_uint32(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) { if (__predict_false(NLA_DATA_LEN(nla) != sizeof(uint32_t))) { NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not uint32", nla->nla_type, NLA_DATA_LEN(nla)); return (EINVAL); } *((uint32_t *)target) = *((const uint32_t *)NL_RTA_DATA_CONST(nla)); return (0); } int nlattr_get_uint64(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) { if (__predict_false(NLA_DATA_LEN(nla) != sizeof(uint64_t))) { NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not uint64", nla->nla_type, NLA_DATA_LEN(nla)); return (EINVAL); } memcpy(target, NL_RTA_DATA_CONST(nla), sizeof(uint64_t)); return (0); } int nlattr_get_in_addr(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) { if (__predict_false(NLA_DATA_LEN(nla) != sizeof(in_addr_t))) { NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not in_addr_t", nla->nla_type, NLA_DATA_LEN(nla)); return (EINVAL); } memcpy(target, NLA_DATA_CONST(nla), sizeof(in_addr_t)); return (0); } int nlattr_get_in6_addr(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) { if (__predict_false(NLA_DATA_LEN(nla) != sizeof(struct in6_addr))) { NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not struct in6_addr", nla->nla_type, NLA_DATA_LEN(nla)); return (EINVAL); } memcpy(target, NLA_DATA_CONST(nla), sizeof(struct in6_addr)); return (0); } static int nlattr_get_ifp_internal(struct nlattr *nla, struct nl_pstate *npt, void *target, bool zero_ok) { if (__predict_false(NLA_DATA_LEN(nla) != sizeof(uint32_t))) { NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not uint32", nla->nla_type, NLA_DATA_LEN(nla)); return (EINVAL); } uint32_t ifindex = *((const uint32_t *)NLA_DATA_CONST(nla)); if (ifindex == 0 && zero_ok) { *((struct ifnet **)target) = NULL; return (0); } NET_EPOCH_ASSERT(); struct ifnet *ifp = ifnet_byindex(ifindex); if (__predict_false(ifp == NULL)) { NLMSG_REPORT_ERR_MSG(npt, "nla type %d: ifindex %u invalid", nla->nla_type, ifindex); return (ENOENT); } *((struct ifnet **)target) = ifp; NL_LOG(LOG_DEBUG3, "nla type %d: ifindex %u -> %s", nla->nla_type, ifindex, if_name(ifp)); return (0); } int nlattr_get_ifp(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) { return (nlattr_get_ifp_internal(nla, npt, target, false)); } int nlattr_get_ifpz(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) { return (nlattr_get_ifp_internal(nla, npt, target, true)); } int nlattr_get_string(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) { int maxlen = NLA_DATA_LEN(nla); if (__predict_false(strnlen((char *)NLA_DATA(nla), maxlen) >= maxlen)) { NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not NULL-terminated", nla->nla_type, maxlen); return (EINVAL); } *((char **)target) = (char *)NLA_DATA(nla); return (0); } int nlattr_get_stringn(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) { int maxlen = NLA_DATA_LEN(nla); char *buf = npt_alloc(npt, maxlen + 1); if (buf == NULL) return (ENOMEM); buf[maxlen] = '\0'; memcpy(buf, NLA_DATA(nla), maxlen); *((char **)target) = buf; return (0); } int nlattr_get_nla(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) { NL_LOG(LOG_DEBUG3, "STORING %p len %d", nla, nla->nla_len); *((struct nlattr **)target) = nla; return (0); } int nlattr_get_nested(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) { const struct nlhdr_parser *p = (const struct nlhdr_parser *)arg; int error; /* Assumes target points to the beginning of the structure */ error = nl_parse_header(NLA_DATA(nla), NLA_DATA_LEN(nla), p, npt, target); return (error); } int nlf_get_ifp(void *src, struct nl_pstate *npt, void *target) { int ifindex = *((const int *)src); NET_EPOCH_ASSERT(); struct ifnet *ifp = ifnet_byindex(ifindex); if (ifp == NULL) { NL_LOG(LOG_DEBUG, "ifindex %u invalid", ifindex); return (ENOENT); } *((struct ifnet **)target) = ifp; return (0); } int nlf_get_ifpz(void *src, struct nl_pstate *npt, void *target) { int ifindex = *((const int *)src); NET_EPOCH_ASSERT(); struct ifnet *ifp = ifnet_byindex(ifindex); if (ifindex != 0 && ifp == NULL) { NL_LOG(LOG_DEBUG, "ifindex %u invalid", ifindex); return (ENOENT); } *((struct ifnet **)target) = ifp; return (0); } int nlf_get_u8(void *src, struct nl_pstate *npt, void *target) { uint8_t val = *((const uint8_t *)src); *((uint8_t *)target) = val; return (0); } int nlf_get_u8_u32(void *src, struct nl_pstate *npt, void *target) { *((uint32_t *)target) = *((const uint8_t *)src); return (0); } int nlf_get_u16(void *src, struct nl_pstate *npt, void *target) { *((uint16_t *)target) = *((const uint16_t *)src); return (0); } int nlf_get_u32(void *src, struct nl_pstate *npt, void *target) { *((uint32_t *)target) = *((const uint32_t *)src); return (0); } diff --git a/sys/netlink/netlink_message_writer.c b/sys/netlink/netlink_message_writer.c index ef568ecbca07..dbbbec69d9fd 100644 --- a/sys/netlink/netlink_message_writer.c +++ b/sys/netlink/netlink_message_writer.c @@ -1,693 +1,693 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_netlink.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #define DEBUG_MOD_NAME nl_writer #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include -_DECLARE_DEBUG(LOG_DEBUG); +_DECLARE_DEBUG(LOG_INFO); /* * The goal of this file is to provide convenient message writing KPI on top of * different storage methods (mbufs, uio, temporary memory chunks). * * The main KPI guarantee is the the (last) message always resides in the contiguous * memory buffer, so one is able to update the header after writing the entire message. * * This guarantee comes with a side effect of potentially reallocating underlying * buffer, so one needs to update the desired pointers after something is added * to the header. * * Messaging layer contains hooks performing transparent Linux translation for the messages. * * There are 3 types of supported targets: * * socket (adds mbufs to the socket buffer, used for message replies) * * group (sends mbuf/chain to the specified groups, used for the notifications) * * chain (returns mbuf chain, used in Linux message translation code) * * There are 3 types of storage: * * NS_WRITER_TYPE_MBUF (mbuf-based, most efficient, used when a single message * fits in MCLBYTES) * * NS_WRITER_TYPE_BUF (fallback, malloc-based, used when a single message needs * to be larger than one supported by NS_WRITER_TYPE_MBUF) * * NS_WRITER_TYPE_LBUF (malloc-based, similar to NS_WRITER_TYPE_BUF, used for * Linux sockets, calls translation hook prior to sending messages to the socket). * * Internally, KPI switches between different types of storage when memory requirements * change. It happens transparently to the caller. */ typedef bool nlwriter_op_init(struct nl_writer *nw, int size, bool waitok); typedef bool nlwriter_op_write(struct nl_writer *nw, void *buf, int buflen, int cnt); struct nlwriter_ops { nlwriter_op_init *init; nlwriter_op_write *write_socket; nlwriter_op_write *write_group; nlwriter_op_write *write_chain; }; /* * NS_WRITER_TYPE_BUF * Writes message to a temporary memory buffer, * flushing to the socket/group when buffer size limit is reached */ static bool nlmsg_get_ns_buf(struct nl_writer *nw, int size, bool waitok) { int mflag = waitok ? M_WAITOK : M_NOWAIT; nw->_storage = malloc(size, M_NETLINK, mflag | M_ZERO); if (__predict_false(nw->_storage == NULL)) return (false); nw->alloc_len = size; nw->offset = 0; nw->hdr = NULL; nw->data = nw->_storage; nw->writer_type = NS_WRITER_TYPE_BUF; nw->malloc_flag = mflag; nw->num_messages = 0; nw->enomem = false; return (true); } static bool nlmsg_write_socket_buf(struct nl_writer *nw, void *buf, int datalen, int cnt) { NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr); if (__predict_false(datalen == 0)) { free(buf, M_NETLINK); return (true); } struct mbuf *m = m_getm2(NULL, datalen, nw->malloc_flag, MT_DATA, M_PKTHDR); if (__predict_false(m == NULL)) { /* XXX: should we set sorcverr? */ free(buf, M_NETLINK); return (false); } m_append(m, datalen, buf); free(buf, M_NETLINK); int io_flags = (nw->ignore_limit) ? NL_IOF_IGNORE_LIMIT : 0; return (nl_send_one(m, (struct nlpcb *)(nw->arg.ptr), cnt, io_flags)); } static bool nlmsg_write_group_buf(struct nl_writer *nw, void *buf, int datalen, int cnt) { NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d proto: %d id: %d", buf, datalen, nw->arg.group.proto, nw->arg.group.id); if (__predict_false(datalen == 0)) { free(buf, M_NETLINK); return (true); } struct mbuf *m = m_getm2(NULL, datalen, nw->malloc_flag, MT_DATA, M_PKTHDR); if (__predict_false(m == NULL)) { free(buf, M_NETLINK); return (false); } bool success = m_append(m, datalen, buf) != 0; free(buf, M_NETLINK); if (!success) return (false); nl_send_group(m, cnt, nw->arg.group.proto, nw->arg.group.id); return (true); } static bool nlmsg_write_chain_buf(struct nl_writer *nw, void *buf, int datalen, int cnt) { struct mbuf **m0 = (struct mbuf **)(nw->arg.ptr); NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr); if (__predict_false(datalen == 0)) { free(buf, M_NETLINK); return (true); } if (*m0 == NULL) { struct mbuf *m; m = m_getm2(NULL, datalen, nw->malloc_flag, MT_DATA, M_PKTHDR); if (__predict_false(m == NULL)) { free(buf, M_NETLINK); return (false); } *m0 = m; } if (__predict_false(m_append(*m0, datalen, buf) == 0)) { free(buf, M_NETLINK); return (false); } return (true); } /* * NS_WRITER_TYPE_MBUF * Writes message to the allocated mbuf, * flushing to socket/group when mbuf size limit is reached. * This is the most efficient mechanism as it avoids double-copying. * * Allocates a single mbuf suitable to store up to @size bytes of data. * If size < MHLEN (around 160 bytes), allocates mbuf with pkghdr * If size <= MCLBYTES (2k), allocate a single mbuf cluster * Otherwise, return NULL. */ static bool nlmsg_get_ns_mbuf(struct nl_writer *nw, int size, bool waitok) { struct mbuf *m; int mflag = waitok ? M_WAITOK : M_NOWAIT; m = m_get2(size, mflag, MT_DATA, M_PKTHDR); if (__predict_false(m == NULL)) return (false); nw->alloc_len = M_TRAILINGSPACE(m); nw->offset = 0; nw->hdr = NULL; nw->_storage = (void *)m; nw->data = mtod(m, void *); nw->writer_type = NS_WRITER_TYPE_MBUF; nw->malloc_flag = mflag; nw->num_messages = 0; nw->enomem = false; memset(nw->data, 0, size); NL_LOG(LOG_DEBUG2, "alloc mbuf %p req_len %d alloc_len %d data_ptr %p", m, size, nw->alloc_len, nw->data); return (true); } static bool nlmsg_write_socket_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt) { struct mbuf *m = (struct mbuf *)buf; NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr); if (__predict_false(datalen == 0)) { m_freem(m); return (true); } m->m_pkthdr.len = datalen; m->m_len = datalen; int io_flags = (nw->ignore_limit) ? NL_IOF_IGNORE_LIMIT : 0; return (nl_send_one(m, (struct nlpcb *)(nw->arg.ptr), cnt, io_flags)); } static bool nlmsg_write_group_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt) { struct mbuf *m = (struct mbuf *)buf; NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d proto: %d id: %d", buf, datalen, nw->arg.group.proto, nw->arg.group.id); if (__predict_false(datalen == 0)) { m_freem(m); return (true); } m->m_pkthdr.len = datalen; m->m_len = datalen; nl_send_group(m, cnt, nw->arg.group.proto, nw->arg.group.id); return (true); } static bool nlmsg_write_chain_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt) { struct mbuf *m_new = (struct mbuf *)buf; struct mbuf **m0 = (struct mbuf **)(nw->arg.ptr); NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr); if (__predict_false(datalen == 0)) { m_freem(m_new); return (true); } m_new->m_pkthdr.len = datalen; m_new->m_len = datalen; if (*m0 == NULL) { *m0 = m_new; } else { struct mbuf *m_last; for (m_last = *m0; m_last->m_next != NULL; m_last = m_last->m_next) ; m_last->m_next = m_new; (*m0)->m_pkthdr.len += datalen; } return (true); } /* * NS_WRITER_TYPE_LBUF * Writes message to the allocated memory buffer, * flushing to socket/group when mbuf size limit is reached. * Calls linux handler to rewrite messages before sending to the socket. */ static bool nlmsg_get_ns_lbuf(struct nl_writer *nw, int size, bool waitok) { int mflag = waitok ? M_WAITOK : M_NOWAIT; size = roundup2(size, sizeof(void *)); int add_size = sizeof(struct linear_buffer) + SCRATCH_BUFFER_SIZE; char *buf = malloc(add_size + size * 2, M_NETLINK, mflag | M_ZERO); if (__predict_false(buf == NULL)) return (false); /* Fill buffer header first */ struct linear_buffer *lb = (struct linear_buffer *)buf; lb->base = &buf[sizeof(struct linear_buffer) + size]; lb->size = size + SCRATCH_BUFFER_SIZE; nw->alloc_len = size; nw->offset = 0; nw->hdr = NULL; nw->_storage = buf; nw->data = (char *)(lb + 1); nw->malloc_flag = mflag; nw->writer_type = NS_WRITER_TYPE_LBUF; nw->num_messages = 0; nw->enomem = false; return (true); } static bool nlmsg_write_socket_lbuf(struct nl_writer *nw, void *buf, int datalen, int cnt) { struct linear_buffer *lb = (struct linear_buffer *)buf; char *data = (char *)(lb + 1); struct nlpcb *nlp = (struct nlpcb *)(nw->arg.ptr); if (__predict_false(datalen == 0)) { free(buf, M_NETLINK); return (true); } struct mbuf *m = NULL; if (linux_netlink_p != NULL) m = linux_netlink_p->msgs_to_linux(nlp->nl_proto, data, datalen, nlp); free(buf, M_NETLINK); if (__predict_false(m == NULL)) { /* XXX: should we set sorcverr? */ return (false); } int io_flags = (nw->ignore_limit) ? NL_IOF_IGNORE_LIMIT : 0; return (nl_send_one(m, nlp, cnt, io_flags)); } /* Shouldn't be called (maybe except Linux code originating message) */ static bool nlmsg_write_group_lbuf(struct nl_writer *nw, void *buf, int datalen, int cnt) { struct linear_buffer *lb = (struct linear_buffer *)buf; char *data = (char *)(lb + 1); if (__predict_false(datalen == 0)) { free(buf, M_NETLINK); return (true); } struct mbuf *m = m_getm2(NULL, datalen, nw->malloc_flag, MT_DATA, M_PKTHDR); if (__predict_false(m == NULL)) { free(buf, M_NETLINK); return (false); } m_append(m, datalen, data); free(buf, M_NETLINK); nl_send_group(m, cnt, nw->arg.group.proto, nw->arg.group.id); return (true); } static const struct nlwriter_ops nlmsg_writers[] = { /* NS_WRITER_TYPE_MBUF */ { .init = nlmsg_get_ns_mbuf, .write_socket = nlmsg_write_socket_mbuf, .write_group = nlmsg_write_group_mbuf, .write_chain = nlmsg_write_chain_mbuf, }, /* NS_WRITER_TYPE_BUF */ { .init = nlmsg_get_ns_buf, .write_socket = nlmsg_write_socket_buf, .write_group = nlmsg_write_group_buf, .write_chain = nlmsg_write_chain_buf, }, /* NS_WRITER_TYPE_LBUF */ { .init = nlmsg_get_ns_lbuf, .write_socket = nlmsg_write_socket_lbuf, .write_group = nlmsg_write_group_lbuf, }, }; static void nlmsg_set_callback(struct nl_writer *nw) { const struct nlwriter_ops *pops = &nlmsg_writers[nw->writer_type]; switch (nw->writer_target) { case NS_WRITER_TARGET_SOCKET: nw->cb = pops->write_socket; break; case NS_WRITER_TARGET_GROUP: nw->cb = pops->write_group; break; case NS_WRITER_TARGET_CHAIN: nw->cb = pops->write_chain; break; default: panic("not implemented"); } } static bool nlmsg_get_buf_type(struct nl_writer *nw, int size, int type, bool waitok) { MPASS(type + 1 <= sizeof(nlmsg_writers) / sizeof(nlmsg_writers[0])); NL_LOG(LOG_DEBUG3, "Setting up nw %p size %d type %d", nw, size, type); return (nlmsg_writers[type].init(nw, size, waitok)); } static bool nlmsg_get_buf(struct nl_writer *nw, int size, bool waitok, bool is_linux) { int type; if (!is_linux) { if (__predict_true(size <= MCLBYTES)) type = NS_WRITER_TYPE_MBUF; else type = NS_WRITER_TYPE_BUF; } else type = NS_WRITER_TYPE_LBUF; return (nlmsg_get_buf_type(nw, size, type, waitok)); } bool _nlmsg_get_unicast_writer(struct nl_writer *nw, int size, struct nlpcb *nlp) { if (!nlmsg_get_buf(nw, size, false, nlp->nl_linux)) return (false); nw->arg.ptr = (void *)nlp; nw->writer_target = NS_WRITER_TARGET_SOCKET; nlmsg_set_callback(nw); return (true); } bool _nlmsg_get_group_writer(struct nl_writer *nw, int size, int protocol, int group_id) { if (!nlmsg_get_buf(nw, size, false, false)) return (false); nw->arg.group.proto = protocol; nw->arg.group.id = group_id; nw->writer_target = NS_WRITER_TARGET_GROUP; nlmsg_set_callback(nw); return (true); } bool _nlmsg_get_chain_writer(struct nl_writer *nw, int size, struct mbuf **pm) { if (!nlmsg_get_buf(nw, size, false, false)) return (false); *pm = NULL; nw->arg.ptr = (void *)pm; nw->writer_target = NS_WRITER_TARGET_CHAIN; nlmsg_set_callback(nw); NL_LOG(LOG_DEBUG3, "setup cb %p (need %p)", nw->cb, &nlmsg_write_chain_mbuf); return (true); } void _nlmsg_ignore_limit(struct nl_writer *nw) { nw->ignore_limit = true; } bool _nlmsg_flush(struct nl_writer *nw) { if (__predict_false(nw->hdr != NULL)) { /* Last message has not been completed, skip it. */ int completed_len = (char *)nw->hdr - nw->data; /* Send completed messages */ nw->offset -= nw->offset - completed_len; nw->hdr = NULL; } NL_LOG(LOG_DEBUG2, "OUT"); bool result = nw->cb(nw, nw->_storage, nw->offset, nw->num_messages); nw->_storage = NULL; if (!result) { NL_LOG(LOG_DEBUG, "nw %p offset %d: flush with %p() failed", nw, nw->offset, nw->cb); } return (result); } /* * Flushes previous data and allocates new underlying storage * sufficient for holding at least @required_len bytes. * Return true on success. */ bool _nlmsg_refill_buffer(struct nl_writer *nw, int required_len) { struct nl_writer ns_new = {}; int completed_len, new_len; if (nw->enomem) return (false); NL_LOG(LOG_DEBUG3, "no space at offset %d/%d (want %d), trying to reclaim", nw->offset, nw->alloc_len, required_len); /* Calculated new buffer size and allocate it s*/ completed_len = (nw->hdr != NULL) ? (char *)nw->hdr - nw->data : nw->offset; if (completed_len > 0 && required_len < MCLBYTES) { /* We already ran out of space, use the largest effective size */ new_len = max(nw->alloc_len, MCLBYTES); } else { if (nw->alloc_len < MCLBYTES) new_len = MCLBYTES; else new_len = nw->alloc_len * 2; while (new_len < required_len) new_len *= 2; } bool waitok = (nw->malloc_flag == M_WAITOK); bool is_linux = (nw->writer_type == NS_WRITER_TYPE_LBUF); if (!nlmsg_get_buf(&ns_new, new_len, waitok, is_linux)) { nw->enomem = true; NL_LOG(LOG_DEBUG, "getting new buf failed, setting ENOMEM"); return (false); } if (nw->ignore_limit) nlmsg_ignore_limit(&ns_new); /* Update callback data */ ns_new.writer_target = nw->writer_target; nlmsg_set_callback(&ns_new); ns_new.arg = nw->arg; /* Copy last (unfinished) header to the new storage */ int last_len = nw->offset - completed_len; if (last_len > 0) { memcpy(ns_new.data, nw->hdr, last_len); ns_new.hdr = (struct nlmsghdr *)ns_new.data; ns_new.offset = last_len; } NL_LOG(LOG_DEBUG2, "completed: %d bytes, copied: %d bytes", completed_len, last_len); /* Flush completed headers & switch to the new nw */ nlmsg_flush(nw); memcpy(nw, &ns_new, sizeof(struct nl_writer)); NL_LOG(LOG_DEBUG2, "switched buffer: used %d/%d bytes", nw->offset, nw->alloc_len); return (true); } bool _nlmsg_add(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type, uint16_t flags, uint32_t len) { struct nlmsghdr *hdr; MPASS(nw->hdr == NULL); int required_len = NETLINK_ALIGN(len + sizeof(struct nlmsghdr)); if (__predict_false(nw->offset + required_len > nw->alloc_len)) { if (!nlmsg_refill_buffer(nw, required_len)) return (false); } hdr = (struct nlmsghdr *)(&nw->data[nw->offset]); hdr->nlmsg_len = len; hdr->nlmsg_type = type; hdr->nlmsg_flags = flags; hdr->nlmsg_seq = seq; hdr->nlmsg_pid = portid; nw->hdr = hdr; nw->offset += sizeof(struct nlmsghdr); return (true); } bool _nlmsg_end(struct nl_writer *nw) { MPASS(nw->hdr != NULL); if (nw->enomem) { NL_LOG(LOG_DEBUG, "ENOMEM when dumping message"); nlmsg_abort(nw); return (false); } nw->hdr->nlmsg_len = (uint32_t)(nw->data + nw->offset - (char *)nw->hdr); NL_LOG(LOG_DEBUG2, "wrote msg len: %u type: %d: flags: 0x%X seq: %u pid: %u", nw->hdr->nlmsg_len, nw->hdr->nlmsg_type, nw->hdr->nlmsg_flags, nw->hdr->nlmsg_seq, nw->hdr->nlmsg_pid); nw->hdr = NULL; nw->num_messages++; return (true); } void _nlmsg_abort(struct nl_writer *nw) { if (nw->hdr != NULL) { nw->offset = (uint32_t)((char *)nw->hdr - nw->data); nw->hdr = NULL; } } void nlmsg_ack(struct nlpcb *nlp, int error, struct nlmsghdr *hdr, struct nl_pstate *npt) { struct nlmsgerr *errmsg; int payload_len; uint32_t flags = nlp->nl_flags; struct nl_writer *nw = npt->nw; bool cap_ack; payload_len = sizeof(struct nlmsgerr); /* * The only case when we send the full message in the * reply is when there is an error and NETLINK_CAP_ACK * is not set. */ cap_ack = (error == 0) || (flags & NLF_CAP_ACK); if (!cap_ack) payload_len += hdr->nlmsg_len - sizeof(struct nlmsghdr); payload_len = NETLINK_ALIGN(payload_len); uint16_t nl_flags = cap_ack ? NLM_F_CAPPED : 0; if ((npt->err_msg || npt->err_off) && nlp->nl_flags & NLF_EXT_ACK) nl_flags |= NLM_F_ACK_TLVS; NL_LOG(LOG_DEBUG3, "acknowledging message type %d seq %d", hdr->nlmsg_type, hdr->nlmsg_seq); if (!nlmsg_add(nw, nlp->nl_port, hdr->nlmsg_seq, NLMSG_ERROR, nl_flags, payload_len)) goto enomem; errmsg = nlmsg_reserve_data(nw, payload_len, struct nlmsgerr); errmsg->error = error; /* In case of error copy the whole message, else just the header */ memcpy(&errmsg->msg, hdr, cap_ack ? sizeof(*hdr) : hdr->nlmsg_len); if (npt->err_msg != NULL && nlp->nl_flags & NLF_EXT_ACK) nlattr_add_string(nw, NLMSGERR_ATTR_MSG, npt->err_msg); if (npt->err_off != 0 && nlp->nl_flags & NLF_EXT_ACK) nlattr_add_u32(nw, NLMSGERR_ATTR_OFFS, npt->err_off); if (npt->cookie != NULL) nlattr_add_raw(nw, npt->cookie); if (nlmsg_end(nw)) return; enomem: NLP_LOG(LOG_DEBUG, nlp, "error allocating ack data for message %d seq %u", hdr->nlmsg_type, hdr->nlmsg_seq); nlmsg_abort(nw); } bool _nlmsg_end_dump(struct nl_writer *nw, int error, struct nlmsghdr *hdr) { if (!nlmsg_add(nw, hdr->nlmsg_pid, hdr->nlmsg_seq, NLMSG_DONE, 0, sizeof(int))) { NL_LOG(LOG_DEBUG, "Error finalizing table dump"); return (false); } /* Save operation result */ int *perror = nlmsg_reserve_object(nw, int); NL_LOG(LOG_DEBUG2, "record error=%d at off %d (%p)", error, nw->offset, perror); *perror = error; nlmsg_end(nw); nw->suppress_ack = true; return (true); } diff --git a/sys/netlink/netlink_module.c b/sys/netlink/netlink_module.c index 6835c4a0e730..55df52fc85b6 100644 --- a/sys/netlink/netlink_module.c +++ b/sys/netlink/netlink_module.c @@ -1,255 +1,255 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2021 Ng Peng Nam Sean * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_netlink.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(netlink, "Netlink support"); #define DEBUG_MOD_NAME nl_mod #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include -_DECLARE_DEBUG(LOG_DEBUG); +_DECLARE_DEBUG(LOG_INFO); #define NL_MAX_HANDLERS 20 struct nl_proto_handler _nl_handlers[NL_MAX_HANDLERS]; struct nl_proto_handler *nl_handlers = _nl_handlers; CK_LIST_HEAD(nl_control_head, nl_control); static struct nl_control_head vnets_head = CK_LIST_HEAD_INITIALIZER(); VNET_DEFINE(struct nl_control *, nl_ctl) = NULL; struct mtx nl_global_mtx; MTX_SYSINIT(nl_global_mtx, &nl_global_mtx, "global netlink lock", MTX_DEF); #define NL_GLOBAL_LOCK() mtx_lock(&nl_global_mtx) #define NL_GLOBAL_UNLOCK() mtx_unlock(&nl_global_mtx) int netlink_unloading = 0; static void free_nl_ctl(struct nl_control *ctl) { rm_destroy(&ctl->ctl_lock); free(ctl, M_NETLINK); } struct nl_control * vnet_nl_ctl_init(void) { struct nl_control *ctl; ctl = malloc(sizeof(struct nl_control), M_NETLINK, M_WAITOK | M_ZERO); rm_init(&ctl->ctl_lock, "netlink lock"); CK_LIST_INIT(&ctl->ctl_port_head); CK_LIST_INIT(&ctl->ctl_pcb_head); NL_GLOBAL_LOCK(); struct nl_control *tmp = atomic_load_ptr(&V_nl_ctl); if (tmp == NULL) { atomic_store_ptr(&V_nl_ctl, ctl); CK_LIST_INSERT_HEAD(&vnets_head, ctl, ctl_next); NL_LOG(LOG_DEBUG2, "VNET %p init done, inserted %p into global list", curvnet, ctl); } else { NL_LOG(LOG_DEBUG, "per-VNET init clash, dropping this instance"); free_nl_ctl(ctl); ctl = tmp; } NL_GLOBAL_UNLOCK(); return (ctl); } static void vnet_nl_ctl_destroy(const void *unused __unused) { struct nl_control *ctl; /* Assume at the time all of the processes / sockets are dead */ NL_GLOBAL_LOCK(); ctl = atomic_load_ptr(&V_nl_ctl); atomic_store_ptr(&V_nl_ctl, NULL); if (ctl != NULL) { NL_LOG(LOG_DEBUG2, "Removing %p from global list", ctl); CK_LIST_REMOVE(ctl, ctl_next); } NL_GLOBAL_UNLOCK(); if (ctl != NULL) free_nl_ctl(ctl); } VNET_SYSUNINIT(vnet_nl_ctl_destroy, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_nl_ctl_destroy, NULL); int nl_verify_proto(int proto) { if (proto < 0 || proto >= NL_MAX_HANDLERS) { return (EINVAL); } int handler_defined = nl_handlers[proto].cb != NULL; return (handler_defined ? 0 : EPROTONOSUPPORT); } const char * nl_get_proto_name(int proto) { return (nl_handlers[proto].proto_name); } bool netlink_register_proto(int proto, const char *proto_name, nl_handler_f handler) { if ((proto < 0) || (proto >= NL_MAX_HANDLERS)) return (false); NL_GLOBAL_LOCK(); KASSERT((nl_handlers[proto].cb == NULL), ("netlink handler %d is already set", proto)); nl_handlers[proto].cb = handler; nl_handlers[proto].proto_name = proto_name; NL_GLOBAL_UNLOCK(); NL_LOG(LOG_DEBUG2, "Registered netlink %s(%d) handler", proto_name, proto); return (true); } bool netlink_unregister_proto(int proto) { if ((proto < 0) || (proto >= NL_MAX_HANDLERS)) return (false); NL_GLOBAL_LOCK(); KASSERT((nl_handlers[proto].cb != NULL), ("netlink handler %d is not set", proto)); nl_handlers[proto].cb = NULL; nl_handlers[proto].proto_name = NULL; NL_GLOBAL_UNLOCK(); NL_LOG(LOG_DEBUG2, "Unregistered netlink proto %d handler", proto); return (true); } #if !defined(NETLINK) && defined(NETLINK_MODULE) /* Non-stub function provider */ const static struct nl_function_wrapper nl_module = { .nlmsg_add = _nlmsg_add, .nlmsg_refill_buffer = _nlmsg_refill_buffer, .nlmsg_flush = _nlmsg_flush, .nlmsg_end = _nlmsg_end, .nlmsg_abort = _nlmsg_abort, .nlmsg_get_unicast_writer = _nlmsg_get_unicast_writer, .nlmsg_get_group_writer = _nlmsg_get_group_writer, .nlmsg_get_chain_writer = _nlmsg_get_chain_writer, .nlmsg_end_dump = _nlmsg_end_dump, .nl_modify_ifp_generic = _nl_modify_ifp_generic, .nl_store_ifp_cookie = _nl_store_ifp_cookie, .nl_get_thread_nlp = _nl_get_thread_nlp, }; #endif static bool can_unload(void) { struct nl_control *ctl; bool result = true; NL_GLOBAL_LOCK(); CK_LIST_FOREACH(ctl, &vnets_head, ctl_next) { NL_LOG(LOG_DEBUG2, "Iterating VNET head %p", ctl); if (!CK_LIST_EMPTY(&ctl->ctl_pcb_head)) { NL_LOG(LOG_NOTICE, "non-empty socket list in ctl %p", ctl); result = false; break; } } NL_GLOBAL_UNLOCK(); return (result); } static int netlink_modevent(module_t mod __unused, int what, void *priv __unused) { int ret = 0; switch (what) { case MOD_LOAD: NL_LOG(LOG_DEBUG2, "Loading"); nl_osd_register(); #if !defined(NETLINK) && defined(NETLINK_MODULE) nl_set_functions(&nl_module); #endif break; case MOD_UNLOAD: NL_LOG(LOG_DEBUG2, "Unload called"); if (can_unload()) { NL_LOG(LOG_WARNING, "unloading"); netlink_unloading = 1; #if !defined(NETLINK) && defined(NETLINK_MODULE) nl_set_functions(NULL); #endif nl_osd_unregister(); } else ret = EBUSY; break; default: ret = EOPNOTSUPP; break; } return (ret); } static moduledata_t netlink_mod = { "netlink", netlink_modevent, NULL }; DECLARE_MODULE(netlink, netlink_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(netlink, 1); diff --git a/sys/netlink/netlink_route.c b/sys/netlink/netlink_route.c index ce0c0eb36dbc..1fd6d5da63a9 100644 --- a/sys/netlink/netlink_route.c +++ b/sys/netlink/netlink_route.c @@ -1,144 +1,144 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #define DEBUG_MOD_NAME nl_route_core #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include -_DECLARE_DEBUG(LOG_DEBUG); +_DECLARE_DEBUG(LOG_INFO); #define HANDLER_MAX_NUM (NL_RTM_MAX + 10) static const struct rtnl_cmd_handler *rtnl_handler[HANDLER_MAX_NUM] = {}; bool rtnl_register_messages(const struct rtnl_cmd_handler *handlers, int count) { for (int i = 0; i < count; i++) { if (handlers[i].cmd >= HANDLER_MAX_NUM) return (false); MPASS(rtnl_handler[handlers[i].cmd] == NULL); } for (int i = 0; i < count; i++) rtnl_handler[handlers[i].cmd] = &handlers[i]; return (true); } /* * Handler called by netlink subsystem when matching netlink message is received */ static int rtnl_handle_message(struct nlmsghdr *hdr, struct nl_pstate *npt) { const struct rtnl_cmd_handler *cmd; struct epoch_tracker et; struct nlpcb *nlp = npt->nlp; int error = 0; if (__predict_false(hdr->nlmsg_type >= HANDLER_MAX_NUM)) { NLMSG_REPORT_ERR_MSG(npt, "unknown message type: %d", hdr->nlmsg_type); return (ENOTSUP); } cmd = rtnl_handler[hdr->nlmsg_type]; if (__predict_false(cmd == NULL)) { NLMSG_REPORT_ERR_MSG(npt, "unknown message type: %d", hdr->nlmsg_type); return (ENOTSUP); } NLP_LOG(LOG_DEBUG2, nlp, "received msg %s(%d) len %d", cmd->name, hdr->nlmsg_type, hdr->nlmsg_len); if (cmd->priv != 0 && !nlp_has_priv(nlp, cmd->priv)) { NLP_LOG(LOG_DEBUG2, nlp, "priv %d check failed for msg %s", cmd->priv, cmd->name); return (EPERM); } else if (cmd->priv != 0) NLP_LOG(LOG_DEBUG3, nlp, "priv %d check passed for msg %s", cmd->priv, cmd->name); if (!nlp_unconstrained_vnet(nlp) && (cmd->flags & RTNL_F_ALLOW_NONVNET_JAIL) == 0) { NLP_LOG(LOG_DEBUG2, nlp, "jail check failed for msg %s", cmd->name); return (EPERM); } bool need_epoch = !(cmd->flags & RTNL_F_NOEPOCH); if (need_epoch) NET_EPOCH_ENTER(et); error = cmd->cb(hdr, nlp, npt); if (need_epoch) NET_EPOCH_EXIT(et); NLP_LOG(LOG_DEBUG3, nlp, "message %s -> error %d", cmd->name, error); return (error); } static struct rtbridge nlbridge = { .route_f = rtnl_handle_route_event, .ifmsg_f = rtnl_handle_ifnet_event, }; static struct rtbridge *nlbridge_orig_p; static void rtnl_load(void *u __unused) { NL_LOG(LOG_DEBUG2, "rtnl loading"); nlbridge_orig_p = netlink_callback_p; netlink_callback_p = &nlbridge; rtnl_neighs_init(); rtnl_ifaces_init(); rtnl_nexthops_init(); rtnl_routes_init(); netlink_register_proto(NETLINK_ROUTE, "NETLINK_ROUTE", rtnl_handle_message); } SYSINIT(rtnl_load, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rtnl_load, NULL); static void rtnl_unload(void *u __unused) { netlink_callback_p = nlbridge_orig_p; rtnl_ifaces_destroy(); rtnl_neighs_destroy(); /* Wait till all consumers read nlbridge data */ NET_EPOCH_WAIT(); } SYSUNINIT(rtnl_unload, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rtnl_unload, NULL); diff --git a/sys/netlink/route/iface.c b/sys/netlink/route/iface.c index aaed11e637a2..adcf58b72e1b 100644 --- a/sys/netlink/route/iface.c +++ b/sys/netlink/route/iface.c @@ -1,1140 +1,1140 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_netlink.h" #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* scope deembedding */ #define DEBUG_MOD_NAME nl_iface #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include -_DECLARE_DEBUG(LOG_DEBUG); +_DECLARE_DEBUG(LOG_INFO); struct netlink_walkargs { struct nl_writer *nw; struct nlmsghdr hdr; struct nlpcb *so; struct ucred *cred; uint32_t fibnum; int family; int error; int count; int dumped; }; static eventhandler_tag ifdetach_event, ifattach_event, iflink_event, ifaddr_event; static SLIST_HEAD(, nl_cloner) nl_cloners = SLIST_HEAD_INITIALIZER(nl_cloners); static struct sx rtnl_cloner_lock; SX_SYSINIT(rtnl_cloner_lock, &rtnl_cloner_lock, "rtnl cloner lock"); /* These are external hooks for CARP. */ extern int (*carp_get_vhid_p)(struct ifaddr *); /* * RTM_GETLINK request * sendto(3, {{len=32, type=RTM_GETLINK, flags=NLM_F_REQUEST|NLM_F_DUMP, seq=1641940952, pid=0}, * {ifi_family=AF_INET, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}}, 32, 0, NULL, 0) = 32 * * Reply: * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_ETHER, ifi_index=if_nametoindex("enp0s31f6"), ifi_flags=IFF_UP|IFF_BROADCAST|IFF_RUNNING|IFF_MULTICAST|IFF_LOWER_UP, ifi_change=0}, {{nla_len=10, nla_type=IFLA_ADDRESS}, "\xfe\x54\x00\x52\x3e\x90"} [ {{nla_len=14, nla_type=IFLA_IFNAME}, "enp0s31f6"}, {{nla_len=8, nla_type=IFLA_TXQLEN}, 1000}, {{nla_len=5, nla_type=IFLA_OPERSTATE}, 6}, {{nla_len=5, nla_type=IFLA_LINKMODE}, 0}, {{nla_len=8, nla_type=IFLA_MTU}, 1500}, {{nla_len=8, nla_type=IFLA_MIN_MTU}, 68}, {{nla_len=8, nla_type=IFLA_MAX_MTU}, 9000}, {{nla_len=8, nla_type=IFLA_GROUP}, 0}, {{nla_len=8, nla_type=IFLA_PROMISCUITY}, 0}, {{nla_len=8, nla_type=IFLA_NUM_TX_QUEUES}, 1}, {{nla_len=8, nla_type=IFLA_GSO_MAX_SEGS}, 65535}, {{nla_len=8, nla_type=IFLA_GSO_MAX_SIZE}, 65536}, {{nla_len=8, nla_type=IFLA_NUM_RX_QUEUES}, 1}, {{nla_len=5, nla_type=IFLA_CARRIER}, 1}, {{nla_len=13, nla_type=IFLA_QDISC}, "fq_codel"}, {{nla_len=8, nla_type=IFLA_CARRIER_CHANGES}, 2}, {{nla_len=5, nla_type=IFLA_PROTO_DOWN}, 0}, {{nla_len=8, nla_type=IFLA_CARRIER_UP_COUNT}, 1}, {{nla_len=8, nla_type=IFLA_CARRIER_DOWN_COUNT}, 1}, */ struct if_state { uint8_t ifla_operstate; uint8_t ifla_carrier; }; static void get_operstate_ether(struct ifnet *ifp, struct if_state *pstate) { struct ifmediareq ifmr = {}; int error; error = (*ifp->if_ioctl)(ifp, SIOCGIFMEDIA, (void *)&ifmr); if (error != 0) { NL_LOG(LOG_DEBUG, "error calling SIOCGIFMEDIA on %s: %d", if_name(ifp), error); return; } switch (IFM_TYPE(ifmr.ifm_active)) { case IFM_ETHER: if (ifmr.ifm_status & IFM_ACTIVE) { pstate->ifla_carrier = 1; if (ifp->if_flags & IFF_MONITOR) pstate->ifla_operstate = IF_OPER_DORMANT; else pstate->ifla_operstate = IF_OPER_UP; } else pstate->ifla_operstate = IF_OPER_DOWN; } } static bool get_stats(struct nl_writer *nw, struct ifnet *ifp) { struct rtnl_link_stats64 *stats; int nla_len = sizeof(struct nlattr) + sizeof(*stats); struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr); if (nla == NULL) return (false); nla->nla_type = IFLA_STATS64; nla->nla_len = nla_len; stats = (struct rtnl_link_stats64 *)(nla + 1); stats->rx_packets = ifp->if_get_counter(ifp, IFCOUNTER_IPACKETS); stats->tx_packets = ifp->if_get_counter(ifp, IFCOUNTER_OPACKETS); stats->rx_bytes = ifp->if_get_counter(ifp, IFCOUNTER_IBYTES); stats->tx_bytes = ifp->if_get_counter(ifp, IFCOUNTER_OBYTES); stats->rx_errors = ifp->if_get_counter(ifp, IFCOUNTER_IERRORS); stats->tx_errors = ifp->if_get_counter(ifp, IFCOUNTER_OERRORS); stats->rx_dropped = ifp->if_get_counter(ifp, IFCOUNTER_IQDROPS); stats->tx_dropped = ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS); stats->multicast = ifp->if_get_counter(ifp, IFCOUNTER_IMCASTS); stats->rx_nohandler = ifp->if_get_counter(ifp, IFCOUNTER_NOPROTO); return (true); } static void get_operstate(struct ifnet *ifp, struct if_state *pstate) { pstate->ifla_operstate = IF_OPER_UNKNOWN; pstate->ifla_carrier = 0; /* no carrier */ switch (ifp->if_type) { case IFT_ETHER: case IFT_L2VLAN: get_operstate_ether(ifp, pstate); break; default: /* Map admin state to the operstate */ if (ifp->if_flags & IFF_UP) { pstate->ifla_operstate = IF_OPER_UP; pstate->ifla_carrier = 1; } else pstate->ifla_operstate = IF_OPER_DOWN; break; } } static void get_hwaddr(struct nl_writer *nw, struct ifnet *ifp) { struct ifreq ifr = {}; if (if_gethwaddr(ifp, &ifr) == 0) { nlattr_add(nw, IFLAF_ORIG_HWADDR, if_getaddrlen(ifp), ifr.ifr_addr.sa_data); } } static unsigned ifp_flags_to_netlink(const struct ifnet *ifp) { return (ifp->if_flags | ifp->if_drv_flags); } #define LLADDR_CONST(s) ((const void *)((s)->sdl_data + (s)->sdl_nlen)) static bool dump_sa(struct nl_writer *nw, int attr, const struct sockaddr *sa) { uint32_t addr_len = 0; const void *addr_data = NULL; #ifdef INET6 struct in6_addr addr6; #endif if (sa == NULL) return (true); switch (sa->sa_family) { #ifdef INET case AF_INET: addr_len = sizeof(struct in_addr); addr_data = &((const struct sockaddr_in *)sa)->sin_addr; break; #endif #ifdef INET6 case AF_INET6: in6_splitscope(&((const struct sockaddr_in6 *)sa)->sin6_addr, &addr6, &addr_len); addr_len = sizeof(struct in6_addr); addr_data = &addr6; break; #endif case AF_LINK: addr_len = ((const struct sockaddr_dl *)sa)->sdl_alen; addr_data = LLADDR_CONST((const struct sockaddr_dl *)sa); break; default: NL_LOG(LOG_DEBUG2, "unsupported family: %d, skipping", sa->sa_family); return (true); } return (nlattr_add(nw, attr, addr_len, addr_data)); } /* * Dumps interface state, properties and metrics. * @nw: message writer * @ifp: target interface * @hdr: template header * @if_flags_mask: changed if_[drv]_flags bitmask * * This function is called without epoch and MAY sleep. */ static bool dump_iface(struct nl_writer *nw, struct ifnet *ifp, const struct nlmsghdr *hdr, int if_flags_mask) { struct ifinfomsg *ifinfo; NL_LOG(LOG_DEBUG3, "dumping interface %s data", if_name(ifp)); if (!nlmsg_reply(nw, hdr, sizeof(struct ifinfomsg))) goto enomem; ifinfo = nlmsg_reserve_object(nw, struct ifinfomsg); ifinfo->ifi_family = AF_UNSPEC; ifinfo->__ifi_pad = 0; ifinfo->ifi_type = ifp->if_type; ifinfo->ifi_index = ifp->if_index; ifinfo->ifi_flags = ifp_flags_to_netlink(ifp); ifinfo->ifi_change = if_flags_mask; struct if_state ifs = {}; get_operstate(ifp, &ifs); if (ifs.ifla_operstate == IF_OPER_UP) ifinfo->ifi_flags |= IFF_LOWER_UP; nlattr_add_string(nw, IFLA_IFNAME, if_name(ifp)); nlattr_add_u8(nw, IFLA_OPERSTATE, ifs.ifla_operstate); nlattr_add_u8(nw, IFLA_CARRIER, ifs.ifla_carrier); /* nlattr_add_u8(nw, IFLA_PROTO_DOWN, val); nlattr_add_u8(nw, IFLA_LINKMODE, val); */ if (if_getaddrlen(ifp) != 0) { struct ifaddr *ifa = if_getifaddr(ifp); dump_sa(nw, IFLA_ADDRESS, ifa->ifa_addr); } if ((ifp->if_broadcastaddr != NULL)) { nlattr_add(nw, IFLA_BROADCAST, ifp->if_addrlen, ifp->if_broadcastaddr); } nlattr_add_u32(nw, IFLA_MTU, ifp->if_mtu); /* nlattr_add_u32(nw, IFLA_MIN_MTU, 60); nlattr_add_u32(nw, IFLA_MAX_MTU, 9000); nlattr_add_u32(nw, IFLA_GROUP, 0); */ if (ifp->if_description != NULL) nlattr_add_string(nw, IFLA_IFALIAS, ifp->if_description); /* Store FreeBSD-specific attributes */ int off = nlattr_add_nested(nw, IFLA_FREEBSD); if (off != 0) { get_hwaddr(nw, ifp); nlattr_set_len(nw, off); } get_stats(nw, ifp); uint32_t val = (ifp->if_flags & IFF_PROMISC) != 0; nlattr_add_u32(nw, IFLA_PROMISCUITY, val); ifc_dump_ifp_nl(ifp, nw); if (nlmsg_end(nw)) return (true); enomem: NL_LOG(LOG_DEBUG, "unable to dump interface %s state (ENOMEM)", if_name(ifp)); nlmsg_abort(nw); return (false); } static bool check_ifmsg(void *hdr, struct nl_pstate *npt) { struct ifinfomsg *ifm = hdr; if (ifm->__ifi_pad != 0 || ifm->ifi_type != 0 || ifm->ifi_flags != 0 || ifm->ifi_change != 0) { nlmsg_report_err_msg(npt, "strict checking: non-zero values in ifinfomsg header"); return (false); } return (true); } #define _IN(_field) offsetof(struct ifinfomsg, _field) #define _OUT(_field) offsetof(struct nl_parsed_link, _field) static const struct nlfield_parser nlf_p_if[] = { { .off_in = _IN(ifi_type), .off_out = _OUT(ifi_type), .cb = nlf_get_u16 }, { .off_in = _IN(ifi_index), .off_out = _OUT(ifi_index), .cb = nlf_get_u32 }, { .off_in = _IN(ifi_flags), .off_out = _OUT(ifi_flags), .cb = nlf_get_u32 }, { .off_in = _IN(ifi_change), .off_out = _OUT(ifi_change), .cb = nlf_get_u32 }, }; static const struct nlattr_parser nla_p_linfo[] = { { .type = IFLA_INFO_KIND, .off = _OUT(ifla_cloner), .cb = nlattr_get_stringn }, { .type = IFLA_INFO_DATA, .off = _OUT(ifla_idata), .cb = nlattr_get_nla }, }; NL_DECLARE_ATTR_PARSER(linfo_parser, nla_p_linfo); static const struct nlattr_parser nla_p_if[] = { { .type = IFLA_IFNAME, .off = _OUT(ifla_ifname), .cb = nlattr_get_string }, { .type = IFLA_MTU, .off = _OUT(ifla_mtu), .cb = nlattr_get_uint32 }, { .type = IFLA_LINK, .off = _OUT(ifla_link), .cb = nlattr_get_uint32 }, { .type = IFLA_LINKINFO, .arg = &linfo_parser, .cb = nlattr_get_nested }, { .type = IFLA_IFALIAS, .off = _OUT(ifla_ifalias), .cb = nlattr_get_string }, { .type = IFLA_GROUP, .off = _OUT(ifla_group), .cb = nlattr_get_string }, { .type = IFLA_ALT_IFNAME, .off = _OUT(ifla_ifname), .cb = nlattr_get_string }, }; #undef _IN #undef _OUT NL_DECLARE_STRICT_PARSER(ifmsg_parser, struct ifinfomsg, check_ifmsg, nlf_p_if, nla_p_if); static bool match_iface(struct ifnet *ifp, void *_arg) { struct nl_parsed_link *attrs = (struct nl_parsed_link *)_arg; if (attrs->ifi_index != 0 && attrs->ifi_index != ifp->if_index) return (false); if (attrs->ifi_type != 0 && attrs->ifi_index != ifp->if_type) return (false); if (attrs->ifla_ifname != NULL && strcmp(attrs->ifla_ifname, if_name(ifp))) return (false); /* TODO: add group match */ return (true); } static int dump_cb(struct ifnet *ifp, void *_arg) { struct netlink_walkargs *wa = (struct netlink_walkargs *)_arg; if (!dump_iface(wa->nw, ifp, &wa->hdr, 0)) return (ENOMEM); return (0); } /* * {nlmsg_len=52, nlmsg_type=RTM_GETLINK, nlmsg_flags=NLM_F_REQUEST, nlmsg_seq=1662842818, nlmsg_pid=0}, * {ifi_family=AF_PACKET, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}, * [ * [{nla_len=10, nla_type=IFLA_IFNAME}, "vnet9"], * [{nla_len=8, nla_type=IFLA_EXT_MASK}, RTEXT_FILTER_VF] * ] */ static int rtnl_handle_getlink(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct epoch_tracker et; struct ifnet *ifp; int error = 0; struct nl_parsed_link attrs = {}; error = nl_parse_nlmsg(hdr, &ifmsg_parser, npt, &attrs); if (error != 0) return (error); struct netlink_walkargs wa = { .so = nlp, .nw = npt->nw, .hdr.nlmsg_pid = hdr->nlmsg_pid, .hdr.nlmsg_seq = hdr->nlmsg_seq, .hdr.nlmsg_flags = hdr->nlmsg_flags, .hdr.nlmsg_type = NL_RTM_NEWLINK, }; /* Fast track for an interface w/ explicit name or index match */ if ((attrs.ifi_index != 0) || (attrs.ifla_ifname != NULL)) { if (attrs.ifi_index != 0) { NLP_LOG(LOG_DEBUG3, nlp, "fast track -> searching index %u", attrs.ifi_index); NET_EPOCH_ENTER(et); ifp = ifnet_byindex_ref(attrs.ifi_index); NET_EPOCH_EXIT(et); } else { NLP_LOG(LOG_DEBUG3, nlp, "fast track -> searching name %s", attrs.ifla_ifname); ifp = ifunit_ref(attrs.ifla_ifname); } if (ifp != NULL) { if (match_iface(ifp, &attrs)) { if (!dump_iface(wa.nw, ifp, &wa.hdr, 0)) error = ENOMEM; } else error = ENODEV; if_rele(ifp); } else error = ENODEV; return (error); } /* Always treat non-direct-match as a multipart message */ wa.hdr.nlmsg_flags |= NLM_F_MULTI; /* * Fetching some link properties require performing ioctl's that may be blocking. * Address it by saving referenced pointers of the matching links, * exiting from epoch and going through the list one-by-one. */ NL_LOG(LOG_DEBUG2, "Start dump"); if_foreach_sleep(match_iface, &attrs, dump_cb, &wa); NL_LOG(LOG_DEBUG2, "End dump, iterated %d dumped %d", wa.count, wa.dumped); if (!nlmsg_end_dump(wa.nw, error, &wa.hdr)) { NL_LOG(LOG_DEBUG, "Unable to finalize the dump"); return (ENOMEM); } return (error); } /* * sendmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[ * {nlmsg_len=60, nlmsg_type=RTM_NEWLINK, nlmsg_flags=NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE, nlmsg_seq=1662715618, nlmsg_pid=0}, * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}, * {nla_len=11, nla_type=IFLA_IFNAME}, "dummy0"], * [ * {nla_len=16, nla_type=IFLA_LINKINFO}, * [ * {nla_len=9, nla_type=IFLA_INFO_KIND}, "dummy"... * ] * ] */ static int rtnl_handle_dellink(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct epoch_tracker et; struct ifnet *ifp; int error; struct nl_parsed_link attrs = {}; error = nl_parse_nlmsg(hdr, &ifmsg_parser, npt, &attrs); if (error != 0) return (error); NET_EPOCH_ENTER(et); ifp = ifnet_byindex_ref(attrs.ifi_index); NET_EPOCH_EXIT(et); if (ifp == NULL) { NLP_LOG(LOG_DEBUG, nlp, "unable to find interface %u", attrs.ifi_index); return (ENOENT); } NLP_LOG(LOG_DEBUG3, nlp, "mapped ifindex %u to %s", attrs.ifi_index, if_name(ifp)); sx_xlock(&ifnet_detach_sxlock); error = if_clone_destroy(if_name(ifp)); sx_xunlock(&ifnet_detach_sxlock); NLP_LOG(LOG_DEBUG2, nlp, "deleting interface %s returned %d", if_name(ifp), error); if_rele(ifp); return (error); } /* * New link: * type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE, seq=1668185590, pid=0}, * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0} * [ * {{nla_len=8, nla_type=IFLA_MTU}, 123}, * {{nla_len=10, nla_type=IFLA_IFNAME}, "vlan1"}, * {{nla_len=24, nla_type=IFLA_LINKINFO}, * [ * {{nla_len=8, nla_type=IFLA_INFO_KIND}, "vlan"...}, * {{nla_len=12, nla_type=IFLA_INFO_DATA}, "\x06\x00\x01\x00\x7b\x00\x00\x00"}]}]} * * Update link: * type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK, seq=1668185923, pid=0}, * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=if_nametoindex("lo"), ifi_flags=0, ifi_change=0}, * {{nla_len=8, nla_type=IFLA_MTU}, 123}} * * * Check command availability: * type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK, seq=0, pid=0}, * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0} */ static int create_link(struct nlmsghdr *hdr, struct nl_parsed_link *lattrs, struct nlattr_bmask *bm, struct nlpcb *nlp, struct nl_pstate *npt) { if (lattrs->ifla_ifname == NULL || strlen(lattrs->ifla_ifname) == 0) { NLMSG_REPORT_ERR_MSG(npt, "empty IFLA_IFNAME attribute"); return (EINVAL); } if (lattrs->ifla_cloner == NULL || strlen(lattrs->ifla_cloner) == 0) { NLMSG_REPORT_ERR_MSG(npt, "empty IFLA_INFO_KIND attribute"); return (EINVAL); } struct ifc_data_nl ifd = { .flags = IFC_F_CREATE, .lattrs = lattrs, .bm = bm, .npt = npt, }; if (ifc_create_ifp_nl(lattrs->ifla_ifname, &ifd) && ifd.error == 0) nl_store_ifp_cookie(npt, ifd.ifp); return (ifd.error); } static int modify_link(struct nlmsghdr *hdr, struct nl_parsed_link *lattrs, struct nlattr_bmask *bm, struct nlpcb *nlp, struct nl_pstate *npt) { struct ifnet *ifp = NULL; struct epoch_tracker et; if (lattrs->ifi_index == 0 && lattrs->ifla_ifname == NULL) { /* * Applications like ip(8) verify RTM_NEWLINK command * existence by calling it with empty arguments. Always * return "innocent" error in that case. */ NLMSG_REPORT_ERR_MSG(npt, "empty ifi_index field"); return (EPERM); } if (lattrs->ifi_index != 0) { NET_EPOCH_ENTER(et); ifp = ifnet_byindex_ref(lattrs->ifi_index); NET_EPOCH_EXIT(et); if (ifp == NULL) { NLMSG_REPORT_ERR_MSG(npt, "unable to find interface #%u", lattrs->ifi_index); return (ENOENT); } } if (ifp == NULL && lattrs->ifla_ifname != NULL) { ifp = ifunit_ref(lattrs->ifla_ifname); if (ifp == NULL) { NLMSG_REPORT_ERR_MSG(npt, "unable to find interface %s", lattrs->ifla_ifname); return (ENOENT); } } MPASS(ifp != NULL); /* * Modification request can address either * 1) cloned interface, in which case we call the cloner-specific * modification routine * or * 2) non-cloned (e.g. "physical") interface, in which case we call * generic modification routine */ struct ifc_data_nl ifd = { .lattrs = lattrs, .bm = bm, .npt = npt }; if (!ifc_modify_ifp_nl(ifp, &ifd)) ifd.error = nl_modify_ifp_generic(ifp, lattrs, bm, npt); if_rele(ifp); return (ifd.error); } static int rtnl_handle_newlink(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct nlattr_bmask bm; int error; struct nl_parsed_link attrs = {}; error = nl_parse_nlmsg(hdr, &ifmsg_parser, npt, &attrs); if (error != 0) return (error); nl_get_attrs_bmask_nlmsg(hdr, &ifmsg_parser, &bm); if (hdr->nlmsg_flags & NLM_F_CREATE) return (create_link(hdr, &attrs, &bm, nlp, npt)); else return (modify_link(hdr, &attrs, &bm, nlp, npt)); } struct nl_parsed_ifa { uint8_t ifa_family; uint8_t ifa_prefixlen; uint8_t ifa_scope; uint32_t ifa_index; uint32_t ifa_flags; struct sockaddr *ifa_address; struct sockaddr *ifa_local; }; #define _IN(_field) offsetof(struct ifaddrmsg, _field) #define _OUT(_field) offsetof(struct nl_parsed_ifa, _field) static const struct nlfield_parser nlf_p_ifa[] = { { .off_in = _IN(ifa_family), .off_out = _OUT(ifa_family), .cb = nlf_get_u8 }, { .off_in = _IN(ifa_prefixlen), .off_out = _OUT(ifa_prefixlen), .cb = nlf_get_u8 }, { .off_in = _IN(ifa_scope), .off_out = _OUT(ifa_scope), .cb = nlf_get_u8 }, { .off_in = _IN(ifa_flags), .off_out = _OUT(ifa_flags), .cb = nlf_get_u8_u32 }, { .off_in = _IN(ifa_index), .off_out = _OUT(ifa_index), .cb = nlf_get_u32 }, }; static const struct nlattr_parser nla_p_ifa[] = { { .type = IFA_ADDRESS, .off = _OUT(ifa_address), .cb = nlattr_get_ip }, { .type = IFA_LOCAL, .off = _OUT(ifa_local), .cb = nlattr_get_ip }, { .type = IFA_FLAGS, .off = _OUT(ifa_flags), .cb = nlattr_get_uint32 }, }; #undef _IN #undef _OUT NL_DECLARE_PARSER(ifaddrmsg_parser, struct ifaddrmsg, nlf_p_ifa, nla_p_ifa); /* {ifa_family=AF_INET, ifa_prefixlen=8, ifa_flags=IFA_F_PERMANENT, ifa_scope=RT_SCOPE_HOST, ifa_index=if_nametoindex("lo")}, [ {{nla_len=8, nla_type=IFA_ADDRESS}, inet_addr("127.0.0.1")}, {{nla_len=8, nla_type=IFA_LOCAL}, inet_addr("127.0.0.1")}, {{nla_len=7, nla_type=IFA_LABEL}, "lo"}, {{nla_len=8, nla_type=IFA_FLAGS}, IFA_F_PERMANENT}, {{nla_len=20, nla_type=IFA_CACHEINFO}, {ifa_prefered=4294967295, ifa_valid=4294967295, cstamp=3619, tstamp=3619}}]}, --- {{len=72, type=RTM_NEWADDR, flags=NLM_F_MULTI, seq=1642191126, pid=566735}, {ifa_family=AF_INET6, ifa_prefixlen=96, ifa_flags=IFA_F_PERMANENT, ifa_scope=RT_SCOPE_UNIVERSE, ifa_index=if_nametoindex("virbr0")}, [ {{nla_len=20, nla_type=IFA_ADDRESS}, inet_pton(AF_INET6, "2a01:4f8:13a:70c:ffff::1")}, {{nla_len=20, nla_type=IFA_CACHEINFO}, {ifa_prefered=4294967295, ifa_valid=4294967295, cstamp=4283, tstamp=4283}}, {{nla_len=8, nla_type=IFA_FLAGS}, IFA_F_PERMANENT}]}, */ static uint8_t ifa_get_scope(const struct ifaddr *ifa) { const struct sockaddr *sa; uint8_t addr_scope = RT_SCOPE_UNIVERSE; sa = ifa->ifa_addr; switch (sa->sa_family) { #ifdef INET case AF_INET: { struct in_addr addr; addr = ((const struct sockaddr_in *)sa)->sin_addr; if (IN_LOOPBACK(addr.s_addr)) addr_scope = RT_SCOPE_HOST; else if (IN_LINKLOCAL(addr.s_addr)) addr_scope = RT_SCOPE_LINK; break; } #endif #ifdef INET6 case AF_INET6: { const struct in6_addr *addr; addr = &((const struct sockaddr_in6 *)sa)->sin6_addr; if (IN6_IS_ADDR_LOOPBACK(addr)) addr_scope = RT_SCOPE_HOST; else if (IN6_IS_ADDR_LINKLOCAL(addr)) addr_scope = RT_SCOPE_LINK; break; } #endif } return (addr_scope); } #ifdef INET6 static uint8_t inet6_get_plen(const struct in6_addr *addr) { return (bitcount32(addr->s6_addr32[0]) + bitcount32(addr->s6_addr32[1]) + bitcount32(addr->s6_addr32[2]) + bitcount32(addr->s6_addr32[3])); } #endif static uint8_t get_sa_plen(const struct sockaddr *sa) { #ifdef INET const struct in_addr *paddr; #endif #ifdef INET6 const struct in6_addr *paddr6; #endif switch (sa->sa_family) { #ifdef INET case AF_INET: paddr = &(((const struct sockaddr_in *)sa)->sin_addr); return bitcount32(paddr->s_addr);; #endif #ifdef INET6 case AF_INET6: paddr6 = &(((const struct sockaddr_in6 *)sa)->sin6_addr); return inet6_get_plen(paddr6); #endif } return (0); } #ifdef INET6 static uint32_t in6_flags_to_nl(uint32_t flags) { uint32_t nl_flags = 0; if (flags & IN6_IFF_TEMPORARY) nl_flags |= IFA_F_TEMPORARY; if (flags & IN6_IFF_NODAD) nl_flags |= IFA_F_NODAD; if (flags & IN6_IFF_DEPRECATED) nl_flags |= IFA_F_DEPRECATED; if (flags & IN6_IFF_TENTATIVE) nl_flags |= IFA_F_TENTATIVE; if ((flags & (IN6_IFF_AUTOCONF|IN6_IFF_TEMPORARY)) == 0) flags |= IFA_F_PERMANENT; if (flags & IN6_IFF_DUPLICATED) flags |= IFA_F_DADFAILED; return (nl_flags); } static void export_cache_info6(struct nl_writer *nw, const struct in6_ifaddr *ia) { struct ifa_cacheinfo ci = { .cstamp = ia->ia6_createtime * 1000, .tstamp = ia->ia6_updatetime * 1000, .ifa_prefered = ia->ia6_lifetime.ia6t_pltime, .ifa_valid = ia->ia6_lifetime.ia6t_vltime, }; nlattr_add(nw, IFA_CACHEINFO, sizeof(ci), &ci); } #endif static void export_cache_info(struct nl_writer *nw, struct ifaddr *ifa) { switch (ifa->ifa_addr->sa_family) { #ifdef INET6 case AF_INET6: export_cache_info6(nw, (struct in6_ifaddr *)ifa); break; #endif } } /* * {'attrs': [('IFA_ADDRESS', '12.0.0.1'), ('IFA_LOCAL', '12.0.0.1'), ('IFA_LABEL', 'eth10'), ('IFA_FLAGS', 128), ('IFA_CACHEINFO', {'ifa_preferred': 4294967295, 'ifa_valid': 4294967295, 'cstamp': 63745746, 'tstamp': 63745746})], */ static bool dump_iface_addr(struct nl_writer *nw, struct ifnet *ifp, struct ifaddr *ifa, const struct nlmsghdr *hdr) { struct ifaddrmsg *ifamsg; struct sockaddr *sa = ifa->ifa_addr; NL_LOG(LOG_DEBUG3, "dumping ifa %p type %s(%d) for interface %s", ifa, rib_print_family(sa->sa_family), sa->sa_family, if_name(ifp)); if (!nlmsg_reply(nw, hdr, sizeof(struct ifaddrmsg))) goto enomem; ifamsg = nlmsg_reserve_object(nw, struct ifaddrmsg); ifamsg->ifa_family = sa->sa_family; ifamsg->ifa_prefixlen = get_sa_plen(ifa->ifa_netmask); ifamsg->ifa_flags = 0; // ifa_flags is useless ifamsg->ifa_scope = ifa_get_scope(ifa); ifamsg->ifa_index = ifp->if_index; if (ifp->if_flags & IFF_POINTOPOINT) { dump_sa(nw, IFA_ADDRESS, ifa->ifa_dstaddr); dump_sa(nw, IFA_LOCAL, sa); } else { dump_sa(nw, IFA_ADDRESS, sa); #ifdef INET /* * In most cases, IFA_ADDRESS == IFA_LOCAL * Skip IFA_LOCAL for anything except INET */ if (sa->sa_family == AF_INET) dump_sa(nw, IFA_LOCAL, sa); #endif } if (ifp->if_flags & IFF_BROADCAST) dump_sa(nw, IFA_BROADCAST, ifa->ifa_broadaddr); nlattr_add_string(nw, IFA_LABEL, if_name(ifp)); uint32_t nl_ifa_flags = 0; #ifdef INET6 if (sa->sa_family == AF_INET6) { struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; nl_ifa_flags = in6_flags_to_nl(ia->ia6_flags); } #endif nlattr_add_u32(nw, IFA_FLAGS, nl_ifa_flags); export_cache_info(nw, ifa); /* Store FreeBSD-specific attributes */ int off = nlattr_add_nested(nw, IFA_FREEBSD); if (off != 0) { if (ifa->ifa_carp != NULL && carp_get_vhid_p != NULL) { uint32_t vhid = (uint32_t)(*carp_get_vhid_p)(ifa); nlattr_add_u32(nw, IFAF_VHID, vhid); } #ifdef INET6 if (sa->sa_family == AF_INET6) { uint32_t ifa_flags = ((struct in6_ifaddr *)ifa)->ia6_flags; nlattr_add_u32(nw, IFAF_FLAGS, ifa_flags); } #endif nlattr_set_len(nw, off); } if (nlmsg_end(nw)) return (true); enomem: NL_LOG(LOG_DEBUG, "Failed to dump ifa type %s(%d) for interface %s", rib_print_family(sa->sa_family), sa->sa_family, if_name(ifp)); nlmsg_abort(nw); return (false); } static int dump_iface_addrs(struct netlink_walkargs *wa, struct ifnet *ifp) { struct ifaddr *ifa; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (wa->family != 0 && wa->family != ifa->ifa_addr->sa_family) continue; if (ifa->ifa_addr->sa_family == AF_LINK) continue; if (prison_if(wa->cred, ifa->ifa_addr) != 0) continue; wa->count++; if (!dump_iface_addr(wa->nw, ifp, ifa, &wa->hdr)) return (ENOMEM); wa->dumped++; } return (0); } static int rtnl_handle_getaddr(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct ifnet *ifp; int error = 0; struct nl_parsed_ifa attrs = {}; error = nl_parse_nlmsg(hdr, &ifaddrmsg_parser, npt, &attrs); if (error != 0) return (error); struct netlink_walkargs wa = { .so = nlp, .nw = npt->nw, .cred = nlp_get_cred(nlp), .family = attrs.ifa_family, .hdr.nlmsg_pid = hdr->nlmsg_pid, .hdr.nlmsg_seq = hdr->nlmsg_seq, .hdr.nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI, .hdr.nlmsg_type = NL_RTM_NEWADDR, }; NL_LOG(LOG_DEBUG2, "Start dump"); if (attrs.ifa_index != 0) { ifp = ifnet_byindex(attrs.ifa_index); if (ifp == NULL) error = ENOENT; else error = dump_iface_addrs(&wa, ifp); } else { CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { error = dump_iface_addrs(&wa, ifp); if (error != 0) break; } } NL_LOG(LOG_DEBUG2, "End dump, iterated %d dumped %d", wa.count, wa.dumped); if (!nlmsg_end_dump(wa.nw, error, &wa.hdr)) { NL_LOG(LOG_DEBUG, "Unable to finalize the dump"); return (ENOMEM); } return (error); } static void rtnl_handle_ifaddr(void *arg __unused, struct ifaddr *ifa, int cmd) { struct nlmsghdr hdr = {}; struct nl_writer nw = {}; uint32_t group = 0; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: group = RTNLGRP_IPV4_IFADDR; break; #endif #ifdef INET6 case AF_INET6: group = RTNLGRP_IPV6_IFADDR; break; #endif default: NL_LOG(LOG_DEBUG2, "ifa notification for unknown AF: %d", ifa->ifa_addr->sa_family); return; } if (!nl_has_listeners(NETLINK_ROUTE, group)) return; if (!nlmsg_get_group_writer(&nw, NLMSG_LARGE, NETLINK_ROUTE, group)) { NL_LOG(LOG_DEBUG, "error allocating group writer"); return; } hdr.nlmsg_type = (cmd == RTM_DELETE) ? NL_RTM_DELADDR : NL_RTM_NEWADDR; dump_iface_addr(&nw, ifa->ifa_ifp, ifa, &hdr); nlmsg_flush(&nw); } static void rtnl_handle_ifevent(struct ifnet *ifp, int nlmsg_type, int if_flags_mask) { struct nlmsghdr hdr = { .nlmsg_type = nlmsg_type }; struct nl_writer nw = {}; if (!nl_has_listeners(NETLINK_ROUTE, RTNLGRP_LINK)) return; if (!nlmsg_get_group_writer(&nw, NLMSG_LARGE, NETLINK_ROUTE, RTNLGRP_LINK)) { NL_LOG(LOG_DEBUG, "error allocating mbuf"); return; } dump_iface(&nw, ifp, &hdr, if_flags_mask); nlmsg_flush(&nw); } static void rtnl_handle_ifattach(void *arg, struct ifnet *ifp) { NL_LOG(LOG_DEBUG2, "ifnet %s", if_name(ifp)); rtnl_handle_ifevent(ifp, NL_RTM_NEWLINK, 0); } static void rtnl_handle_ifdetach(void *arg, struct ifnet *ifp) { NL_LOG(LOG_DEBUG2, "ifnet %s", if_name(ifp)); rtnl_handle_ifevent(ifp, NL_RTM_DELLINK, 0); } static void rtnl_handle_iflink(void *arg, struct ifnet *ifp) { NL_LOG(LOG_DEBUG2, "ifnet %s", if_name(ifp)); rtnl_handle_ifevent(ifp, NL_RTM_NEWLINK, 0); } void rtnl_handle_ifnet_event(struct ifnet *ifp, int if_flags_mask) { NL_LOG(LOG_DEBUG2, "ifnet %s", if_name(ifp)); rtnl_handle_ifevent(ifp, NL_RTM_NEWLINK, if_flags_mask); } static const struct rtnl_cmd_handler cmd_handlers[] = { { .cmd = NL_RTM_GETLINK, .name = "RTM_GETLINK", .cb = &rtnl_handle_getlink, .flags = RTNL_F_NOEPOCH | RTNL_F_ALLOW_NONVNET_JAIL, }, { .cmd = NL_RTM_DELLINK, .name = "RTM_DELLINK", .cb = &rtnl_handle_dellink, .priv = PRIV_NET_IFDESTROY, .flags = RTNL_F_NOEPOCH, }, { .cmd = NL_RTM_NEWLINK, .name = "RTM_NEWLINK", .cb = &rtnl_handle_newlink, .priv = PRIV_NET_IFCREATE, .flags = RTNL_F_NOEPOCH, }, { .cmd = NL_RTM_GETADDR, .name = "RTM_GETADDR", .cb = &rtnl_handle_getaddr, .flags = RTNL_F_ALLOW_NONVNET_JAIL, }, { .cmd = NL_RTM_NEWADDR, .name = "RTM_NEWADDR", .cb = &rtnl_handle_getaddr, }, { .cmd = NL_RTM_DELADDR, .name = "RTM_DELADDR", .cb = &rtnl_handle_getaddr, }, }; static const struct nlhdr_parser *all_parsers[] = { &ifmsg_parser, &ifaddrmsg_parser }; void rtnl_iface_add_cloner(struct nl_cloner *cloner) { sx_xlock(&rtnl_cloner_lock); SLIST_INSERT_HEAD(&nl_cloners, cloner, next); sx_xunlock(&rtnl_cloner_lock); } void rtnl_iface_del_cloner(struct nl_cloner *cloner) { sx_xlock(&rtnl_cloner_lock); SLIST_REMOVE(&nl_cloners, cloner, nl_cloner, next); sx_xunlock(&rtnl_cloner_lock); } void rtnl_ifaces_init(void) { ifattach_event = EVENTHANDLER_REGISTER( ifnet_arrival_event, rtnl_handle_ifattach, NULL, EVENTHANDLER_PRI_ANY); ifdetach_event = EVENTHANDLER_REGISTER( ifnet_departure_event, rtnl_handle_ifdetach, NULL, EVENTHANDLER_PRI_ANY); ifaddr_event = EVENTHANDLER_REGISTER( rt_addrmsg, rtnl_handle_ifaddr, NULL, EVENTHANDLER_PRI_ANY); iflink_event = EVENTHANDLER_REGISTER( ifnet_link_event, rtnl_handle_iflink, NULL, EVENTHANDLER_PRI_ANY); NL_VERIFY_PARSERS(all_parsers); rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers)); } void rtnl_ifaces_destroy(void) { EVENTHANDLER_DEREGISTER(ifnet_arrival_event, ifattach_event); EVENTHANDLER_DEREGISTER(ifnet_departure_event, ifdetach_event); EVENTHANDLER_DEREGISTER(rt_addrmsg, ifaddr_event); EVENTHANDLER_DEREGISTER(ifnet_link_event, iflink_event); } diff --git a/sys/netlink/route/iface_drivers.c b/sys/netlink/route/iface_drivers.c index 6f6adc323be6..cebbb07b5cac 100644 --- a/sys/netlink/route/iface_drivers.c +++ b/sys/netlink/route/iface_drivers.c @@ -1,147 +1,147 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_netlink.h" #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* scope deembedding */ #define DEBUG_MOD_NAME nl_iface_drivers #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include -_DECLARE_DEBUG(LOG_DEBUG); +_DECLARE_DEBUG(LOG_INFO); /* * Generic modification interface handler. * Responsible for changing network stack interface attributes * such as state, mtu or description. */ int _nl_modify_ifp_generic(struct ifnet *ifp, struct nl_parsed_link *lattrs, const struct nlattr_bmask *bm, struct nl_pstate *npt) { int error; if (lattrs->ifla_ifalias != NULL) { if (nlp_has_priv(npt->nlp, PRIV_NET_SETIFDESCR)) { int len = strlen(lattrs->ifla_ifalias) + 1; char *buf = if_allocdescr(len, M_WAITOK); memcpy(buf, lattrs->ifla_ifalias, len); if_setdescr(ifp, buf); getmicrotime(&ifp->if_lastchange); } else { nlmsg_report_err_msg(npt, "Not enough privileges to set descr"); return (EPERM); } } if ((lattrs->ifi_change & IFF_UP) && (lattrs->ifi_flags & IFF_UP) == 0) { /* Request to down the interface */ if_down(ifp); } if (lattrs->ifla_mtu > 0) { if (nlp_has_priv(npt->nlp, PRIV_NET_SETIFMTU)) { struct ifreq ifr = { .ifr_mtu = lattrs->ifla_mtu }; error = ifhwioctl(SIOCSIFMTU, ifp, (char *)&ifr, curthread); } else { nlmsg_report_err_msg(npt, "Not enough privileges to set mtu"); return (EPERM); } } if (lattrs->ifi_change & IFF_PROMISC) { error = ifpromisc(ifp, lattrs->ifi_flags & IFF_PROMISC); if (error != 0) { nlmsg_report_err_msg(npt, "unable to set promisc"); return (error); } } return (0); } /* * Saves the resulting ifindex and ifname to report them * to userland along with the operation result. * NLA format: * NLMSGERR_ATTR_COOKIE(nested) * IFLA_NEW_IFINDEX(u32) * IFLA_IFNAME(string) */ void _nl_store_ifp_cookie(struct nl_pstate *npt, struct ifnet *ifp) { int ifname_len = strlen(if_name(ifp)); uint32_t ifindex = (uint32_t)ifp->if_index; int nla_len = sizeof(struct nlattr) * 3 + sizeof(ifindex) + NL_ITEM_ALIGN(ifname_len + 1); struct nlattr *nla_cookie = npt_alloc(npt, nla_len); /* Nested TLV */ nla_cookie->nla_len = nla_len; nla_cookie->nla_type = NLMSGERR_ATTR_COOKIE; struct nlattr *nla = nla_cookie + 1; nla->nla_len = sizeof(struct nlattr) + sizeof(ifindex); nla->nla_type = IFLA_NEW_IFINDEX; memcpy(NLA_DATA(nla), &ifindex, sizeof(ifindex)); nla = NLA_NEXT(nla); nla->nla_len = sizeof(struct nlattr) + ifname_len + 1; nla->nla_type = IFLA_IFNAME; strlcpy(NLA_DATA(nla), if_name(ifp), ifname_len + 1); nlmsg_report_cookie(npt, nla_cookie); } diff --git a/sys/netlink/route/neigh.c b/sys/netlink/route/neigh.c index db72bc9bfd4e..ced2c74284e3 100644 --- a/sys/netlink/route/neigh.c +++ b/sys/netlink/route/neigh.c @@ -1,586 +1,586 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_netlink.h" #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include /* nd6.h requires this */ #include /* nd6 state machine */ #include /* scope deembedding */ #define DEBUG_MOD_NAME nl_neigh #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include -_DECLARE_DEBUG(LOG_DEBUG); +_DECLARE_DEBUG(LOG_INFO); static int lle_families[] = { AF_INET, AF_INET6 }; static eventhandler_tag lle_event_p; struct netlink_walkargs { struct nl_writer *nw; struct nlmsghdr hdr; struct nlpcb *so; struct ifnet *ifp; int family; int error; int count; int dumped; }; static int lle_state_to_nl_state(int family, struct llentry *lle) { int state = lle->ln_state; switch (family) { case AF_INET: if (lle->la_flags & (LLE_STATIC | LLE_IFADDR)) state = 1; switch (state) { case 0: /* ARP_LLINFO_INCOMPLETE */ return (NUD_INCOMPLETE); case 1: /* ARP_LLINFO_REACHABLE */ return (NUD_REACHABLE); case 2: /* ARP_LLINFO_VERIFY */ return (NUD_PROBE); } break; case AF_INET6: switch (state) { case ND6_LLINFO_INCOMPLETE: return (NUD_INCOMPLETE); case ND6_LLINFO_REACHABLE: return (NUD_REACHABLE); case ND6_LLINFO_STALE: return (NUD_STALE); case ND6_LLINFO_DELAY: return (NUD_DELAY); case ND6_LLINFO_PROBE: return (NUD_PROBE); } break; } return (NUD_NONE); } static uint32_t lle_flags_to_nl_flags(const struct llentry *lle) { uint32_t nl_flags = 0; if (lle->la_flags & LLE_IFADDR) nl_flags |= NTF_SELF; if (lle->la_flags & LLE_PUB) nl_flags |= NTF_PROXY; if (lle->la_flags & LLE_STATIC) nl_flags |= NTF_STICKY; if (lle->ln_router != 0) nl_flags |= NTF_ROUTER; return (nl_flags); } static uint32_t get_lle_next_ts(const struct llentry *lle) { if (lle->la_expire == 0) return (0); return (lle->la_expire + lle->lle_remtime / hz + time_second - time_uptime); } static int dump_lle_locked(struct llentry *lle, void *arg) { struct netlink_walkargs *wa = (struct netlink_walkargs *)arg; struct nlmsghdr *hdr = &wa->hdr; struct nl_writer *nw = wa->nw; struct ndmsg *ndm; #if defined(INET) || defined(INET6) union { struct in_addr in; struct in6_addr in6; } addr; #endif IF_DEBUG_LEVEL(LOG_DEBUG2) { char llebuf[NHOP_PRINT_BUFSIZE]; llentry_print_buf_lltable(lle, llebuf, sizeof(llebuf)); NL_LOG(LOG_DEBUG2, "dumping %s", llebuf); } if (!nlmsg_reply(nw, hdr, sizeof(struct ndmsg))) goto enomem; ndm = nlmsg_reserve_object(nw, struct ndmsg); ndm->ndm_family = wa->family; ndm->ndm_ifindex = wa->ifp->if_index; ndm->ndm_state = lle_state_to_nl_state(wa->family, lle); ndm->ndm_flags = lle_flags_to_nl_flags(lle); switch (wa->family) { #ifdef INET case AF_INET: addr.in = lle->r_l3addr.addr4; nlattr_add(nw, NDA_DST, 4, &addr); break; #endif #ifdef INET6 case AF_INET6: addr.in6 = lle->r_l3addr.addr6; in6_clearscope(&addr.in6); nlattr_add(nw, NDA_DST, 16, &addr); break; #endif } if (lle->r_flags & RLLE_VALID) { /* Has L2 */ int addrlen = wa->ifp->if_addrlen; nlattr_add(nw, NDA_LLADDR, addrlen, lle->ll_addr); } nlattr_add_u32(nw, NDA_PROBES, lle->la_asked); struct nda_cacheinfo *cache; cache = nlmsg_reserve_attr(nw, NDA_CACHEINFO, struct nda_cacheinfo); if (cache == NULL) goto enomem; /* TODO: provide confirmed/updated */ cache->ndm_refcnt = lle->lle_refcnt; int off = nlattr_add_nested(nw, NDA_FREEBSD); if (off != 0) { nlattr_add_u32(nw, NDAF_NEXT_STATE_TS, get_lle_next_ts(lle)); nlattr_set_len(nw, off); } if (nlmsg_end(nw)) return (0); enomem: NL_LOG(LOG_DEBUG, "unable to dump lle state (ENOMEM)"); nlmsg_abort(nw); return (ENOMEM); } static int dump_lle(struct lltable *llt, struct llentry *lle, void *arg) { int error; LLE_RLOCK(lle); error = dump_lle_locked(lle, arg); LLE_RUNLOCK(lle); return (error); } static bool dump_llt(struct lltable *llt, struct netlink_walkargs *wa) { lltable_foreach_lle(llt, dump_lle, wa); return (true); } static int dump_llts_iface(struct netlink_walkargs *wa, struct ifnet *ifp, int family) { int error = 0; wa->ifp = ifp; for (int i = 0; i < sizeof(lle_families) / sizeof(int); i++) { int fam = lle_families[i]; struct lltable *llt = lltable_get(ifp, fam); if (llt != NULL && (family == 0 || family == fam)) { wa->count++; wa->family = fam; if (!dump_llt(llt, wa)) { error = ENOMEM; break; } wa->dumped++; } } return (error); } static int dump_llts(struct netlink_walkargs *wa, struct ifnet *ifp, int family) { NL_LOG(LOG_DEBUG, "Start dump ifp=%s family=%d", ifp ? if_name(ifp) : "NULL", family); wa->hdr.nlmsg_flags |= NLM_F_MULTI; if (ifp != NULL) { dump_llts_iface(wa, ifp, family); } else { CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { dump_llts_iface(wa, ifp, family); } } NL_LOG(LOG_DEBUG, "End dump, iterated %d dumped %d", wa->count, wa->dumped); if (!nlmsg_end_dump(wa->nw, wa->error, &wa->hdr)) { NL_LOG(LOG_DEBUG, "Unable to add new message"); return (ENOMEM); } return (0); } static int get_lle(struct netlink_walkargs *wa, struct ifnet *ifp, int family, struct sockaddr *dst) { struct lltable *llt = lltable_get(ifp, family); if (llt == NULL) return (ESRCH); #ifdef INET6 if (dst->sa_family == AF_INET6) { struct sockaddr_in6 *dst6 = (struct sockaddr_in6 *)dst; if (IN6_IS_SCOPE_LINKLOCAL(&dst6->sin6_addr)) in6_set_unicast_scopeid(&dst6->sin6_addr, ifp->if_index); } #endif struct llentry *lle = lla_lookup(llt, LLE_UNLOCKED, dst); if (lle == NULL) return (ESRCH); wa->ifp = ifp; wa->family = family; return (dump_lle(llt, lle, wa)); } struct nl_parsed_neigh { struct sockaddr *nda_dst; struct ifnet *nda_ifp; struct nlattr *nda_lladdr; uint32_t ndaf_next_ts; uint32_t ndm_flags; uint16_t ndm_state; uint8_t ndm_family; }; #define _IN(_field) offsetof(struct ndmsg, _field) #define _OUT(_field) offsetof(struct nl_parsed_neigh, _field) static const struct nlattr_parser nla_p_neigh_fbsd[] = { { .type = NDAF_NEXT_STATE_TS, .off = _OUT(ndaf_next_ts), .cb = nlattr_get_uint32 }, }; NL_DECLARE_ATTR_PARSER(neigh_fbsd_parser, nla_p_neigh_fbsd); static const struct nlfield_parser nlf_p_neigh[] = { { .off_in = _IN(ndm_family), .off_out = _OUT(ndm_family), .cb = nlf_get_u8 }, { .off_in = _IN(ndm_flags), .off_out = _OUT(ndm_flags), .cb = nlf_get_u8_u32 }, { .off_in = _IN(ndm_state), .off_out = _OUT(ndm_state), .cb = nlf_get_u16 }, { .off_in = _IN(ndm_ifindex), .off_out = _OUT(nda_ifp), .cb = nlf_get_ifpz }, }; static const struct nlattr_parser nla_p_neigh[] = { { .type = NDA_DST, .off = _OUT(nda_dst), .cb = nlattr_get_ip }, { .type = NDA_LLADDR, .off = _OUT(nda_lladdr), .cb = nlattr_get_nla }, { .type = NDA_IFINDEX, .off = _OUT(nda_ifp), .cb = nlattr_get_ifp }, { .type = NDA_FLAGS_EXT, .off = _OUT(ndm_flags), .cb = nlattr_get_uint32 }, { .type = NDA_FREEBSD, .arg = &neigh_fbsd_parser, .cb = nlattr_get_nested }, }; #undef _IN #undef _OUT NL_DECLARE_PARSER(ndmsg_parser, struct ndmsg, nlf_p_neigh, nla_p_neigh); /* * type=RTM_NEWNEIGH, flags=NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE, seq=1661941473, pid=0}, * {ndm_family=AF_INET6, ndm_ifindex=if_nametoindex("enp0s31f6"), ndm_state=NUD_PERMANENT, ndm_flags=0, ndm_type=RTN_UNSPEC}, * [ * {{nla_len=20, nla_type=NDA_DST}, inet_pton(AF_INET6, "2a01:4f8:13a:70c::3")}, * {{nla_len=10, nla_type=NDA_LLADDR}, 20:4e:71:62:ae:f2}]}, iov_len=60} */ static int rtnl_handle_newneigh(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { int error; struct nl_parsed_neigh attrs = {}; error = nl_parse_nlmsg(hdr, &ndmsg_parser, npt, &attrs); if (error != 0) return (error); if (attrs.nda_ifp == NULL || attrs.nda_dst == NULL || attrs.nda_lladdr == NULL) { if (attrs.nda_ifp == NULL) NLMSG_REPORT_ERR_MSG(npt, "NDA_IFINDEX / ndm_ifindex not set"); if (attrs.nda_dst == NULL) NLMSG_REPORT_ERR_MSG(npt, "NDA_DST not set"); if (attrs.nda_lladdr == NULL) NLMSG_REPORT_ERR_MSG(npt, "NDA_LLADDR not set"); return (EINVAL); } if (attrs.nda_dst->sa_family != attrs.ndm_family) { NLMSG_REPORT_ERR_MSG(npt, "NDA_DST family (%d) is different from ndm_family (%d)", attrs.nda_dst->sa_family, attrs.ndm_family); return (EINVAL); } int addrlen = attrs.nda_ifp->if_addrlen; if (attrs.nda_lladdr->nla_len != sizeof(struct nlattr) + addrlen) { NLMSG_REPORT_ERR_MSG(npt, "NDA_LLADDR address length (%d) is different from expected (%d)", (int)attrs.nda_lladdr->nla_len - (int)sizeof(struct nlattr), addrlen); return (EINVAL); } const uint16_t supported_flags = NTF_PROXY | NTF_STICKY; if ((attrs.ndm_flags & supported_flags) != attrs.ndm_flags) { NLMSG_REPORT_ERR_MSG(npt, "ndm_flags %X not supported", attrs.ndm_flags &~ supported_flags); return (ENOTSUP); } /* Replacement requires new entry creation anyway */ if ((hdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_REPLACE)) == 0) return (ENOTSUP); struct lltable *llt = lltable_get(attrs.nda_ifp, attrs.ndm_family); if (llt == NULL) return (EAFNOSUPPORT); uint8_t linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize = sizeof(linkhdr); int lladdr_off = 0; if (lltable_calc_llheader(attrs.nda_ifp, attrs.ndm_family, (char *)(attrs.nda_lladdr + 1), linkhdr, &linkhdrsize, &lladdr_off) != 0) { NLMSG_REPORT_ERR_MSG(npt, "unable to calculate lle prepend data"); return (EINVAL); } int lle_flags = (attrs.ndm_flags & NTF_PROXY) ? LLE_PUB : 0; if (attrs.ndm_flags & NTF_STICKY) lle_flags |= LLE_STATIC; struct llentry *lle = lltable_alloc_entry(llt, lle_flags, attrs.nda_dst); if (lle == NULL) return (ENOMEM); lltable_set_entry_addr(attrs.nda_ifp, lle, linkhdr, linkhdrsize, lladdr_off); if (attrs.ndm_flags & NTF_STICKY) lle->la_expire = 0; else lle->la_expire = attrs.ndaf_next_ts - time_second + time_uptime; /* llentry created, try to insert or update */ IF_AFDATA_WLOCK(attrs.nda_ifp); LLE_WLOCK(lle); struct llentry *lle_tmp = lla_lookup(llt, LLE_EXCLUSIVE, attrs.nda_dst); if (lle_tmp != NULL) { error = EEXIST; if (hdr->nlmsg_flags & NLM_F_EXCL) { LLE_WUNLOCK(lle_tmp); lle_tmp = NULL; } else if (hdr->nlmsg_flags & NLM_F_REPLACE) { if ((lle_tmp->la_flags & LLE_IFADDR) == 0) { lltable_unlink_entry(llt, lle_tmp); lltable_link_entry(llt, lle); error = 0; } else error = EPERM; } } else { if (hdr->nlmsg_flags & NLM_F_CREATE) lltable_link_entry(llt, lle); else error = ENOENT; } IF_AFDATA_WUNLOCK(attrs.nda_ifp); if (error != 0) { if (lle != NULL) llentry_free(lle); return (error); } if (lle_tmp != NULL) llentry_free(lle_tmp); /* XXX: We're inside epoch */ EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_RESOLVED); LLE_WUNLOCK(lle); llt->llt_post_resolved(llt, lle); return (0); } static int rtnl_handle_delneigh(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { int error; struct nl_parsed_neigh attrs = {}; error = nl_parse_nlmsg(hdr, &ndmsg_parser, npt, &attrs); if (error != 0) return (error); if (attrs.nda_dst == NULL) { NLMSG_REPORT_ERR_MSG(npt, "NDA_DST not set"); return (EINVAL); } if (attrs.nda_ifp == NULL) { NLMSG_REPORT_ERR_MSG(npt, "no ifindex provided"); return (EINVAL); } struct lltable *llt = lltable_get(attrs.nda_ifp, attrs.ndm_family); if (llt == NULL) return (EAFNOSUPPORT); return (lltable_delete_addr(llt, 0, attrs.nda_dst)); } static int rtnl_handle_getneigh(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { int error; struct nl_parsed_neigh attrs = {}; error = nl_parse_nlmsg(hdr, &ndmsg_parser, npt, &attrs); if (error != 0) return (error); if (attrs.nda_dst != NULL && attrs.nda_ifp == NULL) { NLMSG_REPORT_ERR_MSG(npt, "has NDA_DST but no ifindex provided"); return (EINVAL); } struct netlink_walkargs wa = { .so = nlp, .nw = npt->nw, .hdr.nlmsg_pid = hdr->nlmsg_pid, .hdr.nlmsg_seq = hdr->nlmsg_seq, .hdr.nlmsg_flags = hdr->nlmsg_flags, .hdr.nlmsg_type = NL_RTM_NEWNEIGH, }; if (attrs.nda_dst == NULL) error = dump_llts(&wa, attrs.nda_ifp, attrs.ndm_family); else error = get_lle(&wa, attrs.nda_ifp, attrs.ndm_family, attrs.nda_dst); return (error); } static const struct rtnl_cmd_handler cmd_handlers[] = { { .cmd = NL_RTM_NEWNEIGH, .name = "RTM_NEWNEIGH", .cb = &rtnl_handle_newneigh, .priv = PRIV_NET_ROUTE, }, { .cmd = NL_RTM_DELNEIGH, .name = "RTM_DELNEIGH", .cb = &rtnl_handle_delneigh, .priv = PRIV_NET_ROUTE, }, { .cmd = NL_RTM_GETNEIGH, .name = "RTM_GETNEIGH", .cb = &rtnl_handle_getneigh, } }; static void rtnl_lle_event(void *arg __unused, struct llentry *lle, int evt) { struct ifnet *ifp; int family; LLE_WLOCK_ASSERT(lle); ifp = lltable_get_ifp(lle->lle_tbl); family = lltable_get_af(lle->lle_tbl); if (family != AF_INET && family != AF_INET6) return; int nlmsgs_type = evt == LLENTRY_RESOLVED ? NL_RTM_NEWNEIGH : NL_RTM_DELNEIGH; struct nl_writer nw = {}; if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEIGH)) { NL_LOG(LOG_DEBUG, "error allocating group writer"); return; } struct netlink_walkargs wa = { .hdr.nlmsg_type = nlmsgs_type, .nw = &nw, .ifp = ifp, .family = family, }; dump_lle_locked(lle, &wa); nlmsg_flush(&nw); } static const struct nlhdr_parser *all_parsers[] = { &ndmsg_parser, &neigh_fbsd_parser }; void rtnl_neighs_init(void) { NL_VERIFY_PARSERS(all_parsers); rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers)); lle_event_p = EVENTHANDLER_REGISTER(lle_event, rtnl_lle_event, NULL, EVENTHANDLER_PRI_ANY); } void rtnl_neighs_destroy(void) { EVENTHANDLER_DEREGISTER(lle_event, lle_event_p); } diff --git a/sys/netlink/route/nexthop.c b/sys/netlink/route/nexthop.c index 94cc6caf7d82..2a1585616e07 100644 --- a/sys/netlink/route/nexthop.c +++ b/sys/netlink/route/nexthop.c @@ -1,1103 +1,1103 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_netlink.h" #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_route.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define DEBUG_MOD_NAME nl_nhop #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include -_DECLARE_DEBUG(LOG_DEBUG); +_DECLARE_DEBUG(LOG_INFO); /* * This file contains the logic to maintain kernel nexthops and * nexhop groups based om the data provided by the user. * * Kernel stores (nearly) all of the routing data in the nexthops, * including the prefix-specific flags (NHF_HOST and NHF_DEFAULT). * * Netlink API provides higher-level abstraction for the user. Each * user-created nexthop may map to multiple kernel nexthops. * * The following variations require separate kernel nexthop to be * created: * * prefix flags (NHF_HOST, NHF_DEFAULT) * * using IPv6 gateway for IPv4 routes * * different fibnum * * These kernel nexthops have the lifetime bound to the lifetime of * the user_nhop object. They are not collected until user requests * to delete the created user_nhop. * */ struct user_nhop { uint32_t un_idx; /* Userland-provided index */ uint32_t un_fibfam; /* fibnum+af(as highest byte) */ uint8_t un_protocol; /* protocol that install the record */ struct nhop_object *un_nhop; /* "production" nexthop */ struct nhop_object *un_nhop_src; /* nexthop to copy from */ struct weightened_nhop *un_nhgrp_src; /* nexthops for nhg */ uint32_t un_nhgrp_count; /* number of nexthops */ struct user_nhop *un_next; /* next item in hash chain */ struct user_nhop *un_nextchild; /* master -> children */ struct epoch_context un_epoch_ctx; /* epoch ctl helper */ }; /* produce hash value for an object */ #define unhop_hash_obj(_obj) (hash_unhop(_obj)) /* compare two objects */ #define unhop_cmp(_one, _two) (cmp_unhop(_one, _two)) /* next object accessor */ #define unhop_next(_obj) (_obj)->un_next CHT_SLIST_DEFINE(unhop, struct user_nhop); struct unhop_ctl { struct unhop_head un_head; struct rmlock un_lock; }; #define UN_LOCK_INIT(_ctl) rm_init(&(_ctl)->un_lock, "unhop_ctl") #define UN_TRACKER struct rm_priotracker un_tracker #define UN_RLOCK(_ctl) rm_rlock(&((_ctl)->un_lock), &un_tracker) #define UN_RUNLOCK(_ctl) rm_runlock(&((_ctl)->un_lock), &un_tracker) #define UN_WLOCK(_ctl) rm_wlock(&(_ctl)->un_lock); #define UN_WUNLOCK(_ctl) rm_wunlock(&(_ctl)->un_lock); VNET_DEFINE_STATIC(struct unhop_ctl *, un_ctl) = NULL; #define V_un_ctl VNET(un_ctl) static void consider_resize(struct unhop_ctl *ctl, uint32_t new_size); static int cmp_unhop(const struct user_nhop *a, const struct user_nhop *b); static unsigned int hash_unhop(const struct user_nhop *obj); static void destroy_unhop(struct user_nhop *unhop); static struct nhop_object *clone_unhop(const struct user_nhop *unhop, uint32_t fibnum, int family, int nh_flags); static int cmp_unhop(const struct user_nhop *a, const struct user_nhop *b) { return (a->un_idx == b->un_idx && a->un_fibfam == b->un_fibfam); } /* * Hash callback: calculate hash of an object */ static unsigned int hash_unhop(const struct user_nhop *obj) { return (obj->un_idx ^ obj->un_fibfam); } #define UNHOP_IS_MASTER(_unhop) ((_unhop)->un_fibfam == 0) /* * Factory interface for creating matching kernel nexthops/nexthop groups * * @uidx: userland nexhop index used to create the nexthop * @fibnum: fibnum nexthop will be used in * @family: upper family nexthop will be used in * @nh_flags: desired nexthop prefix flags * @perror: pointer to store error to * * Returns referenced nexthop linked to @fibnum/@family rib on success. */ struct nhop_object * nl_find_nhop(uint32_t fibnum, int family, uint32_t uidx, int nh_flags, int *perror) { struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); UN_TRACKER; if (__predict_false(ctl == NULL)) return (NULL); struct user_nhop key= { .un_idx = uidx, .un_fibfam = fibnum | ((uint32_t)family) << 24, }; struct user_nhop *unhop; nh_flags = nh_flags & (NHF_HOST | NHF_DEFAULT); if (__predict_false(family == 0)) return (NULL); UN_RLOCK(ctl); CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); if (unhop != NULL) { struct nhop_object *nh = unhop->un_nhop; UN_RLOCK(ctl); *perror = 0; nhop_ref_any(nh); return (nh); } /* * Exact nexthop not found. Search for template nexthop to clone from. */ key.un_fibfam = 0; CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); if (unhop == NULL) { UN_RUNLOCK(ctl); *perror = ESRCH; return (NULL); } UN_RUNLOCK(ctl); /* Create entry to insert first */ struct user_nhop *un_new, *un_tmp; un_new = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO); if (un_new == NULL) { *perror = ENOMEM; return (NULL); } un_new->un_idx = uidx; un_new->un_fibfam = fibnum | ((uint32_t)family) << 24; /* Relying on epoch to protect unhop here */ un_new->un_nhop = clone_unhop(unhop, fibnum, family, nh_flags); if (un_new->un_nhop == NULL) { free(un_new, M_NETLINK); *perror = ENOMEM; return (NULL); } /* Insert back and report */ UN_WLOCK(ctl); /* First, find template record once again */ CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); if (unhop == NULL) { /* Someone deleted the nexthop during the call */ UN_WUNLOCK(ctl); *perror = ESRCH; destroy_unhop(un_new); return (NULL); } /* Second, check the direct match */ CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, un_new, un_tmp); struct nhop_object *nh; if (un_tmp != NULL) { /* Another thread already created the desired nextop, use it */ nh = un_tmp->un_nhop; } else { /* Finally, insert the new nexthop and link it to the primary */ nh = un_new->un_nhop; CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, un_new); un_new->un_nextchild = unhop->un_nextchild; unhop->un_nextchild = un_new; un_new = NULL; NL_LOG(LOG_DEBUG2, "linked cloned nexthop %p", nh); } UN_WUNLOCK(ctl); if (un_new != NULL) destroy_unhop(un_new); *perror = 0; nhop_ref_any(nh); return (nh); } static struct user_nhop * nl_find_base_unhop(struct unhop_ctl *ctl, uint32_t uidx) { struct user_nhop key= { .un_idx = uidx }; struct user_nhop *unhop = NULL; UN_TRACKER; UN_RLOCK(ctl); CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); UN_RUNLOCK(ctl); return (unhop); } #define MAX_STACK_NHOPS 4 static struct nhop_object * clone_unhop(const struct user_nhop *unhop, uint32_t fibnum, int family, int nh_flags) { #ifdef ROUTE_MPATH const struct weightened_nhop *wn; struct weightened_nhop *wn_new, wn_base[MAX_STACK_NHOPS]; uint32_t num_nhops; #endif struct nhop_object *nh = NULL; int error; if (unhop->un_nhop_src != NULL) { IF_DEBUG_LEVEL(LOG_DEBUG2) { char nhbuf[NHOP_PRINT_BUFSIZE]; nhop_print_buf_any(unhop->un_nhop_src, nhbuf, sizeof(nhbuf)); FIB_NH_LOG(LOG_DEBUG2, unhop->un_nhop_src, "cloning nhop %s -> %u.%u flags 0x%X", nhbuf, fibnum, family, nh_flags); } struct nhop_object *nh; nh = nhop_alloc(fibnum, AF_UNSPEC); if (nh == NULL) return (NULL); nhop_copy(nh, unhop->un_nhop_src); /* Check that nexthop gateway is compatible with the new family */ if (!nhop_set_upper_family(nh, family)) { nhop_free(nh); return (NULL); } nhop_set_uidx(nh, unhop->un_idx); nhop_set_pxtype_flag(nh, nh_flags); return (nhop_get_nhop(nh, &error)); } #ifdef ROUTE_MPATH wn = unhop->un_nhgrp_src; num_nhops = unhop->un_nhgrp_count; if (num_nhops > MAX_STACK_NHOPS) { wn_new = malloc(num_nhops * sizeof(struct weightened_nhop), M_TEMP, M_NOWAIT); if (wn_new == NULL) return (NULL); } else wn_new = wn_base; for (int i = 0; i < num_nhops; i++) { uint32_t uidx = nhop_get_uidx(wn[i].nh); MPASS(uidx != 0); wn_new[i].nh = nl_find_nhop(fibnum, family, uidx, nh_flags, &error); if (error != 0) break; wn_new[i].weight = wn[i].weight; } if (error == 0) { struct rib_head *rh = nhop_get_rh(wn_new[0].nh); struct nhgrp_object *nhg; error = nhgrp_get_group(rh, wn_new, num_nhops, unhop->un_idx, &nhg); nh = (struct nhop_object *)nhg; } if (wn_new != wn_base) free(wn_new, M_TEMP); #endif return (nh); } static void destroy_unhop(struct user_nhop *unhop) { if (unhop->un_nhop != NULL) nhop_free_any(unhop->un_nhop); if (unhop->un_nhop_src != NULL) nhop_free_any(unhop->un_nhop_src); free(unhop, M_NETLINK); } static void destroy_unhop_epoch(epoch_context_t ctx) { struct user_nhop *unhop; unhop = __containerof(ctx, struct user_nhop, un_epoch_ctx); destroy_unhop(unhop); } static uint32_t find_spare_uidx(struct unhop_ctl *ctl) { struct user_nhop *unhop, key = {}; uint32_t uidx = 0; UN_TRACKER; UN_RLOCK(ctl); /* This should return spare uid with 75% of 65k used in ~99/100 cases */ for (int i = 0; i < 16; i++) { key.un_idx = (arc4random() % 65536) + 65536 * 4; CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); if (unhop == NULL) { uidx = key.un_idx; break; } } UN_RUNLOCK(ctl); return (uidx); } /* * Actual netlink code */ struct netlink_walkargs { struct nl_writer *nw; struct nlmsghdr hdr; struct nlpcb *so; int family; int error; int count; int dumped; }; #define ENOMEM_IF_NULL(_v) if ((_v) == NULL) goto enomem static bool dump_nhgrp(const struct user_nhop *unhop, struct nlmsghdr *hdr, struct nl_writer *nw) { if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg))) goto enomem; struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg); nhm->nh_family = AF_UNSPEC; nhm->nh_scope = 0; nhm->nh_protocol = unhop->un_protocol; nhm->nh_flags = 0; nlattr_add_u32(nw, NHA_ID, unhop->un_idx); nlattr_add_u16(nw, NHA_GROUP_TYPE, NEXTHOP_GRP_TYPE_MPATH); struct weightened_nhop *wn = unhop->un_nhgrp_src; uint32_t num_nhops = unhop->un_nhgrp_count; /* TODO: a better API? */ int nla_len = sizeof(struct nlattr); nla_len += NETLINK_ALIGN(num_nhops * sizeof(struct nexthop_grp)); struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr); if (nla == NULL) goto enomem; nla->nla_type = NHA_GROUP; nla->nla_len = nla_len; for (int i = 0; i < num_nhops; i++) { struct nexthop_grp *grp = &((struct nexthop_grp *)(nla + 1))[i]; grp->id = nhop_get_uidx(wn[i].nh); grp->weight = wn[i].weight; grp->resvd1 = 0; grp->resvd2 = 0; } if (nlmsg_end(nw)) return (true); enomem: NL_LOG(LOG_DEBUG, "error: unable to allocate attribute memory"); nlmsg_abort(nw); return (false); } static bool dump_nhop(const struct nhop_object *nh, uint32_t uidx, struct nlmsghdr *hdr, struct nl_writer *nw) { if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg))) goto enomem; struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg); ENOMEM_IF_NULL(nhm); nhm->nh_family = nhop_get_neigh_family(nh); nhm->nh_scope = 0; // XXX: what's that? nhm->nh_protocol = nhop_get_origin(nh); nhm->nh_flags = 0; if (uidx != 0) nlattr_add_u32(nw, NHA_ID, uidx); if (nh->nh_flags & NHF_BLACKHOLE) { nlattr_add_flag(nw, NHA_BLACKHOLE); goto done; } nlattr_add_u32(nw, NHA_OIF, nh->nh_ifp->if_index); switch (nh->gw_sa.sa_family) { #ifdef INET case AF_INET: nlattr_add(nw, NHA_GATEWAY, 4, &nh->gw4_sa.sin_addr); break; #endif #ifdef INET6 case AF_INET6: { struct in6_addr addr = nh->gw6_sa.sin6_addr; in6_clearscope(&addr); nlattr_add(nw, NHA_GATEWAY, 16, &addr); break; } #endif } int off = nlattr_add_nested(nw, NHA_FREEBSD); if (off != 0) { nlattr_add_u32(nw, NHAF_AIF, nh->nh_aifp->if_index); if (uidx == 0) { nlattr_add_u32(nw, NHAF_KID, nhop_get_idx(nh)); nlattr_add_u32(nw, NHAF_FAMILY, nhop_get_upper_family(nh)); nlattr_add_u32(nw, NHAF_TABLE, nhop_get_fibnum(nh)); } nlattr_set_len(nw, off); } done: if (nlmsg_end(nw)) return (true); enomem: nlmsg_abort(nw); return (false); } static void dump_unhop(const struct user_nhop *unhop, struct nlmsghdr *hdr, struct nl_writer *nw) { if (unhop->un_nhop_src != NULL) dump_nhop(unhop->un_nhop_src, unhop->un_idx, hdr, nw); else dump_nhgrp(unhop, hdr, nw); } static int delete_unhop(struct unhop_ctl *ctl, struct nlmsghdr *hdr, uint32_t uidx) { struct user_nhop *unhop_ret, *unhop_base, *unhop_chain; struct user_nhop key = { .un_idx = uidx }; UN_WLOCK(ctl); CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop_base); if (unhop_base != NULL) { CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_base, unhop_ret); IF_DEBUG_LEVEL(LOG_DEBUG2) { char nhbuf[NHOP_PRINT_BUFSIZE]; nhop_print_buf_any(unhop_base->un_nhop, nhbuf, sizeof(nhbuf)); FIB_NH_LOG(LOG_DEBUG3, unhop_base->un_nhop, "removed base nhop %u: %s", uidx, nhbuf); } /* Unlink all child nexhops as well, keeping the chain intact */ unhop_chain = unhop_base->un_nextchild; while (unhop_chain != NULL) { CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_chain, unhop_ret); MPASS(unhop_chain == unhop_ret); IF_DEBUG_LEVEL(LOG_DEBUG3) { char nhbuf[NHOP_PRINT_BUFSIZE]; nhop_print_buf_any(unhop_chain->un_nhop, nhbuf, sizeof(nhbuf)); FIB_NH_LOG(LOG_DEBUG3, unhop_chain->un_nhop, "removed child nhop %u: %s", uidx, nhbuf); } unhop_chain = unhop_chain->un_nextchild; } } UN_WUNLOCK(ctl); if (unhop_base == NULL) { NL_LOG(LOG_DEBUG, "unable to find unhop %u", uidx); return (ENOENT); } /* Report nexthop deletion */ struct netlink_walkargs wa = { .hdr.nlmsg_pid = hdr->nlmsg_pid, .hdr.nlmsg_seq = hdr->nlmsg_seq, .hdr.nlmsg_flags = hdr->nlmsg_flags, .hdr.nlmsg_type = NL_RTM_DELNEXTHOP, }; struct nl_writer nw = {}; if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) { NL_LOG(LOG_DEBUG, "error allocating message writer"); return (ENOMEM); } dump_unhop(unhop_base, &wa.hdr, &nw); nlmsg_flush(&nw); while (unhop_base != NULL) { unhop_chain = unhop_base->un_nextchild; NET_EPOCH_CALL(destroy_unhop_epoch, &unhop_base->un_epoch_ctx); unhop_base = unhop_chain; } return (0); } static void consider_resize(struct unhop_ctl *ctl, uint32_t new_size) { void *new_ptr = NULL; size_t alloc_size; if (new_size == 0) return; if (new_size != 0) { alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_size); new_ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO); if (new_ptr == NULL) return; } NL_LOG(LOG_DEBUG, "resizing hash: %u -> %u", ctl->un_head.hash_size, new_size); UN_WLOCK(ctl); if (new_ptr != NULL) { CHT_SLIST_RESIZE(&ctl->un_head, unhop, new_ptr, new_size); } UN_WUNLOCK(ctl); if (new_ptr != NULL) free(new_ptr, M_NETLINK); } static bool __noinline vnet_init_unhops(void) { uint32_t num_buckets = 16; size_t alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets); struct unhop_ctl *ctl = malloc(sizeof(struct unhop_ctl), M_NETLINK, M_NOWAIT | M_ZERO); if (ctl == NULL) return (false); void *ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO); if (ptr == NULL) { free(ctl, M_NETLINK); return (false); } CHT_SLIST_INIT(&ctl->un_head, ptr, num_buckets); UN_LOCK_INIT(ctl); if (!atomic_cmpset_ptr((uintptr_t *)&V_un_ctl, (uintptr_t)NULL, (uintptr_t)ctl)) { free(ptr, M_NETLINK); free(ctl, M_NETLINK); } if (atomic_load_ptr(&V_un_ctl) == NULL) return (false); NL_LOG(LOG_NOTICE, "UNHOPS init done"); return (true); } static void vnet_destroy_unhops(const void *unused __unused) { struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); struct user_nhop *unhop, *tmp; if (ctl == NULL) return; V_un_ctl = NULL; /* Wait till all unhop users finish their reads */ NET_EPOCH_WAIT(); UN_WLOCK(ctl); CHT_SLIST_FOREACH_SAFE(&ctl->un_head, unhop, unhop, tmp) { destroy_unhop(unhop); } CHT_SLIST_FOREACH_SAFE_END; UN_WUNLOCK(ctl); free(ctl->un_head.ptr, M_NETLINK); free(ctl, M_NETLINK); } VNET_SYSUNINIT(vnet_destroy_unhops, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_destroy_unhops, NULL); static int nlattr_get_nhg(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) { int error = 0; /* Verify attribute correctness */ struct nexthop_grp *grp = NLA_DATA(nla); int data_len = NLA_DATA_LEN(nla); int count = data_len / sizeof(*grp); if (count == 0 || (count * sizeof(*grp) != data_len)) { NL_LOG(LOG_DEBUG, "Invalid length for RTA_GROUP: %d", data_len); return (EINVAL); } *((struct nlattr **)target) = nla; return (error); } struct nl_parsed_nhop { uint32_t nha_id; uint8_t nha_blackhole; uint8_t nha_groups; uint8_t nhaf_knhops; uint8_t nhaf_family; struct ifnet *nha_oif; struct sockaddr *nha_gw; struct nlattr *nha_group; uint8_t nh_family; uint8_t nh_protocol; uint32_t nhaf_table; uint32_t nhaf_kid; uint32_t nhaf_aif; }; #define _IN(_field) offsetof(struct nhmsg, _field) #define _OUT(_field) offsetof(struct nl_parsed_nhop, _field) static struct nlattr_parser nla_p_nh_fbsd[] = { { .type = NHAF_KNHOPS, .off = _OUT(nhaf_knhops), .cb = nlattr_get_flag }, { .type = NHAF_TABLE, .off = _OUT(nhaf_table), .cb = nlattr_get_uint32 }, { .type = NHAF_FAMILY, .off = _OUT(nhaf_family), .cb = nlattr_get_uint8 }, { .type = NHAF_KID, .off = _OUT(nhaf_kid), .cb = nlattr_get_uint32 }, { .type = NHAF_AIF, .off = _OUT(nhaf_aif), .cb = nlattr_get_uint32 }, }; NL_DECLARE_ATTR_PARSER(nh_fbsd_parser, nla_p_nh_fbsd); static const struct nlfield_parser nlf_p_nh[] = { { .off_in = _IN(nh_family), .off_out = _OUT(nh_family), .cb = nlf_get_u8 }, { .off_in = _IN(nh_protocol), .off_out = _OUT(nh_protocol), .cb = nlf_get_u8 }, }; static const struct nlattr_parser nla_p_nh[] = { { .type = NHA_ID, .off = _OUT(nha_id), .cb = nlattr_get_uint32 }, { .type = NHA_GROUP, .off = _OUT(nha_group), .cb = nlattr_get_nhg }, { .type = NHA_BLACKHOLE, .off = _OUT(nha_blackhole), .cb = nlattr_get_flag }, { .type = NHA_OIF, .off = _OUT(nha_oif), .cb = nlattr_get_ifp }, { .type = NHA_GATEWAY, .off = _OUT(nha_gw), .cb = nlattr_get_ip }, { .type = NHA_GROUPS, .off = _OUT(nha_groups), .cb = nlattr_get_flag }, { .type = NHA_FREEBSD, .arg = &nh_fbsd_parser, .cb = nlattr_get_nested }, }; #undef _IN #undef _OUT NL_DECLARE_PARSER(nhmsg_parser, struct nhmsg, nlf_p_nh, nla_p_nh); static bool eligible_nhg(const struct nhop_object *nh) { return (nh->nh_flags & NHF_GATEWAY); } static int newnhg(struct unhop_ctl *ctl, struct nl_parsed_nhop *attrs, struct user_nhop *unhop) { struct nexthop_grp *grp = NLA_DATA(attrs->nha_group); int count = NLA_DATA_LEN(attrs->nha_group) / sizeof(*grp); struct weightened_nhop *wn; wn = malloc(sizeof(*wn) * count, M_NETLINK, M_NOWAIT | M_ZERO); if (wn == NULL) return (ENOMEM); for (int i = 0; i < count; i++) { struct user_nhop *unhop; unhop = nl_find_base_unhop(ctl, grp[i].id); if (unhop == NULL) { NL_LOG(LOG_DEBUG, "unable to find uidx %u", grp[i].id); free(wn, M_NETLINK); return (ESRCH); } else if (unhop->un_nhop_src == NULL) { NL_LOG(LOG_DEBUG, "uidx %u is a group, nested group unsupported", grp[i].id); free(wn, M_NETLINK); return (ENOTSUP); } else if (!eligible_nhg(unhop->un_nhop_src)) { NL_LOG(LOG_DEBUG, "uidx %u nhop is not mpath-eligible", grp[i].id); free(wn, M_NETLINK); return (ENOTSUP); } /* * TODO: consider more rigid eligibility checks: * restrict nexthops with the same gateway */ wn[i].nh = unhop->un_nhop_src; wn[i].weight = grp[i].weight; } unhop->un_nhgrp_src = wn; unhop->un_nhgrp_count = count; return (0); } /* * Sets nexthop @nh gateway specified by @gw. * If gateway is IPv6 link-local, alters @gw to include scopeid equal to * @ifp ifindex. * Returns 0 on success or errno. */ int nl_set_nexthop_gw(struct nhop_object *nh, struct sockaddr *gw, struct ifnet *ifp, struct nl_pstate *npt) { #ifdef INET6 if (gw->sa_family == AF_INET6) { struct sockaddr_in6 *gw6 = (struct sockaddr_in6 *)gw; if (IN6_IS_ADDR_LINKLOCAL(&gw6->sin6_addr)) { if (ifp == NULL) { NLMSG_REPORT_ERR_MSG(npt, "interface not set"); return (EINVAL); } in6_set_unicast_scopeid(&gw6->sin6_addr, ifp->if_index); } } #endif nhop_set_gw(nh, gw, true); return (0); } static int newnhop(struct nl_parsed_nhop *attrs, struct user_nhop *unhop, struct nl_pstate *npt) { struct ifaddr *ifa = NULL; struct nhop_object *nh; int error; if (!attrs->nha_blackhole) { if (attrs->nha_gw == NULL) { NLMSG_REPORT_ERR_MSG(npt, "missing NHA_GATEWAY"); return (EINVAL); } if (attrs->nha_oif == NULL) { NLMSG_REPORT_ERR_MSG(npt, "missing NHA_OIF"); return (EINVAL); } if (ifa == NULL) ifa = ifaof_ifpforaddr(attrs->nha_gw, attrs->nha_oif); if (ifa == NULL) { NLMSG_REPORT_ERR_MSG(npt, "Unable to determine default source IP"); return (EINVAL); } } int family = attrs->nha_gw != NULL ? attrs->nha_gw->sa_family : attrs->nh_family; nh = nhop_alloc(RT_DEFAULT_FIB, family); if (nh == NULL) { NL_LOG(LOG_DEBUG, "Unable to allocate nexthop"); return (ENOMEM); } nhop_set_uidx(nh, attrs->nha_id); nhop_set_origin(nh, attrs->nh_protocol); if (attrs->nha_blackhole) nhop_set_blackhole(nh, NHF_BLACKHOLE); else { error = nl_set_nexthop_gw(nh, attrs->nha_gw, attrs->nha_oif, npt); if (error != 0) { nhop_free(nh); return (error); } nhop_set_transmit_ifp(nh, attrs->nha_oif); nhop_set_src(nh, ifa); } error = nhop_get_unlinked(nh); if (error != 0) { NL_LOG(LOG_DEBUG, "unable to finalize nexthop"); return (error); } IF_DEBUG_LEVEL(LOG_DEBUG2) { char nhbuf[NHOP_PRINT_BUFSIZE]; nhop_print_buf(nh, nhbuf, sizeof(nhbuf)); NL_LOG(LOG_DEBUG2, "Adding unhop %u: %s", attrs->nha_id, nhbuf); } unhop->un_nhop_src = nh; return (0); } static int rtnl_handle_newnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct user_nhop *unhop; int error; if ((__predict_false(V_un_ctl == NULL)) && (!vnet_init_unhops())) return (ENOMEM); struct unhop_ctl *ctl = V_un_ctl; struct nl_parsed_nhop attrs = {}; error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs); if (error != 0) return (error); /* * Get valid nha_id. Treat nha_id == 0 (auto-assignment) as a second-class * citizen. */ if (attrs.nha_id == 0) { attrs.nha_id = find_spare_uidx(ctl); if (attrs.nha_id == 0) { NL_LOG(LOG_DEBUG, "Unable to get spare uidx"); return (ENOSPC); } } NL_LOG(LOG_DEBUG, "IFINDEX %d", attrs.nha_oif ? attrs.nha_oif->if_index : 0); unhop = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO); if (unhop == NULL) { NL_LOG(LOG_DEBUG, "Unable to allocate user_nhop"); return (ENOMEM); } unhop->un_idx = attrs.nha_id; unhop->un_protocol = attrs.nh_protocol; if (attrs.nha_group) error = newnhg(ctl, &attrs, unhop); else error = newnhop(&attrs, unhop, npt); if (error != 0) { free(unhop, M_NETLINK); return (error); } UN_WLOCK(ctl); /* Check if uidx already exists */ struct user_nhop *tmp = NULL; CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, unhop, tmp); if (tmp != NULL) { UN_WUNLOCK(ctl); NL_LOG(LOG_DEBUG, "nhop idx %u already exists", attrs.nha_id); destroy_unhop(unhop); return (EEXIST); } CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, unhop); uint32_t num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->un_head); UN_WUNLOCK(ctl); /* Report addition of the next nexhop */ struct netlink_walkargs wa = { .hdr.nlmsg_pid = hdr->nlmsg_pid, .hdr.nlmsg_seq = hdr->nlmsg_seq, .hdr.nlmsg_flags = hdr->nlmsg_flags, .hdr.nlmsg_type = NL_RTM_NEWNEXTHOP, }; struct nl_writer nw = {}; if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) { NL_LOG(LOG_DEBUG, "error allocating message writer"); return (ENOMEM); } dump_unhop(unhop, &wa.hdr, &nw); nlmsg_flush(&nw); consider_resize(ctl, num_buckets_new); return (0); } static int rtnl_handle_delnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); int error; if (__predict_false(ctl == NULL)) return (ESRCH); struct nl_parsed_nhop attrs = {}; error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs); if (error != 0) return (error); if (attrs.nha_id == 0) { NL_LOG(LOG_DEBUG, "NHA_ID not set"); return (EINVAL); } error = delete_unhop(ctl, hdr, attrs.nha_id); return (error); } static bool match_unhop(const struct nl_parsed_nhop *attrs, struct user_nhop *unhop) { if (attrs->nha_id != 0 && unhop->un_idx != attrs->nha_id) return (false); if (attrs->nha_groups != 0 && unhop->un_nhgrp_src == NULL) return (false); if (attrs->nha_oif != NULL && (unhop->un_nhop_src == NULL || unhop->un_nhop_src->nh_ifp != attrs->nha_oif)) return (false); return (true); } static int rtnl_handle_getnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct user_nhop *unhop; UN_TRACKER; int error; struct nl_parsed_nhop attrs = {}; error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs); if (error != 0) return (error); struct netlink_walkargs wa = { .nw = npt->nw, .hdr.nlmsg_pid = hdr->nlmsg_pid, .hdr.nlmsg_seq = hdr->nlmsg_seq, .hdr.nlmsg_flags = hdr->nlmsg_flags, .hdr.nlmsg_type = NL_RTM_NEWNEXTHOP, }; if (attrs.nha_id != 0) { struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); struct user_nhop key = { .un_idx = attrs.nha_id }; if (__predict_false(ctl == NULL)) return (ESRCH); NL_LOG(LOG_DEBUG2, "searching for uidx %u", attrs.nha_id); UN_RLOCK(ctl); CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); UN_RUNLOCK(ctl); if (unhop == NULL) return (ESRCH); dump_unhop(unhop, &wa.hdr, wa.nw); return (0); } else if (attrs.nhaf_kid != 0) { struct nhop_iter iter = { .fibnum = attrs.nhaf_table, .family = attrs.nhaf_family, }; int error = ESRCH; NL_LOG(LOG_DEBUG2, "START table %u family %d", attrs.nhaf_table, attrs.nhaf_family); for (struct nhop_object *nh = nhops_iter_start(&iter); nh; nh = nhops_iter_next(&iter)) { NL_LOG(LOG_DEBUG3, "get %u", nhop_get_idx(nh)); if (nhop_get_idx(nh) == attrs.nhaf_kid) { dump_nhop(nh, 0, &wa.hdr, wa.nw); error = 0; break; } } nhops_iter_stop(&iter); return (error); } else if (attrs.nhaf_knhops) { struct nhop_iter iter = { .fibnum = attrs.nhaf_table, .family = attrs.nhaf_family, }; NL_LOG(LOG_DEBUG2, "DUMP table %u family %d", attrs.nhaf_table, attrs.nhaf_family); wa.hdr.nlmsg_flags |= NLM_F_MULTI; for (struct nhop_object *nh = nhops_iter_start(&iter); nh; nh = nhops_iter_next(&iter)) { dump_nhop(nh, 0, &wa.hdr, wa.nw); } nhops_iter_stop(&iter); } else { struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); if (__predict_false(ctl == NULL)) return (ESRCH); NL_LOG(LOG_DEBUG2, "DUMP unhops"); UN_RLOCK(ctl); wa.hdr.nlmsg_flags |= NLM_F_MULTI; CHT_SLIST_FOREACH(&ctl->un_head, unhop, unhop) { if (UNHOP_IS_MASTER(unhop) && match_unhop(&attrs, unhop)) dump_unhop(unhop, &wa.hdr, wa.nw); } CHT_SLIST_FOREACH_END; UN_RUNLOCK(ctl); } if (wa.error == 0) { if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr)) return (ENOMEM); } return (0); } static const struct rtnl_cmd_handler cmd_handlers[] = { { .cmd = NL_RTM_NEWNEXTHOP, .name = "RTM_NEWNEXTHOP", .cb = &rtnl_handle_newnhop, .priv = PRIV_NET_ROUTE, }, { .cmd = NL_RTM_DELNEXTHOP, .name = "RTM_DELNEXTHOP", .cb = &rtnl_handle_delnhop, .priv = PRIV_NET_ROUTE, }, { .cmd = NL_RTM_GETNEXTHOP, .name = "RTM_GETNEXTHOP", .cb = &rtnl_handle_getnhop, } }; static const struct nlhdr_parser *all_parsers[] = { &nhmsg_parser, &nh_fbsd_parser }; void rtnl_nexthops_init(void) { NL_VERIFY_PARSERS(all_parsers); rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers)); } diff --git a/sys/netlink/route/rt.c b/sys/netlink/route/rt.c index cbe9ee001dfd..01a7d7ed3e16 100644 --- a/sys/netlink/route/rt.c +++ b/sys/netlink/route/rt.c @@ -1,1116 +1,1116 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2021 Ng Peng Nam Sean * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_netlink.h" #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_route.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define DEBUG_MOD_NAME nl_route #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include -_DECLARE_DEBUG(LOG_DEBUG); +_DECLARE_DEBUG(LOG_INFO); static unsigned char get_rtm_type(const struct nhop_object *nh) { int nh_flags = nh->nh_flags; /* Use the fact that nhg runtime flags are only NHF_MULTIPATH */ if (nh_flags & NHF_BLACKHOLE) return (RTN_BLACKHOLE); else if (nh_flags & NHF_REJECT) return (RTN_PROHIBIT); return (RTN_UNICAST); } static uint8_t nl_get_rtm_protocol(const struct nhop_object *nh) { #ifdef ROUTE_MPATH if (NH_IS_NHGRP(nh)) { const struct nhgrp_object *nhg = (const struct nhgrp_object *)nh; uint8_t origin = nhgrp_get_origin(nhg); if (origin != RTPROT_UNSPEC) return (origin); nh = nhg->nhops[0]; } #endif uint8_t origin = nhop_get_origin(nh); if (origin != RTPROT_UNSPEC) return (origin); /* TODO: remove guesswork once all kernel users fill in origin */ int rt_flags = nhop_get_rtflags(nh); if (rt_flags & RTF_PROTO1) return (RTPROT_ZEBRA); if (rt_flags & RTF_STATIC) return (RTPROT_STATIC); return (RTPROT_KERNEL); } static int get_rtmsg_type_from_rtsock(int cmd) { switch (cmd) { case RTM_ADD: case RTM_CHANGE: case RTM_GET: return NL_RTM_NEWROUTE; case RTM_DELETE: return NL_RTM_DELROUTE; } return (0); } /* * fibnum heuristics * * if (dump && rtm_table == 0 && !rta_table) RT_ALL_FIBS * msg rtm_table RTA_TABLE result * RTM_GETROUTE/dump 0 - RT_ALL_FIBS * RTM_GETROUTE/dump 1 - 1 * RTM_GETROUTE/get 0 - 0 * */ static struct nhop_object * rc_get_nhop(const struct rib_cmd_info *rc) { return ((rc->rc_cmd == RTM_DELETE) ? rc->rc_nh_old : rc->rc_nh_new); } static void dump_rc_nhop_gw(struct nl_writer *nw, const struct nhop_object *nh) { #ifdef INET6 int upper_family; #endif switch (nhop_get_neigh_family(nh)) { case AF_LINK: /* onlink prefix, skip */ break; case AF_INET: nlattr_add(nw, NL_RTA_GATEWAY, 4, &nh->gw4_sa.sin_addr); break; #ifdef INET6 case AF_INET6: upper_family = nhop_get_upper_family(nh); if (upper_family == AF_INET6) { struct in6_addr gw6 = nh->gw6_sa.sin6_addr; in6_clearscope(&gw6); nlattr_add(nw, NL_RTA_GATEWAY, 16, &gw6); } else if (upper_family == AF_INET) { /* IPv4 over IPv6 */ struct in6_addr gw6 = nh->gw6_sa.sin6_addr; in6_clearscope(&gw6); char buf[20]; struct rtvia *via = (struct rtvia *)&buf[0]; via->rtvia_family = AF_INET6; memcpy(via->rtvia_addr, &gw6, 16); nlattr_add(nw, NL_RTA_VIA, 17, via); } break; #endif } } static void dump_rc_nhop_mtu(struct nl_writer *nw, const struct nhop_object *nh) { int nla_len = sizeof(struct nlattr) * 2 + sizeof(uint32_t); struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr); if (nla == NULL) return; nla->nla_type = NL_RTA_METRICS; nla->nla_len = nla_len; nla++; nla->nla_type = NL_RTAX_MTU; nla->nla_len = sizeof(struct nlattr) + sizeof(uint32_t); *((uint32_t *)(nla + 1)) = nh->nh_mtu; } #ifdef ROUTE_MPATH static void dump_rc_nhg(struct nl_writer *nw, const struct nhgrp_object *nhg, struct rtmsg *rtm) { uint32_t uidx = nhgrp_get_uidx(nhg); uint32_t num_nhops; const struct weightened_nhop *wn = nhgrp_get_nhops(nhg, &num_nhops); uint32_t base_rtflags = nhop_get_rtflags(wn[0].nh); if (uidx != 0) nlattr_add_u32(nw, NL_RTA_NH_ID, uidx); nlattr_add_u32(nw, NL_RTA_KNH_ID, nhgrp_get_idx(nhg)); nlattr_add_u32(nw, NL_RTA_RTFLAGS, base_rtflags); int off = nlattr_add_nested(nw, NL_RTA_MULTIPATH); if (off == 0) return; for (int i = 0; i < num_nhops; i++) { int nh_off = nlattr_save_offset(nw); struct rtnexthop *rtnh = nlmsg_reserve_object(nw, struct rtnexthop); if (rtnh == NULL) return; rtnh->rtnh_flags = 0; rtnh->rtnh_ifindex = wn[i].nh->nh_ifp->if_index; rtnh->rtnh_hops = wn[i].weight; dump_rc_nhop_gw(nw, wn[i].nh); uint32_t rtflags = nhop_get_rtflags(wn[i].nh); if (rtflags != base_rtflags) nlattr_add_u32(nw, NL_RTA_RTFLAGS, rtflags); if (rtflags & RTF_FIXEDMTU) dump_rc_nhop_mtu(nw, wn[i].nh); rtnh = nlattr_restore_offset(nw, nh_off, struct rtnexthop); /* * nlattr_add() allocates 4-byte aligned storage, no need to aligh * length here * */ rtnh->rtnh_len = nlattr_save_offset(nw) - nh_off; } nlattr_set_len(nw, off); } #endif static void dump_rc_nhop(struct nl_writer *nw, const struct route_nhop_data *rnd, struct rtmsg *rtm) { #ifdef ROUTE_MPATH if (NH_IS_NHGRP(rnd->rnd_nhop)) { dump_rc_nhg(nw, rnd->rnd_nhgrp, rtm); return; } #endif const struct nhop_object *nh = rnd->rnd_nhop; uint32_t rtflags = nhop_get_rtflags(nh); /* * IPv4 over IPv6 * ('RTA_VIA', {'family': 10, 'addr': 'fe80::20c:29ff:fe67:2dd'}), ('RTA_OIF', 2), * IPv4 w/ gw * ('RTA_GATEWAY', '172.16.107.131'), ('RTA_OIF', 2)], * Direct route: * ('RTA_OIF', 2) */ if (nh->nh_flags & NHF_GATEWAY) dump_rc_nhop_gw(nw, nh); uint32_t uidx = nhop_get_uidx(nh); if (uidx != 0) nlattr_add_u32(nw, NL_RTA_NH_ID, uidx); nlattr_add_u32(nw, NL_RTA_KNH_ID, nhop_get_idx(nh)); nlattr_add_u32(nw, NL_RTA_RTFLAGS, rtflags); if (rtflags & RTF_FIXEDMTU) dump_rc_nhop_mtu(nw, nh); uint32_t nh_expire = nhop_get_expire(nh); if (nh_expire > 0) nlattr_add_u32(nw, NL_RTA_EXPIRES, nh_expire - time_uptime); /* In any case, fill outgoing interface */ nlattr_add_u32(nw, NL_RTA_OIF, nh->nh_ifp->if_index); if (rnd->rnd_weight != RT_DEFAULT_WEIGHT) nlattr_add_u32(nw, NL_RTA_WEIGHT, rnd->rnd_weight); } /* * Dumps output from a rib command into an rtmsg */ static int dump_px(uint32_t fibnum, const struct nlmsghdr *hdr, const struct rtentry *rt, struct route_nhop_data *rnd, struct nl_writer *nw) { struct rtmsg *rtm; int error = 0; NET_EPOCH_ASSERT(); if (!nlmsg_reply(nw, hdr, sizeof(struct rtmsg))) goto enomem; int family = rt_get_family(rt); int rtm_off = nlattr_save_offset(nw); rtm = nlmsg_reserve_object(nw, struct rtmsg); rtm->rtm_family = family; rtm->rtm_dst_len = 0; rtm->rtm_src_len = 0; rtm->rtm_tos = 0; if (fibnum < 255) rtm->rtm_table = (unsigned char)fibnum; rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_protocol = nl_get_rtm_protocol(rnd->rnd_nhop); rtm->rtm_type = get_rtm_type(rnd->rnd_nhop); nlattr_add_u32(nw, NL_RTA_TABLE, fibnum); int plen = 0; #if defined(INET) || defined(INET6) uint32_t scopeid; #endif switch (family) { #ifdef INET case AF_INET: { struct in_addr addr; rt_get_inet_prefix_plen(rt, &addr, &plen, &scopeid); nlattr_add(nw, NL_RTA_DST, 4, &addr); break; } #endif #ifdef INET6 case AF_INET6: { struct in6_addr addr; rt_get_inet6_prefix_plen(rt, &addr, &plen, &scopeid); nlattr_add(nw, NL_RTA_DST, 16, &addr); break; } #endif default: FIB_LOG(LOG_NOTICE, fibnum, family, "unsupported rt family: %d", family); error = EAFNOSUPPORT; goto flush; } rtm = nlattr_restore_offset(nw, rtm_off, struct rtmsg); if (plen > 0) rtm->rtm_dst_len = plen; dump_rc_nhop(nw, rnd, rtm); if (nlmsg_end(nw)) return (0); enomem: error = ENOMEM; flush: nlmsg_abort(nw); return (error); } static int family_to_group(int family) { switch (family) { case AF_INET: return (RTNLGRP_IPV4_ROUTE); case AF_INET6: return (RTNLGRP_IPV6_ROUTE); } return (0); } static void report_operation(uint32_t fibnum, struct rib_cmd_info *rc, struct nlpcb *nlp, struct nlmsghdr *hdr) { struct nl_writer nw = {}; uint32_t group_id = family_to_group(rt_get_family(rc->rc_rt)); if (nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, group_id)) { struct route_nhop_data rnd = { .rnd_nhop = rc_get_nhop(rc), .rnd_weight = rc->rc_nh_weight, }; hdr->nlmsg_flags &= ~(NLM_F_REPLACE | NLM_F_CREATE); hdr->nlmsg_flags &= ~(NLM_F_EXCL | NLM_F_APPEND); switch (rc->rc_cmd) { case RTM_ADD: hdr->nlmsg_type = NL_RTM_NEWROUTE; hdr->nlmsg_flags |= NLM_F_CREATE | NLM_F_EXCL; break; case RTM_CHANGE: hdr->nlmsg_type = NL_RTM_NEWROUTE; hdr->nlmsg_flags |= NLM_F_REPLACE; break; case RTM_DELETE: hdr->nlmsg_type = NL_RTM_DELROUTE; break; } dump_px(fibnum, hdr, rc->rc_rt, &rnd, &nw); nlmsg_flush(&nw); } rtsock_callback_p->route_f(fibnum, rc); } struct rta_mpath_nh { struct sockaddr *gw; struct ifnet *ifp; uint8_t rtnh_flags; uint8_t rtnh_weight; }; #define _IN(_field) offsetof(struct rtnexthop, _field) #define _OUT(_field) offsetof(struct rta_mpath_nh, _field) const static struct nlattr_parser nla_p_rtnh[] = { { .type = NL_RTA_GATEWAY, .off = _OUT(gw), .cb = nlattr_get_ip }, { .type = NL_RTA_VIA, .off = _OUT(gw), .cb = nlattr_get_ipvia }, }; const static struct nlfield_parser nlf_p_rtnh[] = { { .off_in = _IN(rtnh_flags), .off_out = _OUT(rtnh_flags), .cb = nlf_get_u8 }, { .off_in = _IN(rtnh_hops), .off_out = _OUT(rtnh_weight), .cb = nlf_get_u8 }, { .off_in = _IN(rtnh_ifindex), .off_out = _OUT(ifp), .cb = nlf_get_ifpz }, }; #undef _IN #undef _OUT NL_DECLARE_PARSER(mpath_parser, struct rtnexthop, nlf_p_rtnh, nla_p_rtnh); static void set_scope6(struct sockaddr *sa, struct ifnet *ifp) { #ifdef INET6 if (sa != NULL && sa->sa_family == AF_INET6 && ifp != NULL) { struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_LINKLOCAL(&sa6->sin6_addr)) in6_set_unicast_scopeid(&sa6->sin6_addr, ifp->if_index); } #endif } static void post_p_mpath(struct rta_mpath_nh *mpnh) { set_scope6(mpnh->gw, mpnh->ifp); } struct rta_mpath { int num_nhops; struct rta_mpath_nh nhops[0]; }; static int nlattr_get_multipath(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) { int data_len = nla->nla_len - sizeof(struct nlattr); struct rtnexthop *rtnh; int max_nhops = data_len / sizeof(struct rtnexthop); struct rta_mpath *mp = npt_alloc(npt, (max_nhops + 2) * sizeof(struct rta_mpath_nh)); mp->num_nhops = 0; for (rtnh = (struct rtnexthop *)(nla + 1); data_len > 0; ) { struct rta_mpath_nh *mpnh = &mp->nhops[mp->num_nhops++]; int error = nl_parse_header(rtnh, rtnh->rtnh_len, &mpath_parser, npt, mpnh); if (error != 0) { NLMSG_REPORT_ERR_MSG(npt, "RTA_MULTIPATH: nexhop %d: parse failed", mp->num_nhops - 1); return (error); } post_p_mpath(mpnh); int len = NL_ITEM_ALIGN(rtnh->rtnh_len); data_len -= len; rtnh = (struct rtnexthop *)((char *)rtnh + len); } if (data_len != 0 || mp->num_nhops == 0) { NLMSG_REPORT_ERR_MSG(npt, "invalid RTA_MULTIPATH attr"); return (EINVAL); } *((struct rta_mpath **)target) = mp; return (0); } struct nl_parsed_route { struct sockaddr *rta_dst; struct sockaddr *rta_gw; struct ifnet *rta_oif; struct rta_mpath *rta_multipath; uint32_t rta_table; uint32_t rta_rtflags; uint32_t rta_nh_id; uint32_t rta_weight; uint32_t rtax_mtu; uint8_t rtm_family; uint8_t rtm_dst_len; uint8_t rtm_protocol; uint8_t rtm_type; uint32_t rtm_flags; }; #define _IN(_field) offsetof(struct rtmsg, _field) #define _OUT(_field) offsetof(struct nl_parsed_route, _field) static struct nlattr_parser nla_p_rtmetrics[] = { { .type = NL_RTAX_MTU, .off = _OUT(rtax_mtu), .cb = nlattr_get_uint32 }, }; NL_DECLARE_ATTR_PARSER(metrics_parser, nla_p_rtmetrics); static const struct nlattr_parser nla_p_rtmsg[] = { { .type = NL_RTA_DST, .off = _OUT(rta_dst), .cb = nlattr_get_ip }, { .type = NL_RTA_OIF, .off = _OUT(rta_oif), .cb = nlattr_get_ifp }, { .type = NL_RTA_GATEWAY, .off = _OUT(rta_gw), .cb = nlattr_get_ip }, { .type = NL_RTA_METRICS, .arg = &metrics_parser, .cb = nlattr_get_nested }, { .type = NL_RTA_MULTIPATH, .off = _OUT(rta_multipath), .cb = nlattr_get_multipath }, { .type = NL_RTA_WEIGHT, .off = _OUT(rta_weight), .cb = nlattr_get_uint32 }, { .type = NL_RTA_RTFLAGS, .off = _OUT(rta_rtflags), .cb = nlattr_get_uint32 }, { .type = NL_RTA_TABLE, .off = _OUT(rta_table), .cb = nlattr_get_uint32 }, { .type = NL_RTA_VIA, .off = _OUT(rta_gw), .cb = nlattr_get_ipvia }, { .type = NL_RTA_NH_ID, .off = _OUT(rta_nh_id), .cb = nlattr_get_uint32 }, }; static const struct nlfield_parser nlf_p_rtmsg[] = { { .off_in = _IN(rtm_family), .off_out = _OUT(rtm_family), .cb = nlf_get_u8 }, { .off_in = _IN(rtm_dst_len), .off_out = _OUT(rtm_dst_len), .cb = nlf_get_u8 }, { .off_in = _IN(rtm_protocol), .off_out = _OUT(rtm_protocol), .cb = nlf_get_u8 }, { .off_in = _IN(rtm_type), .off_out = _OUT(rtm_type), .cb = nlf_get_u8 }, { .off_in = _IN(rtm_flags), .off_out = _OUT(rtm_flags), .cb = nlf_get_u32 }, }; #undef _IN #undef _OUT NL_DECLARE_PARSER(rtm_parser, struct rtmsg, nlf_p_rtmsg, nla_p_rtmsg); static void post_p_rtmsg(struct nl_parsed_route *r) { set_scope6(r->rta_dst, r->rta_oif); set_scope6(r->rta_gw, r->rta_oif); } struct netlink_walkargs { struct nl_writer *nw; struct route_nhop_data rnd; struct nlmsghdr hdr; struct nlpcb *nlp; uint32_t fibnum; int family; int error; int count; int dumped; int dumped_tables; }; static int dump_rtentry(struct rtentry *rt, void *_arg) { struct netlink_walkargs *wa = (struct netlink_walkargs *)_arg; int error; wa->count++; if (wa->error != 0) return (0); if (!rt_is_exportable(rt, nlp_get_cred(wa->nlp))) return (0); wa->dumped++; rt_get_rnd(rt, &wa->rnd); error = dump_px(wa->fibnum, &wa->hdr, rt, &wa->rnd, wa->nw); IF_DEBUG_LEVEL(LOG_DEBUG3) { char rtbuf[INET6_ADDRSTRLEN + 5]; FIB_LOG(LOG_DEBUG3, wa->fibnum, wa->family, "Dump %s, offset %u, error %d", rt_print_buf(rt, rtbuf, sizeof(rtbuf)), wa->nw->offset, error); } wa->error = error; return (0); } static void dump_rtable_one(struct netlink_walkargs *wa, uint32_t fibnum, int family) { FIB_LOG(LOG_DEBUG2, fibnum, family, "Start dump"); wa->count = 0; wa->dumped = 0; rib_walk(fibnum, family, false, dump_rtentry, wa); wa->dumped_tables++; FIB_LOG(LOG_DEBUG2, fibnum, family, "End dump, iterated %d dumped %d", wa->count, wa->dumped); NL_LOG(LOG_DEBUG2, "Current offset: %d", wa->nw->offset); } static int dump_rtable_fib(struct netlink_walkargs *wa, uint32_t fibnum, int family) { wa->fibnum = fibnum; if (family == AF_UNSPEC) { for (int i = 0; i < AF_MAX; i++) { if (rt_tables_get_rnh(fibnum, i) != 0) { wa->family = i; dump_rtable_one(wa, fibnum, i); if (wa->error != 0) break; } } } else { if (rt_tables_get_rnh(fibnum, family) != 0) { wa->family = family; dump_rtable_one(wa, fibnum, family); } } return (wa->error); } static int handle_rtm_getroute(struct nlpcb *nlp, struct nl_parsed_route *attrs, struct nlmsghdr *hdr, struct nl_pstate *npt) { RIB_RLOCK_TRACKER; struct rib_head *rnh; const struct rtentry *rt; struct route_nhop_data rnd; uint32_t fibnum = attrs->rta_table; sa_family_t family = attrs->rtm_family; if (attrs->rta_dst == NULL) { NLMSG_REPORT_ERR_MSG(npt, "No RTA_DST supplied"); return (EINVAL); } rnh = rt_tables_get_rnh(fibnum, family); if (rnh == NULL) return (EAFNOSUPPORT); RIB_RLOCK(rnh); struct sockaddr *dst = attrs->rta_dst; if (attrs->rtm_flags & RTM_F_PREFIX) rt = rib_lookup_prefix_plen(rnh, dst, attrs->rtm_dst_len, &rnd); else rt = (const struct rtentry *)rnh->rnh_matchaddr(dst, &rnh->head); if (rt == NULL) { RIB_RUNLOCK(rnh); return (ESRCH); } rt_get_rnd(rt, &rnd); rnd.rnd_nhop = nhop_select_func(rnd.rnd_nhop, 0); RIB_RUNLOCK(rnh); if (!rt_is_exportable(rt, nlp_get_cred(nlp))) return (ESRCH); IF_DEBUG_LEVEL(LOG_DEBUG2) { char rtbuf[NHOP_PRINT_BUFSIZE] __unused, nhbuf[NHOP_PRINT_BUFSIZE] __unused; FIB_LOG(LOG_DEBUG2, fibnum, family, "getroute completed: got %s for %s", nhop_print_buf_any(rnd.rnd_nhop, nhbuf, sizeof(nhbuf)), rt_print_buf(rt, rtbuf, sizeof(rtbuf))); } hdr->nlmsg_type = NL_RTM_NEWROUTE; dump_px(fibnum, hdr, rt, &rnd, npt->nw); return (0); } static int handle_rtm_dump(struct nlpcb *nlp, uint32_t fibnum, int family, struct nlmsghdr *hdr, struct nl_writer *nw) { struct netlink_walkargs wa = { .nlp = nlp, .nw = nw, .hdr.nlmsg_pid = hdr->nlmsg_pid, .hdr.nlmsg_seq = hdr->nlmsg_seq, .hdr.nlmsg_type = NL_RTM_NEWROUTE, .hdr.nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI, }; if (fibnum == RT_TABLE_UNSPEC) { for (int i = 0; i < V_rt_numfibs; i++) { dump_rtable_fib(&wa, fibnum, family); if (wa.error != 0) break; } } else dump_rtable_fib(&wa, fibnum, family); if (wa.error == 0 && wa.dumped_tables == 0) { FIB_LOG(LOG_DEBUG, fibnum, family, "incorrect fibnum/family"); wa.error = ESRCH; // How do we propagate it? } if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr)) { NL_LOG(LOG_DEBUG, "Unable to finalize the dump"); return (ENOMEM); } return (wa.error); } static struct nhop_object * finalize_nhop(struct nhop_object *nh, const struct sockaddr *dst, int *perror) { /* * The following MUST be filled: * nh_ifp, nh_ifa, nh_gw */ if (nh->gw_sa.sa_family == 0) { /* * Empty gateway. Can be direct route with RTA_OIF set. */ if (nh->nh_ifp != NULL) nhop_set_direct_gw(nh, nh->nh_ifp); else { NL_LOG(LOG_DEBUG, "empty gateway and interface, skipping"); *perror = EINVAL; return (NULL); } /* Both nh_ifp and gateway are set */ } else { /* Gateway is set up, we can derive ifp if not set */ if (nh->nh_ifp == NULL) { uint32_t fibnum = nhop_get_fibnum(nh); uint32_t flags = 0; if (nh->nh_flags & NHF_GATEWAY) flags = RTF_GATEWAY; else if (nh->nh_flags & NHF_HOST) flags = RTF_HOST; struct ifaddr *ifa = ifa_ifwithroute(flags, dst, &nh->gw_sa, fibnum); if (ifa == NULL) { NL_LOG(LOG_DEBUG, "Unable to determine ifp, skipping"); *perror = EINVAL; return (NULL); } nhop_set_transmit_ifp(nh, ifa->ifa_ifp); } } /* Both nh_ifp and gateway are set */ if (nh->nh_ifa == NULL) { const struct sockaddr *gw_sa = &nh->gw_sa; if (gw_sa->sa_family != dst->sa_family) { /* * Use dst as the target for determining the default * preferred ifa IF * 1) the gateway is link-level (e.g. direct route) * 2) the gateway family is different (e.g. IPv4 over IPv6). */ gw_sa = dst; } struct ifaddr *ifa = ifaof_ifpforaddr(gw_sa, nh->nh_ifp); if (ifa == NULL) { NL_LOG(LOG_DEBUG, "Unable to determine ifa, skipping"); *perror = EINVAL; return (NULL); } nhop_set_src(nh, ifa); } return (nhop_get_nhop(nh, perror)); } static int get_pxflag(const struct nl_parsed_route *attrs) { int pxflag = 0; switch (attrs->rtm_family) { case AF_INET: if (attrs->rtm_dst_len == 32) pxflag = NHF_HOST; else if (attrs->rtm_dst_len == 0) pxflag = NHF_DEFAULT; break; case AF_INET6: if (attrs->rtm_dst_len == 128) pxflag = NHF_HOST; else if (attrs->rtm_dst_len == 0) pxflag = NHF_DEFAULT; break; } return (pxflag); } static int get_op_flags(int nlm_flags) { int op_flags = 0; op_flags |= (nlm_flags & NLM_F_REPLACE) ? RTM_F_REPLACE : 0; op_flags |= (nlm_flags & NLM_F_EXCL) ? RTM_F_EXCL : 0; op_flags |= (nlm_flags & NLM_F_CREATE) ? RTM_F_CREATE : 0; op_flags |= (nlm_flags & NLM_F_APPEND) ? RTM_F_APPEND : 0; return (op_flags); } #ifdef ROUTE_MPATH static int create_nexthop_one(struct nl_parsed_route *attrs, struct rta_mpath_nh *mpnh, struct nl_pstate *npt, struct nhop_object **pnh) { int error; if (mpnh->gw == NULL) return (EINVAL); struct nhop_object *nh = nhop_alloc(attrs->rta_table, attrs->rtm_family); if (nh == NULL) return (ENOMEM); error = nl_set_nexthop_gw(nh, mpnh->gw, mpnh->ifp, npt); if (error != 0) { nhop_free(nh); return (error); } if (mpnh->ifp != NULL) nhop_set_transmit_ifp(nh, mpnh->ifp); nhop_set_pxtype_flag(nh, get_pxflag(attrs)); nhop_set_rtflags(nh, attrs->rta_rtflags); if (attrs->rtm_protocol > RTPROT_STATIC) nhop_set_origin(nh, attrs->rtm_protocol); *pnh = finalize_nhop(nh, attrs->rta_dst, &error); return (error); } #endif static struct nhop_object * create_nexthop_from_attrs(struct nl_parsed_route *attrs, struct nl_pstate *npt, int *perror) { struct nhop_object *nh = NULL; int error = 0; if (attrs->rta_multipath != NULL) { #ifdef ROUTE_MPATH /* Multipath w/o explicit nexthops */ int num_nhops = attrs->rta_multipath->num_nhops; struct weightened_nhop *wn = npt_alloc(npt, sizeof(*wn) * num_nhops); for (int i = 0; i < num_nhops; i++) { struct rta_mpath_nh *mpnh = &attrs->rta_multipath->nhops[i]; error = create_nexthop_one(attrs, mpnh, npt, &wn[i].nh); if (error != 0) { for (int j = 0; j < i; j++) nhop_free(wn[j].nh); break; } wn[i].weight = mpnh->rtnh_weight > 0 ? mpnh->rtnh_weight : 1; } if (error == 0) { struct rib_head *rh = nhop_get_rh(wn[0].nh); struct nhgrp_object *nhg; nhg = nhgrp_alloc(rh->rib_fibnum, rh->rib_family, wn, num_nhops, perror); if (nhg != NULL) { if (attrs->rtm_protocol > RTPROT_STATIC) nhgrp_set_origin(nhg, attrs->rtm_protocol); nhg = nhgrp_get_nhgrp(nhg, perror); } for (int i = 0; i < num_nhops; i++) nhop_free(wn[i].nh); if (nhg != NULL) return ((struct nhop_object *)nhg); error = *perror; } #else error = ENOTSUP; #endif *perror = error; } else { nh = nhop_alloc(attrs->rta_table, attrs->rtm_family); if (nh == NULL) { *perror = ENOMEM; return (NULL); } if (attrs->rta_gw != NULL) { *perror = nl_set_nexthop_gw(nh, attrs->rta_gw, attrs->rta_oif, npt); if (*perror != 0) { nhop_free(nh); return (NULL); } } if (attrs->rta_oif != NULL) nhop_set_transmit_ifp(nh, attrs->rta_oif); if (attrs->rtax_mtu != 0) nhop_set_mtu(nh, attrs->rtax_mtu, true); if (attrs->rta_rtflags & RTF_BROADCAST) nhop_set_broadcast(nh, true); if (attrs->rtm_protocol > RTPROT_STATIC) nhop_set_origin(nh, attrs->rtm_protocol); nhop_set_pxtype_flag(nh, get_pxflag(attrs)); nhop_set_rtflags(nh, attrs->rta_rtflags); switch (attrs->rtm_type) { case RTN_UNICAST: break; case RTN_BLACKHOLE: nhop_set_blackhole(nh, RTF_BLACKHOLE); break; case RTN_PROHIBIT: case RTN_UNREACHABLE: nhop_set_blackhole(nh, RTF_REJECT); break; /* TODO: return ENOTSUP for other types if strict option is set */ } nh = finalize_nhop(nh, attrs->rta_dst, perror); } return (nh); } static int rtnl_handle_newroute(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct rib_cmd_info rc = {}; struct nhop_object *nh = NULL; int error; struct nl_parsed_route attrs = {}; error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs); if (error != 0) return (error); post_p_rtmsg(&attrs); /* Check if we have enough data */ if (attrs.rta_dst == NULL) { NL_LOG(LOG_DEBUG, "missing RTA_DST"); return (EINVAL); } if (attrs.rta_table >= V_rt_numfibs) { NLMSG_REPORT_ERR_MSG(npt, "invalid fib"); return (EINVAL); } if (attrs.rta_nh_id != 0) { /* Referenced uindex */ int pxflag = get_pxflag(&attrs); nh = nl_find_nhop(attrs.rta_table, attrs.rtm_family, attrs.rta_nh_id, pxflag, &error); if (error != 0) return (error); } else { nh = create_nexthop_from_attrs(&attrs, npt, &error); if (error != 0) { NL_LOG(LOG_DEBUG, "Error creating nexthop"); return (error); } } if (!NH_IS_NHGRP(nh) && attrs.rta_weight == 0) attrs.rta_weight = RT_DEFAULT_WEIGHT; struct route_nhop_data rnd = { .rnd_nhop = nh, .rnd_weight = attrs.rta_weight }; int op_flags = get_op_flags(hdr->nlmsg_flags); error = rib_add_route_px(attrs.rta_table, attrs.rta_dst, attrs.rtm_dst_len, &rnd, op_flags, &rc); if (error == 0) report_operation(attrs.rta_table, &rc, nlp, hdr); return (error); } static int path_match_func(const struct rtentry *rt, const struct nhop_object *nh, void *_data) { struct nl_parsed_route *attrs = (struct nl_parsed_route *)_data; if ((attrs->rta_gw != NULL) && !rib_match_gw(rt, nh, attrs->rta_gw)) return (0); if ((attrs->rta_oif != NULL) && (attrs->rta_oif != nh->nh_ifp)) return (0); return (1); } static int rtnl_handle_delroute(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct rib_cmd_info rc; int error; struct nl_parsed_route attrs = {}; error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs); if (error != 0) return (error); post_p_rtmsg(&attrs); if (attrs.rta_dst == NULL) { NLMSG_REPORT_ERR_MSG(npt, "RTA_DST is not set"); return (ESRCH); } if (attrs.rta_table >= V_rt_numfibs) { NLMSG_REPORT_ERR_MSG(npt, "invalid fib"); return (EINVAL); } error = rib_del_route_px(attrs.rta_table, attrs.rta_dst, attrs.rtm_dst_len, path_match_func, &attrs, 0, &rc); if (error == 0) report_operation(attrs.rta_table, &rc, nlp, hdr); return (error); } static int rtnl_handle_getroute(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { int error; struct nl_parsed_route attrs = {}; error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs); if (error != 0) return (error); post_p_rtmsg(&attrs); if (attrs.rta_table >= V_rt_numfibs) { NLMSG_REPORT_ERR_MSG(npt, "invalid fib"); return (EINVAL); } if (hdr->nlmsg_flags & NLM_F_DUMP) error = handle_rtm_dump(nlp, attrs.rta_table, attrs.rtm_family, hdr, npt->nw); else error = handle_rtm_getroute(nlp, &attrs, hdr, npt); return (error); } void rtnl_handle_route_event(uint32_t fibnum, const struct rib_cmd_info *rc) { struct nl_writer nw = {}; int family, nlm_flags = 0; family = rt_get_family(rc->rc_rt); /* XXX: check if there are active listeners first */ /* TODO: consider passing PID/type/seq */ switch (rc->rc_cmd) { case RTM_ADD: nlm_flags = NLM_F_EXCL | NLM_F_CREATE; break; case RTM_CHANGE: nlm_flags = NLM_F_REPLACE; break; case RTM_DELETE: nlm_flags = 0; break; } IF_DEBUG_LEVEL(LOG_DEBUG2) { char rtbuf[NHOP_PRINT_BUFSIZE] __unused; FIB_LOG(LOG_DEBUG2, fibnum, family, "received event %s for %s / nlm_flags=%X", rib_print_cmd(rc->rc_cmd), rt_print_buf(rc->rc_rt, rtbuf, sizeof(rtbuf)), nlm_flags); } struct nlmsghdr hdr = { .nlmsg_flags = nlm_flags, .nlmsg_type = get_rtmsg_type_from_rtsock(rc->rc_cmd), }; struct route_nhop_data rnd = { .rnd_nhop = rc_get_nhop(rc), .rnd_weight = rc->rc_nh_weight, }; uint32_t group_id = family_to_group(family); if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, group_id)) { NL_LOG(LOG_DEBUG, "error allocating event buffer"); return; } dump_px(fibnum, &hdr, rc->rc_rt, &rnd, &nw); nlmsg_flush(&nw); } static const struct rtnl_cmd_handler cmd_handlers[] = { { .cmd = NL_RTM_GETROUTE, .name = "RTM_GETROUTE", .cb = &rtnl_handle_getroute, .flags = RTNL_F_ALLOW_NONVNET_JAIL, }, { .cmd = NL_RTM_DELROUTE, .name = "RTM_DELROUTE", .cb = &rtnl_handle_delroute, .priv = PRIV_NET_ROUTE, }, { .cmd = NL_RTM_NEWROUTE, .name = "RTM_NEWROUTE", .cb = &rtnl_handle_newroute, .priv = PRIV_NET_ROUTE, } }; static const struct nlhdr_parser *all_parsers[] = {&mpath_parser, &metrics_parser, &rtm_parser}; void rtnl_routes_init(void) { NL_VERIFY_PARSERS(all_parsers); rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers)); }