diff --git a/sys/compat/linux/linux_netlink.c b/sys/compat/linux/linux_netlink.c index 807cdc7a14bc..af172fb27ba7 100644 --- a/sys/compat/linux/linux_netlink.c +++ b/sys/compat/linux/linux_netlink.c @@ -1,619 +1,604 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include -#include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #define DEBUG_MOD_NAME nl_linux #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include _DECLARE_DEBUG(LOG_INFO); static bool valid_rta_size(const struct rtattr *rta, int sz) { return (NL_RTA_DATA_LEN(rta) == sz); } static bool valid_rta_u32(const struct rtattr *rta) { return (valid_rta_size(rta, sizeof(uint32_t))); } static uint32_t _rta_get_uint32(const struct rtattr *rta) { return (*((const uint32_t *)NL_RTA_DATA_CONST(rta))); } static struct nlmsghdr * rtnl_neigh_from_linux(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct ndmsg *ndm = (struct ndmsg *)(hdr + 1); if (hdr->nlmsg_len >= sizeof(struct nlmsghdr) + sizeof(struct ndmsg)) ndm->ndm_family = linux_to_bsd_domain(ndm->ndm_family); return (hdr); } static struct nlmsghdr * rtnl_ifaddr_from_linux(struct nlmsghdr *hdr, struct nl_pstate *npt) { struct ifaddrmsg *ifam = (struct ifaddrmsg *)(hdr + 1); if (hdr->nlmsg_len >= sizeof(struct nlmsghdr) + sizeof(struct ifaddrmsg)) ifam->ifa_family = linux_to_bsd_domain(ifam->ifa_family); return (hdr); } static struct nlmsghdr * rtnl_route_from_linux(struct nlmsghdr *hdr, struct nl_pstate *npt) { /* Tweak address families and default fib only */ struct rtmsg *rtm = (struct rtmsg *)(hdr + 1); struct nlattr *nla, *nla_head; int attrs_len; rtm->rtm_family = linux_to_bsd_domain(rtm->rtm_family); if (rtm->rtm_table == 254) rtm->rtm_table = 0; attrs_len = hdr->nlmsg_len - sizeof(struct nlmsghdr); attrs_len -= NETLINK_ALIGN(sizeof(struct rtmsg)); nla_head = (struct nlattr *)((char *)rtm + NETLINK_ALIGN(sizeof(struct rtmsg))); NLA_FOREACH(nla, nla_head, attrs_len) { RT_LOG(LOG_DEBUG3, "GOT type %d len %d total %d", nla->nla_type, nla->nla_len, attrs_len); struct rtattr *rta = (struct rtattr *)nla; if (rta->rta_len < sizeof(struct rtattr)) { break; } switch 
(rta->rta_type) { case NL_RTA_TABLE: if (!valid_rta_u32(rta)) goto done; rtm->rtm_table = 0; uint32_t fibnum = _rta_get_uint32(rta); RT_LOG(LOG_DEBUG3, "GET RTABLE: %u", fibnum); if (fibnum == 254) { *((uint32_t *)NL_RTA_DATA(rta)) = 0; } break; } } done: return (hdr); } static struct nlmsghdr * rtnl_from_linux(struct nlmsghdr *hdr, struct nl_pstate *npt) { switch (hdr->nlmsg_type) { case NL_RTM_GETROUTE: case NL_RTM_NEWROUTE: case NL_RTM_DELROUTE: return (rtnl_route_from_linux(hdr, npt)); case NL_RTM_GETNEIGH: return (rtnl_neigh_from_linux(hdr, npt)); case NL_RTM_GETADDR: return (rtnl_ifaddr_from_linux(hdr, npt)); /* Silence warning for the messages where no translation is required */ case NL_RTM_NEWLINK: case NL_RTM_DELLINK: case NL_RTM_GETLINK: break; default: RT_LOG(LOG_DEBUG, "Passing message type %d untranslated", hdr->nlmsg_type); } return (hdr); } static struct nlmsghdr * nlmsg_from_linux(int netlink_family, struct nlmsghdr *hdr, struct nl_pstate *npt) { switch (netlink_family) { case NETLINK_ROUTE: return (rtnl_from_linux(hdr, npt)); } return (hdr); } /************************************************************ * Kernel -> Linux ************************************************************/ static bool handle_default_out(struct nlmsghdr *hdr, struct nl_writer *nw) { char *out_hdr; out_hdr = nlmsg_reserve_data(nw, NLMSG_ALIGN(hdr->nlmsg_len), char); if (out_hdr != NULL) { memcpy(out_hdr, hdr, hdr->nlmsg_len); + nw->num_messages++; return (true); } return (false); } static bool nlmsg_copy_header(struct nlmsghdr *hdr, struct nl_writer *nw) { return (nlmsg_add(nw, hdr->nlmsg_pid, hdr->nlmsg_seq, hdr->nlmsg_type, hdr->nlmsg_flags, 0)); } static void * _nlmsg_copy_next_header(struct nlmsghdr *hdr, struct nl_writer *nw, int sz) { void *next_hdr = nlmsg_reserve_data(nw, sz, void); memcpy(next_hdr, hdr + 1, NLMSG_ALIGN(sz)); return (next_hdr); } #define nlmsg_copy_next_header(_hdr, _ns, _t) \ ((_t *)(_nlmsg_copy_next_header(_hdr, _ns, sizeof(_t)))) static bool 
nlmsg_copy_nla(const struct nlattr *nla_orig, struct nl_writer *nw) { struct nlattr *nla = nlmsg_reserve_data(nw, nla_orig->nla_len, struct nlattr); if (nla != NULL) { memcpy(nla, nla_orig, nla_orig->nla_len); return (true); } return (false); } /* * Translate a FreeBSD interface name to a Linux interface name. */ static bool nlmsg_translate_ifname_nla(struct nlattr *nla, struct nl_writer *nw) { char ifname[LINUX_IFNAMSIZ]; if (ifname_bsd_to_linux_name((char *)(nla + 1), ifname, sizeof(ifname)) <= 0) return (false); return (nlattr_add_string(nw, IFLA_IFNAME, ifname)); } #define LINUX_NLA_UNHANDLED -1 /* * Translate a FreeBSD attribute to a Linux attribute. * Returns LINUX_NLA_UNHANDLED when the attribute is not processed * and the caller must take care of it, otherwise the result is returned. */ static int nlmsg_translate_all_nla(struct nlmsghdr *hdr, struct nlattr *nla, struct nl_writer *nw) { switch (hdr->nlmsg_type) { case NL_RTM_NEWLINK: case NL_RTM_DELLINK: case NL_RTM_GETLINK: switch (nla->nla_type) { case IFLA_IFNAME: return (nlmsg_translate_ifname_nla(nla, nw)); default: break; } default: break; } return (LINUX_NLA_UNHANDLED); } static bool nlmsg_copy_all_nla(struct nlmsghdr *hdr, int raw_hdrlen, struct nl_writer *nw) { struct nlattr *nla; int ret; int hdrlen = NETLINK_ALIGN(raw_hdrlen); int attrs_len = hdr->nlmsg_len - sizeof(struct nlmsghdr) - hdrlen; struct nlattr *nla_head = (struct nlattr *)((char *)(hdr + 1) + hdrlen); NLA_FOREACH(nla, nla_head, attrs_len) { RT_LOG(LOG_DEBUG3, "reading attr %d len %d", nla->nla_type, nla->nla_len); if (nla->nla_len < sizeof(struct nlattr)) { return (false); } ret = nlmsg_translate_all_nla(hdr, nla, nw); if (ret == LINUX_NLA_UNHANDLED) ret = nlmsg_copy_nla(nla, nw); if (!ret) return (false); } return (true); } #undef LINUX_NLA_UNHANDLED static unsigned int rtnl_if_flags_to_linux(unsigned int if_flags) { unsigned int result = 0; for (int i = 0; i < 31; i++) { unsigned int flag = 1 << i; if (!(flag & if_flags)) continue; 
switch (flag) { case IFF_UP: case IFF_BROADCAST: case IFF_DEBUG: case IFF_LOOPBACK: case IFF_POINTOPOINT: case IFF_DRV_RUNNING: case IFF_NOARP: case IFF_PROMISC: case IFF_ALLMULTI: result |= flag; break; case IFF_NEEDSEPOCH: case IFF_DRV_OACTIVE: case IFF_SIMPLEX: case IFF_LINK0: case IFF_LINK1: case IFF_LINK2: case IFF_CANTCONFIG: case IFF_PPROMISC: case IFF_MONITOR: case IFF_STATICARP: case IFF_STICKYARP: case IFF_DYING: case IFF_RENAMING: /* No Linux analogue */ break; case IFF_MULTICAST: result |= 1 << 12; } } return (result); } static bool rtnl_newlink_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_writer *nw) { if (!nlmsg_copy_header(hdr, nw)) return (false); struct ifinfomsg *ifinfo; ifinfo = nlmsg_copy_next_header(hdr, nw, struct ifinfomsg); ifinfo->ifi_family = bsd_to_linux_domain(ifinfo->ifi_family); /* Convert interface type */ switch (ifinfo->ifi_type) { case IFT_ETHER: ifinfo->ifi_type = LINUX_ARPHRD_ETHER; break; } ifinfo->ifi_flags = rtnl_if_flags_to_linux(ifinfo->ifi_flags); /* Copy attributes unchanged */ if (!nlmsg_copy_all_nla(hdr, sizeof(struct ifinfomsg), nw)) return (false); /* make ip(8) happy */ if (!nlattr_add_string(nw, IFLA_QDISC, "noqueue")) return (false); if (!nlattr_add_u32(nw, IFLA_TXQLEN, 1000)) return (false); nlmsg_end(nw); RT_LOG(LOG_DEBUG2, "done processing nw %p", nw); return (true); } static bool rtnl_newaddr_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_writer *nw) { if (!nlmsg_copy_header(hdr, nw)) return (false); struct ifaddrmsg *ifamsg; ifamsg = nlmsg_copy_next_header(hdr, nw, struct ifaddrmsg); ifamsg->ifa_family = bsd_to_linux_domain(ifamsg->ifa_family); /* XXX: fake ifa_flags? 
*/ /* Copy attributes unchanged */ if (!nlmsg_copy_all_nla(hdr, sizeof(struct ifaddrmsg), nw)) return (false); nlmsg_end(nw); RT_LOG(LOG_DEBUG2, "done processing nw %p", nw); return (true); } static bool rtnl_newneigh_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_writer *nw) { if (!nlmsg_copy_header(hdr, nw)) return (false); struct ndmsg *ndm; ndm = nlmsg_copy_next_header(hdr, nw, struct ndmsg); ndm->ndm_family = bsd_to_linux_domain(ndm->ndm_family); /* Copy attributes unchanged */ if (!nlmsg_copy_all_nla(hdr, sizeof(struct ndmsg), nw)) return (false); nlmsg_end(nw); RT_LOG(LOG_DEBUG2, "done processing nw %p", nw); return (true); } static bool rtnl_newroute_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_writer *nw) { if (!nlmsg_copy_header(hdr, nw)) return (false); struct rtmsg *rtm; rtm = nlmsg_copy_next_header(hdr, nw, struct rtmsg); rtm->rtm_family = bsd_to_linux_domain(rtm->rtm_family); struct nlattr *nla; int hdrlen = NETLINK_ALIGN(sizeof(struct rtmsg)); int attrs_len = hdr->nlmsg_len - sizeof(struct nlmsghdr) - hdrlen; struct nlattr *nla_head = (struct nlattr *)((char *)(hdr + 1) + hdrlen); NLA_FOREACH(nla, nla_head, attrs_len) { struct rtattr *rta = (struct rtattr *)nla; //RT_LOG(LOG_DEBUG, "READING attr %d len %d", nla->nla_type, nla->nla_len); if (rta->rta_len < sizeof(struct rtattr)) { break; } switch (rta->rta_type) { case NL_RTA_TABLE: { uint32_t fibnum; fibnum = _rta_get_uint32(rta); if (fibnum == 0) fibnum = 254; RT_LOG(LOG_DEBUG3, "XFIBNUM %u", fibnum); if (!nlattr_add_u32(nw, NL_RTA_TABLE, fibnum)) return (false); } break; default: if (!nlmsg_copy_nla(nla, nw)) return (false); break; } } nlmsg_end(nw); RT_LOG(LOG_DEBUG2, "done processing nw %p", nw); return (true); } static bool rtnl_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_writer *nw) { RT_LOG(LOG_DEBUG2, "Got message type %d", hdr->nlmsg_type); switch (hdr->nlmsg_type) { case NL_RTM_NEWLINK: case NL_RTM_DELLINK: case NL_RTM_GETLINK: return 
(rtnl_newlink_to_linux(hdr, nlp, nw)); case NL_RTM_NEWADDR: case NL_RTM_DELADDR: return (rtnl_newaddr_to_linux(hdr, nlp, nw)); case NL_RTM_NEWROUTE: case NL_RTM_DELROUTE: return (rtnl_newroute_to_linux(hdr, nlp, nw)); case NL_RTM_NEWNEIGH: case NL_RTM_DELNEIGH: case NL_RTM_GETNEIGH: return (rtnl_newneigh_to_linux(hdr, nlp, nw)); default: RT_LOG(LOG_DEBUG, "[WARN] Passing message type %d untranslated", hdr->nlmsg_type); return (handle_default_out(hdr, nw)); } } static bool nlmsg_error_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_writer *nw) { if (!nlmsg_copy_header(hdr, nw)) return (false); struct nlmsgerr *nlerr; nlerr = nlmsg_copy_next_header(hdr, nw, struct nlmsgerr); nlerr->error = bsd_to_linux_errno(nlerr->error); int copied_len = sizeof(struct nlmsghdr) + sizeof(struct nlmsgerr); if (hdr->nlmsg_len == copied_len) { nlmsg_end(nw); return (true); } /* * CAP_ACK was not set. Original request needs to be translated. * XXX: implement translation of the original message */ RT_LOG(LOG_DEBUG, "[WARN] Passing ack message type %d untranslated", nlerr->msg.nlmsg_type); char *dst_payload, *src_payload; int copy_len = hdr->nlmsg_len - copied_len; dst_payload = nlmsg_reserve_data(nw, NLMSG_ALIGN(copy_len), char); src_payload = (char *)hdr + copied_len; memcpy(dst_payload, src_payload, copy_len); nlmsg_end(nw); return (true); } static bool -nlmsg_to_linux(int netlink_family, struct nlmsghdr *hdr, struct nlpcb *nlp, - struct nl_writer *nw) +nlmsg_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_writer *nw) { if (hdr->nlmsg_type < NLMSG_MIN_TYPE) { switch (hdr->nlmsg_type) { case NLMSG_ERROR: return (nlmsg_error_to_linux(hdr, nlp, nw)); case NLMSG_NOOP: case NLMSG_DONE: case NLMSG_OVERRUN: return (handle_default_out(hdr, nw)); default: RT_LOG(LOG_DEBUG, "[WARN] Passing message type %d untranslated", hdr->nlmsg_type); return (handle_default_out(hdr, nw)); } } - switch (netlink_family) { + switch (nlp->nl_proto) { case NETLINK_ROUTE: return 
(rtnl_to_linux(hdr, nlp, nw)); default: return (handle_default_out(hdr, nw)); } } -static struct mbuf * -nlmsgs_to_linux(int netlink_family, char *buf, int data_length, struct nlpcb *nlp) +static bool +nlmsgs_to_linux(struct nl_writer *nw, struct nlpcb *nlp) { - RT_LOG(LOG_DEBUG3, "LINUX: get %p size %d", buf, data_length); - struct nl_writer nw = {}; - - struct mbuf *m = NULL; - if (!nlmsg_get_chain_writer(&nw, data_length, &m)) { - RT_LOG(LOG_DEBUG, "unable to setup chain writer for size %d", - data_length); - return (NULL); - } + struct nl_buf *nb, *orig; + u_int offset, msglen, orig_messages __diagused; + + RT_LOG(LOG_DEBUG3, "%p: in %u bytes %u messages", __func__, + nw->buf->datalen, nw->num_messages); + + orig = nw->buf; + nb = nl_buf_alloc(orig->datalen + SCRATCH_BUFFER_SIZE, M_NOWAIT); + if (__predict_false(nb == NULL)) + return (false); + nw->buf = nb; +#ifdef INVARIANTS + orig_messages = nw->num_messages; +#endif + nw->num_messages = 0; /* Assume correct headers. Buffer IS mutable */ - int count = 0; - for (int offset = 0; offset + sizeof(struct nlmsghdr) <= data_length;) { - struct nlmsghdr *hdr = (struct nlmsghdr *)&buf[offset]; - int msglen = NLMSG_ALIGN(hdr->nlmsg_len); - count++; + for (offset = 0; + offset + sizeof(struct nlmsghdr) <= orig->datalen; + offset += msglen) { + struct nlmsghdr *hdr = (struct nlmsghdr *)&orig->data[offset]; - if (!nlmsg_to_linux(netlink_family, hdr, nlp, &nw)) { + msglen = NLMSG_ALIGN(hdr->nlmsg_len); + if (!nlmsg_to_linux(hdr, nlp, nw)) { RT_LOG(LOG_DEBUG, "failed to process msg type %d", hdr->nlmsg_type); - m_freem(m); - return (NULL); + nl_buf_free(nb); + return (false); } - offset += msglen; } - nlmsg_flush(&nw); - RT_LOG(LOG_DEBUG3, "Processed %d messages, chain size %d", count, - m ? 
m_length(m, NULL) : 0); - return (m); -} + MPASS(nw->num_messages == orig_messages); + MPASS(nw->buf == nb); + nl_buf_free(orig); + RT_LOG(LOG_DEBUG3, "%p: out %u bytes", __func__, offset); -static struct mbuf * -mbufs_to_linux(int netlink_family, struct mbuf *m, struct nlpcb *nlp) -{ - /* XXX: easiest solution, not optimized for performance */ - int data_length = m_length(m, NULL); - char *buf = malloc(data_length, M_LINUX, M_NOWAIT); - if (buf == NULL) { - RT_LOG(LOG_DEBUG, "unable to allocate %d bytes, dropping message", - data_length); - m_freem(m); - return (NULL); - } - m_copydata(m, 0, data_length, buf); - m_freem(m); - - m = nlmsgs_to_linux(netlink_family, buf, data_length, nlp); - free(buf, M_LINUX); - - return (m); + return (true); } static struct linux_netlink_provider linux_netlink_v1 = { - .mbufs_to_linux = mbufs_to_linux, .msgs_to_linux = nlmsgs_to_linux, .msg_from_linux = nlmsg_from_linux, }; void linux_netlink_register(void) { linux_netlink_p = &linux_netlink_v1; } void linux_netlink_deregister(void) { linux_netlink_p = NULL; } diff --git a/sys/netlink/ktest_netlink_message_writer.c b/sys/netlink/ktest_netlink_message_writer.c index e46065dd4bd2..805f52197f69 100644 --- a/sys/netlink/ktest_netlink_message_writer.c +++ b/sys/netlink/ktest_netlink_message_writer.c @@ -1,167 +1,113 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2023 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include -#include #include #include +#include #include #define KTEST_CALLER #include #ifdef INVARIANTS -struct test_mbuf_attrs { +struct test_nlbuf_attrs { uint32_t size; uint32_t expected_avail; - uint32_t expected_count; - uint32_t wtype; int waitok; }; -#define _OUT(_field) offsetof(struct test_mbuf_attrs, _field) -static const struct nlattr_parser nla_p_mbuf_w[] = { +#define _OUT(_field) offsetof(struct test_nlbuf_attrs, _field) +static const struct nlattr_parser nla_p_nlbuf_w[] = { { .type = 1, .off = _OUT(size), .cb = nlattr_get_uint32 }, { .type = 2, .off = _OUT(expected_avail), .cb = nlattr_get_uint32 }, - { .type = 3, .off = _OUT(expected_count), .cb = nlattr_get_uint32 }, - { .type = 4, .off = _OUT(wtype), .cb = nlattr_get_uint32 }, - { .type = 5, .off = _OUT(waitok), .cb = nlattr_get_uint32 }, + { .type = 3, .off = _OUT(waitok), .cb = nlattr_get_uint32 }, }; #undef _OUT -NL_DECLARE_ATTR_PARSER(mbuf_w_parser, nla_p_mbuf_w); +NL_DECLARE_ATTR_PARSER(nlbuf_w_parser, nla_p_nlbuf_w); static int -test_mbuf_parser(struct ktest_test_context *ctx, struct nlattr *nla) +test_nlbuf_parser(struct ktest_test_context *ctx, struct nlattr *nla) { - struct test_mbuf_attrs *attrs = 
npt_alloc(ctx->npt, sizeof(*attrs)); + struct test_nlbuf_attrs *attrs = npt_alloc(ctx->npt, sizeof(*attrs)); ctx->arg = attrs; if (attrs != NULL) - return (nl_parse_nested(nla, &mbuf_w_parser, ctx->npt, attrs)); + return (nl_parse_nested(nla, &nlbuf_w_parser, ctx->npt, attrs)); return (ENOMEM); } static int -test_mbuf_writer_allocation(struct ktest_test_context *ctx) +test_nlbuf_writer_allocation(struct ktest_test_context *ctx) { - struct test_mbuf_attrs *attrs = ctx->arg; - bool ret; + struct test_nlbuf_attrs *attrs = ctx->arg; struct nl_writer nw = {}; + u_int alloc_len; + bool ret; - ret = nlmsg_get_buf_type_wrapper(&nw, attrs->size, attrs->wtype, attrs->waitok); + ret = nlmsg_get_buf_wrapper(&nw, attrs->size, attrs->waitok); if (!ret) return (EINVAL); - int alloc_len = nw.alloc_len; + alloc_len = nw.buf->buflen; KTEST_LOG(ctx, "requested %u, allocated %d", attrs->size, alloc_len); - /* Set cleanup callback */ - nw.writer_target = NS_WRITER_TARGET_SOCKET; - nlmsg_set_callback_wrapper(&nw); - /* Mark enomem to avoid reallocation */ nw.enomem = true; if (nlmsg_reserve_data(&nw, alloc_len, void *) == NULL) { KTEST_LOG(ctx, "unable to get %d bytes from the writer", alloc_len); return (EINVAL); } - /* Mark as empty to free the storage */ - nw.offset = 0; - nlmsg_flush(&nw); + nl_buf_free(nw.buf); if (alloc_len < attrs->expected_avail) { KTEST_LOG(ctx, "alloc_len %d, expected %u", alloc_len, attrs->expected_avail); return (EINVAL); } return (0); } - -static int -test_mbuf_chain_allocation(struct ktest_test_context *ctx) -{ - struct test_mbuf_attrs *attrs = ctx->arg; - int mflags = attrs->waitok ? 
M_WAITOK : M_NOWAIT; - struct mbuf *chain = nl_get_mbuf_chain_wrapper(attrs->size, mflags); - - if (chain == NULL) { - KTEST_LOG(ctx, "nl_get_mbuf_chain(%u) returned NULL", attrs->size); - return (EINVAL); - } - - /* Iterate and check number of mbufs and space */ - uint32_t allocated_count = 0, allocated_size = 0; - for (struct mbuf *m = chain; m != NULL; m = m->m_next) { - allocated_count++; - allocated_size += M_SIZE(m); - } - m_freem(chain); - - if (attrs->expected_avail > allocated_size) { - KTEST_LOG(ctx, "expected/allocated avail(bytes) %u/%u" - " expected/allocated count %u/%u", - attrs->expected_avail, allocated_size, - attrs->expected_count, allocated_count); - return (EINVAL); - } - - if (attrs->expected_count > 0 && (attrs->expected_count != allocated_count)) { - KTEST_LOG(ctx, "expected/allocated avail(bytes) %u/%u" - " expected/allocated count %u/%u", - attrs->expected_avail, allocated_size, - attrs->expected_count, allocated_count); - return (EINVAL); - } - - return (0); -} #endif static const struct ktest_test_info tests[] = { #ifdef INVARIANTS { - .name = "test_mbuf_writer_allocation", - .desc = "test different mbuf sizes in the mbuf writer", - .func = &test_mbuf_writer_allocation, - .parse = &test_mbuf_parser, - }, - { - .name = "test_mbuf_chain_allocation", - .desc = "verify allocation different chain sizes", - .func = &test_mbuf_chain_allocation, - .parse = &test_mbuf_parser, + .name = "test_nlbuf_writer_allocation", + .desc = "test different buffer sizes in the netlink writer", + .func = &test_nlbuf_writer_allocation, + .parse = &test_nlbuf_parser, }, #endif }; KTEST_MODULE_DECLARE(ktest_netlink_message_writer, tests); diff --git a/sys/netlink/ktest_netlink_message_writer.h b/sys/netlink/ktest_netlink_message_writer.h index b7864bea59c9..39d2c5e597d6 100644 --- a/sys/netlink/ktest_netlink_message_writer.h +++ b/sys/netlink/ktest_netlink_message_writer.h @@ -1,60 +1,46 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2023 
Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #ifndef _NETLINK_KTEST_NETLINK_MESSAGE_WRITER_H_ #define _NETLINK_KTEST_NETLINK_MESSAGE_WRITER_H_ #if defined(_KERNEL) && defined(INVARIANTS) -bool nlmsg_get_buf_type_wrapper(struct nl_writer *nw, int size, int type, bool waitok); -void nlmsg_set_callback_wrapper(struct nl_writer *nw); -struct mbuf *nl_get_mbuf_chain_wrapper(int len, int malloc_flags); +bool nlmsg_get_buf_wrapper(struct nl_writer *nw, u_int size, bool waitok); #ifndef KTEST_CALLER bool -nlmsg_get_buf_type_wrapper(struct nl_writer *nw, int size, int type, bool waitok) +nlmsg_get_buf_wrapper(struct nl_writer *nw, u_int size, bool waitok) { - return (nlmsg_get_buf_type(nw, size, type, waitok)); -} - -void -nlmsg_set_callback_wrapper(struct nl_writer *nw) -{ - nlmsg_set_callback(nw); -} - -struct mbuf * -nl_get_mbuf_chain_wrapper(int len, int malloc_flags) -{ - return (nl_get_mbuf_chain(len, malloc_flags)); + return (nlmsg_get_buf(nw, size, waitok)); } #endif #endif #endif diff --git a/sys/netlink/netlink_domain.c b/sys/netlink/netlink_domain.c index ecd110d62c1f..3914d402fc04 100644 --- a/sys/netlink/netlink_domain.c +++ b/sys/netlink/netlink_domain.c @@ -1,833 +1,936 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2021 Ng Peng Nam Sean * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This file contains socket and protocol bindings for netlink. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* priv_check */ #include #include #include #include #define DEBUG_MOD_NAME nl_domain #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include _DECLARE_DEBUG(LOG_INFO); _Static_assert((NLP_MAX_GROUPS % 64) == 0, "NLP_MAX_GROUPS has to be multiple of 64"); _Static_assert(NLP_MAX_GROUPS >= 64, "NLP_MAX_GROUPS has to be at least 64"); #define NLCTL_TRACKER struct rm_priotracker nl_tracker #define NLCTL_RLOCK(_ctl) rm_rlock(&((_ctl)->ctl_lock), &nl_tracker) #define NLCTL_RUNLOCK(_ctl) rm_runlock(&((_ctl)->ctl_lock), &nl_tracker) #define NLCTL_WLOCK(_ctl) rm_wlock(&((_ctl)->ctl_lock)) #define NLCTL_WUNLOCK(_ctl) rm_wunlock(&((_ctl)->ctl_lock)) static u_long nl_sendspace = NLSNDQ; SYSCTL_ULONG(_net_netlink, OID_AUTO, sendspace, CTLFLAG_RW, &nl_sendspace, 0, "Default netlink socket send space"); static u_long nl_recvspace = NLSNDQ; SYSCTL_ULONG(_net_netlink, OID_AUTO, recvspace, CTLFLAG_RW, &nl_recvspace, 0, "Default netlink socket receive space"); extern u_long sb_max_adj; static u_long nl_maxsockbuf = 512 * 1024 * 
1024; /* 512M, XXX: init based on physmem */ static int sysctl_handle_nl_maxsockbuf(SYSCTL_HANDLER_ARGS); SYSCTL_OID(_net_netlink, OID_AUTO, nl_maxsockbuf, CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, &nl_maxsockbuf, 0, sysctl_handle_nl_maxsockbuf, "LU", "Maximum Netlink socket buffer size"); static unsigned int osd_slot_id = 0; void nl_osd_register(void) { osd_slot_id = osd_register(OSD_THREAD, NULL, NULL); } void nl_osd_unregister(void) { osd_deregister(OSD_THREAD, osd_slot_id); } struct nlpcb * _nl_get_thread_nlp(struct thread *td) { return (osd_get(OSD_THREAD, &td->td_osd, osd_slot_id)); } void nl_set_thread_nlp(struct thread *td, struct nlpcb *nlp) { NLP_LOG(LOG_DEBUG2, nlp, "Set thread %p nlp to %p (slot %u)", td, nlp, osd_slot_id); if (osd_set(OSD_THREAD, &td->td_osd, osd_slot_id, nlp) == 0) return; /* Failed, need to realloc */ void **rsv = osd_reserve(osd_slot_id); osd_set_reserved(OSD_THREAD, &td->td_osd, osd_slot_id, rsv, nlp); } /* * Looks up a nlpcb struct based on the @portid. Need to claim nlsock_mtx. 
* Returns nlpcb pointer if present else NULL */ static struct nlpcb * nl_port_lookup(uint32_t port_id) { struct nlpcb *nlp; CK_LIST_FOREACH(nlp, &V_nl_ctl->ctl_port_head, nl_port_next) { if (nlp->nl_port == port_id) return (nlp); } return (NULL); } static void nl_add_group_locked(struct nlpcb *nlp, unsigned int group_id) { MPASS(group_id <= NLP_MAX_GROUPS); --group_id; /* TODO: add family handler callback */ if (!nlp_unconstrained_vnet(nlp)) return; nlp->nl_groups[group_id / 64] |= (uint64_t)1 << (group_id % 64); } static void nl_del_group_locked(struct nlpcb *nlp, unsigned int group_id) { MPASS(group_id <= NLP_MAX_GROUPS); --group_id; nlp->nl_groups[group_id / 64] &= ~((uint64_t)1 << (group_id % 64)); } static bool nl_isset_group_locked(struct nlpcb *nlp, unsigned int group_id) { MPASS(group_id <= NLP_MAX_GROUPS); --group_id; return (nlp->nl_groups[group_id / 64] & ((uint64_t)1 << (group_id % 64))); } static uint32_t nl_get_groups_compat(struct nlpcb *nlp) { uint32_t groups_mask = 0; for (int i = 0; i < 32; i++) { if (nl_isset_group_locked(nlp, i + 1)) groups_mask |= (1 << i); } return (groups_mask); } static void -nl_send_one_group(struct mbuf *m, struct nlpcb *nlp, int num_messages, - int io_flags) +nl_send_one_group(struct nl_writer *nw, struct nl_buf *nb, struct nlpcb *nlp) { if (__predict_false(nlp->nl_flags & NLF_MSG_INFO)) - nl_add_msg_info(m); - nl_send_one(m, nlp, num_messages, io_flags); + nl_add_msg_info(nb); + nw->buf = nb; + (void)nl_send_one(nw); +} + +static struct nl_buf * +nl_buf_copy(struct nl_buf *nb) +{ + struct nl_buf *copy; + + copy = nl_buf_alloc(nb->buflen, M_NOWAIT); + if (__predict_false(copy == NULL)) + return (NULL); + memcpy(copy, nb, sizeof(*nb) + nb->buflen); + if (nb->control != NULL) { + copy->control = m_copym(nb->control, 0, M_COPYALL, M_NOWAIT); + if (__predict_false(copy->control == NULL)) { + nl_buf_free(copy); + return (NULL); + } + } + + return (copy); } /* - * Broadcasts message @m to the protocol @proto group specified by 
@group_id + * Broadcasts in the writer's buffer. */ -void -nl_send_group(struct mbuf *m, int num_messages, int proto, int group_id) +bool +nl_send_group(struct nl_writer *nw) { + struct nl_buf *nb = nw->buf; struct nlpcb *nlp_last = NULL; struct nlpcb *nlp; NLCTL_TRACKER; IF_DEBUG_LEVEL(LOG_DEBUG2) { - struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *); - NL_LOG(LOG_DEBUG2, "MCAST mbuf len %u msg type %d len %u to group %d/%d", - m->m_len, hdr->nlmsg_type, hdr->nlmsg_len, proto, group_id); + struct nlmsghdr *hdr = (struct nlmsghdr *)nb->data; + NL_LOG(LOG_DEBUG2, "MCAST len %u msg type %d len %u to group %d/%d", + nb->datalen, hdr->nlmsg_type, hdr->nlmsg_len, + nw->group.proto, nw->group.id); } + nw->buf = NULL; + struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); if (__predict_false(ctl == NULL)) { /* * Can be the case when notification is sent within VNET * which doesn't have any netlink sockets. */ - m_freem(m); - return; + nl_buf_free(nb); + return (false); } NLCTL_RLOCK(ctl); - int io_flags = NL_IOF_UNTRANSLATED; - CK_LIST_FOREACH(nlp, &ctl->ctl_pcb_head, nl_next) { - if (nl_isset_group_locked(nlp, group_id) && nlp->nl_proto == proto) { + if (nl_isset_group_locked(nlp, nw->group.id) && + nlp->nl_proto == nw->group.proto) { if (nlp_last != NULL) { - struct mbuf *m_copy; - m_copy = m_copym(m, 0, M_COPYALL, M_NOWAIT); - if (m_copy != NULL) - nl_send_one_group(m_copy, nlp_last, - num_messages, io_flags); - else { + struct nl_buf *copy; + + copy = nl_buf_copy(nb); + if (copy != NULL) { + nl_send_one_group(nw, copy, nlp_last); + } else { NLP_LOCK(nlp_last); if (nlp_last->nl_socket != NULL) sorwakeup(nlp_last->nl_socket); NLP_UNLOCK(nlp_last); } } nlp_last = nlp; } } if (nlp_last != NULL) - nl_send_one_group(m, nlp_last, num_messages, io_flags); + nl_send_one_group(nw, nb, nlp_last); else - m_freem(m); + nl_buf_free(nb); NLCTL_RUNLOCK(ctl); + + return (true); } bool nl_has_listeners(int netlink_family, uint32_t groups_mask) { return (V_nl_ctl != NULL); } static 
uint32_t nl_find_port(void) { /* * app can open multiple netlink sockets. * Start with current pid, if already taken, * try random numbers in 65k..256k+65k space, * avoiding clash with pids. */ if (nl_port_lookup(curproc->p_pid) == NULL) return (curproc->p_pid); for (int i = 0; i < 16; i++) { uint32_t nl_port = (arc4random() % 65536) + 65536 * 4; if (nl_port_lookup(nl_port) == 0) return (nl_port); NL_LOG(LOG_DEBUG3, "tried %u\n", nl_port); } return (curproc->p_pid); } static int nl_bind_locked(struct nlpcb *nlp, struct sockaddr_nl *snl) { if (nlp->nl_bound) { if (nlp->nl_port != snl->nl_pid) { NL_LOG(LOG_DEBUG, "bind() failed: program pid %d " "is different from provided pid %d", nlp->nl_port, snl->nl_pid); return (EINVAL); // XXX: better error } } else { if (snl->nl_pid == 0) snl->nl_pid = nl_find_port(); if (nl_port_lookup(snl->nl_pid) != NULL) return (EADDRINUSE); nlp->nl_port = snl->nl_pid; nlp->nl_bound = true; CK_LIST_INSERT_HEAD(&V_nl_ctl->ctl_port_head, nlp, nl_port_next); } for (int i = 0; i < 32; i++) { if (snl->nl_groups & ((uint32_t)1 << i)) nl_add_group_locked(nlp, i + 1); else nl_del_group_locked(nlp, i + 1); } return (0); } static int nl_pru_attach(struct socket *so, int proto, struct thread *td) { struct nlpcb *nlp; int error; if (__predict_false(netlink_unloading != 0)) return (EAFNOSUPPORT); error = nl_verify_proto(proto); if (error != 0) return (error); bool is_linux = SV_PROC_ABI(td->td_proc) == SV_ABI_LINUX; NL_LOG(LOG_DEBUG2, "socket %p, %sPID %d: attaching socket to %s", so, is_linux ? 
"(linux) " : "", curproc->p_pid, nl_get_proto_name(proto)); /* Create per-VNET state on first socket init */ struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); if (ctl == NULL) ctl = vnet_nl_ctl_init(); KASSERT(V_nl_ctl != NULL, ("nl_attach: vnet_sock_init() failed")); MPASS(sotonlpcb(so) == NULL); nlp = malloc(sizeof(struct nlpcb), M_PCB, M_WAITOK | M_ZERO); error = soreserve(so, nl_sendspace, nl_recvspace); if (error != 0) { free(nlp, M_PCB); return (error); } - so->so_rcv.sb_mtx = &so->so_rcv_mtx; + TAILQ_INIT(&so->so_rcv.nl_queue); TAILQ_INIT(&so->so_snd.nl_queue); so->so_pcb = nlp; nlp->nl_socket = so; /* Copy so_cred to avoid having socket_var.h in every header */ nlp->nl_cred = so->so_cred; nlp->nl_proto = proto; nlp->nl_process_id = curproc->p_pid; nlp->nl_linux = is_linux; nlp->nl_unconstrained_vnet = !jailed_without_vnet(so->so_cred); nlp->nl_need_thread_setup = true; NLP_LOCK_INIT(nlp); refcount_init(&nlp->nl_refcount, 1); - nl_init_io(nlp); nlp->nl_taskqueue = taskqueue_create("netlink_socket", M_WAITOK, taskqueue_thread_enqueue, &nlp->nl_taskqueue); TASK_INIT(&nlp->nl_task, 0, nl_taskqueue_handler, nlp); taskqueue_start_threads(&nlp->nl_taskqueue, 1, PWAIT, "netlink_socket (PID %u)", nlp->nl_process_id); NLCTL_WLOCK(ctl); /* XXX: check ctl is still alive */ CK_LIST_INSERT_HEAD(&ctl->ctl_pcb_head, nlp, nl_next); NLCTL_WUNLOCK(ctl); soisconnected(so); return (0); } static int nl_pru_bind(struct socket *so, struct sockaddr *sa, struct thread *td) { struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); struct nlpcb *nlp = sotonlpcb(so); struct sockaddr_nl *snl = (struct sockaddr_nl *)sa; int error; NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); if (snl->nl_len != sizeof(*snl)) { NL_LOG(LOG_DEBUG, "socket %p, wrong sizeof(), ignoring bind()", so); return (EINVAL); } NLCTL_WLOCK(ctl); NLP_LOCK(nlp); error = nl_bind_locked(nlp, snl); NLP_UNLOCK(nlp); NLCTL_WUNLOCK(ctl); NL_LOG(LOG_DEBUG2, "socket %p, bind() to %u, groups %u, error %d", so, 
snl->nl_pid, snl->nl_groups, error); return (error); } static int nl_assign_port(struct nlpcb *nlp, uint32_t port_id) { struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); struct sockaddr_nl snl = { .nl_pid = port_id, }; int error; NLCTL_WLOCK(ctl); NLP_LOCK(nlp); snl.nl_groups = nl_get_groups_compat(nlp); error = nl_bind_locked(nlp, &snl); NLP_UNLOCK(nlp); NLCTL_WUNLOCK(ctl); NL_LOG(LOG_DEBUG3, "socket %p, port assign: %d, error: %d", nlp->nl_socket, port_id, error); return (error); } /* * nl_autobind_port binds a unused portid to @nlp * @nlp: pcb data for the netlink socket * @candidate_id: first id to consider */ static int nl_autobind_port(struct nlpcb *nlp, uint32_t candidate_id) { struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); uint32_t port_id = candidate_id; NLCTL_TRACKER; bool exist; int error = EADDRINUSE; for (int i = 0; i < 10; i++) { NL_LOG(LOG_DEBUG3, "socket %p, trying to assign port %d", nlp->nl_socket, port_id); NLCTL_RLOCK(ctl); exist = nl_port_lookup(port_id) != 0; NLCTL_RUNLOCK(ctl); if (!exist) { error = nl_assign_port(nlp, port_id); if (error != EADDRINUSE) break; } port_id++; } NL_LOG(LOG_DEBUG3, "socket %p, autobind to %d, error: %d", nlp->nl_socket, port_id, error); return (error); } static int nl_pru_connect(struct socket *so, struct sockaddr *sa, struct thread *td) { struct sockaddr_nl *snl = (struct sockaddr_nl *)sa; struct nlpcb *nlp; NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); if (snl->nl_len != sizeof(*snl)) { NL_LOG(LOG_DEBUG, "socket %p, wrong sizeof(), ignoring bind()", so); return (EINVAL); } nlp = sotonlpcb(so); if (!nlp->nl_bound) { int error = nl_autobind_port(nlp, td->td_proc->p_pid); if (error != 0) { NL_LOG(LOG_DEBUG, "socket %p, nl_autobind() failed: %d", so, error); return (error); } } /* XXX: Handle socket flags & multicast */ soisconnected(so); NL_LOG(LOG_DEBUG2, "socket %p, connect to %u", so, snl->nl_pid); return (0); } -static void -destroy_nlpcb(struct nlpcb *nlp) -{ - NLP_LOCK(nlp); - 
nl_free_io(nlp); - NLP_LOCK_DESTROY(nlp); - free(nlp, M_PCB); -} - static void destroy_nlpcb_epoch(epoch_context_t ctx) { struct nlpcb *nlp; nlp = __containerof(ctx, struct nlpcb, nl_epoch_ctx); - destroy_nlpcb(nlp); + NLP_LOCK_DESTROY(nlp); + free(nlp, M_PCB); } - static void nl_close(struct socket *so) { struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); MPASS(sotonlpcb(so) != NULL); struct nlpcb *nlp; struct nl_buf *nb; NL_LOG(LOG_DEBUG2, "detaching socket %p, PID %d", so, curproc->p_pid); nlp = sotonlpcb(so); /* Mark as inactive so no new work can be enqueued */ NLP_LOCK(nlp); bool was_bound = nlp->nl_bound; NLP_UNLOCK(nlp); /* Wait till all scheduled work has been completed */ taskqueue_drain_all(nlp->nl_taskqueue); taskqueue_free(nlp->nl_taskqueue); NLCTL_WLOCK(ctl); NLP_LOCK(nlp); if (was_bound) { CK_LIST_REMOVE(nlp, nl_port_next); NL_LOG(LOG_DEBUG3, "socket %p, unlinking bound pid %u", so, nlp->nl_port); } CK_LIST_REMOVE(nlp, nl_next); nlp->nl_socket = NULL; NLP_UNLOCK(nlp); NLCTL_WUNLOCK(ctl); so->so_pcb = NULL; while ((nb = TAILQ_FIRST(&so->so_snd.nl_queue)) != NULL) { TAILQ_REMOVE(&so->so_snd.nl_queue, nb, tailq); - free(nb, M_NETLINK); + nl_buf_free(nb); + } + while ((nb = TAILQ_FIRST(&so->so_rcv.nl_queue)) != NULL) { + TAILQ_REMOVE(&so->so_rcv.nl_queue, nb, tailq); + nl_buf_free(nb); } - sbdestroy(so, SO_RCV); NL_LOG(LOG_DEBUG3, "socket %p, detached", so); /* XXX: is delayed free needed? 
*/ NET_EPOCH_CALL(destroy_nlpcb_epoch, &nlp->nl_epoch_ctx); } static int nl_pru_disconnect(struct socket *so) { NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); MPASS(sotonlpcb(so) != NULL); return (ENOTCONN); } static int nl_pru_shutdown(struct socket *so) { NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); MPASS(sotonlpcb(so) != NULL); socantsendmore(so); return (0); } static int nl_sockaddr(struct socket *so, struct sockaddr *sa) { *(struct sockaddr_nl *)sa = (struct sockaddr_nl ){ /* TODO: set other fields */ .nl_len = sizeof(struct sockaddr_nl), .nl_family = AF_NETLINK, .nl_pid = sotonlpcb(so)->nl_port, }; return (0); } static int nl_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *m, struct mbuf *control, int flags, struct thread *td) { struct nlpcb *nlp = sotonlpcb(so); struct sockbuf *sb = &so->so_snd; struct nl_buf *nb; u_int len; int error; MPASS(m == NULL && uio != NULL); NL_LOG(LOG_DEBUG2, "sending message to kernel"); if (__predict_false(control != NULL)) { m_freem(control); return (EINVAL); } if (__predict_false(flags & MSG_OOB)) /* XXXGL: or just ignore? */ return (EOPNOTSUPP); if (__predict_false(uio->uio_resid < sizeof(struct nlmsghdr))) return (ENOBUFS); /* XXXGL: any better error? 
*/ NL_LOG(LOG_DEBUG3, "sending message to kernel async processing"); error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags)); if (error) return (error); len = roundup2(uio->uio_resid, 8) + SCRATCH_BUFFER_SIZE; if (nlp->nl_linux) len += roundup2(uio->uio_resid, 8); - nb = malloc(sizeof(*nb) + len, M_NETLINK, M_WAITOK); + nb = nl_buf_alloc(len, M_WAITOK); nb->datalen = uio->uio_resid; - nb->buflen = len; - nb->offset = 0; error = uiomove(&nb->data[0], uio->uio_resid, uio); if (__predict_false(error)) goto out; SOCK_SENDBUF_LOCK(so); restart: if (sb->sb_hiwat - sb->sb_ccc >= nb->datalen) { TAILQ_INSERT_TAIL(&sb->nl_queue, nb, tailq); sb->sb_acc += nb->datalen; sb->sb_ccc += nb->datalen; nb = NULL; } else if ((so->so_state & SS_NBIO) || (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { SOCK_SENDBUF_UNLOCK(so); error = EWOULDBLOCK; goto out; } else { if ((error = sbwait(so, SO_SND)) != 0) { SOCK_SENDBUF_UNLOCK(so); goto out; } else goto restart; } SOCK_SENDBUF_UNLOCK(so); if (nb == NULL) { NL_LOG(LOG_DEBUG3, "enqueue %u bytes", nb->datalen); NLP_LOCK(nlp); nl_schedule_taskqueue(nlp); NLP_UNLOCK(nlp); } out: SOCK_IO_SEND_UNLOCK(so); - free(nb, M_NETLINK); + if (nb != NULL) + nl_buf_free(nb); return (error); } static int -nl_pru_rcvd(struct socket *so, int flags) +nl_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, + struct mbuf **mp, struct mbuf **controlp, int *flagsp) { + static const struct sockaddr_nl nl_empty_src = { + .nl_len = sizeof(struct sockaddr_nl), + .nl_family = PF_NETLINK, + .nl_pid = 0 /* comes from the kernel */ + }; + struct sockbuf *sb = &so->so_rcv; + struct nl_buf *nb; + int flags, error; + u_int overflow; + bool nonblock, trunc, peek; + + MPASS(mp == NULL && uio != NULL); + NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); - MPASS(sotonlpcb(so) != NULL); + + if (psa != NULL) + *psa = sodupsockaddr((const struct sockaddr *)&nl_empty_src, + M_WAITOK); + + flags = flagsp != NULL ? *flagsp & ~MSG_TRUNC : 0; + trunc = flagsp != NULL ? 
*flagsp & MSG_TRUNC : false; + nonblock = (so->so_state & SS_NBIO) || + (flags & (MSG_DONTWAIT | MSG_NBIO)); + peek = flags & MSG_PEEK; + + error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); + if (__predict_false(error)) + return (error); + + SOCK_RECVBUF_LOCK(so); + while ((nb = TAILQ_FIRST(&sb->nl_queue)) == NULL) { + if (nonblock) { + SOCK_RECVBUF_UNLOCK(so); + SOCK_IO_RECV_UNLOCK(so); + return (EWOULDBLOCK); + } + error = sbwait(so, SO_RCV); + if (error) { + SOCK_RECVBUF_UNLOCK(so); + SOCK_IO_RECV_UNLOCK(so); + return (error); + } + } + + /* + * XXXGL + * Here we emulate a PR_ATOMIC behavior of soreceive_generic() where + * we take only the first "record" in the socket buffer and send it + * to uio whole or truncated ignoring how many netlink messages are + * in the record and how much space is left in the uio. + * This needs to be fixed at next refactoring. First, we should perform + * truncation only if the very first message doesn't fit into uio. + * That will help an application with small buffer not to lose data. + * Second, we should continue working on the sb->nl_queue as long as + * there is more space in the uio. That will boost applications with + * large buffers. + */ + if (__predict_true(!peek)) { + TAILQ_REMOVE(&sb->nl_queue, nb, tailq); + sb->sb_acc -= nb->datalen; + sb->sb_ccc -= nb->datalen; + } + SOCK_RECVBUF_UNLOCK(so); + + overflow = __predict_false(nb->datalen > uio->uio_resid) ? 
+ nb->datalen - uio->uio_resid : 0; + error = uiomove(nb->data, (int)nb->datalen, uio); + if (__predict_false(overflow > 0)) { + flags |= MSG_TRUNC; + if (trunc) + uio->uio_resid -= overflow; + } + + if (controlp != NULL) { + *controlp = nb->control; + nb->control = NULL; + } + + if (__predict_true(!peek)) + nl_buf_free(nb); + + if (uio->uio_td) + uio->uio_td->td_ru.ru_msgrcv++; + + if (flagsp != NULL) + *flagsp |= flags; + + SOCK_IO_RECV_UNLOCK(so); nl_on_transmit(sotonlpcb(so)); - return (0); + return (error); } static int nl_getoptflag(int sopt_name) { switch (sopt_name) { case NETLINK_CAP_ACK: return (NLF_CAP_ACK); case NETLINK_EXT_ACK: return (NLF_EXT_ACK); case NETLINK_GET_STRICT_CHK: return (NLF_STRICT); case NETLINK_MSG_INFO: return (NLF_MSG_INFO); } return (0); } static int nl_ctloutput(struct socket *so, struct sockopt *sopt) { struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); struct nlpcb *nlp = sotonlpcb(so); uint32_t flag; int optval, error = 0; NLCTL_TRACKER; NL_LOG(LOG_DEBUG2, "%ssockopt(%p, %d)", (sopt->sopt_dir) ? 
"set" : "get", so, sopt->sopt_name); switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { case NETLINK_ADD_MEMBERSHIP: case NETLINK_DROP_MEMBERSHIP: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) break; if (optval <= 0 || optval >= NLP_MAX_GROUPS) { error = ERANGE; break; } NL_LOG(LOG_DEBUG2, "ADD/DEL group %d", (uint32_t)optval); NLCTL_WLOCK(ctl); if (sopt->sopt_name == NETLINK_ADD_MEMBERSHIP) nl_add_group_locked(nlp, optval); else nl_del_group_locked(nlp, optval); NLCTL_WUNLOCK(ctl); break; case NETLINK_CAP_ACK: case NETLINK_EXT_ACK: case NETLINK_GET_STRICT_CHK: case NETLINK_MSG_INFO: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) break; flag = nl_getoptflag(sopt->sopt_name); if ((flag == NLF_MSG_INFO) && nlp->nl_linux) { error = EINVAL; break; } NLCTL_WLOCK(ctl); if (optval != 0) nlp->nl_flags |= flag; else nlp->nl_flags &= ~flag; NLCTL_WUNLOCK(ctl); break; default: error = ENOPROTOOPT; } break; case SOPT_GET: switch (sopt->sopt_name) { case NETLINK_LIST_MEMBERSHIPS: NLCTL_RLOCK(ctl); optval = nl_get_groups_compat(nlp); NLCTL_RUNLOCK(ctl); error = sooptcopyout(sopt, &optval, sizeof(optval)); break; case NETLINK_CAP_ACK: case NETLINK_EXT_ACK: case NETLINK_GET_STRICT_CHK: case NETLINK_MSG_INFO: NLCTL_RLOCK(ctl); optval = (nlp->nl_flags & nl_getoptflag(sopt->sopt_name)) != 0; NLCTL_RUNLOCK(ctl); error = sooptcopyout(sopt, &optval, sizeof(optval)); break; default: error = ENOPROTOOPT; } break; default: error = ENOPROTOOPT; } return (error); } static int sysctl_handle_nl_maxsockbuf(SYSCTL_HANDLER_ARGS) { int error = 0; u_long tmp_maxsockbuf = nl_maxsockbuf; error = sysctl_handle_long(oidp, &tmp_maxsockbuf, arg2, req); if (error || !req->newptr) return (error); if (tmp_maxsockbuf < MSIZE + MCLBYTES) return (EINVAL); nl_maxsockbuf = tmp_maxsockbuf; return (0); } static int nl_setsbopt(struct socket *so, struct sockopt *sopt) { int error, optval; bool result; if (sopt->sopt_name != 
SO_RCVBUF) return (sbsetopt(so, sopt)); /* Allow to override max buffer size in certain conditions */ error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error != 0) return (error); NL_LOG(LOG_DEBUG2, "socket %p, PID %d, SO_RCVBUF=%d", so, curproc->p_pid, optval); if (optval > sb_max_adj) { if (priv_check(curthread, PRIV_NET_ROUTE) != 0) return (EPERM); } SOCK_RECVBUF_LOCK(so); result = sbreserve_locked_limit(so, SO_RCV, optval, nl_maxsockbuf, curthread); SOCK_RECVBUF_UNLOCK(so); return (result ? 0 : ENOBUFS); } #define NETLINK_PROTOSW \ - .pr_flags = PR_ATOMIC | PR_ADDR | PR_WANTRCVD | \ - PR_SOCKBUF, \ + .pr_flags = PR_ATOMIC | PR_ADDR | PR_SOCKBUF, \ .pr_ctloutput = nl_ctloutput, \ .pr_setsbopt = nl_setsbopt, \ .pr_attach = nl_pru_attach, \ .pr_bind = nl_pru_bind, \ .pr_connect = nl_pru_connect, \ .pr_disconnect = nl_pru_disconnect, \ .pr_sosend = nl_sosend, \ - .pr_rcvd = nl_pru_rcvd, \ + .pr_soreceive = nl_soreceive, \ .pr_shutdown = nl_pru_shutdown, \ .pr_sockaddr = nl_sockaddr, \ .pr_close = nl_close static struct protosw netlink_raw_sw = { .pr_type = SOCK_RAW, NETLINK_PROTOSW }; static struct protosw netlink_dgram_sw = { .pr_type = SOCK_DGRAM, NETLINK_PROTOSW }; static struct domain netlinkdomain = { .dom_family = PF_NETLINK, .dom_name = "netlink", .dom_flags = DOMF_UNLOADABLE, .dom_nprotosw = 2, .dom_protosw = { &netlink_raw_sw, &netlink_dgram_sw }, }; DOMAIN_SET(netlink); diff --git a/sys/netlink/netlink_glue.c b/sys/netlink/netlink_glue.c index e7649c6b13dc..e4b52ffb191b 100644 --- a/sys/netlink/netlink_glue.c +++ b/sys/netlink/netlink_glue.c @@ -1,307 +1,306 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2023 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* priv_check */ #include #include #include #include #include #include /* Standard bits: built-in the kernel */ SYSCTL_NODE(_net, OID_AUTO, netlink, CTLFLAG_RD, 0, ""); SYSCTL_NODE(_net_netlink, OID_AUTO, debug, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); MALLOC_DEFINE(M_NETLINK, "netlink", "Memory used for netlink packets"); /* Netlink-related callbacks needed to glue rtsock, netlink and linuxolator */ static void ignore_route_event(uint32_t fibnum, const struct rib_cmd_info *rc) { } static void ignore_ifmsg_event(struct ifnet *ifp, int if_flags_mask) { } static struct rtbridge ignore_cb = { .route_f = ignore_route_event, .ifmsg_f = ignore_ifmsg_event, }; void *linux_netlink_p = NULL; /* Callback pointer for Linux translator functions */ struct rtbridge *rtsock_callback_p = &ignore_cb; struct rtbridge *netlink_callback_p = &ignore_cb; /* * nlp accessors. 
* TODO: move to a separate file once the number grows. */ bool nlp_has_priv(struct nlpcb *nlp, int priv) { return (priv_check_cred(nlp->nl_cred, priv) == 0); } struct ucred * nlp_get_cred(struct nlpcb *nlp) { return (nlp->nl_cred); } uint32_t nlp_get_pid(const struct nlpcb *nlp) { return (nlp->nl_process_id); } bool nlp_unconstrained_vnet(const struct nlpcb *nlp) { return (nlp->nl_unconstrained_vnet); } #ifndef NETLINK /* Stub implementations for the loadable functions */ static bool get_stub_writer(struct nl_writer *nw) { bzero(nw, sizeof(*nw)); - nw->writer_type = NS_WRITER_TYPE_STUB; nw->enomem = true; return (false); } static bool nlmsg_get_unicast_writer_stub(struct nl_writer *nw, int size, struct nlpcb *nlp) { return (get_stub_writer(nw)); } static bool nlmsg_get_group_writer_stub(struct nl_writer *nw, int size, int protocol, int group_id) { return (get_stub_writer(nw)); } static bool nlmsg_get_chain_writer_stub(struct nl_writer *nw, int size, struct mbuf **pm) { return (get_stub_writer(nw)); } static bool nlmsg_flush_stub(struct nl_writer *nw __unused) { return (false); } static void nlmsg_ignore_limit_stub(struct nl_writer *nw __unused) { } static bool nlmsg_refill_buffer_stub(struct nl_writer *nw __unused, int required_len __unused) { return (false); } static bool nlmsg_add_stub(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type, uint16_t flags, uint32_t len) { return (false); } static bool nlmsg_end_stub(struct nl_writer *nw __unused) { return (false); } static void nlmsg_abort_stub(struct nl_writer *nw __unused) { } static bool nlmsg_end_dump_stub(struct nl_writer *nw, int error, struct nlmsghdr *hdr) { return (false); } static int nl_modify_ifp_generic_stub(struct ifnet *ifp __unused, struct nl_parsed_link *lattrs __unused, const struct nlattr_bmask *bm __unused, struct nl_pstate *npt __unused) { return (ENOTSUP); } static void nl_store_ifp_cookie_stub(struct nl_pstate *npt __unused, struct ifnet *ifp __unused) { } static struct nlpcb * 
nl_get_thread_nlp_stub(struct thread *td __unused) { return (NULL); } const static struct nl_function_wrapper nl_stub = { .nlmsg_add = nlmsg_add_stub, .nlmsg_refill_buffer = nlmsg_refill_buffer_stub, .nlmsg_flush = nlmsg_flush_stub, .nlmsg_end = nlmsg_end_stub, .nlmsg_abort = nlmsg_abort_stub, .nlmsg_ignore_limit = nlmsg_ignore_limit_stub, .nlmsg_get_unicast_writer = nlmsg_get_unicast_writer_stub, .nlmsg_get_group_writer = nlmsg_get_group_writer_stub, .nlmsg_get_chain_writer = nlmsg_get_chain_writer_stub, .nlmsg_end_dump = nlmsg_end_dump_stub, .nl_modify_ifp_generic = nl_modify_ifp_generic_stub, .nl_store_ifp_cookie = nl_store_ifp_cookie_stub, .nl_get_thread_nlp = nl_get_thread_nlp_stub, }; /* * If the kernel is compiled with netlink as a module, * provide a way to introduce non-stub functioms */ static const struct nl_function_wrapper *_nl = &nl_stub; void nl_set_functions(const struct nl_function_wrapper *nl) { _nl = (nl != NULL) ? nl : &nl_stub; } /* Function wrappers */ bool nlmsg_get_unicast_writer(struct nl_writer *nw, int size, struct nlpcb *nlp) { return (_nl->nlmsg_get_unicast_writer(nw, size, nlp)); } bool nlmsg_get_group_writer(struct nl_writer *nw, int size, int protocol, int group_id) { return (_nl->nlmsg_get_group_writer(nw, size, protocol, group_id)); } bool nlmsg_get_chain_writer(struct nl_writer *nw, int size, struct mbuf **pm) { return (_nl->nlmsg_get_chain_writer(nw, size, pm)); } bool nlmsg_flush(struct nl_writer *nw) { return (_nl->nlmsg_flush(nw)); } void nlmsg_ignore_limit(struct nl_writer *nw) { _nl->nlmsg_ignore_limit(nw); } bool nlmsg_refill_buffer(struct nl_writer *nw, int required_len) { return (_nl->nlmsg_refill_buffer(nw, required_len)); } bool nlmsg_add(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type, uint16_t flags, uint32_t len) { return (_nl->nlmsg_add(nw, portid, seq, type, flags, len)); } bool nlmsg_end(struct nl_writer *nw) { return (_nl->nlmsg_end(nw)); } void nlmsg_abort(struct nl_writer *nw) { 
_nl->nlmsg_abort(nw); } bool nlmsg_end_dump(struct nl_writer *nw, int error, struct nlmsghdr *hdr) { return (_nl->nlmsg_end_dump(nw, error, hdr)); } int nl_modify_ifp_generic(struct ifnet *ifp, struct nl_parsed_link *lattrs, const struct nlattr_bmask *bm , struct nl_pstate *npt) { return (_nl->nl_modify_ifp_generic(ifp, lattrs, bm, npt)); } void nl_store_ifp_cookie(struct nl_pstate *npt, struct ifnet *ifp) { return (_nl->nl_store_ifp_cookie(npt, ifp)); } struct nlpcb * nl_get_thread_nlp(struct thread *td) { return (_nl->nl_get_thread_nlp(td)); } #endif /* !NETLINK */ diff --git a/sys/netlink/netlink_io.c b/sys/netlink/netlink_io.c index 7e2e098e4a9a..56e430cdcfa8 100644 --- a/sys/netlink/netlink_io.c +++ b/sys/netlink/netlink_io.c @@ -1,533 +1,410 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2021 Ng Peng Nam Sean * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #define DEBUG_MOD_NAME nl_io #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include _DECLARE_DEBUG(LOG_INFO); /* * The logic below provide a p2p interface for receiving and * sending netlink data between the kernel and userland. */ -static const struct sockaddr_nl _nl_empty_src = { - .nl_len = sizeof(struct sockaddr_nl), - .nl_family = PF_NETLINK, - .nl_pid = 0 /* comes from the kernel */ -}; -static const struct sockaddr *nl_empty_src = (const struct sockaddr *)&_nl_empty_src; - static bool nl_process_nbuf(struct nl_buf *nb, struct nlpcb *nlp); -static void -queue_push(struct nl_io_queue *q, struct mbuf *mq) -{ - while (mq != NULL) { - struct mbuf *m = mq; - mq = mq->m_nextpkt; - m->m_nextpkt = NULL; - - q->length += m_length(m, NULL); - STAILQ_INSERT_TAIL(&q->head, m, m_stailqpkt); - } -} - -static struct mbuf * -queue_pop(struct nl_io_queue *q) +struct nl_buf * +nl_buf_alloc(size_t len, int mflag) { - if (!STAILQ_EMPTY(&q->head)) { - struct mbuf *m = STAILQ_FIRST(&q->head); - STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt); - m->m_nextpkt = NULL; - q->length -= m_length(m, NULL); + struct nl_buf *nb; - return (m); + nb = malloc(sizeof(struct nl_buf) + len, M_NETLINK, mflag); + if (__predict_true(nb != NULL)) { + nb->buflen = len; + nb->datalen = nb->offset = 0; + nb->control = NULL; } - return (NULL); -} -static struct mbuf * -queue_head(const struct 
nl_io_queue *q) -{ - return (STAILQ_FIRST(&q->head)); + return (nb); } -static inline bool -queue_empty(const struct nl_io_queue *q) +void +nl_buf_free(struct nl_buf *nb) { - return (q->length == 0); -} -static void -queue_free(struct nl_io_queue *q) -{ - while (!STAILQ_EMPTY(&q->head)) { - struct mbuf *m = STAILQ_FIRST(&q->head); - STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt); - m->m_nextpkt = NULL; - m_freem(m); - } - q->length = 0; + if (nb->control) + m_freem(nb->control); + free(nb, M_NETLINK); } void -nl_add_msg_info(struct mbuf *m) +nl_add_msg_info(struct nl_buf *nb) { + /* XXXGL pass nlp as arg? */ struct nlpcb *nlp = nl_get_thread_nlp(curthread); NL_LOG(LOG_DEBUG2, "Trying to recover nlp from thread %p: %p", curthread, nlp); if (nlp == NULL) return; /* Prepare what we want to encode - PID, socket PID & msg seq */ struct { struct nlattr nla; uint32_t val; } data[] = { { .nla.nla_len = sizeof(struct nlattr) + sizeof(uint32_t), .nla.nla_type = NLMSGINFO_ATTR_PROCESS_ID, .val = nlp->nl_process_id, }, { .nla.nla_len = sizeof(struct nlattr) + sizeof(uint32_t), .nla.nla_type = NLMSGINFO_ATTR_PORT_ID, .val = nlp->nl_port, }, }; - while (m->m_next != NULL) - m = m->m_next; - m->m_next = sbcreatecontrol(data, sizeof(data), + nb->control = sbcreatecontrol(data, sizeof(data), NETLINK_MSG_INFO, SOL_NETLINK, M_NOWAIT); - NL_LOG(LOG_DEBUG2, "Storing %u bytes of data, ctl: %p", - (unsigned)sizeof(data), m->m_next); -} - -static __noinline struct mbuf * -extract_msg_info(struct mbuf *m) -{ - while (m->m_next != NULL) { - if (m->m_next->m_type == MT_CONTROL) { - struct mbuf *ctl = m->m_next; - m->m_next = NULL; - return (ctl); - } - m = m->m_next; - } - return (NULL); + if (__predict_true(nb->control != NULL)) + NL_LOG(LOG_DEBUG2, "Storing %u bytes of control data, ctl: %p", + (unsigned)sizeof(data), nb->control); + else + NL_LOG(LOG_DEBUG2, "Failed to allocate %u bytes of control", + (unsigned)sizeof(data)); } void nl_schedule_taskqueue(struct nlpcb *nlp) { if 
(!nlp->nl_task_pending) { nlp->nl_task_pending = true; taskqueue_enqueue(nlp->nl_taskqueue, &nlp->nl_task); NL_LOG(LOG_DEBUG3, "taskqueue scheduled"); } else { NL_LOG(LOG_DEBUG3, "taskqueue schedule skipped"); } } -static bool -tx_check_locked(struct nlpcb *nlp) -{ - if (queue_empty(&nlp->tx_queue)) - return (true); - - /* - * Check if something can be moved from the internal TX queue - * to the socket queue. - */ - - bool appended = false; - struct sockbuf *sb = &nlp->nl_socket->so_rcv; - SOCKBUF_LOCK(sb); - - while (true) { - struct mbuf *m = queue_head(&nlp->tx_queue); - if (m != NULL) { - struct mbuf *ctl = NULL; - if (__predict_false(m->m_next != NULL)) - ctl = extract_msg_info(m); - if (sbappendaddr_locked(sb, nl_empty_src, m, ctl) != 0) { - /* appended successfully */ - queue_pop(&nlp->tx_queue); - appended = true; - } else - break; - } else - break; - } - - SOCKBUF_UNLOCK(sb); - - if (appended) - sorwakeup(nlp->nl_socket); - - return (queue_empty(&nlp->tx_queue)); -} - static bool nl_process_received_one(struct nlpcb *nlp) { struct socket *so = nlp->nl_socket; - struct sockbuf *sb = &so->so_snd; + struct sockbuf *sb; struct nl_buf *nb; bool reschedule = false; NLP_LOCK(nlp); nlp->nl_task_pending = false; + NLP_UNLOCK(nlp); - if (!tx_check_locked(nlp)) { - /* TX overflow queue still not empty, ignore RX */ - NLP_UNLOCK(nlp); + /* + * Do not process queued up requests if there is no space to queue + * replies. 
+ */ + sb = &so->so_rcv; + SOCK_RECVBUF_LOCK(so); + if (sb->sb_hiwat <= sb->sb_ccc) { + SOCK_RECVBUF_UNLOCK(so); return (false); } + SOCK_RECVBUF_UNLOCK(so); - int prev_hiwat = nlp->tx_queue.hiwat; - NLP_UNLOCK(nlp); - + sb = &so->so_snd; SOCK_SENDBUF_LOCK(so); while ((nb = TAILQ_FIRST(&sb->nl_queue)) != NULL) { TAILQ_REMOVE(&sb->nl_queue, nb, tailq); SOCK_SENDBUF_UNLOCK(so); reschedule = nl_process_nbuf(nb, nlp); SOCK_SENDBUF_LOCK(so); if (reschedule) { sb->sb_acc -= nb->datalen; sb->sb_ccc -= nb->datalen; /* XXXGL: potentially can reduce lock&unlock count. */ sowwakeup_locked(so); - free(nb, M_NETLINK); + nl_buf_free(nb); SOCK_SENDBUF_LOCK(so); } else { TAILQ_INSERT_HEAD(&sb->nl_queue, nb, tailq); break; } } SOCK_SENDBUF_UNLOCK(so); - if (nlp->tx_queue.hiwat > prev_hiwat) { - NLP_LOG(LOG_DEBUG, nlp, "TX override peaked to %d", nlp->tx_queue.hiwat); - - } return (reschedule); } static void nl_process_received(struct nlpcb *nlp) { NL_LOG(LOG_DEBUG3, "taskqueue called"); if (__predict_false(nlp->nl_need_thread_setup)) { nl_set_thread_nlp(curthread, nlp); NLP_LOCK(nlp); nlp->nl_need_thread_setup = false; NLP_UNLOCK(nlp); } while (nl_process_received_one(nlp)) ; } -void -nl_init_io(struct nlpcb *nlp) -{ - STAILQ_INIT(&nlp->tx_queue.head); -} - -void -nl_free_io(struct nlpcb *nlp) -{ - queue_free(&nlp->tx_queue); -} - /* * Called after some data have been read from the socket. */ void nl_on_transmit(struct nlpcb *nlp) { NLP_LOCK(nlp); struct socket *so = nlp->nl_socket; if (__predict_false(nlp->nl_dropped_bytes > 0 && so != NULL)) { unsigned long dropped_bytes = nlp->nl_dropped_bytes; unsigned long dropped_messages = nlp->nl_dropped_messages; nlp->nl_dropped_bytes = 0; nlp->nl_dropped_messages = 0; struct sockbuf *sb = &so->so_rcv; NLP_LOG(LOG_DEBUG, nlp, "socket RX overflowed, %lu messages (%lu bytes) dropped. 
" - "bytes: [%u/%u] mbufs: [%u/%u]", dropped_messages, dropped_bytes, - sb->sb_ccc, sb->sb_hiwat, sb->sb_mbcnt, sb->sb_mbmax); + "bytes: [%u/%u]", dropped_messages, dropped_bytes, + sb->sb_ccc, sb->sb_hiwat); /* TODO: send netlink message */ } nl_schedule_taskqueue(nlp); NLP_UNLOCK(nlp); } void nl_taskqueue_handler(void *_arg, int pending) { struct nlpcb *nlp = (struct nlpcb *)_arg; CURVNET_SET(nlp->nl_socket->so_vnet); nl_process_received(nlp); CURVNET_RESTORE(); } -static __noinline void -queue_push_tx(struct nlpcb *nlp, struct mbuf *m) -{ - queue_push(&nlp->tx_queue, m); - nlp->nl_tx_blocked = true; - - if (nlp->tx_queue.length > nlp->tx_queue.hiwat) - nlp->tx_queue.hiwat = nlp->tx_queue.length; -} - /* - * Tries to send @m to the socket @nlp. - * - * @m: mbuf(s) to send to. Consumed in any case. - * @nlp: socket to send to - * @cnt: number of messages in @m - * @io_flags: combination of NL_IOF_* flags + * Tries to send current data buffer from writer. * * Returns true on success. * If no queue overrunes happened, wakes up socket owner. 
*/ bool -nl_send_one(struct mbuf *m, struct nlpcb *nlp, int num_messages, int io_flags) +nl_send_one(struct nl_writer *nw) { - bool untranslated = io_flags & NL_IOF_UNTRANSLATED; - bool ignore_limits = io_flags & NL_IOF_IGNORE_LIMIT; - bool result = true; + struct nlpcb *nlp = nw->nlp; + struct socket *so = nlp->nl_socket; + struct sockbuf *sb = &so->so_rcv; + struct nl_buf *nb; + + MPASS(nw->hdr == NULL); IF_DEBUG_LEVEL(LOG_DEBUG2) { - struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *); + struct nlmsghdr *hdr = (struct nlmsghdr *)nw->buf->data; NLP_LOG(LOG_DEBUG2, nlp, - "TX mbuf len %u msgs %u msg type %d first hdrlen %u io_flags %X", - m_length(m, NULL), num_messages, hdr->nlmsg_type, hdr->nlmsg_len, - io_flags); + "TX len %u msgs %u msg type %d first hdrlen %u", + nw->buf->datalen, nw->num_messages, hdr->nlmsg_type, + hdr->nlmsg_len); } - if (__predict_false(nlp->nl_linux && linux_netlink_p != NULL && untranslated)) { - m = linux_netlink_p->mbufs_to_linux(nlp->nl_proto, m, nlp); - if (m == NULL) - return (false); + if (nlp->nl_linux && linux_netlink_p != NULL && + __predict_false(!linux_netlink_p->msgs_to_linux(nw, nlp))) { + nl_buf_free(nw->buf); + nw->buf = NULL; + return (false); } - NLP_LOCK(nlp); + nb = nw->buf; + nw->buf = NULL; - if (__predict_false(nlp->nl_socket == NULL)) { + SOCK_RECVBUF_LOCK(so); + if (!nw->ignore_limit && __predict_false(sb->sb_hiwat <= sb->sb_ccc)) { + SOCK_RECVBUF_UNLOCK(so); + NLP_LOCK(nlp); + nlp->nl_dropped_bytes += nb->datalen; + nlp->nl_dropped_messages += nw->num_messages; + NLP_LOG(LOG_DEBUG2, nlp, "RX overflow: %lu m (+%d), %lu b (+%d)", + (unsigned long)nlp->nl_dropped_messages, nw->num_messages, + (unsigned long)nlp->nl_dropped_bytes, nb->datalen); NLP_UNLOCK(nlp); - m_freem(m); + nl_buf_free(nb); return (false); - } - - if (!queue_empty(&nlp->tx_queue)) { - if (ignore_limits) { - queue_push_tx(nlp, m); - } else { - m_free(m); - result = false; - } - NLP_UNLOCK(nlp); - return (result); - } - - struct socket *so = 
nlp->nl_socket; - struct mbuf *ctl = NULL; - if (__predict_false(m->m_next != NULL)) - ctl = extract_msg_info(m); - if (sbappendaddr(&so->so_rcv, nl_empty_src, m, ctl) != 0) { - sorwakeup(so); - NLP_LOG(LOG_DEBUG3, nlp, "appended data & woken up"); } else { - if (ignore_limits) { - queue_push_tx(nlp, m); - } else { - /* - * Store dropped data so it can be reported - * on the next read - */ - nlp->nl_dropped_bytes += m_length(m, NULL); - nlp->nl_dropped_messages += num_messages; - NLP_LOG(LOG_DEBUG2, nlp, "RX oveflow: %lu m (+%d), %lu b (+%d)", - (unsigned long)nlp->nl_dropped_messages, num_messages, - (unsigned long)nlp->nl_dropped_bytes, m_length(m, NULL)); - soroverflow(so); - m_freem(m); - result = false; + bool full; + + TAILQ_INSERT_TAIL(&sb->nl_queue, nb, tailq); + sb->sb_acc += nb->datalen; + sb->sb_ccc += nb->datalen; + full = sb->sb_hiwat <= sb->sb_ccc; + sorwakeup_locked(so); + if (full) { + NLP_LOCK(nlp); + nlp->nl_tx_blocked = true; + NLP_UNLOCK(nlp); } + return (true); } - NLP_UNLOCK(nlp); - - return (result); } static int nl_receive_message(struct nlmsghdr *hdr, int remaining_length, struct nlpcb *nlp, struct nl_pstate *npt) { nl_handler_f handler = nl_handlers[nlp->nl_proto].cb; int error = 0; NLP_LOG(LOG_DEBUG2, nlp, "msg len: %u type: %d: flags: 0x%X seq: %u pid: %u", hdr->nlmsg_len, hdr->nlmsg_type, hdr->nlmsg_flags, hdr->nlmsg_seq, hdr->nlmsg_pid); if (__predict_false(hdr->nlmsg_len > remaining_length)) { NLP_LOG(LOG_DEBUG, nlp, "message is not entirely present: want %d got %d", hdr->nlmsg_len, remaining_length); return (EINVAL); } else if (__predict_false(hdr->nlmsg_len < sizeof(*hdr))) { NL_LOG(LOG_DEBUG, "message too short: %d", hdr->nlmsg_len); return (EINVAL); } /* Stamp each message with sender pid */ hdr->nlmsg_pid = nlp->nl_port; npt->hdr = hdr; if (hdr->nlmsg_flags & NLM_F_REQUEST && hdr->nlmsg_type >= NLMSG_MIN_TYPE) { NL_LOG(LOG_DEBUG2, "handling message with msg type: %d", hdr->nlmsg_type); if (nlp->nl_linux && linux_netlink_p != 
NULL) { struct nlmsghdr *hdr_orig = hdr; hdr = linux_netlink_p->msg_from_linux(nlp->nl_proto, hdr, npt); if (hdr == NULL) { /* Failed to translate to kernel format. Report an error back */ hdr = hdr_orig; npt->hdr = hdr; if (hdr->nlmsg_flags & NLM_F_ACK) nlmsg_ack(nlp, EOPNOTSUPP, hdr, npt); return (0); } } error = handler(hdr, npt); NL_LOG(LOG_DEBUG2, "retcode: %d", error); } if ((hdr->nlmsg_flags & NLM_F_ACK) || (error != 0 && error != EINTR)) { if (!npt->nw->suppress_ack) { NL_LOG(LOG_DEBUG3, "ack"); nlmsg_ack(nlp, error, hdr, npt); } } return (0); } static void npt_clear(struct nl_pstate *npt) { lb_clear(&npt->lb); npt->error = 0; npt->err_msg = NULL; npt->err_off = 0; npt->hdr = NULL; npt->nw->suppress_ack = false; } /* * Processes an incoming packet, which can contain multiple netlink messages */ static bool nl_process_nbuf(struct nl_buf *nb, struct nlpcb *nlp) { struct nlmsghdr *hdr; int error; NL_LOG(LOG_DEBUG3, "RX netlink buf %p on %p", nb, nlp->nl_socket); struct nl_writer nw = {}; if (!nlmsg_get_unicast_writer(&nw, NLMSG_SMALL, nlp)) { NL_LOG(LOG_DEBUG, "error allocating socket writer"); return (true); } nlmsg_ignore_limit(&nw); struct nl_pstate npt = { .nlp = nlp, .lb.base = &nb->data[roundup2(nb->datalen, 8)], .lb.size = nb->buflen - roundup2(nb->datalen, 8), .nw = &nw, .strict = nlp->nl_flags & NLF_STRICT, }; for (; nb->offset + sizeof(struct nlmsghdr) <= nb->datalen;) { hdr = (struct nlmsghdr *)&nb->data[nb->offset]; /* Save length prior to calling handler */ int msglen = NLMSG_ALIGN(hdr->nlmsg_len); NL_LOG(LOG_DEBUG3, "parsing offset %d/%d", nb->offset, nb->datalen); npt_clear(&npt); error = nl_receive_message(hdr, nb->datalen - nb->offset, nlp, &npt); nb->offset += msglen; if (__predict_false(error != 0 || nlp->nl_tx_blocked)) break; } NL_LOG(LOG_DEBUG3, "packet parsing done"); nlmsg_flush(&nw); if (nlp->nl_tx_blocked) { NLP_LOCK(nlp); nlp->nl_tx_blocked = false; NLP_UNLOCK(nlp); return (false); } else return (true); } diff --git 
a/sys/netlink/netlink_linux.h b/sys/netlink/netlink_linux.h index 6dd2a964a64a..2d9f8d1b7bd6 100644 --- a/sys/netlink/netlink_linux.h +++ b/sys/netlink/netlink_linux.h @@ -1,54 +1,53 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #ifndef _NETLINK_LINUX_VAR_H_ #define _NETLINK_LINUX_VAR_H_ +#ifdef _KERNEL /* * The file contains headers for the bridge interface between * linux[_common] module and the netlink module */ struct nlpcb; struct nl_pstate; +struct nl_writer; -typedef struct mbuf *mbufs_to_linux_cb_t(int netlink_family, struct mbuf *m, - struct nlpcb *nlp); -typedef struct mbuf *msgs_to_linux_cb_t(int netlink_family, char *buf, int data_length, - struct nlpcb *nlp); +typedef bool msgs_to_linux_cb_t(struct nl_writer *nw, struct nlpcb *nlp); typedef struct nlmsghdr *msg_from_linux_cb_t(int netlink_family, struct nlmsghdr *hdr, struct nl_pstate *npt); struct linux_netlink_provider { - mbufs_to_linux_cb_t *mbufs_to_linux; msgs_to_linux_cb_t *msgs_to_linux; msg_from_linux_cb_t *msg_from_linux; }; extern struct linux_netlink_provider *linux_netlink_p; #endif +#endif diff --git a/sys/netlink/netlink_message_writer.c b/sys/netlink/netlink_message_writer.c index dafcca6ef038..0b85378b41b6 100644 --- a/sys/netlink/netlink_message_writer.c +++ b/sys/netlink/netlink_message_writer.c @@ -1,838 +1,374 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include -#include #include #include #include #include #include #include #include #define DEBUG_MOD_NAME nl_writer #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include _DECLARE_DEBUG(LOG_INFO); -/* - * The goal of this file is to provide convenient message writing KPI on top of - * different storage methods (mbufs, uio, temporary memory chunks). - * - * The main KPI guarantee is that the (last) message always resides in the contiguous - * memory buffer, so one is able to update the header after writing the entire message. - * - * This guarantee comes with a side effect of potentially reallocating underlying - * buffer, so one needs to update the desired pointers after something is added - * to the header. - * - * Messaging layer contains hooks performing transparent Linux translation for the messages. 
- * - * There are 3 types of supported targets: - * * socket (adds mbufs to the socket buffer, used for message replies) - * * group (sends mbuf/chain to the specified groups, used for the notifications) - * * chain (returns mbuf chain, used in Linux message translation code) - * - * There are 3 types of storage: - * * NS_WRITER_TYPE_MBUF (mbuf-based, most efficient, used when a single message - * fits in NLMBUFSIZE) - * * NS_WRITER_TYPE_BUF (fallback, malloc-based, used when a single message needs - * to be larger than one supported by NS_WRITER_TYPE_MBUF) - * * NS_WRITER_TYPE_LBUF (malloc-based, similar to NS_WRITER_TYPE_BUF, used for - * Linux sockets, calls translation hook prior to sending messages to the socket). - * - * Internally, KPI switches between different types of storage when memory requirements - * change. It happens transparently to the caller. - */ - -/* - * Uma zone for the mbuf-based Netlink storage - */ -static uma_zone_t nlmsg_zone; - -static void -nl_free_mbuf_storage(struct mbuf *m) -{ - uma_zfree(nlmsg_zone, m->m_ext.ext_buf); -} - -static int -nl_setup_mbuf_storage(void *mem, int size, void *arg, int how __unused) -{ - struct mbuf *m = (struct mbuf *)arg; - - if (m != NULL) - m_extadd(m, mem, size, nl_free_mbuf_storage, NULL, NULL, 0, EXT_MOD_TYPE); - - return (0); -} - -static struct mbuf * -nl_get_mbuf_flags(int size, int malloc_flags, int mbuf_flags) -{ - struct mbuf *m, *m_storage; - - if (size <= MHLEN) - return (m_get2(size, malloc_flags, MT_DATA, mbuf_flags)); - - if (__predict_false(size > NLMBUFSIZE)) - return (NULL); - - m = m_gethdr(malloc_flags, MT_DATA); - if (m == NULL) - return (NULL); - - m_storage = uma_zalloc_arg(nlmsg_zone, m, malloc_flags); - if (m_storage == NULL) { - m_free_raw(m); - return (NULL); - } - - return (m); -} - -static struct mbuf * -nl_get_mbuf(int size, int malloc_flags) -{ - return (nl_get_mbuf_flags(size, malloc_flags, M_PKTHDR)); -} - -/* - * Gets a chain of Netlink mbufs. 
- * This is strip-down version of m_getm2() - */ -static struct mbuf * -nl_get_mbuf_chain(int len, int malloc_flags) -{ - struct mbuf *m_chain = NULL, *m_tail = NULL; - int mbuf_flags = M_PKTHDR; - - while (len > 0) { - int sz = len > NLMBUFSIZE ? NLMBUFSIZE: len; - struct mbuf *m = nl_get_mbuf_flags(sz, malloc_flags, mbuf_flags); - - if (m == NULL) { - m_freem(m_chain); - return (NULL); - } - - /* Book keeping. */ - len -= M_SIZE(m); - if (m_tail != NULL) - m_tail->m_next = m; - else - m_chain = m; - m_tail = m; - mbuf_flags &= ~M_PKTHDR; /* Only valid on the first mbuf. */ - } - - return (m_chain); -} - -void -nl_init_msg_zone(void) -{ - nlmsg_zone = uma_zcreate("netlink", NLMBUFSIZE, nl_setup_mbuf_storage, - NULL, NULL, NULL, UMA_ALIGN_PTR, 0); -} - -void -nl_destroy_msg_zone(void) -{ - uma_zdestroy(nlmsg_zone); -} - - -typedef bool nlwriter_op_init(struct nl_writer *nw, int size, bool waitok); -typedef bool nlwriter_op_write(struct nl_writer *nw, void *buf, int buflen, int cnt); - -struct nlwriter_ops { - nlwriter_op_init *init; - nlwriter_op_write *write_socket; - nlwriter_op_write *write_group; - nlwriter_op_write *write_chain; -}; - -/* - * NS_WRITER_TYPE_BUF - * Writes message to a temporary memory buffer, - * flushing to the socket/group when buffer size limit is reached - */ -static bool -nlmsg_get_ns_buf(struct nl_writer *nw, int size, bool waitok) -{ - int mflag = waitok ? 
M_WAITOK : M_NOWAIT; - nw->_storage = malloc(size, M_NETLINK, mflag | M_ZERO); - if (__predict_false(nw->_storage == NULL)) - return (false); - nw->alloc_len = size; - nw->offset = 0; - nw->hdr = NULL; - nw->data = nw->_storage; - nw->writer_type = NS_WRITER_TYPE_BUF; - nw->malloc_flag = mflag; - nw->num_messages = 0; - nw->enomem = false; - return (true); -} - static bool -nlmsg_write_socket_buf(struct nl_writer *nw, void *buf, int datalen, int cnt) +nlmsg_get_buf(struct nl_writer *nw, u_int len, bool waitok) { - NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr); - if (__predict_false(datalen == 0)) { - free(buf, M_NETLINK); - return (true); - } + const int mflag = waitok ? M_WAITOK : M_NOWAIT; - struct mbuf *m = nl_get_mbuf_chain(datalen, nw->malloc_flag); - if (__predict_false(m == NULL)) { - /* XXX: should we set sorcverr? */ - free(buf, M_NETLINK); - return (false); - } - m_append(m, datalen, buf); - free(buf, M_NETLINK); + MPASS(nw->buf == NULL); - int io_flags = (nw->ignore_limit) ? NL_IOF_IGNORE_LIMIT : 0; - return (nl_send_one(m, (struct nlpcb *)(nw->arg.ptr), cnt, io_flags)); -} - -static bool -nlmsg_write_group_buf(struct nl_writer *nw, void *buf, int datalen, int cnt) -{ - NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d proto: %d id: %d", buf, datalen, - nw->arg.group.proto, nw->arg.group.id); - if (__predict_false(datalen == 0)) { - free(buf, M_NETLINK); - return (true); - } + NL_LOG(LOG_DEBUG3, "Setting up nw %p len %u %s", nw, len, + waitok ? 
"wait" : "nowait"); - struct mbuf *m = nl_get_mbuf_chain(datalen, nw->malloc_flag); - if (__predict_false(m == NULL)) { - free(buf, M_NETLINK); + nw->buf = nl_buf_alloc(len, mflag); + if (__predict_false(nw->buf == NULL)) return (false); - } - bool success = m_append(m, datalen, buf) != 0; - free(buf, M_NETLINK); - - if (!success) - return (false); - - nl_send_group(m, cnt, nw->arg.group.proto, nw->arg.group.id); - return (true); -} - -static bool -nlmsg_write_chain_buf(struct nl_writer *nw, void *buf, int datalen, int cnt) -{ - struct mbuf **m0 = (struct mbuf **)(nw->arg.ptr); - NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr); - - if (__predict_false(datalen == 0)) { - free(buf, M_NETLINK); - return (true); - } - - if (*m0 == NULL) { - struct mbuf *m = nl_get_mbuf_chain(datalen, nw->malloc_flag); - - if (__predict_false(m == NULL)) { - free(buf, M_NETLINK); - return (false); - } - *m0 = m; - } - if (__predict_false(m_append(*m0, datalen, buf) == 0)) { - free(buf, M_NETLINK); - return (false); - } - return (true); -} - - -/* - * NS_WRITER_TYPE_MBUF - * Writes message to the allocated mbuf, - * flushing to socket/group when mbuf size limit is reached. - * This is the most efficient mechanism as it avoids double-copying. - * - * Allocates a single mbuf suitable to store up to @size bytes of data. - * If size < MHLEN (around 160 bytes), allocates mbuf with pkghdr. - * If the size <= NLMBUFSIZE (2k), allocate mbuf+storage out of nlmsg_zone. - * Returns NULL on greater size or the allocation failure. - */ -static bool -nlmsg_get_ns_mbuf(struct nl_writer *nw, int size, bool waitok) -{ - int mflag = waitok ? 
M_WAITOK : M_NOWAIT; - struct mbuf *m = nl_get_mbuf(size, mflag); - - if (__predict_false(m == NULL)) - return (false); - nw->alloc_len = M_TRAILINGSPACE(m); - nw->offset = 0; nw->hdr = NULL; - nw->_storage = (void *)m; - nw->data = mtod(m, void *); - nw->writer_type = NS_WRITER_TYPE_MBUF; nw->malloc_flag = mflag; nw->num_messages = 0; nw->enomem = false; - memset(nw->data, 0, size); - NL_LOG(LOG_DEBUG2, "alloc mbuf %p req_len %d alloc_len %d data_ptr %p", - m, size, nw->alloc_len, nw->data); - return (true); -} - -static bool -nlmsg_write_socket_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt) -{ - struct mbuf *m = (struct mbuf *)buf; - NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr); - - if (__predict_false(datalen == 0)) { - m_freem(m); - return (true); - } - - m->m_pkthdr.len = datalen; - m->m_len = datalen; - int io_flags = (nw->ignore_limit) ? NL_IOF_IGNORE_LIMIT : 0; - return (nl_send_one(m, (struct nlpcb *)(nw->arg.ptr), cnt, io_flags)); -} - -static bool -nlmsg_write_group_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt) -{ - struct mbuf *m = (struct mbuf *)buf; - NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d proto: %d id: %d", buf, datalen, - nw->arg.group.proto, nw->arg.group.id); - - if (__predict_false(datalen == 0)) { - m_freem(m); - return (true); - } - m->m_pkthdr.len = datalen; - m->m_len = datalen; - nl_send_group(m, cnt, nw->arg.group.proto, nw->arg.group.id); return (true); } -static bool -nlmsg_write_chain_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt) -{ - struct mbuf *m_new = (struct mbuf *)buf; - struct mbuf **m0 = (struct mbuf **)(nw->arg.ptr); - - NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr); - - if (__predict_false(datalen == 0)) { - m_freem(m_new); - return (true); - } - - m_new->m_pkthdr.len = datalen; - m_new->m_len = datalen; - - if (*m0 == NULL) { - *m0 = m_new; - } else { - struct mbuf *m_last; - for (m_last = *m0; m_last->m_next != NULL; 
m_last = m_last->m_next) - ; - m_last->m_next = m_new; - (*m0)->m_pkthdr.len += datalen; - } - - return (true); -} - -/* - * NS_WRITER_TYPE_LBUF - * Writes message to the allocated memory buffer, - * flushing to socket/group when mbuf size limit is reached. - * Calls linux handler to rewrite messages before sending to the socket. - */ -static bool -nlmsg_get_ns_lbuf(struct nl_writer *nw, int size, bool waitok) -{ - int mflag = waitok ? M_WAITOK : M_NOWAIT; - size = roundup2(size, sizeof(void *)); - int add_size = sizeof(struct linear_buffer) + SCRATCH_BUFFER_SIZE; - char *buf = malloc(add_size + size * 2, M_NETLINK, mflag | M_ZERO); - if (__predict_false(buf == NULL)) - return (false); - - /* Fill buffer header first */ - struct linear_buffer *lb = (struct linear_buffer *)buf; - lb->base = &buf[sizeof(struct linear_buffer) + size]; - lb->size = size + SCRATCH_BUFFER_SIZE; - - nw->alloc_len = size; - nw->offset = 0; - nw->hdr = NULL; - nw->_storage = buf; - nw->data = (char *)(lb + 1); - nw->malloc_flag = mflag; - nw->writer_type = NS_WRITER_TYPE_LBUF; - nw->num_messages = 0; - nw->enomem = false; - return (true); -} - -static bool -nlmsg_write_socket_lbuf(struct nl_writer *nw, void *buf, int datalen, int cnt) -{ - struct linear_buffer *lb = (struct linear_buffer *)buf; - char *data = (char *)(lb + 1); - struct nlpcb *nlp = (struct nlpcb *)(nw->arg.ptr); - - if (__predict_false(datalen == 0)) { - free(buf, M_NETLINK); - return (true); - } - - struct mbuf *m = NULL; - if (linux_netlink_p != NULL) - m = linux_netlink_p->msgs_to_linux(nlp->nl_proto, data, datalen, nlp); - free(buf, M_NETLINK); - - if (__predict_false(m == NULL)) { - /* XXX: should we set sorcverr? */ - return (false); - } - - int io_flags = (nw->ignore_limit) ? 
NL_IOF_IGNORE_LIMIT : 0; - return (nl_send_one(m, nlp, cnt, io_flags)); -} - -/* Shouldn't be called (maybe except Linux code originating message) */ -static bool -nlmsg_write_group_lbuf(struct nl_writer *nw, void *buf, int datalen, int cnt) -{ - struct linear_buffer *lb = (struct linear_buffer *)buf; - char *data = (char *)(lb + 1); - - if (__predict_false(datalen == 0)) { - free(buf, M_NETLINK); - return (true); - } - - struct mbuf *m = nl_get_mbuf_chain(datalen, nw->malloc_flag); - if (__predict_false(m == NULL)) { - free(buf, M_NETLINK); - return (false); - } - m_append(m, datalen, data); - free(buf, M_NETLINK); - - nl_send_group(m, cnt, nw->arg.group.proto, nw->arg.group.id); - return (true); -} - -static const struct nlwriter_ops nlmsg_writers[] = { - /* NS_WRITER_TYPE_MBUF */ - { - .init = nlmsg_get_ns_mbuf, - .write_socket = nlmsg_write_socket_mbuf, - .write_group = nlmsg_write_group_mbuf, - .write_chain = nlmsg_write_chain_mbuf, - }, - /* NS_WRITER_TYPE_BUF */ - { - .init = nlmsg_get_ns_buf, - .write_socket = nlmsg_write_socket_buf, - .write_group = nlmsg_write_group_buf, - .write_chain = nlmsg_write_chain_buf, - }, - /* NS_WRITER_TYPE_LBUF */ - { - .init = nlmsg_get_ns_lbuf, - .write_socket = nlmsg_write_socket_lbuf, - .write_group = nlmsg_write_group_lbuf, - }, -}; - -static void -nlmsg_set_callback(struct nl_writer *nw) -{ - const struct nlwriter_ops *pops = &nlmsg_writers[nw->writer_type]; - - switch (nw->writer_target) { - case NS_WRITER_TARGET_SOCKET: - nw->cb = pops->write_socket; - break; - case NS_WRITER_TARGET_GROUP: - nw->cb = pops->write_group; - break; - case NS_WRITER_TARGET_CHAIN: - nw->cb = pops->write_chain; - break; - default: - panic("not implemented"); - } -} - -static bool -nlmsg_get_buf_type(struct nl_writer *nw, int size, int type, bool waitok) -{ - MPASS(type + 1 <= sizeof(nlmsg_writers) / sizeof(nlmsg_writers[0])); - NL_LOG(LOG_DEBUG3, "Setting up nw %p size %d type %d", nw, size, type); - return (nlmsg_writers[type].init(nw, size, 
waitok)); -} - -static bool -nlmsg_get_buf(struct nl_writer *nw, int size, bool waitok, bool is_linux) -{ - int type; - - if (!is_linux) { - if (__predict_true(size <= NLMBUFSIZE)) - type = NS_WRITER_TYPE_MBUF; - else - type = NS_WRITER_TYPE_BUF; - } else - type = NS_WRITER_TYPE_LBUF; - return (nlmsg_get_buf_type(nw, size, type, waitok)); -} - bool _nlmsg_get_unicast_writer(struct nl_writer *nw, int size, struct nlpcb *nlp) { - if (!nlmsg_get_buf(nw, size, false, nlp->nl_linux)) - return (false); - nw->arg.ptr = (void *)nlp; - nw->writer_target = NS_WRITER_TARGET_SOCKET; - nlmsg_set_callback(nw); - return (true); + nw->nlp = nlp; + nw->cb = nl_send_one; + + return (nlmsg_get_buf(nw, size, false)); } bool _nlmsg_get_group_writer(struct nl_writer *nw, int size, int protocol, int group_id) { - if (!nlmsg_get_buf(nw, size, false, false)) - return (false); - nw->arg.group.proto = protocol; - nw->arg.group.id = group_id; - nw->writer_target = NS_WRITER_TARGET_GROUP; - nlmsg_set_callback(nw); - return (true); -} + nw->group.proto = protocol; + nw->group.id = group_id; + nw->cb = nl_send_group; -bool -_nlmsg_get_chain_writer(struct nl_writer *nw, int size, struct mbuf **pm) -{ - if (!nlmsg_get_buf(nw, size, false, false)) - return (false); - *pm = NULL; - nw->arg.ptr = (void *)pm; - nw->writer_target = NS_WRITER_TARGET_CHAIN; - nlmsg_set_callback(nw); - NL_LOG(LOG_DEBUG3, "setup cb %p (need %p)", nw->cb, &nlmsg_write_chain_mbuf); - return (true); + return (nlmsg_get_buf(nw, size, false)); } void _nlmsg_ignore_limit(struct nl_writer *nw) { nw->ignore_limit = true; } bool _nlmsg_flush(struct nl_writer *nw) { if (__predict_false(nw->hdr != NULL)) { /* Last message has not been completed, skip it. 
*/ - int completed_len = (char *)nw->hdr - nw->data; + int completed_len = (char *)nw->hdr - nw->buf->data; /* Send completed messages */ - nw->offset -= nw->offset - completed_len; + nw->buf->datalen -= nw->buf->datalen - completed_len; nw->hdr = NULL; - } + } NL_LOG(LOG_DEBUG2, "OUT"); - bool result = nw->cb(nw, nw->_storage, nw->offset, nw->num_messages); - nw->_storage = NULL; + bool result = nw->cb(nw); + nw->num_messages = 0; if (!result) { - NL_LOG(LOG_DEBUG, "nw %p offset %d: flush with %p() failed", nw, nw->offset, nw->cb); + NL_LOG(LOG_DEBUG, "nw %p flush with %p() failed", nw, nw->cb); } return (result); } /* * Flushes previous data and allocates new underlying storage * sufficient for holding at least @required_len bytes. * Return true on success. */ bool -_nlmsg_refill_buffer(struct nl_writer *nw, int required_len) +_nlmsg_refill_buffer(struct nl_writer *nw, u_int required_len) { - struct nl_writer ns_new = {}; - int completed_len, new_len; + struct nl_buf *new; + u_int completed_len, new_len, last_len; + + MPASS(nw->buf != NULL); if (nw->enomem) return (false); - NL_LOG(LOG_DEBUG3, "no space at offset %d/%d (want %d), trying to reclaim", - nw->offset, nw->alloc_len, required_len); + NL_LOG(LOG_DEBUG3, "no space at offset %u/%u (want %u), trying to " + "reclaim", nw->buf->datalen, nw->buf->buflen, required_len); - /* Calculated new buffer size and allocate it s*/ - completed_len = (nw->hdr != NULL) ? (char *)nw->hdr - nw->data : nw->offset; + /* Calculate new buffer size and allocate it. */ + completed_len = (nw->hdr != NULL) ? + (char *)nw->hdr - nw->buf->data : nw->buf->datalen; if (completed_len > 0 && required_len < NLMBUFSIZE) { - /* We already ran out of space, use the largest effective size */ - new_len = max(nw->alloc_len, NLMBUFSIZE); + /* We already ran out of space, use largest effective size. 
*/ + new_len = max(nw->buf->buflen, NLMBUFSIZE); } else { - if (nw->alloc_len < NLMBUFSIZE) + if (nw->buf->buflen < NLMBUFSIZE) + /* XXXGL: does this happen? */ new_len = NLMBUFSIZE; else - new_len = nw->alloc_len * 2; + new_len = nw->buf->buflen * 2; while (new_len < required_len) new_len *= 2; } - bool waitok = (nw->malloc_flag == M_WAITOK); - bool is_linux = (nw->writer_type == NS_WRITER_TYPE_LBUF); - if (!nlmsg_get_buf(&ns_new, new_len, waitok, is_linux)) { + + new = nl_buf_alloc(new_len, nw->malloc_flag | M_ZERO); + if (__predict_false(new == NULL)) { nw->enomem = true; NL_LOG(LOG_DEBUG, "getting new buf failed, setting ENOMEM"); return (false); } - if (nw->ignore_limit) - nlmsg_ignore_limit(&ns_new); - /* Update callback data */ - ns_new.writer_target = nw->writer_target; - nlmsg_set_callback(&ns_new); - ns_new.arg = nw->arg; - - /* Copy last (unfinished) header to the new storage */ - int last_len = nw->offset - completed_len; + /* Copy last (unfinished) header to the new storage. */ + last_len = nw->buf->datalen - completed_len; if (last_len > 0) { - memcpy(ns_new.data, nw->hdr, last_len); - ns_new.hdr = (struct nlmsghdr *)ns_new.data; - ns_new.offset = last_len; + memcpy(new->data, nw->hdr, last_len); + new->datalen = last_len; } - NL_LOG(LOG_DEBUG2, "completed: %d bytes, copied: %d bytes", completed_len, last_len); + NL_LOG(LOG_DEBUG2, "completed: %u bytes, copied: %u bytes", + completed_len, last_len); - /* Flush completed headers & switch to the new nw */ - nlmsg_flush(nw); - memcpy(nw, &ns_new, sizeof(struct nl_writer)); - NL_LOG(LOG_DEBUG2, "switched buffer: used %d/%d bytes", nw->offset, nw->alloc_len); + if (completed_len > 0) { + nlmsg_flush(nw); + MPASS(nw->buf == NULL); + } else + nl_buf_free(nw->buf); + nw->buf = new; + nw->hdr = (last_len > 0) ? 
(struct nlmsghdr *)new->data : NULL; + NL_LOG(LOG_DEBUG2, "switched buffer: used %u/%u bytes", + new->datalen, new->buflen); return (true); } bool _nlmsg_add(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type, uint16_t flags, uint32_t len) { + struct nl_buf *nb = nw->buf; struct nlmsghdr *hdr; + u_int required_len; MPASS(nw->hdr == NULL); - int required_len = NETLINK_ALIGN(len + sizeof(struct nlmsghdr)); - if (__predict_false(nw->offset + required_len > nw->alloc_len)) { + required_len = NETLINK_ALIGN(len + sizeof(struct nlmsghdr)); + if (__predict_false(nb->datalen + required_len > nb->buflen)) { if (!nlmsg_refill_buffer(nw, required_len)) return (false); + nb = nw->buf; } - hdr = (struct nlmsghdr *)(&nw->data[nw->offset]); + hdr = (struct nlmsghdr *)(&nb->data[nb->datalen]); hdr->nlmsg_len = len; hdr->nlmsg_type = type; hdr->nlmsg_flags = flags; hdr->nlmsg_seq = seq; hdr->nlmsg_pid = portid; nw->hdr = hdr; - nw->offset += sizeof(struct nlmsghdr); + nb->datalen += sizeof(struct nlmsghdr); return (true); } bool _nlmsg_end(struct nl_writer *nw) { + struct nl_buf *nb = nw->buf; + MPASS(nw->hdr != NULL); if (nw->enomem) { NL_LOG(LOG_DEBUG, "ENOMEM when dumping message"); nlmsg_abort(nw); return (false); } - nw->hdr->nlmsg_len = (uint32_t)(nw->data + nw->offset - (char *)nw->hdr); + nw->hdr->nlmsg_len = nb->data + nb->datalen - (char *)nw->hdr; NL_LOG(LOG_DEBUG2, "wrote msg len: %u type: %d: flags: 0x%X seq: %u pid: %u", nw->hdr->nlmsg_len, nw->hdr->nlmsg_type, nw->hdr->nlmsg_flags, nw->hdr->nlmsg_seq, nw->hdr->nlmsg_pid); nw->hdr = NULL; nw->num_messages++; return (true); } void _nlmsg_abort(struct nl_writer *nw) { + struct nl_buf *nb = nw->buf; + if (nw->hdr != NULL) { - nw->offset = (uint32_t)((char *)nw->hdr - nw->data); + nb->datalen = (char *)nw->hdr - nb->data; nw->hdr = NULL; } } void nlmsg_ack(struct nlpcb *nlp, int error, struct nlmsghdr *hdr, struct nl_pstate *npt) { struct nlmsgerr *errmsg; int payload_len; uint32_t flags = nlp->nl_flags; 
struct nl_writer *nw = npt->nw; bool cap_ack; payload_len = sizeof(struct nlmsgerr); /* * The only case when we send the full message in the * reply is when there is an error and NETLINK_CAP_ACK * is not set. */ cap_ack = (error == 0) || (flags & NLF_CAP_ACK); if (!cap_ack) payload_len += hdr->nlmsg_len - sizeof(struct nlmsghdr); payload_len = NETLINK_ALIGN(payload_len); uint16_t nl_flags = cap_ack ? NLM_F_CAPPED : 0; if ((npt->err_msg || npt->err_off) && nlp->nl_flags & NLF_EXT_ACK) nl_flags |= NLM_F_ACK_TLVS; NL_LOG(LOG_DEBUG3, "acknowledging message type %d seq %d", hdr->nlmsg_type, hdr->nlmsg_seq); if (!nlmsg_add(nw, nlp->nl_port, hdr->nlmsg_seq, NLMSG_ERROR, nl_flags, payload_len)) goto enomem; errmsg = nlmsg_reserve_data(nw, payload_len, struct nlmsgerr); errmsg->error = error; /* In case of error copy the whole message, else just the header */ memcpy(&errmsg->msg, hdr, cap_ack ? sizeof(*hdr) : hdr->nlmsg_len); if (npt->err_msg != NULL && nlp->nl_flags & NLF_EXT_ACK) nlattr_add_string(nw, NLMSGERR_ATTR_MSG, npt->err_msg); if (npt->err_off != 0 && nlp->nl_flags & NLF_EXT_ACK) nlattr_add_u32(nw, NLMSGERR_ATTR_OFFS, npt->err_off); if (npt->cookie != NULL) nlattr_add_raw(nw, npt->cookie); if (nlmsg_end(nw)) return; enomem: NLP_LOG(LOG_DEBUG, nlp, "error allocating ack data for message %d seq %u", hdr->nlmsg_type, hdr->nlmsg_seq); nlmsg_abort(nw); } bool _nlmsg_end_dump(struct nl_writer *nw, int error, struct nlmsghdr *hdr) { if (!nlmsg_add(nw, hdr->nlmsg_pid, hdr->nlmsg_seq, NLMSG_DONE, 0, sizeof(int))) { NL_LOG(LOG_DEBUG, "Error finalizing table dump"); return (false); } /* Save operation result */ int *perror = nlmsg_reserve_object(nw, int); NL_LOG(LOG_DEBUG2, "record error=%d at off %d (%p)", error, - nw->offset, perror); + nw->buf->datalen, perror); *perror = error; nlmsg_end(nw); nw->suppress_ack = true; return (true); } /* * KPI functions. 
*/ -int +u_int nlattr_save_offset(const struct nl_writer *nw) { - return (nw->offset - ((char *)nw->hdr - nw->data)); + return (nw->buf->datalen - ((char *)nw->hdr - nw->buf->data)); } void * nlmsg_reserve_data_raw(struct nl_writer *nw, size_t sz) { - sz = NETLINK_ALIGN(sz); + struct nl_buf *nb = nw->buf; + void *data; - if (__predict_false(nw->offset + sz > nw->alloc_len)) { + sz = NETLINK_ALIGN(sz); + if (__predict_false(nb->datalen + sz > nb->buflen)) { if (!nlmsg_refill_buffer(nw, sz)) return (NULL); + nb = nw->buf; } - void *data_ptr = &nw->data[nw->offset]; - nw->offset += sz; - bzero(data_ptr, sz); + data = &nb->data[nb->datalen]; + bzero(data, sz); + nb->datalen += sz; - return (data_ptr); + return (data); } bool nlattr_add(struct nl_writer *nw, int attr_type, int attr_len, const void *data) { - int required_len = NLA_ALIGN(attr_len + sizeof(struct nlattr)); + struct nl_buf *nb = nw->buf; + struct nlattr *nla; + u_int required_len; - if (__predict_false(nw->offset + required_len > nw->alloc_len)) { + required_len = NLA_ALIGN(attr_len + sizeof(struct nlattr)); + if (__predict_false(nb->datalen + required_len > nb->buflen)) { if (!nlmsg_refill_buffer(nw, required_len)) return (false); + nb = nw->buf; } - struct nlattr *nla = (struct nlattr *)(&nw->data[nw->offset]); + nla = (struct nlattr *)(&nb->data[nb->datalen]); nla->nla_len = attr_len + sizeof(struct nlattr); nla->nla_type = attr_type; if (attr_len > 0) { if ((attr_len % 4) != 0) { /* clear padding bytes */ bzero((char *)nla + required_len - 4, 4); } memcpy((nla + 1), data, attr_len); } - nw->offset += required_len; + nb->datalen += required_len; return (true); } #include diff --git a/sys/netlink/netlink_message_writer.h b/sys/netlink/netlink_message_writer.h index 68e434094678..28f3fb78018c 100644 --- a/sys/netlink/netlink_message_writer.h +++ b/sys/netlink/netlink_message_writer.h @@ -1,327 +1,300 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2021 Ng Peng Nam Sean * Copyright (c) 
2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #ifndef _NETLINK_NETLINK_MESSAGE_WRITER_H_ #define _NETLINK_NETLINK_MESSAGE_WRITER_H_ #ifdef _KERNEL #include /* * It is not meant to be included directly */ -struct mbuf; +struct nl_buf; struct nl_writer; -typedef bool nl_writer_cb(struct nl_writer *nw, void *buf, int buflen, int cnt); +typedef bool nl_writer_cb(struct nl_writer *nw); struct nl_writer { - int alloc_len; /* allocated buffer length */ - int offset; /* offset from the start of the buffer */ - struct nlmsghdr *hdr; /* Pointer to the currently-filled msg */ - char *data; /* pointer to the contiguous storage */ - void *_storage; /* Underlying storage pointer */ - nl_writer_cb *cb; /* Callback to flush data */ + struct nl_buf *buf; /* Underlying storage pointer */ + struct nlmsghdr *hdr; /* Pointer to the currently-filled msg */ + nl_writer_cb *cb; /* Callback to flush data */ union { - void *ptr; + struct nlpcb *nlp; struct { uint16_t proto; uint16_t id; } group; - } arg; - int num_messages; /* Number of messages in the buffer */ - int malloc_flag; /* M_WAITOK or M_NOWAIT */ - uint8_t writer_type; /* NS_WRITER_TYPE_* */ - uint8_t writer_target; /* NS_WRITER_TARGET_* */ - bool ignore_limit; /* If true, ignores RCVBUF limit */ - bool enomem; /* True if ENOMEM occured */ - bool suppress_ack; /* If true, don't send NLMSG_ERR */ + }; + u_int num_messages; /* Number of messages in the buffer */ + int malloc_flag; /* M_WAITOK or M_NOWAIT */ + bool ignore_limit; /* If true, ignores RCVBUF limit */ + bool enomem; /* True if ENOMEM occured */ + bool suppress_ack; /* If true, don't send NLMSG_ERR */ }; -#define NS_WRITER_TARGET_SOCKET 0 -#define NS_WRITER_TARGET_GROUP 1 -#define NS_WRITER_TARGET_CHAIN 2 - -#define NS_WRITER_TYPE_MBUF 0 -#define NS_WRITER_TYPE_BUF 1 -#define NS_WRITER_TYPE_LBUF 2 -#define NS_WRITER_TYPE_MBUFC 3 -#define NS_WRITER_TYPE_STUB 4 - #define NLMSG_SMALL 128 #define NLMSG_LARGE 2048 /* Message and attribute writing */ - -struct nlpcb; - #if defined(NETLINK) || defined(NETLINK_MODULE) /* 
Provide optimized calls to the functions inside the same linking unit */ bool _nlmsg_get_unicast_writer(struct nl_writer *nw, int expected_size, struct nlpcb *nlp); bool _nlmsg_get_group_writer(struct nl_writer *nw, int expected_size, int proto, int group_id); -bool _nlmsg_get_chain_writer(struct nl_writer *nw, int expected_size, struct mbuf **pm); bool _nlmsg_flush(struct nl_writer *nw); void _nlmsg_ignore_limit(struct nl_writer *nw); -bool _nlmsg_refill_buffer(struct nl_writer *nw, int required_size); +bool _nlmsg_refill_buffer(struct nl_writer *nw, u_int required_len); bool _nlmsg_add(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type, uint16_t flags, uint32_t len); bool _nlmsg_end(struct nl_writer *nw); void _nlmsg_abort(struct nl_writer *nw); bool _nlmsg_end_dump(struct nl_writer *nw, int error, struct nlmsghdr *hdr); static inline bool nlmsg_get_unicast_writer(struct nl_writer *nw, int expected_size, struct nlpcb *nlp) { return (_nlmsg_get_unicast_writer(nw, expected_size, nlp)); } static inline bool nlmsg_get_group_writer(struct nl_writer *nw, int expected_size, int proto, int group_id) { return (_nlmsg_get_group_writer(nw, expected_size, proto, group_id)); } -static inline bool -nlmsg_get_chain_writer(struct nl_writer *nw, int expected_size, struct mbuf **pm) -{ - return (_nlmsg_get_chain_writer(nw, expected_size, pm)); -} - static inline bool nlmsg_flush(struct nl_writer *nw) { return (_nlmsg_flush(nw)); } static inline void nlmsg_ignore_limit(struct nl_writer *nw) { _nlmsg_ignore_limit(nw); } static inline bool nlmsg_refill_buffer(struct nl_writer *nw, int required_size) { return (_nlmsg_refill_buffer(nw, required_size)); } static inline bool nlmsg_add(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type, uint16_t flags, uint32_t len) { return (_nlmsg_add(nw, portid, seq, type, flags, len)); } static inline bool nlmsg_end(struct nl_writer *nw) { return (_nlmsg_end(nw)); } static inline void nlmsg_abort(struct nl_writer *nw) { 
return (_nlmsg_abort(nw)); } static inline bool nlmsg_end_dump(struct nl_writer *nw, int error, struct nlmsghdr *hdr) { return (_nlmsg_end_dump(nw, error, hdr)); } #else /* Provide access to the functions via netlink_glue.c */ bool nlmsg_get_unicast_writer(struct nl_writer *nw, int expected_size, struct nlpcb *nlp); bool nlmsg_get_group_writer(struct nl_writer *nw, int expected_size, int proto, int group_id); bool nlmsg_get_chain_writer(struct nl_writer *nw, int expected_size, struct mbuf **pm); bool nlmsg_flush(struct nl_writer *nw); void nlmsg_ignore_limit(struct nl_writer *nw); bool nlmsg_refill_buffer(struct nl_writer *nw, int required_size); bool nlmsg_add(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type, uint16_t flags, uint32_t len); bool nlmsg_end(struct nl_writer *nw); void nlmsg_abort(struct nl_writer *nw); bool nlmsg_end_dump(struct nl_writer *nw, int error, struct nlmsghdr *hdr); #endif /* defined(NETLINK) || defined(NETLINK_MODULE) */ static inline bool nlmsg_reply(struct nl_writer *nw, const struct nlmsghdr *hdr, int payload_len) { return (nlmsg_add(nw, hdr->nlmsg_pid, hdr->nlmsg_seq, hdr->nlmsg_type, hdr->nlmsg_flags, payload_len)); } -#define nlmsg_data(_hdr) ((void *)((_hdr) + 1)) - /* * KPI similar to mtodo(): * current (uncompleted) header is guaranteed to be contiguous, * but can be reallocated, thus pointers may need to be readjusted. 
*/ u_int nlattr_save_offset(const struct nl_writer *nw); static inline void * _nlattr_restore_offset(const struct nl_writer *nw, int off) { return ((void *)((char *)nw->hdr + off)); } #define nlattr_restore_offset(_ns, _off, _t) ((_t *)_nlattr_restore_offset(_ns, _off)) static inline void nlattr_set_len(const struct nl_writer *nw, int off) { struct nlattr *nla = nlattr_restore_offset(nw, off, struct nlattr); nla->nla_len = nlattr_save_offset(nw) - off; } void *nlmsg_reserve_data_raw(struct nl_writer *nw, size_t sz); #define nlmsg_reserve_object(_ns, _t) ((_t *)nlmsg_reserve_data_raw(_ns, sizeof(_t))) #define nlmsg_reserve_data(_ns, _sz, _t) ((_t *)nlmsg_reserve_data_raw(_ns, _sz)) static inline int nlattr_add_nested(struct nl_writer *nw, uint16_t nla_type) { int off = nlattr_save_offset(nw); struct nlattr *nla = nlmsg_reserve_data(nw, sizeof(struct nlattr), struct nlattr); if (__predict_false(nla == NULL)) return (0); nla->nla_type = nla_type; return (off); } static inline void * _nlmsg_reserve_attr(struct nl_writer *nw, uint16_t nla_type, uint16_t sz) { sz += sizeof(struct nlattr); struct nlattr *nla = nlmsg_reserve_data(nw, sz, struct nlattr); if (__predict_false(nla == NULL)) return (NULL); nla->nla_type = nla_type; nla->nla_len = sz; return ((void *)(nla + 1)); } #define nlmsg_reserve_attr(_ns, _at, _t) ((_t *)_nlmsg_reserve_attr(_ns, _at, NLA_ALIGN(sizeof(_t)))) bool nlattr_add(struct nl_writer *nw, int attr_type, int attr_len, const void *data); static inline bool nlattr_add_raw(struct nl_writer *nw, const struct nlattr *nla_src) { int attr_len = nla_src->nla_len - sizeof(struct nlattr); MPASS(attr_len >= 0); return (nlattr_add(nw, nla_src->nla_type, attr_len, (const void *)(nla_src + 1))); } static inline bool nlattr_add_u8(struct nl_writer *nw, int attrtype, uint8_t value) { return (nlattr_add(nw, attrtype, sizeof(uint8_t), &value)); } static inline bool nlattr_add_u16(struct nl_writer *nw, int attrtype, uint16_t value) { return (nlattr_add(nw, attrtype, 
sizeof(uint16_t), &value)); } static inline bool nlattr_add_u32(struct nl_writer *nw, int attrtype, uint32_t value) { return (nlattr_add(nw, attrtype, sizeof(uint32_t), &value)); } static inline bool nlattr_add_u64(struct nl_writer *nw, int attrtype, uint64_t value) { return (nlattr_add(nw, attrtype, sizeof(uint64_t), &value)); } static inline bool nlattr_add_s8(struct nl_writer *nw, int attrtype, int8_t value) { return (nlattr_add(nw, attrtype, sizeof(int8_t), &value)); } static inline bool nlattr_add_s16(struct nl_writer *nw, int attrtype, int16_t value) { return (nlattr_add(nw, attrtype, sizeof(int16_t), &value)); } static inline bool nlattr_add_s32(struct nl_writer *nw, int attrtype, int32_t value) { return (nlattr_add(nw, attrtype, sizeof(int32_t), &value)); } static inline bool nlattr_add_s64(struct nl_writer *nw, int attrtype, int64_t value) { return (nlattr_add(nw, attrtype, sizeof(int64_t), &value)); } static inline bool nlattr_add_flag(struct nl_writer *nw, int attrtype) { return (nlattr_add(nw, attrtype, 0, NULL)); } static inline bool nlattr_add_string(struct nl_writer *nw, int attrtype, const char *str) { return (nlattr_add(nw, attrtype, strlen(str) + 1, str)); } static inline bool nlattr_add_in_addr(struct nl_writer *nw, int attrtype, const struct in_addr *in) { return (nlattr_add(nw, attrtype, sizeof(*in), in)); } static inline bool nlattr_add_in6_addr(struct nl_writer *nw, int attrtype, const struct in6_addr *in6) { return (nlattr_add(nw, attrtype, sizeof(*in6), in6)); } #endif #endif diff --git a/sys/netlink/netlink_module.c b/sys/netlink/netlink_module.c index e63048072ae9..ddae4488987b 100644 --- a/sys/netlink/netlink_module.c +++ b/sys/netlink/netlink_module.c @@ -1,253 +1,250 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2021 Ng Peng Nam Sean * Copyright (c) 2022 Alexander V. 
Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(netlink, "Netlink support"); #define DEBUG_MOD_NAME nl_mod #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include _DECLARE_DEBUG(LOG_INFO); #define NL_MAX_HANDLERS 20 struct nl_proto_handler _nl_handlers[NL_MAX_HANDLERS]; struct nl_proto_handler *nl_handlers = _nl_handlers; CK_LIST_HEAD(nl_control_head, nl_control); static struct nl_control_head vnets_head = CK_LIST_HEAD_INITIALIZER(); VNET_DEFINE(struct nl_control *, nl_ctl) = NULL; struct mtx nl_global_mtx; MTX_SYSINIT(nl_global_mtx, &nl_global_mtx, "global netlink lock", MTX_DEF); #define NL_GLOBAL_LOCK() mtx_lock(&nl_global_mtx) #define NL_GLOBAL_UNLOCK() mtx_unlock(&nl_global_mtx) int netlink_unloading = 0; static void free_nl_ctl(struct nl_control *ctl) { rm_destroy(&ctl->ctl_lock); free(ctl, M_NETLINK); } struct nl_control * vnet_nl_ctl_init(void) { struct nl_control *ctl; ctl = malloc(sizeof(struct nl_control), M_NETLINK, M_WAITOK | M_ZERO); rm_init(&ctl->ctl_lock, "netlink lock"); CK_LIST_INIT(&ctl->ctl_port_head); CK_LIST_INIT(&ctl->ctl_pcb_head); NL_GLOBAL_LOCK(); struct nl_control *tmp = atomic_load_ptr(&V_nl_ctl); if (tmp == NULL) { atomic_store_ptr(&V_nl_ctl, ctl); CK_LIST_INSERT_HEAD(&vnets_head, ctl, ctl_next); NL_LOG(LOG_DEBUG2, "VNET %p init done, inserted %p into global list", curvnet, ctl); } else { NL_LOG(LOG_DEBUG, "per-VNET init clash, dropping this instance"); free_nl_ctl(ctl); ctl = tmp; } NL_GLOBAL_UNLOCK(); return (ctl); } static void vnet_nl_ctl_destroy(const void *unused __unused) { struct nl_control *ctl; /* Assume at the time all of the processes / sockets are dead */ NL_GLOBAL_LOCK(); ctl = atomic_load_ptr(&V_nl_ctl); atomic_store_ptr(&V_nl_ctl, NULL); if (ctl != NULL) { NL_LOG(LOG_DEBUG2, "Removing %p from global list", ctl); CK_LIST_REMOVE(ctl, ctl_next); } NL_GLOBAL_UNLOCK(); if (ctl != NULL) free_nl_ctl(ctl); } VNET_SYSUNINIT(vnet_nl_ctl_destroy, SI_SUB_PROTO_IF, 
SI_ORDER_ANY, vnet_nl_ctl_destroy, NULL); int nl_verify_proto(int proto) { if (proto < 0 || proto >= NL_MAX_HANDLERS) { return (EINVAL); } int handler_defined = nl_handlers[proto].cb != NULL; return (handler_defined ? 0 : EPROTONOSUPPORT); } const char * nl_get_proto_name(int proto) { return (nl_handlers[proto].proto_name); } bool netlink_register_proto(int proto, const char *proto_name, nl_handler_f handler) { if ((proto < 0) || (proto >= NL_MAX_HANDLERS)) return (false); NL_GLOBAL_LOCK(); KASSERT((nl_handlers[proto].cb == NULL), ("netlink handler %d is already set", proto)); nl_handlers[proto].cb = handler; nl_handlers[proto].proto_name = proto_name; NL_GLOBAL_UNLOCK(); NL_LOG(LOG_DEBUG2, "Registered netlink %s(%d) handler", proto_name, proto); return (true); } bool netlink_unregister_proto(int proto) { if ((proto < 0) || (proto >= NL_MAX_HANDLERS)) return (false); NL_GLOBAL_LOCK(); KASSERT((nl_handlers[proto].cb != NULL), ("netlink handler %d is not set", proto)); nl_handlers[proto].cb = NULL; nl_handlers[proto].proto_name = NULL; NL_GLOBAL_UNLOCK(); NL_LOG(LOG_DEBUG2, "Unregistered netlink proto %d handler", proto); return (true); } #if !defined(NETLINK) && defined(NETLINK_MODULE) /* Non-stub function provider */ const static struct nl_function_wrapper nl_module = { .nlmsg_add = _nlmsg_add, .nlmsg_refill_buffer = _nlmsg_refill_buffer, .nlmsg_flush = _nlmsg_flush, .nlmsg_end = _nlmsg_end, .nlmsg_abort = _nlmsg_abort, .nlmsg_get_unicast_writer = _nlmsg_get_unicast_writer, .nlmsg_get_group_writer = _nlmsg_get_group_writer, - .nlmsg_get_chain_writer = _nlmsg_get_chain_writer, .nlmsg_end_dump = _nlmsg_end_dump, .nl_modify_ifp_generic = _nl_modify_ifp_generic, .nl_store_ifp_cookie = _nl_store_ifp_cookie, .nl_get_thread_nlp = _nl_get_thread_nlp, }; #endif static bool can_unload(void) { struct nl_control *ctl; bool result = true; NL_GLOBAL_LOCK(); CK_LIST_FOREACH(ctl, &vnets_head, ctl_next) { NL_LOG(LOG_DEBUG2, "Iterating VNET head %p", ctl); if 
(!CK_LIST_EMPTY(&ctl->ctl_pcb_head)) { NL_LOG(LOG_NOTICE, "non-empty socket list in ctl %p", ctl); result = false; break; } } NL_GLOBAL_UNLOCK(); return (result); } static int netlink_modevent(module_t mod __unused, int what, void *priv __unused) { int ret = 0; switch (what) { case MOD_LOAD: NL_LOG(LOG_DEBUG2, "Loading"); - nl_init_msg_zone(); nl_osd_register(); #if !defined(NETLINK) && defined(NETLINK_MODULE) nl_set_functions(&nl_module); #endif break; case MOD_UNLOAD: NL_LOG(LOG_DEBUG2, "Unload called"); if (can_unload()) { NL_LOG(LOG_WARNING, "unloading"); netlink_unloading = 1; #if !defined(NETLINK) && defined(NETLINK_MODULE) nl_set_functions(NULL); #endif nl_osd_unregister(); - nl_destroy_msg_zone(); } else ret = EBUSY; break; default: ret = EOPNOTSUPP; break; } return (ret); } static moduledata_t netlink_mod = { "netlink", netlink_modevent, NULL }; DECLARE_MODULE(netlink, netlink_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(netlink, 1); diff --git a/sys/netlink/netlink_var.h b/sys/netlink/netlink_var.h index ec174e17d1a2..97532c31e54b 100644 --- a/sys/netlink/netlink_var.h +++ b/sys/netlink/netlink_var.h @@ -1,214 +1,204 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2021 Ng Peng Nam Sean * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _NETLINK_NETLINK_VAR_H_ #define _NETLINK_NETLINK_VAR_H_ #ifdef _KERNEL #include #include #include #include #include #define NLSNDQ 65536 /* Default socket sendspace */ #define NLRCVQ 65536 /* Default socket recvspace */ #define NLMBUFSIZE 2048 /* External storage size for Netlink mbufs */ struct ucred; -struct nl_io_queue { - STAILQ_HEAD(, mbuf) head; - int length; - int hiwat; -}; - struct nl_buf { TAILQ_ENTRY(nl_buf) tailq; + struct mbuf *control; u_int buflen; u_int datalen; u_int offset; char data[]; }; #define NLP_MAX_GROUPS 128 struct nlpcb { struct socket *nl_socket; uint64_t nl_groups[NLP_MAX_GROUPS / 64]; uint32_t nl_port; uint32_t nl_flags; uint32_t nl_process_id; int nl_proto; bool nl_bound; bool nl_task_pending; bool nl_tx_blocked; /* No new requests accepted */ bool nl_linux; /* true if running under compat */ bool nl_unconstrained_vnet; /* true if running under VNET jail (or without jail) */ bool nl_need_thread_setup; - struct nl_io_queue tx_queue; struct taskqueue *nl_taskqueue; struct task nl_task; struct ucred *nl_cred; /* Copy of nl_socket->so_cred */ uint64_t nl_dropped_bytes; uint64_t nl_dropped_messages; CK_LIST_ENTRY(nlpcb) nl_next; CK_LIST_ENTRY(nlpcb) nl_port_next; volatile u_int 
nl_refcount; struct mtx nl_lock; struct epoch_context nl_epoch_ctx; }; #define sotonlpcb(so) ((struct nlpcb *)(so)->so_pcb) #define NLP_LOCK_INIT(_nlp) mtx_init(&((_nlp)->nl_lock), "nlp mtx", NULL, MTX_DEF) #define NLP_LOCK_DESTROY(_nlp) mtx_destroy(&((_nlp)->nl_lock)) #define NLP_LOCK(_nlp) mtx_lock(&((_nlp)->nl_lock)) #define NLP_UNLOCK(_nlp) mtx_unlock(&((_nlp)->nl_lock)) #define ALIGNED_NL_SZ(_data) roundup2((((struct nlmsghdr *)(_data))->nlmsg_len), 16) /* nl_flags */ #define NLF_CAP_ACK 0x01 /* Do not send message body with errmsg */ #define NLF_EXT_ACK 0x02 /* Allow including extended TLVs in ack */ #define NLF_STRICT 0x04 /* Perform strict header checks */ #define NLF_MSG_INFO 0x08 /* Send caller info along with the notifications */ SYSCTL_DECL(_net_netlink); SYSCTL_DECL(_net_netlink_debug); struct nl_control { CK_LIST_HEAD(nl_pid_head, nlpcb) ctl_port_head; CK_LIST_HEAD(nlpcb_head, nlpcb) ctl_pcb_head; CK_LIST_ENTRY(nl_control) ctl_next; struct rmlock ctl_lock; }; VNET_DECLARE(struct nl_control *, nl_ctl); #define V_nl_ctl VNET(nl_ctl) struct sockaddr_nl; struct sockaddr; struct nlmsghdr; /* netlink_module.c */ struct nl_control *vnet_nl_ctl_init(void); int nl_verify_proto(int proto); const char *nl_get_proto_name(int proto); extern int netlink_unloading; struct nl_proto_handler { nl_handler_f cb; const char *proto_name; }; extern struct nl_proto_handler *nl_handlers; /* netlink_domain.c */ -void nl_send_group(struct mbuf *m, int cnt, int proto, int group_id); +bool nl_send_group(struct nl_writer *); void nl_osd_register(void); void nl_osd_unregister(void); void nl_set_thread_nlp(struct thread *td, struct nlpcb *nlp); /* netlink_io.c */ #define NL_IOF_UNTRANSLATED 0x01 #define NL_IOF_IGNORE_LIMIT 0x02 -bool nl_send_one(struct mbuf *m, struct nlpcb *nlp, int cnt, int io_flags); +bool nl_send_one(struct nl_writer *); void nlmsg_ack(struct nlpcb *nlp, int error, struct nlmsghdr *nlmsg, struct nl_pstate *npt); void nl_on_transmit(struct nlpcb *nlp); -void 
nl_init_io(struct nlpcb *nlp); -void nl_free_io(struct nlpcb *nlp); void nl_taskqueue_handler(void *_arg, int pending); void nl_schedule_taskqueue(struct nlpcb *nlp); void nl_process_receive_locked(struct nlpcb *nlp); void nl_set_source_metadata(struct mbuf *m, int num_messages); -void nl_add_msg_info(struct mbuf *m); - -/* netlink_message_writer.c */ -void nl_init_msg_zone(void); -void nl_destroy_msg_zone(void); +void nl_add_msg_info(struct nl_buf *nb); +struct nl_buf *nl_buf_alloc(size_t len, int mflag); +void nl_buf_free(struct nl_buf *nb); /* netlink_generic.c */ struct genl_family { const char *family_name; uint16_t family_hdrsize; uint16_t family_id; uint16_t family_version; uint16_t family_attr_max; uint16_t family_cmd_size; uint16_t family_num_groups; struct genl_cmd *family_cmds; }; struct genl_group { struct genl_family *group_family; const char *group_name; }; struct genl_family *genl_get_family(uint32_t family_id); struct genl_group *genl_get_group(uint32_t group_id); #define MAX_FAMILIES 20 #define MAX_GROUPS 64 #define MIN_GROUP_NUM 48 #define CTRL_FAMILY_NAME "nlctrl" struct ifnet; struct nl_parsed_link; struct nlattr_bmask; struct nl_pstate; /* Function map */ struct nl_function_wrapper { bool (*nlmsg_add)(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type, uint16_t flags, uint32_t len); bool (*nlmsg_refill_buffer)(struct nl_writer *nw, int required_len); bool (*nlmsg_flush)(struct nl_writer *nw); bool (*nlmsg_end)(struct nl_writer *nw); void (*nlmsg_abort)(struct nl_writer *nw); void (*nlmsg_ignore_limit)(struct nl_writer *nw); bool (*nlmsg_get_unicast_writer)(struct nl_writer *nw, int size, struct nlpcb *nlp); bool (*nlmsg_get_group_writer)(struct nl_writer *nw, int size, int protocol, int group_id); bool (*nlmsg_get_chain_writer)(struct nl_writer *nw, int size, struct mbuf **pm); bool (*nlmsg_end_dump)(struct nl_writer *nw, int error, struct nlmsghdr *hdr); int (*nl_modify_ifp_generic)(struct ifnet *ifp, struct nl_parsed_link 
*lattrs, const struct nlattr_bmask *bm, struct nl_pstate *npt); void (*nl_store_ifp_cookie)(struct nl_pstate *npt, struct ifnet *ifp); struct nlpcb * (*nl_get_thread_nlp)(struct thread *td); }; void nl_set_functions(const struct nl_function_wrapper *nl); #endif #endif diff --git a/sys/netlink/route/rt.c b/sys/netlink/route/rt.c index ed09748995dc..ffa06fb4c1ab 100644 --- a/sys/netlink/route/rt.c +++ b/sys/netlink/route/rt.c @@ -1,1119 +1,1117 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2021 Ng Peng Nam Sean * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include "opt_inet.h" #include "opt_inet6.h" #include "opt_route.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define DEBUG_MOD_NAME nl_route #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include _DECLARE_DEBUG(LOG_INFO); static unsigned char get_rtm_type(const struct nhop_object *nh) { int nh_flags = nh->nh_flags; /* Use the fact that nhg runtime flags are only NHF_MULTIPATH */ if (nh_flags & NHF_BLACKHOLE) return (RTN_BLACKHOLE); else if (nh_flags & NHF_REJECT) return (RTN_PROHIBIT); return (RTN_UNICAST); } static uint8_t nl_get_rtm_protocol(const struct nhop_object *nh) { #ifdef ROUTE_MPATH if (NH_IS_NHGRP(nh)) { const struct nhgrp_object *nhg = (const struct nhgrp_object *)nh; uint8_t origin = nhgrp_get_origin(nhg); if (origin != RTPROT_UNSPEC) return (origin); nh = nhg->nhops[0]; } #endif uint8_t origin = nhop_get_origin(nh); if (origin != RTPROT_UNSPEC) return (origin); /* TODO: remove guesswork once all kernel users fill in origin */ int rt_flags = nhop_get_rtflags(nh); if (rt_flags & RTF_PROTO1) return (RTPROT_ZEBRA); if (rt_flags & RTF_STATIC) return (RTPROT_STATIC); return (RTPROT_KERNEL); } static int get_rtmsg_type_from_rtsock(int cmd) { switch (cmd) { case RTM_ADD: case RTM_CHANGE: case RTM_GET: return NL_RTM_NEWROUTE; case RTM_DELETE: return NL_RTM_DELROUTE; } return (0); } /* * fibnum heuristics * * if (dump && rtm_table == 0 && !rta_table) RT_ALL_FIBS * msg rtm_table RTA_TABLE result * RTM_GETROUTE/dump 0 - RT_ALL_FIBS * RTM_GETROUTE/dump 1 - 1 * RTM_GETROUTE/get 0 - 0 * */ static struct nhop_object * rc_get_nhop(const struct rib_cmd_info *rc) { return ((rc->rc_cmd == RTM_DELETE) ? 
rc->rc_nh_old : rc->rc_nh_new); } static void dump_rc_nhop_gw(struct nl_writer *nw, const struct nhop_object *nh) { #ifdef INET6 int upper_family; #endif switch (nhop_get_neigh_family(nh)) { case AF_LINK: /* onlink prefix, skip */ break; case AF_INET: nlattr_add(nw, NL_RTA_GATEWAY, 4, &nh->gw4_sa.sin_addr); break; #ifdef INET6 case AF_INET6: upper_family = nhop_get_upper_family(nh); if (upper_family == AF_INET6) { struct in6_addr gw6 = nh->gw6_sa.sin6_addr; in6_clearscope(&gw6); nlattr_add(nw, NL_RTA_GATEWAY, 16, &gw6); } else if (upper_family == AF_INET) { /* IPv4 over IPv6 */ struct in6_addr gw6 = nh->gw6_sa.sin6_addr; in6_clearscope(&gw6); char buf[20]; struct rtvia *via = (struct rtvia *)&buf[0]; via->rtvia_family = AF_INET6; memcpy(via->rtvia_addr, &gw6, 16); nlattr_add(nw, NL_RTA_VIA, 17, via); } break; #endif } } static void dump_rc_nhop_mtu(struct nl_writer *nw, const struct nhop_object *nh) { int nla_len = sizeof(struct nlattr) * 2 + sizeof(uint32_t); struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr); if (nla == NULL) return; nla->nla_type = NL_RTA_METRICS; nla->nla_len = nla_len; nla++; nla->nla_type = NL_RTAX_MTU; nla->nla_len = sizeof(struct nlattr) + sizeof(uint32_t); *((uint32_t *)(nla + 1)) = nh->nh_mtu; } #ifdef ROUTE_MPATH static void dump_rc_nhg(struct nl_writer *nw, const struct nhgrp_object *nhg, struct rtmsg *rtm) { uint32_t uidx = nhgrp_get_uidx(nhg); uint32_t num_nhops; const struct weightened_nhop *wn = nhgrp_get_nhops(nhg, &num_nhops); uint32_t base_rtflags = nhop_get_rtflags(wn[0].nh); if (uidx != 0) nlattr_add_u32(nw, NL_RTA_NH_ID, uidx); nlattr_add_u32(nw, NL_RTA_KNH_ID, nhgrp_get_idx(nhg)); nlattr_add_u32(nw, NL_RTA_RTFLAGS, base_rtflags); int off = nlattr_add_nested(nw, NL_RTA_MULTIPATH); if (off == 0) return; for (int i = 0; i < num_nhops; i++) { int nh_off = nlattr_save_offset(nw); struct rtnexthop *rtnh = nlmsg_reserve_object(nw, struct rtnexthop); if (rtnh == NULL) return; rtnh->rtnh_flags = 0; rtnh->rtnh_ifindex 
= if_getindex(wn[i].nh->nh_ifp); rtnh->rtnh_hops = wn[i].weight; dump_rc_nhop_gw(nw, wn[i].nh); uint32_t rtflags = nhop_get_rtflags(wn[i].nh); if (rtflags != base_rtflags) nlattr_add_u32(nw, NL_RTA_RTFLAGS, rtflags); if (rtflags & RTF_FIXEDMTU) dump_rc_nhop_mtu(nw, wn[i].nh); rtnh = nlattr_restore_offset(nw, nh_off, struct rtnexthop); /* * nlattr_add() allocates 4-byte aligned storage, no need to aligh * length here * */ rtnh->rtnh_len = nlattr_save_offset(nw) - nh_off; } nlattr_set_len(nw, off); } #endif static void dump_rc_nhop(struct nl_writer *nw, const struct route_nhop_data *rnd, struct rtmsg *rtm) { #ifdef ROUTE_MPATH if (NH_IS_NHGRP(rnd->rnd_nhop)) { dump_rc_nhg(nw, rnd->rnd_nhgrp, rtm); return; } #endif const struct nhop_object *nh = rnd->rnd_nhop; uint32_t rtflags = nhop_get_rtflags(nh); /* * IPv4 over IPv6 * ('RTA_VIA', {'family': 10, 'addr': 'fe80::20c:29ff:fe67:2dd'}), ('RTA_OIF', 2), * IPv4 w/ gw * ('RTA_GATEWAY', '172.16.107.131'), ('RTA_OIF', 2)], * Direct route: * ('RTA_OIF', 2) */ if (nh->nh_flags & NHF_GATEWAY) dump_rc_nhop_gw(nw, nh); uint32_t uidx = nhop_get_uidx(nh); if (uidx != 0) nlattr_add_u32(nw, NL_RTA_NH_ID, uidx); nlattr_add_u32(nw, NL_RTA_KNH_ID, nhop_get_idx(nh)); nlattr_add_u32(nw, NL_RTA_RTFLAGS, rtflags); if (rtflags & RTF_FIXEDMTU) dump_rc_nhop_mtu(nw, nh); uint32_t nh_expire = nhop_get_expire(nh); if (nh_expire > 0) nlattr_add_u32(nw, NL_RTA_EXPIRES, nh_expire - time_uptime); /* In any case, fill outgoing interface */ nlattr_add_u32(nw, NL_RTA_OIF, if_getindex(nh->nh_ifp)); if (rnd->rnd_weight != RT_DEFAULT_WEIGHT) nlattr_add_u32(nw, NL_RTA_WEIGHT, rnd->rnd_weight); } /* * Dumps output from a rib command into an rtmsg */ static int dump_px(uint32_t fibnum, const struct nlmsghdr *hdr, const struct rtentry *rt, struct route_nhop_data *rnd, struct nl_writer *nw) { struct rtmsg *rtm; int error = 0; NET_EPOCH_ASSERT(); if (!nlmsg_reply(nw, hdr, sizeof(struct rtmsg))) goto enomem; int family = rt_get_family(rt); int rtm_off = 
nlattr_save_offset(nw); rtm = nlmsg_reserve_object(nw, struct rtmsg); rtm->rtm_family = family; rtm->rtm_dst_len = 0; rtm->rtm_src_len = 0; rtm->rtm_tos = 0; if (fibnum < 255) rtm->rtm_table = (unsigned char)fibnum; rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_protocol = nl_get_rtm_protocol(rnd->rnd_nhop); rtm->rtm_type = get_rtm_type(rnd->rnd_nhop); nlattr_add_u32(nw, NL_RTA_TABLE, fibnum); int plen = 0; #if defined(INET) || defined(INET6) uint32_t scopeid; #endif switch (family) { #ifdef INET case AF_INET: { struct in_addr addr; rt_get_inet_prefix_plen(rt, &addr, &plen, &scopeid); nlattr_add(nw, NL_RTA_DST, 4, &addr); break; } #endif #ifdef INET6 case AF_INET6: { struct in6_addr addr; rt_get_inet6_prefix_plen(rt, &addr, &plen, &scopeid); nlattr_add(nw, NL_RTA_DST, 16, &addr); break; } #endif default: FIB_LOG(LOG_NOTICE, fibnum, family, "unsupported rt family: %d", family); error = EAFNOSUPPORT; goto flush; } rtm = nlattr_restore_offset(nw, rtm_off, struct rtmsg); if (plen > 0) rtm->rtm_dst_len = plen; dump_rc_nhop(nw, rnd, rtm); if (nlmsg_end(nw)) return (0); enomem: error = ENOMEM; flush: nlmsg_abort(nw); return (error); } static int family_to_group(int family) { switch (family) { case AF_INET: return (RTNLGRP_IPV4_ROUTE); case AF_INET6: return (RTNLGRP_IPV6_ROUTE); } return (0); } static void report_operation(uint32_t fibnum, struct rib_cmd_info *rc, struct nlpcb *nlp, struct nlmsghdr *hdr) { struct nl_writer nw = {}; uint32_t group_id = family_to_group(rt_get_family(rc->rc_rt)); if (nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, group_id)) { struct route_nhop_data rnd = { .rnd_nhop = rc_get_nhop(rc), .rnd_weight = rc->rc_nh_weight, }; hdr->nlmsg_flags &= ~(NLM_F_REPLACE | NLM_F_CREATE); hdr->nlmsg_flags &= ~(NLM_F_EXCL | NLM_F_APPEND); switch (rc->rc_cmd) { case RTM_ADD: hdr->nlmsg_type = NL_RTM_NEWROUTE; hdr->nlmsg_flags |= NLM_F_CREATE | NLM_F_EXCL; break; case RTM_CHANGE: hdr->nlmsg_type = NL_RTM_NEWROUTE; hdr->nlmsg_flags |= NLM_F_REPLACE; break; 
case RTM_DELETE: hdr->nlmsg_type = NL_RTM_DELROUTE; break; } dump_px(fibnum, hdr, rc->rc_rt, &rnd, &nw); nlmsg_flush(&nw); } rtsock_callback_p->route_f(fibnum, rc); } static void set_scope6(struct sockaddr *sa, struct ifnet *ifp) { #ifdef INET6 if (sa != NULL && sa->sa_family == AF_INET6 && ifp != NULL) { struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_LINKLOCAL(&sa6->sin6_addr)) in6_set_unicast_scopeid(&sa6->sin6_addr, if_getindex(ifp)); } #endif } struct rta_mpath_nh { struct sockaddr *gw; struct ifnet *ifp; uint8_t rtnh_flags; uint8_t rtnh_weight; }; #define _IN(_field) offsetof(struct rtnexthop, _field) #define _OUT(_field) offsetof(struct rta_mpath_nh, _field) const static struct nlattr_parser nla_p_rtnh[] = { { .type = NL_RTA_GATEWAY, .off = _OUT(gw), .cb = nlattr_get_ip }, { .type = NL_RTA_VIA, .off = _OUT(gw), .cb = nlattr_get_ipvia }, }; const static struct nlfield_parser nlf_p_rtnh[] = { { .off_in = _IN(rtnh_flags), .off_out = _OUT(rtnh_flags), .cb = nlf_get_u8 }, { .off_in = _IN(rtnh_hops), .off_out = _OUT(rtnh_weight), .cb = nlf_get_u8 }, { .off_in = _IN(rtnh_ifindex), .off_out = _OUT(ifp), .cb = nlf_get_ifpz }, }; #undef _IN #undef _OUT static bool post_p_rtnh(void *_attrs, struct nl_pstate *npt __unused) { struct rta_mpath_nh *attrs = (struct rta_mpath_nh *)_attrs; set_scope6(attrs->gw, attrs->ifp); return (true); } NL_DECLARE_PARSER_EXT(mpath_parser, struct rtnexthop, NULL, nlf_p_rtnh, nla_p_rtnh, post_p_rtnh); struct rta_mpath { int num_nhops; struct rta_mpath_nh nhops[0]; }; static int nlattr_get_multipath(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) { int data_len = nla->nla_len - sizeof(struct nlattr); struct rtnexthop *rtnh; int max_nhops = data_len / sizeof(struct rtnexthop); struct rta_mpath *mp = npt_alloc(npt, (max_nhops + 2) * sizeof(struct rta_mpath_nh)); mp->num_nhops = 0; for (rtnh = (struct rtnexthop *)(nla + 1); data_len > 0; ) { struct rta_mpath_nh *mpnh = &mp->nhops[mp->num_nhops++]; 
int error = nl_parse_header(rtnh, rtnh->rtnh_len, &mpath_parser, npt, mpnh); if (error != 0) { NLMSG_REPORT_ERR_MSG(npt, "RTA_MULTIPATH: nexhop %d: parse failed", mp->num_nhops - 1); return (error); } int len = NL_ITEM_ALIGN(rtnh->rtnh_len); data_len -= len; rtnh = (struct rtnexthop *)((char *)rtnh + len); } if (data_len != 0 || mp->num_nhops == 0) { NLMSG_REPORT_ERR_MSG(npt, "invalid RTA_MULTIPATH attr"); return (EINVAL); } *((struct rta_mpath **)target) = mp; return (0); } struct nl_parsed_route { struct sockaddr *rta_dst; struct sockaddr *rta_gw; struct ifnet *rta_oif; struct rta_mpath *rta_multipath; uint32_t rta_table; uint32_t rta_rtflags; uint32_t rta_nh_id; uint32_t rta_weight; uint32_t rtax_mtu; uint8_t rtm_family; uint8_t rtm_dst_len; uint8_t rtm_protocol; uint8_t rtm_type; uint32_t rtm_flags; }; #define _IN(_field) offsetof(struct rtmsg, _field) #define _OUT(_field) offsetof(struct nl_parsed_route, _field) static struct nlattr_parser nla_p_rtmetrics[] = { { .type = NL_RTAX_MTU, .off = _OUT(rtax_mtu), .cb = nlattr_get_uint32 }, }; NL_DECLARE_ATTR_PARSER(metrics_parser, nla_p_rtmetrics); static const struct nlattr_parser nla_p_rtmsg[] = { { .type = NL_RTA_DST, .off = _OUT(rta_dst), .cb = nlattr_get_ip }, { .type = NL_RTA_OIF, .off = _OUT(rta_oif), .cb = nlattr_get_ifp }, { .type = NL_RTA_GATEWAY, .off = _OUT(rta_gw), .cb = nlattr_get_ip }, { .type = NL_RTA_METRICS, .arg = &metrics_parser, .cb = nlattr_get_nested }, { .type = NL_RTA_MULTIPATH, .off = _OUT(rta_multipath), .cb = nlattr_get_multipath }, { .type = NL_RTA_WEIGHT, .off = _OUT(rta_weight), .cb = nlattr_get_uint32 }, { .type = NL_RTA_RTFLAGS, .off = _OUT(rta_rtflags), .cb = nlattr_get_uint32 }, { .type = NL_RTA_TABLE, .off = _OUT(rta_table), .cb = nlattr_get_uint32 }, { .type = NL_RTA_VIA, .off = _OUT(rta_gw), .cb = nlattr_get_ipvia }, { .type = NL_RTA_NH_ID, .off = _OUT(rta_nh_id), .cb = nlattr_get_uint32 }, }; static const struct nlfield_parser nlf_p_rtmsg[] = { { .off_in = _IN(rtm_family), 
.off_out = _OUT(rtm_family), .cb = nlf_get_u8 }, { .off_in = _IN(rtm_dst_len), .off_out = _OUT(rtm_dst_len), .cb = nlf_get_u8 }, { .off_in = _IN(rtm_protocol), .off_out = _OUT(rtm_protocol), .cb = nlf_get_u8 }, { .off_in = _IN(rtm_type), .off_out = _OUT(rtm_type), .cb = nlf_get_u8 }, { .off_in = _IN(rtm_flags), .off_out = _OUT(rtm_flags), .cb = nlf_get_u32 }, }; #undef _IN #undef _OUT static bool post_p_rtmsg(void *_attrs, struct nl_pstate *npt __unused) { struct nl_parsed_route *attrs = (struct nl_parsed_route *)_attrs; set_scope6(attrs->rta_dst, attrs->rta_oif); set_scope6(attrs->rta_gw, attrs->rta_oif); return (true); } NL_DECLARE_PARSER_EXT(rtm_parser, struct rtmsg, NULL, nlf_p_rtmsg, nla_p_rtmsg, post_p_rtmsg); struct netlink_walkargs { struct nl_writer *nw; struct route_nhop_data rnd; struct nlmsghdr hdr; struct nlpcb *nlp; uint32_t fibnum; int family; int error; int count; int dumped; int dumped_tables; }; static int dump_rtentry(struct rtentry *rt, void *_arg) { struct netlink_walkargs *wa = (struct netlink_walkargs *)_arg; int error; wa->count++; if (wa->error != 0) return (0); if (!rt_is_exportable(rt, nlp_get_cred(wa->nlp))) return (0); wa->dumped++; rt_get_rnd(rt, &wa->rnd); error = dump_px(wa->fibnum, &wa->hdr, rt, &wa->rnd, wa->nw); IF_DEBUG_LEVEL(LOG_DEBUG3) { char rtbuf[INET6_ADDRSTRLEN + 5]; FIB_LOG(LOG_DEBUG3, wa->fibnum, wa->family, - "Dump %s, offset %u, error %d", - rt_print_buf(rt, rtbuf, sizeof(rtbuf)), - wa->nw->offset, error); + "Dump %s, error %d", + rt_print_buf(rt, rtbuf, sizeof(rtbuf)), error); } wa->error = error; return (0); } static void dump_rtable_one(struct netlink_walkargs *wa, uint32_t fibnum, int family) { FIB_LOG(LOG_DEBUG2, fibnum, family, "Start dump"); wa->count = 0; wa->dumped = 0; rib_walk(fibnum, family, false, dump_rtentry, wa); wa->dumped_tables++; FIB_LOG(LOG_DEBUG2, fibnum, family, "End dump, iterated %d dumped %d", wa->count, wa->dumped); - NL_LOG(LOG_DEBUG2, "Current offset: %d", wa->nw->offset); } static int 
dump_rtable_fib(struct netlink_walkargs *wa, uint32_t fibnum, int family) { wa->fibnum = fibnum; if (family == AF_UNSPEC) { for (int i = 0; i < AF_MAX; i++) { if (rt_tables_get_rnh(fibnum, i) != 0) { wa->family = i; dump_rtable_one(wa, fibnum, i); if (wa->error != 0) break; } } } else { if (rt_tables_get_rnh(fibnum, family) != 0) { wa->family = family; dump_rtable_one(wa, fibnum, family); } } return (wa->error); } static int handle_rtm_getroute(struct nlpcb *nlp, struct nl_parsed_route *attrs, struct nlmsghdr *hdr, struct nl_pstate *npt) { RIB_RLOCK_TRACKER; struct rib_head *rnh; const struct rtentry *rt; struct route_nhop_data rnd; uint32_t fibnum = attrs->rta_table; sa_family_t family = attrs->rtm_family; if (attrs->rta_dst == NULL) { NLMSG_REPORT_ERR_MSG(npt, "No RTA_DST supplied"); return (EINVAL); } rnh = rt_tables_get_rnh(fibnum, family); if (rnh == NULL) return (EAFNOSUPPORT); RIB_RLOCK(rnh); struct sockaddr *dst = attrs->rta_dst; if (attrs->rtm_flags & RTM_F_PREFIX) rt = rib_lookup_prefix_plen(rnh, dst, attrs->rtm_dst_len, &rnd); else rt = (const struct rtentry *)rnh->rnh_matchaddr(dst, &rnh->head); if (rt == NULL) { RIB_RUNLOCK(rnh); return (ESRCH); } rt_get_rnd(rt, &rnd); rnd.rnd_nhop = nhop_select_func(rnd.rnd_nhop, 0); RIB_RUNLOCK(rnh); if (!rt_is_exportable(rt, nlp_get_cred(nlp))) return (ESRCH); IF_DEBUG_LEVEL(LOG_DEBUG2) { char rtbuf[NHOP_PRINT_BUFSIZE] __unused, nhbuf[NHOP_PRINT_BUFSIZE] __unused; FIB_LOG(LOG_DEBUG2, fibnum, family, "getroute completed: got %s for %s", nhop_print_buf_any(rnd.rnd_nhop, nhbuf, sizeof(nhbuf)), rt_print_buf(rt, rtbuf, sizeof(rtbuf))); } hdr->nlmsg_type = NL_RTM_NEWROUTE; dump_px(fibnum, hdr, rt, &rnd, npt->nw); return (0); } static int handle_rtm_dump(struct nlpcb *nlp, uint32_t fibnum, int family, struct nlmsghdr *hdr, struct nl_writer *nw) { struct netlink_walkargs wa = { .nlp = nlp, .nw = nw, .hdr.nlmsg_pid = hdr->nlmsg_pid, .hdr.nlmsg_seq = hdr->nlmsg_seq, .hdr.nlmsg_type = NL_RTM_NEWROUTE, .hdr.nlmsg_flags = 
hdr->nlmsg_flags | NLM_F_MULTI, }; if (fibnum == RT_TABLE_UNSPEC) { for (int i = 0; i < V_rt_numfibs; i++) { dump_rtable_fib(&wa, fibnum, family); if (wa.error != 0) break; } } else dump_rtable_fib(&wa, fibnum, family); if (wa.error == 0 && wa.dumped_tables == 0) { FIB_LOG(LOG_DEBUG, fibnum, family, "incorrect fibnum/family"); wa.error = ESRCH; // How do we propagate it? } if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr)) { NL_LOG(LOG_DEBUG, "Unable to finalize the dump"); return (ENOMEM); } return (wa.error); } static struct nhop_object * finalize_nhop(struct nhop_object *nh, const struct sockaddr *dst, int *perror) { /* * The following MUST be filled: * nh_ifp, nh_ifa, nh_gw */ if (nh->gw_sa.sa_family == 0) { /* * Empty gateway. Can be direct route with RTA_OIF set. */ if (nh->nh_ifp != NULL) nhop_set_direct_gw(nh, nh->nh_ifp); else { NL_LOG(LOG_DEBUG, "empty gateway and interface, skipping"); *perror = EINVAL; return (NULL); } /* Both nh_ifp and gateway are set */ } else { /* Gateway is set up, we can derive ifp if not set */ if (nh->nh_ifp == NULL) { uint32_t fibnum = nhop_get_fibnum(nh); uint32_t flags = 0; if (nh->nh_flags & NHF_GATEWAY) flags = RTF_GATEWAY; else if (nh->nh_flags & NHF_HOST) flags = RTF_HOST; struct ifaddr *ifa = ifa_ifwithroute(flags, dst, &nh->gw_sa, fibnum); if (ifa == NULL) { NL_LOG(LOG_DEBUG, "Unable to determine ifp, skipping"); *perror = EINVAL; return (NULL); } nhop_set_transmit_ifp(nh, ifa->ifa_ifp); } } /* Both nh_ifp and gateway are set */ if (nh->nh_ifa == NULL) { const struct sockaddr *gw_sa = &nh->gw_sa; if (gw_sa->sa_family != dst->sa_family) { /* * Use dst as the target for determining the default * preferred ifa IF * 1) the gateway is link-level (e.g. direct route) * 2) the gateway family is different (e.g. IPv4 over IPv6). */ gw_sa = dst; } struct ifaddr *ifa = ifaof_ifpforaddr(gw_sa, nh->nh_ifp); if (ifa == NULL) { /* Try link-level ifa. 
*/ gw_sa = &nh->gw_sa; ifa = ifaof_ifpforaddr(gw_sa, nh->nh_ifp); if (ifa == NULL) { NL_LOG(LOG_DEBUG, "Unable to determine ifa, skipping"); *perror = EINVAL; return (NULL); } } nhop_set_src(nh, ifa); } return (nhop_get_nhop(nh, perror)); } static int get_pxflag(const struct nl_parsed_route *attrs) { int pxflag = 0; switch (attrs->rtm_family) { case AF_INET: if (attrs->rtm_dst_len == 32) pxflag = NHF_HOST; else if (attrs->rtm_dst_len == 0) pxflag = NHF_DEFAULT; break; case AF_INET6: if (attrs->rtm_dst_len == 128) pxflag = NHF_HOST; else if (attrs->rtm_dst_len == 0) pxflag = NHF_DEFAULT; break; } return (pxflag); } static int get_op_flags(int nlm_flags) { int op_flags = 0; op_flags |= (nlm_flags & NLM_F_REPLACE) ? RTM_F_REPLACE : 0; op_flags |= (nlm_flags & NLM_F_EXCL) ? RTM_F_EXCL : 0; op_flags |= (nlm_flags & NLM_F_CREATE) ? RTM_F_CREATE : 0; op_flags |= (nlm_flags & NLM_F_APPEND) ? RTM_F_APPEND : 0; return (op_flags); } #ifdef ROUTE_MPATH static int create_nexthop_one(struct nl_parsed_route *attrs, struct rta_mpath_nh *mpnh, struct nl_pstate *npt, struct nhop_object **pnh) { int error; if (mpnh->gw == NULL) return (EINVAL); struct nhop_object *nh = nhop_alloc(attrs->rta_table, attrs->rtm_family); if (nh == NULL) return (ENOMEM); error = nl_set_nexthop_gw(nh, mpnh->gw, mpnh->ifp, npt); if (error != 0) { nhop_free(nh); return (error); } if (mpnh->ifp != NULL) nhop_set_transmit_ifp(nh, mpnh->ifp); nhop_set_pxtype_flag(nh, get_pxflag(attrs)); nhop_set_rtflags(nh, attrs->rta_rtflags); if (attrs->rtm_protocol > RTPROT_STATIC) nhop_set_origin(nh, attrs->rtm_protocol); *pnh = finalize_nhop(nh, attrs->rta_dst, &error); return (error); } #endif static struct nhop_object * create_nexthop_from_attrs(struct nl_parsed_route *attrs, struct nl_pstate *npt, int *perror) { struct nhop_object *nh = NULL; int error = 0; if (attrs->rta_multipath != NULL) { #ifdef ROUTE_MPATH /* Multipath w/o explicit nexthops */ int num_nhops = attrs->rta_multipath->num_nhops; struct weightened_nhop 
*wn = npt_alloc(npt, sizeof(*wn) * num_nhops); for (int i = 0; i < num_nhops; i++) { struct rta_mpath_nh *mpnh = &attrs->rta_multipath->nhops[i]; error = create_nexthop_one(attrs, mpnh, npt, &wn[i].nh); if (error != 0) { for (int j = 0; j < i; j++) nhop_free(wn[j].nh); break; } wn[i].weight = mpnh->rtnh_weight > 0 ? mpnh->rtnh_weight : 1; } if (error == 0) { struct rib_head *rh = nhop_get_rh(wn[0].nh); struct nhgrp_object *nhg; nhg = nhgrp_alloc(rh->rib_fibnum, rh->rib_family, wn, num_nhops, perror); if (nhg != NULL) { if (attrs->rtm_protocol > RTPROT_STATIC) nhgrp_set_origin(nhg, attrs->rtm_protocol); nhg = nhgrp_get_nhgrp(nhg, perror); } for (int i = 0; i < num_nhops; i++) nhop_free(wn[i].nh); if (nhg != NULL) return ((struct nhop_object *)nhg); error = *perror; } #else error = ENOTSUP; #endif *perror = error; } else { nh = nhop_alloc(attrs->rta_table, attrs->rtm_family); if (nh == NULL) { *perror = ENOMEM; return (NULL); } if (attrs->rta_gw != NULL) { *perror = nl_set_nexthop_gw(nh, attrs->rta_gw, attrs->rta_oif, npt); if (*perror != 0) { nhop_free(nh); return (NULL); } } if (attrs->rta_oif != NULL) nhop_set_transmit_ifp(nh, attrs->rta_oif); if (attrs->rtax_mtu != 0) nhop_set_mtu(nh, attrs->rtax_mtu, true); if (attrs->rta_rtflags & RTF_BROADCAST) nhop_set_broadcast(nh, true); if (attrs->rtm_protocol > RTPROT_STATIC) nhop_set_origin(nh, attrs->rtm_protocol); nhop_set_pxtype_flag(nh, get_pxflag(attrs)); nhop_set_rtflags(nh, attrs->rta_rtflags); switch (attrs->rtm_type) { case RTN_UNICAST: break; case RTN_BLACKHOLE: nhop_set_blackhole(nh, RTF_BLACKHOLE); break; case RTN_PROHIBIT: case RTN_UNREACHABLE: nhop_set_blackhole(nh, RTF_REJECT); break; /* TODO: return ENOTSUP for other types if strict option is set */ } nh = finalize_nhop(nh, attrs->rta_dst, perror); } return (nh); } static int rtnl_handle_newroute(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct rib_cmd_info rc = {}; struct nhop_object *nh = NULL; int error; struct nl_parsed_route 
attrs = {}; error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs); if (error != 0) return (error); /* Check if we have enough data */ if (attrs.rta_dst == NULL) { NL_LOG(LOG_DEBUG, "missing RTA_DST"); return (EINVAL); } if (attrs.rta_table >= V_rt_numfibs) { NLMSG_REPORT_ERR_MSG(npt, "invalid fib"); return (EINVAL); } if (attrs.rta_nh_id != 0) { /* Referenced uindex */ int pxflag = get_pxflag(&attrs); nh = nl_find_nhop(attrs.rta_table, attrs.rtm_family, attrs.rta_nh_id, pxflag, &error); if (error != 0) return (error); } else { nh = create_nexthop_from_attrs(&attrs, npt, &error); if (error != 0) { NL_LOG(LOG_DEBUG, "Error creating nexthop"); return (error); } } if (!NH_IS_NHGRP(nh) && attrs.rta_weight == 0) attrs.rta_weight = RT_DEFAULT_WEIGHT; struct route_nhop_data rnd = { .rnd_nhop = nh, .rnd_weight = attrs.rta_weight }; int op_flags = get_op_flags(hdr->nlmsg_flags); error = rib_add_route_px(attrs.rta_table, attrs.rta_dst, attrs.rtm_dst_len, &rnd, op_flags, &rc); if (error == 0) report_operation(attrs.rta_table, &rc, nlp, hdr); return (error); } static int path_match_func(const struct rtentry *rt, const struct nhop_object *nh, void *_data) { struct nl_parsed_route *attrs = (struct nl_parsed_route *)_data; if ((attrs->rta_gw != NULL) && !rib_match_gw(rt, nh, attrs->rta_gw)) return (0); if ((attrs->rta_oif != NULL) && (attrs->rta_oif != nh->nh_ifp)) return (0); return (1); } static int rtnl_handle_delroute(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct rib_cmd_info rc; int error; struct nl_parsed_route attrs = {}; error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs); if (error != 0) return (error); if (attrs.rta_dst == NULL) { NLMSG_REPORT_ERR_MSG(npt, "RTA_DST is not set"); return (ESRCH); } if (attrs.rta_table >= V_rt_numfibs) { NLMSG_REPORT_ERR_MSG(npt, "invalid fib"); return (EINVAL); } error = rib_del_route_px(attrs.rta_table, attrs.rta_dst, attrs.rtm_dst_len, path_match_func, &attrs, 0, &rc); if (error == 0) 
report_operation(attrs.rta_table, &rc, nlp, hdr); return (error); } static int rtnl_handle_getroute(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { int error; struct nl_parsed_route attrs = {}; error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs); if (error != 0) return (error); if (attrs.rta_table >= V_rt_numfibs) { NLMSG_REPORT_ERR_MSG(npt, "invalid fib"); return (EINVAL); } if (hdr->nlmsg_flags & NLM_F_DUMP) error = handle_rtm_dump(nlp, attrs.rta_table, attrs.rtm_family, hdr, npt->nw); else error = handle_rtm_getroute(nlp, &attrs, hdr, npt); return (error); } void rtnl_handle_route_event(uint32_t fibnum, const struct rib_cmd_info *rc) { struct nl_writer nw = {}; int family, nlm_flags = 0; family = rt_get_family(rc->rc_rt); /* XXX: check if there are active listeners first */ /* TODO: consider passing PID/type/seq */ switch (rc->rc_cmd) { case RTM_ADD: nlm_flags = NLM_F_EXCL | NLM_F_CREATE; break; case RTM_CHANGE: nlm_flags = NLM_F_REPLACE; break; case RTM_DELETE: nlm_flags = 0; break; } IF_DEBUG_LEVEL(LOG_DEBUG2) { char rtbuf[NHOP_PRINT_BUFSIZE] __unused; FIB_LOG(LOG_DEBUG2, fibnum, family, "received event %s for %s / nlm_flags=%X", rib_print_cmd(rc->rc_cmd), rt_print_buf(rc->rc_rt, rtbuf, sizeof(rtbuf)), nlm_flags); } struct nlmsghdr hdr = { .nlmsg_flags = nlm_flags, .nlmsg_type = get_rtmsg_type_from_rtsock(rc->rc_cmd), }; struct route_nhop_data rnd = { .rnd_nhop = rc_get_nhop(rc), .rnd_weight = rc->rc_nh_weight, }; uint32_t group_id = family_to_group(family); if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, group_id)) { NL_LOG(LOG_DEBUG, "error allocating event buffer"); return; } dump_px(fibnum, &hdr, rc->rc_rt, &rnd, &nw); nlmsg_flush(&nw); } static const struct rtnl_cmd_handler cmd_handlers[] = { { .cmd = NL_RTM_GETROUTE, .name = "RTM_GETROUTE", .cb = &rtnl_handle_getroute, .flags = RTNL_F_ALLOW_NONVNET_JAIL, }, { .cmd = NL_RTM_DELROUTE, .name = "RTM_DELROUTE", .cb = &rtnl_handle_delroute, .priv = PRIV_NET_ROUTE, }, { .cmd = 
NL_RTM_NEWROUTE, .name = "RTM_NEWROUTE", .cb = &rtnl_handle_newroute, .priv = PRIV_NET_ROUTE, } }; static const struct nlhdr_parser *all_parsers[] = {&mpath_parser, &metrics_parser, &rtm_parser}; void rtnl_routes_init(void) { NL_VERIFY_PARSERS(all_parsers); rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers)); } diff --git a/tests/sys/netlink/test_netlink_message_writer.py b/tests/sys/netlink/test_netlink_message_writer.py index df1768129b11..5f854b14ca45 100644 --- a/tests/sys/netlink/test_netlink_message_writer.py +++ b/tests/sys/netlink/test_netlink_message_writer.py @@ -1,79 +1,39 @@ import mmap import pytest from atf_python.ktest import BaseKernelTest from atf_python.sys.netlink.attrs import NlAttrU32 - M_NOWAIT = 1 M_WAITOK = 2 -NS_WRITER_TYPE_MBUF = 0 -NS_WRITER_TYPE_BUF = 1 -NS_WRITER_TYPE_LBUF = 1 - -MHLEN = 160 -MCLBYTES = 2048 # XXX: may differ on some archs? -MJUMPAGESIZE = mmap.PAGESIZE -MJUM9BYTES = 9 * 1024 -MJUM16BYTES = 16 * 1024 +NLMSG_SMALL = 128 +NLMSG_LARGE = 2048 class TestNetlinkMessageWriter(BaseKernelTest): KTEST_MODULE_NAME = "ktest_netlink_message_writer" @pytest.mark.parametrize( "malloc_flags", [ pytest.param(M_NOWAIT, id="NOWAIT"), pytest.param(M_WAITOK, id="WAITOK"), ], ) - @pytest.mark.parametrize( - "writer_type", - [ - pytest.param(NS_WRITER_TYPE_MBUF, id="MBUF"), - pytest.param(NS_WRITER_TYPE_BUF, id="BUF"), - ], - ) @pytest.mark.parametrize( "sz", [ - pytest.param([160, 160], id="MHLEN"), - pytest.param([MCLBYTES, MCLBYTES], id="MCLBYTES"), + pytest.param([NLMSG_SMALL, NLMSG_SMALL], id="NLMSG_SMALL"), + pytest.param([NLMSG_LARGE, NLMSG_LARGE], id="NLMSG_LARGE"), + pytest.param([NLMSG_LARGE + 256, NLMSG_LARGE + 256], id="NLMSG_LARGE+256"), ], ) - def test_mbuf_writer_allocation(self, sz, writer_type, malloc_flags): + def test_nlbuf_writer_allocation(self, sz, malloc_flags): """override to parametrize""" test_meta = [ NlAttrU32(1, sz[0]), # size NlAttrU32(2, sz[1]), # expected_avail - NlAttrU32(4, writer_type), - 
NlAttrU32(5, malloc_flags), - ] - self.runtest(test_meta) - - @pytest.mark.parametrize( - "malloc_flags", - [ - pytest.param(M_NOWAIT, id="NOWAIT"), - pytest.param(M_WAITOK, id="WAITOK"), - ], - ) - @pytest.mark.parametrize( - "sz", - [ - pytest.param([160, 160, 1], id="MHLEN"), - pytest.param([MCLBYTES, MCLBYTES, 1], id="MCLBYTES"), - pytest.param([MCLBYTES + 1, MCLBYTES + 1, 2], id="MCLBYTES_MHLEN"), - pytest.param([MCLBYTES + 256, MCLBYTES * 2, 2], id="MCLBYTESx2"), - ], - ) - def test_mbuf_chain_allocation(self, sz, malloc_flags): - test_meta = [ - NlAttrU32(1, sz[0]), # size - NlAttrU32(2, sz[1]), # expected_avail - NlAttrU32(3, sz[2]), # expected_count - NlAttrU32(5, malloc_flags), + NlAttrU32(3, malloc_flags), ] self.runtest(test_meta)