Index: sys/compat/linux/linux.c =================================================================== --- sys/compat/linux/linux.c +++ sys/compat/linux/linux.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -364,6 +365,8 @@ return (AF_IPX); case LINUX_AF_APPLETALK: return (AF_APPLETALK); + case LINUX_AF_NETLINK: + return (AF_NETLINK); } return (-1); } @@ -387,6 +390,8 @@ return (LINUX_AF_IPX); case AF_APPLETALK: return (LINUX_AF_APPLETALK); + case AF_NETLINK: + return (LINUX_AF_NETLINK); } return (-1); } @@ -514,6 +519,14 @@ } } + if (bdom == AF_NETLINK) { + if (salen < sizeof(struct sockaddr_nl)) { + error = EINVAL; + goto out; + } + salen = sizeof(struct sockaddr_nl); + } + sa = (struct sockaddr *)kosa; sa->sa_family = bdom; sa->sa_len = salen; Index: sys/compat/linux/linux_socket.c =================================================================== --- sys/compat/linux/linux_socket.c +++ sys/compat/linux/linux_socket.c @@ -91,6 +91,8 @@ l_uint, struct msghdr *); static int linux_set_socket_flags(int, int *); +#define SOL_NETLINK 270 + static int linux_to_bsd_sockopt_level(int level) { @@ -2091,6 +2093,10 @@ case IPPROTO_TCP: name = linux_to_bsd_tcp_sockopt(args->optname); break; + case SOL_NETLINK: + level = SOL_SOCKET; + name = args->optname; + break; default: name = -1; break; Index: sys/kern/uipc_domain.c =================================================================== --- sys/kern/uipc_domain.c +++ sys/kern/uipc_domain.c @@ -239,6 +239,29 @@ mtx_unlock(&dom_mtx); } +void +domain_remove(void *data) +{ + struct domain *dp = (struct domain *)data; + + if ((dp->dom_flags & DOMF_UNLOADABLE) == 0) + return; + + mtx_lock(&dom_mtx); + if (domains == dp) { + domains = dp->dom_next; + } else { + struct domain *curr; + for (curr = domains; curr != NULL; curr = curr->dom_next) { + if (curr->dom_next == dp) { + curr->dom_next = dp->dom_next; + break; + } + } + } + mtx_unlock(&dom_mtx); +} + /* ARGSUSED*/ static void domaininit(void *dummy) 
Index: sys/modules/linux_common/Makefile =================================================================== --- sys/modules/linux_common/Makefile +++ sys/modules/linux_common/Makefile @@ -16,6 +16,8 @@ EXPORT_SYMS+= linux_get_osname EXPORT_SYMS+= linux_get_osrelease EXPORT_SYMS+= linux_use_real_ifname +EXPORT_SYMS+= linux_to_bsd_domain +EXPORT_SYMS+= bsd_to_linux_domain .if !defined(KERNBUILDDIR) .warning Building Linuxulator outside of a kernel does not make sense Index: sys/modules/netlink/Makefile =================================================================== --- /dev/null +++ sys/modules/netlink/Makefile @@ -0,0 +1,7 @@ +.PATH: ${SRCTOP}/sys/netlink +KMOD= netlink + +SRCS = netlink_module.c netlink_domain.c netlink_iface.c netlink_io.c \ + netlink_message.c netlink_route.c netlink_nhop.c netlink_linux.c + +.include Index: sys/net/route.h =================================================================== --- sys/net/route.h +++ sys/net/route.h @@ -344,15 +344,17 @@ void *); struct rt_addrinfo { - int rti_addrs; /* Route RTF_ flags */ + uint16_t rti_addrs; /* rti_info bitmask */ + uint8_t rti_family; /* address family to operate on */ + uint8_t rti_spare2; int rti_flags; /* Route RTF_ flags */ struct sockaddr *rti_info[RTAX_MAX]; /* Sockaddr data */ struct ifaddr *rti_ifa; /* value of rt_ifa addr */ struct ifnet *rti_ifp; /* route interface */ rib_filter_f_t *rti_filter; /* filter function */ - void *rti_filterdata; /* filter parameters */ - u_long rti_mflags; /* metrics RTV_ flags */ - u_long rti_spare; /* Will be used for fib */ + void *rti_filterdata; /* filter parameters */ + uint32_t rti_mflags; /* metrics RTV_ flags */ + uint32_t rti_fibnum; /* Will be used for fib */ struct rt_metrics *rti_rmx; /* Pointer to route metrics */ }; Index: sys/net/route/nhop.h =================================================================== --- sys/net/route/nhop.h +++ sys/net/route/nhop.h @@ -199,6 +199,8 @@ void nhop_set_transmit_ifp(struct nhop_object *nh, struct 
ifnet *ifp); uint32_t nhop_get_idx(const struct nhop_object *nh); +uint32_t nhop_get_uidx(const struct nhop_object *nh); +void nhop_set_uidx(struct nhop_object *nh, uint32_t uidx); enum nhop_type nhop_get_type(const struct nhop_object *nh); int nhop_get_rtflags(const struct nhop_object *nh); struct vnet *nhop_get_vnet(const struct nhop_object *nh); Index: sys/net/route/nhop_ctl.c =================================================================== --- sys/net/route/nhop_ctl.c +++ sys/net/route/nhop_ctl.c @@ -780,6 +780,18 @@ return (nh->nh_priv->nh_idx); } +uint32_t +nhop_get_uidx(const struct nhop_object *nh) +{ + return (nh->nh_priv->nh_uidx); +} + +void +nhop_set_uidx(struct nhop_object *nh, uint32_t uidx) +{ + nh->nh_priv->nh_uidx = uidx; +} + enum nhop_type nhop_get_type(const struct nhop_object *nh) { Index: sys/net/route/nhop_var.h =================================================================== --- sys/net/route/nhop_var.h +++ sys/net/route/nhop_var.h @@ -79,6 +79,7 @@ uint16_t nh_type; /* nexthop type */ uint32_t rt_flags; /* routing flags for the control plane */ uint32_t nh_expire; /* path expiration time */ + uint32_t nh_uidx; /* userland-provided index */ /* nhop lookup comparison end */ uint32_t nh_idx; /* nexthop index */ uint32_t nh_fibnum; /* nexthop fib */ Index: sys/net/route/route_ctl.h =================================================================== --- sys/net/route/route_ctl.h +++ sys/net/route/route_ctl.h @@ -35,6 +35,8 @@ #ifndef _NET_ROUTE_ROUTE_CTL_H_ #define _NET_ROUTE_ROUTE_CTL_H_ +#include + struct rib_cmd_info { uint8_t rc_cmd; /* RTM_ADD|RTM_DEL|RTM_CHANGE */ uint8_t spare[3]; @@ -184,4 +186,31 @@ void rib_notify(struct rib_head *rnh, enum rib_subscription_type type, struct rib_cmd_info *rc); +/* Event bridge */ + +/* Types of events */ +#define NLBR_EVENT_ROUTE 1 + +/* Event providers */ +#define NLBR_PROVIDER_KERNEL 1 +#define NLBR_PROVIDER_RTSOCK 2 +#define NLBR_PROVIDER_NETLINK 3 + +struct rib_event_bridge; +typedef void 
rib_event_bridge_cb_t(uint32_t event_type, uint32_t fibnum, + const struct rt_addrinfo *info, const struct rib_cmd_info *rc, void *arg); + +struct rib_event_bridge { + rib_event_bridge_cb_t *reb_cb; + void *reb_cb_arg; + int reb_provider_id; + CK_STAILQ_ENTRY(rib_event_bridge) reb_link; +}; +void rib_bridge_generic_event(int provider_id, uint32_t event_type, uint32_t val1, + void *ptr1, void *ptr2); +void rib_bridge_rt_event(int provider_id, uint32_t fibnum, struct rt_addrinfo *info, + struct rib_cmd_info *rc); +void rib_bridge_link(struct rib_event_bridge *reb); +void rib_bridge_unlink(struct rib_event_bridge *reb); + #endif Index: sys/net/route/route_ctl.c =================================================================== --- sys/net/route/route_ctl.c +++ sys/net/route/route_ctl.c @@ -59,7 +59,7 @@ #define DEBUG_MOD_NAME route_ctl #define DEBUG_MAX_LEVEL LOG_DEBUG #include -_DECLARE_DEBUG(LOG_INFO); +_DECLARE_DEBUG(LOG_DEBUG3); /* * This file contains control plane routing tables functions. 
@@ -1592,3 +1592,63 @@ } return ("unknown"); } + +CK_STAILQ_HEAD(rib_event_bridge_head, rib_event_bridge); +static struct rib_event_bridge_head bridge_head; +struct mtx bridge_lock; + +static void +rib_bridge_init(void) +{ + CK_STAILQ_INIT(&bridge_head); + mtx_init(&bridge_lock, "rib_event_bridge_lock", NULL, MTX_DEF); +} +SYSINIT(rib_bridge_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND, rib_bridge_init, NULL); + + +void +rib_bridge_generic_event(int provider_id, uint32_t event_type, uint32_t val1, + void *ptr1, void *ptr2) +{ + struct rib_event_bridge *reb; + + NET_EPOCH_ASSERT(); + + CK_STAILQ_FOREACH(reb, &bridge_head, reb_link) { + RT_LOG(LOG_DEBUG3, "HERE reb %p %d", reb, reb->reb_provider_id); + if (reb->reb_provider_id != provider_id) + reb->reb_cb(event_type, val1, ptr1, ptr2, reb->reb_cb_arg); + } +} + +void +rib_bridge_rt_event(int provider_id, uint32_t fibnum, struct rt_addrinfo *info, + struct rib_cmd_info *rc) +{ +#if DEBUG_MAX_LEVEL >= LOG_DEBUG2 + char rtbuf[INET6_ADDRSTRLEN + 5]; + FIB_LOG(LOG_DEBUG3, fibnum, rt_get_family(rc->rc_rt), "received cmd %s for %s", + rib_print_cmd(rc->rc_cmd), rt_print_buf(rc->rc_rt, rtbuf, sizeof(rtbuf))); +#endif + rib_bridge_generic_event(provider_id, NLBR_EVENT_ROUTE, fibnum, info, rc); +} + + +void +rib_bridge_link(struct rib_event_bridge *reb) +{ + mtx_lock(&bridge_lock); + CK_STAILQ_INSERT_HEAD(&bridge_head, reb, reb_link); + mtx_unlock(&bridge_lock); + RT_LOG(LOG_DEBUG, "link %p", reb); +} + +void +rib_bridge_unlink(struct rib_event_bridge *reb) +{ + mtx_lock(&bridge_lock); + CK_STAILQ_REMOVE(&bridge_head, reb, rib_event_bridge, reb_link); + mtx_unlock(&bridge_lock); + RT_LOG(LOG_DEBUG, "unlink %p", reb); +} + Index: sys/net/rtsock.c =================================================================== --- sys/net/rtsock.c +++ sys/net/rtsock.c @@ -1085,6 +1085,7 @@ } error = rib_action(fibnum, rtm->rtm_type, &info, &rc); if (error == 0) { + rib_bridge_rt_event(NLBR_PROVIDER_RTSOCK, fibnum, &info, &rc); #ifdef 
ROUTE_MPATH if (NH_IS_NHGRP(rc.rc_nh_new) || (rc.rc_nh_old && NH_IS_NHGRP(rc.rc_nh_old))) { @@ -1106,6 +1107,7 @@ case RTM_DELETE: error = rib_action(fibnum, RTM_DELETE, &info, &rc); if (error == 0) { + rib_bridge_rt_event(NLBR_PROVIDER_RTSOCK, fibnum, &info, &rc); #ifdef ROUTE_MPATH if (NH_IS_NHGRP(rc.rc_nh_old) || (rc.rc_nh_new && NH_IS_NHGRP(rc.rc_nh_new))) { Index: sys/netlink/netlink.h =================================================================== --- /dev/null +++ sys/netlink/netlink.h @@ -0,0 +1,233 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Ng Peng Nam Sean + * Copyright (c) 2022 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +/* + * This file contains structures and constants for RFC 3549 (Netlink) + * protocol. Some values have been taken from Linux implementation. + */ + +#ifndef _NETLINK_LINUX_NETLINK_H_ +#define _NETLINK_LINUX_NETLINK_H_ + +#ifndef _KERNEL +#ifndef PF_NETLINK +#define PF_NETLINK 38 +#endif +#ifndef AF_NETLINK +#define AF_NETLINK 38 +#endif +#ifndef AF_MPLS +#define AF_MPLS 39 +#endif +#endif + +#include +#include + +struct sockaddr_nl { + uint8_t nl_len; /* total length */ + sa_family_t nl_family; /* AF_NETLINK */ + uint16_t nl_pad; /* zero */ + uint32_t nl_pid; /* port ID */ + uint32_t nl_groups; /* multicast groups mask */ +}; + +#define SOL_NETLINK 270 + +/* Currently supported socket options */ +#define NETLINK_ADD_MEMBERSHIP 1 +#define NETLINK_DROP_MEMBERSHIP 2 +#define NETLINK_PKTINFO 3 /* XXX: not supported */ +#define NETLINK_BROADCAST_ERROR 4 /* XXX: not supported */ +#define NETLINK_NO_ENOBUFS 5 /* XXX: not supported */ +#define NETLINK_RX_RING 6 /* XXX: not supported */ +#define NETLINK_TX_RING 7 /* XXX: not supported */ +#define NETLINK_LISTEN_ALL_NSID 8 /* XXX: not supported */ + +#define NETLINK_LIST_MEMBERSHIPS 9 +#define NETLINK_CAP_ACK 10 +#define NETLINK_EXT_ACK 11 +#define NETLINK_GET_STRICT_CHK 12 /* XXX: not supported */ + + +/* + * RFC 3549, 2.3.2 Netlink Message Header + */ +struct nlmsghdr { + uint32_t nlmsg_len; /* Length of message including header */ + uint16_t nlmsg_type; /* Message type identifier */ + uint16_t nlmsg_flags; /* Flags (NLM_F_) */ + uint32_t nlmsg_seq; /* Sequence number */ + uint32_t nlmsg_pid; /* Sending process port ID */ +}; + +/* + * RFC 3549, 2.3.2.2 The ACK Netlink Message + */ +struct nlmsgerr { + int error; + struct nlmsghdr msg; +}; + +/* + * RFC 3549, 2.3.2 standard flag bits (nlmsg_flags) + */ +#define NLM_F_REQUEST 0x01 /* It is request message. 
*/ +#define NLM_F_MULTI 0x02 /* Multipart message, terminated by NLMSG_DONE */ +#define NLM_F_ACK 0x04 /* Reply with ack, with zero or error code */ +#define NLM_F_ECHO 0x08 /* Echo this request */ +#define NLM_F_DUMP_INTR 0x10 /* Dump was inconsistent due to sequence change */ +#define NLM_F_DUMP_FILTERED 0x20 /* Dump was filtered as requested */ + +/* + * RFC 3549, 2.3.2 Additional flag bits for GET requests + */ +#define NLM_F_ROOT 0x100 /* Return the complete table */ +#define NLM_F_MATCH 0x200 /* Return all entries matching criteria */ +#define NLM_F_ATOMIC 0x400 /* Return an atomic snapshot */ +#define NLM_F_DUMP (NLM_F_ROOT | NLM_F_MATCH) + +/* + * RFC 3549, 2.3.2 Additional flag bits for NEW requests + */ +#define NLM_F_REPLACE 0x100 /* Replace existing matching config object */ +#define NLM_F_EXCL 0x200 /* Don't replace the object if it exists */ +#define NLM_F_CREATE 0x400 /* Create if it does not exist */ +#define NLM_F_APPEND 0x800 /* Add to end of list */ + +/* Modifiers to DELETE request */ +#define NLM_F_NONREC 0x100 /* Do not delete recursively */ + +/* Flags for ACK message */ +#define NLM_F_CAPPED 0x100 /* request was capped */ +#define NLM_F_ACK_TLVS 0x200 /* extended ACK TLVs were included */ + +/* + * RFC 3549, 2.3.2 standard message types (nlmsg_type). + */ +#define NLMSG_NOOP 0x1 /* Message is ignored. */ +#define NLMSG_ERROR 0x2 /* reply error code reporting */ +#define NLMSG_DONE 0x3 /* Message terminates a multipart message. */ +#define NLMSG_OVERRUN 0x4 /* overrun detected, data is lost */ + +#define NLMSG_MIN_TYPE 0x10 /* < 0x10: reserved control messages */ + +/* + * Definition of numbers assigned to the netlink subsystems.
+ */ +#define NETLINK_ROUTE 0 /* Routing/device hook */ +#define NETLINK_UNUSED 1 /* (not used) */ +#define NETLINK_USERSOCK 2 /* Reserved for user mode socket protocols */ +#define NETLINK_FIREWALL 3 /* (not used) */ +#define NETLINK_SOCK_DIAG 4 /* socket monitoring */ +#define NETLINK_NFLOG 5 /* netfilter/iptables ULOG */ +#define NETLINK_XFRM 6 /* ipsec */ +#define NETLINK_SELINUX 7 /* SELinux event notifications */ +#define NETLINK_ISCSI 8 /* Open-iSCSI */ +#define NETLINK_AUDIT 9 /* auditing */ +#define NETLINK_FIB_LOOKUP 10 +#define NETLINK_CONNECTOR 11 +#define NETLINK_NETFILTER 12 /* netfilter subsystem */ +#define NETLINK_IP6_FW 13 /* (not used) */ +#define NETLINK_DNRTMSG 14 /* DECnet routing messages (not used) */ +#define NETLINK_KOBJECT_UEVENT 15 /* Kernel messages to userspace */ +#define NETLINK_GENERIC 16 + + +#ifndef roundup2 +#define roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) /* y must be a power of two */ +#endif +#define NL_ITEM_ALIGN_SIZE sizeof(uint32_t) +#define NL_ITEM_ALIGN(_len) roundup2(_len, NL_ITEM_ALIGN_SIZE) +#define NL_ITEM_DATA(_ptr, _off) ((void *)((char *)(_ptr) + (_off))) +#define NL_ITEM_DATA_CONST(_ptr, _off) ((const void *)((const char *)(_ptr) + (_off))) + +#define NL_ITEM_OK(_ptr, _len, _hlen, _DLEN) \ + ((_len) >= _hlen && _DLEN(_ptr) >= _hlen && _DLEN(_ptr) <= (_len)) +#define NL_ITEM_NEXT(_ptr, _LEN_MACRO) ((__typeof__(_ptr))NL_ITEM_DATA(_ptr, _LEN_MACRO(_ptr))) +#define NL_ITEM_ITER(_ptr, _len, _LEN_MACRO) \ + ((_len) -= _LEN_MACRO(_ptr), (_ptr) = NL_ITEM_NEXT(_ptr, _LEN_MACRO)) + + +#ifndef _KERNEL +/* part of netlink(3) API; NLMSG_DATA() points at the payload that follows the header */ +#define NLMSG_ALIGNTO NL_ITEM_ALIGN_SIZE +#define NLMSG_ALIGN(_len) NL_ITEM_ALIGN(_len) +#define NLMSG_HDRLEN ((int)sizeof(struct nlmsghdr)) +#define NLMSG_LENGTH(_len) ((_len) + NLMSG_HDRLEN) +#define NLMSG_SPACE(_len) NLMSG_ALIGN(NLMSG_LENGTH(_len)) +#define NLMSG_DATA(_hdr) NL_ITEM_DATA(_hdr, NLMSG_HDRLEN) +#define _NLMSG_LEN(_hdr) ((int)(_hdr)->nlmsg_len) +#define _NLMSG_ALIGNED_LEN(_hdr) NLMSG_ALIGN(_NLMSG_LEN(_hdr))
+#define NLMSG_NEXT(_hdr, _len) NL_ITEM_ITER(_hdr, _len, _NLMSG_ALIGNED_LEN) +#define NLMSG_OK(_hdr, _len) NL_ITEM_OK(_hdr, _len, NLMSG_HDRLEN, _NLMSG_LEN) +#define NLMSG_PAYLOAD(_hdr,_len) (_NLMSG_LEN(_hdr) - NLMSG_SPACE((_len))) + +#else +#define NLMSG_ALIGNTO 4U +#define NLMSG_ALIGN(len) (((len) + NLMSG_ALIGNTO - 1) & ~(NLMSG_ALIGNTO - 1)) +#define NLMSG_HDRLEN ((int)NLMSG_ALIGN(sizeof(struct nlmsghdr))) +#endif + +/* + * Base netlink attribute TLV header. + */ +struct nlattr { + uint16_t nla_len; /* Total attribute length */ + uint16_t nla_type; /* Attribute type */ +}; + +/* + * + * nla_type field encoding: + * + * 0 1 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |N|O| Attribute type | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * N - attribute contains other attributes + * O - encoded in network byte order + * Note: N & O are mutually exclusive + * + * Note: attribute type value scope normally is per-message + * or per message group. + */ + +#define NLA_F_NESTED (1 << 15) +#define NLA_F_NET_BYTEORDER (1 << 14) +#define NLA_TYPE_MASK ~(NLA_F_NESTED | NLA_F_NET_BYTEORDER) + +/* Compatibility macro */ +#ifndef _KERNEL +#define NLA_ALIGNTO NL_ITEM_ALIGN_SIZE +#define NLA_ALIGN(_len) NL_ITEM_ALIGN(_len) +#define NLA_HDRLEN ((int)sizeof(struct nlattr)) +#endif + +#endif Index: sys/netlink/netlink_ctl.h =================================================================== --- /dev/null +++ sys/netlink/netlink_ctl.h @@ -0,0 +1,208 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Ng Peng Nam Sean + * Copyright (c) 2022 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + +#ifndef _NETLINK_NETLINK_CTL_H_ +#define _NETLINK_NETLINK_CTL_H_ + +#ifdef _KERNEL + +/* + * This file provides headers for the public KPI of the netlink + * subsystem + * */ + +/* + * Messages and attributes (netlink_message.c) + */ +#define _roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) + +#define NETLINK_ALIGN_SIZE sizeof(uint32_t) +#define NETLINK_ALIGN(_len) _roundup2(_len, NETLINK_ALIGN_SIZE) + +#define NLA_ALIGN_SIZE sizeof(uint32_t) +#define NLA_ALIGN(_len) _roundup2(_len, NLA_ALIGN_SIZE) + +#define NLA_NEXT(_attr) ((struct nlattr *)((char *)(_attr) + NLA_ALIGN((_attr)->nla_len))) +#define NLA_FOREACH(_attr, _start, _len) /* header must fit in the buffer before nla_len is read */ \ + for (_attr = (_start); ((char *)(_attr) + sizeof(struct nlattr) <= (char *)(_start) + (_len)) && (((char *)NLA_NEXT(_attr) - (char *)(_start)) <= (_len)); _attr = NLA_NEXT(_attr)) + +struct mbuf; +struct nlmsg_state; +typedef bool nlmsg_state_cb(struct nlmsg_state *ns, char *buf, int buflen); + +struct nlmsg_state { + int alloc_len; + int offset; + struct nlmsghdr *hdr; + char *data; // pointer to contig storage + union { + struct
mbuf *_m; + char *_buf; + }; + nlmsg_state_cb *cb; + void *arg; + int malloc_flag; // M_WAITOK | M_NOWAIT + uint8_t writer_type; + uint8_t writer_target; +}; +#define NS_WRITER_TARGET_SOCKET 0 +#define NS_WRITER_TARGET_GROUP 1 +#define NS_WRITER_TARGET_CHAIN 2 + +#define NS_WRITER_TYPE_MBUF 0 +#define NS_WRITER_TYPE_BUF 1 +#define NS_WRITER_TYPE_LBUF 2 +#define NS_WRITER_TYPE_MBUFC 3 + + +#define NLMSG_SMALL 128 +#define NLMSG_LARGE 2048 + +struct nlpcb; +bool nlmsg_get_socket_writer(int size, struct nlpcb *nlp, struct nlmsg_state *ns); +bool nlmsg_get_group_writer(int size, uint32_t group_mask, struct nlmsg_state *ns); +bool nlmsg_get_chain_writer(int size, struct mbuf **pm, struct nlmsg_state *ns); +void nlmsg_free(struct nlmsg_state *ns); +bool nlmsg_add(struct nlmsg_state *ns, uint32_t portid, uint32_t seq, uint16_t type, + uint16_t flags, uint32_t len); +void *nlmsg_reserve_data_raw(struct nlmsg_state *ns, size_t sz); +void nlmsg_end(struct nlmsg_state *ns); +void nlmsg_abort(struct nlmsg_state *ns); +bool nlmsg_flush(struct nlmsg_state *ns); + +#define nlmsg_data(_hdr) ((void *)((_hdr) + 1)) + +#define nlmsg_reserve_object(_ns, _t) ((_t *)nlmsg_reserve_data_raw(_ns, NLA_ALIGN(sizeof(_t)))) +#define nlmsg_reserve_data(_ns, _sz, _t) ((_t *)nlmsg_reserve_data_raw(_ns, _sz)) + +/* Attributes */ +bool nlattr_add_handle_oom(struct nlmsg_state *ns, int attr_type, int attr_len, + const void *data); + +static inline bool +nlattr_add_noerror(struct nlmsg_state *ns, int attr_type, int attr_len, + const void *data) +{ + int required_len = NLA_ALIGN(attr_len + sizeof(struct nlattr)); + struct nlattr *nla = (struct nlattr *)(&ns->data[ns->offset]); + + nla->nla_len = attr_len + sizeof(struct nlattr); + nla->nla_type = attr_type; + if (attr_len > 0) { + memcpy((nla + 1), data, attr_len); + } + ns->offset += required_len; + return (true); +} + +static inline bool +nlattr_add(struct nlmsg_state *ns, int attr_type, int attr_len, const void *data) +{ + int required_len = 
NLA_ALIGN(attr_len + sizeof(struct nlattr)); + + if (__predict_false(ns->offset + required_len > ns->alloc_len)) { + if (!nlattr_add_handle_oom(ns, attr_type, attr_len, data)) + return (false); + } + + return (nlattr_add_noerror(ns, attr_type, attr_len, data)); +} + +static inline bool +nlattr_add_u8(struct nlmsg_state *ns, int attrtype, uint8_t value) +{ + return (nlattr_add(ns, attrtype, sizeof(uint8_t), &value)); +} + +static inline bool +nlattr_add_u16(struct nlmsg_state *ns, int attrtype, uint16_t value) +{ + return (nlattr_add(ns, attrtype, sizeof(uint16_t), &value)); +} + +static inline bool +nlattr_add_u32(struct nlmsg_state *ns, int attrtype, uint32_t value) +{ + return (nlattr_add(ns, attrtype, sizeof(uint32_t), &value)); +} + +static inline bool +nlattr_add_u64(struct nlmsg_state *ns, int attrtype, uint64_t value) +{ + return (nlattr_add(ns, attrtype, sizeof(uint64_t), &value)); +} + +static inline bool +nlattr_add_s8(struct nlmsg_state *ns, int attrtype, int8_t value) +{ + return (nlattr_add(ns, attrtype, sizeof(int8_t), &value)); +} + +static inline bool +nlattr_add_s16(struct nlmsg_state *ns, int attrtype, int16_t value) +{ + return (nlattr_add(ns, attrtype, sizeof(int16_t), &value)); +} + +static inline bool +nlattr_add_s32(struct nlmsg_state *ns, int attrtype, int32_t value) +{ + return (nlattr_add(ns, attrtype, sizeof(int32_t), &value)); +} + +static inline bool +nlattr_add_s64(struct nlmsg_state *ns, int attrtype, int64_t value) +{ + return (nlattr_add(ns, attrtype, sizeof(int64_t), &value)); +} + +static inline bool +nlattr_add_flag(struct nlmsg_state *ns, int attrtype) +{ + return (nlattr_add(ns, attrtype, 0, NULL)); +} + +static inline bool +nlattr_add_string(struct nlmsg_state *ns, int attrtype, const char *str) +{ + return (nlattr_add(ns, attrtype, strlen(str) + 1, str)); +} + +/* Protocol handlers */ +struct netlink_parse_tracker; +typedef int (*nl_handler)(struct nlmsghdr *hdr, struct netlink_parse_tracker *npt); + +bool 
netlink_register_proto(int proto, nl_handler handle); +bool netlink_unregister_proto(int proto); + + +#endif + + +#endif Index: sys/netlink/netlink_debug.h =================================================================== --- /dev/null +++ sys/netlink/netlink_debug.h @@ -0,0 +1,78 @@ +/*- + * Copyright (c) 2022 + * Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _NETLINK_NETLINK_DEBUG_H_ +#define _NETLINK_NETLINK_DEBUG_H_ + +#include + +/* + * Generic debug + * [nl_domain] func_name: debug text + */ +#define NL_DEBUG RT_DEBUG + +/* + * Logging for events specific for particular process + * Example: [nl_domain] PID 4834 fdump_sa: unsupported family: 45 + */ +#define NL_RAW_PID_LOG(_l, _pid, _fmt, ...) NL_RAW_PID_LOG_##_l(_l, _pid, _fmt, ## __VA_ARGS__) +#define _NL_RAW_PID_LOG(_l, _pid, _fmt, ...) if (_DEBUG_PASS_MSG(_l)) { \ + _output("[" DEBUG_PREFIX_NAME "] PID %u %s: " _fmt "\n", _pid, __func__, ##__VA_ARGS__); \ +} + +#if DEBUG_MAX_LEVEL>=LOG_DEBUG3 +#define NL_RAW_PID_LOG_LOG_DEBUG3 _NL_RAW_PID_LOG +#else +#define NL_RAW_PID_LOG_LOG_DEBUG3(_l, _pid, _fmt, ...) +#endif +#if DEBUG_MAX_LEVEL>=LOG_DEBUG2 +#define NL_RAW_PID_LOG_LOG_DEBUG2 _NL_RAW_PID_LOG +#else +#define NL_RAW_PID_LOG_LOG_DEBUG2(_l, _pid, _fmt, ...) +#endif +#if DEBUG_MAX_LEVEL>=LOG_DEBUG +#define NL_RAW_PID_LOG_LOG_DEBUG _NL_RAW_PID_LOG +#else +#define NL_RAW_PID_LOG_LOG_DEBUG(_l, _pid, _fmt, ...) +#endif +#if DEBUG_MAX_LEVEL>=LOG_INFO +#define NL_RAW_PID_LOG_LOG_INFO _NL_RAW_PID_LOG +#else +#define NL_RAW_PID_LOG_LOG_INFO(_l, _pid, _fmt, ...) +#endif +#define NL_RAW_PID_LOG_LOG_NOTICE _NL_RAW_PID_LOG +#define NL_RAW_PID_LOG_LOG_ERR _NL_RAW_PID_LOG +#define NL_RAW_PID_LOG_LOG_WARNING _NL_RAW_PID_LOG + + + +#endif Index: sys/netlink/netlink_domain.c =================================================================== --- /dev/null +++ sys/netlink/netlink_domain.c @@ -0,0 +1,535 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Ng Peng Nam Sean + * Copyright (c) 2022 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * This file contains socket and protocol bindings for netlink. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#define DEBUG_MOD_NAME nl_domain +#define DEBUG_MAX_LEVEL LOG_DEBUG3 +#include +_DECLARE_DEBUG(LOG_DEBUG3); + + +static u_long nl_sendspace = NLSNDQ; +SYSCTL_ULONG(_net_netlink, OID_AUTO, sendspace, CTLFLAG_RW, &nl_sendspace, 0, + "Default netlink socket send space"); + +static u_long nl_recvspace = NLSNDQ; +SYSCTL_ULONG(_net_netlink, OID_AUTO, recvspace, CTLFLAG_RW, &nl_recvspace, 0, + "Default netlink socket receive space"); + +/* + * Looks up a nlpcb struct based on the @portid. Need to claim nlsock_mtx. 
+ * Returns nlpcb pointer if present else NULL + */ +static struct nlpcb * +nl_port_lookup(uint32_t port_id) +{ + struct nlpcb *nlp; + + CK_LIST_FOREACH(nlp, &V_nl_ctl->ctl_port_head, nl_port_next) { + if (nlp->nl_port == port_id) + return (nlp); + } + return (NULL); +} + +static void +nl_update_groups_locked(struct nlpcb *nlp, uint32_t nl_groups) +{ + /* Update group mask */ + RT_LOG(LOG_DEBUG2, "socket %p, groups 0x%X -> 0x%X", + nlp->nl_socket, nlp->nl_groups, nl_groups); + nlp->nl_groups = nl_groups; +} + +static uint32_t +nl_find_port(void) { + /* + * app can open multiple netlink sockets. + * Start with current pid, if already taken, + * try random numbers in 256k..256k+64k space, + * avoiding clash with pids. Falls back to the pid when all tries collide. + */ + if (nl_port_lookup(curproc->p_pid) == NULL) + return (curproc->p_pid); + for (int i = 0; i < 16; i++) { + uint32_t nl_port = (arc4random() % 65536) + 65536 * 4; + if (nl_port_lookup(nl_port) == NULL) + return (nl_port); + RT_LOG(LOG_DEBUG3, "tried %u", nl_port); + } + return (curproc->p_pid); +} + +static int +nl_bind_locked(struct nlpcb *nlp, struct sockaddr_nl *snl) +{ + if (nlp->nl_active) { + if (nlp->nl_port != snl->nl_pid) { + RT_LOG(LOG_DEBUG, + "bind() failed: program pid %d " + "is different from provided pid %d", + nlp->nl_port, snl->nl_pid); + return (EINVAL); // XXX: better error + } + } else { + if (snl->nl_pid == 0) + snl->nl_pid = nl_find_port(); + if (nl_port_lookup(snl->nl_pid) != NULL) + return (EADDRINUSE); + nlp->nl_port = snl->nl_pid; + nlp->nl_active = true; + CK_LIST_INSERT_HEAD(&V_nl_ctl->ctl_port_head, nlp, nl_port_next); + } + nl_update_groups_locked(nlp, snl->nl_groups); + + return (0); +} + +static int +nl_pru_attach(struct socket *so, int proto, struct thread *td) +{ + struct nlpcb *nlp; + int error; + + bool is_linux = SV_PROC_ABI(td->td_proc) == SV_ABI_LINUX; + RT_LOG(LOG_DEBUG2, "socket %p, PID %d%s: attaching socket to netlink proto %d", + so, curproc->p_pid, is_linux ?
"(linux)" : "", proto); + + if (__predict_false(netlink_unloading != 0)) + return (EAFNOSUPPORT); + + /* Create per-VNET state on first socket init */ + struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); + if (ctl == NULL) + ctl = vnet_nl_ctl_init(); + KASSERT(V_nl_ctl != NULL, ("nl_attach: vnet_sock_init() failed")); + + MPASS(sotonlpcb(so) == NULL); + + error = nl_verify_proto(proto); + if (error != 0) + return (error); + + nlp = malloc(sizeof(struct nlpcb), M_PCB, M_WAITOK | M_ZERO); + error = soreserve(so, nl_sendspace, nl_recvspace); + if (error != 0) { + free(nlp, M_PCB); + return (error); + } + so->so_pcb = (void *)nlp; + nlp->nl_socket = so; + nlp->nl_proto = proto; + nlp->nl_process_id = curproc->p_pid; + nlp->nl_linux = is_linux; + NLP_LOCK_INIT(nlp); + refcount_init(&nlp->nl_refcount, 1); + + nlp->nl_taskqueue = taskqueue_create("netlink_socket", M_WAITOK, + taskqueue_thread_enqueue, &nlp->nl_taskqueue); + TASK_INIT(&nlp->nl_task, 0, nl_taskqueue_handler, nlp); + taskqueue_start_threads(&nlp->nl_taskqueue, 1, PWAIT, + "netlink_socket (PID %u)", nlp->nl_process_id); + + NLCTL_WLOCK(ctl); + /* XXX: check ctl is still alive */ + CK_LIST_INSERT_HEAD(&ctl->ctl_pcb_head, nlp, nl_next); + NLCTL_WUNLOCK(ctl); + + soisconnected(so); + + return (0); +} + +static void +nl_pru_abort(struct socket *so) +{ + RT_LOG(LOG_DEBUG2, "socket %p, PID %d", so, curproc->p_pid); + MPASS(sotonlpcb(so) != NULL); + soisdisconnected(so); +} + +static int +nl_pru_bind(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); + struct nlpcb *nlp = sotonlpcb(so); + struct sockaddr_nl *snl = (struct sockaddr_nl *)nam; + int error; + + RT_LOG(LOG_DEBUG2, "socket %p, PID %d", so, curproc->p_pid); + if (snl->nl_len != sizeof(*snl)) { + RT_LOG(LOG_DEBUG, "socket %p, wrong sizeof(), ignoring bind()", so); + return (EINVAL); + } + + + NLCTL_WLOCK(ctl); + NLP_LOCK(nlp); + error = nl_bind_locked(nlp, snl); + NLP_UNLOCK(nlp); + 
NLCTL_WUNLOCK(ctl);
+	RT_LOG(LOG_DEBUG2, "socket %p, bind() to %u, groups %u, error %d", so,
+	    snl->nl_pid, snl->nl_groups, error);
+
+	return (error);
+}
+
+
+/*
+ * Binds @nlp to @port_id while keeping its current group mask.
+ * Takes the control write lock and the pcb lock around nl_bind_locked()
+ * and returns its result.
+ */
+static int
+nl_assign_port(struct nlpcb *nlp, uint32_t port_id)
+{
+	struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
+	struct sockaddr_nl snl = {
+		.nl_pid = port_id,
+	};
+	int error;
+
+	NLCTL_WLOCK(ctl);
+	NLP_LOCK(nlp);
+	snl.nl_groups = nlp->nl_groups;
+	error = nl_bind_locked(nlp, &snl);
+	NLP_UNLOCK(nlp);
+	NLCTL_WUNLOCK(ctl);
+
+	RT_LOG(LOG_DEBUG2, "socket %p, port assign: %d, error: %d", nlp->nl_socket, port_id, error);
+	return (error);
+}
+
+/*
+ * nl_autobind_port binds an unused portid to @nlp
+ * @nlp: pcb data for the netlink socket
+ * @candidate_id: first id to consider
+ *
+ * Up to 10 consecutive ids starting at @candidate_id are tried.
+ * Returns 0 on success, EADDRINUSE when every candidate is taken, or
+ * the first other error reported by nl_assign_port().
+ */
+static int
+nl_autobind_port(struct nlpcb *nlp, uint32_t candidate_id)
+{
+	struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
+	uint32_t port_id = candidate_id;
+	NLCTL_TRACKER;
+	bool exist;
+	/* Result when no candidate could even be attempted. */
+	int error = EADDRINUSE;
+
+	for (int i = 0; i < 10; i++) {
+		RT_LOG(LOG_DEBUG3, "socket %p, trying to assign port %d", nlp->nl_socket, port_id);
+		NLCTL_RLOCK(ctl);
+		exist = nl_port_lookup(port_id) != NULL;
+		NLCTL_RUNLOCK(ctl);
+		if (!exist) {
+			/*
+			 * The read lock was dropped, so the port may have been
+			 * taken meanwhile; EADDRINUSE means try the next id.
+			 */
+			error = nl_assign_port(nlp, port_id);
+			if (error != EADDRINUSE)
+				break;
+		}
+		port_id++;
+	}
+	RT_LOG(LOG_DEBUG2, "socket %p, autobind to %d, error: %d", nlp->nl_socket, port_id, error);
+	return (error);
+}
+
+static int
+nl_pru_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+	struct sockaddr_nl *snl = (struct sockaddr_nl *)nam;
+	struct nlpcb *nlp;
+
+	RT_LOG(LOG_DEBUG2, "socket %p, PID %d", so, curproc->p_pid);
+	if (snl->nl_len != sizeof(*snl)) {
+		RT_LOG(LOG_DEBUG, "socket %p, wrong sizeof(), ignoring connect()", so);
+		return (EINVAL);
+	}
+
+	nlp = sotonlpcb(so);
+	if (!nlp->nl_active) {
+		int error = nl_autobind_port(nlp, td->td_proc->p_pid);
+		if (error != 0) {
+			RT_LOG(LOG_DEBUG, "socket %p, nl_autobind() failed: %d", so, error);
+			return (error);
+		}
+	}
+	/*
XXX: Handle socket flags & multicast */ + soisconnected(so); + + RT_LOG(LOG_DEBUG2, "socket %p, connect to %u", so, snl->nl_pid); + + return (0); +} + +static void +destroy_socket(struct nlpcb *nlp) +{ + NLP_LOCK(nlp); + NLP_LOCK_DESTROY(nlp); + free(nlp, M_PCB); +} + +static void +destroy_socket_epoch(epoch_context_t ctx) +{ + struct nlpcb *nlp; + + nlp = __containerof(ctx, struct nlpcb, nl_epoch_ctx); + + destroy_socket(nlp); +} + + +static void +nl_pru_detach(struct socket *so) +{ + struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); + MPASS(sotonlpcb(so) != NULL); + struct nlpcb *nlp; + + RT_LOG(LOG_DEBUG2, "socket %p, PID %d", so, curproc->p_pid); + nlp = sotonlpcb(so); + + /* Mark as inactive so no new work can be enqueued */ + NLP_LOCK(nlp); + bool was_active = nlp->nl_active; + nlp->nl_active = false; + NLP_UNLOCK(nlp); + + /* Wait till all scheduled work has been completed */ + taskqueue_drain_all(nlp->nl_taskqueue); + taskqueue_free(nlp->nl_taskqueue); + + NLCTL_WLOCK(ctl); + NLP_LOCK(nlp); + if (was_active) { + CK_LIST_REMOVE(nlp, nl_port_next); + RT_LOG(LOG_DEBUG2, "socket %p, unlinking bound pid %u", so, nlp->nl_port); + } + CK_LIST_REMOVE(nlp, nl_next); + nlp->nl_socket = NULL; + NLP_UNLOCK(nlp); + NLCTL_WUNLOCK(ctl); + + so->so_pcb = NULL; + + RT_LOG(LOG_DEBUG2, "socket %p, detached", so); + + epoch_call(net_epoch_preempt, destroy_socket_epoch, &nlp->nl_epoch_ctx); +} + +static int +nl_pru_disconnect(struct socket *so) +{ + RT_LOG(LOG_DEBUG2, "socket %p, PID %d", so, curproc->p_pid); + MPASS(sotonlpcb(so) != NULL); + return (ENOTCONN); +} + +static int +nl_pru_peeraddr(struct socket *so, struct sockaddr **nam) +{ + RT_LOG(LOG_DEBUG2, "socket %p, PID %d", so, curproc->p_pid); + MPASS(sotonlpcb(so) != NULL); + return (ENOTCONN); +} + +static int +nl_pru_shutdown(struct socket *so) +{ + RT_LOG(LOG_DEBUG2, "socket %p, PID %d", so, curproc->p_pid); + MPASS(sotonlpcb(so) != NULL); + socantsendmore(so); + return (0); +} + +static int 
+nl_pru_sockaddr(struct socket *so, struct sockaddr **nam)
+{
+	struct sockaddr_nl *snl;
+
+	snl = malloc(sizeof(struct sockaddr_nl), M_SONAME, M_WAITOK | M_ZERO);
+	/* TODO: set other fields */
+	snl->nl_len = sizeof(struct sockaddr_nl);
+	snl->nl_family = AF_NETLINK;
+	snl->nl_pid = sotonlpcb(so)->nl_port;
+	*nam = (struct sockaddr *)snl;
+	return (0);
+}
+
+static void
+nl_pru_close(struct socket *so)
+{
+	RT_LOG(LOG_DEBUG2, "socket %p, PID %d", so, curproc->p_pid);
+	MPASS(sotonlpcb(so) != NULL);
+	soisdisconnected(so);
+}
+
+static int
+nl_pru_output(struct mbuf *m, struct socket *so, ...)
+{
+
+	if (__predict_false(m == NULL ||
+	    ((m->m_len < sizeof(struct nlmsghdr)) &&
+		(m = m_pullup(m, sizeof(struct nlmsghdr))) == NULL)))
+		return (ENOBUFS);
+	MPASS((m->m_flags & M_PKTHDR) != 0);
+
+	nl_receive_async(m, so);
+	return (0);
+}
+
+
+static int
+nl_pru_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
+    struct mbuf *control, struct thread *td)
+{
+	RT_LOG(LOG_DEBUG2, "sending message to kernel");
+	return (nl_pru_output(m, so));
+}
+
+/* netlink usrreqs */
+static struct pr_usrreqs nl_usrreqs = {
+	.pru_abort = nl_pru_abort,
+	.pru_attach = nl_pru_attach,
+	.pru_bind = nl_pru_bind,
+	.pru_connect = nl_pru_connect,
+	.pru_detach = nl_pru_detach,
+	.pru_disconnect = nl_pru_disconnect,
+	.pru_peeraddr = nl_pru_peeraddr,
+	.pru_send = nl_pru_send,
+	.pru_shutdown = nl_pru_shutdown,
+	.pru_sockaddr = nl_pru_sockaddr,
+	.pru_close = nl_pru_close
+};
+
+/*
+ * Socket option handler for netlink sockets.
+ *
+ * SOPT_SET: NETLINK_ADD_MEMBERSHIP / NETLINK_DROP_MEMBERSHIP OR the
+ * integer option value into, or mask it out of, the socket group mask;
+ * NETLINK_CAP_ACK / NETLINK_EXT_ACK set or clear the matching NLF_* flag
+ * depending on whether the value is non-zero.
+ * SOPT_GET: NETLINK_LIST_MEMBERSHIPS copies out the group mask.
+ *
+ * Returns 0 on success, the sooptcopyin()/sooptcopyout() error if the
+ * option value cannot be copied, or ENOPROTOOPT for anything else.
+ */
+static int
+nl_ctloutput(struct socket *so, struct sockopt *sopt)
+{
+	struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
+	struct nlpcb *nlp = sotonlpcb(so);
+	uint32_t flag, groups;
+	int optval, error = 0;
+	NLCTL_TRACKER;
+
+	RT_LOG(LOG_DEBUG2, "%ssockopt(%p, %d)", (sopt->sopt_dir) ? "set" : "get",
+	    so, sopt->sopt_name);
+
+	switch (sopt->sopt_dir) {
+	case SOPT_SET:
+		switch (sopt->sopt_name) {
+		case NETLINK_ADD_MEMBERSHIP:
+		case NETLINK_DROP_MEMBERSHIP:
+			/* Do not touch the mask if optval was not copied in */
+			error = sooptcopyin(sopt, &optval, sizeof(optval),
+			    sizeof(optval));
+			if (error != 0)
+				break;
+
+			NLCTL_WLOCK(ctl);
+			if (sopt->sopt_name == NETLINK_ADD_MEMBERSHIP)
+				groups = nlp->nl_groups | optval;
+			else
+				groups = nlp->nl_groups & ~optval;
+			nl_update_groups_locked(nlp, groups);
+			NLCTL_WUNLOCK(ctl);
+			break;
+		case NETLINK_CAP_ACK:
+		case NETLINK_EXT_ACK:
+			/* Do not touch the flags if optval was not copied in */
+			error = sooptcopyin(sopt, &optval, sizeof(optval),
+			    sizeof(optval));
+			if (error != 0)
+				break;
+
+			if (sopt->sopt_name == NETLINK_CAP_ACK)
+				flag = NLF_CAP_ACK;
+			else if (sopt->sopt_name == NETLINK_EXT_ACK)
+				flag = NLF_EXT_ACK;
+			else
+				flag = 0;
+
+			NLCTL_WLOCK(ctl);
+			if (optval != 0)
+				nlp->nl_flags |= flag;
+			else
+				nlp->nl_flags &= ~flag;
+			NLCTL_WUNLOCK(ctl);
+			break;
+		default:
+			error = ENOPROTOOPT;
+		}
+		break;
+	case SOPT_GET:
+		switch (sopt->sopt_name) {
+		case NETLINK_LIST_MEMBERSHIPS:
+			NLCTL_RLOCK(ctl);
+			optval = nlp->nl_groups;
+			NLCTL_RUNLOCK(ctl);
+			error = sooptcopyout(sopt, &optval, sizeof(optval));
+			break;
+		default:
+			error = ENOPROTOOPT;
+		}
+		break;
+	default:
+		error = ENOPROTOOPT;
+	}
+
+	return (error);
+}
+
+static struct domain netlinkdomain;
+
+static struct protosw netlinksw[] = {
+	{
+		.pr_type = SOCK_RAW,
+		.pr_domain = &netlinkdomain,
+		.pr_flags = PR_ATOMIC | PR_ADDR,
+		//.pr_output = nl_pru_output,
+		.pr_ctloutput = nl_ctloutput,
+		.pr_usrreqs = &nl_usrreqs
+	},
+};
+
+static struct domain netlinkdomain = {
+	.dom_family = PF_NETLINK,
+	.dom_name = "netlink",
+	.dom_protosw = netlinksw,
+	.dom_flags = DOMF_UNLOADABLE,
+	.dom_protoswNPROTOSW = &netlinksw[sizeof(netlinksw) / sizeof(netlinksw[0])]
+};
+
+DOMAIN_SET(netlink);
Index: sys/netlink/netlink_iface.c
===================================================================
--- /dev/null
+++ sys/netlink/netlink_iface.c
@@ -0,0 +1,541 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright
(c) 2021 Ng Peng Nam Sean + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); +#include "opt_inet.h" +#include "opt_inet6.h" +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include /* scope deembedding */ + +#define DEBUG_MOD_NAME nl_iface +#define DEBUG_MAX_LEVEL LOG_DEBUG3 +#include +_DECLARE_DEBUG(LOG_DEBUG3); + + +struct netlink_walkargs { + struct nlmsg_state ns; + struct rib_cmd_info rc; + struct nlmsghdr hdr; + struct nlpcb *so; + uint32_t fibnum; + int family; + int error; + int count; + int dumped; +}; + +#define FAIL_ATTR(a) {\ + RT_LOG(LOG_DEBUG, "failed writing attribute %s (%d)", #a, a); \ + goto error; \ +} + +static eventhandler_tag ifdetach_event, ifattach_event, ifaddr_event; + +/* */ + +/* + * RTM_GETLINK request + * sendto(3, {{len=32, type=RTM_GETLINK, flags=NLM_F_REQUEST|NLM_F_DUMP, seq=1641940952, pid=0}, + * {ifi_family=AF_INET, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}}, 32, 0, NULL, 0) = 32 + * + * Reply: + * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_ETHER, ifi_index=if_nametoindex("enp0s31f6"), ifi_flags=IFF_UP|IFF_BROADCAST|IFF_RUNNING|IFF_MULTICAST|IFF_LOWER_UP, ifi_change=0}, +{{nla_len=10, nla_type=IFLA_ADDRESS}, "\xfe\x54\x00\x52\x3e\x90"} + +[ +{{nla_len=14, nla_type=IFLA_IFNAME}, "enp0s31f6"}, +{{nla_len=8, nla_type=IFLA_TXQLEN}, 1000}, +{{nla_len=5, nla_type=IFLA_OPERSTATE}, 6}, +{{nla_len=5, nla_type=IFLA_LINKMODE}, 0}, +{{nla_len=8, nla_type=IFLA_MTU}, 1500}, +{{nla_len=8, nla_type=IFLA_MIN_MTU}, 68}, + {{nla_len=8, nla_type=IFLA_MAX_MTU}, 9000}, +{{nla_len=8, nla_type=IFLA_GROUP}, 0}, +{{nla_len=8, nla_type=IFLA_PROMISCUITY}, 0}, +{{nla_len=8, nla_type=IFLA_NUM_TX_QUEUES}, 1}, +{{nla_len=8, nla_type=IFLA_GSO_MAX_SEGS}, 65535}, +{{nla_len=8, nla_type=IFLA_GSO_MAX_SIZE}, 65536}, +{{nla_len=8, nla_type=IFLA_NUM_RX_QUEUES}, 1}, +{{nla_len=5, nla_type=IFLA_CARRIER}, 1}, +{{nla_len=13, nla_type=IFLA_QDISC}, "fq_codel"}, 
+{{nla_len=8, nla_type=IFLA_CARRIER_CHANGES}, 2}, +{{nla_len=5, nla_type=IFLA_PROTO_DOWN}, 0}, +{{nla_len=8, nla_type=IFLA_CARRIER_UP_COUNT}, 1}, +{{nla_len=8, nla_type=IFLA_CARRIER_DOWN_COUNT}, 1}, + */ + +static unsigned +ifp_flags_to_netlink(const struct ifnet *ifp) +{ + return (ifp->if_flags); +} + +#define LLADDR_CONST(s) ((const void *)((s)->sdl_data + (s)->sdl_nlen)) +static bool +dump_sa(struct nlmsg_state *ns, int attr, const struct sockaddr *sa) +{ + uint32_t addr_len = 0; + const void *addr_data = NULL; + struct in6_addr addr6; + + if (sa == NULL) + return (true); + + switch (sa->sa_family) { + case AF_INET: + addr_len = sizeof(struct in_addr); + addr_data = &((const struct sockaddr_in *)sa)->sin_addr; + break; + case AF_INET6: + in6_splitscope(&((const struct sockaddr_in6 *)sa)->sin6_addr, &addr6, &addr_len); + addr_len = sizeof(struct in6_addr); + addr_data = &addr6; + break; + case AF_LINK: + addr_len = ((const struct sockaddr_dl *)sa)->sdl_alen; + addr_data = LLADDR_CONST((const struct sockaddr_dl *)sa); + break; + default: + RT_LOG(LOG_DEBUG, "unsupported family: %d, skipping", sa->sa_family); + return (true); + } + + return (nlattr_add(ns, attr, addr_len, addr_data)); +} + +static bool +dump_iface(struct nlmsg_state *ns, struct ifnet *ifp, const struct nlmsghdr *hdr) +{ + struct ifinfomsg *ifinfo; + + int payload_len = sizeof(struct ifinfomsg); + nlmsg_add(ns, hdr->nlmsg_pid, hdr->nlmsg_seq, hdr->nlmsg_type, + hdr->nlmsg_flags, payload_len); + + ifinfo = nlmsg_reserve_object(ns, struct ifinfomsg); + ifinfo->ifi_family = AF_UNSPEC; + ifinfo->__ifi_pad = 0; + ifinfo->ifi_type = ifp->if_type; // ARPHDR + ifinfo->ifi_index = ifp->if_index; + ifinfo->ifi_flags = ifp_flags_to_netlink(ifp); + ifinfo->ifi_change = 0; + + if (!nlattr_add_string(ns, IFLA_IFNAME, if_name(ifp))) + goto error; + + uint8_t val = 0; // XXX: operstate? 
+	if (!nlattr_add_u8(ns, IFLA_OPERSTATE, val))
+		goto error;
+
+	/* XXX: carrier */
+	if (!nlattr_add_u8(ns, IFLA_CARRIER, val))
+		goto error;
+
+	if (!nlattr_add_u8(ns, IFLA_PROTO_DOWN, val))
+		goto error;
+
+	if (!nlattr_add_u8(ns, IFLA_LINKMODE, val))
+		goto error;
+
+	/* Link addr */
+	if ((ifp->if_addr != NULL)) {
+		if (!dump_sa(ns, IFLA_ADDRESS, ifp->if_addr->ifa_addr))
+			goto error;
+	}
+
+	if (!nlattr_add_u32(ns, IFLA_MTU, ifp->if_mtu))
+		goto error;
+/*
+	if (!nlattr_add_u32(ns, IFLA_MIN_MTU, 60))
+		goto error;
+
+	if (!nlattr_add_u32(ns, IFLA_MAX_MTU, 9000))
+		goto error;
+
+	if (!nlattr_add_u32(ns, IFLA_GROUP, 0))
+		goto error;
+*/
+	if (!nlattr_add_u32(ns, IFLA_PROMISCUITY, 0))
+		goto error;
+
+	nlmsg_end(ns);
+
+	return (true);
+
+error:
+	RT_LOG(LOG_DEBUG, "unable to dump interface %s state (ENOMEM)", if_name(ifp));
+	nlmsg_abort(ns);
+	return (false);
+}
+
+/*
+ * Handles a link dump request: writes one NL_RTM_NEWLINK message
+ * (flagged NLM_F_MULTI) per interface in V_ifnet to the requesting
+ * socket, followed by an NLMSG_DONE message carrying the int result.
+ *
+ * Returns 0 on success or ENOMEM when a writer or message cannot be
+ * allocated or an interface cannot be dumped.
+ */
+int
+rtnl_handle_getlink(struct nlmsghdr *hdr, struct nlpcb *nlp, struct netlink_parse_tracker *npt)
+{
+	struct ifnet *ifp;
+	int error = 0;
+
+	struct netlink_walkargs wa = {
+		.so = nlp,
+		.rc.rc_cmd = NL_RTM_NEWLINK,
+		.hdr.nlmsg_pid = hdr->nlmsg_pid,
+		.hdr.nlmsg_seq = hdr->nlmsg_seq,
+		.hdr.nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI,
+		.hdr.nlmsg_type = NL_RTM_NEWLINK,
+	};
+
+	if (!nlmsg_get_socket_writer(NLMSG_LARGE, nlp, &wa.ns)) {
+		RT_LOG(LOG_DEBUG, "error allocating mbuf");
+		return (ENOMEM);
+	}
+
+	RT_LOG(LOG_DEBUG, "Start dump");
+
+	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
+		wa.count++;
+		if (!dump_iface(&wa.ns, ifp, &wa.hdr)) {
+			error = ENOMEM;
+			break;
+		}
+		wa.dumped++;
+	}
+
+	RT_LOG(LOG_DEBUG, "End dump, iterated %d dumped %d", wa.count, wa.dumped);
+
+	if (!nlmsg_add(&wa.ns, hdr->nlmsg_pid, hdr->nlmsg_seq, NLMSG_DONE, 0, sizeof(int))) {
+		RT_LOG(LOG_DEBUG, "Unable to write message");
+		return (ENOMEM);
+	}
+	/* report operation result */
+	int *perror = nlmsg_reserve_object(&wa.ns, int);
+	RT_LOG(LOG_DEBUG2, "record error=%d at off %d (%p)", error, wa.ns.offset, perror);
+	*perror =
error; + nlmsg_end(&wa.ns); + nlmsg_flush(&wa.ns); + + + return (error); +} + + +/* + +{ifa_family=AF_INET, ifa_prefixlen=8, ifa_flags=IFA_F_PERMANENT, ifa_scope=RT_SCOPE_HOST, ifa_index=if_nametoindex("lo")}, + [ + {{nla_len=8, nla_type=IFA_ADDRESS}, inet_addr("127.0.0.1")}, + {{nla_len=8, nla_type=IFA_LOCAL}, inet_addr("127.0.0.1")}, + {{nla_len=7, nla_type=IFA_LABEL}, "lo"}, + {{nla_len=8, nla_type=IFA_FLAGS}, IFA_F_PERMANENT}, + {{nla_len=20, nla_type=IFA_CACHEINFO}, {ifa_prefered=4294967295, ifa_valid=4294967295, cstamp=3619, tstamp=3619}}]}, +--- + +{{len=72, type=RTM_NEWADDR, flags=NLM_F_MULTI, seq=1642191126, pid=566735}, + {ifa_family=AF_INET6, ifa_prefixlen=96, ifa_flags=IFA_F_PERMANENT, ifa_scope=RT_SCOPE_UNIVERSE, ifa_index=if_nametoindex("virbr0")}, + [ + {{nla_len=20, nla_type=IFA_ADDRESS}, inet_pton(AF_INET6, "2a01:4f8:13a:70c:ffff::1")}, + {{nla_len=20, nla_type=IFA_CACHEINFO}, {ifa_prefered=4294967295, ifa_valid=4294967295, cstamp=4283, tstamp=4283}}, + {{nla_len=8, nla_type=IFA_FLAGS}, IFA_F_PERMANENT}]}, +*/ + +static uint8_t +ifa_get_scope(const struct ifaddr *ifa) +{ + const struct sockaddr *sa; + uint8_t addr_scope = RT_SCOPE_UNIVERSE; + + sa = ifa->ifa_addr; + switch (sa->sa_family) { + case AF_INET: + { + struct in_addr addr; + addr = ((const struct sockaddr_in *)sa)->sin_addr; + if (IN_LOOPBACK(addr.s_addr)) + addr_scope = RT_SCOPE_HOST; + else if (IN_LINKLOCAL(addr.s_addr)) + addr_scope = RT_SCOPE_LINK; + break; + } + case AF_INET6: + { + const struct in6_addr *addr; + addr = &((const struct sockaddr_in6 *)sa)->sin6_addr; + if (IN6_IS_ADDR_LOOPBACK(addr)) + addr_scope = RT_SCOPE_HOST; + else if (IN6_IS_ADDR_LINKLOCAL(addr)) + addr_scope = RT_SCOPE_LINK; + break; + } + } + + return (addr_scope); +} + +static uint8_t +inet6_get_plen(const struct in6_addr *addr) +{ + + return (bitcount32(addr->s6_addr32[0]) + bitcount32(addr->s6_addr32[1]) + + bitcount32(addr->s6_addr32[2]) + bitcount32(addr->s6_addr32[3])); +} + +static uint8_t 
+get_sa_plen(const struct sockaddr *sa)
+{
+	const struct in6_addr *paddr6;
+	const struct in_addr *paddr;
+
+	/*
+	 * Prefix length is the number of bits set in the netmask @sa.
+	 * A missing netmask carries no address family, so it is reported
+	 * as 0, the same as an unsupported family. The check has to come
+	 * before sa_family is read.
+	 */
+	if (sa == NULL)
+		return (0);
+
+	switch (sa->sa_family) {
+	case AF_INET:
+		paddr = &(((const struct sockaddr_in *)sa)->sin_addr);
+		return (bitcount32(paddr->s_addr));
+	case AF_INET6:
+		paddr6 = &(((const struct sockaddr_in6 *)sa)->sin6_addr);
+		return (inet6_get_plen(paddr6));
+	}
+
+	return (0);
+}
+
+
+/*
+ * {'attrs': [('IFA_ADDRESS', '12.0.0.1'),
+           ('IFA_LOCAL', '12.0.0.1'),
+           ('IFA_LABEL', 'eth10'),
+           ('IFA_FLAGS', 128),
+           ('IFA_CACHEINFO', {'ifa_preferred': 4294967295, 'ifa_valid': 4294967295, 'cstamp': 63745746, 'tstamp': 63745746})],
+ */
+static bool
+dump_iface_addr(struct nlmsg_state *ns, struct ifnet *ifp, struct ifaddr *ifa,
+    const struct nlmsghdr *hdr)
+{
+	struct ifaddrmsg *ifamsg;
+	struct sockaddr *sa = ifa->ifa_addr;
+
+	RT_LOG(LOG_DEBUG3, "dumping ifa %p type %s(%d) for interface %s",
+	    ifa, rib_print_family(sa->sa_family), sa->sa_family, if_name(ifp));
+
+	int payload_len = sizeof(struct ifaddrmsg);
+	nlmsg_add(ns, hdr->nlmsg_pid, hdr->nlmsg_seq, hdr->nlmsg_type,
+	    hdr->nlmsg_flags, payload_len);
+
+	ifamsg = nlmsg_reserve_object(ns, struct ifaddrmsg);
+	ifamsg->ifa_family = sa->sa_family;
+	ifamsg->ifa_prefixlen = get_sa_plen(ifa->ifa_netmask);
+	ifamsg->ifa_flags = 0; // ifa_flags is useless
+	ifamsg->ifa_scope = ifa_get_scope(ifa);
+	ifamsg->ifa_index = ifp->if_index;
+
+	struct sockaddr *dst_sa = ifa->ifa_dstaddr;
+	if ((dst_sa == NULL) || (dst_sa->sa_family != sa->sa_family))
+		dst_sa = sa;
+	if (!dump_sa(ns, IFA_ADDRESS, dst_sa))
+		FAIL_ATTR(IFA_ADDRESS);
+	if (!dump_sa(ns, IFA_LOCAL, sa))
+		FAIL_ATTR(IFA_LOCAL);
+
+	if (!nlattr_add_string(ns, IFA_LABEL, if_name(ifp)))
+		FAIL_ATTR(IFA_LABEL);
+	uint32_t val = 0; // ifa->ifa_flags;
+	if (!nlattr_add_u32(ns, IFA_FLAGS, val))
+		FAIL_ATTR(IFA_FLAGS);
+
+	nlmsg_end(ns);
+	return (true);
+error:
+	RT_LOG(LOG_DEBUG, "Failed to dump ifa type %s(%d) for 
interface %s", + rib_print_family(sa->sa_family), sa->sa_family, if_name(ifp)); + nlmsg_abort(ns); + return (false); +} + +int +rtnl_handle_getaddr(struct nlmsghdr *hdr, struct nlpcb *nlp, struct netlink_parse_tracker *npt) +{ + struct ifaddr *ifa; + struct ifnet *ifp; + int error = 0; + + struct netlink_walkargs wa = { + .so = nlp, + .rc.rc_cmd = NL_RTM_NEWADDR, + .hdr.nlmsg_pid = hdr->nlmsg_pid, + .hdr.nlmsg_seq = hdr->nlmsg_seq, + .hdr.nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI, + .hdr.nlmsg_type = NL_RTM_NEWADDR, + }; + + if (!nlmsg_get_socket_writer(NLMSG_LARGE, nlp, &wa.ns)) { + RT_LOG(LOG_DEBUG, "error allocating mbuf"); + return (ENOMEM); + } + + RT_LOG(LOG_DEBUG, "Start dump"); + + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { + CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (wa.family != 0 && wa.family != ifa->ifa_addr->sa_family) + continue; + if (ifa->ifa_addr->sa_family == AF_LINK) + continue; + wa.count++; + if (!dump_iface_addr(&wa.ns, ifp, ifa, &wa.hdr)) { + error = ENOMEM; + break; + } + wa.dumped++; + } + if (error != 0) + break; + } + + RT_LOG(LOG_DEBUG, "End dump, iterated %d dumped %d", wa.count, wa.dumped); + + if (!nlmsg_add(&wa.ns, hdr->nlmsg_pid, hdr->nlmsg_seq, NLMSG_DONE, 0, sizeof(int))) { + RT_LOG(LOG_DEBUG, "Unable to write message"); + return (ENOMEM); + } + int *perror = nlmsg_reserve_object(&wa.ns, int); + RT_LOG(LOG_DEBUG2, "record error=%d at off %d (%p)", error, wa.ns.offset, perror); + *perror = error; + nlmsg_end(&wa.ns); + nlmsg_flush(&wa.ns); + + return (error); +} + +static void +rtnl_handle_ifaddr(void *arg __unused, struct ifaddr *ifa, int cmd) +{ + struct nlmsghdr hdr = {}; + struct nlmsg_state ns = {}; + uint32_t group = 0; + + if (V_nl_ctl == NULL) + return; + + switch (ifa->ifa_addr->sa_family) { + case AF_INET: + group = RTNLGRP_IPV4_IFADDR; + break; + case AF_INET6: + group = RTNLGRP_IPV6_IFADDR; + break; + default: + RT_LOG(LOG_DEBUG2, "ifa notification for unknown AF: %d", + ifa->ifa_addr->sa_family); + 
return; + } + + if (!nlmsg_get_group_writer(NLMSG_LARGE, group, &ns)) { + RT_LOG(LOG_DEBUG, "error allocating mbuf"); + return; + } + + hdr.nlmsg_type = (cmd == RTM_DELETE) ? NL_RTM_DELADDR : NL_RTM_NEWADDR; + + dump_iface_addr(&ns, ifa->ifa_ifp, ifa, &hdr); + nlmsg_flush(&ns); +} + +static void +rtnl_handle_ifattach(void *arg, struct ifnet *ifp) +{ + struct nlmsghdr hdr = { .nlmsg_type = NL_RTM_NEWLINK }; + struct nlmsg_state ns = {}; + + if (V_nl_ctl == NULL) + return; + + if (!nlmsg_get_group_writer(NLMSG_LARGE, RTNLGRP_LINK, &ns)) { + RT_LOG(LOG_DEBUG, "error allocating mbuf"); + return; + } + dump_iface(&ns, ifp, &hdr); + nlmsg_flush(&ns); +} + +static void +rtnl_handle_ifdetach(void *arg, struct ifnet *ifp) +{ + struct nlmsghdr hdr = { .nlmsg_type = NL_RTM_DELLINK }; + struct nlmsg_state ns = {}; + + if (V_nl_ctl == NULL) + return; + + if (!nlmsg_get_group_writer(NLMSG_LARGE, RTNLGRP_LINK, &ns)) { + RT_LOG(LOG_DEBUG, "error allocating mbuf"); + return; + } + dump_iface(&ns, ifp, &hdr); + nlmsg_flush(&ns); +} + +void +rtnl_ifaces_init(void) +{ + ifattach_event = EVENTHANDLER_REGISTER( + ifnet_arrival_event, rtnl_handle_ifattach, NULL, + EVENTHANDLER_PRI_ANY); + ifdetach_event = EVENTHANDLER_REGISTER( + ifnet_departure_event, rtnl_handle_ifdetach, NULL, + EVENTHANDLER_PRI_ANY); + ifaddr_event = EVENTHANDLER_REGISTER( + rt_addrmsg, rtnl_handle_ifaddr, NULL, + EVENTHANDLER_PRI_ANY); +} + +void +rtnl_ifaces_destroy(void) +{ + EVENTHANDLER_DEREGISTER(ifnet_arrival_event, ifattach_event); + EVENTHANDLER_DEREGISTER(ifnet_departure_event, ifdetach_event); + EVENTHANDLER_DEREGISTER(rt_addrmsg, ifaddr_event); +} Index: sys/netlink/netlink_io.c =================================================================== --- /dev/null +++ sys/netlink/netlink_io.c @@ -0,0 +1,364 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Ng Peng Nam Sean + * Copyright (c) 2022 Alexander V. 
Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#define DEBUG_MOD_NAME nl_io
+#define DEBUG_MAX_LEVEL LOG_DEBUG3
+#include
+_DECLARE_DEBUG(LOG_DEBUG3);
+
+
+static struct sockaddr_nl _nl_empty_src = {
+	.nl_len = sizeof(struct sockaddr_nl),
+	.nl_family = PF_NETLINK,
+	.nl_pid = 0 /* comes from the kernel */
+};
+static struct sockaddr *nl_empty_src = (struct sockaddr *)&_nl_empty_src;
+
+static int nl_receive(struct mbuf *m, struct nlpcb *nlp);
+
+/*
+ * Queues packet @m sent on socket @so for deferred processing.
+ *
+ * The packet is appended to the per-socket pending list and the socket
+ * task is enqueued unless one is already pending. @m is always consumed:
+ * if the socket is not active the whole mbuf chain is freed and EINVAL
+ * is returned. Returns 0 once the packet is queued.
+ */
+int
+nl_receive_async(struct mbuf *m, struct socket *so)
+{
+	struct nlpcb *nlp = sotonlpcb(so);
+
+	m->m_nextpkt = NULL;
+
+	NLP_LOCK(nlp);
+
+	if ((__predict_false(!nlp->nl_active))) {
+		NLP_UNLOCK(nlp);
+		/* m may be a chain: m_free() would release only its head */
+		m_freem(m);
+		return (EINVAL);
+	}
+
+	/* XXX: Implement queue limits */
+	if (nlp->nl_queue_head == NULL) {
+		nlp->nl_queue_head = m;
+		nlp->nl_queue_last = m;
+	} else {
+		nlp->nl_queue_last->m_nextpkt = m;
+		nlp->nl_queue_last = m;
+	}
+	nlp->nl_queue_length += m_length(m, NULL);
+	RT_LOG(LOG_DEBUG3, "enqueue, total len %ld", nlp->nl_queue_length);
+
+	if (!nlp->nl_task_pending) {
+		nlp->nl_task_pending = true;
+		taskqueue_enqueue(nlp->nl_taskqueue, &nlp->nl_task);
+	}
+	NLP_UNLOCK(nlp);
+
+	return (0);
+}
+
+/*
+ * Detaches the whole pending queue of @nlp under the pcb lock and then
+ * passes each packet to nl_receive() with the lock released.
+ */
+static void
+nl_process_received(struct nlpcb *nlp)
+{
+	struct mbuf *m;
+
+	NLP_LOCK(nlp);
+	m = nlp->nl_queue_head;
+	nlp->nl_queue_head = NULL;
+	nlp->nl_queue_last = NULL;
+	nlp->nl_queue_length = 0;
+	nlp->nl_task_pending = false;
+	NLP_UNLOCK(nlp);
+
+	RT_LOG(LOG_DEBUG2, "taskqueue called");
+
+	while (m != NULL) {
+		struct mbuf *m_next = m->m_nextpkt;
+		m->m_nextpkt = NULL;
+		nl_receive(m, nlp);
+		m = m_next;
+	}
+}
+
+void
+nl_taskqueue_handler(void *_arg, int pending)
+{
+	struct nlpcb *nlp = (struct nlpcb *)_arg;
+	struct epoch_tracker et;
+
+	CURVNET_SET(nlp->nl_socket->so_vnet);
+	NET_EPOCH_ENTER(et);
+	nl_process_received(nlp);
+	NET_EPOCH_EXIT(et);
+	CURVNET_RESTORE();
+}
+
+/*
+ * Delivers mbuf chain @m to the receive buffer of the socket owning @nlp.
+ *
+ * @m is always consumed: it is either appended to the socket buffer or
+ * freed (receive buffer overflow, or the pcb no longer has a socket).
+ * Returns true only if the data was queued and the reader woken up.
+ */
+bool
+nl_send_one(struct mbuf *m, struct nlpcb *nlp)
+{
+#if DEBUG_MAX_LEVEL > LOG_DEBUG2
+	struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *);
+	RT_LOG(LOG_DEBUG2, "TX mbuf len %u msg type %d first hdrlen %u",
+	    m->m_len, hdr->nlmsg_type, hdr->nlmsg_len);
+#endif
+	bool result = false;
+	NLP_LOCK(nlp);
+	if (nlp->nl_socket != NULL) {
+		struct socket *so = nlp->nl_socket;
+
+		if (sbappendaddr(&so->so_rcv, nl_empty_src, m, NULL) != 0) {
+			sorwakeup(so);
+			RT_LOG(LOG_DEBUG3, "TX done");
+			result = true;
+		} else {
+			soroverflow(so);
+			m_freem(m);
+			RT_LOG(LOG_DEBUG, "socket RX overflow for PID %u",
+			    nlp->nl_process_id);
+		}
+
+	} else {
+		/*
+		 * No socket left to deliver to. Callers cannot tell this
+		 * failure from the overflow one, so free the data here too.
+		 */
+		m_freem(m);
+	}
+	NLP_UNLOCK(nlp);
+
+	return (result);
+}
+
+/*
+ * Same as nl_send_one(), but for sockets marked nl_linux the data is
+ * first passed through mbufs_to_linux(); a NULL result from the
+ * conversion is reported as a failed send.
+ */
+static bool
+nl_send_one_wrapper(struct mbuf *m, struct nlpcb *nlp)
+{
+	if (nlp->nl_linux) {
+		m = mbufs_to_linux(m, nlp);
+		if (m == NULL)
+			return (false);
+	}
+	return (nl_send_one(m, nlp));
+}
+
+/*
+ * Used when certain data needs to be broadcasted to the group
+ */
+void
+nl_send_group(struct mbuf *m, uint32_t groups_mask)
+{
+	struct nlpcb *nlp_last = NULL;
+	struct nlpcb *nlp;
+	NLCTL_TRACKER;
+
+#if DEBUG_MAX_LEVEL > LOG_DEBUG2
+	struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *);
+	RT_LOG(LOG_DEBUG2, "MCAST mbuf len %u msg type %d len %u to groups 0x%X",
+	    m->m_len, hdr->nlmsg_type, hdr->nlmsg_len, groups_mask);
+#endif
+
+	struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
+	if (ctl == NULL) {
+		/*
+		 * Can be the case when notification is sent within VNET
+		 * which doesn't have any netlink sockets.
+ */ + m_freem(m); + return; + } + + NLCTL_RLOCK(ctl); + + CK_LIST_FOREACH(nlp, &V_nl_ctl->ctl_pcb_head, nl_next) { + if (nlp->nl_groups & groups_mask) { + if (nlp_last != NULL) { + struct mbuf *m_copy; + m_copy = m_copym(m, 0, M_COPYALL, M_NOWAIT); + if (m_copy != NULL) + nl_send_one_wrapper(m_copy, nlp_last); + else { + NLP_LOCK(nlp_last); + if (nlp_last->nl_socket != NULL) + sorwakeup(nlp_last->nl_socket); + NLP_UNLOCK(nlp_last); + } + } + nlp_last = nlp; + } + } + if (nlp_last != NULL) + nl_send_one_wrapper(m, nlp_last); + else + m_freem(m); + + NLCTL_RUNLOCK(ctl); +} + +/* + * Sends an ack message + */ +void +nlmsg_ack(struct nlpcb *nlp, int error, struct nlmsghdr *nlmsg) +{ + struct nlmsgerr *errmsg; + int payload_len; + uint32_t flags = nlp->nl_flags; + struct nlmsg_state ns; + bool cap_ack; + + payload_len = sizeof(struct nlmsgerr); + + /* + * The only case when we send the full message in the + * reply is when there is an error and NETLINK_CAP_ACK + * is not set. + */ + cap_ack = (error == 0) || (flags & NLF_CAP_ACK); + if (!cap_ack) + payload_len += nlmsg->nlmsg_len - sizeof(struct nlmsghdr); + + /* + * TODO: handle NETLINK_F_EXT_ACK sockopt + * TODO: handle cookies + */ + + int sz = payload_len + sizeof(struct nlmsghdr); + if (!nlmsg_get_socket_writer(sz, nlp, &ns)) { + RT_LOG(LOG_NOTICE, "error allocating nlmsg(%d)", sz); + return; + } + + RT_LOG(LOG_DEBUG, "type-%d;payload-%d;pid-%d;seq-%d", NLMSG_ERROR, payload_len, + nlp->nl_port, nlmsg->nlmsg_seq); + + nlmsg_add(&ns, nlp->nl_port, nlmsg->nlmsg_seq, NLMSG_ERROR, 0, payload_len); + + errmsg = nlmsg_reserve_data(&ns, payload_len, struct nlmsgerr); + errmsg->error = error; + /* In case of error copy the whole message, else just the header */ + memcpy(&errmsg->msg, nlmsg, cap_ack ? 
sizeof(*nlmsg) : nlmsg->nlmsg_len); + + nlmsg_end(&ns); + nlmsg_flush(&ns); +} + +static int +nl_receive_message(struct nlmsghdr *hdr, int remaining_length, + struct nlpcb *nlp, struct netlink_parse_tracker *npt) +{ + nl_handler handler = nl_handlers[nlp->nl_proto]; + int error = 0; + + RT_LOG(LOG_DEBUG3, "msg len: %d type: %d", hdr->nlmsg_len, hdr->nlmsg_type); + + if (__predict_false(hdr->nlmsg_len > remaining_length)) { + RT_LOG(LOG_DEBUG, "invalid message"); + return (EINVAL); + } else if (__predict_false(hdr->nlmsg_len < sizeof(*hdr))) { + RT_LOG(LOG_DEBUG, "message too short: %d", hdr->nlmsg_len); + return (EINVAL); + } + /* Stamp each message with sender pid */ + hdr->nlmsg_pid = nlp->nl_port; + + if (hdr->nlmsg_flags & NLM_F_REQUEST && hdr->nlmsg_type >= NLMSG_MIN_TYPE) { + RT_LOG(LOG_DEBUG2, "handling message with msg type: %d", + hdr->nlmsg_type); + + struct nlmsghdr *thdr = hdr; + if (nlp->nl_linux) + thdr = nlmsg_from_linux(hdr, npt); + error = handler(thdr, npt); + RT_LOG(LOG_DEBUG2, "retcode: %d", error); + } + if ((hdr->nlmsg_flags & NLM_F_ACK) || (error != 0 && error != EINTR)) { + RT_LOG(LOG_DEBUG3, "ack"); + nlmsg_ack(nlp, error, hdr); + RT_LOG(LOG_DEBUG3, "done"); + } + + return (0); +} + +/* + * Processes an incoming packet, which can contain multiple netlink messages + */ +static int +nl_receive(struct mbuf *m, struct nlpcb *nlp) +{ + int offset, buffer_length, error = 0; + struct nlmsghdr *hdr; + char *buffer; + + RT_LOG(LOG_DEBUG, "RX netlink mbuf %p on %p", m, nlp->nl_socket); + + int data_length = m_length(m, NULL); + buffer_length = roundup2(data_length, 8) + SCRATCH_BUFFER_SIZE; + if (nlp->nl_linux) + buffer_length += roundup2(data_length, 8); + buffer = malloc(buffer_length, M_NETLINK, M_NOWAIT | M_ZERO); + if (buffer == NULL) { + m_freem(m); + RT_LOG(LOG_DEBUG, "Unable to allocate %d bytes of memory", + buffer_length); + return (ENOMEM); + } + m_copydata(m, 0, data_length, buffer); + m_freem(m); // XXX: reuse for ack? 
+ + struct netlink_parse_tracker npt = { + .nlp = nlp, + .lb.base = &buffer[roundup2(data_length, 8)], + .lb.size = buffer_length - roundup2(data_length, 8), + }; + + for (offset = 0; offset + sizeof(struct nlmsghdr) <= data_length;) { + hdr = (struct nlmsghdr *)&buffer[offset]; + /* Save length prior to calling handler */ + int msglen = NLMSG_ALIGN(hdr->nlmsg_len); + RT_LOG(LOG_DEBUG2, "parsing offset %d/%d", offset, data_length); + /* Update parse state */ + npt.hdr = hdr; + lb_clear(&npt.lb); + error = nl_receive_message(hdr, data_length - offset, nlp, &npt); + if (__predict_false(error != 0)) + break; + offset += msglen; + } + RT_LOG(LOG_DEBUG2, "packet parsing done"); + + free(buffer, M_NETLINK); + return (error); +} Index: sys/netlink/netlink_linux.c =================================================================== --- /dev/null +++ sys/netlink/netlink_linux.c @@ -0,0 +1,494 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Ng Peng Nam Sean + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); +#include "opt_inet.h" +#include "opt_inet6.h" +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define DEBUG_MOD_NAME nl_linux +#define DEBUG_MAX_LEVEL LOG_DEBUG3 +#include +_DECLARE_DEBUG(LOG_DEBUG3); + +static int +_linux_to_bsd_domain(int domain) +{ + + switch (domain) { + case LINUX_AF_UNSPEC: + return (AF_UNSPEC); + case LINUX_AF_UNIX: + return (AF_LOCAL); + case LINUX_AF_INET: + return (AF_INET); + case LINUX_AF_INET6: + return (AF_INET6); + } + return (-1); +} + +static int +_bsd_to_linux_domain(int domain) +{ + + switch (domain) { + case AF_UNSPEC: + return (LINUX_AF_UNSPEC); + case AF_LOCAL: + return (LINUX_AF_UNIX); + case AF_INET: + return (LINUX_AF_INET); + case AF_INET6: + return (LINUX_AF_INET6); + } + return (-1); +} + +static bool +valid_rta_size(const struct rtattr *rta, int sz) +{ + return (NL_RTA_DATA_LEN(rta) == sz); +} + +static bool +valid_rta_u32(const struct rtattr *rta) +{ + return (valid_rta_size(rta, sizeof(uint32_t))); +} + +static uint32_t +nl_rta_get_uint32(const struct rtattr *rta) +{ + return (*((const uint32_t *)NL_RTA_DATA_CONST(rta))); +} + +#if 0 +static struct nlmsghdr * +rtnl_ifaddr_from_linux(struct nlmsghdr *hdr, struct netlink_parse_tracker *npt) +{ + /* Tweak address families and default fib only */ + struct ifaddrmsg *ifamsg = 
(struct ifaddrmsg *)(hdr + 1); + + ifamsg->ifa_family = _linux_to_bsd_domain(ifamsg->ifa_family); + + return (hdr); +} +#endif + +static struct nlmsghdr * +rtnl_route_from_linux(struct nlmsghdr *hdr, struct netlink_parse_tracker *npt) +{ + /* Tweak address families and default fib only */ + struct rtmsg *rtm = (struct rtmsg *)(hdr + 1); + struct nlattr *nla, *nla_head; + int attrs_len; + + rtm->rtm_family = _linux_to_bsd_domain(rtm->rtm_family); + + if (rtm->rtm_table == 254) + rtm->rtm_table = 0; + + attrs_len = hdr->nlmsg_len - sizeof(struct nlmsghdr); + attrs_len -= NETLINK_ALIGN(sizeof(struct rtmsg)); + nla_head = (struct nlattr *)((char *)rtm + NETLINK_ALIGN(sizeof(struct rtmsg))); + + NLA_FOREACH(nla, nla_head, attrs_len) { + RT_LOG(LOG_DEBUG3, "GOT type %d len %d total %d", + nla->nla_type, nla->nla_len, attrs_len); + struct rtattr *rta = (struct rtattr *)nla; + if (rta->rta_len < sizeof(struct rtattr)) { + break; + } + switch (rta->rta_type) { + case NL_RTA_TABLE: + if (!valid_rta_u32(rta)) + goto done; + rtm->rtm_table = 0; + uint32_t fibnum = nl_rta_get_uint32(rta); + RT_LOG(LOG_DEBUG, "GET RTABLE: %u", fibnum); + if (fibnum == 254) { + *((uint32_t *)NL_RTA_DATA(rta)) = 0; + } + break; + } + } + +done: + return (hdr); +} + +static struct nlmsghdr * +rtnl_from_linux(struct nlmsghdr *hdr, struct netlink_parse_tracker *npt) +{ + switch (hdr->nlmsg_type) { + case NL_RTM_GETROUTE: + case NL_RTM_NEWROUTE: + case NL_RTM_DELROUTE: + return (rtnl_route_from_linux(hdr, npt)); + default: + RT_LOG(LOG_DEBUG, "Passing message type %d untranslated", + hdr->nlmsg_type); + } + + return (hdr); +} + +struct nlmsghdr * +nlmsg_from_linux(struct nlmsghdr *hdr, struct netlink_parse_tracker *npt) +{ + struct nlpcb *nlp = npt->nlp; + + switch (nlp->nl_proto) { + case NETLINK_ROUTE: + return (rtnl_from_linux(hdr, npt)); + } + + return (hdr); +} + + +/************************************************************ + * Kernel -> Linux + 
************************************************************/ + +static bool +handle_default_out(struct nlmsghdr *hdr, struct nlmsg_state *ns) +{ + char *out_hdr; + out_hdr = nlmsg_reserve_data(ns, NLMSG_ALIGN(hdr->nlmsg_len), char); + + if (out_hdr != NULL) { + memcpy(out_hdr, hdr, hdr->nlmsg_len); + return (true); + } + return (false); +} + +static bool +nlmsg_copy_header(struct nlmsghdr *hdr, struct nlmsg_state *ns) +{ + return (nlmsg_add(ns, hdr->nlmsg_pid, hdr->nlmsg_seq, hdr->nlmsg_type, + hdr->nlmsg_flags, 0)); +} + +static void * +_nlmsg_copy_next_header(struct nlmsghdr *hdr, struct nlmsg_state *ns, int sz) +{ + void *next_hdr = nlmsg_reserve_data(ns, sz, void); + memcpy(next_hdr, hdr + 1, NLMSG_ALIGN(sz)); + + return (next_hdr); +} +#define nlmsg_copy_next_header(_hdr, _ns, _t) \ + ((_t *)(_nlmsg_copy_next_header(_hdr, _ns, sizeof(_t)))) + +static bool +nlmsg_copy_nla(const struct nlattr *nla_orig, struct nlmsg_state *ns) +{ + struct nlattr *nla = nlmsg_reserve_data(ns, nla_orig->nla_len, struct nlattr); + if (nla != NULL) { + memcpy(nla, nla_orig, nla_orig->nla_len); + return (true); + } + return (false); +} + +static bool +nlmsg_copy_all_nla(struct nlmsghdr *hdr, int raw_hdrlen, struct nlmsg_state *ns) +{ + struct nlattr *nla; + + int hdrlen = NETLINK_ALIGN(raw_hdrlen); + int attrs_len = hdr->nlmsg_len - sizeof(struct nlmsghdr) - hdrlen; + struct nlattr *nla_head = (struct nlattr *)((char *)(hdr + 1) + hdrlen); + + NLA_FOREACH(nla, nla_head, attrs_len) { + if (!nlmsg_copy_nla(nla, ns)) + return (false); + } + return (true); +} + +static unsigned int +rtnl_if_flags_to_linux(unsigned int if_flags) +{ + unsigned int result = 0; + + for (int i = 0; i < 31; i++) { + unsigned int flag = 1 << i; + if (!(flag & if_flags)) + continue; + switch (flag) { + case IFF_UP: + case IFF_BROADCAST: + case IFF_DEBUG: + case IFF_LOOPBACK: + case IFF_POINTOPOINT: + case IFF_NOARP: + case IFF_PROMISC: + case IFF_ALLMULTI: + result |= flag; + break; + case IFF_KNOWSEPOCH: + 
case IFF_DRV_RUNNING: + case IFF_DRV_OACTIVE: + case IFF_SIMPLEX: + case IFF_LINK0: + case IFF_LINK1: + case IFF_LINK2: + case IFF_CANTCONFIG: + case IFF_PPROMISC: + case IFF_MONITOR: + case IFF_STATICARP: + case IFF_STICKYARP: + case IFF_DYING: + case IFF_RENAMING: + case IFF_NOGROUP: + /* No Linux analogue */ + break; + case IFF_MULTICAST: + result |= 1 << 12; + } + } + return (result); +} + +static bool +rtnl_newlink_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp, + struct nlmsg_state *ns) +{ + if (!nlmsg_copy_header(hdr, ns)) + return (false); + + struct ifinfomsg *ifinfo; + ifinfo = nlmsg_copy_next_header(hdr, ns, struct ifinfomsg); + + ifinfo->ifi_family = _bsd_to_linux_domain(ifinfo->ifi_family); + /* Convert interface type */ + switch (ifinfo->ifi_type) { + case IFT_ETHER: + ifinfo->ifi_type = 1; // ARPHRD_ETHER + break; + } + ifinfo->ifi_flags = rtnl_if_flags_to_linux(ifinfo->ifi_flags); + + /* Copy attributes unchanged */ + if (!nlmsg_copy_all_nla(hdr, sizeof(struct ifinfomsg), ns)) + return (false); + + /* make ip(8) happy */ + if (!nlattr_add_string(ns, IFLA_QDISC, "noqueue")) + return (false); + + if (!nlattr_add_u32(ns, IFLA_TXQLEN, 1000)) + return (false); + + nlmsg_end(ns); + RT_LOG(LOG_DEBUG2, "done processing ns %p", ns); + return (true); +} + +static bool +rtnl_newaddr_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp, + struct nlmsg_state *ns) +{ + if (!nlmsg_copy_header(hdr, ns)) + return (false); + + struct ifaddrmsg *ifamsg; + ifamsg = nlmsg_copy_next_header(hdr, ns, struct ifaddrmsg); + + int old_family = ifamsg->ifa_family; + ifamsg->ifa_family = _bsd_to_linux_domain(ifamsg->ifa_family); + RT_LOG(LOG_DEBUG2, "CONVERT FAMILY %d -> %d", old_family, ifamsg->ifa_family); + /* XXX: fake ifa_flags? 
*/ + + /* Copy attributes unchanged */ + if (!nlmsg_copy_all_nla(hdr, sizeof(struct ifaddrmsg), ns)) + return (false); + + nlmsg_end(ns); + RT_LOG(LOG_DEBUG2, "done processing ns %p", ns); + return (true); +} + +static bool +rtnl_newroute_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp, + struct nlmsg_state *ns) +{ + if (!nlmsg_copy_header(hdr, ns)) + return (false); + + struct rtmsg *rtm; + rtm = nlmsg_copy_next_header(hdr, ns, struct rtmsg); + int old_family = rtm->rtm_family; + rtm->rtm_family = _bsd_to_linux_domain(rtm->rtm_family); + + RT_LOG(LOG_DEBUG2, "FAMILY %d -> %d", old_family, rtm->rtm_family); + + struct nlattr *nla; + + int hdrlen = NETLINK_ALIGN(sizeof(struct rtmsg)); + int attrs_len = hdr->nlmsg_len - sizeof(struct nlmsghdr) - hdrlen; + struct nlattr *nla_head = (struct nlattr *)((char *)(hdr + 1) + hdrlen); + + NLA_FOREACH(nla, nla_head, attrs_len) { + struct rtattr *rta = (struct rtattr *)nla; + //RT_LOG(LOG_DEBUG, "READING attr %d len %d", nla->nla_type, nla->nla_len); + if (rta->rta_len < sizeof(struct rtattr)) { + break; + } + + switch (rta->rta_type) { + case NL_RTA_TABLE: + { + uint32_t fibnum; + fibnum = nl_rta_get_uint32(rta); + if (fibnum == 0) + fibnum = 254; + RT_LOG(LOG_DEBUG3, "XFIBNUM %u", fibnum); + if (!nlattr_add_u32(ns, NL_RTA_TABLE, fibnum)) + return (false); + } + break; + default: + if (!nlmsg_copy_nla(nla, ns)) + return (false); + break; + } + } + + nlmsg_end(ns); + RT_LOG(LOG_DEBUG2, "done processing ns %p", ns); + return (true); +} + +static bool +rtnl_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nlmsg_state *ns) +{ + RT_LOG(LOG_DEBUG2, "Got message type %d", hdr->nlmsg_type); + + switch (hdr->nlmsg_type) { + case NL_RTM_NEWLINK: + return (rtnl_newlink_to_linux(hdr, nlp, ns)); + case NL_RTM_NEWADDR: + case NL_RTM_DELADDR: + return (rtnl_newaddr_to_linux(hdr, nlp, ns)); + case NL_RTM_NEWROUTE: + case NL_RTM_DELROUTE: + return (rtnl_newroute_to_linux(hdr, nlp, ns)); + default: + RT_LOG(LOG_DEBUG, "[WARN] Passing 
message type %d untranslated", + hdr->nlmsg_type); + return (handle_default_out(hdr, ns)); + } +} + +static bool +nlmsg_to_linux(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nlmsg_state *ns) +{ + switch (nlp->nl_proto) { + case NETLINK_ROUTE: + return (rtnl_to_linux(hdr, nlp, ns)); + default: + return (handle_default_out(hdr, ns)); + } +} + +struct mbuf * +nlmsgs_to_linux(char *buf, int data_length, struct nlpcb *nlp) +{ + RT_LOG(LOG_DEBUG, "LINUX: get %p size %d", buf, data_length); + struct nlmsg_state ns = {}; + + struct mbuf *m = NULL; + if (!nlmsg_get_chain_writer(data_length, &m, &ns)) { + RT_LOG(LOG_DEBUG, "unable to setup chain writer for size %d", + data_length); + return (NULL); + } + + /* Assume correct headers. Buffer IS mutable */ + int count = 0; + for (int offset = 0; offset + sizeof(struct nlmsghdr) <= data_length;) { + struct nlmsghdr *hdr = (struct nlmsghdr *)&buf[offset]; + int msglen = NLMSG_ALIGN(hdr->nlmsg_len); + count++; + + if (!nlmsg_to_linux(hdr, nlp, &ns)) { + RT_LOG(LOG_DEBUG, "failed to process msg type %d", + hdr->nlmsg_type); + m_freem(m); + return (NULL); + } + offset += msglen; + } + nlmsg_flush(&ns); + nlmsg_free(&ns); + RT_LOG(LOG_DEBUG2, "Processed %d messages, chain size %d", count, m ? 
m_length(m, NULL) : 0); + + return (m); +} + +struct mbuf * +mbufs_to_linux(struct mbuf *m, struct nlpcb *nlp) +{ + /* XXX: easiest solution, not optimized for performance */ + int data_length = m_length(m, NULL); + char *buf = malloc(data_length, M_NETLINK, M_NOWAIT); + if (buf == NULL) { + RT_LOG(LOG_INFO, "unable to allocate %d bytes, dropping message", + data_length); + m_freem(m); + return (NULL); + } + m_copydata(m, 0, data_length, buf); + m_freem(m); + + return (nlmsgs_to_linux(buf, data_length, nlp)); +} + Index: sys/netlink/netlink_message.c =================================================================== --- /dev/null +++ sys/netlink/netlink_message.c @@ -0,0 +1,582 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define DEBUG_MOD_NAME nl_message +#define DEBUG_MAX_LEVEL LOG_DEBUG3 +#include +_DECLARE_DEBUG(LOG_DEBUG3); + + +typedef bool nlwriter_op_init(struct nlmsg_state *ns, int size, bool waitok); +typedef bool nlwriter_op_write(struct nlmsg_state *ns, char *buf, int buflen); + +struct nlwriter_ops { + nlwriter_op_init *init; + nlwriter_op_write *write_socket; + nlwriter_op_write *write_group; + nlwriter_op_write *write_chain; +}; + +/* + * NS_WRITER_TYPE_BUF + * Writes message to a temporary memory buffer, + * flushing to the socket/group when buffer size limit is reached + */ +static bool +nlmsg_get_ns_buf(struct nlmsg_state *ns, int size, bool waitok) +{ + int mflag = waitok ? 
M_WAITOK : M_NOWAIT; + ns->_buf = malloc(size, M_NETLINK, mflag | M_ZERO); + if (__predict_false(ns->_buf == NULL)) + return (false); + ns->alloc_len = size; + ns->offset = 0; + ns->hdr = NULL; + ns->data = ns->_buf; + ns->writer_type = NS_WRITER_TYPE_BUF; + ns->malloc_flag = mflag; + return (true); +} + +static bool +nlmsg_write_socket_buf(struct nlmsg_state *ns, char *buf, int datalen) +{ + RT_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, ns); + if (__predict_false(datalen == 0)) { + free(buf, M_NETLINK); + return (true); + } + + struct mbuf *m = m_getm2(NULL, datalen, ns->malloc_flag, MT_DATA, M_PKTHDR); + if (__predict_false(m == NULL)) { + /* XXX: should we set sorcverr? */ + free(buf, M_NETLINK); + return (false); + } + m_append(m, datalen, buf); + free(buf, M_NETLINK); + + return (nl_send_one(m, (struct nlpcb *)(ns->arg))); +} + +static bool +nlmsg_write_group_buf(struct nlmsg_state *ns, char *buf, int datalen) +{ + RT_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, ns->arg); + if (__predict_false(datalen == 0)) { + free(buf, M_NETLINK); + return (true); + } + + struct mbuf *m = m_getm2(NULL, datalen, ns->malloc_flag, MT_DATA, M_PKTHDR); + if (__predict_false(m == NULL)) { + free(buf, M_NETLINK); + return (false); + } + bool success = m_append(m, datalen, buf) != 0; + free(buf, M_NETLINK); + + if (!success) + return (false); + + nl_send_group(m, (uint32_t)(uintptr_t)(ns->arg)); + return (true); +} + +static bool +nlmsg_write_chain_buf(struct nlmsg_state *ns, char *buf, int datalen) +{ + struct mbuf **m0 = (struct mbuf **)(ns->arg); + RT_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, ns->arg); + + if (__predict_false(datalen == 0)) { + free(buf, M_NETLINK); + return (true); + } + + if (*m0 == NULL) { + struct mbuf *m; + + m = m_getm2(NULL, datalen, ns->malloc_flag, MT_DATA, M_PKTHDR); + if (__predict_false(m == NULL)) { + free(buf, M_NETLINK); + return (false); + } + *m0 = m; + } + if (__predict_false(m_append(*m0, 
datalen, buf) == 0)) { + free(buf, M_NETLINK); + return (false); + } + return (true); +} + + +/* + * NS_WRITER_TYPE_MBUF + * Writes message to the allocated mbuf, + * flushing to socket/group when mbuf size limit is reached. + * This is the most efficient mechanism as it avoids double-copying. + * + * Allocates a single mbuf suitable to store up to @size bytes of data. + * If size < MHLEN (around 160 bytes), allocates mbuf with pkghdr + * If size <= MCLBYTES (2k), allocate a single mbuf cluster + * Otherwise, return NULL. + */ +static bool +nlmsg_get_ns_mbuf(struct nlmsg_state *ns, int size, bool waitok) +{ + struct mbuf *m; + + int mflag = waitok ? M_WAITOK : M_NOWAIT; + m = m_get2(size, mflag, MT_DATA, M_PKTHDR); + if (__predict_false(m == NULL)) + return (false); + ns->alloc_len = M_TRAILINGSPACE(m); + ns->offset = 0; + ns->hdr = NULL; + ns->_m = m; + ns->data = mtod(m, void *); + ns->writer_type = NS_WRITER_TYPE_MBUF; + ns->malloc_flag = mflag; + RT_LOG(LOG_DEBUG2, "alloc mbuf %p req_len %d alloc_len %d data_ptr %p", + m, size, ns->alloc_len, ns->data); + return (true); +} + +static bool +nlmsg_write_socket_mbuf(struct nlmsg_state *ns, char *buf, int datalen) +{ + struct mbuf *m = (struct mbuf *)buf; + RT_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, ns->arg); + + if (__predict_false(datalen == 0)) { + m_freem(m); + return (true); + } + + m->m_pkthdr.len = datalen; + m->m_len = datalen; + return (nl_send_one(m, (struct nlpcb *)(ns->arg))); +} + +static bool +nlmsg_write_group_mbuf(struct nlmsg_state *ns, char *buf, int datalen) +{ + struct mbuf *m = (struct mbuf *)buf; + RT_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, ns->arg); + + if (__predict_false(datalen == 0)) { + m_freem(m); + return (true); + } + + m->m_pkthdr.len = datalen; + m->m_len = datalen; + nl_send_group(m, (uint32_t)(uintptr_t)(ns->arg)); + return (true); +} + +static bool +nlmsg_write_chain_mbuf(struct nlmsg_state *ns, char *buf, int datalen) +{ + struct mbuf 
*m_new = (struct mbuf *)buf; + struct mbuf **m0 = (struct mbuf **)(ns->arg); + + RT_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, ns->arg); + + if (__predict_false(datalen == 0)) { + m_freem(m_new); + return (true); + } + + m_new->m_pkthdr.len = datalen; + m_new->m_len = datalen; + + if (*m0 == NULL) { + *m0 = m_new; + } else { + struct mbuf *m_last; + for (m_last = *m0; m_last->m_next != NULL; m_last = m_last->m_next) + ; + m_last->m_next = m_new; + (*m0)->m_pkthdr.len += datalen; + } + + return (true); +} + +/* + * NS_WRITER_TYPE_LBUF + * Writes message to the allocated memory buffer, + * flushing to socket/group when mbuf size limit is reached. + * Calls linux handler to rewrite messages before sending to the socket. + */ +static bool +nlmsg_get_ns_lbuf(struct nlmsg_state *ns, int size, bool waitok) +{ + int mflag = waitok ? M_WAITOK : M_NOWAIT; + size = roundup2(size, sizeof(void *)); + int add_size = sizeof(struct linear_buffer) + SCRATCH_BUFFER_SIZE; + char *buf = malloc(add_size + size * 2, M_NETLINK, mflag | M_ZERO); + if (__predict_false(buf == NULL)) + return (false); + + /* Fill buffer header first */ + struct linear_buffer *lb = (struct linear_buffer *)buf; + lb->base = &buf[sizeof(struct linear_buffer) + size]; + lb->size = size + SCRATCH_BUFFER_SIZE; + + ns->alloc_len = size; + ns->offset = 0; + ns->hdr = NULL; + ns->_buf = buf; + ns->data = (char *)(lb + 1); + ns->malloc_flag = mflag; + ns->writer_type = NS_WRITER_TYPE_LBUF; + return (true); +} + + +static bool +nlmsg_write_socket_lbuf(struct nlmsg_state *ns, char *buf, int datalen) +{ + struct linear_buffer *lb = (struct linear_buffer *)buf; + char *data = (char *)(lb + 1); + struct nlpcb *nlp = (struct nlpcb *)(ns->arg); + + if (__predict_false(datalen == 0)) { + free(buf, M_NETLINK); + return (true); + } + + struct mbuf *m = nlmsgs_to_linux(data, datalen, nlp); + free(buf, M_NETLINK); + + if (__predict_false(m == NULL)) { + /* XXX: should we set sorcverr? 
*/ + return (false); + } + + return (nl_send_one(m, nlp)); +} + +/* Shouldn't be called (maybe except Linux code originating message) */ +static bool +nlmsg_write_group_lbuf(struct nlmsg_state *ns,char *buf, int datalen) +{ + struct linear_buffer *lb = (struct linear_buffer *)buf; + char *data = (char *)(lb + 1); + + if (__predict_false(datalen == 0)) { + free(buf, M_NETLINK); + return (true); + } + + struct mbuf *m = m_getm2(NULL, datalen, ns->malloc_flag, MT_DATA, M_PKTHDR); + if (__predict_false(m == NULL)) { + free(buf, M_NETLINK); + return (false); + } + m_append(m, datalen, data); + free(buf, M_NETLINK); + + nl_send_group(m, (uint32_t)(uintptr_t)(ns->arg)); + return (true); +} + +struct nlwriter_ops nlmsg_writers[] = { + /* NS_WRITER_TYPE_MBUF */ + { + .init = nlmsg_get_ns_mbuf, + .write_socket = nlmsg_write_socket_mbuf, + .write_group = nlmsg_write_group_mbuf, + .write_chain = nlmsg_write_chain_mbuf, + }, + /* NS_WRITER_TYPE_BUF */ + { + .init = nlmsg_get_ns_buf, + .write_socket = nlmsg_write_socket_buf, + .write_group = nlmsg_write_group_buf, + .write_chain = nlmsg_write_chain_buf, + }, + /* NS_WRITER_TYPE_LBUF */ + { + .init = nlmsg_get_ns_lbuf, + .write_socket = nlmsg_write_socket_lbuf, + .write_group = nlmsg_write_group_lbuf, + }, +}; + +static void +nlmsg_set_callback(struct nlmsg_state *ns) +{ + struct nlwriter_ops *pops = &nlmsg_writers[ns->writer_type]; + + switch (ns->writer_target) { + case NS_WRITER_TARGET_SOCKET: + ns->cb = pops->write_socket; + break; + case NS_WRITER_TARGET_GROUP: + ns->cb = pops->write_group; + break; + case NS_WRITER_TARGET_CHAIN: + ns->cb = pops->write_chain; + break; + default: + panic("not implemented"); + } +} + +static bool +nlmsg_get_buf_type(struct nlmsg_state *ns, int size, int type, bool waitok) +{ + MPASS(type + 1 <= sizeof(nlmsg_writers) / sizeof(nlmsg_writers[0])); + RT_LOG(LOG_DEBUG3, "Setting up ns %p size %d type %d", ns, size, type); + return (nlmsg_writers[type].init(ns, size, waitok)); +} + +static bool 
+nlmsg_get_buf(struct nlmsg_state *ns, int size, bool waitok, bool is_linux) +{ + int type; + + if (!is_linux) { + if (__predict_true(size <= MCLBYTES)) + type = NS_WRITER_TYPE_MBUF; + else + type = NS_WRITER_TYPE_BUF; + } else + type = NS_WRITER_TYPE_LBUF; + return (nlmsg_get_buf_type(ns, size, type, waitok)); +} + +bool +nlmsg_get_socket_writer(int size, struct nlpcb *nlp, struct nlmsg_state *ns) +{ + if (!nlmsg_get_buf(ns, size, false, nlp->nl_linux)) + return (false); + ns->arg = (void *)nlp; + ns->writer_target = NS_WRITER_TARGET_SOCKET; + nlmsg_set_callback(ns); + return (true); +} + +bool +nlmsg_get_group_writer(int size, uint32_t group_mask, struct nlmsg_state *ns) +{ + if (!nlmsg_get_buf(ns, size, false, false)) + return (false); + ns->arg = (void *)(uintptr_t)group_mask; + ns->writer_target = NS_WRITER_TARGET_GROUP; + nlmsg_set_callback(ns); + return (true); +} + +bool +nlmsg_get_chain_writer(int size, struct mbuf **pm, struct nlmsg_state *ns) +{ + if (!nlmsg_get_buf(ns, size, false, false)) + return (false); + *pm = NULL; + ns->arg = (void *)pm; + ns->writer_target = NS_WRITER_TARGET_CHAIN; + nlmsg_set_callback(ns); + RT_LOG(LOG_DEBUG3, "setup cb %p (need %p)", ns->cb, &nlmsg_write_chain_mbuf); + return (true); +} + +void +nlmsg_free(struct nlmsg_state *ns) +{ + ns->cb(ns, ns->_buf, 0); +} + +bool +nlmsg_flush(struct nlmsg_state *ns) +{ + + if (__predict_false(ns->hdr != NULL)) { + /* Last message has not been completed, skip it. 
*/ + int completed_len = (char *)ns->hdr - ns->data; + /* Send completed messages */ + ns->offset -= ns->offset - completed_len; + ns->hdr = NULL; + } + + bool result = ns->cb(ns, ns->_buf, ns->offset); + ns->_buf = NULL; + + if (!result) { + RT_LOG(LOG_DEBUG, "ns %p offset %d: flush with %p() failed", ns, ns->offset, ns->cb); + } + + return (result); +} + +static __noinline bool +clear_storage(struct nlmsg_state *ns) +{ + struct nlmsg_state ns_new = {}; + int completed_len, new_len; + RT_LOG(LOG_DEBUG2, "realloc storage: used %d/%d bytes", ns->offset, ns->alloc_len); + + /* Calculated new buffer size and allocate it s*/ + completed_len = (ns->hdr != NULL) ? (char *)ns->hdr - ns->data : ns->offset; + if (completed_len > 0) { + /* We already ran out of space, use the largest effective size */ + new_len = max(ns->alloc_len, MCLBYTES); + } else { + if (ns->alloc_len < MCLBYTES) + new_len = MCLBYTES; + else + new_len = ns->alloc_len * 2; + } + bool waitok = ns->malloc_flag == M_WAITOK; + bool is_linux = ns->writer_type == NS_WRITER_TYPE_LBUF; + if (!nlmsg_get_buf(&ns_new, new_len, waitok, is_linux)) + return (false); + + /* Update callback data */ + ns_new.writer_target = ns->writer_target; + nlmsg_set_callback(&ns_new); + ns_new.arg = ns->arg; + + /* Copy last (unfinished) header to the new storage */ + int last_len = ns->offset - completed_len; + if (last_len > 0) { + memcpy(ns_new.data, ns->hdr, last_len); + ns_new.hdr = (struct nlmsghdr *)ns_new.data; + ns_new.offset = last_len; + } + + RT_LOG(LOG_DEBUG2, "completed: %d bytes, copied: %d bytes", completed_len, last_len); + + /* Flush completed headers */ + if (completed_len > 0) { + RT_LOG(LOG_DEBUG2, "Flushing completed %d bytes", completed_len); + ns->offset -= last_len; + ns->hdr = NULL; + nlmsg_flush(ns); + } + + /* Update state */ + memcpy(ns, &ns_new, sizeof(struct nlmsg_state)); + RT_LOG(LOG_DEBUG2, "switched mbuf: used %d/%d bytes", ns->offset, ns->alloc_len); + + return (true); +} + +/* + * Note it MAY 
invalidate any previous pointers fetched. + */ +void * +nlmsg_reserve_data_raw(struct nlmsg_state *ns, size_t sz) +{ + if (__predict_false(ns->offset + NETLINK_ALIGN(sz) > ns->alloc_len)) { + if (!clear_storage(ns)) + return (NULL); + } + + void *data_ptr = &ns->data[ns->offset]; + + RT_LOG(LOG_DEBUG3, "add %zu bytes of data at offset %d, buf %p data_ptr %p", + sz, ns->offset, ns->data, data_ptr); + + ns->offset += NLMSG_ALIGN(sz); + + return (data_ptr); +} + +bool +nlmsg_add(struct nlmsg_state *ns, uint32_t portid, uint32_t seq, uint16_t type, + uint16_t flags, uint32_t len) +{ + struct nlmsghdr *hdr; + + if (__predict_false(ns->offset + NETLINK_ALIGN(len + sizeof(struct nlmsghdr)) > ns->alloc_len)) { + if (!clear_storage(ns)) + return (false); + } + + hdr = (struct nlmsghdr *)(&ns->data[ns->offset]); + + hdr->nlmsg_len = len; + hdr->nlmsg_type = type; + hdr->nlmsg_flags = flags; + hdr->nlmsg_seq = seq; + hdr->nlmsg_pid = portid; + + ns->hdr = hdr; + ns->offset += sizeof(struct nlmsghdr); + + return (true); +} + +void +nlmsg_end(struct nlmsg_state *ns) +{ + ns->hdr->nlmsg_len = (uint32_t)(ns->data + ns->offset - (char *)ns->hdr); + ns->hdr = NULL; +} + +void +nlmsg_abort(struct nlmsg_state *ns) +{ + if (ns->hdr != NULL) { + ns->offset = (uint32_t)((char *)ns->hdr - ns->data); + ns->hdr = NULL; + } +} + +bool +nlattr_add_handle_oom(struct nlmsg_state *ns, int attr_type, int attr_len, + const void *data) +{ + int required_len = NLA_ALIGN(attr_len + sizeof(struct nlattr)); + + RT_LOG(LOG_DEBUG3, + "no space at offset %d (want %d), alloc_len %d, trying to reclaim", + ns->offset, required_len, ns->alloc_len); + return (clear_storage(ns)); +} Index: sys/netlink/netlink_module.c =================================================================== --- /dev/null +++ sys/netlink/netlink_module.c @@ -0,0 +1,219 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Ng Peng Nam Sean + * Copyright (c) 2022 Alexander V. 
Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +MALLOC_DEFINE(M_NETLINK, "netlink", "Memory used for netlink packets"); + +#define DEBUG_MOD_NAME nl_mod +#define DEBUG_MAX_LEVEL LOG_DEBUG3 +#include +_DECLARE_DEBUG(LOG_DEBUG3); + + +SYSCTL_NODE(_net, OID_AUTO, netlink, CTLFLAG_RD, 0, ""); + +nl_handler nl_handlers[NL_MAX_HANDLERS]; + +CK_LIST_HEAD(nl_control_head, nl_control); +static struct nl_control_head vnets_head = CK_LIST_HEAD_INITIALIZER(); + +VNET_DEFINE(struct nl_control *, nl_ctl) = NULL; + +struct mtx nlsock_mtx; +MTX_SYSINIT(nlsock, &nlsock_mtx, "nlsock for handlers or portid list lock", MTX_DEF); + +#define NL_GLOBAL_LOCK_INIT() mtx_init(&nlsock_mtx, "nlsock global mtx", NULL, MTX_DEF) +#define NL_GLOBAL_LOCK() mtx_lock(&nlsock_mtx) +#define NL_GLOBAL_UNLOCK() mtx_unlock(&nlsock_mtx) + +int netlink_unloading = 0; + +static void +free_nl_ctl(struct nl_control *ctl) +{ + rm_destroy(&ctl->ctl_lock); + free(ctl, M_NETLINK); +} + +struct nl_control * +vnet_nl_ctl_init(void) +{ + struct nl_control *ctl; + + ctl = malloc(sizeof(struct nl_control), M_NETLINK, M_WAITOK | M_ZERO); + rm_init(&ctl->ctl_lock, "netlink lock"); + CK_LIST_INIT(&ctl->ctl_port_head); + CK_LIST_INIT(&ctl->ctl_pcb_head); + + NL_GLOBAL_LOCK(); + + struct nl_control *tmp = atomic_load_ptr(&V_nl_ctl); + + if (tmp == NULL) { + atomic_store_ptr(&V_nl_ctl, ctl); + CK_LIST_INSERT_HEAD(&vnets_head, ctl, ctl_next); + RT_LOG(LOG_DEBUG2, "VNET %p init done, inserted %p into global list", + curvnet, ctl); + } else { + RT_LOG(LOG_DEBUG, "per-VNET init clash, dropping this instance"); + free_nl_ctl(ctl); + ctl = tmp; + } + + NL_GLOBAL_UNLOCK(); + + return (ctl); +} + +static void +vnet_nl_ctl_destroy(const void *unused __unused) +{ + struct nl_control *ctl; + + NL_GLOBAL_LOCK(); + ctl = atomic_load_ptr(&V_nl_ctl); + atomic_store_ptr(&V_nl_ctl, NULL); + if (ctl != NULL) { + 
RT_LOG(LOG_DEBUG2, "Removing %p from global list", ctl); + CK_LIST_REMOVE(ctl, ctl_next); + } + NL_GLOBAL_UNLOCK(); + + if (ctl != NULL) + free_nl_ctl(ctl); +} +VNET_SYSUNINIT(vnet_nl_ctl_destroy, SI_SUB_PROTO_IF, SI_ORDER_ANY, + vnet_nl_ctl_destroy, NULL); + +int +nl_verify_proto(int proto) +{ + if (proto < 0 || proto >= NL_MAX_HANDLERS) { + return (EINVAL); + } + int handler_defined = nl_handlers[proto] != NULL; + return (handler_defined ? 0 : EPROTONOSUPPORT); +} + +bool +netlink_register_proto(int proto, nl_handler handler) +{ + if ((proto < 0) || (proto >= NL_MAX_HANDLERS)) + return (false); + NL_GLOBAL_LOCK(); + KASSERT((nl_handlers[proto] == NULL), ("netlink handler %d is already set", proto)); + nl_handlers[proto] = handler; + NL_GLOBAL_UNLOCK(); + RT_LOG(LOG_DEBUG, "Registered netlink proto %d handler", proto); + return (true); +} + +bool +netlink_unregister_proto(int proto) +{ + if ((proto < 0) || (proto >= NL_MAX_HANDLERS)) + return (false); + NL_GLOBAL_LOCK(); + KASSERT((nl_handlers[proto] != NULL), ("netlink handler %d is not set", proto)); + nl_handlers[proto] = NULL; + NL_GLOBAL_UNLOCK(); + RT_LOG(LOG_DEBUG, "Unregistered netlink proto %d handler", proto); + return (true); +} + + + +static bool +can_unload(void) +{ + struct nl_control *ctl; + bool result = true; + + NL_GLOBAL_LOCK(); + + CK_LIST_FOREACH(ctl, &vnets_head, ctl_next) { + RT_LOG(LOG_DEBUG2, "Iterating VNET head %p", ctl); + if (!CK_LIST_EMPTY(&ctl->ctl_pcb_head)) { + RT_LOG(LOG_NOTICE, "non-empty socket list in ctl %p", ctl); + result = false; + break; + } + } + + NL_GLOBAL_UNLOCK(); + + return (result); +} + +static int +netlink_modevent(module_t mod __unused, int what, void *priv __unused) +{ + int ret = 0; + + switch (what) { + case MOD_LOAD: + RT_LOG(LOG_NOTICE, "Loading"); + break; + + case MOD_UNLOAD: + RT_LOG(LOG_NOTICE, "Unload called"); + if (can_unload()) { + RT_LOG(LOG_WARNING, "unloading"); + netlink_unloading = 1; + } else + ret = EBUSY; + break; + + default: + ret = 
EOPNOTSUPP; + break; + } + + return (ret); +} +static moduledata_t netlink_mod = { "netlink", netlink_modevent, NULL }; + +DECLARE_MODULE(netlink, netlink_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +MODULE_VERSION(netlink, 1); Index: sys/netlink/netlink_nhop.c =================================================================== --- /dev/null +++ sys/netlink/netlink_nhop.c @@ -0,0 +1,318 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); +#include "opt_inet.h" +#include "opt_inet6.h" +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define DEBUG_MOD_NAME nl_nhop +#define DEBUG_MAX_LEVEL LOG_DEBUG3 +#include +_DECLARE_DEBUG(LOG_DEBUG3); + +/* + * idx -> {n:, d:, h:} + * + * + * + */ +struct user_nhop { + uint32_t un_idx; /* Userland-provided index */ + struct nhop_object * un_nhop[3]; /* Normal, host, default */ + struct user_nhop * un_next; + struct epoch_context un_epoch_ctx; /* epoch ctl helper */ +}; + +/* produce hash value for an object */ +#define unhop_hash_obj(_obj) (hash_unhop(_obj)) +/* compare two objects */ +#define unhop_cmp(_one, _two) (cmp_unhop(_one, _two)) +/* next object accessor */ +#define unhop_next(_obj) (_obj)->un_next + +CHT_SLIST_DEFINE(unhop, struct user_nhop); + +VNET_DEFINE_STATIC(struct unhop_head *, nl_nhop_head) = NULL; +#define V_nl_nhop_head VNET(nl_nhop_head) + +static void consider_resize(uint32_t new_gr_buckets); +static int clone_unhop(const struct nhop_object *nh_base, int nh_flags, + struct nhop_object **pnh); + +static int cmp_unhop(const struct user_nhop *a, const struct user_nhop *b); +static unsigned int hash_unhop(const struct user_nhop *obj); + +static int +cmp_unhop(const struct user_nhop *a, const struct user_nhop *b) +{ + return (a->un_idx == b->un_idx); +} + +/* + * Hash callback: calculate hash of an object + */ +static unsigned int +hash_unhop(const struct user_nhop *obj) +{ + return (obj->un_idx); +} + +/* + * Returns object referenced and unlocked + */ +static int +find_unhop(uint32_t uidx, int nh_flags, struct nhop_object **pnhop) +{ + struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); + NLCTL_TRACKER; + int error = 0; + + struct user_nhop key= { .un_idx = uidx }, *unhop; + nh_flags = nh_flags & (NHF_HOST | NHF_DEFAULT); + + NLCTL_RLOCK(ctl); + CHT_SLIST_FIND_BYOBJ(V_nl_nhop_head, unhop, 
&key, unhop); + if (unhop != NULL) { + int off = 0; + switch (nh_flags) { + case NHF_HOST: + off = 1; + break; + case NHF_DEFAULT: + off = 2; + break; + } + if (unhop->un_nhop[off] != NULL) { + *pnhop = unhop->un_nhop[off]; + goto done; + } + /* Nexthop with the required flags does not exist yet. */ + struct nhop_object *nhop = NULL; + error = clone_unhop(unhop->un_nhop[0], nh_flags, &nhop); + if (error != 0) + goto done; + + /* + * Nexhops remains constant once set and get dereferenced + * only when unhop is deleted. + */ + if (!atomic_cmpset_ptr((uintptr_t *)&unhop->un_nhop[off], + (uintptr_t)NULL, (uintptr_t)nhop)) { + nhop_free_any(nhop); + nhop = atomic_load_ptr(&unhop->un_nhop[off]); + } + *pnhop = unhop->un_nhop[off]; + } else + error = ESRCH; +done: + NLCTL_RUNLOCK(ctl); + return (error); +} + +static struct rib_head * +nhop_get_rnh(const struct nhop_object *nh) +{ + return (rt_tables_get_rnh(nhop_get_fibnum(nh), nhop_get_upper_family(nh))); +} + +#define MAX_STACK_NHOPS 4 +static int +clone_unhop(const struct nhop_object *nh_base, int nh_flags, struct nhop_object **pnh) +{ + const struct weightened_nhop *wn; + struct weightened_nhop *wn_new, wn_base[MAX_STACK_NHOPS]; + uint32_t num_nhops; + int error; + + if (!NH_IS_NHGRP(nh_base)) { + struct nhop_object *nh; + nh = nhop_alloc(nhop_get_fibnum(nh_base), + nhop_get_upper_family(nh_base)); + if (nh == NULL) + return (ENOMEM); + nhop_copy(nh, nh_base); + nhop_set_uidx(nh, nhop_get_uidx(nh_base)); + nhop_set_pxtype_flag(nh, nh_flags); + *pnh = nhop_get_nhop(nh, &error); + return (error); + } + + const struct nhgrp_object *nhg_base = (const struct nhgrp_object *)nh_base; + wn = nhgrp_get_nhops(nhg_base, &num_nhops); + + if (num_nhops > MAX_STACK_NHOPS) { + wn_new = malloc(num_nhops * sizeof(struct weightened_nhop), M_TEMP, M_NOWAIT); + if (wn_new == NULL) + return (ENOMEM); + } else + wn_new = wn_base; + + for (int i = 0; i < num_nhops; i++) { + uint32_t uidx = nhop_get_uidx(wn[i].nh); + if (uidx == 0) { + error 
= ESRCH; + break; + } + error = find_unhop(uidx, nh_flags, &wn_new[i].nh); + if (error != 0) + break; + wn_new[i].weight = wn[i].weight; + } + + if (error == 0) { + struct rib_head *rh = nhop_get_rnh(wn_new[0].nh); + error = nhgrp_get_group(rh, wn_new, num_nhops, + (struct nhgrp_object **)pnh); + } + + if (wn_new != wn_base) + free(wn_new, M_TEMP); + return (error); +} + +static void +destroy_unhop_epoch(epoch_context_t ctx) +{ + struct user_nhop *unhop; + + unhop = __containerof(ctx, struct user_nhop, un_epoch_ctx); + + for (int i = 0; i < 3; i++) + nhop_free_any(unhop->un_nhop[i]); + free(unhop, M_NETLINK); +} + + +static void +delete_unhop(struct user_nhop *unhop) +{ + struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); + struct user_nhop *unhop_ret; + + NLCTL_WLOCK(ctl); + CHT_SLIST_REMOVE(V_nl_nhop_head, unhop, unhop, unhop_ret); + NLCTL_WUNLOCK(ctl); + + if (unhop_ret == NULL) { + RT_LOG(LOG_DEBUG, "unable to find unhop %u", unhop->un_idx); + } + MPASS(unhop == unhop_ret); + + epoch_call(net_epoch_preempt, destroy_unhop_epoch, + &unhop->un_epoch_ctx); +} + + +static void +consider_resize(uint32_t new_gr_bucket) +{ + struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); + void *gr_ptr = NULL; + size_t alloc_size; + + if (new_gr_bucket == 0) + return; + + if (new_gr_bucket != 0) { + alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_gr_bucket); + gr_ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO); + if (gr_ptr == NULL) + return; + } + + NLCTL_WLOCK(ctl); + if (gr_ptr != NULL) { + CHT_SLIST_RESIZE(V_nl_nhop_head, unhop, gr_ptr, new_gr_bucket); + } + NLCTL_WUNLOCK(ctl); + + if (gr_ptr != NULL) + free(gr_ptr, M_NETLINK); +} + +static bool __noinline +init_unhops() +{ + struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); + uint32_t num_buckets = 16; + size_t alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets); + + struct unhop_head *phead = malloc(sizeof(struct unhop_head), M_NETLINK, + M_NOWAIT | M_ZERO); + if (phead == NULL) + return (NULL); + + void *ptr = 
malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO); + if (ptr == NULL) + return (false); + CHT_SLIST_INIT(phead, ptr, num_buckets); + + NLCTL_WLOCK(ctl); + if (V_nl_nhop_head == NULL) + V_nl_nhop_head = phead; + else { + free(ptr, M_NETLINK); + free(phead, M_NETLINK); + } + NLCTL_WUNLOCK(ctl); + + return (true); +} + + +int +rtnl_handle_newnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, + struct netlink_parse_tracker *npt) +{ + if ((__predict_false(V_nl_nhop_head == NULL)) && (!init_unhops())) + return (ENOMEM); + + return (0); +} + + + + Index: sys/netlink/netlink_route.h =================================================================== --- /dev/null +++ sys/netlink/netlink_route.h @@ -0,0 +1,42 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Ng Peng Nam Sean + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#ifndef _NETLINK_NETLINK_ROUTE_H_ +#define _NETLINK_NETLINK_ROUTE_H_ + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#endif Index: sys/netlink/netlink_route.c =================================================================== --- /dev/null +++ sys/netlink/netlink_route.c @@ -0,0 +1,1083 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Ng Peng Nam Sean + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); +#include "opt_inet.h" +#include "opt_inet6.h" +#include "opt_route.h" +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_MOD_NAME nl_route +#define DEBUG_MAX_LEVEL LOG_DEBUG3 +#include +_DECLARE_DEBUG(LOG_DEBUG3); + + +#if defined(INET6) || defined(INET) + +#endif + +static unsigned char +get_rtm_type(const struct nhop_object *nh) +{ + int nh_flags = nh->nh_flags; + + /* Use the fact that nhg runtime flags are only NHF_MULTIPATH */ + if (nh_flags & NHF_BLACKHOLE) + return (RTN_BLACKHOLE); + else if (nh_flags & NHF_REJECT) + return (RTN_PROHIBIT); + return (RTN_UNICAST); +} + +static unsigned char +get_rtm_protocol(const struct nhop_object *nh) +{ + if (NH_IS_NHGRP(nh)) { + const struct nhgrp_object *nhg = (const struct nhgrp_object *)nh; + nh = nhg->nhops[0]; + } + int rt_flags = nhop_get_rtflags(nh); + if (rt_flags & RTF_PROTO1) + return (RTPROT_ZEBRA); + if (rt_flags & RTF_STATIC) + return (RTPROT_STATIC); + return (RTPROT_KERNEL); +} + +static int +get_rtmsg_type_from_rtsock(int cmd) +{ + switch (cmd) { + case RTM_ADD: + case RTM_CHANGE: + case RTM_GET: + return NL_RTM_NEWROUTE; + case RTM_DELETE: + return NL_RTM_DELROUTE; + } + + return (0); +} + +static struct sockaddr * +parse_rta_ip4(void *rta_data, struct netlink_parse_tracker *npt, int *perror) +{ + struct sockaddr_in *sin; + 
+ sin = (struct sockaddr_in *)npt_alloc_sockaddr(npt, sizeof(struct sockaddr_in)); + if (__predict_false(sin == NULL)) { + *perror = ENOBUFS; + return (NULL); + } + sin->sin_len = sizeof(struct sockaddr_in); + sin->sin_family = AF_INET; + memcpy(&sin->sin_addr, rta_data, sizeof(struct in_addr)); + return ((struct sockaddr *)sin); +} + +static struct sockaddr * +get_ip4_netmask(uint8_t plen, struct netlink_parse_tracker *npt, int *perror) +{ + struct in_addr mask; + + if (__predict_false(plen > 32)) { + *perror = EINVAL; + return (NULL); + } + + mask.s_addr = htonl(plen ? ~((1 << (32 - plen)) - 1) : 0); + return (parse_rta_ip4(&mask, npt, perror)); +} + +static struct sockaddr * +parse_rta_ip6(void *rta_data, struct netlink_parse_tracker *npt, int *perror) +{ + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)npt_alloc_sockaddr(npt, sizeof(struct sockaddr_in6)); + if (__predict_false(sin6 == NULL)) { + *perror = ENOBUFS; + return (NULL); + } + sin6->sin6_len = sizeof(struct sockaddr_in6); + sin6->sin6_family = AF_INET6; + memcpy(&sin6->sin6_addr, rta_data, sizeof(struct in_addr)); + return ((struct sockaddr *)sin6); +} + +static void +ipv6_writemask(struct in6_addr *addr6, uint8_t mask) +{ + uint32_t *cp; + + for (cp = (uint32_t *)addr6; mask >= 32; mask -= 32) + *cp++ = 0xFFFFFFFF; + if (mask > 0) + *cp = htonl(mask ? 
~((1 << (32 - mask)) - 1) : 0); +} + +static struct sockaddr * +get_ip6_netmask(uint8_t plen, struct netlink_parse_tracker *npt, int *perror) +{ + struct in6_addr mask6; + + if (__predict_false(plen > 128)) { + *perror = EINVAL; + return (NULL); + } + ipv6_writemask(&mask6, plen); + + return (parse_rta_ip6(&mask6, npt, perror)); +} + +static struct sockaddr * +parse_rta_ip(struct rtattr *rta, struct netlink_parse_tracker *npt, int *perror) +{ + void *rta_data = NL_RTA_DATA(rta); + int rta_len = NL_RTA_DATA_LEN(rta); + + if (rta_len == sizeof(struct in_addr)) { + return (parse_rta_ip4(rta_data, npt, perror)); + } else if (rta_len == sizeof(struct in6_addr)) { + return (parse_rta_ip6(rta_data, npt, perror)); + } else { + RT_LOG(LOG_NOTICE, "unknown IP len: %d for rta type %d", + rta_len, rta->rta_type); + *perror = ENOTSUP; + return (NULL); + } + return (NULL); +} + +static struct sockaddr * +parse_rta_via(struct rtattr *rta, struct netlink_parse_tracker *npt, int *perror) +{ + struct rtvia *via = NL_RTA_DATA(rta); + int data_len = NL_RTA_DATA_LEN(rta); + + if (__predict_false(data_len) < sizeof(struct rtvia)) { + *perror = EINVAL; + return (NULL); + } + data_len -= offsetof(struct rtvia, rtvia_addr); + + switch (via->rtvia_family) { + case AF_INET: + if (__predict_false(data_len < sizeof(struct in_addr))) { + *perror = EINVAL; + return (NULL); + } + return (parse_rta_ip4(via->rtvia_addr, npt, perror)); + case AF_INET6: + if (__predict_false(data_len < sizeof(struct in6_addr))) { + *perror = EINVAL; + return (NULL); + } + return (parse_rta_ip6(via->rtvia_addr, npt, perror)); + default: + *perror = ENOTSUP; + return (NULL); + } +} + +static uint32_t +nl_rta_get_uint32(const struct rtattr *rta, int *perror) +{ + if (__predict_false(NL_RTA_DATA_LEN(rta) != sizeof(uint32_t))) { + RT_LOG(LOG_DEBUG2, "nla type %d size(%u) is not uint32", + rta->rta_type, NL_RTA_DATA_LEN(rta)); + *perror = EINVAL; + return (0); + } + *perror = 0; + return (*((const uint32_t 
*)NL_RTA_DATA_CONST(rta))); +} + +static struct ifnet * +parse_rta_oif(const struct rtattr *rta, struct netlink_parse_tracker *npt, int *perror) +{ + uint32_t ifindex = nl_rta_get_uint32(rta, perror); + + NET_EPOCH_ASSERT(); + + if (__predict_false(*perror != 0)) + return (NULL); + + return (ifnet_byindex(ifindex)); +} + +struct path_match_data { + struct sockaddr *pmd_dst; /* Destination address */ + struct sockaddr *pmd_gw; /* Destination gateway */ + struct ifnet *pmd_ifp; /* Desination ifp */ + uint32_t pmd_fibnum; + int pmd_family; +}; + +static int +path_match_func(const struct rtentry *rt, const struct nhop_object *nh, void *_data) +{ + struct path_match_data *pmd = (struct path_match_data *)_data; + + if ((pmd->pmd_gw != NULL) && !rib_match_gw(rt, nh, pmd->pmd_gw)) + return (0); + + if ((pmd->pmd_ifp != NULL) && (pmd->pmd_ifp != nh->nh_ifp)) + return (0); + + return (1); +} + +static int +get_path_match_data(struct nlmsghdr *hdr, struct path_match_data *pmd, + struct netlink_parse_tracker *npt) +{ + struct nlattr *nla, *nla_head; + int error = 0; + + struct rtmsg *rtm = (struct rtmsg *)nlmsg_data(hdr); + int len = hdr->nlmsg_len - NLMSG_HDRLEN; + + pmd->pmd_fibnum = rtm->rtm_table; + pmd->pmd_family = rtm->rtm_family; + + len -= NETLINK_ALIGN(sizeof(struct rtmsg)); + nla_head = (struct nlattr *)((char *)rtm + NETLINK_ALIGN(sizeof(struct rtmsg))); + + RT_LOG(LOG_DEBUG, "parse %p remaining_len %d", nla_head, len); + NLA_FOREACH(nla, nla_head, len) { + struct rtattr *rta = (struct rtattr *)nla; + if (rta->rta_len < sizeof(struct rtattr)) + break; + RT_LOG(LOG_DEBUG2, "parse rta %d len %d", rta->rta_type, rta->rta_len); + + switch (rta->rta_type) { + case NL_RTA_DST: + pmd->pmd_dst = parse_rta_ip(rta, npt, &error); + break; + case NL_RTA_GATEWAY: + pmd->pmd_gw = parse_rta_ip(rta, npt, &error); + break; + case NL_RTA_VIA: + /* Gateway in different AF */ + pmd->pmd_gw = parse_rta_via(rta, npt, &error); + break; + case NL_RTA_OIF: + pmd->pmd_ifp = 
parse_rta_oif(rta, npt, &error); + break; + case NL_RTA_TABLE: + pmd->pmd_fibnum = nl_rta_get_uint32(rta, &error); + if (pmd->pmd_fibnum >= V_rt_numfibs) { + RT_LOG(LOG_DEBUG, "Incorrect fibnum: %u", pmd->pmd_fibnum); + return (EINVAL); + } + break; + } + if (error != 0) + return (error); + } + return (0); +} + +/* + * fibnum heuristics + * + * if (dump && rtm_table == 0 && !rta_table) RT_ALL_FIBS + * msg rtm_table RTA_TABLE result + * RTM_GETROUTE/dump 0 - RT_ALL_FIBS + * RTM_GETROUTE/dump 1 - 1 + * RTM_GETROUTE/get 0 - 0 + * + */ + +static struct nhop_object * +rc_get_nhop(const struct rib_cmd_info *rc) +{ + return ((rc->rc_cmd == RTM_DELETE) ? rc->rc_nh_old : rc->rc_nh_new); +} + +static bool +dump_rc_nhop_gw(struct nlmsg_state *ns, struct nhop_object *nh) +{ + int upper_family; + + switch (nhop_get_neigh_family(nh)) { + case AF_LINK: + /* onlink prefix, skip */ + break; + case AF_INET: + if (!nlattr_add(ns, NL_RTA_GATEWAY, 4, &nh->gw4_sa.sin_addr)) + return (false); + break; + case AF_INET6: + upper_family = nhop_get_upper_family(nh); + if (upper_family == AF_INET6) { + if (!nlattr_add(ns, NL_RTA_GATEWAY, 16, &nh->gw6_sa.sin6_addr)) + return (false); + } else if (upper_family == AF_INET) { + /* IPv4 over IPv6 */ + char buf[20]; + struct rtvia *via = (struct rtvia *)&buf[0]; + via->rtvia_family = AF_INET6; + memcpy(via->rtvia_addr, &nh->gw6_sa.sin6_addr, 16); + if (!nlattr_add(ns, NL_RTA_VIA, 17, via)) + return (false); + } else { + /* shouldn't happen */ + return (false); + } + break; + } + + return (true); + +} + + +static bool +dump_rc_nhop(struct nlmsg_state *ns, const struct rib_cmd_info *rc) +{ + struct nhop_object *nh; + + nh = rc_get_nhop(rc); + /* XXX: can be raw */ + + if (nh == NULL) + return (false); + + /* + * IPv4 over IPv6 + * ('RTA_VIA', {'family': 10, 'addr': 'fe80::20c:29ff:fe67:2dd'}), ('RTA_OIF', 2), + * IPv4 w/ gw + * ('RTA_GATEWAY', '172.16.107.131'), ('RTA_OIF', 2)], + * Direct route: + * ('RTA_OIF', 2) + */ + if (nh->nh_flags & 
NHF_GATEWAY) + dump_rc_nhop_gw(ns, nh); + + /* Add nhop id. XXX: Switch to user nhop id */ + if (!nlattr_add_u32(ns, NL_RTA_NH_ID, nhop_get_idx(nh))) + return (false); + + /* In any case, fill outgoing interface */ + if (!nlattr_add_u32(ns, NL_RTA_OIF, nh->nh_ifp->if_index)) + return (false); + + return (true); +} + +/* + * Dumps output from a rib command into an rtmsg + */ + +static int +dump_rc(uint32_t fibnum, const struct nlmsghdr *hdr, + const struct rib_cmd_info *rc, struct nlmsg_state *ns) +{ + const struct nhop_object *nh = rc_get_nhop(rc); + struct rtmsg *rtm; + int error = 0; + + NET_EPOCH_ASSERT(); + + int payload_len = sizeof(struct rtmsg); + int nlmsgtype = get_rtmsg_type_from_rtsock(rc->rc_cmd); + if (!nlmsg_add(ns, hdr->nlmsg_pid, hdr->nlmsg_seq, nlmsgtype, + hdr->nlmsg_flags, payload_len)) + goto enomem; + + int family = rt_get_family(rc->rc_rt); + rtm = nlmsg_reserve_object(ns, struct rtmsg); + rtm->rtm_family = family; + rtm->rtm_dst_len = 0; + rtm->rtm_src_len = 0; + rtm->rtm_tos = 0; + if (fibnum < 255) + rtm->rtm_table = (unsigned char)fibnum; + rtm->rtm_protocol = get_rtm_protocol(nh); + rtm->rtm_scope = RT_SCOPE_UNIVERSE; + rtm->rtm_type = get_rtm_type(nh); + rtm->rtm_flags = 0; + + if (!nlattr_add_u32(ns, NL_RTA_TABLE, fibnum)) + goto enomem; + + int plen = 0; + uint32_t scopeid = 0; + switch (family) { + case AF_INET: + { + struct in_addr addr; + rt_get_inet_prefix_plen(rc->rc_rt, &addr, &plen, &scopeid); + // XXX: fixme + rtm->rtm_dst_len = plen; + if (!nlattr_add(ns, NL_RTA_DST, 4, &addr)) + goto enomem; + break; + } + case AF_INET6: + { + struct in6_addr addr; + rt_get_inet6_prefix_plen(rc->rc_rt, &addr, &plen, &scopeid); + rtm->rtm_dst_len = plen; + if (!nlattr_add(ns, NL_RTA_DST, 16, &addr)) + goto enomem; + break; + } + default: + FIB_LOG(LOG_NOTICE, fibnum, family, "unknown rt family"); + error = EAFNOSUPPORT; + goto flush; + } + + if (!dump_rc_nhop(ns, rc)) + goto enomem; + +/* + struct nlattr *metrics_nla; + metrics_nla = 
nla_nest_start(m, NL_RTA_METRICS); + nlattr_add_u32(m, NL_RTAX_MTU, nh->nh_mtu); + nla_nest_end(m, metrics_nla); +*/ + nlmsg_end(ns); + return (0); +enomem: + error = ENOMEM; +flush: + nlmsg_abort(ns); + return (error); +} + +static int +family_to_group(int family) +{ + switch (family) { + case AF_INET: + return (RTNLGRP_IPV4_ROUTE); + case AF_INET6: + return (RTNLGRP_IPV6_ROUTE); + } + return (0); +} + + +static void +report_operation(uint32_t fibnum, struct rib_cmd_info *rc, + struct nlpcb *nlp, struct nlmsghdr *hdr) +{ + struct nlmsg_state ns; + + uint32_t group_mask = family_to_group(rt_get_family(rc->rc_rt)); + if (nlmsg_get_group_writer(NLMSG_SMALL, group_mask, &ns)) { + dump_rc(fibnum, hdr, rc, &ns); + nlmsg_flush(&ns); + } +} + + +struct netlink_walkargs { + struct nlmsg_state ns; + struct rib_cmd_info rc; + struct nlmsghdr hdr; + struct nlpcb *nlp; + uint32_t fibnum; + int family; + int error; + int count; + int dumped; + int dumped_tables; +}; + +#if DEBUG_MAX_LEVEL >= LOG_DEBUG3 +static void +print_hex(char *data, int len) +{ + unsigned char buffer[128], *ptr; + + ptr = &buffer[0]; + + for (int i = 0; i < len; i++) { + printf(" WTF? 
%d %d\n", i, (int)(unsigned char)data[i]); + ptr += snprintf(ptr, 128, "%02X ", (unsigned char)data[i]); + } + *ptr = '\0'; + RT_LOG(LOG_DEBUG2, "DBG: %s", buffer); +} +#endif + +static int +dump_rtentry(struct rtentry *rt, void *_arg) +{ + struct netlink_walkargs *wa = (struct netlink_walkargs *)_arg; + int error; + + wa->count++; + if (wa->error != 0) + return (0); + wa->dumped++; + + wa->rc.rc_rt = rt; + wa->rc.rc_nh_new = rt_get_raw_nhop(rt); + + error = dump_rc(wa->fibnum, &wa->hdr, &wa->rc, &wa->ns); + +#if DEBUG_MAX_LEVEL >= LOG_DEBUG2 + char rtbuf[INET6_ADDRSTRLEN + 5]; + FIB_LOG(LOG_DEBUG2, wa->fibnum, wa->family, "Dump %s, offset %u, error %d", + rt_print_buf(rt, rtbuf, sizeof(rtbuf)), wa->ns.offset, error); +#endif + wa->error = error; + + return (0); +} + +static void +dump_rtable_one(struct netlink_walkargs *wa, uint32_t fibnum, int family) +{ + FIB_LOG(LOG_DEBUG, fibnum, family, "Start dump"); + wa->count = 0; + wa->dumped = 0; + + rib_walk(fibnum, family, false, dump_rtentry, wa); + + wa->dumped_tables++; + + FIB_LOG(LOG_DEBUG, fibnum, family, "End dump, iterated %d dumped %d", + wa->count, wa->dumped); + RT_LOG(LOG_DEBUG2, "Current offset: %d", wa->ns.offset); +} + +static int +dump_rtable_fib(struct netlink_walkargs *wa, uint32_t fibnum, int family) +{ + wa->fibnum = fibnum; + + if (family == AF_UNSPEC) { + for (int i = 0; i < AF_MAX; i++) { + if (rt_tables_get_rnh(fibnum, i) != 0) { + wa->family = i; + dump_rtable_one(wa, fibnum, i); + if (wa->error != 0) + break; + } + } + } else { + if (rt_tables_get_rnh(fibnum, family) != 0) { + wa->family = family; + dump_rtable_one(wa, fibnum, family); + } + } + + return (wa->error); +} + + +static int +handle_rtm_getroute(struct nlpcb *nlp, struct path_match_data *pmd, + struct nlmsghdr *hdr) +{ + RIB_RLOCK_TRACKER; + struct rib_head *rnh; + struct nhop_object *nh; + uint32_t fibnum = pmd->pmd_fibnum; + sa_family_t family = pmd->pmd_family; + + if (pmd->pmd_dst == NULL) { + RT_LOG(LOG_DEBUG, "No RTA_DST 
supplied"); + return (EINVAL); + } + + FIB_LOG(LOG_DEBUG, fibnum, family, "getroute called"); + + rnh = rt_tables_get_rnh(fibnum, family); + if (rnh == NULL) + return (EAFNOSUPPORT); + + struct rib_cmd_info rc = {}; + + RIB_RLOCK(rnh); + + rc.rc_rt = (struct rtentry *)rnh->rnh_matchaddr(pmd->pmd_dst, &rnh->head); + if (rc.rc_rt == NULL) { + RIB_RUNLOCK(rnh); + return (ESRCH); + } + + // XXX: multipath + nh = nhop_select_func(rt_get_raw_nhop(rc.rc_rt), 0); + + rc.rc_nh_new = nh; + rc.rc_nh_weight = rc.rc_rt->rt_weight; + rc.rc_cmd = RTM_GET; + RIB_RUNLOCK(rnh); + +#if DEBUG_MAX_LEVEL >= LOG_DEBUG + char rtbuf[INET6_ADDRSTRLEN + 5], nhbuf[INET6_ADDRSTRLEN + 5]; + FIB_LOG(LOG_DEBUG, fibnum, family, "getroute completed: got %s for %s", + nhop_print_buf(rc.rc_nh_new, nhbuf, sizeof(nhbuf)), + rt_print_buf(rc.rc_rt, rtbuf, sizeof(rtbuf))); +#endif + struct nlmsg_state ns = {}; + if (!nlmsg_get_socket_writer(NLMSG_SMALL, nlp, &ns)) + return (ENOMEM); + dump_rc(fibnum, hdr, &rc, &ns); + nlmsg_flush(&ns); + + return (0); +} + +static int +handle_rtm_dump(struct nlpcb *nlp, uint32_t fibnum, int family, + struct nlmsghdr *hdr) +{ + struct netlink_walkargs wa = { + .nlp = nlp, + .rc.rc_cmd = RTM_ADD, + .hdr.nlmsg_pid = hdr->nlmsg_pid, + .hdr.nlmsg_seq = hdr->nlmsg_seq, + .hdr.nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI, + }; + + if (!nlmsg_get_socket_writer(NLMSG_LARGE, nlp, &wa.ns)) { + RT_LOG(LOG_DEBUG, "error allocating mbuf"); + return (ENOMEM); + } + + if (fibnum == RT_TABLE_UNSPEC) { + for (int i = 0; i < V_rt_numfibs; i++) { + dump_rtable_fib(&wa, fibnum, family); + if (wa.error != 0) + break; + } + } else + dump_rtable_fib(&wa, fibnum, family); + + if (wa.error == 0 && wa.dumped_tables == 0) { + FIB_LOG(LOG_DEBUG, fibnum, family, "incorrect fibnum/family"); + wa.error = ESRCH; + // How do we propagate it? 
+ } + + if (wa.error == 0) { + if (!nlmsg_add(&wa.ns, wa.hdr.nlmsg_pid, wa.hdr.nlmsg_seq, NLMSG_DONE, 0, sizeof(int))) { + RT_LOG(LOG_DEBUG, "Error finalizing table dump"); + return (ENOMEM); + } + /* Save operation result */ + int *perror = nlmsg_reserve_object(&wa.ns, int); + RT_LOG(LOG_DEBUG2, "record error=%d at off %d (%p)", wa.error, + wa.ns.offset, perror); + *perror = wa.error; + nlmsg_end(&wa.ns); + } + nlmsg_flush(&wa.ns); + + return (wa.error); +} + +static struct nhop_object * +finalize_nhop(struct nhop_object *nh, int *perror) +{ + /* + * The following MUST be filled: + * nh_ifp, nh_ifa, nh_gw + */ + if (nh->gw_sa.sa_family == 0) { + /* + * Empty gateway. Can be direct route with RTA_OIF set. + */ + if (nh->nh_ifp != NULL) + nhop_set_direct_gw(nh, nh->nh_ifp); + else { + RT_LOG(LOG_DEBUG, "empty gateway and interface, skipping"); + *perror = EINVAL; + return (NULL); + } + /* Both nh_ifp and gateway are set */ + } else { + /* Gateway is set up, we can derive ifp if not set */ + if (nh->nh_ifp == NULL) { + struct ifaddr *ifa = ifa_ifwithnet(&nh->gw_sa, 1, nhop_get_fibnum(nh)); + if (ifa == NULL) { + RT_LOG(LOG_DEBUG, "Unable to determine ifp, skipping"); + *perror = EINVAL; + return (NULL); + } + nhop_set_transmit_ifp(nh, ifa->ifa_ifp); + } + } + /* Both nh_ifp and gateway are set */ + if (nh->nh_ifa == NULL) { + struct ifaddr *ifa = ifaof_ifpforaddr(&nh->gw_sa, nh->nh_ifp); + if (ifa == NULL) { + RT_LOG(LOG_DEBUG, "Unable to determine ifa, skipping"); + *perror = EINVAL; + return (NULL); + } + nhop_set_src(nh, ifa); + } + + return (nhop_get_nhop(nh, perror)); +} + +static int +get_pxflag(const struct rtmsg *rtm) +{ + int pxflag = 0; + switch (rtm->rtm_family) { + case AF_INET: + if (rtm->rtm_dst_len == 32) + pxflag = NHF_HOST; + else if (rtm->rtm_dst_len == 0) + pxflag = NHF_DEFAULT; + break; + case AF_INET6: + if (rtm->rtm_dst_len == 32) + pxflag = NHF_HOST; + else if (rtm->rtm_dst_len == 0) + pxflag = NHF_DEFAULT; + break; + } + + return (pxflag); +} 
+
+/*
+ * Translates the NLM_F_* modifiers of a netlink request into the
+ * RTM_F_* flags passed to rib_add_route_px().
+ */
+static int
+get_rtm_flags(int nlm_flags)
+{
+	int rtm_flags = 0;
+
+	rtm_flags |= (nlm_flags & NLM_F_REPLACE) ? RTM_F_REPLACE : 0;
+	rtm_flags |= (nlm_flags & NLM_F_EXCL) ? RTM_F_EXCL : 0;
+	rtm_flags |= (nlm_flags & NLM_F_CREATE) ? RTM_F_CREATE : 0;
+	rtm_flags |= (nlm_flags & NLM_F_APPEND) ? RTM_F_APPEND : 0;
+
+	return (rtm_flags);
+}
+
+/*
+ * Handles NL_RTM_NEWROUTE: parses the rtmsg header and its attributes,
+ * builds a nexthop from them and adds the prefix via rib_add_route_px().
+ * On success the resulting change is reported with report_operation().
+ *
+ * Returns 0 on success or an errno value: EINVAL for a truncated message,
+ * an out-of-range fib or a missing NL_RTA_DST, ENOMEM if the nexthop cannot
+ * be allocated, otherwise whatever the attribute parsers, finalize_nhop()
+ * or rib_add_route_px() return.
+ */
+static int
+rtnl_handle_newroute(struct nlmsghdr *hdr, struct nlpcb *nlp,
+    struct netlink_parse_tracker *npt)
+{
+	struct sockaddr *sa, *dst = NULL;
+	struct nlattr *nla, *nla_head;
+	struct rib_cmd_info rc = {};
+	struct ifnet *ifp;
+	uint32_t fibnum;
+	int error = 0;
+
+	struct rtmsg *rtm = (struct rtmsg *)nlmsg_data(hdr);
+	int len = hdr->nlmsg_len - NLMSG_HDRLEN;
+
+	len -= NETLINK_ALIGN(sizeof(struct rtmsg));
+	/* Do not touch rtm fields unless the message can hold the header */
+	if (len < 0) {
+		RT_LOG(LOG_DEBUG, "message too short: %u", hdr->nlmsg_len);
+		return (EINVAL);
+	}
+	nla_head = (struct nlattr *)((char *)rtm + NETLINK_ALIGN(sizeof(struct rtmsg)));
+
+	/* Fib numbers are zero-based, so V_rt_numfibs itself is out of range */
+	fibnum = rtm->rtm_table;
+	if (fibnum >= V_rt_numfibs) {
+		RT_LOG(LOG_DEBUG, "incorrect fibnum: %u", fibnum);
+		return (EINVAL);
+	}
+
+	struct nhop_object *nh = nhop_alloc(fibnum, rtm->rtm_family);
+	if (nh == NULL)
+		return (ENOMEM);
+
+	RT_LOG(LOG_DEBUG, "parse %p remaining_len %d", nla_head, len);
+	NLA_FOREACH(nla, nla_head, len) {
+		struct rtattr *rta = (struct rtattr *)nla;
+		if (rta->rta_len < sizeof(struct rtattr)) {
+			RT_LOG(LOG_NOTICE, "invalid length for attribute %d, stopping processing",
+			    rta->rta_type);
+			/*
+			 * NOTE(review): parsing stops here but error stays 0,
+			 * so the request proceeds with the attributes seen so
+			 * far (the original had "error = EINVAL" commented
+			 * out) - confirm this is intended.
+			 */
+			break;
+		}
+		RT_LOG(LOG_DEBUG2, "parse rta %d len %d", rta->rta_type, rta->rta_len);
+
+		switch (rta->rta_type) {
+		case NL_RTA_DST:
+			dst = parse_rta_ip(rta, npt, &error);
+			break;
+		case NL_RTA_GATEWAY:
+			sa = parse_rta_ip(rta, npt, &error);
+			if (sa != NULL)
+				nhop_set_gw(nh, sa, true);
+			break;
+		case NL_RTA_VIA:
+			/* Gateway in different AF */
+			sa = parse_rta_via(rta, npt, &error);
+			if (sa != NULL)
+				nhop_set_gw(nh, sa, true);
+			break;
+		case NL_RTA_OIF:
+			ifp = parse_rta_oif(rta, npt, &error);
+			if (ifp != NULL)
+				nhop_set_transmit_ifp(nh, ifp);
+			break;
+		case NL_RTA_TABLE:
+			/* Overrides the 8-bit rtm_table from the header */
+			fibnum = nl_rta_get_uint32(rta, &error);
+			if (fibnum >= V_rt_numfibs) {
+				RT_LOG(LOG_DEBUG, "incorrect fibnum: %u", fibnum);
+				error = EINVAL;
+			} else
+				nhop_set_fibnum(nh, fibnum);
+			break;
+		default:
+			RT_LOG(LOG_DEBUG, "unsupported rta_type %d", rta->rta_type);
+			break;
+		}
+
+		if (__predict_false(error != 0))
+			break;
+	}
+
+	if (error != 0) {
+		nhop_free(nh);
+		return (error);
+	}
+
+	/* Check if we have enough data */
+	if (dst == NULL) {
+		RT_LOG(LOG_DEBUG, "missing NL_RTA_DST");
+		/* The nexthop is still owned by us at this point */
+		nhop_free(nh);
+		return (EINVAL);
+	}
+
+	int pxflag = get_pxflag(rtm);
+	if (pxflag != 0)
+		nhop_set_pxtype_flag(nh, pxflag);
+
+	struct route_nhop_data rnd = { .rnd_weight = RT_DEFAULT_WEIGHT };
+	rnd.rnd_nhop = finalize_nhop(nh, &error);
+	if (error != 0) {
+		/*
+		 * NOTE(review): assumes finalize_nhop() releases nh on
+		 * failure - confirm against its definition.
+		 */
+		RT_LOG(LOG_DEBUG, "unable to finalize nexthop: error %d", error);
+		return (error);
+	}
+
+	int rtm_flags = get_rtm_flags(hdr->nlmsg_flags);
+
+	error = rib_add_route_px(fibnum, dst, rtm->rtm_dst_len, &rnd, rtm_flags, &rc);
+	if (error == 0)
+		report_operation(fibnum, &rc, nlp, hdr);
+	return (error);
+}
+
+/*
+ * Handles NL_RTM_DELROUTE: extracts the match data from the request and
+ * removes the matching path with rib_del_route_px().
+ * Returns ESRCH if the request carries no destination.
+ */
+static int
+rtnl_handle_delroute(struct nlmsghdr *hdr, struct nlpcb *nlp,
+    struct netlink_parse_tracker *npt)
+{
+	struct rib_cmd_info rc;
+	struct sockaddr *dst;
+	int error;
+
+	struct path_match_data pmd = {};
+	error = get_path_match_data(hdr, &pmd, npt);
+	if (error != 0)
+		return (error);
+
+	if ((dst = pmd.pmd_dst) == NULL) {
+		RT_LOG(LOG_DEBUG, "No dst to delete");
+		return (ESRCH);
+	}
+	pmd.pmd_dst = NULL;
+	struct rtmsg *rtm = (struct rtmsg *)nlmsg_data(hdr);
+
+	error = rib_del_route_px(pmd.pmd_fibnum, dst, rtm->rtm_dst_len,
+	    path_match_func, &pmd, 0, &rc);
+	if (error == 0)
+		report_operation(pmd.pmd_fibnum, &rc, nlp, hdr);
+	return (error);
+}
+
+/*
+ * Handles NL_RTM_GETROUTE: a request with NLM_F_DUMP set is passed to
+ * handle_rtm_dump(), any other request to handle_rtm_getroute().
+ */
+static int
+rtnl_handle_getroute(struct nlmsghdr *hdr, struct nlpcb *nlp, struct netlink_parse_tracker *npt)
+{
+	struct path_match_data pmd = {};
+	int error;
+
+	error = get_path_match_data(hdr, &pmd, npt);
+	if (error != 0)
+		return (error);
+
+	if (hdr->nlmsg_flags & NLM_F_DUMP)
+		error = handle_rtm_dump(nlp, pmd.pmd_fibnum, pmd.pmd_family, hdr);
+	else
+		error = handle_rtm_getroute(nlp, &pmd, hdr);
+
+	return (error);
+}
+
+/*
+ * Dispatches a NETLINK_ROUTE message to the handler for its nlmsg_type.
+ * Returns the handler result, or EOPNOTSUPP for unsupported types.
+ */
+static int
+rtnl_handle_message(struct nlmsghdr *hdr, struct nlpcb *nlp, struct netlink_parse_tracker *npt)
+{
+	int error = 0;
+
+	RT_LOG(LOG_DEBUG2, "received msg type %d (pid %u)", hdr->nlmsg_type,
+	    hdr->nlmsg_pid);
+	/* XXX: check min header length */
+	switch (hdr->nlmsg_type) {
+	case NL_RTM_NEWROUTE:
+		error = rtnl_handle_newroute(hdr, nlp, npt);
+		break;
+	case NL_RTM_DELROUTE:
+		error = rtnl_handle_delroute(hdr, nlp, npt);
+		break;
+	case NL_RTM_GETROUTE:
+		error = rtnl_handle_getroute(hdr, nlp, npt);
+		break;
+	case NL_RTM_GETLINK:
+		error = rtnl_handle_getlink(hdr, nlp, npt);
+		break;
+	case NL_RTM_GETADDR:
+		error = rtnl_handle_getaddr(hdr, nlp, npt);
+		break;
+	case NL_RTM_NEWNEXTHOP:
+		error = rtnl_handle_newnhop(hdr, nlp, npt);
+		/* Do not fall through: default would overwrite the result */
+		break;
+	default:
+		RT_LOG(LOG_DEBUG, "msg type %d unsupported (pid %u)",
+		    hdr->nlmsg_type, hdr->nlmsg_pid);
+		error = EOPNOTSUPP;
+		break;
+	}
+
+	return (error);
+}
+
+/*
+ * Handler called by netlink subsystem when matching netlink message is received
+ */
+static int
+rtnl_receive_message(struct nlmsghdr *hdr, struct netlink_parse_tracker *npt)
+{
+	struct epoch_tracker et;
+	int error;
+
+	/* The message is processed inside the network epoch */
+	NET_EPOCH_ENTER(et);
+	error = rtnl_handle_message(hdr, npt->nlp, npt);
+	NET_EPOCH_EXIT(et);
+
+	return (error);
+}
+
+/*
+ * Converts a routing table change into a netlink message and sends it to
+ * the group selected by family_to_group() for the route's address family.
+ * The info argument is currently unused.
+ */
+static void
+handle_route_event(uint32_t fibnum, const struct rt_addrinfo *info,
+    const struct rib_cmd_info *rc)
+{
+	int family, nlm_flags = 0;
+
+	struct nlmsg_state ns;
+
+	family = rt_get_family(rc->rc_rt);
+
+	/* XXX: check if there are active listeners first */
+
+	/* TODO: consider passing PID/type/seq */
+	switch (rc->rc_cmd) {
+	case RTM_ADD:
+		nlm_flags = NLM_F_EXCL | NLM_F_CREATE;
+		break;
+	case RTM_CHANGE:
+		nlm_flags = NLM_F_REPLACE;
+		break;
+	case RTM_DELETE:
+		nlm_flags = 0;
+		break;
+	}
+#if DEBUG_MAX_LEVEL >= LOG_DEBUG
+	char rtbuf[INET6_ADDRSTRLEN + 5];
+	FIB_LOG(LOG_DEBUG, fibnum, family, "received event %s for %s / nlm_flags=%X",
+	    rib_print_cmd(rc->rc_cmd), rt_print_buf(rc->rc_rt, rtbuf, sizeof(rtbuf)),
+	    nlm_flags);
+#endif
+	struct nlmsghdr hdr = {
+		.nlmsg_flags = nlm_flags,
+	};
+
+	uint32_t group_mask = family_to_group(family);
+
+	if (!nlmsg_get_group_writer(NLMSG_SMALL, group_mask, &ns)) {
+		RT_LOG(LOG_DEBUG, "error allocating mbuf");
+		return;
+	}
+
+	dump_rc(fibnum, &hdr, rc, &ns);
+	nlmsg_flush(&ns);
+}
+
+/*
+ * Callback registered with the rib event bridge; only NLBR_EVENT_ROUTE
+ * events are handled, everything else is ignored.
+ */
+static void
+nlbridge_cb_func(uint32_t event_type, uint32_t fibnum,
+    const struct rt_addrinfo *info, const struct rib_cmd_info *rc, void *arg)
+{
+	RT_LOG(LOG_DEBUG2, "received bridge event %d", event_type);
+	switch (event_type) {
+	case NLBR_EVENT_ROUTE:
+		handle_route_event(fibnum, info, rc);
+		break;
+	}
+}
+
+static struct rib_event_bridge nlbridge = {
+	.reb_cb = nlbridge_cb_func,
+	.reb_cb_arg = NULL,
+	.reb_provider_id = NLBR_PROVIDER_NETLINK,
+};
+
+/*
+ * Links the rib event bridge, initializes interface support and registers
+ * rtnl_receive_message() as the NETLINK_ROUTE handler.
+ */
+static void
+rtnl_load(void *u __unused)
+{
+	RT_LOG(LOG_ERR, "netlink support is in ALPHA stage");
+	RT_LOG(LOG_NOTICE, "rtnl loading");
+	rib_bridge_link(&nlbridge);
+	rtnl_ifaces_init();
+	netlink_register_proto(NETLINK_ROUTE, rtnl_receive_message);
+}
+SYSINIT(rtnl_load, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rtnl_load, NULL);
+
+/*
+ * Unlinks the rib event bridge and tears down interface support.
+ */
+static void
+rtnl_unload(void *u __unused)
+{
+	rib_bridge_unlink(&nlbridge);
+	rtnl_ifaces_destroy();
+
+	/* Wait till all consumers read nlbridge data */
+	epoch_wait_preempt(net_epoch_preempt);
+}
+SYSUNINIT(rtnl_unload, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rtnl_unload, NULL);
Index: sys/netlink/netlink_var.h
===================================================================
--- /dev/null
+++ sys/netlink/netlink_var.h
@@ -0,0 +1,200 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2021 Ng Peng Nam Sean
+ * Copyright (c) 2022 Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ +#ifndef _NETLINK_NETLINK_VAR_H_ +#define _NETLINK_NETLINK_VAR_H_ + +#include +#include +#include +#include + +MALLOC_DECLARE(M_NETLINK); + +#define NLSNDQ 65536 /* Default socket sendspace */ +#define NLRCVQ 65536 /* Default socket recvspace */ + +struct nlpcb { + struct socket *nl_socket; + uint32_t nl_port; + uint32_t nl_groups; + uint32_t nl_flags; + uint32_t nl_process_id; + int nl_proto; + bool nl_active; + bool nl_task_pending; + bool nl_linux; /* true if running under compat */ + struct mbuf *nl_queue_head; + struct mbuf *nl_queue_last; + int64_t nl_queue_length; + struct taskqueue *nl_taskqueue; + struct task nl_task; + CK_LIST_ENTRY(nlpcb) nl_next; + CK_LIST_ENTRY(nlpcb) nl_port_next; + volatile u_int nl_refcount; + struct mtx nl_lock; + struct epoch_context nl_epoch_ctx; +}; +#define sotonlpcb(so) ((struct nlpcb *)(so)->so_pcb) + +#define NLP_LOCK_INIT(_nlp) mtx_init(&((_nlp)->nl_lock), "nlp mtx", NULL, MTX_DEF) +#define NLP_LOCK_DESTROY(_nlp) mtx_destroy(&((_nlp)->nl_lock)) +#define NLP_LOCK(_nlp) mtx_lock(&((_nlp)->nl_lock)) +#define NLP_UNLOCK(_nlp) mtx_unlock(&((_nlp)->nl_lock)) + +#define ALIGNED_NL_SZ(_data) roundup2((((struct nlmsghdr *)(_data))->nlmsg_len), 16) + +#define NLF_CAP_ACK 0x01 /* Do not send message body with errmsg */ +#define NLF_EXT_ACK 0x02 /* Allow including extended TLVs in ack */ + +#define NETISR_NETLINK 15 // XXX hack, must be unused and < 16 + + +SYSCTL_DECL(_net_netlink); + +struct nl_io { + struct callout callout; + struct mbuf *head; + struct mbuf *last; + int64_t length; +}; + + +struct nl_control { + CK_LIST_HEAD(nl_pid_head, nlpcb) ctl_port_head; + CK_LIST_HEAD(nlpcb_head, nlpcb) ctl_pcb_head; + CK_LIST_ENTRY(nl_control) ctl_next; + struct nl_io ctl_io; + struct rmlock ctl_lock; +}; +VNET_DECLARE(struct nl_control *, nl_ctl); +#define V_nl_ctl VNET(nl_ctl) + + +/* locking */ +#define NLCTL_TRACKER struct rm_priotracker nl_tracker +#define NLCTL_RLOCK(_ctl) rm_rlock(&((_ctl)->ctl_lock), &nl_tracker) +#define 
NLCTL_RUNLOCK(_ctl)	rm_runlock(&((_ctl)->ctl_lock), &nl_tracker)
+
+#define	NLCTL_WLOCK(_ctl)	rm_wlock(&((_ctl)->ctl_lock))
+#define	NLCTL_WUNLOCK(_ctl)	rm_wunlock(&((_ctl)->ctl_lock))
+
+struct sockaddr_nl;
+struct sockaddr;
+struct nlmsghdr;
+
+/* Parsing state */
+
+/* Bump allocator state over a caller-provided memory region */
+struct linear_buffer {
+	char		*base;	/* Base allocated memory pointer */
+	uint32_t	offset;	/* Currently used offset */
+	uint32_t	size;	/* Total buffer size */
+};
+
+/*
+ * Carves len bytes, rounded up to a multiple of sizeof(uint64_t), out of
+ * the buffer and advances the used offset.
+ * Returns NULL if len is negative or the rounded request does not fit
+ * into the remaining space.
+ */
+static inline void *
+lb_alloc(struct linear_buffer *lb, int len)
+{
+	uint32_t alen;
+
+	/*
+	 * A negative length would be converted to a huge unsigned value,
+	 * wrap the bounds check below and move the offset backwards.
+	 */
+	if (len < 0)
+		return (NULL);
+	alen = roundup2((uint32_t)len, (uint32_t)sizeof(uint64_t));
+	/* offset never exceeds size, so the subtraction cannot wrap */
+	if (alen > lb->size - lb->offset)
+		return (NULL);
+	void *data = (void *)(lb->base + lb->offset);
+	lb->offset += alen;
+	return (data);
+}
+
+/*
+ * Zeroes the whole buffer and makes all of it available again.
+ */
+static inline void
+lb_clear(struct linear_buffer *lb)
+{
+	memset(lb->base, 0, lb->size);
+	lb->offset = 0;
+}
+
+#define	SCRATCH_BUFFER_SIZE	1024
+struct netlink_parse_tracker {
+	struct linear_buffer	lb;	/* Per-message scratch buffer */
+	struct nlpcb		*nlp;	/* Originator */
+	struct nlmsghdr		*hdr;	/* Current message being parsed */
+	int			error;	/* last operation error */
+};
+
+/*
+ * Allocates len bytes from the tracker's scratch buffer; see lb_alloc().
+ */
+static inline void *
+npt_alloc(struct netlink_parse_tracker *npt, int len)
+{
+	return (lb_alloc(&npt->lb, len));
+}
+#define	npt_alloc_sockaddr(_npt, _len)	((struct sockaddr *)(npt_alloc(_npt, _len)))
+
+/* netlink_netisr.c */
+void netlink_netisr_init(void);
+void netlink_netisr_destroy(void);
+void netlink_netisr_vnet_init(void);
+void netlink_netisr_vnet_destroy(void);
+int nl_send_msg(struct mbuf *m, uint32_t group_mask);
+void nl_msg_from_netlink(struct mbuf *m);
+
+extern struct netisr_handler nlsock_nh;
+
+/* netlink_io.c */
+void nl_taskqueue_handler(void *_arg, int pending);
+int nl_receive_async(struct mbuf *m, struct socket *so);
+void nl_process_receive_locked(struct nlpcb *nlp);
+
+/* netlink_iface.c */
+struct rt_addrinfo;
+int rtnl_handle_getlink(struct nlmsghdr *hdr, struct nlpcb *nlp,
+    struct netlink_parse_tracker *npt);
+int rtnl_handle_getaddr(struct nlmsghdr *hdr, struct nlpcb *nlp,
+    
struct netlink_parse_tracker *npt); +void rtnl_ifaces_init(void); +void rtnl_ifaces_destroy(void); + +/* netlink_module.c */ +struct nl_control *vnet_nl_ctl_init(void); + +int nl_verify_proto(int proto); + +extern int netlink_unloading; + +#define NL_MAX_HANDLERS 100 +extern nl_handler nl_handlers[NL_MAX_HANDLERS]; + +/* netlink_nhop.c */ +int rtnl_handle_newnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, + struct netlink_parse_tracker *npt); + +/* netlink_io.c */ +void nl_send_group(struct mbuf *m, uint32_t group_mask); +bool nl_send_one(struct mbuf *m, struct nlpcb *nlp); +void nlmsg_ack(struct nlpcb *nlp, int error, struct nlmsghdr *nlmsg); + +/* Linux compat */ +struct nlmsghdr *nlmsg_from_linux(struct nlmsghdr *hdr, + struct netlink_parse_tracker *npt); +struct mbuf *nlmsgs_to_linux(char *buf, int buflen, struct nlpcb *nlp); +struct mbuf *mbufs_to_linux(struct mbuf *m, struct nlpcb *nlp); + +#endif Index: sys/netlink/route/common.h =================================================================== --- /dev/null +++ sys/netlink/route/common.h @@ -0,0 +1,243 @@ +/* + * Common defines for all parts of the netlink route family + */ + +#ifndef _NETLINK_ROUTE_COMMON_H_ +#define _NETLINK_ROUTE_COMMON_H_ + +/* + * All messages defined by the NETLINK_ROUTE subsystem + */ +enum { + NL_RTM_BASE = 16, +#define NL_RTM_BASE NL_RTM_BASE + NL_RTM_NEWLINK = 16, +#define NL_RTM_NEWLINK NL_RTM_NEWLINK + NL_RTM_DELLINK, +#define NL_RTM_DELLINK NL_RTM_DELLINK + NL_RTM_GETLINK, +#define NL_RTM_GETLINK NL_RTM_GETLINK + NL_RTM_SETLINK, +#define NL_RTM_SETLINK NL_RTM_SETLINK + NL_RTM_NEWADDR = 20, +#define NL_RTM_NEWADDR NL_RTM_NEWADDR + NL_RTM_DELADDR, +#define NL_RTM_DELADDR NL_RTM_DELADDR + NL_RTM_GETADDR, +#define NL_RTM_GETADDR NL_RTM_GETADDR + NL_RTM_NEWROUTE = 24, +#define NL_RTM_NEWROUTE NL_RTM_NEWROUTE + NL_RTM_DELROUTE, +#define NL_RTM_DELROUTE NL_RTM_DELROUTE + NL_RTM_GETROUTE, +#define NL_RTM_GETROUTE NL_RTM_GETROUTE + NL_RTM_NEWNEIGH = 28, +#define NL_RTM_NEWNEIGH 
NL_RTM_NEWNEIGH + NL_RTM_DELNEIGH, +#define NL_RTM_DELNEIGH NL_RTM_DELNEIGH + NL_RTM_GETNEIGH, +#define NL_RTM_GETNEIGH NL_RTM_GETNEIGH + NL_RTM_NEWRULE = 32, +#define NL_RTM_NEWRULE NL_RTM_NEWRULE + NL_RTM_DELRULE, +#define NL_RTM_DELRULE NL_RTM_DELRULE + NL_RTM_GETRULE, +#define NL_RTM_GETRULE NL_RTM_GETRULE + NL_RTM_NEWQDISC = 36, +#define NL_RTM_NEWQDISC NL_RTM_NEWQDISC + NL_RTM_DELQDISC, +#define NL_RTM_DELQDISC NL_RTM_DELQDISC + NL_RTM_GETQDISC, +#define NL_RTM_GETQDISC NL_RTM_GETQDISC + NL_RTM_NEWTCLASS = 40, +#define NL_RTM_NEWTCLASS NL_RTM_NEWTCLASS + NL_RTM_DELTCLASS, +#define NL_RTM_DELTCLASS NL_RTM_DELTCLASS + NL_RTM_GETTCLASS, +#define NL_RTM_GETTCLASS NL_RTM_GETTCLASS + NL_RTM_NEWTFILTER = 44, +#define NL_RTM_NEWTFILTER NL_RTM_NEWTFILTER + NL_RTM_DELTFILTER, +#define NL_RTM_DELTFILTER NL_RTM_DELTFILTER + NL_RTM_GETTFILTER, +#define NL_RTM_GETTFILTER NL_RTM_GETTFILTER + NL_RTM_NEWACTION = 48, +#define NL_RTM_NEWACTION NL_RTM_NEWACTION + NL_RTM_DELACTION, +#define NL_RTM_DELACTION NL_RTM_DELACTION + NL_RTM_GETACTION, +#define NL_RTM_GETACTION NL_RTM_GETACTION + NL_RTM_NEWPREFIX = 52, +#define NL_RTM_NEWPREFIX NL_RTM_NEWPREFIX + NL_RTM_GETMULTICAST = 58, +#define NL_RTM_GETMULTICAST NL_RTM_GETMULTICAST + NL_RTM_GETANYCAST = 62, +#define NL_RTM_GETANYCAST NL_RTM_GETANYCAST + NL_RTM_NEWNEIGHTBL = 64, +#define NL_RTM_NEWNEIGHTBL NL_RTM_NEWNEIGHTBL + NL_RTM_GETNEIGHTBL = 66, +#define NL_RTM_GETNEIGHTBL NL_RTM_GETNEIGHTBL + NL_RTM_SETNEIGHTBL, +#define NL_RTM_SETNEIGHTBL NL_RTM_SETNEIGHTBL + NL_RTM_NEWNDUSEROPT = 68, +#define NL_RTM_NEWNDUSEROPT NL_RTM_NEWNDUSEROPT + NL_RTM_NEWADDRLABEL = 72, +#define NL_RTM_NEWADDRLABEL NL_RTM_NEWADDRLABEL + NL_RTM_DELADDRLABEL, +#define NL_RTM_DELADDRLABEL NL_RTM_DELADDRLABEL + NL_RTM_GETADDRLABEL, +#define NL_RTM_GETADDRLABEL NL_RTM_GETADDRLABEL + NL_RTM_GETDCB = 78, +#define NL_RTM_GETDCB NL_RTM_GETDCB + NL_RTM_SETDCB, +#define NL_RTM_SETDCB NL_RTM_SETDCB + NL_RTM_NEWNETCONF = 80, +#define NL_RTM_NEWNETCONF 
NL_RTM_NEWNETCONF + NL_RTM_GETNETCONF = 82, +#define NL_RTM_GETNETCONF NL_RTM_GETNETCONF + NL_RTM_NEWMDB = 84, +#define NL_RTM_NEWMDB NL_RTM_NEWMDB + NL_RTM_DELMDB = 85, +#define NL_RTM_DELMDB NL_RTM_DELMDB + NL_RTM_GETMDB = 86, +#define NL_RTM_GETMDB NL_RTM_GETMDB + NL_RTM_NEWNSID = 88, +#define NL_RTM_NEWNSID NL_RTM_NEWNSID + NL_RTM_DELNSID = 89, +#define NL_RTM_DELNSID NL_RTM_DELNSID + NL_RTM_GETNSID = 90, +#define NL_RTM_GETNSID NL_RTM_GETNSID + NL_RTM_NEWSTATS = 92, +#define NL_RTM_NEWSTATS NL_RTM_NEWSTATS + NL_RTM_GETSTATS = 94, +#define NL_RTM_GETSTATS NL_RTM_GETSTATS + NL_RTM_NEWNEXTHOP = 104, +#define NL_RTM_NEWNEXTHOP NL_RTM_NEWNEXTHOP + NL_RTM_DELNEXTHOP, +#define NL_RTM_DELNEXTHOP NL_RTM_DELNEXTHOP + NL_RTM_GETNEXTHOP, +#define NL_RTM_GETNEXTHOP NL_RTM_GETNEXTHOP + __NL_RTM_MAX, +}; +#define NL_RTM_MAX (((__NL_RTM_MAX + 3) & ~3) - 1) + +#ifndef _KERNEL +/* + * RTM_* namespace clashes with BSD rtsock namespace. + * Use NL_RTM_ prefix in the kernel and map it to RTM_ + * for userland. 
+ */ +#define RTM_BASE NL_RTM_BASE +#define RTM_NEWLINK NL_RTM_NEWLINK +#define RTM_DELLINK NL_RTM_DELLINK +#define RTM_GETLINK NL_RTM_GETLINK +#define RTM_SETLINK NL_RTM_SETLINK +#define RTM_NEWADDR NL_RTM_NEWADDR +#define RTM_DELADDR NL_RTM_DELADDR +#define RTM_GETADDR NL_RTM_GETADDR +#define RTM_NEWROUTE NL_RTM_NEWROUTE +#define RTM_DELROUTE NL_RTM_DELROUTE +#define RTM_GETROUTE NL_RTM_GETROUTE +#define RTM_NEWNEXTHOP NL_RTM_NEWNEXTHOP +#define RTM_DELNEXTHOP NL_RTM_DELNEXTHOP +#define RTM_GETNEXTHOP NL_RTM_GETNEXTHOP +#endif + +#ifndef _KERNEL +/* rtnetlink multicast groups - backwards compatibility for userspace */ +#define RTMGRP_LINK 0x01 +#define RTMGRP_NOTIFY 0x02 +#define RTMGRP_NEIGH 0x04 +#define RTMGRP_TC 0x08 + +#define RTMGRP_IPV4_IFADDR 0x10 +#define RTMGRP_IPV4_MROUTE 0x20 +#define RTMGRP_IPV4_ROUTE 0x40 +#define RTMGRP_IPV4_RULE 0x80 + +#define RTMGRP_IPV6_IFADDR 0x100 +#define RTMGRP_IPV6_MROUTE 0x200 +#define RTMGRP_IPV6_ROUTE 0x400 +#define RTMGRP_IPV6_IFINFO 0x800 + +#define RTMGRP_DECnet_IFADDR 0x1000 +#define RTMGRP_DECnet_ROUTE 0x4000 + +#define RTMGRP_IPV6_PREFIX 0x20000 +#endif + +/* rtnetlink multicast groups */ +enum rtnetlink_groups { + RTNLGRP_NONE, +#define RTNLGRP_NONE RTNLGRP_NONE + RTNLGRP_LINK, +#define RTNLGRP_LINK RTNLGRP_LINK + RTNLGRP_NOTIFY, +#define RTNLGRP_NOTIFY RTNLGRP_NOTIFY + RTNLGRP_NEIGH, +#define RTNLGRP_NEIGH RTNLGRP_NEIGH + RTNLGRP_TC, +#define RTNLGRP_TC RTNLGRP_TC + RTNLGRP_IPV4_IFADDR, +#define RTNLGRP_IPV4_IFADDR RTNLGRP_IPV4_IFADDR + RTNLGRP_IPV4_MROUTE, +#define RTNLGRP_IPV4_MROUTE RTNLGRP_IPV4_MROUTE + RTNLGRP_IPV4_ROUTE, +#define RTNLGRP_IPV4_ROUTE RTNLGRP_IPV4_ROUTE + RTNLGRP_IPV4_RULE, +#define RTNLGRP_IPV4_RULE RTNLGRP_IPV4_RULE + RTNLGRP_IPV6_IFADDR, +#define RTNLGRP_IPV6_IFADDR RTNLGRP_IPV6_IFADDR + RTNLGRP_IPV6_MROUTE, +#define RTNLGRP_IPV6_MROUTE RTNLGRP_IPV6_MROUTE + RTNLGRP_IPV6_ROUTE, +#define RTNLGRP_IPV6_ROUTE RTNLGRP_IPV6_ROUTE + RTNLGRP_IPV6_IFINFO, +#define RTNLGRP_IPV6_IFINFO 
RTNLGRP_IPV6_IFINFO + RTNLGRP_DECnet_IFADDR, +#define RTNLGRP_DECnet_IFADDR RTNLGRP_DECnet_IFADDR + RTNLGRP_NOP2, + RTNLGRP_DECnet_ROUTE, +#define RTNLGRP_DECnet_ROUTE RTNLGRP_DECnet_ROUTE + RTNLGRP_DECnet_RULE, +#define RTNLGRP_DECnet_RULE RTNLGRP_DECnet_RULE + RTNLGRP_NOP4, + RTNLGRP_IPV6_PREFIX, +#define RTNLGRP_IPV6_PREFIX RTNLGRP_IPV6_PREFIX + RTNLGRP_IPV6_RULE, +#define RTNLGRP_IPV6_RULE RTNLGRP_IPV6_RULE + RTNLGRP_ND_USEROPT, +#define RTNLGRP_ND_USEROPT RTNLGRP_ND_USEROPT + RTNLGRP_PHONET_IFADDR, +#define RTNLGRP_PHONET_IFADDR RTNLGRP_PHONET_IFADDR + RTNLGRP_PHONET_ROUTE, +#define RTNLGRP_PHONET_ROUTE RTNLGRP_PHONET_ROUTE + RTNLGRP_DCB, +#define RTNLGRP_DCB RTNLGRP_DCB + RTNLGRP_IPV4_NETCONF, +#define RTNLGRP_IPV4_NETCONF RTNLGRP_IPV4_NETCONF + RTNLGRP_IPV6_NETCONF, +#define RTNLGRP_IPV6_NETCONF RTNLGRP_IPV6_NETCONF + RTNLGRP_MDB, +#define RTNLGRP_MDB RTNLGRP_MDB + RTNLGRP_MPLS_ROUTE, +#define RTNLGRP_MPLS_ROUTE RTNLGRP_MPLS_ROUTE + RTNLGRP_NSID, +#define RTNLGRP_NSID RTNLGRP_NSID + RTNLGRP_MPLS_NETCONF, +#define RTNLGRP_MPLS_NETCONF RTNLGRP_MPLS_NETCONF + RTNLGRP_IPV4_MROUTE_R, +#define RTNLGRP_IPV4_MROUTE_R RTNLGRP_IPV4_MROUTE_R + RTNLGRP_IPV6_MROUTE_R, +#define RTNLGRP_IPV6_MROUTE_R RTNLGRP_IPV6_MROUTE_R + RTNLGRP_NEXTHOP, +#define RTNLGRP_NEXTHOP RTNLGRP_NEXTHOP + RTNLGRP_BRVLAN, +#define RTNLGRP_BRVLAN RTNLGRP_BRVLAN + __RTNLGRP_MAX +}; +#define RTNLGRP_MAX (__RTNLGRP_MAX - 1) + + +#endif + Index: sys/netlink/route/interface.h =================================================================== --- /dev/null +++ sys/netlink/route/interface.h @@ -0,0 +1,97 @@ +/* + * Interface-related (RTM_LINK) message header and attributes. 
+ */ + +#ifndef _NETLINK_ROUTE_INTERFACE_H_ +#define _NETLINK_ROUTE_INTERFACE_H_ + +/* Base header for all of the relevant messages */ +struct ifinfomsg { + unsigned char ifi_family; /* Related XX */ + unsigned char __ifi_pad; + unsigned short ifi_type; /* ARPHRD_* */ + int ifi_index; /* Link index */ + unsigned ifi_flags; /* IFF_* flags */ + unsigned ifi_change; /* IFF_* change mask */ +}; + +#ifndef _KERNEL +/* Compatilbility helpers */ +#define _IFINFO_HDRLEN ((int)sizeof(struct ifinfomsg)) +#define IFLA_RTA(_ifi) ((struct rtattr *)NL_ITEM_DATA(_ifi, _IFINFO_HDRLEN)) +#define IFLA_PAYLOAD(_ifi) NLMSG_PAYLOAD(_ifi, _IFINFO_HDRLEN) +#endif + +enum { + IFLA_UNSPEC, + IFLA_ADDRESS, + IFLA_BROADCAST, + IFLA_IFNAME, + IFLA_MTU, + IFLA_LINK, + IFLA_QDISC, + IFLA_STATS, + IFLA_COST, +#define IFLA_COST IFLA_COST + IFLA_PRIORITY, +#define IFLA_PRIORITY IFLA_PRIORITY + IFLA_MASTER, +#define IFLA_MASTER IFLA_MASTER + IFLA_WIRELESS, /* Wireless Extension event - see wireless.h */ +#define IFLA_WIRELESS IFLA_WIRELESS + IFLA_PROTINFO, /* Protocol specific information for a link */ +#define IFLA_PROTINFO IFLA_PROTINFO + IFLA_TXQLEN, +#define IFLA_TXQLEN IFLA_TXQLEN + IFLA_MAP, +#define IFLA_MAP IFLA_MAP + IFLA_WEIGHT, +#define IFLA_WEIGHT IFLA_WEIGHT + IFLA_OPERSTATE, + IFLA_LINKMODE, + IFLA_LINKINFO, +#define IFLA_LINKINFO IFLA_LINKINFO + IFLA_NET_NS_PID, + IFLA_IFALIAS, + IFLA_NUM_VF, /* Number of VFs if device is SR-IOV PF */ + IFLA_VFINFO_LIST, + IFLA_STATS64, + IFLA_VF_PORTS, + IFLA_PORT_SELF, + IFLA_AF_SPEC, + IFLA_GROUP, /* Group the device belongs to */ + IFLA_NET_NS_FD, + IFLA_EXT_MASK, /* Extended info mask, VFs, etc */ + IFLA_PROMISCUITY, /* Promiscuity count: > 0 means acts PROMISC */ +#define IFLA_PROMISCUITY IFLA_PROMISCUITY + IFLA_NUM_TX_QUEUES, + IFLA_NUM_RX_QUEUES, + IFLA_CARRIER, + IFLA_PHYS_PORT_ID, + IFLA_CARRIER_CHANGES, + IFLA_PHYS_SWITCH_ID, + IFLA_LINK_NETNSID, + IFLA_PHYS_PORT_NAME, + IFLA_PROTO_DOWN, + IFLA_GSO_MAX_SEGS, + IFLA_GSO_MAX_SIZE, + 
IFLA_PAD, + IFLA_XDP, + IFLA_EVENT, + IFLA_NEW_NETNSID, + IFLA_IF_NETNSID, + IFLA_TARGET_NETNSID = IFLA_IF_NETNSID, /* new alias */ + IFLA_CARRIER_UP_COUNT, + IFLA_CARRIER_DOWN_COUNT, + IFLA_NEW_IFINDEX, + IFLA_MIN_MTU, + IFLA_MAX_MTU, + IFLA_PROP_LIST, + IFLA_ALT_IFNAME, /* Alternative ifname */ + IFLA_PERM_ADDRESS, + IFLA_PROTO_DOWN_REASON, + __IFLA_MAX }; + +#define IFLA_MAX (__IFLA_MAX - 1) + +#endif Index: sys/netlink/route/nexthop.h =================================================================== --- /dev/null +++ sys/netlink/route/nexthop.h @@ -0,0 +1,84 @@ +/* + * NEXTHOP-related (RTM_NEXTHOP) message header and attributes. + */ + +#ifndef _NETLINK_ROUTE_NEXTHOP_H_ +#define _NETLINK_ROUTE_NEXTHOP_H_ + +/* Base header for all of the relevant messages */ +struct nhmsg { + unsigned char nh_family; + unsigned char nh_scope; /* ignored on RX, filled by kernel */ + unsigned char nh_protocol; /* Routing protocol that installed nh */ + unsigned char resvd; + unsigned int nh_flags; /* RTNH_F_* flags */ +}; + +/* entry in a nexthop group */ +struct nexthop_grp { + uint32_t id; /* nexhop userland index */ + uint8_t weight; /* weight of this nexthop */ + uint8_t resvd1; + uint16_t resvd2; +}; + +enum { + NEXTHOP_GRP_TYPE_MPATH, /* default nexthop group */ + NEXTHOP_GRP_TYPE_RES, /* resilient nexthop group */ + __NEXTHOP_GRP_TYPE_MAX, +}; +#define NEXTHOP_GRP_TYPE_MAX (__NEXTHOP_GRP_TYPE_MAX - 1) + +enum { + NHA_UNSPEC, + NHA_ID, /* u32; id for nexthop. 
id == 0 means auto-assign */ + + NHA_GROUP, /* array of nexthop_grp */ + NHA_GROUP_TYPE, /* u16 one of NEXTHOP_GRP_TYPE */ + /* if NHA_GROUP attribute is added, no other attributes can be set */ + + NHA_BLACKHOLE, /* flag; nexthop used to blackhole packets */ + /* if NHA_BLACKHOLE is added, OIF, GATEWAY, ENCAP can not be set */ + + NHA_OIF, /* u32; nexthop device */ + NHA_GATEWAY, /* be32 (IPv4) or in6_addr (IPv6) gw address */ + NHA_ENCAP_TYPE, /* u16; lwt encap type */ + NHA_ENCAP, /* lwt encap data */ + + /* NHA_OIF can be appended to dump request to return only + * nexthops using given device + */ + NHA_GROUPS, /* flag; only return nexthop groups in dump */ + NHA_MASTER, /* u32; only return nexthops with given master dev */ + + NHA_FDB, /* flag; nexthop belongs to a bridge fdb */ + /* if NHA_FDB is added, OIF, BLACKHOLE, ENCAP cannot be set */ + + /* nested; resilient nexthop group attributes */ + NHA_RES_GROUP, + /* nested; nexthop bucket attributes */ + NHA_RES_BUCKET, + + __NHA_MAX, +}; + +#define NHA_MAX (__NHA_MAX - 1) + +enum { + NHA_RES_GROUP_UNSPEC, + /* Pad attribute for 64-bit alignment. 
*/ + NHA_RES_GROUP_PAD = NHA_RES_GROUP_UNSPEC, + + /* u16; number of nexthop buckets in a resilient nexthop group */ + NHA_RES_GROUP_BUCKETS, + /* clock_t as u32; nexthop bucket idle timer (per-group) */ + NHA_RES_GROUP_IDLE_TIMER, + /* clock_t as u32; nexthop unbalanced timer */ + NHA_RES_GROUP_UNBALANCED_TIMER, + /* clock_t as u64; nexthop unbalanced time */ + NHA_RES_GROUP_UNBALANCED_TIME, + __NHA_RES_GROUP_MAX, +}; +#define NHA_RES_GROUP_MAX (__NHA_RES_GROUP_MAX - 1) + +#endif Index: sys/netlink/route/nhop.h =================================================================== --- /dev/null +++ sys/netlink/route/nhop.h @@ -0,0 +1,27 @@ +#ifndef _NETLINK_ROUTE_NHOP_H_ +#define _NETLINK_ROUTE_NHOP_H_ + +struct nhmsg { + unsigned char nh_family; + unsigned char nh_scope; /* return only */ + unsigned char nh_protocol; /* Routing protocol that installed nh */ + unsigned char resvd; + unsigned int nh_flags; /* RTNH_F flags */ +}; + +/* entry in a nexthop group */ +struct nexthop_grp { + __u32 id; /* nexthop userland index */ + __u8 weight; /* weight of this nexthop */ + __u8 resvd1; + __u16 resvd2; +}; + +enum { + NEXTHOP_GRP_TYPE_MPATH, /* default nexthop group */ + NEXTHOP_GRP_TYPE_RES, /* resilient nexthop group */ + __NEXTHOP_GRP_TYPE_MAX, +}; +#define NEXTHOP_GRP_TYPE_MAX (__NEXTHOP_GRP_TYPE_MAX - 1) + +#endif \ No newline at end of file Index: sys/netlink/route/route.h =================================================================== --- /dev/null +++ sys/netlink/route/route.h @@ -0,0 +1,345 @@ +/* + * Route-related (RTM_ROUTE) message header and attributes. 
+ */ + +#ifndef _NETLINK_ROUTE_ROUTE_H_ +#define _NETLINK_ROUTE_ROUTE_H_ + +/* Base header for all of the relevant messages */ +struct rtmsg { + unsigned char rtm_family; /* address family */ + unsigned char rtm_dst_len; /* Prefix length */ + unsigned char rtm_src_len; /* Source prefix length (not used) */ + unsigned char rtm_tos; /* Type of service (not used) */ + unsigned char rtm_table; /* rtable id */ + unsigned char rtm_protocol; /* Routing protocol id (RTPROT_) */ + unsigned char rtm_scope; /* Route distance (RT_SCOPE_) */ + unsigned char rtm_type; /* Route type (RTN_) */ + unsigned rtm_flags; /* Route flags (RTM_F_) */ +}; + +/* + * RFC 3549, 3.1.1, route type (rtm_type field). + */ +enum { + RTN_UNSPEC, + RTN_UNICAST, /* Unicast route */ + RTN_LOCAL, /* Accept locally (not supported) */ + RTN_BROADCAST, /* Accept locally as broadcast, send as broadcast */ + RTN_ANYCAST, /* Accept locally as broadcast, but send as unicast */ + RTN_MULTICAST, /* Multicast route */ + RTN_BLACKHOLE, /* Drop traffic towards destination */ + RTN_UNREACHABLE,/* Destination is unreachable */ + RTN_PROHIBIT, /* Administratively prohibited */ + RTN_THROW, /* Not in this table (not supported) */ + RTN_NAT, /* Translate this address (not supported) */ + RTN_XRESOLVE, /* Use external resolver (not supported) */ + __RTN_MAX, +}; +#define RTN_MAX (__RTN_MAX - 1) + +/* + * RFC 3549, 3.1.1, protocol (Identifies what/who added the route). + * Values larger than RTPROT_STATIC(4) are not interpreted by the + * kernel, they are just for user information. 
+ */ +#define RTPROT_UNSPEC 0 +#define RTPROT_REDIRECT 1 /* Route installed by ICMP redirect */ +#define RTPROT_KERNEL 2 /* Route installed by kernel */ +#define RTPROT_BOOT 3 /* Route installed during boot */ +#define RTPROT_STATIC 4 /* Route installed by administrator */ + +#define RTPROT_GATED 8 /* Apparently, GateD */ +#define RTPROT_RA 9 /* RDISC/ND router advertisements */ +#define RTPROT_MRT 10 /* Merit MRT */ +#define RTPROT_ZEBRA 11 /* Zebra */ +#define RTPROT_BIRD 12 /* BIRD */ +#define RTPROT_DNROUTED 13 /* DECnet routing daemon */ +#define RTPROT_XORP 14 /* XORP */ +#define RTPROT_NTK 15 /* Netsukuku */ +#define RTPROT_DHCP 16 /* DHCP client */ +#define RTPROT_MROUTED 17 /* Multicast daemon */ +#define RTPROT_KEEPALIVED 18 /* Keepalived daemon */ +#define RTPROT_BABEL 42 /* Babel daemon */ +#define RTPROT_OPENR 99 /* Open Routing (Open/R) Routes */ +#define RTPROT_BGP 186 /* BGP Routes */ +#define RTPROT_ISIS 187 /* ISIS Routes */ +#define RTPROT_OSPF 188 /* OSPF Routes */ +#define RTPROT_RIP 189 /* RIP Routes */ +#define RTPROT_EIGRP 192 /* EIGRP Routes */ + +/* + * RFC 3549 3.1.1 Route scope (valid distance to destination). + * + * The values between RT_SCOPE_UNIVERSE(0) and RT_SCOPE_SITE(200) + * are available to the user. + */ +enum rt_scope_t { + RT_SCOPE_UNIVERSE = 0, + /* User defined values */ + RT_SCOPE_SITE = 200, + RT_SCOPE_LINK = 253, + RT_SCOPE_HOST = 254, + RT_SCOPE_NOWHERE = 255 +}; + +/* + * RFC 3549 3.1.1 Route flags. 
+ */
+#define	RTM_F_NOTIFY		0x100	/* Notify user of route change */
+#define	RTM_F_CLONED		0x200	/* This route is cloned (not used) */
+#define	RTM_F_EQUALIZE		0x400	/* Multipath equalizer: NI */
+#define	RTM_F_PREFIX		0x800	/* Prefix addresses */
+#define	RTM_F_LOOKUP_TABLE	0x1000	/* set tableid to FIB lookup result */
+#define	RTM_F_FIB_MATCH		0x2000	/* return full fib lookup match */
+#define	RTM_F_OFFLOAD		0x4000	/* route is offloaded */
+#define	RTM_F_TRAP		0x8000	/* route is trapping packets */
+#define	RTM_F_OFFLOAD_FAILED	0x20000000 /* route offload failed */
+
+/* Compatibility handling helpers */
+#ifndef _KERNEL
+#define	NL_RTM_HDRLEN		((int)sizeof(struct rtmsg))
+/*
+ * First attribute following the fixed rtmsg header.  NL_ITEM_DATA takes
+ * the item pointer and the header length, as in IFLA_RTA / NL_RTA_DATA.
+ */
+#define	RTM_RTA(_rtm)		((struct rtattr *)NL_ITEM_DATA(_rtm, NL_RTM_HDRLEN))
+#define	RTM_PAYLOAD(_hdr)	NLMSG_PAYLOAD((_hdr), NL_RTM_HDRLEN)
+#endif
+
+/*
+ * Routing table identifiers.
+ * FreeBSD route table numbering starts from 0, where 0 is a valid default routing table.
+ * Indicating "all tables" via rtsock can be done by not including RTA_TABLE attribute
+ * and keeping rtm_table=0 (compatibility) or setting RTA_TABLE value to RT_TABLE_UNSPEC.
+ */ +#define RT_TABLE_MAIN 0 /* RT_DEFAULT_FIB */ +#define RT_TABLE_UNSPEC 0xFFFFFFFF /* RT_ALL_FIBS */ + +enum rtattr_type_t { + NL_RTA_UNSPEC, + NL_RTA_DST, + NL_RTA_SRC, + NL_RTA_IIF, + NL_RTA_OIF, + NL_RTA_GATEWAY, + NL_RTA_PRIORITY, + NL_RTA_PREFSRC, + NL_RTA_METRICS, + NL_RTA_MULTIPATH, + NL_RTA_PROTOINFO, /* not used / deprecated */ + NL_RTA_FLOW, + NL_RTA_CACHEINFO, /* not used */ + NL_RTA_SESSION, /* not used / deprecated */ + NL_RTA_MP_ALGO, /* not used / deprecated */ + NL_RTA_TABLE, + NL_RTA_MARK, /* not used */ + NL_RTA_MFC_STATS, + NL_RTA_VIA, + NL_RTA_NEWDST, + NL_RTA_PREF, + NL_RTA_ENCAP_TYPE, + NL_RTA_ENCAP, + NL_RTA_EXPIRES, + NL_RTA_PAD, + NL_RTA_UID, + NL_RTA_TTL_PROPAGATE, + NL_RTA_IP_PROTO, + NL_RTA_SPORT, + NL_RTA_DPORT, + NL_RTA_NH_ID, + __RTA_MAX +}; +#define NL_RTA_MAX (__RTA_MAX - 1) + +#ifndef _KERNEL +/* + * RTA_* space has clashes with rtsock namespace. + * Use NL_RTA_ prefix in the kernel and map to + * RTA_ for userland. + */ +#define RTA_UNSPEC NL_RTA_UNSPEC +#define RTA_DST NL_RTA_DST +#define RTA_SRC NL_RTA_SRC +#define RTA_IIF NL_RTA_IIF +#define RTA_OIF NL_RTA_OIF +#define RTA_GATEWAY NL_RTA_GATEWAY +#define RTA_PRIORITY NL_RTA_PRIORITY +#define RTA_PREFSRC NL_RTA_PREFSRC +#define RTA_METRICS NL_RTA_METRICS +#define RTA_MULTIPATH NL_RTA_MULTIPATH +#define RTA_PROTOINFO NL_RTA_PROTOINFO +#define RTA_FLOW NL_RTA_FLOW +#define RTA_CACHEINFO NL_RTA_CACHEINFO +#define RTA_SESSION NL_RTA_SESSION +#define RTA_MP_ALGO NL_RTA_MP_ALGO +#define RTA_TABLE NL_RTA_TABLE +#define RTA_MARK NL_RTA_MARK +#define RTA_MFC_STATS NL_RTA_MFC_STATS +#define RTA_VIA NL_RTA_VIA +#define RTA_NEWDST NL_RTA_NEWDST +#define RTA_PREF NL_RTA_PREF +#define RTA_ENCAP_TYPE NL_RTA_ENCAP_TYPE +#define RTA_ENCAP NL_RTA_ENCAP +#define RTA_EXPIRES NL_RTA_EXPIRES +#define RTA_PAD NL_RTA_PAD +#define RTA_UID NL_RTA_UID +#define RTA_TTL_PROPAGATE NL_RTA_TTL_PROPAGATE +#define RTA_IP_PROTO NL_RTA_IP_PROTO +#define RTA_SPORT NL_RTA_SPORT +#define RTA_DPORT NL_RTA_DPORT 
+#define	RTA_NH_ID		NL_RTA_NH_ID
+#define	RTA_MAX			NL_RTA_MAX
+#endif
+
+/* route attribute header */
+struct rtattr {
+	unsigned short rta_len;		/* attribute length, header included */
+	unsigned short rta_type;	/* attribute type (NL_RTA_*) */
+};
+
+#define	NL_RTA_ALIGN_SIZE	NL_ITEM_ALIGN_SIZE
+#define	NL_RTA_ALIGN		NL_ITEM_ALIGN
+#define	NL_RTA_HDRLEN		((int)sizeof(struct rtattr))
+#define	NL_RTA_DATA_LEN(_rta)	((int)((_rta)->rta_len - NL_RTA_HDRLEN))
+#define	NL_RTA_DATA(_rta)	NL_ITEM_DATA(_rta, NL_RTA_HDRLEN)
+#define	NL_RTA_DATA_CONST(_rta)	NL_ITEM_DATA_CONST(_rta, NL_RTA_HDRLEN)
+
+/* Compatibility attribute handling helpers */
+#ifndef _KERNEL
+#define	RTA_ALIGNTO		NL_RTA_ALIGN_SIZE
+#define	RTA_ALIGN(_len)		NL_RTA_ALIGN(_len)
+#define	_RTA_LEN(_rta)		((int)(_rta)->rta_len)
+#define	_RTA_ALIGNED_LEN(_rta)	RTA_ALIGN(_RTA_LEN(_rta))
+#define	RTA_OK(_rta, _len)	NL_ITEM_OK(_rta, _len, NL_RTA_HDRLEN, _RTA_LEN)
+#define	RTA_NEXT(_rta, _len)	NL_ITEM_ITER(_rta, _len, _RTA_ALIGNED_LEN)
+#define	RTA_LENGTH(_len)	(NL_RTA_HDRLEN + (_len))
+#define	RTA_SPACE(_len)		RTA_ALIGN(RTA_LENGTH(_len))
+#define	RTA_DATA(_rta)		NL_RTA_DATA(_rta)
+/* Attribute payload size: total attribute length minus the header */
+#define	RTA_PAYLOAD(_rta)	((int)(_RTA_LEN(_rta) - NL_RTA_HDRLEN))
+#endif
+
+/* RTA attribute headers */
+
+/* RTA_VIA */
+struct rtvia {
+	sa_family_t	rtvia_family;
+	uint8_t		rtvia_addr[];	/* address bytes follow the family */
+};
+
+/*
+ * RTA_METRICS is a nested attribute consisting of an array of 'struct rtattr'
+ * with the types defined below. Most of the values are uint32_t.
+ */ + enum { + NL_RTAX_UNSPEC, +#define NL_RTAX_UNSPEC NL_RTAX_UNSPEC + NL_RTAX_LOCK, +#define NL_RTAX_LOCK NL_RTAX_LOCK + NL_RTAX_MTU, +#define NL_RTAX_MTU NL_RTAX_MTU + NL_RTAX_WINDOW, +#define NL_RTAX_WINDOW NL_RTAX_WINDOW + NL_RTAX_RTT, +#define NL_RTAX_RTT NL_RTAX_RTT + NL_RTAX_RTTVAR, +#define NL_RTAX_RTTVAR NL_RTAX_RTTVAR + NL_RTAX_SSTHRESH, +#define NL_RTAX_SSTHRESH NL_RTAX_SSTHRESH + NL_RTAX_CWND, +#define NL_RTAX_CWND NL_RTAX_CWND + NL_RTAX_ADVMSS, +#define NL_RTAX_ADVMSS NL_RTAX_ADVMSS + NL_RTAX_REORDERING, +#define NL_RTAX_REORDERING NL_RTAX_REORDERING + NL_RTAX_HOPLIMIT, +#define NL_RTAX_HOPLIMIT NL_RTAX_HOPLIMIT + NL_RTAX_INITCWND, +#define NL_RTAX_INITCWND NL_RTAX_INITCWND + NL_RTAX_FEATURES, +#define NL_RTAX_FEATURES NL_RTAX_FEATURES + NL_RTAX_RTO_MIN, +#define NL_RTAX_RTO_MIN NL_RTAX_RTO_MIN + NL_RTAX_INITRWND, +#define NL_RTAX_INITRWND NL_RTAX_INITRWND + NL_RTAX_QUICKACK, +#define NL_RTAX_QUICKACK NL_RTAX_QUICKACK + NL_RTAX_CC_ALGO, +#define NL_RTAX_CC_ALGO NL_RTAX_CC_ALGO + NL_RTAX_FASTOPEN_NO_COOKIE, +#define NL_RTAX_FASTOPEN_NO_COOKIE NL_RTAX_FASTOPEN_NO_COOKIE + __NL_RTAX_MAX +}; +#define NL_RTAX_MAX (__NL_RTAX_MAX - 1) + +#define RTAX_FEATURE_ECN (1 << 0) +#define RTAX_FEATURE_SACK (1 << 1) +#define RTAX_FEATURE_TIMESTAMP (1 << 2) +#define RTAX_FEATURE_ALLFRAG (1 << 3) + +#define RTAX_FEATURE_MASK \ + (RTAX_FEATURE_ECN | RTAX_FEATURE_SACK | RTAX_FEATURE_TIMESTAMP | \ + RTAX_FEATURE_ALLFRAG) + +#ifndef _KERNEL + +/* + * RTAX_* space clashes with rtsock namespace. + * Use NL_RTAX_ prefix in the kernel and map to + * RTAX_ for userland. 
+ */ +#define RTAX_UNSPEC NL_RTAX_UNSPEC +#define RTAX_LOCK NL_RTAX_LOCK +#define RTAX_MTU NL_RTAX_MTU +#define RTAX_WINDOW NL_RTAX_WINDOW +#define RTAX_RTT NL_RTAX_RTT +#define RTAX_RTTVAR NL_RTAX_RTTVAR +#define RTAX_SSTHRESH NL_RTAX_SSTHRESH +#define RTAX_CWND NL_RTAX_CWND +#define RTAX_ADVMSS NL_RTAX_ADVMSS +#define RTAX_REORDERING NL_RTAX_REORDERING +#define RTAX_HOPLIMIT NL_RTAX_HOPLIMIT +#define RTAX_INITCWND NL_RTAX_INITCWND +#define RTAX_FEATURES NL_RTAX_FEATURES +#define RTAX_RTO_MIN NL_RTAX_RTO_MIN +#define RTAX_INITRWND NL_RTAX_INITRWND +#define RTAX_QUICKACK NL_RTAX_QUICKACK +#define RTAX_CC_ALGO NL_RTAX_CC_ALGO +#define RTAX_FASTOPEN_NO_COOKIE NL_RTAX_FASTOPEN_NO_COOKIE +#endif + +/* RTA_MULTIPATH consists of an array of rtnexthop structures. */ +struct rtnexthop { + unsigned short rtnh_len; + unsigned char rtnh_flags; + unsigned char rtnh_hops; + int rtnh_ifindex; +}; + +/* rtnh_flags */ +#define RTNH_F_DEAD 0x01 /* Nexthop is dead (used by multipath) */ +#define RTNH_F_PERVASIVE 0x02 /* Do recursive gateway lookup */ +#define RTNH_F_ONLINK 0x04 /* Gateway is forced on link */ +#define RTNH_F_OFFLOAD 0x08 /* Nexthop is offloaded */ +#define RTNH_F_LINKDOWN 0x10 /* carrier-down on nexthop */ +#define RTNH_F_UNRESOLVED 0x20 /* The entry is unresolved (ipmr) */ +#define RTNH_F_TRAP 0x40 /* Nexthop is trapping packets */ + +#define RTNH_COMPARE_MASK (RTNH_F_DEAD | RTNH_F_LINKDOWN | \ + RTNH_F_OFFLOAD | RTNH_F_TRAP) + +/* Macros to handle nexthops */ +#define RTNH_ALIGNTO NL_ITEM_ALIGN_SIZE +#define RTNH_ALIGN(_len) NL_ITEM_ALIGN(_len) +#define RTNH_HDRLEN ((int)sizeof(struct rtnexthop)) +#define _RTNH_LEN(_nh) ((int)(_nh)->rtnh_len) +#define _RTNH_ALIGNED_LEN(_nh) RTNH_ALIGN(_RTNH_LEN(_nh)) +#define RTNH_OK(_nh, _len) NL_ITEM_OK(_nh, _len, RTNH_HDRLEN, _RTNH_LEN) +/* Step to the following rtnexthop: advance by this entry's aligned rtnh_len. */ +#define RTNH_NEXT(_nh) ((struct rtnexthop *)((char *)(_nh) + _RTNH_ALIGNED_LEN(_nh))) +#define RTNH_LENGTH(_len) (RTNH_HDRLEN + (_len)) 
+#define RTNH_SPACE(_len) RTNH_ALIGN(RTNH_LENGTH(_len)) +#define RTNH_DATA(_nh) ((struct rtattr *)NL_ITEM_DATA(_nh, RTNH_HDRLEN)) + + +struct rtgenmsg { + unsigned char rtgen_family; +}; + +#endif Index: sys/sys/domain.h =================================================================== --- sys/sys/domain.h +++ sys/sys/domain.h @@ -71,11 +71,13 @@ /* dom_flags */ #define DOMF_SUPPORTED 0x0001 /* System supports this domain. */ #define DOMF_INITED 0x0002 /* Initialized in the default vnet. */ +#define DOMF_UNLOADABLE 0x0004 /* Can be unloaded */ #ifdef _KERNEL extern int domain_init_status; extern struct domain *domains; void domain_add(void *); +void domain_remove(void *); void domain_init(void *); #ifdef VIMAGE void vnet_domain_init(void *); @@ -85,6 +87,8 @@ #define DOMAIN_SET(name) \ SYSINIT(domain_add_ ## name, SI_SUB_PROTO_DOMAIN, \ SI_ORDER_FIRST, domain_add, & name ## domain); \ + SYSUNINIT(domain_remove_ ## name, SI_SUB_PROTO_DOMAIN, \ + SI_ORDER_FIRST, domain_remove, & name ## domain); \ SYSINIT(domain_init_ ## name, SI_SUB_PROTO_DOMAIN, \ SI_ORDER_SECOND, domain_init, & name ## domain); #endif /* _KERNEL */ Index: sys/sys/eventhandler.h =================================================================== --- sys/sys/eventhandler.h +++ sys/sys/eventhandler.h @@ -319,4 +319,11 @@ typedef void (*rt_addrmsg_fn)(void *, struct ifaddr *, int); EVENTHANDLER_DECLARE(rt_addrmsg, rt_addrmsg_fn); +/* Routing bridge */ +struct rt_addrinfo; +struct rib_cmd_info; +typedef void (*rtbridge_fn)(void *arg, int provider_id, uint32_t event_type, + uint32_t val1, void *ptr1, void *ptr2); +EVENTHANDLER_DECLARE(rib_rt_bridge, rtbridge_fn); + #endif /* _SYS_EVENTHANDLER_H_ */ Index: sys/sys/socket.h =================================================================== --- sys/sys/socket.h +++ sys/sys/socket.h @@ -264,6 +264,7 @@ #define AF_ARP 35 #define AF_BLUETOOTH 36 /* Bluetooth sockets */ #define AF_IEEE80211 37 /* IEEE 802.11 protocol */ +#define AF_NETLINK 38 /* Netlink 
protocol */ #define AF_INET_SDP 40 /* OFED Socket Direct Protocol ipv4 */ #define AF_INET6_SDP 42 /* OFED Socket Direct Protocol ipv6 */ #define AF_HYPERV 43 /* HyperV sockets */ @@ -389,6 +390,7 @@ #define PF_ARP AF_ARP #define PF_BLUETOOTH AF_BLUETOOTH #define PF_IEEE80211 AF_IEEE80211 +#define PF_NETLINK AF_NETLINK #define PF_INET_SDP AF_INET_SDP #define PF_INET6_SDP AF_INET6_SDP Index: tests/sys/net/routing/netlink.py =================================================================== --- /dev/null +++ tests/sys/net/routing/netlink.py @@ -0,0 +1,1076 @@ +#!/usr/local/bin/python3 + +from ctypes import * +import socket +import os +import sys +import unittest +import struct + +from enum import Enum, auto + +from typing import List, Callable, Dict, NamedTuple, Optional + + +def roundup2(val: int, num: int) -> int: + if val % num: + return (val | (num - 1)) + 1 + else: + return val + + +def align4(val: int) -> int: + return roundup2(val, 4) + + +class SockaddrNl(Structure): + _fields_ = [ + ("nl_len", c_ubyte), + ("nl_family", c_ubyte), + ("nl_pad", c_ushort), + ("nl_pid", c_uint), + ("nl_groups", c_uint), + ] + + +class Nlmsghdr(Structure): + _fields_ = [ + ("nlmsg_len", c_uint), + ("nlmsg_type", c_ushort), + ("nlmsg_flags", c_ushort), + ("nlmsg_seq", c_uint), + ("nlmsg_pid", c_uint), + ] + + +class Nlmsgerr(Structure): + _fields_ = [ + ("error", c_int), + ("msg", Nlmsghdr), + ] + + +class RtattrType(Enum): + RTA_UNSPEC = 0 + RTA_DST = auto() + RTA_SRC = auto() + RTA_IIF = auto() + RTA_OIF = auto() + RTA_GATEWAY = auto() + RTA_PRIORITY = auto() + RTA_PREFSRC = auto() + RTA_METRICS = auto() + RTA_MULTIPATH = auto() + RTA_PROTOINFO = auto() + RTA_FLOW = auto() + RTA_CACHEINFO = auto() + RTA_SESSION = auto() + RTA_MP_ALGO = auto() + RTA_TABLE = auto() + RTA_MARK = auto() + RTA_MFC_STATS = auto() + RTA_VIA = auto() + RTA_NEWDST = auto() + RTA_PREF = auto() + RTA_ENCAP_TYPE = auto() + RTA_ENCAP = auto() + RTA_EXPIRES = auto() + RTA_PAD = auto() + RTA_UID = auto() + 
RTA_TTL_PROPAGATE = auto() + RTA_IP_PROTO = auto() + RTA_SPORT = auto() + RTA_DPORT = auto() + RTA_NH_ID = auto() + + +class NlMsgType(Enum): + NLMSG_NOOP = 1 + NLMSG_ERROR = 2 + NLMSG_DONE = 3 + NLMSG_OVERRUN = 4 + + +class NlRtMsgType(Enum): + RTM_NEWLINK = 16 + RTM_DELLINK = 17 + RTM_GETLINK = 18 + RTM_SETLINK = 19 + RTM_NEWADDR = 20 + RTM_DELADDR = 21 + RTM_GETADDR = 22 + RTM_NEWROUTE = 24 + RTM_DELROUTE = 25 + RTM_GETROUTE = 26 + RTM_NEWNEIGH = 28 + RTM_DELNEIGH = 29 # was 27: each NEW/DEL/GET triple is base, base+1, base+2 + RTM_GETNEIGH = 30 # was 28, which made it an Enum alias of RTM_NEWNEIGH + RTM_NEWRULE = 32 + RTM_DELRULE = 33 + RTM_GETRULE = 34 + RTM_NEWQDISC = 36 + RTM_DELQDISC = 37 + RTM_GETQDISC = 38 + RTM_NEWTCLASS = 40 + RTM_DELTCLASS = 41 + RTM_GETTCLASS = 42 + RTM_NEWTFILTER = 44 + RTM_DELTFILTER = 45 + RTM_GETTFILTER = 46 + RTM_NEWACTION = 48 + RTM_DELACTION = 49 + RTM_GETACTION = 50 + RTM_NEWPREFIX = 52 + RTM_GETMULTICAST = 58 + RTM_GETANYCAST = 62 + RTM_NEWNEIGHTBL = 64 + RTM_GETNEIGHTBL = 66 + RTM_SETNEIGHTBL = 67 + RTM_NEWNDUSEROPT = 68 + RTM_NEWADDRLABEL = 72 + RTM_DELADDRLABEL = 73 + RTM_GETADDRLABEL = 74 + RTM_GETDCB = 78 + RTM_SETDCB = 79 + RTM_NEWNETCONF = 80 + RTM_GETNETCONF = 82 + RTM_NEWMDB = 84 + RTM_DELMDB = 85 + RTM_GETMDB = 86 + RTM_NEWNSID = 88 + RTM_DELNSID = 89 + RTM_GETNSID = 90 + RTM_NEWSTATS = 92 + RTM_GETSTATS = 94 + + +class RtAttr(Structure): + _fields_ = [ + ("rta_len", c_ushort), + ("rta_type", c_ushort), + ] + + +class RtMsgHdr(Structure): + _fields_ = [ + ("rtm_family", c_ubyte), + ("rtm_dst_len", c_ubyte), + ("rtm_src_len", c_ubyte), + ("rtm_tos", c_ubyte), + ("rtm_table", c_ubyte), + ("rtm_protocol", c_ubyte), + ("rtm_scope", c_ubyte), + ("rtm_type", c_ubyte), + ("rtm_flags", c_uint), + ] + + +class RtMsgFlags(Enum): + RTM_F_NOTIFY = 0x100 + RTM_F_CLONED = 0x200 + RTM_F_EQUALIZE = 0x400 + RTM_F_PREFIX = 0x800 + RTM_F_LOOKUP_TABLE = 0x1000 + RTM_F_FIB_MATCH = 0x2000 + RTM_F_OFFLOAD = 0x4000 + RTM_F_TRAP = 0x8000 + RTM_F_OFFLOAD_FAILED = 0x20000000 + + +class AddressFamilyLinux(Enum): + AF_INET = socket.AF_INET + AF_INET6 = 
socket.AF_INET6 + AF_NETLINK = 16 + + +class AddressFamilyBsd(Enum): + AF_INET = socket.AF_INET + AF_INET6 = socket.AF_INET6 + AF_NETLINK = 38 + + +class NlmBaseFlags(Enum): + NLM_F_REQUEST = 0x01 + NLM_F_MULTI = 0x02 + NLM_F_ACK = 0x04 + NLM_F_ECHO = 0x08 + NLM_F_DUMP_INTR = 0x10 + NLM_F_DUMP_FILTERED = 0x20 + +# XXX: in python3.8 it is possible to +# class NlmGetFlags(Enum, NlmBaseFlags): + + +class NlmGetFlags(Enum): + NLM_F_ROOT = 0x100 + NLM_F_MATCH = 0x200 + NLM_F_ATOMIC = 0x400 + + +class NlmNewFlags(Enum): + NLM_F_REPLACE = 0x100 + NLM_F_EXCL = 0x200 + NLM_F_CREATE = 0x400 + NLM_F_APPEND = 0x800 + + +class NlmDeleteFlags(Enum): + NLM_F_NONREC = 0x100 + + +class NlmAckFlags(Enum): + NLM_F_CAPPED = 0x100 + NLM_F_ACK_TLVS = 0x200 + + +class RtScope(Enum): + RT_SCOPE_UNIVERSE = 0 + RT_SCOPE_SITE = 200 + RT_SCOPE_LINK = 253 + RT_SCOPE_HOST = 254 + RT_SCOPE_NOWHERE = 255 + + +class RtType(Enum): + RTN_UNSPEC = 0 + RTN_UNICAST = auto() + RTN_LOCAL = auto() + RTN_BROADCAST = auto() + RTN_ANYCAST = auto() + RTN_MULTICAST = auto() + RTN_BLACKHOLE = auto() + RTN_UNREACHABLE = auto() + RTN_PROHIBIT = auto() + RTN_THROW = auto() + RTN_NAT = auto() + RTN_XRESOLVE = auto() + + +class RtProto(Enum): + RTPROT_UNSPEC = 0 + RTPROT_REDIRECT = 1 + RTPROT_KERNEL = 2 + RTPROT_BOOT = 3 + RTPROT_STATIC = 4 + RTPROT_GATED = 8 + RTPROT_RA = 9 + RTPROT_MRT = 10 + RTPROT_ZEBRA = 11 + RTPROT_BIRD = 12 + RTPROT_DNROUTED = 13 + RTPROT_XORP = 14 + RTPROT_NTK = 15 + RTPROT_DHCP = 16 + RTPROT_MROUTED = 17 + RTPROT_KEEPALIVED = 18 + RTPROT_BABEL = 42 + RTPROT_OPENR = 99 + RTPROT_BGP = 186 + RTPROT_ISIS = 187 + RTPROT_OSPF = 188 + RTPROT_RIP = 189 + RTPROT_EIGRP = 192 + + +class NlRtaxType(Enum): + RTAX_UNSPEC = 0 + RTAX_LOCK = auto() + RTAX_MTU = auto() + RTAX_WINDOW = auto() + RTAX_RTT = auto() + RTAX_RTTVAR = auto() + RTAX_SSTHRESH = auto() + RTAX_CWND = auto() + RTAX_ADVMSS = auto() + RTAX_REORDERING = auto() + RTAX_HOPLIMIT = auto() + RTAX_INITCWND = auto() + RTAX_FEATURES = auto() + 
RTAX_RTO_MIN = auto() + RTAX_INITRWND = auto() + RTAX_QUICKACK = auto() + RTAX_CC_ALGO = auto() + RTAX_FASTOPEN_NO_COOKIE = auto() + + +class NlRtGroup(Enum): + RTNLGRP_NONE = 0 + RTNLGRP_LINK = auto() + RTNLGRP_NOTIFY = auto() + RTNLGRP_NEIGH = auto() + RTNLGRP_TC = auto() + RTNLGRP_IPV4_IFADDR = auto() + RTNLGRP_IPV4_MROUTE = auto() + RTNLGRP_IPV4_ROUTE = auto() + RTNLGRP_IPV4_RULE = auto() + RTNLGRP_IPV6_IFADDR = auto() + RTNLGRP_IPV6_MROUTE = auto() + RTNLGRP_IPV6_ROUTE = auto() + RTNLGRP_IPV6_IFINFO = auto() + RTNLGRP_DECnet_IFADDR = auto() + RTNLGRP_NOP2 = auto() + RTNLGRP_DECnet_ROUTE = auto() + RTNLGRP_DECnet_RULE = auto() + RTNLGRP_NOP4 = auto() + RTNLGRP_IPV6_PREFIX = auto() + RTNLGRP_IPV6_RULE = auto() + RTNLGRP_ND_USEROPT = auto() + RTNLGRP_PHONET_IFADDR = auto() + RTNLGRP_PHONET_ROUTE = auto() + RTNLGRP_DCB = auto() + RTNLGRP_IPV4_NETCONF = auto() + RTNLGRP_IPV6_NETCONF = auto() + RTNLGRP_MDB = auto() + RTNLGRP_MPLS_ROUTE = auto() + RTNLGRP_NSID = auto() + RTNLGRP_MPLS_NETCONF = auto() + RTNLGRP_IPV4_MROUTE_R = auto() + RTNLGRP_IPV6_MROUTE_R = auto() + RTNLGRP_NEXTHOP = auto() + RTNLGRP_BRVLAN = auto() + + +class IfinfoMsg(Structure): + _fields_ = [ + ("ifi_family", c_ubyte), + ("__ifi_pad", c_ubyte), + ("ifi_type", c_ushort), + ("ifi_index", c_int), + ("ifi_flags", c_uint), + ("ifi_change", c_uint), + ] + + +class IflattrType(Enum): + IFLA_UNSPEC = 0 + IFLA_ADDRESS = auto() + IFLA_BROADCAST = auto() + IFLA_IFNAME = auto() + IFLA_MTU = auto() + IFLA_LINK = auto() + IFLA_QDISC = auto() + IFLA_STATS = auto() + IFLA_COST = auto() + IFLA_PRIORITY = auto() + IFLA_MASTER = auto() + IFLA_WIRELESS = auto() + IFLA_PROTINFO = auto() + IFLA_TXQLEN = auto() + IFLA_MAP = auto() + IFLA_WEIGHT = auto() + IFLA_OPERSTATE = auto() + IFLA_LINKMODE = auto() + IFLA_LINKINFO = auto() + IFLA_NET_NS_PID = auto() + IFLA_IFALIAS = auto() + IFLA_NUM_VF = auto() + IFLA_VFINFO_LIST = auto() + IFLA_STATS64 = auto() + IFLA_VF_PORTS = auto() + IFLA_PORT_SELF = auto() + IFLA_AF_SPEC = 
auto() + IFLA_GROUP = auto() + IFLA_NET_NS_FD = auto() + IFLA_EXT_MASK = auto() + IFLA_PROMISCUITY = auto() + IFLA_NUM_TX_QUEUES = auto() + IFLA_NUM_RX_QUEUES = auto() + IFLA_CARRIER = auto() + IFLA_PHYS_PORT_ID = auto() + IFLA_CARRIER_CHANGES = auto() + IFLA_PHYS_SWITCH_ID = auto() + IFLA_LINK_NETNSID = auto() + IFLA_PHYS_PORT_NAME = auto() + IFLA_PROTO_DOWN = auto() + IFLA_GSO_MAX_SEGS = auto() + IFLA_GSO_MAX_SIZE = auto() + IFLA_PAD = auto() + IFLA_XDP = auto() + IFLA_EVENT = auto() + IFLA_NEW_NETNSID = auto() + IFLA_IF_NETNSID = auto() + IFLA_CARRIER_UP_COUNT = auto() + IFLA_CARRIER_DOWN_COUNT = auto() + IFLA_NEW_IFINDEX = auto() + IFLA_MIN_MTU = auto() + IFLA_MAX_MTU = auto() + IFLA_PROP_LIST = auto() + IFLA_ALT_IFNAME = auto() + IFLA_PERM_ADDRESS = auto() + IFLA_PROTO_DOWN_REASON = auto() + + +class IfaddrMsg(Structure): + _fields_ = [ + ("ifa_family", c_ubyte), + ("ifa_prefixlen", c_ubyte), + ("ifa_flags", c_ubyte), + ("ifa_scope", c_ubyte), + ("ifa_index", c_uint), + ] + + +class IfattrType(Enum): + IFA_UNSPEC = 0 + IFA_ADDRESS = auto() + IFA_LOCAL = auto() + IFA_LABEL = auto() + IFA_BROADCAST = auto() + IFA_ANYCAST = auto() + IFA_CACHEINFO = auto() + IFA_MULTICAST = auto() + IFA_FLAGS = auto() + IFA_RT_PRIORITY = auto() + IFA_TARGET_NETNSID = auto() + + +class NlConst(): + AF_NETLINK = 38 + NETLINK_ROUTE = 0 + + +class NlHelper(): + def __init__(self): + self._pmap = {} + self._af_cls = self.get_af_cls() + + def get_af_cls(self): + if sys.platform.startswith("freebsd"): + cls = AddressFamilyBsd + else: + cls = AddressFamilyLinux + return cls + + def get_propmap(self, cls): + if cls not in self._pmap: + ret = {} + for prop in dir(cls): + if not prop.startswith("_"): + ret[getattr(cls, prop).value] = prop + self._pmap[cls] = ret + return self._pmap[cls] + + def get_name_propmap(self, cls): + ret = {} + for prop in dir(cls): + if not prop.startswith("_"): + ret[prop] = getattr(cls, prop).value + return ret + + def get_attr_byval(self, cls, attr_val): + 
propmap = self.get_propmap(cls) + return propmap.get(attr_val) + + def get_nlmsg_name(self, val): + for cls in [NlRtMsgType, NlMsgType]: + v = self.get_attr_byval(cls, val) + if v is not None: + return v + return "msg#{}".format(val) + + def get_af_name(self, family): + v = self.get_attr_byval(self._af_cls, family) + if v is not None: + return v + return "af#{}".format(family) + + def get_af_value(self, family_str: str) -> int: + propmap = self.get_name_propmap(self._af_cls) + return propmap.get(family_str) + + def get_rta_name(self, val): + return self.get_attr_byval(RtattrType, val) + + def get_bitmask_map(self, cls, val): + propmap = self.get_propmap(cls) + v = 1 + ret = {} + while val: + if v & val: + if v in propmap: + ret[v] = propmap[v] + else: + ret[v] = hex(v) + val -= v + v *= 2 + return ret + + def get_bitmask_str(self, cls, val): + bmap = self.get_bitmask_map(cls, val) + return ",".join([v for k, v in bmap.items()]) + + def get_nlm_flags_str(self, msg_str: str, reply: bool, val): + if reply: + return self.get_bitmask_str(NlmAckFlags, val) + if msg_str.startswith("RTM_GET"): + return self.get_bitmask_str(NlmGetFlags, val) + elif msg_str.startswith("RTM_DEL"): + return self.get_bitmask_str(NlmDeleteFlags, val) + elif msg_str.startswith("RTM_NEW"): + return self.get_bitmask_str(NlmNewFlags, val) + else: + return self.get_bitmask_str(NlmBaseFlags, val) + + +class BaseRtAttr(object): + def __init__(self, parent, rta_type, rta_len, data=None): + self.parent = parent + self.helper = parent.helper + self.attr_enum = parent.attr_enum + self.rta_type = rta_type & 0x3fff # low 14 bits are the type; bits 14 and 15 are the flags read below + self.is_nested = rta_type & (1 << 15) + self.network_byte_order = rta_type & (1 << 14) + self.rta_len = rta_len + self.rta_type_str = self.helper.get_attr_byval(self.attr_enum, self.rta_type) # noqa: E501 + if data is not None: + self._validate(data) + self._parse(data) + self._orig_data = data + + def print_attribute(self, prepend=""): + if self.rta_type_str: + type_str = self.rta_type_str + 
else: + type_str = "rta#{}".format(self.rta_type) + print("{}rta_len={} rta_type={}({}){}".format(prepend, + self.rta_len, + type_str, + self.rta_type, + self._print_attr_value()) + ) + + def _print_attr_value(self): + return " [" + " ".join(["{:02X}".format(b) for b in self._orig_data[4:]]) + "]" # noqa: E501 + + @classmethod + def from_bytes(cls, parent, data): + if len(data) < sizeof(RtAttr): + raise ValueError("length less than rtattr header") + rta_hdr = RtAttr.from_buffer_copy(data) + self = cls(parent, rta_hdr.rta_type, rta_hdr.rta_len, data[:rta_hdr.rta_len]) # noqa: E501 + # XXX: nested + return self + + def __bytes__(self): + ret = self._orig_data + if align4(len(ret)) != len(ret): + ret += bytes(align4(len(ret)) - len(ret)) + return ret + + def _validate(self, data): + pass + + def _parse(self, data): + pass + + +class RtAttrIp(BaseRtAttr): + def _validate(self, data): + data_len = len(data) - 4 + if data_len != 4 and data_len != 16: + raise ValueError("Error validating attr {}: rta_len is not valid".format( # noqa: E501 + self.rta_type_str)) + + def _parse(self, data): + data_len = len(data) - 4 + if data_len == 4: + self.family = socket.AF_INET + self.addr = socket.inet_ntop(self.family, data[4:8]) + else: + self.family = socket.AF_INET6 + self.addr = socket.inet_ntop(self.family, data[4:20]) + + def _print_attr_value(self): + return " addr={}".format(self.addr) + + +class RtAttrU32(BaseRtAttr): + def _validate(self, data): + if len(data) != 8: + raise ValueError("Error validating attr {}: rta_len is not valid".format( # noqa: E501 + self.rta_type_str)) + + def _parse(self, data): + self.value = struct.unpack("@I", data[4:8])[0] + + def _print_attr_value(self): + return " value={}".format(self.value) + + +class RtAttrIfindex(RtAttrU32): + def _print_attr_value(self): + try: + ifname = socket.if_indextoname(self.value) + return " iface={}(#{})".format(ifname, self.value) + except OSError as e: + pass + return " iface=if#{}".format(self.value) + + +class 
RtAttrTable(RtAttrU32): + def _print_attr_value(self): + return " rtable={}".format(self.value) + + +class RtAttrNhId(RtAttrU32): + def _print_attr_value(self): + return " nh_id={}".format(self.value) + + +class RtAttrVia(BaseRtAttr): + def _validate(self, data): + data_len = len(data) - 4 + if data_len == 0: + raise ValueError("Error validating attr {}: empty data".format(self.rta_type_str)) # noqa: E501 + family = int(data[4]) # first payload byte after the 4-byte rtattr header; data_len is an int and cannot be indexed + if family not in (socket.AF_INET, socket.AF_INET6): + raise ValueError("Error validating attr {}: unsupported AF {}".format( # noqa: E501 + self.rta_type_str, family)) + if family == socket.AF_INET: + expected_len = 1 + 4 + else: + expected_len = 1 + 16 + if data_len != expected_len: + raise ValueError("Error validating attr {}: expected len {} got {}".format( # noqa: E501 + self.rta_type_str, expected_len, data_len)) + + def _parse(self, data): + data_len = len(data) - 4 + self.family = int(data[4]) # family byte precedes the address bytes sliced from data[5:] + if self.family == socket.AF_INET: + self.addr = socket.inet_ntop(self.family, data[5:9]) + else: + self.addr = socket.inet_ntop(self.family, data[5:21]) + + def _print_attr_value(self): + return ", via={}".format(self.addr) + + +class RtAttrStr(BaseRtAttr): + def _validate(self, data): + try: + s = data[4:].decode("utf-8") + except Exception as e: + raise ValueError("wrong utf-8 string") + + def _parse(self, data): + self.str = data[4:].decode("utf-8") + + def _print_attr_value(self): + return " str=\"{}\"".format(self.str) + + +rta_class_map = { + "RTA_DST": RtAttrIp, + "RTA_SRC": RtAttrIp, + "RTA_IIF": RtAttrIfindex, + "RTA_OIF": RtAttrIfindex, + "RTA_GATEWAY": RtAttrIp, + "RTA_TABLE": RtAttrTable, + "RTA_VIA": RtAttrVia, + "RTA_NH_ID": RtAttrNhId, +} + + +ifla_class_map = { + "IFLA_MTU": RtAttrU32, +} + +ifa_class_map = { + "IFA_ADDRESS": RtAttrIp, + "IFA_LOCAL": RtAttrIp, + "IFA_LABEL": RtAttrStr, + "IFA_BROADCAST": RtAttrIp, + "IFA_ANYCAST": RtAttrIp, + "IFA_FLAGS": RtAttrU32, +} + + +class BaseNetlinkMessage(object): + def 
__init__(self, helper, nlmsg_type): + self.nlmsg_type = nlmsg_type + self.ut = unittest.TestCase() + self.rta_list = [] + self._orig_data = None + self.helper = helper + self.nl_hdr = Nlmsghdr(nlmsg_type=nlmsg_type) + + def assertEqual(self, a, b, msg=None): + self.ut.assertEqual(a, b, msg) + + def assertNotEqual(self, a, b, msg=None): + self.ut.assertNotEqual(a, b, msg) + + @staticmethod + def parse_nl_header(data: bytes): + if len(data) < sizeof(Nlmsghdr): + raise ValueError("length less than netlink message header") + return Nlmsghdr.from_buffer_copy(data), sizeof(Nlmsghdr) + + def is_reply(self, hdr): + return hdr.nlmsg_type == NlMsgType.NLMSG_ERROR.value + + def print_nl_header(self, hdr, prepend=""): + # len=44, type=RTM_DELROUTE, flags=NLM_F_REQUEST|NLM_F_ACK, seq=1641163704, pid=0 # noqa: E501 + is_reply = self.is_reply(hdr) + msg_name = self.helper.get_nlmsg_name(hdr.nlmsg_type) + print("{}len={}, type={}, flags={}(0x{:X}), seq={}, pid={}".format( + prepend, + hdr.nlmsg_len, + msg_name, + self.helper.get_nlm_flags_str(msg_name, is_reply, hdr.nlmsg_flags), # noqa: E501 + hdr.nlmsg_flags, + hdr.nlmsg_seq, + hdr.nlmsg_pid + )) + + @classmethod + def from_bytes(cls, helper, data): + try: + hdr, hdrlen = BaseNetlinkMessage.parse_nl_header(data) + self = cls(helper, hdr.nlmsg_type) + self._orig_data = data + self.nl_hdr = hdr + except ValueError as e: + print("Failed to parse nl header: {}".format(e)) + cls.print_as_bytes(data, "msg dump") # descr was missing: the TypeError hid the original ValueError + raise + return self + + def print_message(self): + self.print_nl_header(self.nl_hdr) + + @staticmethod + def print_as_bytes(data: bytes, descr: str = "data"): + print("===vv {} (len:{:3d}) vv===".format(descr, len(data))) + off = 0 + step = 16 + while off < len(data): + for i in range(step): + if off + i < len(data): + print(" {:02X}".format(data[off + i]), end="") + print("") + off += step + print("--------------------") + + +class NetlinkErrorMessage(BaseNetlinkMessage): + messages = [NlMsgType.NLMSG_ERROR.value] + + def __init__(self, helper, 
nlmsg_type, error=None): + super().__init__(helper, nlmsg_type) + self.err_hdr = Nlmsgerr() # NOTE(review): not filled from received bytes anywhere in the visible code + + def print_error_header(self, errhdr, prepend=""): + print("{}error={}, ".format(prepend, errhdr.error), end="") + self.print_nl_header(errhdr.msg, prepend) + + def print_message(self, prepend=""): + self.print_nl_header(self.nl_hdr, prepend) + self.print_error_header(self.err_hdr, prepend + " ") + + +class BaseNetlinkRtMessage(BaseNetlinkMessage): + attr_class_map = {} + attr_enum = None + + def __init__(self, helper, nlm_type): + super().__init__(helper, nlm_type) + self.base_hdr = None + + def parse_rta_list(self, data: bytes) -> List[BaseRtAttr]: + ret = [] + offset = 0 + while offset < len(data): + # print("OFFSET={}".format(offset)) + if offset + 4 > len(data): + raise ValueError("only {} bytes remaining".format(len(data) - offset)) # noqa: E501 + rta_hdr = RtAttr.from_buffer_copy(data[offset:]) + rta_type_str = self.helper.get_attr_byval(self.attr_enum, rta_hdr.rta_type) # noqa: E501 + cls = self.attr_class_map.get(rta_type_str, BaseRtAttr) + rta = cls.from_bytes(self, data[offset:]) + offset += align4(rta.rta_len) + if rta.rta_len == 0: + raise ValueError("empty rta len, {} bytes remaining".format(len(data) - offset)) # noqa: E501 + ret.append(rta) + return ret, offset + + @classmethod + def from_bytes(cls, helper, data): + try: + hdr, hdrlen = BaseNetlinkMessage.parse_nl_header(data) + self = cls(helper, hdr.nlmsg_type) + self._orig_data = data + self.nl_hdr = hdr + except ValueError as e: + print("Failed to parse nl header: {}".format(e)) + cls.print_as_bytes(data, "msg dump") + raise + + offset = align4(hdrlen) + try: + base_hdr, hdrlen = self.parse_base_header(data[offset:]) + self.base_hdr = base_hdr + offset += align4(hdrlen) + except ValueError as e: + print("Failed to parse nl rt header: {}".format(e)) + cls.print_as_bytes(data, "msg dump") + raise + + orig_offset = offset + try: + rta_list, rta_len = self.parse_rta_list(data[offset:]) + offset += rta_len + if offset != len(data): + raise 
ValueError("{} bytes left at the end of the packet".format(len(data) - offset)) # noqa: E501 + self.rta_list = rta_list + except ValueError as e: + print("Failed to parse nl rta attributes at offset {}: {}".format(orig_offset, e)) # noqa: E501 + cls.print_as_bytes(data, "msg dump") + cls.print_as_bytes(data[orig_offset:], "failed block") + raise + return self + + def __bytes__(self): + ret = bytes() + for rta in self.rta_list: + ret += bytes(rta) + ret = bytes(self.base_hdr) + ret + self.nl_hdr.nlmsg_len = len(ret) + sizeof(Nlmsghdr) + return bytes(self.nl_hdr) + ret + + def print_message(self): + self.print_nl_header(self.nl_hdr) + self.print_base_header(self.base_hdr, " ") + for rta in self.rta_list: + rta.print_attribute(" ") + + +class NetlinkRtMessage(BaseNetlinkRtMessage): + messages = [ + NlRtMsgType.RTM_NEWROUTE.value, + NlRtMsgType.RTM_DELROUTE.value, + NlRtMsgType.RTM_GETROUTE.value, + ] + attr_class_map = rta_class_map + attr_enum = RtattrType + + def __init__(self, helper, nlm_type): + super().__init__(helper, nlm_type) + self.base_hdr = RtMsgHdr() + + def parse_base_header(self, data): + if len(data) < sizeof(RtMsgHdr): + raise ValueError("length less than rtmsg header") + rtm_hdr = RtMsgHdr.from_buffer_copy(data) + return (rtm_hdr, sizeof(RtMsgHdr)) + + def print_base_header(self, hdr, prepend=""): + family = self.helper.get_af_name(hdr.rtm_family) + print("{}family={}, dst_len={}, src_len={}, tos={}, table={}, protocol={}({}), scope={}({}), type={}({}), flags={}({})".format( # noqa: E501 + prepend, + family, + hdr.rtm_dst_len, + hdr.rtm_src_len, + hdr.rtm_tos, + hdr.rtm_table, + self.helper.get_attr_byval(RtProto, hdr.rtm_protocol), + hdr.rtm_protocol, + self.helper.get_attr_byval(RtScope, hdr.rtm_scope), + hdr.rtm_scope, + self.helper.get_attr_byval(RtType, hdr.rtm_type), + hdr.rtm_type, + self.helper.get_bitmask_str(RtMsgFlags, hdr.rtm_flags), + hdr.rtm_flags)) + + +class NetlinkIflaMessage(BaseNetlinkRtMessage): + messages = [ + 
NlRtMsgType.RTM_NEWLINK.value, + NlRtMsgType.RTM_DELLINK.value, + NlRtMsgType.RTM_GETLINK.value, + ] + attr_class_map = ifla_class_map + attr_enum = IflattrType + + def __init__(self, helper, nlm_type): + super().__init__(helper, nlm_type) + self.base_hdr = IfinfoMsg() + + def parse_base_header(self, data): + if len(data) < sizeof(IfinfoMsg): + raise ValueError("length less than IfinfoMsg header") + rtm_hdr = IfinfoMsg.from_buffer_copy(data) + return (rtm_hdr, sizeof(IfinfoMsg)) + + def print_base_header(self, hdr, prepend=""): + family = self.helper.get_af_name(hdr.ifi_family) + print("{}family={}, ifi_type={}, ifi_index={}, ifi_flags={}, ifi_change={}".format( # noqa: E501 + prepend, + family, + hdr.ifi_type, + hdr.ifi_index, + hdr.ifi_flags, + hdr.ifi_change)) + + +class NetlinkIfaMessage(BaseNetlinkRtMessage): + messages = [ + NlRtMsgType.RTM_NEWADDR.value, + NlRtMsgType.RTM_DELADDR.value, + NlRtMsgType.RTM_GETADDR.value, + ] + attr_class_map = ifa_class_map + attr_enum = IfattrType + + def __init__(self, helper, nlm_type): + super().__init__(helper, nlm_type) + self.base_hdr = IfaddrMsg() + + def parse_base_header(self, data): + if len(data) < sizeof(IfaddrMsg): + raise ValueError("length less than IfaddrMsg header") + rtm_hdr = IfaddrMsg.from_buffer_copy(data) + return (rtm_hdr, sizeof(IfaddrMsg)) + + def print_base_header(self, hdr, prepend=""): + family = self.helper.get_af_name(hdr.ifa_family) + print("{}family={}, ifa_prefixlen={}, ifa_flags={}, ifa_scope={}, ifa_index={}".format( # noqa: E501 + prepend, + family, + hdr.ifa_prefixlen, + hdr.ifa_flags, + hdr.ifa_scope, + hdr.ifa_index)) + + +class Nlsock(): + def __init__(self, helper): + self.helper = helper + self.sock_fd = self._setup_netlink() + self._data = bytes() + self.rtm_seq = 1 + self.pid = os.getpid() + self.msgmap = self.build_msgmap() + self.set_groups(NlRtGroup.RTNLGRP_IPV4_ROUTE.value | NlRtGroup.RTNLGRP_IPV6_ROUTE.value) # noqa: E501 + + def build_msgmap(self): + classes = 
[NetlinkRtMessage, NetlinkIfaMessage, NetlinkErrorMessage] + xmap = {} + for cls in classes: + for message in cls.messages: + xmap[message] = cls + return xmap + + def get_seq(self): + ret = self.rtm_seq + self.rtm_seq += 1 + return ret + + def _setup_netlink(self) -> socket.socket: + family = self.helper.get_af_value("AF_NETLINK") + s = socket.socket(family, socket.SOCK_RAW, NlConst.NETLINK_ROUTE) + return s + + def set_groups(self, mask: int): + self.sock_fd.setsockopt(socket.SOL_SOCKET, 1, mask) + # snl = SockaddrNl(nl_len = sizeof(SockaddrNl), nl_family=38, + # nl_pid=self.pid, nl_groups=mask) + # xbuffer = create_string_buffer(sizeof(SockaddrNl)) + # memmove(xbuffer, addressof(snl), sizeof(SockaddrNl)) + # k = struct.pack("@BBHII", 12, 38, 0, self.pid, mask) + # self.sock_fd.bind(k) + + def write_message(self, msg): + print("vvvvvvvv OUT vvvvvvvv") + msg.print_message() + msg_bytes = bytes(msg) + try: + ret = os.write(self.sock_fd.fileno(), msg_bytes) + except Exception as e: + print("write({}) -> {}".format(len(msg_bytes), e)) + + def parse_message(self, data: bytes): + if len(data) < sizeof(Nlmsghdr): + raise Exception("Short read from nl: {} bytes".format(len(data))) + hdr = Nlmsghdr.from_buffer_copy(data) + nlmsg_type = hdr.nlmsg_type + cls = self.msgmap.get(nlmsg_type) + if not cls: + cls = BaseNetlinkMessage + return cls.from_bytes(self.helper, data) + + def write_data(self, data: bytes): + self.sock_fd.send(data) + + def read_data(self, seq=None): # seq was an undefined name (NameError on a short read); None keeps the single-recv behaviour + while True: + data = self.sock_fd.recv(65535) + self._data += data + if len(self._data) >= sizeof(Nlmsghdr): + break + if seq is None: + break + hdr = Nlmsghdr.from_buffer_copy(data) + if hdr.nlmsg_pid == self.pid and hdr.nlmsg_seq == seq: + break + return data + + def read_message(self) -> BaseNetlinkMessage: + if len(self._data) < sizeof(Nlmsghdr): + self.read_data() + hdr = Nlmsghdr.from_buffer_copy(self._data) + while (hdr.nlmsg_len > len(self._data)): + self.read_data() + raw_msg = self._data[:hdr.nlmsg_len] + self._data = 
self._data[hdr.nlmsg_len:] + return self.parse_message(raw_msg) + + def fill_msg_seq(self, msg): + msg.nl_hdr.nlmsg_seq = self.get_seq() + msg.nl_hdr.nlmsg_pid = self.pid + + def request_ifaddrs(self, family): + msg = NetlinkIfaMessage(self.helper, NlRtMsgType.RTM_GETADDR.value) + flags = NlmGetFlags.NLM_F_ROOT.value | NlmGetFlags.NLM_F_MATCH.value + self.fill_msg_seq(msg) + msg.base_hdr.ifa_family = family + msg.nl_hdr.nlmsg_flags = flags | NlmBaseFlags.NLM_F_REQUEST.value + + msg_bytes = bytes(msg) + x = self.parse_message(msg_bytes) + x.print_message() + print(msg_bytes) + # Skip family for now + self.write_data(msg_bytes) + + def request_routes(self, family): + msg = NetlinkRtMessage(self.helper, NlRtMsgType.RTM_GETROUTE.value) + flags = NlmGetFlags.NLM_F_ROOT.value | NlmGetFlags.NLM_F_MATCH.value + self.fill_msg_seq(msg) + msg.base_hdr.rtm_family = family + msg.nl_hdr.nlmsg_flags = flags | NlmBaseFlags.NLM_F_REQUEST.value + + msg_bytes = bytes(msg) + x = self.parse_message(msg_bytes) + x.print_message() + print(msg_bytes) + # Skip family for now + self.write_data(msg_bytes) + + +def main(): + helper = NlHelper() + nl = Nlsock(helper) + # nl.request_ifaddrs(socket.AF_INET) + nl.request_routes(0) + while True: + msg = nl.read_message() + print("") + msg.print_message() + + pass + + +if __name__ == "__main__": + main()