diff --git a/sys/netlink/netlink_ctl.h b/sys/netlink/netlink_ctl.h index 58f7f6c4abe3..895f70322a29 100644 --- a/sys/netlink/netlink_ctl.h +++ b/sys/netlink/netlink_ctl.h @@ -1,127 +1,126 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _NETLINK_NETLINK_CTL_H_ #define _NETLINK_NETLINK_CTL_H_ #ifdef _KERNEL /* * This file provides headers for the public KPI of the netlink * subsystem */ #include MALLOC_DECLARE(M_NETLINK); /* * Macro for handling attribute TLVs */ #define _roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) #define NETLINK_ALIGN_SIZE sizeof(uint32_t) #define NETLINK_ALIGN(_len) _roundup2(_len, NETLINK_ALIGN_SIZE) #define NLA_ALIGN_SIZE sizeof(uint32_t) #define NLA_ALIGN(_len) _roundup2(_len, NLA_ALIGN_SIZE) #define NLA_HDRLEN ((uint16_t)sizeof(struct nlattr)) #define NLA_DATA_LEN(_nla) ((_nla)->nla_len - NLA_HDRLEN) #define NLA_DATA(_nla) NL_ITEM_DATA(_nla, NLA_HDRLEN) #define NLA_DATA_CONST(_nla) NL_ITEM_DATA_CONST(_nla, NLA_HDRLEN) #define NLA_TYPE(_nla) ((_nla)->nla_type & 0x3FFF) #ifndef typeof #define typeof __typeof #endif #define NLA_NEXT(_attr) (struct nlattr *)((char *)_attr + NLA_ALIGN(_attr->nla_len)) #define _NLA_END(_start, _len) ((char *)(_start) + (_len)) #define NLA_FOREACH(_attr, _start, _len) \ for (typeof(_attr) _end = (typeof(_attr))_NLA_END(_start, _len), _attr = (_start); \ ((char *)_attr < (char *)_end) && \ ((char *)NLA_NEXT(_attr) <= (char *)_end); \ _attr = (_len -= NLA_ALIGN(_attr->nla_len), NLA_NEXT(_attr))) #include #include /* Protocol handlers */ struct nl_pstate; typedef int (*nl_handler_f)(struct nlmsghdr *hdr, struct nl_pstate *npt); bool netlink_register_proto(int proto, const char *proto_name, nl_handler_f handler); bool netlink_unregister_proto(int proto); /* Common helpers */ -bool nl_has_listeners(uint16_t netlink_family, uint32_t groups_mask); bool nlp_has_priv(struct nlpcb *nlp, int priv); struct ucred *nlp_get_cred(struct nlpcb *nlp); uint32_t nlp_get_pid(const struct nlpcb *nlp); bool nlp_unconstrained_vnet(const struct nlpcb *nlp); /* netlink_generic.c */ struct genl_cmd { const char *cmd_name; nl_handler_f cmd_cb; uint32_t cmd_flags; uint32_t cmd_priv; uint32_t cmd_num; }; uint16_t genl_register_family(const char *family_name, size_t hdrsize, uint16_t 
family_version, uint16_t max_attr_idx); bool genl_unregister_family(const char *family_name); bool genl_register_cmds(const char *family_name, const struct genl_cmd *cmds, int count); uint32_t genl_register_group(const char *family_name, const char *group_name); struct genl_family; const char *genl_get_family_name(const struct genl_family *gf); uint16_t genl_get_family_id(const struct genl_family *gf); typedef void (*genl_family_event_handler_t)(void *arg, const struct genl_family *gf, int action); EVENTHANDLER_DECLARE(genl_family_event, genl_family_event_handler_t); struct thread; #if defined(NETLINK) || defined(NETLINK_MODULE) /* Provide optimized calls to the functions inside the same linking unit */ struct nlpcb *_nl_get_thread_nlp(struct thread *td); static inline struct nlpcb * nl_get_thread_nlp(struct thread *td) { return (_nl_get_thread_nlp(td)); } #else /* Provide access to the functions via netlink_glue.c */ struct nlpcb *nl_get_thread_nlp(struct thread *td); #endif /* defined(NETLINK) || defined(NETLINK_MODULE) */ #endif #endif diff --git a/sys/netlink/netlink_domain.c b/sys/netlink/netlink_domain.c index 94f880cbfb61..0567537d1d1d 100644 --- a/sys/netlink/netlink_domain.c +++ b/sys/netlink/netlink_domain.c @@ -1,1013 +1,984 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2021 Ng Peng Nam Sean * Copyright (c) 2022 Alexander V. Chernikov * Copyright (c) 2023 Gleb Smirnoff * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This file contains socket and protocol bindings for netlink. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define DEBUG_MOD_NAME nl_domain #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include _DECLARE_DEBUG(LOG_INFO); _Static_assert((NLP_MAX_GROUPS % 64) == 0, "NLP_MAX_GROUPS has to be multiple of 64"); _Static_assert(NLP_MAX_GROUPS >= 64, "NLP_MAX_GROUPS has to be at least 64"); #define NLCTL_TRACKER struct rm_priotracker nl_tracker -#define NLCTL_RLOCK(_ctl) rm_rlock(&((_ctl)->ctl_lock), &nl_tracker) -#define NLCTL_RUNLOCK(_ctl) rm_runlock(&((_ctl)->ctl_lock), &nl_tracker) +#define NLCTL_RLOCK() rm_rlock(&V_nl_ctl.ctl_lock, &nl_tracker) +#define NLCTL_RUNLOCK() rm_runlock(&V_nl_ctl.ctl_lock, &nl_tracker) +#define NLCTL_LOCK_ASSERT() rm_assert(&V_nl_ctl.ctl_lock, RA_LOCKED) -#define NLCTL_WLOCK(_ctl) rm_wlock(&((_ctl)->ctl_lock)) -#define NLCTL_WUNLOCK(_ctl) rm_wunlock(&((_ctl)->ctl_lock)) +#define NLCTL_WLOCK() rm_wlock(&V_nl_ctl.ctl_lock) +#define NLCTL_WUNLOCK() rm_wunlock(&V_nl_ctl.ctl_lock) +#define NLCTL_WLOCK_ASSERT() rm_assert(&V_nl_ctl.ctl_lock, RA_WLOCKED) static u_long nl_sendspace = NLSNDQ; SYSCTL_ULONG(_net_netlink, OID_AUTO, sendspace, CTLFLAG_RW, &nl_sendspace, 0, "Default netlink socket send space"); static u_long nl_recvspace = NLSNDQ; SYSCTL_ULONG(_net_netlink, OID_AUTO, recvspace, CTLFLAG_RW, &nl_recvspace, 0, "Default netlink socket receive space"); extern u_long sb_max_adj; static u_long nl_maxsockbuf = 512 * 1024 * 1024; /* 512M, XXX: init based on physmem */ static int sysctl_handle_nl_maxsockbuf(SYSCTL_HANDLER_ARGS); SYSCTL_OID(_net_netlink, OID_AUTO, nl_maxsockbuf, CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, &nl_maxsockbuf, 0, sysctl_handle_nl_maxsockbuf, "LU", "Maximum Netlink socket buffer size"); static unsigned int osd_slot_id = 0; void nl_osd_register(void) { osd_slot_id = osd_register(OSD_THREAD, NULL, NULL); } void nl_osd_unregister(void) { osd_deregister(OSD_THREAD, osd_slot_id); } struct nlpcb * _nl_get_thread_nlp(struct thread *td) { return (osd_get(OSD_THREAD, &td->td_osd, osd_slot_id)); } void nl_set_thread_nlp(struct thread *td, struct nlpcb *nlp) { NLP_LOG(LOG_DEBUG2, nlp, "Set thread %p nlp to %p (slot %u)", td, nlp, osd_slot_id); if (osd_set(OSD_THREAD, &td->td_osd, osd_slot_id, nlp) == 0) return; /* Failed, need to realloc */ void **rsv = osd_reserve(osd_slot_id); osd_set_reserved(OSD_THREAD, &td->td_osd, osd_slot_id, rsv, nlp); } /* * Looks up a nlpcb struct based on the @portid. Need to claim nlsock_mtx. 
* Returns nlpcb pointer if present else NULL */ static struct nlpcb * nl_port_lookup(uint32_t port_id) { struct nlpcb *nlp; - CK_LIST_FOREACH(nlp, &V_nl_ctl->ctl_port_head, nl_port_next) { + CK_LIST_FOREACH(nlp, &V_nl_ctl.ctl_port_head, nl_port_next) { if (nlp->nl_port == port_id) return (nlp); } return (NULL); } static void nl_add_group_locked(struct nlpcb *nlp, unsigned int group_id) { MPASS(group_id < NLP_MAX_GROUPS); /* TODO: add family handler callback */ if (!nlp_unconstrained_vnet(nlp)) return; BIT_SET(NLP_MAX_GROUPS, group_id, &nlp->nl_groups); } static void nl_del_group_locked(struct nlpcb *nlp, unsigned int group_id) { MPASS(group_id < NLP_MAX_GROUPS); BIT_CLR(NLP_MAX_GROUPS, group_id, &nlp->nl_groups); } static bool nl_isset_group_locked(struct nlpcb *nlp, unsigned int group_id) { MPASS(group_id < NLP_MAX_GROUPS); return (BIT_ISSET(NLP_MAX_GROUPS, group_id, &nlp->nl_groups)); } static uint32_t nl_get_groups_compat(struct nlpcb *nlp) { uint32_t groups_mask = 0; for (int i = 0; i < 32; i++) { if (nl_isset_group_locked(nlp, i + 1)) groups_mask |= (1 << i); } return (groups_mask); } static struct nl_buf * nl_buf_copy(struct nl_buf *nb) { struct nl_buf *copy; copy = nl_buf_alloc(nb->buflen, M_NOWAIT); if (__predict_false(copy == NULL)) return (NULL); memcpy(copy, nb, sizeof(*nb) + nb->buflen); return (copy); } /* * Broadcasts in the writer's buffer. */ bool nl_send_group(struct nl_writer *nw) { struct nl_buf *nb = nw->buf; struct nlpcb *nlp_last = NULL; struct nlpcb *nlp; NLCTL_TRACKER; IF_DEBUG_LEVEL(LOG_DEBUG2) { struct nlmsghdr *hdr = (struct nlmsghdr *)nb->data; NL_LOG(LOG_DEBUG2, "MCAST len %u msg type %d len %u to group %d/%d", nb->datalen, hdr->nlmsg_type, hdr->nlmsg_len, nw->group.proto, nw->group.id); } nw->buf = NULL; - struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); - if (__predict_false(ctl == NULL)) { - /* - * Can be the case when notification is sent within VNET - * which doesn't have any netlink sockets. - */ - nl_buf_free(nb); - return (false); - } - - NLCTL_RLOCK(ctl); - - CK_LIST_FOREACH(nlp, &ctl->ctl_pcb_head, nl_next) { + NLCTL_RLOCK(); + CK_LIST_FOREACH(nlp, &V_nl_ctl.ctl_pcb_head, nl_next) { if ((nw->group.priv == 0 || priv_check_cred( nlp->nl_socket->so_cred, nw->group.priv) == 0) && nlp->nl_proto == nw->group.proto && nl_isset_group_locked(nlp, nw->group.id)) { if (nlp_last != NULL) { struct nl_buf *copy; copy = nl_buf_copy(nb); if (copy != NULL) { nw->buf = copy; (void)nl_send(nw, nlp_last); } else { NLP_LOCK(nlp_last); if (nlp_last->nl_socket != NULL) sorwakeup(nlp_last->nl_socket); NLP_UNLOCK(nlp_last); } } nlp_last = nlp; } } if (nlp_last != NULL) { nw->buf = nb; (void)nl_send(nw, nlp_last); } else nl_buf_free(nb); - NLCTL_RUNLOCK(ctl); + NLCTL_RUNLOCK(); return (true); } -bool -nl_has_listeners(uint16_t netlink_family, uint32_t groups_mask) -{ - return (V_nl_ctl != NULL); -} - static uint32_t nl_find_port(void) { /* * app can open multiple netlink sockets. * Start with current pid, if already taken, * try random numbers in 65k..256k+65k space, * avoiding clash with pids. 
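 * With the constants below, the random candidates are (arc4random() % 65536) + 65536 * 4, i.e. port ids in the range 262144..327679, well above any valid pid.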
*/ if (nl_port_lookup(curproc->p_pid) == NULL) return (curproc->p_pid); for (int i = 0; i < 16; i++) { uint32_t nl_port = (arc4random() % 65536) + 65536 * 4; if (nl_port_lookup(nl_port) == 0) return (nl_port); NL_LOG(LOG_DEBUG3, "tried %u\n", nl_port); } return (curproc->p_pid); } static int nl_bind_locked(struct nlpcb *nlp, struct sockaddr_nl *snl) { if (nlp->nl_bound) { if (nlp->nl_port != snl->nl_pid) { NL_LOG(LOG_DEBUG, "bind() failed: program pid %d " "is different from provided pid %d", nlp->nl_port, snl->nl_pid); return (EINVAL); // XXX: better error } } else { if (snl->nl_pid == 0) snl->nl_pid = nl_find_port(); if (nl_port_lookup(snl->nl_pid) != NULL) return (EADDRINUSE); nlp->nl_port = snl->nl_pid; nlp->nl_bound = true; - CK_LIST_INSERT_HEAD(&V_nl_ctl->ctl_port_head, nlp, nl_port_next); + CK_LIST_INSERT_HEAD(&V_nl_ctl.ctl_port_head, nlp, nl_port_next); } for (int i = 0; i < 32; i++) { if (snl->nl_groups & ((uint32_t)1 << i)) nl_add_group_locked(nlp, i + 1); else nl_del_group_locked(nlp, i + 1); } return (0); } static int nl_attach(struct socket *so, int proto, struct thread *td) { struct nlpcb *nlp; int error; if (__predict_false(netlink_unloading != 0)) return (EAFNOSUPPORT); error = nl_verify_proto(proto); if (error != 0) return (error); bool is_linux = SV_PROC_ABI(td->td_proc) == SV_ABI_LINUX; NL_LOG(LOG_DEBUG2, "socket %p, %sPID %d: attaching socket to %s", so, is_linux ? "(linux) " : "", curproc->p_pid, nl_get_proto_name(proto)); - /* Create per-VNET state on first socket init */ - struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); - if (ctl == NULL) - ctl = vnet_nl_ctl_init(); - KASSERT(V_nl_ctl != NULL, ("nl_attach: vnet_sock_init() failed")); - - MPASS(sotonlpcb(so) == NULL); - nlp = malloc(sizeof(struct nlpcb), M_PCB, M_WAITOK | M_ZERO); error = soreserve(so, nl_sendspace, nl_recvspace); if (error != 0) { free(nlp, M_PCB); return (error); } TAILQ_INIT(&so->so_rcv.nl_queue); TAILQ_INIT(&so->so_snd.nl_queue); so->so_pcb = nlp; nlp->nl_socket = so; /* Copy so_cred to avoid having socket_var.h in every header */ nlp->nl_cred = so->so_cred; nlp->nl_proto = proto; nlp->nl_process_id = curproc->p_pid; nlp->nl_linux = is_linux; nlp->nl_unconstrained_vnet = !jailed_without_vnet(so->so_cred); nlp->nl_need_thread_setup = true; NLP_LOCK_INIT(nlp); refcount_init(&nlp->nl_refcount, 1); nlp->nl_taskqueue = taskqueue_create("netlink_socket", M_WAITOK, taskqueue_thread_enqueue, &nlp->nl_taskqueue); TASK_INIT(&nlp->nl_task, 0, nl_taskqueue_handler, nlp); taskqueue_start_threads(&nlp->nl_taskqueue, 1, PWAIT, "netlink_socket (PID %u)", nlp->nl_process_id); - NLCTL_WLOCK(ctl); - /* XXX: check ctl is still alive */ - CK_LIST_INSERT_HEAD(&ctl->ctl_pcb_head, nlp, nl_next); - NLCTL_WUNLOCK(ctl); + NLCTL_WLOCK(); + CK_LIST_INSERT_HEAD(&V_nl_ctl.ctl_pcb_head, nlp, nl_next); + NLCTL_WUNLOCK(); soisconnected(so); return (0); } static int nl_bind(struct socket *so, struct sockaddr *sa, struct thread *td) { - struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); struct nlpcb *nlp = sotonlpcb(so); struct sockaddr_nl *snl = (struct sockaddr_nl *)sa; int error; NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); if (snl->nl_len != sizeof(*snl)) { NL_LOG(LOG_DEBUG, "socket %p, wrong sizeof(), ignoring bind()", so); return (EINVAL); } - NLCTL_WLOCK(ctl); + NLCTL_WLOCK(); NLP_LOCK(nlp); error = nl_bind_locked(nlp, snl); NLP_UNLOCK(nlp); - NLCTL_WUNLOCK(ctl); + NLCTL_WUNLOCK(); NL_LOG(LOG_DEBUG2, "socket %p, bind() to %u, groups %u, error %d", so, snl->nl_pid, snl->nl_groups, error); return (error); 
} static int nl_assign_port(struct nlpcb *nlp, uint32_t port_id) { - struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); struct sockaddr_nl snl = { .nl_pid = port_id, }; int error; - NLCTL_WLOCK(ctl); + NLCTL_WLOCK(); NLP_LOCK(nlp); snl.nl_groups = nl_get_groups_compat(nlp); error = nl_bind_locked(nlp, &snl); NLP_UNLOCK(nlp); - NLCTL_WUNLOCK(ctl); + NLCTL_WUNLOCK(); NL_LOG(LOG_DEBUG3, "socket %p, port assign: %d, error: %d", nlp->nl_socket, port_id, error); return (error); } /* * nl_autobind_port binds a unused portid to @nlp * @nlp: pcb data for the netlink socket * @candidate_id: first id to consider */ static int nl_autobind_port(struct nlpcb *nlp, uint32_t candidate_id) { - struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); uint32_t port_id = candidate_id; NLCTL_TRACKER; bool exist; int error = EADDRINUSE; for (int i = 0; i < 10; i++) { NL_LOG(LOG_DEBUG3, "socket %p, trying to assign port %d", nlp->nl_socket, port_id); - NLCTL_RLOCK(ctl); + NLCTL_RLOCK(); exist = nl_port_lookup(port_id) != 0; - NLCTL_RUNLOCK(ctl); + NLCTL_RUNLOCK(); if (!exist) { error = nl_assign_port(nlp, port_id); if (error != EADDRINUSE) break; } port_id++; } NL_LOG(LOG_DEBUG3, "socket %p, autobind to %d, error: %d", nlp->nl_socket, port_id, error); return (error); } static int nl_connect(struct socket *so, struct sockaddr *sa, struct thread *td) { struct sockaddr_nl *snl = (struct sockaddr_nl *)sa; struct nlpcb *nlp; NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); if (snl->nl_len != sizeof(*snl)) { NL_LOG(LOG_DEBUG, "socket %p, wrong sizeof(), ignoring bind()", so); return (EINVAL); } nlp = sotonlpcb(so); if (!nlp->nl_bound) { int error = nl_autobind_port(nlp, td->td_proc->p_pid); if (error != 0) { NL_LOG(LOG_DEBUG, "socket %p, nl_autobind() failed: %d", so, error); return (error); } } /* XXX: Handle socket flags & multicast */ soisconnected(so); NL_LOG(LOG_DEBUG2, "socket %p, connect to %u", so, snl->nl_pid); return (0); } static void destroy_nlpcb_epoch(epoch_context_t ctx) { struct nlpcb *nlp; nlp = __containerof(ctx, struct nlpcb, nl_epoch_ctx); NLP_LOCK_DESTROY(nlp); free(nlp, M_PCB); } static void nl_close(struct socket *so) { - struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); MPASS(sotonlpcb(so) != NULL); struct nlpcb *nlp; struct nl_buf *nb; NL_LOG(LOG_DEBUG2, "detaching socket %p, PID %d", so, curproc->p_pid); nlp = sotonlpcb(so); /* Mark as inactive so no new work can be enqueued */ NLP_LOCK(nlp); bool was_bound = nlp->nl_bound; NLP_UNLOCK(nlp); /* Wait till all scheduled work has been completed */ taskqueue_drain_all(nlp->nl_taskqueue); taskqueue_free(nlp->nl_taskqueue); - NLCTL_WLOCK(ctl); + NLCTL_WLOCK(); NLP_LOCK(nlp); if (was_bound) { CK_LIST_REMOVE(nlp, nl_port_next); NL_LOG(LOG_DEBUG3, "socket %p, unlinking bound pid %u", so, nlp->nl_port); } CK_LIST_REMOVE(nlp, nl_next); nlp->nl_socket = NULL; NLP_UNLOCK(nlp); - NLCTL_WUNLOCK(ctl); + NLCTL_WUNLOCK(); so->so_pcb = NULL; while ((nb = TAILQ_FIRST(&so->so_snd.nl_queue)) != NULL) { TAILQ_REMOVE(&so->so_snd.nl_queue, nb, tailq); nl_buf_free(nb); } while ((nb = TAILQ_FIRST(&so->so_rcv.nl_queue)) != NULL) { TAILQ_REMOVE(&so->so_rcv.nl_queue, nb, tailq); nl_buf_free(nb); } NL_LOG(LOG_DEBUG3, "socket %p, detached", so); /* XXX: is delayed free needed? 
*/ NET_EPOCH_CALL(destroy_nlpcb_epoch, &nlp->nl_epoch_ctx); } static int nl_disconnect(struct socket *so) { NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); MPASS(sotonlpcb(so) != NULL); return (ENOTCONN); } static int nl_sockaddr(struct socket *so, struct sockaddr *sa) { *(struct sockaddr_nl *)sa = (struct sockaddr_nl ){ /* TODO: set other fields */ .nl_len = sizeof(struct sockaddr_nl), .nl_family = AF_NETLINK, .nl_pid = sotonlpcb(so)->nl_port, }; return (0); } static int nl_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *m, struct mbuf *control, int flags, struct thread *td) { struct nlpcb *nlp = sotonlpcb(so); struct sockbuf *sb = &so->so_snd; struct nl_buf *nb; size_t len; int error; MPASS(m == NULL && uio != NULL); NL_LOG(LOG_DEBUG2, "sending message to kernel"); if (__predict_false(control != NULL)) { m_freem(control); return (EINVAL); } if (__predict_false(flags & MSG_OOB)) /* XXXGL: or just ignore? */ return (EOPNOTSUPP); if (__predict_false(uio->uio_resid < sizeof(struct nlmsghdr))) return (ENOBUFS); /* XXXGL: any better error? */ NL_LOG(LOG_DEBUG3, "sending message to kernel async processing"); error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags)); if (error) return (error); len = roundup2(uio->uio_resid, 8) + SCRATCH_BUFFER_SIZE; if (nlp->nl_linux) len += roundup2(uio->uio_resid, 8); nb = nl_buf_alloc(len, M_WAITOK); nb->datalen = uio->uio_resid; error = uiomove(&nb->data[0], uio->uio_resid, uio); if (__predict_false(error)) goto out; SOCK_SENDBUF_LOCK(so); restart: if (sb->sb_hiwat - sb->sb_ccc >= nb->datalen) { TAILQ_INSERT_TAIL(&sb->nl_queue, nb, tailq); sb->sb_acc += nb->datalen; sb->sb_ccc += nb->datalen; nb = NULL; } else if ((so->so_state & SS_NBIO) || (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { SOCK_SENDBUF_UNLOCK(so); error = EWOULDBLOCK; goto out; } else { if ((error = sbwait(so, SO_SND)) != 0) { SOCK_SENDBUF_UNLOCK(so); goto out; } else goto restart; } SOCK_SENDBUF_UNLOCK(so); if (nb == NULL) { NL_LOG(LOG_DEBUG3, "enqueue %u bytes", nb->datalen); NLP_LOCK(nlp); nl_schedule_taskqueue(nlp); NLP_UNLOCK(nlp); } out: SOCK_IO_SEND_UNLOCK(so); if (nb != NULL) nl_buf_free(nb); return (error); } /* Create control data for recvmsg(2) on Netlink socket. */ static struct mbuf * nl_createcontrol(struct nlpcb *nlp) { struct { struct nlattr nla; uint32_t val; } data[] = { { .nla.nla_len = sizeof(struct nlattr) + sizeof(uint32_t), .nla.nla_type = NLMSGINFO_ATTR_PROCESS_ID, .val = nlp->nl_process_id, }, { .nla.nla_len = sizeof(struct nlattr) + sizeof(uint32_t), .nla.nla_type = NLMSGINFO_ATTR_PORT_ID, .val = nlp->nl_port, }, }; return (sbcreatecontrol(data, sizeof(data), NETLINK_MSG_INFO, SOL_NETLINK, M_WAITOK)); } static int nl_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp, struct mbuf **controlp, int *flagsp) { static const struct sockaddr_nl nl_empty_src = { .nl_len = sizeof(struct sockaddr_nl), .nl_family = PF_NETLINK, .nl_pid = 0 /* comes from the kernel */ }; struct sockbuf *sb = &so->so_rcv; struct nlpcb *nlp = sotonlpcb(so); struct nl_buf *first, *last, *nb, *next; struct nlmsghdr *hdr; int flags, error; u_int len, overflow, partoff, partlen, msgrcv, datalen; bool nonblock, trunc, peek; MPASS(mp == NULL && uio != NULL); NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); if (psa != NULL) *psa = sodupsockaddr((const struct sockaddr *)&nl_empty_src, M_WAITOK); if (controlp != NULL && (nlp->nl_flags & NLF_MSG_INFO)) *controlp = nl_createcontrol(nlp); flags = flagsp != NULL ? 
*flagsp & ~MSG_TRUNC : 0; trunc = flagsp != NULL ? *flagsp & MSG_TRUNC : false; nonblock = (so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT | MSG_NBIO)); peek = flags & MSG_PEEK; error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags)); if (__predict_false(error)) return (error); len = 0; overflow = 0; msgrcv = 0; datalen = 0; SOCK_RECVBUF_LOCK(so); while ((first = TAILQ_FIRST(&sb->nl_queue)) == NULL) { if (nonblock) { SOCK_RECVBUF_UNLOCK(so); SOCK_IO_RECV_UNLOCK(so); return (EWOULDBLOCK); } error = sbwait(so, SO_RCV); if (error) { SOCK_RECVBUF_UNLOCK(so); SOCK_IO_RECV_UNLOCK(so); return (error); } } /* * Netlink socket buffer consists of a queue of nl_bufs, but for the * userland there should be no boundaries. However, there are Netlink * messages, that shouldn't be split. Internal invariant is that a * message never spans two nl_bufs. * If a large userland buffer is provided, we would traverse the queue * until either queue end is reached or the buffer is fulfilled. If * an application provides a buffer that isn't able to fit a single * message, we would truncate it and lose its tail. This is the only * condition where we would lose data. If buffer is able to fit at * least one message, we would return it and won't truncate the next. * * We use same code for normal and MSG_PEEK case. At first queue pass * we scan nl_bufs and count lenght. In case we can read entire buffer * at one write everything is trivial. In case we can not, we save * pointer to the last (or partial) nl_buf and in the !peek case we * split the queue into two pieces. We can safely drop the queue lock, * as kernel would only append nl_bufs to the end of the queue, and * we are the exclusive owner of queue beginning due to sleepable lock. * At the second pass we copy data out and in !peek case free nl_bufs. */ TAILQ_FOREACH(nb, &sb->nl_queue, tailq) { u_int offset; MPASS(nb->offset < nb->datalen); offset = nb->offset; while (offset < nb->datalen) { hdr = (struct nlmsghdr *)&nb->data[offset]; MPASS(nb->offset + hdr->nlmsg_len <= nb->datalen); if (uio->uio_resid < len + hdr->nlmsg_len) { overflow = len + hdr->nlmsg_len - uio->uio_resid; partoff = nb->offset; if (offset > partoff) { partlen = offset - partoff; if (!peek) { nb->offset = offset; datalen += partlen; } } else if (len == 0 && uio->uio_resid > 0) { flags |= MSG_TRUNC; partlen = uio->uio_resid; if (peek) goto nospace; datalen += hdr->nlmsg_len; if (nb->offset + hdr->nlmsg_len == nb->datalen) { /* * Avoid leaving empty nb. * Process last nb normally. * Trust uiomove() to care * about negative uio_resid. 
*/ nb = TAILQ_NEXT(nb, tailq); overflow = 0; partlen = 0; } else nb->offset += hdr->nlmsg_len; msgrcv++; } else partlen = 0; goto nospace; } len += hdr->nlmsg_len; offset += hdr->nlmsg_len; MPASS(offset <= nb->buflen); msgrcv++; } MPASS(offset == nb->datalen); datalen += nb->datalen - nb->offset; } nospace: last = nb; if (!peek) { if (last == NULL) TAILQ_INIT(&sb->nl_queue); else { /* XXXGL: create TAILQ_SPLIT */ TAILQ_FIRST(&sb->nl_queue) = last; last->tailq.tqe_prev = &TAILQ_FIRST(&sb->nl_queue); } MPASS(sb->sb_acc >= datalen); sb->sb_acc -= datalen; sb->sb_ccc -= datalen; } SOCK_RECVBUF_UNLOCK(so); for (nb = first; nb != last; nb = next) { next = TAILQ_NEXT(nb, tailq); if (__predict_true(error == 0)) error = uiomove(&nb->data[nb->offset], (int)(nb->datalen - nb->offset), uio); if (!peek) nl_buf_free(nb); } if (last != NULL && partlen > 0 && __predict_true(error == 0)) error = uiomove(&nb->data[partoff], (int)partlen, uio); if (trunc && overflow > 0) { uio->uio_resid -= overflow; MPASS(uio->uio_resid < 0); } else MPASS(uio->uio_resid >= 0); if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv += msgrcv; if (flagsp != NULL) *flagsp |= flags; SOCK_IO_RECV_UNLOCK(so); nl_on_transmit(sotonlpcb(so)); return (error); } static int nl_getoptflag(int sopt_name) { switch (sopt_name) { case NETLINK_CAP_ACK: return (NLF_CAP_ACK); case NETLINK_EXT_ACK: return (NLF_EXT_ACK); case NETLINK_GET_STRICT_CHK: return (NLF_STRICT); case NETLINK_MSG_INFO: return (NLF_MSG_INFO); } return (0); } static int nl_ctloutput(struct socket *so, struct sockopt *sopt) { - struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); struct nlpcb *nlp = sotonlpcb(so); uint32_t flag; int optval, error = 0; NLCTL_TRACKER; NL_LOG(LOG_DEBUG2, "%ssockopt(%p, %d)", (sopt->sopt_dir) ? "set" : "get", so, sopt->sopt_name); switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { case NETLINK_ADD_MEMBERSHIP: case NETLINK_DROP_MEMBERSHIP: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) break; if (optval <= 0 || optval >= NLP_MAX_GROUPS) { error = ERANGE; break; } NL_LOG(LOG_DEBUG2, "ADD/DEL group %d", (uint32_t)optval); - NLCTL_WLOCK(ctl); + NLCTL_WLOCK(); if (sopt->sopt_name == NETLINK_ADD_MEMBERSHIP) nl_add_group_locked(nlp, optval); else nl_del_group_locked(nlp, optval); - NLCTL_WUNLOCK(ctl); + NLCTL_WUNLOCK(); break; case NETLINK_CAP_ACK: case NETLINK_EXT_ACK: case NETLINK_GET_STRICT_CHK: case NETLINK_MSG_INFO: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) break; flag = nl_getoptflag(sopt->sopt_name); if ((flag == NLF_MSG_INFO) && nlp->nl_linux) { error = EINVAL; break; } - NLCTL_WLOCK(ctl); + NLCTL_WLOCK(); if (optval != 0) nlp->nl_flags |= flag; else nlp->nl_flags &= ~flag; - NLCTL_WUNLOCK(ctl); + NLCTL_WUNLOCK(); break; default: error = ENOPROTOOPT; } break; case SOPT_GET: switch (sopt->sopt_name) { case NETLINK_LIST_MEMBERSHIPS: - NLCTL_RLOCK(ctl); + NLCTL_RLOCK(); optval = nl_get_groups_compat(nlp); - NLCTL_RUNLOCK(ctl); + NLCTL_RUNLOCK(); error = sooptcopyout(sopt, &optval, sizeof(optval)); break; case NETLINK_CAP_ACK: case NETLINK_EXT_ACK: case NETLINK_GET_STRICT_CHK: case NETLINK_MSG_INFO: - NLCTL_RLOCK(ctl); + NLCTL_RLOCK(); optval = (nlp->nl_flags & nl_getoptflag(sopt->sopt_name)) != 0; - NLCTL_RUNLOCK(ctl); + NLCTL_RUNLOCK(); error = sooptcopyout(sopt, &optval, sizeof(optval)); break; default: error = ENOPROTOOPT; } break; default: error = ENOPROTOOPT; } return (error); } static int sysctl_handle_nl_maxsockbuf(SYSCTL_HANDLER_ARGS) { int error = 0; 
u_long tmp_maxsockbuf = nl_maxsockbuf; error = sysctl_handle_long(oidp, &tmp_maxsockbuf, arg2, req); if (error || !req->newptr) return (error); if (tmp_maxsockbuf < MSIZE + MCLBYTES) return (EINVAL); nl_maxsockbuf = tmp_maxsockbuf; return (0); } static int nl_setsbopt(struct socket *so, struct sockopt *sopt) { int error, optval; bool result; if (sopt->sopt_name != SO_RCVBUF) return (sbsetopt(so, sopt)); /* Allow to override max buffer size in certain conditions */ error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error != 0) return (error); NL_LOG(LOG_DEBUG2, "socket %p, PID %d, SO_RCVBUF=%d", so, curproc->p_pid, optval); if (optval > sb_max_adj) { if (priv_check(curthread, PRIV_NET_ROUTE) != 0) return (EPERM); } SOCK_RECVBUF_LOCK(so); result = sbreserve_locked_limit(so, SO_RCV, optval, nl_maxsockbuf, curthread); SOCK_RECVBUF_UNLOCK(so); return (result ? 0 : ENOBUFS); } #define NETLINK_PROTOSW \ .pr_flags = PR_ATOMIC | PR_ADDR | PR_SOCKBUF, \ .pr_ctloutput = nl_ctloutput, \ .pr_setsbopt = nl_setsbopt, \ .pr_attach = nl_attach, \ .pr_bind = nl_bind, \ .pr_connect = nl_connect, \ .pr_disconnect = nl_disconnect, \ .pr_sosend = nl_sosend, \ .pr_soreceive = nl_soreceive, \ .pr_sockaddr = nl_sockaddr, \ .pr_close = nl_close static struct protosw netlink_raw_sw = { .pr_type = SOCK_RAW, NETLINK_PROTOSW }; static struct protosw netlink_dgram_sw = { .pr_type = SOCK_DGRAM, NETLINK_PROTOSW }; static struct domain netlinkdomain = { .dom_family = PF_NETLINK, .dom_name = "netlink", .dom_flags = DOMF_UNLOADABLE, .dom_nprotosw = 2, .dom_protosw = { &netlink_raw_sw, &netlink_dgram_sw }, }; DOMAIN_SET(netlink); diff --git a/sys/netlink/netlink_module.c b/sys/netlink/netlink_module.c index cf119a74cfb7..6c3cd90e61ab 100644 --- a/sys/netlink/netlink_module.c +++ b/sys/netlink/netlink_module.c @@ -1,250 +1,221 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2021 Ng Peng Nam Sean * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(netlink, "Netlink support"); #define DEBUG_MOD_NAME nl_mod #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include _DECLARE_DEBUG(LOG_INFO); #define NL_MAX_HANDLERS 20 struct nl_proto_handler _nl_handlers[NL_MAX_HANDLERS]; struct nl_proto_handler *nl_handlers = _nl_handlers; CK_LIST_HEAD(nl_control_head, nl_control); static struct nl_control_head vnets_head = CK_LIST_HEAD_INITIALIZER(); -VNET_DEFINE(struct nl_control *, nl_ctl) = NULL; +VNET_DEFINE(struct nl_control, nl_ctl) = { + .ctl_port_head = CK_LIST_HEAD_INITIALIZER(), + .ctl_pcb_head = CK_LIST_HEAD_INITIALIZER(), +}; struct mtx nl_global_mtx; MTX_SYSINIT(nl_global_mtx, &nl_global_mtx, "global netlink lock", MTX_DEF); #define NL_GLOBAL_LOCK() mtx_lock(&nl_global_mtx) #define NL_GLOBAL_UNLOCK() mtx_unlock(&nl_global_mtx) int netlink_unloading = 0; static void -free_nl_ctl(struct nl_control *ctl) +vnet_nl_init(const void *unused __unused) { - rm_destroy(&ctl->ctl_lock); - free(ctl, M_NETLINK); -} - -struct nl_control * -vnet_nl_ctl_init(void) -{ - struct nl_control *ctl; - - ctl = malloc(sizeof(struct nl_control), M_NETLINK, M_WAITOK | M_ZERO); - rm_init(&ctl->ctl_lock, "netlink lock"); - CK_LIST_INIT(&ctl->ctl_port_head); - CK_LIST_INIT(&ctl->ctl_pcb_head); + rm_init(&V_nl_ctl.ctl_lock, "netlink lock"); NL_GLOBAL_LOCK(); - - struct nl_control *tmp = atomic_load_ptr(&V_nl_ctl); - - if (tmp == NULL) { - atomic_store_ptr(&V_nl_ctl, ctl); - CK_LIST_INSERT_HEAD(&vnets_head, ctl, ctl_next); - NL_LOG(LOG_DEBUG2, "VNET %p init done, inserted %p into global list", - curvnet, ctl); - } else { - NL_LOG(LOG_DEBUG, "per-VNET init clash, dropping this instance"); - free_nl_ctl(ctl); - ctl = tmp; - } - + CK_LIST_INSERT_HEAD(&vnets_head, &V_nl_ctl, ctl_next); + NL_LOG(LOG_DEBUG2, "VNET %p init done, inserted %p into global list", + curvnet, &V_nl_ctl); NL_GLOBAL_UNLOCK(); - - return (ctl); } +VNET_SYSINIT(vnet_nl_init, SI_SUB_INIT_IF, SI_ORDER_FIRST, vnet_nl_init, NULL); static void -vnet_nl_ctl_destroy(const void *unused __unused) +vnet_nl_uninit(const void *unused __unused) { - struct nl_control *ctl; - /* Assume at the time all of the processes / sockets are dead */ - NL_GLOBAL_LOCK(); - ctl = atomic_load_ptr(&V_nl_ctl); - atomic_store_ptr(&V_nl_ctl, NULL); - if (ctl != NULL) { - NL_LOG(LOG_DEBUG2, "Removing %p from global list", ctl); - CK_LIST_REMOVE(ctl, ctl_next); - } + NL_LOG(LOG_DEBUG2, "Removing %p from global list", &V_nl_ctl); + CK_LIST_REMOVE(&V_nl_ctl, ctl_next); NL_GLOBAL_UNLOCK(); - if (ctl != NULL) - free_nl_ctl(ctl); + rm_destroy(&V_nl_ctl.ctl_lock); } -VNET_SYSUNINIT(vnet_nl_ctl_destroy, SI_SUB_PROTO_IF, SI_ORDER_ANY, - vnet_nl_ctl_destroy, NULL); +VNET_SYSUNINIT(vnet_nl_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST, vnet_nl_uninit, + NULL); int nl_verify_proto(int proto) { if (proto < 0 || proto >= NL_MAX_HANDLERS) { return (EINVAL); } int handler_defined = nl_handlers[proto].cb != NULL; return (handler_defined ? 
0 : EPROTONOSUPPORT); } const char * nl_get_proto_name(int proto) { return (nl_handlers[proto].proto_name); } bool netlink_register_proto(int proto, const char *proto_name, nl_handler_f handler) { if ((proto < 0) || (proto >= NL_MAX_HANDLERS)) return (false); NL_GLOBAL_LOCK(); KASSERT((nl_handlers[proto].cb == NULL), ("netlink handler %d is already set", proto)); nl_handlers[proto].cb = handler; nl_handlers[proto].proto_name = proto_name; NL_GLOBAL_UNLOCK(); NL_LOG(LOG_DEBUG2, "Registered netlink %s(%d) handler", proto_name, proto); return (true); } bool netlink_unregister_proto(int proto) { if ((proto < 0) || (proto >= NL_MAX_HANDLERS)) return (false); NL_GLOBAL_LOCK(); KASSERT((nl_handlers[proto].cb != NULL), ("netlink handler %d is not set", proto)); nl_handlers[proto].cb = NULL; nl_handlers[proto].proto_name = NULL; NL_GLOBAL_UNLOCK(); NL_LOG(LOG_DEBUG2, "Unregistered netlink proto %d handler", proto); return (true); } #if !defined(NETLINK) && defined(NETLINK_MODULE) /* Non-stub function provider */ const static struct nl_function_wrapper nl_module = { .nlmsg_add = _nlmsg_add, .nlmsg_refill_buffer = _nlmsg_refill_buffer, .nlmsg_flush = _nlmsg_flush, .nlmsg_end = _nlmsg_end, .nlmsg_abort = _nlmsg_abort, .nl_writer_unicast = _nl_writer_unicast, .nl_writer_group = _nl_writer_group, .nlmsg_end_dump = _nlmsg_end_dump, .nl_modify_ifp_generic = _nl_modify_ifp_generic, .nl_store_ifp_cookie = _nl_store_ifp_cookie, .nl_get_thread_nlp = _nl_get_thread_nlp, }; #endif static bool can_unload(void) { struct nl_control *ctl; bool result = true; NL_GLOBAL_LOCK(); CK_LIST_FOREACH(ctl, &vnets_head, ctl_next) { NL_LOG(LOG_DEBUG2, "Iterating VNET head %p", ctl); if (!CK_LIST_EMPTY(&ctl->ctl_pcb_head)) { NL_LOG(LOG_NOTICE, "non-empty socket list in ctl %p", ctl); result = false; break; } } NL_GLOBAL_UNLOCK(); return (result); } static int netlink_modevent(module_t mod __unused, int what, void *priv __unused) { int ret = 0; switch (what) { case MOD_LOAD: NL_LOG(LOG_DEBUG2, "Loading"); nl_osd_register(); #if !defined(NETLINK) && defined(NETLINK_MODULE) nl_set_functions(&nl_module); #endif break; case MOD_UNLOAD: NL_LOG(LOG_DEBUG2, "Unload called"); if (can_unload()) { NL_LOG(LOG_WARNING, "unloading"); netlink_unloading = 1; #if !defined(NETLINK) && defined(NETLINK_MODULE) nl_set_functions(NULL); #endif nl_osd_unregister(); } else ret = EBUSY; break; default: ret = EOPNOTSUPP; break; } return (ret); } static moduledata_t netlink_mod = { "netlink", netlink_modevent, NULL }; DECLARE_MODULE(netlink, netlink_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(netlink, 1); diff --git a/sys/netlink/netlink_var.h b/sys/netlink/netlink_var.h index c341abf7ca55..11b69eb604fe 100644 --- a/sys/netlink/netlink_var.h +++ b/sys/netlink/netlink_var.h @@ -1,189 +1,185 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2021 Ng Peng Nam Sean * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _NETLINK_NETLINK_VAR_H_ #define _NETLINK_NETLINK_VAR_H_ #ifdef _KERNEL #include #include #include #include #include #define NLSNDQ 65536 /* Default socket sendspace */ #define NLRCVQ 65536 /* Default socket recvspace */ #define NLMBUFSIZE 2048 /* External storage size for Netlink mbufs */ struct ucred; struct nl_buf { TAILQ_ENTRY(nl_buf) tailq; u_int buflen; u_int datalen; u_int offset; char data[]; }; #define NLP_MAX_GROUPS 128 BITSET_DEFINE(nl_groups, NLP_MAX_GROUPS); struct nlpcb { struct socket *nl_socket; struct nl_groups nl_groups; uint32_t nl_port; uint32_t nl_flags; uint32_t nl_process_id; int nl_proto; bool nl_bound; bool nl_task_pending; bool nl_tx_blocked; /* No new requests accepted */ bool nl_linux; /* true if running under compat */ bool nl_unconstrained_vnet; /* true if running under VNET jail (or without jail) */ bool nl_need_thread_setup; struct taskqueue *nl_taskqueue; struct task nl_task; struct ucred *nl_cred; /* Copy of nl_socket->so_cred */ uint64_t nl_dropped_bytes; uint64_t nl_dropped_messages; CK_LIST_ENTRY(nlpcb) nl_next; CK_LIST_ENTRY(nlpcb) nl_port_next; volatile u_int nl_refcount; struct mtx nl_lock; struct epoch_context nl_epoch_ctx; }; #define sotonlpcb(so) ((struct nlpcb *)(so)->so_pcb) #define NLP_LOCK_INIT(_nlp) mtx_init(&((_nlp)->nl_lock), "nlp mtx", NULL, MTX_DEF) #define NLP_LOCK_DESTROY(_nlp) mtx_destroy(&((_nlp)->nl_lock)) #define NLP_LOCK(_nlp) mtx_lock(&((_nlp)->nl_lock)) #define NLP_UNLOCK(_nlp) mtx_unlock(&((_nlp)->nl_lock)) #define ALIGNED_NL_SZ(_data) roundup2((((struct nlmsghdr *)(_data))->nlmsg_len), 16) /* nl_flags */ #define NLF_CAP_ACK 0x01 /* Do not send message body with errmsg */ #define NLF_EXT_ACK 0x02 /* Allow including extended TLVs in ack */ #define NLF_STRICT 0x04 /* Perform strict header checks */ #define NLF_MSG_INFO 0x08 /* Send caller info along with the notifications */ SYSCTL_DECL(_net_netlink); SYSCTL_DECL(_net_netlink_debug); struct nl_control { CK_LIST_HEAD(nl_pid_head, nlpcb) ctl_port_head; CK_LIST_HEAD(nlpcb_head, nlpcb) ctl_pcb_head; CK_LIST_ENTRY(nl_control) ctl_next; struct rmlock ctl_lock; }; -VNET_DECLARE(struct nl_control *, nl_ctl); +VNET_DECLARE(struct nl_control, nl_ctl); #define V_nl_ctl VNET(nl_ctl) - struct sockaddr_nl; struct sockaddr; struct nlmsghdr; -/* netlink_module.c */ -struct nl_control *vnet_nl_ctl_init(void); - int nl_verify_proto(int proto); const char *nl_get_proto_name(int proto); extern int netlink_unloading; struct nl_proto_handler { nl_handler_f cb; const char *proto_name; }; extern struct nl_proto_handler *nl_handlers; /* netlink_domain.c */ bool nl_send_group(struct nl_writer *); void nl_osd_register(void); void nl_osd_unregister(void); void nl_set_thread_nlp(struct thread *td, struct nlpcb *nlp); /* netlink_io.c */ bool 
nl_send(struct nl_writer *, struct nlpcb *); void nlmsg_ack(struct nlpcb *nlp, int error, struct nlmsghdr *nlmsg, struct nl_pstate *npt); void nl_on_transmit(struct nlpcb *nlp); void nl_taskqueue_handler(void *_arg, int pending); void nl_schedule_taskqueue(struct nlpcb *nlp); void nl_process_receive_locked(struct nlpcb *nlp); void nl_set_source_metadata(struct mbuf *m, int num_messages); struct nl_buf *nl_buf_alloc(size_t len, int mflag); void nl_buf_free(struct nl_buf *nb); /* netlink_generic.c */ struct genl_family *genl_get_family(uint16_t family_id); struct genl_group *genl_get_group(uint32_t group_id); #define MAX_FAMILIES 20 #define MAX_GROUPS 64 #define MIN_GROUP_NUM 48 #define CTRL_FAMILY_ID 0 #define CTRL_FAMILY_NAME "nlctrl" #define CTRL_GROUP_ID 0 #define CTRL_GROUP_NAME "notify" struct ifnet; struct nl_parsed_link; struct nlattr_bmask; struct nl_pstate; /* Function map */ struct nl_function_wrapper { bool (*nlmsg_add)(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type, uint16_t flags, uint32_t len); bool (*nlmsg_refill_buffer)(struct nl_writer *nw, size_t required_len); bool (*nlmsg_flush)(struct nl_writer *nw); bool (*nlmsg_end)(struct nl_writer *nw); void (*nlmsg_abort)(struct nl_writer *nw); void (*nlmsg_ignore_limit)(struct nl_writer *nw); bool (*nl_writer_unicast)(struct nl_writer *nw, size_t size, struct nlpcb *nlp, bool waitok); bool (*nl_writer_group)(struct nl_writer *nw, size_t size, uint16_t protocol, uint16_t group_id, int priv, bool waitok); bool (*nlmsg_end_dump)(struct nl_writer *nw, int error, struct nlmsghdr *hdr); int (*nl_modify_ifp_generic)(struct ifnet *ifp, struct nl_parsed_link *lattrs, const struct nlattr_bmask *bm, struct nl_pstate *npt); void (*nl_store_ifp_cookie)(struct nl_pstate *npt, struct ifnet *ifp); struct nlpcb * (*nl_get_thread_nlp)(struct thread *td); }; void nl_set_functions(const struct nl_function_wrapper *nl); #endif #endif diff --git a/sys/netlink/route/iface.c b/sys/netlink/route/iface.c index d856498b975f..93465e55e6aa 100644 --- a/sys/netlink/route/iface.c +++ b/sys/netlink/route/iface.c @@ -1,1536 +1,1530 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* scope deembedding */ #include #include #include #include #define DEBUG_MOD_NAME nl_iface #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include _DECLARE_DEBUG(LOG_INFO); struct netlink_walkargs { struct nl_writer *nw; struct nlmsghdr hdr; struct nlpcb *so; struct ucred *cred; uint32_t fibnum; int family; int error; int count; int dumped; }; static eventhandler_tag ifdetach_event, ifattach_event, iflink_event, ifaddr_event; static SLIST_HEAD(, nl_cloner) nl_cloners = SLIST_HEAD_INITIALIZER(nl_cloners); static struct sx rtnl_cloner_lock; SX_SYSINIT(rtnl_cloner_lock, &rtnl_cloner_lock, "rtnl cloner lock"); /* These are external hooks for CARP. */ extern int (*carp_get_vhid_p)(struct ifaddr *); /* * RTM_GETLINK request * sendto(3, {{len=32, type=RTM_GETLINK, flags=NLM_F_REQUEST|NLM_F_DUMP, seq=1641940952, pid=0}, * {ifi_family=AF_INET, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}}, 32, 0, NULL, 0) = 32 * * Reply: * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_ETHER, ifi_index=if_nametoindex("enp0s31f6"), ifi_flags=IFF_UP|IFF_BROADCAST|IFF_RUNNING|IFF_MULTICAST|IFF_LOWER_UP, ifi_change=0}, {{nla_len=10, nla_type=IFLA_ADDRESS}, "\xfe\x54\x00\x52\x3e\x90"} [ {{nla_len=14, nla_type=IFLA_IFNAME}, "enp0s31f6"}, {{nla_len=8, nla_type=IFLA_TXQLEN}, 1000}, {{nla_len=5, nla_type=IFLA_OPERSTATE}, 6}, {{nla_len=5, nla_type=IFLA_LINKMODE}, 0}, {{nla_len=8, nla_type=IFLA_MTU}, 1500}, {{nla_len=8, nla_type=IFLA_MIN_MTU}, 68}, {{nla_len=8, nla_type=IFLA_MAX_MTU}, 9000}, {{nla_len=8, nla_type=IFLA_GROUP}, 0}, {{nla_len=8, nla_type=IFLA_PROMISCUITY}, 0}, {{nla_len=8, nla_type=IFLA_NUM_TX_QUEUES}, 1}, {{nla_len=8, nla_type=IFLA_GSO_MAX_SEGS}, 65535}, {{nla_len=8, nla_type=IFLA_GSO_MAX_SIZE}, 65536}, {{nla_len=8, nla_type=IFLA_NUM_RX_QUEUES}, 1}, {{nla_len=5, nla_type=IFLA_CARRIER}, 1}, {{nla_len=13, nla_type=IFLA_QDISC}, "fq_codel"}, {{nla_len=8, nla_type=IFLA_CARRIER_CHANGES}, 2}, {{nla_len=5, nla_type=IFLA_PROTO_DOWN}, 0}, {{nla_len=8, nla_type=IFLA_CARRIER_UP_COUNT}, 1}, {{nla_len=8, nla_type=IFLA_CARRIER_DOWN_COUNT}, 1}, */ struct if_state { uint8_t ifla_operstate; uint8_t ifla_carrier; }; static void get_operstate_ether(if_t ifp, struct if_state *pstate) { struct ifmediareq ifmr = {}; int error; error = if_ioctl(ifp, SIOCGIFMEDIA, (void *)&ifmr); if (error != 0) { NL_LOG(LOG_DEBUG, "error calling SIOCGIFMEDIA on %s: %d", if_name(ifp), error); return; } switch (IFM_TYPE(ifmr.ifm_active)) { case IFM_ETHER: if (ifmr.ifm_status & IFM_ACTIVE) { pstate->ifla_carrier = 1; if (if_getflags(ifp) & IFF_MONITOR) pstate->ifla_operstate = IF_OPER_DORMANT; else pstate->ifla_operstate = IF_OPER_UP; } else pstate->ifla_operstate = IF_OPER_DOWN; } } static bool get_stats(struct nl_writer *nw, if_t ifp) { struct rtnl_link_stats64 *stats; int nla_len = sizeof(struct nlattr) + sizeof(*stats); struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr); if (nla == NULL) return (false); nla->nla_type = IFLA_STATS64; nla->nla_len = nla_len; stats = (struct rtnl_link_stats64 *)(nla + 1); stats->rx_packets = if_getcounter(ifp, IFCOUNTER_IPACKETS); stats->tx_packets = if_getcounter(ifp, IFCOUNTER_OPACKETS); stats->rx_bytes = if_getcounter(ifp, IFCOUNTER_IBYTES); stats->tx_bytes = if_getcounter(ifp, IFCOUNTER_OBYTES); stats->rx_errors = if_getcounter(ifp, IFCOUNTER_IERRORS); stats->tx_errors = 
if_getcounter(ifp, IFCOUNTER_OERRORS); stats->rx_dropped = if_getcounter(ifp, IFCOUNTER_IQDROPS); stats->tx_dropped = if_getcounter(ifp, IFCOUNTER_OQDROPS); stats->multicast = if_getcounter(ifp, IFCOUNTER_IMCASTS); stats->rx_nohandler = if_getcounter(ifp, IFCOUNTER_NOPROTO); return (true); } static void get_operstate(if_t ifp, struct if_state *pstate) { pstate->ifla_operstate = IF_OPER_UNKNOWN; pstate->ifla_carrier = 0; /* no carrier */ switch (if_gettype(ifp)) { case IFT_ETHER: case IFT_L2VLAN: get_operstate_ether(ifp, pstate); break; default: /* Map admin state to the operstate */ if (if_getflags(ifp) & IFF_UP) { pstate->ifla_operstate = IF_OPER_UP; pstate->ifla_carrier = 1; } else pstate->ifla_operstate = IF_OPER_DOWN; break; } } static void get_hwaddr(struct nl_writer *nw, if_t ifp) { struct ifreq ifr = {}; if (if_gethwaddr(ifp, &ifr) == 0) { nlattr_add(nw, IFLAF_ORIG_HWADDR, if_getaddrlen(ifp), ifr.ifr_addr.sa_data); } } static unsigned ifp_flags_to_netlink(const if_t ifp) { return (if_getflags(ifp) | if_getdrvflags(ifp)); } #define LLADDR_CONST(s) ((const void *)((s)->sdl_data + (s)->sdl_nlen)) static bool dump_sa(struct nl_writer *nw, int attr, const struct sockaddr *sa) { uint32_t addr_len = 0; const void *addr_data = NULL; #ifdef INET6 struct in6_addr addr6; #endif if (sa == NULL) return (true); switch (sa->sa_family) { #ifdef INET case AF_INET: addr_len = sizeof(struct in_addr); addr_data = &((const struct sockaddr_in *)sa)->sin_addr; break; #endif #ifdef INET6 case AF_INET6: in6_splitscope(&((const struct sockaddr_in6 *)sa)->sin6_addr, &addr6, &addr_len); addr_len = sizeof(struct in6_addr); addr_data = &addr6; break; #endif case AF_LINK: addr_len = ((const struct sockaddr_dl *)sa)->sdl_alen; addr_data = LLADDR_CONST((const struct sockaddr_dl *)sa); break; case AF_UNSPEC: /* Ignore empty SAs without warning */ return (true); default: NL_LOG(LOG_DEBUG2, "unsupported family: %d, skipping", sa->sa_family); return (true); } return (nlattr_add(nw, attr, addr_len, addr_data)); } static bool dump_iface_caps(struct nl_writer *nw, struct ifnet *ifp) { int off = nlattr_add_nested(nw, IFLAF_CAPS); uint32_t active_caps[roundup2(IFCAP_B_SIZE, 32) / 32] = {}; uint32_t all_caps[roundup2(IFCAP_B_SIZE, 32) / 32] = {}; MPASS(sizeof(active_caps) >= 8); MPASS(sizeof(all_caps) >= 8); if (off == 0) return (false); active_caps[0] = (uint32_t)if_getcapabilities(ifp); all_caps[0] = (uint32_t)if_getcapenable(ifp); active_caps[1] = (uint32_t)if_getcapabilities2(ifp); all_caps[1] = (uint32_t)if_getcapenable2(ifp); nlattr_add_u32(nw, NLA_BITSET_SIZE, IFCAP_B_SIZE); nlattr_add(nw, NLA_BITSET_MASK, sizeof(all_caps), all_caps); nlattr_add(nw, NLA_BITSET_VALUE, sizeof(active_caps), active_caps); nlattr_set_len(nw, off); return (true); } /* * Dumps interface state, properties and metrics. * @nw: message writer * @ifp: target interface * @hdr: template header * @if_flags_mask: changed if_[drv]_flags bitmask * * This function is called without epoch and MAY sleep. 
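 * It enters the net epoch internally only around the ifaddr iteration used to emit IFLA_ADDRESS.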
*/ static bool dump_iface(struct nl_writer *nw, if_t ifp, const struct nlmsghdr *hdr, int if_flags_mask) { struct epoch_tracker et; struct ifinfomsg *ifinfo; NL_LOG(LOG_DEBUG3, "dumping interface %s data", if_name(ifp)); if (!nlmsg_reply(nw, hdr, sizeof(struct ifinfomsg))) goto enomem; ifinfo = nlmsg_reserve_object(nw, struct ifinfomsg); ifinfo->ifi_family = AF_UNSPEC; ifinfo->__ifi_pad = 0; ifinfo->ifi_type = if_gettype(ifp); ifinfo->ifi_index = if_getindex(ifp); ifinfo->ifi_flags = ifp_flags_to_netlink(ifp); ifinfo->ifi_change = if_flags_mask; struct if_state ifs = {}; get_operstate(ifp, &ifs); if (ifs.ifla_operstate == IF_OPER_UP) ifinfo->ifi_flags |= IFF_LOWER_UP; nlattr_add_string(nw, IFLA_IFNAME, if_name(ifp)); nlattr_add_u8(nw, IFLA_OPERSTATE, ifs.ifla_operstate); nlattr_add_u8(nw, IFLA_CARRIER, ifs.ifla_carrier); /* nlattr_add_u8(nw, IFLA_PROTO_DOWN, val); nlattr_add_u8(nw, IFLA_LINKMODE, val); */ if (if_getaddrlen(ifp) != 0) { struct ifaddr *ifa; struct ifa_iter it; NET_EPOCH_ENTER(et); ifa = ifa_iter_start(ifp, &it); if (ifa != NULL) dump_sa(nw, IFLA_ADDRESS, ifa->ifa_addr); ifa_iter_finish(&it); NET_EPOCH_EXIT(et); } if ((if_getbroadcastaddr(ifp) != NULL)) { nlattr_add(nw, IFLA_BROADCAST, if_getaddrlen(ifp), if_getbroadcastaddr(ifp)); } nlattr_add_u32(nw, IFLA_MTU, if_getmtu(ifp)); /* nlattr_add_u32(nw, IFLA_MIN_MTU, 60); nlattr_add_u32(nw, IFLA_MAX_MTU, 9000); nlattr_add_u32(nw, IFLA_GROUP, 0); */ if (if_getdescr(ifp) != NULL) nlattr_add_string(nw, IFLA_IFALIAS, if_getdescr(ifp)); /* Store FreeBSD-specific attributes */ int off = nlattr_add_nested(nw, IFLA_FREEBSD); if (off != 0) { get_hwaddr(nw, ifp); dump_iface_caps(nw, ifp); nlattr_set_len(nw, off); } get_stats(nw, ifp); uint32_t val = (if_getflags(ifp) & IFF_PROMISC) != 0; nlattr_add_u32(nw, IFLA_PROMISCUITY, val); ifc_dump_ifp_nl(ifp, nw); if (nlmsg_end(nw)) return (true); enomem: NL_LOG(LOG_DEBUG, "unable to dump interface %s state (ENOMEM)", if_name(ifp)); nlmsg_abort(nw); return (false); } static bool check_ifmsg(void *hdr, struct nl_pstate *npt) { struct ifinfomsg *ifm = hdr; if (ifm->__ifi_pad != 0 || ifm->ifi_type != 0 || ifm->ifi_flags != 0 || ifm->ifi_change != 0) { nlmsg_report_err_msg(npt, "strict checking: non-zero values in ifinfomsg header"); return (false); } return (true); } #define _IN(_field) offsetof(struct ifinfomsg, _field) #define _OUT(_field) offsetof(struct nl_parsed_link, _field) static const struct nlfield_parser nlf_p_if[] = { { .off_in = _IN(ifi_type), .off_out = _OUT(ifi_type), .cb = nlf_get_u16 }, { .off_in = _IN(ifi_index), .off_out = _OUT(ifi_index), .cb = nlf_get_u32 }, { .off_in = _IN(ifi_flags), .off_out = _OUT(ifi_flags), .cb = nlf_get_u32 }, { .off_in = _IN(ifi_change), .off_out = _OUT(ifi_change), .cb = nlf_get_u32 }, }; static const struct nlattr_parser nla_p_linfo[] = { { .type = IFLA_INFO_KIND, .off = _OUT(ifla_cloner), .cb = nlattr_get_stringn }, { .type = IFLA_INFO_DATA, .off = _OUT(ifla_idata), .cb = nlattr_get_nla }, }; NL_DECLARE_ATTR_PARSER(linfo_parser, nla_p_linfo); static const struct nlattr_parser nla_p_if[] = { { .type = IFLA_IFNAME, .off = _OUT(ifla_ifname), .cb = nlattr_get_string }, { .type = IFLA_MTU, .off = _OUT(ifla_mtu), .cb = nlattr_get_uint32 }, { .type = IFLA_LINK, .off = _OUT(ifla_link), .cb = nlattr_get_uint32 }, { .type = IFLA_LINKINFO, .arg = &linfo_parser, .cb = nlattr_get_nested }, { .type = IFLA_IFALIAS, .off = _OUT(ifla_ifalias), .cb = nlattr_get_string }, { .type = IFLA_GROUP, .off = _OUT(ifla_group), .cb = nlattr_get_string }, { .type = IFLA_ALT_IFNAME, 
.off = _OUT(ifla_ifname), .cb = nlattr_get_string }, }; #undef _IN #undef _OUT NL_DECLARE_STRICT_PARSER(ifmsg_parser, struct ifinfomsg, check_ifmsg, nlf_p_if, nla_p_if); static bool match_iface(if_t ifp, void *_arg) { struct nl_parsed_link *attrs = (struct nl_parsed_link *)_arg; if (attrs->ifi_index != 0 && attrs->ifi_index != if_getindex(ifp)) return (false); if (attrs->ifi_type != 0 && attrs->ifi_index != if_gettype(ifp)) return (false); if (attrs->ifla_ifname != NULL && strcmp(attrs->ifla_ifname, if_name(ifp))) return (false); /* TODO: add group match */ return (true); } static int dump_cb(if_t ifp, void *_arg) { struct netlink_walkargs *wa = (struct netlink_walkargs *)_arg; if (!dump_iface(wa->nw, ifp, &wa->hdr, 0)) return (ENOMEM); return (0); } /* * {nlmsg_len=52, nlmsg_type=RTM_GETLINK, nlmsg_flags=NLM_F_REQUEST, nlmsg_seq=1662842818, nlmsg_pid=0}, * {ifi_family=AF_PACKET, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}, * [ * [{nla_len=10, nla_type=IFLA_IFNAME}, "vnet9"], * [{nla_len=8, nla_type=IFLA_EXT_MASK}, RTEXT_FILTER_VF] * ] */ static int rtnl_handle_getlink(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct epoch_tracker et; if_t ifp; int error = 0; struct nl_parsed_link attrs = {}; error = nl_parse_nlmsg(hdr, &ifmsg_parser, npt, &attrs); if (error != 0) return (error); struct netlink_walkargs wa = { .so = nlp, .nw = npt->nw, .hdr.nlmsg_pid = hdr->nlmsg_pid, .hdr.nlmsg_seq = hdr->nlmsg_seq, .hdr.nlmsg_flags = hdr->nlmsg_flags, .hdr.nlmsg_type = NL_RTM_NEWLINK, }; /* Fast track for an interface w/ explicit name or index match */ if ((attrs.ifi_index != 0) || (attrs.ifla_ifname != NULL)) { if (attrs.ifi_index != 0) { NLP_LOG(LOG_DEBUG3, nlp, "fast track -> searching index %u", attrs.ifi_index); NET_EPOCH_ENTER(et); ifp = ifnet_byindex_ref(attrs.ifi_index); NET_EPOCH_EXIT(et); } else { NLP_LOG(LOG_DEBUG3, nlp, "fast track -> searching name %s", attrs.ifla_ifname); ifp = ifunit_ref(attrs.ifla_ifname); } if (ifp != NULL) { if (match_iface(ifp, &attrs)) { if (!dump_iface(wa.nw, ifp, &wa.hdr, 0)) error = ENOMEM; } else error = ENODEV; if_rele(ifp); } else error = ENODEV; return (error); } /* Always treat non-direct-match as a multipart message */ wa.hdr.nlmsg_flags |= NLM_F_MULTI; /* * Fetching some link properties require performing ioctl's that may be blocking. * Address it by saving referenced pointers of the matching links, * exiting from epoch and going through the list one-by-one. */ NL_LOG(LOG_DEBUG2, "Start dump"); if_foreach_sleep(match_iface, &attrs, dump_cb, &wa); NL_LOG(LOG_DEBUG2, "End dump, iterated %d dumped %d", wa.count, wa.dumped); if (!nlmsg_end_dump(wa.nw, error, &wa.hdr)) { NL_LOG(LOG_DEBUG, "Unable to finalize the dump"); return (ENOMEM); } return (error); } /* * sendmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[ * {nlmsg_len=60, nlmsg_type=RTM_NEWLINK, nlmsg_flags=NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE, nlmsg_seq=1662715618, nlmsg_pid=0}, * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}, * {nla_len=11, nla_type=IFLA_IFNAME}, "dummy0"], * [ * {nla_len=16, nla_type=IFLA_LINKINFO}, * [ * {nla_len=9, nla_type=IFLA_INFO_KIND}, "dummy"... 
* ] * ] */ static int rtnl_handle_dellink(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct epoch_tracker et; if_t ifp; int error; struct nl_parsed_link attrs = {}; error = nl_parse_nlmsg(hdr, &ifmsg_parser, npt, &attrs); if (error != 0) return (error); NET_EPOCH_ENTER(et); ifp = ifnet_byindex_ref(attrs.ifi_index); NET_EPOCH_EXIT(et); if (ifp == NULL) { NLP_LOG(LOG_DEBUG, nlp, "unable to find interface %u", attrs.ifi_index); return (ENOENT); } NLP_LOG(LOG_DEBUG3, nlp, "mapped ifindex %u to %s", attrs.ifi_index, if_name(ifp)); sx_xlock(&ifnet_detach_sxlock); error = if_clone_destroy(if_name(ifp)); sx_xunlock(&ifnet_detach_sxlock); NLP_LOG(LOG_DEBUG2, nlp, "deleting interface %s returned %d", if_name(ifp), error); if_rele(ifp); return (error); } /* * New link: * type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE, seq=1668185590, pid=0}, * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0} * [ * {{nla_len=8, nla_type=IFLA_MTU}, 123}, * {{nla_len=10, nla_type=IFLA_IFNAME}, "vlan1"}, * {{nla_len=24, nla_type=IFLA_LINKINFO}, * [ * {{nla_len=8, nla_type=IFLA_INFO_KIND}, "vlan"...}, * {{nla_len=12, nla_type=IFLA_INFO_DATA}, "\x06\x00\x01\x00\x7b\x00\x00\x00"}]}]} * * Update link: * type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK, seq=1668185923, pid=0}, * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=if_nametoindex("lo"), ifi_flags=0, ifi_change=0}, * {{nla_len=8, nla_type=IFLA_MTU}, 123}} * * * Check command availability: * type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK, seq=0, pid=0}, * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0} */ static int create_link(struct nlmsghdr *hdr, struct nl_parsed_link *lattrs, struct nlattr_bmask *bm, struct nlpcb *nlp, struct nl_pstate *npt) { if (lattrs->ifla_ifname == NULL || strlen(lattrs->ifla_ifname) == 0) { NLMSG_REPORT_ERR_MSG(npt, "empty IFLA_IFNAME attribute"); return (EINVAL); } if (lattrs->ifla_cloner == NULL || strlen(lattrs->ifla_cloner) == 0) { NLMSG_REPORT_ERR_MSG(npt, "empty IFLA_INFO_KIND attribute"); return (EINVAL); } struct ifc_data_nl ifd = { .flags = IFC_F_CREATE, .lattrs = lattrs, .bm = bm, .npt = npt, }; if (ifc_create_ifp_nl(lattrs->ifla_ifname, &ifd) && ifd.error == 0) nl_store_ifp_cookie(npt, ifd.ifp); return (ifd.error); } static int modify_link(struct nlmsghdr *hdr, struct nl_parsed_link *lattrs, struct nlattr_bmask *bm, struct nlpcb *nlp, struct nl_pstate *npt) { if_t ifp = NULL; struct epoch_tracker et; if (lattrs->ifi_index == 0 && lattrs->ifla_ifname == NULL) { /* * Applications like ip(8) verify RTM_NEWLINK command * existence by calling it with empty arguments. Always * return "innocent" error in that case. */ NLMSG_REPORT_ERR_MSG(npt, "empty ifi_index field"); return (EPERM); } if (lattrs->ifi_index != 0) { NET_EPOCH_ENTER(et); ifp = ifnet_byindex_ref(lattrs->ifi_index); NET_EPOCH_EXIT(et); if (ifp == NULL) { NLMSG_REPORT_ERR_MSG(npt, "unable to find interface #%u", lattrs->ifi_index); return (ENOENT); } } if (ifp == NULL && lattrs->ifla_ifname != NULL) { ifp = ifunit_ref(lattrs->ifla_ifname); if (ifp == NULL) { NLMSG_REPORT_ERR_MSG(npt, "unable to find interface %s", lattrs->ifla_ifname); return (ENOENT); } } MPASS(ifp != NULL); /* * Modification request can address either * 1) cloned interface, in which case we call the cloner-specific * modification routine * or * 2) non-cloned (e.g. 
"physical") interface, in which case we call * generic modification routine */ struct ifc_data_nl ifd = { .lattrs = lattrs, .bm = bm, .npt = npt }; if (!ifc_modify_ifp_nl(ifp, &ifd)) ifd.error = nl_modify_ifp_generic(ifp, lattrs, bm, npt); if_rele(ifp); return (ifd.error); } static int rtnl_handle_newlink(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct nlattr_bmask bm; int error; struct nl_parsed_link attrs = {}; error = nl_parse_nlmsg(hdr, &ifmsg_parser, npt, &attrs); if (error != 0) return (error); nl_get_attrs_bmask_nlmsg(hdr, &ifmsg_parser, &bm); if (hdr->nlmsg_flags & NLM_F_CREATE) return (create_link(hdr, &attrs, &bm, nlp, npt)); else return (modify_link(hdr, &attrs, &bm, nlp, npt)); } static void set_scope6(struct sockaddr *sa, uint32_t ifindex) { #ifdef INET6 if (sa != NULL && sa->sa_family == AF_INET6) { struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_LINKLOCAL(&sa6->sin6_addr)) in6_set_unicast_scopeid(&sa6->sin6_addr, ifindex); } #endif } static bool check_sa_family(const struct sockaddr *sa, int family, const char *attr_name, struct nl_pstate *npt) { if (sa == NULL || sa->sa_family == family) return (true); nlmsg_report_err_msg(npt, "wrong family for %s attribute: %d != %d", attr_name, family, sa->sa_family); return (false); } struct nl_parsed_ifa { uint8_t ifa_family; uint8_t ifa_prefixlen; uint8_t ifa_scope; uint32_t ifa_index; uint32_t ifa_flags; uint32_t ifaf_vhid; uint32_t ifaf_flags; struct sockaddr *ifa_address; struct sockaddr *ifa_local; struct sockaddr *ifa_broadcast; struct ifa_cacheinfo *ifa_cacheinfo; struct sockaddr *f_ifa_addr; struct sockaddr *f_ifa_dst; }; static int nlattr_get_cinfo(struct nlattr *nla, struct nl_pstate *npt, const void *arg __unused, void *target) { if (__predict_false(NLA_DATA_LEN(nla) != sizeof(struct ifa_cacheinfo))) { NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not ifa_cacheinfo", nla->nla_type, NLA_DATA_LEN(nla)); return (EINVAL); } *((struct ifa_cacheinfo **)target) = (struct ifa_cacheinfo *)NL_RTA_DATA(nla); return (0); } #define _IN(_field) offsetof(struct ifaddrmsg, _field) #define _OUT(_field) offsetof(struct nl_parsed_ifa, _field) static const struct nlfield_parser nlf_p_ifa[] = { { .off_in = _IN(ifa_family), .off_out = _OUT(ifa_family), .cb = nlf_get_u8 }, { .off_in = _IN(ifa_prefixlen), .off_out = _OUT(ifa_prefixlen), .cb = nlf_get_u8 }, { .off_in = _IN(ifa_scope), .off_out = _OUT(ifa_scope), .cb = nlf_get_u8 }, { .off_in = _IN(ifa_flags), .off_out = _OUT(ifa_flags), .cb = nlf_get_u8_u32 }, { .off_in = _IN(ifa_index), .off_out = _OUT(ifa_index), .cb = nlf_get_u32 }, }; static const struct nlattr_parser nla_p_ifa_fbsd[] = { { .type = IFAF_VHID, .off = _OUT(ifaf_vhid), .cb = nlattr_get_uint32 }, { .type = IFAF_FLAGS, .off = _OUT(ifaf_flags), .cb = nlattr_get_uint32 }, }; NL_DECLARE_ATTR_PARSER(ifa_fbsd_parser, nla_p_ifa_fbsd); static const struct nlattr_parser nla_p_ifa[] = { { .type = IFA_ADDRESS, .off = _OUT(ifa_address), .cb = nlattr_get_ip }, { .type = IFA_LOCAL, .off = _OUT(ifa_local), .cb = nlattr_get_ip }, { .type = IFA_BROADCAST, .off = _OUT(ifa_broadcast), .cb = nlattr_get_ip }, { .type = IFA_CACHEINFO, .off = _OUT(ifa_cacheinfo), .cb = nlattr_get_cinfo }, { .type = IFA_FLAGS, .off = _OUT(ifa_flags), .cb = nlattr_get_uint32 }, { .type = IFA_FREEBSD, .arg = &ifa_fbsd_parser, .cb = nlattr_get_nested }, }; #undef _IN #undef _OUT static bool post_p_ifa(void *_attrs, struct nl_pstate *npt) { struct nl_parsed_ifa *attrs = (struct nl_parsed_ifa *)_attrs; if 
(!check_sa_family(attrs->ifa_address, attrs->ifa_family, "IFA_ADDRESS", npt)) return (false); if (!check_sa_family(attrs->ifa_local, attrs->ifa_family, "IFA_LOCAL", npt)) return (false); if (!check_sa_family(attrs->ifa_broadcast, attrs->ifa_family, "IFA_BROADADDR", npt)) return (false); set_scope6(attrs->ifa_address, attrs->ifa_index); set_scope6(attrs->ifa_local, attrs->ifa_index); return (true); } NL_DECLARE_PARSER_EXT(ifa_parser, struct ifaddrmsg, NULL, nlf_p_ifa, nla_p_ifa, post_p_ifa); /* {ifa_family=AF_INET, ifa_prefixlen=8, ifa_flags=IFA_F_PERMANENT, ifa_scope=RT_SCOPE_HOST, ifa_index=if_nametoindex("lo")}, [ {{nla_len=8, nla_type=IFA_ADDRESS}, inet_addr("127.0.0.1")}, {{nla_len=8, nla_type=IFA_LOCAL}, inet_addr("127.0.0.1")}, {{nla_len=7, nla_type=IFA_LABEL}, "lo"}, {{nla_len=8, nla_type=IFA_FLAGS}, IFA_F_PERMANENT}, {{nla_len=20, nla_type=IFA_CACHEINFO}, {ifa_prefered=4294967295, ifa_valid=4294967295, cstamp=3619, tstamp=3619}}]}, --- {{len=72, type=RTM_NEWADDR, flags=NLM_F_MULTI, seq=1642191126, pid=566735}, {ifa_family=AF_INET6, ifa_prefixlen=96, ifa_flags=IFA_F_PERMANENT, ifa_scope=RT_SCOPE_UNIVERSE, ifa_index=if_nametoindex("virbr0")}, [ {{nla_len=20, nla_type=IFA_ADDRESS}, inet_pton(AF_INET6, "2a01:4f8:13a:70c:ffff::1")}, {{nla_len=20, nla_type=IFA_CACHEINFO}, {ifa_prefered=4294967295, ifa_valid=4294967295, cstamp=4283, tstamp=4283}}, {{nla_len=8, nla_type=IFA_FLAGS}, IFA_F_PERMANENT}]}, */ static uint8_t ifa_get_scope(const struct ifaddr *ifa) { const struct sockaddr *sa; uint8_t addr_scope = RT_SCOPE_UNIVERSE; sa = ifa->ifa_addr; switch (sa->sa_family) { #ifdef INET case AF_INET: { struct in_addr addr; addr = ((const struct sockaddr_in *)sa)->sin_addr; if (IN_LOOPBACK(addr.s_addr)) addr_scope = RT_SCOPE_HOST; else if (IN_LINKLOCAL(addr.s_addr)) addr_scope = RT_SCOPE_LINK; break; } #endif #ifdef INET6 case AF_INET6: { const struct in6_addr *addr; addr = &((const struct sockaddr_in6 *)sa)->sin6_addr; if (IN6_IS_ADDR_LOOPBACK(addr)) addr_scope = RT_SCOPE_HOST; else if (IN6_IS_ADDR_LINKLOCAL(addr)) addr_scope = RT_SCOPE_LINK; break; } #endif } return (addr_scope); } #ifdef INET6 static uint8_t inet6_get_plen(const struct in6_addr *addr) { return (bitcount32(addr->s6_addr32[0]) + bitcount32(addr->s6_addr32[1]) + bitcount32(addr->s6_addr32[2]) + bitcount32(addr->s6_addr32[3])); } #endif static uint8_t get_sa_plen(const struct sockaddr *sa) { #ifdef INET const struct in_addr *paddr; #endif #ifdef INET6 const struct in6_addr *paddr6; #endif switch (sa->sa_family) { #ifdef INET case AF_INET: paddr = &(((const struct sockaddr_in *)sa)->sin_addr); return bitcount32(paddr->s_addr); #endif #ifdef INET6 case AF_INET6: paddr6 = &(((const struct sockaddr_in6 *)sa)->sin6_addr); return inet6_get_plen(paddr6); #endif } return (0); } #ifdef INET6 static uint32_t in6_flags_to_nl(uint32_t flags) { uint32_t nl_flags = 0; if (flags & IN6_IFF_TEMPORARY) nl_flags |= IFA_F_TEMPORARY; if (flags & IN6_IFF_NODAD) nl_flags |= IFA_F_NODAD; if (flags & IN6_IFF_DEPRECATED) nl_flags |= IFA_F_DEPRECATED; if (flags & IN6_IFF_TENTATIVE) nl_flags |= IFA_F_TENTATIVE; if ((flags & (IN6_IFF_AUTOCONF|IN6_IFF_TEMPORARY)) == 0) flags |= IFA_F_PERMANENT; if (flags & IN6_IFF_DUPLICATED) flags |= IFA_F_DADFAILED; return (nl_flags); } static uint32_t nl_flags_to_in6(uint32_t flags) { uint32_t in6_flags = 0; if (flags & IFA_F_TEMPORARY) in6_flags |= IN6_IFF_TEMPORARY; if (flags & IFA_F_NODAD) in6_flags |= IN6_IFF_NODAD; if (flags & IFA_F_DEPRECATED) in6_flags |= IN6_IFF_DEPRECATED; if (flags & IFA_F_TENTATIVE) in6_flags |= 
IN6_IFF_TENTATIVE; if (flags & IFA_F_DADFAILED) in6_flags |= IN6_IFF_DUPLICATED; return (in6_flags); } static void export_cache_info6(struct nl_writer *nw, const struct in6_ifaddr *ia) { struct ifa_cacheinfo ci = { .cstamp = ia->ia6_createtime * 1000, .tstamp = ia->ia6_updatetime * 1000, .ifa_prefered = ia->ia6_lifetime.ia6t_pltime, .ifa_valid = ia->ia6_lifetime.ia6t_vltime, }; nlattr_add(nw, IFA_CACHEINFO, sizeof(ci), &ci); } #endif static void export_cache_info(struct nl_writer *nw, struct ifaddr *ifa) { switch (ifa->ifa_addr->sa_family) { #ifdef INET6 case AF_INET6: export_cache_info6(nw, (struct in6_ifaddr *)ifa); break; #endif } } /* * {'attrs': [('IFA_ADDRESS', '12.0.0.1'), ('IFA_LOCAL', '12.0.0.1'), ('IFA_LABEL', 'eth10'), ('IFA_FLAGS', 128), ('IFA_CACHEINFO', {'ifa_preferred': 4294967295, 'ifa_valid': 4294967295, 'cstamp': 63745746, 'tstamp': 63745746})], */ static bool dump_iface_addr(struct nl_writer *nw, if_t ifp, struct ifaddr *ifa, const struct nlmsghdr *hdr) { struct ifaddrmsg *ifamsg; struct sockaddr *sa = ifa->ifa_addr; struct sockaddr *sa_dst = ifa->ifa_dstaddr; NL_LOG(LOG_DEBUG3, "dumping ifa %p type %s(%d) for interface %s", ifa, rib_print_family(sa->sa_family), sa->sa_family, if_name(ifp)); if (!nlmsg_reply(nw, hdr, sizeof(struct ifaddrmsg))) goto enomem; ifamsg = nlmsg_reserve_object(nw, struct ifaddrmsg); ifamsg->ifa_family = sa->sa_family; ifamsg->ifa_prefixlen = get_sa_plen(ifa->ifa_netmask); ifamsg->ifa_flags = 0; // ifa_flags is useless ifamsg->ifa_scope = ifa_get_scope(ifa); ifamsg->ifa_index = if_getindex(ifp); if ((if_getflags(ifp) & IFF_POINTOPOINT) && sa_dst != NULL && sa_dst->sa_family != 0) { /* P2P interface may have IPv6 LL with no dst address */ dump_sa(nw, IFA_ADDRESS, sa_dst); dump_sa(nw, IFA_LOCAL, sa); } else { dump_sa(nw, IFA_ADDRESS, sa); #ifdef INET /* * In most cases, IFA_ADDRESS == IFA_LOCAL * Skip IFA_LOCAL for anything except INET */ if (sa->sa_family == AF_INET) dump_sa(nw, IFA_LOCAL, sa); #endif } if (if_getflags(ifp) & IFF_BROADCAST) dump_sa(nw, IFA_BROADCAST, ifa->ifa_broadaddr); nlattr_add_string(nw, IFA_LABEL, if_name(ifp)); uint32_t nl_ifa_flags = 0; #ifdef INET6 if (sa->sa_family == AF_INET6) { struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; nl_ifa_flags = in6_flags_to_nl(ia->ia6_flags); } #endif nlattr_add_u32(nw, IFA_FLAGS, nl_ifa_flags); export_cache_info(nw, ifa); /* Store FreeBSD-specific attributes */ int off = nlattr_add_nested(nw, IFA_FREEBSD); if (off != 0) { if (ifa->ifa_carp != NULL && carp_get_vhid_p != NULL) { uint32_t vhid = (uint32_t)(*carp_get_vhid_p)(ifa); nlattr_add_u32(nw, IFAF_VHID, vhid); } #ifdef INET6 if (sa->sa_family == AF_INET6) { uint32_t ifa_flags = ((struct in6_ifaddr *)ifa)->ia6_flags; nlattr_add_u32(nw, IFAF_FLAGS, ifa_flags); } #endif nlattr_set_len(nw, off); } if (nlmsg_end(nw)) return (true); enomem: NL_LOG(LOG_DEBUG, "Failed to dump ifa type %s(%d) for interface %s", rib_print_family(sa->sa_family), sa->sa_family, if_name(ifp)); nlmsg_abort(nw); return (false); } static int dump_iface_addrs(struct netlink_walkargs *wa, if_t ifp) { struct ifaddr *ifa; struct ifa_iter it; int error = 0; for (ifa = ifa_iter_start(ifp, &it); ifa != NULL; ifa = ifa_iter_next(&it)) { if (wa->family != 0 && wa->family != ifa->ifa_addr->sa_family) continue; if (ifa->ifa_addr->sa_family == AF_LINK) continue; if (prison_if(wa->cred, ifa->ifa_addr) != 0) continue; wa->count++; if (!dump_iface_addr(wa->nw, ifp, ifa, &wa->hdr)) { error = ENOMEM; break; } wa->dumped++; } ifa_iter_finish(&it); return (error); } static int 
rtnl_handle_getaddr(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { if_t ifp; int error = 0; struct nl_parsed_ifa attrs = {}; error = nl_parse_nlmsg(hdr, &ifa_parser, npt, &attrs); if (error != 0) return (error); struct netlink_walkargs wa = { .so = nlp, .nw = npt->nw, .cred = nlp_get_cred(nlp), .family = attrs.ifa_family, .hdr.nlmsg_pid = hdr->nlmsg_pid, .hdr.nlmsg_seq = hdr->nlmsg_seq, .hdr.nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI, .hdr.nlmsg_type = NL_RTM_NEWADDR, }; NL_LOG(LOG_DEBUG2, "Start dump"); if (attrs.ifa_index != 0) { ifp = ifnet_byindex(attrs.ifa_index); if (ifp == NULL) error = ENOENT; else error = dump_iface_addrs(&wa, ifp); } else { struct if_iter it; for (ifp = if_iter_start(&it); ifp != NULL; ifp = if_iter_next(&it)) { error = dump_iface_addrs(&wa, ifp); if (error != 0) break; } if_iter_finish(&it); } NL_LOG(LOG_DEBUG2, "End dump, iterated %d dumped %d", wa.count, wa.dumped); if (!nlmsg_end_dump(wa.nw, error, &wa.hdr)) { NL_LOG(LOG_DEBUG, "Unable to finalize the dump"); return (ENOMEM); } return (error); } #ifdef INET static int handle_newaddr_inet(struct nlmsghdr *hdr, struct nl_parsed_ifa *attrs, if_t ifp, struct nlpcb *nlp, struct nl_pstate *npt) { int plen = attrs->ifa_prefixlen; int if_flags = if_getflags(ifp); struct sockaddr_in *addr, *dst; if (plen > 32) { nlmsg_report_err_msg(npt, "invalid ifa_prefixlen"); return (EINVAL); }; if (if_flags & IFF_POINTOPOINT) { /* * Only P2P IFAs are allowed by the implementation. */ if (attrs->ifa_address == NULL || attrs->ifa_local == NULL) { nlmsg_report_err_msg(npt, "Empty IFA_LOCAL/IFA_ADDRESS"); return (EINVAL); } addr = (struct sockaddr_in *)attrs->ifa_local; dst = (struct sockaddr_in *)attrs->ifa_address; } else { /* * Map the Netlink attributes to FreeBSD ifa layout. * If only IFA_ADDRESS or IFA_LOCAL is set OR * both are set to the same value => ifa is not p2p * and the attribute value contains interface address. * * Otherwise (both IFA_ADDRESS and IFA_LOCAL are set and * different), IFA_LOCAL contains an interface address and * IFA_ADDRESS contains peer address. */ addr = (struct sockaddr_in *)attrs->ifa_local; if (addr == NULL) addr = (struct sockaddr_in *)attrs->ifa_address; if (addr == NULL) { nlmsg_report_err_msg(npt, "Empty IFA_LOCAL/IFA_ADDRESS"); return (EINVAL); } /* Generate broadcast address if not set */ if ((if_flags & IFF_BROADCAST) && attrs->ifa_broadcast == NULL) { uint32_t s_baddr; struct sockaddr_in *sin_brd; if (plen == 31) s_baddr = INADDR_BROADCAST; /* RFC 3021 */ else { uint32_t s_mask; s_mask = htonl(plen ? ~((1 << (32 - plen)) - 1) : 0); s_baddr = addr->sin_addr.s_addr | ~s_mask; } sin_brd = (struct sockaddr_in *)npt_alloc(npt, sizeof(*sin_brd)); if (sin_brd == NULL) return (ENOMEM); sin_brd->sin_family = AF_INET; sin_brd->sin_len = sizeof(*sin_brd); sin_brd->sin_addr.s_addr = s_baddr; attrs->ifa_broadcast = (struct sockaddr *)sin_brd; } dst = (struct sockaddr_in *)attrs->ifa_broadcast; } struct sockaddr_in mask = { .sin_len = sizeof(struct sockaddr_in), .sin_family = AF_INET, .sin_addr.s_addr = htonl(plen ? 
~((1 << (32 - plen)) - 1) : 0), }; struct in_aliasreq req = { .ifra_addr = *addr, .ifra_mask = mask, .ifra_vhid = attrs->ifaf_vhid, }; if (dst != NULL) req.ifra_dstaddr = *dst; return (in_control_ioctl(SIOCAIFADDR, &req, ifp, nlp_get_cred(nlp))); } static int handle_deladdr_inet(struct nlmsghdr *hdr, struct nl_parsed_ifa *attrs, if_t ifp, struct nlpcb *nlp, struct nl_pstate *npt) { struct sockaddr *addr = attrs->ifa_local; if (addr == NULL) addr = attrs->ifa_address; if (addr == NULL) { nlmsg_report_err_msg(npt, "empty IFA_ADDRESS/IFA_LOCAL"); return (EINVAL); } struct ifreq req = { .ifr_addr = *addr }; return (in_control_ioctl(SIOCDIFADDR, &req, ifp, nlp_get_cred(nlp))); } #endif #ifdef INET6 static int handle_newaddr_inet6(struct nlmsghdr *hdr, struct nl_parsed_ifa *attrs, if_t ifp, struct nlpcb *nlp, struct nl_pstate *npt) { struct sockaddr_in6 *addr, *dst; if (attrs->ifa_prefixlen > 128) { nlmsg_report_err_msg(npt, "invalid ifa_prefixlen"); return (EINVAL); } /* * In IPv6 implementation, adding non-P2P address to the P2P interface * is allowed. */ addr = (struct sockaddr_in6 *)(attrs->ifa_local); dst = (struct sockaddr_in6 *)(attrs->ifa_address); if (addr == NULL) { addr = dst; dst = NULL; } else if (dst != NULL) { if (IN6_ARE_ADDR_EQUAL(&addr->sin6_addr, &dst->sin6_addr)) { /* * Sometimes Netlink users fills in both attributes * with the same address. It still means "non-p2p". */ dst = NULL; } } if (addr == NULL) { nlmsg_report_err_msg(npt, "Empty IFA_LOCAL/IFA_ADDRESS"); return (EINVAL); } uint32_t flags = nl_flags_to_in6(attrs->ifa_flags) | attrs->ifaf_flags; uint32_t pltime = 0, vltime = 0; if (attrs->ifa_cacheinfo != 0) { pltime = attrs->ifa_cacheinfo->ifa_prefered; vltime = attrs->ifa_cacheinfo->ifa_valid; } struct sockaddr_in6 mask = { .sin6_len = sizeof(struct sockaddr_in6), .sin6_family = AF_INET6, }; ip6_writemask(&mask.sin6_addr, attrs->ifa_prefixlen); struct in6_aliasreq req = { .ifra_addr = *addr, .ifra_prefixmask = mask, .ifra_flags = flags, .ifra_lifetime = { .ia6t_vltime = vltime, .ia6t_pltime = pltime }, .ifra_vhid = attrs->ifaf_vhid, }; if (dst != NULL) req.ifra_dstaddr = *dst; return (in6_control_ioctl(SIOCAIFADDR_IN6, &req, ifp, nlp_get_cred(nlp))); } static int handle_deladdr_inet6(struct nlmsghdr *hdr, struct nl_parsed_ifa *attrs, if_t ifp, struct nlpcb *nlp, struct nl_pstate *npt) { struct sockaddr_in6 *addr = (struct sockaddr_in6 *)attrs->ifa_local; if (addr == NULL) addr = (struct sockaddr_in6 *)(attrs->ifa_address); if (addr == NULL) { nlmsg_report_err_msg(npt, "Empty IFA_LOCAL/IFA_ADDRESS"); return (EINVAL); } struct in6_ifreq req = { .ifr_addr = *addr }; return (in6_control_ioctl(SIOCDIFADDR_IN6, &req, ifp, nlp_get_cred(nlp))); } #endif static int rtnl_handle_addr(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) { struct epoch_tracker et; int error; struct nl_parsed_ifa attrs = {}; error = nl_parse_nlmsg(hdr, &ifa_parser, npt, &attrs); if (error != 0) return (error); NET_EPOCH_ENTER(et); if_t ifp = ifnet_byindex_ref(attrs.ifa_index); NET_EPOCH_EXIT(et); if (ifp == NULL) { nlmsg_report_err_msg(npt, "Unable to find interface with index %u", attrs.ifa_index); return (ENOENT); } int if_flags = if_getflags(ifp); #if defined(INET) || defined(INET6) bool new = hdr->nlmsg_type == NL_RTM_NEWADDR; #endif /* * TODO: Properly handle NLM_F_CREATE / NLM_F_EXCL. * The current ioctl-based KPI always does an implicit create-or-replace. * It is not possible to specify fine-grained options. 
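 *
 * If the KPI grows such support, one possible (purely illustrative) mapping
 * for RTM_NEWADDR could be sketched as follows; the local names are
 * hypothetical and nothing in this file implements it:
 *
 *	bool excl = (hdr->nlmsg_flags & NLM_F_EXCL) != 0;
 *	bool replace = (hdr->nlmsg_flags & NLM_F_REPLACE) != 0;
 *
 * where an already-configured address would yield EEXIST when excl is set
 * and would only be overwritten when replace is set.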
 */
	switch (attrs.ifa_family) {
#ifdef INET
	case AF_INET:
		if (new)
			error = handle_newaddr_inet(hdr, &attrs, ifp, nlp, npt);
		else
			error = handle_deladdr_inet(hdr, &attrs, ifp, nlp, npt);
		break;
#endif
#ifdef INET6
	case AF_INET6:
		if (new)
			error = handle_newaddr_inet6(hdr, &attrs, ifp, nlp, npt);
		else
			error = handle_deladdr_inet6(hdr, &attrs, ifp, nlp, npt);
		break;
#endif
	default:
		error = EAFNOSUPPORT;
	}

	if (error == 0 && !(if_flags & IFF_UP) && (if_getflags(ifp) & IFF_UP))
		if_up(ifp);

	if_rele(ifp);

	return (error);
}

static void
rtnl_handle_ifaddr(void *arg __unused, struct ifaddr *ifa, int cmd)
{
	struct nlmsghdr hdr = {};
	struct nl_writer nw;
	uint32_t group = 0;

	switch (ifa->ifa_addr->sa_family) {
#ifdef INET
	case AF_INET:
		group = RTNLGRP_IPV4_IFADDR;
		break;
#endif
#ifdef INET6
	case AF_INET6:
		group = RTNLGRP_IPV6_IFADDR;
		break;
#endif
	default:
		NL_LOG(LOG_DEBUG2, "ifa notification for unknown AF: %d",
		    ifa->ifa_addr->sa_family);
		return;
	}

-	if (!nl_has_listeners(NETLINK_ROUTE, group))
-		return;
-
	if (!nl_writer_group(&nw, NLMSG_LARGE, NETLINK_ROUTE, group, 0, false)) {
		NL_LOG(LOG_DEBUG, "error allocating group writer");
		return;
	}

	hdr.nlmsg_type = (cmd == RTM_DELETE) ? NL_RTM_DELADDR : NL_RTM_NEWADDR;

	dump_iface_addr(&nw, ifa->ifa_ifp, ifa, &hdr);
	nlmsg_flush(&nw);
}

static void
rtnl_handle_ifevent(if_t ifp, int nlmsg_type, int if_flags_mask)
{
	struct nlmsghdr hdr = { .nlmsg_type = nlmsg_type };
	struct nl_writer nw;

-	if (!nl_has_listeners(NETLINK_ROUTE, RTNLGRP_LINK))
-		return;
-
	if (!nl_writer_group(&nw, NLMSG_LARGE, NETLINK_ROUTE, RTNLGRP_LINK, 0, false)) {
		NL_LOG(LOG_DEBUG, "error allocating group writer");
		return;
	}

	dump_iface(&nw, ifp, &hdr, if_flags_mask);
	nlmsg_flush(&nw);
}

static void
rtnl_handle_ifattach(void *arg, if_t ifp)
{
	NL_LOG(LOG_DEBUG2, "ifnet %s", if_name(ifp));
	rtnl_handle_ifevent(ifp, NL_RTM_NEWLINK, 0);
}

static void
rtnl_handle_ifdetach(void *arg, if_t ifp)
{
	NL_LOG(LOG_DEBUG2, "ifnet %s", if_name(ifp));
	rtnl_handle_ifevent(ifp, NL_RTM_DELLINK, 0);
}

static void
rtnl_handle_iflink(void *arg, if_t ifp, int link_state __unused)
{
	NL_LOG(LOG_DEBUG2, "ifnet %s", if_name(ifp));
	rtnl_handle_ifevent(ifp, NL_RTM_NEWLINK, 0);
}

void
rtnl_handle_ifnet_event(if_t ifp, int if_flags_mask)
{
	NL_LOG(LOG_DEBUG2, "ifnet %s", if_name(ifp));
	rtnl_handle_ifevent(ifp, NL_RTM_NEWLINK, if_flags_mask);
}

static const struct rtnl_cmd_handler cmd_handlers[] = {
	{
		.cmd = NL_RTM_GETLINK,
		.name = "RTM_GETLINK",
		.cb = &rtnl_handle_getlink,
		.flags = RTNL_F_NOEPOCH | RTNL_F_ALLOW_NONVNET_JAIL,
	},
	{
		.cmd = NL_RTM_DELLINK,
		.name = "RTM_DELLINK",
		.cb = &rtnl_handle_dellink,
		.priv = PRIV_NET_IFDESTROY,
		.flags = RTNL_F_NOEPOCH,
	},
	{
		.cmd = NL_RTM_NEWLINK,
		.name = "RTM_NEWLINK",
		.cb = &rtnl_handle_newlink,
		.priv = PRIV_NET_IFCREATE,
		.flags = RTNL_F_NOEPOCH,
	},
	{
		.cmd = NL_RTM_GETADDR,
		.name = "RTM_GETADDR",
		.cb = &rtnl_handle_getaddr,
		.flags = RTNL_F_ALLOW_NONVNET_JAIL,
	},
	{
		.cmd = NL_RTM_NEWADDR,
		.name = "RTM_NEWADDR",
		.cb = &rtnl_handle_addr,
		.priv = PRIV_NET_ADDIFADDR,
		.flags = RTNL_F_NOEPOCH,
	},
	{
		.cmd = NL_RTM_DELADDR,
		.name = "RTM_DELADDR",
		.cb = &rtnl_handle_addr,
		.priv = PRIV_NET_DELIFADDR,
		.flags = RTNL_F_NOEPOCH,
	},
};

static const struct nlhdr_parser *all_parsers[] = {
	&ifmsg_parser, &ifa_parser, &ifa_fbsd_parser,
};

void
rtnl_iface_add_cloner(struct nl_cloner *cloner)
{
	sx_xlock(&rtnl_cloner_lock);
	SLIST_INSERT_HEAD(&nl_cloners, cloner, next);
	sx_xunlock(&rtnl_cloner_lock);
}

void
rtnl_iface_del_cloner(struct nl_cloner *cloner)
{
	sx_xlock(&rtnl_cloner_lock);
	SLIST_REMOVE(&nl_cloners, cloner, nl_cloner, next);
	sx_xunlock(&rtnl_cloner_lock);
}

void
rtnl_ifaces_init(void)
{
	ifattach_event = EVENTHANDLER_REGISTER(
	    ifnet_arrival_event, rtnl_handle_ifattach, NULL,
	    EVENTHANDLER_PRI_ANY);
	ifdetach_event = EVENTHANDLER_REGISTER(
	    ifnet_departure_event, rtnl_handle_ifdetach, NULL,
	    EVENTHANDLER_PRI_ANY);
	ifaddr_event = EVENTHANDLER_REGISTER(
	    rt_addrmsg, rtnl_handle_ifaddr, NULL,
	    EVENTHANDLER_PRI_ANY);
	iflink_event = EVENTHANDLER_REGISTER(
	    ifnet_link_event, rtnl_handle_iflink, NULL,
	    EVENTHANDLER_PRI_ANY);

	NL_VERIFY_PARSERS(all_parsers);
	rtnl_register_messages(cmd_handlers, nitems(cmd_handlers));
}

void
rtnl_ifaces_destroy(void)
{
	EVENTHANDLER_DEREGISTER(ifnet_arrival_event, ifattach_event);
	EVENTHANDLER_DEREGISTER(ifnet_departure_event, ifdetach_event);
	EVENTHANDLER_DEREGISTER(rt_addrmsg, ifaddr_event);
	EVENTHANDLER_DEREGISTER(ifnet_link_event, iflink_event);
}
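The event paths above (rtnl_handle_ifevent() and rtnl_handle_ifaddr()) broadcast RTM_NEWLINK/RTM_DELLINK and RTM_NEWADDR/RTM_DELADDR messages to the RTNLGRP_LINK and per-family IFADDR groups. For reference, a minimal userland consumer of those notifications might look like the sketch below. This is only an illustration: the header paths and the NETLINK_ADD_MEMBERSHIP usage are assumed to match the netlink(4) userland interface, and error handling is abbreviated.

/*
 * Illustrative sketch only: subscribe to the link/address broadcast groups
 * written to by the handlers above and print what arrives.
 */
#include <sys/socket.h>
#include <netlink/netlink.h>
#include <netlink/netlink_route.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char buf[8192];
	struct sockaddr_nl snl = { .nl_family = AF_NETLINK };
	int fd, group;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0)
		return (1);
	if (bind(fd, (struct sockaddr *)&snl, sizeof(snl)) != 0)
		return (1);

	/* Join the same broadcast groups the kernel handlers write to. */
	group = RTNLGRP_LINK;
	setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &group, sizeof(group));
	group = RTNLGRP_IPV4_IFADDR;
	setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &group, sizeof(group));

	for (;;) {
		ssize_t len = recv(fd, buf, sizeof(buf), 0);

		if (len <= 0)
			break;
		/* Walk every message packed into this datagram. */
		for (char *p = buf; p + sizeof(struct nlmsghdr) <= buf + len; ) {
			struct nlmsghdr *hdr = (struct nlmsghdr *)p;

			if (hdr->nlmsg_len < sizeof(*hdr) ||
			    hdr->nlmsg_len > (size_t)(buf + len - p))
				break;
			printf("nlmsg type %u, len %u\n",
			    (unsigned)hdr->nlmsg_type, (unsigned)hdr->nlmsg_len);
			/* Messages are padded to 4-byte boundaries. */
			p += (hdr->nlmsg_len + 3u) & ~3u;
		}
	}
	close(fd);
	return (0);
}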
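The parser tables used throughout this file (nlf_p_if/nla_p_if, nlf_p_ifa/nla_p_ifa) follow one pattern: fixed-header fields are copied through struct nlfield_parser entries and attribute TLVs through struct nlattr_parser entries, both expressed as offsetof() pairs, and nl_parse_nlmsg() fills a flat "parsed attributes" structure. A condensed, hypothetical example of that pattern is shown below; the names nl_parsed_demo, check_demo, demo_parser and demo_handler are illustrative only, not part of this change, and the snippet assumes the same kernel parser headers this file already uses.

/*
 * Hypothetical sketch of the parser pattern above: extract ifi_index from
 * the fixed header plus IFLA_IFNAME and IFLA_MTU attributes.
 */
struct nl_parsed_demo {
	uint32_t	ifi_index;
	uint32_t	ifla_mtu;
	char		*ifla_ifname;
};

static bool
check_demo(void *hdr, struct nl_pstate *npt)
{
	/* No extra header validation in this sketch. */
	return (true);
}

#define	_IN(_field)	offsetof(struct ifinfomsg, _field)
#define	_OUT(_field)	offsetof(struct nl_parsed_demo, _field)
static const struct nlfield_parser nlf_p_demo[] = {
	{ .off_in = _IN(ifi_index), .off_out = _OUT(ifi_index), .cb = nlf_get_u32 },
};

/* Attribute entries are listed in ascending nla_type order. */
static const struct nlattr_parser nla_p_demo[] = {
	{ .type = IFLA_IFNAME, .off = _OUT(ifla_ifname), .cb = nlattr_get_string },
	{ .type = IFLA_MTU, .off = _OUT(ifla_mtu), .cb = nlattr_get_uint32 },
};
#undef _IN
#undef _OUT
NL_DECLARE_STRICT_PARSER(demo_parser, struct ifinfomsg, check_demo,
    nlf_p_demo, nla_p_demo);

static int
demo_handler(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt)
{
	struct nl_parsed_demo attrs = {};
	int error;

	error = nl_parse_nlmsg(hdr, &demo_parser, npt, &attrs);
	if (error != 0)
		return (error);
	/* attrs.ifi_index, attrs.ifla_mtu and attrs.ifla_ifname are now set. */
	return (0);
}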