diff --git a/sys/netlink/netlink_domain.c b/sys/netlink/netlink_domain.c index 38f4a2dfed94..ecd110d62c1f 100644 --- a/sys/netlink/netlink_domain.c +++ b/sys/netlink/netlink_domain.c @@ -1,781 +1,833 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2021 Ng Peng Nam Sean * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This file contains socket and protocol bindings for netlink. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* priv_check */ +#include #include #include #include #define DEBUG_MOD_NAME nl_domain #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include _DECLARE_DEBUG(LOG_INFO); _Static_assert((NLP_MAX_GROUPS % 64) == 0, "NLP_MAX_GROUPS has to be multiple of 64"); _Static_assert(NLP_MAX_GROUPS >= 64, "NLP_MAX_GROUPS has to be at least 64"); #define NLCTL_TRACKER struct rm_priotracker nl_tracker #define NLCTL_RLOCK(_ctl) rm_rlock(&((_ctl)->ctl_lock), &nl_tracker) #define NLCTL_RUNLOCK(_ctl) rm_runlock(&((_ctl)->ctl_lock), &nl_tracker) #define NLCTL_WLOCK(_ctl) rm_wlock(&((_ctl)->ctl_lock)) #define NLCTL_WUNLOCK(_ctl) rm_wunlock(&((_ctl)->ctl_lock)) static u_long nl_sendspace = NLSNDQ; SYSCTL_ULONG(_net_netlink, OID_AUTO, sendspace, CTLFLAG_RW, &nl_sendspace, 0, "Default netlink socket send space"); static u_long nl_recvspace = NLSNDQ; SYSCTL_ULONG(_net_netlink, OID_AUTO, recvspace, CTLFLAG_RW, &nl_recvspace, 0, "Default netlink socket receive space"); extern u_long sb_max_adj; static u_long nl_maxsockbuf = 512 * 1024 * 1024; /* 512M, XXX: init based on physmem */ static int sysctl_handle_nl_maxsockbuf(SYSCTL_HANDLER_ARGS); SYSCTL_OID(_net_netlink, OID_AUTO, nl_maxsockbuf, CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, &nl_maxsockbuf, 0, sysctl_handle_nl_maxsockbuf, "LU", "Maximum Netlink socket buffer size"); static unsigned int osd_slot_id = 0; void nl_osd_register(void) { osd_slot_id = osd_register(OSD_THREAD, NULL, NULL); } void nl_osd_unregister(void) { osd_deregister(OSD_THREAD, osd_slot_id); } struct nlpcb * _nl_get_thread_nlp(struct thread *td) { return (osd_get(OSD_THREAD, &td->td_osd, osd_slot_id)); } void nl_set_thread_nlp(struct thread *td, struct nlpcb *nlp) { NLP_LOG(LOG_DEBUG2, nlp, "Set thread %p nlp to %p (slot %u)", td, 
nlp, osd_slot_id); if (osd_set(OSD_THREAD, &td->td_osd, osd_slot_id, nlp) == 0) return; /* Failed, need to realloc */ void **rsv = osd_reserve(osd_slot_id); osd_set_reserved(OSD_THREAD, &td->td_osd, osd_slot_id, rsv, nlp); } /* * Looks up a nlpcb struct based on the @portid. Need to claim nlsock_mtx. * Returns nlpcb pointer if present else NULL */ static struct nlpcb * nl_port_lookup(uint32_t port_id) { struct nlpcb *nlp; CK_LIST_FOREACH(nlp, &V_nl_ctl->ctl_port_head, nl_port_next) { if (nlp->nl_port == port_id) return (nlp); } return (NULL); } static void nl_add_group_locked(struct nlpcb *nlp, unsigned int group_id) { MPASS(group_id <= NLP_MAX_GROUPS); --group_id; /* TODO: add family handler callback */ if (!nlp_unconstrained_vnet(nlp)) return; nlp->nl_groups[group_id / 64] |= (uint64_t)1 << (group_id % 64); } static void nl_del_group_locked(struct nlpcb *nlp, unsigned int group_id) { MPASS(group_id <= NLP_MAX_GROUPS); --group_id; nlp->nl_groups[group_id / 64] &= ~((uint64_t)1 << (group_id % 64)); } static bool nl_isset_group_locked(struct nlpcb *nlp, unsigned int group_id) { MPASS(group_id <= NLP_MAX_GROUPS); --group_id; return (nlp->nl_groups[group_id / 64] & ((uint64_t)1 << (group_id % 64))); } static uint32_t nl_get_groups_compat(struct nlpcb *nlp) { uint32_t groups_mask = 0; for (int i = 0; i < 32; i++) { if (nl_isset_group_locked(nlp, i + 1)) groups_mask |= (1 << i); } return (groups_mask); } static void nl_send_one_group(struct mbuf *m, struct nlpcb *nlp, int num_messages, int io_flags) { if (__predict_false(nlp->nl_flags & NLF_MSG_INFO)) nl_add_msg_info(m); nl_send_one(m, nlp, num_messages, io_flags); } /* * Broadcasts message @m to the protocol @proto group specified by @group_id */ void nl_send_group(struct mbuf *m, int num_messages, int proto, int group_id) { struct nlpcb *nlp_last = NULL; struct nlpcb *nlp; NLCTL_TRACKER; IF_DEBUG_LEVEL(LOG_DEBUG2) { struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *); NL_LOG(LOG_DEBUG2, "MCAST mbuf len %u msg type %d len %u to group %d/%d", m->m_len, hdr->nlmsg_type, hdr->nlmsg_len, proto, group_id); } struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); if (__predict_false(ctl == NULL)) { /* * Can be the case when notification is sent within VNET * which doesn't have any netlink sockets. */ m_freem(m); return; } NLCTL_RLOCK(ctl); int io_flags = NL_IOF_UNTRANSLATED; CK_LIST_FOREACH(nlp, &ctl->ctl_pcb_head, nl_next) { if (nl_isset_group_locked(nlp, group_id) && nlp->nl_proto == proto) { if (nlp_last != NULL) { struct mbuf *m_copy; m_copy = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (m_copy != NULL) nl_send_one_group(m_copy, nlp_last, num_messages, io_flags); else { NLP_LOCK(nlp_last); if (nlp_last->nl_socket != NULL) sorwakeup(nlp_last->nl_socket); NLP_UNLOCK(nlp_last); } } nlp_last = nlp; } } if (nlp_last != NULL) nl_send_one_group(m, nlp_last, num_messages, io_flags); else m_freem(m); NLCTL_RUNLOCK(ctl); } bool nl_has_listeners(int netlink_family, uint32_t groups_mask) { return (V_nl_ctl != NULL); } static uint32_t nl_find_port(void) { /* * app can open multiple netlink sockets. * Start with current pid, if already taken, * try random numbers in 65k..256k+65k space, * avoiding clash with pids. 
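The group-membership helpers above pack NLP_MAX_GROUPS (128) wire-visible, 1-based group IDs into an array of uint64_t words, decrementing the ID before splitting it into a word index and a bit offset. A minimal userland model of the same bit arithmetic, with illustrative names rather than the kernel's, can be compiled standalone to sanity-check the split:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NLP_MAX_GROUPS	128

static uint64_t groups[NLP_MAX_GROUPS / 64];

/* Group IDs are 1-based on the wire; bit positions are 0-based. */
static void
group_set(unsigned int group_id)
{
	--group_id;
	groups[group_id / 64] |= (uint64_t)1 << (group_id % 64);
}

static void
group_clear(unsigned int group_id)
{
	--group_id;
	groups[group_id / 64] &= ~((uint64_t)1 << (group_id % 64));
}

static bool
group_isset(unsigned int group_id)
{
	--group_id;
	return (groups[group_id / 64] & ((uint64_t)1 << (group_id % 64)));
}

int
main(void)
{
	group_set(1);		/* lands in word 0, bit 0 */
	group_set(65);		/* lands in word 1, bit 0 */
	group_clear(1);
	printf("1: %d, 65: %d\n", group_isset(1), group_isset(65));
	return (0);
}

nl_get_groups_compat() then reads back only the first 32 of these bits, since the legacy sockaddr_nl nl_groups field is a 32-bit mask.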
*/ if (nl_port_lookup(curproc->p_pid) == NULL) return (curproc->p_pid); for (int i = 0; i < 16; i++) { uint32_t nl_port = (arc4random() % 65536) + 65536 * 4; if (nl_port_lookup(nl_port) == 0) return (nl_port); NL_LOG(LOG_DEBUG3, "tried %u\n", nl_port); } return (curproc->p_pid); } static int nl_bind_locked(struct nlpcb *nlp, struct sockaddr_nl *snl) { if (nlp->nl_bound) { if (nlp->nl_port != snl->nl_pid) { NL_LOG(LOG_DEBUG, "bind() failed: program pid %d " "is different from provided pid %d", nlp->nl_port, snl->nl_pid); return (EINVAL); // XXX: better error } } else { if (snl->nl_pid == 0) snl->nl_pid = nl_find_port(); if (nl_port_lookup(snl->nl_pid) != NULL) return (EADDRINUSE); nlp->nl_port = snl->nl_pid; nlp->nl_bound = true; CK_LIST_INSERT_HEAD(&V_nl_ctl->ctl_port_head, nlp, nl_port_next); } for (int i = 0; i < 32; i++) { if (snl->nl_groups & ((uint32_t)1 << i)) nl_add_group_locked(nlp, i + 1); else nl_del_group_locked(nlp, i + 1); } return (0); } static int nl_pru_attach(struct socket *so, int proto, struct thread *td) { struct nlpcb *nlp; int error; if (__predict_false(netlink_unloading != 0)) return (EAFNOSUPPORT); error = nl_verify_proto(proto); if (error != 0) return (error); bool is_linux = SV_PROC_ABI(td->td_proc) == SV_ABI_LINUX; NL_LOG(LOG_DEBUG2, "socket %p, %sPID %d: attaching socket to %s", so, is_linux ? "(linux) " : "", curproc->p_pid, nl_get_proto_name(proto)); /* Create per-VNET state on first socket init */ struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); if (ctl == NULL) ctl = vnet_nl_ctl_init(); KASSERT(V_nl_ctl != NULL, ("nl_attach: vnet_sock_init() failed")); MPASS(sotonlpcb(so) == NULL); nlp = malloc(sizeof(struct nlpcb), M_PCB, M_WAITOK | M_ZERO); error = soreserve(so, nl_sendspace, nl_recvspace); if (error != 0) { free(nlp, M_PCB); return (error); } + so->so_rcv.sb_mtx = &so->so_rcv_mtx; + TAILQ_INIT(&so->so_snd.nl_queue); so->so_pcb = nlp; nlp->nl_socket = so; /* Copy so_cred to avoid having socket_var.h in every header */ nlp->nl_cred = so->so_cred; nlp->nl_proto = proto; nlp->nl_process_id = curproc->p_pid; nlp->nl_linux = is_linux; - nlp->nl_active = true; nlp->nl_unconstrained_vnet = !jailed_without_vnet(so->so_cred); nlp->nl_need_thread_setup = true; NLP_LOCK_INIT(nlp); refcount_init(&nlp->nl_refcount, 1); nl_init_io(nlp); nlp->nl_taskqueue = taskqueue_create("netlink_socket", M_WAITOK, taskqueue_thread_enqueue, &nlp->nl_taskqueue); TASK_INIT(&nlp->nl_task, 0, nl_taskqueue_handler, nlp); taskqueue_start_threads(&nlp->nl_taskqueue, 1, PWAIT, "netlink_socket (PID %u)", nlp->nl_process_id); NLCTL_WLOCK(ctl); /* XXX: check ctl is still alive */ CK_LIST_INSERT_HEAD(&ctl->ctl_pcb_head, nlp, nl_next); NLCTL_WUNLOCK(ctl); soisconnected(so); return (0); } static int nl_pru_bind(struct socket *so, struct sockaddr *sa, struct thread *td) { struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); struct nlpcb *nlp = sotonlpcb(so); struct sockaddr_nl *snl = (struct sockaddr_nl *)sa; int error; NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); if (snl->nl_len != sizeof(*snl)) { NL_LOG(LOG_DEBUG, "socket %p, wrong sizeof(), ignoring bind()", so); return (EINVAL); } NLCTL_WLOCK(ctl); NLP_LOCK(nlp); error = nl_bind_locked(nlp, snl); NLP_UNLOCK(nlp); NLCTL_WUNLOCK(ctl); NL_LOG(LOG_DEBUG2, "socket %p, bind() to %u, groups %u, error %d", so, snl->nl_pid, snl->nl_groups, error); return (error); } static int nl_assign_port(struct nlpcb *nlp, uint32_t port_id) { struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); struct sockaddr_nl snl = { .nl_pid = port_id, }; int 
error;

	NLCTL_WLOCK(ctl);
	NLP_LOCK(nlp);
	snl.nl_groups = nl_get_groups_compat(nlp);
	error = nl_bind_locked(nlp, &snl);
	NLP_UNLOCK(nlp);
	NLCTL_WUNLOCK(ctl);

	NL_LOG(LOG_DEBUG3, "socket %p, port assign: %d, error: %d",
	    nlp->nl_socket, port_id, error);
	return (error);
}

/*
 * nl_autobind_port binds an unused portid to @nlp
 * @nlp: pcb data for the netlink socket
 * @candidate_id: first id to consider
 */
static int
nl_autobind_port(struct nlpcb *nlp, uint32_t candidate_id)
{
	struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
	uint32_t port_id = candidate_id;
	NLCTL_TRACKER;
	bool exist;
	int error = EADDRINUSE;

	for (int i = 0; i < 10; i++) {
		NL_LOG(LOG_DEBUG3, "socket %p, trying to assign port %d",
		    nlp->nl_socket, port_id);
		NLCTL_RLOCK(ctl);
		exist = nl_port_lookup(port_id) != NULL;
		NLCTL_RUNLOCK(ctl);
		if (!exist) {
			error = nl_assign_port(nlp, port_id);
			if (error != EADDRINUSE)
				break;
		}
		port_id++;
	}
	NL_LOG(LOG_DEBUG3, "socket %p, autobind to %d, error: %d",
	    nlp->nl_socket, port_id, error);
	return (error);
}

static int
nl_pru_connect(struct socket *so, struct sockaddr *sa, struct thread *td)
{
	struct sockaddr_nl *snl = (struct sockaddr_nl *)sa;
	struct nlpcb *nlp;

	NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
	if (snl->nl_len != sizeof(*snl)) {
		NL_LOG(LOG_DEBUG, "socket %p, wrong sizeof(), ignoring connect()", so);
		return (EINVAL);
	}

	nlp = sotonlpcb(so);
	if (!nlp->nl_bound) {
		int error = nl_autobind_port(nlp, td->td_proc->p_pid);
		if (error != 0) {
			NL_LOG(LOG_DEBUG, "socket %p, nl_autobind() failed: %d",
			    so, error);
			return (error);
		}
	}
	/* XXX: Handle socket flags & multicast */
	soisconnected(so);

	NL_LOG(LOG_DEBUG2, "socket %p, connect to %u", so, snl->nl_pid);
	return (0);
}

static void
destroy_nlpcb(struct nlpcb *nlp)
{
	NLP_LOCK(nlp);
	nl_free_io(nlp);
	NLP_LOCK_DESTROY(nlp);
	free(nlp, M_PCB);
}

static void
destroy_nlpcb_epoch(epoch_context_t ctx)
{
	struct nlpcb *nlp;

	nlp = __containerof(ctx, struct nlpcb, nl_epoch_ctx);

	destroy_nlpcb(nlp);
}

static void
nl_close(struct socket *so)
{
	struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
	MPASS(sotonlpcb(so) != NULL);
	struct nlpcb *nlp;
+	struct nl_buf *nb;

	NL_LOG(LOG_DEBUG2, "detaching socket %p, PID %d", so, curproc->p_pid);
	nlp = sotonlpcb(so);

	/* Mark as inactive so no new work can be enqueued */
	NLP_LOCK(nlp);
	bool was_bound = nlp->nl_bound;
-	nlp->nl_active = false;
	NLP_UNLOCK(nlp);

	/* Wait till all scheduled work has been completed */
	taskqueue_drain_all(nlp->nl_taskqueue);
	taskqueue_free(nlp->nl_taskqueue);

	NLCTL_WLOCK(ctl);
	NLP_LOCK(nlp);
	if (was_bound) {
		CK_LIST_REMOVE(nlp, nl_port_next);
		NL_LOG(LOG_DEBUG3, "socket %p, unlinking bound pid %u", so, nlp->nl_port);
	}
	CK_LIST_REMOVE(nlp, nl_next);
	nlp->nl_socket = NULL;
	NLP_UNLOCK(nlp);
	NLCTL_WUNLOCK(ctl);

	so->so_pcb = NULL;

+	while ((nb = TAILQ_FIRST(&so->so_snd.nl_queue)) != NULL) {
+		TAILQ_REMOVE(&so->so_snd.nl_queue, nb, tailq);
+		free(nb, M_NETLINK);
+	}
+	sbdestroy(so, SO_RCV);
+
	NL_LOG(LOG_DEBUG3, "socket %p, detached", so);

	/* XXX: is delayed free needed?
*/ NET_EPOCH_CALL(destroy_nlpcb_epoch, &nlp->nl_epoch_ctx); } static int nl_pru_disconnect(struct socket *so) { NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); MPASS(sotonlpcb(so) != NULL); return (ENOTCONN); } static int nl_pru_shutdown(struct socket *so) { NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); MPASS(sotonlpcb(so) != NULL); socantsendmore(so); return (0); } static int nl_sockaddr(struct socket *so, struct sockaddr *sa) { *(struct sockaddr_nl *)sa = (struct sockaddr_nl ){ /* TODO: set other fields */ .nl_len = sizeof(struct sockaddr_nl), .nl_family = AF_NETLINK, .nl_pid = sotonlpcb(so)->nl_port, }; return (0); } static int -nl_pru_output(struct mbuf *m, struct socket *so, ...) +nl_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, + struct mbuf *m, struct mbuf *control, int flags, struct thread *td) { + struct nlpcb *nlp = sotonlpcb(so); + struct sockbuf *sb = &so->so_snd; + struct nl_buf *nb; + u_int len; + int error; - if (__predict_false(m == NULL || - ((m->m_len < sizeof(struct nlmsghdr)) && - (m = m_pullup(m, sizeof(struct nlmsghdr))) == NULL))) - return (ENOBUFS); - MPASS((m->m_flags & M_PKTHDR) != 0); - - NL_LOG(LOG_DEBUG3, "sending message to kernel async processing"); - nl_receive_async(m, so); - return (0); -} - + MPASS(m == NULL && uio != NULL); -static int -nl_pru_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *sa, - struct mbuf *control, struct thread *td) -{ NL_LOG(LOG_DEBUG2, "sending message to kernel"); if (__predict_false(control != NULL)) { - if (control->m_len) { - m_freem(control); - return (EINVAL); - } m_freem(control); + return (EINVAL); } - return (nl_pru_output(m, so)); + if (__predict_false(flags & MSG_OOB)) /* XXXGL: or just ignore? */ + return (EOPNOTSUPP); + + if (__predict_false(uio->uio_resid < sizeof(struct nlmsghdr))) + return (ENOBUFS); /* XXXGL: any better error? 
 */
+
+	NL_LOG(LOG_DEBUG3, "sending message to kernel async processing");
+
+	error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
+	if (error)
+		return (error);
+
+	len = roundup2(uio->uio_resid, 8) + SCRATCH_BUFFER_SIZE;
+	if (nlp->nl_linux)
+		len += roundup2(uio->uio_resid, 8);
+	nb = malloc(sizeof(*nb) + len, M_NETLINK, M_WAITOK);
+	nb->datalen = uio->uio_resid;
+	nb->buflen = len;
+	nb->offset = 0;
+	error = uiomove(&nb->data[0], uio->uio_resid, uio);
+	if (__predict_false(error))
+		goto out;
+
+	SOCK_SENDBUF_LOCK(so);
+restart:
+	if (sb->sb_hiwat - sb->sb_ccc >= nb->datalen) {
+		NL_LOG(LOG_DEBUG3, "enqueue %u bytes", nb->datalen);
+		TAILQ_INSERT_TAIL(&sb->nl_queue, nb, tailq);
+		sb->sb_acc += nb->datalen;
+		sb->sb_ccc += nb->datalen;
+		nb = NULL;
+	} else if ((so->so_state & SS_NBIO) ||
+	    (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
+		SOCK_SENDBUF_UNLOCK(so);
+		error = EWOULDBLOCK;
+		goto out;
+	} else {
+		if ((error = sbwait(so, SO_SND)) != 0) {
+			SOCK_SENDBUF_UNLOCK(so);
+			goto out;
+		} else
+			goto restart;
+	}
+	SOCK_SENDBUF_UNLOCK(so);
+
+	if (nb == NULL) {
+		NLP_LOCK(nlp);
+		nl_schedule_taskqueue(nlp);
+		NLP_UNLOCK(nlp);
+	}
+
+out:
+	SOCK_IO_SEND_UNLOCK(so);
+	free(nb, M_NETLINK);
+	return (error);
}

static int
nl_pru_rcvd(struct socket *so, int flags)
{
	NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
	MPASS(sotonlpcb(so) != NULL);

	nl_on_transmit(sotonlpcb(so));

	return (0);
}

static int
nl_getoptflag(int sopt_name)
{
	switch (sopt_name) {
	case NETLINK_CAP_ACK:
		return (NLF_CAP_ACK);
	case NETLINK_EXT_ACK:
		return (NLF_EXT_ACK);
	case NETLINK_GET_STRICT_CHK:
		return (NLF_STRICT);
	case NETLINK_MSG_INFO:
		return (NLF_MSG_INFO);
	}

	return (0);
}

static int
nl_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
	struct nlpcb *nlp = sotonlpcb(so);
	uint32_t flag;
	int optval, error = 0;
	NLCTL_TRACKER;

	NL_LOG(LOG_DEBUG2, "%ssockopt(%p, %d)", (sopt->sopt_dir) ?
"set" : "get", so, sopt->sopt_name); switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { case NETLINK_ADD_MEMBERSHIP: case NETLINK_DROP_MEMBERSHIP: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) break; if (optval <= 0 || optval >= NLP_MAX_GROUPS) { error = ERANGE; break; } NL_LOG(LOG_DEBUG2, "ADD/DEL group %d", (uint32_t)optval); NLCTL_WLOCK(ctl); if (sopt->sopt_name == NETLINK_ADD_MEMBERSHIP) nl_add_group_locked(nlp, optval); else nl_del_group_locked(nlp, optval); NLCTL_WUNLOCK(ctl); break; case NETLINK_CAP_ACK: case NETLINK_EXT_ACK: case NETLINK_GET_STRICT_CHK: case NETLINK_MSG_INFO: error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) break; flag = nl_getoptflag(sopt->sopt_name); if ((flag == NLF_MSG_INFO) && nlp->nl_linux) { error = EINVAL; break; } NLCTL_WLOCK(ctl); if (optval != 0) nlp->nl_flags |= flag; else nlp->nl_flags &= ~flag; NLCTL_WUNLOCK(ctl); break; default: error = ENOPROTOOPT; } break; case SOPT_GET: switch (sopt->sopt_name) { case NETLINK_LIST_MEMBERSHIPS: NLCTL_RLOCK(ctl); optval = nl_get_groups_compat(nlp); NLCTL_RUNLOCK(ctl); error = sooptcopyout(sopt, &optval, sizeof(optval)); break; case NETLINK_CAP_ACK: case NETLINK_EXT_ACK: case NETLINK_GET_STRICT_CHK: case NETLINK_MSG_INFO: NLCTL_RLOCK(ctl); optval = (nlp->nl_flags & nl_getoptflag(sopt->sopt_name)) != 0; NLCTL_RUNLOCK(ctl); error = sooptcopyout(sopt, &optval, sizeof(optval)); break; default: error = ENOPROTOOPT; } break; default: error = ENOPROTOOPT; } return (error); } static int sysctl_handle_nl_maxsockbuf(SYSCTL_HANDLER_ARGS) { int error = 0; u_long tmp_maxsockbuf = nl_maxsockbuf; error = sysctl_handle_long(oidp, &tmp_maxsockbuf, arg2, req); if (error || !req->newptr) return (error); if (tmp_maxsockbuf < MSIZE + MCLBYTES) return (EINVAL); nl_maxsockbuf = tmp_maxsockbuf; return (0); } static int nl_setsbopt(struct socket *so, struct sockopt *sopt) { int error, optval; bool result; if (sopt->sopt_name != SO_RCVBUF) return (sbsetopt(so, sopt)); /* Allow to override max buffer size in certain conditions */ error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error != 0) return (error); NL_LOG(LOG_DEBUG2, "socket %p, PID %d, SO_RCVBUF=%d", so, curproc->p_pid, optval); if (optval > sb_max_adj) { if (priv_check(curthread, PRIV_NET_ROUTE) != 0) return (EPERM); } SOCK_RECVBUF_LOCK(so); result = sbreserve_locked_limit(so, SO_RCV, optval, nl_maxsockbuf, curthread); SOCK_RECVBUF_UNLOCK(so); return (result ? 
0 : ENOBUFS);
}

#define NETLINK_PROTOSW						\
-	.pr_flags = PR_ATOMIC | PR_ADDR | PR_WANTRCVD,		\
+	.pr_flags = PR_ATOMIC | PR_ADDR | PR_WANTRCVD |		\
+	    PR_SOCKBUF,						\
	.pr_ctloutput = nl_ctloutput,				\
	.pr_setsbopt = nl_setsbopt,				\
	.pr_attach = nl_pru_attach,				\
	.pr_bind = nl_pru_bind,					\
	.pr_connect = nl_pru_connect,				\
	.pr_disconnect = nl_pru_disconnect,			\
-	.pr_send = nl_pru_send,					\
+	.pr_sosend = nl_sosend,					\
	.pr_rcvd = nl_pru_rcvd,					\
	.pr_shutdown = nl_pru_shutdown,				\
	.pr_sockaddr = nl_sockaddr,				\
	.pr_close = nl_close

static struct protosw netlink_raw_sw = {
	.pr_type = SOCK_RAW,
	NETLINK_PROTOSW
};

static struct protosw netlink_dgram_sw = {
	.pr_type = SOCK_DGRAM,
	NETLINK_PROTOSW
};

static struct domain netlinkdomain = {
	.dom_family = PF_NETLINK,
	.dom_name = "netlink",
	.dom_flags = DOMF_UNLOADABLE,
	.dom_nprotosw = 2,
	.dom_protosw = { &netlink_raw_sw, &netlink_dgram_sw },
};

DOMAIN_SET(netlink);

diff --git a/sys/netlink/netlink_io.c b/sys/netlink/netlink_io.c
index 3fe01bb443a1..7e2e098e4a9a 100644
--- a/sys/netlink/netlink_io.c
+++ b/sys/netlink/netlink_io.c
@@ -1,598 +1,533 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2021 Ng Peng Nam Sean
 * Copyright (c) 2022 Alexander V. Chernikov
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#define	DEBUG_MOD_NAME	nl_io
#define	DEBUG_MAX_LEVEL	LOG_DEBUG3
#include
_DECLARE_DEBUG(LOG_INFO);

/*
 * The logic below provides a p2p interface for receiving and
 * sending netlink data between the kernel and userland.
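The protosw above now sets PR_SOCKBUF and replaces the mbuf-based pr_send with pr_sosend, so send(2) lands in nl_sosend(), which copies the user's data into one contiguous nl_buf and parks it on a protocol-private TAILQ inside so_snd, using the sb_hiwat/sb_ccc accounting for backpressure. A self-contained sketch of that queueing discipline, with stand-in types and names rather than the kernel's:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/queue.h>

/* Stand-ins for struct nl_buf and the sockbuf accounting fields. */
struct buf {
	TAILQ_ENTRY(buf) tailq;
	unsigned int datalen;
	char data[];
};

struct sendq {
	TAILQ_HEAD(, buf) queue;
	unsigned int ccc;	/* bytes currently queued (sb_ccc) */
	unsigned int hiwat;	/* high watermark (sb_hiwat) */
};

/*
 * Producer: refuse to queue past the watermark, as nl_sosend() does
 * for non-blocking sockets.
 */
static int
sendq_enqueue(struct sendq *sq, const void *data, unsigned int len)
{
	struct buf *nb;

	if (sq->hiwat - sq->ccc < len)
		return (EWOULDBLOCK);
	nb = malloc(sizeof(*nb) + len);
	if (nb == NULL)
		return (ENOMEM);
	nb->datalen = len;
	memcpy(nb->data, data, len);
	TAILQ_INSERT_TAIL(&sq->queue, nb, tailq);
	sq->ccc += len;
	return (0);
}

/*
 * Consumer: detach the head buffer and release its accounting, as the
 * taskqueue handler does before parsing.
 */
static struct buf *
sendq_dequeue(struct sendq *sq)
{
	struct buf *nb = TAILQ_FIRST(&sq->queue);

	if (nb != NULL) {
		TAILQ_REMOVE(&sq->queue, nb, tailq);
		sq->ccc -= nb->datalen;
	}
	return (nb);
}

int
main(void)
{
	struct sendq sq = { .ccc = 0, .hiwat = 16 };
	struct buf *nb;

	TAILQ_INIT(&sq.queue);
	printf("small: %d\n", sendq_enqueue(&sq, "hello", 5));	/* 0 */
	printf("big: %d\n",		/* EWOULDBLOCK: only 11 bytes left */
	    sendq_enqueue(&sq, "0123456789abcdef", 16));
	while ((nb = sendq_dequeue(&sq)) != NULL)
		free(nb);
	return (0);
}

The blocking case in nl_sosend() simply sleeps in sbwait() and retries via the restart label instead of returning EWOULDBLOCK.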
*/ static const struct sockaddr_nl _nl_empty_src = { .nl_len = sizeof(struct sockaddr_nl), .nl_family = PF_NETLINK, .nl_pid = 0 /* comes from the kernel */ }; static const struct sockaddr *nl_empty_src = (const struct sockaddr *)&_nl_empty_src; -static struct mbuf *nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp); - +static bool nl_process_nbuf(struct nl_buf *nb, struct nlpcb *nlp); static void queue_push(struct nl_io_queue *q, struct mbuf *mq) { while (mq != NULL) { struct mbuf *m = mq; mq = mq->m_nextpkt; m->m_nextpkt = NULL; q->length += m_length(m, NULL); STAILQ_INSERT_TAIL(&q->head, m, m_stailqpkt); } } -static void -queue_push_head(struct nl_io_queue *q, struct mbuf *m) -{ - MPASS(m->m_nextpkt == NULL); - - q->length += m_length(m, NULL); - STAILQ_INSERT_HEAD(&q->head, m, m_stailqpkt); -} - static struct mbuf * queue_pop(struct nl_io_queue *q) { if (!STAILQ_EMPTY(&q->head)) { struct mbuf *m = STAILQ_FIRST(&q->head); STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt); m->m_nextpkt = NULL; q->length -= m_length(m, NULL); return (m); } return (NULL); } static struct mbuf * queue_head(const struct nl_io_queue *q) { return (STAILQ_FIRST(&q->head)); } static inline bool queue_empty(const struct nl_io_queue *q) { return (q->length == 0); } static void queue_free(struct nl_io_queue *q) { while (!STAILQ_EMPTY(&q->head)) { struct mbuf *m = STAILQ_FIRST(&q->head); STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt); m->m_nextpkt = NULL; m_freem(m); } q->length = 0; } void nl_add_msg_info(struct mbuf *m) { struct nlpcb *nlp = nl_get_thread_nlp(curthread); NL_LOG(LOG_DEBUG2, "Trying to recover nlp from thread %p: %p", curthread, nlp); if (nlp == NULL) return; /* Prepare what we want to encode - PID, socket PID & msg seq */ struct { struct nlattr nla; uint32_t val; } data[] = { { .nla.nla_len = sizeof(struct nlattr) + sizeof(uint32_t), .nla.nla_type = NLMSGINFO_ATTR_PROCESS_ID, .val = nlp->nl_process_id, }, { .nla.nla_len = sizeof(struct nlattr) + sizeof(uint32_t), .nla.nla_type = NLMSGINFO_ATTR_PORT_ID, .val = nlp->nl_port, }, }; while (m->m_next != NULL) m = m->m_next; m->m_next = sbcreatecontrol(data, sizeof(data), NETLINK_MSG_INFO, SOL_NETLINK, M_NOWAIT); NL_LOG(LOG_DEBUG2, "Storing %u bytes of data, ctl: %p", (unsigned)sizeof(data), m->m_next); } static __noinline struct mbuf * extract_msg_info(struct mbuf *m) { while (m->m_next != NULL) { if (m->m_next->m_type == MT_CONTROL) { struct mbuf *ctl = m->m_next; m->m_next = NULL; return (ctl); } m = m->m_next; } return (NULL); } -static void +void nl_schedule_taskqueue(struct nlpcb *nlp) { if (!nlp->nl_task_pending) { nlp->nl_task_pending = true; taskqueue_enqueue(nlp->nl_taskqueue, &nlp->nl_task); NL_LOG(LOG_DEBUG3, "taskqueue scheduled"); } else { NL_LOG(LOG_DEBUG3, "taskqueue schedule skipped"); } } -int -nl_receive_async(struct mbuf *m, struct socket *so) -{ - struct nlpcb *nlp = sotonlpcb(so); - int error = 0; - - m->m_nextpkt = NULL; - - NLP_LOCK(nlp); - - if ((__predict_true(nlp->nl_active))) { - sbappend(&so->so_snd, m, 0); - NL_LOG(LOG_DEBUG3, "enqueue %u bytes", m_length(m, NULL)); - nl_schedule_taskqueue(nlp); - } else { - NL_LOG(LOG_DEBUG, "ignoring %u bytes on non-active socket", - m_length(m, NULL)); - m_free(m); - error = EINVAL; - } - - NLP_UNLOCK(nlp); - - return (error); -} - static bool tx_check_locked(struct nlpcb *nlp) { if (queue_empty(&nlp->tx_queue)) return (true); /* * Check if something can be moved from the internal TX queue * to the socket queue. 
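nl_add_msg_info() above stashes the sender's process ID and port ID as two nlattr-framed u32s in a control mbuf of type NETLINK_MSG_INFO at level SOL_NETLINK (built via sbcreatecontrol()), and extract_msg_info() later splits that MT_CONTROL mbuf off so it can be delivered as a cmsg. A hedged userland sketch of consuming it; it assumes the socket option constants and struct nlattr exported by FreeBSD's <netlink/netlink.h>, and ALIGN4() is a local helper, not a header macro:

#include <sys/socket.h>
#include <netlink/netlink.h>
#include <netlink/netlink_route.h>
#include <err.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define ALIGN4(x)	(((x) + 3U) & ~3U)

/* Walk NETLINK_MSG_INFO cmsgs: a run of { struct nlattr, uint32_t }. */
void
dump_msg_info(struct msghdr *msg)
{
	struct cmsghdr *cm;

	for (cm = CMSG_FIRSTHDR(msg); cm != NULL; cm = CMSG_NXTHDR(msg, cm)) {
		struct nlattr nla;
		uint32_t val;
		char *p, *end;

		if (cm->cmsg_level != SOL_NETLINK ||
		    cm->cmsg_type != NETLINK_MSG_INFO)
			continue;
		p = (char *)CMSG_DATA(cm);
		end = (char *)cm + cm->cmsg_len;
		while (p + sizeof(nla) + sizeof(val) <= end) {
			memcpy(&nla, p, sizeof(nla));
			if (nla.nla_len < sizeof(nla) + sizeof(val))
				break;
			memcpy(&val, p + sizeof(nla), sizeof(val));
			/* e.g. NLMSGINFO_ATTR_PROCESS_ID / _PORT_ID */
			printf("msg-info attr %u = %u\n", nla.nla_type, val);
			p += ALIGN4(nla.nla_len);
		}
	}
}

int
main(void)
{
	char data[4096], cbuf[256];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = cbuf,
		.msg_controllen = sizeof(cbuf),
	};
	int fd, one = 1;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd == -1)
		err(1, "socket");
	/* Opt in; nl_send_one_group() then calls nl_add_msg_info(). */
	if (setsockopt(fd, SOL_NETLINK, NETLINK_MSG_INFO, &one,
	    sizeof(one)) == -1)
		err(1, "setsockopt");
	one = RTNLGRP_LINK;
	if (setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &one,
	    sizeof(one)) == -1)
		err(1, "setsockopt");
	/* Blocks until a link event (e.g. an interface going up) arrives. */
	if (recvmsg(fd, &msg, 0) == -1)
		err(1, "recvmsg");
	dump_msg_info(&msg);
	return (0);
}

Note the diff rejects NETLINK_MSG_INFO on Linux-ABI sockets (nlp->nl_linux), since Linux has no such option.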
*/ bool appended = false; struct sockbuf *sb = &nlp->nl_socket->so_rcv; SOCKBUF_LOCK(sb); while (true) { struct mbuf *m = queue_head(&nlp->tx_queue); if (m != NULL) { struct mbuf *ctl = NULL; if (__predict_false(m->m_next != NULL)) ctl = extract_msg_info(m); if (sbappendaddr_locked(sb, nl_empty_src, m, ctl) != 0) { /* appended successfully */ queue_pop(&nlp->tx_queue); appended = true; } else break; } else break; } SOCKBUF_UNLOCK(sb); if (appended) sorwakeup(nlp->nl_socket); return (queue_empty(&nlp->tx_queue)); } static bool nl_process_received_one(struct nlpcb *nlp) { + struct socket *so = nlp->nl_socket; + struct sockbuf *sb = &so->so_snd; + struct nl_buf *nb; bool reschedule = false; NLP_LOCK(nlp); nlp->nl_task_pending = false; if (!tx_check_locked(nlp)) { /* TX overflow queue still not empty, ignore RX */ NLP_UNLOCK(nlp); return (false); } - if (queue_empty(&nlp->rx_queue)) { - /* - * Grab all data we have from the socket TX queue - * and store it the internal queue, so it can be worked on - * w/o holding socket lock. - */ - struct sockbuf *sb = &nlp->nl_socket->so_snd; - - SOCKBUF_LOCK(sb); - unsigned int avail = sbavail(sb); - if (avail > 0) { - NL_LOG(LOG_DEBUG3, "grabbed %u bytes", avail); - queue_push(&nlp->rx_queue, sbcut_locked(sb, avail)); - } - SOCKBUF_UNLOCK(sb); - } else { - /* Schedule another pass to read from the socket queue */ - reschedule = true; - } - int prev_hiwat = nlp->tx_queue.hiwat; NLP_UNLOCK(nlp); - while (!queue_empty(&nlp->rx_queue)) { - struct mbuf *m = queue_pop(&nlp->rx_queue); - - m = nl_process_mbuf(m, nlp); - if (m != NULL) { - queue_push_head(&nlp->rx_queue, m); - reschedule = false; + SOCK_SENDBUF_LOCK(so); + while ((nb = TAILQ_FIRST(&sb->nl_queue)) != NULL) { + TAILQ_REMOVE(&sb->nl_queue, nb, tailq); + SOCK_SENDBUF_UNLOCK(so); + reschedule = nl_process_nbuf(nb, nlp); + SOCK_SENDBUF_LOCK(so); + if (reschedule) { + sb->sb_acc -= nb->datalen; + sb->sb_ccc -= nb->datalen; + /* XXXGL: potentially can reduce lock&unlock count. */ + sowwakeup_locked(so); + free(nb, M_NETLINK); + SOCK_SENDBUF_LOCK(so); + } else { + TAILQ_INSERT_HEAD(&sb->nl_queue, nb, tailq); break; } } + SOCK_SENDBUF_UNLOCK(so); if (nlp->tx_queue.hiwat > prev_hiwat) { NLP_LOG(LOG_DEBUG, nlp, "TX override peaked to %d", nlp->tx_queue.hiwat); } return (reschedule); } static void nl_process_received(struct nlpcb *nlp) { NL_LOG(LOG_DEBUG3, "taskqueue called"); if (__predict_false(nlp->nl_need_thread_setup)) { nl_set_thread_nlp(curthread, nlp); NLP_LOCK(nlp); nlp->nl_need_thread_setup = false; NLP_UNLOCK(nlp); } while (nl_process_received_one(nlp)) ; } void nl_init_io(struct nlpcb *nlp) { - STAILQ_INIT(&nlp->rx_queue.head); STAILQ_INIT(&nlp->tx_queue.head); } void nl_free_io(struct nlpcb *nlp) { - queue_free(&nlp->rx_queue); queue_free(&nlp->tx_queue); } /* * Called after some data have been read from the socket. */ void nl_on_transmit(struct nlpcb *nlp) { NLP_LOCK(nlp); struct socket *so = nlp->nl_socket; if (__predict_false(nlp->nl_dropped_bytes > 0 && so != NULL)) { unsigned long dropped_bytes = nlp->nl_dropped_bytes; unsigned long dropped_messages = nlp->nl_dropped_messages; nlp->nl_dropped_bytes = 0; nlp->nl_dropped_messages = 0; struct sockbuf *sb = &so->so_rcv; NLP_LOG(LOG_DEBUG, nlp, "socket RX overflowed, %lu messages (%lu bytes) dropped. 
" "bytes: [%u/%u] mbufs: [%u/%u]", dropped_messages, dropped_bytes, sb->sb_ccc, sb->sb_hiwat, sb->sb_mbcnt, sb->sb_mbmax); /* TODO: send netlink message */ } nl_schedule_taskqueue(nlp); NLP_UNLOCK(nlp); } void nl_taskqueue_handler(void *_arg, int pending) { struct nlpcb *nlp = (struct nlpcb *)_arg; CURVNET_SET(nlp->nl_socket->so_vnet); nl_process_received(nlp); CURVNET_RESTORE(); } static __noinline void queue_push_tx(struct nlpcb *nlp, struct mbuf *m) { queue_push(&nlp->tx_queue, m); nlp->nl_tx_blocked = true; if (nlp->tx_queue.length > nlp->tx_queue.hiwat) nlp->tx_queue.hiwat = nlp->tx_queue.length; } /* * Tries to send @m to the socket @nlp. * * @m: mbuf(s) to send to. Consumed in any case. * @nlp: socket to send to * @cnt: number of messages in @m * @io_flags: combination of NL_IOF_* flags * * Returns true on success. * If no queue overrunes happened, wakes up socket owner. */ bool nl_send_one(struct mbuf *m, struct nlpcb *nlp, int num_messages, int io_flags) { bool untranslated = io_flags & NL_IOF_UNTRANSLATED; bool ignore_limits = io_flags & NL_IOF_IGNORE_LIMIT; bool result = true; IF_DEBUG_LEVEL(LOG_DEBUG2) { struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *); NLP_LOG(LOG_DEBUG2, nlp, "TX mbuf len %u msgs %u msg type %d first hdrlen %u io_flags %X", m_length(m, NULL), num_messages, hdr->nlmsg_type, hdr->nlmsg_len, io_flags); } if (__predict_false(nlp->nl_linux && linux_netlink_p != NULL && untranslated)) { m = linux_netlink_p->mbufs_to_linux(nlp->nl_proto, m, nlp); if (m == NULL) return (false); } NLP_LOCK(nlp); if (__predict_false(nlp->nl_socket == NULL)) { NLP_UNLOCK(nlp); m_freem(m); return (false); } if (!queue_empty(&nlp->tx_queue)) { if (ignore_limits) { queue_push_tx(nlp, m); } else { m_free(m); result = false; } NLP_UNLOCK(nlp); return (result); } struct socket *so = nlp->nl_socket; struct mbuf *ctl = NULL; if (__predict_false(m->m_next != NULL)) ctl = extract_msg_info(m); if (sbappendaddr(&so->so_rcv, nl_empty_src, m, ctl) != 0) { sorwakeup(so); NLP_LOG(LOG_DEBUG3, nlp, "appended data & woken up"); } else { if (ignore_limits) { queue_push_tx(nlp, m); } else { /* * Store dropped data so it can be reported * on the next read */ nlp->nl_dropped_bytes += m_length(m, NULL); nlp->nl_dropped_messages += num_messages; NLP_LOG(LOG_DEBUG2, nlp, "RX oveflow: %lu m (+%d), %lu b (+%d)", (unsigned long)nlp->nl_dropped_messages, num_messages, (unsigned long)nlp->nl_dropped_bytes, m_length(m, NULL)); soroverflow(so); m_freem(m); result = false; } } NLP_UNLOCK(nlp); return (result); } static int nl_receive_message(struct nlmsghdr *hdr, int remaining_length, struct nlpcb *nlp, struct nl_pstate *npt) { nl_handler_f handler = nl_handlers[nlp->nl_proto].cb; int error = 0; NLP_LOG(LOG_DEBUG2, nlp, "msg len: %u type: %d: flags: 0x%X seq: %u pid: %u", hdr->nlmsg_len, hdr->nlmsg_type, hdr->nlmsg_flags, hdr->nlmsg_seq, hdr->nlmsg_pid); if (__predict_false(hdr->nlmsg_len > remaining_length)) { NLP_LOG(LOG_DEBUG, nlp, "message is not entirely present: want %d got %d", hdr->nlmsg_len, remaining_length); return (EINVAL); } else if (__predict_false(hdr->nlmsg_len < sizeof(*hdr))) { NL_LOG(LOG_DEBUG, "message too short: %d", hdr->nlmsg_len); return (EINVAL); } /* Stamp each message with sender pid */ hdr->nlmsg_pid = nlp->nl_port; npt->hdr = hdr; if (hdr->nlmsg_flags & NLM_F_REQUEST && hdr->nlmsg_type >= NLMSG_MIN_TYPE) { NL_LOG(LOG_DEBUG2, "handling message with msg type: %d", hdr->nlmsg_type); if (nlp->nl_linux && linux_netlink_p != NULL) { struct nlmsghdr *hdr_orig = hdr; hdr = 
linux_netlink_p->msg_from_linux(nlp->nl_proto, hdr, npt); if (hdr == NULL) { /* Failed to translate to kernel format. Report an error back */ hdr = hdr_orig; npt->hdr = hdr; if (hdr->nlmsg_flags & NLM_F_ACK) nlmsg_ack(nlp, EOPNOTSUPP, hdr, npt); return (0); } } error = handler(hdr, npt); NL_LOG(LOG_DEBUG2, "retcode: %d", error); } if ((hdr->nlmsg_flags & NLM_F_ACK) || (error != 0 && error != EINTR)) { if (!npt->nw->suppress_ack) { NL_LOG(LOG_DEBUG3, "ack"); nlmsg_ack(nlp, error, hdr, npt); } } return (0); } static void npt_clear(struct nl_pstate *npt) { lb_clear(&npt->lb); npt->error = 0; npt->err_msg = NULL; npt->err_off = 0; npt->hdr = NULL; npt->nw->suppress_ack = false; } /* * Processes an incoming packet, which can contain multiple netlink messages */ -static struct mbuf * -nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp) +static bool +nl_process_nbuf(struct nl_buf *nb, struct nlpcb *nlp) { - int offset, buffer_length; struct nlmsghdr *hdr; - char *buffer; int error; - NL_LOG(LOG_DEBUG3, "RX netlink mbuf %p on %p", m, nlp->nl_socket); + NL_LOG(LOG_DEBUG3, "RX netlink buf %p on %p", nb, nlp->nl_socket); struct nl_writer nw = {}; if (!nlmsg_get_unicast_writer(&nw, NLMSG_SMALL, nlp)) { - m_freem(m); NL_LOG(LOG_DEBUG, "error allocating socket writer"); - return (NULL); + return (true); } nlmsg_ignore_limit(&nw); - /* TODO: alloc this buf once for nlp */ - int data_length = m_length(m, NULL); - buffer_length = roundup2(data_length, 8) + SCRATCH_BUFFER_SIZE; - if (nlp->nl_linux) - buffer_length += roundup2(data_length, 8); - buffer = malloc(buffer_length, M_NETLINK, M_NOWAIT | M_ZERO); - if (buffer == NULL) { - m_freem(m); - nlmsg_flush(&nw); - NL_LOG(LOG_DEBUG, "Unable to allocate %d bytes of memory", - buffer_length); - return (NULL); - } - m_copydata(m, 0, data_length, buffer); struct nl_pstate npt = { .nlp = nlp, - .lb.base = &buffer[roundup2(data_length, 8)], - .lb.size = buffer_length - roundup2(data_length, 8), + .lb.base = &nb->data[roundup2(nb->datalen, 8)], + .lb.size = nb->buflen - roundup2(nb->datalen, 8), .nw = &nw, .strict = nlp->nl_flags & NLF_STRICT, }; - for (offset = 0; offset + sizeof(struct nlmsghdr) <= data_length;) { - hdr = (struct nlmsghdr *)&buffer[offset]; + for (; nb->offset + sizeof(struct nlmsghdr) <= nb->datalen;) { + hdr = (struct nlmsghdr *)&nb->data[nb->offset]; /* Save length prior to calling handler */ int msglen = NLMSG_ALIGN(hdr->nlmsg_len); - NL_LOG(LOG_DEBUG3, "parsing offset %d/%d", offset, data_length); + NL_LOG(LOG_DEBUG3, "parsing offset %d/%d", + nb->offset, nb->datalen); npt_clear(&npt); - error = nl_receive_message(hdr, data_length - offset, nlp, &npt); - offset += msglen; + error = nl_receive_message(hdr, nb->datalen - nb->offset, nlp, + &npt); + nb->offset += msglen; if (__predict_false(error != 0 || nlp->nl_tx_blocked)) break; } NL_LOG(LOG_DEBUG3, "packet parsing done"); - free(buffer, M_NETLINK); nlmsg_flush(&nw); if (nlp->nl_tx_blocked) { NLP_LOCK(nlp); nlp->nl_tx_blocked = false; NLP_UNLOCK(nlp); - m_adj(m, offset); - return (m); - } else { - m_freem(m); - return (NULL); - } + return (false); + } else + return (true); } diff --git a/sys/netlink/netlink_var.h b/sys/netlink/netlink_var.h index 36b7c61974c9..ec174e17d1a2 100644 --- a/sys/netlink/netlink_var.h +++ b/sys/netlink/netlink_var.h @@ -1,208 +1,214 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2021 Ng Peng Nam Sean * Copyright (c) 2022 Alexander V. 
Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _NETLINK_NETLINK_VAR_H_ #define _NETLINK_NETLINK_VAR_H_ #ifdef _KERNEL #include #include #include #include #include #define NLSNDQ 65536 /* Default socket sendspace */ #define NLRCVQ 65536 /* Default socket recvspace */ #define NLMBUFSIZE 2048 /* External storage size for Netlink mbufs */ struct ucred; struct nl_io_queue { STAILQ_HEAD(, mbuf) head; int length; int hiwat; }; +struct nl_buf { + TAILQ_ENTRY(nl_buf) tailq; + u_int buflen; + u_int datalen; + u_int offset; + char data[]; +}; + #define NLP_MAX_GROUPS 128 struct nlpcb { struct socket *nl_socket; uint64_t nl_groups[NLP_MAX_GROUPS / 64]; uint32_t nl_port; uint32_t nl_flags; uint32_t nl_process_id; int nl_proto; - bool nl_active; bool nl_bound; bool nl_task_pending; bool nl_tx_blocked; /* No new requests accepted */ bool nl_linux; /* true if running under compat */ bool nl_unconstrained_vnet; /* true if running under VNET jail (or without jail) */ bool nl_need_thread_setup; - struct nl_io_queue rx_queue; struct nl_io_queue tx_queue; struct taskqueue *nl_taskqueue; struct task nl_task; struct ucred *nl_cred; /* Copy of nl_socket->so_cred */ uint64_t nl_dropped_bytes; uint64_t nl_dropped_messages; CK_LIST_ENTRY(nlpcb) nl_next; CK_LIST_ENTRY(nlpcb) nl_port_next; volatile u_int nl_refcount; struct mtx nl_lock; struct epoch_context nl_epoch_ctx; }; #define sotonlpcb(so) ((struct nlpcb *)(so)->so_pcb) #define NLP_LOCK_INIT(_nlp) mtx_init(&((_nlp)->nl_lock), "nlp mtx", NULL, MTX_DEF) #define NLP_LOCK_DESTROY(_nlp) mtx_destroy(&((_nlp)->nl_lock)) #define NLP_LOCK(_nlp) mtx_lock(&((_nlp)->nl_lock)) #define NLP_UNLOCK(_nlp) mtx_unlock(&((_nlp)->nl_lock)) #define ALIGNED_NL_SZ(_data) roundup2((((struct nlmsghdr *)(_data))->nlmsg_len), 16) /* nl_flags */ #define NLF_CAP_ACK 0x01 /* Do not send message body with errmsg */ #define NLF_EXT_ACK 0x02 /* Allow including extended TLVs in ack */ #define NLF_STRICT 0x04 /* Perform strict header checks */ #define NLF_MSG_INFO 0x08 /* Send caller info along with the notifications */ SYSCTL_DECL(_net_netlink); SYSCTL_DECL(_net_netlink_debug); struct nl_control { CK_LIST_HEAD(nl_pid_head, nlpcb) ctl_port_head; CK_LIST_HEAD(nlpcb_head, nlpcb) ctl_pcb_head; CK_LIST_ENTRY(nl_control) ctl_next; struct rmlock ctl_lock; }; VNET_DECLARE(struct nl_control *, 
nl_ctl); #define V_nl_ctl VNET(nl_ctl) struct sockaddr_nl; struct sockaddr; struct nlmsghdr; /* netlink_module.c */ struct nl_control *vnet_nl_ctl_init(void); int nl_verify_proto(int proto); const char *nl_get_proto_name(int proto); extern int netlink_unloading; struct nl_proto_handler { nl_handler_f cb; const char *proto_name; }; extern struct nl_proto_handler *nl_handlers; /* netlink_domain.c */ void nl_send_group(struct mbuf *m, int cnt, int proto, int group_id); void nl_osd_register(void); void nl_osd_unregister(void); void nl_set_thread_nlp(struct thread *td, struct nlpcb *nlp); /* netlink_io.c */ #define NL_IOF_UNTRANSLATED 0x01 #define NL_IOF_IGNORE_LIMIT 0x02 bool nl_send_one(struct mbuf *m, struct nlpcb *nlp, int cnt, int io_flags); void nlmsg_ack(struct nlpcb *nlp, int error, struct nlmsghdr *nlmsg, struct nl_pstate *npt); void nl_on_transmit(struct nlpcb *nlp); void nl_init_io(struct nlpcb *nlp); void nl_free_io(struct nlpcb *nlp); void nl_taskqueue_handler(void *_arg, int pending); -int nl_receive_async(struct mbuf *m, struct socket *so); +void nl_schedule_taskqueue(struct nlpcb *nlp); void nl_process_receive_locked(struct nlpcb *nlp); void nl_set_source_metadata(struct mbuf *m, int num_messages); void nl_add_msg_info(struct mbuf *m); /* netlink_message_writer.c */ void nl_init_msg_zone(void); void nl_destroy_msg_zone(void); /* netlink_generic.c */ struct genl_family { const char *family_name; uint16_t family_hdrsize; uint16_t family_id; uint16_t family_version; uint16_t family_attr_max; uint16_t family_cmd_size; uint16_t family_num_groups; struct genl_cmd *family_cmds; }; struct genl_group { struct genl_family *group_family; const char *group_name; }; struct genl_family *genl_get_family(uint32_t family_id); struct genl_group *genl_get_group(uint32_t group_id); #define MAX_FAMILIES 20 #define MAX_GROUPS 64 #define MIN_GROUP_NUM 48 #define CTRL_FAMILY_NAME "nlctrl" struct ifnet; struct nl_parsed_link; struct nlattr_bmask; struct nl_pstate; /* Function map */ struct nl_function_wrapper { bool (*nlmsg_add)(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type, uint16_t flags, uint32_t len); bool (*nlmsg_refill_buffer)(struct nl_writer *nw, int required_len); bool (*nlmsg_flush)(struct nl_writer *nw); bool (*nlmsg_end)(struct nl_writer *nw); void (*nlmsg_abort)(struct nl_writer *nw); void (*nlmsg_ignore_limit)(struct nl_writer *nw); bool (*nlmsg_get_unicast_writer)(struct nl_writer *nw, int size, struct nlpcb *nlp); bool (*nlmsg_get_group_writer)(struct nl_writer *nw, int size, int protocol, int group_id); bool (*nlmsg_get_chain_writer)(struct nl_writer *nw, int size, struct mbuf **pm); bool (*nlmsg_end_dump)(struct nl_writer *nw, int error, struct nlmsghdr *hdr); int (*nl_modify_ifp_generic)(struct ifnet *ifp, struct nl_parsed_link *lattrs, const struct nlattr_bmask *bm, struct nl_pstate *npt); void (*nl_store_ifp_cookie)(struct nl_pstate *npt, struct ifnet *ifp); struct nlpcb * (*nl_get_thread_nlp)(struct thread *td); }; void nl_set_functions(const struct nl_function_wrapper *nl); #endif #endif diff --git a/sys/sys/sockbuf.h b/sys/sys/sockbuf.h index 92b9964072fb..c6093883be4a 100644 --- a/sys/sys/sockbuf.h +++ b/sys/sys/sockbuf.h @@ -1,317 +1,323 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _SYS_SOCKBUF_H_ #define _SYS_SOCKBUF_H_ /* * Constants for sb_flags field of struct sockbuf/xsockbuf. */ #define SB_TLS_RX 0x01 /* using KTLS on RX */ #define SB_TLS_RX_RUNNING 0x02 /* KTLS RX operation running */ #define SB_WAIT 0x04 /* someone is waiting for data/space */ #define SB_SEL 0x08 /* someone is selecting */ #define SB_ASYNC 0x10 /* ASYNC I/O, need signals */ #define SB_UPCALL 0x20 /* someone wants an upcall */ #define SB_NOINTR 0x40 /* operations not interruptible */ #define SB_AIO 0x80 /* AIO operations queued */ #define SB_KNOTE 0x100 /* kernel note attached */ #define SB_NOCOALESCE 0x200 /* don't coalesce new data into existing mbufs */ #define SB_IN_TOE 0x400 /* socket buffer is in the middle of an operation */ #define SB_AUTOSIZE 0x800 /* automatically size socket buffer */ #define SB_STOP 0x1000 /* backpressure indicator */ #define SB_AIO_RUNNING 0x2000 /* AIO operation running */ #define SB_UNUSED 0x4000 /* previously used for SB_TLS_IFNET */ #define SB_TLS_RX_RESYNC 0x8000 /* KTLS RX lost HW sync */ #define SBS_CANTSENDMORE 0x0010 /* can't send more data to peer */ #define SBS_CANTRCVMORE 0x0020 /* can't receive more data from peer */ #define SBS_RCVATMARK 0x0040 /* at mark on input */ #if defined(_KERNEL) || defined(_WANT_SOCKET) #include #include #include #include #define SB_MAX (2*1024*1024) /* default for max chars in sockbuf */ struct ktls_session; struct mbuf; struct sockaddr; struct socket; struct sockopt; struct thread; struct selinfo; /* * Socket buffer * * A buffer starts with the fields that are accessed by I/O multiplexing * APIs like select(2), kevent(2) or AIO and thus are shared between different * buffer implementations. They are protected by the SOCK_RECVBUF_LOCK() * or SOCK_SENDBUF_LOCK() of the owning socket. * * XXX: sb_acc, sb_ccc and sb_mbcnt shall become implementation specific * methods. * * Protocol specific implementations follow in a union. 
*/ struct sockbuf { struct selinfo *sb_sel; /* process selecting read/write */ short sb_state; /* socket state on sockbuf */ short sb_flags; /* flags, see above */ u_int sb_acc; /* available chars in buffer */ u_int sb_ccc; /* claimed chars in buffer */ u_int sb_mbcnt; /* chars of mbufs used */ u_int sb_ctl; /* non-data chars in buffer */ u_int sb_hiwat; /* max actual char count */ u_int sb_lowat; /* low water mark */ u_int sb_mbmax; /* max chars of mbufs to use */ sbintime_t sb_timeo; /* timeout for read/write */ int (*sb_upcall)(struct socket *, void *, int); void *sb_upcallarg; TAILQ_HEAD(, kaiocb) sb_aiojobq; /* pending AIO ops */ struct task sb_aiotask; /* AIO task */ union { /* * Classic BSD one-size-fits-all socket buffer, capable of * doing streams and datagrams. The stream part is able * to perform special features: * - not ready data (sendfile) * - TLS */ struct { /* compat: sockbuf lock pointer */ struct mtx *sb_mtx; /* first and last mbufs in the chain */ struct mbuf *sb_mb; struct mbuf *sb_mbtail; /* first mbuf of last record in socket buffer */ struct mbuf *sb_lastrecord; /* pointer to data to send next (TCP */ struct mbuf *sb_sndptr; /* pointer to first not ready buffer */ struct mbuf *sb_fnrdy; /* byte offset of ptr into chain, used with sb_sndptr */ u_int sb_sndptroff; /* TLS */ u_int sb_tlscc; /* TLS chain characters */ u_int sb_tlsdcc; /* characters being decrypted */ struct mbuf *sb_mtls; /* TLS mbuf chain */ struct mbuf *sb_mtlstail; /* last mbuf in TLS chain */ uint64_t sb_tls_seqno; /* TLS seqno */ struct ktls_session *sb_tls_info; /* TLS state */ }; /* * PF_UNIX/SOCK_DGRAM * * Local protocol, thus we should buffer on the receive side * only. However, in one to many configuration we don't want * a single receive buffer to be shared. So we would link * send buffers onto receive buffer. All the fields are locked * by the receive buffer lock. */ struct { /* * For receive buffer: own queue of this buffer for * unconnected sends. For send buffer: queue lended * to the peer receive buffer, to isolate ourselves * from other senders. */ STAILQ_HEAD(, mbuf) uxdg_mb; /* For receive buffer: datagram seen via MSG_PEEK. */ struct mbuf *uxdg_peeked; /* * For receive buffer: queue of send buffers of * connected peers. For send buffer: linkage on * connected peer receive buffer queue. */ union { TAILQ_HEAD(, sockbuf) uxdg_conns; TAILQ_ENTRY(sockbuf) uxdg_clist; }; /* Counters for this buffer uxdg_mb chain + peeked. */ u_int uxdg_cc; u_int uxdg_ctl; u_int uxdg_mbcnt; }; + /* + * Netlink socket. + */ + struct { + TAILQ_HEAD(, nl_buf) nl_queue; + }; }; }; #endif /* defined(_KERNEL) || defined(_WANT_SOCKET) */ #ifdef _KERNEL /* 'which' values for KPIs that operate on one buffer of a socket. */ typedef enum { SO_RCV, SO_SND } sb_which; /* * Per-socket buffer mutex used to protect most fields in the socket buffer. * These make use of the mutex pointer embedded in struct sockbuf, which * currently just references mutexes in the containing socket. The * SOCK_SENDBUF_LOCK() etc. macros can be used instead of or in combination with * these locking macros. */ #define SOCKBUF_MTX(_sb) ((_sb)->sb_mtx) #define SOCKBUF_LOCK(_sb) mtx_lock(SOCKBUF_MTX(_sb)) #define SOCKBUF_OWNED(_sb) mtx_owned(SOCKBUF_MTX(_sb)) #define SOCKBUF_UNLOCK(_sb) mtx_unlock(SOCKBUF_MTX(_sb)) #define SOCKBUF_LOCK_ASSERT(_sb) mtx_assert(SOCKBUF_MTX(_sb), MA_OWNED) #define SOCKBUF_UNLOCK_ASSERT(_sb) mtx_assert(SOCKBUF_MTX(_sb), MA_NOTOWNED) /* * Socket buffer private mbuf(9) flags. 
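The netlink arm added to the union above gives netlink sockets their own send-side representation, a TAILQ of struct nl_buf, while the fields shared by select(2)/kevent(2) and the accounting stay outside the union, as the comment before struct sockbuf explains. A toy layout sketch of the pattern, using illustrative stand-in types rather than the kernel's:

#include <sys/queue.h>
#include <stdio.h>

struct nl_buf_x { TAILQ_ENTRY(nl_buf_x) tailq; };	/* cf. struct nl_buf */
struct mbuf_x { struct mbuf_x *m_next; };		/* cf. struct mbuf */

struct sockbuf_x {
	/* Shared, implementation-independent accounting. */
	unsigned int sb_ccc;
	unsigned int sb_hiwat;
	/* Per-protocol representations share one footprint. */
	union {
		struct {	/* classic BSD stream/datagram buffer */
			struct mbuf_x *sb_mb;
			struct mbuf_x *sb_mbtail;
		};
		struct {	/* netlink: queue of contiguous nl_bufs */
			TAILQ_HEAD(, nl_buf_x) nl_queue;
		};
	};
};

int
main(void)
{
	struct sockbuf_x sb = { .sb_hiwat = 65536 };

	TAILQ_INIT(&sb.nl_queue);	/* the netlink arm of the union */
	printf("%zu bytes, empty: %d\n", sizeof(sb),
	    TAILQ_EMPTY(&sb.nl_queue));
	return (0);
}

Because the arms alias the same storage, a protocol must touch only its own arm; the PR_SOCKBUF flag added to the protosw in this diff signals that the protocol manages its buffer contents itself.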
*/ #define M_NOTREADY M_PROTO1 /* m_data not populated yet */ #define M_BLOCKED M_PROTO2 /* M_NOTREADY in front of m */ #define M_NOTAVAIL (M_NOTREADY | M_BLOCKED) void sbappend(struct sockbuf *sb, struct mbuf *m, int flags); void sbappend_locked(struct sockbuf *sb, struct mbuf *m, int flags); void sbappendstream(struct sockbuf *sb, struct mbuf *m, int flags); void sbappendstream_locked(struct sockbuf *sb, struct mbuf *m, int flags); int sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control); int sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control); int sbappendaddr_nospacecheck_locked(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0, struct mbuf *control); void sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control, int flags); void sbappendcontrol_locked(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control, int flags); void sbappendrecord(struct sockbuf *sb, struct mbuf *m0); void sbappendrecord_locked(struct sockbuf *sb, struct mbuf *m0); void sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n); struct mbuf * sbcreatecontrol(const void *p, u_int size, int type, int level, int wait); void sbdestroy(struct socket *, sb_which); void sbdrop(struct sockbuf *sb, int len); void sbdrop_locked(struct sockbuf *sb, int len); struct mbuf * sbcut_locked(struct sockbuf *sb, int len); void sbdroprecord(struct sockbuf *sb); void sbdroprecord_locked(struct sockbuf *sb); void sbflush(struct sockbuf *sb); void sbflush_locked(struct sockbuf *sb); void sbrelease(struct socket *, sb_which); void sbrelease_locked(struct socket *, sb_which); int sbsetopt(struct socket *so, struct sockopt *); bool sbreserve_locked(struct socket *so, sb_which which, u_long cc, struct thread *td); bool sbreserve_locked_limit(struct socket *so, sb_which which, u_long cc, u_long buf_max, struct thread *td); void sbsndptr_adv(struct sockbuf *sb, struct mbuf *mb, u_int len); struct mbuf * sbsndptr_noadv(struct sockbuf *sb, u_int off, u_int *moff); struct mbuf * sbsndmbuf(struct sockbuf *sb, u_int off, u_int *moff); int sbwait(struct socket *, sb_which); void sballoc(struct sockbuf *, struct mbuf *); void sbfree(struct sockbuf *, struct mbuf *); void sballoc_ktls_rx(struct sockbuf *sb, struct mbuf *m); void sbfree_ktls_rx(struct sockbuf *sb, struct mbuf *m); int sbready(struct sockbuf *, struct mbuf *, int); /* * Return how much data is available to be taken out of socket * buffer right now. */ static inline u_int sbavail(struct sockbuf *sb) { #if 0 SOCKBUF_LOCK_ASSERT(sb); #endif return (sb->sb_acc); } /* * Return how much data sits there in the socket buffer * It might be that some data is not yet ready to be read. */ static inline u_int sbused(struct sockbuf *sb) { #if 0 SOCKBUF_LOCK_ASSERT(sb); #endif return (sb->sb_ccc); } /* * How much space is there in a socket buffer (so->so_snd or so->so_rcv)? * This is problematical if the fields are unsigned, as the space might * still be negative (ccc > hiwat or mbcnt > mbmax). */ static inline long sbspace(struct sockbuf *sb) { int bleft, mleft; /* size should match sockbuf fields */ #if 0 SOCKBUF_LOCK_ASSERT(sb); #endif if (sb->sb_flags & SB_STOP) return(0); bleft = sb->sb_hiwat - sb->sb_ccc; mleft = sb->sb_mbmax - sb->sb_mbcnt; return ((bleft < mleft) ? 
bleft : mleft); } #define SB_EMPTY_FIXUP(sb) do { \ if ((sb)->sb_mb == NULL) { \ (sb)->sb_mbtail = NULL; \ (sb)->sb_lastrecord = NULL; \ } \ } while (/*CONSTCOND*/0) #ifdef SOCKBUF_DEBUG void sblastrecordchk(struct sockbuf *, const char *, int); void sblastmbufchk(struct sockbuf *, const char *, int); void sbcheck(struct sockbuf *, const char *, int); #define SBLASTRECORDCHK(sb) sblastrecordchk((sb), __FILE__, __LINE__) #define SBLASTMBUFCHK(sb) sblastmbufchk((sb), __FILE__, __LINE__) #define SBCHECK(sb) sbcheck((sb), __FILE__, __LINE__) #else #define SBLASTRECORDCHK(sb) do {} while (0) #define SBLASTMBUFCHK(sb) do {} while (0) #define SBCHECK(sb) do {} while (0) #endif /* SOCKBUF_DEBUG */ #endif /* _KERNEL */ #endif /* _SYS_SOCKBUF_H_ */
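Taken together, the three files give netlink a contiguous-buffer send path, and it can be exercised end to end with a plain client: send(2) enters nl_sosend(), the taskqueue parses the queued nl_buf via nl_process_nbuf(), and replies arrive through the unchanged so_rcv path. A minimal FreeBSD example, assuming the userland headers <netlink/netlink.h> and <netlink/netlink_route.h> with their Linux-compatible NLMSG_* macros:

#include <sys/socket.h>
#include <netlink/netlink.h>
#include <netlink/netlink_route.h>
#include <err.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct {
		struct nlmsghdr hdr;
		struct ifinfomsg ifm;
	} req;
	union {				/* keep replies aligned for nlmsghdr */
		uint64_t align;
		char data[8192];
	} buf;
	struct nlmsghdr *h;
	ssize_t n;
	int fd;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd == -1)
		err(1, "socket");

	memset(&req, 0, sizeof(req));
	req.hdr.nlmsg_len = sizeof(req);
	req.hdr.nlmsg_type = RTM_GETLINK;
	req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
	req.hdr.nlmsg_seq = 1;

	/* Copied into one nl_buf by nl_sosend() and queued on so_snd. */
	if (send(fd, &req, sizeof(req), 0) == -1)
		err(1, "send");

	/* Read one batch; a complete client loops until NLMSG_DONE. */
	if ((n = recv(fd, buf.data, sizeof(buf.data), 0)) == -1)
		err(1, "recv");
	for (h = (struct nlmsghdr *)buf.data; NLMSG_OK(h, n);
	    h = NLMSG_NEXT(h, n)) {
		printf("type %u, len %u\n", h->nlmsg_type, h->nlmsg_len);
		if (h->nlmsg_type == NLMSG_DONE)
			break;
	}
	close(fd);
	return (0);
}

No bind(2) is needed for a request/response exchange; replies are delivered to the requesting socket, and the kernel stamps nlmsg_pid in nl_receive_message().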