diff --git a/share/man/man4/netlink.4 b/share/man/man4/netlink.4 index c75366f560f0..bbfa55049e2e 100644 --- a/share/man/man4/netlink.4 +++ b/share/man/man4/netlink.4 @@ -1,344 +1,349 @@ .\" .\" Copyright (C) 2022 Alexander Chernikov . .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" -.Dd November 1, 2022 +.Dd November 30, 2022 .Dt NETLINK 4 .Os .Sh NAME .Nm Netlink .Nd Kernel network configuration protocol .Sh SYNOPSIS .In netlink/netlink.h .In netlink/netlink_route.h .Ft int .Fn socket AF_NETLINK SOCK_DGRAM int family .Sh DESCRIPTION Netlink is a user-kernel message-based communication protocol primarily used for network stack configuration. 
Netlink is easily extendable and supports large dumps and event notifications, all via a single socket. The protocol is fully asynchronous, allowing one to issue and track multiple requests at once. Netlink consists of multiple families, which commonly group the commands belonging to the particular kernel subsystem. Currently, the supported families are: .Pp .Bd -literal -offset indent -compact NETLINK_ROUTE network configuration, NETLINK_GENERIC "container" family .Ed .Pp The .Dv NETLINK_ROUTE family handles all interfaces, addresses, neighbors, routes, and VNETs configuration. More details can be found in .Xr rtnetlink 4 . The .Dv NETLINK_GENERIC family serves as a .Do container Dc , allowing registering other families under the .Dv NETLINK_GENERIC umbrella. This approach allows using a single netlink socket to interact with multiple netlink families at once. More details can be found in .Xr genetlink 4 . .Pp Netlink has its own sockaddr structure: .Bd -literal struct sockaddr_nl { uint8_t nl_len; /* sizeof(sockaddr_nl) */ sa_family_t nl_family; /* netlink family */ uint16_t nl_pad; /* reserved, set to 0 */ uint32_t nl_pid; /* automatically selected, set to 0 */ uint32_t nl_groups; /* multicast groups mask to bind to */ }; .Ed .Pp Typically, filling this structure is not required for socket operations. It is presented here for completeness. .Sh PROTOCOL DESCRIPTION The protocol is message-based. Each message starts with the mandatory .Va nlmsghdr header, followed by the family-specific header and the list of type-length-value pairs (TLVs). TLVs can be nested. All headers and TLVS are padded to 4-byte boundaries. Each .Xr send 2 or .Xr recv 2 system call may contain multiple messages. 
.Ss BASE HEADER .Bd -literal struct nlmsghdr { uint32_t nlmsg_len; /* Length of message including header */ uint16_t nlmsg_type; /* Message type identifier */ uint16_t nlmsg_flags; /* Flags (NLM_F_) */ uint32_t nlmsg_seq; /* Sequence number */ uint32_t nlmsg_pid; /* Sending process port ID */ }; .Ed .Pp The .Va nlmsg_len field stores the whole message length, in bytes, including the header. This length has to be rounded up to the nearest 4-byte boundary when iterating over messages. The .Va nlmsg_type field represents the command/request type. This value is family-specific. The list of supported commands can be found in the relevant family header file. .Va nlmsg_seq is a user-provided request identifier. An application can track the operation result using the .Dv NLMSG_ERROR messages and matching the .Va nlmsg_seq . The .Va nlmsg_pid field is the message sender id. This field is optional for userland. The kernel sender id is zero. The .Va nlmsg_flags field contains the message-specific flags. 
The following generic flags are defined: .Pp .Bd -literal -offset indent -compact NLM_F_REQUEST Indicates that the message is an actual request to the kernel NLM_F_ACK Request an explicit ACK message with an operation result .Ed .Pp The following generic flags are defined for the "GET" request types: .Pp .Bd -literal -offset indent -compact NLM_F_ROOT Return the whole dataset NLM_F_MATCH Return all entries matching the criteria .Ed These two flags are typically used together, aliased to .Dv NLM_F_DUMP .Pp The following generic flags are defined for the "NEW" request types: .Pp .Bd -literal -offset indent -compact NLM_F_CREATE Create an object if none exists NLM_F_EXCL Don't replace an object if it exists NLM_F_REPLACE Replace an existing matching object NLM_F_APPEND Append to an existing object .Ed .Pp The following generic flags are defined for the replies: .Pp .Bd -literal -offset indent -compact NLM_F_MULTI Indicates that the message is part of the message group NLM_F_DUMP_INTR Indicates that the state dump was not completed NLM_F_DUMP_FILTERED Indicates that the dump was filtered per request NLM_F_CAPPED Indicates the original message was capped to its header NLM_F_ACK_TLVS Indicates that extended ACK TLVs were included .Ed .Ss TLVs Most messages encode their attributes as type-length-value pairs (TLVs). The base TLV header: .Bd -literal struct nlattr { uint16_t nla_len; /* Total attribute length */ uint16_t nla_type; /* Attribute type */ }; .Ed The TLV type .Pq Va nla_type scope is typically the message type or group within a family. For example, the .Dv RTN_MULTICAST type value is only valid for .Dv RTM_NEWROUTE , .Dv RTM_DELROUTE and .Dv RTM_GETROUTE messages. TLVs can be nested; in that case internal TLVs may have their own sub-types. All TLVs are packed with 4-byte padding. .Ss CONTROL MESSAGES A number of generic control messages are reserved in each family. 
.Pp .Dv NLMSG_ERROR reports the operation result if requested, optionally followed by the metadata TLVs. The value of .Va nlmsg_seq is set to its value in the original messages, while .Va nlmsg_pid is set to the socket pid of the original socket. The operation result is reported via .Vt "struct nlmsgerr": .Bd -literal struct nlmsgerr { int error; /* Standard errno */ struct nlmsghdr msg; /* Original message header */ }; .Ed If the .Dv NETLINK_CAP_ACK socket option is not set, the remainder of the original message will follow. If the .Dv NETLINK_EXT_ACK -socket option is set, kernel may add a +socket option is set, the kernel may add a .Dv NLMSGERR_ATTR_MSG string TLV with the textual error description, optionally followed by the .Dv NLMSGERR_ATTR_OFFS TLV, indicating the offset from the message start that triggered an error. +If the operation reply is a multipart message, then no +.Dv NLMSG_ERROR +reply is generated, only a +.Dv NLMSG_DONE +message, closing multipart sequence. .Pp .Dv NLMSG_DONE indicates the end of the message group: typically, the end of the dump. It contains a single .Vt int field, describing the dump result as a standard errno value. .Sh SOCKET OPTIONS Netlink supports a number of custom socket options, which can be set with .Xr setsockopt 2 with the .Dv SOL_NETLINK .Fa level : .Bl -tag -width indent .It Dv NETLINK_ADD_MEMBERSHIP Subscribes to the notifications for the specific group (int). .It Dv NETLINK_DROP_MEMBERSHIP Unsubscribes from the notifications for the specific group (int). .It Dv NETLINK_LIST_MEMBERSHIPS Lists the memberships as a bitmask. .It Dv NETLINK_CAP_ACK Instructs the kernel to send the original message header in the reply without the message body. .It Dv NETLINK_EXT_ACK Acknowledges ability to receive additional TLVs in the ACK message. 
.El .Pp Additionally, netlink overrides the following socket options from the .Dv SOL_SOCKET .Fa level : .Bl -tag -width indent .It Dv SO_RCVBUF Sets the maximum size of the socket receive buffer. If the caller has .Dv PRIV_NET_ROUTE permission, the value can exceed the currently-set .Va kern.ipc.maxsockbuf value. .El .Sh SYSCTL VARIABLES A set of .Xr sysctl 8 variables is available to tweak run-time parameters: .Bl -tag -width indent .It Va net.netlink.sendspace Default send buffer for the netlink socket. Note that the socket sendspace has to be at least as long as the longest message that can be transmitted via this socket. .El .Bl -tag -width indent .It Va net.netlink.recvspace Default receive buffer for the netlink socket. Note that the socket recvspace has to be least as long as the longest message that can be received from this socket. .El .Sh DEBUGGING Netlink implements per-functional-unit debugging, with different severities controllable via the .Va net.netlink.debug branch. These messages are logged in the kernel message buffer and can be seen in .Xr dmesg 8 . The following severity levels are defined: .Bl -tag -width indent .It Dv LOG_DEBUG(7) Rare events or per-socket errors are reported here. This is the default level, not impacting production performance. .It Dv LOG_DEBUG2(8) Socket events such as groups memberships, privilege checks, commands and dumps are logged. This level does not incur significant performance overhead. .It Dv LOG_DEBUG9(9) All socket events, each dumped or modified entities are logged. Turning it on may result in significant performance overhead. .El .Sh ERRORS Netlink reports operation results, including errors and error metadata, by sending a .Dv NLMSG_ERROR message for each request message. 
The following errors can be returned: .Bl -tag -width Er .It Bq Er EPERM when the current privileges are insufficient to perform the required operation; .It Bo Er ENOBUFS Bc or Bo Er ENOMEM Bc when the system runs out of memory for an internal data structure; .It Bq Er ENOTSUP when the requested command is not supported by the family or the family is not supported; .It Bq Er EINVAL when some necessary TLVs are missing or invalid, detailed info may be provided in NLMSGERR_ATTR_MSG and NLMSGERR_ATTR_OFFS TLVs; .It Bq Er ENOENT when trying to delete a non-existent object. .Pp Additionally, a socket operation itself may fail with one of the errors specified in .Xr socket 2 , .Xr recv 2 or .Xr send 2 . .El .Sh SEE ALSO .Xr genetrlink 4 , .Xr rtnetlink 4 .Rs .%A "J. Salim" .%A "H. Khosravi" .%A "A. Kleen" .%A "A. Kuznetsov" .%T "Linux Netlink as an IP Services Protocol" .%O "RFC 3549" .Re .Sh HISTORY The netlink protocol appeared in .Fx 14.0 . .Sh AUTHORS The netlink was implemented by .An -nosplit .An Alexander Chernikov Aq Mt melifaro@FreeBSD.org . It was derived from the Google Summer of Code 2021 project by .An Ng Peng Nam Sean . diff --git a/sys/netlink/netlink_io.c b/sys/netlink/netlink_io.c index b2a0023a143b..fb8006f689e4 100644 --- a/sys/netlink/netlink_io.c +++ b/sys/netlink/netlink_io.c @@ -1,529 +1,532 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2021 Ng Peng Nam Sean * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #define DEBUG_MOD_NAME nl_io #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include _DECLARE_DEBUG(LOG_DEBUG); /* * The logic below provide a p2p interface for receiving and * sending netlink data between the kernel and userland. 
*/ static const struct sockaddr_nl _nl_empty_src = { .nl_len = sizeof(struct sockaddr_nl), .nl_family = PF_NETLINK, .nl_pid = 0 /* comes from the kernel */ }; static const struct sockaddr *nl_empty_src = (const struct sockaddr *)&_nl_empty_src; static struct mbuf *nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp); static void queue_push(struct nl_io_queue *q, struct mbuf *mq) { while (mq != NULL) { struct mbuf *m = mq; mq = mq->m_nextpkt; m->m_nextpkt = NULL; q->length += m_length(m, NULL); STAILQ_INSERT_TAIL(&q->head, m, m_stailqpkt); } } static void queue_push_head(struct nl_io_queue *q, struct mbuf *m) { MPASS(m->m_nextpkt == NULL); q->length += m_length(m, NULL); STAILQ_INSERT_HEAD(&q->head, m, m_stailqpkt); } static struct mbuf * queue_pop(struct nl_io_queue *q) { if (!STAILQ_EMPTY(&q->head)) { struct mbuf *m = STAILQ_FIRST(&q->head); STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt); m->m_nextpkt = NULL; q->length -= m_length(m, NULL); return (m); } return (NULL); } static struct mbuf * queue_head(const struct nl_io_queue *q) { return (STAILQ_FIRST(&q->head)); } static inline bool queue_empty(const struct nl_io_queue *q) { return (q->length == 0); } static void queue_free(struct nl_io_queue *q) { while (!STAILQ_EMPTY(&q->head)) { struct mbuf *m = STAILQ_FIRST(&q->head); STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt); m->m_nextpkt = NULL; m_freem(m); } q->length = 0; } static void nl_schedule_taskqueue(struct nlpcb *nlp) { if (!nlp->nl_task_pending) { nlp->nl_task_pending = true; taskqueue_enqueue(nlp->nl_taskqueue, &nlp->nl_task); NL_LOG(LOG_DEBUG3, "taskqueue scheduled"); } else { NL_LOG(LOG_DEBUG3, "taskqueue schedule skipped"); } } int nl_receive_async(struct mbuf *m, struct socket *so) { struct nlpcb *nlp = sotonlpcb(so); int error = 0; m->m_nextpkt = NULL; NLP_LOCK(nlp); if ((__predict_true(nlp->nl_active))) { sbappend(&so->so_snd, m, 0); NL_LOG(LOG_DEBUG3, "enqueue %u bytes", m_length(m, NULL)); nl_schedule_taskqueue(nlp); } else { NL_LOG(LOG_DEBUG, "ignoring %u 
bytes on non-active socket", m_length(m, NULL)); m_free(m); error = EINVAL; } NLP_UNLOCK(nlp); return (error); } static bool tx_check_locked(struct nlpcb *nlp) { if (queue_empty(&nlp->tx_queue)) return (true); /* * Check if something can be moved from the internal TX queue * to the socket queue. */ bool appended = false; struct sockbuf *sb = &nlp->nl_socket->so_rcv; SOCKBUF_LOCK(sb); while (true) { struct mbuf *m = queue_head(&nlp->tx_queue); if (m && sbappendaddr_locked(sb, nl_empty_src, m, NULL) != 0) { /* appended successfully */ queue_pop(&nlp->tx_queue); appended = true; } else break; } SOCKBUF_UNLOCK(sb); if (appended) sorwakeup(nlp->nl_socket); return (queue_empty(&nlp->tx_queue)); } static bool nl_process_received_one(struct nlpcb *nlp) { bool reschedule = false; NLP_LOCK(nlp); nlp->nl_task_pending = false; if (!tx_check_locked(nlp)) { /* TX overflow queue still not empty, ignore RX */ NLP_UNLOCK(nlp); return (false); } if (queue_empty(&nlp->rx_queue)) { /* * Grab all data we have from the socket TX queue * and store it the internal queue, so it can be worked on * w/o holding socket lock. 
*/ struct sockbuf *sb = &nlp->nl_socket->so_snd; SOCKBUF_LOCK(sb); unsigned int avail = sbavail(sb); if (avail > 0) { NL_LOG(LOG_DEBUG3, "grabbed %u bytes", avail); queue_push(&nlp->rx_queue, sbcut_locked(sb, avail)); } SOCKBUF_UNLOCK(sb); } else { /* Schedule another pass to read from the socket queue */ reschedule = true; } int prev_hiwat = nlp->tx_queue.hiwat; NLP_UNLOCK(nlp); while (!queue_empty(&nlp->rx_queue)) { struct mbuf *m = queue_pop(&nlp->rx_queue); m = nl_process_mbuf(m, nlp); if (m != NULL) { queue_push_head(&nlp->rx_queue, m); reschedule = false; break; } } if (nlp->tx_queue.hiwat > prev_hiwat) { NLP_LOG(LOG_DEBUG, nlp, "TX override peaked to %d", nlp->tx_queue.hiwat); } return (reschedule); } static void nl_process_received(struct nlpcb *nlp) { NL_LOG(LOG_DEBUG3, "taskqueue called"); while (nl_process_received_one(nlp)) ; } void nl_init_io(struct nlpcb *nlp) { STAILQ_INIT(&nlp->rx_queue.head); STAILQ_INIT(&nlp->tx_queue.head); } void nl_free_io(struct nlpcb *nlp) { queue_free(&nlp->rx_queue); queue_free(&nlp->tx_queue); } /* * Called after some data have been read from the socket. */ void nl_on_transmit(struct nlpcb *nlp) { NLP_LOCK(nlp); struct socket *so = nlp->nl_socket; if (__predict_false(nlp->nl_dropped_bytes > 0 && so != NULL)) { unsigned long dropped_bytes = nlp->nl_dropped_bytes; unsigned long dropped_messages = nlp->nl_dropped_messages; nlp->nl_dropped_bytes = 0; nlp->nl_dropped_messages = 0; struct sockbuf *sb = &so->so_rcv; NLP_LOG(LOG_DEBUG, nlp, "socket RX overflowed, %lu messages (%lu bytes) dropped. 
" "bytes: [%u/%u] mbufs: [%u/%u]", dropped_messages, dropped_bytes, sb->sb_ccc, sb->sb_hiwat, sb->sb_mbcnt, sb->sb_mbmax); /* TODO: send netlink message */ } nl_schedule_taskqueue(nlp); NLP_UNLOCK(nlp); } void nl_taskqueue_handler(void *_arg, int pending) { struct nlpcb *nlp = (struct nlpcb *)_arg; CURVNET_SET(nlp->nl_socket->so_vnet); nl_process_received(nlp); CURVNET_RESTORE(); } static __noinline void queue_push_tx(struct nlpcb *nlp, struct mbuf *m) { queue_push(&nlp->tx_queue, m); nlp->nl_tx_blocked = true; if (nlp->tx_queue.length > nlp->tx_queue.hiwat) nlp->tx_queue.hiwat = nlp->tx_queue.length; } /* * Tries to send @m to the socket @nlp. * * @m: mbuf(s) to send to. Consumed in any case. * @nlp: socket to send to * @cnt: number of messages in @m * @io_flags: combination of NL_IOF_* flags * * Returns true on success. * If no queue overrunes happened, wakes up socket owner. */ bool nl_send_one(struct mbuf *m, struct nlpcb *nlp, int num_messages, int io_flags) { bool untranslated = io_flags & NL_IOF_UNTRANSLATED; bool ignore_limits = io_flags & NL_IOF_IGNORE_LIMIT; bool result = true; IF_DEBUG_LEVEL(LOG_DEBUG2) { struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *); NLP_LOG(LOG_DEBUG2, nlp, "TX mbuf len %u msgs %u msg type %d first hdrlen %u io_flags %X", m_length(m, NULL), num_messages, hdr->nlmsg_type, hdr->nlmsg_len, io_flags); } if (__predict_false(nlp->nl_linux && linux_netlink_p != NULL && untranslated)) { m = linux_netlink_p->mbufs_to_linux(nlp->nl_proto, m, nlp); if (m == NULL) return (false); } NLP_LOCK(nlp); if (__predict_false(nlp->nl_socket == NULL)) { NLP_UNLOCK(nlp); m_freem(m); return (false); } if (!queue_empty(&nlp->tx_queue)) { if (ignore_limits) { queue_push_tx(nlp, m); } else { m_free(m); result = false; } NLP_UNLOCK(nlp); return (result); } struct socket *so = nlp->nl_socket; if (sbappendaddr(&so->so_rcv, nl_empty_src, m, NULL) != 0) { sorwakeup(so); NLP_LOG(LOG_DEBUG3, nlp, "appended data & woken up"); } else { if (ignore_limits) { 
queue_push_tx(nlp, m); } else { /* * Store dropped data so it can be reported * on the next read */ nlp->nl_dropped_bytes += m_length(m, NULL); nlp->nl_dropped_messages += num_messages; NLP_LOG(LOG_DEBUG2, nlp, "RX oveflow: %lu m (+%d), %lu b (+%d)", (unsigned long)nlp->nl_dropped_messages, num_messages, (unsigned long)nlp->nl_dropped_bytes, m_length(m, NULL)); soroverflow(so); m_freem(m); result = false; } } NLP_UNLOCK(nlp); return (result); } static int nl_receive_message(struct nlmsghdr *hdr, int remaining_length, struct nlpcb *nlp, struct nl_pstate *npt) { nl_handler_f handler = nl_handlers[nlp->nl_proto].cb; int error = 0; - NL_LOG(LOG_DEBUG2, "msg len: %d type: %d", hdr->nlmsg_len, - hdr->nlmsg_type); + NLP_LOG(LOG_DEBUG2, nlp, "msg len: %u type: %d: flags: 0x%X seq: %u pid: %u", + hdr->nlmsg_len, hdr->nlmsg_type, hdr->nlmsg_flags, hdr->nlmsg_seq, + hdr->nlmsg_pid); if (__predict_false(hdr->nlmsg_len > remaining_length)) { NLP_LOG(LOG_DEBUG, nlp, "message is not entirely present: want %d got %d", hdr->nlmsg_len, remaining_length); return (EINVAL); } else if (__predict_false(hdr->nlmsg_len < sizeof(*hdr))) { NL_LOG(LOG_DEBUG, "message too short: %d", hdr->nlmsg_len); return (EINVAL); } /* Stamp each message with sender pid */ hdr->nlmsg_pid = nlp->nl_port; npt->hdr = hdr; if (hdr->nlmsg_flags & NLM_F_REQUEST && hdr->nlmsg_type >= NLMSG_MIN_TYPE) { NL_LOG(LOG_DEBUG2, "handling message with msg type: %d", hdr->nlmsg_type); if (nlp->nl_linux && linux_netlink_p != NULL) { struct nlmsghdr *hdr_orig = hdr; hdr = linux_netlink_p->msg_from_linux(nlp->nl_proto, hdr, npt); if (hdr == NULL) { npt->hdr = hdr_orig; if (hdr->nlmsg_flags & NLM_F_ACK) nlmsg_ack(nlp, EAGAIN, hdr, npt); return (0); } } error = handler(hdr, npt); NL_LOG(LOG_DEBUG2, "retcode: %d", error); } if ((hdr->nlmsg_flags & NLM_F_ACK) || (error != 0 && error != EINTR)) { - NL_LOG(LOG_DEBUG3, "ack"); - nlmsg_ack(nlp, error, hdr, npt); - NL_LOG(LOG_DEBUG3, "done"); + if (!npt->nw->suppress_ack) { + 
NL_LOG(LOG_DEBUG3, "ack"); + nlmsg_ack(nlp, error, hdr, npt); + } } return (0); } static void npt_clear(struct nl_pstate *npt) { lb_clear(&npt->lb); npt->error = 0; npt->err_msg = NULL; npt->err_off = 0; npt->hdr = NULL; + npt->nw->suppress_ack = false; } /* * Processes an incoming packet, which can contain multiple netlink messages */ static struct mbuf * nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp) { int offset, buffer_length; struct nlmsghdr *hdr; char *buffer; int error; NL_LOG(LOG_DEBUG3, "RX netlink mbuf %p on %p", m, nlp->nl_socket); struct nl_writer nw = {}; if (!nlmsg_get_unicast_writer(&nw, NLMSG_SMALL, nlp)) { m_freem(m); NL_LOG(LOG_DEBUG, "error allocating socket writer"); return (NULL); } nlmsg_ignore_limit(&nw); /* TODO: alloc this buf once for nlp */ int data_length = m_length(m, NULL); buffer_length = roundup2(data_length, 8) + SCRATCH_BUFFER_SIZE; if (nlp->nl_linux) buffer_length += roundup2(data_length, 8); buffer = malloc(buffer_length, M_NETLINK, M_NOWAIT | M_ZERO); if (buffer == NULL) { m_freem(m); nlmsg_flush(&nw); NL_LOG(LOG_DEBUG, "Unable to allocate %d bytes of memory", buffer_length); return (NULL); } m_copydata(m, 0, data_length, buffer); struct nl_pstate npt = { .nlp = nlp, .lb.base = &buffer[roundup2(data_length, 8)], .lb.size = buffer_length - roundup2(data_length, 8), .nw = &nw, .strict = nlp->nl_flags & NLF_STRICT, }; for (offset = 0; offset + sizeof(struct nlmsghdr) <= data_length;) { hdr = (struct nlmsghdr *)&buffer[offset]; /* Save length prior to calling handler */ int msglen = NLMSG_ALIGN(hdr->nlmsg_len); NL_LOG(LOG_DEBUG3, "parsing offset %d/%d", offset, data_length); npt_clear(&npt); error = nl_receive_message(hdr, data_length - offset, nlp, &npt); offset += msglen; if (__predict_false(error != 0 || nlp->nl_tx_blocked)) break; } NL_LOG(LOG_DEBUG3, "packet parsing done"); free(buffer, M_NETLINK); nlmsg_flush(&nw); if (nlp->nl_tx_blocked) { NLP_LOCK(nlp); nlp->nl_tx_blocked = false; NLP_UNLOCK(nlp); m_adj(m, offset); 
return (m); } else { m_freem(m); return (NULL); } } diff --git a/sys/netlink/netlink_message_writer.c b/sys/netlink/netlink_message_writer.c index 1856f2859b01..37414703c6f6 100644 --- a/sys/netlink/netlink_message_writer.c +++ b/sys/netlink/netlink_message_writer.c @@ -1,686 +1,690 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2022 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #define DEBUG_MOD_NAME nl_writer #define DEBUG_MAX_LEVEL LOG_DEBUG3 #include _DECLARE_DEBUG(LOG_DEBUG); /* * The goal of this file is to provide convenient message writing KPI on top of * different storage methods (mbufs, uio, temporary memory chunks). * * The main KPI guarantee is the the (last) message always resides in the contiguous * memory buffer, so one is able to update the header after writing the entire message. * * This guarantee comes with a side effect of potentially reallocating underlying * buffer, so one needs to update the desired pointers after something is added * to the header. * * Messaging layer contains hooks performing transparent Linux translation for the messages. * * There are 3 types of supported targets: * * socket (adds mbufs to the socket buffer, used for message replies) * * group (sends mbuf/chain to the specified groups, used for the notifications) * * chain (returns mbuf chain, used in Linux message translation code) * * There are 3 types of storage: * * NS_WRITER_TYPE_MBUF (mbuf-based, most efficient, used when a single message * fits in MCLBYTES) * * NS_WRITER_TYPE_BUF (fallback, malloc-based, used when a single message needs * to be larger than one supported by NS_WRITER_TYPE_MBUF) * * NS_WRITER_TYPE_LBUF (malloc-based, similar to NS_WRITER_TYPE_BUF, used for * Linux sockets, calls translation hook prior to sending messages to the socket). * * Internally, KPI switches between different types of storage when memory requirements * change. It happens transparently to the caller. 
*/ typedef bool nlwriter_op_init(struct nl_writer *nw, int size, bool waitok); typedef bool nlwriter_op_write(struct nl_writer *nw, void *buf, int buflen, int cnt); struct nlwriter_ops { nlwriter_op_init *init; nlwriter_op_write *write_socket; nlwriter_op_write *write_group; nlwriter_op_write *write_chain; }; /* * NS_WRITER_TYPE_BUF * Writes message to a temporary memory buffer, * flushing to the socket/group when buffer size limit is reached */ static bool nlmsg_get_ns_buf(struct nl_writer *nw, int size, bool waitok) { int mflag = waitok ? M_WAITOK : M_NOWAIT; nw->_storage = malloc(size, M_NETLINK, mflag | M_ZERO); if (__predict_false(nw->_storage == NULL)) return (false); nw->alloc_len = size; nw->offset = 0; nw->hdr = NULL; nw->data = nw->_storage; nw->writer_type = NS_WRITER_TYPE_BUF; nw->malloc_flag = mflag; nw->num_messages = 0; nw->enomem = false; return (true); } static bool nlmsg_write_socket_buf(struct nl_writer *nw, void *buf, int datalen, int cnt) { NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw); if (__predict_false(datalen == 0)) { free(buf, M_NETLINK); return (true); } struct mbuf *m = m_getm2(NULL, datalen, nw->malloc_flag, MT_DATA, M_PKTHDR); if (__predict_false(m == NULL)) { /* XXX: should we set sorcverr? */ free(buf, M_NETLINK); return (false); } m_append(m, datalen, buf); free(buf, M_NETLINK); int io_flags = (nw->ignore_limit) ? 
NL_IOF_IGNORE_LIMIT : 0; return (nl_send_one(m, (struct nlpcb *)(nw->arg_ptr), cnt, io_flags)); } static bool nlmsg_write_group_buf(struct nl_writer *nw, void *buf, int datalen, int cnt) { NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg_ptr); if (__predict_false(datalen == 0)) { free(buf, M_NETLINK); return (true); } struct mbuf *m = m_getm2(NULL, datalen, nw->malloc_flag, MT_DATA, M_PKTHDR); if (__predict_false(m == NULL)) { free(buf, M_NETLINK); return (false); } bool success = m_append(m, datalen, buf) != 0; free(buf, M_NETLINK); if (!success) return (false); nl_send_group(m, cnt, nw->arg_uint >> 16, nw->arg_uint & 0xFFFF); return (true); } static bool nlmsg_write_chain_buf(struct nl_writer *nw, void *buf, int datalen, int cnt) { struct mbuf **m0 = (struct mbuf **)(nw->arg_ptr); NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg_ptr); if (__predict_false(datalen == 0)) { free(buf, M_NETLINK); return (true); } if (*m0 == NULL) { struct mbuf *m; m = m_getm2(NULL, datalen, nw->malloc_flag, MT_DATA, M_PKTHDR); if (__predict_false(m == NULL)) { free(buf, M_NETLINK); return (false); } *m0 = m; } if (__predict_false(m_append(*m0, datalen, buf) == 0)) { free(buf, M_NETLINK); return (false); } return (true); } /* * NS_WRITER_TYPE_MBUF * Writes message to the allocated mbuf, * flushing to socket/group when mbuf size limit is reached. * This is the most efficient mechanism as it avoids double-copying. * * Allocates a single mbuf suitable to store up to @size bytes of data. * If size < MHLEN (around 160 bytes), allocates mbuf with pkghdr * If size <= MCLBYTES (2k), allocate a single mbuf cluster * Otherwise, return NULL. */ static bool nlmsg_get_ns_mbuf(struct nl_writer *nw, int size, bool waitok) { struct mbuf *m; int mflag = waitok ? 
M_WAITOK : M_NOWAIT; m = m_get2(size, mflag, MT_DATA, M_PKTHDR); if (__predict_false(m == NULL)) return (false); nw->alloc_len = M_TRAILINGSPACE(m); nw->offset = 0; nw->hdr = NULL; nw->_storage = (void *)m; nw->data = mtod(m, void *); nw->writer_type = NS_WRITER_TYPE_MBUF; nw->malloc_flag = mflag; nw->num_messages = 0; nw->enomem = false; NL_LOG(LOG_DEBUG2, "alloc mbuf %p req_len %d alloc_len %d data_ptr %p", m, size, nw->alloc_len, nw->data); return (true); } static bool nlmsg_write_socket_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt) { struct mbuf *m = (struct mbuf *)buf; NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg_ptr); if (__predict_false(datalen == 0)) { m_freem(m); return (true); } m->m_pkthdr.len = datalen; m->m_len = datalen; int io_flags = (nw->ignore_limit) ? NL_IOF_IGNORE_LIMIT : 0; return (nl_send_one(m, (struct nlpcb *)(nw->arg_ptr), cnt, io_flags)); } static bool nlmsg_write_group_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt) { struct mbuf *m = (struct mbuf *)buf; NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg_ptr); if (__predict_false(datalen == 0)) { m_freem(m); return (true); } m->m_pkthdr.len = datalen; m->m_len = datalen; nl_send_group(m, cnt, nw->arg_uint >> 16, nw->arg_uint & 0xFFFF); return (true); } static bool nlmsg_write_chain_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt) { struct mbuf *m_new = (struct mbuf *)buf; struct mbuf **m0 = (struct mbuf **)(nw->arg_ptr); NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg_ptr); if (__predict_false(datalen == 0)) { m_freem(m_new); return (true); } m_new->m_pkthdr.len = datalen; m_new->m_len = datalen; if (*m0 == NULL) { *m0 = m_new; } else { struct mbuf *m_last; for (m_last = *m0; m_last->m_next != NULL; m_last = m_last->m_next) ; m_last->m_next = m_new; (*m0)->m_pkthdr.len += datalen; } return (true); } /* * NS_WRITER_TYPE_LBUF * Writes message to the allocated memory buffer, * 
flushing to socket/group when mbuf size limit is reached.
 * Calls linux handler to rewrite messages before sending to the socket.
 */
static bool
nlmsg_get_ns_lbuf(struct nl_writer *nw, int size, bool waitok)
{
	int mflag = waitok ? M_WAITOK : M_NOWAIT;
	size = roundup2(size, sizeof(void *));
	/* Header + translation scratch area, in addition to 2x the data size. */
	int add_size = sizeof(struct linear_buffer) + SCRATCH_BUFFER_SIZE;
	char *buf = malloc(add_size + size * 2, M_NETLINK, mflag | M_ZERO);
	if (__predict_false(buf == NULL))
		return (false);

	/* Fill buffer header first */
	struct linear_buffer *lb = (struct linear_buffer *)buf;
	lb->base = &buf[sizeof(struct linear_buffer) + size];
	lb->size = size + SCRATCH_BUFFER_SIZE;

	nw->alloc_len = size;
	nw->offset = 0;
	nw->hdr = NULL;
	nw->_storage = buf;
	nw->data = (char *)(lb + 1);
	nw->malloc_flag = mflag;
	nw->writer_type = NS_WRITER_TYPE_LBUF;
	nw->num_messages = 0;
	nw->enomem = false;
	return (true);
}

/*
 * Translates accumulated messages to their Linux representation and
 * unicasts the resulting mbuf to the socket in nw->arg_ptr.
 * Consumes @buf in all cases.
 */
static bool
nlmsg_write_socket_lbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
{
	struct linear_buffer *lb = (struct linear_buffer *)buf;
	char *data = (char *)(lb + 1);
	struct nlpcb *nlp = (struct nlpcb *)(nw->arg_ptr);

	if (__predict_false(datalen == 0)) {
		free(buf, M_NETLINK);
		return (true);
	}

	struct mbuf *m = NULL;
	if (linux_netlink_p != NULL)
		m = linux_netlink_p->msgs_to_linux(nlp->nl_proto, data, datalen, nlp);
	free(buf, M_NETLINK);

	if (__predict_false(m == NULL)) {
		/* XXX: should we set sorcverr? */
		return (false);
	}

	int io_flags = (nw->ignore_limit) ? NL_IOF_IGNORE_LIMIT : 0;
	return (nl_send_one(m, nlp, cnt, io_flags));
}

/* Shouldn't be called (maybe except Linux code originating message) */
static bool
nlmsg_write_group_lbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
{
	struct linear_buffer *lb = (struct linear_buffer *)buf;
	char *data = (char *)(lb + 1);

	if (__predict_false(datalen == 0)) {
		free(buf, M_NETLINK);
		return (true);
	}

	struct mbuf *m = m_getm2(NULL, datalen, nw->malloc_flag, MT_DATA, M_PKTHDR);
	if (__predict_false(m == NULL)) {
		free(buf, M_NETLINK);
		return (false);
	}
	/* NOTE(review): m_append() result is unchecked here, unlike the _buf writers -- confirm. */
	m_append(m, datalen, data);
	free(buf, M_NETLINK);

	nl_send_group(m, cnt, nw->arg_uint >> 16, nw->arg_uint & 0xFFFF);
	return (true);
}

/* Writer implementations, indexed by NS_WRITER_TYPE_*. */
struct nlwriter_ops nlmsg_writers[] = {
	/* NS_WRITER_TYPE_MBUF */
	{
		.init = nlmsg_get_ns_mbuf,
		.write_socket = nlmsg_write_socket_mbuf,
		.write_group = nlmsg_write_group_mbuf,
		.write_chain = nlmsg_write_chain_mbuf,
	},
	/* NS_WRITER_TYPE_BUF */
	{
		.init = nlmsg_get_ns_buf,
		.write_socket = nlmsg_write_socket_buf,
		.write_group = nlmsg_write_group_buf,
		.write_chain = nlmsg_write_chain_buf,
	},
	/* NS_WRITER_TYPE_LBUF */
	{
		.init = nlmsg_get_ns_lbuf,
		.write_socket = nlmsg_write_socket_lbuf,
		.write_group = nlmsg_write_group_lbuf,
	},
};

/* Selects the flush callback matching the writer's type and target. */
static void
nlmsg_set_callback(struct nl_writer *nw)
{
	struct nlwriter_ops *pops = &nlmsg_writers[nw->writer_type];

	switch (nw->writer_target) {
	case NS_WRITER_TARGET_SOCKET:
		nw->cb = pops->write_socket;
		break;
	case NS_WRITER_TARGET_GROUP:
		nw->cb = pops->write_group;
		break;
	case NS_WRITER_TARGET_CHAIN:
		nw->cb = pops->write_chain;
		break;
	default:
		panic("not implemented");
	}
}

/* Initializes @nw with storage provided by writer @type. */
static bool
nlmsg_get_buf_type(struct nl_writer *nw, int size, int type, bool waitok)
{
	MPASS(type + 1 <= sizeof(nlmsg_writers) / sizeof(nlmsg_writers[0]));
	NL_LOG(LOG_DEBUG3, "Setting up nw %p size %d type %d", nw, size, type);
	return (nlmsg_writers[type].init(nw, size, waitok));
}

/*
 * Picks the writer type best suited for @size: mbuf for cluster-sized
 * requests, plain buffer for larger ones, linear buffer for Linux sockets.
 */
static bool
nlmsg_get_buf(struct nl_writer *nw, int size, bool waitok, bool is_linux)
{
	int type;

	if (!is_linux) {
		if
(__predict_true(size <= MCLBYTES))
			type = NS_WRITER_TYPE_MBUF;
		else
			type = NS_WRITER_TYPE_BUF;
	} else
		type = NS_WRITER_TYPE_LBUF;
	return (nlmsg_get_buf_type(nw, size, type, waitok));
}

/* Sets up @nw for unicasting messages to socket @nlp. */
bool
nlmsg_get_unicast_writer(struct nl_writer *nw, int size, struct nlpcb *nlp)
{
	if (!nlmsg_get_buf(nw, size, false, nlp->nl_linux))
		return (false);
	nw->arg_ptr = (void *)nlp;
	nw->writer_target = NS_WRITER_TARGET_SOCKET;
	nlmsg_set_callback(nw);
	return (true);
}

/* Sets up @nw for multicasting to (@protocol, @group_id). */
bool
nlmsg_get_group_writer(struct nl_writer *nw, int size, int protocol, int group_id)
{
	if (!nlmsg_get_buf(nw, size, false, false))
		return (false);
	/* Pack protocol and group id into a single integer callback argument. */
	nw->arg_uint = (uint64_t)protocol << 16 | (uint64_t)group_id;
	nw->writer_target = NS_WRITER_TARGET_GROUP;
	nlmsg_set_callback(nw);
	return (true);
}

/* Sets up @nw for appending messages to the mbuf chain rooted at @pm. */
bool
nlmsg_get_chain_writer(struct nl_writer *nw, int size, struct mbuf **pm)
{
	if (!nlmsg_get_buf(nw, size, false, false))
		return (false);
	*pm = NULL;
	nw->arg_ptr = (void *)pm;
	nw->writer_target = NS_WRITER_TARGET_CHAIN;
	nlmsg_set_callback(nw);
	NL_LOG(LOG_DEBUG3, "setup cb %p (need %p)", nw->cb, &nlmsg_write_chain_mbuf);
	return (true);
}

/* Makes the writer bypass the receiver's RCVBUF limit when flushing. */
void
nlmsg_ignore_limit(struct nl_writer *nw)
{
	nw->ignore_limit = true;
}

/*
 * Flushes all completed messages to the configured destination via the
 * writer callback and releases the underlying storage.  An unfinished
 * trailing message (nw->hdr != NULL) is excluded from the flush.
 */
bool
nlmsg_flush(struct nl_writer *nw)
{

	if (__predict_false(nw->hdr != NULL)) {
		/* Last message has not been completed, skip it. */
		int completed_len = (char *)nw->hdr - nw->data;
		/* Send completed messages */
		nw->offset -= nw->offset - completed_len;
		nw->hdr = NULL;
	}

	NL_LOG(LOG_DEBUG2, "OUT");
	bool result = nw->cb(nw, nw->_storage, nw->offset, nw->num_messages);
	nw->_storage = NULL;

	if (!result) {
		NL_LOG(LOG_DEBUG, "nw %p offset %d: flush with %p() failed",
		    nw, nw->offset, nw->cb);
	}

	return (result);
}

/*
 * Flushes previous data and allocates new underlying storage
 * sufficient for holding at least @required_len bytes.
 * Return true on success.
 */
bool
nlmsg_refill_buffer(struct nl_writer *nw, int required_len)
{
	struct nl_writer ns_new = {};
	int completed_len, new_len;

	/* A previous allocation failure poisons the writer permanently. */
	if (nw->enomem)
		return (false);

	NL_LOG(LOG_DEBUG3, "no space at offset %d/%d (want %d), trying to reclaim",
	    nw->offset, nw->alloc_len, required_len);

	/* Calculate new buffer size and allocate it */
	completed_len = (nw->hdr != NULL) ?
	    (char *)nw->hdr - nw->data : nw->offset;
	if (completed_len > 0 && required_len < MCLBYTES) {
		/* We already ran out of space, use the largest effective size */
		new_len = max(nw->alloc_len, MCLBYTES);
	} else {
		if (nw->alloc_len < MCLBYTES)
			new_len = MCLBYTES;
		else
			new_len = nw->alloc_len * 2;
		while (new_len < required_len)
			new_len *= 2;
	}
	bool waitok = (nw->malloc_flag == M_WAITOK);
	bool is_linux = (nw->writer_type == NS_WRITER_TYPE_LBUF);
	if (!nlmsg_get_buf(&ns_new, new_len, waitok, is_linux)) {
		nw->enomem = true;
		NL_LOG(LOG_DEBUG, "getting new buf failed, setting ENOMEM");
		return (false);
	}
	if (nw->ignore_limit)
		nlmsg_ignore_limit(&ns_new);

	/* Update callback data */
	ns_new.writer_target = nw->writer_target;
	nlmsg_set_callback(&ns_new);
	ns_new.arg_uint = nw->arg_uint;

	/* Copy last (unfinished) header to the new storage */
	int last_len = nw->offset - completed_len;
	if (last_len > 0) {
		memcpy(ns_new.data, nw->hdr, last_len);
		ns_new.hdr = (struct nlmsghdr *)ns_new.data;
		ns_new.offset = last_len;
	}
	NL_LOG(LOG_DEBUG2, "completed: %d bytes, copied: %d bytes",
	    completed_len, last_len);

	/* Flush completed headers & switch to the new nw */
	nlmsg_flush(nw);
	memcpy(nw, &ns_new, sizeof(struct nl_writer));
	NL_LOG(LOG_DEBUG2, "switched buffer: used %d/%d bytes",
	    nw->offset, nw->alloc_len);

	return (true);
}

/*
 * Opens a new message in the writer, refilling the storage if the
 * aligned header+payload does not fit.  nlmsg_len is preliminary here
 * and finalized by nlmsg_end().
 */
bool
nlmsg_add(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type,
    uint16_t flags, uint32_t len)
{
	struct nlmsghdr *hdr;

	MPASS(nw->hdr == NULL);

	int required_len = NETLINK_ALIGN(len + sizeof(struct nlmsghdr));
	if (__predict_false(nw->offset + required_len > nw->alloc_len)) {
		if (!nlmsg_refill_buffer(nw,
required_len))
			return (false);
	}

	hdr = (struct nlmsghdr *)(&nw->data[nw->offset]);

	hdr->nlmsg_len = len;
	hdr->nlmsg_type = type;
	hdr->nlmsg_flags = flags;
	hdr->nlmsg_seq = seq;
	hdr->nlmsg_pid = portid;

	nw->hdr = hdr;
	/* Only the fixed header is reserved here; the caller reserves payload. */
	nw->offset += sizeof(struct nlmsghdr);

	return (true);
}

/*
 * Completes the current message: writes the final length into nlmsg_len
 * and accounts the message in num_messages.  Fails (and aborts the
 * message) if an earlier allocation failure was recorded.
 */
bool
nlmsg_end(struct nl_writer *nw)
{
	MPASS(nw->hdr != NULL);

	if (nw->enomem) {
		NL_LOG(LOG_DEBUG, "ENOMEM when dumping message");
		nlmsg_abort(nw);
		return (false);
	}

	nw->hdr->nlmsg_len = (uint32_t)(nw->data + nw->offset - (char *)nw->hdr);
+	NL_LOG(LOG_DEBUG2, "wrote msg len: %u type: %d: flags: 0x%X seq: %u pid: %u",
+	    nw->hdr->nlmsg_len, nw->hdr->nlmsg_type, nw->hdr->nlmsg_flags,
+	    nw->hdr->nlmsg_seq, nw->hdr->nlmsg_pid);
	nw->hdr = NULL;
	nw->num_messages++;
	return (true);
}

/* Discards the current (unfinished) message, rewinding the write offset. */
void
nlmsg_abort(struct nl_writer *nw)
{
	if (nw->hdr != NULL) {
		nw->offset = (uint32_t)((char *)nw->hdr - nw->data);
		nw->hdr = NULL;
	}
}

/*
 * Writes an NLMSG_ERROR acknowledgement for @hdr to socket @nlp.
 * Per the cap-ack convention, the original message is echoed in full
 * only on error when NETLINK_CAP_ACK is not set; extended-ack TLVs
 * (error message / offset) are appended when NLF_EXT_ACK is enabled.
 */
void
nlmsg_ack(struct nlpcb *nlp, int error, struct nlmsghdr *hdr,
    struct nl_pstate *npt)
{
	struct nlmsgerr *errmsg;
	int payload_len;
	uint32_t flags = nlp->nl_flags;
	struct nl_writer *nw = npt->nw;
	bool cap_ack;

	payload_len = sizeof(struct nlmsgerr);

	/*
	 * The only case when we send the full message in the
	 * reply is when there is an error and NETLINK_CAP_ACK
	 * is not set.
	 */
	cap_ack = (error == 0) || (flags & NLF_CAP_ACK);
	if (!cap_ack)
		payload_len += hdr->nlmsg_len - sizeof(struct nlmsghdr);
	payload_len = NETLINK_ALIGN(payload_len);

	uint16_t nl_flags = cap_ack ? NLM_F_CAPPED : 0;
	if ((npt->err_msg || npt->err_off) && nlp->nl_flags & NLF_EXT_ACK)
		nl_flags |= NLM_F_ACK_TLVS;

	/*
	 * TODO: handle cookies
	 */

	NL_LOG(LOG_DEBUG3, "acknowledging message type %d seq %d",
	    hdr->nlmsg_type, hdr->nlmsg_seq);

	if (!nlmsg_add(nw, nlp->nl_port, hdr->nlmsg_seq, NLMSG_ERROR, nl_flags,
	    payload_len))
		goto enomem;

	errmsg = nlmsg_reserve_data(nw, payload_len, struct nlmsgerr);
	errmsg->error = error;
	/* In case of error copy the whole message, else just the header */
	memcpy(&errmsg->msg, hdr, cap_ack ? sizeof(*hdr) : hdr->nlmsg_len);

	if (npt->err_msg != NULL && nlp->nl_flags & NLF_EXT_ACK)
		nlattr_add_string(nw, NLMSGERR_ATTR_MSG, npt->err_msg);
	if (npt->err_off != 0 && nlp->nl_flags & NLF_EXT_ACK)
		nlattr_add_u32(nw, NLMSGERR_ATTR_OFFS, npt->err_off);

	if (nlmsg_end(nw))
		return;
enomem:
	NLP_LOG(LOG_DEBUG, nlp, "error allocating ack data for message %d seq %u",
	    hdr->nlmsg_type, hdr->nlmsg_seq);
	nlmsg_abort(nw);
}

/*
 * Appends the terminating NLMSG_DONE record (carrying the dump result
 * @error as its payload) and marks the writer so that no separate
 * NLMSG_ERROR acknowledgement is generated for this request.
 */
bool
nlmsg_end_dump(struct nl_writer *nw, int error, struct nlmsghdr *hdr)
{
	if (!nlmsg_add(nw, hdr->nlmsg_pid, hdr->nlmsg_seq, NLMSG_DONE, 0,
	    sizeof(int))) {
		NL_LOG(LOG_DEBUG, "Error finalizing table dump");
		return (false);
	}
	/* Save operation result */
	int *perror = nlmsg_reserve_object(nw, int);
	NL_LOG(LOG_DEBUG2, "record error=%d at off %d (%p)", error, nw->offset,
	    perror);
	*perror = error;
	nlmsg_end(nw);
+	nw->suppress_ack = true;

	return (true);
}
diff --git a/sys/netlink/netlink_message_writer.h b/sys/netlink/netlink_message_writer.h
index 424983282e59..99f50fb94213 100644
--- a/sys/netlink/netlink_message_writer.h
+++ b/sys/netlink/netlink_message_writer.h
@@ -1,252 +1,253 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2021 Ng Peng Nam Sean
 * Copyright (c) 2022 Alexander V. Chernikov
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#ifndef _NETLINK_NETLINK_MESSAGE_WRITER_H_
#define _NETLINK_NETLINK_MESSAGE_WRITER_H_

#ifdef _KERNEL
/*
 * It is not meant to be included directly
 */

struct mbuf;
struct nl_writer;
/* Flush callback: consumes @buf of @buflen bytes holding @cnt messages. */
typedef bool nl_writer_cb(struct nl_writer *nw, void *buf, int buflen, int cnt);

struct nl_writer {
	int		alloc_len;	/* allocated buffer length */
	int		offset;		/* offset from the start of the buffer */
	struct nlmsghdr	*hdr;		/* Pointer to the currently-filled msg */
	char		*data;		/* pointer to the contiguous storage */
	void		*_storage;	/* Underlying storage pointer */
	nl_writer_cb	*cb;		/* Callback to flush data */
	union {
		void	*arg_ptr;	/* Callback argument as pointer */
		uint64_t arg_uint;	/* Callback argument as int */
	};
	int		num_messages;	/* Number of messages in the buffer */
	int		malloc_flag;	/* M_WAITOK or M_NOWAIT */
	uint8_t		writer_type;	/* NS_WRITER_TYPE_* */
	uint8_t		writer_target;	/* NS_WRITER_TARGET_* */
	bool		ignore_limit;	/* If true, ignores RCVBUF limit */
	bool		enomem;		/* True if ENOMEM occured */
+	bool		suppress_ack;	/* If true, don't send NLMSG_ERR */
};

#define	NS_WRITER_TARGET_SOCKET	0
#define	NS_WRITER_TARGET_GROUP	1
#define	NS_WRITER_TARGET_CHAIN	2

#define	NS_WRITER_TYPE_MBUF	0
#define	NS_WRITER_TYPE_BUF	1
#define	NS_WRITER_TYPE_LBUF	2
#define	NS_WRITER_TYPE_MBUFC	3

#define	NLMSG_SMALL	128
#define	NLMSG_LARGE	2048

/* Message and attribute writing */
struct nlpcb;

bool nlmsg_get_unicast_writer(struct nl_writer *nw, int expected_size,
    struct nlpcb *nlp);
bool
 nlmsg_get_group_writer(struct nl_writer *nw, int expected_size,
    int proto, int group_id);
bool nlmsg_get_chain_writer(struct nl_writer *nw, int expected_size,
    struct mbuf **pm);
bool nlmsg_flush(struct nl_writer *nw);
void nlmsg_ignore_limit(struct nl_writer *nw);

bool nlmsg_refill_buffer(struct nl_writer *nw, int required_size);
bool nlmsg_add(struct nl_writer *nw, uint32_t portid, uint32_t seq,
    uint16_t type, uint16_t flags, uint32_t len);
bool nlmsg_end(struct nl_writer *nw);
void nlmsg_abort(struct nl_writer *nw);

bool nlmsg_end_dump(struct nl_writer *nw, int error, struct nlmsghdr *hdr);

/* Opens a reply message inheriting pid/seq/type/flags from @hdr. */
static inline bool
nlmsg_reply(struct nl_writer *nw, const struct nlmsghdr *hdr, int payload_len)
{
	return (nlmsg_add(nw, hdr->nlmsg_pid, hdr->nlmsg_seq,
	    hdr->nlmsg_type, hdr->nlmsg_flags, payload_len));
}

#define	nlmsg_data(_hdr)	((void *)((_hdr) + 1))

/*
 * KPI similar to mtodo():
 * current (uncompleted) header is guaranteed to be contiguous,
 * but can be reallocated, thus pointers may need to be readjusted.
*/
static inline int
nlattr_save_offset(const struct nl_writer *nw)
{
	/* Offset of the write position relative to the current message header. */
	return (nw->offset - ((char *)nw->hdr - nw->data));
}

static inline void *
_nlattr_restore_offset(const struct nl_writer *nw, int off)
{
	/* Re-derive a pointer from an offset saved by nlattr_save_offset(). */
	return ((void *)((char *)nw->hdr + off));
}
#define	nlattr_restore_offset(_ns, _off, _t)	((_t *)_nlattr_restore_offset(_ns, _off))

/* Finalizes the length of a (nested) attribute opened at offset @off. */
static inline void
nlattr_set_len(const struct nl_writer *nw, int off)
{
	struct nlattr *nla = nlattr_restore_offset(nw, off, struct nlattr);
	nla->nla_len = nlattr_save_offset(nw) - off;
}

/* Reserves aligned space in the buffer, refilling the storage if needed. */
static inline void *
nlmsg_reserve_data_raw(struct nl_writer *nw, size_t sz)
{

	if (__predict_false(nw->offset + NETLINK_ALIGN(sz) > nw->alloc_len)) {
		if (!nlmsg_refill_buffer(nw, NETLINK_ALIGN(sz)))
			return (NULL);
	}

	void *data_ptr = &nw->data[nw->offset];
	/* NOTE(review): space is checked with NETLINK_ALIGN but advanced by NLMSG_ALIGN -- confirm both expand identically. */
	nw->offset += NLMSG_ALIGN(sz);

	return (data_ptr);
}
#define nlmsg_reserve_object(_ns, _t)	((_t *)nlmsg_reserve_data_raw(_ns, NLA_ALIGN(sizeof(_t))))
#define nlmsg_reserve_data(_ns, _sz, _t)	((_t *)nlmsg_reserve_data_raw(_ns, _sz))

/* Opens a nested attribute; the offset returned feeds nlattr_set_len(). */
static inline int
nlattr_add_nested(struct nl_writer *nw, uint16_t nla_type)
{
	int off = nlattr_save_offset(nw);
	struct nlattr *nla = nlmsg_reserve_data(nw, sizeof(struct nlattr), struct nlattr);
	if (__predict_false(nla == NULL))
		return (0);
	nla->nla_type = nla_type;
	return (off);
}

/* Reserves an attribute with @sz bytes of payload; returns payload pointer. */
static inline void *
_nlmsg_reserve_attr(struct nl_writer *nw, uint16_t nla_type, uint16_t sz)
{
	sz += sizeof(struct nlattr);

	struct nlattr *nla = nlmsg_reserve_data(nw, sz, struct nlattr);
	if (__predict_false(nla == NULL))
		return (NULL);
	nla->nla_type = nla_type;
	nla->nla_len = sz;

	return ((void *)(nla + 1));
}
#define	nlmsg_reserve_attr(_ns, _at, _t)	((_t *)_nlmsg_reserve_attr(_ns, _at, NLA_ALIGN(sizeof(_t))))

/* Appends a TLV attribute, copying @attr_len bytes from @data. */
static inline bool
nlattr_add(struct nl_writer *nw, int attr_type, int attr_len, const void *data)
{
	int required_len = NLA_ALIGN(attr_len + sizeof(struct nlattr));
	if (__predict_false(nw->offset + required_len > nw->alloc_len)) {
		if (!nlmsg_refill_buffer(nw, required_len))
			return
 (false);
	}

	struct nlattr *nla = (struct nlattr *)(&nw->data[nw->offset]);

	nla->nla_len = attr_len + sizeof(struct nlattr);
	nla->nla_type = attr_type;
	if (attr_len > 0) {
		if ((attr_len % 4) != 0) {
			/* clear padding bytes */
			bzero((char *)nla + required_len - 4, 4);
		}
		memcpy((nla + 1), data, attr_len);
	}
	nw->offset += required_len;
	return (true);
}

static inline bool
nlattr_add_u8(struct nl_writer *nw, int attrtype, uint8_t value)
{
	return (nlattr_add(nw, attrtype, sizeof(uint8_t), &value));
}

static inline bool
nlattr_add_u16(struct nl_writer *nw, int attrtype, uint16_t value)
{
	return (nlattr_add(nw, attrtype, sizeof(uint16_t), &value));
}

static inline bool
nlattr_add_u32(struct nl_writer *nw, int attrtype, uint32_t value)
{
	return (nlattr_add(nw, attrtype, sizeof(uint32_t), &value));
}

static inline bool
nlattr_add_u64(struct nl_writer *nw, int attrtype, uint64_t value)
{
	return (nlattr_add(nw, attrtype, sizeof(uint64_t), &value));
}

static inline bool
nlattr_add_s8(struct nl_writer *nw, int attrtype, int8_t value)
{
	return (nlattr_add(nw, attrtype, sizeof(int8_t), &value));
}

static inline bool
nlattr_add_s16(struct nl_writer *nw, int attrtype, int16_t value)
{
	return (nlattr_add(nw, attrtype, sizeof(int16_t), &value));
}

static inline bool
nlattr_add_s32(struct nl_writer *nw, int attrtype, int32_t value)
{
	return (nlattr_add(nw, attrtype, sizeof(int32_t), &value));
}

static inline bool
nlattr_add_s64(struct nl_writer *nw, int attrtype, int64_t value)
{
	return (nlattr_add(nw, attrtype, sizeof(int64_t), &value));
}

/* Zero-length (presence-only) attribute. */
static inline bool
nlattr_add_flag(struct nl_writer *nw, int attrtype)
{
	return (nlattr_add(nw, attrtype, 0, NULL));
}

/* NUL-terminated string attribute (terminator included in the length). */
static inline bool
nlattr_add_string(struct nl_writer *nw, int attrtype, const char *str)
{
	return (nlattr_add(nw, attrtype, strlen(str) + 1, str));
}

#endif
#endif