D42524.diff
No OneTemporary
Actions

Size

60 KB

Referenced Files

None

Subscribers

None

D42524.diff
View Options

	diff --git a/sys/compat/linux/linux_netlink.c b/sys/compat/linux/linux_netlink.c
	--- a/sys/compat/linux/linux_netlink.c
	+++ b/sys/compat/linux/linux_netlink.c
	@@ -32,7 +32,6 @@
	#include <sys/ck.h>
	#include <sys/lock.h>
	#include <sys/malloc.h>
	-#include <sys/rmlock.h>
	#include <sys/socket.h>
	#include <sys/vnode.h>

	@@ -44,6 +43,7 @@
	#include <netlink/netlink.h>
	#include <netlink/netlink_ctl.h>
	#include <netlink/netlink_linux.h>
	+#include <netlink/netlink_var.h>
	#include <netlink/netlink_route.h>

	#include <compat/linux/linux.h>
	@@ -187,6 +187,7 @@

	if (out_hdr != NULL) {
	memcpy(out_hdr, hdr, hdr->nlmsg_len);
	+ nw->num_messages++;
	return (true);
	}
	return (false);
	@@ -518,8 +519,7 @@
	}

	static bool
	-nlmsg_to_linux(int netlink_family, struct nlmsghdr hdr, struct nlpcb nlp,
	- struct nl_writer *nw)
	+nlmsg_to_linux(struct nlmsghdr hdr, struct nlpcb nlp, struct nl_writer *nw)
	{
	if (hdr->nlmsg_type < NLMSG_MIN_TYPE) {
	switch (hdr->nlmsg_type) {
	@@ -536,7 +536,7 @@
	}
	}

	- switch (netlink_family) {
	+ switch (nlp->nl_proto) {
	case NETLINK_ROUTE:
	return (rtnl_to_linux(hdr, nlp, nw));
	default:
	@@ -544,64 +544,64 @@
	}
	}

	-static struct mbuf *
	-nlmsgs_to_linux(int netlink_family, char buf, int data_length, struct nlpcb nlp)
	+/*
	+ * Runs nlmsg_to_linux() over every message in the writer's buffer,
	+ * writing the results into a newly allocated buffer.
	+ *
	+ * On success the new buffer replaces nw->buf, the original buffer is
	+ * freed and true is returned. On failure false is returned and nw->buf
	+ * still points to the original buffer, which the caller must free.
	+ */
	+static bool
	+nlmsgs_to_linux(struct nl_writer nw, struct nlpcb nlp)
	{
	- RT_LOG(LOG_DEBUG3, "LINUX: get %p size %d", buf, data_length);
	- struct nl_writer nw = {};
	-
	- struct mbuf *m = NULL;
	- if (!nlmsg_get_chain_writer(&nw, data_length, &m)) {
	- RT_LOG(LOG_DEBUG, "unable to setup chain writer for size %d",
	- data_length);
	- return (NULL);
	- }
	+ struct nl_buf nb, orig;
	+ u_int offset, msglen, orig_messages __diagused;
	+
	+ RT_LOG(LOG_DEBUG3, "%s: in %u bytes %u messages", __func__,
	+ nw->buf->datalen, nw->num_messages);
	+
	+ orig = nw->buf;
	+ nb = nl_buf_alloc(orig->datalen + SCRATCH_BUFFER_SIZE, M_NOWAIT);
	+ if (__predict_false(nb == NULL))
	+ return (false);
	+ nw->buf = nb;
	+#ifdef INVARIANTS
	+ orig_messages = nw->num_messages;
	+#endif
	+ nw->num_messages = 0;

	/* Assume correct headers. Buffer IS mutable */
	- int count = 0;
	- for (int offset = 0; offset + sizeof(struct nlmsghdr) <= data_length;) {
	- struct nlmsghdr hdr = (struct nlmsghdr )&buf[offset];
	- int msglen = NLMSG_ALIGN(hdr->nlmsg_len);
	- count++;
	+ for (offset = 0;
	+ offset + sizeof(struct nlmsghdr) <= orig->datalen;
	+ offset += msglen) {
	+ struct nlmsghdr hdr = (struct nlmsghdr )&orig->data[offset];

	- if (!nlmsg_to_linux(netlink_family, hdr, nlp, &nw)) {
	+ msglen = NLMSG_ALIGN(hdr->nlmsg_len);
	+ if (!nlmsg_to_linux(hdr, nlp, nw)) {
	RT_LOG(LOG_DEBUG, "failed to process msg type %d",
	hdr->nlmsg_type);
	- m_freem(m);
	- return (NULL);
	+ /*
	+ * Give the original buffer back to the writer.
	+ * The caller (nl_send_one()) frees nw->buf when
	+ * this function fails, so leaving the freed nb
	+ * there would double free it and leak orig.
	+ */
	+ nl_buf_free(nb);
	+ nw->buf = orig;
	+ return (false);
	}
	- offset += msglen;
	}
	- nlmsg_flush(&nw);
	- RT_LOG(LOG_DEBUG3, "Processed %d messages, chain size %d", count,
	- m ? m_length(m, NULL) : 0);

	- return (m);
	-}
	+ MPASS(nw->num_messages == orig_messages);
	+ MPASS(nw->buf == nb);
	+ nl_buf_free(orig);
	+ RT_LOG(LOG_DEBUG3, "%s: out %u bytes", __func__, offset);

	-static struct mbuf *
	-mbufs_to_linux(int netlink_family, struct mbuf m, struct nlpcb nlp)
	-{
	- /* XXX: easiest solution, not optimized for performance */
	- int data_length = m_length(m, NULL);
	- char *buf = malloc(data_length, M_LINUX, M_NOWAIT);
	- if (buf == NULL) {
	- RT_LOG(LOG_DEBUG, "unable to allocate %d bytes, dropping message",
	- data_length);
	- m_freem(m);
	- return (NULL);
	- }
	- m_copydata(m, 0, data_length, buf);
	- m_freem(m);
	-
	- m = nlmsgs_to_linux(netlink_family, buf, data_length, nlp);
	- free(buf, M_LINUX);
	-
	- return (m);
	+ return (true);
	}

	static struct linux_netlink_provider linux_netlink_v1 = {
	- .mbufs_to_linux = mbufs_to_linux,
	.msgs_to_linux = nlmsgs_to_linux,
	.msg_from_linux = nlmsg_from_linux,
	};
	diff --git a/sys/netlink/ktest_netlink_message_writer.h b/sys/netlink/ktest_netlink_message_writer.h
	--- a/sys/netlink/ktest_netlink_message_writer.h
	+++ b/sys/netlink/ktest_netlink_message_writer.h
	@@ -30,28 +30,14 @@

	#if defined(_KERNEL) && defined(INVARIANTS)

	-bool nlmsg_get_buf_type_wrapper(struct nl_writer *nw, int size, int type, bool waitok);
	-void nlmsg_set_callback_wrapper(struct nl_writer *nw);
	-struct mbuf *nl_get_mbuf_chain_wrapper(int len, int malloc_flags);
	+bool nlmsg_get_buf_wrapper(struct nl_writer *nw, u_int size, bool waitok);

	#ifndef KTEST_CALLER

	bool
	-nlmsg_get_buf_type_wrapper(struct nl_writer *nw, int size, int type, bool waitok)
	+nlmsg_get_buf_wrapper(struct nl_writer *nw, u_int size, bool waitok)
	{
	- return (nlmsg_get_buf_type(nw, size, type, waitok));
	-}
	-
	-void
	-nlmsg_set_callback_wrapper(struct nl_writer *nw)
	-{
	- nlmsg_set_callback(nw);
	-}
	-
	-struct mbuf *
	-nl_get_mbuf_chain_wrapper(int len, int malloc_flags)
	-{
	- return (nl_get_mbuf_chain(len, malloc_flags));
	+ return (nlmsg_get_buf(nw, size, waitok));
	}
	#endif

	diff --git a/sys/netlink/ktest_netlink_message_writer.c b/sys/netlink/ktest_netlink_message_writer.c
	--- a/sys/netlink/ktest_netlink_message_writer.c
	+++ b/sys/netlink/ktest_netlink_message_writer.c
	@@ -29,9 +29,9 @@
	#include <sys/cdefs.h>
	#include <sys/systm.h>
	#include <sys/malloc.h>
	-#include <sys/mbuf.h>
	#include <netlink/netlink.h>
	#include <netlink/netlink_ctl.h>
	+#include <netlink/netlink_var.h>
	#include <netlink/netlink_message_writer.h>

	#define KTEST_CALLER
	@@ -39,54 +39,47 @@

	#ifdef INVARIANTS

	-struct test_mbuf_attrs {
	+struct test_nlbuf_attrs {
	uint32_t size;
	uint32_t expected_avail;
	- uint32_t expected_count;
	- uint32_t wtype;
	int waitok;
	};

	-#define _OUT(_field) offsetof(struct test_mbuf_attrs, _field)
	-static const struct nlattr_parser nla_p_mbuf_w[] = {
	+#define _OUT(_field) offsetof(struct test_nlbuf_attrs, _field)
	+static const struct nlattr_parser nla_p_nlbuf_w[] = {
	{ .type = 1, .off = _OUT(size), .cb = nlattr_get_uint32 },
	{ .type = 2, .off = _OUT(expected_avail), .cb = nlattr_get_uint32 },
	- { .type = 3, .off = _OUT(expected_count), .cb = nlattr_get_uint32 },
	- { .type = 4, .off = _OUT(wtype), .cb = nlattr_get_uint32 },
	- { .type = 5, .off = _OUT(waitok), .cb = nlattr_get_uint32 },
	+ { .type = 3, .off = _OUT(waitok), .cb = nlattr_get_uint32 },
	};
	#undef _OUT
	-NL_DECLARE_ATTR_PARSER(mbuf_w_parser, nla_p_mbuf_w);
	+NL_DECLARE_ATTR_PARSER(nlbuf_w_parser, nla_p_nlbuf_w);

	static int
	-test_mbuf_parser(struct ktest_test_context ctx, struct nlattr nla)
	+test_nlbuf_parser(struct ktest_test_context ctx, struct nlattr nla)
	{
	- struct test_mbuf_attrs attrs = npt_alloc(ctx->npt, sizeof(attrs));
	+ struct test_nlbuf_attrs attrs = npt_alloc(ctx->npt, sizeof(attrs));

	ctx->arg = attrs;
	if (attrs != NULL)
	- return (nl_parse_nested(nla, &mbuf_w_parser, ctx->npt, attrs));
	+ return (nl_parse_nested(nla, &nlbuf_w_parser, ctx->npt, attrs));
	return (ENOMEM);
	}

	static int
	-test_mbuf_writer_allocation(struct ktest_test_context *ctx)
	+test_nlbuf_writer_allocation(struct ktest_test_context *ctx)
	{
	- struct test_mbuf_attrs *attrs = ctx->arg;
	- bool ret;
	+ struct test_nlbuf_attrs *attrs = ctx->arg;
	struct nl_writer nw = {};
	+ u_int alloc_len;
	+ bool ret;

	- ret = nlmsg_get_buf_type_wrapper(&nw, attrs->size, attrs->wtype, attrs->waitok);
	+ ret = nlmsg_get_buf_wrapper(&nw, attrs->size, attrs->waitok);
	if (!ret)
	return (EINVAL);

	- int alloc_len = nw.alloc_len;
	+ alloc_len = nw.buf->buflen;
	KTEST_LOG(ctx, "requested %u, allocated %d", attrs->size, alloc_len);

	- /* Set cleanup callback */
	- nw.writer_target = NS_WRITER_TARGET_SOCKET;
	- nlmsg_set_callback_wrapper(&nw);
	-
	/* Mark enomem to avoid reallocation */
	nw.enomem = true;

	@@ -95,9 +88,7 @@
	return (EINVAL);
	}

	- /* Mark as empty to free the storage */
	- nw.offset = 0;
	- nlmsg_flush(&nw);
	+ nl_buf_free(nw.buf);

	if (alloc_len < attrs->expected_avail) {
	KTEST_LOG(ctx, "alloc_len %d, expected %u",
	@@ -107,60 +98,15 @@

	return (0);
	}
	-
	-static int
	-test_mbuf_chain_allocation(struct ktest_test_context *ctx)
	-{
	- struct test_mbuf_attrs *attrs = ctx->arg;
	- int mflags = attrs->waitok ? M_WAITOK : M_NOWAIT;
	- struct mbuf *chain = nl_get_mbuf_chain_wrapper(attrs->size, mflags);
	-
	- if (chain == NULL) {
	- KTEST_LOG(ctx, "nl_get_mbuf_chain(%u) returned NULL", attrs->size);
	- return (EINVAL);
	- }
	-
	- /* Iterate and check number of mbufs and space */
	- uint32_t allocated_count = 0, allocated_size = 0;
	- for (struct mbuf *m = chain; m != NULL; m = m->m_next) {
	- allocated_count++;
	- allocated_size += M_SIZE(m);
	- }
	- m_freem(chain);
	-
	- if (attrs->expected_avail > allocated_size) {
	- KTEST_LOG(ctx, "expected/allocated avail(bytes) %u/%u"
	- " expected/allocated count %u/%u",
	- attrs->expected_avail, allocated_size,
	- attrs->expected_count, allocated_count);
	- return (EINVAL);
	- }
	-
	- if (attrs->expected_count > 0 && (attrs->expected_count != allocated_count)) {
	- KTEST_LOG(ctx, "expected/allocated avail(bytes) %u/%u"
	- " expected/allocated count %u/%u",
	- attrs->expected_avail, allocated_size,
	- attrs->expected_count, allocated_count);
	- return (EINVAL);
	- }
	-
	- return (0);
	-}
	#endif

	static const struct ktest_test_info tests[] = {
	#ifdef INVARIANTS
	{
	- .name = "test_mbuf_writer_allocation",
	- .desc = "test different mbuf sizes in the mbuf writer",
	- .func = &test_mbuf_writer_allocation,
	- .parse = &test_mbuf_parser,
	- },
	- {
	- .name = "test_mbuf_chain_allocation",
	- .desc = "verify allocation different chain sizes",
	- .func = &test_mbuf_chain_allocation,
	- .parse = &test_mbuf_parser,
	+ .name = "test_nlbuf_writer_allocation",
	+ .desc = "test different buffer sizes in the netlink writer",
	+ .func = &test_nlbuf_writer_allocation,
	+ .parse = &test_nlbuf_parser,
	},
	#endif
	};
	diff --git a/sys/netlink/netlink_domain.c b/sys/netlink/netlink_domain.c
	--- a/sys/netlink/netlink_domain.c
	+++ b/sys/netlink/netlink_domain.c
	@@ -179,53 +179,76 @@
	}

	static void
	-nl_send_one_group(struct mbuf m, struct nlpcb nlp, int num_messages,
	- int io_flags)
	+nl_send_one_group(struct nl_writer nw, struct nl_buf nb, struct nlpcb *nlp)
	{
	if (__predict_false(nlp->nl_flags & NLF_MSG_INFO))
	- nl_add_msg_info(m);
	- nl_send_one(m, nlp, num_messages, io_flags);
	+ nl_add_msg_info(nb);
	+ nw->buf = nb;
	+ (void)nl_send_one(nw);
	+}
	+
	+static struct nl_buf *
	+nl_buf_copy(struct nl_buf *nb)
	+{
	+ struct nl_buf *copy;
	+
	+ copy = nl_buf_alloc(nb->buflen, M_NOWAIT);
	+ if (__predict_false(copy == NULL))
	+ return (NULL);
	+ memcpy(copy, nb, sizeof(*nb) + nb->buflen);
	+ if (nb->control != NULL) {
	+ copy->control = m_copym(nb->control, 0, M_COPYALL, M_NOWAIT);
	+ if (__predict_false(copy->control == NULL)) {
	+ nl_buf_free(copy);
	+ return (NULL);
	+ }
	+ }
	+
	+ return (copy);
	}

	/*
	- * Broadcasts message @m to the protocol @proto group specified by @group_id
	+ * Broadcasts the writer's buffer to group nw->group.id of nw->group.proto.
	*/
	-void
	-nl_send_group(struct mbuf *m, int num_messages, int proto, int group_id)
	+bool
	+nl_send_group(struct nl_writer *nw)
	{
	+ struct nl_buf *nb = nw->buf;
	struct nlpcb *nlp_last = NULL;
	struct nlpcb *nlp;
	NLCTL_TRACKER;

	IF_DEBUG_LEVEL(LOG_DEBUG2) {
	- struct nlmsghdr hdr = mtod(m, struct nlmsghdr );
	- NL_LOG(LOG_DEBUG2, "MCAST mbuf len %u msg type %d len %u to group %d/%d",
	- m->m_len, hdr->nlmsg_type, hdr->nlmsg_len, proto, group_id);
	+ struct nlmsghdr hdr = (struct nlmsghdr )nb->data;
	+ NL_LOG(LOG_DEBUG2, "MCAST len %u msg type %d len %u to group %d/%d",
	+ nb->datalen, hdr->nlmsg_type, hdr->nlmsg_len,
	+ nw->group.proto, nw->group.id);
	}

	+ nw->buf = NULL;
	+
	struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl);
	if (__predict_false(ctl == NULL)) {
	/*
	* Can be the case when notification is sent within VNET
	* which doesn't have any netlink sockets.
	*/
	- m_freem(m);
	- return;
	+ nl_buf_free(nb);
	+ return (false);
	}

	NLCTL_RLOCK(ctl);

	- int io_flags = NL_IOF_UNTRANSLATED;
	-
	CK_LIST_FOREACH(nlp, &ctl->ctl_pcb_head, nl_next) {
	- if (nl_isset_group_locked(nlp, group_id) && nlp->nl_proto == proto) {
	+ if (nl_isset_group_locked(nlp, nw->group.id) &&
	+ nlp->nl_proto == nw->group.proto) {
	if (nlp_last != NULL) {
	- struct mbuf *m_copy;
	- m_copy = m_copym(m, 0, M_COPYALL, M_NOWAIT);
	- if (m_copy != NULL)
	- nl_send_one_group(m_copy, nlp_last,
	- num_messages, io_flags);
	- else {
	+ struct nl_buf *copy;
	+
	+ copy = nl_buf_copy(nb);
	+ if (copy != NULL) {
	+ nl_send_one_group(nw, copy, nlp_last);
	+ } else {
	NLP_LOCK(nlp_last);
	if (nlp_last->nl_socket != NULL)
	sorwakeup(nlp_last->nl_socket);
	@@ -236,11 +259,13 @@
	}
	}
	if (nlp_last != NULL)
	- nl_send_one_group(m, nlp_last, num_messages, io_flags);
	+ nl_send_one_group(nw, nb, nlp_last);
	else
	- m_freem(m);
	+ nl_buf_free(nb);

	NLCTL_RUNLOCK(ctl);
	+
	+ return (true);
	}

	bool
	@@ -331,7 +356,7 @@
	free(nlp, M_PCB);
	return (error);
	}
	- so->so_rcv.sb_mtx = &so->so_rcv_mtx;
	+ TAILQ_INIT(&so->so_rcv.nl_queue);
	TAILQ_INIT(&so->so_snd.nl_queue);
	so->so_pcb = nlp;
	nlp->nl_socket = so;
	@@ -344,7 +369,6 @@
	nlp->nl_need_thread_setup = true;
	NLP_LOCK_INIT(nlp);
	refcount_init(&nlp->nl_refcount, 1);
	- nl_init_io(nlp);

	nlp->nl_taskqueue = taskqueue_create("netlink_socket", M_WAITOK,
	taskqueue_thread_enqueue, &nlp->nl_taskqueue);
	@@ -467,15 +491,6 @@
	return (0);
	}

	-static void
	-destroy_nlpcb(struct nlpcb *nlp)
	-{
	- NLP_LOCK(nlp);
	- nl_free_io(nlp);
	- NLP_LOCK_DESTROY(nlp);
	- free(nlp, M_PCB);
	-}
	-
	static void
	destroy_nlpcb_epoch(epoch_context_t ctx)
	{
	@@ -483,10 +498,10 @@

	nlp = __containerof(ctx, struct nlpcb, nl_epoch_ctx);

	- destroy_nlpcb(nlp);
	+ NLP_LOCK_DESTROY(nlp);
	+ free(nlp, M_PCB);
	}

	-
	static void
	nl_close(struct socket *so)
	{
	@@ -522,9 +537,12 @@

	while ((nb = TAILQ_FIRST(&so->so_snd.nl_queue)) != NULL) {
	TAILQ_REMOVE(&so->so_snd.nl_queue, nb, tailq);
	- free(nb, M_NETLINK);
	+ nl_buf_free(nb);
	+ }
	+ while ((nb = TAILQ_FIRST(&so->so_rcv.nl_queue)) != NULL) {
	+ TAILQ_REMOVE(&so->so_rcv.nl_queue, nb, tailq);
	+ nl_buf_free(nb);
	}
	- sbdestroy(so, SO_RCV);

	NL_LOG(LOG_DEBUG3, "socket %p, detached", so);

	@@ -597,10 +615,8 @@
	len = roundup2(uio->uio_resid, 8) + SCRATCH_BUFFER_SIZE;
	if (nlp->nl_linux)
	len += roundup2(uio->uio_resid, 8);
	- nb = malloc(sizeof(*nb) + len, M_NETLINK, M_WAITOK);
	+ nb = nl_buf_alloc(len, M_WAITOK);
	nb->datalen = uio->uio_resid;
	- nb->buflen = len;
	- nb->offset = 0;
	error = uiomove(&nb->data[0], uio->uio_resid, uio);
	if (__predict_false(error))
	goto out;
	@@ -635,19 +651,107 @@

	out:
	SOCK_IO_SEND_UNLOCK(so);
	- free(nb, M_NETLINK);
	+ if (nb != NULL)
	+ nl_buf_free(nb);
	return (error);
	}

	static int
	-nl_pru_rcvd(struct socket *so, int flags)
	+nl_soreceive(struct socket so, struct sockaddr psa, struct uio uio,
	+ struct mbuf mp, struct mbuf controlp, int *flagsp)
	{
	+ static const struct sockaddr_nl nl_empty_src = {
	+ .nl_len = sizeof(struct sockaddr_nl),
	+ .nl_family = PF_NETLINK,
	+ .nl_pid = 0 /* comes from the kernel */
	+ };
	+ struct sockbuf *sb = &so->so_rcv;
	+ struct nl_buf *nb;
	+ int flags, error;
	+ u_int overflow;
	+ bool nonblock, trunc, peek;
	+
	+ MPASS(mp == NULL && uio != NULL);
	+
	NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid);
	- MPASS(sotonlpcb(so) != NULL);
	+
	+ if (psa != NULL)
	+ psa = sodupsockaddr((const struct sockaddr )&nl_empty_src,
	+ M_WAITOK);
	+
	+ flags = flagsp != NULL ? *flagsp & ~MSG_TRUNC : 0;
	+ trunc = flagsp != NULL ? *flagsp & MSG_TRUNC : false;
	+ nonblock = (so->so_state & SS_NBIO) \|\|
	+ (flags & (MSG_DONTWAIT \| MSG_NBIO));
	+ peek = flags & MSG_PEEK;
	+
	+ error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
	+ if (__predict_false(error))
	+ return (error);
	+
	+ SOCK_RECVBUF_LOCK(so);
	+ while ((nb = TAILQ_FIRST(&sb->nl_queue)) == NULL) {
	+ if (nonblock) {
	+ SOCK_RECVBUF_UNLOCK(so);
	+ SOCK_IO_RECV_UNLOCK(so);
	+ return (EWOULDBLOCK);
	+ }
	+ error = sbwait(so, SO_RCV);
	+ if (error) {
	+ SOCK_RECVBUF_UNLOCK(so);
	+ SOCK_IO_RECV_UNLOCK(so);
	+ return (error);
	+ }
	+ }
	+
	+ /*
	+ * XXXGL
	+ * Here we emulate a PR_ATOMIC behavior of soreceive_generic() where
	+ * we take only the first "record" in the socket buffer and send it
	+ * to uio whole or truncated ignoring how many netlink messages are
	+ * in the record and how much space is left in the uio.
	+ * This needs to be fixed at next refactoring. First, we should perform
	+ * truncation only if the very first message doesn't fit into uio.
	+ * That will help an application with small buffer not to lose data.
	+ * Second, we should continue working on the sb->nl_queue as long as
	+ * there is more space in the uio. That will boost applications with
	+ * large buffers.
	+ */
	+ if (__predict_true(!peek)) {
	+ TAILQ_REMOVE(&sb->nl_queue, nb, tailq);
	+ sb->sb_acc -= nb->datalen;
	+ sb->sb_ccc -= nb->datalen;
	+ }
	+ SOCK_RECVBUF_UNLOCK(so);
	+
	+ overflow = __predict_false(nb->datalen > uio->uio_resid) ?
	+ nb->datalen - uio->uio_resid : 0;
	+ error = uiomove(nb->data, (int)nb->datalen, uio);
	+ if (__predict_false(overflow > 0)) {
	+ flags \|= MSG_TRUNC;
	+ if (trunc)
	+ uio->uio_resid -= overflow;
	+ }
	+
	+ if (controlp != NULL) {
	+ *controlp = nb->control;
	+ nb->control = NULL;
	+ }
	+
	+ if (__predict_true(!peek))
	+ nl_buf_free(nb);
	+
	+ if (uio->uio_td)
	+ uio->uio_td->td_ru.ru_msgrcv++;
	+
	+ if (flagsp != NULL)
	+ *flagsp \|= flags;
	+
	+ SOCK_IO_RECV_UNLOCK(so);

	nl_on_transmit(sotonlpcb(so));

	- return (0);
	+ return (error);
	}

	static int
	@@ -798,8 +902,7 @@
	}

	#define NETLINK_PROTOSW \
	- .pr_flags = PR_ATOMIC \| PR_ADDR \| PR_WANTRCVD \| \
	- PR_SOCKBUF, \
	+ .pr_flags = PR_ATOMIC \| PR_ADDR \| PR_SOCKBUF, \
	.pr_ctloutput = nl_ctloutput, \
	.pr_setsbopt = nl_setsbopt, \
	.pr_attach = nl_pru_attach, \
	@@ -807,7 +910,7 @@
	.pr_connect = nl_pru_connect, \
	.pr_disconnect = nl_pru_disconnect, \
	.pr_sosend = nl_sosend, \
	- .pr_rcvd = nl_pru_rcvd, \
	+ .pr_soreceive = nl_soreceive, \
	.pr_shutdown = nl_pru_shutdown, \
	.pr_sockaddr = nl_sockaddr, \
	.pr_close = nl_close
	diff --git a/sys/netlink/netlink_glue.c b/sys/netlink/netlink_glue.c
	--- a/sys/netlink/netlink_glue.c
	+++ b/sys/netlink/netlink_glue.c
	@@ -111,7 +111,6 @@
	get_stub_writer(struct nl_writer *nw)
	{
	bzero(nw, sizeof(*nw));
	- nw->writer_type = NS_WRITER_TYPE_STUB;
	nw->enomem = true;

	return (false);
	diff --git a/sys/netlink/netlink_io.c b/sys/netlink/netlink_io.c
	--- a/sys/netlink/netlink_io.c
	+++ b/sys/netlink/netlink_io.c
	@@ -51,69 +51,36 @@
	* sending netlink data between the kernel and userland.
	*/

	-static const struct sockaddr_nl _nl_empty_src = {
	- .nl_len = sizeof(struct sockaddr_nl),
	- .nl_family = PF_NETLINK,
	- .nl_pid = 0 /* comes from the kernel */
	-};
	-static const struct sockaddr nl_empty_src = (const struct sockaddr )&_nl_empty_src;
	-
	static bool nl_process_nbuf(struct nl_buf nb, struct nlpcb nlp);

	-static void
	-queue_push(struct nl_io_queue q, struct mbuf mq)
	-{
	- while (mq != NULL) {
	- struct mbuf *m = mq;
	- mq = mq->m_nextpkt;
	- m->m_nextpkt = NULL;
	-
	- q->length += m_length(m, NULL);
	- STAILQ_INSERT_TAIL(&q->head, m, m_stailqpkt);
	- }
	-}
	-
	-static struct mbuf *
	-queue_pop(struct nl_io_queue *q)
	+struct nl_buf *
	+nl_buf_alloc(size_t len, int mflag)
	{
	- if (!STAILQ_EMPTY(&q->head)) {
	- struct mbuf *m = STAILQ_FIRST(&q->head);
	- STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt);
	- m->m_nextpkt = NULL;
	- q->length -= m_length(m, NULL);
	+ struct nl_buf *nb;

	- return (m);
	+ nb = malloc(sizeof(struct nl_buf) + len, M_NETLINK, mflag);
	+ if (__predict_true(nb != NULL)) {
	+ nb->buflen = len;
	+ nb->datalen = nb->offset = 0;
	+ nb->control = NULL;
	}
	- return (NULL);
	-}

	-static struct mbuf *
	-queue_head(const struct nl_io_queue *q)
	-{
	- return (STAILQ_FIRST(&q->head));
	+ return (nb);
	}

	-static inline bool
	-queue_empty(const struct nl_io_queue *q)
	+void
	+nl_buf_free(struct nl_buf *nb)
	{
	- return (q->length == 0);
	-}

	-static void
	-queue_free(struct nl_io_queue *q)
	-{
	- while (!STAILQ_EMPTY(&q->head)) {
	- struct mbuf *m = STAILQ_FIRST(&q->head);
	- STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt);
	- m->m_nextpkt = NULL;
	- m_freem(m);
	- }
	- q->length = 0;
	+ if (nb->control)
	+ m_freem(nb->control);
	+ free(nb, M_NETLINK);
	}

	void
	-nl_add_msg_info(struct mbuf *m)
	+nl_add_msg_info(struct nl_buf *nb)
	{
	+ /* XXXGL pass nlp as arg? */
	struct nlpcb *nlp = nl_get_thread_nlp(curthread);
	NL_LOG(LOG_DEBUG2, "Trying to recover nlp from thread %p: %p",
	curthread, nlp);
	@@ -139,27 +106,15 @@
	};


	- while (m->m_next != NULL)
	- m = m->m_next;
	- m->m_next = sbcreatecontrol(data, sizeof(data),
	+ nb->control = sbcreatecontrol(data, sizeof(data),
	NETLINK_MSG_INFO, SOL_NETLINK, M_NOWAIT);

	- NL_LOG(LOG_DEBUG2, "Storing %u bytes of data, ctl: %p",
	- (unsigned)sizeof(data), m->m_next);
	-}
	-
	-static __noinline struct mbuf *
	-extract_msg_info(struct mbuf *m)
	-{
	- while (m->m_next != NULL) {
	- if (m->m_next->m_type == MT_CONTROL) {
	- struct mbuf *ctl = m->m_next;
	- m->m_next = NULL;
	- return (ctl);
	- }
	- m = m->m_next;
	- }
	- return (NULL);
	+ if (__predict_true(nb->control != NULL))
	+ NL_LOG(LOG_DEBUG2, "Storing %u bytes of control data, ctl: %p",
	+ (unsigned)sizeof(data), nb->control);
	+ else
	+ NL_LOG(LOG_DEBUG2, "Failed to allocate %u bytes of control",
	+ (unsigned)sizeof(data));
	}

	void
	@@ -174,65 +129,31 @@
	}
	}

	-static bool
	-tx_check_locked(struct nlpcb *nlp)
	-{
	- if (queue_empty(&nlp->tx_queue))
	- return (true);
	-
	- /*
	- * Check if something can be moved from the internal TX queue
	- * to the socket queue.
	- */
	-
	- bool appended = false;
	- struct sockbuf *sb = &nlp->nl_socket->so_rcv;
	- SOCKBUF_LOCK(sb);
	-
	- while (true) {
	- struct mbuf *m = queue_head(&nlp->tx_queue);
	- if (m != NULL) {
	- struct mbuf *ctl = NULL;
	- if (__predict_false(m->m_next != NULL))
	- ctl = extract_msg_info(m);
	- if (sbappendaddr_locked(sb, nl_empty_src, m, ctl) != 0) {
	- /* appended successfully */
	- queue_pop(&nlp->tx_queue);
	- appended = true;
	- } else
	- break;
	- } else
	- break;
	- }
	-
	- SOCKBUF_UNLOCK(sb);
	-
	- if (appended)
	- sorwakeup(nlp->nl_socket);
	-
	- return (queue_empty(&nlp->tx_queue));
	-}
	-
	static bool
	nl_process_received_one(struct nlpcb *nlp)
	{
	struct socket *so = nlp->nl_socket;
	- struct sockbuf *sb = &so->so_snd;
	+ struct sockbuf *sb;
	struct nl_buf *nb;
	bool reschedule = false;

	NLP_LOCK(nlp);
	nlp->nl_task_pending = false;
	+ NLP_UNLOCK(nlp);

	- if (!tx_check_locked(nlp)) {
	- /* TX overflow queue still not empty, ignore RX */
	- NLP_UNLOCK(nlp);
	+ /*
	+ * Do not process queued up requests if there is no space to queue
	+ * replies.
	+ */
	+ sb = &so->so_rcv;
	+ SOCK_RECVBUF_LOCK(so);
	+ if (sb->sb_hiwat <= sb->sb_ccc) {
	+ SOCK_RECVBUF_UNLOCK(so);
	return (false);
	}
	+ SOCK_RECVBUF_UNLOCK(so);

	- int prev_hiwat = nlp->tx_queue.hiwat;
	- NLP_UNLOCK(nlp);
	-
	+ sb = &so->so_snd;
	SOCK_SENDBUF_LOCK(so);
	while ((nb = TAILQ_FIRST(&sb->nl_queue)) != NULL) {
	TAILQ_REMOVE(&sb->nl_queue, nb, tailq);
	@@ -244,7 +165,7 @@
	sb->sb_ccc -= nb->datalen;
	/* XXXGL: potentially can reduce lock&unlock count. */
	sowwakeup_locked(so);
	- free(nb, M_NETLINK);
	+ nl_buf_free(nb);
	SOCK_SENDBUF_LOCK(so);
	} else {
	TAILQ_INSERT_HEAD(&sb->nl_queue, nb, tailq);
	@@ -252,10 +173,6 @@
	}
	}
	SOCK_SENDBUF_UNLOCK(so);
	- if (nlp->tx_queue.hiwat > prev_hiwat) {
	- NLP_LOG(LOG_DEBUG, nlp, "TX override peaked to %d", nlp->tx_queue.hiwat);
	-
	- }

	return (reschedule);
	}
	@@ -276,18 +193,6 @@
	;
	}

	-void
	-nl_init_io(struct nlpcb *nlp)
	-{
	- STAILQ_INIT(&nlp->tx_queue.head);
	-}
	-
	-void
	-nl_free_io(struct nlpcb *nlp)
	-{
	- queue_free(&nlp->tx_queue);
	-}
	-
	/*
	* Called after some data have been read from the socket.
	*/
	@@ -306,8 +211,8 @@
	struct sockbuf *sb = &so->so_rcv;
	NLP_LOG(LOG_DEBUG, nlp,
	"socket RX overflowed, %lu messages (%lu bytes) dropped. "
	- "bytes: [%u/%u] mbufs: [%u/%u]", dropped_messages, dropped_bytes,
	- sb->sb_ccc, sb->sb_hiwat, sb->sb_mbcnt, sb->sb_mbmax);
	+ "bytes: [%u/%u]", dropped_messages, dropped_bytes,
	+ sb->sb_ccc, sb->sb_hiwat);
	/* TODO: send netlink message */
	}

	@@ -325,95 +230,67 @@
	CURVNET_RESTORE();
	}

	-static __noinline void
	-queue_push_tx(struct nlpcb nlp, struct mbuf m)
	-{
	- queue_push(&nlp->tx_queue, m);
	- nlp->nl_tx_blocked = true;
	-
	- if (nlp->tx_queue.length > nlp->tx_queue.hiwat)
	- nlp->tx_queue.hiwat = nlp->tx_queue.length;
	-}
	-
	/*
	- * Tries to send @m to the socket @nlp.
	- *
	- * @m: mbuf(s) to send to. Consumed in any case.
	- * @nlp: socket to send to
	- * @cnt: number of messages in @m
	- * @io_flags: combination of NL_IOF_* flags
	+ * Tries to queue the writer's buffer on the receive queue of nw->nlp's socket.
	*
	* Returns true on success.
	* If no queue overrunes happened, wakes up socket owner.
	*/
	bool
	-nl_send_one(struct mbuf m, struct nlpcb nlp, int num_messages, int io_flags)
	+nl_send_one(struct nl_writer *nw)
	{
	- bool untranslated = io_flags & NL_IOF_UNTRANSLATED;
	- bool ignore_limits = io_flags & NL_IOF_IGNORE_LIMIT;
	- bool result = true;
	+ struct nlpcb *nlp = nw->nlp;
	+ struct socket *so = nlp->nl_socket;
	+ struct sockbuf *sb = &so->so_rcv;
	+ struct nl_buf *nb;
	+
	+ MPASS(nw->hdr == NULL);

	IF_DEBUG_LEVEL(LOG_DEBUG2) {
	- struct nlmsghdr hdr = mtod(m, struct nlmsghdr );
	+ struct nlmsghdr hdr = (struct nlmsghdr )nw->buf->data;
	NLP_LOG(LOG_DEBUG2, nlp,
	- "TX mbuf len %u msgs %u msg type %d first hdrlen %u io_flags %X",
	- m_length(m, NULL), num_messages, hdr->nlmsg_type, hdr->nlmsg_len,
	- io_flags);
	+ "TX len %u msgs %u msg type %d first hdrlen %u",
	+ nw->buf->datalen, nw->num_messages, hdr->nlmsg_type,
	+ hdr->nlmsg_len);
	}

	- if (__predict_false(nlp->nl_linux && linux_netlink_p != NULL && untranslated)) {
	- m = linux_netlink_p->mbufs_to_linux(nlp->nl_proto, m, nlp);
	- if (m == NULL)
	- return (false);
	+ if (nlp->nl_linux && linux_netlink_p != NULL &&
	+ __predict_false(!linux_netlink_p->msgs_to_linux(nw, nlp))) {
	+ nl_buf_free(nw->buf);
	+ nw->buf = NULL;
	+ return (false);
	}

	- NLP_LOCK(nlp);
	+ nb = nw->buf;
	+ nw->buf = NULL;

	- if (__predict_false(nlp->nl_socket == NULL)) {
	+ SOCK_RECVBUF_LOCK(so);
	+ if (!nw->ignore_limit && __predict_false(sb->sb_hiwat <= sb->sb_ccc)) {
	+ SOCK_RECVBUF_UNLOCK(so);
	+ NLP_LOCK(nlp);
	+ nlp->nl_dropped_bytes += nb->datalen;
	+ nlp->nl_dropped_messages += nw->num_messages;
	+ NLP_LOG(LOG_DEBUG2, nlp, "RX oveflow: %lu m (+%d), %lu b (+%d)",
	+ (unsigned long)nlp->nl_dropped_messages, nw->num_messages,
	+ (unsigned long)nlp->nl_dropped_bytes, nb->datalen);
	NLP_UNLOCK(nlp);
	- m_freem(m);
	+ nl_buf_free(nb);
	return (false);
	- }
	-
	- if (!queue_empty(&nlp->tx_queue)) {
	- if (ignore_limits) {
	- queue_push_tx(nlp, m);
	- } else {
	- m_free(m);
	- result = false;
	- }
	- NLP_UNLOCK(nlp);
	- return (result);
	- }
	-
	- struct socket *so = nlp->nl_socket;
	- struct mbuf *ctl = NULL;
	- if (__predict_false(m->m_next != NULL))
	- ctl = extract_msg_info(m);
	- if (sbappendaddr(&so->so_rcv, nl_empty_src, m, ctl) != 0) {
	- sorwakeup(so);
	- NLP_LOG(LOG_DEBUG3, nlp, "appended data & woken up");
	} else {
	- if (ignore_limits) {
	- queue_push_tx(nlp, m);
	- } else {
	- /*
	- * Store dropped data so it can be reported
	- * on the next read
	- */
	- nlp->nl_dropped_bytes += m_length(m, NULL);
	- nlp->nl_dropped_messages += num_messages;
	- NLP_LOG(LOG_DEBUG2, nlp, "RX oveflow: %lu m (+%d), %lu b (+%d)",
	- (unsigned long)nlp->nl_dropped_messages, num_messages,
	- (unsigned long)nlp->nl_dropped_bytes, m_length(m, NULL));
	- soroverflow(so);
	- m_freem(m);
	- result = false;
	+ bool full;
	+
	+ TAILQ_INSERT_TAIL(&sb->nl_queue, nb, tailq);
	+ sb->sb_acc += nb->datalen;
	+ sb->sb_ccc += nb->datalen;
	+ full = sb->sb_hiwat <= sb->sb_ccc;
	+ sorwakeup_locked(so);
	+ if (full) {
	+ NLP_LOCK(nlp);
	+ nlp->nl_tx_blocked = true;
	+ NLP_UNLOCK(nlp);
	}
	+ return (true);
	}
	- NLP_UNLOCK(nlp);
	-
	- return (result);
	}

	static int
	diff --git a/sys/netlink/netlink_linux.h b/sys/netlink/netlink_linux.h
	--- a/sys/netlink/netlink_linux.h
	+++ b/sys/netlink/netlink_linux.h
	@@ -27,6 +27,7 @@

	#ifndef _NETLINK_LINUX_VAR_H_
	#define _NETLINK_LINUX_VAR_H_
	+#ifdef _KERNEL

	/*
	* The file contains headers for the bridge interface between
	@@ -34,16 +35,13 @@
	*/
	struct nlpcb;
	struct nl_pstate;
	+struct nl_writer;

	-typedef struct mbuf mbufs_to_linux_cb_t(int netlink_family, struct mbuf m,
	- struct nlpcb *nlp);
	-typedef struct mbuf msgs_to_linux_cb_t(int netlink_family, char buf, int data_length,
	- struct nlpcb *nlp);
	+typedef bool msgs_to_linux_cb_t(struct nl_writer nw, struct nlpcb nlp);
	typedef struct nlmsghdr msg_from_linux_cb_t(int netlink_family, struct nlmsghdr hdr,
	struct nl_pstate *npt);

	struct linux_netlink_provider {
	- mbufs_to_linux_cb_t *mbufs_to_linux;
	msgs_to_linux_cb_t *msgs_to_linux;
	msg_from_linux_cb_t *msg_from_linux;

	@@ -52,3 +50,4 @@
	extern struct linux_netlink_provider *linux_netlink_p;

	#endif
	+#endif
	diff --git a/sys/netlink/netlink_message_writer.h b/sys/netlink/netlink_message_writer.h
	--- a/sys/netlink/netlink_message_writer.h
	+++ b/sys/netlink/netlink_message_writer.h
	@@ -37,60 +37,41 @@
	* It is not meant to be included directly
	*/

	-struct mbuf;
	+struct nl_buf;
	struct nl_writer;
	-typedef bool nl_writer_cb(struct nl_writer nw, void buf, int buflen, int cnt);
	+typedef bool nl_writer_cb(struct nl_writer *nw);

	struct nl_writer {
	- int alloc_len; /* allocated buffer length */
	- int offset; /* offset from the start of the buffer */
	- struct nlmsghdr hdr; / Pointer to the currently-filled msg */
	- char data; / pointer to the contiguous storage */
	- void _storage; / Underlying storage pointer */
	- nl_writer_cb cb; / Callback to flush data */
	+ struct nl_buf buf; / Underlying storage pointer */
	+ struct nlmsghdr hdr; / Pointer to the currently-filled msg */
	+ nl_writer_cb cb; / Callback to flush data */
	union {
	- void *ptr;
	+ struct nlpcb *nlp;
	struct {
	uint16_t proto;
	uint16_t id;
	} group;
	- } arg;
	- int num_messages; /* Number of messages in the buffer */
	- int malloc_flag; /* M_WAITOK or M_NOWAIT */
	- uint8_t writer_type; /* NS_WRITER_TYPE_* */
	- uint8_t writer_target; /* NS_WRITER_TARGET_* */
	- bool ignore_limit; /* If true, ignores RCVBUF limit */
	- bool enomem; /* True if ENOMEM occured */
	- bool suppress_ack; /* If true, don't send NLMSG_ERR */
	+ };
	+ u_int num_messages; /* Number of messages in the buffer */
	+ int malloc_flag; /* M_WAITOK or M_NOWAIT */
	+ bool ignore_limit; /* If true, ignores RCVBUF limit */
	+ bool enomem; /* True if ENOMEM occured */
	+ bool suppress_ack; /* If true, don't send NLMSG_ERR */
	};
	-#define NS_WRITER_TARGET_SOCKET 0
	-#define NS_WRITER_TARGET_GROUP 1
	-#define NS_WRITER_TARGET_CHAIN 2
	-
	-#define NS_WRITER_TYPE_MBUF 0
	-#define NS_WRITER_TYPE_BUF 1
	-#define NS_WRITER_TYPE_LBUF 2
	-#define NS_WRITER_TYPE_MBUFC 3
	-#define NS_WRITER_TYPE_STUB 4
	-

	#define NLMSG_SMALL 128
	#define NLMSG_LARGE 2048

	/* Message and attribute writing */
	-
	-struct nlpcb;
	-
	#if defined(NETLINK) \|\| defined(NETLINK_MODULE)
	/* Provide optimized calls to the functions inside the same linking unit */

	bool _nlmsg_get_unicast_writer(struct nl_writer nw, int expected_size, struct nlpcb nlp);
	bool _nlmsg_get_group_writer(struct nl_writer *nw, int expected_size, int proto, int group_id);
	-bool _nlmsg_get_chain_writer(struct nl_writer nw, int expected_size, struct mbuf *pm);
	bool _nlmsg_flush(struct nl_writer *nw);
	void _nlmsg_ignore_limit(struct nl_writer *nw);

	-bool _nlmsg_refill_buffer(struct nl_writer *nw, int required_size);
	+bool _nlmsg_refill_buffer(struct nl_writer *nw, u_int required_len);
	bool _nlmsg_add(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type,
	uint16_t flags, uint32_t len);
	bool _nlmsg_end(struct nl_writer *nw);
	@@ -111,12 +92,6 @@
	return (_nlmsg_get_group_writer(nw, expected_size, proto, group_id));
	}

	-static inline bool
	-nlmsg_get_chain_writer(struct nl_writer *nw, int expected_size, struct mbuf **pm)
	-{
	- return (_nlmsg_get_chain_writer(nw, expected_size, pm));
	-}
	-
	static inline bool
	nlmsg_flush(struct nl_writer *nw)
	{
	@@ -186,8 +161,6 @@
	hdr->nlmsg_flags, payload_len));
	}

	-#define nlmsg_data(_hdr) ((void *)((_hdr) + 1))
	-
	/*
	* KPI similar to mtodo():
	* current (uncompleted) header is guaranteed to be contiguous,
	diff --git a/sys/netlink/netlink_message_writer.c b/sys/netlink/netlink_message_writer.c
	--- a/sys/netlink/netlink_message_writer.c
	+++ b/sys/netlink/netlink_message_writer.c
	@@ -30,7 +30,6 @@
	#include <sys/lock.h>
	#include <sys/rmlock.h>
	#include <sys/mbuf.h>
	-#include <sys/ck.h>
	#include <sys/socket.h>
	#include <sys/socketvar.h>
	#include <sys/syslog.h>
	@@ -45,523 +44,44 @@
	#include <netlink/netlink_debug.h>
	_DECLARE_DEBUG(LOG_INFO);

	-/*
	- * The goal of this file is to provide convenient message writing KPI on top of
	- * different storage methods (mbufs, uio, temporary memory chunks).
	- *
	- * The main KPI guarantee is that the (last) message always resides in the contiguous
	- * memory buffer, so one is able to update the header after writing the entire message.
	- *
	- * This guarantee comes with a side effect of potentially reallocating underlying
	- * buffer, so one needs to update the desired pointers after something is added
	- * to the header.
	- *
	- * Messaging layer contains hooks performing transparent Linux translation for the messages.
	- *
	- * There are 3 types of supported targets:
	- * * socket (adds mbufs to the socket buffer, used for message replies)
	- * * group (sends mbuf/chain to the specified groups, used for the notifications)
	- * * chain (returns mbuf chain, used in Linux message translation code)
	- *
	- * There are 3 types of storage:
	- * * NS_WRITER_TYPE_MBUF (mbuf-based, most efficient, used when a single message
	- * fits in NLMBUFSIZE)
	- * * NS_WRITER_TYPE_BUF (fallback, malloc-based, used when a single message needs
	- * to be larger than one supported by NS_WRITER_TYPE_MBUF)
	- * * NS_WRITER_TYPE_LBUF (malloc-based, similar to NS_WRITER_TYPE_BUF, used for
	- * Linux sockets, calls translation hook prior to sending messages to the socket).
	- *
	- * Internally, KPI switches between different types of storage when memory requirements
	- * change. It happens transparently to the caller.
	- */
	-
	-/*
	- * Uma zone for the mbuf-based Netlink storage
	- */
	-static uma_zone_t nlmsg_zone;
	-
	-static void
	-nl_free_mbuf_storage(struct mbuf *m)
	-{
	- uma_zfree(nlmsg_zone, m->m_ext.ext_buf);
	-}
	-
	-static int
	-nl_setup_mbuf_storage(void *mem, int size, void *arg, int how __unused)
	-{
	- struct mbuf *m = (struct mbuf *)arg;
	-
	- if (m != NULL)
	- m_extadd(m, mem, size, nl_free_mbuf_storage, NULL, NULL, 0, EXT_MOD_TYPE);
	-
	- return (0);
	-}
	-
	-static struct mbuf *
	-nl_get_mbuf_flags(int size, int malloc_flags, int mbuf_flags)
	-{
	- struct mbuf *m, *m_storage;
	-
	- if (size <= MHLEN)
	- return (m_get2(size, malloc_flags, MT_DATA, mbuf_flags));
	-
	- if (__predict_false(size > NLMBUFSIZE))
	- return (NULL);
	-
	- m = m_gethdr(malloc_flags, MT_DATA);
	- if (m == NULL)
	- return (NULL);
	-
	- m_storage = uma_zalloc_arg(nlmsg_zone, m, malloc_flags);
	- if (m_storage == NULL) {
	- m_free_raw(m);
	- return (NULL);
	- }
	-
	- return (m);
	-}
	-
	-static struct mbuf *
	-nl_get_mbuf(int size, int malloc_flags)
	-{
	- return (nl_get_mbuf_flags(size, malloc_flags, M_PKTHDR));
	-}
	-
	-/*
	- * Gets a chain of Netlink mbufs.
	- * This is strip-down version of m_getm2()
	- */
	-static struct mbuf *
	-nl_get_mbuf_chain(int len, int malloc_flags)
	-{
	- struct mbuf *m_chain = NULL, *m_tail = NULL;
	- int mbuf_flags = M_PKTHDR;
	-
	- while (len > 0) {
	- int sz = len > NLMBUFSIZE ? NLMBUFSIZE: len;
	- struct mbuf *m = nl_get_mbuf_flags(sz, malloc_flags, mbuf_flags);
	-
	- if (m == NULL) {
	- m_freem(m_chain);
	- return (NULL);
	- }
	-
	- /* Book keeping. */
	- len -= M_SIZE(m);
	- if (m_tail != NULL)
	- m_tail->m_next = m;
	- else
	- m_chain = m;
	- m_tail = m;
	- mbuf_flags &= ~M_PKTHDR; /* Only valid on the first mbuf. */
	- }
	-
	- return (m_chain);
	-}
	-
	-void
	-nl_init_msg_zone(void)
	-{
	- nlmsg_zone = uma_zcreate("netlink", NLMBUFSIZE, nl_setup_mbuf_storage,
	- NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	-}
	-
	-void
	-nl_destroy_msg_zone(void)
	-{
	- uma_zdestroy(nlmsg_zone);
	-}
	-
	-
	-typedef bool nlwriter_op_init(struct nl_writer *nw, int size, bool waitok);
	-typedef bool nlwriter_op_write(struct nl_writer *nw, void *buf, int buflen, int cnt);
	-
	-struct nlwriter_ops {
	- nlwriter_op_init *init;
	- nlwriter_op_write *write_socket;
	- nlwriter_op_write *write_group;
	- nlwriter_op_write *write_chain;
	-};
	-
	-/*
	- * NS_WRITER_TYPE_BUF
	- * Writes message to a temporary memory buffer,
	- * flushing to the socket/group when buffer size limit is reached
	- */
	-static bool
	-nlmsg_get_ns_buf(struct nl_writer *nw, int size, bool waitok)
	-{
	- int mflag = waitok ? M_WAITOK : M_NOWAIT;
	- nw->_storage = malloc(size, M_NETLINK, mflag | M_ZERO);
	- if (__predict_false(nw->_storage == NULL))
	- return (false);
	- nw->alloc_len = size;
	- nw->offset = 0;
	- nw->hdr = NULL;
	- nw->data = nw->_storage;
	- nw->writer_type = NS_WRITER_TYPE_BUF;
	- nw->malloc_flag = mflag;
	- nw->num_messages = 0;
	- nw->enomem = false;
	- return (true);
	-}
	-
	static bool
	-nlmsg_write_socket_buf(struct nl_writer *nw, void *buf, int datalen, int cnt)
	+nlmsg_get_buf(struct nl_writer *nw, u_int len, bool waitok)
	{
	- NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr);
	- if (__predict_false(datalen == 0)) {
	- free(buf, M_NETLINK);
	- return (true);
	- }
	+ const int mflag = waitok ? M_WAITOK : M_NOWAIT;

	- struct mbuf *m = nl_get_mbuf_chain(datalen, nw->malloc_flag);
	- if (__predict_false(m == NULL)) {
	- /* XXX: should we set sorcverr? */
	- free(buf, M_NETLINK);
	- return (false);
	- }
	- m_append(m, datalen, buf);
	- free(buf, M_NETLINK);
	+ MPASS(nw->buf == NULL);

	- int io_flags = (nw->ignore_limit) ? NL_IOF_IGNORE_LIMIT : 0;
	- return (nl_send_one(m, (struct nlpcb *)(nw->arg.ptr), cnt, io_flags));
	-}
	-
	-static bool
	-nlmsg_write_group_buf(struct nl_writer *nw, void *buf, int datalen, int cnt)
	-{
	- NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d proto: %d id: %d", buf, datalen,
	- nw->arg.group.proto, nw->arg.group.id);
	- if (__predict_false(datalen == 0)) {
	- free(buf, M_NETLINK);
	- return (true);
	- }
	+ NL_LOG(LOG_DEBUG3, "Setting up nw %p len %u %s", nw, len,
	+ waitok ? "wait" : "nowait");

	- struct mbuf *m = nl_get_mbuf_chain(datalen, nw->malloc_flag);
	- if (__predict_false(m == NULL)) {
	- free(buf, M_NETLINK);
	+ nw->buf = nl_buf_alloc(len, mflag);
	+ if (__predict_false(nw->buf == NULL))
	return (false);
	- }
	- bool success = m_append(m, datalen, buf) != 0;
	- free(buf, M_NETLINK);
	-
	- if (!success)
	- return (false);
	-
	- nl_send_group(m, cnt, nw->arg.group.proto, nw->arg.group.id);
	- return (true);
	-}
	-
	-static bool
	-nlmsg_write_chain_buf(struct nl_writer *nw, void *buf, int datalen, int cnt)
	-{
	- struct mbuf **m0 = (struct mbuf **)(nw->arg.ptr);
	- NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr);
	-
	- if (__predict_false(datalen == 0)) {
	- free(buf, M_NETLINK);
	- return (true);
	- }
	-
	- if (*m0 == NULL) {
	- struct mbuf *m = nl_get_mbuf_chain(datalen, nw->malloc_flag);
	-
	- if (__predict_false(m == NULL)) {
	- free(buf, M_NETLINK);
	- return (false);
	- }
	- *m0 = m;
	- }
	- if (__predict_false(m_append(*m0, datalen, buf) == 0)) {
	- free(buf, M_NETLINK);
	- return (false);
	- }
	- return (true);
	-}
	-
	-
	-/*
	- * NS_WRITER_TYPE_MBUF
	- * Writes message to the allocated mbuf,
	- * flushing to socket/group when mbuf size limit is reached.
	- * This is the most efficient mechanism as it avoids double-copying.
	- *
	- * Allocates a single mbuf suitable to store up to @size bytes of data.
	- * If size < MHLEN (around 160 bytes), allocates mbuf with pkghdr.
	- * If the size <= NLMBUFSIZE (2k), allocate mbuf+storage out of nlmsg_zone.
	- * Returns NULL on greater size or the allocation failure.
	- */
	-static bool
	-nlmsg_get_ns_mbuf(struct nl_writer *nw, int size, bool waitok)
	-{
	- int mflag = waitok ? M_WAITOK : M_NOWAIT;
	- struct mbuf *m = nl_get_mbuf(size, mflag);
	-
	- if (__predict_false(m == NULL))
	- return (false);
	- nw->alloc_len = M_TRAILINGSPACE(m);
	- nw->offset = 0;
	nw->hdr = NULL;
	- nw->_storage = (void *)m;
	- nw->data = mtod(m, void *);
	- nw->writer_type = NS_WRITER_TYPE_MBUF;
	nw->malloc_flag = mflag;
	nw->num_messages = 0;
	nw->enomem = false;
	- memset(nw->data, 0, size);
	- NL_LOG(LOG_DEBUG2, "alloc mbuf %p req_len %d alloc_len %d data_ptr %p",
	- m, size, nw->alloc_len, nw->data);
	- return (true);
	-}
	-
	-static bool
	-nlmsg_write_socket_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
	-{
	- struct mbuf *m = (struct mbuf *)buf;
	- NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr);
	-
	- if (__predict_false(datalen == 0)) {
	- m_freem(m);
	- return (true);
	- }
	-
	- m->m_pkthdr.len = datalen;
	- m->m_len = datalen;
	- int io_flags = (nw->ignore_limit) ? NL_IOF_IGNORE_LIMIT : 0;
	- return (nl_send_one(m, (struct nlpcb *)(nw->arg.ptr), cnt, io_flags));
	-}
	-
	-static bool
	-nlmsg_write_group_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
	-{
	- struct mbuf *m = (struct mbuf *)buf;
	- NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d proto: %d id: %d", buf, datalen,
	- nw->arg.group.proto, nw->arg.group.id);
	-
	- if (__predict_false(datalen == 0)) {
	- m_freem(m);
	- return (true);
	- }

	- m->m_pkthdr.len = datalen;
	- m->m_len = datalen;
	- nl_send_group(m, cnt, nw->arg.group.proto, nw->arg.group.id);
	return (true);
	}

	-static bool
	-nlmsg_write_chain_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
	-{
	- struct mbuf *m_new = (struct mbuf *)buf;
	- struct mbuf **m0 = (struct mbuf **)(nw->arg.ptr);
	-
	- NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr);
	-
	- if (__predict_false(datalen == 0)) {
	- m_freem(m_new);
	- return (true);
	- }
	-
	- m_new->m_pkthdr.len = datalen;
	- m_new->m_len = datalen;
	-
	- if (*m0 == NULL) {
	- *m0 = m_new;
	- } else {
	- struct mbuf *m_last;
	- for (m_last = *m0; m_last->m_next != NULL; m_last = m_last->m_next)
	- ;
	- m_last->m_next = m_new;
	- (*m0)->m_pkthdr.len += datalen;
	- }
	-
	- return (true);
	-}
	-
	-/*
	- * NS_WRITER_TYPE_LBUF
	- * Writes message to the allocated memory buffer,
	- * flushing to socket/group when mbuf size limit is reached.
	- * Calls linux handler to rewrite messages before sending to the socket.
	- */
	-static bool
	-nlmsg_get_ns_lbuf(struct nl_writer *nw, int size, bool waitok)
	-{
	- int mflag = waitok ? M_WAITOK : M_NOWAIT;
	- size = roundup2(size, sizeof(void *));
	- int add_size = sizeof(struct linear_buffer) + SCRATCH_BUFFER_SIZE;
	- char *buf = malloc(add_size + size * 2, M_NETLINK, mflag | M_ZERO);
	- if (__predict_false(buf == NULL))
	- return (false);
	-
	- /* Fill buffer header first */
	- struct linear_buffer *lb = (struct linear_buffer *)buf;
	- lb->base = &buf[sizeof(struct linear_buffer) + size];
	- lb->size = size + SCRATCH_BUFFER_SIZE;
	-
	- nw->alloc_len = size;
	- nw->offset = 0;
	- nw->hdr = NULL;
	- nw->_storage = buf;
	- nw->data = (char *)(lb + 1);
	- nw->malloc_flag = mflag;
	- nw->writer_type = NS_WRITER_TYPE_LBUF;
	- nw->num_messages = 0;
	- nw->enomem = false;
	- return (true);
	-}
	-
	-static bool
	-nlmsg_write_socket_lbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
	-{
	- struct linear_buffer *lb = (struct linear_buffer *)buf;
	- char *data = (char *)(lb + 1);
	- struct nlpcb *nlp = (struct nlpcb *)(nw->arg.ptr);
	-
	- if (__predict_false(datalen == 0)) {
	- free(buf, M_NETLINK);
	- return (true);
	- }
	-
	- struct mbuf *m = NULL;
	- if (linux_netlink_p != NULL)
	- m = linux_netlink_p->msgs_to_linux(nlp->nl_proto, data, datalen, nlp);
	- free(buf, M_NETLINK);
	-
	- if (__predict_false(m == NULL)) {
	- /* XXX: should we set sorcverr? */
	- return (false);
	- }
	-
	- int io_flags = (nw->ignore_limit) ? NL_IOF_IGNORE_LIMIT : 0;
	- return (nl_send_one(m, nlp, cnt, io_flags));
	-}
	-
	-/* Shouldn't be called (maybe except Linux code originating message) */
	-static bool
	-nlmsg_write_group_lbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
	-{
	- struct linear_buffer *lb = (struct linear_buffer *)buf;
	- char *data = (char *)(lb + 1);
	-
	- if (__predict_false(datalen == 0)) {
	- free(buf, M_NETLINK);
	- return (true);
	- }
	-
	- struct mbuf *m = nl_get_mbuf_chain(datalen, nw->malloc_flag);
	- if (__predict_false(m == NULL)) {
	- free(buf, M_NETLINK);
	- return (false);
	- }
	- m_append(m, datalen, data);
	- free(buf, M_NETLINK);
	-
	- nl_send_group(m, cnt, nw->arg.group.proto, nw->arg.group.id);
	- return (true);
	-}
	-
	-static const struct nlwriter_ops nlmsg_writers[] = {
	- /* NS_WRITER_TYPE_MBUF */
	- {
	- .init = nlmsg_get_ns_mbuf,
	- .write_socket = nlmsg_write_socket_mbuf,
	- .write_group = nlmsg_write_group_mbuf,
	- .write_chain = nlmsg_write_chain_mbuf,
	- },
	- /* NS_WRITER_TYPE_BUF */
	- {
	- .init = nlmsg_get_ns_buf,
	- .write_socket = nlmsg_write_socket_buf,
	- .write_group = nlmsg_write_group_buf,
	- .write_chain = nlmsg_write_chain_buf,
	- },
	- /* NS_WRITER_TYPE_LBUF */
	- {
	- .init = nlmsg_get_ns_lbuf,
	- .write_socket = nlmsg_write_socket_lbuf,
	- .write_group = nlmsg_write_group_lbuf,
	- },
	-};
	-
	-static void
	-nlmsg_set_callback(struct nl_writer *nw)
	-{
	- const struct nlwriter_ops *pops = &nlmsg_writers[nw->writer_type];
	-
	- switch (nw->writer_target) {
	- case NS_WRITER_TARGET_SOCKET:
	- nw->cb = pops->write_socket;
	- break;
	- case NS_WRITER_TARGET_GROUP:
	- nw->cb = pops->write_group;
	- break;
	- case NS_WRITER_TARGET_CHAIN:
	- nw->cb = pops->write_chain;
	- break;
	- default:
	- panic("not implemented");
	- }
	-}
	-
	-static bool
	-nlmsg_get_buf_type(struct nl_writer *nw, int size, int type, bool waitok)
	-{
	- MPASS(type + 1 <= sizeof(nlmsg_writers) / sizeof(nlmsg_writers[0]));
	- NL_LOG(LOG_DEBUG3, "Setting up nw %p size %d type %d", nw, size, type);
	- return (nlmsg_writers[type].init(nw, size, waitok));
	-}
	-
	-static bool
	-nlmsg_get_buf(struct nl_writer *nw, int size, bool waitok, bool is_linux)
	-{
	- int type;
	-
	- if (!is_linux) {
	- if (__predict_true(size <= NLMBUFSIZE))
	- type = NS_WRITER_TYPE_MBUF;
	- else
	- type = NS_WRITER_TYPE_BUF;
	- } else
	- type = NS_WRITER_TYPE_LBUF;
	- return (nlmsg_get_buf_type(nw, size, type, waitok));
	-}
	-
	bool
	_nlmsg_get_unicast_writer(struct nl_writer *nw, int size, struct nlpcb *nlp)
	{
	- if (!nlmsg_get_buf(nw, size, false, nlp->nl_linux))
	- return (false);
	- nw->arg.ptr = (void *)nlp;
	- nw->writer_target = NS_WRITER_TARGET_SOCKET;
	- nlmsg_set_callback(nw);
	- return (true);
	+ nw->nlp = nlp;
	+ nw->cb = nl_send_one;
	+
	+ return (nlmsg_get_buf(nw, size, false));
	}

	bool
	_nlmsg_get_group_writer(struct nl_writer *nw, int size, int protocol, int group_id)
	{
	- if (!nlmsg_get_buf(nw, size, false, false))
	- return (false);
	- nw->arg.group.proto = protocol;
	- nw->arg.group.id = group_id;
	- nw->writer_target = NS_WRITER_TARGET_GROUP;
	- nlmsg_set_callback(nw);
	- return (true);
	-}
	+ nw->group.proto = protocol;
	+ nw->group.id = group_id;
	+ nw->cb = nl_send_group;

	-bool
	-_nlmsg_get_chain_writer(struct nl_writer *nw, int size, struct mbuf **pm)
	-{
	- if (!nlmsg_get_buf(nw, size, false, false))
	- return (false);
	- *pm = NULL;
	- nw->arg.ptr = (void *)pm;
	- nw->writer_target = NS_WRITER_TARGET_CHAIN;
	- nlmsg_set_callback(nw);
	- NL_LOG(LOG_DEBUG3, "setup cb %p (need %p)", nw->cb, &nlmsg_write_chain_mbuf);
	- return (true);
	+ return (nlmsg_get_buf(nw, size, false));
	}

	void
	@@ -576,18 +96,18 @@

	if (__predict_false(nw->hdr != NULL)) {
	/* Last message has not been completed, skip it. */
	- int completed_len = (char *)nw->hdr - nw->data;
	+ int completed_len = (char *)nw->hdr - nw->buf->data;
	/* Send completed messages */
	- nw->offset -= nw->offset - completed_len;
	+ nw->buf->datalen -= nw->buf->datalen - completed_len;
	nw->hdr = NULL;
	- }
	+ }

	NL_LOG(LOG_DEBUG2, "OUT");
	- bool result = nw->cb(nw, nw->_storage, nw->offset, nw->num_messages);
	- nw->_storage = NULL;
	+ bool result = nw->cb(nw);
	+ nw->num_messages = 0;

	if (!result) {
	- NL_LOG(LOG_DEBUG, "nw %p offset %d: flush with %p() failed", nw, nw->offset, nw->cb);
	+ NL_LOG(LOG_DEBUG, "nw %p flush with %p() failed", nw, nw->cb);
	}

	return (result);
	@@ -599,59 +119,61 @@
	* Return true on success.
	*/
	bool
	-_nlmsg_refill_buffer(struct nl_writer *nw, int required_len)
	+_nlmsg_refill_buffer(struct nl_writer *nw, u_int required_len)
	{
	- struct nl_writer ns_new = {};
	- int completed_len, new_len;
	+ struct nl_buf *new;
	+ u_int completed_len, new_len, last_len;
	+
	+ MPASS(nw->buf != NULL);

	if (nw->enomem)
	return (false);

	- NL_LOG(LOG_DEBUG3, "no space at offset %d/%d (want %d), trying to reclaim",
	- nw->offset, nw->alloc_len, required_len);
	+ NL_LOG(LOG_DEBUG3, "no space at offset %u/%u (want %u), trying to "
	+ "reclaim", nw->buf->datalen, nw->buf->buflen, required_len);

	- /* Calculated new buffer size and allocate it s*/
	- completed_len = (nw->hdr != NULL) ? (char *)nw->hdr - nw->data : nw->offset;
	+ /* Calculate new buffer size and allocate it. */
	+ completed_len = (nw->hdr != NULL) ?
	+ (char *)nw->hdr - nw->buf->data : nw->buf->datalen;
	if (completed_len > 0 && required_len < NLMBUFSIZE) {
	- /* We already ran out of space, use the largest effective size */
	- new_len = max(nw->alloc_len, NLMBUFSIZE);
	+ /* We already ran out of space, use largest effective size. */
	+ new_len = max(nw->buf->buflen, NLMBUFSIZE);
	} else {
	- if (nw->alloc_len < NLMBUFSIZE)
	+ if (nw->buf->buflen < NLMBUFSIZE)
	+ /* XXXGL: does this happen? */
	new_len = NLMBUFSIZE;
	else
	- new_len = nw->alloc_len * 2;
	+ new_len = nw->buf->buflen * 2;
	while (new_len < required_len)
	new_len *= 2;
	}
	- bool waitok = (nw->malloc_flag == M_WAITOK);
	- bool is_linux = (nw->writer_type == NS_WRITER_TYPE_LBUF);
	- if (!nlmsg_get_buf(&ns_new, new_len, waitok, is_linux)) {
	+
	+ new = nl_buf_alloc(new_len, nw->malloc_flag | M_ZERO);
	+ if (__predict_false(new == NULL)) {
	nw->enomem = true;
	NL_LOG(LOG_DEBUG, "getting new buf failed, setting ENOMEM");
	return (false);
	}
	- if (nw->ignore_limit)
	- nlmsg_ignore_limit(&ns_new);

	- /* Update callback data */
	- ns_new.writer_target = nw->writer_target;
	- nlmsg_set_callback(&ns_new);
	- ns_new.arg = nw->arg;
	-
	- /* Copy last (unfinished) header to the new storage */
	- int last_len = nw->offset - completed_len;
	+ /* Copy last (unfinished) header to the new storage. */
	+ last_len = nw->buf->datalen - completed_len;
	if (last_len > 0) {
	- memcpy(ns_new.data, nw->hdr, last_len);
	- ns_new.hdr = (struct nlmsghdr *)ns_new.data;
	- ns_new.offset = last_len;
	+ memcpy(new->data, nw->hdr, last_len);
	+ new->datalen = last_len;
	}

	- NL_LOG(LOG_DEBUG2, "completed: %d bytes, copied: %d bytes", completed_len, last_len);
	+ NL_LOG(LOG_DEBUG2, "completed: %u bytes, copied: %u bytes",
	+ completed_len, last_len);

	- /* Flush completed headers & switch to the new nw */
	- nlmsg_flush(nw);
	- memcpy(nw, &ns_new, sizeof(struct nl_writer));
	- NL_LOG(LOG_DEBUG2, "switched buffer: used %d/%d bytes", nw->offset, nw->alloc_len);
	+ if (completed_len > 0) {
	+ nlmsg_flush(nw);
	+ MPASS(nw->buf == NULL);
	+ } else
	+ nl_buf_free(nw->buf);
	+ nw->buf = new;
	+ nw->hdr = (last_len > 0) ? (struct nlmsghdr *)new->data : NULL;
	+ NL_LOG(LOG_DEBUG2, "switched buffer: used %u/%u bytes",
	+ new->datalen, new->buflen);

	return (true);
	}
	@@ -660,17 +182,20 @@
	_nlmsg_add(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type,
	uint16_t flags, uint32_t len)
	{
	+ struct nl_buf *nb = nw->buf;
	struct nlmsghdr *hdr;
	+ u_int required_len;

	MPASS(nw->hdr == NULL);

	- int required_len = NETLINK_ALIGN(len + sizeof(struct nlmsghdr));
	- if (__predict_false(nw->offset + required_len > nw->alloc_len)) {
	+ required_len = NETLINK_ALIGN(len + sizeof(struct nlmsghdr));
	+ if (__predict_false(nb->datalen + required_len > nb->buflen)) {
	if (!nlmsg_refill_buffer(nw, required_len))
	return (false);
	+ nb = nw->buf;
	}

	- hdr = (struct nlmsghdr *)(&nw->data[nw->offset]);
	+ hdr = (struct nlmsghdr *)(&nb->data[nb->datalen]);

	hdr->nlmsg_len = len;
	hdr->nlmsg_type = type;
	@@ -679,7 +204,7 @@
	hdr->nlmsg_pid = portid;

	nw->hdr = hdr;
	- nw->offset += sizeof(struct nlmsghdr);
	+ nb->datalen += sizeof(struct nlmsghdr);

	return (true);
	}
	@@ -687,6 +212,8 @@
	bool
	_nlmsg_end(struct nl_writer *nw)
	{
	+ struct nl_buf *nb = nw->buf;
	+
	MPASS(nw->hdr != NULL);

	if (nw->enomem) {
	@@ -695,7 +222,7 @@
	return (false);
	}

	- nw->hdr->nlmsg_len = (uint32_t)(nw->data + nw->offset - (char *)nw->hdr);
	+ nw->hdr->nlmsg_len = nb->data + nb->datalen - (char *)nw->hdr;
	NL_LOG(LOG_DEBUG2, "wrote msg len: %u type: %d: flags: 0x%X seq: %u pid: %u",
	nw->hdr->nlmsg_len, nw->hdr->nlmsg_type, nw->hdr->nlmsg_flags,
	nw->hdr->nlmsg_seq, nw->hdr->nlmsg_pid);
	@@ -707,8 +234,10 @@
	void
	_nlmsg_abort(struct nl_writer *nw)
	{
	+ struct nl_buf *nb = nw->buf;
	+
	if (nw->hdr != NULL) {
	- nw->offset = (uint32_t)((char *)nw->hdr - nw->data);
	+ nb->datalen = (char *)nw->hdr - nb->data;
	nw->hdr = NULL;
	}
	}
	@@ -775,7 +304,7 @@
	/* Save operation result */
	int *perror = nlmsg_reserve_object(nw, int);
	NL_LOG(LOG_DEBUG2, "record error=%d at off %d (%p)", error,
	- nw->offset, perror);
	+ nw->buf->datalen, perror);
	*perror = error;
	nlmsg_end(nw);
	nw->suppress_ack = true;
	@@ -787,40 +316,47 @@
	* KPI functions.
	*/

	-int
	+u_int
	nlattr_save_offset(const struct nl_writer *nw)
	{
	- return (nw->offset - ((char *)nw->hdr - nw->data));
	+ return (nw->buf->datalen - ((char *)nw->hdr - nw->buf->data));
	}

	void *
	nlmsg_reserve_data_raw(struct nl_writer *nw, size_t sz)
	{
	- sz = NETLINK_ALIGN(sz);
	+ struct nl_buf *nb = nw->buf;
	+ void *data;

	- if (__predict_false(nw->offset + sz > nw->alloc_len)) {
	+ sz = NETLINK_ALIGN(sz);
	+ if (__predict_false(nb->datalen + sz > nb->buflen)) {
	if (!nlmsg_refill_buffer(nw, sz))
	return (NULL);
	+ nb = nw->buf;
	}

	- void *data_ptr = &nw->data[nw->offset];
	- nw->offset += sz;
	- bzero(data_ptr, sz);
	+ data = &nb->data[nb->datalen];
	+ bzero(data, sz);
	+ nb->datalen += sz;

	- return (data_ptr);
	+ return (data);
	}

	bool
	nlattr_add(struct nl_writer *nw, int attr_type, int attr_len, const void *data)
	{
	- int required_len = NLA_ALIGN(attr_len + sizeof(struct nlattr));
	+ struct nl_buf *nb = nw->buf;
	+ struct nlattr *nla;
	+ u_int required_len;

	- if (__predict_false(nw->offset + required_len > nw->alloc_len)) {
	+ required_len = NLA_ALIGN(attr_len + sizeof(struct nlattr));
	+ if (__predict_false(nb->datalen + required_len > nb->buflen)) {
	if (!nlmsg_refill_buffer(nw, required_len))
	return (false);
	+ nb = nw->buf;
	}

	- struct nlattr *nla = (struct nlattr *)(&nw->data[nw->offset]);
	+ nla = (struct nlattr *)(&nb->data[nb->datalen]);

	nla->nla_len = attr_len + sizeof(struct nlattr);
	nla->nla_type = attr_type;
	@@ -831,7 +367,7 @@
	}
	memcpy((nla + 1), data, attr_len);
	}
	- nw->offset += required_len;
	+ nb->datalen += required_len;
	return (true);
	}

	diff --git a/sys/netlink/netlink_module.c b/sys/netlink/netlink_module.c
	--- a/sys/netlink/netlink_module.c
	+++ b/sys/netlink/netlink_module.c
	@@ -181,7 +181,6 @@
	.nlmsg_abort = _nlmsg_abort,
	.nlmsg_get_unicast_writer = _nlmsg_get_unicast_writer,
	.nlmsg_get_group_writer = _nlmsg_get_group_writer,
	- .nlmsg_get_chain_writer = _nlmsg_get_chain_writer,
	.nlmsg_end_dump = _nlmsg_end_dump,
	.nl_modify_ifp_generic = _nl_modify_ifp_generic,
	.nl_store_ifp_cookie = _nl_store_ifp_cookie,
	@@ -219,7 +218,6 @@
	switch (what) {
	case MOD_LOAD:
	NL_LOG(LOG_DEBUG2, "Loading");
	- nl_init_msg_zone();
	nl_osd_register();
	#if !defined(NETLINK) && defined(NETLINK_MODULE)
	nl_set_functions(&nl_module);
	@@ -235,7 +233,6 @@
	nl_set_functions(NULL);
	#endif
	nl_osd_unregister();
	- nl_destroy_msg_zone();
	} else
	ret = EBUSY;
	break;
	diff --git a/sys/netlink/netlink_var.h b/sys/netlink/netlink_var.h
	--- a/sys/netlink/netlink_var.h
	+++ b/sys/netlink/netlink_var.h
	@@ -43,14 +43,9 @@

	struct ucred;

	-struct nl_io_queue {
	- STAILQ_HEAD(, mbuf) head;
	- int length;
	- int hiwat;
	-};
	-
	struct nl_buf {
	TAILQ_ENTRY(nl_buf) tailq;
	+ struct mbuf *control;
	u_int buflen;
	u_int datalen;
	u_int offset;
	@@ -72,7 +67,6 @@
	bool nl_linux; /* true if running under compat */
	bool nl_unconstrained_vnet; /* true if running under VNET jail (or without jail) */
	bool nl_need_thread_setup;
	- struct nl_io_queue tx_queue;
	struct taskqueue *nl_taskqueue;
	struct task nl_task;
	struct ucred *nl_cred; /* Copy of nl_socket->so_cred */
	@@ -131,7 +125,7 @@
	extern struct nl_proto_handler *nl_handlers;

	/* netlink_domain.c */
	-void nl_send_group(struct mbuf *m, int cnt, int proto, int group_id);
	+bool nl_send_group(struct nl_writer *);
	void nl_osd_register(void);
	void nl_osd_unregister(void);
	void nl_set_thread_nlp(struct thread *td, struct nlpcb *nlp);
	@@ -139,22 +133,18 @@
	/* netlink_io.c */
	#define NL_IOF_UNTRANSLATED 0x01
	#define NL_IOF_IGNORE_LIMIT 0x02
	-bool nl_send_one(struct mbuf *m, struct nlpcb *nlp, int cnt, int io_flags);
	+bool nl_send_one(struct nl_writer *);
	void nlmsg_ack(struct nlpcb *nlp, int error, struct nlmsghdr *nlmsg,
	struct nl_pstate *npt);
	void nl_on_transmit(struct nlpcb *nlp);
	-void nl_init_io(struct nlpcb *nlp);
	-void nl_free_io(struct nlpcb *nlp);

	void nl_taskqueue_handler(void *_arg, int pending);
	void nl_schedule_taskqueue(struct nlpcb *nlp);
	void nl_process_receive_locked(struct nlpcb *nlp);
	void nl_set_source_metadata(struct mbuf *m, int num_messages);
	-void nl_add_msg_info(struct mbuf *m);
	-
	-/* netlink_message_writer.c */
	-void nl_init_msg_zone(void);
	-void nl_destroy_msg_zone(void);
	+void nl_add_msg_info(struct nl_buf *nb);
	+struct nl_buf *nl_buf_alloc(size_t len, int mflag);
	+void nl_buf_free(struct nl_buf *nb);

	/* netlink_generic.c */
	struct genl_family {
	diff --git a/sys/netlink/route/rt.c b/sys/netlink/route/rt.c
	--- a/sys/netlink/route/rt.c
	+++ b/sys/netlink/route/rt.c
	@@ -556,9 +556,8 @@
	IF_DEBUG_LEVEL(LOG_DEBUG3) {
	char rtbuf[INET6_ADDRSTRLEN + 5];
	FIB_LOG(LOG_DEBUG3, wa->fibnum, wa->family,
	- "Dump %s, offset %u, error %d",
	- rt_print_buf(rt, rtbuf, sizeof(rtbuf)),
	- wa->nw->offset, error);
	+ "Dump %s, error %d",
	+ rt_print_buf(rt, rtbuf, sizeof(rtbuf)), error);
	}
	wa->error = error;

	@@ -578,7 +577,6 @@

	FIB_LOG(LOG_DEBUG2, fibnum, family, "End dump, iterated %d dumped %d",
	wa->count, wa->dumped);
	- NL_LOG(LOG_DEBUG2, "Current offset: %d", wa->nw->offset);
	}

	static int
	diff --git a/tests/sys/netlink/test_netlink_message_writer.py b/tests/sys/netlink/test_netlink_message_writer.py
	--- a/tests/sys/netlink/test_netlink_message_writer.py
	+++ b/tests/sys/netlink/test_netlink_message_writer.py
	@@ -4,19 +4,11 @@
	from atf_python.ktest import BaseKernelTest
	from atf_python.sys.netlink.attrs import NlAttrU32

	-
	M_NOWAIT = 1
	M_WAITOK = 2
	-NS_WRITER_TYPE_MBUF = 0
	-NS_WRITER_TYPE_BUF = 1
	-NS_WRITER_TYPE_LBUF = 1
	-
	-MHLEN = 160
	-MCLBYTES = 2048 # XXX: may differ on some archs?
	-MJUMPAGESIZE = mmap.PAGESIZE
	-MJUM9BYTES = 9 * 1024
	-MJUM16BYTES = 16 * 1024

	+NLMSG_SMALL = 128
	+NLMSG_LARGE = 2048

	class TestNetlinkMessageWriter(BaseKernelTest):
	KTEST_MODULE_NAME = "ktest_netlink_message_writer"
	@@ -28,52 +20,20 @@
	pytest.param(M_WAITOK, id="WAITOK"),
	],
	)
	- @pytest.mark.parametrize(
	- "writer_type",
	- [
	- pytest.param(NS_WRITER_TYPE_MBUF, id="MBUF"),
	- pytest.param(NS_WRITER_TYPE_BUF, id="BUF"),
	- ],
	- )
	@pytest.mark.parametrize(
	"sz",
	[
	- pytest.param([160, 160], id="MHLEN"),
	- pytest.param([MCLBYTES, MCLBYTES], id="MCLBYTES"),
	+ pytest.param([NLMSG_SMALL, NLMSG_SMALL], id="NLMSG_SMALL"),
	+ pytest.param([NLMSG_LARGE, NLMSG_LARGE], id="NLMSG_LARGE"),
	+ pytest.param([NLMSG_LARGE + 256, NLMSG_LARGE + 256], id="NLMSG_LARGE+256"),
	],
	)
	- def test_mbuf_writer_allocation(self, sz, writer_type, malloc_flags):
	+ def test_nlbuf_writer_allocation(self, sz, malloc_flags):
	"""override to parametrize"""

	test_meta = [
	NlAttrU32(1, sz[0]), # size
	NlAttrU32(2, sz[1]), # expected_avail
	- NlAttrU32(4, writer_type),
	- NlAttrU32(5, malloc_flags),
	- ]
	- self.runtest(test_meta)
	-
	- @pytest.mark.parametrize(
	- "malloc_flags",
	- [
	- pytest.param(M_NOWAIT, id="NOWAIT"),
	- pytest.param(M_WAITOK, id="WAITOK"),
	- ],
	- )
	- @pytest.mark.parametrize(
	- "sz",
	- [
	- pytest.param([160, 160, 1], id="MHLEN"),
	- pytest.param([MCLBYTES, MCLBYTES, 1], id="MCLBYTES"),
	- pytest.param([MCLBYTES + 1, MCLBYTES + 1, 2], id="MCLBYTES_MHLEN"),
	- pytest.param([MCLBYTES + 256, MCLBYTES * 2, 2], id="MCLBYTESx2"),
	- ],
	- )
	- def test_mbuf_chain_allocation(self, sz, malloc_flags):
	- test_meta = [
	- NlAttrU32(1, sz[0]), # size
	- NlAttrU32(2, sz[1]), # expected_avail
	- NlAttrU32(3, sz[2]), # expected_count
	- NlAttrU32(5, malloc_flags),
	+ NlAttrU32(3, malloc_flags),
	]
	self.runtest(test_meta)

File Metadata

Mime Type: text/plain
Expires: Sat, Jul 4, 3:25 AM (14 h, 40 m)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 34648851
Default Alt Text: D42524.diff (60 KB)

D42524.diffNo OneTemporaryActions

D42524.diffView Options

File Metadata

Event Timeline

D42524.diff
No OneTemporary
Actions

D42524.diff
View Options