Page MenuHomeFreeBSD

D56563.id178629.diff
No OneTemporary

D56563.id178629.diff

diff --git a/lib/libsys/getsockopt.2 b/lib/libsys/getsockopt.2
--- a/lib/libsys/getsockopt.2
+++ b/lib/libsys/getsockopt.2
@@ -25,7 +25,7 @@
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
-.Dd November 25, 2024
+.Dd April 21, 2026
.Dt GETSOCKOPT 2
.Os
.Sh NAME
@@ -220,6 +220,10 @@
listening sockets based on a hash function of local port number, and foreign IP
address and port number.
A maximum of 256 sockets can be bound to the same load-balancing group.
+.Dv PF_DIVERT
+sockets may also be bound to a group; see the
+.Xr divert 4
+manual page for details.
.Pp
.Dv SO_KEEPALIVE
enables the
diff --git a/share/man/man4/divert.4 b/share/man/man4/divert.4
--- a/share/man/man4/divert.4
+++ b/share/man/man4/divert.4
@@ -1,5 +1,4 @@
-.\"
-.Dd January 23, 2026
+.Dd April 21, 2026
.Dt DIVERT 4
.Os
.Sh NAME
@@ -57,7 +56,26 @@
.Pp
By reading from and writing to a divert socket, matching packets
can be passed through an arbitrary ``filter'' as they travel through
-the host machine, special routing tricks can be done, etc.
+the host machine, special routing tricks can be done, etc.
+.Pp
+Multiple divert sockets may be bound to the same port if the
+.Dv SO_REUSEPORT_LB
+socket option is set on all of them.
+In this case, the kernel will attempt to load-balance packets among
+the sockets.
+The implementation ensures that packets from the same flow are delivered
+to the same socket.
+To this end it relies on the firewall to provide a flow identifier with
+each diverted packet.
+When using the
+.Xr pf 4
+firewall, this is the associated state ID if one exists; otherwise all
+packets without a state are diverted to the same socket in the group.
+Currently the
+.Xr ipfw 4
+firewall does not provide a flow identifier, so all packets are diverted
+to the same socket in the group.
+At most 32 sockets can be bound to the same port.
.Sh READING PACKETS
Packets are diverted either as they are ``incoming'' or ``outgoing.''
Incoming packets are diverted after reception on an IP interface,
diff --git a/sys/netinet/ip_divert.c b/sys/netinet/ip_divert.c
--- a/sys/netinet/ip_divert.c
+++ b/sys/netinet/ip_divert.c
@@ -36,6 +36,7 @@
#include <sys/param.h>
#include <sys/ck.h>
#include <sys/eventhandler.h>
+#include <sys/hash.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
@@ -88,7 +89,7 @@
*/
#define DIVHASHSIZE (1 << 3) /* 8 entries, one cache line. */
#define DIVHASH(port) (port % DIVHASHSIZE)
-#define DCBHASH(dcb) ((dcb)->dcb_port % DIVHASHSIZE)
+#define DCBHASH(dcb) (DIVHASH((dcb)->dcb_port))
/*
* Divert sockets work in conjunction with ipfw or other packet filters,
@@ -147,10 +148,22 @@
struct epoch_context dcb_epochctx;
};
+struct divcblbgroup { /* Group of divert sockets sharing one port for load balancing. */
+ CK_SLIST_ENTRY(divcblbgroup) dl_next; /* Linkage on a V_divlbhash bucket. */
+ struct epoch_context dl_epochctx; /* Handed to NET_EPOCH_CALL() to defer the free. */
+ uint16_t dl_port; /* Port shared by the group, as taken from sin_port. */
+ uint16_t dl_count; /* Valid dl_dcb[] slots; loaded with acquire semantics by divert_packet(). */
+#define DIVCBLBGROUP_SIZE 32 /* Maximum number of sockets in one group. */
+ struct divcb *dl_dcb[DIVCBLBGROUP_SIZE]; /* Members, packed into slots [0, dl_count). */
+};
+
CK_SLIST_HEAD(divhashhead, divcb);
+CK_SLIST_HEAD(divlbgrouphashhead, divcblbgroup);
-VNET_DEFINE_STATIC(struct divhashhead, divhash[DIVHASHSIZE]) = {};
+VNET_DEFINE_STATIC(struct divhashhead, divhash[DIVHASHSIZE]);
#define V_divhash VNET(divhash)
+VNET_DEFINE_STATIC(struct divlbgrouphashhead, divlbhash[DIVHASHSIZE]);
+#define V_divlbhash VNET(divlbhash)
VNET_DEFINE_STATIC(uint64_t, dcb_count) = 0;
#define V_dcb_count VNET(dcb_count)
VNET_DEFINE_STATIC(uint64_t, dcb_gencnt) = 0;
@@ -163,10 +176,15 @@
/*
* Divert a packet by passing it up to the divert socket at port 'port'.
+ *
+ * 'id' is an opaque identifier for the flow and is used to load-balance packets
+ * across multiple divert sockets bound to the same port. Packets with the same
+ * identifier will be delivered to the same socket.
*/
static void
-divert_packet(struct mbuf *m, bool incoming)
+divert_packet(struct mbuf *m, uint64_t id, bool incoming)
{
+ struct divcblbgroup *dlb;
struct divcb *dcb;
u_int16_t nport;
struct sockaddr_in divsrc;
@@ -272,10 +290,27 @@
sizeof(divsrc.sin_zero));
}
- /* Put packet on socket queue, if any */
- CK_SLIST_FOREACH(dcb, &V_divhash[DIVHASH(nport)], dcb_next)
- if (dcb->dcb_port == nport)
+ /*
+ * Look for a matching divert socket or socket group, and enqueue the
+ * packet.
+ */
+ CK_SLIST_FOREACH(dlb, &V_divlbhash[DIVHASH(nport)], dl_next) {
+ uint16_t count;
+
+ count = atomic_load_acq_16(&dlb->dl_count);
+ if (dlb->dl_port == nport && count > 0) {
+ uint32_t hash;
+
+ hash = jenkins_hash(&id, sizeof(uint64_t), 0);
+ dcb = dlb->dl_dcb[hash % count];
break;
+ }
+ }
+ if (dlb == NULL) {
+ CK_SLIST_FOREACH(dcb, &V_divhash[DIVHASH(nport)], dcb_next)
+ if (dcb->dcb_port == nport)
+ break;
+ }
if (dcb != NULL) {
struct socket *sa = dcb->dcb_socket;
@@ -596,6 +631,53 @@
free(dcb, M_PCB);
}
+/*
+ * Epoch callback that releases a load-balancing group.  It is scheduled
+ * through NET_EPOCH_CALL() once the last socket has left the group.
+ */
+static void
+divlbgroup_free(epoch_context_t ctx)
+{
+ struct divcblbgroup *group;
+
+ /* 'ctx' is the dl_epochctx member embedded in the group. */
+ group = __containerof(ctx, struct divcblbgroup, dl_epochctx);
+ free(group, M_PCB);
+}
+
+static void
+div_lbgroup_detach(struct divcb *dcb)
+{ /* Remove 'dcb' from the load-balancing group for its port, if it is in one. */
+ struct divcblbgroup *dlb;
+
+ CK_SLIST_FOREACH(dlb, &V_divlbhash[DCBHASH(dcb)], dl_next) {
+ if (dlb->dl_port != dcb->dcb_port)
+ continue;
+
+ /*
+ * Delicately remove the socket from its group, taking
+ * care to synchronize with lookups, which do not handle
+ * NULL slots in the group table.
+ *
+ * The vacated slot is refilled with the last member
+ * before the smaller count is published with release
+ * semantics; this pairs with the acquire load of
+ * dl_count in divert_packet().
+ *
+ * Note that the hash is not stable across different
+ * group sizes.
+ *
+ * Both callers in this file, div_detach() and
+ * div_bind(), invoke this with DIVERT_LOCK held.
+ */
+ for (int i = 0; i < dlb->dl_count; i++) {
+ unsigned int count;
+
+ if (dlb->dl_dcb[i] != dcb)
+ continue;
+
+ count = dlb->dl_count;
+ if (i != count - 1) /* Not the last slot: move the last member down. */
+ dlb->dl_dcb[i] = dlb->dl_dcb[count - 1];
+ atomic_store_rel_16(&dlb->dl_count, count - 1);
+ if (count == 1) { /* Group is now empty: unlink it and defer the free. */
+ CK_SLIST_REMOVE(&V_divlbhash[DCBHASH(dcb)], dlb,
+ divcblbgroup, dl_next);
+ NET_EPOCH_CALL(divlbgroup_free,
+ &dlb->dl_epochctx);
+ }
+ return; /* Only the first matching slot is removed. */
+ }
+ }
+}
+
static void
div_detach(struct socket *so)
{
@@ -603,8 +685,10 @@
so->so_pcb = NULL;
DIVERT_LOCK();
- if (dcb->dcb_bound != DCB_UNBOUND)
+ if (dcb->dcb_bound != DCB_UNBOUND) {
CK_SLIST_REMOVE(&V_divhash[DCBHASH(dcb)], dcb, divcb, dcb_next);
+ div_lbgroup_detach(dcb);
+ }
V_dcb_count--;
V_dcb_gencnt++;
DIVERT_UNLOCK();
@@ -614,28 +698,70 @@
static int
div_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
+ struct divcblbgroup *dlb;
struct divcb *dcb;
+ int error;
uint16_t port;
if (nam->sa_family != AF_INET)
return EAFNOSUPPORT;
if (nam->sa_len != sizeof(struct sockaddr_in))
return EINVAL;
+
+ error = 0;
+ if ((so->so_options & SO_REUSEPORT_LB) != 0)
+ dlb = malloc(sizeof(*dlb), M_PCB, M_WAITOK | M_ZERO);
+ else
+ dlb = NULL;
+
port = ((struct sockaddr_in *)nam)->sin_port;
DIVERT_LOCK();
- CK_SLIST_FOREACH(dcb, &V_divhash[DIVHASH(port)], dcb_next)
- if (dcb->dcb_port == port) {
- DIVERT_UNLOCK();
- return (EADDRINUSE);
+ if (dlb == NULL) {
+ CK_SLIST_FOREACH(dcb, &V_divhash[DIVHASH(port)], dcb_next) {
+ if (dcb->dcb_port == port) {
+ DIVERT_UNLOCK();
+ return (EADDRINUSE);
+ }
}
+ }
dcb = so->so_pcb;
- if (dcb->dcb_bound != DCB_UNBOUND)
- CK_SLIST_REMOVE(&V_divhash[DCBHASH(dcb)], dcb, divcb, dcb_next);
- dcb->dcb_port = port;
- CK_SLIST_INSERT_HEAD(&V_divhash[DIVHASH(port)], dcb, dcb_next);
+ if (dlb != NULL) {
+ struct divcblbgroup *tmp;
+
+ CK_SLIST_FOREACH(tmp, &V_divlbhash[DIVHASH(port)], dl_next) {
+ if (tmp->dl_port == port)
+ break;
+ }
+ if (tmp == NULL) {
+ dlb->dl_port = port;
+ dlb->dl_count = 1;
+ dlb->dl_dcb[0] = dcb;
+ CK_SLIST_INSERT_HEAD(&V_divlbhash[DIVHASH(port)], dlb,
+ dl_next);
+ } else if (tmp->dl_count < DIVCBLBGROUP_SIZE) {
+ KASSERT(tmp->dl_count > 0,
+ ("div_bind: lbgroup %p has count 0", tmp));
+
+ tmp->dl_dcb[tmp->dl_count] = dcb;
+ atomic_store_rel_16(&tmp->dl_count, tmp->dl_count + 1);
+ free(dlb, M_PCB);
+ } else {
+ error = ENOSPC;
+ free(dlb, M_PCB);
+ }
+ }
+ if (error == 0) {
+ if (dcb->dcb_bound != DCB_UNBOUND) {
+ CK_SLIST_REMOVE(&V_divhash[DCBHASH(dcb)], dcb, divcb,
+ dcb_next);
+ div_lbgroup_detach(dcb);
+ }
+ dcb->dcb_port = port;
+ CK_SLIST_INSERT_HEAD(&V_divhash[DIVHASH(port)], dcb, dcb_next);
+ }
DIVERT_UNLOCK();
- return (0);
+ return (error);
}
static int
diff --git a/sys/netinet/ip_var.h b/sys/netinet/ip_var.h
--- a/sys/netinet/ip_var.h
+++ b/sys/netinet/ip_var.h
@@ -324,7 +324,7 @@
#define V_ip_fw_ctl_ptr VNET(ip_fw_ctl_ptr)
/* Divert hooks. */
-extern void (*ip_divert_ptr)(struct mbuf *m, bool incoming);
+extern void (*ip_divert_ptr)(struct mbuf *m, uint64_t id, bool incoming);
/* ng_ipfw hooks -- XXX make it the same as divert and dummynet */
extern int (*ng_ipfw_input_p)(struct mbuf **, struct ip_fw_args *, bool);
extern int (*ip_dn_ctl_ptr)(struct sockopt *);
diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c
--- a/sys/netinet/raw_ip.c
+++ b/sys/netinet/raw_ip.c
@@ -96,7 +96,7 @@
int (*ip_dn_ctl_ptr)(struct sockopt *);
int (*ip_dn_io_ptr)(struct mbuf **, struct ip_fw_args *);
-void (*ip_divert_ptr)(struct mbuf *, bool);
+void (*ip_divert_ptr)(struct mbuf *, uint64_t, bool);
int (*ng_ipfw_input_p)(struct mbuf **, struct ip_fw_args *, bool);
#ifdef INET
diff --git a/sys/netpfil/ipfw/ip_fw_pfil.c b/sys/netpfil/ipfw/ip_fw_pfil.c
--- a/sys/netpfil/ipfw/ip_fw_pfil.c
+++ b/sys/netpfil/ipfw/ip_fw_pfil.c
@@ -563,7 +563,7 @@
m_tag_prepend(clone, tag);
/* Do the dirty job... */
- ip_divert_ptr(clone, args->flags & IPFW_ARGS_IN);
+ ip_divert_ptr(clone, 0, args->flags & IPFW_ARGS_IN);
return 0;
}
diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c
--- a/sys/netpfil/pf/pf.c
+++ b/sys/netpfil/pf/pf.c
@@ -11966,7 +11966,7 @@
pd.m->m_flags &= ~M_FASTFWD_OURS;
}
}
- ip_divert_ptr(*m0, dir == PF_IN);
+ ip_divert_ptr(*m0, s != NULL ? s->id : 0, dir == PF_IN);
*m0 = NULL;
return (action);
} else if (mtag == NULL) {

File Metadata

Mime Type
text/plain
Expires
Thu, Jun 18, 10:52 AM (25 m, 57 s)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
34021622
Default Alt Text
D56563.id178629.diff (9 KB)

Event Timeline