Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F159716619
D56563.id178629.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
9 KB
Referenced Files
None
Subscribers
None
D56563.id178629.diff
View Options
diff --git a/lib/libsys/getsockopt.2 b/lib/libsys/getsockopt.2
--- a/lib/libsys/getsockopt.2
+++ b/lib/libsys/getsockopt.2
@@ -25,7 +25,7 @@
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
-.Dd November 25, 2024
+.Dd April 21, 2026
.Dt GETSOCKOPT 2
.Os
.Sh NAME
@@ -220,6 +220,10 @@
listening sockets based on a hash function of local port number, and foreign IP
address and port number.
A maximum of 256 sockets can be bound to the same load-balancing group.
+.Dv PF_DIVERT
+sockets may also be bound to a group; see the
+.Xr divert 4
+manual page for details.
.Pp
.Dv SO_KEEPALIVE
enables the
diff --git a/share/man/man4/divert.4 b/share/man/man4/divert.4
--- a/share/man/man4/divert.4
+++ b/share/man/man4/divert.4
@@ -1,5 +1,4 @@
-.\"
-.Dd January 23, 2026
+.Dd April 21, 2026
.Dt DIVERT 4
.Os
.Sh NAME
@@ -57,7 +56,26 @@
.Pp
By reading from and writing to a divert socket, matching packets
can be passed through an arbitrary ``filter'' as they travel through
-the host machine, special routing tricks can be done, etc.
+the host machine, special routing tricks can be done, etc.
+.Pp
+Multiple divert sockets may be bound to the same port if the
+.Dv SO_REUSEPORT_LB
+socket option is set on all of them.
+In this case, the kernel will attempt to load-balance packets among
+the sockets.
+The implementation ensures that packets from the same flow are delivered
+to the same socket.
+To this end it relies on the firewall to provide a flow identifier with
+each diverted packet.
+When using the
+.Xr pf 4
+firewall, this is the associated state ID if one exists; otherwise all
+such packets are diverted to a single socket in the group.
+Currently the
+.Xr ipfw 4
+firewall does not provide a flow identifier, so all packets are diverted
+to a single socket in the group.
+At most 32 sockets can be bound to the same port.
.Sh READING PACKETS
Packets are diverted either as they are ``incoming'' or ``outgoing.''
Incoming packets are diverted after reception on an IP interface,
diff --git a/sys/netinet/ip_divert.c b/sys/netinet/ip_divert.c
--- a/sys/netinet/ip_divert.c
+++ b/sys/netinet/ip_divert.c
@@ -36,6 +36,7 @@
#include <sys/param.h>
#include <sys/ck.h>
#include <sys/eventhandler.h>
+#include <sys/hash.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
@@ -88,7 +89,7 @@
*/
#define DIVHASHSIZE (1 << 3) /* 8 entries, one cache line. */
#define DIVHASH(port) (port % DIVHASHSIZE)
-#define DCBHASH(dcb) ((dcb)->dcb_port % DIVHASHSIZE)
+#define DCBHASH(dcb) (DIVHASH((dcb)->dcb_port))
/*
* Divert sockets work in conjunction with ipfw or other packet filters,
@@ -147,10 +148,22 @@
struct epoch_context dcb_epochctx;
};
+/*
+ * A load-balancing group: the set of divert sockets that share one port.
+ * Groups are chained per hash bucket through dl_next.  Only the first
+ * dl_count entries of dl_dcb[] are valid.
+ */
+struct divcblbgroup {
+ CK_SLIST_ENTRY(divcblbgroup) dl_next;
+ struct epoch_context dl_epochctx; /* used to defer freeing the group */
+ uint16_t dl_port; /* port shared by the group's sockets */
+ uint16_t dl_count; /* number of valid entries in dl_dcb[] */
+#define DIVCBLBGROUP_SIZE 32
+ struct divcb *dl_dcb[DIVCBLBGROUP_SIZE];
+};
+
CK_SLIST_HEAD(divhashhead, divcb);
+CK_SLIST_HEAD(divlbgrouphashhead, divcblbgroup);
-VNET_DEFINE_STATIC(struct divhashhead, divhash[DIVHASHSIZE]) = {};
+VNET_DEFINE_STATIC(struct divhashhead, divhash[DIVHASHSIZE]);
#define V_divhash VNET(divhash)
+VNET_DEFINE_STATIC(struct divlbgrouphashhead, divlbhash[DIVHASHSIZE]);
+#define V_divlbhash VNET(divlbhash)
VNET_DEFINE_STATIC(uint64_t, dcb_count) = 0;
#define V_dcb_count VNET(dcb_count)
VNET_DEFINE_STATIC(uint64_t, dcb_gencnt) = 0;
@@ -163,10 +176,15 @@
/*
* Divert a packet by passing it up to the divert socket at port 'port'.
+ *
+ * 'id' is an opaque identifier for the flow and is used to load-balance packets
+ * across multiple divert sockets bound to the same port. Packets with the same
+ * identifier will be delivered to the same socket.
*/
static void
-divert_packet(struct mbuf *m, bool incoming)
+divert_packet(struct mbuf *m, uint64_t id, bool incoming)
{
+ struct divcblbgroup *dlb;
struct divcb *dcb;
u_int16_t nport;
struct sockaddr_in divsrc;
@@ -272,10 +290,27 @@
sizeof(divsrc.sin_zero));
}
- /* Put packet on socket queue, if any */
- CK_SLIST_FOREACH(dcb, &V_divhash[DIVHASH(nport)], dcb_next)
- if (dcb->dcb_port == nport)
+ /*
+ * Look for a matching divert socket or socket group, and enqueue the
+ * packet.
+ */
+ CK_SLIST_FOREACH(dlb, &V_divlbhash[DIVHASH(nport)], dl_next) {
+ uint16_t count;
+
+ count = atomic_load_acq_16(&dlb->dl_count);
+ if (dlb->dl_port == nport && count > 0) {
+ uint32_t hash;
+
+ hash = jenkins_hash(&id, sizeof(uint64_t), 0);
+ dcb = dlb->dl_dcb[hash % count];
break;
+ }
+ }
+ if (dlb == NULL) {
+ CK_SLIST_FOREACH(dcb, &V_divhash[DIVHASH(nport)], dcb_next)
+ if (dcb->dcb_port == nport)
+ break;
+ }
if (dcb != NULL) {
struct socket *sa = dcb->dcb_socket;
@@ -596,6 +631,53 @@
free(dcb, M_PCB);
}
+/*
+ * Epoch callback: release the load-balancing group that embeds 'ctx'.
+ */
+static void
+divlbgroup_free(epoch_context_t ctx)
+{
+ struct divcblbgroup *group;
+
+ /* 'ctx' is the dl_epochctx member; step back to the enclosing group. */
+ group = __containerof(ctx, struct divcblbgroup, dl_epochctx);
+ free(group, M_PCB);
+}
+
+/*
+ * Remove 'dcb' from the load-balancing group for its port, if it is a member
+ * of one.  If no group contains 'dcb', nothing is changed.  When the last
+ * member leaves, the group is unlinked from the hash and its free is deferred
+ * through NET_EPOCH_CALL().
+ *
+ * The callers in this file invoke this with DIVERT_LOCK held, and before
+ * dcb_port is changed, so the lookup below uses the port the socket is
+ * currently bound to.
+ */
+static void
+div_lbgroup_detach(struct divcb *dcb)
+{
+ struct divcblbgroup *dlb;
+
+ CK_SLIST_FOREACH(dlb, &V_divlbhash[DCBHASH(dcb)], dl_next) {
+ if (dlb->dl_port != dcb->dcb_port)
+ continue;
+
+ /*
+ * Delicately remove the socket from its group, taking
+ * care to synchronize with lookups, which do not handle
+ * NULL slots in the group table.
+ *
+ * Note that the hash is not stable across different
+ * group sizes.
+ */
+ for (int i = 0; i < dlb->dl_count; i++) {
+ unsigned int count;
+
+ if (dlb->dl_dcb[i] != dcb)
+ continue;
+
+ count = dlb->dl_count;
+ /*
+ * Move the last entry into the vacated slot first, and
+ * only then publish the smaller count with release
+ * semantics.  Every slot below either the old or the
+ * new count therefore holds a non-NULL pointer.
+ */
+ if (i != count - 1)
+ dlb->dl_dcb[i] = dlb->dl_dcb[count - 1];
+ atomic_store_rel_16(&dlb->dl_count, count - 1);
+ if (count == 1) {
+ /* That was the last member; retire the group. */
+ CK_SLIST_REMOVE(&V_divlbhash[DCBHASH(dcb)], dlb,
+ divcblbgroup, dl_next);
+ NET_EPOCH_CALL(divlbgroup_free,
+ &dlb->dl_epochctx);
+ }
+ return;
+ }
+ }
+}
+
static void
div_detach(struct socket *so)
{
@@ -603,8 +685,10 @@
so->so_pcb = NULL;
DIVERT_LOCK();
- if (dcb->dcb_bound != DCB_UNBOUND)
+ if (dcb->dcb_bound != DCB_UNBOUND) {
CK_SLIST_REMOVE(&V_divhash[DCBHASH(dcb)], dcb, divcb, dcb_next);
+ div_lbgroup_detach(dcb);
+ }
V_dcb_count--;
V_dcb_gencnt++;
DIVERT_UNLOCK();
@@ -614,28 +698,70 @@
static int
div_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
+ struct divcblbgroup *dlb;
struct divcb *dcb;
+ int error;
uint16_t port;
if (nam->sa_family != AF_INET)
return EAFNOSUPPORT;
if (nam->sa_len != sizeof(struct sockaddr_in))
return EINVAL;
+
+ error = 0;
+ if ((so->so_options & SO_REUSEPORT_LB) != 0)
+ dlb = malloc(sizeof(*dlb), M_PCB, M_WAITOK | M_ZERO);
+ else
+ dlb = NULL;
+
port = ((struct sockaddr_in *)nam)->sin_port;
DIVERT_LOCK();
- CK_SLIST_FOREACH(dcb, &V_divhash[DIVHASH(port)], dcb_next)
- if (dcb->dcb_port == port) {
- DIVERT_UNLOCK();
- return (EADDRINUSE);
+ if (dlb == NULL) {
+ CK_SLIST_FOREACH(dcb, &V_divhash[DIVHASH(port)], dcb_next) {
+ if (dcb->dcb_port == port) {
+ DIVERT_UNLOCK();
+ return (EADDRINUSE);
+ }
}
+ }
dcb = so->so_pcb;
- if (dcb->dcb_bound != DCB_UNBOUND)
- CK_SLIST_REMOVE(&V_divhash[DCBHASH(dcb)], dcb, divcb, dcb_next);
- dcb->dcb_port = port;
- CK_SLIST_INSERT_HEAD(&V_divhash[DIVHASH(port)], dcb, dcb_next);
+ if (dlb != NULL) {
+ struct divcblbgroup *tmp;
+
+ CK_SLIST_FOREACH(tmp, &V_divlbhash[DIVHASH(port)], dl_next) {
+ if (tmp->dl_port == port)
+ break;
+ }
+ if (tmp == NULL) {
+ dlb->dl_port = port;
+ dlb->dl_count = 1;
+ dlb->dl_dcb[0] = dcb;
+ CK_SLIST_INSERT_HEAD(&V_divlbhash[DIVHASH(port)], dlb,
+ dl_next);
+ } else if (tmp->dl_count < DIVCBLBGROUP_SIZE) {
+ KASSERT(tmp->dl_count > 0,
+ ("div_bind: lbgroup %p has count 0", tmp));
+
+ tmp->dl_dcb[tmp->dl_count] = dcb;
+ atomic_store_rel_16(&tmp->dl_count, tmp->dl_count + 1);
+ free(dlb, M_PCB);
+ } else {
+ error = ENOSPC;
+ free(dlb, M_PCB);
+ }
+ }
+ if (error == 0) {
+ if (dcb->dcb_bound != DCB_UNBOUND) {
+ CK_SLIST_REMOVE(&V_divhash[DCBHASH(dcb)], dcb, divcb,
+ dcb_next);
+ div_lbgroup_detach(dcb);
+ }
+ dcb->dcb_port = port;
+ CK_SLIST_INSERT_HEAD(&V_divhash[DIVHASH(port)], dcb, dcb_next);
+ }
DIVERT_UNLOCK();
- return (0);
+ return (error);
}
static int
diff --git a/sys/netinet/ip_var.h b/sys/netinet/ip_var.h
--- a/sys/netinet/ip_var.h
+++ b/sys/netinet/ip_var.h
@@ -324,7 +324,7 @@
#define V_ip_fw_ctl_ptr VNET(ip_fw_ctl_ptr)
/* Divert hooks. */
-extern void (*ip_divert_ptr)(struct mbuf *m, bool incoming);
+extern void (*ip_divert_ptr)(struct mbuf *m, uint64_t id, bool incoming);
/* ng_ipfw hooks -- XXX make it the same as divert and dummynet */
extern int (*ng_ipfw_input_p)(struct mbuf **, struct ip_fw_args *, bool);
extern int (*ip_dn_ctl_ptr)(struct sockopt *);
diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c
--- a/sys/netinet/raw_ip.c
+++ b/sys/netinet/raw_ip.c
@@ -96,7 +96,7 @@
int (*ip_dn_ctl_ptr)(struct sockopt *);
int (*ip_dn_io_ptr)(struct mbuf **, struct ip_fw_args *);
-void (*ip_divert_ptr)(struct mbuf *, bool);
+void (*ip_divert_ptr)(struct mbuf *, uint64_t, bool);
int (*ng_ipfw_input_p)(struct mbuf **, struct ip_fw_args *, bool);
#ifdef INET
diff --git a/sys/netpfil/ipfw/ip_fw_pfil.c b/sys/netpfil/ipfw/ip_fw_pfil.c
--- a/sys/netpfil/ipfw/ip_fw_pfil.c
+++ b/sys/netpfil/ipfw/ip_fw_pfil.c
@@ -563,7 +563,7 @@
m_tag_prepend(clone, tag);
/* Do the dirty job... */
- ip_divert_ptr(clone, args->flags & IPFW_ARGS_IN);
+ ip_divert_ptr(clone, 0, args->flags & IPFW_ARGS_IN);
return 0;
}
diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c
--- a/sys/netpfil/pf/pf.c
+++ b/sys/netpfil/pf/pf.c
@@ -11966,7 +11966,7 @@
pd.m->m_flags &= ~M_FASTFWD_OURS;
}
}
- ip_divert_ptr(*m0, dir == PF_IN);
+ ip_divert_ptr(*m0, s != NULL ? s->id : 0, dir == PF_IN);
*m0 = NULL;
return (action);
} else if (mtag == NULL) {
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Jun 18, 10:52 AM (25 m, 57 s)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
34021622
Default Alt Text
D56563.id178629.diff (9 KB)
Attached To
Mode
D56563: divert: Define semantics for SO_REUSEPORT_LB on divert sockets
Attached
Detach File
Event Timeline
Log In to Comment