Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F107284129
D20953.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
55 KB
Referenced Files
None
Subscribers
None
D20953.diff
View Options
Index: head/sys/conf/files
===================================================================
--- head/sys/conf/files
+++ head/sys/conf/files
@@ -4276,6 +4276,7 @@
netinet/tcp_output.c optional inet | inet6
netinet/tcp_offload.c optional tcp_offload inet | tcp_offload inet6
netinet/tcp_hpts.c optional tcphpts inet | tcphpts inet6
+netinet/tcp_ratelimit.c optional ratelimit inet | ratelimit inet6
netinet/tcp_pcap.c optional inet tcppcap | inet6 tcppcap \
compile-with "${NORMAL_C} ${NO_WNONNULL}"
netinet/tcp_reass.c optional inet | inet6
Index: head/sys/dev/cxgbe/adapter.h
===================================================================
--- head/sys/dev/cxgbe/adapter.h
+++ head/sys/dev/cxgbe/adapter.h
@@ -1247,6 +1247,7 @@
int cxgbe_snd_tag_query(struct m_snd_tag *, union if_snd_tag_query_params *);
void cxgbe_snd_tag_free(struct m_snd_tag *);
void cxgbe_snd_tag_free_locked(struct cxgbe_snd_tag *);
+void cxgbe_ratelimit_query(struct ifnet *, struct if_ratelimit_query_results *);
#endif
/* t4_filter.c */
Index: head/sys/dev/cxgbe/t4_main.c
===================================================================
--- head/sys/dev/cxgbe/t4_main.c
+++ head/sys/dev/cxgbe/t4_main.c
@@ -1658,6 +1658,7 @@
ifp->if_snd_tag_modify = cxgbe_snd_tag_modify;
ifp->if_snd_tag_query = cxgbe_snd_tag_query;
ifp->if_snd_tag_free = cxgbe_snd_tag_free;
+ ifp->if_ratelimit_query = cxgbe_ratelimit_query;
#endif
ifp->if_capabilities = T4_CAP;
Index: head/sys/dev/cxgbe/t4_sched.c
===================================================================
--- head/sys/dev/cxgbe/t4_sched.c
+++ head/sys/dev/cxgbe/t4_sched.c
@@ -903,4 +903,35 @@
}
mtx_unlock(&cst->lock);
}
+
+#define CXGBE_MAX_FLOWS 4000 /* Testing so far shows that's all this adapter can do */
+#define CXGBE_UNIQUE_RATE_COUNT 16 /* Number of unique rates that can be setup */
+
+void
+cxgbe_ratelimit_query(struct ifnet *ifp __unused,
+ struct if_ratelimit_query_results *q)
+{
+ /*
+ * Fill in *q with this driver's hardware pacing limits.
+ *
+ * This is a skeleton and needs future work by the driver
+ * maintainers: it should look at the specific type of
+ * interface and select appropriate values for these
+ * settings. The values here go with an earlier card (t5),
+ * which has a maximum of 16 rates that are chosen by the
+ * first requesters (thus the flags value
+ * RT_IS_SELECTABLE). If it were a fixed table we would
+ * set up a const array (example: mlx5). Note the card
+ * tested can reasonably support only 4000 flows before
+ * the adapter has issues with sending, so here we limit
+ * the number of flows using hardware pacing to that
+ * number; other cards may be able to raise or eliminate
+ * this limit.
+ */
+ q->rate_table = NULL;
+ q->flags = RT_IS_SELECTABLE;
+ q->max_flows = CXGBE_MAX_FLOWS;
+ q->number_of_rates = CXGBE_UNIQUE_RATE_COUNT;
+ q->min_segment_burst = 4; /* Driver emits 4 in a burst */
+}
#endif
Index: head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
===================================================================
--- head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
+++ head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
@@ -4070,7 +4070,49 @@
}
}
+#define NUM_HDWR_RATES_MLX 13
+static const uint64_t adapter_rates_mlx[NUM_HDWR_RATES_MLX] = {
+ 135375, /* 1,083,000 */
+ 180500, /* 1,444,000 */
+ 270750, /* 2,166,000 */
+ 361000, /* 2,888,000 */
+ 541500, /* 4,332,000 */
+ 721875, /* 5,775,000 */
+ 1082875, /* 8,663,000 */
+ 1443875, /* 11,551,000 */
+ 2165750, /* 17,326,000 */
+ 2887750, /* 23,102,000 */
+ 4331625, /* 34,653,000 */
+ 5775500, /* 46,204,000 */
+ 8663125 /* 69,305,000 */
+};
+
static void
+mlx5e_ratelimit_query(struct ifnet *ifp __unused, struct if_ratelimit_query_results *q)
+{
+ /*
+ * This function needs updating by the driver maintainer!
+ * For the MLX card there are currently (ConnectX-4?) 13
+ * pre-set rates; others (i.e. ConnectX-5, 6, 7) are unknown.
+ *
+ * This will change based on later adapters, and this
+ * code should be updated to look at ifp and figure out
+ * the specific adapter type's settings, i.e. how many
+ * rates there are and whether they are fixed (as is
+ * shown here) or dynamic (example: chelsio t4). Also,
+ * if there is a maximum number of flows that the adapter
+ * can handle, that too needs to be reflected in the
+ * max_flows field (0, as reported here, means
+ * unlimited).
+ */
+ q->rate_table = adapter_rates_mlx;
+ q->flags = RT_IS_FIXED_TABLE;
+ q->max_flows = 0; /* mlx has no limit */
+ q->number_of_rates = NUM_HDWR_RATES_MLX;
+ q->min_segment_burst = 1;
+}
+
+static void
mlx5e_snd_tag_free(struct m_snd_tag *pmt)
{
struct mlx5e_snd_tag *tag =
@@ -4155,7 +4197,9 @@
ifp->if_snd_tag_free = mlx5e_snd_tag_free;
ifp->if_snd_tag_modify = mlx5e_snd_tag_modify;
ifp->if_snd_tag_query = mlx5e_snd_tag_query;
-
+#ifdef RATELIMIT
+ ifp->if_ratelimit_query = mlx5e_ratelimit_query;
+#endif
/* set TSO limits so that we don't have to drop TX packets */
ifp->if_hw_tsomax = MLX5E_MAX_TX_PAYLOAD_SIZE - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
ifp->if_hw_tsomaxsegcount = MLX5E_MAX_TX_MBUF_FRAGS - 1 /* hdr */;
Index: head/sys/net/if_dead.c
===================================================================
--- head/sys/net/if_dead.c
+++ head/sys/net/if_dead.c
@@ -126,6 +126,23 @@
{
}
+static void
+ifdead_ratelimit_query(struct ifnet *ifp __unused,
+ struct if_ratelimit_query_results *q)
+{
+ /*
+ * This guy does not support
+ * this interface. Not sure
+ * why we would specify a
+ * flag on the interface
+ * that says we do.
+ */
+ q->rate_table = NULL;
+ q->flags = RT_NOSUPPORT;
+ q->max_flows = 0;
+ q->number_of_rates = 0;
+}
+
void
if_dead(struct ifnet *ifp)
{
@@ -142,4 +159,5 @@
ifp->if_snd_tag_modify = ifdead_snd_tag_modify;
ifp->if_snd_tag_query = ifdead_snd_tag_query;
ifp->if_snd_tag_free = ifdead_snd_tag_free;
+ ifp->if_ratelimit_query = ifdead_ratelimit_query;
}
Index: head/sys/net/if_lagg.c
===================================================================
--- head/sys/net/if_lagg.c
+++ head/sys/net/if_lagg.c
@@ -144,6 +144,8 @@
static int lagg_snd_tag_query(struct m_snd_tag *,
union if_snd_tag_query_params *);
static void lagg_snd_tag_free(struct m_snd_tag *);
+static void lagg_ratelimit_query(struct ifnet *,
+ struct if_ratelimit_query_results *);
#endif
static int lagg_setmulti(struct lagg_port *);
static int lagg_clrmulti(struct lagg_port *);
@@ -537,6 +539,7 @@
ifp->if_snd_tag_modify = lagg_snd_tag_modify;
ifp->if_snd_tag_query = lagg_snd_tag_query;
ifp->if_snd_tag_free = lagg_snd_tag_free;
+ ifp->if_ratelimit_query = lagg_ratelimit_query;
#endif
ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS;
@@ -1670,6 +1673,20 @@
free(lst, M_LAGG);
}
+static void
+lagg_ratelimit_query(struct ifnet *ifp __unused, struct if_ratelimit_query_results *q)
+{
+ /*
+ * For lagg, we have an indirect
+ * interface. The caller needs to
+ * get a ratelimit tag on the actual
+ * interface the flow will go on.
+ */
+ q->rate_table = NULL;
+ q->flags = RT_IS_INDIRECT;
+ q->max_flows = 0;
+ q->number_of_rates = 0;
+}
#endif
static int
Index: head/sys/net/if_var.h
===================================================================
--- head/sys/net/if_var.h
+++ head/sys/net/if_var.h
@@ -203,6 +203,8 @@
struct if_snd_tag_alloc_rate_limit {
struct if_snd_tag_alloc_header hdr;
uint64_t max_rate; /* in bytes/s */
+ uint32_t flags; /* M_NOWAIT or M_WAITOK */
+ uint32_t reserved; /* alignment */
};
struct if_snd_tag_rate_limit_params {
@@ -210,7 +212,7 @@
uint32_t queue_level; /* 0 (empty) .. 65535 (full) */
#define IF_SND_QUEUE_LEVEL_MIN 0
#define IF_SND_QUEUE_LEVEL_MAX 65535
- uint32_t reserved; /* padding */
+ uint32_t flags; /* M_NOWAIT or M_WAITOK */
};
union if_snd_tag_alloc_params {
@@ -229,12 +231,38 @@
struct if_snd_tag_rate_limit_params unlimited;
};
+/* Query return flags */
+#define RT_NOSUPPORT 0x00000000 /* Not supported */
+#define RT_IS_INDIRECT 0x00000001 /*
+ * Interface like a lagg, select
+ * the actual interface for
+ * capabilities.
+ */
+#define RT_IS_SELECTABLE 0x00000002 /*
+ * No rate table, you select
+ * rates and the first
+ * number_of_rates are created.
+ */
+#define RT_IS_FIXED_TABLE 0x00000004 /* A fixed table is attached */
+#define RT_IS_UNUSABLE 0x00000008 /* It is not usable for this */
+
+struct if_ratelimit_query_results {
+ const uint64_t *rate_table; /* Pointer to table if present */
+ uint32_t flags; /* Flags indicating results */
+ uint32_t max_flows; /* Max flows using, 0=unlimited */
+ uint32_t number_of_rates; /* How many unique rates can be created */
+ uint32_t min_segment_burst; /* The amount the adapter bursts at each send */
+};
+
typedef int (if_snd_tag_alloc_t)(struct ifnet *, union if_snd_tag_alloc_params *,
struct m_snd_tag **);
typedef int (if_snd_tag_modify_t)(struct m_snd_tag *, union if_snd_tag_modify_params *);
typedef int (if_snd_tag_query_t)(struct m_snd_tag *, union if_snd_tag_query_params *);
typedef void (if_snd_tag_free_t)(struct m_snd_tag *);
+typedef void (if_ratelimit_query_t)(struct ifnet *,
+ struct if_ratelimit_query_results *);
+
/*
* Structure defining a network interface.
*/
@@ -374,6 +402,7 @@
if_snd_tag_modify_t *if_snd_tag_modify;
if_snd_tag_query_t *if_snd_tag_query;
if_snd_tag_free_t *if_snd_tag_free;
+ if_ratelimit_query_t *if_ratelimit_query;
/* Ethernet PCP */
uint8_t if_pcp;
Index: head/sys/netinet/in_pcb.h
===================================================================
--- head/sys/netinet/in_pcb.h
+++ head/sys/netinet/in_pcb.h
@@ -883,8 +883,13 @@
in_sockaddr(in_port_t port, struct in_addr *addr);
void in_pcbsosetlabel(struct socket *so);
#ifdef RATELIMIT
-int in_pcbattach_txrtlmt(struct inpcb *, struct ifnet *, uint32_t, uint32_t, uint32_t);
+int
+in_pcboutput_txrtlmt_locked(struct inpcb *, struct ifnet *,
+ struct mbuf *, uint32_t);
+int in_pcbattach_txrtlmt(struct inpcb *, struct ifnet *, uint32_t, uint32_t,
+ uint32_t, struct m_snd_tag **);
void in_pcbdetach_txrtlmt(struct inpcb *);
+void in_pcbdetach_tag(struct ifnet *ifp, struct m_snd_tag *mst);
int in_pcbmodify_txrtlmt(struct inpcb *, uint32_t);
int in_pcbquery_txrtlmt(struct inpcb *, uint32_t *);
int in_pcbquery_txrlevel(struct inpcb *, uint32_t *);
Index: head/sys/netinet/in_pcb.c
===================================================================
--- head/sys/netinet/in_pcb.c
+++ head/sys/netinet/in_pcb.c
@@ -210,6 +210,22 @@
&VNET_NAME(ipport_randomtime), 0,
"Minimum time to keep sequental port "
"allocation before switching to a random one");
+
+#ifdef RATELIMIT
+counter_u64_t rate_limit_active;
+counter_u64_t rate_limit_alloc_fail;
+counter_u64_t rate_limit_set_ok;
+
+static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD, 0,
+ "IP Rate Limiting");
+SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD,
+ &rate_limit_active, "Active rate limited connections");
+SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD,
+ &rate_limit_alloc_fail, "Rate limited connection failures");
+SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD,
+ &rate_limit_set_ok, "Rate limited setting succeeded");
+#endif /* RATELIMIT */
+
#endif /* INET */
/*
@@ -3170,6 +3186,7 @@
{
union if_snd_tag_modify_params params = {
.rate_limit.max_rate = max_pacing_rate,
+ .rate_limit.flags = M_NOWAIT,
};
struct m_snd_tag *mst;
struct ifnet *ifp;
@@ -3256,7 +3273,8 @@
*/
int
in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
- uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate)
+ uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st)
+
{
union if_snd_tag_alloc_params params = {
.rate_limit.hdr.type = (max_pacing_rate == -1U) ?
@@ -3264,22 +3282,47 @@
.rate_limit.hdr.flowid = flowid,
.rate_limit.hdr.flowtype = flowtype,
.rate_limit.max_rate = max_pacing_rate,
+ .rate_limit.flags = M_NOWAIT,
};
int error;
INP_WLOCK_ASSERT(inp);
- if (inp->inp_snd_tag != NULL)
+ if (*st != NULL)
return (EINVAL);
if (ifp->if_snd_tag_alloc == NULL) {
error = EOPNOTSUPP;
} else {
error = ifp->if_snd_tag_alloc(ifp, &params, &inp->inp_snd_tag);
+
+ if (error == 0) {
+ counter_u64_add(rate_limit_set_ok, 1);
+ counter_u64_add(rate_limit_active, 1);
+ } else
+ counter_u64_add(rate_limit_alloc_fail, 1);
}
return (error);
}
+void
+in_pcbdetach_tag(struct ifnet *ifp, struct m_snd_tag *mst)
+{
+ if (ifp == NULL)
+ return;
+
+ /*
+ * If the device was detached while we still had reference(s)
+ * on the ifp, we assume if_snd_tag_free() was replaced with
+ * stubs.
+ */
+ ifp->if_snd_tag_free(mst);
+
+ /* release reference count on network interface */
+ if_rele(ifp);
+ counter_u64_add(rate_limit_active, -1);
+}
+
/*
* Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
* if any:
@@ -3300,49 +3343,12 @@
m_snd_tag_rele(mst);
}
-/*
- * This function should be called when the INP_RATE_LIMIT_CHANGED flag
- * is set in the fast path and will attach/detach/modify the TX rate
- * limit send tag based on the socket's so_max_pacing_rate value.
- */
-void
-in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
+int
+in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate)
{
- struct socket *socket;
- uint32_t max_pacing_rate;
- bool did_upgrade;
int error;
- if (inp == NULL)
- return;
-
- socket = inp->inp_socket;
- if (socket == NULL)
- return;
-
- if (!INP_WLOCKED(inp)) {
- /*
- * NOTE: If the write locking fails, we need to bail
- * out and use the non-ratelimited ring for the
- * transmit until there is a new chance to get the
- * write lock.
- */
- if (!INP_TRY_UPGRADE(inp))
- return;
- did_upgrade = 1;
- } else {
- did_upgrade = 0;
- }
-
/*
- * NOTE: The so_max_pacing_rate value is read unlocked,
- * because atomic updates are not required since the variable
- * is checked at every mbuf we send. It is assumed that the
- * variable read itself will be atomic.
- */
- max_pacing_rate = socket->so_max_pacing_rate;
-
- /*
* If the existing send tag is for the wrong interface due to
* a route change, first drop the existing tag. Set the
* CHANGED flag so that we will keep trying to allocate a new
@@ -3376,13 +3382,61 @@
error = EAGAIN;
} else {
error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
- mb->m_pkthdr.flowid, max_pacing_rate);
+ mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag);
}
} else {
error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
}
if (error == 0 || error == EOPNOTSUPP)
inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
+
+ return (error);
+}
+
+/*
+ * This function should be called when the INP_RATE_LIMIT_CHANGED flag
+ * is set in the fast path and will attach/detach/modify the TX rate
+ * limit send tag based on the socket's so_max_pacing_rate value.
+ */
+void
+in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
+{
+ struct socket *socket;
+ uint32_t max_pacing_rate;
+ bool did_upgrade;
+ int error;
+
+ if (inp == NULL)
+ return;
+
+ socket = inp->inp_socket;
+ if (socket == NULL)
+ return;
+
+ if (!INP_WLOCKED(inp)) {
+ /*
+ * NOTE: If the write locking fails, we need to bail
+ * out and use the non-ratelimited ring for the
+ * transmit until there is a new chance to get the
+ * write lock.
+ */
+ if (!INP_TRY_UPGRADE(inp))
+ return;
+ did_upgrade = 1;
+ } else {
+ did_upgrade = 0;
+ }
+
+ /*
+ * NOTE: The so_max_pacing_rate value is read unlocked,
+ * because atomic updates are not required since the variable
+ * is checked at every mbuf we send. It is assumed that the
+ * variable read itself will be atomic.
+ */
+ max_pacing_rate = socket->so_max_pacing_rate;
+
+ error = in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate);
+
if (did_upgrade)
INP_DOWNGRADE(inp);
}
@@ -3424,4 +3478,14 @@
if (did_upgrade)
INP_DOWNGRADE(inp);
}
+
+static void
+rl_init(void *st)
+{
+ rate_limit_active = counter_u64_alloc(M_WAITOK);
+ rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK);
+ rate_limit_set_ok = counter_u64_alloc(M_WAITOK);
+}
+
+SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL);
#endif /* RATELIMIT */
Index: head/sys/netinet/tcp_ratelimit.h
===================================================================
--- head/sys/netinet/tcp_ratelimit.h
+++ head/sys/netinet/tcp_ratelimit.h
@@ -0,0 +1,141 @@
+/*-
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 2018-2019
+ * Netflix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * __FBSDID("$FreeBSD$");
+ *
+ */
+/**
+ * Author: Randall Stewart <rrs@netflix.com>
+ */
+#ifndef __tcp_ratelimit_h__
+#define __tcp_ratelimit_h__
+
+struct m_snd_tag;
+
+/* Flags on an individual rate */
+#define HDWRPACE_INITED 0x0001
+#define HDWRPACE_TAGPRESENT 0x0002
+#define HDWRPACE_IFPDEPARTED 0x0004
+struct tcp_hwrate_limit_table {
+ const struct tcp_rate_set *ptbl; /* Pointer to parent table */
+ struct m_snd_tag *tag; /* Send tag if needed (chelsio) */
+ uint64_t rate; /* Rate we get in Bytes per second (Bps) */
+ uint32_t time_between; /* Time-Gap between packets at this rate */
+ uint32_t flags;
+};
+
+/* Rateset flags */
+#define RS_IS_DEFF 0x0001 /* It's a lagg, do a double lookup */
+#define RS_IS_INTF 0x0002 /* It's a plain interface */
+#define RS_NO_PRE 0x0004 /* The interface has set rates */
+#define RS_INT_TBL 0x0010 /*
+ * The table is the internal version
+ * which has special setup requirements.
+ */
+#define RS_IS_DEAD 0x0020 /* The RS is on the dead list */
+#define RS_FUNERAL_SCHD 0x0040 /* Is an epoch call scheduled to bury this guy? */
+#define RS_INTF_NO_SUP 0x0100 /* The interface does not support the ratelimiting */
+
+struct tcp_rate_set {
+ struct sysctl_ctx_list sysctl_ctx;
+ CK_LIST_ENTRY(tcp_rate_set) next;
+ struct ifnet *rs_ifp;
+ struct tcp_hwrate_limit_table *rs_rlt;
+ uint64_t rs_flows_using;
+ uint64_t rs_flow_limit;
+ uint32_t rs_if_dunit;
+ int rs_rate_cnt;
+ int rs_min_seg;
+ int rs_highest_valid;
+ int rs_lowest_valid;
+ int rs_disable;
+ int rs_flags;
+ struct epoch_context rs_epoch_ctx;
+};
+
+CK_LIST_HEAD(head_tcp_rate_set, tcp_rate_set);
+
+/* Request flags */
+#define RS_PACING_EXACT_MATCH 0x0001 /* Need an exact match for rate */
+#define RS_PACING_GT 0x0002 /* Greater than requested */
+#define RS_PACING_GEQ 0x0004 /* Greater than or equal to */
+#define RS_PACING_LT 0x0008 /* Less than requested rate */
+#define RS_PACING_SUB_OK 0x0010 /* If a rate can't be found get the
+ * next best rate (highest or lowest). */
+#ifdef RATELIMIT
+#ifdef _KERNEL
+#define DETAILED_RATELIMIT_SYSCTL 1 /*
+ * Undefine this if you don't want
+ * detailed rates to appear in
+ * net.inet.tcp.rl.
+ * With the definition each rate
+ * shows up in your sysctl tree;
+ * this can be big.
+ */
+
+const struct tcp_hwrate_limit_table *
+tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
+ uint64_t bytes_per_sec, int flags, int *error);
+
+const struct tcp_hwrate_limit_table *
+tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
+ struct tcpcb *tp, struct ifnet *ifp,
+ uint64_t bytes_per_sec, int flags, int *error);
+void
+tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte,
+ struct tcpcb *tp);
+#else
+static inline const struct tcp_hwrate_limit_table *
+tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
+ uint64_t bytes_per_sec, int flags, int *error)
+{
+ if (error)
+ *error = EOPNOTSUPP;
+ return (NULL);
+}
+
+static inline const struct tcp_hwrate_limit_table *
+tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
+ struct tcpcb *tp, struct ifnet *ifp,
+ uint64_t bytes_per_sec, int flags, int *error)
+{
+ if (error)
+ *error = EOPNOTSUPP;
+ return (NULL);
+}
+
+static inline void
+tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte,
+ struct tcpcb *tp)
+{
+ return;
+}
+
+#endif
+#endif
+#endif
Index: head/sys/netinet/tcp_ratelimit.c
===================================================================
--- head/sys/netinet/tcp_ratelimit.c
+++ head/sys/netinet/tcp_ratelimit.c
@@ -0,0 +1,1234 @@
+/*-
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 2018-2019
+ * Netflix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+/**
+ * Author: Randall Stewart <rrs@netflix.com>
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_tcpdebug.h"
+#include "opt_ratelimit.h"
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#ifdef KERN_TLS
+#include <sys/sockbuf_tls.h>
+#endif
+#include <sys/sysctl.h>
+#include <sys/eventhandler.h>
+#include <sys/mutex.h>
+#include <sys/ck.h>
+#define TCPSTATES /* for logging */
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/tcp_var.h>
+#ifdef INET6
+#include <netinet6/tcp6_var.h>
+#endif
+#include <netinet/tcp_ratelimit.h>
+#ifndef USECS_IN_SECOND
+#define USECS_IN_SECOND 1000000
+#endif
+/*
+ * For the purposes of each send, what is the size
+ * of an ethernet frame.
+ */
+#ifndef ETHERNET_SEGMENT_SIZE
+#define ETHERNET_SEGMENT_SIZE 1500
+#endif
+MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory");
+#ifdef RATELIMIT
+
+#define COMMON_RATE 180500
+uint64_t desired_rates[] = {
+ 62500, /* 500Kbps */
+ 180500, /* 1.44Mbps */
+ 375000, /* 3Mbps */
+ 500000, /* 4Mbps */
+ 625000, /* 5Mbps */
+ 750000, /* 6Mbps */
+ 1000000, /* 8Mbps */
+ 1250000, /* 10Mbps */
+ 2500000, /* 20Mbps */
+ 3750000, /* 30Mbps */
+ 5000000, /* 40Mbps */
+ 6250000, /* 50Mbps */
+ 12500000, /* 100Mbps */
+ 25000000, /* 200Mbps */
+ 50000000, /* 400Mbps */
+ 100000000, /* 800Mbps */
+ 12500, /* 100kbps */
+ 25000, /* 200kbps */
+ 875000, /* 7Mbps */
+ 1125000, /* 9Mbps */
+ 1875000, /* 15Mbps */
+ 3125000, /* 25Mbps */
+ 8125000, /* 65Mbps */
+ 10000000, /* 80Mbps */
+ 18750000, /* 150Mbps */
+ 20000000, /* 160Mbps; NOTE(review): was labelled 250Mbps (= 31250000) - confirm intent */
+ 37500000, /* 300Mbps; NOTE(review): was labelled 350Mbps (= 43750000) - confirm intent */
+ 62500000, /* 500Mbps */
+ 78125000, /* 625Mbps */
+ 125000000, /* 1Gbps */
+};
+#define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t))
+#define RS_ORDERED_COUNT 16 /*
+ * Number that are in order
+ * at the beginning of the table,
+ * over this a sort is required.
+ */
+#define RS_NEXT_ORDER_GROUP 16 /*
+ * The point in our table where
+ * we come fill in a second ordered
+ * group (index wise means -1).
+ */
+#define ALL_HARDWARE_RATES 1004 /*
+ * 1Meg - 1Gig in 1 Meg steps
+ * plus 100, 200k and 500k and
+ * 10Gig
+ */
+
+#define RS_ONE_MEGABIT_PERSEC 1000000
+#define RS_ONE_GIGABIT_PERSEC 1000000000
+#define RS_TEN_GIGABIT_PERSEC 10000000000
+
+static struct head_tcp_rate_set int_rs;
+static struct mtx rs_mtx;
+uint32_t rs_number_alive;
+uint32_t rs_number_dead;
+
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW, 0,
+ "TCP Ratelimit stats");
+SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, alive, CTLFLAG_RW,
+ &rs_number_alive, 0,
+ "Number of interfaces initialized for ratelimiting");
+SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW,
+ &rs_number_dead, 0,
+ "Number of interfaces departing from ratelimiting");
+
+static void
+rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs)
+{
+ /*
+ * Add the sysctl entries for this interface (rs) under rl_sysctl_root.
+ */
+ if (rs->rs_flags & RS_INTF_NO_SUP) { /* unsupported: "disable" is read-only */
+ SYSCTL_ADD_S32(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_sysctl_root),
+ OID_AUTO, "disable", CTLFLAG_RD,
+ &rs->rs_disable, 0,
+ "Disable this interface from new hdwr limiting?");
+ } else {
+ SYSCTL_ADD_S32(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_sysctl_root),
+ OID_AUTO, "disable", CTLFLAG_RW,
+ &rs->rs_disable, 0,
+ "Disable this interface from new hdwr limiting?");
+ }
+ SYSCTL_ADD_S32(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_sysctl_root),
+ OID_AUTO, "minseg", CTLFLAG_RW,
+ &rs->rs_min_seg, 0,
+ "What is the minimum we need to send on this interface?");
+ SYSCTL_ADD_U64(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_sysctl_root),
+ OID_AUTO, "flow_limit", CTLFLAG_RW,
+ &rs->rs_flow_limit, 0,
+ "What is the limit for number of flows (0=unlimited)?");
+ SYSCTL_ADD_S32(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_sysctl_root),
+ OID_AUTO, "highest", CTLFLAG_RD,
+ &rs->rs_highest_valid, 0,
+ "Highest valid rate");
+ SYSCTL_ADD_S32(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_sysctl_root),
+ OID_AUTO, "lowest", CTLFLAG_RD,
+ &rs->rs_lowest_valid, 0,
+ "Lowest valid rate");
+ SYSCTL_ADD_S32(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_sysctl_root),
+ OID_AUTO, "flags", CTLFLAG_RD,
+ &rs->rs_flags, 0,
+ "What flags are on the entry?");
+ SYSCTL_ADD_S32(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_sysctl_root),
+ OID_AUTO, "numrates", CTLFLAG_RD,
+ &rs->rs_rate_cnt, 0,
+ "How many rates are there?");
+ SYSCTL_ADD_U64(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_sysctl_root),
+ OID_AUTO, "flows_using", CTLFLAG_RD,
+ &rs->rs_flows_using, 0,
+ "How many flows are using this interface now?");
+#ifdef DETAILED_RATELIMIT_SYSCTL
+ if (rs->rs_rlt && rs->rs_rate_cnt > 0) {
+ /* Let's display the rates: one "rate.<index>" node per table entry */
+ int i;
+ struct sysctl_oid *rl_rates;
+ struct sysctl_oid *rl_rate_num;
+ char rate_num[16];
+ rl_rates = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_sysctl_root),
+ OID_AUTO,
+ "rate",
+ CTLFLAG_RW, 0,
+ "Ratelist");
+ for( i = 0; i < rs->rs_rate_cnt; i++) {
+ snprintf(rate_num, sizeof(rate_num), "%d", i); /* bounded: node name is the index */
+ rl_rate_num = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_rates),
+ OID_AUTO,
+ rate_num,
+ CTLFLAG_RW, 0,
+ "Individual Rate");
+ SYSCTL_ADD_U32(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_rate_num),
+ OID_AUTO, "flags", CTLFLAG_RD,
+ &rs->rs_rlt[i].flags, 0,
+ "Flags on this rate");
+ SYSCTL_ADD_U32(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_rate_num),
+ OID_AUTO, "pacetime", CTLFLAG_RD,
+ &rs->rs_rlt[i].time_between, 0,
+ "Time hardware inserts between 1500 byte sends");
+ SYSCTL_ADD_U64(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_rate_num),
+ OID_AUTO, "rate", CTLFLAG_RD,
+ &rs->rs_rlt[i].rate, 0,
+ "Rate in bytes per second");
+ }
+ }
+#endif
+}
+
+static void
+rs_destroy(epoch_context_t ctx)
+{
+ struct tcp_rate_set *rs;
+ /* Free the rate set that embeds ctx, but only if no flows still use it. */
+ rs = __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx);
+ mtx_lock(&rs_mtx);
+ rs->rs_flags &= ~RS_FUNERAL_SCHD;
+ if (rs->rs_flows_using == 0) {
+ /*
+ * In theory it's possible (but unlikely)
+ * that while the delete was occurring
+ * and we were applying the DEAD flag
+ * someone slipped in and found the
+ * interface in a lookup. While we
+ * decided rs_flows_using was 0 and were
+ * scheduling the epoch_call, the other
+ * thread incremented rs_flows_using. This
+ * is because users have a pointer and
+ * we only use rs_flows_using in an
+ * atomic fashion, i.e. the other entities
+ * are not protected. To ensure this did
+ * not occur, we check rs_flows_using here
+ * before deleting.
+ */
+ sysctl_ctx_free(&rs->sysctl_ctx);
+ free(rs->rs_rlt, M_TCPPACE);
+ free(rs, M_TCPPACE);
+ rs_number_dead--;
+ }
+ mtx_unlock(&rs_mtx);
+
+}
+
+extern counter_u64_t rate_limit_set_ok;
+extern counter_u64_t rate_limit_active;
+extern counter_u64_t rate_limit_alloc_fail;
+
+static int
+rl_attach_txrtlmt(struct ifnet *ifp,
+ uint32_t flowtype,
+ int flowid,
+ uint64_t cfg_rate,
+ struct m_snd_tag **tag)
+{
+ int error;
+ union if_snd_tag_alloc_params params = {
+ .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
+ .rate_limit.hdr.flowid = flowid,
+ .rate_limit.hdr.flowtype = flowtype,
+ .rate_limit.max_rate = cfg_rate,
+ .rate_limit.flags = M_NOWAIT,
+ };
+ /* Ask ifp for a rate-limit send tag at cfg_rate; returns 0 or an errno and counts the outcome. */
+ if (ifp->if_snd_tag_alloc == NULL) {
+ error = EOPNOTSUPP;
+ } else {
+ error = ifp->if_snd_tag_alloc(ifp, &params, tag);
+ if (error == 0) {
+ if_ref((*tag)->ifp); /* take an ifnet reference for the new tag */
+ counter_u64_add(rate_limit_set_ok, 1);
+ counter_u64_add(rate_limit_active, 1);
+ } else
+ counter_u64_add(rate_limit_alloc_fail, 1);
+ }
+ return (error);
+}
+
+static void
+populate_canned_table(struct tcp_rate_set *rs, const uint64_t *rate_table_act)
+{
+ /*
+ * Fill rs->rs_rlt[] from the internal table, which is
+ * "special": it is two separate ordered tables that
+ * must be merged. We get here when the adapter
+ * specifies a number of rates that covers both
+ * ranges in the table in some form. rate_table_act
+ * is the table being merged.
+ */
+ int i, at_low, at_high;
+ uint8_t low_disabled = 0, high_disabled = 0;
+ /* Each slot takes the smaller head of the two groups until one runs out. */
+ for(i = 0, at_low = 0, at_high = RS_NEXT_ORDER_GROUP; i < rs->rs_rate_cnt; i++) {
+ rs->rs_rlt[i].flags = 0;
+ rs->rs_rlt[i].time_between = 0;
+ if ((low_disabled == 0) &&
+ (high_disabled ||
+ (rate_table_act[at_low] < rate_table_act[at_high]))) {
+ rs->rs_rlt[i].rate = rate_table_act[at_low];
+ at_low++;
+ if (at_low == RS_NEXT_ORDER_GROUP)
+ low_disabled = 1;
+ } else if (high_disabled == 0) {
+ rs->rs_rlt[i].rate = rate_table_act[at_high];
+ at_high++;
+ if (at_high == MAX_HDWR_RATES)
+ high_disabled = 1;
+ }
+ }
+}
+
+/*
+ * Build the rate set for an interface and publish it on int_rs.
+ *
+ * The driver is asked (through if_ratelimit_query) what it offers:
+ *  - RT_IS_UNUSABLE:    a disabled placeholder (RS_INTF_NO_SUP) is published.
+ *  - RT_IS_INDIRECT:    a placeholder (RS_IS_DEFF) is published.
+ *  - RT_IS_FIXED_TABLE: the driver's rate table is copied and marked
+ *                       RS_NO_PRE, so no send tags are set up here.
+ *  - RT_IS_SELECTABLE:  rates come from desired_rates (or the generated
+ *                       full table) and a send tag is attached per rate.
+ *
+ * Must be entered with rs_mtx held.  The mutex is dropped and re-taken
+ * around rl_add_syctl_entries().
+ *
+ * Returns the new rate set, or NULL on failure.  When error is non-NULL it
+ * is set for allocation failures, an empty rate list and send-tag attach
+ * failures.  An attach failure on a rate other than the highest still
+ * returns a rate set, with *error holding the attach error.  *error is left
+ * untouched when the driver has no query hook or reports no known type.
+ */
+static struct tcp_rate_set *
+rt_setup_new_rs(struct ifnet *ifp, int *error)
+{
+	struct tcp_rate_set *rs;
+	const uint64_t *rate_table_act;
+	uint64_t lentim, res;
+	size_t sz;
+	uint32_t hash_type;
+	int i;
+	struct if_ratelimit_query_results rl;
+	struct sysctl_oid *rl_sysctl_root;
+	/*
+	 * We expect to enter with the
+	 * mutex locked.
+	 */
+
+	if (ifp->if_ratelimit_query == NULL) {
+		/*
+		 * We can do nothing if we cannot
+		 * get a query back from the driver.
+		 */
+		return (NULL);
+	}
+	rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO);
+	if (rs == NULL) {
+		if (error)
+			*error = ENOMEM;
+		return (NULL);
+	}
+	rl.flags = RT_NOSUPPORT;
+	ifp->if_ratelimit_query(ifp, &rl);
+	if (rl.flags & RT_IS_UNUSABLE) {
+		/*
+		 * The interface does not really support
+		 * the rate-limiting.
+		 */
+		memset(rs, 0, sizeof(struct tcp_rate_set));
+		rs->rs_ifp = ifp;
+		rs->rs_if_dunit = ifp->if_dunit;
+		rs->rs_flags = RS_INTF_NO_SUP;
+		rs->rs_disable = 1;
+		rs_number_alive++;
+		sysctl_ctx_init(&rs->sysctl_ctx);
+		rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
+		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
+		    OID_AUTO,
+		    rs->rs_ifp->if_xname,
+		    CTLFLAG_RW, 0,
+		    "");
+		CK_LIST_INSERT_HEAD(&int_rs, rs, next);
+		/* Unlock to allow the sysctl stuff to allocate */
+		mtx_unlock(&rs_mtx);
+		rl_add_syctl_entries(rl_sysctl_root, rs);
+		/* re-lock for our caller */
+		mtx_lock(&rs_mtx);
+		return (rs);
+	} else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) {
+		memset(rs, 0, sizeof(struct tcp_rate_set));
+		rs->rs_ifp = ifp;
+		rs->rs_if_dunit = ifp->if_dunit;
+		rs->rs_flags = RS_IS_DEFF;
+		rs_number_alive++;
+		sysctl_ctx_init(&rs->sysctl_ctx);
+		rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
+		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
+		    OID_AUTO,
+		    rs->rs_ifp->if_xname,
+		    CTLFLAG_RW, 0,
+		    "");
+		CK_LIST_INSERT_HEAD(&int_rs, rs, next);
+		/* Unlock to allow the sysctl stuff to allocate */
+		mtx_unlock(&rs_mtx);
+		rl_add_syctl_entries(rl_sysctl_root, rs);
+		/* re-lock for our caller */
+		mtx_lock(&rs_mtx);
+		return (rs);
+	} else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) {
+		/* Mellanox most likely */
+		rs->rs_ifp = ifp;
+		rs->rs_if_dunit = ifp->if_dunit;
+		rs->rs_rate_cnt = rl.number_of_rates;
+		rs->rs_min_seg = rl.min_segment_burst;
+		rs->rs_highest_valid = 0;
+		rs->rs_flow_limit = rl.max_flows;
+		rs->rs_flags = RS_IS_INTF | RS_NO_PRE;
+		rs->rs_disable = 0;
+		rate_table_act = rl.rate_table;
+	} else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) {
+		/* Chelsio */
+		rs->rs_ifp = ifp;
+		rs->rs_if_dunit = ifp->if_dunit;
+		rs->rs_rate_cnt = rl.number_of_rates;
+		rs->rs_min_seg = rl.min_segment_burst;
+		rs->rs_disable = 0;
+		rs->rs_flow_limit = rl.max_flows;
+		rate_table_act = desired_rates;
+		if ((rs->rs_rate_cnt > MAX_HDWR_RATES) &&
+		    (rs->rs_rate_cnt < ALL_HARDWARE_RATES)) {
+			/*
+			 * Our desired table is not big
+			 * enough, do what we can.
+			 */
+			rs->rs_rate_cnt = MAX_HDWR_RATES;
+		}
+		if (rs->rs_rate_cnt <= RS_ORDERED_COUNT)
+			rs->rs_flags = RS_IS_INTF;
+		else
+			rs->rs_flags = RS_IS_INTF | RS_INT_TBL;
+		if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)
+			rs->rs_rate_cnt = ALL_HARDWARE_RATES;
+	} else {
+		printf("Interface:%s unit:%d not one known to have rate-limits\n",
+		    ifp->if_dname,
+		    ifp->if_dunit);
+		free(rs, M_TCPPACE);
+		return (NULL);
+	}
+	if (rs->rs_rate_cnt == 0) {
+		/*
+		 * No rates at all: everything below indexes
+		 * rs_rlt[rs_rate_cnt - 1], which would be out of bounds.
+		 */
+		if (error)
+			*error = EINVAL;
+		goto bail;
+	}
+	sz = sizeof(struct tcp_hwrate_limit_table) * rs->rs_rate_cnt;
+	/*
+	 * Zero the table.  Tag setup below may stop early, and the full-table
+	 * path does not initialise flags itself; entries that were never
+	 * reached must read as "not inited, no tag" to the lookup, departure
+	 * and shutdown code.
+	 */
+	rs->rs_rlt = malloc(sz, M_TCPPACE, M_NOWAIT | M_ZERO);
+	if (rs->rs_rlt == NULL) {
+		if (error)
+			*error = ENOMEM;
+		goto bail;
+	}
+	if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) {
+		/*
+		 * The interface supports all
+		 * the rates we could possibly want.
+		 */
+		uint64_t rat;
+
+		rs->rs_rlt[0].rate = 12500;	/* 100k */
+		rs->rs_rlt[1].rate = 25000;	/* 200k */
+		rs->rs_rlt[2].rate = 62500;	/* 500k */
+		/* Note 125000 == 1Megabit
+		 * populate 1Meg - 1000meg.
+		 */
+		for(i = 3, rat = 125000; i< (ALL_HARDWARE_RATES-1); i++) {
+			rs->rs_rlt[i].rate = rat;
+			rat += 125000;
+		}
+		rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate = 1250000000;
+	} else if (rs->rs_flags & RS_INT_TBL) {
+		/* We populate this in a special way */
+		populate_canned_table(rs, rate_table_act);
+	} else {
+		/*
+		 * Just copy in the rates from
+		 * the table, it is in order.
+		 */
+		for (i=0; i<rs->rs_rate_cnt; i++) {
+			rs->rs_rlt[i].rate = rate_table_act[i];
+			rs->rs_rlt[i].time_between = 0;
+			rs->rs_rlt[i].flags = 0;
+		}
+	}
+	for (i = (rs->rs_rate_cnt - 1); i >= 0; i--) {
+		/*
+		 * We go backwards through the list so that if we can't get
+		 * a rate and fail to init one, we have at least a chance of
+		 * getting the highest one.
+		 */
+		rs->rs_rlt[i].ptbl = rs;
+		rs->rs_rlt[i].tag = NULL;
+		/*
+		 * Calculate the time between.
+		 */
+		lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
+		res = lentim / rs->rs_rlt[i].rate;
+		if (res > 0)
+			rs->rs_rlt[i].time_between = res;
+		else
+			rs->rs_rlt[i].time_between = 1;
+		if (rs->rs_flags & RS_NO_PRE) {
+			rs->rs_rlt[i].flags = HDWRPACE_INITED;
+			rs->rs_lowest_valid = i;
+		} else {
+			int err;
+#ifdef RSS
+			hash_type = M_HASHTYPE_RSS_TCP_IPV4;
+#else
+			hash_type = M_HASHTYPE_OPAQUE_HASH;
+#endif
+			err = rl_attach_txrtlmt(ifp,
+			    hash_type,
+			    (i + 1),
+			    rs->rs_rlt[i].rate,
+			    &rs->rs_rlt[i].tag);
+			if (err) {
+				if (error)
+					*error = err;
+				if (i == (rs->rs_rate_cnt - 1)) {
+					/*
+					 * Huh - first rate and we can't get
+					 * it?
+					 */
+					goto bail;
+				}
+				/* Keep the higher rates we already have. */
+				break;
+			} else {
+				rs->rs_rlt[i].flags = HDWRPACE_INITED | HDWRPACE_TAGPRESENT;
+				rs->rs_lowest_valid = i;
+			}
+		}
+	}
+	/* Did we get at least 1 rate? */
+	if (rs->rs_rlt[(rs->rs_rate_cnt - 1)].flags & HDWRPACE_INITED)
+		rs->rs_highest_valid = rs->rs_rate_cnt - 1;
+	else
+		goto bail;
+	rs_number_alive++;
+	CK_LIST_INSERT_HEAD(&int_rs, rs, next);
+	sysctl_ctx_init(&rs->sysctl_ctx);
+	rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
+	    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
+	    OID_AUTO,
+	    rs->rs_ifp->if_xname,
+	    CTLFLAG_RW, 0,
+	    "");
+	/* Unlock to allow the sysctl stuff to allocate */
+	mtx_unlock(&rs_mtx);
+	rl_add_syctl_entries(rl_sysctl_root, rs);
+	/* re-lock for our caller */
+	mtx_lock(&rs_mtx);
+	return (rs);
+bail:
+	/* rs_rlt is NULL until the table is allocated; free(NULL) is a no-op. */
+	free(rs->rs_rlt, M_TCPPACE);
+	free(rs, M_TCPPACE);
+	return (NULL);
+}
+
+/*
+ * Rate lookup for a rate set that carries the full generated table.
+ *
+ * The lookup computes the index directly instead of scanning.  It assumes
+ * the layout built for the ALL_HARDWARE_RATES case: entries 0-2 are the
+ * sub-megabit rates, entry 3 is 1 Mbps and each following entry is 1 Mbps
+ * higher (so index == Mbps + 2), with the last entry being the top rate.
+ *
+ * bytes_per_sec is the requested rate; flags selects the matching rule
+ * (RS_PACING_LT, RS_PACING_EXACT_MATCH, otherwise greater-than, optionally
+ * with RS_PACING_GEQ) and whether a substitute is acceptable
+ * (RS_PACING_SUB_OK).
+ *
+ * Returns the chosen entry, or NULL when nothing qualifies.
+ */
+static const struct tcp_hwrate_limit_table *
+tcp_int_find_suitable_rate(const struct tcp_rate_set *rs,
+    uint64_t bytes_per_sec, uint32_t flags)
+{
+	struct tcp_hwrate_limit_table *arte = NULL, *rte = NULL;
+	uint64_t mbits_per_sec, ind_calc;
+	int i;
+
+	/* Despite the name this is the request in bits per second. */
+	mbits_per_sec = (bytes_per_sec * 8);
+	if (flags & RS_PACING_LT) {
+		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
+		    (rs->rs_lowest_valid <= 2)){
+			/*
+			 * Smaller than 1Meg, only
+			 * 3 entries can match it.
+			 */
+			for(i = rs->rs_lowest_valid; i < 3; i++) {
+				if (bytes_per_sec <= rs->rs_rlt[i].rate) {
+					rte = &rs->rs_rlt[i];
+					break;
+				} else if (rs->rs_rlt[i].flags & HDWRPACE_INITED) {
+					arte = &rs->rs_rlt[i];
+				}
+			}
+			goto done;
+		} else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
+		    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){
+			/*
+			 * Larger than 1G (the majority of
+			 * our table.
+			 */
+			if (mbits_per_sec < RS_TEN_GIGABIT_PERSEC)
+				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
+			else
+				arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
+			goto done;
+		}
+		/*
+		 * If we reach here its in our table (between 1Meg - 1000Meg),
+		 * just take the rounded down mbits per second, and add
+		 * 1Megabit to it, from this we can calculate
+		 * the index in the table.
+		 */
+		ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
+		if ((ind_calc * RS_ONE_MEGABIT_PERSEC) != mbits_per_sec)
+			ind_calc++;
+		/* our table is offset by 3, we add 2 */
+		ind_calc += 2;
+		if (ind_calc > (ALL_HARDWARE_RATES-1)) {
+			/* This should not happen */
+			ind_calc = ALL_HARDWARE_RATES-1;
+		}
+		if ((ind_calc >= rs->rs_lowest_valid) &&
+		    (ind_calc <= rs->rs_highest_valid))
+			rte = &rs->rs_rlt[ind_calc];
+	} else if (flags & RS_PACING_EXACT_MATCH) {
+		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
+		    (rs->rs_lowest_valid <= 2)){
+			for(i = rs->rs_lowest_valid; i < 3; i++) {
+				if (bytes_per_sec == rs->rs_rlt[i].rate) {
+					rte = &rs->rs_rlt[i];
+					break;
+				}
+			}
+		} else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
+		    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
+			/* > 1Gbps only one rate */
+			if (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) {
+				/* Its 10G wow */
+				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
+			}
+		} else {
+			/* Ok it must be a exact meg (its between 1G and 1Meg) */
+			ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
+			if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
+				/* its an exact Mbps */
+				ind_calc += 2;
+				if (ind_calc > (ALL_HARDWARE_RATES-1)) {
+					/* This should not happen */
+					ind_calc = ALL_HARDWARE_RATES-1;
+				}
+				if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED)
+					rte = &rs->rs_rlt[ind_calc];
+			}
+		}
+	} else {
+		/* we want greater than the requested rate */
+		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
+		    (rs->rs_lowest_valid <= 2)){
+			arte = &rs->rs_rlt[3];	/* set alternate to 1Meg */
+			for (i=2; i>=rs->rs_lowest_valid; i--) {
+				if (bytes_per_sec < rs->rs_rlt[i].rate) {
+					rte = &rs->rs_rlt[i];
+					break;
+				} else if ((flags & RS_PACING_GEQ) &&
+				    (bytes_per_sec == rs->rs_rlt[i].rate)) {
+					rte = &rs->rs_rlt[i];
+					break;
+				} else {
+					arte = &rs->rs_rlt[i];	/* new alternate */
+				}
+			}
+		} else if (mbits_per_sec > RS_ONE_GIGABIT_PERSEC) {
+			if ((bytes_per_sec < rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
+			    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){
+				/* Our top rate is larger than the request */
+				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
+			} else if ((flags & RS_PACING_GEQ) &&
+			    (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
+			    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
+				/* It matches our top rate */
+				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
+			} else if (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED) {
+				/* The top rate is an alternative */
+				arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
+			}
+		} else {
+			/*
+			 * Its in our range 1Meg - 1Gig.  Round the request up
+			 * to a whole megabit and convert that to a table
+			 * index (Mbps + 2).  An exact-megabit request lands
+			 * on the entry of equal rate and anything else on the
+			 * next higher entry, which is what RS_PACING_GEQ
+			 * needs, so GEQ gets no separate path: indexing by
+			 * the raw Mbps value would select an entry 2 Mbps
+			 * below the request.
+			 *
+			 * NOTE(review): a strict greater-than request that is
+			 * an exact megabit also lands on the equal-rate entry;
+			 * confirm whether it should step to the next one.
+			 */
+			ind_calc = (mbits_per_sec + (RS_ONE_MEGABIT_PERSEC-1))/RS_ONE_MEGABIT_PERSEC;
+			ind_calc += 2;
+			if (ind_calc > (ALL_HARDWARE_RATES-1)) {
+				/* This should not happen */
+				ind_calc = ALL_HARDWARE_RATES-1;
+			}
+			if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED)
+				rte = &rs->rs_rlt[ind_calc];
+		}
+	}
+done:
+	if ((rte == NULL) &&
+	    (arte != NULL) &&
+	    (flags & RS_PACING_SUB_OK)) {
+		/* We can use the substitute */
+		rte = arte;
+	}
+	return (rte);
+}
+
+static const struct tcp_hwrate_limit_table *
+tcp_find_suitable_rate(const struct tcp_rate_set *rs, uint64_t bytes_per_sec, uint32_t flags)
+{
+ /**
+ * Hunt the rate table with the restrictions in flags and find a
+ * suitable rate if possible.
+ * RS_PACING_EXACT_MATCH - look for an exact match to rate.
+ * RS_PACING_GT - must be greater than.
+ * RS_PACING_GEQ - must be greater than or equal.
+ * RS_PACING_LT - must be less than.
+ * RS_PACING_SUB_OK - If we don't meet criteria a
+ * substitute is ok.
+ *
+ * Returns a pointer to the selected entry of rs->rs_rlt, or NULL
+ * when no entry was selected. Only the entries from rs_lowest_valid
+ * through rs_highest_valid are examined here. Rate sets flagged
+ * RS_INT_TBL that hold at least ALL_HARDWARE_RATES entries are
+ * handed to tcp_int_find_suitable_rate() instead.
+ */
+ int i, matched;
+ struct tcp_hwrate_limit_table *rte = NULL;
+
+
+ if ((rs->rs_flags & RS_INT_TBL) &&
+ (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)) {
+ /*
+ * Here we don't want to paw thru
+ * a big table, we have everything
+ * from 1Meg - 1000Meg in 1Meg increments.
+ * Use an alternate method to "lookup".
+ */
+ return (tcp_int_find_suitable_rate(rs, bytes_per_sec, flags));
+ }
+ if ((flags & RS_PACING_LT) ||
+ (flags & RS_PACING_EXACT_MATCH)) {
+ /*
+ * For exact and less than we go forward through the table.
+ * This way when we find one larger we stop (exact was a
+ * toss up).
+ *
+ * As written, the RS_PACING_LT test accepts the first entry
+ * whose rate is greater than or equal to the request.
+ *
+ * NOTE(review): the last test in the loop body ends the scan
+ * as soon as an entry's rate is below the request. If the
+ * table is in ascending order, only the first entry examined
+ * can ever match - confirm this is the intended behaviour.
+ */
+ for (i = rs->rs_lowest_valid, matched = 0; i <= rs->rs_highest_valid; i++) {
+ if ((flags & RS_PACING_EXACT_MATCH) &&
+ (bytes_per_sec == rs->rs_rlt[i].rate)) {
+ rte = &rs->rs_rlt[i];
+ matched = 1;
+ break;
+ } else if ((flags & RS_PACING_LT) &&
+ (bytes_per_sec <= rs->rs_rlt[i].rate)) {
+ rte = &rs->rs_rlt[i];
+ matched = 1;
+ break;
+ }
+ if (bytes_per_sec > rs->rs_rlt[i].rate)
+ break;
+ }
+ if ((matched == 0) &&
+ (flags & RS_PACING_LT) &&
+ (flags & RS_PACING_SUB_OK)) {
+ /* Kick in a substitute (the lowest) */
+ rte = &rs->rs_rlt[rs->rs_lowest_valid];
+ }
+ } else {
+ /*
+ * Here we go backward through the table so that we can find
+ * the one greater in theory faster (but its probably a
+ * wash).
+ *
+ * rte is overwritten for every visited entry whose rate is
+ * still above the request, so when the scan stops it points
+ * at the last such entry visited (or at the equal-rate entry
+ * when RS_PACING_GEQ is set and one exists).
+ */
+ for (i = rs->rs_highest_valid, matched = 0; i >= rs->rs_lowest_valid; i--) {
+ if (rs->rs_rlt[i].rate > bytes_per_sec) {
+ /* A possible candidate */
+ rte = &rs->rs_rlt[i];
+ }
+ if ((flags & RS_PACING_GEQ) &&
+ (bytes_per_sec == rs->rs_rlt[i].rate)) {
+ /* An exact match and we want equal */
+ matched = 1;
+ rte = &rs->rs_rlt[i];
+ break;
+ } else if (rte) {
+ /*
+ * Found one that is larger than but don't
+ * stop, there may be a more closer match.
+ */
+ matched = 1;
+ }
+ if (rs->rs_rlt[i].rate < bytes_per_sec) {
+ /*
+ * We found a table entry that is smaller,
+ * stop there will be none greater or equal.
+ */
+ break;
+ }
+ }
+ if ((matched == 0) &&
+ (flags & RS_PACING_SUB_OK)) {
+ /* Kick in a substitute (the highest) */
+ rte = &rs->rs_rlt[rs->rs_highest_valid];
+ }
+ }
+ return (rte);
+}
+
+/*
+ * Find the interface that actually backs rate-limit send tags for ifp.
+ *
+ * A throw-away rate-limit send tag is allocated on ifp; the ifnet recorded
+ * in that tag is taken as the real interface, and the tag is released again
+ * through that interface's if_snd_tag_free.
+ *
+ * inp is only consulted when RSS is compiled in, to choose the IPv4 or IPv6
+ * flow type.
+ *
+ * Returns the interface named by the tag, or NULL on failure.  On failure
+ * *error (when non-NULL) is ENODEV if ifp has no if_snd_tag_alloc hook,
+ * otherwise the error returned by the hook.
+ */
+static struct ifnet *
+rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error)
+{
+	struct ifnet *tifp;
+	struct m_snd_tag *tag;
+	union if_snd_tag_alloc_params params = {
+		.rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
+		.rate_limit.hdr.flowid = 1,
+		.rate_limit.max_rate = COMMON_RATE,
+		.rate_limit.flags = M_NOWAIT,
+	};
+	int err;
+#ifdef RSS
+	params.rate_limit.hdr.flowtype = ((inp->inp_vflag & INP_IPV6) ?
+	    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4);
+#else
+	params.rate_limit.hdr.flowtype = M_HASHTYPE_OPAQUE_HASH;
+#endif
+	tag = NULL;
+	if (ifp->if_snd_tag_alloc == NULL) {
+		/* Without the hook there is no way to allocate a tag. */
+		if (error)
+			*error = ENODEV;
+		return (NULL);
+	}
+	err = ifp->if_snd_tag_alloc(ifp, &params, &tag);
+	if (err) {
+		/* Failed to setup a tag? */
+		if (error)
+			*error = err;
+		return (NULL);
+	}
+	/* The tag was only needed to learn which ifnet owns it. */
+	tifp = tag->ifp;
+	tifp->if_snd_tag_free(tag);
+	return (tifp);
+}
+
+/*
+ * Pick a hardware rate for a connection and attach a send tag for it.
+ *
+ * The rate set for ifp is looked up on int_rs inside the network epoch.
+ * If that set is an RS_IS_DEFF placeholder, the real interface is resolved
+ * with rt_find_real_interface() and the lookup is repeated for it.  A rate
+ * is then chosen with tcp_find_suitable_rate() and attached to inp with
+ * in_pcbattach_txrtlmt(); on success the set's rs_flows_using count is
+ * bumped.
+ *
+ * Returns the chosen table entry, or NULL.  When error is non-NULL it is
+ * set to:
+ *   ENODEV  - no rate set for the interface, or it is unsupported/dead
+ *   ENOSPC  - the set is disabled or its flow limit is reached
+ *   ENOTSUP - the real-interface lookup handed back the same interface
+ *   other   - whatever rt_find_real_interface() or in_pcbattach_txrtlmt()
+ *             reported
+ * It is not written when tcp_find_suitable_rate() finds nothing.
+ */
+static const struct tcp_hwrate_limit_table *
+rt_setup_rate(struct inpcb *inp, struct ifnet *ifp, uint64_t bytes_per_sec,
+    uint32_t flags, int *error)
+{
+	/* First lets find the interface if it exists */
+	const struct tcp_hwrate_limit_table *rte;
+	struct tcp_rate_set *rs;
+	struct epoch_tracker et;
+	int err;
+
+	epoch_enter_preempt(net_epoch_preempt, &et);
+use_real_interface:
+	CK_LIST_FOREACH(rs, &int_rs, next) {
+		/*
+		 * Note we don't look with the lock since we either see a
+		 * new entry or will get one when we try to add it.
+		 */
+		if (rs->rs_flags & RS_IS_DEAD) {
+			/* The dead are not looked at */
+			continue;
+		}
+		if ((rs->rs_ifp == ifp) &&
+		    (rs->rs_if_dunit == ifp->if_dunit)) {
+			/* Ok we found it */
+			break;
+		}
+	}
+	if ((rs == NULL) ||
+	    (rs->rs_flags & RS_INTF_NO_SUP) ||
+	    (rs->rs_flags & RS_IS_DEAD)) {
+		/*
+		 * This means we got a packet *before*
+		 * the IF-UP was processed below, <or>
+		 * while or after we already received an interface
+		 * departed event. In either case we really don't
+		 * want to do anything with pacing, in
+		 * the departing case the packet is not
+		 * going to go very far. The new case
+		 * might be arguable, but its impossible
+		 * to tell from the departing case.
+		 *
+		 * rs may be NULL here, so it must not be dereferenced.
+		 */
+		if (error)
+			*error = ENODEV;
+		epoch_exit_preempt(net_epoch_preempt, &et);
+		return (NULL);
+	}
+
+	if (rs->rs_disable != 0) {
+		if (error)
+			*error = ENOSPC;
+		epoch_exit_preempt(net_epoch_preempt, &et);
+		return (NULL);
+	}
+	if (rs->rs_flags & RS_IS_DEFF) {
+		/* We need to find the real interface */
+		struct ifnet *tifp;
+
+		tifp = rt_find_real_interface(ifp, inp, error);
+		if (tifp == NULL) {
+			/* rt_find_real_interface() already filled in *error. */
+			epoch_exit_preempt(net_epoch_preempt, &et);
+			return (NULL);
+		}
+		if (tifp == ifp) {
+			/*
+			 * The lookup handed back the placeholder itself;
+			 * retrying would find the same set and never end.
+			 */
+			if (error)
+				*error = ENOTSUP;
+			epoch_exit_preempt(net_epoch_preempt, &et);
+			return (NULL);
+		}
+		/* Repeat the search for the interface behind this one. */
+		ifp = tifp;
+		goto use_real_interface;
+	}
+	if (rs->rs_flow_limit &&
+	    ((rs->rs_flows_using + 1) > rs->rs_flow_limit)) {
+		if (error)
+			*error = ENOSPC;
+		epoch_exit_preempt(net_epoch_preempt, &et);
+		return (NULL);
+	}
+	rte = tcp_find_suitable_rate(rs, bytes_per_sec, flags);
+	if (rte) {
+		err = in_pcbattach_txrtlmt(inp, rs->rs_ifp,
+		    inp->inp_flowtype,
+		    inp->inp_flowid,
+		    rte->rate,
+		    &inp->inp_snd_tag);
+		if (err) {
+			/* Failed to attach */
+			if (error)
+				*error = err;
+			rte = NULL;
+		}
+	}
+	if (rte) {
+		/*
+		 * We use an atomic here for accounting so we don't have to
+		 * use locks when freeing.
+		 */
+		atomic_add_long(&rs->rs_flows_using, 1);
+	}
+	epoch_exit_preempt(net_epoch_preempt, &et);
+	return (rte);
+}
+
+/*
+ * Link-state event handler.  When a rate-limit capable interface reports
+ * LINK_STATE_UP and has no rate set on int_rs yet, build one for it.
+ * Every other event is ignored.
+ */
+static void
+tcp_rl_ifnet_link(void *arg __unused, struct ifnet *ifp, int link_state)
+{
+	struct tcp_rate_set *cur;
+	int setup_err;
+
+	/* Only link-up on an interface advertising IFCAP_TXRTLMT matters. */
+	if ((ifp->if_capabilities & IFCAP_TXRTLMT) == 0)
+		return;
+	if (link_state != LINK_STATE_UP)
+		return;
+
+	mtx_lock(&rs_mtx);
+	CK_LIST_FOREACH(cur, &int_rs, next) {
+		if ((cur->rs_ifp == ifp) &&
+		    (cur->rs_if_dunit == ifp->if_dunit)) {
+			/* Already set up, nothing more to do. */
+			goto out;
+		}
+	}
+	/* The result and error code are not used by this handler. */
+	rt_setup_new_rs(ifp, &setup_err);
+out:
+	mtx_unlock(&rs_mtx);
+}
+
+/*
+ * Interface departure event handler.  The first rate set on int_rs that
+ * matches ifp is unlinked and marked RS_IS_DEAD, its pre-allocated send
+ * tags are detached, and every table entry is marked HDWRPACE_IFPDEPARTED.
+ * If no flow is using the set any more, its destruction is scheduled
+ * through epoch_call(); otherwise that is left to the last flow releasing
+ * its rate.
+ */
+static void
+tcp_rl_ifnet_departure(void *arg __unused, struct ifnet *ifp)
+{
+	struct tcp_rate_set *gone, *next_rs;
+	struct tcp_hwrate_limit_table *ent;
+	int idx;
+
+	mtx_lock(&rs_mtx);
+	CK_LIST_FOREACH_SAFE(gone, &int_rs, next, next_rs) {
+		if ((gone->rs_ifp != ifp) ||
+		    (gone->rs_if_dunit != ifp->if_dunit))
+			continue;
+
+		CK_LIST_REMOVE(gone, next);
+		rs_number_alive--;
+		rs_number_dead++;
+		gone->rs_flags |= RS_IS_DEAD;
+		for (idx = 0; idx < gone->rs_rate_cnt; idx++) {
+			ent = &gone->rs_rlt[idx];
+			if (ent->flags & HDWRPACE_TAGPRESENT) {
+				/* Give the tag back via the ifnet it names. */
+				in_pcbdetach_tag(ent->tag->ifp, ent->tag);
+				ent->tag = NULL;
+			}
+			ent->flags = HDWRPACE_IFPDEPARTED;
+		}
+		if (gone->rs_flows_using == 0) {
+			/*
+			 * No references left, so we can schedule the
+			 * destruction after the epoch (with a caveat).
+			 */
+			gone->rs_flags |= RS_FUNERAL_SCHD;
+			epoch_call(net_epoch, &gone->rs_epoch_ctx, rs_destroy);
+		}
+		/* Only the first matching set is handled. */
+		break;
+	}
+	mtx_unlock(&rs_mtx);
+}
+
+/*
+ * Shutdown event handler.  Every rate set on int_rs is unlinked and marked
+ * RS_IS_DEAD, its pre-allocated send tags are detached and all of its table
+ * entries are marked HDWRPACE_IFPDEPARTED.  Sets that no flow is using are
+ * scheduled for destruction through epoch_call(); sets still in use are
+ * left alone.
+ */
+static void
+tcp_rl_shutdown(void *arg __unused, int howto __unused)
+{
+	struct tcp_rate_set *cur, *next_rs;
+	struct tcp_hwrate_limit_table *ent;
+	int idx;
+
+	mtx_lock(&rs_mtx);
+	CK_LIST_FOREACH_SAFE(cur, &int_rs, next, next_rs) {
+		CK_LIST_REMOVE(cur, next);
+		rs_number_alive--;
+		rs_number_dead++;
+		cur->rs_flags |= RS_IS_DEAD;
+		for (idx = 0; idx < cur->rs_rate_cnt; idx++) {
+			ent = &cur->rs_rlt[idx];
+			if (ent->flags & HDWRPACE_TAGPRESENT) {
+				/* Give the tag back via the ifnet it names. */
+				in_pcbdetach_tag(ent->tag->ifp, ent->tag);
+				ent->tag = NULL;
+			}
+			ent->flags = HDWRPACE_IFPDEPARTED;
+		}
+		if (cur->rs_flows_using == 0) {
+			/*
+			 * No references left, so we can destroy it
+			 * after the epoch.
+			 */
+			cur->rs_flags |= RS_FUNERAL_SCHD;
+			epoch_call(net_epoch, &cur->rs_epoch_ctx, rs_destroy);
+		}
+		/*
+		 * Otherwise flows still use the set; we don't hold a
+		 * reference so there is nothing left to do here.
+		 */
+	}
+	mtx_unlock(&rs_mtx);
+}
+
+/*
+ * Set up hardware pacing for a connection that has no send tag yet.
+ *
+ * Returns the rate table entry chosen by rt_setup_rate(), or NULL.  When
+ * error is non-NULL it is set to EINVAL if the connection already has a
+ * send tag (or, with KERN_TLS, if the send buffer has SB_TLS_IFNET set)
+ * and to ENODEV if ifp does not advertise IFCAP_TXRTLMT; otherwise it is
+ * whatever rt_setup_rate() stores.
+ */
+const struct tcp_hwrate_limit_table *
+tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
+    uint64_t bytes_per_sec, int flags, int *error)
+{
+	if (tp->t_inpcb->inp_snd_tag != NULL) {
+		/*
+		 * We are modifying a rate, wrong interface?
+		 */
+		if (error)
+			*error = EINVAL;
+		return (NULL);
+	}
+	/*
+	 * From here on we are setting up a rate for the first time.
+	 */
+	if ((ifp->if_capabilities & IFCAP_TXRTLMT) == 0) {
+		/* Not supported by the egress */
+		if (error)
+			*error = ENODEV;
+		return (NULL);
+	}
+#ifdef KERN_TLS
+	if (tp->t_inpcb->inp_socket->so_snd.sb_tls_flags & SB_TLS_IFNET) {
+		/*
+		 * We currently can't do both TLS and hardware
+		 * pacing
+		 */
+		if (error)
+			*error = EINVAL;
+		return (NULL);
+	}
+#endif
+	return (rt_setup_rate(tp->t_inpcb, ifp, bytes_per_sec, flags, error));
+}
+
+/*
+ * Change the hardware pacing rate of a connection that already has one.
+ *
+ * crte is the entry currently in use.  If its rate set is dead, the entry
+ * is marked departed, or the interface no longer matches, the old rate is
+ * released and a fresh one is requested via tcp_set_pacing_rate().
+ * Otherwise a new entry is looked up in the same rate set and the send tag
+ * is modified to its rate.
+ *
+ * Returns the entry now in effect (crte itself when nothing changed), or
+ * NULL.  NULL is returned, with the old rate released, when no suitable
+ * entry exists; it is also returned when modifying the tag fails, in which
+ * case *error holds that failure.  *error is EINVAL when there is no send
+ * tag or crte is NULL.
+ */
+const struct tcp_hwrate_limit_table *
+tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
+    struct tcpcb *tp, struct ifnet *ifp,
+    uint64_t bytes_per_sec, int flags, int *error)
+{
+	const struct tcp_hwrate_limit_table *picked;
+	const struct tcp_rate_set *set;
+	struct ifnet *real_ifp;
+	int rc;
+
+	if ((tp->t_inpcb->inp_snd_tag == NULL) ||
+	    (crte == NULL)) {
+		/* Wrong interface */
+		if (error)
+			*error = EINVAL;
+		return (NULL);
+	}
+	set = crte->ptbl;
+	if ((set->rs_flags & RS_IS_DEAD) ||
+	    (crte->flags & HDWRPACE_IFPDEPARTED)) {
+		/* The set is going away; start over. */
+		goto re_rate;
+	}
+	/*
+	 * NOTE(review): rs_flags is tested against RT_IS_INDIRECT here
+	 * while other code sets RS_* values in it - confirm the intended
+	 * flag.
+	 */
+	if ((set->rs_flags & RT_IS_INDIRECT ) == RT_IS_INDIRECT) {
+		/*
+		 * For indirect we have to dig in and find the real interface.
+		 */
+		real_ifp = rt_find_real_interface(ifp, tp->t_inpcb, error);
+		if (real_ifp == NULL) {
+			/* Can't find it? */
+			goto re_rate;
+		}
+		if ((real_ifp != set->rs_ifp) ||
+		    (ifp->if_dunit != set->rs_if_dunit))
+			goto re_rate;
+	} else if ((ifp != set->rs_ifp) ||
+	    (ifp->if_dunit != set->rs_if_dunit)) {
+		/*
+		 * Something changed, the user is not pointing to the same
+		 * ifp? Maybe a route updated on this guy?
+		 */
+		goto re_rate;
+	}
+	picked = tcp_find_suitable_rate(set, bytes_per_sec, flags);
+	if (picked == crte) {
+		/* No change */
+		if (error)
+			*error = 0;
+		return (crte);
+	}
+	if (picked == NULL) {
+		/* Release the old rate */
+		tcp_rel_pacing_rate(crte, tp);
+		return (NULL);
+	}
+	/* Change rates to our new entry */
+	rc = in_pcbmodify_txrtlmt(tp->t_inpcb, picked->rate);
+	if (rc) {
+		if (error)
+			*error = rc;
+		return (NULL);
+	}
+	if (error)
+		*error = 0;
+	return (picked);
+
+re_rate:
+	/* Release the rate, and try anew */
+	tcp_rel_pacing_rate(crte, tp);
+	return (tcp_set_pacing_rate(tp, ifp, bytes_per_sec, flags, error));
+}
+
+/*
+ * Give up the hardware pacing rate held by a connection.
+ *
+ * The flow count of the rate set owning crte is dropped by one.  If this
+ * was the last flow and the set is already dead with no destruction
+ * pending, its destruction is scheduled here through epoch_call().  The
+ * connection's send tag is detached last.
+ */
+void
+tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte, struct tcpcb *tp)
+{
+	struct tcp_rate_set *owner;
+	uint64_t before;
+
+	/*
+	 * The entry only hands out a const view of its rate set; the
+	 * const has to be dropped to release our refcount.
+	 */
+	owner = __DECONST(struct tcp_rate_set *, crte->ptbl);
+	before = atomic_fetchadd_long(&owner->rs_flows_using, -1);
+	if (before == 1) {
+		/* We were the last flow on this set. */
+		mtx_lock(&rs_mtx);
+		if ((owner->rs_flags & RS_IS_DEAD) &&
+		    ((owner->rs_flags & RS_FUNERAL_SCHD) == 0)) {
+			/*
+			 * The set is dead and nobody has scheduled its
+			 * funeral yet, so that falls to us.
+			 */
+			owner->rs_flags |= RS_FUNERAL_SCHD;
+			epoch_call(net_epoch, &owner->rs_epoch_ctx, rs_destroy);
+		}
+		mtx_unlock(&rs_mtx);
+	}
+	in_pcbdetach_txrtlmt(tp->t_inpcb);
+}
+
+/* Tags returned when tcp_rs_init() registers its event handlers. */
+static eventhandler_tag rl_ifnet_departs;
+static eventhandler_tag rl_ifnet_arrives;
+static eventhandler_tag rl_shutdown_start;
+
+/*
+ * One-time initialisation: reset the rate-set list and its counters, create
+ * rs_mtx, and register the interface departure, link-state and pre-sync
+ * shutdown handlers.
+ */
+static void
+tcp_rs_init(void *st __unused)
+{
+	rs_number_alive = 0;
+	rs_number_dead = 0;
+	CK_LIST_INIT(&int_rs);
+	mtx_init(&rs_mtx, "tcp_rs_mtx", "rsmtx", MTX_DEF);
+
+	rl_ifnet_departs = EVENTHANDLER_REGISTER(ifnet_departure_event,
+	    tcp_rl_ifnet_departure, NULL, EVENTHANDLER_PRI_ANY);
+	rl_ifnet_arrives = EVENTHANDLER_REGISTER(ifnet_link_event,
+	    tcp_rl_ifnet_link, NULL, EVENTHANDLER_PRI_ANY);
+	rl_shutdown_start = EVENTHANDLER_REGISTER(shutdown_pre_sync,
+	    tcp_rl_shutdown, NULL, SHUTDOWN_PRI_FIRST);
+
+	printf("TCP_ratelimit: Is now initialized\n");
+}
+
+/* Run tcp_rs_init() during boot. */
+SYSINIT(tcp_rl_init, SI_SUB_SMP + 1, SI_ORDER_ANY, tcp_rs_init, NULL);
+#endif
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Sun, Jan 12, 11:26 PM (21 h, 38 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
15773749
Default Alt Text
D20953.diff (55 KB)
Attached To
Mode
D20953: Third patchset in the set of patches to bring BBRv1 into the FreeBSD tree
Attached
Detach File
Event Timeline
Log In to Comment