Page MenuHomeFreeBSD

D20953.diff
No OneTemporary

D20953.diff

Index: head/sys/conf/files
===================================================================
--- head/sys/conf/files
+++ head/sys/conf/files
@@ -4276,6 +4276,7 @@
netinet/tcp_output.c optional inet | inet6
netinet/tcp_offload.c optional tcp_offload inet | tcp_offload inet6
netinet/tcp_hpts.c optional tcphpts inet | tcphpts inet6
+netinet/tcp_ratelimit.c optional ratelimit inet | ratelimit inet6
netinet/tcp_pcap.c optional inet tcppcap | inet6 tcppcap \
compile-with "${NORMAL_C} ${NO_WNONNULL}"
netinet/tcp_reass.c optional inet | inet6
Index: head/sys/dev/cxgbe/adapter.h
===================================================================
--- head/sys/dev/cxgbe/adapter.h
+++ head/sys/dev/cxgbe/adapter.h
@@ -1247,6 +1247,7 @@
int cxgbe_snd_tag_query(struct m_snd_tag *, union if_snd_tag_query_params *);
void cxgbe_snd_tag_free(struct m_snd_tag *);
void cxgbe_snd_tag_free_locked(struct cxgbe_snd_tag *);
+void cxgbe_ratelimit_query(struct ifnet *, struct if_ratelimit_query_results *);
#endif
/* t4_filter.c */
Index: head/sys/dev/cxgbe/t4_main.c
===================================================================
--- head/sys/dev/cxgbe/t4_main.c
+++ head/sys/dev/cxgbe/t4_main.c
@@ -1658,6 +1658,7 @@
ifp->if_snd_tag_modify = cxgbe_snd_tag_modify;
ifp->if_snd_tag_query = cxgbe_snd_tag_query;
ifp->if_snd_tag_free = cxgbe_snd_tag_free;
+ ifp->if_ratelimit_query = cxgbe_ratelimit_query;
#endif
ifp->if_capabilities = T4_CAP;
Index: head/sys/dev/cxgbe/t4_sched.c
===================================================================
--- head/sys/dev/cxgbe/t4_sched.c
+++ head/sys/dev/cxgbe/t4_sched.c
@@ -903,4 +903,35 @@
}
mtx_unlock(&cst->lock);
}
+
+#define CXGBE_MAX_FLOWS 4000 /* Testing shows so far that's all this adapter can do */
+#define CXGBE_UNIQUE_RATE_COUNT 16 /* Number of unique rates that can be setup */
+
+void
+cxgbe_ratelimit_query(struct ifnet *ifp __unused,
+ struct if_ratelimit_query_results *q)
+{
+ /*
+ * This is a skeleton and needs future work
+ * by the driver supporters. It should be
+ * enhanced to look at the specific type of
+ * interface and select appropriate values
+ * for these settings. This example goes
+ * with an earlier card (t5), it has a maximum
+ * number of 16 rates that the first guys in
+ * select (thus the flags value RT_IS_SELECTABLE).
+ * If it was a fixed table then we would setup a
+ * const array (example mlx5). Note the card tested
+ * can only support reasonably 4000 flows before
+ * the adapter has issues with sending so here
+ * we limit the number of flows using hardware
+ * pacing to that number, other cards may
+ * be able to raise or eliminate this limit.
+ */
+ q->rate_table = NULL;
+ q->flags = RT_IS_SELECTABLE;
+ q->max_flows = CXGBE_MAX_FLOWS;
+ q->number_of_rates = CXGBE_UNIQUE_RATE_COUNT;
+ q->min_segment_burst = 4; /* Driver emits 4 in a burst */
+}
#endif
Index: head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
===================================================================
--- head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
+++ head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
@@ -4070,7 +4070,49 @@
}
}
+#define NUM_HDWR_RATES_MLX 13
+static const uint64_t adapter_rates_mlx[NUM_HDWR_RATES_MLX] = {
+ 135375, /* 1,083,000 */
+ 180500, /* 1,444,000 */
+ 270750, /* 2,166,000 */
+ 361000, /* 2,888,000 */
+ 541500, /* 4,332,000 */
+ 721875, /* 5,775,000 */
+ 1082875, /* 8,663,000 */
+ 1443875, /* 11,551,000 */
+ 2165750, /* 17,326,000 */
+ 2887750, /* 23,102,000 */
+ 4331625, /* 34,653,000 */
+ 5775500, /* 46,204,000 */
+ 8663125 /* 69,305,000 */
+};
+
static void
+mlx5e_ratelimit_query(struct ifnet *ifp __unused, struct if_ratelimit_query_results *q)
+{
+ /*
+ * This function needs updating by the driver maintainer!
+ * For the MLX card there are currently (ConnectX-4?) 13
+ * pre-set rates and others i.e. ConnectX-5, 6, 7??
+ *
+ * This will change based on later adapters
+ * and this code should be updated to look at ifp
+ * and figure out the specific adapter type
+ * settings i.e. how many rates as well
+ * as if they are fixed (as is shown here) or
+ * if they are dynamic (example chelsio t4). Also if there
+ * is a maximum number of flows that the adapter
+ * can handle that too needs to be updated in
+ * the max_flows field.
+ */
+ q->rate_table = adapter_rates_mlx;
+ q->flags = RT_IS_FIXED_TABLE;
+ q->max_flows = 0; /* mlx has no limit */
+ q->number_of_rates = NUM_HDWR_RATES_MLX;
+ q->min_segment_burst = 1;
+}
+
+static void
mlx5e_snd_tag_free(struct m_snd_tag *pmt)
{
struct mlx5e_snd_tag *tag =
@@ -4155,7 +4197,9 @@
ifp->if_snd_tag_free = mlx5e_snd_tag_free;
ifp->if_snd_tag_modify = mlx5e_snd_tag_modify;
ifp->if_snd_tag_query = mlx5e_snd_tag_query;
-
+#ifdef RATELIMIT
+ ifp->if_ratelimit_query = mlx5e_ratelimit_query;
+#endif
/* set TSO limits so that we don't have to drop TX packets */
ifp->if_hw_tsomax = MLX5E_MAX_TX_PAYLOAD_SIZE - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
ifp->if_hw_tsomaxsegcount = MLX5E_MAX_TX_MBUF_FRAGS - 1 /* hdr */;
Index: head/sys/net/if_dead.c
===================================================================
--- head/sys/net/if_dead.c
+++ head/sys/net/if_dead.c
@@ -126,6 +126,23 @@
{
}
+static void
+ifdead_ratelimit_query(struct ifnet *ifp __unused,
+ struct if_ratelimit_query_results *q)
+{
+ /*
+ * This guy does not support
+ * this interface. Not sure
+ * why we would specify a
+ * flag on the interface
+ * that says we do.
+ */
+ q->rate_table = NULL;
+ q->flags = RT_NOSUPPORT;
+ q->max_flows = 0;
+ q->number_of_rates = 0;
+}
+
void
if_dead(struct ifnet *ifp)
{
@@ -142,4 +159,5 @@
ifp->if_snd_tag_modify = ifdead_snd_tag_modify;
ifp->if_snd_tag_query = ifdead_snd_tag_query;
ifp->if_snd_tag_free = ifdead_snd_tag_free;
+ ifp->if_ratelimit_query = ifdead_ratelimit_query;
}
Index: head/sys/net/if_lagg.c
===================================================================
--- head/sys/net/if_lagg.c
+++ head/sys/net/if_lagg.c
@@ -144,6 +144,8 @@
static int lagg_snd_tag_query(struct m_snd_tag *,
union if_snd_tag_query_params *);
static void lagg_snd_tag_free(struct m_snd_tag *);
+static void lagg_ratelimit_query(struct ifnet *,
+ struct if_ratelimit_query_results *);
#endif
static int lagg_setmulti(struct lagg_port *);
static int lagg_clrmulti(struct lagg_port *);
@@ -537,6 +539,7 @@
ifp->if_snd_tag_modify = lagg_snd_tag_modify;
ifp->if_snd_tag_query = lagg_snd_tag_query;
ifp->if_snd_tag_free = lagg_snd_tag_free;
+ ifp->if_ratelimit_query = lagg_ratelimit_query;
#endif
ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS;
@@ -1670,6 +1673,20 @@
free(lst, M_LAGG);
}
+static void
+lagg_ratelimit_query(struct ifnet *ifp __unused, struct if_ratelimit_query_results *q)
+{
+ /*
+ * For lagg, we have an indirect
+ * interface. The caller needs to
+ * get a ratelimit tag on the actual
+ * interface the flow will go on.
+ */
+ q->rate_table = NULL;
+ q->flags = RT_IS_INDIRECT;
+ q->max_flows = 0;
+ q->number_of_rates = 0;
+}
#endif
static int
Index: head/sys/net/if_var.h
===================================================================
--- head/sys/net/if_var.h
+++ head/sys/net/if_var.h
@@ -203,6 +203,8 @@
struct if_snd_tag_alloc_rate_limit {
struct if_snd_tag_alloc_header hdr;
uint64_t max_rate; /* in bytes/s */
+ uint32_t flags; /* M_NOWAIT or M_WAITOK */
+ uint32_t reserved; /* alignment */
};
struct if_snd_tag_rate_limit_params {
@@ -210,7 +212,7 @@
uint32_t queue_level; /* 0 (empty) .. 65535 (full) */
#define IF_SND_QUEUE_LEVEL_MIN 0
#define IF_SND_QUEUE_LEVEL_MAX 65535
- uint32_t reserved; /* padding */
+ uint32_t flags; /* M_NOWAIT or M_WAITOK */
};
union if_snd_tag_alloc_params {
@@ -229,12 +231,38 @@
struct if_snd_tag_rate_limit_params unlimited;
};
+/* Query return flags */
+#define RT_NOSUPPORT 0x00000000 /* Not supported */
+#define RT_IS_INDIRECT 0x00000001 /*
+ * Interface like a lagg, select
+ * the actual interface for
+ * capabilities.
+ */
+#define RT_IS_SELECTABLE 0x00000002 /*
+ * No rate table, you select
+ * rates and the first
+ * number_of_rates are created.
+ */
+#define RT_IS_FIXED_TABLE 0x00000004 /* A fixed table is attached */
+#define RT_IS_UNUSABLE 0x00000008 /* It is not usable for this */
+
+struct if_ratelimit_query_results {
+ const uint64_t *rate_table; /* Pointer to table if present */
+ uint32_t flags; /* Flags indicating results */
+ uint32_t max_flows; /* Max flows using, 0=unlimited */
+ uint32_t number_of_rates; /* How many unique rates can be created */
+ uint32_t min_segment_burst; /* The amount the adapter bursts at each send */
+};
+
typedef int (if_snd_tag_alloc_t)(struct ifnet *, union if_snd_tag_alloc_params *,
struct m_snd_tag **);
typedef int (if_snd_tag_modify_t)(struct m_snd_tag *, union if_snd_tag_modify_params *);
typedef int (if_snd_tag_query_t)(struct m_snd_tag *, union if_snd_tag_query_params *);
typedef void (if_snd_tag_free_t)(struct m_snd_tag *);
+typedef void (if_ratelimit_query_t)(struct ifnet *,
+ struct if_ratelimit_query_results *);
+
/*
* Structure defining a network interface.
*/
@@ -374,6 +402,7 @@
if_snd_tag_modify_t *if_snd_tag_modify;
if_snd_tag_query_t *if_snd_tag_query;
if_snd_tag_free_t *if_snd_tag_free;
+ if_ratelimit_query_t *if_ratelimit_query;
/* Ethernet PCP */
uint8_t if_pcp;
Index: head/sys/netinet/in_pcb.h
===================================================================
--- head/sys/netinet/in_pcb.h
+++ head/sys/netinet/in_pcb.h
@@ -883,8 +883,13 @@
in_sockaddr(in_port_t port, struct in_addr *addr);
void in_pcbsosetlabel(struct socket *so);
#ifdef RATELIMIT
-int in_pcbattach_txrtlmt(struct inpcb *, struct ifnet *, uint32_t, uint32_t, uint32_t);
+int
+in_pcboutput_txrtlmt_locked(struct inpcb *, struct ifnet *,
+ struct mbuf *, uint32_t);
+int in_pcbattach_txrtlmt(struct inpcb *, struct ifnet *, uint32_t, uint32_t,
+ uint32_t, struct m_snd_tag **);
void in_pcbdetach_txrtlmt(struct inpcb *);
+void in_pcbdetach_tag(struct ifnet *ifp, struct m_snd_tag *mst);
int in_pcbmodify_txrtlmt(struct inpcb *, uint32_t);
int in_pcbquery_txrtlmt(struct inpcb *, uint32_t *);
int in_pcbquery_txrlevel(struct inpcb *, uint32_t *);
Index: head/sys/netinet/in_pcb.c
===================================================================
--- head/sys/netinet/in_pcb.c
+++ head/sys/netinet/in_pcb.c
@@ -210,6 +210,22 @@
&VNET_NAME(ipport_randomtime), 0,
"Minimum time to keep sequental port "
"allocation before switching to a random one");
+
+#ifdef RATELIMIT
+counter_u64_t rate_limit_active;
+counter_u64_t rate_limit_alloc_fail;
+counter_u64_t rate_limit_set_ok;
+
+static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD, 0,
+ "IP Rate Limiting");
+SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD,
+ &rate_limit_active, "Active rate limited connections");
+SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD,
+ &rate_limit_alloc_fail, "Rate limited connection failures");
+SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD,
+ &rate_limit_set_ok, "Rate limited setting succeeded");
+#endif /* RATELIMIT */
+
#endif /* INET */
/*
@@ -3170,6 +3186,7 @@
{
union if_snd_tag_modify_params params = {
.rate_limit.max_rate = max_pacing_rate,
+ .rate_limit.flags = M_NOWAIT,
};
struct m_snd_tag *mst;
struct ifnet *ifp;
@@ -3256,7 +3273,8 @@
*/
int
in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
- uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate)
+ uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st)
+
{
union if_snd_tag_alloc_params params = {
.rate_limit.hdr.type = (max_pacing_rate == -1U) ?
@@ -3264,22 +3282,47 @@
.rate_limit.hdr.flowid = flowid,
.rate_limit.hdr.flowtype = flowtype,
.rate_limit.max_rate = max_pacing_rate,
+ .rate_limit.flags = M_NOWAIT,
};
int error;
INP_WLOCK_ASSERT(inp);
- if (inp->inp_snd_tag != NULL)
+ if (*st != NULL)
return (EINVAL);
if (ifp->if_snd_tag_alloc == NULL) {
error = EOPNOTSUPP;
} else {
error = ifp->if_snd_tag_alloc(ifp, &params, &inp->inp_snd_tag);
+
+ if (error == 0) {
+ counter_u64_add(rate_limit_set_ok, 1);
+ counter_u64_add(rate_limit_active, 1);
+ } else
+ counter_u64_add(rate_limit_alloc_fail, 1);
}
return (error);
}
+void
+in_pcbdetach_tag(struct ifnet *ifp, struct m_snd_tag *mst)
+{
+ if (ifp == NULL)
+ return;
+
+ /*
+ * If the device was detached while we still had reference(s)
+ * on the ifp, we assume if_snd_tag_free() was replaced with
+ * stubs.
+ */
+ ifp->if_snd_tag_free(mst);
+
+ /* release reference count on network interface */
+ if_rele(ifp);
+ counter_u64_add(rate_limit_active, -1);
+}
+
/*
* Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
* if any:
@@ -3300,49 +3343,12 @@
m_snd_tag_rele(mst);
}
-/*
- * This function should be called when the INP_RATE_LIMIT_CHANGED flag
- * is set in the fast path and will attach/detach/modify the TX rate
- * limit send tag based on the socket's so_max_pacing_rate value.
- */
-void
-in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
+int
+in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate)
{
- struct socket *socket;
- uint32_t max_pacing_rate;
- bool did_upgrade;
int error;
- if (inp == NULL)
- return;
-
- socket = inp->inp_socket;
- if (socket == NULL)
- return;
-
- if (!INP_WLOCKED(inp)) {
- /*
- * NOTE: If the write locking fails, we need to bail
- * out and use the non-ratelimited ring for the
- * transmit until there is a new chance to get the
- * write lock.
- */
- if (!INP_TRY_UPGRADE(inp))
- return;
- did_upgrade = 1;
- } else {
- did_upgrade = 0;
- }
-
/*
- * NOTE: The so_max_pacing_rate value is read unlocked,
- * because atomic updates are not required since the variable
- * is checked at every mbuf we send. It is assumed that the
- * variable read itself will be atomic.
- */
- max_pacing_rate = socket->so_max_pacing_rate;
-
- /*
* If the existing send tag is for the wrong interface due to
* a route change, first drop the existing tag. Set the
* CHANGED flag so that we will keep trying to allocate a new
@@ -3376,13 +3382,61 @@
error = EAGAIN;
} else {
error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
- mb->m_pkthdr.flowid, max_pacing_rate);
+ mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag);
}
} else {
error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
}
if (error == 0 || error == EOPNOTSUPP)
inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
+
+ return (error);
+}
+
+/*
+ * This function should be called when the INP_RATE_LIMIT_CHANGED flag
+ * is set in the fast path and will attach/detach/modify the TX rate
+ * limit send tag based on the socket's so_max_pacing_rate value.
+ */
+void
+in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
+{
+ struct socket *socket;
+ uint32_t max_pacing_rate;
+ bool did_upgrade;
+ int error;
+
+ if (inp == NULL)
+ return;
+
+ socket = inp->inp_socket;
+ if (socket == NULL)
+ return;
+
+ if (!INP_WLOCKED(inp)) {
+ /*
+ * NOTE: If the write locking fails, we need to bail
+ * out and use the non-ratelimited ring for the
+ * transmit until there is a new chance to get the
+ * write lock.
+ */
+ if (!INP_TRY_UPGRADE(inp))
+ return;
+ did_upgrade = 1;
+ } else {
+ did_upgrade = 0;
+ }
+
+ /*
+ * NOTE: The so_max_pacing_rate value is read unlocked,
+ * because atomic updates are not required since the variable
+ * is checked at every mbuf we send. It is assumed that the
+ * variable read itself will be atomic.
+ */
+ max_pacing_rate = socket->so_max_pacing_rate;
+
+ error = in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate);
+
if (did_upgrade)
INP_DOWNGRADE(inp);
}
@@ -3424,4 +3478,14 @@
if (did_upgrade)
INP_DOWNGRADE(inp);
}
+
+static void
+rl_init(void *st)
+{
+ rate_limit_active = counter_u64_alloc(M_WAITOK);
+ rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK);
+ rate_limit_set_ok = counter_u64_alloc(M_WAITOK);
+}
+
+SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL);
#endif /* RATELIMIT */
Index: head/sys/netinet/tcp_ratelimit.h
===================================================================
--- head/sys/netinet/tcp_ratelimit.h
+++ head/sys/netinet/tcp_ratelimit.h
@@ -0,0 +1,141 @@
+/*-
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 2018-2019
+ * Netflix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * __FBSDID("$FreeBSD$");
+ *
+ */
+/**
+ * Author: Randall Stewart <rrs@netflix.com>
+ */
+#ifndef __tcp_ratelimit_h__
+#define __tcp_ratelimit_h__
+
+struct m_snd_tag;
+
+/* Flags on an individual rate */
+#define HDWRPACE_INITED 0x0001
+#define HDWRPACE_TAGPRESENT 0x0002
+#define HDWRPACE_IFPDEPARTED 0x0004
+struct tcp_hwrate_limit_table {
+ const struct tcp_rate_set *ptbl; /* Pointer to parent table */
+ struct m_snd_tag *tag; /* Send tag if needed (chelsio) */
+ uint64_t rate; /* Rate we get in Bytes per second (Bps) */
+ uint32_t time_between; /* Time-Gap between packets at this rate */
+ uint32_t flags;
+};
+
+/* Rateset flags */
+#define RS_IS_DEFF 0x0001 /* Its a lagg, do a double lookup */
+#define RS_IS_INTF 0x0002 /* Its a plain interface */
+#define RS_NO_PRE 0x0004 /* The interface has set rates */
+#define RS_INT_TBL 0x0010 /*
+ * The table is the internal version
+ * which has special setup requirements.
+ */
+#define RS_IS_DEAD 0x0020 /* The RS is dead list */
+#define RS_FUNERAL_SCHD 0x0040 /* Is an epoch call scheduled to bury this guy? */
+#define RS_INTF_NO_SUP 0x0100 /* The interface does not support the ratelimiting */
+
+struct tcp_rate_set {
+ struct sysctl_ctx_list sysctl_ctx;
+ CK_LIST_ENTRY(tcp_rate_set) next;
+ struct ifnet *rs_ifp;
+ struct tcp_hwrate_limit_table *rs_rlt;
+ uint64_t rs_flows_using;
+ uint64_t rs_flow_limit;
+ uint32_t rs_if_dunit;
+ int rs_rate_cnt;
+ int rs_min_seg;
+ int rs_highest_valid;
+ int rs_lowest_valid;
+ int rs_disable;
+ int rs_flags;
+ struct epoch_context rs_epoch_ctx;
+};
+
+CK_LIST_HEAD(head_tcp_rate_set, tcp_rate_set);
+
+/* Request flags */
+#define RS_PACING_EXACT_MATCH 0x0001 /* Need an exact match for rate */
+#define RS_PACING_GT 0x0002 /* Greater than requested */
+#define RS_PACING_GEQ 0x0004 /* Greater than or equal to */
+#define RS_PACING_LT 0x0008 /* Less than requested rate */
+#define RS_PACING_SUB_OK 0x0010 /* If a rate can't be found get the
+ * next best rate (highest or lowest). */
+#ifdef RATELIMIT
+#ifdef _KERNEL
+#define DETAILED_RATELIMIT_SYSCTL 1 /*
+ * Undefine this if you don't want
+ * detailed rates to appear in
+ * net.inet.tcp.rl.
+ * With the defintion each rate
+ * shows up in your sysctl tree
+ * this can be big.
+ */
+
+const struct tcp_hwrate_limit_table *
+tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
+ uint64_t bytes_per_sec, int flags, int *error);
+
+const struct tcp_hwrate_limit_table *
+tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
+ struct tcpcb *tp, struct ifnet *ifp,
+ uint64_t bytes_per_sec, int flags, int *error);
+void
+tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte,
+ struct tcpcb *tp);
+#else
+static inline const struct tcp_hwrate_limit_table *
+tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
+ uint64_t bytes_per_sec, int flags, int *error)
+{
+ if (error)
+ *error = EOPNOTSUPP;
+ return (NULL);
+}
+
+static inline const struct tcp_hwrate_limit_table *
+tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
+ struct tcpcb *tp, struct ifnet *ifp,
+ uint64_t bytes_per_sec, int flags, int *error)
+{
+ if (error)
+ *error = EOPNOTSUPP;
+ return (NULL);
+}
+
+static inline void
+tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte,
+ struct tcpcb *tp)
+{
+ return;
+}
+
+#endif
+#endif
+#endif
Index: head/sys/netinet/tcp_ratelimit.c
===================================================================
--- head/sys/netinet/tcp_ratelimit.c
+++ head/sys/netinet/tcp_ratelimit.c
@@ -0,0 +1,1234 @@
+/*-
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 2018-2019
+ * Netflix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+/**
+ * Author: Randall Stewart <rrs@netflix.com>
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_tcpdebug.h"
+#include "opt_ratelimit.h"
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#ifdef KERN_TLS
+#include <sys/sockbuf_tls.h>
+#endif
+#include <sys/sysctl.h>
+#include <sys/eventhandler.h>
+#include <sys/mutex.h>
+#include <sys/ck.h>
+#define TCPSTATES /* for logging */
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/tcp_var.h>
+#ifdef INET6
+#include <netinet6/tcp6_var.h>
+#endif
+#include <netinet/tcp_ratelimit.h>
+#ifndef USECS_IN_SECOND
+#define USECS_IN_SECOND 1000000
+#endif
+/*
+ * For the purposes of each send, what is the size
+ * of an ethernet frame.
+ */
+#ifndef ETHERNET_SEGMENT_SIZE
+#define ETHERNET_SEGMENT_SIZE 1500
+#endif
+MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory");
+#ifdef RATELIMIT
+
+#define COMMON_RATE 180500
+uint64_t desired_rates[] = {
+ 62500, /* 500Kbps */
+ 180500, /* 1.44Mpbs */
+ 375000, /* 3Mbps */
+ 500000, /* 4Mbps */
+ 625000, /* 5Mbps */
+ 750000, /* 6Mbps */
+ 1000000, /* 8Mbps */
+ 1250000, /* 10Mbps */
+ 2500000, /* 20Mbps */
+ 3750000, /* 30Mbps */
+ 5000000, /* 40Meg */
+ 6250000, /* 50Mbps */
+ 12500000, /* 100Mbps */
+ 25000000, /* 200Mbps */
+ 50000000, /* 400Mbps */
+ 100000000, /* 800Mbps */
+ 12500, /* 100kbps */
+ 25000, /* 200kbps */
+ 875000, /* 7Mbps */
+ 1125000, /* 9Mbps */
+ 1875000, /* 15Mbps */
+ 3125000, /* 25Mbps */
+ 8125000, /* 65Mbps */
+ 10000000, /* 80Mbps */
+ 18750000, /* 150Mbps */
+ 20000000, /* 250Mbps */
+ 37500000, /* 350Mbps */
+ 62500000, /* 500Mbps */
+ 78125000, /* 625Mbps */
+ 125000000, /* 1Gbps */
+};
+#define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t))
+#define RS_ORDERED_COUNT 16 /*
+ * Number that are in order
+ * at the beginning of the table,
+ * over this a sort is required.
+ */
+#define RS_NEXT_ORDER_GROUP 16 /*
+ * The point in our table where
+ * we come fill in a second ordered
+ * group (index wise means -1).
+ */
+#define ALL_HARDWARE_RATES 1004 /*
+ * 1Meg - 1Gig in 1 Meg steps
+ * plus 100, 200k and 500k and
+ * 10Gig
+ */
+
+#define RS_ONE_MEGABIT_PERSEC 1000000
+#define RS_ONE_GIGABIT_PERSEC 1000000000
+#define RS_TEN_GIGABIT_PERSEC 10000000000
+
+static struct head_tcp_rate_set int_rs;
+static struct mtx rs_mtx;
+uint32_t rs_number_alive;
+uint32_t rs_number_dead;
+
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW, 0,
+ "TCP Ratelimit stats");
+SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, alive, CTLFLAG_RW,
+ &rs_number_alive, 0,
+ "Number of interfaces initialized for ratelimiting");
+SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW,
+ &rs_number_dead, 0,
+ "Number of interfaces departing from ratelimiting");
+
+static void
+rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs)
+{
+ /*
+ * Add sysctl entries for this interface.
+ */
+ if (rs->rs_flags & RS_INTF_NO_SUP) {
+ SYSCTL_ADD_S32(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_sysctl_root),
+ OID_AUTO, "disable", CTLFLAG_RD,
+ &rs->rs_disable, 0,
+ "Disable this interface from new hdwr limiting?");
+ } else {
+ SYSCTL_ADD_S32(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_sysctl_root),
+ OID_AUTO, "disable", CTLFLAG_RW,
+ &rs->rs_disable, 0,
+ "Disable this interface from new hdwr limiting?");
+ }
+ SYSCTL_ADD_S32(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_sysctl_root),
+ OID_AUTO, "minseg", CTLFLAG_RW,
+ &rs->rs_min_seg, 0,
+ "What is the minimum we need to send on this interface?");
+ SYSCTL_ADD_U64(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_sysctl_root),
+ OID_AUTO, "flow_limit", CTLFLAG_RW,
+ &rs->rs_flow_limit, 0,
+ "What is the limit for number of flows (0=unlimited)?");
+ SYSCTL_ADD_S32(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_sysctl_root),
+ OID_AUTO, "highest", CTLFLAG_RD,
+ &rs->rs_highest_valid, 0,
+ "Highest valid rate");
+ SYSCTL_ADD_S32(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_sysctl_root),
+ OID_AUTO, "lowest", CTLFLAG_RD,
+ &rs->rs_lowest_valid, 0,
+ "Lowest valid rate");
+ SYSCTL_ADD_S32(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_sysctl_root),
+ OID_AUTO, "flags", CTLFLAG_RD,
+ &rs->rs_flags, 0,
+ "What lags are on the entry?");
+ SYSCTL_ADD_S32(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_sysctl_root),
+ OID_AUTO, "numrates", CTLFLAG_RD,
+ &rs->rs_rate_cnt, 0,
+ "How many rates re there?");
+ SYSCTL_ADD_U64(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_sysctl_root),
+ OID_AUTO, "flows_using", CTLFLAG_RD,
+ &rs->rs_flows_using, 0,
+ "How many flows are using this interface now?");
+#ifdef DETAILED_RATELIMIT_SYSCTL
+ if (rs->rs_rlt && rs->rs_rate_cnt > 0) {
+ /* Lets display the rates */
+ int i;
+ struct sysctl_oid *rl_rates;
+ struct sysctl_oid *rl_rate_num;
+ char rate_num[16];
+ rl_rates = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_sysctl_root),
+ OID_AUTO,
+ "rate",
+ CTLFLAG_RW, 0,
+ "Ratelist");
+ for( i = 0; i < rs->rs_rate_cnt; i++) {
+ sprintf(rate_num, "%d", i);
+ rl_rate_num = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_rates),
+ OID_AUTO,
+ rate_num,
+ CTLFLAG_RW, 0,
+ "Individual Rate");
+ SYSCTL_ADD_U32(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_rate_num),
+ OID_AUTO, "flags", CTLFLAG_RD,
+ &rs->rs_rlt[i].flags, 0,
+ "Flags on this rate");
+ SYSCTL_ADD_U32(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_rate_num),
+ OID_AUTO, "pacetime", CTLFLAG_RD,
+ &rs->rs_rlt[i].time_between, 0,
+ "Time hardware inserts between 1500 byte sends");
+ SYSCTL_ADD_U64(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_rate_num),
+ OID_AUTO, "rate", CTLFLAG_RD,
+ &rs->rs_rlt[i].rate, 0,
+ "Rate in bytes per second");
+ }
+ }
+#endif
+}
+
+static void
+rs_destroy(epoch_context_t ctx)
+{
+ struct tcp_rate_set *rs;
+
+ rs = __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx);
+ mtx_lock(&rs_mtx);
+ rs->rs_flags &= ~RS_FUNERAL_SCHD;
+ if (rs->rs_flows_using == 0) {
+ /*
+ * In theory its possible (but unlikely)
+ * that while the delete was occurring
+ * and we were applying the DEAD flag
+ * someone slipped in and found the
+ * interface in a lookup. While we
+ * decided rs_flows_using were 0 and
+ * scheduling the epoch_call, the other
+ * thread incremented rs_flow_using. This
+ * is because users have a pointer and
+ * we only use the rs_flows_using in an
+ * atomic fashion, i.e. the other entities
+ * are not protected. To assure this did
+ * not occur, we check rs_flows_using here
+ * before deleting.
+ */
+ sysctl_ctx_free(&rs->sysctl_ctx);
+ free(rs->rs_rlt, M_TCPPACE);
+ free(rs, M_TCPPACE);
+ rs_number_dead--;
+ }
+ mtx_unlock(&rs_mtx);
+
+}
+
+extern counter_u64_t rate_limit_set_ok;
+extern counter_u64_t rate_limit_active;
+extern counter_u64_t rate_limit_alloc_fail;
+
+static int
+rl_attach_txrtlmt(struct ifnet *ifp,
+ uint32_t flowtype,
+ int flowid,
+ uint64_t cfg_rate,
+ struct m_snd_tag **tag)
+{
+ int error;
+ union if_snd_tag_alloc_params params = {
+ .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
+ .rate_limit.hdr.flowid = flowid,
+ .rate_limit.hdr.flowtype = flowtype,
+ .rate_limit.max_rate = cfg_rate,
+ .rate_limit.flags = M_NOWAIT,
+ };
+
+ if (ifp->if_snd_tag_alloc == NULL) {
+ error = EOPNOTSUPP;
+ } else {
+ error = ifp->if_snd_tag_alloc(ifp, &params, tag);
+ if (error == 0) {
+ if_ref((*tag)->ifp);
+ counter_u64_add(rate_limit_set_ok, 1);
+ counter_u64_add(rate_limit_active, 1);
+ } else
+ counter_u64_add(rate_limit_alloc_fail, 1);
+ }
+ return (error);
+}
+
+static void
+populate_canned_table(struct tcp_rate_set *rs, const uint64_t *rate_table_act)
+{
+ /*
+ * The internal table is "special", it
+ * is two separate ordered tables that
+ * must be merged. We get here when the
+ * adapter specifies a number of rates that
+ * covers both ranges in the table in some
+ * form.
+ */
+ int i, at_low, at_high;
+ uint8_t low_disabled = 0, high_disabled = 0;
+
+ for(i = 0, at_low = 0, at_high = RS_NEXT_ORDER_GROUP; i < rs->rs_rate_cnt; i++) {
+ rs->rs_rlt[i].flags = 0;
+ rs->rs_rlt[i].time_between = 0;
+ if ((low_disabled == 0) &&
+ (high_disabled ||
+ (rate_table_act[at_low] < rate_table_act[at_high]))) {
+ rs->rs_rlt[i].rate = rate_table_act[at_low];
+ at_low++;
+ if (at_low == RS_NEXT_ORDER_GROUP)
+ low_disabled = 1;
+ } else if (high_disabled == 0) {
+ rs->rs_rlt[i].rate = rate_table_act[at_high];
+ at_high++;
+ if (at_high == MAX_HDWR_RATES)
+ high_disabled = 1;
+ }
+ }
+}
+
/*
 * Build a new rate-set for "ifp", attach hardware rate tags where the
 * driver requires them, publish sysctl nodes, and insert the set on
 * the global int_rs list.
 *
 * Locking: called with rs_mtx held; the mutex is dropped and
 * re-acquired around sysctl tree population (which may allocate).
 *
 * Returns the new rate-set on success (including the "disabled"
 * placeholder sets for unusable/indirect interfaces), or NULL with
 * *error set when a failure reason is known.
 */
static struct tcp_rate_set *
rt_setup_new_rs(struct ifnet *ifp, int *error)
{
	struct tcp_rate_set *rs;
	const uint64_t *rate_table_act;
	uint64_t lentim, res;
	size_t sz;
	uint32_t hash_type;
	int i;
	struct if_ratelimit_query_results rl;
	struct sysctl_oid *rl_sysctl_root;
	/*
	 * We expect to enter with the
	 * mutex locked.
	 */

	if (ifp->if_ratelimit_query == NULL) {
		/*
		 * We can do nothing if we cannot
		 * get a query back from the driver.
		 */
		return (NULL);
	}
	rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO);
	if (rs == NULL) {
		if (error)
			*error = ENOMEM;
		return (NULL);
	}
	/* Ask the driver what kind of rate support it offers. */
	rl.flags = RT_NOSUPPORT;
	ifp->if_ratelimit_query(ifp, &rl);
	if (rl.flags & RT_IS_UNUSABLE) {
		/*
		 * The interface does not really support
		 * the rate-limiting.  Insert a disabled placeholder so the
		 * interface is not re-probed on every lookup.
		 *
		 * NOTE(review): rs was allocated with M_ZERO, so this
		 * memset is redundant (same in the RT_IS_INDIRECT branch).
		 */
		memset(rs, 0, sizeof(struct tcp_rate_set));
		rs->rs_ifp = ifp;
		rs->rs_if_dunit = ifp->if_dunit;
		rs->rs_flags = RS_INTF_NO_SUP;
		rs->rs_disable = 1;
		rs_number_alive++;
		sysctl_ctx_init(&rs->sysctl_ctx);
		rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
		    OID_AUTO,
		    rs->rs_ifp->if_xname,
		    CTLFLAG_RW, 0,
		    "");
		CK_LIST_INSERT_HEAD(&int_rs, rs, next);
		/* Unlock to allow the sysctl stuff to allocate */
		mtx_unlock(&rs_mtx);
		rl_add_syctl_entries(rl_sysctl_root, rs);
		/* re-lock for our caller */
		mtx_lock(&rs_mtx);
		return (rs);
	} else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) {
		/*
		 * Indirect interface (e.g. a virtual ifp stacked on real
		 * hardware): record it as a "deferred" set; lookups will
		 * chase down the real interface via rt_find_real_interface().
		 */
		memset(rs, 0, sizeof(struct tcp_rate_set));
		rs->rs_ifp = ifp;
		rs->rs_if_dunit = ifp->if_dunit;
		rs->rs_flags = RS_IS_DEFF;
		rs_number_alive++;
		sysctl_ctx_init(&rs->sysctl_ctx);
		rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
		    OID_AUTO,
		    rs->rs_ifp->if_xname,
		    CTLFLAG_RW, 0,
		    "");
		CK_LIST_INSERT_HEAD(&int_rs, rs, next);
		/* Unlock to allow the sysctl stuff to allocate */
		mtx_unlock(&rs_mtx);
		rl_add_syctl_entries(rl_sysctl_root, rs);
		/* re-lock for our caller */
		mtx_lock(&rs_mtx);
		return (rs);
	} else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) {
		/* Mellanox most likely: driver supplies a fixed rate table. */
		rs->rs_ifp = ifp;
		rs->rs_if_dunit = ifp->if_dunit;
		rs->rs_rate_cnt = rl.number_of_rates;
		rs->rs_min_seg = rl.min_segment_burst;
		rs->rs_highest_valid = 0;
		rs->rs_flow_limit = rl.max_flows;
		rs->rs_flags = RS_IS_INTF | RS_NO_PRE;
		rs->rs_disable = 0;
		rate_table_act = rl.rate_table;
	} else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) {
		/* Chelsio: we pick the rates, the hardware programs them. */
		rs->rs_ifp = ifp;
		rs->rs_if_dunit = ifp->if_dunit;
		rs->rs_rate_cnt = rl.number_of_rates;
		rs->rs_min_seg = rl.min_segment_burst;
		rs->rs_disable = 0;
		rs->rs_flow_limit = rl.max_flows;
		rate_table_act = desired_rates;
		if ((rs->rs_rate_cnt > MAX_HDWR_RATES) &&
		    (rs->rs_rate_cnt < ALL_HARDWARE_RATES)) {
			/*
			 * Our desired table is not big
			 * enough, do what we can.
			 */
			rs->rs_rate_cnt = MAX_HDWR_RATES;
		}
		if (rs->rs_rate_cnt <= RS_ORDERED_COUNT)
			rs->rs_flags = RS_IS_INTF;
		else
			rs->rs_flags = RS_IS_INTF | RS_INT_TBL;
		if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)
			rs->rs_rate_cnt = ALL_HARDWARE_RATES;
	} else {
		printf("Interface:%s unit:%d not one known to have rate-limits\n",
		    ifp->if_dname,
		    ifp->if_dunit);
		free(rs, M_TCPPACE);
		return (NULL);
	}
	sz = sizeof(struct tcp_hwrate_limit_table) * rs->rs_rate_cnt;
	rs->rs_rlt = malloc(sz, M_TCPPACE, M_NOWAIT);
	if (rs->rs_rlt == NULL) {
		if (error)
			*error = ENOMEM;
bail:
		free(rs, M_TCPPACE);
		return (NULL);
	}
	if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) {
		/*
		 * The interface supports all
		 * the rates we could possibly want.
		 */
		uint64_t rat;

		rs->rs_rlt[0].rate = 12500; /* 100k */
		rs->rs_rlt[1].rate = 25000; /* 200k */
		rs->rs_rlt[2].rate = 62500; /* 500k */
		/* Note 125000 == 1Megabit
		 * populate 1Meg - 1000meg.
		 */
		for (i = 3, rat = 125000; i < (ALL_HARDWARE_RATES-1); i++) {
			rs->rs_rlt[i].rate = rat;
			rat += 125000;
		}
		rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate = 1250000000;
	} else if (rs->rs_flags & RS_INT_TBL) {
		/* We populate this in a special way */
		populate_canned_table(rs, rate_table_act);
	} else {
		/*
		 * Just copy in the rates from
		 * the table, it is in order.
		 */
		for (i = 0; i < rs->rs_rate_cnt; i++) {
			rs->rs_rlt[i].rate = rate_table_act[i];
			rs->rs_rlt[i].time_between = 0;
			rs->rs_rlt[i].flags = 0;
		}
	}
	for (i = (rs->rs_rate_cnt - 1); i >= 0; i--) {
		/*
		 * We go backwards through the list so that if we can't get
		 * a rate and fail to init one, we have at least a chance of
		 * getting the highest one.
		 */
		rs->rs_rlt[i].ptbl = rs;
		rs->rs_rlt[i].tag = NULL;
		/*
		 * Calculate the time between a full-sized segment at this
		 * rate (usecs); clamp to at least 1.
		 */
		lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
		res = lentim / rs->rs_rlt[i].rate;
		if (res > 0)
			rs->rs_rlt[i].time_between = res;
		else
			rs->rs_rlt[i].time_between = 1;
		if (rs->rs_flags & RS_NO_PRE) {
			/* Fixed-table hardware: no per-rate tag needed yet. */
			rs->rs_rlt[i].flags = HDWRPACE_INITED;
			rs->rs_lowest_valid = i;
		} else {
			int err;
#ifdef RSS
			hash_type = M_HASHTYPE_RSS_TCP_IPV4;
#else
			hash_type = M_HASHTYPE_OPAQUE_HASH;
#endif
			err = rl_attach_txrtlmt(ifp,
			    hash_type,
			    (i + 1),
			    rs->rs_rlt[i].rate,
			    &rs->rs_rlt[i].tag);
			if (err) {
				if (i == (rs->rs_rate_cnt - 1)) {
					/*
					 * Huh - first rate and we can't get
					 * it?
					 */
					free(rs->rs_rlt, M_TCPPACE);
					if (error)
						*error = err;
					goto bail;
				} else {
					if (error)
						*error = err;
				}
				/*
				 * NOTE(review): rs_rlt was not M_ZERO'd and in
				 * the >= ALL_HARDWARE_RATES path only .rate is
				 * pre-set, so entries below this i keep
				 * uninitialized flags/tag after this break —
				 * confirm all readers honor rs_lowest_valid.
				 */
				break;
			} else {
				rs->rs_rlt[i].flags = HDWRPACE_INITED | HDWRPACE_TAGPRESENT;
				rs->rs_lowest_valid = i;
			}
		}
	}
	/* Did we get at least 1 rate? */
	if (rs->rs_rlt[(rs->rs_rate_cnt - 1)].flags & HDWRPACE_INITED)
		rs->rs_highest_valid = rs->rs_rate_cnt - 1;
	else {
		free(rs->rs_rlt, M_TCPPACE);
		goto bail;
	}
	rs_number_alive++;
	CK_LIST_INSERT_HEAD(&int_rs, rs, next);
	sysctl_ctx_init(&rs->sysctl_ctx);
	rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
	    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
	    OID_AUTO,
	    rs->rs_ifp->if_xname,
	    CTLFLAG_RW, 0,
	    "");
	/* Unlock to allow the sysctl stuff to allocate */
	mtx_unlock(&rs_mtx);
	rl_add_syctl_entries(rl_sysctl_root, rs);
	/* re-lock for our caller */
	mtx_lock(&rs_mtx);
	return (rs);
}
+
/*
 * Rate lookup specialized for the full internal table
 * (ALL_HARDWARE_RATES entries): slots 0-2 hold 100k/200k/500k bps and
 * slots 3..(ALL_HARDWARE_RATES-2) hold 1Meg..1000Meg in 1Meg steps,
 * with the last slot at 10G.  Because the layout is arithmetic we can
 * compute an index instead of scanning.  "arte" tracks an alternate
 * (substitute) entry used only when RS_PACING_SUB_OK is set.
 */
static const struct tcp_hwrate_limit_table *
tcp_int_find_suitable_rate(const struct tcp_rate_set *rs,
    uint64_t bytes_per_sec, uint32_t flags)
{
	struct tcp_hwrate_limit_table *arte = NULL, *rte = NULL;
	uint64_t mbits_per_sec, ind_calc;
	int i;

	mbits_per_sec = (bytes_per_sec * 8);
	if (flags & RS_PACING_LT) {
		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
		    (rs->rs_lowest_valid <= 2)){
			/*
			 * Smaller than 1Meg, only
			 * 3 entries can match it.
			 */
			for (i = rs->rs_lowest_valid; i < 3; i++) {
				if (bytes_per_sec <= rs->rs_rlt[i].rate) {
					rte = &rs->rs_rlt[i];
					break;
				} else if (rs->rs_rlt[i].flags & HDWRPACE_INITED) {
					arte = &rs->rs_rlt[i];
				}
			}
			goto done;
		} else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
			   (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){
			/*
			 * Larger than 1G (the majority of
			 * our table.
			 */
			if (mbits_per_sec < RS_TEN_GIGABIT_PERSEC)
				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			else
				arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			goto done;
		}
		/*
		 * If we reach here its in our table (between 1Meg - 1000Meg),
		 * just take the rounded down mbits per second, and add
		 * 1Megabit to it, from this we can calculate
		 * the index in the table.
		 */
		ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
		if ((ind_calc * RS_ONE_MEGABIT_PERSEC) != mbits_per_sec)
			ind_calc++;
		/* our table is offset by 3, we add 2 */
		ind_calc += 2;
		if (ind_calc > (ALL_HARDWARE_RATES-1)) {
			/* This should not happen */
			ind_calc = ALL_HARDWARE_RATES-1;
		}
		if ((ind_calc >= rs->rs_lowest_valid) &&
		    (ind_calc <= rs->rs_highest_valid))
			rte = &rs->rs_rlt[ind_calc];
	} else if (flags & RS_PACING_EXACT_MATCH) {
		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
		    (rs->rs_lowest_valid <= 2)){
			/* Below 1Meg only the three low slots can match. */
			for (i = rs->rs_lowest_valid; i < 3; i++) {
				if (bytes_per_sec == rs->rs_rlt[i].rate) {
					rte = &rs->rs_rlt[i];
					break;
				}
			}
		} else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
			   (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
			/* > 1Gbps only one rate */
			if (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) {
				/* Its 10G wow */
				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			}
		} else {
			/* Ok it must be a exact meg (its between 1G and 1Meg) */
			ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
			if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
				/* its an exact Mbps */
				ind_calc += 2;
				if (ind_calc > (ALL_HARDWARE_RATES-1)) {
					/* This should not happen */
					ind_calc = ALL_HARDWARE_RATES-1;
				}
				if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED)
					rte = &rs->rs_rlt[ind_calc];
			}
		}
	} else {
		/* we want greater than the requested rate */
		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
		    (rs->rs_lowest_valid <= 2)){
			arte = &rs->rs_rlt[3]; /* set alternate to 1Meg */
			for (i = 2; i >= rs->rs_lowest_valid; i--) {
				if (bytes_per_sec < rs->rs_rlt[i].rate) {
					rte = &rs->rs_rlt[i];
					break;
				} else if ((flags & RS_PACING_GEQ) &&
					   (bytes_per_sec == rs->rs_rlt[i].rate)) {
					rte = &rs->rs_rlt[i];
					break;
				} else {
					arte = &rs->rs_rlt[i]; /* new alternate */
				}
			}
		} else if (mbits_per_sec > RS_ONE_GIGABIT_PERSEC) {
			if ((bytes_per_sec < rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
			    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){
				/* Our top rate is larger than the request */
				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			} else if ((flags & RS_PACING_GEQ) &&
				   (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
				   (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
				/* It matches our top rate */
				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			} else if (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED) {
				/* The top rate is an alternative */
				arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
			}
		} else {
			/* Its in our range 1Meg - 1Gig */
			if (flags & RS_PACING_GEQ) {
				ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
				if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
					if (ind_calc > (ALL_HARDWARE_RATES-1)) {
						/* This should not happen */
						ind_calc = (ALL_HARDWARE_RATES-1);
					}
					/*
					 * NOTE(review): unlike the other index
					 * computations, no HDWRPACE_INITED check
					 * before using rs_rlt[ind_calc] here, and
					 * the exact-match branch skips the "+2"
					 * slot offset applied elsewhere — confirm
					 * intended.
					 */
					rte = &rs->rs_rlt[ind_calc];
				}
				goto done;
			}
			/* Round up to the next whole Meg, then index (offset 3-1). */
			ind_calc = (mbits_per_sec + (RS_ONE_MEGABIT_PERSEC-1))/RS_ONE_MEGABIT_PERSEC;
			ind_calc += 2;
			if (ind_calc > (ALL_HARDWARE_RATES-1)) {
				/* This should not happen */
				ind_calc = ALL_HARDWARE_RATES-1;
			}
			if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED)
				rte = &rs->rs_rlt[ind_calc];
		}
	}
done:
	if ((rte == NULL) &&
	    (arte != NULL) &&
	    (flags & RS_PACING_SUB_OK)) {
		/* We can use the substitute */
		rte = arte;
	}
	return (rte);
}
+
static const struct tcp_hwrate_limit_table *
tcp_find_suitable_rate(const struct tcp_rate_set *rs, uint64_t bytes_per_sec, uint32_t flags)
{
	/**
	 * Hunt the rate table with the restrictions in flags and find a
	 * suitable rate if possible.
	 * RS_PACING_EXACT_MATCH - look for an exact match to rate.
	 * RS_PACING_GT - must be greater than.
	 * RS_PACING_GEQ - must be greater than or equal.
	 * RS_PACING_LT - must be less than.
	 * RS_PACING_SUB_OK - If we don't meet criteria a
	 *                    substitute is ok.
	 *
	 * Returns NULL when no entry satisfies the flags (and no
	 * substitute was permitted).
	 */
	int i, matched;
	struct tcp_hwrate_limit_table *rte = NULL;

	if ((rs->rs_flags & RS_INT_TBL) &&
	    (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)) {
		/*
		 * Here we don't want to paw thru
		 * a big table, we have everything
		 * from 1Meg - 1000Meg in 1Meg increments.
		 * Use an alternate method to "lookup".
		 */
		return (tcp_int_find_suitable_rate(rs, bytes_per_sec, flags));
	}
	if ((flags & RS_PACING_LT) ||
	    (flags & RS_PACING_EXACT_MATCH)) {
		/*
		 * For exact and less than we go forward through the table.
		 * This way when we find one larger we stop (exact was a
		 * toss up).
		 */
		for (i = rs->rs_lowest_valid, matched = 0; i <= rs->rs_highest_valid; i++) {
			if ((flags & RS_PACING_EXACT_MATCH) &&
			    (bytes_per_sec == rs->rs_rlt[i].rate)) {
				rte = &rs->rs_rlt[i];
				matched = 1;
				break;
			} else if ((flags & RS_PACING_LT) &&
			    (bytes_per_sec <= rs->rs_rlt[i].rate)) {
				rte = &rs->rs_rlt[i];
				matched = 1;
				break;
			}
			/*
			 * NOTE(review): this early-exit assumes the table is
			 * ascending; with an ascending table and the checks
			 * above, this condition appears unreachable — confirm
			 * table ordering guarantees.
			 */
			if (bytes_per_sec > rs->rs_rlt[i].rate)
				break;
		}
		if ((matched == 0) &&
		    (flags & RS_PACING_LT) &&
		    (flags & RS_PACING_SUB_OK)) {
			/* Kick in a substitute (the lowest) */
			rte = &rs->rs_rlt[rs->rs_lowest_valid];
		}
	} else {
		/*
		 * Here we go backward through the table so that we can find
		 * the one greater in theory faster (but its probably a
		 * wash).
		 */
		for (i = rs->rs_highest_valid, matched = 0; i >= rs->rs_lowest_valid; i--) {
			if (rs->rs_rlt[i].rate > bytes_per_sec) {
				/* A possible candidate */
				rte = &rs->rs_rlt[i];
			}
			if ((flags & RS_PACING_GEQ) &&
			    (bytes_per_sec == rs->rs_rlt[i].rate)) {
				/* An exact match and we want equal */
				matched = 1;
				rte = &rs->rs_rlt[i];
				break;
			} else if (rte) {
				/*
				 * Found one that is larger than but don't
				 * stop, there may be a more closer match.
				 */
				matched = 1;
			}
			if (rs->rs_rlt[i].rate < bytes_per_sec) {
				/*
				 * We found a table entry that is smaller,
				 * stop there will be none greater or equal.
				 */
				break;
			}
		}
		if ((matched == 0) &&
		    (flags & RS_PACING_SUB_OK)) {
			/* Kick in a substitute (the highest) */
			rte = &rs->rs_rlt[rs->rs_highest_valid];
		}
	}
	return (rte);
}
+
+static struct ifnet *
+rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error)
+{
+ struct ifnet *tifp;
+ struct m_snd_tag *tag;
+ union if_snd_tag_alloc_params params = {
+ .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
+ .rate_limit.hdr.flowid = 1,
+ .rate_limit.max_rate = COMMON_RATE,
+ .rate_limit.flags = M_NOWAIT,
+ };
+ int err;
+#ifdef RSS
+ params.rate_limit.hdr.flowtype = ((inp->inp_vflag & INP_IPV6) ?
+ M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4);
+#else
+ params.rate_limit.hdr.flowtype = M_HASHTYPE_OPAQUE_HASH;
+#endif
+ tag = NULL;
+ if (ifp->if_snd_tag_alloc) {
+ if (error)
+ *error = ENODEV;
+ return (NULL);
+ }
+ err = ifp->if_snd_tag_alloc(ifp, &params, &tag);
+ if (err) {
+ /* Failed to setup a tag? */
+ if (error)
+ *error = err;
+ return (NULL);
+ }
+ tifp = tag->ifp;
+ tifp->if_snd_tag_free(tag);
+ return (tifp);
+}
+
+static const struct tcp_hwrate_limit_table *
+rt_setup_rate(struct inpcb *inp, struct ifnet *ifp, uint64_t bytes_per_sec,
+ uint32_t flags, int *error)
+{
+ /* First lets find the interface if it exists */
+ const struct tcp_hwrate_limit_table *rte;
+ struct tcp_rate_set *rs;
+ struct epoch_tracker et;
+ int err;
+
+ epoch_enter_preempt(net_epoch_preempt, &et);
+use_real_interface:
+ CK_LIST_FOREACH(rs, &int_rs, next) {
+ /*
+ * Note we don't look with the lock since we either see a
+ * new entry or will get one when we try to add it.
+ */
+ if (rs->rs_flags & RS_IS_DEAD) {
+ /* The dead are not looked at */
+ continue;
+ }
+ if ((rs->rs_ifp == ifp) &&
+ (rs->rs_if_dunit == ifp->if_dunit)) {
+ /* Ok we found it */
+ break;
+ }
+ }
+ if ((rs == NULL) ||
+ (rs->rs_flags & RS_INTF_NO_SUP) ||
+ (rs->rs_flags & RS_IS_DEAD)) {
+ /*
+ * This means we got a packet *before*
+ * the IF-UP was processed below, <or>
+ * while or after we already received an interface
+ * departed event. In either case we really don't
+ * want to do anything with pacing, in
+ * the departing case the packet is not
+ * going to go very far. The new case
+ * might be arguable, but its impossible
+ * to tell from the departing case.
+ */
+ if (rs->rs_disable && error)
+ *error = ENODEV;
+ epoch_exit_preempt(net_epoch_preempt, &et);
+ return (NULL);
+ }
+
+ if ((rs == NULL) || (rs->rs_disable != 0)) {
+ if (rs->rs_disable && error)
+ *error = ENOSPC;
+ epoch_exit_preempt(net_epoch_preempt, &et);
+ return (NULL);
+ }
+ if (rs->rs_flags & RS_IS_DEFF) {
+ /* We need to find the real interface */
+ struct ifnet *tifp;
+
+ tifp = rt_find_real_interface(ifp, inp, error);
+ if (tifp == NULL) {
+ if (rs->rs_disable && error)
+ *error = ENOTSUP;
+ epoch_exit_preempt(net_epoch_preempt, &et);
+ return (NULL);
+ }
+ goto use_real_interface;
+ }
+ if (rs->rs_flow_limit &&
+ ((rs->rs_flows_using + 1) > rs->rs_flow_limit)) {
+ if (error)
+ *error = ENOSPC;
+ epoch_exit_preempt(net_epoch_preempt, &et);
+ return (NULL);
+ }
+ rte = tcp_find_suitable_rate(rs, bytes_per_sec, flags);
+ if (rte) {
+ err = in_pcbattach_txrtlmt(inp, rs->rs_ifp,
+ inp->inp_flowtype,
+ inp->inp_flowid,
+ rte->rate,
+ &inp->inp_snd_tag);
+ if (err) {
+ /* Failed to attach */
+ if (error)
+ *error = err;
+ rte = NULL;
+ }
+ }
+ if (rte) {
+ /*
+ * We use an atomic here for accounting so we don't have to
+ * use locks when freeing.
+ */
+ atomic_add_long(&rs->rs_flows_using, 1);
+ }
+ epoch_exit_preempt(net_epoch_preempt, &et);
+ return (rte);
+}
+
+static void
+tcp_rl_ifnet_link(void *arg __unused, struct ifnet *ifp, int link_state)
+{
+ int error;
+ struct tcp_rate_set *rs;
+
+ if (((ifp->if_capabilities & IFCAP_TXRTLMT) == 0) ||
+ (link_state != LINK_STATE_UP)) {
+ /*
+ * We only care on an interface going up that is rate-limit
+ * capable.
+ */
+ return;
+ }
+ mtx_lock(&rs_mtx);
+ CK_LIST_FOREACH(rs, &int_rs, next) {
+ if ((rs->rs_ifp == ifp) &&
+ (rs->rs_if_dunit == ifp->if_dunit)) {
+ /* We already have initialized this guy */
+ mtx_unlock(&rs_mtx);
+ return;
+ }
+ }
+ rt_setup_new_rs(ifp, &error);
+ mtx_unlock(&rs_mtx);
+}
+
/*
 * ifnet departure event handler: unlink the departing interface's
 * rate-set, mark it dead, detach every hardware tag, and — if no
 * flows still reference it — schedule its destruction after the
 * current net epoch.  If flows remain, the last tcp_rel_pacing_rate()
 * schedules the funeral instead.
 */
static void
tcp_rl_ifnet_departure(void *arg __unused, struct ifnet *ifp)
{
	struct tcp_rate_set *rs, *nrs;
	struct ifnet *tifp;
	int i;

	mtx_lock(&rs_mtx);
	CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) {
		if ((rs->rs_ifp == ifp) &&
		    (rs->rs_if_dunit == ifp->if_dunit)) {
			CK_LIST_REMOVE(rs, next);
			rs_number_alive--;
			rs_number_dead++;
			rs->rs_flags |= RS_IS_DEAD;
			for (i = 0; i < rs->rs_rate_cnt; i++) {
				if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
					/* Release the tag on its (possibly real) ifp. */
					tifp = rs->rs_rlt[i].tag->ifp;
					in_pcbdetach_tag(tifp, rs->rs_rlt[i].tag);
					rs->rs_rlt[i].tag = NULL;
				}
				rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
			}
			if (rs->rs_flows_using == 0) {
				/*
				 * No references left, so we can schedule the
				 * destruction after the epoch (with a caveat).
				 */
				rs->rs_flags |= RS_FUNERAL_SCHD;
				epoch_call(net_epoch, &rs->rs_epoch_ctx, rs_destroy);
			}
			break;
		}
	}
	mtx_unlock(&rs_mtx);
}
+
+static void
+tcp_rl_shutdown(void *arg __unused, int howto __unused)
+{
+ struct tcp_rate_set *rs, *nrs;
+ struct ifnet *tifp;
+ int i;
+
+ mtx_lock(&rs_mtx);
+ CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) {
+ CK_LIST_REMOVE(rs, next);
+ rs_number_alive--;
+ rs_number_dead++;
+ rs->rs_flags |= RS_IS_DEAD;
+ for (i = 0; i < rs->rs_rate_cnt; i++) {
+ if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
+ tifp = rs->rs_rlt[i].tag->ifp;
+ in_pcbdetach_tag(tifp, rs->rs_rlt[i].tag);
+ rs->rs_rlt[i].tag = NULL;
+ }
+ rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
+ }
+ if (rs->rs_flows_using != 0) {
+ /*
+ * We dont hold a reference
+ * so we have nothing left to
+ * do.
+ */
+ } else {
+ /*
+ * No references left, so we can destroy it
+ * after the epoch.
+ */
+ rs->rs_flags |= RS_FUNERAL_SCHD;
+ epoch_call(net_epoch, &rs->rs_epoch_ctx, rs_destroy);
+ }
+ }
+ mtx_unlock(&rs_mtx);
+}
+
+const struct tcp_hwrate_limit_table *
+tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
+ uint64_t bytes_per_sec, int flags, int *error)
+{
+ const struct tcp_hwrate_limit_table *rte;
+
+ if (tp->t_inpcb->inp_snd_tag == NULL) {
+ /*
+ * We are setting up a rate for the first time.
+ */
+ if ((ifp->if_capabilities & IFCAP_TXRTLMT) == 0) {
+ /* Not supported by the egress */
+ if (error)
+ *error = ENODEV;
+ return (NULL);
+ }
+#ifdef KERN_TLS
+ if (tp->t_inpcb->inp_socket->so_snd.sb_tls_flags & SB_TLS_IFNET) {
+ /*
+ * We currently can't do both TLS and hardware
+ * pacing
+ */
+ if (error)
+ *error = EINVAL;
+ return (NULL);
+ }
+#endif
+ rte = rt_setup_rate(tp->t_inpcb, ifp, bytes_per_sec, flags, error);
+ } else {
+ /*
+ * We are modifying a rate, wrong interface?
+ */
+ if (error)
+ *error = EINVAL;
+ rte = NULL;
+ }
+ return (rte);
+}
+
/*
 * Public entry: change an existing hardware pacing rate.  If the old
 * rate-set is dead or the interface changed underneath us, release
 * the old rate and set up anew; otherwise pick a new entry from the
 * same set and modify the existing send tag in place.
 */
const struct tcp_hwrate_limit_table *
tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
    struct tcpcb *tp, struct ifnet *ifp,
    uint64_t bytes_per_sec, int flags, int *error)
{
	const struct tcp_hwrate_limit_table *nrte;
	const struct tcp_rate_set *rs;
	int is_indirect = 0;
	int err;


	if ((tp->t_inpcb->inp_snd_tag == NULL) ||
	    (crte == NULL)) {
		/* Wrong interface */
		if (error)
			*error = EINVAL;
		return (NULL);
	}
	rs = crte->ptbl;
	if ((rs->rs_flags & RS_IS_DEAD) ||
	    (crte->flags & HDWRPACE_IFPDEPARTED)) {
		/* Release the rate, and try anew */
re_rate:
		tcp_rel_pacing_rate(crte, tp);
		nrte = tcp_set_pacing_rate(tp, ifp,
		    bytes_per_sec, flags, error);
		return (nrte);
	}
	/*
	 * NOTE(review): rs_flags holds RS_* bits, but RT_IS_INDIRECT is an
	 * if_ratelimit query flag; the flag actually stored for indirect
	 * interfaces in rt_setup_new_rs() is RS_IS_DEFF.  Confirm the
	 * intended bit — as written this may never (or wrongly) match.
	 */
	if ((rs->rs_flags & RT_IS_INDIRECT ) == RT_IS_INDIRECT)
		is_indirect = 1;
	else
		is_indirect = 0;
	if ((is_indirect == 0) &&
	    ((ifp != rs->rs_ifp) ||
	    (ifp->if_dunit != rs->rs_if_dunit))) {
		/*
		 * Something changed, the user is not pointing to the same
		 * ifp? Maybe a route updated on this guy?
		 */
		goto re_rate;
	} else if (is_indirect) {
		/*
		 * For indirect we have to dig in and find the real interface.
		 */
		struct ifnet *rifp;

		rifp = rt_find_real_interface(ifp, tp->t_inpcb, error);
		if (rifp == NULL) {
			/* Can't find it? */
			goto re_rate;
		}
		/*
		 * NOTE(review): the unit comparison uses ifp->if_dunit (the
		 * indirect ifp) while the pointer comparison uses rifp; verify
		 * whether rifp->if_dunit was intended here.
		 */
		if ((rifp != rs->rs_ifp) ||
		    (ifp->if_dunit != rs->rs_if_dunit)) {
			goto re_rate;
		}
	}
	nrte = tcp_find_suitable_rate(rs, bytes_per_sec, flags);
	if (nrte == crte) {
		/* No change */
		if (error)
			*error = 0;
		return (crte);
	}
	if (nrte == NULL) {
		/* Release the old rate */
		tcp_rel_pacing_rate(crte, tp);
		return (NULL);
	}
	/* Change rates to our new entry */
	err = in_pcbmodify_txrtlmt(tp->t_inpcb, nrte->rate);
	if (err) {
		if (error)
			*error = err;
		return (NULL);
	}
	if (error)
		*error = 0;
	return (nrte);
}
+
/*
 * Public entry: release a flow's reference on its rate entry and
 * detach the inpcb's send tag.  When this drops the rate-set's flow
 * count to zero and the set is already dead (interface departed)
 * with no funeral yet scheduled, queue its destruction after the
 * current net epoch.
 */
void
tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte, struct tcpcb *tp)
{
	const struct tcp_rate_set *crs;
	struct tcp_rate_set *rs;
	uint64_t pre;

	crs = crte->ptbl;
	/*
	 * Now we must break the const
	 * in order to release our refcount.
	 */
	rs = __DECONST(struct tcp_rate_set *, crs);
	/* pre is the value *before* the decrement. */
	pre = atomic_fetchadd_long(&rs->rs_flows_using, -1);
	if (pre == 1) {
		mtx_lock(&rs_mtx);
		/*
		 * Is it dead?
		 */
		if ((rs->rs_flags & RS_IS_DEAD) &&
		    ((rs->rs_flags & RS_FUNERAL_SCHD) == 0)){
			/*
			 * We were the last,
			 * and a funeral is not pending, so
			 * we must schedule it.
			 */
			rs->rs_flags |= RS_FUNERAL_SCHD;
			epoch_call(net_epoch, &rs->rs_epoch_ctx, rs_destroy);
		}
		mtx_unlock(&rs_mtx);
	}
	in_pcbdetach_txrtlmt(tp->t_inpcb);
}
+
/* Event handler registrations made in tcp_rs_init(). */
static eventhandler_tag rl_ifnet_departs;
static eventhandler_tag rl_ifnet_arrives;
static eventhandler_tag rl_shutdown_start;
+
+static void
+tcp_rs_init(void *st __unused)
+{
+ CK_LIST_INIT(&int_rs);
+ rs_number_alive = 0;
+ rs_number_dead = 0;;
+ mtx_init(&rs_mtx, "tcp_rs_mtx", "rsmtx", MTX_DEF);
+ rl_ifnet_departs = EVENTHANDLER_REGISTER(ifnet_departure_event,
+ tcp_rl_ifnet_departure,
+ NULL, EVENTHANDLER_PRI_ANY);
+ rl_ifnet_arrives = EVENTHANDLER_REGISTER(ifnet_link_event,
+ tcp_rl_ifnet_link,
+ NULL, EVENTHANDLER_PRI_ANY);
+ rl_shutdown_start = EVENTHANDLER_REGISTER(shutdown_pre_sync,
+ tcp_rl_shutdown, NULL,
+ SHUTDOWN_PRI_FIRST);
+ printf("TCP_ratelimit: Is now initialized\n");
+}
+
+SYSINIT(tcp_rl_init, SI_SUB_SMP + 1, SI_ORDER_ANY, tcp_rs_init, NULL);
+#endif

File Metadata

Mime Type
text/plain
Expires
Sun, Jan 12, 11:26 PM (21 h, 38 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
15773749
Default Alt Text
D20953.diff (55 KB)

Event Timeline