Index: conf/files
===================================================================
--- conf/files
+++ conf/files
@@ -4255,6 +4255,7 @@
 netinet/tcp_output.c	optional inet | inet6
 netinet/tcp_offload.c	optional tcp_offload inet | tcp_offload inet6
 netinet/tcp_hpts.c	optional tcphpts inet | tcphpts inet6
+netinet/tcp_ratelimit.c	optional ratelimit inet | ratelimit inet6
 netinet/tcp_pcap.c	optional inet tcppcap | inet6 tcppcap \
	compile-with "${NORMAL_C} ${NO_WNONNULL}"
 netinet/tcp_reass.c	optional inet | inet6
Index: dev/cxgbe/adapter.h
===================================================================
--- dev/cxgbe/adapter.h
+++ dev/cxgbe/adapter.h
@@ -1247,6 +1247,7 @@
 int cxgbe_snd_tag_query(struct m_snd_tag *, union if_snd_tag_query_params *);
 void cxgbe_snd_tag_free(struct m_snd_tag *);
 void cxgbe_snd_tag_free_locked(struct cxgbe_snd_tag *);
+void cxgbe_ratelimit_query(struct ifnet *, struct if_ratelimit_query_results *);
 #endif
 
 /* t4_filter.c */
Index: dev/cxgbe/t4_main.c
===================================================================
--- dev/cxgbe/t4_main.c
+++ dev/cxgbe/t4_main.c
@@ -1658,6 +1658,7 @@
 	ifp->if_snd_tag_modify = cxgbe_snd_tag_modify;
 	ifp->if_snd_tag_query = cxgbe_snd_tag_query;
 	ifp->if_snd_tag_free = cxgbe_snd_tag_free;
+	ifp->if_ratelimit_query = cxgbe_ratelimit_query;
 #endif
 	ifp->if_capabilities = T4_CAP;
Index: dev/cxgbe/t4_sched.c
===================================================================
--- dev/cxgbe/t4_sched.c
+++ dev/cxgbe/t4_sched.c
@@ -903,4 +903,35 @@
 	}
 	mtx_unlock(&cst->lock);
 }
+
+#define CXGBE_MAX_FLOWS 4000	/* Testing shows this is all the adapter can do so far */
+#define CXGBE_UNIQUE_RATE_COUNT 16	/* Number of unique rates that can be set up */
+
+void
+cxgbe_ratelimit_query(struct ifnet *ifp __unused,
+    struct if_ratelimit_query_results *q)
+{
+	/*
+	 * This is a skeleton and needs future work
+	 * by the driver maintainers. It should be
+	 * enhanced to look at the specific type of
+	 * interface and select appropriate values
+	 * for these settings. This example matches
+	 * an earlier card (T5), which supports a
+	 * maximum of 16 rates that are set up on a
+	 * first-come, first-served basis (thus the
+	 * flags value RT_IS_SELECTABLE).
+	 * If it were a fixed table we would instead set up a
+	 * const array (see mlx5 for an example). Note the card
+	 * tested can reasonably support only 4000 flows before
+	 * the adapter has issues with sending, so here
+	 * we limit the number of flows using hardware
+	 * pacing to that number; other cards may
+	 * be able to raise or eliminate this limit.
+	 */
+	q->rate_table = NULL;
+	q->flags = RT_IS_SELECTABLE;
+	q->max_flows = CXGBE_MAX_FLOWS;
+	q->number_of_rates = CXGBE_UNIQUE_RATE_COUNT;
+	q->min_segment_burst = 4;	/* Driver emits 4 in a burst */
+}
 #endif
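For illustration, a minimal sketch of how a kernel consumer is expected to use this hook. This is not part of the patch; it assumes `ifp` is a valid, referenced interface and uses the RT_* flags defined in the net/if_var.h hunk below:

    struct if_ratelimit_query_results q;

    q.flags = RT_NOSUPPORT;	/* preset in case the driver leaves it untouched */
    if (ifp->if_ratelimit_query != NULL)
        ifp->if_ratelimit_query(ifp, &q);
    if (q.flags == RT_NOSUPPORT)
        return;			/* no hardware pacing on this path */
    if (q.flags & RT_IS_SELECTABLE) {
        /*
         * Chelsio-style: no table is returned; the stack picks
         * up to q.number_of_rates rates and installs them itself.
         */
    }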
Index: dev/mlx5/mlx5_en/mlx5_en_main.c
===================================================================
--- dev/mlx5/mlx5_en/mlx5_en_main.c
+++ dev/mlx5/mlx5_en/mlx5_en_main.c
@@ -4070,7 +4070,49 @@
 	}
 }
 
+#define NUM_HDWR_RATES_MLX 13
+/* Rates are in bytes per second; the comments give the bit rate. */
+static const uint64_t adapter_rates_mlx[NUM_HDWR_RATES_MLX] = {
+	135375,			/* 1,083,000 */
+	180500,			/* 1,444,000 */
+	270750,			/* 2,166,000 */
+	361000,			/* 2,888,000 */
+	541500,			/* 4,332,000 */
+	721875,			/* 5,775,000 */
+	1082875,		/* 8,663,000 */
+	1443875,		/* 11,551,000 */
+	2165750,		/* 17,326,000 */
+	2887750,		/* 23,102,000 */
+	4331625,		/* 34,653,000 */
+	5775500,		/* 46,204,000 */
+	8663125			/* 69,305,000 */
+};
+
 static void
+mlx5e_ratelimit_query(struct ifnet *ifp __unused, struct if_ratelimit_query_results *q)
+{
+	/*
+	 * This function needs updating by the driver maintainer!
+	 * For the MLX card there are currently (ConnectX-4?) 13
+	 * pre-set rates; later adapters (ConnectX-5, 6, 7?) may differ.
+	 *
+	 * This will change based on later adapters,
+	 * and this code should be updated to look at ifp
+	 * and figure out the specific adapter type and
+	 * settings, i.e. how many rates there are as well
+	 * as whether they are fixed (as shown here) or
+	 * dynamic (as on the Chelsio T4). Also, if there
+	 * is a maximum number of flows that the adapter
+	 * can handle, that too needs to be reflected in
+	 * the max_flows field.
+	 */
+	q->rate_table = adapter_rates_mlx;
+	q->flags = RT_IS_FIXED_TABLE;
+	q->max_flows = 0;	/* mlx has no limit */
+	q->number_of_rates = NUM_HDWR_RATES_MLX;
+	q->min_segment_burst = 1;
+}
+
+static void
 mlx5e_snd_tag_free(struct m_snd_tag *pmt)
 {
 	struct mlx5e_snd_tag *tag =
@@ -4155,7 +4197,9 @@
 	ifp->if_snd_tag_free = mlx5e_snd_tag_free;
 	ifp->if_snd_tag_modify = mlx5e_snd_tag_modify;
 	ifp->if_snd_tag_query = mlx5e_snd_tag_query;
-
+#ifdef RATELIMIT
+	ifp->if_ratelimit_query = mlx5e_ratelimit_query;
+#endif
 	/* set TSO limits so that we don't have to drop TX packets */
 	ifp->if_hw_tsomax = MLX5E_MAX_TX_PAYLOAD_SIZE - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
 	ifp->if_hw_tsomaxsegcount = MLX5E_MAX_TX_MBUF_FRAGS - 1 /* hdr */;
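The table entries are bytes per second; multiplying by eight reproduces the bit rates shown in the comments. A quick standalone userland check (illustrative only, not part of the patch):

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        /* First three mlx5 table entries, in bytes per second. */
        static const uint64_t rates[] = { 135375, 180500, 270750 };
        size_t i;

        for (i = 0; i < sizeof(rates) / sizeof(rates[0]); i++)
            printf("%ju bytes/s = %ju bits/s\n",
                (uintmax_t)rates[i], (uintmax_t)(rates[i] * 8));
        return (0);
    }

This prints 1,083,000, 1,444,000 and 2,166,000 bits/s, matching the comments above.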
Index: net/if_dead.c
===================================================================
--- net/if_dead.c
+++ net/if_dead.c
@@ -126,6 +126,23 @@
 {
 }
 
+static void
+ifdead_ratelimit_query(struct ifnet *ifp __unused,
+    struct if_ratelimit_query_results *q)
+{
+	/*
+	 * A dead interface does not support rate limiting,
+	 * so report no support. It is not clear why we would
+	 * ever specify a capability flag on such an interface
+	 * that says we do.
+	 */
+	q->rate_table = NULL;
+	q->flags = RT_NOSUPPORT;
+	q->max_flows = 0;
+	q->number_of_rates = 0;
+}
+
 void
 if_dead(struct ifnet *ifp)
 {
@@ -142,4 +159,5 @@
 	ifp->if_snd_tag_modify = ifdead_snd_tag_modify;
 	ifp->if_snd_tag_query = ifdead_snd_tag_query;
 	ifp->if_snd_tag_free = ifdead_snd_tag_free;
+	ifp->if_ratelimit_query = ifdead_ratelimit_query;
 }
Index: net/if_lagg.c
===================================================================
--- net/if_lagg.c
+++ net/if_lagg.c
@@ -144,6 +144,8 @@
 static int lagg_snd_tag_query(struct m_snd_tag *,
     union if_snd_tag_query_params *);
 static void lagg_snd_tag_free(struct m_snd_tag *);
+static void lagg_ratelimit_query(struct ifnet *,
+    struct if_ratelimit_query_results *);
 #endif
 static int lagg_setmulti(struct lagg_port *);
 static int lagg_clrmulti(struct lagg_port *);
@@ -537,6 +539,7 @@
 	ifp->if_snd_tag_modify = lagg_snd_tag_modify;
 	ifp->if_snd_tag_query = lagg_snd_tag_query;
 	ifp->if_snd_tag_free = lagg_snd_tag_free;
+	ifp->if_ratelimit_query = lagg_ratelimit_query;
 #endif
 	ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS;
 
@@ -1670,6 +1673,20 @@
 	free(lst, M_LAGG);
 }
 
+static void
+lagg_ratelimit_query(struct ifnet *ifp __unused, struct if_ratelimit_query_results *q)
+{
+	/*
+	 * For lagg, we have an indirect
+	 * interface. The caller needs to
+	 * get a rate-limit tag on the actual
+	 * interface the flow will go on.
+	 */
+	q->rate_table = NULL;
+	q->flags = RT_IS_INDIRECT;
+	q->max_flows = 0;
+	q->number_of_rates = 0;
+}
 #endif
 
 static int
Index: net/if_var.h
===================================================================
--- net/if_var.h
+++ net/if_var.h
@@ -203,6 +203,8 @@
 struct if_snd_tag_alloc_rate_limit {
 	struct if_snd_tag_alloc_header hdr;
 	uint64_t max_rate;	/* in bytes/s */
+	uint32_t flags;		/* M_NOWAIT or M_WAITOK */
+	uint32_t reserved;	/* alignment */
 };
 
 struct if_snd_tag_rate_limit_params {
@@ -210,7 +212,7 @@
 	uint32_t queue_level;	/* 0 (empty) .. 65535 (full) */
 #define	IF_SND_QUEUE_LEVEL_MIN 0
 #define	IF_SND_QUEUE_LEVEL_MAX 65535
-	uint32_t reserved;	/* padding */
+	uint32_t flags;		/* M_NOWAIT or M_WAITOK */
 };
 
 union if_snd_tag_alloc_params {
@@ -229,12 +231,37 @@
 	struct if_snd_tag_rate_limit_params unlimited;
 };
 
+/* Query return flags */
+#define RT_NOSUPPORT      0x00000000	/* Not supported */
+#define RT_IS_INDIRECT    0x00000001	/*
					 * Interface like a lagg, select
					 * the actual interface for
					 * capabilities.
					 */
+#define RT_IS_SELECTABLE  0x00000002	/*
					 * No rate table, you select
					 * rates and the first
					 * number_of_rates are created.
					 */
+#define RT_IS_FIXED_TABLE 0x00000004	/* A fixed table is attached */
+
+struct if_ratelimit_query_results {
+	const uint64_t *rate_table;	/* Pointer to table if present */
+	uint32_t flags;			/* Flags indicating results */
+	uint32_t max_flows;		/* Max flows using, 0=unlimited */
+	uint32_t number_of_rates;	/* How many unique rates can be created */
+	uint32_t min_segment_burst;	/* The amount the adapter bursts at each send */
+};
+
 typedef int (if_snd_tag_alloc_t)(struct ifnet *, union if_snd_tag_alloc_params *,
     struct m_snd_tag **);
 typedef int (if_snd_tag_modify_t)(struct m_snd_tag *, union if_snd_tag_modify_params *);
 typedef int (if_snd_tag_query_t)(struct m_snd_tag *, union if_snd_tag_query_params *);
 typedef void (if_snd_tag_free_t)(struct m_snd_tag *);
+typedef void (if_ratelimit_query_t)(struct ifnet *,
+    struct if_ratelimit_query_results *);
+
 /*
  * Structure defining a network interface.
  */
@@ -374,6 +401,7 @@
 	if_snd_tag_modify_t *if_snd_tag_modify;
 	if_snd_tag_query_t *if_snd_tag_query;
 	if_snd_tag_free_t *if_snd_tag_free;
+	if_ratelimit_query_t *if_ratelimit_query;
 
 	/* Ethernet PCP */
 	uint8_t if_pcp;
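A hedged sketch of how a consumer might classify the query results defined above; `example_classify` and its error handling are hypothetical:

    static void
    example_classify(struct ifnet *ifp)
    {
        struct if_ratelimit_query_results q = { .flags = RT_NOSUPPORT };

        if (ifp->if_ratelimit_query == NULL)
            return;
        ifp->if_ratelimit_query(ifp, &q);
        if (q.flags & RT_IS_INDIRECT) {
            /* lagg-style: redo the query on the underlying port */
        } else if (q.flags & RT_IS_FIXED_TABLE) {
            /* q.rate_table[0 .. q.number_of_rates - 1] is valid */
        } else if (q.flags & RT_IS_SELECTABLE) {
            /* no table; request up to q.number_of_rates rates yourself */
        } else {
            /* RT_NOSUPPORT (0): no hardware pacing here */
        }
    }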
+ */ + ifp->if_snd_tag_free(mst); + + /* release reference count on network interface */ + if_rele(ifp); + counter_u64_add(rate_limit_active, -1); +} + /* * Free an existing TX rate limit tag based on the "inp->inp_snd_tag", * if any: @@ -3300,6 +3343,56 @@ m_snd_tag_rele(mst); } +int +in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate) +{ + int error; + + /* + * If the existing send tag is for the wrong interface due to + * a route change, first drop the existing tag. Set the + * CHANGED flag so that we will keep trying to allocate a new + * tag if we fail to allocate one this time. + */ + if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) { + in_pcbdetach_txrtlmt(inp); + inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; + } + + /* + * NOTE: When attaching to a network interface a reference is + * made to ensure the network interface doesn't go away until + * all ratelimit connections are gone. The network interface + * pointers compared below represent valid network interfaces, + * except when comparing towards NULL. + */ + if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) { + error = 0; + } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) { + if (inp->inp_snd_tag != NULL) + in_pcbdetach_txrtlmt(inp); + error = 0; + } else if (inp->inp_snd_tag == NULL) { + /* + * In order to utilize packet pacing with RSS, we need + * to wait until there is a valid RSS hash before we + * can proceed: + */ + if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) { + error = EAGAIN; + } else { + error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb), + mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag); + } + } else { + error = in_pcbmodify_txrtlmt(inp, max_pacing_rate); + } + if (error == 0 || error == EOPNOTSUPP) + inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; + + return (error); +} + /* * This function should be called when the INP_RATE_LIMIT_CHANGED flag * is set in the fast path and will attach/detach/modify the TX rate @@ -3342,47 +3435,8 @@ */ max_pacing_rate = socket->so_max_pacing_rate; - /* - * If the existing send tag is for the wrong interface due to - * a route change, first drop the existing tag. Set the - * CHANGED flag so that we will keep trying to allocate a new - * tag if we fail to allocate one this time. - */ - if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) { - in_pcbdetach_txrtlmt(inp); - inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; - } + error = in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate); - /* - * NOTE: When attaching to a network interface a reference is - * made to ensure the network interface doesn't go away until - * all ratelimit connections are gone. The network interface - * pointers compared below represent valid network interfaces, - * except when comparing towards NULL. 
- */ - if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) { - error = 0; - } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) { - if (inp->inp_snd_tag != NULL) - in_pcbdetach_txrtlmt(inp); - error = 0; - } else if (inp->inp_snd_tag == NULL) { - /* - * In order to utilize packet pacing with RSS, we need - * to wait until there is a valid RSS hash before we - * can proceed: - */ - if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) { - error = EAGAIN; - } else { - error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb), - mb->m_pkthdr.flowid, max_pacing_rate); - } - } else { - error = in_pcbmodify_txrtlmt(inp, max_pacing_rate); - } - if (error == 0 || error == EOPNOTSUPP) - inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; if (did_upgrade) INP_DOWNGRADE(inp); } @@ -3424,4 +3478,14 @@ if (did_upgrade) INP_DOWNGRADE(inp); } + +static void +rl_init(void *st) +{ + rate_limit_active = counter_u64_alloc(M_WAITOK); + rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK); + rate_limit_set_ok = counter_u64_alloc(M_WAITOK); +} + +SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL); #endif /* RATELIMIT */ Index: netinet/tcp_ratelimit.h =================================================================== --- netinet/tcp_ratelimit.h +++ netinet/tcp_ratelimit.h @@ -0,0 +1,137 @@ +/*- + * Copyright (c) 2018-2019 + * Netflix Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
Index: netinet/tcp_ratelimit.c
===================================================================
--- netinet/tcp_ratelimit.c
+++ netinet/tcp_ratelimit.c
@@ -0,0 +1,1197 @@
+/*-
+ * Copyright (c) 2018-2019
+ *	Netflix Inc.
+ *	All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+/**
+ * Author: Randall Stewart
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_tcpdebug.h"
+#include "opt_ratelimit.h"
+/* #include "opt_kern_tls.h" -- when ktls hits the tree */
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#ifdef KERN_TLS
+#include <sys/sockbuf_tls.h>
+#endif
+#include <sys/sysctl.h>
+#include <sys/eventhandler.h>
+#include <sys/mutex.h>
+#include <sys/ck.h>
+#include <net/if.h>
+#include <net/if_var.h>
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#define TCPSTATES		/* for logging */
+#include <netinet/tcp_var.h>
+#ifdef INET6
+#include <netinet6/tcp6_var.h>
+#endif
+#include <netinet/tcp_ratelimit.h>
+#ifndef USECS_IN_SECOND
+#define USECS_IN_SECOND 1000000
+#endif
+/*
+ * For the purposes of each send, what is the size
+ * of an Ethernet frame?
+ */
+#ifndef ETHERNET_SEGMENT_SIZE
+#define ETHERNET_SEGMENT_SIZE 1500
+#endif
+MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory");
+#ifdef RATELIMIT
+
+#define COMMON_RATE 180500
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +/** + * Author: Randall Stewart + */ + +#include +__FBSDID("$FreeBSD$"); +#include "opt_inet.h" +#include "opt_inet6.h" +#include "opt_ipsec.h" +#include "opt_tcpdebug.h" +#include "opt_ratelimit.h" +/* #include "opt_kern_tls.h" -- when ktls hits the tree */ +#include +#include +#include +#include +#include +#include +#ifdef KERN_TLS +#include +#endif +#include +#include +#include +#include +#define TCPSTATES /* for logging */ +#include +#include +#include +#ifdef INET6 +#include +#endif +#include +#ifndef USECS_IN_SECOND +#define USECS_IN_SECOND 1000000 +#endif +/* + * For the purposes of each send, what is the size + * of an ethernet frame. + */ +#ifndef ETHERNET_SEGMENT_SIZE +#define ETHERNET_SEGMENT_SIZE 1500 +#endif +MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory"); +#ifdef RATELIMIT + +#define COMMON_RATE 180500 +uint64_t desired_rates[] = { + 62500, /* 500Kbps */ + 180500, /* 1.44Mpbs */ + 375000, /* 3Mbps */ + 500000, /* 4Mbps */ + 625000, /* 5Mbps */ + 750000, /* 6Mbps */ + 1000000, /* 8Mbps */ + 1250000, /* 10Mbps */ + 2500000, /* 20Mbps */ + 3750000, /* 30Mbps */ + 5000000, /* 40Meg */ + 6250000, /* 50Mbps */ + 12500000, /* 100Mbps */ + 25000000, /* 200Mbps */ + 50000000, /* 400Mbps */ + 100000000, /* 800Mbps */ + 12500, /* 100kbps */ + 25000, /* 200kbps */ + 875000, /* 7Mbps */ + 1125000, /* 9Mbps */ + 1875000, /* 15Mbps */ + 3125000, /* 25Mbps */ + 8125000, /* 65Mbps */ + 10000000, /* 80Mbps */ + 18750000, /* 150Mbps */ + 20000000, /* 250Mbps */ + 37500000, /* 350Mbps */ + 62500000, /* 500Mbps */ + 78125000, /* 625Mbps */ + 125000000, /* 1Gbps */ +}; +#define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t)) +#define RS_ORDERED_COUNT 16 /* + * Number that are in order + * at the beginning of the table, + * over this a sort is required. + */ +#define RS_NEXT_ORDER_GROUP 16 /* + * The point in our table where + * we come fill in a second ordered + * group (index wise means -1). 
+ */ +#define ALL_HARDWARE_RATES 1004 /* + * 1Meg - 1Gig in 1 Meg steps + * plus 100, 200k and 500k and + * 10Gig + */ + +#define RS_ONE_MEGABIT_PERSEC 1000000 +#define RS_ONE_GIGABIT_PERSEC 1000000000 +#define RS_TEN_GIGABIT_PERSEC 10000000000 + +static struct head_tcp_rate_set int_rs; +static struct mtx rs_mtx; +uint32_t rs_number_alive; +uint32_t rs_number_dead; + +SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW, 0, + "TCP Ratelimit stats"); +SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, alive, CTLFLAG_RW, + &rs_number_alive, 0, + "Number of interfaces initialized for ratelimiting"); +SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW, + &rs_number_dead, 0, + "Number of interfaces departing from ratelimiting"); + +static void +rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs) +{ + /* + * Add sysctl entries for thus interface. + */ + SYSCTL_ADD_S32(&rs->sysctl_ctx, + SYSCTL_CHILDREN(rl_sysctl_root), + OID_AUTO, "disable", CTLFLAG_RW, + &rs->rs_disable, 0, + "Disable this interface from new hdwr limiting?"); + SYSCTL_ADD_S32(&rs->sysctl_ctx, + SYSCTL_CHILDREN(rl_sysctl_root), + OID_AUTO, "minseg", CTLFLAG_RW, + &rs->rs_min_seg, 0, + "What is the minimum we need to send on this interface?"); + SYSCTL_ADD_U64(&rs->sysctl_ctx, + SYSCTL_CHILDREN(rl_sysctl_root), + OID_AUTO, "flow_limit", CTLFLAG_RW, + &rs->rs_flow_limit, 0, + "What is the limit for number of flows (0=unlimited)?"); + SYSCTL_ADD_S32(&rs->sysctl_ctx, + SYSCTL_CHILDREN(rl_sysctl_root), + OID_AUTO, "highest", CTLFLAG_RD, + &rs->rs_highest_valid, 0, + "Highest valid rate"); + SYSCTL_ADD_S32(&rs->sysctl_ctx, + SYSCTL_CHILDREN(rl_sysctl_root), + OID_AUTO, "lowest", CTLFLAG_RD, + &rs->rs_lowest_valid, 0, + "Lowest valid rate"); + SYSCTL_ADD_S32(&rs->sysctl_ctx, + SYSCTL_CHILDREN(rl_sysctl_root), + OID_AUTO, "flags", CTLFLAG_RD, + &rs->rs_flags, 0, + "What lags are on the entry?"); + SYSCTL_ADD_S32(&rs->sysctl_ctx, + SYSCTL_CHILDREN(rl_sysctl_root), + OID_AUTO, "numrates", CTLFLAG_RD, + &rs->rs_rate_cnt, 0, + "How many rates re there?"); + SYSCTL_ADD_U64(&rs->sysctl_ctx, + SYSCTL_CHILDREN(rl_sysctl_root), + OID_AUTO, "flows_using", CTLFLAG_RD, + &rs->rs_flows_using, 0, + "How many flows are using this interface now?"); +#ifdef DETAILED_RATELIMIT_SYSCTL + if (rs->rs_rlt && rs->rs_rate_cnt > 0) { + /* Lets display the rates */ + int i; + struct sysctl_oid *rl_rates; + struct sysctl_oid *rl_rate_num; + char rate_num[16]; + rl_rates = SYSCTL_ADD_NODE(&rs->sysctl_ctx, + SYSCTL_CHILDREN(rl_sysctl_root), + OID_AUTO, + "rate", + CTLFLAG_RW, 0, + "Ratelist"); + for( i = 0; i < rs->rs_rate_cnt; i++) { + sprintf(rate_num, "%d", i); + rl_rate_num = SYSCTL_ADD_NODE(&rs->sysctl_ctx, + SYSCTL_CHILDREN(rl_rates), + OID_AUTO, + rate_num, + CTLFLAG_RW, 0, + "Individual Rate"); + SYSCTL_ADD_U32(&rs->sysctl_ctx, + SYSCTL_CHILDREN(rl_rate_num), + OID_AUTO, "flags", CTLFLAG_RD, + &rs->rs_rlt[i].flags, 0, + "Flags on this rate"); + SYSCTL_ADD_U32(&rs->sysctl_ctx, + SYSCTL_CHILDREN(rl_rate_num), + OID_AUTO, "pacetime", CTLFLAG_RD, + &rs->rs_rlt[i].time_between, 0, + "Time hardware inserts between 1500 byte sends"); + SYSCTL_ADD_U64(&rs->sysctl_ctx, + SYSCTL_CHILDREN(rl_rate_num), + OID_AUTO, "rate", CTLFLAG_RD, + &rs->rs_rlt[i].rate, 0, + "Rate in bytes per second"); + } + } +#endif +} + +static void +rs_destroy(epoch_context_t ctx) +{ + struct tcp_rate_set *rs; + + rs = __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx); + mtx_lock(&rs_mtx); + rs->rs_flags &= ~RS_FUNERAL_SCHD; + if (rs->rs_flows_using == 0) { + /* + * 
+
+static void
+rs_destroy(epoch_context_t ctx)
+{
+	struct tcp_rate_set *rs;
+
+	rs = __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx);
+	mtx_lock(&rs_mtx);
+	rs->rs_flags &= ~RS_FUNERAL_SCHD;
+	if (rs->rs_flows_using == 0) {
+		/*
+		 * In theory it's possible (but unlikely)
+		 * that while the delete was occurring
+		 * and we were applying the DEAD flag,
+		 * someone slipped in and found the
+		 * interface in a lookup. While we
+		 * decided rs_flows_using was 0 and
+		 * were scheduling the epoch_call, the other
+		 * thread incremented rs_flows_using. This
+		 * is possible because users hold a pointer and
+		 * we only manipulate rs_flows_using in an
+		 * atomic fashion, i.e. the other entities
+		 * are not locked out. To make sure this did
+		 * not occur, we check rs_flows_using here
+		 * again before deleting.
+		 */
+		sysctl_ctx_free(&rs->sysctl_ctx);
+		free(rs->rs_rlt, M_TCPPACE);
+		free(rs, M_TCPPACE);
+		rs_number_dead--;
+	}
+	mtx_unlock(&rs_mtx);
+}
+
+extern counter_u64_t rate_limit_set_ok;
+extern counter_u64_t rate_limit_active;
+extern counter_u64_t rate_limit_alloc_fail;
+
+static int
+rl_attach_txrtlmt(struct ifnet *ifp,
+    uint32_t flowtype,
+    int flowid,
+    uint64_t cfg_rate,
+    struct m_snd_tag **tag)
+{
+	int error;
+	union if_snd_tag_alloc_params params = {
+		.rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
+		.rate_limit.hdr.flowid = flowid,
+		.rate_limit.hdr.flowtype = flowtype,
+		.rate_limit.max_rate = cfg_rate,
+		.rate_limit.flags = M_NOWAIT,
+	};
+
+	if (ifp->if_snd_tag_alloc == NULL) {
+		error = EOPNOTSUPP;
+	} else {
+		error = ifp->if_snd_tag_alloc(ifp, &params, tag);
+		if (error == 0) {
+			if_ref((*tag)->ifp);
+			counter_u64_add(rate_limit_set_ok, 1);
+			counter_u64_add(rate_limit_active, 1);
+		} else
+			counter_u64_add(rate_limit_alloc_fail, 1);
+	}
+	return (error);
+}
+
+static void
+populate_canned_table(struct tcp_rate_set *rs, const uint64_t *rate_table_act)
+{
+	/*
+	 * The internal table is "special"; it
+	 * is two separate ordered tables that
+	 * must be merged. We get here when the
+	 * adapter specifies a number of rates that
+	 * covers both ranges in the table in some
+	 * form.
+	 */
+	int i, at_low, at_high;
+	uint8_t low_disabled = 0, high_disabled = 0;
+
+	for (i = 0, at_low = 0, at_high = RS_NEXT_ORDER_GROUP; i < rs->rs_rate_cnt; i++) {
+		rs->rs_rlt[i].flags = 0;
+		rs->rs_rlt[i].time_between = 0;
+		if ((low_disabled == 0) &&
+		    (high_disabled ||
+		     (rate_table_act[at_low] < rate_table_act[at_high]))) {
+			rs->rs_rlt[i].rate = rate_table_act[at_low];
+			at_low++;
+			if (at_low == RS_NEXT_ORDER_GROUP)
+				low_disabled = 1;
+		} else if (high_disabled == 0) {
+			rs->rs_rlt[i].rate = rate_table_act[at_high];
+			at_high++;
+			if (at_high == MAX_HDWR_RATES)
+				high_disabled = 1;
+		}
+	}
+}
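To see the merge in action, a worked walk under the assumption of a 20-entry request:

    /*
     * at_low  walks desired_rates[0..15]:  62500, 180500, 375000, ...
     * at_high walks desired_rates[16..29]: 12500, 25000, 875000, ...
     * output:  12500, 25000, 62500, 180500, 375000, 500000,
     *          625000, 750000, 875000, 1000000, ...
     * i.e. the two ordered runs interleave into one sorted table,
     * always taking the smaller of the two current candidates.
     */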
+ */ + return (NULL); + } + rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO); + if (rs == NULL) { + if (error) + *error = ENOMEM; + return (NULL); + } + rl.flags = RT_NOSUPPORT; + ifp->if_ratelimit_query(ifp, &rl); + if ((rl.flags & RT_IS_INDIRECT ) == RT_IS_INDIRECT) { + memset(rs, 0, sizeof(struct tcp_rate_set)); + rs->rs_ifp = ifp; + rs->rs_if_dunit = ifp->if_dunit; + rs->rs_flags = RS_IS_DEFF; + rs_number_alive++; + sysctl_ctx_init(&rs->sysctl_ctx); + rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx, + SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl), + OID_AUTO, + rs->rs_ifp->if_xname, + CTLFLAG_RW, 0, + ""); + CK_LIST_INSERT_HEAD(&int_rs, rs, next); + /* Unlock to allow the sysctl stuff to allocate */ + mtx_unlock(&rs_mtx); + rl_add_syctl_entries(rl_sysctl_root, rs); + /* re-lock for our caller */ + mtx_lock(&rs_mtx); + return (rs); + } else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) { + /* Mellanox most likely */ + rs->rs_ifp = ifp; + rs->rs_if_dunit = ifp->if_dunit; + rs->rs_rate_cnt = rl.number_of_rates; + rs->rs_min_seg = rl.min_segment_burst; + rs->rs_highest_valid = 0; + rs->rs_flow_limit = rl.max_flows; + rs->rs_flags = RS_IS_INTF | RS_NO_PRE; + rs->rs_disable = 0; + rate_table_act = rl.rate_table; + } else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) { + /* Chelsio */ + rs->rs_ifp = ifp; + rs->rs_if_dunit = ifp->if_dunit; + rs->rs_rate_cnt = rl.number_of_rates; + rs->rs_min_seg = rl.min_segment_burst; + rs->rs_disable = 0; + rs->rs_flow_limit = rl.max_flows; + rate_table_act = desired_rates; + if ((rs->rs_rate_cnt > MAX_HDWR_RATES) && + (rs->rs_rate_cnt < ALL_HARDWARE_RATES)) { + /* + * Our desired table is not big + * enough, do what we can. + */ + rs->rs_rate_cnt = MAX_HDWR_RATES; + } + if (rs->rs_rate_cnt <= RS_ORDERED_COUNT) + rs->rs_flags = RS_IS_INTF; + else + rs->rs_flags = RS_IS_INTF | RS_INT_TBL; + if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) + rs->rs_rate_cnt = ALL_HARDWARE_RATES; + } else { + printf("Interface:%s unit:%d not one known to have rate-limits\n", + ifp->if_dname, + ifp->if_dunit); + free(rs, M_TCPPACE); + return (NULL); + } + sz = sizeof(struct tcp_hwrate_limit_table) * rs->rs_rate_cnt; + rs->rs_rlt = malloc(sz, M_TCPPACE, M_NOWAIT); + if (rs->rs_rlt == NULL) { + if (error) + *error = ENOMEM; +bail: + free(rs, M_TCPPACE); + return (NULL); + } + if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) { + /* + * The interface supports all + * the rates we could possibly want. + */ + uint64_t rat; + + rs->rs_rlt[0].rate = 12500; /* 100k */ + rs->rs_rlt[1].rate = 25000; /* 200k */ + rs->rs_rlt[2].rate = 62500; /* 500k */ + /* Note 125000 == 1Megabit + * populate 1Meg - 1000meg. + */ + for(i = 3, rat = 125000; i< (ALL_HARDWARE_RATES-1); i++) { + rs->rs_rlt[i].rate = rat; + rat += 125000; + } + rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate = 1250000000; + } else if (rs->rs_flags & RS_INT_TBL) { + /* We populate this in a special way */ + populate_canned_table(rs, rate_table_act); + } else { + /* + * Just copy in the rates from + * the table, it is in order. + */ + for (i=0; irs_rate_cnt; i++) { + rs->rs_rlt[i].rate = rate_table_act[i]; + rs->rs_rlt[i].time_between = 0; + rs->rs_rlt[i].flags = 0; + } + } + for (i = (rs->rs_rate_cnt - 1); i >= 0; i--) { + /* + * We go backwards through the list so that if we can't get + * a rate and fail to init one, we have at least a chance of + * getting the highest one. + */ + rs->rs_rlt[i].ptbl = rs; + rs->rs_rlt[i].tag = NULL; + /* + * Calculate the time between. 
+ */ + lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND; + res = lentim / rs->rs_rlt[i].rate; + if (res > 0) + rs->rs_rlt[i].time_between = res; + else + rs->rs_rlt[i].time_between = 1; + if (rs->rs_flags & RS_NO_PRE) { + rs->rs_rlt[i].flags = HDWRPACE_INITED; + rs->rs_lowest_valid = i; + } else { + int err; +#ifdef RSS + hash_type = M_HASHTYPE_RSS_TCP_IPV4; +#else + hash_type = M_HASHTYPE_OPAQUE_HASH; +#endif + err = rl_attach_txrtlmt(ifp, + hash_type, + (i + 1), + rs->rs_rlt[i].rate, + &rs->rs_rlt[i].tag); + if (err) { + if (i == (rs->rs_rate_cnt - 1)) { + /* + * Huh - first rate and we can't get + * it? + */ + free(rs->rs_rlt, M_TCPPACE); + if (error) + *error = err; + goto bail; + } else { + if (error) + *error = err; + } + break; + } else { + rs->rs_rlt[i].flags = HDWRPACE_INITED | HDWRPACE_TAGPRESENT; + rs->rs_lowest_valid = i; + } + } + } + /* Did we get at least 1 rate? */ + if (rs->rs_rlt[(rs->rs_rate_cnt - 1)].flags & HDWRPACE_INITED) + rs->rs_highest_valid = rs->rs_rate_cnt - 1; + else { + free(rs->rs_rlt, M_TCPPACE); + goto bail; + } + rs_number_alive++; + CK_LIST_INSERT_HEAD(&int_rs, rs, next); + sysctl_ctx_init(&rs->sysctl_ctx); + rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx, + SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl), + OID_AUTO, + rs->rs_ifp->if_xname, + CTLFLAG_RW, 0, + ""); + /* Unlock to allow the sysctl stuff to allocate */ + mtx_unlock(&rs_mtx); + rl_add_syctl_entries(rl_sysctl_root, rs); + /* re-lock for our caller */ + mtx_lock(&rs_mtx); + return (rs); +} + +static const struct tcp_hwrate_limit_table * +tcp_int_find_suitable_rate(const struct tcp_rate_set *rs, + uint64_t bytes_per_sec, uint32_t flags) +{ + struct tcp_hwrate_limit_table *arte = NULL, *rte = NULL; + uint64_t mbits_per_sec, ind_calc; + int i; + + mbits_per_sec = (bytes_per_sec * 8); + if (flags & RS_PACING_LT) { + if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) && + (rs->rs_lowest_valid <= 2)){ + /* + * Smaller than 1Meg, only + * 3 entries can match it. + */ + for(i = rs->rs_lowest_valid; i < 3; i++) { + if (bytes_per_sec <= rs->rs_rlt[i].rate) { + rte = &rs->rs_rlt[i]; + break; + } else if (rs->rs_rlt[i].flags & HDWRPACE_INITED) { + arte = &rs->rs_rlt[i]; + } + } + goto done; + } else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) && + (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){ + /* + * Larger than 1G (the majority of + * our table. + */ + if (mbits_per_sec < RS_TEN_GIGABIT_PERSEC) + rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; + else + arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; + goto done; + } + /* + * If we reach here its in our table (between 1Meg - 1000Meg), + * just take the rounded down mbits per second, and add + * 1Megabit to it, from this we can calculate + * the index in the table. 
+ */ + ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC; + if ((ind_calc * RS_ONE_MEGABIT_PERSEC) != mbits_per_sec) + ind_calc++; + /* our table is offset by 3, we add 2 */ + ind_calc += 2; + if (ind_calc > (ALL_HARDWARE_RATES-1)) { + /* This should not happen */ + ind_calc = ALL_HARDWARE_RATES-1; + } + if ((ind_calc >= rs->rs_lowest_valid) && + (ind_calc <= rs->rs_highest_valid)) + rte = &rs->rs_rlt[ind_calc]; + } else if (flags & RS_PACING_EXACT_MATCH) { + if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) && + (rs->rs_lowest_valid <= 2)){ + for(i = rs->rs_lowest_valid; i < 3; i++) { + if (bytes_per_sec == rs->rs_rlt[i].rate) { + rte = &rs->rs_rlt[i]; + break; + } + } + } else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) && + (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) { + /* > 1Gbps only one rate */ + if (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) { + /* Its 10G wow */ + rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; + } + } else { + /* Ok it must be a exact meg (its between 1G and 1Meg) */ + ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC; + if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) { + /* its an exact Mbps */ + ind_calc += 2; + if (ind_calc > (ALL_HARDWARE_RATES-1)) { + /* This should not happen */ + ind_calc = ALL_HARDWARE_RATES-1; + } + if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED) + rte = &rs->rs_rlt[ind_calc]; + } + } + } else { + /* we want greater than the requested rate */ + if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) && + (rs->rs_lowest_valid <= 2)){ + arte = &rs->rs_rlt[3]; /* set alternate to 1Meg */ + for (i=2; i>=rs->rs_lowest_valid; i--) { + if (bytes_per_sec < rs->rs_rlt[i].rate) { + rte = &rs->rs_rlt[i]; + break; + } else if ((flags & RS_PACING_GEQ) && + (bytes_per_sec == rs->rs_rlt[i].rate)) { + rte = &rs->rs_rlt[i]; + break; + } else { + arte = &rs->rs_rlt[i]; /* new alternate */ + } + } + } else if (mbits_per_sec > RS_ONE_GIGABIT_PERSEC) { + if ((bytes_per_sec < rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) && + (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){ + /* Our top rate is larger than the request */ + rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; + } else if ((flags & RS_PACING_GEQ) && + (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) && + (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) { + /* It matches our top rate */ + rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; + } else if (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED) { + /* The top rate is an alternative */ + arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; + } + } else { + /* Its in our range 1Meg - 1Gig */ + if (flags & RS_PACING_GEQ) { + ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC; + if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) { + if (ind_calc > (ALL_HARDWARE_RATES-1)) { + /* This should not happen */ + ind_calc = (ALL_HARDWARE_RATES-1); + } + rte = &rs->rs_rlt[ind_calc]; + } + goto done; + } + ind_calc = (mbits_per_sec + (RS_ONE_MEGABIT_PERSEC-1))/RS_ONE_MEGABIT_PERSEC; + ind_calc += 2; + if (ind_calc > (ALL_HARDWARE_RATES-1)) { + /* This should not happen */ + ind_calc = ALL_HARDWARE_RATES-1; + } + if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED) + rte = &rs->rs_rlt[ind_calc]; + } + } +done: + if ((rte == NULL) && + (arte != NULL) && + (flags & RS_PACING_SUB_OK)) { + /* We can use the substitute */ + rte = arte; + } + return (rte); +} + +static const struct tcp_hwrate_limit_table * +tcp_find_suitable_rate(const struct tcp_rate_set *rs, uint64_t bytes_per_sec, uint32_t flags) +{ + /** + * Hunt the rate table 
+
+static struct ifnet *
+rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error)
+{
+	struct ifnet *tifp;
+	struct m_snd_tag *tag;
+	union if_snd_tag_alloc_params params = {
+		.rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
+		.rate_limit.hdr.flowid = 1,
+		.rate_limit.max_rate = COMMON_RATE,
+		.rate_limit.flags = M_NOWAIT,
+	};
+	int err;
+#ifdef RSS
+	params.rate_limit.hdr.flowtype = ((inp->inp_vflag & INP_IPV6) ?
+	    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4);
+#else
+	params.rate_limit.hdr.flowtype = M_HASHTYPE_OPAQUE_HASH;
+#endif
+	tag = NULL;
+	if (ifp->if_snd_tag_alloc == NULL) {
+		if (error)
+			*error = ENODEV;
+		return (NULL);
+	}
+	err = ifp->if_snd_tag_alloc(ifp, &params, &tag);
+	if (err) {
+		/* Failed to setup a tag? */
+		if (error)
+			*error = err;
+		return (NULL);
+	}
+	tifp = tag->ifp;
+	tifp->if_snd_tag_free(tag);
+	return (tifp);
+}
+
+static const struct tcp_hwrate_limit_table *
+rt_setup_rate(struct inpcb *inp, struct ifnet *ifp, uint64_t bytes_per_sec,
+    uint32_t flags, int *error)
+{
+	/* First let's find the interface if it exists */
+	const struct tcp_hwrate_limit_table *rte;
+	struct tcp_rate_set *rs;
+	struct epoch_tracker et;
+	int err;
+
+	epoch_enter_preempt(net_epoch_preempt, &et);
+use_real_interface:
+	CK_LIST_FOREACH(rs, &int_rs, next) {
+		/*
+		 * Note we don't look with the lock since we either see a
+		 * new entry or will get one when we try to add it.
+		 */
+		if (rs->rs_flags & RS_IS_DEAD) {
+			/* The dead are not looked at */
+			continue;
+		}
+		if ((rs->rs_ifp == ifp) &&
+		    (rs->rs_if_dunit == ifp->if_dunit)) {
+			/* Ok we found it */
+			break;
+		}
+	}
+	if ((rs == NULL) || (rs->rs_flags & RS_IS_DEAD)) {
+		/*
+		 * This means we either got a packet *before*
+		 * the IF-UP was processed below, or after we
+		 * already received an interface-departed event.
+		 * In either case we really don't want to do
+		 * anything with pacing; in the departing case
+		 * the packet is not going to go very far. The
+		 * new case might be arguable, but it's impossible
+		 * to tell apart from the departing case.
+		 */
+		if (error)
+			*error = ENODEV;
+		epoch_exit_preempt(net_epoch_preempt, &et);
+		return (NULL);
+	}
+
+	if (rs->rs_disable != 0) {
+		if (error)
+			*error = ENOSPC;
+		epoch_exit_preempt(net_epoch_preempt, &et);
+		return (NULL);
+	}
+	if (rs->rs_flags & RS_IS_DEFF) {
+		/* We need to find the real interface */
+		struct ifnet *tifp;
+
+		tifp = rt_find_real_interface(ifp, inp, error);
+		if (tifp == NULL) {
+			if (rs->rs_disable && error)
+				*error = ENOTSUP;
+			epoch_exit_preempt(net_epoch_preempt, &et);
+			return (NULL);
+		}
+		ifp = tifp;
+		goto use_real_interface;
+	}
+	if (rs->rs_flow_limit &&
+	    ((rs->rs_flows_using + 1) > rs->rs_flow_limit)) {
+		if (error)
+			*error = ENOSPC;
+		epoch_exit_preempt(net_epoch_preempt, &et);
+		return (NULL);
+	}
+	rte = tcp_find_suitable_rate(rs, bytes_per_sec, flags);
+	if (rte) {
+		err = in_pcbattach_txrtlmt(inp, rs->rs_ifp,
+		    inp->inp_flowtype,
+		    inp->inp_flowid,
+		    rte->rate,
+		    &inp->inp_snd_tag);
+		if (err) {
+			/* Failed to attach */
+			if (error)
+				*error = err;
+			rte = NULL;
+		}
+	}
+	if (rte) {
+		/*
+		 * We use an atomic here for accounting so we don't have to
+		 * use locks when freeing.
+		 */
+		atomic_add_long(&rs->rs_flows_using, 1);
+	}
+	epoch_exit_preempt(net_epoch_preempt, &et);
+	return (rte);
+}
+
+static void
+tcp_rl_ifnet_link(void *arg __unused, struct ifnet *ifp, int link_state)
+{
+	int error;
+	struct tcp_rate_set *rs;
+
+	if (((ifp->if_capabilities & IFCAP_TXRTLMT) == 0) ||
+	    (link_state != LINK_STATE_UP)) {
+		/*
+		 * We only care about an interface going up that is
+		 * rate-limit capable.
+		 */
+		return;
+	}
+	mtx_lock(&rs_mtx);
+	CK_LIST_FOREACH(rs, &int_rs, next) {
+		if ((rs->rs_ifp == ifp) &&
+		    (rs->rs_if_dunit == ifp->if_dunit)) {
+			/* We already have initialized this guy */
+			mtx_unlock(&rs_mtx);
+			return;
+		}
+	}
+	rt_setup_new_rs(ifp, &error);
+	mtx_unlock(&rs_mtx);
+}
+ */ + return; + } + mtx_lock(&rs_mtx); + CK_LIST_FOREACH(rs, &int_rs, next) { + if ((rs->rs_ifp == ifp) && + (rs->rs_if_dunit == ifp->if_dunit)) { + /* We already have initialized this guy */ + mtx_unlock(&rs_mtx); + return; + } + } + rt_setup_new_rs(ifp, &error); + mtx_unlock(&rs_mtx); +} + +static void +tcp_rl_ifnet_departure(void *arg __unused, struct ifnet *ifp) +{ + struct tcp_rate_set *rs, *nrs; + struct ifnet *tifp; + int i; + + mtx_lock(&rs_mtx); + CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) { + if ((rs->rs_ifp == ifp) && + (rs->rs_if_dunit == ifp->if_dunit)) { + CK_LIST_REMOVE(rs, next); + rs_number_alive--; + rs_number_dead++; + rs->rs_flags |= RS_IS_DEAD; + for (i = 0; i < rs->rs_rate_cnt; i++) { + if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) { + tifp = rs->rs_rlt[i].tag->ifp; + in_pcbdetach_tag(tifp, rs->rs_rlt[i].tag); + rs->rs_rlt[i].tag = NULL; + } + rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED; + } + if (rs->rs_flows_using == 0) { + /* + * No references left, so we can schedule the + * destruction after the epoch (with a caveat). + */ + rs->rs_flags |= RS_FUNERAL_SCHD; + epoch_call(net_epoch, &rs->rs_epoch_ctx, rs_destroy); + } + break; + } + } + mtx_unlock(&rs_mtx); +} + +static void +tcp_rl_shutdown(void *arg __unused, int howto __unused) +{ + struct tcp_rate_set *rs, *nrs; + struct ifnet *tifp; + int i; + + mtx_lock(&rs_mtx); + CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) { + CK_LIST_REMOVE(rs, next); + rs_number_alive--; + rs_number_dead++; + rs->rs_flags |= RS_IS_DEAD; + for (i = 0; i < rs->rs_rate_cnt; i++) { + if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) { + tifp = rs->rs_rlt[i].tag->ifp; + in_pcbdetach_tag(tifp, rs->rs_rlt[i].tag); + rs->rs_rlt[i].tag = NULL; + } + rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED; + } + if (rs->rs_flows_using != 0) { + /* + * We dont hold a reference + * so we have nothing left to + * do. + */ + } else { + /* + * No references left, so we can destroy it + * after the epoch. + */ + rs->rs_flags |= RS_FUNERAL_SCHD; + epoch_call(net_epoch, &rs->rs_epoch_ctx, rs_destroy); + } + } + mtx_unlock(&rs_mtx); +} + +const struct tcp_hwrate_limit_table * +tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp, + uint64_t bytes_per_sec, int flags, int *error) +{ + const struct tcp_hwrate_limit_table *rte; + + if (tp->t_inpcb->inp_snd_tag == NULL) { + /* + * We are setting up a rate for the first time. + */ + if ((ifp->if_capabilities & IFCAP_TXRTLMT) == 0) { + /* Not supported by the egress */ + if (error) + *error = ENODEV; + return (NULL); + } +#ifdef KERN_TLS + if (tp->t_inpcb->inp_socket->so_snd.sb_tls_flags & SB_TLS_IFNET) { + /* + * We currently can't do both TLS and hardware + * pacing + */ + if (error) + *error = EINVAL; + return (NULL); + } +#endif + rte = rt_setup_rate(tp->t_inpcb, ifp, bytes_per_sec, flags, error); + } else { + /* + * We are modifying a rate, wrong interface? 
+ */ + if (error) + *error = EINVAL; + rte = NULL; + } + return (rte); +} + +const struct tcp_hwrate_limit_table * +tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte, + struct tcpcb *tp, struct ifnet *ifp, + uint64_t bytes_per_sec, int flags, int *error) +{ + const struct tcp_hwrate_limit_table *nrte; + const struct tcp_rate_set *rs; + int is_indirect = 0; + int err; + + + if ((tp->t_inpcb->inp_snd_tag == NULL) || + (crte == NULL)) { + /* Wrong interface */ + if (error) + *error = EINVAL; + return (NULL); + } + rs = crte->ptbl; + if ((rs->rs_flags & RS_IS_DEAD) || + (crte->flags & HDWRPACE_IFPDEPARTED)) { + /* Release the rate, and try anew */ +re_rate: + tcp_rel_pacing_rate(crte, tp); + nrte = tcp_set_pacing_rate(tp, ifp, + bytes_per_sec, flags, error); + return (nrte); + } + if ((rs->rs_flags & RT_IS_INDIRECT ) == RT_IS_INDIRECT) + is_indirect = 1; + else + is_indirect = 0; + if ((is_indirect == 0) && + ((ifp != rs->rs_ifp) || + (ifp->if_dunit != rs->rs_if_dunit))) { + /* + * Something changed, the user is not pointing to the same + * ifp? Maybe a route updated on this guy? + */ + goto re_rate; + } else if (is_indirect) { + /* + * For indirect we have to dig in and find the real interface. + */ + struct ifnet *rifp; + + rifp = rt_find_real_interface(ifp, tp->t_inpcb, error); + if (rifp == NULL) { + /* Can't find it? */ + goto re_rate; + } + if ((rifp != rs->rs_ifp) || + (ifp->if_dunit != rs->rs_if_dunit)) { + goto re_rate; + } + } + nrte = tcp_find_suitable_rate(rs, bytes_per_sec, flags); + if (nrte == crte) { + /* No change */ + if (error) + *error = 0; + return (crte); + } + if (nrte == NULL) { + /* Release the old rate */ + tcp_rel_pacing_rate(crte, tp); + return (NULL); + } + /* Change rates to our new entry */ + err = in_pcbmodify_txrtlmt(tp->t_inpcb, nrte->rate); + if (err) { + if (error) + *error = err; + return (NULL); + } + if (error) + *error = 0; + return (nrte); +} + +void +tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte, struct tcpcb *tp) +{ + const struct tcp_rate_set *crs; + struct tcp_rate_set *rs; + uint64_t pre; + + crs = crte->ptbl; + /* + * Now we must break the const + * in order to release our refcount. + */ + rs = __DECONST(struct tcp_rate_set *, crs); + pre = atomic_fetchadd_long(&rs->rs_flows_using, -1); + if (pre == 1) { + mtx_lock(&rs_mtx); + /* + * Is it dead? + */ + if ((rs->rs_flags & RS_IS_DEAD) && + ((rs->rs_flags & RS_FUNERAL_SCHD) == 0)){ + /* + * We were the last, + * and a funeral is not pending, so + * we must schedule it. + */ + rs->rs_flags |= RS_FUNERAL_SCHD; + epoch_call(net_epoch, &rs->rs_epoch_ctx, rs_destroy); + } + mtx_unlock(&rs_mtx); + } + in_pcbdetach_txrtlmt(tp->t_inpcb); +} + +static eventhandler_tag rl_ifnet_departs; +static eventhandler_tag rl_ifnet_arrives; +static eventhandler_tag rl_shutdown_start; + +static void +tcp_rs_init(void *st __unused) +{ + CK_LIST_INIT(&int_rs); + rs_number_alive = 0; + rs_number_dead = 0;; + mtx_init(&rs_mtx, "tcp_rs_mtx", "rsmtx", MTX_DEF); + rl_ifnet_departs = EVENTHANDLER_REGISTER(ifnet_departure_event, + tcp_rl_ifnet_departure, + NULL, EVENTHANDLER_PRI_ANY); + rl_ifnet_arrives = EVENTHANDLER_REGISTER(ifnet_link_event, + tcp_rl_ifnet_link, + NULL, EVENTHANDLER_PRI_ANY); + rl_shutdown_start = EVENTHANDLER_REGISTER(shutdown_pre_sync, + tcp_rl_shutdown, NULL, + SHUTDOWN_PRI_FIRST); + printf("TCP_ratelimit: Is now initialized\n"); +} + +SYSINIT(tcp_rl_init, SI_SUB_SMP + 1, SI_ORDER_ANY, tcp_rs_init, NULL); +#endif