Changeset View
Changeset View
Standalone View
Standalone View
sys/netinet/tcp_ratelimit.c
Context not available. | |||||
#ifdef INET6 | #ifdef INET6 | ||||
#include <netinet6/tcp6_var.h> | #include <netinet6/tcp6_var.h> | ||||
#endif | #endif | ||||
#include <netinet/tcp_hpts.h> | |||||
#include <netinet/tcp_log_buf.h> | |||||
#include <netinet/tcp_ratelimit.h> | #include <netinet/tcp_ratelimit.h> | ||||
#ifndef USECS_IN_SECOND | #ifndef USECS_IN_SECOND | ||||
#define USECS_IN_SECOND 1000000 | #define USECS_IN_SECOND 1000000 | ||||
Context not available. | |||||
180500, /* 1.44Mpbs - rate 2 common rate */ | 180500, /* 1.44Mpbs - rate 2 common rate */ | ||||
375000, /* 3Mbps - rate 3 */ | 375000, /* 3Mbps - rate 3 */ | ||||
625000, /* 5Mbps - rate 4 */ | 625000, /* 5Mbps - rate 4 */ | ||||
875000, /* 7Mbps - rate 5 */ | 1250000, /* 10Mbps - rate 5 */ | ||||
1125000, /* 9Mbps - rate 6 */ | 1875000, /* 15Mbps - rate 6 */ | ||||
1375000, /* 11Mbps - rate 7 */ | 2500000, /* 20Mbps - rate 7 */ | ||||
1625000, /* 13Mbps - rate 8 */ | 3125000, /* 25Mbps - rate 8 */ | ||||
2625000, /* 21Mbps - rate 9 */ | 3750000, /* 30Mbps - rate 9 */ | ||||
3875000, /* 31Mbps - rate 10 */ | 4375000, /* 35Mbps - rate 10 */ | ||||
5125000, /* 41Meg - rate 11 */ | 5000000, /* 40Meg - rate 11 */ | ||||
12500000, /* 100Mbps - rate 12 */ | 6250000, /* 50Mbps - rate 12 */ | ||||
25000000, /* 200Mbps - rate 13 */ | 12500000, /* 100Mbps - rate 13 */ | ||||
50000000, /* 400Mbps - rate 14 */ | 25000000, /* 200Mbps - rate 14 */ | ||||
63750000, /* 51Mbps - rate 15 */ | 50000000, /* 400Mbps - rate 15 */ | ||||
100000000, /* 800Mbps - rate 16 */ | 100000000, /* 800Mbps - rate 16 */ | ||||
1875000, /* 15Mbps - rate 17 */ | 5625000, /* 45Mbps - rate 17 */ | ||||
2125000, /* 17Mbps - rate 18 */ | 6875000, /* 55Mbps - rate 19 */ | ||||
2375000, /* 19Mbps - rate 19 */ | 7500000, /* 60Mbps - rate 20 */ | ||||
2875000, /* 23Mbps - rate 20 */ | 8125000, /* 65Mbps - rate 21 */ | ||||
3125000, /* 25Mbps - rate 21 */ | 8750000, /* 70Mbps - rate 22 */ | ||||
3375000, /* 27Mbps - rate 22 */ | 9375000, /* 75Mbps - rate 23 */ | ||||
3625000, /* 29Mbps - rate 23 */ | 10000000, /* 80Mbps - rate 24 */ | ||||
4125000, /* 33Mbps - rate 24 */ | 10625000, /* 85Mbps - rate 25 */ | ||||
4375000, /* 35Mbps - rate 25 */ | 11250000, /* 90Mbps - rate 26 */ | ||||
4625000, /* 37Mbps - rate 26 */ | 11875000, /* 95Mbps - rate 27 */ | ||||
4875000, /* 39Mbps - rate 27 */ | 12500000, /* 100Mbps - rate 28 */ | ||||
5375000, /* 43Mbps - rate 28 */ | 13750000, /* 110Mbps - rate 29 */ | ||||
5625000, /* 45Mbps - rate 29 */ | 15000000, /* 120Mbps - rate 30 */ | ||||
5875000, /* 47Mbps - rate 30 */ | 16250000, /* 130Mbps - rate 31 */ | ||||
6125000, /* 49Mbps - rate 31 */ | 17500000, /* 140Mbps - rate 32 */ | ||||
6625000, /* 53Mbps - rate 32 */ | 18750000, /* 150Mbps - rate 33 */ | ||||
6875000, /* 55Mbps - rate 33 */ | 20000000, /* 160Mbps - rate 34 */ | ||||
7125000, /* 57Mbps - rate 34 */ | 21250000, /* 170Mbps - rate 35 */ | ||||
7375000, /* 59Mbps - rate 35 */ | 22500000, /* 180Mbps - rate 36 */ | ||||
7625000, /* 61Mbps - rate 36 */ | 23750000, /* 190Mbps - rate 37 */ | ||||
7875000, /* 63Mbps - rate 37 */ | 26250000, /* 210Mbps - rate 38 */ | ||||
8125000, /* 65Mbps - rate 38 */ | 27500000, /* 220Mbps - rate 39 */ | ||||
8375000, /* 67Mbps - rate 39 */ | 28750000, /* 230Mbps - rate 40 */ | ||||
8625000, /* 69Mbps - rate 40 */ | 30000000, /* 240Mbps - rate 41 */ | ||||
8875000, /* 71Mbps - rate 41 */ | 31250000, /* 250Mbps - rate 42 */ | ||||
9125000, /* 73Mbps - rate 42 */ | 34375000, /* 275Mbps - rate 43 */ | ||||
9375000, /* 75Mbps - rate 43 */ | 37500000, /* 300Mbps - rate 44 */ | ||||
9625000, /* 77Mbps - rate 44 */ | 40625000, /* 325Mbps - rate 45 */ | ||||
9875000, /* 79Mbps - rate 45 */ | 43750000, /* 350Mbps - rate 46 */ | ||||
10125000, /* 81Mbps - rate 46 */ | 46875000, /* 375Mbps - rate 47 */ | ||||
10375000, /* 83Mbps - rate 47 */ | 53125000, /* 425Mbps - rate 48 */ | ||||
10625000, /* 85Mbps - rate 48 */ | 56250000, /* 450Mbps - rate 49 */ | ||||
10875000, /* 87Mbps - rate 49 */ | 59375000, /* 475Mbps - rate 50 */ | ||||
11125000, /* 89Mbps - rate 50 */ | 62500000, /* 500Mbps - rate 51 */ | ||||
11375000, /* 91Mbps - rate 51 */ | 68750000, /* 550Mbps - rate 52 */ | ||||
11625000, /* 93Mbps - rate 52 */ | 75000000, /* 600Mbps - rate 53 */ | ||||
11875000, /* 95Mbps - rate 53 */ | 81250000, /* 650Mbps - rate 54 */ | ||||
13125000, /* 105Mbps - rate 54 */ | 87500000, /* 700Mbps - rate 55 */ | ||||
13750000, /* 110Mbps - rate 55 */ | 93750000, /* 750Mbps - rate 56 */ | ||||
14375000, /* 115Mbps - rate 56 */ | 106250000, /* 850Mbps - rate 57 */ | ||||
15000000, /* 120Mbps - rate 57 */ | 112500000, /* 900Mbps - rate 58 */ | ||||
15625000, /* 125Mbps - rate 58 */ | 125000000, /* 1Gbps - rate 59 */ | ||||
16250000, /* 130Mbps - rate 59 */ | 156250000, /* 1.25Gps - rate 60 */ | ||||
16875000, /* 135Mbps - rate 60 */ | 187500000, /* 1.5Gps - rate 61 */ | ||||
17500000, /* 140Mbps - rate 61 */ | 218750000, /* 1.75Gps - rate 62 */ | ||||
18125000, /* 145Mbps - rate 62 */ | 250000000, /* 2Gbps - rate 63 */ | ||||
18750000, /* 150Mbps - rate 64 */ | 281250000, /* 2.25Gps - rate 64 */ | ||||
20000000, /* 160Mbps - rate 65 */ | 312500000, /* 2.5Gbps - rate 65 */ | ||||
21250000, /* 170Mbps - rate 66 */ | 343750000, /* 2.75Gbps - rate 66 */ | ||||
22500000, /* 180Mbps - rate 67 */ | 375000000, /* 3Gbps - rate 67 */ | ||||
23750000, /* 190Mbps - rate 68 */ | 500000000, /* 4Gbps - rate 68 */ | ||||
26250000, /* 210Mbps - rate 69 */ | 625000000, /* 5Gbps - rate 69 */ | ||||
27500000, /* 220Mbps - rate 70 */ | 750000000, /* 6Gbps - rate 70 */ | ||||
28750000, /* 230Mbps - rate 71 */ | 875000000, /* 7Gbps - rate 71 */ | ||||
30000000, /* 240Mbps - rate 72 */ | 1000000000, /* 8Gbps - rate 72 */ | ||||
31250000, /* 250Mbps - rate 73 */ | 1125000000, /* 9Gbps - rate 73 */ | ||||
34375000, /* 275Mbps - rate 74 */ | 1250000000, /* 10Gbps - rate 74 */ | ||||
37500000, /* 300Mbps - rate 75 */ | 1875000000, /* 15Gbps - rate 75 */ | ||||
40625000, /* 325Mbps - rate 76 */ | 2500000000 /* 20Gbps - rate 76 */ | ||||
43750000, /* 350Mbps - rate 77 */ | |||||
46875000, /* 375Mbps - rate 78 */ | |||||
53125000, /* 425Mbps - rate 79 */ | |||||
56250000, /* 450Mbps - rate 80 */ | |||||
59375000, /* 475Mbps - rate 81 */ | |||||
62500000, /* 500Mbps - rate 82 */ | |||||
68750000, /* 550Mbps - rate 83 */ | |||||
75000000, /* 600Mbps - rate 84 */ | |||||
81250000, /* 650Mbps - rate 85 */ | |||||
87500000, /* 700Mbps - rate 86 */ | |||||
93750000, /* 750Mbps - rate 87 */ | |||||
106250000, /* 850Mbps - rate 88 */ | |||||
112500000, /* 900Mbps - rate 89 */ | |||||
125000000, /* 1Gbps - rate 90 */ | |||||
156250000, /* 1.25Gps - rate 91 */ | |||||
187500000, /* 1.5Gps - rate 92 */ | |||||
218750000, /* 1.75Gps - rate 93 */ | |||||
250000000, /* 2Gbps - rate 94 */ | |||||
281250000, /* 2.25Gps - rate 95 */ | |||||
312500000, /* 2.5Gbps - rate 96 */ | |||||
343750000, /* 2.75Gbps - rate 97 */ | |||||
375000000, /* 3Gbps - rate 98 */ | |||||
500000000, /* 4Gbps - rate 99 */ | |||||
625000000, /* 5Gbps - rate 100 */ | |||||
750000000, /* 6Gbps - rate 101 */ | |||||
875000000, /* 7Gbps - rate 102 */ | |||||
1000000000, /* 8Gbps - rate 103 */ | |||||
1125000000, /* 9Gbps - rate 104 */ | |||||
1250000000, /* 10Gbps - rate 105 */ | |||||
1875000000, /* 15Gbps - rate 106 */ | |||||
2500000000 /* 20Gbps - rate 107 */ | |||||
}; | }; | ||||
#define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t)) | #define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t)) | ||||
Context not available. | |||||
static struct mtx rs_mtx; | static struct mtx rs_mtx; | ||||
uint32_t rs_number_alive; | uint32_t rs_number_alive; | ||||
uint32_t rs_number_dead; | uint32_t rs_number_dead; | ||||
/*
 * Pacing tunables. Each one is exported read/write under net.inet.tcp.rl
 * by the SYSCTL_UINT() registrations that follow.
 */
/* MSS count that overrides the normal minimums; 0 means don't enforce. */
static uint32_t rs_floor_mss = 0;
/* Floor on the wait at the end: 8000 is 8 ms, so the unit is microseconds. */
static uint32_t wait_time_floor = 8000;
/* How many time blocks are we willing to wait */
static uint32_t num_of_waits_allowed = 1;
/* Minimum number of MSS for hardware pacing. */
static uint32_t rs_hw_floor_mss = 16;
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, | SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, | ||||
"TCP Ratelimit stats"); | "TCP Ratelimit stats"); | ||||
Context not available. | |||||
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW, | SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW, | ||||
&rs_number_dead, 0, | &rs_number_dead, 0, | ||||
"Number of interfaces departing from ratelimiting"); | "Number of interfaces departing from ratelimiting"); | ||||
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, floor_mss, CTLFLAG_RW, | |||||
&rs_floor_mss, 0, | |||||
"Number of MSS that will override the normal minimums (0 means don't enforce)"); | |||||
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, wait_floor, CTLFLAG_RW, | |||||
&wait_time_floor, 2000, | |||||
"Has b/w increases what is the wait floor we are willing to wait at the end?"); | |||||
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, time_blocks, CTLFLAG_RW, | |||||
&num_of_waits_allowed, 1, | |||||
"How many time blocks on the end should software pacing be willing to wait?"); | |||||
SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, hw_floor_mss, CTLFLAG_RW, | |||||
&rs_hw_floor_mss, 16, | |||||
"Number of mss that are a minum for hardware pacing?"); | |||||
static void | static void | ||||
rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs) | rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs) | ||||
Context not available. | |||||
OID_AUTO, "rate", CTLFLAG_RD, | OID_AUTO, "rate", CTLFLAG_RD, | ||||
&rs->rs_rlt[i].rate, 0, | &rs->rs_rlt[i].rate, 0, | ||||
"Rate in bytes per second"); | "Rate in bytes per second"); | ||||
SYSCTL_ADD_U64(&rs->sysctl_ctx, | |||||
SYSCTL_CHILDREN(rl_rate_num), | |||||
OID_AUTO, "using", CTLFLAG_RD, | |||||
&rs->rs_rlt[i].using, 0, | |||||
"Number of flows using"); | |||||
SYSCTL_ADD_U64(&rs->sysctl_ctx, | |||||
SYSCTL_CHILDREN(rl_rate_num), | |||||
OID_AUTO, "enobufs", CTLFLAG_RD, | |||||
&rs->rs_rlt[i].rs_num_enobufs, 0, | |||||
"Number of enobufs logged on this rate"); | |||||
} | } | ||||
} | } | ||||
#endif | #endif | ||||
Context not available. | |||||
} | } | ||||
#ifdef INET | #ifdef INET | ||||
#ifdef NETFLIX_STATS | |||||
extern counter_u64_t rate_limit_new; | |||||
extern counter_u64_t rate_limit_chg; | |||||
extern counter_u64_t rate_limit_set_ok; | extern counter_u64_t rate_limit_set_ok; | ||||
extern counter_u64_t rate_limit_active; | extern counter_u64_t rate_limit_active; | ||||
extern counter_u64_t rate_limit_alloc_fail; | extern counter_u64_t rate_limit_alloc_fail; | ||||
#endif | #endif | ||||
#endif | |||||
static int | static int | ||||
rl_attach_txrtlmt(struct ifnet *ifp, | rl_attach_txrtlmt(struct ifnet *ifp, | ||||
Context not available. | |||||
error = m_snd_tag_alloc(ifp, ¶ms, tag); | error = m_snd_tag_alloc(ifp, ¶ms, tag); | ||||
#ifdef INET | #ifdef INET | ||||
#ifdef NETFLIX_STATS | |||||
if (error == 0) { | if (error == 0) { | ||||
counter_u64_add(rate_limit_set_ok, 1); | counter_u64_add(rate_limit_set_ok, 1); | ||||
counter_u64_add(rate_limit_active, 1); | counter_u64_add(rate_limit_active, 1); | ||||
} else if (error != EOPNOTSUPP) | } else if (error != EOPNOTSUPP) | ||||
counter_u64_add(rate_limit_alloc_fail, 1); | counter_u64_add(rate_limit_alloc_fail, 1); | ||||
#endif | |||||
#endif | #endif | ||||
return (error); | return (error); | ||||
} | } | ||||
Context not available. | |||||
int i; | int i; | ||||
struct if_ratelimit_query_results rl; | struct if_ratelimit_query_results rl; | ||||
struct sysctl_oid *rl_sysctl_root; | struct sysctl_oid *rl_sysctl_root; | ||||
struct epoch_tracker et; | |||||
/* | /* | ||||
* We expect to enter with the | * We expect to enter with the | ||||
* mutex locked. | * mutex locked. | ||||
Context not available. | |||||
CTLFLAG_RW | CTLFLAG_MPSAFE, 0, | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, | ||||
""); | ""); | ||||
rl_add_syctl_entries(rl_sysctl_root, rs); | rl_add_syctl_entries(rl_sysctl_root, rs); | ||||
NET_EPOCH_ENTER(et); | |||||
mtx_lock(&rs_mtx); | mtx_lock(&rs_mtx); | ||||
CK_LIST_INSERT_HEAD(&int_rs, rs, next); | CK_LIST_INSERT_HEAD(&int_rs, rs, next); | ||||
mtx_unlock(&rs_mtx); | mtx_unlock(&rs_mtx); | ||||
NET_EPOCH_EXIT(et); | |||||
return (rs); | return (rs); | ||||
} else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) { | } else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) { | ||||
memset(rs, 0, sizeof(struct tcp_rate_set)); | memset(rs, 0, sizeof(struct tcp_rate_set)); | ||||
Context not available. | |||||
CTLFLAG_RW | CTLFLAG_MPSAFE, 0, | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, | ||||
""); | ""); | ||||
rl_add_syctl_entries(rl_sysctl_root, rs); | rl_add_syctl_entries(rl_sysctl_root, rs); | ||||
NET_EPOCH_ENTER(et); | |||||
mtx_lock(&rs_mtx); | mtx_lock(&rs_mtx); | ||||
CK_LIST_INSERT_HEAD(&int_rs, rs, next); | CK_LIST_INSERT_HEAD(&int_rs, rs, next); | ||||
mtx_unlock(&rs_mtx); | mtx_unlock(&rs_mtx); | ||||
NET_EPOCH_EXIT(et); | |||||
return (rs); | return (rs); | ||||
} else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) { | } else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) { | ||||
/* Mellanox C4 likely */ | /* Mellanox C4 likely */ | ||||
Context not available. | |||||
*/ | */ | ||||
rs->rs_rlt[i].ptbl = rs; | rs->rs_rlt[i].ptbl = rs; | ||||
rs->rs_rlt[i].tag = NULL; | rs->rs_rlt[i].tag = NULL; | ||||
rs->rs_rlt[i].using = 0; | |||||
rs->rs_rlt[i].rs_num_enobufs = 0; | |||||
/* | /* | ||||
* Calculate the time between. | * Calculate the time between. | ||||
*/ | */ | ||||
Context not available. | |||||
CTLFLAG_RW | CTLFLAG_MPSAFE, 0, | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, | ||||
""); | ""); | ||||
rl_add_syctl_entries(rl_sysctl_root, rs); | rl_add_syctl_entries(rl_sysctl_root, rs); | ||||
NET_EPOCH_ENTER(et); | |||||
mtx_lock(&rs_mtx); | mtx_lock(&rs_mtx); | ||||
CK_LIST_INSERT_HEAD(&int_rs, rs, next); | CK_LIST_INSERT_HEAD(&int_rs, rs, next); | ||||
mtx_unlock(&rs_mtx); | mtx_unlock(&rs_mtx); | ||||
NET_EPOCH_EXIT(et); | |||||
return (rs); | return (rs); | ||||
} | } | ||||
/* | |||||
* For an explanation of why the argument is volatile please | |||||
* look at the comments around rt_setup_rate(). | |||||
*/ | |||||
static const struct tcp_hwrate_limit_table * | static const struct tcp_hwrate_limit_table * | ||||
tcp_int_find_suitable_rate(const struct tcp_rate_set *rs, | tcp_int_find_suitable_rate(const volatile struct tcp_rate_set *rs, | ||||
uint64_t bytes_per_sec, uint32_t flags) | uint64_t bytes_per_sec, uint32_t flags, uint64_t *lower_rate) | ||||
{ | { | ||||
struct tcp_hwrate_limit_table *arte = NULL, *rte = NULL; | struct tcp_hwrate_limit_table *arte = NULL, *rte = NULL; | ||||
uint64_t mbits_per_sec, ind_calc; | uint64_t mbits_per_sec, ind_calc, previous_rate = 0; | ||||
int i; | int i; | ||||
mbits_per_sec = (bytes_per_sec * 8); | mbits_per_sec = (bytes_per_sec * 8); | ||||
Context not available. | |||||
* Smaller than 1Meg, only | * Smaller than 1Meg, only | ||||
* 3 entries can match it. | * 3 entries can match it. | ||||
*/ | */ | ||||
previous_rate = 0; | |||||
for(i = rs->rs_lowest_valid; i < 3; i++) { | for(i = rs->rs_lowest_valid; i < 3; i++) { | ||||
if (bytes_per_sec <= rs->rs_rlt[i].rate) { | if (bytes_per_sec <= rs->rs_rlt[i].rate) { | ||||
rte = &rs->rs_rlt[i]; | rte = &rs->rs_rlt[i]; | ||||
Context not available. | |||||
} else if (rs->rs_rlt[i].flags & HDWRPACE_INITED) { | } else if (rs->rs_rlt[i].flags & HDWRPACE_INITED) { | ||||
arte = &rs->rs_rlt[i]; | arte = &rs->rs_rlt[i]; | ||||
} | } | ||||
previous_rate = rs->rs_rlt[i].rate; | |||||
} | } | ||||
goto done; | goto done; | ||||
} else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) && | } else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) && | ||||
Context not available. | |||||
rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; | rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; | ||||
else | else | ||||
arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; | arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; | ||||
previous_rate = rs->rs_rlt[(ALL_HARDWARE_RATES-2)].rate; | |||||
goto done; | goto done; | ||||
} | } | ||||
/* | /* | ||||
Context not available. | |||||
ind_calc = ALL_HARDWARE_RATES-1; | ind_calc = ALL_HARDWARE_RATES-1; | ||||
} | } | ||||
if ((ind_calc >= rs->rs_lowest_valid) && | if ((ind_calc >= rs->rs_lowest_valid) && | ||||
(ind_calc <= rs->rs_highest_valid)) | (ind_calc <= rs->rs_highest_valid)) { | ||||
rte = &rs->rs_rlt[ind_calc]; | rte = &rs->rs_rlt[ind_calc]; | ||||
if (ind_calc >= 1) | |||||
previous_rate = rs->rs_rlt[(ind_calc-1)].rate; | |||||
} | |||||
} else if (flags & RS_PACING_EXACT_MATCH) { | } else if (flags & RS_PACING_EXACT_MATCH) { | ||||
if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) && | if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) && | ||||
(rs->rs_lowest_valid <= 2)){ | (rs->rs_lowest_valid <= 2)){ | ||||
Context not available. | |||||
for (i=2; i>=rs->rs_lowest_valid; i--) { | for (i=2; i>=rs->rs_lowest_valid; i--) { | ||||
if (bytes_per_sec < rs->rs_rlt[i].rate) { | if (bytes_per_sec < rs->rs_rlt[i].rate) { | ||||
rte = &rs->rs_rlt[i]; | rte = &rs->rs_rlt[i]; | ||||
if (i >= 1) { | |||||
previous_rate = rs->rs_rlt[(i-1)].rate; | |||||
} | |||||
break; | break; | ||||
} else if ((flags & RS_PACING_GEQ) && | } else if ((flags & RS_PACING_GEQ) && | ||||
(bytes_per_sec == rs->rs_rlt[i].rate)) { | (bytes_per_sec == rs->rs_rlt[i].rate)) { | ||||
rte = &rs->rs_rlt[i]; | rte = &rs->rs_rlt[i]; | ||||
if (i >= 1) { | |||||
previous_rate = rs->rs_rlt[(i-1)].rate; | |||||
} | |||||
break; | break; | ||||
} else { | } else { | ||||
arte = &rs->rs_rlt[i]; /* new alternate */ | arte = &rs->rs_rlt[i]; /* new alternate */ | ||||
Context not available. | |||||
/* The top rate is an alternative */ | /* The top rate is an alternative */ | ||||
arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; | arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)]; | ||||
} | } | ||||
previous_rate = rs->rs_rlt[(ALL_HARDWARE_RATES-2)].rate; | |||||
} else { | } else { | ||||
/* Its in our range 1Meg - 1Gig */ | /* Its in our range 1Meg - 1Gig */ | ||||
if (flags & RS_PACING_GEQ) { | if (flags & RS_PACING_GEQ) { | ||||
Context not available. | |||||
ind_calc = (ALL_HARDWARE_RATES-1); | ind_calc = (ALL_HARDWARE_RATES-1); | ||||
} | } | ||||
rte = &rs->rs_rlt[ind_calc]; | rte = &rs->rs_rlt[ind_calc]; | ||||
if (ind_calc >= 1) | |||||
previous_rate = rs->rs_rlt[(ind_calc-1)].rate; | |||||
} | } | ||||
goto done; | goto done; | ||||
} | } | ||||
Context not available. | |||||
/* This should not happen */ | /* This should not happen */ | ||||
ind_calc = ALL_HARDWARE_RATES-1; | ind_calc = ALL_HARDWARE_RATES-1; | ||||
} | } | ||||
if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED) | if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED) { | ||||
rte = &rs->rs_rlt[ind_calc]; | rte = &rs->rs_rlt[ind_calc]; | ||||
if (ind_calc >= 1) | |||||
previous_rate = rs->rs_rlt[(ind_calc-1)].rate; | |||||
} | |||||
} | } | ||||
} | } | ||||
done: | done: | ||||
Context not available. | |||||
/* We can use the substitute */ | /* We can use the substitute */ | ||||
rte = arte; | rte = arte; | ||||
} | } | ||||
if (lower_rate) | |||||
*lower_rate = previous_rate; | |||||
return (rte); | return (rte); | ||||
} | } | ||||
/* | |||||
* For an explanation of why the argument is volatile please | |||||
* look at the comments around rt_setup_rate(). | |||||
*/ | |||||
static const struct tcp_hwrate_limit_table * | static const struct tcp_hwrate_limit_table * | ||||
tcp_find_suitable_rate(const struct tcp_rate_set *rs, uint64_t bytes_per_sec, uint32_t flags) | tcp_find_suitable_rate(const volatile struct tcp_rate_set *rs, uint64_t bytes_per_sec, uint32_t flags, uint64_t *lower_rate) | ||||
{ | { | ||||
/** | /** | ||||
* Hunt the rate table with the restrictions in flags and find a | * Hunt the rate table with the restrictions in flags and find a | ||||
Context not available. | |||||
*/ | */ | ||||
int i, matched; | int i, matched; | ||||
struct tcp_hwrate_limit_table *rte = NULL; | struct tcp_hwrate_limit_table *rte = NULL; | ||||
uint64_t previous_rate = 0; | |||||
if ((rs->rs_flags & RS_INT_TBL) && | if ((rs->rs_flags & RS_INT_TBL) && | ||||
(rs->rs_rate_cnt >= ALL_HARDWARE_RATES)) { | (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)) { | ||||
Context not available. | |||||
* from 1Meg - 1000Meg in 1Meg increments. | * from 1Meg - 1000Meg in 1Meg increments. | ||||
* Use an alternate method to "lookup". | * Use an alternate method to "lookup". | ||||
*/ | */ | ||||
return (tcp_int_find_suitable_rate(rs, bytes_per_sec, flags)); | return (tcp_int_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate)); | ||||
} | } | ||||
if ((flags & RS_PACING_LT) || | if ((flags & RS_PACING_LT) || | ||||
(flags & RS_PACING_EXACT_MATCH)) { | (flags & RS_PACING_EXACT_MATCH)) { | ||||
Context not available. | |||||
(bytes_per_sec == rs->rs_rlt[i].rate)) { | (bytes_per_sec == rs->rs_rlt[i].rate)) { | ||||
rte = &rs->rs_rlt[i]; | rte = &rs->rs_rlt[i]; | ||||
matched = 1; | matched = 1; | ||||
if (lower_rate != NULL) | |||||
*lower_rate = previous_rate; | |||||
break; | break; | ||||
} else if ((flags & RS_PACING_LT) && | } else if ((flags & RS_PACING_LT) && | ||||
(bytes_per_sec <= rs->rs_rlt[i].rate)) { | (bytes_per_sec <= rs->rs_rlt[i].rate)) { | ||||
rte = &rs->rs_rlt[i]; | rte = &rs->rs_rlt[i]; | ||||
matched = 1; | matched = 1; | ||||
if (lower_rate != NULL) | |||||
*lower_rate = previous_rate; | |||||
break; | break; | ||||
} | } | ||||
previous_rate = rs->rs_rlt[i].rate; | |||||
if (bytes_per_sec > rs->rs_rlt[i].rate) | if (bytes_per_sec > rs->rs_rlt[i].rate) | ||||
break; | break; | ||||
} | } | ||||
Context not available. | |||||
* We found a table entry that is smaller, | * We found a table entry that is smaller, | ||||
* stop there will be none greater or equal. | * stop there will be none greater or equal. | ||||
*/ | */ | ||||
if (lower_rate != NULL) | |||||
*lower_rate = rs->rs_rlt[i].rate; | |||||
break; | break; | ||||
} | } | ||||
} | } | ||||
Context not available. | |||||
rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error) | rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error) | ||||
{ | { | ||||
struct ifnet *tifp; | struct ifnet *tifp; | ||||
struct m_snd_tag *tag; | struct m_snd_tag *tag, *ntag; | ||||
union if_snd_tag_alloc_params params = { | union if_snd_tag_alloc_params params = { | ||||
.rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT, | .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT, | ||||
.rate_limit.hdr.flowid = 1, | .rate_limit.hdr.flowid = inp->inp_flowid, | ||||
.rate_limit.hdr.numa_domain = inp->inp_numa_domain, | .rate_limit.hdr.numa_domain = inp->inp_numa_domain, | ||||
.rate_limit.max_rate = COMMON_RATE, | .rate_limit.max_rate = COMMON_RATE, | ||||
.rate_limit.flags = M_NOWAIT, | .rate_limit.flags = M_NOWAIT, | ||||
Context not available. | |||||
*error = err; | *error = err; | ||||
return (NULL); | return (NULL); | ||||
} | } | ||||
tifp = tag->ifp; | ntag = tag; | ||||
while(ntag->ifp->if_next_snd_tag != NULL) { | |||||
ntag = ntag->ifp->if_next_snd_tag(ntag); | |||||
} | |||||
tifp = ntag->ifp; | |||||
m_snd_tag_rele(tag); | m_snd_tag_rele(tag); | ||||
return (tifp); | return (tifp); | ||||
} | } | ||||
static void | |||||
rl_increment_using(const struct tcp_hwrate_limit_table *rte) | |||||
{ | |||||
struct tcp_hwrate_limit_table *decon_rte; | |||||
decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte); | |||||
atomic_add_long(&decon_rte->using, 1); | |||||
} | |||||
static void | |||||
rl_decrement_using(const struct tcp_hwrate_limit_table *rte) | |||||
{ | |||||
struct tcp_hwrate_limit_table *decon_rte; | |||||
decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte); | |||||
atomic_subtract_long(&decon_rte->using, 1); | |||||
} | |||||
void | |||||
tcp_rl_log_enobuf(const struct tcp_hwrate_limit_table *rte) | |||||
{ | |||||
struct tcp_hwrate_limit_table *decon_rte; | |||||
decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte); | |||||
atomic_add_long(&decon_rte->rs_num_enobufs, 1); | |||||
} | |||||
/*
 * Do NOT take the __noinline out of the
 * find_rs_for_ifp() function. If you do, inlining
 * it into rt_setup_rate() will expose a
 * compiler bug. For some reason the compiler thinks
 * the list can never be empty. The consequence of
 * this will be a crash when we dereference NULL
 * if an ifp is removed just as a hw rate limit
 * is attempted. If you are working on the compiler
 * and want to "test" this go ahead and take the noinline
 * out; otherwise let sleeping dogs lie until such time
 * as we get a compiler fix 10/2/20 -- RRS
 */
static __noinline struct tcp_rate_set * | |||||
find_rs_for_ifp(struct ifnet *ifp) | |||||
{ | |||||
struct tcp_rate_set *rs; | |||||
CK_LIST_FOREACH(rs, &int_rs, next) { | |||||
if ((rs->rs_ifp == ifp) && | |||||
(rs->rs_if_dunit == ifp->if_dunit)) { | |||||
/* Ok we found it */ | |||||
return (rs); | |||||
} | |||||
} | |||||
return (NULL); | |||||
} | |||||
static const struct tcp_hwrate_limit_table * | static const struct tcp_hwrate_limit_table * | ||||
rt_setup_rate(struct inpcb *inp, struct ifnet *ifp, uint64_t bytes_per_sec, | rt_setup_rate(struct inpcb *inp, struct ifnet *ifp, uint64_t bytes_per_sec, | ||||
uint32_t flags, int *error) | uint32_t flags, int *error, uint64_t *lower_rate) | ||||
{ | { | ||||
/* First lets find the interface if it exists */ | /* First lets find the interface if it exists */ | ||||
const struct tcp_hwrate_limit_table *rte; | const struct tcp_hwrate_limit_table *rte; | ||||
struct tcp_rate_set *rs; | /* | ||||
* So why is rs volatile? This is to defeat a | |||||
* compiler bug where in the compiler is convinced | |||||
* that rs can never be NULL (which is not true). Because | |||||
* of its conviction it nicely optimizes out the if ((rs == NULL | |||||
* below which means if you get a NULL back you dereference it. | |||||
*/ | |||||
volatile struct tcp_rate_set *rs; | |||||
struct epoch_tracker et; | struct epoch_tracker et; | ||||
struct ifnet *oifp = ifp; | |||||
int err; | int err; | ||||
NET_EPOCH_ENTER(et); | NET_EPOCH_ENTER(et); | ||||
use_real_interface: | use_real_interface: | ||||
CK_LIST_FOREACH(rs, &int_rs, next) { | rs = find_rs_for_ifp(ifp); | ||||
/* | |||||
* Note we don't look with the lock since we either see a | |||||
* new entry or will get one when we try to add it. | |||||
*/ | |||||
if (rs->rs_flags & RS_IS_DEAD) { | |||||
/* The dead are not looked at */ | |||||
continue; | |||||
} | |||||
if ((rs->rs_ifp == ifp) && | |||||
(rs->rs_if_dunit == ifp->if_dunit)) { | |||||
/* Ok we found it */ | |||||
break; | |||||
} | |||||
} | |||||
if ((rs == NULL) || | if ((rs == NULL) || | ||||
(rs->rs_flags & RS_INTF_NO_SUP) || | (rs->rs_flags & RS_INTF_NO_SUP) || | ||||
(rs->rs_flags & RS_IS_DEAD)) { | (rs->rs_flags & RS_IS_DEAD)) { | ||||
Context not available. | |||||
* might be arguable, but its impossible | * might be arguable, but its impossible | ||||
* to tell from the departing case. | * to tell from the departing case. | ||||
*/ | */ | ||||
if (rs->rs_disable && error) | if (error) | ||||
*error = ENODEV; | *error = ENODEV; | ||||
NET_EPOCH_EXIT(et); | NET_EPOCH_EXIT(et); | ||||
return (NULL); | return (NULL); | ||||
} | } | ||||
if ((rs == NULL) || (rs->rs_disable != 0)) { | if ((rs == NULL) || (rs->rs_disable != 0)) { | ||||
if (rs->rs_disable && error) | if (error) | ||||
*error = ENOSPC; | *error = ENOSPC; | ||||
NET_EPOCH_EXIT(et); | NET_EPOCH_EXIT(et); | ||||
return (NULL); | return (NULL); | ||||
Context not available. | |||||
NET_EPOCH_EXIT(et); | NET_EPOCH_EXIT(et); | ||||
return (NULL); | return (NULL); | ||||
} | } | ||||
KASSERT((tifp != ifp), | |||||
("Lookup failure ifp:%p inp:%p rt_find_real_interface() returns the same interface tifp:%p?\n", | |||||
ifp, inp, tifp)); | |||||
ifp = tifp; | |||||
goto use_real_interface; | goto use_real_interface; | ||||
} | } | ||||
if (rs->rs_flow_limit && | if (rs->rs_flow_limit && | ||||
Context not available. | |||||
NET_EPOCH_EXIT(et); | NET_EPOCH_EXIT(et); | ||||
return (NULL); | return (NULL); | ||||
} | } | ||||
rte = tcp_find_suitable_rate(rs, bytes_per_sec, flags); | rte = tcp_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate); | ||||
if (rte) { | if (rte) { | ||||
err = in_pcbattach_txrtlmt(inp, rs->rs_ifp, | err = in_pcbattach_txrtlmt(inp, oifp, | ||||
inp->inp_flowtype, | inp->inp_flowtype, | ||||
inp->inp_flowid, | inp->inp_flowid, | ||||
rte->rate, | rte->rate, | ||||
Context not available. | |||||
if (error) | if (error) | ||||
*error = err; | *error = err; | ||||
rte = NULL; | rte = NULL; | ||||
} else { | |||||
KASSERT((inp->inp_snd_tag != NULL) , | |||||
("Setup rate has no snd_tag inp:%p rte:%p rate:%lu rs:%p", | |||||
inp, rte, rte->rate, rs)); | |||||
#ifdef NETFLIX_STATS | |||||
counter_u64_add(rate_limit_new, 1); | |||||
#endif | |||||
} | } | ||||
} | } | ||||
if (rte) { | if (rte) { | ||||
Context not available. | |||||
{ | { | ||||
int error; | int error; | ||||
struct tcp_rate_set *rs; | struct tcp_rate_set *rs; | ||||
struct epoch_tracker et; | |||||
if (((ifp->if_capenable & IFCAP_TXRTLMT) == 0) || | if (((ifp->if_capenable & IFCAP_TXRTLMT) == 0) || | ||||
(link_state != LINK_STATE_UP)) { | (link_state != LINK_STATE_UP)) { | ||||
Context not available. | |||||
*/ | */ | ||||
return; | return; | ||||
} | } | ||||
NET_EPOCH_ENTER(et); | |||||
mtx_lock(&rs_mtx); | mtx_lock(&rs_mtx); | ||||
CK_LIST_FOREACH(rs, &int_rs, next) { | rs = find_rs_for_ifp(ifp); | ||||
if ((rs->rs_ifp == ifp) && | if (rs) { | ||||
(rs->rs_if_dunit == ifp->if_dunit)) { | /* We already have initialized this guy */ | ||||
/* We already have initialized this guy */ | mtx_unlock(&rs_mtx); | ||||
mtx_unlock(&rs_mtx); | NET_EPOCH_EXIT(et); | ||||
return; | return; | ||||
} | |||||
} | } | ||||
mtx_unlock(&rs_mtx); | mtx_unlock(&rs_mtx); | ||||
NET_EPOCH_EXIT(et); | |||||
rt_setup_new_rs(ifp, &error); | rt_setup_new_rs(ifp, &error); | ||||
} | } | ||||
static void | static void | ||||
tcp_rl_ifnet_departure(void *arg __unused, struct ifnet *ifp) | tcp_rl_ifnet_departure(void *arg __unused, struct ifnet *ifp) | ||||
{ | { | ||||
struct tcp_rate_set *rs, *nrs; | struct tcp_rate_set *rs; | ||||
struct epoch_tracker et; | |||||
int i; | int i; | ||||
NET_EPOCH_ENTER(et); | |||||
mtx_lock(&rs_mtx); | mtx_lock(&rs_mtx); | ||||
CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) { | rs = find_rs_for_ifp(ifp); | ||||
if ((rs->rs_ifp == ifp) && | if (rs) { | ||||
(rs->rs_if_dunit == ifp->if_dunit)) { | CK_LIST_REMOVE(rs, next); | ||||
CK_LIST_REMOVE(rs, next); | rs_number_alive--; | ||||
rs_number_alive--; | rs->rs_flags |= RS_IS_DEAD; | ||||
rs->rs_flags |= RS_IS_DEAD; | for (i = 0; i < rs->rs_rate_cnt; i++) { | ||||
for (i = 0; i < rs->rs_rate_cnt; i++) { | if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) { | ||||
if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) { | in_pcbdetach_tag(rs->rs_rlt[i].tag); | ||||
in_pcbdetach_tag(rs->rs_rlt[i].tag); | rs->rs_rlt[i].tag = NULL; | ||||
rs->rs_rlt[i].tag = NULL; | |||||
} | |||||
rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED; | |||||
} | } | ||||
if (rs->rs_flows_using == 0) | rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED; | ||||
rs_defer_destroy(rs); | |||||
break; | |||||
} | } | ||||
if (rs->rs_flows_using == 0) | |||||
rs_defer_destroy(rs); | |||||
} | } | ||||
mtx_unlock(&rs_mtx); | mtx_unlock(&rs_mtx); | ||||
NET_EPOCH_EXIT(et); | |||||
} | } | ||||
static void | static void | ||||
tcp_rl_shutdown(void *arg __unused, int howto __unused) | tcp_rl_shutdown(void *arg __unused, int howto __unused) | ||||
{ | { | ||||
struct tcp_rate_set *rs, *nrs; | struct tcp_rate_set *rs, *nrs; | ||||
struct epoch_tracker et; | |||||
int i; | int i; | ||||
NET_EPOCH_ENTER(et); | |||||
mtx_lock(&rs_mtx); | mtx_lock(&rs_mtx); | ||||
CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) { | CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) { | ||||
CK_LIST_REMOVE(rs, next); | CK_LIST_REMOVE(rs, next); | ||||
Context not available. | |||||
rs_defer_destroy(rs); | rs_defer_destroy(rs); | ||||
} | } | ||||
mtx_unlock(&rs_mtx); | mtx_unlock(&rs_mtx); | ||||
NET_EPOCH_EXIT(et); | |||||
} | } | ||||
const struct tcp_hwrate_limit_table * | const struct tcp_hwrate_limit_table * | ||||
tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp, | tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp, | ||||
uint64_t bytes_per_sec, int flags, int *error) | uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate) | ||||
{ | { | ||||
const struct tcp_hwrate_limit_table *rte; | const struct tcp_hwrate_limit_table *rte; | ||||
#ifdef KERN_TLS | #ifdef KERN_TLS | ||||
Context not available. | |||||
} | } | ||||
} | } | ||||
#endif | #endif | ||||
rte = rt_setup_rate(tp->t_inpcb, ifp, bytes_per_sec, flags, error); | rte = rt_setup_rate(tp->t_inpcb, ifp, bytes_per_sec, flags, error, lower_rate); | ||||
if (rte) | |||||
rl_increment_using(rte); | |||||
#ifdef KERN_TLS | #ifdef KERN_TLS | ||||
if (rte != NULL && tls != NULL && tls->snd_tag != NULL) { | if (rte != NULL && tls != NULL && tls->snd_tag != NULL) { | ||||
/* | /* | ||||
Context not available. | |||||
*error = EINVAL; | *error = EINVAL; | ||||
rte = NULL; | rte = NULL; | ||||
} | } | ||||
tp->t_pacing_rate = rte->rate; | if (rte != NULL) { | ||||
*error = 0; | tp->t_pacing_rate = rte->rate; | ||||
*error = 0; | |||||
} | |||||
return (rte); | return (rte); | ||||
} | } | ||||
const struct tcp_hwrate_limit_table * | const struct tcp_hwrate_limit_table * | ||||
tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte, | tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte, | ||||
struct tcpcb *tp, struct ifnet *ifp, | struct tcpcb *tp, struct ifnet *ifp, | ||||
uint64_t bytes_per_sec, int flags, int *error) | uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate) | ||||
{ | { | ||||
const struct tcp_hwrate_limit_table *nrte; | const struct tcp_hwrate_limit_table *nrte; | ||||
const struct tcp_rate_set *rs; | const struct tcp_rate_set *rs; | ||||
#ifdef KERN_TLS | #ifdef KERN_TLS | ||||
struct ktls_session *tls = NULL; | struct ktls_session *tls = NULL; | ||||
#endif | #endif | ||||
int is_indirect = 0; | |||||
int err; | int err; | ||||
INP_WLOCK_ASSERT(tp->t_inpcb); | INP_WLOCK_ASSERT(tp->t_inpcb); | ||||
Context not available. | |||||
if ((rs->rs_flags & RS_IS_DEAD) || | if ((rs->rs_flags & RS_IS_DEAD) || | ||||
(crte->flags & HDWRPACE_IFPDEPARTED)) { | (crte->flags & HDWRPACE_IFPDEPARTED)) { | ||||
/* Release the rate, and try anew */ | /* Release the rate, and try anew */ | ||||
re_rate: | |||||
tcp_rel_pacing_rate(crte, tp); | tcp_rel_pacing_rate(crte, tp); | ||||
nrte = tcp_set_pacing_rate(tp, ifp, | nrte = tcp_set_pacing_rate(tp, ifp, | ||||
bytes_per_sec, flags, error); | bytes_per_sec, flags, error, lower_rate); | ||||
return (nrte); | return (nrte); | ||||
} | } | ||||
if ((rs->rs_flags & RT_IS_INDIRECT ) == RT_IS_INDIRECT) | nrte = tcp_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate); | ||||
is_indirect = 1; | |||||
else | |||||
is_indirect = 0; | |||||
if ((is_indirect == 0) && | |||||
((ifp != rs->rs_ifp) || | |||||
(ifp->if_dunit != rs->rs_if_dunit))) { | |||||
/* | |||||
* Something changed, the user is not pointing to the same | |||||
* ifp? Maybe a route updated on this guy? | |||||
*/ | |||||
goto re_rate; | |||||
} else if (is_indirect) { | |||||
/* | |||||
* For indirect we have to dig in and find the real interface. | |||||
*/ | |||||
struct ifnet *rifp; | |||||
rifp = rt_find_real_interface(ifp, tp->t_inpcb, error); | |||||
if (rifp == NULL) { | |||||
/* Can't find it? */ | |||||
goto re_rate; | |||||
} | |||||
if ((rifp != rs->rs_ifp) || | |||||
(ifp->if_dunit != rs->rs_if_dunit)) { | |||||
goto re_rate; | |||||
} | |||||
} | |||||
nrte = tcp_find_suitable_rate(rs, bytes_per_sec, flags); | |||||
if (nrte == crte) { | if (nrte == crte) { | ||||
/* No change */ | /* No change */ | ||||
if (error) | if (error) | ||||
Context not available. | |||||
} | } | ||||
if (nrte == NULL) { | if (nrte == NULL) { | ||||
/* Release the old rate */ | /* Release the old rate */ | ||||
if (error) | |||||
*error = ENOENT; | |||||
tcp_rel_pacing_rate(crte, tp); | tcp_rel_pacing_rate(crte, tp); | ||||
return (NULL); | return (NULL); | ||||
} | } | ||||
rl_decrement_using(crte); | |||||
rl_increment_using(nrte); | |||||
/* Change rates to our new entry */ | /* Change rates to our new entry */ | ||||
#ifdef KERN_TLS | #ifdef KERN_TLS | ||||
if (tls != NULL) | if (tls != NULL) | ||||
Context not available. | |||||
#endif | #endif | ||||
err = in_pcbmodify_txrtlmt(tp->t_inpcb, nrte->rate); | err = in_pcbmodify_txrtlmt(tp->t_inpcb, nrte->rate); | ||||
if (err) { | if (err) { | ||||
rl_decrement_using(nrte); | |||||
/* Do we still have a snd-tag attached? */ | |||||
if (tp->t_inpcb->inp_snd_tag) | |||||
in_pcbdetach_txrtlmt(tp->t_inpcb); | |||||
if (error) | if (error) | ||||
*error = err; | *error = err; | ||||
return (NULL); | return (NULL); | ||||
} | } | ||||
#ifdef NETFLIX_STATS | |||||
else | |||||
counter_u64_add(rate_limit_chg, 1); | |||||
#endif | |||||
if (error) | if (error) | ||||
*error = 0; | *error = 0; | ||||
tp->t_pacing_rate = nrte->rate; | tp->t_pacing_rate = nrte->rate; | ||||
Context not available. | |||||
* in order to release our refcount. | * in order to release our refcount. | ||||
*/ | */ | ||||
rs = __DECONST(struct tcp_rate_set *, crs); | rs = __DECONST(struct tcp_rate_set *, crs); | ||||
rl_decrement_using(crte); | |||||
pre = atomic_fetchadd_64(&rs->rs_flows_using, -1); | pre = atomic_fetchadd_64(&rs->rs_flows_using, -1); | ||||
if (pre == 1) { | if (pre == 1) { | ||||
struct epoch_tracker et; | |||||
NET_EPOCH_ENTER(et); | |||||
mtx_lock(&rs_mtx); | mtx_lock(&rs_mtx); | ||||
/* | /* | ||||
* Is it dead? | * Is it dead? | ||||
Context not available. | |||||
if (rs->rs_flags & RS_IS_DEAD) | if (rs->rs_flags & RS_IS_DEAD) | ||||
rs_defer_destroy(rs); | rs_defer_destroy(rs); | ||||
mtx_unlock(&rs_mtx); | mtx_unlock(&rs_mtx); | ||||
NET_EPOCH_EXIT(et); | |||||
} | } | ||||
/* | /* | ||||
Context not available. | |||||
#define FIVE_HUNDRED_MBPS 62500000 /* 500Mbps in bytes per second */ | #define FIVE_HUNDRED_MBPS 62500000 /* 500Mbps in bytes per second */ | ||||
#define MAX_MSS_SENT 43 /* 43 mss = 43 x 1500 = 64,500 bytes */ | #define MAX_MSS_SENT 43 /* 43 mss = 43 x 1500 = 64,500 bytes */ | ||||
static void | |||||
tcp_log_pacing_size(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, uint32_t new_tso, | |||||
uint64_t hw_rate, uint32_t time_between, uint32_t calc_time_between, | |||||
uint32_t segs, uint32_t res_div, uint16_t mult, uint8_t mod) | |||||
{ | |||||
if (tp->t_logstate != TCP_LOG_STATE_OFF) { | |||||
union tcp_log_stackspecific log; | |||||
struct timeval tv; | |||||
uint32_t cts; | |||||
memset(&log, 0, sizeof(log)); | |||||
cts = tcp_get_usecs(&tv); | |||||
log.u_bbr.flex1 = segsiz; | |||||
log.u_bbr.flex2 = new_tso; | |||||
log.u_bbr.flex3 = time_between; | |||||
log.u_bbr.flex4 = calc_time_between; | |||||
log.u_bbr.flex5 = segs; | |||||
log.u_bbr.flex6 = res_div; | |||||
log.u_bbr.flex7 = mult; | |||||
log.u_bbr.flex8 = mod; | |||||
log.u_bbr.timeStamp = tcp_get_usecs(&tv); | |||||
log.u_bbr.cur_del_rate = bw; | |||||
log.u_bbr.delRate = hw_rate; | |||||
TCP_LOG_EVENTP(tp, NULL, | |||||
&tp->t_inpcb->inp_socket->so_rcv, | |||||
&tp->t_inpcb->inp_socket->so_snd, | |||||
TCP_HDWR_PACE_SIZE, 0, | |||||
0, &log, false, &tv); | |||||
} | |||||
} | |||||
uint32_t | uint32_t | ||||
tcp_get_pacing_burst_size (uint64_t bw, uint32_t segsiz, int can_use_1mss, | tcp_get_pacing_burst_size (struct tcpcb *tp, uint64_t bw, uint32_t segsiz, int can_use_1mss, | ||||
const struct tcp_hwrate_limit_table *te, int *err) | const struct tcp_hwrate_limit_table *te, int *err) | ||||
{ | { | ||||
/* | /* | ||||
Context not available. | |||||
min_tso_segs = 1; | min_tso_segs = 1; | ||||
else | else | ||||
min_tso_segs = 2; | min_tso_segs = 2; | ||||
if (new_tso < min_tso_segs) | if (rs_floor_mss && (new_tso < rs_floor_mss)) | ||||
new_tso = rs_floor_mss; | |||||
else if (new_tso < min_tso_segs) | |||||
new_tso = min_tso_segs; | new_tso = min_tso_segs; | ||||
if (new_tso > MAX_MSS_SENT) | if (new_tso > MAX_MSS_SENT) | ||||
new_tso = MAX_MSS_SENT; | new_tso = MAX_MSS_SENT; | ||||
new_tso *= segsiz; | new_tso *= segsiz; | ||||
tcp_log_pacing_size(tp, bw, segsiz, new_tso, | |||||
0, 0, 0, 0, 0, 0, 1); | |||||
/* | /* | ||||
* If we are not doing hardware pacing | * If we are not doing hardware pacing | ||||
* then we are done. | * then we are done. | ||||
Context not available. | |||||
* max (43 segments). | * max (43 segments). | ||||
*/ | */ | ||||
if (te->rate > FIVE_HUNDRED_MBPS) | if (te->rate > FIVE_HUNDRED_MBPS) | ||||
return (segsiz * MAX_MSS_SENT); | goto max; | ||||
if (te->rate == bw) { | if (te->rate == bw) { | ||||
/* We are pacing at exactly the hdwr rate */ | /* We are pacing at exactly the hdwr rate */ | ||||
max: | |||||
tcp_log_pacing_size(tp, bw, segsiz, new_tso, | |||||
te->rate, te->time_between, (uint32_t)0, | |||||
(segsiz * MAX_MSS_SENT), 0, 0, 3); | |||||
return (segsiz * MAX_MSS_SENT); | return (segsiz * MAX_MSS_SENT); | ||||
} | } | ||||
lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND; | lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND; | ||||
res = lentim / bw; | res = lentim / bw; | ||||
if (res > te->time_between) { | if (res > te->time_between) { | ||||
uint32_t delta, segs; | uint32_t delta, segs, res_div; | ||||
res_div = ((res * num_of_waits_allowed) + wait_time_floor); | |||||
delta = res - te->time_between; | delta = res - te->time_between; | ||||
segs = (res + delta - 1)/delta; | segs = (res_div + delta - 1)/delta; | ||||
if (te->rate > ONE_HUNDRED_MBPS) | |||||
segs *= 2; | |||||
if (segs < min_tso_segs) | if (segs < min_tso_segs) | ||||
segs = min_tso_segs; | segs = min_tso_segs; | ||||
if (segs < rs_hw_floor_mss) | |||||
segs = rs_hw_floor_mss; | |||||
if (segs > MAX_MSS_SENT) | if (segs > MAX_MSS_SENT) | ||||
segs = MAX_MSS_SENT; | segs = MAX_MSS_SENT; | ||||
segs *= segsiz; | segs *= segsiz; | ||||
tcp_log_pacing_size(tp, bw, segsiz, new_tso, | |||||
te->rate, te->time_between, (uint32_t)res, | |||||
segs, res_div, 1, 3); | |||||
if (err) | if (err) | ||||
*err = 0; | *err = 0; | ||||
if (segs < new_tso) { | if (segs < new_tso) { | ||||
Context not available. | |||||
* hardware. Send back the non-hardware | * hardware. Send back the non-hardware | ||||
* rate. | * rate. | ||||
*/ | */ | ||||
tcp_log_pacing_size(tp, bw, segsiz, new_tso, | |||||
te->rate, te->time_between, (uint32_t)res, | |||||
0, 0, 0, 4); | |||||
if (err) | if (err) | ||||
*err = -1; | *err = -1; | ||||
return (new_tso); | return (new_tso); | ||||
} | } | ||||
} | } | ||||
uint64_t | |||||
tcp_hw_highest_rate_ifp(struct ifnet *ifp, struct inpcb *inp) | |||||
{ | |||||
struct epoch_tracker et; | |||||
struct tcp_rate_set *rs; | |||||
uint64_t rate_ret; | |||||
NET_EPOCH_ENTER(et); | |||||
use_next_interface: | |||||
rs = find_rs_for_ifp(ifp); | |||||
if (rs == NULL) { | |||||
/* This interface does not do ratelimiting */ | |||||
rate_ret = 0; | |||||
} else if (rs->rs_flags & RS_IS_DEFF) { | |||||
/* We need to find the real interface */ | |||||
struct ifnet *tifp; | |||||
tifp = rt_find_real_interface(ifp, inp, NULL); | |||||
if (tifp == NULL) { | |||||
NET_EPOCH_EXIT(et); | |||||
return (0); | |||||
} | |||||
ifp = tifp; | |||||
goto use_next_interface; | |||||
} else { | |||||
/* Lets return the highest rate this guy has */ | |||||
rate_ret = rs->rs_rlt[rs->rs_highest_valid].rate; | |||||
} | |||||
NET_EPOCH_EXIT(et); | |||||
return(rate_ret); | |||||
} | |||||
static eventhandler_tag rl_ifnet_departs; | static eventhandler_tag rl_ifnet_departs; | ||||
static eventhandler_tag rl_ifnet_arrives; | static eventhandler_tag rl_ifnet_arrives; | ||||
static eventhandler_tag rl_shutdown_start; | static eventhandler_tag rl_shutdown_start; | ||||
Context not available. |