netinet/tcp_stacks/rack.c
/*- | /*- | ||||
* Copyright (c) 2016-2019 Netflix, Inc. | * Copyright (c) 2016 | ||||
* Netflix Inc. All rights reserved. | |||||
* | * | ||||
* Redistribution and use in source and binary forms, with or without | * Redistribution and use in source and binary forms, with or without | ||||
* modification, are permitted provided that the following conditions | * modification, are permitted provided that the following conditions | ||||
Context not available. | |||||
#include <sys/mutex.h> | #include <sys/mutex.h> | ||||
#include <sys/mbuf.h> | #include <sys/mbuf.h> | ||||
#include <sys/proc.h> /* for proc0 declaration */ | #include <sys/proc.h> /* for proc0 declaration */ | ||||
#ifdef NETFLIX_STATS | |||||
#include <sys/qmath.h> | |||||
#endif | |||||
#include <sys/socket.h> | #include <sys/socket.h> | ||||
#include <sys/socketvar.h> | #include <sys/socketvar.h> | ||||
#include <sys/sysctl.h> | #include <sys/sysctl.h> | ||||
#include <sys/systm.h> | #include <sys/systm.h> | ||||
#include <sys/tree.h> | |||||
#ifdef NETFLIX_STATS | #ifdef NETFLIX_STATS | ||||
#include <sys/stats.h> | #include <sys/stats.h> /* Must come after qmath.h and tree.h */ | ||||
#endif | #endif | ||||
#include <sys/refcount.h> | #include <sys/refcount.h> | ||||
#include <sys/queue.h> | #include <sys/queue.h> | ||||
Context not available. | |||||
#include <netinet/ip6.h> | #include <netinet/ip6.h> | ||||
#include <netinet6/in6_pcb.h> | #include <netinet6/in6_pcb.h> | ||||
#include <netinet6/ip6_var.h> | #include <netinet6/ip6_var.h> | ||||
#define TCPOUTFLAGS | |||||
#include <netinet/tcp.h> | #include <netinet/tcp.h> | ||||
#define TCPOUTFLAGS | |||||
#include <netinet/tcp_fsm.h> | #include <netinet/tcp_fsm.h> | ||||
#include <netinet/tcp_log_buf.h> | #include <netinet/tcp_log_buf.h> | ||||
#include <netinet/tcp_seq.h> | #include <netinet/tcp_seq.h> | ||||
Context not available. | |||||
#include <netinet/tcp_hpts.h> | #include <netinet/tcp_hpts.h> | ||||
#include <netinet/tcpip.h> | #include <netinet/tcpip.h> | ||||
#include <netinet/cc/cc.h> | #include <netinet/cc/cc.h> | ||||
#ifdef NETFLIX_CWV | |||||
#include <netinet/tcp_newcwv.h> | |||||
#endif | |||||
#include <netinet/tcp_fastopen.h> | #include <netinet/tcp_fastopen.h> | ||||
#ifdef TCPDEBUG | #ifdef TCPDEBUG | ||||
#include <netinet/tcp_debug.h> | #include <netinet/tcp_debug.h> | ||||
Context not available. | |||||
struct sysctl_ctx_list rack_sysctl_ctx; | struct sysctl_ctx_list rack_sysctl_ctx; | ||||
struct sysctl_oid *rack_sysctl_root; | struct sysctl_oid *rack_sysctl_root; | ||||
#ifndef TCPHPTS | |||||
fatal error missing option TCPHPTS in the build; | |||||
#endif | |||||
#define CUM_ACKED 1 | #define CUM_ACKED 1 | ||||
#define SACKED 2 | #define SACKED 2 | ||||
Context not available. | |||||
static int32_t rack_tlp_in_recovery = 1; /* Can we do TLP in recovery? */ | static int32_t rack_tlp_in_recovery = 1; /* Can we do TLP in recovery? */ | ||||
static int32_t rack_verbose_logging = 0; | static int32_t rack_verbose_logging = 0; | ||||
static int32_t rack_ignore_data_after_close = 1; | static int32_t rack_ignore_data_after_close = 1; | ||||
static int32_t rack_map_entries_limit = 1024; | |||||
static int32_t rack_map_split_limit = 256; | |||||
/* | /* | ||||
* Currently regular tcp has a rto_min of 30ms | * Currently regular tcp has a rto_min of 30ms | ||||
* the backoff goes 12 times so that ends up | * the backoff goes 12 times so that ends up | ||||
Context not available. | |||||
static int32_t rack_sack_block_limit = 128; | static int32_t rack_sack_block_limit = 128; | ||||
static int32_t rack_use_sack_filter = 1; | static int32_t rack_use_sack_filter = 1; | ||||
static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE; | static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE; | ||||
static uint32_t rack_map_split_limit = 0; /* unlimited by default */ | |||||
/* Rack specific counters */ | /* Rack specific counters */ | ||||
counter_u64_t rack_badfr; | counter_u64_t rack_badfr; | ||||
Context not available. | |||||
counter_u64_t rack_to_alloc; | counter_u64_t rack_to_alloc; | ||||
counter_u64_t rack_to_alloc_hard; | counter_u64_t rack_to_alloc_hard; | ||||
counter_u64_t rack_to_alloc_emerg; | counter_u64_t rack_to_alloc_emerg; | ||||
counter_u64_t rack_to_alloc_limited; | |||||
counter_u64_t rack_alloc_limited_conns; | counter_u64_t rack_alloc_limited_conns; | ||||
counter_u64_t rack_split_limited; | counter_u64_t rack_split_limited; | ||||
Context not available. | |||||
counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE]; | counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE]; | ||||
counter_u64_t rack_opts_arry[RACK_OPTS_SIZE]; | counter_u64_t rack_opts_arry[RACK_OPTS_SIZE]; | ||||
/* | |||||
* This was originally defined in tcp_timer.c, but is now reproduced here given | |||||
* the unification of the SYN and non-SYN retransmit timer exponents combined | |||||
* with wanting to retain previous behaviour for previously deployed stack | |||||
* versions. | |||||
*/ | |||||
int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = | |||||
{ 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 }; | |||||
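For reference only, a hedged sketch of how the table above is consumed: the retransmit-timeout path further down multiplies RACK_INITIAL_RTO by the entry selected by t_rxtshift for SYN_SENT/SYN_RECEIVED. The 1000 ms value used for RACK_INITIAL_RTO below is an assumption for illustration, not something shown in this diff.

/*
 * Hedged sketch, not part of the change: shows how the SYN backoff
 * table above would scale an assumed 1000 ms initial RTO as
 * t_rxtshift grows (clamped to the last table entry).
 */
static int
example_syn_rto_msec(int rxtshift)
{
	static const int syn_backoff[] =
	    { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };
	const int assumed_initial_rto = 1000;	/* ms; assumed value */
	const int maxshift =
	    (int)(sizeof(syn_backoff) / sizeof(syn_backoff[0])) - 1;

	if (rxtshift > maxshift)
		rxtshift = maxshift;
	return (assumed_initial_rto * syn_backoff[rxtshift]);
}

Under that assumption the first five SYN retransmissions keep a 1 s timeout, after which it doubles each time up to a 64 s cap.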
static void | static void | ||||
rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line); | rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line); | ||||
static int | static int | ||||
rack_process_ack(struct mbuf *m, struct tcphdr *th, | rack_process_ack(struct mbuf *m, struct tcphdr *th, | ||||
struct socket *so, struct tcpcb *tp, struct tcpopt *to, | struct socket *so, struct tcpcb *tp, struct tcpopt *to, | ||||
uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val); | uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val); | ||||
static int | static int | ||||
rack_process_data(struct mbuf *m, struct tcphdr *th, | rack_process_data(struct mbuf *m, struct tcphdr *th, | ||||
Context not available. | |||||
rack_do_closing(struct mbuf *m, struct tcphdr *th, | rack_do_closing(struct mbuf *m, struct tcphdr *th, | ||||
struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, | struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, | ||||
int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); | int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); | ||||
static void rack_do_drop(struct mbuf *m, struct tcpcb *tp); | |||||
static void | static void | ||||
rack_do_drop(struct mbuf *m, struct tcpcb *tp); | |||||
static void | |||||
rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, | rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, | ||||
struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val); | struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val); | ||||
static void | static void | ||||
rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, | rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, | ||||
struct tcphdr *th, int32_t rstreason, int32_t tlen); | struct tcphdr *th, int32_t rstreason, int32_t tlen); | ||||
static int | static int | ||||
rack_do_established(struct mbuf *m, struct tcphdr *th, | rack_do_established(struct mbuf *m, struct tcphdr *th, | ||||
struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, | struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, | ||||
Context not available. | |||||
counter_u64_zero(rack_sack_proc_short); | counter_u64_zero(rack_sack_proc_short); | ||||
counter_u64_zero(rack_sack_proc_restart); | counter_u64_zero(rack_sack_proc_restart); | ||||
counter_u64_zero(rack_to_alloc); | counter_u64_zero(rack_to_alloc); | ||||
counter_u64_zero(rack_to_alloc_limited); | |||||
counter_u64_zero(rack_alloc_limited_conns); | counter_u64_zero(rack_alloc_limited_conns); | ||||
counter_u64_zero(rack_split_limited); | counter_u64_zero(rack_split_limited); | ||||
counter_u64_zero(rack_find_high); | counter_u64_zero(rack_find_high); | ||||
Context not available. | |||||
{ | { | ||||
SYSCTL_ADD_S32(&rack_sysctl_ctx, | SYSCTL_ADD_S32(&rack_sysctl_ctx, | ||||
SYSCTL_CHILDREN(rack_sysctl_root), | SYSCTL_CHILDREN(rack_sysctl_root), | ||||
OID_AUTO, "map_limit", CTLFLAG_RW, | |||||
&rack_map_entries_limit , 1024, | |||||
"Is there a limit on how big the sendmap can grow? "); | |||||
SYSCTL_ADD_S32(&rack_sysctl_ctx, | |||||
SYSCTL_CHILDREN(rack_sysctl_root), | |||||
OID_AUTO, "map_splitlimit", CTLFLAG_RW, | |||||
&rack_map_split_limit , 256, | |||||
"Is there a limit on how much splitting a peer can do?"); | |||||
SYSCTL_ADD_S32(&rack_sysctl_ctx, | |||||
SYSCTL_CHILDREN(rack_sysctl_root), | |||||
OID_AUTO, "rate_sample_method", CTLFLAG_RW, | OID_AUTO, "rate_sample_method", CTLFLAG_RW, | ||||
&rack_rate_sample_method , USE_RTT_LOW, | &rack_rate_sample_method , USE_RTT_LOW, | ||||
"What method should we use for rate sampling 0=high, 1=low "); | "What method should we use for rate sampling 0=high, 1=low "); | ||||
Context not available. | |||||
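The map_limit and map_splitlimit knobs registered above are plain read/write sysctls. A minimal userland sketch of reading and adjusting one follows; the full OID path net.inet.tcp.rack.map_limit is an assumption here, since the rack sysctl root is not part of this hunk.

#include <sys/types.h>
#include <sys/sysctl.h>

#include <stdio.h>

int
main(void)
{
	int32_t new_limit = 2048;	/* example value only */
	int32_t old_limit;
	size_t oldlen = sizeof(old_limit);

	/* OID path is an assumption; the rack root OID is not shown here. */
	if (sysctlbyname("net.inet.tcp.rack.map_limit", &old_limit, &oldlen,
	    &new_limit, sizeof(new_limit)) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("map_limit: %d -> %d\n", (int)old_limit, (int)new_limit);
	return (0);
}

Setting the limit requires privilege; reading it (pass NULL for the new value) does not.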
OID_AUTO, "pktdelay", CTLFLAG_RW, | OID_AUTO, "pktdelay", CTLFLAG_RW, | ||||
&rack_pkt_delay, 1, | &rack_pkt_delay, 1, | ||||
"Extra RACK time (in ms) besides reordering thresh"); | "Extra RACK time (in ms) besides reordering thresh"); | ||||
SYSCTL_ADD_U32(&rack_sysctl_ctx, | |||||
SYSCTL_CHILDREN(rack_sysctl_root), | |||||
OID_AUTO, "split_limit", CTLFLAG_RW, | |||||
&rack_map_split_limit, 0, | |||||
"Is there a limit on the number of map split entries (0=unlimited)"); | |||||
SYSCTL_ADD_S32(&rack_sysctl_ctx, | SYSCTL_ADD_S32(&rack_sysctl_ctx, | ||||
SYSCTL_CHILDREN(rack_sysctl_root), | SYSCTL_CHILDREN(rack_sysctl_root), | ||||
OID_AUTO, "inc_var", CTLFLAG_RW, | OID_AUTO, "inc_var", CTLFLAG_RW, | ||||
Context not available. | |||||
OID_AUTO, "allocemerg", CTLFLAG_RD, | OID_AUTO, "allocemerg", CTLFLAG_RD, | ||||
&rack_to_alloc_emerg, | &rack_to_alloc_emerg, | ||||
"Total allocations done from emergency cache"); | "Total allocations done from emergency cache"); | ||||
rack_to_alloc_limited = counter_u64_alloc(M_WAITOK); | |||||
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, | |||||
SYSCTL_CHILDREN(rack_sysctl_root), | |||||
OID_AUTO, "alloc_limited", CTLFLAG_RD, | |||||
&rack_to_alloc_limited, | |||||
"Total allocations dropped due to limit"); | |||||
rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK); | rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK); | ||||
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, | SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, | ||||
SYSCTL_CHILDREN(rack_sysctl_root), | SYSCTL_CHILDREN(rack_sysctl_root), | ||||
Context not available. | |||||
static inline int32_t | static inline int32_t | ||||
rack_progress_timeout_check(struct tcpcb *tp) | rack_progress_timeout_check(struct tcpcb *tp) | ||||
{ | { | ||||
#ifdef NETFLIX_PROGRESS | |||||
if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) { | if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) { | ||||
if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) { | if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) { | ||||
/* | /* | ||||
Context not available. | |||||
struct tcp_rack *rack; | struct tcp_rack *rack; | ||||
rack = (struct tcp_rack *)tp->t_fb_ptr; | rack = (struct tcp_rack *)tp->t_fb_ptr; | ||||
counter_u64_add(rack_progress_drops, 1); | counter_u64_add(rack_progress_drops, 1); | ||||
#ifdef NETFLIX_STATS | |||||
TCPSTAT_INC(tcps_progdrops); | TCPSTAT_INC(tcps_progdrops); | ||||
#endif | |||||
rack_log_progress_event(rack, tp, ticks, PROGRESS_DROP, __LINE__); | rack_log_progress_event(rack, tp, ticks, PROGRESS_DROP, __LINE__); | ||||
return (1); | return (1); | ||||
} | } | ||||
} | } | ||||
#endif | |||||
return (0); | return (0); | ||||
} | } | ||||
Context not available. | |||||
union tcp_log_stackspecific log; | union tcp_log_stackspecific log; | ||||
struct timeval tv; | struct timeval tv; | ||||
memset(&log, 0, sizeof(log)); | |||||
/* Convert our ms to a microsecond */ | /* Convert our ms to a microsecond */ | ||||
log.u_bbr.flex1 = rtt * 1000; | log.u_bbr.flex1 = rtt * 1000; | ||||
log.u_bbr.timeStamp = tcp_get_usecs(&tv); | log.u_bbr.timeStamp = tcp_get_usecs(&tv); | ||||
Context not available. | |||||
{ | { | ||||
if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { | if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { | ||||
union tcp_log_stackspecific log; | union tcp_log_stackspecific log; | ||||
memset(&log, 0, sizeof(log)); | |||||
log.u_bbr.flex1 = did_out; | log.u_bbr.flex1 = did_out; | ||||
log.u_bbr.flex2 = nxt_pkt; | log.u_bbr.flex2 = nxt_pkt; | ||||
log.u_bbr.flex3 = way_out; | log.u_bbr.flex3 = way_out; | ||||
Context not available. | |||||
counter_u64_free(rack_sack_proc_short); | counter_u64_free(rack_sack_proc_short); | ||||
counter_u64_free(rack_sack_proc_restart); | counter_u64_free(rack_sack_proc_restart); | ||||
counter_u64_free(rack_to_alloc); | counter_u64_free(rack_to_alloc); | ||||
counter_u64_free(rack_to_alloc_limited); | |||||
counter_u64_free(rack_split_limited); | |||||
counter_u64_free(rack_find_high); | counter_u64_free(rack_find_high); | ||||
counter_u64_free(rack_runt_sacks); | counter_u64_free(rack_runt_sacks); | ||||
counter_u64_free(rack_enter_tlp_calc); | counter_u64_free(rack_enter_tlp_calc); | ||||
Context not available. | |||||
rsm = uma_zalloc(rack_zone, M_NOWAIT); | rsm = uma_zalloc(rack_zone, M_NOWAIT); | ||||
if (rsm) { | if (rsm) { | ||||
alloc_done: | rack->r_ctl.rc_num_maps_alloced++; | ||||
counter_u64_add(rack_to_alloc, 1); | counter_u64_add(rack_to_alloc, 1); | ||||
rack->r_ctl.rc_num_maps_alloced++; | |||||
return (rsm); | return (rsm); | ||||
} | } | ||||
if (rack->rc_free_cnt) { | if (rack->rc_free_cnt) { | ||||
Context not available. | |||||
rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); | rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); | ||||
TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next); | TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next); | ||||
rack->rc_free_cnt--; | rack->rc_free_cnt--; | ||||
goto alloc_done; | return (rsm); | ||||
} | } | ||||
return (NULL); | return (NULL); | ||||
} | } | ||||
static struct rack_sendmap * | |||||
rack_alloc_full_limit(struct tcp_rack *rack) | |||||
{ | |||||
if ((rack_map_entries_limit > 0) && | |||||
(rack->r_ctl.rc_num_maps_alloced >= rack_map_entries_limit)) { | |||||
counter_u64_add(rack_to_alloc_limited, 1); | |||||
if (!rack->alloc_limit_reported) { | |||||
rack->alloc_limit_reported = 1; | |||||
counter_u64_add(rack_alloc_limited_conns, 1); | |||||
} | |||||
return (NULL); | |||||
} | |||||
return (rack_alloc(rack)); | |||||
} | |||||
/* wrapper to allocate a sendmap entry, subject to a specific limit */ | /* wrapper to allocate a sendmap entry, subject to a specific limit */ | ||||
static struct rack_sendmap * | static struct rack_sendmap * | ||||
rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type) | rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type) | ||||
Context not available. | |||||
/* currently there is only one limit type */ | /* currently there is only one limit type */ | ||||
rack->r_ctl.rc_num_split_allocs--; | rack->r_ctl.rc_num_split_allocs--; | ||||
} | } | ||||
rack->r_ctl.rc_num_maps_alloced--; | |||||
if (rack->r_ctl.rc_tlpsend == rsm) | if (rack->r_ctl.rc_tlpsend == rsm) | ||||
rack->r_ctl.rc_tlpsend = NULL; | rack->r_ctl.rc_tlpsend = NULL; | ||||
if (rack->r_ctl.rc_next == rsm) | if (rack->r_ctl.rc_next == rsm) | ||||
Context not available. | |||||
if (rack->rc_free_cnt < rack_free_cache) { | if (rack->rc_free_cnt < rack_free_cache) { | ||||
memset(rsm, 0, sizeof(struct rack_sendmap)); | memset(rsm, 0, sizeof(struct rack_sendmap)); | ||||
TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next); | TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next); | ||||
rsm->r_limit_type = 0; | |||||
rack->rc_free_cnt++; | rack->rc_free_cnt++; | ||||
return; | return; | ||||
} | } | ||||
rack->r_ctl.rc_num_maps_alloced--; | |||||
uma_zfree(rack_zone, rsm); | uma_zfree(rack_zone, rsm); | ||||
} | } | ||||
Context not available. | |||||
#ifdef NETFLIX_STATS | #ifdef NETFLIX_STATS | ||||
int32_t gput; | int32_t gput; | ||||
#endif | #endif | ||||
#ifdef NETFLIX_CWV | |||||
u_long old_cwnd = tp->snd_cwnd; | |||||
#endif | |||||
INP_WLOCK_ASSERT(tp->t_inpcb); | INP_WLOCK_ASSERT(tp->t_inpcb); | ||||
tp->ccv->nsegs = nsegs; | tp->ccv->nsegs = nsegs; | ||||
tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); | tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); | ||||
if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) { | if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) { | ||||
Context not available. | |||||
tp->t_stats_gput_prev); | tp->t_stats_gput_prev); | ||||
tp->t_flags &= ~TF_GPUTINPROG; | tp->t_flags &= ~TF_GPUTINPROG; | ||||
tp->t_stats_gput_prev = gput; | tp->t_stats_gput_prev = gput; | ||||
#ifdef NETFLIX_CWV | |||||
if (tp->t_maxpeakrate) { | if (tp->t_maxpeakrate) { | ||||
/* | /* | ||||
* We update t_peakrate_thr. This gives us roughly | * We update t_peakrate_thr. This gives us roughly | ||||
Context not available. | |||||
*/ | */ | ||||
tcp_update_peakrate_thr(tp); | tcp_update_peakrate_thr(tp); | ||||
} | } | ||||
#endif | |||||
} | } | ||||
#endif | #endif | ||||
if (tp->snd_cwnd > tp->snd_ssthresh) { | if (tp->snd_cwnd > tp->snd_ssthresh) { | ||||
Context not available. | |||||
if (rack->r_ctl.rc_rack_largest_cwnd < tp->snd_cwnd) { | if (rack->r_ctl.rc_rack_largest_cwnd < tp->snd_cwnd) { | ||||
rack->r_ctl.rc_rack_largest_cwnd = tp->snd_cwnd; | rack->r_ctl.rc_rack_largest_cwnd = tp->snd_cwnd; | ||||
} | } | ||||
#ifdef NETFLIX_CWV | |||||
if (tp->cwv_enabled) { | |||||
/* | |||||
* Per RFC 7661: The behaviour in the non-validated phase is | |||||
* specified as: o A sender determines whether to increase | |||||
* the cwnd based upon whether it is cwnd-limited (see | |||||
* Section 4.5.3): * A sender that is cwnd-limited MAY use | |||||
* the standard TCP method to increase cwnd (i.e., the | |||||
* standard method permits a TCP sender that fully utilises | |||||
* the cwnd to increase the cwnd each time it receives an | |||||
* ACK). * A sender that is not cwnd-limited MUST NOT | |||||
* increase the cwnd when ACK packets are received in this | |||||
* phase (i.e., needs to avoid growing the cwnd when it has | |||||
* not recently sent using the current size of cwnd). | |||||
*/ | |||||
if ((tp->snd_cwnd > old_cwnd) && | |||||
(tp->cwv_cwnd_valid == 0) && | |||||
(!(tp->ccv->flags & CCF_CWND_LIMITED))) { | |||||
tp->snd_cwnd = old_cwnd; | |||||
} | |||||
/* Try to update pipeAck and NCWV state */ | |||||
if (TCPS_HAVEESTABLISHED(tp->t_state) && | |||||
!IN_RECOVERY(tp->t_flags)) { | |||||
uint32_t data = sbavail(&(tp->t_inpcb->inp_socket->so_snd)); | |||||
tcp_newcwv_update_pipeack(tp, data); | |||||
} | |||||
} | |||||
/* we enforce max peak rate if it is set. */ | /* we enforce max peak rate if it is set. */ | ||||
if (tp->t_peakrate_thr && tp->snd_cwnd > tp->t_peakrate_thr) { | if (tp->t_peakrate_thr && tp->snd_cwnd > tp->t_peakrate_thr) { | ||||
tp->snd_cwnd = tp->t_peakrate_thr; | tp->snd_cwnd = tp->t_peakrate_thr; | ||||
} | } | ||||
#endif | |||||
} | } | ||||
static void | static void | ||||
Context not available. | |||||
tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt; | tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt; | ||||
rack->r_ctl.rc_prr_sndcnt = 0; | rack->r_ctl.rc_prr_sndcnt = 0; | ||||
} | } | ||||
tp->snd_recover = tp->snd_una; | |||||
EXIT_RECOVERY(tp->t_flags); | EXIT_RECOVERY(tp->t_flags); | ||||
#ifdef NETFLIX_CWV | |||||
if (tp->cwv_enabled) { | |||||
if ((tp->cwv_cwnd_valid == 0) && | |||||
(tp->snd_cwv.in_recovery)) | |||||
tcp_newcwv_end_recovery(tp); | |||||
} | |||||
#endif | |||||
} | } | ||||
static void | static void | ||||
Context not available. | |||||
tp->ccv->curack = th->th_ack; | tp->ccv->curack = th->th_ack; | ||||
CC_ALGO(tp)->cong_signal(tp->ccv, type); | CC_ALGO(tp)->cong_signal(tp->ccv, type); | ||||
} | } | ||||
#ifdef NETFLIX_CWV | |||||
if (tp->cwv_enabled) { | |||||
if (tp->snd_cwv.in_recovery == 0 && IN_RECOVERY(tp->t_flags)) { | |||||
tcp_newcwv_enter_recovery(tp); | |||||
} | |||||
if (type == CC_RTO) { | |||||
tcp_newcwv_reset(tp); | |||||
} | |||||
} | |||||
#endif | |||||
} | } | ||||
Context not available. | |||||
if (CC_ALGO(tp)->after_idle != NULL) | if (CC_ALGO(tp)->after_idle != NULL) | ||||
CC_ALGO(tp)->after_idle(tp->ccv); | CC_ALGO(tp)->after_idle(tp->ccv); | ||||
if (tp->snd_cwnd == 1) | if (V_tcp_initcwnd_segments) | ||||
i_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */ | i_cwnd = min((V_tcp_initcwnd_segments * tp->t_maxseg), | ||||
else | max(2 * tp->t_maxseg, 14600)); | ||||
i_cwnd = tcp_compute_initwnd(tcp_maxseg(tp)); | else if (V_tcp_do_rfc3390) | ||||
i_cwnd = min(4 * tp->t_maxseg, | |||||
max(2 * tp->t_maxseg, 4380)); | |||||
else { | |||||
/* Per RFC5681 Section 3.1 */ | |||||
if (tp->t_maxseg > 2190) | |||||
i_cwnd = 2 * tp->t_maxseg; | |||||
else if (tp->t_maxseg > 1095) | |||||
i_cwnd = 3 * tp->t_maxseg; | |||||
else | |||||
i_cwnd = 4 * tp->t_maxseg; | |||||
} | |||||
if (reduce_largest) { | if (reduce_largest) { | ||||
/* | /* | ||||
* Do we reduce the largest cwnd to make | * Do we reduce the largest cwnd to make | ||||
Context not available. | |||||
} | } | ||||
static void | static void | ||||
rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, | rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t rstreason, int32_t tlen) | ||||
int32_t rstreason, int32_t tlen) | |||||
{ | { | ||||
if (tp != NULL) { | if (tp != NULL) { | ||||
tcp_dropwithreset(m, th, tp, tlen, rstreason); | tcp_dropwithreset(m, th, tp, tlen, rstreason); | ||||
Context not available. | |||||
* TCB is still valid and locked. | * TCB is still valid and locked. | ||||
*/ | */ | ||||
static int | static int | ||||
rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val) | rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val) | ||||
{ | { | ||||
int32_t todrop; | int32_t todrop; | ||||
int32_t thflags; | int32_t thflags; | ||||
Context not available. | |||||
TCPSTAT_INC(tcps_rcvpartduppack); | TCPSTAT_INC(tcps_rcvpartduppack); | ||||
TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); | TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); | ||||
} | } | ||||
/* | |||||
* DSACK - add SACK block for dropped range | |||||
*/ | |||||
if (tp->t_flags & TF_SACK_PERMIT) { | |||||
tcp_update_sack_list(tp, th->th_seq, th->th_seq + tlen); | |||||
/* | |||||
* ACK now, as the next in-sequence segment | |||||
* will clear the DSACK block again | |||||
*/ | |||||
tp->t_flags |= TF_ACKNOW; | |||||
} | |||||
*drop_hdrlen += todrop; /* drop from the top afterwards */ | *drop_hdrlen += todrop; /* drop from the top afterwards */ | ||||
th->th_seq += todrop; | th->th_seq += todrop; | ||||
tlen -= todrop; | tlen -= todrop; | ||||
Context not available. | |||||
/* We can't start any timer in persists */ | /* We can't start any timer in persists */ | ||||
return (rack_get_persists_timer_val(tp, rack)); | return (rack_get_persists_timer_val(tp, rack)); | ||||
} | } | ||||
if (tp->t_state < TCPS_ESTABLISHED) | |||||
goto activate_rxt; | |||||
rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); | rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); | ||||
if (rsm == NULL) { | if (rsm == NULL) { | ||||
/* Nothing on the send map */ | /* Nothing on the send map */ | ||||
Context not available. | |||||
*/ | */ | ||||
goto activate_rxt; | goto activate_rxt; | ||||
} | } | ||||
if ((tp->snd_max - tp->snd_una) > tp->snd_wnd) { | |||||
/* | |||||
* Peer collapsed rwnd, don't do TLP. | |||||
*/ | |||||
goto activate_rxt; | |||||
} | |||||
rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); | rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); | ||||
if (rsm == NULL) { | if (rsm == NULL) { | ||||
/* We found no rsm to TLP with. */ | /* We found no rsm to TLP with. */ | ||||
Context not available. | |||||
/* A previous call is already set up */ | /* A previous call is already set up */ | ||||
return; | return; | ||||
} | } | ||||
if (tp->t_state == TCPS_CLOSED) { | |||||
if ((tp->t_state == TCPS_CLOSED) || | |||||
(tp->t_state == TCPS_LISTEN)) { | |||||
return; | return; | ||||
} | } | ||||
stopped = rack->rc_tmr_stopped; | stopped = rack->rc_tmr_stopped; | ||||
Context not available. | |||||
* We are still left on the hpts when the to goes | * We are still left on the hpts when the to goes | ||||
* it will be for output. | * it will be for output. | ||||
*/ | */ | ||||
if (TSTMP_GT(cts, rack->r_ctl.rc_last_output_to)) | if (TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) | ||||
slot = cts - rack->r_ctl.rc_last_output_to; | slot = rack->r_ctl.rc_last_output_to - cts; | ||||
else | else | ||||
slot = 1; | slot = 1; | ||||
} | } | ||||
Context not available. | |||||
} | } | ||||
hpts_timeout = rack_timer_start(tp, rack, cts); | hpts_timeout = rack_timer_start(tp, rack, cts); | ||||
if (tp->t_flags & TF_DELACK) { | if (tp->t_flags & TF_DELACK) { | ||||
delayed_ack = TICKS_2_MSEC(tcp_delacktime); | delayed_ack = tcp_delacktime; | ||||
rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; | rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; | ||||
} | } | ||||
if (delayed_ack && ((hpts_timeout == 0) || | if (delayed_ack && ((hpts_timeout == 0) || | ||||
Context not available. | |||||
return (0); | return (0); | ||||
} | } | ||||
static struct rack_sendmap * | |||||
rack_merge_rsm(struct tcp_rack *rack, | |||||
struct rack_sendmap *l_rsm, | |||||
struct rack_sendmap *r_rsm) | |||||
{ | |||||
/* | |||||
* We are merging two ack'd RSM's, | |||||
* the l_rsm is on the left (lower seq | |||||
* values) and the r_rsm is on the right | |||||
* (higher seq value). The simplest way | |||||
* to merge these is to move the right | |||||
* one into the left. I don't think there | |||||
* is any reason we need to try to find | |||||
* the oldest (or last oldest retransmitted). | |||||
*/ | |||||
l_rsm->r_end = r_rsm->r_end; | |||||
if (r_rsm->r_rtr_bytes) | |||||
l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes; | |||||
if (r_rsm->r_in_tmap) { | |||||
/* This really should not happen */ | |||||
TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext); | |||||
} | |||||
/* Now the flags */ | |||||
if (r_rsm->r_flags & RACK_HAS_FIN) | |||||
l_rsm->r_flags |= RACK_HAS_FIN; | |||||
if (r_rsm->r_flags & RACK_TLP) | |||||
l_rsm->r_flags |= RACK_TLP; | |||||
TAILQ_REMOVE(&rack->r_ctl.rc_map, r_rsm, r_next); | |||||
if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) { | |||||
/* Transfer the split limit to the map we free */ | |||||
r_rsm->r_limit_type = l_rsm->r_limit_type; | |||||
l_rsm->r_limit_type = 0; | |||||
} | |||||
rack_free(rack, r_rsm); | |||||
return(l_rsm); | |||||
} | |||||
/* | /* | ||||
* TLP Timer, here we simply setup what segment we want to | * TLP Timer, here we simply setup what segment we want to | ||||
* have the TLP expire on, the normal rack_output() will then | * have the TLP expire on, the normal rack_output() will then | ||||
Context not available. | |||||
int32_t idx; | int32_t idx; | ||||
struct rack_sendmap *nrsm; | struct rack_sendmap *nrsm; | ||||
nrsm = rack_alloc(rack); | nrsm = rack_alloc_full_limit(rack); | ||||
if (nrsm == NULL) { | if (nrsm == NULL) { | ||||
/* | /* | ||||
* No memory to split, we will just exit and punt | * No memory to split, we will just exit and punt | ||||
Context not available. | |||||
TCPSTAT_INC(tcps_rexmttimeo); | TCPSTAT_INC(tcps_rexmttimeo); | ||||
if ((tp->t_state == TCPS_SYN_SENT) || | if ((tp->t_state == TCPS_SYN_SENT) || | ||||
(tp->t_state == TCPS_SYN_RECEIVED)) | (tp->t_state == TCPS_SYN_RECEIVED)) | ||||
rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]); | rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_syn_backoff[tp->t_rxtshift]); | ||||
else | else | ||||
rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; | rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; | ||||
TCPT_RANGESET(tp->t_rxtcur, rexmt, | TCPT_RANGESET(tp->t_rxtcur, rexmt, | ||||
Context not available. | |||||
* Here we retransmitted less than the whole thing which means we | * Here we retransmitted less than the whole thing which means we | ||||
* have to split this into what was transmitted and what was not. | * have to split this into what was transmitted and what was not. | ||||
*/ | */ | ||||
nrsm = rack_alloc(rack); | nrsm = rack_alloc_full_limit(rack); | ||||
if (nrsm == NULL) { | if (nrsm == NULL) { | ||||
/* | /* | ||||
* We can't get memory, so lets not proceed. | * We can't get memory, so lets not proceed. | ||||
Context not available. | |||||
* Hmm out of memory and the tcb got destroyed while | * Hmm out of memory and the tcb got destroyed while | ||||
* we tried to wait. | * we tried to wait. | ||||
*/ | */ | ||||
#ifdef INVARIANTS | |||||
panic("Out of memory when we should not be rack:%p", rack); | |||||
#endif | |||||
return; | return; | ||||
} | } | ||||
if (th_flags & TH_FIN) { | if (th_flags & TH_FIN) { | ||||
Context not available. | |||||
rsm->r_tim_lastsent[0] = ts; | rsm->r_tim_lastsent[0] = ts; | ||||
rsm->r_rtr_cnt = 1; | rsm->r_rtr_cnt = 1; | ||||
rsm->r_rtr_bytes = 0; | rsm->r_rtr_bytes = 0; | ||||
if (th_flags & TH_SYN) { | rsm->r_start = seq_out; | ||||
/* The data space is one beyond snd_una */ | rsm->r_end = rsm->r_start + len; | ||||
rsm->r_start = seq_out + 1; | |||||
rsm->r_end = rsm->r_start + (len - 1); | |||||
} else { | |||||
/* Normal case */ | |||||
rsm->r_start = seq_out; | |||||
rsm->r_end = rsm->r_start + len; | |||||
} | |||||
rsm->r_sndcnt = 0; | rsm->r_sndcnt = 0; | ||||
TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next); | TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next); | ||||
TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); | TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); | ||||
Context not available. | |||||
* Ok we must split off the front and then let the | * Ok we must split off the front and then let the | ||||
* update do the rest | * update do the rest | ||||
*/ | */ | ||||
nrsm = rack_alloc(rack); | nrsm = rack_alloc_full_limit(rack); | ||||
if (nrsm == NULL) { | if (nrsm == NULL) { | ||||
#ifdef INVARIANTS | |||||
panic("Ran out of memory that was preallocated? rack:%p", rack); | |||||
#endif | |||||
rack_update_rsm(tp, rack, rsm, ts); | rack_update_rsm(tp, rack, rsm, ts); | ||||
return; | return; | ||||
} | } | ||||
Context not available. | |||||
if (nrsm->r_flags & RACK_ACKED) { | if (nrsm->r_flags & RACK_ACKED) { | ||||
/* Skip ack'd segments */ | /* Skip ack'd segments */ | ||||
continue; | continue; | ||||
} | |||||
if (nrsm->r_flags & RACK_SACK_PASSED) { | |||||
/* | |||||
* We found one that is already marked | |||||
* passed, we have been here before and | |||||
* so all others below this are marked. | |||||
*/ | |||||
break; | |||||
} | } | ||||
idx = nrsm->r_rtr_cnt - 1; | idx = nrsm->r_rtr_cnt - 1; | ||||
if (ts == nrsm->r_tim_lastsent[idx]) { | if (ts == nrsm->r_tim_lastsent[idx]) { | ||||
Context not available. | |||||
rsm->r_in_tmap = 0; | rsm->r_in_tmap = 0; | ||||
} | } | ||||
out: | out: | ||||
if (rsm && (rsm->r_flags & RACK_ACKED)) { | |||||
/* | |||||
* Now can we merge this newly acked | |||||
* block with either the previous or | |||||
* next block? | |||||
*/ | |||||
nrsm = TAILQ_NEXT(rsm, r_next); | |||||
if (nrsm && | |||||
(nrsm->r_flags & RACK_ACKED)) { | |||||
/* yep this and next can be merged */ | |||||
rsm = rack_merge_rsm(rack, rsm, nrsm); | |||||
} | |||||
/* Now what about the previous? */ | |||||
nrsm = TAILQ_PREV(rsm, rack_head, r_next); | |||||
if (nrsm && | |||||
(nrsm->r_flags & RACK_ACKED)) { | |||||
/* yep the previous and this can be merged */ | |||||
rsm = rack_merge_rsm(rack, nrsm, rsm); | |||||
} | |||||
} | |||||
if (used_ref == 0) { | if (used_ref == 0) { | ||||
counter_u64_add(rack_sack_proc_all, 1); | counter_u64_add(rack_sack_proc_all, 1); | ||||
} else { | } else { | ||||
Context not available. | |||||
} | } | ||||
sack_blocks[num_sack_blks] = sack; | sack_blocks[num_sack_blks] = sack; | ||||
num_sack_blks++; | num_sack_blks++; | ||||
#ifdef NETFLIX_STATS | |||||
} else if (SEQ_LEQ(sack.start, th_ack) && | } else if (SEQ_LEQ(sack.start, th_ack) && | ||||
SEQ_LEQ(sack.end, th_ack)) { | SEQ_LEQ(sack.end, th_ack)) { | ||||
/* | /* | ||||
* Its a D-SACK block. | * Its a D-SACK block. | ||||
*/ | */ | ||||
tcp_record_dsack(sack.start, sack.end); | /* tcp_record_dsack(sack.start, sack.end); */ | ||||
#endif | |||||
} | } | ||||
} | } | ||||
if (num_sack_blks == 0) | if (num_sack_blks == 0) | ||||
goto out; | goto out; | ||||
Context not available. | |||||
* just one pass. | * just one pass. | ||||
*/ | */ | ||||
if (rack_use_sack_filter) { | if (rack_use_sack_filter) { | ||||
num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, num_sack_blks, th->th_ack); | num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, | ||||
num_sack_blks, th->th_ack); | |||||
ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks); | |||||
} | } | ||||
if (num_sack_blks < 2) { | if (num_sack_blks < 2) { | ||||
goto do_sack_work; | goto do_sack_work; | ||||
Context not available. | |||||
return (0); | return (0); | ||||
} | } | ||||
if (rack->r_ctl.rc_early_recovery) { | if (rack->r_ctl.rc_early_recovery) { | ||||
if (IN_FASTRECOVERY(tp->t_flags)) { | if (IN_RECOVERY(tp->t_flags)) { | ||||
if (SEQ_LT(th->th_ack, tp->snd_recover)) { | if (SEQ_LT(th->th_ack, tp->snd_recover) && | ||||
(SEQ_LT(th->th_ack, tp->snd_max))) { | |||||
tcp_rack_partialack(tp, th); | tcp_rack_partialack(tp, th); | ||||
} else { | } else { | ||||
rack_post_recovery(tp, th); | rack_post_recovery(tp, th); | ||||
Context not available. | |||||
sowwakeup_locked(so); | sowwakeup_locked(so); | ||||
m_freem(mfree); | m_freem(mfree); | ||||
if (rack->r_ctl.rc_early_recovery == 0) { | if (rack->r_ctl.rc_early_recovery == 0) { | ||||
if (IN_FASTRECOVERY(tp->t_flags)) { | if (IN_RECOVERY(tp->t_flags)) { | ||||
if (SEQ_LT(th->th_ack, tp->snd_recover)) { | if (SEQ_LT(th->th_ack, tp->snd_recover) && | ||||
(SEQ_LT(th->th_ack, tp->snd_max))) { | |||||
tcp_rack_partialack(tp, th); | tcp_rack_partialack(tp, th); | ||||
} else { | } else { | ||||
rack_post_recovery(tp, th); | rack_post_recovery(tp, th); | ||||
Context not available. | |||||
* send garbage on first SYN. | * send garbage on first SYN. | ||||
*/ | */ | ||||
int32_t nsegs; | int32_t nsegs; | ||||
#ifdef TCP_RFC7413 | |||||
int32_t tfo_syn; | int32_t tfo_syn; | ||||
#else | |||||
#define tfo_syn (FALSE) | |||||
#endif | |||||
struct tcp_rack *rack; | struct tcp_rack *rack; | ||||
rack = (struct tcp_rack *)tp->t_fb_ptr; | rack = (struct tcp_rack *)tp->t_fb_ptr; | ||||
Context not available. | |||||
* PRU_RCVD). If a FIN has already been received on this connection | * PRU_RCVD). If a FIN has already been received on this connection | ||||
* then we just ignore the text. | * then we just ignore the text. | ||||
*/ | */ | ||||
#ifdef TCP_RFC7413 | |||||
tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && | tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && | ||||
IS_FASTOPEN(tp->t_flags)); | (tp->t_flags & TF_FASTOPEN)); | ||||
#endif | |||||
if ((tlen || (thflags & TH_FIN) || tfo_syn) && | if ((tlen || (thflags & TH_FIN) || tfo_syn) && | ||||
TCPS_HAVERCVDFIN(tp->t_state) == 0) { | TCPS_HAVERCVDFIN(tp->t_state) == 0) { | ||||
tcp_seq save_start = th->th_seq; | tcp_seq save_start = th->th_seq; | ||||
Context not available. | |||||
/* Clean receiver SACK report if present */ | /* Clean receiver SACK report if present */ | ||||
if (tp->rcv_numsacks) | /* if (tp->rcv_numsacks) | ||||
tcp_clean_sackreport(tp); | tcp_clean_sackreport(tp); | ||||
*/ | |||||
TCPSTAT_INC(tcps_preddat); | TCPSTAT_INC(tcps_preddat); | ||||
tp->rcv_nxt += tlen; | tp->rcv_nxt += tlen; | ||||
/* | /* | ||||
Context not available. | |||||
tp->irs = th->th_seq; | tp->irs = th->th_seq; | ||||
tcp_rcvseqinit(tp); | tcp_rcvseqinit(tp); | ||||
if (thflags & TH_ACK) { | if (thflags & TH_ACK) { | ||||
int tfo_partial = 0; | |||||
TCPSTAT_INC(tcps_connects); | TCPSTAT_INC(tcps_connects); | ||||
soisconnected(so); | soisconnected(so); | ||||
#ifdef MAC | #ifdef MAC | ||||
Context not available. | |||||
tp->rcv_adv += min(tp->rcv_wnd, | tp->rcv_adv += min(tp->rcv_wnd, | ||||
TCP_MAXWIN << tp->rcv_scale); | TCP_MAXWIN << tp->rcv_scale); | ||||
/* | /* | ||||
* If not all the data that was sent in the TFO SYN | |||||
* has been acked, resend the remainder right away. | |||||
*/ | |||||
if (IS_FASTOPEN(tp->t_flags) && | |||||
(tp->snd_una != tp->snd_max)) { | |||||
tp->snd_nxt = th->th_ack; | |||||
tfo_partial = 1; | |||||
} | |||||
/* | |||||
* If there's data, delay ACK; if there's also a FIN ACKNOW | * If there's data, delay ACK; if there's also a FIN ACKNOW | ||||
* will be turned on later. | * will be turned on later. | ||||
*/ | */ | ||||
if (DELAY_ACK(tp, tlen) && tlen != 0 && (tfo_partial == 0)) { | if (DELAY_ACK(tp, tlen) && tlen != 0) { | ||||
rack_timer_cancel(tp, (struct tcp_rack *)tp->t_fb_ptr, | rack_timer_cancel(tp, (struct tcp_rack *)tp->t_fb_ptr, | ||||
((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rcvtime, __LINE__); | ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rcvtime, __LINE__); | ||||
tp->t_flags |= TF_DELACK; | tp->t_flags |= TF_DELACK; | ||||
Context not available. | |||||
tp->t_flags |= TF_ACKNOW; | tp->t_flags |= TF_ACKNOW; | ||||
} | } | ||||
if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) && | if ((thflags & TH_ECE) && V_tcp_do_ecn) { | ||||
V_tcp_do_ecn) { | |||||
tp->t_flags |= TF_ECN_PERMIT; | tp->t_flags |= TF_ECN_PERMIT; | ||||
TCPSTAT_INC(tcps_ecn_shs); | TCPSTAT_INC(tcps_ecn_shs); | ||||
} | } | ||||
if (SEQ_GT(th->th_ack, tp->snd_una)) { | |||||
/* | |||||
* We advance snd_una for the | |||||
* fast open case. If th_ack is | |||||
* acknowledging data beyond | |||||
* snd_una we can't just call | |||||
* ack-processing since the | |||||
* data stream in our send-map | |||||
* will start at snd_una + 1 (one | |||||
* beyond the SYN). If its just | |||||
* equal we don't need to do that | |||||
* and there is no send_map. | |||||
*/ | |||||
tp->snd_una++; | |||||
} | |||||
/* | /* | ||||
* Received <SYN,ACK> in SYN_SENT[*] state. Transitions: | * Received <SYN,ACK> in SYN_SENT[*] state. Transitions: | ||||
* SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 | * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 | ||||
Context not available. | |||||
} | } | ||||
} | } | ||||
return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, | return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, | ||||
tiwin, thflags, nxt_pkt)); | tiwin, thflags, nxt_pkt)); | ||||
} | } | ||||
/* | /* | ||||
Context not available. | |||||
rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); | rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); | ||||
return (1); | return (1); | ||||
} | } | ||||
if (IS_FASTOPEN(tp->t_flags)) { | #ifdef TCP_RFC7413 | ||||
if (tp->t_flags & TF_FASTOPEN) { | |||||
/* | /* | ||||
* When a TFO connection is in SYN_RECEIVED, the | * When a TFO connection is in SYN_RECEIVED, the only valid | ||||
* only valid packets are the initial SYN, a | * packets are the initial SYN, a retransmit/copy of the | ||||
* retransmit/copy of the initial SYN (possibly with | * initial SYN (possibly with a subset of the original | ||||
* a subset of the original data), a valid ACK, a | * data), a valid ACK, a FIN, or a RST. | ||||
* FIN, or a RST. | |||||
*/ | */ | ||||
if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { | if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { | ||||
rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); | rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); | ||||
Context not available. | |||||
return (0); | return (0); | ||||
} | } | ||||
} | } | ||||
#endif | |||||
if (thflags & TH_RST) | if (thflags & TH_RST) | ||||
return (rack_process_rst(m, th, so, tp)); | return (rack_process_rst(m, th, so, tp)); | ||||
/* | /* | ||||
* RFC5961 Section 4.2 Send challenge ACK for any SYN in | |||||
* synchronized state. | |||||
*/ | |||||
if (thflags & TH_SYN) { | |||||
rack_challenge_ack(m, th, tp, &ret_val); | |||||
return (ret_val); | |||||
} | |||||
/* | |||||
* RFC 1323 PAWS: If we have a timestamp reply on this segment and | * RFC 1323 PAWS: If we have a timestamp reply on this segment and | ||||
* it's less than ts_recent, drop it. | * it's less than ts_recent, drop it. | ||||
*/ | */ | ||||
Context not available. | |||||
tp->ts_recent_age = tcp_ts_getticks(); | tp->ts_recent_age = tcp_ts_getticks(); | ||||
tp->ts_recent = to->to_tsval; | tp->ts_recent = to->to_tsval; | ||||
} | } | ||||
tp->snd_wnd = tiwin; | |||||
/* | /* | ||||
* If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag | * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag | ||||
* is on (half-synchronized state), then queue data for later | * is on (half-synchronized state), then queue data for later | ||||
Context not available. | |||||
* processing; else drop segment and return. | * processing; else drop segment and return. | ||||
*/ | */ | ||||
if ((thflags & TH_ACK) == 0) { | if ((thflags & TH_ACK) == 0) { | ||||
if (IS_FASTOPEN(tp->t_flags)) { | #ifdef TCP_RFC7413 | ||||
if (tp->t_flags & TF_FASTOPEN) { | |||||
tp->snd_wnd = tiwin; | |||||
cc_conn_init(tp); | cc_conn_init(tp); | ||||
} | } | ||||
#endif | |||||
return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, | return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, | ||||
tiwin, thflags, nxt_pkt)); | tiwin, thflags, nxt_pkt)); | ||||
} | } | ||||
Context not available. | |||||
if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == | if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == | ||||
(TF_RCVD_SCALE | TF_REQ_SCALE)) { | (TF_RCVD_SCALE | TF_REQ_SCALE)) { | ||||
tp->rcv_scale = tp->request_r_scale; | tp->rcv_scale = tp->request_r_scale; | ||||
tp->snd_wnd = tiwin; | |||||
} | } | ||||
/* | /* | ||||
* Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> | * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> | ||||
Context not available. | |||||
* FIN-WAIT-1 | * FIN-WAIT-1 | ||||
*/ | */ | ||||
tp->t_starttime = ticks; | tp->t_starttime = ticks; | ||||
if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) { | |||||
tcp_fastopen_decrement_counter(tp->t_tfo_pending); | |||||
tp->t_tfo_pending = NULL; | |||||
/* | |||||
* Account for the ACK of our SYN prior to | |||||
* regular ACK processing below. | |||||
*/ | |||||
tp->snd_una++; | |||||
} | |||||
if (tp->t_flags & TF_NEEDFIN) { | if (tp->t_flags & TF_NEEDFIN) { | ||||
tcp_state_change(tp, TCPS_FIN_WAIT_1); | tcp_state_change(tp, TCPS_FIN_WAIT_1); | ||||
tp->t_flags &= ~TF_NEEDFIN; | tp->t_flags &= ~TF_NEEDFIN; | ||||
Context not available. | |||||
tcp_state_change(tp, TCPS_ESTABLISHED); | tcp_state_change(tp, TCPS_ESTABLISHED); | ||||
TCP_PROBE5(accept__established, NULL, tp, | TCP_PROBE5(accept__established, NULL, tp, | ||||
mtod(m, const char *), tp, th); | mtod(m, const char *), tp, th); | ||||
#ifdef TCP_RFC7413 | |||||
if (tp->t_tfo_pending) { | |||||
tcp_fastopen_decrement_counter(tp->t_tfo_pending); | |||||
tp->t_tfo_pending = NULL; | |||||
/* | |||||
* Account for the ACK of our SYN prior to regular | |||||
* ACK processing below. | |||||
*/ | |||||
tp->snd_una++; | |||||
} | |||||
/* | /* | ||||
* TFO connections call cc_conn_init() during SYN | * TFO connections call cc_conn_init() during SYN | ||||
* processing. Calling it again here for such connections | * processing. Calling it again here for such connections | ||||
Context not available. | |||||
* is not harmless as it would undo the snd_cwnd reduction | * is not harmless as it would undo the snd_cwnd reduction | ||||
* that occurs when a TFO SYN|ACK is retransmitted. | * that occurs when a TFO SYN|ACK is retransmitted. | ||||
*/ | */ | ||||
if (!IS_FASTOPEN(tp->t_flags)) | if (!(tp->t_flags & TF_FASTOPEN)) | ||||
#endif | |||||
cc_conn_init(tp); | cc_conn_init(tp); | ||||
} | } | ||||
/* | /* | ||||
Context not available. | |||||
* not, do so now to pass queued data to user. | * not, do so now to pass queued data to user. | ||||
*/ | */ | ||||
if (tlen == 0 && (thflags & TH_FIN) == 0) | if (tlen == 0 && (thflags & TH_FIN) == 0) | ||||
(void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0, | (void)tcp_reass(tp, (struct tcphdr *)0, NULL, 0, | ||||
(struct mbuf *)0); | (struct mbuf *)0); | ||||
tp->snd_wl1 = th->th_seq - 1; | tp->snd_wl1 = th->th_seq - 1; | ||||
if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { | if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { | ||||
Context not available. | |||||
rack_check_data_after_close(struct mbuf *m, | rack_check_data_after_close(struct mbuf *m, | ||||
struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so) | struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so) | ||||
{ | { | ||||
struct tcp_rack *rack; | struct tcp_rack *rack; | ||||
INP_INFO_RLOCK_ASSERT(&V_tcbinfo); | INP_INFO_RLOCK_ASSERT(&V_tcbinfo); | ||||
rack = (struct tcp_rack *)tp->t_fb_ptr; | rack = (struct tcp_rack *)tp->t_fb_ptr; | ||||
Context not available. | |||||
rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr; | rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr; | ||||
rack->r_ctl.rc_min_to = rack_min_to; | rack->r_ctl.rc_min_to = rack_min_to; | ||||
rack->r_ctl.rc_prr_inc_var = rack_inc_var; | rack->r_ctl.rc_prr_inc_var = rack_inc_var; | ||||
rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); | |||||
if (tp->snd_una != tp->snd_max) { | if (tp->snd_una != tp->snd_max) { | ||||
/* Create a send map for the current outstanding data */ | /* Create a send map for the current outstanding data */ | ||||
struct rack_sendmap *rsm; | struct rack_sendmap *rsm; | ||||
Context not available. | |||||
TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); | TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); | ||||
rsm->r_in_tmap = 1; | rsm->r_in_tmap = 1; | ||||
} | } | ||||
rack_stop_all_timers(tp); | |||||
rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); | |||||
return (0); | return (0); | ||||
} | } | ||||
Context not available. | |||||
uma_zfree(rack_pcb_zone, tp->t_fb_ptr); | uma_zfree(rack_pcb_zone, tp->t_fb_ptr); | ||||
tp->t_fb_ptr = NULL; | tp->t_fb_ptr = NULL; | ||||
} | } | ||||
/* Make sure snd_nxt is correctly set */ | |||||
tp->snd_nxt = tp->snd_max; | |||||
} | } | ||||
static void | static void | ||||
Context not available. | |||||
case TCPS_CLOSED: | case TCPS_CLOSED: | ||||
case TCPS_TIME_WAIT: | case TCPS_TIME_WAIT: | ||||
default: | default: | ||||
#ifdef INVARIANTS | |||||
panic("tcp tp:%p state:%d sees impossible state?", tp, tp->t_state); | |||||
#endif | |||||
break; | break; | ||||
}; | }; | ||||
} | } | ||||
Context not available. | |||||
* allow the tcbinfo to be in either locked or unlocked, as the | * allow the tcbinfo to be in either locked or unlocked, as the | ||||
* caller may have unnecessarily acquired a lock due to a race. | * caller may have unnecessarily acquired a lock due to a race. | ||||
*/ | */ | ||||
if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || | |||||
tp->t_state != TCPS_ESTABLISHED) { | |||||
INP_INFO_RLOCK_ASSERT(&V_tcbinfo); | |||||
} | |||||
INP_WLOCK_ASSERT(tp->t_inpcb); | INP_WLOCK_ASSERT(tp->t_inpcb); | ||||
KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", | KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", | ||||
__func__)); | __func__)); | ||||
Context not available. | |||||
memset(&log.u_bbr, 0, sizeof(log.u_bbr)); | memset(&log.u_bbr, 0, sizeof(log.u_bbr)); | ||||
log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; | log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; | ||||
log.u_bbr.ininput = rack->rc_inp->inp_in_input; | log.u_bbr.ininput = rack->rc_inp->inp_in_input; | ||||
log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced; | |||||
TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, | TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, | ||||
tlen, &log, true); | tlen, &log, true); | ||||
} | } | ||||
if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { | |||||
way_out = 4; | |||||
goto done_with_input; | |||||
} | |||||
/* | /* | ||||
* If a segment with the ACK-bit set arrives in the SYN-SENT state | |||||
* check SEQ.ACK first as described on page 66 of RFC 793, section 3.9. | |||||
*/ | |||||
if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && | |||||
(SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { | |||||
rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); | |||||
return; | |||||
} | |||||
/* | |||||
* Segment received on connection. Reset idle time and keep-alive | * Segment received on connection. Reset idle time and keep-alive | ||||
* timer. XXX: This should be done after segment validation to | * timer. XXX: This should be done after segment validation to | ||||
* ignore broken/spoofed segs. | * ignore broken/spoofed segs. | ||||
*/ | */ | ||||
if (tp->t_idle_reduce && (tp->snd_max == tp->snd_una)) { | if (tp->t_idle_reduce && (tp->snd_max == tp->snd_una)) { | ||||
#ifdef NETFLIX_CWV | if ((ticks - tp->t_rcvtime) >= tp->t_rxtcur) { | ||||
if ((tp->cwv_enabled) && | |||||
((tp->cwv_cwnd_valid == 0) && | |||||
TCPS_HAVEESTABLISHED(tp->t_state) && | |||||
(tp->snd_cwnd > tp->snd_cwv.init_cwnd))) { | |||||
tcp_newcwv_nvp_closedown(tp); | |||||
} else | |||||
#endif | |||||
if ((ticks - tp->t_rcvtime) >= tp->t_rxtcur) { | |||||
counter_u64_add(rack_input_idle_reduces, 1); | counter_u64_add(rack_input_idle_reduces, 1); | ||||
rack_cc_after_idle(tp, | rack_cc_after_idle(tp, | ||||
(rack->r_idle_reduce_largest ? 1 :0)); | (rack->r_idle_reduce_largest ? 1 :0)); | ||||
Context not available. | |||||
rack->r_ctl.rc_rcvtime = cts; | rack->r_ctl.rc_rcvtime = cts; | ||||
tp->t_rcvtime = ticks; | tp->t_rcvtime = ticks; | ||||
#ifdef NETFLIX_CWV | |||||
if (tp->cwv_enabled) { | |||||
if ((tp->cwv_cwnd_valid == 0) && | |||||
TCPS_HAVEESTABLISHED(tp->t_state) && | |||||
(tp->snd_cwnd > tp->snd_cwv.init_cwnd)) | |||||
tcp_newcwv_nvp_closedown(tp); | |||||
} | |||||
#endif | |||||
/* | /* | ||||
* Unscale the window into a 32-bit value. For the SYN_SENT state | * Unscale the window into a 32-bit value. For the SYN_SENT state | ||||
* the scale is zero. | * the scale is zero. | ||||
Context not available. | |||||
if ((tp->t_flags & TF_SACK_PERMIT) && | if ((tp->t_flags & TF_SACK_PERMIT) && | ||||
(to.to_flags & TOF_SACKPERM) == 0) | (to.to_flags & TOF_SACKPERM) == 0) | ||||
tp->t_flags &= ~TF_SACK_PERMIT; | tp->t_flags &= ~TF_SACK_PERMIT; | ||||
if (IS_FASTOPEN(tp->t_flags)) { | |||||
if (to.to_flags & TOF_FASTOPEN) { | |||||
uint16_t mss; | |||||
if (to.to_flags & TOF_MSS) | |||||
mss = to.to_mss; | |||||
else | |||||
if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) | |||||
mss = TCP6_MSS; | |||||
else | |||||
mss = TCP_MSS; | |||||
tcp_fastopen_update_cache(tp, mss, | |||||
to.to_tfo_len, to.to_tfo_cookie); | |||||
} else | |||||
tcp_fastopen_disable_path(tp); | |||||
} | |||||
} | } | ||||
/* | /* | ||||
* At this point we are at the initial call. Here we decide | * At this point we are at the initial call. Here we decide | ||||
Context not available. | |||||
/* Set the flag */ | /* Set the flag */ | ||||
rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; | rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; | ||||
tcp_set_hpts(tp->t_inpcb); | tcp_set_hpts(tp->t_inpcb); | ||||
rack_stop_all_timers(tp); | |||||
sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); | sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); | ||||
} | } | ||||
/* | /* | ||||
Context not available. | |||||
*/ | */ | ||||
INP_WLOCK_ASSERT(tp->t_inpcb); | INP_WLOCK_ASSERT(tp->t_inpcb); | ||||
tcp_rack_xmit_timer_commit(rack, tp); | tcp_rack_xmit_timer_commit(rack, tp); | ||||
if (((tp->snd_max - tp->snd_una) > tp->snd_wnd) && | |||||
(rack->rc_in_persist == 0)){ | |||||
/* | |||||
* The peer shrunk its window on us to the point | |||||
* where we have sent too much. The only thing | |||||
* we can do here is stop any timers and | |||||
* enter persist. We most likely lost the last | |||||
* bytes we sent but oh well, we will have to | |||||
* retransmit them after the peer is caught up. | |||||
*/ | |||||
if (rack->rc_inp->inp_in_hpts) | |||||
tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); | |||||
rack_timer_cancel(tp, rack, cts, __LINE__); | |||||
rack_enter_persist(tp, rack, cts); | |||||
rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); | |||||
way_out = 3; | |||||
goto done_with_input; | |||||
} | |||||
if (nxt_pkt == 0) { | if (nxt_pkt == 0) { | ||||
if (rack->r_wanted_output != 0) { | if (rack->r_wanted_output != 0) { | ||||
did_out = 1; | did_out = 1; | ||||
Context not available. | |||||
rack_timer_audit(tp, rack, &so->so_snd); | rack_timer_audit(tp, rack, &so->so_snd); | ||||
way_out = 2; | way_out = 2; | ||||
} | } | ||||
done_with_input: | |||||
rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out); | rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out); | ||||
if (did_out) | if (did_out) | ||||
rack->r_wanted_output = 0; | rack->r_wanted_output = 0; | ||||
Context not available. | |||||
#ifdef RSS | #ifdef RSS | ||||
struct tcp_function_block *tfb; | struct tcp_function_block *tfb; | ||||
struct tcp_rack *rack; | struct tcp_rack *rack; | ||||
struct epoch_tracker et; | struct inpcb *inp; | ||||
rack = (struct tcp_rack *)tp->t_fb_ptr; | rack = (struct tcp_rack *)tp->t_fb_ptr; | ||||
if (rack->r_state == 0) { | if (rack->r_state == 0) { | ||||
Context not available. | |||||
* Initial input (ACK to SYN-ACK etc)lets go ahead and get | * Initial input (ACK to SYN-ACK etc)lets go ahead and get | ||||
* it processed | * it processed | ||||
*/ | */ | ||||
INP_INFO_RLOCK_ET(&V_tcbinfo, et); | |||||
tcp_get_usecs(&tv); | tcp_get_usecs(&tv); | ||||
rack_hpts_do_segment(m, th, so, tp, drop_hdrlen, | rack_hpts_do_segment(m, th, so, tp, drop_hdrlen, | ||||
tlen, iptos, 0, &tv); | tlen, iptos, 0, &tv); | ||||
INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); | |||||
return; | return; | ||||
} | } | ||||
tcp_queue_to_input(tp, m, th, tlen, drop_hdrlen, iptos); | tcp_queue_to_input(tp, m, th, tlen, drop_hdrlen, iptos); | ||||
Context not available. | |||||
#ifdef TCPDEBUG | #ifdef TCPDEBUG | ||||
struct ipovly *ipov = NULL; | struct ipovly *ipov = NULL; | ||||
#endif | #endif | ||||
#ifdef NETFLIX_TCP_O_UDP | |||||
struct udphdr *udp = NULL; | struct udphdr *udp = NULL; | ||||
#endif | |||||
struct tcp_rack *rack; | struct tcp_rack *rack; | ||||
struct tcphdr *th; | struct tcphdr *th; | ||||
uint8_t pass = 0; | uint8_t pass = 0; | ||||
uint8_t wanted_cookie = 0; | |||||
u_char opt[TCP_MAXOLEN]; | u_char opt[TCP_MAXOLEN]; | ||||
unsigned ipoptlen, optlen, hdrlen, ulen=0; | unsigned ipoptlen, optlen, hdrlen; | ||||
#ifdef NETFLIX_TCP_O_UDP | |||||
unsigned ulen; | |||||
#endif | |||||
uint32_t rack_seq; | uint32_t rack_seq; | ||||
#if defined(IPSEC) || defined(IPSEC_SUPPORT) | #if defined(IPSEC) || defined(IPSEC_SUPPORT) | ||||
Context not available. | |||||
if (tp->t_flags & TF_TOE) | if (tp->t_flags & TF_TOE) | ||||
return (tcp_offload_output(tp)); | return (tcp_offload_output(tp)); | ||||
#endif | #endif | ||||
#ifdef TCP_RFC7413 | |||||
/* | |||||
* For TFO connections in SYN_RECEIVED, only allow the initial | |||||
* SYN|ACK and those sent by the retransmit timer. | |||||
*/ | |||||
if ((tp->t_flags & TF_FASTOPEN) && | |||||
(tp->t_state == TCPS_SYN_RECEIVED) && | |||||
SEQ_GT(tp->snd_max, tp->snd_una) && /* inital SYN|ACK sent */ | |||||
(tp->snd_nxt != tp->snd_una)) /* not a retransmit */ | |||||
return (0); | |||||
#endif | |||||
#ifdef INET6 | #ifdef INET6 | ||||
if (rack->r_state) { | if (rack->r_state) { | ||||
/* Use the cache line loaded if possible */ | /* Use the cache line loaded if possible */ | ||||
Context not available. | |||||
rack->r_wanted_output = 0; | rack->r_wanted_output = 0; | ||||
rack->r_timer_override = 0; | rack->r_timer_override = 0; | ||||
/* | /* | ||||
* For TFO connections in SYN_SENT or SYN_RECEIVED, | |||||
* only allow the initial SYN or SYN|ACK and those sent | |||||
* by the retransmit timer. | |||||
*/ | |||||
if (IS_FASTOPEN(tp->t_flags) && | |||||
((tp->t_state == TCPS_SYN_RECEIVED) || | |||||
(tp->t_state == TCPS_SYN_SENT)) && | |||||
SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */ | |||||
(tp->t_rxtshift == 0)) /* not a retransmit */ | |||||
return (0); | |||||
/* | |||||
* Determine length of data that should be transmitted, and flags | * Determine length of data that should be transmitted, and flags | ||||
* that will be used. If there is some data or critical controls | * that will be used. If there is some data or critical controls | ||||
* (SYN, RST) to send, then transmit; otherwise, investigate | * (SYN, RST) to send, then transmit; otherwise, investigate | ||||
Context not available. | |||||
* further. | * further. | ||||
*/ | */ | ||||
idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); | idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); | ||||
#ifdef NETFLIX_CWV | |||||
if (tp->cwv_enabled) { | |||||
if ((tp->cwv_cwnd_valid == 0) && | |||||
TCPS_HAVEESTABLISHED(tp->t_state) && | |||||
(tp->snd_cwnd > tp->snd_cwv.init_cwnd)) | |||||
tcp_newcwv_nvp_closedown(tp); | |||||
} else | |||||
#endif | |||||
if (tp->t_idle_reduce) { | if (tp->t_idle_reduce) { | ||||
if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) | if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) | ||||
rack_cc_after_idle(tp, | rack_cc_after_idle(tp, | ||||
Context not available. | |||||
tlen = rsm->r_end - rsm->r_start; | tlen = rsm->r_end - rsm->r_start; | ||||
if (tlen > tp->t_maxseg) | if (tlen > tp->t_maxseg) | ||||
tlen = tp->t_maxseg; | tlen = tp->t_maxseg; | ||||
KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), | #ifdef INVARIANTS | ||||
("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", | if (SEQ_GT(tp->snd_una, rsm->r_start)) { | ||||
__func__, __LINE__, | panic("tp:%p rack:%p snd_una:%u rsm:%p r_start:%u", | ||||
rsm->r_start, tp->snd_una, tp, rack, rsm)); | tp, rack, tp->snd_una, rsm, rsm->r_start); | ||||
} | |||||
#endif | |||||
sb_offset = rsm->r_start - tp->snd_una; | sb_offset = rsm->r_start - tp->snd_una; | ||||
cwin = min(tp->snd_wnd, tlen); | cwin = min(tp->snd_wnd, tlen); | ||||
len = cwin; | len = cwin; | ||||
Context not available. | |||||
len = rsm->r_end - rsm->r_start; | len = rsm->r_end - rsm->r_start; | ||||
sack_rxmit = 1; | sack_rxmit = 1; | ||||
sendalot = 0; | sendalot = 0; | ||||
KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), | |||||
("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", | |||||
__func__, __LINE__, | |||||
rsm->r_start, tp->snd_una, tp, rack, rsm)); | |||||
sb_offset = rsm->r_start - tp->snd_una; | sb_offset = rsm->r_start - tp->snd_una; | ||||
if (len >= tp->t_maxseg) { | if (len >= tp->t_maxseg) { | ||||
len = tp->t_maxseg; | len = tp->t_maxseg; | ||||
} | } | ||||
KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d", | |||||
__func__, sb_offset)); | |||||
} else if ((rack->rc_in_persist == 0) && | } else if ((rack->rc_in_persist == 0) && | ||||
((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) { | ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) { | ||||
long tlen; | long tlen; | ||||
Context not available. | |||||
} | } | ||||
#endif | #endif | ||||
tlen = rsm->r_end - rsm->r_start; | tlen = rsm->r_end - rsm->r_start; | ||||
KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), | |||||
("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", | |||||
__func__, __LINE__, | |||||
rsm->r_start, tp->snd_una, tp, rack, rsm)); | |||||
sb_offset = rsm->r_start - tp->snd_una; | sb_offset = rsm->r_start - tp->snd_una; | ||||
if (tlen > rack->r_ctl.rc_prr_sndcnt) { | if (tlen > rack->r_ctl.rc_prr_sndcnt) { | ||||
len = rack->r_ctl.rc_prr_sndcnt; | len = rack->r_ctl.rc_prr_sndcnt; | ||||
Context not available. | |||||
goto just_return_nolock; | goto just_return_nolock; | ||||
} | } | ||||
} | } | ||||
KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d", | |||||
__func__, sb_offset)); | |||||
if (len > 0) { | if (len > 0) { | ||||
sub_from_prr = 1; | sub_from_prr = 1; | ||||
sack_rxmit = 1; | sack_rxmit = 1; | ||||
Context not available. | |||||
/* For debugging */ | /* For debugging */ | ||||
rack->r_ctl.rc_rsm_at_retran = rsm; | rack->r_ctl.rc_rsm_at_retran = rsm; | ||||
#endif | #endif | ||||
/* | |||||
* Enforce a connection sendmap count limit if set | |||||
* as long as we are not retransmitting. | |||||
*/ | |||||
if ((rsm == NULL) && | |||||
(rack_map_entries_limit > 0) && | |||||
(rack->r_ctl.rc_num_maps_alloced >= rack_map_entries_limit)) { | |||||
counter_u64_add(rack_to_alloc_limited, 1); | |||||
if (!rack->alloc_limit_reported) { | |||||
rack->alloc_limit_reported = 1; | |||||
counter_u64_add(rack_alloc_limited_conns, 1); | |||||
} | |||||
goto just_return_nolock; | |||||
} | |||||
/* | /* | ||||
* Get standard flags, and add SYN or FIN if requested by 'hidden' | * Get standard flags, and add SYN or FIN if requested by 'hidden' | ||||
* state flags. | * state flags. | ||||
Context not available. | |||||
uint32_t avail; | uint32_t avail; | ||||
avail = sbavail(sb); | avail = sbavail(sb); | ||||
if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail) | if (SEQ_GT(tp->snd_nxt, tp->snd_una)) | ||||
sb_offset = tp->snd_nxt - tp->snd_una; | sb_offset = tp->snd_nxt - tp->snd_una; | ||||
else | else | ||||
sb_offset = 0; | sb_offset = 0; | ||||
Context not available. | |||||
* data possible so far in the scoreboard. | * data possible so far in the scoreboard. | ||||
*/ | */ | ||||
outstanding = tp->snd_max - tp->snd_una; | outstanding = tp->snd_max - tp->snd_una; | ||||
if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) | if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) { | ||||
len = 0; | if (tp->snd_wnd > outstanding) { | ||||
else if (avail > sb_offset) | len = tp->snd_wnd - outstanding; | ||||
/* Check to see if we have the data */ | |||||
if (((sb_offset + len) > avail) && | |||||
(avail > sb_offset)) | |||||
len = avail - sb_offset; | |||||
else | |||||
len = 0; | |||||
} else | |||||
len = 0; | |||||
} else if (avail > sb_offset) | |||||
len = avail - sb_offset; | len = avail - sb_offset; | ||||
else | else | ||||
len = 0; | len = 0; | ||||
Context not available. | |||||
* SYN-SENT state and if segment contains data and if we don't know | * SYN-SENT state and if segment contains data and if we don't know | ||||
* that the foreign host supports TAO, suppress sending segment. | * that the foreign host supports TAO, suppress sending segment. | ||||
*/ | */ | ||||
if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) && | if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { | ||||
((sack_rxmit == 0) && (tp->t_rxtshift == 0))) { | if ((tp->t_state != TCPS_SYN_RECEIVED) && | ||||
if (tp->t_state != TCPS_SYN_RECEIVED) | (tp->t_state != TCPS_SYN_SENT)) | ||||
flags &= ~TH_SYN; | flags &= ~TH_SYN; | ||||
#ifdef TCP_RFC7413 | |||||
/* | /* | ||||
* When sending additional segments following a TFO SYN|ACK, | * When sending additional segments following a TFO SYN|ACK, | ||||
* do not include the SYN bit. | * do not include the SYN bit. | ||||
*/ | */ | ||||
if (IS_FASTOPEN(tp->t_flags) && | if ((tp->t_flags & TF_FASTOPEN) && | ||||
(tp->t_state == TCPS_SYN_RECEIVED)) | (tp->t_state == TCPS_SYN_RECEIVED)) | ||||
flags &= ~TH_SYN; | flags &= ~TH_SYN; | ||||
#endif | |||||
sb_offset--, len++; | sb_offset--, len++; | ||||
if (sbavail(sb) == 0) | |||||
len = 0; | |||||
} | } | ||||
/* | /* | ||||
* Be careful not to send data and/or FIN on SYN segments. This | * Be careful not to send data and/or FIN on SYN segments. This | ||||
Context not available. | |||||
len = 0; | len = 0; | ||||
flags &= ~TH_FIN; | flags &= ~TH_FIN; | ||||
} | } | ||||
#ifdef TCP_RFC7413 | |||||
/* | /* | ||||
* On TFO sockets, ensure no data is sent in the following cases: | * When retransmitting SYN|ACK on a passively-created TFO socket, | ||||
* | * don't include data, as the presence of data may have caused the | ||||
* - When retransmitting SYN|ACK on a passively-created socket | * original SYN|ACK to have been dropped by a middlebox. | ||||
* | |||||
* - When retransmitting SYN on an actively created socket | |||||
* | |||||
* - When sending a zero-length cookie (cookie request) on an | |||||
* actively created socket | |||||
* | |||||
* - When the socket is in the CLOSED state (RST is being sent) | |||||
*/ | */ | ||||
if (IS_FASTOPEN(tp->t_flags) && | if ((tp->t_flags & TF_FASTOPEN) && | ||||
(((flags & TH_SYN) && (tp->t_rxtshift > 0)) || | ((tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_rxtshift > 0))) | ||||
((tp->t_state == TCPS_SYN_SENT) && | |||||
(tp->t_tfo_client_cookie_len == 0)) || | |||||
(flags & TH_RST))) { | |||||
sack_rxmit = 0; | |||||
len = 0; | len = 0; | ||||
} | #endif | ||||
/* Without fast-open there should never be data sent on a SYN */ | |||||
if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) | |||||
len = 0; | |||||
if (len <= 0) { | if (len <= 0) { | ||||
/* | /* | ||||
* If FIN has been sent but not acked, but we haven't been | * If FIN has been sent but not acked, but we haven't been | ||||
Context not available. | |||||
ipoptlen += ipsec_optlen; | ipoptlen += ipsec_optlen; | ||||
#endif | #endif | ||||
if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg && | if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg && | ||||
#ifdef NETFLIX_TCP_O_UDP | |||||
(tp->t_port == 0) && | (tp->t_port == 0) && | ||||
#endif | |||||
((tp->t_flags & TF_SIGNATURE) == 0) && | ((tp->t_flags & TF_SIGNATURE) == 0) && | ||||
tp->rcv_numsacks == 0 && sack_rxmit == 0 && | tp->rcv_numsacks == 0 && sack_rxmit == 0 && | ||||
ipoptlen == 0) | ipoptlen == 0) | ||||
Context not available. | |||||
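The multi-line test just above gates TSO for this transmission. As a readability aid, the same conditions written as a hypothetical helper (the name rack_may_use_tso and the function shape are made up for illustration; only the individual tests come from the hunk, and the fragment assumes this file's kernel headers):

    static inline int
    rack_may_use_tso(struct tcpcb *tp, uint32_t len, int sack_rxmit, unsigned ipoptlen)
    {
    	return ((tp->t_flags & TF_TSO) &&          /* TSO usable on this connection */
    	    V_tcp_do_tso &&                        /* global TSO enable */
    	    len > tp->t_maxseg &&                  /* more than one MSS to send */
    	    tp->t_port == 0 &&                     /* not tunneled over UDP */
    	    (tp->t_flags & TF_SIGNATURE) == 0 &&   /* no TCP-MD5 signature option */
    	    tp->rcv_numsacks == 0 &&               /* no SACK blocks to report */
    	    sack_rxmit == 0 &&                     /* not a SACK retransmission */
    	    ipoptlen == 0);                        /* no IP options or IPsec overhead */
    }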
* If our state indicates that FIN should be sent and we have not | * If our state indicates that FIN should be sent and we have not | ||||
* yet done so, then we need to send. | * yet done so, then we need to send. | ||||
*/ | */ | ||||
if ((flags & TH_FIN) && | if (flags & TH_FIN) { | ||||
(tp->snd_nxt == tp->snd_una)) { | if ((tp->t_flags & TF_SENTFIN) || | ||||
pass = 11; | (((tp->t_flags & TF_SENTFIN) == 0) && | ||||
goto send; | (tp->snd_nxt == tp->snd_una))) { | ||||
pass = 11; | |||||
goto send; | |||||
} | |||||
} | } | ||||
/* | /* | ||||
* No reason to send a segment, just return. | * No reason to send a segment, just return. | ||||
Context not available. | |||||
if (flags & TH_SYN) { | if (flags & TH_SYN) { | ||||
tp->snd_nxt = tp->iss; | tp->snd_nxt = tp->iss; | ||||
to.to_mss = tcp_mssopt(&inp->inp_inc); | to.to_mss = tcp_mssopt(&inp->inp_inc); | ||||
#ifdef NETFLIX_TCPOUDP | #ifdef NETFLIX_TCP_O_UDP | ||||
if (tp->t_port) | if (tp->t_port) | ||||
to.to_mss -= V_tcp_udp_tunneling_overhead; | to.to_mss -= V_tcp_udp_tunneling_overhead; | ||||
#endif | #endif | ||||
to.to_flags |= TOF_MSS; | to.to_flags |= TOF_MSS; | ||||
#ifdef TCP_RFC7413 | |||||
/* | /* | ||||
* On SYN or SYN|ACK transmits on TFO connections, | * Only include the TFO option on the first | ||||
* only include the TFO option if it is not a | * transmission of the SYN|ACK on a | ||||
* retransmit, as the presence of the TFO option may | * passively-created TFO socket, as the presence of | ||||
* have caused the original SYN or SYN|ACK to have | * the TFO option may have caused the original | ||||
* been dropped by a middlebox. | * SYN|ACK to have been dropped by a middlebox. | ||||
*/ | */ | ||||
if (IS_FASTOPEN(tp->t_flags) && | if ((tp->t_flags & TF_FASTOPEN) && | ||||
(tp->t_state == TCPS_SYN_RECEIVED) && | |||||
(tp->t_rxtshift == 0)) { | (tp->t_rxtshift == 0)) { | ||||
if (tp->t_state == TCPS_SYN_RECEIVED) { | to.to_tfo_len = TCP_FASTOPEN_MAX_COOKIE_LEN; | ||||
to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; | to.to_tfo_cookie = (u_char *)&tp->t_tfo_cookie; | ||||
to.to_tfo_cookie = | to.to_flags |= TOF_FASTOPEN; | ||||
(u_int8_t *)&tp->t_tfo_cookie.server; | |||||
to.to_flags |= TOF_FASTOPEN; | |||||
wanted_cookie = 1; | |||||
} else if (tp->t_state == TCPS_SYN_SENT) { | |||||
to.to_tfo_len = | |||||
tp->t_tfo_client_cookie_len; | |||||
to.to_tfo_cookie = | |||||
tp->t_tfo_cookie.client; | |||||
to.to_flags |= TOF_FASTOPEN; | |||||
wanted_cookie = 1; | |||||
/* | |||||
* If we wind up having more data to | |||||
* send with the SYN than can fit in | |||||
* one segment, don't send any more | |||||
* until the SYN|ACK comes back from | |||||
* the other end. | |||||
*/ | |||||
sendalot = 0; | |||||
} | |||||
} | } | ||||
#endif | |||||
} | } | ||||
/* Window scaling. */ | /* Window scaling. */ | ||||
if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { | if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { | ||||
Context not available. | |||||
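One detail in the SYN-option hunk above that is easy to miss: when the connection is tunneled over UDP (tp->t_port set), the advertised MSS is reduced by V_tcp_udp_tunneling_overhead so the encapsulated packet still fits the path MTU. A worked example, assuming a 1500-byte IPv4 MTU and assuming the overhead sysctl is left at the size of one UDP header; both numbers are illustrative and not taken from the diff:

    uint16_t mss = 1460;      /* tcp_mssopt() result: 1500 - 20 (IP) - 20 (TCP) */
    uint16_t overhead = 8;    /* assumed value of V_tcp_udp_tunneling_overhead */

    if (tp->t_port)           /* UDP-encapsulated connection */
    	mss -= overhead;      /* advertise 1452 so payload + UDP header still fits */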
/* Processing the options. */ | /* Processing the options. */ | ||||
hdrlen += optlen = tcp_addoptions(&to, opt); | hdrlen += optlen = tcp_addoptions(&to, opt); | ||||
/* | |||||
* If we wanted a TFO option to be added, but it was unable | |||||
* to fit, ensure no data is sent. | |||||
*/ | |||||
if (IS_FASTOPEN(tp->t_flags) && wanted_cookie && | |||||
!(to.to_flags & TOF_FASTOPEN)) | |||||
len = 0; | |||||
} | } | ||||
#ifdef NETFLIX_TCPOUDP | #ifdef NETFLIX_TCP_O_UDP | ||||
if (tp->t_port) { | if (tp->t_port) { | ||||
if (V_tcp_udp_tunneling_port == 0) { | if (V_tcp_udp_tunneling_port == 0) { | ||||
/* The port was removed?? */ | /* The port was removed?? */ | ||||
Context not available. | |||||
msb = NULL; | msb = NULL; | ||||
else | else | ||||
msb = sb; | msb = sb; | ||||
m->m_next = tcp_m_copym(mb, moff, &len, | m->m_next = tcp_m_copym(/*tp, */ mb, moff, &len, | ||||
if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb); | if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb /*, 0, NULL*/); | ||||
if (len <= (tp->t_maxseg - optlen)) { | if (len <= (tp->t_maxseg - optlen)) { | ||||
/* | /* | ||||
* Must have run out of mbufs for the copy | * Must have run out of mbufs for the copy | ||||
Context not available. | |||||
* TLP should not count in retran count, but | * TLP should not count in retran count, but | ||||
* in its own bin | * in its own bin | ||||
*/ | */ | ||||
/* tp->t_sndtlppack++;*/ | |||||
/* tp->t_sndtlpbyte += len;*/ | |||||
counter_u64_add(rack_tlp_retran, 1); | counter_u64_add(rack_tlp_retran, 1); | ||||
counter_u64_add(rack_tlp_retran_bytes, len); | counter_u64_add(rack_tlp_retran_bytes, len); | ||||
} else { | } else { | ||||
Context not available. | |||||
#ifdef INET6 | #ifdef INET6 | ||||
if (isipv6) { | if (isipv6) { | ||||
ip6 = mtod(m, struct ip6_hdr *); | ip6 = mtod(m, struct ip6_hdr *); | ||||
#ifdef NETFLIX_TCPOUDP | #ifdef NETFLIX_TCP_O_UDP | ||||
if (tp->t_port) { | if (tp->t_port) { | ||||
udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr)); | udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr)); | ||||
udp->uh_sport = htons(V_tcp_udp_tunneling_port); | udp->uh_sport = htons(V_tcp_udp_tunneling_port); | ||||
Context not available. | |||||
ulen = hdrlen + len - sizeof(struct ip6_hdr); | ulen = hdrlen + len - sizeof(struct ip6_hdr); | ||||
udp->uh_ulen = htons(ulen); | udp->uh_ulen = htons(ulen); | ||||
th = (struct tcphdr *)(udp + 1); | th = (struct tcphdr *)(udp + 1); | ||||
} else | } else | ||||
#endif | #endif | ||||
th = (struct tcphdr *)(ip6 + 1); | th = (struct tcphdr *)(ip6 + 1); | ||||
tcpip_fillheaders(inp, ip6, th); | tcpip_fillheaders(inp, /*tp->t_port, */ ip6, th); | ||||
} else | } else | ||||
#endif /* INET6 */ | #endif /* INET6 */ | ||||
{ | { | ||||
Context not available. | |||||
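In the UDP-tunneled IPv6 branch above, the datagram length placed in uh_ulen is computed as hdrlen + len - sizeof(struct ip6_hdr). A worked example with illustrative values, assuming hdrlen already includes the UDP header on tunneled connections (that adjustment happens earlier in this function and is not shown in this hunk):

    /* hdrlen = 40 (ip6) + 8 (udp) + 20 (tcp) + 12 (tcp options) = 80; len = 1000 payload bytes */
    unsigned hdrlen = 80, len = 1000;
    unsigned ulen = hdrlen + len - sizeof(struct ip6_hdr);
    /* ulen = 80 + 1000 - 40 = 1040 = UDP header + TCP header/options + payload,
     * exactly the quantity that goes into uh_ulen and the UDP pseudo-header checksum */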
#ifdef TCPDEBUG | #ifdef TCPDEBUG | ||||
ipov = (struct ipovly *)ip; | ipov = (struct ipovly *)ip; | ||||
#endif | #endif | ||||
#ifdef NETFLIX_TCPOUDP | #ifdef NETFLIX_TCP_O_UDP | ||||
if (tp->t_port) { | if (tp->t_port) { | ||||
udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip)); | udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip)); | ||||
udp->uh_sport = htons(V_tcp_udp_tunneling_port); | udp->uh_sport = htons(V_tcp_udp_tunneling_port); | ||||
Context not available. | |||||
} else | } else | ||||
#endif | #endif | ||||
th = (struct tcphdr *)(ip + 1); | th = (struct tcphdr *)(ip + 1); | ||||
tcpip_fillheaders(inp, ip, th); | tcpip_fillheaders(inp,/*tp->t_port, */ ip, th); | ||||
} | } | ||||
/* | /* | ||||
* Fill in fields, remembering maximum advertised window for use in | * Fill in fields, remembering maximum advertised window for use in | ||||
Context not available. | |||||
/* | /* | ||||
* Calculate receive window. Don't shrink window, but avoid silly | * Calculate receive window. Don't shrink window, but avoid silly | ||||
* window syndrome. | * window syndrome. | ||||
* If a RST segment is sent, advertise a window of zero. | |||||
*/ | */ | ||||
if (flags & TH_RST) { | if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && | ||||
recwin < (long)tp->t_maxseg) | |||||
recwin = 0; | recwin = 0; | ||||
} else { | if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && | ||||
if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && | recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) | ||||
recwin < (long)tp->t_maxseg) | recwin = (long)(tp->rcv_adv - tp->rcv_nxt); | ||||
recwin = 0; | if (recwin > (long)TCP_MAXWIN << tp->rcv_scale) | ||||
if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && | recwin = (long)TCP_MAXWIN << tp->rcv_scale; | ||||
recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) | |||||
recwin = (long)(tp->rcv_adv - tp->rcv_nxt); | |||||
if (recwin > (long)TCP_MAXWIN << tp->rcv_scale) | |||||
recwin = (long)TCP_MAXWIN << tp->rcv_scale; | |||||
} | |||||
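/*
 * Worked example of the non-RST branch above, with illustrative numbers that are
 * not from the diff (64 KB receive buffer, 1460-byte MSS, no window scaling):
 * starting from recwin = 1000,
 *
 *     1000 < sb_hiwat/4 (16384) and 1000 < t_maxseg (1460) -> recwin = 0 (SWS avoidance)
 *     rcv_adv - rcv_nxt = 5840 was already advertised      -> recwin = 5840 (never shrink)
 *     cap at TCP_MAXWIN << rcv_scale = 65535                -> unchanged, 5840 is advertised
 */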
/* | /* | ||||
* According to RFC1323 the window field in a SYN (i.e., a <SYN> or | * According to RFC1323 the window field in a SYN (i.e., a <SYN> or | ||||
Context not available. | |||||
* ip6_plen does not need to be filled now, and will be filled | * ip6_plen does not need to be filled now, and will be filled | ||||
* in ip6_output. | * in ip6_output. | ||||
*/ | */ | ||||
#ifdef NETFLIX_TCP_O_UDP | |||||
if (tp->t_port) { | if (tp->t_port) { | ||||
m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; | m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; | ||||
m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); | m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); | ||||
udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); | udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); | ||||
th->th_sum = htons(0); | th->th_sum = htons(0); | ||||
UDPSTAT_INC(udps_opackets); | |||||
} else { | } else { | ||||
#endif | |||||
m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; | m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; | ||||
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); | m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); | ||||
th->th_sum = in6_cksum_pseudo(ip6, | th->th_sum = in6_cksum_pseudo(ip6, | ||||
sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, | sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, | ||||
0); | 0); | ||||
#ifdef NETFLIX_TCP_O_UDP | |||||
} | } | ||||
#endif | |||||
} | } | ||||
#endif | #endif | ||||
#if defined(INET6) && defined(INET) | #if defined(INET6) && defined(INET) | ||||
Context not available. | |||||
#endif | #endif | ||||
#ifdef INET | #ifdef INET | ||||
{ | { | ||||
#ifdef NETFLIX_TCP_O_UDP | |||||
if (tp->t_port) { | if (tp->t_port) { | ||||
m->m_pkthdr.csum_flags = CSUM_UDP; | m->m_pkthdr.csum_flags = CSUM_UDP; | ||||
m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); | m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); | ||||
Context not available. | |||||
udp->uh_sum = in_pseudo(ip->ip_src.s_addr, | udp->uh_sum = in_pseudo(ip->ip_src.s_addr, | ||||
ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); | ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); | ||||
th->th_sum = htons(0); | th->th_sum = htons(0); | ||||
UDPSTAT_INC(udps_opackets); | |||||
} else { | } else { | ||||
#endif | |||||
m->m_pkthdr.csum_flags = CSUM_TCP; | m->m_pkthdr.csum_flags = CSUM_TCP; | ||||
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); | m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); | ||||
th->th_sum = in_pseudo(ip->ip_src.s_addr, | th->th_sum = in_pseudo(ip->ip_src.s_addr, | ||||
ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + | ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + | ||||
IPPROTO_TCP + len + optlen)); | IPPROTO_TCP + len + optlen)); | ||||
#ifdef NETFLIX_TCP_O_UDP | |||||
} | } | ||||
#endif | |||||
/* IP version must be set here for ipv4/ipv6 checking later */ | /* IP version must be set here for ipv4/ipv6 checking later */ | ||||
KASSERT(ip->ip_v == IPVERSION, | KASSERT(ip->ip_v == IPVERSION, | ||||
("%s: IP version incorrect: %d", __func__, ip->ip_v)); | ("%s: IP version incorrect: %d", __func__, ip->ip_v)); | ||||
Context not available. | |||||
* retransmit. In persist state, just set snd_max. | * retransmit. In persist state, just set snd_max. | ||||
*/ | */ | ||||
if (error == 0) { | if (error == 0) { | ||||
/* if (TCPS_HAVEESTABLISHED(tp->t_state) && | |||||
(tp->t_flags & TF_SACK_PERMIT) && | |||||
tp->rcv_numsacks > 0) | |||||
tcp_clean_dsack_blocks(tp);*/ | |||||
if (len == 0) | if (len == 0) | ||||
counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1); | counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1); | ||||
else if (len == 1) { | else if (len == 1) { | ||||
Context not available. | |||||
} | } | ||||
} | } | ||||
if (sub_from_prr && (error == 0)) { | if (sub_from_prr && (error == 0)) { | ||||
rack->r_ctl.rc_prr_sndcnt -= len; | if (rack->r_ctl.rc_prr_sndcnt >= len) | ||||
rack->r_ctl.rc_prr_sndcnt -= len; | |||||
else | |||||
rack->r_ctl.rc_prr_sndcnt = 0; | |||||
} | } | ||||
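/*
 * Why one side of the hunk above clamps the subtraction: rc_prr_sndcnt is an
 * unsigned 32-bit budget (an assumption based on its use in this file), so an
 * unchecked subtract can wrap and effectively disable PRR pacing, e.g.:
 *
 *     uint32_t budget = 500, len = 1460;
 *     budget -= len;      -> wraps to 4294966336
 *
 * hence the "if (rc_prr_sndcnt >= len)" guard before subtracting.
 */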
sub_from_prr = 0; | sub_from_prr = 0; | ||||
rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts, | rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts, | ||||
Context not available. | |||||
pass, rsm); | pass, rsm); | ||||
if ((tp->t_flags & TF_FORCEDATA) == 0 || | if ((tp->t_flags & TF_FORCEDATA) == 0 || | ||||
(rack->rc_in_persist == 0)) { | (rack->rc_in_persist == 0)) { | ||||
#ifdef NETFLIX_STATS | |||||
tcp_seq startseq = tp->snd_nxt; | tcp_seq startseq = tp->snd_nxt; | ||||
#endif | |||||
/* | /* | ||||
* Advance snd_nxt over sequence space of this segment. | * Advance snd_nxt over sequence space of this segment. | ||||
*/ | */ | ||||
Context not available. | |||||
tp->t_acktime = ticks; | tp->t_acktime = ticks; | ||||
} | } | ||||
tp->snd_max = tp->snd_nxt; | tp->snd_max = tp->snd_nxt; | ||||
/* | |||||
* Time this transmission if not a retransmission and | |||||
* not currently timing anything. | |||||
* This is only relevant in case of switching back to | |||||
* the base stack. | |||||
*/ | |||||
if (tp->t_rtttime == 0) { | |||||
tp->t_rtttime = ticks; | |||||
tp->t_rtseq = startseq; | |||||
TCPSTAT_INC(tcps_segstimed); | |||||
} | |||||
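/*
 * How the base stack consumes t_rtttime/t_rtseq if the connection later falls
 * back to it (a rough paraphrase of the ACK path in tcp_input, not part of this
 * change): on an ACK past the timed sequence it takes one RTT sample in ticks,
 *
 *     if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
 *         tcp_xmit_timer(tp, ticks - tp->t_rtttime);
 *
 * which feeds srtt/rttvar and clears t_rtttime for the next sample.
 */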
#ifdef NETFLIX_STATS | #ifdef NETFLIX_STATS | ||||
if (!(tp->t_flags & TF_GPUTINPROG) && len) { | if (!(tp->t_flags & TF_GPUTINPROG) && len) { | ||||
tp->t_flags |= TF_GPUTINPROG; | tp->t_flags |= TF_GPUTINPROG; | ||||
Context not available. | |||||
return (tcp_default_ctloutput(so, sopt, inp, tp)); | return (tcp_default_ctloutput(so, sopt, inp, tp)); | ||||
break; | break; | ||||
} | } | ||||
#ifdef NETFLIX_STATS | /* tcp_log_socket_option(tp, sopt->sopt_name, optval, error);*/ | ||||
tcp_log_socket_option(tp, sopt->sopt_name, optval, error); | |||||
#endif | |||||
INP_WUNLOCK(inp); | INP_WUNLOCK(inp); | ||||
return (error); | return (error); | ||||
} | } | ||||
Context not available. | |||||
.tfb_tcp_block_name = __XSTRING(STACKNAME), | .tfb_tcp_block_name = __XSTRING(STACKNAME), | ||||
.tfb_tcp_output = rack_output, | .tfb_tcp_output = rack_output, | ||||
.tfb_tcp_do_segment = rack_do_segment, | .tfb_tcp_do_segment = rack_do_segment, | ||||
.tfb_tcp_hpts_do_segment = rack_hpts_do_segment, | |||||
.tfb_tcp_ctloutput = rack_ctloutput, | .tfb_tcp_ctloutput = rack_ctloutput, | ||||
.tfb_tcp_fb_init = rack_init, | .tfb_tcp_fb_init = rack_init, | ||||
.tfb_tcp_fb_fini = rack_fini, | .tfb_tcp_fb_fini = rack_fini, | ||||
Context not available. | |||||
MODULE_VERSION(MODNAME, 1); | MODULE_VERSION(MODNAME, 1); | ||||
DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); | DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); | ||||
MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1); | |||||
Context not available. |
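The function-block registration and module declarations above are what make this stack selectable at run time; note the dependency on tcphpts, so the kernel must provide the TCP high-precision timer system for the module to load. A small userspace sketch of opting one socket into it (the stack name "rack" is assumed from the module name; TCP_FUNCTION_BLK and struct tcp_function_set are the stock FreeBSD interface, and the system-wide default can instead be switched with the net.inet.tcp.functions_default sysctl):

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <string.h>

    /* Ask the kernel to run this one connection on the "rack" function block. */
    static int
    use_rack_stack(int s)
    {
    	struct tcp_function_set tfs;

    	memset(&tfs, 0, sizeof(tfs));
    	strlcpy(tfs.function_set_name, "rack", sizeof(tfs.function_set_name));
    	return (setsockopt(s, IPPROTO_TCP, TCP_FUNCTION_BLK, &tfs, sizeof(tfs)));
    }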