Index: lib/libc/sys/getsockopt.2 =================================================================== --- lib/libc/sys/getsockopt.2 +++ lib/libc/sys/getsockopt.2 @@ -28,7 +28,7 @@ .\" @(#)getsockopt.2 8.4 (Berkeley) 5/2/95 .\" $FreeBSD$ .\" -.Dd April 5, 2013 +.Dd October 11, 2016 .Dt GETSOCKOPT 2 .Os .Sh NAME @@ -187,6 +187,7 @@ .It Dv SO_LISTENQLEN Ta "get complete queue length of the socket (get only)" .It Dv SO_LISTENINCQLEN Ta "get incomplete queue length of the socket (get only)" .It Dv SO_USER_COOKIE Ta "set the 'so_user_cookie' value for the socket (uint32_t, set only)" +.It Dv SO_MAX_PACING_RATE Ta "set the maximum transmit rate in bytes per second for the socket" .El .Pp .Dv SO_DEBUG @@ -496,6 +497,11 @@ returns the number of unaccepted complete connections. .Dv SO_LISTENINCQLEN returns the number of unaccepted incomplete connections. +.Pp +.Dv SO_MAX_PACING_RATE +instructs the socket and underlying network adapter layers that the +transmit rate should be limited to the given unsigned 32-bit value in +bytes per second. .Sh RETURN VALUES .Rv -std .Sh ERRORS Index: sbin/ifconfig/ifconfig.8 =================================================================== --- sbin/ifconfig/ifconfig.8 +++ sbin/ifconfig/ifconfig.8 @@ -28,7 +28,7 @@ .\" From: @(#)ifconfig.8 8.3 (Berkeley) 1/5/94 .\" $FreeBSD$ .\" -.Dd September 17, 2016 +.Dd October 11, 2016 .Dt IFCONFIG 8 .Os .Sh NAME @@ -460,6 +460,8 @@ and 802.11g .Pq Cm 11g operating modes. +.It Cm txrtlmt +Set if the driver supports TX rate limiting. .It Cm inst Ar minst , Cm instance Ar minst Set the media instance to .Ar minst . 
Index: sbin/ifconfig/ifconfig.c =================================================================== --- sbin/ifconfig/ifconfig.c +++ sbin/ifconfig/ifconfig.c @@ -1145,7 +1145,7 @@ "\020\1RXCSUM\2TXCSUM\3NETCONS\4VLAN_MTU\5VLAN_HWTAGGING\6JUMBO_MTU\7POLLING" \ "\10VLAN_HWCSUM\11TSO4\12TSO6\13LRO\14WOL_UCAST\15WOL_MCAST\16WOL_MAGIC" \ "\17TOE4\20TOE6\21VLAN_HWFILTER\23VLAN_HWTSO\24LINKSTATE\25NETMAP" \ -"\26RXCSUM_IPV6\27TXCSUM_IPV6" +"\26RXCSUM_IPV6\27TXCSUM_IPV6\31TXRTLMT" /* * Print the status of the interface. If an address family was @@ -1453,6 +1453,8 @@ DEF_CMD("-wol_mcast", -IFCAP_WOL_MCAST, setifcap), DEF_CMD("wol_magic", IFCAP_WOL_MAGIC, setifcap), DEF_CMD("-wol_magic", -IFCAP_WOL_MAGIC, setifcap), + DEF_CMD("txrtlmt", IFCAP_TXRTLMT, setifcap), + DEF_CMD("-txrtlmt", -IFCAP_TXRTLMT, setifcap), DEF_CMD("normal", -IFF_LINK0, setifflags), DEF_CMD("compress", IFF_LINK0, setifflags), DEF_CMD("noicmp", IFF_LINK1, setifflags), Index: sys/conf/NOTES =================================================================== --- sys/conf/NOTES +++ sys/conf/NOTES @@ -616,6 +616,8 @@ options INET #Internet communications protocols options INET6 #IPv6 communications protocols +options RATELIMIT # TX rate limiting support + options ROUTETABLES=2 # allocated fibs up to 65536. default is 1. # but that would be a bad idea as they are large. 
Index: sys/conf/config.mk =================================================================== --- sys/conf/config.mk +++ sys/conf/config.mk @@ -19,6 +19,10 @@ opt_inet6.h: @echo "#define INET6 1" > ${.TARGET} .endif +.if ${MK_RATELIMIT} != "no" +opt_ratelimit.h: + @echo "#define RATELIMIT 1" > ${.TARGET} +.endif .if ${MK_EISA} != "no" opt_eisa.h: @echo "#define DEV_EISA 1" > ${.TARGET} Index: sys/conf/kern.opts.mk =================================================================== --- sys/conf/kern.opts.mk +++ sys/conf/kern.opts.mk @@ -47,7 +47,8 @@ EISA \ EXTRA_TCP_STACKS \ NAND \ - OFED + OFED \ + RATELIMIT # Some options are totally broken on some architectures. We disable # them. If you need to enable them on an experimental basis, you Index: sys/conf/options =================================================================== --- sys/conf/options +++ sys/conf/options @@ -407,6 +407,7 @@ BOOTP_WIRED_TO opt_bootp.h DEVICE_POLLING DUMMYNET opt_ipdn.h +RATELIMIT opt_ratelimit.h INET opt_inet.h INET6 opt_inet6.h IPDIVERT Index: sys/kern/uipc_socket.c =================================================================== --- sys/kern/uipc_socket.c +++ sys/kern/uipc_socket.c @@ -105,6 +105,7 @@ #include "opt_inet.h" #include "opt_inet6.h" +#include "opt_ratelimit.h" #include "opt_compat.h" #include @@ -2683,6 +2684,18 @@ #endif break; + case SO_MAX_PACING_RATE: +#ifdef RATELIMIT + error = sooptcopyin(sopt, &val32, sizeof(val32), + sizeof(val32)); + if (error) + goto bad; + so->so_max_pacing_rate = val32; +#else + error = EOPNOTSUPP; +#endif + break; + default: if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) error = hhook_run_socket(so, sopt, @@ -2738,6 +2751,9 @@ #ifdef MAC struct mac extmac; #endif +#ifdef RATELIMIT + uint32_t val32; +#endif CURVNET_SET(so->so_vnet); error = 0; @@ -2870,6 +2886,15 @@ optval = so->so_incqlen; goto integer; + case SO_MAX_PACING_RATE: +#ifdef RATELIMIT + val32 = so->so_max_pacing_rate; + error = sooptcopyout(sopt, &val32, 
sizeof(val32)); +#else + error = EOPNOTSUPP; +#endif + break; + default: if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) error = hhook_run_socket(so, sopt, Index: sys/net/if.h =================================================================== --- sys/net/if.h +++ sys/net/if.h @@ -239,6 +239,7 @@ #define IFCAP_RXCSUM_IPV6 0x200000 /* can offload checksum on IPv6 RX */ #define IFCAP_TXCSUM_IPV6 0x400000 /* can offload checksum on IPv6 TX */ #define IFCAP_HWSTATS 0x800000 /* manages counters internally */ +#define IFCAP_TXRTLMT 0x1000000 /* hardware supports TX rate limiting */ #define IFCAP_HWCSUM_IPV6 (IFCAP_RXCSUM_IPV6 | IFCAP_TXCSUM_IPV6) @@ -371,6 +372,16 @@ }; /* + * Interface to create/delete/modify TX rate limiting. + */ +struct ifreq_txrtlmt { + uint32_t txring_max_rate; /* limit in bytes per second */ + uint32_t txring_id; /* driver specific value */ + uint32_t txring_flow_id; /* current flowid */ + uint32_t txring_flow_type; /* current flowtype */ +}; + +/* * Interface request structure used for socket * ioctl's. All interface ioctl's must have parameter * definitions which begin with ifr_name. The Index: sys/net/if.c =================================================================== --- sys/net/if.c +++ sys/net/if.c @@ -2767,6 +2767,17 @@ ifr = (struct ifreq *)data; switch (cmd) { + /* + * The TX rate limiting IOCTLs should only be used + * within the kernel. 
Prevent user-space from using + * them: + */ + case SIOCARATECTL: + case SIOCSRATECTL: + case SIOCDRATECTL: + CURVNET_RESTORE(); + return (EOPNOTSUPP); + #ifdef VIMAGE case SIOCSIFRVNET: error = priv_check(td, PRIV_NET_SETIFVNET); Index: sys/netinet/in_pcb.h =================================================================== --- sys/netinet/in_pcb.h +++ sys/netinet/in_pcb.h @@ -202,10 +202,13 @@ u_char inp_ip_minttl; /* (i) minimum TTL or drop */ uint32_t inp_flowid; /* (x) flow id / queue id */ u_int inp_refcount; /* (i) refcount */ - void *inp_pspare[5]; /* (x) packet pacing / general use */ + struct ifnet *inp_txring_ifp; /* (i) ifp of TX ring */ + void *inp_pspare[4]; /* (x) packet pacing / general use */ uint32_t inp_flowtype; /* (x) M_HASHTYPE value */ uint32_t inp_rss_listen_bucket; /* (x) overridden RSS listen bucket */ - u_int inp_ispare[4]; /* (x) packet pacing / user cookie / + uint32_t inp_txring_max_rate; /* (i) driver TX ring rate */ + uint32_t inp_txring_id; /* (i) driver TX ring ID */ + u_int inp_ispare[2]; /* (x) packet pacing / user cookie / * general use */ /* Local and foreign ports, local and foreign addr. 
*/ @@ -736,6 +739,9 @@ struct sockaddr * in_sockaddr(in_port_t port, struct in_addr *addr); void in_pcbsosetlabel(struct socket *so); +#ifdef RATELIMIT +void in_pcboutput_txrtlmt(struct inpcb *, struct ifnet *, struct mbuf *); +#endif #endif /* _KERNEL */ #endif /* !_NETINET_IN_PCB_H_ */ Index: sys/netinet/in_pcb.c =================================================================== --- sys/netinet/in_pcb.c +++ sys/netinet/in_pcb.c @@ -42,6 +42,7 @@ #include "opt_ipsec.h" #include "opt_inet.h" #include "opt_inet6.h" +#include "opt_ratelimit.h" #include "opt_pcbgroup.h" #include "opt_rss.h" @@ -57,6 +58,7 @@ #include #include #include +#include #include #include #include @@ -136,6 +138,9 @@ #define V_ipport_tcplastcount VNET(ipport_tcplastcount) static void in_pcbremlists(struct inpcb *inp); +#ifdef RATELIMIT +static void in_pcbdetach_txrtlmt(struct inpcb *inp); +#endif #ifdef INET static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, @@ -1140,6 +1145,10 @@ KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__)); +#ifdef RATELIMIT + if (inp->inp_txring_ifp != NULL) + in_pcbdetach_txrtlmt(inp); +#endif inp->inp_socket->so_pcb = NULL; inp->inp_socket = NULL; } @@ -2683,3 +2692,200 @@ db_print_inpcb(inp, "inpcb", 0); } #endif /* DDB */ + +#ifdef RATELIMIT +/* + * Modify existing TX rate limit on inp_txring_ifp and update + * inpcb info: + */ +static int +in_pcbmodify_txrtlmt(struct inpcb *inp, struct ifnet *ifp, + uint32_t max_pacing_rate) +{ + struct ifreq_txrtlmt req; + int error; + + INP_WLOCK_ASSERT(inp); + + req.txring_max_rate = max_pacing_rate; + req.txring_id = inp->inp_txring_id; + req.txring_flow_id = inp->inp_flowid; + req.txring_flow_type = inp->inp_flowtype; + + error = ifp->if_ioctl(ifp, SIOCSRATECTL, (caddr_t)&req); + + if (error) + return (error); + + inp->inp_txring_max_rate = max_pacing_rate; + return (0); +} + +/* + * Create a TX rate limit on ifp and attach it to inpcb: + */ 
+static int +in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp, + uint32_t max_pacing_rate) +{ + struct ifreq_txrtlmt req; + int error; + + INP_WLOCK_ASSERT(inp); + KASSERT(inp->inp_txring_ifp == NULL, + ("%s: inp_txring_ifp != NULL", __func__)); + + req.txring_max_rate = max_pacing_rate; + req.txring_flow_id = inp->inp_flowid; + req.txring_flow_type = inp->inp_flowtype; + + if_ref(ifp); + error = ifp->if_ioctl(ifp, SIOCARATECTL, (caddr_t)&req); + + if (error) { + if_rele(ifp); + return (error); + } + + inp->inp_txring_ifp = ifp; + inp->inp_txring_max_rate = max_pacing_rate; + inp->inp_txring_id = req.txring_id; + return (0); +} + +/* + * Remove TX rate limit from inp_txring_ifp and detach it from + * the inpcb: + */ +static void +in_pcbdetach_txrtlmt(struct inpcb *inp) +{ + struct ifreq_txrtlmt req; + struct ifnet *ifp; + + INP_WLOCK_ASSERT(inp); + + KASSERT(inp->inp_txring_ifp != NULL, + ("%s: inp->inp_txring_ifp == NULL", __func__)); + + ifp = inp->inp_txring_ifp; + req.txring_id = inp->inp_txring_id; + req.txring_flow_id = inp->inp_flowid; + req.txring_flow_type = inp->inp_flowtype; + + inp->inp_txring_ifp = NULL; + inp->inp_txring_id = 0; + inp->inp_txring_max_rate = 0; + + /* + * If the device was detached while we still had reference on + * ifp, we assume if_dead() was called and replaced callbacks + * with stubs. + */ + ifp->if_ioctl(ifp, SIOCDRATECTL, (caddr_t)&req); + if_rele(ifp); +} + +/* + * Track route changes and modify the TX rate limit hint in the given + * mbuf to match what the network driver expects. + */ +void +in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb) +{ + struct socket *socket; + uint32_t max_pacing_rate; + int error; + + if (inp == NULL) + return; + + socket = inp->inp_socket; + if (socket == NULL) + return; + + /* + * NOTE: The so_max_pacing_rate value is read unlocked, + * because atomic updates are not required since the variable + * is checked at every mbuf we send. 
It is assumed that the + * variable read itself will be atomic. + */ + max_pacing_rate = socket->so_max_pacing_rate; + + if (max_pacing_rate == 0 && inp->inp_txring_ifp == NULL) + return; + + /* + * NOTE: When attaching to a network interface a reference is + * made to ensure the network interface doesn't go away until + * all ratelimit connections are gone. The network interface + * pointers compared below represent valid network interfaces, + * except when comparing towards NULL. + */ + if (ifp != inp->inp_txring_ifp) { + bool wlocked = INP_WLOCKED(inp); + + if (!wlocked) { + /* + * NOTE: If the write locking fails, we need + * to bail out and use the non-ratelimited + * ring for the transmit until there is a new + * chance to get the write lock. + */ + if (!INP_TRY_UPGRADE(inp)) + return; + } + + if (inp->inp_txring_ifp != NULL) + in_pcbdetach_txrtlmt(inp); + + /* + * In order to utilize packet pacing with RSS, we need + * to wait until there is a valid RSS hash before we + * can proceed: + */ + if (inp->inp_flowtype == M_HASHTYPE_NONE) { + if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) { + if (!wlocked) + INP_DOWNGRADE(inp); + return; + } + /* typically UDP ends up here */ + inp->inp_flowid = mb->m_pkthdr.flowid; + inp->inp_flowtype = M_HASHTYPE_GET(mb); + } + + error = in_pcbattach_txrtlmt(inp, ifp, max_pacing_rate); + + if (!wlocked) + INP_DOWNGRADE(inp); + if (error) + return; + + } else if (inp->inp_txring_max_rate != max_pacing_rate) { + bool wlocked = INP_WLOCKED(inp); + + if (!wlocked) { + /* + * NOTE: If the write locking fails, use the + * current pacing rate until there is a new + * chance to write lock: + */ + if (!INP_TRY_UPGRADE(inp)) + goto done; + } + + error = in_pcbmodify_txrtlmt(inp, ifp, max_pacing_rate); + if (!wlocked) + INP_DOWNGRADE(inp); + if (error) + goto done; /* use old rate */ + } +done: + /* + * Update the flow ID and RSS hash for the transmitted mbuf. 
+ */ + mb->m_pkthdr.flowid = inp->inp_txring_id; + M_HASHTYPE_SET(mb, M_HASHTYPE_TXRTLMT); +} +#endif /* RATELIMIT */ Index: sys/netinet/ip_output.c =================================================================== --- sys/netinet/ip_output.c +++ sys/netinet/ip_output.c @@ -33,6 +33,7 @@ __FBSDID("$FreeBSD$"); #include "opt_inet.h" +#include "opt_ratelimit.h" #include "opt_ipsec.h" #include "opt_mbuf_stress_test.h" #include "opt_mpath.h" @@ -658,6 +659,10 @@ */ m_clrprotoflags(m); IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL); +#ifdef RATELIMIT + if (ifp->if_capabilities & IFCAP_TXRTLMT) + in_pcboutput_txrtlmt(inp, ifp, m); +#endif error = (*ifp->if_output)(ifp, m, (const struct sockaddr *)gw, ro); goto done; @@ -694,6 +699,10 @@ m_clrprotoflags(m); IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL); +#ifdef RATELIMIT + if (ifp->if_capabilities & IFCAP_TXRTLMT) + in_pcboutput_txrtlmt(inp, ifp, m); +#endif error = (*ifp->if_output)(ifp, m, (const struct sockaddr *)gw, ro); } else Index: sys/netinet6/ip6_output.c =================================================================== --- sys/netinet6/ip6_output.c +++ sys/netinet6/ip6_output.c @@ -65,6 +65,7 @@ #include "opt_inet.h" #include "opt_inet6.h" +#include "opt_ratelimit.h" #include "opt_ipsec.h" #include "opt_sctp.h" #include "opt_route.h" @@ -954,6 +955,10 @@ m->m_pkthdr.len); ifa_free(&ia6->ia_ifa); } +#ifdef RATELIMIT + if (ifp->if_capabilities & IFCAP_TXRTLMT) + in_pcboutput_txrtlmt(inp, ifp, m); +#endif error = nd6_output_ifp(ifp, origifp, m, dst, (struct route *)ro); goto done; @@ -1054,6 +1059,10 @@ counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } +#ifdef RATELIMIT + if (ifp->if_capabilities & IFCAP_TXRTLMT) + in_pcboutput_txrtlmt(inp, ifp, m); +#endif error = nd6_output_ifp(ifp, origifp, m, dst, (struct route *)ro); } else Index: sys/sys/mbuf.h =================================================================== --- sys/sys/mbuf.h +++ sys/sys/mbuf.h @@ -345,6 +345,7 @@ #define 
M_HASHTYPE_RSS_UDP_IPV6_EX M_HASHTYPE_HASH(10)/* IPv6 UDP 4-tuple + * ext hdrs */ +#define M_HASHTYPE_TXRTLMT 62 /* rate limited TX traffic */ #define M_HASHTYPE_OPAQUE 63 /* ordering, not affinity */ #define M_HASHTYPE_OPAQUE_HASH M_HASHTYPE_HASH(M_HASHTYPE_OPAQUE) /* ordering+hash, not affinity*/ Index: sys/sys/socket.h =================================================================== --- sys/sys/socket.h +++ sys/sys/socket.h @@ -158,6 +158,7 @@ #define SO_USER_COOKIE 0x1015 /* user cookie (dummynet etc.) */ #define SO_PROTOCOL 0x1016 /* get socket protocol (Linux name) */ #define SO_PROTOTYPE SO_PROTOCOL /* alias for SO_PROTOCOL (SunOS name) */ +#define SO_MAX_PACING_RATE 0x1017 /* set max TX pacing rate per socket */ #endif /* Index: sys/sys/socketvar.h =================================================================== --- sys/sys/socketvar.h +++ sys/sys/socketvar.h @@ -79,6 +79,7 @@ void *so_pcb; /* protocol control block */ struct vnet *so_vnet; /* (a) network stack instance */ struct protosw *so_proto; /* (a) protocol handle */ + uint32_t so_max_pacing_rate; /* (f) TX pacing rate info */ /* * Variables for connection queuing. * Socket where accepts occur is so_head in all subsidiary sockets. Index: sys/sys/sockio.h =================================================================== --- sys/sys/sockio.h +++ sys/sys/sockio.h @@ -133,4 +133,8 @@ #define SIOCGIFGMEMB _IOWR('i', 138, struct ifgroupreq) /* get members */ #define SIOCGIFXMEDIA _IOWR('i', 139, struct ifmediareq) /* get net xmedia */ +#define SIOCARATECTL _IOWR('i', 140, struct ifreq_txrtlmt) /* add TX rate limit */ +#define SIOCSRATECTL _IOWR('i', 141, struct ifreq_txrtlmt) /* set TX rate limit */ +#define SIOCDRATECTL _IOW('i', 142, struct ifreq_txrtlmt) /* del TX rate limit */ + #endif /* !_SYS_SOCKIO_H_ */