Index: sbin/ifconfig/ifconfig.8
===================================================================
--- sbin/ifconfig/ifconfig.8
+++ sbin/ifconfig/ifconfig.8
@@ -373,6 +373,8 @@
 and 802.11g
 .Pq Cm 11g
 operating modes.
+.It Cm txrtlmt
+Set if the driver supports tx rate limit.
 .It Cm inst Ar minst , Cm instance Ar minst
 Set the media instance to
 .Ar minst .
Index: sbin/ifconfig/ifconfig.c
===================================================================
--- sbin/ifconfig/ifconfig.c
+++ sbin/ifconfig/ifconfig.c
@@ -1069,7 +1069,7 @@
 "\020\1RXCSUM\2TXCSUM\3NETCONS\4VLAN_MTU\5VLAN_HWTAGGING\6JUMBO_MTU\7POLLING" \
 "\10VLAN_HWCSUM\11TSO4\12TSO6\13LRO\14WOL_UCAST\15WOL_MCAST\16WOL_MAGIC" \
 "\17TOE4\20TOE6\21VLAN_HWFILTER\23VLAN_HWTSO\24LINKSTATE\25NETMAP" \
-"\26RXCSUM_IPV6\27TXCSUM_IPV6"
+"\26RXCSUM_IPV6\27TXCSUM_IPV6\31TXRTLMT"
 
 /*
  * Print the status of the interface.  If an address family was
@@ -1377,6 +1377,8 @@
 	DEF_CMD("-wol_mcast",	-IFCAP_WOL_MCAST,	setifcap),
 	DEF_CMD("wol_magic",	IFCAP_WOL_MAGIC,	setifcap),
 	DEF_CMD("-wol_magic",	-IFCAP_WOL_MAGIC,	setifcap),
+	DEF_CMD("txrtlmt",	IFCAP_TXRTLMT,	setifcap),
+	DEF_CMD("-txrtlmt",	-IFCAP_TXRTLMT,	setifcap),
 	DEF_CMD("normal",	-IFF_LINK0,	setifflags),
 	DEF_CMD("compress",	IFF_LINK0,	setifflags),
 	DEF_CMD("noicmp",	IFF_LINK1,	setifflags),
Index: sys/conf/NOTES
===================================================================
--- sys/conf/NOTES
+++ sys/conf/NOTES
@@ -591,6 +591,8 @@
 options 	INET			#Internet communications protocols
 options 	INET6			#IPv6 communications protocols
 
+options 	RATELIMIT		#rate limit support.
+
 options 	ROUTETABLES=2		# allocated fibs up to 65536. default is 1.
 					# but that would be a bad idea as they are large.
 
Index: sys/conf/config.mk
===================================================================
--- sys/conf/config.mk
+++ sys/conf/config.mk
@@ -10,6 +10,10 @@
 .if !defined(KERNBUILDDIR)
 opt_bpf.h:
 	echo "#define DEV_BPF 1" > ${.TARGET}
+.if ${MK_RATELIMIT} != "no"
+opt_ratelimit.h:
+	@echo "#define RATELIMIT 1" > ${.TARGET}
+.endif
 .if ${MK_INET_SUPPORT} != "no"
 opt_inet.h:
 	@echo "#define INET 1" > ${.TARGET}
Index: sys/conf/kern.opts.mk
===================================================================
--- sys/conf/kern.opts.mk
+++ sys/conf/kern.opts.mk
@@ -46,6 +46,7 @@
 __DEFAULT_NO_OPTIONS = \
     EISA \
     NAND \
+    RATELIMIT \
     OFED
 
 # Some options are totally broken on some architectures. We disable
Index: sys/conf/options
===================================================================
--- sys/conf/options
+++ sys/conf/options
@@ -404,6 +404,7 @@
 BOOTP_WIRED_TO		opt_bootp.h
 DEVICE_POLLING
 DUMMYNET		opt_ipdn.h
+RATELIMIT
 INET			opt_inet.h
 INET6			opt_inet6.h
 IPDIVERT
Index: sys/kern/uipc_socket.c
===================================================================
--- sys/kern/uipc_socket.c
+++ sys/kern/uipc_socket.c
@@ -103,6 +103,7 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include "opt_ratelimit.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_compat.h"
@@ -2668,6 +2669,18 @@
 #endif
 			break;
 
+		case SO_MAX_PACING_RATE:
+#ifdef RATELIMIT
+			error = sooptcopyin(sopt, &val32, sizeof(val32),
+			    sizeof(val32));
+			if (error)
+				goto bad;
+			so->so_max_pacing_rate = val32;
+#else
+			error = EOPNOTSUPP;
+#endif
+			break;
+
 		default:
 			if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
 				error = hhook_run_socket(so, sopt,
@@ -2723,6 +2736,9 @@
 #ifdef MAC
 	struct mac extmac;
 #endif
+#ifdef RATELIMIT
+	uint32_t val32;
+#endif
 
 	CURVNET_SET(so->so_vnet);
 	error = 0;
@@ -2843,6 +2859,15 @@
 #endif
 			break;
 
+		case SO_MAX_PACING_RATE:
+#ifdef RATELIMIT
+			val32 = so->so_max_pacing_rate;
+			error = sooptcopyout(sopt, &val32, sizeof(val32));
+#else
+			error = EOPNOTSUPP;
+#endif
+			break;
+
 		case SO_LISTENQLIMIT:
 			optval = so->so_qlimit;
 			goto integer;
Index: sys/net/if.h
===================================================================
--- sys/net/if.h
+++ sys/net/if.h
@@ -239,6 +239,7 @@
 #define	IFCAP_RXCSUM_IPV6	0x200000  /* can offload checksum on IPv6 RX */
 #define	IFCAP_TXCSUM_IPV6	0x400000  /* can offload checksum on IPv6 TX */
 #define	IFCAP_HWSTATS		0x800000 /* manages counters internally */
+#define	IFCAP_TXRTLMT		0x1000000 /* hardware supports TX rate limit */
 
 #define IFCAP_HWCSUM_IPV6	(IFCAP_RXCSUM_IPV6 | IFCAP_TXCSUM_IPV6)
 
@@ -371,6 +372,14 @@
 };
 
 /*
+ * Request argument used to add, modify and delete a tx rate limit.
+ */
+struct ifreq_txrtlmt {
+	uint32_t txringid_max_rate;	/* requested rate for the ring */
+	uint32_t txringid;	/* ring id: read back on add, passed in on set/del */
+};
+
+/*
  * Interface request structure used for socket
  * ioctl's.  All interface ioctl's must have parameter
  * definitions which begin with ifr_name.  The
Index: sys/net/if.c
===================================================================
--- sys/net/if.c
+++ sys/net/if.c
@@ -2601,6 +2601,17 @@
 	ifr = (struct ifreq *)data;
 
 	switch (cmd) {
+	/*
+	 * The tx rate limit requests are for in-kernel use only; they are
+	 * issued directly through if_ioctl.  Reject them here so that a
+	 * request arriving on this path never reaches the driver.
+	 */
+	case SIOCARATECTL:
+	case SIOCSRATECTL:
+	case SIOCDRATECTL:
+		CURVNET_RESTORE();
+		return (EOPNOTSUPP);
+
 #ifdef VIMAGE
 	case SIOCSIFRVNET:
 		error = priv_check(td, PRIV_NET_SETIFVNET);
Index: sys/netinet/in_pcb.h
===================================================================
--- sys/netinet/in_pcb.h
+++ sys/netinet/in_pcb.h
@@ -202,6 +202,9 @@
 	uint32_t inp_flowid;		/* (x) flow id / queue id */
 	u_int	inp_refcount;		/* (i) refcount */
 	void	*inp_pspare[5];		/* (x) route caching / general use */
+	struct ifnet *inp_txringid_ifp;	/* (i) ifp of ring id */
+	uint32_t inp_txringid_max_rate;	/* (i) driver ring id rate */
+	uint32_t inp_txringid;		/* (i) driver ring id */
 	uint32_t inp_flowtype;		/* (x) M_HASHTYPE value */
 	uint32_t inp_rss_listen_bucket;	/* (x) overridden RSS listen bucket */
 	u_int	inp_ispare[4];		/* (x) route caching / user cookie /
@@ -728,6 +731,11 @@
 struct sockaddr *
 	in_sockaddr(in_port_t port, struct in_addr *addr);
 void	in_pcbsosetlabel(struct socket *so);
+int	in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate);
+int	in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
+	    uint32_t max_pacing_rate);
+void	in_pcbdetach_txrtlmt(struct inpcb *inp);
+int	in_pcb_getwlock(struct inpcb *inp, bool *excl);
 #endif /* _KERNEL */
 
 #endif /* !_NETINET_IN_PCB_H_ */
NOTE(review): in the second hunk of sys/netinet/in_pcb.c below every
"#include" line has lost its header name, so that hunk cannot apply as
written.  Restore the names from the original patch.  The added include
is presumably <sys/sockio.h> (the new code uses SIOC[ASD]RATECTL) - confirm.
Index: sys/netinet/in_pcb.c
===================================================================
--- sys/netinet/in_pcb.c
+++ sys/netinet/in_pcb.c
@@ -39,6 +39,7 @@
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
+#include "opt_ratelimit.h"
 #include "opt_ipsec.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
@@ -55,6 +56,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -1138,6 +1140,11 @@
 
 	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
 
+#ifdef RATELIMIT
+	if (inp->inp_txringid_ifp != NULL)
+		in_pcbdetach_txrtlmt(inp);
+#endif
+
 	inp->inp_socket->so_pcb = NULL;
 	inp->inp_socket = NULL;
 }
@@ -2646,3 +2653,146 @@
 	db_print_inpcb(inp, "inpcb", 0);
 }
 #endif /* DDB */
+
+
+#ifdef RATELIMIT
+/*
+ * Modify the existing tx rate limit on inp_txringid_ifp.  The new rate is
+ * recorded in the inpcb only if the driver accepts it.  Returns 0 on
+ * success or the error returned by the driver's if_ioctl.
+ */
+int
+in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
+{
+	struct ifreq_txrtlmt req;
+	int error;
+
+	INP_WLOCK_ASSERT(inp);
+	KASSERT(inp->inp_txringid_ifp != NULL,
+	    ("%s: inp_txringid_ifp == NULL", __func__));
+
+	req.txringid_max_rate = max_pacing_rate;
+	req.txringid = inp->inp_txringid;
+	error = inp->inp_txringid_ifp->if_ioctl(inp->inp_txringid_ifp,
+	    SIOCSRATECTL, (caddr_t)&req);
+
+	if (error)
+		return (error);
+
+	inp->inp_txringid_max_rate = max_pacing_rate;
+	return (0);
+}
+
+/*
+ * Create a tx rate limit on ifp and attach it to inp.  A reference on ifp
+ * is held for as long as the rate limit stays attached.  Returns 0 on
+ * success or the error returned by the driver's if_ioctl.
+ */
+int
+in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
+    uint32_t max_pacing_rate)
+{
+	struct ifreq_txrtlmt req;
+	int error;
+
+	INP_WLOCK_ASSERT(inp);
+	KASSERT(inp->inp_txringid_ifp == NULL,
+	    ("%s: inp_txringid_ifp != NULL", __func__));
+
+	req.txringid_max_rate = max_pacing_rate;
+	/* Don't hand uninitialized stack contents to the driver. */
+	req.txringid = 0;
+
+	if_ref(ifp);
+	error = ifp->if_ioctl(ifp, SIOCARATECTL, (caddr_t)&req);
+
+	if (error) {
+		if_rele(ifp);
+		return (error);
+	}
+
+	inp->inp_txringid_ifp = ifp;
+	inp->inp_txringid_max_rate = max_pacing_rate;
+	inp->inp_txringid = req.txringid;
+	return (0);
+}
+
+/*
+ * Remove the tx rate limit from inp_txringid_ifp, detach it from the
+ * inpcb and drop the interface reference taken at attach time.
+ */
+void
+in_pcbdetach_txrtlmt(struct inpcb *inp)
+{
+	struct ifreq_txrtlmt req;
+	struct ifnet *ifp;
+
+	INP_WLOCK_ASSERT(inp);
+	KASSERT(inp->inp_txringid_ifp != NULL,
+	    ("%s: inp->inp_txringid_ifp == NULL", __func__));
+
+	ifp = inp->inp_txringid_ifp;
+	req.txringid = inp->inp_txringid;
+	/* Don't hand uninitialized stack contents to the driver. */
+	req.txringid_max_rate = 0;
+
+	inp->inp_txringid_ifp = NULL;
+	inp->inp_txringid = 0;
+	inp->inp_txringid_max_rate = 0;
+
+	/*
+	 * The result of the ioctl is not checked.  If the device went away
+	 * (module removed) while we still held a reference on ifp, we
+	 * assume if_dead was called and replaced the callbacks with
+	 * stubs.
+	 */
+	ifp->if_ioctl(ifp, SIOCDRATECTL, (caddr_t)&req);
+	if_rele(ifp);
+}
+
+/*
+ * Make sure the caller holds the write lock on the inpcb.
+ *
+ * Returns:
+ *   0	The write lock is held and the inpcb lock was never dropped:
+ *	either the caller already held the write lock (*excl is left
+ *	unchanged) or the read lock was upgraded in place (*excl is
+ *	set to false).
+ *   1	The read lock was dropped and the write lock acquired; *excl
+ *	is set to false.  The callers added by this patch restart
+ *	their checks in this case.
+ *  -1	in_pcbrele_wlocked() returned non-zero after the write lock was
+ *	acquired (the original author's note: in_pcbfree() was called
+ *	in the meantime); *excl is set to false and INP_DOWNGRADE() is
+ *	applied before returning.
+ *
+ * NOTE(review): if in_pcbrele_wlocked() returns non-zero only after it
+ * has unlocked and freed the inpcb, the INP_DOWNGRADE() in the -1 path
+ * touches freed memory and the caller must not use inp again - confirm
+ * against in_pcbrele_wlocked().
+ */
+int
+in_pcb_getwlock(struct inpcb *inp, bool *excl)
+{
+	if (!INP_WLOCKED(inp)) {
+		*excl = false;
+		/*
+		 * Not write-locked, so the caller is assumed to hold the
+		 * shared lock.  Try to upgrade it in place; if that fails,
+		 * take a reference, drop the shared lock and acquire the
+		 * exclusive lock.
+		 */
+		if (!INP_TRY_UPGRADE(inp)) {
+			in_pcbref(inp);
+			INP_RUNLOCK(inp);
+			INP_WLOCK(inp);
+			if (in_pcbrele_wlocked(inp)) {
+				INP_DOWNGRADE(inp);
+				return (-1);
+			}
+			return (1);
+		}
+	}
+	return (0);
+}
+#endif /* RATELIMIT */
Index: sys/netinet/ip_output.c
===================================================================
--- sys/netinet/ip_output.c
+++ sys/netinet/ip_output.c
@@ -32,6 +32,7 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include "opt_ratelimit.h"
 #include "opt_inet.h"
 #include "opt_ipfw.h"
 #include "opt_ipsec.h"
@@ -231,6 +232,11 @@
 #ifdef IPSEC
 	int no_route_but_check_spd = 0;
 #endif
+#ifdef RATELIMIT
+	uint32_t max_pacing_rate = 0;
+	bool excl = true;
+#endif
+
 	M_ASSERTPKTHDR(m);
 
 	if (inp != NULL) {
@@ -240,6 +246,16 @@
 			m->m_pkthdr.flowid = inp->inp_flowid;
 			M_HASHTYPE_SET(m, inp->inp_flowtype);
 		}
+		/*
+		 * so_max_pacing_rate is read here without taking a lock.
+		 * The value is sampled again for every mbuf we send, so
+		 * even if this read returns a stale value, a later packet
+		 * will pick up the right one.
+		 */
+#ifdef RATELIMIT
+		if (inp->inp_socket != NULL)
+			max_pacing_rate = inp->inp_socket->so_max_pacing_rate;
+#endif
 	}
 
 	if (ro == NULL) {
@@ -598,6 +614,74 @@
 	}
 #endif
 
+#ifdef RATELIMIT
+	/*
+	 * If the interface supports hardware pacing, tag the mbuf with
+	 * the flowid and hash type of the rate limited tx ring.  When
+	 * inp->inp_socket == NULL the connection is being torn down
+	 * and its last packets are sent without pacing.
+	 */
+check_pacing:
+	if (inp != NULL && inp->inp_socket != NULL &&
+	    (max_pacing_rate || inp->inp_txringid_ifp != NULL) &&
+	    (ifp->if_capabilities & IFCAP_TXRTLMT)) {
+
+		/*
+		 * We hold a reference on inp_txringid_ifp, so a new ifp
+		 * cannot be created at the same address as the old one.
+		 * This ensures that if we transmit on an interface whose
+		 * module is unloaded and then loaded again, we never
+		 * transmit on an invalid ring of the new ifp: the ring
+		 * on the old ifp is deleted first, and a new one is then
+		 * created on the new ifp.
+		 */
+		if (ifp != inp->inp_txringid_ifp) {
+
+			error = in_pcb_getwlock(inp, &excl);
+			if (error == -1) {
+				/* -1 would read as ERESTART. */
+				error = ECONNRESET;
+				goto bad;
+			}
+			if (error)
+				goto check_pacing;
+			if (inp->inp_txringid_ifp != NULL)
+				in_pcbdetach_txrtlmt(inp);
+
+			error = in_pcbattach_txrtlmt(inp, ifp,
+			    max_pacing_rate);
+			if (error) {
+				inp->inp_socket->so_max_pacing_rate = 0;
+				goto done_pacing;
+			}
+		} else if (inp->inp_txringid_max_rate != max_pacing_rate) {
+
+			error = in_pcb_getwlock(inp, &excl);
+			if (error == -1) {
+				/* -1 would read as ERESTART. */
+				error = ECONNRESET;
+				goto bad;
+			}
+			if (error)
+				goto check_pacing;
+
+			error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
+			if (error) {
+				inp->inp_socket->so_max_pacing_rate = 0;
+				goto done_pacing;
+			}
+		}
+
+		m->m_pkthdr.flowid = inp->inp_txringid;
+		M_HASHTYPE_SET(m, M_HASHTYPE_TXRTLMT);
+	}
+done_pacing:
+	if (!excl) {
+		excl = true;
+		INP_DOWNGRADE(inp);
+	}
+#endif /* RATELIMIT */
+
 	/*
 	 * If small enough for interface, or the interface will take
 	 * care of the fragmentation for us, we can just send directly.
Index: sys/netinet6/ip6_output.c
===================================================================
--- sys/netinet6/ip6_output.c
+++ sys/netinet6/ip6_output.c
@@ -63,6 +63,7 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include "opt_ratelimit.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipfw.h"
@@ -318,6 +319,10 @@
 	uint32_t fibnum;
 	struct m_tag *fwd_tag = NULL;
 	uint32_t id;
+#ifdef RATELIMIT
+	uint32_t max_pacing_rate = 0;
+	bool excl = true;
+#endif
 
 	ip6 = mtod(m, struct ip6_hdr *);
 	if (ip6 == NULL) {
@@ -332,6 +337,16 @@
 			m->m_pkthdr.flowid = inp->inp_flowid;
 			M_HASHTYPE_SET(m, inp->inp_flowtype);
 		}
+		/*
+		 * so_max_pacing_rate is read here without taking a lock.
+		 * The value is sampled again for every mbuf we send, so
+		 * even if this read returns a stale value, a later packet
+		 * will pick up the right one.
+		 */
+#ifdef RATELIMIT
+		if (inp->inp_socket != NULL)
+			max_pacing_rate = inp->inp_socket->so_max_pacing_rate;
+#endif
 	}
 
 	finaldst = ip6->ip6_dst;
@@ -920,6 +935,74 @@
 		goto bad;
 	}
 
+#ifdef RATELIMIT
+	/*
+	 * If the interface supports hardware pacing, tag the mbuf with
+	 * the flowid and hash type of the rate limited tx ring.  When
+	 * inp->inp_socket == NULL the connection is being torn down
+	 * and its last packets are sent without pacing.
+	 */
+check_pacing:
+	if (inp != NULL && inp->inp_socket != NULL &&
+	    (max_pacing_rate || inp->inp_txringid_ifp != NULL) &&
+	    (ifp->if_capabilities & IFCAP_TXRTLMT)) {
+
+		/*
+		 * We hold a reference on inp_txringid_ifp, so a new ifp
+		 * cannot be created at the same address as the old one.
+		 * This ensures that if we transmit on an interface whose
+		 * module is unloaded and then loaded again, we never
+		 * transmit on an invalid ring of the new ifp: the ring
+		 * on the old ifp is deleted first, and a new one is then
+		 * created on the new ifp.
+		 */
+		if (ifp != inp->inp_txringid_ifp) {
+
+			error = in_pcb_getwlock(inp, &excl);
+			if (error == -1) {
+				/* -1 would read as ERESTART. */
+				error = ECONNRESET;
+				goto bad;
+			}
+			if (error)
+				goto check_pacing;
+			if (inp->inp_txringid_ifp != NULL)
+				in_pcbdetach_txrtlmt(inp);
+
+			error = in_pcbattach_txrtlmt(inp, ifp,
+			    max_pacing_rate);
+			if (error) {
+				inp->inp_socket->so_max_pacing_rate = 0;
+				goto done_pacing;
+			}
+		} else if (inp->inp_txringid_max_rate != max_pacing_rate) {
+
+			error = in_pcb_getwlock(inp, &excl);
+			if (error == -1) {
+				/* -1 would read as ERESTART. */
+				error = ECONNRESET;
+				goto bad;
+			}
+			if (error)
+				goto check_pacing;
+
+			error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
+			if (error) {
+				inp->inp_socket->so_max_pacing_rate = 0;
+				goto done_pacing;
+			}
+		}
+
+		m->m_pkthdr.flowid = inp->inp_txringid;
+		M_HASHTYPE_SET(m, M_HASHTYPE_TXRTLMT);
+	}
+done_pacing:
+	if (!excl) {
+		excl = true;
+		INP_DOWNGRADE(inp);
+	}
+#endif /* RATELIMIT */
+
 	/*
 	 * transmit packet without fragmentation
 	 */
Index: sys/sys/mbuf.h
===================================================================
--- sys/sys/mbuf.h
+++ sys/sys/mbuf.h
@@ -304,6 +304,7 @@
 #define	M_HASHTYPE_RSS_UDP_IPV4_EX	8	/* IPv4 UDP 4-tuple + ext hdrs */
 #define	M_HASHTYPE_RSS_UDP_IPV6		9	/* IPv6 UDP 4-tuple */
 #define	M_HASHTYPE_RSS_UDP_IPV6_EX	10	/* IPv6 UDP 4-tuple + ext hdrs */
+#define	M_HASHTYPE_TXRTLMT		11	/* rate limit tx traffic */
 
 #define	M_HASHTYPE_OPAQUE		255	/* ordering, not affinity */
 
Index: sys/sys/socket.h
===================================================================
--- sys/sys/socket.h
+++ sys/sys/socket.h
@@ -157,6 +157,7 @@
 #define	SO_SETFIB	0x1014		/* use this FIB to route */
 #define	SO_USER_COOKIE	0x1015		/* user cookie (dummynet etc.) */
 #define	SO_PROTOCOL	0x1016		/* get socket protocol (Linux name) */
+#define	SO_MAX_PACING_RATE 0x1017	/* set max pacing rate per socket */
 #define	SO_PROTOTYPE	SO_PROTOCOL	/* alias for SO_PROTOCOL (SunOS name) */
 #endif
 
Index: sys/sys/socketvar.h
===================================================================
--- sys/sys/socketvar.h
+++ sys/sys/socketvar.h
@@ -80,6 +80,7 @@
 	void	*so_pcb;		/* protocol control block */
 	struct	vnet *so_vnet;		/* (a) network stack instance */
 	struct	protosw *so_proto;	/* (a) protocol handle */
+	uint32_t so_max_pacing_rate;	/* (f) pacing rate info */
 /*
  * Variables for connection queuing.
  * Socket where accepts occur is so_head in all subsidiary sockets.
Index: sys/sys/sockio.h
===================================================================
--- sys/sys/sockio.h
+++ sys/sys/sockio.h
@@ -133,4 +133,8 @@
 #define	SIOCGIFGMEMB	_IOWR('i', 138, struct ifgroupreq) /* get members */
 #define	SIOCGIFXMEDIA	_IOWR('i', 139, struct ifmediareq) /* get net xmedia */
 
+#define	SIOCARATECTL	_IOWR('i', 140, struct ifreq_txrtlmt) /* add tx rate limit */
+#define	SIOCSRATECTL	_IOWR('i', 141, struct ifreq_txrtlmt) /* set tx rate limit */
+#define	SIOCDRATECTL	_IOW('i', 142, struct ifreq_txrtlmt) /* del tx rate limit */
+
 #endif /* !_SYS_SOCKIO_H_ */