Index: lib/libc/sys/getsockopt.2 =================================================================== --- lib/libc/sys/getsockopt.2 +++ lib/libc/sys/getsockopt.2 @@ -28,7 +28,7 @@ .\" @(#)getsockopt.2 8.4 (Berkeley) 5/2/95 .\" $FreeBSD$ .\" -.Dd April 5, 2013 +.Dd October 11, 2016 .Dt GETSOCKOPT 2 .Os .Sh NAME @@ -187,6 +187,7 @@ .It Dv SO_LISTENQLEN Ta "get complete queue length of the socket (get only)" .It Dv SO_LISTENINCQLEN Ta "get incomplete queue length of the socket (get only)" .It Dv SO_USER_COOKIE Ta "set the 'so_user_cookie' value for the socket (uint32_t, set only)" +.It Dv SO_MAX_PACING_RATE Ta "set the maximum transmit rate in bytes per second for the socket" .El .Pp .Dv SO_DEBUG @@ -496,6 +497,11 @@ returns the number of unaccepted complete connections. .Dv SO_LISTENINCQLEN returns the number of unaccepted incomplete connections. +.Pp +.Dv SO_MAX_PACING_RATE +instructs the socket and underlying network adapter layers that the +transmit rate should be limited to the given unsigned 32-bit value in +bytes per second. .Sh RETURN VALUES .Rv -std .Sh ERRORS Index: sbin/ifconfig/ifconfig.8 =================================================================== --- sbin/ifconfig/ifconfig.8 +++ sbin/ifconfig/ifconfig.8 @@ -28,7 +28,7 @@ .\" From: @(#)ifconfig.8 8.3 (Berkeley) 1/5/94 .\" $FreeBSD$ .\" -.Dd September 17, 2016 +.Dd October 11, 2016 .Dt IFCONFIG 8 .Os .Sh NAME @@ -460,6 +460,8 @@ and 802.11g .Pq Cm 11g operating modes. +.It Cm txrtlmt +Set if the driver supports TX rate limiting. .It Cm inst Ar minst , Cm instance Ar minst Set the media instance to .Ar minst . 
Index: sbin/ifconfig/ifconfig.c =================================================================== --- sbin/ifconfig/ifconfig.c +++ sbin/ifconfig/ifconfig.c @@ -1145,7 +1145,7 @@ "\020\1RXCSUM\2TXCSUM\3NETCONS\4VLAN_MTU\5VLAN_HWTAGGING\6JUMBO_MTU\7POLLING" \ "\10VLAN_HWCSUM\11TSO4\12TSO6\13LRO\14WOL_UCAST\15WOL_MCAST\16WOL_MAGIC" \ "\17TOE4\20TOE6\21VLAN_HWFILTER\23VLAN_HWTSO\24LINKSTATE\25NETMAP" \ -"\26RXCSUM_IPV6\27TXCSUM_IPV6" +"\26RXCSUM_IPV6\27TXCSUM_IPV6\31TXRTLMT" /* * Print the status of the interface. If an address family was @@ -1453,6 +1453,8 @@ DEF_CMD("-wol_mcast", -IFCAP_WOL_MCAST, setifcap), DEF_CMD("wol_magic", IFCAP_WOL_MAGIC, setifcap), DEF_CMD("-wol_magic", -IFCAP_WOL_MAGIC, setifcap), + DEF_CMD("txrtlmt", IFCAP_TXRTLMT, setifcap), + DEF_CMD("-txrtlmt", -IFCAP_TXRTLMT, setifcap), DEF_CMD("normal", -IFF_LINK0, setifflags), DEF_CMD("compress", IFF_LINK0, setifflags), DEF_CMD("noicmp", IFF_LINK1, setifflags), Index: sys/conf/NOTES =================================================================== --- sys/conf/NOTES +++ sys/conf/NOTES @@ -619,6 +619,8 @@ options INET #Internet communications protocols options INET6 #IPv6 communications protocols +options RATELIMIT # TX rate limiting support + options ROUTETABLES=2 # allocated fibs up to 65536. default is 1. # but that would be a bad idea as they are large. 
Index: sys/conf/config.mk =================================================================== --- sys/conf/config.mk +++ sys/conf/config.mk @@ -19,6 +19,10 @@ opt_inet6.h: @echo "#define INET6 1" > ${.TARGET} .endif +.if ${MK_RATELIMIT} != "no" +opt_ratelimit.h: + @echo "#define RATELIMIT 1" > ${.TARGET} +.endif .if ${MK_EISA} != "no" opt_eisa.h: @echo "#define DEV_EISA 1" > ${.TARGET} Index: sys/conf/kern.opts.mk =================================================================== --- sys/conf/kern.opts.mk +++ sys/conf/kern.opts.mk @@ -47,7 +47,8 @@ EISA \ EXTRA_TCP_STACKS \ NAND \ - OFED + OFED \ + RATELIMIT # Some options are totally broken on some architectures. We disable # them. If you need to enable them on an experimental basis, you Index: sys/conf/options =================================================================== --- sys/conf/options +++ sys/conf/options @@ -411,6 +411,7 @@ BOOTP_WIRED_TO opt_bootp.h DEVICE_POLLING DUMMYNET opt_ipdn.h +RATELIMIT opt_ratelimit.h INET opt_inet.h INET6 opt_inet6.h IPDIVERT Index: sys/kern/uipc_socket.c =================================================================== --- sys/kern/uipc_socket.c +++ sys/kern/uipc_socket.c @@ -105,6 +105,7 @@ #include "opt_inet.h" #include "opt_inet6.h" +#include "opt_ratelimit.h" #include "opt_compat.h" #include @@ -2679,6 +2680,18 @@ #endif break; + case SO_MAX_PACING_RATE: +#ifdef RATELIMIT + error = sooptcopyin(sopt, &val32, sizeof(val32), + sizeof(val32)); + if (error) + goto bad; + so->so_max_pacing_rate = val32; +#else + error = EOPNOTSUPP; +#endif + break; + default: if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) error = hhook_run_socket(so, sopt, @@ -2734,6 +2747,9 @@ #ifdef MAC struct mac extmac; #endif +#ifdef RATELIMIT + uint32_t val32; +#endif CURVNET_SET(so->so_vnet); error = 0; @@ -2866,6 +2882,15 @@ optval = so->so_incqlen; goto integer; + case SO_MAX_PACING_RATE: +#ifdef RATELIMIT + val32 = so->so_max_pacing_rate; + error = sooptcopyout(sopt, &val32, 
sizeof(val32)); +#else + error = EOPNOTSUPP; +#endif + break; + default: if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) error = hhook_run_socket(so, sopt, Index: sys/modules/if_lagg/Makefile =================================================================== --- sys/modules/if_lagg/Makefile +++ sys/modules/if_lagg/Makefile @@ -2,6 +2,6 @@ .PATH: ${.CURDIR}/../../net KMOD= if_lagg -SRCS= if_lagg.c ieee8023ad_lacp.c opt_inet.h opt_inet6.h +SRCS= if_lagg.c ieee8023ad_lacp.c opt_inet.h opt_inet6.h opt_ratelimit.h .include Index: sys/modules/if_vlan/Makefile =================================================================== --- sys/modules/if_vlan/Makefile +++ sys/modules/if_vlan/Makefile @@ -4,6 +4,6 @@ KMOD= if_vlan SRCS= if_vlan.c -SRCS+= opt_inet.h opt_vlan.h +SRCS+= opt_inet.h opt_vlan.h opt_ratelimit.h .include Index: sys/net/ieee8023ad_lacp.h =================================================================== --- sys/net/ieee8023ad_lacp.h +++ sys/net/ieee8023ad_lacp.h @@ -284,6 +284,9 @@ struct mbuf *lacp_input(struct lagg_port *, struct mbuf *); struct lagg_port *lacp_select_tx_port(struct lagg_softc *, struct mbuf *); +#ifdef RATELIMIT +struct lagg_port *lacp_select_tx_port_by_hash(struct lagg_softc *, uint32_t); +#endif void lacp_attach(struct lagg_softc *); void lacp_detach(void *); void lacp_init(struct lagg_softc *); Index: sys/net/ieee8023ad_lacp.c =================================================================== --- sys/net/ieee8023ad_lacp.c +++ sys/net/ieee8023ad_lacp.c @@ -30,6 +30,8 @@ #include __FBSDID("$FreeBSD$"); +#include "opt_ratelimit.h" + #include #include #include @@ -853,6 +855,35 @@ return (lp->lp_lagg); } + +#ifdef RATELIMIT +struct lagg_port * +lacp_select_tx_port_by_hash(struct lagg_softc *sc, uint32_t flowid) +{ + struct lacp_softc *lsc = LACP_SOFTC(sc); + struct lacp_portmap *pm; + struct lacp_port *lp; + uint32_t hash; + + if (__predict_false(lsc->lsc_suppress_distributing)) { + LACP_DPRINTF((NULL, "%s: waiting transit\n", 
__func__)); + return (NULL); + } + + pm = &lsc->lsc_pmap[lsc->lsc_activemap]; + if (pm->pm_count == 0) { + LACP_DPRINTF((NULL, "%s: no active aggregator\n", __func__)); + return (NULL); + } + + hash = flowid >> sc->flowid_shift; + hash %= pm->pm_count; + lp = pm->pm_map[hash]; + + return (lp->lp_lagg); +} +#endif + /* * lacp_suppress_distributing: drop transmit packets for a while * to preserve packet ordering. Index: sys/net/if.h =================================================================== --- sys/net/if.h +++ sys/net/if.h @@ -239,6 +239,7 @@ #define IFCAP_RXCSUM_IPV6 0x200000 /* can offload checksum on IPv6 RX */ #define IFCAP_TXCSUM_IPV6 0x400000 /* can offload checksum on IPv6 TX */ #define IFCAP_HWSTATS 0x800000 /* manages counters internally */ +#define IFCAP_TXRTLMT 0x1000000 /* hardware supports TX rate limiting */ #define IFCAP_HWCSUM_IPV6 (IFCAP_RXCSUM_IPV6 | IFCAP_TXCSUM_IPV6) Index: sys/net/if_dead.c =================================================================== --- sys/net/if_dead.c +++ sys/net/if_dead.c @@ -100,6 +100,30 @@ return (0); } +static int +ifdead_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params, + struct m_snd_tag **ppmt) +{ + return (EOPNOTSUPP); +} + +static int +ifdead_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params) +{ + return (EOPNOTSUPP); +} + +static int +ifdead_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params) +{ + return (EOPNOTSUPP); +} + +static void +ifdead_snd_tag_free(struct m_snd_tag *pmt) +{ +} + void if_dead(struct ifnet *ifp) { @@ -112,4 +136,8 @@ ifp->if_qflush = ifdead_qflush; ifp->if_transmit = ifdead_transmit; ifp->if_get_counter = ifdead_get_counter; + ifp->if_snd_tag_alloc = ifdead_snd_tag_alloc; + ifp->if_snd_tag_modify = ifdead_snd_tag_modify; + ifp->if_snd_tag_query = ifdead_snd_tag_query; + ifp->if_snd_tag_free = ifdead_snd_tag_free; } Index: sys/net/if_lagg.c 
=================================================================== --- sys/net/if_lagg.c +++ sys/net/if_lagg.c @@ -23,6 +23,7 @@ #include "opt_inet.h" #include "opt_inet6.h" +#include "opt_ratelimit.h" #include #include @@ -118,6 +119,11 @@ static void lagg_init(void *); static void lagg_stop(struct lagg_softc *); static int lagg_ioctl(struct ifnet *, u_long, caddr_t); +#ifdef RATELIMIT +static int lagg_snd_tag_alloc(struct ifnet *, + union if_snd_tag_alloc_params *, + struct m_snd_tag **); +#endif static int lagg_ether_setmulti(struct lagg_softc *); static int lagg_ether_cmdmulti(struct lagg_port *, int); static int lagg_setflag(struct lagg_port *, int, int, @@ -503,7 +509,12 @@ ifp->if_ioctl = lagg_ioctl; ifp->if_get_counter = lagg_get_counter; ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST; +#ifdef RATELIMIT + ifp->if_snd_tag_alloc = lagg_snd_tag_alloc; + ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS | IFCAP_TXRTLMT; +#else ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS; +#endif /* * Attach as an ordinary ethernet device, children will be attached @@ -1549,6 +1560,52 @@ return (error); } +#ifdef RATELIMIT +static int +lagg_snd_tag_alloc(struct ifnet *ifp, + union if_snd_tag_alloc_params *params, + struct m_snd_tag **ppmt) +{ + struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; + struct lagg_port *lp; + struct lagg_lb *lb; + uint32_t p; + + switch (sc->sc_proto) { + case LAGG_PROTO_FAILOVER: + lp = lagg_link_active(sc, sc->sc_primary); + break; + case LAGG_PROTO_LOADBALANCE: + if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 || + params->hdr.flowtype == M_HASHTYPE_NONE) + return (EOPNOTSUPP); + p = params->hdr.flowid >> sc->flowid_shift; + p %= sc->sc_count; + lb = (struct lagg_lb *)sc->sc_psc; + lp = lb->lb_ports[p]; + lp = lagg_link_active(sc, lp); + break; + case LAGG_PROTO_LACP: + if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 || + params->hdr.flowtype == M_HASHTYPE_NONE) + return (EOPNOTSUPP); + lp = 
lacp_select_tx_port_by_hash(sc, params->hdr.flowid); + break; + default: + return (EOPNOTSUPP); + } + if (lp == NULL) + return (EOPNOTSUPP); + ifp = lp->lp_ifp; + if (ifp == NULL || ifp->if_snd_tag_alloc == NULL || + (ifp->if_capenable & IFCAP_TXRTLMT) == 0) + return (EOPNOTSUPP); + + /* forward allocation request */ + return (ifp->if_snd_tag_alloc(ifp, params, ppmt)); +} +#endif + static int lagg_ether_setmulti(struct lagg_softc *sc) { Index: sys/net/if_var.h =================================================================== --- sys/net/if_var.h +++ sys/net/if_var.h @@ -175,6 +175,49 @@ #define IFENCAP_FLAG_BROADCAST 0x02 /* Destination is broadcast */ +/* + * Network interface send tag support. The storage of "struct + * m_snd_tag" comes from the network driver and it is free to allocate + * as much additional space as it wants for its own use. + */ +struct m_snd_tag; + +#define IF_SND_TAG_TYPE_RATE_LIMIT 0 +#define IF_SND_TAG_TYPE_MAX 1 + +struct if_snd_tag_alloc_header { + uint32_t type; /* send tag type, see IF_SND_TAG_XXX */ + uint32_t flowid; /* mbuf hash value */ + uint32_t flowtype; /* mbuf hash type */ +}; + +struct if_snd_tag_alloc_rate_limit { + struct if_snd_tag_alloc_header hdr; + uint64_t max_rate; /* in bytes/s */ +}; + +struct if_snd_tag_rate_limit_params { + uint64_t max_rate; /* in bytes/s */ +}; + +union if_snd_tag_alloc_params { + struct if_snd_tag_alloc_header hdr; + struct if_snd_tag_alloc_rate_limit rate_limit; +}; + +union if_snd_tag_modify_params { + struct if_snd_tag_rate_limit_params rate_limit; +}; + +union if_snd_tag_query_params { + struct if_snd_tag_rate_limit_params rate_limit; +}; + +typedef int (if_snd_tag_alloc_t)(struct ifnet *, union if_snd_tag_alloc_params *, + struct m_snd_tag **); +typedef int (if_snd_tag_modify_t)(struct m_snd_tag *, union if_snd_tag_modify_params *); +typedef int (if_snd_tag_query_t)(struct m_snd_tag *, union if_snd_tag_query_params *); +typedef void (if_snd_tag_free_t)(struct m_snd_tag *); /* * Structure 
defining a network interface. @@ -304,12 +347,19 @@ u_int if_hw_tsomaxsegsize; /* TSO maximum segment size in bytes */ /* + * Network adapter send tag support: + */ + if_snd_tag_alloc_t *if_snd_tag_alloc; + if_snd_tag_modify_t *if_snd_tag_modify; + if_snd_tag_query_t *if_snd_tag_query; + if_snd_tag_free_t *if_snd_tag_free; + + /* * Spare fields to be added before branching a stable branch, so * that structure can be enhanced without changing the kernel * binary interface. */ - void *if_pspare[4]; /* packet pacing / general use */ - int if_ispare[4]; /* packet pacing / general use */ + int if_ispare[4]; /* general use */ }; /* for compatibility with other BSDs */ Index: sys/net/if_vlan.c =================================================================== --- sys/net/if_vlan.c +++ sys/net/if_vlan.c @@ -46,6 +46,7 @@ #include "opt_inet.h" #include "opt_vlan.h" +#include "opt_ratelimit.h" #include #include @@ -212,6 +213,10 @@ static void vlan_init(void *foo); static void vlan_input(struct ifnet *ifp, struct mbuf *m); static int vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr); +#ifdef RATELIMIT +static int vlan_snd_tag_alloc(struct ifnet *, + union if_snd_tag_alloc_params *, struct m_snd_tag **); +#endif static void vlan_qflush(struct ifnet *ifp); static int vlan_setflag(struct ifnet *ifp, int flag, int status, int (*func)(struct ifnet *, int)); @@ -971,6 +976,9 @@ ifp->if_transmit = vlan_transmit; ifp->if_qflush = vlan_qflush; ifp->if_ioctl = vlan_ioctl; +#ifdef RATELIMIT + ifp->if_snd_tag_alloc = vlan_snd_tag_alloc; +#endif ifp->if_flags = VLAN_IFFLAGS; ether_ifattach(ifp, eaddr); /* Now undo some of the damage... */ @@ -1591,6 +1599,15 @@ TOEDEV(ifp) = TOEDEV(p); ifp->if_capenable |= p->if_capenable & IFCAP_TOE; } + +#ifdef RATELIMIT + /* + * If the parent interface supports ratelimiting, so does the + * VLAN interface. 
+ */ + ifp->if_capabilities |= (p->if_capabilities & IFCAP_TXRTLMT); + ifp->if_capenable |= (p->if_capenable & IFCAP_TXRTLMT); +#endif } static void @@ -1801,3 +1818,19 @@ return (error); } + +#ifdef RATELIMIT +static int +vlan_snd_tag_alloc(struct ifnet *ifp, + union if_snd_tag_alloc_params *params, + struct m_snd_tag **ppmt) +{ + + /* get trunk device */ + ifp = vlan_trunkdev(ifp); + if (ifp == NULL || (ifp->if_capenable & IFCAP_TXRTLMT) == 0) + return (EOPNOTSUPP); + /* forward allocation request */ + return (ifp->if_snd_tag_alloc(ifp, params, ppmt)); +} +#endif Index: sys/netinet/in_pcb.h =================================================================== --- sys/netinet/in_pcb.h +++ sys/netinet/in_pcb.h @@ -181,6 +181,7 @@ * read-lock usage during modification, this model can be applied to other * protocols (especially SCTP). */ +struct m_snd_tag; struct inpcb { LIST_ENTRY(inpcb) inp_hash; /* (h/i) hash list */ LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i) hash list */ @@ -202,11 +203,11 @@ u_char inp_ip_minttl; /* (i) minimum TTL or drop */ uint32_t inp_flowid; /* (x) flow id / queue id */ u_int inp_refcount; /* (i) refcount */ - void *inp_pspare[5]; /* (x) packet pacing / general use */ + struct m_snd_tag *inp_snd_tag; /* (i) send tag for outgoing mbufs */ + void *inp_pspare[4]; /* (x) general use */ uint32_t inp_flowtype; /* (x) M_HASHTYPE value */ uint32_t inp_rss_listen_bucket; /* (x) overridden RSS listen bucket */ - u_int inp_ispare[4]; /* (x) packet pacing / user cookie / - * general use */ + u_int inp_ispare[4]; /* (x) user cookie / general use */ /* Local and foreign ports, local and foreign addr. 
*/ struct in_conninfo inp_inc; /* (i) list for PCB's local port */ @@ -616,6 +617,7 @@ #define INP_RSS_BUCKET_SET 0x00000080 /* IP_RSS_LISTEN_BUCKET is set */ #define INP_RECVFLOWID 0x00000100 /* populate recv datagram with flow info */ #define INP_RECVRSSBUCKETID 0x00000200 /* populate recv datagram with bucket id */ +#define INP_RATE_LIMIT_CHANGED 0x00000400 /* rate limit needs attention */ /* * Flags passed to in_pcblookup*() functions. @@ -736,6 +738,14 @@ struct sockaddr * in_sockaddr(in_port_t port, struct in_addr *addr); void in_pcbsosetlabel(struct socket *so); +#ifdef RATELIMIT +int in_pcbattach_txrtlmt(struct inpcb *, struct ifnet *, uint32_t, uint32_t, uint32_t); +void in_pcbdetach_txrtlmt(struct inpcb *); +int in_pcbmodify_txrtlmt(struct inpcb *, uint32_t); +int in_pcbquery_txrtlmt(struct inpcb *, uint32_t *); +void in_pcboutput_txrtlmt(struct inpcb *, struct ifnet *, struct mbuf *); +void in_pcboutput_eagain(struct inpcb *); +#endif #endif /* _KERNEL */ #endif /* !_NETINET_IN_PCB_H_ */ Index: sys/netinet/in_pcb.c =================================================================== --- sys/netinet/in_pcb.c +++ sys/netinet/in_pcb.c @@ -42,6 +42,7 @@ #include "opt_ipsec.h" #include "opt_inet.h" #include "opt_inet6.h" +#include "opt_ratelimit.h" #include "opt_pcbgroup.h" #include "opt_rss.h" @@ -57,6 +58,7 @@ #include #include #include +#include #include #include #include @@ -1140,6 +1142,10 @@ KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__)); +#ifdef RATELIMIT + if (inp->inp_snd_tag != NULL) + in_pcbdetach_txrtlmt(inp); +#endif inp->inp_socket->so_pcb = NULL; inp->inp_socket = NULL; } @@ -2677,3 +2683,251 @@ db_print_inpcb(inp, "inpcb", 0); } #endif /* DDB */ + +#ifdef RATELIMIT +/* + * Modify TX rate limit based on the existing "inp->inp_snd_tag", + * if any. 
+ */ +int +in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate) +{ + union if_snd_tag_modify_params params = { + .rate_limit.max_rate = max_pacing_rate, + }; + struct m_snd_tag *mst; + struct ifnet *ifp; + int error; + + mst = inp->inp_snd_tag; + if (mst == NULL) + return (EINVAL); + + ifp = mst->ifp; + if (ifp == NULL) + return (EINVAL); + + if (ifp->if_snd_tag_modify == NULL) { + error = EOPNOTSUPP; + } else { + error = ifp->if_snd_tag_modify(mst, &params); + } + return (error); +} + +/* + * Query existing TX rate limit based on the existing + * "inp->inp_snd_tag", if any. + */ +int +in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate) +{ + union if_snd_tag_query_params params = { }; + struct m_snd_tag *mst; + struct ifnet *ifp; + int error; + + mst = inp->inp_snd_tag; + if (mst == NULL) + return (EINVAL); + + ifp = mst->ifp; + if (ifp == NULL) + return (EINVAL); + + if (ifp->if_snd_tag_query == NULL) { + error = EOPNOTSUPP; + } else { + error = ifp->if_snd_tag_query(mst, &params); + if (error == 0 && p_max_pacing_rate != NULL) + *p_max_pacing_rate = params.rate_limit.max_rate; + } + return (error); +} + +/* + * Allocate a new TX rate limit send tag from the network interface + * given by the "ifp" argument and save it in "inp->inp_snd_tag": + */ +int +in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp, + uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate) +{ + union if_snd_tag_alloc_params params = { + .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT, + .rate_limit.hdr.flowid = flowid, + .rate_limit.hdr.flowtype = flowtype, + .rate_limit.max_rate = max_pacing_rate, + }; + int error; + + INP_WLOCK_ASSERT(inp); + + if (inp->inp_snd_tag != NULL) + return (EINVAL); + + if (ifp->if_snd_tag_alloc == NULL) { + error = EOPNOTSUPP; + } else { + error = ifp->if_snd_tag_alloc(ifp, &params, &inp->inp_snd_tag); + + /* + * At success increment the refcount on + * the send tag's network interface: + */ + if (error == 0) + 
if_ref(inp->inp_snd_tag->ifp); + } + return (error); +} + +/* + * Free an existing TX rate limit tag based on the "inp->inp_snd_tag", + * if any: + */ +void +in_pcbdetach_txrtlmt(struct inpcb *inp) +{ + struct m_snd_tag *mst; + struct ifnet *ifp; + + INP_WLOCK_ASSERT(inp); + + mst = inp->inp_snd_tag; + inp->inp_snd_tag = NULL; + + if (mst == NULL) + return; + + ifp = mst->ifp; + if (ifp == NULL) + return; + + /* + * If the device was detached while we still had reference(s) + * on the ifp, we assume if_snd_tag_free() was replaced with + * stubs. + */ + ifp->if_snd_tag_free(mst); + + /* release reference count on network interface */ + if_rele(ifp); +} + +/* + * This function should be called when the INP_RATE_LIMIT_CHANGED flag + * is set in the fast path and will attach/detach/modify the TX rate + * limit send tag based on the socket's so_max_pacing_rate value. + */ +void +in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb) +{ + struct socket *socket; + uint32_t max_pacing_rate; + bool was_wlocked; + int error; + + if (inp == NULL) + return; + + socket = inp->inp_socket; + if (socket == NULL) + return; + + was_wlocked = INP_WLOCKED(inp); + + if (!was_wlocked) { + /* + * NOTE: If the write locking fails, we need to bail + * out and use the non-ratelimited ring for the + * transmit until there is a new chance to get the + * write lock. + */ + if (!INP_TRY_UPGRADE(inp)) + return; + } + + /* + * NOTE: The so_max_pacing_rate value is read unlocked, + * because atomic updates are not required since the variable + * is checked at every mbuf we send. It is assumed that the + * variable read itself will be atomic. + */ + max_pacing_rate = socket->so_max_pacing_rate; + + /* + * NOTE: When attaching to a network interface a reference is + * made to ensure the network interface doesn't go away until + * all ratelimit connections are gone. 
The network interface + * pointers compared below represent valid network interfaces, + * except when comparing towards NULL. + */ + if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) { + error = 0; + } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) { + if (inp->inp_snd_tag != NULL) + in_pcbdetach_txrtlmt(inp); + error = 0; + } else if (inp->inp_snd_tag == NULL) { + /* + * In order to utilize packet pacing with RSS, we need + * to wait until there is a valid RSS hash before we + * can proceed: + */ + if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) { + error = EAGAIN; + } else { + error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb), + mb->m_pkthdr.flowid, max_pacing_rate); + } + } else { + error = in_pcbmodify_txrtlmt(inp, max_pacing_rate); + } + if (error == 0 || error == EOPNOTSUPP) + inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; + if (!was_wlocked) + INP_DOWNGRADE(inp); +} + +/* + * Track route changes for TX rate limiting. + */ +void +in_pcboutput_eagain(struct inpcb *inp) +{ + struct socket *socket; + bool was_wlocked; + + if (inp == NULL) + return; + + socket = inp->inp_socket; + if (socket == NULL) + return; + + if (inp->inp_snd_tag == NULL) + return; + + was_wlocked = INP_WLOCKED(inp); + + if (!was_wlocked) { + /* + * NOTE: If the write locking fails, we need to bail + * out and use the non-ratelimited ring for the + * transmit until there is a new chance to get the + * write lock. 
+ */ + if (!INP_TRY_UPGRADE(inp)) + return; + } + + /* detach rate limiting */ + in_pcbdetach_txrtlmt(inp); + + /* make sure new mbuf send tag allocation is made */ + inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; + + if (!was_wlocked) + INP_DOWNGRADE(inp); +} +#endif /* RATELIMIT */ Index: sys/netinet/ip_output.c =================================================================== --- sys/netinet/ip_output.c +++ sys/netinet/ip_output.c @@ -33,6 +33,7 @@ __FBSDID("$FreeBSD$"); #include "opt_inet.h" +#include "opt_ratelimit.h" #include "opt_ipsec.h" #include "opt_mbuf_stress_test.h" #include "opt_mpath.h" @@ -661,8 +662,23 @@ */ m_clrprotoflags(m); IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL); +#ifdef RATELIMIT + if (inp != NULL) { + if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) + in_pcboutput_txrtlmt(inp, ifp, m); + /* stamp send tag on mbuf */ + m->m_pkthdr.snd_tag = inp->inp_snd_tag; + } else { + m->m_pkthdr.snd_tag = NULL; + } +#endif error = (*ifp->if_output)(ifp, m, (const struct sockaddr *)gw, ro); +#ifdef RATELIMIT + /* check for route change */ + if (error == EAGAIN) + in_pcboutput_eagain(inp); +#endif goto done; } @@ -697,8 +713,23 @@ m_clrprotoflags(m); IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL); +#ifdef RATELIMIT + if (inp != NULL) { + if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) + in_pcboutput_txrtlmt(inp, ifp, m); + /* stamp send tag on mbuf */ + m->m_pkthdr.snd_tag = inp->inp_snd_tag; + } else { + m->m_pkthdr.snd_tag = NULL; + } +#endif error = (*ifp->if_output)(ifp, m, (const struct sockaddr *)gw, ro); +#ifdef RATELIMIT + /* check for route change */ + if (error == EAGAIN) + in_pcboutput_eagain(inp); +#endif } else m_freem(m); } @@ -973,6 +1004,16 @@ INP_WUNLOCK(inp); error = 0; break; + case SO_MAX_PACING_RATE: +#ifdef RATELIMIT + INP_WLOCK(inp); + inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; + INP_WUNLOCK(inp); + error = 0; +#else + error = EOPNOTSUPP; +#endif + break; default: break; } Index: sys/netinet6/ip6_output.c 
=================================================================== --- sys/netinet6/ip6_output.c +++ sys/netinet6/ip6_output.c @@ -65,6 +65,7 @@ #include "opt_inet.h" #include "opt_inet6.h" +#include "opt_ratelimit.h" #include "opt_ipsec.h" #include "opt_sctp.h" #include "opt_route.h" @@ -954,8 +955,23 @@ m->m_pkthdr.len); ifa_free(&ia6->ia_ifa); } +#ifdef RATELIMIT + if (inp != NULL) { + if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) + in_pcboutput_txrtlmt(inp, ifp, m); + /* stamp send tag on mbuf */ + m->m_pkthdr.snd_tag = inp->inp_snd_tag; + } else { + m->m_pkthdr.snd_tag = NULL; + } +#endif error = nd6_output_ifp(ifp, origifp, m, dst, (struct route *)ro); +#ifdef RATELIMIT + /* check for route change */ + if (error == EAGAIN) + in_pcboutput_eagain(inp); +#endif goto done; } @@ -1054,8 +1070,23 @@ counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } +#ifdef RATELIMIT + if (inp != NULL) { + if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) + in_pcboutput_txrtlmt(inp, ifp, m); + /* stamp send tag on mbuf */ + m->m_pkthdr.snd_tag = inp->inp_snd_tag; + } else { + m->m_pkthdr.snd_tag = NULL; + } +#endif error = nd6_output_ifp(ifp, origifp, m, dst, (struct route *)ro); +#ifdef RATELIMIT + /* check for route change */ + if (error == EAGAIN) + in_pcboutput_eagain(inp); +#endif } else m_freem(m); } @@ -1441,6 +1472,16 @@ INP_WUNLOCK(in6p); error = 0; break; + case SO_MAX_PACING_RATE: +#ifdef RATELIMIT + INP_WLOCK(in6p); + in6p->inp_flags2 |= INP_RATE_LIMIT_CHANGED; + INP_WUNLOCK(in6p); + error = 0; +#else + error = EOPNOTSUPP; +#endif + break; default: break; } Index: sys/sys/mbuf.h =================================================================== --- sys/sys/mbuf.h +++ sys/sys/mbuf.h @@ -130,6 +130,14 @@ }; /* + * Static network interface owned tag. + * Allocated through ifp->if_snd_tag_alloc(). + */ +struct m_snd_tag { + struct ifnet *ifp; /* network interface tag belongs to */ +}; + +/* * Record/packet header in first mbuf of chain; valid only if M_PKTHDR is set. 
* Size ILP32: 48 * LP64: 56 @@ -137,7 +145,10 @@ * they are correct. */ struct pkthdr { - struct ifnet *rcvif; /* rcv interface */ + union { + struct m_snd_tag *snd_tag; /* send tag, if any */ + struct ifnet *rcvif; /* rcv interface */ + }; SLIST_HEAD(packet_tags, m_tag) tags; /* list of packet tags */ int32_t len; /* total packet length */ Index: sys/sys/socket.h =================================================================== --- sys/sys/socket.h +++ sys/sys/socket.h @@ -158,6 +158,7 @@ #define SO_USER_COOKIE 0x1015 /* user cookie (dummynet etc.) */ #define SO_PROTOCOL 0x1016 /* get socket protocol (Linux name) */ #define SO_PROTOTYPE SO_PROTOCOL /* alias for SO_PROTOCOL (SunOS name) */ +#define SO_MAX_PACING_RATE 0x1017 /* set max TX pacing rate per socket */ #endif /* Index: sys/sys/socketvar.h =================================================================== --- sys/sys/socketvar.h +++ sys/sys/socketvar.h @@ -127,8 +127,9 @@ int so_fibnum; /* routing domain for this socket */ uint32_t so_user_cookie; - void *so_pspare[2]; /* packet pacing / general use */ - int so_ispare[2]; /* packet pacing / general use */ + void *so_pspare[2]; /* general use */ + uint32_t so_max_pacing_rate; /* (f) TX rate limit in bytes/s */ + int so_ispare[1]; /* general use */ }; /*