Index: head/tools/tools/netmap/pkt-gen.8 =================================================================== --- head/tools/tools/netmap/pkt-gen.8 +++ head/tools/tools/netmap/pkt-gen.8 @@ -25,7 +25,7 @@ .\" .\" $FreeBSD$ .\" -.Dd October 23, 2018 +.Dd October 31, 2018 .Dt PKT-GEN 8 .Os .Sh NAME @@ -36,96 +36,215 @@ .Bl -item -compact .It .Nm +.Op Fl h46XzZNIWvrAB .Op Fl i Ar interface .Op Fl f Ar function .Op Fl n Ar count -.Op Fl t Ar pkts_to_send -.Op Fl r Ar pkts_to_receive .Op Fl l Ar pkt_size +.Op Fl b Ar burst_size .Op Fl d Ar dst_ip[:port[-dst_ip:port]] .Op Fl s Ar src_ip[:port[-src_ip:port]] -.Op Fl D Ar dst-mac -.Op Fl S Ar src-mac +.Op Fl D Ar dst_mac +.Op Fl S Ar src_mac .Op Fl a Ar cpu_id -.Op Fl b Ar burst size -.Op Fl c Ar cores +.Op Fl c Ar cpus .Op Fl p Ar threads .Op Fl T Ar report_ms -.Op Fl P +.Op Fl P Ar file .Op Fl w Ar wait_for_link_time .Op Fl R Ar rate -.Op Fl X .Op Fl H Ar len -.Op Fl P Ar xfile -.Op Fl z -.Op Fl Z +.Op Fl F Ar num_frags +.Op Fl M Ar frag_size +.Op Fl C Ar port_config +.El .Sh DESCRIPTION .Nm -generates and receives raw network packets using -.Xr netmap 4 . +leverages +.Xr netmap 4 +to generate and receive raw network packets in batches. The arguments are as follows: -.Pp .Bl -tag -width Ds +.It Fl h +Show program usage and exit. .It Fl i Ar interface -Network interface name. -.It Fl f Ar function tx rx ping pong -Set the function to transmit, receive of ping/pong. -.It Fl n count -Number of iterations (can be 0). -.It Fl t pkts_to_send -Number of packets to send. Also forces transmit mode. -.It Fl r Ar pkts_to_receive -Number of packets to receive. Also forces rx mode. +Name of the network interface that +.Nm +operates on. +It can be a system network interface (e.g., em0), +the name of a +.Xr vale 4 +port (e.g., valeSSS:PPP), the name of a netmap pipe or monitor, +or any valid netmap port name accepted by the +.Ar nm_open +library function, as documented in +.Xr netmap 4 +(NIOCREGIF section). 
+.It Fl f Ar function +The function to be executed by +.Nm . +Specify +.Cm tx +for transmission, +.Cm rx +for reception, +.Cm ping +for client-side ping-pong operation, and +.Cm pong +for server-side ping-pong operation. +.It Fl n Ar count +Number of iterations of the +.Nm +function (with 0 meaning infinite). +In case of +.Cm tx +or +.Cm rx , +.Ar count +is the number of packets to receive or transmit. +In case of +.Cm ping +or +.Cm pong , +.Ar count +is the number of ping-pong transactions. .It Fl l Ar pkt_size Packet size in bytes excluding CRC. +If passed a second time, use random sizes greater than or equal to the +second one and lower than the first one. +.It Fl b Ar burst_size +Transmit or receive up to +.Ar burst_size +packets at a time. +.It Fl 4 +Use IPv4 addresses. +.It Fl 6 +Use IPv6 addresses. .It Fl d Ar dst_ip[:port[-dst_ip:port]] -Destination IPv4 address and port, single or range. +Destination IPv4/IPv6 address and port, single or range. .It Fl s Ar src_ip[:port[-src_ip:port]] -Source IPv4 address and port, single or range. -.It Fl D Ar dst-mac -Destination MAC address in colon notation. -.It Fl S Ar src-mac +Source IPv4/IPv6 address and port, single or range. +.It Fl D Ar dst_mac +Destination MAC address in colon notation (e.g., aa:bb:cc:dd:ee:00). +.It Fl S Ar src_mac Source MAC address in colon notation. .It Fl a Ar cpu_id -Tie +Pin the first thread of .Nm -to a particular CPU core using -.Xr setaffinity 2. -.It Fl b Ar burst size -Set the size of a burst of packets. -.It Fl c Ar cores -Number of cores to use. +to a particular CPU using +.Xr pthread_setaffinity_np 3 . +If more threads are used, they are pinned to the subsequent CPUs, +one per thread. +.It Fl c Ar cpus +Maximum number of CPUs to use (0 means to use all the available ones). .It Fl p Ar threads Number of threads to use. +By default, only a single thread is used +to handle all the netmap rings. 
+If +.Ar threads +is larger than one, each thread handles a single TX ring (in +.Cm tx +mode), a single RX ring (in +.Cm rx +mode), or a TX/RX ring couple. +The number of +.Ar threads +must be less than or equal to the number of TX (or RX) rings available +in the device specified by +.Ar interface . .It Fl T Ar report_ms Number of milliseconds between reports. -.It Fl P -Use libpcap instead of netmap for reading or writing. .It Fl w Ar wait_for_link_time -Number of seconds to wait to make sure that the network link is up. A -network device driver may take some time to create a new -transmit/receive ring pair when +Number of seconds to wait before starting the +.Nm +function, useful to make sure that the network link is up. +A network device driver may take some time to enter netmap mode, or +to create a new transmit/receive ring pair when .Xr netmap 4 requests one. .It Fl R Ar rate -Packet transmission rate. Not setting the packet transmission rate tells +Packet transmission rate. +Not setting the packet transmission rate tells .Nm -to transmit packets as quickly as possible. On servers from 2010 on-wards +to transmit packets as quickly as possible. +On servers from 2010 onwards .Xr netmap 4 is able to completely use all of the bandwidth of a 10 or 40Gbps link, so this option should be used unless your intention is to saturate the link. .It Fl X -Dump payload transmitted or received. +Dump payload of each packet transmitted or received. .It Fl H Ar len -Add empty virtio-net-header with size 'len'. This option is only use -with Virtual Machine technologies that use virtio as a network interface. +Add empty virtio-net-header with size +.Ar len . +Valid sizes are 0, 10 and 12. +This option is only used with Virtual Machine technologies that use virtio +as a network interface. .It Fl P Ar file -Load the packet from a pcap file rather than constructing it inside of -.Nm +Load the packet to be transmitted from a pcap file rather than constructing +it within +.Nm . 
.It Fl z -Use random IPv4 src address/port +Use random IPv4/IPv6 src address/port. .It Fl Z -Use random IPv4 dst address/port +Use random IPv4/IPv6 dst address/port. +.It Fl N +Do not normalize units (i.e., use bps, pps instead of Mbps, Kpps, etc.). +.It Fl F Ar num_frags +Send multi-slot packets, each one with +.Ar num_frags +fragments. +A multi-slot packet is represented by two or more consecutive netmap slots +with the +.Ar NS_MOREFRAG +flag set (except for the last slot). +This is useful to transmit or receive packets larger than the netmap +buffer size. +.It Fl M Ar frag_size +In multi-slot mode, +.Ar frag_size +specifies the size of each fragment, if smaller than the packet length +divided by +.Ar num_frags . +.It Fl I +Use indirect buffers. +It is only valid for transmitting on VALE ports, +and it is implemented by setting the +.Ar NS_INDIRECT +flag in the netmap slots. +.It Fl W +Exit immediately if all the RX rings are empty the first time they are +examined. +.It Fl v +Increase the verbosity level. +.It Fl r +In +.Cm tx +mode, do not initialize packets, but send whatever the content of +the uninitialized netmap buffers is (rubbish mode). +.It Fl A +Compute mean and standard deviation (over a sliding window) for the +transmit or receive rate. +.It Fl B +Take Ethernet framing and CRC into account when computing the average bps. +This adds 4 bytes of CRC and 20 bytes of framing to each packet. +.It Fl C Ar tx_slots Ns Oo Cm \&, Ns Ar rx_slots Ns Oo Cm \&, Ns Ar tx_rings Ns Oo Cm \&, Ns Ar rx_rings Oc Oc Oc +Configuration in terms of number of rings and slots to be used when +opening the netmap port. +Such configuration has effect on software ports +created on the fly, such as VALE ports and netmap pipes. +The configuration may consist of 1 to 4 numbers separated by commas: +.Dq tx_slots,rx_slots,tx_rings,rx_rings . +Missing numbers or zeroes stand for default values. 
+As an additional convenience, if exactly one number is specified, +then this is assigned to both +.Ar tx_slots +and +.Ar rx_slots . +If there is no fourth number, then the third one is assigned to both +.Ar tx_rings +and +.Ar rx_rings . .El .Pp .Nm @@ -133,7 +252,7 @@ .Xr netmap 4 or .Xr bpf 4 -but which is most often uses with +but which is most often used with .Xr netmap 4 . The .Ar interface name @@ -146,7 +265,8 @@ .Nm can peel off one or more of the transmit or receive rings for its own use without interfering with packets that might otherwise be destined -for the host. For example on a system with a Chelsio Network +for the host. +For example on a system with a Chelsio Network Interface Card (NIC) the interface specification of .Ar -i netmap:ncxl0 gives @@ -156,20 +276,20 @@ system's TCP/IP stack. .Sh EXAMPLES Capture and count all packets arriving on the operating system's cxl0 -interface. Using this will block packets from reaching the operating +interface. +Using this will block packets from reaching the operating system's network stack. -.Dl +.Bd -literal -offset indent +pkt-gen -i cxl0 -f rx +.Ed .Pp -.Nm --i cxl0 -f rx -.Pp Send a stream of fake DNS packets between two hosts with a packet -length of 128 bytes. You must set the destination MAC address for +length of 128 bytes. +You must set the destination MAC address for packets to be received by the target host. 
-.Pp -.Dl -.Nm --i netmap:ncxl0 -f tx -s 172.16.0.1:53 -d 172.16.1.3:53 -D 00:07:43:29:2a:e0 +.Bd -literal -offset indent +pkt-gen -i netmap:ncxl0 -f tx -s 172.16.0.1:53 -d 172.16.1.3:53 -D 00:07:43:29:2a:e0 +.Ed .Sh SEE ALSO .Xr netmap 4 , .Xr bridge 8 Index: head/tools/tools/netmap/pkt-gen.c =================================================================== --- head/tools/tools/netmap/pkt-gen.c +++ head/tools/tools/netmap/pkt-gen.c @@ -55,6 +55,11 @@ #include #include #include +#include +#ifdef linux +#define IPV6_VERSION 0x60 +#define IPV6_DEFHLIM 64 +#endif #include #include @@ -66,16 +71,18 @@ #include "ctrs.h" +static void usage(int); + #ifdef _WIN32 #define cpuset_t DWORD_PTR //uint64_t static inline void CPU_ZERO(cpuset_t *p) { - *p = 0; + *p = 0; } static inline void CPU_SET(uint32_t i, cpuset_t *p) { - *p |= 1<< (i & 0x3f); + *p |= 1<< (i & 0x3f); } #define pthread_setaffinity_np(a, b, c) !SetThreadAffinityMask(a, *c) //((void)a, 0) @@ -155,12 +162,12 @@ #define cpuset_t uint64_t // XXX static inline void CPU_ZERO(cpuset_t *p) { - *p = 0; + *p = 0; } static inline void CPU_SET(uint32_t i, cpuset_t *p) { - *p |= 1<< (i & 0x3f); + *p |= 1<< (i & 0x3f); } #define pthread_setaffinity_np(a, b, c) ((void)a, 0) @@ -169,7 +176,7 @@ #define IFF_PPROMISC IFF_PROMISC #include /* LLADDR */ #define clock_gettime(a,b) \ - do {struct timespec t0 = {0,0}; *(b) = t0; } while (0) + do {struct timespec t0 = {0,0}; *(b) = t0; } while (0) #endif /* __APPLE__ */ const char *default_payload="netmap pkt-gen DIRECT payload\n" @@ -179,10 +186,8 @@ "http://info.iet.unipi.it/~luigi/netmap/ "; int verbose = 0; +int normalize = 1; -#define SKIP_PAYLOAD 1 /* do not check payload. 
XXX unused */ - - #define VIRT_HDR_1 10 /* length of a base vnet-hdr */ #define VIRT_HDR_2 12 /* length of the extenede vnet-hdr */ #define VIRT_HDR_MAX VIRT_HDR_2 @@ -195,14 +200,34 @@ struct pkt { struct virt_header vh; struct ether_header eh; - struct ip ip; - struct udphdr udp; - uint8_t body[MAX_BODYSIZE]; // XXX hardwired + union { + struct { + struct ip ip; + struct udphdr udp; + uint8_t body[MAX_BODYSIZE]; /* hardwired */ + } ipv4; + struct { + struct ip6_hdr ip; + struct udphdr udp; + uint8_t body[MAX_BODYSIZE]; /* hardwired */ + } ipv6; + }; } __attribute__((__packed__)); +#define PKT(p, f, af) \ + ((af) == AF_INET ? (p)->ipv4.f: (p)->ipv6.f) + struct ip_range { char *name; - uint32_t start, end; /* same as struct in_addr */ + union { + struct { + uint32_t start, end; /* same as struct in_addr */ + } ipv4; + struct { + struct in6_addr start, end; + uint8_t sgroup, egroup; + } ipv6; + }; uint16_t port0, port1; }; @@ -227,15 +252,18 @@ */ struct glob_arg { + int af; /* address family AF_INET/AF_INET6 */ struct ip_range src_ip; struct ip_range dst_ip; struct mac_range dst_mac; struct mac_range src_mac; int pkt_size; + int pkt_min_size; int burst; int forever; uint64_t npackets; /* total packets to send */ - int frags; /* fragments per packet */ + int frags; /* fragments per packet */ + u_int mtu; /* size of each fragment */ int nthreads; int cpus; /* cpus used for running */ int system_cpus; /* cpus on the system */ @@ -271,12 +299,12 @@ char *nmr_config; int dummy_send; int virt_header; /* send also the virt_header */ - int extra_bufs; /* goes in nr_arg3 */ - int extra_pipes; /* goes in nr_arg1 */ char *packet_file; /* -P option */ #define STATS_WIN 15 int win_idx; int64_t win[STATS_WIN]; + int wait_link; + int framing; /* #bits of framing (for bw output) */ }; enum dev_type { DEV_NONE, DEV_NETMAP, DEV_PCAP, DEV_TAP }; @@ -304,79 +332,166 @@ struct pkt pkt; void *frame; + uint16_t seed[3]; + u_int frags; + u_int frag_size; }; +static __inline uint16_t 
+cksum_add(uint16_t sum, uint16_t a) +{ + uint16_t res; + res = sum + a; + return (res + (res < a)); +} + +static void +extract_ipv4_addr(char *name, uint32_t *addr, uint16_t *port) +{ + struct in_addr a; + char *pp; + + pp = strchr(name, ':'); + if (pp != NULL) { /* do we have ports ? */ + *pp++ = '\0'; + *port = (uint16_t)strtol(pp, NULL, 0); + } + + inet_pton(AF_INET, name, &a); + *addr = ntohl(a.s_addr); +} + +static void +extract_ipv6_addr(char *name, struct in6_addr *addr, uint16_t *port, + uint8_t *group) +{ + char *pp; + + /* + * We accept IPv6 address in the following form: + * group@[2001:DB8::1001]:port (w/ brackets and port) + * group@[2001:DB8::1] (w/ brackets and w/o port) + * group@2001:DB8::1234 (w/o brackets and w/o port) + */ + pp = strchr(name, '@'); + if (pp != NULL) { + *pp++ = '\0'; + *group = (uint8_t)strtol(name, NULL, 0); + if (*group > 7) + *group = 7; + name = pp; + } + if (name[0] == '[') + name++; + pp = strchr(name, ']'); + if (pp != NULL) + *pp++ = '\0'; + if (pp != NULL && *pp != ':') + pp = NULL; + if (pp != NULL) { /* do we have ports ? */ + *pp++ = '\0'; + *port = (uint16_t)strtol(pp, NULL, 0); + } + inet_pton(AF_INET6, name, addr); +} /* * extract the extremes from a range of ipv4 addresses. * addr_lo[-addr_hi][:port_lo[-port_hi]] */ -static void -extract_ip_range(struct ip_range *r) +static int +extract_ip_range(struct ip_range *r, int af) { - char *ap, *pp; + char *name, *ap, start[INET6_ADDRSTRLEN]; + char end[INET6_ADDRSTRLEN]; struct in_addr a; + uint32_t tmp; if (verbose) D("extract IP range from %s", r->name); - r->port0 = r->port1 = 0; - r->start = r->end = 0; + name = strdup(r->name); + if (name == NULL) { + D("strdup failed"); + usage(-1); + } /* the first - splits start/end of range */ - ap = index(r->name, '-'); /* do we have ports ? 
*/ - if (ap) { + ap = strchr(name, '-'); + if (ap != NULL) *ap++ = '\0'; - } - /* grab the initial values (mandatory) */ - pp = index(r->name, ':'); - if (pp) { - *pp++ = '\0'; - r->port0 = r->port1 = strtol(pp, NULL, 0); - }; - inet_aton(r->name, &a); - r->start = r->end = ntohl(a.s_addr); - if (ap) { - pp = index(ap, ':'); - if (pp) { - *pp++ = '\0'; - if (*pp) - r->port1 = strtol(pp, NULL, 0); + r->port0 = 1234; /* default port */ + if (af == AF_INET6) { + r->ipv6.sgroup = 7; /* default group */ + extract_ipv6_addr(name, &r->ipv6.start, &r->port0, + &r->ipv6.sgroup); + } else + extract_ipv4_addr(name, &r->ipv4.start, &r->port0); + + r->port1 = r->port0; + if (af == AF_INET6) { + if (ap != NULL) { + r->ipv6.egroup = r->ipv6.sgroup; + extract_ipv6_addr(ap, &r->ipv6.end, &r->port1, + &r->ipv6.egroup); + } else { + r->ipv6.end = r->ipv6.start; + r->ipv6.egroup = r->ipv6.sgroup; } - if (*ap) { - inet_aton(ap, &a); - r->end = ntohl(a.s_addr); - } + } else { + if (ap != NULL) { + extract_ipv4_addr(ap, &r->ipv4.end, &r->port1); + if (r->ipv4.start > r->ipv4.end) { + tmp = r->ipv4.end; + r->ipv4.end = r->ipv4.start; + r->ipv4.start = tmp; + } + } else + r->ipv4.end = r->ipv4.start; } + if (r->port0 > r->port1) { - uint16_t tmp = r->port0; + tmp = r->port0; r->port0 = r->port1; r->port1 = tmp; } - if (r->start > r->end) { - uint32_t tmp = r->start; - r->start = r->end; - r->end = tmp; + if (af == AF_INET) { + a.s_addr = htonl(r->ipv4.start); + inet_ntop(af, &a, start, sizeof(start)); + a.s_addr = htonl(r->ipv4.end); + inet_ntop(af, &a, end, sizeof(end)); + } else { + inet_ntop(af, &r->ipv6.start, start, sizeof(start)); + inet_ntop(af, &r->ipv6.end, end, sizeof(end)); } - { - struct in_addr a; - char buf1[16]; // one ip address + if (af == AF_INET) + D("range is %s:%d to %s:%d", start, r->port0, end, r->port1); + else + D("range is %d@[%s]:%d to %d@[%s]:%d", r->ipv6.sgroup, + start, r->port0, r->ipv6.egroup, end, r->port1); - a.s_addr = htonl(r->end); - strncpy(buf1, 
inet_ntoa(a), sizeof(buf1)); - a.s_addr = htonl(r->start); - if (1) - D("range is %s:%d to %s:%d", - inet_ntoa(a), r->port0, buf1, r->port1); - } + free(name); + if (r->port0 != r->port1 || + (af == AF_INET && r->ipv4.start != r->ipv4.end) || + (af == AF_INET6 && + !IN6_ARE_ADDR_EQUAL(&r->ipv6.start, &r->ipv6.end))) + return (OPT_COPY); + return (0); } -static void +static int extract_mac_range(struct mac_range *r) { + struct ether_addr *e; if (verbose) D("extract MAC range from %s", r->name); - bcopy(ether_aton(r->name), &r->start, 6); - bcopy(ether_aton(r->name), &r->end, 6); + + e = ether_aton(r->name); + if (e == NULL) { + D("invalid MAC address '%s'", r->name); + return 1; + } + bcopy(e, &r->start, 6); + bcopy(e, &r->end, 6); #if 0 bcopy(targ->src_mac, eh->ether_shost, 6); p = index(targ->g->src_mac, '-'); @@ -391,6 +506,7 @@ #endif if (verbose) D("%s starts at %s", r->name, ether_ntoa(&r->start)); + return 0; } static struct targ *targs; @@ -456,7 +572,7 @@ /* * parse the vale configuration in conf and put it in nmr. * Return the flag set if necessary. - * The configuration may consist of 0 to 4 numbers separated + * The configuration may consist of 1 to 4 numbers separated * by commas: #tx-slots,#rx-slots,#tx-rings,#rx-rings. * Missing numbers or zeroes stand for default values. * As an additional convenience, if exactly one number @@ -500,7 +616,7 @@ nmr->nr_rx_rings, nmr->nr_rx_slots); free(w); return (nmr->nr_tx_rings || nmr->nr_tx_slots || - nmr->nr_rx_rings || nmr->nr_rx_slots) ? + nmr->nr_rx_rings || nmr->nr_rx_slots) ? 
NM_OPEN_RING_CFG : 0; } @@ -513,7 +629,6 @@ source_hwaddr(const char *ifname, char *buf) { struct ifaddrs *ifaphead, *ifap; - int l = sizeof(ifap->ifa_name); if (getifaddrs(&ifaphead) != 0) { D("getifaddrs %s failed", ifname); @@ -527,7 +642,7 @@ if (!sdl || sdl->sdl_family != AF_LINK) continue; - if (strncmp(ifap->ifa_name, ifname, l) != 0) + if (strncmp(ifap->ifa_name, ifname, IFNAMSIZ) != 0) continue; mac = (uint8_t *)LLADDR(sdl); sprintf(buf, "%02x:%02x:%02x:%02x:%02x:%02x", @@ -562,19 +677,20 @@ return 0; } + /* Compute the checksum of the given ip header. */ -static uint16_t +static uint32_t checksum(const void *data, uint16_t len, uint32_t sum) { - const uint8_t *addr = data; + const uint8_t *addr = data; uint32_t i; - /* Checksum all the pairs of bytes first... */ - for (i = 0; i < (len & ~1U); i += 2) { - sum += (u_int16_t)ntohs(*((u_int16_t *)(addr + i))); - if (sum > 0xFFFF) - sum -= 0xFFFF; - } + /* Checksum all the pairs of bytes first... */ + for (i = 0; i < (len & ~1U); i += 2) { + sum += (u_int16_t)ntohs(*((u_int16_t *)(addr + i))); + if (sum > 0xFFFF) + sum -= 0xFFFF; + } /* * If there's a single byte left over, checksum it, too. * Network byte order is big-endian, so the remaining byte is @@ -588,8 +704,8 @@ return sum; } -static u_int16_t -wrapsum(u_int32_t sum) +static uint16_t +wrapsum(uint32_t sum) { sum = ~sum & 0xFFFF; return (htons(sum)); @@ -637,64 +753,198 @@ #define uh_sum check #endif /* linux */ -/* - * increment the addressed in the packet, - * starting from the least significant field. 
- * DST_IP DST_PORT SRC_IP SRC_PORT - */ static void -update_addresses(struct pkt *pkt, struct glob_arg *g) +update_ip(struct pkt *pkt, struct targ *t) { - uint32_t a; - uint16_t p; - struct ip *ip = &pkt->ip; - struct udphdr *udp = &pkt->udp; + struct glob_arg *g = t->g; + struct ip ip; + struct udphdr udp; + uint32_t oaddr, naddr; + uint16_t oport, nport; + uint16_t ip_sum, udp_sum; - do { - /* XXX for now it doesn't handle non-random src, random dst */ - if (g->options & OPT_RANDOM_SRC) { - udp->uh_sport = random(); - ip->ip_src.s_addr = random(); - } else { - p = ntohs(udp->uh_sport); - if (p < g->src_ip.port1) { /* just inc, no wrap */ - udp->uh_sport = htons(p + 1); + memcpy(&ip, &pkt->ipv4.ip, sizeof(ip)); + memcpy(&udp, &pkt->ipv4.udp, sizeof(udp)); + do { + ip_sum = udp_sum = 0; + naddr = oaddr = ntohl(ip.ip_src.s_addr); + nport = oport = ntohs(udp.uh_sport); + if (g->options & OPT_RANDOM_SRC) { + ip.ip_src.s_addr = nrand48(t->seed); + udp.uh_sport = nrand48(t->seed); + naddr = ntohl(ip.ip_src.s_addr); + nport = ntohs(udp.uh_sport); break; } - udp->uh_sport = htons(g->src_ip.port0); - - a = ntohl(ip->ip_src.s_addr); - if (a < g->src_ip.end) { /* just inc, no wrap */ - ip->ip_src.s_addr = htonl(a + 1); + if (oport < g->src_ip.port1) { + nport = oport + 1; + udp.uh_sport = htons(nport); break; } - ip->ip_src.s_addr = htonl(g->src_ip.start); - - udp->uh_sport = htons(g->src_ip.port0); + nport = g->src_ip.port0; + udp.uh_sport = htons(nport); + if (oaddr < g->src_ip.ipv4.end) { + naddr = oaddr + 1; + ip.ip_src.s_addr = htonl(naddr); + break; + } + naddr = g->src_ip.ipv4.start; + ip.ip_src.s_addr = htonl(naddr); + } while (0); + /* update checksums if needed */ + if (oaddr != naddr) { + ip_sum = cksum_add(ip_sum, ~oaddr >> 16); + ip_sum = cksum_add(ip_sum, ~oaddr & 0xffff); + ip_sum = cksum_add(ip_sum, naddr >> 16); + ip_sum = cksum_add(ip_sum, naddr & 0xffff); } - - if (g->options & OPT_RANDOM_DST) { - udp->uh_dport = random(); - ip->ip_dst.s_addr = random(); 
- } else { - p = ntohs(udp->uh_dport); - if (p < g->dst_ip.port1) { /* just inc, no wrap */ - udp->uh_dport = htons(p + 1); + if (oport != nport) { + udp_sum = cksum_add(udp_sum, ~oport); + udp_sum = cksum_add(udp_sum, nport); + } + do { + naddr = oaddr = ntohl(ip.ip_dst.s_addr); + nport = oport = ntohs(udp.uh_dport); + if (g->options & OPT_RANDOM_DST) { + ip.ip_dst.s_addr = nrand48(t->seed); + udp.uh_dport = nrand48(t->seed); + naddr = ntohl(ip.ip_dst.s_addr); + nport = ntohs(udp.uh_dport); break; } - udp->uh_dport = htons(g->dst_ip.port0); - - a = ntohl(ip->ip_dst.s_addr); - if (a < g->dst_ip.end) { /* just inc, no wrap */ - ip->ip_dst.s_addr = htonl(a + 1); + if (oport < g->dst_ip.port1) { + nport = oport + 1; + udp.uh_dport = htons(nport); break; } + nport = g->dst_ip.port0; + udp.uh_dport = htons(nport); + if (oaddr < g->dst_ip.ipv4.end) { + naddr = oaddr + 1; + ip.ip_dst.s_addr = htonl(naddr); + break; + } + naddr = g->dst_ip.ipv4.start; + ip.ip_dst.s_addr = htonl(naddr); + } while (0); + /* update checksums */ + if (oaddr != naddr) { + ip_sum = cksum_add(ip_sum, ~oaddr >> 16); + ip_sum = cksum_add(ip_sum, ~oaddr & 0xffff); + ip_sum = cksum_add(ip_sum, naddr >> 16); + ip_sum = cksum_add(ip_sum, naddr & 0xffff); } - ip->ip_dst.s_addr = htonl(g->dst_ip.start); - } while (0); - // update checksum + if (oport != nport) { + udp_sum = cksum_add(udp_sum, ~oport); + udp_sum = cksum_add(udp_sum, nport); + } + if (udp_sum != 0) + udp.uh_sum = ~cksum_add(~udp.uh_sum, htons(udp_sum)); + if (ip_sum != 0) { + ip.ip_sum = ~cksum_add(~ip.ip_sum, htons(ip_sum)); + udp.uh_sum = ~cksum_add(~udp.uh_sum, htons(ip_sum)); + } + memcpy(&pkt->ipv4.ip, &ip, sizeof(ip)); + memcpy(&pkt->ipv4.udp, &udp, sizeof(udp)); } +#ifndef s6_addr16 +#define s6_addr16 __u6_addr.__u6_addr16 +#endif +static void +update_ip6(struct pkt *pkt, struct targ *t) +{ + struct glob_arg *g = t->g; + struct ip6_hdr ip6; + struct udphdr udp; + uint16_t udp_sum; + uint16_t oaddr, naddr; + uint16_t oport, nport; + 
uint8_t group; + + memcpy(&ip6, &pkt->ipv6.ip, sizeof(ip6)); + memcpy(&udp, &pkt->ipv6.udp, sizeof(udp)); + do { + udp_sum = 0; + group = g->src_ip.ipv6.sgroup; + naddr = oaddr = ntohs(ip6.ip6_src.s6_addr16[group]); + nport = oport = ntohs(udp.uh_sport); + if (g->options & OPT_RANDOM_SRC) { + ip6.ip6_src.s6_addr16[group] = nrand48(t->seed); + udp.uh_sport = nrand48(t->seed); + naddr = ntohs(ip6.ip6_src.s6_addr16[group]); + nport = ntohs(udp.uh_sport); + break; + } + if (oport < g->src_ip.port1) { + nport = oport + 1; + udp.uh_sport = htons(nport); + break; + } + nport = g->src_ip.port0; + udp.uh_sport = htons(nport); + if (oaddr < ntohs(g->src_ip.ipv6.end.s6_addr16[group])) { + naddr = oaddr + 1; + ip6.ip6_src.s6_addr16[group] = htons(naddr); + break; + } + naddr = ntohs(g->src_ip.ipv6.start.s6_addr16[group]); + ip6.ip6_src.s6_addr16[group] = htons(naddr); + } while (0); + /* update checksums if needed */ + if (oaddr != naddr) + udp_sum = cksum_add(~oaddr, naddr); + if (oport != nport) + udp_sum = cksum_add(udp_sum, + cksum_add(~oport, nport)); + do { + group = g->dst_ip.ipv6.egroup; + naddr = oaddr = ntohs(ip6.ip6_dst.s6_addr16[group]); + nport = oport = ntohs(udp.uh_dport); + if (g->options & OPT_RANDOM_DST) { + ip6.ip6_dst.s6_addr16[group] = nrand48(t->seed); + udp.uh_dport = nrand48(t->seed); + naddr = ntohs(ip6.ip6_dst.s6_addr16[group]); + nport = ntohs(udp.uh_dport); + break; + } + if (oport < g->dst_ip.port1) { + nport = oport + 1; + udp.uh_dport = htons(nport); + break; + } + nport = g->dst_ip.port0; + udp.uh_dport = htons(nport); + if (oaddr < ntohs(g->dst_ip.ipv6.end.s6_addr16[group])) { + naddr = oaddr + 1; + ip6.ip6_dst.s6_addr16[group] = htons(naddr); + break; + } + naddr = ntohs(g->dst_ip.ipv6.start.s6_addr16[group]); + ip6.ip6_dst.s6_addr16[group] = htons(naddr); + } while (0); + /* update checksums */ + if (oaddr != naddr) + udp_sum = cksum_add(udp_sum, + cksum_add(~oaddr, naddr)); + if (oport != nport) + udp_sum = cksum_add(udp_sum, + 
cksum_add(~oport, nport)); + if (udp_sum != 0) + udp.uh_sum = ~cksum_add(~udp.uh_sum, udp_sum); + memcpy(&pkt->ipv6.ip, &ip6, sizeof(ip6)); + memcpy(&pkt->ipv6.udp, &udp, sizeof(udp)); +} + +static void +update_addresses(struct pkt *pkt, struct targ *t) +{ + + if (t->g->af == AF_INET) + update_ip(pkt, t); + else + update_ip6(pkt, t); +} /* * initialize one packet and prepare for the next one. * The copy could be done better instead of repeating it each time. @@ -704,9 +954,12 @@ { struct pkt *pkt = &targ->pkt; struct ether_header *eh; - struct ip *ip; - struct udphdr *udp; - uint16_t paylen = targ->g->pkt_size - sizeof(*eh) - sizeof(struct ip); + struct ip6_hdr ip6; + struct ip ip; + struct udphdr udp; + void *udp_ptr; + uint16_t paylen; + uint32_t csum = 0; const char *payload = targ->g->options & OPT_INDIRECT ? indirect_payload : default_payload; int i, l0 = strlen(payload); @@ -716,7 +969,7 @@ pcap_t *file; struct pcap_pkthdr *header; const unsigned char *packet; - + /* Read a packet from a PCAP file if asked. */ if (targ->g->packet_file != NULL) { if ((file = pcap_open_offline(targ->g->packet_file, @@ -735,49 +988,80 @@ } #endif + paylen = targ->g->pkt_size - sizeof(*eh) - + (targ->g->af == AF_INET ? 
sizeof(ip): sizeof(ip6)); + /* create a nice NUL-terminated string */ for (i = 0; i < paylen; i += l0) { if (l0 > paylen - i) l0 = paylen - i; // last round - bcopy(payload, pkt->body + i, l0); + bcopy(payload, PKT(pkt, body, targ->g->af) + i, l0); } - pkt->body[i-1] = '\0'; - ip = &pkt->ip; + PKT(pkt, body, targ->g->af)[i - 1] = '\0'; /* prepare the headers */ - ip->ip_v = IPVERSION; - ip->ip_hl = 5; - ip->ip_id = 0; - ip->ip_tos = IPTOS_LOWDELAY; - ip->ip_len = ntohs(targ->g->pkt_size - sizeof(*eh)); - ip->ip_id = 0; - ip->ip_off = htons(IP_DF); /* Don't fragment */ - ip->ip_ttl = IPDEFTTL; - ip->ip_p = IPPROTO_UDP; - ip->ip_dst.s_addr = htonl(targ->g->dst_ip.start); - ip->ip_src.s_addr = htonl(targ->g->src_ip.start); - ip->ip_sum = wrapsum(checksum(ip, sizeof(*ip), 0)); - - - udp = &pkt->udp; - udp->uh_sport = htons(targ->g->src_ip.port0); - udp->uh_dport = htons(targ->g->dst_ip.port0); - udp->uh_ulen = htons(paylen); - /* Magic: taken from sbin/dhclient/packet.c */ - udp->uh_sum = wrapsum(checksum(udp, sizeof(*udp), - checksum(pkt->body, - paylen - sizeof(*udp), - checksum(&ip->ip_src, 2 * sizeof(ip->ip_src), - IPPROTO_UDP + (u_int32_t)ntohs(udp->uh_ulen) - ) - ) - )); - eh = &pkt->eh; bcopy(&targ->g->src_mac.start, eh->ether_shost, 6); bcopy(&targ->g->dst_mac.start, eh->ether_dhost, 6); - eh->ether_type = htons(ETHERTYPE_IP); + if (targ->g->af == AF_INET) { + eh->ether_type = htons(ETHERTYPE_IP); + memcpy(&ip, &pkt->ipv4.ip, sizeof(ip)); + udp_ptr = &pkt->ipv4.udp; + ip.ip_v = IPVERSION; + ip.ip_hl = sizeof(ip) >> 2; + ip.ip_id = 0; + ip.ip_tos = IPTOS_LOWDELAY; + ip.ip_len = htons(targ->g->pkt_size - sizeof(*eh)); + ip.ip_id = 0; + ip.ip_off = htons(IP_DF); /* Don't fragment */ + ip.ip_ttl = IPDEFTTL; + ip.ip_p = IPPROTO_UDP; + ip.ip_dst.s_addr = htonl(targ->g->dst_ip.ipv4.start); + ip.ip_src.s_addr = htonl(targ->g->src_ip.ipv4.start); + ip.ip_sum = wrapsum(checksum(&ip, sizeof(ip), 0)); + memcpy(&pkt->ipv4.ip, &ip, sizeof(ip)); + } else { + eh->ether_type = 
htons(ETHERTYPE_IPV6); + memcpy(&ip6, &pkt->ipv4.ip, sizeof(ip6)); + udp_ptr = &pkt->ipv6.udp; + ip6.ip6_flow = 0; + ip6.ip6_plen = htons(paylen); + ip6.ip6_vfc = IPV6_VERSION; + ip6.ip6_nxt = IPPROTO_UDP; + ip6.ip6_hlim = IPV6_DEFHLIM; + ip6.ip6_src = targ->g->src_ip.ipv6.start; + ip6.ip6_dst = targ->g->dst_ip.ipv6.start; + } + memcpy(&udp, udp_ptr, sizeof(udp)); + + udp.uh_sport = htons(targ->g->src_ip.port0); + udp.uh_dport = htons(targ->g->dst_ip.port0); + udp.uh_ulen = htons(paylen); + if (targ->g->af == AF_INET) { + /* Magic: taken from sbin/dhclient/packet.c */ + udp.uh_sum = wrapsum( + checksum(&udp, sizeof(udp), /* udp header */ + checksum(pkt->ipv4.body, /* udp payload */ + paylen - sizeof(udp), + checksum(&pkt->ipv4.ip.ip_src, /* pseudo header */ + 2 * sizeof(pkt->ipv4.ip.ip_src), + IPPROTO_UDP + (u_int32_t)ntohs(udp.uh_ulen))))); + memcpy(&pkt->ipv4.ip, &ip, sizeof(ip)); + } else { + /* Save part of pseudo header checksum into csum */ + csum = IPPROTO_UDP << 24; + csum = checksum(&csum, sizeof(csum), paylen); + udp.uh_sum = wrapsum( + checksum(udp_ptr, sizeof(udp), /* udp header */ + checksum(pkt->ipv6.body, /* udp payload */ + paylen - sizeof(udp), + checksum(&pkt->ipv6.ip.ip6_src, /* pseudo header */ + 2 * sizeof(pkt->ipv6.ip.ip6_src), csum)))); + memcpy(&pkt->ipv6.ip, &ip6, sizeof(ip6)); + } + memcpy(udp_ptr, &udp, sizeof(udp)); + bzero(&pkt->vh, sizeof(pkt->vh)); // dump_payload((void *)pkt, targ->g->pkt_size, NULL, 0); } @@ -825,7 +1109,6 @@ } } - /* * create and enqueue a batch of packets on a ring. 
* On the last one set NS_REPORT to tell the driver to generate @@ -833,19 +1116,14 @@ */ static int send_packets(struct netmap_ring *ring, struct pkt *pkt, void *frame, - int size, struct glob_arg *g, u_int count, int options, - u_int nfrags) + int size, struct targ *t, u_int count, int options) { u_int n, sent, cur = ring->cur; - u_int fcnt; + u_int frags = t->frags; + u_int frag_size = t->frag_size; + struct netmap_slot *slot = &ring->slot[cur]; n = nm_ring_space(ring); - if (n < count) - count = n; - if (count < nfrags) { - D("truncating packet, no room for frags %d %d", - count, nfrags); - } #if 0 if (options & (OPT_COPY | OPT_PREFETCH) ) { for (sent = 0; sent < count; sent++) { @@ -858,42 +1136,64 @@ cur = ring->cur; } #endif - for (fcnt = nfrags, sent = 0; sent < count; sent++) { - struct netmap_slot *slot = &ring->slot[cur]; - char *p = NETMAP_BUF(ring, slot->buf_idx); - int buf_changed = slot->flags & NS_BUF_CHANGED; + for (sent = 0; sent < count && n >= frags; sent++, n--) { + char *p; + int buf_changed; + u_int tosend = size; + slot = &ring->slot[cur]; + p = NETMAP_BUF(ring, slot->buf_idx); + buf_changed = slot->flags & NS_BUF_CHANGED; + slot->flags = 0; if (options & OPT_RUBBISH) { /* do nothing */ } else if (options & OPT_INDIRECT) { slot->flags |= NS_INDIRECT; slot->ptr = (uint64_t)((uintptr_t)frame); - } else if ((options & OPT_COPY) || buf_changed) { - nm_pkt_copy(frame, p, size); - if (fcnt == nfrags) - update_addresses(pkt, g); - } else if (options & OPT_MEMCPY) { - memcpy(p, frame, size); - if (fcnt == nfrags) - update_addresses(pkt, g); + } else if (frags > 1) { + u_int i; + const char *f = frame; + char *fp = p; + for (i = 0; i < frags - 1; i++) { + memcpy(fp, f, frag_size); + slot->len = frag_size; + slot->flags = NS_MOREFRAG; + if (options & OPT_DUMP) + dump_payload(fp, frag_size, ring, cur); + tosend -= frag_size; + f += frag_size; + cur = nm_ring_next(ring, cur); + slot = &ring->slot[cur]; + fp = NETMAP_BUF(ring, slot->buf_idx); + } + n -= 
(frags - 1); + p = fp; + slot->flags = 0; + memcpy(p, f, tosend); + update_addresses(pkt, t); + } else if ((options & (OPT_COPY | OPT_MEMCPY)) || buf_changed) { + if (options & OPT_COPY) + nm_pkt_copy(frame, p, size); + else + memcpy(p, frame, size); + update_addresses(pkt, t); } else if (options & OPT_PREFETCH) { __builtin_prefetch(p); } + slot->len = tosend; if (options & OPT_DUMP) - dump_payload(p, size, ring, cur); - slot->len = size; - if (--fcnt > 0) - slot->flags |= NS_MOREFRAG; - else - fcnt = nfrags; - if (sent == count - 1) { - slot->flags &= ~NS_MOREFRAG; - slot->flags |= NS_REPORT; - } + dump_payload(p, tosend, ring, cur); cur = nm_ring_next(ring, cur); } - ring->head = ring->cur = cur; + if (sent) { + slot->flags |= NS_REPORT; + ring->head = ring->cur = cur; + } + if (sent < count) { + /* tell netmap that we need more slots */ + ring->cur = ring->tail; + } return (sent); } @@ -914,28 +1214,47 @@ } /* + * wait until ts, either busy or sleeping if more than 1ms. + * Return wakeup time. + */ +static struct timespec +wait_time(struct timespec ts) +{ + for (;;) { + struct timespec w, cur; + clock_gettime(CLOCK_REALTIME_PRECISE, &cur); + w = timespec_sub(ts, cur); + if (w.tv_sec < 0) + return cur; + else if (w.tv_sec > 0 || w.tv_nsec > 1000000) + poll(NULL, 0, 1); + } +} + +/* * Send a packet, and wait for a response. * The payload (after UDP header, ofs 42) has a 4-byte sequence * followed by a struct timeval (or bintime?) */ -#define PAY_OFS 42 /* where in the pkt... 
*/ static void * -pinger_body(void *data) +ping_body(void *data) { struct targ *targ = (struct targ *) data; struct pollfd pfd = { .fd = targ->fd, .events = POLLIN }; struct netmap_if *nifp = targ->nmd->nifp; - int i, rx = 0; + int i, m, rx = 0; void *frame; int size; struct timespec ts, now, last_print; + struct timespec nexttime = {0, 0}; /* silence compiler */ uint64_t sent = 0, n = targ->g->npackets; uint64_t count = 0, t_cur, t_min = ~0, av = 0; + uint64_t g_min = ~0, g_av = 0; uint64_t buckets[64]; /* bins for delays, ns */ + int rate_limit = targ->g->tx_rate, tosend = 0; - frame = &targ->pkt; - frame += sizeof(targ->pkt.vh) - targ->g->virt_header; + frame = (char*)&targ->pkt + sizeof(targ->pkt.vh) - targ->g->virt_header; size = targ->g->pkt_size + targ->g->virt_header; @@ -947,38 +1266,75 @@ bzero(&buckets, sizeof(buckets)); clock_gettime(CLOCK_REALTIME_PRECISE, &last_print); now = last_print; + if (rate_limit) { + targ->tic = timespec_add(now, (struct timespec){2,0}); + targ->tic.tv_nsec = 0; + wait_time(targ->tic); + nexttime = targ->tic; + } while (!targ->cancel && (n == 0 || sent < n)) { - struct netmap_ring *ring = NETMAP_TXRING(nifp, 0); + struct netmap_ring *ring = NETMAP_TXRING(nifp, targ->nmd->first_tx_ring); struct netmap_slot *slot; char *p; - for (i = 0; i < 1; i++) { /* XXX why the loop for 1 pkt ? 
*/ - slot = &ring->slot[ring->cur]; - slot->len = size; - p = NETMAP_BUF(ring, slot->buf_idx); + int rv; + uint64_t limit, event = 0; - if (nm_ring_empty(ring)) { - D("-- ouch, cannot send"); - } else { - struct tstamp *tp; - nm_pkt_copy(frame, p, size); - clock_gettime(CLOCK_REALTIME_PRECISE, &ts); - bcopy(&sent, p+42, sizeof(sent)); - tp = (struct tstamp *)(p+46); - tp->sec = (uint32_t)ts.tv_sec; - tp->nsec = (uint32_t)ts.tv_nsec; - sent++; - ring->head = ring->cur = nm_ring_next(ring, ring->cur); + if (rate_limit && tosend <= 0) { + tosend = targ->g->burst; + nexttime = timespec_add(nexttime, targ->g->tx_period); + wait_time(nexttime); } - } - /* should use a parameter to decide how often to send */ - if (poll(&pfd, 1, 3000) <= 0) { - D("poll error/timeout on queue %d: %s", targ->me, + + limit = rate_limit ? tosend : targ->g->burst; + if (n > 0 && n - sent < limit) + limit = n - sent; + for (m = 0; (unsigned)m < limit; m++) { + slot = &ring->slot[ring->cur]; + slot->len = size; + p = NETMAP_BUF(ring, slot->buf_idx); + + if (nm_ring_empty(ring)) { + D("-- ouch, cannot send"); + break; + } else { + struct tstamp *tp; + nm_pkt_copy(frame, p, size); + clock_gettime(CLOCK_REALTIME_PRECISE, &ts); + bcopy(&sent, p+42, sizeof(sent)); + tp = (struct tstamp *)(p+46); + tp->sec = (uint32_t)ts.tv_sec; + tp->nsec = (uint32_t)ts.tv_nsec; + sent++; + ring->head = ring->cur = nm_ring_next(ring, ring->cur); + } + } + if (m > 0) + event++; + targ->ctr.pkts = sent; + targ->ctr.bytes = sent*size; + targ->ctr.events = event; + if (rate_limit) + tosend -= m; +#ifdef BUSYWAIT + rv = ioctl(pfd.fd, NIOCTXSYNC, NULL); + if (rv < 0) { + D("TXSYNC error on queue %d: %s", targ->me, strerror(errno)); + } + again: + ioctl(pfd.fd, NIOCRXSYNC, NULL); +#else + /* should use a parameter to decide how often to send */ + if ( (rv = poll(&pfd, 1, 3000)) <= 0) { + D("poll error on queue %d: %s", targ->me, + (rv ? 
strerror(errno) : "timeout")); continue; } +#endif /* BUSYWAIT */ /* see what we got back */ - for (i = targ->nmd->first_tx_ring; - i <= targ->nmd->last_tx_ring; i++) { + rx = 0; + for (i = targ->nmd->first_rx_ring; + i <= targ->nmd->last_rx_ring; i++) { ring = NETMAP_RXRING(nifp, i); while (!nm_ring_empty(ring)) { uint32_t seq; @@ -999,7 +1355,8 @@ ts.tv_nsec += 1000000000; ts.tv_sec--; } - if (0) D("seq %d/%lu delta %d.%09d", seq, sent, + if (0) D("seq %d/%llu delta %d.%09d", seq, + (unsigned long long)sent, (int)ts.tv_sec, (int)ts.tv_nsec); t_cur = ts.tv_sec * 1000000000UL + ts.tv_nsec; if (t_cur < t_min) @@ -1024,7 +1381,7 @@ if (ts.tv_sec >= 1) { D("count %d RTT: min %d av %d ns", (int)count, (int)t_min, (int)(av/count)); - int k, j, kmin; + int k, j, kmin, off; char buf[512]; for (kmin = 0; kmin < 64; kmin ++) @@ -1034,17 +1391,33 @@ if (buckets[k]) break; buf[0] = '\0'; - for (j = kmin; j <= k; j++) - sprintf(buf, "%s %5d", buf, (int)buckets[j]); + off = 0; + for (j = kmin; j <= k; j++) { + off += sprintf(buf + off, " %5d", (int)buckets[j]); + } D("k: %d .. %d\n\t%s", 1<cancel) + goto again; +#endif /* BUSYWAIT */ } + if (sent > 0) { + D("RTT over %llu packets: min %d av %d ns", + (long long unsigned)sent, (int)g_min, + (int)((double)g_av/sent)); + } + targ->completed = 1; + /* reset the ``used`` flag. 
*/ targ->used = 0; @@ -1056,7 +1429,7 @@ * reply to ping requests */ static void * -ponger_body(void *data) +pong_body(void *data) { struct targ *targ = (struct targ *) data; struct pollfd pfd = { .fd = targ->fd, .events = POLLIN }; @@ -1069,20 +1442,23 @@ D("can only reply ping with 1 thread"); return NULL; } - D("understood ponger %lu but don't know how to do it", n); + if (n > 0) + D("understood ponger %llu but don't know how to do it", + (unsigned long long)n); while (!targ->cancel && (n == 0 || sent < n)) { uint32_t txcur, txavail; //#define BUSYWAIT #ifdef BUSYWAIT ioctl(pfd.fd, NIOCRXSYNC, NULL); #else - if (poll(&pfd, 1, 1000) <= 0) { - D("poll error/timeout on queue %d: %s", targ->me, - strerror(errno)); + int rv; + if ( (rv = poll(&pfd, 1, 1000)) <= 0) { + D("poll error on queue %d: %s", targ->me, + rv ? strerror(errno) : "timeout"); continue; } #endif - txring = NETMAP_TXRING(nifp, 0); + txring = NETMAP_TXRING(nifp, targ->nmd->first_tx_ring); txcur = txring->cur; txavail = nm_ring_space(txring); /* see what we got back */ @@ -1105,6 +1481,7 @@ dpkt = (uint16_t *)dst; spkt = (uint16_t *)src; nm_pkt_copy(src, dst, slot->len); + /* swap source and destination MAC */ dpkt[0] = spkt[3]; dpkt[1] = spkt[4]; dpkt[2] = spkt[5]; @@ -1112,7 +1489,6 @@ dpkt[4] = spkt[1]; dpkt[5] = spkt[2]; txring->slot[txcur].len = slot->len; - /* XXX swap src dst mac */ txcur = nm_ring_next(txring, txcur); txavail--; sent++; @@ -1126,6 +1502,8 @@ //D("tx %d rx %d", sent, rx); } + targ->completed = 1; + /* reset the ``used`` flag. */ targ->used = 0; @@ -1133,24 +1511,6 @@ } -/* - * wait until ts, either busy or sleeping if more than 1ms. - * Return wakeup time. 
- */ -static struct timespec -wait_time(struct timespec ts) -{ - for (;;) { - struct timespec w, cur; - clock_gettime(CLOCK_REALTIME_PRECISE, &cur); - w = timespec_sub(ts, cur); - if (w.tv_sec < 0) - return cur; - else if (w.tv_sec > 0 || w.tv_nsec > 1000000) - poll(NULL, 0, 1); - } -} - static void * sender_body(void *data) { @@ -1170,14 +1530,13 @@ int size; if (targ->frame == NULL) { - frame = pkt; - frame += sizeof(pkt->vh) - targ->g->virt_header; + frame = (char *)pkt + sizeof(pkt->vh) - targ->g->virt_header; size = targ->g->pkt_size + targ->g->virt_header; } else { frame = targ->frame; size = targ->g->pkt_size; } - + D("start, fd %d main_fd %d", targ->fd, targ->g->main_fd); if (setaffinity(targ->thread, targ->affinity)) goto quit; @@ -1190,13 +1549,13 @@ wait_time(targ->tic); nexttime = targ->tic; } - if (targ->g->dev_type == DEV_TAP) { + if (targ->g->dev_type == DEV_TAP) { D("writing to file desc %d", targ->g->main_fd); for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) { if (write(targ->g->main_fd, frame, size) != -1) sent++; - update_addresses(pkt, targ->g); + update_addresses(pkt, targ); if (i > 10000) { targ->ctr.pkts = sent; targ->ctr.bytes = sent*size; @@ -1211,7 +1570,7 @@ for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) { if (pcap_inject(p, frame, size) != -1) sent++; - update_addresses(pkt, targ->g); + update_addresses(pkt, targ); if (i > 10000) { targ->ctr.pkts = sent; targ->ctr.bytes = sent*size; @@ -1222,10 +1581,23 @@ #endif /* NO_PCAP */ } else { int tosend = 0; - int frags = targ->g->frags; + u_int bufsz, mtu = targ->g->mtu; nifp = targ->nmd->nifp; + txring = NETMAP_TXRING(nifp, targ->nmd->first_tx_ring); + bufsz = txring->nr_buf_size; + if (bufsz < mtu) + mtu = bufsz; + targ->frag_size = targ->g->pkt_size / targ->frags; + if (targ->frag_size > mtu) { + targ->frags = targ->g->pkt_size / mtu; + targ->frag_size = mtu; + if (targ->g->pkt_size % mtu != 0) + targ->frags++; + } + D("frags %u frag_size %u", targ->frags, targ->frag_size); 
while (!targ->cancel && (n == 0 || sent < n)) { + int rv; if (rate_limit && tosend <= 0) { tosend = targ->g->burst; @@ -1237,17 +1609,18 @@ * wait for available room in the send queue(s) */ #ifdef BUSYWAIT + (void)rv; if (ioctl(pfd.fd, NIOCTXSYNC, NULL) < 0) { D("ioctl error on queue %d: %s", targ->me, strerror(errno)); goto quit; } #else /* !BUSYWAIT */ - if (poll(&pfd, 1, 2000) <= 0) { + if ( (rv = poll(&pfd, 1, 2000)) <= 0) { if (targ->cancel) break; - D("poll error/timeout on queue %d: %s", targ->me, - strerror(errno)); + D("poll error on queue %d: %s", targ->me, + rv ? strerror(errno) : "timeout"); // goto quit; } if (pfd.revents & POLLERR) { @@ -1266,23 +1639,30 @@ for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) { int m; uint64_t limit = rate_limit ? tosend : targ->g->burst; + + if (n > 0 && n == sent) + break; + if (n > 0 && n - sent < limit) limit = n - sent; txring = NETMAP_TXRING(nifp, i); if (nm_ring_empty(txring)) continue; - if (frags > 1) - limit = ((limit + frags - 1) / frags) * frags; - m = send_packets(txring, pkt, frame, size, targ->g, - limit, options, frags); - ND("limit %d tail %d frags %d m %d", - limit, txring->tail, frags, m); + if (targ->g->pkt_min_size > 0) { + size = nrand48(targ->seed) % + (targ->g->pkt_size - targ->g->pkt_min_size) + + targ->g->pkt_min_size; + } + m = send_packets(txring, pkt, frame, size, targ, + limit, options); + ND("limit %lu tail %d m %d", + limit, txring->tail, m); sent += m; if (m > 0) //XXX-ste: can m be 0? 
event++; targ->ctr.pkts = sent; - targ->ctr.bytes = sent*size; + targ->ctr.bytes += m*size; targ->ctr.events = event; if (rate_limit) { tosend -= m; @@ -1292,10 +1672,12 @@ } } /* flush any remaining packets */ - D("flush tail %d head %d on thread %p", - txring->tail, txring->head, - (void *)pthread_self()); - ioctl(pfd.fd, NIOCTXSYNC, NULL); + if (txring != NULL) { + D("flush tail %d head %d on thread %p", + txring->tail, txring->head, + (void *)pthread_self()); + ioctl(pfd.fd, NIOCTXSYNC, NULL); + } /* final part: wait all the TX queues to be empty. */ for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) { @@ -1340,6 +1722,7 @@ { u_int cur, rx, n; uint64_t b = 0; + u_int complete = 0; if (bytes == NULL) bytes = &b; @@ -1355,12 +1738,14 @@ *bytes += slot->len; if (dump) dump_payload(p, slot->len, ring, cur); + if (!(slot->flags & NS_MOREFRAG)) + complete++; cur = nm_ring_next(ring, cur); } ring->head = ring->cur = cur; - return (rx); + return (complete); } static void * @@ -1373,8 +1758,7 @@ int i; struct my_ctrs cur; - cur.pkts = cur.bytes = cur.events = cur.min_space = 0; - cur.t.tv_usec = cur.t.tv_sec = 0; // unused, just silence the compiler + memset(&cur, 0, sizeof(cur)); if (setaffinity(targ->thread, targ->affinity)) goto quit; @@ -1386,6 +1770,14 @@ i = poll(&pfd, 1, 1000); if (i > 0 && !(pfd.revents & POLLERR)) break; + if (i < 0) { + D("poll() error: %s", strerror(errno)); + goto quit; + } + if (pfd.revents & POLLERR) { + D("fd error"); + goto quit; + } RD(1, "waiting for initial packets, poll returns %d %d", i, pfd.revents); } @@ -1408,7 +1800,7 @@ /* XXX should we poll ? */ pcap_dispatch(targ->g->p, targ->g->burst, receive_pcap, (u_char *)&targ->ctr); - targ->ctr.events++; + targ->ctr.events++; } #endif /* !NO_PCAP */ } else { @@ -1451,7 +1843,7 @@ m = receive_packets(rxring, targ->g->burst, dump, &cur.bytes); cur.pkts += m; - if (m > 0) //XXX-ste: can m be 0? 
+ if (m > 0) cur.events++; } cur.min_space = targ->ctr.min_space; @@ -1503,8 +1895,7 @@ D("Ignoring -n argument"); } - frame = pkt; - frame += sizeof(pkt->vh) - targ->g->virt_header; + frame = (char *)pkt + sizeof(pkt->vh) - targ->g->virt_header; size = targ->g->pkt_size + targ->g->virt_header; D("start, fd %d main_fd %d", targ->fd, targ->g->main_fd); @@ -1527,6 +1918,8 @@ unsigned int space; unsigned int head; int fcnt; + uint16_t sum = 0; + int rv; if (!rate_limit) { budget = targ->g->burst; @@ -1538,17 +1931,27 @@ } /* wait for available room in the send queue */ - if (poll(&pfd, 1, 2000) <= 0) { +#ifdef BUSYWAIT + (void)rv; + if (ioctl(pfd.fd, NIOCTXSYNC, NULL) < 0) { + D("ioctl error on queue %d: %s", targ->me, + strerror(errno)); + goto quit; + } +#else /* !BUSYWAIT */ + if ( (rv = poll(&pfd, 1, 2000)) <= 0) { if (targ->cancel) break; - D("poll error/timeout on queue %d: %s", targ->me, - strerror(errno)); + D("poll error on queue %d: %s", targ->me, + rv ? strerror(errno) : "timeout"); + // goto quit; } if (pfd.revents & POLLERR) { D("poll error on %d ring %d-%d", pfd.fd, targ->nmd->first_tx_ring, targ->nmd->last_tx_ring); goto quit; } +#endif /* !BUSYWAIT */ /* If no room poll() again. */ space = nm_ring_space(ring); @@ -1573,15 +1976,23 @@ sent < limit; sent++, sequence++) { struct netmap_slot *slot = &ring->slot[head]; char *p = NETMAP_BUF(ring, slot->buf_idx); + uint16_t *w = (uint16_t *)PKT(pkt, body, targ->g->af), t; + memcpy(&sum, targ->g->af == AF_INET ? 
&pkt->ipv4.udp.uh_sum : &pkt->ipv6.udp.uh_sum, sizeof(sum)); + slot->flags = 0; - pkt->body[0] = sequence >> 24; - pkt->body[1] = (sequence >> 16) & 0xff; - pkt->body[2] = (sequence >> 8) & 0xff; - pkt->body[3] = sequence & 0xff; + t = *w; + PKT(pkt, body, targ->g->af)[0] = sequence >> 24; + PKT(pkt, body, targ->g->af)[1] = (sequence >> 16) & 0xff; + sum = ~cksum_add(~sum, cksum_add(~t, *w)); + t = *++w; + PKT(pkt, body, targ->g->af)[2] = (sequence >> 8) & 0xff; + PKT(pkt, body, targ->g->af)[3] = sequence & 0xff; + sum = ~cksum_add(~sum, cksum_add(~t, *w)); + memcpy(targ->g->af == AF_INET ? &pkt->ipv4.udp.uh_sum : &pkt->ipv6.udp.uh_sum, &sum, sizeof(sum)); nm_pkt_copy(frame, p, size); if (fcnt == frags) { - update_addresses(pkt, targ->g); + update_addresses(pkt, targ); } if (options & OPT_DUMP) { @@ -1675,19 +2086,25 @@ int dump = targ->g->options & OPT_DUMP; struct netmap_ring *ring; unsigned int frags_exp = 1; - uint32_t seq_exp = 0; struct my_ctrs cur; unsigned int frags = 0; int first_packet = 1; int first_slot = 1; - int i; + int i, j, af, nrings; + uint32_t seq, *seq_exp = NULL; - cur.pkts = cur.bytes = cur.events = cur.min_space = 0; - cur.t.tv_usec = cur.t.tv_sec = 0; // unused, just silence the compiler + memset(&cur, 0, sizeof(cur)); if (setaffinity(targ->thread, targ->affinity)) goto quit; + nrings = targ->nmd->last_rx_ring - targ->nmd->first_rx_ring + 1; + seq_exp = calloc(nrings, sizeof(uint32_t)); + if (seq_exp == NULL) { + D("failed to allocate seq array"); + goto quit; + } + D("reading from %s fd %d main_fd %d", targ->g->ifname, targ->fd, targ->g->main_fd); /* unbounded wait for the first packet. */ @@ -1701,15 +2118,18 @@ clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic); - ring = NETMAP_RXRING(targ->nmd->nifp, targ->nmd->first_rx_ring); while (!targ->cancel) { unsigned int head; - uint32_t seq; int limit; - /* Once we started to receive packets, wait at most 1 seconds - before quitting. 
*/ +#ifdef BUSYWAIT + if (ioctl(pfd.fd, NIOCRXSYNC, NULL) < 0) { + D("ioctl error on queue %d: %s", targ->me, + strerror(errno)); + goto quit; + } +#else /* !BUSYWAIT */ if (poll(&pfd, 1, 1 * 1000) <= 0 && !targ->g->forever) { clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc); targ->toc.tv_sec -= 1; /* Subtract timeout time. */ @@ -1720,108 +2140,123 @@ D("poll err"); goto quit; } +#endif /* !BUSYWAIT */ - if (nm_ring_empty(ring)) - continue; + for (j = targ->nmd->first_rx_ring; j <= targ->nmd->last_rx_ring; j++) { + ring = NETMAP_RXRING(targ->nmd->nifp, j); + if (nm_ring_empty(ring)) + continue; - limit = nm_ring_space(ring); - if (limit > targ->g->burst) - limit = targ->g->burst; + limit = nm_ring_space(ring); + if (limit > targ->g->burst) + limit = targ->g->burst; #if 0 - /* Enable this if - * 1) we remove the early-return optimization from - * the netmap poll implementation, or - * 2) pipes get NS_MOREFRAG support. - * With the current netmap implementation, an experiment like - * pkt-gen -i vale:1{1 -f txseq -F 9 - * pkt-gen -i vale:1}1 -f rxseq - * would get stuck as soon as we find nm_ring_space(ring) < 9, - * since here limit is rounded to 0 and - * pipe rxsync is not called anymore by the poll() of this loop. - */ - if (frags_exp > 1) { - int o = limit; - /* Cut off to the closest smaller multiple. */ - limit = (limit / frags_exp) * frags_exp; - RD(2, "LIMIT %d --> %d", o, limit); - } + /* Enable this if + * 1) we remove the early-return optimization from + * the netmap poll implementation, or + * 2) pipes get NS_MOREFRAG support. + * With the current netmap implementation, an experiment like + * pkt-gen -i vale:1{1 -f txseq -F 9 + * pkt-gen -i vale:1}1 -f rxseq + * would get stuck as soon as we find nm_ring_space(ring) < 9, + * since here limit is rounded to 0 and + * pipe rxsync is not called anymore by the poll() of this loop. + */ + if (frags_exp > 1) { + int o = limit; + /* Cut off to the closest smaller multiple. 
*/ + limit = (limit / frags_exp) * frags_exp; + RD(2, "LIMIT %d --> %d", o, limit); + } #endif - for (head = ring->head, i = 0; i < limit; i++) { - struct netmap_slot *slot = &ring->slot[head]; - char *p = NETMAP_BUF(ring, slot->buf_idx); - int len = slot->len; - struct pkt *pkt; + for (head = ring->head, i = 0; i < limit; i++) { + struct netmap_slot *slot = &ring->slot[head]; + char *p = NETMAP_BUF(ring, slot->buf_idx); + int len = slot->len; + struct pkt *pkt; - if (dump) { - dump_payload(p, slot->len, ring, head); - } + if (dump) { + dump_payload(p, slot->len, ring, head); + } - frags++; - if (!(slot->flags & NS_MOREFRAG)) { - if (first_packet) { + frags++; + if (!(slot->flags & NS_MOREFRAG)) { + if (first_packet) { + first_packet = 0; + } else if (frags != frags_exp) { + char prbuf[512]; + RD(1, "Received packets with %u frags, " + "expected %u, '%s'", frags, frags_exp, + multi_slot_to_string(ring, head-frags+1, + frags, + prbuf, sizeof(prbuf))); + } first_packet = 0; - } else if (frags != frags_exp) { - char prbuf[512]; - RD(1, "Received packets with %u frags, " - "expected %u, '%s'", frags, frags_exp, - multi_slot_to_string(ring, head-frags+1, frags, - prbuf, sizeof(prbuf))); + frags_exp = frags; + frags = 0; } - first_packet = 0; - frags_exp = frags; - frags = 0; - } - p -= sizeof(pkt->vh) - targ->g->virt_header; - len += sizeof(pkt->vh) - targ->g->virt_header; - pkt = (struct pkt *)p; + p -= sizeof(pkt->vh) - targ->g->virt_header; + len += sizeof(pkt->vh) - targ->g->virt_header; + pkt = (struct pkt *)p; + if (ntohs(pkt->eh.ether_type) == ETHERTYPE_IP) + af = AF_INET; + else + af = AF_INET6; - if ((char *)pkt + len < ((char *)pkt->body) + sizeof(seq)) { - RD(1, "%s: packet too small (len=%u)", __func__, - slot->len); - } else { - seq = (pkt->body[0] << 24) | (pkt->body[1] << 16) - | (pkt->body[2] << 8) | pkt->body[3]; - if (first_slot) { - /* Grab the first one, whatever it - is. 
*/ - seq_exp = seq; - first_slot = 0; - } else if (seq != seq_exp) { - uint32_t delta = seq - seq_exp; + if ((char *)pkt + len < ((char *)PKT(pkt, body, af)) + + sizeof(seq)) { + RD(1, "%s: packet too small (len=%u)", __func__, + slot->len); + } else { + seq = (PKT(pkt, body, af)[0] << 24) | + (PKT(pkt, body, af)[1] << 16) | + (PKT(pkt, body, af)[2] << 8) | + PKT(pkt, body, af)[3]; + if (first_slot) { + /* Grab the first one, whatever it + is. */ + seq_exp[j] = seq; + first_slot = 0; + } else if (seq != seq_exp[j]) { + uint32_t delta = seq - seq_exp[j]; - if (delta < (0xFFFFFFFF >> 1)) { - RD(2, "Sequence GAP: exp %u found %u", - seq_exp, seq); - } else { - RD(2, "Sequence OUT OF ORDER: " - "exp %u found %u", seq_exp, seq); + if (delta < (0xFFFFFFFF >> 1)) { + RD(2, "Sequence GAP: exp %u found %u", + seq_exp[j], seq); + } else { + RD(2, "Sequence OUT OF ORDER: " + "exp %u found %u", seq_exp[j], seq); + } + seq_exp[j] = seq; } - seq_exp = seq; + seq_exp[j]++; } - seq_exp++; + + cur.bytes += slot->len; + head = nm_ring_next(ring, head); + cur.pkts++; } - cur.bytes += slot->len; - head = nm_ring_next(ring, head); - cur.pkts++; - } + ring->cur = ring->head = head; - ring->cur = ring->head = head; - - cur.events++; - targ->ctr = cur; + cur.events++; + targ->ctr = cur; + } } - clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc); +#ifndef BUSYWAIT out: +#endif /* !BUSYWAIT */ targ->completed = 1; targ->ctr = cur; quit: + if (seq_exp != NULL) + free(seq_exp); /* reset the ``used`` flag. 
*/ targ->used = 0; @@ -1830,7 +2265,7 @@ static void -tx_output(struct my_ctrs *cur, double delta, const char *msg) +tx_output(struct glob_arg *g, struct my_ctrs *cur, double delta, const char *msg) { double bw, raw_bw, pps, abs; char b1[40], b2[80], b3[80]; @@ -1854,51 +2289,156 @@ size = 60; pps = cur->pkts / delta; bw = (8.0 * cur->bytes) / delta; - /* raw packets have4 bytes crc + 20 bytes framing */ - raw_bw = (8.0 * (cur->pkts * 24 + cur->bytes)) / delta; + raw_bw = (8.0 * cur->bytes + cur->pkts * g->framing) / delta; abs = cur->pkts / (double)(cur->events); printf("Speed: %spps Bandwidth: %sbps (raw %sbps). Average batch: %.2f pkts\n", - norm(b1, pps), norm(b2, bw), norm(b3, raw_bw), abs); + norm(b1, pps, normalize), norm(b2, bw, normalize), norm(b3, raw_bw, normalize), abs); } static void -usage(void) +usage(int errcode) { +/* This usage is generated from the pkt-gen man page: + * $ man pkt-gen > x + * and pasted here adding the string terminators and endlines with simple + * regular expressions. 
*/ const char *cmd = "pkt-gen"; fprintf(stderr, "Usage:\n" "%s arguments\n" - "\t-i interface interface name\n" - "\t-f function tx rx ping pong txseq rxseq\n" - "\t-n count number of iterations (can be 0)\n" - "\t-t pkts_to_send also forces tx mode\n" - "\t-r pkts_to_receive also forces rx mode\n" - "\t-l pkt_size in bytes excluding CRC\n" - "\t-d dst_ip[:port[-dst_ip:port]] single or range\n" - "\t-s src_ip[:port[-src_ip:port]] single or range\n" - "\t-D dst-mac\n" - "\t-S src-mac\n" - "\t-a cpu_id use setaffinity\n" - "\t-b burst size testing, mostly\n" - "\t-c cores cores to use\n" - "\t-p threads processes/threads to use\n" - "\t-T report_ms milliseconds between reports\n" - "\t-w wait_for_link_time in seconds\n" - "\t-R rate in packets per second\n" - "\t-X dump payload\n" - "\t-H len add empty virtio-net-header with size 'len'\n" - "\t-E pipes allocate extra space for a number of pipes\n" - "\t-r do not touch the buffers (send rubbish)\n" - "\t-P file load packet from pcap file\n" - "\t-z use random IPv4 src address/port\n" - "\t-Z use random IPv4 dst address/port\n" - "\t-F num_frags send multi-slot packets\n" - "\t-A activate pps stats on receiver\n" - "", +" -h Show program usage and exit.\n" +"\n" +" -i interface\n" +" Name of the network interface that pkt-gen operates on. It can be a system network interface\n" +" (e.g., em0), the name of a vale(4) port (e.g., valeSSS:PPP), the name of a netmap pipe or\n" +" monitor, or any valid netmap port name accepted by the nm_open library function, as docu-\n" +" mented in netmap(4) (NIOCREGIF section).\n" +"\n" +" -f function\n" +" The function to be executed by pkt-gen. Specify tx for transmission, rx for reception, ping\n" +" for client-side ping-pong operation, and pong for server-side ping-pong operation.\n" +"\n" +" -n count\n" +" Number of iterations of the pkt-gen function (with 0 meaning infinite). In case of tx or rx,\n" +" count is the number of packets to receive or transmit. 
In case of ping or pong, count is the\n" +" number of ping-pong transactions.\n" +"\n" +" -l pkt_size\n" +" Packet size in bytes excluding CRC. If passed a second time, use random sizes larger or\n" +" equal than the second one and lower than the first one.\n" +"\n" +" -b burst_size\n" +" Transmit or receive up to burst_size packets at a time.\n" +"\n" +" -4 Use IPv4 addresses.\n" +"\n" +" -6 Use IPv6 addresses.\n" +"\n" +" -d dst_ip[:port[-dst_ip:port]]\n" +" Destination IPv4/IPv6 address and port, single or range.\n" +"\n" +" -s src_ip[:port[-src_ip:port]]\n" +" Source IPv4/IPv6 address and port, single or range.\n" +"\n" +" -D dst_mac\n" +" Destination MAC address in colon notation (e.g., aa:bb:cc:dd:ee:00).\n" +"\n" +" -S src_mac\n" +" Source MAC address in colon notation.\n" +"\n" +" -a cpu_id\n" +" Pin the first thread of pkt-gen to a particular CPU using pthread_setaffinity_np(3). If more\n" +" threads are used, they are pinned to the subsequent CPUs, one per thread.\n" +"\n" +" -c cpus\n" +" Maximum number of CPUs to use (0 means to use all the available ones).\n" +"\n" +" -p threads\n" +" Number of threads to use. By default, only a single thread is used to handle all the netmap\n" +" rings. If threads is larger than one, each thread handles a single TX ring (in tx mode), a\n" +" single RX ring (in rx mode), or a TX/RX ring couple. The number of threads must be less or\n" +" equal than the number of TX (or RX) ring available in the device specified by interface.\n" +"\n" +" -T report_ms\n" +" Number of milliseconds between reports.\n" +"\n" +" -w wait_for_link_time\n" +" Number of seconds to wait before starting the pkt-gen function, useful to make sure that the\n" +" network link is up. A network device driver may take some time to enter netmap mode, or to\n" +" create a new transmit/receive ring pair when netmap(4) requests one.\n" +"\n" +" -R rate\n" +" Packet transmission rate. 
Not setting the packet transmission rate tells pkt-gen to transmit\n" +" packets as quickly as possible. On servers from 2010 on-wards netmap(4) is able to com-\n" +" pletely use all of the bandwidth of a 10 or 40Gbps link, so this option should be used unless\n" +" your intention is to saturate the link.\n" +"\n" +" -X Dump payload of each packet transmitted or received.\n" +"\n" +" -H len Add empty virtio-net-header with size 'len'. Valid sizes are 0, 10 and 12. This option is\n" +" only used with Virtual Machine technologies that use virtio as a network interface.\n" +"\n" +" -P file\n" +" Load the packet to be transmitted from a pcap file rather than constructing it within\n" +" pkt-gen.\n" +"\n" +" -z Use random IPv4/IPv6 src address/port.\n" +"\n" +" -Z Use random IPv4/IPv6 dst address/port.\n" +"\n" +" -N Do not normalize units (i.e., use bps, pps instead of Mbps, Kpps, etc.).\n" +"\n" +" -F num_frags\n" +" Send multi-slot packets, each one with num_frags fragments. A multi-slot packet is repre-\n" +" sented by two or more consecutive netmap slots with the NS_MOREFRAG flag set (except for the\n" +" last slot). This is useful to transmit or receive packets larger than the netmap buffer\n" +" size.\n" +"\n" +" -M frag_size\n" +" In multi-slot mode, frag_size specifies the size of each fragment, if smaller than the packet\n" +" length divided by num_frags.\n" +"\n" +" -I Use indirect buffers. 
It is only valid for transmitting on VALE ports, and it is implemented\n" +" by setting the NS_INDIRECT flag in the netmap slots.\n" +"\n" +" -W Exit immediately if all the RX rings are empty the first time they are examined.\n" +"\n" +" -v Increase the verbosity level.\n" +"\n" +" -r In tx mode, do not initialize packets, but send whatever the content of the uninitialized\n" +" netmap buffers is (rubbish mode).\n" +"\n" +" -A Compute mean and standard deviation (over a sliding window) for the transmit or receive rate.\n" +"\n" +" -B Take Ethernet framing and CRC into account when computing the average bps. This adds 4 bytes\n" +" of CRC and 20 bytes of framing to each packet.\n" +"\n" +" -C tx_slots[,rx_slots[,tx_rings[,rx_rings]]]\n" +" Configuration in terms of number of rings and slots to be used when opening the netmap port.\n" +" Such configuration has effect on software ports created on the fly, such as VALE ports and\n" +" netmap pipes. The configuration may consist of 1 to 4 numbers separated by commas: tx_slots,\n" +" rx_slots, tx_rings, rx_rings. Missing numbers or zeroes stand for default values. As an\n" +" additional convenience, if exactly one number is specified, then this is assigned to both\n" +" tx_slots and rx_slots. 
If there is no fourth number, then the third one is assigned to both\n" +" tx_rings and rx_rings.\n" +"\n" +" -o options data generation options (parsed using atoi)\n" +" OPT_PREFETCH 1\n" +" OPT_ACCESS 2\n" +" OPT_COPY 4\n" +" OPT_MEMCPY 8\n" +" OPT_TS 16 (add a timestamp)\n" +" OPT_INDIRECT 32 (use indirect buffers)\n" +" OPT_DUMP 64 (dump rx/tx traffic)\n" +" OPT_RUBBISH 256\n" +" (send whatever the buffers contain)\n" +" OPT_RANDOM_SRC 512\n" +" OPT_RANDOM_DST 1024\n" +" OPT_PPS_STATS 2048\n" + "", cmd); - - exit(0); + exit(errcode); } enum { @@ -1908,67 +2448,76 @@ }; static void -start_threads(struct glob_arg *g) -{ +start_threads(struct glob_arg *g) { int i; targs = calloc(g->nthreads, sizeof(*targs)); + struct targ *t; /* * Now create the desired number of threads, each one * using a single descriptor. - */ + */ for (i = 0; i < g->nthreads; i++) { - struct targ *t = &targs[i]; + uint64_t seed = time(0) | (time(0) << 32); + t = &targs[i]; bzero(t, sizeof(*t)); t->fd = -1; /* default, with pcap */ t->g = g; + memcpy(t->seed, &seed, sizeof(t->seed)); - if (g->dev_type == DEV_NETMAP) { - struct nm_desc nmd = *g->nmd; /* copy, we overwrite ringid */ - uint64_t nmd_flags = 0; - nmd.self = &nmd; + if (g->dev_type == DEV_NETMAP) { + struct nm_desc nmd = *g->nmd; /* copy, we overwrite ringid */ + uint64_t nmd_flags = 0; + nmd.self = &nmd; - if (i > 0) { - /* the first thread uses the fd opened by the main - * thread, the other threads re-open /dev/netmap - */ - if (g->nthreads > 1) { - nmd.req.nr_flags = - g->nmd->req.nr_flags & ~NR_REG_MASK; - nmd.req.nr_flags |= NR_REG_ONE_NIC; - nmd.req.nr_ringid = i; - } - /* Only touch one of the rings (rx is already ok) */ - if (g->td_type == TD_TYPE_RECEIVER) - nmd_flags |= NETMAP_NO_TX_POLL; + if (i > 0) { + /* the first thread uses the fd opened by the main + * thread, the other threads re-open /dev/netmap + */ + if (g->nthreads > 1) { + nmd.req.nr_flags = + g->nmd->req.nr_flags & ~NR_REG_MASK; + nmd.req.nr_flags |= 
NR_REG_ONE_NIC; + nmd.req.nr_ringid = i; + } + /* Only touch one of the rings (rx is already ok) */ + if (g->td_type == TD_TYPE_RECEIVER) + nmd_flags |= NETMAP_NO_TX_POLL; - /* register interface. Override ifname and ringid etc. */ - t->nmd = nm_open(t->g->ifname, NULL, nmd_flags | - NM_OPEN_IFNAME | NM_OPEN_NO_MMAP, &nmd); - if (t->nmd == NULL) { - D("Unable to open %s: %s", - t->g->ifname, strerror(errno)); - continue; + /* register interface. Override ifname and ringid etc. */ + t->nmd = nm_open(t->g->ifname, NULL, nmd_flags | + NM_OPEN_IFNAME | NM_OPEN_NO_MMAP, &nmd); + if (t->nmd == NULL) { + D("Unable to open %s: %s", + t->g->ifname, strerror(errno)); + continue; + } + } else { + t->nmd = g->nmd; } + t->fd = t->nmd->fd; + t->frags = g->frags; } else { - t->nmd = g->nmd; + targs[i].fd = g->main_fd; } - t->fd = t->nmd->fd; - - } else { - targs[i].fd = g->main_fd; - } t->used = 1; t->me = i; if (g->affinity >= 0) { - t->affinity = (g->affinity + i) % g->system_cpus; + t->affinity = (g->affinity + i) % g->cpus; } else { t->affinity = -1; } /* default, init packets */ initialize_packet(t); + } + /* Wait for PHY reset. 
*/ + D("Wait %d secs for phy reset", g->wait_link); + sleep(g->wait_link); + D("Ready..."); + for (i = 0; i < g->nthreads; i++) { + t = &targs[i]; if (pthread_create(&t->thread, NULL, g->td_body, t) == -1) { D("Unable to create thread %d: %s", i, strerror(errno)); t->used = 0; @@ -1988,7 +2537,7 @@ prev.pkts = prev.bytes = prev.events = 0; gettimeofday(&prev.t, NULL); for (;;) { - char b1[40], b2[40], b3[40], b4[70]; + char b1[40], b2[40], b3[40], b4[100]; uint64_t pps, usec; struct my_ctrs x; double abs; @@ -2045,13 +2594,13 @@ ppsdev = sqrt(ppsdev); snprintf(b4, sizeof(b4), "[avg/std %s/%s pps]", - norm(b1, ppsavg), norm(b2, ppsdev)); + norm(b1, ppsavg, normalize), norm(b2, ppsdev, normalize)); } D("%spps %s(%spkts %sbps in %llu usec) %.2f avg_batch %d min_space", - norm(b1, pps), b4, - norm(b2, (double)x.pkts), - norm(b3, (double)x.bytes*8), + norm(b1, pps, normalize), b4, + norm(b2, (double)x.pkts, normalize), + norm(b3, (double)x.bytes*8+(double)x.pkts*g->framing, normalize), (unsigned long long)usec, abs, (int)cur.min_space); prev = cur; @@ -2105,25 +2654,26 @@ timersub(&toc, &tic, &toc); delta_t = toc.tv_sec + 1e-6* toc.tv_usec; if (g->td_type == TD_TYPE_SENDER) - tx_output(&cur, delta_t, "Sent"); - else - tx_output(&cur, delta_t, "Received"); + tx_output(g, &cur, delta_t, "Sent"); + else if (g->td_type == TD_TYPE_RECEIVER) + tx_output(g, &cur, delta_t, "Received"); } struct td_desc { int ty; char *key; void *f; + int default_burst; }; static struct td_desc func[] = { - { TD_TYPE_SENDER, "tx", sender_body }, - { TD_TYPE_RECEIVER, "rx", receiver_body }, - { TD_TYPE_OTHER, "ping", pinger_body }, - { TD_TYPE_OTHER, "pong", ponger_body }, - { TD_TYPE_SENDER, "txseq", txseq_body }, - { TD_TYPE_RECEIVER, "rxseq", rxseq_body }, - { 0, NULL, NULL } + { TD_TYPE_RECEIVER, "rx", receiver_body, 512}, /* default */ + { TD_TYPE_SENDER, "tx", sender_body, 512 }, + { TD_TYPE_OTHER, "ping", ping_body, 1 }, + { TD_TYPE_OTHER, "pong", pong_body, 1 }, + { TD_TYPE_SENDER, 
"txseq", txseq_body, 512 }, + { TD_TYPE_RECEIVER, "rxseq", rxseq_body, 512 }, + { 0, NULL, NULL, 0 } }; static int @@ -2165,7 +2715,12 @@ /* if a device name was specified, put it in the structure; otherwise, * the kernel will try to allocate the "next" device of the * specified type */ - strncpy(ifr.ifr_name, dev, IFNAMSIZ); + size_t len = strlen(dev); + if (len > IFNAMSIZ) { + D("%s too long", dev); + return -1; + } + memcpy(ifr.ifr_name, dev, len); } /* try to create the device */ @@ -2183,9 +2738,9 @@ D("new name is %s", dev); #endif /* linux */ - /* this is the special file descriptor that the caller will use to talk - * with the virtual interface */ - return fd; + /* this is the special file descriptor that the caller will use to talk + * with the virtual interface */ + return fd; } int @@ -2198,41 +2753,63 @@ struct glob_arg g; int ch; - int wait_link = 2; int devqueues = 1; /* how many device queues */ + int wait_link_arg = 0; + int pkt_size_done = 0; + + struct td_desc *fn = func; + bzero(&g, sizeof(g)); g.main_fd = -1; - g.td_body = receiver_body; - g.td_type = TD_TYPE_RECEIVER; + g.td_body = fn->f; + g.td_type = fn->ty; g.report_interval = 1000; /* report interval */ g.affinity = -1; /* ip addresses can also be a range x.x.x.x-x.x.x.y */ + g.af = AF_INET; /* default */ g.src_ip.name = "10.0.0.1"; g.dst_ip.name = "10.1.0.1"; g.dst_mac.name = "ff:ff:ff:ff:ff:ff"; g.src_mac.name = NULL; g.pkt_size = 60; - g.burst = 512; // default + g.pkt_min_size = 0; g.nthreads = 1; - g.cpus = 1; // default + g.cpus = 1; /* default */ g.forever = 1; g.tx_rate = 0; - g.frags = 1; + g.frags =1; + g.mtu = 1500; g.nmr_config = ""; g.virt_header = 0; + g.wait_link = 2; /* wait 2 seconds for physical ports */ - while ( (ch = getopt(arc, argv, - "a:f:F:n:i:Il:d:s:D:S:b:c:o:p:T:w:WvR:XC:H:e:E:m:rP:zZA")) != -1) { - struct td_desc *fn; + while ((ch = getopt(arc, argv, "46a:f:F:Nn:i:Il:d:s:D:S:b:c:o:p:" + "T:w:WvR:XC:H:rP:zZAhBM:")) != -1) { switch(ch) { default: D("bad option %c 
%s", ch, optarg); - usage(); + usage(-1); break; + case 'h': + usage(0); + break; + + case '4': + g.af = AF_INET; + break; + + case '6': + g.af = AF_INET6; + break; + + case 'N': + normalize = 0; + break; + case 'n': g.npackets = strtoull(optarg, NULL, 10); break; @@ -2246,6 +2823,10 @@ g.frags = i; break; + case 'M': + g.mtu = atoi(optarg); + break; + case 'f': for (fn = func; fn->key; fn++) { if (!strcmp(fn->key, optarg)) @@ -2260,7 +2841,7 @@ break; case 'o': /* data generation options */ - g.options = atoi(optarg); + g.options |= atoi(optarg); break; case 'a': /* force affinity */ @@ -2298,11 +2879,16 @@ break; case 'I': - g.options |= OPT_INDIRECT; /* XXX use indirect buffer */ + g.options |= OPT_INDIRECT; /* use indirect buffers */ break; case 'l': /* pkt_size */ - g.pkt_size = atoi(optarg); + if (pkt_size_done) { + g.pkt_min_size = atoi(optarg); + } else { + g.pkt_size = atoi(optarg); + pkt_size_done = 1; + } break; case 'd': @@ -2318,11 +2904,12 @@ break; case 'w': - wait_link = atoi(optarg); + g.wait_link = atoi(optarg); + wait_link_arg = 1; break; - case 'W': /* XXX changed default */ - g.forever = 0; /* do not exit rx even with no traffic */ + case 'W': + g.forever = 0; /* exit RX with no traffic */ break; case 'b': /* burst */ @@ -2357,18 +2944,9 @@ case 'H': g.virt_header = atoi(optarg); break; - case 'e': /* extra bufs */ - g.extra_bufs = atoi(optarg); - break; - case 'E': - g.extra_pipes = atoi(optarg); - break; case 'P': g.packet_file = strdup(optarg); break; - case 'm': - /* ignored */ - break; case 'r': g.options |= OPT_RUBBISH; break; @@ -2381,28 +2959,47 @@ case 'A': g.options |= OPT_PPS_STATS; break; + case 'B': + /* raw packets have 4 bytes crc + 20 bytes framing */ + // XXX maybe add an option to pass the IFG + g.framing = 24 * 8; + break; } } if (strlen(g.ifname) <=0 ) { D("missing ifname"); - usage(); + usage(-1); } + if (g.burst == 0) { + g.burst = fn->default_burst; + D("using default burst size: %d", g.burst); + } + g.system_cpus = i = 
system_ncpus(); if (g.cpus < 0 || g.cpus > i) { D("%d cpus is too high, have only %d cpus", g.cpus, i); - usage(); + usage(-1); } -D("running on %d cpus (have %d)", g.cpus, i); + D("running on %d cpus (have %d)", g.cpus, i); if (g.cpus == 0) g.cpus = i; + if (!wait_link_arg && !strncmp(g.ifname, "vale", 4)) { + g.wait_link = 0; + } + if (g.pkt_size < 16 || g.pkt_size > MAX_PKTSIZE) { D("bad pktsize %d [16..%d]\n", g.pkt_size, MAX_PKTSIZE); - usage(); + usage(-1); } + if (g.pkt_min_size > 0 && (g.pkt_min_size < 16 || g.pkt_min_size > g.pkt_size)) { + D("bad pktminsize %d [16..%d]\n", g.pkt_min_size, g.pkt_size); + usage(-1); + } + if (g.src_mac.name == NULL) { static char mybuf[20] = "00:00:00:00:00:00"; /* retrieve source mac address. */ @@ -2413,21 +3010,15 @@ g.src_mac.name = mybuf; } /* extract address ranges */ - extract_ip_range(&g.src_ip); - extract_ip_range(&g.dst_ip); - extract_mac_range(&g.src_mac); - extract_mac_range(&g.dst_mac); + if (extract_mac_range(&g.src_mac) || extract_mac_range(&g.dst_mac)) + usage(-1); + g.options |= extract_ip_range(&g.src_ip, g.af); + g.options |= extract_ip_range(&g.dst_ip, g.af); - if (g.src_ip.start != g.src_ip.end || - g.src_ip.port0 != g.src_ip.port1 || - g.dst_ip.start != g.dst_ip.end || - g.dst_ip.port0 != g.dst_ip.port1) - g.options |= OPT_COPY; - if (g.virt_header != 0 && g.virt_header != VIRT_HDR_1 && g.virt_header != VIRT_HDR_2) { D("bad virtio-net-header length"); - usage(); + usage(-1); } if (g.dev_type == DEV_TAP) { @@ -2435,7 +3026,7 @@ g.main_fd = tap_alloc(g.ifname); if (g.main_fd < 0) { D("cannot open tap %s", g.ifname); - usage(); + usage(-1); } #ifndef NO_PCAP } else if (g.dev_type == DEV_PCAP) { @@ -2445,7 +3036,7 @@ g.p = pcap_open_live(g.ifname, 256 /* XXX */, 1, 100, pcap_errbuf); if (g.p == NULL) { D("cannot open pcap on %s", g.ifname); - usage(); + usage(-1); } g.main_fd = pcap_fileno(g.p); D("using pcap on %s fileno %d", g.ifname, g.main_fd); @@ -2453,20 +3044,21 @@ } else if (g.dummy_send) { /* but 
DEV_NETMAP */ D("using a dummy send routine"); } else { - struct nmreq base_nmd; + struct nm_desc base_nmd; + char errmsg[MAXERRMSG]; + u_int flags; bzero(&base_nmd, sizeof(base_nmd)); - parse_nmr_config(g.nmr_config, &base_nmd); - if (g.extra_bufs) { - base_nmd.nr_arg3 = g.extra_bufs; - } - if (g.extra_pipes) { - base_nmd.nr_arg1 = g.extra_pipes; - } + parse_nmr_config(g.nmr_config, &base_nmd.req); - base_nmd.nr_flags |= NR_ACCEPT_VNET_HDR; + base_nmd.req.nr_flags |= NR_ACCEPT_VNET_HDR; + if (nm_parse(g.ifname, &base_nmd, errmsg) < 0) { + D("Invalid name '%s': %s", g.ifname, errmsg); + goto out; + } + /* * Open the netmap device using nm_open(). * @@ -2474,28 +3066,21 @@ * which in turn may take some time for the PHY to * reconfigure. We do the open here to have time to reset. */ - g.nmd = nm_open(g.ifname, &base_nmd, 0, NULL); + flags = NM_OPEN_IFNAME | NM_OPEN_ARG1 | NM_OPEN_ARG2 | + NM_OPEN_ARG3 | NM_OPEN_RING_CFG; + if (g.nthreads > 1) { + base_nmd.req.nr_flags &= ~NR_REG_MASK; + base_nmd.req.nr_flags |= NR_REG_ONE_NIC; + base_nmd.req.nr_ringid = 0; + } + g.nmd = nm_open(g.ifname, NULL, flags, &base_nmd); if (g.nmd == NULL) { D("Unable to open %s: %s", g.ifname, strerror(errno)); goto out; } - - if (g.nthreads > 1) { - struct nm_desc saved_desc = *g.nmd; - saved_desc.self = &saved_desc; - saved_desc.mem = NULL; - nm_close(g.nmd); - saved_desc.req.nr_flags &= ~NR_REG_MASK; - saved_desc.req.nr_flags |= NR_REG_ONE_NIC; - saved_desc.req.nr_ringid = 0; - g.nmd = nm_open(g.ifname, &base_nmd, NM_OPEN_IFNAME, &saved_desc); - if (g.nmd == NULL) { - D("Unable to open %s: %s", g.ifname, strerror(errno)); - goto out; - } - } g.main_fd = g.nmd->fd; - D("mapped %dKB at %p", g.nmd->req.nr_memsize>>10, g.nmd->mem); + D("mapped %luKB at %p", (unsigned long)(g.nmd->req.nr_memsize>>10), + g.nmd->mem); if (g.virt_header) { /* Set the virtio-net header length, since the user asked @@ -2558,7 +3143,7 @@ /* Exit if something went wrong. 
*/ if (g.main_fd < 0) { D("aborting"); - usage(); + usage(-1); } } @@ -2583,8 +3168,8 @@ int lim = (g.tx_rate)/300; if (g.burst > lim) g.burst = lim; - if (g.burst < g.frags) - g.burst = g.frags; + if (g.burst == 0) + g.burst = 1; x = ((uint64_t)1000000000 * (uint64_t)g.burst) / (uint64_t) g.tx_rate; g.tx_period.tv_nsec = x; g.tx_period.tv_sec = g.tx_period.tv_nsec / 1000000000; @@ -2593,11 +3178,6 @@ if (g.td_type == TD_TYPE_SENDER) D("Sending %d packets every %ld.%09ld s", g.burst, g.tx_period.tv_sec, g.tx_period.tv_nsec); - /* Wait for PHY reset. */ - D("Wait %d secs for phy reset", wait_link); - sleep(wait_link); - D("Ready..."); - /* Install ^C handler. */ global_nthreads = g.nthreads; sigemptyset(&ss); @@ -2608,6 +3188,7 @@ } start_threads(&g); /* Install the handler and re-enable SIGINT for the main thread */ + memset(&sa, 0, sizeof(sa)); sa.sa_handler = sigint_h; if (sigaction(SIGINT, &sa, NULL) < 0) { D("failed to install ^C handler: %s", strerror(errno));