diff --git a/tools/tools/netmap/pkt-gen.c b/tools/tools/netmap/pkt-gen.c index c6bae50188e2..c6cf78ad85ee 100644 --- a/tools/tools/netmap/pkt-gen.c +++ b/tools/tools/netmap/pkt-gen.c @@ -1,3299 +1,3303 @@ /* * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. * Copyright (C) 2013-2015 Universita` di Pisa. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $FreeBSD$ * $Id: pkt-gen.c 12346 2013-06-12 17:36:25Z luigi $ * * Example program to show how to build a multithreaded packet * source/sink using the netmap device. * * In this example we create a programmable number of threads * to take care of all the queues of the interface used to * send or receive traffic. * */ #define _GNU_SOURCE /* for CPU_SET() */ #include /* ntohs */ #include #include // isprint() #include #include #include /* getifaddrs */ #include #include #include #include #include #include #include #ifndef NO_PCAP #include #endif #include #include #include #include #include #include #include #include #if !defined(_WIN32) && !defined(linux) #include /* sysctl */ #endif #include #include // sysconf() #ifdef linux #define IPV6_VERSION 0x60 #define IPV6_DEFHLIM 64 #endif #include "ctrs.h" static void usage(int); #ifdef _WIN32 #define cpuset_t DWORD_PTR //uint64_t static inline void CPU_ZERO(cpuset_t *p) { *p = 0; } static inline void CPU_SET(uint32_t i, cpuset_t *p) { *p |= 1<< (i & 0x3f); } #define pthread_setaffinity_np(a, b, c) !SetThreadAffinityMask(a, *c) //((void)a, 0) #define TAP_CLONEDEV "/dev/tap" #define AF_LINK 18 //defined in winsocks.h #define CLOCK_REALTIME_PRECISE CLOCK_REALTIME #include /* * Convert an ASCII representation of an ethernet address to * binary form. */ struct ether_addr * ether_aton(const char *a) { int i; static struct ether_addr o; unsigned int o0, o1, o2, o3, o4, o5; i = sscanf(a, "%x:%x:%x:%x:%x:%x", &o0, &o1, &o2, &o3, &o4, &o5); if (i != 6) return (NULL); o.octet[0]=o0; o.octet[1]=o1; o.octet[2]=o2; o.octet[3]=o3; o.octet[4]=o4; o.octet[5]=o5; return ((struct ether_addr *)&o); } /* * Convert a binary representation of an ethernet address to * an ASCII string. */ char * ether_ntoa(const struct ether_addr *n) { int i; static char a[18]; i = sprintf(a, "%02x:%02x:%02x:%02x:%02x:%02x", n->octet[0], n->octet[1], n->octet[2], n->octet[3], n->octet[4], n->octet[5]); return (i < 17 ? NULL : (char *)&a); } #endif /* _WIN32 */ #ifdef linux #define cpuset_t cpu_set_t #define ifr_flagshigh ifr_flags /* only the low 16 bits here */ #define IFF_PPROMISC IFF_PROMISC /* IFF_PPROMISC does not exist */ #include #include #define CLOCK_REALTIME_PRECISE CLOCK_REALTIME #include /* ether_aton */ #include /* sockaddr_ll */ #endif /* linux */ #ifdef __FreeBSD__ #include /* le64toh */ #include #include /* pthread w/ affinity */ #include /* cpu_set */ #include /* LLADDR */ #endif /* __FreeBSD__ */ #ifdef __APPLE__ #define cpuset_t uint64_t // XXX static inline void CPU_ZERO(cpuset_t *p) { *p = 0; } static inline void CPU_SET(uint32_t i, cpuset_t *p) { *p |= 1<< (i & 0x3f); } #define pthread_setaffinity_np(a, b, c) ((void)a, 0) #define ifr_flagshigh ifr_flags // XXX #define IFF_PPROMISC IFF_PROMISC #include /* LLADDR */ #define clock_gettime(a,b) \ do {struct timespec t0 = {0,0}; *(b) = t0; } while (0) #endif /* __APPLE__ */ static const char *default_payload = "netmap pkt-gen DIRECT payload\n" "http://info.iet.unipi.it/~luigi/netmap/ "; static const char *indirect_payload = "netmap pkt-gen indirect payload\n" "http://info.iet.unipi.it/~luigi/netmap/ "; static int verbose = 0; static int normalize = 1; #define VIRT_HDR_1 10 /* length of a base vnet-hdr */ #define VIRT_HDR_2 12 /* length of the extenede vnet-hdr */ #define VIRT_HDR_MAX VIRT_HDR_2 struct virt_header { uint8_t fields[VIRT_HDR_MAX]; }; #define MAX_BODYSIZE 65536 struct pkt { struct virt_header vh; struct ether_header eh; union { struct { struct ip ip; struct udphdr udp; uint8_t body[MAX_BODYSIZE]; /* hardwired */ } ipv4; struct { struct ip6_hdr ip; struct udphdr udp; uint8_t body[MAX_BODYSIZE]; /* hardwired */ } ipv6; }; } __attribute__((__packed__)); #define PKT(p, f, af) \ ((af) == AF_INET ? (p)->ipv4.f: (p)->ipv6.f) struct ip_range { const char *name; union { struct { uint32_t start, end; /* same as struct in_addr */ } ipv4; struct { struct in6_addr start, end; uint8_t sgroup, egroup; } ipv6; }; uint16_t port0, port1; }; struct mac_range { const char *name; struct ether_addr start, end; }; /* ifname can be netmap:foo-xxxx */ #define MAX_IFNAMELEN 512 /* our buffer for ifname */ //#define MAX_PKTSIZE 1536 #define MAX_PKTSIZE MAX_BODYSIZE /* XXX: + IP_HDR + ETH_HDR */ /* compact timestamp to fit into 60 byte packet. (enough to obtain RTT) */ struct tstamp { uint32_t sec; uint32_t nsec; }; /* * global arguments for all threads */ struct glob_arg { int af; /* address family AF_INET/AF_INET6 */ struct ip_range src_ip; struct ip_range dst_ip; struct mac_range dst_mac; struct mac_range src_mac; int pkt_size; int pkt_min_size; int burst; int forever; uint64_t npackets; /* total packets to send */ int frags; /* fragments per packet */ u_int frag_size; /* size of each fragment */ int nthreads; int cpus; /* cpus used for running */ int system_cpus; /* cpus on the system */ int options; /* testing */ #define OPT_PREFETCH 1 #define OPT_ACCESS 2 #define OPT_COPY 4 #define OPT_MEMCPY 8 #define OPT_TS 16 /* add a timestamp */ #define OPT_INDIRECT 32 /* use indirect buffers, tx only */ #define OPT_DUMP 64 /* dump rx/tx traffic */ #define OPT_RUBBISH 256 /* send whatever the buffers contain */ #define OPT_RANDOM_SRC 512 #define OPT_RANDOM_DST 1024 #define OPT_PPS_STATS 2048 int dev_type; #ifndef NO_PCAP pcap_t *p; #endif int tx_rate; struct timespec tx_period; int affinity; int main_fd; struct nmport_d *nmd; uint32_t orig_mode; int report_interval; /* milliseconds between prints */ void *(*td_body)(void *); int td_type; void *mmap_addr; char ifname[MAX_IFNAMELEN]; const char *nmr_config; int dummy_send; int virt_header; /* send also the virt_header */ char *packet_file; /* -P option */ #define STATS_WIN 15 int win_idx; int64_t win[STATS_WIN]; int wait_link; int framing; /* #bits of framing (for bw output) */ }; enum dev_type { DEV_NONE, DEV_NETMAP, DEV_PCAP, DEV_TAP }; enum { TD_TYPE_SENDER = 1, TD_TYPE_RECEIVER, TD_TYPE_OTHER, }; /* * Arguments for a new thread. The same structure is used by * the source and the sink */ struct targ { struct glob_arg *g; int used; int completed; int cancel; int fd; struct nmport_d *nmd; /* these ought to be volatile, but they are * only sampled and errors should not accumulate */ struct my_ctrs ctr; struct timespec tic, toc; int me; pthread_t thread; int affinity; struct pkt pkt; void *frame; uint16_t seed[3]; u_int frags; u_int frag_size; }; static __inline uint16_t cksum_add(uint16_t sum, uint16_t a) { uint16_t res; res = sum + a; return (res + (res < a)); } static void extract_ipv4_addr(char *name, uint32_t *addr, uint16_t *port) { struct in_addr a; char *pp; pp = strchr(name, ':'); if (pp != NULL) { /* do we have ports ? */ *pp++ = '\0'; *port = (uint16_t)strtol(pp, NULL, 0); } inet_pton(AF_INET, name, &a); *addr = ntohl(a.s_addr); } static void extract_ipv6_addr(char *name, struct in6_addr *addr, uint16_t *port, uint8_t *group) { char *pp; /* * We accept IPv6 address in the following form: * group@[2001:DB8::1001]:port (w/ brackets and port) * group@[2001:DB8::1] (w/ brackets and w/o port) * group@2001:DB8::1234 (w/o brackets and w/o port) */ pp = strchr(name, '@'); if (pp != NULL) { *pp++ = '\0'; *group = (uint8_t)strtol(name, NULL, 0); if (*group > 7) *group = 7; name = pp; } if (name[0] == '[') name++; pp = strchr(name, ']'); if (pp != NULL) *pp++ = '\0'; if (pp != NULL && *pp != ':') pp = NULL; if (pp != NULL) { /* do we have ports ? */ *pp++ = '\0'; *port = (uint16_t)strtol(pp, NULL, 0); } inet_pton(AF_INET6, name, addr); } /* * extract the extremes from a range of ipv4 addresses. * addr_lo[-addr_hi][:port_lo[-port_hi]] */ static int extract_ip_range(struct ip_range *r, int af) { char *name, *ap, start[INET6_ADDRSTRLEN]; char end[INET6_ADDRSTRLEN]; struct in_addr a; uint32_t tmp; if (verbose) D("extract IP range from %s", r->name); name = strdup(r->name); if (name == NULL) { D("strdup failed"); usage(-1); } /* the first - splits start/end of range */ ap = strchr(name, '-'); if (ap != NULL) *ap++ = '\0'; r->port0 = 1234; /* default port */ if (af == AF_INET6) { r->ipv6.sgroup = 7; /* default group */ extract_ipv6_addr(name, &r->ipv6.start, &r->port0, &r->ipv6.sgroup); } else extract_ipv4_addr(name, &r->ipv4.start, &r->port0); r->port1 = r->port0; if (af == AF_INET6) { if (ap != NULL) { r->ipv6.egroup = r->ipv6.sgroup; extract_ipv6_addr(ap, &r->ipv6.end, &r->port1, &r->ipv6.egroup); } else { r->ipv6.end = r->ipv6.start; r->ipv6.egroup = r->ipv6.sgroup; } } else { if (ap != NULL) { extract_ipv4_addr(ap, &r->ipv4.end, &r->port1); if (r->ipv4.start > r->ipv4.end) { tmp = r->ipv4.end; r->ipv4.end = r->ipv4.start; r->ipv4.start = tmp; } } else r->ipv4.end = r->ipv4.start; } if (r->port0 > r->port1) { tmp = r->port0; r->port0 = r->port1; r->port1 = tmp; } if (af == AF_INET) { a.s_addr = htonl(r->ipv4.start); inet_ntop(af, &a, start, sizeof(start)); a.s_addr = htonl(r->ipv4.end); inet_ntop(af, &a, end, sizeof(end)); } else { inet_ntop(af, &r->ipv6.start, start, sizeof(start)); inet_ntop(af, &r->ipv6.end, end, sizeof(end)); } if (af == AF_INET) D("range is %s:%d to %s:%d", start, r->port0, end, r->port1); else D("range is %d@[%s]:%d to %d@[%s]:%d", r->ipv6.sgroup, start, r->port0, r->ipv6.egroup, end, r->port1); free(name); if (r->port0 != r->port1 || (af == AF_INET && r->ipv4.start != r->ipv4.end) || (af == AF_INET6 && !IN6_ARE_ADDR_EQUAL(&r->ipv6.start, &r->ipv6.end))) return (OPT_COPY); return (0); } static int extract_mac_range(struct mac_range *r) { struct ether_addr *e; if (verbose) D("extract MAC range from %s", r->name); e = ether_aton(r->name); if (e == NULL) { D("invalid MAC address '%s'", r->name); return 1; } bcopy(e, &r->start, 6); bcopy(e, &r->end, 6); #if 0 bcopy(targ->src_mac, eh->ether_shost, 6); p = index(targ->g->src_mac, '-'); if (p) targ->src_mac_range = atoi(p+1); bcopy(ether_aton(targ->g->dst_mac), targ->dst_mac, 6); bcopy(targ->dst_mac, eh->ether_dhost, 6); p = index(targ->g->dst_mac, '-'); if (p) targ->dst_mac_range = atoi(p+1); #endif if (verbose) D("%s starts at %s", r->name, ether_ntoa(&r->start)); return 0; } static int get_if_mtu(const struct glob_arg *g) { struct ifreq ifreq; int s, ret; const char *ifname = g->nmd->hdr.nr_name; size_t len; if (!strncmp(g->ifname, "netmap:", 7) && !strchr(ifname, '{') && !strchr(ifname, '}')) { len = strlen(ifname); if (len > IFNAMSIZ) { D("'%s' too long, cannot ask for MTU", ifname); return -1; } s = socket(AF_INET, SOCK_DGRAM, 0); if (s < 0) { D("socket() failed: %s", strerror(errno)); return s; } memset(&ifreq, 0, sizeof(ifreq)); memcpy(ifreq.ifr_name, ifname, len); ret = ioctl(s, SIOCGIFMTU, &ifreq); if (ret) { D("ioctl(SIOCGIFMTU) failed: %s", strerror(errno)); } close(s); return ifreq.ifr_mtu; } /* This is a pipe or a VALE port, where the MTU is very large, * so we use some practical limit. */ return 65536; } static struct targ *targs; static int global_nthreads; /* control-C handler */ static void sigint_h(int sig) { int i; (void)sig; /* UNUSED */ D("received control-C on thread %p", (void *)pthread_self()); for (i = 0; i < global_nthreads; i++) { targs[i].cancel = 1; } } /* sysctl wrapper to return the number of active CPUs */ static int system_ncpus(void) { int ncpus; #if defined (__FreeBSD__) int mib[2] = { CTL_HW, HW_NCPU }; size_t len = sizeof(mib); sysctl(mib, 2, &ncpus, &len, NULL, 0); #elif defined(linux) ncpus = sysconf(_SC_NPROCESSORS_ONLN); #elif defined(_WIN32) { SYSTEM_INFO sysinfo; GetSystemInfo(&sysinfo); ncpus = sysinfo.dwNumberOfProcessors; } #else /* others */ ncpus = 1; #endif /* others */ return (ncpus); } #ifdef __linux__ #define sockaddr_dl sockaddr_ll #define sdl_family sll_family #define AF_LINK AF_PACKET #define LLADDR(s) s->sll_addr; #include #define TAP_CLONEDEV "/dev/net/tun" #endif /* __linux__ */ #ifdef __FreeBSD__ #include #define TAP_CLONEDEV "/dev/tap" #endif /* __FreeBSD */ #ifdef __APPLE__ // #warning TAP not supported on apple ? #include #define TAP_CLONEDEV "/dev/tap" #endif /* __APPLE__ */ /* * parse the vale configuration in conf and put it in nmr. * Return the flag set if necessary. * The configuration may consist of 1 to 4 numbers separated * by commas: #tx-slots,#rx-slots,#tx-rings,#rx-rings. * Missing numbers or zeroes stand for default values. * As an additional convenience, if exactly one number * is specified, then this is assigned to both #tx-slots and #rx-slots. * If there is no 4th number, then the 3rd is assigned to both #tx-rings * and #rx-rings. */ static int parse_nmr_config(const char* conf, struct nmreq_register *nmr) { char *w, *tok; int i, v; if (conf == NULL || ! *conf) return 0; nmr->nr_tx_rings = nmr->nr_rx_rings = 0; nmr->nr_tx_slots = nmr->nr_rx_slots = 0; w = strdup(conf); for (i = 0, tok = strtok(w, ","); tok; i++, tok = strtok(NULL, ",")) { v = atoi(tok); switch (i) { case 0: nmr->nr_tx_slots = nmr->nr_rx_slots = v; break; case 1: nmr->nr_rx_slots = v; break; case 2: nmr->nr_tx_rings = nmr->nr_rx_rings = v; break; case 3: nmr->nr_rx_rings = v; break; default: D("ignored config: %s", tok); break; } } D("txr %d txd %d rxr %d rxd %d", nmr->nr_tx_rings, nmr->nr_tx_slots, nmr->nr_rx_rings, nmr->nr_rx_slots); free(w); return 0; } /* * locate the src mac address for our interface, put it * into the user-supplied buffer. return 0 if ok, -1 on error. */ static int source_hwaddr(const char *ifname, char *buf) { struct ifaddrs *ifaphead, *ifap; if (getifaddrs(&ifaphead) != 0) { D("getifaddrs %s failed", ifname); return (-1); } + /* remove 'netmap:' prefix before comparing interfaces */ + if (!strncmp(ifname, "netmap:", 7)) + ifname = &ifname[7]; + for (ifap = ifaphead; ifap; ifap = ifap->ifa_next) { struct sockaddr_dl *sdl = (struct sockaddr_dl *)ifap->ifa_addr; uint8_t *mac; if (!sdl || sdl->sdl_family != AF_LINK) continue; if (strncmp(ifap->ifa_name, ifname, IFNAMSIZ) != 0) continue; mac = (uint8_t *)LLADDR(sdl); sprintf(buf, "%02x:%02x:%02x:%02x:%02x:%02x", mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); if (verbose) D("source hwaddr %s", buf); break; } freeifaddrs(ifaphead); return ifap ? 0 : 1; } /* set the thread affinity. */ static int setaffinity(pthread_t me, int i) { cpuset_t cpumask; if (i == -1) return 0; /* Set thread affinity affinity.*/ CPU_ZERO(&cpumask); CPU_SET(i, &cpumask); if (pthread_setaffinity_np(me, sizeof(cpuset_t), &cpumask) != 0) { D("Unable to set affinity: %s", strerror(errno)); return 1; } return 0; } /* Compute the checksum of the given ip header. */ static uint32_t checksum(const void *data, uint16_t len, uint32_t sum) { const uint8_t *addr = data; uint32_t i; /* Checksum all the pairs of bytes first... */ for (i = 0; i < (len & ~1U); i += 2) { sum += (uint16_t)ntohs(*((const uint16_t *)(addr + i))); if (sum > 0xFFFF) sum -= 0xFFFF; } /* * If there's a single byte left over, checksum it, too. * Network byte order is big-endian, so the remaining byte is * the high byte. */ if (i < len) { sum += addr[i] << 8; if (sum > 0xFFFF) sum -= 0xFFFF; } return sum; } static uint16_t wrapsum(uint32_t sum) { sum = ~sum & 0xFFFF; return (htons(sum)); } /* Check the payload of the packet for errors (use it for debug). * Look for consecutive ascii representations of the size of the packet. */ static void dump_payload(const char *_p, int len, struct netmap_ring *ring, int cur) { char buf[128]; int i, j, i0; const unsigned char *p = (const unsigned char *)_p; /* get the length in ASCII of the length of the packet. */ printf("ring %p cur %5d [buf %6d flags 0x%04x len %5d]\n", ring, cur, ring->slot[cur].buf_idx, ring->slot[cur].flags, len); /* hexdump routine */ for (i = 0; i < len; ) { memset(buf, ' ', sizeof(buf)); sprintf(buf, "%5d: ", i); i0 = i; for (j=0; j < 16 && i < len; i++, j++) sprintf(buf+7+j*3, "%02x ", (uint8_t)(p[i])); i = i0; for (j=0; j < 16 && i < len; i++, j++) sprintf(buf+7+j + 48, "%c", isprint(p[i]) ? p[i] : '.'); printf("%s\n", buf); } } /* * Fill a packet with some payload. * We create a UDP packet so the payload starts at * 14+20+8 = 42 bytes. */ #ifdef __linux__ #define uh_sport source #define uh_dport dest #define uh_ulen len #define uh_sum check #endif /* linux */ static uint16_t new_ip_sum(uint16_t ip_sum, uint32_t oaddr, uint32_t naddr) { ip_sum = cksum_add(ip_sum, ~oaddr >> 16); ip_sum = cksum_add(ip_sum, ~oaddr & 0xffff); ip_sum = cksum_add(ip_sum, naddr >> 16); ip_sum = cksum_add(ip_sum, naddr & 0xffff); return ip_sum; } static uint16_t new_udp_sum(uint16_t udp_sum, uint16_t oport, uint16_t nport) { udp_sum = cksum_add(udp_sum, ~oport); udp_sum = cksum_add(udp_sum, nport); return udp_sum; } static void update_ip(struct pkt *pkt, struct targ *t) { struct glob_arg *g = t->g; struct ip ip; struct udphdr udp; uint32_t oaddr, naddr; uint16_t oport, nport; uint16_t ip_sum = 0, udp_sum = 0; memcpy(&ip, &pkt->ipv4.ip, sizeof(ip)); memcpy(&udp, &pkt->ipv4.udp, sizeof(udp)); do { ip_sum = udp_sum = 0; naddr = oaddr = ntohl(ip.ip_src.s_addr); nport = oport = ntohs(udp.uh_sport); if (g->options & OPT_RANDOM_SRC) { ip.ip_src.s_addr = nrand48(t->seed); udp.uh_sport = nrand48(t->seed); naddr = ntohl(ip.ip_src.s_addr); nport = ntohs(udp.uh_sport); ip_sum = new_ip_sum(ip_sum, oaddr, naddr); udp_sum = new_udp_sum(udp_sum, oport, nport); } else { if (oport < g->src_ip.port1) { nport = oport + 1; udp.uh_sport = htons(nport); udp_sum = new_udp_sum(udp_sum, oport, nport); break; } nport = g->src_ip.port0; udp.uh_sport = htons(nport); if (oaddr < g->src_ip.ipv4.end) { naddr = oaddr + 1; ip.ip_src.s_addr = htonl(naddr); ip_sum = new_ip_sum(ip_sum, oaddr, naddr); break; } naddr = g->src_ip.ipv4.start; ip.ip_src.s_addr = htonl(naddr); ip_sum = new_ip_sum(ip_sum, oaddr, naddr); } naddr = oaddr = ntohl(ip.ip_dst.s_addr); nport = oport = ntohs(udp.uh_dport); if (g->options & OPT_RANDOM_DST) { ip.ip_dst.s_addr = nrand48(t->seed); udp.uh_dport = nrand48(t->seed); naddr = ntohl(ip.ip_dst.s_addr); nport = ntohs(udp.uh_dport); ip_sum = new_ip_sum(ip_sum, oaddr, naddr); udp_sum = new_udp_sum(udp_sum, oport, nport); } else { if (oport < g->dst_ip.port1) { nport = oport + 1; udp.uh_dport = htons(nport); udp_sum = new_udp_sum(udp_sum, oport, nport); break; } nport = g->dst_ip.port0; udp.uh_dport = htons(nport); if (oaddr < g->dst_ip.ipv4.end) { naddr = oaddr + 1; ip.ip_dst.s_addr = htonl(naddr); ip_sum = new_ip_sum(ip_sum, oaddr, naddr); break; } naddr = g->dst_ip.ipv4.start; ip.ip_dst.s_addr = htonl(naddr); ip_sum = new_ip_sum(ip_sum, oaddr, naddr); } } while (0); /* update checksums */ if (udp_sum != 0) udp.uh_sum = ~cksum_add(~udp.uh_sum, htons(udp_sum)); if (ip_sum != 0) { ip.ip_sum = ~cksum_add(~ip.ip_sum, htons(ip_sum)); udp.uh_sum = ~cksum_add(~udp.uh_sum, htons(ip_sum)); } memcpy(&pkt->ipv4.ip, &ip, sizeof(ip)); memcpy(&pkt->ipv4.udp, &udp, sizeof(udp)); } #ifndef s6_addr16 #define s6_addr16 __u6_addr.__u6_addr16 #endif static void update_ip6(struct pkt *pkt, struct targ *t) { struct glob_arg *g = t->g; struct ip6_hdr ip6; struct udphdr udp; uint16_t udp_sum; uint16_t oaddr, naddr; uint16_t oport, nport; uint8_t group; memcpy(&ip6, &pkt->ipv6.ip, sizeof(ip6)); memcpy(&udp, &pkt->ipv6.udp, sizeof(udp)); do { udp_sum = 0; group = g->src_ip.ipv6.sgroup; naddr = oaddr = ntohs(ip6.ip6_src.s6_addr16[group]); nport = oport = ntohs(udp.uh_sport); if (g->options & OPT_RANDOM_SRC) { ip6.ip6_src.s6_addr16[group] = nrand48(t->seed); udp.uh_sport = nrand48(t->seed); naddr = ntohs(ip6.ip6_src.s6_addr16[group]); nport = ntohs(udp.uh_sport); break; } if (oport < g->src_ip.port1) { nport = oport + 1; udp.uh_sport = htons(nport); break; } nport = g->src_ip.port0; udp.uh_sport = htons(nport); if (oaddr < ntohs(g->src_ip.ipv6.end.s6_addr16[group])) { naddr = oaddr + 1; ip6.ip6_src.s6_addr16[group] = htons(naddr); break; } naddr = ntohs(g->src_ip.ipv6.start.s6_addr16[group]); ip6.ip6_src.s6_addr16[group] = htons(naddr); /* update checksums if needed */ if (oaddr != naddr) udp_sum = cksum_add(~oaddr, naddr); if (oport != nport) udp_sum = cksum_add(udp_sum, cksum_add(~oport, nport)); group = g->dst_ip.ipv6.egroup; naddr = oaddr = ntohs(ip6.ip6_dst.s6_addr16[group]); nport = oport = ntohs(udp.uh_dport); if (g->options & OPT_RANDOM_DST) { ip6.ip6_dst.s6_addr16[group] = nrand48(t->seed); udp.uh_dport = nrand48(t->seed); naddr = ntohs(ip6.ip6_dst.s6_addr16[group]); nport = ntohs(udp.uh_dport); break; } if (oport < g->dst_ip.port1) { nport = oport + 1; udp.uh_dport = htons(nport); break; } nport = g->dst_ip.port0; udp.uh_dport = htons(nport); if (oaddr < ntohs(g->dst_ip.ipv6.end.s6_addr16[group])) { naddr = oaddr + 1; ip6.ip6_dst.s6_addr16[group] = htons(naddr); break; } naddr = ntohs(g->dst_ip.ipv6.start.s6_addr16[group]); ip6.ip6_dst.s6_addr16[group] = htons(naddr); } while (0); /* update checksums */ if (oaddr != naddr) udp_sum = cksum_add(udp_sum, cksum_add(~oaddr, naddr)); if (oport != nport) udp_sum = cksum_add(udp_sum, cksum_add(~oport, nport)); if (udp_sum != 0) udp.uh_sum = ~cksum_add(~udp.uh_sum, udp_sum); memcpy(&pkt->ipv6.ip, &ip6, sizeof(ip6)); memcpy(&pkt->ipv6.udp, &udp, sizeof(udp)); } static void update_addresses(struct pkt *pkt, struct targ *t) { if (t->g->af == AF_INET) update_ip(pkt, t); else update_ip6(pkt, t); } /* * initialize one packet and prepare for the next one. * The copy could be done better instead of repeating it each time. */ static void initialize_packet(struct targ *targ) { struct pkt *pkt = &targ->pkt; struct ether_header *eh; struct ip6_hdr ip6; struct ip ip; struct udphdr udp; void *udp_ptr; uint16_t paylen; uint32_t csum = 0; const char *payload = targ->g->options & OPT_INDIRECT ? indirect_payload : default_payload; int i, l0 = strlen(payload); #ifndef NO_PCAP char errbuf[PCAP_ERRBUF_SIZE]; pcap_t *file; struct pcap_pkthdr *header; const unsigned char *packet; /* Read a packet from a PCAP file if asked. */ if (targ->g->packet_file != NULL) { if ((file = pcap_open_offline(targ->g->packet_file, errbuf)) == NULL) D("failed to open pcap file %s", targ->g->packet_file); if (pcap_next_ex(file, &header, &packet) < 0) D("failed to read packet from %s", targ->g->packet_file); if ((targ->frame = malloc(header->caplen)) == NULL) D("out of memory"); bcopy(packet, (unsigned char *)targ->frame, header->caplen); targ->g->pkt_size = header->caplen; pcap_close(file); return; } #endif paylen = targ->g->pkt_size - sizeof(*eh) - (targ->g->af == AF_INET ? sizeof(ip): sizeof(ip6)); /* create a nice NUL-terminated string */ for (i = 0; i < paylen; i += l0) { if (l0 > paylen - i) l0 = paylen - i; // last round bcopy(payload, PKT(pkt, body, targ->g->af) + i, l0); } PKT(pkt, body, targ->g->af)[i - 1] = '\0'; /* prepare the headers */ eh = &pkt->eh; bcopy(&targ->g->src_mac.start, eh->ether_shost, 6); bcopy(&targ->g->dst_mac.start, eh->ether_dhost, 6); if (targ->g->af == AF_INET) { eh->ether_type = htons(ETHERTYPE_IP); memcpy(&ip, &pkt->ipv4.ip, sizeof(ip)); udp_ptr = &pkt->ipv4.udp; ip.ip_v = IPVERSION; ip.ip_hl = sizeof(ip) >> 2; ip.ip_id = 0; ip.ip_tos = IPTOS_LOWDELAY; ip.ip_len = htons(targ->g->pkt_size - sizeof(*eh)); ip.ip_id = 0; ip.ip_off = htons(IP_DF); /* Don't fragment */ ip.ip_ttl = IPDEFTTL; ip.ip_p = IPPROTO_UDP; ip.ip_dst.s_addr = htonl(targ->g->dst_ip.ipv4.start); ip.ip_src.s_addr = htonl(targ->g->src_ip.ipv4.start); ip.ip_sum = wrapsum(checksum(&ip, sizeof(ip), 0)); memcpy(&pkt->ipv4.ip, &ip, sizeof(ip)); } else { eh->ether_type = htons(ETHERTYPE_IPV6); memcpy(&ip6, &pkt->ipv4.ip, sizeof(ip6)); udp_ptr = &pkt->ipv6.udp; ip6.ip6_flow = 0; ip6.ip6_plen = htons(paylen); ip6.ip6_vfc = IPV6_VERSION; ip6.ip6_nxt = IPPROTO_UDP; ip6.ip6_hlim = IPV6_DEFHLIM; ip6.ip6_src = targ->g->src_ip.ipv6.start; ip6.ip6_dst = targ->g->dst_ip.ipv6.start; } memcpy(&udp, udp_ptr, sizeof(udp)); udp.uh_sport = htons(targ->g->src_ip.port0); udp.uh_dport = htons(targ->g->dst_ip.port0); udp.uh_ulen = htons(paylen); if (targ->g->af == AF_INET) { /* Magic: taken from sbin/dhclient/packet.c */ udp.uh_sum = wrapsum( checksum(&udp, sizeof(udp), /* udp header */ checksum(pkt->ipv4.body, /* udp payload */ paylen - sizeof(udp), checksum(&pkt->ipv4.ip.ip_src, /* pseudo header */ 2 * sizeof(pkt->ipv4.ip.ip_src), IPPROTO_UDP + (u_int32_t)ntohs(udp.uh_ulen))))); memcpy(&pkt->ipv4.ip, &ip, sizeof(ip)); } else { /* Save part of pseudo header checksum into csum */ csum = IPPROTO_UDP << 24; csum = checksum(&csum, sizeof(csum), paylen); udp.uh_sum = wrapsum( checksum(udp_ptr, sizeof(udp), /* udp header */ checksum(pkt->ipv6.body, /* udp payload */ paylen - sizeof(udp), checksum(&pkt->ipv6.ip.ip6_src, /* pseudo header */ 2 * sizeof(pkt->ipv6.ip.ip6_src), csum)))); memcpy(&pkt->ipv6.ip, &ip6, sizeof(ip6)); } memcpy(udp_ptr, &udp, sizeof(udp)); bzero(&pkt->vh, sizeof(pkt->vh)); // dump_payload((void *)pkt, targ->g->pkt_size, NULL, 0); } static void get_vnet_hdr_len(struct glob_arg *g) { struct nmreq_header hdr; struct nmreq_port_hdr ph; int err; hdr = g->nmd->hdr; /* copy name and version */ hdr.nr_reqtype = NETMAP_REQ_PORT_HDR_GET; hdr.nr_options = 0; memset(&ph, 0, sizeof(ph)); hdr.nr_body = (uintptr_t)&ph; err = ioctl(g->main_fd, NIOCCTRL, &hdr); if (err) { D("Unable to get virtio-net header length"); return; } g->virt_header = ph.nr_hdr_len; if (g->virt_header) { D("Port requires virtio-net header, length = %d", g->virt_header); } } static void set_vnet_hdr_len(struct glob_arg *g) { int err, l = g->virt_header; struct nmreq_header hdr; struct nmreq_port_hdr ph; if (l == 0) return; hdr = g->nmd->hdr; /* copy name and version */ hdr.nr_reqtype = NETMAP_REQ_PORT_HDR_SET; hdr.nr_options = 0; memset(&ph, 0, sizeof(ph)); hdr.nr_body = (uintptr_t)&ph; err = ioctl(g->main_fd, NIOCCTRL, &hdr); if (err) { D("Unable to set virtio-net header length %d", l); } } /* * create and enqueue a batch of packets on a ring. * On the last one set NS_REPORT to tell the driver to generate * an interrupt when done. */ static int send_packets(struct netmap_ring *ring, struct pkt *pkt, void *frame, int size, struct targ *t, u_int count, int options) { u_int n, sent, head = ring->head; u_int frags = t->frags; u_int frag_size = t->frag_size; struct netmap_slot *slot = &ring->slot[head]; n = nm_ring_space(ring); #if 0 if (options & (OPT_COPY | OPT_PREFETCH) ) { for (sent = 0; sent < count; sent++) { struct netmap_slot *slot = &ring->slot[head]; char *p = NETMAP_BUF(ring, slot->buf_idx); __builtin_prefetch(p); head = nm_ring_next(ring, head); } head = ring->head; } #endif for (sent = 0; sent < count && n >= frags; sent++, n--) { char *p; int buf_changed; u_int tosend = size; slot = &ring->slot[head]; p = NETMAP_BUF(ring, slot->buf_idx); buf_changed = slot->flags & NS_BUF_CHANGED; slot->flags = 0; if (options & OPT_RUBBISH) { /* do nothing */ } else if (options & OPT_INDIRECT) { slot->flags |= NS_INDIRECT; slot->ptr = (uint64_t)((uintptr_t)frame); } else if (frags > 1) { u_int i; const char *f = frame; char *fp = p; for (i = 0; i < frags - 1; i++) { memcpy(fp, f, frag_size); slot->len = frag_size; slot->flags = NS_MOREFRAG; if (options & OPT_DUMP) dump_payload(fp, frag_size, ring, head); tosend -= frag_size; f += frag_size; head = nm_ring_next(ring, head); slot = &ring->slot[head]; fp = NETMAP_BUF(ring, slot->buf_idx); } n -= (frags - 1); p = fp; slot->flags = 0; memcpy(p, f, tosend); update_addresses(pkt, t); } else if ((options & (OPT_COPY | OPT_MEMCPY)) || buf_changed) { if (options & OPT_COPY) nm_pkt_copy(frame, p, size); else memcpy(p, frame, size); update_addresses(pkt, t); } else if (options & OPT_PREFETCH) { __builtin_prefetch(p); } slot->len = tosend; if (options & OPT_DUMP) dump_payload(p, tosend, ring, head); head = nm_ring_next(ring, head); } if (sent) { slot->flags |= NS_REPORT; ring->head = ring->cur = head; } if (sent < count) { /* tell netmap that we need more slots */ ring->cur = ring->tail; } return (sent); } /* * Index of the highest bit set */ static uint32_t msb64(uint64_t x) { uint64_t m = 1ULL << 63; int i; for (i = 63; i >= 0; i--, m >>=1) if (m & x) return i; return 0; } /* * wait until ts, either busy or sleeping if more than 1ms. * Return wakeup time. */ static struct timespec wait_time(struct timespec ts) { for (;;) { struct timespec w, cur; clock_gettime(CLOCK_REALTIME_PRECISE, &cur); w = timespec_sub(ts, cur); if (w.tv_sec < 0) return cur; else if (w.tv_sec > 0 || w.tv_nsec > 1000000) poll(NULL, 0, 1); } } /* * Send a packet, and wait for a response. * The payload (after UDP header, ofs 42) has a 4-byte sequence * followed by a struct timeval (or bintime?) */ static void * ping_body(void *data) { struct targ *targ = (struct targ *) data; struct pollfd pfd = { .fd = targ->fd, .events = POLLIN }; struct netmap_if *nifp = targ->nmd->nifp; int i, m, rx = 0; void *frame; int size; struct timespec ts, now, last_print; struct timespec nexttime = {0, 0}; /* silence compiler */ uint64_t sent = 0, n = targ->g->npackets; uint64_t count = 0, t_cur, t_min = ~0, av = 0; uint64_t g_min = ~0, g_av = 0; uint64_t buckets[64]; /* bins for delays, ns */ int rate_limit = targ->g->tx_rate, tosend = 0; frame = (char*)&targ->pkt + sizeof(targ->pkt.vh) - targ->g->virt_header; size = targ->g->pkt_size + targ->g->virt_header; if (targ->g->nthreads > 1) { D("can only ping with 1 thread"); return NULL; } if (targ->g->af == AF_INET6) { D("Warning: ping-pong with IPv6 not supported"); } bzero(&buckets, sizeof(buckets)); clock_gettime(CLOCK_REALTIME_PRECISE, &last_print); now = last_print; if (rate_limit) { targ->tic = timespec_add(now, (struct timespec){2,0}); targ->tic.tv_nsec = 0; wait_time(targ->tic); nexttime = targ->tic; } while (!targ->cancel && (n == 0 || sent < n)) { struct netmap_ring *ring = NETMAP_TXRING(nifp, targ->nmd->first_tx_ring); struct netmap_slot *slot; char *p; int rv; uint64_t limit, event = 0; if (rate_limit && tosend <= 0) { tosend = targ->g->burst; nexttime = timespec_add(nexttime, targ->g->tx_period); wait_time(nexttime); } limit = rate_limit ? tosend : targ->g->burst; if (n > 0 && n - sent < limit) limit = n - sent; for (m = 0; (unsigned)m < limit; m++) { slot = &ring->slot[ring->head]; slot->len = size; p = NETMAP_BUF(ring, slot->buf_idx); if (nm_ring_empty(ring)) { D("-- ouch, cannot send"); break; } else { struct tstamp *tp; nm_pkt_copy(frame, p, size); clock_gettime(CLOCK_REALTIME_PRECISE, &ts); bcopy(&sent, p+42, sizeof(sent)); tp = (struct tstamp *)(p+46); tp->sec = (uint32_t)ts.tv_sec; tp->nsec = (uint32_t)ts.tv_nsec; sent++; ring->head = ring->cur = nm_ring_next(ring, ring->head); } } if (m > 0) event++; targ->ctr.pkts = sent; targ->ctr.bytes = sent*size; targ->ctr.events = event; if (rate_limit) tosend -= m; #ifdef BUSYWAIT rv = ioctl(pfd.fd, NIOCTXSYNC, NULL); if (rv < 0) { D("TXSYNC error on queue %d: %s", targ->me, strerror(errno)); } again: ioctl(pfd.fd, NIOCRXSYNC, NULL); #else /* should use a parameter to decide how often to send */ if ( (rv = poll(&pfd, 1, 3000)) <= 0) { D("poll error on queue %d: %s", targ->me, (rv ? strerror(errno) : "timeout")); continue; } #endif /* BUSYWAIT */ /* see what we got back */ rx = 0; for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) { ring = NETMAP_RXRING(nifp, i); while (!nm_ring_empty(ring)) { uint32_t seq; struct tstamp *tp; int pos; slot = &ring->slot[ring->head]; p = NETMAP_BUF(ring, slot->buf_idx); clock_gettime(CLOCK_REALTIME_PRECISE, &now); bcopy(p+42, &seq, sizeof(seq)); tp = (struct tstamp *)(p+46); ts.tv_sec = (time_t)tp->sec; ts.tv_nsec = (long)tp->nsec; ts.tv_sec = now.tv_sec - ts.tv_sec; ts.tv_nsec = now.tv_nsec - ts.tv_nsec; if (ts.tv_nsec < 0) { ts.tv_nsec += 1000000000; ts.tv_sec--; } if (0) D("seq %d/%llu delta %d.%09d", seq, (unsigned long long)sent, (int)ts.tv_sec, (int)ts.tv_nsec); t_cur = ts.tv_sec * 1000000000UL + ts.tv_nsec; if (t_cur < t_min) t_min = t_cur; count ++; av += t_cur; pos = msb64(t_cur); buckets[pos]++; /* now store it in a bucket */ ring->head = ring->cur = nm_ring_next(ring, ring->head); rx++; } } //D("tx %d rx %d", sent, rx); //usleep(100000); ts.tv_sec = now.tv_sec - last_print.tv_sec; ts.tv_nsec = now.tv_nsec - last_print.tv_nsec; if (ts.tv_nsec < 0) { ts.tv_nsec += 1000000000; ts.tv_sec--; } if (ts.tv_sec >= 1) { D("count %d RTT: min %d av %d ns", (int)count, (int)t_min, (int)(av/count)); int k, j, kmin, off; char buf[512]; for (kmin = 0; kmin < 64; kmin ++) if (buckets[kmin]) break; for (k = 63; k >= kmin; k--) if (buckets[k]) break; buf[0] = '\0'; off = 0; for (j = kmin; j <= k; j++) { off += sprintf(buf + off, " %5d", (int)buckets[j]); } D("k: %d .. %d\n\t%s", 1<cancel) goto again; #endif /* BUSYWAIT */ } if (sent > 0) { D("RTT over %llu packets: min %d av %d ns", (long long unsigned)sent, (int)g_min, (int)((double)g_av/sent)); } targ->completed = 1; /* reset the ``used`` flag. */ targ->used = 0; return NULL; } /* * reply to ping requests */ static void * pong_body(void *data) { struct targ *targ = (struct targ *) data; struct pollfd pfd = { .fd = targ->fd, .events = POLLIN }; struct netmap_if *nifp = targ->nmd->nifp; struct netmap_ring *txring, *rxring; int i, rx = 0; uint64_t sent = 0, n = targ->g->npackets; if (targ->g->nthreads > 1) { D("can only reply ping with 1 thread"); return NULL; } if (n > 0) D("understood ponger %llu but don't know how to do it", (unsigned long long)n); if (targ->g->af == AF_INET6) { D("Warning: ping-pong with IPv6 not supported"); } while (!targ->cancel && (n == 0 || sent < n)) { uint32_t txhead, txavail; //#define BUSYWAIT #ifdef BUSYWAIT ioctl(pfd.fd, NIOCRXSYNC, NULL); #else int rv; if ( (rv = poll(&pfd, 1, 1000)) <= 0) { D("poll error on queue %d: %s", targ->me, rv ? strerror(errno) : "timeout"); continue; } #endif txring = NETMAP_TXRING(nifp, targ->nmd->first_tx_ring); txhead = txring->head; txavail = nm_ring_space(txring); /* see what we got back */ for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) { rxring = NETMAP_RXRING(nifp, i); while (!nm_ring_empty(rxring)) { uint16_t *spkt, *dpkt; uint32_t head = rxring->head; struct netmap_slot *slot = &rxring->slot[head]; char *src, *dst; src = NETMAP_BUF(rxring, slot->buf_idx); //D("got pkt %p of size %d", src, slot->len); rxring->head = rxring->cur = nm_ring_next(rxring, head); rx++; if (txavail == 0) continue; dst = NETMAP_BUF(txring, txring->slot[txhead].buf_idx); /* copy... */ dpkt = (uint16_t *)dst; spkt = (uint16_t *)src; nm_pkt_copy(src, dst, slot->len); /* swap source and destination MAC */ dpkt[0] = spkt[3]; dpkt[1] = spkt[4]; dpkt[2] = spkt[5]; dpkt[3] = spkt[0]; dpkt[4] = spkt[1]; dpkt[5] = spkt[2]; /* swap source and destination IPv4 */ if (spkt[6] == htons(ETHERTYPE_IP)) { dpkt[13] = spkt[15]; dpkt[14] = spkt[16]; dpkt[15] = spkt[13]; dpkt[16] = spkt[14]; } txring->slot[txhead].len = slot->len; //dump_payload(dst, slot->len, txring, txhead); txhead = nm_ring_next(txring, txhead); txavail--; sent++; } } txring->head = txring->cur = txhead; targ->ctr.pkts = sent; #ifdef BUSYWAIT ioctl(pfd.fd, NIOCTXSYNC, NULL); #endif //D("tx %d rx %d", sent, rx); } targ->completed = 1; /* reset the ``used`` flag. */ targ->used = 0; return NULL; } static void * sender_body(void *data) { struct targ *targ = (struct targ *) data; struct pollfd pfd = { .fd = targ->fd, .events = POLLOUT }; struct netmap_if *nifp; struct netmap_ring *txring = NULL; int i; uint64_t n = targ->g->npackets / targ->g->nthreads; uint64_t sent = 0; uint64_t event = 0; int options = targ->g->options | OPT_COPY; struct timespec nexttime = { 0, 0}; // XXX silence compiler int rate_limit = targ->g->tx_rate; struct pkt *pkt = &targ->pkt; void *frame; int size; if (targ->frame == NULL) { frame = (char *)pkt + sizeof(pkt->vh) - targ->g->virt_header; size = targ->g->pkt_size + targ->g->virt_header; } else { frame = targ->frame; size = targ->g->pkt_size; } D("start, fd %d main_fd %d", targ->fd, targ->g->main_fd); if (setaffinity(targ->thread, targ->affinity)) goto quit; /* main loop.*/ clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic); if (rate_limit) { targ->tic = timespec_add(targ->tic, (struct timespec){2,0}); targ->tic.tv_nsec = 0; wait_time(targ->tic); nexttime = targ->tic; } if (targ->g->dev_type == DEV_TAP) { D("writing to file desc %d", targ->g->main_fd); for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) { if (write(targ->g->main_fd, frame, size) != -1) sent++; update_addresses(pkt, targ); if (i > 10000) { targ->ctr.pkts = sent; targ->ctr.bytes = sent*size; targ->ctr.events = sent; i = 0; } } #ifndef NO_PCAP } else if (targ->g->dev_type == DEV_PCAP) { pcap_t *p = targ->g->p; for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) { if (pcap_inject(p, frame, size) != -1) sent++; update_addresses(pkt, targ); if (i > 10000) { targ->ctr.pkts = sent; targ->ctr.bytes = sent*size; targ->ctr.events = sent; i = 0; } } #endif /* NO_PCAP */ } else { int tosend = 0; u_int bufsz, frag_size = targ->g->frag_size; nifp = targ->nmd->nifp; txring = NETMAP_TXRING(nifp, targ->nmd->first_tx_ring); bufsz = txring->nr_buf_size; if (bufsz < frag_size) frag_size = bufsz; targ->frag_size = targ->g->pkt_size / targ->frags; if (targ->frag_size > frag_size) { targ->frags = targ->g->pkt_size / frag_size; targ->frag_size = frag_size; if (targ->g->pkt_size % frag_size != 0) targ->frags++; } D("frags %u frag_size %u", targ->frags, targ->frag_size); while (!targ->cancel && (n == 0 || sent < n)) { int rv; if (rate_limit && tosend <= 0) { tosend = targ->g->burst; nexttime = timespec_add(nexttime, targ->g->tx_period); wait_time(nexttime); } /* * wait for available room in the send queue(s) */ #ifdef BUSYWAIT (void)rv; if (ioctl(pfd.fd, NIOCTXSYNC, NULL) < 0) { D("ioctl error on queue %d: %s", targ->me, strerror(errno)); goto quit; } #else /* !BUSYWAIT */ if ( (rv = poll(&pfd, 1, 2000)) <= 0) { if (targ->cancel) break; D("poll error on queue %d: %s", targ->me, rv ? strerror(errno) : "timeout"); // goto quit; } if (pfd.revents & POLLERR) { D("poll error on %d ring %d-%d", pfd.fd, targ->nmd->first_tx_ring, targ->nmd->last_tx_ring); goto quit; } #endif /* !BUSYWAIT */ /* * scan our queues and send on those with room */ if (options & OPT_COPY && sent > 100000 && !(targ->g->options & OPT_COPY) ) { D("drop copy"); options &= ~OPT_COPY; } for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) { int m; uint64_t limit = rate_limit ? tosend : targ->g->burst; if (n > 0 && n == sent) break; if (n > 0 && n - sent < limit) limit = n - sent; txring = NETMAP_TXRING(nifp, i); if (nm_ring_empty(txring)) continue; if (targ->g->pkt_min_size > 0) { size = nrand48(targ->seed) % (targ->g->pkt_size - targ->g->pkt_min_size) + targ->g->pkt_min_size; } m = send_packets(txring, pkt, frame, size, targ, limit, options); ND("limit %lu tail %d m %d", limit, txring->tail, m); sent += m; if (m > 0) //XXX-ste: can m be 0? event++; targ->ctr.pkts = sent; targ->ctr.bytes += m*size; targ->ctr.events = event; if (rate_limit) { tosend -= m; if (tosend <= 0) break; } } } /* flush any remaining packets */ if (txring != NULL) { D("flush tail %d head %d on thread %p", txring->tail, txring->head, (void *)pthread_self()); ioctl(pfd.fd, NIOCTXSYNC, NULL); } /* final part: wait all the TX queues to be empty. */ for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) { txring = NETMAP_TXRING(nifp, i); while (!targ->cancel && nm_tx_pending(txring)) { RD(5, "pending tx tail %d head %d on ring %d", txring->tail, txring->head, i); ioctl(pfd.fd, NIOCTXSYNC, NULL); usleep(1); /* wait 1 tick */ } } } /* end DEV_NETMAP */ clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc); targ->completed = 1; targ->ctr.pkts = sent; targ->ctr.bytes = sent*size; targ->ctr.events = event; quit: /* reset the ``used`` flag. */ targ->used = 0; return (NULL); } #ifndef NO_PCAP static void receive_pcap(u_char *user, const struct pcap_pkthdr * h, const u_char * bytes) { struct my_ctrs *ctr = (struct my_ctrs *)user; (void)bytes; /* UNUSED */ ctr->bytes += h->len; ctr->pkts++; } #endif /* !NO_PCAP */ static int receive_packets(struct netmap_ring *ring, u_int limit, int dump, uint64_t *bytes) { u_int head, rx, n; uint64_t b = 0; u_int complete = 0; if (bytes == NULL) bytes = &b; head = ring->head; n = nm_ring_space(ring); if (n < limit) limit = n; for (rx = 0; rx < limit; rx++) { struct netmap_slot *slot = &ring->slot[head]; char *p = NETMAP_BUF(ring, slot->buf_idx); *bytes += slot->len; if (dump) dump_payload(p, slot->len, ring, head); if (!(slot->flags & NS_MOREFRAG)) complete++; head = nm_ring_next(ring, head); } ring->head = ring->cur = head; return (complete); } static void * receiver_body(void *data) { struct targ *targ = (struct targ *) data; struct pollfd pfd = { .fd = targ->fd, .events = POLLIN }; struct netmap_if *nifp; struct netmap_ring *rxring; int i; struct my_ctrs cur; uint64_t n = targ->g->npackets / targ->g->nthreads; memset(&cur, 0, sizeof(cur)); if (setaffinity(targ->thread, targ->affinity)) goto quit; D("reading from %s fd %d main_fd %d", targ->g->ifname, targ->fd, targ->g->main_fd); /* unbounded wait for the first packet. */ for (;!targ->cancel;) { i = poll(&pfd, 1, 1000); if (i > 0 && !(pfd.revents & POLLERR)) break; if (i < 0) { D("poll() error: %s", strerror(errno)); goto quit; } if (pfd.revents & POLLERR) { D("fd error"); goto quit; } RD(1, "waiting for initial packets, poll returns %d %d", i, pfd.revents); } /* main loop, exit after 1s silence */ clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic); if (targ->g->dev_type == DEV_TAP) { while (!targ->cancel && (n == 0 || targ->ctr.pkts < n)) { char buf[MAX_BODYSIZE]; /* XXX should we poll ? */ i = read(targ->g->main_fd, buf, sizeof(buf)); if (i > 0) { targ->ctr.pkts++; targ->ctr.bytes += i; targ->ctr.events++; } } #ifndef NO_PCAP } else if (targ->g->dev_type == DEV_PCAP) { while (!targ->cancel && (n == 0 || targ->ctr.pkts < n)) { /* XXX should we poll ? */ pcap_dispatch(targ->g->p, targ->g->burst, receive_pcap, (u_char *)&targ->ctr); targ->ctr.events++; } #endif /* !NO_PCAP */ } else { int dump = targ->g->options & OPT_DUMP; nifp = targ->nmd->nifp; while (!targ->cancel && (n == 0 || targ->ctr.pkts < n)) { /* Once we started to receive packets, wait at most 1 seconds before quitting. */ #ifdef BUSYWAIT if (ioctl(pfd.fd, NIOCRXSYNC, NULL) < 0) { D("ioctl error on queue %d: %s", targ->me, strerror(errno)); goto quit; } #else /* !BUSYWAIT */ if (poll(&pfd, 1, 1 * 1000) <= 0 && !targ->g->forever) { clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc); targ->toc.tv_sec -= 1; /* Subtract timeout time. */ goto out; } if (pfd.revents & POLLERR) { D("poll err"); goto quit; } #endif /* !BUSYWAIT */ uint64_t cur_space = 0; for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) { int m; rxring = NETMAP_RXRING(nifp, i); /* compute free space in the ring */ m = rxring->head + rxring->num_slots - rxring->tail; if (m >= (int) rxring->num_slots) m -= rxring->num_slots; cur_space += m; if (nm_ring_empty(rxring)) continue; m = receive_packets(rxring, targ->g->burst, dump, &cur.bytes); cur.pkts += m; if (m > 0) cur.events++; } cur.min_space = targ->ctr.min_space; if (cur_space < cur.min_space) cur.min_space = cur_space; targ->ctr = cur; } } clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc); #if !defined(BUSYWAIT) out: #endif targ->completed = 1; targ->ctr = cur; quit: /* reset the ``used`` flag. */ targ->used = 0; return (NULL); } static void * txseq_body(void *data) { struct targ *targ = (struct targ *) data; struct pollfd pfd = { .fd = targ->fd, .events = POLLOUT }; struct netmap_ring *ring; int64_t sent = 0; uint64_t event = 0; int options = targ->g->options | OPT_COPY; struct timespec nexttime = {0, 0}; int rate_limit = targ->g->tx_rate; struct pkt *pkt = &targ->pkt; int frags = targ->g->frags; uint32_t sequence = 0; int budget = 0; void *frame; int size; if (targ->g->nthreads > 1) { D("can only txseq ping with 1 thread"); return NULL; } if (targ->g->npackets > 0) { D("Ignoring -n argument"); } frame = (char *)pkt + sizeof(pkt->vh) - targ->g->virt_header; size = targ->g->pkt_size + targ->g->virt_header; D("start, fd %d main_fd %d", targ->fd, targ->g->main_fd); if (setaffinity(targ->thread, targ->affinity)) goto quit; clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic); if (rate_limit) { targ->tic = timespec_add(targ->tic, (struct timespec){2,0}); targ->tic.tv_nsec = 0; wait_time(targ->tic); nexttime = targ->tic; } /* Only use the first queue. */ ring = NETMAP_TXRING(targ->nmd->nifp, targ->nmd->first_tx_ring); while (!targ->cancel) { int64_t limit; unsigned int space; unsigned int head; int fcnt; uint16_t sum = 0; int rv; if (!rate_limit) { budget = targ->g->burst; } else if (budget <= 0) { budget = targ->g->burst; nexttime = timespec_add(nexttime, targ->g->tx_period); wait_time(nexttime); } /* wait for available room in the send queue */ #ifdef BUSYWAIT (void)rv; if (ioctl(pfd.fd, NIOCTXSYNC, NULL) < 0) { D("ioctl error on queue %d: %s", targ->me, strerror(errno)); goto quit; } #else /* !BUSYWAIT */ if ( (rv = poll(&pfd, 1, 2000)) <= 0) { if (targ->cancel) break; D("poll error on queue %d: %s", targ->me, rv ? strerror(errno) : "timeout"); // goto quit; } if (pfd.revents & POLLERR) { D("poll error on %d ring %d-%d", pfd.fd, targ->nmd->first_tx_ring, targ->nmd->last_tx_ring); goto quit; } #endif /* !BUSYWAIT */ /* If no room poll() again. */ space = nm_ring_space(ring); if (!space) { continue; } limit = budget; if (space < limit) { limit = space; } /* Cut off ``limit`` to make sure is multiple of ``frags``. */ if (frags > 1) { limit = (limit / frags) * frags; } limit = sent + limit; /* Convert to absolute. */ for (fcnt = frags, head = ring->head; sent < limit; sent++, sequence++) { struct netmap_slot *slot = &ring->slot[head]; char *p = NETMAP_BUF(ring, slot->buf_idx); uint16_t *w = (uint16_t *)PKT(pkt, body, targ->g->af), t; memcpy(&sum, targ->g->af == AF_INET ? &pkt->ipv4.udp.uh_sum : &pkt->ipv6.udp.uh_sum, sizeof(sum)); slot->flags = 0; t = *w; PKT(pkt, body, targ->g->af)[0] = sequence >> 24; PKT(pkt, body, targ->g->af)[1] = (sequence >> 16) & 0xff; sum = ~cksum_add(~sum, cksum_add(~t, *w)); t = *++w; PKT(pkt, body, targ->g->af)[2] = (sequence >> 8) & 0xff; PKT(pkt, body, targ->g->af)[3] = sequence & 0xff; sum = ~cksum_add(~sum, cksum_add(~t, *w)); memcpy(targ->g->af == AF_INET ? &pkt->ipv4.udp.uh_sum : &pkt->ipv6.udp.uh_sum, &sum, sizeof(sum)); nm_pkt_copy(frame, p, size); if (fcnt == frags) { update_addresses(pkt, targ); } if (options & OPT_DUMP) { dump_payload(p, size, ring, head); } slot->len = size; if (--fcnt > 0) { slot->flags |= NS_MOREFRAG; } else { fcnt = frags; } if (sent == limit - 1) { /* Make sure we don't push an incomplete * packet. */ assert(!(slot->flags & NS_MOREFRAG)); slot->flags |= NS_REPORT; } head = nm_ring_next(ring, head); if (rate_limit) { budget--; } } ring->cur = ring->head = head; event ++; targ->ctr.pkts = sent; targ->ctr.bytes = sent * size; targ->ctr.events = event; } /* flush any remaining packets */ D("flush tail %d head %d on thread %p", ring->tail, ring->head, (void *)pthread_self()); ioctl(pfd.fd, NIOCTXSYNC, NULL); /* final part: wait the TX queues to become empty. */ while (!targ->cancel && nm_tx_pending(ring)) { RD(5, "pending tx tail %d head %d on ring %d", ring->tail, ring->head, targ->nmd->first_tx_ring); ioctl(pfd.fd, NIOCTXSYNC, NULL); usleep(1); /* wait 1 tick */ } clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc); targ->completed = 1; targ->ctr.pkts = sent; targ->ctr.bytes = sent * size; targ->ctr.events = event; quit: /* reset the ``used`` flag. */ targ->used = 0; return (NULL); } static char * multi_slot_to_string(struct netmap_ring *ring, unsigned int head, unsigned int nfrags, char *strbuf, size_t strbuflen) { unsigned int f; char *ret = strbuf; for (f = 0; f < nfrags; f++) { struct netmap_slot *slot = &ring->slot[head]; int m = snprintf(strbuf, strbuflen, "|%u,%x|", slot->len, slot->flags); if (m >= (int)strbuflen) { break; } strbuf += m; strbuflen -= m; head = nm_ring_next(ring, head); } return ret; } static void * rxseq_body(void *data) { struct targ *targ = (struct targ *) data; struct pollfd pfd = { .fd = targ->fd, .events = POLLIN }; int dump = targ->g->options & OPT_DUMP; struct netmap_ring *ring; unsigned int frags_exp = 1; struct my_ctrs cur; unsigned int frags = 0; int first_packet = 1; int first_slot = 1; int i, j, af, nrings; uint32_t seq, *seq_exp = NULL; memset(&cur, 0, sizeof(cur)); if (setaffinity(targ->thread, targ->affinity)) goto quit; nrings = targ->nmd->last_rx_ring - targ->nmd->first_rx_ring + 1; seq_exp = calloc(nrings, sizeof(uint32_t)); if (seq_exp == NULL) { D("failed to allocate seq array"); goto quit; } D("reading from %s fd %d main_fd %d", targ->g->ifname, targ->fd, targ->g->main_fd); /* unbounded wait for the first packet. */ for (;!targ->cancel;) { i = poll(&pfd, 1, 1000); if (i > 0 && !(pfd.revents & POLLERR)) break; RD(1, "waiting for initial packets, poll returns %d %d", i, pfd.revents); } clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic); while (!targ->cancel) { unsigned int head; int limit; #ifdef BUSYWAIT if (ioctl(pfd.fd, NIOCRXSYNC, NULL) < 0) { D("ioctl error on queue %d: %s", targ->me, strerror(errno)); goto quit; } #else /* !BUSYWAIT */ if (poll(&pfd, 1, 1 * 1000) <= 0 && !targ->g->forever) { clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc); targ->toc.tv_sec -= 1; /* Subtract timeout time. */ goto out; } if (pfd.revents & POLLERR) { D("poll err"); goto quit; } #endif /* !BUSYWAIT */ for (j = targ->nmd->first_rx_ring; j <= targ->nmd->last_rx_ring; j++) { ring = NETMAP_RXRING(targ->nmd->nifp, j); if (nm_ring_empty(ring)) continue; limit = nm_ring_space(ring); if (limit > targ->g->burst) limit = targ->g->burst; #if 0 /* Enable this if * 1) we remove the early-return optimization from * the netmap poll implementation, or * 2) pipes get NS_MOREFRAG support. * With the current netmap implementation, an experiment like * pkt-gen -i vale:1{1 -f txseq -F 9 * pkt-gen -i vale:1}1 -f rxseq * would get stuck as soon as we find nm_ring_space(ring) < 9, * since here limit is rounded to 0 and * pipe rxsync is not called anymore by the poll() of this loop. */ if (frags_exp > 1) { int o = limit; /* Cut off to the closest smaller multiple. */ limit = (limit / frags_exp) * frags_exp; RD(2, "LIMIT %d --> %d", o, limit); } #endif for (head = ring->head, i = 0; i < limit; i++) { struct netmap_slot *slot = &ring->slot[head]; char *p = NETMAP_BUF(ring, slot->buf_idx); int len = slot->len; struct pkt *pkt; if (dump) { dump_payload(p, slot->len, ring, head); } frags++; if (!(slot->flags & NS_MOREFRAG)) { if (first_packet) { first_packet = 0; } else if (frags != frags_exp) { char prbuf[512]; RD(1, "Received packets with %u frags, " "expected %u, '%s'", frags, frags_exp, multi_slot_to_string(ring, head-frags+1, frags, prbuf, sizeof(prbuf))); } first_packet = 0; frags_exp = frags; frags = 0; } p -= sizeof(pkt->vh) - targ->g->virt_header; len += sizeof(pkt->vh) - targ->g->virt_header; pkt = (struct pkt *)p; if (ntohs(pkt->eh.ether_type) == ETHERTYPE_IP) af = AF_INET; else af = AF_INET6; if ((char *)pkt + len < ((char *)PKT(pkt, body, af)) + sizeof(seq)) { RD(1, "%s: packet too small (len=%u)", __func__, slot->len); } else { seq = (PKT(pkt, body, af)[0] << 24) | (PKT(pkt, body, af)[1] << 16) | (PKT(pkt, body, af)[2] << 8) | PKT(pkt, body, af)[3]; if (first_slot) { /* Grab the first one, whatever it is. */ seq_exp[j] = seq; first_slot = 0; } else if (seq != seq_exp[j]) { uint32_t delta = seq - seq_exp[j]; if (delta < (0xFFFFFFFF >> 1)) { RD(2, "Sequence GAP: exp %u found %u", seq_exp[j], seq); } else { RD(2, "Sequence OUT OF ORDER: " "exp %u found %u", seq_exp[j], seq); } seq_exp[j] = seq; } seq_exp[j]++; } cur.bytes += slot->len; head = nm_ring_next(ring, head); cur.pkts++; } ring->cur = ring->head = head; cur.events++; targ->ctr = cur; } } clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc); #ifndef BUSYWAIT out: #endif /* !BUSYWAIT */ targ->completed = 1; targ->ctr = cur; quit: if (seq_exp != NULL) free(seq_exp); /* reset the ``used`` flag. */ targ->used = 0; return (NULL); } static void tx_output(struct glob_arg *g, struct my_ctrs *cur, double delta, const char *msg) { double bw, raw_bw, pps, abs; char b1[40], b2[80], b3[80]; int size; if (cur->pkts == 0) { printf("%s nothing.\n", msg); return; } size = (int)(cur->bytes / cur->pkts); printf("%s %llu packets %llu bytes %llu events %d bytes each in %.2f seconds.\n", msg, (unsigned long long)cur->pkts, (unsigned long long)cur->bytes, (unsigned long long)cur->events, size, delta); if (delta == 0) delta = 1e-6; if (size < 60) /* correct for min packet size */ size = 60; pps = cur->pkts / delta; bw = (8.0 * cur->bytes) / delta; raw_bw = (8.0 * cur->bytes + cur->pkts * g->framing) / delta; abs = cur->pkts / (double)(cur->events); printf("Speed: %spps Bandwidth: %sbps (raw %sbps). Average batch: %.2f pkts\n", norm(b1, pps, normalize), norm(b2, bw, normalize), norm(b3, raw_bw, normalize), abs); } static void usage(int errcode) { /* This usage is generated from the pkt-gen man page: * $ man pkt-gen > x * and pasted here adding the string terminators and endlines with simple * regular expressions. */ const char *cmd = "pkt-gen"; fprintf(stderr, "Usage:\n" "%s arguments\n" " -h Show program usage and exit.\n" "\n" " -i interface\n" " Name of the network interface that pkt-gen operates on. It can be a system network interface\n" " (e.g., em0), the name of a vale(4) port (e.g., valeSSS:PPP), the name of a netmap pipe or\n" " monitor, or any valid netmap port name accepted by the nm_open library function, as docu-\n" " mented in netmap(4) (NIOCREGIF section).\n" "\n" " -f function\n" " The function to be executed by pkt-gen. Specify tx for transmission, rx for reception, ping\n" " for client-side ping-pong operation, and pong for server-side ping-pong operation.\n" "\n" " -n count\n" " Number of iterations of the pkt-gen function (with 0 meaning infinite). In case of tx or rx,\n" " count is the number of packets to receive or transmit. In case of ping or pong, count is the\n" " number of ping-pong transactions.\n" "\n" " -l pkt_size\n" " Packet size in bytes excluding CRC. If passed a second time, use random sizes larger or\n" " equal than the second one and lower than the first one.\n" "\n" " -b burst_size\n" " Transmit or receive up to burst_size packets at a time.\n" "\n" " -4 Use IPv4 addresses.\n" "\n" " -6 Use IPv6 addresses.\n" "\n" " -d dst_ip[:port[-dst_ip:port]]\n" " Destination IPv4/IPv6 address and port, single or range.\n" "\n" " -s src_ip[:port[-src_ip:port]]\n" " Source IPv4/IPv6 address and port, single or range.\n" "\n" " -D dst_mac\n" " Destination MAC address in colon notation (e.g., aa:bb:cc:dd:ee:00).\n" "\n" " -S src_mac\n" " Source MAC address in colon notation.\n" "\n" " -a cpu_id\n" " Pin the first thread of pkt-gen to a particular CPU using pthread_setaffinity_np(3). If more\n" " threads are used, they are pinned to the subsequent CPUs, one per thread.\n" "\n" " -c cpus\n" " Maximum number of CPUs to use (0 means to use all the available ones).\n" "\n" " -p threads\n" " Number of threads to use. By default, only a single thread is used to handle all the netmap\n" " rings. If threads is larger than one, each thread handles a single TX ring (in tx mode), a\n" " single RX ring (in rx mode), or a TX/RX ring pair. The number of threads must be less than or\n" " equal to the number of TX (or RX) rings available in the device specified by interface.\n" "\n" " -T report_ms\n" " Number of milliseconds between reports.\n" "\n" " -w wait_for_link_time\n" " Number of seconds to wait before starting the pkt-gen function, useful to make sure that the\n" " network link is up. A network device driver may take some time to enter netmap mode, or to\n" " create a new transmit/receive ring pair when netmap(4) requests one.\n" "\n" " -R rate\n" " Packet transmission rate. Not setting the packet transmission rate tells pkt-gen to transmit\n" " packets as quickly as possible. On servers from 2010 onward netmap(4) is able to com-\n" " pletely use all of the bandwidth of a 10 or 40Gbps link, so this option should be used unless\n" " your intention is to saturate the link.\n" "\n" " -X Dump payload of each packet transmitted or received.\n" "\n" " -H len Add empty virtio-net-header with size 'len'. Valid sizes are 0, 10 and 12. This option is\n" " only used with Virtual Machine technologies that use virtio as a network interface.\n" "\n" " -P file\n" " Load the packet to be transmitted from a pcap file rather than constructing it within\n" " pkt-gen.\n" "\n" " -z Use random IPv4/IPv6 src address/port.\n" "\n" " -Z Use random IPv4/IPv6 dst address/port.\n" "\n" " -N Do not normalize units (i.e., use bps, pps instead of Mbps, Kpps, etc.).\n" "\n" " -F num_frags\n" " Send multi-slot packets, each one with num_frags fragments. A multi-slot packet is repre-\n" " sented by two or more consecutive netmap slots with the NS_MOREFRAG flag set (except for the\n" " last slot). This is useful to transmit or receive packets larger than the netmap buffer\n" " size.\n" "\n" " -M frag_size\n" " In multi-slot mode, frag_size specifies the size of each fragment, if smaller than the packet\n" " length divided by num_frags.\n" "\n" " -I Use indirect buffers. It is only valid for transmitting on VALE ports, and it is implemented\n" " by setting the NS_INDIRECT flag in the netmap slots.\n" "\n" " -W Exit immediately if all the RX rings are empty the first time they are examined.\n" "\n" " -v Increase the verbosity level.\n" "\n" " -r In tx mode, do not initialize packets, but send whatever the content of the uninitialized\n" " netmap buffers is (rubbish mode).\n" "\n" " -A Compute mean and standard deviation (over a sliding window) for the transmit or receive rate.\n" "\n" " -B Take Ethernet framing and CRC into account when computing the average bps. This adds 4 bytes\n" " of CRC and 20 bytes of framing to each packet.\n" "\n" " -C tx_slots[,rx_slots[,tx_rings[,rx_rings]]]\n" " Configuration in terms of number of rings and slots to be used when opening the netmap port.\n" " Such configuration has an effect on software ports created on the fly, such as VALE ports and\n" " netmap pipes. The configuration may consist of 1 to 4 numbers separated by commas: tx_slots,\n" " rx_slots, tx_rings, rx_rings. Missing numbers or zeroes stand for default values. As an\n" " additional convenience, if exactly one number is specified, then this is assigned to both\n" " tx_slots and rx_slots. If there is no fourth number, then the third one is assigned to both\n" " tx_rings and rx_rings.\n" "\n" " -o options data generation options (parsed using atoi)\n" " OPT_PREFETCH 1\n" " OPT_ACCESS 2\n" " OPT_COPY 4\n" " OPT_MEMCPY 8\n" " OPT_TS 16 (add a timestamp)\n" " OPT_INDIRECT 32 (use indirect buffers)\n" " OPT_DUMP 64 (dump rx/tx traffic)\n" " OPT_RUBBISH 256\n" " (send whatever the buffers contain)\n" " OPT_RANDOM_SRC 512\n" " OPT_RANDOM_DST 1024\n" " OPT_PPS_STATS 2048\n" "", cmd); exit(errcode); } static int start_threads(struct glob_arg *g) { int i; targs = calloc(g->nthreads, sizeof(*targs)); struct targ *t; /* * Now create the desired number of threads, each one * using a single descriptor. */ for (i = 0; i < g->nthreads; i++) { uint64_t seed = (uint64_t)time(0) | ((uint64_t)time(0) << 32); t = &targs[i]; bzero(t, sizeof(*t)); t->fd = -1; /* default, with pcap */ t->g = g; memcpy(t->seed, &seed, sizeof(t->seed)); if (g->dev_type == DEV_NETMAP) { int m = -1; /* * if the user wants both HW and SW rings, we need to * know when to switch from NR_REG_ONE_NIC to NR_REG_ONE_SW */ if (g->orig_mode == NR_REG_NIC_SW) { m = (g->td_type == TD_TYPE_RECEIVER ? g->nmd->reg.nr_rx_rings : g->nmd->reg.nr_tx_rings); } if (i > 0) { int j; /* the first thread uses the fd opened by the main * thread, the other threads re-open /dev/netmap */ t->nmd = nmport_clone(g->nmd); if (t->nmd == NULL) return -1; j = i; if (m > 0 && j >= m) { /* switch to the software rings */ t->nmd->reg.nr_mode = NR_REG_ONE_SW; j -= m; } t->nmd->reg.nr_ringid = j & NETMAP_RING_MASK; /* Only touch one of the rings (rx is already ok) */ if (g->td_type == TD_TYPE_RECEIVER) t->nmd->reg.nr_flags |= NETMAP_NO_TX_POLL; /* register interface. Override ifname and ringid etc. */ if (nmport_open_desc(t->nmd) < 0) { nmport_undo_prepare(t->nmd); t->nmd = NULL; return -1; } } else { t->nmd = g->nmd; } t->fd = t->nmd->fd; t->frags = g->frags; } else { targs[i].fd = g->main_fd; } t->used = 1; t->me = i; if (g->affinity >= 0) { t->affinity = (g->affinity + i) % g->cpus; } else { t->affinity = -1; } /* default, init packets */ initialize_packet(t); } /* Wait for PHY reset. */ D("Wait %d secs for phy reset", g->wait_link); sleep(g->wait_link); D("Ready..."); for (i = 0; i < g->nthreads; i++) { t = &targs[i]; if (pthread_create(&t->thread, NULL, g->td_body, t) == -1) { D("Unable to create thread %d: %s", i, strerror(errno)); t->used = 0; } } return 0; } static void main_thread(struct glob_arg *g) { int i; struct my_ctrs prev, cur; double delta_t; struct timeval tic, toc; prev.pkts = prev.bytes = prev.events = 0; gettimeofday(&prev.t, NULL); for (;;) { char b1[40], b2[40], b3[40], b4[100]; uint64_t pps, usec; struct my_ctrs x; double abs; int done = 0; usec = wait_for_next_report(&prev.t, &cur.t, g->report_interval); cur.pkts = cur.bytes = cur.events = 0; cur.min_space = 0; if (usec < 10000) /* too short to be meaningful */ continue; /* accumulate counts for all threads */ for (i = 0; i < g->nthreads; i++) { cur.pkts += targs[i].ctr.pkts; cur.bytes += targs[i].ctr.bytes; cur.events += targs[i].ctr.events; cur.min_space += targs[i].ctr.min_space; targs[i].ctr.min_space = 99999; if (targs[i].used == 0) done++; } x.pkts = cur.pkts - prev.pkts; x.bytes = cur.bytes - prev.bytes; x.events = cur.events - prev.events; pps = (x.pkts*1000000 + usec/2) / usec; abs = (x.events > 0) ? (x.pkts / (double) x.events) : 0; if (!(g->options & OPT_PPS_STATS)) { strcpy(b4, ""); } else { /* Compute some pps stats using a sliding window. */ double ppsavg = 0.0, ppsdev = 0.0; int nsamples = 0; g->win[g->win_idx] = pps; g->win_idx = (g->win_idx + 1) % STATS_WIN; for (i = 0; i < STATS_WIN; i++) { ppsavg += g->win[i]; if (g->win[i]) { nsamples ++; } } ppsavg /= nsamples; for (i = 0; i < STATS_WIN; i++) { if (g->win[i] == 0) { continue; } ppsdev += (g->win[i] - ppsavg) * (g->win[i] - ppsavg); } ppsdev /= nsamples; ppsdev = sqrt(ppsdev); snprintf(b4, sizeof(b4), "[avg/std %s/%s pps]", norm(b1, ppsavg, normalize), norm(b2, ppsdev, normalize)); } D("%spps %s(%spkts %sbps in %llu usec) %.2f avg_batch %d min_space", norm(b1, pps, normalize), b4, norm(b2, (double)x.pkts, normalize), norm(b3, 1000000*((double)x.bytes*8+(double)x.pkts*g->framing)/usec, normalize), (unsigned long long)usec, abs, (int)cur.min_space); prev = cur; if (done == g->nthreads) break; } timerclear(&tic); timerclear(&toc); cur.pkts = cur.bytes = cur.events = 0; /* final round */ for (i = 0; i < g->nthreads; i++) { struct timespec t_tic, t_toc; /* * Join active threads, unregister interfaces and close * file descriptors. */ if (targs[i].used) pthread_join(targs[i].thread, NULL); /* blocking */ if (g->dev_type == DEV_NETMAP) { nmport_close(targs[i].nmd); targs[i].nmd = NULL; } else { close(targs[i].fd); } if (targs[i].completed == 0) D("ouch, thread %d exited with error", i); /* * Collect threads output and extract information about * how long it took to send all the packets. */ cur.pkts += targs[i].ctr.pkts; cur.bytes += targs[i].ctr.bytes; cur.events += targs[i].ctr.events; /* collect the largest start (tic) and end (toc) times, * XXX maybe we should do the earliest tic, or do a weighted * average ? */ t_tic = timeval2spec(&tic); t_toc = timeval2spec(&toc); if (!timerisset(&tic) || timespec_ge(&targs[i].tic, &t_tic)) tic = timespec2val(&targs[i].tic); if (!timerisset(&toc) || timespec_ge(&targs[i].toc, &t_toc)) toc = timespec2val(&targs[i].toc); } /* print output. */ timersub(&toc, &tic, &toc); delta_t = toc.tv_sec + 1e-6* toc.tv_usec; if (g->td_type == TD_TYPE_SENDER) tx_output(g, &cur, delta_t, "Sent"); else if (g->td_type == TD_TYPE_RECEIVER) tx_output(g, &cur, delta_t, "Received"); } struct td_desc { int ty; const char *key; void *f; int default_burst; }; static struct td_desc func[] = { { TD_TYPE_RECEIVER, "rx", receiver_body, 512}, /* default */ { TD_TYPE_SENDER, "tx", sender_body, 512 }, { TD_TYPE_OTHER, "ping", ping_body, 1 }, { TD_TYPE_OTHER, "pong", pong_body, 1 }, { TD_TYPE_SENDER, "txseq", txseq_body, 512 }, { TD_TYPE_RECEIVER, "rxseq", rxseq_body, 512 }, { 0, NULL, NULL, 0 } }; static int tap_alloc(char *dev) { struct ifreq ifr; int fd, err; const char *clonedev = TAP_CLONEDEV; (void)err; (void)dev; /* Arguments taken by the function: * * char *dev: the name of an interface (or '\0'). MUST have enough * space to hold the interface name if '\0' is passed * int flags: interface flags (eg, IFF_TUN etc.) */ #ifdef __FreeBSD__ if (dev[3]) { /* tapSomething */ static char buf[128]; snprintf(buf, sizeof(buf), "/dev/%s", dev); clonedev = buf; } #endif /* open the device */ if( (fd = open(clonedev, O_RDWR)) < 0 ) { return fd; } D("%s open successful", clonedev); /* preparation of the struct ifr, of type "struct ifreq" */ memset(&ifr, 0, sizeof(ifr)); #ifdef linux ifr.ifr_flags = IFF_TAP | IFF_NO_PI; if (*dev) { /* if a device name was specified, put it in the structure; otherwise, * the kernel will try to allocate the "next" device of the * specified type */ size_t len = strlen(dev); if (len > IFNAMSIZ) { D("%s too long", dev); return -1; } memcpy(ifr.ifr_name, dev, len); } /* try to create the device */ if( (err = ioctl(fd, TUNSETIFF, (void *) &ifr)) < 0 ) { D("failed to do a TUNSETIFF: %s", strerror(errno)); close(fd); return err; } /* if the operation was successful, write back the name of the * interface to the variable "dev", so the caller can know * it. Note that the caller MUST reserve space in *dev (see calling * code below) */ strcpy(dev, ifr.ifr_name); D("new name is %s", dev); #endif /* linux */ /* this is the special file descriptor that the caller will use to talk * with the virtual interface */ return fd; } int main(int arc, char **argv) { int i; struct sigaction sa; sigset_t ss; struct glob_arg g; int ch; int devqueues = 1; /* how many device queues */ int wait_link_arg = 0; int pkt_size_done = 0; struct td_desc *fn = func; bzero(&g, sizeof(g)); g.main_fd = -1; g.td_body = fn->f; g.td_type = fn->ty; g.report_interval = 1000; /* report interval */ g.affinity = -1; /* ip addresses can also be a range x.x.x.x-x.x.x.y */ g.af = AF_INET; /* default */ g.src_ip.name = "10.0.0.1"; g.dst_ip.name = "10.1.0.1"; g.dst_mac.name = "ff:ff:ff:ff:ff:ff"; g.src_mac.name = NULL; g.pkt_size = 60; g.pkt_min_size = 0; g.nthreads = 1; g.cpus = 1; /* default */ g.forever = 1; g.tx_rate = 0; g.frags = 1; g.frag_size = (u_int)-1; /* use the netmap buffer size by default */ g.nmr_config = ""; g.virt_header = 0; g.wait_link = 2; /* wait 2 seconds for physical ports */ while ((ch = getopt(arc, argv, "46a:f:F:Nn:i:Il:d:s:D:S:b:c:o:p:" "T:w:WvR:XC:H:rP:zZAhBM:")) != -1) { switch(ch) { default: D("bad option %c %s", ch, optarg); usage(-1); break; case 'h': usage(0); break; case '4': g.af = AF_INET; break; case '6': g.af = AF_INET6; break; case 'N': normalize = 0; break; case 'n': g.npackets = strtoull(optarg, NULL, 10); break; case 'F': i = atoi(optarg); if (i < 1 || i > 63) { D("invalid frags %d [1..63], ignore", i); break; } g.frags = i; break; case 'M': g.frag_size = atoi(optarg); break; case 'f': for (fn = func; fn->key; fn++) { if (!strcmp(fn->key, optarg)) break; } if (fn->key) { g.td_body = fn->f; g.td_type = fn->ty; } else { D("unrecognised function %s", optarg); } break; case 'o': /* data generation options */ g.options |= atoi(optarg); break; case 'a': /* force affinity */ g.affinity = atoi(optarg); break; case 'i': /* interface */ /* a prefix of tap: netmap: or pcap: forces the mode. * otherwise we guess */ D("interface is %s", optarg); if (strlen(optarg) > MAX_IFNAMELEN - 8) { D("ifname too long %s", optarg); break; } strcpy(g.ifname, optarg); if (!strcmp(optarg, "null")) { g.dev_type = DEV_NETMAP; g.dummy_send = 1; } else if (!strncmp(optarg, "tap:", 4)) { g.dev_type = DEV_TAP; strcpy(g.ifname, optarg + 4); } else if (!strncmp(optarg, "pcap:", 5)) { g.dev_type = DEV_PCAP; strcpy(g.ifname, optarg + 5); } else if (!strncmp(optarg, "netmap:", 7) || !strncmp(optarg, "vale", 4)) { g.dev_type = DEV_NETMAP; } else if (!strncmp(optarg, "tap", 3)) { g.dev_type = DEV_TAP; } else { /* prepend netmap: */ g.dev_type = DEV_NETMAP; sprintf(g.ifname, "netmap:%s", optarg); } break; case 'I': g.options |= OPT_INDIRECT; /* use indirect buffers */ break; case 'l': /* pkt_size */ if (pkt_size_done) { g.pkt_min_size = atoi(optarg); } else { g.pkt_size = atoi(optarg); pkt_size_done = 1; } break; case 'd': g.dst_ip.name = optarg; break; case 's': g.src_ip.name = optarg; break; case 'T': /* report interval */ g.report_interval = atoi(optarg); break; case 'w': g.wait_link = atoi(optarg); wait_link_arg = 1; break; case 'W': g.forever = 0; /* exit RX with no traffic */ break; case 'b': /* burst */ g.burst = atoi(optarg); break; case 'c': g.cpus = atoi(optarg); break; case 'p': g.nthreads = atoi(optarg); break; case 'D': /* destination mac */ g.dst_mac.name = optarg; break; case 'S': /* source mac */ g.src_mac.name = optarg; break; case 'v': verbose++; break; case 'R': g.tx_rate = atoi(optarg); break; case 'X': g.options |= OPT_DUMP; break; case 'C': D("WARNING: the 'C' option is deprecated, use the '+conf:' libnetmap option instead"); g.nmr_config = strdup(optarg); break; case 'H': g.virt_header = atoi(optarg); break; case 'P': g.packet_file = strdup(optarg); break; case 'r': g.options |= OPT_RUBBISH; break; case 'z': g.options |= OPT_RANDOM_SRC; break; case 'Z': g.options |= OPT_RANDOM_DST; break; case 'A': g.options |= OPT_PPS_STATS; break; case 'B': /* raw packets have4 bytes crc + 20 bytes framing */ // XXX maybe add an option to pass the IFG g.framing = 24 * 8; break; } } if (strlen(g.ifname) <=0 ) { D("missing ifname"); usage(-1); } if (g.burst == 0) { g.burst = fn->default_burst; D("using default burst size: %d", g.burst); } g.system_cpus = i = system_ncpus(); if (g.cpus < 0 || g.cpus > i) { D("%d cpus is too high, have only %d cpus", g.cpus, i); usage(-1); } D("running on %d cpus (have %d)", g.cpus, i); if (g.cpus == 0) g.cpus = i; if (!wait_link_arg && !strncmp(g.ifname, "vale", 4)) { g.wait_link = 0; } if (g.pkt_size < 16 || g.pkt_size > MAX_PKTSIZE) { D("bad pktsize %d [16..%d]\n", g.pkt_size, MAX_PKTSIZE); usage(-1); } if (g.pkt_min_size > 0 && (g.pkt_min_size < 16 || g.pkt_min_size > g.pkt_size)) { D("bad pktminsize %d [16..%d]\n", g.pkt_min_size, g.pkt_size); usage(-1); } if (g.src_mac.name == NULL) { static char mybuf[20] = "00:00:00:00:00:00"; /* retrieve source mac address. */ if (source_hwaddr(g.ifname, mybuf) == -1) { D("Unable to retrieve source mac"); // continue, fail later } g.src_mac.name = mybuf; } /* extract address ranges */ if (extract_mac_range(&g.src_mac) || extract_mac_range(&g.dst_mac)) usage(-1); g.options |= extract_ip_range(&g.src_ip, g.af); g.options |= extract_ip_range(&g.dst_ip, g.af); if (g.virt_header != 0 && g.virt_header != VIRT_HDR_1 && g.virt_header != VIRT_HDR_2) { D("bad virtio-net-header length"); usage(-1); } if (g.dev_type == DEV_TAP) { D("want to use tap %s", g.ifname); g.main_fd = tap_alloc(g.ifname); if (g.main_fd < 0) { D("cannot open tap %s", g.ifname); usage(-1); } #ifndef NO_PCAP } else if (g.dev_type == DEV_PCAP) { char pcap_errbuf[PCAP_ERRBUF_SIZE]; pcap_errbuf[0] = '\0'; // init the buffer g.p = pcap_open_live(g.ifname, 256 /* XXX */, 1, 100, pcap_errbuf); if (g.p == NULL) { D("cannot open pcap on %s", g.ifname); usage(-1); } g.main_fd = pcap_fileno(g.p); D("using pcap on %s fileno %d", g.ifname, g.main_fd); #endif /* !NO_PCAP */ } else if (g.dummy_send) { /* but DEV_NETMAP */ D("using a dummy send routine"); } else { g.nmd = nmport_prepare(g.ifname); if (g.nmd == NULL) goto out; parse_nmr_config(g.nmr_config, &g.nmd->reg); g.nmd->reg.nr_flags |= NR_ACCEPT_VNET_HDR; /* * Open the netmap device using nm_open(). * * protocol stack and may cause a reset of the card, * which in turn may take some time for the PHY to * reconfigure. We do the open here to have time to reset. */ g.orig_mode = g.nmd->reg.nr_mode; if (g.nthreads > 1) { switch (g.orig_mode) { case NR_REG_ALL_NIC: case NR_REG_NIC_SW: g.nmd->reg.nr_mode = NR_REG_ONE_NIC; break; case NR_REG_SW: g.nmd->reg.nr_mode = NR_REG_ONE_SW; break; default: break; } g.nmd->reg.nr_ringid = 0; } if (nmport_open_desc(g.nmd) < 0) goto out; g.main_fd = g.nmd->fd; ND("mapped %luKB at %p", (unsigned long)(g.nmd->req.nr_memsize>>10), g.nmd->mem); if (g.virt_header) { /* Set the virtio-net header length, since the user asked * for it explicitly. */ set_vnet_hdr_len(&g); } else { /* Check whether the netmap port we opened requires us to send * and receive frames with virtio-net header. */ get_vnet_hdr_len(&g); } /* get num of queues in tx or rx */ if (g.td_type == TD_TYPE_SENDER) devqueues = g.nmd->reg.nr_tx_rings + g.nmd->reg.nr_host_tx_rings; else devqueues = g.nmd->reg.nr_rx_rings + g.nmd->reg.nr_host_rx_rings; /* validate provided nthreads. */ if (g.nthreads < 1 || g.nthreads > devqueues) { D("bad nthreads %d, have %d queues", g.nthreads, devqueues); // continue, fail later } if (g.td_type == TD_TYPE_SENDER) { int mtu = get_if_mtu(&g); if (mtu > 0 && g.pkt_size > mtu) { D("pkt_size (%d) must be <= mtu (%d)", g.pkt_size, mtu); return -1; } } if (verbose) { struct netmap_if *nifp = g.nmd->nifp; struct nmreq_register *req = &g.nmd->reg; D("nifp at offset %"PRIu64" ntxqs %d nrxqs %d memid %d", req->nr_offset, req->nr_tx_rings, req->nr_rx_rings, req->nr_mem_id); for (i = 0; i < req->nr_tx_rings + req->nr_host_tx_rings; i++) { struct netmap_ring *ring = NETMAP_TXRING(nifp, i); D(" TX%d at offset %p slots %d", i, (void *)((char *)ring - (char *)nifp), ring->num_slots); } for (i = 0; i < req->nr_rx_rings + req->nr_host_rx_rings; i++) { struct netmap_ring *ring = NETMAP_RXRING(nifp, i); D(" RX%d at offset %p slots %d", i, (void *)((char *)ring - (char *)nifp), ring->num_slots); } } /* Print some debug information. */ fprintf(stdout, "%s %s: %d queues, %d threads and %d cpus.\n", (g.td_type == TD_TYPE_SENDER) ? "Sending on" : ((g.td_type == TD_TYPE_RECEIVER) ? "Receiving from" : "Working on"), g.ifname, devqueues, g.nthreads, g.cpus); if (g.td_type == TD_TYPE_SENDER) { fprintf(stdout, "%s -> %s (%s -> %s)\n", g.src_ip.name, g.dst_ip.name, g.src_mac.name, g.dst_mac.name); } out: /* Exit if something went wrong. */ if (g.main_fd < 0) { D("aborting"); usage(-1); } } if (g.options) { D("--- SPECIAL OPTIONS:%s%s%s%s%s%s\n", g.options & OPT_PREFETCH ? " prefetch" : "", g.options & OPT_ACCESS ? " access" : "", g.options & OPT_MEMCPY ? " memcpy" : "", g.options & OPT_INDIRECT ? " indirect" : "", g.options & OPT_COPY ? " copy" : "", g.options & OPT_RUBBISH ? " rubbish " : ""); } g.tx_period.tv_sec = g.tx_period.tv_nsec = 0; if (g.tx_rate > 0) { /* try to have at least something every second, * reducing the burst size to some 0.01s worth of data * (but no less than one full set of fragments) */ uint64_t x; int lim = (g.tx_rate)/300; if (g.burst > lim) g.burst = lim; if (g.burst == 0) g.burst = 1; x = ((uint64_t)1000000000 * (uint64_t)g.burst) / (uint64_t) g.tx_rate; g.tx_period.tv_nsec = x; g.tx_period.tv_sec = g.tx_period.tv_nsec / 1000000000; g.tx_period.tv_nsec = g.tx_period.tv_nsec % 1000000000; } if (g.td_type == TD_TYPE_SENDER) D("Sending %d packets every %ld.%09ld s", g.burst, g.tx_period.tv_sec, g.tx_period.tv_nsec); /* Install ^C handler. */ global_nthreads = g.nthreads; sigemptyset(&ss); sigaddset(&ss, SIGINT); /* block SIGINT now, so that all created threads will inherit the mask */ if (pthread_sigmask(SIG_BLOCK, &ss, NULL) < 0) { D("failed to block SIGINT: %s", strerror(errno)); } if (start_threads(&g) < 0) return 1; /* Install the handler and re-enable SIGINT for the main thread */ memset(&sa, 0, sizeof(sa)); sa.sa_handler = sigint_h; if (sigaction(SIGINT, &sa, NULL) < 0) { D("failed to install ^C handler: %s", strerror(errno)); } if (pthread_sigmask(SIG_UNBLOCK, &ss, NULL) < 0) { D("failed to re-enable SIGINT: %s", strerror(errno)); } main_thread(&g); free(targs); return 0; } /* end of file */