Index: stable/12/tools/tools/netmap/Makefile
===================================================================
--- stable/12/tools/tools/netmap/Makefile	(revision 366497)
+++ stable/12/tools/tools/netmap/Makefile	(revision 366498)
@@ -1,36 +1,38 @@
#
# $FreeBSD$
#
# For multiple programs using a single source file each,
# we can just define 'progs' and create custom targets.
PROGS = pkt-gen nmreplay bridge lb

CLEANFILES = $(PROGS) *.o
MAN=
CFLAGS += -Werror -Wall
CFLAGS += -Wextra

LDFLAGS += -lpthread
.ifdef WITHOUT_PCAP
CFLAGS += -DNO_PCAP
.else
LDFLAGS += -lpcap
.endif
LDFLAGS += -lm # used by nmreplay

.include
.include

+CFLAGS += -Wno-cast-align
+
all: $(PROGS)

pkt-gen: pkt-gen.o
	$(CC) $(CFLAGS) -o pkt-gen pkt-gen.o $(LDFLAGS)

bridge: bridge.o
	$(CC) $(CFLAGS) -o bridge bridge.o

nmreplay: nmreplay.o
	$(CC) $(CFLAGS) -o nmreplay nmreplay.o $(LDFLAGS)

lb: lb.o pkt_hash.o
	$(CC) $(CFLAGS) -o lb lb.o pkt_hash.o $(LDFLAGS)

Index: stable/12/tools/tools/netmap/bridge.c
===================================================================
--- stable/12/tools/tools/netmap/bridge.c	(revision 366497)
+++ stable/12/tools/tools/netmap/bridge.c	(revision 366498)
@@ -1,356 +1,356 @@
/*
 * (C) 2011-2014 Luigi Rizzo, Matteo Landi
 *
 * BSD license
 *
 * A netmap client to bridge two network interfaces
 * (or one interface and the host stack).
 *
 * $FreeBSD$
 */

#include
#define NETMAP_WITH_LIBS
#include
#include

-int verbose = 0;
+static int verbose = 0;

static int do_abort = 0;
static int zerocopy = 1; /* enable zerocopy if possible */

static void
sigint_h(int sig)
{
	(void)sig;	/* UNUSED */
	do_abort = 1;
	signal(SIGINT, SIG_DFL);
}

/*
 * how many packets on this set of queues ?
 */
-int
+static int
pkt_queued(struct nm_desc *d, int tx)
{
	u_int i, tot = 0;

	if (tx) {
		for (i = d->first_tx_ring; i <= d->last_tx_ring; i++) {
			tot += nm_ring_space(NETMAP_TXRING(d->nifp, i));
		}
	} else {
		for (i = d->first_rx_ring; i <= d->last_rx_ring; i++) {
			tot += nm_ring_space(NETMAP_RXRING(d->nifp, i));
		}
	}
	return tot;
}

/*
 * move up to 'limit' pkts from rxring to txring swapping buffers.
 */
static int
process_rings(struct netmap_ring *rxring, struct netmap_ring *txring,
	u_int limit, const char *msg)
{
	u_int j, k, m = 0;

	/* print a warning if any of the ring flags is set (e.g. NM_REINIT) */
	if (rxring->flags || txring->flags)
		D("%s rxflags %x txflags %x", msg, rxring->flags, txring->flags);
	j = rxring->cur; /* RX */
	k = txring->cur; /* TX */
	m = nm_ring_space(rxring);
	if (m < limit)
		limit = m;
	m = nm_ring_space(txring);
	if (m < limit)
		limit = m;
	m = limit;
	while (limit-- > 0) {
		struct netmap_slot *rs = &rxring->slot[j];
		struct netmap_slot *ts = &txring->slot[k];

		/* swap packets */
		if (ts->buf_idx < 2 || rs->buf_idx < 2) {
			RD(5, "wrong index rx[%d] = %d -> tx[%d] = %d",
				j, rs->buf_idx, k, ts->buf_idx);
			sleep(2);
		}
		/* copy the packet length. */
		if (rs->len > rxring->nr_buf_size) {
			RD(5, "wrong len %d rx[%d] -> tx[%d]", rs->len, j, k);
			rs->len = 0;
		} else if (verbose > 1) {
			D("%s send len %d rx[%d] -> tx[%d]", msg, rs->len, j, k);
		}
		ts->len = rs->len;
		if (zerocopy) {
			uint32_t pkt = ts->buf_idx;
			ts->buf_idx = rs->buf_idx;
			rs->buf_idx = pkt;
			/* report the buffer change.
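			 * NS_BUF_CHANGED tells the kernel that the slot now
			 * points to a different buffer, so it must re-derive
			 * the buffer address (and, if needed, the DMA
			 * mapping) before using the slot again.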
			 */
			ts->flags |= NS_BUF_CHANGED;
			rs->flags |= NS_BUF_CHANGED;
			/* copy the NS_MOREFRAG */
			rs->flags = (rs->flags & ~NS_MOREFRAG) |
				(ts->flags & NS_MOREFRAG);
		} else {
			char *rxbuf = NETMAP_BUF(rxring, rs->buf_idx);
			char *txbuf = NETMAP_BUF(txring, ts->buf_idx);
			nm_pkt_copy(rxbuf, txbuf, ts->len);
		}
		j = nm_ring_next(rxring, j);
		k = nm_ring_next(txring, k);
	}
	rxring->head = rxring->cur = j;
	txring->head = txring->cur = k;
	if (verbose && m > 0)
		D("%s sent %d packets to %p", msg, m, txring);

	return (m);
}

/* move packets from src to destination */
static int
move(struct nm_desc *src, struct nm_desc *dst, u_int limit)
{
	struct netmap_ring *txring, *rxring;
	u_int m = 0, si = src->first_rx_ring, di = dst->first_tx_ring;
	const char *msg = (src->req.nr_flags == NR_REG_SW) ?
		"host->net" : "net->host";

	while (si <= src->last_rx_ring && di <= dst->last_tx_ring) {
		rxring = NETMAP_RXRING(src->nifp, si);
		txring = NETMAP_TXRING(dst->nifp, di);
		ND("txring %p rxring %p", txring, rxring);
		if (nm_ring_empty(rxring)) {
			si++;
			continue;
		}
		if (nm_ring_empty(txring)) {
			di++;
			continue;
		}
		m += process_rings(rxring, txring, limit, msg);
	}

	return (m);
}

static void
usage(void)
{
	fprintf(stderr,
	    "netmap bridge program: forward packets between two "
	    "network interfaces\n"
	    " usage(1): bridge [-v] [-i ifa] [-i ifb] [-b burst] "
	    "[-w wait_time] [-L]\n"
	    " usage(2): bridge [-v] [-w wait_time] [-L] "
	    "[ifa [ifb [burst]]]\n"
	    "\n"
	    " ifa and ifb are specified using the nm_open() syntax.\n"
	    " When ifb is missing (or is equal to ifa), bridge will\n"
	    " forward between ifa and the host stack if -L\n"
	    " is not specified, otherwise loopback traffic on ifa.\n"
	    "\n"
	    " example: bridge -w 10 -i netmap:eth3 -i netmap:eth1\n"
	    );
	exit(1);
}

/*
 * bridge [-v] if1 [if2]
 *
 * If only one name, or the two interfaces are the same,
 * bridges userland and the adapter. Otherwise bridges
 * two interfaces.
 */
int
main(int argc, char **argv)
{
	struct pollfd pollfd[2];
	int ch;
	u_int burst = 1024, wait_link = 4;
	struct nm_desc *pa = NULL, *pb = NULL;
	char *ifa = NULL, *ifb = NULL;
	char ifabuf[64] = { 0 };
	int loopback = 0;

	fprintf(stderr, "%s built %s %s\n\n", argv[0], __DATE__, __TIME__);

	while ((ch = getopt(argc, argv, "hb:ci:vw:L")) != -1) {
		switch (ch) {
		default:
			D("bad option %c %s", ch, optarg);
			/* fallthrough */
		case 'h':
			usage();
			break;
		case 'b':	/* burst */
			burst = atoi(optarg);
			break;
		case 'i':	/* interface */
			if (ifa == NULL)
				ifa = optarg;
			else if (ifb == NULL)
				ifb = optarg;
			else
				D("%s ignored, already have 2 interfaces", optarg);
			break;
		case 'c':
			zerocopy = 0; /* do not zerocopy */
			break;
		case 'v':
			verbose++;
			break;
		case 'w':
			wait_link = atoi(optarg);
			break;
		case 'L':
			loopback = 1;
			break;
		}
	}

	argc -= optind;
	argv += optind;

	if (argc > 0)
		ifa = argv[0];
	if (argc > 1)
		ifb = argv[1];
	if (argc > 2)
		burst = atoi(argv[2]);
	if (!ifb)
		ifb = ifa;
	if (!ifa) {
		D("missing interface");
		usage();
	}
	if (burst < 1 || burst > 8192) {
		D("invalid burst %d, set to 1024", burst);
		burst = 1024;
	}
	if (wait_link > 100) {
		D("invalid wait_link %d, set to 4", wait_link);
		wait_link = 4;
	}
	if (!strcmp(ifa, ifb)) {
		if (!loopback) {
			D("same interface, endpoint 0 goes to host");
			snprintf(ifabuf, sizeof(ifabuf) - 1, "%s^", ifa);
			ifa = ifabuf;
		} else {
			D("same interface, loopbacking traffic");
		}
	} else {
		/* two different interfaces.
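		 * (Interface names follow the nm_open() syntax: e.g.
		 * "netmap:eth0" for a NIC, "netmap:eth0^" for the host
		 * stack attached to eth0, or "vale0:p" for a VALE switch
		 * port.)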
Take all rings on if1 */ } pa = nm_open(ifa, NULL, 0, NULL); if (pa == NULL) { D("cannot open %s", ifa); return (1); } /* try to reuse the mmap() of the first interface, if possible */ pb = nm_open(ifb, NULL, NM_OPEN_NO_MMAP, pa); if (pb == NULL) { D("cannot open %s", ifb); nm_close(pa); return (1); } zerocopy = zerocopy && (pa->mem == pb->mem); D("------- zerocopy %ssupported", zerocopy ? "" : "NOT "); /* setup poll(2) array */ memset(pollfd, 0, sizeof(pollfd)); pollfd[0].fd = pa->fd; pollfd[1].fd = pb->fd; D("Wait %d secs for link to come up...", wait_link); sleep(wait_link); D("Ready to go, %s 0x%x/%d <-> %s 0x%x/%d.", pa->req.nr_name, pa->first_rx_ring, pa->req.nr_rx_rings, pb->req.nr_name, pb->first_rx_ring, pb->req.nr_rx_rings); /* main loop */ signal(SIGINT, sigint_h); while (!do_abort) { int n0, n1, ret; pollfd[0].events = pollfd[1].events = 0; pollfd[0].revents = pollfd[1].revents = 0; n0 = pkt_queued(pa, 0); n1 = pkt_queued(pb, 0); #if defined(_WIN32) || defined(BUSYWAIT) if (n0) { ioctl(pollfd[1].fd, NIOCTXSYNC, NULL); pollfd[1].revents = POLLOUT; } else { ioctl(pollfd[0].fd, NIOCRXSYNC, NULL); } if (n1) { ioctl(pollfd[0].fd, NIOCTXSYNC, NULL); pollfd[0].revents = POLLOUT; } else { ioctl(pollfd[1].fd, NIOCRXSYNC, NULL); } ret = 1; #else if (n0) pollfd[1].events |= POLLOUT; else pollfd[0].events |= POLLIN; if (n1) pollfd[0].events |= POLLOUT; else pollfd[1].events |= POLLIN; /* poll() also cause kernel to txsync/rxsync the NICs */ ret = poll(pollfd, 2, 2500); #endif /* defined(_WIN32) || defined(BUSYWAIT) */ if (ret <= 0 || verbose) D("poll %s [0] ev %x %x rx %d@%d tx %d," " [1] ev %x %x rx %d@%d tx %d", ret <= 0 ? "timeout" : "ok", pollfd[0].events, pollfd[0].revents, pkt_queued(pa, 0), NETMAP_RXRING(pa->nifp, pa->cur_rx_ring)->cur, pkt_queued(pa, 1), pollfd[1].events, pollfd[1].revents, pkt_queued(pb, 0), NETMAP_RXRING(pb->nifp, pb->cur_rx_ring)->cur, pkt_queued(pb, 1) ); if (ret < 0) continue; if (pollfd[0].revents & POLLERR) { struct netmap_ring *rx = NETMAP_RXRING(pa->nifp, pa->cur_rx_ring); D("error on fd0, rx [%d,%d,%d)", rx->head, rx->cur, rx->tail); } if (pollfd[1].revents & POLLERR) { struct netmap_ring *rx = NETMAP_RXRING(pb->nifp, pb->cur_rx_ring); D("error on fd1, rx [%d,%d,%d)", rx->head, rx->cur, rx->tail); } if (pollfd[0].revents & POLLOUT) move(pb, pa, burst); if (pollfd[1].revents & POLLOUT) move(pa, pb, burst); /* We don't need ioctl(NIOCTXSYNC) on the two file descriptors here, * kernel will txsync on next poll(). */ } nm_close(pb); nm_close(pa); return (0); } Index: stable/12/tools/tools/netmap/ctrs.h =================================================================== --- stable/12/tools/tools/netmap/ctrs.h (revision 366497) +++ stable/12/tools/tools/netmap/ctrs.h (revision 366498) @@ -1,116 +1,116 @@ #ifndef CTRS_H_ #define CTRS_H_ /* $FreeBSD$ */ #include /* counters to accumulate statistics */ struct my_ctrs { uint64_t pkts, bytes, events; uint64_t drop, drop_bytes; uint64_t min_space; struct timeval t; uint32_t oq_n; /* number of elements in overflow queue (used in lb) */ }; /* very crude code to print a number in normalized form. * Caller has to make sure that the buffer is large enough. 
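 * Example (hypothetical values): with normalize set,
 * norm2(buf, 1234567.0, "%.3f %s", 1) stores "1.235 M" in buf,
 * while norm2(buf, 950.0, "%.3f %s", 1) stores "950.000 "
 * (no unit suffix).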
*/ static const char * -norm2(char *buf, double val, char *fmt, int normalize) +norm2(char *buf, double val, const char *fmt, int normalize) { - char *units[] = { "", "K", "M", "G", "T" }; + const char *units[] = { "", "K", "M", "G", "T" }; u_int i; if (normalize) - for (i = 0; val >=1000 && i < sizeof(units)/sizeof(char *) - 1; i++) + for (i = 0; val >=1000 && i < sizeof(units)/sizeof(const char *) - 1; i++) val /= 1000; else i=0; sprintf(buf, fmt, val, units[i]); return buf; } static __inline const char * norm(char *buf, double val, int normalize) { if (normalize) return norm2(buf, val, "%.3f %s", normalize); else return norm2(buf, val, "%.0f %s", normalize); } static __inline int timespec_ge(const struct timespec *a, const struct timespec *b) { if (a->tv_sec > b->tv_sec) return (1); if (a->tv_sec < b->tv_sec) return (0); if (a->tv_nsec >= b->tv_nsec) return (1); return (0); } static __inline struct timespec timeval2spec(const struct timeval *a) { struct timespec ts = { .tv_sec = a->tv_sec, .tv_nsec = a->tv_usec * 1000 }; return ts; } static __inline struct timeval timespec2val(const struct timespec *a) { struct timeval tv = { .tv_sec = a->tv_sec, .tv_usec = a->tv_nsec / 1000 }; return tv; } static __inline struct timespec timespec_add(struct timespec a, struct timespec b) { struct timespec ret = { a.tv_sec + b.tv_sec, a.tv_nsec + b.tv_nsec }; if (ret.tv_nsec >= 1000000000) { ret.tv_sec++; ret.tv_nsec -= 1000000000; } return ret; } static __inline struct timespec timespec_sub(struct timespec a, struct timespec b) { struct timespec ret = { a.tv_sec - b.tv_sec, a.tv_nsec - b.tv_nsec }; if (ret.tv_nsec < 0) { ret.tv_sec--; ret.tv_nsec += 1000000000; } return ret; } static __inline uint64_t wait_for_next_report(struct timeval *prev, struct timeval *cur, int report_interval) { struct timeval delta; delta.tv_sec = report_interval/1000; delta.tv_usec = (report_interval%1000)*1000; if (select(0, NULL, NULL, NULL, &delta) < 0 && errno != EINTR) { perror("select"); abort(); } gettimeofday(cur, NULL); timersub(cur, prev, &delta); return delta.tv_sec* 1000000 + delta.tv_usec; } #endif /* CTRS_H_ */ Index: stable/12/tools/tools/netmap/lb.c =================================================================== --- stable/12/tools/tools/netmap/lb.c (revision 366497) +++ stable/12/tools/tools/netmap/lb.c (revision 366498) @@ -1,1027 +1,1028 @@ /* * Copyright (C) 2017 Corelight, Inc. and Universita` di Pisa. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* $FreeBSD$ */ #include #include #include #include #include #include #define NETMAP_WITH_LIBS #include #include #include /* htonl */ #include #include "pkt_hash.h" #include "ctrs.h" /* * use our version of header structs, rather than bringing in a ton * of platform specific ones */ #ifndef ETH_ALEN #define ETH_ALEN 6 #endif struct compact_eth_hdr { unsigned char h_dest[ETH_ALEN]; unsigned char h_source[ETH_ALEN]; u_int16_t h_proto; }; struct compact_ip_hdr { u_int8_t ihl:4, version:4; u_int8_t tos; u_int16_t tot_len; u_int16_t id; u_int16_t frag_off; u_int8_t ttl; u_int8_t protocol; u_int16_t check; u_int32_t saddr; u_int32_t daddr; }; struct compact_ipv6_hdr { u_int8_t priority:4, version:4; u_int8_t flow_lbl[3]; u_int16_t payload_len; u_int8_t nexthdr; u_int8_t hop_limit; struct in6_addr saddr; struct in6_addr daddr; }; #define MAX_IFNAMELEN 64 #define MAX_PORTNAMELEN (MAX_IFNAMELEN + 40) #define DEF_OUT_PIPES 2 #define DEF_EXTRA_BUFS 0 #define DEF_BATCH 2048 #define DEF_WAIT_LINK 2 #define DEF_STATS_INT 600 #define BUF_REVOKE 100 #define STAT_MSG_MAXSIZE 1024 -struct { +static struct { char ifname[MAX_IFNAMELEN]; char base_name[MAX_IFNAMELEN]; int netmap_fd; uint16_t output_rings; uint16_t num_groups; uint32_t extra_bufs; uint16_t batch; int stdout_interval; int syslog_interval; int wait_link; bool busy_wait; } glob_arg; /* * the overflow queue is a circular queue of buffers */ struct overflow_queue { char name[MAX_IFNAMELEN + 16]; struct netmap_slot *slots; uint32_t head; uint32_t tail; uint32_t n; uint32_t size; }; -struct overflow_queue *freeq; +static struct overflow_queue *freeq; static inline int oq_full(struct overflow_queue *q) { return q->n >= q->size; } static inline int oq_empty(struct overflow_queue *q) { return q->n <= 0; } static inline void oq_enq(struct overflow_queue *q, const struct netmap_slot *s) { if (unlikely(oq_full(q))) { D("%s: queue full!", q->name); abort(); } q->slots[q->tail] = *s; q->n++; q->tail++; if (q->tail >= q->size) q->tail = 0; } static inline struct netmap_slot oq_deq(struct overflow_queue *q) { struct netmap_slot s = q->slots[q->head]; if (unlikely(oq_empty(q))) { D("%s: queue empty!", q->name); abort(); } q->n--; q->head++; if (q->head >= q->size) q->head = 0; return s; } static volatile int do_abort = 0; -uint64_t dropped = 0; -uint64_t forwarded = 0; -uint64_t received_bytes = 0; -uint64_t received_pkts = 0; -uint64_t non_ip = 0; -uint32_t freeq_n = 0; +static uint64_t dropped = 0; +static uint64_t forwarded = 0; +static uint64_t received_bytes = 0; +static uint64_t received_pkts = 0; +static uint64_t non_ip = 0; +static uint32_t freeq_n = 0; struct port_des { char interface[MAX_PORTNAMELEN]; struct my_ctrs ctr; unsigned int last_sync; uint32_t last_tail; struct overflow_queue *oq; struct nm_desc *nmd; struct netmap_ring *ring; struct group_des *group; }; -struct port_des *ports; +static struct port_des *ports; /* each group of pipes receives all the packets */ struct group_des { char pipename[MAX_IFNAMELEN]; struct port_des 
*ports; int first_id; int nports; int last; int custom_port; }; -struct group_des *groups; +static struct group_des *groups; /* statistcs */ struct counters { struct timeval ts; struct my_ctrs *ctrs; uint64_t received_pkts; uint64_t received_bytes; uint64_t non_ip; uint32_t freeq_n; int status __attribute__((aligned(64))); #define COUNTERS_EMPTY 0 #define COUNTERS_FULL 1 }; -struct counters counters_buf; +static struct counters counters_buf; static void * print_stats(void *arg) { int npipes = glob_arg.output_rings; int sys_int = 0; (void)arg; struct my_ctrs cur, prev; struct my_ctrs *pipe_prev; pipe_prev = calloc(npipes, sizeof(struct my_ctrs)); if (pipe_prev == NULL) { D("out of memory"); exit(1); } char stat_msg[STAT_MSG_MAXSIZE] = ""; memset(&prev, 0, sizeof(prev)); while (!do_abort) { int j, dosyslog = 0, dostdout = 0, newdata; uint64_t pps = 0, dps = 0, bps = 0, dbps = 0, usec = 0; struct my_ctrs x; counters_buf.status = COUNTERS_EMPTY; newdata = 0; memset(&cur, 0, sizeof(cur)); sleep(1); if (counters_buf.status == COUNTERS_FULL) { __sync_synchronize(); newdata = 1; cur.t = counters_buf.ts; if (prev.t.tv_sec || prev.t.tv_usec) { usec = (cur.t.tv_sec - prev.t.tv_sec) * 1000000 + cur.t.tv_usec - prev.t.tv_usec; } } ++sys_int; if (glob_arg.stdout_interval && sys_int % glob_arg.stdout_interval == 0) dostdout = 1; if (glob_arg.syslog_interval && sys_int % glob_arg.syslog_interval == 0) dosyslog = 1; for (j = 0; j < npipes; ++j) { struct my_ctrs *c = &counters_buf.ctrs[j]; cur.pkts += c->pkts; cur.drop += c->drop; cur.drop_bytes += c->drop_bytes; cur.bytes += c->bytes; if (usec) { x.pkts = c->pkts - pipe_prev[j].pkts; x.drop = c->drop - pipe_prev[j].drop; x.bytes = c->bytes - pipe_prev[j].bytes; x.drop_bytes = c->drop_bytes - pipe_prev[j].drop_bytes; pps = (x.pkts*1000000 + usec/2) / usec; dps = (x.drop*1000000 + usec/2) / usec; bps = ((x.bytes*1000000 + usec/2) / usec) * 8; dbps = ((x.drop_bytes*1000000 + usec/2) / usec) * 8; } pipe_prev[j] = *c; if ( (dosyslog || dostdout) && newdata ) snprintf(stat_msg, STAT_MSG_MAXSIZE, "{" "\"ts\":%.6f," "\"interface\":\"%s\"," "\"output_ring\":%" PRIu16 "," "\"packets_forwarded\":%" PRIu64 "," "\"packets_dropped\":%" PRIu64 "," "\"data_forward_rate_Mbps\":%.4f," "\"data_drop_rate_Mbps\":%.4f," "\"packet_forward_rate_kpps\":%.4f," "\"packet_drop_rate_kpps\":%.4f," "\"overflow_queue_size\":%" PRIu32 "}", cur.t.tv_sec + (cur.t.tv_usec / 1000000.0), ports[j].interface, j, c->pkts, c->drop, (double)bps / 1024 / 1024, (double)dbps / 1024 / 1024, (double)pps / 1000, (double)dps / 1000, c->oq_n); if (dosyslog && stat_msg[0]) syslog(LOG_INFO, "%s", stat_msg); if (dostdout && stat_msg[0]) printf("%s\n", stat_msg); } if (usec) { x.pkts = cur.pkts - prev.pkts; x.drop = cur.drop - prev.drop; x.bytes = cur.bytes - prev.bytes; x.drop_bytes = cur.drop_bytes - prev.drop_bytes; pps = (x.pkts*1000000 + usec/2) / usec; dps = (x.drop*1000000 + usec/2) / usec; bps = ((x.bytes*1000000 + usec/2) / usec) * 8; dbps = ((x.drop_bytes*1000000 + usec/2) / usec) * 8; } if ( (dosyslog || dostdout) && newdata ) snprintf(stat_msg, STAT_MSG_MAXSIZE, "{" "\"ts\":%.6f," "\"interface\":\"%s\"," "\"output_ring\":null," "\"packets_received\":%" PRIu64 "," "\"packets_forwarded\":%" PRIu64 "," "\"packets_dropped\":%" PRIu64 "," "\"non_ip_packets\":%" PRIu64 "," "\"data_forward_rate_Mbps\":%.4f," "\"data_drop_rate_Mbps\":%.4f," "\"packet_forward_rate_kpps\":%.4f," "\"packet_drop_rate_kpps\":%.4f," "\"free_buffer_slots\":%" PRIu32 "}", cur.t.tv_sec + (cur.t.tv_usec / 1000000.0), 
glob_arg.ifname, received_pkts, cur.pkts, cur.drop, counters_buf.non_ip, (double)bps / 1024 / 1024, (double)dbps / 1024 / 1024, (double)pps / 1000, (double)dps / 1000, counters_buf.freeq_n); if (dosyslog && stat_msg[0]) syslog(LOG_INFO, "%s", stat_msg); if (dostdout && stat_msg[0]) printf("%s\n", stat_msg); prev = cur; } free(pipe_prev); return NULL; } static void free_buffers(void) { int i, tot = 0; struct port_des *rxport = &ports[glob_arg.output_rings]; /* build a netmap free list with the buffers in all the overflow queues */ for (i = 0; i < glob_arg.output_rings + 1; i++) { struct port_des *cp = &ports[i]; struct overflow_queue *q = cp->oq; if (!q) continue; while (q->n) { struct netmap_slot s = oq_deq(q); uint32_t *b = (uint32_t *)NETMAP_BUF(cp->ring, s.buf_idx); *b = rxport->nmd->nifp->ni_bufs_head; rxport->nmd->nifp->ni_bufs_head = s.buf_idx; tot++; } } D("added %d buffers to netmap free list", tot); for (i = 0; i < glob_arg.output_rings + 1; ++i) { nm_close(ports[i].nmd); } } static void sigint_h(int sig) { (void)sig; /* UNUSED */ do_abort = 1; signal(SIGINT, SIG_DFL); } -void usage() +static void usage() { printf("usage: lb [options]\n"); printf("where options are:\n"); printf(" -h view help text\n"); printf(" -i iface interface name (required)\n"); printf(" -p [prefix:]npipes add a new group of output pipes\n"); printf(" -B nbufs number of extra buffers (default: %d)\n", DEF_EXTRA_BUFS); printf(" -b batch batch size (default: %d)\n", DEF_BATCH); printf(" -w seconds wait for link up (default: %d)\n", DEF_WAIT_LINK); printf(" -W enable busy waiting. this will run your CPU at 100%%\n"); printf(" -s seconds seconds between syslog stats messages (default: 0)\n"); printf(" -o seconds seconds between stdout stats messages (default: 0)\n"); exit(0); } static int -parse_pipes(char *spec) +parse_pipes(const char *spec) { - char *end = index(spec, ':'); + const char *end = index(spec, ':'); static int max_groups = 0; struct group_des *g; ND("spec %s num_groups %d", spec, glob_arg.num_groups); if (max_groups < glob_arg.num_groups + 1) { size_t size = sizeof(*g) * (glob_arg.num_groups + 1); groups = realloc(groups, size); if (groups == NULL) { D("out of memory"); return 1; } } g = &groups[glob_arg.num_groups]; memset(g, 0, sizeof(*g)); if (end != NULL) { if (end - spec > MAX_IFNAMELEN - 8) { D("name '%s' too long", spec); return 1; } if (end == spec) { D("missing prefix before ':' in '%s'", spec); return 1; } strncpy(g->pipename, spec, end - spec); g->custom_port = 1; end++; } else { /* no prefix, this group will use the * name of the input port. * This will be set in init_groups(), * since here the input port may still * be uninitialized */ end = spec; } if (*end == '\0') { g->nports = DEF_OUT_PIPES; } else { g->nports = atoi(end); if (g->nports < 1) { D("invalid number of pipes '%s' (must be at least 1)", end); return 1; } } glob_arg.output_rings += g->nports; glob_arg.num_groups++; return 0; } /* complete the initialization of the groups data structure */ -void init_groups(void) +static void +init_groups(void) { int i, j, t = 0; struct group_des *g = NULL; for (i = 0; i < glob_arg.num_groups; i++) { g = &groups[i]; g->ports = &ports[t]; for (j = 0; j < g->nports; j++) g->ports[j].group = g; t += g->nports; if (!g->custom_port) strcpy(g->pipename, glob_arg.base_name); for (j = 0; j < i; j++) { struct group_des *h = &groups[j]; if (!strcmp(h->pipename, g->pipename)) g->first_id += h->nports; } } g->last = 1; } /* push the packet described by slot rs to the group g. 
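 * (The caller hands in rs->buf_idx and receives a replacement
 * buffer index in return, so buffers circulate between the input
 * ring, the output pipes and the overflow queues without copies.)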
* This may cause other buffers to be pushed down the * chain headed by g. * Return a free buffer. */ -uint32_t forward_packet(struct group_des *g, struct netmap_slot *rs) +static uint32_t +forward_packet(struct group_des *g, struct netmap_slot *rs) { uint32_t hash = rs->ptr; uint32_t output_port = hash % g->nports; struct port_des *port = &g->ports[output_port]; struct netmap_ring *ring = port->ring; struct overflow_queue *q = port->oq; /* Move the packet to the output pipe, unless there is * either no space left on the ring, or there is some * packet still in the overflow queue (since those must * take precedence over the new one) */ if (ring->head != ring->tail && (q == NULL || oq_empty(q))) { struct netmap_slot *ts = &ring->slot[ring->head]; struct netmap_slot old_slot = *ts; ts->buf_idx = rs->buf_idx; ts->len = rs->len; ts->flags |= NS_BUF_CHANGED; ts->ptr = rs->ptr; ring->head = nm_ring_next(ring, ring->head); port->ctr.bytes += rs->len; port->ctr.pkts++; forwarded++; return old_slot.buf_idx; } /* use the overflow queue, if available */ if (q == NULL || oq_full(q)) { /* no space left on the ring and no overflow queue * available: we are forced to drop the packet */ dropped++; port->ctr.drop++; port->ctr.drop_bytes += rs->len; return rs->buf_idx; } oq_enq(q, rs); /* * we cannot continue down the chain and we need to * return a free buffer now. We take it from the free queue. */ if (oq_empty(freeq)) { /* the free queue is empty. Revoke some buffers * from the longest overflow queue */ uint32_t j; struct port_des *lp = &ports[0]; uint32_t max = lp->oq->n; /* let lp point to the port with the longest queue */ for (j = 1; j < glob_arg.output_rings; j++) { struct port_des *cp = &ports[j]; if (cp->oq->n > max) { lp = cp; max = cp->oq->n; } } /* move the oldest BUF_REVOKE buffers from the * lp queue to the free queue */ // XXX optimize this cycle for (j = 0; lp->oq->n && j < BUF_REVOKE; j++) { struct netmap_slot tmp = oq_deq(lp->oq); dropped++; lp->ctr.drop++; lp->ctr.drop_bytes += tmp.len; oq_enq(freeq, &tmp); } ND(1, "revoked %d buffers from %s", j, lq->name); } return oq_deq(freeq).buf_idx; } int main(int argc, char **argv) { int ch; uint32_t i; int rv; unsigned int iter = 0; int poll_timeout = 10; /* default */ glob_arg.ifname[0] = '\0'; glob_arg.output_rings = 0; glob_arg.batch = DEF_BATCH; glob_arg.wait_link = DEF_WAIT_LINK; glob_arg.busy_wait = false; glob_arg.syslog_interval = 0; glob_arg.stdout_interval = 0; while ( (ch = getopt(argc, argv, "hi:p:b:B:s:o:w:W")) != -1) { switch (ch) { case 'i': D("interface is %s", optarg); if (strlen(optarg) > MAX_IFNAMELEN - 8) { D("ifname too long %s", optarg); return 1; } if (strncmp(optarg, "netmap:", 7) && strncmp(optarg, "vale", 4)) { sprintf(glob_arg.ifname, "netmap:%s", optarg); } else { strcpy(glob_arg.ifname, optarg); } break; case 'p': if (parse_pipes(optarg)) { usage(); return 1; } break; case 'B': glob_arg.extra_bufs = atoi(optarg); D("requested %d extra buffers", glob_arg.extra_bufs); break; case 'b': glob_arg.batch = atoi(optarg); D("batch is %d", glob_arg.batch); break; case 'w': glob_arg.wait_link = atoi(optarg); D("link wait for up time is %d", glob_arg.wait_link); break; case 'W': glob_arg.busy_wait = true; break; case 'o': glob_arg.stdout_interval = atoi(optarg); break; case 's': glob_arg.syslog_interval = atoi(optarg); break; case 'h': usage(); return 0; break; default: D("bad option %c %s", ch, optarg); usage(); return 1; } } if (glob_arg.ifname[0] == '\0') { D("missing interface name"); usage(); return 1; } /* extract the base 
name */ char *nscan = strncmp(glob_arg.ifname, "netmap:", 7) ? glob_arg.ifname : glob_arg.ifname + 7; strncpy(glob_arg.base_name, nscan, MAX_IFNAMELEN - 1); for (nscan = glob_arg.base_name; *nscan && !index("-*^{}/@", *nscan); nscan++) ; *nscan = '\0'; if (glob_arg.num_groups == 0) parse_pipes(""); if (glob_arg.syslog_interval) { setlogmask(LOG_UPTO(LOG_INFO)); openlog("lb", LOG_CONS | LOG_PID | LOG_NDELAY, LOG_LOCAL1); } uint32_t npipes = glob_arg.output_rings; pthread_t stat_thread; ports = calloc(npipes + 1, sizeof(struct port_des)); if (!ports) { D("failed to allocate the stats array"); return 1; } struct port_des *rxport = &ports[npipes]; init_groups(); memset(&counters_buf, 0, sizeof(counters_buf)); counters_buf.ctrs = calloc(npipes, sizeof(struct my_ctrs)); if (!counters_buf.ctrs) { D("failed to allocate the counters snapshot buffer"); return 1; } /* we need base_req to specify pipes and extra bufs */ struct nmreq base_req; memset(&base_req, 0, sizeof(base_req)); base_req.nr_arg1 = npipes; base_req.nr_arg3 = glob_arg.extra_bufs; rxport->nmd = nm_open(glob_arg.ifname, &base_req, 0, NULL); if (rxport->nmd == NULL) { D("cannot open %s", glob_arg.ifname); return (1); } else { D("successfully opened %s (tx rings: %u)", glob_arg.ifname, rxport->nmd->req.nr_tx_slots); } uint32_t extra_bufs = rxport->nmd->req.nr_arg3; struct overflow_queue *oq = NULL; /* reference ring to access the buffers */ rxport->ring = NETMAP_RXRING(rxport->nmd->nifp, 0); if (!glob_arg.extra_bufs) goto run; D("obtained %d extra buffers", extra_bufs); if (!extra_bufs) goto run; /* one overflow queue for each output pipe, plus one for the * free extra buffers */ oq = calloc(npipes + 1, sizeof(struct overflow_queue)); if (!oq) { D("failed to allocated overflow queues descriptors"); goto run; } freeq = &oq[npipes]; rxport->oq = freeq; freeq->slots = calloc(extra_bufs, sizeof(struct netmap_slot)); if (!freeq->slots) { D("failed to allocate the free list"); } freeq->size = extra_bufs; snprintf(freeq->name, MAX_IFNAMELEN, "free queue"); /* * the list of buffers uses the first uint32_t in each buffer * as the index of the next buffer. */ uint32_t scan; for (scan = rxport->nmd->nifp->ni_bufs_head; scan; scan = *(uint32_t *)NETMAP_BUF(rxport->ring, scan)) { struct netmap_slot s; s.len = s.flags = 0; s.ptr = 0; s.buf_idx = scan; ND("freeq <- %d", s.buf_idx); oq_enq(freeq, &s); } if (freeq->n != extra_bufs) { D("something went wrong: netmap reported %d extra_bufs, but the free list contained %d", extra_bufs, freeq->n); return 1; } rxport->nmd->nifp->ni_bufs_head = 0; run: atexit(free_buffers); int j, t = 0; for (j = 0; j < glob_arg.num_groups; j++) { struct group_des *g = &groups[j]; int k; for (k = 0; k < g->nports; ++k) { struct port_des *p = &g->ports[k]; snprintf(p->interface, MAX_PORTNAMELEN, "%s%s{%d/xT@%d", (strncmp(g->pipename, "vale", 4) ? 
"netmap:" : ""), g->pipename, g->first_id + k, rxport->nmd->req.nr_arg2); D("opening pipe named %s", p->interface); p->nmd = nm_open(p->interface, NULL, 0, rxport->nmd); if (p->nmd == NULL) { D("cannot open %s", p->interface); return (1); } else if (p->nmd->req.nr_arg2 != rxport->nmd->req.nr_arg2) { D("failed to open pipe #%d in zero-copy mode, " "please close any application that uses either pipe %s}%d, " "or %s{%d, and retry", k + 1, g->pipename, g->first_id + k, g->pipename, g->first_id + k); return (1); } else { D("successfully opened pipe #%d %s (tx slots: %d)", k + 1, p->interface, p->nmd->req.nr_tx_slots); p->ring = NETMAP_TXRING(p->nmd->nifp, 0); p->last_tail = nm_ring_next(p->ring, p->ring->tail); } D("zerocopy %s", (rxport->nmd->mem == p->nmd->mem) ? "enabled" : "disabled"); if (extra_bufs) { struct overflow_queue *q = &oq[t + k]; q->slots = calloc(extra_bufs, sizeof(struct netmap_slot)); if (!q->slots) { D("failed to allocate overflow queue for pipe %d", k); /* make all overflow queue management fail */ extra_bufs = 0; } q->size = extra_bufs; snprintf(q->name, sizeof(q->name), "oq %s{%4d", g->pipename, k); p->oq = q; } } t += g->nports; } if (glob_arg.extra_bufs && !extra_bufs) { if (oq) { for (i = 0; i < npipes + 1; i++) { free(oq[i].slots); oq[i].slots = NULL; } free(oq); oq = NULL; } D("*** overflow queues disabled ***"); } sleep(glob_arg.wait_link); /* start stats thread after wait_link */ if (pthread_create(&stat_thread, NULL, print_stats, NULL) == -1) { D("unable to create the stats thread: %s", strerror(errno)); return 1; } struct pollfd pollfd[npipes + 1]; memset(&pollfd, 0, sizeof(pollfd)); signal(SIGINT, sigint_h); /* make sure we wake up as often as needed, even when there are no * packets coming in */ if (glob_arg.syslog_interval > 0 && glob_arg.syslog_interval < poll_timeout) poll_timeout = glob_arg.syslog_interval; if (glob_arg.stdout_interval > 0 && glob_arg.stdout_interval < poll_timeout) poll_timeout = glob_arg.stdout_interval; while (!do_abort) { u_int polli = 0; iter++; for (i = 0; i < npipes; ++i) { struct netmap_ring *ring = ports[i].ring; int pending = nm_tx_pending(ring); /* if there are packets pending, we want to be notified when * tail moves, so we let cur=tail */ ring->cur = pending ? ring->tail : ring->head; if (!glob_arg.busy_wait && !pending) { /* no need to poll, there are no packets pending */ continue; } pollfd[polli].fd = ports[i].nmd->fd; pollfd[polli].events = POLLOUT; pollfd[polli].revents = 0; ++polli; } pollfd[polli].fd = rxport->nmd->fd; pollfd[polli].events = POLLIN; pollfd[polli].revents = 0; ++polli; //RD(5, "polling %d file descriptors", polli+1); rv = poll(pollfd, polli, poll_timeout); if (rv <= 0) { if (rv < 0 && errno != EAGAIN && errno != EINTR) RD(1, "poll error %s", strerror(errno)); goto send_stats; } /* if there are several groups, try pushing released packets from * upstream groups to the downstream ones. * * It is important to do this before returned slots are reused * for new transmissions. For the same reason, this must be * done starting from the last group going backwards. 
*/ for (i = glob_arg.num_groups - 1U; i > 0; i--) { struct group_des *g = &groups[i - 1]; - int j; for (j = 0; j < g->nports; j++) { struct port_des *p = &g->ports[j]; struct netmap_ring *ring = p->ring; uint32_t last = p->last_tail, stop = nm_ring_next(ring, ring->tail); /* slight abuse of the API here: we touch the slot * pointed to by tail */ for ( ; last != stop; last = nm_ring_next(ring, last)) { struct netmap_slot *rs = &ring->slot[last]; // XXX less aggressive? rs->buf_idx = forward_packet(g + 1, rs); rs->flags |= NS_BUF_CHANGED; rs->ptr = 0; } p->last_tail = last; } } if (oq) { /* try to push packets from the overflow queues * to the corresponding pipes */ for (i = 0; i < npipes; i++) { struct port_des *p = &ports[i]; struct overflow_queue *q = p->oq; - uint32_t j, lim; + uint32_t k, lim; struct netmap_ring *ring; struct netmap_slot *slot; if (oq_empty(q)) continue; ring = p->ring; lim = nm_ring_space(ring); if (!lim) continue; if (q->n < lim) lim = q->n; - for (j = 0; j < lim; j++) { + for (k = 0; k < lim; k++) { struct netmap_slot s = oq_deq(q), tmp; tmp.ptr = 0; slot = &ring->slot[ring->head]; tmp.buf_idx = slot->buf_idx; oq_enq(freeq, &tmp); *slot = s; slot->flags |= NS_BUF_CHANGED; ring->head = nm_ring_next(ring, ring->head); } } } /* push any new packets from the input port to the first group */ int batch = 0; for (i = rxport->nmd->first_rx_ring; i <= rxport->nmd->last_rx_ring; i++) { struct netmap_ring *rxring = NETMAP_RXRING(rxport->nmd->nifp, i); //D("prepare to scan rings"); int next_head = rxring->head; struct netmap_slot *next_slot = &rxring->slot[next_head]; const char *next_buf = NETMAP_BUF(rxring, next_slot->buf_idx); while (!nm_ring_empty(rxring)) { struct netmap_slot *rs = next_slot; struct group_des *g = &groups[0]; ++received_pkts; received_bytes += rs->len; // CHOOSE THE CORRECT OUTPUT PIPE rs->ptr = pkt_hdr_hash((const unsigned char *)next_buf, 4, 'B'); if (rs->ptr == 0) { non_ip++; // XXX ?? } // prefetch the buffer for the next round next_head = nm_ring_next(rxring, next_head); next_slot = &rxring->slot[next_head]; next_buf = NETMAP_BUF(rxring, next_slot->buf_idx); __builtin_prefetch(next_buf); // 'B' is just a hashing seed rs->buf_idx = forward_packet(g, rs); rs->flags |= NS_BUF_CHANGED; rxring->head = rxring->cur = next_head; batch++; if (unlikely(batch >= glob_arg.batch)) { ioctl(rxport->nmd->fd, NIOCRXSYNC, NULL); batch = 0; } ND(1, "Forwarded Packets: %"PRIu64" Dropped packets: %"PRIu64" Percent: %.2f", forwarded, dropped, ((float)dropped / (float)forwarded * 100)); } } send_stats: if (counters_buf.status == COUNTERS_FULL) continue; /* take a new snapshot of the counters */ gettimeofday(&counters_buf.ts, NULL); for (i = 0; i < npipes; i++) { struct my_ctrs *c = &counters_buf.ctrs[i]; *c = ports[i].ctr; /* * If there are overflow queues, copy the number of them for each * port to the ctrs.oq_n variable for each port. */ if (ports[i].oq != NULL) c->oq_n = ports[i].oq->n; } counters_buf.received_pkts = received_pkts; counters_buf.received_bytes = received_bytes; counters_buf.non_ip = non_ip; if (freeq != NULL) counters_buf.freeq_n = freeq->n; __sync_synchronize(); counters_buf.status = COUNTERS_FULL; } /* * If freeq exists, copy the number to the freeq_n member of the * message struct, otherwise set it to 0. */ if (freeq != NULL) { freeq_n = freeq->n; } else { freeq_n = 0; } pthread_join(stat_thread, NULL); printf("%"PRIu64" packets forwarded. %"PRIu64" packets dropped. 
Total %"PRIu64"\n", forwarded, dropped, forwarded + dropped); return 0; } Index: stable/12/tools/tools/netmap/nmreplay.c =================================================================== --- stable/12/tools/tools/netmap/nmreplay.c (revision 366497) +++ stable/12/tools/tools/netmap/nmreplay.c (revision 366498) @@ -1,1821 +1,1822 @@ /* * Copyright (C) 2016 Universita` di Pisa. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * This program implements NMREPLAY, a program to replay a pcap file * enforcing the output rate and possibly random losses and delay * distributions. * It is meant to be run from the command line and implemented with a main * control thread for monitoring, plus a thread to push packets out. * * The control thread parses command line arguments, prepares a * schedule for transmission in a memory buffer and then sits * in a loop where it periodically reads traffic statistics from * the other threads and prints them out on the console. * * The transmit buffer contains headers and packets. Each header * includes a timestamp that determines when the packet should be sent out. * A "consumer" thread cons() reads from the queue and transmits packets * on the output netmap port when their time has come. * * The program does CPU pinning and sets the scheduler and priority * for the "cons" threads. Externally one should do the * assignment of other threads (e.g. interrupt handlers) and * make sure that network interfaces are configured properly. * * --- Main functions of the program --- * within each function, q is used as a pointer to the queue holding * packets and parameters. * * pcap_prod() * * reads from the pcap file and prepares packets to transmit. * After reading a packet from the pcap file, the following information * are extracted which can be used to determine the schedule: * * q->cur_pkt points to the buffer containing the packet * q->cur_len packet length, excluding CRC * q->cur_caplen available packet length (may be shorter than cur_len) * q->cur_tt transmission time for the packet, computed from the trace. * * The following functions are then called in sequence: * * q->c_loss (set with the -L command line option) decides * whether the packet should be dropped before even queuing. * This is generally useful to emulate random loss. 
* The function is supposed to set q->c_drop = 1 if the * packet should be dropped, or leave it to 0 otherwise. * * q->c_bw (set with the -B command line option) is used to * enforce the transmit bandwidth. The function must store * in q->cur_tt the transmission time (in nanoseconds) of * the packet, which is typically proportional to the length * of the packet, i.e. q->cur_tt = q->cur_len / * Variants are possible, eg. to account for constant framing * bits as on the ethernet, or variable channel acquisition times, * etc. * This mechanism can also be used to simulate variable queueing * delay e.g. due to the presence of cross traffic. * * q->c_delay (set with the -D option) implements delay emulation. * The function should set q->cur_delay to the additional * delay the packet is subject to. The framework will take care of * computing the actual exit time of a packet so that there is no * reordering. */ // debugging macros #define NED(_fmt, ...) do {} while (0) #define ED(_fmt, ...) \ do { \ struct timeval _t0; \ gettimeofday(&_t0, NULL); \ fprintf(stderr, "%03d.%03d %-10.10s [%5d] \t" _fmt "\n", \ (int)(_t0.tv_sec % 1000), (int)_t0.tv_usec/1000, \ __FUNCTION__, __LINE__, ##__VA_ARGS__); \ } while (0) /* WWW is for warnings, EEE is for errors */ #define WWW(_fmt, ...) ED("--WWW-- " _fmt, ##__VA_ARGS__) #define EEE(_fmt, ...) ED("--EEE-- " _fmt, ##__VA_ARGS__) #define DDD(_fmt, ...) ED("--DDD-- " _fmt, ##__VA_ARGS__) #define _GNU_SOURCE // for CPU_SET() etc #include #define NETMAP_WITH_LIBS #include #include /* * * A packet in the queue is q_pkt plus the payload. * * For the packet descriptor we need the following: * * - position of next packet in the queue (can go backwards). * We can reduce to 32 bits if we consider alignments, * or we just store the length to be added to the current * value and assume 0 as a special index. * - actual packet length (16 bits may be ok) * - queue output time, in nanoseconds (64 bits) * - delay line output time, in nanoseconds * One of the two can be packed to a 32bit value * * A convenient coding uses 32 bytes per packet. */ struct q_pkt { uint64_t next; /* buffer index for next packet */ uint64_t pktlen; /* actual packet len */ uint64_t pt_qout; /* time of output from queue */ uint64_t pt_tx; /* transmit time */ }; /* * The header for a pcap file */ struct pcap_file_header { uint32_t magic; /*used to detect the file format itself and the byte ordering. The writing application writes 0xa1b2c3d4 with it's native byte ordering format into this field. The reading application will read either 0xa1b2c3d4 (identical) or 0xd4c3b2a1 (swapped). If the reading application reads the swapped 0xd4c3b2a1 value, it knows that all the following fields will have to be swapped too. For nanosecond-resolution files, the writing application writes 0xa1b23c4d, with the two nibbles of the two lower-order bytes swapped, and the reading application will read either 0xa1b23c4d (identical) or 0x4d3cb2a1 (swapped)*/ uint16_t version_major; uint16_t version_minor; /*the version number of this file format */ int32_t thiszone; /*the correction time in seconds between GMT (UTC) and the local timezone of the following packet header timestamps. Examples: If the timestamps are in GMT (UTC), thiszone is simply 0. If the timestamps are in Central European time (Amsterdam, Berlin, ...) 
which is GMT + 1:00, thiszone must be -3600*/ uint32_t stampacc; /*the accuracy of time stamps in the capture*/ uint32_t snaplen; /*the "snapshot length" for the capture (typically 65535 or even more, but might be limited by the user)*/ uint32_t network; /*link-layer header type, specifying the type of headers at the beginning of the packet (e.g. 1 for Ethernet); this can be various types such as 802.11, 802.11 with various radio information, PPP, Token Ring, FDDI, etc.*/ }; #if 0 /* from pcap.h */ struct pcap_file_header { bpf_u_int32 magic; u_short version_major; u_short version_minor; bpf_int32 thiszone; /* gmt to local correction */ bpf_u_int32 sigfigs; /* accuracy of timestamps */ bpf_u_int32 snaplen; /* max length saved portion of each pkt */ bpf_u_int32 linktype; /* data link type (LINKTYPE_*) */ }; struct pcap_pkthdr { struct timeval ts; /* time stamp */ bpf_u_int32 caplen; /* length of portion present */ bpf_u_int32 len; /* length this packet (off wire) */ }; #endif /* from pcap.h */ struct pcap_pkthdr { uint32_t ts_sec; /* seconds from epoch */ uint32_t ts_frac; /* microseconds or nanoseconds depending on sigfigs */ uint32_t caplen; /*the number of bytes of packet data actually captured and saved in the file. This value should never become larger than orig_len or the snaplen value of the global header*/ uint32_t len; /* wire length */ }; #define PKT_PAD (32) /* padding on packets */ static inline int pad(int x) { return ((x) + PKT_PAD - 1) & ~(PKT_PAD - 1) ; } /* * wrapper around the pcap file. * We mmap the file so it is easy to do multiple passes through it. */ struct nm_pcap_file { int fd; uint64_t filesize; const char *data; /* mmapped file */ uint64_t tot_pkt; uint64_t tot_bytes; uint64_t tot_bytes_rounded; /* need hdr + pad(len) */ uint32_t resolution; /* 1000 for us, 1 for ns */ int swap; /* need to swap fields ? */ uint64_t first_ts; uint64_t total_tx_time; /* * total_tx_time is computed as last_ts - first_ts, plus the * transmission time for the first packet which in turn is * computed according to the average bandwidth */ uint64_t file_len; const char *cur; /* running pointer */ const char *lim; /* data + file_len */ int err; }; static struct nm_pcap_file *readpcap(const char *fn); static void destroy_pcap(struct nm_pcap_file *file); #include #include #include #include #include #include /* memcpy */ #include #define NS_SCALE 1000000000UL /* nanoseconds in 1s */ static void destroy_pcap(struct nm_pcap_file *pf) { if (!pf) return; munmap((void *)(uintptr_t)pf->data, pf->filesize); close(pf->fd); bzero(pf, sizeof(*pf)); free(pf); return; } // convert a field of given size if swap is needed. static uint32_t cvt(const void *src, int size, char swap) { uint32_t ret = 0; if (size != 2 && size != 4) { EEE("Invalid size %d\n", size); exit(1); } memcpy(&ret, src, size); if (swap) { unsigned char tmp, *data = (unsigned char *)&ret; int i; for (i = 0; i < size / 2; i++) { tmp = data[i]; data[i] = data[size - (1 + i)]; data[size - (1 + i)] = tmp; } } return ret; } static uint32_t read_next_info(struct nm_pcap_file *pf, int size) { const char *end = pf->cur + size; uint32_t ret; if (end > pf->lim) { pf->err = 1; ret = 0; } else { ret = cvt(pf->cur, size, pf->swap); pf->cur = end; } return ret; } /* * mmap the file, make sure timestamps are sorted, and count * packets and sizes * Timestamps represent the receive time of the packets. * We need to compute also the 'first_ts' which refers to a hypotetical * packet right before the first one, see the code for details. 
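 * Worked example (made-up numbers): a trace spanning 1 ms between
 * first and last timestamp, with tot_bytes = 3000 and a first
 * packet of 1000 bytes, gives
 * first_pkt_time = 1 ms * 1000 / (3000 - 1000) = 0.5 ms,
 * so first_ts is backdated by 0.5 ms.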
*/ static struct nm_pcap_file * readpcap(const char *fn) { struct nm_pcap_file _f, *pf = &_f; uint64_t prev_ts, first_pkt_time; uint32_t magic, first_len = 0; bzero(pf, sizeof(*pf)); pf->fd = open(fn, O_RDONLY); if (pf->fd < 0) { EEE("cannot open file %s", fn); return NULL; } /* compute length */ pf->filesize = lseek(pf->fd, 0, SEEK_END); lseek(pf->fd, 0, SEEK_SET); ED("filesize is %lu", (u_long)(pf->filesize)); if (pf->filesize < sizeof(struct pcap_file_header)) { EEE("file too short %s", fn); close(pf->fd); return NULL; } pf->data = mmap(NULL, pf->filesize, PROT_READ, MAP_SHARED, pf->fd, 0); if (pf->data == MAP_FAILED) { EEE("cannot mmap file %s", fn); close(pf->fd); return NULL; } pf->cur = pf->data; pf->lim = pf->data + pf->filesize; pf->err = 0; pf->swap = 0; /* default, same endianness when read magic */ magic = read_next_info(pf, 4); ED("magic is 0x%x", magic); switch (magic) { case 0xa1b2c3d4: /* native, us resolution */ pf->swap = 0; pf->resolution = 1000; break; case 0xd4c3b2a1: /* swapped, us resolution */ pf->swap = 1; pf->resolution = 1000; break; case 0xa1b23c4d: /* native, ns resolution */ pf->swap = 0; pf->resolution = 1; /* nanoseconds */ break; case 0x4d3cb2a1: /* swapped, ns resolution */ pf->swap = 1; pf->resolution = 1; /* nanoseconds */ break; default: EEE("unknown magic 0x%x", magic); return NULL; } ED("swap %d res %d\n", pf->swap, pf->resolution); pf->cur = pf->data + sizeof(struct pcap_file_header); pf->lim = pf->data + pf->filesize; pf->err = 0; prev_ts = 0; while (pf->cur < pf->lim && pf->err == 0) { uint32_t base = pf->cur - pf->data; uint64_t cur_ts = read_next_info(pf, 4) * NS_SCALE + read_next_info(pf, 4) * pf->resolution; uint32_t caplen = read_next_info(pf, 4); uint32_t len = read_next_info(pf, 4); if (pf->err) { WWW("end of pcap file after %d packets\n", (int)pf->tot_pkt); break; } if (cur_ts < prev_ts) { WWW("reordered packet %d\n", (int)pf->tot_pkt); } prev_ts = cur_ts; (void)base; if (pf->tot_pkt == 0) { pf->first_ts = cur_ts; first_len = len; } pf->tot_pkt++; pf->tot_bytes += len; pf->tot_bytes_rounded += pad(len) + sizeof(struct q_pkt); pf->cur += caplen; } pf->total_tx_time = prev_ts - pf->first_ts; /* excluding first packet */ ED("tot_pkt %lu tot_bytes %lu tx_time %.6f s first_len %lu", (u_long)pf->tot_pkt, (u_long)pf->tot_bytes, 1e-9*pf->total_tx_time, (u_long)first_len); /* * We determine that based on the * average bandwidth of the trace, as follows * first_pkt_ts = p[0].len / avg_bw * In turn avg_bw = (total_len - p[0].len)/(p[n-1].ts - p[0].ts) * so * first_ts = p[0].ts - p[0].len * (p[n-1].ts - p[0].ts) / (total_len - p[0].len) */ if (pf->tot_bytes == first_len) { /* cannot estimate bandwidth, so force 1 Gbit */ first_pkt_time = first_len * 8; /* * 10^9 / bw */ } else { first_pkt_time = pf->total_tx_time * first_len / (pf->tot_bytes - first_len); } ED("first_pkt_time %.6f s", 1e-9*first_pkt_time); pf->total_tx_time += first_pkt_time; pf->first_ts -= first_pkt_time; /* all correct, allocate a record and copy */ pf = calloc(1, sizeof(*pf)); *pf = _f; /* reset pointer to start */ pf->cur = pf->data + sizeof(struct pcap_file_header); pf->err = 0; return pf; } enum my_pcap_mode { PM_NONE, PM_FAST, PM_FIXED, PM_REAL }; -int verbose = 0; +static int verbose = 0; static int do_abort = 0; #include #include #include #include #include // setpriority #ifdef __FreeBSD__ #include /* pthread w/ affinity */ #include /* cpu_set */ #endif /* __FreeBSD__ */ #ifdef linux #define cpuset_t cpu_set_t #endif #ifdef __APPLE__ #define cpuset_t uint64_t // XXX static 
inline void CPU_ZERO(cpuset_t *p) { *p = 0; } static inline void CPU_SET(uint32_t i, cpuset_t *p) { *p |= 1<< (i & 0x3f); } #define pthread_setaffinity_np(a, b, c) ((void)a, 0) #define sched_setscheduler(a, b, c) (1) /* error */ #define clock_gettime(a,b) \ do {struct timespec t0 = {0,0}; *(b) = t0; } while (0) #define _P64 unsigned long #endif #ifndef _P64 /* we use uint64_t widely, but printf gives trouble on different * platforms so we use _P64 as a cast */ #define _P64 uint64_t #endif /* print stuff */ struct _qs; /* forward */ /* * descriptor of a configuration entry. * Each handler has a parse function which takes ac/av[] and returns * true if successful. Any allocated space is stored into struct _cfg * * that is passed as argument. * arg and arg_len are included for convenience. */ struct _cfg { int (*parse)(struct _qs *, struct _cfg *, int ac, char *av[]); /* 0 ok, 1 on error */ int (*run)(struct _qs *, struct _cfg *arg); /* 0 Ok, 1 on error */ // int close(struct _qs *, void *arg); /* 0 Ok, 1 on error */ const char *optarg; /* command line argument. Initial value is the error message */ /* placeholders for common values */ void *arg; /* allocated memory if any */ int arg_len; /* size of *arg in case a realloc is needed */ uint64_t d[16]; /* static storage for simple cases */ double f[4]; /* static storage for simple cases */ }; /* * communication occurs through this data structure, with fields * cache-aligned according to who are the readers/writers. * The queue is an array of memory (buf) of size buflen (does not change). The producer uses 'tail' as an index in the queue to indicate the first empty location (ie. after the last byte of data), the consumer uses head to indicate the next byte to consume. For best performance we should align buffers and packets to multiples of cacheline, but this would explode memory too much. Worst case memory explosion is with 65 byte packets. Memory usage as shown below: qpkt-pad size 32-16 32-32 32-64 64-64 64 96 96 96 128 65 112 128 160 192 An empty queue has head == tail, a full queue will have free space below a threshold. In our case the queue is large enough and we are non blocking so we can simply drop traffic when the queue approaches a full state. To simulate bandwidth limitations efficiently, the producer has a second pointer, prod_tail_1, used to check for expired packets. This is done lazily. */ /* * When sizing the buffer, we must assume some value for the bandwidth. 
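 * (As a rule of thumb, the queue must hold roughly
 * bandwidth * max_delay worth of in-flight data, plus the
 * per-packet q_pkt headers and padding.)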
* INFINITE_BW is supposed to be faster than what we support */ #define INFINITE_BW (200ULL*1000000*1000) #define MY_CACHELINE (128ULL) #define MAX_PKT (9200) /* max packet size */ #define ALIGN_CACHE __attribute__ ((aligned (MY_CACHELINE))) struct _qs { /* shared queue */ uint64_t t0; /* start of times */ uint64_t buflen; /* queue length */ char *buf; /* handlers for various options */ struct _cfg c_delay; struct _cfg c_bw; struct _cfg c_loss; /* producer's fields */ uint64_t tx ALIGN_CACHE; /* tx counter */ uint64_t prod_tail_1; /* head of queue */ uint64_t prod_head; /* cached copy */ uint64_t prod_tail; /* cached copy */ uint64_t prod_drop; /* drop packet count */ uint64_t prod_max_gap; /* rx round duration */ struct nm_pcap_file *pcap; /* the pcap struct */ /* parameters for reading from the netmap port */ struct nm_desc *src_port; /* netmap descriptor */ const char * prod_ifname; /* interface name or pcap file */ struct netmap_ring *rxring; /* current ring being handled */ uint32_t si; /* ring index */ int burst; uint32_t rx_qmax; /* stats on max queued */ uint64_t qt_qout; /* queue exit time for last packet */ /* * when doing shaping, the software computes and stores here * the time when the most recently queued packet will exit from * the queue. */ uint64_t qt_tx; /* delay line exit time for last packet */ /* * The software computes the time at which the most recently * queued packet exits from the queue. * To avoid reordering, the next packet should exit at least * at qt_tx + cur_tt */ /* producer's fields controlling the queueing */ const char * cur_pkt; /* current packet being analysed */ uint32_t cur_len; /* length of current packet */ uint32_t cur_caplen; /* captured length of current packet */ int cur_drop; /* 1 if current packet should be dropped. */ /* * cur_drop can be set as a result of the loss emulation, * and may need to use the packet size, current time, etc. */ uint64_t cur_tt; /* transmission time (ns) for current packet */ /* * The transmission time is how much link time the packet will consume. * should be set by the function that does the bandwidth emulation, * but could also be the result of a function that emulates the * presence of competing traffic, MAC protocols etc. * cur_tt is 0 for links with infinite bandwidth. */ uint64_t cur_delay; /* delay (ns) for current packet from c_delay.run() */ /* * this should be set by the function that computes the extra delay * applied to the packet. * The code makes sure that there is no reordering and possibly * bumps the output time as needed. */ /* consumer's fields */ const char * cons_ifname; uint64_t rx ALIGN_CACHE; /* rx counter */ uint64_t cons_head; /* cached copy */ uint64_t cons_tail; /* cached copy */ uint64_t cons_now; /* most recent producer timestamp */ uint64_t rx_wait; /* stats */ /* shared fields */ volatile uint64_t _tail ALIGN_CACHE ; /* producer writes here */ volatile uint64_t _head ALIGN_CACHE ; /* consumer reads from here */ }; struct pipe_args { int wait_link; pthread_t cons_tid; /* main thread */ pthread_t prod_tid; /* producer thread */ /* Affinity: */ int cons_core; /* core for cons() */ int prod_core; /* core for prod() */ struct nm_desc *pa; /* netmap descriptor */ struct nm_desc *pb; struct _qs q; }; #define NS_IN_S (1000000000ULL) // nanoseconds #define TIME_UNITS NS_IN_S /* set the thread affinity. 
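 * Pins the calling thread to core i (a no-op when i == -1), raises
 * its priority with setpriority(2) and requests the SCHED_RR
 * real-time class; all failures are reported as warnings only.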
 */
static int
setaffinity(int i)
{
	cpuset_t cpumask;
	struct sched_param p;

	if (i == -1)
		return 0;

	/* Set thread affinity. */
	CPU_ZERO(&cpumask);
	CPU_SET(i, &cpumask);
	if (pthread_setaffinity_np(pthread_self(), sizeof(cpuset_t), &cpumask) != 0) {
		WWW("Unable to set affinity: %s", strerror(errno));
	}
	if (setpriority(PRIO_PROCESS, 0, -10)) {; // XXX not meaningful
		WWW("Unable to set priority: %s", strerror(errno));
	}
	bzero(&p, sizeof(p));
	p.sched_priority = 10; // 99 on linux ?
	// use SCHED_RR or SCHED_FIFO
	if (sched_setscheduler(0, SCHED_RR, &p)) {
		WWW("Unable to set scheduler: %s", strerror(errno));
	}
	return 0;
}

/*
 * set the timestamp from the clock, subtract t0
 */
static inline void
set_tns_now(uint64_t *now, uint64_t t0)
{
	struct timespec t;

	clock_gettime(CLOCK_REALTIME, &t); // XXX precise on FreeBSD ?
	*now = (uint64_t)(t.tv_nsec + NS_IN_S * t.tv_sec);
	*now -= t0;
}

/* compare two timestamps */
static inline int64_t
ts_cmp(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b);
}

/* create a packet descriptor */
static inline struct q_pkt *
pkt_at(struct _qs *q, uint64_t ofs)
{
	return (struct q_pkt *)(q->buf + ofs);
}

/*
 * we have already checked for room and prepared p->next
 */
static inline int
enq(struct _qs *q)
{
	struct q_pkt *p = pkt_at(q, q->prod_tail);

	/* hopefully prefetch has been done ahead */
	nm_pkt_copy(q->cur_pkt, (char *)(p+1), q->cur_caplen);
	p->pktlen = q->cur_len;
	p->pt_qout = q->qt_qout;
	p->pt_tx = q->qt_tx;
	p->next = q->prod_tail + pad(q->cur_len) + sizeof(struct q_pkt);
	ND("enqueue len %d at %d new tail %ld qout %.6f tx %.6f",
		q->cur_len, (int)q->prod_tail, p->next,
		1e-9*p->pt_qout, 1e-9*p->pt_tx);
	q->prod_tail = p->next;
	q->tx++;
	return 0;
}

/*
 * simple handler for parameters not supplied
 */
static int
null_run_fn(struct _qs *q, struct _cfg *cfg)
{
	(void)q;
	(void)cfg;
	return 0;
}

/*
 * put packet data into the buffer.
 * We read from the mmapped pcap file, construct header, copy
 * the captured length of the packet and pad with zeroes.
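 * Each queue record is a struct q_pkt immediately followed by the
 * payload, rounded up by pad() to a multiple of PKT_PAD (32) bytes,
 * so p->next always points at an aligned record boundary.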
/*
 * put packet data into the buffer.
 * We read from the mmapped pcap file, construct the header, copy
 * the captured length of the packet and pad with zeroes.
 */
static void *
pcap_prod(void *_pa)
{
    struct pipe_args *pa = _pa;
    struct _qs *q = &pa->q;
    struct nm_pcap_file *pf = q->pcap;	/* already opened by readpcap */
    uint32_t loops, i, tot_pkts;	/* data plus the loop record */
    uint64_t need;
    uint64_t t_tx, tt, last_ts;		/* last timestamp from trace */

    /*
     * For speed we make sure the trace has at least some 10000 packets
     * (see the computation of 'loops' below), so we may need to loop
     * the trace more than once (for short traces)
     */
    loops = (1 + 10000 / pf->tot_pkt);
    tot_pkts = loops * pf->tot_pkt;
    need = loops * pf->tot_bytes_rounded + sizeof(struct q_pkt);
    q->buf = calloc(1, need);
    if (q->buf == NULL) {
	D("alloc %lld bytes for queue failed, exiting", (long long)need);
	goto fail;
    }
    q->prod_head = q->prod_tail = 0;
    q->buflen = need;

    pf->cur = pf->data + sizeof(struct pcap_file_header);
    pf->err = 0;

    ED("--- start create %lu packets at tail %d",
	(u_long)tot_pkts, (int)q->prod_tail);
    last_ts = pf->first_ts; /* beginning of the trace */

    q->qt_qout = 0; /* first packet out of the queue */

    for (loops = 0, i = 0; i < tot_pkts && !do_abort; i++) {
	const char *next_pkt; /* in the pcap buffer */
	uint64_t cur_ts;

	/* read values from the pcap buffer */
	cur_ts = read_next_info(pf, 4) * NS_SCALE +
		read_next_info(pf, 4) * pf->resolution;
	q->cur_caplen = read_next_info(pf, 4);
	q->cur_len = read_next_info(pf, 4);
	next_pkt = pf->cur + q->cur_caplen;

	/* prepare fields in q for the generator */
	q->cur_pkt = pf->cur;

	/* initial estimate of tx time */
	q->cur_tt = cur_ts - last_ts;
	// -pf->first_ts + loops * pf->total_tx_time - last_ts;

	if ((i % pf->tot_pkt) == 0)
	    ED("insert %5d len %lu cur_tt %.6f",
		i, (u_long)q->cur_len, 1e-9*q->cur_tt);

	/* prepare for next iteration */
	pf->cur = next_pkt;
	last_ts = cur_ts;
	if (next_pkt == pf->lim) {	//last pkt
	    pf->cur = pf->data + sizeof(struct pcap_file_header);
	    last_ts = pf->first_ts; /* beginning of the trace */
	    loops++;
	}

	q->c_loss.run(q, &q->c_loss);
	if (q->cur_drop)
	    continue;
	q->c_bw.run(q, &q->c_bw);
	tt = q->cur_tt;
	q->qt_qout += tt;
#if 0
	if (drop_after(q))
	    continue;
#endif
	q->c_delay.run(q, &q->c_delay); /* compute delay */
	t_tx = q->qt_qout + q->cur_delay;
	ND(5, "tt %ld qout %ld tx %ld qt_tx %ld", tt, q->qt_qout, t_tx, q->qt_tx);
	/* ensure no reordering and spacing by transmission time */
	q->qt_tx = (t_tx >= q->qt_tx + tt) ? t_tx : q->qt_tx + tt;
	enq(q);

	q->tx++;
	ND("ins %d q->prod_tail = %lu", (int)insert, (unsigned long)q->prod_tail);
    }
    /* loop marker ? */
    ED("done q->prod_tail:%d", (int)q->prod_tail);
    q->_tail = q->prod_tail; /* publish */

    return NULL;
fail:
    if (q->buf != NULL) {
	free(q->buf);
    }
    nm_close(pa->pb);
    return (NULL);
}
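The scheduling rule above, q->qt_tx = (t_tx >= q->qt_tx + tt) ? t_tx : q->qt_tx + tt, is what prevents reordering: a packet may never exit the delay line earlier than the previous packet's exit time plus its own transmission time. A small worked example with invented values:

#if 0 /* illustrative example, not part of nmreplay */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t qt_tx = 0;			/* exit time of previous pkt */
	uint64_t qout[]  = { 10000, 12000 };	/* queue exit times, ns */
	uint64_t delay[] = { 100000, 50000 };	/* per-packet delays, ns */
	uint64_t tt = 2000;			/* transmission time, ns */

	for (int i = 0; i < 2; i++) {
		uint64_t t_tx = qout[i] + delay[i];

		/* same rule as in pcap_prod() above */
		qt_tx = (t_tx >= qt_tx + tt) ? t_tx : qt_tx + tt;
		printf("pkt %d leaves at %llu ns\n", i,
		    (unsigned long long)qt_tx);
	}
	/* prints 110000 then 112000: the short-delay packet is held back
	 * to 110000 + 2000 ns instead of leaving at 62000 ns */
	return 0;
}
#endif /* illustrative example */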
/*
 * the consumer reads from the queue using head,
 * and advances it every now and then.
 */
static void *
cons(void *_pa)
{
    struct pipe_args *pa = _pa;
    struct _qs *q = &pa->q;
    int pending = 0;
    uint64_t last_ts = 0;

    /* read the start of times in q->t0 */
    set_tns_now(&q->t0, 0);
    /* set the time (cons_now) to clock - q->t0 */
    set_tns_now(&q->cons_now, q->t0);
    q->cons_head = q->_head;
    q->cons_tail = q->_tail;
    while (!do_abort) { /* consumer, infinite */
	struct q_pkt *p = pkt_at(q, q->cons_head);

	__builtin_prefetch (q->buf + p->next);

	if (q->cons_head == q->cons_tail) { //reset record
	    ND("Transmission restarted");
	    /*
	     * add to q->t0 the time for the last packet
	     */
	    q->t0 += last_ts;
	    set_tns_now(&q->cons_now, q->t0);
	    q->cons_head = 0; //restart from beginning of the queue
	    continue;
	}
	last_ts = p->pt_tx;
	if (ts_cmp(p->pt_tx, q->cons_now) > 0) { // packet not ready
	    q->rx_wait++;
	    /* the ioctl should be conditional */
	    ioctl(pa->pb->fd, NIOCTXSYNC, 0); // XXX just in case
	    pending = 0;
	    usleep(20);
	    set_tns_now(&q->cons_now, q->t0);
	    continue;
	}
	/* XXX copy is inefficient but simple */
	if (nm_inject(pa->pb, (char *)(p + 1), p->pktlen) == 0) {
	    RD(1, "inject failed len %d now %ld tx %ld h %ld t %ld next %ld",
		(int)p->pktlen, (u_long)q->cons_now, (u_long)p->pt_tx,
		(u_long)q->_head, (u_long)q->_tail, (u_long)p->next);
	    ioctl(pa->pb->fd, NIOCTXSYNC, 0);
	    pending = 0;
	    continue;
	}
	pending++;
	if (pending > q->burst) {
	    ioctl(pa->pb->fd, NIOCTXSYNC, 0);
	    pending = 0;
	}

	q->cons_head = p->next; /* drain packets from the queue */
	q->rx++;
    }
    D("exiting on abort");
    return NULL;
}

/*
 * In case of a pcap file as input, the program acts in two phases:
 * it first fills the queue, and then starts cons()
 */
static void *
nmreplay_main(void *_a)
{
    struct pipe_args *a = _a;
    struct _qs *q = &a->q;
    const char *cap_fname = q->prod_ifname;

    setaffinity(a->cons_core);
    set_tns_now(&q->t0, 0); /* starting reference */
    if (cap_fname == NULL) {
	goto fail;
    }
    q->pcap = readpcap(cap_fname);
    if (q->pcap == NULL) {
	EEE("unable to read file %s", cap_fname);
	goto fail;
    }
    pcap_prod((void*)a);
    destroy_pcap(q->pcap);
    q->pcap = NULL;
    a->pb = nm_open(q->cons_ifname, NULL, 0, NULL);
    if (a->pb == NULL) {
	EEE("cannot open netmap on %s", q->cons_ifname);
	do_abort = 1; // XXX any better way ?
	return NULL;
    }
    /* continue as cons() */
    WWW("prepare to send packets");
    usleep(1000);
    cons((void*)a);
    EEE("exiting on abort");
fail:
    if (q->pcap != NULL) {
	destroy_pcap(q->pcap);
    }
    do_abort = 1;
    return NULL;
}

static void
sigint_h(int sig)
{
	(void)sig;	/* UNUSED */
	do_abort = 1;
	signal(SIGINT, SIG_DFL);
}

static void
usage(void)
{
	fprintf(stderr,
	    "usage: nmreplay [-v] [-D delay] [-B {[constant,]bps|ether,bps|real,speedup}] [-L loss]\n"
	    "\t[-b burst] -f pcap-file -i <netmap:ifname>\n");
	exit(1);
}

/*---- configuration handling ---- */
/*
 * support routine: split argument, returns ac and *av.
 * av contains two extra entries, a NULL and a pointer
 * to the entire string.
 */
static char **
split_arg(const char *src, int *_ac)
{
-    char *my = NULL, **av = NULL, *seps = " \t\r\n,";
+    char *my = NULL, **av = NULL;
+    const char *seps = " \t\r\n,";
    int l, i, ac; /* number of entries */

    if (!src)
	return NULL;
    l = strlen(src);
    /* in the first pass we count fields, in the second pass
     * we allocate the av[] array and a copy of the string
     * and fill av[]. av[ac] = NULL, av[ac+1]
     */
    for (;;) {
	i = ac = 0;
	ND("start pass %d: <%s>", av ? 1 : 0, my);
	while (i < l) {
	    /* trim leading separators */
	    while (i < l && strchr(seps, src[i]))
		i++;
	    if (i >= l)
		break;
	    ND(" pass %d arg %d: <%s>", av ? 1 : 0, ac, src+i);
	    if (av) /* in the second pass, set the result */
		av[ac] = my+i;
	    ac++;
	    /* skip the body of the string */
	    while (i < l && !strchr(seps, src[i]))
		i++;
	    if (av)
		my[i] = '\0'; /* terminate the entry */
	}
	if (av) /* second pass done */
	    break;
	/* allocate the av[] array and the string copy in one block */
	av = calloc(1, (l+1) + (ac + 2)*sizeof(char *));
	my = (char *)&av[ac + 2];
	strcpy(my, src);
    }
    for (i = 0; i < ac; i++) {
	ND("av[%d]: <%s>", i, av[i]);
    }
    av[i++] = NULL;
    av[i++] = my;
    *_ac = ac;
    return av;
}
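For example, split_arg("exp,10ms,50u", &ac) — the form produced by an option like -D exp,10ms,50u — yields ac == 3 with av[0]="exp", av[1]="10ms", av[2]="50u", av[3] == NULL, and av[4] pointing at the private copy of the string. Since the vector and the copy live in one allocation, a single free(av) releases everything (as cmd_apply() below does on failure). A sketch:

#if 0 /* illustrative example, not part of nmreplay */
	int ac;
	char **av = split_arg("exp,10ms,50u", &ac);

	if (av != NULL) {
		/* ac == 3: av[0]="exp", av[1]="10ms", av[2]="50u" */
		for (int i = 0; i < ac; i++)
			printf("av[%d] = %s\n", i, av[i]);
		free(av);	/* one allocation frees vector and copy */
	}
#endif /* illustrative example */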
/*
 * apply a command against a set of functions,
 * install a handler in *dst
 */
static int
cmd_apply(const struct _cfg *a, const char *arg, struct _qs *q, struct _cfg *dst)
{
	int ac = 0;
	char **av;
	int i;

	if (arg == NULL || *arg == '\0')
		return 1; /* no argument may be ok */
	if (a == NULL || dst == NULL) {
		ED("program error - invalid arguments");
		exit(1);
	}
	av = split_arg(arg, &ac);
	if (av == NULL)
		return 1; /* error */
	for (i = 0; a[i].parse; i++) {
		struct _cfg x = a[i];
		const char *errmsg = x.optarg;
		int ret;

		x.arg = NULL;
		x.arg_len = 0;
		bzero(&x.d, sizeof(x.d));
		ND("apply %s to %s", av[0], errmsg);
		ret = x.parse(q, &x, ac, av);
		if (ret == 2) /* not recognised */
			continue;
		if (ret == 1) {
			ED("invalid arguments: need '%s' have '%s'",
				errmsg, arg);
			break;
		}
		x.optarg = arg;
		*dst = x;
		return 0;
	}
	ED("arguments %s not recognised", arg);
	free(av);
	return 1;
}

static struct _cfg delay_cfg[];
static struct _cfg bw_cfg[];
static struct _cfg loss_cfg[];

static uint64_t parse_bw(const char *arg);

/*
 * nmreplay [options]
 * (the option arrays below are sized N_OPTS; with N_OPTS == 1 a
 * single set of -B/-D/-L arguments is accepted)
 */
static void
add_to(const char ** v, int l, const char *arg, const char *msg)
{
	for (; l > 0 && *v != NULL ; l--, v++);
	if (l == 0) {
		ED("%s %s", msg, arg);
		exit(1);
	}
	*v = arg;
}

int
main(int argc, char **argv)
{
	int ch, i, err=0;

#define	N_OPTS	1
	struct pipe_args bp[N_OPTS];
	const char *d[N_OPTS], *b[N_OPTS], *l[N_OPTS], *q[N_OPTS],
		*ifname[N_OPTS], *m[N_OPTS];
	const char *pcap_file[N_OPTS];
	int cores[4] = { 2, 8, 4, 10 }; /* default values */

	bzero(&bp, sizeof(bp));	/* all data initially go here */
	bzero(d, sizeof(d));
	bzero(b, sizeof(b));
	bzero(l, sizeof(l));
	bzero(q, sizeof(q));
	bzero(m, sizeof(m));
	bzero(ifname, sizeof(ifname));
	bzero(pcap_file, sizeof(pcap_file));

	/* set default values */
	for (i = 0; i < N_OPTS; i++) {
-		struct _qs *q = &bp[i].q;
+		struct _qs *qs = &bp[i].q;

-		q->burst = 128;
-		q->c_delay.optarg = "0";
-		q->c_delay.run = null_run_fn;
-		q->c_loss.optarg = "0";
-		q->c_loss.run = null_run_fn;
-		q->c_bw.optarg = "0";
-		q->c_bw.run = null_run_fn;
+		qs->burst = 128;
+		qs->c_delay.optarg = "0";
+		qs->c_delay.run = null_run_fn;
+		qs->c_loss.optarg = "0";
+		qs->c_loss.run = null_run_fn;
+		qs->c_bw.optarg = "0";
+		qs->c_bw.run = null_run_fn;
	}

	// Options:
	// B	bandwidth in bps
	// D	delay in seconds
	// L	loss probability
	// f	pcap file
	// i	interface name
	// w	wait link
	// b	batch size
	// v	verbose
	// C	cpu placement

	while ( (ch = getopt(argc, argv, "B:C:D:L:b:f:i:vw:")) != -1) {
		switch (ch) {
		default:
			D("bad option %c %s", ch, optarg);
			usage();
			break;

		case 'C': /* CPU placement, up to 4 arguments */
		    {
			int ac = 0;
			char **av = split_arg(optarg, &ac);
			if (ac == 1) { /* sequential after the first */
				cores[0] = atoi(av[0]);
				cores[1] = cores[0] + 1;
				cores[2] = cores[1] + 1;
				cores[3] = cores[2] + 1;
			} else if (ac == 2) { /* two sequential pairs */
				cores[0] = atoi(av[0]);
				cores[1] = cores[0] + 1;
				cores[2] = atoi(av[1]);
				cores[3] = cores[2] + 1;
			} else if (ac == 4) { /* four values */
				cores[0] = atoi(av[0]);
				cores[1] = atoi(av[1]);
				cores[2] = atoi(av[2]);
				cores[3] = atoi(av[3]);
			} else {
				ED(" -C accepts 1, 2 or 4 comma separated arguments");
				usage();
			}
			if (av)
				free(av);
		    }
			break;

		case 'B': /* bandwidth in bps */
			add_to(b, N_OPTS, optarg, "-B too many times");
			break;
		case 'D': /* delay in seconds (float) */
			add_to(d, N_OPTS, optarg,
"-D too many times"); break; case 'L': /* loss probability */ add_to(l, N_OPTS, optarg, "-L too many times"); break; case 'b': /* burst */ bp[0].q.burst = atoi(optarg); break; case 'f': /* pcap_file */ add_to(pcap_file, N_OPTS, optarg, "-f too many times"); break; case 'i': /* interface */ add_to(ifname, N_OPTS, optarg, "-i too many times"); break; case 'v': verbose++; break; case 'w': bp[0].wait_link = atoi(optarg); break; } } argc -= optind; argv += optind; /* * consistency checks for common arguments * if pcap file has been provided we need just one interface, two otherwise */ if (!pcap_file[0]) { ED("missing pcap file"); usage(); } if (!ifname[0]) { ED("missing interface"); usage(); } if (bp[0].q.burst < 1 || bp[0].q.burst > 8192) { WWW("invalid burst %d, set to 1024", bp[0].q.burst); bp[0].q.burst = 1024; // XXX 128 is probably better } if (bp[0].wait_link > 100) { ED("invalid wait_link %d, set to 4", bp[0].wait_link); bp[0].wait_link = 4; } bp[0].q.prod_ifname = pcap_file[0]; bp[0].q.cons_ifname = ifname[0]; /* assign cores. prod and cons work better if on the same HT */ bp[0].cons_core = cores[0]; bp[0].prod_core = cores[1]; ED("running on cores %d %d %d %d", cores[0], cores[1], cores[2], cores[3]); /* apply commands */ for (i = 0; i < N_OPTS; i++) { /* once per queue */ - struct _qs *q = &bp[i].q; - err += cmd_apply(delay_cfg, d[i], q, &q->c_delay); - err += cmd_apply(bw_cfg, b[i], q, &q->c_bw); - err += cmd_apply(loss_cfg, l[i], q, &q->c_loss); + struct _qs *qs = &bp[i].q; + err += cmd_apply(delay_cfg, d[i], qs, &qs->c_delay); + err += cmd_apply(bw_cfg, b[i], qs, &qs->c_bw); + err += cmd_apply(loss_cfg, l[i], qs, &qs->c_loss); } pthread_create(&bp[0].cons_tid, NULL, nmreplay_main, (void*)&bp[0]); signal(SIGINT, sigint_h); sleep(1); while (!do_abort) { struct _qs olda = bp[0].q; struct _qs *q0 = &bp[0].q; sleep(1); ED("%lld -> %lld maxq %d round %lld", (long long)(q0->rx - olda.rx), (long long)(q0->tx - olda.tx), q0->rx_qmax, (long long)q0->prod_max_gap ); ED("plr nominal %le actual %le", (double)(q0->c_loss.d[0])/(1<<24), q0->c_loss.d[1] == 0 ? 0 : (double)(q0->c_loss.d[2])/q0->c_loss.d[1]); bp[0].q.rx_qmax = (bp[0].q.rx_qmax * 7)/8; // ewma bp[0].q.prod_max_gap = (bp[0].q.prod_max_gap * 7)/8; // ewma } D("exiting on abort"); sleep(1); return (0); } /* conversion factor for numbers. * Each entry has a set of characters and conversion factor, * the first entry should have an empty string and default factor, * the final entry has s = NULL. 
*/
struct _sm {	/* string and multiplier */
-	char *s;
+	const char *s;
	double m;
};

/*
 * parse a generic value
 */
static double
parse_gen(const char *arg, const struct _sm *conv, int *err)
{
	double d;
	char *ep;
	int dummy;

	if (err == NULL)
		err = &dummy;
	*err = 0;
	if (arg == NULL)
		goto error;
	d = strtod(arg, &ep);
	if (ep == arg) { /* no value */
		ED("bad argument %s", arg);
		goto error;
	}
	/* special case, no conversion */
	if (conv == NULL && *ep == '\0')
		goto done;
	ND("checking %s [%s]", arg, ep);
	for (;conv->s; conv++) {
		if (strchr(conv->s, *ep))
			goto done;
	}
error:
	*err = 1;	/* unrecognised */
	return 0;

done:
	if (conv) {
		ND("scale is %s %lf", conv->s, conv->m);
		d *= conv->m; /* apply default conversion */
	}
	ND("returning %lf", d);
	return d;
}

#define U_PARSE_ERR ~(0ULL)

/* returns a value in nanoseconds */
static uint64_t
parse_time(const char *arg)
{
    struct _sm a[] = {
	{"", 1000000000 /* seconds */},
	{"n", 1 /* nanoseconds */}, {"u", 1000 /* microseconds */},
	{"m", 1000000 /* milliseconds */}, {"s", 1000000000 /* seconds */},
	{NULL, 0 /* seconds */}
    };
    int err;
    uint64_t ret = (uint64_t)parse_gen(arg, a, &err);
    return err ? U_PARSE_ERR : ret;
}

/*
 * parse a bandwidth, returns value in bps or U_PARSE_ERR if error.
 */
static uint64_t
parse_bw(const char *arg)
{
    struct _sm a[] = {
	{"", 1}, {"kK", 1000}, {"mM", 1000000}, {"gG", 1000000000},
	{NULL, 0}
    };
    int err;
    uint64_t ret = (uint64_t)parse_gen(arg, a, &err);
    return err ? U_PARSE_ERR : ret;
}

/*
 * For some functions we need random bits.
 * This is a wrapper to whatever function you want that returns
 * 24 useful random bits.
 */
#include <math.h> /* log, exp etc. */
static inline uint64_t
my_random24(void)	/* 24 useful bits */
{
	return random() & ((1<<24) - 1);
}

/*-------------- user-configuration -----------------*/
#if 0 /* start of comment block */

Here we place the functions to implement the various features
of the system. For each feature one should define a struct _cfg
(see at the beginning for the definition) that refers to a *_parse()
function to extract values from the command line, and a *_run()
function that is invoked on each packet to implement the desired feature.

Examples of the two functions are below. In general:

- the *_parse() function takes argc/argv[], matches the function
  name in argv[0], extracts the operating parameters, allocates memory
  if needed, and stores them in the struct _cfg.
  The return value is 2 if argv[0] is not recognised, 1 if there is an
  error in the arguments, 0 if all ok.

  On the command line, argv[] is a single, comma separated argument
  that follows the specific option, e.g. -D constant,20ms

  struct _cfg has some preallocated space (e.g. an array of uint64_t)
  so simple functions can use that without having to allocate memory.

- the *_run() function takes struct _qs *q and struct _cfg *cfg as
  arguments. *q contains all the information that may possibly be
  needed, including that on the packet currently under processing.
  The basic values are the following:

	char *	 cur_pkt	points to the current packet (linear buffer)
	uint32_t cur_len;	length of the current packet
		the functions are not supposed to modify these values

	int	 cur_drop;	true if current packet must be dropped.
		Must be set to non-zero by the loss emulation function

	uint64_t cur_delay;	delay in nanoseconds for the current packet
		Must be set by the delay emulation function

  More sophisticated functions may need to access other fields in *q,
  see the structure description for that.

When implementing a new function for a feature (e.g. for delay,
bandwidth, loss...)
the struct _cfg should be added to the array that
contains all possible options.

		--- Specific notes ---

DELAY emulation		-D option_arguments

    If the option is not supplied, the system applies 0 extra delay.

    The resolution for times is 1ns, the precision is load dependent and
    generally in the order of 20-50us.
    Times are in nanoseconds, and can be followed by a character specifying
    a different unit e.g.

	n	nanoseconds
	u	microseconds
	m	milliseconds
	s	seconds

    Currently implemented options:

	constant,t		constant delay equal to t

	uniform,tmin,tmax	uniform delay between tmin and tmax

	exp,tavg,tmin		exponential distribution with average tavg
				and minimum tmin (corresponds to an
				exponential distribution with argument
				1/(tavg-tmin) )

LOSS emulation		-L option_arguments

    Loss is expressed as a packet or bit error rate, an absolute
    number between 0 and 1 (typically small).

    Currently implemented options:

	plr,p			uniform packet loss rate p, independent
				of packet size

	burst,p,lmin,lmax	burst loss with burst probability p and
				burst length uniformly distributed between
				lmin and lmax (documented here but not
				wired into loss_cfg[] below)

	ber,p			uniformly distributed bit error rate p,
				so the actual loss probability depends on
				packet size

BANDWIDTH emulation	-B option_arguments

    Bandwidths are expressed in bits per second, and can be followed by a
    character specifying a different unit e.g.

	b/B	bits per second
	k/K	kbits/s (10^3 bits/s)
	m/M	mbits/s (10^6 bits/s)
	g/G	gbits/s (10^9 bits/s)

    Currently implemented options:

	const,b		constant bw, excluding mac framing

	ether,b		constant bw, including ethernet framing
			(20 bytes framing + 4 bytes crc)

	real,[scale]	use real time, optionally with a scaling factor

#endif /* end of comment block */
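Following the recipe above, a complete feature is small. The sketch below is a hypothetical "step" delay (not part of nmreplay) that applies d1 to the first n packets and d2 afterwards; it assumes the scratch array d[] in struct _cfg has at least four slots (d[3] is used as a packet counter), and would be enabled by one extra line in delay_cfg[] further down.

#if 0 /* illustrative example, not part of nmreplay */
static int
step_delay_parse(struct _qs *q, struct _cfg *dst, int ac, char *av[])
{
	(void)q;
	if (strcmp(av[0], "step") != 0)
		return 2;			/* not recognised */
	if (ac != 4)
		return 1;			/* error */
	dst->d[0] = parse_time(av[1]);		/* d1 */
	dst->d[1] = parse_time(av[2]);		/* d2 */
	dst->d[2] = strtoul(av[3], NULL, 0);	/* n */
	if (dst->d[0] == U_PARSE_ERR || dst->d[1] == U_PARSE_ERR)
		return 1;			/* error */
	return 0;				/* success */
}

static int
step_delay_run(struct _qs *q, struct _cfg *arg)
{
	/* d[3] counts packets seen so far */
	q->cur_delay = (arg->d[3]++ < arg->d[2]) ? arg->d[0] : arg->d[1];
	return 0;
}

/* in delay_cfg[]:
 *	{ step_delay_parse, step_delay_run, "step,d1,d2,n", TLEM_CFG_END },
 */
#endif /* illustrative example */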
/*
 * Configuration options for delay
 */

/* constant delay, also accepts just a number */
static int
const_delay_parse(struct _qs *q, struct _cfg *dst, int ac, char *av[])
{
	uint64_t delay;

	(void)q;
	if (strncmp(av[0], "const", 5) != 0 && ac > 1)
		return 2; /* unrecognised */
	if (ac > 2)
		return 1; /* error */
	delay = parse_time(av[ac - 1]);
	if (delay == U_PARSE_ERR)
		return 1; /* error */
	dst->d[0] = delay;
	return 0;	/* success */
}

/* runtime function, store the delay into q->cur_delay */
static int
const_delay_run(struct _qs *q, struct _cfg *arg)
{
	q->cur_delay = arg->d[0]; /* the delay */
	return 0;
}

static int
uniform_delay_parse(struct _qs *q, struct _cfg *dst, int ac, char *av[])
{
	uint64_t dmin, dmax;

	(void)q;
	if (strcmp(av[0], "uniform") != 0)
		return 2; /* not recognised */
	if (ac != 3)
		return 1; /* error */
	dmin = parse_time(av[1]);
	dmax = parse_time(av[2]);
	if (dmin == U_PARSE_ERR || dmax == U_PARSE_ERR || dmin > dmax)
		return 1;
	D("dmin %lld dmax %lld", (long long)dmin, (long long)dmax);
	dst->d[0] = dmin;
	dst->d[1] = dmax;
	dst->d[2] = dmax - dmin;
	return 0;
}

static int
uniform_delay_run(struct _qs *q, struct _cfg *arg)
{
	uint64_t x = my_random24();
	q->cur_delay = arg->d[0] + ((arg->d[2] * x) >> 24);
#if 0 /* COMPUTE_STATS */
#endif /* COMPUTE_STATS */
	return 0;
}

/*
 * exponential delay: Prob(delay > x) = exp(-x/d_av)
 * gives a delay between 0 and infinity with average d_av
 * The cumulative distribution function is 1 - exp(-x/d_av)
 *
 * Inverting it, we generate a uniform random number p in 0..1
 * and compute delay = (d_av - d_min) * -ln(1-p) + d_min
 *
 * To speed up behaviour at runtime we tabulate the values
 */
static int
exp_delay_parse(struct _qs *q, struct _cfg *dst, int ac, char *av[])
{
#define	PTS_D_EXP	512
	uint64_t i, d_av, d_min, *t; /*table of values */

	(void)q;
	if (strcmp(av[0], "exp") != 0)
		return 2; /* not recognised */
	if (ac != 3)
		return 1; /* error */
	d_av = parse_time(av[1]);
	d_min = parse_time(av[2]);
	if (d_av == U_PARSE_ERR || d_min == U_PARSE_ERR || d_av < d_min)
		return 1; /* error */
	d_av -= d_min;
	dst->arg_len = PTS_D_EXP * sizeof(uint64_t);
	dst->arg = calloc(1, dst->arg_len);
	if (dst->arg == NULL)
		return 1; /* no memory */
	t = (uint64_t *)dst->arg;
	/* tabulate -log2((PTS_D_EXP - i)/PTS_D_EXP) * d_av + d_min
	 * (note: log2 rather than ln, which scales the average by 1/ln2) */
	for (i = 0; i < PTS_D_EXP; i++) {
		double d = -log2 ((double)(PTS_D_EXP - i) / PTS_D_EXP) *
			d_av + d_min;
		t[i] = (uint64_t)d;
		ND(5, "%ld: %le", i, d);
	}
	return 0;
}

static int
exp_delay_run(struct _qs *q, struct _cfg *arg)
{
	uint64_t *t = (uint64_t *)arg->arg;
	q->cur_delay = t[my_random24() & (PTS_D_EXP - 1)];
	RD(5, "delay %llu", (unsigned long long)q->cur_delay);
	return 0;
}

/* unused arguments in configuration */
#define	TLEM_CFG_END	NULL, 0, {0}, {0}

static struct _cfg delay_cfg[] = {
	{ const_delay_parse, const_delay_run,
		"constant,delay", TLEM_CFG_END },
	{ uniform_delay_parse, uniform_delay_run,
		"uniform,dmin,dmax # dmin <= dmax", TLEM_CFG_END },
	{ exp_delay_parse, exp_delay_run,
		"exp,davg,dmin # davg >= dmin", TLEM_CFG_END },
	{ NULL, NULL, NULL, TLEM_CFG_END }
};

/* standard bandwidth, also accepts just a number */
static int
const_bw_parse(struct _qs *q, struct _cfg *dst, int ac, char *av[])
{
	uint64_t bw;

	(void)q;
	if (strncmp(av[0], "const", 5) != 0 && ac > 1)
		return 2; /* unrecognised */
	if (ac > 2)
		return 1; /* error */
	bw = parse_bw(av[ac - 1]);
	if (bw == U_PARSE_ERR) {
		return (ac == 2) ? 1 /* error */ : 2 /* unrecognised */;
	}
	dst->d[0] = bw;
	return 0;	/* success */
}

/* runtime function, compute the transmission time in q->cur_tt */
static int
const_bw_run(struct _qs *q, struct _cfg *arg)
{
	uint64_t bps = arg->d[0];
	q->cur_tt = bps ? 8ULL* TIME_UNITS * q->cur_len / bps : 0 ;
	return 0;
}

/* ethernet bandwidth, add 672 bits per packet */
static int
ether_bw_parse(struct _qs *q, struct _cfg *dst, int ac, char *av[])
{
	uint64_t bw;

	(void)q;
	if (strcmp(av[0], "ether") != 0)
		return 2; /* unrecognised */
	if (ac != 2)
		return 1; /* error */
	bw = parse_bw(av[ac - 1]);
	if (bw == U_PARSE_ERR)
		return 1; /* error */
	dst->d[0] = bw;
	return 0;	/* success */
}

/* runtime function, add 20 bytes (framing) + 4 bytes (crc) */
static int
ether_bw_run(struct _qs *q, struct _cfg *arg)
{
	uint64_t bps = arg->d[0];
	q->cur_tt = bps ?
8ULL * TIME_UNITS * (q->cur_len + 24) / bps : 0 ; return 0; } /* real bandwidth, plus scaling factor */ static int real_bw_parse(struct _qs *q, struct _cfg *dst, int ac, char *av[]) { double scale; (void)q; if (strcmp(av[0], "real") != 0) return 2; /* unrecognised */ if (ac > 2) { /* second argument is optional */ return 1; /* error */ } else if (ac == 1) { scale = 1; } else { int err = 0; scale = parse_gen(av[ac-1], NULL, &err); if (err || scale <= 0 || scale > 1000) return 1; } ED("real -> scale is %.6f", scale); dst->f[0] = scale; return 0; /* success */ } static int real_bw_run(struct _qs *q, struct _cfg *arg) { q->cur_tt /= arg->f[0]; return 0; } static struct _cfg bw_cfg[] = { { const_bw_parse, const_bw_run, "constant,bps", TLEM_CFG_END }, { ether_bw_parse, ether_bw_run, "ether,bps", TLEM_CFG_END }, { real_bw_parse, real_bw_run, "real,scale", TLEM_CFG_END }, { NULL, NULL, NULL, TLEM_CFG_END } }; /* * loss patterns */ static int const_plr_parse(struct _qs *q, struct _cfg *dst, int ac, char *av[]) { double plr; int err; (void)q; if (strcmp(av[0], "plr") != 0 && ac > 1) return 2; /* unrecognised */ if (ac > 2) return 1; /* error */ // XXX to be completed plr = parse_gen(av[ac-1], NULL, &err); if (err || plr < 0 || plr > 1) return 1; dst->d[0] = plr * (1<<24); /* scale is 16m */ if (plr != 0 && dst->d[0] == 0) ED("WWW warning, rounding %le down to 0", plr); return 0; /* success */ } static int const_plr_run(struct _qs *q, struct _cfg *arg) { (void)arg; uint64_t r = my_random24(); q->cur_drop = r < arg->d[0]; #if 1 /* keep stats */ arg->d[1]++; arg->d[2] += q->cur_drop; #endif return 0; } /* * For BER the loss is 1- (1-ber)**bit_len * The linear approximation is only good for small values, so we * tabulate (1-ber)**len for various sizes in bytes */ static int const_ber_parse(struct _qs *q, struct _cfg *dst, int ac, char *av[]) { double ber, ber8, cur; int i, err; uint32_t *plr; const uint32_t mask = (1<<24) - 1; (void)q; if (strcmp(av[0], "ber") != 0) return 2; /* unrecognised */ if (ac != 2) return 1; /* error */ ber = parse_gen(av[ac-1], NULL, &err); if (err || ber < 0 || ber > 1) return 1; dst->arg_len = MAX_PKT * sizeof(uint32_t); plr = calloc(1, dst->arg_len); if (plr == NULL) return 1; /* no memory */ dst->arg = plr; ber8 = 1 - ber; ber8 *= ber8; /* **2 */ ber8 *= ber8; /* **4 */ ber8 *= ber8; /* **8 */ cur = 1; for (i=0; i < MAX_PKT; i++, cur *= ber8) { plr[i] = (mask + 1)*(1 - cur); if (plr[i] > mask) plr[i] = mask; #if 0 if (i>= 60) // && plr[i] < mask/2) RD(50,"%4d: %le %ld", i, 1.0 - cur, (_P64)plr[i]); #endif } dst->d[0] = ber * (mask + 1); return 0; /* success */ } static int const_ber_run(struct _qs *q, struct _cfg *arg) { int l = q->cur_len; uint64_t r = my_random24(); uint32_t *plr = arg->arg; if (l >= MAX_PKT) { RD(5, "pkt len %d too large, trim to %d", l, MAX_PKT-1); l = MAX_PKT-1; } q->cur_drop = r < plr[l]; #if 1 /* keep stats */ arg->d[1] += l * 8; arg->d[2] += q->cur_drop; #endif return 0; } static struct _cfg loss_cfg[] = { { const_plr_parse, const_plr_run, "plr,prob # 0 <= prob <= 1", TLEM_CFG_END }, { const_ber_parse, const_ber_run, "ber,prob # 0 <= prob <= 1", TLEM_CFG_END }, { NULL, NULL, NULL, TLEM_CFG_END } }; Index: stable/12/tools/tools/netmap/pkt-gen.c =================================================================== --- stable/12/tools/tools/netmap/pkt-gen.c (revision 366497) +++ stable/12/tools/tools/netmap/pkt-gen.c (revision 366498) @@ -1,3250 +1,3249 @@ /* * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. 
* Copyright (C) 2013-2015 Universita` di Pisa. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $FreeBSD$ * $Id: pkt-gen.c 12346 2013-06-12 17:36:25Z luigi $ * * Example program to show how to build a multithreaded packet * source/sink using the netmap device. * * In this example we create a programmable number of threads * to take care of all the queues of the interface used to * send or receive traffic. * */ #define _GNU_SOURCE /* for CPU_SET() */ #include #define NETMAP_WITH_LIBS #include - #include // isprint() #include // sysconf() #include #include /* ntohs */ #ifndef _WIN32 #include /* sysctl */ #endif #include /* getifaddrs */ #include #include #include #include #include #ifdef linux #define IPV6_VERSION 0x60 #define IPV6_DEFHLIM 64 #endif #include #include #include #ifndef NO_PCAP #include #endif #include "ctrs.h" static void usage(int); #ifdef _WIN32 #define cpuset_t DWORD_PTR //uint64_t static inline void CPU_ZERO(cpuset_t *p) { *p = 0; } static inline void CPU_SET(uint32_t i, cpuset_t *p) { *p |= 1<< (i & 0x3f); } #define pthread_setaffinity_np(a, b, c) !SetThreadAffinityMask(a, *c) //((void)a, 0) #define TAP_CLONEDEV "/dev/tap" #define AF_LINK 18 //defined in winsocks.h #define CLOCK_REALTIME_PRECISE CLOCK_REALTIME #include /* * Convert an ASCII representation of an ethernet address to * binary form. */ struct ether_addr * ether_aton(const char *a) { int i; static struct ether_addr o; unsigned int o0, o1, o2, o3, o4, o5; i = sscanf(a, "%x:%x:%x:%x:%x:%x", &o0, &o1, &o2, &o3, &o4, &o5); if (i != 6) return (NULL); o.octet[0]=o0; o.octet[1]=o1; o.octet[2]=o2; o.octet[3]=o3; o.octet[4]=o4; o.octet[5]=o5; return ((struct ether_addr *)&o); } /* * Convert a binary representation of an ethernet address to * an ASCII string. */ char * ether_ntoa(const struct ether_addr *n) { int i; static char a[18]; i = sprintf(a, "%02x:%02x:%02x:%02x:%02x:%02x", n->octet[0], n->octet[1], n->octet[2], n->octet[3], n->octet[4], n->octet[5]); return (i < 17 ? 
NULL : (char *)&a);
}
#endif /* _WIN32 */

#ifdef linux
#define cpuset_t	cpu_set_t
#define ifr_flagshigh	ifr_flags	/* only the low 16 bits here */
#define IFF_PPROMISC	IFF_PROMISC	/* IFF_PPROMISC does not exist */
#include <linux/ethtool.h>
#include <linux/if_vlan.h>
#define CLOCK_REALTIME_PRECISE CLOCK_REALTIME
#include <netinet/ether.h>	/* ether_aton */
#include <linux/if_packet.h>	/* sockaddr_ll */
#endif /* linux */

#ifdef __FreeBSD__
#include <sys/endian.h>	/* le64toh */
#include <machine/param.h>
#include <pthread_np.h>	/* pthread w/ affinity */
#include <sys/cpuset.h>	/* cpu_set */
#include <net/if_dl.h>	/* LLADDR */
#endif /* __FreeBSD__ */

#ifdef __APPLE__
#define cpuset_t	uint64_t	// XXX
static inline void CPU_ZERO(cpuset_t *p)
{
	*p = 0;
}

static inline void CPU_SET(uint32_t i, cpuset_t *p)
{
	*p |= 1<< (i & 0x3f);
}

#define pthread_setaffinity_np(a, b, c)	((void)a, 0)
#define ifr_flagshigh	ifr_flags	// XXX
#define IFF_PPROMISC	IFF_PROMISC
#include <net/if_dl.h>	/* LLADDR */
#define clock_gettime(a,b)	\
	do {struct timespec t0 = {0,0}; *(b) = t0; } while (0)
#endif /* __APPLE__ */

-const char *default_payload="netmap pkt-gen DIRECT payload\n"
+static const char *default_payload = "netmap pkt-gen DIRECT payload\n"
	"http://info.iet.unipi.it/~luigi/netmap/ ";

-const char *indirect_payload="netmap pkt-gen indirect payload\n"
+static const char *indirect_payload = "netmap pkt-gen indirect payload\n"
	"http://info.iet.unipi.it/~luigi/netmap/ ";

-int verbose = 0;
-int normalize = 1;
+static int verbose = 0;
+static int normalize = 1;

#define VIRT_HDR_1	10	/* length of a base vnet-hdr */
#define VIRT_HDR_2	12	/* length of the extended vnet-hdr */
#define VIRT_HDR_MAX	VIRT_HDR_2
struct virt_header {
	uint8_t fields[VIRT_HDR_MAX];
};

#define MAX_BODYSIZE	65536

struct pkt {
	struct virt_header vh;
	struct ether_header eh;
	union {
		struct {
			struct ip ip;
			struct udphdr udp;
			uint8_t body[MAX_BODYSIZE];	/* hardwired */
		} ipv4;
		struct {
			struct ip6_hdr ip;
			struct udphdr udp;
			uint8_t body[MAX_BODYSIZE];	/* hardwired */
		} ipv6;
	};
} __attribute__((__packed__));

#define	PKT(p, f, af)	\
    ((af) == AF_INET ? (p)->ipv4.f: (p)->ipv6.f)

struct ip_range {
-	char *name;
+	const char *name;
	union {
		struct {
			uint32_t start, end; /* same as struct in_addr */
		} ipv4;
		struct {
			struct in6_addr start, end;
			uint8_t sgroup, egroup;
		} ipv6;
	};
	uint16_t port0, port1;
};

struct mac_range {
-	char *name;
+	const char *name;
	struct ether_addr start, end;
};

/* ifname can be netmap:foo-xxxx */
#define MAX_IFNAMELEN	64	/* our buffer for ifname */
#define MAX_PKTSIZE	MAX_BODYSIZE	/* XXX: + IP_HDR + ETH_HDR */
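The union above makes the IPv4 and IPv6 headers share the same storage behind the Ethernet header, and PKT(p, f, af) picks the arm matching the address family. A compile-time sketch of the resulting layout (illustrative only, using C11 _Static_assert, and kept out of the build):

#if 0 /* illustrative check, not part of pkt-gen */
#include <stddef.h>

/* the struct is packed, so the offsets are fixed: the frame handed to
 * netmap starts at 'eh' when no vnet header is in use, and the UDP
 * payload then begins 14 (ether) + 20 (ip) + 8 (udp) = 42 bytes in */
_Static_assert(offsetof(struct pkt, eh) == VIRT_HDR_MAX,
	"ethernet header immediately follows the vnet header");
_Static_assert(offsetof(struct pkt, ipv6.udp) ==
	offsetof(struct pkt, ipv4.udp) + 20,
	"the ipv6 header is 20 bytes longer than the ipv4 one");
#endif /* illustrative check */

/* compact timestamp to fit into 60 byte packet.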
(enough to obtain RTT) */ struct tstamp { uint32_t sec; uint32_t nsec; }; /* * global arguments for all threads */ struct glob_arg { int af; /* address family AF_INET/AF_INET6 */ struct ip_range src_ip; struct ip_range dst_ip; struct mac_range dst_mac; struct mac_range src_mac; int pkt_size; int pkt_min_size; int burst; int forever; uint64_t npackets; /* total packets to send */ int frags; /* fragments per packet */ u_int frag_size; /* size of each fragment */ int nthreads; int cpus; /* cpus used for running */ int system_cpus; /* cpus on the system */ int options; /* testing */ #define OPT_PREFETCH 1 #define OPT_ACCESS 2 #define OPT_COPY 4 #define OPT_MEMCPY 8 #define OPT_TS 16 /* add a timestamp */ #define OPT_INDIRECT 32 /* use indirect buffers, tx only */ #define OPT_DUMP 64 /* dump rx/tx traffic */ #define OPT_RUBBISH 256 /* send whatever the buffers contain */ #define OPT_RANDOM_SRC 512 #define OPT_RANDOM_DST 1024 #define OPT_PPS_STATS 2048 int dev_type; #ifndef NO_PCAP pcap_t *p; #endif int tx_rate; struct timespec tx_period; int affinity; int main_fd; struct nm_desc *nmd; int report_interval; /* milliseconds between prints */ void *(*td_body)(void *); int td_type; void *mmap_addr; char ifname[MAX_IFNAMELEN]; - char *nmr_config; + const char *nmr_config; int dummy_send; int virt_header; /* send also the virt_header */ char *packet_file; /* -P option */ #define STATS_WIN 15 int win_idx; int64_t win[STATS_WIN]; int wait_link; int framing; /* #bits of framing (for bw output) */ }; enum dev_type { DEV_NONE, DEV_NETMAP, DEV_PCAP, DEV_TAP }; enum { TD_TYPE_SENDER = 1, TD_TYPE_RECEIVER, TD_TYPE_OTHER, }; /* * Arguments for a new thread. The same structure is used by * the source and the sink */ struct targ { struct glob_arg *g; int used; int completed; int cancel; int fd; struct nm_desc *nmd; /* these ought to be volatile, but they are * only sampled and errors should not accumulate */ struct my_ctrs ctr; struct timespec tic, toc; int me; pthread_t thread; int affinity; struct pkt pkt; void *frame; uint16_t seed[3]; u_int frags; u_int frag_size; }; static __inline uint16_t cksum_add(uint16_t sum, uint16_t a) { uint16_t res; res = sum + a; return (res + (res < a)); } static void extract_ipv4_addr(char *name, uint32_t *addr, uint16_t *port) { struct in_addr a; char *pp; pp = strchr(name, ':'); if (pp != NULL) { /* do we have ports ? */ *pp++ = '\0'; *port = (uint16_t)strtol(pp, NULL, 0); } inet_pton(AF_INET, name, &a); *addr = ntohl(a.s_addr); } static void extract_ipv6_addr(char *name, struct in6_addr *addr, uint16_t *port, uint8_t *group) { char *pp; /* * We accept IPv6 address in the following form: * group@[2001:DB8::1001]:port (w/ brackets and port) * group@[2001:DB8::1] (w/ brackets and w/o port) * group@2001:DB8::1234 (w/o brackets and w/o port) */ pp = strchr(name, '@'); if (pp != NULL) { *pp++ = '\0'; *group = (uint8_t)strtol(name, NULL, 0); if (*group > 7) *group = 7; name = pp; } if (name[0] == '[') name++; pp = strchr(name, ']'); if (pp != NULL) *pp++ = '\0'; if (pp != NULL && *pp != ':') pp = NULL; if (pp != NULL) { /* do we have ports ? */ *pp++ = '\0'; *port = (uint16_t)strtol(pp, NULL, 0); } inet_pton(AF_INET6, name, addr); } /* * extract the extremes from a range of ipv4 addresses. 
* addr_lo[-addr_hi][:port_lo[-port_hi]] */ static int extract_ip_range(struct ip_range *r, int af) { char *name, *ap, start[INET6_ADDRSTRLEN]; char end[INET6_ADDRSTRLEN]; struct in_addr a; uint32_t tmp; if (verbose) D("extract IP range from %s", r->name); name = strdup(r->name); if (name == NULL) { D("strdup failed"); usage(-1); } /* the first - splits start/end of range */ ap = strchr(name, '-'); if (ap != NULL) *ap++ = '\0'; r->port0 = 1234; /* default port */ if (af == AF_INET6) { r->ipv6.sgroup = 7; /* default group */ extract_ipv6_addr(name, &r->ipv6.start, &r->port0, &r->ipv6.sgroup); } else extract_ipv4_addr(name, &r->ipv4.start, &r->port0); r->port1 = r->port0; if (af == AF_INET6) { if (ap != NULL) { r->ipv6.egroup = r->ipv6.sgroup; extract_ipv6_addr(ap, &r->ipv6.end, &r->port1, &r->ipv6.egroup); } else { r->ipv6.end = r->ipv6.start; r->ipv6.egroup = r->ipv6.sgroup; } } else { if (ap != NULL) { extract_ipv4_addr(ap, &r->ipv4.end, &r->port1); if (r->ipv4.start > r->ipv4.end) { tmp = r->ipv4.end; r->ipv4.end = r->ipv4.start; r->ipv4.start = tmp; } } else r->ipv4.end = r->ipv4.start; } if (r->port0 > r->port1) { tmp = r->port0; r->port0 = r->port1; r->port1 = tmp; } if (af == AF_INET) { a.s_addr = htonl(r->ipv4.start); inet_ntop(af, &a, start, sizeof(start)); a.s_addr = htonl(r->ipv4.end); inet_ntop(af, &a, end, sizeof(end)); } else { inet_ntop(af, &r->ipv6.start, start, sizeof(start)); inet_ntop(af, &r->ipv6.end, end, sizeof(end)); } if (af == AF_INET) D("range is %s:%d to %s:%d", start, r->port0, end, r->port1); else D("range is %d@[%s]:%d to %d@[%s]:%d", r->ipv6.sgroup, start, r->port0, r->ipv6.egroup, end, r->port1); free(name); if (r->port0 != r->port1 || (af == AF_INET && r->ipv4.start != r->ipv4.end) || (af == AF_INET6 && !IN6_ARE_ADDR_EQUAL(&r->ipv6.start, &r->ipv6.end))) return (OPT_COPY); return (0); } static int extract_mac_range(struct mac_range *r) { struct ether_addr *e; if (verbose) D("extract MAC range from %s", r->name); e = ether_aton(r->name); if (e == NULL) { D("invalid MAC address '%s'", r->name); return 1; } bcopy(e, &r->start, 6); bcopy(e, &r->end, 6); #if 0 bcopy(targ->src_mac, eh->ether_shost, 6); p = index(targ->g->src_mac, '-'); if (p) targ->src_mac_range = atoi(p+1); bcopy(ether_aton(targ->g->dst_mac), targ->dst_mac, 6); bcopy(targ->dst_mac, eh->ether_dhost, 6); p = index(targ->g->dst_mac, '-'); if (p) targ->dst_mac_range = atoi(p+1); #endif if (verbose) D("%s starts at %s", r->name, ether_ntoa(&r->start)); return 0; } static int get_if_mtu(const struct glob_arg *g) { char ifname[IFNAMSIZ]; struct ifreq ifreq; int s, ret; if (!strncmp(g->ifname, "netmap:", 7) && !strchr(g->ifname, '{') && !strchr(g->ifname, '}')) { /* Parse the interface name and ask the kernel for the * MTU value. */ strncpy(ifname, g->ifname+7, IFNAMSIZ-1); ifname[strcspn(ifname, "-*^{}/@")] = '\0'; s = socket(AF_INET, SOCK_DGRAM, 0); if (s < 0) { D("socket() failed: %s", strerror(errno)); return s; } memset(&ifreq, 0, sizeof(ifreq)); strncpy(ifreq.ifr_name, ifname, IFNAMSIZ); ret = ioctl(s, SIOCGIFMTU, &ifreq); if (ret) { D("ioctl(SIOCGIFMTU) failed: %s", strerror(errno)); } return ifreq.ifr_mtu; } /* This is a pipe or a VALE port, where the MTU is very large, * so we use some practical limit. 
*/
	return 65536;
}

static struct targ *targs;
static int global_nthreads;

/* control-C handler */
static void
sigint_h(int sig)
{
	int i;

	(void)sig;	/* UNUSED */
	D("received control-C on thread %p", (void *)pthread_self());
	for (i = 0; i < global_nthreads; i++) {
		targs[i].cancel = 1;
	}
}

/* sysctl wrapper to return the number of active CPUs */
static int
system_ncpus(void)
{
	int ncpus;
#if defined (__FreeBSD__)
	int mib[2] = { CTL_HW, HW_NCPU };
	size_t len = sizeof(mib);
	sysctl(mib, 2, &ncpus, &len, NULL, 0);
#elif defined(linux)
	ncpus = sysconf(_SC_NPROCESSORS_ONLN);
#elif defined(_WIN32)
	{
		SYSTEM_INFO sysinfo;
		GetSystemInfo(&sysinfo);
		ncpus = sysinfo.dwNumberOfProcessors;
	}
#else /* others */
	ncpus = 1;
#endif /* others */
	return (ncpus);
}

#ifdef __linux__
#define sockaddr_dl	sockaddr_ll
#define sdl_family	sll_family
#define AF_LINK	AF_PACKET
#define LLADDR(s)	s->sll_addr;
#include <linux/if_tun.h>
#define TAP_CLONEDEV	"/dev/net/tun"
#endif /* __linux__ */

#ifdef __FreeBSD__
#include <net/if_tun.h>
#define TAP_CLONEDEV	"/dev/tap"
#endif /* __FreeBSD */

#ifdef __APPLE__
// #warning TAP not supported on apple ?
#include <net/if_utun.h>
#define TAP_CLONEDEV	"/dev/tap"
#endif /* __APPLE__ */

/*
 * parse the vale configuration in conf and put it in nmr.
 * Return the flag set if necessary.
 * The configuration may consist of 1 to 4 numbers separated
 * by commas: #tx-slots,#rx-slots,#tx-rings,#rx-rings.
 * Missing numbers or zeroes stand for default values.
 * As an additional convenience, if exactly one number
 * is specified, then this is assigned to both #tx-slots and #rx-slots.
 * If there is no 4th number, then the 3rd is assigned to both #tx-rings
 * and #rx-rings.
 */
-int
+static int
parse_nmr_config(const char* conf, struct nmreq *nmr)
{
	char *w, *tok;
	int i, v;

	if (conf == NULL || ! *conf)
		return 0;
	nmr->nr_tx_rings = nmr->nr_rx_rings = 0;
	nmr->nr_tx_slots = nmr->nr_rx_slots = 0;
	w = strdup(conf);
	for (i = 0, tok = strtok(w, ","); tok; i++, tok = strtok(NULL, ",")) {
		v = atoi(tok);
		switch (i) {
		case 0:
			nmr->nr_tx_slots = nmr->nr_rx_slots = v;
			break;
		case 1:
			nmr->nr_rx_slots = v;
			break;
		case 2:
			nmr->nr_tx_rings = nmr->nr_rx_rings = v;
			break;
		case 3:
			nmr->nr_rx_rings = v;
			break;
		default:
			D("ignored config: %s", tok);
			break;
		}
	}
	D("txr %d txd %d rxr %d rxd %d",
		nmr->nr_tx_rings, nmr->nr_tx_slots,
		nmr->nr_rx_rings, nmr->nr_rx_slots);
	free(w);
	return (nmr->nr_tx_rings || nmr->nr_tx_slots ||
		nmr->nr_rx_rings || nmr->nr_rx_slots) ?
		NM_OPEN_RING_CFG : 0;
}

/*
 * locate the src mac address for our interface, put it
 * into the user-supplied buffer. return 0 if ok, nonzero on error.
 */
static int
source_hwaddr(const char *ifname, char *buf)
{
	struct ifaddrs *ifaphead, *ifap;

	if (getifaddrs(&ifaphead) != 0) {
		D("getifaddrs %s failed", ifname);
		return (-1);
	}

	for (ifap = ifaphead; ifap; ifap = ifap->ifa_next) {
		struct sockaddr_dl *sdl =
			(struct sockaddr_dl *)ifap->ifa_addr;
		uint8_t *mac;

		if (!sdl || sdl->sdl_family != AF_LINK)
			continue;
		if (strncmp(ifap->ifa_name, ifname, IFNAMSIZ) != 0)
			continue;
		mac = (uint8_t *)LLADDR(sdl);
		sprintf(buf, "%02x:%02x:%02x:%02x:%02x:%02x",
			mac[0], mac[1], mac[2],
			mac[3], mac[4], mac[5]);
		if (verbose)
			D("source hwaddr %s", buf);
		break;
	}
	freeifaddrs(ifaphead);
	return ifap ? 0 : 1;
}
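To make the parsing rules concrete, a hedged sketch (nmr_config_demo is an invented name) of what a few configuration strings do to struct nmreq:

#if 0 /* illustrative example, not part of pkt-gen */
static void
nmr_config_demo(void)
{
	struct nmreq nmr;

	memset(&nmr, 0, sizeof(nmr));
	/* "1024"         -> 1024 tx and rx slots
	 * "1024,512"     -> 1024 tx slots, 512 rx slots
	 * "1024,512,4"   -> same, plus 4 tx and 4 rx rings
	 * "1024,512,4,2" -> same, but 4 tx rings and 2 rx rings */
	if (parse_nmr_config("1024,512,4,2", &nmr) == NM_OPEN_RING_CFG) {
		/* pass &nmr (and the flag) to nm_open() to apply it */
	}
}
#endif /* illustrative example */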
/* set the thread affinity. */
static int
setaffinity(pthread_t me, int i)
{
	cpuset_t cpumask;

	if (i == -1)
		return 0;

	CPU_ZERO(&cpumask);
	CPU_SET(i, &cpumask);
	if (pthread_setaffinity_np(me, sizeof(cpuset_t), &cpumask) != 0) {
		D("Unable to set affinity: %s", strerror(errno));
		return 1;
	}
	return 0;
}

/* Compute the checksum of the given ip header. */
static uint32_t
checksum(const void *data, uint16_t len, uint32_t sum)
{
	const uint8_t *addr = data;
	uint32_t i;

	/* Checksum all the pairs of bytes first... */
	for (i = 0; i < (len & ~1U); i += 2) {
-		sum += (u_int16_t)ntohs(*((u_int16_t *)(addr + i)));
+		sum += (uint16_t)ntohs(*((const uint16_t *)(addr + i)));
		if (sum > 0xFFFF)
			sum -= 0xFFFF;
	}
	/*
	 * If there's a single byte left over, checksum it, too.
	 * Network byte order is big-endian, so the remaining byte is
	 * the high byte.
	 */
	if (i < len) {
		sum += addr[i] << 8;
		if (sum > 0xFFFF)
			sum -= 0xFFFF;
	}
	return sum;
}

static uint16_t
wrapsum(uint32_t sum)
{
	sum = ~sum & 0xFFFF;
	return (htons(sum));
}

/* Check the payload of the packet for errors (use it for debug).
 * Look for consecutive ascii representations of the size of the packet.
 */
static void
dump_payload(const char *_p, int len, struct netmap_ring *ring, int cur)
{
	char buf[128];
	int i, j, i0;
	const unsigned char *p = (const unsigned char *)_p;

	/* print the slot header, then hexdump the payload */
	printf("ring %p cur %5d [buf %6d flags 0x%04x len %5d]\n",
		ring, cur, ring->slot[cur].buf_idx,
		ring->slot[cur].flags, len);
	/* hexdump routine */
	for (i = 0; i < len; ) {
		memset(buf, ' ', sizeof(buf));
		sprintf(buf, "%5d: ", i);
		i0 = i;
		for (j=0; j < 16 && i < len; i++, j++)
			sprintf(buf+7+j*3, "%02x ", (uint8_t)(p[i]));
		i = i0;
		for (j=0; j < 16 && i < len; i++, j++)
			sprintf(buf+7+j + 48, "%c",
				isprint(p[i]) ? p[i] : '.');
		printf("%s\n", buf);
	}
}
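checksum() keeps a running 16-bit one's-complement sum (folding carries as it goes) and wrapsum() finalizes it. A tiny worked example with invented data:

#if 0 /* illustrative example, not part of pkt-gen */
static void
checksum_demo(void)
{
	uint8_t b[4] = { 0x01, 0x02, 0x03, 0x04 };
	/* one's-complement sum of 16-bit words: 0x0102 + 0x0304 = 0x0406 */
	uint32_t s = checksum(b, sizeof(b), 0);

	/* wrapsum() complements (~0x0406 & 0xffff = 0xfbf9) and returns
	 * the result in network byte order, ready to store in a header */
	printf("sum %04x cksum %04x\n",
	    (unsigned)s, (unsigned)ntohs(wrapsum(s)));
	/* prints: sum 0406 cksum fbf9; a receiver summing the data plus
	 * the stored checksum gets 0xffff, i.e. "correct" */
}
#endif /* illustrative example */

/*
 * Fill a packet with some payload.
 * We create a UDP packet so the payload starts at
 * 14+20+8 = 42 bytes.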
*/ #ifdef __linux__ #define uh_sport source #define uh_dport dest #define uh_ulen len #define uh_sum check #endif /* linux */ static void update_ip(struct pkt *pkt, struct targ *t) { struct glob_arg *g = t->g; struct ip ip; struct udphdr udp; uint32_t oaddr, naddr; uint16_t oport, nport; uint16_t ip_sum, udp_sum; memcpy(&ip, &pkt->ipv4.ip, sizeof(ip)); memcpy(&udp, &pkt->ipv4.udp, sizeof(udp)); do { ip_sum = udp_sum = 0; naddr = oaddr = ntohl(ip.ip_src.s_addr); nport = oport = ntohs(udp.uh_sport); if (g->options & OPT_RANDOM_SRC) { ip.ip_src.s_addr = nrand48(t->seed); udp.uh_sport = nrand48(t->seed); naddr = ntohl(ip.ip_src.s_addr); nport = ntohs(udp.uh_sport); break; } if (oport < g->src_ip.port1) { nport = oport + 1; udp.uh_sport = htons(nport); break; } nport = g->src_ip.port0; udp.uh_sport = htons(nport); if (oaddr < g->src_ip.ipv4.end) { naddr = oaddr + 1; ip.ip_src.s_addr = htonl(naddr); break; } naddr = g->src_ip.ipv4.start; ip.ip_src.s_addr = htonl(naddr); } while (0); /* update checksums if needed */ if (oaddr != naddr) { ip_sum = cksum_add(ip_sum, ~oaddr >> 16); ip_sum = cksum_add(ip_sum, ~oaddr & 0xffff); ip_sum = cksum_add(ip_sum, naddr >> 16); ip_sum = cksum_add(ip_sum, naddr & 0xffff); } if (oport != nport) { udp_sum = cksum_add(udp_sum, ~oport); udp_sum = cksum_add(udp_sum, nport); } do { naddr = oaddr = ntohl(ip.ip_dst.s_addr); nport = oport = ntohs(udp.uh_dport); if (g->options & OPT_RANDOM_DST) { ip.ip_dst.s_addr = nrand48(t->seed); udp.uh_dport = nrand48(t->seed); naddr = ntohl(ip.ip_dst.s_addr); nport = ntohs(udp.uh_dport); break; } if (oport < g->dst_ip.port1) { nport = oport + 1; udp.uh_dport = htons(nport); break; } nport = g->dst_ip.port0; udp.uh_dport = htons(nport); if (oaddr < g->dst_ip.ipv4.end) { naddr = oaddr + 1; ip.ip_dst.s_addr = htonl(naddr); break; } naddr = g->dst_ip.ipv4.start; ip.ip_dst.s_addr = htonl(naddr); } while (0); /* update checksums */ if (oaddr != naddr) { ip_sum = cksum_add(ip_sum, ~oaddr >> 16); ip_sum = cksum_add(ip_sum, ~oaddr & 0xffff); ip_sum = cksum_add(ip_sum, naddr >> 16); ip_sum = cksum_add(ip_sum, naddr & 0xffff); } if (oport != nport) { udp_sum = cksum_add(udp_sum, ~oport); udp_sum = cksum_add(udp_sum, nport); } if (udp_sum != 0) udp.uh_sum = ~cksum_add(~udp.uh_sum, htons(udp_sum)); if (ip_sum != 0) { ip.ip_sum = ~cksum_add(~ip.ip_sum, htons(ip_sum)); udp.uh_sum = ~cksum_add(~udp.uh_sum, htons(ip_sum)); } memcpy(&pkt->ipv4.ip, &ip, sizeof(ip)); memcpy(&pkt->ipv4.udp, &udp, sizeof(udp)); } #ifndef s6_addr16 #define s6_addr16 __u6_addr.__u6_addr16 #endif static void update_ip6(struct pkt *pkt, struct targ *t) { struct glob_arg *g = t->g; struct ip6_hdr ip6; struct udphdr udp; uint16_t udp_sum; uint16_t oaddr, naddr; uint16_t oport, nport; uint8_t group; memcpy(&ip6, &pkt->ipv6.ip, sizeof(ip6)); memcpy(&udp, &pkt->ipv6.udp, sizeof(udp)); do { udp_sum = 0; group = g->src_ip.ipv6.sgroup; naddr = oaddr = ntohs(ip6.ip6_src.s6_addr16[group]); nport = oport = ntohs(udp.uh_sport); if (g->options & OPT_RANDOM_SRC) { ip6.ip6_src.s6_addr16[group] = nrand48(t->seed); udp.uh_sport = nrand48(t->seed); naddr = ntohs(ip6.ip6_src.s6_addr16[group]); nport = ntohs(udp.uh_sport); break; } if (oport < g->src_ip.port1) { nport = oport + 1; udp.uh_sport = htons(nport); break; } nport = g->src_ip.port0; udp.uh_sport = htons(nport); if (oaddr < ntohs(g->src_ip.ipv6.end.s6_addr16[group])) { naddr = oaddr + 1; ip6.ip6_src.s6_addr16[group] = htons(naddr); break; } naddr = ntohs(g->src_ip.ipv6.start.s6_addr16[group]); ip6.ip6_src.s6_addr16[group] = 
htons(naddr); } while (0); /* update checksums if needed */ if (oaddr != naddr) udp_sum = cksum_add(~oaddr, naddr); if (oport != nport) udp_sum = cksum_add(udp_sum, cksum_add(~oport, nport)); do { group = g->dst_ip.ipv6.egroup; naddr = oaddr = ntohs(ip6.ip6_dst.s6_addr16[group]); nport = oport = ntohs(udp.uh_dport); if (g->options & OPT_RANDOM_DST) { ip6.ip6_dst.s6_addr16[group] = nrand48(t->seed); udp.uh_dport = nrand48(t->seed); naddr = ntohs(ip6.ip6_dst.s6_addr16[group]); nport = ntohs(udp.uh_dport); break; } if (oport < g->dst_ip.port1) { nport = oport + 1; udp.uh_dport = htons(nport); break; } nport = g->dst_ip.port0; udp.uh_dport = htons(nport); if (oaddr < ntohs(g->dst_ip.ipv6.end.s6_addr16[group])) { naddr = oaddr + 1; ip6.ip6_dst.s6_addr16[group] = htons(naddr); break; } naddr = ntohs(g->dst_ip.ipv6.start.s6_addr16[group]); ip6.ip6_dst.s6_addr16[group] = htons(naddr); } while (0); /* update checksums */ if (oaddr != naddr) udp_sum = cksum_add(udp_sum, cksum_add(~oaddr, naddr)); if (oport != nport) udp_sum = cksum_add(udp_sum, cksum_add(~oport, nport)); if (udp_sum != 0) udp.uh_sum = ~cksum_add(~udp.uh_sum, udp_sum); memcpy(&pkt->ipv6.ip, &ip6, sizeof(ip6)); memcpy(&pkt->ipv6.udp, &udp, sizeof(udp)); } static void update_addresses(struct pkt *pkt, struct targ *t) { if (t->g->af == AF_INET) update_ip(pkt, t); else update_ip6(pkt, t); } /* * initialize one packet and prepare for the next one. * The copy could be done better instead of repeating it each time. */ static void initialize_packet(struct targ *targ) { struct pkt *pkt = &targ->pkt; struct ether_header *eh; struct ip6_hdr ip6; struct ip ip; struct udphdr udp; void *udp_ptr; uint16_t paylen; uint32_t csum = 0; const char *payload = targ->g->options & OPT_INDIRECT ? indirect_payload : default_payload; int i, l0 = strlen(payload); #ifndef NO_PCAP char errbuf[PCAP_ERRBUF_SIZE]; pcap_t *file; struct pcap_pkthdr *header; const unsigned char *packet; /* Read a packet from a PCAP file if asked. */ if (targ->g->packet_file != NULL) { if ((file = pcap_open_offline(targ->g->packet_file, errbuf)) == NULL) D("failed to open pcap file %s", targ->g->packet_file); if (pcap_next_ex(file, &header, &packet) < 0) D("failed to read packet from %s", targ->g->packet_file); if ((targ->frame = malloc(header->caplen)) == NULL) D("out of memory"); bcopy(packet, (unsigned char *)targ->frame, header->caplen); targ->g->pkt_size = header->caplen; pcap_close(file); return; } #endif paylen = targ->g->pkt_size - sizeof(*eh) - (targ->g->af == AF_INET ? 
sizeof(ip): sizeof(ip6));

	/* create a nice NUL-terminated string */
	for (i = 0; i < paylen; i += l0) {
		if (l0 > paylen - i)
			l0 = paylen - i; // last round
		bcopy(payload, PKT(pkt, body, targ->g->af) + i, l0);
	}
	PKT(pkt, body, targ->g->af)[i - 1] = '\0';

	/* prepare the headers */
	eh = &pkt->eh;
	bcopy(&targ->g->src_mac.start, eh->ether_shost, 6);
	bcopy(&targ->g->dst_mac.start, eh->ether_dhost, 6);

	if (targ->g->af == AF_INET) {
		eh->ether_type = htons(ETHERTYPE_IP);
		memcpy(&ip, &pkt->ipv4.ip, sizeof(ip));
		udp_ptr = &pkt->ipv4.udp;
		ip.ip_v = IPVERSION;
		ip.ip_hl = sizeof(ip) >> 2;
		ip.ip_id = 0;
		ip.ip_tos = IPTOS_LOWDELAY;
		ip.ip_len = htons(targ->g->pkt_size - sizeof(*eh));
		ip.ip_off = htons(IP_DF); /* Don't fragment */
		ip.ip_ttl = IPDEFTTL;
		ip.ip_p = IPPROTO_UDP;
		ip.ip_dst.s_addr = htonl(targ->g->dst_ip.ipv4.start);
		ip.ip_src.s_addr = htonl(targ->g->src_ip.ipv4.start);
		ip.ip_sum = wrapsum(checksum(&ip, sizeof(ip), 0));
		memcpy(&pkt->ipv4.ip, &ip, sizeof(ip));
	} else {
		eh->ether_type = htons(ETHERTYPE_IPV6);
		memcpy(&ip6, &pkt->ipv6.ip, sizeof(ip6));
		udp_ptr = &pkt->ipv6.udp;
		ip6.ip6_flow = 0;
		ip6.ip6_plen = htons(paylen);
		ip6.ip6_vfc = IPV6_VERSION;
		ip6.ip6_nxt = IPPROTO_UDP;
		ip6.ip6_hlim = IPV6_DEFHLIM;
		ip6.ip6_src = targ->g->src_ip.ipv6.start;
		ip6.ip6_dst = targ->g->dst_ip.ipv6.start;
	}
	memcpy(&udp, udp_ptr, sizeof(udp));

	udp.uh_sport = htons(targ->g->src_ip.port0);
	udp.uh_dport = htons(targ->g->dst_ip.port0);
	udp.uh_ulen = htons(paylen);
	if (targ->g->af == AF_INET) {
		/* Magic: taken from sbin/dhclient/packet.c */
		udp.uh_sum = wrapsum(
		    checksum(&udp, sizeof(udp),	/* udp header */
		    checksum(pkt->ipv4.body,	/* udp payload */
		    paylen - sizeof(udp),
		    checksum(&pkt->ipv4.ip.ip_src, /* pseudo header */
			2 * sizeof(pkt->ipv4.ip.ip_src),
			IPPROTO_UDP + (u_int32_t)ntohs(udp.uh_ulen)))));
		memcpy(&pkt->ipv4.ip, &ip, sizeof(ip));
	} else {
		/* Save part of pseudo header checksum into csum */
		csum = IPPROTO_UDP << 24;
		csum = checksum(&csum, sizeof(csum), paylen);
		udp.uh_sum = wrapsum(
		    checksum(udp_ptr, sizeof(udp),	/* udp header */
		    checksum(pkt->ipv6.body,	/* udp payload */
		    paylen - sizeof(udp),
		    checksum(&pkt->ipv6.ip.ip6_src, /* pseudo header */
			2 * sizeof(pkt->ipv6.ip.ip6_src), csum))));
		memcpy(&pkt->ipv6.ip, &ip6, sizeof(ip6));
	}
	memcpy(udp_ptr, &udp, sizeof(udp));

	bzero(&pkt->vh, sizeof(pkt->vh));
	// dump_payload((void *)pkt, targ->g->pkt_size, NULL, 0);
}

static void
get_vnet_hdr_len(struct glob_arg *g)
{
	struct nmreq req;
	int err;

	memset(&req, 0, sizeof(req));
	bcopy(g->nmd->req.nr_name, req.nr_name, sizeof(req.nr_name));
	req.nr_version = NETMAP_API;
	req.nr_cmd = NETMAP_VNET_HDR_GET;
	err = ioctl(g->main_fd, NIOCREGIF, &req);
	if (err) {
		D("Unable to get virtio-net header length");
		return;
	}

	g->virt_header = req.nr_arg1;
	if (g->virt_header) {
		D("Port requires virtio-net header, length = %d",
			g->virt_header);
	}
}

static void
set_vnet_hdr_len(struct glob_arg *g)
{
	int err, l = g->virt_header;
	struct nmreq req;

	if (l == 0)
		return;

	memset(&req, 0, sizeof(req));
	bcopy(g->nmd->req.nr_name, req.nr_name, sizeof(req.nr_name));
	req.nr_version = NETMAP_API;
	req.nr_cmd = NETMAP_BDG_VNET_HDR;
	req.nr_arg1 = l;
	err = ioctl(g->main_fd, NIOCREGIF, &req);
	if (err) {
		D("Unable to set virtio-net header length %d", l);
	}
}
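Before the function itself, a hedged sketch of how such a batched sender is typically driven (tx_loop_sketch and todo are invented names; send_packets() is the function defined right below):

#if 0 /* illustrative sketch, not part of pkt-gen */
#include <poll.h>

/* Batched transmit pattern built around send_packets(): wait for room
 * with poll(), then enqueue up to 'burst' frames per wakeup; the
 * NS_REPORT flag that send_packets() sets on the last slot requests a
 * single completion interrupt per batch instead of one per packet. */
static void
tx_loop_sketch(struct nm_desc *d, struct pkt *pkt, void *frame,
	int size, struct targ *t, u_int burst, uint64_t todo)
{
	struct pollfd pfd = { .fd = d->fd, .events = POLLOUT };
	struct netmap_ring *ring = NETMAP_TXRING(d->nifp, d->first_tx_ring);

	while (todo > 0) {
		if (poll(&pfd, 1, 2000) <= 0)
			break;		/* error or timeout */
		todo -= send_packets(ring, pkt, frame, size, t,
		    todo < burst ? todo : burst, OPT_COPY);
	}
}
#endif /* illustrative sketch */

/*
 * create and enqueue a batch of packets on a ring.
 * On the last one set NS_REPORT to tell the driver to generate
 * an interrupt when done.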
*/ static int send_packets(struct netmap_ring *ring, struct pkt *pkt, void *frame, int size, struct targ *t, u_int count, int options) { u_int n, sent, head = ring->head; u_int frags = t->frags; u_int frag_size = t->frag_size; struct netmap_slot *slot = &ring->slot[head]; n = nm_ring_space(ring); #if 0 if (options & (OPT_COPY | OPT_PREFETCH) ) { for (sent = 0; sent < count; sent++) { struct netmap_slot *slot = &ring->slot[head]; char *p = NETMAP_BUF(ring, slot->buf_idx); __builtin_prefetch(p); head = nm_ring_next(ring, head); } head = ring->head; } #endif for (sent = 0; sent < count && n >= frags; sent++, n--) { char *p; int buf_changed; u_int tosend = size; slot = &ring->slot[head]; p = NETMAP_BUF(ring, slot->buf_idx); buf_changed = slot->flags & NS_BUF_CHANGED; slot->flags = 0; if (options & OPT_RUBBISH) { /* do nothing */ } else if (options & OPT_INDIRECT) { slot->flags |= NS_INDIRECT; slot->ptr = (uint64_t)((uintptr_t)frame); } else if (frags > 1) { u_int i; const char *f = frame; char *fp = p; for (i = 0; i < frags - 1; i++) { memcpy(fp, f, frag_size); slot->len = frag_size; slot->flags = NS_MOREFRAG; if (options & OPT_DUMP) dump_payload(fp, frag_size, ring, head); tosend -= frag_size; f += frag_size; head = nm_ring_next(ring, head); slot = &ring->slot[head]; fp = NETMAP_BUF(ring, slot->buf_idx); } n -= (frags - 1); p = fp; slot->flags = 0; memcpy(p, f, tosend); update_addresses(pkt, t); } else if ((options & (OPT_COPY | OPT_MEMCPY)) || buf_changed) { if (options & OPT_COPY) nm_pkt_copy(frame, p, size); else memcpy(p, frame, size); update_addresses(pkt, t); } else if (options & OPT_PREFETCH) { __builtin_prefetch(p); } slot->len = tosend; if (options & OPT_DUMP) dump_payload(p, tosend, ring, head); head = nm_ring_next(ring, head); } if (sent) { slot->flags |= NS_REPORT; ring->head = ring->cur = head; } if (sent < count) { /* tell netmap that we need more slots */ ring->cur = ring->tail; } return (sent); } /* * Index of the highest bit set */ -uint32_t +static uint32_t msb64(uint64_t x) { uint64_t m = 1ULL << 63; int i; for (i = 63; i >= 0; i--, m >>=1) if (m & x) return i; return 0; } /* * wait until ts, either busy or sleeping if more than 1ms. * Return wakeup time. */ static struct timespec wait_time(struct timespec ts) { for (;;) { struct timespec w, cur; clock_gettime(CLOCK_REALTIME_PRECISE, &cur); w = timespec_sub(ts, cur); if (w.tv_sec < 0) return cur; else if (w.tv_sec > 0 || w.tv_nsec > 1000000) poll(NULL, 0, 1); } } /* * Send a packet, and wait for a response. * The payload (after UDP header, ofs 42) has a 4-byte sequence * followed by a struct timeval (or bintime?) 
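 Concretely, as implemented by ping_body()/pong_body() below: the
 sender writes its 8-byte sequence counter at offset 42 and then
 overwrites bytes 46..53 with a struct tstamp { uint32_t sec; uint32_t
 nsec; }, so the receiver effectively sees a 4-byte sequence at offset
 42 and the timestamp at offset 46. A minimum-size 60-byte Ethernet
 frame is therefore enough to carry the RTT probe.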
*/ static void * ping_body(void *data) { struct targ *targ = (struct targ *) data; struct pollfd pfd = { .fd = targ->fd, .events = POLLIN }; struct netmap_if *nifp = targ->nmd->nifp; int i, m, rx = 0; void *frame; int size; struct timespec ts, now, last_print; struct timespec nexttime = {0, 0}; /* silence compiler */ uint64_t sent = 0, n = targ->g->npackets; uint64_t count = 0, t_cur, t_min = ~0, av = 0; uint64_t g_min = ~0, g_av = 0; uint64_t buckets[64]; /* bins for delays, ns */ int rate_limit = targ->g->tx_rate, tosend = 0; frame = (char*)&targ->pkt + sizeof(targ->pkt.vh) - targ->g->virt_header; size = targ->g->pkt_size + targ->g->virt_header; if (targ->g->nthreads > 1) { D("can only ping with 1 thread"); return NULL; } bzero(&buckets, sizeof(buckets)); clock_gettime(CLOCK_REALTIME_PRECISE, &last_print); now = last_print; if (rate_limit) { targ->tic = timespec_add(now, (struct timespec){2,0}); targ->tic.tv_nsec = 0; wait_time(targ->tic); nexttime = targ->tic; } while (!targ->cancel && (n == 0 || sent < n)) { struct netmap_ring *ring = NETMAP_TXRING(nifp, targ->nmd->first_tx_ring); struct netmap_slot *slot; char *p; int rv; uint64_t limit, event = 0; if (rate_limit && tosend <= 0) { tosend = targ->g->burst; nexttime = timespec_add(nexttime, targ->g->tx_period); wait_time(nexttime); } limit = rate_limit ? tosend : targ->g->burst; if (n > 0 && n - sent < limit) limit = n - sent; for (m = 0; (unsigned)m < limit; m++) { slot = &ring->slot[ring->head]; slot->len = size; p = NETMAP_BUF(ring, slot->buf_idx); if (nm_ring_empty(ring)) { D("-- ouch, cannot send"); break; } else { struct tstamp *tp; nm_pkt_copy(frame, p, size); clock_gettime(CLOCK_REALTIME_PRECISE, &ts); bcopy(&sent, p+42, sizeof(sent)); tp = (struct tstamp *)(p+46); tp->sec = (uint32_t)ts.tv_sec; tp->nsec = (uint32_t)ts.tv_nsec; sent++; ring->head = ring->cur = nm_ring_next(ring, ring->head); } } if (m > 0) event++; targ->ctr.pkts = sent; targ->ctr.bytes = sent*size; targ->ctr.events = event; if (rate_limit) tosend -= m; #ifdef BUSYWAIT rv = ioctl(pfd.fd, NIOCTXSYNC, NULL); if (rv < 0) { D("TXSYNC error on queue %d: %s", targ->me, strerror(errno)); } again: ioctl(pfd.fd, NIOCRXSYNC, NULL); #else /* should use a parameter to decide how often to send */ if ( (rv = poll(&pfd, 1, 3000)) <= 0) { D("poll error on queue %d: %s", targ->me, (rv ? 
strerror(errno) : "timeout")); continue; } #endif /* BUSYWAIT */ /* see what we got back */ rx = 0; for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) { ring = NETMAP_RXRING(nifp, i); while (!nm_ring_empty(ring)) { uint32_t seq; struct tstamp *tp; int pos; slot = &ring->slot[ring->head]; p = NETMAP_BUF(ring, slot->buf_idx); clock_gettime(CLOCK_REALTIME_PRECISE, &now); bcopy(p+42, &seq, sizeof(seq)); tp = (struct tstamp *)(p+46); ts.tv_sec = (time_t)tp->sec; ts.tv_nsec = (long)tp->nsec; ts.tv_sec = now.tv_sec - ts.tv_sec; ts.tv_nsec = now.tv_nsec - ts.tv_nsec; if (ts.tv_nsec < 0) { ts.tv_nsec += 1000000000; ts.tv_sec--; } if (0) D("seq %d/%llu delta %d.%09d", seq, (unsigned long long)sent, (int)ts.tv_sec, (int)ts.tv_nsec); t_cur = ts.tv_sec * 1000000000UL + ts.tv_nsec; if (t_cur < t_min) t_min = t_cur; count ++; av += t_cur; pos = msb64(t_cur); buckets[pos]++; /* now store it in a bucket */ ring->head = ring->cur = nm_ring_next(ring, ring->head); rx++; } } //D("tx %d rx %d", sent, rx); //usleep(100000); ts.tv_sec = now.tv_sec - last_print.tv_sec; ts.tv_nsec = now.tv_nsec - last_print.tv_nsec; if (ts.tv_nsec < 0) { ts.tv_nsec += 1000000000; ts.tv_sec--; } if (ts.tv_sec >= 1) { D("count %d RTT: min %d av %d ns", (int)count, (int)t_min, (int)(av/count)); int k, j, kmin, off; char buf[512]; for (kmin = 0; kmin < 64; kmin ++) if (buckets[kmin]) break; for (k = 63; k >= kmin; k--) if (buckets[k]) break; buf[0] = '\0'; off = 0; for (j = kmin; j <= k; j++) { off += sprintf(buf + off, " %5d", (int)buckets[j]); } D("k: %d .. %d\n\t%s", 1<cancel) goto again; #endif /* BUSYWAIT */ } if (sent > 0) { D("RTT over %llu packets: min %d av %d ns", (long long unsigned)sent, (int)g_min, (int)((double)g_av/sent)); } targ->completed = 1; /* reset the ``used`` flag. */ targ->used = 0; return NULL; } /* * reply to ping requests */ static void * pong_body(void *data) { struct targ *targ = (struct targ *) data; struct pollfd pfd = { .fd = targ->fd, .events = POLLIN }; struct netmap_if *nifp = targ->nmd->nifp; struct netmap_ring *txring, *rxring; int i, rx = 0; uint64_t sent = 0, n = targ->g->npackets; if (targ->g->nthreads > 1) { D("can only reply ping with 1 thread"); return NULL; } if (n > 0) D("understood ponger %llu but don't know how to do it", (unsigned long long)n); while (!targ->cancel && (n == 0 || sent < n)) { uint32_t txhead, txavail; //#define BUSYWAIT #ifdef BUSYWAIT ioctl(pfd.fd, NIOCRXSYNC, NULL); #else int rv; if ( (rv = poll(&pfd, 1, 1000)) <= 0) { D("poll error on queue %d: %s", targ->me, rv ? strerror(errno) : "timeout"); continue; } #endif txring = NETMAP_TXRING(nifp, targ->nmd->first_tx_ring); txhead = txring->head; txavail = nm_ring_space(txring); /* see what we got back */ for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) { rxring = NETMAP_RXRING(nifp, i); while (!nm_ring_empty(rxring)) { uint16_t *spkt, *dpkt; uint32_t head = rxring->head; struct netmap_slot *slot = &rxring->slot[head]; char *src, *dst; src = NETMAP_BUF(rxring, slot->buf_idx); //D("got pkt %p of size %d", src, slot->len); rxring->head = rxring->cur = nm_ring_next(rxring, head); rx++; if (txavail == 0) continue; dst = NETMAP_BUF(txring, txring->slot[txhead].buf_idx); /* copy... 
*/ dpkt = (uint16_t *)dst; spkt = (uint16_t *)src; nm_pkt_copy(src, dst, slot->len); /* swap source and destination MAC */ dpkt[0] = spkt[3]; dpkt[1] = spkt[4]; dpkt[2] = spkt[5]; dpkt[3] = spkt[0]; dpkt[4] = spkt[1]; dpkt[5] = spkt[2]; txring->slot[txhead].len = slot->len; txhead = nm_ring_next(txring, txhead); txavail--; sent++; } } txring->head = txring->cur = txhead; targ->ctr.pkts = sent; #ifdef BUSYWAIT ioctl(pfd.fd, NIOCTXSYNC, NULL); #endif //D("tx %d rx %d", sent, rx); } targ->completed = 1; /* reset the ``used`` flag. */ targ->used = 0; return NULL; } static void * sender_body(void *data) { struct targ *targ = (struct targ *) data; struct pollfd pfd = { .fd = targ->fd, .events = POLLOUT }; struct netmap_if *nifp; struct netmap_ring *txring = NULL; int i; uint64_t n = targ->g->npackets / targ->g->nthreads; uint64_t sent = 0; uint64_t event = 0; int options = targ->g->options | OPT_COPY; struct timespec nexttime = { 0, 0}; // XXX silence compiler int rate_limit = targ->g->tx_rate; struct pkt *pkt = &targ->pkt; void *frame; int size; if (targ->frame == NULL) { frame = (char *)pkt + sizeof(pkt->vh) - targ->g->virt_header; size = targ->g->pkt_size + targ->g->virt_header; } else { frame = targ->frame; size = targ->g->pkt_size; } D("start, fd %d main_fd %d", targ->fd, targ->g->main_fd); if (setaffinity(targ->thread, targ->affinity)) goto quit; /* main loop.*/ clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic); if (rate_limit) { targ->tic = timespec_add(targ->tic, (struct timespec){2,0}); targ->tic.tv_nsec = 0; wait_time(targ->tic); nexttime = targ->tic; } if (targ->g->dev_type == DEV_TAP) { D("writing to file desc %d", targ->g->main_fd); for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) { if (write(targ->g->main_fd, frame, size) != -1) sent++; update_addresses(pkt, targ); if (i > 10000) { targ->ctr.pkts = sent; targ->ctr.bytes = sent*size; targ->ctr.events = sent; i = 0; } } #ifndef NO_PCAP } else if (targ->g->dev_type == DEV_PCAP) { pcap_t *p = targ->g->p; for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) { if (pcap_inject(p, frame, size) != -1) sent++; update_addresses(pkt, targ); if (i > 10000) { targ->ctr.pkts = sent; targ->ctr.bytes = sent*size; targ->ctr.events = sent; i = 0; } } #endif /* NO_PCAP */ } else { int tosend = 0; u_int bufsz, frag_size = targ->g->frag_size; nifp = targ->nmd->nifp; txring = NETMAP_TXRING(nifp, targ->nmd->first_tx_ring); bufsz = txring->nr_buf_size; if (bufsz < frag_size) frag_size = bufsz; targ->frag_size = targ->g->pkt_size / targ->frags; if (targ->frag_size > frag_size) { targ->frags = targ->g->pkt_size / frag_size; targ->frag_size = frag_size; if (targ->g->pkt_size % frag_size != 0) targ->frags++; } D("frags %u frag_size %u", targ->frags, targ->frag_size); while (!targ->cancel && (n == 0 || sent < n)) { int rv; if (rate_limit && tosend <= 0) { tosend = targ->g->burst; nexttime = timespec_add(nexttime, targ->g->tx_period); wait_time(nexttime); } /* * wait for available room in the send queue(s) */ #ifdef BUSYWAIT (void)rv; if (ioctl(pfd.fd, NIOCTXSYNC, NULL) < 0) { D("ioctl error on queue %d: %s", targ->me, strerror(errno)); goto quit; } #else /* !BUSYWAIT */ if ( (rv = poll(&pfd, 1, 2000)) <= 0) { if (targ->cancel) break; D("poll error on queue %d: %s", targ->me, rv ? 
strerror(errno) : "timeout"); // goto quit; } if (pfd.revents & POLLERR) { D("poll error on %d ring %d-%d", pfd.fd, targ->nmd->first_tx_ring, targ->nmd->last_tx_ring); goto quit; } #endif /* !BUSYWAIT */ /* * scan our queues and send on those with room */ if (options & OPT_COPY && sent > 100000 && !(targ->g->options & OPT_COPY) ) { D("drop copy"); options &= ~OPT_COPY; } for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) { int m; uint64_t limit = rate_limit ? tosend : targ->g->burst; if (n > 0 && n == sent) break; if (n > 0 && n - sent < limit) limit = n - sent; txring = NETMAP_TXRING(nifp, i); if (nm_ring_empty(txring)) continue; if (targ->g->pkt_min_size > 0) { size = nrand48(targ->seed) % (targ->g->pkt_size - targ->g->pkt_min_size) + targ->g->pkt_min_size; } m = send_packets(txring, pkt, frame, size, targ, limit, options); ND("limit %lu tail %d m %d", limit, txring->tail, m); sent += m; if (m > 0) //XXX-ste: can m be 0? event++; targ->ctr.pkts = sent; targ->ctr.bytes += m*size; targ->ctr.events = event; if (rate_limit) { tosend -= m; if (tosend <= 0) break; } } } /* flush any remaining packets */ if (txring != NULL) { D("flush tail %d head %d on thread %p", txring->tail, txring->head, (void *)pthread_self()); ioctl(pfd.fd, NIOCTXSYNC, NULL); } /* final part: wait all the TX queues to be empty. */ for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) { txring = NETMAP_TXRING(nifp, i); while (!targ->cancel && nm_tx_pending(txring)) { RD(5, "pending tx tail %d head %d on ring %d", txring->tail, txring->head, i); ioctl(pfd.fd, NIOCTXSYNC, NULL); usleep(1); /* wait 1 tick */ } } } /* end DEV_NETMAP */ clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc); targ->completed = 1; targ->ctr.pkts = sent; targ->ctr.bytes = sent*size; targ->ctr.events = event; quit: /* reset the ``used`` flag. */ targ->used = 0; return (NULL); } #ifndef NO_PCAP static void receive_pcap(u_char *user, const struct pcap_pkthdr * h, const u_char * bytes) { struct my_ctrs *ctr = (struct my_ctrs *)user; (void)bytes; /* UNUSED */ ctr->bytes += h->len; ctr->pkts++; } #endif /* !NO_PCAP */ static int receive_packets(struct netmap_ring *ring, u_int limit, int dump, uint64_t *bytes) { u_int head, rx, n; uint64_t b = 0; u_int complete = 0; if (bytes == NULL) bytes = &b; head = ring->head; n = nm_ring_space(ring); if (n < limit) limit = n; for (rx = 0; rx < limit; rx++) { struct netmap_slot *slot = &ring->slot[head]; char *p = NETMAP_BUF(ring, slot->buf_idx); *bytes += slot->len; if (dump) dump_payload(p, slot->len, ring, head); if (!(slot->flags & NS_MOREFRAG)) complete++; head = nm_ring_next(ring, head); } ring->head = ring->cur = head; return (complete); } static void * receiver_body(void *data) { struct targ *targ = (struct targ *) data; struct pollfd pfd = { .fd = targ->fd, .events = POLLIN }; struct netmap_if *nifp; struct netmap_ring *rxring; int i; struct my_ctrs cur; memset(&cur, 0, sizeof(cur)); if (setaffinity(targ->thread, targ->affinity)) goto quit; D("reading from %s fd %d main_fd %d", targ->g->ifname, targ->fd, targ->g->main_fd); /* unbounded wait for the first packet. 
*/ for (;!targ->cancel;) { i = poll(&pfd, 1, 1000); if (i > 0 && !(pfd.revents & POLLERR)) break; if (i < 0) { D("poll() error: %s", strerror(errno)); goto quit; } if (pfd.revents & POLLERR) { D("fd error"); goto quit; } RD(1, "waiting for initial packets, poll returns %d %d", i, pfd.revents); } /* main loop, exit after 1s silence */ clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic); if (targ->g->dev_type == DEV_TAP) { while (!targ->cancel) { char buf[MAX_BODYSIZE]; /* XXX should we poll ? */ i = read(targ->g->main_fd, buf, sizeof(buf)); if (i > 0) { targ->ctr.pkts++; targ->ctr.bytes += i; targ->ctr.events++; } } #ifndef NO_PCAP } else if (targ->g->dev_type == DEV_PCAP) { while (!targ->cancel) { /* XXX should we poll ? */ pcap_dispatch(targ->g->p, targ->g->burst, receive_pcap, (u_char *)&targ->ctr); targ->ctr.events++; } #endif /* !NO_PCAP */ } else { int dump = targ->g->options & OPT_DUMP; nifp = targ->nmd->nifp; while (!targ->cancel) { /* Once we started to receive packets, wait at most 1 second before quitting. */ #ifdef BUSYWAIT if (ioctl(pfd.fd, NIOCRXSYNC, NULL) < 0) { D("ioctl error on queue %d: %s", targ->me, strerror(errno)); goto quit; } #else /* !BUSYWAIT */ if (poll(&pfd, 1, 1 * 1000) <= 0 && !targ->g->forever) { clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc); targ->toc.tv_sec -= 1; /* Subtract timeout time. */ goto out; } if (pfd.revents & POLLERR) { D("poll err"); goto quit; } #endif /* !BUSYWAIT */ uint64_t cur_space = 0; for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) { int m; rxring = NETMAP_RXRING(nifp, i); /* compute free space in the ring */ m = rxring->head + rxring->num_slots - rxring->tail; if (m >= (int) rxring->num_slots) m -= rxring->num_slots; cur_space += m; if (nm_ring_empty(rxring)) continue; m = receive_packets(rxring, targ->g->burst, dump, &cur.bytes); cur.pkts += m; if (m > 0) cur.events++; } cur.min_space = targ->ctr.min_space; if (cur_space < cur.min_space) cur.min_space = cur_space; targ->ctr = cur; } } clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc); #if !defined(BUSYWAIT) out: #endif targ->completed = 1; targ->ctr = cur; quit: /* reset the ``used`` flag. */ targ->used = 0; return (NULL); } static void * txseq_body(void *data) { struct targ *targ = (struct targ *) data; struct pollfd pfd = { .fd = targ->fd, .events = POLLOUT }; struct netmap_ring *ring; int64_t sent = 0; uint64_t event = 0; int options = targ->g->options | OPT_COPY; struct timespec nexttime = {0, 0}; int rate_limit = targ->g->tx_rate; struct pkt *pkt = &targ->pkt; int frags = targ->g->frags; uint32_t sequence = 0; int budget = 0; void *frame; int size; if (targ->g->nthreads > 1) { D("can only txseq ping with 1 thread"); return NULL; } if (targ->g->npackets > 0) { D("Ignoring -n argument"); } frame = (char *)pkt + sizeof(pkt->vh) - targ->g->virt_header; size = targ->g->pkt_size + targ->g->virt_header; D("start, fd %d main_fd %d", targ->fd, targ->g->main_fd); if (setaffinity(targ->thread, targ->affinity)) goto quit; clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic); if (rate_limit) { targ->tic = timespec_add(targ->tic, (struct timespec){2,0}); targ->tic.tv_nsec = 0; wait_time(targ->tic); nexttime = targ->tic; }
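/*
 * note (editor): the rate limiter works with absolute deadlines: `nexttime`
 * advances by one tx_period per burst and wait_time() (defined earlier)
 * sleeps in 1 ms steps, busy-waiting the residual, so scheduling jitter
 * does not accumulate and the long-run rate converges to tx_rate.
 */
/* Only use the first queue. 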
*/ ring = NETMAP_TXRING(targ->nmd->nifp, targ->nmd->first_tx_ring); while (!targ->cancel) { int64_t limit; unsigned int space; unsigned int head; int fcnt; uint16_t sum = 0; int rv; if (!rate_limit) { budget = targ->g->burst; } else if (budget <= 0) { budget = targ->g->burst; nexttime = timespec_add(nexttime, targ->g->tx_period); wait_time(nexttime); } /* wait for available room in the send queue */ #ifdef BUSYWAIT (void)rv; if (ioctl(pfd.fd, NIOCTXSYNC, NULL) < 0) { D("ioctl error on queue %d: %s", targ->me, strerror(errno)); goto quit; } #else /* !BUSYWAIT */ if ( (rv = poll(&pfd, 1, 2000)) <= 0) { if (targ->cancel) break; D("poll error on queue %d: %s", targ->me, rv ? strerror(errno) : "timeout"); // goto quit; } if (pfd.revents & POLLERR) { D("poll error on %d ring %d-%d", pfd.fd, targ->nmd->first_tx_ring, targ->nmd->last_tx_ring); goto quit; } #endif /* !BUSYWAIT */ /* If no room poll() again. */ space = nm_ring_space(ring); if (!space) { continue; } limit = budget; if (space < limit) { limit = space; } /* Cut off ``limit`` to make sure it is a multiple of ``frags``. */ if (frags > 1) { limit = (limit / frags) * frags; } limit = sent + limit; /* Convert to absolute. */ for (fcnt = frags, head = ring->head; sent < limit; sent++, sequence++) { struct netmap_slot *slot = &ring->slot[head]; char *p = NETMAP_BUF(ring, slot->buf_idx); uint16_t *w = (uint16_t *)PKT(pkt, body, targ->g->af), t; memcpy(&sum, targ->g->af == AF_INET ? &pkt->ipv4.udp.uh_sum : &pkt->ipv6.udp.uh_sum, sizeof(sum)); slot->flags = 0; t = *w; PKT(pkt, body, targ->g->af)[0] = sequence >> 24; PKT(pkt, body, targ->g->af)[1] = (sequence >> 16) & 0xff; sum = ~cksum_add(~sum, cksum_add(~t, *w)); t = *++w; PKT(pkt, body, targ->g->af)[2] = (sequence >> 8) & 0xff; PKT(pkt, body, targ->g->af)[3] = sequence & 0xff; sum = ~cksum_add(~sum, cksum_add(~t, *w)); memcpy(targ->g->af == AF_INET ? &pkt->ipv4.udp.uh_sum : &pkt->ipv6.udp.uh_sum, &sum, sizeof(sum)); nm_pkt_copy(frame, p, size); if (fcnt == frags) { update_addresses(pkt, targ); } if (options & OPT_DUMP) { dump_payload(p, size, ring, head); } slot->len = size; if (--fcnt > 0) { slot->flags |= NS_MOREFRAG; } else { fcnt = frags; } if (sent == limit - 1) { /* Make sure we don't push an incomplete * packet. */ assert(!(slot->flags & NS_MOREFRAG)); slot->flags |= NS_REPORT; } head = nm_ring_next(ring, head); if (rate_limit) { budget--; } } ring->cur = ring->head = head; event ++; targ->ctr.pkts = sent; targ->ctr.bytes = sent * size; targ->ctr.events = event; } /* flush any remaining packets */ D("flush tail %d head %d on thread %p", ring->tail, ring->head, (void *)pthread_self()); ioctl(pfd.fd, NIOCTXSYNC, NULL); /* final part: wait for the TX queues to become empty. */ while (!targ->cancel && nm_tx_pending(ring)) { RD(5, "pending tx tail %d head %d on ring %d", ring->tail, ring->head, targ->nmd->first_tx_ring); ioctl(pfd.fd, NIOCTXSYNC, NULL); usleep(1); /* wait 1 tick */ } clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc); targ->completed = 1; targ->ctr.pkts = sent; targ->ctr.bytes = sent * size; targ->ctr.events = event;
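/*
 * Editor's note: the sequence stamping above keeps the UDP checksum valid
 * incrementally, RFC 1624 style: when a 16-bit payload word changes from
 * `old` to `new`, sum' = ~(~sum + ~old + new) in ones-complement
 * arithmetic, which is the sum = ~cksum_add(~sum, cksum_add(~t, *w)) dance
 * in the loop.  The same update with explicit carry folding (a sketch;
 * `csum`, `old` and `new16` are illustrative names, not pkt-gen variables):
 */
#if 0
	uint32_t s = (uint32_t)(uint16_t)~csum + (uint16_t)~old + new16;

	s = (s & 0xffff) + (s >> 16);	/* fold the carries back in; two  */
	s = (s & 0xffff) + (s >> 16);	/* folds cover the worst case     */
	csum = (uint16_t)~s;
#endif

quit: /* reset the ``used`` flag. 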
*/ targ->used = 0; return (NULL); } static char * multi_slot_to_string(struct netmap_ring *ring, unsigned int head, unsigned int nfrags, char *strbuf, size_t strbuflen) { unsigned int f; char *ret = strbuf; for (f = 0; f < nfrags; f++) { struct netmap_slot *slot = &ring->slot[head]; int m = snprintf(strbuf, strbuflen, "|%u,%x|", slot->len, slot->flags); if (m >= (int)strbuflen) { break; } strbuf += m; strbuflen -= m; head = nm_ring_next(ring, head); } return ret; } static void * rxseq_body(void *data) { struct targ *targ = (struct targ *) data; struct pollfd pfd = { .fd = targ->fd, .events = POLLIN }; int dump = targ->g->options & OPT_DUMP; struct netmap_ring *ring; unsigned int frags_exp = 1; struct my_ctrs cur; unsigned int frags = 0; int first_packet = 1; int first_slot = 1; int i, j, af, nrings; uint32_t seq, *seq_exp = NULL; memset(&cur, 0, sizeof(cur)); if (setaffinity(targ->thread, targ->affinity)) goto quit; nrings = targ->nmd->last_rx_ring - targ->nmd->first_rx_ring + 1; seq_exp = calloc(nrings, sizeof(uint32_t)); if (seq_exp == NULL) { D("failed to allocate seq array"); goto quit; } D("reading from %s fd %d main_fd %d", targ->g->ifname, targ->fd, targ->g->main_fd); /* unbounded wait for the first packet. */ for (;!targ->cancel;) { i = poll(&pfd, 1, 1000); if (i > 0 && !(pfd.revents & POLLERR)) break; RD(1, "waiting for initial packets, poll returns %d %d", i, pfd.revents); } clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic); while (!targ->cancel) { unsigned int head; int limit; #ifdef BUSYWAIT if (ioctl(pfd.fd, NIOCRXSYNC, NULL) < 0) { D("ioctl error on queue %d: %s", targ->me, strerror(errno)); goto quit; } #else /* !BUSYWAIT */ if (poll(&pfd, 1, 1 * 1000) <= 0 && !targ->g->forever) { clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc); targ->toc.tv_sec -= 1; /* Subtract timeout time. */ goto out; } if (pfd.revents & POLLERR) { D("poll err"); goto quit; } #endif /* !BUSYWAIT */ for (j = targ->nmd->first_rx_ring; j <= targ->nmd->last_rx_ring; j++) { ring = NETMAP_RXRING(targ->nmd->nifp, j); if (nm_ring_empty(ring)) continue; limit = nm_ring_space(ring); if (limit > targ->g->burst) limit = targ->g->burst; #if 0 /* Enable this if * 1) we remove the early-return optimization from * the netmap poll implementation, or * 2) pipes get NS_MOREFRAG support. * With the current netmap implementation, an experiment like * pkt-gen -i vale:1{1 -f txseq -F 9 * pkt-gen -i vale:1}1 -f rxseq * would get stuck as soon as we find nm_ring_space(ring) < 9, * since here limit is rounded to 0 and * pipe rxsync is not called anymore by the poll() of this loop. */ if (frags_exp > 1) { int o = limit; /* Cut off to the closest smaller multiple. 
*/ limit = (limit / frags_exp) * frags_exp; RD(2, "LIMIT %d --> %d", o, limit); } #endif for (head = ring->head, i = 0; i < limit; i++) { struct netmap_slot *slot = &ring->slot[head]; char *p = NETMAP_BUF(ring, slot->buf_idx); int len = slot->len; struct pkt *pkt; if (dump) { dump_payload(p, slot->len, ring, head); } frags++; if (!(slot->flags & NS_MOREFRAG)) { if (first_packet) { first_packet = 0; } else if (frags != frags_exp) { char prbuf[512]; RD(1, "Received packets with %u frags, " "expected %u, '%s'", frags, frags_exp, multi_slot_to_string(ring, head-frags+1, frags, prbuf, sizeof(prbuf))); } first_packet = 0; frags_exp = frags; frags = 0; } p -= sizeof(pkt->vh) - targ->g->virt_header; len += sizeof(pkt->vh) - targ->g->virt_header; pkt = (struct pkt *)p; if (ntohs(pkt->eh.ether_type) == ETHERTYPE_IP) af = AF_INET; else af = AF_INET6; if ((char *)pkt + len < ((char *)PKT(pkt, body, af)) + sizeof(seq)) { RD(1, "%s: packet too small (len=%u)", __func__, slot->len); } else { seq = (PKT(pkt, body, af)[0] << 24) | (PKT(pkt, body, af)[1] << 16) | (PKT(pkt, body, af)[2] << 8) | PKT(pkt, body, af)[3]; if (first_slot) { /* Grab the first one, whatever it is. */ seq_exp[j] = seq; first_slot = 0; } else if (seq != seq_exp[j]) { uint32_t delta = seq - seq_exp[j]; if (delta < (0xFFFFFFFF >> 1)) { RD(2, "Sequence GAP: exp %u found %u", seq_exp[j], seq); } else { RD(2, "Sequence OUT OF ORDER: " "exp %u found %u", seq_exp[j], seq); } seq_exp[j] = seq; } seq_exp[j]++; } cur.bytes += slot->len; head = nm_ring_next(ring, head); cur.pkts++; } ring->cur = ring->head = head; cur.events++; targ->ctr = cur; } } clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc); #ifndef BUSYWAIT out: #endif /* !BUSYWAIT */ targ->completed = 1; targ->ctr = cur; quit: if (seq_exp != NULL) free(seq_exp); /* reset the ``used`` flag. */ targ->used = 0; return (NULL); } static void tx_output(struct glob_arg *g, struct my_ctrs *cur, double delta, const char *msg) { double bw, raw_bw, pps, abs; char b1[40], b2[80], b3[80]; int size; if (cur->pkts == 0) { printf("%s nothing.\n", msg); return; } size = (int)(cur->bytes / cur->pkts); printf("%s %llu packets %llu bytes %llu events %d bytes each in %.2f seconds.\n", msg, (unsigned long long)cur->pkts, (unsigned long long)cur->bytes, (unsigned long long)cur->events, size, delta); if (delta == 0) delta = 1e-6; if (size < 60) /* correct for min packet size */ size = 60; pps = cur->pkts / delta; bw = (8.0 * cur->bytes) / delta; raw_bw = (8.0 * cur->bytes + cur->pkts * g->framing) / delta; abs = cur->pkts / (double)(cur->events); printf("Speed: %spps Bandwidth: %sbps (raw %sbps). Average batch: %.2f pkts\n", norm(b1, pps, normalize), norm(b2, bw, normalize), norm(b3, raw_bw, normalize), abs); } static void usage(int errcode) { /* This usage is generated from the pkt-gen man page: * $ man pkt-gen > x * and pasted here adding the string terminators and endlines with simple * regular expressions. */ const char *cmd = "pkt-gen"; fprintf(stderr, "Usage:\n" "%s arguments\n" " -h Show program usage and exit.\n" "\n" " -i interface\n" " Name of the network interface that pkt-gen operates on. It can be a system network interface\n" " (e.g., em0), the name of a vale(4) port (e.g., valeSSS:PPP), the name of a netmap pipe or\n" " monitor, or any valid netmap port name accepted by the nm_open library function, as docu-\n" " mented in netmap(4) (NIOCREGIF section).\n" "\n" " -f function\n" " The function to be executed by pkt-gen. 
Specify tx for transmission, rx for reception, ping\n" " for client-side ping-pong operation, and pong for server-side ping-pong operation.\n" "\n" " -n count\n" " Number of iterations of the pkt-gen function (with 0 meaning infinite). In case of tx or rx,\n" " count is the number of packets to receive or transmit. In case of ping or pong, count is the\n" " number of ping-pong transactions.\n" "\n" " -l pkt_size\n" " Packet size in bytes excluding CRC. If passed a second time, use random sizes larger or\n" " equal than the second one and lower than the first one.\n" "\n" " -b burst_size\n" " Transmit or receive up to burst_size packets at a time.\n" "\n" " -4 Use IPv4 addresses.\n" "\n" " -6 Use IPv6 addresses.\n" "\n" " -d dst_ip[:port[-dst_ip:port]]\n" " Destination IPv4/IPv6 address and port, single or range.\n" "\n" " -s src_ip[:port[-src_ip:port]]\n" " Source IPv4/IPv6 address and port, single or range.\n" "\n" " -D dst_mac\n" " Destination MAC address in colon notation (e.g., aa:bb:cc:dd:ee:00).\n" "\n" " -S src_mac\n" " Source MAC address in colon notation.\n" "\n" " -a cpu_id\n" " Pin the first thread of pkt-gen to a particular CPU using pthread_setaffinity_np(3). If more\n" " threads are used, they are pinned to the subsequent CPUs, one per thread.\n" "\n" " -c cpus\n" " Maximum number of CPUs to use (0 means to use all the available ones).\n" "\n" " -p threads\n" " Number of threads to use. By default, only a single thread is used to handle all the netmap\n" " rings. If threads is larger than one, each thread handles a single TX ring (in tx mode), a\n" " single RX ring (in rx mode), or a TX/RX ring pair. The number of threads must be less than or\n" " equal to the number of TX (or RX) rings available in the device specified by interface.\n" "\n" " -T report_ms\n" " Number of milliseconds between reports.\n" "\n" " -w wait_for_link_time\n" " Number of seconds to wait before starting the pkt-gen function, useful to make sure that the\n" " network link is up. A network device driver may take some time to enter netmap mode, or to\n" " create a new transmit/receive ring pair when netmap(4) requests one.\n" "\n" " -R rate\n" " Packet transmission rate. Not setting the packet transmission rate tells pkt-gen to transmit\n" " packets as quickly as possible. On servers from 2010 onward netmap(4) is able to com-\n" " pletely use all of the bandwidth of a 10 or 40Gbps link, so this option should be used unless\n" " your intention is to saturate the link.\n" "\n" " -X Dump payload of each packet transmitted or received.\n" "\n" " -H len Add empty virtio-net-header with size 'len'. Valid sizes are 0, 10 and 12. This option is\n" " only used with Virtual Machine technologies that use virtio as a network interface.\n" "\n" " -P file\n" " Load the packet to be transmitted from a pcap file rather than constructing it within\n" " pkt-gen.\n" "\n" " -z Use random IPv4/IPv6 src address/port.\n" "\n" " -Z Use random IPv4/IPv6 dst address/port.\n" "\n" " -N Do not normalize units (i.e., use bps, pps instead of Mbps, Kpps, etc.).\n" "\n" " -F num_frags\n" " Send multi-slot packets, each one with num_frags fragments. A multi-slot packet is repre-\n" " sented by two or more consecutive netmap slots with the NS_MOREFRAG flag set (except for the\n" " last slot). 
This is useful to transmit or receive packets larger than the netmap buffer\n" " size.\n" "\n" " -M frag_size\n" " In multi-slot mode, frag_size specifies the size of each fragment, if smaller than the packet\n" " length divided by num_frags.\n" "\n" " -I Use indirect buffers. It is only valid for transmitting on VALE ports, and it is implemented\n" " by setting the NS_INDIRECT flag in the netmap slots.\n" "\n" " -W Exit immediately if all the RX rings are empty the first time they are examined.\n" "\n" " -v Increase the verbosity level.\n" "\n" " -r In tx mode, do not initialize packets, but send whatever the content of the uninitialized\n" " netmap buffers is (rubbish mode).\n" "\n" " -A Compute mean and standard deviation (over a sliding window) for the transmit or receive rate.\n" "\n" " -B Take Ethernet framing and CRC into account when computing the average bps. This adds 4 bytes\n" " of CRC and 20 bytes of framing to each packet.\n" "\n" " -C tx_slots[,rx_slots[,tx_rings[,rx_rings]]]\n" " Configuration in terms of number of rings and slots to be used when opening the netmap port.\n" " Such configuration has an effect on software ports created on the fly, such as VALE ports and\n" " netmap pipes. The configuration may consist of 1 to 4 numbers separated by commas: tx_slots,\n" " rx_slots, tx_rings, rx_rings. Missing numbers or zeroes stand for default values. As an\n" " additional convenience, if exactly one number is specified, then this is assigned to both\n" " tx_slots and rx_slots. If there is no fourth number, then the third one is assigned to both\n" " tx_rings and rx_rings.\n" "\n" " -o options data generation options (parsed using atoi)\n" " OPT_PREFETCH 1\n" " OPT_ACCESS 2\n" " OPT_COPY 4\n" " OPT_MEMCPY 8\n" " OPT_TS 16 (add a timestamp)\n" " OPT_INDIRECT 32 (use indirect buffers)\n" " OPT_DUMP 64 (dump rx/tx traffic)\n" " OPT_RUBBISH 256\n" " (send whatever the buffers contain)\n" " OPT_RANDOM_SRC 512\n" " OPT_RANDOM_DST 1024\n" " OPT_PPS_STATS 2048\n" "", cmd); exit(errcode); } static void start_threads(struct glob_arg *g) { int i; targs = calloc(g->nthreads, sizeof(*targs)); struct targ *t; /* * Now create the desired number of threads, each one * using a single descriptor. */ for (i = 0; i < g->nthreads; i++) { uint64_t seed = time(0) | (time(0) << 32); t = &targs[i]; bzero(t, sizeof(*t)); t->fd = -1; /* default, with pcap */ t->g = g; memcpy(t->seed, &seed, sizeof(t->seed)); if (g->dev_type == DEV_NETMAP) { struct nm_desc nmd = *g->nmd; /* copy, we overwrite ringid */ uint64_t nmd_flags = 0; nmd.self = &nmd; if (i > 0) { /* the first thread uses the fd opened by the main * thread, the other threads re-open /dev/netmap */ if (g->nthreads > 1) { nmd.req.nr_flags = g->nmd->req.nr_flags & ~NR_REG_MASK; nmd.req.nr_flags |= NR_REG_ONE_NIC; nmd.req.nr_ringid = i; } /* Only touch one of the rings (rx is already ok) */ if (g->td_type == TD_TYPE_RECEIVER) nmd_flags |= NETMAP_NO_TX_POLL; /* register interface. Override ifname and ringid etc. */ t->nmd = nm_open(t->g->ifname, NULL, nmd_flags | NM_OPEN_IFNAME | NM_OPEN_NO_MMAP, &nmd); if (t->nmd == NULL) { D("Unable to open %s: %s", t->g->ifname, strerror(errno)); continue; } } else { t->nmd = g->nmd; } t->fd = t->nmd->fd; t->frags = g->frags; } else { targs[i].fd = g->main_fd; } t->used = 1; t->me = i; if (g->affinity >= 0) { t->affinity = (g->affinity + i) % g->cpus; } else { t->affinity = -1; } /* default, init packets */ initialize_packet(t); } /* Wait for PHY reset. 
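*/

/*
 * Editor's note: each worker pins itself through the setaffinity() helper;
 * on FreeBSD that boils down to roughly the following (illustrative sketch,
 * assuming <sys/cpuset.h> and <pthread_np.h>):
 */
#if 0
	cpuset_t cs;

	CPU_ZERO(&cs);
	CPU_SET(t->affinity, &cs);	/* one CPU per thread */
	pthread_setaffinity_np(pthread_self(), sizeof(cs), &cs);
#endif

/* Now wait for the PHY reset and start the threads: 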
*/ D("Wait %d secs for phy reset", g->wait_link); sleep(g->wait_link); D("Ready..."); for (i = 0; i < g->nthreads; i++) { t = &targs[i]; if (pthread_create(&t->thread, NULL, g->td_body, t) == -1) { D("Unable to create thread %d: %s", i, strerror(errno)); t->used = 0; } } } static void main_thread(struct glob_arg *g) { int i; struct my_ctrs prev, cur; double delta_t; struct timeval tic, toc; prev.pkts = prev.bytes = prev.events = 0; gettimeofday(&prev.t, NULL); for (;;) { char b1[40], b2[40], b3[40], b4[100]; uint64_t pps, usec; struct my_ctrs x; double abs; int done = 0; usec = wait_for_next_report(&prev.t, &cur.t, g->report_interval); cur.pkts = cur.bytes = cur.events = 0; cur.min_space = 0; if (usec < 10000) /* too short to be meaningful */ continue; /* accumulate counts for all threads */ for (i = 0; i < g->nthreads; i++) { cur.pkts += targs[i].ctr.pkts; cur.bytes += targs[i].ctr.bytes; cur.events += targs[i].ctr.events; cur.min_space += targs[i].ctr.min_space; targs[i].ctr.min_space = 99999; if (targs[i].used == 0) done++; } x.pkts = cur.pkts - prev.pkts; x.bytes = cur.bytes - prev.bytes; x.events = cur.events - prev.events; pps = (x.pkts*1000000 + usec/2) / usec; abs = (x.events > 0) ? (x.pkts / (double) x.events) : 0; if (!(g->options & OPT_PPS_STATS)) { strcpy(b4, ""); } else { /* Compute some pps stats using a sliding window. */ double ppsavg = 0.0, ppsdev = 0.0; int nsamples = 0; g->win[g->win_idx] = pps; g->win_idx = (g->win_idx + 1) % STATS_WIN; for (i = 0; i < STATS_WIN; i++) { ppsavg += g->win[i]; if (g->win[i]) { nsamples ++; } } ppsavg /= nsamples; for (i = 0; i < STATS_WIN; i++) { if (g->win[i] == 0) { continue; } ppsdev += (g->win[i] - ppsavg) * (g->win[i] - ppsavg); } ppsdev /= nsamples; ppsdev = sqrt(ppsdev); snprintf(b4, sizeof(b4), "[avg/std %s/%s pps]", norm(b1, ppsavg, normalize), norm(b2, ppsdev, normalize)); } D("%spps %s(%spkts %sbps in %llu usec) %.2f avg_batch %d min_space", norm(b1, pps, normalize), b4, norm(b2, (double)x.pkts, normalize), norm(b3, 1000000*((double)x.bytes*8+(double)x.pkts*g->framing)/usec, normalize), (unsigned long long)usec, abs, (int)cur.min_space); prev = cur; if (done == g->nthreads) break; } timerclear(&tic); timerclear(&toc); cur.pkts = cur.bytes = cur.events = 0; /* final round */ for (i = 0; i < g->nthreads; i++) { struct timespec t_tic, t_toc; /* * Join active threads, unregister interfaces and close * file descriptors. */ if (targs[i].used) pthread_join(targs[i].thread, NULL); /* blocking */ if (g->dev_type == DEV_NETMAP) { nm_close(targs[i].nmd); targs[i].nmd = NULL; } else { close(targs[i].fd); } if (targs[i].completed == 0) D("ouch, thread %d exited with error", i); /* * Collect threads output and extract information about * how long it took to send all the packets. */ cur.pkts += targs[i].ctr.pkts; cur.bytes += targs[i].ctr.bytes; cur.events += targs[i].ctr.events; /* collect the largest start (tic) and end (toc) times, * XXX maybe we should do the earliest tic, or do a weighted * average ? */ t_tic = timeval2spec(&tic); t_toc = timeval2spec(&toc); if (!timerisset(&tic) || timespec_ge(&targs[i].tic, &t_tic)) tic = timespec2val(&targs[i].tic); if (!timerisset(&toc) || timespec_ge(&targs[i].toc, &t_toc)) toc = timespec2val(&targs[i].toc); } /* print output. 
*/ timersub(&toc, &tic, &toc); delta_t = toc.tv_sec + 1e-6* toc.tv_usec; if (g->td_type == TD_TYPE_SENDER) tx_output(g, &cur, delta_t, "Sent"); else if (g->td_type == TD_TYPE_RECEIVER) tx_output(g, &cur, delta_t, "Received"); } struct td_desc { int ty; - char *key; + const char *key; void *f; int default_burst; }; static struct td_desc func[] = { { TD_TYPE_RECEIVER, "rx", receiver_body, 512}, /* default */ { TD_TYPE_SENDER, "tx", sender_body, 512 }, { TD_TYPE_OTHER, "ping", ping_body, 1 }, { TD_TYPE_OTHER, "pong", pong_body, 1 }, { TD_TYPE_SENDER, "txseq", txseq_body, 512 }, { TD_TYPE_RECEIVER, "rxseq", rxseq_body, 512 }, { 0, NULL, NULL, 0 } }; static int tap_alloc(char *dev) { struct ifreq ifr; int fd, err; - char *clonedev = TAP_CLONEDEV; + const char *clonedev = TAP_CLONEDEV; (void)err; (void)dev; /* Arguments taken by the function: * * char *dev: the name of an interface (or '\0'). MUST have enough * space to hold the interface name if '\0' is passed * int flags: interface flags (eg, IFF_TUN etc.) */ #ifdef __FreeBSD__ if (dev[3]) { /* tapSomething */ static char buf[128]; snprintf(buf, sizeof(buf), "/dev/%s", dev); clonedev = buf; } #endif /* open the device */ if( (fd = open(clonedev, O_RDWR)) < 0 ) { return fd; } D("%s open successful", clonedev); /* preparation of the struct ifr, of type "struct ifreq" */ memset(&ifr, 0, sizeof(ifr)); #ifdef linux ifr.ifr_flags = IFF_TAP | IFF_NO_PI; if (*dev) { /* if a device name was specified, put it in the structure; otherwise, * the kernel will try to allocate the "next" device of the * specified type */ size_t len = strlen(dev); if (len > IFNAMSIZ) { D("%s too long", dev); return -1; } memcpy(ifr.ifr_name, dev, len); } /* try to create the device */ if( (err = ioctl(fd, TUNSETIFF, (void *) &ifr)) < 0 ) { D("failed to do a TUNSETIFF: %s", strerror(errno)); close(fd); return err; } /* if the operation was successful, write back the name of the * interface to the variable "dev", so the caller can know * it. 
Note that the caller MUST reserve space in *dev (see calling * code below) */ strcpy(dev, ifr.ifr_name); D("new name is %s", dev); #endif /* linux */ /* this is the special file descriptor that the caller will use to talk * with the virtual interface */ return fd; } int main(int arc, char **argv) { int i; struct sigaction sa; sigset_t ss; struct glob_arg g; int ch; int devqueues = 1; /* how many device queues */ int wait_link_arg = 0; int pkt_size_done = 0; struct td_desc *fn = func; bzero(&g, sizeof(g)); g.main_fd = -1; g.td_body = fn->f; g.td_type = fn->ty; g.report_interval = 1000; /* report interval */ g.affinity = -1; /* ip addresses can also be a range x.x.x.x-x.x.x.y */ g.af = AF_INET; /* default */ g.src_ip.name = "10.0.0.1"; g.dst_ip.name = "10.1.0.1"; g.dst_mac.name = "ff:ff:ff:ff:ff:ff"; g.src_mac.name = NULL; g.pkt_size = 60; g.pkt_min_size = 0; g.nthreads = 1; g.cpus = 1; /* default */ g.forever = 1; g.tx_rate = 0; g.frags = 1; g.frag_size = (u_int)-1; /* use the netmap buffer size by default */ g.nmr_config = ""; g.virt_header = 0; g.wait_link = 2; /* wait 2 seconds for physical ports */ while ((ch = getopt(arc, argv, "46a:f:F:Nn:i:Il:d:s:D:S:b:c:o:p:" "T:w:WvR:XC:H:rP:zZAhBM:")) != -1) { switch(ch) { default: D("bad option %c %s", ch, optarg); usage(-1); break; case 'h': usage(0); break; case '4': g.af = AF_INET; break; case '6': g.af = AF_INET6; break; case 'N': normalize = 0; break; case 'n': g.npackets = strtoull(optarg, NULL, 10); break; case 'F': i = atoi(optarg); if (i < 1 || i > 63) { D("invalid frags %d [1..63], ignore", i); break; } g.frags = i; break; case 'M': g.frag_size = atoi(optarg); break; case 'f': for (fn = func; fn->key; fn++) { if (!strcmp(fn->key, optarg)) break; } if (fn->key) { g.td_body = fn->f; g.td_type = fn->ty; } else { D("unrecognised function %s", optarg); } break; case 'o': /* data generation options */ g.options |= atoi(optarg); break; case 'a': /* force affinity */ g.affinity = atoi(optarg); break; case 'i': /* interface */ /* a prefix of tap: netmap: or pcap: forces the mode. 
* otherwise we guess */ D("interface is %s", optarg); if (strlen(optarg) > MAX_IFNAMELEN - 8) { D("ifname too long %s", optarg); break; } strcpy(g.ifname, optarg); if (!strcmp(optarg, "null")) { g.dev_type = DEV_NETMAP; g.dummy_send = 1; } else if (!strncmp(optarg, "tap:", 4)) { g.dev_type = DEV_TAP; strcpy(g.ifname, optarg + 4); } else if (!strncmp(optarg, "pcap:", 5)) { g.dev_type = DEV_PCAP; strcpy(g.ifname, optarg + 5); } else if (!strncmp(optarg, "netmap:", 7) || !strncmp(optarg, "vale", 4)) { g.dev_type = DEV_NETMAP; } else if (!strncmp(optarg, "tap", 3)) { g.dev_type = DEV_TAP; } else { /* prepend netmap: */ g.dev_type = DEV_NETMAP; sprintf(g.ifname, "netmap:%s", optarg); } break; case 'I': g.options |= OPT_INDIRECT; /* use indirect buffers */ break; case 'l': /* pkt_size */ if (pkt_size_done) { g.pkt_min_size = atoi(optarg); } else { g.pkt_size = atoi(optarg); pkt_size_done = 1; } break; case 'd': g.dst_ip.name = optarg; break; case 's': g.src_ip.name = optarg; break; case 'T': /* report interval */ g.report_interval = atoi(optarg); break; case 'w': g.wait_link = atoi(optarg); wait_link_arg = 1; break; case 'W': g.forever = 0; /* exit RX with no traffic */ break; case 'b': /* burst */ g.burst = atoi(optarg); break; case 'c': g.cpus = atoi(optarg); break; case 'p': g.nthreads = atoi(optarg); break; case 'D': /* destination mac */ g.dst_mac.name = optarg; break; case 'S': /* source mac */ g.src_mac.name = optarg; break; case 'v': verbose++; break; case 'R': g.tx_rate = atoi(optarg); break; case 'X': g.options |= OPT_DUMP; break; case 'C': D("WARNING: the 'C' option is deprecated, use the '+conf:' libnetmap option instead"); g.nmr_config = strdup(optarg); break; case 'H': g.virt_header = atoi(optarg); break; case 'P': g.packet_file = strdup(optarg); break; case 'r': g.options |= OPT_RUBBISH; break; case 'z': g.options |= OPT_RANDOM_SRC; break; case 'Z': g.options |= OPT_RANDOM_DST; break; case 'A': g.options |= OPT_PPS_STATS; break; case 'B': /* raw packets have 4 bytes crc + 20 bytes framing */ // XXX maybe add an option to pass the IFG g.framing = 24 * 8; break; } } if (strlen(g.ifname) <=0 ) { D("missing ifname"); usage(-1); } if (g.burst == 0) { g.burst = fn->default_burst; D("using default burst size: %d", g.burst); } g.system_cpus = i = system_ncpus(); if (g.cpus < 0 || g.cpus > i) { D("%d cpus is too high, have only %d cpus", g.cpus, i); usage(-1); } D("running on %d cpus (have %d)", g.cpus, i); if (g.cpus == 0) g.cpus = i; if (!wait_link_arg && !strncmp(g.ifname, "vale", 4)) { g.wait_link = 0; } if (g.pkt_size < 16 || g.pkt_size > MAX_PKTSIZE) { D("bad pktsize %d [16..%d]\n", g.pkt_size, MAX_PKTSIZE); usage(-1); } if (g.pkt_min_size > 0 && (g.pkt_min_size < 16 || g.pkt_min_size > g.pkt_size)) { D("bad pktminsize %d [16..%d]\n", g.pkt_min_size, g.pkt_size); usage(-1); } if (g.src_mac.name == NULL) { static char mybuf[20] = "00:00:00:00:00:00"; /* retrieve source mac address. 
*/ if (source_hwaddr(g.ifname, mybuf) == -1) { D("Unable to retrieve source mac"); // continue, fail later } g.src_mac.name = mybuf; } /* extract address ranges */ if (extract_mac_range(&g.src_mac) || extract_mac_range(&g.dst_mac)) usage(-1); g.options |= extract_ip_range(&g.src_ip, g.af); g.options |= extract_ip_range(&g.dst_ip, g.af); if (g.virt_header != 0 && g.virt_header != VIRT_HDR_1 && g.virt_header != VIRT_HDR_2) { D("bad virtio-net-header length"); usage(-1); } if (g.dev_type == DEV_TAP) { D("want to use tap %s", g.ifname); g.main_fd = tap_alloc(g.ifname); if (g.main_fd < 0) { D("cannot open tap %s", g.ifname); usage(-1); } #ifndef NO_PCAP } else if (g.dev_type == DEV_PCAP) { char pcap_errbuf[PCAP_ERRBUF_SIZE]; pcap_errbuf[0] = '\0'; // init the buffer g.p = pcap_open_live(g.ifname, 256 /* XXX */, 1, 100, pcap_errbuf); if (g.p == NULL) { D("cannot open pcap on %s", g.ifname); usage(-1); } g.main_fd = pcap_fileno(g.p); D("using pcap on %s fileno %d", g.ifname, g.main_fd); #endif /* !NO_PCAP */ } else if (g.dummy_send) { /* but DEV_NETMAP */ D("using a dummy send routine"); } else { struct nm_desc base_nmd; char errmsg[MAXERRMSG]; u_int flags; bzero(&base_nmd, sizeof(base_nmd)); parse_nmr_config(g.nmr_config, &base_nmd.req); base_nmd.req.nr_flags |= NR_ACCEPT_VNET_HDR; if (nm_parse(g.ifname, &base_nmd, errmsg) < 0) { D("Invalid name '%s': %s", g.ifname, errmsg); goto out; } /* * Open the netmap device using nm_open(). This may detach the * interface from the protocol stack and may cause a reset of the card, * which in turn may take some time for the PHY to * reconfigure. We do the open here to have time to reset. */ flags = NM_OPEN_IFNAME | NM_OPEN_ARG1 | NM_OPEN_ARG2 | NM_OPEN_ARG3 | NM_OPEN_RING_CFG; if (g.nthreads > 1) { base_nmd.req.nr_flags &= ~NR_REG_MASK; base_nmd.req.nr_flags |= NR_REG_ONE_NIC; base_nmd.req.nr_ringid = 0; } g.nmd = nm_open(g.ifname, NULL, flags, &base_nmd); if (g.nmd == NULL) { D("Unable to open %s: %s", g.ifname, strerror(errno)); goto out; } g.main_fd = g.nmd->fd; D("mapped %luKB at %p", (unsigned long)(g.nmd->req.nr_memsize>>10), g.nmd->mem); if (g.virt_header) { /* Set the virtio-net header length, since the user asked * for it explicitly. */ set_vnet_hdr_len(&g); } else { /* Check whether the netmap port we opened requires us to send * and receive frames with virtio-net header. */ get_vnet_hdr_len(&g); } /* get num of queues in tx or rx */ if (g.td_type == TD_TYPE_SENDER) devqueues = g.nmd->req.nr_tx_rings; else devqueues = g.nmd->req.nr_rx_rings; /* validate provided nthreads. */ if (g.nthreads < 1 || g.nthreads > devqueues) { D("bad nthreads %d, have %d queues", g.nthreads, devqueues); // continue, fail later } if (g.td_type == TD_TYPE_SENDER) { int mtu = get_if_mtu(&g); if (mtu > 0 && g.pkt_size > mtu) { D("pkt_size (%d) must be <= mtu (%d)", g.pkt_size, mtu); return -1; } } if (verbose) { struct netmap_if *nifp = g.nmd->nifp; struct nmreq *req = &g.nmd->req; D("nifp at offset %d, %d tx %d rx region %d", req->nr_offset, req->nr_tx_rings, req->nr_rx_rings, req->nr_arg2); for (i = 0; i <= req->nr_tx_rings; i++) { struct netmap_ring *ring = NETMAP_TXRING(nifp, i); D(" TX%d at 0x%p slots %d", i, (void *)((char *)ring - (char *)nifp), ring->num_slots); } for (i = 0; i <= req->nr_rx_rings; i++) { struct netmap_ring *ring = NETMAP_RXRING(nifp, i); D(" RX%d at 0x%p slots %d", i, (void *)((char *)ring - (char *)nifp), ring->num_slots); } } /* Print some debug information. */ fprintf(stdout, "%s %s: %d queues, %d threads and %d cpus.\n", (g.td_type == TD_TYPE_SENDER) ? 
"Sending on" : ((g.td_type == TD_TYPE_RECEIVER) ? "Receiving from" : "Working on"), g.ifname, devqueues, g.nthreads, g.cpus); if (g.td_type == TD_TYPE_SENDER) { fprintf(stdout, "%s -> %s (%s -> %s)\n", g.src_ip.name, g.dst_ip.name, g.src_mac.name, g.dst_mac.name); } out: /* Exit if something went wrong. */ if (g.main_fd < 0) { D("aborting"); usage(-1); } } if (g.options) { D("--- SPECIAL OPTIONS:%s%s%s%s%s%s\n", g.options & OPT_PREFETCH ? " prefetch" : "", g.options & OPT_ACCESS ? " access" : "", g.options & OPT_MEMCPY ? " memcpy" : "", g.options & OPT_INDIRECT ? " indirect" : "", g.options & OPT_COPY ? " copy" : "", g.options & OPT_RUBBISH ? " rubbish " : ""); } g.tx_period.tv_sec = g.tx_period.tv_nsec = 0; if (g.tx_rate > 0) { /* try to have at least something every second, * reducing the burst size to some 0.01s worth of data * (but no less than one full set of fragments) */ uint64_t x; int lim = (g.tx_rate)/300; if (g.burst > lim) g.burst = lim; if (g.burst == 0) g.burst = 1; x = ((uint64_t)1000000000 * (uint64_t)g.burst) / (uint64_t) g.tx_rate; g.tx_period.tv_nsec = x; g.tx_period.tv_sec = g.tx_period.tv_nsec / 1000000000; g.tx_period.tv_nsec = g.tx_period.tv_nsec % 1000000000; } if (g.td_type == TD_TYPE_SENDER) D("Sending %d packets every %ld.%09ld s", g.burst, g.tx_period.tv_sec, g.tx_period.tv_nsec); /* Install ^C handler. */ global_nthreads = g.nthreads; sigemptyset(&ss); sigaddset(&ss, SIGINT); /* block SIGINT now, so that all created threads will inherit the mask */ if (pthread_sigmask(SIG_BLOCK, &ss, NULL) < 0) { D("failed to block SIGINT: %s", strerror(errno)); } start_threads(&g); /* Install the handler and re-enable SIGINT for the main thread */ memset(&sa, 0, sizeof(sa)); sa.sa_handler = sigint_h; if (sigaction(SIGINT, &sa, NULL) < 0) { D("failed to install ^C handler: %s", strerror(errno)); } if (pthread_sigmask(SIG_UNBLOCK, &ss, NULL) < 0) { D("failed to re-enable SIGINT: %s", strerror(errno)); } main_thread(&g); free(targs); return 0; } /* end of file */ Index: stable/12/tools/tools/netmap/pkt_hash.c =================================================================== --- stable/12/tools/tools/netmap/pkt_hash.c (revision 366497) +++ stable/12/tools/tools/netmap/pkt_hash.c (revision 366498) @@ -1,396 +1,396 @@ /* ** Copyright (c) 2015, Asim Jamshed, Robin Sommer, Seth Hall ** and the International Computer Science Institute. All rights reserved. ** ** Redistribution and use in source and binary forms, with or without ** modification, are permitted provided that the following conditions are met: ** ** (1) Redistributions of source code must retain the above copyright ** notice, this list of conditions and the following disclaimer. ** ** (2) Redistributions in binary form must reproduce the above copyright ** notice, this list of conditions and the following disclaimer in the ** documentation and/or other materials provided with the distribution. ** ** ** THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ** AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ** IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ** ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE ** LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ** CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ** SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS ** INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN ** CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ** ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE ** POSSIBILITY OF SUCH DAMAGE. **/ /* $FreeBSD$ */ /* for func prototypes */ #include "pkt_hash.h" /* Make Linux headers choose BSD versions of some of the data structures */ #define __FAVOR_BSD /* for types */ #include /* for [n/h]to[h/n][ls] */ #include /* iphdr */ #include /* ipv6hdr */ #include /* tcphdr */ #include /* udphdr */ #include /* eth hdr */ #include /* for memset */ #include #include #include //#include /*---------------------------------------------------------------------*/ /** * * The cache table is used to pick a nice seed for the hash value. It is * * built only once when sym_hash_fn is called for the very first time * */ static void build_sym_key_cache(uint32_t *cache, int cache_len) { static const uint8_t key[] = { 0x50, 0x6d }; uint32_t result = (((uint32_t)key[0]) << 24) | (((uint32_t)key[1]) << 16) | (((uint32_t)key[0]) << 8) | ((uint32_t)key[1]); uint32_t idx = 32; int i; for (i = 0; i < cache_len; i++, idx++) { uint8_t shift = (idx % 8); uint32_t bit; cache[i] = result; bit = ((key[(idx/8) & 1] << shift) & 0x80) ? 1 : 0; result = ((result << 1) | bit); } } static void build_byte_cache(uint32_t byte_cache[256][4]) { #define KEY_CACHE_LEN 96 int i, j, k; uint32_t key_cache[KEY_CACHE_LEN]; build_sym_key_cache(key_cache, KEY_CACHE_LEN); for (i = 0; i < 4; i++) { for (j = 0; j < 256; j++) { uint8_t b = j; byte_cache[j][i] = 0; for (k = 0; k < 8; k++) { if (b & 0x80) byte_cache[j][i] ^= key_cache[8 * i + k]; b <<= 1U; } } } } /*---------------------------------------------------------------------*/ /** ** Computes symmetric hash based on the 4-tuple header data **/ static uint32_t sym_hash_fn(uint32_t sip, uint32_t dip, uint16_t sp, uint32_t dp) { uint32_t rc = 0; static int first_time = 1; static uint32_t byte_cache[256][4]; uint8_t *sip_b = (uint8_t *)&sip, *dip_b = (uint8_t *)&dip, *sp_b = (uint8_t *)&sp, *dp_b = (uint8_t *)&dp; if (first_time) { build_byte_cache(byte_cache); first_time = 0; } rc = byte_cache[sip_b[3]][0] ^ byte_cache[sip_b[2]][1] ^ byte_cache[sip_b[1]][2] ^ byte_cache[sip_b[0]][3] ^ byte_cache[dip_b[3]][0] ^ byte_cache[dip_b[2]][1] ^ byte_cache[dip_b[1]][2] ^ byte_cache[dip_b[0]][3] ^ byte_cache[sp_b[1]][0] ^ byte_cache[sp_b[0]][1] ^ byte_cache[dp_b[1]][2] ^ byte_cache[dp_b[0]][3]; return rc; } static uint32_t decode_gre_hash(const uint8_t *, uint8_t, uint8_t); /*---------------------------------------------------------------------*/ /** ** Parser + hash function for the IPv4 packet **/ static uint32_t -decode_ip_n_hash(struct ip *iph, uint8_t hash_split, uint8_t seed) +decode_ip_n_hash(const struct ip *iph, uint8_t hash_split, uint8_t seed) { uint32_t rc = 0; if (hash_split == 2) { rc = sym_hash_fn(ntohl(iph->ip_src.s_addr), ntohl(iph->ip_dst.s_addr), ntohs(0xFFFD) + seed, ntohs(0xFFFE) + seed); } else { - struct tcphdr *tcph = NULL; - struct udphdr *udph = NULL; + const struct tcphdr *tcph = NULL; + const struct udphdr *udph = NULL; switch (iph->ip_p) { case IPPROTO_TCP: - tcph = (struct tcphdr *)((uint8_t *)iph + (iph->ip_hl<<2)); + tcph = (const 
struct tcphdr *)((const uint8_t *)iph + (iph->ip_hl<<2)); rc = sym_hash_fn(ntohl(iph->ip_src.s_addr), ntohl(iph->ip_dst.s_addr), ntohs(tcph->th_sport) + seed, ntohs(tcph->th_dport) + seed); break; case IPPROTO_UDP: - udph = (struct udphdr *)((uint8_t *)iph + (iph->ip_hl<<2)); + udph = (const struct udphdr *)((const uint8_t *)iph + (iph->ip_hl<<2)); rc = sym_hash_fn(ntohl(iph->ip_src.s_addr), ntohl(iph->ip_dst.s_addr), ntohs(udph->uh_sport) + seed, ntohs(udph->uh_dport) + seed); break; case IPPROTO_IPIP: /* tunneling */ - rc = decode_ip_n_hash((struct ip *)((uint8_t *)iph + (iph->ip_hl<<2)), + rc = decode_ip_n_hash((const struct ip *)((const uint8_t *)iph + (iph->ip_hl<<2)), hash_split, seed); break; case IPPROTO_GRE: - rc = decode_gre_hash((uint8_t *)iph + (iph->ip_hl<<2), + rc = decode_gre_hash((const uint8_t *)iph + (iph->ip_hl<<2), hash_split, seed); break; case IPPROTO_ICMP: case IPPROTO_ESP: case IPPROTO_PIM: case IPPROTO_IGMP: default: /* ** the hash strength (although weaker but) should still hold ** even with 2 fields **/ rc = sym_hash_fn(ntohl(iph->ip_src.s_addr), ntohl(iph->ip_dst.s_addr), ntohs(0xFFFD) + seed, ntohs(0xFFFE) + seed); break; } } return rc; } /*---------------------------------------------------------------------*/ /** ** Parser + hash function for the IPv6 packet **/ static uint32_t -decode_ipv6_n_hash(struct ip6_hdr *ipv6h, uint8_t hash_split, uint8_t seed) +decode_ipv6_n_hash(const struct ip6_hdr *ipv6h, uint8_t hash_split, uint8_t seed) { uint32_t saddr, daddr; uint32_t rc = 0; /* Get only the first 4 octets */ saddr = ipv6h->ip6_src.s6_addr[0] | (ipv6h->ip6_src.s6_addr[1] << 8) | (ipv6h->ip6_src.s6_addr[2] << 16) | (ipv6h->ip6_src.s6_addr[3] << 24); daddr = ipv6h->ip6_dst.s6_addr[0] | (ipv6h->ip6_dst.s6_addr[1] << 8) | (ipv6h->ip6_dst.s6_addr[2] << 16) | (ipv6h->ip6_dst.s6_addr[3] << 24); if (hash_split == 2) { rc = sym_hash_fn(ntohl(saddr), ntohl(daddr), ntohs(0xFFFD) + seed, ntohs(0xFFFE) + seed); } else { - struct tcphdr *tcph = NULL; - struct udphdr *udph = NULL; + const struct tcphdr *tcph = NULL; + const struct udphdr *udph = NULL; switch(ntohs(ipv6h->ip6_ctlun.ip6_un1.ip6_un1_nxt)) { case IPPROTO_TCP: - tcph = (struct tcphdr *)(ipv6h + 1); + tcph = (const struct tcphdr *)(ipv6h + 1); rc = sym_hash_fn(ntohl(saddr), ntohl(daddr), ntohs(tcph->th_sport) + seed, ntohs(tcph->th_dport) + seed); break; case IPPROTO_UDP: - udph = (struct udphdr *)(ipv6h + 1); + udph = (const struct udphdr *)(ipv6h + 1); rc = sym_hash_fn(ntohl(saddr), ntohl(daddr), ntohs(udph->uh_sport) + seed, ntohs(udph->uh_dport) + seed); break; case IPPROTO_IPIP: /* tunneling */ - rc = decode_ip_n_hash((struct ip *)(ipv6h + 1), + rc = decode_ip_n_hash((const struct ip *)(ipv6h + 1), hash_split, seed); break; case IPPROTO_IPV6: /* tunneling */ - rc = decode_ipv6_n_hash((struct ip6_hdr *)(ipv6h + 1), + rc = decode_ipv6_n_hash((const struct ip6_hdr *)(ipv6h + 1), hash_split, seed); break; case IPPROTO_GRE: - rc = decode_gre_hash((uint8_t *)(ipv6h + 1), hash_split, seed); + rc = decode_gre_hash((const uint8_t *)(ipv6h + 1), hash_split, seed); break; case IPPROTO_ICMP: case IPPROTO_ESP: case IPPROTO_PIM: case IPPROTO_IGMP: default: /* ** the hash strength (although weaker but) should still hold ** even with 2 fields **/ rc = sym_hash_fn(ntohl(saddr), ntohl(daddr), ntohs(0xFFFD) + seed, ntohs(0xFFFE) + seed); } } return rc; } /*---------------------------------------------------------------------*/ /** * * A temp solution while hash for other protocols are filled... 
* * (See decode_vlan_n_hash & pkt_hdr_hash functions). * */ static uint32_t -decode_others_n_hash(struct ether_header *ethh, uint8_t seed) +decode_others_n_hash(const struct ether_header *ethh, uint8_t seed) { uint32_t saddr, daddr, rc; saddr = ethh->ether_shost[5] | (ethh->ether_shost[4] << 8) | (ethh->ether_shost[3] << 16) | (ethh->ether_shost[2] << 24); daddr = ethh->ether_dhost[5] | (ethh->ether_dhost[4] << 8) | (ethh->ether_dhost[3] << 16) | (ethh->ether_dhost[2] << 24); rc = sym_hash_fn(ntohl(saddr), ntohl(daddr), ntohs(0xFFFD) + seed, ntohs(0xFFFE) + seed); return rc; } /*---------------------------------------------------------------------*/ /** ** Parser + hash function for VLAN packet **/ static inline uint32_t -decode_vlan_n_hash(struct ether_header *ethh, uint8_t hash_split, uint8_t seed) +decode_vlan_n_hash(const struct ether_header *ethh, uint8_t hash_split, uint8_t seed) { uint32_t rc = 0; - struct vlanhdr *vhdr = (struct vlanhdr *)(ethh + 1); + const struct vlanhdr *vhdr = (const struct vlanhdr *)(ethh + 1); switch (ntohs(vhdr->proto)) { case ETHERTYPE_IP: - rc = decode_ip_n_hash((struct ip *)(vhdr + 1), + rc = decode_ip_n_hash((const struct ip *)(vhdr + 1), hash_split, seed); break; case ETHERTYPE_IPV6: - rc = decode_ipv6_n_hash((struct ip6_hdr *)(vhdr + 1), + rc = decode_ipv6_n_hash((const struct ip6_hdr *)(vhdr + 1), hash_split, seed); break; case ETHERTYPE_ARP: default: /* others */ rc = decode_others_n_hash(ethh, seed); break; } return rc; } /*---------------------------------------------------------------------*/ /** ** General parser + hash function... **/ uint32_t pkt_hdr_hash(const unsigned char *buffer, uint8_t hash_split, uint8_t seed) { uint32_t rc = 0; - struct ether_header *ethh = (struct ether_header *)buffer; + const struct ether_header *ethh = (const struct ether_header *)buffer; switch (ntohs(ethh->ether_type)) { case ETHERTYPE_IP: - rc = decode_ip_n_hash((struct ip *)(ethh + 1), + rc = decode_ip_n_hash((const struct ip *)(ethh + 1), hash_split, seed); break; case ETHERTYPE_IPV6: - rc = decode_ipv6_n_hash((struct ip6_hdr *)(ethh + 1), + rc = decode_ipv6_n_hash((const struct ip6_hdr *)(ethh + 1), hash_split, seed); break; case ETHERTYPE_VLAN: rc = decode_vlan_n_hash(ethh, hash_split, seed); break; case ETHERTYPE_ARP: default: /* others */ rc = decode_others_n_hash(ethh, seed); break; } return rc; } /*---------------------------------------------------------------------*/ /** ** Parser + hash function for the GRE packet **/ static uint32_t decode_gre_hash(const uint8_t *grehdr, uint8_t hash_split, uint8_t seed) { uint32_t rc = 0; int len = 4 + 2 * (!!(*grehdr & 1) + /* Checksum */ !!(*grehdr & 2) + /* Routing */ !!(*grehdr & 4) + /* Key */ !!(*grehdr & 8)); /* Sequence Number */ - uint16_t proto = ntohs(*(uint16_t *)(void *)(grehdr + 2)); + uint16_t proto = ntohs(*(const uint16_t *)(const void *)(grehdr + 2)); switch (proto) { case ETHERTYPE_IP: - rc = decode_ip_n_hash((struct ip *)(grehdr + len), + rc = decode_ip_n_hash((const struct ip *)(grehdr + len), hash_split, seed); break; case ETHERTYPE_IPV6: - rc = decode_ipv6_n_hash((struct ip6_hdr *)(grehdr + len), + rc = decode_ipv6_n_hash((const struct ip6_hdr *)(grehdr + len), hash_split, seed); break; case 0x6558: /* Transparent Ethernet Bridging */ rc = pkt_hdr_hash(grehdr + len, hash_split, seed); break; default: /* others */ break; } return rc; } /*---------------------------------------------------------------------*/ Index: stable/12 =================================================================== --- 
stable/12 (revision 366497) +++ stable/12 (revision 366498) Property changes on: stable/12 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r366393