Index: head/sbin/ipfw/dummynet.c =================================================================== --- head/sbin/ipfw/dummynet.c +++ head/sbin/ipfw/dummynet.c @@ -1,4 +1,11 @@ /* + * Codel/FQ_Codel and PIE/FQ_PIE Code: + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi + * * Copyright (c) 2002-2003,2010 Luigi Rizzo * * Redistribution and use in source forms, with and without modification, @@ -15,6 +22,7 @@ * dummynet support */ +#define NEW_AQM #include #include /* XXX there are several sysctl leftover here */ @@ -22,6 +30,10 @@ #include "ipfw2.h" +#ifdef NEW_AQM +#include +#endif + #include #include #include @@ -59,6 +71,12 @@ { "ecn", TOK_ECN }, { "red", TOK_RED }, { "gred", TOK_GRED }, +#ifdef NEW_AQM + { "codel", TOK_CODEL}, /* Codel AQM */ + { "fq_codel", TOK_FQ_CODEL}, /* FQ-Codel */ + { "pie", TOK_PIE}, /* PIE AQM */ + { "fq_pie", TOK_FQ_PIE}, /* FQ-PIE */ +#endif { "bw", TOK_BW }, { "bandwidth", TOK_BW }, { "delay", TOK_DELAY }, @@ -81,6 +99,32 @@ { NULL, 0 } /* terminator */ }; +#ifdef NEW_AQM +/* AQM/extra sched parameters tokens*/ +static struct _s_x aqm_params[] = { + { "target", TOK_TARGET}, + { "interval", TOK_INTERVAL}, + { "limit", TOK_LIMIT}, + { "flows", TOK_FLOWS}, + { "quantum", TOK_QUANTUM}, + { "ecn", TOK_ECN}, + { "noecn", TOK_NO_ECN}, + { "tupdate", TOK_TUPDATE}, + { "max_burst", TOK_MAX_BURST}, + { "max_ecnth", TOK_MAX_ECNTH}, + { "alpha", TOK_ALPHA}, + { "beta", TOK_BETA}, + { "capdrop", TOK_CAPDROP}, + { "nocapdrop", TOK_NO_CAPDROP}, + { "onoff", TOK_ONOFF}, + { "dre", TOK_DRE}, + { "ts", TOK_TS}, + { "derand", TOK_DERAND}, + { "noderand", TOK_NO_DERAND}, + { NULL, 0 } /* terminator */ +}; +#endif + #define O_NEXT(p, len) ((void *)((char *)p + len)) static void @@ -102,6 +146,214 @@ return ret; } +#ifdef NEW_AQM + +/* Codel flags */ +enum { + CODEL_ECN_ENABLED = 1 +}; + +/* PIE flags, from PIE kernel module */ +enum { + PIE_ECN_ENABLED = 1, + PIE_CAPDROP_ENABLED = 2, + PIE_ON_OFF_MODE_ENABLED = 4, + PIE_DEPRATEEST_ENABLED = 8, + PIE_DERAND_ENABLED = 16 +}; + +#define PIE_FIX_POINT_BITS 13 +#define PIE_SCALE (1L<15) + return -1; + for (i = 0; ioid, l, DN_CMD_GET, DN_API_VERSION); + ep->oid.len = l; + ep->oid.subtype = subtype; + ep->nr = nr; + + ret = do_cmd(-IP_DUMMYNET3, ep, (uintptr_t)&l); + if (ret) { + free(ep); + errx(EX_DATAERR, "Error getting extra parameters\n"); + } + + switch (subtype) { + case DN_AQM_PARAMS: + if( !strcasecmp(ep->name, "codel")) { + us_to_time(ep->par[0], strt1); + us_to_time(ep->par[1], strt2); + l = sprintf(out, " AQM CoDel target %s interval %s", + strt1, strt2); + if (ep->par[2] & CODEL_ECN_ENABLED) + l = sprintf(out + l, " ECN"); + else + l += sprintf(out + l, " NoECN"); + } else if( !strcasecmp(ep->name, "pie")) { + us_to_time(ep->par[0], strt1); + us_to_time(ep->par[1], strt2); + us_to_time(ep->par[2], strt3); + l = sprintf(out, " AQM type PIE target %s tupdate %s alpha " + "%g beta %g max_burst %s max_ecnth %.3g", + strt1, + strt2, + ep->par[4] / (float) PIE_SCALE, + ep->par[5] / (float) PIE_SCALE, + strt3, + ep->par[3] / (float) PIE_SCALE + ); + + if (ep->par[6] & PIE_ECN_ENABLED) + l += sprintf(out + l, " ECN"); + else + l += sprintf(out + l, " NoECN"); + if (ep->par[6] & PIE_CAPDROP_ENABLED) + l += sprintf(out + l, " CapDrop"); + else + l += sprintf(out + l, " NoCapDrop"); + if (ep->par[6] & PIE_ON_OFF_MODE_ENABLED) + l += sprintf(out + l, " OnOff"); + if (ep->par[6] & PIE_DEPRATEEST_ENABLED) + l += sprintf(out + l, " DRE"); + else + l += sprintf(out + l, " TS"); + if (ep->par[6] & PIE_DERAND_ENABLED) + l += sprintf(out + l, " Derand"); + else + l += sprintf(out + l, " NoDerand"); + } + break; + + case DN_SCH_PARAMS: + if (!strcasecmp(ep->name,"FQ_CODEL")) { + us_to_time(ep->par[0], strt1); + us_to_time(ep->par[1], strt2); + l = sprintf(out," FQ_CODEL target %s interval %s" + " quantum %jd limit %jd flows %jd", + strt1, strt2, + (intmax_t) ep->par[3], + (intmax_t) ep->par[4], + (intmax_t) ep->par[5] + ); + if (ep->par[2] & CODEL_ECN_ENABLED) + l += sprintf(out + l, " ECN"); + else + l += sprintf(out + l, " NoECN"); + l += sprintf(out + l, "\n"); + } else if (!strcasecmp(ep->name,"FQ_PIE")) { + us_to_time(ep->par[0], strt1); + us_to_time(ep->par[1], strt2); + us_to_time(ep->par[2], strt3); + l = sprintf(out, " FQ_PIE target %s tupdate %s alpha " + "%g beta %g max_burst %s max_ecnth %.3g" + " quantum %jd limit %jd flows %jd", + strt1, + strt2, + ep->par[4] / (float) PIE_SCALE, + ep->par[5] / (float) PIE_SCALE, + strt3, + ep->par[3] / (float) PIE_SCALE, + (intmax_t) ep->par[7], + (intmax_t) ep->par[8], + (intmax_t) ep->par[9] + ); + + if (ep->par[6] & PIE_ECN_ENABLED) + l += sprintf(out + l, " ECN"); + else + l += sprintf(out + l, " NoECN"); + if (ep->par[6] & PIE_CAPDROP_ENABLED) + l += sprintf(out + l, " CapDrop"); + else + l += sprintf(out + l, " NoCapDrop"); + if (ep->par[6] & PIE_ON_OFF_MODE_ENABLED) + l += sprintf(out + l, " OnOff"); + if (ep->par[6] & PIE_DEPRATEEST_ENABLED) + l += sprintf(out + l, " DRE"); + else + l += sprintf(out + l, " TS"); + if (ep->par[6] & PIE_DERAND_ENABLED) + l += sprintf(out + l, " Derand"); + else + l += sprintf(out + l, " NoDerand"); + l += sprintf(out + l, "\n"); + } + break; + } + + free(ep); +} +#endif + + #if 0 static int sort_q(void *arg, const void *pa, const void *pb) @@ -221,7 +473,7 @@ int l; char qs[30]; char plr[30]; - char red[90]; /* Display RED parameters */ + char red[200]; /* Display RED parameters */ l = fs->qsize; if (fs->flags & DN_QSIZE_BYTES) { @@ -246,6 +498,11 @@ 1.0 * fs->max_p / (double)(1 << SCALE_RED)); if (fs->flags & DN_IS_ECN) strncat(red, " (ecn)", 6); +#ifdef NEW_AQM + /* get AQM parameters */ + } else if (fs->flags & DN_IS_AQM) { + get_extra_parms(fs->fs_nr, red, DN_AQM_PARAMS); +#endif } else sprintf(red, "droptail"); @@ -338,6 +595,11 @@ printf(" sched %d type %s flags 0x%x %d buckets %d active\n", s->sched_nr, s->name, s->flags, s->buckets, s->oid.id); +#ifdef NEW_AQM + char parms[200]; + get_extra_parms(s->sched_nr, parms, DN_SCH_PARAMS); + printf("%s",parms); +#endif if (s->flags & DN_HAVE_MASK) print_mask(&s->sched_mask); } @@ -745,6 +1007,242 @@ strncpy(p->name, profile_name, sizeof(p->name)); } +#ifdef NEW_AQM + +/* Parse AQM/extra scheduler parameters */ +static int +process_extra_parms(int *ac, char **av, struct dn_extra_parms *ep, + uint16_t type) +{ + int i; + + /* use kernel defaults */ + for (i=0; ipar[i] = -1; + + switch(type) { + case TOK_CODEL: + case TOK_FQ_CODEL: + /* Codel + * 0- target, 1- interval, 2- flags, + * FQ_CODEL + * 3- quantum, 4- limit, 5- flows + */ + if (type==TOK_CODEL) + ep->par[2] = 0; + else + ep->par[2] = CODEL_ECN_ENABLED; + + while (*ac > 0) { + int tok = match_token(aqm_params, *av); + (*ac)--; av++; + switch(tok) { + case TOK_TARGET: + if (*ac <= 0 || time_to_us(av[0]) < 0) + errx(EX_DATAERR, "target needs time\n"); + + ep->par[0] = time_to_us(av[0]); + (*ac)--; av++; + break; + + case TOK_INTERVAL: + if (*ac <= 0 || time_to_us(av[0]) < 0) + errx(EX_DATAERR, "interval needs time\n"); + + ep->par[1] = time_to_us(av[0]); + (*ac)--; av++; + break; + + case TOK_ECN: + ep->par[2] = CODEL_ECN_ENABLED; + break; + case TOK_NO_ECN: + ep->par[2] &= ~CODEL_ECN_ENABLED; + break; + /* Config fq_codel parameters */ + case TOK_QUANTUM: + if (type != TOK_FQ_CODEL) + errx(EX_DATAERR, "quantum is not for codel\n"); + if (*ac <= 0 || !is_valid_number(av[0])) + errx(EX_DATAERR, "quantum needs number\n"); + + ep->par[3]= atoi(av[0]); + (*ac)--; av++; + break; + + case TOK_LIMIT: + if (type != TOK_FQ_CODEL) + errx(EX_DATAERR, "limit is not for codel, use queue instead\n"); + if (*ac <= 0 || !is_valid_number(av[0])) + errx(EX_DATAERR, "limit needs number\n"); + + ep->par[4] = atoi(av[0]); + (*ac)--; av++; + break; + + case TOK_FLOWS: + if (type != TOK_FQ_CODEL) + errx(EX_DATAERR, "flows is not for codel\n"); + if (*ac <= 0 || !is_valid_number(av[0])) + errx(EX_DATAERR, "flows needs number\n"); + + ep->par[5] = atoi(av[0]); + (*ac)--; av++; + break; + + default: + printf("%s is Invalid parameter\n", av[-1]); + } + } + break; + case TOK_PIE: + case TOK_FQ_PIE: + /* PIE + * 0- target , 1- tupdate, 2- max_burst, + * 3- max_ecnth, 4- alpha, + * 5- beta, 6- flags + * FQ_CODEL + * 7- quantum, 8- limit, 9- flows + */ + + if ( type == TOK_PIE) + ep->par[6] = PIE_CAPDROP_ENABLED | PIE_DEPRATEEST_ENABLED + | PIE_DERAND_ENABLED; + else + /* for FQ-PIE, use TS mode */ + ep->par[6] = PIE_CAPDROP_ENABLED | PIE_DERAND_ENABLED + | PIE_ECN_ENABLED; + + while (*ac > 0) { + int tok = match_token(aqm_params, *av); + (*ac)--; av++; + switch(tok) { + case TOK_TARGET: + if (*ac <= 0 || time_to_us(av[0]) < 0) + errx(EX_DATAERR, "target needs time\n"); + + ep->par[0] = time_to_us(av[0]); + (*ac)--; av++; + break; + + case TOK_TUPDATE: + if (*ac <= 0 || time_to_us(av[0]) < 0) + errx(EX_DATAERR, "tupdate needs time\n"); + + ep->par[1] = time_to_us(av[0]); + (*ac)--; av++; + break; + + case TOK_MAX_BURST: + if (*ac <= 0 || time_to_us(av[0]) < 0) + errx(EX_DATAERR, "max_burst needs time\n"); + + ep->par[2] = time_to_us(av[0]); + (*ac)--; av++; + break; + + case TOK_MAX_ECNTH: + if (*ac <= 0 || !is_valid_number(av[0])) + errx(EX_DATAERR, "max_ecnth needs number\n"); + + ep->par[3] = atof(av[0]) * PIE_SCALE; + (*ac)--; av++; + break; + + case TOK_ALPHA: + if (*ac <= 0 || !is_valid_number(av[0])) + errx(EX_DATAERR, "alpha needs number\n"); + + ep->par[4] = atof(av[0]) * PIE_SCALE; + (*ac)--; av++; + break; + + case TOK_BETA: + if (*ac <= 0 || !is_valid_number(av[0])) + errx(EX_DATAERR, "beta needs number\n"); + + ep->par[5] = atof(av[0]) * PIE_SCALE; + (*ac)--; av++; + break; + + case TOK_ECN: + ep->par[6] |= PIE_ECN_ENABLED; + break; + case TOK_NO_ECN: + ep->par[6] &= ~PIE_ECN_ENABLED; + break; + + case TOK_CAPDROP: + ep->par[6] |= PIE_CAPDROP_ENABLED; + break; + case TOK_NO_CAPDROP: + ep->par[6] &= ~PIE_CAPDROP_ENABLED; + break; + + case TOK_ONOFF: + ep->par[6] |= PIE_ON_OFF_MODE_ENABLED; + break; + + case TOK_DRE: + ep->par[6] |= PIE_DEPRATEEST_ENABLED; + break; + + case TOK_TS: + ep->par[6] &= ~PIE_DEPRATEEST_ENABLED; + break; + + case TOK_DERAND: + ep->par[6] |= PIE_DERAND_ENABLED; + break; + case TOK_NO_DERAND: + ep->par[6] &= ~PIE_DERAND_ENABLED; + break; + + /* Config fq_pie parameters */ + case TOK_QUANTUM: + if (type != TOK_FQ_PIE) + errx(EX_DATAERR, "quantum is not for pie\n"); + if (*ac <= 0 || !is_valid_number(av[0])) + errx(EX_DATAERR, "quantum needs number\n"); + + ep->par[7]= atoi(av[0]); + (*ac)--; av++; + break; + + case TOK_LIMIT: + if (type != TOK_FQ_PIE) + errx(EX_DATAERR, "limit is not for pie, use queue instead\n"); + if (*ac <= 0 || !is_valid_number(av[0])) + errx(EX_DATAERR, "limit needs number\n"); + + ep->par[8] = atoi(av[0]); + (*ac)--; av++; + break; + + case TOK_FLOWS: + if (type != TOK_FQ_PIE) + errx(EX_DATAERR, "flows is not for pie\n"); + if (*ac <= 0 || !is_valid_number(av[0])) + errx(EX_DATAERR, "flows needs number\n"); + + ep->par[9] = atoi(av[0]); + (*ac)--; av++; + break; + + + default: + printf("%s is invalid parameter\n", av[-1]); + } + } + break; + } + + return 0; +} + +#endif + + /* * configuration of pipes, schedulers, flowsets. * When we configure a new scheduler, an empty pipe is created, so: @@ -776,6 +1274,12 @@ struct dn_fs *fs = NULL; struct dn_profile *pf = NULL; struct ipfw_flow_id *mask = NULL; +#ifdef NEW_AQM + struct dn_extra_parms *aqm_extra; + struct dn_extra_parms *sch_extra; + int lmax_extra; +#endif + int lmax; uint32_t _foo = 0, *flags = &_foo , *buckets = &_foo; @@ -787,6 +1291,15 @@ lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) + sizeof(struct dn_fs) + sizeof(struct dn_profile); +#ifdef NEW_AQM + /* Extra Params */ + lmax_extra = sizeof(struct dn_extra_parms); + /* two lmax_extra because one for AQM params and another + * sch params + */ + lmax += lmax_extra*2; +#endif + av++; ac--; /* Pipe number */ if (ac && isdigit(**av)) { @@ -812,8 +1325,16 @@ * The FIFO scheduler and link are derived from the * WF2Q+ one in the kernel. */ +#ifdef NEW_AQM + sch_extra = o_next(&buf, lmax_extra, DN_TEXT); + sch_extra ->oid.subtype = 0; /* don't configure scheduler */ +#endif sch = o_next(&buf, sizeof(*sch), DN_SCH); p = o_next(&buf, sizeof(*p), DN_LINK); +#ifdef NEW_AQM + aqm_extra = o_next(&buf, lmax_extra, DN_TEXT); + aqm_extra ->oid.subtype = 0; /* don't configure AQM */ +#endif fs = o_next(&buf, sizeof(*fs), DN_FS); sch->sched_nr = i; @@ -831,6 +1352,10 @@ break; case 2: /* "queue N config ... " */ +#ifdef NEW_AQM + aqm_extra = o_next(&buf, lmax_extra, DN_TEXT); + aqm_extra ->oid.subtype = 0; +#endif fs = o_next(&buf, sizeof(*fs), DN_FS); fs->fs_nr = i; mask = &fs->flow_mask; @@ -839,7 +1364,15 @@ break; case 3: /* "sched N config ..." */ +#ifdef NEW_AQM + sch_extra = o_next(&buf, lmax_extra, DN_TEXT); + sch_extra ->oid.subtype = 0; +#endif sch = o_next(&buf, sizeof(*sch), DN_SCH); +#ifdef NEW_AQM + aqm_extra = o_next(&buf, lmax_extra, DN_TEXT); + aqm_extra ->oid.subtype = 0; +#endif fs = o_next(&buf, sizeof(*fs), DN_FS); sch->sched_nr = i; mask = &sch->sched_mask; @@ -1026,7 +1559,31 @@ } /* end while, config masks */ end_mask: break; +#ifdef NEW_AQM + case TOK_CODEL: + case TOK_PIE: + NEED(fs, "codel/pie is only for flowsets"); + + fs->flags &= ~(DN_IS_RED|DN_IS_GENTLE_RED); + fs->flags |= DN_IS_AQM; + + strcpy(aqm_extra->name,av[-1]); + aqm_extra->oid.subtype = DN_AQM_PARAMS; + + process_extra_parms(&ac, av, aqm_extra, tok); + break; + case TOK_FQ_CODEL: + case TOK_FQ_PIE: + if (!strcmp(av[-1],"type")) + errx(EX_DATAERR, "use type before fq_codel/fq_pie"); + + NEED(sch, "fq_codel/fq_pie is only for schd"); + strcpy(sch_extra->name,av[-1]); + sch_extra->oid.subtype = DN_SCH_PARAMS; + process_extra_parms(&ac, av, sch_extra, tok); + break; +#endif case TOK_RED: case TOK_GRED: NEED1("red/gred needs w_q/min_th/max_th/max_p\n"); @@ -1093,7 +1650,20 @@ errx(1, "type %s too long\n", av[0]); strcpy(sch->name, av[0]); sch->oid.subtype = 0; /* use string */ - ac--; av++; +#ifdef NEW_AQM + /* if fq_codel is selected, consider all tokens after it + * as parameters + */ + if (!strcasecmp(av[0],"fq_codel") || !strcasecmp(av[0],"fq_pie")){ + strcpy(sch_extra->name,av[0]); + sch_extra->oid.subtype = DN_SCH_PARAMS; + process_extra_parms(&ac, av, sch_extra, tok); + } else { + ac--;av++; + } +#else + ac--;av++; +#endif break; } @@ -1187,9 +1757,17 @@ errx(EX_DATAERR, "2 <= queue size <= %ld", limit); } +#ifdef NEW_AQM + if ((fs->flags & DN_IS_ECN) && !((fs->flags & DN_IS_RED)|| + (fs->flags & DN_IS_AQM))) + errx(EX_USAGE, "ECN can be used with red/gred/" + "codel/fq_codel only!"); +#else if ((fs->flags & DN_IS_ECN) && !(fs->flags & DN_IS_RED)) errx(EX_USAGE, "enable red/gred for ECN"); +#endif + if (fs->flags & DN_IS_RED) { size_t len; int lookup_depth, avg_pkt_size; Index: head/sbin/ipfw/ipfw2.h =================================================================== --- head/sbin/ipfw/ipfw2.h +++ head/sbin/ipfw/ipfw2.h @@ -171,6 +171,31 @@ TOK_ECN, TOK_DROPTAIL, TOK_PROTO, +#ifdef NEW_AQM + /* AQM tokens*/ + TOK_NO_ECN, + TOK_CODEL, + TOK_FQ_CODEL, + TOK_TARGET, + TOK_INTERVAL, + TOK_FLOWS, + TOK_QUANTUM, + + TOK_PIE, + TOK_FQ_PIE, + TOK_TUPDATE, + TOK_MAX_BURST, + TOK_MAX_ECNTH, + TOK_ALPHA, + TOK_BETA, + TOK_CAPDROP, + TOK_NO_CAPDROP, + TOK_ONOFF, + TOK_DRE, + TOK_TS, + TOK_DERAND, + TOK_NO_DERAND, +#endif /* dummynet tokens */ TOK_WEIGHT, TOK_LMAX, Index: head/sys/modules/dummynet/Makefile =================================================================== --- head/sys/modules/dummynet/Makefile +++ head/sys/modules/dummynet/Makefile @@ -4,8 +4,9 @@ KMOD= dummynet SRCS= ip_dummynet.c SRCS+= ip_dn_glue.c ip_dn_io.c +SRCS+= dn_aqm_codel.c dn_aqm_pie.c SRCS+= dn_heap.c dn_sched_fifo.c dn_sched_qfq.c dn_sched_rr.c dn_sched_wf2q.c -SRCS+= dn_sched_prio.c +SRCS+= dn_sched_prio.c dn_sched_fq_codel.c dn_sched_fq_pie.c SRCS+= opt_inet6.h .include Index: head/sys/netinet/ip_dummynet.h =================================================================== --- head/sys/netinet/ip_dummynet.h +++ head/sys/netinet/ip_dummynet.h @@ -29,7 +29,7 @@ #ifndef _IP_DUMMYNET_H #define _IP_DUMMYNET_H - +#define NEW_AQM /* * Definition of the kernel-userland API for dummynet. * @@ -85,7 +85,13 @@ /* special commands for emulation of sysctl variables */ DN_SYSCTL_GET, DN_SYSCTL_SET, - +#ifdef NEW_AQM + /* subtypes used for setting/getting extra parameters. + * these subtypes used with IP_DUMMYNET3 command (get) + * and DN_TEXT (set). */ + DN_AQM_PARAMS, /* AQM extra params */ + DN_SCH_PARAMS, /* scheduler extra params */ +#endif DN_LAST, }; @@ -105,6 +111,9 @@ DN_IS_RED = 0x0020, DN_IS_GENTLE_RED= 0x0040, DN_IS_ECN = 0x0080, + #ifdef NEW_AQM + DN_IS_AQM = 0x0100, /* AQMs: e.g Codel & PIE */ + #endif DN_PIPE_CMD = 0x1000, /* pipe config... */ }; @@ -210,7 +219,19 @@ int samples[ED_MAX_SAMPLES_NO]; /* may be shorter */ }; - +#ifdef NEW_AQM +/* Extra parameters for AQM and scheduler. + * This struct is used to pass and retrieve parameters (configurations) + * to/from AQM and Scheduler. + */ +struct dn_extra_parms { + struct dn_id oid; + char name[16]; + uint32_t nr; +#define DN_MAX_EXTRA_PARM 10 + int64_t par[DN_MAX_EXTRA_PARM]; +}; +#endif /* * Overall structure of dummynet Index: head/sys/netpfil/ipfw/dn_aqm.h =================================================================== --- head/sys/netpfil/ipfw/dn_aqm.h +++ head/sys/netpfil/ipfw/dn_aqm.h @@ -0,0 +1,167 @@ +/*- + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * API for writting an Active Queue Management algorithm for Dummynet + * + * $FreeBSD$ + */ + +#ifndef _IP_DN_AQM_H +#define _IP_DN_AQM_H + + +/* NOW is the current time in millisecond*/ +#define NOW ((dn_cfg.curr_time * tick) / 1000) + +#define AQM_UNOW (dn_cfg.curr_time * tick) +#define AQM_TIME_1US ((aqm_time_t)(1)) +#define AQM_TIME_1MS ((aqm_time_t)(1000)) +#define AQM_TIME_1S ((aqm_time_t)(AQM_TIME_1MS * 1000)) + +/* aqm time allows to store up to 4294 seconds */ +typedef uint32_t aqm_time_t; +typedef int32_t aqm_stime_t; + +#define DN_AQM_MTAG_TS 55345 + +/* Macro for variable bounding */ +#define BOUND_VAR(x,l,h) (x < l? l : x > h? h : x) + +/* sysctl variable to count number of droped packets */ +extern unsigned long io_pkt_drop; + +/* + * Structure for holding data and function pointers that together represent a + * AQM algorithm. + */ + struct dn_aqm { +#define DN_AQM_NAME_MAX 50 + char name[DN_AQM_NAME_MAX]; /* name of AQM algorithm */ + uint32_t type; /* AQM type number */ + + /* Methods implemented by AQM algorithm: + * + * enqueue enqueue packet 'm' on queue 'q'. + * Return 0 on success, 1 on drop. + * + * dequeue dequeue a packet from queue 'q'. + * Return a packet, NULL if no packet available. + * + * config configure AQM algorithm + * If required, this function should allocate space to store + * the configurations and set 'fs->aqmcfg' to point to this space. + * 'dn_extra_parms' includes array of parameters send + * from ipfw userland command. + * Return 0 on success, non-zero otherwise. + * + * deconfig deconfigure AQM algorithm. + * The allocated configuration memory space should be freed here. + * Return 0 on success, non-zero otherwise. + * + * init initialise AQM status variables of queue 'q' + * This function is used to allocate space and init AQM status for a + * queue and q->aqm_status to point to this space. + * Return 0 on success, non-zero otherwise. + * + * cleanup cleanup AQM status variables of queue 'q' + * The allocated memory space for AQM status should be freed here. + * Return 0 on success, non-zero otherwise. + * + * getconfig retrieve AQM configurations + * This function is used to return AQM parameters to userland + * command. The function should fill 'dn_extra_parms' struct with + * the AQM configurations using 'par' array. + * + */ + + int (*enqueue)(struct dn_queue *, struct mbuf *); + struct mbuf * (*dequeue)(struct dn_queue *); + int (*config)(struct dn_fsk *, struct dn_extra_parms *ep, int); + int (*deconfig)(struct dn_fsk *); + int (*init)(struct dn_queue *); + int (*cleanup)(struct dn_queue *); + int (*getconfig)(struct dn_fsk *, struct dn_extra_parms *); + + int ref_count; /*Number of queues instances in the system */ + int cfg_ref_count; /*Number of AQM instances in the system */ + SLIST_ENTRY (dn_aqm) next; /* Next AQM in the list */ +}; + +/* Helper function to update queue and scheduler statistics. + * negative len + drop -> drop + * negative len -> dequeue + * positive len -> enqueue + * positive len + drop -> drop during enqueue + */ +__inline static void +update_stats(struct dn_queue *q, int len, int drop) +{ + int inc = 0; + struct dn_flow *sni; + struct dn_flow *qni; + + sni = &q->_si->ni; + qni = &q->ni; + + if (len < 0) + inc = -1; + else if(len > 0) + inc = 1; + + if (drop) { + qni->drops++; + sni->drops++; + io_pkt_drop++; + } else { + /*update queue stats */ + qni->length += inc; + qni->len_bytes += len; + + /*update scheduler instance stats */ + sni->length += inc; + sni->len_bytes += len; + } + /* tot_pkts is updated in dn_enqueue function */ +} + + +/* kernel module related function */ +int +dn_aqm_modevent(module_t mod, int cmd, void *arg); + +#define DECLARE_DNAQM_MODULE(name, dnaqm) \ + static moduledata_t name##_mod = { \ + #name, dn_aqm_modevent, dnaqm \ + }; \ + DECLARE_MODULE(name, name##_mod, \ + SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); \ + MODULE_DEPEND(name, dummynet, 3, 3, 3) + +#endif Index: head/sys/netpfil/ipfw/dn_aqm_codel.h =================================================================== --- head/sys/netpfil/ipfw/dn_aqm_codel.h +++ head/sys/netpfil/ipfw/dn_aqm_codel.h @@ -0,0 +1,222 @@ +/* + * Codel - The Controlled-Delay Active Queue Management algorithm. + * + * $FreeBSD$ + * + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi + * + * Copyright (C) 2011-2014 Kathleen Nichols . + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * o Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * + * o Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * o The names of the authors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * Alternatively, provided that this notice is retained in full, this + * software may be distributed under the terms of the GNU General Public + * License ("GPL") version 2, in which case the provisions of the GPL + * apply INSTEAD OF those given above. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _IP_DN_AQM_CODEL_H +#define _IP_DN_AQM_CODEL_H + + +// XXX How to choose MTAG? +#define FIX_POINT_BITS 16 + +enum { + CODEL_ECN_ENABLED = 1 +}; + +/* Codel parameters */ +struct dn_aqm_codel_parms { + aqm_time_t target; + aqm_time_t interval; + uint32_t flags; +}; + +/* codel status variables */ +struct codel_status { + uint32_t count; /* number of dropped pkts since entering drop state */ + uint16_t dropping; /* dropping state */ + aqm_time_t drop_next_time; /* time for next drop */ + aqm_time_t first_above_time; /* time for first ts over target we observed */ + uint16_t isqrt; /* last isqrt for control low */ + uint16_t maxpkt_size; /* max packet size seen so far */ +}; + +struct mbuf *codel_extract_head(struct dn_queue *, aqm_time_t *); +aqm_time_t control_law(struct codel_status *, + struct dn_aqm_codel_parms *, aqm_time_t ); + +__inline static struct mbuf * +codel_dodequeue(struct dn_queue *q, aqm_time_t now, uint16_t *ok_to_drop) +{ + struct mbuf * m; + struct dn_aqm_codel_parms *cprms; + struct codel_status *cst; + aqm_time_t pkt_ts, sojourn_time; + + *ok_to_drop = 0; + m = codel_extract_head(q, &pkt_ts); + + cst = q->aqm_status; + + if (m == NULL) { + /* queue is empty - we can't be above target */ + cst->first_above_time= 0; + return m; + } + + cprms = q->fs->aqmcfg; + + /* To span a large range of bandwidths, CoDel runs two + * different AQMs in parallel. One is sojourn-time-based + * and takes effect when the time to send an MTU-sized + * packet is less than target. The 1st term of the "if" + * below does this. The other is backlog-based and takes + * effect when the time to send an MTU-sized packet is >= + * target. The goal here is to keep the output link + * utilization high by never allowing the queue to get + * smaller than the amount that arrives in a typical + * interarrival time (MTU-sized packets arriving spaced + * by the amount of time it takes to send such a packet on + * the bottleneck). The 2nd term of the "if" does this. + */ + sojourn_time = now - pkt_ts; + if (sojourn_time < cprms->target || q->ni.len_bytes <= cst->maxpkt_size) { + /* went below - stay below for at least interval */ + cst->first_above_time = 0; + } else { + if (cst->first_above_time == 0) { + /* just went above from below. if still above at + * first_above_time, will say it's ok to drop. */ + cst->first_above_time = now + cprms->interval; + } else if (now >= cst->first_above_time) { + *ok_to_drop = 1; + } + } + return m; +} + +/* + * Dequeue a packet from queue 'q' + */ +__inline static struct mbuf * +codel_dequeue(struct dn_queue *q) +{ + struct mbuf *m; + struct dn_aqm_codel_parms *cprms; + struct codel_status *cst; + aqm_time_t now; + uint16_t ok_to_drop; + + cst = q->aqm_status;; + cprms = q->fs->aqmcfg; + now = AQM_UNOW; + + m = codel_dodequeue(q, now, &ok_to_drop); + if (cst->dropping) { + if (!ok_to_drop) { + /* sojourn time below target - leave dropping state */ + cst->dropping = false; + } + /* + * Time for the next drop. Drop current packet and dequeue + * next. If the dequeue doesn't take us out of dropping + * state, schedule the next drop. A large backlog might + * result in drop rates so high that the next drop should + * happen now, hence the 'while' loop. + */ + while (now >= cst->drop_next_time && cst->dropping) { + + /* mark the packet */ + if (cprms->flags & CODEL_ECN_ENABLED && ecn_mark(m)) { + cst->count++; + /* schedule the next mark. */ + cst->drop_next_time = control_law(cst, cprms, + cst->drop_next_time); + return m; + } + + /* drop the packet */ + update_stats(q, 0, 1); + FREE_PKT(m); + m = codel_dodequeue(q, now, &ok_to_drop); + + if (!ok_to_drop) { + /* leave dropping state */ + cst->dropping = false; + } else { + cst->count++; + /* schedule the next drop. */ + cst->drop_next_time = control_law(cst, cprms, + cst->drop_next_time); + } + } + /* If we get here we're not in dropping state. The 'ok_to_drop' + * return from dodequeue means that the sojourn time has been + * above 'target' for 'interval' so enter dropping state. + */ + } else if (ok_to_drop) { + + /* if ECN option is disabled or the packet cannot be marked, + * drop the packet and extract another. + */ + if (!(cprms->flags & CODEL_ECN_ENABLED) || !ecn_mark(m)) { + update_stats(q, 0, 1); + FREE_PKT(m); + m = codel_dodequeue(q, now, &ok_to_drop); + } + + cst->dropping = true; + + /* If min went above target close to when it last went + * below, assume that the drop rate that controlled the + * queue on the last cycle is a good starting point to + * control it now. ('drop_next' will be at most 'interval' + * later than the time of the last drop so 'now - drop_next' + * is a good approximation of the time from the last drop + * until now.) + */ + cst->count = (cst->count > 2 && ((aqm_stime_t)now - + (aqm_stime_t)cst->drop_next_time) < 8* cprms->interval)? + cst->count - 2 : 1; + /* we don't have to set initial guess for Newton's method isqrt as + * we initilaize isqrt in control_law function when count == 1 */ + cst->drop_next_time = control_law(cst, cprms, now); + } + + return m; +} + +#endif Index: head/sys/netpfil/ipfw/dn_aqm_codel.c =================================================================== --- head/sys/netpfil/ipfw/dn_aqm_codel.c +++ head/sys/netpfil/ipfw/dn_aqm_codel.c @@ -0,0 +1,444 @@ +/* + * Codel - The Controlled-Delay Active Queue Management algorithm. + * + * $FreeBSD$ + * + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include "opt_inet6.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ +#include +#include + +#include +#include /* ip_len, ip_off */ +#include /* ip_output(), IP_FORWARDING */ +#include +#include +#include /* various ether_* routines */ +#include /* for ip6_input, ip6_output prototypes */ +#include +#include + +#ifdef NEW_AQM +#include +#include +#include +#include +#include + +#define DN_AQM_CODEL 1 + +static struct dn_aqm codel_desc; + +/* default codel parameters */ +struct dn_aqm_codel_parms codel_sysctl = {5000 * AQM_TIME_1US, + 100000 * AQM_TIME_1US, 0}; + +static int +codel_sysctl_interval_handler(SYSCTL_HANDLER_ARGS) +{ + int error; + long value; + + value = codel_sysctl.interval; + value /= AQM_TIME_1US; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (value < 1 || value > 100 * AQM_TIME_1S) + return (EINVAL); + codel_sysctl.interval = value * AQM_TIME_1US ; + return (0); +} + +static int +codel_sysctl_target_handler(SYSCTL_HANDLER_ARGS) +{ + int error; + long value; + + value = codel_sysctl.target; + value /= AQM_TIME_1US; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + D("%ld", value); + if (value < 1 || value > 5 * AQM_TIME_1S) + return (EINVAL); + codel_sysctl.target = value * AQM_TIME_1US ; + return (0); +} + +/* defining Codel sysctl variables */ +SYSBEGIN(f4) + +SYSCTL_DECL(_net_inet); +SYSCTL_DECL(_net_inet_ip); +SYSCTL_DECL(_net_inet_ip_dummynet); +static SYSCTL_NODE(_net_inet_ip_dummynet, OID_AUTO, + codel, CTLFLAG_RW, 0, "CODEL"); + +#ifdef SYSCTL_NODE +SYSCTL_PROC(_net_inet_ip_dummynet_codel, OID_AUTO, target, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0,codel_sysctl_target_handler, "L", + "CoDel target in microsecond"); + +SYSCTL_PROC(_net_inet_ip_dummynet_codel, OID_AUTO, interval, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, codel_sysctl_interval_handler, "L", + "CoDel interval in microsecond"); +#endif + +/* This function computes codel_interval/sqrt(count) + * Newton's method of approximation is used to compute 1/sqrt(count). + * http://betterexplained.com/articles/ + * understanding-quakes-fast-inverse-square-root/ + */ +aqm_time_t +control_law(struct codel_status *cst, struct dn_aqm_codel_parms *cprms, + aqm_time_t t) +{ + uint32_t count; + uint64_t temp; + count = cst->count; + + /* we don't calculate isqrt(1) to get more accurate result*/ + if (count == 1) { + /* prepare isqrt (old guess) for the next iteration i.e. 1/sqrt(2)*/ + cst->isqrt = (1UL<< FIX_POINT_BITS) * 7/10; + /* return time + isqrt(1)*interval */ + return t + cprms->interval; + } + + /* newguess = g(1.5 - 0.5*c*g^2) + * Multiplying both sides by 2 to make all the constants intergers + * newguess * 2 = g(3 - c*g^2) g=old guess, c=count + * So, newguess = newguess /2 + * Fixed point operations are used here. + */ + + /* Calculate g^2 */ + temp = (uint32_t) cst->isqrt * cst->isqrt; + /* Calculate (3 - c*g^2) i.e. (3 - c * temp) */ + temp = (3ULL<< (FIX_POINT_BITS*2)) - (count * temp); + + /* + * Divide by 2 because we multiplied the original equation by two + * Also, we shift the result by 8 bits to prevent overflow. + * */ + temp >>= (1 + 8); + + /* Now, temp = (1.5 - 0.5*c*g^2) + * Calculate g (1.5 - 0.5*c*g^2) i.e. g * temp + */ + temp = (cst->isqrt * temp) >> (FIX_POINT_BITS + FIX_POINT_BITS - 8); + cst->isqrt = temp; + + /* calculate codel_interval/sqrt(count) */ + return t + ((cprms->interval * temp) >> FIX_POINT_BITS); +} + +/* + * Extract a packet from the head of queue 'q' + * Return a packet or NULL if the queue is empty. + * Also extract packet's timestamp from mtag. + */ +struct mbuf * +codel_extract_head(struct dn_queue *q, aqm_time_t *pkt_ts) +{ + struct m_tag *mtag; + struct mbuf *m = q->mq.head; + + if (m == NULL) + return m; + q->mq.head = m->m_nextpkt; + + /* Update stats */ + update_stats(q, -m->m_pkthdr.len, 0); + + if (q->ni.length == 0) /* queue is now idle */ + q->q_time = dn_cfg.curr_time; + + /* extract packet TS*/ + mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL); + if (mtag == NULL) { + D("Codel timestamp mtag not found!"); + *pkt_ts = 0; + } else { + *pkt_ts = *(aqm_time_t *)(mtag + 1); + m_tag_delete(m,mtag); + } + + return m; +} + +/* + * Enqueue a packet 'm' in queue 'q' + */ +static int +aqm_codel_enqueue(struct dn_queue *q, struct mbuf *m) +{ + struct dn_fs *f; + uint64_t len; + struct codel_status *cst; /*codel status variables */ + struct m_tag *mtag; + + f = &(q->fs->fs); + len = m->m_pkthdr.len; + cst = q->aqm_status; + if(!cst) { + D("Codel queue is not initialized\n"); + goto drop; + } + + /* Finding maximum packet size */ + // XXX we can get MTU from driver instead + if (len > cst->maxpkt_size) + cst->maxpkt_size = len; + + /* check for queue size and drop the tail if exceed queue limit*/ + if (f->flags & DN_QSIZE_BYTES) { + if ( q->ni.len_bytes > f->qsize) + goto drop; + } + else { + if ( q->ni.length >= f->qsize) + goto drop; + } + + /* Add timestamp as mtag */ + mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL); + if (mtag == NULL) + mtag = m_tag_alloc(MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, + sizeof(aqm_time_t), M_NOWAIT); + if (mtag == NULL) { + m_freem(m); + goto drop; + } + + *(aqm_time_t *)(mtag + 1) = AQM_UNOW; + m_tag_prepend(m, mtag); + + mq_append(&q->mq, m); + update_stats(q, len, 0); + return (0); + +drop: + update_stats(q, 0, 1); + FREE_PKT(m); + return (1); +} + + +/* Dequeue a pcaket from queue q */ +static struct mbuf * +aqm_codel_dequeue(struct dn_queue *q) +{ + return codel_dequeue(q); +} + +/* + * initialize Codel for queue 'q' + * First allocate memory for codel status. + */ +static int +aqm_codel_init(struct dn_queue *q) +{ + struct codel_status *cst; + + if (!q->fs->aqmcfg) { + D("Codel is not configure!d"); + return EINVAL; + } + + q->aqm_status = malloc(sizeof(struct codel_status), + M_DUMMYNET, M_NOWAIT | M_ZERO); + if (q->aqm_status == NULL) { + D("Cannot allocate AQM_codel private data"); + return ENOMEM ; + } + + /* init codel status variables */ + cst = q->aqm_status; + cst->dropping=0; + cst->first_above_time=0; + cst->drop_next_time=0; + cst->count=0; + cst->maxpkt_size = 500; + + /* increase reference counters */ + codel_desc.ref_count++; + + return 0; +} + +/* + * Clean up Codel status for queue 'q' + * Destroy memory allocated for codel status. + */ +static int +aqm_codel_cleanup(struct dn_queue *q) +{ + + if (q && q->aqm_status) { + free(q->aqm_status, M_DUMMYNET); + q->aqm_status = NULL; + /* decrease reference counters */ + codel_desc.ref_count--; + } + else + D("Codel already cleaned up"); + return 0; +} + +/* + * Config codel parameters + * also allocate memory for codel configurations + */ +static int +aqm_codel_config(struct dn_fsk* fs, struct dn_extra_parms *ep, int len) +{ + struct dn_aqm_codel_parms *ccfg; + + int l = sizeof(struct dn_extra_parms); + if (len < l) { + D("invalid sched parms length got %d need %d", len, l); + return EINVAL; + } + /* we free the old cfg because maybe the original allocation + * not the same size as the new one (different AQM type). + */ + if (fs->aqmcfg) { + free(fs->aqmcfg, M_DUMMYNET); + fs->aqmcfg = NULL; + } + + fs->aqmcfg = malloc(sizeof(struct dn_aqm_codel_parms), + M_DUMMYNET, M_NOWAIT | M_ZERO); + if (fs->aqmcfg== NULL) { + D("cannot allocate AQM_codel configuration parameters"); + return ENOMEM; + } + + /* configure codel parameters */ + ccfg = fs->aqmcfg; + + if (ep->par[0] < 0) + ccfg->target = codel_sysctl.target; + else + ccfg->target = ep->par[0] * AQM_TIME_1US; + + if (ep->par[1] < 0) + ccfg->interval = codel_sysctl.interval; + else + ccfg->interval = ep->par[1] * AQM_TIME_1US; + + if (ep->par[2] < 0) + ccfg->flags = 0; + else + ccfg->flags = ep->par[2]; + + /* bound codel configurations */ + ccfg->target = BOUND_VAR(ccfg->target,1, 5 * AQM_TIME_1S); + ccfg->interval = BOUND_VAR(ccfg->interval,1, 5 * AQM_TIME_1S); + /* increase config reference counter */ + codel_desc.cfg_ref_count++; + + return 0; +} + +/* + * Deconfigure Codel and free memory allocation + */ +static int +aqm_codel_deconfig(struct dn_fsk* fs) +{ + + if (fs && fs->aqmcfg) { + free(fs->aqmcfg, M_DUMMYNET); + fs->aqmcfg = NULL; + fs->aqmfp = NULL; + /* decrease config reference counter */ + codel_desc.cfg_ref_count--; + } + + return 0; +} + +/* + * Retrieve Codel configuration parameters. + */ +static int +aqm_codel_getconfig(struct dn_fsk *fs, struct dn_extra_parms * ep) +{ + struct dn_aqm_codel_parms *ccfg; + + if (fs->aqmcfg) { + strcpy(ep->name, codel_desc.name); + ccfg = fs->aqmcfg; + ep->par[0] = ccfg->target / AQM_TIME_1US; + ep->par[1] = ccfg->interval / AQM_TIME_1US; + ep->par[2] = ccfg->flags; + return 0; + } + return 1; +} + +static struct dn_aqm codel_desc = { + _SI( .type = ) DN_AQM_CODEL, + _SI( .name = ) "CODEL", + _SI( .enqueue = ) aqm_codel_enqueue, + _SI( .dequeue = ) aqm_codel_dequeue, + _SI( .config = ) aqm_codel_config, + _SI( .getconfig = ) aqm_codel_getconfig, + _SI( .deconfig = ) aqm_codel_deconfig, + _SI( .init = ) aqm_codel_init, + _SI( .cleanup = ) aqm_codel_cleanup, +}; + +DECLARE_DNAQM_MODULE(dn_aqm_codel, &codel_desc); + + +#endif Index: head/sys/netpfil/ipfw/dn_aqm_pie.h =================================================================== --- head/sys/netpfil/ipfw/dn_aqm_pie.h +++ head/sys/netpfil/ipfw/dn_aqm_pie.h @@ -0,0 +1,151 @@ +/* + * PIE - Proportional Integral controller Enhanced AQM algorithm. + * + * $FreeBSD$ + * + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _IP_DN_AQM_PIE_H +#define _IP_DN_AQM_PIE_H + +#define DN_AQM_PIE 2 +#define PIE_DQ_THRESHOLD_BITS 14 +/* 2^14 =16KB */ +#define PIE_DQ_THRESHOLD (1UL << PIE_DQ_THRESHOLD_BITS) +#define MEAN_PKTSIZE 800 + +/* 31-bits because random() generates range from 0->(2**31)-1 */ +#define PIE_PROB_BITS 31 +#define PIE_MAX_PROB ((1ULL<parms; + + /* queue is not congested */ + + if ((pst->qdelay_old < (pprms->qdelay_ref >> 1) + && pst->drop_prob < PIE_MAX_PROB / 5 ) + || qlen <= 2 * MEAN_PKTSIZE) + return ENQUE; + + + if (pst->drop_prob == 0) + pst->accu_prob = 0; + + /* increment accu_prob */ + if (pprms->flags & PIE_DERAND_ENABLED) + pst->accu_prob += pst->drop_prob; + + /* De-randomize option + * if accu_prob < 0.85 -> enqueue + * if accu_prob>8.5 ->drop + * between 0.85 and 8.5 || !De-randomize --> drop on prob + */ + if (pprms->flags & PIE_DERAND_ENABLED) { + if(pst->accu_prob < (uint64_t) (PIE_MAX_PROB * 0.85)) + return ENQUE; + if( pst->accu_prob >= (uint64_t) (PIE_MAX_PROB * 8.5)) + return DROP; + } + + if (random() < pst->drop_prob) { + pst->accu_prob = 0; + return DROP; + } + + return ENQUE; +} + +#endif Index: head/sys/netpfil/ipfw/dn_aqm_pie.c =================================================================== --- head/sys/netpfil/ipfw/dn_aqm_pie.c +++ head/sys/netpfil/ipfw/dn_aqm_pie.c @@ -0,0 +1,793 @@ +/* + * PIE - Proportional Integral controller Enhanced AQM algorithm. + * + * $FreeBSD$ + * + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include "opt_inet6.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ +#include +#include + +#include +#include /* ip_len, ip_off */ +#include /* ip_output(), IP_FORWARDING */ +#include +#include +#include /* various ether_* routines */ +#include /* for ip6_input, ip6_output prototypes */ +#include +#include + +#ifdef NEW_AQM +#include +#include +#include +#include +#include + +/* for debugging */ +#include + +static struct dn_aqm pie_desc; + +/* PIE defaults + * target=15ms, tupdate=15ms, max_burst=150ms, + * max_ecnth=0.1, alpha=0.125, beta=1.25, + */ +struct dn_aqm_pie_parms pie_sysctl = + { 15 * AQM_TIME_1MS, 15 * AQM_TIME_1MS, 150 * AQM_TIME_1MS, + PIE_SCALE/10 , PIE_SCALE * 0.125, PIE_SCALE * 1.25 , + PIE_CAPDROP_ENABLED | PIE_DEPRATEEST_ENABLED | PIE_DERAND_ENABLED }; + +static int +pie_sysctl_alpha_beta_handler(SYSCTL_HANDLER_ARGS) +{ + int error; + long value; + + if (!strcmp(oidp->oid_name,"alpha")) + value = pie_sysctl.alpha; + else + value = pie_sysctl.beta; + + value = value * 1000 / PIE_SCALE; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (value < 1 || value > 7 * PIE_SCALE) + return (EINVAL); + value = (value * PIE_SCALE) / 1000; + if (!strcmp(oidp->oid_name,"alpha")) + pie_sysctl.alpha = value; + else + pie_sysctl.beta = value; + return (0); +} + +static int +pie_sysctl_target_tupdate_maxb_handler(SYSCTL_HANDLER_ARGS) +{ + int error; + long value; + + if (!strcmp(oidp->oid_name,"target")) + value = pie_sysctl.qdelay_ref; + else if (!strcmp(oidp->oid_name,"tupdate")) + value = pie_sysctl.tupdate; + else + value = pie_sysctl.max_burst; + + value = value / AQM_TIME_1US; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (value < 1 || value > 10 * AQM_TIME_1S) + return (EINVAL); + value = value * AQM_TIME_1US; + + if (!strcmp(oidp->oid_name,"target")) + pie_sysctl.qdelay_ref = value; + else if (!strcmp(oidp->oid_name,"tupdate")) + pie_sysctl.tupdate = value; + else + pie_sysctl.max_burst = value; + return (0); +} + +static int +pie_sysctl_max_ecnth_handler(SYSCTL_HANDLER_ARGS) +{ + int error; + long value; + + value = pie_sysctl.max_ecnth; + value = value * 1000 / PIE_SCALE; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (value < 1 || value > PIE_SCALE) + return (EINVAL); + value = (value * PIE_SCALE) / 1000; + pie_sysctl.max_ecnth = value; + return (0); +} + +/* define PIE sysctl variables */ +SYSBEGIN(f4) +SYSCTL_DECL(_net_inet); +SYSCTL_DECL(_net_inet_ip); +SYSCTL_DECL(_net_inet_ip_dummynet); +static SYSCTL_NODE(_net_inet_ip_dummynet, OID_AUTO, + pie, CTLFLAG_RW, 0, "PIE"); + +#ifdef SYSCTL_NODE +SYSCTL_PROC(_net_inet_ip_dummynet_pie, OID_AUTO, target, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + pie_sysctl_target_tupdate_maxb_handler, "L", + "queue target in microsecond"); +SYSCTL_PROC(_net_inet_ip_dummynet_pie, OID_AUTO, tupdate, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + pie_sysctl_target_tupdate_maxb_handler, "L", + "the frequency of drop probability calculation in microsecond"); +SYSCTL_PROC(_net_inet_ip_dummynet_pie, OID_AUTO, max_burst, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + pie_sysctl_target_tupdate_maxb_handler, "L", + "Burst allowance interval in microsecond"); + +SYSCTL_PROC(_net_inet_ip_dummynet_pie, OID_AUTO, max_ecnth, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + pie_sysctl_max_ecnth_handler, "L", + "ECN safeguard threshold scaled by 1000"); + +SYSCTL_PROC(_net_inet_ip_dummynet_pie, OID_AUTO, alpha, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + pie_sysctl_alpha_beta_handler, "L", + "PIE alpha scaled by 1000"); +SYSCTL_PROC(_net_inet_ip_dummynet_pie, OID_AUTO, beta, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + pie_sysctl_alpha_beta_handler, "L", + "beta scaled by 1000"); +#endif + + +/* + * Callout function for drop probability calculation + * This function is called over tupdate ms and takes pointer of PIE + * status variables as an argument + */ +static void +calculate_drop_prob(void *x) +{ + int64_t p, prob, oldprob; + struct dn_aqm_pie_parms *pprms; + struct pie_status *pst = (struct pie_status *) x; + + /* dealing with race condition */ + if (callout_pending(&pst->aqm_pie_callout)) { + /* callout was reset */ + mtx_unlock(&pst->lock_mtx); + return; + } + + if (!callout_active(&pst->aqm_pie_callout)) { + /* callout was stopped */ + mtx_unlock(&pst->lock_mtx); + mtx_destroy(&pst->lock_mtx); + free(x, M_DUMMYNET); + //pst->pq->aqm_status = NULL; + pie_desc.ref_count--; + return; + } + callout_deactivate(&pst->aqm_pie_callout); + + pprms = pst->parms; + prob = pst->drop_prob; + + /* calculate current qdelay */ + if (pprms->flags & PIE_DEPRATEEST_ENABLED) { + pst->current_qdelay = ((uint64_t)pst->pq->ni.len_bytes * + pst->avg_dq_time) >> PIE_DQ_THRESHOLD_BITS; + } + + /* calculate drop probability */ + p = (int64_t)pprms->alpha * + ((int64_t)pst->current_qdelay - (int64_t)pprms->qdelay_ref); + p +=(int64_t) pprms->beta * + ((int64_t)pst->current_qdelay - (int64_t)pst->qdelay_old); + + /* We PIE_MAX_PROB shift by 12-bits to increase the division precision */ + p *= (PIE_MAX_PROB << 12) / AQM_TIME_1S; + + /* auto-tune drop probability */ + if (prob< PIE_MAX_PROB * 0.000001) + p >>= 11 + PIE_FIX_POINT_BITS+12; + else if (prob < PIE_MAX_PROB * 0.00001) + p >>= 9 + PIE_FIX_POINT_BITS+12; + else if (prob < PIE_MAX_PROB * 0.0001) + p >>= 7 + PIE_FIX_POINT_BITS+12; + else if (prob < PIE_MAX_PROB * 0.001) + p >>= 5 + PIE_FIX_POINT_BITS+12; + else if (prob < PIE_MAX_PROB * 0.01) + p >>= 3 + PIE_FIX_POINT_BITS+12; + else if (prob < PIE_MAX_PROB * 0.1) + p >>= 1 + PIE_FIX_POINT_BITS+12; + else + p >>= PIE_FIX_POINT_BITS+12; + + oldprob = prob; + + /* Cap Drop adjustment */ + if ((pprms->flags & PIE_CAPDROP_ENABLED) && prob >= PIE_MAX_PROB / 10 + && p > PIE_MAX_PROB / 50 ) + p = PIE_MAX_PROB / 50; + + prob = prob + p; + + /* decay the drop probability exponentially */ + if (pst->current_qdelay == 0 && pst->qdelay_old == 0) + /* 0.98 ~= 1- 1/64 */ + prob = prob - (prob >> 6); + + + /* check for multiplication overflow/underflow */ + if (p>0) { + if (proboldprob) { + prob= 0; + D("underflow"); + } + + /* make drop probability between 0 and PIE_MAX_PROB*/ + if (prob < 0) + prob = 0; + else if (prob > PIE_MAX_PROB) + prob = PIE_MAX_PROB; + + pst->drop_prob = prob; + + /* store current queue delay value in old queue delay*/ + pst->qdelay_old = pst->current_qdelay; + + /* update burst allowance */ + if ((pst->sflags & PIE_ACTIVE) && pst->burst_allowance>0) { + + if (pst->burst_allowance > pprms->tupdate ) + pst->burst_allowance -= pprms->tupdate; + else + pst->burst_allowance = 0; + } + + /* reschedule calculate_drop_prob function */ + if (pst->sflags & PIE_ACTIVE) + callout_reset_sbt(&pst->aqm_pie_callout, + (uint64_t)pprms->tupdate * SBT_1US, 0, calculate_drop_prob, pst, 0); + + mtx_unlock(&pst->lock_mtx); +} + +/* + * Extract a packet from the head of queue 'q' + * Return a packet or NULL if the queue is empty. + * If getts is set, also extract packet's timestamp from mtag. + */ +static struct mbuf * +pie_extract_head(struct dn_queue *q, aqm_time_t *pkt_ts, int getts) +{ + struct m_tag *mtag; + struct mbuf *m = q->mq.head; + + if (m == NULL) + return m; + q->mq.head = m->m_nextpkt; + + /* Update stats */ + update_stats(q, -m->m_pkthdr.len, 0); + + if (q->ni.length == 0) /* queue is now idle */ + q->q_time = dn_cfg.curr_time; + + if (getts) { + /* extract packet TS*/ + mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL); + if (mtag == NULL) { + D("PIE timestamp mtag not found!"); + *pkt_ts = 0; + } else { + *pkt_ts = *(aqm_time_t *)(mtag + 1); + m_tag_delete(m,mtag); + } + } + return m; +} + +/* + * Initiate PIE variable and optionally activate it + */ +__inline static void +init_activate_pie(struct pie_status *pst, int resettimer) +{ + struct dn_aqm_pie_parms *pprms; + + mtx_lock(&pst->lock_mtx); + pprms = pst->parms; + pst->drop_prob = 0; + pst->qdelay_old = 0; + pst->burst_allowance = pprms->max_burst; + pst->accu_prob = 0; + pst->dq_count = 0; + pst->avg_dq_time = 0; + pst->sflags = PIE_INMEASUREMENT; + pst->measurement_start = AQM_UNOW; + + if (resettimer) { + pst->sflags |= PIE_ACTIVE; + callout_reset_sbt(&pst->aqm_pie_callout, + (uint64_t)pprms->tupdate * SBT_1US, + 0, calculate_drop_prob, pst, 0); + } + //DX(2, "PIE Activated"); + mtx_unlock(&pst->lock_mtx); +} + +/* + * Deactivate PIE and stop probe update callout + */ +__inline static void +deactivate_pie(struct pie_status *pst) +{ + mtx_lock(&pst->lock_mtx); + pst->sflags &= ~(PIE_ACTIVE | PIE_INMEASUREMENT); + callout_stop(&pst->aqm_pie_callout); + //D("PIE Deactivated"); + mtx_unlock(&pst->lock_mtx); +} + +/* + * Dequeue and return a pcaket from queue 'q' or NULL if 'q' is empty. + * Also, caculate depature time or queue delay using timestamp + */ +static struct mbuf * +aqm_pie_dequeue(struct dn_queue *q) +{ + struct mbuf *m; + struct dn_flow *ni; /* stats for scheduler instance */ + struct dn_aqm_pie_parms *pprms; + struct pie_status *pst; + aqm_time_t now; + aqm_time_t pkt_ts, dq_time; + int32_t w; + + pst = q->aqm_status; + pprms = pst->parms; + ni = &q->_si->ni; + + /*we extarct packet ts only when Departure Rate Estimation dis not used*/ + m = pie_extract_head(q, &pkt_ts, !(pprms->flags & PIE_DEPRATEEST_ENABLED)); + + if (!m || !(pst->sflags & PIE_ACTIVE)) + return m; + + now = AQM_UNOW; + if (pprms->flags & PIE_DEPRATEEST_ENABLED) { + /* calculate average depature time */ + if(pst->sflags & PIE_INMEASUREMENT) { + pst->dq_count += m->m_pkthdr.len; + + if (pst->dq_count >= PIE_DQ_THRESHOLD) { + dq_time = now - pst->measurement_start; + + /* + * if we don't have old avg dq_time i.e PIE is (re)initialized, + * don't use weight to calculate new avg_dq_time + */ + if(pst->avg_dq_time == 0) + pst->avg_dq_time = dq_time; + else { + /* + * weight = PIE_DQ_THRESHOLD/2^6, but we scaled + * weight by 2^8. Thus, scaled + * weight = PIE_DQ_THRESHOLD /2^8 + * */ + w = PIE_DQ_THRESHOLD >> 8; + pst->avg_dq_time = (dq_time* w + + (pst->avg_dq_time * ((1L << 8) - w))) >> 8; + pst->sflags &= ~PIE_INMEASUREMENT; + } + } + } + + /* + * Start new measurment cycle when the queue has + * PIE_DQ_THRESHOLD worth of bytes. + */ + if(!(pst->sflags & PIE_INMEASUREMENT) && + q->ni.len_bytes >= PIE_DQ_THRESHOLD) { + pst->sflags |= PIE_INMEASUREMENT; + pst->measurement_start = now; + pst->dq_count = 0; + } + } + /* Optionally, use packet timestamp to estimate queue delay */ + else + pst->current_qdelay = now - pkt_ts; + + return m; +} + +/* + * Enqueue a packet in q, subject to space and PIE queue management policy + * (whose parameters are in q->fs). + * Update stats for the queue and the scheduler. + * Return 0 on success, 1 on drop. The packet is consumed anyways. + */ +static int +aqm_pie_enqueue(struct dn_queue *q, struct mbuf* m) +{ + struct dn_fs *f; + uint64_t len; + uint32_t qlen; + struct pie_status *pst; + struct dn_aqm_pie_parms *pprms; + int t; + + len = m->m_pkthdr.len; + pst = q->aqm_status; + if(!pst) { + DX(2, "PIE queue is not initialized\n"); + update_stats(q, 0, 1); + FREE_PKT(m); + return 1; + } + + f = &(q->fs->fs); + pprms = pst->parms; + t = ENQUE; + + /* get current queue length in bytes or packets*/ + qlen = (f->flags & DN_QSIZE_BYTES) ? + q->ni.len_bytes : q->ni.length; + + /* check for queue size and drop the tail if exceed queue limit*/ + if (qlen >= f->qsize) + t = DROP; + /* drop/mark the packet when PIE is active and burst time elapsed */ + else if ((pst->sflags & PIE_ACTIVE) && pst->burst_allowance==0 + && drop_early(pst, q->ni.len_bytes) == DROP) { + /* + * if drop_prob over ECN threshold, drop the packet + * otherwise mark and enqueue it. + */ + if ((pprms->flags & PIE_ECN_ENABLED) && pst->drop_prob < + (pprms->max_ecnth << (PIE_PROB_BITS - PIE_FIX_POINT_BITS)) + && ecn_mark(m)) + t = ENQUE; + else + t = DROP; + } + + /* Turn PIE on when 1/3 of the queue is full */ + if (!(pst->sflags & PIE_ACTIVE) && qlen >= pst->one_third_q_size) { + init_activate_pie(pst, 1); + } + + /* Reset burst tolerance and optinally turn PIE off*/ + if ((pst->sflags & PIE_ACTIVE) && pst->drop_prob == 0 && + pst->current_qdelay < (pprms->qdelay_ref >> 1) && + pst->qdelay_old < (pprms->qdelay_ref >> 1)) { + + pst->burst_allowance = pprms->max_burst; + if ((pprms->flags & PIE_ON_OFF_MODE_ENABLED) && qlen<=0) + deactivate_pie(pst); + } + + /* Timestamp the packet if Departure Rate Estimation is disabled */ + if (t != DROP && !(pprms->flags & PIE_DEPRATEEST_ENABLED)) { + /* Add TS to mbuf as a TAG */ + struct m_tag *mtag; + mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL); + if (mtag == NULL) + mtag = m_tag_alloc(MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, + sizeof(aqm_time_t), M_NOWAIT); + if (mtag == NULL) { + m_freem(m); + t = DROP; + } + *(aqm_time_t *)(mtag + 1) = AQM_UNOW; + m_tag_prepend(m, mtag); + } + + if (t != DROP) { + mq_append(&q->mq, m); + update_stats(q, len, 0); + return (0); + } else { + update_stats(q, 0, 1); + + /* reset accu_prob after packet drop */ + pst->accu_prob = 0; + FREE_PKT(m); + return 1; + } + return 0; +} + +/* + * initialize PIE for queue 'q' + * First allocate memory for PIE status. + */ +static int +aqm_pie_init(struct dn_queue *q) +{ + struct pie_status *pst; + struct dn_aqm_pie_parms *pprms; + int err = 0; + + pprms = q->fs->aqmcfg; + + do { /* exit with break when error occurs*/ + if (!pprms){ + D("AQM_PIE is not configured"); + err = EINVAL; + break; + } + + q->aqm_status = malloc(sizeof(struct pie_status), + M_DUMMYNET, M_NOWAIT | M_ZERO); + if (q->aqm_status == NULL) { + D("cannot allocate PIE private data"); + err = ENOMEM ; + break; + } + + pst = q->aqm_status; + /* increase reference count for PIE module */ + pie_desc.ref_count++; + + pst->pq = q; + pst->parms = pprms; + + /* For speed optimization, we caculate 1/3 queue size once here */ + // we can use x/3 = (x >>2) + (x >>4) + (x >>7) + pst->one_third_q_size = q->fs->fs.qsize/3; + + mtx_init(&pst->lock_mtx, "mtx_pie", NULL, MTX_DEF); + callout_init_mtx(&pst->aqm_pie_callout, &pst->lock_mtx, + CALLOUT_RETURNUNLOCKED); + + pst->current_qdelay = 0; + init_activate_pie(pst, !(pprms->flags & PIE_ON_OFF_MODE_ENABLED)); + + //DX(2, "aqm_PIE_init"); + + } while(0); + + return err; +} + +/* + * Clean up PIE status for queue 'q' + * Destroy memory allocated for PIE status. + */ +static int +aqm_pie_cleanup(struct dn_queue *q) +{ + + if(!q) { + D("q is null"); + return 0; + } + struct pie_status *pst = q->aqm_status; + if(!pst) { + //D("queue is already cleaned up"); + return 0; + } + if(!q->fs || !q->fs->aqmcfg) { + D("fs is null or no cfg"); + return 1; + } + if (q->fs->aqmfp && q->fs->aqmfp->type !=DN_AQM_PIE) { + D("Not PIE fs (%d)", q->fs->fs.fs_nr); + return 1; + } + + mtx_lock(&pst->lock_mtx); + + /* stop callout timer */ + if (callout_stop(&pst->aqm_pie_callout) || !(pst->sflags & PIE_ACTIVE)) { + mtx_unlock(&pst->lock_mtx); + mtx_destroy(&pst->lock_mtx); + free(q->aqm_status, M_DUMMYNET); + q->aqm_status = NULL; + pie_desc.ref_count--; + return 0; + } else { + q->aqm_status = NULL; + mtx_unlock(&pst->lock_mtx); + DX(2, "PIE callout has not been stoped from cleanup!"); + return EBUSY; + } + return 0; +} + +/* + * Config PIE parameters + * also allocate memory for PIE configurations + */ +static int +aqm_pie_config(struct dn_fsk* fs, struct dn_extra_parms *ep, int len) +{ + struct dn_aqm_pie_parms *pcfg; + + int l = sizeof(struct dn_extra_parms); + if (len < l) { + D("invalid sched parms length got %d need %d", len, l); + return EINVAL; + } + /* we free the old cfg because maybe the orignal allocation + * was used for diffirent AQM type. + */ + if (fs->aqmcfg) { + free(fs->aqmcfg, M_DUMMYNET); + fs->aqmcfg = NULL; + } + + fs->aqmcfg = malloc(sizeof(struct dn_aqm_pie_parms), + M_DUMMYNET, M_NOWAIT | M_ZERO); + if (fs->aqmcfg== NULL) { + D("cannot allocate PIE configuration parameters"); + return ENOMEM; + } + + /* par array contains pie configuration as follow + * 0- qdelay_ref,1- tupdate, 2- max_burst + * 3- max_ecnth, 4- alpha, 5- beta, 6- flags + */ + + /* configure PIE parameters */ + pcfg = fs->aqmcfg; + + if (ep->par[0] < 0) + pcfg->qdelay_ref = pie_sysctl.qdelay_ref * AQM_TIME_1US; + else + pcfg->qdelay_ref = ep->par[0]; + if (ep->par[1] < 0) + pcfg->tupdate = pie_sysctl.tupdate * AQM_TIME_1US; + else + pcfg->tupdate = ep->par[1]; + if (ep->par[2] < 0) + pcfg->max_burst = pie_sysctl.max_burst * AQM_TIME_1US; + else + pcfg->max_burst = ep->par[2]; + if (ep->par[3] < 0) + pcfg->max_ecnth = pie_sysctl.max_ecnth; + else + pcfg->max_ecnth = ep->par[3]; + if (ep->par[4] < 0) + pcfg->alpha = pie_sysctl.alpha; + else + pcfg->alpha = ep->par[4]; + if (ep->par[5] < 0) + pcfg->beta = pie_sysctl.beta; + else + pcfg->beta = ep->par[5]; + if (ep->par[6] < 0) + pcfg->flags = pie_sysctl.flags; + else + pcfg->flags = ep->par[6]; + + /* bound PIE configurations */ + pcfg->qdelay_ref = BOUND_VAR(pcfg->qdelay_ref, 1, 10 * AQM_TIME_1S); + pcfg->tupdate = BOUND_VAR(pcfg->tupdate, 1, 10 * AQM_TIME_1S); + pcfg->max_burst = BOUND_VAR(pcfg->max_burst, 0, 10 * AQM_TIME_1S); + pcfg->max_ecnth = BOUND_VAR(pcfg->max_ecnth, 0, PIE_SCALE); + pcfg->alpha = BOUND_VAR(pcfg->alpha, 0, 7 * PIE_SCALE); + pcfg->beta = BOUND_VAR(pcfg->beta, 0 , 7 * PIE_SCALE); + + pie_desc.cfg_ref_count++; + //D("pie cfg_ref_count=%d", pie_desc.cfg_ref_count); + return 0; +} + +/* + * Deconfigure PIE and free memory allocation + */ +static int +aqm_pie_deconfig(struct dn_fsk* fs) +{ + if (fs && fs->aqmcfg) { + free(fs->aqmcfg, M_DUMMYNET); + fs->aqmcfg = NULL; + pie_desc.cfg_ref_count--; + } + return 0; +} + +/* + * Retrieve PIE configuration parameters. + */ +static int +aqm_pie_getconfig (struct dn_fsk *fs, struct dn_extra_parms * ep) +{ + struct dn_aqm_pie_parms *pcfg; + if (fs->aqmcfg) { + strcpy(ep->name, pie_desc.name); + pcfg = fs->aqmcfg; + ep->par[0] = pcfg->qdelay_ref / AQM_TIME_1US; + ep->par[1] = pcfg->tupdate / AQM_TIME_1US; + ep->par[2] = pcfg->max_burst / AQM_TIME_1US; + ep->par[3] = pcfg->max_ecnth; + ep->par[4] = pcfg->alpha; + ep->par[5] = pcfg->beta; + ep->par[6] = pcfg->flags; + + return 0; + } + return 1; +} + +static struct dn_aqm pie_desc = { + _SI( .type = ) DN_AQM_PIE, + _SI( .name = ) "PIE", + _SI( .ref_count = ) 0, + _SI( .cfg_ref_count = ) 0, + _SI( .enqueue = ) aqm_pie_enqueue, + _SI( .dequeue = ) aqm_pie_dequeue, + _SI( .config = ) aqm_pie_config, + _SI( .deconfig = ) aqm_pie_deconfig, + _SI( .getconfig = ) aqm_pie_getconfig, + _SI( .init = ) aqm_pie_init, + _SI( .cleanup = ) aqm_pie_cleanup, +}; + +DECLARE_DNAQM_MODULE(dn_aqm_pie, &pie_desc); +#endif Index: head/sys/netpfil/ipfw/dn_sched.h =================================================================== --- head/sys/netpfil/ipfw/dn_sched.h +++ head/sys/netpfil/ipfw/dn_sched.h @@ -132,6 +132,10 @@ int (*free_fsk)(struct dn_fsk *f); int (*new_queue)(struct dn_queue *q); int (*free_queue)(struct dn_queue *q); +#ifdef NEW_AQM + /* Getting scheduler extra parameters */ + int (*getconfig)(struct dn_schk *, struct dn_extra_parms *); +#endif /* run-time fields */ int ref_count; /* XXX number of instances in the system */ @@ -165,6 +169,11 @@ struct mbuf *m = q->mq.head; if (m == NULL) return NULL; +#ifdef NEW_AQM + /* Call AQM dequeue function */ + if (q->fs->aqmfp && q->fs->aqmfp->dequeue ) + return q->fs->aqmfp->dequeue(q); +#endif q->mq.head = m->m_nextpkt; q->mq.count--; Index: head/sys/netpfil/ipfw/dn_sched_fifo.c =================================================================== --- head/sys/netpfil/ipfw/dn_sched_fifo.c +++ head/sys/netpfil/ipfw/dn_sched_fifo.c @@ -42,6 +42,9 @@ #include #include #include +#ifdef NEW_AQM +#include +#endif #include #else #include @@ -116,6 +119,9 @@ _SI( .free_fsk = ) NULL, _SI( .new_queue = ) NULL, _SI( .free_queue = ) NULL, +#ifdef NEW_AQM + _SI( .getconfig = ) NULL, +#endif }; DECLARE_DNSCHED_MODULE(dn_fifo, &fifo_desc); Index: head/sys/netpfil/ipfw/dn_sched_fq_codel.h =================================================================== --- head/sys/netpfil/ipfw/dn_sched_fq_codel.h +++ head/sys/netpfil/ipfw/dn_sched_fq_codel.h @@ -0,0 +1,167 @@ +/*- + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * FQ_Codel Structures and helper functions + * + * $FreeBSD$ + */ + +#ifndef _IP_DN_SCHED_FQ_CODEL_H +#define _IP_DN_SCHED_FQ_CODEL_H + +/* list of queues */ +STAILQ_HEAD(fq_codel_list, fq_codel_flow) ; + +/* fq_codel parameters including codel */ +struct dn_sch_fq_codel_parms { + struct dn_aqm_codel_parms ccfg; /* CoDel Parameters */ + /* FQ_CODEL Parameters */ + uint32_t flows_cnt; /* number of flows */ + uint32_t limit; /* hard limit of fq_codel queue size*/ + uint32_t quantum; +}; /* defaults */ + +/* flow (sub-queue) stats */ +struct flow_stats { + uint64_t tot_pkts; /* statistics counters */ + uint64_t tot_bytes; + uint32_t length; /* Queue length, in packets */ + uint32_t len_bytes; /* Queue length, in bytes */ + uint32_t drops; +}; + +/* A flow of packets (sub-queue).*/ +struct fq_codel_flow { + struct mq mq; /* list of packets */ + struct flow_stats stats; /* statistics */ + int deficit; + int active; /* 1: flow is active (in a list) */ + struct codel_status cst; + STAILQ_ENTRY(fq_codel_flow) flowchain; +}; + +/* extra fq_codel scheduler configurations */ +struct fq_codel_schk { + struct dn_sch_fq_codel_parms cfg; +}; + +/* fq_codel scheduler instance */ +struct fq_codel_si { + struct dn_sch_inst _si; /* standard scheduler instance */ + struct dn_queue main_q; /* main queue is after si directly */ + + struct fq_codel_flow *flows; /* array of flows (queues) */ + uint32_t perturbation; /* random value */ + struct fq_codel_list newflows; /* list of new queues */ + struct fq_codel_list oldflows; /* list of old queues */ +}; + +/* Helper function to update queue&main-queue and scheduler statistics. + * negative len + drop -> drop + * negative len -> dequeue + * positive len -> enqueue + * positive len + drop -> drop during enqueue + */ +__inline static void +fq_update_stats(struct fq_codel_flow *q, struct fq_codel_si *si, int len, + int drop) +{ + int inc = 0; + + if (len < 0) + inc = -1; + else if (len > 0) + inc = 1; + + if (drop) { + si->main_q.ni.drops ++; + q->stats.drops ++; + si->_si.ni.drops ++; + io_pkt_drop ++; + } + + if (!drop || (drop && len < 0)) { + /* Update stats for the main queue */ + si->main_q.ni.length += inc; + si->main_q.ni.len_bytes += len; + + /*update sub-queue stats */ + q->stats.length += inc; + q->stats.len_bytes += len; + + /*update scheduler instance stats */ + si->_si.ni.length += inc; + si->_si.ni.len_bytes += len; + } + + if (inc > 0) { + si->main_q.ni.tot_bytes += len; + si->main_q.ni.tot_pkts ++; + + q->stats.tot_bytes +=len; + q->stats.tot_pkts++; + + si->_si.ni.tot_bytes +=len; + si->_si.ni.tot_pkts ++; + } + +} + +/* extract the head of fq_codel sub-queue */ +__inline static struct mbuf * +fq_codel_extract_head(struct fq_codel_flow *q, aqm_time_t *pkt_ts, struct fq_codel_si *si) +{ + struct mbuf *m = q->mq.head; + + if (m == NULL) + return m; + q->mq.head = m->m_nextpkt; + + fq_update_stats(q, si, -m->m_pkthdr.len, 0); + + if (si->main_q.ni.length == 0) /* queue is now idle */ + si->main_q.q_time = dn_cfg.curr_time; + + /* extract packet timestamp*/ + struct m_tag *mtag; + mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL); + if (mtag == NULL){ + D("timestamp tag is not found!"); + *pkt_ts = 0; + } else { + *pkt_ts = *(aqm_time_t *)(mtag + 1); + m_tag_delete(m,mtag); + } + + return m; +} + + +#endif Index: head/sys/netpfil/ipfw/dn_sched_fq_codel.c =================================================================== --- head/sys/netpfil/ipfw/dn_sched_fq_codel.c +++ head/sys/netpfil/ipfw/dn_sched_fq_codel.c @@ -0,0 +1,617 @@ +/* + * FQ_Codel - The FlowQueue-Codel scheduler/AQM + * + * $FreeBSD$ + * + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifdef _KERNEL +#include +#include +//#include +#include +#include +#include +#include /* IFNAMSIZ */ +#include +#include /* ipfw_rule_ref */ +#include /* flow_id */ +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#else +#include +#endif + +/* NOTE: In fq_codel module, we reimplements CoDel AQM functions + * because fq_codel use different flows (sub-queues) structure and + * dn_queue includes many variables not needed by a flow (sub-queue + * )i.e. avoid extra overhead (88 bytes vs 208 bytes). + * Also, CoDel functions manages stats of sub-queues as well as the main queue. + */ + +#define DN_SCHED_FQ_CODEL 6 + +static struct dn_alg fq_codel_desc; + +/* fq_codel default parameters including codel */ +struct dn_sch_fq_codel_parms +fq_codel_sysctl = {{5000 * AQM_TIME_1US, 100000 * AQM_TIME_1US, + CODEL_ECN_ENABLED}, 1024, 10240, 1514}; + +static int +fqcodel_sysctl_interval_handler(SYSCTL_HANDLER_ARGS) +{ + int error; + long value; + + value = fq_codel_sysctl.ccfg.interval; + value /= AQM_TIME_1US; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (value < 1 || value > 100 * AQM_TIME_1S) + return (EINVAL); + fq_codel_sysctl.ccfg.interval = value * AQM_TIME_1US ; + + return (0); +} + +static int +fqcodel_sysctl_target_handler(SYSCTL_HANDLER_ARGS) +{ + int error; + long value; + + value = fq_codel_sysctl.ccfg.target; + value /= AQM_TIME_1US; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (value < 1 || value > 5 * AQM_TIME_1S) + return (EINVAL); + fq_codel_sysctl.ccfg.target = value * AQM_TIME_1US ; + + return (0); +} + + +SYSBEGIN(f4) + +SYSCTL_DECL(_net_inet); +SYSCTL_DECL(_net_inet_ip); +SYSCTL_DECL(_net_inet_ip_dummynet); +static SYSCTL_NODE(_net_inet_ip_dummynet, OID_AUTO, fqcodel, + CTLFLAG_RW, 0, "FQ_CODEL"); + +#ifdef SYSCTL_NODE + +SYSCTL_PROC(_net_inet_ip_dummynet_fqcodel, OID_AUTO, target, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, fqcodel_sysctl_target_handler, "L", + "FQ_CoDel target in microsecond"); +SYSCTL_PROC(_net_inet_ip_dummynet_fqcodel, OID_AUTO, interval, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, fqcodel_sysctl_interval_handler, "L", + "FQ_CoDel interval in microsecond"); + +SYSCTL_UINT(_net_inet_ip_dummynet_fqcodel, OID_AUTO, quantum, + CTLFLAG_RW, &fq_codel_sysctl.quantum, 1514, "FQ_CoDel quantum"); +SYSCTL_UINT(_net_inet_ip_dummynet_fqcodel, OID_AUTO, flows, + CTLFLAG_RW, &fq_codel_sysctl.flows_cnt, 1024, + "Number of queues for FQ_CoDel"); +SYSCTL_UINT(_net_inet_ip_dummynet_fqcodel, OID_AUTO, limit, + CTLFLAG_RW, &fq_codel_sysctl.limit, 10240, "FQ_CoDel queues size limit"); +#endif + +/* Drop a packet form the head of codel queue */ +static void +codel_drop_head(struct fq_codel_flow *q, struct fq_codel_si *si) +{ + struct mbuf *m = q->mq.head; + + if (m == NULL) + return; + q->mq.head = m->m_nextpkt; + + fq_update_stats(q, si, -m->m_pkthdr.len, 1); + + if (si->main_q.ni.length == 0) /* queue is now idle */ + si->main_q.q_time = dn_cfg.curr_time; + + FREE_PKT(m); +} + +/* Enqueue a packet 'm' to a queue 'q' and add timestamp to that packet. + * Return 1 when unable to add timestamp, otherwise return 0 + */ +static int +codel_enqueue(struct fq_codel_flow *q, struct mbuf *m, struct fq_codel_si *si) +{ + uint64_t len; + + len = m->m_pkthdr.len; + /* finding maximum packet size */ + if (len > q->cst.maxpkt_size) + q->cst.maxpkt_size = len; + + /* Add timestamp to mbuf as MTAG */ + struct m_tag *mtag; + mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL); + if (mtag == NULL) + mtag = m_tag_alloc(MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, sizeof(aqm_time_t), + M_NOWAIT); + if (mtag == NULL) { + m_freem(m); + goto drop; + } + *(aqm_time_t *)(mtag + 1) = AQM_UNOW; + m_tag_prepend(m, mtag); + + mq_append(&q->mq, m); + fq_update_stats(q, si, len, 0); + return 0; + +drop: + fq_update_stats(q, si, len, 1); + m_freem(m); + return 1; +} + +/* + * Classify a packet to queue number using Jenkins hash function. + * Return: queue number + * the input of the hash are protocol no, perturbation, src IP, dst IP, + * src port, dst port, + */ +static inline int +fq_codel_classify_flow(struct mbuf *m, uint16_t fcount, struct fq_codel_si *si) +{ + struct ip *ip; + struct tcphdr *th; + struct udphdr *uh; + uint8_t tuple[41]; + uint16_t hash=0; + +//#ifdef INET6 + struct ip6_hdr *ip6; + int isip6; + isip6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; + + if(isip6) { + ip6 = mtod(m, struct ip6_hdr *); + *((uint8_t *) &tuple[0]) = ip6->ip6_nxt; + *((uint32_t *) &tuple[1]) = si->perturbation; + memcpy(&tuple[5], ip6->ip6_src.s6_addr, 16); + memcpy(&tuple[21], ip6->ip6_dst.s6_addr, 16); + + switch (ip6->ip6_nxt) { + case IPPROTO_TCP: + th = (struct tcphdr *)(ip6 + 1); + *((uint16_t *) &tuple[37]) = th->th_dport; + *((uint16_t *) &tuple[39]) = th->th_sport; + break; + + case IPPROTO_UDP: + uh = (struct udphdr *)(ip6 + 1); + *((uint16_t *) &tuple[37]) = uh->uh_dport; + *((uint16_t *) &tuple[39]) = uh->uh_sport; + break; + default: + memset(&tuple[37], 0, 4); + + } + + hash = jenkins_hash(tuple, 41, HASHINIT) % fcount; + return hash; + } +//#endif + + /* IPv4 */ + ip = mtod(m, struct ip *); + *((uint8_t *) &tuple[0]) = ip->ip_p; + *((uint32_t *) &tuple[1]) = si->perturbation; + *((uint32_t *) &tuple[5]) = ip->ip_src.s_addr; + *((uint32_t *) &tuple[9]) = ip->ip_dst.s_addr; + + switch (ip->ip_p) { + case IPPROTO_TCP: + th = (struct tcphdr *)(ip + 1); + *((uint16_t *) &tuple[13]) = th->th_dport; + *((uint16_t *) &tuple[15]) = th->th_sport; + break; + + case IPPROTO_UDP: + uh = (struct udphdr *)(ip + 1); + *((uint16_t *) &tuple[13]) = uh->uh_dport; + *((uint16_t *) &tuple[15]) = uh->uh_sport; + break; + default: + memset(&tuple[13], 0, 4); + + } + hash = jenkins_hash(tuple, 17, HASHINIT) % fcount; + + return hash; +} + +/* + * Enqueue a packet into an appropriate queue according to + * FQ_CODEL algorithm. + */ +static int +fq_codel_enqueue(struct dn_sch_inst *_si, struct dn_queue *_q, + struct mbuf *m) +{ + struct fq_codel_si *si; + struct fq_codel_schk *schk; + struct dn_sch_fq_codel_parms *param; + struct dn_queue *mainq; + int idx, drop, i, maxidx; + + mainq = (struct dn_queue *)(_si + 1); + si = (struct fq_codel_si *)_si; + schk = (struct fq_codel_schk *)(si->_si.sched+1); + param = &schk->cfg; + + /* classify a packet to queue number*/ + idx = fq_codel_classify_flow(m, param->flows_cnt, si); + /* enqueue packet into appropriate queue using CoDel AQM. + * Note: 'codel_enqueue' function returns 1 only when it unable to + * add timestamp to packet (no limit check)*/ + drop = codel_enqueue(&si->flows[idx], m, si); + + /* codel unable to timestamp a packet */ + if (drop) + return 1; + + /* If the flow (sub-queue) is not active ,then add it to the tail of + * new flows list, initialize and activate it. + */ + if (!si->flows[idx].active ) { + STAILQ_INSERT_TAIL(&si->newflows, &si->flows[idx], flowchain); + si->flows[idx].deficit = param->quantum; + si->flows[idx].cst.dropping = false; + si->flows[idx].cst.first_above_time = 0; + si->flows[idx].active = 1; + //D("activate %d",idx); + } + + /* check the limit for all queues and remove a packet from the + * largest one + */ + if (mainq->ni.length > schk->cfg.limit) { D("over limit"); + /* find first active flow */ + for (maxidx = 0; maxidx < schk->cfg.flows_cnt; maxidx++) + if (si->flows[maxidx].active) + break; + if (maxidx < schk->cfg.flows_cnt) { + /* find the largest sub- queue */ + for (i = maxidx + 1; i < schk->cfg.flows_cnt; i++) + if (si->flows[i].active && si->flows[i].stats.length > + si->flows[maxidx].stats.length) + maxidx = i; + codel_drop_head(&si->flows[maxidx], si); + D("maxidx = %d",maxidx); + drop = 1; + } + } + + return drop; +} + +/* + * Dequeue a packet from an appropriate queue according to + * FQ_CODEL algorithm. + */ +static struct mbuf * +fq_codel_dequeue(struct dn_sch_inst *_si) +{ + struct fq_codel_si *si; + struct fq_codel_schk *schk; + struct dn_sch_fq_codel_parms *param; + struct fq_codel_flow *f; + struct mbuf *mbuf; + struct fq_codel_list *fq_codel_flowlist; + + si = (struct fq_codel_si *)_si; + schk = (struct fq_codel_schk *)(si->_si.sched+1); + param = &schk->cfg; + + do { + /* select a list to start with */ + if (STAILQ_EMPTY(&si->newflows)) + fq_codel_flowlist = &si->oldflows; + else + fq_codel_flowlist = &si->newflows; + + /* Both new and old queue lists are empty, return NULL */ + if (STAILQ_EMPTY(fq_codel_flowlist)) + return NULL; + + f = STAILQ_FIRST(fq_codel_flowlist); + while (f != NULL) { + /* if there is no flow(sub-queue) deficit, increase deficit + * by quantum, move the flow to the tail of old flows list + * and try another flow. + * Otherwise, the flow will be used for dequeue. + */ + if (f->deficit < 0) { + f->deficit += param->quantum; + STAILQ_REMOVE_HEAD(fq_codel_flowlist, flowchain); + STAILQ_INSERT_TAIL(&si->oldflows, f, flowchain); + } else + break; + + f = STAILQ_FIRST(fq_codel_flowlist); + } + + /* the new flows list is empty, try old flows list */ + if (STAILQ_EMPTY(fq_codel_flowlist)) + continue; + + /* Dequeue a packet from the selected flow */ + mbuf = fqc_codel_dequeue(f, si); + + /* Codel did not return a packet */ + if (!mbuf) { + /* If the selected flow belongs to new flows list, then move + * it to the tail of old flows list. Otherwise, deactivate it and + * remove it from the old list and + */ + if (fq_codel_flowlist == &si->newflows) { + STAILQ_REMOVE_HEAD(fq_codel_flowlist, flowchain); + STAILQ_INSERT_TAIL(&si->oldflows, f, flowchain); + } else { + f->active = 0; + STAILQ_REMOVE_HEAD(fq_codel_flowlist, flowchain); + } + /* start again */ + continue; + } + + /* we have a packet to return, + * update flow deficit and return the packet*/ + f->deficit -= mbuf->m_pkthdr.len; + return mbuf; + + } while (1); + + /* unreachable point */ + return NULL; +} + +/* + * Initialize fq_codel scheduler instance. + * also, allocate memory for flows array. + */ +static int +fq_codel_new_sched(struct dn_sch_inst *_si) +{ + struct fq_codel_si *si; + struct dn_queue *q; + struct fq_codel_schk *schk; + int i; + + si = (struct fq_codel_si *)_si; + schk = (struct fq_codel_schk *)(_si->sched+1); + + if(si->flows) { + D("si already configured!"); + return 0; + } + + /* init the main queue */ + q = &si->main_q; + set_oid(&q->ni.oid, DN_QUEUE, sizeof(*q)); + q->_si = _si; + q->fs = _si->sched->fs; + + /* allocate memory for flows array */ + si->flows = malloc(schk->cfg.flows_cnt * sizeof(struct fq_codel_flow), + M_DUMMYNET, M_NOWAIT | M_ZERO); + if (si->flows == NULL) { + D("cannot allocate memory for fq_codel configuration parameters"); + return ENOMEM ; + } + + /* init perturbation for this si */ + si->perturbation = random(); + + /* init the old and new flows lists */ + STAILQ_INIT(&si->newflows); + STAILQ_INIT(&si->oldflows); + + /* init the flows (sub-queues) */ + for (i = 0; i < schk->cfg.flows_cnt; i++) { + /* init codel */ + si->flows[i].cst.maxpkt_size = 500; + } + + fq_codel_desc.ref_count++; + return 0; +} + +/* + * Free fq_codel scheduler instance. + */ +static int +fq_codel_free_sched(struct dn_sch_inst *_si) +{ + struct fq_codel_si *si = (struct fq_codel_si *)_si ; + + /* free the flows array */ + free(si->flows , M_DUMMYNET); + si->flows = NULL; + fq_codel_desc.ref_count--; + + return 0; +} + +/* + * Configure fq_codel scheduler. + * the configurations for the scheduler is passed from userland. + */ +static int +fq_codel_config(struct dn_schk *_schk) +{ + struct fq_codel_schk *schk; + struct dn_extra_parms *ep; + struct dn_sch_fq_codel_parms *fqc_cfg; + + schk = (struct fq_codel_schk *)(_schk+1); + ep = (struct dn_extra_parms *) _schk->cfg; + + /* par array contains fq_codel configuration as follow + * Codel: 0- target,1- interval, 2- flags + * FQ_CODEL: 3- quantum, 4- limit, 5- flows + */ + if (ep && ep->oid.len ==sizeof(*ep) && + ep->oid.subtype == DN_SCH_PARAMS) { + + fqc_cfg = &schk->cfg; + if (ep->par[0] < 0) + fqc_cfg->ccfg.target = fq_codel_sysctl.ccfg.target; + else + fqc_cfg->ccfg.target = ep->par[0] * AQM_TIME_1US; + + if (ep->par[1] < 0) + fqc_cfg->ccfg.interval = fq_codel_sysctl.ccfg.interval; + else + fqc_cfg->ccfg.interval = ep->par[1] * AQM_TIME_1US; + + if (ep->par[2] < 0) + fqc_cfg->ccfg.flags = 0; + else + fqc_cfg->ccfg.flags = ep->par[2]; + + /* FQ configurations */ + if (ep->par[3] < 0) + fqc_cfg->quantum = fq_codel_sysctl.quantum; + else + fqc_cfg->quantum = ep->par[3]; + + if (ep->par[4] < 0) + fqc_cfg->limit = fq_codel_sysctl.limit; + else + fqc_cfg->limit = ep->par[4]; + + if (ep->par[5] < 0) + fqc_cfg->flows_cnt = fq_codel_sysctl.flows_cnt; + else + fqc_cfg->flows_cnt = ep->par[5]; + + /* Bound the configurations */ + fqc_cfg->ccfg.target = BOUND_VAR(fqc_cfg->ccfg.target, 1 , + 5 * AQM_TIME_1S); ; + fqc_cfg->ccfg.interval = BOUND_VAR(fqc_cfg->ccfg.interval, 1, + 100 * AQM_TIME_1S); + + fqc_cfg->quantum = BOUND_VAR(fqc_cfg->quantum,1, 9000); + fqc_cfg->limit= BOUND_VAR(fqc_cfg->limit,1,20480); + fqc_cfg->flows_cnt= BOUND_VAR(fqc_cfg->flows_cnt,1,65536); + } + else + return 1; + + return 0; +} + +/* + * Return fq_codel scheduler configurations + * the configurations for the scheduler is passed to userland. + */ +static int +fq_codel_getconfig (struct dn_schk *_schk, struct dn_extra_parms *ep) { + + struct fq_codel_schk *schk = (struct fq_codel_schk *)(_schk+1); + struct dn_sch_fq_codel_parms *fqc_cfg; + + fqc_cfg = &schk->cfg; + + strcpy(ep->name, fq_codel_desc.name); + ep->par[0] = fqc_cfg->ccfg.target / AQM_TIME_1US; + ep->par[1] = fqc_cfg->ccfg.interval / AQM_TIME_1US; + ep->par[2] = fqc_cfg->ccfg.flags; + + ep->par[3] = fqc_cfg->quantum; + ep->par[4] = fqc_cfg->limit; + ep->par[5] = fqc_cfg->flows_cnt; + + return 0; +} + +/* + * fq_codel scheduler descriptor + * contains the type of the scheduler, the name, the size of extra + * data structures, and function pointers. + */ +static struct dn_alg fq_codel_desc = { + _SI( .type = ) DN_SCHED_FQ_CODEL, + _SI( .name = ) "FQ_CODEL", + _SI( .flags = ) 0, + + _SI( .schk_datalen = ) sizeof(struct fq_codel_schk), + _SI( .si_datalen = ) sizeof(struct fq_codel_si) - sizeof(struct dn_sch_inst), + _SI( .q_datalen = ) 0, + + _SI( .enqueue = ) fq_codel_enqueue, + _SI( .dequeue = ) fq_codel_dequeue, + _SI( .config = ) fq_codel_config, /* new sched i.e. sched X config ...*/ + _SI( .destroy = ) NULL, /*sched x delete */ + _SI( .new_sched = ) fq_codel_new_sched, /* new schd instance */ + _SI( .free_sched = ) fq_codel_free_sched, /* delete schd instance */ + _SI( .new_fsk = ) NULL, + _SI( .free_fsk = ) NULL, + _SI( .new_queue = ) NULL, + _SI( .free_queue = ) NULL, + _SI( .getconfig = ) fq_codel_getconfig, + _SI( .ref_count = ) 0 +}; + +DECLARE_DNSCHED_MODULE(dn_fq_codel, &fq_codel_desc); Index: head/sys/netpfil/ipfw/dn_sched_fq_codel_helper.h =================================================================== --- head/sys/netpfil/ipfw/dn_sched_fq_codel_helper.h +++ head/sys/netpfil/ipfw/dn_sched_fq_codel_helper.h @@ -0,0 +1,187 @@ +/* + * Codel - The Controlled-Delay Active Queue Management algorithm. + * + * $FreeBSD$ + * + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi + * + * Copyright (C) 2011-2014 Kathleen Nichols . + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * o Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * + * o Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * o The names of the authors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * Alternatively, provided that this notice is retained in full, this + * software may be distributed under the terms of the GNU General Public + * License ("GPL") version 2, in which case the provisions of the GPL + * apply INSTEAD OF those given above. + + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _IP_DN_SCHED_FQ_CODEL_HELPER_H +#define _IP_DN_SCHED_FQ_CODEL_HELPER_H + +__inline static struct mbuf * +fqc_dodequeue(struct fq_codel_flow *q, aqm_time_t now, uint16_t *ok_to_drop, + struct fq_codel_si *si) +{ + struct mbuf * m; + struct fq_codel_schk *schk = (struct fq_codel_schk *)(si->_si.sched+1); + aqm_time_t pkt_ts, sojourn_time; + + *ok_to_drop = 0; + m = fq_codel_extract_head(q, &pkt_ts, si); + + if (m == NULL) { + /*queue is empty - we can't be above target*/ + q->cst.first_above_time= 0; + return m; + } + + /* To span a large range of bandwidths, CoDel runs two + * different AQMs in parallel. One is sojourn-time-based + * and takes effect when the time to send an MTU-sized + * packet is less than target. The 1st term of the "if" + * below does this. The other is backlog-based and takes + * effect when the time to send an MTU-sized packet is >= + * target. The goal here is to keep the output link + * utilization high by never allowing the queue to get + * smaller than the amount that arrives in a typical + * interarrival time (MTU-sized packets arriving spaced + * by the amount of time it takes to send such a packet on + * the bottleneck). The 2nd term of the "if" does this. + */ + sojourn_time = now - pkt_ts; + if (sojourn_time < schk->cfg.ccfg.target || q->stats.len_bytes <= q->cst.maxpkt_size) { + /* went below - stay below for at least interval */ + q->cst.first_above_time = 0; + } else { + if (q->cst.first_above_time == 0) { + /* just went above from below. if still above at + * first_above_time, will say it's ok to drop. */ + q->cst.first_above_time = now + schk->cfg.ccfg.interval; + } else if (now >= q->cst.first_above_time) { + *ok_to_drop = 1; + } + } + return m; +} + +/* Codel dequeue function */ +__inline static struct mbuf * +fqc_codel_dequeue(struct fq_codel_flow *q, struct fq_codel_si *si) +{ + struct mbuf *m; + struct dn_aqm_codel_parms *cprms; + struct codel_status *cst; + aqm_time_t now; + uint16_t ok_to_drop; + struct fq_codel_schk *schk = (struct fq_codel_schk *)(si->_si.sched+1); + + cst = &q->cst; + cprms = &schk->cfg.ccfg; + + now = AQM_UNOW; + m = fqc_dodequeue(q, now, &ok_to_drop, si); + + if (cst->dropping) { + if (!ok_to_drop) { + /* sojourn time below target - leave dropping state */ + cst->dropping = false; + } + + /* Time for the next drop. Drop current packet and dequeue + * next. If the dequeue doesn't take us out of dropping + * state, schedule the next drop. A large backlog might + * result in drop rates so high that the next drop should + * happen now, hence the 'while' loop. + */ + while (now >= cst->drop_next_time && cst->dropping) { + + /* mark the packet */ + if (cprms->flags & CODEL_ECN_ENABLED && ecn_mark(m)) { + cst->count++; + /* schedule the next mark. */ + cst->drop_next_time = control_law(cst, cprms, cst->drop_next_time); + return m; + } + + /* drop the packet */ + fq_update_stats(q, si, 0, 1); + m_freem(m); + m = fqc_dodequeue(q, now, &ok_to_drop, si); + + if (!ok_to_drop) { + /* leave dropping state */ + cst->dropping = false; + } else { + cst->count++; + /* schedule the next drop. */ + cst->drop_next_time = control_law(cst, cprms, cst->drop_next_time); + } + } + /* If we get here we're not in dropping state. The 'ok_to_drop' + * return from dodequeue means that the sojourn time has been + * above 'target' for 'interval' so enter dropping state. + */ + } else if (ok_to_drop) { + + /* if ECN option is disabled or the packet cannot be marked, + * drop the packet and extract another. + */ + if (!(cprms->flags & CODEL_ECN_ENABLED) || !ecn_mark(m)) { + fq_update_stats(q, si, 0, 1); + m_freem(m); + m = fqc_dodequeue(q, now, &ok_to_drop,si); + } + + cst->dropping = true; + + /* If min went above target close to when it last went + * below, assume that the drop rate that controlled the + * queue on the last cycle is a good starting point to + * control it now. ('drop_next' will be at most 'interval' + * later than the time of the last drop so 'now - drop_next' + * is a good approximation of the time from the last drop + * until now.) + */ + cst->count = (cst->count > 2 && ((aqm_stime_t)now - + (aqm_stime_t)cst->drop_next_time) < 8* cprms->interval)? cst->count - 2 : 1; + + /* we don't have to set initial guess for Newton's method isqrt as + * we initilaize isqrt in control_law function when count == 1 */ + cst->drop_next_time = control_law(cst, cprms, now); + } + + return m; +} + +#endif Index: head/sys/netpfil/ipfw/dn_sched_fq_pie.c =================================================================== --- head/sys/netpfil/ipfw/dn_sched_fq_pie.c +++ head/sys/netpfil/ipfw/dn_sched_fq_pie.c @@ -0,0 +1,1262 @@ +/* + * FQ_PIE - The FlowQueue-PIE scheduler/AQM + * + * $FreeBSD$ + * + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* Important note: + * As there is no an office document for FQ-PIE specification, we used + * FQ-CoDel algorithm with some modifications to implement FQ-PIE. + * This FQ-PIE implementation is a beta version and have not been tested + * extensively. Our FQ-PIE uses stand-alone PIE AQM per sub-queue. By + * default, timestamp is used to calculate queue delay instead of departure + * rate estimation method. Although departure rate estimation is available + * as testing option, the results could be incorrect. Moreover, turning PIE on + * and off option is available but it does not work properly in this version. + */ + + +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include +#include /* IFNAMSIZ */ +#include +#include /* ipfw_rule_ref */ +#include /* flow_id */ +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#else +#include +#endif + +#define DN_SCHED_FQ_PIE 7 + +/* list of queues */ +STAILQ_HEAD(fq_pie_list, fq_pie_flow) ; + +/* FQ_PIE parameters including PIE */ +struct dn_sch_fq_pie_parms { + struct dn_aqm_pie_parms pcfg; /* PIE configuration Parameters */ + /* FQ_PIE Parameters */ + uint32_t flows_cnt; /* number of flows */ + uint32_t limit; /* hard limit of FQ_PIE queue size*/ + uint32_t quantum; +}; + +/* flow (sub-queue) stats */ +struct flow_stats { + uint64_t tot_pkts; /* statistics counters */ + uint64_t tot_bytes; + uint32_t length; /* Queue length, in packets */ + uint32_t len_bytes; /* Queue length, in bytes */ + uint32_t drops; +}; + +/* A flow of packets (sub-queue)*/ +struct fq_pie_flow { + struct mq mq; /* list of packets */ + struct flow_stats stats; /* statistics */ + int deficit; + int active; /* 1: flow is active (in a list) */ + struct pie_status pst; /* pie status variables */ + struct fq_pie_si *psi; /* parent scheduler instance */ + STAILQ_ENTRY(fq_pie_flow) flowchain; +}; + +/* extra fq_pie scheduler configurations */ +struct fq_pie_schk { + struct dn_sch_fq_pie_parms cfg; +}; + +/* fq_pie scheduler instance */ +struct fq_pie_si { + struct dn_sch_inst _si; /* standard scheduler instance */ + struct dn_queue main_q; /* main queue is after si directly */ + uint32_t nr_active_q; + struct fq_pie_flow *flows; /* array of flows (queues) */ + uint32_t perturbation; /* random value */ + struct fq_pie_list newflows; /* list of new queues */ + struct fq_pie_list oldflows; /* list of old queues */ +}; + + +struct mem_to_free { + void *mem_flows; + void *mem_callout; +}; +static struct mtx freemem_mtx; +static struct dn_alg fq_pie_desc; + +/* Default FQ-PIE parameters including PIE */ +/* PIE defaults + * target=15ms, max_burst=150ms, max_ecnth=0.1, + * alpha=0.125, beta=1.25, tupdate=15ms + * FQ- + * flows=1024, limit=10240, quantum =1514 + */ +struct dn_sch_fq_pie_parms + fq_pie_sysctl = {{15000 * AQM_TIME_1US, 15000 * AQM_TIME_1US, + 150000 * AQM_TIME_1US, PIE_SCALE * 0.1, PIE_SCALE * 0.125, + PIE_SCALE * 1.25, PIE_CAPDROP_ENABLED | PIE_DERAND_ENABLED}, + 1024, 10240, 1514}; + +static int +fqpie_sysctl_alpha_beta_handler(SYSCTL_HANDLER_ARGS) +{ + int error; + long value; + + if (!strcmp(oidp->oid_name,"alpha")) + value = fq_pie_sysctl.pcfg.alpha; + else + value = fq_pie_sysctl.pcfg.beta; + + value = value * 1000 / PIE_SCALE; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (value < 1 || value > 7 * PIE_SCALE) + return (EINVAL); + value = (value * PIE_SCALE) / 1000; + if (!strcmp(oidp->oid_name,"alpha")) + fq_pie_sysctl.pcfg.alpha = value; + else + fq_pie_sysctl.pcfg.beta = value; + return (0); +} + +static int +fqpie_sysctl_target_tupdate_maxb_handler(SYSCTL_HANDLER_ARGS) +{ + int error; + long value; + + if (!strcmp(oidp->oid_name,"target")) + value = fq_pie_sysctl.pcfg.qdelay_ref; + else if (!strcmp(oidp->oid_name,"tupdate")) + value = fq_pie_sysctl.pcfg.tupdate; + else + value = fq_pie_sysctl.pcfg.max_burst; + + value = value / AQM_TIME_1US; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (value < 1 || value > 10 * AQM_TIME_1S) + return (EINVAL); + value = value * AQM_TIME_1US; + + if (!strcmp(oidp->oid_name,"target")) + fq_pie_sysctl.pcfg.qdelay_ref = value; + else if (!strcmp(oidp->oid_name,"tupdate")) + fq_pie_sysctl.pcfg.tupdate = value; + else + fq_pie_sysctl.pcfg.max_burst = value; + return (0); +} + +static int +fqpie_sysctl_max_ecnth_handler(SYSCTL_HANDLER_ARGS) +{ + int error; + long value; + + value = fq_pie_sysctl.pcfg.max_ecnth; + value = value * 1000 / PIE_SCALE; + error = sysctl_handle_long(oidp, &value, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (value < 1 || value > PIE_SCALE) + return (EINVAL); + value = (value * PIE_SCALE) / 1000; + fq_pie_sysctl.pcfg.max_ecnth = value; + return (0); +} + +/* define FQ- PIE sysctl variables */ +SYSBEGIN(f4) +SYSCTL_DECL(_net_inet); +SYSCTL_DECL(_net_inet_ip); +SYSCTL_DECL(_net_inet_ip_dummynet); +static SYSCTL_NODE(_net_inet_ip_dummynet, OID_AUTO, fqpie, + CTLFLAG_RW, 0, "FQ_PIE"); + +#ifdef SYSCTL_NODE + +SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, target, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + fqpie_sysctl_target_tupdate_maxb_handler, "L", + "queue target in microsecond"); + +SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, tupdate, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + fqpie_sysctl_target_tupdate_maxb_handler, "L", + "the frequency of drop probability calculation in microsecond"); + +SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, max_burst, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + fqpie_sysctl_target_tupdate_maxb_handler, "L", + "Burst allowance interval in microsecond"); + +SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, max_ecnth, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + fqpie_sysctl_max_ecnth_handler, "L", + "ECN safeguard threshold scaled by 1000"); + +SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, alpha, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + fqpie_sysctl_alpha_beta_handler, "L", "PIE alpha scaled by 1000"); + +SYSCTL_PROC(_net_inet_ip_dummynet_fqpie, OID_AUTO, beta, + CTLTYPE_LONG | CTLFLAG_RW, NULL, 0, + fqpie_sysctl_alpha_beta_handler, "L", "beta scaled by 1000"); + +SYSCTL_UINT(_net_inet_ip_dummynet_fqpie, OID_AUTO, quantum, + CTLFLAG_RW, &fq_pie_sysctl.quantum, 1514, "quantum for FQ_PIE"); +SYSCTL_UINT(_net_inet_ip_dummynet_fqpie, OID_AUTO, flows, + CTLFLAG_RW, &fq_pie_sysctl.flows_cnt, 1024, "Number of queues for FQ_PIE"); +SYSCTL_UINT(_net_inet_ip_dummynet_fqpie, OID_AUTO, limit, + CTLFLAG_RW, &fq_pie_sysctl.limit, 10240, "limit for FQ_PIE"); +#endif + +/* Helper function to update queue&main-queue and scheduler statistics. + * negative len & drop -> drop + * negative len -> dequeue + * positive len -> enqueue + * positive len + drop -> drop during enqueue + */ +__inline static void +fq_update_stats(struct fq_pie_flow *q, struct fq_pie_si *si, int len, + int drop) +{ + int inc = 0; + + if (len < 0) + inc = -1; + else if (len > 0) + inc = 1; + + if (drop) { + si->main_q.ni.drops ++; + q->stats.drops ++; + si->_si.ni.drops ++; + io_pkt_drop ++; + } + + if (!drop || (drop && len < 0)) { + /* Update stats for the main queue */ + si->main_q.ni.length += inc; + si->main_q.ni.len_bytes += len; + + /*update sub-queue stats */ + q->stats.length += inc; + q->stats.len_bytes += len; + + /*update scheduler instance stats */ + si->_si.ni.length += inc; + si->_si.ni.len_bytes += len; + } + + if (inc > 0) { + si->main_q.ni.tot_bytes += len; + si->main_q.ni.tot_pkts ++; + + q->stats.tot_bytes +=len; + q->stats.tot_pkts++; + + si->_si.ni.tot_bytes +=len; + si->_si.ni.tot_pkts ++; + } + +} + +/* + * Extract a packet from the head of sub-queue 'q' + * Return a packet or NULL if the queue is empty. + * If getts is set, also extract packet's timestamp from mtag. + */ +__inline static struct mbuf * +fq_pie_extract_head(struct fq_pie_flow *q, aqm_time_t *pkt_ts, + struct fq_pie_si *si, int getts) +{ + struct mbuf *m = q->mq.head; + + if (m == NULL) + return m; + q->mq.head = m->m_nextpkt; + + fq_update_stats(q, si, -m->m_pkthdr.len, 0); + + if (si->main_q.ni.length == 0) /* queue is now idle */ + si->main_q.q_time = dn_cfg.curr_time; + + if (getts) { + /* extract packet timestamp*/ + struct m_tag *mtag; + mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL); + if (mtag == NULL){ + D("PIE timestamp mtag not found!"); + *pkt_ts = 0; + } else { + *pkt_ts = *(aqm_time_t *)(mtag + 1); + m_tag_delete(m,mtag); + } + } + return m; +} + +/* + * Callout function for drop probability calculation + * This function is called over tupdate ms and takes pointer of FQ-PIE + * flow as an argument + */ +static void +fq_calculate_drop_prob(void *x) +{ + struct fq_pie_flow *q = (struct fq_pie_flow *) x; + struct pie_status *pst = &q->pst; + struct dn_aqm_pie_parms *pprms; + int64_t p, prob, oldprob; + aqm_time_t now; + + /* dealing with race condition */ + if (callout_pending(&pst->aqm_pie_callout)) { + /* callout was reset */ + mtx_unlock(&pst->lock_mtx); + return; + } + + if (!callout_active(&pst->aqm_pie_callout)) { + /* callout was stopped */ + mtx_unlock(&pst->lock_mtx); + mtx_destroy(&pst->lock_mtx); + q->psi->nr_active_q--; + return; + } + callout_deactivate(&pst->aqm_pie_callout); + + now = AQM_UNOW; + pprms = pst->parms; + prob = pst->drop_prob; + + /* calculate current qdelay */ + if (pprms->flags & PIE_DEPRATEEST_ENABLED) { + pst->current_qdelay = ((uint64_t)q->stats.len_bytes * pst->avg_dq_time) + >> PIE_DQ_THRESHOLD_BITS; + } + + /* calculate drop probability */ + p = (int64_t)pprms->alpha * + ((int64_t)pst->current_qdelay - (int64_t)pprms->qdelay_ref); + p +=(int64_t) pprms->beta * + ((int64_t)pst->current_qdelay - (int64_t)pst->qdelay_old); + + /* We PIE_MAX_PROB shift by 12-bits to increase the division precision */ + p *= (PIE_MAX_PROB << 12) / AQM_TIME_1S; + + /* auto-tune drop probability */ + if (prob< PIE_MAX_PROB * 0.000001) + p >>= 11 + PIE_FIX_POINT_BITS+12; + else if (prob < PIE_MAX_PROB * 0.00001) + p >>= 9 + PIE_FIX_POINT_BITS+12; + else if (prob < PIE_MAX_PROB * 0.0001) + p >>= 7 + PIE_FIX_POINT_BITS+12; + else if (prob < PIE_MAX_PROB * 0.001) + p >>= 5 + PIE_FIX_POINT_BITS+12; + else if (prob < PIE_MAX_PROB * 0.01) + p >>= 3 + PIE_FIX_POINT_BITS+12; + else if (prob < PIE_MAX_PROB * 0.1) + p >>= 1 + PIE_FIX_POINT_BITS+12; + else + p >>= PIE_FIX_POINT_BITS+12; + + oldprob = prob; + + /* Cap Drop adjustment */ + if ((pprms->flags & PIE_CAPDROP_ENABLED) && prob >= PIE_MAX_PROB / 10 + && p > PIE_MAX_PROB / 50 ) + p = PIE_MAX_PROB / 50; + + prob = prob + p; + + /* decay the drop probability exponentially */ + if (pst->current_qdelay == 0 && pst->qdelay_old == 0) + /* 0.98 ~= 1- 1/64 */ + prob = prob - (prob >> 6); + + + /* check for multiplication over/under flow */ + if (p>0) { + if (proboldprob) { + prob= 0; + D("underflow"); + } + + /* make drop probability between 0 and PIE_MAX_PROB*/ + if (prob < 0) + prob = 0; + else if (prob > PIE_MAX_PROB) + prob = PIE_MAX_PROB; + + pst->drop_prob = prob; + + /* store current delay value */ + pst->qdelay_old = pst->current_qdelay; + + /* update burst allowance */ + if ((pst->sflags & PIE_ACTIVE) && pst->burst_allowance) { + if (pst->burst_allowance > pprms->tupdate) + pst->burst_allowance -= pprms->tupdate; + else + pst->burst_allowance = 0; + } + + if (pst->sflags & PIE_ACTIVE) + callout_reset_sbt(&pst->aqm_pie_callout, + (uint64_t)pprms->tupdate * SBT_1US, + 0, fq_calculate_drop_prob, q, 0); + + mtx_unlock(&pst->lock_mtx); +} + +/* + * Reset PIE variables & activate the queue + */ +__inline static void +fq_activate_pie(struct fq_pie_flow *q) +{ + struct pie_status *pst = &q->pst; + struct dn_aqm_pie_parms *pprms; + + mtx_lock(&pst->lock_mtx); + pprms = pst->parms; + + pprms = pst->parms; + pst->drop_prob = 0; + pst->qdelay_old = 0; + pst->burst_allowance = pprms->max_burst; + pst->accu_prob = 0; + pst->dq_count = 0; + pst->avg_dq_time = 0; + pst->sflags = PIE_INMEASUREMENT | PIE_ACTIVE; + pst->measurement_start = AQM_UNOW; + + callout_reset_sbt(&pst->aqm_pie_callout, + (uint64_t)pprms->tupdate * SBT_1US, + 0, fq_calculate_drop_prob, q, 0); + + mtx_unlock(&pst->lock_mtx); +} + + + /* + * Deactivate PIE and stop probe update callout + */ +__inline static void +fq_deactivate_pie(struct pie_status *pst) +{ + mtx_lock(&pst->lock_mtx); + pst->sflags &= ~(PIE_ACTIVE | PIE_INMEASUREMENT); + callout_stop(&pst->aqm_pie_callout); + //D("PIE Deactivated"); + mtx_unlock(&pst->lock_mtx); +} + + /* + * Initialize PIE for sub-queue 'q' + */ +static int +pie_init(struct fq_pie_flow *q) +{ + struct pie_status *pst=&q->pst; + struct dn_aqm_pie_parms *pprms = pst->parms; + struct fq_pie_schk *fqpie_schk; + + fqpie_schk = (struct fq_pie_schk *)(q->psi->_si.sched+1); + int err = 0; + + if (!pprms){ + D("AQM_PIE is not configured"); + err = EINVAL; + } else { + q->psi->nr_active_q++; + + /* For speed optimization, we caculate 1/3 queue size once here */ + // XXX limit divided by number of queues divided by 3 ??? + pst->one_third_q_size = (fqpie_schk->cfg.limit / + fqpie_schk->cfg.flows_cnt) / 3; + + mtx_init(&pst->lock_mtx, "mtx_pie", NULL, MTX_DEF); + callout_init_mtx(&pst->aqm_pie_callout, &pst->lock_mtx, + CALLOUT_RETURNUNLOCKED); + } + + return err; +} + +/* + * Clean up PIE status for sub-queue 'q' + * Stop callout timer and destroy mtx + */ +static int +pie_cleanup(struct fq_pie_flow *q) +{ + struct pie_status *pst = &q->pst; + + mtx_lock(&pst->lock_mtx); + if (callout_stop(&pst->aqm_pie_callout) || !(pst->sflags & PIE_ACTIVE)) { + mtx_unlock(&pst->lock_mtx); + mtx_destroy(&pst->lock_mtx); + q->psi->nr_active_q--; + } else { + mtx_unlock(&pst->lock_mtx); + return EBUSY; + } + return 0; +} + +/* + * Dequeue and return a pcaket from sub-queue 'q' or NULL if 'q' is empty. + * Also, caculate depature time or queue delay using timestamp + */ + static struct mbuf * +pie_dequeue(struct fq_pie_flow *q, struct fq_pie_si *si) +{ + struct mbuf *m; + struct dn_aqm_pie_parms *pprms; + struct pie_status *pst; + aqm_time_t now; + aqm_time_t pkt_ts, dq_time; + int32_t w; + + pst = &q->pst; + pprms = q->pst.parms; + + /*we extarct packet ts only when Departure Rate Estimation dis not used*/ + m = fq_pie_extract_head(q, &pkt_ts, si, + !(pprms->flags & PIE_DEPRATEEST_ENABLED)); + + if (!m || !(pst->sflags & PIE_ACTIVE)) + return m; + + now = AQM_UNOW; + if (pprms->flags & PIE_DEPRATEEST_ENABLED) { + /* calculate average depature time */ + if(pst->sflags & PIE_INMEASUREMENT) { + pst->dq_count += m->m_pkthdr.len; + + if (pst->dq_count >= PIE_DQ_THRESHOLD) { + dq_time = now - pst->measurement_start; + + /* + * if we don't have old avg dq_time i.e PIE is (re)initialized, + * don't use weight to calculate new avg_dq_time + */ + if(pst->avg_dq_time == 0) + pst->avg_dq_time = dq_time; + else { + /* + * weight = PIE_DQ_THRESHOLD/2^6, but we scaled + * weight by 2^8. Thus, scaled + * weight = PIE_DQ_THRESHOLD /2^8 + * */ + w = PIE_DQ_THRESHOLD >> 8; + pst->avg_dq_time = (dq_time* w + + (pst->avg_dq_time * ((1L << 8) - w))) >> 8; + pst->sflags &= ~PIE_INMEASUREMENT; + } + } + } + + /* + * Start new measurment cycle when the queue has + * PIE_DQ_THRESHOLD worth of bytes. + */ + if(!(pst->sflags & PIE_INMEASUREMENT) && + q->stats.len_bytes >= PIE_DQ_THRESHOLD) { + pst->sflags |= PIE_INMEASUREMENT; + pst->measurement_start = now; + pst->dq_count = 0; + } + } + /* Optionally, use packet timestamp to estimate queue delay */ + else + pst->current_qdelay = now - pkt_ts; + + return m; +} + + + /* + * Enqueue a packet in q, subject to space and FQ-PIE queue management policy + * (whose parameters are in q->fs). + * Update stats for the queue and the scheduler. + * Return 0 on success, 1 on drop. The packet is consumed anyways. + */ +static int +pie_enqueue(struct fq_pie_flow *q, struct mbuf* m, struct fq_pie_si *si) +{ + uint64_t len; + struct pie_status *pst; + struct dn_aqm_pie_parms *pprms; + int t; + + len = m->m_pkthdr.len; + pst = &q->pst; + pprms = pst->parms; + t = ENQUE; + + /* drop/mark the packet when PIE is active and burst time elapsed */ + if (pst->sflags & PIE_ACTIVE && pst->burst_allowance == 0 + && drop_early(pst, q->stats.len_bytes) == DROP) { + /* + * if drop_prob over ECN threshold, drop the packet + * otherwise mark and enqueue it. + */ + if (pprms->flags & PIE_ECN_ENABLED && pst->drop_prob < + (pprms->max_ecnth << (PIE_PROB_BITS - PIE_FIX_POINT_BITS)) + && ecn_mark(m)) + t = ENQUE; + else + t = DROP; + } + + /* Turn PIE on when 1/3 of the queue is full */ + if (!(pst->sflags & PIE_ACTIVE) && q->stats.len_bytes >= + pst->one_third_q_size) { + fq_activate_pie(q); + } + + /* reset burst tolerance and optinally turn PIE off*/ + if (pst->drop_prob == 0 && pst->current_qdelay < (pprms->qdelay_ref >> 1) + && pst->qdelay_old < (pprms->qdelay_ref >> 1)) { + + pst->burst_allowance = pprms->max_burst; + if (pprms->flags & PIE_ON_OFF_MODE_ENABLED && q->stats.len_bytes<=0) + fq_deactivate_pie(pst); + } + + /* Use timestamp if Departure Rate Estimation mode is disabled */ + if (t != DROP && !(pprms->flags & PIE_DEPRATEEST_ENABLED)) { + /* Add TS to mbuf as a TAG */ + struct m_tag *mtag; + mtag = m_tag_locate(m, MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, NULL); + if (mtag == NULL) + mtag = m_tag_alloc(MTAG_ABI_COMPAT, DN_AQM_MTAG_TS, + sizeof(aqm_time_t), M_NOWAIT); + if (mtag == NULL) { + m_freem(m); + t = DROP; + } + *(aqm_time_t *)(mtag + 1) = AQM_UNOW; + m_tag_prepend(m, mtag); + } + + if (t != DROP) { + mq_append(&q->mq, m); + fq_update_stats(q, si, len, 0); + return 0; + } else { + fq_update_stats(q, si, len, 1); + pst->accu_prob = 0; + FREE_PKT(m); + return 1; + } + + return 0; +} + +/* Drop a packet form the head of FQ-PIE sub-queue */ +static void +pie_drop_head(struct fq_pie_flow *q, struct fq_pie_si *si) +{ + struct mbuf *m = q->mq.head; + + if (m == NULL) + return; + q->mq.head = m->m_nextpkt; + + fq_update_stats(q, si, -m->m_pkthdr.len, 1); + + if (si->main_q.ni.length == 0) /* queue is now idle */ + si->main_q.q_time = dn_cfg.curr_time; + /* reset accu_prob after packet drop */ + q->pst.accu_prob = 0; + + FREE_PKT(m); +} + +/* + * Classify a packet to queue number using Jenkins hash function. + * Return: queue number + * the input of the hash are protocol no, perturbation, src IP, dst IP, + * src port, dst port, + */ +static inline int +fq_pie_classify_flow(struct mbuf *m, uint16_t fcount, struct fq_pie_si *si) +{ + struct ip *ip; + struct tcphdr *th; + struct udphdr *uh; + uint8_t tuple[41]; + uint16_t hash=0; + +//#ifdef INET6 + struct ip6_hdr *ip6; + int isip6; + isip6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; + + if(isip6) { + ip6 = mtod(m, struct ip6_hdr *); + *((uint8_t *) &tuple[0]) = ip6->ip6_nxt; + *((uint32_t *) &tuple[1]) = si->perturbation; + memcpy(&tuple[5], ip6->ip6_src.s6_addr, 16); + memcpy(&tuple[21], ip6->ip6_dst.s6_addr, 16); + + switch (ip6->ip6_nxt) { + case IPPROTO_TCP: + th = (struct tcphdr *)(ip6 + 1); + *((uint16_t *) &tuple[37]) = th->th_dport; + *((uint16_t *) &tuple[39]) = th->th_sport; + break; + + case IPPROTO_UDP: + uh = (struct udphdr *)(ip6 + 1); + *((uint16_t *) &tuple[37]) = uh->uh_dport; + *((uint16_t *) &tuple[39]) = uh->uh_sport; + break; + default: + memset(&tuple[37], 0, 4); + } + + hash = jenkins_hash(tuple, 41, HASHINIT) % fcount; + return hash; + } +//#endif + + /* IPv4 */ + ip = mtod(m, struct ip *); + *((uint8_t *) &tuple[0]) = ip->ip_p; + *((uint32_t *) &tuple[1]) = si->perturbation; + *((uint32_t *) &tuple[5]) = ip->ip_src.s_addr; + *((uint32_t *) &tuple[9]) = ip->ip_dst.s_addr; + + switch (ip->ip_p) { + case IPPROTO_TCP: + th = (struct tcphdr *)(ip + 1); + *((uint16_t *) &tuple[13]) = th->th_dport; + *((uint16_t *) &tuple[15]) = th->th_sport; + break; + + case IPPROTO_UDP: + uh = (struct udphdr *)(ip + 1); + *((uint16_t *) &tuple[13]) = uh->uh_dport; + *((uint16_t *) &tuple[15]) = uh->uh_sport; + break; + default: + memset(&tuple[13], 0, 4); + } + hash = jenkins_hash(tuple, 17, HASHINIT) % fcount; + + return hash; +} + +/* + * Enqueue a packet into an appropriate queue according to + * FQ-CoDe; algorithm. + */ +static int +fq_pie_enqueue(struct dn_sch_inst *_si, struct dn_queue *_q, + struct mbuf *m) +{ + struct fq_pie_si *si; + struct fq_pie_schk *schk; + struct dn_sch_fq_pie_parms *param; + struct dn_queue *mainq; + int idx, drop, i, maxidx; + + mainq = (struct dn_queue *)(_si + 1); + si = (struct fq_pie_si *)_si; + schk = (struct fq_pie_schk *)(si->_si.sched+1); + param = &schk->cfg; + + /* classify a packet to queue number*/ + idx = fq_pie_classify_flow(m, param->flows_cnt, si); + + /* enqueue packet into appropriate queue using PIE AQM. + * Note: 'pie_enqueue' function returns 1 only when it unable to + * add timestamp to packet (no limit check)*/ + drop = pie_enqueue(&si->flows[idx], m, si); + + /* pie unable to timestamp a packet */ + if (drop) + return 1; + + /* If the flow (sub-queue) is not active ,then add it to tail of + * new flows list, initialize and activate it. + */ + if (!si->flows[idx].active) { + STAILQ_INSERT_TAIL(&si->newflows, &si->flows[idx], flowchain); + si->flows[idx].deficit = param->quantum; + fq_activate_pie(&si->flows[idx]); + si->flows[idx].active = 1; + } + + /* check the limit for all queues and remove a packet from the + * largest one + */ + if (mainq->ni.length > schk->cfg.limit) { + /* find first active flow */ + for (maxidx = 0; maxidx < schk->cfg.flows_cnt; maxidx++) + if (si->flows[maxidx].active) + break; + if (maxidx < schk->cfg.flows_cnt) { + /* find the largest sub- queue */ + for (i = maxidx + 1; i < schk->cfg.flows_cnt; i++) + if (si->flows[i].active && si->flows[i].stats.length > + si->flows[maxidx].stats.length) + maxidx = i; + pie_drop_head(&si->flows[maxidx], si); + drop = 1; + } + } + + return drop; +} + +/* + * Dequeue a packet from an appropriate queue according to + * FQ-CoDel algorithm. + */ +static struct mbuf * +fq_pie_dequeue(struct dn_sch_inst *_si) +{ + struct fq_pie_si *si; + struct fq_pie_schk *schk; + struct dn_sch_fq_pie_parms *param; + struct fq_pie_flow *f; + struct mbuf *mbuf; + struct fq_pie_list *fq_pie_flowlist; + + si = (struct fq_pie_si *)_si; + schk = (struct fq_pie_schk *)(si->_si.sched+1); + param = &schk->cfg; + + do { + /* select a list to start with */ + if (STAILQ_EMPTY(&si->newflows)) + fq_pie_flowlist = &si->oldflows; + else + fq_pie_flowlist = &si->newflows; + + /* Both new and old queue lists are empty, return NULL */ + if (STAILQ_EMPTY(fq_pie_flowlist)) + return NULL; + + f = STAILQ_FIRST(fq_pie_flowlist); + while (f != NULL) { + /* if there is no flow(sub-queue) deficit, increase deficit + * by quantum, move the flow to the tail of old flows list + * and try another flow. + * Otherwise, the flow will be used for dequeue. + */ + if (f->deficit < 0) { + f->deficit += param->quantum; + STAILQ_REMOVE_HEAD(fq_pie_flowlist, flowchain); + STAILQ_INSERT_TAIL(&si->oldflows, f, flowchain); + } else + break; + + f = STAILQ_FIRST(fq_pie_flowlist); + } + + /* the new flows list is empty, try old flows list */ + if (STAILQ_EMPTY(fq_pie_flowlist)) + continue; + + /* Dequeue a packet from the selected flow */ + mbuf = pie_dequeue(f, si); + + /* pie did not return a packet */ + if (!mbuf) { + /* If the selected flow belongs to new flows list, then move + * it to the tail of old flows list. Otherwise, deactivate it and + * remove it from the old list and + */ + if (fq_pie_flowlist == &si->newflows) { + STAILQ_REMOVE_HEAD(fq_pie_flowlist, flowchain); + STAILQ_INSERT_TAIL(&si->oldflows, f, flowchain); + } else { + f->active = 0; + fq_deactivate_pie(&f->pst); + STAILQ_REMOVE_HEAD(fq_pie_flowlist, flowchain); + } + /* start again */ + continue; + } + + /* we have a packet to return, + * update flow deficit and return the packet*/ + f->deficit -= mbuf->m_pkthdr.len; + return mbuf; + + } while (1); + + /* unreachable point */ + return NULL; +} + +/* + * Initialize fq_pie scheduler instance. + * also, allocate memory for flows array. + */ +static int +fq_pie_new_sched(struct dn_sch_inst *_si) +{ + struct fq_pie_si *si; + struct dn_queue *q; + struct fq_pie_schk *schk; + int i; + + si = (struct fq_pie_si *)_si; + schk = (struct fq_pie_schk *)(_si->sched+1); + + if(si->flows) { + D("si already configured!"); + return 0; + } + + /* init the main queue */ + q = &si->main_q; + set_oid(&q->ni.oid, DN_QUEUE, sizeof(*q)); + q->_si = _si; + q->fs = _si->sched->fs; + + /* allocate memory for flows array */ + si->flows = malloc(schk->cfg.flows_cnt * sizeof(struct fq_pie_flow), + M_DUMMYNET, M_NOWAIT | M_ZERO); + if (si->flows == NULL) { + D("cannot allocate memory for fq_pie configuration parameters"); + return ENOMEM ; + } + + /* init perturbation for this si */ + si->perturbation = random(); + si->nr_active_q = 0; + + /* init the old and new flows lists */ + STAILQ_INIT(&si->newflows); + STAILQ_INIT(&si->oldflows); + + /* init the flows (sub-queues) */ + for (i = 0; i < schk->cfg.flows_cnt; i++) { + si->flows[i].pst.parms = &schk->cfg.pcfg; + si->flows[i].psi = si; + pie_init(&si->flows[i]); + } + + /* init mtx lock and callout function for free memory */ + if (!fq_pie_desc.ref_count) { + mtx_init(&freemem_mtx, "mtx_pie", NULL, MTX_DEF); + } + + mtx_lock(&freemem_mtx); + fq_pie_desc.ref_count++; + mtx_unlock(&freemem_mtx); + + return 0; +} + +/* + * Free FQ-PIE flows memory callout function. + * This function is scheduled when a flow or more still active and + * the scheduer is about to be destroyed, to prevent memory leak. + */ +static void +free_flows(void *_mem) +{ + struct mem_to_free *mem = _mem; + + free(mem->mem_flows, M_DUMMYNET); + free(mem->mem_callout, M_DUMMYNET); + free(_mem, M_DUMMYNET); + + fq_pie_desc.ref_count--; + if (!fq_pie_desc.ref_count) { + mtx_unlock(&freemem_mtx); + mtx_destroy(&freemem_mtx); + } else + mtx_unlock(&freemem_mtx); + //D("mem freed ok!"); +} + +/* + * Free fq_pie scheduler instance. + */ +static int +fq_pie_free_sched(struct dn_sch_inst *_si) +{ + struct fq_pie_si *si; + struct fq_pie_schk *schk; + int i; + + si = (struct fq_pie_si *)_si; + schk = (struct fq_pie_schk *)(_si->sched+1); + + for (i = 0; i < schk->cfg.flows_cnt; i++) { + pie_cleanup(&si->flows[i]); + } + + /* if there are still some queues have a callout going to start, + * we cannot free flows memory. If we do so, a panic can happen + * as prob calculate callout function uses flows memory. + */ + if (!si->nr_active_q) { + /* free the flows array */ + free(si->flows , M_DUMMYNET); + si->flows = NULL; + mtx_lock(&freemem_mtx); + fq_pie_desc.ref_count--; + if (!fq_pie_desc.ref_count) { + mtx_unlock(&freemem_mtx); + mtx_destroy(&freemem_mtx); + } else + mtx_unlock(&freemem_mtx); + //D("ok!"); + return 0; + } else { + /* memory leak happens here. So, we register a callout function to free + * flows memory later. + */ + D("unable to stop all fq_pie sub-queues!"); + mtx_lock(&freemem_mtx); + + struct callout *mem_callout; + struct mem_to_free *mem; + + mem = malloc(sizeof(*mem), M_DUMMYNET, + M_NOWAIT | M_ZERO); + mem_callout = malloc(sizeof(*mem_callout), M_DUMMYNET, + M_NOWAIT | M_ZERO); + + callout_init_mtx(mem_callout, &freemem_mtx, + CALLOUT_RETURNUNLOCKED); + + mem->mem_flows = si->flows; + mem->mem_callout = mem_callout; + callout_reset_sbt(mem_callout, + (uint64_t)(si->flows[0].pst.parms->tupdate + 1000) * SBT_1US, + 0, free_flows, mem, 0); + + si->flows = NULL; + mtx_unlock(&freemem_mtx); + + return EBUSY; + } +} + +/* + * Configure FQ-PIE scheduler. + * the configurations for the scheduler is passed fromipfw userland. + */ +static int +fq_pie_config(struct dn_schk *_schk) +{ + struct fq_pie_schk *schk; + struct dn_extra_parms *ep; + struct dn_sch_fq_pie_parms *fqp_cfg; + + schk = (struct fq_pie_schk *)(_schk+1); + ep = (struct dn_extra_parms *) _schk->cfg; + + /* par array contains fq_pie configuration as follow + * PIE: 0- qdelay_ref,1- tupdate, 2- max_burst + * 3- max_ecnth, 4- alpha, 5- beta, 6- flags + * FQ_PIE: 7- quantum, 8- limit, 9- flows + */ + if (ep && ep->oid.len ==sizeof(*ep) && + ep->oid.subtype == DN_SCH_PARAMS) { + + fqp_cfg = &schk->cfg; + if (ep->par[0] < 0) + fqp_cfg->pcfg.qdelay_ref = fq_pie_sysctl.pcfg.qdelay_ref; + else + fqp_cfg->pcfg.qdelay_ref = ep->par[0]; + if (ep->par[1] < 0) + fqp_cfg->pcfg.tupdate = fq_pie_sysctl.pcfg.tupdate; + else + fqp_cfg->pcfg.tupdate = ep->par[1]; + if (ep->par[2] < 0) + fqp_cfg->pcfg.max_burst = fq_pie_sysctl.pcfg.max_burst; + else + fqp_cfg->pcfg.max_burst = ep->par[2]; + if (ep->par[3] < 0) + fqp_cfg->pcfg.max_ecnth = fq_pie_sysctl.pcfg.max_ecnth; + else + fqp_cfg->pcfg.max_ecnth = ep->par[3]; + if (ep->par[4] < 0) + fqp_cfg->pcfg.alpha = fq_pie_sysctl.pcfg.alpha; + else + fqp_cfg->pcfg.alpha = ep->par[4]; + if (ep->par[5] < 0) + fqp_cfg->pcfg.beta = fq_pie_sysctl.pcfg.beta; + else + fqp_cfg->pcfg.beta = ep->par[5]; + if (ep->par[6] < 0) + fqp_cfg->pcfg.flags = 0; + else + fqp_cfg->pcfg.flags = ep->par[6]; + + /* FQ configurations */ + if (ep->par[7] < 0) + fqp_cfg->quantum = fq_pie_sysctl.quantum; + else + fqp_cfg->quantum = ep->par[7]; + if (ep->par[8] < 0) + fqp_cfg->limit = fq_pie_sysctl.limit; + else + fqp_cfg->limit = ep->par[8]; + if (ep->par[9] < 0) + fqp_cfg->flows_cnt = fq_pie_sysctl.flows_cnt; + else + fqp_cfg->flows_cnt = ep->par[9]; + + /* Bound the configurations */ + fqp_cfg->pcfg.qdelay_ref = BOUND_VAR(fqp_cfg->pcfg.qdelay_ref, + 1, 5 * AQM_TIME_1S); + fqp_cfg->pcfg.tupdate = BOUND_VAR(fqp_cfg->pcfg.tupdate, + 1, 5 * AQM_TIME_1S); + fqp_cfg->pcfg.max_burst = BOUND_VAR(fqp_cfg->pcfg.max_burst, + 0, 5 * AQM_TIME_1S); + fqp_cfg->pcfg.max_ecnth = BOUND_VAR(fqp_cfg->pcfg.max_ecnth, + 0, PIE_SCALE); + fqp_cfg->pcfg.alpha = BOUND_VAR(fqp_cfg->pcfg.alpha, 0, 7 * PIE_SCALE); + fqp_cfg->pcfg.beta = BOUND_VAR(fqp_cfg->pcfg.beta, 0, 7 * PIE_SCALE); + + fqp_cfg->quantum = BOUND_VAR(fqp_cfg->quantum,1,9000); + fqp_cfg->limit= BOUND_VAR(fqp_cfg->limit,1,20480); + fqp_cfg->flows_cnt= BOUND_VAR(fqp_cfg->flows_cnt,1,65536); + } + else { + D("Wrong parameters for fq_pie scheduler"); + return 1; + } + + return 0; +} + +/* + * Return FQ-PIE scheduler configurations + * the configurations for the scheduler is passed to userland. + */ +static int +fq_pie_getconfig (struct dn_schk *_schk, struct dn_extra_parms *ep) { + + struct fq_pie_schk *schk = (struct fq_pie_schk *)(_schk+1); + struct dn_sch_fq_pie_parms *fqp_cfg; + + fqp_cfg = &schk->cfg; + + strcpy(ep->name, fq_pie_desc.name); + ep->par[0] = fqp_cfg->pcfg.qdelay_ref; + ep->par[1] = fqp_cfg->pcfg.tupdate; + ep->par[2] = fqp_cfg->pcfg.max_burst; + ep->par[3] = fqp_cfg->pcfg.max_ecnth; + ep->par[4] = fqp_cfg->pcfg.alpha; + ep->par[5] = fqp_cfg->pcfg.beta; + ep->par[6] = fqp_cfg->pcfg.flags; + + ep->par[7] = fqp_cfg->quantum; + ep->par[8] = fqp_cfg->limit; + ep->par[9] = fqp_cfg->flows_cnt; + + return 0; +} + +/* + * FQ-PIE scheduler descriptor + * contains the type of the scheduler, the name, the size of extra + * data structures, and function pointers. + */ +static struct dn_alg fq_pie_desc = { + _SI( .type = ) DN_SCHED_FQ_PIE, + _SI( .name = ) "FQ_PIE", + _SI( .flags = ) 0, + + _SI( .schk_datalen = ) sizeof(struct fq_pie_schk), + _SI( .si_datalen = ) sizeof(struct fq_pie_si) - sizeof(struct dn_sch_inst), + _SI( .q_datalen = ) 0, + + _SI( .enqueue = ) fq_pie_enqueue, + _SI( .dequeue = ) fq_pie_dequeue, + _SI( .config = ) fq_pie_config, /* new sched i.e. sched X config ...*/ + _SI( .destroy = ) NULL, /*sched x delete */ + _SI( .new_sched = ) fq_pie_new_sched, /* new schd instance */ + _SI( .free_sched = ) fq_pie_free_sched, /* delete schd instance */ + _SI( .new_fsk = ) NULL, + _SI( .free_fsk = ) NULL, + _SI( .new_queue = ) NULL, + _SI( .free_queue = ) NULL, + _SI( .getconfig = ) fq_pie_getconfig, + _SI( .ref_count = ) 0 +}; + +DECLARE_DNSCHED_MODULE(dn_fq_pie, &fq_pie_desc); Index: head/sys/netpfil/ipfw/dn_sched_prio.c =================================================================== --- head/sys/netpfil/ipfw/dn_sched_prio.c +++ head/sys/netpfil/ipfw/dn_sched_prio.c @@ -41,6 +41,9 @@ #include #include #include +#ifdef NEW_AQM +#include +#endif #include #else #include @@ -223,6 +226,9 @@ _SI( .new_queue = ) prio_new_queue, _SI( .free_queue = ) prio_free_queue, +#ifdef NEW_AQM + _SI( .getconfig = ) NULL, +#endif }; Index: head/sys/netpfil/ipfw/dn_sched_qfq.c =================================================================== --- head/sys/netpfil/ipfw/dn_sched_qfq.c +++ head/sys/netpfil/ipfw/dn_sched_qfq.c @@ -42,6 +42,9 @@ #include #include #include +#ifdef NEW_AQM +#include +#endif #include #else #include @@ -837,6 +840,9 @@ _SI( .free_fsk = ) NULL, _SI( .new_queue = ) qfq_new_queue, _SI( .free_queue = ) qfq_free_queue, +#ifdef NEW_AQM + _SI( .getconfig = ) NULL, +#endif }; DECLARE_DNSCHED_MODULE(dn_qfq, &qfq_desc); Index: head/sys/netpfil/ipfw/dn_sched_rr.c =================================================================== --- head/sys/netpfil/ipfw/dn_sched_rr.c +++ head/sys/netpfil/ipfw/dn_sched_rr.c @@ -42,6 +42,9 @@ #include #include #include +#ifdef NEW_AQM +#include +#endif #include #else #include @@ -309,6 +312,9 @@ _SI( .free_fsk = ) NULL, _SI( .new_queue = ) rr_new_queue, _SI( .free_queue = ) rr_free_queue, +#ifdef NEW_AQM + _SI( .getconfig = ) NULL, +#endif }; Index: head/sys/netpfil/ipfw/dn_sched_wf2q.c =================================================================== --- head/sys/netpfil/ipfw/dn_sched_wf2q.c +++ head/sys/netpfil/ipfw/dn_sched_wf2q.c @@ -43,6 +43,9 @@ #include #include #include +#ifdef NEW_AQM +#include +#endif #include #else #include @@ -367,6 +370,10 @@ _SI( .new_queue = ) wf2qp_new_queue, _SI( .free_queue = ) wf2qp_free_queue, +#ifdef NEW_AQM + _SI( .getconfig = ) NULL, +#endif + }; Index: head/sys/netpfil/ipfw/ip_dn_glue.c =================================================================== --- head/sys/netpfil/ipfw/ip_dn_glue.c +++ head/sys/netpfil/ipfw/ip_dn_glue.c @@ -55,6 +55,9 @@ #include #include #include +#ifdef NEW_AQM +#include +#endif #include /* FREEBSD7.2 ip_dummynet.h r191715*/ Index: head/sys/netpfil/ipfw/ip_dn_io.c =================================================================== --- head/sys/netpfil/ipfw/ip_dn_io.c +++ head/sys/netpfil/ipfw/ip_dn_io.c @@ -63,6 +63,9 @@ #include #include #include +#ifdef NEW_AQM +#include +#endif #include /* @@ -84,8 +87,12 @@ static unsigned long io_pkt; static unsigned long io_pkt_fast; -static unsigned long io_pkt_drop; +#ifdef NEW_AQM +unsigned long io_pkt_drop; +#else +static unsigned long io_pkt_drop; +#endif /* * We use a heap to store entities for which we have pending timer events. * The heap is checked at every tick and all entities with expired events @@ -148,7 +155,11 @@ SYSCTL_DECL(_net_inet); SYSCTL_DECL(_net_inet_ip); +#ifdef NEW_AQM +SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet"); +#else static SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet"); +#endif /* wrapper to pass dn_cfg fields to SYSCTL_* */ //#define DC(x) (&(VNET_NAME(_base_dn_cfg).x)) @@ -250,6 +261,14 @@ dn_tag_get(struct mbuf *m) { struct m_tag *mtag = m_tag_first(m); +#ifdef NEW_AQM + /* XXX: to skip ts m_tag. For Debugging only*/ + if (mtag != NULL && mtag->m_tag_id == DN_AQM_MTAG_TS) { + m_tag_delete(m,mtag); + mtag = m_tag_first(m); + D("skip TS tag"); + } +#endif KASSERT(mtag != NULL && mtag->m_tag_cookie == MTAG_ABI_COMPAT && mtag->m_tag_id == PACKET_TAG_DUMMYNET, @@ -257,6 +276,7 @@ return (struct dn_pkt_tag *)(mtag+1); } +#ifndef NEW_AQM static inline void mq_append(struct mq *q, struct mbuf *m) { @@ -296,6 +316,7 @@ q->tail = m; m->m_nextpkt = NULL; } +#endif /* * Dispose a list of packet. Use a functions so if we need to do @@ -420,7 +441,10 @@ /* * ECN/ECT Processing (partially adopted from altq) */ -static int +#ifndef NEW_AQM +static +#endif +int ecn_mark(struct mbuf* m) { struct ip *ip; @@ -503,6 +527,11 @@ goto drop; if (f->plr && random() < f->plr) goto drop; +#ifdef NEW_AQM + /* Call AQM enqueue function */ + if (q->fs->aqmfp) + return q->fs->aqmfp->enqueue(q ,m); +#endif if (f->flags & DN_IS_RED && red_drops(q, m->m_pkthdr.len)) { if (!(f->flags & DN_IS_ECN) || !ecn_mark(m)) goto drop; @@ -890,6 +919,10 @@ if (fs->sched->fp->enqueue(si, q, m)) { /* packet was dropped by enqueue() */ m = *m0 = NULL; + + /* dn_enqueue already increases io_pkt_drop */ + io_pkt_drop--; + goto dropit; } Index: head/sys/netpfil/ipfw/ip_dn_private.h =================================================================== --- head/sys/netpfil/ipfw/ip_dn_private.h +++ head/sys/netpfil/ipfw/ip_dn_private.h @@ -81,6 +81,10 @@ SLIST_HEAD(dn_queue_head, dn_queue); SLIST_HEAD(dn_alg_head, dn_alg); +#ifdef NEW_AQM +SLIST_HEAD(dn_aqm_head, dn_aqm); /* for new AQMs */ +#endif + struct mq { /* a basic queue of packets*/ struct mbuf *head, *tail; int count; @@ -136,6 +140,9 @@ /* list of flowsets without a scheduler -- use sch_chain */ struct dn_fsk_head fsu; /* list of unlinked flowsets */ struct dn_alg_head schedlist; /* list of algorithms */ +#ifdef NEW_AQM + struct dn_aqm_head aqmlist; /* list of AQMs */ +#endif /* Store the fs/sch to scan when draining. The value is the * bucket number of the hash table. Expire can be disabled @@ -232,6 +239,10 @@ int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */ int avg_pkt_size ; /* medium packet size */ int max_pkt_size ; /* max packet size */ +#ifdef NEW_AQM + struct dn_aqm *aqmfp; /* Pointer to AQM functions */ + void *aqmcfg; /* configuration parameters for AQM */ +#endif }; /* @@ -254,6 +265,9 @@ int count; /* arrivals since last RED drop */ int random; /* random value (scaled) */ uint64_t q_time; /* start of queue idle time */ +#ifdef NEW_AQM + void *aqm_status; /* per-queue status variables*/ +#endif }; @@ -401,4 +415,49 @@ void dn_drain_scheduler(void); void dn_drain_queue(void); +#ifdef NEW_AQM +int ecn_mark(struct mbuf* m); + +/* moved from ip_dn_io.c to here to be available for AQMs modules*/ +static inline void +mq_append(struct mq *q, struct mbuf *m) +{ +#ifdef USERSPACE + // buffers from netmap need to be copied + // XXX note that the routine is not expected to fail + ND("append %p to %p", m, q); + if (m->m_flags & M_STACK) { + struct mbuf *m_new; + void *p; + int l, ofs; + + ofs = m->m_data - m->__m_extbuf; + // XXX allocate + MGETHDR(m_new, M_NOWAIT, MT_DATA); + ND("*** WARNING, volatile buf %p ext %p %d dofs %d m_new %p", + m, m->__m_extbuf, m->__m_extlen, ofs, m_new); + p = m_new->__m_extbuf; /* new pointer */ + l = m_new->__m_extlen; /* new len */ + if (l <= m->__m_extlen) { + panic("extlen too large"); + } + + *m_new = *m; // copy + m_new->m_flags &= ~M_STACK; + m_new->__m_extbuf = p; // point to new buffer + _pkt_copy(m->__m_extbuf, p, m->__m_extlen); + m_new->m_data = p + ofs; + m = m_new; + } +#endif /* USERSPACE */ + if (q->head == NULL) + q->head = m; + else + q->tail->m_nextpkt = m; + q->count++; + q->tail = m; + m->m_nextpkt = NULL; +} +#endif /* NEW_AQM */ + #endif /* _IP_DN_PRIVATE_H */ Index: head/sys/netpfil/ipfw/ip_dummynet.c =================================================================== --- head/sys/netpfil/ipfw/ip_dummynet.c +++ head/sys/netpfil/ipfw/ip_dummynet.c @@ -1,4 +1,11 @@ /*- + * Codel/FQ_Codel and PIE/FQ-PIE Code: + * Copyright (C) 2016 Centre for Advanced Internet Architectures, + * Swinburne University of Technology, Melbourne, Australia. + * Portions of this code were made possible in part by a gift from + * The Comcast Innovation Fund. + * Implemented by Rasool Al-Saadi + * * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa * Portions Copyright (c) 2000 Akamba Corp. * All rights reserved @@ -58,6 +65,9 @@ #include #include #include +#ifdef NEW_AQM +#include +#endif #include /* which objects to copy */ @@ -98,6 +108,21 @@ } /*----- end of callout hooks -----*/ +#ifdef NEW_AQM +/* Return AQM descriptor for given type or name. */ +static struct dn_aqm * +find_aqm_type(int type, char *name) +{ + struct dn_aqm *d; + + SLIST_FOREACH(d, &dn_cfg.aqmlist, next) { + if (d->type == type || (name && !strcasecmp(d->name, name))) + return d; + } + return NULL; /* not found */ +} +#endif + /* Return a scheduler descriptor given the type or name. */ static struct dn_alg * find_sched_type(int type, char *name) @@ -320,7 +345,15 @@ if (fs->sched->fp->new_queue) fs->sched->fp->new_queue(q); + +#ifdef NEW_AQM + /* call AQM init function after creating a queue*/ + if (fs->aqmfp && fs->aqmfp->init) + if(fs->aqmfp->init(q)) + D("unable to init AQM for fs %d", fs->fs.fs_nr); +#endif dn_cfg.queue_count++; + return q; } @@ -334,6 +367,13 @@ { struct dn_fsk *fs = q->fs; +#ifdef NEW_AQM + /* clean up AQM status for queue 'q' + * cleanup here is called just with MULTIQUEUE + */ + if (fs && fs->aqmfp && fs->aqmfp->cleanup) + fs->aqmfp->cleanup(q); +#endif // D("fs %p si %p\n", fs, q->_si); /* notify the parent scheduler that the queue is going away */ if (fs && fs->sched->fp->free_queue) @@ -475,6 +515,16 @@ if (s->sch.flags & DN_HAVE_MASK) si->ni.fid = *(struct ipfw_flow_id *)key; +#ifdef NEW_AQM + /* init AQM status for !DN_MULTIQUEUE sched*/ + if (!(s->fp->flags & DN_MULTIQUEUE)) + if (s->fs->aqmfp && s->fs->aqmfp->init) + if(s->fs->aqmfp->init((struct dn_queue *)(si + 1))) { + D("unable to init AQM for fs %d", s->fs->fs.fs_nr); + goto error; + } +#endif + dn_cfg.si_count++; return si; @@ -504,6 +554,20 @@ dn_free_pkts(dl->mq.head); /* drain delay line */ if (si->kflags & DN_ACTIVE) /* remove si from event heap */ heap_extract(&dn_cfg.evheap, si); + +#ifdef NEW_AQM + /* clean up AQM status for !DN_MULTIQUEUE sched + * Note that all queues belong to fs were cleaned up in fsk_detach. + * When drain_scheduler is called s->fs and q->fs are pointing + * to a correct fs, so we can use fs in this case. + */ + if (!(s->fp->flags & DN_MULTIQUEUE)) { + struct dn_queue *q = (struct dn_queue *)(si + 1); + if (q->aqm_status && q->fs->aqmfp) + if (q->fs->aqmfp->cleanup) + q->fs->aqmfp->cleanup(q); + } +#endif if (s->fp->free_sched) s->fp->free_sched(si); bzero(si, sizeof(*si)); /* safety */ @@ -592,6 +656,67 @@ return fs; } +#ifdef NEW_AQM +/* callback function for cleaning up AQM queue status belongs to a flowset + * connected to scheduler instance '_si' (for !DN_MULTIQUEUE only). + */ +static int +si_cleanup_q(void *_si, void *arg) +{ + struct dn_sch_inst *si = _si; + + if (!(si->sched->fp->flags & DN_MULTIQUEUE)) { + if (si->sched->fs->aqmfp && si->sched->fs->aqmfp->cleanup) + si->sched->fs->aqmfp->cleanup((struct dn_queue *) (si+1)); + } + return 0; +} + +/* callback to clean up queue AQM status.*/ +static int +q_cleanup_q(void *_q, void *arg) +{ + struct dn_queue *q = _q; + q->fs->aqmfp->cleanup(q); + return 0; +} + +/* Clean up all AQM queues status belongs to flowset 'fs' and then + * deconfig AQM for flowset 'fs' + */ +static void +aqm_cleanup_deconfig_fs(struct dn_fsk *fs) +{ + struct dn_sch_inst *si; + + /* clean up AQM status for all queues for !DN_MULTIQUEUE sched*/ + if (fs->fs.fs_nr > DN_MAX_ID) { + if (fs->sched && !(fs->sched->fp->flags & DN_MULTIQUEUE)) { + if (fs->sched->sch.flags & DN_HAVE_MASK) + dn_ht_scan(fs->sched->siht, si_cleanup_q, NULL); + else { + /* single si i.e. no sched mask */ + si = (struct dn_sch_inst *) fs->sched->siht; + if (si && fs->aqmfp && fs->aqmfp->cleanup) + fs->aqmfp->cleanup((struct dn_queue *) (si+1)); + } + } + } + + /* clean up AQM status for all queues for DN_MULTIQUEUE sched*/ + if (fs->sched && fs->sched->fp->flags & DN_MULTIQUEUE && fs->qht) { + if (fs->fs.flags & DN_QHT_HASH) + dn_ht_scan(fs->qht, q_cleanup_q, NULL); + else + fs->aqmfp->cleanup((struct dn_queue *)(fs->qht)); + } + + /* deconfig AQM */ + if(fs->aqmcfg && fs->aqmfp && fs->aqmfp->deconfig) + fs->aqmfp->deconfig(fs); +} +#endif + /* * detach flowset from its current scheduler. Flags as follows: * DN_DETACH removes from the fsk_list @@ -620,6 +745,10 @@ free(fs->w_q_lookup, M_DUMMYNET); fs->w_q_lookup = NULL; qht_delete(fs, flags); +#ifdef NEW_AQM + aqm_cleanup_deconfig_fs(fs); +#endif + if (fs->sched && fs->sched->fp->free_fsk) fs->sched->fp->free_fsk(fs); fs->sched = NULL; @@ -1191,6 +1320,183 @@ } } +#ifdef NEW_AQM +/* Retrieve AQM configurations to ipfw userland + */ +static int +get_aqm_parms(struct sockopt *sopt) +{ + struct dn_extra_parms *ep; + struct dn_fsk *fs; + size_t sopt_valsize; + int l, err = 0; + + sopt_valsize = sopt->sopt_valsize; + l = sizeof(*ep); + if (sopt->sopt_valsize < l) { + D("bad len sopt->sopt_valsize %d len %d", + (int) sopt->sopt_valsize , l); + err = EINVAL; + return err; + } + ep = malloc(l, M_DUMMYNET, M_WAITOK); + if(!ep) { + err = ENOMEM ; + return err; + } + do { + err = sooptcopyin(sopt, ep, l, l); + if(err) + break; + sopt->sopt_valsize = sopt_valsize; + if (ep->oid.len < l) { + err = EINVAL; + break; + } + + fs = dn_ht_find(dn_cfg.fshash, ep->nr, 0, NULL); + if (!fs) { + D("fs %d not found", ep->nr); + err = EINVAL; + break; + } + + if (fs->aqmfp && fs->aqmfp->getconfig) { + if(fs->aqmfp->getconfig(fs, ep)) { + D("Error while trying to get AQM params"); + err = EINVAL; + break; + } + ep->oid.len = l; + err = sooptcopyout(sopt, ep, l); + } + }while(0); + + free(ep, M_DUMMYNET); + return err; +} + +/* Retrieve AQM configurations to ipfw userland + */ +static int +get_sched_parms(struct sockopt *sopt) +{ + struct dn_extra_parms *ep; + struct dn_schk *schk; + size_t sopt_valsize; + int l, err = 0; + + sopt_valsize = sopt->sopt_valsize; + l = sizeof(*ep); + if (sopt->sopt_valsize < l) { + D("bad len sopt->sopt_valsize %d len %d", + (int) sopt->sopt_valsize , l); + err = EINVAL; + return err; + } + ep = malloc(l, M_DUMMYNET, M_WAITOK); + if(!ep) { + err = ENOMEM ; + return err; + } + do { + err = sooptcopyin(sopt, ep, l, l); + if(err) + break; + sopt->sopt_valsize = sopt_valsize; + if (ep->oid.len < l) { + err = EINVAL; + break; + } + + schk = locate_scheduler(ep->nr); + if (!schk) { + D("sched %d not found", ep->nr); + err = EINVAL; + break; + } + + if (schk->fp && schk->fp->getconfig) { + if(schk->fp->getconfig(schk, ep)) { + D("Error while trying to get sched params"); + err = EINVAL; + break; + } + ep->oid.len = l; + err = sooptcopyout(sopt, ep, l); + } + }while(0); + free(ep, M_DUMMYNET); + + return err; +} + +/* Configure AQM for flowset 'fs'. + * extra parameters are passed from userland. + */ +static int +config_aqm(struct dn_fsk *fs, struct dn_extra_parms *ep, int busy) +{ + int err = 0; + + do { + /* no configurations */ + if (!ep) { + err = 0; + break; + } + + /* no AQM for this flowset*/ + if (!strcmp(ep->name,"")) { + err = 0; + break; + } + if (ep->oid.len < sizeof(*ep)) { + D("short aqm len %d", ep->oid.len); + err = EINVAL; + break; + } + + if (busy) { + D("Unable to configure flowset, flowset busy!"); + err = EINVAL; + break; + } + + /* deconfigure old aqm if exist */ + if (fs->aqmcfg && fs->aqmfp && fs->aqmfp->deconfig) { + aqm_cleanup_deconfig_fs(fs); + } + + if (!(fs->aqmfp = find_aqm_type(0, ep->name))) { + D("AQM functions not found for type %s!", ep->name); + fs->fs.flags &= ~DN_IS_AQM; + err = EINVAL; + break; + } else + fs->fs.flags |= DN_IS_AQM; + + if (ep->oid.subtype != DN_AQM_PARAMS) { + D("Wrong subtype"); + err = EINVAL; + break; + } + + if (fs->aqmfp->config) { + err = fs->aqmfp->config(fs, ep, ep->oid.len); + if (err) { + D("Unable to configure AQM for FS %d", fs->fs.fs_nr ); + fs->fs.flags &= ~DN_IS_AQM; + fs->aqmfp = NULL; + break; + } + } + } while(0); + + return err; +} +#endif + /* * Configuration -- to preserve backward compatibility we use * the following scheme (N is 65536) @@ -1323,6 +1629,14 @@ } if (bcmp(&fs->fs, nfs, sizeof(*nfs)) == 0) { ND("flowset %d unchanged", i); +#ifdef NEW_AQM + /* reconfigure AQM as the parameters can be changed. + * we consider the flowsetis busy if it has scheduler instance(s) + */ + s = locate_scheduler(nfs->sched_nr); + config_aqm(fs, (struct dn_extra_parms *) arg, + s != NULL && s->siht != NULL); +#endif break; /* no change, nothing to do */ } if (oldc != dn_cfg.fsk_count) /* new item */ @@ -1341,6 +1655,10 @@ fsk_detach(fs, flags); } fs->fs = *nfs; /* copy configuration */ +#ifdef NEW_AQM + fs->aqmfp = NULL; + config_aqm(fs, (struct dn_extra_parms *) arg, s != NULL && s->siht != NULL); +#endif if (s != NULL) fsk_attach(fs, s); } while (0); @@ -1866,6 +2184,19 @@ // cmd->id = sopt_valsize; D("compatibility mode"); } + +#ifdef NEW_AQM + /* get AQM params */ + if(cmd->subtype == DN_AQM_PARAMS) { + error = get_aqm_parms(sopt); + goto done; + /* get Scheduler params */ + } else if (cmd->subtype == DN_SCH_PARAMS) { + error = get_sched_parms(sopt); + goto done; + } +#endif + a.extra = (struct copy_range *)cmd; if (cmd->len == sizeof(*cmd)) { /* no range, create a default */ uint32_t *rp = (uint32_t *)(cmd + 1); @@ -2318,4 +2649,98 @@ */ //VNET_SYSUNINIT(vnet_dn_uninit, DN_SI_SUB, DN_MODEV_ORD+2, ip_dn_destroy, NULL); +#ifdef NEW_AQM + +/* modevent helpers for the AQM modules */ +static int +load_dn_aqm(struct dn_aqm *d) +{ + struct dn_aqm *aqm=NULL; + + if (d == NULL) + return 1; /* error */ + ip_dn_init(); /* just in case, we need the lock */ + + /* Check that mandatory funcs exists */ + if (d->enqueue == NULL || d->dequeue == NULL) { + D("missing enqueue or dequeue for %s", d->name); + return 1; + } + + /* Search if AQM already exists */ + DN_BH_WLOCK(); + SLIST_FOREACH(aqm, &dn_cfg.aqmlist, next) { + if (strcmp(aqm->name, d->name) == 0) { + D("%s already loaded", d->name); + break; /* AQM already exists */ + } + } + if (aqm == NULL) + SLIST_INSERT_HEAD(&dn_cfg.aqmlist, d, next); + DN_BH_WUNLOCK(); + D("dn_aqm %s %sloaded", d->name, aqm ? "not ":""); + return aqm ? 1 : 0; +} + + +/* Callback to clean up AQM status for queues connected to a flowset + * and then deconfigure the flowset. + * This function is called before an AQM module is unloaded + */ +static int +fs_cleanup(void *_fs, void *arg) +{ + struct dn_fsk *fs = _fs; + uint32_t type = *(uint32_t *)arg; + + if (fs->aqmfp && fs->aqmfp->type == type) + aqm_cleanup_deconfig_fs(fs); + + return 0; +} + +static int +unload_dn_aqm(struct dn_aqm *aqm) +{ + struct dn_aqm *tmp, *r; + int err = EINVAL; + err = 0; + ND("called for %s", aqm->name); + + DN_BH_WLOCK(); + + /* clean up AQM status and deconfig flowset */ + dn_ht_scan(dn_cfg.fshash, fs_cleanup, &aqm->type); + + SLIST_FOREACH_SAFE(r, &dn_cfg.aqmlist, next, tmp) { + if (strcmp(aqm->name, r->name) != 0) + continue; + ND("ref_count = %d", r->ref_count); + err = (r->ref_count != 0 || r->cfg_ref_count != 0) ? EBUSY : 0; + if (err == 0) + SLIST_REMOVE(&dn_cfg.aqmlist, r, dn_aqm, next); + break; + } + DN_BH_WUNLOCK(); + D("%s %sunloaded", aqm->name, err ? "not ":""); + if (err) + D("ref_count=%d, cfg_ref_count=%d", r->ref_count, r->cfg_ref_count); + return err; +} + +int +dn_aqm_modevent(module_t mod, int cmd, void *arg) +{ + struct dn_aqm *aqm = arg; + + if (cmd == MOD_LOAD) + return load_dn_aqm(aqm); + else if (cmd == MOD_UNLOAD) + return unload_dn_aqm(aqm); + else + return EINVAL; +} +#endif + /* end of file */ +