Index: stable/10/contrib/libpcap/grammar.y =================================================================== --- stable/10/contrib/libpcap/grammar.y (revision 263085) +++ stable/10/contrib/libpcap/grammar.y (revision 263086) @@ -1,698 +1,698 @@ %{ /* * Copyright (c) 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that: (1) source code distributions * retain the above copyright notice and this paragraph in its entirety, (2) * distributions including binary code include the above copyright notice and * this paragraph in its entirety in the documentation or other materials * provided with the distribution, and (3) all advertising materials mentioning * features or use of this software display the following acknowledgement: * ``This product includes software developed by the University of California, * Lawrence Berkeley Laboratory and its contributors.'' Neither the name of * the University nor the names of its contributors may be used to endorse * or promote products derived from this software without specific prior * written permission. * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. * * $FreeBSD$ */ #ifndef lint static const char rcsid[] _U_ = "@(#) $Header: /tcpdump/master/libpcap/grammar.y,v 1.101 2007-11-18 02:03:52 guy Exp $ (LBL)"; #endif #ifdef HAVE_CONFIG_H #include "config.h" #endif #ifdef WIN32 #include #else /* WIN32 */ #include #include #endif /* WIN32 */ #include #ifndef WIN32 #if __STDC__ struct mbuf; struct rtentry; #endif #include #include #endif /* WIN32 */ #include #include "pcap-int.h" #include "gencode.h" #ifdef HAVE_NET_PFVAR_H #include -#include +#include #include #endif #include "ieee80211.h" #include #ifdef HAVE_OS_PROTO_H #include "os-proto.h" #endif #define QSET(q, p, d, a) (q).proto = (p),\ (q).dir = (d),\ (q).addr = (a) struct tok { int v; /* value */ const char *s; /* string */ }; static const struct tok ieee80211_types[] = { { IEEE80211_FC0_TYPE_DATA, "data" }, { IEEE80211_FC0_TYPE_MGT, "mgt" }, { IEEE80211_FC0_TYPE_MGT, "management" }, { IEEE80211_FC0_TYPE_CTL, "ctl" }, { IEEE80211_FC0_TYPE_CTL, "control" }, { 0, NULL } }; static const struct tok ieee80211_mgt_subtypes[] = { { IEEE80211_FC0_SUBTYPE_ASSOC_REQ, "assocreq" }, { IEEE80211_FC0_SUBTYPE_ASSOC_REQ, "assoc-req" }, { IEEE80211_FC0_SUBTYPE_ASSOC_RESP, "assocresp" }, { IEEE80211_FC0_SUBTYPE_ASSOC_RESP, "assoc-resp" }, { IEEE80211_FC0_SUBTYPE_REASSOC_REQ, "reassocreq" }, { IEEE80211_FC0_SUBTYPE_REASSOC_REQ, "reassoc-req" }, { IEEE80211_FC0_SUBTYPE_REASSOC_RESP, "reassocresp" }, { IEEE80211_FC0_SUBTYPE_REASSOC_RESP, "reassoc-resp" }, { IEEE80211_FC0_SUBTYPE_PROBE_REQ, "probereq" }, { IEEE80211_FC0_SUBTYPE_PROBE_REQ, "probe-req" }, { IEEE80211_FC0_SUBTYPE_PROBE_RESP, "proberesp" }, { IEEE80211_FC0_SUBTYPE_PROBE_RESP, "probe-resp" }, { IEEE80211_FC0_SUBTYPE_BEACON, "beacon" }, { IEEE80211_FC0_SUBTYPE_ATIM, "atim" }, { IEEE80211_FC0_SUBTYPE_DISASSOC, "disassoc" }, { IEEE80211_FC0_SUBTYPE_DISASSOC, "disassociation" }, { IEEE80211_FC0_SUBTYPE_AUTH, "auth" }, { IEEE80211_FC0_SUBTYPE_AUTH, "authentication" }, { IEEE80211_FC0_SUBTYPE_DEAUTH, "deauth" }, { IEEE80211_FC0_SUBTYPE_DEAUTH, "deauthentication" }, { 0, NULL } }; static const struct tok ieee80211_ctl_subtypes[] = { { IEEE80211_FC0_SUBTYPE_PS_POLL, "ps-poll" }, { IEEE80211_FC0_SUBTYPE_RTS, "rts" }, { IEEE80211_FC0_SUBTYPE_CTS, "cts" }, { IEEE80211_FC0_SUBTYPE_ACK, "ack" }, { IEEE80211_FC0_SUBTYPE_CF_END, "cf-end" }, { IEEE80211_FC0_SUBTYPE_CF_END_ACK, "cf-end-ack" }, { 0, NULL } }; static const struct tok ieee80211_data_subtypes[] = { { IEEE80211_FC0_SUBTYPE_DATA, "data" }, { IEEE80211_FC0_SUBTYPE_CF_ACK, "data-cf-ack" }, { IEEE80211_FC0_SUBTYPE_CF_POLL, "data-cf-poll" }, { IEEE80211_FC0_SUBTYPE_CF_ACPL, "data-cf-ack-poll" }, { IEEE80211_FC0_SUBTYPE_NODATA, "null" }, { IEEE80211_FC0_SUBTYPE_NODATA_CF_ACK, "cf-ack" }, { IEEE80211_FC0_SUBTYPE_NODATA_CF_POLL, "cf-poll" }, { IEEE80211_FC0_SUBTYPE_NODATA_CF_ACPL, "cf-ack-poll" }, { IEEE80211_FC0_SUBTYPE_QOS|IEEE80211_FC0_SUBTYPE_DATA, "qos-data" }, { IEEE80211_FC0_SUBTYPE_QOS|IEEE80211_FC0_SUBTYPE_CF_ACK, "qos-data-cf-ack" }, { IEEE80211_FC0_SUBTYPE_QOS|IEEE80211_FC0_SUBTYPE_CF_POLL, "qos-data-cf-poll" }, { IEEE80211_FC0_SUBTYPE_QOS|IEEE80211_FC0_SUBTYPE_CF_ACPL, "qos-data-cf-ack-poll" }, { IEEE80211_FC0_SUBTYPE_QOS|IEEE80211_FC0_SUBTYPE_NODATA, "qos" }, { IEEE80211_FC0_SUBTYPE_QOS|IEEE80211_FC0_SUBTYPE_NODATA_CF_POLL, "qos-cf-poll" }, { IEEE80211_FC0_SUBTYPE_QOS|IEEE80211_FC0_SUBTYPE_NODATA_CF_ACPL, "qos-cf-ack-poll" }, { 0, NULL } }; struct type2tok { int type; const struct tok *tok; }; static const struct type2tok ieee80211_type_subtypes[] = { { IEEE80211_FC0_TYPE_MGT, ieee80211_mgt_subtypes }, { IEEE80211_FC0_TYPE_CTL, ieee80211_ctl_subtypes }, { IEEE80211_FC0_TYPE_DATA, ieee80211_data_subtypes }, { 0, NULL } }; static int str2tok(const char *str, const struct tok *toks) { int i; for (i = 0; toks[i].s != NULL; i++) { if (pcap_strcasecmp(toks[i].s, str) == 0) return (toks[i].v); } return (-1); } int n_errors = 0; static struct qual qerr = { Q_UNDEF, Q_UNDEF, Q_UNDEF, Q_UNDEF }; static void yyerror(const char *msg) { ++n_errors; bpf_error("%s", msg); /* NOTREACHED */ } #ifdef NEED_YYPARSE_WRAPPER int yyparse(void); int pcap_parse() { return (yyparse()); } #endif #ifdef HAVE_NET_PFVAR_H static int pfreason_to_num(const char *reason) { const char *reasons[] = PFRES_NAMES; int i; for (i = 0; reasons[i]; i++) { if (pcap_strcasecmp(reason, reasons[i]) == 0) return (i); } bpf_error("unknown PF reason"); /*NOTREACHED*/ } static int pfaction_to_num(const char *action) { if (pcap_strcasecmp(action, "pass") == 0 || pcap_strcasecmp(action, "accept") == 0) return (PF_PASS); else if (pcap_strcasecmp(action, "drop") == 0 || pcap_strcasecmp(action, "block") == 0) return (PF_DROP); #if HAVE_PF_NAT_THROUGH_PF_NORDR else if (pcap_strcasecmp(action, "rdr") == 0) return (PF_RDR); else if (pcap_strcasecmp(action, "nat") == 0) return (PF_NAT); else if (pcap_strcasecmp(action, "binat") == 0) return (PF_BINAT); else if (pcap_strcasecmp(action, "nordr") == 0) return (PF_NORDR); #endif else { bpf_error("unknown PF action"); /*NOTREACHED*/ } } #else /* !HAVE_NET_PFVAR_H */ static int pfreason_to_num(const char *reason) { bpf_error("libpcap was compiled on a machine without pf support"); /*NOTREACHED*/ /* this is to make the VC compiler happy */ return -1; } static int pfaction_to_num(const char *action) { bpf_error("libpcap was compiled on a machine without pf support"); /*NOTREACHED*/ /* this is to make the VC compiler happy */ return -1; } #endif /* HAVE_NET_PFVAR_H */ %} %union { int i; bpf_u_int32 h; u_char *e; char *s; struct stmt *stmt; struct arth *a; struct { struct qual q; int atmfieldtype; int mtp3fieldtype; struct block *b; } blk; struct block *rblk; } %type expr id nid pid term rterm qid %type head %type pqual dqual aqual ndaqual %type arth narth %type byteop pname pnum relop irelop %type and or paren not null prog %type other pfvar p80211 %type atmtype atmmultitype %type atmfield %type atmfieldvalue atmvalue atmlistvalue %type mtp2type %type mtp3field %type mtp3fieldvalue mtp3value mtp3listvalue %token DST SRC HOST GATEWAY %token NET NETMASK PORT PORTRANGE LESS GREATER PROTO PROTOCHAIN CBYTE %token ARP RARP IP SCTP TCP UDP ICMP IGMP IGRP PIM VRRP CARP %token ATALK AARP DECNET LAT SCA MOPRC MOPDL %token TK_BROADCAST TK_MULTICAST %token NUM INBOUND OUTBOUND %token PF_IFNAME PF_RSET PF_RNR PF_SRNR PF_REASON PF_ACTION %token TYPE SUBTYPE DIR ADDR1 ADDR2 ADDR3 ADDR4 RA TA %token LINK %token GEQ LEQ NEQ %token ID EID HID HID6 AID %token LSH RSH %token LEN %token IPV6 ICMPV6 AH ESP %token VLAN MPLS %token PPPOED PPPOES %token ISO ESIS CLNP ISIS L1 L2 IIH LSP SNP CSNP PSNP %token STP %token IPX %token NETBEUI %token LANE LLC METAC BCC SC ILMIC OAMF4EC OAMF4SC %token OAM OAMF4 CONNECTMSG METACONNECT %token VPI VCI %token RADIO %token FISU LSSU MSU %token SIO OPC DPC SLS %type ID %type EID %type AID %type HID HID6 %type NUM action reason type subtype type_subtype dir %left OR AND %nonassoc '!' %left '|' %left '&' %left LSH RSH %left '+' '-' %left '*' '/' %nonassoc UMINUS %% prog: null expr { finish_parse($2.b); } | null ; null: /* null */ { $$.q = qerr; } ; expr: term | expr and term { gen_and($1.b, $3.b); $$ = $3; } | expr and id { gen_and($1.b, $3.b); $$ = $3; } | expr or term { gen_or($1.b, $3.b); $$ = $3; } | expr or id { gen_or($1.b, $3.b); $$ = $3; } ; and: AND { $$ = $0; } ; or: OR { $$ = $0; } ; id: nid | pnum { $$.b = gen_ncode(NULL, (bpf_u_int32)$1, $$.q = $0.q); } | paren pid ')' { $$ = $2; } ; nid: ID { $$.b = gen_scode($1, $$.q = $0.q); } | HID '/' NUM { $$.b = gen_mcode($1, NULL, $3, $$.q = $0.q); } | HID NETMASK HID { $$.b = gen_mcode($1, $3, 0, $$.q = $0.q); } | HID { /* Decide how to parse HID based on proto */ $$.q = $0.q; if ($$.q.addr == Q_PORT) bpf_error("'port' modifier applied to ip host"); else if ($$.q.addr == Q_PORTRANGE) bpf_error("'portrange' modifier applied to ip host"); else if ($$.q.addr == Q_PROTO) bpf_error("'proto' modifier applied to ip host"); else if ($$.q.addr == Q_PROTOCHAIN) bpf_error("'protochain' modifier applied to ip host"); $$.b = gen_ncode($1, 0, $$.q); } | HID6 '/' NUM { #ifdef INET6 $$.b = gen_mcode6($1, NULL, $3, $$.q = $0.q); #else bpf_error("'ip6addr/prefixlen' not supported " "in this configuration"); #endif /*INET6*/ } | HID6 { #ifdef INET6 $$.b = gen_mcode6($1, 0, 128, $$.q = $0.q); #else bpf_error("'ip6addr' not supported " "in this configuration"); #endif /*INET6*/ } | EID { $$.b = gen_ecode($1, $$.q = $0.q); /* * $1 was allocated by "pcap_ether_aton()", * so we must free it now that we're done * with it. */ free($1); } | AID { $$.b = gen_acode($1, $$.q = $0.q); /* * $1 was allocated by "pcap_ether_aton()", * so we must free it now that we're done * with it. */ free($1); } | not id { gen_not($2.b); $$ = $2; } ; not: '!' { $$ = $0; } ; paren: '(' { $$ = $0; } ; pid: nid | qid and id { gen_and($1.b, $3.b); $$ = $3; } | qid or id { gen_or($1.b, $3.b); $$ = $3; } ; qid: pnum { $$.b = gen_ncode(NULL, (bpf_u_int32)$1, $$.q = $0.q); } | pid ; term: rterm | not term { gen_not($2.b); $$ = $2; } ; head: pqual dqual aqual { QSET($$.q, $1, $2, $3); } | pqual dqual { QSET($$.q, $1, $2, Q_DEFAULT); } | pqual aqual { QSET($$.q, $1, Q_DEFAULT, $2); } | pqual PROTO { QSET($$.q, $1, Q_DEFAULT, Q_PROTO); } | pqual PROTOCHAIN { QSET($$.q, $1, Q_DEFAULT, Q_PROTOCHAIN); } | pqual ndaqual { QSET($$.q, $1, Q_DEFAULT, $2); } ; rterm: head id { $$ = $2; } | paren expr ')' { $$.b = $2.b; $$.q = $1.q; } | pname { $$.b = gen_proto_abbrev($1); $$.q = qerr; } | arth relop arth { $$.b = gen_relation($2, $1, $3, 0); $$.q = qerr; } | arth irelop arth { $$.b = gen_relation($2, $1, $3, 1); $$.q = qerr; } | other { $$.b = $1; $$.q = qerr; } | atmtype { $$.b = gen_atmtype_abbrev($1); $$.q = qerr; } | atmmultitype { $$.b = gen_atmmulti_abbrev($1); $$.q = qerr; } | atmfield atmvalue { $$.b = $2.b; $$.q = qerr; } | mtp2type { $$.b = gen_mtp2type_abbrev($1); $$.q = qerr; } | mtp3field mtp3value { $$.b = $2.b; $$.q = qerr; } ; /* protocol level qualifiers */ pqual: pname | { $$ = Q_DEFAULT; } ; /* 'direction' qualifiers */ dqual: SRC { $$ = Q_SRC; } | DST { $$ = Q_DST; } | SRC OR DST { $$ = Q_OR; } | DST OR SRC { $$ = Q_OR; } | SRC AND DST { $$ = Q_AND; } | DST AND SRC { $$ = Q_AND; } | ADDR1 { $$ = Q_ADDR1; } | ADDR2 { $$ = Q_ADDR2; } | ADDR3 { $$ = Q_ADDR3; } | ADDR4 { $$ = Q_ADDR4; } | RA { $$ = Q_RA; } | TA { $$ = Q_TA; } ; /* address type qualifiers */ aqual: HOST { $$ = Q_HOST; } | NET { $$ = Q_NET; } | PORT { $$ = Q_PORT; } | PORTRANGE { $$ = Q_PORTRANGE; } ; /* non-directional address type qualifiers */ ndaqual: GATEWAY { $$ = Q_GATEWAY; } ; pname: LINK { $$ = Q_LINK; } | IP { $$ = Q_IP; } | ARP { $$ = Q_ARP; } | RARP { $$ = Q_RARP; } | SCTP { $$ = Q_SCTP; } | TCP { $$ = Q_TCP; } | UDP { $$ = Q_UDP; } | ICMP { $$ = Q_ICMP; } | IGMP { $$ = Q_IGMP; } | IGRP { $$ = Q_IGRP; } | PIM { $$ = Q_PIM; } | VRRP { $$ = Q_VRRP; } | CARP { $$ = Q_CARP; } | ATALK { $$ = Q_ATALK; } | AARP { $$ = Q_AARP; } | DECNET { $$ = Q_DECNET; } | LAT { $$ = Q_LAT; } | SCA { $$ = Q_SCA; } | MOPDL { $$ = Q_MOPDL; } | MOPRC { $$ = Q_MOPRC; } | IPV6 { $$ = Q_IPV6; } | ICMPV6 { $$ = Q_ICMPV6; } | AH { $$ = Q_AH; } | ESP { $$ = Q_ESP; } | ISO { $$ = Q_ISO; } | ESIS { $$ = Q_ESIS; } | ISIS { $$ = Q_ISIS; } | L1 { $$ = Q_ISIS_L1; } | L2 { $$ = Q_ISIS_L2; } | IIH { $$ = Q_ISIS_IIH; } | LSP { $$ = Q_ISIS_LSP; } | SNP { $$ = Q_ISIS_SNP; } | PSNP { $$ = Q_ISIS_PSNP; } | CSNP { $$ = Q_ISIS_CSNP; } | CLNP { $$ = Q_CLNP; } | STP { $$ = Q_STP; } | IPX { $$ = Q_IPX; } | NETBEUI { $$ = Q_NETBEUI; } | RADIO { $$ = Q_RADIO; } ; other: pqual TK_BROADCAST { $$ = gen_broadcast($1); } | pqual TK_MULTICAST { $$ = gen_multicast($1); } | LESS NUM { $$ = gen_less($2); } | GREATER NUM { $$ = gen_greater($2); } | CBYTE NUM byteop NUM { $$ = gen_byteop($3, $2, $4); } | INBOUND { $$ = gen_inbound(0); } | OUTBOUND { $$ = gen_inbound(1); } | VLAN pnum { $$ = gen_vlan($2); } | VLAN { $$ = gen_vlan(-1); } | MPLS pnum { $$ = gen_mpls($2); } | MPLS { $$ = gen_mpls(-1); } | PPPOED { $$ = gen_pppoed(); } | PPPOES { $$ = gen_pppoes(); } | pfvar { $$ = $1; } | pqual p80211 { $$ = $2; } ; pfvar: PF_IFNAME ID { $$ = gen_pf_ifname($2); } | PF_RSET ID { $$ = gen_pf_ruleset($2); } | PF_RNR NUM { $$ = gen_pf_rnr($2); } | PF_SRNR NUM { $$ = gen_pf_srnr($2); } | PF_REASON reason { $$ = gen_pf_reason($2); } | PF_ACTION action { $$ = gen_pf_action($2); } ; p80211: TYPE type SUBTYPE subtype { $$ = gen_p80211_type($2 | $4, IEEE80211_FC0_TYPE_MASK | IEEE80211_FC0_SUBTYPE_MASK); } | TYPE type { $$ = gen_p80211_type($2, IEEE80211_FC0_TYPE_MASK); } | SUBTYPE type_subtype { $$ = gen_p80211_type($2, IEEE80211_FC0_TYPE_MASK | IEEE80211_FC0_SUBTYPE_MASK); } | DIR dir { $$ = gen_p80211_fcdir($2); } ; type: NUM | ID { $$ = str2tok($1, ieee80211_types); if ($$ == -1) bpf_error("unknown 802.11 type name"); } ; subtype: NUM | ID { const struct tok *types = NULL; int i; for (i = 0;; i++) { if (ieee80211_type_subtypes[i].tok == NULL) { /* Ran out of types */ bpf_error("unknown 802.11 type"); break; } if ($-1 == ieee80211_type_subtypes[i].type) { types = ieee80211_type_subtypes[i].tok; break; } } $$ = str2tok($1, types); if ($$ == -1) bpf_error("unknown 802.11 subtype name"); } ; type_subtype: ID { int i; for (i = 0;; i++) { if (ieee80211_type_subtypes[i].tok == NULL) { /* Ran out of types */ bpf_error("unknown 802.11 type name"); break; } $$ = str2tok($1, ieee80211_type_subtypes[i].tok); if ($$ != -1) { $$ |= ieee80211_type_subtypes[i].type; break; } } } ; dir: NUM | ID { if (pcap_strcasecmp($1, "nods") == 0) $$ = IEEE80211_FC1_DIR_NODS; else if (pcap_strcasecmp($1, "tods") == 0) $$ = IEEE80211_FC1_DIR_TODS; else if (pcap_strcasecmp($1, "fromds") == 0) $$ = IEEE80211_FC1_DIR_FROMDS; else if (pcap_strcasecmp($1, "dstods") == 0) $$ = IEEE80211_FC1_DIR_DSTODS; else bpf_error("unknown 802.11 direction"); } ; reason: NUM { $$ = $1; } | ID { $$ = pfreason_to_num($1); } ; action: ID { $$ = pfaction_to_num($1); } ; relop: '>' { $$ = BPF_JGT; } | GEQ { $$ = BPF_JGE; } | '=' { $$ = BPF_JEQ; } ; irelop: LEQ { $$ = BPF_JGT; } | '<' { $$ = BPF_JGE; } | NEQ { $$ = BPF_JEQ; } ; arth: pnum { $$ = gen_loadi($1); } | narth ; narth: pname '[' arth ']' { $$ = gen_load($1, $3, 1); } | pname '[' arth ':' NUM ']' { $$ = gen_load($1, $3, $5); } | arth '+' arth { $$ = gen_arth(BPF_ADD, $1, $3); } | arth '-' arth { $$ = gen_arth(BPF_SUB, $1, $3); } | arth '*' arth { $$ = gen_arth(BPF_MUL, $1, $3); } | arth '/' arth { $$ = gen_arth(BPF_DIV, $1, $3); } | arth '&' arth { $$ = gen_arth(BPF_AND, $1, $3); } | arth '|' arth { $$ = gen_arth(BPF_OR, $1, $3); } | arth LSH arth { $$ = gen_arth(BPF_LSH, $1, $3); } | arth RSH arth { $$ = gen_arth(BPF_RSH, $1, $3); } | '-' arth %prec UMINUS { $$ = gen_neg($2); } | paren narth ')' { $$ = $2; } | LEN { $$ = gen_loadlen(); } ; byteop: '&' { $$ = '&'; } | '|' { $$ = '|'; } | '<' { $$ = '<'; } | '>' { $$ = '>'; } | '=' { $$ = '='; } ; pnum: NUM | paren pnum ')' { $$ = $2; } ; atmtype: LANE { $$ = A_LANE; } | LLC { $$ = A_LLC; } | METAC { $$ = A_METAC; } | BCC { $$ = A_BCC; } | OAMF4EC { $$ = A_OAMF4EC; } | OAMF4SC { $$ = A_OAMF4SC; } | SC { $$ = A_SC; } | ILMIC { $$ = A_ILMIC; } ; atmmultitype: OAM { $$ = A_OAM; } | OAMF4 { $$ = A_OAMF4; } | CONNECTMSG { $$ = A_CONNECTMSG; } | METACONNECT { $$ = A_METACONNECT; } ; /* ATM field types quantifier */ atmfield: VPI { $$.atmfieldtype = A_VPI; } | VCI { $$.atmfieldtype = A_VCI; } ; atmvalue: atmfieldvalue | relop NUM { $$.b = gen_atmfield_code($0.atmfieldtype, (bpf_int32)$2, (bpf_u_int32)$1, 0); } | irelop NUM { $$.b = gen_atmfield_code($0.atmfieldtype, (bpf_int32)$2, (bpf_u_int32)$1, 1); } | paren atmlistvalue ')' { $$.b = $2.b; $$.q = qerr; } ; atmfieldvalue: NUM { $$.atmfieldtype = $0.atmfieldtype; if ($$.atmfieldtype == A_VPI || $$.atmfieldtype == A_VCI) $$.b = gen_atmfield_code($$.atmfieldtype, (bpf_int32) $1, BPF_JEQ, 0); } ; atmlistvalue: atmfieldvalue | atmlistvalue or atmfieldvalue { gen_or($1.b, $3.b); $$ = $3; } ; /* MTP2 types quantifier */ mtp2type: FISU { $$ = M_FISU; } | LSSU { $$ = M_LSSU; } | MSU { $$ = M_MSU; } ; /* MTP3 field types quantifier */ mtp3field: SIO { $$.mtp3fieldtype = M_SIO; } | OPC { $$.mtp3fieldtype = M_OPC; } | DPC { $$.mtp3fieldtype = M_DPC; } | SLS { $$.mtp3fieldtype = M_SLS; } ; mtp3value: mtp3fieldvalue | relop NUM { $$.b = gen_mtp3field_code($0.mtp3fieldtype, (u_int)$2, (u_int)$1, 0); } | irelop NUM { $$.b = gen_mtp3field_code($0.mtp3fieldtype, (u_int)$2, (u_int)$1, 1); } | paren mtp3listvalue ')' { $$.b = $2.b; $$.q = qerr; } ; mtp3fieldvalue: NUM { $$.mtp3fieldtype = $0.mtp3fieldtype; if ($$.mtp3fieldtype == M_SIO || $$.mtp3fieldtype == M_OPC || $$.mtp3fieldtype == M_DPC || $$.mtp3fieldtype == M_SLS ) $$.b = gen_mtp3field_code($$.mtp3fieldtype, (u_int) $1, BPF_JEQ, 0); } ; mtp3listvalue: mtp3fieldvalue | mtp3listvalue or mtp3fieldvalue { gen_or($1.b, $3.b); $$ = $3; } ; %% Index: stable/10/contrib/tcpdump/print-ip.c =================================================================== --- stable/10/contrib/tcpdump/print-ip.c (revision 263085) +++ stable/10/contrib/tcpdump/print-ip.c (revision 263086) @@ -1,711 +1,713 @@ /* * Copyright (c) 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that: (1) source code distributions * retain the above copyright notice and this paragraph in its entirety, (2) * distributions including binary code include the above copyright notice and * this paragraph in its entirety in the documentation or other materials * provided with the distribution, and (3) all advertising materials mentioning * features or use of this software display the following acknowledgement: * ``This product includes software developed by the University of California, * Lawrence Berkeley Laboratory and its contributors.'' Neither the name of * the University nor the names of its contributors may be used to endorse * or promote products derived from this software without specific prior * written permission. * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. * * $FreeBSD$ */ #ifndef lint static const char rcsid[] _U_ = "@(#) $Header: /tcpdump/master/tcpdump/print-ip.c,v 1.159 2007-09-14 01:29:28 guy Exp $ (LBL)"; #endif #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #include #include "addrtoname.h" #include "interface.h" #include "extract.h" /* must come after interface.h */ #include "ip.h" #include "ipproto.h" struct tok ip_option_values[] = { { IPOPT_EOL, "EOL" }, { IPOPT_NOP, "NOP" }, { IPOPT_TS, "timestamp" }, { IPOPT_SECURITY, "security" }, { IPOPT_RR, "RR" }, { IPOPT_SSRR, "SSRR" }, { IPOPT_LSRR, "LSRR" }, { IPOPT_RA, "RA" }, { IPOPT_RFC1393, "traceroute" }, { 0, NULL } }; /* * print the recorded route in an IP RR, LSRR or SSRR option. */ static void ip_printroute(register const u_char *cp, u_int length) { register u_int ptr; register u_int len; if (length < 3) { printf(" [bad length %u]", length); return; } if ((length + 1) & 3) printf(" [bad length %u]", length); ptr = cp[2] - 1; if (ptr < 3 || ((ptr + 1) & 3) || ptr > length + 1) printf(" [bad ptr %u]", cp[2]); for (len = 3; len < length; len += 4) { printf(" %s", ipaddr_string(&cp[len])); if (ptr > len) printf(","); } } /* * If source-routing is present and valid, return the final destination. * Otherwise, return IP destination. * * This is used for UDP and TCP pseudo-header in the checksum * calculation. */ static u_int32_t ip_finddst(const struct ip *ip) { int length; int len; const u_char *cp; u_int32_t retval; cp = (const u_char *)(ip + 1); length = (IP_HL(ip) << 2) - sizeof(struct ip); for (; length > 0; cp += len, length -= len) { int tt; TCHECK(*cp); tt = *cp; if (tt == IPOPT_EOL) break; else if (tt == IPOPT_NOP) len = 1; else { TCHECK(cp[1]); len = cp[1]; if (len < 2) break; } TCHECK2(*cp, len); switch (tt) { case IPOPT_SSRR: case IPOPT_LSRR: if (len < 7) break; memcpy(&retval, cp + len - 4, 4); return retval; } } trunc: memcpy(&retval, &ip->ip_dst.s_addr, sizeof(u_int32_t)); return retval; } /* * Compute a V4-style checksum by building a pseudoheader. */ int nextproto4_cksum(const struct ip *ip, const u_int8_t *data, u_int len, u_int next_proto) { struct phdr { u_int32_t src; u_int32_t dst; u_char mbz; u_char proto; u_int16_t len; } ph; struct cksum_vec vec[2]; /* pseudo-header.. */ ph.len = htons((u_int16_t)len); ph.mbz = 0; ph.proto = next_proto; memcpy(&ph.src, &ip->ip_src.s_addr, sizeof(u_int32_t)); if (IP_HL(ip) == 5) memcpy(&ph.dst, &ip->ip_dst.s_addr, sizeof(u_int32_t)); else ph.dst = ip_finddst(ip); vec[0].ptr = (const u_int8_t *)(void *)&ph; vec[0].len = sizeof(ph); vec[1].ptr = data; vec[1].len = len; return (in_cksum(vec, 2)); } static void ip_printts(register const u_char *cp, u_int length) { register u_int ptr; register u_int len; int hoplen; const char *type; if (length < 4) { printf("[bad length %u]", length); return; } printf(" TS{"); hoplen = ((cp[3]&0xF) != IPOPT_TS_TSONLY) ? 8 : 4; if ((length - 4) & (hoplen-1)) printf("[bad length %u]", length); ptr = cp[2] - 1; len = 0; if (ptr < 4 || ((ptr - 4) & (hoplen-1)) || ptr > length + 1) printf("[bad ptr %u]", cp[2]); switch (cp[3]&0xF) { case IPOPT_TS_TSONLY: printf("TSONLY"); break; case IPOPT_TS_TSANDADDR: printf("TS+ADDR"); break; /* * prespecified should really be 3, but some ones might send 2 * instead, and the IPOPT_TS_PRESPEC constant can apparently * have both values, so we have to hard-code it here. */ case 2: printf("PRESPEC2.0"); break; case 3: /* IPOPT_TS_PRESPEC */ printf("PRESPEC"); break; default: printf("[bad ts type %d]", cp[3]&0xF); goto done; } type = " "; for (len = 4; len < length; len += hoplen) { if (ptr == len) type = " ^ "; printf("%s%d@%s", type, EXTRACT_32BITS(&cp[len+hoplen-4]), hoplen!=8 ? "" : ipaddr_string(&cp[len])); type = " "; } done: printf("%s", ptr == len ? " ^ " : ""); if (cp[3]>>4) printf(" [%d hops not recorded]} ", cp[3]>>4); else printf("}"); } /* * print IP options. */ static void ip_optprint(register const u_char *cp, u_int length) { register u_int option_len; const char *sep = ""; for (; length > 0; cp += option_len, length -= option_len) { u_int option_code; printf("%s", sep); sep = ","; TCHECK(*cp); option_code = *cp; printf("%s", tok2str(ip_option_values,"unknown %u",option_code)); if (option_code == IPOPT_NOP || option_code == IPOPT_EOL) option_len = 1; else { TCHECK(cp[1]); option_len = cp[1]; if (option_len < 2) { printf(" [bad length %u]", option_len); return; } } if (option_len > length) { printf(" [bad length %u]", option_len); return; } TCHECK2(*cp, option_len); switch (option_code) { case IPOPT_EOL: return; case IPOPT_TS: ip_printts(cp, option_len); break; case IPOPT_RR: /* fall through */ case IPOPT_SSRR: case IPOPT_LSRR: ip_printroute(cp, option_len); break; case IPOPT_RA: if (option_len < 4) { printf(" [bad length %u]", option_len); break; } TCHECK(cp[3]); if (EXTRACT_16BITS(&cp[2]) != 0) printf(" value %u", EXTRACT_16BITS(&cp[2])); break; case IPOPT_NOP: /* nothing to print - fall through */ case IPOPT_SECURITY: default: break; } } return; trunc: printf("[|ip]"); } #define IP_RES 0x8000 static struct tok ip_frag_values[] = { { IP_MF, "+" }, { IP_DF, "DF" }, { IP_RES, "rsvd" }, /* The RFC3514 evil ;-) bit */ { 0, NULL } }; struct ip_print_demux_state { const struct ip *ip; const u_char *cp; u_int len, off; u_char nh; int advance; }; static void ip_print_demux(netdissect_options *ndo, struct ip_print_demux_state *ipds) { struct protoent *proto; struct cksum_vec vec[1]; again: switch (ipds->nh) { case IPPROTO_AH: ipds->nh = *ipds->cp; ipds->advance = ah_print(ipds->cp); if (ipds->advance <= 0) break; ipds->cp += ipds->advance; ipds->len -= ipds->advance; goto again; case IPPROTO_ESP: { int enh, padlen; ipds->advance = esp_print(ndo, ipds->cp, ipds->len, (const u_char *)ipds->ip, &enh, &padlen); if (ipds->advance <= 0) break; ipds->cp += ipds->advance; ipds->len -= ipds->advance + padlen; ipds->nh = enh & 0xff; goto again; } case IPPROTO_IPCOMP: { int enh; ipds->advance = ipcomp_print(ipds->cp, &enh); if (ipds->advance <= 0) break; ipds->cp += ipds->advance; ipds->len -= ipds->advance; ipds->nh = enh & 0xff; goto again; } case IPPROTO_SCTP: sctp_print(ipds->cp, (const u_char *)ipds->ip, ipds->len); break; case IPPROTO_DCCP: dccp_print(ipds->cp, (const u_char *)ipds->ip, ipds->len); break; case IPPROTO_TCP: /* pass on the MF bit plus the offset to detect fragments */ tcp_print(ipds->cp, ipds->len, (const u_char *)ipds->ip, ipds->off & (IP_MF|IP_OFFMASK)); break; case IPPROTO_UDP: /* pass on the MF bit plus the offset to detect fragments */ udp_print(ipds->cp, ipds->len, (const u_char *)ipds->ip, ipds->off & (IP_MF|IP_OFFMASK)); break; case IPPROTO_ICMP: /* pass on the MF bit plus the offset to detect fragments */ icmp_print(ipds->cp, ipds->len, (const u_char *)ipds->ip, ipds->off & (IP_MF|IP_OFFMASK)); break; case IPPROTO_PIGP: /* * XXX - the current IANA protocol number assignments * page lists 9 as "any private interior gateway * (used by Cisco for their IGRP)" and 88 as * "EIGRP" from Cisco. * * Recent BSD headers define * IP_PROTO_PIGP as 9 and IP_PROTO_IGRP as 88. * We define IP_PROTO_PIGP as 9 and * IP_PROTO_EIGRP as 88; those names better * match was the current protocol number * assignments say. */ igrp_print(ipds->cp, ipds->len, (const u_char *)ipds->ip); break; case IPPROTO_EIGRP: eigrp_print(ipds->cp, ipds->len); break; case IPPROTO_ND: ND_PRINT((ndo, " nd %d", ipds->len)); break; case IPPROTO_EGP: egp_print(ipds->cp, ipds->len); break; case IPPROTO_OSPF: ospf_print(ipds->cp, ipds->len, (const u_char *)ipds->ip); break; case IPPROTO_IGMP: igmp_print(ipds->cp, ipds->len); break; case IPPROTO_IPV4: /* DVMRP multicast tunnel (ip-in-ip encapsulation) */ ip_print(ndo, ipds->cp, ipds->len); if (! vflag) { ND_PRINT((ndo, " (ipip-proto-4)")); return; } break; #ifdef INET6 case IPPROTO_IPV6: /* ip6-in-ip encapsulation */ ip6_print(ndo, ipds->cp, ipds->len); break; #endif /*INET6*/ case IPPROTO_RSVP: rsvp_print(ipds->cp, ipds->len); break; case IPPROTO_GRE: /* do it */ gre_print(ipds->cp, ipds->len); break; case IPPROTO_MOBILE: mobile_print(ipds->cp, ipds->len); break; case IPPROTO_PIM: vec[0].ptr = ipds->cp; vec[0].len = ipds->len; pim_print(ipds->cp, ipds->len, in_cksum(vec, 1)); break; case IPPROTO_VRRP: if (packettype == PT_CARP) { if (vflag) (void)printf("carp %s > %s: ", ipaddr_string(&ipds->ip->ip_src), ipaddr_string(&ipds->ip->ip_dst)); carp_print(ipds->cp, ipds->len, ipds->ip->ip_ttl); } else { if (vflag) (void)printf("vrrp %s > %s: ", ipaddr_string(&ipds->ip->ip_src), ipaddr_string(&ipds->ip->ip_dst)); vrrp_print(ipds->cp, ipds->len, ipds->ip->ip_ttl); } break; case IPPROTO_PGM: pgm_print(ipds->cp, ipds->len, (const u_char *)ipds->ip); break; +#if defined(HAVE_NET_PFVAR_H) case IPPROTO_PFSYNC: pfsync_ip_print(ipds->cp, ipds->len); break; +#endif default: if (ndo->ndo_nflag==0 && (proto = getprotobynumber(ipds->nh)) != NULL) ND_PRINT((ndo, " %s", proto->p_name)); else ND_PRINT((ndo, " ip-proto-%d", ipds->nh)); ND_PRINT((ndo, " %d", ipds->len)); break; } } void ip_print_inner(netdissect_options *ndo, const u_char *bp, u_int length, u_int nh, const u_char *bp2) { struct ip_print_demux_state ipd; ipd.ip = (const struct ip *)bp2; ipd.cp = bp; ipd.len = length; ipd.off = 0; ipd.nh = nh; ipd.advance = 0; ip_print_demux(ndo, &ipd); } /* * print an IP datagram. */ void ip_print(netdissect_options *ndo, const u_char *bp, u_int length) { struct ip_print_demux_state ipd; struct ip_print_demux_state *ipds=&ipd; const u_char *ipend; u_int hlen; struct cksum_vec vec[1]; u_int16_t sum, ip_sum; struct protoent *proto; ipds->ip = (const struct ip *)bp; if (IP_V(ipds->ip) != 4) { /* print version if != 4 */ printf("IP%u ", IP_V(ipds->ip)); if (IP_V(ipds->ip) == 6) printf(", wrong link-layer encapsulation"); } else if (!eflag) printf("IP "); if ((u_char *)(ipds->ip + 1) > ndo->ndo_snapend) { printf("[|ip]"); return; } if (length < sizeof (struct ip)) { (void)printf("truncated-ip %u", length); return; } hlen = IP_HL(ipds->ip) * 4; if (hlen < sizeof (struct ip)) { (void)printf("bad-hlen %u", hlen); return; } ipds->len = EXTRACT_16BITS(&ipds->ip->ip_len); if (length < ipds->len) (void)printf("truncated-ip - %u bytes missing! ", ipds->len - length); if (ipds->len < hlen) { #ifdef GUESS_TSO if (ipds->len) { (void)printf("bad-len %u", ipds->len); return; } else { /* we guess that it is a TSO send */ ipds->len = length; } #else (void)printf("bad-len %u", ipds->len); return; #endif /* GUESS_TSO */ } /* * Cut off the snapshot length to the end of the IP payload. */ ipend = bp + ipds->len; if (ipend < ndo->ndo_snapend) ndo->ndo_snapend = ipend; ipds->len -= hlen; ipds->off = EXTRACT_16BITS(&ipds->ip->ip_off); if (vflag) { (void)printf("(tos 0x%x", (int)ipds->ip->ip_tos); /* ECN bits */ if (ipds->ip->ip_tos & 0x03) { switch (ipds->ip->ip_tos & 0x03) { case 1: (void)printf(",ECT(1)"); break; case 2: (void)printf(",ECT(0)"); break; case 3: (void)printf(",CE"); } } if (ipds->ip->ip_ttl >= 1) (void)printf(", ttl %u", ipds->ip->ip_ttl); /* * for the firewall guys, print id, offset. * On all but the last stick a "+" in the flags portion. * For unfragmented datagrams, note the don't fragment flag. */ (void)printf(", id %u, offset %u, flags [%s], proto %s (%u)", EXTRACT_16BITS(&ipds->ip->ip_id), (ipds->off & 0x1fff) * 8, bittok2str(ip_frag_values, "none", ipds->off&0xe000), tok2str(ipproto_values,"unknown",ipds->ip->ip_p), ipds->ip->ip_p); (void)printf(", length %u", EXTRACT_16BITS(&ipds->ip->ip_len)); if ((hlen - sizeof(struct ip)) > 0) { printf(", options ("); ip_optprint((u_char *)(ipds->ip + 1), hlen - sizeof(struct ip)); printf(")"); } if (!Kflag && (u_char *)ipds->ip + hlen <= ndo->ndo_snapend) { vec[0].ptr = (const u_int8_t *)(void *)ipds->ip; vec[0].len = hlen; sum = in_cksum(vec, 1); if (sum != 0) { ip_sum = EXTRACT_16BITS(&ipds->ip->ip_sum); (void)printf(", bad cksum %x (->%x)!", ip_sum, in_cksum_shouldbe(ip_sum, sum)); } } printf(")\n "); } /* * If this is fragment zero, hand it to the next higher * level protocol. */ if ((ipds->off & 0x1fff) == 0) { ipds->cp = (const u_char *)ipds->ip + hlen; ipds->nh = ipds->ip->ip_p; if (ipds->nh != IPPROTO_TCP && ipds->nh != IPPROTO_UDP && ipds->nh != IPPROTO_SCTP && ipds->nh != IPPROTO_DCCP) { (void)printf("%s > %s: ", ipaddr_string(&ipds->ip->ip_src), ipaddr_string(&ipds->ip->ip_dst)); } ip_print_demux(ndo, ipds); } else { /* Ultra quiet now means that all this stuff should be suppressed */ if (qflag > 1) return; /* * if this isn't the first frag, we're missing the * next level protocol header. print the ip addr * and the protocol. */ if (ipds->off & 0x1fff) { (void)printf("%s > %s:", ipaddr_string(&ipds->ip->ip_src), ipaddr_string(&ipds->ip->ip_dst)); if (!ndo->ndo_nflag && (proto = getprotobynumber(ipds->ip->ip_p)) != NULL) (void)printf(" %s", proto->p_name); else (void)printf(" ip-proto-%d", ipds->ip->ip_p); } } } void ipN_print(register const u_char *bp, register u_int length) { struct ip *ip, hdr; ip = (struct ip *)bp; if (length < 4) { (void)printf("truncated-ip %d", length); return; } memcpy (&hdr, (char *)ip, 4); switch (IP_V(&hdr)) { case 4: ip_print (gndo, bp, length); return; #ifdef INET6 case 6: ip6_print (gndo, bp, length); return; #endif default: (void)printf("unknown ip %d", IP_V(&hdr)); return; } } /* * Local Variables: * c-style: whitesmith * c-basic-offset: 8 * End: */ Index: stable/10/etc/mtree/BSD.include.dist =================================================================== --- stable/10/etc/mtree/BSD.include.dist (revision 263085) +++ stable/10/etc/mtree/BSD.include.dist (revision 263086) @@ -1,340 +1,344 @@ # $FreeBSD$ # # Please see the file src/etc/mtree/README before making changes to this file. # /set type=dir uname=root gname=wheel mode=0755 . altq .. arpa .. bsm .. bsnmp .. c++ 4.2 backward .. bits .. debug .. ext pb_ds detail basic_tree_policy .. bin_search_tree_ .. binary_heap_ .. binomial_heap_ .. binomial_heap_base_ .. cc_hash_table_map_ .. eq_fn .. gp_hash_table_map_ .. hash_fn .. left_child_next_sibling_heap_ .. list_update_map_ .. list_update_policy .. ov_tree_map_ .. pairing_heap_ .. pat_trie_ .. rb_tree_map_ .. rc_binomial_heap_ .. resize_policy .. splay_tree_ .. thin_heap_ .. tree_policy .. trie_policy .. unordered_iterator .. .. .. .. tr1 .. .. v1 experimental .. ext .. tr1 .. .. .. cam ata .. scsi .. .. clang 3.3 .. .. crypto .. dev acpica .. agp .. an .. bktr .. ciss .. filemon .. firewire .. hwpmc .. ic .. ieee488 .. iicbus .. io .. lmc .. mfi .. mpt mpilib .. .. nand .. nvme .. ofw .. pbio .. pci .. powermac_nvram .. ppbus .. smbus .. speaker .. usb .. utopia .. vkbd .. wi .. .. edit readline .. .. fs devfs .. fdescfs .. msdosfs .. nandfs .. nfs .. nullfs .. procfs .. smbfs .. udf .. unionfs .. .. gcc 4.2 .. .. geom cache .. concat .. eli .. gate .. journal .. label .. mirror .. mountver .. multipath .. nop .. raid .. raid3 .. shsec .. stripe .. virstor .. .. gnu posix .. .. gpib .. gssapi .. infiniband complib .. iba .. opensm .. vendor .. .. isofs cd9660 .. .. kadm5 .. krb5 .. libmilter .. lzma .. machine pc .. .. net .. net80211 .. netatalk .. netgraph atm .. bluetooth include .. .. netflow .. .. netinet .. netinet6 .. netipsec .. netipx .. netnatm api .. msg .. saal .. sig .. .. + netpfil + pf + .. + .. netsmb .. nfs .. nfsclient .. nfsserver .. openssl .. pcap .. protocols .. rdma .. readline .. rpc .. rpcsvc .. security audit .. mac_biba .. mac_bsdextended .. mac_lomac .. mac_mls .. mac_partition .. .. ssp .. sys .. ufs ffs .. ufs .. .. vm .. xlocale .. .. Index: stable/10/include/Makefile =================================================================== --- stable/10/include/Makefile (revision 263085) +++ stable/10/include/Makefile (revision 263086) @@ -1,334 +1,341 @@ # @(#)Makefile 8.2 (Berkeley) 1/4/94 # $FreeBSD$ # # Doing a "make install" builds /usr/include. .include CLEANFILES= osreldate.h version vers.c SUBDIR= arpa gssapi protocols rpcsvc rpc xlocale INCS= a.out.h ar.h assert.h bitstring.h complex.h cpio.h _ctype.h ctype.h \ db.h \ dirent.h dlfcn.h elf.h elf-hints.h err.h fmtmsg.h fnmatch.h fstab.h \ fts.h ftw.h getopt.h glob.h grp.h gssapi.h \ ieeefp.h ifaddrs.h \ inttypes.h iso646.h kenv.h langinfo.h libgen.h limits.h link.h \ locale.h malloc.h malloc_np.h memory.h monetary.h mpool.h mqueue.h \ ndbm.h netconfig.h \ netdb.h nl_types.h nlist.h nss.h nsswitch.h paths.h \ printf.h proc_service.h pthread.h \ pthread_np.h pwd.h ranlib.h readpassphrase.h regex.h \ res_update.h resolv.h runetype.h search.h semaphore.h setjmp.h \ signal.h spawn.h stab.h stdalign.h stdbool.h stddef.h \ stdnoreturn.h stdio.h stdlib.h string.h stringlist.h \ strings.h sysexits.h tar.h termios.h tgmath.h \ time.h timeconv.h timers.h ttyent.h \ uchar.h ulimit.h unistd.h utime.h utmpx.h uuid.h varargs.h \ wchar.h wctype.h wordexp.h xlocale.h .PATH: ${.CURDIR}/../contrib/libc-vis INCS+= vis.h MHDRS= float.h floatingpoint.h stdarg.h PHDRS= sched.h _semaphore.h LHDRS= aio.h errno.h fcntl.h linker_set.h poll.h stdatomic.h stdint.h \ syslog.h ucontext.h LDIRS= bsm cam geom net net80211 netatalk netgraph netinet netinet6 \ netipsec ${_netipx} netnatm netsmb \ nfs nfsclient nfsserver \ sys vm LSUBDIRS= cam/ata cam/scsi \ dev/acpica dev/agp dev/an dev/bktr dev/ciss dev/filemon dev/firewire \ dev/hwpmc \ dev/ic dev/iicbus ${_dev_ieee488} dev/io dev/lmc dev/mfi dev/nvme \ dev/ofw dev/pbio dev/pci ${_dev_powermac_nvram} dev/ppbus dev/smbus \ dev/speaker dev/usb dev/utopia dev/vkbd dev/wi \ fs/devfs fs/fdescfs fs/msdosfs fs/nandfs fs/nfs fs/nullfs \ fs/procfs fs/smbfs fs/udf fs/unionfs \ geom/cache geom/concat geom/eli geom/gate geom/journal geom/label \ geom/mirror geom/mountver geom/multipath geom/nop \ geom/raid geom/raid3 geom/shsec geom/stripe geom/virstor \ netgraph/atm netgraph/netflow \ security/audit \ security/mac_biba security/mac_bsdextended security/mac_lomac \ security/mac_mls security/mac_partition \ ufs/ffs ufs/ufs LSUBSUBDIRS= dev/mpt/mpilib .if ${MACHINE_ARCH} == "powerpc" || ${MACHINE_ARCH} == "powerpc64" _dev_powermac_nvram= dev/powermac_nvram .endif .if ${MK_GPIB} != "no" _dev_ieee488= dev/ieee488 .endif .if ${MK_HESIOD} != "no" INCS+= hesiod.h .endif .if ${MK_BLUETOOTH} != "no" LSUBSUBDIRS+= netgraph/bluetooth/include .endif # XXX unconditionally needed by #.if ${MK_IPX} != "no" _netipx= netipx #.endif # Handle the #define aliases for libiconv .if ${MK_ICONV} == "yes" INCS+= iconv.h .endif # Define SHARED to indicate whether you want symbolic links to the system # source (``symlinks''), or a separate copy (``copies''). ``symlinks'' is # probably only useful for developers and should be avoided if you do not # wish to tie your /usr/include and /usr/src together. #SHARED= symlinks SHARED?= copies INCS+= osreldate.h SYSDIR= ${.CURDIR}/../sys NEWVERS_SH= ${SYSDIR}/conf/newvers.sh PARAM_H= ${SYSDIR}/sys/param.h MK_OSRELDATE_SH= ${.CURDIR}/mk-osreldate.sh osreldate.h vers.c: ${NEWVERS_SH} ${PARAM_H} ${MK_OSRELDATE_SH} env ECHO="${ECHO}" \ MAKE="${MAKE}" \ NEWVERS_SH=${NEWVERS_SH} \ PARAM_H=${PARAM_H} \ SYSDIR=${SYSDIR} \ sh ${MK_OSRELDATE_SH} .for i in ${LHDRS} INCSLINKS+= sys/$i ${INCLUDEDIR}/$i .endfor .for i in ${MHDRS} INCSLINKS+= machine/$i ${INCLUDEDIR}/$i .endfor .for i in ${PHDRS} INCSLINKS+= sys/$i ${INCLUDEDIR}/$i .endfor .if ${MACHINE} != ${MACHINE_CPUARCH} _MARCHS= ${MACHINE_CPUARCH} .endif .if ${MACHINE_CPUARCH} == "i386" || ${MACHINE_CPUARCH} == "amd64" _MARCHS+= x86 .endif .include installincludes: ${SHARED} ${SHARED}: compat # Take care of stale directory-level symlinks. compat: .for i in ${LDIRS} ${LSUBDIRS} machine ${_MARCHS} crypto if [ -L ${DESTDIR}${INCLUDEDIR}/$i ]; then \ rm -f ${DESTDIR}${INCLUDEDIR}/$i; \ fi .endfor mtree -deU ${MTREE_FOLLOWS_SYMLINKS} \ -f ${.CURDIR}/../etc/mtree/BSD.include.dist \ -p ${DESTDIR}${INCLUDEDIR} copies: .for i in ${LDIRS} ${LSUBDIRS} ${LSUBSUBDIRS} altq crypto machine machine/pc \ ${_MARCHS} .if exists(${DESTDIR}${INCLUDEDIR}/$i) cd ${DESTDIR}${INCLUDEDIR}/$i; \ for h in *.h; do \ if [ -L $$h ]; then rm -f $$h; fi; \ done .endif .endfor .for i in ${LDIRS} ${LSUBDIRS:Ndev/agp:Ndev/acpica:Ndev/bktr:Ndev/nand:Ndev/pci} ${LSUBSUBDIRS} cd ${.CURDIR}/../sys; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 $i/*.h \ ${DESTDIR}${INCLUDEDIR}/$i .endfor cd ${.CURDIR}/../sys/dev/acpica; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 acpiio.h \ ${DESTDIR}${INCLUDEDIR}/dev/acpica cd ${.CURDIR}/../sys/dev/agp; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 agpreg.h \ ${DESTDIR}${INCLUDEDIR}/dev/agp cd ${.CURDIR}/../sys/dev/bktr; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 ioctl_*.h \ ${DESTDIR}${INCLUDEDIR}/dev/bktr .if ${MK_NAND} != "no" cd ${.CURDIR}/../sys/dev/nand; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 nandsim.h \ ${DESTDIR}${INCLUDEDIR}/dev/nand; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 nand_dev.h \ ${DESTDIR}${INCLUDEDIR}/dev/nand .endif cd ${.CURDIR}/../sys/dev/pci; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 pcireg.h \ ${DESTDIR}${INCLUDEDIR}/dev/pci cd ${.CURDIR}/../sys/contrib/altq/altq; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 *.h \ ${DESTDIR}${INCLUDEDIR}/altq cd ${.CURDIR}/../sys/fs/cd9660/; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 *.h \ ${DESTDIR}${INCLUDEDIR}/isofs/cd9660 .if ${MK_IPFILTER} != "no" cd ${.CURDIR}/../sys/contrib/ipfilter/netinet; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 *.h \ ${DESTDIR}${INCLUDEDIR}/netinet .endif cd ${.CURDIR}/../sys/crypto; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 rijndael/rijndael.h \ ${DESTDIR}${INCLUDEDIR}/crypto cd ${.CURDIR}/../sys/opencrypto; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 *.h \ ${DESTDIR}${INCLUDEDIR}/crypto cd ${.CURDIR}/../sys/${MACHINE}/include; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 *.h \ ${DESTDIR}${INCLUDEDIR}/machine .if exists(${.CURDIR}/../sys/${MACHINE}/include/pc) cd ${.CURDIR}/../sys/${MACHINE}/include/pc; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 *.h \ ${DESTDIR}${INCLUDEDIR}/machine/pc .endif .for _MARCH in ${_MARCHS} .if exists(${.CURDIR}/../sys/${_MARCH}/include) ${INSTALL} -d -o ${BINOWN} -g ${BINGRP} -m 755 \ ${DESTDIR}${INCLUDEDIR}/${_MARCH}; \ cd ${.CURDIR}/../sys/${_MARCH}/include; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 *.h \ ${DESTDIR}${INCLUDEDIR}/${_MARCH} .if exists(${.CURDIR}/../sys/${_MARCH}/include/pc) ${INSTALL} -d -o ${BINOWN} -g ${BINGRP} -m 755 \ ${DESTDIR}${INCLUDEDIR}/${_MARCH}/pc; \ cd ${.CURDIR}/../sys/${_MARCH}/include/pc; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 *.h \ ${DESTDIR}${INCLUDEDIR}/${_MARCH}/pc .endif .endif .endfor cd ${.CURDIR}/../sys/rpc; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 types.h \ ${DESTDIR}${INCLUDEDIR}/rpc symlinks: @${ECHO} "Setting up symlinks to kernel source tree..." .for i in ${LDIRS} cd ${.CURDIR}/../sys/$i; \ for h in *.h; do \ ln -fs ../../../sys/$i/$$h ${DESTDIR}${INCLUDEDIR}/$i; \ done .endfor .for i in ${LSUBDIRS:Ndev/agp:Ndev/acpica:Ndev/bktr:Ndev/nand:Ndev/pci} cd ${.CURDIR}/../sys/$i; \ for h in *.h; do \ ln -fs ../../../../sys/$i/$$h ${DESTDIR}${INCLUDEDIR}/$i; \ done .endfor cd ${.CURDIR}/../sys/dev/acpica; \ for h in acpiio.h; do \ ln -fs ../../../../sys/dev/acpica/$$h \ ${DESTDIR}${INCLUDEDIR}/dev/acpica; \ done cd ${.CURDIR}/../sys/dev/agp; \ for h in agpreg.h; do \ ln -fs ../../../../sys/dev/agp/$$h \ ${DESTDIR}${INCLUDEDIR}/dev/agp; \ done cd ${.CURDIR}/../sys/dev/bktr; \ for h in ioctl_*.h; do \ ln -fs ../../../../sys/dev/bktr/$$h \ ${DESTDIR}${INCLUDEDIR}/dev/bktr; \ done .if ${MK_NAND} != "no" cd ${.CURDIR}/../sys/dev/nand; \ for h in nandsim.h nand_dev.h; do \ ln -fs ../../../../sys/dev/nand/$$h \ ${DESTDIR}${INCLUDEDIR}/dev/nand; \ done .endif cd ${.CURDIR}/../sys/dev/pci; \ for h in pcireg.h; do \ ln -fs ../../../../sys/dev/pci/$$h \ ${DESTDIR}${INCLUDEDIR}/dev/pci; \ done .for i in ${LSUBSUBDIRS} cd ${.CURDIR}/../sys/$i; \ for h in *.h; do \ ln -fs ../../../../../sys/$i/$$h ${DESTDIR}${INCLUDEDIR}/$i; \ done .endfor cd ${.CURDIR}/../sys/contrib/altq/altq; \ for h in *.h; do \ ln -fs ../../../sys/contrib/altq/altq/$$h \ ${DESTDIR}${INCLUDEDIR}/altq; \ done .if ${MK_IPFILTER} != "no" cd ${.CURDIR}/../sys/contrib/ipfilter/netinet; \ for h in *.h; do \ ln -fs ../../../sys/contrib/ipfilter/netinet/$$h \ ${DESTDIR}${INCLUDEDIR}/netinet; \ done .endif +.if ${MK_PF} != "no" + cd ${.CURDIR}/../sys/netpfil/pf; \ + for h in *.h; do \ + ln -fs ../../../../sys/netpfil/pf/$$h \ + ${DESTDIR}${INCLUDEDIR}/netpfil/pf; \ + done +.endif cd ${.CURDIR}/../sys/crypto; \ for h in rijndael/rijndael.h; do \ ln -fs ../../../sys/crypto/$$h \ ${DESTDIR}${INCLUDEDIR}/crypto; \ done cd ${.CURDIR}/../sys/opencrypto; \ for h in *.h; do \ ln -fs ../../../sys/opencrypto/$$h \ ${DESTDIR}${INCLUDEDIR}/crypto; \ done cd ${.CURDIR}/../sys/${MACHINE}/include; \ for h in *.h; do \ ln -fs ../../../sys/${MACHINE}/include/$$h \ ${DESTDIR}${INCLUDEDIR}/machine; \ done .if exists(${.CURDIR}/../sys/${MACHINE}/include/pc) cd ${.CURDIR}/../sys/${MACHINE}/include/pc; \ for h in *.h; do \ ln -fs ../../../../sys/${MACHINE}/include/pc/$$h \ ${DESTDIR}${INCLUDEDIR}/machine/pc; \ done .endif .for _MARCH in ${_MARCHS} .if exists(${.CURDIR}/../sys/${_MARCH}/include) ${INSTALL} -d -o ${BINOWN} -g ${BINGRP} -m 755 \ ${DESTDIR}${INCLUDEDIR}/${_MARCH}; \ cd ${.CURDIR}/../sys/${_MARCH}/include; \ for h in *.h; do \ ln -fs ../../../sys/${_MARCH}/include/$$h \ ${DESTDIR}${INCLUDEDIR}/${_MARCH}; \ done .if exists(${.CURDIR}/../sys/${_MARCH}/include/pc) ${INSTALL} -d -o ${BINOWN} -g ${BINGRP} -m 755 \ ${DESTDIR}${INCLUDEDIR}/${_MARCH}/pc; \ cd ${.CURDIR}/../sys/${_MARCH}/include/pc; \ for h in *.h; do \ ln -fs ../../../../sys/${_MARCH}/include/pc/$$h \ ${DESTDIR}${INCLUDEDIR}/${_MARCH}/pc; \ done .endif .endif .endfor cd ${.CURDIR}/../sys/fs/cd9660; \ for h in *.h; do \ ln -fs ../../../../sys/fs/cd9660/$$h \ ${DESTDIR}${INCLUDEDIR}/isofs/cd9660; \ done cd ${.CURDIR}/../sys/rpc; \ for h in types.h; do \ ln -fs ../../../sys/rpc/$$h \ ${DESTDIR}${INCLUDEDIR}/rpc; \ done Index: stable/10/sbin/ifconfig/Makefile =================================================================== --- stable/10/sbin/ifconfig/Makefile (revision 263085) +++ stable/10/sbin/ifconfig/Makefile (revision 263086) @@ -1,70 +1,72 @@ # From: @(#)Makefile 8.1 (Berkeley) 6/5/93 # $FreeBSD$ .include PROG= ifconfig SRCS= ifconfig.c # base support # # NB: The order here defines the order in which the constructors # are called. This in turn defines the default order in which # status is displayed. Probably should add a priority mechanism # to the registration process so we don't depend on this aspect # of the toolchain. # SRCS+= af_link.c # LLC support .if ${MK_INET_SUPPORT} != "no" SRCS+= af_inet.c # IPv4 support .endif .if ${MK_INET6_SUPPORT} != "no" SRCS+= af_inet6.c # IPv6 support .endif SRCS+= af_atalk.c # AppleTalk support .if ${MK_INET6_SUPPORT} != "no" SRCS+= af_nd6.c # ND6 support .endif SRCS+= ifclone.c # clone device support SRCS+= ifmac.c # MAC support SRCS+= ifmedia.c # SIOC[GS]IFMEDIA support SRCS+= iffib.c # non-default FIB support SRCS+= ifvlan.c # SIOC[GS]ETVLAN support SRCS+= ifgre.c # GRE keys etc SRCS+= ifgif.c # GIF reversed header workaround SRCS+= ifieee80211.c regdomain.c # SIOC[GS]IEEE80211 support DPADD+= ${LIBBSDXML} ${LIBSBUF} LDADD+= -lbsdxml -lsbuf SRCS+= carp.c # SIOC[GS]VH support SRCS+= ifgroup.c # ... +.if ${MK_PF} != "no" SRCS+= ifpfsync.c # pfsync(4) support +.endif SRCS+= ifbridge.c # bridge support SRCS+= iflagg.c # lagg support .if ${MK_INET6_SUPPORT} != "no" CFLAGS+= -DINET6 .endif .if ${MK_INET_SUPPORT} != "no" CFLAGS+= -DINET .endif .if ${MK_IPX_SUPPORT} != "no" && !defined(RELEASE_CRUNCH) SRCS+= af_ipx.c # IPX support DPADD+= ${LIBIPX} LDADD+= -lipx .endif .if ${MK_JAIL} != "no" && !defined(RELEASE_CRUNCH) && !defined(RESCUE) CFLAGS+= -DJAIL DPADD+= ${LIBJAIL} LDADD+= -ljail .endif MAN= ifconfig.8 CFLAGS+= -Wall -Wmissing-prototypes -Wcast-qual -Wwrite-strings -Wnested-externs WARNS?= 2 .include Index: stable/10/sbin/ipfw/Makefile =================================================================== --- stable/10/sbin/ipfw/Makefile (revision 263085) +++ stable/10/sbin/ipfw/Makefile (revision 263086) @@ -1,10 +1,18 @@ # $FreeBSD$ +.include + PROG= ipfw -SRCS= ipfw2.c dummynet.c ipv6.c main.c nat.c altq.c +SRCS= ipfw2.c dummynet.c ipv6.c main.c nat.c WARNS?= 2 + +.if ${MK_PF} != "no" +SRCS+= altq.c +CFLAGS+=-DPF +.endif + DPADD= ${LIBUTIL} LDADD= -lutil MAN= ipfw.8 .include Index: stable/10/sbin/ipfw/ipfw2.h =================================================================== --- stable/10/sbin/ipfw/ipfw2.h (revision 263085) +++ stable/10/sbin/ipfw/ipfw2.h (revision 263086) @@ -1,293 +1,296 @@ /* * Copyright (c) 2002-2003 Luigi Rizzo * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp * Copyright (c) 1994 Ugen J.S.Antsilevich * * Idea and grammar partially left from: * Copyright (c) 1993 Daniel Boulet * * Redistribution and use in source forms, with and without modification, * are permitted provided that this entire comment appears intact. * * Redistribution in binary form may occur without any restrictions. * Obviously, it would be nice if you gave credit where credit is due * but requiring it would be too onerous. * * This software is provided ``AS IS'' without any warranties of any kind. * * NEW command line interface for IP firewall facility * * $FreeBSD$ */ /* * Options that can be set on the command line. * When reading commands from a file, a subset of the options can also * be applied globally by specifying them before the file name. * After that, each line can contain its own option that changes * the global value. * XXX The context is not restored after each line. */ struct cmdline_opts { /* boolean options: */ int do_value_as_ip; /* show table value as IP */ int do_resolv; /* try to resolve all ip to names */ int do_time; /* Show time stamps */ int do_quiet; /* Be quiet in add and flush */ int do_pipe; /* this cmd refers to a pipe/queue/sched */ int do_nat; /* this cmd refers to a nat config */ int do_dynamic; /* display dynamic rules */ int do_expired; /* display expired dynamic rules */ int do_compact; /* show rules in compact mode */ int do_force; /* do not ask for confirmation */ int show_sets; /* display the set each rule belongs to */ int test_only; /* only check syntax */ int comment_only; /* only print action and comment */ int verbose; /* be verbose on some commands */ /* The options below can have multiple values. */ int do_sort; /* field to sort results (0 = no) */ /* valid fields are 1 and above */ int use_set; /* work with specified set number */ /* 0 means all sets, otherwise apply to set use_set - 1 */ }; extern struct cmdline_opts co; /* * _s_x is a structure that stores a string <-> token pairs, used in * various places in the parser. Entries are stored in arrays, * with an entry with s=NULL as terminator. * The search routines are match_token() and match_value(). * Often, an element with x=0 contains an error string. * */ struct _s_x { char const *s; int x; }; enum tokens { TOK_NULL=0, TOK_OR, TOK_NOT, TOK_STARTBRACE, TOK_ENDBRACE, TOK_ACCEPT, TOK_COUNT, TOK_PIPE, TOK_LINK, TOK_QUEUE, TOK_FLOWSET, TOK_SCHED, TOK_DIVERT, TOK_TEE, TOK_NETGRAPH, TOK_NGTEE, TOK_FORWARD, TOK_SKIPTO, TOK_DENY, TOK_REJECT, TOK_RESET, TOK_UNREACH, TOK_CHECKSTATE, TOK_NAT, TOK_REASS, TOK_CALL, TOK_RETURN, TOK_ALTQ, TOK_LOG, TOK_TAG, TOK_UNTAG, TOK_TAGGED, TOK_UID, TOK_GID, TOK_JAIL, TOK_IN, TOK_LIMIT, TOK_KEEPSTATE, TOK_LAYER2, TOK_OUT, TOK_DIVERTED, TOK_DIVERTEDLOOPBACK, TOK_DIVERTEDOUTPUT, TOK_XMIT, TOK_RECV, TOK_VIA, TOK_FRAG, TOK_IPOPTS, TOK_IPLEN, TOK_IPID, TOK_IPPRECEDENCE, TOK_DSCP, TOK_IPTOS, TOK_IPTTL, TOK_IPVER, TOK_ESTAB, TOK_SETUP, TOK_TCPDATALEN, TOK_TCPFLAGS, TOK_TCPOPTS, TOK_TCPSEQ, TOK_TCPACK, TOK_TCPWIN, TOK_ICMPTYPES, TOK_MAC, TOK_MACTYPE, TOK_VERREVPATH, TOK_VERSRCREACH, TOK_ANTISPOOF, TOK_IPSEC, TOK_COMMENT, TOK_PLR, TOK_NOERROR, TOK_BUCKETS, TOK_DSTIP, TOK_SRCIP, TOK_DSTPORT, TOK_SRCPORT, TOK_ALL, TOK_MASK, TOK_FLOW_MASK, TOK_SCHED_MASK, TOK_BW, TOK_DELAY, TOK_PROFILE, TOK_BURST, TOK_RED, TOK_GRED, TOK_DROPTAIL, TOK_PROTO, /* dummynet tokens */ TOK_WEIGHT, TOK_LMAX, TOK_PRI, TOK_TYPE, TOK_SLOTSIZE, TOK_IP, TOK_IF, TOK_ALOG, TOK_DENY_INC, TOK_SAME_PORTS, TOK_UNREG_ONLY, TOK_SKIP_GLOBAL, TOK_RESET_ADDR, TOK_ALIAS_REV, TOK_PROXY_ONLY, TOK_REDIR_ADDR, TOK_REDIR_PORT, TOK_REDIR_PROTO, TOK_IPV6, TOK_FLOWID, TOK_ICMP6TYPES, TOK_EXT6HDR, TOK_DSTIP6, TOK_SRCIP6, TOK_IPV4, TOK_UNREACH6, TOK_RESET6, TOK_FIB, TOK_SETFIB, TOK_LOOKUP, TOK_SOCKARG, TOK_SETDSCP, }; /* * the following macro returns an error message if we run out of * arguments. */ #define NEED(_p, msg) {if (!_p) errx(EX_USAGE, msg);} #define NEED1(msg) {if (!(*av)) errx(EX_USAGE, msg);} int pr_u64(uint64_t *pd, int width); /* memory allocation support */ void *safe_calloc(size_t number, size_t size); void *safe_realloc(void *ptr, size_t size); /* string comparison functions used for historical compatibility */ int _substrcmp(const char *str1, const char* str2); int _substrcmp2(const char *str1, const char* str2, const char* str3); /* utility functions */ int match_token(struct _s_x *table, char *string); char const *match_value(struct _s_x *p, int value); int do_cmd(int optname, void *optval, uintptr_t optlen); struct in6_addr; void n2mask(struct in6_addr *mask, int n); int contigmask(uint8_t *p, int len); /* * Forward declarations to avoid include way too many headers. * C does not allow duplicated typedefs, so we use the base struct * that the typedef points to. * Should the typedefs use a different type, the compiler will * still detect the change when compiling the body of the * functions involved, so we do not lose error checking. */ struct _ipfw_insn; struct _ipfw_insn_altq; struct _ipfw_insn_u32; struct _ipfw_insn_ip6; struct _ipfw_insn_icmp6; /* * The reserved set numer. This is a constant in ip_fw.h * but we store it in a variable so other files do not depend * in that header just for one constant. */ extern int resvd_set_number; /* first-level command handlers */ void ipfw_add(char *av[]); void ipfw_show_nat(int ac, char **av); void ipfw_config_pipe(int ac, char **av); void ipfw_config_nat(int ac, char **av); void ipfw_sets_handler(char *av[]); void ipfw_table_handler(int ac, char *av[]); void ipfw_sysctl_handler(char *av[], int which); void ipfw_delete(char *av[]); void ipfw_flush(int force); void ipfw_zero(int ac, char *av[], int optname); void ipfw_list(int ac, char *av[], int show_counters); +#ifdef PF /* altq.c */ void altq_set_enabled(int enabled); u_int32_t altq_name_to_qid(const char *name); - void print_altq_cmd(struct _ipfw_insn_altq *altqptr); +#else +#define NO_ALTQ +#endif /* dummynet.c */ void dummynet_list(int ac, char *av[], int show_counters); void dummynet_flush(void); int ipfw_delete_pipe(int pipe_or_queue, int n); /* ipv6.c */ void print_unreach6_code(uint16_t code); void print_ip6(struct _ipfw_insn_ip6 *cmd, char const *s); void print_flow6id(struct _ipfw_insn_u32 *cmd); void print_icmp6types(struct _ipfw_insn_u32 *cmd); void print_ext6hdr(struct _ipfw_insn *cmd ); struct _ipfw_insn *add_srcip6(struct _ipfw_insn *cmd, char *av, int cblen); struct _ipfw_insn *add_dstip6(struct _ipfw_insn *cmd, char *av, int cblen); void fill_flow6(struct _ipfw_insn_u32 *cmd, char *av, int cblen); void fill_unreach6_code(u_short *codep, char *str); void fill_icmp6types(struct _ipfw_insn_icmp6 *cmd, char *av, int cblen); int fill_ext6hdr(struct _ipfw_insn *cmd, char *av); Index: stable/10/sys/contrib/altq/altq/altq_cbq.c =================================================================== --- stable/10/sys/contrib/altq/altq/altq_cbq.c (revision 263085) +++ stable/10/sys/contrib/altq/altq/altq_cbq.c (revision 263086) @@ -1,1170 +1,1173 @@ /* $FreeBSD$ */ /* $KAME: altq_cbq.c,v 1.19 2003/09/17 14:23:25 kjc Exp $ */ /* * Copyright (c) Sun Microsystems, Inc. 1993-1998 All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the SMCC Technology * Development Group at Sun Microsystems, Inc. * * 4. The name of the Sun Microsystems, Inc nor may not be used to endorse or * promote products derived from this software without specific prior * written permission. * * SUN MICROSYSTEMS DOES NOT CLAIM MERCHANTABILITY OF THIS SOFTWARE OR THE * SUITABILITY OF THIS SOFTWARE FOR ANY PARTICULAR PURPOSE. The software is * provided "as is" without express or implied warranty of any kind. * * These notices must be retained in any copies of any part of this software. */ #if defined(__FreeBSD__) || defined(__NetBSD__) #include "opt_altq.h" #include "opt_inet.h" #ifdef __FreeBSD__ #include "opt_inet6.h" #endif #endif /* __FreeBSD__ || __NetBSD__ */ #ifdef ALTQ_CBQ /* cbq is enabled by ALTQ_CBQ option in opt_altq.h */ #include #include #include #include #include #include #include #include #ifdef ALTQ3_COMPAT #include #include #endif #include +#include #include -#include +#include +#include +#include #include #include #ifdef ALTQ3_COMPAT #include #endif #ifdef ALTQ3_COMPAT /* * Local Data structures. */ static cbq_state_t *cbq_list = NULL; #endif /* * Forward Declarations. */ static int cbq_class_destroy(cbq_state_t *, struct rm_class *); static struct rm_class *clh_to_clp(cbq_state_t *, u_int32_t); static int cbq_clear_interface(cbq_state_t *); static int cbq_request(struct ifaltq *, int, void *); static int cbq_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *); static struct mbuf *cbq_dequeue(struct ifaltq *, int); static void cbqrestart(struct ifaltq *); static void get_class_stats(class_stats_t *, struct rm_class *); static void cbq_purge(cbq_state_t *); #ifdef ALTQ3_COMPAT static int cbq_add_class(struct cbq_add_class *); static int cbq_delete_class(struct cbq_delete_class *); static int cbq_modify_class(struct cbq_modify_class *); static int cbq_class_create(cbq_state_t *, struct cbq_add_class *, struct rm_class *, struct rm_class *); static int cbq_clear_hierarchy(struct cbq_interface *); static int cbq_set_enable(struct cbq_interface *, int); static int cbq_ifattach(struct cbq_interface *); static int cbq_ifdetach(struct cbq_interface *); static int cbq_getstats(struct cbq_getstats *); static int cbq_add_filter(struct cbq_add_filter *); static int cbq_delete_filter(struct cbq_delete_filter *); #endif /* ALTQ3_COMPAT */ /* * int * cbq_class_destroy(cbq_mod_state_t *, struct rm_class *) - This * function destroys a given traffic class. Before destroying * the class, all traffic for that class is released. */ static int cbq_class_destroy(cbq_state_t *cbqp, struct rm_class *cl) { int i; /* delete the class */ rmc_delete_class(&cbqp->ifnp, cl); /* * free the class handle */ for (i = 0; i < CBQ_MAX_CLASSES; i++) if (cbqp->cbq_class_tbl[i] == cl) cbqp->cbq_class_tbl[i] = NULL; if (cl == cbqp->ifnp.root_) cbqp->ifnp.root_ = NULL; if (cl == cbqp->ifnp.default_) cbqp->ifnp.default_ = NULL; #ifdef ALTQ3_COMPAT if (cl == cbqp->ifnp.ctl_) cbqp->ifnp.ctl_ = NULL; #endif return (0); } /* convert class handle to class pointer */ static struct rm_class * clh_to_clp(cbq_state_t *cbqp, u_int32_t chandle) { int i; struct rm_class *cl; if (chandle == 0) return (NULL); /* * first, try optimistically the slot matching the lower bits of * the handle. if it fails, do the linear table search. */ i = chandle % CBQ_MAX_CLASSES; if ((cl = cbqp->cbq_class_tbl[i]) != NULL && cl->stats_.handle == chandle) return (cl); for (i = 0; i < CBQ_MAX_CLASSES; i++) if ((cl = cbqp->cbq_class_tbl[i]) != NULL && cl->stats_.handle == chandle) return (cl); return (NULL); } static int cbq_clear_interface(cbq_state_t *cbqp) { int again, i; struct rm_class *cl; #ifdef ALTQ3_CLFIER_COMPAT /* free the filters for this interface */ acc_discard_filters(&cbqp->cbq_classifier, NULL, 1); #endif /* clear out the classes now */ do { again = 0; for (i = 0; i < CBQ_MAX_CLASSES; i++) { if ((cl = cbqp->cbq_class_tbl[i]) != NULL) { if (is_a_parent_class(cl)) again++; else { cbq_class_destroy(cbqp, cl); cbqp->cbq_class_tbl[i] = NULL; if (cl == cbqp->ifnp.root_) cbqp->ifnp.root_ = NULL; if (cl == cbqp->ifnp.default_) cbqp->ifnp.default_ = NULL; #ifdef ALTQ3_COMPAT if (cl == cbqp->ifnp.ctl_) cbqp->ifnp.ctl_ = NULL; #endif } } } } while (again); return (0); } static int cbq_request(struct ifaltq *ifq, int req, void *arg) { cbq_state_t *cbqp = (cbq_state_t *)ifq->altq_disc; IFQ_LOCK_ASSERT(ifq); switch (req) { case ALTRQ_PURGE: cbq_purge(cbqp); break; } return (0); } /* copy the stats info in rm_class to class_states_t */ static void get_class_stats(class_stats_t *statsp, struct rm_class *cl) { statsp->xmit_cnt = cl->stats_.xmit_cnt; statsp->drop_cnt = cl->stats_.drop_cnt; statsp->over = cl->stats_.over; statsp->borrows = cl->stats_.borrows; statsp->overactions = cl->stats_.overactions; statsp->delays = cl->stats_.delays; statsp->depth = cl->depth_; statsp->priority = cl->pri_; statsp->maxidle = cl->maxidle_; statsp->minidle = cl->minidle_; statsp->offtime = cl->offtime_; statsp->qmax = qlimit(cl->q_); statsp->ns_per_byte = cl->ns_per_byte_; statsp->wrr_allot = cl->w_allotment_; statsp->qcnt = qlen(cl->q_); statsp->avgidle = cl->avgidle_; statsp->qtype = qtype(cl->q_); #ifdef ALTQ_RED if (q_is_red(cl->q_)) red_getstats(cl->red_, &statsp->red[0]); #endif #ifdef ALTQ_RIO if (q_is_rio(cl->q_)) rio_getstats((rio_t *)cl->red_, &statsp->red[0]); #endif } int cbq_pfattach(struct pf_altq *a) { struct ifnet *ifp; int s, error; if ((ifp = ifunit(a->ifname)) == NULL || a->altq_disc == NULL) return (EINVAL); #ifdef __NetBSD__ s = splnet(); #else s = splimp(); #endif error = altq_attach(&ifp->if_snd, ALTQT_CBQ, a->altq_disc, cbq_enqueue, cbq_dequeue, cbq_request, NULL, NULL); splx(s); return (error); } int cbq_add_altq(struct pf_altq *a) { cbq_state_t *cbqp; struct ifnet *ifp; if ((ifp = ifunit(a->ifname)) == NULL) return (EINVAL); if (!ALTQ_IS_READY(&ifp->if_snd)) return (ENODEV); /* allocate and initialize cbq_state_t */ cbqp = malloc(sizeof(cbq_state_t), M_DEVBUF, M_NOWAIT | M_ZERO); if (cbqp == NULL) return (ENOMEM); CALLOUT_INIT(&cbqp->cbq_callout); cbqp->cbq_qlen = 0; cbqp->ifnp.ifq_ = &ifp->if_snd; /* keep the ifq */ /* keep the state in pf_altq */ a->altq_disc = cbqp; return (0); } int cbq_remove_altq(struct pf_altq *a) { cbq_state_t *cbqp; if ((cbqp = a->altq_disc) == NULL) return (EINVAL); a->altq_disc = NULL; cbq_clear_interface(cbqp); if (cbqp->ifnp.default_) cbq_class_destroy(cbqp, cbqp->ifnp.default_); if (cbqp->ifnp.root_) cbq_class_destroy(cbqp, cbqp->ifnp.root_); /* deallocate cbq_state_t */ free(cbqp, M_DEVBUF); return (0); } int cbq_add_queue(struct pf_altq *a) { struct rm_class *borrow, *parent; cbq_state_t *cbqp; struct rm_class *cl; struct cbq_opts *opts; int i; if ((cbqp = a->altq_disc) == NULL) return (EINVAL); if (a->qid == 0) return (EINVAL); /* * find a free slot in the class table. if the slot matching * the lower bits of qid is free, use this slot. otherwise, * use the first free slot. */ i = a->qid % CBQ_MAX_CLASSES; if (cbqp->cbq_class_tbl[i] != NULL) { for (i = 0; i < CBQ_MAX_CLASSES; i++) if (cbqp->cbq_class_tbl[i] == NULL) break; if (i == CBQ_MAX_CLASSES) return (EINVAL); } opts = &a->pq_u.cbq_opts; /* check parameters */ if (a->priority >= CBQ_MAXPRI) return (EINVAL); /* Get pointers to parent and borrow classes. */ parent = clh_to_clp(cbqp, a->parent_qid); if (opts->flags & CBQCLF_BORROW) borrow = parent; else borrow = NULL; /* * A class must borrow from it's parent or it can not * borrow at all. Hence, borrow can be null. */ if (parent == NULL && (opts->flags & CBQCLF_ROOTCLASS) == 0) { printf("cbq_add_queue: no parent class!\n"); return (EINVAL); } if ((borrow != parent) && (borrow != NULL)) { printf("cbq_add_class: borrow class != parent\n"); return (EINVAL); } /* * check parameters */ switch (opts->flags & CBQCLF_CLASSMASK) { case CBQCLF_ROOTCLASS: if (parent != NULL) return (EINVAL); if (cbqp->ifnp.root_) return (EINVAL); break; case CBQCLF_DEFCLASS: if (cbqp->ifnp.default_) return (EINVAL); break; case 0: if (a->qid == 0) return (EINVAL); break; default: /* more than two flags bits set */ return (EINVAL); } /* * create a class. if this is a root class, initialize the * interface. */ if ((opts->flags & CBQCLF_CLASSMASK) == CBQCLF_ROOTCLASS) { rmc_init(cbqp->ifnp.ifq_, &cbqp->ifnp, opts->ns_per_byte, cbqrestart, a->qlimit, RM_MAXQUEUED, opts->maxidle, opts->minidle, opts->offtime, opts->flags); cl = cbqp->ifnp.root_; } else { cl = rmc_newclass(a->priority, &cbqp->ifnp, opts->ns_per_byte, rmc_delay_action, a->qlimit, parent, borrow, opts->maxidle, opts->minidle, opts->offtime, opts->pktsize, opts->flags); } if (cl == NULL) return (ENOMEM); /* return handle to user space. */ cl->stats_.handle = a->qid; cl->stats_.depth = cl->depth_; /* save the allocated class */ cbqp->cbq_class_tbl[i] = cl; if ((opts->flags & CBQCLF_CLASSMASK) == CBQCLF_DEFCLASS) cbqp->ifnp.default_ = cl; return (0); } int cbq_remove_queue(struct pf_altq *a) { struct rm_class *cl; cbq_state_t *cbqp; int i; if ((cbqp = a->altq_disc) == NULL) return (EINVAL); if ((cl = clh_to_clp(cbqp, a->qid)) == NULL) return (EINVAL); /* if we are a parent class, then return an error. */ if (is_a_parent_class(cl)) return (EINVAL); /* delete the class */ rmc_delete_class(&cbqp->ifnp, cl); /* * free the class handle */ for (i = 0; i < CBQ_MAX_CLASSES; i++) if (cbqp->cbq_class_tbl[i] == cl) { cbqp->cbq_class_tbl[i] = NULL; if (cl == cbqp->ifnp.root_) cbqp->ifnp.root_ = NULL; if (cl == cbqp->ifnp.default_) cbqp->ifnp.default_ = NULL; break; } return (0); } int cbq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes) { cbq_state_t *cbqp; struct rm_class *cl; class_stats_t stats; int error = 0; if ((cbqp = altq_lookup(a->ifname, ALTQT_CBQ)) == NULL) return (EBADF); if ((cl = clh_to_clp(cbqp, a->qid)) == NULL) return (EINVAL); if (*nbytes < sizeof(stats)) return (EINVAL); get_class_stats(&stats, cl); if ((error = copyout((caddr_t)&stats, ubuf, sizeof(stats))) != 0) return (error); *nbytes = sizeof(stats); return (0); } /* * int * cbq_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pattr) * - Queue data packets. * * cbq_enqueue is set to ifp->if_altqenqueue and called by an upper * layer (e.g. ether_output). cbq_enqueue queues the given packet * to the cbq, then invokes the driver's start routine. * * Assumptions: called in splimp * Returns: 0 if the queueing is successful. * ENOBUFS if a packet dropping occurred as a result of * the queueing. */ static int cbq_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr) { cbq_state_t *cbqp = (cbq_state_t *)ifq->altq_disc; struct rm_class *cl; struct pf_mtag *t; int len; IFQ_LOCK_ASSERT(ifq); /* grab class set by classifier */ if ((m->m_flags & M_PKTHDR) == 0) { /* should not happen */ printf("altq: packet for %s does not have pkthdr\n", ifq->altq_ifp->if_xname); m_freem(m); return (ENOBUFS); } cl = NULL; if ((t = pf_find_mtag(m)) != NULL) cl = clh_to_clp(cbqp, t->qid); #ifdef ALTQ3_COMPAT else if ((ifq->altq_flags & ALTQF_CLASSIFY) && pktattr != NULL) cl = pktattr->pattr_class; #endif if (cl == NULL) { cl = cbqp->ifnp.default_; if (cl == NULL) { m_freem(m); return (ENOBUFS); } } #ifdef ALTQ3_COMPAT if (pktattr != NULL) cl->pktattr_ = pktattr; /* save proto hdr used by ECN */ else #endif cl->pktattr_ = NULL; len = m_pktlen(m); if (rmc_queue_packet(cl, m) != 0) { /* drop occurred. some mbuf was freed in rmc_queue_packet. */ PKTCNTR_ADD(&cl->stats_.drop_cnt, len); return (ENOBUFS); } /* successfully queued. */ ++cbqp->cbq_qlen; IFQ_INC_LEN(ifq); return (0); } static struct mbuf * cbq_dequeue(struct ifaltq *ifq, int op) { cbq_state_t *cbqp = (cbq_state_t *)ifq->altq_disc; struct mbuf *m; IFQ_LOCK_ASSERT(ifq); m = rmc_dequeue_next(&cbqp->ifnp, op); if (m && op == ALTDQ_REMOVE) { --cbqp->cbq_qlen; /* decrement # of packets in cbq */ IFQ_DEC_LEN(ifq); /* Update the class. */ rmc_update_class_util(&cbqp->ifnp); } return (m); } /* * void * cbqrestart(queue_t *) - Restart sending of data. * called from rmc_restart in splimp via timeout after waking up * a suspended class. * Returns: NONE */ static void cbqrestart(struct ifaltq *ifq) { cbq_state_t *cbqp; struct ifnet *ifp; IFQ_LOCK_ASSERT(ifq); if (!ALTQ_IS_ENABLED(ifq)) /* cbq must have been detached */ return; if ((cbqp = (cbq_state_t *)ifq->altq_disc) == NULL) /* should not happen */ return; ifp = ifq->altq_ifp; if (ifp->if_start && cbqp->cbq_qlen > 0 && (ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) { IFQ_UNLOCK(ifq); (*ifp->if_start)(ifp); IFQ_LOCK(ifq); } } static void cbq_purge(cbq_state_t *cbqp) { struct rm_class *cl; int i; for (i = 0; i < CBQ_MAX_CLASSES; i++) if ((cl = cbqp->cbq_class_tbl[i]) != NULL) rmc_dropall(cl); if (ALTQ_IS_ENABLED(cbqp->ifnp.ifq_)) cbqp->ifnp.ifq_->ifq_len = 0; } #ifdef ALTQ3_COMPAT static int cbq_add_class(acp) struct cbq_add_class *acp; { char *ifacename; struct rm_class *borrow, *parent; cbq_state_t *cbqp; ifacename = acp->cbq_iface.cbq_ifacename; if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) return (EBADF); /* check parameters */ if (acp->cbq_class.priority >= CBQ_MAXPRI || acp->cbq_class.maxq > CBQ_MAXQSIZE) return (EINVAL); /* Get pointers to parent and borrow classes. */ parent = clh_to_clp(cbqp, acp->cbq_class.parent_class_handle); borrow = clh_to_clp(cbqp, acp->cbq_class.borrow_class_handle); /* * A class must borrow from it's parent or it can not * borrow at all. Hence, borrow can be null. */ if (parent == NULL && (acp->cbq_class.flags & CBQCLF_ROOTCLASS) == 0) { printf("cbq_add_class: no parent class!\n"); return (EINVAL); } if ((borrow != parent) && (borrow != NULL)) { printf("cbq_add_class: borrow class != parent\n"); return (EINVAL); } return cbq_class_create(cbqp, acp, parent, borrow); } static int cbq_delete_class(dcp) struct cbq_delete_class *dcp; { char *ifacename; struct rm_class *cl; cbq_state_t *cbqp; ifacename = dcp->cbq_iface.cbq_ifacename; if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) return (EBADF); if ((cl = clh_to_clp(cbqp, dcp->cbq_class_handle)) == NULL) return (EINVAL); /* if we are a parent class, then return an error. */ if (is_a_parent_class(cl)) return (EINVAL); /* if a filter has a reference to this class delete the filter */ acc_discard_filters(&cbqp->cbq_classifier, cl, 0); return cbq_class_destroy(cbqp, cl); } static int cbq_modify_class(acp) struct cbq_modify_class *acp; { char *ifacename; struct rm_class *cl; cbq_state_t *cbqp; ifacename = acp->cbq_iface.cbq_ifacename; if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) return (EBADF); /* Get pointer to this class */ if ((cl = clh_to_clp(cbqp, acp->cbq_class_handle)) == NULL) return (EINVAL); if (rmc_modclass(cl, acp->cbq_class.nano_sec_per_byte, acp->cbq_class.maxq, acp->cbq_class.maxidle, acp->cbq_class.minidle, acp->cbq_class.offtime, acp->cbq_class.pktsize) < 0) return (EINVAL); return (0); } /* * struct rm_class * * cbq_class_create(cbq_mod_state_t *cbqp, struct cbq_add_class *acp, * struct rm_class *parent, struct rm_class *borrow) * * This function create a new traffic class in the CBQ class hierarchy of * given paramters. The class that created is either the root, default, * or a new dynamic class. If CBQ is not initilaized, the the root class * will be created. */ static int cbq_class_create(cbqp, acp, parent, borrow) cbq_state_t *cbqp; struct cbq_add_class *acp; struct rm_class *parent, *borrow; { struct rm_class *cl; cbq_class_spec_t *spec = &acp->cbq_class; u_int32_t chandle; int i; /* * allocate class handle */ for (i = 1; i < CBQ_MAX_CLASSES; i++) if (cbqp->cbq_class_tbl[i] == NULL) break; if (i == CBQ_MAX_CLASSES) return (EINVAL); chandle = i; /* use the slot number as class handle */ /* * create a class. if this is a root class, initialize the * interface. */ if ((spec->flags & CBQCLF_CLASSMASK) == CBQCLF_ROOTCLASS) { rmc_init(cbqp->ifnp.ifq_, &cbqp->ifnp, spec->nano_sec_per_byte, cbqrestart, spec->maxq, RM_MAXQUEUED, spec->maxidle, spec->minidle, spec->offtime, spec->flags); cl = cbqp->ifnp.root_; } else { cl = rmc_newclass(spec->priority, &cbqp->ifnp, spec->nano_sec_per_byte, rmc_delay_action, spec->maxq, parent, borrow, spec->maxidle, spec->minidle, spec->offtime, spec->pktsize, spec->flags); } if (cl == NULL) return (ENOMEM); /* return handle to user space. */ acp->cbq_class_handle = chandle; cl->stats_.handle = chandle; cl->stats_.depth = cl->depth_; /* save the allocated class */ cbqp->cbq_class_tbl[i] = cl; if ((spec->flags & CBQCLF_CLASSMASK) == CBQCLF_DEFCLASS) cbqp->ifnp.default_ = cl; if ((spec->flags & CBQCLF_CLASSMASK) == CBQCLF_CTLCLASS) cbqp->ifnp.ctl_ = cl; return (0); } static int cbq_add_filter(afp) struct cbq_add_filter *afp; { char *ifacename; cbq_state_t *cbqp; struct rm_class *cl; ifacename = afp->cbq_iface.cbq_ifacename; if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) return (EBADF); /* Get the pointer to class. */ if ((cl = clh_to_clp(cbqp, afp->cbq_class_handle)) == NULL) return (EINVAL); return acc_add_filter(&cbqp->cbq_classifier, &afp->cbq_filter, cl, &afp->cbq_filter_handle); } static int cbq_delete_filter(dfp) struct cbq_delete_filter *dfp; { char *ifacename; cbq_state_t *cbqp; ifacename = dfp->cbq_iface.cbq_ifacename; if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) return (EBADF); return acc_delete_filter(&cbqp->cbq_classifier, dfp->cbq_filter_handle); } /* * cbq_clear_hierarchy deletes all classes and their filters on the * given interface. */ static int cbq_clear_hierarchy(ifacep) struct cbq_interface *ifacep; { char *ifacename; cbq_state_t *cbqp; ifacename = ifacep->cbq_ifacename; if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) return (EBADF); return cbq_clear_interface(cbqp); } /* * static int * cbq_set_enable(struct cbq_enable *ep) - this function processed the * ioctl request to enable class based queueing. It searches the list * of interfaces for the specified interface and then enables CBQ on * that interface. * * Returns: 0, for no error. * EBADF, for specified inteface not found. */ static int cbq_set_enable(ep, enable) struct cbq_interface *ep; int enable; { int error = 0; cbq_state_t *cbqp; char *ifacename; ifacename = ep->cbq_ifacename; if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) return (EBADF); switch (enable) { case ENABLE: if (cbqp->ifnp.root_ == NULL || cbqp->ifnp.default_ == NULL || cbqp->ifnp.ctl_ == NULL) { if (cbqp->ifnp.root_ == NULL) printf("No Root Class for %s\n", ifacename); if (cbqp->ifnp.default_ == NULL) printf("No Default Class for %s\n", ifacename); if (cbqp->ifnp.ctl_ == NULL) printf("No Control Class for %s\n", ifacename); error = EINVAL; } else if ((error = altq_enable(cbqp->ifnp.ifq_)) == 0) { cbqp->cbq_qlen = 0; } break; case DISABLE: error = altq_disable(cbqp->ifnp.ifq_); break; } return (error); } static int cbq_getstats(gsp) struct cbq_getstats *gsp; { char *ifacename; int i, n, nclasses; cbq_state_t *cbqp; struct rm_class *cl; class_stats_t stats, *usp; int error = 0; ifacename = gsp->iface.cbq_ifacename; nclasses = gsp->nclasses; usp = gsp->stats; if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) return (EBADF); if (nclasses <= 0) return (EINVAL); for (n = 0, i = 0; n < nclasses && i < CBQ_MAX_CLASSES; n++, i++) { while ((cl = cbqp->cbq_class_tbl[i]) == NULL) if (++i >= CBQ_MAX_CLASSES) goto out; get_class_stats(&stats, cl); stats.handle = cl->stats_.handle; if ((error = copyout((caddr_t)&stats, (caddr_t)usp++, sizeof(stats))) != 0) return (error); } out: gsp->nclasses = n; return (error); } static int cbq_ifattach(ifacep) struct cbq_interface *ifacep; { int error = 0; char *ifacename; cbq_state_t *new_cbqp; struct ifnet *ifp; ifacename = ifacep->cbq_ifacename; if ((ifp = ifunit(ifacename)) == NULL) return (ENXIO); if (!ALTQ_IS_READY(&ifp->if_snd)) return (ENXIO); /* allocate and initialize cbq_state_t */ new_cbqp = malloc(sizeof(cbq_state_t), M_DEVBUF, M_WAITOK); if (new_cbqp == NULL) return (ENOMEM); bzero(new_cbqp, sizeof(cbq_state_t)); CALLOUT_INIT(&new_cbqp->cbq_callout); new_cbqp->cbq_qlen = 0; new_cbqp->ifnp.ifq_ = &ifp->if_snd; /* keep the ifq */ /* * set CBQ to this ifnet structure. */ error = altq_attach(&ifp->if_snd, ALTQT_CBQ, new_cbqp, cbq_enqueue, cbq_dequeue, cbq_request, &new_cbqp->cbq_classifier, acc_classify); if (error) { free(new_cbqp, M_DEVBUF); return (error); } /* prepend to the list of cbq_state_t's. */ new_cbqp->cbq_next = cbq_list; cbq_list = new_cbqp; return (0); } static int cbq_ifdetach(ifacep) struct cbq_interface *ifacep; { char *ifacename; cbq_state_t *cbqp; ifacename = ifacep->cbq_ifacename; if ((cbqp = altq_lookup(ifacename, ALTQT_CBQ)) == NULL) return (EBADF); (void)cbq_set_enable(ifacep, DISABLE); cbq_clear_interface(cbqp); /* remove CBQ from the ifnet structure. */ (void)altq_detach(cbqp->ifnp.ifq_); /* remove from the list of cbq_state_t's. */ if (cbq_list == cbqp) cbq_list = cbqp->cbq_next; else { cbq_state_t *cp; for (cp = cbq_list; cp != NULL; cp = cp->cbq_next) if (cp->cbq_next == cbqp) { cp->cbq_next = cbqp->cbq_next; break; } ASSERT(cp != NULL); } /* deallocate cbq_state_t */ free(cbqp, M_DEVBUF); return (0); } /* * cbq device interface */ altqdev_decl(cbq); int cbqopen(dev, flag, fmt, p) dev_t dev; int flag, fmt; #if (__FreeBSD_version > 500000) struct thread *p; #else struct proc *p; #endif { return (0); } int cbqclose(dev, flag, fmt, p) dev_t dev; int flag, fmt; #if (__FreeBSD_version > 500000) struct thread *p; #else struct proc *p; #endif { struct ifnet *ifp; struct cbq_interface iface; int err, error = 0; while (cbq_list) { ifp = cbq_list->ifnp.ifq_->altq_ifp; sprintf(iface.cbq_ifacename, "%s", ifp->if_xname); err = cbq_ifdetach(&iface); if (err != 0 && error == 0) error = err; } return (error); } int cbqioctl(dev, cmd, addr, flag, p) dev_t dev; ioctlcmd_t cmd; caddr_t addr; int flag; #if (__FreeBSD_version > 500000) struct thread *p; #else struct proc *p; #endif { int error = 0; /* check cmd for superuser only */ switch (cmd) { case CBQ_GETSTATS: /* currently only command that an ordinary user can call */ break; default: #if (__FreeBSD_version > 700000) error = priv_check(p, PRIV_ALTQ_MANAGE); #elsif (__FreeBSD_version > 400000) error = suser(p); #else error = suser(p->p_ucred, &p->p_acflag); #endif if (error) return (error); break; } switch (cmd) { case CBQ_ENABLE: error = cbq_set_enable((struct cbq_interface *)addr, ENABLE); break; case CBQ_DISABLE: error = cbq_set_enable((struct cbq_interface *)addr, DISABLE); break; case CBQ_ADD_FILTER: error = cbq_add_filter((struct cbq_add_filter *)addr); break; case CBQ_DEL_FILTER: error = cbq_delete_filter((struct cbq_delete_filter *)addr); break; case CBQ_ADD_CLASS: error = cbq_add_class((struct cbq_add_class *)addr); break; case CBQ_DEL_CLASS: error = cbq_delete_class((struct cbq_delete_class *)addr); break; case CBQ_MODIFY_CLASS: error = cbq_modify_class((struct cbq_modify_class *)addr); break; case CBQ_CLEAR_HIERARCHY: error = cbq_clear_hierarchy((struct cbq_interface *)addr); break; case CBQ_IF_ATTACH: error = cbq_ifattach((struct cbq_interface *)addr); break; case CBQ_IF_DETACH: error = cbq_ifdetach((struct cbq_interface *)addr); break; case CBQ_GETSTATS: error = cbq_getstats((struct cbq_getstats *)addr); break; default: error = EINVAL; break; } return error; } #if 0 /* for debug */ static void cbq_class_dump(int); static void cbq_class_dump(i) int i; { struct rm_class *cl; rm_class_stats_t *s; struct _class_queue_ *q; if (cbq_list == NULL) { printf("cbq_class_dump: no cbq_state found\n"); return; } cl = cbq_list->cbq_class_tbl[i]; printf("class %d cl=%p\n", i, cl); if (cl != NULL) { s = &cl->stats_; q = cl->q_; printf("pri=%d, depth=%d, maxrate=%d, allotment=%d\n", cl->pri_, cl->depth_, cl->maxrate_, cl->allotment_); printf("w_allotment=%d, bytes_alloc=%d, avgidle=%d, maxidle=%d\n", cl->w_allotment_, cl->bytes_alloc_, cl->avgidle_, cl->maxidle_); printf("minidle=%d, offtime=%d, sleeping=%d, leaf=%d\n", cl->minidle_, cl->offtime_, cl->sleeping_, cl->leaf_); printf("handle=%d, depth=%d, packets=%d, bytes=%d\n", s->handle, s->depth, (int)s->xmit_cnt.packets, (int)s->xmit_cnt.bytes); printf("over=%d\n, borrows=%d, drops=%d, overactions=%d, delays=%d\n", s->over, s->borrows, (int)s->drop_cnt.packets, s->overactions, s->delays); printf("tail=%p, head=%p, qlen=%d, qlim=%d, qthresh=%d,qtype=%d\n", q->tail_, q->head_, q->qlen_, q->qlim_, q->qthresh_, q->qtype_); } } #endif /* 0 */ #ifdef KLD_MODULE static struct altqsw cbq_sw = {"cbq", cbqopen, cbqclose, cbqioctl}; ALTQ_MODULE(altq_cbq, ALTQT_CBQ, &cbq_sw); MODULE_DEPEND(altq_cbq, altq_red, 1, 1, 1); MODULE_DEPEND(altq_cbq, altq_rio, 1, 1, 1); #endif /* KLD_MODULE */ #endif /* ALTQ3_COMPAT */ #endif /* ALTQ_CBQ */ Index: stable/10/sys/contrib/altq/altq/altq_cdnr.c =================================================================== --- stable/10/sys/contrib/altq/altq/altq_cdnr.c (revision 263085) +++ stable/10/sys/contrib/altq/altq/altq_cdnr.c (revision 263086) @@ -1,1389 +1,1390 @@ /* $FreeBSD$ */ /* $KAME: altq_cdnr.c,v 1.15 2005/04/13 03:44:24 suz Exp $ */ /* * Copyright (C) 1999-2002 * Sony Computer Science Laboratories Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #if defined(__FreeBSD__) || defined(__NetBSD__) #include "opt_altq.h" #include "opt_inet.h" #ifdef __FreeBSD__ #include "opt_inet6.h" #endif #endif /* __FreeBSD__ || __NetBSD__ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif +#include #include #ifdef ALTQ3_COMPAT #include #endif #include #ifdef ALTQ3_COMPAT /* * diffserv traffic conditioning module */ int altq_cdnr_enabled = 0; /* traffic conditioner is enabled by ALTQ_CDNR option in opt_altq.h */ #ifdef ALTQ_CDNR /* cdnr_list keeps all cdnr's allocated. */ static LIST_HEAD(, top_cdnr) tcb_list; static int altq_cdnr_input(struct mbuf *, int); static struct top_cdnr *tcb_lookup(char *ifname); static struct cdnr_block *cdnr_handle2cb(u_long); static u_long cdnr_cb2handle(struct cdnr_block *); static void *cdnr_cballoc(struct top_cdnr *, int, struct tc_action *(*)(struct cdnr_block *, struct cdnr_pktinfo *)); static void cdnr_cbdestroy(void *); static int tca_verify_action(struct tc_action *); static void tca_import_action(struct tc_action *, struct tc_action *); static void tca_invalidate_action(struct tc_action *); static int generic_element_destroy(struct cdnr_block *); static struct top_cdnr *top_create(struct ifaltq *); static int top_destroy(struct top_cdnr *); static struct cdnr_block *element_create(struct top_cdnr *, struct tc_action *); static int element_destroy(struct cdnr_block *); static void tb_import_profile(struct tbe *, struct tb_profile *); static struct tbmeter *tbm_create(struct top_cdnr *, struct tb_profile *, struct tc_action *, struct tc_action *); static int tbm_destroy(struct tbmeter *); static struct tc_action *tbm_input(struct cdnr_block *, struct cdnr_pktinfo *); static struct trtcm *trtcm_create(struct top_cdnr *, struct tb_profile *, struct tb_profile *, struct tc_action *, struct tc_action *, struct tc_action *, int); static int trtcm_destroy(struct trtcm *); static struct tc_action *trtcm_input(struct cdnr_block *, struct cdnr_pktinfo *); static struct tswtcm *tswtcm_create(struct top_cdnr *, u_int32_t, u_int32_t, u_int32_t, struct tc_action *, struct tc_action *, struct tc_action *); static int tswtcm_destroy(struct tswtcm *); static struct tc_action *tswtcm_input(struct cdnr_block *, struct cdnr_pktinfo *); static int cdnrcmd_if_attach(char *); static int cdnrcmd_if_detach(char *); static int cdnrcmd_add_element(struct cdnr_add_element *); static int cdnrcmd_delete_element(struct cdnr_delete_element *); static int cdnrcmd_add_filter(struct cdnr_add_filter *); static int cdnrcmd_delete_filter(struct cdnr_delete_filter *); static int cdnrcmd_add_tbm(struct cdnr_add_tbmeter *); static int cdnrcmd_modify_tbm(struct cdnr_modify_tbmeter *); static int cdnrcmd_tbm_stats(struct cdnr_tbmeter_stats *); static int cdnrcmd_add_trtcm(struct cdnr_add_trtcm *); static int cdnrcmd_modify_trtcm(struct cdnr_modify_trtcm *); static int cdnrcmd_tcm_stats(struct cdnr_tcm_stats *); static int cdnrcmd_add_tswtcm(struct cdnr_add_tswtcm *); static int cdnrcmd_modify_tswtcm(struct cdnr_modify_tswtcm *); static int cdnrcmd_get_stats(struct cdnr_get_stats *); altqdev_decl(cdnr); /* * top level input function called from ip_input. * should be called before converting header fields to host-byte-order. */ int altq_cdnr_input(m, af) struct mbuf *m; int af; /* address family */ { struct ifnet *ifp; struct ip *ip; struct top_cdnr *top; struct tc_action *tca; struct cdnr_block *cb; struct cdnr_pktinfo pktinfo; ifp = m->m_pkthdr.rcvif; if (!ALTQ_IS_CNDTNING(&ifp->if_snd)) /* traffic conditioner is not enabled on this interface */ return (1); top = ifp->if_snd.altq_cdnr; ip = mtod(m, struct ip *); #ifdef INET6 if (af == AF_INET6) { u_int32_t flowlabel; flowlabel = ((struct ip6_hdr *)ip)->ip6_flow; pktinfo.pkt_dscp = (ntohl(flowlabel) >> 20) & DSCP_MASK; } else #endif pktinfo.pkt_dscp = ip->ip_tos & DSCP_MASK; pktinfo.pkt_len = m_pktlen(m); tca = NULL; cb = acc_classify(&top->tc_classifier, m, af); if (cb != NULL) tca = &cb->cb_action; if (tca == NULL) tca = &top->tc_block.cb_action; while (1) { PKTCNTR_ADD(&top->tc_cnts[tca->tca_code], pktinfo.pkt_len); switch (tca->tca_code) { case TCACODE_PASS: return (1); case TCACODE_DROP: m_freem(m); return (0); case TCACODE_RETURN: return (0); case TCACODE_MARK: #ifdef INET6 if (af == AF_INET6) { struct ip6_hdr *ip6 = (struct ip6_hdr *)ip; u_int32_t flowlabel; flowlabel = ntohl(ip6->ip6_flow); flowlabel = (tca->tca_dscp << 20) | (flowlabel & ~(DSCP_MASK << 20)); ip6->ip6_flow = htonl(flowlabel); } else #endif ip->ip_tos = tca->tca_dscp | (ip->ip_tos & DSCP_CUMASK); return (1); case TCACODE_NEXT: cb = tca->tca_next; tca = (*cb->cb_input)(cb, &pktinfo); break; case TCACODE_NONE: default: return (1); } } } static struct top_cdnr * tcb_lookup(ifname) char *ifname; { struct top_cdnr *top; struct ifnet *ifp; if ((ifp = ifunit(ifname)) != NULL) LIST_FOREACH(top, &tcb_list, tc_next) if (top->tc_ifq->altq_ifp == ifp) return (top); return (NULL); } static struct cdnr_block * cdnr_handle2cb(handle) u_long handle; { struct cdnr_block *cb; cb = (struct cdnr_block *)handle; if (handle != ALIGN(cb)) return (NULL); if (cb == NULL || cb->cb_handle != handle) return (NULL); return (cb); } static u_long cdnr_cb2handle(cb) struct cdnr_block *cb; { return (cb->cb_handle); } static void * cdnr_cballoc(top, type, input_func) struct top_cdnr *top; int type; struct tc_action *(*input_func)(struct cdnr_block *, struct cdnr_pktinfo *); { struct cdnr_block *cb; int size; switch (type) { case TCETYPE_TOP: size = sizeof(struct top_cdnr); break; case TCETYPE_ELEMENT: size = sizeof(struct cdnr_block); break; case TCETYPE_TBMETER: size = sizeof(struct tbmeter); break; case TCETYPE_TRTCM: size = sizeof(struct trtcm); break; case TCETYPE_TSWTCM: size = sizeof(struct tswtcm); break; default: return (NULL); } cb = malloc(size, M_DEVBUF, M_WAITOK); if (cb == NULL) return (NULL); bzero(cb, size); cb->cb_len = size; cb->cb_type = type; cb->cb_ref = 0; cb->cb_handle = (u_long)cb; if (top == NULL) cb->cb_top = (struct top_cdnr *)cb; else cb->cb_top = top; if (input_func != NULL) { /* * if this cdnr has an action function, * make tc_action to call itself. */ cb->cb_action.tca_code = TCACODE_NEXT; cb->cb_action.tca_next = cb; cb->cb_input = input_func; } else cb->cb_action.tca_code = TCACODE_NONE; /* if this isn't top, register the element to the top level cdnr */ if (top != NULL) LIST_INSERT_HEAD(&top->tc_elements, cb, cb_next); return ((void *)cb); } static void cdnr_cbdestroy(cblock) void *cblock; { struct cdnr_block *cb = cblock; /* delete filters belonging to this cdnr */ acc_discard_filters(&cb->cb_top->tc_classifier, cb, 0); /* remove from the top level cdnr */ if (cb->cb_top != cblock) LIST_REMOVE(cb, cb_next); free(cb, M_DEVBUF); } /* * conditioner common destroy routine */ static int generic_element_destroy(cb) struct cdnr_block *cb; { int error = 0; switch (cb->cb_type) { case TCETYPE_TOP: error = top_destroy((struct top_cdnr *)cb); break; case TCETYPE_ELEMENT: error = element_destroy(cb); break; case TCETYPE_TBMETER: error = tbm_destroy((struct tbmeter *)cb); break; case TCETYPE_TRTCM: error = trtcm_destroy((struct trtcm *)cb); break; case TCETYPE_TSWTCM: error = tswtcm_destroy((struct tswtcm *)cb); break; default: error = EINVAL; } return (error); } static int tca_verify_action(utca) struct tc_action *utca; { switch (utca->tca_code) { case TCACODE_PASS: case TCACODE_DROP: case TCACODE_MARK: /* these are ok */ break; case TCACODE_HANDLE: /* verify handle value */ if (cdnr_handle2cb(utca->tca_handle) == NULL) return (-1); break; case TCACODE_NONE: case TCACODE_RETURN: case TCACODE_NEXT: default: /* should not be passed from a user */ return (-1); } return (0); } static void tca_import_action(ktca, utca) struct tc_action *ktca, *utca; { struct cdnr_block *cb; *ktca = *utca; if (ktca->tca_code == TCACODE_HANDLE) { cb = cdnr_handle2cb(ktca->tca_handle); if (cb == NULL) { ktca->tca_code = TCACODE_NONE; return; } ktca->tca_code = TCACODE_NEXT; ktca->tca_next = cb; cb->cb_ref++; } else if (ktca->tca_code == TCACODE_MARK) { ktca->tca_dscp &= DSCP_MASK; } return; } static void tca_invalidate_action(tca) struct tc_action *tca; { struct cdnr_block *cb; if (tca->tca_code == TCACODE_NEXT) { cb = tca->tca_next; if (cb == NULL) return; cb->cb_ref--; } tca->tca_code = TCACODE_NONE; } /* * top level traffic conditioner */ static struct top_cdnr * top_create(ifq) struct ifaltq *ifq; { struct top_cdnr *top; if ((top = cdnr_cballoc(NULL, TCETYPE_TOP, NULL)) == NULL) return (NULL); top->tc_ifq = ifq; /* set default action for the top level conditioner */ top->tc_block.cb_action.tca_code = TCACODE_PASS; LIST_INSERT_HEAD(&tcb_list, top, tc_next); ifq->altq_cdnr = top; return (top); } static int top_destroy(top) struct top_cdnr *top; { struct cdnr_block *cb; if (ALTQ_IS_CNDTNING(top->tc_ifq)) ALTQ_CLEAR_CNDTNING(top->tc_ifq); top->tc_ifq->altq_cdnr = NULL; /* * destroy all the conditioner elements belonging to this interface */ while ((cb = LIST_FIRST(&top->tc_elements)) != NULL) { while (cb != NULL && cb->cb_ref > 0) cb = LIST_NEXT(cb, cb_next); if (cb != NULL) generic_element_destroy(cb); } LIST_REMOVE(top, tc_next); cdnr_cbdestroy(top); /* if there is no active conditioner, remove the input hook */ if (altq_input != NULL) { LIST_FOREACH(top, &tcb_list, tc_next) if (ALTQ_IS_CNDTNING(top->tc_ifq)) break; if (top == NULL) altq_input = NULL; } return (0); } /* * simple tc elements without input function (e.g., dropper and makers). */ static struct cdnr_block * element_create(top, action) struct top_cdnr *top; struct tc_action *action; { struct cdnr_block *cb; if (tca_verify_action(action) < 0) return (NULL); if ((cb = cdnr_cballoc(top, TCETYPE_ELEMENT, NULL)) == NULL) return (NULL); tca_import_action(&cb->cb_action, action); return (cb); } static int element_destroy(cb) struct cdnr_block *cb; { if (cb->cb_ref > 0) return (EBUSY); tca_invalidate_action(&cb->cb_action); cdnr_cbdestroy(cb); return (0); } /* * internal representation of token bucket parameters * rate: byte_per_unittime << 32 * (((bits_per_sec) / 8) << 32) / machclk_freq * depth: byte << 32 * */ #define TB_SHIFT 32 #define TB_SCALE(x) ((u_int64_t)(x) << TB_SHIFT) #define TB_UNSCALE(x) ((x) >> TB_SHIFT) static void tb_import_profile(tb, profile) struct tbe *tb; struct tb_profile *profile; { tb->rate = TB_SCALE(profile->rate / 8) / machclk_freq; tb->depth = TB_SCALE(profile->depth); if (tb->rate > 0) tb->filluptime = tb->depth / tb->rate; else tb->filluptime = 0xffffffffffffffffLL; tb->token = tb->depth; tb->last = read_machclk(); } /* * simple token bucket meter */ static struct tbmeter * tbm_create(top, profile, in_action, out_action) struct top_cdnr *top; struct tb_profile *profile; struct tc_action *in_action, *out_action; { struct tbmeter *tbm = NULL; if (tca_verify_action(in_action) < 0 || tca_verify_action(out_action) < 0) return (NULL); if ((tbm = cdnr_cballoc(top, TCETYPE_TBMETER, tbm_input)) == NULL) return (NULL); tb_import_profile(&tbm->tb, profile); tca_import_action(&tbm->in_action, in_action); tca_import_action(&tbm->out_action, out_action); return (tbm); } static int tbm_destroy(tbm) struct tbmeter *tbm; { if (tbm->cdnrblk.cb_ref > 0) return (EBUSY); tca_invalidate_action(&tbm->in_action); tca_invalidate_action(&tbm->out_action); cdnr_cbdestroy(tbm); return (0); } static struct tc_action * tbm_input(cb, pktinfo) struct cdnr_block *cb; struct cdnr_pktinfo *pktinfo; { struct tbmeter *tbm = (struct tbmeter *)cb; u_int64_t len; u_int64_t interval, now; len = TB_SCALE(pktinfo->pkt_len); if (tbm->tb.token < len) { now = read_machclk(); interval = now - tbm->tb.last; if (interval >= tbm->tb.filluptime) tbm->tb.token = tbm->tb.depth; else { tbm->tb.token += interval * tbm->tb.rate; if (tbm->tb.token > tbm->tb.depth) tbm->tb.token = tbm->tb.depth; } tbm->tb.last = now; } if (tbm->tb.token < len) { PKTCNTR_ADD(&tbm->out_cnt, pktinfo->pkt_len); return (&tbm->out_action); } tbm->tb.token -= len; PKTCNTR_ADD(&tbm->in_cnt, pktinfo->pkt_len); return (&tbm->in_action); } /* * two rate three color marker * as described in draft-heinanen-diffserv-trtcm-01.txt */ static struct trtcm * trtcm_create(top, cmtd_profile, peak_profile, green_action, yellow_action, red_action, coloraware) struct top_cdnr *top; struct tb_profile *cmtd_profile, *peak_profile; struct tc_action *green_action, *yellow_action, *red_action; int coloraware; { struct trtcm *tcm = NULL; if (tca_verify_action(green_action) < 0 || tca_verify_action(yellow_action) < 0 || tca_verify_action(red_action) < 0) return (NULL); if ((tcm = cdnr_cballoc(top, TCETYPE_TRTCM, trtcm_input)) == NULL) return (NULL); tb_import_profile(&tcm->cmtd_tb, cmtd_profile); tb_import_profile(&tcm->peak_tb, peak_profile); tca_import_action(&tcm->green_action, green_action); tca_import_action(&tcm->yellow_action, yellow_action); tca_import_action(&tcm->red_action, red_action); /* set dscps to use */ if (tcm->green_action.tca_code == TCACODE_MARK) tcm->green_dscp = tcm->green_action.tca_dscp & DSCP_MASK; else tcm->green_dscp = DSCP_AF11; if (tcm->yellow_action.tca_code == TCACODE_MARK) tcm->yellow_dscp = tcm->yellow_action.tca_dscp & DSCP_MASK; else tcm->yellow_dscp = DSCP_AF12; if (tcm->red_action.tca_code == TCACODE_MARK) tcm->red_dscp = tcm->red_action.tca_dscp & DSCP_MASK; else tcm->red_dscp = DSCP_AF13; tcm->coloraware = coloraware; return (tcm); } static int trtcm_destroy(tcm) struct trtcm *tcm; { if (tcm->cdnrblk.cb_ref > 0) return (EBUSY); tca_invalidate_action(&tcm->green_action); tca_invalidate_action(&tcm->yellow_action); tca_invalidate_action(&tcm->red_action); cdnr_cbdestroy(tcm); return (0); } static struct tc_action * trtcm_input(cb, pktinfo) struct cdnr_block *cb; struct cdnr_pktinfo *pktinfo; { struct trtcm *tcm = (struct trtcm *)cb; u_int64_t len; u_int64_t interval, now; u_int8_t color; len = TB_SCALE(pktinfo->pkt_len); if (tcm->coloraware) { color = pktinfo->pkt_dscp; if (color != tcm->yellow_dscp && color != tcm->red_dscp) color = tcm->green_dscp; } else { /* if color-blind, precolor it as green */ color = tcm->green_dscp; } now = read_machclk(); if (tcm->cmtd_tb.token < len) { interval = now - tcm->cmtd_tb.last; if (interval >= tcm->cmtd_tb.filluptime) tcm->cmtd_tb.token = tcm->cmtd_tb.depth; else { tcm->cmtd_tb.token += interval * tcm->cmtd_tb.rate; if (tcm->cmtd_tb.token > tcm->cmtd_tb.depth) tcm->cmtd_tb.token = tcm->cmtd_tb.depth; } tcm->cmtd_tb.last = now; } if (tcm->peak_tb.token < len) { interval = now - tcm->peak_tb.last; if (interval >= tcm->peak_tb.filluptime) tcm->peak_tb.token = tcm->peak_tb.depth; else { tcm->peak_tb.token += interval * tcm->peak_tb.rate; if (tcm->peak_tb.token > tcm->peak_tb.depth) tcm->peak_tb.token = tcm->peak_tb.depth; } tcm->peak_tb.last = now; } if (color == tcm->red_dscp || tcm->peak_tb.token < len) { pktinfo->pkt_dscp = tcm->red_dscp; PKTCNTR_ADD(&tcm->red_cnt, pktinfo->pkt_len); return (&tcm->red_action); } if (color == tcm->yellow_dscp || tcm->cmtd_tb.token < len) { pktinfo->pkt_dscp = tcm->yellow_dscp; tcm->peak_tb.token -= len; PKTCNTR_ADD(&tcm->yellow_cnt, pktinfo->pkt_len); return (&tcm->yellow_action); } pktinfo->pkt_dscp = tcm->green_dscp; tcm->cmtd_tb.token -= len; tcm->peak_tb.token -= len; PKTCNTR_ADD(&tcm->green_cnt, pktinfo->pkt_len); return (&tcm->green_action); } /* * time sliding window three color marker * as described in draft-fang-diffserv-tc-tswtcm-00.txt */ static struct tswtcm * tswtcm_create(top, cmtd_rate, peak_rate, avg_interval, green_action, yellow_action, red_action) struct top_cdnr *top; u_int32_t cmtd_rate, peak_rate, avg_interval; struct tc_action *green_action, *yellow_action, *red_action; { struct tswtcm *tsw; if (tca_verify_action(green_action) < 0 || tca_verify_action(yellow_action) < 0 || tca_verify_action(red_action) < 0) return (NULL); if ((tsw = cdnr_cballoc(top, TCETYPE_TSWTCM, tswtcm_input)) == NULL) return (NULL); tca_import_action(&tsw->green_action, green_action); tca_import_action(&tsw->yellow_action, yellow_action); tca_import_action(&tsw->red_action, red_action); /* set dscps to use */ if (tsw->green_action.tca_code == TCACODE_MARK) tsw->green_dscp = tsw->green_action.tca_dscp & DSCP_MASK; else tsw->green_dscp = DSCP_AF11; if (tsw->yellow_action.tca_code == TCACODE_MARK) tsw->yellow_dscp = tsw->yellow_action.tca_dscp & DSCP_MASK; else tsw->yellow_dscp = DSCP_AF12; if (tsw->red_action.tca_code == TCACODE_MARK) tsw->red_dscp = tsw->red_action.tca_dscp & DSCP_MASK; else tsw->red_dscp = DSCP_AF13; /* convert rates from bits/sec to bytes/sec */ tsw->cmtd_rate = cmtd_rate / 8; tsw->peak_rate = peak_rate / 8; tsw->avg_rate = 0; /* timewin is converted from msec to machine clock unit */ tsw->timewin = (u_int64_t)machclk_freq * avg_interval / 1000; return (tsw); } static int tswtcm_destroy(tsw) struct tswtcm *tsw; { if (tsw->cdnrblk.cb_ref > 0) return (EBUSY); tca_invalidate_action(&tsw->green_action); tca_invalidate_action(&tsw->yellow_action); tca_invalidate_action(&tsw->red_action); cdnr_cbdestroy(tsw); return (0); } static struct tc_action * tswtcm_input(cb, pktinfo) struct cdnr_block *cb; struct cdnr_pktinfo *pktinfo; { struct tswtcm *tsw = (struct tswtcm *)cb; int len; u_int32_t avg_rate; u_int64_t interval, now, tmp; /* * rate estimator */ len = pktinfo->pkt_len; now = read_machclk(); interval = now - tsw->t_front; /* * calculate average rate: * avg = (avg * timewin + pkt_len)/(timewin + interval) * pkt_len needs to be multiplied by machclk_freq in order to * get (bytes/sec). * note: when avg_rate (bytes/sec) and timewin (machclk unit) are * less than 32 bits, the following 64-bit operation has enough * precision. */ tmp = ((u_int64_t)tsw->avg_rate * tsw->timewin + (u_int64_t)len * machclk_freq) / (tsw->timewin + interval); tsw->avg_rate = avg_rate = (u_int32_t)tmp; tsw->t_front = now; /* * marker */ if (avg_rate > tsw->cmtd_rate) { u_int32_t randval = arc4random() % avg_rate; if (avg_rate > tsw->peak_rate) { if (randval < avg_rate - tsw->peak_rate) { /* mark red */ pktinfo->pkt_dscp = tsw->red_dscp; PKTCNTR_ADD(&tsw->red_cnt, len); return (&tsw->red_action); } else if (randval < avg_rate - tsw->cmtd_rate) goto mark_yellow; } else { /* peak_rate >= avg_rate > cmtd_rate */ if (randval < avg_rate - tsw->cmtd_rate) { mark_yellow: pktinfo->pkt_dscp = tsw->yellow_dscp; PKTCNTR_ADD(&tsw->yellow_cnt, len); return (&tsw->yellow_action); } } } /* mark green */ pktinfo->pkt_dscp = tsw->green_dscp; PKTCNTR_ADD(&tsw->green_cnt, len); return (&tsw->green_action); } /* * ioctl requests */ static int cdnrcmd_if_attach(ifname) char *ifname; { struct ifnet *ifp; struct top_cdnr *top; if ((ifp = ifunit(ifname)) == NULL) return (EBADF); if (ifp->if_snd.altq_cdnr != NULL) return (EBUSY); if ((top = top_create(&ifp->if_snd)) == NULL) return (ENOMEM); return (0); } static int cdnrcmd_if_detach(ifname) char *ifname; { struct top_cdnr *top; if ((top = tcb_lookup(ifname)) == NULL) return (EBADF); return top_destroy(top); } static int cdnrcmd_add_element(ap) struct cdnr_add_element *ap; { struct top_cdnr *top; struct cdnr_block *cb; if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) return (EBADF); cb = element_create(top, &ap->action); if (cb == NULL) return (EINVAL); /* return a class handle to the user */ ap->cdnr_handle = cdnr_cb2handle(cb); return (0); } static int cdnrcmd_delete_element(ap) struct cdnr_delete_element *ap; { struct top_cdnr *top; struct cdnr_block *cb; if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) return (EBADF); if ((cb = cdnr_handle2cb(ap->cdnr_handle)) == NULL) return (EINVAL); if (cb->cb_type != TCETYPE_ELEMENT) return generic_element_destroy(cb); return element_destroy(cb); } static int cdnrcmd_add_filter(ap) struct cdnr_add_filter *ap; { struct top_cdnr *top; struct cdnr_block *cb; if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) return (EBADF); if ((cb = cdnr_handle2cb(ap->cdnr_handle)) == NULL) return (EINVAL); return acc_add_filter(&top->tc_classifier, &ap->filter, cb, &ap->filter_handle); } static int cdnrcmd_delete_filter(ap) struct cdnr_delete_filter *ap; { struct top_cdnr *top; if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) return (EBADF); return acc_delete_filter(&top->tc_classifier, ap->filter_handle); } static int cdnrcmd_add_tbm(ap) struct cdnr_add_tbmeter *ap; { struct top_cdnr *top; struct tbmeter *tbm; if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) return (EBADF); tbm = tbm_create(top, &ap->profile, &ap->in_action, &ap->out_action); if (tbm == NULL) return (EINVAL); /* return a class handle to the user */ ap->cdnr_handle = cdnr_cb2handle(&tbm->cdnrblk); return (0); } static int cdnrcmd_modify_tbm(ap) struct cdnr_modify_tbmeter *ap; { struct tbmeter *tbm; if ((tbm = (struct tbmeter *)cdnr_handle2cb(ap->cdnr_handle)) == NULL) return (EINVAL); tb_import_profile(&tbm->tb, &ap->profile); return (0); } static int cdnrcmd_tbm_stats(ap) struct cdnr_tbmeter_stats *ap; { struct tbmeter *tbm; if ((tbm = (struct tbmeter *)cdnr_handle2cb(ap->cdnr_handle)) == NULL) return (EINVAL); ap->in_cnt = tbm->in_cnt; ap->out_cnt = tbm->out_cnt; return (0); } static int cdnrcmd_add_trtcm(ap) struct cdnr_add_trtcm *ap; { struct top_cdnr *top; struct trtcm *tcm; if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) return (EBADF); tcm = trtcm_create(top, &ap->cmtd_profile, &ap->peak_profile, &ap->green_action, &ap->yellow_action, &ap->red_action, ap->coloraware); if (tcm == NULL) return (EINVAL); /* return a class handle to the user */ ap->cdnr_handle = cdnr_cb2handle(&tcm->cdnrblk); return (0); } static int cdnrcmd_modify_trtcm(ap) struct cdnr_modify_trtcm *ap; { struct trtcm *tcm; if ((tcm = (struct trtcm *)cdnr_handle2cb(ap->cdnr_handle)) == NULL) return (EINVAL); tb_import_profile(&tcm->cmtd_tb, &ap->cmtd_profile); tb_import_profile(&tcm->peak_tb, &ap->peak_profile); return (0); } static int cdnrcmd_tcm_stats(ap) struct cdnr_tcm_stats *ap; { struct cdnr_block *cb; if ((cb = cdnr_handle2cb(ap->cdnr_handle)) == NULL) return (EINVAL); if (cb->cb_type == TCETYPE_TRTCM) { struct trtcm *tcm = (struct trtcm *)cb; ap->green_cnt = tcm->green_cnt; ap->yellow_cnt = tcm->yellow_cnt; ap->red_cnt = tcm->red_cnt; } else if (cb->cb_type == TCETYPE_TSWTCM) { struct tswtcm *tsw = (struct tswtcm *)cb; ap->green_cnt = tsw->green_cnt; ap->yellow_cnt = tsw->yellow_cnt; ap->red_cnt = tsw->red_cnt; } else return (EINVAL); return (0); } static int cdnrcmd_add_tswtcm(ap) struct cdnr_add_tswtcm *ap; { struct top_cdnr *top; struct tswtcm *tsw; if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) return (EBADF); if (ap->cmtd_rate > ap->peak_rate) return (EINVAL); tsw = tswtcm_create(top, ap->cmtd_rate, ap->peak_rate, ap->avg_interval, &ap->green_action, &ap->yellow_action, &ap->red_action); if (tsw == NULL) return (EINVAL); /* return a class handle to the user */ ap->cdnr_handle = cdnr_cb2handle(&tsw->cdnrblk); return (0); } static int cdnrcmd_modify_tswtcm(ap) struct cdnr_modify_tswtcm *ap; { struct tswtcm *tsw; if ((tsw = (struct tswtcm *)cdnr_handle2cb(ap->cdnr_handle)) == NULL) return (EINVAL); if (ap->cmtd_rate > ap->peak_rate) return (EINVAL); /* convert rates from bits/sec to bytes/sec */ tsw->cmtd_rate = ap->cmtd_rate / 8; tsw->peak_rate = ap->peak_rate / 8; tsw->avg_rate = 0; /* timewin is converted from msec to machine clock unit */ tsw->timewin = (u_int64_t)machclk_freq * ap->avg_interval / 1000; return (0); } static int cdnrcmd_get_stats(ap) struct cdnr_get_stats *ap; { struct top_cdnr *top; struct cdnr_block *cb; struct tbmeter *tbm; struct trtcm *tcm; struct tswtcm *tsw; struct tce_stats tce, *usp; int error, n, nskip, nelements; if ((top = tcb_lookup(ap->iface.cdnr_ifname)) == NULL) return (EBADF); /* copy action stats */ bcopy(top->tc_cnts, ap->cnts, sizeof(ap->cnts)); /* stats for each element */ nelements = ap->nelements; usp = ap->tce_stats; if (nelements <= 0 || usp == NULL) return (0); nskip = ap->nskip; n = 0; LIST_FOREACH(cb, &top->tc_elements, cb_next) { if (nskip > 0) { nskip--; continue; } bzero(&tce, sizeof(tce)); tce.tce_handle = cb->cb_handle; tce.tce_type = cb->cb_type; switch (cb->cb_type) { case TCETYPE_TBMETER: tbm = (struct tbmeter *)cb; tce.tce_cnts[0] = tbm->in_cnt; tce.tce_cnts[1] = tbm->out_cnt; break; case TCETYPE_TRTCM: tcm = (struct trtcm *)cb; tce.tce_cnts[0] = tcm->green_cnt; tce.tce_cnts[1] = tcm->yellow_cnt; tce.tce_cnts[2] = tcm->red_cnt; break; case TCETYPE_TSWTCM: tsw = (struct tswtcm *)cb; tce.tce_cnts[0] = tsw->green_cnt; tce.tce_cnts[1] = tsw->yellow_cnt; tce.tce_cnts[2] = tsw->red_cnt; break; default: continue; } if ((error = copyout((caddr_t)&tce, (caddr_t)usp++, sizeof(tce))) != 0) return (error); if (++n == nelements) break; } ap->nelements = n; return (0); } /* * conditioner device interface */ int cdnropen(dev, flag, fmt, p) dev_t dev; int flag, fmt; #if (__FreeBSD_version > 500000) struct thread *p; #else struct proc *p; #endif { if (machclk_freq == 0) init_machclk(); if (machclk_freq == 0) { printf("cdnr: no cpu clock available!\n"); return (ENXIO); } /* everything will be done when the queueing scheme is attached. */ return 0; } int cdnrclose(dev, flag, fmt, p) dev_t dev; int flag, fmt; #if (__FreeBSD_version > 500000) struct thread *p; #else struct proc *p; #endif { struct top_cdnr *top; int err, error = 0; while ((top = LIST_FIRST(&tcb_list)) != NULL) { /* destroy all */ err = top_destroy(top); if (err != 0 && error == 0) error = err; } altq_input = NULL; return (error); } int cdnrioctl(dev, cmd, addr, flag, p) dev_t dev; ioctlcmd_t cmd; caddr_t addr; int flag; #if (__FreeBSD_version > 500000) struct thread *p; #else struct proc *p; #endif { struct top_cdnr *top; struct cdnr_interface *ifacep; int s, error = 0; /* check super-user privilege */ switch (cmd) { case CDNR_GETSTATS: break; default: #if (__FreeBSD_version > 700000) if ((error = priv_check(p, PRIV_ALTQ_MANAGE)) != 0) #elsif (__FreeBSD_version > 400000) if ((error = suser(p)) != 0) #else if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) #endif return (error); break; } #ifdef __NetBSD__ s = splnet(); #else s = splimp(); #endif switch (cmd) { case CDNR_IF_ATTACH: ifacep = (struct cdnr_interface *)addr; error = cdnrcmd_if_attach(ifacep->cdnr_ifname); break; case CDNR_IF_DETACH: ifacep = (struct cdnr_interface *)addr; error = cdnrcmd_if_detach(ifacep->cdnr_ifname); break; case CDNR_ENABLE: case CDNR_DISABLE: ifacep = (struct cdnr_interface *)addr; if ((top = tcb_lookup(ifacep->cdnr_ifname)) == NULL) { error = EBADF; break; } switch (cmd) { case CDNR_ENABLE: ALTQ_SET_CNDTNING(top->tc_ifq); if (altq_input == NULL) altq_input = altq_cdnr_input; break; case CDNR_DISABLE: ALTQ_CLEAR_CNDTNING(top->tc_ifq); LIST_FOREACH(top, &tcb_list, tc_next) if (ALTQ_IS_CNDTNING(top->tc_ifq)) break; if (top == NULL) altq_input = NULL; break; } break; case CDNR_ADD_ELEM: error = cdnrcmd_add_element((struct cdnr_add_element *)addr); break; case CDNR_DEL_ELEM: error = cdnrcmd_delete_element((struct cdnr_delete_element *)addr); break; case CDNR_ADD_TBM: error = cdnrcmd_add_tbm((struct cdnr_add_tbmeter *)addr); break; case CDNR_MOD_TBM: error = cdnrcmd_modify_tbm((struct cdnr_modify_tbmeter *)addr); break; case CDNR_TBM_STATS: error = cdnrcmd_tbm_stats((struct cdnr_tbmeter_stats *)addr); break; case CDNR_ADD_TCM: error = cdnrcmd_add_trtcm((struct cdnr_add_trtcm *)addr); break; case CDNR_MOD_TCM: error = cdnrcmd_modify_trtcm((struct cdnr_modify_trtcm *)addr); break; case CDNR_TCM_STATS: error = cdnrcmd_tcm_stats((struct cdnr_tcm_stats *)addr); break; case CDNR_ADD_FILTER: error = cdnrcmd_add_filter((struct cdnr_add_filter *)addr); break; case CDNR_DEL_FILTER: error = cdnrcmd_delete_filter((struct cdnr_delete_filter *)addr); break; case CDNR_GETSTATS: error = cdnrcmd_get_stats((struct cdnr_get_stats *)addr); break; case CDNR_ADD_TSW: error = cdnrcmd_add_tswtcm((struct cdnr_add_tswtcm *)addr); break; case CDNR_MOD_TSW: error = cdnrcmd_modify_tswtcm((struct cdnr_modify_tswtcm *)addr); break; default: error = EINVAL; break; } splx(s); return error; } #ifdef KLD_MODULE static struct altqsw cdnr_sw = {"cdnr", cdnropen, cdnrclose, cdnrioctl}; ALTQ_MODULE(altq_cdnr, ALTQT_CDNR, &cdnr_sw); #endif /* KLD_MODULE */ #endif /* ALTQ3_COMPAT */ #endif /* ALTQ_CDNR */ Index: stable/10/sys/contrib/altq/altq/altq_hfsc.c =================================================================== --- stable/10/sys/contrib/altq/altq/altq_hfsc.c (revision 263085) +++ stable/10/sys/contrib/altq/altq/altq_hfsc.c (revision 263086) @@ -1,2219 +1,2222 @@ /* $FreeBSD$ */ /* $KAME: altq_hfsc.c,v 1.24 2003/12/05 05:40:46 kjc Exp $ */ /* * Copyright (c) 1997-1999 Carnegie Mellon University. All Rights Reserved. * * Permission to use, copy, modify, and distribute this software and * its documentation is hereby granted (including for commercial or * for-profit use), provided that both the copyright notice and this * permission notice appear in all copies of the software, derivative * works, or modified versions, and any portions thereof. * * THIS SOFTWARE IS EXPERIMENTAL AND IS KNOWN TO HAVE BUGS, SOME OF * WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON PROVIDES THIS * SOFTWARE IN ITS ``AS IS'' CONDITION, AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * * Carnegie Mellon encourages (but does not require) users of this * software to return any improvements or extensions that they make, * and to grant Carnegie Mellon the rights to redistribute these * changes without encumbrance. */ /* * H-FSC is described in Proceedings of SIGCOMM'97, * "A Hierarchical Fair Service Curve Algorithm for Link-Sharing, * Real-Time and Priority Service" * by Ion Stoica, Hui Zhang, and T. S. Eugene Ng. * * Oleg Cherevko added the upperlimit for link-sharing. * when a class has an upperlimit, the fit-time is computed from the * upperlimit service curve. the link-sharing scheduler does not schedule * a class whose fit-time exceeds the current time. */ #if defined(__FreeBSD__) || defined(__NetBSD__) #include "opt_altq.h" #include "opt_inet.h" #ifdef __FreeBSD__ #include "opt_inet6.h" #endif #endif /* __FreeBSD__ || __NetBSD__ */ #ifdef ALTQ_HFSC /* hfsc is enabled by ALTQ_HFSC option in opt_altq.h */ #include #include #include #include #include #include #include #if 1 /* ALTQ3_COMPAT */ #include #include #include #endif /* ALTQ3_COMPAT */ #include +#include #include -#include +#include +#include +#include #include #include #ifdef ALTQ3_COMPAT #include #endif /* * function prototypes */ static int hfsc_clear_interface(struct hfsc_if *); static int hfsc_request(struct ifaltq *, int, void *); static void hfsc_purge(struct hfsc_if *); static struct hfsc_class *hfsc_class_create(struct hfsc_if *, struct service_curve *, struct service_curve *, struct service_curve *, struct hfsc_class *, int, int, int); static int hfsc_class_destroy(struct hfsc_class *); static struct hfsc_class *hfsc_nextclass(struct hfsc_class *); static int hfsc_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *); static struct mbuf *hfsc_dequeue(struct ifaltq *, int); static int hfsc_addq(struct hfsc_class *, struct mbuf *); static struct mbuf *hfsc_getq(struct hfsc_class *); static struct mbuf *hfsc_pollq(struct hfsc_class *); static void hfsc_purgeq(struct hfsc_class *); static void update_cfmin(struct hfsc_class *); static void set_active(struct hfsc_class *, int); static void set_passive(struct hfsc_class *); static void init_ed(struct hfsc_class *, int); static void update_ed(struct hfsc_class *, int); static void update_d(struct hfsc_class *, int); static void init_vf(struct hfsc_class *, int); static void update_vf(struct hfsc_class *, int, u_int64_t); static void ellist_insert(struct hfsc_class *); static void ellist_remove(struct hfsc_class *); static void ellist_update(struct hfsc_class *); struct hfsc_class *hfsc_get_mindl(struct hfsc_if *, u_int64_t); static void actlist_insert(struct hfsc_class *); static void actlist_remove(struct hfsc_class *); static void actlist_update(struct hfsc_class *); static struct hfsc_class *actlist_firstfit(struct hfsc_class *, u_int64_t); static __inline u_int64_t seg_x2y(u_int64_t, u_int64_t); static __inline u_int64_t seg_y2x(u_int64_t, u_int64_t); static __inline u_int64_t m2sm(u_int); static __inline u_int64_t m2ism(u_int); static __inline u_int64_t d2dx(u_int); static u_int sm2m(u_int64_t); static u_int dx2d(u_int64_t); static void sc2isc(struct service_curve *, struct internal_sc *); static void rtsc_init(struct runtime_sc *, struct internal_sc *, u_int64_t, u_int64_t); static u_int64_t rtsc_y2x(struct runtime_sc *, u_int64_t); static u_int64_t rtsc_x2y(struct runtime_sc *, u_int64_t); static void rtsc_min(struct runtime_sc *, struct internal_sc *, u_int64_t, u_int64_t); static void get_class_stats(struct hfsc_classstats *, struct hfsc_class *); static struct hfsc_class *clh_to_clp(struct hfsc_if *, u_int32_t); #ifdef ALTQ3_COMPAT static struct hfsc_if *hfsc_attach(struct ifaltq *, u_int); static int hfsc_detach(struct hfsc_if *); static int hfsc_class_modify(struct hfsc_class *, struct service_curve *, struct service_curve *, struct service_curve *); static int hfsccmd_if_attach(struct hfsc_attach *); static int hfsccmd_if_detach(struct hfsc_interface *); static int hfsccmd_add_class(struct hfsc_add_class *); static int hfsccmd_delete_class(struct hfsc_delete_class *); static int hfsccmd_modify_class(struct hfsc_modify_class *); static int hfsccmd_add_filter(struct hfsc_add_filter *); static int hfsccmd_delete_filter(struct hfsc_delete_filter *); static int hfsccmd_class_stats(struct hfsc_class_stats *); altqdev_decl(hfsc); #endif /* ALTQ3_COMPAT */ /* * macros */ #define is_a_parent_class(cl) ((cl)->cl_children != NULL) #define HT_INFINITY 0xffffffffffffffffLL /* infinite time value */ #ifdef ALTQ3_COMPAT /* hif_list keeps all hfsc_if's allocated. */ static struct hfsc_if *hif_list = NULL; #endif /* ALTQ3_COMPAT */ int hfsc_pfattach(struct pf_altq *a) { struct ifnet *ifp; int s, error; if ((ifp = ifunit(a->ifname)) == NULL || a->altq_disc == NULL) return (EINVAL); #ifdef __NetBSD__ s = splnet(); #else s = splimp(); #endif error = altq_attach(&ifp->if_snd, ALTQT_HFSC, a->altq_disc, hfsc_enqueue, hfsc_dequeue, hfsc_request, NULL, NULL); splx(s); return (error); } int hfsc_add_altq(struct pf_altq *a) { struct hfsc_if *hif; struct ifnet *ifp; if ((ifp = ifunit(a->ifname)) == NULL) return (EINVAL); if (!ALTQ_IS_READY(&ifp->if_snd)) return (ENODEV); hif = malloc(sizeof(struct hfsc_if), M_DEVBUF, M_NOWAIT | M_ZERO); if (hif == NULL) return (ENOMEM); TAILQ_INIT(&hif->hif_eligible); hif->hif_ifq = &ifp->if_snd; /* keep the state in pf_altq */ a->altq_disc = hif; return (0); } int hfsc_remove_altq(struct pf_altq *a) { struct hfsc_if *hif; if ((hif = a->altq_disc) == NULL) return (EINVAL); a->altq_disc = NULL; (void)hfsc_clear_interface(hif); (void)hfsc_class_destroy(hif->hif_rootclass); free(hif, M_DEVBUF); return (0); } int hfsc_add_queue(struct pf_altq *a) { struct hfsc_if *hif; struct hfsc_class *cl, *parent; struct hfsc_opts *opts; struct service_curve rtsc, lssc, ulsc; if ((hif = a->altq_disc) == NULL) return (EINVAL); opts = &a->pq_u.hfsc_opts; if (a->parent_qid == HFSC_NULLCLASS_HANDLE && hif->hif_rootclass == NULL) parent = NULL; else if ((parent = clh_to_clp(hif, a->parent_qid)) == NULL) return (EINVAL); if (a->qid == 0) return (EINVAL); if (clh_to_clp(hif, a->qid) != NULL) return (EBUSY); rtsc.m1 = opts->rtsc_m1; rtsc.d = opts->rtsc_d; rtsc.m2 = opts->rtsc_m2; lssc.m1 = opts->lssc_m1; lssc.d = opts->lssc_d; lssc.m2 = opts->lssc_m2; ulsc.m1 = opts->ulsc_m1; ulsc.d = opts->ulsc_d; ulsc.m2 = opts->ulsc_m2; cl = hfsc_class_create(hif, &rtsc, &lssc, &ulsc, parent, a->qlimit, opts->flags, a->qid); if (cl == NULL) return (ENOMEM); return (0); } int hfsc_remove_queue(struct pf_altq *a) { struct hfsc_if *hif; struct hfsc_class *cl; if ((hif = a->altq_disc) == NULL) return (EINVAL); if ((cl = clh_to_clp(hif, a->qid)) == NULL) return (EINVAL); return (hfsc_class_destroy(cl)); } int hfsc_getqstats(struct pf_altq *a, void *ubuf, int *nbytes) { struct hfsc_if *hif; struct hfsc_class *cl; struct hfsc_classstats stats; int error = 0; if ((hif = altq_lookup(a->ifname, ALTQT_HFSC)) == NULL) return (EBADF); if ((cl = clh_to_clp(hif, a->qid)) == NULL) return (EINVAL); if (*nbytes < sizeof(stats)) return (EINVAL); get_class_stats(&stats, cl); if ((error = copyout((caddr_t)&stats, ubuf, sizeof(stats))) != 0) return (error); *nbytes = sizeof(stats); return (0); } /* * bring the interface back to the initial state by discarding * all the filters and classes except the root class. */ static int hfsc_clear_interface(struct hfsc_if *hif) { struct hfsc_class *cl; #ifdef ALTQ3_COMPAT /* free the filters for this interface */ acc_discard_filters(&hif->hif_classifier, NULL, 1); #endif /* clear out the classes */ while (hif->hif_rootclass != NULL && (cl = hif->hif_rootclass->cl_children) != NULL) { /* * remove the first leaf class found in the hierarchy * then start over */ for (; cl != NULL; cl = hfsc_nextclass(cl)) { if (!is_a_parent_class(cl)) { (void)hfsc_class_destroy(cl); break; } } } return (0); } static int hfsc_request(struct ifaltq *ifq, int req, void *arg) { struct hfsc_if *hif = (struct hfsc_if *)ifq->altq_disc; IFQ_LOCK_ASSERT(ifq); switch (req) { case ALTRQ_PURGE: hfsc_purge(hif); break; } return (0); } /* discard all the queued packets on the interface */ static void hfsc_purge(struct hfsc_if *hif) { struct hfsc_class *cl; for (cl = hif->hif_rootclass; cl != NULL; cl = hfsc_nextclass(cl)) if (!qempty(cl->cl_q)) hfsc_purgeq(cl); if (ALTQ_IS_ENABLED(hif->hif_ifq)) hif->hif_ifq->ifq_len = 0; } struct hfsc_class * hfsc_class_create(struct hfsc_if *hif, struct service_curve *rsc, struct service_curve *fsc, struct service_curve *usc, struct hfsc_class *parent, int qlimit, int flags, int qid) { struct hfsc_class *cl, *p; int i, s; if (hif->hif_classes >= HFSC_MAX_CLASSES) return (NULL); #ifndef ALTQ_RED if (flags & HFCF_RED) { #ifdef ALTQ_DEBUG printf("hfsc_class_create: RED not configured for HFSC!\n"); #endif return (NULL); } #endif cl = malloc(sizeof(struct hfsc_class), M_DEVBUF, M_NOWAIT | M_ZERO); if (cl == NULL) return (NULL); cl->cl_q = malloc(sizeof(class_queue_t), M_DEVBUF, M_NOWAIT | M_ZERO); if (cl->cl_q == NULL) goto err_ret; TAILQ_INIT(&cl->cl_actc); if (qlimit == 0) qlimit = 50; /* use default */ qlimit(cl->cl_q) = qlimit; qtype(cl->cl_q) = Q_DROPTAIL; qlen(cl->cl_q) = 0; cl->cl_flags = flags; #ifdef ALTQ_RED if (flags & (HFCF_RED|HFCF_RIO)) { int red_flags, red_pkttime; u_int m2; m2 = 0; if (rsc != NULL && rsc->m2 > m2) m2 = rsc->m2; if (fsc != NULL && fsc->m2 > m2) m2 = fsc->m2; if (usc != NULL && usc->m2 > m2) m2 = usc->m2; red_flags = 0; if (flags & HFCF_ECN) red_flags |= REDF_ECN; #ifdef ALTQ_RIO if (flags & HFCF_CLEARDSCP) red_flags |= RIOF_CLEARDSCP; #endif if (m2 < 8) red_pkttime = 1000 * 1000 * 1000; /* 1 sec */ else red_pkttime = (int64_t)hif->hif_ifq->altq_ifp->if_mtu * 1000 * 1000 * 1000 / (m2 / 8); if (flags & HFCF_RED) { cl->cl_red = red_alloc(0, 0, qlimit(cl->cl_q) * 10/100, qlimit(cl->cl_q) * 30/100, red_flags, red_pkttime); if (cl->cl_red != NULL) qtype(cl->cl_q) = Q_RED; } #ifdef ALTQ_RIO else { cl->cl_red = (red_t *)rio_alloc(0, NULL, red_flags, red_pkttime); if (cl->cl_red != NULL) qtype(cl->cl_q) = Q_RIO; } #endif } #endif /* ALTQ_RED */ if (rsc != NULL && (rsc->m1 != 0 || rsc->m2 != 0)) { cl->cl_rsc = malloc(sizeof(struct internal_sc), M_DEVBUF, M_NOWAIT); if (cl->cl_rsc == NULL) goto err_ret; sc2isc(rsc, cl->cl_rsc); rtsc_init(&cl->cl_deadline, cl->cl_rsc, 0, 0); rtsc_init(&cl->cl_eligible, cl->cl_rsc, 0, 0); } if (fsc != NULL && (fsc->m1 != 0 || fsc->m2 != 0)) { cl->cl_fsc = malloc(sizeof(struct internal_sc), M_DEVBUF, M_NOWAIT); if (cl->cl_fsc == NULL) goto err_ret; sc2isc(fsc, cl->cl_fsc); rtsc_init(&cl->cl_virtual, cl->cl_fsc, 0, 0); } if (usc != NULL && (usc->m1 != 0 || usc->m2 != 0)) { cl->cl_usc = malloc(sizeof(struct internal_sc), M_DEVBUF, M_NOWAIT); if (cl->cl_usc == NULL) goto err_ret; sc2isc(usc, cl->cl_usc); rtsc_init(&cl->cl_ulimit, cl->cl_usc, 0, 0); } cl->cl_id = hif->hif_classid++; cl->cl_handle = qid; cl->cl_hif = hif; cl->cl_parent = parent; #ifdef __NetBSD__ s = splnet(); #else s = splimp(); #endif IFQ_LOCK(hif->hif_ifq); hif->hif_classes++; /* * find a free slot in the class table. if the slot matching * the lower bits of qid is free, use this slot. otherwise, * use the first free slot. */ i = qid % HFSC_MAX_CLASSES; if (hif->hif_class_tbl[i] == NULL) hif->hif_class_tbl[i] = cl; else { for (i = 0; i < HFSC_MAX_CLASSES; i++) if (hif->hif_class_tbl[i] == NULL) { hif->hif_class_tbl[i] = cl; break; } if (i == HFSC_MAX_CLASSES) { IFQ_UNLOCK(hif->hif_ifq); splx(s); goto err_ret; } } if (flags & HFCF_DEFAULTCLASS) hif->hif_defaultclass = cl; if (parent == NULL) { /* this is root class */ hif->hif_rootclass = cl; } else { /* add this class to the children list of the parent */ if ((p = parent->cl_children) == NULL) parent->cl_children = cl; else { while (p->cl_siblings != NULL) p = p->cl_siblings; p->cl_siblings = cl; } } IFQ_UNLOCK(hif->hif_ifq); splx(s); return (cl); err_ret: if (cl->cl_red != NULL) { #ifdef ALTQ_RIO if (q_is_rio(cl->cl_q)) rio_destroy((rio_t *)cl->cl_red); #endif #ifdef ALTQ_RED if (q_is_red(cl->cl_q)) red_destroy(cl->cl_red); #endif } if (cl->cl_fsc != NULL) free(cl->cl_fsc, M_DEVBUF); if (cl->cl_rsc != NULL) free(cl->cl_rsc, M_DEVBUF); if (cl->cl_usc != NULL) free(cl->cl_usc, M_DEVBUF); if (cl->cl_q != NULL) free(cl->cl_q, M_DEVBUF); free(cl, M_DEVBUF); return (NULL); } static int hfsc_class_destroy(struct hfsc_class *cl) { int i, s; if (cl == NULL) return (0); if (is_a_parent_class(cl)) return (EBUSY); #ifdef __NetBSD__ s = splnet(); #else s = splimp(); #endif IFQ_LOCK(cl->cl_hif->hif_ifq); #ifdef ALTQ3_COMPAT /* delete filters referencing to this class */ acc_discard_filters(&cl->cl_hif->hif_classifier, cl, 0); #endif /* ALTQ3_COMPAT */ if (!qempty(cl->cl_q)) hfsc_purgeq(cl); if (cl->cl_parent == NULL) { /* this is root class */ } else { struct hfsc_class *p = cl->cl_parent->cl_children; if (p == cl) cl->cl_parent->cl_children = cl->cl_siblings; else do { if (p->cl_siblings == cl) { p->cl_siblings = cl->cl_siblings; break; } } while ((p = p->cl_siblings) != NULL); ASSERT(p != NULL); } for (i = 0; i < HFSC_MAX_CLASSES; i++) if (cl->cl_hif->hif_class_tbl[i] == cl) { cl->cl_hif->hif_class_tbl[i] = NULL; break; } cl->cl_hif->hif_classes--; IFQ_UNLOCK(cl->cl_hif->hif_ifq); splx(s); if (cl->cl_red != NULL) { #ifdef ALTQ_RIO if (q_is_rio(cl->cl_q)) rio_destroy((rio_t *)cl->cl_red); #endif #ifdef ALTQ_RED if (q_is_red(cl->cl_q)) red_destroy(cl->cl_red); #endif } IFQ_LOCK(cl->cl_hif->hif_ifq); if (cl == cl->cl_hif->hif_rootclass) cl->cl_hif->hif_rootclass = NULL; if (cl == cl->cl_hif->hif_defaultclass) cl->cl_hif->hif_defaultclass = NULL; IFQ_UNLOCK(cl->cl_hif->hif_ifq); if (cl->cl_usc != NULL) free(cl->cl_usc, M_DEVBUF); if (cl->cl_fsc != NULL) free(cl->cl_fsc, M_DEVBUF); if (cl->cl_rsc != NULL) free(cl->cl_rsc, M_DEVBUF); free(cl->cl_q, M_DEVBUF); free(cl, M_DEVBUF); return (0); } /* * hfsc_nextclass returns the next class in the tree. * usage: * for (cl = hif->hif_rootclass; cl != NULL; cl = hfsc_nextclass(cl)) * do_something; */ static struct hfsc_class * hfsc_nextclass(struct hfsc_class *cl) { if (cl->cl_children != NULL) cl = cl->cl_children; else if (cl->cl_siblings != NULL) cl = cl->cl_siblings; else { while ((cl = cl->cl_parent) != NULL) if (cl->cl_siblings) { cl = cl->cl_siblings; break; } } return (cl); } /* * hfsc_enqueue is an enqueue function to be registered to * (*altq_enqueue) in struct ifaltq. */ static int hfsc_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr) { struct hfsc_if *hif = (struct hfsc_if *)ifq->altq_disc; struct hfsc_class *cl; struct pf_mtag *t; int len; IFQ_LOCK_ASSERT(ifq); /* grab class set by classifier */ if ((m->m_flags & M_PKTHDR) == 0) { /* should not happen */ printf("altq: packet for %s does not have pkthdr\n", ifq->altq_ifp->if_xname); m_freem(m); return (ENOBUFS); } cl = NULL; if ((t = pf_find_mtag(m)) != NULL) cl = clh_to_clp(hif, t->qid); #ifdef ALTQ3_COMPAT else if ((ifq->altq_flags & ALTQF_CLASSIFY) && pktattr != NULL) cl = pktattr->pattr_class; #endif if (cl == NULL || is_a_parent_class(cl)) { cl = hif->hif_defaultclass; if (cl == NULL) { m_freem(m); return (ENOBUFS); } } #ifdef ALTQ3_COMPAT if (pktattr != NULL) cl->cl_pktattr = pktattr; /* save proto hdr used by ECN */ else #endif cl->cl_pktattr = NULL; len = m_pktlen(m); if (hfsc_addq(cl, m) != 0) { /* drop occurred. mbuf was freed in hfsc_addq. */ PKTCNTR_ADD(&cl->cl_stats.drop_cnt, len); return (ENOBUFS); } IFQ_INC_LEN(ifq); cl->cl_hif->hif_packets++; /* successfully queued. */ if (qlen(cl->cl_q) == 1) set_active(cl, m_pktlen(m)); return (0); } /* * hfsc_dequeue is a dequeue function to be registered to * (*altq_dequeue) in struct ifaltq. * * note: ALTDQ_POLL returns the next packet without removing the packet * from the queue. ALTDQ_REMOVE is a normal dequeue operation. * ALTDQ_REMOVE must return the same packet if called immediately * after ALTDQ_POLL. */ static struct mbuf * hfsc_dequeue(struct ifaltq *ifq, int op) { struct hfsc_if *hif = (struct hfsc_if *)ifq->altq_disc; struct hfsc_class *cl; struct mbuf *m; int len, next_len; int realtime = 0; u_int64_t cur_time; IFQ_LOCK_ASSERT(ifq); if (hif->hif_packets == 0) /* no packet in the tree */ return (NULL); cur_time = read_machclk(); if (op == ALTDQ_REMOVE && hif->hif_pollcache != NULL) { cl = hif->hif_pollcache; hif->hif_pollcache = NULL; /* check if the class was scheduled by real-time criteria */ if (cl->cl_rsc != NULL) realtime = (cl->cl_e <= cur_time); } else { /* * if there are eligible classes, use real-time criteria. * find the class with the minimum deadline among * the eligible classes. */ if ((cl = hfsc_get_mindl(hif, cur_time)) != NULL) { realtime = 1; } else { #ifdef ALTQ_DEBUG int fits = 0; #endif /* * use link-sharing criteria * get the class with the minimum vt in the hierarchy */ cl = hif->hif_rootclass; while (is_a_parent_class(cl)) { cl = actlist_firstfit(cl, cur_time); if (cl == NULL) { #ifdef ALTQ_DEBUG if (fits > 0) printf("%d fit but none found\n",fits); #endif return (NULL); } /* * update parent's cl_cvtmin. * don't update if the new vt is smaller. */ if (cl->cl_parent->cl_cvtmin < cl->cl_vt) cl->cl_parent->cl_cvtmin = cl->cl_vt; #ifdef ALTQ_DEBUG fits++; #endif } } if (op == ALTDQ_POLL) { hif->hif_pollcache = cl; m = hfsc_pollq(cl); return (m); } } m = hfsc_getq(cl); if (m == NULL) panic("hfsc_dequeue:"); len = m_pktlen(m); cl->cl_hif->hif_packets--; IFQ_DEC_LEN(ifq); PKTCNTR_ADD(&cl->cl_stats.xmit_cnt, len); update_vf(cl, len, cur_time); if (realtime) cl->cl_cumul += len; if (!qempty(cl->cl_q)) { if (cl->cl_rsc != NULL) { /* update ed */ next_len = m_pktlen(qhead(cl->cl_q)); if (realtime) update_ed(cl, next_len); else update_d(cl, next_len); } } else { /* the class becomes passive */ set_passive(cl); } return (m); } static int hfsc_addq(struct hfsc_class *cl, struct mbuf *m) { #ifdef ALTQ_RIO if (q_is_rio(cl->cl_q)) return rio_addq((rio_t *)cl->cl_red, cl->cl_q, m, cl->cl_pktattr); #endif #ifdef ALTQ_RED if (q_is_red(cl->cl_q)) return red_addq(cl->cl_red, cl->cl_q, m, cl->cl_pktattr); #endif if (qlen(cl->cl_q) >= qlimit(cl->cl_q)) { m_freem(m); return (-1); } if (cl->cl_flags & HFCF_CLEARDSCP) write_dsfield(m, cl->cl_pktattr, 0); _addq(cl->cl_q, m); return (0); } static struct mbuf * hfsc_getq(struct hfsc_class *cl) { #ifdef ALTQ_RIO if (q_is_rio(cl->cl_q)) return rio_getq((rio_t *)cl->cl_red, cl->cl_q); #endif #ifdef ALTQ_RED if (q_is_red(cl->cl_q)) return red_getq(cl->cl_red, cl->cl_q); #endif return _getq(cl->cl_q); } static struct mbuf * hfsc_pollq(struct hfsc_class *cl) { return qhead(cl->cl_q); } static void hfsc_purgeq(struct hfsc_class *cl) { struct mbuf *m; if (qempty(cl->cl_q)) return; while ((m = _getq(cl->cl_q)) != NULL) { PKTCNTR_ADD(&cl->cl_stats.drop_cnt, m_pktlen(m)); m_freem(m); cl->cl_hif->hif_packets--; IFQ_DEC_LEN(cl->cl_hif->hif_ifq); } ASSERT(qlen(cl->cl_q) == 0); update_vf(cl, 0, 0); /* remove cl from the actlist */ set_passive(cl); } static void set_active(struct hfsc_class *cl, int len) { if (cl->cl_rsc != NULL) init_ed(cl, len); if (cl->cl_fsc != NULL) init_vf(cl, len); cl->cl_stats.period++; } static void set_passive(struct hfsc_class *cl) { if (cl->cl_rsc != NULL) ellist_remove(cl); /* * actlist is now handled in update_vf() so that update_vf(cl, 0, 0) * needs to be called explicitly to remove a class from actlist */ } static void init_ed(struct hfsc_class *cl, int next_len) { u_int64_t cur_time; cur_time = read_machclk(); /* update the deadline curve */ rtsc_min(&cl->cl_deadline, cl->cl_rsc, cur_time, cl->cl_cumul); /* * update the eligible curve. * for concave, it is equal to the deadline curve. * for convex, it is a linear curve with slope m2. */ cl->cl_eligible = cl->cl_deadline; if (cl->cl_rsc->sm1 <= cl->cl_rsc->sm2) { cl->cl_eligible.dx = 0; cl->cl_eligible.dy = 0; } /* compute e and d */ cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul); cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); ellist_insert(cl); } static void update_ed(struct hfsc_class *cl, int next_len) { cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul); cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); ellist_update(cl); } static void update_d(struct hfsc_class *cl, int next_len) { cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); } static void init_vf(struct hfsc_class *cl, int len) { struct hfsc_class *max_cl, *p; u_int64_t vt, f, cur_time; int go_active; cur_time = 0; go_active = 1; for ( ; cl->cl_parent != NULL; cl = cl->cl_parent) { if (go_active && cl->cl_nactive++ == 0) go_active = 1; else go_active = 0; if (go_active) { max_cl = TAILQ_LAST(&cl->cl_parent->cl_actc, acthead); if (max_cl != NULL) { /* * set vt to the average of the min and max * classes. if the parent's period didn't * change, don't decrease vt of the class. */ vt = max_cl->cl_vt; if (cl->cl_parent->cl_cvtmin != 0) vt = (cl->cl_parent->cl_cvtmin + vt)/2; if (cl->cl_parent->cl_vtperiod != cl->cl_parentperiod || vt > cl->cl_vt) cl->cl_vt = vt; } else { /* * first child for a new parent backlog period. * add parent's cvtmax to vtoff of children * to make a new vt (vtoff + vt) larger than * the vt in the last period for all children. */ vt = cl->cl_parent->cl_cvtmax; for (p = cl->cl_parent->cl_children; p != NULL; p = p->cl_siblings) p->cl_vtoff += vt; cl->cl_vt = 0; cl->cl_parent->cl_cvtmax = 0; cl->cl_parent->cl_cvtmin = 0; } cl->cl_initvt = cl->cl_vt; /* update the virtual curve */ vt = cl->cl_vt + cl->cl_vtoff; rtsc_min(&cl->cl_virtual, cl->cl_fsc, vt, cl->cl_total); if (cl->cl_virtual.x == vt) { cl->cl_virtual.x -= cl->cl_vtoff; cl->cl_vtoff = 0; } cl->cl_vtadj = 0; cl->cl_vtperiod++; /* increment vt period */ cl->cl_parentperiod = cl->cl_parent->cl_vtperiod; if (cl->cl_parent->cl_nactive == 0) cl->cl_parentperiod++; cl->cl_f = 0; actlist_insert(cl); if (cl->cl_usc != NULL) { /* class has upper limit curve */ if (cur_time == 0) cur_time = read_machclk(); /* update the ulimit curve */ rtsc_min(&cl->cl_ulimit, cl->cl_usc, cur_time, cl->cl_total); /* compute myf */ cl->cl_myf = rtsc_y2x(&cl->cl_ulimit, cl->cl_total); cl->cl_myfadj = 0; } } if (cl->cl_myf > cl->cl_cfmin) f = cl->cl_myf; else f = cl->cl_cfmin; if (f != cl->cl_f) { cl->cl_f = f; update_cfmin(cl->cl_parent); } } } static void update_vf(struct hfsc_class *cl, int len, u_int64_t cur_time) { u_int64_t f, myf_bound, delta; int go_passive; go_passive = qempty(cl->cl_q); for (; cl->cl_parent != NULL; cl = cl->cl_parent) { cl->cl_total += len; if (cl->cl_fsc == NULL || cl->cl_nactive == 0) continue; if (go_passive && --cl->cl_nactive == 0) go_passive = 1; else go_passive = 0; if (go_passive) { /* no more active child, going passive */ /* update cvtmax of the parent class */ if (cl->cl_vt > cl->cl_parent->cl_cvtmax) cl->cl_parent->cl_cvtmax = cl->cl_vt; /* remove this class from the vt list */ actlist_remove(cl); update_cfmin(cl->cl_parent); continue; } /* * update vt and f */ cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total) - cl->cl_vtoff + cl->cl_vtadj; /* * if vt of the class is smaller than cvtmin, * the class was skipped in the past due to non-fit. * if so, we need to adjust vtadj. */ if (cl->cl_vt < cl->cl_parent->cl_cvtmin) { cl->cl_vtadj += cl->cl_parent->cl_cvtmin - cl->cl_vt; cl->cl_vt = cl->cl_parent->cl_cvtmin; } /* update the vt list */ actlist_update(cl); if (cl->cl_usc != NULL) { cl->cl_myf = cl->cl_myfadj + rtsc_y2x(&cl->cl_ulimit, cl->cl_total); /* * if myf lags behind by more than one clock tick * from the current time, adjust myfadj to prevent * a rate-limited class from going greedy. * in a steady state under rate-limiting, myf * fluctuates within one clock tick. */ myf_bound = cur_time - machclk_per_tick; if (cl->cl_myf < myf_bound) { delta = cur_time - cl->cl_myf; cl->cl_myfadj += delta; cl->cl_myf += delta; } } /* cl_f is max(cl_myf, cl_cfmin) */ if (cl->cl_myf > cl->cl_cfmin) f = cl->cl_myf; else f = cl->cl_cfmin; if (f != cl->cl_f) { cl->cl_f = f; update_cfmin(cl->cl_parent); } } } static void update_cfmin(struct hfsc_class *cl) { struct hfsc_class *p; u_int64_t cfmin; if (TAILQ_EMPTY(&cl->cl_actc)) { cl->cl_cfmin = 0; return; } cfmin = HT_INFINITY; TAILQ_FOREACH(p, &cl->cl_actc, cl_actlist) { if (p->cl_f == 0) { cl->cl_cfmin = 0; return; } if (p->cl_f < cfmin) cfmin = p->cl_f; } cl->cl_cfmin = cfmin; } /* * TAILQ based ellist and actlist implementation * (ion wanted to make a calendar queue based implementation) */ /* * eligible list holds backlogged classes being sorted by their eligible times. * there is one eligible list per interface. */ static void ellist_insert(struct hfsc_class *cl) { struct hfsc_if *hif = cl->cl_hif; struct hfsc_class *p; /* check the last entry first */ if ((p = TAILQ_LAST(&hif->hif_eligible, elighead)) == NULL || p->cl_e <= cl->cl_e) { TAILQ_INSERT_TAIL(&hif->hif_eligible, cl, cl_ellist); return; } TAILQ_FOREACH(p, &hif->hif_eligible, cl_ellist) { if (cl->cl_e < p->cl_e) { TAILQ_INSERT_BEFORE(p, cl, cl_ellist); return; } } ASSERT(0); /* should not reach here */ } static void ellist_remove(struct hfsc_class *cl) { struct hfsc_if *hif = cl->cl_hif; TAILQ_REMOVE(&hif->hif_eligible, cl, cl_ellist); } static void ellist_update(struct hfsc_class *cl) { struct hfsc_if *hif = cl->cl_hif; struct hfsc_class *p, *last; /* * the eligible time of a class increases monotonically. * if the next entry has a larger eligible time, nothing to do. */ p = TAILQ_NEXT(cl, cl_ellist); if (p == NULL || cl->cl_e <= p->cl_e) return; /* check the last entry */ last = TAILQ_LAST(&hif->hif_eligible, elighead); ASSERT(last != NULL); if (last->cl_e <= cl->cl_e) { TAILQ_REMOVE(&hif->hif_eligible, cl, cl_ellist); TAILQ_INSERT_TAIL(&hif->hif_eligible, cl, cl_ellist); return; } /* * the new position must be between the next entry * and the last entry */ while ((p = TAILQ_NEXT(p, cl_ellist)) != NULL) { if (cl->cl_e < p->cl_e) { TAILQ_REMOVE(&hif->hif_eligible, cl, cl_ellist); TAILQ_INSERT_BEFORE(p, cl, cl_ellist); return; } } ASSERT(0); /* should not reach here */ } /* find the class with the minimum deadline among the eligible classes */ struct hfsc_class * hfsc_get_mindl(struct hfsc_if *hif, u_int64_t cur_time) { struct hfsc_class *p, *cl = NULL; TAILQ_FOREACH(p, &hif->hif_eligible, cl_ellist) { if (p->cl_e > cur_time) break; if (cl == NULL || p->cl_d < cl->cl_d) cl = p; } return (cl); } /* * active children list holds backlogged child classes being sorted * by their virtual time. * each intermediate class has one active children list. */ static void actlist_insert(struct hfsc_class *cl) { struct hfsc_class *p; /* check the last entry first */ if ((p = TAILQ_LAST(&cl->cl_parent->cl_actc, acthead)) == NULL || p->cl_vt <= cl->cl_vt) { TAILQ_INSERT_TAIL(&cl->cl_parent->cl_actc, cl, cl_actlist); return; } TAILQ_FOREACH(p, &cl->cl_parent->cl_actc, cl_actlist) { if (cl->cl_vt < p->cl_vt) { TAILQ_INSERT_BEFORE(p, cl, cl_actlist); return; } } ASSERT(0); /* should not reach here */ } static void actlist_remove(struct hfsc_class *cl) { TAILQ_REMOVE(&cl->cl_parent->cl_actc, cl, cl_actlist); } static void actlist_update(struct hfsc_class *cl) { struct hfsc_class *p, *last; /* * the virtual time of a class increases monotonically during its * backlogged period. * if the next entry has a larger virtual time, nothing to do. */ p = TAILQ_NEXT(cl, cl_actlist); if (p == NULL || cl->cl_vt < p->cl_vt) return; /* check the last entry */ last = TAILQ_LAST(&cl->cl_parent->cl_actc, acthead); ASSERT(last != NULL); if (last->cl_vt <= cl->cl_vt) { TAILQ_REMOVE(&cl->cl_parent->cl_actc, cl, cl_actlist); TAILQ_INSERT_TAIL(&cl->cl_parent->cl_actc, cl, cl_actlist); return; } /* * the new position must be between the next entry * and the last entry */ while ((p = TAILQ_NEXT(p, cl_actlist)) != NULL) { if (cl->cl_vt < p->cl_vt) { TAILQ_REMOVE(&cl->cl_parent->cl_actc, cl, cl_actlist); TAILQ_INSERT_BEFORE(p, cl, cl_actlist); return; } } ASSERT(0); /* should not reach here */ } static struct hfsc_class * actlist_firstfit(struct hfsc_class *cl, u_int64_t cur_time) { struct hfsc_class *p; TAILQ_FOREACH(p, &cl->cl_actc, cl_actlist) { if (p->cl_f <= cur_time) return (p); } return (NULL); } /* * service curve support functions * * external service curve parameters * m: bits/sec * d: msec * internal service curve parameters * sm: (bytes/tsc_interval) << SM_SHIFT * ism: (tsc_count/byte) << ISM_SHIFT * dx: tsc_count * * SM_SHIFT and ISM_SHIFT are scaled in order to keep effective digits. * we should be able to handle 100K-1Gbps linkspeed with 200Hz-1GHz CPU * speed. SM_SHIFT and ISM_SHIFT are selected to have at least 3 effective * digits in decimal using the following table. * * bits/sec 100Kbps 1Mbps 10Mbps 100Mbps 1Gbps * ----------+------------------------------------------------------- * bytes/nsec 12.5e-6 125e-6 1250e-6 12500e-6 125000e-6 * sm(500MHz) 25.0e-6 250e-6 2500e-6 25000e-6 250000e-6 * sm(200MHz) 62.5e-6 625e-6 6250e-6 62500e-6 625000e-6 * * nsec/byte 80000 8000 800 80 8 * ism(500MHz) 40000 4000 400 40 4 * ism(200MHz) 16000 1600 160 16 1.6 */ #define SM_SHIFT 24 #define ISM_SHIFT 10 #define SM_MASK ((1LL << SM_SHIFT) - 1) #define ISM_MASK ((1LL << ISM_SHIFT) - 1) static __inline u_int64_t seg_x2y(u_int64_t x, u_int64_t sm) { u_int64_t y; /* * compute * y = x * sm >> SM_SHIFT * but divide it for the upper and lower bits to avoid overflow */ y = (x >> SM_SHIFT) * sm + (((x & SM_MASK) * sm) >> SM_SHIFT); return (y); } static __inline u_int64_t seg_y2x(u_int64_t y, u_int64_t ism) { u_int64_t x; if (y == 0) x = 0; else if (ism == HT_INFINITY) x = HT_INFINITY; else { x = (y >> ISM_SHIFT) * ism + (((y & ISM_MASK) * ism) >> ISM_SHIFT); } return (x); } static __inline u_int64_t m2sm(u_int m) { u_int64_t sm; sm = ((u_int64_t)m << SM_SHIFT) / 8 / machclk_freq; return (sm); } static __inline u_int64_t m2ism(u_int m) { u_int64_t ism; if (m == 0) ism = HT_INFINITY; else ism = ((u_int64_t)machclk_freq << ISM_SHIFT) * 8 / m; return (ism); } static __inline u_int64_t d2dx(u_int d) { u_int64_t dx; dx = ((u_int64_t)d * machclk_freq) / 1000; return (dx); } static u_int sm2m(u_int64_t sm) { u_int64_t m; m = (sm * 8 * machclk_freq) >> SM_SHIFT; return ((u_int)m); } static u_int dx2d(u_int64_t dx) { u_int64_t d; d = dx * 1000 / machclk_freq; return ((u_int)d); } static void sc2isc(struct service_curve *sc, struct internal_sc *isc) { isc->sm1 = m2sm(sc->m1); isc->ism1 = m2ism(sc->m1); isc->dx = d2dx(sc->d); isc->dy = seg_x2y(isc->dx, isc->sm1); isc->sm2 = m2sm(sc->m2); isc->ism2 = m2ism(sc->m2); } /* * initialize the runtime service curve with the given internal * service curve starting at (x, y). */ static void rtsc_init(struct runtime_sc *rtsc, struct internal_sc * isc, u_int64_t x, u_int64_t y) { rtsc->x = x; rtsc->y = y; rtsc->sm1 = isc->sm1; rtsc->ism1 = isc->ism1; rtsc->dx = isc->dx; rtsc->dy = isc->dy; rtsc->sm2 = isc->sm2; rtsc->ism2 = isc->ism2; } /* * calculate the y-projection of the runtime service curve by the * given x-projection value */ static u_int64_t rtsc_y2x(struct runtime_sc *rtsc, u_int64_t y) { u_int64_t x; if (y < rtsc->y) x = rtsc->x; else if (y <= rtsc->y + rtsc->dy) { /* x belongs to the 1st segment */ if (rtsc->dy == 0) x = rtsc->x + rtsc->dx; else x = rtsc->x + seg_y2x(y - rtsc->y, rtsc->ism1); } else { /* x belongs to the 2nd segment */ x = rtsc->x + rtsc->dx + seg_y2x(y - rtsc->y - rtsc->dy, rtsc->ism2); } return (x); } static u_int64_t rtsc_x2y(struct runtime_sc *rtsc, u_int64_t x) { u_int64_t y; if (x <= rtsc->x) y = rtsc->y; else if (x <= rtsc->x + rtsc->dx) /* y belongs to the 1st segment */ y = rtsc->y + seg_x2y(x - rtsc->x, rtsc->sm1); else /* y belongs to the 2nd segment */ y = rtsc->y + rtsc->dy + seg_x2y(x - rtsc->x - rtsc->dx, rtsc->sm2); return (y); } /* * update the runtime service curve by taking the minimum of the current * runtime service curve and the service curve starting at (x, y). */ static void rtsc_min(struct runtime_sc *rtsc, struct internal_sc *isc, u_int64_t x, u_int64_t y) { u_int64_t y1, y2, dx, dy; if (isc->sm1 <= isc->sm2) { /* service curve is convex */ y1 = rtsc_x2y(rtsc, x); if (y1 < y) /* the current rtsc is smaller */ return; rtsc->x = x; rtsc->y = y; return; } /* * service curve is concave * compute the two y values of the current rtsc * y1: at x * y2: at (x + dx) */ y1 = rtsc_x2y(rtsc, x); if (y1 <= y) { /* rtsc is below isc, no change to rtsc */ return; } y2 = rtsc_x2y(rtsc, x + isc->dx); if (y2 >= y + isc->dy) { /* rtsc is above isc, replace rtsc by isc */ rtsc->x = x; rtsc->y = y; rtsc->dx = isc->dx; rtsc->dy = isc->dy; return; } /* * the two curves intersect * compute the offsets (dx, dy) using the reverse * function of seg_x2y() * seg_x2y(dx, sm1) == seg_x2y(dx, sm2) + (y1 - y) */ dx = ((y1 - y) << SM_SHIFT) / (isc->sm1 - isc->sm2); /* * check if (x, y1) belongs to the 1st segment of rtsc. * if so, add the offset. */ if (rtsc->x + rtsc->dx > x) dx += rtsc->x + rtsc->dx - x; dy = seg_x2y(dx, isc->sm1); rtsc->x = x; rtsc->y = y; rtsc->dx = dx; rtsc->dy = dy; return; } static void get_class_stats(struct hfsc_classstats *sp, struct hfsc_class *cl) { sp->class_id = cl->cl_id; sp->class_handle = cl->cl_handle; if (cl->cl_rsc != NULL) { sp->rsc.m1 = sm2m(cl->cl_rsc->sm1); sp->rsc.d = dx2d(cl->cl_rsc->dx); sp->rsc.m2 = sm2m(cl->cl_rsc->sm2); } else { sp->rsc.m1 = 0; sp->rsc.d = 0; sp->rsc.m2 = 0; } if (cl->cl_fsc != NULL) { sp->fsc.m1 = sm2m(cl->cl_fsc->sm1); sp->fsc.d = dx2d(cl->cl_fsc->dx); sp->fsc.m2 = sm2m(cl->cl_fsc->sm2); } else { sp->fsc.m1 = 0; sp->fsc.d = 0; sp->fsc.m2 = 0; } if (cl->cl_usc != NULL) { sp->usc.m1 = sm2m(cl->cl_usc->sm1); sp->usc.d = dx2d(cl->cl_usc->dx); sp->usc.m2 = sm2m(cl->cl_usc->sm2); } else { sp->usc.m1 = 0; sp->usc.d = 0; sp->usc.m2 = 0; } sp->total = cl->cl_total; sp->cumul = cl->cl_cumul; sp->d = cl->cl_d; sp->e = cl->cl_e; sp->vt = cl->cl_vt; sp->f = cl->cl_f; sp->initvt = cl->cl_initvt; sp->vtperiod = cl->cl_vtperiod; sp->parentperiod = cl->cl_parentperiod; sp->nactive = cl->cl_nactive; sp->vtoff = cl->cl_vtoff; sp->cvtmax = cl->cl_cvtmax; sp->myf = cl->cl_myf; sp->cfmin = cl->cl_cfmin; sp->cvtmin = cl->cl_cvtmin; sp->myfadj = cl->cl_myfadj; sp->vtadj = cl->cl_vtadj; sp->cur_time = read_machclk(); sp->machclk_freq = machclk_freq; sp->qlength = qlen(cl->cl_q); sp->qlimit = qlimit(cl->cl_q); sp->xmit_cnt = cl->cl_stats.xmit_cnt; sp->drop_cnt = cl->cl_stats.drop_cnt; sp->period = cl->cl_stats.period; sp->qtype = qtype(cl->cl_q); #ifdef ALTQ_RED if (q_is_red(cl->cl_q)) red_getstats(cl->cl_red, &sp->red[0]); #endif #ifdef ALTQ_RIO if (q_is_rio(cl->cl_q)) rio_getstats((rio_t *)cl->cl_red, &sp->red[0]); #endif } /* convert a class handle to the corresponding class pointer */ static struct hfsc_class * clh_to_clp(struct hfsc_if *hif, u_int32_t chandle) { int i; struct hfsc_class *cl; if (chandle == 0) return (NULL); /* * first, try optimistically the slot matching the lower bits of * the handle. if it fails, do the linear table search. */ i = chandle % HFSC_MAX_CLASSES; if ((cl = hif->hif_class_tbl[i]) != NULL && cl->cl_handle == chandle) return (cl); for (i = 0; i < HFSC_MAX_CLASSES; i++) if ((cl = hif->hif_class_tbl[i]) != NULL && cl->cl_handle == chandle) return (cl); return (NULL); } #ifdef ALTQ3_COMPAT static struct hfsc_if * hfsc_attach(ifq, bandwidth) struct ifaltq *ifq; u_int bandwidth; { struct hfsc_if *hif; hif = malloc(sizeof(struct hfsc_if), M_DEVBUF, M_WAITOK); if (hif == NULL) return (NULL); bzero(hif, sizeof(struct hfsc_if)); hif->hif_eligible = ellist_alloc(); if (hif->hif_eligible == NULL) { free(hif, M_DEVBUF); return NULL; } hif->hif_ifq = ifq; /* add this state to the hfsc list */ hif->hif_next = hif_list; hif_list = hif; return (hif); } static int hfsc_detach(hif) struct hfsc_if *hif; { (void)hfsc_clear_interface(hif); (void)hfsc_class_destroy(hif->hif_rootclass); /* remove this interface from the hif list */ if (hif_list == hif) hif_list = hif->hif_next; else { struct hfsc_if *h; for (h = hif_list; h != NULL; h = h->hif_next) if (h->hif_next == hif) { h->hif_next = hif->hif_next; break; } ASSERT(h != NULL); } ellist_destroy(hif->hif_eligible); free(hif, M_DEVBUF); return (0); } static int hfsc_class_modify(cl, rsc, fsc, usc) struct hfsc_class *cl; struct service_curve *rsc, *fsc, *usc; { struct internal_sc *rsc_tmp, *fsc_tmp, *usc_tmp; u_int64_t cur_time; int s; rsc_tmp = fsc_tmp = usc_tmp = NULL; if (rsc != NULL && (rsc->m1 != 0 || rsc->m2 != 0) && cl->cl_rsc == NULL) { rsc_tmp = malloc(sizeof(struct internal_sc), M_DEVBUF, M_WAITOK); if (rsc_tmp == NULL) return (ENOMEM); } if (fsc != NULL && (fsc->m1 != 0 || fsc->m2 != 0) && cl->cl_fsc == NULL) { fsc_tmp = malloc(sizeof(struct internal_sc), M_DEVBUF, M_WAITOK); if (fsc_tmp == NULL) { free(rsc_tmp); return (ENOMEM); } } if (usc != NULL && (usc->m1 != 0 || usc->m2 != 0) && cl->cl_usc == NULL) { usc_tmp = malloc(sizeof(struct internal_sc), M_DEVBUF, M_WAITOK); if (usc_tmp == NULL) { free(rsc_tmp); free(fsc_tmp); return (ENOMEM); } } cur_time = read_machclk(); #ifdef __NetBSD__ s = splnet(); #else s = splimp(); #endif IFQ_LOCK(cl->cl_hif->hif_ifq); if (rsc != NULL) { if (rsc->m1 == 0 && rsc->m2 == 0) { if (cl->cl_rsc != NULL) { if (!qempty(cl->cl_q)) hfsc_purgeq(cl); free(cl->cl_rsc, M_DEVBUF); cl->cl_rsc = NULL; } } else { if (cl->cl_rsc == NULL) cl->cl_rsc = rsc_tmp; sc2isc(rsc, cl->cl_rsc); rtsc_init(&cl->cl_deadline, cl->cl_rsc, cur_time, cl->cl_cumul); cl->cl_eligible = cl->cl_deadline; if (cl->cl_rsc->sm1 <= cl->cl_rsc->sm2) { cl->cl_eligible.dx = 0; cl->cl_eligible.dy = 0; } } } if (fsc != NULL) { if (fsc->m1 == 0 && fsc->m2 == 0) { if (cl->cl_fsc != NULL) { if (!qempty(cl->cl_q)) hfsc_purgeq(cl); free(cl->cl_fsc, M_DEVBUF); cl->cl_fsc = NULL; } } else { if (cl->cl_fsc == NULL) cl->cl_fsc = fsc_tmp; sc2isc(fsc, cl->cl_fsc); rtsc_init(&cl->cl_virtual, cl->cl_fsc, cl->cl_vt, cl->cl_total); } } if (usc != NULL) { if (usc->m1 == 0 && usc->m2 == 0) { if (cl->cl_usc != NULL) { free(cl->cl_usc, M_DEVBUF); cl->cl_usc = NULL; cl->cl_myf = 0; } } else { if (cl->cl_usc == NULL) cl->cl_usc = usc_tmp; sc2isc(usc, cl->cl_usc); rtsc_init(&cl->cl_ulimit, cl->cl_usc, cur_time, cl->cl_total); } } if (!qempty(cl->cl_q)) { if (cl->cl_rsc != NULL) update_ed(cl, m_pktlen(qhead(cl->cl_q))); if (cl->cl_fsc != NULL) update_vf(cl, 0, cur_time); /* is this enough? */ } IFQ_UNLOCK(cl->cl_hif->hif_ifq); splx(s); return (0); } /* * hfsc device interface */ int hfscopen(dev, flag, fmt, p) dev_t dev; int flag, fmt; #if (__FreeBSD_version > 500000) struct thread *p; #else struct proc *p; #endif { if (machclk_freq == 0) init_machclk(); if (machclk_freq == 0) { printf("hfsc: no cpu clock available!\n"); return (ENXIO); } /* everything will be done when the queueing scheme is attached. */ return 0; } int hfscclose(dev, flag, fmt, p) dev_t dev; int flag, fmt; #if (__FreeBSD_version > 500000) struct thread *p; #else struct proc *p; #endif { struct hfsc_if *hif; int err, error = 0; while ((hif = hif_list) != NULL) { /* destroy all */ if (ALTQ_IS_ENABLED(hif->hif_ifq)) altq_disable(hif->hif_ifq); err = altq_detach(hif->hif_ifq); if (err == 0) err = hfsc_detach(hif); if (err != 0 && error == 0) error = err; } return error; } int hfscioctl(dev, cmd, addr, flag, p) dev_t dev; ioctlcmd_t cmd; caddr_t addr; int flag; #if (__FreeBSD_version > 500000) struct thread *p; #else struct proc *p; #endif { struct hfsc_if *hif; struct hfsc_interface *ifacep; int error = 0; /* check super-user privilege */ switch (cmd) { case HFSC_GETSTATS: break; default: #if (__FreeBSD_version > 700000) if ((error = priv_check(p, PRIV_ALTQ_MANAGE)) != 0) return (error); #elsif (__FreeBSD_version > 400000) if ((error = suser(p)) != 0) return (error); #else if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) return (error); #endif break; } switch (cmd) { case HFSC_IF_ATTACH: error = hfsccmd_if_attach((struct hfsc_attach *)addr); break; case HFSC_IF_DETACH: error = hfsccmd_if_detach((struct hfsc_interface *)addr); break; case HFSC_ENABLE: case HFSC_DISABLE: case HFSC_CLEAR_HIERARCHY: ifacep = (struct hfsc_interface *)addr; if ((hif = altq_lookup(ifacep->hfsc_ifname, ALTQT_HFSC)) == NULL) { error = EBADF; break; } switch (cmd) { case HFSC_ENABLE: if (hif->hif_defaultclass == NULL) { #ifdef ALTQ_DEBUG printf("hfsc: no default class\n"); #endif error = EINVAL; break; } error = altq_enable(hif->hif_ifq); break; case HFSC_DISABLE: error = altq_disable(hif->hif_ifq); break; case HFSC_CLEAR_HIERARCHY: hfsc_clear_interface(hif); break; } break; case HFSC_ADD_CLASS: error = hfsccmd_add_class((struct hfsc_add_class *)addr); break; case HFSC_DEL_CLASS: error = hfsccmd_delete_class((struct hfsc_delete_class *)addr); break; case HFSC_MOD_CLASS: error = hfsccmd_modify_class((struct hfsc_modify_class *)addr); break; case HFSC_ADD_FILTER: error = hfsccmd_add_filter((struct hfsc_add_filter *)addr); break; case HFSC_DEL_FILTER: error = hfsccmd_delete_filter((struct hfsc_delete_filter *)addr); break; case HFSC_GETSTATS: error = hfsccmd_class_stats((struct hfsc_class_stats *)addr); break; default: error = EINVAL; break; } return error; } static int hfsccmd_if_attach(ap) struct hfsc_attach *ap; { struct hfsc_if *hif; struct ifnet *ifp; int error; if ((ifp = ifunit(ap->iface.hfsc_ifname)) == NULL) return (ENXIO); if ((hif = hfsc_attach(&ifp->if_snd, ap->bandwidth)) == NULL) return (ENOMEM); /* * set HFSC to this ifnet structure. */ if ((error = altq_attach(&ifp->if_snd, ALTQT_HFSC, hif, hfsc_enqueue, hfsc_dequeue, hfsc_request, &hif->hif_classifier, acc_classify)) != 0) (void)hfsc_detach(hif); return (error); } static int hfsccmd_if_detach(ap) struct hfsc_interface *ap; { struct hfsc_if *hif; int error; if ((hif = altq_lookup(ap->hfsc_ifname, ALTQT_HFSC)) == NULL) return (EBADF); if (ALTQ_IS_ENABLED(hif->hif_ifq)) altq_disable(hif->hif_ifq); if ((error = altq_detach(hif->hif_ifq))) return (error); return hfsc_detach(hif); } static int hfsccmd_add_class(ap) struct hfsc_add_class *ap; { struct hfsc_if *hif; struct hfsc_class *cl, *parent; int i; if ((hif = altq_lookup(ap->iface.hfsc_ifname, ALTQT_HFSC)) == NULL) return (EBADF); if (ap->parent_handle == HFSC_NULLCLASS_HANDLE && hif->hif_rootclass == NULL) parent = NULL; else if ((parent = clh_to_clp(hif, ap->parent_handle)) == NULL) return (EINVAL); /* assign a class handle (use a free slot number for now) */ for (i = 1; i < HFSC_MAX_CLASSES; i++) if (hif->hif_class_tbl[i] == NULL) break; if (i == HFSC_MAX_CLASSES) return (EBUSY); if ((cl = hfsc_class_create(hif, &ap->service_curve, NULL, NULL, parent, ap->qlimit, ap->flags, i)) == NULL) return (ENOMEM); /* return a class handle to the user */ ap->class_handle = i; return (0); } static int hfsccmd_delete_class(ap) struct hfsc_delete_class *ap; { struct hfsc_if *hif; struct hfsc_class *cl; if ((hif = altq_lookup(ap->iface.hfsc_ifname, ALTQT_HFSC)) == NULL) return (EBADF); if ((cl = clh_to_clp(hif, ap->class_handle)) == NULL) return (EINVAL); return hfsc_class_destroy(cl); } static int hfsccmd_modify_class(ap) struct hfsc_modify_class *ap; { struct hfsc_if *hif; struct hfsc_class *cl; struct service_curve *rsc = NULL; struct service_curve *fsc = NULL; struct service_curve *usc = NULL; if ((hif = altq_lookup(ap->iface.hfsc_ifname, ALTQT_HFSC)) == NULL) return (EBADF); if ((cl = clh_to_clp(hif, ap->class_handle)) == NULL) return (EINVAL); if (ap->sctype & HFSC_REALTIMESC) rsc = &ap->service_curve; if (ap->sctype & HFSC_LINKSHARINGSC) fsc = &ap->service_curve; if (ap->sctype & HFSC_UPPERLIMITSC) usc = &ap->service_curve; return hfsc_class_modify(cl, rsc, fsc, usc); } static int hfsccmd_add_filter(ap) struct hfsc_add_filter *ap; { struct hfsc_if *hif; struct hfsc_class *cl; if ((hif = altq_lookup(ap->iface.hfsc_ifname, ALTQT_HFSC)) == NULL) return (EBADF); if ((cl = clh_to_clp(hif, ap->class_handle)) == NULL) return (EINVAL); if (is_a_parent_class(cl)) { #ifdef ALTQ_DEBUG printf("hfsccmd_add_filter: not a leaf class!\n"); #endif return (EINVAL); } return acc_add_filter(&hif->hif_classifier, &ap->filter, cl, &ap->filter_handle); } static int hfsccmd_delete_filter(ap) struct hfsc_delete_filter *ap; { struct hfsc_if *hif; if ((hif = altq_lookup(ap->iface.hfsc_ifname, ALTQT_HFSC)) == NULL) return (EBADF); return acc_delete_filter(&hif->hif_classifier, ap->filter_handle); } static int hfsccmd_class_stats(ap) struct hfsc_class_stats *ap; { struct hfsc_if *hif; struct hfsc_class *cl; struct hfsc_classstats stats, *usp; int n, nclasses, error; if ((hif = altq_lookup(ap->iface.hfsc_ifname, ALTQT_HFSC)) == NULL) return (EBADF); ap->cur_time = read_machclk(); ap->machclk_freq = machclk_freq; ap->hif_classes = hif->hif_classes; ap->hif_packets = hif->hif_packets; /* skip the first N classes in the tree */ nclasses = ap->nskip; for (cl = hif->hif_rootclass, n = 0; cl != NULL && n < nclasses; cl = hfsc_nextclass(cl), n++) ; if (n != nclasses) return (EINVAL); /* then, read the next N classes in the tree */ nclasses = ap->nclasses; usp = ap->stats; for (n = 0; cl != NULL && n < nclasses; cl = hfsc_nextclass(cl), n++) { get_class_stats(&stats, cl); if ((error = copyout((caddr_t)&stats, (caddr_t)usp++, sizeof(stats))) != 0) return (error); } ap->nclasses = n; return (0); } #ifdef KLD_MODULE static struct altqsw hfsc_sw = {"hfsc", hfscopen, hfscclose, hfscioctl}; ALTQ_MODULE(altq_hfsc, ALTQT_HFSC, &hfsc_sw); MODULE_DEPEND(altq_hfsc, altq_red, 1, 1, 1); MODULE_DEPEND(altq_hfsc, altq_rio, 1, 1, 1); #endif /* KLD_MODULE */ #endif /* ALTQ3_COMPAT */ #endif /* ALTQ_HFSC */ Index: stable/10/sys/contrib/altq/altq/altq_priq.c =================================================================== --- stable/10/sys/contrib/altq/altq/altq_priq.c (revision 263085) +++ stable/10/sys/contrib/altq/altq/altq_priq.c (revision 263086) @@ -1,1043 +1,1046 @@ /* $FreeBSD$ */ /* $KAME: altq_priq.c,v 1.11 2003/09/17 14:23:25 kjc Exp $ */ /* * Copyright (C) 2000-2003 * Sony Computer Science Laboratories Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * priority queue */ #if defined(__FreeBSD__) || defined(__NetBSD__) #include "opt_altq.h" #include "opt_inet.h" #ifdef __FreeBSD__ #include "opt_inet6.h" #endif #endif /* __FreeBSD__ || __NetBSD__ */ #ifdef ALTQ_PRIQ /* priq is enabled by ALTQ_PRIQ option in opt_altq.h */ #include #include #include #include #include #include #include #include #include #include #include +#include #include -#include +#include +#include +#include #include #ifdef ALTQ3_COMPAT #include #endif #include /* * function prototypes */ #ifdef ALTQ3_COMPAT static struct priq_if *priq_attach(struct ifaltq *, u_int); static int priq_detach(struct priq_if *); #endif static int priq_clear_interface(struct priq_if *); static int priq_request(struct ifaltq *, int, void *); static void priq_purge(struct priq_if *); static struct priq_class *priq_class_create(struct priq_if *, int, int, int, int); static int priq_class_destroy(struct priq_class *); static int priq_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *); static struct mbuf *priq_dequeue(struct ifaltq *, int); static int priq_addq(struct priq_class *, struct mbuf *); static struct mbuf *priq_getq(struct priq_class *); static struct mbuf *priq_pollq(struct priq_class *); static void priq_purgeq(struct priq_class *); #ifdef ALTQ3_COMPAT static int priqcmd_if_attach(struct priq_interface *); static int priqcmd_if_detach(struct priq_interface *); static int priqcmd_add_class(struct priq_add_class *); static int priqcmd_delete_class(struct priq_delete_class *); static int priqcmd_modify_class(struct priq_modify_class *); static int priqcmd_add_filter(struct priq_add_filter *); static int priqcmd_delete_filter(struct priq_delete_filter *); static int priqcmd_class_stats(struct priq_class_stats *); #endif /* ALTQ3_COMPAT */ static void get_class_stats(struct priq_classstats *, struct priq_class *); static struct priq_class *clh_to_clp(struct priq_if *, u_int32_t); #ifdef ALTQ3_COMPAT altqdev_decl(priq); /* pif_list keeps all priq_if's allocated. */ static struct priq_if *pif_list = NULL; #endif /* ALTQ3_COMPAT */ int priq_pfattach(struct pf_altq *a) { struct ifnet *ifp; int s, error; if ((ifp = ifunit(a->ifname)) == NULL || a->altq_disc == NULL) return (EINVAL); #ifdef __NetBSD__ s = splnet(); #else s = splimp(); #endif error = altq_attach(&ifp->if_snd, ALTQT_PRIQ, a->altq_disc, priq_enqueue, priq_dequeue, priq_request, NULL, NULL); splx(s); return (error); } int priq_add_altq(struct pf_altq *a) { struct priq_if *pif; struct ifnet *ifp; if ((ifp = ifunit(a->ifname)) == NULL) return (EINVAL); if (!ALTQ_IS_READY(&ifp->if_snd)) return (ENODEV); pif = malloc(sizeof(struct priq_if), M_DEVBUF, M_NOWAIT | M_ZERO); if (pif == NULL) return (ENOMEM); pif->pif_bandwidth = a->ifbandwidth; pif->pif_maxpri = -1; pif->pif_ifq = &ifp->if_snd; /* keep the state in pf_altq */ a->altq_disc = pif; return (0); } int priq_remove_altq(struct pf_altq *a) { struct priq_if *pif; if ((pif = a->altq_disc) == NULL) return (EINVAL); a->altq_disc = NULL; (void)priq_clear_interface(pif); free(pif, M_DEVBUF); return (0); } int priq_add_queue(struct pf_altq *a) { struct priq_if *pif; struct priq_class *cl; if ((pif = a->altq_disc) == NULL) return (EINVAL); /* check parameters */ if (a->priority >= PRIQ_MAXPRI) return (EINVAL); if (a->qid == 0) return (EINVAL); if (pif->pif_classes[a->priority] != NULL) return (EBUSY); if (clh_to_clp(pif, a->qid) != NULL) return (EBUSY); cl = priq_class_create(pif, a->priority, a->qlimit, a->pq_u.priq_opts.flags, a->qid); if (cl == NULL) return (ENOMEM); return (0); } int priq_remove_queue(struct pf_altq *a) { struct priq_if *pif; struct priq_class *cl; if ((pif = a->altq_disc) == NULL) return (EINVAL); if ((cl = clh_to_clp(pif, a->qid)) == NULL) return (EINVAL); return (priq_class_destroy(cl)); } int priq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes) { struct priq_if *pif; struct priq_class *cl; struct priq_classstats stats; int error = 0; if ((pif = altq_lookup(a->ifname, ALTQT_PRIQ)) == NULL) return (EBADF); if ((cl = clh_to_clp(pif, a->qid)) == NULL) return (EINVAL); if (*nbytes < sizeof(stats)) return (EINVAL); get_class_stats(&stats, cl); if ((error = copyout((caddr_t)&stats, ubuf, sizeof(stats))) != 0) return (error); *nbytes = sizeof(stats); return (0); } /* * bring the interface back to the initial state by discarding * all the filters and classes. */ static int priq_clear_interface(struct priq_if *pif) { struct priq_class *cl; int pri; #ifdef ALTQ3_CLFIER_COMPAT /* free the filters for this interface */ acc_discard_filters(&pif->pif_classifier, NULL, 1); #endif /* clear out the classes */ for (pri = 0; pri <= pif->pif_maxpri; pri++) if ((cl = pif->pif_classes[pri]) != NULL) priq_class_destroy(cl); return (0); } static int priq_request(struct ifaltq *ifq, int req, void *arg) { struct priq_if *pif = (struct priq_if *)ifq->altq_disc; IFQ_LOCK_ASSERT(ifq); switch (req) { case ALTRQ_PURGE: priq_purge(pif); break; } return (0); } /* discard all the queued packets on the interface */ static void priq_purge(struct priq_if *pif) { struct priq_class *cl; int pri; for (pri = 0; pri <= pif->pif_maxpri; pri++) { if ((cl = pif->pif_classes[pri]) != NULL && !qempty(cl->cl_q)) priq_purgeq(cl); } if (ALTQ_IS_ENABLED(pif->pif_ifq)) pif->pif_ifq->ifq_len = 0; } static struct priq_class * priq_class_create(struct priq_if *pif, int pri, int qlimit, int flags, int qid) { struct priq_class *cl; int s; #ifndef ALTQ_RED if (flags & PRCF_RED) { #ifdef ALTQ_DEBUG printf("priq_class_create: RED not configured for PRIQ!\n"); #endif return (NULL); } #endif if ((cl = pif->pif_classes[pri]) != NULL) { /* modify the class instead of creating a new one */ #ifdef __NetBSD__ s = splnet(); #else s = splimp(); #endif IFQ_LOCK(cl->cl_pif->pif_ifq); if (!qempty(cl->cl_q)) priq_purgeq(cl); IFQ_UNLOCK(cl->cl_pif->pif_ifq); splx(s); #ifdef ALTQ_RIO if (q_is_rio(cl->cl_q)) rio_destroy((rio_t *)cl->cl_red); #endif #ifdef ALTQ_RED if (q_is_red(cl->cl_q)) red_destroy(cl->cl_red); #endif } else { cl = malloc(sizeof(struct priq_class), M_DEVBUF, M_NOWAIT | M_ZERO); if (cl == NULL) return (NULL); cl->cl_q = malloc(sizeof(class_queue_t), M_DEVBUF, M_NOWAIT | M_ZERO); if (cl->cl_q == NULL) goto err_ret; } pif->pif_classes[pri] = cl; if (flags & PRCF_DEFAULTCLASS) pif->pif_default = cl; if (qlimit == 0) qlimit = 50; /* use default */ qlimit(cl->cl_q) = qlimit; qtype(cl->cl_q) = Q_DROPTAIL; qlen(cl->cl_q) = 0; cl->cl_flags = flags; cl->cl_pri = pri; if (pri > pif->pif_maxpri) pif->pif_maxpri = pri; cl->cl_pif = pif; cl->cl_handle = qid; #ifdef ALTQ_RED if (flags & (PRCF_RED|PRCF_RIO)) { int red_flags, red_pkttime; red_flags = 0; if (flags & PRCF_ECN) red_flags |= REDF_ECN; #ifdef ALTQ_RIO if (flags & PRCF_CLEARDSCP) red_flags |= RIOF_CLEARDSCP; #endif if (pif->pif_bandwidth < 8) red_pkttime = 1000 * 1000 * 1000; /* 1 sec */ else red_pkttime = (int64_t)pif->pif_ifq->altq_ifp->if_mtu * 1000 * 1000 * 1000 / (pif->pif_bandwidth / 8); #ifdef ALTQ_RIO if (flags & PRCF_RIO) { cl->cl_red = (red_t *)rio_alloc(0, NULL, red_flags, red_pkttime); if (cl->cl_red == NULL) goto err_ret; qtype(cl->cl_q) = Q_RIO; } else #endif if (flags & PRCF_RED) { cl->cl_red = red_alloc(0, 0, qlimit(cl->cl_q) * 10/100, qlimit(cl->cl_q) * 30/100, red_flags, red_pkttime); if (cl->cl_red == NULL) goto err_ret; qtype(cl->cl_q) = Q_RED; } } #endif /* ALTQ_RED */ return (cl); err_ret: if (cl->cl_red != NULL) { #ifdef ALTQ_RIO if (q_is_rio(cl->cl_q)) rio_destroy((rio_t *)cl->cl_red); #endif #ifdef ALTQ_RED if (q_is_red(cl->cl_q)) red_destroy(cl->cl_red); #endif } if (cl->cl_q != NULL) free(cl->cl_q, M_DEVBUF); free(cl, M_DEVBUF); return (NULL); } static int priq_class_destroy(struct priq_class *cl) { struct priq_if *pif; int s, pri; #ifdef __NetBSD__ s = splnet(); #else s = splimp(); #endif IFQ_LOCK(cl->cl_pif->pif_ifq); #ifdef ALTQ3_CLFIER_COMPAT /* delete filters referencing to this class */ acc_discard_filters(&cl->cl_pif->pif_classifier, cl, 0); #endif if (!qempty(cl->cl_q)) priq_purgeq(cl); pif = cl->cl_pif; pif->pif_classes[cl->cl_pri] = NULL; if (pif->pif_maxpri == cl->cl_pri) { for (pri = cl->cl_pri; pri >= 0; pri--) if (pif->pif_classes[pri] != NULL) { pif->pif_maxpri = pri; break; } if (pri < 0) pif->pif_maxpri = -1; } IFQ_UNLOCK(cl->cl_pif->pif_ifq); splx(s); if (cl->cl_red != NULL) { #ifdef ALTQ_RIO if (q_is_rio(cl->cl_q)) rio_destroy((rio_t *)cl->cl_red); #endif #ifdef ALTQ_RED if (q_is_red(cl->cl_q)) red_destroy(cl->cl_red); #endif } free(cl->cl_q, M_DEVBUF); free(cl, M_DEVBUF); return (0); } /* * priq_enqueue is an enqueue function to be registered to * (*altq_enqueue) in struct ifaltq. */ static int priq_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr) { struct priq_if *pif = (struct priq_if *)ifq->altq_disc; struct priq_class *cl; struct pf_mtag *t; int len; IFQ_LOCK_ASSERT(ifq); /* grab class set by classifier */ if ((m->m_flags & M_PKTHDR) == 0) { /* should not happen */ printf("altq: packet for %s does not have pkthdr\n", ifq->altq_ifp->if_xname); m_freem(m); return (ENOBUFS); } cl = NULL; if ((t = pf_find_mtag(m)) != NULL) cl = clh_to_clp(pif, t->qid); #ifdef ALTQ3_COMPAT else if ((ifq->altq_flags & ALTQF_CLASSIFY) && pktattr != NULL) cl = pktattr->pattr_class; #endif if (cl == NULL) { cl = pif->pif_default; if (cl == NULL) { m_freem(m); return (ENOBUFS); } } #ifdef ALTQ3_COMPAT if (pktattr != NULL) cl->cl_pktattr = pktattr; /* save proto hdr used by ECN */ else #endif cl->cl_pktattr = NULL; len = m_pktlen(m); if (priq_addq(cl, m) != 0) { /* drop occurred. mbuf was freed in priq_addq. */ PKTCNTR_ADD(&cl->cl_dropcnt, len); return (ENOBUFS); } IFQ_INC_LEN(ifq); /* successfully queued. */ return (0); } /* * priq_dequeue is a dequeue function to be registered to * (*altq_dequeue) in struct ifaltq. * * note: ALTDQ_POLL returns the next packet without removing the packet * from the queue. ALTDQ_REMOVE is a normal dequeue operation. * ALTDQ_REMOVE must return the same packet if called immediately * after ALTDQ_POLL. */ static struct mbuf * priq_dequeue(struct ifaltq *ifq, int op) { struct priq_if *pif = (struct priq_if *)ifq->altq_disc; struct priq_class *cl; struct mbuf *m; int pri; IFQ_LOCK_ASSERT(ifq); if (IFQ_IS_EMPTY(ifq)) /* no packet in the queue */ return (NULL); for (pri = pif->pif_maxpri; pri >= 0; pri--) { if ((cl = pif->pif_classes[pri]) != NULL && !qempty(cl->cl_q)) { if (op == ALTDQ_POLL) return (priq_pollq(cl)); m = priq_getq(cl); if (m != NULL) { IFQ_DEC_LEN(ifq); if (qempty(cl->cl_q)) cl->cl_period++; PKTCNTR_ADD(&cl->cl_xmitcnt, m_pktlen(m)); } return (m); } } return (NULL); } static int priq_addq(struct priq_class *cl, struct mbuf *m) { #ifdef ALTQ_RIO if (q_is_rio(cl->cl_q)) return rio_addq((rio_t *)cl->cl_red, cl->cl_q, m, cl->cl_pktattr); #endif #ifdef ALTQ_RED if (q_is_red(cl->cl_q)) return red_addq(cl->cl_red, cl->cl_q, m, cl->cl_pktattr); #endif if (qlen(cl->cl_q) >= qlimit(cl->cl_q)) { m_freem(m); return (-1); } if (cl->cl_flags & PRCF_CLEARDSCP) write_dsfield(m, cl->cl_pktattr, 0); _addq(cl->cl_q, m); return (0); } static struct mbuf * priq_getq(struct priq_class *cl) { #ifdef ALTQ_RIO if (q_is_rio(cl->cl_q)) return rio_getq((rio_t *)cl->cl_red, cl->cl_q); #endif #ifdef ALTQ_RED if (q_is_red(cl->cl_q)) return red_getq(cl->cl_red, cl->cl_q); #endif return _getq(cl->cl_q); } static struct mbuf * priq_pollq(cl) struct priq_class *cl; { return qhead(cl->cl_q); } static void priq_purgeq(struct priq_class *cl) { struct mbuf *m; if (qempty(cl->cl_q)) return; while ((m = _getq(cl->cl_q)) != NULL) { PKTCNTR_ADD(&cl->cl_dropcnt, m_pktlen(m)); m_freem(m); } ASSERT(qlen(cl->cl_q) == 0); } static void get_class_stats(struct priq_classstats *sp, struct priq_class *cl) { sp->class_handle = cl->cl_handle; sp->qlength = qlen(cl->cl_q); sp->qlimit = qlimit(cl->cl_q); sp->period = cl->cl_period; sp->xmitcnt = cl->cl_xmitcnt; sp->dropcnt = cl->cl_dropcnt; sp->qtype = qtype(cl->cl_q); #ifdef ALTQ_RED if (q_is_red(cl->cl_q)) red_getstats(cl->cl_red, &sp->red[0]); #endif #ifdef ALTQ_RIO if (q_is_rio(cl->cl_q)) rio_getstats((rio_t *)cl->cl_red, &sp->red[0]); #endif } /* convert a class handle to the corresponding class pointer */ static struct priq_class * clh_to_clp(struct priq_if *pif, u_int32_t chandle) { struct priq_class *cl; int idx; if (chandle == 0) return (NULL); for (idx = pif->pif_maxpri; idx >= 0; idx--) if ((cl = pif->pif_classes[idx]) != NULL && cl->cl_handle == chandle) return (cl); return (NULL); } #ifdef ALTQ3_COMPAT static struct priq_if * priq_attach(ifq, bandwidth) struct ifaltq *ifq; u_int bandwidth; { struct priq_if *pif; pif = malloc(sizeof(struct priq_if), M_DEVBUF, M_WAITOK); if (pif == NULL) return (NULL); bzero(pif, sizeof(struct priq_if)); pif->pif_bandwidth = bandwidth; pif->pif_maxpri = -1; pif->pif_ifq = ifq; /* add this state to the priq list */ pif->pif_next = pif_list; pif_list = pif; return (pif); } static int priq_detach(pif) struct priq_if *pif; { (void)priq_clear_interface(pif); /* remove this interface from the pif list */ if (pif_list == pif) pif_list = pif->pif_next; else { struct priq_if *p; for (p = pif_list; p != NULL; p = p->pif_next) if (p->pif_next == pif) { p->pif_next = pif->pif_next; break; } ASSERT(p != NULL); } free(pif, M_DEVBUF); return (0); } /* * priq device interface */ int priqopen(dev, flag, fmt, p) dev_t dev; int flag, fmt; #if (__FreeBSD_version > 500000) struct thread *p; #else struct proc *p; #endif { /* everything will be done when the queueing scheme is attached. */ return 0; } int priqclose(dev, flag, fmt, p) dev_t dev; int flag, fmt; #if (__FreeBSD_version > 500000) struct thread *p; #else struct proc *p; #endif { struct priq_if *pif; int err, error = 0; while ((pif = pif_list) != NULL) { /* destroy all */ if (ALTQ_IS_ENABLED(pif->pif_ifq)) altq_disable(pif->pif_ifq); err = altq_detach(pif->pif_ifq); if (err == 0) err = priq_detach(pif); if (err != 0 && error == 0) error = err; } return error; } int priqioctl(dev, cmd, addr, flag, p) dev_t dev; ioctlcmd_t cmd; caddr_t addr; int flag; #if (__FreeBSD_version > 500000) struct thread *p; #else struct proc *p; #endif { struct priq_if *pif; struct priq_interface *ifacep; int error = 0; /* check super-user privilege */ switch (cmd) { case PRIQ_GETSTATS: break; default: #if (__FreeBSD_version > 700000) if ((error = priv_check(p, PRIV_ALTQ_MANAGE)) != 0) return (error); #elsif (__FreeBSD_version > 400000) if ((error = suser(p)) != 0) return (error); #else if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) return (error); #endif break; } switch (cmd) { case PRIQ_IF_ATTACH: error = priqcmd_if_attach((struct priq_interface *)addr); break; case PRIQ_IF_DETACH: error = priqcmd_if_detach((struct priq_interface *)addr); break; case PRIQ_ENABLE: case PRIQ_DISABLE: case PRIQ_CLEAR: ifacep = (struct priq_interface *)addr; if ((pif = altq_lookup(ifacep->ifname, ALTQT_PRIQ)) == NULL) { error = EBADF; break; } switch (cmd) { case PRIQ_ENABLE: if (pif->pif_default == NULL) { #ifdef ALTQ_DEBUG printf("priq: no default class\n"); #endif error = EINVAL; break; } error = altq_enable(pif->pif_ifq); break; case PRIQ_DISABLE: error = altq_disable(pif->pif_ifq); break; case PRIQ_CLEAR: priq_clear_interface(pif); break; } break; case PRIQ_ADD_CLASS: error = priqcmd_add_class((struct priq_add_class *)addr); break; case PRIQ_DEL_CLASS: error = priqcmd_delete_class((struct priq_delete_class *)addr); break; case PRIQ_MOD_CLASS: error = priqcmd_modify_class((struct priq_modify_class *)addr); break; case PRIQ_ADD_FILTER: error = priqcmd_add_filter((struct priq_add_filter *)addr); break; case PRIQ_DEL_FILTER: error = priqcmd_delete_filter((struct priq_delete_filter *)addr); break; case PRIQ_GETSTATS: error = priqcmd_class_stats((struct priq_class_stats *)addr); break; default: error = EINVAL; break; } return error; } static int priqcmd_if_attach(ap) struct priq_interface *ap; { struct priq_if *pif; struct ifnet *ifp; int error; if ((ifp = ifunit(ap->ifname)) == NULL) return (ENXIO); if ((pif = priq_attach(&ifp->if_snd, ap->arg)) == NULL) return (ENOMEM); /* * set PRIQ to this ifnet structure. */ if ((error = altq_attach(&ifp->if_snd, ALTQT_PRIQ, pif, priq_enqueue, priq_dequeue, priq_request, &pif->pif_classifier, acc_classify)) != 0) (void)priq_detach(pif); return (error); } static int priqcmd_if_detach(ap) struct priq_interface *ap; { struct priq_if *pif; int error; if ((pif = altq_lookup(ap->ifname, ALTQT_PRIQ)) == NULL) return (EBADF); if (ALTQ_IS_ENABLED(pif->pif_ifq)) altq_disable(pif->pif_ifq); if ((error = altq_detach(pif->pif_ifq))) return (error); return priq_detach(pif); } static int priqcmd_add_class(ap) struct priq_add_class *ap; { struct priq_if *pif; struct priq_class *cl; int qid; if ((pif = altq_lookup(ap->iface.ifname, ALTQT_PRIQ)) == NULL) return (EBADF); if (ap->pri < 0 || ap->pri >= PRIQ_MAXPRI) return (EINVAL); if (pif->pif_classes[ap->pri] != NULL) return (EBUSY); qid = ap->pri + 1; if ((cl = priq_class_create(pif, ap->pri, ap->qlimit, ap->flags, qid)) == NULL) return (ENOMEM); /* return a class handle to the user */ ap->class_handle = cl->cl_handle; return (0); } static int priqcmd_delete_class(ap) struct priq_delete_class *ap; { struct priq_if *pif; struct priq_class *cl; if ((pif = altq_lookup(ap->iface.ifname, ALTQT_PRIQ)) == NULL) return (EBADF); if ((cl = clh_to_clp(pif, ap->class_handle)) == NULL) return (EINVAL); return priq_class_destroy(cl); } static int priqcmd_modify_class(ap) struct priq_modify_class *ap; { struct priq_if *pif; struct priq_class *cl; if ((pif = altq_lookup(ap->iface.ifname, ALTQT_PRIQ)) == NULL) return (EBADF); if (ap->pri < 0 || ap->pri >= PRIQ_MAXPRI) return (EINVAL); if ((cl = clh_to_clp(pif, ap->class_handle)) == NULL) return (EINVAL); /* * if priority is changed, move the class to the new priority */ if (pif->pif_classes[ap->pri] != cl) { if (pif->pif_classes[ap->pri] != NULL) return (EEXIST); pif->pif_classes[cl->cl_pri] = NULL; pif->pif_classes[ap->pri] = cl; cl->cl_pri = ap->pri; } /* call priq_class_create to change class parameters */ if ((cl = priq_class_create(pif, ap->pri, ap->qlimit, ap->flags, ap->class_handle)) == NULL) return (ENOMEM); return 0; } static int priqcmd_add_filter(ap) struct priq_add_filter *ap; { struct priq_if *pif; struct priq_class *cl; if ((pif = altq_lookup(ap->iface.ifname, ALTQT_PRIQ)) == NULL) return (EBADF); if ((cl = clh_to_clp(pif, ap->class_handle)) == NULL) return (EINVAL); return acc_add_filter(&pif->pif_classifier, &ap->filter, cl, &ap->filter_handle); } static int priqcmd_delete_filter(ap) struct priq_delete_filter *ap; { struct priq_if *pif; if ((pif = altq_lookup(ap->iface.ifname, ALTQT_PRIQ)) == NULL) return (EBADF); return acc_delete_filter(&pif->pif_classifier, ap->filter_handle); } static int priqcmd_class_stats(ap) struct priq_class_stats *ap; { struct priq_if *pif; struct priq_class *cl; struct priq_classstats stats, *usp; int pri, error; if ((pif = altq_lookup(ap->iface.ifname, ALTQT_PRIQ)) == NULL) return (EBADF); ap->maxpri = pif->pif_maxpri; /* then, read the next N classes in the tree */ usp = ap->stats; for (pri = 0; pri <= pif->pif_maxpri; pri++) { cl = pif->pif_classes[pri]; if (cl != NULL) get_class_stats(&stats, cl); else bzero(&stats, sizeof(stats)); if ((error = copyout((caddr_t)&stats, (caddr_t)usp++, sizeof(stats))) != 0) return (error); } return (0); } #ifdef KLD_MODULE static struct altqsw priq_sw = {"priq", priqopen, priqclose, priqioctl}; ALTQ_MODULE(altq_priq, ALTQT_PRIQ, &priq_sw); MODULE_DEPEND(altq_priq, altq_red, 1, 1, 1); MODULE_DEPEND(altq_priq, altq_rio, 1, 1, 1); #endif /* KLD_MODULE */ #endif /* ALTQ3_COMPAT */ #endif /* ALTQ_PRIQ */ Index: stable/10/sys/contrib/altq/altq/altq_red.c =================================================================== --- stable/10/sys/contrib/altq/altq/altq_red.c (revision 263085) +++ stable/10/sys/contrib/altq/altq/altq_red.c (revision 263086) @@ -1,1497 +1,1500 @@ /* $FreeBSD$ */ /* $KAME: altq_red.c,v 1.18 2003/09/05 22:40:36 itojun Exp $ */ /* * Copyright (C) 1997-2003 * Sony Computer Science Laboratories Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * Copyright (c) 1990-1994 Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the Computer Systems * Engineering Group at Lawrence Berkeley Laboratory. * 4. Neither the name of the University nor of the Laboratory may be used * to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #if defined(__FreeBSD__) || defined(__NetBSD__) #include "opt_altq.h" #include "opt_inet.h" #ifdef __FreeBSD__ #include "opt_inet6.h" #endif #endif /* __FreeBSD__ || __NetBSD__ */ #ifdef ALTQ_RED /* red is enabled by ALTQ_RED option in opt_altq.h */ #include #include #include #include #include #include #if 1 /* ALTQ3_COMPAT */ #include #include #include #ifdef ALTQ_FLOWVALVE #include #include #endif #endif /* ALTQ3_COMPAT */ #include +#include #include #include #include #ifdef INET6 #include #endif -#include +#include +#include +#include #include #include #ifdef ALTQ3_COMPAT #include #ifdef ALTQ_FLOWVALVE #include #endif #endif /* * ALTQ/RED (Random Early Detection) implementation using 32-bit * fixed-point calculation. * * written by kjc using the ns code as a reference. * you can learn more about red and ns from Sally's home page at * http://www-nrg.ee.lbl.gov/floyd/ * * most of the red parameter values are fixed in this implementation * to prevent fixed-point overflow/underflow. * if you change the parameters, watch out for overflow/underflow! * * the parameters used are recommended values by Sally. * the corresponding ns config looks: * q_weight=0.00195 * minthresh=5 maxthresh=15 queue-size=60 * linterm=30 * dropmech=drop-tail * bytes=false (can't be handled by 32-bit fixed-point) * doubleq=false dqthresh=false * wait=true */ /* * alternative red parameters for a slow link. * * assume the queue length becomes from zero to L and keeps L, it takes * N packets for q_avg to reach 63% of L. * when q_weight is 0.002, N is about 500 packets. * for a slow link like dial-up, 500 packets takes more than 1 minute! * when q_weight is 0.008, N is about 127 packets. * when q_weight is 0.016, N is about 63 packets. * bursts of 50 packets are allowed for 0.002, bursts of 25 packets * are allowed for 0.016. * see Sally's paper for more details. */ /* normal red parameters */ #define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */ /* q_weight = 0.00195 */ /* red parameters for a slow link */ #define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */ /* q_weight = 0.0078125 */ /* red parameters for a very slow link (e.g., dialup) */ #define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */ /* q_weight = 0.015625 */ /* fixed-point uses 12-bit decimal places */ #define FP_SHIFT 12 /* fixed-point shift */ /* red parameters for drop probability */ #define INV_P_MAX 10 /* inverse of max drop probability */ #define TH_MIN 5 /* min threshold */ #define TH_MAX 15 /* max threshold */ #define RED_LIMIT 60 /* default max queue lenght */ #define RED_STATS /* collect statistics */ /* * our default policy for forced-drop is drop-tail. * (in altq-1.1.2 or earlier, the default was random-drop. * but it makes more sense to punish the cause of the surge.) * to switch to the random-drop policy, define "RED_RANDOM_DROP". */ #ifdef ALTQ3_COMPAT #ifdef ALTQ_FLOWVALVE /* * flow-valve is an extention to protect red from unresponsive flows * and to promote end-to-end congestion control. * flow-valve observes the average drop rates of the flows that have * experienced packet drops in the recent past. * when the average drop rate exceeds the threshold, the flow is * blocked by the flow-valve. the trapped flow should back off * exponentially to escape from the flow-valve. */ #ifdef RED_RANDOM_DROP #error "random-drop can't be used with flow-valve!" #endif #endif /* ALTQ_FLOWVALVE */ /* red_list keeps all red_queue_t's allocated. */ static red_queue_t *red_list = NULL; #endif /* ALTQ3_COMPAT */ /* default red parameter values */ static int default_th_min = TH_MIN; static int default_th_max = TH_MAX; static int default_inv_pmax = INV_P_MAX; #ifdef ALTQ3_COMPAT /* internal function prototypes */ static int red_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *); static struct mbuf *red_dequeue(struct ifaltq *, int); static int red_request(struct ifaltq *, int, void *); static void red_purgeq(red_queue_t *); static int red_detach(red_queue_t *); #ifdef ALTQ_FLOWVALVE static __inline struct fve *flowlist_lookup(struct flowvalve *, struct altq_pktattr *, struct timeval *); static __inline struct fve *flowlist_reclaim(struct flowvalve *, struct altq_pktattr *); static __inline void flowlist_move_to_head(struct flowvalve *, struct fve *); static __inline int fv_p2f(struct flowvalve *, int); #if 0 /* XXX: make the compiler happy (fv_alloc unused) */ static struct flowvalve *fv_alloc(struct red *); #endif static void fv_destroy(struct flowvalve *); static int fv_checkflow(struct flowvalve *, struct altq_pktattr *, struct fve **); static void fv_dropbyred(struct flowvalve *fv, struct altq_pktattr *, struct fve *); #endif #endif /* ALTQ3_COMPAT */ /* * red support routines */ red_t * red_alloc(int weight, int inv_pmax, int th_min, int th_max, int flags, int pkttime) { red_t *rp; int w, i; int npkts_per_sec; rp = malloc(sizeof(red_t), M_DEVBUF, M_NOWAIT | M_ZERO); if (rp == NULL) return (NULL); if (weight == 0) rp->red_weight = W_WEIGHT; else rp->red_weight = weight; /* allocate weight table */ rp->red_wtab = wtab_alloc(rp->red_weight); if (rp->red_wtab == NULL) { free(rp, M_DEVBUF); return (NULL); } rp->red_avg = 0; rp->red_idle = 1; if (inv_pmax == 0) rp->red_inv_pmax = default_inv_pmax; else rp->red_inv_pmax = inv_pmax; if (th_min == 0) rp->red_thmin = default_th_min; else rp->red_thmin = th_min; if (th_max == 0) rp->red_thmax = default_th_max; else rp->red_thmax = th_max; rp->red_flags = flags; if (pkttime == 0) /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */ rp->red_pkttime = 800; else rp->red_pkttime = pkttime; if (weight == 0) { /* when the link is very slow, adjust red parameters */ npkts_per_sec = 1000000 / rp->red_pkttime; if (npkts_per_sec < 50) { /* up to about 400Kbps */ rp->red_weight = W_WEIGHT_2; } else if (npkts_per_sec < 300) { /* up to about 2.4Mbps */ rp->red_weight = W_WEIGHT_1; } } /* calculate wshift. weight must be power of 2 */ w = rp->red_weight; for (i = 0; w > 1; i++) w = w >> 1; rp->red_wshift = i; w = 1 << rp->red_wshift; if (w != rp->red_weight) { printf("invalid weight value %d for red! use %d\n", rp->red_weight, w); rp->red_weight = w; } /* * thmin_s and thmax_s are scaled versions of th_min and th_max * to be compared with avg. */ rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT); rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT); /* * precompute probability denominator * probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point */ rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin) * rp->red_inv_pmax) << FP_SHIFT; microtime(&rp->red_last); return (rp); } void red_destroy(red_t *rp) { #ifdef ALTQ3_COMPAT #ifdef ALTQ_FLOWVALVE if (rp->red_flowvalve != NULL) fv_destroy(rp->red_flowvalve); #endif #endif /* ALTQ3_COMPAT */ wtab_destroy(rp->red_wtab); free(rp, M_DEVBUF); } void red_getstats(red_t *rp, struct redstats *sp) { sp->q_avg = rp->red_avg >> rp->red_wshift; sp->xmit_cnt = rp->red_stats.xmit_cnt; sp->drop_cnt = rp->red_stats.drop_cnt; sp->drop_forced = rp->red_stats.drop_forced; sp->drop_unforced = rp->red_stats.drop_unforced; sp->marked_packets = rp->red_stats.marked_packets; } int red_addq(red_t *rp, class_queue_t *q, struct mbuf *m, struct altq_pktattr *pktattr) { int avg, droptype; int n; #ifdef ALTQ3_COMPAT #ifdef ALTQ_FLOWVALVE struct fve *fve = NULL; if (rp->red_flowvalve != NULL && rp->red_flowvalve->fv_flows > 0) if (fv_checkflow(rp->red_flowvalve, pktattr, &fve)) { m_freem(m); return (-1); } #endif #endif /* ALTQ3_COMPAT */ avg = rp->red_avg; /* * if we were idle, we pretend that n packets arrived during * the idle period. */ if (rp->red_idle) { struct timeval now; int t; rp->red_idle = 0; microtime(&now); t = (now.tv_sec - rp->red_last.tv_sec); if (t > 60) { /* * being idle for more than 1 minute, set avg to zero. * this prevents t from overflow. */ avg = 0; } else { t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec); n = t / rp->red_pkttime - 1; /* the following line does (avg = (1 - Wq)^n * avg) */ if (n > 0) avg = (avg >> FP_SHIFT) * pow_w(rp->red_wtab, n); } } /* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */ avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift); rp->red_avg = avg; /* save the new value */ /* * red_count keeps a tally of arriving traffic that has not * been dropped. */ rp->red_count++; /* see if we drop early */ droptype = DTYPE_NODROP; if (avg >= rp->red_thmin_s && qlen(q) > 1) { if (avg >= rp->red_thmax_s) { /* avg >= th_max: forced drop */ droptype = DTYPE_FORCED; } else if (rp->red_old == 0) { /* first exceeds th_min */ rp->red_count = 1; rp->red_old = 1; } else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift, rp->red_probd, rp->red_count)) { /* mark or drop by red */ if ((rp->red_flags & REDF_ECN) && mark_ecn(m, pktattr, rp->red_flags)) { /* successfully marked. do not drop. */ rp->red_count = 0; #ifdef RED_STATS rp->red_stats.marked_packets++; #endif } else { /* unforced drop by red */ droptype = DTYPE_EARLY; } } } else { /* avg < th_min */ rp->red_old = 0; } /* * if the queue length hits the hard limit, it's a forced drop. */ if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q)) droptype = DTYPE_FORCED; #ifdef RED_RANDOM_DROP /* if successful or forced drop, enqueue this packet. */ if (droptype != DTYPE_EARLY) _addq(q, m); #else /* if successful, enqueue this packet. */ if (droptype == DTYPE_NODROP) _addq(q, m); #endif if (droptype != DTYPE_NODROP) { if (droptype == DTYPE_EARLY) { /* drop the incoming packet */ #ifdef RED_STATS rp->red_stats.drop_unforced++; #endif } else { /* forced drop, select a victim packet in the queue. */ #ifdef RED_RANDOM_DROP m = _getq_random(q); #endif #ifdef RED_STATS rp->red_stats.drop_forced++; #endif } #ifdef RED_STATS PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m)); #endif rp->red_count = 0; #ifdef ALTQ3_COMPAT #ifdef ALTQ_FLOWVALVE if (rp->red_flowvalve != NULL) fv_dropbyred(rp->red_flowvalve, pktattr, fve); #endif #endif /* ALTQ3_COMPAT */ m_freem(m); return (-1); } /* successfully queued */ #ifdef RED_STATS PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m)); #endif return (0); } /* * early-drop probability is calculated as follows: * prob = p_max * (avg - th_min) / (th_max - th_min) * prob_a = prob / (2 - count*prob) * = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min)) * here prob_a increases as successive undrop count increases. * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)), * becomes 1 when (count >= (2 / prob))). */ int drop_early(int fp_len, int fp_probd, int count) { int d; /* denominator of drop-probability */ d = fp_probd - count * fp_len; if (d <= 0) /* count exceeds the hard limit: drop or mark */ return (1); /* * now the range of d is [1..600] in fixed-point. (when * th_max-th_min=10 and p_max=1/30) * drop probability = (avg - TH_MIN) / d */ if ((arc4random() % d) < fp_len) { /* drop or mark */ return (1); } /* no drop/mark */ return (0); } /* * try to mark CE bit to the packet. * returns 1 if successfully marked, 0 otherwise. */ int mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags) { struct mbuf *m0; struct pf_mtag *at; void *hdr; at = pf_find_mtag(m); if (at != NULL) { hdr = at->hdr; #ifdef ALTQ3_COMPAT } else if (pktattr != NULL) { af = pktattr->pattr_af; hdr = pktattr->pattr_hdr; #endif /* ALTQ3_COMPAT */ } else return (0); /* verify that pattr_hdr is within the mbuf data */ for (m0 = m; m0 != NULL; m0 = m0->m_next) if (((caddr_t)hdr >= m0->m_data) && ((caddr_t)hdr < m0->m_data + m0->m_len)) break; if (m0 == NULL) { /* ick, tag info is stale */ return (0); } switch (((struct ip *)hdr)->ip_v) { case IPVERSION: if (flags & REDF_ECN4) { struct ip *ip = hdr; u_int8_t otos; int sum; if (ip->ip_v != 4) return (0); /* version mismatch! */ if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) return (0); /* not-ECT */ if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) return (1); /* already marked */ /* * ecn-capable but not marked, * mark CE and update checksum */ otos = ip->ip_tos; ip->ip_tos |= IPTOS_ECN_CE; /* * update checksum (from RFC1624) * HC' = ~(~HC + ~m + m') */ sum = ~ntohs(ip->ip_sum) & 0xffff; sum += (~otos & 0xffff) + ip->ip_tos; sum = (sum >> 16) + (sum & 0xffff); sum += (sum >> 16); /* add carry */ ip->ip_sum = htons(~sum & 0xffff); return (1); } break; #ifdef INET6 case (IPV6_VERSION >> 4): if (flags & REDF_ECN6) { struct ip6_hdr *ip6 = hdr; u_int32_t flowlabel; flowlabel = ntohl(ip6->ip6_flow); if ((flowlabel >> 28) != 6) return (0); /* version mismatch! */ if ((flowlabel & (IPTOS_ECN_MASK << 20)) == (IPTOS_ECN_NOTECT << 20)) return (0); /* not-ECT */ if ((flowlabel & (IPTOS_ECN_MASK << 20)) == (IPTOS_ECN_CE << 20)) return (1); /* already marked */ /* * ecn-capable but not marked, mark CE */ flowlabel |= (IPTOS_ECN_CE << 20); ip6->ip6_flow = htonl(flowlabel); return (1); } break; #endif /* INET6 */ } /* not marked */ return (0); } struct mbuf * red_getq(rp, q) red_t *rp; class_queue_t *q; { struct mbuf *m; if ((m = _getq(q)) == NULL) { if (rp->red_idle == 0) { rp->red_idle = 1; microtime(&rp->red_last); } return NULL; } rp->red_idle = 0; return (m); } /* * helper routine to calibrate avg during idle. * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point * here Wq = 1/weight and the code assumes Wq is close to zero. * * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point. */ static struct wtab *wtab_list = NULL; /* pointer to wtab list */ struct wtab * wtab_alloc(int weight) { struct wtab *w; int i; for (w = wtab_list; w != NULL; w = w->w_next) if (w->w_weight == weight) { w->w_refcount++; return (w); } w = malloc(sizeof(struct wtab), M_DEVBUF, M_NOWAIT | M_ZERO); if (w == NULL) return (NULL); w->w_weight = weight; w->w_refcount = 1; w->w_next = wtab_list; wtab_list = w; /* initialize the weight table */ w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight; for (i = 1; i < 32; i++) { w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT; if (w->w_tab[i] == 0 && w->w_param_max == 0) w->w_param_max = 1 << i; } return (w); } int wtab_destroy(struct wtab *w) { struct wtab *prev; if (--w->w_refcount > 0) return (0); if (wtab_list == w) wtab_list = w->w_next; else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next) if (prev->w_next == w) { prev->w_next = w->w_next; break; } free(w, M_DEVBUF); return (0); } int32_t pow_w(struct wtab *w, int n) { int i, bit; int32_t val; if (n >= w->w_param_max) return (0); val = 1 << FP_SHIFT; if (n <= 0) return (val); bit = 1; i = 0; while (n) { if (n & bit) { val = (val * w->w_tab[i]) >> FP_SHIFT; n &= ~bit; } i++; bit <<= 1; } return (val); } #ifdef ALTQ3_COMPAT /* * red device interface */ altqdev_decl(red); int redopen(dev, flag, fmt, p) dev_t dev; int flag, fmt; #if (__FreeBSD_version > 500000) struct thread *p; #else struct proc *p; #endif { /* everything will be done when the queueing scheme is attached. */ return 0; } int redclose(dev, flag, fmt, p) dev_t dev; int flag, fmt; #if (__FreeBSD_version > 500000) struct thread *p; #else struct proc *p; #endif { red_queue_t *rqp; int err, error = 0; while ((rqp = red_list) != NULL) { /* destroy all */ err = red_detach(rqp); if (err != 0 && error == 0) error = err; } return error; } int redioctl(dev, cmd, addr, flag, p) dev_t dev; ioctlcmd_t cmd; caddr_t addr; int flag; #if (__FreeBSD_version > 500000) struct thread *p; #else struct proc *p; #endif { red_queue_t *rqp; struct red_interface *ifacep; struct ifnet *ifp; int error = 0; /* check super-user privilege */ switch (cmd) { case RED_GETSTATS: break; default: #if (__FreeBSD_version > 700000) if ((error = priv_check(p, PRIV_ALTQ_MANAGE)) != 0) #elsif (__FreeBSD_version > 400000) if ((error = suser(p)) != 0) #else if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) #endif return (error); break; } switch (cmd) { case RED_ENABLE: ifacep = (struct red_interface *)addr; if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) { error = EBADF; break; } error = altq_enable(rqp->rq_ifq); break; case RED_DISABLE: ifacep = (struct red_interface *)addr; if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) { error = EBADF; break; } error = altq_disable(rqp->rq_ifq); break; case RED_IF_ATTACH: ifp = ifunit(((struct red_interface *)addr)->red_ifname); if (ifp == NULL) { error = ENXIO; break; } /* allocate and initialize red_queue_t */ rqp = malloc(sizeof(red_queue_t), M_DEVBUF, M_WAITOK); if (rqp == NULL) { error = ENOMEM; break; } bzero(rqp, sizeof(red_queue_t)); rqp->rq_q = malloc(sizeof(class_queue_t), M_DEVBUF, M_WAITOK); if (rqp->rq_q == NULL) { free(rqp, M_DEVBUF); error = ENOMEM; break; } bzero(rqp->rq_q, sizeof(class_queue_t)); rqp->rq_red = red_alloc(0, 0, 0, 0, 0, 0); if (rqp->rq_red == NULL) { free(rqp->rq_q, M_DEVBUF); free(rqp, M_DEVBUF); error = ENOMEM; break; } rqp->rq_ifq = &ifp->if_snd; qtail(rqp->rq_q) = NULL; qlen(rqp->rq_q) = 0; qlimit(rqp->rq_q) = RED_LIMIT; qtype(rqp->rq_q) = Q_RED; /* * set RED to this ifnet structure. */ error = altq_attach(rqp->rq_ifq, ALTQT_RED, rqp, red_enqueue, red_dequeue, red_request, NULL, NULL); if (error) { red_destroy(rqp->rq_red); free(rqp->rq_q, M_DEVBUF); free(rqp, M_DEVBUF); break; } /* add this state to the red list */ rqp->rq_next = red_list; red_list = rqp; break; case RED_IF_DETACH: ifacep = (struct red_interface *)addr; if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) { error = EBADF; break; } error = red_detach(rqp); break; case RED_GETSTATS: do { struct red_stats *q_stats; red_t *rp; q_stats = (struct red_stats *)addr; if ((rqp = altq_lookup(q_stats->iface.red_ifname, ALTQT_RED)) == NULL) { error = EBADF; break; } q_stats->q_len = qlen(rqp->rq_q); q_stats->q_limit = qlimit(rqp->rq_q); rp = rqp->rq_red; q_stats->q_avg = rp->red_avg >> rp->red_wshift; q_stats->xmit_cnt = rp->red_stats.xmit_cnt; q_stats->drop_cnt = rp->red_stats.drop_cnt; q_stats->drop_forced = rp->red_stats.drop_forced; q_stats->drop_unforced = rp->red_stats.drop_unforced; q_stats->marked_packets = rp->red_stats.marked_packets; q_stats->weight = rp->red_weight; q_stats->inv_pmax = rp->red_inv_pmax; q_stats->th_min = rp->red_thmin; q_stats->th_max = rp->red_thmax; #ifdef ALTQ_FLOWVALVE if (rp->red_flowvalve != NULL) { struct flowvalve *fv = rp->red_flowvalve; q_stats->fv_flows = fv->fv_flows; q_stats->fv_pass = fv->fv_stats.pass; q_stats->fv_predrop = fv->fv_stats.predrop; q_stats->fv_alloc = fv->fv_stats.alloc; q_stats->fv_escape = fv->fv_stats.escape; } else { #endif /* ALTQ_FLOWVALVE */ q_stats->fv_flows = 0; q_stats->fv_pass = 0; q_stats->fv_predrop = 0; q_stats->fv_alloc = 0; q_stats->fv_escape = 0; #ifdef ALTQ_FLOWVALVE } #endif /* ALTQ_FLOWVALVE */ } while (/*CONSTCOND*/ 0); break; case RED_CONFIG: do { struct red_conf *fc; red_t *new; int s, limit; fc = (struct red_conf *)addr; if ((rqp = altq_lookup(fc->iface.red_ifname, ALTQT_RED)) == NULL) { error = EBADF; break; } new = red_alloc(fc->red_weight, fc->red_inv_pmax, fc->red_thmin, fc->red_thmax, fc->red_flags, fc->red_pkttime); if (new == NULL) { error = ENOMEM; break; } #ifdef __NetBSD__ s = splnet(); #else s = splimp(); #endif red_purgeq(rqp); limit = fc->red_limit; if (limit < fc->red_thmax) limit = fc->red_thmax; qlimit(rqp->rq_q) = limit; fc->red_limit = limit; /* write back the new value */ red_destroy(rqp->rq_red); rqp->rq_red = new; splx(s); /* write back new values */ fc->red_limit = limit; fc->red_inv_pmax = rqp->rq_red->red_inv_pmax; fc->red_thmin = rqp->rq_red->red_thmin; fc->red_thmax = rqp->rq_red->red_thmax; } while (/*CONSTCOND*/ 0); break; case RED_SETDEFAULTS: do { struct redparams *rp; rp = (struct redparams *)addr; default_th_min = rp->th_min; default_th_max = rp->th_max; default_inv_pmax = rp->inv_pmax; } while (/*CONSTCOND*/ 0); break; default: error = EINVAL; break; } return error; } static int red_detach(rqp) red_queue_t *rqp; { red_queue_t *tmp; int error = 0; if (ALTQ_IS_ENABLED(rqp->rq_ifq)) altq_disable(rqp->rq_ifq); if ((error = altq_detach(rqp->rq_ifq))) return (error); if (red_list == rqp) red_list = rqp->rq_next; else { for (tmp = red_list; tmp != NULL; tmp = tmp->rq_next) if (tmp->rq_next == rqp) { tmp->rq_next = rqp->rq_next; break; } if (tmp == NULL) printf("red_detach: no state found in red_list!\n"); } red_destroy(rqp->rq_red); free(rqp->rq_q, M_DEVBUF); free(rqp, M_DEVBUF); return (error); } /* * enqueue routine: * * returns: 0 when successfully queued. * ENOBUFS when drop occurs. */ static int red_enqueue(ifq, m, pktattr) struct ifaltq *ifq; struct mbuf *m; struct altq_pktattr *pktattr; { red_queue_t *rqp = (red_queue_t *)ifq->altq_disc; IFQ_LOCK_ASSERT(ifq); if (red_addq(rqp->rq_red, rqp->rq_q, m, pktattr) < 0) return ENOBUFS; ifq->ifq_len++; return 0; } /* * dequeue routine: * must be called in splimp. * * returns: mbuf dequeued. * NULL when no packet is available in the queue. */ static struct mbuf * red_dequeue(ifq, op) struct ifaltq *ifq; int op; { red_queue_t *rqp = (red_queue_t *)ifq->altq_disc; struct mbuf *m; IFQ_LOCK_ASSERT(ifq); if (op == ALTDQ_POLL) return qhead(rqp->rq_q); /* op == ALTDQ_REMOVE */ m = red_getq(rqp->rq_red, rqp->rq_q); if (m != NULL) ifq->ifq_len--; return (m); } static int red_request(ifq, req, arg) struct ifaltq *ifq; int req; void *arg; { red_queue_t *rqp = (red_queue_t *)ifq->altq_disc; IFQ_LOCK_ASSERT(ifq); switch (req) { case ALTRQ_PURGE: red_purgeq(rqp); break; } return (0); } static void red_purgeq(rqp) red_queue_t *rqp; { _flushq(rqp->rq_q); if (ALTQ_IS_ENABLED(rqp->rq_ifq)) rqp->rq_ifq->ifq_len = 0; } #ifdef ALTQ_FLOWVALVE #define FV_PSHIFT 7 /* weight of average drop rate -- 1/128 */ #define FV_PSCALE(x) ((x) << FV_PSHIFT) #define FV_PUNSCALE(x) ((x) >> FV_PSHIFT) #define FV_FSHIFT 5 /* weight of average fraction -- 1/32 */ #define FV_FSCALE(x) ((x) << FV_FSHIFT) #define FV_FUNSCALE(x) ((x) >> FV_FSHIFT) #define FV_TIMER (3 * hz) /* timer value for garbage collector */ #define FV_FLOWLISTSIZE 64 /* how many flows in flowlist */ #define FV_N 10 /* update fve_f every FV_N packets */ #define FV_BACKOFFTHRESH 1 /* backoff threshold interval in second */ #define FV_TTHRESH 3 /* time threshold to delete fve */ #define FV_ALPHA 5 /* extra packet count */ #define FV_STATS #if (__FreeBSD_version > 300000) #define FV_TIMESTAMP(tp) getmicrotime(tp) #else #define FV_TIMESTAMP(tp) { (*(tp)) = time; } #endif /* * Brtt table: 127 entry table to convert drop rate (p) to * the corresponding bandwidth fraction (f) * the following equation is implemented to use scaled values, * fve_p and fve_f, in the fixed point format. * * Brtt(p) = 1 /(sqrt(4*p/3) + min(1,3*sqrt(p*6/8)) * p * (1+32 * p*p)) * f = Brtt(p) / (max_th + alpha) */ #define BRTT_SIZE 128 #define BRTT_SHIFT 12 #define BRTT_MASK 0x0007f000 #define BRTT_PMAX (1 << (FV_PSHIFT + FP_SHIFT)) const int brtt_tab[BRTT_SIZE] = { 0, 1262010, 877019, 703694, 598706, 525854, 471107, 427728, 392026, 361788, 335598, 312506, 291850, 273158, 256081, 240361, 225800, 212247, 199585, 187788, 178388, 169544, 161207, 153333, 145888, 138841, 132165, 125836, 119834, 114141, 108739, 103612, 98747, 94129, 89746, 85585, 81637, 77889, 74333, 70957, 67752, 64711, 61824, 59084, 56482, 54013, 51667, 49440, 47325, 45315, 43406, 41591, 39866, 38227, 36667, 35184, 33773, 32430, 31151, 29933, 28774, 27668, 26615, 25611, 24653, 23740, 22868, 22035, 21240, 20481, 19755, 19062, 18399, 17764, 17157, 16576, 16020, 15487, 14976, 14487, 14017, 13567, 13136, 12721, 12323, 11941, 11574, 11222, 10883, 10557, 10243, 9942, 9652, 9372, 9103, 8844, 8594, 8354, 8122, 7898, 7682, 7474, 7273, 7079, 6892, 6711, 6536, 6367, 6204, 6046, 5893, 5746, 5603, 5464, 5330, 5201, 5075, 4954, 4836, 4722, 4611, 4504, 4400, 4299, 4201, 4106, 4014, 3924 }; static __inline struct fve * flowlist_lookup(fv, pktattr, now) struct flowvalve *fv; struct altq_pktattr *pktattr; struct timeval *now; { struct fve *fve; int flows; struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; #endif struct timeval tthresh; if (pktattr == NULL) return (NULL); tthresh.tv_sec = now->tv_sec - FV_TTHRESH; flows = 0; /* * search the flow list */ switch (pktattr->pattr_af) { case AF_INET: ip = (struct ip *)pktattr->pattr_hdr; TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){ if (fve->fve_lastdrop.tv_sec == 0) break; if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) { fve->fve_lastdrop.tv_sec = 0; break; } if (fve->fve_flow.flow_af == AF_INET && fve->fve_flow.flow_ip.ip_src.s_addr == ip->ip_src.s_addr && fve->fve_flow.flow_ip.ip_dst.s_addr == ip->ip_dst.s_addr) return (fve); flows++; } break; #ifdef INET6 case AF_INET6: ip6 = (struct ip6_hdr *)pktattr->pattr_hdr; TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){ if (fve->fve_lastdrop.tv_sec == 0) break; if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) { fve->fve_lastdrop.tv_sec = 0; break; } if (fve->fve_flow.flow_af == AF_INET6 && IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_src, &ip6->ip6_src) && IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_dst, &ip6->ip6_dst)) return (fve); flows++; } break; #endif /* INET6 */ default: /* unknown protocol. no drop. */ return (NULL); } fv->fv_flows = flows; /* save the number of active fve's */ return (NULL); } static __inline struct fve * flowlist_reclaim(fv, pktattr) struct flowvalve *fv; struct altq_pktattr *pktattr; { struct fve *fve; struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; #endif /* * get an entry from the tail of the LRU list. */ fve = TAILQ_LAST(&fv->fv_flowlist, fv_flowhead); switch (pktattr->pattr_af) { case AF_INET: ip = (struct ip *)pktattr->pattr_hdr; fve->fve_flow.flow_af = AF_INET; fve->fve_flow.flow_ip.ip_src = ip->ip_src; fve->fve_flow.flow_ip.ip_dst = ip->ip_dst; break; #ifdef INET6 case AF_INET6: ip6 = (struct ip6_hdr *)pktattr->pattr_hdr; fve->fve_flow.flow_af = AF_INET6; fve->fve_flow.flow_ip6.ip6_src = ip6->ip6_src; fve->fve_flow.flow_ip6.ip6_dst = ip6->ip6_dst; break; #endif } fve->fve_state = Green; fve->fve_p = 0.0; fve->fve_f = 0.0; fve->fve_ifseq = fv->fv_ifseq - 1; fve->fve_count = 0; fv->fv_flows++; #ifdef FV_STATS fv->fv_stats.alloc++; #endif return (fve); } static __inline void flowlist_move_to_head(fv, fve) struct flowvalve *fv; struct fve *fve; { if (TAILQ_FIRST(&fv->fv_flowlist) != fve) { TAILQ_REMOVE(&fv->fv_flowlist, fve, fve_lru); TAILQ_INSERT_HEAD(&fv->fv_flowlist, fve, fve_lru); } } #if 0 /* XXX: make the compiler happy (fv_alloc unused) */ /* * allocate flowvalve structure */ static struct flowvalve * fv_alloc(rp) struct red *rp; { struct flowvalve *fv; struct fve *fve; int i, num; num = FV_FLOWLISTSIZE; fv = malloc(sizeof(struct flowvalve), M_DEVBUF, M_WAITOK); if (fv == NULL) return (NULL); bzero(fv, sizeof(struct flowvalve)); fv->fv_fves = malloc(sizeof(struct fve) * num, M_DEVBUF, M_WAITOK); if (fv->fv_fves == NULL) { free(fv, M_DEVBUF); return (NULL); } bzero(fv->fv_fves, sizeof(struct fve) * num); fv->fv_flows = 0; TAILQ_INIT(&fv->fv_flowlist); for (i = 0; i < num; i++) { fve = &fv->fv_fves[i]; fve->fve_lastdrop.tv_sec = 0; TAILQ_INSERT_TAIL(&fv->fv_flowlist, fve, fve_lru); } /* initialize drop rate threshold in scaled fixed-point */ fv->fv_pthresh = (FV_PSCALE(1) << FP_SHIFT) / rp->red_inv_pmax; /* initialize drop rate to fraction table */ fv->fv_p2ftab = malloc(sizeof(int) * BRTT_SIZE, M_DEVBUF, M_WAITOK); if (fv->fv_p2ftab == NULL) { free(fv->fv_fves, M_DEVBUF); free(fv, M_DEVBUF); return (NULL); } /* * create the p2f table. * (shift is used to keep the precision) */ for (i = 1; i < BRTT_SIZE; i++) { int f; f = brtt_tab[i] << 8; fv->fv_p2ftab[i] = (f / (rp->red_thmax + FV_ALPHA)) >> 8; } return (fv); } #endif static void fv_destroy(fv) struct flowvalve *fv; { free(fv->fv_p2ftab, M_DEVBUF); free(fv->fv_fves, M_DEVBUF); free(fv, M_DEVBUF); } static __inline int fv_p2f(fv, p) struct flowvalve *fv; int p; { int val, f; if (p >= BRTT_PMAX) f = fv->fv_p2ftab[BRTT_SIZE-1]; else if ((val = (p & BRTT_MASK))) f = fv->fv_p2ftab[(val >> BRTT_SHIFT)]; else f = fv->fv_p2ftab[1]; return (f); } /* * check if an arriving packet should be pre-dropped. * called from red_addq() when a packet arrives. * returns 1 when the packet should be pre-dropped. * should be called in splimp. */ static int fv_checkflow(fv, pktattr, fcache) struct flowvalve *fv; struct altq_pktattr *pktattr; struct fve **fcache; { struct fve *fve; struct timeval now; fv->fv_ifseq++; FV_TIMESTAMP(&now); if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL) /* no matching entry in the flowlist */ return (0); *fcache = fve; /* update fraction f for every FV_N packets */ if (++fve->fve_count == FV_N) { /* * f = Wf * N / (fv_ifseq - fve_ifseq) + (1 - Wf) * f */ fve->fve_f = (FV_N << FP_SHIFT) / (fv->fv_ifseq - fve->fve_ifseq) + fve->fve_f - FV_FUNSCALE(fve->fve_f); fve->fve_ifseq = fv->fv_ifseq; fve->fve_count = 0; } /* * overpumping test */ if (fve->fve_state == Green && fve->fve_p > fv->fv_pthresh) { int fthresh; /* calculate a threshold */ fthresh = fv_p2f(fv, fve->fve_p); if (fve->fve_f > fthresh) fve->fve_state = Red; } if (fve->fve_state == Red) { /* * backoff test */ if (now.tv_sec - fve->fve_lastdrop.tv_sec > FV_BACKOFFTHRESH) { /* no drop for at least FV_BACKOFFTHRESH sec */ fve->fve_p = 0; fve->fve_state = Green; #ifdef FV_STATS fv->fv_stats.escape++; #endif } else { /* block this flow */ flowlist_move_to_head(fv, fve); fve->fve_lastdrop = now; #ifdef FV_STATS fv->fv_stats.predrop++; #endif return (1); } } /* * p = (1 - Wp) * p */ fve->fve_p -= FV_PUNSCALE(fve->fve_p); if (fve->fve_p < 0) fve->fve_p = 0; #ifdef FV_STATS fv->fv_stats.pass++; #endif return (0); } /* * called from red_addq when a packet is dropped by red. * should be called in splimp. */ static void fv_dropbyred(fv, pktattr, fcache) struct flowvalve *fv; struct altq_pktattr *pktattr; struct fve *fcache; { struct fve *fve; struct timeval now; if (pktattr == NULL) return; FV_TIMESTAMP(&now); if (fcache != NULL) /* the fve of this packet is already cached */ fve = fcache; else if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL) fve = flowlist_reclaim(fv, pktattr); flowlist_move_to_head(fv, fve); /* * update p: the following line cancels the update * in fv_checkflow() and calculate * p = Wp + (1 - Wp) * p */ fve->fve_p = (1 << FP_SHIFT) + fve->fve_p; fve->fve_lastdrop = now; } #endif /* ALTQ_FLOWVALVE */ #ifdef KLD_MODULE static struct altqsw red_sw = {"red", redopen, redclose, redioctl}; ALTQ_MODULE(altq_red, ALTQT_RED, &red_sw); MODULE_VERSION(altq_red, 1); #endif /* KLD_MODULE */ #endif /* ALTQ3_COMPAT */ #endif /* ALTQ_RED */ Index: stable/10/sys/contrib/altq/altq/altq_rio.c =================================================================== --- stable/10/sys/contrib/altq/altq/altq_rio.c (revision 263085) +++ stable/10/sys/contrib/altq/altq/altq_rio.c (revision 263086) @@ -1,850 +1,852 @@ /* $FreeBSD$ */ /* $KAME: altq_rio.c,v 1.17 2003/07/10 12:07:49 kjc Exp $ */ /* * Copyright (C) 1998-2003 * Sony Computer Science Laboratories Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1990-1994 Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the Computer Systems * Engineering Group at Lawrence Berkeley Laboratory. * 4. Neither the name of the University nor of the Laboratory may be used * to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #if defined(__FreeBSD__) || defined(__NetBSD__) #include "opt_altq.h" #include "opt_inet.h" #ifdef __FreeBSD__ #include "opt_inet6.h" #endif #endif /* __FreeBSD__ || __NetBSD__ */ #ifdef ALTQ_RIO /* rio is enabled by ALTQ_RIO option in opt_altq.h */ #include #include #include #include #include #include #if 1 /* ALTQ3_COMPAT */ #include #include #include #endif #include +#include #include #include #include #ifdef INET6 #include #endif -#include +#include +#include #include #include #include #include #ifdef ALTQ3_COMPAT #include #endif /* * RIO: RED with IN/OUT bit * described in * "Explicit Allocation of Best Effort Packet Delivery Service" * David D. Clark and Wenjia Fang, MIT Lab for Computer Science * http://diffserv.lcs.mit.edu/Papers/exp-alloc-ddc-wf.{ps,pdf} * * this implementation is extended to support more than 2 drop precedence * values as described in RFC2597 (Assured Forwarding PHB Group). * */ /* * AF DS (differentiated service) codepoints. * (classes can be mapped to CBQ or H-FSC classes.) * * 0 1 2 3 4 5 6 7 * +---+---+---+---+---+---+---+---+ * | CLASS |DropPre| 0 | CU | * +---+---+---+---+---+---+---+---+ * * class 1: 001 * class 2: 010 * class 3: 011 * class 4: 100 * * low drop prec: 01 * medium drop prec: 10 * high drop prec: 01 */ /* normal red parameters */ #define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */ /* q_weight = 0.00195 */ /* red parameters for a slow link */ #define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */ /* q_weight = 0.0078125 */ /* red parameters for a very slow link (e.g., dialup) */ #define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */ /* q_weight = 0.015625 */ /* fixed-point uses 12-bit decimal places */ #define FP_SHIFT 12 /* fixed-point shift */ /* red parameters for drop probability */ #define INV_P_MAX 10 /* inverse of max drop probability */ #define TH_MIN 5 /* min threshold */ #define TH_MAX 15 /* max threshold */ #define RIO_LIMIT 60 /* default max queue lenght */ #define RIO_STATS /* collect statistics */ #define TV_DELTA(a, b, delta) { \ register int xxs; \ \ delta = (a)->tv_usec - (b)->tv_usec; \ if ((xxs = (a)->tv_sec - (b)->tv_sec) != 0) { \ if (xxs < 0) { \ delta = 60000000; \ } else if (xxs > 4) { \ if (xxs > 60) \ delta = 60000000; \ else \ delta += xxs * 1000000; \ } else while (xxs > 0) { \ delta += 1000000; \ xxs--; \ } \ } \ } #ifdef ALTQ3_COMPAT /* rio_list keeps all rio_queue_t's allocated. */ static rio_queue_t *rio_list = NULL; #endif /* default rio parameter values */ static struct redparams default_rio_params[RIO_NDROPPREC] = { /* th_min, th_max, inv_pmax */ { TH_MAX * 2 + TH_MIN, TH_MAX * 3, INV_P_MAX }, /* low drop precedence */ { TH_MAX + TH_MIN, TH_MAX * 2, INV_P_MAX }, /* medium drop precedence */ { TH_MIN, TH_MAX, INV_P_MAX } /* high drop precedence */ }; /* internal function prototypes */ static int dscp2index(u_int8_t); #ifdef ALTQ3_COMPAT static int rio_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *); static struct mbuf *rio_dequeue(struct ifaltq *, int); static int rio_request(struct ifaltq *, int, void *); static int rio_detach(rio_queue_t *); /* * rio device interface */ altqdev_decl(rio); #endif /* ALTQ3_COMPAT */ rio_t * rio_alloc(int weight, struct redparams *params, int flags, int pkttime) { rio_t *rp; int w, i; int npkts_per_sec; rp = malloc(sizeof(rio_t), M_DEVBUF, M_NOWAIT | M_ZERO); if (rp == NULL) return (NULL); rp->rio_flags = flags; if (pkttime == 0) /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */ rp->rio_pkttime = 800; else rp->rio_pkttime = pkttime; if (weight != 0) rp->rio_weight = weight; else { /* use default */ rp->rio_weight = W_WEIGHT; /* when the link is very slow, adjust red parameters */ npkts_per_sec = 1000000 / rp->rio_pkttime; if (npkts_per_sec < 50) { /* up to about 400Kbps */ rp->rio_weight = W_WEIGHT_2; } else if (npkts_per_sec < 300) { /* up to about 2.4Mbps */ rp->rio_weight = W_WEIGHT_1; } } /* calculate wshift. weight must be power of 2 */ w = rp->rio_weight; for (i = 0; w > 1; i++) w = w >> 1; rp->rio_wshift = i; w = 1 << rp->rio_wshift; if (w != rp->rio_weight) { printf("invalid weight value %d for red! use %d\n", rp->rio_weight, w); rp->rio_weight = w; } /* allocate weight table */ rp->rio_wtab = wtab_alloc(rp->rio_weight); for (i = 0; i < RIO_NDROPPREC; i++) { struct dropprec_state *prec = &rp->rio_precstate[i]; prec->avg = 0; prec->idle = 1; if (params == NULL || params[i].inv_pmax == 0) prec->inv_pmax = default_rio_params[i].inv_pmax; else prec->inv_pmax = params[i].inv_pmax; if (params == NULL || params[i].th_min == 0) prec->th_min = default_rio_params[i].th_min; else prec->th_min = params[i].th_min; if (params == NULL || params[i].th_max == 0) prec->th_max = default_rio_params[i].th_max; else prec->th_max = params[i].th_max; /* * th_min_s and th_max_s are scaled versions of th_min * and th_max to be compared with avg. */ prec->th_min_s = prec->th_min << (rp->rio_wshift + FP_SHIFT); prec->th_max_s = prec->th_max << (rp->rio_wshift + FP_SHIFT); /* * precompute probability denominator * probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point */ prec->probd = (2 * (prec->th_max - prec->th_min) * prec->inv_pmax) << FP_SHIFT; microtime(&prec->last); } return (rp); } void rio_destroy(rio_t *rp) { wtab_destroy(rp->rio_wtab); free(rp, M_DEVBUF); } void rio_getstats(rio_t *rp, struct redstats *sp) { int i; for (i = 0; i < RIO_NDROPPREC; i++) { bcopy(&rp->q_stats[i], sp, sizeof(struct redstats)); sp->q_avg = rp->rio_precstate[i].avg >> rp->rio_wshift; sp++; } } #if (RIO_NDROPPREC == 3) /* * internally, a drop precedence value is converted to an index * starting from 0. */ static int dscp2index(u_int8_t dscp) { int dpindex = dscp & AF_DROPPRECMASK; if (dpindex == 0) return (0); return ((dpindex >> 3) - 1); } #endif #if 1 /* * kludge: when a packet is dequeued, we need to know its drop precedence * in order to keep the queue length of each drop precedence. * use m_pkthdr.rcvif to pass this info. */ #define RIOM_SET_PRECINDEX(m, idx) \ do { (m)->m_pkthdr.rcvif = (void *)((long)(idx)); } while (0) #define RIOM_GET_PRECINDEX(m) \ ({ long idx; idx = (long)((m)->m_pkthdr.rcvif); \ (m)->m_pkthdr.rcvif = NULL; idx; }) #endif int rio_addq(rio_t *rp, class_queue_t *q, struct mbuf *m, struct altq_pktattr *pktattr) { int avg, droptype; u_int8_t dsfield, odsfield; int dpindex, i, n, t; struct timeval now; struct dropprec_state *prec; dsfield = odsfield = read_dsfield(m, pktattr); dpindex = dscp2index(dsfield); /* * update avg of the precedence states whose drop precedence * is larger than or equal to the drop precedence of the packet */ now.tv_sec = 0; for (i = dpindex; i < RIO_NDROPPREC; i++) { prec = &rp->rio_precstate[i]; avg = prec->avg; if (prec->idle) { prec->idle = 0; if (now.tv_sec == 0) microtime(&now); t = (now.tv_sec - prec->last.tv_sec); if (t > 60) avg = 0; else { t = t * 1000000 + (now.tv_usec - prec->last.tv_usec); n = t / rp->rio_pkttime; /* calculate (avg = (1 - Wq)^n * avg) */ if (n > 0) avg = (avg >> FP_SHIFT) * pow_w(rp->rio_wtab, n); } } /* run estimator. (avg is scaled by WEIGHT in fixed-point) */ avg += (prec->qlen << FP_SHIFT) - (avg >> rp->rio_wshift); prec->avg = avg; /* save the new value */ /* * count keeps a tally of arriving traffic that has not * been dropped. */ prec->count++; } prec = &rp->rio_precstate[dpindex]; avg = prec->avg; /* see if we drop early */ droptype = DTYPE_NODROP; if (avg >= prec->th_min_s && prec->qlen > 1) { if (avg >= prec->th_max_s) { /* avg >= th_max: forced drop */ droptype = DTYPE_FORCED; } else if (prec->old == 0) { /* first exceeds th_min */ prec->count = 1; prec->old = 1; } else if (drop_early((avg - prec->th_min_s) >> rp->rio_wshift, prec->probd, prec->count)) { /* unforced drop by red */ droptype = DTYPE_EARLY; } } else { /* avg < th_min */ prec->old = 0; } /* * if the queue length hits the hard limit, it's a forced drop. */ if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q)) droptype = DTYPE_FORCED; if (droptype != DTYPE_NODROP) { /* always drop incoming packet (as opposed to randomdrop) */ for (i = dpindex; i < RIO_NDROPPREC; i++) rp->rio_precstate[i].count = 0; #ifdef RIO_STATS if (droptype == DTYPE_EARLY) rp->q_stats[dpindex].drop_unforced++; else rp->q_stats[dpindex].drop_forced++; PKTCNTR_ADD(&rp->q_stats[dpindex].drop_cnt, m_pktlen(m)); #endif m_freem(m); return (-1); } for (i = dpindex; i < RIO_NDROPPREC; i++) rp->rio_precstate[i].qlen++; /* save drop precedence index in mbuf hdr */ RIOM_SET_PRECINDEX(m, dpindex); if (rp->rio_flags & RIOF_CLEARDSCP) dsfield &= ~DSCP_MASK; if (dsfield != odsfield) write_dsfield(m, pktattr, dsfield); _addq(q, m); #ifdef RIO_STATS PKTCNTR_ADD(&rp->q_stats[dpindex].xmit_cnt, m_pktlen(m)); #endif return (0); } struct mbuf * rio_getq(rio_t *rp, class_queue_t *q) { struct mbuf *m; int dpindex, i; if ((m = _getq(q)) == NULL) return NULL; dpindex = RIOM_GET_PRECINDEX(m); for (i = dpindex; i < RIO_NDROPPREC; i++) { if (--rp->rio_precstate[i].qlen == 0) { if (rp->rio_precstate[i].idle == 0) { rp->rio_precstate[i].idle = 1; microtime(&rp->rio_precstate[i].last); } } } return (m); } #ifdef ALTQ3_COMPAT int rioopen(dev, flag, fmt, p) dev_t dev; int flag, fmt; #if (__FreeBSD_version > 500000) struct thread *p; #else struct proc *p; #endif { /* everything will be done when the queueing scheme is attached. */ return 0; } int rioclose(dev, flag, fmt, p) dev_t dev; int flag, fmt; #if (__FreeBSD_version > 500000) struct thread *p; #else struct proc *p; #endif { rio_queue_t *rqp; int err, error = 0; while ((rqp = rio_list) != NULL) { /* destroy all */ err = rio_detach(rqp); if (err != 0 && error == 0) error = err; } return error; } int rioioctl(dev, cmd, addr, flag, p) dev_t dev; ioctlcmd_t cmd; caddr_t addr; int flag; #if (__FreeBSD_version > 500000) struct thread *p; #else struct proc *p; #endif { rio_queue_t *rqp; struct rio_interface *ifacep; struct ifnet *ifp; int error = 0; /* check super-user privilege */ switch (cmd) { case RIO_GETSTATS: break; default: #if (__FreeBSD_version > 700000) if ((error = priv_check(p, PRIV_ALTQ_MANAGE)) != 0) return (error); #elsif (__FreeBSD_version > 400000) if ((error = suser(p)) != 0) return (error); #else if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) return (error); #endif break; } switch (cmd) { case RIO_ENABLE: ifacep = (struct rio_interface *)addr; if ((rqp = altq_lookup(ifacep->rio_ifname, ALTQT_RIO)) == NULL) { error = EBADF; break; } error = altq_enable(rqp->rq_ifq); break; case RIO_DISABLE: ifacep = (struct rio_interface *)addr; if ((rqp = altq_lookup(ifacep->rio_ifname, ALTQT_RIO)) == NULL) { error = EBADF; break; } error = altq_disable(rqp->rq_ifq); break; case RIO_IF_ATTACH: ifp = ifunit(((struct rio_interface *)addr)->rio_ifname); if (ifp == NULL) { error = ENXIO; break; } /* allocate and initialize rio_queue_t */ rqp = malloc(sizeof(rio_queue_t), M_DEVBUF, M_WAITOK); if (rqp == NULL) { error = ENOMEM; break; } bzero(rqp, sizeof(rio_queue_t)); rqp->rq_q = malloc(sizeof(class_queue_t), M_DEVBUF, M_WAITOK); if (rqp->rq_q == NULL) { free(rqp, M_DEVBUF); error = ENOMEM; break; } bzero(rqp->rq_q, sizeof(class_queue_t)); rqp->rq_rio = rio_alloc(0, NULL, 0, 0); if (rqp->rq_rio == NULL) { free(rqp->rq_q, M_DEVBUF); free(rqp, M_DEVBUF); error = ENOMEM; break; } rqp->rq_ifq = &ifp->if_snd; qtail(rqp->rq_q) = NULL; qlen(rqp->rq_q) = 0; qlimit(rqp->rq_q) = RIO_LIMIT; qtype(rqp->rq_q) = Q_RIO; /* * set RIO to this ifnet structure. */ error = altq_attach(rqp->rq_ifq, ALTQT_RIO, rqp, rio_enqueue, rio_dequeue, rio_request, NULL, NULL); if (error) { rio_destroy(rqp->rq_rio); free(rqp->rq_q, M_DEVBUF); free(rqp, M_DEVBUF); break; } /* add this state to the rio list */ rqp->rq_next = rio_list; rio_list = rqp; break; case RIO_IF_DETACH: ifacep = (struct rio_interface *)addr; if ((rqp = altq_lookup(ifacep->rio_ifname, ALTQT_RIO)) == NULL) { error = EBADF; break; } error = rio_detach(rqp); break; case RIO_GETSTATS: do { struct rio_stats *q_stats; rio_t *rp; int i; q_stats = (struct rio_stats *)addr; if ((rqp = altq_lookup(q_stats->iface.rio_ifname, ALTQT_RIO)) == NULL) { error = EBADF; break; } rp = rqp->rq_rio; q_stats->q_limit = qlimit(rqp->rq_q); q_stats->weight = rp->rio_weight; q_stats->flags = rp->rio_flags; for (i = 0; i < RIO_NDROPPREC; i++) { q_stats->q_len[i] = rp->rio_precstate[i].qlen; bcopy(&rp->q_stats[i], &q_stats->q_stats[i], sizeof(struct redstats)); q_stats->q_stats[i].q_avg = rp->rio_precstate[i].avg >> rp->rio_wshift; q_stats->q_params[i].inv_pmax = rp->rio_precstate[i].inv_pmax; q_stats->q_params[i].th_min = rp->rio_precstate[i].th_min; q_stats->q_params[i].th_max = rp->rio_precstate[i].th_max; } } while (/*CONSTCOND*/ 0); break; case RIO_CONFIG: do { struct rio_conf *fc; rio_t *new; int s, limit, i; fc = (struct rio_conf *)addr; if ((rqp = altq_lookup(fc->iface.rio_ifname, ALTQT_RIO)) == NULL) { error = EBADF; break; } new = rio_alloc(fc->rio_weight, &fc->q_params[0], fc->rio_flags, fc->rio_pkttime); if (new == NULL) { error = ENOMEM; break; } #ifdef __NetBSD__ s = splnet(); #else s = splimp(); #endif _flushq(rqp->rq_q); limit = fc->rio_limit; if (limit < fc->q_params[RIO_NDROPPREC-1].th_max) limit = fc->q_params[RIO_NDROPPREC-1].th_max; qlimit(rqp->rq_q) = limit; rio_destroy(rqp->rq_rio); rqp->rq_rio = new; splx(s); /* write back new values */ fc->rio_limit = limit; for (i = 0; i < RIO_NDROPPREC; i++) { fc->q_params[i].inv_pmax = rqp->rq_rio->rio_precstate[i].inv_pmax; fc->q_params[i].th_min = rqp->rq_rio->rio_precstate[i].th_min; fc->q_params[i].th_max = rqp->rq_rio->rio_precstate[i].th_max; } } while (/*CONSTCOND*/ 0); break; case RIO_SETDEFAULTS: do { struct redparams *rp; int i; rp = (struct redparams *)addr; for (i = 0; i < RIO_NDROPPREC; i++) default_rio_params[i] = rp[i]; } while (/*CONSTCOND*/ 0); break; default: error = EINVAL; break; } return error; } static int rio_detach(rqp) rio_queue_t *rqp; { rio_queue_t *tmp; int error = 0; if (ALTQ_IS_ENABLED(rqp->rq_ifq)) altq_disable(rqp->rq_ifq); if ((error = altq_detach(rqp->rq_ifq))) return (error); if (rio_list == rqp) rio_list = rqp->rq_next; else { for (tmp = rio_list; tmp != NULL; tmp = tmp->rq_next) if (tmp->rq_next == rqp) { tmp->rq_next = rqp->rq_next; break; } if (tmp == NULL) printf("rio_detach: no state found in rio_list!\n"); } rio_destroy(rqp->rq_rio); free(rqp->rq_q, M_DEVBUF); free(rqp, M_DEVBUF); return (error); } /* * rio support routines */ static int rio_request(ifq, req, arg) struct ifaltq *ifq; int req; void *arg; { rio_queue_t *rqp = (rio_queue_t *)ifq->altq_disc; IFQ_LOCK_ASSERT(ifq); switch (req) { case ALTRQ_PURGE: _flushq(rqp->rq_q); if (ALTQ_IS_ENABLED(ifq)) ifq->ifq_len = 0; break; } return (0); } /* * enqueue routine: * * returns: 0 when successfully queued. * ENOBUFS when drop occurs. */ static int rio_enqueue(ifq, m, pktattr) struct ifaltq *ifq; struct mbuf *m; struct altq_pktattr *pktattr; { rio_queue_t *rqp = (rio_queue_t *)ifq->altq_disc; int error = 0; IFQ_LOCK_ASSERT(ifq); if (rio_addq(rqp->rq_rio, rqp->rq_q, m, pktattr) == 0) ifq->ifq_len++; else error = ENOBUFS; return error; } /* * dequeue routine: * must be called in splimp. * * returns: mbuf dequeued. * NULL when no packet is available in the queue. */ static struct mbuf * rio_dequeue(ifq, op) struct ifaltq *ifq; int op; { rio_queue_t *rqp = (rio_queue_t *)ifq->altq_disc; struct mbuf *m = NULL; IFQ_LOCK_ASSERT(ifq); if (op == ALTDQ_POLL) return qhead(rqp->rq_q); m = rio_getq(rqp->rq_rio, rqp->rq_q); if (m != NULL) ifq->ifq_len--; return m; } #ifdef KLD_MODULE static struct altqsw rio_sw = {"rio", rioopen, rioclose, rioioctl}; ALTQ_MODULE(altq_rio, ALTQT_RIO, &rio_sw); MODULE_VERSION(altq_rio, 1); MODULE_DEPEND(altq_rio, altq_red, 1, 1, 1); #endif /* KLD_MODULE */ #endif /* ALTQ3_COMPAT */ #endif /* ALTQ_RIO */ Index: stable/10/sys/contrib/altq/altq/altq_rmclass.c =================================================================== --- stable/10/sys/contrib/altq/altq/altq_rmclass.c (revision 263085) +++ stable/10/sys/contrib/altq/altq/altq_rmclass.c (revision 263086) @@ -1,1834 +1,1836 @@ /* $FreeBSD$ */ /* $KAME: altq_rmclass.c,v 1.19 2005/04/13 03:44:25 suz Exp $ */ /* * Copyright (c) 1991-1997 Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the Network Research * Group at Lawrence Berkeley Laboratory. * 4. Neither the name of the University nor of the Laboratory may be used * to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * LBL code modified by speer@eng.sun.com, May 1977. * For questions and/or comments, please send mail to cbq@ee.lbl.gov * * @(#)rm_class.c 1.48 97/12/05 SMI */ #if defined(__FreeBSD__) || defined(__NetBSD__) #include "opt_altq.h" #include "opt_inet.h" #ifdef __FreeBSD__ #include "opt_inet6.h" #endif #endif /* __FreeBSD__ || __NetBSD__ */ #ifdef ALTQ_CBQ /* cbq is enabled by ALTQ_CBQ option in opt_altq.h */ #include #include #include #include #include #include #include #ifdef ALTQ3_COMPAT #include #endif #include +#include #ifdef ALTQ3_COMPAT #include #include #include #endif +#include #include #include #include #include #include /* * Local Macros */ #define reset_cutoff(ifd) { ifd->cutoff_ = RM_MAXDEPTH; } /* * Local routines. */ static int rmc_satisfied(struct rm_class *, struct timeval *); static void rmc_wrr_set_weights(struct rm_ifdat *); static void rmc_depth_compute(struct rm_class *); static void rmc_depth_recompute(rm_class_t *); static mbuf_t *_rmc_wrr_dequeue_next(struct rm_ifdat *, int); static mbuf_t *_rmc_prr_dequeue_next(struct rm_ifdat *, int); static int _rmc_addq(rm_class_t *, mbuf_t *); static void _rmc_dropq(rm_class_t *); static mbuf_t *_rmc_getq(rm_class_t *); static mbuf_t *_rmc_pollq(rm_class_t *); static int rmc_under_limit(struct rm_class *, struct timeval *); static void rmc_tl_satisfied(struct rm_ifdat *, struct timeval *); static void rmc_drop_action(struct rm_class *); static void rmc_restart(struct rm_class *); static void rmc_root_overlimit(struct rm_class *, struct rm_class *); #define BORROW_OFFTIME /* * BORROW_OFFTIME (experimental): * borrow the offtime of the class borrowing from. * the reason is that when its own offtime is set, the class is unable * to borrow much, especially when cutoff is taking effect. * but when the borrowed class is overloaded (advidle is close to minidle), * use the borrowing class's offtime to avoid overload. */ #define ADJUST_CUTOFF /* * ADJUST_CUTOFF (experimental): * if no underlimit class is found due to cutoff, increase cutoff and * retry the scheduling loop. * also, don't invoke delay_actions while cutoff is taking effect, * since a sleeping class won't have a chance to be scheduled in the * next loop. * * now heuristics for setting the top-level variable (cutoff_) becomes: * 1. if a packet arrives for a not-overlimit class, set cutoff * to the depth of the class. * 2. if cutoff is i, and a packet arrives for an overlimit class * with an underlimit ancestor at a lower level than i (say j), * then set cutoff to j. * 3. at scheduling a packet, if there is no underlimit class * due to the current cutoff level, increase cutoff by 1 and * then try to schedule again. */ /* * rm_class_t * * rmc_newclass(...) - Create a new resource management class at priority * 'pri' on the interface given by 'ifd'. * * nsecPerByte is the data rate of the interface in nanoseconds/byte. * E.g., 800 for a 10Mb/s ethernet. If the class gets less * than 100% of the bandwidth, this number should be the * 'effective' rate for the class. Let f be the * bandwidth fraction allocated to this class, and let * nsPerByte be the data rate of the output link in * nanoseconds/byte. Then nsecPerByte is set to * nsPerByte / f. E.g., 1600 (= 800 / .5) * for a class that gets 50% of an ethernet's bandwidth. * * action the routine to call when the class is over limit. * * maxq max allowable queue size for class (in packets). * * parent parent class pointer. * * borrow class to borrow from (should be either 'parent' or null). * * maxidle max value allowed for class 'idle' time estimate (this * parameter determines how large an initial burst of packets * can be before overlimit action is invoked. * * offtime how long 'delay' action will delay when class goes over * limit (this parameter determines the steady-state burst * size when a class is running over its limit). * * Maxidle and offtime have to be computed from the following: If the * average packet size is s, the bandwidth fraction allocated to this * class is f, we want to allow b packet bursts, and the gain of the * averaging filter is g (= 1 - 2^(-RM_FILTER_GAIN)), then: * * ptime = s * nsPerByte * (1 - f) / f * maxidle = ptime * (1 - g^b) / g^b * minidle = -ptime * (1 / (f - 1)) * offtime = ptime * (1 + 1/(1 - g) * (1 - g^(b - 1)) / g^(b - 1) * * Operationally, it's convenient to specify maxidle & offtime in units * independent of the link bandwidth so the maxidle & offtime passed to * this routine are the above values multiplied by 8*f/(1000*nsPerByte). * (The constant factor is a scale factor needed to make the parameters * integers. This scaling also means that the 'unscaled' values of * maxidle*nsecPerByte/8 and offtime*nsecPerByte/8 will be in microseconds, * not nanoseconds.) Also note that the 'idle' filter computation keeps * an estimate scaled upward by 2^RM_FILTER_GAIN so the passed value of * maxidle also must be scaled upward by this value. Thus, the passed * values for maxidle and offtime can be computed as follows: * * maxidle = maxidle * 2^RM_FILTER_GAIN * 8 / (1000 * nsecPerByte) * offtime = offtime * 8 / (1000 * nsecPerByte) * * When USE_HRTIME is employed, then maxidle and offtime become: * maxidle = maxilde * (8.0 / nsecPerByte); * offtime = offtime * (8.0 / nsecPerByte); */ struct rm_class * rmc_newclass(int pri, struct rm_ifdat *ifd, u_int nsecPerByte, void (*action)(rm_class_t *, rm_class_t *), int maxq, struct rm_class *parent, struct rm_class *borrow, u_int maxidle, int minidle, u_int offtime, int pktsize, int flags) { struct rm_class *cl; struct rm_class *peer; int s; if (pri >= RM_MAXPRIO) return (NULL); #ifndef ALTQ_RED if (flags & RMCF_RED) { #ifdef ALTQ_DEBUG printf("rmc_newclass: RED not configured for CBQ!\n"); #endif return (NULL); } #endif #ifndef ALTQ_RIO if (flags & RMCF_RIO) { #ifdef ALTQ_DEBUG printf("rmc_newclass: RIO not configured for CBQ!\n"); #endif return (NULL); } #endif cl = malloc(sizeof(struct rm_class), M_DEVBUF, M_NOWAIT | M_ZERO); if (cl == NULL) return (NULL); CALLOUT_INIT(&cl->callout_); cl->q_ = malloc(sizeof(class_queue_t), M_DEVBUF, M_NOWAIT | M_ZERO); if (cl->q_ == NULL) { free(cl, M_DEVBUF); return (NULL); } /* * Class initialization. */ cl->children_ = NULL; cl->parent_ = parent; cl->borrow_ = borrow; cl->leaf_ = 1; cl->ifdat_ = ifd; cl->pri_ = pri; cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */ cl->depth_ = 0; cl->qthresh_ = 0; cl->ns_per_byte_ = nsecPerByte; qlimit(cl->q_) = maxq; qtype(cl->q_) = Q_DROPHEAD; qlen(cl->q_) = 0; cl->flags_ = flags; #if 1 /* minidle is also scaled in ALTQ */ cl->minidle_ = (minidle * (int)nsecPerByte) / 8; if (cl->minidle_ > 0) cl->minidle_ = 0; #else cl->minidle_ = minidle; #endif cl->maxidle_ = (maxidle * nsecPerByte) / 8; if (cl->maxidle_ == 0) cl->maxidle_ = 1; #if 1 /* offtime is also scaled in ALTQ */ cl->avgidle_ = cl->maxidle_; cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN; if (cl->offtime_ == 0) cl->offtime_ = 1; #else cl->avgidle_ = 0; cl->offtime_ = (offtime * nsecPerByte) / 8; #endif cl->overlimit = action; #ifdef ALTQ_RED if (flags & (RMCF_RED|RMCF_RIO)) { int red_flags, red_pkttime; red_flags = 0; if (flags & RMCF_ECN) red_flags |= REDF_ECN; if (flags & RMCF_FLOWVALVE) red_flags |= REDF_FLOWVALVE; #ifdef ALTQ_RIO if (flags & RMCF_CLEARDSCP) red_flags |= RIOF_CLEARDSCP; #endif red_pkttime = nsecPerByte * pktsize / 1000; if (flags & RMCF_RED) { cl->red_ = red_alloc(0, 0, qlimit(cl->q_) * 10/100, qlimit(cl->q_) * 30/100, red_flags, red_pkttime); if (cl->red_ != NULL) qtype(cl->q_) = Q_RED; } #ifdef ALTQ_RIO else { cl->red_ = (red_t *)rio_alloc(0, NULL, red_flags, red_pkttime); if (cl->red_ != NULL) qtype(cl->q_) = Q_RIO; } #endif } #endif /* ALTQ_RED */ /* * put the class into the class tree */ #ifdef __NetBSD__ s = splnet(); #else s = splimp(); #endif IFQ_LOCK(ifd->ifq_); if ((peer = ifd->active_[pri]) != NULL) { /* find the last class at this pri */ cl->peer_ = peer; while (peer->peer_ != ifd->active_[pri]) peer = peer->peer_; peer->peer_ = cl; } else { ifd->active_[pri] = cl; cl->peer_ = cl; } if (cl->parent_) { cl->next_ = parent->children_; parent->children_ = cl; parent->leaf_ = 0; } /* * Compute the depth of this class and its ancestors in the class * hierarchy. */ rmc_depth_compute(cl); /* * If CBQ's WRR is enabled, then initialize the class WRR state. */ if (ifd->wrr_) { ifd->num_[pri]++; ifd->alloc_[pri] += cl->allotment_; rmc_wrr_set_weights(ifd); } IFQ_UNLOCK(ifd->ifq_); splx(s); return (cl); } int rmc_modclass(struct rm_class *cl, u_int nsecPerByte, int maxq, u_int maxidle, int minidle, u_int offtime, int pktsize) { struct rm_ifdat *ifd; u_int old_allotment; int s; ifd = cl->ifdat_; old_allotment = cl->allotment_; #ifdef __NetBSD__ s = splnet(); #else s = splimp(); #endif IFQ_LOCK(ifd->ifq_); cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */ cl->qthresh_ = 0; cl->ns_per_byte_ = nsecPerByte; qlimit(cl->q_) = maxq; #if 1 /* minidle is also scaled in ALTQ */ cl->minidle_ = (minidle * nsecPerByte) / 8; if (cl->minidle_ > 0) cl->minidle_ = 0; #else cl->minidle_ = minidle; #endif cl->maxidle_ = (maxidle * nsecPerByte) / 8; if (cl->maxidle_ == 0) cl->maxidle_ = 1; #if 1 /* offtime is also scaled in ALTQ */ cl->avgidle_ = cl->maxidle_; cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN; if (cl->offtime_ == 0) cl->offtime_ = 1; #else cl->avgidle_ = 0; cl->offtime_ = (offtime * nsecPerByte) / 8; #endif /* * If CBQ's WRR is enabled, then initialize the class WRR state. */ if (ifd->wrr_) { ifd->alloc_[cl->pri_] += cl->allotment_ - old_allotment; rmc_wrr_set_weights(ifd); } IFQ_UNLOCK(ifd->ifq_); splx(s); return (0); } /* * static void * rmc_wrr_set_weights(struct rm_ifdat *ifdat) - This function computes * the appropriate run robin weights for the CBQ weighted round robin * algorithm. * * Returns: NONE */ static void rmc_wrr_set_weights(struct rm_ifdat *ifd) { int i; struct rm_class *cl, *clh; for (i = 0; i < RM_MAXPRIO; i++) { /* * This is inverted from that of the simulator to * maintain precision. */ if (ifd->num_[i] == 0) ifd->M_[i] = 0; else ifd->M_[i] = ifd->alloc_[i] / (ifd->num_[i] * ifd->maxpkt_); /* * Compute the weighted allotment for each class. * This takes the expensive div instruction out * of the main loop for the wrr scheduling path. * These only get recomputed when a class comes or * goes. */ if (ifd->active_[i] != NULL) { clh = cl = ifd->active_[i]; do { /* safe-guard for slow link or alloc_ == 0 */ if (ifd->M_[i] == 0) cl->w_allotment_ = 0; else cl->w_allotment_ = cl->allotment_ / ifd->M_[i]; cl = cl->peer_; } while ((cl != NULL) && (cl != clh)); } } } int rmc_get_weight(struct rm_ifdat *ifd, int pri) { if ((pri >= 0) && (pri < RM_MAXPRIO)) return (ifd->M_[pri]); else return (0); } /* * static void * rmc_depth_compute(struct rm_class *cl) - This function computes the * appropriate depth of class 'cl' and its ancestors. * * Returns: NONE */ static void rmc_depth_compute(struct rm_class *cl) { rm_class_t *t = cl, *p; /* * Recompute the depth for the branch of the tree. */ while (t != NULL) { p = t->parent_; if (p && (t->depth_ >= p->depth_)) { p->depth_ = t->depth_ + 1; t = p; } else t = NULL; } } /* * static void * rmc_depth_recompute(struct rm_class *cl) - This function re-computes * the depth of the tree after a class has been deleted. * * Returns: NONE */ static void rmc_depth_recompute(rm_class_t *cl) { #if 1 /* ALTQ */ rm_class_t *p, *t; p = cl; while (p != NULL) { if ((t = p->children_) == NULL) { p->depth_ = 0; } else { int cdepth = 0; while (t != NULL) { if (t->depth_ > cdepth) cdepth = t->depth_; t = t->next_; } if (p->depth_ == cdepth + 1) /* no change to this parent */ return; p->depth_ = cdepth + 1; } p = p->parent_; } #else rm_class_t *t; if (cl->depth_ >= 1) { if (cl->children_ == NULL) { cl->depth_ = 0; } else if ((t = cl->children_) != NULL) { while (t != NULL) { if (t->children_ != NULL) rmc_depth_recompute(t); t = t->next_; } } else rmc_depth_compute(cl); } #endif } /* * void * rmc_delete_class(struct rm_ifdat *ifdat, struct rm_class *cl) - This * function deletes a class from the link-sharing structure and frees * all resources associated with the class. * * Returns: NONE */ void rmc_delete_class(struct rm_ifdat *ifd, struct rm_class *cl) { struct rm_class *p, *head, *previous; int s; ASSERT(cl->children_ == NULL); if (cl->sleeping_) CALLOUT_STOP(&cl->callout_); #ifdef __NetBSD__ s = splnet(); #else s = splimp(); #endif IFQ_LOCK(ifd->ifq_); /* * Free packets in the packet queue. * XXX - this may not be a desired behavior. Packets should be * re-queued. */ rmc_dropall(cl); /* * If the class has a parent, then remove the class from the * class from the parent's children chain. */ if (cl->parent_ != NULL) { head = cl->parent_->children_; p = previous = head; if (head->next_ == NULL) { ASSERT(head == cl); cl->parent_->children_ = NULL; cl->parent_->leaf_ = 1; } else while (p != NULL) { if (p == cl) { if (cl == head) cl->parent_->children_ = cl->next_; else previous->next_ = cl->next_; cl->next_ = NULL; p = NULL; } else { previous = p; p = p->next_; } } } /* * Delete class from class priority peer list. */ if ((p = ifd->active_[cl->pri_]) != NULL) { /* * If there is more than one member of this priority * level, then look for class(cl) in the priority level. */ if (p != p->peer_) { while (p->peer_ != cl) p = p->peer_; p->peer_ = cl->peer_; if (ifd->active_[cl->pri_] == cl) ifd->active_[cl->pri_] = cl->peer_; } else { ASSERT(p == cl); ifd->active_[cl->pri_] = NULL; } } /* * Recompute the WRR weights. */ if (ifd->wrr_) { ifd->alloc_[cl->pri_] -= cl->allotment_; ifd->num_[cl->pri_]--; rmc_wrr_set_weights(ifd); } /* * Re-compute the depth of the tree. */ #if 1 /* ALTQ */ rmc_depth_recompute(cl->parent_); #else rmc_depth_recompute(ifd->root_); #endif IFQ_UNLOCK(ifd->ifq_); splx(s); /* * Free the class structure. */ if (cl->red_ != NULL) { #ifdef ALTQ_RIO if (q_is_rio(cl->q_)) rio_destroy((rio_t *)cl->red_); #endif #ifdef ALTQ_RED if (q_is_red(cl->q_)) red_destroy(cl->red_); #endif } free(cl->q_, M_DEVBUF); free(cl, M_DEVBUF); } /* * void * rmc_init(...) - Initialize the resource management data structures * associated with the output portion of interface 'ifp'. 'ifd' is * where the structures will be built (for backwards compatibility, the * structures aren't kept in the ifnet struct). 'nsecPerByte' * gives the link speed (inverse of bandwidth) in nanoseconds/byte. * 'restart' is the driver-specific routine that the generic 'delay * until under limit' action will call to restart output. `maxq' * is the queue size of the 'link' & 'default' classes. 'maxqueued' * is the maximum number of packets that the resource management * code will allow to be queued 'downstream' (this is typically 1). * * Returns: NONE */ void rmc_init(struct ifaltq *ifq, struct rm_ifdat *ifd, u_int nsecPerByte, void (*restart)(struct ifaltq *), int maxq, int maxqueued, u_int maxidle, int minidle, u_int offtime, int flags) { int i, mtu; /* * Initialize the CBQ tracing/debug facility. */ CBQTRACEINIT(); bzero((char *)ifd, sizeof (*ifd)); mtu = ifq->altq_ifp->if_mtu; ifd->ifq_ = ifq; ifd->restart = restart; ifd->maxqueued_ = maxqueued; ifd->ns_per_byte_ = nsecPerByte; ifd->maxpkt_ = mtu; ifd->wrr_ = (flags & RMCF_WRR) ? 1 : 0; ifd->efficient_ = (flags & RMCF_EFFICIENT) ? 1 : 0; #if 1 ifd->maxiftime_ = mtu * nsecPerByte / 1000 * 16; if (mtu * nsecPerByte > 10 * 1000000) ifd->maxiftime_ /= 4; #endif reset_cutoff(ifd); CBQTRACE(rmc_init, 'INIT', ifd->cutoff_); /* * Initialize the CBQ's WRR state. */ for (i = 0; i < RM_MAXPRIO; i++) { ifd->alloc_[i] = 0; ifd->M_[i] = 0; ifd->num_[i] = 0; ifd->na_[i] = 0; ifd->active_[i] = NULL; } /* * Initialize current packet state. */ ifd->qi_ = 0; ifd->qo_ = 0; for (i = 0; i < RM_MAXQUEUED; i++) { ifd->class_[i] = NULL; ifd->curlen_[i] = 0; ifd->borrowed_[i] = NULL; } /* * Create the root class of the link-sharing structure. */ if ((ifd->root_ = rmc_newclass(0, ifd, nsecPerByte, rmc_root_overlimit, maxq, 0, 0, maxidle, minidle, offtime, 0, 0)) == NULL) { printf("rmc_init: root class not allocated\n"); return ; } ifd->root_->depth_ = 0; } /* * void * rmc_queue_packet(struct rm_class *cl, mbuf_t *m) - Add packet given by * mbuf 'm' to queue for resource class 'cl'. This routine is called * by a driver's if_output routine. This routine must be called with * output packet completion interrupts locked out (to avoid racing with * rmc_dequeue_next). * * Returns: 0 on successful queueing * -1 when packet drop occurs */ int rmc_queue_packet(struct rm_class *cl, mbuf_t *m) { struct timeval now; struct rm_ifdat *ifd = cl->ifdat_; int cpri = cl->pri_; int is_empty = qempty(cl->q_); RM_GETTIME(now); if (ifd->cutoff_ > 0) { if (TV_LT(&cl->undertime_, &now)) { if (ifd->cutoff_ > cl->depth_) ifd->cutoff_ = cl->depth_; CBQTRACE(rmc_queue_packet, 'ffoc', cl->depth_); } #if 1 /* ALTQ */ else { /* * the class is overlimit. if the class has * underlimit ancestors, set cutoff to the lowest * depth among them. */ struct rm_class *borrow = cl->borrow_; while (borrow != NULL && borrow->depth_ < ifd->cutoff_) { if (TV_LT(&borrow->undertime_, &now)) { ifd->cutoff_ = borrow->depth_; CBQTRACE(rmc_queue_packet, 'ffob', ifd->cutoff_); break; } borrow = borrow->borrow_; } } #else /* !ALTQ */ else if ((ifd->cutoff_ > 1) && cl->borrow_) { if (TV_LT(&cl->borrow_->undertime_, &now)) { ifd->cutoff_ = cl->borrow_->depth_; CBQTRACE(rmc_queue_packet, 'ffob', cl->borrow_->depth_); } } #endif /* !ALTQ */ } if (_rmc_addq(cl, m) < 0) /* failed */ return (-1); if (is_empty) { CBQTRACE(rmc_queue_packet, 'ytpe', cl->stats_.handle); ifd->na_[cpri]++; } if (qlen(cl->q_) > qlimit(cl->q_)) { /* note: qlimit can be set to 0 or 1 */ rmc_drop_action(cl); return (-1); } return (0); } /* * void * rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now) - Check all * classes to see if there are satified. */ static void rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now) { int i; rm_class_t *p, *bp; for (i = RM_MAXPRIO - 1; i >= 0; i--) { if ((bp = ifd->active_[i]) != NULL) { p = bp; do { if (!rmc_satisfied(p, now)) { ifd->cutoff_ = p->depth_; return; } p = p->peer_; } while (p != bp); } } reset_cutoff(ifd); } /* * rmc_satisfied - Return 1 of the class is satisfied. O, otherwise. */ static int rmc_satisfied(struct rm_class *cl, struct timeval *now) { rm_class_t *p; if (cl == NULL) return (1); if (TV_LT(now, &cl->undertime_)) return (1); if (cl->depth_ == 0) { if (!cl->sleeping_ && (qlen(cl->q_) > cl->qthresh_)) return (0); else return (1); } if (cl->children_ != NULL) { p = cl->children_; while (p != NULL) { if (!rmc_satisfied(p, now)) return (0); p = p->next_; } } return (1); } /* * Return 1 if class 'cl' is under limit or can borrow from a parent, * 0 if overlimit. As a side-effect, this routine will invoke the * class overlimit action if the class if overlimit. */ static int rmc_under_limit(struct rm_class *cl, struct timeval *now) { rm_class_t *p = cl; rm_class_t *top; struct rm_ifdat *ifd = cl->ifdat_; ifd->borrowed_[ifd->qi_] = NULL; /* * If cl is the root class, then always return that it is * underlimit. Otherwise, check to see if the class is underlimit. */ if (cl->parent_ == NULL) return (1); if (cl->sleeping_) { if (TV_LT(now, &cl->undertime_)) return (0); CALLOUT_STOP(&cl->callout_); cl->sleeping_ = 0; cl->undertime_.tv_sec = 0; return (1); } top = NULL; while (cl->undertime_.tv_sec && TV_LT(now, &cl->undertime_)) { if (((cl = cl->borrow_) == NULL) || (cl->depth_ > ifd->cutoff_)) { #ifdef ADJUST_CUTOFF if (cl != NULL) /* cutoff is taking effect, just return false without calling the delay action. */ return (0); #endif #ifdef BORROW_OFFTIME /* * check if the class can borrow offtime too. * borrow offtime from the top of the borrow * chain if the top class is not overloaded. */ if (cl != NULL) { /* cutoff is taking effect, use this class as top. */ top = cl; CBQTRACE(rmc_under_limit, 'ffou', ifd->cutoff_); } if (top != NULL && top->avgidle_ == top->minidle_) top = NULL; p->overtime_ = *now; (p->overlimit)(p, top); #else p->overtime_ = *now; (p->overlimit)(p, NULL); #endif return (0); } top = cl; } if (cl != p) ifd->borrowed_[ifd->qi_] = cl; return (1); } /* * _rmc_wrr_dequeue_next() - This is scheduler for WRR as opposed to * Packet-by-packet round robin. * * The heart of the weighted round-robin scheduler, which decides which * class next gets to send a packet. Highest priority first, then * weighted round-robin within priorites. * * Each able-to-send class gets to send until its byte allocation is * exhausted. Thus, the active pointer is only changed after a class has * exhausted its allocation. * * If the scheduler finds no class that is underlimit or able to borrow, * then the first class found that had a nonzero queue and is allowed to * borrow gets to send. */ static mbuf_t * _rmc_wrr_dequeue_next(struct rm_ifdat *ifd, int op) { struct rm_class *cl = NULL, *first = NULL; u_int deficit; int cpri; mbuf_t *m; struct timeval now; RM_GETTIME(now); /* * if the driver polls the top of the queue and then removes * the polled packet, we must return the same packet. */ if (op == ALTDQ_REMOVE && ifd->pollcache_) { cl = ifd->pollcache_; cpri = cl->pri_; if (ifd->efficient_) { /* check if this class is overlimit */ if (cl->undertime_.tv_sec != 0 && rmc_under_limit(cl, &now) == 0) first = cl; } ifd->pollcache_ = NULL; goto _wrr_out; } else { /* mode == ALTDQ_POLL || pollcache == NULL */ ifd->pollcache_ = NULL; ifd->borrowed_[ifd->qi_] = NULL; } #ifdef ADJUST_CUTOFF _again: #endif for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) { if (ifd->na_[cpri] == 0) continue; deficit = 0; /* * Loop through twice for a priority level, if some class * was unable to send a packet the first round because * of the weighted round-robin mechanism. * During the second loop at this level, deficit==2. * (This second loop is not needed if for every class, * "M[cl->pri_])" times "cl->allotment" is greater than * the byte size for the largest packet in the class.) */ _wrr_loop: cl = ifd->active_[cpri]; ASSERT(cl != NULL); do { if ((deficit < 2) && (cl->bytes_alloc_ <= 0)) cl->bytes_alloc_ += cl->w_allotment_; if (!qempty(cl->q_)) { if ((cl->undertime_.tv_sec == 0) || rmc_under_limit(cl, &now)) { if (cl->bytes_alloc_ > 0 || deficit > 1) goto _wrr_out; /* underlimit but no alloc */ deficit = 1; #if 1 ifd->borrowed_[ifd->qi_] = NULL; #endif } else if (first == NULL && cl->borrow_ != NULL) first = cl; /* borrowing candidate */ } cl->bytes_alloc_ = 0; cl = cl->peer_; } while (cl != ifd->active_[cpri]); if (deficit == 1) { /* first loop found an underlimit class with deficit */ /* Loop on same priority level, with new deficit. */ deficit = 2; goto _wrr_loop; } } #ifdef ADJUST_CUTOFF /* * no underlimit class found. if cutoff is taking effect, * increase cutoff and try again. */ if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) { ifd->cutoff_++; CBQTRACE(_rmc_wrr_dequeue_next, 'ojda', ifd->cutoff_); goto _again; } #endif /* ADJUST_CUTOFF */ /* * If LINK_EFFICIENCY is turned on, then the first overlimit * class we encounter will send a packet if all the classes * of the link-sharing structure are overlimit. */ reset_cutoff(ifd); CBQTRACE(_rmc_wrr_dequeue_next, 'otsr', ifd->cutoff_); if (!ifd->efficient_ || first == NULL) return (NULL); cl = first; cpri = cl->pri_; #if 0 /* too time-consuming for nothing */ if (cl->sleeping_) CALLOUT_STOP(&cl->callout_); cl->sleeping_ = 0; cl->undertime_.tv_sec = 0; #endif ifd->borrowed_[ifd->qi_] = cl->borrow_; ifd->cutoff_ = cl->borrow_->depth_; /* * Deque the packet and do the book keeping... */ _wrr_out: if (op == ALTDQ_REMOVE) { m = _rmc_getq(cl); if (m == NULL) panic("_rmc_wrr_dequeue_next"); if (qempty(cl->q_)) ifd->na_[cpri]--; /* * Update class statistics and link data. */ if (cl->bytes_alloc_ > 0) cl->bytes_alloc_ -= m_pktlen(m); if ((cl->bytes_alloc_ <= 0) || first == cl) ifd->active_[cl->pri_] = cl->peer_; else ifd->active_[cl->pri_] = cl; ifd->class_[ifd->qi_] = cl; ifd->curlen_[ifd->qi_] = m_pktlen(m); ifd->now_[ifd->qi_] = now; ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_; ifd->queued_++; } else { /* mode == ALTDQ_PPOLL */ m = _rmc_pollq(cl); ifd->pollcache_ = cl; } return (m); } /* * Dequeue & return next packet from the highest priority class that * has a packet to send & has enough allocation to send it. This * routine is called by a driver whenever it needs a new packet to * output. */ static mbuf_t * _rmc_prr_dequeue_next(struct rm_ifdat *ifd, int op) { mbuf_t *m; int cpri; struct rm_class *cl, *first = NULL; struct timeval now; RM_GETTIME(now); /* * if the driver polls the top of the queue and then removes * the polled packet, we must return the same packet. */ if (op == ALTDQ_REMOVE && ifd->pollcache_) { cl = ifd->pollcache_; cpri = cl->pri_; ifd->pollcache_ = NULL; goto _prr_out; } else { /* mode == ALTDQ_POLL || pollcache == NULL */ ifd->pollcache_ = NULL; ifd->borrowed_[ifd->qi_] = NULL; } #ifdef ADJUST_CUTOFF _again: #endif for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) { if (ifd->na_[cpri] == 0) continue; cl = ifd->active_[cpri]; ASSERT(cl != NULL); do { if (!qempty(cl->q_)) { if ((cl->undertime_.tv_sec == 0) || rmc_under_limit(cl, &now)) goto _prr_out; if (first == NULL && cl->borrow_ != NULL) first = cl; } cl = cl->peer_; } while (cl != ifd->active_[cpri]); } #ifdef ADJUST_CUTOFF /* * no underlimit class found. if cutoff is taking effect, increase * cutoff and try again. */ if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) { ifd->cutoff_++; goto _again; } #endif /* ADJUST_CUTOFF */ /* * If LINK_EFFICIENCY is turned on, then the first overlimit * class we encounter will send a packet if all the classes * of the link-sharing structure are overlimit. */ reset_cutoff(ifd); if (!ifd->efficient_ || first == NULL) return (NULL); cl = first; cpri = cl->pri_; #if 0 /* too time-consuming for nothing */ if (cl->sleeping_) CALLOUT_STOP(&cl->callout_); cl->sleeping_ = 0; cl->undertime_.tv_sec = 0; #endif ifd->borrowed_[ifd->qi_] = cl->borrow_; ifd->cutoff_ = cl->borrow_->depth_; /* * Deque the packet and do the book keeping... */ _prr_out: if (op == ALTDQ_REMOVE) { m = _rmc_getq(cl); if (m == NULL) panic("_rmc_prr_dequeue_next"); if (qempty(cl->q_)) ifd->na_[cpri]--; ifd->active_[cpri] = cl->peer_; ifd->class_[ifd->qi_] = cl; ifd->curlen_[ifd->qi_] = m_pktlen(m); ifd->now_[ifd->qi_] = now; ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_; ifd->queued_++; } else { /* mode == ALTDQ_POLL */ m = _rmc_pollq(cl); ifd->pollcache_ = cl; } return (m); } /* * mbuf_t * * rmc_dequeue_next(struct rm_ifdat *ifd, struct timeval *now) - this function * is invoked by the packet driver to get the next packet to be * dequeued and output on the link. If WRR is enabled, then the * WRR dequeue next routine will determine the next packet to sent. * Otherwise, packet-by-packet round robin is invoked. * * Returns: NULL, if a packet is not available or if all * classes are overlimit. * * Otherwise, Pointer to the next packet. */ mbuf_t * rmc_dequeue_next(struct rm_ifdat *ifd, int mode) { if (ifd->queued_ >= ifd->maxqueued_) return (NULL); else if (ifd->wrr_) return (_rmc_wrr_dequeue_next(ifd, mode)); else return (_rmc_prr_dequeue_next(ifd, mode)); } /* * Update the utilization estimate for the packet that just completed. * The packet's class & the parent(s) of that class all get their * estimators updated. This routine is called by the driver's output- * packet-completion interrupt service routine. */ /* * a macro to approximate "divide by 1000" that gives 0.000999, * if a value has enough effective digits. * (on pentium, mul takes 9 cycles but div takes 46!) */ #define NSEC_TO_USEC(t) (((t) >> 10) + ((t) >> 16) + ((t) >> 17)) void rmc_update_class_util(struct rm_ifdat *ifd) { int idle, avgidle, pktlen; int pkt_time, tidle; rm_class_t *cl, *borrowed; rm_class_t *borrows; struct timeval *nowp; /* * Get the most recent completed class. */ if ((cl = ifd->class_[ifd->qo_]) == NULL) return; pktlen = ifd->curlen_[ifd->qo_]; borrowed = ifd->borrowed_[ifd->qo_]; borrows = borrowed; PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen); /* * Run estimator on class and its ancestors. */ /* * rm_update_class_util is designed to be called when the * transfer is completed from a xmit complete interrupt, * but most drivers don't implement an upcall for that. * so, just use estimated completion time. * as a result, ifd->qi_ and ifd->qo_ are always synced. */ nowp = &ifd->now_[ifd->qo_]; /* get pkt_time (for link) in usec */ #if 1 /* use approximation */ pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_; pkt_time = NSEC_TO_USEC(pkt_time); #else pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_ / 1000; #endif #if 1 /* ALTQ4PPP */ if (TV_LT(nowp, &ifd->ifnow_)) { int iftime; /* * make sure the estimated completion time does not go * too far. it can happen when the link layer supports * data compression or the interface speed is set to * a much lower value. */ TV_DELTA(&ifd->ifnow_, nowp, iftime); if (iftime+pkt_time < ifd->maxiftime_) { TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_); } else { TV_ADD_DELTA(nowp, ifd->maxiftime_, &ifd->ifnow_); } } else { TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_); } #else if (TV_LT(nowp, &ifd->ifnow_)) { TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_); } else { TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_); } #endif while (cl != NULL) { TV_DELTA(&ifd->ifnow_, &cl->last_, idle); if (idle >= 2000000) /* * this class is idle enough, reset avgidle. * (TV_DELTA returns 2000000 us when delta is large.) */ cl->avgidle_ = cl->maxidle_; /* get pkt_time (for class) in usec */ #if 1 /* use approximation */ pkt_time = pktlen * cl->ns_per_byte_; pkt_time = NSEC_TO_USEC(pkt_time); #else pkt_time = pktlen * cl->ns_per_byte_ / 1000; #endif idle -= pkt_time; avgidle = cl->avgidle_; avgidle += idle - (avgidle >> RM_FILTER_GAIN); cl->avgidle_ = avgidle; /* Are we overlimit ? */ if (avgidle <= 0) { CBQTRACE(rmc_update_class_util, 'milo', cl->stats_.handle); #if 1 /* ALTQ */ /* * need some lower bound for avgidle, otherwise * a borrowing class gets unbounded penalty. */ if (avgidle < cl->minidle_) avgidle = cl->avgidle_ = cl->minidle_; #endif /* set next idle to make avgidle 0 */ tidle = pkt_time + (((1 - RM_POWER) * avgidle) >> RM_FILTER_GAIN); TV_ADD_DELTA(nowp, tidle, &cl->undertime_); ++cl->stats_.over; } else { cl->avgidle_ = (avgidle > cl->maxidle_) ? cl->maxidle_ : avgidle; cl->undertime_.tv_sec = 0; if (cl->sleeping_) { CALLOUT_STOP(&cl->callout_); cl->sleeping_ = 0; } } if (borrows != NULL) { if (borrows != cl) ++cl->stats_.borrows; else borrows = NULL; } cl->last_ = ifd->ifnow_; cl->last_pkttime_ = pkt_time; #if 1 if (cl->parent_ == NULL) { /* take stats of root class */ PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen); } #endif cl = cl->parent_; } /* * Check to see if cutoff needs to set to a new level. */ cl = ifd->class_[ifd->qo_]; if (borrowed && (ifd->cutoff_ >= borrowed->depth_)) { #if 1 /* ALTQ */ if ((qlen(cl->q_) <= 0) || TV_LT(nowp, &borrowed->undertime_)) { rmc_tl_satisfied(ifd, nowp); CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_); } else { ifd->cutoff_ = borrowed->depth_; CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_); } #else /* !ALTQ */ if ((qlen(cl->q_) <= 1) || TV_LT(&now, &borrowed->undertime_)) { reset_cutoff(ifd); #ifdef notdef rmc_tl_satisfied(ifd, &now); #endif CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_); } else { ifd->cutoff_ = borrowed->depth_; CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_); } #endif /* !ALTQ */ } /* * Release class slot */ ifd->borrowed_[ifd->qo_] = NULL; ifd->class_[ifd->qo_] = NULL; ifd->qo_ = (ifd->qo_ + 1) % ifd->maxqueued_; ifd->queued_--; } /* * void * rmc_drop_action(struct rm_class *cl) - Generic (not protocol-specific) * over-limit action routines. These get invoked by rmc_under_limit() * if a class with packets to send if over its bandwidth limit & can't * borrow from a parent class. * * Returns: NONE */ static void rmc_drop_action(struct rm_class *cl) { struct rm_ifdat *ifd = cl->ifdat_; ASSERT(qlen(cl->q_) > 0); _rmc_dropq(cl); if (qempty(cl->q_)) ifd->na_[cl->pri_]--; } void rmc_dropall(struct rm_class *cl) { struct rm_ifdat *ifd = cl->ifdat_; if (!qempty(cl->q_)) { _flushq(cl->q_); ifd->na_[cl->pri_]--; } } #if (__FreeBSD_version > 300000) /* hzto() is removed from FreeBSD-3.0 */ static int hzto(struct timeval *); static int hzto(tv) struct timeval *tv; { struct timeval t2; getmicrotime(&t2); t2.tv_sec = tv->tv_sec - t2.tv_sec; t2.tv_usec = tv->tv_usec - t2.tv_usec; return (tvtohz(&t2)); } #endif /* __FreeBSD_version > 300000 */ /* * void * rmc_delay_action(struct rm_class *cl) - This function is the generic CBQ * delay action routine. It is invoked via rmc_under_limit when the * packet is discoverd to be overlimit. * * If the delay action is result of borrow class being overlimit, then * delay for the offtime of the borrowing class that is overlimit. * * Returns: NONE */ void rmc_delay_action(struct rm_class *cl, struct rm_class *borrow) { int delay, t, extradelay; cl->stats_.overactions++; TV_DELTA(&cl->undertime_, &cl->overtime_, delay); #ifndef BORROW_OFFTIME delay += cl->offtime_; #endif if (!cl->sleeping_) { CBQTRACE(rmc_delay_action, 'yled', cl->stats_.handle); #ifdef BORROW_OFFTIME if (borrow != NULL) extradelay = borrow->offtime_; else #endif extradelay = cl->offtime_; #ifdef ALTQ /* * XXX recalculate suspend time: * current undertime is (tidle + pkt_time) calculated * from the last transmission. * tidle: time required to bring avgidle back to 0 * pkt_time: target waiting time for this class * we need to replace pkt_time by offtime */ extradelay -= cl->last_pkttime_; #endif if (extradelay > 0) { TV_ADD_DELTA(&cl->undertime_, extradelay, &cl->undertime_); delay += extradelay; } cl->sleeping_ = 1; cl->stats_.delays++; /* * Since packets are phased randomly with respect to the * clock, 1 tick (the next clock tick) can be an arbitrarily * short time so we have to wait for at least two ticks. * NOTE: If there's no other traffic, we need the timer as * a 'backstop' to restart this class. */ if (delay > tick * 2) { #ifdef __FreeBSD__ /* FreeBSD rounds up the tick */ t = hzto(&cl->undertime_); #else /* other BSDs round down the tick */ t = hzto(&cl->undertime_) + 1; #endif } else t = 2; CALLOUT_RESET(&cl->callout_, t, (timeout_t *)rmc_restart, (caddr_t)cl); } } /* * void * rmc_restart() - is just a helper routine for rmc_delay_action -- it is * called by the system timer code & is responsible checking if the * class is still sleeping (it might have been restarted as a side * effect of the queue scan on a packet arrival) and, if so, restarting * output for the class. Inspecting the class state & restarting output * require locking the class structure. In general the driver is * responsible for locking but this is the only routine that is not * called directly or indirectly from the interface driver so it has * know about system locking conventions. Under bsd, locking is done * by raising IPL to splimp so that's what's implemented here. On a * different system this would probably need to be changed. * * Returns: NONE */ static void rmc_restart(struct rm_class *cl) { struct rm_ifdat *ifd = cl->ifdat_; int s; #ifdef __NetBSD__ s = splnet(); #else s = splimp(); #endif IFQ_LOCK(ifd->ifq_); if (cl->sleeping_) { cl->sleeping_ = 0; cl->undertime_.tv_sec = 0; if (ifd->queued_ < ifd->maxqueued_ && ifd->restart != NULL) { CBQTRACE(rmc_restart, 'trts', cl->stats_.handle); (ifd->restart)(ifd->ifq_); } } IFQ_UNLOCK(ifd->ifq_); splx(s); } /* * void * rmc_root_overlimit(struct rm_class *cl) - This the generic overlimit * handling routine for the root class of the link sharing structure. * * Returns: NONE */ static void rmc_root_overlimit(struct rm_class *cl, struct rm_class *borrow) { panic("rmc_root_overlimit"); } /* * Packet Queue handling routines. Eventually, this is to localize the * effects on the code whether queues are red queues or droptail * queues. */ static int _rmc_addq(rm_class_t *cl, mbuf_t *m) { #ifdef ALTQ_RIO if (q_is_rio(cl->q_)) return rio_addq((rio_t *)cl->red_, cl->q_, m, cl->pktattr_); #endif #ifdef ALTQ_RED if (q_is_red(cl->q_)) return red_addq(cl->red_, cl->q_, m, cl->pktattr_); #endif /* ALTQ_RED */ if (cl->flags_ & RMCF_CLEARDSCP) write_dsfield(m, cl->pktattr_, 0); _addq(cl->q_, m); return (0); } /* note: _rmc_dropq is not called for red */ static void _rmc_dropq(rm_class_t *cl) { mbuf_t *m; if ((m = _getq(cl->q_)) != NULL) m_freem(m); } static mbuf_t * _rmc_getq(rm_class_t *cl) { #ifdef ALTQ_RIO if (q_is_rio(cl->q_)) return rio_getq((rio_t *)cl->red_, cl->q_); #endif #ifdef ALTQ_RED if (q_is_red(cl->q_)) return red_getq(cl->red_, cl->q_); #endif return _getq(cl->q_); } static mbuf_t * _rmc_pollq(rm_class_t *cl) { return qhead(cl->q_); } #ifdef CBQ_TRACE struct cbqtrace cbqtrace_buffer[NCBQTRACE+1]; struct cbqtrace *cbqtrace_ptr = NULL; int cbqtrace_count; /* * DDB hook to trace cbq events: * the last 1024 events are held in a circular buffer. * use "call cbqtrace_dump(N)" to display 20 events from Nth event. */ void cbqtrace_dump(int); static char *rmc_funcname(void *); static struct rmc_funcs { void *func; char *name; } rmc_funcs[] = { rmc_init, "rmc_init", rmc_queue_packet, "rmc_queue_packet", rmc_under_limit, "rmc_under_limit", rmc_update_class_util, "rmc_update_class_util", rmc_delay_action, "rmc_delay_action", rmc_restart, "rmc_restart", _rmc_wrr_dequeue_next, "_rmc_wrr_dequeue_next", NULL, NULL }; static char *rmc_funcname(void *func) { struct rmc_funcs *fp; for (fp = rmc_funcs; fp->func != NULL; fp++) if (fp->func == func) return (fp->name); return ("unknown"); } void cbqtrace_dump(int counter) { int i, *p; char *cp; counter = counter % NCBQTRACE; p = (int *)&cbqtrace_buffer[counter]; for (i=0; i<20; i++) { printf("[0x%x] ", *p++); printf("%s: ", rmc_funcname((void *)*p++)); cp = (char *)p++; printf("%c%c%c%c: ", cp[0], cp[1], cp[2], cp[3]); printf("%d\n",*p++); if (p >= (int *)&cbqtrace_buffer[NCBQTRACE]) p = (int *)cbqtrace_buffer; } } #endif /* CBQ_TRACE */ #endif /* ALTQ_CBQ */ #if defined(ALTQ_CBQ) || defined(ALTQ_RED) || defined(ALTQ_RIO) || defined(ALTQ_HFSC) || defined(ALTQ_PRIQ) #if !defined(__GNUC__) || defined(ALTQ_DEBUG) void _addq(class_queue_t *q, mbuf_t *m) { mbuf_t *m0; if ((m0 = qtail(q)) != NULL) m->m_nextpkt = m0->m_nextpkt; else m0 = m; m0->m_nextpkt = m; qtail(q) = m; qlen(q)++; } mbuf_t * _getq(class_queue_t *q) { mbuf_t *m, *m0; if ((m = qtail(q)) == NULL) return (NULL); if ((m0 = m->m_nextpkt) != m) m->m_nextpkt = m0->m_nextpkt; else { ASSERT(qlen(q) == 1); qtail(q) = NULL; } qlen(q)--; m0->m_nextpkt = NULL; return (m0); } /* drop a packet at the tail of the queue */ mbuf_t * _getq_tail(class_queue_t *q) { mbuf_t *m, *m0, *prev; if ((m = m0 = qtail(q)) == NULL) return NULL; do { prev = m0; m0 = m0->m_nextpkt; } while (m0 != m); prev->m_nextpkt = m->m_nextpkt; if (prev == m) { ASSERT(qlen(q) == 1); qtail(q) = NULL; } else qtail(q) = prev; qlen(q)--; m->m_nextpkt = NULL; return (m); } /* randomly select a packet in the queue */ mbuf_t * _getq_random(class_queue_t *q) { struct mbuf *m; int i, n; if ((m = qtail(q)) == NULL) return NULL; if (m->m_nextpkt == m) { ASSERT(qlen(q) == 1); qtail(q) = NULL; } else { struct mbuf *prev = NULL; n = arc4random() % qlen(q) + 1; for (i = 0; i < n; i++) { prev = m; m = m->m_nextpkt; } prev->m_nextpkt = m->m_nextpkt; if (m == qtail(q)) qtail(q) = prev; } qlen(q)--; m->m_nextpkt = NULL; return (m); } void _removeq(class_queue_t *q, mbuf_t *m) { mbuf_t *m0, *prev; m0 = qtail(q); do { prev = m0; m0 = m0->m_nextpkt; } while (m0 != m); prev->m_nextpkt = m->m_nextpkt; if (prev == m) qtail(q) = NULL; else if (qtail(q) == m) qtail(q) = prev; qlen(q)--; } void _flushq(class_queue_t *q) { mbuf_t *m; while ((m = _getq(q)) != NULL) m_freem(m); ASSERT(qlen(q) == 0); } #endif /* !__GNUC__ || ALTQ_DEBUG */ #endif /* ALTQ_CBQ || ALTQ_RED || ALTQ_RIO || ALTQ_HFSC || ALTQ_PRIQ */ Index: stable/10/sys/contrib/altq/altq/altq_subr.c =================================================================== --- stable/10/sys/contrib/altq/altq/altq_subr.c (revision 263085) +++ stable/10/sys/contrib/altq/altq/altq_subr.c (revision 263086) @@ -1,1979 +1,1981 @@ /* $FreeBSD$ */ /* $KAME: altq_subr.c,v 1.21 2003/11/06 06:32:53 kjc Exp $ */ /* * Copyright (C) 1997-2003 * Sony Computer Science Laboratories Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #if defined(__FreeBSD__) || defined(__NetBSD__) #include "opt_altq.h" #include "opt_inet.h" #ifdef __FreeBSD__ #include "opt_inet6.h" #endif #endif /* __FreeBSD__ || __NetBSD__ */ #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #ifdef __FreeBSD__ #include #endif #include #include #include #ifdef INET6 #include #endif #include #include -#include +#include +#include #include #ifdef ALTQ3_COMPAT #include #endif /* machine dependent clock related includes */ #ifdef __FreeBSD__ #include #include #include #include #endif #if defined(__amd64__) || defined(__i386__) #include /* for pentium tsc */ #include /* for CPUID_TSC */ #ifdef __FreeBSD__ #include /* for cpu_feature */ #elif defined(__NetBSD__) || defined(__OpenBSD__) #include /* for cpu_feature */ #endif #endif /* __amd64 || __i386__ */ /* * internal function prototypes */ static void tbr_timeout(void *); int (*altq_input)(struct mbuf *, int) = NULL; static struct mbuf *tbr_dequeue(struct ifaltq *, int); static int tbr_timer = 0; /* token bucket regulator timer */ #if !defined(__FreeBSD__) || (__FreeBSD_version < 600000) static struct callout tbr_callout = CALLOUT_INITIALIZER; #else static struct callout tbr_callout; #endif #ifdef ALTQ3_CLFIER_COMPAT static int extract_ports4(struct mbuf *, struct ip *, struct flowinfo_in *); #ifdef INET6 static int extract_ports6(struct mbuf *, struct ip6_hdr *, struct flowinfo_in6 *); #endif static int apply_filter4(u_int32_t, struct flow_filter *, struct flowinfo_in *); static int apply_ppfilter4(u_int32_t, struct flow_filter *, struct flowinfo_in *); #ifdef INET6 static int apply_filter6(u_int32_t, struct flow_filter6 *, struct flowinfo_in6 *); #endif static int apply_tosfilter4(u_int32_t, struct flow_filter *, struct flowinfo_in *); static u_long get_filt_handle(struct acc_classifier *, int); static struct acc_filter *filth_to_filtp(struct acc_classifier *, u_long); static u_int32_t filt2fibmask(struct flow_filter *); static void ip4f_cache(struct ip *, struct flowinfo_in *); static int ip4f_lookup(struct ip *, struct flowinfo_in *); static int ip4f_init(void); static struct ip4_frag *ip4f_alloc(void); static void ip4f_free(struct ip4_frag *); #endif /* ALTQ3_CLFIER_COMPAT */ /* * alternate queueing support routines */ /* look up the queue state by the interface name and the queueing type. */ void * altq_lookup(name, type) char *name; int type; { struct ifnet *ifp; if ((ifp = ifunit(name)) != NULL) { /* read if_snd unlocked */ if (type != ALTQT_NONE && ifp->if_snd.altq_type == type) return (ifp->if_snd.altq_disc); } return NULL; } int altq_attach(ifq, type, discipline, enqueue, dequeue, request, clfier, classify) struct ifaltq *ifq; int type; void *discipline; int (*enqueue)(struct ifaltq *, struct mbuf *, struct altq_pktattr *); struct mbuf *(*dequeue)(struct ifaltq *, int); int (*request)(struct ifaltq *, int, void *); void *clfier; void *(*classify)(void *, struct mbuf *, int); { IFQ_LOCK(ifq); if (!ALTQ_IS_READY(ifq)) { IFQ_UNLOCK(ifq); return ENXIO; } #ifdef ALTQ3_COMPAT /* * pfaltq can override the existing discipline, but altq3 cannot. * check these if clfier is not NULL (which implies altq3). */ if (clfier != NULL) { if (ALTQ_IS_ENABLED(ifq)) { IFQ_UNLOCK(ifq); return EBUSY; } if (ALTQ_IS_ATTACHED(ifq)) { IFQ_UNLOCK(ifq); return EEXIST; } } #endif ifq->altq_type = type; ifq->altq_disc = discipline; ifq->altq_enqueue = enqueue; ifq->altq_dequeue = dequeue; ifq->altq_request = request; ifq->altq_clfier = clfier; ifq->altq_classify = classify; ifq->altq_flags &= (ALTQF_CANTCHANGE|ALTQF_ENABLED); #ifdef ALTQ3_COMPAT #ifdef ALTQ_KLD altq_module_incref(type); #endif #endif IFQ_UNLOCK(ifq); return 0; } int altq_detach(ifq) struct ifaltq *ifq; { IFQ_LOCK(ifq); if (!ALTQ_IS_READY(ifq)) { IFQ_UNLOCK(ifq); return ENXIO; } if (ALTQ_IS_ENABLED(ifq)) { IFQ_UNLOCK(ifq); return EBUSY; } if (!ALTQ_IS_ATTACHED(ifq)) { IFQ_UNLOCK(ifq); return (0); } #ifdef ALTQ3_COMPAT #ifdef ALTQ_KLD altq_module_declref(ifq->altq_type); #endif #endif ifq->altq_type = ALTQT_NONE; ifq->altq_disc = NULL; ifq->altq_enqueue = NULL; ifq->altq_dequeue = NULL; ifq->altq_request = NULL; ifq->altq_clfier = NULL; ifq->altq_classify = NULL; ifq->altq_flags &= ALTQF_CANTCHANGE; IFQ_UNLOCK(ifq); return 0; } int altq_enable(ifq) struct ifaltq *ifq; { int s; IFQ_LOCK(ifq); if (!ALTQ_IS_READY(ifq)) { IFQ_UNLOCK(ifq); return ENXIO; } if (ALTQ_IS_ENABLED(ifq)) { IFQ_UNLOCK(ifq); return 0; } #ifdef __NetBSD__ s = splnet(); #else s = splimp(); #endif IFQ_PURGE_NOLOCK(ifq); ASSERT(ifq->ifq_len == 0); ifq->ifq_drv_maxlen = 0; /* disable bulk dequeue */ ifq->altq_flags |= ALTQF_ENABLED; if (ifq->altq_clfier != NULL) ifq->altq_flags |= ALTQF_CLASSIFY; splx(s); IFQ_UNLOCK(ifq); return 0; } int altq_disable(ifq) struct ifaltq *ifq; { int s; IFQ_LOCK(ifq); if (!ALTQ_IS_ENABLED(ifq)) { IFQ_UNLOCK(ifq); return 0; } #ifdef __NetBSD__ s = splnet(); #else s = splimp(); #endif IFQ_PURGE_NOLOCK(ifq); ASSERT(ifq->ifq_len == 0); ifq->altq_flags &= ~(ALTQF_ENABLED|ALTQF_CLASSIFY); splx(s); IFQ_UNLOCK(ifq); return 0; } #ifdef ALTQ_DEBUG void altq_assert(file, line, failedexpr) const char *file, *failedexpr; int line; { (void)printf("altq assertion \"%s\" failed: file \"%s\", line %d\n", failedexpr, file, line); panic("altq assertion"); /* NOTREACHED */ } #endif /* * internal representation of token bucket parameters * rate: byte_per_unittime << 32 * (((bits_per_sec) / 8) << 32) / machclk_freq * depth: byte << 32 * */ #define TBR_SHIFT 32 #define TBR_SCALE(x) ((int64_t)(x) << TBR_SHIFT) #define TBR_UNSCALE(x) ((x) >> TBR_SHIFT) static struct mbuf * tbr_dequeue(ifq, op) struct ifaltq *ifq; int op; { struct tb_regulator *tbr; struct mbuf *m; int64_t interval; u_int64_t now; IFQ_LOCK_ASSERT(ifq); tbr = ifq->altq_tbr; if (op == ALTDQ_REMOVE && tbr->tbr_lastop == ALTDQ_POLL) { /* if this is a remove after poll, bypass tbr check */ } else { /* update token only when it is negative */ if (tbr->tbr_token <= 0) { now = read_machclk(); interval = now - tbr->tbr_last; if (interval >= tbr->tbr_filluptime) tbr->tbr_token = tbr->tbr_depth; else { tbr->tbr_token += interval * tbr->tbr_rate; if (tbr->tbr_token > tbr->tbr_depth) tbr->tbr_token = tbr->tbr_depth; } tbr->tbr_last = now; } /* if token is still negative, don't allow dequeue */ if (tbr->tbr_token <= 0) return (NULL); } if (ALTQ_IS_ENABLED(ifq)) m = (*ifq->altq_dequeue)(ifq, op); else { if (op == ALTDQ_POLL) _IF_POLL(ifq, m); else _IF_DEQUEUE(ifq, m); } if (m != NULL && op == ALTDQ_REMOVE) tbr->tbr_token -= TBR_SCALE(m_pktlen(m)); tbr->tbr_lastop = op; return (m); } /* * set a token bucket regulator. * if the specified rate is zero, the token bucket regulator is deleted. */ int tbr_set(ifq, profile) struct ifaltq *ifq; struct tb_profile *profile; { struct tb_regulator *tbr, *otbr; if (tbr_dequeue_ptr == NULL) tbr_dequeue_ptr = tbr_dequeue; if (machclk_freq == 0) init_machclk(); if (machclk_freq == 0) { printf("tbr_set: no cpu clock available!\n"); return (ENXIO); } IFQ_LOCK(ifq); if (profile->rate == 0) { /* delete this tbr */ if ((tbr = ifq->altq_tbr) == NULL) { IFQ_UNLOCK(ifq); return (ENOENT); } ifq->altq_tbr = NULL; free(tbr, M_DEVBUF); IFQ_UNLOCK(ifq); return (0); } tbr = malloc(sizeof(struct tb_regulator), M_DEVBUF, M_NOWAIT | M_ZERO); if (tbr == NULL) { IFQ_UNLOCK(ifq); return (ENOMEM); } tbr->tbr_rate = TBR_SCALE(profile->rate / 8) / machclk_freq; tbr->tbr_depth = TBR_SCALE(profile->depth); if (tbr->tbr_rate > 0) tbr->tbr_filluptime = tbr->tbr_depth / tbr->tbr_rate; else tbr->tbr_filluptime = 0xffffffffffffffffLL; tbr->tbr_token = tbr->tbr_depth; tbr->tbr_last = read_machclk(); tbr->tbr_lastop = ALTDQ_REMOVE; otbr = ifq->altq_tbr; ifq->altq_tbr = tbr; /* set the new tbr */ if (otbr != NULL) free(otbr, M_DEVBUF); else { if (tbr_timer == 0) { CALLOUT_RESET(&tbr_callout, 1, tbr_timeout, (void *)0); tbr_timer = 1; } } IFQ_UNLOCK(ifq); return (0); } /* * tbr_timeout goes through the interface list, and kicks the drivers * if necessary. * * MPSAFE */ static void tbr_timeout(arg) void *arg; { #ifdef __FreeBSD__ VNET_ITERATOR_DECL(vnet_iter); #endif struct ifnet *ifp; int active, s; active = 0; #ifdef __NetBSD__ s = splnet(); #else s = splimp(); #endif #ifdef __FreeBSD__ IFNET_RLOCK_NOSLEEP(); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); #endif for (ifp = TAILQ_FIRST(&V_ifnet); ifp; ifp = TAILQ_NEXT(ifp, if_list)) { /* read from if_snd unlocked */ if (!TBR_IS_ENABLED(&ifp->if_snd)) continue; active++; if (!IFQ_IS_EMPTY(&ifp->if_snd) && ifp->if_start != NULL) (*ifp->if_start)(ifp); } #ifdef __FreeBSD__ CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); IFNET_RUNLOCK_NOSLEEP(); #endif splx(s); if (active > 0) CALLOUT_RESET(&tbr_callout, 1, tbr_timeout, (void *)0); else tbr_timer = 0; /* don't need tbr_timer anymore */ } /* * get token bucket regulator profile */ int tbr_get(ifq, profile) struct ifaltq *ifq; struct tb_profile *profile; { struct tb_regulator *tbr; IFQ_LOCK(ifq); if ((tbr = ifq->altq_tbr) == NULL) { profile->rate = 0; profile->depth = 0; } else { profile->rate = (u_int)TBR_UNSCALE(tbr->tbr_rate * 8 * machclk_freq); profile->depth = (u_int)TBR_UNSCALE(tbr->tbr_depth); } IFQ_UNLOCK(ifq); return (0); } /* * attach a discipline to the interface. if one already exists, it is * overridden. * Locking is done in the discipline specific attach functions. Basically * they call back to altq_attach which takes care of the attach and locking. */ int altq_pfattach(struct pf_altq *a) { int error = 0; switch (a->scheduler) { case ALTQT_NONE: break; #ifdef ALTQ_CBQ case ALTQT_CBQ: error = cbq_pfattach(a); break; #endif #ifdef ALTQ_PRIQ case ALTQT_PRIQ: error = priq_pfattach(a); break; #endif #ifdef ALTQ_HFSC case ALTQT_HFSC: error = hfsc_pfattach(a); break; #endif default: error = ENXIO; } return (error); } /* * detach a discipline from the interface. * it is possible that the discipline was already overridden by another * discipline. */ int altq_pfdetach(struct pf_altq *a) { struct ifnet *ifp; int s, error = 0; if ((ifp = ifunit(a->ifname)) == NULL) return (EINVAL); /* if this discipline is no longer referenced, just return */ /* read unlocked from if_snd */ if (a->altq_disc == NULL || a->altq_disc != ifp->if_snd.altq_disc) return (0); #ifdef __NetBSD__ s = splnet(); #else s = splimp(); #endif /* read unlocked from if_snd, _disable and _detach take care */ if (ALTQ_IS_ENABLED(&ifp->if_snd)) error = altq_disable(&ifp->if_snd); if (error == 0) error = altq_detach(&ifp->if_snd); splx(s); return (error); } /* * add a discipline or a queue * Locking is done in the discipline specific functions with regards to * malloc with WAITOK, also it is not yet clear which lock to use. */ int altq_add(struct pf_altq *a) { int error = 0; if (a->qname[0] != 0) return (altq_add_queue(a)); if (machclk_freq == 0) init_machclk(); if (machclk_freq == 0) panic("altq_add: no cpu clock"); switch (a->scheduler) { #ifdef ALTQ_CBQ case ALTQT_CBQ: error = cbq_add_altq(a); break; #endif #ifdef ALTQ_PRIQ case ALTQT_PRIQ: error = priq_add_altq(a); break; #endif #ifdef ALTQ_HFSC case ALTQT_HFSC: error = hfsc_add_altq(a); break; #endif default: error = ENXIO; } return (error); } /* * remove a discipline or a queue * It is yet unclear what lock to use to protect this operation, the * discipline specific functions will determine and grab it */ int altq_remove(struct pf_altq *a) { int error = 0; if (a->qname[0] != 0) return (altq_remove_queue(a)); switch (a->scheduler) { #ifdef ALTQ_CBQ case ALTQT_CBQ: error = cbq_remove_altq(a); break; #endif #ifdef ALTQ_PRIQ case ALTQT_PRIQ: error = priq_remove_altq(a); break; #endif #ifdef ALTQ_HFSC case ALTQT_HFSC: error = hfsc_remove_altq(a); break; #endif default: error = ENXIO; } return (error); } /* * add a queue to the discipline * It is yet unclear what lock to use to protect this operation, the * discipline specific functions will determine and grab it */ int altq_add_queue(struct pf_altq *a) { int error = 0; switch (a->scheduler) { #ifdef ALTQ_CBQ case ALTQT_CBQ: error = cbq_add_queue(a); break; #endif #ifdef ALTQ_PRIQ case ALTQT_PRIQ: error = priq_add_queue(a); break; #endif #ifdef ALTQ_HFSC case ALTQT_HFSC: error = hfsc_add_queue(a); break; #endif default: error = ENXIO; } return (error); } /* * remove a queue from the discipline * It is yet unclear what lock to use to protect this operation, the * discipline specific functions will determine and grab it */ int altq_remove_queue(struct pf_altq *a) { int error = 0; switch (a->scheduler) { #ifdef ALTQ_CBQ case ALTQT_CBQ: error = cbq_remove_queue(a); break; #endif #ifdef ALTQ_PRIQ case ALTQT_PRIQ: error = priq_remove_queue(a); break; #endif #ifdef ALTQ_HFSC case ALTQT_HFSC: error = hfsc_remove_queue(a); break; #endif default: error = ENXIO; } return (error); } /* * get queue statistics * Locking is done in the discipline specific functions with regards to * copyout operations, also it is not yet clear which lock to use. */ int altq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes) { int error = 0; switch (a->scheduler) { #ifdef ALTQ_CBQ case ALTQT_CBQ: error = cbq_getqstats(a, ubuf, nbytes); break; #endif #ifdef ALTQ_PRIQ case ALTQT_PRIQ: error = priq_getqstats(a, ubuf, nbytes); break; #endif #ifdef ALTQ_HFSC case ALTQT_HFSC: error = hfsc_getqstats(a, ubuf, nbytes); break; #endif default: error = ENXIO; } return (error); } /* * read and write diffserv field in IPv4 or IPv6 header */ u_int8_t read_dsfield(m, pktattr) struct mbuf *m; struct altq_pktattr *pktattr; { struct mbuf *m0; u_int8_t ds_field = 0; if (pktattr == NULL || (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6)) return ((u_int8_t)0); /* verify that pattr_hdr is within the mbuf data */ for (m0 = m; m0 != NULL; m0 = m0->m_next) if ((pktattr->pattr_hdr >= m0->m_data) && (pktattr->pattr_hdr < m0->m_data + m0->m_len)) break; if (m0 == NULL) { /* ick, pattr_hdr is stale */ pktattr->pattr_af = AF_UNSPEC; #ifdef ALTQ_DEBUG printf("read_dsfield: can't locate header!\n"); #endif return ((u_int8_t)0); } if (pktattr->pattr_af == AF_INET) { struct ip *ip = (struct ip *)pktattr->pattr_hdr; if (ip->ip_v != 4) return ((u_int8_t)0); /* version mismatch! */ ds_field = ip->ip_tos; } #ifdef INET6 else if (pktattr->pattr_af == AF_INET6) { struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr; u_int32_t flowlabel; flowlabel = ntohl(ip6->ip6_flow); if ((flowlabel >> 28) != 6) return ((u_int8_t)0); /* version mismatch! */ ds_field = (flowlabel >> 20) & 0xff; } #endif return (ds_field); } void write_dsfield(struct mbuf *m, struct altq_pktattr *pktattr, u_int8_t dsfield) { struct mbuf *m0; if (pktattr == NULL || (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6)) return; /* verify that pattr_hdr is within the mbuf data */ for (m0 = m; m0 != NULL; m0 = m0->m_next) if ((pktattr->pattr_hdr >= m0->m_data) && (pktattr->pattr_hdr < m0->m_data + m0->m_len)) break; if (m0 == NULL) { /* ick, pattr_hdr is stale */ pktattr->pattr_af = AF_UNSPEC; #ifdef ALTQ_DEBUG printf("write_dsfield: can't locate header!\n"); #endif return; } if (pktattr->pattr_af == AF_INET) { struct ip *ip = (struct ip *)pktattr->pattr_hdr; u_int8_t old; int32_t sum; if (ip->ip_v != 4) return; /* version mismatch! */ old = ip->ip_tos; dsfield |= old & 3; /* leave CU bits */ if (old == dsfield) return; ip->ip_tos = dsfield; /* * update checksum (from RFC1624) * HC' = ~(~HC + ~m + m') */ sum = ~ntohs(ip->ip_sum) & 0xffff; sum += 0xff00 + (~old & 0xff) + dsfield; sum = (sum >> 16) + (sum & 0xffff); sum += (sum >> 16); /* add carry */ ip->ip_sum = htons(~sum & 0xffff); } #ifdef INET6 else if (pktattr->pattr_af == AF_INET6) { struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr; u_int32_t flowlabel; flowlabel = ntohl(ip6->ip6_flow); if ((flowlabel >> 28) != 6) return; /* version mismatch! */ flowlabel = (flowlabel & 0xf03fffff) | (dsfield << 20); ip6->ip6_flow = htonl(flowlabel); } #endif return; } /* * high resolution clock support taking advantage of a machine dependent * high resolution time counter (e.g., timestamp counter of intel pentium). * we assume * - 64-bit-long monotonically-increasing counter * - frequency range is 100M-4GHz (CPU speed) */ /* if pcc is not available or disabled, emulate 256MHz using microtime() */ #define MACHCLK_SHIFT 8 int machclk_usepcc; u_int32_t machclk_freq; u_int32_t machclk_per_tick; #if defined(__i386__) && defined(__NetBSD__) extern u_int64_t cpu_tsc_freq; #endif #if (__FreeBSD_version >= 700035) /* Update TSC freq with the value indicated by the caller. */ static void tsc_freq_changed(void *arg, const struct cf_level *level, int status) { /* If there was an error during the transition, don't do anything. */ if (status != 0) return; #if (__FreeBSD_version >= 701102) && (defined(__amd64__) || defined(__i386__)) /* If TSC is P-state invariant, don't do anything. */ if (tsc_is_invariant) return; #endif /* Total setting for this level gives the new frequency in MHz. */ init_machclk(); } EVENTHANDLER_DEFINE(cpufreq_post_change, tsc_freq_changed, NULL, EVENTHANDLER_PRI_LAST); #endif /* __FreeBSD_version >= 700035 */ static void init_machclk_setup(void) { #if (__FreeBSD_version >= 600000) callout_init(&tbr_callout, 0); #endif machclk_usepcc = 1; #if (!defined(__amd64__) && !defined(__i386__)) || defined(ALTQ_NOPCC) machclk_usepcc = 0; #endif #if defined(__FreeBSD__) && defined(SMP) machclk_usepcc = 0; #endif #if defined(__NetBSD__) && defined(MULTIPROCESSOR) machclk_usepcc = 0; #endif #if defined(__amd64__) || defined(__i386__) /* check if TSC is available */ #ifdef __FreeBSD__ if ((cpu_feature & CPUID_TSC) == 0 || atomic_load_acq_64(&tsc_freq) == 0) #else if ((cpu_feature & CPUID_TSC) == 0) #endif machclk_usepcc = 0; #endif } void init_machclk(void) { static int called; /* Call one-time initialization function. */ if (!called) { init_machclk_setup(); called = 1; } if (machclk_usepcc == 0) { /* emulate 256MHz using microtime() */ machclk_freq = 1000000 << MACHCLK_SHIFT; machclk_per_tick = machclk_freq / hz; #ifdef ALTQ_DEBUG printf("altq: emulate %uHz cpu clock\n", machclk_freq); #endif return; } /* * if the clock frequency (of Pentium TSC or Alpha PCC) is * accessible, just use it. */ #if defined(__amd64__) || defined(__i386__) #ifdef __FreeBSD__ machclk_freq = atomic_load_acq_64(&tsc_freq); #elif defined(__NetBSD__) machclk_freq = (u_int32_t)cpu_tsc_freq; #elif defined(__OpenBSD__) && (defined(I586_CPU) || defined(I686_CPU)) machclk_freq = pentium_mhz * 1000000; #endif #endif /* * if we don't know the clock frequency, measure it. */ if (machclk_freq == 0) { static int wait; struct timeval tv_start, tv_end; u_int64_t start, end, diff; int timo; microtime(&tv_start); start = read_machclk(); timo = hz; /* 1 sec */ (void)tsleep(&wait, PWAIT | PCATCH, "init_machclk", timo); microtime(&tv_end); end = read_machclk(); diff = (u_int64_t)(tv_end.tv_sec - tv_start.tv_sec) * 1000000 + tv_end.tv_usec - tv_start.tv_usec; if (diff != 0) machclk_freq = (u_int)((end - start) * 1000000 / diff); } machclk_per_tick = machclk_freq / hz; #ifdef ALTQ_DEBUG printf("altq: CPU clock: %uHz\n", machclk_freq); #endif } #if defined(__OpenBSD__) && defined(__i386__) static __inline u_int64_t rdtsc(void) { u_int64_t rv; __asm __volatile(".byte 0x0f, 0x31" : "=A" (rv)); return (rv); } #endif /* __OpenBSD__ && __i386__ */ u_int64_t read_machclk(void) { u_int64_t val; if (machclk_usepcc) { #if defined(__amd64__) || defined(__i386__) val = rdtsc(); #else panic("read_machclk"); #endif } else { struct timeval tv; microtime(&tv); val = (((u_int64_t)(tv.tv_sec - boottime.tv_sec) * 1000000 + tv.tv_usec) << MACHCLK_SHIFT); } return (val); } #ifdef ALTQ3_CLFIER_COMPAT #ifndef IPPROTO_ESP #define IPPROTO_ESP 50 /* encapsulating security payload */ #endif #ifndef IPPROTO_AH #define IPPROTO_AH 51 /* authentication header */ #endif /* * extract flow information from a given packet. * filt_mask shows flowinfo fields required. * we assume the ip header is in one mbuf, and addresses and ports are * in network byte order. */ int altq_extractflow(m, af, flow, filt_bmask) struct mbuf *m; int af; struct flowinfo *flow; u_int32_t filt_bmask; { switch (af) { case PF_INET: { struct flowinfo_in *fin; struct ip *ip; ip = mtod(m, struct ip *); if (ip->ip_v != 4) break; fin = (struct flowinfo_in *)flow; fin->fi_len = sizeof(struct flowinfo_in); fin->fi_family = AF_INET; fin->fi_proto = ip->ip_p; fin->fi_tos = ip->ip_tos; fin->fi_src.s_addr = ip->ip_src.s_addr; fin->fi_dst.s_addr = ip->ip_dst.s_addr; if (filt_bmask & FIMB4_PORTS) /* if port info is required, extract port numbers */ extract_ports4(m, ip, fin); else { fin->fi_sport = 0; fin->fi_dport = 0; fin->fi_gpi = 0; } return (1); } #ifdef INET6 case PF_INET6: { struct flowinfo_in6 *fin6; struct ip6_hdr *ip6; ip6 = mtod(m, struct ip6_hdr *); /* should we check the ip version? */ fin6 = (struct flowinfo_in6 *)flow; fin6->fi6_len = sizeof(struct flowinfo_in6); fin6->fi6_family = AF_INET6; fin6->fi6_proto = ip6->ip6_nxt; fin6->fi6_tclass = (ntohl(ip6->ip6_flow) >> 20) & 0xff; fin6->fi6_flowlabel = ip6->ip6_flow & htonl(0x000fffff); fin6->fi6_src = ip6->ip6_src; fin6->fi6_dst = ip6->ip6_dst; if ((filt_bmask & FIMB6_PORTS) || ((filt_bmask & FIMB6_PROTO) && ip6->ip6_nxt > IPPROTO_IPV6)) /* * if port info is required, or proto is required * but there are option headers, extract port * and protocol numbers. */ extract_ports6(m, ip6, fin6); else { fin6->fi6_sport = 0; fin6->fi6_dport = 0; fin6->fi6_gpi = 0; } return (1); } #endif /* INET6 */ default: break; } /* failed */ flow->fi_len = sizeof(struct flowinfo); flow->fi_family = AF_UNSPEC; return (0); } /* * helper routine to extract port numbers */ /* structure for ipsec and ipv6 option header template */ struct _opt6 { u_int8_t opt6_nxt; /* next header */ u_int8_t opt6_hlen; /* header extension length */ u_int16_t _pad; u_int32_t ah_spi; /* security parameter index for authentication header */ }; /* * extract port numbers from a ipv4 packet. */ static int extract_ports4(m, ip, fin) struct mbuf *m; struct ip *ip; struct flowinfo_in *fin; { struct mbuf *m0; u_short ip_off; u_int8_t proto; int off; fin->fi_sport = 0; fin->fi_dport = 0; fin->fi_gpi = 0; ip_off = ntohs(ip->ip_off); /* if it is a fragment, try cached fragment info */ if (ip_off & IP_OFFMASK) { ip4f_lookup(ip, fin); return (1); } /* locate the mbuf containing the protocol header */ for (m0 = m; m0 != NULL; m0 = m0->m_next) if (((caddr_t)ip >= m0->m_data) && ((caddr_t)ip < m0->m_data + m0->m_len)) break; if (m0 == NULL) { #ifdef ALTQ_DEBUG printf("extract_ports4: can't locate header! ip=%p\n", ip); #endif return (0); } off = ((caddr_t)ip - m0->m_data) + (ip->ip_hl << 2); proto = ip->ip_p; #ifdef ALTQ_IPSEC again: #endif while (off >= m0->m_len) { off -= m0->m_len; m0 = m0->m_next; if (m0 == NULL) return (0); /* bogus ip_hl! */ } if (m0->m_len < off + 4) return (0); switch (proto) { case IPPROTO_TCP: case IPPROTO_UDP: { struct udphdr *udp; udp = (struct udphdr *)(mtod(m0, caddr_t) + off); fin->fi_sport = udp->uh_sport; fin->fi_dport = udp->uh_dport; fin->fi_proto = proto; } break; #ifdef ALTQ_IPSEC case IPPROTO_ESP: if (fin->fi_gpi == 0){ u_int32_t *gpi; gpi = (u_int32_t *)(mtod(m0, caddr_t) + off); fin->fi_gpi = *gpi; } fin->fi_proto = proto; break; case IPPROTO_AH: { /* get next header and header length */ struct _opt6 *opt6; opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off); proto = opt6->opt6_nxt; off += 8 + (opt6->opt6_hlen * 4); if (fin->fi_gpi == 0 && m0->m_len >= off + 8) fin->fi_gpi = opt6->ah_spi; } /* goto the next header */ goto again; #endif /* ALTQ_IPSEC */ default: fin->fi_proto = proto; return (0); } /* if this is a first fragment, cache it. */ if (ip_off & IP_MF) ip4f_cache(ip, fin); return (1); } #ifdef INET6 static int extract_ports6(m, ip6, fin6) struct mbuf *m; struct ip6_hdr *ip6; struct flowinfo_in6 *fin6; { struct mbuf *m0; int off; u_int8_t proto; fin6->fi6_gpi = 0; fin6->fi6_sport = 0; fin6->fi6_dport = 0; /* locate the mbuf containing the protocol header */ for (m0 = m; m0 != NULL; m0 = m0->m_next) if (((caddr_t)ip6 >= m0->m_data) && ((caddr_t)ip6 < m0->m_data + m0->m_len)) break; if (m0 == NULL) { #ifdef ALTQ_DEBUG printf("extract_ports6: can't locate header! ip6=%p\n", ip6); #endif return (0); } off = ((caddr_t)ip6 - m0->m_data) + sizeof(struct ip6_hdr); proto = ip6->ip6_nxt; do { while (off >= m0->m_len) { off -= m0->m_len; m0 = m0->m_next; if (m0 == NULL) return (0); } if (m0->m_len < off + 4) return (0); switch (proto) { case IPPROTO_TCP: case IPPROTO_UDP: { struct udphdr *udp; udp = (struct udphdr *)(mtod(m0, caddr_t) + off); fin6->fi6_sport = udp->uh_sport; fin6->fi6_dport = udp->uh_dport; fin6->fi6_proto = proto; } return (1); case IPPROTO_ESP: if (fin6->fi6_gpi == 0) { u_int32_t *gpi; gpi = (u_int32_t *)(mtod(m0, caddr_t) + off); fin6->fi6_gpi = *gpi; } fin6->fi6_proto = proto; return (1); case IPPROTO_AH: { /* get next header and header length */ struct _opt6 *opt6; opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off); if (fin6->fi6_gpi == 0 && m0->m_len >= off + 8) fin6->fi6_gpi = opt6->ah_spi; proto = opt6->opt6_nxt; off += 8 + (opt6->opt6_hlen * 4); /* goto the next header */ break; } case IPPROTO_HOPOPTS: case IPPROTO_ROUTING: case IPPROTO_DSTOPTS: { /* get next header and header length */ struct _opt6 *opt6; opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off); proto = opt6->opt6_nxt; off += (opt6->opt6_hlen + 1) * 8; /* goto the next header */ break; } case IPPROTO_FRAGMENT: /* ipv6 fragmentations are not supported yet */ default: fin6->fi6_proto = proto; return (0); } } while (1); /*NOTREACHED*/ } #endif /* INET6 */ /* * altq common classifier */ int acc_add_filter(classifier, filter, class, phandle) struct acc_classifier *classifier; struct flow_filter *filter; void *class; u_long *phandle; { struct acc_filter *afp, *prev, *tmp; int i, s; #ifdef INET6 if (filter->ff_flow.fi_family != AF_INET && filter->ff_flow.fi_family != AF_INET6) return (EINVAL); #else if (filter->ff_flow.fi_family != AF_INET) return (EINVAL); #endif afp = malloc(sizeof(struct acc_filter), M_DEVBUF, M_WAITOK); if (afp == NULL) return (ENOMEM); bzero(afp, sizeof(struct acc_filter)); afp->f_filter = *filter; afp->f_class = class; i = ACC_WILDCARD_INDEX; if (filter->ff_flow.fi_family == AF_INET) { struct flow_filter *filter4 = &afp->f_filter; /* * if address is 0, it's a wildcard. if address mask * isn't set, use full mask. */ if (filter4->ff_flow.fi_dst.s_addr == 0) filter4->ff_mask.mask_dst.s_addr = 0; else if (filter4->ff_mask.mask_dst.s_addr == 0) filter4->ff_mask.mask_dst.s_addr = 0xffffffff; if (filter4->ff_flow.fi_src.s_addr == 0) filter4->ff_mask.mask_src.s_addr = 0; else if (filter4->ff_mask.mask_src.s_addr == 0) filter4->ff_mask.mask_src.s_addr = 0xffffffff; /* clear extra bits in addresses */ filter4->ff_flow.fi_dst.s_addr &= filter4->ff_mask.mask_dst.s_addr; filter4->ff_flow.fi_src.s_addr &= filter4->ff_mask.mask_src.s_addr; /* * if dst address is a wildcard, use hash-entry * ACC_WILDCARD_INDEX. */ if (filter4->ff_mask.mask_dst.s_addr != 0xffffffff) i = ACC_WILDCARD_INDEX; else i = ACC_GET_HASH_INDEX(filter4->ff_flow.fi_dst.s_addr); } #ifdef INET6 else if (filter->ff_flow.fi_family == AF_INET6) { struct flow_filter6 *filter6 = (struct flow_filter6 *)&afp->f_filter; #ifndef IN6MASK0 /* taken from kame ipv6 */ #define IN6MASK0 {{{ 0, 0, 0, 0 }}} #define IN6MASK128 {{{ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }}} const struct in6_addr in6mask0 = IN6MASK0; const struct in6_addr in6mask128 = IN6MASK128; #endif if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_flow6.fi6_dst)) filter6->ff_mask6.mask6_dst = in6mask0; else if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_mask6.mask6_dst)) filter6->ff_mask6.mask6_dst = in6mask128; if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_flow6.fi6_src)) filter6->ff_mask6.mask6_src = in6mask0; else if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_mask6.mask6_src)) filter6->ff_mask6.mask6_src = in6mask128; /* clear extra bits in addresses */ for (i = 0; i < 16; i++) filter6->ff_flow6.fi6_dst.s6_addr[i] &= filter6->ff_mask6.mask6_dst.s6_addr[i]; for (i = 0; i < 16; i++) filter6->ff_flow6.fi6_src.s6_addr[i] &= filter6->ff_mask6.mask6_src.s6_addr[i]; if (filter6->ff_flow6.fi6_flowlabel == 0) i = ACC_WILDCARD_INDEX; else i = ACC_GET_HASH_INDEX(filter6->ff_flow6.fi6_flowlabel); } #endif /* INET6 */ afp->f_handle = get_filt_handle(classifier, i); /* update filter bitmask */ afp->f_fbmask = filt2fibmask(filter); classifier->acc_fbmask |= afp->f_fbmask; /* * add this filter to the filter list. * filters are ordered from the highest rule number. */ #ifdef __NetBSD__ s = splnet(); #else s = splimp(); #endif prev = NULL; LIST_FOREACH(tmp, &classifier->acc_filters[i], f_chain) { if (tmp->f_filter.ff_ruleno > afp->f_filter.ff_ruleno) prev = tmp; else break; } if (prev == NULL) LIST_INSERT_HEAD(&classifier->acc_filters[i], afp, f_chain); else LIST_INSERT_AFTER(prev, afp, f_chain); splx(s); *phandle = afp->f_handle; return (0); } int acc_delete_filter(classifier, handle) struct acc_classifier *classifier; u_long handle; { struct acc_filter *afp; int s; if ((afp = filth_to_filtp(classifier, handle)) == NULL) return (EINVAL); #ifdef __NetBSD__ s = splnet(); #else s = splimp(); #endif LIST_REMOVE(afp, f_chain); splx(s); free(afp, M_DEVBUF); /* todo: update filt_bmask */ return (0); } /* * delete filters referencing to the specified class. * if the all flag is not 0, delete all the filters. */ int acc_discard_filters(classifier, class, all) struct acc_classifier *classifier; void *class; int all; { struct acc_filter *afp; int i, s; #ifdef __NetBSD__ s = splnet(); #else s = splimp(); #endif for (i = 0; i < ACC_FILTER_TABLESIZE; i++) { do { LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain) if (all || afp->f_class == class) { LIST_REMOVE(afp, f_chain); free(afp, M_DEVBUF); /* start again from the head */ break; } } while (afp != NULL); } splx(s); if (all) classifier->acc_fbmask = 0; return (0); } void * acc_classify(clfier, m, af) void *clfier; struct mbuf *m; int af; { struct acc_classifier *classifier; struct flowinfo flow; struct acc_filter *afp; int i; classifier = (struct acc_classifier *)clfier; altq_extractflow(m, af, &flow, classifier->acc_fbmask); if (flow.fi_family == AF_INET) { struct flowinfo_in *fp = (struct flowinfo_in *)&flow; if ((classifier->acc_fbmask & FIMB4_ALL) == FIMB4_TOS) { /* only tos is used */ LIST_FOREACH(afp, &classifier->acc_filters[ACC_WILDCARD_INDEX], f_chain) if (apply_tosfilter4(afp->f_fbmask, &afp->f_filter, fp)) /* filter matched */ return (afp->f_class); } else if ((classifier->acc_fbmask & (~(FIMB4_PROTO|FIMB4_SPORT|FIMB4_DPORT) & FIMB4_ALL)) == 0) { /* only proto and ports are used */ LIST_FOREACH(afp, &classifier->acc_filters[ACC_WILDCARD_INDEX], f_chain) if (apply_ppfilter4(afp->f_fbmask, &afp->f_filter, fp)) /* filter matched */ return (afp->f_class); } else { /* get the filter hash entry from its dest address */ i = ACC_GET_HASH_INDEX(fp->fi_dst.s_addr); do { /* * go through this loop twice. first for dst * hash, second for wildcards. */ LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain) if (apply_filter4(afp->f_fbmask, &afp->f_filter, fp)) /* filter matched */ return (afp->f_class); /* * check again for filters with a dst addr * wildcard. * (daddr == 0 || dmask != 0xffffffff). */ if (i != ACC_WILDCARD_INDEX) i = ACC_WILDCARD_INDEX; else break; } while (1); } } #ifdef INET6 else if (flow.fi_family == AF_INET6) { struct flowinfo_in6 *fp6 = (struct flowinfo_in6 *)&flow; /* get the filter hash entry from its flow ID */ if (fp6->fi6_flowlabel != 0) i = ACC_GET_HASH_INDEX(fp6->fi6_flowlabel); else /* flowlable can be zero */ i = ACC_WILDCARD_INDEX; /* go through this loop twice. first for flow hash, second for wildcards. */ do { LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain) if (apply_filter6(afp->f_fbmask, (struct flow_filter6 *)&afp->f_filter, fp6)) /* filter matched */ return (afp->f_class); /* * check again for filters with a wildcard. */ if (i != ACC_WILDCARD_INDEX) i = ACC_WILDCARD_INDEX; else break; } while (1); } #endif /* INET6 */ /* no filter matched */ return (NULL); } static int apply_filter4(fbmask, filt, pkt) u_int32_t fbmask; struct flow_filter *filt; struct flowinfo_in *pkt; { if (filt->ff_flow.fi_family != AF_INET) return (0); if ((fbmask & FIMB4_SPORT) && filt->ff_flow.fi_sport != pkt->fi_sport) return (0); if ((fbmask & FIMB4_DPORT) && filt->ff_flow.fi_dport != pkt->fi_dport) return (0); if ((fbmask & FIMB4_DADDR) && filt->ff_flow.fi_dst.s_addr != (pkt->fi_dst.s_addr & filt->ff_mask.mask_dst.s_addr)) return (0); if ((fbmask & FIMB4_SADDR) && filt->ff_flow.fi_src.s_addr != (pkt->fi_src.s_addr & filt->ff_mask.mask_src.s_addr)) return (0); if ((fbmask & FIMB4_PROTO) && filt->ff_flow.fi_proto != pkt->fi_proto) return (0); if ((fbmask & FIMB4_TOS) && filt->ff_flow.fi_tos != (pkt->fi_tos & filt->ff_mask.mask_tos)) return (0); if ((fbmask & FIMB4_GPI) && filt->ff_flow.fi_gpi != (pkt->fi_gpi)) return (0); /* match */ return (1); } /* * filter matching function optimized for a common case that checks * only protocol and port numbers */ static int apply_ppfilter4(fbmask, filt, pkt) u_int32_t fbmask; struct flow_filter *filt; struct flowinfo_in *pkt; { if (filt->ff_flow.fi_family != AF_INET) return (0); if ((fbmask & FIMB4_SPORT) && filt->ff_flow.fi_sport != pkt->fi_sport) return (0); if ((fbmask & FIMB4_DPORT) && filt->ff_flow.fi_dport != pkt->fi_dport) return (0); if ((fbmask & FIMB4_PROTO) && filt->ff_flow.fi_proto != pkt->fi_proto) return (0); /* match */ return (1); } /* * filter matching function only for tos field. */ static int apply_tosfilter4(fbmask, filt, pkt) u_int32_t fbmask; struct flow_filter *filt; struct flowinfo_in *pkt; { if (filt->ff_flow.fi_family != AF_INET) return (0); if ((fbmask & FIMB4_TOS) && filt->ff_flow.fi_tos != (pkt->fi_tos & filt->ff_mask.mask_tos)) return (0); /* match */ return (1); } #ifdef INET6 static int apply_filter6(fbmask, filt, pkt) u_int32_t fbmask; struct flow_filter6 *filt; struct flowinfo_in6 *pkt; { int i; if (filt->ff_flow6.fi6_family != AF_INET6) return (0); if ((fbmask & FIMB6_FLABEL) && filt->ff_flow6.fi6_flowlabel != pkt->fi6_flowlabel) return (0); if ((fbmask & FIMB6_PROTO) && filt->ff_flow6.fi6_proto != pkt->fi6_proto) return (0); if ((fbmask & FIMB6_SPORT) && filt->ff_flow6.fi6_sport != pkt->fi6_sport) return (0); if ((fbmask & FIMB6_DPORT) && filt->ff_flow6.fi6_dport != pkt->fi6_dport) return (0); if (fbmask & FIMB6_SADDR) { for (i = 0; i < 4; i++) if (filt->ff_flow6.fi6_src.s6_addr32[i] != (pkt->fi6_src.s6_addr32[i] & filt->ff_mask6.mask6_src.s6_addr32[i])) return (0); } if (fbmask & FIMB6_DADDR) { for (i = 0; i < 4; i++) if (filt->ff_flow6.fi6_dst.s6_addr32[i] != (pkt->fi6_dst.s6_addr32[i] & filt->ff_mask6.mask6_dst.s6_addr32[i])) return (0); } if ((fbmask & FIMB6_TCLASS) && filt->ff_flow6.fi6_tclass != (pkt->fi6_tclass & filt->ff_mask6.mask6_tclass)) return (0); if ((fbmask & FIMB6_GPI) && filt->ff_flow6.fi6_gpi != pkt->fi6_gpi) return (0); /* match */ return (1); } #endif /* INET6 */ /* * filter handle: * bit 20-28: index to the filter hash table * bit 0-19: unique id in the hash bucket. */ static u_long get_filt_handle(classifier, i) struct acc_classifier *classifier; int i; { static u_long handle_number = 1; u_long handle; struct acc_filter *afp; while (1) { handle = handle_number++ & 0x000fffff; if (LIST_EMPTY(&classifier->acc_filters[i])) break; LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain) if ((afp->f_handle & 0x000fffff) == handle) break; if (afp == NULL) break; /* this handle is already used, try again */ } return ((i << 20) | handle); } /* convert filter handle to filter pointer */ static struct acc_filter * filth_to_filtp(classifier, handle) struct acc_classifier *classifier; u_long handle; { struct acc_filter *afp; int i; i = ACC_GET_HINDEX(handle); LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain) if (afp->f_handle == handle) return (afp); return (NULL); } /* create flowinfo bitmask */ static u_int32_t filt2fibmask(filt) struct flow_filter *filt; { u_int32_t mask = 0; #ifdef INET6 struct flow_filter6 *filt6; #endif switch (filt->ff_flow.fi_family) { case AF_INET: if (filt->ff_flow.fi_proto != 0) mask |= FIMB4_PROTO; if (filt->ff_flow.fi_tos != 0) mask |= FIMB4_TOS; if (filt->ff_flow.fi_dst.s_addr != 0) mask |= FIMB4_DADDR; if (filt->ff_flow.fi_src.s_addr != 0) mask |= FIMB4_SADDR; if (filt->ff_flow.fi_sport != 0) mask |= FIMB4_SPORT; if (filt->ff_flow.fi_dport != 0) mask |= FIMB4_DPORT; if (filt->ff_flow.fi_gpi != 0) mask |= FIMB4_GPI; break; #ifdef INET6 case AF_INET6: filt6 = (struct flow_filter6 *)filt; if (filt6->ff_flow6.fi6_proto != 0) mask |= FIMB6_PROTO; if (filt6->ff_flow6.fi6_tclass != 0) mask |= FIMB6_TCLASS; if (!IN6_IS_ADDR_UNSPECIFIED(&filt6->ff_flow6.fi6_dst)) mask |= FIMB6_DADDR; if (!IN6_IS_ADDR_UNSPECIFIED(&filt6->ff_flow6.fi6_src)) mask |= FIMB6_SADDR; if (filt6->ff_flow6.fi6_sport != 0) mask |= FIMB6_SPORT; if (filt6->ff_flow6.fi6_dport != 0) mask |= FIMB6_DPORT; if (filt6->ff_flow6.fi6_gpi != 0) mask |= FIMB6_GPI; if (filt6->ff_flow6.fi6_flowlabel != 0) mask |= FIMB6_FLABEL; break; #endif /* INET6 */ } return (mask); } /* * helper functions to handle IPv4 fragments. * currently only in-sequence fragments are handled. * - fragment info is cached in a LRU list. * - when a first fragment is found, cache its flow info. * - when a non-first fragment is found, lookup the cache. */ struct ip4_frag { TAILQ_ENTRY(ip4_frag) ip4f_chain; char ip4f_valid; u_short ip4f_id; struct flowinfo_in ip4f_info; }; static TAILQ_HEAD(ip4f_list, ip4_frag) ip4f_list; /* IPv4 fragment cache */ #define IP4F_TABSIZE 16 /* IPv4 fragment cache size */ static void ip4f_cache(ip, fin) struct ip *ip; struct flowinfo_in *fin; { struct ip4_frag *fp; if (TAILQ_EMPTY(&ip4f_list)) { /* first time call, allocate fragment cache entries. */ if (ip4f_init() < 0) /* allocation failed! */ return; } fp = ip4f_alloc(); fp->ip4f_id = ip->ip_id; fp->ip4f_info.fi_proto = ip->ip_p; fp->ip4f_info.fi_src.s_addr = ip->ip_src.s_addr; fp->ip4f_info.fi_dst.s_addr = ip->ip_dst.s_addr; /* save port numbers */ fp->ip4f_info.fi_sport = fin->fi_sport; fp->ip4f_info.fi_dport = fin->fi_dport; fp->ip4f_info.fi_gpi = fin->fi_gpi; } static int ip4f_lookup(ip, fin) struct ip *ip; struct flowinfo_in *fin; { struct ip4_frag *fp; for (fp = TAILQ_FIRST(&ip4f_list); fp != NULL && fp->ip4f_valid; fp = TAILQ_NEXT(fp, ip4f_chain)) if (ip->ip_id == fp->ip4f_id && ip->ip_src.s_addr == fp->ip4f_info.fi_src.s_addr && ip->ip_dst.s_addr == fp->ip4f_info.fi_dst.s_addr && ip->ip_p == fp->ip4f_info.fi_proto) { /* found the matching entry */ fin->fi_sport = fp->ip4f_info.fi_sport; fin->fi_dport = fp->ip4f_info.fi_dport; fin->fi_gpi = fp->ip4f_info.fi_gpi; if ((ntohs(ip->ip_off) & IP_MF) == 0) /* this is the last fragment, release the entry. */ ip4f_free(fp); return (1); } /* no matching entry found */ return (0); } static int ip4f_init(void) { struct ip4_frag *fp; int i; TAILQ_INIT(&ip4f_list); for (i=0; iip4f_valid = 0; TAILQ_INSERT_TAIL(&ip4f_list, fp, ip4f_chain); } return (0); } static struct ip4_frag * ip4f_alloc(void) { struct ip4_frag *fp; /* reclaim an entry at the tail, put it at the head */ fp = TAILQ_LAST(&ip4f_list, ip4f_list); TAILQ_REMOVE(&ip4f_list, fp, ip4f_chain); fp->ip4f_valid = 1; TAILQ_INSERT_HEAD(&ip4f_list, fp, ip4f_chain); return (fp); } static void ip4f_free(fp) struct ip4_frag *fp; { TAILQ_REMOVE(&ip4f_list, fp, ip4f_chain); fp->ip4f_valid = 0; TAILQ_INSERT_TAIL(&ip4f_list, fp, ip4f_chain); } #endif /* ALTQ3_CLFIER_COMPAT */ Index: stable/10/sys/net/pf_mtag.h =================================================================== --- stable/10/sys/net/pf_mtag.h (revision 263085) +++ stable/10/sys/net/pf_mtag.h (nonexistent) @@ -1,62 +0,0 @@ -/* $FreeBSD$ */ -/* - * Copyright (c) 2001 Daniel Hartmeier - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials provided - * with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN - * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * - */ - -#ifndef _NET_PF_MTAG_H_ -#define _NET_PF_MTAG_H_ - -#ifdef _KERNEL - -#define PF_TAG_GENERATED 0x01 -#define PF_TAG_FRAGCACHE 0x02 -#define PF_TAG_TRANSLATE_LOCALHOST 0x04 -#define PF_PACKET_LOOPED 0x08 -#define PF_FASTFWD_OURS_PRESENT 0x10 - -struct pf_mtag { - void *hdr; /* saved hdr pos in mbuf, for ECN */ - u_int32_t qid; /* queue id */ - u_int16_t tag; /* tag id */ - u_int8_t flags; - u_int8_t routed; -}; - -static __inline struct pf_mtag * -pf_find_mtag(struct mbuf *m) -{ - struct m_tag *mtag; - - if ((mtag = m_tag_find(m, PACKET_TAG_PF, NULL)) == NULL) - return (NULL); - - return ((struct pf_mtag *)(mtag + 1)); -} -#endif /* _KERNEL */ -#endif /* _NET_PF_MTAG_H_ */ Property changes on: stable/10/sys/net/pf_mtag.h ___________________________________________________________________ Deleted: svn:keywords ## -1 +0,0 ## -FreeBSD=%H \ No newline at end of property Index: stable/10/sys/net/if_ethersubr.c =================================================================== --- stable/10/sys/net/if_ethersubr.c (revision 263085) +++ stable/10/sys/net/if_ethersubr.c (revision 263086) @@ -1,1321 +1,1322 @@ /*- * Copyright (c) 1982, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if_ethersubr.c 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #include "opt_atalk.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipx.h" #include "opt_netgraph.h" #include "opt_mbuf_profiling.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include + +#include #if defined(INET) || defined(INET6) #include #include #include #include #include #endif #ifdef INET6 #include #endif #ifdef IPX #include #include #endif int (*ef_inputp)(struct ifnet*, struct ether_header *eh, struct mbuf *m); int (*ef_outputp)(struct ifnet *ifp, struct mbuf **mp, const struct sockaddr *dst, short *tp, int *hlen); #ifdef NETATALK #include #include #include #define llc_snap_org_code llc_un.type_snap.org_code #define llc_snap_ether_type llc_un.type_snap.ether_type extern u_char at_org_code[3]; extern u_char aarp_org_code[3]; #endif /* NETATALK */ #include #ifdef CTASSERT CTASSERT(sizeof (struct ether_header) == ETHER_ADDR_LEN * 2 + 2); CTASSERT(sizeof (struct ether_addr) == ETHER_ADDR_LEN); #endif VNET_DEFINE(struct pfil_head, link_pfil_hook); /* Packet filter hooks */ /* netgraph node hooks for ng_ether(4) */ void (*ng_ether_input_p)(struct ifnet *ifp, struct mbuf **mp); void (*ng_ether_input_orphan_p)(struct ifnet *ifp, struct mbuf *m); int (*ng_ether_output_p)(struct ifnet *ifp, struct mbuf **mp); void (*ng_ether_attach_p)(struct ifnet *ifp); void (*ng_ether_detach_p)(struct ifnet *ifp); void (*vlan_input_p)(struct ifnet *, struct mbuf *); /* if_bridge(4) support */ struct mbuf *(*bridge_input_p)(struct ifnet *, struct mbuf *); int (*bridge_output_p)(struct ifnet *, struct mbuf *, struct sockaddr *, struct rtentry *); void (*bridge_dn_p)(struct mbuf *, struct ifnet *); /* if_lagg(4) support */ struct mbuf *(*lagg_input_p)(struct ifnet *, struct mbuf *); static const u_char etherbroadcastaddr[ETHER_ADDR_LEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; static int ether_resolvemulti(struct ifnet *, struct sockaddr **, struct sockaddr *); #ifdef VIMAGE static void ether_reassign(struct ifnet *, struct vnet *, char *); #endif /* XXX: should be in an arp support file, not here */ static MALLOC_DEFINE(M_ARPCOM, "arpcom", "802.* interface internals"); #define ETHER_IS_BROADCAST(addr) \ (bcmp(etherbroadcastaddr, (addr), ETHER_ADDR_LEN) == 0) #define senderr(e) do { error = (e); goto bad;} while (0) static void update_mbuf_csumflags(struct mbuf *src, struct mbuf *dst) { int csum_flags = 0; if (src->m_pkthdr.csum_flags & CSUM_IP) csum_flags |= (CSUM_IP_CHECKED|CSUM_IP_VALID); if (src->m_pkthdr.csum_flags & CSUM_DELAY_DATA) csum_flags |= (CSUM_DATA_VALID|CSUM_PSEUDO_HDR); if (src->m_pkthdr.csum_flags & CSUM_SCTP) csum_flags |= CSUM_SCTP_VALID; dst->m_pkthdr.csum_flags |= csum_flags; if (csum_flags & CSUM_DATA_VALID) dst->m_pkthdr.csum_data = 0xffff; } /* * Ethernet output routine. * Encapsulate a packet of type family for the local net. * Use trailer local net encapsulation if enough data in first * packet leaves a multiple of 512 bytes of data in remainder. */ int ether_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { short type; int error = 0, hdrcmplt = 0; u_char esrc[ETHER_ADDR_LEN], edst[ETHER_ADDR_LEN]; struct llentry *lle = NULL; struct rtentry *rt0 = NULL; struct ether_header *eh; struct pf_mtag *t; int loop_copy = 1; int hlen; /* link layer header length */ if (ro != NULL) { if (!(m->m_flags & (M_BCAST | M_MCAST))) lle = ro->ro_lle; rt0 = ro->ro_rt; } #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) senderr(error); #endif M_PROFILE(m); if (ifp->if_flags & IFF_MONITOR) senderr(ENETDOWN); if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) senderr(ENETDOWN); hlen = ETHER_HDR_LEN; switch (dst->sa_family) { #ifdef INET case AF_INET: if (lle != NULL && (lle->la_flags & LLE_VALID)) memcpy(edst, &lle->ll_addr.mac16, sizeof(edst)); else error = arpresolve(ifp, rt0, m, dst, edst, &lle); if (error) return (error == EWOULDBLOCK ? 0 : error); type = htons(ETHERTYPE_IP); break; case AF_ARP: { struct arphdr *ah; ah = mtod(m, struct arphdr *); ah->ar_hrd = htons(ARPHRD_ETHER); loop_copy = 0; /* if this is for us, don't do it */ switch(ntohs(ah->ar_op)) { case ARPOP_REVREQUEST: case ARPOP_REVREPLY: type = htons(ETHERTYPE_REVARP); break; case ARPOP_REQUEST: case ARPOP_REPLY: default: type = htons(ETHERTYPE_ARP); break; } if (m->m_flags & M_BCAST) bcopy(ifp->if_broadcastaddr, edst, ETHER_ADDR_LEN); else bcopy(ar_tha(ah), edst, ETHER_ADDR_LEN); } break; #endif #ifdef INET6 case AF_INET6: if (lle != NULL && (lle->la_flags & LLE_VALID)) memcpy(edst, &lle->ll_addr.mac16, sizeof(edst)); else error = nd6_storelladdr(ifp, m, dst, (u_char *)edst, &lle); if (error) return error; type = htons(ETHERTYPE_IPV6); break; #endif #ifdef IPX case AF_IPX: if (ef_outputp) { error = ef_outputp(ifp, &m, dst, &type, &hlen); if (error) goto bad; } else type = htons(ETHERTYPE_IPX); bcopy(&((const struct sockaddr_ipx *)dst)->sipx_addr.x_host, edst, sizeof (edst)); break; #endif #ifdef NETATALK case AF_APPLETALK: { struct at_ifaddr *aa; if ((aa = at_ifawithnet((const struct sockaddr_at *)dst)) == NULL) senderr(EHOSTUNREACH); /* XXX */ if (!aarpresolve(ifp, m, (const struct sockaddr_at *)dst, edst)) { ifa_free(&aa->aa_ifa); return (0); } /* * In the phase 2 case, need to prepend an mbuf for the llc header. */ if ( aa->aa_flags & AFA_PHASE2 ) { struct llc llc; ifa_free(&aa->aa_ifa); M_PREPEND(m, LLC_SNAPFRAMELEN, M_NOWAIT); if (m == NULL) senderr(ENOBUFS); llc.llc_dsap = llc.llc_ssap = LLC_SNAP_LSAP; llc.llc_control = LLC_UI; bcopy(at_org_code, llc.llc_snap_org_code, sizeof(at_org_code)); llc.llc_snap_ether_type = htons( ETHERTYPE_AT ); bcopy(&llc, mtod(m, caddr_t), LLC_SNAPFRAMELEN); type = htons(m->m_pkthdr.len); hlen = LLC_SNAPFRAMELEN + ETHER_HDR_LEN; } else { ifa_free(&aa->aa_ifa); type = htons(ETHERTYPE_AT); } break; } #endif /* NETATALK */ case pseudo_AF_HDRCMPLT: { const struct ether_header *eh; hdrcmplt = 1; eh = (const struct ether_header *)dst->sa_data; (void)memcpy(esrc, eh->ether_shost, sizeof (esrc)); /* FALLTHROUGH */ case AF_UNSPEC: loop_copy = 0; /* if this is for us, don't do it */ eh = (const struct ether_header *)dst->sa_data; (void)memcpy(edst, eh->ether_dhost, sizeof (edst)); type = eh->ether_type; break; } default: if_printf(ifp, "can't handle af%d\n", dst->sa_family); senderr(EAFNOSUPPORT); } if (lle != NULL && (lle->la_flags & LLE_IFADDR)) { update_mbuf_csumflags(m, m); return (if_simloop(ifp, m, dst->sa_family, 0)); } /* * Add local net header. If no space in first mbuf, * allocate another. */ M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT); if (m == NULL) senderr(ENOBUFS); eh = mtod(m, struct ether_header *); (void)memcpy(&eh->ether_type, &type, sizeof(eh->ether_type)); (void)memcpy(eh->ether_dhost, edst, sizeof (edst)); if (hdrcmplt) (void)memcpy(eh->ether_shost, esrc, sizeof(eh->ether_shost)); else (void)memcpy(eh->ether_shost, IF_LLADDR(ifp), sizeof(eh->ether_shost)); /* * If a simplex interface, and the packet is being sent to our * Ethernet address or a broadcast address, loopback a copy. * XXX To make a simplex device behave exactly like a duplex * device, we should copy in the case of sending to our own * ethernet address (thus letting the original actually appear * on the wire). However, we don't do that here for security * reasons and compatibility with the original behavior. */ if ((ifp->if_flags & IFF_SIMPLEX) && loop_copy && ((t = pf_find_mtag(m)) == NULL || !t->routed)) { if (m->m_flags & M_BCAST) { struct mbuf *n; /* * Because if_simloop() modifies the packet, we need a * writable copy through m_dup() instead of a readonly * one as m_copy[m] would give us. The alternative would * be to modify if_simloop() to handle the readonly mbuf, * but performancewise it is mostly equivalent (trading * extra data copying vs. extra locking). * * XXX This is a local workaround. A number of less * often used kernel parts suffer from the same bug. * See PR kern/105943 for a proposed general solution. */ if ((n = m_dup(m, M_NOWAIT)) != NULL) { update_mbuf_csumflags(m, n); (void)if_simloop(ifp, n, dst->sa_family, hlen); } else ifp->if_iqdrops++; } else if (bcmp(eh->ether_dhost, eh->ether_shost, ETHER_ADDR_LEN) == 0) { update_mbuf_csumflags(m, m); (void) if_simloop(ifp, m, dst->sa_family, hlen); return (0); /* XXX */ } } /* * Bridges require special output handling. */ if (ifp->if_bridge) { BRIDGE_OUTPUT(ifp, m, error); return (error); } #if defined(INET) || defined(INET6) if (ifp->if_carp && (error = (*carp_output_p)(ifp, m, dst))) goto bad; #endif /* Handle ng_ether(4) processing, if any */ if (IFP2AC(ifp)->ac_netgraph != NULL) { KASSERT(ng_ether_output_p != NULL, ("ng_ether_output_p is NULL")); if ((error = (*ng_ether_output_p)(ifp, &m)) != 0) { bad: if (m != NULL) m_freem(m); return (error); } if (m == NULL) return (0); } /* Continue with link-layer output */ return ether_output_frame(ifp, m); } /* * Ethernet link layer output routine to send a raw frame to the device. * * This assumes that the 14 byte Ethernet header is present and contiguous * in the first mbuf (if BRIDGE'ing). */ int ether_output_frame(struct ifnet *ifp, struct mbuf *m) { int i; if (PFIL_HOOKED(&V_link_pfil_hook)) { i = pfil_run_hooks(&V_link_pfil_hook, &m, ifp, PFIL_OUT, NULL); if (i != 0) return (EACCES); if (m == NULL) return (0); } /* * Queue message on interface, update output statistics if * successful, and start output if interface not yet active. */ return ((ifp->if_transmit)(ifp, m)); } #if defined(INET) || defined(INET6) #endif /* * Process a received Ethernet packet; the packet is in the * mbuf chain m with the ethernet header at the front. */ static void ether_input_internal(struct ifnet *ifp, struct mbuf *m) { struct ether_header *eh; u_short etype; if ((ifp->if_flags & IFF_UP) == 0) { m_freem(m); return; } #ifdef DIAGNOSTIC if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { if_printf(ifp, "discard frame at !IFF_DRV_RUNNING\n"); m_freem(m); return; } #endif /* * Do consistency checks to verify assumptions * made by code past this point. */ if ((m->m_flags & M_PKTHDR) == 0) { if_printf(ifp, "discard frame w/o packet header\n"); ifp->if_ierrors++; m_freem(m); return; } if (m->m_len < ETHER_HDR_LEN) { /* XXX maybe should pullup? */ if_printf(ifp, "discard frame w/o leading ethernet " "header (len %u pkt len %u)\n", m->m_len, m->m_pkthdr.len); ifp->if_ierrors++; m_freem(m); return; } eh = mtod(m, struct ether_header *); etype = ntohs(eh->ether_type); if (m->m_pkthdr.rcvif == NULL) { if_printf(ifp, "discard frame w/o interface pointer\n"); ifp->if_ierrors++; m_freem(m); return; } #ifdef DIAGNOSTIC if (m->m_pkthdr.rcvif != ifp) { if_printf(ifp, "Warning, frame marked as received on %s\n", m->m_pkthdr.rcvif->if_xname); } #endif CURVNET_SET_QUIET(ifp->if_vnet); if (ETHER_IS_MULTICAST(eh->ether_dhost)) { if (ETHER_IS_BROADCAST(eh->ether_dhost)) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; ifp->if_imcasts++; } #ifdef MAC /* * Tag the mbuf with an appropriate MAC label before any other * consumers can get to it. */ mac_ifnet_create_mbuf(ifp, m); #endif /* * Give bpf a chance at the packet. */ ETHER_BPF_MTAP(ifp, m); /* * If the CRC is still on the packet, trim it off. We do this once * and once only in case we are re-entered. Nothing else on the * Ethernet receive path expects to see the FCS. */ if (m->m_flags & M_HASFCS) { m_adj(m, -ETHER_CRC_LEN); m->m_flags &= ~M_HASFCS; } if (!(ifp->if_capenable & IFCAP_HWSTATS)) ifp->if_ibytes += m->m_pkthdr.len; /* Allow monitor mode to claim this frame, after stats are updated. */ if (ifp->if_flags & IFF_MONITOR) { m_freem(m); CURVNET_RESTORE(); return; } /* Handle input from a lagg(4) port */ if (ifp->if_type == IFT_IEEE8023ADLAG) { KASSERT(lagg_input_p != NULL, ("%s: if_lagg not loaded!", __func__)); m = (*lagg_input_p)(ifp, m); if (m != NULL) ifp = m->m_pkthdr.rcvif; else { CURVNET_RESTORE(); return; } } /* * If the hardware did not process an 802.1Q tag, do this now, * to allow 802.1P priority frames to be passed to the main input * path correctly. * TODO: Deal with Q-in-Q frames, but not arbitrary nesting levels. */ if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_VLAN) { struct ether_vlan_header *evl; if (m->m_len < sizeof(*evl) && (m = m_pullup(m, sizeof(*evl))) == NULL) { #ifdef DIAGNOSTIC if_printf(ifp, "cannot pullup VLAN header\n"); #endif ifp->if_ierrors++; m_freem(m); CURVNET_RESTORE(); return; } evl = mtod(m, struct ether_vlan_header *); m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag); m->m_flags |= M_VLANTAG; bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN, ETHER_HDR_LEN - ETHER_TYPE_LEN); m_adj(m, ETHER_VLAN_ENCAP_LEN); eh = mtod(m, struct ether_header *); } M_SETFIB(m, ifp->if_fib); /* Allow ng_ether(4) to claim this frame. */ if (IFP2AC(ifp)->ac_netgraph != NULL) { KASSERT(ng_ether_input_p != NULL, ("%s: ng_ether_input_p is NULL", __func__)); m->m_flags &= ~M_PROMISC; (*ng_ether_input_p)(ifp, &m); if (m == NULL) { CURVNET_RESTORE(); return; } eh = mtod(m, struct ether_header *); } /* * Allow if_bridge(4) to claim this frame. * The BRIDGE_INPUT() macro will update ifp if the bridge changed it * and the frame should be delivered locally. */ if (ifp->if_bridge != NULL) { m->m_flags &= ~M_PROMISC; BRIDGE_INPUT(ifp, m); if (m == NULL) { CURVNET_RESTORE(); return; } eh = mtod(m, struct ether_header *); } #if defined(INET) || defined(INET6) /* * Clear M_PROMISC on frame so that carp(4) will see it when the * mbuf flows up to Layer 3. * FreeBSD's implementation of carp(4) uses the inprotosw * to dispatch IPPROTO_CARP. carp(4) also allocates its own * Ethernet addresses of the form 00:00:5e:00:01:xx, which * is outside the scope of the M_PROMISC test below. * TODO: Maintain a hash table of ethernet addresses other than * ether_dhost which may be active on this ifp. */ if (ifp->if_carp && (*carp_forus_p)(ifp, eh->ether_dhost)) { m->m_flags &= ~M_PROMISC; } else #endif { /* * If the frame received was not for our MAC address, set the * M_PROMISC flag on the mbuf chain. The frame may need to * be seen by the rest of the Ethernet input path in case of * re-entry (e.g. bridge, vlan, netgraph) but should not be * seen by upper protocol layers. */ if (!ETHER_IS_MULTICAST(eh->ether_dhost) && bcmp(IF_LLADDR(ifp), eh->ether_dhost, ETHER_ADDR_LEN) != 0) m->m_flags |= M_PROMISC; } if (harvest.ethernet) random_harvest(&(m->m_data), 12, 2, RANDOM_NET_ETHER); ether_demux(ifp, m); CURVNET_RESTORE(); } /* * Ethernet input dispatch; by default, direct dispatch here regardless of * global configuration. */ static void ether_nh_input(struct mbuf *m) { ether_input_internal(m->m_pkthdr.rcvif, m); } static struct netisr_handler ether_nh = { .nh_name = "ether", .nh_handler = ether_nh_input, .nh_proto = NETISR_ETHER, .nh_policy = NETISR_POLICY_SOURCE, .nh_dispatch = NETISR_DISPATCH_DIRECT, }; static void ether_init(__unused void *arg) { netisr_register(ðer_nh); } SYSINIT(ether, SI_SUB_INIT_IF, SI_ORDER_ANY, ether_init, NULL); static void vnet_ether_init(__unused void *arg) { int i; /* Initialize packet filter hooks. */ V_link_pfil_hook.ph_type = PFIL_TYPE_AF; V_link_pfil_hook.ph_af = AF_LINK; if ((i = pfil_head_register(&V_link_pfil_hook)) != 0) printf("%s: WARNING: unable to register pfil link hook, " "error %d\n", __func__, i); } VNET_SYSINIT(vnet_ether_init, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_ether_init, NULL); static void vnet_ether_destroy(__unused void *arg) { int i; if ((i = pfil_head_unregister(&V_link_pfil_hook)) != 0) printf("%s: WARNING: unable to unregister pfil link hook, " "error %d\n", __func__, i); } VNET_SYSUNINIT(vnet_ether_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_ether_destroy, NULL); static void ether_input(struct ifnet *ifp, struct mbuf *m) { /* * We will rely on rcvif being set properly in the deferred context, * so assert it is correct here. */ KASSERT(m->m_pkthdr.rcvif == ifp, ("%s: ifnet mismatch", __func__)); netisr_dispatch(NETISR_ETHER, m); } /* * Upper layer processing for a received Ethernet packet. */ void ether_demux(struct ifnet *ifp, struct mbuf *m) { struct ether_header *eh; int i, isr; u_short ether_type; #if defined(NETATALK) struct llc *l; #endif KASSERT(ifp != NULL, ("%s: NULL interface pointer", __func__)); /* Do not grab PROMISC frames in case we are re-entered. */ if (PFIL_HOOKED(&V_link_pfil_hook) && !(m->m_flags & M_PROMISC)) { i = pfil_run_hooks(&V_link_pfil_hook, &m, ifp, PFIL_IN, NULL); if (i != 0 || m == NULL) return; } eh = mtod(m, struct ether_header *); ether_type = ntohs(eh->ether_type); /* * If this frame has a VLAN tag other than 0, call vlan_input() * if its module is loaded. Otherwise, drop. */ if ((m->m_flags & M_VLANTAG) && EVL_VLANOFTAG(m->m_pkthdr.ether_vtag) != 0) { if (ifp->if_vlantrunk == NULL) { ifp->if_noproto++; m_freem(m); return; } KASSERT(vlan_input_p != NULL,("%s: VLAN not loaded!", __func__)); /* Clear before possibly re-entering ether_input(). */ m->m_flags &= ~M_PROMISC; (*vlan_input_p)(ifp, m); return; } /* * Pass promiscuously received frames to the upper layer if the user * requested this by setting IFF_PPROMISC. Otherwise, drop them. */ if ((ifp->if_flags & IFF_PPROMISC) == 0 && (m->m_flags & M_PROMISC)) { m_freem(m); return; } /* * Reset layer specific mbuf flags to avoid confusing upper layers. * Strip off Ethernet header. */ m->m_flags &= ~M_VLANTAG; m_clrprotoflags(m); m_adj(m, ETHER_HDR_LEN); /* * Dispatch frame to upper layer. */ switch (ether_type) { #ifdef INET case ETHERTYPE_IP: if ((m = ip_fastforward(m)) == NULL) return; isr = NETISR_IP; break; case ETHERTYPE_ARP: if (ifp->if_flags & IFF_NOARP) { /* Discard packet if ARP is disabled on interface */ m_freem(m); return; } isr = NETISR_ARP; break; #endif #ifdef IPX case ETHERTYPE_IPX: if (ef_inputp && ef_inputp(ifp, eh, m) == 0) return; isr = NETISR_IPX; break; #endif #ifdef INET6 case ETHERTYPE_IPV6: isr = NETISR_IPV6; break; #endif #ifdef NETATALK case ETHERTYPE_AT: isr = NETISR_ATALK1; break; case ETHERTYPE_AARP: isr = NETISR_AARP; break; #endif /* NETATALK */ default: #ifdef IPX if (ef_inputp && ef_inputp(ifp, eh, m) == 0) return; #endif /* IPX */ #if defined(NETATALK) if (ether_type > ETHERMTU) goto discard; l = mtod(m, struct llc *); if (l->llc_dsap == LLC_SNAP_LSAP && l->llc_ssap == LLC_SNAP_LSAP && l->llc_control == LLC_UI) { if (bcmp(&(l->llc_snap_org_code)[0], at_org_code, sizeof(at_org_code)) == 0 && ntohs(l->llc_snap_ether_type) == ETHERTYPE_AT) { m_adj(m, LLC_SNAPFRAMELEN); isr = NETISR_ATALK2; break; } if (bcmp(&(l->llc_snap_org_code)[0], aarp_org_code, sizeof(aarp_org_code)) == 0 && ntohs(l->llc_snap_ether_type) == ETHERTYPE_AARP) { m_adj(m, LLC_SNAPFRAMELEN); isr = NETISR_AARP; break; } } #endif /* NETATALK */ goto discard; } netisr_dispatch(isr, m); return; discard: /* * Packet is to be discarded. If netgraph is present, * hand the packet to it for last chance processing; * otherwise dispose of it. */ if (IFP2AC(ifp)->ac_netgraph != NULL) { KASSERT(ng_ether_input_orphan_p != NULL, ("ng_ether_input_orphan_p is NULL")); /* * Put back the ethernet header so netgraph has a * consistent view of inbound packets. */ M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT); (*ng_ether_input_orphan_p)(ifp, m); return; } m_freem(m); } /* * Convert Ethernet address to printable (loggable) representation. * This routine is for compatibility; it's better to just use * * printf("%6D", , ":"); * * since there's no static buffer involved. */ char * ether_sprintf(const u_char *ap) { static char etherbuf[18]; snprintf(etherbuf, sizeof (etherbuf), "%6D", ap, ":"); return (etherbuf); } /* * Perform common duties while attaching to interface list */ void ether_ifattach(struct ifnet *ifp, const u_int8_t *lla) { int i; struct ifaddr *ifa; struct sockaddr_dl *sdl; ifp->if_addrlen = ETHER_ADDR_LEN; ifp->if_hdrlen = ETHER_HDR_LEN; if_attach(ifp); ifp->if_mtu = ETHERMTU; ifp->if_output = ether_output; ifp->if_input = ether_input; ifp->if_resolvemulti = ether_resolvemulti; #ifdef VIMAGE ifp->if_reassign = ether_reassign; #endif if (ifp->if_baudrate == 0) ifp->if_baudrate = IF_Mbps(10); /* just a default */ ifp->if_broadcastaddr = etherbroadcastaddr; ifa = ifp->if_addr; KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__)); sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_type = IFT_ETHER; sdl->sdl_alen = ifp->if_addrlen; bcopy(lla, LLADDR(sdl), ifp->if_addrlen); bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN); if (ng_ether_attach_p != NULL) (*ng_ether_attach_p)(ifp); /* Announce Ethernet MAC address if non-zero. */ for (i = 0; i < ifp->if_addrlen; i++) if (lla[i] != 0) break; if (i != ifp->if_addrlen) if_printf(ifp, "Ethernet address: %6D\n", lla, ":"); uuid_ether_add(LLADDR(sdl)); } /* * Perform common duties while detaching an Ethernet interface */ void ether_ifdetach(struct ifnet *ifp) { struct sockaddr_dl *sdl; sdl = (struct sockaddr_dl *)(ifp->if_addr->ifa_addr); uuid_ether_del(LLADDR(sdl)); if (IFP2AC(ifp)->ac_netgraph != NULL) { KASSERT(ng_ether_detach_p != NULL, ("ng_ether_detach_p is NULL")); (*ng_ether_detach_p)(ifp); } bpfdetach(ifp); if_detach(ifp); } #ifdef VIMAGE void ether_reassign(struct ifnet *ifp, struct vnet *new_vnet, char *unused __unused) { if (IFP2AC(ifp)->ac_netgraph != NULL) { KASSERT(ng_ether_detach_p != NULL, ("ng_ether_detach_p is NULL")); (*ng_ether_detach_p)(ifp); } if (ng_ether_attach_p != NULL) { CURVNET_SET_QUIET(new_vnet); (*ng_ether_attach_p)(ifp); CURVNET_RESTORE(); } } #endif SYSCTL_DECL(_net_link); SYSCTL_NODE(_net_link, IFT_ETHER, ether, CTLFLAG_RW, 0, "Ethernet"); #if 0 /* * This is for reference. We have a table-driven version * of the little-endian crc32 generator, which is faster * than the double-loop. */ uint32_t ether_crc32_le(const uint8_t *buf, size_t len) { size_t i; uint32_t crc; int bit; uint8_t data; crc = 0xffffffff; /* initial value */ for (i = 0; i < len; i++) { for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) { carry = (crc ^ data) & 1; crc >>= 1; if (carry) crc = (crc ^ ETHER_CRC_POLY_LE); } } return (crc); } #else uint32_t ether_crc32_le(const uint8_t *buf, size_t len) { static const uint32_t crctab[] = { 0x00000000, 0x1db71064, 0x3b6e20c8, 0x26d930ac, 0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c, 0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c, 0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c }; size_t i; uint32_t crc; crc = 0xffffffff; /* initial value */ for (i = 0; i < len; i++) { crc ^= buf[i]; crc = (crc >> 4) ^ crctab[crc & 0xf]; crc = (crc >> 4) ^ crctab[crc & 0xf]; } return (crc); } #endif uint32_t ether_crc32_be(const uint8_t *buf, size_t len) { size_t i; uint32_t crc, carry; int bit; uint8_t data; crc = 0xffffffff; /* initial value */ for (i = 0; i < len; i++) { for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) { carry = ((crc & 0x80000000) ? 1 : 0) ^ (data & 0x01); crc <<= 1; if (carry) crc = (crc ^ ETHER_CRC_POLY_BE) | carry; } } return (crc); } int ether_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct ifaddr *ifa = (struct ifaddr *) data; struct ifreq *ifr = (struct ifreq *) data; int error = 0; switch (command) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: ifp->if_init(ifp->if_softc); /* before arpwhohas */ arp_ifinit(ifp, ifa); break; #endif #ifdef IPX /* * XXX - This code is probably wrong */ case AF_IPX: { struct ipx_addr *ina = &(IA_SIPX(ifa)->sipx_addr); if (ipx_nullhost(*ina)) ina->x_host = *(union ipx_host *) IF_LLADDR(ifp); else { bcopy((caddr_t) ina->x_host.c_host, (caddr_t) IF_LLADDR(ifp), ETHER_ADDR_LEN); } /* * Set new address */ ifp->if_init(ifp->if_softc); break; } #endif default: ifp->if_init(ifp->if_softc); break; } break; case SIOCGIFADDR: { struct sockaddr *sa; sa = (struct sockaddr *) & ifr->ifr_data; bcopy(IF_LLADDR(ifp), (caddr_t) sa->sa_data, ETHER_ADDR_LEN); } break; case SIOCSIFMTU: /* * Set the interface MTU. */ if (ifr->ifr_mtu > ETHERMTU) { error = EINVAL; } else { ifp->if_mtu = ifr->ifr_mtu; } break; default: error = EINVAL; /* XXX netbsd has ENOTTY??? */ break; } return (error); } static int ether_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, struct sockaddr *sa) { struct sockaddr_dl *sdl; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif u_char *e_addr; switch(sa->sa_family) { case AF_LINK: /* * No mapping needed. Just check that it's a valid MC address. */ sdl = (struct sockaddr_dl *)sa; e_addr = LLADDR(sdl); if (!ETHER_IS_MULTICAST(e_addr)) return EADDRNOTAVAIL; *llsa = 0; return 0; #ifdef INET case AF_INET: sin = (struct sockaddr_in *)sa; if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) return EADDRNOTAVAIL; sdl = malloc(sizeof *sdl, M_IFMADDR, M_NOWAIT|M_ZERO); if (sdl == NULL) return ENOMEM; sdl->sdl_len = sizeof *sdl; sdl->sdl_family = AF_LINK; sdl->sdl_index = ifp->if_index; sdl->sdl_type = IFT_ETHER; sdl->sdl_alen = ETHER_ADDR_LEN; e_addr = LLADDR(sdl); ETHER_MAP_IP_MULTICAST(&sin->sin_addr, e_addr); *llsa = (struct sockaddr *)sdl; return 0; #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* * An IP6 address of 0 means listen to all * of the Ethernet multicast address used for IP6. * (This is used for multicast routers.) */ ifp->if_flags |= IFF_ALLMULTI; *llsa = 0; return 0; } if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return EADDRNOTAVAIL; sdl = malloc(sizeof *sdl, M_IFMADDR, M_NOWAIT|M_ZERO); if (sdl == NULL) return (ENOMEM); sdl->sdl_len = sizeof *sdl; sdl->sdl_family = AF_LINK; sdl->sdl_index = ifp->if_index; sdl->sdl_type = IFT_ETHER; sdl->sdl_alen = ETHER_ADDR_LEN; e_addr = LLADDR(sdl); ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, e_addr); *llsa = (struct sockaddr *)sdl; return 0; #endif default: /* * Well, the text isn't quite right, but it's the name * that counts... */ return EAFNOSUPPORT; } } static void* ether_alloc(u_char type, struct ifnet *ifp) { struct arpcom *ac; ac = malloc(sizeof(struct arpcom), M_ARPCOM, M_WAITOK | M_ZERO); ac->ac_ifp = ifp; return (ac); } static void ether_free(void *com, u_char type) { free(com, M_ARPCOM); } static int ether_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: if_register_com_alloc(IFT_ETHER, ether_alloc, ether_free); break; case MOD_UNLOAD: if_deregister_com_alloc(IFT_ETHER); break; default: return EOPNOTSUPP; } return (0); } static moduledata_t ether_mod = { "ether", ether_modevent, 0 }; void ether_vlan_mtap(struct bpf_if *bp, struct mbuf *m, void *data, u_int dlen) { struct ether_vlan_header vlan; struct mbuf mv, mb; KASSERT((m->m_flags & M_VLANTAG) != 0, ("%s: vlan information not present", __func__)); KASSERT(m->m_len >= sizeof(struct ether_header), ("%s: mbuf not large enough for header", __func__)); bcopy(mtod(m, char *), &vlan, sizeof(struct ether_header)); vlan.evl_proto = vlan.evl_encap_proto; vlan.evl_encap_proto = htons(ETHERTYPE_VLAN); vlan.evl_tag = htons(m->m_pkthdr.ether_vtag); m->m_len -= sizeof(struct ether_header); m->m_data += sizeof(struct ether_header); /* * If a data link has been supplied by the caller, then we will need to * re-create a stack allocated mbuf chain with the following structure: * * (1) mbuf #1 will contain the supplied data link * (2) mbuf #2 will contain the vlan header * (3) mbuf #3 will contain the original mbuf's packet data * * Otherwise, submit the packet and vlan header via bpf_mtap2(). */ if (data != NULL) { mv.m_next = m; mv.m_data = (caddr_t)&vlan; mv.m_len = sizeof(vlan); mb.m_next = &mv; mb.m_data = data; mb.m_len = dlen; bpf_mtap(bp, &mb); } else bpf_mtap2(bp, &vlan, sizeof(vlan), m); m->m_len += sizeof(struct ether_header); m->m_data -= sizeof(struct ether_header); } struct mbuf * ether_vlanencap(struct mbuf *m, uint16_t tag) { struct ether_vlan_header *evl; M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT); if (m == NULL) return (NULL); /* M_PREPEND takes care of m_len, m_pkthdr.len for us */ if (m->m_len < sizeof(*evl)) { m = m_pullup(m, sizeof(*evl)); if (m == NULL) return (NULL); } /* * Transform the Ethernet header into an Ethernet header * with 802.1Q encapsulation. */ evl = mtod(m, struct ether_vlan_header *); bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN, (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN); evl->evl_encap_proto = htons(ETHERTYPE_VLAN); evl->evl_tag = htons(tag); return (m); } DECLARE_MODULE(ether, ether_mod, SI_SUB_INIT_IF, SI_ORDER_ANY); MODULE_VERSION(ether, 1); Index: stable/10/sys/net/pfvar.h =================================================================== --- stable/10/sys/net/pfvar.h (revision 263085) +++ stable/10/sys/net/pfvar.h (revision 263086) @@ -1,1951 +1,1775 @@ /* * Copyright (c) 2001 Daniel Hartmeier * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * - Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials provided * with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $OpenBSD: pfvar.h,v 1.282 2009/01/29 15:12:28 pyr Exp $ * $FreeBSD$ */ #ifndef _NET_PFVAR_H_ #define _NET_PFVAR_H_ #include #include #include #include #include #include #include -#include +#include +#include +#include -#define PF_TCPS_PROXY_SRC ((TCP_NSTATES)+0) -#define PF_TCPS_PROXY_DST ((TCP_NSTATES)+1) - -#define PF_MD5_DIGEST_LENGTH 16 -#ifdef MD5_DIGEST_LENGTH -#if PF_MD5_DIGEST_LENGTH != MD5_DIGEST_LENGTH -#error -#endif -#endif - -enum { PF_INOUT, PF_IN, PF_OUT }; -enum { PF_PASS, PF_DROP, PF_SCRUB, PF_NOSCRUB, PF_NAT, PF_NONAT, - PF_BINAT, PF_NOBINAT, PF_RDR, PF_NORDR, PF_SYNPROXY_DROP, PF_DEFER }; -enum { PF_RULESET_SCRUB, PF_RULESET_FILTER, PF_RULESET_NAT, - PF_RULESET_BINAT, PF_RULESET_RDR, PF_RULESET_MAX }; -enum { PF_OP_NONE, PF_OP_IRG, PF_OP_EQ, PF_OP_NE, PF_OP_LT, - PF_OP_LE, PF_OP_GT, PF_OP_GE, PF_OP_XRG, PF_OP_RRG }; -enum { PF_DEBUG_NONE, PF_DEBUG_URGENT, PF_DEBUG_MISC, PF_DEBUG_NOISY }; -enum { PF_CHANGE_NONE, PF_CHANGE_ADD_HEAD, PF_CHANGE_ADD_TAIL, - PF_CHANGE_ADD_BEFORE, PF_CHANGE_ADD_AFTER, - PF_CHANGE_REMOVE, PF_CHANGE_GET_TICKET }; -enum { PF_GET_NONE, PF_GET_CLR_CNTR }; -enum { PF_SK_WIRE, PF_SK_STACK, PF_SK_BOTH }; - -/* - * Note about PFTM_*: real indices into pf_rule.timeout[] come before - * PFTM_MAX, special cases afterwards. See pf_state_expires(). - */ -enum { PFTM_TCP_FIRST_PACKET, PFTM_TCP_OPENING, PFTM_TCP_ESTABLISHED, - PFTM_TCP_CLOSING, PFTM_TCP_FIN_WAIT, PFTM_TCP_CLOSED, - PFTM_UDP_FIRST_PACKET, PFTM_UDP_SINGLE, PFTM_UDP_MULTIPLE, - PFTM_ICMP_FIRST_PACKET, PFTM_ICMP_ERROR_REPLY, - PFTM_OTHER_FIRST_PACKET, PFTM_OTHER_SINGLE, - PFTM_OTHER_MULTIPLE, PFTM_FRAG, PFTM_INTERVAL, - PFTM_ADAPTIVE_START, PFTM_ADAPTIVE_END, PFTM_SRC_NODE, - PFTM_TS_DIFF, PFTM_MAX, PFTM_PURGE, PFTM_UNLINKED }; - -/* PFTM default values */ -#define PFTM_TCP_FIRST_PACKET_VAL 120 /* First TCP packet */ -#define PFTM_TCP_OPENING_VAL 30 /* No response yet */ -#define PFTM_TCP_ESTABLISHED_VAL 24*60*60/* Established */ -#define PFTM_TCP_CLOSING_VAL 15 * 60 /* Half closed */ -#define PFTM_TCP_FIN_WAIT_VAL 45 /* Got both FINs */ -#define PFTM_TCP_CLOSED_VAL 90 /* Got a RST */ -#define PFTM_UDP_FIRST_PACKET_VAL 60 /* First UDP packet */ -#define PFTM_UDP_SINGLE_VAL 30 /* Unidirectional */ -#define PFTM_UDP_MULTIPLE_VAL 60 /* Bidirectional */ -#define PFTM_ICMP_FIRST_PACKET_VAL 20 /* First ICMP packet */ -#define PFTM_ICMP_ERROR_REPLY_VAL 10 /* Got error response */ -#define PFTM_OTHER_FIRST_PACKET_VAL 60 /* First packet */ -#define PFTM_OTHER_SINGLE_VAL 30 /* Unidirectional */ -#define PFTM_OTHER_MULTIPLE_VAL 60 /* Bidirectional */ -#define PFTM_FRAG_VAL 30 /* Fragment expire */ -#define PFTM_INTERVAL_VAL 10 /* Expire interval */ -#define PFTM_SRC_NODE_VAL 0 /* Source tracking */ -#define PFTM_TS_DIFF_VAL 30 /* Allowed TS diff */ - -enum { PF_NOPFROUTE, PF_FASTROUTE, PF_ROUTETO, PF_DUPTO, PF_REPLYTO }; -enum { PF_LIMIT_STATES, PF_LIMIT_SRC_NODES, PF_LIMIT_FRAGS, - PF_LIMIT_TABLE_ENTRIES, PF_LIMIT_MAX }; -#define PF_POOL_IDMASK 0x0f -enum { PF_POOL_NONE, PF_POOL_BITMASK, PF_POOL_RANDOM, - PF_POOL_SRCHASH, PF_POOL_ROUNDROBIN }; -enum { PF_ADDR_ADDRMASK, PF_ADDR_NOROUTE, PF_ADDR_DYNIFTL, - PF_ADDR_TABLE, PF_ADDR_URPFFAILED, - PF_ADDR_RANGE }; -#define PF_POOL_TYPEMASK 0x0f -#define PF_POOL_STICKYADDR 0x20 -#define PF_WSCALE_FLAG 0x80 -#define PF_WSCALE_MASK 0x0f - -#define PF_LOG 0x01 -#define PF_LOG_ALL 0x02 -#define PF_LOG_SOCKET_LOOKUP 0x04 - struct pf_addr { union { struct in_addr v4; struct in6_addr v6; u_int8_t addr8[16]; u_int16_t addr16[8]; u_int32_t addr32[4]; } pfa; /* 128-bit address */ #define v4 pfa.v4 #define v6 pfa.v6 #define addr8 pfa.addr8 #define addr16 pfa.addr16 #define addr32 pfa.addr32 }; -#define PF_TABLE_NAME_SIZE 32 - #define PFI_AFLAG_NETWORK 0x01 #define PFI_AFLAG_BROADCAST 0x02 #define PFI_AFLAG_PEER 0x04 #define PFI_AFLAG_MODEMASK 0x07 #define PFI_AFLAG_NOALIAS 0x08 struct pf_addr_wrap { union { struct { struct pf_addr addr; struct pf_addr mask; } a; char ifname[IFNAMSIZ]; char tblname[PF_TABLE_NAME_SIZE]; } v; union { struct pfi_dynaddr *dyn; struct pfr_ktable *tbl; int dyncnt; int tblcnt; } p; u_int8_t type; /* PF_ADDR_* */ u_int8_t iflags; /* PFI_AFLAG_* */ }; #ifdef _KERNEL struct pfi_dynaddr { TAILQ_ENTRY(pfi_dynaddr) entry; struct pf_addr pfid_addr4; struct pf_addr pfid_mask4; struct pf_addr pfid_addr6; struct pf_addr pfid_mask6; struct pfr_ktable *pfid_kt; struct pfi_kif *pfid_kif; int pfid_net; /* mask or 128 */ int pfid_acnt4; /* address count IPv4 */ int pfid_acnt6; /* address count IPv6 */ sa_family_t pfid_af; /* rule af */ u_int8_t pfid_iflags; /* PFI_AFLAG_* */ }; /* * Address manipulation macros */ #define HTONL(x) (x) = htonl((__uint32_t)(x)) #define HTONS(x) (x) = htons((__uint16_t)(x)) #define NTOHL(x) (x) = ntohl((__uint32_t)(x)) #define NTOHS(x) (x) = ntohs((__uint16_t)(x)) #define PF_NAME "pf" #define PF_HASHROW_ASSERT(h) mtx_assert(&(h)->lock, MA_OWNED) #define PF_HASHROW_LOCK(h) mtx_lock(&(h)->lock) #define PF_HASHROW_UNLOCK(h) mtx_unlock(&(h)->lock) #define PF_STATE_LOCK(s) \ do { \ struct pf_idhash *_ih = &V_pf_idhash[PF_IDHASH(s)]; \ PF_HASHROW_LOCK(_ih); \ } while (0) #define PF_STATE_UNLOCK(s) \ do { \ struct pf_idhash *_ih = &V_pf_idhash[PF_IDHASH((s))]; \ PF_HASHROW_UNLOCK(_ih); \ } while (0) #ifdef INVARIANTS #define PF_STATE_LOCK_ASSERT(s) \ do { \ struct pf_idhash *_ih = &V_pf_idhash[PF_IDHASH(s)]; \ PF_HASHROW_ASSERT(_ih); \ } while (0) #else /* !INVARIANTS */ #define PF_STATE_LOCK_ASSERT(s) do {} while (0) #endif /* INVARIANTS */ extern struct mtx pf_unlnkdrules_mtx; #define PF_UNLNKDRULES_LOCK() mtx_lock(&pf_unlnkdrules_mtx) #define PF_UNLNKDRULES_UNLOCK() mtx_unlock(&pf_unlnkdrules_mtx) extern struct rwlock pf_rules_lock; #define PF_RULES_RLOCK() rw_rlock(&pf_rules_lock) #define PF_RULES_RUNLOCK() rw_runlock(&pf_rules_lock) #define PF_RULES_WLOCK() rw_wlock(&pf_rules_lock) #define PF_RULES_WUNLOCK() rw_wunlock(&pf_rules_lock) #define PF_RULES_ASSERT() rw_assert(&pf_rules_lock, RA_LOCKED) #define PF_RULES_RASSERT() rw_assert(&pf_rules_lock, RA_RLOCKED) #define PF_RULES_WASSERT() rw_assert(&pf_rules_lock, RA_WLOCKED) #define PF_MODVER 1 #define PFLOG_MODVER 1 #define PFSYNC_MODVER 1 #define PFLOG_MINVER 1 #define PFLOG_PREFVER PFLOG_MODVER #define PFLOG_MAXVER 1 #define PFSYNC_MINVER 1 #define PFSYNC_PREFVER PFSYNC_MODVER #define PFSYNC_MAXVER 1 #ifdef INET #ifndef INET6 #define PF_INET_ONLY #endif /* ! INET6 */ #endif /* INET */ #ifdef INET6 #ifndef INET #define PF_INET6_ONLY #endif /* ! INET */ #endif /* INET6 */ #ifdef INET #ifdef INET6 #define PF_INET_INET6 #endif /* INET6 */ #endif /* INET */ #else #define PF_INET_INET6 #endif /* _KERNEL */ /* Both IPv4 and IPv6 */ #ifdef PF_INET_INET6 #define PF_AEQ(a, b, c) \ ((c == AF_INET && (a)->addr32[0] == (b)->addr32[0]) || \ ((a)->addr32[3] == (b)->addr32[3] && \ (a)->addr32[2] == (b)->addr32[2] && \ (a)->addr32[1] == (b)->addr32[1] && \ (a)->addr32[0] == (b)->addr32[0])) \ #define PF_ANEQ(a, b, c) \ ((c == AF_INET && (a)->addr32[0] != (b)->addr32[0]) || \ ((a)->addr32[3] != (b)->addr32[3] || \ (a)->addr32[2] != (b)->addr32[2] || \ (a)->addr32[1] != (b)->addr32[1] || \ (a)->addr32[0] != (b)->addr32[0])) \ #define PF_AZERO(a, c) \ ((c == AF_INET && !(a)->addr32[0]) || \ (!(a)->addr32[0] && !(a)->addr32[1] && \ !(a)->addr32[2] && !(a)->addr32[3] )) \ #define PF_MATCHA(n, a, m, b, f) \ pf_match_addr(n, a, m, b, f) #define PF_ACPY(a, b, f) \ pf_addrcpy(a, b, f) #define PF_AINC(a, f) \ pf_addr_inc(a, f) #define PF_POOLMASK(a, b, c, d, f) \ pf_poolmask(a, b, c, d, f) #else /* Just IPv6 */ #ifdef PF_INET6_ONLY #define PF_AEQ(a, b, c) \ ((a)->addr32[3] == (b)->addr32[3] && \ (a)->addr32[2] == (b)->addr32[2] && \ (a)->addr32[1] == (b)->addr32[1] && \ (a)->addr32[0] == (b)->addr32[0]) \ #define PF_ANEQ(a, b, c) \ ((a)->addr32[3] != (b)->addr32[3] || \ (a)->addr32[2] != (b)->addr32[2] || \ (a)->addr32[1] != (b)->addr32[1] || \ (a)->addr32[0] != (b)->addr32[0]) \ #define PF_AZERO(a, c) \ (!(a)->addr32[0] && \ !(a)->addr32[1] && \ !(a)->addr32[2] && \ !(a)->addr32[3] ) \ #define PF_MATCHA(n, a, m, b, f) \ pf_match_addr(n, a, m, b, f) #define PF_ACPY(a, b, f) \ pf_addrcpy(a, b, f) #define PF_AINC(a, f) \ pf_addr_inc(a, f) #define PF_POOLMASK(a, b, c, d, f) \ pf_poolmask(a, b, c, d, f) #else /* Just IPv4 */ #ifdef PF_INET_ONLY #define PF_AEQ(a, b, c) \ ((a)->addr32[0] == (b)->addr32[0]) #define PF_ANEQ(a, b, c) \ ((a)->addr32[0] != (b)->addr32[0]) #define PF_AZERO(a, c) \ (!(a)->addr32[0]) #define PF_MATCHA(n, a, m, b, f) \ pf_match_addr(n, a, m, b, f) #define PF_ACPY(a, b, f) \ (a)->v4.s_addr = (b)->v4.s_addr #define PF_AINC(a, f) \ do { \ (a)->addr32[0] = htonl(ntohl((a)->addr32[0]) + 1); \ } while (0) #define PF_POOLMASK(a, b, c, d, f) \ do { \ (a)->addr32[0] = ((b)->addr32[0] & (c)->addr32[0]) | \ (((c)->addr32[0] ^ 0xffffffff ) & (d)->addr32[0]); \ } while (0) #endif /* PF_INET_ONLY */ #endif /* PF_INET6_ONLY */ #endif /* PF_INET_INET6 */ /* * XXX callers not FIB-aware in our version of pf yet. * OpenBSD fixed it later it seems, 2010/05/07 13:33:16 claudio. */ #define PF_MISMATCHAW(aw, x, af, neg, ifp, rtid) \ ( \ (((aw)->type == PF_ADDR_NOROUTE && \ pf_routable((x), (af), NULL, (rtid))) || \ (((aw)->type == PF_ADDR_URPFFAILED && (ifp) != NULL && \ pf_routable((x), (af), (ifp), (rtid))) || \ ((aw)->type == PF_ADDR_TABLE && \ !pfr_match_addr((aw)->p.tbl, (x), (af))) || \ ((aw)->type == PF_ADDR_DYNIFTL && \ !pfi_match_addr((aw)->p.dyn, (x), (af))) || \ ((aw)->type == PF_ADDR_RANGE && \ !pf_match_addr_range(&(aw)->v.a.addr, \ &(aw)->v.a.mask, (x), (af))) || \ ((aw)->type == PF_ADDR_ADDRMASK && \ !PF_AZERO(&(aw)->v.a.mask, (af)) && \ !PF_MATCHA(0, &(aw)->v.a.addr, \ &(aw)->v.a.mask, (x), (af))))) != \ (neg) \ ) struct pf_rule_uid { uid_t uid[2]; u_int8_t op; }; struct pf_rule_gid { uid_t gid[2]; u_int8_t op; }; struct pf_rule_addr { struct pf_addr_wrap addr; u_int16_t port[2]; u_int8_t neg; u_int8_t port_op; }; struct pf_pooladdr { struct pf_addr_wrap addr; TAILQ_ENTRY(pf_pooladdr) entries; char ifname[IFNAMSIZ]; struct pfi_kif *kif; }; TAILQ_HEAD(pf_palist, pf_pooladdr); struct pf_poolhashkey { union { u_int8_t key8[16]; u_int16_t key16[8]; u_int32_t key32[4]; } pfk; /* 128-bit hash key */ #define key8 pfk.key8 #define key16 pfk.key16 #define key32 pfk.key32 }; struct pf_pool { struct pf_palist list; struct pf_pooladdr *cur; struct pf_poolhashkey key; struct pf_addr counter; int tblidx; u_int16_t proxy_port[2]; u_int8_t opts; }; /* A packed Operating System description for fingerprinting */ typedef u_int32_t pf_osfp_t; #define PF_OSFP_ANY ((pf_osfp_t)0) #define PF_OSFP_UNKNOWN ((pf_osfp_t)-1) #define PF_OSFP_NOMATCH ((pf_osfp_t)-2) struct pf_osfp_entry { SLIST_ENTRY(pf_osfp_entry) fp_entry; pf_osfp_t fp_os; int fp_enflags; #define PF_OSFP_EXPANDED 0x001 /* expanded entry */ #define PF_OSFP_GENERIC 0x002 /* generic signature */ #define PF_OSFP_NODETAIL 0x004 /* no p0f details */ #define PF_OSFP_LEN 32 char fp_class_nm[PF_OSFP_LEN]; char fp_version_nm[PF_OSFP_LEN]; char fp_subtype_nm[PF_OSFP_LEN]; }; #define PF_OSFP_ENTRY_EQ(a, b) \ ((a)->fp_os == (b)->fp_os && \ memcmp((a)->fp_class_nm, (b)->fp_class_nm, PF_OSFP_LEN) == 0 && \ memcmp((a)->fp_version_nm, (b)->fp_version_nm, PF_OSFP_LEN) == 0 && \ memcmp((a)->fp_subtype_nm, (b)->fp_subtype_nm, PF_OSFP_LEN) == 0) /* handle pf_osfp_t packing */ #define _FP_RESERVED_BIT 1 /* For the special negative #defines */ #define _FP_UNUSED_BITS 1 #define _FP_CLASS_BITS 10 /* OS Class (Windows, Linux) */ #define _FP_VERSION_BITS 10 /* OS version (95, 98, NT, 2.4.54, 3.2) */ #define _FP_SUBTYPE_BITS 10 /* patch level (NT SP4, SP3, ECN patch) */ #define PF_OSFP_UNPACK(osfp, class, version, subtype) do { \ (class) = ((osfp) >> (_FP_VERSION_BITS+_FP_SUBTYPE_BITS)) & \ ((1 << _FP_CLASS_BITS) - 1); \ (version) = ((osfp) >> _FP_SUBTYPE_BITS) & \ ((1 << _FP_VERSION_BITS) - 1);\ (subtype) = (osfp) & ((1 << _FP_SUBTYPE_BITS) - 1); \ } while(0) #define PF_OSFP_PACK(osfp, class, version, subtype) do { \ (osfp) = ((class) & ((1 << _FP_CLASS_BITS) - 1)) << (_FP_VERSION_BITS \ + _FP_SUBTYPE_BITS); \ (osfp) |= ((version) & ((1 << _FP_VERSION_BITS) - 1)) << \ _FP_SUBTYPE_BITS; \ (osfp) |= (subtype) & ((1 << _FP_SUBTYPE_BITS) - 1); \ } while(0) /* the fingerprint of an OSes TCP SYN packet */ typedef u_int64_t pf_tcpopts_t; struct pf_os_fingerprint { SLIST_HEAD(pf_osfp_enlist, pf_osfp_entry) fp_oses; /* list of matches */ pf_tcpopts_t fp_tcpopts; /* packed TCP options */ u_int16_t fp_wsize; /* TCP window size */ u_int16_t fp_psize; /* ip->ip_len */ u_int16_t fp_mss; /* TCP MSS */ u_int16_t fp_flags; #define PF_OSFP_WSIZE_MOD 0x0001 /* Window modulus */ #define PF_OSFP_WSIZE_DC 0x0002 /* Window don't care */ #define PF_OSFP_WSIZE_MSS 0x0004 /* Window multiple of MSS */ #define PF_OSFP_WSIZE_MTU 0x0008 /* Window multiple of MTU */ #define PF_OSFP_PSIZE_MOD 0x0010 /* packet size modulus */ #define PF_OSFP_PSIZE_DC 0x0020 /* packet size don't care */ #define PF_OSFP_WSCALE 0x0040 /* TCP window scaling */ #define PF_OSFP_WSCALE_MOD 0x0080 /* TCP window scale modulus */ #define PF_OSFP_WSCALE_DC 0x0100 /* TCP window scale dont-care */ #define PF_OSFP_MSS 0x0200 /* TCP MSS */ #define PF_OSFP_MSS_MOD 0x0400 /* TCP MSS modulus */ #define PF_OSFP_MSS_DC 0x0800 /* TCP MSS dont-care */ #define PF_OSFP_DF 0x1000 /* IPv4 don't fragment bit */ #define PF_OSFP_TS0 0x2000 /* Zero timestamp */ #define PF_OSFP_INET6 0x4000 /* IPv6 */ u_int8_t fp_optcnt; /* TCP option count */ u_int8_t fp_wscale; /* TCP window scaling */ u_int8_t fp_ttl; /* IPv4 TTL */ #define PF_OSFP_MAXTTL_OFFSET 40 /* TCP options packing */ #define PF_OSFP_TCPOPT_NOP 0x0 /* TCP NOP option */ #define PF_OSFP_TCPOPT_WSCALE 0x1 /* TCP window scaling option */ #define PF_OSFP_TCPOPT_MSS 0x2 /* TCP max segment size opt */ #define PF_OSFP_TCPOPT_SACK 0x3 /* TCP SACK OK option */ #define PF_OSFP_TCPOPT_TS 0x4 /* TCP timestamp option */ #define PF_OSFP_TCPOPT_BITS 3 /* bits used by each option */ #define PF_OSFP_MAX_OPTS \ (sizeof(((struct pf_os_fingerprint *)0)->fp_tcpopts) * 8) \ / PF_OSFP_TCPOPT_BITS SLIST_ENTRY(pf_os_fingerprint) fp_next; }; struct pf_osfp_ioctl { struct pf_osfp_entry fp_os; pf_tcpopts_t fp_tcpopts; /* packed TCP options */ u_int16_t fp_wsize; /* TCP window size */ u_int16_t fp_psize; /* ip->ip_len */ u_int16_t fp_mss; /* TCP MSS */ u_int16_t fp_flags; u_int8_t fp_optcnt; /* TCP option count */ u_int8_t fp_wscale; /* TCP window scaling */ u_int8_t fp_ttl; /* IPv4 TTL */ int fp_getnum; /* DIOCOSFPGET number */ }; union pf_rule_ptr { struct pf_rule *ptr; u_int32_t nr; }; #define PF_ANCHOR_NAME_SIZE 64 struct pf_rule { struct pf_rule_addr src; struct pf_rule_addr dst; #define PF_SKIP_IFP 0 #define PF_SKIP_DIR 1 #define PF_SKIP_AF 2 #define PF_SKIP_PROTO 3 #define PF_SKIP_SRC_ADDR 4 #define PF_SKIP_SRC_PORT 5 #define PF_SKIP_DST_ADDR 6 #define PF_SKIP_DST_PORT 7 #define PF_SKIP_COUNT 8 union pf_rule_ptr skip[PF_SKIP_COUNT]; #define PF_RULE_LABEL_SIZE 64 char label[PF_RULE_LABEL_SIZE]; -#define PF_QNAME_SIZE 64 char ifname[IFNAMSIZ]; char qname[PF_QNAME_SIZE]; char pqname[PF_QNAME_SIZE]; #define PF_TAG_NAME_SIZE 64 char tagname[PF_TAG_NAME_SIZE]; char match_tagname[PF_TAG_NAME_SIZE]; char overload_tblname[PF_TABLE_NAME_SIZE]; TAILQ_ENTRY(pf_rule) entries; struct pf_pool rpool; u_int64_t evaluations; u_int64_t packets[2]; u_int64_t bytes[2]; struct pfi_kif *kif; struct pf_anchor *anchor; struct pfr_ktable *overload_tbl; pf_osfp_t os_fingerprint; int rtableid; u_int32_t timeout[PFTM_MAX]; u_int32_t max_states; u_int32_t max_src_nodes; u_int32_t max_src_states; u_int32_t max_src_conn; struct { u_int32_t limit; u_int32_t seconds; } max_src_conn_rate; u_int32_t qid; u_int32_t pqid; u_int32_t rt_listid; u_int32_t nr; u_int32_t prob; uid_t cuid; pid_t cpid; counter_u64_t states_cur; counter_u64_t states_tot; counter_u64_t src_nodes; u_int16_t return_icmp; u_int16_t return_icmp6; u_int16_t max_mss; u_int16_t tag; u_int16_t match_tag; u_int16_t spare2; /* netgraph */ struct pf_rule_uid uid; struct pf_rule_gid gid; u_int32_t rule_flag; u_int8_t action; u_int8_t direction; u_int8_t log; u_int8_t logif; u_int8_t quick; u_int8_t ifnot; u_int8_t match_tag_not; u_int8_t natpass; #define PF_STATE_NORMAL 0x1 #define PF_STATE_MODULATE 0x2 #define PF_STATE_SYNPROXY 0x3 u_int8_t keep_state; sa_family_t af; u_int8_t proto; u_int8_t type; u_int8_t code; u_int8_t flags; u_int8_t flagset; u_int8_t min_ttl; u_int8_t allow_opts; u_int8_t rt; u_int8_t return_ttl; u_int8_t tos; u_int8_t set_tos; u_int8_t anchor_relative; u_int8_t anchor_wildcard; #define PF_FLUSH 0x01 #define PF_FLUSH_GLOBAL 0x02 u_int8_t flush; struct { struct pf_addr addr; u_int16_t port; } divert; uint64_t u_states_cur; uint64_t u_states_tot; uint64_t u_src_nodes; }; /* rule flags */ #define PFRULE_DROP 0x0000 #define PFRULE_RETURNRST 0x0001 #define PFRULE_FRAGMENT 0x0002 #define PFRULE_RETURNICMP 0x0004 #define PFRULE_RETURN 0x0008 #define PFRULE_NOSYNC 0x0010 #define PFRULE_SRCTRACK 0x0020 /* track source states */ #define PFRULE_RULESRCTRACK 0x0040 /* per rule */ #define PFRULE_REFS 0x0080 /* rule has references */ /* scrub flags */ #define PFRULE_NODF 0x0100 #define PFRULE_FRAGCROP 0x0200 /* non-buffering frag cache */ #define PFRULE_FRAGDROP 0x0400 /* drop funny fragments */ #define PFRULE_RANDOMID 0x0800 #define PFRULE_REASSEMBLE_TCP 0x1000 #define PFRULE_SET_TOS 0x2000 /* rule flags again */ #define PFRULE_IFBOUND 0x00010000 /* if-bound */ #define PFRULE_STATESLOPPY 0x00020000 /* sloppy state tracking */ #define PFSTATE_HIWAT 10000 /* default state table size */ #define PFSTATE_ADAPT_START 6000 /* default adaptive timeout start */ #define PFSTATE_ADAPT_END 12000 /* default adaptive timeout end */ struct pf_threshold { u_int32_t limit; #define PF_THRESHOLD_MULT 1000 #define PF_THRESHOLD_MAX 0xffffffff / PF_THRESHOLD_MULT u_int32_t seconds; u_int32_t count; u_int32_t last; }; struct pf_src_node { LIST_ENTRY(pf_src_node) entry; struct pf_addr addr; struct pf_addr raddr; union pf_rule_ptr rule; struct pfi_kif *kif; u_int64_t bytes[2]; u_int64_t packets[2]; u_int32_t states; u_int32_t conn; struct pf_threshold conn_rate; u_int32_t creation; u_int32_t expire; sa_family_t af; u_int8_t ruletype; }; #define PFSNODE_HIWAT 10000 /* default source node table size */ struct pf_state_scrub { struct timeval pfss_last; /* time received last packet */ u_int32_t pfss_tsecr; /* last echoed timestamp */ u_int32_t pfss_tsval; /* largest timestamp */ u_int32_t pfss_tsval0; /* original timestamp */ u_int16_t pfss_flags; #define PFSS_TIMESTAMP 0x0001 /* modulate timestamp */ #define PFSS_PAWS 0x0010 /* stricter PAWS checks */ #define PFSS_PAWS_IDLED 0x0020 /* was idle too long. no PAWS */ #define PFSS_DATA_TS 0x0040 /* timestamp on data packets */ #define PFSS_DATA_NOTS 0x0080 /* no timestamp on data packets */ u_int8_t pfss_ttl; /* stashed TTL */ u_int8_t pad; u_int32_t pfss_ts_mod; /* timestamp modulation */ }; struct pf_state_host { struct pf_addr addr; u_int16_t port; u_int16_t pad; }; struct pf_state_peer { struct pf_state_scrub *scrub; /* state is scrubbed */ u_int32_t seqlo; /* Max sequence number sent */ u_int32_t seqhi; /* Max the other end ACKd + win */ u_int32_t seqdiff; /* Sequence number modulator */ u_int16_t max_win; /* largest window (pre scaling) */ u_int16_t mss; /* Maximum segment size option */ u_int8_t state; /* active state level */ u_int8_t wscale; /* window scaling factor */ u_int8_t tcp_est; /* Did we reach TCPS_ESTABLISHED */ u_int8_t pad[1]; }; /* Keep synced with struct pf_state_key. */ struct pf_state_key_cmp { struct pf_addr addr[2]; u_int16_t port[2]; sa_family_t af; u_int8_t proto; u_int8_t pad[2]; }; struct pf_state_key { struct pf_addr addr[2]; u_int16_t port[2]; sa_family_t af; u_int8_t proto; u_int8_t pad[2]; LIST_ENTRY(pf_state_key) entry; TAILQ_HEAD(, pf_state) states[2]; }; /* Keep synced with struct pf_state. */ struct pf_state_cmp { u_int64_t id; u_int32_t creatorid; u_int8_t direction; u_int8_t pad[3]; }; struct pf_state { u_int64_t id; u_int32_t creatorid; u_int8_t direction; u_int8_t pad[3]; u_int refs; TAILQ_ENTRY(pf_state) sync_list; TAILQ_ENTRY(pf_state) key_list[2]; LIST_ENTRY(pf_state) entry; struct pf_state_peer src; struct pf_state_peer dst; union pf_rule_ptr rule; union pf_rule_ptr anchor; union pf_rule_ptr nat_rule; struct pf_addr rt_addr; struct pf_state_key *key[2]; /* addresses stack and wire */ struct pfi_kif *kif; struct pfi_kif *rt_kif; struct pf_src_node *src_node; struct pf_src_node *nat_src_node; u_int64_t packets[2]; u_int64_t bytes[2]; u_int32_t creation; u_int32_t expire; u_int32_t pfsync_time; u_int16_t tag; u_int8_t log; u_int8_t state_flags; #define PFSTATE_ALLOWOPTS 0x01 #define PFSTATE_SLOPPY 0x02 /* was PFSTATE_PFLOW 0x04 */ #define PFSTATE_NOSYNC 0x08 #define PFSTATE_ACK 0x10 u_int8_t timeout; u_int8_t sync_state; /* PFSYNC_S_x */ /* XXX */ u_int8_t sync_updates; u_int8_t _tail[3]; }; /* * Unified state structures for pulling states out of the kernel * used by pfsync(4) and the pf(4) ioctl. */ struct pfsync_state_scrub { u_int16_t pfss_flags; u_int8_t pfss_ttl; /* stashed TTL */ #define PFSYNC_SCRUB_FLAG_VALID 0x01 u_int8_t scrub_flag; u_int32_t pfss_ts_mod; /* timestamp modulation */ } __packed; struct pfsync_state_peer { struct pfsync_state_scrub scrub; /* state is scrubbed */ u_int32_t seqlo; /* Max sequence number sent */ u_int32_t seqhi; /* Max the other end ACKd + win */ u_int32_t seqdiff; /* Sequence number modulator */ u_int16_t max_win; /* largest window (pre scaling) */ u_int16_t mss; /* Maximum segment size option */ u_int8_t state; /* active state level */ u_int8_t wscale; /* window scaling factor */ u_int8_t pad[6]; } __packed; struct pfsync_state_key { struct pf_addr addr[2]; u_int16_t port[2]; }; struct pfsync_state { u_int64_t id; char ifname[IFNAMSIZ]; struct pfsync_state_key key[2]; struct pfsync_state_peer src; struct pfsync_state_peer dst; struct pf_addr rt_addr; u_int32_t rule; u_int32_t anchor; u_int32_t nat_rule; u_int32_t creation; u_int32_t expire; u_int32_t packets[2][2]; u_int32_t bytes[2][2]; u_int32_t creatorid; sa_family_t af; u_int8_t proto; u_int8_t direction; u_int8_t __spare[2]; u_int8_t log; u_int8_t state_flags; u_int8_t timeout; u_int8_t sync_flags; u_int8_t updates; } __packed; #ifdef _KERNEL /* pfsync */ typedef int pfsync_state_import_t(struct pfsync_state *, u_int8_t); typedef void pfsync_insert_state_t(struct pf_state *); typedef void pfsync_update_state_t(struct pf_state *); typedef void pfsync_delete_state_t(struct pf_state *); typedef void pfsync_clear_states_t(u_int32_t, const char *); typedef int pfsync_defer_t(struct pf_state *, struct mbuf *); extern pfsync_state_import_t *pfsync_state_import_ptr; extern pfsync_insert_state_t *pfsync_insert_state_ptr; extern pfsync_update_state_t *pfsync_update_state_ptr; extern pfsync_delete_state_t *pfsync_delete_state_ptr; extern pfsync_clear_states_t *pfsync_clear_states_ptr; extern pfsync_defer_t *pfsync_defer_ptr; void pfsync_state_export(struct pfsync_state *, struct pf_state *); /* pflog */ struct pf_ruleset; struct pf_pdesc; typedef int pflog_packet_t(struct pfi_kif *, struct mbuf *, sa_family_t, u_int8_t, u_int8_t, struct pf_rule *, struct pf_rule *, struct pf_ruleset *, struct pf_pdesc *, int); extern pflog_packet_t *pflog_packet_ptr; #define V_pf_end_threads VNET(pf_end_threads) #endif /* _KERNEL */ #define PFSYNC_FLAG_SRCNODE 0x04 #define PFSYNC_FLAG_NATSRCNODE 0x08 /* for copies to/from network byte order */ /* ioctl interface also uses network byte order */ #define pf_state_peer_hton(s,d) do { \ (d)->seqlo = htonl((s)->seqlo); \ (d)->seqhi = htonl((s)->seqhi); \ (d)->seqdiff = htonl((s)->seqdiff); \ (d)->max_win = htons((s)->max_win); \ (d)->mss = htons((s)->mss); \ (d)->state = (s)->state; \ (d)->wscale = (s)->wscale; \ if ((s)->scrub) { \ (d)->scrub.pfss_flags = \ htons((s)->scrub->pfss_flags & PFSS_TIMESTAMP); \ (d)->scrub.pfss_ttl = (s)->scrub->pfss_ttl; \ (d)->scrub.pfss_ts_mod = htonl((s)->scrub->pfss_ts_mod);\ (d)->scrub.scrub_flag = PFSYNC_SCRUB_FLAG_VALID; \ } \ } while (0) #define pf_state_peer_ntoh(s,d) do { \ (d)->seqlo = ntohl((s)->seqlo); \ (d)->seqhi = ntohl((s)->seqhi); \ (d)->seqdiff = ntohl((s)->seqdiff); \ (d)->max_win = ntohs((s)->max_win); \ (d)->mss = ntohs((s)->mss); \ (d)->state = (s)->state; \ (d)->wscale = (s)->wscale; \ if ((s)->scrub.scrub_flag == PFSYNC_SCRUB_FLAG_VALID && \ (d)->scrub != NULL) { \ (d)->scrub->pfss_flags = \ ntohs((s)->scrub.pfss_flags) & PFSS_TIMESTAMP; \ (d)->scrub->pfss_ttl = (s)->scrub.pfss_ttl; \ (d)->scrub->pfss_ts_mod = ntohl((s)->scrub.pfss_ts_mod);\ } \ } while (0) #define pf_state_counter_hton(s,d) do { \ d[0] = htonl((s>>32)&0xffffffff); \ d[1] = htonl(s&0xffffffff); \ } while (0) #define pf_state_counter_from_pfsync(s) \ (((u_int64_t)(s[0])<<32) | (u_int64_t)(s[1])) #define pf_state_counter_ntoh(s,d) do { \ d = ntohl(s[0]); \ d = d<<32; \ d += ntohl(s[1]); \ } while (0) TAILQ_HEAD(pf_rulequeue, pf_rule); struct pf_anchor; struct pf_ruleset { struct { struct pf_rulequeue queues[2]; struct { struct pf_rulequeue *ptr; struct pf_rule **ptr_array; u_int32_t rcount; u_int32_t ticket; int open; } active, inactive; } rules[PF_RULESET_MAX]; struct pf_anchor *anchor; u_int32_t tticket; int tables; int topen; }; RB_HEAD(pf_anchor_global, pf_anchor); RB_HEAD(pf_anchor_node, pf_anchor); struct pf_anchor { RB_ENTRY(pf_anchor) entry_global; RB_ENTRY(pf_anchor) entry_node; struct pf_anchor *parent; struct pf_anchor_node children; char name[PF_ANCHOR_NAME_SIZE]; char path[MAXPATHLEN]; struct pf_ruleset ruleset; int refcnt; /* anchor rules */ int match; /* XXX: used for pfctl black magic */ }; RB_PROTOTYPE(pf_anchor_global, pf_anchor, entry_global, pf_anchor_compare); RB_PROTOTYPE(pf_anchor_node, pf_anchor, entry_node, pf_anchor_compare); #define PF_RESERVED_ANCHOR "_pf" #define PFR_TFLAG_PERSIST 0x00000001 #define PFR_TFLAG_CONST 0x00000002 #define PFR_TFLAG_ACTIVE 0x00000004 #define PFR_TFLAG_INACTIVE 0x00000008 #define PFR_TFLAG_REFERENCED 0x00000010 #define PFR_TFLAG_REFDANCHOR 0x00000020 #define PFR_TFLAG_COUNTERS 0x00000040 /* Adjust masks below when adding flags. */ #define PFR_TFLAG_USRMASK (PFR_TFLAG_PERSIST | \ PFR_TFLAG_CONST | \ PFR_TFLAG_COUNTERS) #define PFR_TFLAG_SETMASK (PFR_TFLAG_ACTIVE | \ PFR_TFLAG_INACTIVE | \ PFR_TFLAG_REFERENCED | \ PFR_TFLAG_REFDANCHOR) #define PFR_TFLAG_ALLMASK (PFR_TFLAG_PERSIST | \ PFR_TFLAG_CONST | \ PFR_TFLAG_ACTIVE | \ PFR_TFLAG_INACTIVE | \ PFR_TFLAG_REFERENCED | \ PFR_TFLAG_REFDANCHOR | \ PFR_TFLAG_COUNTERS) struct pf_anchor_stackframe; struct pfr_table { char pfrt_anchor[MAXPATHLEN]; char pfrt_name[PF_TABLE_NAME_SIZE]; u_int32_t pfrt_flags; u_int8_t pfrt_fback; }; enum { PFR_FB_NONE, PFR_FB_MATCH, PFR_FB_ADDED, PFR_FB_DELETED, PFR_FB_CHANGED, PFR_FB_CLEARED, PFR_FB_DUPLICATE, PFR_FB_NOTMATCH, PFR_FB_CONFLICT, PFR_FB_NOCOUNT, PFR_FB_MAX }; struct pfr_addr { union { struct in_addr _pfra_ip4addr; struct in6_addr _pfra_ip6addr; } pfra_u; u_int8_t pfra_af; u_int8_t pfra_net; u_int8_t pfra_not; u_int8_t pfra_fback; }; #define pfra_ip4addr pfra_u._pfra_ip4addr #define pfra_ip6addr pfra_u._pfra_ip6addr enum { PFR_DIR_IN, PFR_DIR_OUT, PFR_DIR_MAX }; enum { PFR_OP_BLOCK, PFR_OP_PASS, PFR_OP_ADDR_MAX, PFR_OP_TABLE_MAX }; #define PFR_OP_XPASS PFR_OP_ADDR_MAX struct pfr_astats { struct pfr_addr pfras_a; u_int64_t pfras_packets[PFR_DIR_MAX][PFR_OP_ADDR_MAX]; u_int64_t pfras_bytes[PFR_DIR_MAX][PFR_OP_ADDR_MAX]; long pfras_tzero; }; enum { PFR_REFCNT_RULE, PFR_REFCNT_ANCHOR, PFR_REFCNT_MAX }; struct pfr_tstats { struct pfr_table pfrts_t; u_int64_t pfrts_packets[PFR_DIR_MAX][PFR_OP_TABLE_MAX]; u_int64_t pfrts_bytes[PFR_DIR_MAX][PFR_OP_TABLE_MAX]; u_int64_t pfrts_match; u_int64_t pfrts_nomatch; long pfrts_tzero; int pfrts_cnt; int pfrts_refcnt[PFR_REFCNT_MAX]; }; #define pfrts_name pfrts_t.pfrt_name #define pfrts_flags pfrts_t.pfrt_flags #ifndef _SOCKADDR_UNION_DEFINED #define _SOCKADDR_UNION_DEFINED union sockaddr_union { struct sockaddr sa; struct sockaddr_in sin; struct sockaddr_in6 sin6; }; #endif /* _SOCKADDR_UNION_DEFINED */ struct pfr_kcounters { u_int64_t pfrkc_packets[PFR_DIR_MAX][PFR_OP_ADDR_MAX]; u_int64_t pfrkc_bytes[PFR_DIR_MAX][PFR_OP_ADDR_MAX]; }; SLIST_HEAD(pfr_kentryworkq, pfr_kentry); struct pfr_kentry { struct radix_node pfrke_node[2]; union sockaddr_union pfrke_sa; SLIST_ENTRY(pfr_kentry) pfrke_workq; struct pfr_kcounters *pfrke_counters; long pfrke_tzero; u_int8_t pfrke_af; u_int8_t pfrke_net; u_int8_t pfrke_not; u_int8_t pfrke_mark; }; SLIST_HEAD(pfr_ktableworkq, pfr_ktable); RB_HEAD(pfr_ktablehead, pfr_ktable); struct pfr_ktable { struct pfr_tstats pfrkt_ts; RB_ENTRY(pfr_ktable) pfrkt_tree; SLIST_ENTRY(pfr_ktable) pfrkt_workq; struct radix_node_head *pfrkt_ip4; struct radix_node_head *pfrkt_ip6; struct pfr_ktable *pfrkt_shadow; struct pfr_ktable *pfrkt_root; struct pf_ruleset *pfrkt_rs; long pfrkt_larg; int pfrkt_nflags; }; #define pfrkt_t pfrkt_ts.pfrts_t #define pfrkt_name pfrkt_t.pfrt_name #define pfrkt_anchor pfrkt_t.pfrt_anchor #define pfrkt_ruleset pfrkt_t.pfrt_ruleset #define pfrkt_flags pfrkt_t.pfrt_flags #define pfrkt_cnt pfrkt_ts.pfrts_cnt #define pfrkt_refcnt pfrkt_ts.pfrts_refcnt #define pfrkt_packets pfrkt_ts.pfrts_packets #define pfrkt_bytes pfrkt_ts.pfrts_bytes #define pfrkt_match pfrkt_ts.pfrts_match #define pfrkt_nomatch pfrkt_ts.pfrts_nomatch #define pfrkt_tzero pfrkt_ts.pfrts_tzero /* keep synced with pfi_kif, used in RB_FIND */ struct pfi_kif_cmp { char pfik_name[IFNAMSIZ]; }; struct pfi_kif { char pfik_name[IFNAMSIZ]; union { RB_ENTRY(pfi_kif) _pfik_tree; LIST_ENTRY(pfi_kif) _pfik_list; } _pfik_glue; #define pfik_tree _pfik_glue._pfik_tree #define pfik_list _pfik_glue._pfik_list u_int64_t pfik_packets[2][2][2]; u_int64_t pfik_bytes[2][2][2]; u_int32_t pfik_tzero; u_int pfik_flags; struct ifnet *pfik_ifp; struct ifg_group *pfik_group; u_int pfik_rulerefs; TAILQ_HEAD(, pfi_dynaddr) pfik_dynaddrs; }; #define PFI_IFLAG_REFS 0x0001 /* has state references */ #define PFI_IFLAG_SKIP 0x0100 /* skip filtering on interface */ struct pf_pdesc { struct { int done; uid_t uid; gid_t gid; } lookup; u_int64_t tot_len; /* Make Mickey money */ union { struct tcphdr *tcp; struct udphdr *udp; struct icmp *icmp; #ifdef INET6 struct icmp6_hdr *icmp6; #endif /* INET6 */ void *any; } hdr; struct pf_rule *nat_rule; /* nat/rdr rule applied to packet */ struct pf_addr *src; /* src address */ struct pf_addr *dst; /* dst address */ u_int16_t *sport; u_int16_t *dport; struct pf_mtag *pf_mtag; u_int32_t p_len; /* total length of payload */ u_int16_t *ip_sum; u_int16_t *proto_sum; u_int16_t flags; /* Let SCRUB trigger behavior in * state code. Easier than tags */ #define PFDESC_TCP_NORM 0x0001 /* TCP shall be statefully scrubbed */ #define PFDESC_IP_REAS 0x0002 /* IP frags would've been reassembled */ sa_family_t af; u_int8_t proto; u_int8_t tos; u_int8_t dir; /* direction */ u_int8_t sidx; /* key index for source */ u_int8_t didx; /* key index for destination */ }; /* flags for RDR options */ #define PF_DPORT_RANGE 0x01 /* Dest port uses range */ #define PF_RPORT_RANGE 0x02 /* RDR'ed port uses range */ -/* Reasons code for passing/dropping a packet */ -#define PFRES_MATCH 0 /* Explicit match of a rule */ -#define PFRES_BADOFF 1 /* Bad offset for pull_hdr */ -#define PFRES_FRAG 2 /* Dropping following fragment */ -#define PFRES_SHORT 3 /* Dropping short packet */ -#define PFRES_NORM 4 /* Dropping by normalizer */ -#define PFRES_MEMORY 5 /* Dropped due to lacking mem */ -#define PFRES_TS 6 /* Bad TCP Timestamp (RFC1323) */ -#define PFRES_CONGEST 7 /* Congestion (of ipintrq) */ -#define PFRES_IPOPTIONS 8 /* IP option */ -#define PFRES_PROTCKSUM 9 /* Protocol checksum invalid */ -#define PFRES_BADSTATE 10 /* State mismatch */ -#define PFRES_STATEINS 11 /* State insertion failure */ -#define PFRES_MAXSTATES 12 /* State limit */ -#define PFRES_SRCLIMIT 13 /* Source node/conn limit */ -#define PFRES_SYNPROXY 14 /* SYN proxy */ -#define PFRES_MAX 15 /* total+1 */ - -#define PFRES_NAMES { \ - "match", \ - "bad-offset", \ - "fragment", \ - "short", \ - "normalize", \ - "memory", \ - "bad-timestamp", \ - "congestion", \ - "ip-option", \ - "proto-cksum", \ - "state-mismatch", \ - "state-insert", \ - "state-limit", \ - "src-limit", \ - "synproxy", \ - NULL \ -} - /* Counters for other things we want to keep track of */ #define LCNT_STATES 0 /* states */ #define LCNT_SRCSTATES 1 /* max-src-states */ #define LCNT_SRCNODES 2 /* max-src-nodes */ #define LCNT_SRCCONN 3 /* max-src-conn */ #define LCNT_SRCCONNRATE 4 /* max-src-conn-rate */ #define LCNT_OVERLOAD_TABLE 5 /* entry added to overload table */ #define LCNT_OVERLOAD_FLUSH 6 /* state entries flushed */ #define LCNT_MAX 7 /* total+1 */ #define LCNT_NAMES { \ "max states per rule", \ "max-src-states", \ "max-src-nodes", \ "max-src-conn", \ "max-src-conn-rate", \ "overload table insertion", \ "overload flush states", \ NULL \ } /* UDP state enumeration */ #define PFUDPS_NO_TRAFFIC 0 #define PFUDPS_SINGLE 1 #define PFUDPS_MULTIPLE 2 #define PFUDPS_NSTATES 3 /* number of state levels */ #define PFUDPS_NAMES { \ "NO_TRAFFIC", \ "SINGLE", \ "MULTIPLE", \ NULL \ } /* Other protocol state enumeration */ #define PFOTHERS_NO_TRAFFIC 0 #define PFOTHERS_SINGLE 1 #define PFOTHERS_MULTIPLE 2 #define PFOTHERS_NSTATES 3 /* number of state levels */ #define PFOTHERS_NAMES { \ "NO_TRAFFIC", \ "SINGLE", \ "MULTIPLE", \ NULL \ } #define FCNT_STATE_SEARCH 0 #define FCNT_STATE_INSERT 1 #define FCNT_STATE_REMOVALS 2 #define FCNT_MAX 3 #define SCNT_SRC_NODE_SEARCH 0 #define SCNT_SRC_NODE_INSERT 1 #define SCNT_SRC_NODE_REMOVALS 2 #define SCNT_MAX 3 #define ACTION_SET(a, x) \ do { \ if ((a) != NULL) \ *(a) = (x); \ } while (0) #define REASON_SET(a, x) \ do { \ if ((a) != NULL) \ *(a) = (x); \ if (x < PFRES_MAX) \ V_pf_status.counters[x]++; \ } while (0) struct pf_status { u_int64_t counters[PFRES_MAX]; u_int64_t lcounters[LCNT_MAX]; /* limit counters */ u_int64_t fcounters[FCNT_MAX]; u_int64_t scounters[SCNT_MAX]; u_int64_t pcounters[2][2][3]; u_int64_t bcounters[2][2]; u_int32_t running; u_int32_t states; u_int32_t src_nodes; u_int32_t since; u_int32_t debug; u_int32_t hostid; char ifname[IFNAMSIZ]; u_int8_t pf_chksum[PF_MD5_DIGEST_LENGTH]; -}; - -struct cbq_opts { - u_int minburst; - u_int maxburst; - u_int pktsize; - u_int maxpktsize; - u_int ns_per_byte; - u_int maxidle; - int minidle; - u_int offtime; - int flags; -}; - -struct priq_opts { - int flags; -}; - -struct hfsc_opts { - /* real-time service curve */ - u_int rtsc_m1; /* slope of the 1st segment in bps */ - u_int rtsc_d; /* the x-projection of m1 in msec */ - u_int rtsc_m2; /* slope of the 2nd segment in bps */ - /* link-sharing service curve */ - u_int lssc_m1; - u_int lssc_d; - u_int lssc_m2; - /* upper-limit service curve */ - u_int ulsc_m1; - u_int ulsc_d; - u_int ulsc_m2; - int flags; -}; - -struct pf_altq { - char ifname[IFNAMSIZ]; - - void *altq_disc; /* discipline-specific state */ - TAILQ_ENTRY(pf_altq) entries; - - /* scheduler spec */ - u_int8_t scheduler; /* scheduler type */ - u_int16_t tbrsize; /* tokenbucket regulator size */ - u_int32_t ifbandwidth; /* interface bandwidth */ - - /* queue spec */ - char qname[PF_QNAME_SIZE]; /* queue name */ - char parent[PF_QNAME_SIZE]; /* parent name */ - u_int32_t parent_qid; /* parent queue id */ - u_int32_t bandwidth; /* queue bandwidth */ - u_int8_t priority; /* priority */ - u_int8_t local_flags; /* dynamic interface */ -#define PFALTQ_FLAG_IF_REMOVED 0x01 - - u_int16_t qlimit; /* queue size limit */ - u_int16_t flags; /* misc flags */ - union { - struct cbq_opts cbq_opts; - struct priq_opts priq_opts; - struct hfsc_opts hfsc_opts; - } pq_u; - - u_int32_t qid; /* return value */ }; struct pf_divert { union { struct in_addr ipv4; struct in6_addr ipv6; } addr; u_int16_t port; }; #define PFFRAG_FRENT_HIWAT 5000 /* Number of fragment entries */ #define PFR_KENTRY_HIWAT 200000 /* Number of table entries */ /* * ioctl parameter structures */ struct pfioc_pooladdr { u_int32_t action; u_int32_t ticket; u_int32_t nr; u_int32_t r_num; u_int8_t r_action; u_int8_t r_last; u_int8_t af; char anchor[MAXPATHLEN]; struct pf_pooladdr addr; }; struct pfioc_rule { u_int32_t action; u_int32_t ticket; u_int32_t pool_ticket; u_int32_t nr; char anchor[MAXPATHLEN]; char anchor_call[MAXPATHLEN]; struct pf_rule rule; }; struct pfioc_natlook { struct pf_addr saddr; struct pf_addr daddr; struct pf_addr rsaddr; struct pf_addr rdaddr; u_int16_t sport; u_int16_t dport; u_int16_t rsport; u_int16_t rdport; sa_family_t af; u_int8_t proto; u_int8_t direction; }; struct pfioc_state { struct pfsync_state state; }; struct pfioc_src_node_kill { sa_family_t psnk_af; struct pf_rule_addr psnk_src; struct pf_rule_addr psnk_dst; u_int psnk_killed; }; struct pfioc_state_kill { struct pf_state_cmp psk_pfcmp; sa_family_t psk_af; int psk_proto; struct pf_rule_addr psk_src; struct pf_rule_addr psk_dst; char psk_ifname[IFNAMSIZ]; char psk_label[PF_RULE_LABEL_SIZE]; u_int psk_killed; }; struct pfioc_states { int ps_len; union { caddr_t psu_buf; struct pfsync_state *psu_states; } ps_u; #define ps_buf ps_u.psu_buf #define ps_states ps_u.psu_states }; struct pfioc_src_nodes { int psn_len; union { caddr_t psu_buf; struct pf_src_node *psu_src_nodes; } psn_u; #define psn_buf psn_u.psu_buf #define psn_src_nodes psn_u.psu_src_nodes }; struct pfioc_if { char ifname[IFNAMSIZ]; }; struct pfioc_tm { int timeout; int seconds; }; struct pfioc_limit { int index; unsigned limit; }; struct pfioc_altq { u_int32_t action; u_int32_t ticket; u_int32_t nr; struct pf_altq altq; }; struct pfioc_qstats { u_int32_t ticket; u_int32_t nr; void *buf; int nbytes; u_int8_t scheduler; }; struct pfioc_ruleset { u_int32_t nr; char path[MAXPATHLEN]; char name[PF_ANCHOR_NAME_SIZE]; }; #define PF_RULESET_ALTQ (PF_RULESET_MAX) #define PF_RULESET_TABLE (PF_RULESET_MAX+1) struct pfioc_trans { int size; /* number of elements */ int esize; /* size of each element in bytes */ struct pfioc_trans_e { int rs_num; char anchor[MAXPATHLEN]; u_int32_t ticket; } *array; }; #define PFR_FLAG_ATOMIC 0x00000001 /* unused */ #define PFR_FLAG_DUMMY 0x00000002 #define PFR_FLAG_FEEDBACK 0x00000004 #define PFR_FLAG_CLSTATS 0x00000008 #define PFR_FLAG_ADDRSTOO 0x00000010 #define PFR_FLAG_REPLACE 0x00000020 #define PFR_FLAG_ALLRSETS 0x00000040 #define PFR_FLAG_ALLMASK 0x0000007F #ifdef _KERNEL #define PFR_FLAG_USERIOCTL 0x10000000 #endif struct pfioc_table { struct pfr_table pfrio_table; void *pfrio_buffer; int pfrio_esize; int pfrio_size; int pfrio_size2; int pfrio_nadd; int pfrio_ndel; int pfrio_nchange; int pfrio_flags; u_int32_t pfrio_ticket; }; #define pfrio_exists pfrio_nadd #define pfrio_nzero pfrio_nadd #define pfrio_nmatch pfrio_nadd #define pfrio_naddr pfrio_size2 #define pfrio_setflag pfrio_size2 #define pfrio_clrflag pfrio_nadd struct pfioc_iface { char pfiio_name[IFNAMSIZ]; void *pfiio_buffer; int pfiio_esize; int pfiio_size; int pfiio_nzero; int pfiio_flags; }; /* * ioctl operations */ #define DIOCSTART _IO ('D', 1) #define DIOCSTOP _IO ('D', 2) #define DIOCADDRULE _IOWR('D', 4, struct pfioc_rule) #define DIOCGETRULES _IOWR('D', 6, struct pfioc_rule) #define DIOCGETRULE _IOWR('D', 7, struct pfioc_rule) /* XXX cut 8 - 17 */ #define DIOCCLRSTATES _IOWR('D', 18, struct pfioc_state_kill) #define DIOCGETSTATE _IOWR('D', 19, struct pfioc_state) #define DIOCSETSTATUSIF _IOWR('D', 20, struct pfioc_if) #define DIOCGETSTATUS _IOWR('D', 21, struct pf_status) #define DIOCCLRSTATUS _IO ('D', 22) #define DIOCNATLOOK _IOWR('D', 23, struct pfioc_natlook) #define DIOCSETDEBUG _IOWR('D', 24, u_int32_t) #define DIOCGETSTATES _IOWR('D', 25, struct pfioc_states) #define DIOCCHANGERULE _IOWR('D', 26, struct pfioc_rule) /* XXX cut 26 - 28 */ #define DIOCSETTIMEOUT _IOWR('D', 29, struct pfioc_tm) #define DIOCGETTIMEOUT _IOWR('D', 30, struct pfioc_tm) #define DIOCADDSTATE _IOWR('D', 37, struct pfioc_state) #define DIOCCLRRULECTRS _IO ('D', 38) #define DIOCGETLIMIT _IOWR('D', 39, struct pfioc_limit) #define DIOCSETLIMIT _IOWR('D', 40, struct pfioc_limit) #define DIOCKILLSTATES _IOWR('D', 41, struct pfioc_state_kill) #define DIOCSTARTALTQ _IO ('D', 42) #define DIOCSTOPALTQ _IO ('D', 43) #define DIOCADDALTQ _IOWR('D', 45, struct pfioc_altq) #define DIOCGETALTQS _IOWR('D', 47, struct pfioc_altq) #define DIOCGETALTQ _IOWR('D', 48, struct pfioc_altq) #define DIOCCHANGEALTQ _IOWR('D', 49, struct pfioc_altq) #define DIOCGETQSTATS _IOWR('D', 50, struct pfioc_qstats) #define DIOCBEGINADDRS _IOWR('D', 51, struct pfioc_pooladdr) #define DIOCADDADDR _IOWR('D', 52, struct pfioc_pooladdr) #define DIOCGETADDRS _IOWR('D', 53, struct pfioc_pooladdr) #define DIOCGETADDR _IOWR('D', 54, struct pfioc_pooladdr) #define DIOCCHANGEADDR _IOWR('D', 55, struct pfioc_pooladdr) /* XXX cut 55 - 57 */ #define DIOCGETRULESETS _IOWR('D', 58, struct pfioc_ruleset) #define DIOCGETRULESET _IOWR('D', 59, struct pfioc_ruleset) #define DIOCRCLRTABLES _IOWR('D', 60, struct pfioc_table) #define DIOCRADDTABLES _IOWR('D', 61, struct pfioc_table) #define DIOCRDELTABLES _IOWR('D', 62, struct pfioc_table) #define DIOCRGETTABLES _IOWR('D', 63, struct pfioc_table) #define DIOCRGETTSTATS _IOWR('D', 64, struct pfioc_table) #define DIOCRCLRTSTATS _IOWR('D', 65, struct pfioc_table) #define DIOCRCLRADDRS _IOWR('D', 66, struct pfioc_table) #define DIOCRADDADDRS _IOWR('D', 67, struct pfioc_table) #define DIOCRDELADDRS _IOWR('D', 68, struct pfioc_table) #define DIOCRSETADDRS _IOWR('D', 69, struct pfioc_table) #define DIOCRGETADDRS _IOWR('D', 70, struct pfioc_table) #define DIOCRGETASTATS _IOWR('D', 71, struct pfioc_table) #define DIOCRCLRASTATS _IOWR('D', 72, struct pfioc_table) #define DIOCRTSTADDRS _IOWR('D', 73, struct pfioc_table) #define DIOCRSETTFLAGS _IOWR('D', 74, struct pfioc_table) #define DIOCRINADEFINE _IOWR('D', 77, struct pfioc_table) #define DIOCOSFPFLUSH _IO('D', 78) #define DIOCOSFPADD _IOWR('D', 79, struct pf_osfp_ioctl) #define DIOCOSFPGET _IOWR('D', 80, struct pf_osfp_ioctl) #define DIOCXBEGIN _IOWR('D', 81, struct pfioc_trans) #define DIOCXCOMMIT _IOWR('D', 82, struct pfioc_trans) #define DIOCXROLLBACK _IOWR('D', 83, struct pfioc_trans) #define DIOCGETSRCNODES _IOWR('D', 84, struct pfioc_src_nodes) #define DIOCCLRSRCNODES _IO('D', 85) #define DIOCSETHOSTID _IOWR('D', 86, u_int32_t) #define DIOCIGETIFACES _IOWR('D', 87, struct pfioc_iface) #define DIOCSETIFFLAG _IOWR('D', 89, struct pfioc_iface) #define DIOCCLRIFFLAG _IOWR('D', 90, struct pfioc_iface) #define DIOCKILLSRCNODES _IOWR('D', 91, struct pfioc_src_node_kill) struct pf_ifspeed { char ifname[IFNAMSIZ]; u_int32_t baudrate; }; #define DIOCGIFSPEED _IOWR('D', 92, struct pf_ifspeed) #ifdef _KERNEL LIST_HEAD(pf_src_node_list, pf_src_node); struct pf_srchash { struct pf_src_node_list nodes; struct mtx lock; }; struct pf_keyhash { LIST_HEAD(, pf_state_key) keys; struct mtx lock; }; struct pf_idhash { LIST_HEAD(, pf_state) states; struct mtx lock; }; #define PF_HASHSIZ (32768) VNET_DECLARE(struct pf_keyhash *, pf_keyhash); VNET_DECLARE(struct pf_idhash *, pf_idhash); VNET_DECLARE(u_long, pf_hashmask); #define V_pf_keyhash VNET(pf_keyhash) #define V_pf_idhash VNET(pf_idhash) #define V_pf_hashmask VNET(pf_hashmask) VNET_DECLARE(struct pf_srchash *, pf_srchash); VNET_DECLARE(u_long, pf_srchashmask); #define V_pf_srchash VNET(pf_srchash) #define V_pf_srchashmask VNET(pf_srchashmask) #define PF_IDHASH(s) (be64toh((s)->id) % (V_pf_hashmask + 1)) VNET_DECLARE(void *, pf_swi_cookie); #define V_pf_swi_cookie VNET(pf_swi_cookie) VNET_DECLARE(uint64_t, pf_stateid[MAXCPU]); #define V_pf_stateid VNET(pf_stateid) TAILQ_HEAD(pf_altqqueue, pf_altq); VNET_DECLARE(struct pf_altqqueue, pf_altqs[2]); #define V_pf_altqs VNET(pf_altqs) VNET_DECLARE(struct pf_palist, pf_pabuf); #define V_pf_pabuf VNET(pf_pabuf) VNET_DECLARE(u_int32_t, ticket_altqs_active); #define V_ticket_altqs_active VNET(ticket_altqs_active) VNET_DECLARE(u_int32_t, ticket_altqs_inactive); #define V_ticket_altqs_inactive VNET(ticket_altqs_inactive) VNET_DECLARE(int, altqs_inactive_open); #define V_altqs_inactive_open VNET(altqs_inactive_open) VNET_DECLARE(u_int32_t, ticket_pabuf); #define V_ticket_pabuf VNET(ticket_pabuf) VNET_DECLARE(struct pf_altqqueue *, pf_altqs_active); #define V_pf_altqs_active VNET(pf_altqs_active) VNET_DECLARE(struct pf_altqqueue *, pf_altqs_inactive); #define V_pf_altqs_inactive VNET(pf_altqs_inactive) VNET_DECLARE(struct pf_rulequeue, pf_unlinked_rules); #define V_pf_unlinked_rules VNET(pf_unlinked_rules) void pf_initialize(void); void pf_cleanup(void); struct pf_mtag *pf_get_mtag(struct mbuf *); extern void pf_calc_skip_steps(struct pf_rulequeue *); #ifdef ALTQ extern void pf_altq_ifnet_event(struct ifnet *, int); #endif VNET_DECLARE(uma_zone_t, pf_state_z); #define V_pf_state_z VNET(pf_state_z) VNET_DECLARE(uma_zone_t, pf_state_key_z); #define V_pf_state_key_z VNET(pf_state_key_z) VNET_DECLARE(uma_zone_t, pf_state_scrub_z); #define V_pf_state_scrub_z VNET(pf_state_scrub_z) extern void pf_purge_thread(void *); extern void pf_intr(void *); extern void pf_purge_expired_src_nodes(void); extern int pf_unlink_state(struct pf_state *, u_int); #define PF_ENTER_LOCKED 0x00000001 #define PF_RETURN_LOCKED 0x00000002 extern int pf_state_insert(struct pfi_kif *, struct pf_state_key *, struct pf_state_key *, struct pf_state *); extern void pf_free_state(struct pf_state *); static __inline void pf_ref_state(struct pf_state *s) { refcount_acquire(&s->refs); } static __inline int pf_release_state(struct pf_state *s) { if (refcount_release(&s->refs)) { pf_free_state(s); return (1); } else return (0); } extern struct pf_state *pf_find_state_byid(uint64_t, uint32_t); extern struct pf_state *pf_find_state_all(struct pf_state_key_cmp *, u_int, int *); extern struct pf_src_node *pf_find_src_node(struct pf_addr *, struct pf_rule *, sa_family_t, int); extern void pf_unlink_src_node(struct pf_src_node *); extern void pf_unlink_src_node_locked(struct pf_src_node *); extern u_int pf_free_src_nodes(struct pf_src_node_list *); extern void pf_print_state(struct pf_state *); extern void pf_print_flags(u_int8_t); extern u_int16_t pf_cksum_fixup(u_int16_t, u_int16_t, u_int16_t, u_int8_t); VNET_DECLARE(struct ifnet *, sync_ifp); #define V_sync_ifp VNET(sync_ifp); VNET_DECLARE(struct pf_rule, pf_default_rule); #define V_pf_default_rule VNET(pf_default_rule) extern void pf_addrcpy(struct pf_addr *, struct pf_addr *, u_int8_t); void pf_free_rule(struct pf_rule *); #ifdef INET int pf_test(int, struct ifnet *, struct mbuf **, struct inpcb *); #endif /* INET */ #ifdef INET6 int pf_test6(int, struct ifnet *, struct mbuf **, struct inpcb *); void pf_poolmask(struct pf_addr *, struct pf_addr*, struct pf_addr *, struct pf_addr *, u_int8_t); void pf_addr_inc(struct pf_addr *, sa_family_t); #endif /* INET6 */ u_int32_t pf_new_isn(struct pf_state *); void *pf_pull_hdr(struct mbuf *, int, void *, int, u_short *, u_short *, sa_family_t); void pf_change_a(void *, u_int16_t *, u_int32_t, u_int8_t); void pf_send_deferred_syn(struct pf_state *); int pf_match_addr(u_int8_t, struct pf_addr *, struct pf_addr *, struct pf_addr *, sa_family_t); int pf_match_addr_range(struct pf_addr *, struct pf_addr *, struct pf_addr *, sa_family_t); int pf_match_port(u_int8_t, u_int16_t, u_int16_t, u_int16_t); void pf_normalize_init(void); void pf_normalize_cleanup(void); int pf_normalize_ip(struct mbuf **, int, struct pfi_kif *, u_short *, struct pf_pdesc *); int pf_normalize_ip6(struct mbuf **, int, struct pfi_kif *, u_short *, struct pf_pdesc *); int pf_normalize_tcp(int, struct pfi_kif *, struct mbuf *, int, int, void *, struct pf_pdesc *); void pf_normalize_tcp_cleanup(struct pf_state *); int pf_normalize_tcp_init(struct mbuf *, int, struct pf_pdesc *, struct tcphdr *, struct pf_state_peer *, struct pf_state_peer *); int pf_normalize_tcp_stateful(struct mbuf *, int, struct pf_pdesc *, u_short *, struct tcphdr *, struct pf_state *, struct pf_state_peer *, struct pf_state_peer *, int *); u_int32_t pf_state_expires(const struct pf_state *); void pf_purge_expired_fragments(void); int pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kif *, int); int pf_socket_lookup(int, struct pf_pdesc *, struct mbuf *); struct pf_state_key *pf_alloc_state_key(int); void pfr_initialize(void); void pfr_cleanup(void); int pfr_match_addr(struct pfr_ktable *, struct pf_addr *, sa_family_t); void pfr_update_stats(struct pfr_ktable *, struct pf_addr *, sa_family_t, u_int64_t, int, int, int); int pfr_pool_get(struct pfr_ktable *, int *, struct pf_addr *, sa_family_t); void pfr_dynaddr_update(struct pfr_ktable *, struct pfi_dynaddr *); struct pfr_ktable * pfr_attach_table(struct pf_ruleset *, char *); void pfr_detach_table(struct pfr_ktable *); int pfr_clr_tables(struct pfr_table *, int *, int); int pfr_add_tables(struct pfr_table *, int, int *, int); int pfr_del_tables(struct pfr_table *, int, int *, int); int pfr_get_tables(struct pfr_table *, struct pfr_table *, int *, int); int pfr_get_tstats(struct pfr_table *, struct pfr_tstats *, int *, int); int pfr_clr_tstats(struct pfr_table *, int, int *, int); int pfr_set_tflags(struct pfr_table *, int, int, int, int *, int *, int); int pfr_clr_addrs(struct pfr_table *, int *, int); int pfr_insert_kentry(struct pfr_ktable *, struct pfr_addr *, long); int pfr_add_addrs(struct pfr_table *, struct pfr_addr *, int, int *, int); int pfr_del_addrs(struct pfr_table *, struct pfr_addr *, int, int *, int); int pfr_set_addrs(struct pfr_table *, struct pfr_addr *, int, int *, int *, int *, int *, int, u_int32_t); int pfr_get_addrs(struct pfr_table *, struct pfr_addr *, int *, int); int pfr_get_astats(struct pfr_table *, struct pfr_astats *, int *, int); int pfr_clr_astats(struct pfr_table *, struct pfr_addr *, int, int *, int); int pfr_tst_addrs(struct pfr_table *, struct pfr_addr *, int, int *, int); int pfr_ina_begin(struct pfr_table *, u_int32_t *, int *, int); int pfr_ina_rollback(struct pfr_table *, u_int32_t, int *, int); int pfr_ina_commit(struct pfr_table *, u_int32_t, int *, int *, int); int pfr_ina_define(struct pfr_table *, struct pfr_addr *, int, int *, int *, u_int32_t, int); MALLOC_DECLARE(PFI_MTYPE); VNET_DECLARE(struct pfi_kif *, pfi_all); #define V_pfi_all VNET(pfi_all) void pfi_initialize(void); void pfi_cleanup(void); void pfi_kif_ref(struct pfi_kif *); void pfi_kif_unref(struct pfi_kif *); struct pfi_kif *pfi_kif_find(const char *); struct pfi_kif *pfi_kif_attach(struct pfi_kif *, const char *); int pfi_kif_match(struct pfi_kif *, struct pfi_kif *); void pfi_kif_purge(void); int pfi_match_addr(struct pfi_dynaddr *, struct pf_addr *, sa_family_t); int pfi_dynaddr_setup(struct pf_addr_wrap *, sa_family_t); void pfi_dynaddr_remove(struct pfi_dynaddr *); void pfi_dynaddr_copyout(struct pf_addr_wrap *); void pfi_update_status(const char *, struct pf_status *); void pfi_get_ifaces(const char *, struct pfi_kif *, int *); int pfi_set_flags(const char *, int); int pfi_clear_flags(const char *, int); int pf_match_tag(struct mbuf *, struct pf_rule *, int *, int); int pf_tag_packet(struct mbuf *, struct pf_pdesc *, int); void pf_qid2qname(u_int32_t, char *); VNET_DECLARE(struct pf_status, pf_status); #define V_pf_status VNET(pf_status) struct pf_limit { uma_zone_t zone; u_int limit; }; VNET_DECLARE(struct pf_limit, pf_limits[PF_LIMIT_MAX]); #define V_pf_limits VNET(pf_limits) #endif /* _KERNEL */ #ifdef _KERNEL VNET_DECLARE(struct pf_anchor_global, pf_anchors); #define V_pf_anchors VNET(pf_anchors) VNET_DECLARE(struct pf_anchor, pf_main_anchor); #define V_pf_main_anchor VNET(pf_main_anchor) #define pf_main_ruleset V_pf_main_anchor.ruleset #endif /* these ruleset functions can be linked into userland programs (pfctl) */ int pf_get_ruleset_number(u_int8_t); void pf_init_ruleset(struct pf_ruleset *); int pf_anchor_setup(struct pf_rule *, const struct pf_ruleset *, const char *); int pf_anchor_copyout(const struct pf_ruleset *, const struct pf_rule *, struct pfioc_rule *); void pf_anchor_remove(struct pf_rule *); void pf_remove_if_empty_ruleset(struct pf_ruleset *); struct pf_ruleset *pf_find_ruleset(const char *); struct pf_ruleset *pf_find_or_create_ruleset(const char *); void pf_rs_initialize(void); /* The fingerprint functions can be linked into userland programs (tcpdump) */ int pf_osfp_add(struct pf_osfp_ioctl *); #ifdef _KERNEL struct pf_osfp_enlist * pf_osfp_fingerprint(struct pf_pdesc *, struct mbuf *, int, const struct tcphdr *); #endif /* _KERNEL */ void pf_osfp_flush(void); int pf_osfp_get(struct pf_osfp_ioctl *); int pf_osfp_match(struct pf_osfp_enlist *, pf_osfp_t); #ifdef _KERNEL void pf_print_host(struct pf_addr *, u_int16_t, u_int8_t); void pf_step_into_anchor(struct pf_anchor_stackframe *, int *, struct pf_ruleset **, int, struct pf_rule **, struct pf_rule **, int *); int pf_step_out_of_anchor(struct pf_anchor_stackframe *, int *, struct pf_ruleset **, int, struct pf_rule **, struct pf_rule **, int *); int pf_map_addr(u_int8_t, struct pf_rule *, struct pf_addr *, struct pf_addr *, struct pf_addr *, struct pf_src_node **); struct pf_rule *pf_get_translation(struct pf_pdesc *, struct mbuf *, int, int, struct pfi_kif *, struct pf_src_node **, struct pf_state_key **, struct pf_state_key **, struct pf_addr *, struct pf_addr *, uint16_t, uint16_t, struct pf_anchor_stackframe *); struct pf_state_key *pf_state_key_setup(struct pf_pdesc *, struct pf_addr *, struct pf_addr *, u_int16_t, u_int16_t); struct pf_state_key *pf_state_key_clone(struct pf_state_key *); #endif /* _KERNEL */ #endif /* _NET_PFVAR_H_ */ Index: stable/10/sys/netpfil/ipfw/ip_fw2.c =================================================================== --- stable/10/sys/netpfil/ipfw/ip_fw2.c (revision 263085) +++ stable/10/sys/netpfil/ipfw/ip_fw2.c (revision 263086) @@ -1,2804 +1,2805 @@ /*- * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * The FreeBSD IP packet firewall, main file */ #include "opt_ipfw.h" #include "opt_ipdivert.h" #include "opt_inet.h" #ifndef INET #error "IPFIREWALL requires INET" #endif /* INET */ #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for ETHERTYPE_IP */ #include #include -#include #include #include + +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #endif #include #include /* XXX for in_cksum */ #ifdef MAC #include #endif /* * static variables followed by global ones. * All ipfw global variables are here. */ /* ipfw_vnet_ready controls when we are open for business */ static VNET_DEFINE(int, ipfw_vnet_ready) = 0; #define V_ipfw_vnet_ready VNET(ipfw_vnet_ready) static VNET_DEFINE(int, fw_deny_unknown_exthdrs); #define V_fw_deny_unknown_exthdrs VNET(fw_deny_unknown_exthdrs) static VNET_DEFINE(int, fw_permit_single_frag6) = 1; #define V_fw_permit_single_frag6 VNET(fw_permit_single_frag6) #ifdef IPFIREWALL_DEFAULT_TO_ACCEPT static int default_to_accept = 1; #else static int default_to_accept; #endif VNET_DEFINE(int, autoinc_step); VNET_DEFINE(int, fw_one_pass) = 1; VNET_DEFINE(unsigned int, fw_tables_max); /* Use 128 tables by default */ static unsigned int default_fw_tables = IPFW_TABLES_DEFAULT; /* * Each rule belongs to one of 32 different sets (0..31). * The variable set_disable contains one bit per set. * If the bit is set, all rules in the corresponding set * are disabled. Set RESVD_SET(31) is reserved for the default rule * and rules that are not deleted by the flush command, * and CANNOT be disabled. * Rules in set RESVD_SET can only be deleted individually. */ VNET_DEFINE(u_int32_t, set_disable); #define V_set_disable VNET(set_disable) VNET_DEFINE(int, fw_verbose); /* counter for ipfw_log(NULL...) */ VNET_DEFINE(u_int64_t, norule_counter); VNET_DEFINE(int, verbose_limit); /* layer3_chain contains the list of rules for layer 3 */ VNET_DEFINE(struct ip_fw_chain, layer3_chain); VNET_DEFINE(int, ipfw_nat_ready) = 0; ipfw_nat_t *ipfw_nat_ptr = NULL; struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int); ipfw_nat_cfg_t *ipfw_nat_cfg_ptr; ipfw_nat_cfg_t *ipfw_nat_del_ptr; ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; #ifdef SYSCTL_NODE uint32_t dummy_def = IPFW_DEFAULT_RULE; static int sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS); SYSBEGIN(f3) SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, one_pass, CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0, "Only do a single pass through ipfw when using dummynet(4)"); SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLFLAG_RW, &VNET_NAME(autoinc_step), 0, "Rule number auto-increment step"); SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0, "Log matches to ipfw rules"); SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW, &VNET_NAME(verbose_limit), 0, "Set upper limit of matches of ipfw rules logged"); SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD, &dummy_def, 0, "The default/max possible rule number."); SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, tables_max, CTLTYPE_UINT|CTLFLAG_RW, 0, 0, sysctl_ipfw_table_num, "IU", "Maximum number of tables"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN, &default_to_accept, 0, "Make the default rule accept all packets."); TUNABLE_INT("net.inet.ip.fw.default_to_accept", &default_to_accept); TUNABLE_INT("net.inet.ip.fw.tables_max", (int *)&default_fw_tables); SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0, "Number of static rules"); #ifdef INET6 SYSCTL_DECL(_net_inet6_ip6); SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs, CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_deny_unknown_exthdrs), 0, "Deny packets with unknown IPv6 Extension Headers"); SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, permit_single_frag6, CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_permit_single_frag6), 0, "Permit single packet IPv6 fragments"); #endif /* INET6 */ SYSEND #endif /* SYSCTL_NODE */ /* * Some macros used in the various matching options. * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T * Other macros just cast void * into the appropriate type */ #define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl)) #define TCP(p) ((struct tcphdr *)(p)) #define SCTP(p) ((struct sctphdr *)(p)) #define UDP(p) ((struct udphdr *)(p)) #define ICMP(p) ((struct icmphdr *)(p)) #define ICMP6(p) ((struct icmp6_hdr *)(p)) static __inline int icmptype_match(struct icmphdr *icmp, ipfw_insn_u32 *cmd) { int type = icmp->icmp_type; return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1<icmp_type; return (type <= ICMP_MAXTYPE && (TT & (1<arg1 or cmd->d[0]. * * We scan options and store the bits we find set. We succeed if * * (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear * * The code is sometimes optimized not to store additional variables. */ static int flags_match(ipfw_insn *cmd, u_int8_t bits) { u_char want_clear; bits = ~bits; if ( ((cmd->arg1 & 0xff) & bits) != 0) return 0; /* some bits we want set were clear */ want_clear = (cmd->arg1 >> 8) & 0xff; if ( (want_clear & bits) != want_clear) return 0; /* some bits we want clear were set */ return 1; } static int ipopts_match(struct ip *ip, ipfw_insn *cmd) { int optlen, bits = 0; u_char *cp = (u_char *)(ip + 1); int x = (ip->ip_hl << 2) - sizeof (struct ip); for (; x > 0; x -= optlen, cp += optlen) { int opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { optlen = cp[IPOPT_OLEN]; if (optlen <= 0 || optlen > x) return 0; /* invalid or truncated */ } switch (opt) { default: break; case IPOPT_LSRR: bits |= IP_FW_IPOPT_LSRR; break; case IPOPT_SSRR: bits |= IP_FW_IPOPT_SSRR; break; case IPOPT_RR: bits |= IP_FW_IPOPT_RR; break; case IPOPT_TS: bits |= IP_FW_IPOPT_TS; break; } } return (flags_match(cmd, bits)); } static int tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd) { int optlen, bits = 0; u_char *cp = (u_char *)(tcp + 1); int x = (tcp->th_off << 2) - sizeof(struct tcphdr); for (; x > 0; x -= optlen, cp += optlen) { int opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { optlen = cp[1]; if (optlen <= 0) break; } switch (opt) { default: break; case TCPOPT_MAXSEG: bits |= IP_FW_TCPOPT_MSS; break; case TCPOPT_WINDOW: bits |= IP_FW_TCPOPT_WINDOW; break; case TCPOPT_SACK_PERMITTED: case TCPOPT_SACK: bits |= IP_FW_TCPOPT_SACK; break; case TCPOPT_TIMESTAMP: bits |= IP_FW_TCPOPT_TS; break; } } return (flags_match(cmd, bits)); } static int iface_match(struct ifnet *ifp, ipfw_insn_if *cmd, struct ip_fw_chain *chain, uint32_t *tablearg) { if (ifp == NULL) /* no iface with this packet, match fails */ return 0; /* Check by name or by IP address */ if (cmd->name[0] != '\0') { /* match by name */ if (cmd->name[0] == '\1') /* use tablearg to match */ return ipfw_lookup_table_extended(chain, cmd->p.glob, ifp->if_xname, tablearg, IPFW_TABLE_INTERFACE); /* Check name */ if (cmd->p.glob) { if (fnmatch(cmd->name, ifp->if_xname, 0) == 0) return(1); } else { if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0) return(1); } } else { #ifdef __FreeBSD__ /* and OSX too ? */ struct ifaddr *ia; if_addr_rlock(ifp); TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { if (ia->ifa_addr->sa_family != AF_INET) continue; if (cmd->p.ip.s_addr == ((struct sockaddr_in *) (ia->ifa_addr))->sin_addr.s_addr) { if_addr_runlock(ifp); return(1); /* match */ } } if_addr_runlock(ifp); #endif /* __FreeBSD__ */ } return(0); /* no match, fail ... */ } /* * The verify_path function checks if a route to the src exists and * if it is reachable via ifp (when provided). * * The 'verrevpath' option checks that the interface that an IP packet * arrives on is the same interface that traffic destined for the * packet's source address would be routed out of. * The 'versrcreach' option just checks that the source address is * reachable via any route (except default) in the routing table. * These two are a measure to block forged packets. This is also * commonly known as "anti-spoofing" or Unicast Reverse Path * Forwarding (Unicast RFP) in Cisco-ese. The name of the knobs * is purposely reminiscent of the Cisco IOS command, * * ip verify unicast reverse-path * ip verify unicast source reachable-via any * * which implements the same functionality. But note that the syntax * is misleading, and the check may be performed on all IP packets * whether unicast, multicast, or broadcast. */ static int verify_path(struct in_addr src, struct ifnet *ifp, u_int fib) { #ifndef __FreeBSD__ return 0; #else struct route ro; struct sockaddr_in *dst; bzero(&ro, sizeof(ro)); dst = (struct sockaddr_in *)&(ro.ro_dst); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = src; in_rtalloc_ign(&ro, 0, fib); if (ro.ro_rt == NULL) return 0; /* * If ifp is provided, check for equality with rtentry. * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp, * in order to pass packets injected back by if_simloop(): * if useloopback == 1 routing entry (via lo0) for our own address * may exist, so we need to handle routing assymetry. */ if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) { RTFREE(ro.ro_rt); return 0; } /* if no ifp provided, check if rtentry is not default route */ if (ifp == NULL && satosin(rt_key(ro.ro_rt))->sin_addr.s_addr == INADDR_ANY) { RTFREE(ro.ro_rt); return 0; } /* or if this is a blackhole/reject route */ if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { RTFREE(ro.ro_rt); return 0; } /* found valid route */ RTFREE(ro.ro_rt); return 1; #endif /* __FreeBSD__ */ } #ifdef INET6 /* * ipv6 specific rules here... */ static __inline int icmp6type_match (int type, ipfw_insn_u32 *cmd) { return (type <= ICMP6_MAXTYPE && (cmd->d[type/32] & (1<<(type%32)) ) ); } static int flow6id_match( int curr_flow, ipfw_insn_u32 *cmd ) { int i; for (i=0; i <= cmd->o.arg1; ++i ) if (curr_flow == cmd->d[i] ) return 1; return 0; } /* support for IP6_*_ME opcodes */ static int search_ip6_addr_net (struct in6_addr * ip6_addr) { struct ifnet *mdc; struct ifaddr *mdc2; struct in6_ifaddr *fdm; struct in6_addr copia; TAILQ_FOREACH(mdc, &V_ifnet, if_link) { if_addr_rlock(mdc); TAILQ_FOREACH(mdc2, &mdc->if_addrhead, ifa_link) { if (mdc2->ifa_addr->sa_family == AF_INET6) { fdm = (struct in6_ifaddr *)mdc2; copia = fdm->ia_addr.sin6_addr; /* need for leaving scope_id in the sock_addr */ in6_clearscope(&copia); if (IN6_ARE_ADDR_EQUAL(ip6_addr, &copia)) { if_addr_runlock(mdc); return 1; } } } if_addr_runlock(mdc); } return 0; } static int verify_path6(struct in6_addr *src, struct ifnet *ifp, u_int fib) { struct route_in6 ro; struct sockaddr_in6 *dst; bzero(&ro, sizeof(ro)); dst = (struct sockaddr_in6 * )&(ro.ro_dst); dst->sin6_family = AF_INET6; dst->sin6_len = sizeof(*dst); dst->sin6_addr = *src; in6_rtalloc_ign(&ro, 0, fib); if (ro.ro_rt == NULL) return 0; /* * if ifp is provided, check for equality with rtentry * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp, * to support the case of sending packets to an address of our own. * (where the former interface is the first argument of if_simloop() * (=ifp), the latter is lo0) */ if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) { RTFREE(ro.ro_rt); return 0; } /* if no ifp provided, check if rtentry is not default route */ if (ifp == NULL && IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(ro.ro_rt))->sin6_addr)) { RTFREE(ro.ro_rt); return 0; } /* or if this is a blackhole/reject route */ if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { RTFREE(ro.ro_rt); return 0; } /* found valid route */ RTFREE(ro.ro_rt); return 1; } static int is_icmp6_query(int icmp6_type) { if ((icmp6_type <= ICMP6_MAXTYPE) && (icmp6_type == ICMP6_ECHO_REQUEST || icmp6_type == ICMP6_MEMBERSHIP_QUERY || icmp6_type == ICMP6_WRUREQUEST || icmp6_type == ICMP6_FQDN_QUERY || icmp6_type == ICMP6_NI_QUERY)) return (1); return (0); } static void send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6) { struct mbuf *m; m = args->m; if (code == ICMP6_UNREACH_RST && args->f_id.proto == IPPROTO_TCP) { struct tcphdr *tcp; tcp = (struct tcphdr *)((char *)ip6 + hlen); if ((tcp->th_flags & TH_RST) == 0) { struct mbuf *m0; m0 = ipfw_send_pkt(args->m, &(args->f_id), ntohl(tcp->th_seq), ntohl(tcp->th_ack), tcp->th_flags | TH_RST); if (m0 != NULL) ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL); } FREE_PKT(m); } else if (code != ICMP6_UNREACH_RST) { /* Send an ICMPv6 unreach. */ #if 0 /* * Unlike above, the mbufs need to line up with the ip6 hdr, * as the contents are read. We need to m_adj() the * needed amount. * The mbuf will however be thrown away so we can adjust it. * Remember we did an m_pullup on it already so we * can make some assumptions about contiguousness. */ if (args->L3offset) m_adj(m, args->L3offset); #endif icmp6_error(m, ICMP6_DST_UNREACH, code, 0); } else FREE_PKT(m); args->m = NULL; } #endif /* INET6 */ /* * sends a reject message, consuming the mbuf passed as an argument. */ static void send_reject(struct ip_fw_args *args, int code, int iplen, struct ip *ip) { #if 0 /* XXX When ip is not guaranteed to be at mtod() we will * need to account for this */ * The mbuf will however be thrown away so we can adjust it. * Remember we did an m_pullup on it already so we * can make some assumptions about contiguousness. */ if (args->L3offset) m_adj(m, args->L3offset); #endif if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */ icmp_error(args->m, ICMP_UNREACH, code, 0L, 0); } else if (args->f_id.proto == IPPROTO_TCP) { struct tcphdr *const tcp = L3HDR(struct tcphdr, mtod(args->m, struct ip *)); if ( (tcp->th_flags & TH_RST) == 0) { struct mbuf *m; m = ipfw_send_pkt(args->m, &(args->f_id), ntohl(tcp->th_seq), ntohl(tcp->th_ack), tcp->th_flags | TH_RST); if (m != NULL) ip_output(m, NULL, NULL, 0, NULL, NULL); } FREE_PKT(args->m); } else FREE_PKT(args->m); args->m = NULL; } /* * Support for uid/gid/jail lookup. These tests are expensive * (because we may need to look into the list of active sockets) * so we cache the results. ugid_lookupp is 0 if we have not * yet done a lookup, 1 if we succeeded, and -1 if we tried * and failed. The function always returns the match value. * We could actually spare the variable and use *uc, setting * it to '(void *)check_uidgid if we have no info, NULL if * we tried and failed, or any other value if successful. */ static int check_uidgid(ipfw_insn_u32 *insn, struct ip_fw_args *args, int *ugid_lookupp, struct ucred **uc) { #ifndef __FreeBSD__ /* XXX */ return cred_check(insn, proto, oif, dst_ip, dst_port, src_ip, src_port, (struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb); #else /* FreeBSD */ struct in_addr src_ip, dst_ip; struct inpcbinfo *pi; struct ipfw_flow_id *id; struct inpcb *pcb, *inp; struct ifnet *oif; int lookupflags; int match; id = &args->f_id; inp = args->inp; oif = args->oif; /* * Check to see if the UDP or TCP stack supplied us with * the PCB. If so, rather then holding a lock and looking * up the PCB, we can use the one that was supplied. */ if (inp && *ugid_lookupp == 0) { INP_LOCK_ASSERT(inp); if (inp->inp_socket != NULL) { *uc = crhold(inp->inp_cred); *ugid_lookupp = 1; } else *ugid_lookupp = -1; } /* * If we have already been here and the packet has no * PCB entry associated with it, then we can safely * assume that this is a no match. */ if (*ugid_lookupp == -1) return (0); if (id->proto == IPPROTO_TCP) { lookupflags = 0; pi = &V_tcbinfo; } else if (id->proto == IPPROTO_UDP) { lookupflags = INPLOOKUP_WILDCARD; pi = &V_udbinfo; } else return 0; lookupflags |= INPLOOKUP_RLOCKPCB; match = 0; if (*ugid_lookupp == 0) { if (id->addr_type == 6) { #ifdef INET6 if (oif == NULL) pcb = in6_pcblookup_mbuf(pi, &id->src_ip6, htons(id->src_port), &id->dst_ip6, htons(id->dst_port), lookupflags, oif, args->m); else pcb = in6_pcblookup_mbuf(pi, &id->dst_ip6, htons(id->dst_port), &id->src_ip6, htons(id->src_port), lookupflags, oif, args->m); #else *ugid_lookupp = -1; return (0); #endif } else { src_ip.s_addr = htonl(id->src_ip); dst_ip.s_addr = htonl(id->dst_ip); if (oif == NULL) pcb = in_pcblookup_mbuf(pi, src_ip, htons(id->src_port), dst_ip, htons(id->dst_port), lookupflags, oif, args->m); else pcb = in_pcblookup_mbuf(pi, dst_ip, htons(id->dst_port), src_ip, htons(id->src_port), lookupflags, oif, args->m); } if (pcb != NULL) { INP_RLOCK_ASSERT(pcb); *uc = crhold(pcb->inp_cred); *ugid_lookupp = 1; INP_RUNLOCK(pcb); } if (*ugid_lookupp == 0) { /* * We tried and failed, set the variable to -1 * so we will not try again on this packet. */ *ugid_lookupp = -1; return (0); } } if (insn->o.opcode == O_UID) match = ((*uc)->cr_uid == (uid_t)insn->d[0]); else if (insn->o.opcode == O_GID) match = groupmember((gid_t)insn->d[0], *uc); else if (insn->o.opcode == O_JAIL) match = ((*uc)->cr_prison->pr_id == (int)insn->d[0]); return (match); #endif /* __FreeBSD__ */ } /* * Helper function to set args with info on the rule after the matching * one. slot is precise, whereas we guess rule_id as they are * assigned sequentially. */ static inline void set_match(struct ip_fw_args *args, int slot, struct ip_fw_chain *chain) { args->rule.chain_id = chain->id; args->rule.slot = slot + 1; /* we use 0 as a marker */ args->rule.rule_id = 1 + chain->map[slot]->id; args->rule.rulenum = chain->map[slot]->rulenum; } /* * Helper function to enable cached rule lookups using * x_next and next_rule fields in ipfw rule. */ static int jump_fast(struct ip_fw_chain *chain, struct ip_fw *f, int num, int tablearg, int jump_backwards) { int f_pos; /* If possible use cached f_pos (in f->next_rule), * whose version is written in f->next_rule * (horrible hacks to avoid changing the ABI). */ if (num != IP_FW_TABLEARG && (uintptr_t)f->x_next == chain->id) f_pos = (uintptr_t)f->next_rule; else { int i = IP_FW_ARG_TABLEARG(num); /* make sure we do not jump backward */ if (jump_backwards == 0 && i <= f->rulenum) i = f->rulenum + 1; f_pos = ipfw_find_rule(chain, i, 0); /* update the cache */ if (num != IP_FW_TABLEARG) { f->next_rule = (void *)(uintptr_t)f_pos; f->x_next = (void *)(uintptr_t)chain->id; } } return (f_pos); } /* * The main check routine for the firewall. * * All arguments are in args so we can modify them and return them * back to the caller. * * Parameters: * * args->m (in/out) The packet; we set to NULL when/if we nuke it. * Starts with the IP header. * args->eh (in) Mac header if present, NULL for layer3 packet. * args->L3offset Number of bytes bypassed if we came from L2. * e.g. often sizeof(eh) ** NOTYET ** * args->oif Outgoing interface, NULL if packet is incoming. * The incoming interface is in the mbuf. (in) * args->divert_rule (in/out) * Skip up to the first rule past this rule number; * upon return, non-zero port number for divert or tee. * * args->rule Pointer to the last matching rule (in/out) * args->next_hop Socket we are forwarding to (out). * args->next_hop6 IPv6 next hop we are forwarding to (out). * args->f_id Addresses grabbed from the packet (out) * args->rule.info a cookie depending on rule action * * Return value: * * IP_FW_PASS the packet must be accepted * IP_FW_DENY the packet must be dropped * IP_FW_DIVERT divert packet, port in m_tag * IP_FW_TEE tee packet, port in m_tag * IP_FW_DUMMYNET to dummynet, pipe in args->cookie * IP_FW_NETGRAPH into netgraph, cookie args->cookie * args->rule contains the matching rule, * args->rule.info has additional information. * */ int ipfw_chk(struct ip_fw_args *args) { /* * Local variables holding state while processing a packet: * * IMPORTANT NOTE: to speed up the processing of rules, there * are some assumption on the values of the variables, which * are documented here. Should you change them, please check * the implementation of the various instructions to make sure * that they still work. * * args->eh The MAC header. It is non-null for a layer2 * packet, it is NULL for a layer-3 packet. * **notyet** * args->L3offset Offset in the packet to the L3 (IP or equiv.) header. * * m | args->m Pointer to the mbuf, as received from the caller. * It may change if ipfw_chk() does an m_pullup, or if it * consumes the packet because it calls send_reject(). * XXX This has to change, so that ipfw_chk() never modifies * or consumes the buffer. * ip is the beginning of the ip(4 or 6) header. * Calculated by adding the L3offset to the start of data. * (Until we start using L3offset, the packet is * supposed to start with the ip header). */ struct mbuf *m = args->m; struct ip *ip = mtod(m, struct ip *); /* * For rules which contain uid/gid or jail constraints, cache * a copy of the users credentials after the pcb lookup has been * executed. This will speed up the processing of rules with * these types of constraints, as well as decrease contention * on pcb related locks. */ #ifndef __FreeBSD__ struct bsd_ucred ucred_cache; #else struct ucred *ucred_cache = NULL; #endif int ucred_lookup = 0; /* * oif | args->oif If NULL, ipfw_chk has been called on the * inbound path (ether_input, ip_input). * If non-NULL, ipfw_chk has been called on the outbound path * (ether_output, ip_output). */ struct ifnet *oif = args->oif; int f_pos = 0; /* index of current rule in the array */ int retval = 0; /* * hlen The length of the IP header. */ u_int hlen = 0; /* hlen >0 means we have an IP pkt */ /* * offset The offset of a fragment. offset != 0 means that * we have a fragment at this offset of an IPv4 packet. * offset == 0 means that (if this is an IPv4 packet) * this is the first or only fragment. * For IPv6 offset|ip6f_mf == 0 means there is no Fragment Header * or there is a single packet fragement (fragement header added * without needed). We will treat a single packet fragment as if * there was no fragment header (or log/block depending on the * V_fw_permit_single_frag6 sysctl setting). */ u_short offset = 0; u_short ip6f_mf = 0; /* * Local copies of addresses. They are only valid if we have * an IP packet. * * proto The protocol. Set to 0 for non-ip packets, * or to the protocol read from the packet otherwise. * proto != 0 means that we have an IPv4 packet. * * src_port, dst_port port numbers, in HOST format. Only * valid for TCP and UDP packets. * * src_ip, dst_ip ip addresses, in NETWORK format. * Only valid for IPv4 packets. */ uint8_t proto; uint16_t src_port = 0, dst_port = 0; /* NOTE: host format */ struct in_addr src_ip, dst_ip; /* NOTE: network format */ uint16_t iplen=0; int pktlen; uint16_t etype = 0; /* Host order stored ether type */ /* * dyn_dir = MATCH_UNKNOWN when rules unchecked, * MATCH_NONE when checked and not matched (q = NULL), * MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL) */ int dyn_dir = MATCH_UNKNOWN; ipfw_dyn_rule *q = NULL; struct ip_fw_chain *chain = &V_layer3_chain; /* * We store in ulp a pointer to the upper layer protocol header. * In the ipv4 case this is easy to determine from the header, * but for ipv6 we might have some additional headers in the middle. * ulp is NULL if not found. */ void *ulp = NULL; /* upper layer protocol pointer. */ /* XXX ipv6 variables */ int is_ipv6 = 0; uint8_t icmp6_type = 0; uint16_t ext_hd = 0; /* bits vector for extension header filtering */ /* end of ipv6 variables */ int is_ipv4 = 0; int done = 0; /* flag to exit the outer loop */ if (m->m_flags & M_SKIP_FIREWALL || (! V_ipfw_vnet_ready)) return (IP_FW_PASS); /* accept */ dst_ip.s_addr = 0; /* make sure it is initialized */ src_ip.s_addr = 0; /* make sure it is initialized */ pktlen = m->m_pkthdr.len; args->f_id.fib = M_GETFIB(m); /* note mbuf not altered) */ proto = args->f_id.proto = 0; /* mark f_id invalid */ /* XXX 0 is a valid proto: IP/IPv6 Hop-by-Hop Option */ /* * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous, * then it sets p to point at the offset "len" in the mbuf. WARNING: the * pointer might become stale after other pullups (but we never use it * this way). */ #define PULLUP_TO(_len, p, T) PULLUP_LEN(_len, p, sizeof(T)) #define PULLUP_LEN(_len, p, T) \ do { \ int x = (_len) + T; \ if ((m)->m_len < x) { \ args->m = m = m_pullup(m, x); \ if (m == NULL) \ goto pullup_failed; \ } \ p = (mtod(m, char *) + (_len)); \ } while (0) /* * if we have an ether header, */ if (args->eh) etype = ntohs(args->eh->ether_type); /* Identify IP packets and fill up variables. */ if (pktlen >= sizeof(struct ip6_hdr) && (args->eh == NULL || etype == ETHERTYPE_IPV6) && ip->ip_v == 6) { struct ip6_hdr *ip6 = (struct ip6_hdr *)ip; is_ipv6 = 1; args->f_id.addr_type = 6; hlen = sizeof(struct ip6_hdr); proto = ip6->ip6_nxt; /* Search extension headers to find upper layer protocols */ while (ulp == NULL && offset == 0) { switch (proto) { case IPPROTO_ICMPV6: PULLUP_TO(hlen, ulp, struct icmp6_hdr); icmp6_type = ICMP6(ulp)->icmp6_type; break; case IPPROTO_TCP: PULLUP_TO(hlen, ulp, struct tcphdr); dst_port = TCP(ulp)->th_dport; src_port = TCP(ulp)->th_sport; /* save flags for dynamic rules */ args->f_id._flags = TCP(ulp)->th_flags; break; case IPPROTO_SCTP: PULLUP_TO(hlen, ulp, struct sctphdr); src_port = SCTP(ulp)->src_port; dst_port = SCTP(ulp)->dest_port; break; case IPPROTO_UDP: PULLUP_TO(hlen, ulp, struct udphdr); dst_port = UDP(ulp)->uh_dport; src_port = UDP(ulp)->uh_sport; break; case IPPROTO_HOPOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); ext_hd |= EXT_HOPOPTS; hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_ROUTING: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_rthdr); switch (((struct ip6_rthdr *)ulp)->ip6r_type) { case 0: ext_hd |= EXT_RTHDR0; break; case 2: ext_hd |= EXT_RTHDR2; break; default: if (V_fw_verbose) printf("IPFW2: IPV6 - Unknown " "Routing Header type(%d)\n", ((struct ip6_rthdr *) ulp)->ip6r_type); if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); break; } ext_hd |= EXT_ROUTING; hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3; proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt; ulp = NULL; break; case IPPROTO_FRAGMENT: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_frag); ext_hd |= EXT_FRAGMENT; hlen += sizeof (struct ip6_frag); proto = ((struct ip6_frag *)ulp)->ip6f_nxt; offset = ((struct ip6_frag *)ulp)->ip6f_offlg & IP6F_OFF_MASK; ip6f_mf = ((struct ip6_frag *)ulp)->ip6f_offlg & IP6F_MORE_FRAG; if (V_fw_permit_single_frag6 == 0 && offset == 0 && ip6f_mf == 0) { if (V_fw_verbose) printf("IPFW2: IPV6 - Invalid " "Fragment Header\n"); if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); break; } args->f_id.extra = ntohl(((struct ip6_frag *)ulp)->ip6f_ident); ulp = NULL; break; case IPPROTO_DSTOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); ext_hd |= EXT_DSTOPTS; hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_AH: /* RFC 2402 */ PULLUP_TO(hlen, ulp, struct ip6_ext); ext_hd |= EXT_AH; hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2; proto = ((struct ip6_ext *)ulp)->ip6e_nxt; ulp = NULL; break; case IPPROTO_ESP: /* RFC 2406 */ PULLUP_TO(hlen, ulp, uint32_t); /* SPI, Seq# */ /* Anything past Seq# is variable length and * data past this ext. header is encrypted. */ ext_hd |= EXT_ESP; break; case IPPROTO_NONE: /* RFC 2460 */ /* * Packet ends here, and IPv6 header has * already been pulled up. If ip6e_len!=0 * then octets must be ignored. */ ulp = ip; /* non-NULL to get out of loop. */ break; case IPPROTO_OSPFIGP: /* XXX OSPF header check? */ PULLUP_TO(hlen, ulp, struct ip6_ext); break; case IPPROTO_PIM: /* XXX PIM header check? */ PULLUP_TO(hlen, ulp, struct pim); break; case IPPROTO_CARP: PULLUP_TO(hlen, ulp, struct carp_header); if (((struct carp_header *)ulp)->carp_version != CARP_VERSION) return (IP_FW_DENY); if (((struct carp_header *)ulp)->carp_type != CARP_ADVERTISEMENT) return (IP_FW_DENY); break; case IPPROTO_IPV6: /* RFC 2893 */ PULLUP_TO(hlen, ulp, struct ip6_hdr); break; case IPPROTO_IPV4: /* RFC 2893 */ PULLUP_TO(hlen, ulp, struct ip); break; default: if (V_fw_verbose) printf("IPFW2: IPV6 - Unknown " "Extension Header(%d), ext_hd=%x\n", proto, ext_hd); if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); PULLUP_TO(hlen, ulp, struct ip6_ext); break; } /*switch */ } ip = mtod(m, struct ip *); ip6 = (struct ip6_hdr *)ip; args->f_id.src_ip6 = ip6->ip6_src; args->f_id.dst_ip6 = ip6->ip6_dst; args->f_id.src_ip = 0; args->f_id.dst_ip = 0; args->f_id.flow_id6 = ntohl(ip6->ip6_flow); } else if (pktlen >= sizeof(struct ip) && (args->eh == NULL || etype == ETHERTYPE_IP) && ip->ip_v == 4) { is_ipv4 = 1; hlen = ip->ip_hl << 2; args->f_id.addr_type = 4; /* * Collect parameters into local variables for faster matching. */ proto = ip->ip_p; src_ip = ip->ip_src; dst_ip = ip->ip_dst; offset = ntohs(ip->ip_off) & IP_OFFMASK; iplen = ntohs(ip->ip_len); pktlen = iplen < pktlen ? iplen : pktlen; if (offset == 0) { switch (proto) { case IPPROTO_TCP: PULLUP_TO(hlen, ulp, struct tcphdr); dst_port = TCP(ulp)->th_dport; src_port = TCP(ulp)->th_sport; /* save flags for dynamic rules */ args->f_id._flags = TCP(ulp)->th_flags; break; case IPPROTO_SCTP: PULLUP_TO(hlen, ulp, struct sctphdr); src_port = SCTP(ulp)->src_port; dst_port = SCTP(ulp)->dest_port; break; case IPPROTO_UDP: PULLUP_TO(hlen, ulp, struct udphdr); dst_port = UDP(ulp)->uh_dport; src_port = UDP(ulp)->uh_sport; break; case IPPROTO_ICMP: PULLUP_TO(hlen, ulp, struct icmphdr); //args->f_id.flags = ICMP(ulp)->icmp_type; break; default: break; } } ip = mtod(m, struct ip *); args->f_id.src_ip = ntohl(src_ip.s_addr); args->f_id.dst_ip = ntohl(dst_ip.s_addr); } #undef PULLUP_TO if (proto) { /* we may have port numbers, store them */ args->f_id.proto = proto; args->f_id.src_port = src_port = ntohs(src_port); args->f_id.dst_port = dst_port = ntohs(dst_port); } IPFW_PF_RLOCK(chain); if (! V_ipfw_vnet_ready) { /* shutting down, leave NOW. */ IPFW_PF_RUNLOCK(chain); return (IP_FW_PASS); /* accept */ } if (args->rule.slot) { /* * Packet has already been tagged as a result of a previous * match on rule args->rule aka args->rule_id (PIPE, QUEUE, * REASS, NETGRAPH, DIVERT/TEE...) * Validate the slot and continue from the next one * if still present, otherwise do a lookup. */ f_pos = (args->rule.chain_id == chain->id) ? args->rule.slot : ipfw_find_rule(chain, args->rule.rulenum, args->rule.rule_id); } else { f_pos = 0; } /* * Now scan the rules, and parse microinstructions for each rule. * We have two nested loops and an inner switch. Sometimes we * need to break out of one or both loops, or re-enter one of * the loops with updated variables. Loop variables are: * * f_pos (outer loop) points to the current rule. * On output it points to the matching rule. * done (outer loop) is used as a flag to break the loop. * l (inner loop) residual length of current rule. * cmd points to the current microinstruction. * * We break the inner loop by setting l=0 and possibly * cmdlen=0 if we don't want to advance cmd. * We break the outer loop by setting done=1 * We can restart the inner loop by setting l>0 and f_pos, f, cmd * as needed. */ for (; f_pos < chain->n_rules; f_pos++) { ipfw_insn *cmd; uint32_t tablearg = 0; int l, cmdlen, skip_or; /* skip rest of OR block */ struct ip_fw *f; f = chain->map[f_pos]; if (V_set_disable & (1 << f->set) ) continue; skip_or = 0; for (l = f->cmd_len, cmd = f->cmd ; l > 0 ; l -= cmdlen, cmd += cmdlen) { int match; /* * check_body is a jump target used when we find a * CHECK_STATE, and need to jump to the body of * the target rule. */ /* check_body: */ cmdlen = F_LEN(cmd); /* * An OR block (insn_1 || .. || insn_n) has the * F_OR bit set in all but the last instruction. * The first match will set "skip_or", and cause * the following instructions to be skipped until * past the one with the F_OR bit clear. */ if (skip_or) { /* skip this instruction */ if ((cmd->len & F_OR) == 0) skip_or = 0; /* next one is good */ continue; } match = 0; /* set to 1 if we succeed */ switch (cmd->opcode) { /* * The first set of opcodes compares the packet's * fields with some pattern, setting 'match' if a * match is found. At the end of the loop there is * logic to deal with F_NOT and F_OR flags associated * with the opcode. */ case O_NOP: match = 1; break; case O_FORWARD_MAC: printf("ipfw: opcode %d unimplemented\n", cmd->opcode); break; case O_GID: case O_UID: case O_JAIL: /* * We only check offset == 0 && proto != 0, * as this ensures that we have a * packet with the ports info. */ if (offset != 0) break; if (proto == IPPROTO_TCP || proto == IPPROTO_UDP) match = check_uidgid( (ipfw_insn_u32 *)cmd, args, &ucred_lookup, #ifdef __FreeBSD__ &ucred_cache); #else (void *)&ucred_cache); #endif break; case O_RECV: match = iface_match(m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd, chain, &tablearg); break; case O_XMIT: match = iface_match(oif, (ipfw_insn_if *)cmd, chain, &tablearg); break; case O_VIA: match = iface_match(oif ? oif : m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd, chain, &tablearg); break; case O_MACADDR2: if (args->eh != NULL) { /* have MAC header */ u_int32_t *want = (u_int32_t *) ((ipfw_insn_mac *)cmd)->addr; u_int32_t *mask = (u_int32_t *) ((ipfw_insn_mac *)cmd)->mask; u_int32_t *hdr = (u_int32_t *)args->eh; match = ( want[0] == (hdr[0] & mask[0]) && want[1] == (hdr[1] & mask[1]) && want[2] == (hdr[2] & mask[2]) ); } break; case O_MAC_TYPE: if (args->eh != NULL) { u_int16_t *p = ((ipfw_insn_u16 *)cmd)->ports; int i; for (i = cmdlen - 1; !match && i>0; i--, p += 2) match = (etype >= p[0] && etype <= p[1]); } break; case O_FRAG: match = (offset != 0); break; case O_IN: /* "out" is "not in" */ match = (oif == NULL); break; case O_LAYER2: match = (args->eh != NULL); break; case O_DIVERTED: { /* For diverted packets, args->rule.info * contains the divert port (in host format) * reason and direction. */ uint32_t i = args->rule.info; match = (i&IPFW_IS_MASK) == IPFW_IS_DIVERT && cmd->arg1 & ((i & IPFW_INFO_IN) ? 1 : 2); } break; case O_PROTO: /* * We do not allow an arg of 0 so the * check of "proto" only suffices. */ match = (proto == cmd->arg1); break; case O_IP_SRC: match = is_ipv4 && (((ipfw_insn_ip *)cmd)->addr.s_addr == src_ip.s_addr); break; case O_IP_SRC_LOOKUP: case O_IP_DST_LOOKUP: if (is_ipv4) { uint32_t key = (cmd->opcode == O_IP_DST_LOOKUP) ? dst_ip.s_addr : src_ip.s_addr; uint32_t v = 0; if (cmdlen > F_INSN_SIZE(ipfw_insn_u32)) { /* generic lookup. The key must be * in 32bit big-endian format. */ v = ((ipfw_insn_u32 *)cmd)->d[1]; if (v == 0) key = dst_ip.s_addr; else if (v == 1) key = src_ip.s_addr; else if (v == 6) /* dscp */ key = (ip->ip_tos >> 2) & 0x3f; else if (offset != 0) break; else if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) break; else if (v == 2) key = htonl(dst_port); else if (v == 3) key = htonl(src_port); else if (v == 4 || v == 5) { check_uidgid( (ipfw_insn_u32 *)cmd, args, &ucred_lookup, #ifdef __FreeBSD__ &ucred_cache); if (v == 4 /* O_UID */) key = ucred_cache->cr_uid; else if (v == 5 /* O_JAIL */) key = ucred_cache->cr_prison->pr_id; #else /* !__FreeBSD__ */ (void *)&ucred_cache); if (v ==4 /* O_UID */) key = ucred_cache.uid; else if (v == 5 /* O_JAIL */) key = ucred_cache.xid; #endif /* !__FreeBSD__ */ key = htonl(key); } else break; } match = ipfw_lookup_table(chain, cmd->arg1, key, &v); if (!match) break; if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) match = ((ipfw_insn_u32 *)cmd)->d[0] == v; else tablearg = v; } else if (is_ipv6) { uint32_t v = 0; void *pkey = (cmd->opcode == O_IP_DST_LOOKUP) ? &args->f_id.dst_ip6: &args->f_id.src_ip6; match = ipfw_lookup_table_extended(chain, cmd->arg1, pkey, &v, IPFW_TABLE_CIDR); if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) match = ((ipfw_insn_u32 *)cmd)->d[0] == v; if (match) tablearg = v; } break; case O_IP_SRC_MASK: case O_IP_DST_MASK: if (is_ipv4) { uint32_t a = (cmd->opcode == O_IP_DST_MASK) ? dst_ip.s_addr : src_ip.s_addr; uint32_t *p = ((ipfw_insn_u32 *)cmd)->d; int i = cmdlen-1; for (; !match && i>0; i-= 2, p+= 2) match = (p[0] == (a & p[1])); } break; case O_IP_SRC_ME: if (is_ipv4) { struct ifnet *tif; INADDR_TO_IFP(src_ip, tif); match = (tif != NULL); break; } #ifdef INET6 /* FALLTHROUGH */ case O_IP6_SRC_ME: match= is_ipv6 && search_ip6_addr_net(&args->f_id.src_ip6); #endif break; case O_IP_DST_SET: case O_IP_SRC_SET: if (is_ipv4) { u_int32_t *d = (u_int32_t *)(cmd+1); u_int32_t addr = cmd->opcode == O_IP_DST_SET ? args->f_id.dst_ip : args->f_id.src_ip; if (addr < d[0]) break; addr -= d[0]; /* subtract base */ match = (addr < cmd->arg1) && ( d[ 1 + (addr>>5)] & (1<<(addr & 0x1f)) ); } break; case O_IP_DST: match = is_ipv4 && (((ipfw_insn_ip *)cmd)->addr.s_addr == dst_ip.s_addr); break; case O_IP_DST_ME: if (is_ipv4) { struct ifnet *tif; INADDR_TO_IFP(dst_ip, tif); match = (tif != NULL); break; } #ifdef INET6 /* FALLTHROUGH */ case O_IP6_DST_ME: match= is_ipv6 && search_ip6_addr_net(&args->f_id.dst_ip6); #endif break; case O_IP_SRCPORT: case O_IP_DSTPORT: /* * offset == 0 && proto != 0 is enough * to guarantee that we have a * packet with port info. */ if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP) && offset == 0) { u_int16_t x = (cmd->opcode == O_IP_SRCPORT) ? src_port : dst_port ; u_int16_t *p = ((ipfw_insn_u16 *)cmd)->ports; int i; for (i = cmdlen - 1; !match && i>0; i--, p += 2) match = (x>=p[0] && x<=p[1]); } break; case O_ICMPTYPE: match = (offset == 0 && proto==IPPROTO_ICMP && icmptype_match(ICMP(ulp), (ipfw_insn_u32 *)cmd) ); break; #ifdef INET6 case O_ICMP6TYPE: match = is_ipv6 && offset == 0 && proto==IPPROTO_ICMPV6 && icmp6type_match( ICMP6(ulp)->icmp6_type, (ipfw_insn_u32 *)cmd); break; #endif /* INET6 */ case O_IPOPT: match = (is_ipv4 && ipopts_match(ip, cmd) ); break; case O_IPVER: match = (is_ipv4 && cmd->arg1 == ip->ip_v); break; case O_IPID: case O_IPLEN: case O_IPTTL: if (is_ipv4) { /* only for IP packets */ uint16_t x; uint16_t *p; int i; if (cmd->opcode == O_IPLEN) x = iplen; else if (cmd->opcode == O_IPTTL) x = ip->ip_ttl; else /* must be IPID */ x = ntohs(ip->ip_id); if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* otherwise we have ranges */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i>0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_IPPRECEDENCE: match = (is_ipv4 && (cmd->arg1 == (ip->ip_tos & 0xe0)) ); break; case O_IPTOS: match = (is_ipv4 && flags_match(cmd, ip->ip_tos)); break; case O_DSCP: { uint32_t *p; uint16_t x; p = ((ipfw_insn_u32 *)cmd)->d; if (is_ipv4) x = ip->ip_tos >> 2; else if (is_ipv6) { uint8_t *v; v = &((struct ip6_hdr *)ip)->ip6_vfc; x = (*v & 0x0F) << 2; v++; x |= *v >> 6; } else break; /* DSCP bitmask is stored as low_u32 high_u32 */ if (x > 32) match = *(p + 1) & (1 << (x - 32)); else match = *p & (1 << x); } break; case O_TCPDATALEN: if (proto == IPPROTO_TCP && offset == 0) { struct tcphdr *tcp; uint16_t x; uint16_t *p; int i; tcp = TCP(ulp); x = iplen - ((ip->ip_hl + tcp->th_off) << 2); if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* otherwise we have ranges */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i>0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_TCPFLAGS: match = (proto == IPPROTO_TCP && offset == 0 && flags_match(cmd, TCP(ulp)->th_flags)); break; case O_TCPOPTS: PULLUP_LEN(hlen, ulp, (TCP(ulp)->th_off << 2)); match = (proto == IPPROTO_TCP && offset == 0 && tcpopts_match(TCP(ulp), cmd)); break; case O_TCPSEQ: match = (proto == IPPROTO_TCP && offset == 0 && ((ipfw_insn_u32 *)cmd)->d[0] == TCP(ulp)->th_seq); break; case O_TCPACK: match = (proto == IPPROTO_TCP && offset == 0 && ((ipfw_insn_u32 *)cmd)->d[0] == TCP(ulp)->th_ack); break; case O_TCPWIN: if (proto == IPPROTO_TCP && offset == 0) { uint16_t x; uint16_t *p; int i; x = ntohs(TCP(ulp)->th_win); if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* Otherwise we have ranges. */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i > 0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_ESTAB: /* reject packets which have SYN only */ /* XXX should i also check for TH_ACK ? */ match = (proto == IPPROTO_TCP && offset == 0 && (TCP(ulp)->th_flags & (TH_RST | TH_ACK | TH_SYN)) != TH_SYN); break; case O_ALTQ: { struct pf_mtag *at; struct m_tag *mtag; ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; /* * ALTQ uses mbuf tags from another * packet filtering system - pf(4). * We allocate a tag in its format * and fill it in, pretending to be pf(4). */ match = 1; at = pf_find_mtag(m); if (at != NULL && at->qid != 0) break; mtag = m_tag_get(PACKET_TAG_PF, sizeof(struct pf_mtag), M_NOWAIT | M_ZERO); if (mtag == NULL) { /* * Let the packet fall back to the * default ALTQ. */ break; } m_tag_prepend(m, mtag); at = (struct pf_mtag *)(mtag + 1); at->qid = altq->qid; at->hdr = ip; break; } case O_LOG: ipfw_log(f, hlen, args, m, oif, offset | ip6f_mf, tablearg, ip); match = 1; break; case O_PROB: match = (random()<((ipfw_insn_u32 *)cmd)->d[0]); break; case O_VERREVPATH: /* Outgoing packets automatically pass/match */ match = ((oif != NULL) || (m->m_pkthdr.rcvif == NULL) || ( #ifdef INET6 is_ipv6 ? verify_path6(&(args->f_id.src_ip6), m->m_pkthdr.rcvif, args->f_id.fib) : #endif verify_path(src_ip, m->m_pkthdr.rcvif, args->f_id.fib))); break; case O_VERSRCREACH: /* Outgoing packets automatically pass/match */ match = (hlen > 0 && ((oif != NULL) || #ifdef INET6 is_ipv6 ? verify_path6(&(args->f_id.src_ip6), NULL, args->f_id.fib) : #endif verify_path(src_ip, NULL, args->f_id.fib))); break; case O_ANTISPOOF: /* Outgoing packets automatically pass/match */ if (oif == NULL && hlen > 0 && ( (is_ipv4 && in_localaddr(src_ip)) #ifdef INET6 || (is_ipv6 && in6_localaddr(&(args->f_id.src_ip6))) #endif )) match = #ifdef INET6 is_ipv6 ? verify_path6( &(args->f_id.src_ip6), m->m_pkthdr.rcvif, args->f_id.fib) : #endif verify_path(src_ip, m->m_pkthdr.rcvif, args->f_id.fib); else match = 1; break; case O_IPSEC: #ifdef IPSEC match = (m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL); #endif /* otherwise no match */ break; #ifdef INET6 case O_IP6_SRC: match = is_ipv6 && IN6_ARE_ADDR_EQUAL(&args->f_id.src_ip6, &((ipfw_insn_ip6 *)cmd)->addr6); break; case O_IP6_DST: match = is_ipv6 && IN6_ARE_ADDR_EQUAL(&args->f_id.dst_ip6, &((ipfw_insn_ip6 *)cmd)->addr6); break; case O_IP6_SRC_MASK: case O_IP6_DST_MASK: if (is_ipv6) { int i = cmdlen - 1; struct in6_addr p; struct in6_addr *d = &((ipfw_insn_ip6 *)cmd)->addr6; for (; !match && i > 0; d += 2, i -= F_INSN_SIZE(struct in6_addr) * 2) { p = (cmd->opcode == O_IP6_SRC_MASK) ? args->f_id.src_ip6: args->f_id.dst_ip6; APPLY_MASK(&p, &d[1]); match = IN6_ARE_ADDR_EQUAL(&d[0], &p); } } break; case O_FLOW6ID: match = is_ipv6 && flow6id_match(args->f_id.flow_id6, (ipfw_insn_u32 *) cmd); break; case O_EXT_HDR: match = is_ipv6 && (ext_hd & ((ipfw_insn *) cmd)->arg1); break; case O_IP6: match = is_ipv6; break; #endif case O_IP4: match = is_ipv4; break; case O_TAG: { struct m_tag *mtag; uint32_t tag = IP_FW_ARG_TABLEARG(cmd->arg1); /* Packet is already tagged with this tag? */ mtag = m_tag_locate(m, MTAG_IPFW, tag, NULL); /* We have `untag' action when F_NOT flag is * present. And we must remove this mtag from * mbuf and reset `match' to zero (`match' will * be inversed later). * Otherwise we should allocate new mtag and * push it into mbuf. */ if (cmd->len & F_NOT) { /* `untag' action */ if (mtag != NULL) m_tag_delete(m, mtag); match = 0; } else { if (mtag == NULL) { mtag = m_tag_alloc( MTAG_IPFW, tag, 0, M_NOWAIT); if (mtag != NULL) m_tag_prepend(m, mtag); } match = 1; } break; } case O_FIB: /* try match the specified fib */ if (args->f_id.fib == cmd->arg1) match = 1; break; case O_SOCKARG: { struct inpcb *inp = args->inp; struct inpcbinfo *pi; if (is_ipv6) /* XXX can we remove this ? */ break; if (proto == IPPROTO_TCP) pi = &V_tcbinfo; else if (proto == IPPROTO_UDP) pi = &V_udbinfo; else break; /* * XXXRW: so_user_cookie should almost * certainly be inp_user_cookie? */ /* For incomming packet, lookup up the inpcb using the src/dest ip/port tuple */ if (inp == NULL) { inp = in_pcblookup(pi, src_ip, htons(src_port), dst_ip, htons(dst_port), INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { tablearg = inp->inp_socket->so_user_cookie; if (tablearg) match = 1; INP_RUNLOCK(inp); } } else { if (inp->inp_socket) { tablearg = inp->inp_socket->so_user_cookie; if (tablearg) match = 1; } } break; } case O_TAGGED: { struct m_tag *mtag; uint32_t tag = IP_FW_ARG_TABLEARG(cmd->arg1); if (cmdlen == 1) { match = m_tag_locate(m, MTAG_IPFW, tag, NULL) != NULL; break; } /* we have ranges */ for (mtag = m_tag_first(m); mtag != NULL && !match; mtag = m_tag_next(m, mtag)) { uint16_t *p; int i; if (mtag->m_tag_cookie != MTAG_IPFW) continue; p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for(; !match && i > 0; i--, p += 2) match = mtag->m_tag_id >= p[0] && mtag->m_tag_id <= p[1]; } break; } /* * The second set of opcodes represents 'actions', * i.e. the terminal part of a rule once the packet * matches all previous patterns. * Typically there is only one action for each rule, * and the opcode is stored at the end of the rule * (but there are exceptions -- see below). * * In general, here we set retval and terminate the * outer loop (would be a 'break 3' in some language, * but we need to set l=0, done=1) * * Exceptions: * O_COUNT and O_SKIPTO actions: * instead of terminating, we jump to the next rule * (setting l=0), or to the SKIPTO target (setting * f/f_len, cmd and l as needed), respectively. * * O_TAG, O_LOG and O_ALTQ action parameters: * perform some action and set match = 1; * * O_LIMIT and O_KEEP_STATE: these opcodes are * not real 'actions', and are stored right * before the 'action' part of the rule. * These opcodes try to install an entry in the * state tables; if successful, we continue with * the next opcode (match=1; break;), otherwise * the packet must be dropped (set retval, * break loops with l=0, done=1) * * O_PROBE_STATE and O_CHECK_STATE: these opcodes * cause a lookup of the state table, and a jump * to the 'action' part of the parent rule * if an entry is found, or * (CHECK_STATE only) a jump to the next rule if * the entry is not found. * The result of the lookup is cached so that * further instances of these opcodes become NOPs. * The jump to the next rule is done by setting * l=0, cmdlen=0. */ case O_LIMIT: case O_KEEP_STATE: if (ipfw_install_state(f, (ipfw_insn_limit *)cmd, args, tablearg)) { /* error or limit violation */ retval = IP_FW_DENY; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ } match = 1; break; case O_PROBE_STATE: case O_CHECK_STATE: /* * dynamic rules are checked at the first * keep-state or check-state occurrence, * with the result being stored in dyn_dir. * The compiler introduces a PROBE_STATE * instruction for us when we have a * KEEP_STATE (because PROBE_STATE needs * to be run first). */ if (dyn_dir == MATCH_UNKNOWN && (q = ipfw_lookup_dyn_rule(&args->f_id, &dyn_dir, proto == IPPROTO_TCP ? TCP(ulp) : NULL)) != NULL) { /* * Found dynamic entry, update stats * and jump to the 'action' part of * the parent rule by setting * f, cmd, l and clearing cmdlen. */ IPFW_INC_DYN_COUNTER(q, pktlen); /* XXX we would like to have f_pos * readily accessible in the dynamic * rule, instead of having to * lookup q->rule. */ f = q->rule; f_pos = ipfw_find_rule(chain, f->rulenum, f->id); cmd = ACTION_PTR(f); l = f->cmd_len - f->act_ofs; ipfw_dyn_unlock(q); cmdlen = 0; match = 1; break; } /* * Dynamic entry not found. If CHECK_STATE, * skip to next rule, if PROBE_STATE just * ignore and continue with next opcode. */ if (cmd->opcode == O_CHECK_STATE) l = 0; /* exit inner loop */ match = 1; break; case O_ACCEPT: retval = 0; /* accept */ l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_PIPE: case O_QUEUE: set_match(args, f_pos, chain); args->rule.info = IP_FW_ARG_TABLEARG(cmd->arg1); if (cmd->opcode == O_PIPE) args->rule.info |= IPFW_IS_PIPE; if (V_fw_one_pass) args->rule.info |= IPFW_ONEPASS; retval = IP_FW_DUMMYNET; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_DIVERT: case O_TEE: if (args->eh) /* not on layer 2 */ break; /* otherwise this is terminal */ l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ retval = (cmd->opcode == O_DIVERT) ? IP_FW_DIVERT : IP_FW_TEE; set_match(args, f_pos, chain); args->rule.info = IP_FW_ARG_TABLEARG(cmd->arg1); break; case O_COUNT: IPFW_INC_RULE_COUNTER(f, pktlen); l = 0; /* exit inner loop */ break; case O_SKIPTO: IPFW_INC_RULE_COUNTER(f, pktlen); f_pos = jump_fast(chain, f, cmd->arg1, tablearg, 0); /* * Skip disabled rules, and re-enter * the inner loop with the correct * f_pos, f, l and cmd. * Also clear cmdlen and skip_or */ for (; f_pos < chain->n_rules - 1 && (V_set_disable & (1 << chain->map[f_pos]->set)); f_pos++) ; /* Re-enter the inner loop at the skipto rule. */ f = chain->map[f_pos]; l = f->cmd_len; cmd = f->cmd; match = 1; cmdlen = 0; skip_or = 0; continue; break; /* not reached */ case O_CALLRETURN: { /* * Implementation of `subroutine' call/return, * in the stack carried in an mbuf tag. This * is different from `skipto' in that any call * address is possible (`skipto' must prevent * backward jumps to avoid endless loops). * We have `return' action when F_NOT flag is * present. The `m_tag_id' field is used as * stack pointer. */ struct m_tag *mtag; uint16_t jmpto, *stack; #define IS_CALL ((cmd->len & F_NOT) == 0) #define IS_RETURN ((cmd->len & F_NOT) != 0) /* * Hand-rolled version of m_tag_locate() with * wildcard `type'. * If not already tagged, allocate new tag. */ mtag = m_tag_first(m); while (mtag != NULL) { if (mtag->m_tag_cookie == MTAG_IPFW_CALL) break; mtag = m_tag_next(m, mtag); } if (mtag == NULL && IS_CALL) { mtag = m_tag_alloc(MTAG_IPFW_CALL, 0, IPFW_CALLSTACK_SIZE * sizeof(uint16_t), M_NOWAIT); if (mtag != NULL) m_tag_prepend(m, mtag); } /* * On error both `call' and `return' just * continue with next rule. */ if (IS_RETURN && (mtag == NULL || mtag->m_tag_id == 0)) { l = 0; /* exit inner loop */ break; } if (IS_CALL && (mtag == NULL || mtag->m_tag_id >= IPFW_CALLSTACK_SIZE)) { printf("ipfw: call stack error, " "go to next rule\n"); l = 0; /* exit inner loop */ break; } IPFW_INC_RULE_COUNTER(f, pktlen); stack = (uint16_t *)(mtag + 1); /* * The `call' action may use cached f_pos * (in f->next_rule), whose version is written * in f->next_rule. * The `return' action, however, doesn't have * fixed jump address in cmd->arg1 and can't use * cache. */ if (IS_CALL) { stack[mtag->m_tag_id] = f->rulenum; mtag->m_tag_id++; f_pos = jump_fast(chain, f, cmd->arg1, tablearg, 1); } else { /* `return' action */ mtag->m_tag_id--; jmpto = stack[mtag->m_tag_id] + 1; f_pos = ipfw_find_rule(chain, jmpto, 0); } /* * Skip disabled rules, and re-enter * the inner loop with the correct * f_pos, f, l and cmd. * Also clear cmdlen and skip_or */ for (; f_pos < chain->n_rules - 1 && (V_set_disable & (1 << chain->map[f_pos]->set)); f_pos++) ; /* Re-enter the inner loop at the dest rule. */ f = chain->map[f_pos]; l = f->cmd_len; cmd = f->cmd; cmdlen = 0; skip_or = 0; continue; break; /* NOTREACHED */ } #undef IS_CALL #undef IS_RETURN case O_REJECT: /* * Drop the packet and send a reject notice * if the packet is not ICMP (or is an ICMP * query), and it is not multicast/broadcast. */ if (hlen > 0 && is_ipv4 && offset == 0 && (proto != IPPROTO_ICMP || is_icmp_query(ICMP(ulp))) && !(m->m_flags & (M_BCAST|M_MCAST)) && !IN_MULTICAST(ntohl(dst_ip.s_addr))) { send_reject(args, cmd->arg1, iplen, ip); m = args->m; } /* FALLTHROUGH */ #ifdef INET6 case O_UNREACH6: if (hlen > 0 && is_ipv6 && ((offset & IP6F_OFF_MASK) == 0) && (proto != IPPROTO_ICMPV6 || (is_icmp6_query(icmp6_type) == 1)) && !(m->m_flags & (M_BCAST|M_MCAST)) && !IN6_IS_ADDR_MULTICAST(&args->f_id.dst_ip6)) { send_reject6( args, cmd->arg1, hlen, (struct ip6_hdr *)ip); m = args->m; } /* FALLTHROUGH */ #endif case O_DENY: retval = IP_FW_DENY; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_FORWARD_IP: if (args->eh) /* not valid on layer2 pkts */ break; if (q == NULL || q->rule != f || dyn_dir == MATCH_FORWARD) { struct sockaddr_in *sa; sa = &(((ipfw_insn_sa *)cmd)->sa); if (sa->sin_addr.s_addr == INADDR_ANY) { bcopy(sa, &args->hopstore, sizeof(*sa)); args->hopstore.sin_addr.s_addr = htonl(tablearg); args->next_hop = &args->hopstore; } else { args->next_hop = sa; } } retval = IP_FW_PASS; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; #ifdef INET6 case O_FORWARD_IP6: if (args->eh) /* not valid on layer2 pkts */ break; if (q == NULL || q->rule != f || dyn_dir == MATCH_FORWARD) { struct sockaddr_in6 *sin6; sin6 = &(((ipfw_insn_sa6 *)cmd)->sa); args->next_hop6 = sin6; } retval = IP_FW_PASS; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; #endif case O_NETGRAPH: case O_NGTEE: set_match(args, f_pos, chain); args->rule.info = IP_FW_ARG_TABLEARG(cmd->arg1); if (V_fw_one_pass) args->rule.info |= IPFW_ONEPASS; retval = (cmd->opcode == O_NETGRAPH) ? IP_FW_NETGRAPH : IP_FW_NGTEE; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_SETFIB: { uint32_t fib; IPFW_INC_RULE_COUNTER(f, pktlen); fib = IP_FW_ARG_TABLEARG(cmd->arg1); if (fib >= rt_numfibs) fib = 0; M_SETFIB(m, fib); args->f_id.fib = fib; l = 0; /* exit inner loop */ break; } case O_SETDSCP: { uint16_t code; code = IP_FW_ARG_TABLEARG(cmd->arg1) & 0x3F; l = 0; /* exit inner loop */ if (is_ipv4) { uint16_t a; a = ip->ip_tos; ip->ip_tos = (code << 2) | (ip->ip_tos & 0x03); a += ntohs(ip->ip_sum) - ip->ip_tos; ip->ip_sum = htons(a); } else if (is_ipv6) { uint8_t *v; v = &((struct ip6_hdr *)ip)->ip6_vfc; *v = (*v & 0xF0) | (code >> 2); v++; *v = (*v & 0x3F) | ((code & 0x03) << 6); } else break; IPFW_INC_RULE_COUNTER(f, pktlen); break; } case O_NAT: if (!IPFW_NAT_LOADED) { retval = IP_FW_DENY; } else { struct cfg_nat *t; int nat_id; set_match(args, f_pos, chain); /* Check if this is 'global' nat rule */ if (cmd->arg1 == 0) { retval = ipfw_nat_ptr(args, NULL, m); l = 0; done = 1; break; } t = ((ipfw_insn_nat *)cmd)->nat; if (t == NULL) { nat_id = IP_FW_ARG_TABLEARG(cmd->arg1); t = (*lookup_nat_ptr)(&chain->nat, nat_id); if (t == NULL) { retval = IP_FW_DENY; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; } if (cmd->arg1 != IP_FW_TABLEARG) ((ipfw_insn_nat *)cmd)->nat = t; } retval = ipfw_nat_ptr(args, t, m); } l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_REASS: { int ip_off; IPFW_INC_RULE_COUNTER(f, pktlen); l = 0; /* in any case exit inner loop */ ip_off = ntohs(ip->ip_off); /* if not fragmented, go to next rule */ if ((ip_off & (IP_MF | IP_OFFMASK)) == 0) break; args->m = m = ip_reass(m); /* * do IP header checksum fixup. */ if (m == NULL) { /* fragment got swallowed */ retval = IP_FW_DENY; } else { /* good, packet complete */ int hlen; ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; ip->ip_sum = 0; if (hlen == sizeof(struct ip)) ip->ip_sum = in_cksum_hdr(ip); else ip->ip_sum = in_cksum(m, hlen); retval = IP_FW_REASS; set_match(args, f_pos, chain); } done = 1; /* exit outer loop */ break; } default: panic("-- unknown opcode %d\n", cmd->opcode); } /* end of switch() on opcodes */ /* * if we get here with l=0, then match is irrelevant. */ if (cmd->len & F_NOT) match = !match; if (match) { if (cmd->len & F_OR) skip_or = 1; } else { if (!(cmd->len & F_OR)) /* not an OR block, */ break; /* try next rule */ } } /* end of inner loop, scan opcodes */ #undef PULLUP_LEN if (done) break; /* next_rule:; */ /* try next rule */ } /* end of outer for, scan rules */ if (done) { struct ip_fw *rule = chain->map[f_pos]; /* Update statistics */ IPFW_INC_RULE_COUNTER(rule, pktlen); } else { retval = IP_FW_DENY; printf("ipfw: ouch!, skip past end of rules, denying packet\n"); } IPFW_PF_RUNLOCK(chain); #ifdef __FreeBSD__ if (ucred_cache != NULL) crfree(ucred_cache); #endif return (retval); pullup_failed: if (V_fw_verbose) printf("ipfw: pullup failed\n"); return (IP_FW_DENY); } /* * Set maximum number of tables that can be used in given VNET ipfw instance. */ #ifdef SYSCTL_NODE static int sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS) { int error; unsigned int ntables; ntables = V_fw_tables_max; error = sysctl_handle_int(oidp, &ntables, 0, req); /* Read operation or some error */ if ((error != 0) || (req->newptr == NULL)) return (error); return (ipfw_resize_tables(&V_layer3_chain, ntables)); } #endif /* * Module and VNET glue */ /* * Stuff that must be initialised only on boot or module load */ static int ipfw_init(void) { int error = 0; /* * Only print out this stuff the first time around, * when called from the sysinit code. */ printf("ipfw2 " #ifdef INET6 "(+ipv6) " #endif "initialized, divert %s, nat %s, " "default to %s, logging ", #ifdef IPDIVERT "enabled", #else "loadable", #endif #ifdef IPFIREWALL_NAT "enabled", #else "loadable", #endif default_to_accept ? "accept" : "deny"); /* * Note: V_xxx variables can be accessed here but the vnet specific * initializer may not have been called yet for the VIMAGE case. * Tuneables will have been processed. We will print out values for * the default vnet. * XXX This should all be rationalized AFTER 8.0 */ if (V_fw_verbose == 0) printf("disabled\n"); else if (V_verbose_limit == 0) printf("unlimited\n"); else printf("limited to %d packets/entry by default\n", V_verbose_limit); /* Check user-supplied table count for validness */ if (default_fw_tables > IPFW_TABLES_MAX) default_fw_tables = IPFW_TABLES_MAX; ipfw_log_bpf(1); /* init */ return (error); } /* * Called for the removal of the last instance only on module unload. */ static void ipfw_destroy(void) { ipfw_log_bpf(0); /* uninit */ printf("IP firewall unloaded\n"); } /* * Stuff that must be initialized for every instance * (including the first of course). */ static int vnet_ipfw_init(const void *unused) { int error; struct ip_fw *rule = NULL; struct ip_fw_chain *chain; chain = &V_layer3_chain; /* First set up some values that are compile time options */ V_autoinc_step = 100; /* bounded to 1..1000 in add_rule() */ V_fw_deny_unknown_exthdrs = 1; #ifdef IPFIREWALL_VERBOSE V_fw_verbose = 1; #endif #ifdef IPFIREWALL_VERBOSE_LIMIT V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT; #endif #ifdef IPFIREWALL_NAT LIST_INIT(&chain->nat); #endif /* insert the default rule and create the initial map */ chain->n_rules = 1; chain->static_len = sizeof(struct ip_fw); chain->map = malloc(sizeof(struct ip_fw *), M_IPFW, M_WAITOK | M_ZERO); if (chain->map) rule = malloc(chain->static_len, M_IPFW, M_WAITOK | M_ZERO); /* Set initial number of tables */ V_fw_tables_max = default_fw_tables; error = ipfw_init_tables(chain); if (error) { printf("ipfw2: setting up tables failed\n"); free(chain->map, M_IPFW); free(rule, M_IPFW); return (ENOSPC); } /* fill and insert the default rule */ rule->act_ofs = 0; rule->rulenum = IPFW_DEFAULT_RULE; rule->cmd_len = 1; rule->set = RESVD_SET; rule->cmd[0].len = 1; rule->cmd[0].opcode = default_to_accept ? O_ACCEPT : O_DENY; chain->rules = chain->default_rule = chain->map[0] = rule; chain->id = rule->id = 1; IPFW_LOCK_INIT(chain); ipfw_dyn_init(chain); /* First set up some values that are compile time options */ V_ipfw_vnet_ready = 1; /* Open for business */ /* * Hook the sockopt handler and pfil hooks for ipv4 and ipv6. * Even if the latter two fail we still keep the module alive * because the sockopt and layer2 paths are still useful. * ipfw[6]_hook return 0 on success, ENOENT on failure, * so we can ignore the exact return value and just set a flag. * * Note that V_fw[6]_enable are manipulated by a SYSCTL_PROC so * changes in the underlying (per-vnet) variables trigger * immediate hook()/unhook() calls. * In layer2 we have the same behaviour, except that V_ether_ipfw * is checked on each packet because there are no pfil hooks. */ V_ip_fw_ctl_ptr = ipfw_ctl; error = ipfw_attach_hooks(1); return (error); } /* * Called for the removal of each instance. */ static int vnet_ipfw_uninit(const void *unused) { struct ip_fw *reap, *rule; struct ip_fw_chain *chain = &V_layer3_chain; int i; V_ipfw_vnet_ready = 0; /* tell new callers to go away */ /* * disconnect from ipv4, ipv6, layer2 and sockopt. * Then grab, release and grab again the WLOCK so we make * sure the update is propagated and nobody will be in. */ (void)ipfw_attach_hooks(0 /* detach */); V_ip_fw_ctl_ptr = NULL; IPFW_UH_WLOCK(chain); IPFW_UH_WUNLOCK(chain); IPFW_UH_WLOCK(chain); IPFW_WLOCK(chain); ipfw_dyn_uninit(0); /* run the callout_drain */ IPFW_WUNLOCK(chain); ipfw_destroy_tables(chain); reap = NULL; IPFW_WLOCK(chain); for (i = 0; i < chain->n_rules; i++) { rule = chain->map[i]; rule->x_next = reap; reap = rule; } if (chain->map) free(chain->map, M_IPFW); IPFW_WUNLOCK(chain); IPFW_UH_WUNLOCK(chain); if (reap != NULL) ipfw_reap_rules(reap); IPFW_LOCK_DESTROY(chain); ipfw_dyn_uninit(1); /* free the remaining parts */ return 0; } /* * Module event handler. * In general we have the choice of handling most of these events by the * event handler or by the (VNET_)SYS(UN)INIT handlers. I have chosen to * use the SYSINIT handlers as they are more capable of expressing the * flow of control during module and vnet operations, so this is just * a skeleton. Note there is no SYSINIT equivalent of the module * SHUTDOWN handler, but we don't have anything to do in that case anyhow. */ static int ipfw_modevent(module_t mod, int type, void *unused) { int err = 0; switch (type) { case MOD_LOAD: /* Called once at module load or * system boot if compiled in. */ break; case MOD_QUIESCE: /* Called before unload. May veto unloading. */ break; case MOD_UNLOAD: /* Called during unload. */ break; case MOD_SHUTDOWN: /* Called during system shutdown. */ break; default: err = EOPNOTSUPP; break; } return err; } static moduledata_t ipfwmod = { "ipfw", ipfw_modevent, 0 }; /* Define startup order. */ #define IPFW_SI_SUB_FIREWALL SI_SUB_PROTO_IFATTACHDOMAIN #define IPFW_MODEVENT_ORDER (SI_ORDER_ANY - 255) /* On boot slot in here. */ #define IPFW_MODULE_ORDER (IPFW_MODEVENT_ORDER + 1) /* A little later. */ #define IPFW_VNET_ORDER (IPFW_MODEVENT_ORDER + 2) /* Later still. */ DECLARE_MODULE(ipfw, ipfwmod, IPFW_SI_SUB_FIREWALL, IPFW_MODEVENT_ORDER); MODULE_VERSION(ipfw, 2); /* should declare some dependencies here */ /* * Starting up. Done in order after ipfwmod() has been called. * VNET_SYSINIT is also called for each existing vnet and each new vnet. */ SYSINIT(ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER, ipfw_init, NULL); VNET_SYSINIT(vnet_ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER, vnet_ipfw_init, NULL); /* * Closing up shop. These are done in REVERSE ORDER, but still * after ipfwmod() has been called. Not called on reboot. * VNET_SYSUNINIT is also called for each exiting vnet as it exits. * or when the module is unloaded. */ SYSUNINIT(ipfw_destroy, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER, ipfw_destroy, NULL); VNET_SYSUNINIT(vnet_ipfw_uninit, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER, vnet_ipfw_uninit, NULL); /* end of file */ Index: stable/10/sys/netpfil/pf/pf.c =================================================================== --- stable/10/sys/netpfil/pf/pf.c (revision 263085) +++ stable/10/sys/netpfil/pf/pf.c (revision 263086) @@ -1,6392 +1,6391 @@ /*- * Copyright (c) 2001 Daniel Hartmeier * Copyright (c) 2002 - 2008 Henning Brauer * Copyright (c) 2012 Gleb Smirnoff * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * - Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials provided * with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * Effort sponsored in part by the Defense Advanced Research Projects * Agency (DARPA) and Air Force Research Laboratory, Air Force * Materiel Command, USAF, under agreement number F30602-01-2-0537. * * $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_bpf.h" #include "opt_pf.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* XXX: only for DIR_IN/DIR_OUT */ #ifdef INET6 #include #include #include #include #include #endif /* INET6 */ #include #include #define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x /* * Global variables */ /* state tables */ VNET_DEFINE(struct pf_altqqueue, pf_altqs[2]); VNET_DEFINE(struct pf_palist, pf_pabuf); VNET_DEFINE(struct pf_altqqueue *, pf_altqs_active); VNET_DEFINE(struct pf_altqqueue *, pf_altqs_inactive); VNET_DEFINE(struct pf_status, pf_status); VNET_DEFINE(u_int32_t, ticket_altqs_active); VNET_DEFINE(u_int32_t, ticket_altqs_inactive); VNET_DEFINE(int, altqs_inactive_open); VNET_DEFINE(u_int32_t, ticket_pabuf); VNET_DEFINE(MD5_CTX, pf_tcp_secret_ctx); #define V_pf_tcp_secret_ctx VNET(pf_tcp_secret_ctx) VNET_DEFINE(u_char, pf_tcp_secret[16]); #define V_pf_tcp_secret VNET(pf_tcp_secret) VNET_DEFINE(int, pf_tcp_secret_init); #define V_pf_tcp_secret_init VNET(pf_tcp_secret_init) VNET_DEFINE(int, pf_tcp_iss_off); #define V_pf_tcp_iss_off VNET(pf_tcp_iss_off) /* * Queue for pf_intr() sends. */ static MALLOC_DEFINE(M_PFTEMP, "pf_temp", "pf(4) temporary allocations"); struct pf_send_entry { STAILQ_ENTRY(pf_send_entry) pfse_next; struct mbuf *pfse_m; enum { PFSE_IP, PFSE_IP6, PFSE_ICMP, PFSE_ICMP6, } pfse_type; union { struct route ro; struct { int type; int code; int mtu; } icmpopts; } u; #define pfse_ro u.ro #define pfse_icmp_type u.icmpopts.type #define pfse_icmp_code u.icmpopts.code #define pfse_icmp_mtu u.icmpopts.mtu }; STAILQ_HEAD(pf_send_head, pf_send_entry); static VNET_DEFINE(struct pf_send_head, pf_sendqueue); #define V_pf_sendqueue VNET(pf_sendqueue) static struct mtx pf_sendqueue_mtx; #define PF_SENDQ_LOCK() mtx_lock(&pf_sendqueue_mtx) #define PF_SENDQ_UNLOCK() mtx_unlock(&pf_sendqueue_mtx) /* * Queue for pf_overload_task() tasks. */ struct pf_overload_entry { SLIST_ENTRY(pf_overload_entry) next; struct pf_addr addr; sa_family_t af; uint8_t dir; struct pf_rule *rule; }; SLIST_HEAD(pf_overload_head, pf_overload_entry); static VNET_DEFINE(struct pf_overload_head, pf_overloadqueue); #define V_pf_overloadqueue VNET(pf_overloadqueue) static VNET_DEFINE(struct task, pf_overloadtask); #define V_pf_overloadtask VNET(pf_overloadtask) static struct mtx pf_overloadqueue_mtx; #define PF_OVERLOADQ_LOCK() mtx_lock(&pf_overloadqueue_mtx) #define PF_OVERLOADQ_UNLOCK() mtx_unlock(&pf_overloadqueue_mtx) VNET_DEFINE(struct pf_rulequeue, pf_unlinked_rules); struct mtx pf_unlnkdrules_mtx; static VNET_DEFINE(uma_zone_t, pf_sources_z); #define V_pf_sources_z VNET(pf_sources_z) static VNET_DEFINE(uma_zone_t, pf_mtag_z); #define V_pf_mtag_z VNET(pf_mtag_z) VNET_DEFINE(uma_zone_t, pf_state_z); VNET_DEFINE(uma_zone_t, pf_state_key_z); VNET_DEFINE(uint64_t, pf_stateid[MAXCPU]); #define PFID_CPUBITS 8 #define PFID_CPUSHIFT (sizeof(uint64_t) * NBBY - PFID_CPUBITS) #define PFID_CPUMASK ((uint64_t)((1 << PFID_CPUBITS) - 1) << PFID_CPUSHIFT) #define PFID_MAXID (~PFID_CPUMASK) CTASSERT((1 << PFID_CPUBITS) > MAXCPU); static void pf_src_tree_remove_state(struct pf_state *); static void pf_init_threshold(struct pf_threshold *, u_int32_t, u_int32_t); static void pf_add_threshold(struct pf_threshold *); static int pf_check_threshold(struct pf_threshold *); static void pf_change_ap(struct pf_addr *, u_int16_t *, u_int16_t *, u_int16_t *, struct pf_addr *, u_int16_t, u_int8_t, sa_family_t); static int pf_modulate_sack(struct mbuf *, int, struct pf_pdesc *, struct tcphdr *, struct pf_state_peer *); static void pf_change_icmp(struct pf_addr *, u_int16_t *, struct pf_addr *, struct pf_addr *, u_int16_t, u_int16_t *, u_int16_t *, u_int16_t *, u_int16_t *, u_int8_t, sa_family_t); static void pf_send_tcp(struct mbuf *, const struct pf_rule *, sa_family_t, const struct pf_addr *, const struct pf_addr *, u_int16_t, u_int16_t, u_int32_t, u_int32_t, u_int8_t, u_int16_t, u_int16_t, u_int8_t, int, u_int16_t, struct ifnet *); static void pf_send_icmp(struct mbuf *, u_int8_t, u_int8_t, sa_family_t, struct pf_rule *); static void pf_detach_state(struct pf_state *); static int pf_state_key_attach(struct pf_state_key *, struct pf_state_key *, struct pf_state *); static void pf_state_key_detach(struct pf_state *, int); static int pf_state_key_ctor(void *, int, void *, int); static u_int32_t pf_tcp_iss(struct pf_pdesc *); static int pf_test_rule(struct pf_rule **, struct pf_state **, int, struct pfi_kif *, struct mbuf *, int, struct pf_pdesc *, struct pf_rule **, struct pf_ruleset **, struct inpcb *); static int pf_create_state(struct pf_rule *, struct pf_rule *, struct pf_rule *, struct pf_pdesc *, struct pf_src_node *, struct pf_state_key *, struct pf_state_key *, struct mbuf *, int, u_int16_t, u_int16_t, int *, struct pfi_kif *, struct pf_state **, int, u_int16_t, u_int16_t, int); static int pf_test_fragment(struct pf_rule **, int, struct pfi_kif *, struct mbuf *, void *, struct pf_pdesc *, struct pf_rule **, struct pf_ruleset **); static int pf_tcp_track_full(struct pf_state_peer *, struct pf_state_peer *, struct pf_state **, struct pfi_kif *, struct mbuf *, int, struct pf_pdesc *, u_short *, int *); static int pf_tcp_track_sloppy(struct pf_state_peer *, struct pf_state_peer *, struct pf_state **, struct pf_pdesc *, u_short *); static int pf_test_state_tcp(struct pf_state **, int, struct pfi_kif *, struct mbuf *, int, void *, struct pf_pdesc *, u_short *); static int pf_test_state_udp(struct pf_state **, int, struct pfi_kif *, struct mbuf *, int, void *, struct pf_pdesc *); static int pf_test_state_icmp(struct pf_state **, int, struct pfi_kif *, struct mbuf *, int, void *, struct pf_pdesc *, u_short *); static int pf_test_state_other(struct pf_state **, int, struct pfi_kif *, struct mbuf *, struct pf_pdesc *); static u_int8_t pf_get_wscale(struct mbuf *, int, u_int16_t, sa_family_t); static u_int16_t pf_get_mss(struct mbuf *, int, u_int16_t, sa_family_t); static u_int16_t pf_calc_mss(struct pf_addr *, sa_family_t, int, u_int16_t); static void pf_set_rt_ifp(struct pf_state *, struct pf_addr *); static int pf_check_proto_cksum(struct mbuf *, int, int, u_int8_t, sa_family_t); static void pf_print_state_parts(struct pf_state *, struct pf_state_key *, struct pf_state_key *); static int pf_addr_wrap_neq(struct pf_addr_wrap *, struct pf_addr_wrap *); static struct pf_state *pf_find_state(struct pfi_kif *, struct pf_state_key_cmp *, u_int); static int pf_src_connlimit(struct pf_state **); static void pf_overload_task(void *c, int pending); static int pf_insert_src_node(struct pf_src_node **, struct pf_rule *, struct pf_addr *, sa_family_t); static u_int pf_purge_expired_states(u_int, int); static void pf_purge_unlinked_rules(void); static int pf_mtag_init(void *, int, int); static void pf_mtag_free(struct m_tag *); #ifdef INET static void pf_route(struct mbuf **, struct pf_rule *, int, struct ifnet *, struct pf_state *, struct pf_pdesc *); #endif /* INET */ #ifdef INET6 static void pf_change_a6(struct pf_addr *, u_int16_t *, struct pf_addr *, u_int8_t); static void pf_route6(struct mbuf **, struct pf_rule *, int, struct ifnet *, struct pf_state *, struct pf_pdesc *); #endif /* INET6 */ int in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len); VNET_DECLARE(int, pf_end_threads); VNET_DEFINE(struct pf_limit, pf_limits[PF_LIMIT_MAX]); #define PACKET_LOOPED(pd) ((pd)->pf_mtag && \ (pd)->pf_mtag->flags & PF_PACKET_LOOPED) #define STATE_LOOKUP(i, k, d, s, pd) \ do { \ (s) = pf_find_state((i), (k), (d)); \ if ((s) == NULL) \ return (PF_DROP); \ if (PACKET_LOOPED(pd)) \ return (PF_PASS); \ if ((d) == PF_OUT && \ (((s)->rule.ptr->rt == PF_ROUTETO && \ (s)->rule.ptr->direction == PF_OUT) || \ ((s)->rule.ptr->rt == PF_REPLYTO && \ (s)->rule.ptr->direction == PF_IN)) && \ (s)->rt_kif != NULL && \ (s)->rt_kif != (i)) \ return (PF_PASS); \ } while (0) #define BOUND_IFACE(r, k) \ ((r)->rule_flag & PFRULE_IFBOUND) ? (k) : V_pfi_all #define STATE_INC_COUNTERS(s) \ do { \ counter_u64_add(s->rule.ptr->states_cur, 1); \ counter_u64_add(s->rule.ptr->states_tot, 1); \ if (s->anchor.ptr != NULL) { \ counter_u64_add(s->anchor.ptr->states_cur, 1); \ counter_u64_add(s->anchor.ptr->states_tot, 1); \ } \ if (s->nat_rule.ptr != NULL) { \ counter_u64_add(s->nat_rule.ptr->states_cur, 1);\ counter_u64_add(s->nat_rule.ptr->states_tot, 1);\ } \ } while (0) #define STATE_DEC_COUNTERS(s) \ do { \ if (s->nat_rule.ptr != NULL) \ counter_u64_add(s->nat_rule.ptr->states_cur, -1);\ if (s->anchor.ptr != NULL) \ counter_u64_add(s->anchor.ptr->states_cur, -1); \ counter_u64_add(s->rule.ptr->states_cur, -1); \ } while (0) static MALLOC_DEFINE(M_PFHASH, "pf_hash", "pf(4) hash header structures"); VNET_DEFINE(struct pf_keyhash *, pf_keyhash); VNET_DEFINE(struct pf_idhash *, pf_idhash); VNET_DEFINE(u_long, pf_hashmask); VNET_DEFINE(struct pf_srchash *, pf_srchash); VNET_DEFINE(u_long, pf_srchashmask); SYSCTL_NODE(_net, OID_AUTO, pf, CTLFLAG_RW, 0, "pf(4)"); VNET_DEFINE(u_long, pf_hashsize); #define V_pf_hashsize VNET(pf_hashsize) SYSCTL_VNET_UINT(_net_pf, OID_AUTO, states_hashsize, CTLFLAG_RDTUN, &VNET_NAME(pf_hashsize), 0, "Size of pf(4) states hashtable"); VNET_DEFINE(u_long, pf_srchashsize); #define V_pf_srchashsize VNET(pf_srchashsize) SYSCTL_VNET_UINT(_net_pf, OID_AUTO, source_nodes_hashsize, CTLFLAG_RDTUN, &VNET_NAME(pf_srchashsize), 0, "Size of pf(4) source nodes hashtable"); VNET_DEFINE(void *, pf_swi_cookie); VNET_DEFINE(uint32_t, pf_hashseed); #define V_pf_hashseed VNET(pf_hashseed) static __inline uint32_t pf_hashkey(struct pf_state_key *sk) { uint32_t h; h = jenkins_hash32((uint32_t *)sk, sizeof(struct pf_state_key_cmp)/sizeof(uint32_t), V_pf_hashseed); return (h & V_pf_hashmask); } static __inline uint32_t pf_hashsrc(struct pf_addr *addr, sa_family_t af) { uint32_t h; switch (af) { case AF_INET: h = jenkins_hash32((uint32_t *)&addr->v4, sizeof(addr->v4)/sizeof(uint32_t), V_pf_hashseed); break; case AF_INET6: h = jenkins_hash32((uint32_t *)&addr->v6, sizeof(addr->v6)/sizeof(uint32_t), V_pf_hashseed); break; default: panic("%s: unknown address family %u", __func__, af); } return (h & V_pf_srchashmask); } #ifdef INET6 void pf_addrcpy(struct pf_addr *dst, struct pf_addr *src, sa_family_t af) { switch (af) { #ifdef INET case AF_INET: dst->addr32[0] = src->addr32[0]; break; #endif /* INET */ case AF_INET6: dst->addr32[0] = src->addr32[0]; dst->addr32[1] = src->addr32[1]; dst->addr32[2] = src->addr32[2]; dst->addr32[3] = src->addr32[3]; break; } } #endif /* INET6 */ static void pf_init_threshold(struct pf_threshold *threshold, u_int32_t limit, u_int32_t seconds) { threshold->limit = limit * PF_THRESHOLD_MULT; threshold->seconds = seconds; threshold->count = 0; threshold->last = time_uptime; } static void pf_add_threshold(struct pf_threshold *threshold) { u_int32_t t = time_uptime, diff = t - threshold->last; if (diff >= threshold->seconds) threshold->count = 0; else threshold->count -= threshold->count * diff / threshold->seconds; threshold->count += PF_THRESHOLD_MULT; threshold->last = t; } static int pf_check_threshold(struct pf_threshold *threshold) { return (threshold->count > threshold->limit); } static int pf_src_connlimit(struct pf_state **state) { struct pf_overload_entry *pfoe; int bad = 0; PF_STATE_LOCK_ASSERT(*state); (*state)->src_node->conn++; (*state)->src.tcp_est = 1; pf_add_threshold(&(*state)->src_node->conn_rate); if ((*state)->rule.ptr->max_src_conn && (*state)->rule.ptr->max_src_conn < (*state)->src_node->conn) { V_pf_status.lcounters[LCNT_SRCCONN]++; bad++; } if ((*state)->rule.ptr->max_src_conn_rate.limit && pf_check_threshold(&(*state)->src_node->conn_rate)) { V_pf_status.lcounters[LCNT_SRCCONNRATE]++; bad++; } if (!bad) return (0); /* Kill this state. */ (*state)->timeout = PFTM_PURGE; (*state)->src.state = (*state)->dst.state = TCPS_CLOSED; if ((*state)->rule.ptr->overload_tbl == NULL) return (1); /* Schedule overloading and flushing task. */ pfoe = malloc(sizeof(*pfoe), M_PFTEMP, M_NOWAIT); if (pfoe == NULL) return (1); /* too bad :( */ bcopy(&(*state)->src_node->addr, &pfoe->addr, sizeof(pfoe->addr)); pfoe->af = (*state)->key[PF_SK_WIRE]->af; pfoe->rule = (*state)->rule.ptr; pfoe->dir = (*state)->direction; PF_OVERLOADQ_LOCK(); SLIST_INSERT_HEAD(&V_pf_overloadqueue, pfoe, next); PF_OVERLOADQ_UNLOCK(); taskqueue_enqueue(taskqueue_swi, &V_pf_overloadtask); return (1); } static void pf_overload_task(void *c, int pending) { struct pf_overload_head queue; struct pfr_addr p; struct pf_overload_entry *pfoe, *pfoe1; uint32_t killed = 0; PF_OVERLOADQ_LOCK(); queue = *(struct pf_overload_head *)c; SLIST_INIT((struct pf_overload_head *)c); PF_OVERLOADQ_UNLOCK(); bzero(&p, sizeof(p)); SLIST_FOREACH(pfoe, &queue, next) { V_pf_status.lcounters[LCNT_OVERLOAD_TABLE]++; if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("%s: blocking address ", __func__); pf_print_host(&pfoe->addr, 0, pfoe->af); printf("\n"); } p.pfra_af = pfoe->af; switch (pfoe->af) { #ifdef INET case AF_INET: p.pfra_net = 32; p.pfra_ip4addr = pfoe->addr.v4; break; #endif #ifdef INET6 case AF_INET6: p.pfra_net = 128; p.pfra_ip6addr = pfoe->addr.v6; break; #endif } PF_RULES_WLOCK(); pfr_insert_kentry(pfoe->rule->overload_tbl, &p, time_second); PF_RULES_WUNLOCK(); } /* * Remove those entries, that don't need flushing. */ SLIST_FOREACH_SAFE(pfoe, &queue, next, pfoe1) if (pfoe->rule->flush == 0) { SLIST_REMOVE(&queue, pfoe, pf_overload_entry, next); free(pfoe, M_PFTEMP); } else V_pf_status.lcounters[LCNT_OVERLOAD_FLUSH]++; /* If nothing to flush, return. */ if (SLIST_EMPTY(&queue)) return; for (int i = 0; i <= V_pf_hashmask; i++) { struct pf_idhash *ih = &V_pf_idhash[i]; struct pf_state_key *sk; struct pf_state *s; PF_HASHROW_LOCK(ih); LIST_FOREACH(s, &ih->states, entry) { sk = s->key[PF_SK_WIRE]; SLIST_FOREACH(pfoe, &queue, next) if (sk->af == pfoe->af && ((pfoe->rule->flush & PF_FLUSH_GLOBAL) || pfoe->rule == s->rule.ptr) && ((pfoe->dir == PF_OUT && PF_AEQ(&pfoe->addr, &sk->addr[1], sk->af)) || (pfoe->dir == PF_IN && PF_AEQ(&pfoe->addr, &sk->addr[0], sk->af)))) { s->timeout = PFTM_PURGE; s->src.state = s->dst.state = TCPS_CLOSED; killed++; } } PF_HASHROW_UNLOCK(ih); } SLIST_FOREACH_SAFE(pfoe, &queue, next, pfoe1) free(pfoe, M_PFTEMP); if (V_pf_status.debug >= PF_DEBUG_MISC) printf("%s: %u states killed", __func__, killed); } /* * Can return locked on failure, so that we can consistently * allocate and insert a new one. */ struct pf_src_node * pf_find_src_node(struct pf_addr *src, struct pf_rule *rule, sa_family_t af, int returnlocked) { struct pf_srchash *sh; struct pf_src_node *n; V_pf_status.scounters[SCNT_SRC_NODE_SEARCH]++; sh = &V_pf_srchash[pf_hashsrc(src, af)]; PF_HASHROW_LOCK(sh); LIST_FOREACH(n, &sh->nodes, entry) if (n->rule.ptr == rule && n->af == af && ((af == AF_INET && n->addr.v4.s_addr == src->v4.s_addr) || (af == AF_INET6 && bcmp(&n->addr, src, sizeof(*src)) == 0))) break; if (n != NULL || returnlocked == 0) PF_HASHROW_UNLOCK(sh); return (n); } static int pf_insert_src_node(struct pf_src_node **sn, struct pf_rule *rule, struct pf_addr *src, sa_family_t af) { KASSERT((rule->rule_flag & PFRULE_RULESRCTRACK || rule->rpool.opts & PF_POOL_STICKYADDR), ("%s for non-tracking rule %p", __func__, rule)); if (*sn == NULL) *sn = pf_find_src_node(src, rule, af, 1); if (*sn == NULL) { struct pf_srchash *sh = &V_pf_srchash[pf_hashsrc(src, af)]; PF_HASHROW_ASSERT(sh); if (!rule->max_src_nodes || counter_u64_fetch(rule->src_nodes) < rule->max_src_nodes) (*sn) = uma_zalloc(V_pf_sources_z, M_NOWAIT | M_ZERO); else V_pf_status.lcounters[LCNT_SRCNODES]++; if ((*sn) == NULL) { PF_HASHROW_UNLOCK(sh); return (-1); } pf_init_threshold(&(*sn)->conn_rate, rule->max_src_conn_rate.limit, rule->max_src_conn_rate.seconds); (*sn)->af = af; (*sn)->rule.ptr = rule; PF_ACPY(&(*sn)->addr, src, af); LIST_INSERT_HEAD(&sh->nodes, *sn, entry); (*sn)->creation = time_uptime; (*sn)->ruletype = rule->action; if ((*sn)->rule.ptr != NULL) counter_u64_add((*sn)->rule.ptr->src_nodes, 1); PF_HASHROW_UNLOCK(sh); V_pf_status.scounters[SCNT_SRC_NODE_INSERT]++; V_pf_status.src_nodes++; } else { if (rule->max_src_states && (*sn)->states >= rule->max_src_states) { V_pf_status.lcounters[LCNT_SRCSTATES]++; return (-1); } } return (0); } void pf_unlink_src_node_locked(struct pf_src_node *src) { #ifdef INVARIANTS struct pf_srchash *sh; sh = &V_pf_srchash[pf_hashsrc(&src->addr, src->af)]; PF_HASHROW_ASSERT(sh); #endif LIST_REMOVE(src, entry); if (src->rule.ptr) counter_u64_add(src->rule.ptr->src_nodes, -1); V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++; V_pf_status.src_nodes--; } void pf_unlink_src_node(struct pf_src_node *src) { struct pf_srchash *sh; sh = &V_pf_srchash[pf_hashsrc(&src->addr, src->af)]; PF_HASHROW_LOCK(sh); pf_unlink_src_node_locked(src); PF_HASHROW_UNLOCK(sh); } static void pf_free_src_node(struct pf_src_node *sn) { KASSERT(sn->states == 0, ("%s: %p has refs", __func__, sn)); uma_zfree(V_pf_sources_z, sn); } u_int pf_free_src_nodes(struct pf_src_node_list *head) { struct pf_src_node *sn, *tmp; u_int count = 0; LIST_FOREACH_SAFE(sn, head, entry, tmp) { pf_free_src_node(sn); count++; } return (count); } /* Data storage structures initialization. */ void pf_initialize() { struct pf_keyhash *kh; struct pf_idhash *ih; struct pf_srchash *sh; u_int i; TUNABLE_ULONG_FETCH("net.pf.states_hashsize", &V_pf_hashsize); if (V_pf_hashsize == 0 || !powerof2(V_pf_hashsize)) V_pf_hashsize = PF_HASHSIZ; TUNABLE_ULONG_FETCH("net.pf.source_nodes_hashsize", &V_pf_srchashsize); if (V_pf_srchashsize == 0 || !powerof2(V_pf_srchashsize)) V_pf_srchashsize = PF_HASHSIZ / 4; V_pf_hashseed = arc4random(); /* States and state keys storage. */ V_pf_state_z = uma_zcreate("pf states", sizeof(struct pf_state), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); V_pf_limits[PF_LIMIT_STATES].zone = V_pf_state_z; uma_zone_set_max(V_pf_state_z, PFSTATE_HIWAT); uma_zone_set_warning(V_pf_state_z, "PF states limit reached"); V_pf_state_key_z = uma_zcreate("pf state keys", sizeof(struct pf_state_key), pf_state_key_ctor, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); V_pf_keyhash = malloc(V_pf_hashsize * sizeof(struct pf_keyhash), M_PFHASH, M_WAITOK | M_ZERO); V_pf_idhash = malloc(V_pf_hashsize * sizeof(struct pf_idhash), M_PFHASH, M_WAITOK | M_ZERO); V_pf_hashmask = V_pf_hashsize - 1; for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= V_pf_hashmask; i++, kh++, ih++) { mtx_init(&kh->lock, "pf_keyhash", NULL, MTX_DEF | MTX_DUPOK); mtx_init(&ih->lock, "pf_idhash", NULL, MTX_DEF); } /* Source nodes. */ V_pf_sources_z = uma_zcreate("pf source nodes", sizeof(struct pf_src_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); V_pf_limits[PF_LIMIT_SRC_NODES].zone = V_pf_sources_z; uma_zone_set_max(V_pf_sources_z, PFSNODE_HIWAT); uma_zone_set_warning(V_pf_sources_z, "PF source nodes limit reached"); V_pf_srchash = malloc(V_pf_srchashsize * sizeof(struct pf_srchash), M_PFHASH, M_WAITOK|M_ZERO); V_pf_srchashmask = V_pf_srchashsize - 1; for (i = 0, sh = V_pf_srchash; i <= V_pf_srchashmask; i++, sh++) mtx_init(&sh->lock, "pf_srchash", NULL, MTX_DEF); /* ALTQ */ TAILQ_INIT(&V_pf_altqs[0]); TAILQ_INIT(&V_pf_altqs[1]); TAILQ_INIT(&V_pf_pabuf); V_pf_altqs_active = &V_pf_altqs[0]; V_pf_altqs_inactive = &V_pf_altqs[1]; /* Mbuf tags */ V_pf_mtag_z = uma_zcreate("pf mtags", sizeof(struct m_tag) + sizeof(struct pf_mtag), NULL, NULL, pf_mtag_init, NULL, UMA_ALIGN_PTR, 0); /* Send & overload+flush queues. */ STAILQ_INIT(&V_pf_sendqueue); SLIST_INIT(&V_pf_overloadqueue); TASK_INIT(&V_pf_overloadtask, 0, pf_overload_task, &V_pf_overloadqueue); mtx_init(&pf_sendqueue_mtx, "pf send queue", NULL, MTX_DEF); mtx_init(&pf_overloadqueue_mtx, "pf overload/flush queue", NULL, MTX_DEF); /* Unlinked, but may be referenced rules. */ TAILQ_INIT(&V_pf_unlinked_rules); mtx_init(&pf_unlnkdrules_mtx, "pf unlinked rules", NULL, MTX_DEF); } void pf_cleanup() { struct pf_keyhash *kh; struct pf_idhash *ih; struct pf_srchash *sh; struct pf_send_entry *pfse, *next; u_int i; for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= V_pf_hashmask; i++, kh++, ih++) { KASSERT(LIST_EMPTY(&kh->keys), ("%s: key hash not empty", __func__)); KASSERT(LIST_EMPTY(&ih->states), ("%s: id hash not empty", __func__)); mtx_destroy(&kh->lock); mtx_destroy(&ih->lock); } free(V_pf_keyhash, M_PFHASH); free(V_pf_idhash, M_PFHASH); for (i = 0, sh = V_pf_srchash; i <= V_pf_srchashmask; i++, sh++) { KASSERT(LIST_EMPTY(&sh->nodes), ("%s: source node hash not empty", __func__)); mtx_destroy(&sh->lock); } free(V_pf_srchash, M_PFHASH); STAILQ_FOREACH_SAFE(pfse, &V_pf_sendqueue, pfse_next, next) { m_freem(pfse->pfse_m); free(pfse, M_PFTEMP); } mtx_destroy(&pf_sendqueue_mtx); mtx_destroy(&pf_overloadqueue_mtx); mtx_destroy(&pf_unlnkdrules_mtx); uma_zdestroy(V_pf_mtag_z); uma_zdestroy(V_pf_sources_z); uma_zdestroy(V_pf_state_z); uma_zdestroy(V_pf_state_key_z); } static int pf_mtag_init(void *mem, int size, int how) { struct m_tag *t; t = (struct m_tag *)mem; t->m_tag_cookie = MTAG_ABI_COMPAT; t->m_tag_id = PACKET_TAG_PF; t->m_tag_len = sizeof(struct pf_mtag); t->m_tag_free = pf_mtag_free; return (0); } static void pf_mtag_free(struct m_tag *t) { uma_zfree(V_pf_mtag_z, t); } struct pf_mtag * pf_get_mtag(struct mbuf *m) { struct m_tag *mtag; if ((mtag = m_tag_find(m, PACKET_TAG_PF, NULL)) != NULL) return ((struct pf_mtag *)(mtag + 1)); mtag = uma_zalloc(V_pf_mtag_z, M_NOWAIT); if (mtag == NULL) return (NULL); bzero(mtag + 1, sizeof(struct pf_mtag)); m_tag_prepend(m, mtag); return ((struct pf_mtag *)(mtag + 1)); } static int pf_state_key_attach(struct pf_state_key *skw, struct pf_state_key *sks, struct pf_state *s) { struct pf_keyhash *khs, *khw, *kh; struct pf_state_key *sk, *cur; struct pf_state *si, *olds = NULL; int idx; KASSERT(s->refs == 0, ("%s: state not pristine", __func__)); KASSERT(s->key[PF_SK_WIRE] == NULL, ("%s: state has key", __func__)); KASSERT(s->key[PF_SK_STACK] == NULL, ("%s: state has key", __func__)); /* * We need to lock hash slots of both keys. To avoid deadlock * we always lock the slot with lower address first. Unlock order * isn't important. * * We also need to lock ID hash slot before dropping key * locks. On success we return with ID hash slot locked. */ if (skw == sks) { khs = khw = &V_pf_keyhash[pf_hashkey(skw)]; PF_HASHROW_LOCK(khs); } else { khs = &V_pf_keyhash[pf_hashkey(sks)]; khw = &V_pf_keyhash[pf_hashkey(skw)]; if (khs == khw) { PF_HASHROW_LOCK(khs); } else if (khs < khw) { PF_HASHROW_LOCK(khs); PF_HASHROW_LOCK(khw); } else { PF_HASHROW_LOCK(khw); PF_HASHROW_LOCK(khs); } } #define KEYS_UNLOCK() do { \ if (khs != khw) { \ PF_HASHROW_UNLOCK(khs); \ PF_HASHROW_UNLOCK(khw); \ } else \ PF_HASHROW_UNLOCK(khs); \ } while (0) /* * First run: start with wire key. */ sk = skw; kh = khw; idx = PF_SK_WIRE; keyattach: LIST_FOREACH(cur, &kh->keys, entry) if (bcmp(cur, sk, sizeof(struct pf_state_key_cmp)) == 0) break; if (cur != NULL) { /* Key exists. Check for same kif, if none, add to key. */ TAILQ_FOREACH(si, &cur->states[idx], key_list[idx]) { struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(si)]; PF_HASHROW_LOCK(ih); if (si->kif == s->kif && si->direction == s->direction) { if (sk->proto == IPPROTO_TCP && si->src.state >= TCPS_FIN_WAIT_2 && si->dst.state >= TCPS_FIN_WAIT_2) { /* * New state matches an old >FIN_WAIT_2 * state. We can't drop key hash locks, * thus we can't unlink it properly. * * As a workaround we drop it into * TCPS_CLOSED state, schedule purge * ASAP and push it into the very end * of the slot TAILQ, so that it won't * conflict with our new state. */ si->src.state = si->dst.state = TCPS_CLOSED; si->timeout = PFTM_PURGE; olds = si; } else { if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pf: %s key attach " "failed on %s: ", (idx == PF_SK_WIRE) ? "wire" : "stack", s->kif->pfik_name); pf_print_state_parts(s, (idx == PF_SK_WIRE) ? sk : NULL, (idx == PF_SK_STACK) ? sk : NULL); printf(", existing: "); pf_print_state_parts(si, (idx == PF_SK_WIRE) ? sk : NULL, (idx == PF_SK_STACK) ? sk : NULL); printf("\n"); } PF_HASHROW_UNLOCK(ih); KEYS_UNLOCK(); uma_zfree(V_pf_state_key_z, sk); if (idx == PF_SK_STACK) pf_detach_state(s); return (EEXIST); /* collision! */ } } PF_HASHROW_UNLOCK(ih); } uma_zfree(V_pf_state_key_z, sk); s->key[idx] = cur; } else { LIST_INSERT_HEAD(&kh->keys, sk, entry); s->key[idx] = sk; } stateattach: /* List is sorted, if-bound states before floating. */ if (s->kif == V_pfi_all) TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], s, key_list[idx]); else TAILQ_INSERT_HEAD(&s->key[idx]->states[idx], s, key_list[idx]); if (olds) { TAILQ_REMOVE(&s->key[idx]->states[idx], olds, key_list[idx]); TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], olds, key_list[idx]); olds = NULL; } /* * Attach done. See how should we (or should not?) * attach a second key. */ if (sks == skw) { s->key[PF_SK_STACK] = s->key[PF_SK_WIRE]; idx = PF_SK_STACK; sks = NULL; goto stateattach; } else if (sks != NULL) { /* * Continue attaching with stack key. */ sk = sks; kh = khs; idx = PF_SK_STACK; sks = NULL; goto keyattach; } PF_STATE_LOCK(s); KEYS_UNLOCK(); KASSERT(s->key[PF_SK_WIRE] != NULL && s->key[PF_SK_STACK] != NULL, ("%s failure", __func__)); return (0); #undef KEYS_UNLOCK } static void pf_detach_state(struct pf_state *s) { struct pf_state_key *sks = s->key[PF_SK_STACK]; struct pf_keyhash *kh; if (sks != NULL) { kh = &V_pf_keyhash[pf_hashkey(sks)]; PF_HASHROW_LOCK(kh); if (s->key[PF_SK_STACK] != NULL) pf_state_key_detach(s, PF_SK_STACK); /* * If both point to same key, then we are done. */ if (sks == s->key[PF_SK_WIRE]) { pf_state_key_detach(s, PF_SK_WIRE); PF_HASHROW_UNLOCK(kh); return; } PF_HASHROW_UNLOCK(kh); } if (s->key[PF_SK_WIRE] != NULL) { kh = &V_pf_keyhash[pf_hashkey(s->key[PF_SK_WIRE])]; PF_HASHROW_LOCK(kh); if (s->key[PF_SK_WIRE] != NULL) pf_state_key_detach(s, PF_SK_WIRE); PF_HASHROW_UNLOCK(kh); } } static void pf_state_key_detach(struct pf_state *s, int idx) { struct pf_state_key *sk = s->key[idx]; #ifdef INVARIANTS struct pf_keyhash *kh = &V_pf_keyhash[pf_hashkey(sk)]; PF_HASHROW_ASSERT(kh); #endif TAILQ_REMOVE(&sk->states[idx], s, key_list[idx]); s->key[idx] = NULL; if (TAILQ_EMPTY(&sk->states[0]) && TAILQ_EMPTY(&sk->states[1])) { LIST_REMOVE(sk, entry); uma_zfree(V_pf_state_key_z, sk); } } static int pf_state_key_ctor(void *mem, int size, void *arg, int flags) { struct pf_state_key *sk = mem; bzero(sk, sizeof(struct pf_state_key_cmp)); TAILQ_INIT(&sk->states[PF_SK_WIRE]); TAILQ_INIT(&sk->states[PF_SK_STACK]); return (0); } struct pf_state_key * pf_state_key_setup(struct pf_pdesc *pd, struct pf_addr *saddr, struct pf_addr *daddr, u_int16_t sport, u_int16_t dport) { struct pf_state_key *sk; sk = uma_zalloc(V_pf_state_key_z, M_NOWAIT); if (sk == NULL) return (NULL); PF_ACPY(&sk->addr[pd->sidx], saddr, pd->af); PF_ACPY(&sk->addr[pd->didx], daddr, pd->af); sk->port[pd->sidx] = sport; sk->port[pd->didx] = dport; sk->proto = pd->proto; sk->af = pd->af; return (sk); } struct pf_state_key * pf_state_key_clone(struct pf_state_key *orig) { struct pf_state_key *sk; sk = uma_zalloc(V_pf_state_key_z, M_NOWAIT); if (sk == NULL) return (NULL); bcopy(orig, sk, sizeof(struct pf_state_key_cmp)); return (sk); } int pf_state_insert(struct pfi_kif *kif, struct pf_state_key *skw, struct pf_state_key *sks, struct pf_state *s) { struct pf_idhash *ih; struct pf_state *cur; int error; KASSERT(TAILQ_EMPTY(&sks->states[0]) && TAILQ_EMPTY(&sks->states[1]), ("%s: sks not pristine", __func__)); KASSERT(TAILQ_EMPTY(&skw->states[0]) && TAILQ_EMPTY(&skw->states[1]), ("%s: skw not pristine", __func__)); KASSERT(s->refs == 0, ("%s: state not pristine", __func__)); s->kif = kif; if (s->id == 0 && s->creatorid == 0) { /* XXX: should be atomic, but probability of collision low */ if ((s->id = V_pf_stateid[curcpu]++) == PFID_MAXID) V_pf_stateid[curcpu] = 1; s->id |= (uint64_t )curcpu << PFID_CPUSHIFT; s->id = htobe64(s->id); s->creatorid = V_pf_status.hostid; } /* Returns with ID locked on success. */ if ((error = pf_state_key_attach(skw, sks, s)) != 0) return (error); ih = &V_pf_idhash[PF_IDHASH(s)]; PF_HASHROW_ASSERT(ih); LIST_FOREACH(cur, &ih->states, entry) if (cur->id == s->id && cur->creatorid == s->creatorid) break; if (cur != NULL) { PF_HASHROW_UNLOCK(ih); if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pf: state ID collision: " "id: %016llx creatorid: %08x\n", (unsigned long long)be64toh(s->id), ntohl(s->creatorid)); } pf_detach_state(s); return (EEXIST); } LIST_INSERT_HEAD(&ih->states, s, entry); /* One for keys, one for ID hash. */ refcount_init(&s->refs, 2); V_pf_status.fcounters[FCNT_STATE_INSERT]++; if (pfsync_insert_state_ptr != NULL) pfsync_insert_state_ptr(s); /* Returns locked. */ return (0); } /* * Find state by ID: returns with locked row on success. */ struct pf_state * pf_find_state_byid(uint64_t id, uint32_t creatorid) { struct pf_idhash *ih; struct pf_state *s; V_pf_status.fcounters[FCNT_STATE_SEARCH]++; ih = &V_pf_idhash[(be64toh(id) % (V_pf_hashmask + 1))]; PF_HASHROW_LOCK(ih); LIST_FOREACH(s, &ih->states, entry) if (s->id == id && s->creatorid == creatorid) break; if (s == NULL) PF_HASHROW_UNLOCK(ih); return (s); } /* * Find state by key. * Returns with ID hash slot locked on success. */ static struct pf_state * pf_find_state(struct pfi_kif *kif, struct pf_state_key_cmp *key, u_int dir) { struct pf_keyhash *kh; struct pf_state_key *sk; struct pf_state *s; int idx; V_pf_status.fcounters[FCNT_STATE_SEARCH]++; kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)]; PF_HASHROW_LOCK(kh); LIST_FOREACH(sk, &kh->keys, entry) if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0) break; if (sk == NULL) { PF_HASHROW_UNLOCK(kh); return (NULL); } idx = (dir == PF_IN ? PF_SK_WIRE : PF_SK_STACK); /* List is sorted, if-bound states before floating ones. */ TAILQ_FOREACH(s, &sk->states[idx], key_list[idx]) if (s->kif == V_pfi_all || s->kif == kif) { PF_STATE_LOCK(s); PF_HASHROW_UNLOCK(kh); if (s->timeout >= PFTM_MAX) { /* * State is either being processed by * pf_unlink_state() in an other thread, or * is scheduled for immediate expiry. */ PF_STATE_UNLOCK(s); return (NULL); } return (s); } PF_HASHROW_UNLOCK(kh); return (NULL); } struct pf_state * pf_find_state_all(struct pf_state_key_cmp *key, u_int dir, int *more) { struct pf_keyhash *kh; struct pf_state_key *sk; struct pf_state *s, *ret = NULL; int idx, inout = 0; V_pf_status.fcounters[FCNT_STATE_SEARCH]++; kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)]; PF_HASHROW_LOCK(kh); LIST_FOREACH(sk, &kh->keys, entry) if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0) break; if (sk == NULL) { PF_HASHROW_UNLOCK(kh); return (NULL); } switch (dir) { case PF_IN: idx = PF_SK_WIRE; break; case PF_OUT: idx = PF_SK_STACK; break; case PF_INOUT: idx = PF_SK_WIRE; inout = 1; break; default: panic("%s: dir %u", __func__, dir); } second_run: TAILQ_FOREACH(s, &sk->states[idx], key_list[idx]) { if (more == NULL) { PF_HASHROW_UNLOCK(kh); return (s); } if (ret) (*more)++; else ret = s; } if (inout == 1) { inout = 0; idx = PF_SK_STACK; goto second_run; } PF_HASHROW_UNLOCK(kh); return (ret); } /* END state table stuff */ static void pf_send(struct pf_send_entry *pfse) { PF_SENDQ_LOCK(); STAILQ_INSERT_TAIL(&V_pf_sendqueue, pfse, pfse_next); PF_SENDQ_UNLOCK(); swi_sched(V_pf_swi_cookie, 0); } void pf_intr(void *v) { struct pf_send_head queue; struct pf_send_entry *pfse, *next; CURVNET_SET((struct vnet *)v); PF_SENDQ_LOCK(); queue = V_pf_sendqueue; STAILQ_INIT(&V_pf_sendqueue); PF_SENDQ_UNLOCK(); STAILQ_FOREACH_SAFE(pfse, &queue, pfse_next, next) { switch (pfse->pfse_type) { #ifdef INET case PFSE_IP: ip_output(pfse->pfse_m, NULL, NULL, 0, NULL, NULL); break; case PFSE_ICMP: icmp_error(pfse->pfse_m, pfse->pfse_icmp_type, pfse->pfse_icmp_code, 0, pfse->pfse_icmp_mtu); break; #endif /* INET */ #ifdef INET6 case PFSE_IP6: ip6_output(pfse->pfse_m, NULL, NULL, 0, NULL, NULL, NULL); break; case PFSE_ICMP6: icmp6_error(pfse->pfse_m, pfse->pfse_icmp_type, pfse->pfse_icmp_code, pfse->pfse_icmp_mtu); break; #endif /* INET6 */ default: panic("%s: unknown type", __func__); } free(pfse, M_PFTEMP); } CURVNET_RESTORE(); } void pf_purge_thread(void *v) { u_int idx = 0; CURVNET_SET((struct vnet *)v); for (;;) { PF_RULES_RLOCK(); rw_sleep(pf_purge_thread, &pf_rules_lock, 0, "pftm", hz / 10); if (V_pf_end_threads) { /* * To cleanse up all kifs and rules we need * two runs: first one clears reference flags, * then pf_purge_expired_states() doesn't * raise them, and then second run frees. */ PF_RULES_RUNLOCK(); pf_purge_unlinked_rules(); pfi_kif_purge(); /* * Now purge everything. */ pf_purge_expired_states(0, V_pf_hashmask); pf_purge_expired_fragments(); pf_purge_expired_src_nodes(); /* * Now all kifs & rules should be unreferenced, * thus should be successfully freed. */ pf_purge_unlinked_rules(); pfi_kif_purge(); /* * Announce success and exit. */ PF_RULES_RLOCK(); V_pf_end_threads++; PF_RULES_RUNLOCK(); wakeup(pf_purge_thread); kproc_exit(0); } PF_RULES_RUNLOCK(); /* Process 1/interval fraction of the state table every run. */ idx = pf_purge_expired_states(idx, V_pf_hashmask / (V_pf_default_rule.timeout[PFTM_INTERVAL] * 10)); /* Purge other expired types every PFTM_INTERVAL seconds. */ if (idx == 0) { /* * Order is important: * - states and src nodes reference rules * - states and rules reference kifs */ pf_purge_expired_fragments(); pf_purge_expired_src_nodes(); pf_purge_unlinked_rules(); pfi_kif_purge(); } } /* not reached */ CURVNET_RESTORE(); } u_int32_t pf_state_expires(const struct pf_state *state) { u_int32_t timeout; u_int32_t start; u_int32_t end; u_int32_t states; /* handle all PFTM_* > PFTM_MAX here */ if (state->timeout == PFTM_PURGE) return (time_uptime); KASSERT(state->timeout != PFTM_UNLINKED, ("pf_state_expires: timeout == PFTM_UNLINKED")); KASSERT((state->timeout < PFTM_MAX), ("pf_state_expires: timeout > PFTM_MAX")); timeout = state->rule.ptr->timeout[state->timeout]; if (!timeout) timeout = V_pf_default_rule.timeout[state->timeout]; start = state->rule.ptr->timeout[PFTM_ADAPTIVE_START]; if (start) { end = state->rule.ptr->timeout[PFTM_ADAPTIVE_END]; states = counter_u64_fetch(state->rule.ptr->states_cur); } else { start = V_pf_default_rule.timeout[PFTM_ADAPTIVE_START]; end = V_pf_default_rule.timeout[PFTM_ADAPTIVE_END]; states = V_pf_status.states; } if (end && states > start && start < end) { if (states < end) return (state->expire + timeout * (end - states) / (end - start)); else return (time_uptime); } return (state->expire + timeout); } void pf_purge_expired_src_nodes() { struct pf_src_node_list freelist; struct pf_srchash *sh; struct pf_src_node *cur, *next; int i; LIST_INIT(&freelist); for (i = 0, sh = V_pf_srchash; i <= V_pf_srchashmask; i++, sh++) { PF_HASHROW_LOCK(sh); LIST_FOREACH_SAFE(cur, &sh->nodes, entry, next) if (cur->states == 0 && cur->expire <= time_uptime) { pf_unlink_src_node_locked(cur); LIST_INSERT_HEAD(&freelist, cur, entry); } else if (cur->rule.ptr != NULL) cur->rule.ptr->rule_flag |= PFRULE_REFS; PF_HASHROW_UNLOCK(sh); } pf_free_src_nodes(&freelist); } static void pf_src_tree_remove_state(struct pf_state *s) { u_int32_t timeout; if (s->src_node != NULL) { if (s->src.tcp_est) --s->src_node->conn; if (--s->src_node->states == 0) { timeout = s->rule.ptr->timeout[PFTM_SRC_NODE]; if (!timeout) timeout = V_pf_default_rule.timeout[PFTM_SRC_NODE]; s->src_node->expire = time_uptime + timeout; } } if (s->nat_src_node != s->src_node && s->nat_src_node != NULL) { if (--s->nat_src_node->states == 0) { timeout = s->rule.ptr->timeout[PFTM_SRC_NODE]; if (!timeout) timeout = V_pf_default_rule.timeout[PFTM_SRC_NODE]; s->nat_src_node->expire = time_uptime + timeout; } } s->src_node = s->nat_src_node = NULL; } /* * Unlink and potentilly free a state. Function may be * called with ID hash row locked, but always returns * unlocked, since it needs to go through key hash locking. */ int pf_unlink_state(struct pf_state *s, u_int flags) { struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(s)]; if ((flags & PF_ENTER_LOCKED) == 0) PF_HASHROW_LOCK(ih); else PF_HASHROW_ASSERT(ih); if (s->timeout == PFTM_UNLINKED) { /* * State is being processed * by pf_unlink_state() in * an other thread. */ PF_HASHROW_UNLOCK(ih); return (0); /* XXXGL: undefined actually */ } if (s->src.state == PF_TCPS_PROXY_DST) { /* XXX wire key the right one? */ pf_send_tcp(NULL, s->rule.ptr, s->key[PF_SK_WIRE]->af, &s->key[PF_SK_WIRE]->addr[1], &s->key[PF_SK_WIRE]->addr[0], s->key[PF_SK_WIRE]->port[1], s->key[PF_SK_WIRE]->port[0], s->src.seqhi, s->src.seqlo + 1, TH_RST|TH_ACK, 0, 0, 0, 1, s->tag, NULL); } LIST_REMOVE(s, entry); pf_src_tree_remove_state(s); if (pfsync_delete_state_ptr != NULL) pfsync_delete_state_ptr(s); STATE_DEC_COUNTERS(s); s->timeout = PFTM_UNLINKED; PF_HASHROW_UNLOCK(ih); pf_detach_state(s); refcount_release(&s->refs); return (pf_release_state(s)); } void pf_free_state(struct pf_state *cur) { KASSERT(cur->refs == 0, ("%s: %p has refs", __func__, cur)); KASSERT(cur->timeout == PFTM_UNLINKED, ("%s: timeout %u", __func__, cur->timeout)); pf_normalize_tcp_cleanup(cur); uma_zfree(V_pf_state_z, cur); V_pf_status.fcounters[FCNT_STATE_REMOVALS]++; } /* * Called only from pf_purge_thread(), thus serialized. */ static u_int pf_purge_expired_states(u_int i, int maxcheck) { struct pf_idhash *ih; struct pf_state *s; V_pf_status.states = uma_zone_get_cur(V_pf_state_z); /* * Go through hash and unlink states that expire now. */ while (maxcheck > 0) { ih = &V_pf_idhash[i]; relock: PF_HASHROW_LOCK(ih); LIST_FOREACH(s, &ih->states, entry) { if (pf_state_expires(s) <= time_uptime) { V_pf_status.states -= pf_unlink_state(s, PF_ENTER_LOCKED); goto relock; } s->rule.ptr->rule_flag |= PFRULE_REFS; if (s->nat_rule.ptr != NULL) s->nat_rule.ptr->rule_flag |= PFRULE_REFS; if (s->anchor.ptr != NULL) s->anchor.ptr->rule_flag |= PFRULE_REFS; s->kif->pfik_flags |= PFI_IFLAG_REFS; if (s->rt_kif) s->rt_kif->pfik_flags |= PFI_IFLAG_REFS; } PF_HASHROW_UNLOCK(ih); /* Return when we hit end of hash. */ if (++i > V_pf_hashmask) { V_pf_status.states = uma_zone_get_cur(V_pf_state_z); return (0); } maxcheck--; } V_pf_status.states = uma_zone_get_cur(V_pf_state_z); return (i); } static void pf_purge_unlinked_rules() { struct pf_rulequeue tmpq; struct pf_rule *r, *r1; /* * If we have overloading task pending, then we'd * better skip purging this time. There is a tiny * probability that overloading task references * an already unlinked rule. */ PF_OVERLOADQ_LOCK(); if (!SLIST_EMPTY(&V_pf_overloadqueue)) { PF_OVERLOADQ_UNLOCK(); return; } PF_OVERLOADQ_UNLOCK(); /* * Do naive mark-and-sweep garbage collecting of old rules. * Reference flag is raised by pf_purge_expired_states() * and pf_purge_expired_src_nodes(). * * To avoid LOR between PF_UNLNKDRULES_LOCK/PF_RULES_WLOCK, * use a temporary queue. */ TAILQ_INIT(&tmpq); PF_UNLNKDRULES_LOCK(); TAILQ_FOREACH_SAFE(r, &V_pf_unlinked_rules, entries, r1) { if (!(r->rule_flag & PFRULE_REFS)) { TAILQ_REMOVE(&V_pf_unlinked_rules, r, entries); TAILQ_INSERT_TAIL(&tmpq, r, entries); } else r->rule_flag &= ~PFRULE_REFS; } PF_UNLNKDRULES_UNLOCK(); if (!TAILQ_EMPTY(&tmpq)) { PF_RULES_WLOCK(); TAILQ_FOREACH_SAFE(r, &tmpq, entries, r1) { TAILQ_REMOVE(&tmpq, r, entries); pf_free_rule(r); } PF_RULES_WUNLOCK(); } } void pf_print_host(struct pf_addr *addr, u_int16_t p, sa_family_t af) { switch (af) { #ifdef INET case AF_INET: { u_int32_t a = ntohl(addr->addr32[0]); printf("%u.%u.%u.%u", (a>>24)&255, (a>>16)&255, (a>>8)&255, a&255); if (p) { p = ntohs(p); printf(":%u", p); } break; } #endif /* INET */ #ifdef INET6 case AF_INET6: { u_int16_t b; u_int8_t i, curstart, curend, maxstart, maxend; curstart = curend = maxstart = maxend = 255; for (i = 0; i < 8; i++) { if (!addr->addr16[i]) { if (curstart == 255) curstart = i; curend = i; } else { if ((curend - curstart) > (maxend - maxstart)) { maxstart = curstart; maxend = curend; } curstart = curend = 255; } } if ((curend - curstart) > (maxend - maxstart)) { maxstart = curstart; maxend = curend; } for (i = 0; i < 8; i++) { if (i >= maxstart && i <= maxend) { if (i == 0) printf(":"); if (i == maxend) printf(":"); } else { b = ntohs(addr->addr16[i]); printf("%x", b); if (i < 7) printf(":"); } } if (p) { p = ntohs(p); printf("[%u]", p); } break; } #endif /* INET6 */ } } void pf_print_state(struct pf_state *s) { pf_print_state_parts(s, NULL, NULL); } static void pf_print_state_parts(struct pf_state *s, struct pf_state_key *skwp, struct pf_state_key *sksp) { struct pf_state_key *skw, *sks; u_int8_t proto, dir; /* Do our best to fill these, but they're skipped if NULL */ skw = skwp ? skwp : (s ? s->key[PF_SK_WIRE] : NULL); sks = sksp ? sksp : (s ? s->key[PF_SK_STACK] : NULL); proto = skw ? skw->proto : (sks ? sks->proto : 0); dir = s ? s->direction : 0; switch (proto) { case IPPROTO_IPV4: printf("IPv4"); break; case IPPROTO_IPV6: printf("IPv6"); break; case IPPROTO_TCP: printf("TCP"); break; case IPPROTO_UDP: printf("UDP"); break; case IPPROTO_ICMP: printf("ICMP"); break; case IPPROTO_ICMPV6: printf("ICMPv6"); break; default: printf("%u", skw->proto); break; } switch (dir) { case PF_IN: printf(" in"); break; case PF_OUT: printf(" out"); break; } if (skw) { printf(" wire: "); pf_print_host(&skw->addr[0], skw->port[0], skw->af); printf(" "); pf_print_host(&skw->addr[1], skw->port[1], skw->af); } if (sks) { printf(" stack: "); if (sks != skw) { pf_print_host(&sks->addr[0], sks->port[0], sks->af); printf(" "); pf_print_host(&sks->addr[1], sks->port[1], sks->af); } else printf("-"); } if (s) { if (proto == IPPROTO_TCP) { printf(" [lo=%u high=%u win=%u modulator=%u", s->src.seqlo, s->src.seqhi, s->src.max_win, s->src.seqdiff); if (s->src.wscale && s->dst.wscale) printf(" wscale=%u", s->src.wscale & PF_WSCALE_MASK); printf("]"); printf(" [lo=%u high=%u win=%u modulator=%u", s->dst.seqlo, s->dst.seqhi, s->dst.max_win, s->dst.seqdiff); if (s->src.wscale && s->dst.wscale) printf(" wscale=%u", s->dst.wscale & PF_WSCALE_MASK); printf("]"); } printf(" %u:%u", s->src.state, s->dst.state); } } void pf_print_flags(u_int8_t f) { if (f) printf(" "); if (f & TH_FIN) printf("F"); if (f & TH_SYN) printf("S"); if (f & TH_RST) printf("R"); if (f & TH_PUSH) printf("P"); if (f & TH_ACK) printf("A"); if (f & TH_URG) printf("U"); if (f & TH_ECE) printf("E"); if (f & TH_CWR) printf("W"); } #define PF_SET_SKIP_STEPS(i) \ do { \ while (head[i] != cur) { \ head[i]->skip[i].ptr = cur; \ head[i] = TAILQ_NEXT(head[i], entries); \ } \ } while (0) void pf_calc_skip_steps(struct pf_rulequeue *rules) { struct pf_rule *cur, *prev, *head[PF_SKIP_COUNT]; int i; cur = TAILQ_FIRST(rules); prev = cur; for (i = 0; i < PF_SKIP_COUNT; ++i) head[i] = cur; while (cur != NULL) { if (cur->kif != prev->kif || cur->ifnot != prev->ifnot) PF_SET_SKIP_STEPS(PF_SKIP_IFP); if (cur->direction != prev->direction) PF_SET_SKIP_STEPS(PF_SKIP_DIR); if (cur->af != prev->af) PF_SET_SKIP_STEPS(PF_SKIP_AF); if (cur->proto != prev->proto) PF_SET_SKIP_STEPS(PF_SKIP_PROTO); if (cur->src.neg != prev->src.neg || pf_addr_wrap_neq(&cur->src.addr, &prev->src.addr)) PF_SET_SKIP_STEPS(PF_SKIP_SRC_ADDR); if (cur->src.port[0] != prev->src.port[0] || cur->src.port[1] != prev->src.port[1] || cur->src.port_op != prev->src.port_op) PF_SET_SKIP_STEPS(PF_SKIP_SRC_PORT); if (cur->dst.neg != prev->dst.neg || pf_addr_wrap_neq(&cur->dst.addr, &prev->dst.addr)) PF_SET_SKIP_STEPS(PF_SKIP_DST_ADDR); if (cur->dst.port[0] != prev->dst.port[0] || cur->dst.port[1] != prev->dst.port[1] || cur->dst.port_op != prev->dst.port_op) PF_SET_SKIP_STEPS(PF_SKIP_DST_PORT); prev = cur; cur = TAILQ_NEXT(cur, entries); } for (i = 0; i < PF_SKIP_COUNT; ++i) PF_SET_SKIP_STEPS(i); } static int pf_addr_wrap_neq(struct pf_addr_wrap *aw1, struct pf_addr_wrap *aw2) { if (aw1->type != aw2->type) return (1); switch (aw1->type) { case PF_ADDR_ADDRMASK: case PF_ADDR_RANGE: if (PF_ANEQ(&aw1->v.a.addr, &aw2->v.a.addr, 0)) return (1); if (PF_ANEQ(&aw1->v.a.mask, &aw2->v.a.mask, 0)) return (1); return (0); case PF_ADDR_DYNIFTL: return (aw1->p.dyn->pfid_kt != aw2->p.dyn->pfid_kt); case PF_ADDR_NOROUTE: case PF_ADDR_URPFFAILED: return (0); case PF_ADDR_TABLE: return (aw1->p.tbl != aw2->p.tbl); default: printf("invalid address type: %d\n", aw1->type); return (1); } } u_int16_t pf_cksum_fixup(u_int16_t cksum, u_int16_t old, u_int16_t new, u_int8_t udp) { u_int32_t l; if (udp && !cksum) return (0x0000); l = cksum + old - new; l = (l >> 16) + (l & 65535); l = l & 65535; if (udp && !l) return (0xFFFF); return (l); } static void pf_change_ap(struct pf_addr *a, u_int16_t *p, u_int16_t *ic, u_int16_t *pc, struct pf_addr *an, u_int16_t pn, u_int8_t u, sa_family_t af) { struct pf_addr ao; u_int16_t po = *p; PF_ACPY(&ao, a, af); PF_ACPY(a, an, af); *p = pn; switch (af) { #ifdef INET case AF_INET: *ic = pf_cksum_fixup(pf_cksum_fixup(*ic, ao.addr16[0], an->addr16[0], 0), ao.addr16[1], an->addr16[1], 0); *p = pn; *pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pc, ao.addr16[0], an->addr16[0], u), ao.addr16[1], an->addr16[1], u), po, pn, u); break; #endif /* INET */ #ifdef INET6 case AF_INET6: *pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pc, ao.addr16[0], an->addr16[0], u), ao.addr16[1], an->addr16[1], u), ao.addr16[2], an->addr16[2], u), ao.addr16[3], an->addr16[3], u), ao.addr16[4], an->addr16[4], u), ao.addr16[5], an->addr16[5], u), ao.addr16[6], an->addr16[6], u), ao.addr16[7], an->addr16[7], u), po, pn, u); break; #endif /* INET6 */ } } /* Changes a u_int32_t. Uses a void * so there are no align restrictions */ void pf_change_a(void *a, u_int16_t *c, u_int32_t an, u_int8_t u) { u_int32_t ao; memcpy(&ao, a, sizeof(ao)); memcpy(a, &an, sizeof(u_int32_t)); *c = pf_cksum_fixup(pf_cksum_fixup(*c, ao / 65536, an / 65536, u), ao % 65536, an % 65536, u); } #ifdef INET6 static void pf_change_a6(struct pf_addr *a, u_int16_t *c, struct pf_addr *an, u_int8_t u) { struct pf_addr ao; PF_ACPY(&ao, a, AF_INET6); PF_ACPY(a, an, AF_INET6); *c = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(*c, ao.addr16[0], an->addr16[0], u), ao.addr16[1], an->addr16[1], u), ao.addr16[2], an->addr16[2], u), ao.addr16[3], an->addr16[3], u), ao.addr16[4], an->addr16[4], u), ao.addr16[5], an->addr16[5], u), ao.addr16[6], an->addr16[6], u), ao.addr16[7], an->addr16[7], u); } #endif /* INET6 */ static void pf_change_icmp(struct pf_addr *ia, u_int16_t *ip, struct pf_addr *oa, struct pf_addr *na, u_int16_t np, u_int16_t *pc, u_int16_t *h2c, u_int16_t *ic, u_int16_t *hc, u_int8_t u, sa_family_t af) { struct pf_addr oia, ooa; PF_ACPY(&oia, ia, af); if (oa) PF_ACPY(&ooa, oa, af); /* Change inner protocol port, fix inner protocol checksum. */ if (ip != NULL) { u_int16_t oip = *ip; u_int32_t opc; if (pc != NULL) opc = *pc; *ip = np; if (pc != NULL) *pc = pf_cksum_fixup(*pc, oip, *ip, u); *ic = pf_cksum_fixup(*ic, oip, *ip, 0); if (pc != NULL) *ic = pf_cksum_fixup(*ic, opc, *pc, 0); } /* Change inner ip address, fix inner ip and icmp checksums. */ PF_ACPY(ia, na, af); switch (af) { #ifdef INET case AF_INET: { u_int32_t oh2c = *h2c; *h2c = pf_cksum_fixup(pf_cksum_fixup(*h2c, oia.addr16[0], ia->addr16[0], 0), oia.addr16[1], ia->addr16[1], 0); *ic = pf_cksum_fixup(pf_cksum_fixup(*ic, oia.addr16[0], ia->addr16[0], 0), oia.addr16[1], ia->addr16[1], 0); *ic = pf_cksum_fixup(*ic, oh2c, *h2c, 0); break; } #endif /* INET */ #ifdef INET6 case AF_INET6: *ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(*ic, oia.addr16[0], ia->addr16[0], u), oia.addr16[1], ia->addr16[1], u), oia.addr16[2], ia->addr16[2], u), oia.addr16[3], ia->addr16[3], u), oia.addr16[4], ia->addr16[4], u), oia.addr16[5], ia->addr16[5], u), oia.addr16[6], ia->addr16[6], u), oia.addr16[7], ia->addr16[7], u); break; #endif /* INET6 */ } /* Outer ip address, fix outer ip or icmpv6 checksum, if necessary. */ if (oa) { PF_ACPY(oa, na, af); switch (af) { #ifdef INET case AF_INET: *hc = pf_cksum_fixup(pf_cksum_fixup(*hc, ooa.addr16[0], oa->addr16[0], 0), ooa.addr16[1], oa->addr16[1], 0); break; #endif /* INET */ #ifdef INET6 case AF_INET6: *ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup( pf_cksum_fixup(pf_cksum_fixup(*ic, ooa.addr16[0], oa->addr16[0], u), ooa.addr16[1], oa->addr16[1], u), ooa.addr16[2], oa->addr16[2], u), ooa.addr16[3], oa->addr16[3], u), ooa.addr16[4], oa->addr16[4], u), ooa.addr16[5], oa->addr16[5], u), ooa.addr16[6], oa->addr16[6], u), ooa.addr16[7], oa->addr16[7], u); break; #endif /* INET6 */ } } } /* * Need to modulate the sequence numbers in the TCP SACK option * (credits to Krzysztof Pfaff for report and patch) */ static int pf_modulate_sack(struct mbuf *m, int off, struct pf_pdesc *pd, struct tcphdr *th, struct pf_state_peer *dst) { int hlen = (th->th_off << 2) - sizeof(*th), thoptlen = hlen; u_int8_t opts[TCP_MAXOLEN], *opt = opts; int copyback = 0, i, olen; struct sackblk sack; #define TCPOLEN_SACKLEN (TCPOLEN_SACK + 2) if (hlen < TCPOLEN_SACKLEN || !pf_pull_hdr(m, off + sizeof(*th), opts, hlen, NULL, NULL, pd->af)) return 0; while (hlen >= TCPOLEN_SACKLEN) { olen = opt[1]; switch (*opt) { case TCPOPT_EOL: /* FALLTHROUGH */ case TCPOPT_NOP: opt++; hlen--; break; case TCPOPT_SACK: if (olen > hlen) olen = hlen; if (olen >= TCPOLEN_SACKLEN) { for (i = 2; i + TCPOLEN_SACK <= olen; i += TCPOLEN_SACK) { memcpy(&sack, &opt[i], sizeof(sack)); pf_change_a(&sack.start, &th->th_sum, htonl(ntohl(sack.start) - dst->seqdiff), 0); pf_change_a(&sack.end, &th->th_sum, htonl(ntohl(sack.end) - dst->seqdiff), 0); memcpy(&opt[i], &sack, sizeof(sack)); } copyback = 1; } /* FALLTHROUGH */ default: if (olen < 2) olen = 2; hlen -= olen; opt += olen; } } if (copyback) m_copyback(m, off + sizeof(*th), thoptlen, (caddr_t)opts); return (copyback); } static void pf_send_tcp(struct mbuf *replyto, const struct pf_rule *r, sa_family_t af, const struct pf_addr *saddr, const struct pf_addr *daddr, u_int16_t sport, u_int16_t dport, u_int32_t seq, u_int32_t ack, u_int8_t flags, u_int16_t win, u_int16_t mss, u_int8_t ttl, int tag, u_int16_t rtag, struct ifnet *ifp) { struct pf_send_entry *pfse; struct mbuf *m; int len, tlen; #ifdef INET struct ip *h = NULL; #endif /* INET */ #ifdef INET6 struct ip6_hdr *h6 = NULL; #endif /* INET6 */ struct tcphdr *th; char *opt; struct pf_mtag *pf_mtag; len = 0; th = NULL; /* maximum segment size tcp option */ tlen = sizeof(struct tcphdr); if (mss) tlen += 4; switch (af) { #ifdef INET case AF_INET: len = sizeof(struct ip) + tlen; break; #endif /* INET */ #ifdef INET6 case AF_INET6: len = sizeof(struct ip6_hdr) + tlen; break; #endif /* INET6 */ default: panic("%s: unsupported af %d", __func__, af); } /* Allocate outgoing queue entry, mbuf and mbuf tag. */ pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT); if (pfse == NULL) return; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { free(pfse, M_PFTEMP); return; } #ifdef MAC mac_netinet_firewall_send(m); #endif if ((pf_mtag = pf_get_mtag(m)) == NULL) { free(pfse, M_PFTEMP); m_freem(m); return; } if (tag) m->m_flags |= M_SKIP_FIREWALL; pf_mtag->tag = rtag; if (r != NULL && r->rtableid >= 0) M_SETFIB(m, r->rtableid); #ifdef ALTQ if (r != NULL && r->qid) { pf_mtag->qid = r->qid; /* add hints for ecn */ pf_mtag->hdr = mtod(m, struct ip *); } #endif /* ALTQ */ m->m_data += max_linkhdr; m->m_pkthdr.len = m->m_len = len; m->m_pkthdr.rcvif = NULL; bzero(m->m_data, len); switch (af) { #ifdef INET case AF_INET: h = mtod(m, struct ip *); /* IP header fields included in the TCP checksum */ h->ip_p = IPPROTO_TCP; h->ip_len = htons(tlen); h->ip_src.s_addr = saddr->v4.s_addr; h->ip_dst.s_addr = daddr->v4.s_addr; th = (struct tcphdr *)((caddr_t)h + sizeof(struct ip)); break; #endif /* INET */ #ifdef INET6 case AF_INET6: h6 = mtod(m, struct ip6_hdr *); /* IP header fields included in the TCP checksum */ h6->ip6_nxt = IPPROTO_TCP; h6->ip6_plen = htons(tlen); memcpy(&h6->ip6_src, &saddr->v6, sizeof(struct in6_addr)); memcpy(&h6->ip6_dst, &daddr->v6, sizeof(struct in6_addr)); th = (struct tcphdr *)((caddr_t)h6 + sizeof(struct ip6_hdr)); break; #endif /* INET6 */ } /* TCP header */ th->th_sport = sport; th->th_dport = dport; th->th_seq = htonl(seq); th->th_ack = htonl(ack); th->th_off = tlen >> 2; th->th_flags = flags; th->th_win = htons(win); if (mss) { opt = (char *)(th + 1); opt[0] = TCPOPT_MAXSEG; opt[1] = 4; HTONS(mss); bcopy((caddr_t)&mss, (caddr_t)(opt + 2), 2); } switch (af) { #ifdef INET case AF_INET: /* TCP checksum */ th->th_sum = in_cksum(m, len); /* Finish the IP header */ h->ip_v = 4; h->ip_hl = sizeof(*h) >> 2; h->ip_tos = IPTOS_LOWDELAY; h->ip_off = htons(V_path_mtu_discovery ? IP_DF : 0); h->ip_len = htons(len); h->ip_ttl = ttl ? ttl : V_ip_defttl; h->ip_sum = 0; pfse->pfse_type = PFSE_IP; break; #endif /* INET */ #ifdef INET6 case AF_INET6: /* TCP checksum */ th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), tlen); h6->ip6_vfc |= IPV6_VERSION; h6->ip6_hlim = IPV6_DEFHLIM; pfse->pfse_type = PFSE_IP6; break; #endif /* INET6 */ } pfse->pfse_m = m; pf_send(pfse); } static void pf_send_icmp(struct mbuf *m, u_int8_t type, u_int8_t code, sa_family_t af, struct pf_rule *r) { struct pf_send_entry *pfse; struct mbuf *m0; struct pf_mtag *pf_mtag; /* Allocate outgoing queue entry, mbuf and mbuf tag. */ pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT); if (pfse == NULL) return; if ((m0 = m_copypacket(m, M_NOWAIT)) == NULL) { free(pfse, M_PFTEMP); return; } if ((pf_mtag = pf_get_mtag(m0)) == NULL) { free(pfse, M_PFTEMP); return; } /* XXX: revisit */ m0->m_flags |= M_SKIP_FIREWALL; if (r->rtableid >= 0) M_SETFIB(m0, r->rtableid); #ifdef ALTQ if (r->qid) { pf_mtag->qid = r->qid; /* add hints for ecn */ pf_mtag->hdr = mtod(m0, struct ip *); } #endif /* ALTQ */ switch (af) { #ifdef INET case AF_INET: pfse->pfse_type = PFSE_ICMP; break; #endif /* INET */ #ifdef INET6 case AF_INET6: pfse->pfse_type = PFSE_ICMP6; break; #endif /* INET6 */ } pfse->pfse_m = m0; pfse->pfse_icmp_type = type; pfse->pfse_icmp_code = code; pf_send(pfse); } /* * Return 1 if the addresses a and b match (with mask m), otherwise return 0. * If n is 0, they match if they are equal. If n is != 0, they match if they * are different. */ int pf_match_addr(u_int8_t n, struct pf_addr *a, struct pf_addr *m, struct pf_addr *b, sa_family_t af) { int match = 0; switch (af) { #ifdef INET case AF_INET: if ((a->addr32[0] & m->addr32[0]) == (b->addr32[0] & m->addr32[0])) match++; break; #endif /* INET */ #ifdef INET6 case AF_INET6: if (((a->addr32[0] & m->addr32[0]) == (b->addr32[0] & m->addr32[0])) && ((a->addr32[1] & m->addr32[1]) == (b->addr32[1] & m->addr32[1])) && ((a->addr32[2] & m->addr32[2]) == (b->addr32[2] & m->addr32[2])) && ((a->addr32[3] & m->addr32[3]) == (b->addr32[3] & m->addr32[3]))) match++; break; #endif /* INET6 */ } if (match) { if (n) return (0); else return (1); } else { if (n) return (1); else return (0); } } /* * Return 1 if b <= a <= e, otherwise return 0. */ int pf_match_addr_range(struct pf_addr *b, struct pf_addr *e, struct pf_addr *a, sa_family_t af) { switch (af) { #ifdef INET case AF_INET: if ((a->addr32[0] < b->addr32[0]) || (a->addr32[0] > e->addr32[0])) return (0); break; #endif /* INET */ #ifdef INET6 case AF_INET6: { int i; /* check a >= b */ for (i = 0; i < 4; ++i) if (a->addr32[i] > b->addr32[i]) break; else if (a->addr32[i] < b->addr32[i]) return (0); /* check a <= e */ for (i = 0; i < 4; ++i) if (a->addr32[i] < e->addr32[i]) break; else if (a->addr32[i] > e->addr32[i]) return (0); break; } #endif /* INET6 */ } return (1); } static int pf_match(u_int8_t op, u_int32_t a1, u_int32_t a2, u_int32_t p) { switch (op) { case PF_OP_IRG: return ((p > a1) && (p < a2)); case PF_OP_XRG: return ((p < a1) || (p > a2)); case PF_OP_RRG: return ((p >= a1) && (p <= a2)); case PF_OP_EQ: return (p == a1); case PF_OP_NE: return (p != a1); case PF_OP_LT: return (p < a1); case PF_OP_LE: return (p <= a1); case PF_OP_GT: return (p > a1); case PF_OP_GE: return (p >= a1); } return (0); /* never reached */ } int pf_match_port(u_int8_t op, u_int16_t a1, u_int16_t a2, u_int16_t p) { NTOHS(a1); NTOHS(a2); NTOHS(p); return (pf_match(op, a1, a2, p)); } static int pf_match_uid(u_int8_t op, uid_t a1, uid_t a2, uid_t u) { if (u == UID_MAX && op != PF_OP_EQ && op != PF_OP_NE) return (0); return (pf_match(op, a1, a2, u)); } static int pf_match_gid(u_int8_t op, gid_t a1, gid_t a2, gid_t g) { if (g == GID_MAX && op != PF_OP_EQ && op != PF_OP_NE) return (0); return (pf_match(op, a1, a2, g)); } int pf_match_tag(struct mbuf *m, struct pf_rule *r, int *tag, int mtag) { if (*tag == -1) *tag = mtag; return ((!r->match_tag_not && r->match_tag == *tag) || (r->match_tag_not && r->match_tag != *tag)); } int pf_tag_packet(struct mbuf *m, struct pf_pdesc *pd, int tag) { KASSERT(tag > 0, ("%s: tag %d", __func__, tag)); if (pd->pf_mtag == NULL && ((pd->pf_mtag = pf_get_mtag(m)) == NULL)) return (ENOMEM); pd->pf_mtag->tag = tag; return (0); } #define PF_ANCHOR_STACKSIZE 32 struct pf_anchor_stackframe { struct pf_ruleset *rs; struct pf_rule *r; /* XXX: + match bit */ struct pf_anchor *child; }; /* * XXX: We rely on malloc(9) returning pointer aligned addresses. */ #define PF_ANCHORSTACK_MATCH 0x00000001 #define PF_ANCHORSTACK_MASK (PF_ANCHORSTACK_MATCH) #define PF_ANCHOR_MATCH(f) ((uintptr_t)(f)->r & PF_ANCHORSTACK_MATCH) #define PF_ANCHOR_RULE(f) (struct pf_rule *) \ ((uintptr_t)(f)->r & ~PF_ANCHORSTACK_MASK) #define PF_ANCHOR_SET_MATCH(f) do { (f)->r = (void *) \ ((uintptr_t)(f)->r | PF_ANCHORSTACK_MATCH); \ } while (0) void pf_step_into_anchor(struct pf_anchor_stackframe *stack, int *depth, struct pf_ruleset **rs, int n, struct pf_rule **r, struct pf_rule **a, int *match) { struct pf_anchor_stackframe *f; PF_RULES_RASSERT(); if (match) *match = 0; if (*depth >= PF_ANCHOR_STACKSIZE) { printf("%s: anchor stack overflow on %s\n", __func__, (*r)->anchor->name); *r = TAILQ_NEXT(*r, entries); return; } else if (*depth == 0 && a != NULL) *a = *r; f = stack + (*depth)++; f->rs = *rs; f->r = *r; if ((*r)->anchor_wildcard) { struct pf_anchor_node *parent = &(*r)->anchor->children; if ((f->child = RB_MIN(pf_anchor_node, parent)) == NULL) { *r = NULL; return; } *rs = &f->child->ruleset; } else { f->child = NULL; *rs = &(*r)->anchor->ruleset; } *r = TAILQ_FIRST((*rs)->rules[n].active.ptr); } int pf_step_out_of_anchor(struct pf_anchor_stackframe *stack, int *depth, struct pf_ruleset **rs, int n, struct pf_rule **r, struct pf_rule **a, int *match) { struct pf_anchor_stackframe *f; struct pf_rule *fr; int quick = 0; PF_RULES_RASSERT(); do { if (*depth <= 0) break; f = stack + *depth - 1; fr = PF_ANCHOR_RULE(f); if (f->child != NULL) { struct pf_anchor_node *parent; /* * This block traverses through * a wildcard anchor. */ parent = &fr->anchor->children; if (match != NULL && *match) { /* * If any of "*" matched, then * "foo/ *" matched, mark frame * appropriately. */ PF_ANCHOR_SET_MATCH(f); *match = 0; } f->child = RB_NEXT(pf_anchor_node, parent, f->child); if (f->child != NULL) { *rs = &f->child->ruleset; *r = TAILQ_FIRST((*rs)->rules[n].active.ptr); if (*r == NULL) continue; else break; } } (*depth)--; if (*depth == 0 && a != NULL) *a = NULL; *rs = f->rs; if (PF_ANCHOR_MATCH(f) || (match != NULL && *match)) quick = fr->quick; *r = TAILQ_NEXT(fr, entries); } while (*r == NULL); return (quick); } #ifdef INET6 void pf_poolmask(struct pf_addr *naddr, struct pf_addr *raddr, struct pf_addr *rmask, struct pf_addr *saddr, sa_family_t af) { switch (af) { #ifdef INET case AF_INET: naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) | ((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]); break; #endif /* INET */ case AF_INET6: naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) | ((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]); naddr->addr32[1] = (raddr->addr32[1] & rmask->addr32[1]) | ((rmask->addr32[1] ^ 0xffffffff ) & saddr->addr32[1]); naddr->addr32[2] = (raddr->addr32[2] & rmask->addr32[2]) | ((rmask->addr32[2] ^ 0xffffffff ) & saddr->addr32[2]); naddr->addr32[3] = (raddr->addr32[3] & rmask->addr32[3]) | ((rmask->addr32[3] ^ 0xffffffff ) & saddr->addr32[3]); break; } } void pf_addr_inc(struct pf_addr *addr, sa_family_t af) { switch (af) { #ifdef INET case AF_INET: addr->addr32[0] = htonl(ntohl(addr->addr32[0]) + 1); break; #endif /* INET */ case AF_INET6: if (addr->addr32[3] == 0xffffffff) { addr->addr32[3] = 0; if (addr->addr32[2] == 0xffffffff) { addr->addr32[2] = 0; if (addr->addr32[1] == 0xffffffff) { addr->addr32[1] = 0; addr->addr32[0] = htonl(ntohl(addr->addr32[0]) + 1); } else addr->addr32[1] = htonl(ntohl(addr->addr32[1]) + 1); } else addr->addr32[2] = htonl(ntohl(addr->addr32[2]) + 1); } else addr->addr32[3] = htonl(ntohl(addr->addr32[3]) + 1); break; } } #endif /* INET6 */ int pf_socket_lookup(int direction, struct pf_pdesc *pd, struct mbuf *m) { struct pf_addr *saddr, *daddr; u_int16_t sport, dport; struct inpcbinfo *pi; struct inpcb *inp; pd->lookup.uid = UID_MAX; pd->lookup.gid = GID_MAX; switch (pd->proto) { case IPPROTO_TCP: if (pd->hdr.tcp == NULL) return (-1); sport = pd->hdr.tcp->th_sport; dport = pd->hdr.tcp->th_dport; pi = &V_tcbinfo; break; case IPPROTO_UDP: if (pd->hdr.udp == NULL) return (-1); sport = pd->hdr.udp->uh_sport; dport = pd->hdr.udp->uh_dport; pi = &V_udbinfo; break; default: return (-1); } if (direction == PF_IN) { saddr = pd->src; daddr = pd->dst; } else { u_int16_t p; p = sport; sport = dport; dport = p; saddr = pd->dst; daddr = pd->src; } switch (pd->af) { #ifdef INET case AF_INET: inp = in_pcblookup_mbuf(pi, saddr->v4, sport, daddr->v4, dport, INPLOOKUP_RLOCKPCB, NULL, m); if (inp == NULL) { inp = in_pcblookup_mbuf(pi, saddr->v4, sport, daddr->v4, dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL, m); if (inp == NULL) return (-1); } break; #endif /* INET */ #ifdef INET6 case AF_INET6: inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport, &daddr->v6, dport, INPLOOKUP_RLOCKPCB, NULL, m); if (inp == NULL) { inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport, &daddr->v6, dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL, m); if (inp == NULL) return (-1); } break; #endif /* INET6 */ default: return (-1); } INP_RLOCK_ASSERT(inp); pd->lookup.uid = inp->inp_cred->cr_uid; pd->lookup.gid = inp->inp_cred->cr_groups[0]; INP_RUNLOCK(inp); return (1); } static u_int8_t pf_get_wscale(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af) { int hlen; u_int8_t hdr[60]; u_int8_t *opt, optlen; u_int8_t wscale = 0; hlen = th_off << 2; /* hlen <= sizeof(hdr) */ if (hlen <= sizeof(struct tcphdr)) return (0); if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af)) return (0); opt = hdr + sizeof(struct tcphdr); hlen -= sizeof(struct tcphdr); while (hlen >= 3) { switch (*opt) { case TCPOPT_EOL: case TCPOPT_NOP: ++opt; --hlen; break; case TCPOPT_WINDOW: wscale = opt[2]; if (wscale > TCP_MAX_WINSHIFT) wscale = TCP_MAX_WINSHIFT; wscale |= PF_WSCALE_FLAG; /* FALLTHROUGH */ default: optlen = opt[1]; if (optlen < 2) optlen = 2; hlen -= optlen; opt += optlen; break; } } return (wscale); } static u_int16_t pf_get_mss(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af) { int hlen; u_int8_t hdr[60]; u_int8_t *opt, optlen; u_int16_t mss = V_tcp_mssdflt; hlen = th_off << 2; /* hlen <= sizeof(hdr) */ if (hlen <= sizeof(struct tcphdr)) return (0); if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af)) return (0); opt = hdr + sizeof(struct tcphdr); hlen -= sizeof(struct tcphdr); while (hlen >= TCPOLEN_MAXSEG) { switch (*opt) { case TCPOPT_EOL: case TCPOPT_NOP: ++opt; --hlen; break; case TCPOPT_MAXSEG: bcopy((caddr_t)(opt + 2), (caddr_t)&mss, 2); NTOHS(mss); /* FALLTHROUGH */ default: optlen = opt[1]; if (optlen < 2) optlen = 2; hlen -= optlen; opt += optlen; break; } } return (mss); } static u_int16_t pf_calc_mss(struct pf_addr *addr, sa_family_t af, int rtableid, u_int16_t offer) { #ifdef INET struct sockaddr_in *dst; struct route ro; #endif /* INET */ #ifdef INET6 struct sockaddr_in6 *dst6; struct route_in6 ro6; #endif /* INET6 */ struct rtentry *rt = NULL; int hlen = 0; u_int16_t mss = V_tcp_mssdflt; switch (af) { #ifdef INET case AF_INET: hlen = sizeof(struct ip); bzero(&ro, sizeof(ro)); dst = (struct sockaddr_in *)&ro.ro_dst; dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = addr->v4; in_rtalloc_ign(&ro, 0, rtableid); rt = ro.ro_rt; break; #endif /* INET */ #ifdef INET6 case AF_INET6: hlen = sizeof(struct ip6_hdr); bzero(&ro6, sizeof(ro6)); dst6 = (struct sockaddr_in6 *)&ro6.ro_dst; dst6->sin6_family = AF_INET6; dst6->sin6_len = sizeof(*dst6); dst6->sin6_addr = addr->v6; in6_rtalloc_ign(&ro6, 0, rtableid); rt = ro6.ro_rt; break; #endif /* INET6 */ } if (rt && rt->rt_ifp) { mss = rt->rt_ifp->if_mtu - hlen - sizeof(struct tcphdr); mss = max(V_tcp_mssdflt, mss); RTFREE(rt); } mss = min(mss, offer); mss = max(mss, 64); /* sanity - at least max opt space */ return (mss); } static void pf_set_rt_ifp(struct pf_state *s, struct pf_addr *saddr) { struct pf_rule *r = s->rule.ptr; struct pf_src_node *sn = NULL; s->rt_kif = NULL; if (!r->rt || r->rt == PF_FASTROUTE) return; switch (s->key[PF_SK_WIRE]->af) { #ifdef INET case AF_INET: pf_map_addr(AF_INET, r, saddr, &s->rt_addr, NULL, &sn); s->rt_kif = r->rpool.cur->kif; break; #endif /* INET */ #ifdef INET6 case AF_INET6: pf_map_addr(AF_INET6, r, saddr, &s->rt_addr, NULL, &sn); s->rt_kif = r->rpool.cur->kif; break; #endif /* INET6 */ } } static u_int32_t pf_tcp_iss(struct pf_pdesc *pd) { MD5_CTX ctx; u_int32_t digest[4]; if (V_pf_tcp_secret_init == 0) { read_random(&V_pf_tcp_secret, sizeof(V_pf_tcp_secret)); MD5Init(&V_pf_tcp_secret_ctx); MD5Update(&V_pf_tcp_secret_ctx, V_pf_tcp_secret, sizeof(V_pf_tcp_secret)); V_pf_tcp_secret_init = 1; } ctx = V_pf_tcp_secret_ctx; MD5Update(&ctx, (char *)&pd->hdr.tcp->th_sport, sizeof(u_short)); MD5Update(&ctx, (char *)&pd->hdr.tcp->th_dport, sizeof(u_short)); if (pd->af == AF_INET6) { MD5Update(&ctx, (char *)&pd->src->v6, sizeof(struct in6_addr)); MD5Update(&ctx, (char *)&pd->dst->v6, sizeof(struct in6_addr)); } else { MD5Update(&ctx, (char *)&pd->src->v4, sizeof(struct in_addr)); MD5Update(&ctx, (char *)&pd->dst->v4, sizeof(struct in_addr)); } MD5Final((u_char *)digest, &ctx); V_pf_tcp_iss_off += 4096; #define ISN_RANDOM_INCREMENT (4096 - 1) return (digest[0] + (arc4random() & ISN_RANDOM_INCREMENT) + V_pf_tcp_iss_off); #undef ISN_RANDOM_INCREMENT } static int pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction, struct pfi_kif *kif, struct mbuf *m, int off, struct pf_pdesc *pd, struct pf_rule **am, struct pf_ruleset **rsm, struct inpcb *inp) { struct pf_rule *nr = NULL; struct pf_addr * const saddr = pd->src; struct pf_addr * const daddr = pd->dst; sa_family_t af = pd->af; struct pf_rule *r, *a = NULL; struct pf_ruleset *ruleset = NULL; struct pf_src_node *nsn = NULL; struct tcphdr *th = pd->hdr.tcp; struct pf_state_key *sk = NULL, *nk = NULL; u_short reason; int rewrite = 0, hdrlen = 0; int tag = -1, rtableid = -1; int asd = 0; int match = 0; int state_icmp = 0; u_int16_t sport = 0, dport = 0; u_int16_t bproto_sum = 0, bip_sum = 0; u_int8_t icmptype = 0, icmpcode = 0; struct pf_anchor_stackframe anchor_stack[PF_ANCHOR_STACKSIZE]; PF_RULES_RASSERT(); if (inp != NULL) { INP_LOCK_ASSERT(inp); pd->lookup.uid = inp->inp_cred->cr_uid; pd->lookup.gid = inp->inp_cred->cr_groups[0]; pd->lookup.done = 1; } switch (pd->proto) { case IPPROTO_TCP: sport = th->th_sport; dport = th->th_dport; hdrlen = sizeof(*th); break; case IPPROTO_UDP: sport = pd->hdr.udp->uh_sport; dport = pd->hdr.udp->uh_dport; hdrlen = sizeof(*pd->hdr.udp); break; #ifdef INET case IPPROTO_ICMP: if (pd->af != AF_INET) break; sport = dport = pd->hdr.icmp->icmp_id; hdrlen = sizeof(*pd->hdr.icmp); icmptype = pd->hdr.icmp->icmp_type; icmpcode = pd->hdr.icmp->icmp_code; if (icmptype == ICMP_UNREACH || icmptype == ICMP_SOURCEQUENCH || icmptype == ICMP_REDIRECT || icmptype == ICMP_TIMXCEED || icmptype == ICMP_PARAMPROB) state_icmp++; break; #endif /* INET */ #ifdef INET6 case IPPROTO_ICMPV6: if (af != AF_INET6) break; sport = dport = pd->hdr.icmp6->icmp6_id; hdrlen = sizeof(*pd->hdr.icmp6); icmptype = pd->hdr.icmp6->icmp6_type; icmpcode = pd->hdr.icmp6->icmp6_code; if (icmptype == ICMP6_DST_UNREACH || icmptype == ICMP6_PACKET_TOO_BIG || icmptype == ICMP6_TIME_EXCEEDED || icmptype == ICMP6_PARAM_PROB) state_icmp++; break; #endif /* INET6 */ default: sport = dport = hdrlen = 0; break; } r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr); /* check packet for BINAT/NAT/RDR */ if ((nr = pf_get_translation(pd, m, off, direction, kif, &nsn, &sk, &nk, saddr, daddr, sport, dport, anchor_stack)) != NULL) { KASSERT(sk != NULL, ("%s: null sk", __func__)); KASSERT(nk != NULL, ("%s: null nk", __func__)); if (pd->ip_sum) bip_sum = *pd->ip_sum; switch (pd->proto) { case IPPROTO_TCP: bproto_sum = th->th_sum; pd->proto_sum = &th->th_sum; if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) || nk->port[pd->sidx] != sport) { pf_change_ap(saddr, &th->th_sport, pd->ip_sum, &th->th_sum, &nk->addr[pd->sidx], nk->port[pd->sidx], 0, af); pd->sport = &th->th_sport; sport = th->th_sport; } if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) || nk->port[pd->didx] != dport) { pf_change_ap(daddr, &th->th_dport, pd->ip_sum, &th->th_sum, &nk->addr[pd->didx], nk->port[pd->didx], 0, af); dport = th->th_dport; pd->dport = &th->th_dport; } rewrite++; break; case IPPROTO_UDP: bproto_sum = pd->hdr.udp->uh_sum; pd->proto_sum = &pd->hdr.udp->uh_sum; if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) || nk->port[pd->sidx] != sport) { pf_change_ap(saddr, &pd->hdr.udp->uh_sport, pd->ip_sum, &pd->hdr.udp->uh_sum, &nk->addr[pd->sidx], nk->port[pd->sidx], 1, af); sport = pd->hdr.udp->uh_sport; pd->sport = &pd->hdr.udp->uh_sport; } if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) || nk->port[pd->didx] != dport) { pf_change_ap(daddr, &pd->hdr.udp->uh_dport, pd->ip_sum, &pd->hdr.udp->uh_sum, &nk->addr[pd->didx], nk->port[pd->didx], 1, af); dport = pd->hdr.udp->uh_dport; pd->dport = &pd->hdr.udp->uh_dport; } rewrite++; break; #ifdef INET case IPPROTO_ICMP: nk->port[0] = nk->port[1]; if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET)) pf_change_a(&saddr->v4.s_addr, pd->ip_sum, nk->addr[pd->sidx].v4.s_addr, 0); if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET)) pf_change_a(&daddr->v4.s_addr, pd->ip_sum, nk->addr[pd->didx].v4.s_addr, 0); if (nk->port[1] != pd->hdr.icmp->icmp_id) { pd->hdr.icmp->icmp_cksum = pf_cksum_fixup( pd->hdr.icmp->icmp_cksum, sport, nk->port[1], 0); pd->hdr.icmp->icmp_id = nk->port[1]; pd->sport = &pd->hdr.icmp->icmp_id; } m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp); break; #endif /* INET */ #ifdef INET6 case IPPROTO_ICMPV6: nk->port[0] = nk->port[1]; if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET6)) pf_change_a6(saddr, &pd->hdr.icmp6->icmp6_cksum, &nk->addr[pd->sidx], 0); if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET6)) pf_change_a6(daddr, &pd->hdr.icmp6->icmp6_cksum, &nk->addr[pd->didx], 0); rewrite++; break; #endif /* INET */ default: switch (af) { #ifdef INET case AF_INET: if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET)) pf_change_a(&saddr->v4.s_addr, pd->ip_sum, nk->addr[pd->sidx].v4.s_addr, 0); if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET)) pf_change_a(&daddr->v4.s_addr, pd->ip_sum, nk->addr[pd->didx].v4.s_addr, 0); break; #endif /* INET */ #ifdef INET6 case AF_INET6: if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET6)) PF_ACPY(saddr, &nk->addr[pd->sidx], af); if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET6)) PF_ACPY(saddr, &nk->addr[pd->didx], af); break; #endif /* INET */ } break; } if (nr->natpass) r = NULL; pd->nat_rule = nr; } while (r != NULL) { r->evaluations++; if (pfi_kif_match(r->kif, kif) == r->ifnot) r = r->skip[PF_SKIP_IFP].ptr; else if (r->direction && r->direction != direction) r = r->skip[PF_SKIP_DIR].ptr; else if (r->af && r->af != af) r = r->skip[PF_SKIP_AF].ptr; else if (r->proto && r->proto != pd->proto) r = r->skip[PF_SKIP_PROTO].ptr; else if (PF_MISMATCHAW(&r->src.addr, saddr, af, r->src.neg, kif, M_GETFIB(m))) r = r->skip[PF_SKIP_SRC_ADDR].ptr; /* tcp/udp only. port_op always 0 in other cases */ else if (r->src.port_op && !pf_match_port(r->src.port_op, r->src.port[0], r->src.port[1], sport)) r = r->skip[PF_SKIP_SRC_PORT].ptr; else if (PF_MISMATCHAW(&r->dst.addr, daddr, af, r->dst.neg, NULL, M_GETFIB(m))) r = r->skip[PF_SKIP_DST_ADDR].ptr; /* tcp/udp only. port_op always 0 in other cases */ else if (r->dst.port_op && !pf_match_port(r->dst.port_op, r->dst.port[0], r->dst.port[1], dport)) r = r->skip[PF_SKIP_DST_PORT].ptr; /* icmp only. type always 0 in other cases */ else if (r->type && r->type != icmptype + 1) r = TAILQ_NEXT(r, entries); /* icmp only. type always 0 in other cases */ else if (r->code && r->code != icmpcode + 1) r = TAILQ_NEXT(r, entries); else if (r->tos && !(r->tos == pd->tos)) r = TAILQ_NEXT(r, entries); else if (r->rule_flag & PFRULE_FRAGMENT) r = TAILQ_NEXT(r, entries); else if (pd->proto == IPPROTO_TCP && (r->flagset & th->th_flags) != r->flags) r = TAILQ_NEXT(r, entries); /* tcp/udp only. uid.op always 0 in other cases */ else if (r->uid.op && (pd->lookup.done || (pd->lookup.done = pf_socket_lookup(direction, pd, m), 1)) && !pf_match_uid(r->uid.op, r->uid.uid[0], r->uid.uid[1], pd->lookup.uid)) r = TAILQ_NEXT(r, entries); /* tcp/udp only. gid.op always 0 in other cases */ else if (r->gid.op && (pd->lookup.done || (pd->lookup.done = pf_socket_lookup(direction, pd, m), 1)) && !pf_match_gid(r->gid.op, r->gid.gid[0], r->gid.gid[1], pd->lookup.gid)) r = TAILQ_NEXT(r, entries); else if (r->prob && r->prob <= arc4random()) r = TAILQ_NEXT(r, entries); else if (r->match_tag && !pf_match_tag(m, r, &tag, pd->pf_mtag ? pd->pf_mtag->tag : 0)) r = TAILQ_NEXT(r, entries); else if (r->os_fingerprint != PF_OSFP_ANY && (pd->proto != IPPROTO_TCP || !pf_osfp_match( pf_osfp_fingerprint(pd, m, off, th), r->os_fingerprint))) r = TAILQ_NEXT(r, entries); else { if (r->tag) tag = r->tag; if (r->rtableid >= 0) rtableid = r->rtableid; if (r->anchor == NULL) { match = 1; *rm = r; *am = a; *rsm = ruleset; if ((*rm)->quick) break; r = TAILQ_NEXT(r, entries); } else pf_step_into_anchor(anchor_stack, &asd, &ruleset, PF_RULESET_FILTER, &r, &a, &match); } if (r == NULL && pf_step_out_of_anchor(anchor_stack, &asd, &ruleset, PF_RULESET_FILTER, &r, &a, &match)) break; } r = *rm; a = *am; ruleset = *rsm; REASON_SET(&reason, PFRES_MATCH); if (r->log || (nr != NULL && nr->log)) { if (rewrite) m_copyback(m, off, hdrlen, pd->hdr.any); PFLOG_PACKET(kif, m, af, direction, reason, r->log ? r : nr, a, ruleset, pd, 1); } if ((r->action == PF_DROP) && ((r->rule_flag & PFRULE_RETURNRST) || (r->rule_flag & PFRULE_RETURNICMP) || (r->rule_flag & PFRULE_RETURN))) { /* undo NAT changes, if they have taken place */ if (nr != NULL) { PF_ACPY(saddr, &sk->addr[pd->sidx], af); PF_ACPY(daddr, &sk->addr[pd->didx], af); if (pd->sport) *pd->sport = sk->port[pd->sidx]; if (pd->dport) *pd->dport = sk->port[pd->didx]; if (pd->proto_sum) *pd->proto_sum = bproto_sum; if (pd->ip_sum) *pd->ip_sum = bip_sum; m_copyback(m, off, hdrlen, pd->hdr.any); } if (pd->proto == IPPROTO_TCP && ((r->rule_flag & PFRULE_RETURNRST) || (r->rule_flag & PFRULE_RETURN)) && !(th->th_flags & TH_RST)) { u_int32_t ack = ntohl(th->th_seq) + pd->p_len; int len = 0; #ifdef INET struct ip *h4; #endif #ifdef INET6 struct ip6_hdr *h6; #endif switch (af) { #ifdef INET case AF_INET: h4 = mtod(m, struct ip *); len = ntohs(h4->ip_len) - off; break; #endif #ifdef INET6 case AF_INET6: h6 = mtod(m, struct ip6_hdr *); len = ntohs(h6->ip6_plen) - (off - sizeof(*h6)); break; #endif } if (pf_check_proto_cksum(m, off, len, IPPROTO_TCP, af)) REASON_SET(&reason, PFRES_PROTCKSUM); else { if (th->th_flags & TH_SYN) ack++; if (th->th_flags & TH_FIN) ack++; pf_send_tcp(m, r, af, pd->dst, pd->src, th->th_dport, th->th_sport, ntohl(th->th_ack), ack, TH_RST|TH_ACK, 0, 0, r->return_ttl, 1, 0, kif->pfik_ifp); } } else if (pd->proto != IPPROTO_ICMP && af == AF_INET && r->return_icmp) pf_send_icmp(m, r->return_icmp >> 8, r->return_icmp & 255, af, r); else if (pd->proto != IPPROTO_ICMPV6 && af == AF_INET6 && r->return_icmp6) pf_send_icmp(m, r->return_icmp6 >> 8, r->return_icmp6 & 255, af, r); } if (r->action == PF_DROP) goto cleanup; if (tag > 0 && pf_tag_packet(m, pd, tag)) { REASON_SET(&reason, PFRES_MEMORY); goto cleanup; } if (rtableid >= 0) M_SETFIB(m, rtableid); if (!state_icmp && (r->keep_state || nr != NULL || (pd->flags & PFDESC_TCP_NORM))) { int action; action = pf_create_state(r, nr, a, pd, nsn, nk, sk, m, off, sport, dport, &rewrite, kif, sm, tag, bproto_sum, bip_sum, hdrlen); if (action != PF_PASS) return (action); } else { if (sk != NULL) uma_zfree(V_pf_state_key_z, sk); if (nk != NULL) uma_zfree(V_pf_state_key_z, nk); } /* copy back packet headers if we performed NAT operations */ if (rewrite) m_copyback(m, off, hdrlen, pd->hdr.any); if (*sm != NULL && !((*sm)->state_flags & PFSTATE_NOSYNC) && direction == PF_OUT && pfsync_defer_ptr != NULL && pfsync_defer_ptr(*sm, m)) /* * We want the state created, but we dont * want to send this in case a partner * firewall has to know about it to allow * replies through it. */ return (PF_DEFER); return (PF_PASS); cleanup: if (sk != NULL) uma_zfree(V_pf_state_key_z, sk); if (nk != NULL) uma_zfree(V_pf_state_key_z, nk); return (PF_DROP); } static int pf_create_state(struct pf_rule *r, struct pf_rule *nr, struct pf_rule *a, struct pf_pdesc *pd, struct pf_src_node *nsn, struct pf_state_key *nk, struct pf_state_key *sk, struct mbuf *m, int off, u_int16_t sport, u_int16_t dport, int *rewrite, struct pfi_kif *kif, struct pf_state **sm, int tag, u_int16_t bproto_sum, u_int16_t bip_sum, int hdrlen) { struct pf_state *s = NULL; struct pf_src_node *sn = NULL; struct tcphdr *th = pd->hdr.tcp; u_int16_t mss = V_tcp_mssdflt; u_short reason; /* check maximums */ if (r->max_states && (counter_u64_fetch(r->states_cur) >= r->max_states)) { V_pf_status.lcounters[LCNT_STATES]++; REASON_SET(&reason, PFRES_MAXSTATES); return (PF_DROP); } /* src node for filter rule */ if ((r->rule_flag & PFRULE_SRCTRACK || r->rpool.opts & PF_POOL_STICKYADDR) && pf_insert_src_node(&sn, r, pd->src, pd->af) != 0) { REASON_SET(&reason, PFRES_SRCLIMIT); goto csfailed; } /* src node for translation rule */ if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) && pf_insert_src_node(&nsn, nr, &sk->addr[pd->sidx], pd->af)) { REASON_SET(&reason, PFRES_SRCLIMIT); goto csfailed; } s = uma_zalloc(V_pf_state_z, M_NOWAIT | M_ZERO); if (s == NULL) { REASON_SET(&reason, PFRES_MEMORY); goto csfailed; } s->rule.ptr = r; s->nat_rule.ptr = nr; s->anchor.ptr = a; STATE_INC_COUNTERS(s); if (r->allow_opts) s->state_flags |= PFSTATE_ALLOWOPTS; if (r->rule_flag & PFRULE_STATESLOPPY) s->state_flags |= PFSTATE_SLOPPY; s->log = r->log & PF_LOG_ALL; s->sync_state = PFSYNC_S_NONE; if (nr != NULL) s->log |= nr->log & PF_LOG_ALL; switch (pd->proto) { case IPPROTO_TCP: s->src.seqlo = ntohl(th->th_seq); s->src.seqhi = s->src.seqlo + pd->p_len + 1; if ((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN && r->keep_state == PF_STATE_MODULATE) { /* Generate sequence number modulator */ if ((s->src.seqdiff = pf_tcp_iss(pd) - s->src.seqlo) == 0) s->src.seqdiff = 1; pf_change_a(&th->th_seq, &th->th_sum, htonl(s->src.seqlo + s->src.seqdiff), 0); *rewrite = 1; } else s->src.seqdiff = 0; if (th->th_flags & TH_SYN) { s->src.seqhi++; s->src.wscale = pf_get_wscale(m, off, th->th_off, pd->af); } s->src.max_win = MAX(ntohs(th->th_win), 1); if (s->src.wscale & PF_WSCALE_MASK) { /* Remove scale factor from initial window */ int win = s->src.max_win; win += 1 << (s->src.wscale & PF_WSCALE_MASK); s->src.max_win = (win - 1) >> (s->src.wscale & PF_WSCALE_MASK); } if (th->th_flags & TH_FIN) s->src.seqhi++; s->dst.seqhi = 1; s->dst.max_win = 1; s->src.state = TCPS_SYN_SENT; s->dst.state = TCPS_CLOSED; s->timeout = PFTM_TCP_FIRST_PACKET; break; case IPPROTO_UDP: s->src.state = PFUDPS_SINGLE; s->dst.state = PFUDPS_NO_TRAFFIC; s->timeout = PFTM_UDP_FIRST_PACKET; break; case IPPROTO_ICMP: #ifdef INET6 case IPPROTO_ICMPV6: #endif s->timeout = PFTM_ICMP_FIRST_PACKET; break; default: s->src.state = PFOTHERS_SINGLE; s->dst.state = PFOTHERS_NO_TRAFFIC; s->timeout = PFTM_OTHER_FIRST_PACKET; } s->creation = time_uptime; s->expire = time_uptime; if (sn != NULL) { s->src_node = sn; s->src_node->states++; } if (nsn != NULL) { /* XXX We only modify one side for now. */ PF_ACPY(&nsn->raddr, &nk->addr[1], pd->af); s->nat_src_node = nsn; s->nat_src_node->states++; } if (pd->proto == IPPROTO_TCP) { if ((pd->flags & PFDESC_TCP_NORM) && pf_normalize_tcp_init(m, off, pd, th, &s->src, &s->dst)) { REASON_SET(&reason, PFRES_MEMORY); pf_src_tree_remove_state(s); STATE_DEC_COUNTERS(s); uma_zfree(V_pf_state_z, s); return (PF_DROP); } if ((pd->flags & PFDESC_TCP_NORM) && s->src.scrub && pf_normalize_tcp_stateful(m, off, pd, &reason, th, s, &s->src, &s->dst, rewrite)) { /* This really shouldn't happen!!! */ DPFPRINTF(PF_DEBUG_URGENT, ("pf_normalize_tcp_stateful failed on first pkt")); pf_normalize_tcp_cleanup(s); pf_src_tree_remove_state(s); STATE_DEC_COUNTERS(s); uma_zfree(V_pf_state_z, s); return (PF_DROP); } } s->direction = pd->dir; /* * sk/nk could already been setup by pf_get_translation(). */ if (nr == NULL) { KASSERT((sk == NULL && nk == NULL), ("%s: nr %p sk %p, nk %p", __func__, nr, sk, nk)); sk = pf_state_key_setup(pd, pd->src, pd->dst, sport, dport); if (sk == NULL) goto csfailed; nk = sk; } else KASSERT((sk != NULL && nk != NULL), ("%s: nr %p sk %p, nk %p", __func__, nr, sk, nk)); /* Swap sk/nk for PF_OUT. */ if (pf_state_insert(BOUND_IFACE(r, kif), (pd->dir == PF_IN) ? sk : nk, (pd->dir == PF_IN) ? nk : sk, s)) { if (pd->proto == IPPROTO_TCP) pf_normalize_tcp_cleanup(s); REASON_SET(&reason, PFRES_STATEINS); pf_src_tree_remove_state(s); STATE_DEC_COUNTERS(s); uma_zfree(V_pf_state_z, s); return (PF_DROP); } else *sm = s; pf_set_rt_ifp(s, pd->src); /* needs s->state_key set */ if (tag > 0) s->tag = tag; if (pd->proto == IPPROTO_TCP && (th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN && r->keep_state == PF_STATE_SYNPROXY) { s->src.state = PF_TCPS_PROXY_SRC; /* undo NAT changes, if they have taken place */ if (nr != NULL) { struct pf_state_key *skt = s->key[PF_SK_WIRE]; if (pd->dir == PF_OUT) skt = s->key[PF_SK_STACK]; PF_ACPY(pd->src, &skt->addr[pd->sidx], pd->af); PF_ACPY(pd->dst, &skt->addr[pd->didx], pd->af); if (pd->sport) *pd->sport = skt->port[pd->sidx]; if (pd->dport) *pd->dport = skt->port[pd->didx]; if (pd->proto_sum) *pd->proto_sum = bproto_sum; if (pd->ip_sum) *pd->ip_sum = bip_sum; m_copyback(m, off, hdrlen, pd->hdr.any); } s->src.seqhi = htonl(arc4random()); /* Find mss option */ int rtid = M_GETFIB(m); mss = pf_get_mss(m, off, th->th_off, pd->af); mss = pf_calc_mss(pd->src, pd->af, rtid, mss); mss = pf_calc_mss(pd->dst, pd->af, rtid, mss); s->src.mss = mss; pf_send_tcp(NULL, r, pd->af, pd->dst, pd->src, th->th_dport, th->th_sport, s->src.seqhi, ntohl(th->th_seq) + 1, TH_SYN|TH_ACK, 0, s->src.mss, 0, 1, 0, NULL); REASON_SET(&reason, PFRES_SYNPROXY); return (PF_SYNPROXY_DROP); } return (PF_PASS); csfailed: if (sk != NULL) uma_zfree(V_pf_state_key_z, sk); if (nk != NULL) uma_zfree(V_pf_state_key_z, nk); if (sn != NULL && sn->states == 0 && sn->expire == 0) { pf_unlink_src_node(sn); pf_free_src_node(sn); } if (nsn != sn && nsn != NULL && nsn->states == 0 && nsn->expire == 0) { pf_unlink_src_node(nsn); pf_free_src_node(nsn); } return (PF_DROP); } static int pf_test_fragment(struct pf_rule **rm, int direction, struct pfi_kif *kif, struct mbuf *m, void *h, struct pf_pdesc *pd, struct pf_rule **am, struct pf_ruleset **rsm) { struct pf_rule *r, *a = NULL; struct pf_ruleset *ruleset = NULL; sa_family_t af = pd->af; u_short reason; int tag = -1; int asd = 0; int match = 0; struct pf_anchor_stackframe anchor_stack[PF_ANCHOR_STACKSIZE]; PF_RULES_RASSERT(); r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr); while (r != NULL) { r->evaluations++; if (pfi_kif_match(r->kif, kif) == r->ifnot) r = r->skip[PF_SKIP_IFP].ptr; else if (r->direction && r->direction != direction) r = r->skip[PF_SKIP_DIR].ptr; else if (r->af && r->af != af) r = r->skip[PF_SKIP_AF].ptr; else if (r->proto && r->proto != pd->proto) r = r->skip[PF_SKIP_PROTO].ptr; else if (PF_MISMATCHAW(&r->src.addr, pd->src, af, r->src.neg, kif, M_GETFIB(m))) r = r->skip[PF_SKIP_SRC_ADDR].ptr; else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af, r->dst.neg, NULL, M_GETFIB(m))) r = r->skip[PF_SKIP_DST_ADDR].ptr; else if (r->tos && !(r->tos == pd->tos)) r = TAILQ_NEXT(r, entries); else if (r->os_fingerprint != PF_OSFP_ANY) r = TAILQ_NEXT(r, entries); else if (pd->proto == IPPROTO_UDP && (r->src.port_op || r->dst.port_op)) r = TAILQ_NEXT(r, entries); else if (pd->proto == IPPROTO_TCP && (r->src.port_op || r->dst.port_op || r->flagset)) r = TAILQ_NEXT(r, entries); else if ((pd->proto == IPPROTO_ICMP || pd->proto == IPPROTO_ICMPV6) && (r->type || r->code)) r = TAILQ_NEXT(r, entries); else if (r->prob && r->prob <= (arc4random() % (UINT_MAX - 1) + 1)) r = TAILQ_NEXT(r, entries); else if (r->match_tag && !pf_match_tag(m, r, &tag, pd->pf_mtag ? pd->pf_mtag->tag : 0)) r = TAILQ_NEXT(r, entries); else { if (r->anchor == NULL) { match = 1; *rm = r; *am = a; *rsm = ruleset; if ((*rm)->quick) break; r = TAILQ_NEXT(r, entries); } else pf_step_into_anchor(anchor_stack, &asd, &ruleset, PF_RULESET_FILTER, &r, &a, &match); } if (r == NULL && pf_step_out_of_anchor(anchor_stack, &asd, &ruleset, PF_RULESET_FILTER, &r, &a, &match)) break; } r = *rm; a = *am; ruleset = *rsm; REASON_SET(&reason, PFRES_MATCH); if (r->log) PFLOG_PACKET(kif, m, af, direction, reason, r, a, ruleset, pd, 1); if (r->action != PF_PASS) return (PF_DROP); if (tag > 0 && pf_tag_packet(m, pd, tag)) { REASON_SET(&reason, PFRES_MEMORY); return (PF_DROP); } return (PF_PASS); } static int pf_tcp_track_full(struct pf_state_peer *src, struct pf_state_peer *dst, struct pf_state **state, struct pfi_kif *kif, struct mbuf *m, int off, struct pf_pdesc *pd, u_short *reason, int *copyback) { struct tcphdr *th = pd->hdr.tcp; u_int16_t win = ntohs(th->th_win); u_int32_t ack, end, seq, orig_seq; u_int8_t sws, dws; int ackskew; if (src->wscale && dst->wscale && !(th->th_flags & TH_SYN)) { sws = src->wscale & PF_WSCALE_MASK; dws = dst->wscale & PF_WSCALE_MASK; } else sws = dws = 0; /* * Sequence tracking algorithm from Guido van Rooij's paper: * http://www.madison-gurkha.com/publications/tcp_filtering/ * tcp_filtering.ps */ orig_seq = seq = ntohl(th->th_seq); if (src->seqlo == 0) { /* First packet from this end. Set its state */ if ((pd->flags & PFDESC_TCP_NORM || dst->scrub) && src->scrub == NULL) { if (pf_normalize_tcp_init(m, off, pd, th, src, dst)) { REASON_SET(reason, PFRES_MEMORY); return (PF_DROP); } } /* Deferred generation of sequence number modulator */ if (dst->seqdiff && !src->seqdiff) { /* use random iss for the TCP server */ while ((src->seqdiff = arc4random() - seq) == 0) ; ack = ntohl(th->th_ack) - dst->seqdiff; pf_change_a(&th->th_seq, &th->th_sum, htonl(seq + src->seqdiff), 0); pf_change_a(&th->th_ack, &th->th_sum, htonl(ack), 0); *copyback = 1; } else { ack = ntohl(th->th_ack); } end = seq + pd->p_len; if (th->th_flags & TH_SYN) { end++; if (dst->wscale & PF_WSCALE_FLAG) { src->wscale = pf_get_wscale(m, off, th->th_off, pd->af); if (src->wscale & PF_WSCALE_FLAG) { /* Remove scale factor from initial * window */ sws = src->wscale & PF_WSCALE_MASK; win = ((u_int32_t)win + (1 << sws) - 1) >> sws; dws = dst->wscale & PF_WSCALE_MASK; } else { /* fixup other window */ dst->max_win <<= dst->wscale & PF_WSCALE_MASK; /* in case of a retrans SYN|ACK */ dst->wscale = 0; } } } if (th->th_flags & TH_FIN) end++; src->seqlo = seq; if (src->state < TCPS_SYN_SENT) src->state = TCPS_SYN_SENT; /* * May need to slide the window (seqhi may have been set by * the crappy stack check or if we picked up the connection * after establishment) */ if (src->seqhi == 1 || SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi)) src->seqhi = end + MAX(1, dst->max_win << dws); if (win > src->max_win) src->max_win = win; } else { ack = ntohl(th->th_ack) - dst->seqdiff; if (src->seqdiff) { /* Modulate sequence numbers */ pf_change_a(&th->th_seq, &th->th_sum, htonl(seq + src->seqdiff), 0); pf_change_a(&th->th_ack, &th->th_sum, htonl(ack), 0); *copyback = 1; } end = seq + pd->p_len; if (th->th_flags & TH_SYN) end++; if (th->th_flags & TH_FIN) end++; } if ((th->th_flags & TH_ACK) == 0) { /* Let it pass through the ack skew check */ ack = dst->seqlo; } else if ((ack == 0 && (th->th_flags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) || /* broken tcp stacks do not set ack */ (dst->state < TCPS_SYN_SENT)) { /* * Many stacks (ours included) will set the ACK number in an * FIN|ACK if the SYN times out -- no sequence to ACK. */ ack = dst->seqlo; } if (seq == end) { /* Ease sequencing restrictions on no data packets */ seq = src->seqlo; end = seq; } ackskew = dst->seqlo - ack; /* * Need to demodulate the sequence numbers in any TCP SACK options * (Selective ACK). We could optionally validate the SACK values * against the current ACK window, either forwards or backwards, but * I'm not confident that SACK has been implemented properly * everywhere. It wouldn't surprise me if several stacks accidently * SACK too far backwards of previously ACKed data. There really aren't * any security implications of bad SACKing unless the target stack * doesn't validate the option length correctly. Someone trying to * spoof into a TCP connection won't bother blindly sending SACK * options anyway. */ if (dst->seqdiff && (th->th_off << 2) > sizeof(struct tcphdr)) { if (pf_modulate_sack(m, off, pd, th, dst)) *copyback = 1; } #define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */ if (SEQ_GEQ(src->seqhi, end) && /* Last octet inside other's window space */ SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) && /* Retrans: not more than one window back */ (ackskew >= -MAXACKWINDOW) && /* Acking not more than one reassembled fragment backwards */ (ackskew <= (MAXACKWINDOW << sws)) && /* Acking not more than one window forward */ ((th->th_flags & TH_RST) == 0 || orig_seq == src->seqlo || (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo) || (pd->flags & PFDESC_IP_REAS) == 0)) { /* Require an exact/+1 sequence match on resets when possible */ if (dst->scrub || src->scrub) { if (pf_normalize_tcp_stateful(m, off, pd, reason, th, *state, src, dst, copyback)) return (PF_DROP); } /* update max window */ if (src->max_win < win) src->max_win = win; /* synchronize sequencing */ if (SEQ_GT(end, src->seqlo)) src->seqlo = end; /* slide the window of what the other end can send */ if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) dst->seqhi = ack + MAX((win << sws), 1); /* update states */ if (th->th_flags & TH_SYN) if (src->state < TCPS_SYN_SENT) src->state = TCPS_SYN_SENT; if (th->th_flags & TH_FIN) if (src->state < TCPS_CLOSING) src->state = TCPS_CLOSING; if (th->th_flags & TH_ACK) { if (dst->state == TCPS_SYN_SENT) { dst->state = TCPS_ESTABLISHED; if (src->state == TCPS_ESTABLISHED && (*state)->src_node != NULL && pf_src_connlimit(state)) { REASON_SET(reason, PFRES_SRCLIMIT); return (PF_DROP); } } else if (dst->state == TCPS_CLOSING) dst->state = TCPS_FIN_WAIT_2; } if (th->th_flags & TH_RST) src->state = dst->state = TCPS_TIME_WAIT; /* update expire time */ (*state)->expire = time_uptime; if (src->state >= TCPS_FIN_WAIT_2 && dst->state >= TCPS_FIN_WAIT_2) (*state)->timeout = PFTM_TCP_CLOSED; else if (src->state >= TCPS_CLOSING && dst->state >= TCPS_CLOSING) (*state)->timeout = PFTM_TCP_FIN_WAIT; else if (src->state < TCPS_ESTABLISHED || dst->state < TCPS_ESTABLISHED) (*state)->timeout = PFTM_TCP_OPENING; else if (src->state >= TCPS_CLOSING || dst->state >= TCPS_CLOSING) (*state)->timeout = PFTM_TCP_CLOSING; else (*state)->timeout = PFTM_TCP_ESTABLISHED; /* Fall through to PASS packet */ } else if ((dst->state < TCPS_SYN_SENT || dst->state >= TCPS_FIN_WAIT_2 || src->state >= TCPS_FIN_WAIT_2) && SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) && /* Within a window forward of the originating packet */ SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) { /* Within a window backward of the originating packet */ /* * This currently handles three situations: * 1) Stupid stacks will shotgun SYNs before their peer * replies. * 2) When PF catches an already established stream (the * firewall rebooted, the state table was flushed, routes * changed...) * 3) Packets get funky immediately after the connection * closes (this should catch Solaris spurious ACK|FINs * that web servers like to spew after a close) * * This must be a little more careful than the above code * since packet floods will also be caught here. We don't * update the TTL here to mitigate the damage of a packet * flood and so the same code can handle awkward establishment * and a loosened connection close. * In the establishment case, a correct peer response will * validate the connection, go through the normal state code * and keep updating the state TTL. */ if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pf: loose state match: "); pf_print_state(*state); pf_print_flags(th->th_flags); printf(" seq=%u (%u) ack=%u len=%u ackskew=%d " "pkts=%llu:%llu dir=%s,%s\n", seq, orig_seq, ack, pd->p_len, ackskew, (unsigned long long)(*state)->packets[0], (unsigned long long)(*state)->packets[1], pd->dir == PF_IN ? "in" : "out", pd->dir == (*state)->direction ? "fwd" : "rev"); } if (dst->scrub || src->scrub) { if (pf_normalize_tcp_stateful(m, off, pd, reason, th, *state, src, dst, copyback)) return (PF_DROP); } /* update max window */ if (src->max_win < win) src->max_win = win; /* synchronize sequencing */ if (SEQ_GT(end, src->seqlo)) src->seqlo = end; /* slide the window of what the other end can send */ if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) dst->seqhi = ack + MAX((win << sws), 1); /* * Cannot set dst->seqhi here since this could be a shotgunned * SYN and not an already established connection. */ if (th->th_flags & TH_FIN) if (src->state < TCPS_CLOSING) src->state = TCPS_CLOSING; if (th->th_flags & TH_RST) src->state = dst->state = TCPS_TIME_WAIT; /* Fall through to PASS packet */ } else { if ((*state)->dst.state == TCPS_SYN_SENT && (*state)->src.state == TCPS_SYN_SENT) { /* Send RST for state mismatches during handshake */ if (!(th->th_flags & TH_RST)) pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst, pd->src, th->th_dport, th->th_sport, ntohl(th->th_ack), 0, TH_RST, 0, 0, (*state)->rule.ptr->return_ttl, 1, 0, kif->pfik_ifp); src->seqlo = 0; src->seqhi = 1; src->max_win = 1; } else if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pf: BAD state: "); pf_print_state(*state); pf_print_flags(th->th_flags); printf(" seq=%u (%u) ack=%u len=%u ackskew=%d " "pkts=%llu:%llu dir=%s,%s\n", seq, orig_seq, ack, pd->p_len, ackskew, (unsigned long long)(*state)->packets[0], (unsigned long long)(*state)->packets[1], pd->dir == PF_IN ? "in" : "out", pd->dir == (*state)->direction ? "fwd" : "rev"); printf("pf: State failure on: %c %c %c %c | %c %c\n", SEQ_GEQ(src->seqhi, end) ? ' ' : '1', SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) ? ' ': '2', (ackskew >= -MAXACKWINDOW) ? ' ' : '3', (ackskew <= (MAXACKWINDOW << sws)) ? ' ' : '4', SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) ?' ' :'5', SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW) ?' ' :'6'); } REASON_SET(reason, PFRES_BADSTATE); return (PF_DROP); } return (PF_PASS); } static int pf_tcp_track_sloppy(struct pf_state_peer *src, struct pf_state_peer *dst, struct pf_state **state, struct pf_pdesc *pd, u_short *reason) { struct tcphdr *th = pd->hdr.tcp; if (th->th_flags & TH_SYN) if (src->state < TCPS_SYN_SENT) src->state = TCPS_SYN_SENT; if (th->th_flags & TH_FIN) if (src->state < TCPS_CLOSING) src->state = TCPS_CLOSING; if (th->th_flags & TH_ACK) { if (dst->state == TCPS_SYN_SENT) { dst->state = TCPS_ESTABLISHED; if (src->state == TCPS_ESTABLISHED && (*state)->src_node != NULL && pf_src_connlimit(state)) { REASON_SET(reason, PFRES_SRCLIMIT); return (PF_DROP); } } else if (dst->state == TCPS_CLOSING) { dst->state = TCPS_FIN_WAIT_2; } else if (src->state == TCPS_SYN_SENT && dst->state < TCPS_SYN_SENT) { /* * Handle a special sloppy case where we only see one * half of the connection. If there is a ACK after * the initial SYN without ever seeing a packet from * the destination, set the connection to established. */ dst->state = src->state = TCPS_ESTABLISHED; if ((*state)->src_node != NULL && pf_src_connlimit(state)) { REASON_SET(reason, PFRES_SRCLIMIT); return (PF_DROP); } } else if (src->state == TCPS_CLOSING && dst->state == TCPS_ESTABLISHED && dst->seqlo == 0) { /* * Handle the closing of half connections where we * don't see the full bidirectional FIN/ACK+ACK * handshake. */ dst->state = TCPS_CLOSING; } } if (th->th_flags & TH_RST) src->state = dst->state = TCPS_TIME_WAIT; /* update expire time */ (*state)->expire = time_uptime; if (src->state >= TCPS_FIN_WAIT_2 && dst->state >= TCPS_FIN_WAIT_2) (*state)->timeout = PFTM_TCP_CLOSED; else if (src->state >= TCPS_CLOSING && dst->state >= TCPS_CLOSING) (*state)->timeout = PFTM_TCP_FIN_WAIT; else if (src->state < TCPS_ESTABLISHED || dst->state < TCPS_ESTABLISHED) (*state)->timeout = PFTM_TCP_OPENING; else if (src->state >= TCPS_CLOSING || dst->state >= TCPS_CLOSING) (*state)->timeout = PFTM_TCP_CLOSING; else (*state)->timeout = PFTM_TCP_ESTABLISHED; return (PF_PASS); } static int pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif, struct mbuf *m, int off, void *h, struct pf_pdesc *pd, u_short *reason) { struct pf_state_key_cmp key; struct tcphdr *th = pd->hdr.tcp; int copyback = 0; struct pf_state_peer *src, *dst; struct pf_state_key *sk; bzero(&key, sizeof(key)); key.af = pd->af; key.proto = IPPROTO_TCP; if (direction == PF_IN) { /* wire side, straight */ PF_ACPY(&key.addr[0], pd->src, key.af); PF_ACPY(&key.addr[1], pd->dst, key.af); key.port[0] = th->th_sport; key.port[1] = th->th_dport; } else { /* stack side, reverse */ PF_ACPY(&key.addr[1], pd->src, key.af); PF_ACPY(&key.addr[0], pd->dst, key.af); key.port[1] = th->th_sport; key.port[0] = th->th_dport; } STATE_LOOKUP(kif, &key, direction, *state, pd); if (direction == (*state)->direction) { src = &(*state)->src; dst = &(*state)->dst; } else { src = &(*state)->dst; dst = &(*state)->src; } sk = (*state)->key[pd->didx]; if ((*state)->src.state == PF_TCPS_PROXY_SRC) { if (direction != (*state)->direction) { REASON_SET(reason, PFRES_SYNPROXY); return (PF_SYNPROXY_DROP); } if (th->th_flags & TH_SYN) { if (ntohl(th->th_seq) != (*state)->src.seqlo) { REASON_SET(reason, PFRES_SYNPROXY); return (PF_DROP); } pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst, pd->src, th->th_dport, th->th_sport, (*state)->src.seqhi, ntohl(th->th_seq) + 1, TH_SYN|TH_ACK, 0, (*state)->src.mss, 0, 1, 0, NULL); REASON_SET(reason, PFRES_SYNPROXY); return (PF_SYNPROXY_DROP); } else if (!(th->th_flags & TH_ACK) || (ntohl(th->th_ack) != (*state)->src.seqhi + 1) || (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) { REASON_SET(reason, PFRES_SYNPROXY); return (PF_DROP); } else if ((*state)->src_node != NULL && pf_src_connlimit(state)) { REASON_SET(reason, PFRES_SRCLIMIT); return (PF_DROP); } else (*state)->src.state = PF_TCPS_PROXY_DST; } if ((*state)->src.state == PF_TCPS_PROXY_DST) { if (direction == (*state)->direction) { if (((th->th_flags & (TH_SYN|TH_ACK)) != TH_ACK) || (ntohl(th->th_ack) != (*state)->src.seqhi + 1) || (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) { REASON_SET(reason, PFRES_SYNPROXY); return (PF_DROP); } (*state)->src.max_win = MAX(ntohs(th->th_win), 1); if ((*state)->dst.seqhi == 1) (*state)->dst.seqhi = htonl(arc4random()); pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, &sk->addr[pd->sidx], &sk->addr[pd->didx], sk->port[pd->sidx], sk->port[pd->didx], (*state)->dst.seqhi, 0, TH_SYN, 0, (*state)->src.mss, 0, 0, (*state)->tag, NULL); REASON_SET(reason, PFRES_SYNPROXY); return (PF_SYNPROXY_DROP); } else if (((th->th_flags & (TH_SYN|TH_ACK)) != (TH_SYN|TH_ACK)) || (ntohl(th->th_ack) != (*state)->dst.seqhi + 1)) { REASON_SET(reason, PFRES_SYNPROXY); return (PF_DROP); } else { (*state)->dst.max_win = MAX(ntohs(th->th_win), 1); (*state)->dst.seqlo = ntohl(th->th_seq); pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst, pd->src, th->th_dport, th->th_sport, ntohl(th->th_ack), ntohl(th->th_seq) + 1, TH_ACK, (*state)->src.max_win, 0, 0, 0, (*state)->tag, NULL); pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, &sk->addr[pd->sidx], &sk->addr[pd->didx], sk->port[pd->sidx], sk->port[pd->didx], (*state)->src.seqhi + 1, (*state)->src.seqlo + 1, TH_ACK, (*state)->dst.max_win, 0, 0, 1, 0, NULL); (*state)->src.seqdiff = (*state)->dst.seqhi - (*state)->src.seqlo; (*state)->dst.seqdiff = (*state)->src.seqhi - (*state)->dst.seqlo; (*state)->src.seqhi = (*state)->src.seqlo + (*state)->dst.max_win; (*state)->dst.seqhi = (*state)->dst.seqlo + (*state)->src.max_win; (*state)->src.wscale = (*state)->dst.wscale = 0; (*state)->src.state = (*state)->dst.state = TCPS_ESTABLISHED; REASON_SET(reason, PFRES_SYNPROXY); return (PF_SYNPROXY_DROP); } } if (((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN) && dst->state >= TCPS_FIN_WAIT_2 && src->state >= TCPS_FIN_WAIT_2) { if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pf: state reuse "); pf_print_state(*state); pf_print_flags(th->th_flags); printf("\n"); } /* XXX make sure it's the same direction ?? */ (*state)->src.state = (*state)->dst.state = TCPS_CLOSED; pf_unlink_state(*state, PF_ENTER_LOCKED); *state = NULL; return (PF_DROP); } if ((*state)->state_flags & PFSTATE_SLOPPY) { if (pf_tcp_track_sloppy(src, dst, state, pd, reason) == PF_DROP) return (PF_DROP); } else { if (pf_tcp_track_full(src, dst, state, kif, m, off, pd, reason, ©back) == PF_DROP) return (PF_DROP); } /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) || nk->port[pd->sidx] != th->th_sport) pf_change_ap(pd->src, &th->th_sport, pd->ip_sum, &th->th_sum, &nk->addr[pd->sidx], nk->port[pd->sidx], 0, pd->af); if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) || nk->port[pd->didx] != th->th_dport) pf_change_ap(pd->dst, &th->th_dport, pd->ip_sum, &th->th_sum, &nk->addr[pd->didx], nk->port[pd->didx], 0, pd->af); copyback = 1; } /* Copyback sequence modulation or stateful scrub changes if needed */ if (copyback) m_copyback(m, off, sizeof(*th), (caddr_t)th); return (PF_PASS); } static int pf_test_state_udp(struct pf_state **state, int direction, struct pfi_kif *kif, struct mbuf *m, int off, void *h, struct pf_pdesc *pd) { struct pf_state_peer *src, *dst; struct pf_state_key_cmp key; struct udphdr *uh = pd->hdr.udp; bzero(&key, sizeof(key)); key.af = pd->af; key.proto = IPPROTO_UDP; if (direction == PF_IN) { /* wire side, straight */ PF_ACPY(&key.addr[0], pd->src, key.af); PF_ACPY(&key.addr[1], pd->dst, key.af); key.port[0] = uh->uh_sport; key.port[1] = uh->uh_dport; } else { /* stack side, reverse */ PF_ACPY(&key.addr[1], pd->src, key.af); PF_ACPY(&key.addr[0], pd->dst, key.af); key.port[1] = uh->uh_sport; key.port[0] = uh->uh_dport; } STATE_LOOKUP(kif, &key, direction, *state, pd); if (direction == (*state)->direction) { src = &(*state)->src; dst = &(*state)->dst; } else { src = &(*state)->dst; dst = &(*state)->src; } /* update states */ if (src->state < PFUDPS_SINGLE) src->state = PFUDPS_SINGLE; if (dst->state == PFUDPS_SINGLE) dst->state = PFUDPS_MULTIPLE; /* update expire time */ (*state)->expire = time_uptime; if (src->state == PFUDPS_MULTIPLE && dst->state == PFUDPS_MULTIPLE) (*state)->timeout = PFTM_UDP_MULTIPLE; else (*state)->timeout = PFTM_UDP_SINGLE; /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) || nk->port[pd->sidx] != uh->uh_sport) pf_change_ap(pd->src, &uh->uh_sport, pd->ip_sum, &uh->uh_sum, &nk->addr[pd->sidx], nk->port[pd->sidx], 1, pd->af); if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) || nk->port[pd->didx] != uh->uh_dport) pf_change_ap(pd->dst, &uh->uh_dport, pd->ip_sum, &uh->uh_sum, &nk->addr[pd->didx], nk->port[pd->didx], 1, pd->af); m_copyback(m, off, sizeof(*uh), (caddr_t)uh); } return (PF_PASS); } static int pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif, struct mbuf *m, int off, void *h, struct pf_pdesc *pd, u_short *reason) { struct pf_addr *saddr = pd->src, *daddr = pd->dst; u_int16_t icmpid = 0, *icmpsum; u_int8_t icmptype; int state_icmp = 0; struct pf_state_key_cmp key; bzero(&key, sizeof(key)); switch (pd->proto) { #ifdef INET case IPPROTO_ICMP: icmptype = pd->hdr.icmp->icmp_type; icmpid = pd->hdr.icmp->icmp_id; icmpsum = &pd->hdr.icmp->icmp_cksum; if (icmptype == ICMP_UNREACH || icmptype == ICMP_SOURCEQUENCH || icmptype == ICMP_REDIRECT || icmptype == ICMP_TIMXCEED || icmptype == ICMP_PARAMPROB) state_icmp++; break; #endif /* INET */ #ifdef INET6 case IPPROTO_ICMPV6: icmptype = pd->hdr.icmp6->icmp6_type; icmpid = pd->hdr.icmp6->icmp6_id; icmpsum = &pd->hdr.icmp6->icmp6_cksum; if (icmptype == ICMP6_DST_UNREACH || icmptype == ICMP6_PACKET_TOO_BIG || icmptype == ICMP6_TIME_EXCEEDED || icmptype == ICMP6_PARAM_PROB) state_icmp++; break; #endif /* INET6 */ } if (!state_icmp) { /* * ICMP query/reply message not related to a TCP/UDP packet. * Search for an ICMP state. */ key.af = pd->af; key.proto = pd->proto; key.port[0] = key.port[1] = icmpid; if (direction == PF_IN) { /* wire side, straight */ PF_ACPY(&key.addr[0], pd->src, key.af); PF_ACPY(&key.addr[1], pd->dst, key.af); } else { /* stack side, reverse */ PF_ACPY(&key.addr[1], pd->src, key.af); PF_ACPY(&key.addr[0], pd->dst, key.af); } STATE_LOOKUP(kif, &key, direction, *state, pd); (*state)->expire = time_uptime; (*state)->timeout = PFTM_ICMP_ERROR_REPLY; /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; switch (pd->af) { #ifdef INET case AF_INET: if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET)) pf_change_a(&saddr->v4.s_addr, pd->ip_sum, nk->addr[pd->sidx].v4.s_addr, 0); if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET)) pf_change_a(&daddr->v4.s_addr, pd->ip_sum, nk->addr[pd->didx].v4.s_addr, 0); if (nk->port[0] != pd->hdr.icmp->icmp_id) { pd->hdr.icmp->icmp_cksum = pf_cksum_fixup( pd->hdr.icmp->icmp_cksum, icmpid, nk->port[pd->sidx], 0); pd->hdr.icmp->icmp_id = nk->port[pd->sidx]; } m_copyback(m, off, ICMP_MINLEN, (caddr_t )pd->hdr.icmp); break; #endif /* INET */ #ifdef INET6 case AF_INET6: if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET6)) pf_change_a6(saddr, &pd->hdr.icmp6->icmp6_cksum, &nk->addr[pd->sidx], 0); if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET6)) pf_change_a6(daddr, &pd->hdr.icmp6->icmp6_cksum, &nk->addr[pd->didx], 0); m_copyback(m, off, sizeof(struct icmp6_hdr), (caddr_t )pd->hdr.icmp6); break; #endif /* INET6 */ } } return (PF_PASS); } else { /* * ICMP error message in response to a TCP/UDP packet. * Extract the inner TCP/UDP header and search for that state. */ struct pf_pdesc pd2; bzero(&pd2, sizeof pd2); #ifdef INET struct ip h2; #endif /* INET */ #ifdef INET6 struct ip6_hdr h2_6; int terminal = 0; #endif /* INET6 */ int ipoff2 = 0; int off2 = 0; pd2.af = pd->af; /* Payload packet is from the opposite direction. */ pd2.sidx = (direction == PF_IN) ? 1 : 0; pd2.didx = (direction == PF_IN) ? 0 : 1; switch (pd->af) { #ifdef INET case AF_INET: /* offset of h2 in mbuf chain */ ipoff2 = off + ICMP_MINLEN; if (!pf_pull_hdr(m, ipoff2, &h2, sizeof(h2), NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMP error message too short " "(ip)\n")); return (PF_DROP); } /* * ICMP error messages don't refer to non-first * fragments */ if (h2.ip_off & htons(IP_OFFMASK)) { REASON_SET(reason, PFRES_FRAG); return (PF_DROP); } /* offset of protocol header that follows h2 */ off2 = ipoff2 + (h2.ip_hl << 2); pd2.proto = h2.ip_p; pd2.src = (struct pf_addr *)&h2.ip_src; pd2.dst = (struct pf_addr *)&h2.ip_dst; pd2.ip_sum = &h2.ip_sum; break; #endif /* INET */ #ifdef INET6 case AF_INET6: ipoff2 = off + sizeof(struct icmp6_hdr); if (!pf_pull_hdr(m, ipoff2, &h2_6, sizeof(h2_6), NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMP error message too short " "(ip6)\n")); return (PF_DROP); } pd2.proto = h2_6.ip6_nxt; pd2.src = (struct pf_addr *)&h2_6.ip6_src; pd2.dst = (struct pf_addr *)&h2_6.ip6_dst; pd2.ip_sum = NULL; off2 = ipoff2 + sizeof(h2_6); do { switch (pd2.proto) { case IPPROTO_FRAGMENT: /* * ICMPv6 error messages for * non-first fragments */ REASON_SET(reason, PFRES_FRAG); return (PF_DROP); case IPPROTO_AH: case IPPROTO_HOPOPTS: case IPPROTO_ROUTING: case IPPROTO_DSTOPTS: { /* get next header and header length */ struct ip6_ext opt6; if (!pf_pull_hdr(m, off2, &opt6, sizeof(opt6), NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMPv6 short opt\n")); return (PF_DROP); } if (pd2.proto == IPPROTO_AH) off2 += (opt6.ip6e_len + 2) * 4; else off2 += (opt6.ip6e_len + 1) * 8; pd2.proto = opt6.ip6e_nxt; /* goto the next header */ break; } default: terminal++; break; } } while (!terminal); break; #endif /* INET6 */ } switch (pd2.proto) { case IPPROTO_TCP: { struct tcphdr th; u_int32_t seq; struct pf_state_peer *src, *dst; u_int8_t dws; int copyback = 0; /* * Only the first 8 bytes of the TCP header can be * expected. Don't access any TCP header fields after * th_seq, an ackskew test is not possible. */ if (!pf_pull_hdr(m, off2, &th, 8, NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMP error message too short " "(tcp)\n")); return (PF_DROP); } key.af = pd2.af; key.proto = IPPROTO_TCP; PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af); PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af); key.port[pd2.sidx] = th.th_sport; key.port[pd2.didx] = th.th_dport; STATE_LOOKUP(kif, &key, direction, *state, pd); if (direction == (*state)->direction) { src = &(*state)->dst; dst = &(*state)->src; } else { src = &(*state)->src; dst = &(*state)->dst; } if (src->wscale && dst->wscale) dws = dst->wscale & PF_WSCALE_MASK; else dws = 0; /* Demodulate sequence number */ seq = ntohl(th.th_seq) - src->seqdiff; if (src->seqdiff) { pf_change_a(&th.th_seq, icmpsum, htonl(seq), 0); copyback = 1; } if (!((*state)->state_flags & PFSTATE_SLOPPY) && (!SEQ_GEQ(src->seqhi, seq) || !SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)))) { if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pf: BAD ICMP %d:%d ", icmptype, pd->hdr.icmp->icmp_code); pf_print_host(pd->src, 0, pd->af); printf(" -> "); pf_print_host(pd->dst, 0, pd->af); printf(" state: "); pf_print_state(*state); printf(" seq=%u\n", seq); } REASON_SET(reason, PFRES_BADSTATE); return (PF_DROP); } else { if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pf: OK ICMP %d:%d ", icmptype, pd->hdr.icmp->icmp_code); pf_print_host(pd->src, 0, pd->af); printf(" -> "); pf_print_host(pd->dst, 0, pd->af); printf(" state: "); pf_print_state(*state); printf(" seq=%u\n", seq); } } /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; if (PF_ANEQ(pd2.src, &nk->addr[pd2.sidx], pd2.af) || nk->port[pd2.sidx] != th.th_sport) pf_change_icmp(pd2.src, &th.th_sport, daddr, &nk->addr[pd2.sidx], nk->port[pd2.sidx], NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, pd2.af); if (PF_ANEQ(pd2.dst, &nk->addr[pd2.didx], pd2.af) || nk->port[pd2.didx] != th.th_dport) pf_change_icmp(pd2.dst, &th.th_dport, NULL, /* XXX Inbound NAT? */ &nk->addr[pd2.didx], nk->port[pd2.didx], NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, pd2.af); copyback = 1; } if (copyback) { switch (pd2.af) { #ifdef INET case AF_INET: m_copyback(m, off, ICMP_MINLEN, (caddr_t )pd->hdr.icmp); m_copyback(m, ipoff2, sizeof(h2), (caddr_t )&h2); break; #endif /* INET */ #ifdef INET6 case AF_INET6: m_copyback(m, off, sizeof(struct icmp6_hdr), (caddr_t )pd->hdr.icmp6); m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t )&h2_6); break; #endif /* INET6 */ } m_copyback(m, off2, 8, (caddr_t)&th); } return (PF_PASS); break; } case IPPROTO_UDP: { struct udphdr uh; if (!pf_pull_hdr(m, off2, &uh, sizeof(uh), NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMP error message too short " "(udp)\n")); return (PF_DROP); } key.af = pd2.af; key.proto = IPPROTO_UDP; PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af); PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af); key.port[pd2.sidx] = uh.uh_sport; key.port[pd2.didx] = uh.uh_dport; STATE_LOOKUP(kif, &key, direction, *state, pd); /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; if (PF_ANEQ(pd2.src, &nk->addr[pd2.sidx], pd2.af) || nk->port[pd2.sidx] != uh.uh_sport) pf_change_icmp(pd2.src, &uh.uh_sport, daddr, &nk->addr[pd2.sidx], nk->port[pd2.sidx], &uh.uh_sum, pd2.ip_sum, icmpsum, pd->ip_sum, 1, pd2.af); if (PF_ANEQ(pd2.dst, &nk->addr[pd2.didx], pd2.af) || nk->port[pd2.didx] != uh.uh_dport) pf_change_icmp(pd2.dst, &uh.uh_dport, NULL, /* XXX Inbound NAT? */ &nk->addr[pd2.didx], nk->port[pd2.didx], &uh.uh_sum, pd2.ip_sum, icmpsum, pd->ip_sum, 1, pd2.af); switch (pd2.af) { #ifdef INET case AF_INET: m_copyback(m, off, ICMP_MINLEN, (caddr_t )pd->hdr.icmp); m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2); break; #endif /* INET */ #ifdef INET6 case AF_INET6: m_copyback(m, off, sizeof(struct icmp6_hdr), (caddr_t )pd->hdr.icmp6); m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t )&h2_6); break; #endif /* INET6 */ } m_copyback(m, off2, sizeof(uh), (caddr_t)&uh); } return (PF_PASS); break; } #ifdef INET case IPPROTO_ICMP: { struct icmp iih; if (!pf_pull_hdr(m, off2, &iih, ICMP_MINLEN, NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMP error message too short i" "(icmp)\n")); return (PF_DROP); } key.af = pd2.af; key.proto = IPPROTO_ICMP; PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af); PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af); key.port[0] = key.port[1] = iih.icmp_id; STATE_LOOKUP(kif, &key, direction, *state, pd); /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; if (PF_ANEQ(pd2.src, &nk->addr[pd2.sidx], pd2.af) || nk->port[pd2.sidx] != iih.icmp_id) pf_change_icmp(pd2.src, &iih.icmp_id, daddr, &nk->addr[pd2.sidx], nk->port[pd2.sidx], NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, AF_INET); if (PF_ANEQ(pd2.dst, &nk->addr[pd2.didx], pd2.af) || nk->port[pd2.didx] != iih.icmp_id) pf_change_icmp(pd2.dst, &iih.icmp_id, NULL, /* XXX Inbound NAT? */ &nk->addr[pd2.didx], nk->port[pd2.didx], NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, AF_INET); m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp); m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2); m_copyback(m, off2, ICMP_MINLEN, (caddr_t)&iih); } return (PF_PASS); break; } #endif /* INET */ #ifdef INET6 case IPPROTO_ICMPV6: { struct icmp6_hdr iih; if (!pf_pull_hdr(m, off2, &iih, sizeof(struct icmp6_hdr), NULL, reason, pd2.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: ICMP error message too short " "(icmp6)\n")); return (PF_DROP); } key.af = pd2.af; key.proto = IPPROTO_ICMPV6; PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af); PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af); key.port[0] = key.port[1] = iih.icmp6_id; STATE_LOOKUP(kif, &key, direction, *state, pd); /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; if (PF_ANEQ(pd2.src, &nk->addr[pd2.sidx], pd2.af) || nk->port[pd2.sidx] != iih.icmp6_id) pf_change_icmp(pd2.src, &iih.icmp6_id, daddr, &nk->addr[pd2.sidx], nk->port[pd2.sidx], NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, AF_INET6); if (PF_ANEQ(pd2.dst, &nk->addr[pd2.didx], pd2.af) || nk->port[pd2.didx] != iih.icmp6_id) pf_change_icmp(pd2.dst, &iih.icmp6_id, NULL, /* XXX Inbound NAT? */ &nk->addr[pd2.didx], nk->port[pd2.didx], NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, AF_INET6); m_copyback(m, off, sizeof(struct icmp6_hdr), (caddr_t)pd->hdr.icmp6); m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t)&h2_6); m_copyback(m, off2, sizeof(struct icmp6_hdr), (caddr_t)&iih); } return (PF_PASS); break; } #endif /* INET6 */ default: { key.af = pd2.af; key.proto = pd2.proto; PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af); PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af); key.port[0] = key.port[1] = 0; STATE_LOOKUP(kif, &key, direction, *state, pd); /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; if (PF_ANEQ(pd2.src, &nk->addr[pd2.sidx], pd2.af)) pf_change_icmp(pd2.src, NULL, daddr, &nk->addr[pd2.sidx], 0, NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, pd2.af); if (PF_ANEQ(pd2.dst, &nk->addr[pd2.didx], pd2.af)) pf_change_icmp(pd2.src, NULL, NULL, /* XXX Inbound NAT? */ &nk->addr[pd2.didx], 0, NULL, pd2.ip_sum, icmpsum, pd->ip_sum, 0, pd2.af); switch (pd2.af) { #ifdef INET case AF_INET: m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp); m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2); break; #endif /* INET */ #ifdef INET6 case AF_INET6: m_copyback(m, off, sizeof(struct icmp6_hdr), (caddr_t )pd->hdr.icmp6); m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t )&h2_6); break; #endif /* INET6 */ } } return (PF_PASS); break; } } } } static int pf_test_state_other(struct pf_state **state, int direction, struct pfi_kif *kif, struct mbuf *m, struct pf_pdesc *pd) { struct pf_state_peer *src, *dst; struct pf_state_key_cmp key; bzero(&key, sizeof(key)); key.af = pd->af; key.proto = pd->proto; if (direction == PF_IN) { PF_ACPY(&key.addr[0], pd->src, key.af); PF_ACPY(&key.addr[1], pd->dst, key.af); key.port[0] = key.port[1] = 0; } else { PF_ACPY(&key.addr[1], pd->src, key.af); PF_ACPY(&key.addr[0], pd->dst, key.af); key.port[1] = key.port[0] = 0; } STATE_LOOKUP(kif, &key, direction, *state, pd); if (direction == (*state)->direction) { src = &(*state)->src; dst = &(*state)->dst; } else { src = &(*state)->dst; dst = &(*state)->src; } /* update states */ if (src->state < PFOTHERS_SINGLE) src->state = PFOTHERS_SINGLE; if (dst->state == PFOTHERS_SINGLE) dst->state = PFOTHERS_MULTIPLE; /* update expire time */ (*state)->expire = time_uptime; if (src->state == PFOTHERS_MULTIPLE && dst->state == PFOTHERS_MULTIPLE) (*state)->timeout = PFTM_OTHER_MULTIPLE; else (*state)->timeout = PFTM_OTHER_SINGLE; /* translate source/destination address, if necessary */ if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) { struct pf_state_key *nk = (*state)->key[pd->didx]; KASSERT(nk, ("%s: nk is null", __func__)); KASSERT(pd, ("%s: pd is null", __func__)); KASSERT(pd->src, ("%s: pd->src is null", __func__)); KASSERT(pd->dst, ("%s: pd->dst is null", __func__)); switch (pd->af) { #ifdef INET case AF_INET: if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET)) pf_change_a(&pd->src->v4.s_addr, pd->ip_sum, nk->addr[pd->sidx].v4.s_addr, 0); if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET)) pf_change_a(&pd->dst->v4.s_addr, pd->ip_sum, nk->addr[pd->didx].v4.s_addr, 0); break; #endif /* INET */ #ifdef INET6 case AF_INET6: if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET)) PF_ACPY(pd->src, &nk->addr[pd->sidx], pd->af); if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET)) PF_ACPY(pd->dst, &nk->addr[pd->didx], pd->af); #endif /* INET6 */ } } return (PF_PASS); } /* * ipoff and off are measured from the start of the mbuf chain. * h must be at "ipoff" on the mbuf chain. */ void * pf_pull_hdr(struct mbuf *m, int off, void *p, int len, u_short *actionp, u_short *reasonp, sa_family_t af) { switch (af) { #ifdef INET case AF_INET: { struct ip *h = mtod(m, struct ip *); u_int16_t fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3; if (fragoff) { if (fragoff >= len) ACTION_SET(actionp, PF_PASS); else { ACTION_SET(actionp, PF_DROP); REASON_SET(reasonp, PFRES_FRAG); } return (NULL); } if (m->m_pkthdr.len < off + len || ntohs(h->ip_len) < off + len) { ACTION_SET(actionp, PF_DROP); REASON_SET(reasonp, PFRES_SHORT); return (NULL); } break; } #endif /* INET */ #ifdef INET6 case AF_INET6: { struct ip6_hdr *h = mtod(m, struct ip6_hdr *); if (m->m_pkthdr.len < off + len || (ntohs(h->ip6_plen) + sizeof(struct ip6_hdr)) < (unsigned)(off + len)) { ACTION_SET(actionp, PF_DROP); REASON_SET(reasonp, PFRES_SHORT); return (NULL); } break; } #endif /* INET6 */ } m_copydata(m, off, len, p); return (p); } int pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif, int rtableid) { #ifdef RADIX_MPATH struct radix_node_head *rnh; #endif struct sockaddr_in *dst; int ret = 1; int check_mpath; #ifdef INET6 struct sockaddr_in6 *dst6; struct route_in6 ro; #else struct route ro; #endif struct radix_node *rn; struct rtentry *rt; struct ifnet *ifp; check_mpath = 0; #ifdef RADIX_MPATH /* XXX: stick to table 0 for now */ rnh = rt_tables_get_rnh(0, af); if (rnh != NULL && rn_mpath_capable(rnh)) check_mpath = 1; #endif bzero(&ro, sizeof(ro)); switch (af) { case AF_INET: dst = satosin(&ro.ro_dst); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = addr->v4; break; #ifdef INET6 case AF_INET6: /* * Skip check for addresses with embedded interface scope, * as they would always match anyway. */ if (IN6_IS_SCOPE_EMBED(&addr->v6)) goto out; dst6 = (struct sockaddr_in6 *)&ro.ro_dst; dst6->sin6_family = AF_INET6; dst6->sin6_len = sizeof(*dst6); dst6->sin6_addr = addr->v6; break; #endif /* INET6 */ default: return (0); } /* Skip checks for ipsec interfaces */ if (kif != NULL && kif->pfik_ifp->if_type == IFT_ENC) goto out; switch (af) { #ifdef INET6 case AF_INET6: in6_rtalloc_ign(&ro, 0, rtableid); break; #endif #ifdef INET case AF_INET: in_rtalloc_ign((struct route *)&ro, 0, rtableid); break; #endif default: rtalloc_ign((struct route *)&ro, 0); /* No/default FIB. */ break; } if (ro.ro_rt != NULL) { /* No interface given, this is a no-route check */ if (kif == NULL) goto out; if (kif->pfik_ifp == NULL) { ret = 0; goto out; } /* Perform uRPF check if passed input interface */ ret = 0; rn = (struct radix_node *)ro.ro_rt; do { rt = (struct rtentry *)rn; ifp = rt->rt_ifp; if (kif->pfik_ifp == ifp) ret = 1; #ifdef RADIX_MPATH rn = rn_mpath_next(rn); #endif } while (check_mpath == 1 && rn != NULL && ret == 0); } else ret = 0; out: if (ro.ro_rt != NULL) RTFREE(ro.ro_rt); return (ret); } #ifdef INET static void pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp, struct pf_state *s, struct pf_pdesc *pd) { struct mbuf *m0, *m1; struct sockaddr_in dst; struct ip *ip; struct ifnet *ifp = NULL; struct pf_addr naddr; struct pf_src_node *sn = NULL; int error = 0; uint16_t ip_len, ip_off; KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__)); KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: invalid direction", __func__)); if ((pd->pf_mtag == NULL && ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) || pd->pf_mtag->routed++ > 3) { m0 = *m; *m = NULL; goto bad_locked; } if (r->rt == PF_DUPTO) { if ((m0 = m_dup(*m, M_NOWAIT)) == NULL) { if (s) PF_STATE_UNLOCK(s); return; } } else { if ((r->rt == PF_REPLYTO) == (r->direction == dir)) { if (s) PF_STATE_UNLOCK(s); return; } m0 = *m; } ip = mtod(m0, struct ip *); bzero(&dst, sizeof(dst)); dst.sin_family = AF_INET; dst.sin_len = sizeof(dst); dst.sin_addr = ip->ip_dst; if (r->rt == PF_FASTROUTE) { struct rtentry *rt; if (s) PF_STATE_UNLOCK(s); rt = rtalloc1_fib(sintosa(&dst), 0, 0, M_GETFIB(m0)); if (rt == NULL) { KMOD_IPSTAT_INC(ips_noroute); error = EHOSTUNREACH; goto bad; } ifp = rt->rt_ifp; rt->rt_rmx.rmx_pksent++; if (rt->rt_flags & RTF_GATEWAY) bcopy(satosin(rt->rt_gateway), &dst, sizeof(dst)); RTFREE_LOCKED(rt); } else { if (TAILQ_EMPTY(&r->rpool.list)) { DPFPRINTF(PF_DEBUG_URGENT, ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__)); goto bad_locked; } if (s == NULL) { pf_map_addr(AF_INET, r, (struct pf_addr *)&ip->ip_src, &naddr, NULL, &sn); if (!PF_AZERO(&naddr, AF_INET)) dst.sin_addr.s_addr = naddr.v4.s_addr; ifp = r->rpool.cur->kif ? r->rpool.cur->kif->pfik_ifp : NULL; } else { if (!PF_AZERO(&s->rt_addr, AF_INET)) dst.sin_addr.s_addr = s->rt_addr.v4.s_addr; ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL; PF_STATE_UNLOCK(s); } } if (ifp == NULL) goto bad; if (oifp != ifp) { if (pf_test(PF_OUT, ifp, &m0, NULL) != PF_PASS) goto bad; else if (m0 == NULL) goto done; if (m0->m_len < sizeof(struct ip)) { DPFPRINTF(PF_DEBUG_URGENT, ("%s: m0->m_len < sizeof(struct ip)\n", __func__)); goto bad; } ip = mtod(m0, struct ip *); } if (ifp->if_flags & IFF_LOOPBACK) m0->m_flags |= M_SKIP_FIREWALL; ip_len = ntohs(ip->ip_len); ip_off = ntohs(ip->ip_off); /* Copied from FreeBSD 10.0-CURRENT ip_output. */ m0->m_pkthdr.csum_flags |= CSUM_IP; if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) { in_delayed_cksum(m0); m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } #ifdef SCTP if (m0->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) { sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2)); m0->m_pkthdr.csum_flags &= ~CSUM_SCTP; } #endif /* * If small enough for interface, or the interface will take * care of the fragmentation for us, we can just send directly. */ if (ip_len <= ifp->if_mtu || (m0->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 || ((ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) { ip->ip_sum = 0; if (m0->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) { ip->ip_sum = in_cksum(m0, ip->ip_hl << 2); m0->m_pkthdr.csum_flags &= ~CSUM_IP; } m_clrprotoflags(m0); /* Avoid confusing lower layers. */ error = (*ifp->if_output)(ifp, m0, sintosa(&dst), NULL); goto done; } /* Balk when DF bit is set or the interface didn't support TSO. */ if ((ip_off & IP_DF) || (m0->m_pkthdr.csum_flags & CSUM_TSO)) { error = EMSGSIZE; KMOD_IPSTAT_INC(ips_cantfrag); if (r->rt != PF_DUPTO) { icmp_error(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0, ifp->if_mtu); goto done; } else goto bad; } error = ip_fragment(ip, &m0, ifp->if_mtu, ifp->if_hwassist); if (error) goto bad; for (; m0; m0 = m1) { m1 = m0->m_nextpkt; m0->m_nextpkt = NULL; if (error == 0) { m_clrprotoflags(m0); error = (*ifp->if_output)(ifp, m0, sintosa(&dst), NULL); } else m_freem(m0); } if (error == 0) KMOD_IPSTAT_INC(ips_fragmented); done: if (r->rt != PF_DUPTO) *m = NULL; return; bad_locked: if (s) PF_STATE_UNLOCK(s); bad: m_freem(m0); goto done; } #endif /* INET */ #ifdef INET6 static void pf_route6(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp, struct pf_state *s, struct pf_pdesc *pd) { struct mbuf *m0; struct sockaddr_in6 dst; struct ip6_hdr *ip6; struct ifnet *ifp = NULL; struct pf_addr naddr; struct pf_src_node *sn = NULL; KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__)); KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: invalid direction", __func__)); if ((pd->pf_mtag == NULL && ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) || pd->pf_mtag->routed++ > 3) { m0 = *m; *m = NULL; goto bad_locked; } if (r->rt == PF_DUPTO) { if ((m0 = m_dup(*m, M_NOWAIT)) == NULL) { if (s) PF_STATE_UNLOCK(s); return; } } else { if ((r->rt == PF_REPLYTO) == (r->direction == dir)) { if (s) PF_STATE_UNLOCK(s); return; } m0 = *m; } ip6 = mtod(m0, struct ip6_hdr *); bzero(&dst, sizeof(dst)); dst.sin6_family = AF_INET6; dst.sin6_len = sizeof(dst); dst.sin6_addr = ip6->ip6_dst; /* Cheat. XXX why only in the v6 case??? */ if (r->rt == PF_FASTROUTE) { if (s) PF_STATE_UNLOCK(s); m0->m_flags |= M_SKIP_FIREWALL; ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL); return; } if (TAILQ_EMPTY(&r->rpool.list)) { DPFPRINTF(PF_DEBUG_URGENT, ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__)); goto bad_locked; } if (s == NULL) { pf_map_addr(AF_INET6, r, (struct pf_addr *)&ip6->ip6_src, &naddr, NULL, &sn); if (!PF_AZERO(&naddr, AF_INET6)) PF_ACPY((struct pf_addr *)&dst.sin6_addr, &naddr, AF_INET6); ifp = r->rpool.cur->kif ? r->rpool.cur->kif->pfik_ifp : NULL; } else { if (!PF_AZERO(&s->rt_addr, AF_INET6)) PF_ACPY((struct pf_addr *)&dst.sin6_addr, &s->rt_addr, AF_INET6); ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL; } if (s) PF_STATE_UNLOCK(s); if (ifp == NULL) goto bad; if (oifp != ifp) { if (pf_test6(PF_OUT, ifp, &m0, NULL) != PF_PASS) goto bad; else if (m0 == NULL) goto done; if (m0->m_len < sizeof(struct ip6_hdr)) { DPFPRINTF(PF_DEBUG_URGENT, ("%s: m0->m_len < sizeof(struct ip6_hdr)\n", __func__)); goto bad; } ip6 = mtod(m0, struct ip6_hdr *); } if (ifp->if_flags & IFF_LOOPBACK) m0->m_flags |= M_SKIP_FIREWALL; /* * If the packet is too large for the outgoing interface, * send back an icmp6 error. */ if (IN6_IS_SCOPE_EMBED(&dst.sin6_addr)) dst.sin6_addr.s6_addr16[1] = htons(ifp->if_index); if ((u_long)m0->m_pkthdr.len <= ifp->if_mtu) nd6_output(ifp, ifp, m0, &dst, NULL); else { in6_ifstat_inc(ifp, ifs6_in_toobig); if (r->rt != PF_DUPTO) icmp6_error(m0, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu); else goto bad; } done: if (r->rt != PF_DUPTO) *m = NULL; return; bad_locked: if (s) PF_STATE_UNLOCK(s); bad: m_freem(m0); goto done; } #endif /* INET6 */ /* * FreeBSD supports cksum offloads for the following drivers. * em(4), fxp(4), ixgb(4), lge(4), ndis(4), nge(4), re(4), * ti(4), txp(4), xl(4) * * CSUM_DATA_VALID | CSUM_PSEUDO_HDR : * network driver performed cksum including pseudo header, need to verify * csum_data * CSUM_DATA_VALID : * network driver performed cksum, needs to additional pseudo header * cksum computation with partial csum_data(i.e. lack of H/W support for * pseudo header, for instance hme(4), sk(4) and possibly gem(4)) * * After validating the cksum of packet, set both flag CSUM_DATA_VALID and * CSUM_PSEUDO_HDR in order to avoid recomputation of the cksum in upper * TCP/UDP layer. * Also, set csum_data to 0xffff to force cksum validation. */ static int pf_check_proto_cksum(struct mbuf *m, int off, int len, u_int8_t p, sa_family_t af) { u_int16_t sum = 0; int hw_assist = 0; struct ip *ip; if (off < sizeof(struct ip) || len < sizeof(struct udphdr)) return (1); if (m->m_pkthdr.len < off + len) return (1); switch (p) { case IPPROTO_TCP: if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) { sum = m->m_pkthdr.csum_data; } else { ip = mtod(m, struct ip *); sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl((u_short)len + m->m_pkthdr.csum_data + IPPROTO_TCP)); } sum ^= 0xffff; ++hw_assist; } break; case IPPROTO_UDP: if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) { sum = m->m_pkthdr.csum_data; } else { ip = mtod(m, struct ip *); sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl((u_short)len + m->m_pkthdr.csum_data + IPPROTO_UDP)); } sum ^= 0xffff; ++hw_assist; } break; case IPPROTO_ICMP: #ifdef INET6 case IPPROTO_ICMPV6: #endif /* INET6 */ break; default: return (1); } if (!hw_assist) { switch (af) { case AF_INET: if (p == IPPROTO_ICMP) { if (m->m_len < off) return (1); m->m_data += off; m->m_len -= off; sum = in_cksum(m, len); m->m_data -= off; m->m_len += off; } else { if (m->m_len < sizeof(struct ip)) return (1); sum = in4_cksum(m, p, off, len); } break; #ifdef INET6 case AF_INET6: if (m->m_len < sizeof(struct ip6_hdr)) return (1); sum = in6_cksum(m, p, off, len); break; #endif /* INET6 */ default: return (1); } } if (sum) { switch (p) { case IPPROTO_TCP: { KMOD_TCPSTAT_INC(tcps_rcvbadsum); break; } case IPPROTO_UDP: { KMOD_UDPSTAT_INC(udps_badsum); break; } #ifdef INET case IPPROTO_ICMP: { KMOD_ICMPSTAT_INC(icps_checksum); break; } #endif #ifdef INET6 case IPPROTO_ICMPV6: { KMOD_ICMP6STAT_INC(icp6s_checksum); break; } #endif /* INET6 */ } return (1); } else { if (p == IPPROTO_TCP || p == IPPROTO_UDP) { m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m->m_pkthdr.csum_data = 0xffff; } } return (0); } #ifdef INET int pf_test(int dir, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp) { struct pfi_kif *kif; u_short action, reason = 0, log = 0; struct mbuf *m = *m0; struct ip *h = NULL; struct m_tag *ipfwtag; struct pf_rule *a = NULL, *r = &V_pf_default_rule, *tr, *nr; struct pf_state *s = NULL; struct pf_ruleset *ruleset = NULL; struct pf_pdesc pd; int off, dirndx, pqid = 0; M_ASSERTPKTHDR(m); if (!V_pf_status.running) return (PF_PASS); memset(&pd, 0, sizeof(pd)); kif = (struct pfi_kif *)ifp->if_pf_kif; if (kif == NULL) { DPFPRINTF(PF_DEBUG_URGENT, ("pf_test: kif == NULL, if_xname %s\n", ifp->if_xname)); return (PF_DROP); } if (kif->pfik_flags & PFI_IFLAG_SKIP) return (PF_PASS); if (m->m_flags & M_SKIP_FIREWALL) return (PF_PASS); pd.pf_mtag = pf_find_mtag(m); PF_RULES_RLOCK(); if (ip_divert_ptr != NULL && ((ipfwtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL)) != NULL)) { struct ipfw_rule_ref *rr = (struct ipfw_rule_ref *)(ipfwtag+1); if (rr->info & IPFW_IS_DIVERT && rr->rulenum == 0) { if (pd.pf_mtag == NULL && ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) { action = PF_DROP; goto done; } pd.pf_mtag->flags |= PF_PACKET_LOOPED; m_tag_delete(m, ipfwtag); } if (pd.pf_mtag && pd.pf_mtag->flags & PF_FASTFWD_OURS_PRESENT) { m->m_flags |= M_FASTFWD_OURS; pd.pf_mtag->flags &= ~PF_FASTFWD_OURS_PRESENT; } } else if (pf_normalize_ip(m0, dir, kif, &reason, &pd) != PF_PASS) { /* We do IP header normalization and packet reassembly here */ action = PF_DROP; goto done; } m = *m0; /* pf_normalize messes with m0 */ h = mtod(m, struct ip *); off = h->ip_hl << 2; if (off < (int)sizeof(struct ip)) { action = PF_DROP; REASON_SET(&reason, PFRES_SHORT); log = 1; goto done; } pd.src = (struct pf_addr *)&h->ip_src; pd.dst = (struct pf_addr *)&h->ip_dst; pd.sport = pd.dport = NULL; pd.ip_sum = &h->ip_sum; pd.proto_sum = NULL; pd.proto = h->ip_p; pd.dir = dir; pd.sidx = (dir == PF_IN) ? 0 : 1; pd.didx = (dir == PF_IN) ? 1 : 0; pd.af = AF_INET; pd.tos = h->ip_tos; pd.tot_len = ntohs(h->ip_len); /* handle fragments that didn't get reassembled by normalization */ if (h->ip_off & htons(IP_MF | IP_OFFMASK)) { action = pf_test_fragment(&r, dir, kif, m, h, &pd, &a, &ruleset); goto done; } switch (h->ip_p) { case IPPROTO_TCP: { struct tcphdr th; pd.hdr.tcp = &th; if (!pf_pull_hdr(m, off, &th, sizeof(th), &action, &reason, AF_INET)) { log = action != PF_PASS; goto done; } pd.p_len = pd.tot_len - off - (th.th_off << 2); if ((th.th_flags & TH_ACK) && pd.p_len == 0) pqid = 1; action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd); if (action == PF_DROP) goto done; action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd, &reason); if (action == PF_PASS) { if (pfsync_update_state_ptr != NULL) pfsync_update_state_ptr(s); r = s->rule.ptr; a = s->anchor.ptr; log = s->log; } else if (s == NULL) action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, &a, &ruleset, inp); break; } case IPPROTO_UDP: { struct udphdr uh; pd.hdr.udp = &uh; if (!pf_pull_hdr(m, off, &uh, sizeof(uh), &action, &reason, AF_INET)) { log = action != PF_PASS; goto done; } if (uh.uh_dport == 0 || ntohs(uh.uh_ulen) > m->m_pkthdr.len - off || ntohs(uh.uh_ulen) < sizeof(struct udphdr)) { action = PF_DROP; REASON_SET(&reason, PFRES_SHORT); goto done; } action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd); if (action == PF_PASS) { if (pfsync_update_state_ptr != NULL) pfsync_update_state_ptr(s); r = s->rule.ptr; a = s->anchor.ptr; log = s->log; } else if (s == NULL) action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, &a, &ruleset, inp); break; } case IPPROTO_ICMP: { struct icmp ih; pd.hdr.icmp = &ih; if (!pf_pull_hdr(m, off, &ih, ICMP_MINLEN, &action, &reason, AF_INET)) { log = action != PF_PASS; goto done; } action = pf_test_state_icmp(&s, dir, kif, m, off, h, &pd, &reason); if (action == PF_PASS) { if (pfsync_update_state_ptr != NULL) pfsync_update_state_ptr(s); r = s->rule.ptr; a = s->anchor.ptr; log = s->log; } else if (s == NULL) action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, &a, &ruleset, inp); break; } #ifdef INET6 case IPPROTO_ICMPV6: { action = PF_DROP; DPFPRINTF(PF_DEBUG_MISC, ("pf: dropping IPv4 packet with ICMPv6 payload\n")); goto done; } #endif default: action = pf_test_state_other(&s, dir, kif, m, &pd); if (action == PF_PASS) { if (pfsync_update_state_ptr != NULL) pfsync_update_state_ptr(s); r = s->rule.ptr; a = s->anchor.ptr; log = s->log; } else if (s == NULL) action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, &a, &ruleset, inp); break; } done: PF_RULES_RUNLOCK(); if (action == PF_PASS && h->ip_hl > 5 && !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) { action = PF_DROP; REASON_SET(&reason, PFRES_IPOPTIONS); log = 1; DPFPRINTF(PF_DEBUG_MISC, ("pf: dropping packet with ip options\n")); } if (s && s->tag > 0 && pf_tag_packet(m, &pd, s->tag)) { action = PF_DROP; REASON_SET(&reason, PFRES_MEMORY); } if (r->rtableid >= 0) M_SETFIB(m, r->rtableid); #ifdef ALTQ if (action == PF_PASS && r->qid) { if (pd.pf_mtag == NULL && ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) { action = PF_DROP; REASON_SET(&reason, PFRES_MEMORY); } if (pqid || (pd.tos & IPTOS_LOWDELAY)) pd.pf_mtag->qid = r->pqid; else pd.pf_mtag->qid = r->qid; /* add hints for ecn */ pd.pf_mtag->hdr = h; } #endif /* ALTQ */ /* * connections redirected to loopback should not match sockets * bound specifically to loopback due to security implications, * see tcp_input() and in_pcblookup_listen(). */ if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP || pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL && (s->nat_rule.ptr->action == PF_RDR || s->nat_rule.ptr->action == PF_BINAT) && (ntohl(pd.dst->v4.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) m->m_flags |= M_SKIP_FIREWALL; if (action == PF_PASS && r->divert.port && ip_divert_ptr != NULL && !PACKET_LOOPED(&pd)) { ipfwtag = m_tag_alloc(MTAG_IPFW_RULE, 0, sizeof(struct ipfw_rule_ref), M_NOWAIT | M_ZERO); if (ipfwtag != NULL) { ((struct ipfw_rule_ref *)(ipfwtag+1))->info = ntohs(r->divert.port); ((struct ipfw_rule_ref *)(ipfwtag+1))->rulenum = dir; if (s) PF_STATE_UNLOCK(s); m_tag_prepend(m, ipfwtag); if (m->m_flags & M_FASTFWD_OURS) { if (pd.pf_mtag == NULL && ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) { action = PF_DROP; REASON_SET(&reason, PFRES_MEMORY); log = 1; DPFPRINTF(PF_DEBUG_MISC, ("pf: failed to allocate tag\n")); } pd.pf_mtag->flags |= PF_FASTFWD_OURS_PRESENT; m->m_flags &= ~M_FASTFWD_OURS; } ip_divert_ptr(*m0, dir == PF_IN ? DIR_IN : DIR_OUT); *m0 = NULL; return (action); } else { /* XXX: ipfw has the same behaviour! */ action = PF_DROP; REASON_SET(&reason, PFRES_MEMORY); log = 1; DPFPRINTF(PF_DEBUG_MISC, ("pf: failed to allocate divert tag\n")); } } if (log) { struct pf_rule *lr; if (s != NULL && s->nat_rule.ptr != NULL && s->nat_rule.ptr->log & PF_LOG_ALL) lr = s->nat_rule.ptr; else lr = r; PFLOG_PACKET(kif, m, AF_INET, dir, reason, lr, a, ruleset, &pd, (s == NULL)); } kif->pfik_bytes[0][dir == PF_OUT][action != PF_PASS] += pd.tot_len; kif->pfik_packets[0][dir == PF_OUT][action != PF_PASS]++; if (action == PF_PASS || r->action == PF_DROP) { dirndx = (dir == PF_OUT); r->packets[dirndx]++; r->bytes[dirndx] += pd.tot_len; if (a != NULL) { a->packets[dirndx]++; a->bytes[dirndx] += pd.tot_len; } if (s != NULL) { if (s->nat_rule.ptr != NULL) { s->nat_rule.ptr->packets[dirndx]++; s->nat_rule.ptr->bytes[dirndx] += pd.tot_len; } if (s->src_node != NULL) { s->src_node->packets[dirndx]++; s->src_node->bytes[dirndx] += pd.tot_len; } if (s->nat_src_node != NULL) { s->nat_src_node->packets[dirndx]++; s->nat_src_node->bytes[dirndx] += pd.tot_len; } dirndx = (dir == s->direction) ? 0 : 1; s->packets[dirndx]++; s->bytes[dirndx] += pd.tot_len; } tr = r; nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule; if (nr != NULL && r == &V_pf_default_rule) tr = nr; if (tr->src.addr.type == PF_ADDR_TABLE) pfr_update_stats(tr->src.addr.p.tbl, (s == NULL) ? pd.src : &s->key[(s->direction == PF_IN)]-> addr[(s->direction == PF_OUT)], pd.af, pd.tot_len, dir == PF_OUT, r->action == PF_PASS, tr->src.neg); if (tr->dst.addr.type == PF_ADDR_TABLE) pfr_update_stats(tr->dst.addr.p.tbl, (s == NULL) ? pd.dst : &s->key[(s->direction == PF_IN)]-> addr[(s->direction == PF_IN)], pd.af, pd.tot_len, dir == PF_OUT, r->action == PF_PASS, tr->dst.neg); } switch (action) { case PF_SYNPROXY_DROP: m_freem(*m0); case PF_DEFER: *m0 = NULL; action = PF_PASS; break; default: /* pf_route() returns unlocked. */ if (r->rt) { pf_route(m0, r, dir, kif->pfik_ifp, s, &pd); return (action); } break; } if (s) PF_STATE_UNLOCK(s); return (action); } #endif /* INET */ #ifdef INET6 int pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp) { struct pfi_kif *kif; u_short action, reason = 0, log = 0; struct mbuf *m = *m0, *n = NULL; struct ip6_hdr *h = NULL; struct pf_rule *a = NULL, *r = &V_pf_default_rule, *tr, *nr; struct pf_state *s = NULL; struct pf_ruleset *ruleset = NULL; struct pf_pdesc pd; int off, terminal = 0, dirndx, rh_cnt = 0; M_ASSERTPKTHDR(m); if (!V_pf_status.running) return (PF_PASS); memset(&pd, 0, sizeof(pd)); pd.pf_mtag = pf_find_mtag(m); if (pd.pf_mtag && pd.pf_mtag->flags & PF_TAG_GENERATED) return (PF_PASS); kif = (struct pfi_kif *)ifp->if_pf_kif; if (kif == NULL) { DPFPRINTF(PF_DEBUG_URGENT, ("pf_test6: kif == NULL, if_xname %s\n", ifp->if_xname)); return (PF_DROP); } if (kif->pfik_flags & PFI_IFLAG_SKIP) return (PF_PASS); PF_RULES_RLOCK(); /* We do IP header normalization and packet reassembly here */ if (pf_normalize_ip6(m0, dir, kif, &reason, &pd) != PF_PASS) { action = PF_DROP; goto done; } m = *m0; /* pf_normalize messes with m0 */ h = mtod(m, struct ip6_hdr *); #if 1 /* * we do not support jumbogram yet. if we keep going, zero ip6_plen * will do something bad, so drop the packet for now. */ if (htons(h->ip6_plen) == 0) { action = PF_DROP; REASON_SET(&reason, PFRES_NORM); /*XXX*/ goto done; } #endif pd.src = (struct pf_addr *)&h->ip6_src; pd.dst = (struct pf_addr *)&h->ip6_dst; pd.sport = pd.dport = NULL; pd.ip_sum = NULL; pd.proto_sum = NULL; pd.dir = dir; pd.sidx = (dir == PF_IN) ? 0 : 1; pd.didx = (dir == PF_IN) ? 1 : 0; pd.af = AF_INET6; pd.tos = 0; pd.tot_len = ntohs(h->ip6_plen) + sizeof(struct ip6_hdr); off = ((caddr_t)h - m->m_data) + sizeof(struct ip6_hdr); pd.proto = h->ip6_nxt; do { switch (pd.proto) { case IPPROTO_FRAGMENT: action = pf_test_fragment(&r, dir, kif, m, h, &pd, &a, &ruleset); if (action == PF_DROP) REASON_SET(&reason, PFRES_FRAG); goto done; case IPPROTO_ROUTING: { struct ip6_rthdr rthdr; if (rh_cnt++) { DPFPRINTF(PF_DEBUG_MISC, ("pf: IPv6 more than one rthdr\n")); action = PF_DROP; REASON_SET(&reason, PFRES_IPOPTIONS); log = 1; goto done; } if (!pf_pull_hdr(m, off, &rthdr, sizeof(rthdr), NULL, &reason, pd.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: IPv6 short rthdr\n")); action = PF_DROP; REASON_SET(&reason, PFRES_SHORT); log = 1; goto done; } if (rthdr.ip6r_type == IPV6_RTHDR_TYPE_0) { DPFPRINTF(PF_DEBUG_MISC, ("pf: IPv6 rthdr0\n")); action = PF_DROP; REASON_SET(&reason, PFRES_IPOPTIONS); log = 1; goto done; } /* FALLTHROUGH */ } case IPPROTO_AH: case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: { /* get next header and header length */ struct ip6_ext opt6; if (!pf_pull_hdr(m, off, &opt6, sizeof(opt6), NULL, &reason, pd.af)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: IPv6 short opt\n")); action = PF_DROP; log = 1; goto done; } if (pd.proto == IPPROTO_AH) off += (opt6.ip6e_len + 2) * 4; else off += (opt6.ip6e_len + 1) * 8; pd.proto = opt6.ip6e_nxt; /* goto the next header */ break; } default: terminal++; break; } } while (!terminal); /* if there's no routing header, use unmodified mbuf for checksumming */ if (!n) n = m; switch (pd.proto) { case IPPROTO_TCP: { struct tcphdr th; pd.hdr.tcp = &th; if (!pf_pull_hdr(m, off, &th, sizeof(th), &action, &reason, AF_INET6)) { log = action != PF_PASS; goto done; } pd.p_len = pd.tot_len - off - (th.th_off << 2); action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd); if (action == PF_DROP) goto done; action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd, &reason); if (action == PF_PASS) { if (pfsync_update_state_ptr != NULL) pfsync_update_state_ptr(s); r = s->rule.ptr; a = s->anchor.ptr; log = s->log; } else if (s == NULL) action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, &a, &ruleset, inp); break; } case IPPROTO_UDP: { struct udphdr uh; pd.hdr.udp = &uh; if (!pf_pull_hdr(m, off, &uh, sizeof(uh), &action, &reason, AF_INET6)) { log = action != PF_PASS; goto done; } if (uh.uh_dport == 0 || ntohs(uh.uh_ulen) > m->m_pkthdr.len - off || ntohs(uh.uh_ulen) < sizeof(struct udphdr)) { action = PF_DROP; REASON_SET(&reason, PFRES_SHORT); goto done; } action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd); if (action == PF_PASS) { if (pfsync_update_state_ptr != NULL) pfsync_update_state_ptr(s); r = s->rule.ptr; a = s->anchor.ptr; log = s->log; } else if (s == NULL) action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, &a, &ruleset, inp); break; } case IPPROTO_ICMP: { action = PF_DROP; DPFPRINTF(PF_DEBUG_MISC, ("pf: dropping IPv6 packet with ICMPv4 payload\n")); goto done; } case IPPROTO_ICMPV6: { struct icmp6_hdr ih; pd.hdr.icmp6 = &ih; if (!pf_pull_hdr(m, off, &ih, sizeof(ih), &action, &reason, AF_INET6)) { log = action != PF_PASS; goto done; } action = pf_test_state_icmp(&s, dir, kif, m, off, h, &pd, &reason); if (action == PF_PASS) { if (pfsync_update_state_ptr != NULL) pfsync_update_state_ptr(s); r = s->rule.ptr; a = s->anchor.ptr; log = s->log; } else if (s == NULL) action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, &a, &ruleset, inp); break; } default: action = pf_test_state_other(&s, dir, kif, m, &pd); if (action == PF_PASS) { if (pfsync_update_state_ptr != NULL) pfsync_update_state_ptr(s); r = s->rule.ptr; a = s->anchor.ptr; log = s->log; } else if (s == NULL) action = pf_test_rule(&r, &s, dir, kif, m, off, &pd, &a, &ruleset, inp); break; } done: PF_RULES_RUNLOCK(); if (n != m) { m_freem(n); n = NULL; } /* handle dangerous IPv6 extension headers. */ if (action == PF_PASS && rh_cnt && !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) { action = PF_DROP; REASON_SET(&reason, PFRES_IPOPTIONS); log = 1; DPFPRINTF(PF_DEBUG_MISC, ("pf: dropping packet with dangerous v6 headers\n")); } if (s && s->tag > 0 && pf_tag_packet(m, &pd, s->tag)) { action = PF_DROP; REASON_SET(&reason, PFRES_MEMORY); } if (r->rtableid >= 0) M_SETFIB(m, r->rtableid); #ifdef ALTQ if (action == PF_PASS && r->qid) { if (pd.pf_mtag == NULL && ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) { action = PF_DROP; REASON_SET(&reason, PFRES_MEMORY); } if (pd.tos & IPTOS_LOWDELAY) pd.pf_mtag->qid = r->pqid; else pd.pf_mtag->qid = r->qid; /* add hints for ecn */ pd.pf_mtag->hdr = h; } #endif /* ALTQ */ if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP || pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL && (s->nat_rule.ptr->action == PF_RDR || s->nat_rule.ptr->action == PF_BINAT) && IN6_IS_ADDR_LOOPBACK(&pd.dst->v6)) m->m_flags |= M_SKIP_FIREWALL; /* XXX: Anybody working on it?! */ if (r->divert.port) printf("pf: divert(9) is not supported for IPv6\n"); if (log) { struct pf_rule *lr; if (s != NULL && s->nat_rule.ptr != NULL && s->nat_rule.ptr->log & PF_LOG_ALL) lr = s->nat_rule.ptr; else lr = r; PFLOG_PACKET(kif, m, AF_INET6, dir, reason, lr, a, ruleset, &pd, (s == NULL)); } kif->pfik_bytes[1][dir == PF_OUT][action != PF_PASS] += pd.tot_len; kif->pfik_packets[1][dir == PF_OUT][action != PF_PASS]++; if (action == PF_PASS || r->action == PF_DROP) { dirndx = (dir == PF_OUT); r->packets[dirndx]++; r->bytes[dirndx] += pd.tot_len; if (a != NULL) { a->packets[dirndx]++; a->bytes[dirndx] += pd.tot_len; } if (s != NULL) { if (s->nat_rule.ptr != NULL) { s->nat_rule.ptr->packets[dirndx]++; s->nat_rule.ptr->bytes[dirndx] += pd.tot_len; } if (s->src_node != NULL) { s->src_node->packets[dirndx]++; s->src_node->bytes[dirndx] += pd.tot_len; } if (s->nat_src_node != NULL) { s->nat_src_node->packets[dirndx]++; s->nat_src_node->bytes[dirndx] += pd.tot_len; } dirndx = (dir == s->direction) ? 0 : 1; s->packets[dirndx]++; s->bytes[dirndx] += pd.tot_len; } tr = r; nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule; if (nr != NULL && r == &V_pf_default_rule) tr = nr; if (tr->src.addr.type == PF_ADDR_TABLE) pfr_update_stats(tr->src.addr.p.tbl, (s == NULL) ? pd.src : &s->key[(s->direction == PF_IN)]->addr[0], pd.af, pd.tot_len, dir == PF_OUT, r->action == PF_PASS, tr->src.neg); if (tr->dst.addr.type == PF_ADDR_TABLE) pfr_update_stats(tr->dst.addr.p.tbl, (s == NULL) ? pd.dst : &s->key[(s->direction == PF_IN)]->addr[1], pd.af, pd.tot_len, dir == PF_OUT, r->action == PF_PASS, tr->dst.neg); } switch (action) { case PF_SYNPROXY_DROP: m_freem(*m0); case PF_DEFER: *m0 = NULL; action = PF_PASS; break; default: /* pf_route6() returns unlocked. */ if (r->rt) { pf_route6(m0, r, dir, kif->pfik_ifp, s, &pd); return (action); } break; } if (s) PF_STATE_UNLOCK(s); return (action); } #endif /* INET6 */ Index: stable/10/sys/netpfil/pf/pf.h =================================================================== --- stable/10/sys/netpfil/pf/pf.h (nonexistent) +++ stable/10/sys/netpfil/pf/pf.h (revision 263086) @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2001 Daniel Hartmeier + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * $OpenBSD: pfvar.h,v 1.282 2009/01/29 15:12:28 pyr Exp $ + * $FreeBSD$ + */ + +#ifndef _NET_PF_H_ +#define _NET_PF_H_ + +#define PF_TCPS_PROXY_SRC ((TCP_NSTATES)+0) +#define PF_TCPS_PROXY_DST ((TCP_NSTATES)+1) + +#define PF_MD5_DIGEST_LENGTH 16 +#ifdef MD5_DIGEST_LENGTH +#if PF_MD5_DIGEST_LENGTH != MD5_DIGEST_LENGTH +#error +#endif +#endif + +enum { PF_INOUT, PF_IN, PF_OUT }; +enum { PF_PASS, PF_DROP, PF_SCRUB, PF_NOSCRUB, PF_NAT, PF_NONAT, + PF_BINAT, PF_NOBINAT, PF_RDR, PF_NORDR, PF_SYNPROXY_DROP, PF_DEFER }; +enum { PF_RULESET_SCRUB, PF_RULESET_FILTER, PF_RULESET_NAT, + PF_RULESET_BINAT, PF_RULESET_RDR, PF_RULESET_MAX }; +enum { PF_OP_NONE, PF_OP_IRG, PF_OP_EQ, PF_OP_NE, PF_OP_LT, + PF_OP_LE, PF_OP_GT, PF_OP_GE, PF_OP_XRG, PF_OP_RRG }; +enum { PF_DEBUG_NONE, PF_DEBUG_URGENT, PF_DEBUG_MISC, PF_DEBUG_NOISY }; +enum { PF_CHANGE_NONE, PF_CHANGE_ADD_HEAD, PF_CHANGE_ADD_TAIL, + PF_CHANGE_ADD_BEFORE, PF_CHANGE_ADD_AFTER, + PF_CHANGE_REMOVE, PF_CHANGE_GET_TICKET }; +enum { PF_GET_NONE, PF_GET_CLR_CNTR }; +enum { PF_SK_WIRE, PF_SK_STACK, PF_SK_BOTH }; + +/* + * Note about PFTM_*: real indices into pf_rule.timeout[] come before + * PFTM_MAX, special cases afterwards. See pf_state_expires(). + */ +enum { PFTM_TCP_FIRST_PACKET, PFTM_TCP_OPENING, PFTM_TCP_ESTABLISHED, + PFTM_TCP_CLOSING, PFTM_TCP_FIN_WAIT, PFTM_TCP_CLOSED, + PFTM_UDP_FIRST_PACKET, PFTM_UDP_SINGLE, PFTM_UDP_MULTIPLE, + PFTM_ICMP_FIRST_PACKET, PFTM_ICMP_ERROR_REPLY, + PFTM_OTHER_FIRST_PACKET, PFTM_OTHER_SINGLE, + PFTM_OTHER_MULTIPLE, PFTM_FRAG, PFTM_INTERVAL, + PFTM_ADAPTIVE_START, PFTM_ADAPTIVE_END, PFTM_SRC_NODE, + PFTM_TS_DIFF, PFTM_MAX, PFTM_PURGE, PFTM_UNLINKED, + PFTM_UNTIL_PACKET }; + +/* PFTM default values */ +#define PFTM_TCP_FIRST_PACKET_VAL 120 /* First TCP packet */ +#define PFTM_TCP_OPENING_VAL 30 /* No response yet */ +#define PFTM_TCP_ESTABLISHED_VAL 24*60*60/* Established */ +#define PFTM_TCP_CLOSING_VAL 15 * 60 /* Half closed */ +#define PFTM_TCP_FIN_WAIT_VAL 45 /* Got both FINs */ +#define PFTM_TCP_CLOSED_VAL 90 /* Got a RST */ +#define PFTM_UDP_FIRST_PACKET_VAL 60 /* First UDP packet */ +#define PFTM_UDP_SINGLE_VAL 30 /* Unidirectional */ +#define PFTM_UDP_MULTIPLE_VAL 60 /* Bidirectional */ +#define PFTM_ICMP_FIRST_PACKET_VAL 20 /* First ICMP packet */ +#define PFTM_ICMP_ERROR_REPLY_VAL 10 /* Got error response */ +#define PFTM_OTHER_FIRST_PACKET_VAL 60 /* First packet */ +#define PFTM_OTHER_SINGLE_VAL 30 /* Unidirectional */ +#define PFTM_OTHER_MULTIPLE_VAL 60 /* Bidirectional */ +#define PFTM_FRAG_VAL 30 /* Fragment expire */ +#define PFTM_INTERVAL_VAL 10 /* Expire interval */ +#define PFTM_SRC_NODE_VAL 0 /* Source tracking */ +#define PFTM_TS_DIFF_VAL 30 /* Allowed TS diff */ + +enum { PF_NOPFROUTE, PF_FASTROUTE, PF_ROUTETO, PF_DUPTO, PF_REPLYTO }; +enum { PF_LIMIT_STATES, PF_LIMIT_SRC_NODES, PF_LIMIT_FRAGS, + PF_LIMIT_TABLE_ENTRIES, PF_LIMIT_MAX }; +#define PF_POOL_IDMASK 0x0f +enum { PF_POOL_NONE, PF_POOL_BITMASK, PF_POOL_RANDOM, + PF_POOL_SRCHASH, PF_POOL_ROUNDROBIN }; +enum { PF_ADDR_ADDRMASK, PF_ADDR_NOROUTE, PF_ADDR_DYNIFTL, + PF_ADDR_TABLE, PF_ADDR_URPFFAILED, + PF_ADDR_RANGE }; +#define PF_POOL_TYPEMASK 0x0f +#define PF_POOL_STICKYADDR 0x20 +#define PF_WSCALE_FLAG 0x80 +#define PF_WSCALE_MASK 0x0f + +#define PF_LOG 0x01 +#define PF_LOG_ALL 0x02 +#define PF_LOG_SOCKET_LOOKUP 0x04 + +/* Reasons code for passing/dropping a packet */ +#define PFRES_MATCH 0 /* Explicit match of a rule */ +#define PFRES_BADOFF 1 /* Bad offset for pull_hdr */ +#define PFRES_FRAG 2 /* Dropping following fragment */ +#define PFRES_SHORT 3 /* Dropping short packet */ +#define PFRES_NORM 4 /* Dropping by normalizer */ +#define PFRES_MEMORY 5 /* Dropped due to lacking mem */ +#define PFRES_TS 6 /* Bad TCP Timestamp (RFC1323) */ +#define PFRES_CONGEST 7 /* Congestion (of ipintrq) */ +#define PFRES_IPOPTIONS 8 /* IP option */ +#define PFRES_PROTCKSUM 9 /* Protocol checksum invalid */ +#define PFRES_BADSTATE 10 /* State mismatch */ +#define PFRES_STATEINS 11 /* State insertion failure */ +#define PFRES_MAXSTATES 12 /* State limit */ +#define PFRES_SRCLIMIT 13 /* Source node/conn limit */ +#define PFRES_SYNPROXY 14 /* SYN proxy */ +#define PFRES_MAX 15 /* total+1 */ + +#define PFRES_NAMES { \ + "match", \ + "bad-offset", \ + "fragment", \ + "short", \ + "normalize", \ + "memory", \ + "bad-timestamp", \ + "congestion", \ + "ip-option", \ + "proto-cksum", \ + "state-mismatch", \ + "state-insert", \ + "state-limit", \ + "src-limit", \ + "synproxy", \ + NULL \ +} + +#define PF_TABLE_NAME_SIZE 32 +#define PF_QNAME_SIZE 64 + +#endif /* _NET_PF_H_ */ Property changes on: stable/10/sys/netpfil/pf/pf.h ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: stable/10/sys/netpfil/pf/pf_altq.h =================================================================== --- stable/10/sys/netpfil/pf/pf_altq.h (nonexistent) +++ stable/10/sys/netpfil/pf/pf_altq.h (revision 263086) @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2001 Daniel Hartmeier + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * $OpenBSD: pfvar.h,v 1.282 2009/01/29 15:12:28 pyr Exp $ + * $FreeBSD$ + */ + +#ifndef _NET_PF_ALTQ_H_ +#define _NET_PF_ALTQ_H_ + +struct cbq_opts { + u_int minburst; + u_int maxburst; + u_int pktsize; + u_int maxpktsize; + u_int ns_per_byte; + u_int maxidle; + int minidle; + u_int offtime; + int flags; +}; + +struct priq_opts { + int flags; +}; + +struct hfsc_opts { + /* real-time service curve */ + u_int rtsc_m1; /* slope of the 1st segment in bps */ + u_int rtsc_d; /* the x-projection of m1 in msec */ + u_int rtsc_m2; /* slope of the 2nd segment in bps */ + /* link-sharing service curve */ + u_int lssc_m1; + u_int lssc_d; + u_int lssc_m2; + /* upper-limit service curve */ + u_int ulsc_m1; + u_int ulsc_d; + u_int ulsc_m2; + int flags; +}; + +struct pf_altq { + char ifname[IFNAMSIZ]; + + void *altq_disc; /* discipline-specific state */ + TAILQ_ENTRY(pf_altq) entries; + + /* scheduler spec */ + uint8_t scheduler; /* scheduler type */ + uint16_t tbrsize; /* tokenbucket regulator size */ + uint32_t ifbandwidth; /* interface bandwidth */ + + /* queue spec */ + char qname[PF_QNAME_SIZE]; /* queue name */ + char parent[PF_QNAME_SIZE]; /* parent name */ + uint32_t parent_qid; /* parent queue id */ + uint32_t bandwidth; /* queue bandwidth */ + uint8_t priority; /* priority */ + uint8_t local_flags; /* dynamic interface */ +#define PFALTQ_FLAG_IF_REMOVED 0x01 + + uint16_t qlimit; /* queue size limit */ + uint16_t flags; /* misc flags */ + union { + struct cbq_opts cbq_opts; + struct priq_opts priq_opts; + struct hfsc_opts hfsc_opts; + } pq_u; + + uint32_t qid; /* return value */ +}; + +#endif /* _NET_PF_ALTQ_H_ */ Property changes on: stable/10/sys/netpfil/pf/pf_altq.h ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: stable/10/sys/netpfil/pf/pf_lb.c =================================================================== --- stable/10/sys/netpfil/pf/pf_lb.c (revision 263085) +++ stable/10/sys/netpfil/pf/pf_lb.c (revision 263086) @@ -1,669 +1,668 @@ /*- * Copyright (c) 2001 Daniel Hartmeier * Copyright (c) 2002 - 2008 Henning Brauer * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * - Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials provided * with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * Effort sponsored in part by the Defense Advanced Research Projects * Agency (DARPA) and Air Force Research Laboratory, Air Force * Materiel Command, USAF, under agreement number F30602-01-2-0537. * * $OpenBSD: pf_lb.c,v 1.2 2009/02/12 02:13:15 sthen Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_pf.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include -#include #define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x static void pf_hash(struct pf_addr *, struct pf_addr *, struct pf_poolhashkey *, sa_family_t); static struct pf_rule *pf_match_translation(struct pf_pdesc *, struct mbuf *, int, int, struct pfi_kif *, struct pf_addr *, u_int16_t, struct pf_addr *, uint16_t, int, struct pf_anchor_stackframe *); static int pf_get_sport(sa_family_t, uint8_t, struct pf_rule *, struct pf_addr *, uint16_t, struct pf_addr *, uint16_t, struct pf_addr *, uint16_t *, uint16_t, uint16_t, struct pf_src_node **); #define mix(a,b,c) \ do { \ a -= b; a -= c; a ^= (c >> 13); \ b -= c; b -= a; b ^= (a << 8); \ c -= a; c -= b; c ^= (b >> 13); \ a -= b; a -= c; a ^= (c >> 12); \ b -= c; b -= a; b ^= (a << 16); \ c -= a; c -= b; c ^= (b >> 5); \ a -= b; a -= c; a ^= (c >> 3); \ b -= c; b -= a; b ^= (a << 10); \ c -= a; c -= b; c ^= (b >> 15); \ } while (0) /* * hash function based on bridge_hash in if_bridge.c */ static void pf_hash(struct pf_addr *inaddr, struct pf_addr *hash, struct pf_poolhashkey *key, sa_family_t af) { u_int32_t a = 0x9e3779b9, b = 0x9e3779b9, c = key->key32[0]; switch (af) { #ifdef INET case AF_INET: a += inaddr->addr32[0]; b += key->key32[1]; mix(a, b, c); hash->addr32[0] = c + key->key32[2]; break; #endif /* INET */ #ifdef INET6 case AF_INET6: a += inaddr->addr32[0]; b += inaddr->addr32[2]; mix(a, b, c); hash->addr32[0] = c; a += inaddr->addr32[1]; b += inaddr->addr32[3]; c += key->key32[1]; mix(a, b, c); hash->addr32[1] = c; a += inaddr->addr32[2]; b += inaddr->addr32[1]; c += key->key32[2]; mix(a, b, c); hash->addr32[2] = c; a += inaddr->addr32[3]; b += inaddr->addr32[0]; c += key->key32[3]; mix(a, b, c); hash->addr32[3] = c; break; #endif /* INET6 */ } } static struct pf_rule * pf_match_translation(struct pf_pdesc *pd, struct mbuf *m, int off, int direction, struct pfi_kif *kif, struct pf_addr *saddr, u_int16_t sport, struct pf_addr *daddr, uint16_t dport, int rs_num, struct pf_anchor_stackframe *anchor_stack) { struct pf_rule *r, *rm = NULL; struct pf_ruleset *ruleset = NULL; int tag = -1; int rtableid = -1; int asd = 0; r = TAILQ_FIRST(pf_main_ruleset.rules[rs_num].active.ptr); while (r && rm == NULL) { struct pf_rule_addr *src = NULL, *dst = NULL; struct pf_addr_wrap *xdst = NULL; if (r->action == PF_BINAT && direction == PF_IN) { src = &r->dst; if (r->rpool.cur != NULL) xdst = &r->rpool.cur->addr; } else { src = &r->src; dst = &r->dst; } r->evaluations++; if (pfi_kif_match(r->kif, kif) == r->ifnot) r = r->skip[PF_SKIP_IFP].ptr; else if (r->direction && r->direction != direction) r = r->skip[PF_SKIP_DIR].ptr; else if (r->af && r->af != pd->af) r = r->skip[PF_SKIP_AF].ptr; else if (r->proto && r->proto != pd->proto) r = r->skip[PF_SKIP_PROTO].ptr; else if (PF_MISMATCHAW(&src->addr, saddr, pd->af, src->neg, kif, M_GETFIB(m))) r = r->skip[src == &r->src ? PF_SKIP_SRC_ADDR : PF_SKIP_DST_ADDR].ptr; else if (src->port_op && !pf_match_port(src->port_op, src->port[0], src->port[1], sport)) r = r->skip[src == &r->src ? PF_SKIP_SRC_PORT : PF_SKIP_DST_PORT].ptr; else if (dst != NULL && PF_MISMATCHAW(&dst->addr, daddr, pd->af, dst->neg, NULL, M_GETFIB(m))) r = r->skip[PF_SKIP_DST_ADDR].ptr; else if (xdst != NULL && PF_MISMATCHAW(xdst, daddr, pd->af, 0, NULL, M_GETFIB(m))) r = TAILQ_NEXT(r, entries); else if (dst != NULL && dst->port_op && !pf_match_port(dst->port_op, dst->port[0], dst->port[1], dport)) r = r->skip[PF_SKIP_DST_PORT].ptr; else if (r->match_tag && !pf_match_tag(m, r, &tag, pd->pf_mtag ? pd->pf_mtag->tag : 0)) r = TAILQ_NEXT(r, entries); else if (r->os_fingerprint != PF_OSFP_ANY && (pd->proto != IPPROTO_TCP || !pf_osfp_match(pf_osfp_fingerprint(pd, m, off, pd->hdr.tcp), r->os_fingerprint))) r = TAILQ_NEXT(r, entries); else { if (r->tag) tag = r->tag; if (r->rtableid >= 0) rtableid = r->rtableid; if (r->anchor == NULL) { rm = r; } else pf_step_into_anchor(anchor_stack, &asd, &ruleset, rs_num, &r, NULL, NULL); } if (r == NULL) pf_step_out_of_anchor(anchor_stack, &asd, &ruleset, rs_num, &r, NULL, NULL); } if (tag > 0 && pf_tag_packet(m, pd, tag)) return (NULL); if (rtableid >= 0) M_SETFIB(m, rtableid); if (rm != NULL && (rm->action == PF_NONAT || rm->action == PF_NORDR || rm->action == PF_NOBINAT)) return (NULL); return (rm); } static int pf_get_sport(sa_family_t af, u_int8_t proto, struct pf_rule *r, struct pf_addr *saddr, uint16_t sport, struct pf_addr *daddr, uint16_t dport, struct pf_addr *naddr, uint16_t *nport, uint16_t low, uint16_t high, struct pf_src_node **sn) { struct pf_state_key_cmp key; struct pf_addr init_addr; uint16_t cut; bzero(&init_addr, sizeof(init_addr)); if (pf_map_addr(af, r, saddr, naddr, &init_addr, sn)) return (1); if (proto == IPPROTO_ICMP) { low = 1; high = 65535; } bzero(&key, sizeof(key)); key.af = af; key.proto = proto; key.port[0] = dport; PF_ACPY(&key.addr[0], daddr, key.af); do { PF_ACPY(&key.addr[1], naddr, key.af); /* * port search; start random, step; * similar 2 portloop in in_pcbbind */ if (!(proto == IPPROTO_TCP || proto == IPPROTO_UDP || proto == IPPROTO_ICMP) || (low == 0 && high == 0)) { /* * XXX bug: icmp states don't use the id on both sides. * (traceroute -I through nat) */ key.port[1] = sport; if (pf_find_state_all(&key, PF_IN, NULL) == NULL) { *nport = sport; return (0); } } else if (low == high) { key.port[1] = htons(low); if (pf_find_state_all(&key, PF_IN, NULL) == NULL) { *nport = htons(low); return (0); } } else { uint16_t tmp; if (low > high) { tmp = low; low = high; high = tmp; } /* low < high */ cut = htonl(arc4random()) % (1 + high - low) + low; /* low <= cut <= high */ for (tmp = cut; tmp <= high; ++(tmp)) { key.port[1] = htons(tmp); if (pf_find_state_all(&key, PF_IN, NULL) == NULL) { *nport = htons(tmp); return (0); } } for (tmp = cut - 1; tmp >= low; --(tmp)) { key.port[1] = htons(tmp); if (pf_find_state_all(&key, PF_IN, NULL) == NULL) { *nport = htons(tmp); return (0); } } } switch (r->rpool.opts & PF_POOL_TYPEMASK) { case PF_POOL_RANDOM: case PF_POOL_ROUNDROBIN: if (pf_map_addr(af, r, saddr, naddr, &init_addr, sn)) return (1); break; case PF_POOL_NONE: case PF_POOL_SRCHASH: case PF_POOL_BITMASK: default: return (1); } } while (! PF_AEQ(&init_addr, naddr, af) ); return (1); /* none available */ } int pf_map_addr(sa_family_t af, struct pf_rule *r, struct pf_addr *saddr, struct pf_addr *naddr, struct pf_addr *init_addr, struct pf_src_node **sn) { struct pf_pool *rpool = &r->rpool; struct pf_addr *raddr = NULL, *rmask = NULL; if (*sn == NULL && r->rpool.opts & PF_POOL_STICKYADDR && (r->rpool.opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) { *sn = pf_find_src_node(saddr, r, af, 0); if (*sn != NULL && !PF_AZERO(&(*sn)->raddr, af)) { PF_ACPY(naddr, &(*sn)->raddr, af); if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pf_map_addr: src tracking maps "); pf_print_host(saddr, 0, af); printf(" to "); pf_print_host(naddr, 0, af); printf("\n"); } return (0); } } if (rpool->cur->addr.type == PF_ADDR_NOROUTE) return (1); if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) { switch (af) { #ifdef INET case AF_INET: if (rpool->cur->addr.p.dyn->pfid_acnt4 < 1 && (rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_ROUNDROBIN) return (1); raddr = &rpool->cur->addr.p.dyn->pfid_addr4; rmask = &rpool->cur->addr.p.dyn->pfid_mask4; break; #endif /* INET */ #ifdef INET6 case AF_INET6: if (rpool->cur->addr.p.dyn->pfid_acnt6 < 1 && (rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_ROUNDROBIN) return (1); raddr = &rpool->cur->addr.p.dyn->pfid_addr6; rmask = &rpool->cur->addr.p.dyn->pfid_mask6; break; #endif /* INET6 */ } } else if (rpool->cur->addr.type == PF_ADDR_TABLE) { if ((rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_ROUNDROBIN) return (1); /* unsupported */ } else { raddr = &rpool->cur->addr.v.a.addr; rmask = &rpool->cur->addr.v.a.mask; } switch (rpool->opts & PF_POOL_TYPEMASK) { case PF_POOL_NONE: PF_ACPY(naddr, raddr, af); break; case PF_POOL_BITMASK: PF_POOLMASK(naddr, raddr, rmask, saddr, af); break; case PF_POOL_RANDOM: if (init_addr != NULL && PF_AZERO(init_addr, af)) { switch (af) { #ifdef INET case AF_INET: rpool->counter.addr32[0] = htonl(arc4random()); break; #endif /* INET */ #ifdef INET6 case AF_INET6: if (rmask->addr32[3] != 0xffffffff) rpool->counter.addr32[3] = htonl(arc4random()); else break; if (rmask->addr32[2] != 0xffffffff) rpool->counter.addr32[2] = htonl(arc4random()); else break; if (rmask->addr32[1] != 0xffffffff) rpool->counter.addr32[1] = htonl(arc4random()); else break; if (rmask->addr32[0] != 0xffffffff) rpool->counter.addr32[0] = htonl(arc4random()); break; #endif /* INET6 */ } PF_POOLMASK(naddr, raddr, rmask, &rpool->counter, af); PF_ACPY(init_addr, naddr, af); } else { PF_AINC(&rpool->counter, af); PF_POOLMASK(naddr, raddr, rmask, &rpool->counter, af); } break; case PF_POOL_SRCHASH: { unsigned char hash[16]; pf_hash(saddr, (struct pf_addr *)&hash, &rpool->key, af); PF_POOLMASK(naddr, raddr, rmask, (struct pf_addr *)&hash, af); break; } case PF_POOL_ROUNDROBIN: { struct pf_pooladdr *acur = rpool->cur; /* * XXXGL: in the round-robin case we need to store * the round-robin machine state in the rule, thus * forwarding thread needs to modify rule. * * This is done w/o locking, because performance is assumed * more important than round-robin precision. * * In the simpliest case we just update the "rpool->cur" * pointer. However, if pool contains tables or dynamic * addresses, then "tblidx" is also used to store machine * state. Since "tblidx" is int, concurrent access to it can't * lead to inconsistence, only to lost of precision. * * Things get worse, if table contains not hosts, but * prefixes. In this case counter also stores machine state, * and for IPv6 address, counter can't be updated atomically. * Probably, using round-robin on a table containing IPv6 * prefixes (or even IPv4) would cause a panic. */ if (rpool->cur->addr.type == PF_ADDR_TABLE) { if (!pfr_pool_get(rpool->cur->addr.p.tbl, &rpool->tblidx, &rpool->counter, af)) goto get_addr; } else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) { if (!pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt, &rpool->tblidx, &rpool->counter, af)) goto get_addr; } else if (pf_match_addr(0, raddr, rmask, &rpool->counter, af)) goto get_addr; try_next: if (TAILQ_NEXT(rpool->cur, entries) == NULL) rpool->cur = TAILQ_FIRST(&rpool->list); else rpool->cur = TAILQ_NEXT(rpool->cur, entries); if (rpool->cur->addr.type == PF_ADDR_TABLE) { rpool->tblidx = -1; if (pfr_pool_get(rpool->cur->addr.p.tbl, &rpool->tblidx, &rpool->counter, af)) { /* table contains no address of type 'af' */ if (rpool->cur != acur) goto try_next; return (1); } } else if (rpool->cur->addr.type == PF_ADDR_DYNIFTL) { rpool->tblidx = -1; if (pfr_pool_get(rpool->cur->addr.p.dyn->pfid_kt, &rpool->tblidx, &rpool->counter, af)) { /* table contains no address of type 'af' */ if (rpool->cur != acur) goto try_next; return (1); } } else { raddr = &rpool->cur->addr.v.a.addr; rmask = &rpool->cur->addr.v.a.mask; PF_ACPY(&rpool->counter, raddr, af); } get_addr: PF_ACPY(naddr, &rpool->counter, af); if (init_addr != NULL && PF_AZERO(init_addr, af)) PF_ACPY(init_addr, naddr, af); PF_AINC(&rpool->counter, af); break; } } if (*sn != NULL) PF_ACPY(&(*sn)->raddr, naddr, af); if (V_pf_status.debug >= PF_DEBUG_MISC && (rpool->opts & PF_POOL_TYPEMASK) != PF_POOL_NONE) { printf("pf_map_addr: selected address "); pf_print_host(naddr, 0, af); printf("\n"); } return (0); } struct pf_rule * pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off, int direction, struct pfi_kif *kif, struct pf_src_node **sn, struct pf_state_key **skp, struct pf_state_key **nkp, struct pf_addr *saddr, struct pf_addr *daddr, uint16_t sport, uint16_t dport, struct pf_anchor_stackframe *anchor_stack) { struct pf_rule *r = NULL; struct pf_addr *naddr; uint16_t *nport; PF_RULES_RASSERT(); KASSERT(*skp == NULL, ("*skp not NULL")); KASSERT(*nkp == NULL, ("*nkp not NULL")); if (direction == PF_OUT) { r = pf_match_translation(pd, m, off, direction, kif, saddr, sport, daddr, dport, PF_RULESET_BINAT, anchor_stack); if (r == NULL) r = pf_match_translation(pd, m, off, direction, kif, saddr, sport, daddr, dport, PF_RULESET_NAT, anchor_stack); } else { r = pf_match_translation(pd, m, off, direction, kif, saddr, sport, daddr, dport, PF_RULESET_RDR, anchor_stack); if (r == NULL) r = pf_match_translation(pd, m, off, direction, kif, saddr, sport, daddr, dport, PF_RULESET_BINAT, anchor_stack); } if (r == NULL) return (NULL); switch (r->action) { case PF_NONAT: case PF_NOBINAT: case PF_NORDR: return (NULL); } *skp = pf_state_key_setup(pd, saddr, daddr, sport, dport); if (*skp == NULL) return (NULL); *nkp = pf_state_key_clone(*skp); if (*nkp == NULL) { uma_zfree(V_pf_state_key_z, skp); *skp = NULL; return (NULL); } /* XXX We only modify one side for now. */ naddr = &(*nkp)->addr[1]; nport = &(*nkp)->port[1]; switch (r->action) { case PF_NAT: if (pf_get_sport(pd->af, pd->proto, r, saddr, sport, daddr, dport, naddr, nport, r->rpool.proxy_port[0], r->rpool.proxy_port[1], sn)) { DPFPRINTF(PF_DEBUG_MISC, ("pf: NAT proxy port allocation (%u-%u) failed\n", r->rpool.proxy_port[0], r->rpool.proxy_port[1])); goto notrans; } break; case PF_BINAT: switch (direction) { case PF_OUT: if (r->rpool.cur->addr.type == PF_ADDR_DYNIFTL){ switch (pd->af) { #ifdef INET case AF_INET: if (r->rpool.cur->addr.p.dyn-> pfid_acnt4 < 1) goto notrans; PF_POOLMASK(naddr, &r->rpool.cur->addr.p.dyn-> pfid_addr4, &r->rpool.cur->addr.p.dyn-> pfid_mask4, saddr, AF_INET); break; #endif /* INET */ #ifdef INET6 case AF_INET6: if (r->rpool.cur->addr.p.dyn-> pfid_acnt6 < 1) goto notrans; PF_POOLMASK(naddr, &r->rpool.cur->addr.p.dyn-> pfid_addr6, &r->rpool.cur->addr.p.dyn-> pfid_mask6, saddr, AF_INET6); break; #endif /* INET6 */ } } else PF_POOLMASK(naddr, &r->rpool.cur->addr.v.a.addr, &r->rpool.cur->addr.v.a.mask, saddr, pd->af); break; case PF_IN: if (r->src.addr.type == PF_ADDR_DYNIFTL) { switch (pd->af) { #ifdef INET case AF_INET: if (r->src.addr.p.dyn-> pfid_acnt4 < 1) goto notrans; PF_POOLMASK(naddr, &r->src.addr.p.dyn->pfid_addr4, &r->src.addr.p.dyn->pfid_mask4, daddr, AF_INET); break; #endif /* INET */ #ifdef INET6 case AF_INET6: if (r->src.addr.p.dyn->pfid_acnt6 < 1) goto notrans; PF_POOLMASK(naddr, &r->src.addr.p.dyn->pfid_addr6, &r->src.addr.p.dyn->pfid_mask6, daddr, AF_INET6); break; #endif /* INET6 */ } } else PF_POOLMASK(naddr, &r->src.addr.v.a.addr, &r->src.addr.v.a.mask, daddr, pd->af); break; } break; case PF_RDR: { if (pf_map_addr(pd->af, r, saddr, naddr, NULL, sn)) goto notrans; if ((r->rpool.opts & PF_POOL_TYPEMASK) == PF_POOL_BITMASK) PF_POOLMASK(naddr, naddr, &r->rpool.cur->addr.v.a.mask, daddr, pd->af); if (r->rpool.proxy_port[1]) { uint32_t tmp_nport; tmp_nport = ((ntohs(dport) - ntohs(r->dst.port[0])) % (r->rpool.proxy_port[1] - r->rpool.proxy_port[0] + 1)) + r->rpool.proxy_port[0]; /* Wrap around if necessary. */ if (tmp_nport > 65535) tmp_nport -= 65535; *nport = htons((uint16_t)tmp_nport); } else if (r->rpool.proxy_port[0]) *nport = htons(r->rpool.proxy_port[0]); break; } default: panic("%s: unknown action %u", __func__, r->action); } /* Return success only if translation really happened. */ if (bcmp(*skp, *nkp, sizeof(struct pf_state_key_cmp))) return (r); notrans: uma_zfree(V_pf_state_key_z, *nkp); uma_zfree(V_pf_state_key_z, *skp); *skp = *nkp = NULL; *sn = NULL; return (NULL); } Index: stable/10/sys/netpfil/pf/pf_mtag.h =================================================================== --- stable/10/sys/netpfil/pf/pf_mtag.h (nonexistent) +++ stable/10/sys/netpfil/pf/pf_mtag.h (revision 263086) @@ -0,0 +1,62 @@ +/* $FreeBSD$ */ +/* + * Copyright (c) 2001 Daniel Hartmeier + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _NET_PF_MTAG_H_ +#define _NET_PF_MTAG_H_ + +#ifdef _KERNEL + +#define PF_TAG_GENERATED 0x01 +#define PF_TAG_FRAGCACHE 0x02 +#define PF_TAG_TRANSLATE_LOCALHOST 0x04 +#define PF_PACKET_LOOPED 0x08 +#define PF_FASTFWD_OURS_PRESENT 0x10 + +struct pf_mtag { + void *hdr; /* saved hdr pos in mbuf, for ECN */ + u_int32_t qid; /* queue id */ + u_int16_t tag; /* tag id */ + u_int8_t flags; + u_int8_t routed; +}; + +static __inline struct pf_mtag * +pf_find_mtag(struct mbuf *m) +{ + struct m_tag *mtag; + + if ((mtag = m_tag_find(m, PACKET_TAG_PF, NULL)) == NULL) + return (NULL); + + return ((struct pf_mtag *)(mtag + 1)); +} +#endif /* _KERNEL */ +#endif /* _NET_PF_MTAG_H_ */ Property changes on: stable/10/sys/netpfil/pf/pf_mtag.h ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: stable/10/sys/netpfil/pf/pf_norm.c =================================================================== --- stable/10/sys/netpfil/pf/pf_norm.c (revision 263085) +++ stable/10/sys/netpfil/pf/pf_norm.c (revision 263086) @@ -1,2000 +1,1999 @@ /*- * Copyright 2001 Niels Provos * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $OpenBSD: pf_norm.c,v 1.114 2009/01/29 14:11:45 henning Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_pf.h" #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #ifdef INET6 #include #endif /* INET6 */ struct pf_frent { LIST_ENTRY(pf_frent) fr_next; union { struct { struct ip *_fr_ip; struct mbuf *_fr_m; } _frag; struct { uint16_t _fr_off; uint16_t _fr_end; } _cache; } _u; }; #define fr_ip _u._frag._fr_ip #define fr_m _u._frag._fr_m #define fr_off _u._cache._fr_off #define fr_end _u._cache._fr_end struct pf_fragment { RB_ENTRY(pf_fragment) fr_entry; TAILQ_ENTRY(pf_fragment) frag_next; struct in_addr fr_src; struct in_addr fr_dst; u_int8_t fr_p; /* protocol of this fragment */ u_int8_t fr_flags; /* status flags */ #define PFFRAG_SEENLAST 0x0001 /* Seen the last fragment for this */ #define PFFRAG_NOBUFFER 0x0002 /* Non-buffering fragment cache */ #define PFFRAG_DROP 0x0004 /* Drop all fragments */ #define BUFFER_FRAGMENTS(fr) (!((fr)->fr_flags & PFFRAG_NOBUFFER)) u_int16_t fr_id; /* fragment id for reassemble */ u_int16_t fr_max; /* fragment data max */ u_int32_t fr_timeout; LIST_HEAD(, pf_frent) fr_queue; }; static struct mtx pf_frag_mtx; #define PF_FRAG_LOCK() mtx_lock(&pf_frag_mtx) #define PF_FRAG_UNLOCK() mtx_unlock(&pf_frag_mtx) #define PF_FRAG_ASSERT() mtx_assert(&pf_frag_mtx, MA_OWNED) VNET_DEFINE(uma_zone_t, pf_state_scrub_z); /* XXX: shared with pfsync */ static VNET_DEFINE(uma_zone_t, pf_frent_z); #define V_pf_frent_z VNET(pf_frent_z) static VNET_DEFINE(uma_zone_t, pf_frag_z); #define V_pf_frag_z VNET(pf_frag_z) TAILQ_HEAD(pf_fragqueue, pf_fragment); TAILQ_HEAD(pf_cachequeue, pf_fragment); static VNET_DEFINE(struct pf_fragqueue, pf_fragqueue); #define V_pf_fragqueue VNET(pf_fragqueue) static VNET_DEFINE(struct pf_cachequeue, pf_cachequeue); #define V_pf_cachequeue VNET(pf_cachequeue) RB_HEAD(pf_frag_tree, pf_fragment); static VNET_DEFINE(struct pf_frag_tree, pf_frag_tree); #define V_pf_frag_tree VNET(pf_frag_tree) static VNET_DEFINE(struct pf_frag_tree, pf_cache_tree); #define V_pf_cache_tree VNET(pf_cache_tree) static int pf_frag_compare(struct pf_fragment *, struct pf_fragment *); static RB_PROTOTYPE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare); static RB_GENERATE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare); /* Private prototypes */ static void pf_free_fragment(struct pf_fragment *); static void pf_remove_fragment(struct pf_fragment *); static int pf_normalize_tcpopt(struct pf_rule *, struct mbuf *, struct tcphdr *, int, sa_family_t); #ifdef INET static void pf_ip2key(struct pf_fragment *, struct ip *); static void pf_scrub_ip(struct mbuf **, u_int32_t, u_int8_t, u_int8_t); static void pf_flush_fragments(void); static struct pf_fragment *pf_find_fragment(struct ip *, struct pf_frag_tree *); static struct mbuf *pf_reassemble(struct mbuf **, struct pf_fragment **, struct pf_frent *, int); static struct mbuf *pf_fragcache(struct mbuf **, struct ip*, struct pf_fragment **, int, int, int *); #endif /* INET */ #ifdef INET6 static void pf_scrub_ip6(struct mbuf **, u_int8_t); #endif #define DPFPRINTF(x) do { \ if (V_pf_status.debug >= PF_DEBUG_MISC) { \ printf("%s: ", __func__); \ printf x ; \ } \ } while(0) void pf_normalize_init(void) { V_pf_frag_z = uma_zcreate("pf frags", sizeof(struct pf_fragment), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); V_pf_frent_z = uma_zcreate("pf frag entries", sizeof(struct pf_frent), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); V_pf_state_scrub_z = uma_zcreate("pf state scrubs", sizeof(struct pf_state_scrub), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); V_pf_limits[PF_LIMIT_FRAGS].zone = V_pf_frent_z; V_pf_limits[PF_LIMIT_FRAGS].limit = PFFRAG_FRENT_HIWAT; uma_zone_set_max(V_pf_frent_z, PFFRAG_FRENT_HIWAT); uma_zone_set_warning(V_pf_frent_z, "PF frag entries limit reached"); mtx_init(&pf_frag_mtx, "pf fragments", NULL, MTX_DEF); TAILQ_INIT(&V_pf_fragqueue); TAILQ_INIT(&V_pf_cachequeue); } void pf_normalize_cleanup(void) { uma_zdestroy(V_pf_state_scrub_z); uma_zdestroy(V_pf_frent_z); uma_zdestroy(V_pf_frag_z); mtx_destroy(&pf_frag_mtx); } static int pf_frag_compare(struct pf_fragment *a, struct pf_fragment *b) { int diff; if ((diff = a->fr_id - b->fr_id)) return (diff); else if ((diff = a->fr_p - b->fr_p)) return (diff); else if (a->fr_src.s_addr < b->fr_src.s_addr) return (-1); else if (a->fr_src.s_addr > b->fr_src.s_addr) return (1); else if (a->fr_dst.s_addr < b->fr_dst.s_addr) return (-1); else if (a->fr_dst.s_addr > b->fr_dst.s_addr) return (1); return (0); } void pf_purge_expired_fragments(void) { struct pf_fragment *frag; u_int32_t expire = time_uptime - V_pf_default_rule.timeout[PFTM_FRAG]; PF_FRAG_LOCK(); while ((frag = TAILQ_LAST(&V_pf_fragqueue, pf_fragqueue)) != NULL) { KASSERT((BUFFER_FRAGMENTS(frag)), ("BUFFER_FRAGMENTS(frag) == 0: %s", __FUNCTION__)); if (frag->fr_timeout > expire) break; DPFPRINTF(("expiring %d(%p)\n", frag->fr_id, frag)); pf_free_fragment(frag); } while ((frag = TAILQ_LAST(&V_pf_cachequeue, pf_cachequeue)) != NULL) { KASSERT((!BUFFER_FRAGMENTS(frag)), ("BUFFER_FRAGMENTS(frag) != 0: %s", __FUNCTION__)); if (frag->fr_timeout > expire) break; DPFPRINTF(("expiring %d(%p)\n", frag->fr_id, frag)); pf_free_fragment(frag); KASSERT((TAILQ_EMPTY(&V_pf_cachequeue) || TAILQ_LAST(&V_pf_cachequeue, pf_cachequeue) != frag), ("!(TAILQ_EMPTY() || TAILQ_LAST() == farg): %s", __FUNCTION__)); } PF_FRAG_UNLOCK(); } #ifdef INET /* * Try to flush old fragments to make space for new ones */ static void pf_flush_fragments(void) { struct pf_fragment *frag, *cache; int goal; PF_FRAG_ASSERT(); goal = uma_zone_get_cur(V_pf_frent_z) * 9 / 10; DPFPRINTF(("trying to free %d frag entriess\n", goal)); while (goal < uma_zone_get_cur(V_pf_frent_z)) { frag = TAILQ_LAST(&V_pf_fragqueue, pf_fragqueue); if (frag) pf_free_fragment(frag); cache = TAILQ_LAST(&V_pf_cachequeue, pf_cachequeue); if (cache) pf_free_fragment(cache); if (frag == NULL && cache == NULL) break; } } #endif /* INET */ /* Frees the fragments and all associated entries */ static void pf_free_fragment(struct pf_fragment *frag) { struct pf_frent *frent; PF_FRAG_ASSERT(); /* Free all fragments */ if (BUFFER_FRAGMENTS(frag)) { for (frent = LIST_FIRST(&frag->fr_queue); frent; frent = LIST_FIRST(&frag->fr_queue)) { LIST_REMOVE(frent, fr_next); m_freem(frent->fr_m); uma_zfree(V_pf_frent_z, frent); } } else { for (frent = LIST_FIRST(&frag->fr_queue); frent; frent = LIST_FIRST(&frag->fr_queue)) { LIST_REMOVE(frent, fr_next); KASSERT((LIST_EMPTY(&frag->fr_queue) || LIST_FIRST(&frag->fr_queue)->fr_off > frent->fr_end), ("! (LIST_EMPTY() || LIST_FIRST()->fr_off >" " frent->fr_end): %s", __func__)); uma_zfree(V_pf_frent_z, frent); } } pf_remove_fragment(frag); } #ifdef INET static void pf_ip2key(struct pf_fragment *key, struct ip *ip) { key->fr_p = ip->ip_p; key->fr_id = ip->ip_id; key->fr_src.s_addr = ip->ip_src.s_addr; key->fr_dst.s_addr = ip->ip_dst.s_addr; } static struct pf_fragment * pf_find_fragment(struct ip *ip, struct pf_frag_tree *tree) { struct pf_fragment key; struct pf_fragment *frag; PF_FRAG_ASSERT(); pf_ip2key(&key, ip); frag = RB_FIND(pf_frag_tree, tree, &key); if (frag != NULL) { /* XXX Are we sure we want to update the timeout? */ frag->fr_timeout = time_uptime; if (BUFFER_FRAGMENTS(frag)) { TAILQ_REMOVE(&V_pf_fragqueue, frag, frag_next); TAILQ_INSERT_HEAD(&V_pf_fragqueue, frag, frag_next); } else { TAILQ_REMOVE(&V_pf_cachequeue, frag, frag_next); TAILQ_INSERT_HEAD(&V_pf_cachequeue, frag, frag_next); } } return (frag); } #endif /* INET */ /* Removes a fragment from the fragment queue and frees the fragment */ static void pf_remove_fragment(struct pf_fragment *frag) { PF_FRAG_ASSERT(); if (BUFFER_FRAGMENTS(frag)) { RB_REMOVE(pf_frag_tree, &V_pf_frag_tree, frag); TAILQ_REMOVE(&V_pf_fragqueue, frag, frag_next); uma_zfree(V_pf_frag_z, frag); } else { RB_REMOVE(pf_frag_tree, &V_pf_cache_tree, frag); TAILQ_REMOVE(&V_pf_cachequeue, frag, frag_next); uma_zfree(V_pf_frag_z, frag); } } #ifdef INET #define FR_IP_OFF(fr) ((ntohs((fr)->fr_ip->ip_off) & IP_OFFMASK) << 3) static struct mbuf * pf_reassemble(struct mbuf **m0, struct pf_fragment **frag, struct pf_frent *frent, int mff) { struct mbuf *m = *m0, *m2; struct pf_frent *frea, *next; struct pf_frent *frep = NULL; struct ip *ip = frent->fr_ip; int hlen = ip->ip_hl << 2; u_int16_t off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3; u_int16_t ip_len = ntohs(ip->ip_len) - ip->ip_hl * 4; u_int16_t max = ip_len + off; PF_FRAG_ASSERT(); KASSERT((*frag == NULL || BUFFER_FRAGMENTS(*frag)), ("! (*frag == NULL || BUFFER_FRAGMENTS(*frag)): %s", __FUNCTION__)); /* Strip off ip header */ m->m_data += hlen; m->m_len -= hlen; /* Create a new reassembly queue for this packet */ if (*frag == NULL) { *frag = uma_zalloc(V_pf_frag_z, M_NOWAIT); if (*frag == NULL) { pf_flush_fragments(); *frag = uma_zalloc(V_pf_frag_z, M_NOWAIT); if (*frag == NULL) goto drop_fragment; } (*frag)->fr_flags = 0; (*frag)->fr_max = 0; (*frag)->fr_src = frent->fr_ip->ip_src; (*frag)->fr_dst = frent->fr_ip->ip_dst; (*frag)->fr_p = frent->fr_ip->ip_p; (*frag)->fr_id = frent->fr_ip->ip_id; (*frag)->fr_timeout = time_uptime; LIST_INIT(&(*frag)->fr_queue); RB_INSERT(pf_frag_tree, &V_pf_frag_tree, *frag); TAILQ_INSERT_HEAD(&V_pf_fragqueue, *frag, frag_next); /* We do not have a previous fragment */ frep = NULL; goto insert; } /* * Find a fragment after the current one: * - off contains the real shifted offset. */ LIST_FOREACH(frea, &(*frag)->fr_queue, fr_next) { if (FR_IP_OFF(frea) > off) break; frep = frea; } KASSERT((frep != NULL || frea != NULL), ("!(frep != NULL || frea != NULL): %s", __FUNCTION__));; if (frep != NULL && FR_IP_OFF(frep) + ntohs(frep->fr_ip->ip_len) - frep->fr_ip->ip_hl * 4 > off) { u_int16_t precut; precut = FR_IP_OFF(frep) + ntohs(frep->fr_ip->ip_len) - frep->fr_ip->ip_hl * 4 - off; if (precut >= ip_len) goto drop_fragment; m_adj(frent->fr_m, precut); DPFPRINTF(("overlap -%d\n", precut)); /* Enforce 8 byte boundaries */ ip->ip_off = htons(ntohs(ip->ip_off) + (precut >> 3)); off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3; ip_len -= precut; ip->ip_len = htons(ip_len); } for (; frea != NULL && ip_len + off > FR_IP_OFF(frea); frea = next) { u_int16_t aftercut; aftercut = ip_len + off - FR_IP_OFF(frea); DPFPRINTF(("adjust overlap %d\n", aftercut)); if (aftercut < ntohs(frea->fr_ip->ip_len) - frea->fr_ip->ip_hl * 4) { frea->fr_ip->ip_len = htons(ntohs(frea->fr_ip->ip_len) - aftercut); frea->fr_ip->ip_off = htons(ntohs(frea->fr_ip->ip_off) + (aftercut >> 3)); m_adj(frea->fr_m, aftercut); break; } /* This fragment is completely overlapped, lose it */ next = LIST_NEXT(frea, fr_next); m_freem(frea->fr_m); LIST_REMOVE(frea, fr_next); uma_zfree(V_pf_frent_z, frea); } insert: /* Update maximum data size */ if ((*frag)->fr_max < max) (*frag)->fr_max = max; /* This is the last segment */ if (!mff) (*frag)->fr_flags |= PFFRAG_SEENLAST; if (frep == NULL) LIST_INSERT_HEAD(&(*frag)->fr_queue, frent, fr_next); else LIST_INSERT_AFTER(frep, frent, fr_next); /* Check if we are completely reassembled */ if (!((*frag)->fr_flags & PFFRAG_SEENLAST)) return (NULL); /* Check if we have all the data */ off = 0; for (frep = LIST_FIRST(&(*frag)->fr_queue); frep; frep = next) { next = LIST_NEXT(frep, fr_next); off += ntohs(frep->fr_ip->ip_len) - frep->fr_ip->ip_hl * 4; if (off < (*frag)->fr_max && (next == NULL || FR_IP_OFF(next) != off)) { DPFPRINTF(("missing fragment at %d, next %d, max %d\n", off, next == NULL ? -1 : FR_IP_OFF(next), (*frag)->fr_max)); return (NULL); } } DPFPRINTF(("%d < %d?\n", off, (*frag)->fr_max)); if (off < (*frag)->fr_max) return (NULL); /* We have all the data */ frent = LIST_FIRST(&(*frag)->fr_queue); KASSERT((frent != NULL), ("frent == NULL: %s", __FUNCTION__)); if ((frent->fr_ip->ip_hl << 2) + off > IP_MAXPACKET) { DPFPRINTF(("drop: too big: %d\n", off)); pf_free_fragment(*frag); *frag = NULL; return (NULL); } next = LIST_NEXT(frent, fr_next); /* Magic from ip_input */ ip = frent->fr_ip; m = frent->fr_m; m2 = m->m_next; m->m_next = NULL; m_cat(m, m2); uma_zfree(V_pf_frent_z, frent); for (frent = next; frent != NULL; frent = next) { next = LIST_NEXT(frent, fr_next); m2 = frent->fr_m; uma_zfree(V_pf_frent_z, frent); m->m_pkthdr.csum_flags &= m2->m_pkthdr.csum_flags; m->m_pkthdr.csum_data += m2->m_pkthdr.csum_data; m_cat(m, m2); } while (m->m_pkthdr.csum_data & 0xffff0000) m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) + (m->m_pkthdr.csum_data >> 16); ip->ip_src = (*frag)->fr_src; ip->ip_dst = (*frag)->fr_dst; /* Remove from fragment queue */ pf_remove_fragment(*frag); *frag = NULL; hlen = ip->ip_hl << 2; ip->ip_len = htons(off + hlen); m->m_len += hlen; m->m_data -= hlen; /* some debugging cruft by sklower, below, will go away soon */ /* XXX this should be done elsewhere */ if (m->m_flags & M_PKTHDR) { int plen = 0; for (m2 = m; m2; m2 = m2->m_next) plen += m2->m_len; m->m_pkthdr.len = plen; } DPFPRINTF(("complete: %p(%d)\n", m, ntohs(ip->ip_len))); return (m); drop_fragment: /* Oops - fail safe - drop packet */ uma_zfree(V_pf_frent_z, frent); m_freem(m); return (NULL); } static struct mbuf * pf_fragcache(struct mbuf **m0, struct ip *h, struct pf_fragment **frag, int mff, int drop, int *nomem) { struct mbuf *m = *m0; struct pf_frent *frp, *fra, *cur = NULL; int ip_len = ntohs(h->ip_len) - (h->ip_hl << 2); u_int16_t off = ntohs(h->ip_off) << 3; u_int16_t max = ip_len + off; int hosed = 0; PF_FRAG_ASSERT(); KASSERT((*frag == NULL || !BUFFER_FRAGMENTS(*frag)), ("!(*frag == NULL || !BUFFER_FRAGMENTS(*frag)): %s", __FUNCTION__)); /* Create a new range queue for this packet */ if (*frag == NULL) { *frag = uma_zalloc(V_pf_frag_z, M_NOWAIT); if (*frag == NULL) { pf_flush_fragments(); *frag = uma_zalloc(V_pf_frag_z, M_NOWAIT); if (*frag == NULL) goto no_mem; } /* Get an entry for the queue */ cur = uma_zalloc(V_pf_frent_z, M_NOWAIT); if (cur == NULL) { uma_zfree(V_pf_frag_z, *frag); *frag = NULL; goto no_mem; } (*frag)->fr_flags = PFFRAG_NOBUFFER; (*frag)->fr_max = 0; (*frag)->fr_src = h->ip_src; (*frag)->fr_dst = h->ip_dst; (*frag)->fr_p = h->ip_p; (*frag)->fr_id = h->ip_id; (*frag)->fr_timeout = time_uptime; cur->fr_off = off; cur->fr_end = max; LIST_INIT(&(*frag)->fr_queue); LIST_INSERT_HEAD(&(*frag)->fr_queue, cur, fr_next); RB_INSERT(pf_frag_tree, &V_pf_cache_tree, *frag); TAILQ_INSERT_HEAD(&V_pf_cachequeue, *frag, frag_next); DPFPRINTF(("fragcache[%d]: new %d-%d\n", h->ip_id, off, max)); goto pass; } /* * Find a fragment after the current one: * - off contains the real shifted offset. */ frp = NULL; LIST_FOREACH(fra, &(*frag)->fr_queue, fr_next) { if (fra->fr_off > off) break; frp = fra; } KASSERT((frp != NULL || fra != NULL), ("!(frp != NULL || fra != NULL): %s", __FUNCTION__)); if (frp != NULL) { int precut; precut = frp->fr_end - off; if (precut >= ip_len) { /* Fragment is entirely a duplicate */ DPFPRINTF(("fragcache[%d]: dead (%d-%d) %d-%d\n", h->ip_id, frp->fr_off, frp->fr_end, off, max)); goto drop_fragment; } if (precut == 0) { /* They are adjacent. Fixup cache entry */ DPFPRINTF(("fragcache[%d]: adjacent (%d-%d) %d-%d\n", h->ip_id, frp->fr_off, frp->fr_end, off, max)); frp->fr_end = max; } else if (precut > 0) { /* The first part of this payload overlaps with a * fragment that has already been passed. * Need to trim off the first part of the payload. * But to do so easily, we need to create another * mbuf to throw the original header into. */ DPFPRINTF(("fragcache[%d]: chop %d (%d-%d) %d-%d\n", h->ip_id, precut, frp->fr_off, frp->fr_end, off, max)); off += precut; max -= precut; /* Update the previous frag to encompass this one */ frp->fr_end = max; if (!drop) { /* XXX Optimization opportunity * This is a very heavy way to trim the payload. * we could do it much faster by diddling mbuf * internals but that would be even less legible * than this mbuf magic. For my next trick, * I'll pull a rabbit out of my laptop. */ *m0 = m_dup(m, M_NOWAIT); if (*m0 == NULL) goto no_mem; /* From KAME Project : We have missed this! */ m_adj(*m0, (h->ip_hl << 2) - (*m0)->m_pkthdr.len); KASSERT(((*m0)->m_next == NULL), ("(*m0)->m_next != NULL: %s", __FUNCTION__)); m_adj(m, precut + (h->ip_hl << 2)); m_cat(*m0, m); m = *m0; if (m->m_flags & M_PKTHDR) { int plen = 0; struct mbuf *t; for (t = m; t; t = t->m_next) plen += t->m_len; m->m_pkthdr.len = plen; } h = mtod(m, struct ip *); KASSERT(((int)m->m_len == ntohs(h->ip_len) - precut), ("m->m_len != ntohs(h->ip_len) - precut: %s", __FUNCTION__)); h->ip_off = htons(ntohs(h->ip_off) + (precut >> 3)); h->ip_len = htons(ntohs(h->ip_len) - precut); } else { hosed++; } } else { /* There is a gap between fragments */ DPFPRINTF(("fragcache[%d]: gap %d (%d-%d) %d-%d\n", h->ip_id, -precut, frp->fr_off, frp->fr_end, off, max)); cur = uma_zalloc(V_pf_frent_z, M_NOWAIT); if (cur == NULL) goto no_mem; cur->fr_off = off; cur->fr_end = max; LIST_INSERT_AFTER(frp, cur, fr_next); } } if (fra != NULL) { int aftercut; int merge = 0; aftercut = max - fra->fr_off; if (aftercut == 0) { /* Adjacent fragments */ DPFPRINTF(("fragcache[%d]: adjacent %d-%d (%d-%d)\n", h->ip_id, off, max, fra->fr_off, fra->fr_end)); fra->fr_off = off; merge = 1; } else if (aftercut > 0) { /* Need to chop off the tail of this fragment */ DPFPRINTF(("fragcache[%d]: chop %d %d-%d (%d-%d)\n", h->ip_id, aftercut, off, max, fra->fr_off, fra->fr_end)); fra->fr_off = off; max -= aftercut; merge = 1; if (!drop) { m_adj(m, -aftercut); if (m->m_flags & M_PKTHDR) { int plen = 0; struct mbuf *t; for (t = m; t; t = t->m_next) plen += t->m_len; m->m_pkthdr.len = plen; } h = mtod(m, struct ip *); KASSERT(((int)m->m_len == ntohs(h->ip_len) - aftercut), ("m->m_len != ntohs(h->ip_len) - aftercut: %s", __FUNCTION__)); h->ip_len = htons(ntohs(h->ip_len) - aftercut); } else { hosed++; } } else if (frp == NULL) { /* There is a gap between fragments */ DPFPRINTF(("fragcache[%d]: gap %d %d-%d (%d-%d)\n", h->ip_id, -aftercut, off, max, fra->fr_off, fra->fr_end)); cur = uma_zalloc(V_pf_frent_z, M_NOWAIT); if (cur == NULL) goto no_mem; cur->fr_off = off; cur->fr_end = max; LIST_INSERT_BEFORE(fra, cur, fr_next); } /* Need to glue together two separate fragment descriptors */ if (merge) { if (cur && fra->fr_off <= cur->fr_end) { /* Need to merge in a previous 'cur' */ DPFPRINTF(("fragcache[%d]: adjacent(merge " "%d-%d) %d-%d (%d-%d)\n", h->ip_id, cur->fr_off, cur->fr_end, off, max, fra->fr_off, fra->fr_end)); fra->fr_off = cur->fr_off; LIST_REMOVE(cur, fr_next); uma_zfree(V_pf_frent_z, cur); cur = NULL; } else if (frp && fra->fr_off <= frp->fr_end) { /* Need to merge in a modified 'frp' */ KASSERT((cur == NULL), ("cur != NULL: %s", __FUNCTION__)); DPFPRINTF(("fragcache[%d]: adjacent(merge " "%d-%d) %d-%d (%d-%d)\n", h->ip_id, frp->fr_off, frp->fr_end, off, max, fra->fr_off, fra->fr_end)); fra->fr_off = frp->fr_off; LIST_REMOVE(frp, fr_next); uma_zfree(V_pf_frent_z, frp); frp = NULL; } } } if (hosed) { /* * We must keep tracking the overall fragment even when * we're going to drop it anyway so that we know when to * free the overall descriptor. Thus we drop the frag late. */ goto drop_fragment; } pass: /* Update maximum data size */ if ((*frag)->fr_max < max) (*frag)->fr_max = max; /* This is the last segment */ if (!mff) (*frag)->fr_flags |= PFFRAG_SEENLAST; /* Check if we are completely reassembled */ if (((*frag)->fr_flags & PFFRAG_SEENLAST) && LIST_FIRST(&(*frag)->fr_queue)->fr_off == 0 && LIST_FIRST(&(*frag)->fr_queue)->fr_end == (*frag)->fr_max) { /* Remove from fragment queue */ DPFPRINTF(("fragcache[%d]: done 0-%d\n", h->ip_id, (*frag)->fr_max)); pf_free_fragment(*frag); *frag = NULL; } return (m); no_mem: *nomem = 1; /* Still need to pay attention to !IP_MF */ if (!mff && *frag != NULL) (*frag)->fr_flags |= PFFRAG_SEENLAST; m_freem(m); return (NULL); drop_fragment: /* Still need to pay attention to !IP_MF */ if (!mff && *frag != NULL) (*frag)->fr_flags |= PFFRAG_SEENLAST; if (drop) { /* This fragment has been deemed bad. Don't reass */ if (((*frag)->fr_flags & PFFRAG_DROP) == 0) DPFPRINTF(("fragcache[%d]: dropping overall fragment\n", h->ip_id)); (*frag)->fr_flags |= PFFRAG_DROP; } m_freem(m); return (NULL); } int pf_normalize_ip(struct mbuf **m0, int dir, struct pfi_kif *kif, u_short *reason, struct pf_pdesc *pd) { struct mbuf *m = *m0; struct pf_rule *r; struct pf_frent *frent; struct pf_fragment *frag = NULL; struct ip *h = mtod(m, struct ip *); int mff = (ntohs(h->ip_off) & IP_MF); int hlen = h->ip_hl << 2; u_int16_t fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3; u_int16_t max; int ip_len; int ip_off; int tag = -1; PF_RULES_RASSERT(); r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr); while (r != NULL) { r->evaluations++; if (pfi_kif_match(r->kif, kif) == r->ifnot) r = r->skip[PF_SKIP_IFP].ptr; else if (r->direction && r->direction != dir) r = r->skip[PF_SKIP_DIR].ptr; else if (r->af && r->af != AF_INET) r = r->skip[PF_SKIP_AF].ptr; else if (r->proto && r->proto != h->ip_p) r = r->skip[PF_SKIP_PROTO].ptr; else if (PF_MISMATCHAW(&r->src.addr, (struct pf_addr *)&h->ip_src.s_addr, AF_INET, r->src.neg, kif, M_GETFIB(m))) r = r->skip[PF_SKIP_SRC_ADDR].ptr; else if (PF_MISMATCHAW(&r->dst.addr, (struct pf_addr *)&h->ip_dst.s_addr, AF_INET, r->dst.neg, NULL, M_GETFIB(m))) r = r->skip[PF_SKIP_DST_ADDR].ptr; else if (r->match_tag && !pf_match_tag(m, r, &tag, pd->pf_mtag ? pd->pf_mtag->tag : 0)) r = TAILQ_NEXT(r, entries); else break; } if (r == NULL || r->action == PF_NOSCRUB) return (PF_PASS); else { r->packets[dir == PF_OUT]++; r->bytes[dir == PF_OUT] += pd->tot_len; } /* Check for illegal packets */ if (hlen < (int)sizeof(struct ip)) goto drop; if (hlen > ntohs(h->ip_len)) goto drop; /* Clear IP_DF if the rule uses the no-df option */ if (r->rule_flag & PFRULE_NODF && h->ip_off & htons(IP_DF)) { u_int16_t ip_off = h->ip_off; h->ip_off &= htons(~IP_DF); h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0); } /* We will need other tests here */ if (!fragoff && !mff) goto no_fragment; /* We're dealing with a fragment now. Don't allow fragments * with IP_DF to enter the cache. If the flag was cleared by * no-df above, fine. Otherwise drop it. */ if (h->ip_off & htons(IP_DF)) { DPFPRINTF(("IP_DF\n")); goto bad; } ip_len = ntohs(h->ip_len) - hlen; ip_off = (ntohs(h->ip_off) & IP_OFFMASK) << 3; /* All fragments are 8 byte aligned */ if (mff && (ip_len & 0x7)) { DPFPRINTF(("mff and %d\n", ip_len)); goto bad; } /* Respect maximum length */ if (fragoff + ip_len > IP_MAXPACKET) { DPFPRINTF(("max packet %d\n", fragoff + ip_len)); goto bad; } max = fragoff + ip_len; if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0) { /* Fully buffer all of the fragments */ PF_FRAG_LOCK(); frag = pf_find_fragment(h, &V_pf_frag_tree); /* Check if we saw the last fragment already */ if (frag != NULL && (frag->fr_flags & PFFRAG_SEENLAST) && max > frag->fr_max) goto bad; /* Get an entry for the fragment queue */ frent = uma_zalloc(V_pf_frent_z, M_NOWAIT); if (frent == NULL) { PF_FRAG_UNLOCK(); REASON_SET(reason, PFRES_MEMORY); return (PF_DROP); } frent->fr_ip = h; frent->fr_m = m; /* Might return a completely reassembled mbuf, or NULL */ DPFPRINTF(("reass frag %d @ %d-%d\n", h->ip_id, fragoff, max)); *m0 = m = pf_reassemble(m0, &frag, frent, mff); PF_FRAG_UNLOCK(); if (m == NULL) return (PF_DROP); /* use mtag from concatenated mbuf chain */ pd->pf_mtag = pf_find_mtag(m); #ifdef DIAGNOSTIC if (pd->pf_mtag == NULL) { printf("%s: pf_find_mtag returned NULL(1)\n", __func__); if ((pd->pf_mtag = pf_get_mtag(m)) == NULL) { m_freem(m); *m0 = NULL; goto no_mem; } } #endif if (frag != NULL && (frag->fr_flags & PFFRAG_DROP)) goto drop; h = mtod(m, struct ip *); } else { /* non-buffering fragment cache (drops or masks overlaps) */ int nomem = 0; if (dir == PF_OUT && pd->pf_mtag->flags & PF_TAG_FRAGCACHE) { /* * Already passed the fragment cache in the * input direction. If we continued, it would * appear to be a dup and would be dropped. */ goto fragment_pass; } PF_FRAG_LOCK(); frag = pf_find_fragment(h, &V_pf_cache_tree); /* Check if we saw the last fragment already */ if (frag != NULL && (frag->fr_flags & PFFRAG_SEENLAST) && max > frag->fr_max) { if (r->rule_flag & PFRULE_FRAGDROP) frag->fr_flags |= PFFRAG_DROP; goto bad; } *m0 = m = pf_fragcache(m0, h, &frag, mff, (r->rule_flag & PFRULE_FRAGDROP) ? 1 : 0, &nomem); PF_FRAG_UNLOCK(); if (m == NULL) { if (nomem) goto no_mem; goto drop; } /* use mtag from copied and trimmed mbuf chain */ pd->pf_mtag = pf_find_mtag(m); #ifdef DIAGNOSTIC if (pd->pf_mtag == NULL) { printf("%s: pf_find_mtag returned NULL(2)\n", __func__); if ((pd->pf_mtag = pf_get_mtag(m)) == NULL) { m_freem(m); *m0 = NULL; goto no_mem; } } #endif if (dir == PF_IN) pd->pf_mtag->flags |= PF_TAG_FRAGCACHE; if (frag != NULL && (frag->fr_flags & PFFRAG_DROP)) goto drop; goto fragment_pass; } no_fragment: /* At this point, only IP_DF is allowed in ip_off */ if (h->ip_off & ~htons(IP_DF)) { u_int16_t ip_off = h->ip_off; h->ip_off &= htons(IP_DF); h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0); } /* not missing a return here */ fragment_pass: pf_scrub_ip(&m, r->rule_flag, r->min_ttl, r->set_tos); if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0) pd->flags |= PFDESC_IP_REAS; return (PF_PASS); no_mem: REASON_SET(reason, PFRES_MEMORY); if (r != NULL && r->log) PFLOG_PACKET(kif, m, AF_INET, dir, *reason, r, NULL, NULL, pd, 1); return (PF_DROP); drop: REASON_SET(reason, PFRES_NORM); if (r != NULL && r->log) PFLOG_PACKET(kif, m, AF_INET, dir, *reason, r, NULL, NULL, pd, 1); return (PF_DROP); bad: DPFPRINTF(("dropping bad fragment\n")); /* Free associated fragments */ if (frag != NULL) { pf_free_fragment(frag); PF_FRAG_UNLOCK(); } REASON_SET(reason, PFRES_FRAG); if (r != NULL && r->log) PFLOG_PACKET(kif, m, AF_INET, dir, *reason, r, NULL, NULL, pd, 1); return (PF_DROP); } #endif #ifdef INET6 int pf_normalize_ip6(struct mbuf **m0, int dir, struct pfi_kif *kif, u_short *reason, struct pf_pdesc *pd) { struct mbuf *m = *m0; struct pf_rule *r; struct ip6_hdr *h = mtod(m, struct ip6_hdr *); int off; struct ip6_ext ext; struct ip6_opt opt; struct ip6_opt_jumbo jumbo; struct ip6_frag frag; u_int32_t jumbolen = 0, plen; u_int16_t fragoff = 0; int optend; int ooff; u_int8_t proto; int terminal; PF_RULES_RASSERT(); r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr); while (r != NULL) { r->evaluations++; if (pfi_kif_match(r->kif, kif) == r->ifnot) r = r->skip[PF_SKIP_IFP].ptr; else if (r->direction && r->direction != dir) r = r->skip[PF_SKIP_DIR].ptr; else if (r->af && r->af != AF_INET6) r = r->skip[PF_SKIP_AF].ptr; #if 0 /* header chain! */ else if (r->proto && r->proto != h->ip6_nxt) r = r->skip[PF_SKIP_PROTO].ptr; #endif else if (PF_MISMATCHAW(&r->src.addr, (struct pf_addr *)&h->ip6_src, AF_INET6, r->src.neg, kif, M_GETFIB(m))) r = r->skip[PF_SKIP_SRC_ADDR].ptr; else if (PF_MISMATCHAW(&r->dst.addr, (struct pf_addr *)&h->ip6_dst, AF_INET6, r->dst.neg, NULL, M_GETFIB(m))) r = r->skip[PF_SKIP_DST_ADDR].ptr; else break; } if (r == NULL || r->action == PF_NOSCRUB) return (PF_PASS); else { r->packets[dir == PF_OUT]++; r->bytes[dir == PF_OUT] += pd->tot_len; } /* Check for illegal packets */ if (sizeof(struct ip6_hdr) + IPV6_MAXPACKET < m->m_pkthdr.len) goto drop; off = sizeof(struct ip6_hdr); proto = h->ip6_nxt; terminal = 0; do { switch (proto) { case IPPROTO_FRAGMENT: goto fragment; break; case IPPROTO_AH: case IPPROTO_ROUTING: case IPPROTO_DSTOPTS: if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL, NULL, AF_INET6)) goto shortpkt; if (proto == IPPROTO_AH) off += (ext.ip6e_len + 2) * 4; else off += (ext.ip6e_len + 1) * 8; proto = ext.ip6e_nxt; break; case IPPROTO_HOPOPTS: if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL, NULL, AF_INET6)) goto shortpkt; optend = off + (ext.ip6e_len + 1) * 8; ooff = off + sizeof(ext); do { if (!pf_pull_hdr(m, ooff, &opt.ip6o_type, sizeof(opt.ip6o_type), NULL, NULL, AF_INET6)) goto shortpkt; if (opt.ip6o_type == IP6OPT_PAD1) { ooff++; continue; } if (!pf_pull_hdr(m, ooff, &opt, sizeof(opt), NULL, NULL, AF_INET6)) goto shortpkt; if (ooff + sizeof(opt) + opt.ip6o_len > optend) goto drop; switch (opt.ip6o_type) { case IP6OPT_JUMBO: if (h->ip6_plen != 0) goto drop; if (!pf_pull_hdr(m, ooff, &jumbo, sizeof(jumbo), NULL, NULL, AF_INET6)) goto shortpkt; memcpy(&jumbolen, jumbo.ip6oj_jumbo_len, sizeof(jumbolen)); jumbolen = ntohl(jumbolen); if (jumbolen <= IPV6_MAXPACKET) goto drop; if (sizeof(struct ip6_hdr) + jumbolen != m->m_pkthdr.len) goto drop; break; default: break; } ooff += sizeof(opt) + opt.ip6o_len; } while (ooff < optend); off = optend; proto = ext.ip6e_nxt; break; default: terminal = 1; break; } } while (!terminal); /* jumbo payload option must be present, or plen > 0 */ if (ntohs(h->ip6_plen) == 0) plen = jumbolen; else plen = ntohs(h->ip6_plen); if (plen == 0) goto drop; if (sizeof(struct ip6_hdr) + plen > m->m_pkthdr.len) goto shortpkt; pf_scrub_ip6(&m, r->min_ttl); return (PF_PASS); fragment: if (ntohs(h->ip6_plen) == 0 || jumbolen) goto drop; plen = ntohs(h->ip6_plen); if (!pf_pull_hdr(m, off, &frag, sizeof(frag), NULL, NULL, AF_INET6)) goto shortpkt; fragoff = ntohs(frag.ip6f_offlg & IP6F_OFF_MASK); if (fragoff + (plen - off - sizeof(frag)) > IPV6_MAXPACKET) goto badfrag; /* do something about it */ /* remember to set pd->flags |= PFDESC_IP_REAS */ return (PF_PASS); shortpkt: REASON_SET(reason, PFRES_SHORT); if (r != NULL && r->log) PFLOG_PACKET(kif, m, AF_INET6, dir, *reason, r, NULL, NULL, pd, 1); return (PF_DROP); drop: REASON_SET(reason, PFRES_NORM); if (r != NULL && r->log) PFLOG_PACKET(kif, m, AF_INET6, dir, *reason, r, NULL, NULL, pd, 1); return (PF_DROP); badfrag: REASON_SET(reason, PFRES_FRAG); if (r != NULL && r->log) PFLOG_PACKET(kif, m, AF_INET6, dir, *reason, r, NULL, NULL, pd, 1); return (PF_DROP); } #endif /* INET6 */ int pf_normalize_tcp(int dir, struct pfi_kif *kif, struct mbuf *m, int ipoff, int off, void *h, struct pf_pdesc *pd) { struct pf_rule *r, *rm = NULL; struct tcphdr *th = pd->hdr.tcp; int rewrite = 0; u_short reason; u_int8_t flags; sa_family_t af = pd->af; PF_RULES_RASSERT(); r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr); while (r != NULL) { r->evaluations++; if (pfi_kif_match(r->kif, kif) == r->ifnot) r = r->skip[PF_SKIP_IFP].ptr; else if (r->direction && r->direction != dir) r = r->skip[PF_SKIP_DIR].ptr; else if (r->af && r->af != af) r = r->skip[PF_SKIP_AF].ptr; else if (r->proto && r->proto != pd->proto) r = r->skip[PF_SKIP_PROTO].ptr; else if (PF_MISMATCHAW(&r->src.addr, pd->src, af, r->src.neg, kif, M_GETFIB(m))) r = r->skip[PF_SKIP_SRC_ADDR].ptr; else if (r->src.port_op && !pf_match_port(r->src.port_op, r->src.port[0], r->src.port[1], th->th_sport)) r = r->skip[PF_SKIP_SRC_PORT].ptr; else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af, r->dst.neg, NULL, M_GETFIB(m))) r = r->skip[PF_SKIP_DST_ADDR].ptr; else if (r->dst.port_op && !pf_match_port(r->dst.port_op, r->dst.port[0], r->dst.port[1], th->th_dport)) r = r->skip[PF_SKIP_DST_PORT].ptr; else if (r->os_fingerprint != PF_OSFP_ANY && !pf_osfp_match( pf_osfp_fingerprint(pd, m, off, th), r->os_fingerprint)) r = TAILQ_NEXT(r, entries); else { rm = r; break; } } if (rm == NULL || rm->action == PF_NOSCRUB) return (PF_PASS); else { r->packets[dir == PF_OUT]++; r->bytes[dir == PF_OUT] += pd->tot_len; } if (rm->rule_flag & PFRULE_REASSEMBLE_TCP) pd->flags |= PFDESC_TCP_NORM; flags = th->th_flags; if (flags & TH_SYN) { /* Illegal packet */ if (flags & TH_RST) goto tcp_drop; if (flags & TH_FIN) flags &= ~TH_FIN; } else { /* Illegal packet */ if (!(flags & (TH_ACK|TH_RST))) goto tcp_drop; } if (!(flags & TH_ACK)) { /* These flags are only valid if ACK is set */ if ((flags & TH_FIN) || (flags & TH_PUSH) || (flags & TH_URG)) goto tcp_drop; } /* Check for illegal header length */ if (th->th_off < (sizeof(struct tcphdr) >> 2)) goto tcp_drop; /* If flags changed, or reserved data set, then adjust */ if (flags != th->th_flags || th->th_x2 != 0) { u_int16_t ov, nv; ov = *(u_int16_t *)(&th->th_ack + 1); th->th_flags = flags; th->th_x2 = 0; nv = *(u_int16_t *)(&th->th_ack + 1); th->th_sum = pf_cksum_fixup(th->th_sum, ov, nv, 0); rewrite = 1; } /* Remove urgent pointer, if TH_URG is not set */ if (!(flags & TH_URG) && th->th_urp) { th->th_sum = pf_cksum_fixup(th->th_sum, th->th_urp, 0, 0); th->th_urp = 0; rewrite = 1; } /* Process options */ if (r->max_mss && pf_normalize_tcpopt(r, m, th, off, pd->af)) rewrite = 1; /* copy back packet headers if we sanitized */ if (rewrite) m_copyback(m, off, sizeof(*th), (caddr_t)th); return (PF_PASS); tcp_drop: REASON_SET(&reason, PFRES_NORM); if (rm != NULL && r->log) PFLOG_PACKET(kif, m, AF_INET, dir, reason, r, NULL, NULL, pd, 1); return (PF_DROP); } int pf_normalize_tcp_init(struct mbuf *m, int off, struct pf_pdesc *pd, struct tcphdr *th, struct pf_state_peer *src, struct pf_state_peer *dst) { u_int32_t tsval, tsecr; u_int8_t hdr[60]; u_int8_t *opt; KASSERT((src->scrub == NULL), ("pf_normalize_tcp_init: src->scrub != NULL")); src->scrub = uma_zalloc(V_pf_state_scrub_z, M_ZERO | M_NOWAIT); if (src->scrub == NULL) return (1); switch (pd->af) { #ifdef INET case AF_INET: { struct ip *h = mtod(m, struct ip *); src->scrub->pfss_ttl = h->ip_ttl; break; } #endif /* INET */ #ifdef INET6 case AF_INET6: { struct ip6_hdr *h = mtod(m, struct ip6_hdr *); src->scrub->pfss_ttl = h->ip6_hlim; break; } #endif /* INET6 */ } /* * All normalizations below are only begun if we see the start of * the connections. They must all set an enabled bit in pfss_flags */ if ((th->th_flags & TH_SYN) == 0) return (0); if (th->th_off > (sizeof(struct tcphdr) >> 2) && src->scrub && pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) { /* Diddle with TCP options */ int hlen; opt = hdr + sizeof(struct tcphdr); hlen = (th->th_off << 2) - sizeof(struct tcphdr); while (hlen >= TCPOLEN_TIMESTAMP) { switch (*opt) { case TCPOPT_EOL: /* FALLTHROUGH */ case TCPOPT_NOP: opt++; hlen--; break; case TCPOPT_TIMESTAMP: if (opt[1] >= TCPOLEN_TIMESTAMP) { src->scrub->pfss_flags |= PFSS_TIMESTAMP; src->scrub->pfss_ts_mod = htonl(arc4random()); /* note PFSS_PAWS not set yet */ memcpy(&tsval, &opt[2], sizeof(u_int32_t)); memcpy(&tsecr, &opt[6], sizeof(u_int32_t)); src->scrub->pfss_tsval0 = ntohl(tsval); src->scrub->pfss_tsval = ntohl(tsval); src->scrub->pfss_tsecr = ntohl(tsecr); getmicrouptime(&src->scrub->pfss_last); } /* FALLTHROUGH */ default: hlen -= MAX(opt[1], 2); opt += MAX(opt[1], 2); break; } } } return (0); } void pf_normalize_tcp_cleanup(struct pf_state *state) { if (state->src.scrub) uma_zfree(V_pf_state_scrub_z, state->src.scrub); if (state->dst.scrub) uma_zfree(V_pf_state_scrub_z, state->dst.scrub); /* Someday... flush the TCP segment reassembly descriptors. */ } int pf_normalize_tcp_stateful(struct mbuf *m, int off, struct pf_pdesc *pd, u_short *reason, struct tcphdr *th, struct pf_state *state, struct pf_state_peer *src, struct pf_state_peer *dst, int *writeback) { struct timeval uptime; u_int32_t tsval, tsecr; u_int tsval_from_last; u_int8_t hdr[60]; u_int8_t *opt; int copyback = 0; int got_ts = 0; KASSERT((src->scrub || dst->scrub), ("%s: src->scrub && dst->scrub!", __func__)); /* * Enforce the minimum TTL seen for this connection. Negate a common * technique to evade an intrusion detection system and confuse * firewall state code. */ switch (pd->af) { #ifdef INET case AF_INET: { if (src->scrub) { struct ip *h = mtod(m, struct ip *); if (h->ip_ttl > src->scrub->pfss_ttl) src->scrub->pfss_ttl = h->ip_ttl; h->ip_ttl = src->scrub->pfss_ttl; } break; } #endif /* INET */ #ifdef INET6 case AF_INET6: { if (src->scrub) { struct ip6_hdr *h = mtod(m, struct ip6_hdr *); if (h->ip6_hlim > src->scrub->pfss_ttl) src->scrub->pfss_ttl = h->ip6_hlim; h->ip6_hlim = src->scrub->pfss_ttl; } break; } #endif /* INET6 */ } if (th->th_off > (sizeof(struct tcphdr) >> 2) && ((src->scrub && (src->scrub->pfss_flags & PFSS_TIMESTAMP)) || (dst->scrub && (dst->scrub->pfss_flags & PFSS_TIMESTAMP))) && pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) { /* Diddle with TCP options */ int hlen; opt = hdr + sizeof(struct tcphdr); hlen = (th->th_off << 2) - sizeof(struct tcphdr); while (hlen >= TCPOLEN_TIMESTAMP) { switch (*opt) { case TCPOPT_EOL: /* FALLTHROUGH */ case TCPOPT_NOP: opt++; hlen--; break; case TCPOPT_TIMESTAMP: /* Modulate the timestamps. Can be used for * NAT detection, OS uptime determination or * reboot detection. */ if (got_ts) { /* Huh? Multiple timestamps!? */ if (V_pf_status.debug >= PF_DEBUG_MISC) { DPFPRINTF(("multiple TS??")); pf_print_state(state); printf("\n"); } REASON_SET(reason, PFRES_TS); return (PF_DROP); } if (opt[1] >= TCPOLEN_TIMESTAMP) { memcpy(&tsval, &opt[2], sizeof(u_int32_t)); if (tsval && src->scrub && (src->scrub->pfss_flags & PFSS_TIMESTAMP)) { tsval = ntohl(tsval); pf_change_a(&opt[2], &th->th_sum, htonl(tsval + src->scrub->pfss_ts_mod), 0); copyback = 1; } /* Modulate TS reply iff valid (!0) */ memcpy(&tsecr, &opt[6], sizeof(u_int32_t)); if (tsecr && dst->scrub && (dst->scrub->pfss_flags & PFSS_TIMESTAMP)) { tsecr = ntohl(tsecr) - dst->scrub->pfss_ts_mod; pf_change_a(&opt[6], &th->th_sum, htonl(tsecr), 0); copyback = 1; } got_ts = 1; } /* FALLTHROUGH */ default: hlen -= MAX(opt[1], 2); opt += MAX(opt[1], 2); break; } } if (copyback) { /* Copyback the options, caller copys back header */ *writeback = 1; m_copyback(m, off + sizeof(struct tcphdr), (th->th_off << 2) - sizeof(struct tcphdr), hdr + sizeof(struct tcphdr)); } } /* * Must invalidate PAWS checks on connections idle for too long. * The fastest allowed timestamp clock is 1ms. That turns out to * be about 24 days before it wraps. XXX Right now our lowerbound * TS echo check only works for the first 12 days of a connection * when the TS has exhausted half its 32bit space */ #define TS_MAX_IDLE (24*24*60*60) #define TS_MAX_CONN (12*24*60*60) /* XXX remove when better tsecr check */ getmicrouptime(&uptime); if (src->scrub && (src->scrub->pfss_flags & PFSS_PAWS) && (uptime.tv_sec - src->scrub->pfss_last.tv_sec > TS_MAX_IDLE || time_uptime - state->creation > TS_MAX_CONN)) { if (V_pf_status.debug >= PF_DEBUG_MISC) { DPFPRINTF(("src idled out of PAWS\n")); pf_print_state(state); printf("\n"); } src->scrub->pfss_flags = (src->scrub->pfss_flags & ~PFSS_PAWS) | PFSS_PAWS_IDLED; } if (dst->scrub && (dst->scrub->pfss_flags & PFSS_PAWS) && uptime.tv_sec - dst->scrub->pfss_last.tv_sec > TS_MAX_IDLE) { if (V_pf_status.debug >= PF_DEBUG_MISC) { DPFPRINTF(("dst idled out of PAWS\n")); pf_print_state(state); printf("\n"); } dst->scrub->pfss_flags = (dst->scrub->pfss_flags & ~PFSS_PAWS) | PFSS_PAWS_IDLED; } if (got_ts && src->scrub && dst->scrub && (src->scrub->pfss_flags & PFSS_PAWS) && (dst->scrub->pfss_flags & PFSS_PAWS)) { /* Validate that the timestamps are "in-window". * RFC1323 describes TCP Timestamp options that allow * measurement of RTT (round trip time) and PAWS * (protection against wrapped sequence numbers). PAWS * gives us a set of rules for rejecting packets on * long fat pipes (packets that were somehow delayed * in transit longer than the time it took to send the * full TCP sequence space of 4Gb). We can use these * rules and infer a few others that will let us treat * the 32bit timestamp and the 32bit echoed timestamp * as sequence numbers to prevent a blind attacker from * inserting packets into a connection. * * RFC1323 tells us: * - The timestamp on this packet must be greater than * or equal to the last value echoed by the other * endpoint. The RFC says those will be discarded * since it is a dup that has already been acked. * This gives us a lowerbound on the timestamp. * timestamp >= other last echoed timestamp * - The timestamp will be less than or equal to * the last timestamp plus the time between the * last packet and now. The RFC defines the max * clock rate as 1ms. We will allow clocks to be * up to 10% fast and will allow a total difference * or 30 seconds due to a route change. And this * gives us an upperbound on the timestamp. * timestamp <= last timestamp + max ticks * We have to be careful here. Windows will send an * initial timestamp of zero and then initialize it * to a random value after the 3whs; presumably to * avoid a DoS by having to call an expensive RNG * during a SYN flood. Proof MS has at least one * good security geek. * * - The TCP timestamp option must also echo the other * endpoints timestamp. The timestamp echoed is the * one carried on the earliest unacknowledged segment * on the left edge of the sequence window. The RFC * states that the host will reject any echoed * timestamps that were larger than any ever sent. * This gives us an upperbound on the TS echo. * tescr <= largest_tsval * - The lowerbound on the TS echo is a little more * tricky to determine. The other endpoint's echoed * values will not decrease. But there may be * network conditions that re-order packets and * cause our view of them to decrease. For now the * only lowerbound we can safely determine is that * the TS echo will never be less than the original * TS. XXX There is probably a better lowerbound. * Remove TS_MAX_CONN with better lowerbound check. * tescr >= other original TS * * It is also important to note that the fastest * timestamp clock of 1ms will wrap its 32bit space in * 24 days. So we just disable TS checking after 24 * days of idle time. We actually must use a 12d * connection limit until we can come up with a better * lowerbound to the TS echo check. */ struct timeval delta_ts; int ts_fudge; /* * PFTM_TS_DIFF is how many seconds of leeway to allow * a host's timestamp. This can happen if the previous * packet got delayed in transit for much longer than * this packet. */ if ((ts_fudge = state->rule.ptr->timeout[PFTM_TS_DIFF]) == 0) ts_fudge = V_pf_default_rule.timeout[PFTM_TS_DIFF]; /* Calculate max ticks since the last timestamp */ #define TS_MAXFREQ 1100 /* RFC max TS freq of 1Khz + 10% skew */ #define TS_MICROSECS 1000000 /* microseconds per second */ delta_ts = uptime; timevalsub(&delta_ts, &src->scrub->pfss_last); tsval_from_last = (delta_ts.tv_sec + ts_fudge) * TS_MAXFREQ; tsval_from_last += delta_ts.tv_usec / (TS_MICROSECS/TS_MAXFREQ); if ((src->state >= TCPS_ESTABLISHED && dst->state >= TCPS_ESTABLISHED) && (SEQ_LT(tsval, dst->scrub->pfss_tsecr) || SEQ_GT(tsval, src->scrub->pfss_tsval + tsval_from_last) || (tsecr && (SEQ_GT(tsecr, dst->scrub->pfss_tsval) || SEQ_LT(tsecr, dst->scrub->pfss_tsval0))))) { /* Bad RFC1323 implementation or an insertion attack. * * - Solaris 2.6 and 2.7 are known to send another ACK * after the FIN,FIN|ACK,ACK closing that carries * an old timestamp. */ DPFPRINTF(("Timestamp failed %c%c%c%c\n", SEQ_LT(tsval, dst->scrub->pfss_tsecr) ? '0' : ' ', SEQ_GT(tsval, src->scrub->pfss_tsval + tsval_from_last) ? '1' : ' ', SEQ_GT(tsecr, dst->scrub->pfss_tsval) ? '2' : ' ', SEQ_LT(tsecr, dst->scrub->pfss_tsval0)? '3' : ' ')); DPFPRINTF((" tsval: %u tsecr: %u +ticks: %u " "idle: %jus %lums\n", tsval, tsecr, tsval_from_last, (uintmax_t)delta_ts.tv_sec, delta_ts.tv_usec / 1000)); DPFPRINTF((" src->tsval: %u tsecr: %u\n", src->scrub->pfss_tsval, src->scrub->pfss_tsecr)); DPFPRINTF((" dst->tsval: %u tsecr: %u tsval0: %u" "\n", dst->scrub->pfss_tsval, dst->scrub->pfss_tsecr, dst->scrub->pfss_tsval0)); if (V_pf_status.debug >= PF_DEBUG_MISC) { pf_print_state(state); pf_print_flags(th->th_flags); printf("\n"); } REASON_SET(reason, PFRES_TS); return (PF_DROP); } /* XXX I'd really like to require tsecr but it's optional */ } else if (!got_ts && (th->th_flags & TH_RST) == 0 && ((src->state == TCPS_ESTABLISHED && dst->state == TCPS_ESTABLISHED) || pd->p_len > 0 || (th->th_flags & TH_SYN)) && src->scrub && dst->scrub && (src->scrub->pfss_flags & PFSS_PAWS) && (dst->scrub->pfss_flags & PFSS_PAWS)) { /* Didn't send a timestamp. Timestamps aren't really useful * when: * - connection opening or closing (often not even sent). * but we must not let an attacker to put a FIN on a * data packet to sneak it through our ESTABLISHED check. * - on a TCP reset. RFC suggests not even looking at TS. * - on an empty ACK. The TS will not be echoed so it will * probably not help keep the RTT calculation in sync and * there isn't as much danger when the sequence numbers * got wrapped. So some stacks don't include TS on empty * ACKs :-( * * To minimize the disruption to mostly RFC1323 conformant * stacks, we will only require timestamps on data packets. * * And what do ya know, we cannot require timestamps on data * packets. There appear to be devices that do legitimate * TCP connection hijacking. There are HTTP devices that allow * a 3whs (with timestamps) and then buffer the HTTP request. * If the intermediate device has the HTTP response cache, it * will spoof the response but not bother timestamping its * packets. So we can look for the presence of a timestamp in * the first data packet and if there, require it in all future * packets. */ if (pd->p_len > 0 && (src->scrub->pfss_flags & PFSS_DATA_TS)) { /* * Hey! Someone tried to sneak a packet in. Or the * stack changed its RFC1323 behavior?!?! */ if (V_pf_status.debug >= PF_DEBUG_MISC) { DPFPRINTF(("Did not receive expected RFC1323 " "timestamp\n")); pf_print_state(state); pf_print_flags(th->th_flags); printf("\n"); } REASON_SET(reason, PFRES_TS); return (PF_DROP); } } /* * We will note if a host sends his data packets with or without * timestamps. And require all data packets to contain a timestamp * if the first does. PAWS implicitly requires that all data packets be * timestamped. But I think there are middle-man devices that hijack * TCP streams immediately after the 3whs and don't timestamp their * packets (seen in a WWW accelerator or cache). */ if (pd->p_len > 0 && src->scrub && (src->scrub->pfss_flags & (PFSS_TIMESTAMP|PFSS_DATA_TS|PFSS_DATA_NOTS)) == PFSS_TIMESTAMP) { if (got_ts) src->scrub->pfss_flags |= PFSS_DATA_TS; else { src->scrub->pfss_flags |= PFSS_DATA_NOTS; if (V_pf_status.debug >= PF_DEBUG_MISC && dst->scrub && (dst->scrub->pfss_flags & PFSS_TIMESTAMP)) { /* Don't warn if other host rejected RFC1323 */ DPFPRINTF(("Broken RFC1323 stack did not " "timestamp data packet. Disabled PAWS " "security.\n")); pf_print_state(state); pf_print_flags(th->th_flags); printf("\n"); } } } /* * Update PAWS values */ if (got_ts && src->scrub && PFSS_TIMESTAMP == (src->scrub->pfss_flags & (PFSS_PAWS_IDLED|PFSS_TIMESTAMP))) { getmicrouptime(&src->scrub->pfss_last); if (SEQ_GEQ(tsval, src->scrub->pfss_tsval) || (src->scrub->pfss_flags & PFSS_PAWS) == 0) src->scrub->pfss_tsval = tsval; if (tsecr) { if (SEQ_GEQ(tsecr, src->scrub->pfss_tsecr) || (src->scrub->pfss_flags & PFSS_PAWS) == 0) src->scrub->pfss_tsecr = tsecr; if ((src->scrub->pfss_flags & PFSS_PAWS) == 0 && (SEQ_LT(tsval, src->scrub->pfss_tsval0) || src->scrub->pfss_tsval0 == 0)) { /* tsval0 MUST be the lowest timestamp */ src->scrub->pfss_tsval0 = tsval; } /* Only fully initialized after a TS gets echoed */ if ((src->scrub->pfss_flags & PFSS_PAWS) == 0) src->scrub->pfss_flags |= PFSS_PAWS; } } /* I have a dream.... TCP segment reassembly.... */ return (0); } static int pf_normalize_tcpopt(struct pf_rule *r, struct mbuf *m, struct tcphdr *th, int off, sa_family_t af) { u_int16_t *mss; int thoff; int opt, cnt, optlen = 0; int rewrite = 0; u_char opts[TCP_MAXOLEN]; u_char *optp = opts; thoff = th->th_off << 2; cnt = thoff - sizeof(struct tcphdr); if (cnt > 0 && !pf_pull_hdr(m, off + sizeof(*th), opts, cnt, NULL, NULL, af)) return (rewrite); for (; cnt > 0; cnt -= optlen, optp += optlen) { opt = optp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { if (cnt < 2) break; optlen = optp[1]; if (optlen < 2 || optlen > cnt) break; } switch (opt) { case TCPOPT_MAXSEG: mss = (u_int16_t *)(optp + 2); if ((ntohs(*mss)) > r->max_mss) { th->th_sum = pf_cksum_fixup(th->th_sum, *mss, htons(r->max_mss), 0); *mss = htons(r->max_mss); rewrite = 1; } break; default: break; } } if (rewrite) m_copyback(m, off + sizeof(*th), thoff - sizeof(*th), opts); return (rewrite); } #ifdef INET static void pf_scrub_ip(struct mbuf **m0, u_int32_t flags, u_int8_t min_ttl, u_int8_t tos) { struct mbuf *m = *m0; struct ip *h = mtod(m, struct ip *); /* Clear IP_DF if no-df was requested */ if (flags & PFRULE_NODF && h->ip_off & htons(IP_DF)) { u_int16_t ip_off = h->ip_off; h->ip_off &= htons(~IP_DF); h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0); } /* Enforce a minimum ttl, may cause endless packet loops */ if (min_ttl && h->ip_ttl < min_ttl) { u_int16_t ip_ttl = h->ip_ttl; h->ip_ttl = min_ttl; h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_ttl, h->ip_ttl, 0); } /* Enforce tos */ if (flags & PFRULE_SET_TOS) { u_int16_t ov, nv; ov = *(u_int16_t *)h; h->ip_tos = tos; nv = *(u_int16_t *)h; h->ip_sum = pf_cksum_fixup(h->ip_sum, ov, nv, 0); } /* random-id, but not for fragments */ if (flags & PFRULE_RANDOMID && !(h->ip_off & ~htons(IP_DF))) { u_int16_t ip_id = h->ip_id; h->ip_id = ip_randomid(); h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_id, h->ip_id, 0); } } #endif /* INET */ #ifdef INET6 static void pf_scrub_ip6(struct mbuf **m0, u_int8_t min_ttl) { struct mbuf *m = *m0; struct ip6_hdr *h = mtod(m, struct ip6_hdr *); /* Enforce a minimum ttl, may cause endless packet loops */ if (min_ttl && h->ip6_hlim < min_ttl) h->ip6_hlim = min_ttl; } #endif Index: stable/10/usr.bin/kdump/Makefile =================================================================== --- stable/10/usr.bin/kdump/Makefile (revision 263085) +++ stable/10/usr.bin/kdump/Makefile (revision 263086) @@ -1,40 +1,46 @@ # @(#)Makefile 8.1 (Berkeley) 6/6/93 # $FreeBSD$ +.include + .if (${MACHINE_ARCH} == "amd64") SFX= 32 .endif .PATH: ${.CURDIR}/../ktrace PROG= kdump SRCS= kdump_subr.c kdump.c ioctl.c subr.c DPSRCS= kdump_subr.h CFLAGS+= -I${.CURDIR}/../ktrace -I${.CURDIR} -I${.CURDIR}/../.. -I. + +.if ${MK_PF} != "no" +CFLAGS+=-DPF +.endif .if ${MACHINE_ARCH} == "amd64" || ${MACHINE_ARCH} == "i386" SRCS+= linux_syscalls.c .endif NO_WERROR?= YES CLEANFILES= ioctl.c kdump_subr.c kdump_subr.h linux_syscalls.c ioctl.c: mkioctls env MACHINE=${MACHINE} CPP="${CPP}" \ sh ${.CURDIR}/mkioctls print ${DESTDIR}/usr/include > ${.TARGET} kdump_subr.h: mksubr sh ${.CURDIR}/mksubr ${DESTDIR}/usr/include | \ sed -n 's/^\([a-z].*)\)$$/void \1;/p' >${.TARGET} kdump_subr.c: mksubr kdump_subr.h sh ${.CURDIR}/mksubr ${DESTDIR}/usr/include >${.TARGET} linux_syscalls.c: /bin/sh ${.CURDIR}/../../sys/kern/makesyscalls.sh \ ${.CURDIR}/../../sys/${MACHINE_ARCH}/linux${SFX}/syscalls.master ${.CURDIR}/linux_syscalls.conf echo "int nlinux_syscalls = sizeof(linux_syscallnames) / sizeof(linux_syscallnames[0]);" \ >> linux_syscalls.c .include Index: stable/10/usr.bin/kdump/mkioctls =================================================================== --- stable/10/usr.bin/kdump/mkioctls (revision 263085) +++ stable/10/usr.bin/kdump/mkioctls (revision 263086) @@ -1,115 +1,119 @@ #!/bin/sh # # $FreeBSD$ # # When editing this script, keep in mind that truss also uses it. # set -e if [ $# -ne 2 -o \( $1 != "print" -a $1 != "return" \) ]; then echo "usage: sh $0 print|return include-dir" exit 1 fi style="$1" includedir="$2" LC_ALL=C; export LC_ALL # Build a list of headers that have ioctls in them. # XXX should we use an ANSI cpp? ioctl_includes=$( cd $includedir - find -H -s * -name '*.h' | grep -v '.*disk.*\.h' | \ + find -H -s * -name '*.h' | \ + egrep -v '(.*disk.*|net/pfvar|net/if_pfsync)\.h' | \ xargs egrep -l \ '^#[ ]*define[ ]+[A-Za-z_][A-Za-z0-9_]*[ ]+_IO[^a-z0-9_]' | awk '{printf("#include <%s>\\n", $1)}' ) : ${MACHINE=$(uname -m)} case "${MACHINE}" in *pc98*) ioctl_includes="$ioctl_includes#include \\n" ;; *) ioctl_includes="$ioctl_includes#include \\n" ;; esac awk -v x="$ioctl_includes" 'BEGIN {print x}' | $CPP -I$1 -dM -DCOMPAT_43TTY - | awk -v ioctl_includes="$ioctl_includes" -v style="$style" ' BEGIN { print "/* XXX obnoxious prerequisites. */" print "#define COMPAT_43" print "#define COMPAT_43TTY" print "#include " print "#include " print "#include " print "#include " print "#include " print "#include " print "#include " print "#include " print "#include " print "#include " + print "#ifdef PF" print "#include " + print "#include " + print "#endif" print "#include " print "#include " print "#include " print "#include " print "#include " print "#include " print "#include " print "#include " print "#include " print "#include " print "" print ioctl_includes print "" if (style == "print") { print "void ioctlname(unsigned long val, int decimal);" print "" print "void" print "ioctlname(unsigned long val, int decimal)" } else { print "const char *ioctlname(unsigned long val);" print "" print "const char *" print "ioctlname(unsigned long val)" } print "{" print "\tconst char *str = NULL;" print "" } /^#[ ]*define[ ]+[A-Za-z_][A-Za-z0-9_]*[ ]+_IO/ { # find where the name starts for (i = 1; i <= NF; i++) if ($i ~ /define/) break; ++i; # printf("\t"); if (n++ > 0) printf("else "); printf("if (val == %s)\n", $i); printf("\t\tstr = \"%s\";\n", $i); } END { print "" if (style == "print") { print "\tif (str != NULL)" print "\t\tprintf(\"%s\", str);" print "\telse if (decimal)" print "\t\tprintf(\"%lu\", val);" print "\telse" print "\t\tprintf(\"%#lx\", val);" } else { print "\treturn (str);" } print "}" } ' Index: stable/10/usr.bin/netstat/Makefile =================================================================== --- stable/10/usr.bin/netstat/Makefile (revision 263085) +++ stable/10/usr.bin/netstat/Makefile (revision 263086) @@ -1,49 +1,53 @@ # @(#)Makefile 8.1 (Berkeley) 6/12/93 # $FreeBSD$ .include PROG= netstat SRCS= if.c inet.c main.c mbuf.c mroute.c netisr.c route.c \ unix.c atalk.c mroute6.c ipsec.c bpf.c pfkey.c sctp.c \ flowtable.c WARNS?= 3 CFLAGS+=-fno-strict-aliasing CFLAGS+=-DIPSEC CFLAGS+=-DSCTP .if ${MK_INET_SUPPORT} != "no" CFLAGS+=-DINET .endif .if ${MK_INET6_SUPPORT} != "no" SRCS+= inet6.c CFLAGS+=-DINET6 .endif .if ${MK_OFED} != "no" CFLAGS+=-DSDP .endif +.if ${MK_PF} != "no" +CFLAGS+=-DPF +.endif + BINGRP= kmem BINMODE=2555 DPADD= ${LIBKVM} ${LIBMEMSTAT} ${LIBUTIL} LDADD= -lkvm -lmemstat -lutil .if ${MK_NETGRAPH_SUPPORT} != "no" SRCS+= netgraph.c DPADD+= ${LIBNETGRAPH} LDADD+= -lnetgraph CFLAGS+=-DNETGRAPH .endif .if ${MK_IPX_SUPPORT} != "no" SRCS+= ipx.c DPADD+= ${LIBIPX} LDADD+= -lipx CFLAGS+=-DIPX .endif .include Index: stable/10/usr.bin/netstat/if.c =================================================================== --- stable/10/usr.bin/netstat/if.c (revision 263085) +++ stable/10/usr.bin/netstat/if.c (revision 263086) @@ -1,590 +1,594 @@ /*- * Copyright (c) 2013 Gleb Smirnoff * Copyright (c) 1983, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #if 0 #ifndef lint static char sccsid[] = "@(#)if.c 8.3 (Berkeley) 4/28/95"; #endif /* not lint */ #endif #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include -#include -#include #include #include #include #include #include +#ifdef PF +#include +#include +#endif #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #include #include #include #include #include "netstat.h" static void sidewaysintpr(int); #ifdef INET6 static char addr_buf[NI_MAXHOST]; /* for getnameinfo() */ #endif +#ifdef PF static const char* pfsyncacts[] = { /* PFSYNC_ACT_CLR */ "clear all request", /* PFSYNC_ACT_INS */ "state insert", /* PFSYNC_ACT_INS_ACK */ "state inserted ack", /* PFSYNC_ACT_UPD */ "state update", /* PFSYNC_ACT_UPD_C */ "compressed state update", /* PFSYNC_ACT_UPD_REQ */ "uncompressed state request", /* PFSYNC_ACT_DEL */ "state delete", /* PFSYNC_ACT_DEL_C */ "compressed state delete", /* PFSYNC_ACT_INS_F */ "fragment insert", /* PFSYNC_ACT_DEL_F */ "fragment delete", /* PFSYNC_ACT_BUS */ "bulk update mark", /* PFSYNC_ACT_TDB */ "TDB replay counter update", /* PFSYNC_ACT_EOF */ "end of frame mark", }; static void pfsync_acts_stats(const char *fmt, uint64_t *a) { int i; for (i = 0; i < PFSYNC_ACT_MAX; i++, a++) if (*a || sflag <= 1) printf(fmt, *a, pfsyncacts[i], plural(*a)); } /* * Dump pfsync statistics structure. */ void pfsync_stats(u_long off, const char *name, int af1 __unused, int proto __unused) { struct pfsyncstats pfsyncstat, zerostat; size_t len = sizeof(struct pfsyncstats); if (live) { if (zflag) memset(&zerostat, 0, len); if (sysctlbyname("net.pfsync.stats", &pfsyncstat, &len, zflag ? &zerostat : NULL, zflag ? len : 0) < 0) { if (errno != ENOENT) warn("sysctl: net.pfsync.stats"); return; } } else kread(off, &pfsyncstat, len); printf("%s:\n", name); #define p(f, m) if (pfsyncstat.f || sflag <= 1) \ printf(m, (uintmax_t)pfsyncstat.f, plural(pfsyncstat.f)) p(pfsyncs_ipackets, "\t%ju packet%s received (IPv4)\n"); p(pfsyncs_ipackets6, "\t%ju packet%s received (IPv6)\n"); pfsync_acts_stats("\t %ju %s%s received\n", &pfsyncstat.pfsyncs_iacts[0]); p(pfsyncs_badif, "\t\t%ju packet%s discarded for bad interface\n"); p(pfsyncs_badttl, "\t\t%ju packet%s discarded for bad ttl\n"); p(pfsyncs_hdrops, "\t\t%ju packet%s shorter than header\n"); p(pfsyncs_badver, "\t\t%ju packet%s discarded for bad version\n"); p(pfsyncs_badauth, "\t\t%ju packet%s discarded for bad HMAC\n"); p(pfsyncs_badact,"\t\t%ju packet%s discarded for bad action\n"); p(pfsyncs_badlen, "\t\t%ju packet%s discarded for short packet\n"); p(pfsyncs_badval, "\t\t%ju state%s discarded for bad values\n"); p(pfsyncs_stale, "\t\t%ju stale state%s\n"); p(pfsyncs_badstate, "\t\t%ju failed state lookup/insert%s\n"); p(pfsyncs_opackets, "\t%ju packet%s sent (IPv4)\n"); p(pfsyncs_opackets6, "\t%ju packet%s sent (IPv6)\n"); pfsync_acts_stats("\t %ju %s%s sent\n", &pfsyncstat.pfsyncs_oacts[0]); p(pfsyncs_onomem, "\t\t%ju failure%s due to mbuf memory error\n"); p(pfsyncs_oerrors, "\t\t%ju send error%s\n"); #undef p } +#endif /* PF */ /* * Display a formatted value, or a '-' in the same space. */ static void show_stat(const char *fmt, int width, u_long value, short showvalue) { const char *lsep, *rsep; char newfmt[32]; lsep = ""; if (strncmp(fmt, "LS", 2) == 0) { lsep = " "; fmt += 2; } rsep = " "; if (strncmp(fmt, "NRS", 3) == 0) { rsep = ""; fmt += 3; } if (showvalue == 0) { /* Print just dash. */ sprintf(newfmt, "%s%%%ds%s", lsep, width, rsep); printf(newfmt, "-"); return; } if (hflag) { char buf[5]; /* Format in human readable form. */ humanize_number(buf, sizeof(buf), (int64_t)value, "", HN_AUTOSCALE, HN_NOSPACE | HN_DECIMAL); sprintf(newfmt, "%s%%%ds%s", lsep, width, rsep); printf(newfmt, buf); } else { /* Construct the format string. */ sprintf(newfmt, "%s%%%d%s%s", lsep, width, fmt, rsep); printf(newfmt, value); } } /* * Find next multiaddr for a given interface name. */ static struct ifmaddrs * next_ifma(struct ifmaddrs *ifma, const char *name, const sa_family_t family) { for(; ifma != NULL; ifma = ifma->ifma_next) { struct sockaddr_dl *sdl; sdl = (struct sockaddr_dl *)ifma->ifma_name; if (ifma->ifma_addr->sa_family == family && strcmp(sdl->sdl_data, name) == 0) break; } return (ifma); } /* * Print a description of the network interfaces. */ void intpr(int interval, void (*pfunc)(char *)) { struct ifaddrs *ifap, *ifa; struct ifmaddrs *ifmap, *ifma; if (interval) return sidewaysintpr(interval); if (getifaddrs(&ifap) != 0) err(EX_OSERR, "getifaddrs"); if (aflag && getifmaddrs(&ifmap) != 0) err(EX_OSERR, "getifmaddrs"); if (!pfunc) { if (Wflag) printf("%-7.7s", "Name"); else printf("%-5.5s", "Name"); printf(" %5.5s %-13.13s %-17.17s %8.8s %5.5s %5.5s", "Mtu", "Network", "Address", "Ipkts", "Ierrs", "Idrop"); if (bflag) printf(" %10.10s","Ibytes"); printf(" %8.8s %5.5s", "Opkts", "Oerrs"); if (bflag) printf(" %10.10s","Obytes"); printf(" %5s", "Coll"); if (dflag) printf(" %s", "Drop"); putchar('\n'); } for (ifa = ifap; ifa; ifa = ifa->ifa_next) { bool network = false, link = false; if (interface != NULL && strcmp(ifa->ifa_name, interface) != 0) continue; if (pfunc) { char *name; name = ifa->ifa_name; (*pfunc)(name); /* * Skip all ifaddrs belonging to same interface. */ while(ifa->ifa_next != NULL && (strcmp(ifa->ifa_next->ifa_name, name) == 0)) { ifa = ifa->ifa_next; } continue; } if (af != AF_UNSPEC && ifa->ifa_addr->sa_family != af) continue; if (Wflag) printf("%-7.7s", ifa->ifa_name); else printf("%-5.5s", ifa->ifa_name); #define IFA_MTU(ifa) (((struct if_data *)(ifa)->ifa_data)->ifi_mtu) show_stat("lu", 6, IFA_MTU(ifa), IFA_MTU(ifa)); #undef IFA_MTU switch (ifa->ifa_addr->sa_family) { case AF_UNSPEC: printf("%-13.13s ", "none"); printf("%-15.15s ", "none"); break; case AF_INET: { struct sockaddr_in *sin, *mask; sin = (struct sockaddr_in *)ifa->ifa_addr; mask = (struct sockaddr_in *)ifa->ifa_netmask; printf("%-13.13s ", netname(sin->sin_addr.s_addr, mask->sin_addr.s_addr)); printf("%-17.17s ", routename(sin->sin_addr.s_addr)); network = true; break; } #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6, *mask; sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; mask = (struct sockaddr_in6 *)ifa->ifa_netmask; printf("%-13.13s ", netname6(sin6, &mask->sin6_addr)); getnameinfo(ifa->ifa_addr, ifa->ifa_addr->sa_len, addr_buf, sizeof(addr_buf), 0, 0, NI_NUMERICHOST); printf("%-17.17s ", addr_buf); network = 1; break; } #endif /* INET6 */ case AF_IPX: { struct sockaddr_ipx *sipx; u_long net; char netnum[10]; sipx = (struct sockaddr_ipx *)ifa->ifa_addr; *(union ipx_net *) &net = sipx->sipx_addr.x_net; sprintf(netnum, "%lx", (u_long)ntohl(net)); printf("ipx:%-8s ", netnum); printf("%-17s ", ipx_phost((struct sockaddr *)sipx)); network = 1; break; } case AF_APPLETALK: printf("atalk:%-12.12s ", atalk_print(ifa->ifa_addr, 0x10)); printf("%-11.11s ", atalk_print(ifa->ifa_addr, 0x0b)); break; case AF_LINK: { struct sockaddr_dl *sdl; char *cp, linknum[10]; int n, m; sdl = (struct sockaddr_dl *)ifa->ifa_addr; cp = (char *)LLADDR(sdl); n = sdl->sdl_alen; sprintf(linknum, "", sdl->sdl_index); m = printf("%-13.13s ", linknum); while ((--n >= 0) && (m < 30)) m += printf("%02x%c", *cp++ & 0xff, n > 0 ? ':' : ' '); m = 32 - m; while (m-- > 0) putchar(' '); link = 1; break; } } #define IFA_STAT(s) (((struct if_data *)ifa->ifa_data)->ifi_ ## s) show_stat("lu", 8, IFA_STAT(ipackets), link|network); show_stat("lu", 5, IFA_STAT(ierrors), link); show_stat("lu", 5, IFA_STAT(iqdrops), link); if (bflag) show_stat("lu", 10, IFA_STAT(ibytes), link|network); show_stat("lu", 8, IFA_STAT(opackets), link|network); show_stat("lu", 5, IFA_STAT(oerrors), link); if (bflag) show_stat("lu", 10, IFA_STAT(obytes), link|network); show_stat("NRSlu", 5, IFA_STAT(collisions), link); /* XXXGL: output queue drops */ putchar('\n'); if (!aflag) continue; /* * Print family's multicast addresses. */ for (ifma = next_ifma(ifmap, ifa->ifa_name, ifa->ifa_addr->sa_family); ifma != NULL; ifma = next_ifma(ifma, ifa->ifa_name, ifa->ifa_addr->sa_family)) { const char *fmt = NULL; switch (ifma->ifma_addr->sa_family) { case AF_INET: { struct sockaddr_in *sin; sin = (struct sockaddr_in *)ifma->ifma_addr; fmt = routename(sin->sin_addr.s_addr); break; } #ifdef INET6 case AF_INET6: /* in6_fillscopeid(&msa.in6); */ getnameinfo(ifma->ifma_addr, ifma->ifma_addr->sa_len, addr_buf, sizeof(addr_buf), 0, 0, NI_NUMERICHOST); printf("%*s %s\n", Wflag ? 27 : 25, "", addr_buf); break; #endif /* INET6 */ case AF_LINK: { struct sockaddr_dl *sdl; sdl = (struct sockaddr_dl *)ifma->ifma_addr; switch (sdl->sdl_type) { case IFT_ETHER: case IFT_FDDI: fmt = ether_ntoa( (struct ether_addr *)LLADDR(sdl)); break; } break; } } if (fmt) { printf("%*s %-17.17s", Wflag ? 27 : 25, "", fmt); if (ifma->ifma_addr->sa_family == AF_LINK) { printf(" %8lu", IFA_STAT(imcasts)); printf("%*s", bflag ? 17 : 6, ""); printf(" %8lu", IFA_STAT(omcasts)); } putchar('\n'); } ifma = ifma->ifma_next; } } freeifaddrs(ifap); if (aflag) freeifmaddrs(ifmap); } struct iftot { u_long ift_ip; /* input packets */ u_long ift_ie; /* input errors */ u_long ift_id; /* input drops */ u_long ift_op; /* output packets */ u_long ift_oe; /* output errors */ u_long ift_co; /* collisions */ u_long ift_ib; /* input bytes */ u_long ift_ob; /* output bytes */ }; /* * Obtain stats for interface(s). */ static void fill_iftot(struct iftot *st) { struct ifaddrs *ifap, *ifa; bool found = false; if (getifaddrs(&ifap) != 0) err(EX_OSERR, "getifaddrs"); bzero(st, sizeof(*st)); for (ifa = ifap; ifa; ifa = ifa->ifa_next) { if (ifa->ifa_addr->sa_family != AF_LINK) continue; if (interface) { if (strcmp(ifa->ifa_name, interface) == 0) found = true; else continue; } st->ift_ip += IFA_STAT(ipackets); st->ift_ie += IFA_STAT(ierrors); st->ift_id += IFA_STAT(iqdrops); st->ift_ib += IFA_STAT(ibytes); st->ift_op += IFA_STAT(opackets); st->ift_oe += IFA_STAT(oerrors); st->ift_ob += IFA_STAT(obytes); st->ift_co += IFA_STAT(collisions); } if (interface && found == false) err(EX_DATAERR, "interface %s not found", interface); freeifaddrs(ifap); } /* * Set a flag to indicate that a signal from the periodic itimer has been * caught. */ static sig_atomic_t signalled; static void catchalarm(int signo __unused) { signalled = true; } /* * Print a running summary of interface statistics. * Repeat display every interval seconds, showing statistics * collected over that interval. Assumes that interval is non-zero. * First line printed at top of screen is always cumulative. */ static void sidewaysintpr(int interval) { struct iftot ift[2], *new, *old; struct itimerval interval_it; int oldmask, line; new = &ift[0]; old = &ift[1]; fill_iftot(old); (void)signal(SIGALRM, catchalarm); signalled = false; interval_it.it_interval.tv_sec = interval; interval_it.it_interval.tv_usec = 0; interval_it.it_value = interval_it.it_interval; setitimer(ITIMER_REAL, &interval_it, NULL); banner: printf("%17s %14s %16s", "input", interface != NULL ? interface : "(Total)", "output"); putchar('\n'); printf("%10s %5s %5s %10s %10s %5s %10s %5s", "packets", "errs", "idrops", "bytes", "packets", "errs", "bytes", "colls"); if (dflag) printf(" %5.5s", "drops"); putchar('\n'); fflush(stdout); line = 0; loop: if ((noutputs != 0) && (--noutputs == 0)) exit(0); oldmask = sigblock(sigmask(SIGALRM)); while (!signalled) sigpause(0); signalled = false; sigsetmask(oldmask); line++; fill_iftot(new); show_stat("lu", 10, new->ift_ip - old->ift_ip, 1); show_stat("lu", 5, new->ift_ie - old->ift_ie, 1); show_stat("lu", 5, new->ift_id - old->ift_id, 1); show_stat("lu", 10, new->ift_ib - old->ift_ib, 1); show_stat("lu", 10, new->ift_op - old->ift_op, 1); show_stat("lu", 5, new->ift_oe - old->ift_oe, 1); show_stat("lu", 10, new->ift_ob - old->ift_ob, 1); show_stat("NRSlu", 5, new->ift_co - old->ift_co, 1); /* XXXGL: output queue drops */ putchar('\n'); fflush(stdout); if (new == &ift[0]) { new = &ift[1]; old = &ift[0]; } else { new = &ift[0]; old = &ift[1]; } if (line == 21) goto banner; else goto loop; /* NOTREACHED */ } Index: stable/10/usr.bin/netstat/main.c =================================================================== --- stable/10/usr.bin/netstat/main.c (revision 263085) +++ stable/10/usr.bin/netstat/main.c (revision 263086) @@ -1,846 +1,848 @@ /*- * Copyright (c) 1983, 1988, 1993 * Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef lint char const copyright[] = "@(#) Copyright (c) 1983, 1988, 1993\n\ Regents of the University of California. All rights reserved.\n"; #endif /* not lint */ #if 0 #ifndef lint static char sccsid[] = "@(#)main.c 8.4 (Berkeley) 3/1/94"; #endif /* not lint */ #endif #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #ifdef NETGRAPH #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include "netstat.h" static struct nlist nl[] = { #define N_IFNET 0 { .n_name = "_ifnet" }, /* XXXGL: can be deleted */ #define N_RTSTAT 1 { .n_name = "_rtstat" }, #define N_RTREE 2 { .n_name = "_rt_tables"}, #define N_MRTSTAT 3 { .n_name = "_mrtstat" }, #define N_MFCHASHTBL 4 { .n_name = "_mfchashtbl" }, #define N_VIFTABLE 5 { .n_name = "_viftable" }, #define N_IPX 6 { .n_name = "_ipxpcb_list"}, #define N_IPXSTAT 7 { .n_name = "_ipxstat"}, #define N_SPXSTAT 8 { .n_name = "_spx_istat"}, #define N_DDPSTAT 9 { .n_name = "_ddpstat"}, #define N_DDPCB 10 { .n_name = "_ddpcb"}, #define N_NGSOCKS 11 { .n_name = "_ngsocklist"}, #define N_IP6STAT 12 { .n_name = "_ip6stat" }, #define N_ICMP6STAT 13 { .n_name = "_icmp6stat" }, #define N_IPSECSTAT 14 { .n_name = "_ipsec4stat" }, #define N_IPSEC6STAT 15 { .n_name = "_ipsec6stat" }, #define N_PIM6STAT 16 { .n_name = "_pim6stat" }, #define N_MRT6STAT 17 { .n_name = "_mrt6stat" }, #define N_MF6CTABLE 18 { .n_name = "_mf6ctable" }, #define N_MIF6TABLE 19 { .n_name = "_mif6table" }, #define N_PFKEYSTAT 20 { .n_name = "_pfkeystat" }, #define N_RTTRASH 21 { .n_name = "_rttrash" }, #define N_CARPSTAT 22 { .n_name = "_carpstats" }, #define N_PFSYNCSTAT 23 { .n_name = "_pfsyncstats" }, #define N_AHSTAT 24 { .n_name = "_ahstat" }, #define N_ESPSTAT 25 { .n_name = "_espstat" }, #define N_IPCOMPSTAT 26 { .n_name = "_ipcompstat" }, #define N_TCPSTAT 27 { .n_name = "_tcpstat" }, #define N_UDPSTAT 28 { .n_name = "_udpstat" }, #define N_IPSTAT 29 { .n_name = "_ipstat" }, #define N_ICMPSTAT 30 { .n_name = "_icmpstat" }, #define N_IGMPSTAT 31 { .n_name = "_igmpstat" }, #define N_PIMSTAT 32 { .n_name = "_pimstat" }, #define N_TCBINFO 33 { .n_name = "_tcbinfo" }, #define N_UDBINFO 34 { .n_name = "_udbinfo" }, #define N_DIVCBINFO 35 { .n_name = "_divcbinfo" }, #define N_RIPCBINFO 36 { .n_name = "_ripcbinfo" }, #define N_UNP_COUNT 37 { .n_name = "_unp_count" }, #define N_UNP_GENCNT 38 { .n_name = "_unp_gencnt" }, #define N_UNP_DHEAD 39 { .n_name = "_unp_dhead" }, #define N_UNP_SHEAD 40 { .n_name = "_unp_shead" }, #define N_RIP6STAT 41 { .n_name = "_rip6stat" }, #define N_SCTPSTAT 42 { .n_name = "_sctpstat" }, #define N_MFCTABLESIZE 43 { .n_name = "_mfctablesize" }, #define N_ARPSTAT 44 { .n_name = "_arpstat" }, #define N_UNP_SPHEAD 45 { .n_name = "unp_sphead" }, #define N_SFSTAT 46 { .n_name = "_sfstat"}, { .n_name = NULL }, }; struct protox { int pr_index; /* index into nlist of cb head */ int pr_sindex; /* index into nlist of stat block */ u_char pr_wanted; /* 1 if wanted, 0 otherwise */ void (*pr_cblocks)(u_long, const char *, int, int); /* control blocks printing routine */ void (*pr_stats)(u_long, const char *, int, int); /* statistics printing routine */ void (*pr_istats)(char *); /* per/if statistics printing routine */ const char *pr_name; /* well-known name */ int pr_usesysctl; /* non-zero if we use sysctl, not kvm */ int pr_protocol; } protox[] = { { N_TCBINFO, N_TCPSTAT, 1, protopr, tcp_stats, NULL, "tcp", 1, IPPROTO_TCP }, { N_UDBINFO, N_UDPSTAT, 1, protopr, udp_stats, NULL, "udp", 1, IPPROTO_UDP }, #ifdef SCTP { -1, N_SCTPSTAT, 1, sctp_protopr, sctp_stats, NULL, "sctp", 1, IPPROTO_SCTP }, #endif #ifdef SDP { -1, -1, 1, protopr, NULL, NULL, "sdp", 1, IPPROTO_TCP }, #endif { N_DIVCBINFO, -1, 1, protopr, NULL, NULL, "divert", 1, IPPROTO_DIVERT }, { N_RIPCBINFO, N_IPSTAT, 1, protopr, ip_stats, NULL, "ip", 1, IPPROTO_RAW }, { N_RIPCBINFO, N_ICMPSTAT, 1, protopr, icmp_stats, NULL, "icmp", 1, IPPROTO_ICMP }, { N_RIPCBINFO, N_IGMPSTAT, 1, protopr, igmp_stats, NULL, "igmp", 1, IPPROTO_IGMP }, #ifdef IPSEC { -1, N_IPSECSTAT, 1, NULL, /* keep as compat */ ipsec_stats, NULL, "ipsec", 0, 0}, { -1, N_AHSTAT, 1, NULL, ah_stats, NULL, "ah", 0, 0}, { -1, N_ESPSTAT, 1, NULL, esp_stats, NULL, "esp", 0, 0}, { -1, N_IPCOMPSTAT, 1, NULL, ipcomp_stats, NULL, "ipcomp", 0, 0}, #endif { N_RIPCBINFO, N_PIMSTAT, 1, protopr, pim_stats, NULL, "pim", 1, IPPROTO_PIM }, { -1, N_CARPSTAT, 1, NULL, carp_stats, NULL, "carp", 1, 0 }, +#ifdef PF { -1, N_PFSYNCSTAT, 1, NULL, pfsync_stats, NULL, "pfsync", 1, 0 }, +#endif { -1, N_ARPSTAT, 1, NULL, arp_stats, NULL, "arp", 1, 0 }, { -1, -1, 0, NULL, NULL, NULL, NULL, 0, 0 } }; #ifdef INET6 struct protox ip6protox[] = { { N_TCBINFO, N_TCPSTAT, 1, protopr, tcp_stats, NULL, "tcp", 1, IPPROTO_TCP }, { N_UDBINFO, N_UDPSTAT, 1, protopr, udp_stats, NULL, "udp", 1, IPPROTO_UDP }, { N_RIPCBINFO, N_IP6STAT, 1, protopr, ip6_stats, ip6_ifstats, "ip6", 1, IPPROTO_RAW }, { N_RIPCBINFO, N_ICMP6STAT, 1, protopr, icmp6_stats, icmp6_ifstats, "icmp6", 1, IPPROTO_ICMPV6 }, #ifdef SDP { -1, -1, 1, protopr, NULL, NULL, "sdp", 1, IPPROTO_TCP }, #endif #ifdef IPSEC { -1, N_IPSEC6STAT, 1, NULL, ipsec_stats, NULL, "ipsec6", 0, 0 }, #endif #ifdef notyet { -1, N_PIM6STAT, 1, NULL, pim6_stats, NULL, "pim6", 1, 0 }, #endif { -1, N_RIP6STAT, 1, NULL, rip6_stats, NULL, "rip6", 1, 0 }, { -1, -1, 0, NULL, NULL, NULL, NULL, 0, 0 } }; #endif /*INET6*/ #ifdef IPSEC struct protox pfkeyprotox[] = { { -1, N_PFKEYSTAT, 1, NULL, pfkey_stats, NULL, "pfkey", 0, 0 }, { -1, -1, 0, NULL, NULL, NULL, NULL, 0, 0 } }; #endif struct protox atalkprotox[] = { { N_DDPCB, N_DDPSTAT, 1, atalkprotopr, ddp_stats, NULL, "ddp", 0, 0 }, { -1, -1, 0, NULL, NULL, NULL, NULL, 0, 0 } }; #ifdef NETGRAPH struct protox netgraphprotox[] = { { N_NGSOCKS, -1, 1, netgraphprotopr, NULL, NULL, "ctrl", 0, 0 }, { N_NGSOCKS, -1, 1, netgraphprotopr, NULL, NULL, "data", 0, 0 }, { -1, -1, 0, NULL, NULL, NULL, NULL, 0, 0 } }; #endif #ifdef IPX struct protox ipxprotox[] = { { N_IPX, N_IPXSTAT, 1, ipxprotopr, ipx_stats, NULL, "ipx", 0, 0 }, { N_IPX, N_SPXSTAT, 1, ipxprotopr, spx_stats, NULL, "spx", 0, 0 }, { -1, -1, 0, NULL, NULL, NULL, 0, 0, 0 } }; #endif struct protox *protoprotox[] = { protox, #ifdef INET6 ip6protox, #endif #ifdef IPSEC pfkeyprotox, #endif #ifdef IPX ipxprotox, #endif atalkprotox, NULL }; static void printproto(struct protox *, const char *); static void usage(void); static struct protox *name2protox(const char *); static struct protox *knownname(const char *); static kvm_t *kvmd; static char *nlistf = NULL, *memf = NULL; int Aflag; /* show addresses of protocol control block */ int aflag; /* show all sockets (including servers) */ int Bflag; /* show information about bpf consumers */ int bflag; /* show i/f total bytes in/out */ int dflag; /* show i/f dropped packets */ int gflag; /* show group (multicast) routing or stats */ int hflag; /* show counters in human readable format */ int iflag; /* show interfaces */ int Lflag; /* show size of listen queues */ int mflag; /* show memory stats */ int noutputs = 0; /* how much outputs before we exit */ int numeric_addr; /* show addresses numerically */ int numeric_port; /* show ports numerically */ static int pflag; /* show given protocol */ int Qflag; /* show netisr information */ int rflag; /* show routing tables (or routing stats) */ int sflag; /* show protocol statistics */ int Wflag; /* wide display */ int Tflag; /* TCP Information */ int xflag; /* extra information, includes all socket buffer info */ int zflag; /* zero stats */ int interval; /* repeat interval for i/f stats */ char *interface; /* desired i/f for stats, or NULL for all i/fs */ int unit; /* unit number for above */ int af; /* address family */ int live; /* true if we are examining a live system */ int main(int argc, char *argv[]) { struct protox *tp = NULL; /* for printing cblocks & stats */ int ch; int fib = -1; char *endptr; af = AF_UNSPEC; while ((ch = getopt(argc, argv, "AaBbdF:f:ghI:iLlM:mN:np:Qq:rSTsuWw:xz")) != -1) switch(ch) { case 'A': Aflag = 1; break; case 'a': aflag = 1; break; case 'B': Bflag = 1; break; case 'b': bflag = 1; break; case 'd': dflag = 1; break; case 'F': fib = strtol(optarg, &endptr, 0); if (*endptr != '\0' || (fib == 0 && (errno == EINVAL || errno == ERANGE))) errx(1, "%s: invalid fib", optarg); break; case 'f': if (strcmp(optarg, "ipx") == 0) af = AF_IPX; else if (strcmp(optarg, "inet") == 0) af = AF_INET; #ifdef INET6 else if (strcmp(optarg, "inet6") == 0) af = AF_INET6; #endif #ifdef IPSEC else if (strcmp(optarg, "pfkey") == 0) af = PF_KEY; #endif else if (strcmp(optarg, "unix") == 0) af = AF_UNIX; else if (strcmp(optarg, "atalk") == 0) af = AF_APPLETALK; #ifdef NETGRAPH else if (strcmp(optarg, "ng") == 0 || strcmp(optarg, "netgraph") == 0) af = AF_NETGRAPH; #endif else if (strcmp(optarg, "link") == 0) af = AF_LINK; else { errx(1, "%s: unknown address family", optarg); } break; case 'g': gflag = 1; break; case 'h': hflag = 1; break; case 'I': { char *cp; iflag = 1; for (cp = interface = optarg; isalpha(*cp); cp++) continue; unit = atoi(cp); break; } case 'i': iflag = 1; break; case 'L': Lflag = 1; break; case 'M': memf = optarg; break; case 'm': mflag = 1; break; case 'N': nlistf = optarg; break; case 'n': numeric_addr = numeric_port = 1; break; case 'p': if ((tp = name2protox(optarg)) == NULL) { errx(1, "%s: unknown or uninstrumented protocol", optarg); } pflag = 1; break; case 'Q': Qflag = 1; break; case 'q': noutputs = atoi(optarg); if (noutputs != 0) noutputs++; break; case 'r': rflag = 1; break; case 's': ++sflag; break; case 'S': numeric_addr = 1; break; case 'u': af = AF_UNIX; break; case 'W': case 'l': Wflag = 1; break; case 'w': interval = atoi(optarg); iflag = 1; break; case 'T': Tflag = 1; break; case 'x': xflag = 1; break; case 'z': zflag = 1; break; case '?': default: usage(); } argv += optind; argc -= optind; #define BACKWARD_COMPATIBILITY #ifdef BACKWARD_COMPATIBILITY if (*argv) { if (isdigit(**argv)) { interval = atoi(*argv); if (interval <= 0) usage(); ++argv; iflag = 1; } if (*argv) { nlistf = *argv; if (*++argv) memf = *argv; } } #endif /* * Discard setgid privileges if not the running kernel so that bad * guys can't print interesting stuff from kernel memory. */ live = (nlistf == NULL && memf == NULL); if (!live) setgid(getgid()); if (xflag && Tflag) errx(1, "-x and -T are incompatible, pick one."); if (Bflag) { if (!live) usage(); bpf_stats(interface); exit(0); } if (mflag) { if (!live) { if (kread(0, NULL, 0) == 0) mbpr(kvmd, nl[N_SFSTAT].n_value); } else mbpr(NULL, 0); exit(0); } if (Qflag) { if (!live) { if (kread(0, NULL, 0) == 0) netisr_stats(kvmd); } else netisr_stats(NULL); exit(0); } #if 0 /* * Keep file descriptors open to avoid overhead * of open/close on each call to get* routines. */ sethostent(1); setnetent(1); #else /* * This does not make sense any more with DNS being default over * the files. Doing a setXXXXent(1) causes a tcp connection to be * used for the queries, which is slower. */ #endif kread(0, NULL, 0); if (iflag && !sflag) { intpr(interval, NULL); exit(0); } if (rflag) { if (sflag) { rt_stats(nl[N_RTSTAT].n_value, nl[N_RTTRASH].n_value); flowtable_stats(); } else routepr(nl[N_RTREE].n_value, fib); exit(0); } if (gflag) { if (sflag) { if (af == AF_INET || af == AF_UNSPEC) mrt_stats(nl[N_MRTSTAT].n_value); #ifdef INET6 if (af == AF_INET6 || af == AF_UNSPEC) mrt6_stats(nl[N_MRT6STAT].n_value); #endif } else { if (af == AF_INET || af == AF_UNSPEC) mroutepr(nl[N_MFCHASHTBL].n_value, nl[N_MFCTABLESIZE].n_value, nl[N_VIFTABLE].n_value); #ifdef INET6 if (af == AF_INET6 || af == AF_UNSPEC) mroute6pr(nl[N_MF6CTABLE].n_value, nl[N_MIF6TABLE].n_value); #endif } exit(0); } if (tp) { printproto(tp, tp->pr_name); exit(0); } if (af == AF_INET || af == AF_UNSPEC) for (tp = protox; tp->pr_name; tp++) printproto(tp, tp->pr_name); #ifdef INET6 if (af == AF_INET6 || af == AF_UNSPEC) for (tp = ip6protox; tp->pr_name; tp++) printproto(tp, tp->pr_name); #endif /*INET6*/ #ifdef IPSEC if (af == PF_KEY || af == AF_UNSPEC) for (tp = pfkeyprotox; tp->pr_name; tp++) printproto(tp, tp->pr_name); #endif /*IPSEC*/ #ifdef IPX if (af == AF_IPX || af == AF_UNSPEC) { for (tp = ipxprotox; tp->pr_name; tp++) printproto(tp, tp->pr_name); } #endif /* IPX */ if (af == AF_APPLETALK || af == AF_UNSPEC) for (tp = atalkprotox; tp->pr_name; tp++) printproto(tp, tp->pr_name); #ifdef NETGRAPH if (af == AF_NETGRAPH || af == AF_UNSPEC) for (tp = netgraphprotox; tp->pr_name; tp++) printproto(tp, tp->pr_name); #endif /* NETGRAPH */ if ((af == AF_UNIX || af == AF_UNSPEC) && !sflag) unixpr(nl[N_UNP_COUNT].n_value, nl[N_UNP_GENCNT].n_value, nl[N_UNP_DHEAD].n_value, nl[N_UNP_SHEAD].n_value, nl[N_UNP_SPHEAD].n_value); exit(0); } /* * Print out protocol statistics or control blocks (per sflag). * If the interface was not specifically requested, and the symbol * is not in the namelist, ignore this one. */ static void printproto(struct protox *tp, const char *name) { void (*pr)(u_long, const char *, int, int); u_long off; if (sflag) { if (iflag) { if (tp->pr_istats) intpr(interval, tp->pr_istats); else if (pflag) printf("%s: no per-interface stats routine\n", tp->pr_name); return; } else { pr = tp->pr_stats; if (!pr) { if (pflag) printf("%s: no stats routine\n", tp->pr_name); return; } if (tp->pr_usesysctl && live) off = 0; else if (tp->pr_sindex < 0) { if (pflag) printf( "%s: stats routine doesn't work on cores\n", tp->pr_name); return; } else off = nl[tp->pr_sindex].n_value; } } else { pr = tp->pr_cblocks; if (!pr) { if (pflag) printf("%s: no PCB routine\n", tp->pr_name); return; } if (tp->pr_usesysctl && live) off = 0; else if (tp->pr_index < 0) { if (pflag) printf( "%s: PCB routine doesn't work on cores\n", tp->pr_name); return; } else off = nl[tp->pr_index].n_value; } if (pr != NULL && (off || (live && tp->pr_usesysctl) || af != AF_UNSPEC)) (*pr)(off, name, af, tp->pr_protocol); } static int kvmd_init(void) { char errbuf[_POSIX2_LINE_MAX]; if (kvmd != NULL) return (0); kvmd = kvm_openfiles(nlistf, memf, NULL, O_RDONLY, errbuf); setgid(getgid()); if (kvmd == NULL) { warnx("kvm not available: %s", errbuf); return (-1); } if (kvm_nlist(kvmd, nl) < 0) { if (nlistf) errx(1, "%s: kvm_nlist: %s", nlistf, kvm_geterr(kvmd)); else errx(1, "kvm_nlist: %s", kvm_geterr(kvmd)); } if (nl[0].n_type == 0) { if (nlistf) errx(1, "%s: no namelist", nlistf); else errx(1, "no namelist"); } return (0); } /* * Read kernel memory, return 0 on success. */ int kread(u_long addr, void *buf, size_t size) { if (kvmd_init() < 0) return (-1); if (!buf) return (0); if (kvm_read(kvmd, addr, buf, size) != (ssize_t)size) { warnx("%s", kvm_geterr(kvmd)); return (-1); } return (0); } /* * Read an array of N counters in kernel memory into array of N uint64_t's. */ int kread_counters(u_long addr, void *buf, size_t size) { uint64_t *c = buf; if (kvmd_init() < 0) return (-1); if (kread(addr, buf, size) < 0) return (-1); while (size != 0) { *c = kvm_counter_u64_fetch(kvmd, *c); size -= sizeof(*c); c++; } return (0); } const char * plural(uintmax_t n) { return (n != 1 ? "s" : ""); } const char * plurales(uintmax_t n) { return (n != 1 ? "es" : ""); } const char * pluralies(uintmax_t n) { return (n != 1 ? "ies" : "y"); } /* * Find the protox for the given "well-known" name. */ static struct protox * knownname(const char *name) { struct protox **tpp, *tp; for (tpp = protoprotox; *tpp; tpp++) for (tp = *tpp; tp->pr_name; tp++) if (strcmp(tp->pr_name, name) == 0) return (tp); return (NULL); } /* * Find the protox corresponding to name. */ static struct protox * name2protox(const char *name) { struct protox *tp; char **alias; /* alias from p->aliases */ struct protoent *p; /* * Try to find the name in the list of "well-known" names. If that * fails, check if name is an alias for an Internet protocol. */ if ((tp = knownname(name)) != NULL) return (tp); setprotoent(1); /* make protocol lookup cheaper */ while ((p = getprotoent()) != NULL) { /* assert: name not same as p->name */ for (alias = p->p_aliases; *alias; alias++) if (strcmp(name, *alias) == 0) { endprotoent(); return (knownname(p->p_name)); } } endprotoent(); return (NULL); } static void usage(void) { (void)fprintf(stderr, "%s\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n", "usage: netstat [-AaLnSTWx] [-f protocol_family | -p protocol]\n" " [-M core] [-N system]", " netstat -i | -I interface [-abdhnW] [-f address_family]\n" " [-M core] [-N system]", " netstat -w wait [-I interface] [-d] [-M core] [-N system] [-q howmany]", " netstat -s [-s] [-z] [-f protocol_family | -p protocol]\n" " [-M core] [-N system]", " netstat -i | -I interface -s [-f protocol_family | -p protocol]\n" " [-M core] [-N system]", " netstat -m [-M core] [-N system]", " netstat -B [-I interface]", " netstat -r [-AanW] [-f address_family] [-M core] [-N system]", " netstat -rs [-s] [-M core] [-N system]", " netstat -g [-W] [-f address_family] [-M core] [-N system]", " netstat -gs [-s] [-f address_family] [-M core] [-N system]", " netstat -Q"); exit(1); } Index: stable/10/usr.sbin/bsnmpd/modules/Makefile =================================================================== --- stable/10/usr.sbin/bsnmpd/modules/Makefile (revision 263085) +++ stable/10/usr.sbin/bsnmpd/modules/Makefile (revision 263086) @@ -1,29 +1,32 @@ # $FreeBSD$ .include .PATH: ${.CURDIR}/../../../contrib/bsnmp/snmpd .if ${MK_ATM} != "no" _snmp_atm= snmp_atm .endif SUBDIR= ${_snmp_atm} \ snmp_bridge \ snmp_hast \ snmp_hostres \ snmp_mibII \ - snmp_pf \ snmp_target \ snmp_usm \ snmp_vacm \ snmp_wlan + +.if ${MK_PF} != "no" +SUBDIR+=snmp_pf +.endif .if ${MK_NETGRAPH_SUPPORT} != "no" SUBDIR+=snmp_netgraph .endif INCS= snmpmod.h INCSDIR= ${INCLUDEDIR}/bsnmp .include Index: stable/10/usr.sbin/tcpdump/tcpdump/Makefile =================================================================== --- stable/10/usr.sbin/tcpdump/tcpdump/Makefile (revision 263085) +++ stable/10/usr.sbin/tcpdump/tcpdump/Makefile (revision 263086) @@ -1,183 +1,183 @@ # $FreeBSD$ .include TCPDUMP_DISTDIR?= ${.CURDIR}/../../../contrib/tcpdump .PATH: ${TCPDUMP_DISTDIR} PROG= tcpdump SRCS= addrtoname.c \ af.c \ checksum.c \ cpack.c \ gmpls.c \ gmt2local.c \ in_cksum.c \ ipproto.c \ l2vpn.c \ machdep.c \ nlpid.c \ oui.c \ parsenfsfh.c \ print-802_11.c \ print-802_15_4.c \ print-ah.c \ print-aodv.c \ print-ap1394.c \ print-arcnet.c \ print-arp.c \ print-ascii.c \ print-atalk.c \ print-atm.c \ print-beep.c \ print-bfd.c \ print-bgp.c \ print-bootp.c \ print-bt.c \ print-carp.c \ print-cdp.c \ print-cfm.c \ print-chdlc.c \ print-cip.c \ print-cnfp.c \ print-dccp.c \ print-decnet.c \ print-domain.c \ print-dtp.c \ print-dvmrp.c \ print-eap.c \ print-egp.c \ print-eigrp.c \ print-enc.c \ print-esp.c \ print-ether.c \ print-fddi.c \ print-forces.c \ print-fr.c \ print-gre.c \ print-hsrp.c \ print-icmp.c \ print-igmp.c \ print-igrp.c \ print-ip.c \ print-ipcomp.c \ print-ipfc.c \ print-ipnet.c \ print-ipx.c \ print-isakmp.c \ print-isoclns.c \ print-juniper.c \ print-krb.c \ print-l2tp.c \ print-lane.c \ print-ldp.c \ print-llc.c \ print-lldp.c \ print-lmp.c \ print-lspping.c \ print-lwapp.c \ print-lwres.c \ print-mobile.c \ print-mpcp.c \ print-mpls.c \ print-msdp.c \ print-msnlb.c \ print-nfs.c \ print-ntp.c \ print-null.c \ print-olsr.c \ print-ospf.c \ print-otv.c \ - print-pfsync.c \ print-pgm.c \ print-pim.c \ print-ppi.c \ print-ppp.c \ print-pppoe.c \ print-pptp.c \ print-radius.c \ print-raw.c \ print-rip.c \ print-rpki-rtr.c \ print-rrcp.c \ print-rsvp.c \ print-rx.c \ print-sctp.c \ print-sflow.c \ print-sip.c \ print-sl.c \ print-sll.c \ print-slow.c \ print-smb.c \ print-snmp.c \ print-stp.c \ print-sunatm.c \ print-sunrpc.c \ print-symantec.c \ print-syslog.c \ print-tcp.c \ print-telnet.c \ print-tftp.c \ print-timed.c \ print-tipc.c \ print-token.c \ print-udld.c \ print-udp.c \ print-vjc.c \ print-vqp.c \ print-vrrp.c \ print-vtp.c \ print-vxlan.c \ print-wb.c \ print-zephyr.c \ print-zeromq.c \ setsignal.c \ signature.c \ smbutil.c \ tcpdump.c \ util.c \ version.c CLEANFILES+= version.c CFLAGS+= -I${.CURDIR} -I${TCPDUMP_DISTDIR} CFLAGS+= -DHAVE_CONFIG_H CFLAGS+= -D_U_="__attribute__((unused))" .if ${MK_INET6_SUPPORT} != "no" SRCS+= print-babel.c \ print-dhcp6.c \ print-frag6.c \ print-icmp6.c \ print-ip6.c \ print-ip6opts.c \ print-mobility.c \ print-ospf6.c \ print-ripng.c \ print-rt6.c CFLAGS+= -DINET6 .endif .if ${MACHINE_CPUARCH} != "i386" CFLAGS+= -DLBL_ALIGN .endif DPADD= ${LIBL} ${LIBPCAP} LDADD= -ll -lpcap .if ${MK_OPENSSL} != "no" && !defined(RELEASE_CRUNCH) DPADD+= ${LIBCRYPTO} LDADD+= -lcrypto CFLAGS+= -I${DESTDIR}/usr/include/openssl CFLAGS+= -DHAVE_LIBCRYPTO -DHAVE_OPENSSL_EVP_H .endif .if ${MK_PF} != "no" -SRCS+= print-pflog.c +SRCS+= print-pflog.c \ + print-pfsync.c CFLAGS+= -DHAVE_NET_PFVAR_H .endif version.c: ${TCPDUMP_DISTDIR}/VERSION rm -f version.c ; \ sed 's/.*/char version[] = "&";/' ${TCPDUMP_DISTDIR}/VERSION \ > version.c .include Index: stable/10 =================================================================== --- stable/10 (revision 263085) +++ stable/10 (revision 263086) Property changes on: stable/10 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r257186,257215,257349,259736,261797