diff --git a/sbin/ipfw/ipfw.8 b/sbin/ipfw/ipfw.8 index bc78ae1c655b..348e9a58f2ce 100644 --- a/sbin/ipfw/ipfw.8 +++ b/sbin/ipfw/ipfw.8 @@ -1,5011 +1,5021 @@ .\" -.Dd December 6, 2024 +.Dd March 3, 2025 .Dt IPFW 8 .Os .Sh NAME .Nm ipfw , dnctl .Nd User interface for firewall, traffic shaper, packet scheduler, in-kernel NAT. .Sh SYNOPSIS .Ss FIREWALL CONFIGURATION .Nm .Op Fl cq .Cm add .Ar rule .Nm .Op Fl acdefnNStT .Op Cm set Ar N .Brq Cm list | show .Op Ar rule | first-last ... .Nm .Op Fl f | q .Op Cm set Ar N .Cm flush .Nm .Op Fl q .Op Cm set Ar N .Brq Cm delete | zero | resetlog .Op Ar number ... .Pp .Nm .Cm set Oo Cm disable Ar number ... Oc Op Cm enable Ar number ... .Nm .Cm set move .Op Cm rule .Ar number Cm to Ar number .Nm .Cm set swap Ar number number .Nm .Cm set show .Ss SYSCTL SHORTCUTS .Nm .Cm enable -.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive +.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive | skipto_cache .Nm .Cm disable -.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive +.Brq Cm firewall | altq | one_pass | debug | verbose | dyn_keepalive | skipto_cache .Ss LOOKUP TABLES .Nm .Oo Cm set Ar N Oc Cm table Ar name Cm create Ar create-options .Nm .Oo Cm set Ar N Oc Cm table .Brq Ar name | all .Cm destroy .Nm .Oo Cm set Ar N Oc Cm table Ar name Cm modify Ar modify-options .Nm .Oo Cm set Ar N Oc Cm table Ar name Cm swap Ar name .Nm .Oo Cm set Ar N Oc Cm table Ar name Cm add Ar table-key Op Ar value .Nm .Oo Cm set Ar N Oc Cm table Ar name Cm add Op Ar table-key Ar value ... .Nm .Oo Cm set Ar N Oc Cm table Ar name Cm atomic add Op Ar table-key Ar value ... .Nm .Oo Cm set Ar N Oc Cm table Ar name Cm delete Op Ar table-key ... 
.Nm .Oo Cm set Ar N Oc Cm table Ar name Cm lookup Ar addr .Nm .Oo Cm set Ar N Oc Cm table Ar name Cm lock .Nm .Oo Cm set Ar N Oc Cm table Ar name Cm unlock .Nm .Oo Cm set Ar N Oc Cm table .Brq Ar name | all .Cm list .Nm .Oo Cm set Ar N Oc Cm table .Brq Ar name | all .Cm info .Nm .Oo Cm set Ar N Oc Cm table .Brq Ar name | all .Cm detail .Nm .Oo Cm set Ar N Oc Cm table .Brq Ar name | all .Cm flush .Ss DUMMYNET CONFIGURATION (TRAFFIC SHAPER AND PACKET SCHEDULER) .Nm dnctl .Brq Cm pipe | queue | sched .Ar number .Cm config .Ar config-options .Nm dnctl .Op Fl s Op Ar field .Brq Cm pipe | queue | sched .Brq Cm delete | list | show .Op Ar number ... .Ss IN-KERNEL NAT .Nm .Op Fl q .Cm nat .Ar number .Cm config .Ar config-options .Nm .Cm nat .Ar number .Cm show .Brq Cm config | log .Ss STATEFUL IPv6/IPv4 NETWORK ADDRESS AND PROTOCOL TRANSLATION .Nm .Oo Cm set Ar N Oc Cm nat64lsn Ar name Cm create Ar create-options .Nm .Oo Cm set Ar N Oc Cm nat64lsn Ar name Cm config Ar config-options .Nm .Oo Cm set Ar N Oc Cm nat64lsn .Brq Ar name | all .Brq Cm list | show .Op Cm states .Nm .Oo Cm set Ar N Oc Cm nat64lsn .Brq Ar name | all .Cm destroy .Nm .Oo Cm set Ar N Oc Cm nat64lsn Ar name Cm stats Op Cm reset .Ss STATELESS IPv6/IPv4 NETWORK ADDRESS AND PROTOCOL TRANSLATION .Nm .Oo Cm set Ar N Oc Cm nat64stl Ar name Cm create Ar create-options .Nm .Oo Cm set Ar N Oc Cm nat64stl Ar name Cm config Ar config-options .Nm .Oo Cm set Ar N Oc Cm nat64stl .Brq Ar name | all .Brq Cm list | show .Nm .Oo Cm set Ar N Oc Cm nat64stl .Brq Ar name | all .Cm destroy .Nm .Oo Cm set Ar N Oc Cm nat64stl Ar name Cm stats Op Cm reset .Ss XLAT464 CLAT IPv6/IPv4 NETWORK ADDRESS AND PROTOCOL TRANSLATION .Nm .Oo Cm set Ar N Oc Cm nat64clat Ar name Cm create Ar create-options .Nm .Oo Cm set Ar N Oc Cm nat64clat Ar name Cm config Ar config-options .Nm .Oo Cm set Ar N Oc Cm nat64clat .Brq Ar name | all .Brq Cm list | show .Nm .Oo Cm set Ar N Oc Cm nat64clat .Brq Ar name | all .Cm destroy .Nm .Oo Cm set Ar N Oc Cm 
nat64clat Ar name Cm stats Op Cm reset .Ss IPv6-to-IPv6 NETWORK PREFIX TRANSLATION .Nm .Oo Cm set Ar N Oc Cm nptv6 Ar name Cm create Ar create-options .Nm .Oo Cm set Ar N Oc Cm nptv6 .Brq Ar name | all .Brq Cm list | show .Nm .Oo Cm set Ar N Oc Cm nptv6 .Brq Ar name | all .Cm destroy .Nm .Oo Cm set Ar N Oc Cm nptv6 Ar name Cm stats Op Cm reset .Ss INTERNAL DIAGNOSTICS .Nm .Cm internal iflist .Nm .Cm internal talist .Nm .Cm internal vlist .Ss LIST OF RULES AND PREPROCESSING .Nm .Op Fl cfnNqS .Oo .Fl p Ar preproc .Oo .Ar preproc-flags .Oc .Oc .Ar pathname .Sh DESCRIPTION The .Nm utility is the user interface for controlling the .Xr ipfw 4 firewall, the .Xr dummynet 4 traffic shaper/packet scheduler, and the in-kernel NAT services. .Pp A firewall configuration, or .Em ruleset , is made of a list of .Em rules numbered from 1 to 65535. Packets are passed to the firewall from a number of different places in the protocol stack (depending on the source and destination of the packet, it is possible for the firewall to be invoked multiple times on the same packet). The packet passed to the firewall is compared against each of the rules in the .Em ruleset , in rule-number order (multiple rules with the same number are permitted, in which case they are processed in order of insertion). When a match is found, the action corresponding to the matching rule is performed. .Pp Depending on the action and certain system settings, packets can be reinjected into the firewall at some rule after the matching one for further processing. .Pp A ruleset always includes a .Em default rule (numbered 65535) which cannot be modified or deleted, and matches all packets. The action associated with the .Em default rule can be either .Cm deny or .Cm allow depending on how the kernel is configured. 
.Pp If the ruleset includes one or more rules with the .Cm keep-state , .Cm record-state , .Cm limit or .Cm set-limit option, the firewall will have a .Em stateful behaviour, i.e., upon a match it will create .Em dynamic rules , i.e., rules that match packets with the same 5-tuple (protocol, source and destination addresses and ports) as the packet which caused their creation. Dynamic rules, which have a limited lifetime, are checked at the first occurrence of a .Cm check-state , .Cm keep-state or .Cm limit rule, and are typically used to open the firewall on-demand to legitimate traffic only. Please note that .Cm keep-state and .Cm limit imply implicit .Cm check-state for all packets (not only those matched by the rule) but .Cm record-state and .Cm set-limit have no implicit .Cm check-state . See the .Sx STATEFUL FIREWALL and .Sx EXAMPLES Sections below for more information on the stateful behaviour of .Nm . .Pp All rules (including dynamic ones) have a few associated counters: a packet count, a byte count, a log count and a timestamp indicating the time of the last match. Counters can be displayed or reset with .Nm commands. .Pp Each rule belongs to one of 32 different .Em sets , and there are .Nm commands to atomically manipulate sets, such as enable, disable, swap sets, move all rules in a set to another one, delete all rules in a set. These can be useful to install temporary configurations, or to test them. See Section .Sx SETS OF RULES for more information on .Em sets . .Pp Rules can be added with the .Cm add command; deleted individually or in groups with the .Cm delete command, and globally (except those in set 31) with the .Cm flush command; displayed, optionally with the content of the counters, using the .Cm show and .Cm list commands. Finally, counters can be reset with the .Cm zero and .Cm resetlog commands.
.Ss COMMAND OPTIONS The following general options are available when invoking .Nm : .Bl -tag -width indent .It Fl a Show counter values when listing rules. The .Cm show command implies this option. .It Fl b Only show the action and the comment, not the body of a rule. Implies .Fl c . .It Fl c When entering or showing rules, print them in compact form, i.e., omitting the "ip from any to any" string when this does not carry any additional information. .It Fl d When listing, show dynamic rules in addition to static ones. .It Fl D When listing, show only dynamic states. When deleting, delete only dynamic states. .It Fl f Run without prompting for confirmation for commands that can cause problems if misused, i.e., .Cm flush . If there is no tty associated with the process, this is implied. The .Cm delete command with this flag ignores possible errors, i.e., nonexistent rule number. And for batched commands execution continues with the next command. .It Fl i When listing a table (see the .Sx LOOKUP TABLES section below for more information on lookup tables), format values as IP addresses. By default, values are shown as integers. .It Fl n Only check syntax of the command strings, without actually passing them to the kernel. .It Fl N Try to resolve addresses and service names in output. .It Fl q Be quiet when executing the .Cm add , .Cm nat , .Cm zero , .Cm resetlog or .Cm flush commands; (implies .Fl f ) . This is useful when updating rulesets by executing multiple .Nm commands in a script (e.g., .Ql sh\ /etc/rc.firewall ) , or by processing a file with many .Nm rules across a remote login session. It also stops a table add or delete from failing if the entry already exists or is not present. .Pp The reason why this option may be important is that for some of these actions, .Nm may print a message; if the action results in blocking the traffic to the remote client, the remote login session will be closed and the rest of the ruleset will not be processed. 
Access to the console would then be required to recover. .It Fl S When listing rules, show the .Em set each rule belongs to. If this flag is not specified, disabled rules will not be listed. .It Fl s Op Ar field When listing pipes, sort according to one of the four counters (total or current packets or bytes). .It Fl t When listing, show last match timestamp converted with .Fn ctime . .It Fl T When listing, show last match timestamp as seconds from the epoch. This form can be more convenient for postprocessing by scripts. .El .Ss LIST OF RULES AND PREPROCESSING To ease configuration, rules can be put into a file which is processed using .Nm as shown in the last synopsis line. An absolute .Ar pathname must be used. The file will be read line by line and applied as arguments to the .Nm utility. .Pp Optionally, a preprocessor can be specified using .Fl p Ar preproc where .Ar pathname is to be piped through. Useful preprocessors include .Xr cpp 1 and .Xr m4 1 . If .Ar preproc does not start with a slash .Pq Ql / as its first character, the usual .Ev PATH name search is performed. Care should be taken with this in environments where not all file systems are mounted (yet) by the time .Nm is being run (e.g.\& when they are mounted over NFS). Once .Fl p has been specified, any additional arguments are passed on to the preprocessor for interpretation. This allows for flexible configuration files (like conditionalizing them on the local hostname) and the use of macros to centralize frequently required arguments like IP addresses. .Ss TRAFFIC SHAPER CONFIGURATION The .Nm dnctl .Cm pipe , queue and .Cm sched commands are used to configure the traffic shaper and packet scheduler. See the .Sx TRAFFIC SHAPER (DUMMYNET) CONFIGURATION Section below for details. .Pp If the world and the kernel get out of sync the .Nm ABI may break, preventing you from being able to add any rules. This can adversely affect the booting process. 
You can use .Nm .Cm disable .Cm firewall to temporarily disable the firewall to regain access to the network, allowing you to fix the problem. .Sh PACKET FLOW A packet is checked against the active ruleset in multiple places in the protocol stack, under control of several sysctl variables. These places and variables are shown below, and it is important to have this picture in mind in order to design a correct ruleset. .Bd -literal -offset indent ^ to upper layers V | | +----------->-----------+ ^ V [ip(6)_input] [ip(6)_output] net.inet(6).ip(6).fw.enable=1 | | ^ V [ether_demux] [ether_output_frame] net.link.ether.ipfw=1 | | +-->--[bdg_forward]-->--+ net.link.bridge.ipfw=1 ^ V | to devices | .Ed .Pp The number of times the same packet goes through the firewall can vary between 0 and 4 depending on packet source and destination, and system configuration. .Pp Note that as packets flow through the stack, headers can be stripped or added to it, and so they may or may not be available for inspection. E.g., incoming packets will include the MAC header when .Nm is invoked from .Cm ether_demux() , but the same packets will have the MAC header stripped off when .Nm is invoked from .Cm ip_input() or .Cm ip6_input() . .Pp Also note that each packet is always checked against the complete ruleset, irrespective of the place where the check occurs, or the source of the packet. If a rule contains some match patterns or actions which are not valid for the place of invocation (e.g.\& trying to match a MAC header within .Cm ip_input or .Cm ip6_input ), the match pattern will not match, but a .Cm not operator in front of such patterns .Em will cause the pattern to .Em always match on those packets. It is thus the responsibility of the programmer, if necessary, to write a suitable ruleset to differentiate among the possible places. 
.Cm skipto rules can be useful here, as an example: .Bd -literal -offset indent # packets from ether_demux or bdg_forward ipfw add 10 skipto 1000 all from any to any layer2 in # packets from ip_input ipfw add 10 skipto 2000 all from any to any not layer2 in # packets from ip_output ipfw add 10 skipto 3000 all from any to any not layer2 out # packets from ether_output_frame ipfw add 10 skipto 4000 all from any to any layer2 out .Ed .Pp (yes, at the moment there is no way to differentiate between ether_demux and bdg_forward). .Pp Also note that only actions .Cm allow , .Cm deny , .Cm netgraph , .Cm ngtee and related to .Cm dummynet are processed for .Cm layer2 frames and all other actions act as if they were .Cm allow for such frames. Full set of actions is supported for IP packets without .Cm layer2 headers only. For example, .Cm divert action does not divert .Cm layer2 frames. .Sh SYNTAX In general, each keyword or argument must be provided as a separate command line argument, with no leading or trailing spaces. Keywords are case-sensitive, whereas arguments may or may not be case-sensitive depending on their nature (e.g.\& uid's are, hostnames are not). .Pp Some arguments (e.g., port or address lists) are comma-separated lists of values. In this case, spaces after commas ',' are allowed to make the line more readable. You can also put the entire command (including flags) into a single argument. 
E.g., the following forms are equivalent: .Bd -literal -offset indent ipfw -q add deny src-ip 10.0.0.0/24,127.0.0.1/8 ipfw -q add deny src-ip 10.0.0.0/24, 127.0.0.1/8 ipfw "-q add deny src-ip 10.0.0.0/24, 127.0.0.1/8" .Ed .Sh RULE FORMAT The format of firewall rules is the following: .Bd -ragged -offset indent .Bk -words .Op Ar rule_number .Op Cm set Ar set_number .Op Cm prob Ar match_probability .Ar action .Op Cm log Op Cm logamount Ar number .Op Cm altq Ar queue .Oo .Bro Cm tag | untag .Brc Ar number .Oc .Ar body .Ek .Ed .Pp where the body of the rule specifies which information is used for filtering packets, among the following: .Pp .Bl -tag -width "Source and dest. addresses and ports" -offset XXX -compact .It Layer2 header fields When available .It IPv4 and IPv6 Protocol SCTP, TCP, UDP, ICMP, etc. .It Source and dest. addresses and ports .It Direction See Section .Sx PACKET FLOW .It Transmit and receive interface By name or address .It Misc. IP header fields Version, type of service, datagram length, identification, fragmentation flags, Time To Live .It IP options .It IPv6 Extension headers Fragmentation, Hop-by-Hop options, Routing Headers, Source routing rthdr0, Mobile IPv6 rthdr2, IPSec options. .It IPv6 Flow-ID .It Misc. TCP header fields TCP flags (SYN, FIN, ACK, RST, etc.), sequence number, acknowledgment number, window .It TCP options .It ICMP types for ICMP packets .It ICMP6 types for ICMP6 packets .It User/group ID When the packet can be associated with a local socket. .It Divert status Whether a packet came from a divert socket (e.g., .Xr natd 8 ) . .It Fib annotation state Whether a packet has been tagged for using a specific FIB (routing table) in future forwarding decisions. .El .Pp Note that some of the above information, e.g.\& source MAC or IP addresses and TCP/UDP ports, can be easily spoofed, so filtering on those fields alone might not guarantee the desired results. 
.Bl -tag -width indent .It Ar rule_number Each rule is associated with a .Ar rule_number in the range 1..65535, with the latter reserved for the .Em default rule. Rules are checked sequentially by rule number. Multiple rules can have the same number, in which case they are checked (and listed) according to the order in which they have been added. If a rule is entered without specifying a number, the kernel will assign one in such a way that the rule becomes the last one before the .Em default rule. Automatic rule numbers are assigned by incrementing the last non-default rule number by the value of the sysctl variable .Ar net.inet.ip.fw.autoinc_step which defaults to 100. If this is not possible (e.g.\& because we would go beyond the maximum allowed rule number), the number of the last non-default value is used instead. .It Cm set Ar set_number Each rule is associated with a .Ar set_number in the range 0..31. Sets can be individually disabled and enabled, so this parameter is of fundamental importance for atomic ruleset manipulation. It can be also used to simplify deletion of groups of rules. If a rule is entered without specifying a set number, set 0 will be used. .br Set 31 is special in that it cannot be disabled, and rules in set 31 are not deleted by the .Nm ipfw flush command (but you can delete them with the .Nm ipfw delete set 31 command). Set 31 is also used for the .Em default rule. .It Cm prob Ar match_probability A match is only declared with the specified probability (floating point number between 0 and 1). This can be useful for a number of applications such as random packet drop or (in conjunction with .Nm dummynet ) to simulate the effect of multiple paths leading to out-of-order packet delivery. .Pp Note: this condition is checked before any other condition, including ones such as .Cm keep-state or .Cm check-state which might have side effects. 
.It Cm log Op Cm logamount Ar number Packets matching a rule with the .Cm log keyword will be made available for logging in two ways: if the sysctl variable .Va net.inet.ip.fw.verbose is set to 0 (default), one can use .Xr bpf 4 attached to the .Li ipfw0 pseudo interface. This pseudo interface can be created manually after a system boot by using the following command: .Bd -literal -offset indent # ifconfig ipfw0 create .Ed .Pp Or, automatically at boot time by adding the following line to the .Xr rc.conf 5 file: .Bd -literal -offset indent firewall_logif="YES" .Ed .Pp There is zero overhead when no .Xr bpf 4 is attached to the pseudo interface. .Pp If .Va net.inet.ip.fw.verbose is set to 1, packets will be logged to .Xr syslogd 8 with a .Dv LOG_SECURITY facility up to a maximum of .Cm logamount packets. If no .Cm logamount is specified, the limit is taken from the sysctl variable .Va net.inet.ip.fw.verbose_limit . In both cases, a value of 0 means unlimited logging. .Pp Once the limit is reached, logging can be re-enabled by clearing the logging counter or the packet counter for that entry, see the .Cm resetlog command. .Pp Note: logging is done after all other packet matching conditions have been successfully verified, and before performing the final action (accept, deny, etc.) on the packet. .It Cm tag Ar number When a packet matches a rule with the .Cm tag keyword, the numeric tag for the given .Ar number in the range 1..65534 will be attached to the packet. The tag acts as an internal marker (it is not sent out over the wire) that can be used to identify these packets later on. This can be used, for example, to provide trust between interfaces and to start doing policy-based filtering. A packet can have multiple tags at the same time. Tags are "sticky", meaning once a tag is applied to a packet by a matching rule it exists until explicit removal. 
Tags are kept with the packet everywhere within the kernel, but are lost when the packet leaves the kernel, for example, on transmitting packet out to the network or sending packet to a .Xr divert 4 socket. .Pp To check for previously applied tags, use the .Cm tagged rule option. To delete previously applied tag, use the .Cm untag keyword. .Pp Note: since tags are kept with the packet everywhere in kernelspace, they can be set and unset anywhere in the kernel network subsystem (using the .Xr mbuf_tags 9 facility), not only by means of the .Xr ipfw 4 .Cm tag and .Cm untag keywords. For example, there can be a specialized .Xr netgraph 4 node doing traffic analyzing and tagging for later inspecting in firewall. .It Cm untag Ar number When a packet matches a rule with the .Cm untag keyword, the tag with the number .Ar number is searched among the tags attached to this packet and, if found, removed from it. Other tags bound to packet, if present, are left untouched. .It Cm setmark Ar value | tablearg When a packet matches a rule with the .Cm setmark keyword, a 32-bit numeric mark is assigned to the packet. The mark is an extension to the tags. As tags, mark is "sticky" so the value is kept the same within the kernel and is lost when the packet leaves the kernel. Unlike tags, mark can be matched as a lookup table key or compared with bitwise mask applied against another value. Each packet can have only one mark, so .Cm setmark always overwrites the previous mark value. .Pp The initial mark value is 0. To check the current mark value, use the .Cm mark rule option. Mark .Ar value can be entered as decimal or hexadecimal (if prefixed by 0x), and they are always printed as hexadecimal. .It Cm altq Ar queue When a packet matches a rule with the .Cm altq keyword, the ALTQ identifier for the given .Ar queue (see .Xr altq 4 ) will be attached. Note that this ALTQ tag is only meaningful for packets going "out" of IPFW, and not being rejected or going to divert sockets. 
Note that if there is insufficient memory at the time the packet is processed, it will not be tagged, so it is wise to make your ALTQ "default" queue policy account for this. If multiple .Cm altq rules match a single packet, only the first one adds the ALTQ classification tag. In doing so, traffic may be shaped by using .Cm count Cm altq Ar queue rules for classification early in the ruleset, then later applying the filtering decision. For example, .Cm check-state and .Cm keep-state rules may come later and provide the actual filtering decisions in addition to the fallback ALTQ tag. .Pp You must run .Xr pfctl 8 to set up the queues before IPFW will be able to look them up by name, and if the ALTQ disciplines are rearranged, the rules containing the queue identifiers in the kernel will likely have gone stale and need to be reloaded. Stale queue identifiers will probably result in misclassification. .Pp All system ALTQ processing can be turned on or off via .Nm .Cm enable Ar altq and .Nm .Cm disable Ar altq . The usage of .Va net.inet.ip.fw.one_pass is irrelevant to ALTQ traffic shaping, as the actual rule action is followed always after adding an ALTQ tag. .El .Ss RULE ACTIONS A rule can be associated with one of the following actions, which will be executed when the packet matches the body of the rule. .Bl -tag -width indent .It Cm allow | accept | pass | permit Allow packets that match rule. The search terminates. .It Cm check-state Op Ar :flowname | Cm :any Checks the packet against the dynamic ruleset. If a match is found, execute the action associated with the rule which generated this dynamic rule, otherwise move to the next rule. .br .Cm Check-state rules do not have a body. If no .Cm check-state rule is found, the dynamic ruleset is checked at the first .Cm keep-state or .Cm limit rule. The .Ar :flowname is a symbolic name assigned to dynamic rule by .Cm keep-state opcode. The special flowname .Cm :any can be used to ignore states flowname when matching.
The .Cm :default keyword is special name used for compatibility with old rulesets. .It Cm count Update counters for all packets that match rule. The search continues with the next rule. .It Cm deny | drop Discard packets that match this rule. The search terminates. .It Cm divert Ar port Divert packets that match this rule to the .Xr divert 4 socket bound to port .Ar port . The search terminates. .It Cm fwd | forward Ar ipaddr | tablearg Ns Op , Ns Ar port Change the next-hop on matching packets to .Ar ipaddr , which can be an IP address or a host name. The next hop can also be supplied by the last table looked up for the packet by using the .Cm tablearg keyword instead of an explicit address. The search terminates if this rule matches. .Pp If .Ar ipaddr is a local address, then matching packets will be forwarded to .Ar port (or the port number in the packet if one is not specified in the rule) on the local machine. .br If .Ar ipaddr is not a local address, then the port number (if specified) is ignored, and the packet will be forwarded to the remote address, using the route as found in the local routing table for that IP. .br A .Ar fwd rule will not match layer2 packets (those received on ether_input, ether_output, or bridged). .br The .Cm fwd action does not change the contents of the packet at all. In particular, the destination address remains unmodified, so packets forwarded to another system will usually be rejected by that system unless there is a matching rule on that system to capture them. For packets forwarded locally, the local address of the socket will be set to the original destination address of the packet. This makes the .Xr netstat 1 entry look rather weird but is intended for use with transparent proxy servers. .It Cm nat Ar nat_nr | global | tablearg Pass packet to a nat instance (for network address translation, address redirect, etc.): see the .Sx NETWORK ADDRESS TRANSLATION (NAT) Section for further information. 
.It Cm nat64lsn Ar name Pass packet to a stateful NAT64 instance (for IPv6/IPv4 network address and protocol translation): see the .Sx IPv6/IPv4 NETWORK ADDRESS AND PROTOCOL TRANSLATION Section for further information. .It Cm nat64stl Ar name Pass packet to a stateless NAT64 instance (for IPv6/IPv4 network address and protocol translation): see the .Sx IPv6/IPv4 NETWORK ADDRESS AND PROTOCOL TRANSLATION Section for further information. .It Cm nat64clat Ar name Pass packet to a CLAT NAT64 instance (for client-side IPv6/IPv4 network address and protocol translation): see the .Sx IPv6/IPv4 NETWORK ADDRESS AND PROTOCOL TRANSLATION Section for further information. .It Cm nptv6 Ar name Pass packet to a NPTv6 instance (for IPv6-to-IPv6 network prefix translation): see the .Sx IPv6-to-IPv6 NETWORK PREFIX TRANSLATION (NPTv6) Section for further information. .It Cm pipe Ar pipe_nr Pass packet to a .Nm dummynet .Dq pipe (for bandwidth limitation, delay, etc.). See the .Sx TRAFFIC SHAPER (DUMMYNET) CONFIGURATION Section for further information. The search terminates; however, on exit from the pipe and if the .Xr sysctl 8 variable .Va net.inet.ip.fw.one_pass is not set, the packet is passed again to the firewall code starting from the next rule. .It Cm queue Ar queue_nr Pass packet to a .Nm dummynet .Dq queue (for bandwidth limitation using WF2Q+). .It Cm reject (Deprecated). Synonym for .Cm unreach host . .It Cm reset Discard packets that match this rule, and if the packet is a TCP packet, try to send a TCP reset (RST) notice. The search terminates. .It Cm reset6 Discard packets that match this rule, and if the packet is a TCP packet, try to send a TCP reset (RST) notice. The search terminates. .It Cm skipto Ar number | tablearg Skip all subsequent rules numbered less than .Ar number . The search continues with the first rule numbered .Ar number or higher. It is possible to use the .Cm tablearg keyword with a skipto for a .Em computed skipto. 
Skipto may work either in O(log(N)) or in O(1) depending on amount of memory and/or sysctl variables. See the .Sx SYSCTL VARIABLES section for more details. .It Cm call Ar number | tablearg The current rule number is saved in the internal stack and ruleset processing continues with the first rule numbered .Ar number or higher. If later a rule with the .Cm return action is encountered, the processing returns to the first rule with number of this .Cm call rule plus one or higher (the same behaviour as with packets returning from .Xr divert 4 socket after a .Cm divert action). This could be used to make somewhat like an assembly language .Dq subroutine calls to rules with common checks for different interfaces, etc. .Pp Rule with any number could be called, not just forward jumps as with .Cm skipto . So, to prevent endless loops in case of mistakes, both .Cm call and .Cm return actions don't do any jumps and simply go to the next rule if memory cannot be allocated or stack overflowed/underflowed. .Pp Internally stack for rule numbers is implemented using .Xr mbuf_tags 9 facility and currently has size of 16 entries. As mbuf tags are lost when packet leaves the kernel, .Cm divert should not be used in subroutines to avoid endless loops and other undesired effects. .It Cm return Takes rule number saved to internal stack by the last .Cm call action and returns ruleset processing to the first rule with number greater than number of corresponding .Cm call rule. See description of the .Cm call action for more details. .Pp Note that .Cm return rules usually end a .Dq subroutine and thus are unconditional, but .Nm command-line utility currently requires every action except .Cm check-state to have body. While it is sometimes useful to return only on some packets, usually you want to print just .Dq return for readability. 
A workaround for this is to use new syntax and .Fl c switch: .Bd -literal -offset indent # Add a rule without actual body ipfw add 2999 return via any # List rules without "from any to any" part ipfw -c list .Ed .Pp This cosmetic annoyance may be fixed in future releases. .It Cm tee Ar port Send a copy of packets matching this rule to the .Xr divert 4 socket bound to port .Ar port . The search continues with the next rule. .It Cm unreach Ar code Op mtu Discard packets that match this rule, and try to send an ICMP unreachable notice with code .Ar code , where .Ar code is a number from 0 to 255, or one of these aliases: .Cm net , host , protocol , port , .Cm needfrag , srcfail , net-unknown , host-unknown , .Cm isolated , net-prohib , host-prohib , tosnet , .Cm toshost , filter-prohib , host-precedence or .Cm precedence-cutoff . The .Cm needfrag code may have an optional .Ar mtu parameter. If specified, the MTU value will be put into generated ICMP packet. The search terminates. .It Cm unreach6 Ar code Discard packets that match this rule, and try to send an ICMPv6 unreachable notice with code .Ar code , where .Ar code is a number from 0, 1, 3 or 4, or one of these aliases: .Cm no-route, admin-prohib, address or .Cm port . The search terminates. .It Cm netgraph Ar cookie Divert packet into netgraph with given .Ar cookie . The search terminates. If packet is later returned from netgraph it is either accepted or continues with the next rule, depending on .Va net.inet.ip.fw.one_pass sysctl variable. .It Cm ngtee Ar cookie A copy of packet is diverted into netgraph, original packet continues with the next rule. See .Xr ng_ipfw 4 for more information on .Cm netgraph and .Cm ngtee actions. .It Cm setfib Ar fibnum | tablearg The packet is tagged so as to use the FIB (routing table) .Ar fibnum in any subsequent forwarding decisions. In the current implementation, this is limited to the values 0 through 15, see .Xr setfib 2 . Processing continues at the next rule. 
It is possible to use the .Cm tablearg keyword with setfib. If the tablearg value is not within the compiled range of fibs, the packet's fib is set to 0. .It Cm setdscp Ar DSCP | number | tablearg Set specified DiffServ codepoint for an IPv4/IPv6 packet. Processing continues at the next rule. Supported values are: .Pp .Cm cs0 .Pq Dv 000000 , .Cm cs1 .Pq Dv 001000 , .Cm cs2 .Pq Dv 010000 , .Cm cs3 .Pq Dv 011000 , .Cm cs4 .Pq Dv 100000 , .Cm cs5 .Pq Dv 101000 , .Cm cs6 .Pq Dv 110000 , .Cm cs7 .Pq Dv 111000 , .Cm af11 .Pq Dv 001010 , .Cm af12 .Pq Dv 001100 , .Cm af13 .Pq Dv 001110 , .Cm af21 .Pq Dv 010010 , .Cm af22 .Pq Dv 010100 , .Cm af23 .Pq Dv 010110 , .Cm af31 .Pq Dv 011010 , .Cm af32 .Pq Dv 011100 , .Cm af33 .Pq Dv 011110 , .Cm af41 .Pq Dv 100010 , .Cm af42 .Pq Dv 100100 , .Cm af43 .Pq Dv 100110 , .Cm va .Pq Dv 101100 , .Cm ef .Pq Dv 101110 , .Cm be .Pq Dv 000000 . Additionally, DSCP value can be specified by number (0..63). It is also possible to use the .Cm tablearg keyword with setdscp. If the tablearg value is not within the 0..63 range, lower 6 bits of supplied value are used. .It Cm tcp-setmss Ar mss Set the Maximum Segment Size (MSS) in the TCP segment to value .Ar mss . The kernel module .Cm ipfw_pmod should be loaded or kernel should have .Cm options IPFIREWALL_PMOD to be able to use this action. This command does not change a packet if original MSS value is lower than specified value. Both TCP over IPv4 and over IPv6 are supported. Regardless of whether a packet matched the .Cm tcp-setmss rule or not, the search continues with the next rule. .It Cm reass Queue and reassemble IPv4 fragments. If the packet is not fragmented, counters are updated and processing continues with the next rule. If the packet is the last logical fragment, the packet is reassembled and, if .Va net.inet.ip.fw.one_pass is set to 0, processing continues with the next rule. Otherwise, the packet is allowed to pass and the search terminates. 
If the packet is a fragment in the middle of a logical group of fragments, it is consumed and processing stops immediately. .Pp Fragment handling can be tuned via .Va net.inet.ip.maxfragpackets and .Va net.inet.ip.maxfragsperpacket which limit, respectively, the maximum number of processable fragments (default: 800) and the maximum number of fragments per packet (default: 16). .Pp NOTA BENE: since fragments do not contain port numbers, they should be avoided with the .Nm reass rule. Alternatively, direction-based (like .Nm in / .Nm out ) and source-based (like .Nm via ) match patterns can be used to select fragments. .Pp Usually a simple rule like: .Bd -literal -offset indent # reassemble incoming fragments ipfw add reass all from any to any in .Ed .Pp is all you need at the beginning of your ruleset. .It Cm abort Discard packets that match this rule, and if the packet is an SCTP packet, try to send an SCTP packet containing an ABORT chunk. The search terminates. .It Cm abort6 Discard packets that match this rule, and if the packet is an SCTP packet, try to send an SCTP packet containing an ABORT chunk. The search terminates. .El .Ss RULE BODY The body of a rule contains zero or more patterns (such as specific source and destination addresses or ports, protocol options, incoming or outgoing interfaces, etc.) that the packet must match in order to be recognised. In general, the patterns are connected by (implicit) .Cm and operators -- i.e., all must match in order for the rule to match. Individual patterns can be prefixed by the .Cm not operator to reverse the result of the match, as in .Pp .Dl "ipfw add 100 allow ip from not 1.2.3.4 to any" .Pp Additionally, sets of alternative match patterns .Pq Em or-blocks can be constructed by putting the patterns in lists enclosed between parentheses ( ) or braces { }, and using the .Cm or operator as follows: .Pp .Dl "ipfw add 100 allow ip from { x or not y or z } to any" .Pp Only one level of parentheses is allowed. 
Beware that most shells have special meanings for parentheses or braces, so it is advisable to put a backslash \\ in front of them to prevent such interpretations. .Pp The body of a rule must in general include a source and destination address specifier. The keyword .Ar any can be used in various places to specify that the content of a required field is irrelevant. .Pp The rule body has the following format: .Bd -ragged -offset indent .Op Ar proto Cm from Ar src Cm to Ar dst .Op Ar options .Ed .Pp The first part (proto from src to dst) is for backward compatibility with earlier versions of .Fx . In modern .Fx any match pattern (including MAC headers, IP protocols, addresses and ports) can be specified in the .Ar options section. .Pp Rule fields have the following meaning: .Bl -tag -width indent .It Ar proto : protocol | Cm { Ar protocol Cm or ... } .It Ar protocol : Oo Cm not Oc Ar protocol-name | protocol-number An IP protocol specified by number or name (for a complete list see .Pa /etc/protocols ) , or one of the following keywords: .Bl -tag -width indent .It Cm ip4 | ipv4 Matches IPv4 packets. .It Cm ip6 | ipv6 Matches IPv6 packets. .It Cm ip | all Matches any packet. .El .Pp The .Cm ipv6 in .Cm proto option will be treated as inner protocol. And, the .Cm ipv4 is not available in .Cm proto option. .Pp The .Cm { Ar protocol Cm or ... } format (an .Em or-block ) is provided for convenience only but its use is deprecated. .It Ar src No and Ar dst : Bro Cm addr | Cm { Ar addr Cm or ... } Brc Op Oo Cm not Oc Ar ports An address (or a list, see below) optionally followed by .Ar ports specifiers. .Pp The second format .Em ( or-block with multiple addresses) is provided for convenience only and its use is discouraged. .It Ar addr : Oo Cm not Oc Bro .Cm any | me | me6 | .Cm table Ns Pq Ar name Ns Op , Ns Ar value .Ar | addr-list | addr-set .Brc .Bl -tag -width indent .It Cm any Matches any IP address. 
.It Cm me Matches any IP address configured on an interface in the system. .It Cm me6 Matches any IPv6 address configured on an interface in the system. The address list is evaluated at the time the packet is analysed. .It Cm table Ns Pq Ar name Ns Op , Ns Ar value Matches any IPv4 or IPv6 address for which an entry exists in the lookup table .Ar name . If an optional 32-bit unsigned .Ar value is also specified, an entry will match only if it has this value. If .Ar value is specified in form .Ar valtype=value , then the specified value type field will be checked. It can be .Ar skipto, pipe, fib, nat, dscp, tag, divert, netgraph, limit, nh4 and .Ar mark . See the .Sx LOOKUP TABLES section below for more information on lookup tables. .El .It Ar addr-list : ip-addr Ns Op Ns , Ns Ar addr-list .It Ar ip-addr : A host or subnet address specified in one of the following ways: .Bl -tag -width indent .It Ar numeric-ip | hostname Matches a single IPv4 address, specified as dotted-quad or a hostname. Hostnames are resolved at the time the rule is added to the firewall list. .It Ar addr Ns / Ns Ar masklen Matches all addresses with base .Ar addr (specified as an IP address, a network number, or a hostname) and mask width of .Cm masklen bits. As an example, 1.2.3.4/25 or 1.2.3.0/25 will match all IP numbers from 1.2.3.0 to 1.2.3.127 . .It Ar addr Ns : Ns Ar mask Matches all addresses with base .Ar addr (specified as an IP address, a network number, or a hostname) and the mask of .Ar mask , specified as a dotted quad. As an example, 1.2.3.4:255.0.255.0 or 1.0.3.0:255.0.255.0 will match 1.*.3.*. This form is advised only for non-contiguous masks. It is better to resort to the .Ar addr Ns / Ns Ar masklen format for contiguous masks, which is more compact and less error-prone. 
.El .It Ar addr-set : addr Ns Oo Ns / Ns Ar masklen Oc Ns Cm { Ns Ar list Ns Cm } .It Ar list : Bro Ar num | num-num Brc Ns Op Ns , Ns Ar list Matches all addresses with base address .Ar addr (specified as an IP address, a network number, or a hostname) and whose last byte is in the list between braces { } . Note that there must be no spaces between braces and numbers (spaces after commas are allowed). Elements of the list can be specified as single entries or ranges. The .Ar masklen field is used to limit the size of the set of addresses, and can have any value between 24 and 32. If not specified, it will be assumed as 24. .br This format is particularly useful to handle sparse address sets within a single rule. Because the matching occurs using a bitmask, it takes constant time and dramatically reduces the complexity of rulesets. .br As an example, an address specified as 1.2.3.4/24{128,35-55,89} or 1.2.3.0/24{128,35-55,89} will match the following IP addresses: .br 1.2.3.128, 1.2.3.35 to 1.2.3.55, 1.2.3.89 . .It Ar addr6-list : ip6-addr Ns Op Ns , Ns Ar addr6-list .It Ar ip6-addr : A host or subnet specified one of the following ways: .Bl -tag -width indent .It Ar numeric-ip | hostname Matches a single IPv6 address as allowed by .Xr inet_pton 3 or a hostname. Hostnames are resolved at the time the rule is added to the firewall list. .It Ar addr Ns / Ns Ar masklen Matches all IPv6 addresses with base .Ar addr (specified as allowed by .Xr inet_pton 3 or a hostname) and mask width of .Cm masklen bits. .It Ar addr Ns / Ns Ar mask Matches all IPv6 addresses with base .Ar addr (specified as allowed by .Xr inet_pton 3 or a hostname) and the mask of .Ar mask , specified as allowed by .Xr inet_pton 3 . As an example, fe::640:0:0/ffff::ffff:ffff:0:0 will match fe:*:*:*:0:640:*:*. This form is advised only for non-contiguous masks. It is better to resort to the .Ar addr Ns / Ns Ar masklen format for contiguous masks, which is more compact and less error-prone. 
.El .Pp No support for sets of IPv6 addresses is provided because IPv6 addresses are typically random past the initial prefix. .It Ar ports : Bro Ar port | port Ns \&- Ns Ar port Ns Brc Ns Op , Ns Ar ports For protocols which support port numbers (such as SCTP, TCP and UDP), optional .Cm ports may be specified as one or more ports or port ranges, separated by commas but no spaces, and an optional .Cm not operator. The .Ql \&- notation specifies a range of ports (including boundaries). .Pp Service names (from .Pa /etc/services ) may be used instead of numeric port values. The length of the port list is limited to 30 ports or ranges, though one can specify larger ranges by using an .Em or-block in the .Cm options section of the rule. .Pp A backslash .Pq Ql \e can be used to escape the dash .Pq Ql - character in a service name (from a shell, the backslash must be typed twice to avoid the shell itself interpreting it as an escape character). .Pp .Dl "ipfw add count tcp from any ftp\e\e-data-ftp to any" .Pp Fragmented packets which have a non-zero offset (i.e., not the first fragment) will never match a rule which has one or more port specifications. See the .Cm frag option for details on matching fragmented packets. .El .Ss RULE OPTIONS (MATCH PATTERNS) Additional match patterns can be used within rules. Zero or more of these so-called .Em options can be present in a rule, optionally prefixed by the .Cm not operand, and possibly grouped into .Em or-blocks . .Pp The following match patterns can be used (listed in alphabetical order): .Bl -tag -width indent .It Cm // this is a comment . Inserts the specified text as a comment in the rule. Everything following // is considered as a comment and stored in the rule. You can have comment-only rules, which are listed as having a .Cm count action followed by the comment. .It Cm bridged Alias for .Cm layer2 . .It Cm defer-immediate-action | defer-action A rule with this option will not perform normal action upon a match. 
This option is intended to be used with .Cm record-state or .Cm keep-state as the dynamic rule, created but ignored on match, will work as intended. Rules with both .Cm record-state and .Cm defer-immediate-action create a dynamic rule and continue with the next rule without actually performing the action part of this rule. When the rule is later activated via the state table, the action is performed as usual. .It Cm diverted Matches only packets generated by a divert socket. .It Cm diverted-loopback Matches only packets coming from a divert socket back into the IP stack input for delivery. .It Cm diverted-output Matches only packets going from a divert socket back outward to the IP stack output for delivery. .It Cm dst-ip Ar ip-address Matches IPv4 packets whose destination IP is one of the address(es) specified as argument. .It Bro Cm dst-ip6 | dst-ipv6 Brc Ar ip6-address Matches IPv6 packets whose destination IP is one of the address(es) specified as argument. .It Cm dst-port Ar ports Matches IP packets whose destination port is one of the port(s) specified as argument. .It Cm established Matches TCP packets that have the RST or ACK bits set. .It Cm ext6hdr Ar header Matches IPv6 packets containing the extended header given by .Ar header . Supported headers are: .Pp Fragment, .Pq Cm frag , Hop-to-hop options .Pq Cm hopopt , any type of Routing Header .Pq Cm route , Source routing Routing Header Type 0 .Pq Cm rthdr0 , Mobile IPv6 Routing Header Type 2 .Pq Cm rthdr2 , Destination options .Pq Cm dstopt , IPSec authentication headers .Pq Cm ah , and IPsec encapsulated security payload headers .Pq Cm esp . .It Cm fib Ar fibnum Matches a packet that has been tagged to use the given FIB (routing table) number. .It Cm flow Ar table Ns Pq Ar name Ns Op , Ns Ar value Search for the flow entry in lookup table .Ar name . If not found, the match fails. Otherwise, the match succeeds and .Cm tablearg is set to the value extracted from the table. 
.Pp This option can be useful to quickly dispatch traffic based on certain packet fields. See the .Sx LOOKUP TABLES section below for more information on lookup tables. .It Cm flow-id Ar labels Matches IPv6 packets containing any of the flow labels given in .Ar labels . .Ar labels is a comma separated list of numeric flow labels. .It Cm dst-mac Ar table Ns Pq Ar name Ns Op , Ns Ar value Search for the destination MAC address entry in lookup table .Ar name . If not found, the match fails. Otherwise, the match succeeds and .Cm tablearg is set to the value extracted from the table. .It Cm src-mac Ar table Ns Pq Ar name Ns Op , Ns Ar value Search for the source MAC address entry in lookup table .Ar name . If not found, the match fails. Otherwise, the match succeeds and .Cm tablearg is set to the value extracted from the table. .It Cm frag Ar spec Matches IPv4 packets whose .Cm ip_off field contains the comma separated list of IPv4 fragmentation options specified in .Ar spec . The recognized options are: .Cm df .Pq Dv don't fragment , .Cm mf .Pq Dv more fragments , .Cm rf .Pq Dv reserved fragment bit , .Cm offset .Pq Dv non-zero fragment offset . The absence of a particular option may be denoted with a .Ql \&! . .Pp Empty list of options defaults to matching on non-zero fragment offset. Such a rule would match all but the first fragments of datagrams, both IPv4 and IPv6. This is for backward compatibility with older rulesets. .It Cm gid Ar group Matches all TCP or UDP packets sent by or received for a .Ar group . A .Ar group may be specified by name or number. .It Cm jail Ar jail Matches all TCP or UDP packets sent by or received for the jail whose ID or name is .Ar jail . .It Cm icmptypes Ar types Matches ICMP packets whose ICMP type is in the list .Ar types . The list may be specified as any combination of individual types (numeric) separated by commas. .Em Ranges are not allowed . 
The supported ICMP types are: .Pp echo reply .Pq Cm 0 , destination unreachable .Pq Cm 3 , source quench .Pq Cm 4 , redirect .Pq Cm 5 , echo request .Pq Cm 8 , router advertisement .Pq Cm 9 , router solicitation .Pq Cm 10 , time-to-live exceeded .Pq Cm 11 , IP header bad .Pq Cm 12 , timestamp request .Pq Cm 13 , timestamp reply .Pq Cm 14 , information request .Pq Cm 15 , information reply .Pq Cm 16 , address mask request .Pq Cm 17 and address mask reply .Pq Cm 18 . .It Cm icmp6types Ar types Matches ICMP6 packets whose ICMP6 type is in the list of .Ar types . The list may be specified as any combination of individual types (numeric) separated by commas. .Em Ranges are not allowed . .It Cm in | out Matches incoming or outgoing packets, respectively. .Cm in and .Cm out are mutually exclusive (in fact, .Cm out is implemented as .Cm not in Ns No ). .It Cm ipid Ar id-list Matches IPv4 packets whose .Cm ip_id field has value included in .Ar id-list , which is either a single value or a list of values or ranges specified in the same way as .Ar ports . .It Cm iplen Ar len-list Matches IP packets whose total length, including header and data, is in the set .Ar len-list , which is either a single value or a list of values or ranges specified in the same way as .Ar ports . .It Cm ipoptions Ar spec Matches packets whose IPv4 header contains the comma separated list of options specified in .Ar spec . The supported IP options are: .Pp .Cm ssrr (strict source route), .Cm lsrr (loose source route), .Cm rr (record packet route) and .Cm ts (timestamp). The absence of a particular option may be denoted with a .Ql \&! . .It Cm ipprecedence Ar precedence Matches IPv4 packets whose precedence field is equal to .Ar precedence . .It Cm ipsec Matches packets that have IPSEC history associated with them (i.e., the packet comes encapsulated in IPSEC, the kernel has IPSEC support, and can correctly decapsulate it). 
.Pp Note that specifying .Cm ipsec is different from specifying .Cm proto Ar ipsec as the latter will only look at the specific IP protocol field, irrespective of IPSEC kernel support and the validity of the IPSEC data. .Pp Further note that this flag is silently ignored in kernels without IPSEC support. It does not affect rule processing when given and the rules are handled as if with no .Cm ipsec flag. .It Cm iptos Ar spec Matches IPv4 packets whose .Cm tos field contains the comma separated list of service types specified in .Ar spec . The supported IP types of service are: .Pp .Cm lowdelay .Pq Dv IPTOS_LOWDELAY , .Cm throughput .Pq Dv IPTOS_THROUGHPUT , .Cm reliability .Pq Dv IPTOS_RELIABILITY , .Cm mincost .Pq Dv IPTOS_MINCOST , .Cm congestion .Pq Dv IPTOS_ECN_CE . The absence of a particular type may be denoted with a .Ql \&! . .It Cm dscp spec Ns Op , Ns Ar spec Matches IPv4/IPv6 packets whose .Cm DS field value is contained in .Ar spec mask. Multiple values can be specified via the comma separated list. Value can be one of keywords used in .Cm setdscp action or exact number. .It Cm ipttl Ar ttl-list Matches IPv4 packets whose time to live is included in .Ar ttl-list , which is either a single value or a list of values or ranges specified in the same way as .Ar ports . .It Cm ipversion Ar ver Matches IP packets whose IP version field is .Ar ver . .It Cm keep-state Op Ar :flowname Upon a match, the firewall will create a dynamic rule, whose default behaviour is to match bidirectional traffic between source and destination IP/port using the same protocol. The rule has a limited lifetime (controlled by a set of .Xr sysctl 8 variables), and the lifetime is refreshed every time a matching packet is found. The .Ar :flowname is used to assign additional to addresses, ports and protocol parameter to dynamic rule. It can be used for more accurate matching by .Cm check-state rule. The .Cm :default keyword is special name used for compatibility with old rulesets. 
.It Cm layer2 Matches only layer2 packets, i.e., those passed to .Nm from .Fn ether_demux and .Fn ether_output_frame . .It Cm limit Bro Cm src-addr | src-port | dst-addr | dst-port Brc Ar N Op Ar :flowname The firewall will only allow .Ar N connections with the same set of parameters as specified in the rule. One or more of source and destination addresses and ports can be specified. .It Cm lookup Bro Cm dst-ip | dst-port | dst-mac | src-ip | src-port | src-mac | uid | .Cm jail | dscp | mark | rulenum Brc Ar name Search an entry in lookup table .Ar name that matches the field specified as argument. If not found, the match fails. Otherwise, the match succeeds and .Cm tablearg is set to the value extracted from the table. .Pp This option can be useful to quickly dispatch traffic based on certain packet fields. See the .Sx LOOKUP TABLES section below for more information on lookup tables. .It Cm { MAC | mac } Ar dst-mac src-mac Match packets with given .Ar dst-mac and .Ar src-mac addresses, specified as the .Cm any keyword (matching any MAC address), or six groups of hex digits separated by colons, and optionally followed by a mask indicating the significant bits. The mask may be specified using either of the following methods: .Bl -enum -width indent .It A slash .Pq / followed by the number of significant bits. For example, an address with 33 significant bits could be specified as: .Pp .Dl "MAC 10:20:30:40:50:60/33 any" .It An ampersand .Pq & followed by a bitmask specified as six groups of hex digits separated by colons. For example, an address in which the last 16 bits are significant could be specified as: .Pp .Dl "MAC 10:20:30:40:50:60&00:00:00:00:ff:ff any" .Pp Note that the ampersand character has a special meaning in many shells and should generally be escaped. .El Note that the order of MAC addresses (destination first, source second) is the same as on the wire, but the opposite of the one used for IP addresses. 
.It Cm mac-type Ar mac-type Matches packets whose Ethernet Type field corresponds to one of those specified as argument. .Ar mac-type is specified in the same way as .Cm port numbers (i.e., one or more comma-separated single values or ranges). You can use symbolic names for known values such as .Em vlan , ipv4, ipv6 . Values can be entered as decimal or hexadecimal (if prefixed by 0x), and they are always printed as hexadecimal (unless the .Cm -N option is used, in which case symbolic resolution will be attempted). .It Cm proto Ar protocol Matches packets with the corresponding IP protocol. .It Cm record-state Upon a match, the firewall will create a dynamic rule as if .Cm keep-state was specified. However, this option doesn't imply an implicit .Cm check-state in contrast to .Cm keep-state . .It Cm recv | xmit | via Brq Ar ifX | Ar ifmask | Ar table Ns Po Ar name Ns Oo , Ns Ar value Oc Pc | Ar ipno | Ar any Matches packets received, transmitted or going through, respectively, the interface specified by exact name .Po Ar ifX Pc , by device mask .Po Ar ifmask Pc , by IP address, or through some interface. .Pp Interface name may be matched against .Ar ifmask with .Xr fnmatch 3 according to the rules used by the shell (f.e. tun*). See also the .Sx EXAMPLES section. .Pp Table .Ar name may be used to match interface by its kernel ifindex. See the .Sx LOOKUP TABLES section below for more information on lookup tables. .Pp The .Cm via keyword causes the interface to always be checked. If .Cm recv or .Cm xmit is used instead of .Cm via , then only the receive or transmit interface (respectively) is checked. By specifying both, it is possible to match packets based on both receive and transmit interface, e.g.: .Pp .Dl "ipfw add deny ip from any to any out recv ed0 xmit ed1" .Pp The .Cm recv interface can be tested on either incoming or outgoing packets, while the .Cm xmit interface can only be tested on outgoing packets. 
So .Cm out is required (and .Cm in is invalid) whenever .Cm xmit is used. .Pp A packet might not have a receive or transmit interface: packets originating from the local host have no receive interface, while packets destined for the local host have no transmit interface. .It Cm set-limit Bro Cm src-addr | src-port | dst-addr | dst-port Brc Ar N Works like .Cm limit but does not have an implicit .Cm check-state attached to it. .It Cm setup Matches TCP packets that have the SYN bit set but no ACK bit. This is the short form of .Dq Li tcpflags\ syn,!ack . .It Cm sockarg Matches packets that are associated to a local socket and for which the SO_USER_COOKIE socket option has been set to a non-zero value. As a side effect, the value of the option is made available as .Cm tablearg value, which in turn can be used as .Cm skipto or .Cm pipe number. .It Cm src-ip Ar ip-address Matches IPv4 packets whose source IP is one of the address(es) specified as an argument. .It Cm src-ip6 Ar ip6-address Matches IPv6 packets whose source IP is one of the address(es) specified as an argument. .It Cm src-port Ar ports Matches IP packets whose source port is one of the port(s) specified as argument. .It Cm tagged Ar tag-list Matches packets whose tags are included in .Ar tag-list , which is either a single value or a list of values or ranges specified in the same way as .Ar ports . Tags can be applied to the packet using the .Cm tag rule action parameter (see its description for details on tags). .It Cm mark Ar value[:bitmask] | tablearg[:bitmask] Matches packets whose mark is equal to .Ar value with optional .Ar bitmask applied to it. .Cm tablearg can also be used instead of an explicit .Ar value to match a value supplied by the last table lookup. .Pp Both .Ar value and .Ar bitmask can be entered as decimal or hexadecimal (if prefixed by 0x), and they are always printed as hexadecimal. .It Cm tcpack Ar ack TCP packets only. 
Match if the TCP header acknowledgment number field is set to .Ar ack . .It Cm tcpdatalen Ar tcpdatalen-list Matches TCP packets whose length of TCP data is .Ar tcpdatalen-list , which is either a single value or a list of values or ranges specified in the same way as .Ar ports . .It Cm tcpflags Ar spec TCP packets only. Match if the TCP header contains the comma separated list of flags specified in .Ar spec . The supported TCP flags are: .Pp .Cm fin , .Cm syn , .Cm rst , .Cm psh , .Cm ack and .Cm urg . The absence of a particular flag may be denoted with a .Ql \&! . A rule which contains a .Cm tcpflags specification can never match a fragmented packet which has a non-zero offset. See the .Cm frag option for details on matching fragmented packets. .It Cm tcpmss Ar tcpmss-list Matches TCP packets whose MSS (maximum segment size) value is set to .Ar tcpmss-list , which is either a single value or a list of values or ranges specified in the same way as .Ar ports . .It Cm tcpseq Ar seq TCP packets only. Match if the TCP header sequence number field is set to .Ar seq . .It Cm tcpwin Ar tcpwin-list Matches TCP packets whose header window field is set to .Ar tcpwin-list , which is either a single value or a list of values or ranges specified in the same way as .Ar ports . .It Cm tcpoptions Ar spec TCP packets only. Match if the TCP header contains the comma separated list of options specified in .Ar spec . The supported TCP options are: .Pp .Cm mss (maximum segment size), .Cm window (tcp window advertisement), .Cm sack (selective ack), .Cm ts (rfc1323 timestamp) and .Cm cc (rfc1644 t/tcp connection count). The absence of a particular option may be denoted with a .Ql \&! . .It Cm uid Ar user Match all TCP or UDP packets sent by or received for a .Ar user . A .Ar user may be matched by name or identification number. .It Cm verrevpath For incoming packets, a routing table lookup is done on the packet's source address. 
If the interface on which the packet entered the system matches the outgoing interface for the route, the packet matches. If the interfaces do not match up, the packet does not match. All outgoing packets or packets with no incoming interface match. .Pp The name and functionality of the option is intentionally similar to the Cisco IOS command: .Pp .Dl ip verify unicast reverse-path .Pp This option can be used to make anti-spoofing rules to reject all packets with source addresses not from this interface. See also the option .Cm antispoof . .It Cm versrcreach For incoming packets, a routing table lookup is done on the packet's source address. If a route to the source address exists, but not the default route or a blackhole/reject route, the packet matches. Otherwise, the packet does not match. All outgoing packets match. .Pp The name and functionality of the option is intentionally similar to the Cisco IOS command: .Pp .Dl ip verify unicast source reachable-via any .Pp This option can be used to make anti-spoofing rules to reject all packets whose source address is unreachable. .It Cm antispoof For incoming packets, the packet's source address is checked if it belongs to a directly connected network. If the network is directly connected, then the interface the packet came on in is compared to the interface the network is connected to. When incoming interface and directly connected interface are not the same, the packet does not match. Otherwise, the packet does match. All outgoing packets match. .Pp This option can be used to make anti-spoofing rules to reject all packets that pretend to be from a directly connected network but do not come in through that interface. This option is similar to but more restricted than .Cm verrevpath because it engages only on packets with source addresses of directly connected networks instead of all source addresses. 
.El .Sh LOOKUP TABLES Lookup tables are useful to handle large sparse sets of addresses or other search keys (e.g., ports, jail IDs, interface names). In the rest of this section we will use the term ``key''. Table name needs to match the following spec: .Ar table-name . Tables with the same name can be created in different .Ar sets . However, a rule links to the tables in .Ar set 0 by default. This behavior can be controlled by .Va net.inet.ip.fw.tables_sets variable. See the .Sx SETS OF RULES section for more information. There may be up to 65535 different lookup tables. .Pp The following table types are supported: .Bl -tag -width indent .It Ar table-type : Ar addr | iface | number | flow | mac .It Ar table-key : Ar addr Ns Oo / Ns Ar masklen Oc | iface-name | number | flow-spec .It Ar flow-spec : Ar flow-field Ns Op , Ns Ar flow-spec .It Ar flow-field : src-ip | proto | src-port | dst-ip | dst-port .It Cm addr Matches IPv4 or IPv6 address. Each entry is represented by an .Ar addr Ns Op / Ns Ar masklen and will match all addresses with base .Ar addr (specified as an IPv4/IPv6 address, or a hostname) and mask width of .Ar masklen bits. If .Ar masklen is not specified, it defaults to 32 for IPv4 and 128 for IPv6. When looking up an IP address in a table, the most specific entry will match. .It Cm iface Matches interface names. Each entry is represented by string treated as interface name. Wildcards are not supported. .It Cm number Matches protocol ports, uids/gids or jail IDs. Each entry is represented by 32-bit unsigned integer. Ranges are not supported. .It Cm flow Matches packet fields specified by .Ar flow type suboptions with table entries. .It Cm mac Matches MAC address. Each entry is represented by an .Ar addr Ns Op / Ns Ar masklen and will match all addresses with base .Ar addr and mask width of .Ar masklen bits. If .Ar masklen is not specified, it defaults to 48. When looking up a MAC address in a table, the most specific entry will match. 
.El .Pp Tables require explicit creation via .Cm create before use. .Pp The following creation options are supported: .Bl -tag -width indent .It Ar create-options : Ar create-option | create-options .It Ar create-option : Cm type Ar table-type | Cm valtype Ar value-mask | Cm algo Ar algo-desc | .Cm limit Ar number | Cm locked | Cm missing | Cm or-flush .It Cm type Table key type. .It Cm valtype Table value mask. .It Cm algo Table algorithm to use (see below). .It Cm limit Maximum number of items that may be inserted into table. .It Cm locked Restrict any table modifications. .It Cm missing Do not fail if table already exists and has exactly same options as new one. .It Cm or-flush Flush existing table with same name instead of returning error. Implies .Cm missing so existing table must be compatible with new one. .El .Pp Some of these options may be modified later via .Cm modify keyword. The following options can be changed: .Bl -tag -width indent .It Ar modify-options : Ar modify-option | modify-options .It Ar modify-option : Cm limit Ar number .It Cm limit Alter maximum number of items that may be inserted into table. .El .Pp Additionally, table can be locked or unlocked using .Cm lock or .Cm unlock commands. .Pp Tables of the same .Ar type can be swapped with each other using .Cm swap Ar name command. Swap may fail if tables limits are set and data exchange would result in limits hit. Operation is performed atomically. .Pp One or more entries can be added to a table at once using .Cm add command. Addition of all items are performed atomically. By default, error in addition of one entry does not influence addition of other entries. However, non-zero error code is returned in that case. Special .Cm atomic keyword may be specified before .Cm add to indicate all-or-none add request. .Pp One or more entries can be removed from a table at once using .Cm delete command. By default, error in removal of one entry does not influence removing of other entries. 
However, non-zero error code is returned in that case. .Pp It may be possible to check what entry will be found on particular .Ar table-key using .Cm lookup .Ar table-key command. This functionality is optional and may be unsupported in some algorithms. .Pp The following operations can be performed on .Ar one or .Cm all tables: .Bl -tag -width indent .It Cm list List all entries. .It Cm flush Removes all entries. .It Cm info Shows generic table information. .It Cm detail Shows generic table information and algo-specific data. .El .Pp The following lookup algorithms are supported: .Bl -tag -width indent .It Ar algo-desc : algo-name | "algo-name algo-data" .It Ar algo-name : Ar addr:radix | addr:hash | iface:array | number:array | flow:hash | mac:radix .It Cm addr:radix Separate Radix trees for IPv4 and IPv6, the same way as the routing table (see .Xr route 4 ) . Default choice for .Ar addr type. .It Cm addr:hash Separate auto-growing hashes for IPv4 and IPv6. Accepts entries with the same mask length specified initially via .Cm "addr:hash masks=/v4,/v6" algorithm creation options. Assume /32 and /128 masks by default. Search removes host bits (according to mask) from supplied address and checks resulting key in appropriate hash. Mostly optimized for /64 and byte-ranged IPv6 masks. .It Cm iface:array Array storing sorted indexes for entries which are presented in the system. Optimized for very fast lookup. .It Cm number:array Array storing sorted u32 numbers. .It Cm flow:hash Auto-growing hash storing flow entries. Search calculates hash on required packet fields and searches for matching entries in selected bucket. .It Cm mac:radix Radix tree for MAC addresses. .El .Pp The .Cm tablearg feature provides the ability to use a value, looked up in the table, as the argument for a rule action, action parameter or rule option. This can significantly reduce number of rules in some configurations. 
If two tables are used in a rule, the result of the second (destination) is used. .Pp Each record may hold one or more values according to .Ar value-mask . This mask is set on table creation via .Cm valtype option. The following value types are supported: .Bl -tag -width indent .It Ar value-mask : Ar value-type Ns Op , Ns Ar value-mask .It Ar value-type : Ar skipto | pipe | fib | nat | dscp | tag | divert | .Ar netgraph | limit | ipv4 | ipv6 | mark .It Cm skipto rule number to jump to. .It Cm pipe Pipe number to use. .It Cm fib fib number to match/set. .It Cm nat nat number to jump to. .It Cm dscp dscp value to match/set. .It Cm tag tag number to match/set. .It Cm divert port number to divert traffic to. .It Cm netgraph hook number to move packet to. .It Cm limit maximum number of connections. .It Cm ipv4 IPv4 nexthop to fwd packets to. .It Cm ipv6 IPv6 nexthop to fwd packets to. .It Cm mark mark value to match/set. .El .Pp The .Cm tablearg argument can be used with the following actions: .Cm nat, pipe, queue, divert, tee, netgraph, ngtee, fwd, skipto, setfib , .Cm setmark , action parameters: .Cm tag, untag , rule options: .Cm limit, tagged, mark . .Pp When used with the .Cm skipto action, the user should be aware that the code will walk the ruleset up to a rule equal to, or past, the given number. .Pp See the .Sx EXAMPLES Section for example usage of tables and the tablearg keyword. .Sh SETS OF RULES Each rule or table belongs to one of 32 different .Em sets , numbered 0 to 31. Set 31 is reserved for the default rule. .Pp By default, rules or tables are put in set 0, unless you use the .Cm set N attribute when adding a new rule or table. Sets can be individually and atomically enabled or disabled, so this mechanism permits an easy way to store multiple configurations of the firewall and quickly (and atomically) switch between them. .Pp By default, tables from set 0 are referenced when adding rule with table opcodes regardless of rule set. 
This behavior can be changed by setting .Va net.inet.ip.fw.tables_sets variable to 1. Rule's set will then be used for table references. .Pp The command to enable/disable sets is .Bd -ragged -offset indent .Nm .Cm set Oo Cm disable Ar number ... Oc Op Cm enable Ar number ... .Ed .Pp where multiple .Cm enable or .Cm disable sections can be specified. Command execution is atomic on all the sets specified in the command. By default, all sets are enabled. .Pp When you disable a set, its rules behave as if they do not exist in the firewall configuration, with only one exception: .Bd -ragged -offset indent dynamic rules created from a rule before it had been disabled will still be active until they expire. In order to delete dynamic rules you have to explicitly delete the parent rule which generated them. .Ed .Pp The set number of rules can be changed with the command .Bd -ragged -offset indent .Nm .Cm set move .Brq Cm rule Ar rule-number | old-set .Cm to Ar new-set .Ed .Pp Also, you can atomically swap two rulesets with the command .Bd -ragged -offset indent .Nm .Cm set swap Ar first-set second-set .Ed .Pp See the .Sx EXAMPLES Section on some possible uses of sets of rules. .Sh STATEFUL FIREWALL Stateful operation is a way for the firewall to dynamically create rules for specific flows when packets that match a given pattern are detected. Support for stateful operation comes through the .Cm check-state , keep-state , record-state , limit and .Cm set-limit options of .Nm rules . .Pp Dynamic rules are created when a packet matches a .Cm keep-state , .Cm record-state , .Cm limit or .Cm set-limit rule, causing the creation of a .Em dynamic rule which will match all and only packets with a given .Em protocol between a .Em src-ip/src-port dst-ip/dst-port pair of addresses .Em ( src and .Em dst are used here only to denote the initial match addresses, but they are completely equivalent afterwards). Rules created by .Cm keep-state option also have a .Ar :flowname taken from it. 
This name is used in matching together with addresses, ports and protocol. Dynamic rules will be checked at the first .Cm check-state, keep-state or .Cm limit occurrence, and the action performed upon a match will be the same as in the parent rule. .Pp Note that no additional attributes other than protocol and IP addresses and ports and :flowname are checked on dynamic rules. .Pp The typical use of dynamic rules is to keep a closed firewall configuration, but let the first TCP SYN packet from the inside network install a dynamic rule for the flow so that packets belonging to that session will be allowed through the firewall: .Pp .Dl "ipfw add check-state :OUTBOUND" .Dl "ipfw add allow tcp from my-subnet to any setup keep-state :OUTBOUND" .Dl "ipfw add deny tcp from any to any" .Pp A similar approach can be used for UDP, where a UDP packet coming from the inside will install a dynamic rule to let the response through the firewall: .Pp .Dl "ipfw add check-state :OUTBOUND" .Dl "ipfw add allow udp from my-subnet to any keep-state :OUTBOUND" .Dl "ipfw add deny udp from any to any" .Pp Dynamic rules expire after some time, which depends on the status of the flow and the setting of some .Cm sysctl variables. See Section .Sx SYSCTL VARIABLES for more details. For TCP sessions, dynamic rules can be instructed to periodically send keepalive packets to refresh the state of the rule when it is about to expire. .Pp See Section .Sx EXAMPLES for more examples on how to use dynamic rules. .Sh TRAFFIC SHAPER (DUMMYNET) CONFIGURATION .Nm is also the user interface for the .Nm dummynet traffic shaper, packet scheduler and network emulator, a subsystem that can artificially queue, delay or drop packets emulating the behaviour of certain network links or queueing systems. .Pp .Nm dummynet operates by first using the firewall to select packets using any match pattern that can be used in .Nm rules. 
Matching packets are then passed to either of two different objects, which implement the traffic regulation: .Bl -hang -offset XXXX .It Em pipe A .Em pipe emulates a .Em link with given bandwidth and propagation delay, driven by a FIFO scheduler and a single queue with programmable queue size and packet loss rate. Packets are appended to the queue as they come out from .Nm ipfw , and then transferred in FIFO order to the link at the desired rate. .It Em queue A .Em queue is an abstraction used to implement packet scheduling using one of several packet scheduling algorithms. Packets sent to a .Em queue are first grouped into flows according to a mask on the 5-tuple. Flows are then passed to the scheduler associated to the .Em queue , and each flow uses scheduling parameters (weight and others) as configured in the .Em queue itself. A scheduler in turn is connected to an emulated link, and arbitrates the link's bandwidth among backlogged flows according to weights and to the features of the scheduling algorithm in use. .El .Pp In practice, .Em pipes can be used to set hard limits to the bandwidth that a flow can use, whereas .Em queues can be used to determine how different flows share the available bandwidth. .Pp A graphical representation of the binding of queues, flows, schedulers and links is below. .Bd -literal -offset indent (flow_mask|sched_mask) sched_mask +---------+ weight Wx +-------------+ | |->-[flow]-->--| |-+ -->--| QUEUE x | ... | | | | |->-[flow]-->--| SCHEDuler N | | +---------+ | | | ... | +--[LINK N]-->-- +---------+ weight Wy | | +--[LINK N]-->-- | |->-[flow]-->--| | | -->--| QUEUE y | ... | | | | |->-[flow]-->--| | | +---------+ +-------------+ | +-------------+ .Ed It is important to understand the role of the SCHED_MASK and FLOW_MASK, which are configured through the commands .Dl "ipfw sched N config mask SCHED_MASK ..." and .Dl "ipfw queue X config mask FLOW_MASK ..." . 
.Pp The SCHED_MASK is used to assign flows to one or more scheduler instances, one for each value of the packet's 5-tuple after applying SCHED_MASK. As an example, using ``src-ip 0xffffff00'' creates one instance for each /24 destination subnet. .Pp The FLOW_MASK, together with the SCHED_MASK, is used to split packets into flows. As an example, using ``src-ip 0x000000ff'' together with the previous SCHED_MASK makes a flow for each individual source address. In turn, flows for each /24 subnet will be sent to the same scheduler instance. .Pp The above diagram holds even for the .Em pipe case, with the only restriction that a .Em pipe only supports a SCHED_MASK, and forces the use of a FIFO scheduler (these are for backward compatibility reasons; in fact, internally, a .Nm dummynet's pipe is implemented exactly as above). .Pp There are two modes of .Nm dummynet operation: .Dq normal and .Dq fast . The .Dq normal mode tries to emulate a real link: the .Nm dummynet scheduler ensures that the packet will not leave the pipe faster than it would on the real link with a given bandwidth. The .Dq fast mode allows certain packets to bypass the .Nm dummynet scheduler (if packet flow does not exceed pipe's bandwidth). This is the reason why the .Dq fast mode requires less CPU cycles per packet (on average) and packet latency can be significantly lower in comparison to a real link with the same bandwidth. The default mode is .Dq normal . The .Dq fast mode can be enabled by setting the .Va net.inet.ip.dummynet.io_fast .Xr sysctl 8 variable to a non-zero value. 
.Ss PIPE, QUEUE AND SCHEDULER CONFIGURATION The .Em pipe , .Em queue and .Em scheduler configuration commands are the following: .Bd -ragged -offset indent .Cm pipe Ar number Cm config Ar pipe-configuration .Pp .Cm queue Ar number Cm config Ar queue-configuration .Pp .Cm sched Ar number Cm config Ar sched-configuration .Ed .Pp The following parameters can be configured for a pipe: .Pp .Bl -tag -width indent -compact .It Cm bw Ar bandwidth | device Bandwidth, measured in .Sm off .Op Cm K | M | G .Brq Cm bit/s | Byte/s . .Sm on .Pp A value of 0 (default) means unlimited bandwidth. The unit must immediately follow the number, as in .Pp .Dl "dnctl pipe 1 config bw 300Kbit/s" .Pp If a device name is specified instead of a numeric value, as in .Pp .Dl "dnctl pipe 1 config bw tun0" .Pp then the transmit clock is supplied by the specified device. At the moment only the .Xr tun 4 device supports this functionality, for use in conjunction with .Xr ppp 8 . .Pp .It Cm delay Ar ms-delay Propagation delay, measured in milliseconds. The value is rounded to the next multiple of the clock tick (typically 10ms, but it is a good practice to run kernels with .Dq "options HZ=1000" to reduce the granularity to 1ms or less). The default value is 0, meaning no delay. .Pp .It Cm burst Ar size If the data to be sent exceeds the pipe's bandwidth limit (and the pipe was previously idle), up to .Ar size bytes of data are allowed to bypass the .Nm dummynet scheduler, and will be sent as fast as the physical link allows. Any additional data will be transmitted at the rate specified by the .Nm pipe bandwidth. The burst size depends on how long the pipe has been idle; the effective burst size is calculated as follows: MAX( .Ar size , .Nm bw * pipe_idle_time). .Pp .It Cm profile Ar filename A file specifying the additional overhead incurred in the transmission of a packet on the link. 
.Pp Some link types introduce extra delays in the transmission of a packet, e.g., because of MAC level framing, contention on the use of the channel, MAC level retransmissions and so on. From our point of view, the channel is effectively unavailable for this extra time, which is constant or variable depending on the link type. Additionally, packets may be dropped after this time (e.g., on a wireless link after too many retransmissions). We can model the additional delay with an empirical curve that represents its distribution. .Bd -literal -offset indent cumulative probability 1.0 ^ | L +-- loss-level x | ****** | * | ***** | * | ** | * +-------*-------------------> delay .Ed The empirical curve may have both vertical and horizontal lines. Vertical lines represent constant delay for a range of probabilities. Horizontal lines correspond to a discontinuity in the delay distribution: the pipe will use the largest delay for a given probability. .Pp The file format is the following, with whitespace acting as a separator and '#' indicating the beginning a comment: .Bl -tag -width indent .It Cm name Ar identifier optional name (listed by "dnctl pipe show") to identify the delay distribution; .It Cm bw Ar value the bandwidth used for the pipe. If not specified here, it must be present explicitly as a configuration parameter for the pipe; .It Cm loss-level Ar L the probability above which packets are lost. (0.0 <= L <= 1.0, default 1.0 i.e., no loss); .It Cm samples Ar N the number of samples used in the internal representation of the curve (2..1024; default 100); .It Cm "delay prob" | "prob delay" One of these two lines is mandatory and defines the format of the following lines with data points. .It Ar XXX Ar YYY 2 or more lines representing points in the curve, with either delay or probability first, according to the chosen format. The unit for delay is milliseconds. Data points do not need to be sorted. 
Also, the number of actual lines can be different from the value of the "samples" parameter: .Nm utility will sort and interpolate the curve as needed. .El .Pp Example of a profile file: .Bd -literal -offset indent name bla_bla_bla samples 100 loss-level 0.86 prob delay 0 200 # minimum overhead is 200ms 0.5 200 0.5 300 0.8 1000 0.9 1300 1 1300 #configuration file end .Ed .El .Pp The following parameters can be configured for a queue: .Pp .Bl -tag -width indent -compact .It Cm pipe Ar pipe_nr Connects a queue to the specified pipe. Multiple queues (with the same or different weights) can be connected to the same pipe, which specifies the aggregate rate for the set of queues. .Pp .It Cm weight Ar weight Specifies the weight to be used for flows matching this queue. The weight must be in the range 1..100, and defaults to 1. .El .Pp The following case-insensitive parameters can be configured for a scheduler: .Pp .Bl -tag -width indent -compact .It Cm type Ar {fifo | wf2q+ | rr | qfq | fq_codel | fq_pie} specifies the scheduling algorithm to use. .Bl -tag -width indent -compact .It Cm fifo is just a FIFO scheduler (which means that all packets are stored in the same queue as they arrive to the scheduler). FIFO has O(1) per-packet time complexity, with very low constants (estimate 60-80ns on a 2GHz desktop machine) but gives no service guarantees. .It Cm wf2q+ implements the WF2Q+ algorithm, which is a Weighted Fair Queueing algorithm which permits flows to share bandwidth according to their weights. Note that weights are not priorities; even a flow with a minuscule weight will never starve. WF2Q+ has O(log N) per-packet processing cost, where N is the number of flows, and is the default algorithm used by previous versions dummynet's queues. .It Cm rr implements the Deficit Round Robin algorithm, which has O(1) processing costs (roughly, 100-150ns per packet) and permits bandwidth allocation according to weights, but with poor service guarantees. 
.It Cm qfq implements the QFQ algorithm, which is a very fast variant of WF2Q+, with similar service guarantees and O(1) processing costs (roughly, 200-250ns per packet). .It Cm fq_codel implements the FQ-CoDel (FlowQueue-CoDel) scheduler/AQM algorithm, which uses a modified Deficit Round Robin scheduler to manage two lists of sub-queues (old sub-queues and new sub-queues) for providing brief periods of priority to lightweight or short burst flows. By default, the total number of sub-queues is 1024. FQ-CoDel's internal, dynamically created sub-queues are controlled by separate instances of CoDel AQM. .It Cm fq_pie implements the FQ-PIE (FlowQueue-PIE) scheduler/AQM algorithm, which similar to .Cm fq_codel but uses per sub-queue PIE AQM instance to control the queue delay. .El .Pp .Cm fq_codel inherits AQM parameters and options from .Cm codel (see below), and .Cm fq_pie inherits AQM parameters and options from .Cm pie (see below). Additionally, both of .Cm fq_codel and .Cm fq_pie have shared scheduler parameters which are: .Bl -tag -width indent .It Cm quantum .Ar m specifies the quantum (credit) of the scheduler. .Ar m is the number of bytes a queue can serve before being moved to the tail of old queues list. The default is 1514 bytes, and the maximum acceptable value is 9000 bytes. .It Cm limit .Ar m specifies the hard size limit (in unit of packets) of all queues managed by an instance of the scheduler. The default value of .Ar m is 10240 packets, and the maximum acceptable value is 20480 packets. .It Cm flows .Ar m specifies the total number of flow queues (sub-queues) that fq_* creates and manages. By default, 1024 sub-queues are created when an instance of the fq_{codel/pie} scheduler is created. The maximum acceptable value is 65536. .El .Pp Note that any token after .Cm fq_codel or .Cm fq_pie is considered a parameter for fq_{codel/pie}. 
So, ensure all scheduler configuration options not related to fq_{codel/pie} are written before .Cm fq_codel/fq_pie tokens. .El .Pp In addition to the type, all parameters allowed for a pipe can also be specified for a scheduler. .Pp Finally, the following parameters can be configured for both pipes and queues: .Pp .Bl -tag -width XXXX -compact .It Cm buckets Ar hash-table-size Specifies the size of the hash table used for storing the various queues. Default value is 64 controlled by the .Xr sysctl 8 variable .Va net.inet.ip.dummynet.hash_size , allowed range is 16 to 65536. .Pp .It Cm mask Ar mask-specifier Packets sent to a given pipe or queue by an .Nm rule can be further classified into multiple flows, each of which is then sent to a different .Em dynamic pipe or queue. A flow identifier is constructed by masking the IP addresses, ports and protocol types as specified with the .Cm mask options in the configuration of the pipe or queue. For each different flow identifier, a new pipe or queue is created with the same parameters as the original object, and matching packets are sent to it. .Pp Thus, when .Em dynamic pipes are used, each flow will get the same bandwidth as defined by the pipe, whereas when .Em dynamic queues are used, each flow will share the parent's pipe bandwidth evenly with other flows generated by the same queue (note that other queues with different weights might be connected to the same pipe). .br Available mask specifiers are a combination of one or more of the following: .Pp .Cm dst-ip Ar mask , .Cm dst-ip6 Ar mask , .Cm src-ip Ar mask , .Cm src-ip6 Ar mask , .Cm dst-port Ar mask , .Cm src-port Ar mask , .Cm flow-id Ar mask , .Cm proto Ar mask or .Cm all , .Pp where the latter means all bits in all fields are significant. .Pp .It Cm noerror When a packet is dropped by a .Nm dummynet queue or pipe, the error is normally reported to the caller routine in the kernel, in the same way as it happens when a device queue fills up. 
Setting this option reports the packet as successfully delivered, which can be needed for some experimental setups where you want to simulate loss or congestion at a remote router. .Pp .It Cm plr Ar packet-loss-rate .It Cm plr Ar K,p,H,r Packet loss rate. Argument .Ar packet-loss-rate is a floating-point number between 0 and 1, with 0 meaning no loss, 1 meaning 100% loss. .Pp When invoked with four arguments, the simple Gilbert-Elliott channel model with two states (Good and Bad) is used. .Bd -literal -offset indent r .----------------. v | .------------. .------------. | G | | B | | drop (K) | | drop (H) | '------------' '------------' | ^ '----------------' p .Ed This has the associated probabilities .Po Ar K and .Ar H Pc for the loss probability. This is different from the literature, where this model is described with probabilities of successful transmission k and h. However, converting from literature is easy: .Pp K = 1 - k ; H = 1 - h .Pp This is to retain consistency within the interface and allow the quick re-use of loss probability when giving only a single argument. In addition the state change probabilities .Po Ar p and .Ar r Pc are given. All of the above probabilities are internally represented on 31 bits. .Pp .It Cm queue Brq Ar slots | size Ns Cm Kbytes Queue size, in .Ar slots or .Cm KBytes . Default value is 50 slots, which is the typical queue size for Ethernet devices. Note that for slow speed links you should keep the queue size short or your traffic might be affected by a significant queueing delay. E.g., 50 max-sized Ethernet packets (1500 bytes) mean 600Kbit or 20s of queue on a 30Kbit/s pipe. Even worse effects can result if you get packets from an interface with a much larger MTU, e.g.\& the loopback interface with its 16KB packets. The .Xr sysctl 8 variables .Em net.inet.ip.dummynet.pipe_byte_limit and .Em net.inet.ip.dummynet.pipe_slot_limit control the maximum lengths that can be specified. 
.Pp .It Cm red | gred Ar w_q Ns / Ns Ar min_th Ns / Ns Ar max_th Ns / Ns Ar max_p [ecn] Make use of the RED (Random Early Detection) queue management algorithm. .Ar w_q and .Ar max_p are floating point numbers between 0 and 1 (inclusive), while .Ar min_th and .Ar max_th are integer numbers specifying thresholds for queue management (thresholds are computed in bytes if the queue has been defined in bytes, in slots otherwise). The two parameters can also be of the same value if needed. The .Nm dummynet also supports the gentle RED variant (gred) and ECN (Explicit Congestion Notification) as optional. Three .Xr sysctl 8 variables can be used to control the RED behaviour: .Bl -tag -width indent .It Va net.inet.ip.dummynet.red_lookup_depth specifies the accuracy in computing the average queue when the link is idle (defaults to 256, must be greater than zero) .It Va net.inet.ip.dummynet.red_avg_pkt_size specifies the expected average packet size (defaults to 512, must be greater than zero) .It Va net.inet.ip.dummynet.red_max_pkt_size specifies the expected maximum packet size, only used when queue thresholds are in bytes (defaults to 1500, must be greater than zero). .El .Pp .It Cm codel Oo Cm target Ar time Oc Oo Cm interval Ar time Oc Oo Cm ecn | .Cm noecn Oc Make use of the CoDel (Controlled-Delay) queue management algorithm. .Ar time is interpreted as milliseconds by default but seconds (s), milliseconds (ms) or microseconds (us) can be specified instead. CoDel drops or marks (ECN) packets depending on packet sojourn time in the queue. .Cm target .Ar time (5ms by default) is the minimum acceptable persistent queue delay that CoDel allows. CoDel does not drop packets directly after packets sojourn time becomes higher than .Cm target .Ar time but waits for .Cm interval .Ar time (100ms default) before dropping. .Cm interval .Ar time should be set to maximum RTT for all expected connections. 
.Cm ecn enables (disabled by default) packet marking (instead of dropping) for ECN-enabled TCP flows when queue delay becomes high. .Pp Note that any token after .Cm codel is considered a parameter for CoDel. So, ensure all pipe/queue configuration options are written before .Cm codel token. .Pp The .Xr sysctl 8 variables .Va net.inet.ip.dummynet.codel.target and .Va net.inet.ip.dummynet.codel.interval can be used to set CoDel default parameters. .Pp .It Cm pie Oo Cm target Ar time Oc Oo Cm tupdate Ar time Oc Oo .Cm alpha Ar n Oc Oo Cm beta Ar n Oc Oo Cm max_burst Ar time Oc Oo .Cm max_ecnth Ar n Oc Oo Cm ecn | Cm noecn Oc Oo Cm capdrop | .Cm nocapdrop Oc Oo Cm drand | Cm nodrand Oc Oo Cm onoff .Oc Oo Cm dre | Cm ts Oc Make use of the PIE (Proportional Integral controller Enhanced) queue management algorithm. PIE drops or marks packets depending on a calculated drop probability during en-queue process, with the aim of achieving high throughput while keeping queue delay low. At regular time intervals of .Cm tupdate .Ar time (15ms by default) a background process (re)calculates the probability based on queue delay deviations from .Cm target .Ar time (15ms by default) and queue delay trends. PIE approximates current queue delay by using a departure rate estimation method, or (optionally) by using a packet timestamp method similar to CoDel. .Ar time is interpreted as milliseconds by default but seconds (s), milliseconds (ms) or microseconds (us) can be specified instead. The other PIE parameters and options are as follows: .Bl -tag -width indent .It Cm alpha Ar n .Ar n is a floating point number between 0 and 7 which specifies the weight of queue delay deviations that is used in drop probability calculation. 0.125 is the default. .It Cm beta Ar n .Ar n is a floating point number between 0 and 7 which specifies is the weight of queue delay trend that is used in drop probability calculation. 1.25 is the default. 
.It Cm max_burst Ar time The maximum period of time that PIE does not drop/mark packets. 150ms is the default and 10s is the maximum value. .It Cm max_ecnth Ar n Even when ECN is enabled, PIE drops packets instead of marking them when drop probability becomes higher than ECN probability threshold .Cm max_ecnth Ar n , the default is 0.1 (i.e 10%) and 1 is the maximum value. .It Cm ecn | noecn enable or disable ECN marking for ECN-enabled TCP flows. Disabled by default. .It Cm capdrop | nocapdrop enable or disable cap drop adjustment. Cap drop adjustment is enabled by default. .It Cm drand | nodrand enable or disable drop probability de-randomisation. De-randomisation eliminates the problem of dropping packets too close or too far. De-randomisation is enabled by default. .It Cm onoff enable turning PIE on and off depending on queue load. If this option is enabled, PIE turns on when over 1/3 of queue becomes full. This option is disabled by default. .It Cm dre | ts Calculate queue delay using departure rate estimation .Cm dre or timestamps .Cm ts . .Cm dre is used by default. .El .Pp Note that any token after .Cm pie is considered a parameter for PIE. So ensure all pipe/queue the configuration options are written before .Cm pie token. .Xr sysctl 8 variables can be used to control the .Cm pie default parameters. See the .Sx SYSCTL VARIABLES section for more details. .El .Pp When used with IPv6 data, .Nm dummynet currently has several limitations. Information necessary to route link-local packets to an interface is not available after processing by .Nm dummynet so those packets are dropped in the output path. Care should be taken to ensure that link-local packets are not passed to .Nm dummynet . .Sh CHECKLIST Here are some important points to consider when designing your rules: .Bl -bullet .It Remember that you filter both packets going .Cm in and .Cm out . Most connections need packets going in both directions. .It Remember to test very carefully. 
It is a good idea to be near the console when doing this. If you cannot be near the console, use an auto-recovery script such as the one in .Pa /usr/share/examples/ipfw/change_rules.sh . .It Do not forget the loopback interface. .El .Sh FINE POINTS .Bl -bullet .It There are circumstances where fragmented datagrams are unconditionally dropped. TCP packets are dropped if they do not contain at least 20 bytes of TCP header, UDP packets are dropped if they do not contain a full 8 byte UDP header, and ICMP packets are dropped if they do not contain 4 bytes of ICMP header, enough to specify the ICMP type, code, and checksum. These packets are simply logged as .Dq pullup failed since there may not be enough good data in the packet to produce a meaningful log entry. .It Another type of packet is unconditionally dropped, a TCP packet with a fragment offset of one. This is a valid packet, but it only has one use, to try to circumvent firewalls. When logging is enabled, these packets are reported as being dropped by rule -1. .It If you are logged in over a network, loading the .Xr kld 4 version of .Nm is probably not as straightforward as you would think. The following command line is recommended: .Bd -literal -offset indent kldload ipfw && \e ipfw add 32000 allow ip from any to any .Ed .Pp Along the same lines, doing an .Bd -literal -offset indent ipfw flush .Ed .Pp in similar surroundings is also a bad idea. .It The .Nm filter list may not be modified if the system security level is set to 3 or higher (see .Xr init 8 for information on system security levels). .El .Sh PACKET DIVERSION A .Xr divert 4 socket bound to the specified port will receive all packets diverted to that port. If no socket is bound to the destination port, or if the divert module is not loaded, or if the kernel was not compiled with divert socket support, the packets are dropped. .Sh NETWORK ADDRESS TRANSLATION (NAT) .Nm support in-kernel NAT using the kernel version of .Xr libalias 3 . 
The kernel module .Cm ipfw_nat should be loaded or kernel should have .Cm options IPFIREWALL_NAT to be able to use NAT. .Pp The nat configuration command is the following: .Bd -ragged -offset indent .Bk -words .Cm nat .Ar nat_number .Cm config .Ar nat-configuration .Ek .Ed .Pp The following parameters can be configured: .Bl -tag -width indent .It Cm ip Ar ip_address Define an ip address to use for aliasing. .It Cm if Ar nic Use ip address of NIC for aliasing, dynamically changing it if NIC's ip address changes. .It Cm log Enable logging on this nat instance. .It Cm deny_in Deny any incoming connection from outside world. .It Cm same_ports Try to leave the alias port numbers unchanged from the actual local port numbers. .It Cm unreg_only Traffic on the local network not originating from an RFC 1918 unregistered address space will be ignored. .It Cm unreg_cgn Like unreg_only, but includes the RFC 6598 (Carrier Grade NAT) address range. .It Cm reset Reset table of the packet aliasing engine on address change. .It Cm reverse Reverse the way libalias handles aliasing. .It Cm proxy_only Obey transparent proxy rules only, packet aliasing is not performed. .It Cm skip_global Skip instance in case of global state lookup (see below). .It Cm port_range Ar lower-upper Set the aliasing ports between the ranges given. Upper port has to be greater than lower. .It Cm udp_eim When enabled, UDP packets use endpoint-independent mapping (EIM) from RFC 4787 ("full cone" NAT of RFC 3489). All packets from the same internal address:port are mapped to the same NAT address:port, regardless of their destination address:port. If filtering rules allow, and if .Em deny_in is unset, any other external address:port can also send to the internal address:port through its mapped NAT address:port. This is more compatible with applications, and can reduce the need for port forwarding, but less scalable as each NAT address:port can only be concurrently used by at most one internal address:port. 
.Pp When disabled, UDP packets use endpoint-dependent mapping (EDM) ("symmetric" NAT). Each connection from a particular internal address:port to different external addresses:ports is mapped to a random and unpredictable NAT address:port. Two applications behind EDM NATs can only connect to each other by port forwarding on the NAT, or tunnelling through an in-between server. .El .Pp Some special values can be supplied instead of .Va nat_number in nat rule actions: .Bl -tag -width indent .It Cm global Looks up translation state in all configured nat instances. If an entry is found, packet is aliased according to that entry. If no entry was found in any of the instances, packet is passed unchanged, and no new entry will be created. See section .Sx MULTIPLE INSTANCES in .Xr natd 8 for more information. .It Cm tablearg Uses argument supplied in lookup table. See .Sx LOOKUP TABLES section below for more information on lookup tables. .El .Pp To let the packet continue after being (de)aliased, set the sysctl variable .Va net.inet.ip.fw.one_pass to 0. For more information about aliasing modes, refer to .Xr libalias 3 . See Section .Sx EXAMPLES for some examples of nat usage. .Ss REDIRECT AND LSNAT SUPPORT IN IPFW Redirect and LSNAT support follow closely the syntax used in .Xr natd 8 . See Section .Sx EXAMPLES for some examples on how to do redirect and lsnat. .Ss SCTP NAT SUPPORT SCTP nat can be configured in a similar manner to TCP through the .Nm command line tool. The main difference is that .Nm sctp nat does not do port translation. Since the local and global side ports will be the same, there is no need to specify both. Ports are redirected as follows: .Bd -ragged -offset indent .Bk -words .Cm nat .Ar nat_number .Cm config if .Ar nic .Cm redirect_port sctp .Ar ip_address [,addr_list] {[port | port-port] [,ports]} .Ek .Ed .Pp Most .Nm sctp nat configuration can be done in real-time through the .Xr sysctl 8 interface. 
All may be changed dynamically, though the hash_table size will only change for new .Nm nat instances. See .Sx SYSCTL VARIABLES for more info. .Sh IPv6/IPv4 NETWORK ADDRESS AND PROTOCOL TRANSLATION .Ss Stateful translation .Nm supports in-kernel IPv6/IPv4 network address and protocol translation. Stateful NAT64 translation allows IPv6-only clients to contact IPv4 servers using unicast TCP, UDP or ICMP protocols. One or more IPv4 addresses assigned to a stateful NAT64 translator are shared among several IPv6-only clients. When stateful NAT64 is used in conjunction with DNS64, no changes are usually required in the IPv6 client or the IPv4 server. The kernel module .Cm ipfw_nat64 should be loaded or kernel should have .Cm options IPFIREWALL_NAT64 to be able to use stateful NAT64 translator. .Pp Stateful NAT64 uses a bunch of memory for several types of objects. When IPv6 client initiates connection, NAT64 translator creates a host entry in the states table. Each host entry uses preallocated IPv4 alias entry. Each alias entry has a number of ports group entries allocated on demand. Ports group entries contain connection state entries. There are several options to control limits and lifetime for these objects. .Pp NAT64 translator follows RFC7915 when doing ICMPv6/ICMP translation, unsupported message types will be silently dropped. IPv6 needs several ICMPv6 message types to be explicitly allowed for correct operation. Make sure that ND6 neighbor solicitation (ICMPv6 type 135) and neighbor advertisement (ICMPv6 type 136) messages will not be handled by translation rules. .Pp After translation NAT64 translator by default sends packets through corresponding netisr queue. Thus translator host should be configured as IPv4 and IPv6 router. Also this means, that a packet is handled by firewall twice. First time an original packet is handled and consumed by translator, and then it is handled again as translated packet. 
This behavior can be changed by sysctl variable .Va net.inet.ip.fw.nat64_direct_output . Also translated packet can be tagged using .Cm tag rule action, and then matched by .Cm tagged opcode to avoid loops and extra overhead. .Pp The stateful NAT64 configuration command is the following: .Bd -ragged -offset indent .Bk -words .Cm nat64lsn .Ar name .Cm create .Ar create-options .Ek .Ed .Pp The following parameters can be configured: .Bl -tag -width indent .It Cm prefix4 Ar ipv4_prefix/plen The IPv4 prefix with mask defines the pool of IPv4 addresses used as source address after translation. Stateful NAT64 module translates IPv6 source address of client to one IPv4 address from this pool. Note that incoming IPv4 packets that don't have corresponding state entry in the states table will be dropped by translator. Make sure that translation rules handle packets, destined to configured prefix. .It Cm prefix6 Ar ipv6_prefix/length The IPv6 prefix defines IPv4-embedded IPv6 addresses used by translator to represent IPv4 addresses. This IPv6 prefix should be configured in DNS64. The translator implementation follows RFC6052, that restricts the length of prefixes to one of following: 32, 40, 48, 56, 64, or 96. The Well-Known IPv6 Prefix 64:ff9b:: must be 96 bits long. The special .Ar ::/length prefix can be used to handle several IPv6 prefixes with one NAT64 instance. The NAT64 instance will determine a destination IPv4 address from prefix .Ar length . .It Cm states_chunks Ar number The number of states chunks in single ports group. Each ports group by default can keep 64 state entries in single chunk. The above value affects the maximum number of states that can be associated with single IPv4 alias address and port. The value must be power of 2, and up to 128. .It Cm host_del_age Ar seconds The number of seconds until the host entry for a IPv6 client will be deleted and all its resources will be released due to inactivity. Default value is .Ar 3600 . 
.It Cm pg_del_age Ar seconds The number of seconds until a ports group with unused state entries will be released. Default value is .Ar 900 . .It Cm tcp_syn_age Ar seconds The number of seconds while a state entry for TCP connection with only SYN sent will be kept. If TCP connection establishing will not be finished, state entry will be deleted. Default value is .Ar 10 . .It Cm tcp_est_age Ar seconds The number of seconds while a state entry for established TCP connection will be kept. Default value is .Ar 7200 . .It Cm tcp_close_age Ar seconds The number of seconds while a state entry for closed TCP connection will be kept. Keeping state entries for closed connections is needed, because IPv4 servers typically keep closed connections in a TIME_WAIT state for a several minutes. Since translator's IPv4 addresses are shared among all IPv6 clients, new connections from the same addresses and ports may be rejected by server, because these connections are still in a TIME_WAIT state. Keeping them in translator's state table protects from such rejects. Default value is .Ar 180 . .It Cm udp_age Ar seconds The number of seconds while translator keeps state entry in a waiting for reply to the sent UDP datagram. Default value is .Ar 120 . .It Cm icmp_age Ar seconds The number of seconds while translator keeps state entry in a waiting for reply to the sent ICMP message. Default value is .Ar 60 . .It Cm log Turn on logging of all handled packets via BPF through .Ar ipfwlog0 interface. .Ar ipfwlog0 is a pseudo interface and can be created after a boot manually with .Cm ifconfig command. Note that it has different purpose than .Ar ipfw0 interface. Translators sends to BPF an additional information with each packet. With .Cm tcpdump you are able to see each handled packet before and after translation. .It Cm -log Turn off logging of all handled packets via BPF. .It Cm allow_private Turn on processing private IPv4 addresses. 
By default IPv6 packets with destinations mapped to private address ranges defined by RFC1918 are not processed. .It Cm -allow_private Turn off private address handling in .Nm nat64 instance. .El .Pp To inspect a states table of stateful NAT64 the following command can be used: .Bd -ragged -offset indent .Bk -words .Cm nat64lsn .Ar name .Cm show Cm states .Ek .Ed .Pp Stateless NAT64 translator doesn't use a states table for translation and converts IPv4 addresses to IPv6 and vice versa solely based on the mappings taken from configured lookup tables. Since a states table isn't used by the stateless translator, it can be configured to pass IPv4 clients to IPv6-only servers. .Pp The stateless NAT64 configuration command is the following: .Bd -ragged -offset indent .Bk -words .Cm nat64stl .Ar name .Cm create .Ar create-options .Ek .Ed .Pp The following parameters can be configured: .Bl -tag -width indent .It Cm prefix6 Ar ipv6_prefix/length The IPv6 prefix defines IPv4-embedded IPv6 addresses used by translator to represent IPv4 addresses. This IPv6 prefix should be configured in DNS64. .It Cm table4 Ar table46 The lookup table .Ar table46 contains mapping how IPv4 addresses should be translated to IPv6 addresses. .It Cm table6 Ar table64 The lookup table .Ar table64 contains mapping how IPv6 addresses should be translated to IPv4 addresses. .It Cm log Turn on logging of all handled packets via BPF through .Ar ipfwlog0 interface. .It Cm -log Turn off logging of all handled packets via BPF. .It Cm allow_private Turn on processing private IPv4 addresses. By default IPv6 packets with destinations mapped to private address ranges defined by RFC1918 are not processed. .It Cm -allow_private Turn off private address handling in .Nm nat64 instance. .El .Pp Note that the behavior of stateless translator with respect to not matched packets differs from stateful translator. 
If corresponding addresses were not found in the lookup tables, the packet will not be dropped and the search continues. .Ss XLAT464 CLAT translation XLAT464 CLAT NAT64 translator implements client-side stateless translation as defined in RFC6877 and is very similar to stateless NAT64 translator explained above. Instead of lookup tables it uses one-to-one mapping between IPv4 and IPv6 addresses using configured prefixes. This mode can be used as a replacement of DNS64 service for applications that are not using it (e.g. VoIP) allowing them to access IPv4-only Internet over IPv6-only networks with help of remote NAT64 translator. .Pp The CLAT NAT64 configuration command is the following: .Bd -ragged -offset indent .Bk -words .Cm nat64clat .Ar name .Cm create .Ar create-options .Ek .Ed .Pp The following parameters can be configured: .Bl -tag -width indent .It Cm clat_prefix Ar ipv6_prefix/length The IPv6 prefix defines IPv4-embedded IPv6 addresses used by translator to represent source IPv4 addresses. .It Cm plat_prefix Ar ipv6_prefix/length The IPv6 prefix defines IPv4-embedded IPv6 addresses used by translator to represent destination IPv4 addresses. This IPv6 prefix should be configured on a remote NAT64 translator. .It Cm log Turn on logging of all handled packets via BPF through .Ar ipfwlog0 interface. .It Cm -log Turn off logging of all handled packets via BPF. .It Cm allow_private Turn on processing private IPv4 addresses. By default .Nm nat64clat instance will not process IPv4 packets with destination address from private ranges as defined in RFC1918. .It Cm -allow_private Turn off private address handling in .Nm nat64clat instance. .El .Pp Note that the behavior of CLAT translator with respect to not matched packets differs from stateful translator. If corresponding addresses were not matched against prefixes configured, the packet will not be dropped and the search continues. 
.Sh IPv6-to-IPv6 NETWORK PREFIX TRANSLATION (NPTv6) .Nm supports in-kernel IPv6-to-IPv6 network prefix translation as described in RFC6296. The kernel module .Cm ipfw_nptv6 should be loaded or kernel should have .Cm options IPFIREWALL_NPTV6 to be able to use NPTv6 translator. .Pp The NPTv6 configuration command is the following: .Bd -ragged -offset indent .Bk -words .Cm nptv6 .Ar name .Cm create .Ar create-options .Ek .Ed .Pp The following parameters can be configured: .Bl -tag -width indent .It Cm int_prefix Ar ipv6_prefix IPv6 prefix used in internal network. NPTv6 module translates source address when it matches this prefix. .It Cm ext_prefix Ar ipv6_prefix IPv6 prefix used in external network. NPTv6 module translates destination address when it matches this prefix. .It Cm ext_if Ar nic The NPTv6 module will use first global IPv6 address from interface .Ar nic as external prefix. It can be useful when IPv6 prefix of external network is dynamically obtained. .Cm ext_prefix and .Cm ext_if options are mutually exclusive. .It Cm prefixlen Ar length The length of specified IPv6 prefixes. It must be in range from 8 to 64. .El .Pp Note that the prefix translation rules are silently ignored when IPv6 packet forwarding is disabled. To enable the packet forwarding, set the sysctl variable .Va net.inet6.ip6.forwarding to 1. .Pp To let the packet continue after being translated, set the sysctl variable .Va net.inet.ip.fw.one_pass to 0. .Sh LOADER TUNABLES Tunables can be set in .Xr loader 8 prompt, .Xr loader.conf 5 or .Xr kenv 1 before ipfw module gets loaded. .Bl -tag -width indent .It Va net.inet.ip.fw.enable : No 1 Enables the firewall. Setting this variable to 0 lets you run your machine without firewall even if compiled in. .It Va net.inet6.ip6.fw.enable : No 1 provides the same functionality as above for the IPv6 case. .It Va net.link.ether.ipfw : No 0 Controls whether layer2 packets are passed to .Nm . Default is no. 
.It Va net.inet.ip.fw.default_to_accept : No 0 Defines ipfw last rule behavior. This value overrides .Cd "options IPFW_DEFAULT_TO_(ACCEPT|DENY)" from kernel configuration file. .It Va net.inet.ip.fw.tables_max : No 128 Defines number of tables available in ipfw. Number cannot exceed 65534. .El .Sh SYSCTL VARIABLES A set of .Xr sysctl 8 variables controls the behaviour of the firewall and associated modules .Pq Nm dummynet , bridge , sctp nat . These are shown below together with their default value (but always check with the .Xr sysctl 8 command what value is actually in use) and meaning: .Bl -tag -width indent .It Va net.inet.ip.alias.sctp.accept_global_ootb_addip : No 0 Defines how the .Nm nat responds to receipt of global OOTB ASCONF-AddIP: .Bl -tag -width indent .It Cm 0 No response (unless a partially matching association exists - ports and vtags match but global address does not) .It Cm 1 .Nm nat will accept and process all OOTB global AddIP messages. .El .Pp Option 1 should never be selected as this forms a security risk. An attacker can establish multiple fake associations by sending AddIP messages. .It Va net.inet.ip.alias.sctp.chunk_proc_limit : No 5 Defines the maximum number of chunks in an SCTP packet that will be parsed for a packet that matches an existing association. This value is enforced to be greater or equal than .Cm net.inet.ip.alias.sctp.initialising_chunk_proc_limit . A high value is a DoS risk yet setting too low a value may result in important control chunks in the packet not being located and parsed. .It Va net.inet.ip.alias.sctp.error_on_ootb : No 1 Defines when the .Nm nat responds to any Out-of-the-Blue (OOTB) packets with ErrorM packets. An OOTB packet is a packet that arrives with no existing association registered in the .Nm nat and is not an INIT or ASCONF-AddIP packet: .Bl -tag -width indent .It Cm 0 ErrorM is never sent in response to OOTB packets. .It Cm 1 ErrorM is only sent to OOTB packets received on the local side. 
.It Cm 2 ErrorM is sent to the local side and on the global side ONLY if there is a partial match (ports and vtags match but the source global IP does not). This value is only useful if the .Nm nat is tracking global IP addresses. .It Cm 3 ErrorM is sent in response to all OOTB packets on both the local and global side (DoS risk). .El .Pp At the moment the default is 0, since the ErrorM packet is not yet supported by most SCTP stacks. When it is supported, and if not tracking global addresses, we recommend setting this value to 1 to allow multi-homed local hosts to function with the .Nm nat . To track global addresses, we recommend setting this value to 2 to allow global hosts to be informed when they need to (re)send an ASCONF-AddIP. Value 3 should never be chosen (except for debugging) as the .Nm nat will respond to all OOTB global packets (a DoS risk). .It Va net.inet.ip.alias.sctp.hashtable_size : No 2003 Size of hash tables used for .Nm nat lookups (100 < prime_number > 1000001). This value sets the .Nm hash table size for any future created .Nm nat instance and therefore must be set prior to creating a .Nm nat instance. The table sizes may be changed to suit specific needs. If there will be few concurrent associations, and memory is scarce, you may make these smaller. If there will be many thousands (or millions) of concurrent associations, you should make these larger. A prime number is best for the table size. The sysctl update function will adjust your input value to the next highest prime number. .It Va net.inet.ip.alias.sctp.holddown_time : No 0 Hold association in table for this many seconds after receiving a SHUTDOWN-COMPLETE. This allows endpoints to correct shutdown gracefully if a shutdown_complete is lost and retransmissions are required. .It Va net.inet.ip.alias.sctp.init_timer : No 15 Timeout value while waiting for (INIT-ACK|AddIP-ACK). This value cannot be 0. 
.It Va net.inet.ip.alias.sctp.initialising_chunk_proc_limit : No 2 Defines the maximum number of chunks in an SCTP packet that will be parsed when no existing association exists that matches that packet. Ideally this packet will only be an INIT or ASCONF-AddIP packet. A higher value may become a DoS risk as malformed packets can consume processing resources. .It Va net.inet.ip.alias.sctp.param_proc_limit : No 25 Defines the maximum number of parameters within a chunk that will be parsed in a packet. As for other similar sysctl variables, larger values pose a DoS risk. .It Va net.inet.ip.alias.sctp.log_level : No 0 Level of detail in the system log messages (0 \- minimal, 1 \- event, 2 \- info, 3 \- detail, 4 \- debug, 5 \- max debug). May be a good option in high loss environments. .It Va net.inet.ip.alias.sctp.shutdown_time : No 15 Timeout value while waiting for SHUTDOWN-COMPLETE. This value cannot be 0. .It Va net.inet.ip.alias.sctp.track_global_addresses : No 0 Enables/disables global IP address tracking within the .Nm nat and places an upper limit on the number of addresses tracked for each association: .Bl -tag -width indent .It Cm 0 Global tracking is disabled .It Cm >1 Enables tracking, the maximum number of addresses tracked for each association is limited to this value .El .Pp This variable is fully dynamic, the new value will be adopted for all newly arriving associations, existing associations are treated as they were previously. Global tracking will decrease the number of collisions within the .Nm nat at a cost of increased processing load, memory usage, complexity, and possible .Nm nat state problems in complex networks with multiple .Nm nats . We recommend not tracking global IP addresses, this will still result in a fully functional .Nm nat . .It Va net.inet.ip.alias.sctp.up_timer : No 300 Timeout value to keep an association up with no traffic. This value cannot be 0. 
.It Va net.inet.ip.dummynet.codel.interval : No 100000 Default .Cm codel AQM interval in microseconds. The value must be in the range 1..5000000. .It Va net.inet.ip.dummynet.codel.target : No 5000 Default .Cm codel AQM target delay time in microseconds (the minimum acceptable persistent queue delay). The value must be in the range 1..5000000. .It Va net.inet.ip.dummynet.expire : No 1 Lazily delete dynamic pipes/queue once they have no pending traffic. You can disable this by setting the variable to 0, in which case the pipes/queues will only be deleted when the threshold is reached. .It Va net.inet.ip.dummynet.fqcodel.flows : No 1024 Defines the default total number of flow queues (sub-queues) that .Cm fq_codel creates and manages. The value must be in the range 1..65536. .It Va net.inet.ip.dummynet.fqcodel.interval : No 100000 Default .Cm fq_codel scheduler/AQM interval in microseconds. The value must be in the range 1..5000000. .It Va net.inet.ip.dummynet.fqcodel.limit : No 10240 The default hard size limit (in unit of packet) of all queues managed by an instance of the .Cm fq_codel scheduler. The value must be in the range 1..20480. .It Va net.inet.ip.dummynet.fqcodel.quantum : No 1514 The default quantum (credit) of the .Cm fq_codel in unit of byte. The value must be in the range 1..9000. .It Va net.inet.ip.dummynet.fqcodel.target : No 5000 Default .Cm fq_codel scheduler/AQM target delay time in microseconds (the minimum acceptable persistent queue delay). The value must be in the range 1..5000000. .It Va net.inet.ip.dummynet.fqpie.alpha : No 125 The default .Ar alpha parameter (scaled by 1000) for .Cm fq_pie scheduler/AQM. The value must be in the range 1..7000. .It Va net.inet.ip.dummynet.fqpie.beta : No 1250 The default .Ar beta parameter (scaled by 1000) for .Cm fq_pie scheduler/AQM. The value must be in the range 1..7000. 
.It Va net.inet.ip.dummynet.fqpie.flows : No 1024 Defines the default total number of flow queues (sub-queues) that .Cm fq_pie creates and manages. The value must be in the range 1..65536. .It Va net.inet.ip.dummynet.fqpie.limit : No 10240 The default hard size limit (in unit of packet) of all queues managed by an instance of the .Cm fq_pie scheduler. The value must be in the range 1..20480. .It Va net.inet.ip.dummynet.fqpie.max_burst : No 150000 The default maximum period of microseconds that .Cm fq_pie scheduler/AQM does not drop/mark packets. The value must be in the range 1..10000000. .It Va net.inet.ip.dummynet.fqpie.max_ecnth : No 99 The default maximum ECN probability threshold (scaled by 1000) for .Cm fq_pie scheduler/AQM. The value must be in the range 1..7000. .It Va net.inet.ip.dummynet.fqpie.quantum : No 1514 The default quantum (credit) of the .Cm fq_pie in unit of byte. The value must be in the range 1..9000. .It Va net.inet.ip.dummynet.fqpie.target : No 15000 The default .Cm target delay of the .Cm fq_pie in unit of microsecond. The value must be in the range 1..5000000. .It Va net.inet.ip.dummynet.fqpie.tupdate : No 15000 The default .Cm tupdate of the .Cm fq_pie in unit of microsecond. The value must be in the range 1..5000000. .It Va net.inet.ip.dummynet.hash_size : No 64 Default size of the hash table used for dynamic pipes/queues. This value is used when no .Cm buckets option is specified when configuring a pipe/queue. .It Va net.inet.ip.dummynet.io_fast : No 0 If set to a non-zero value, the .Dq fast mode of .Nm dummynet operation (see above) is enabled. .It Va net.inet.ip.dummynet.io_pkt Number of packets passed to .Nm dummynet . .It Va net.inet.ip.dummynet.io_pkt_drop Number of packets dropped by .Nm dummynet . .It Va net.inet.ip.dummynet.io_pkt_fast Number of packets bypassed by the .Nm dummynet scheduler. .It Va net.inet.ip.dummynet.max_chain_len : No 16 Target value for the maximum number of pipes/queues in a hash bucket. 
The product .Cm max_chain_len*hash_size is used to determine the threshold over which empty pipes/queues will be expired even when .Cm net.inet.ip.dummynet.expire=0 . .It Va net.inet.ip.dummynet.red_lookup_depth : No 256 .It Va net.inet.ip.dummynet.red_avg_pkt_size : No 512 .It Va net.inet.ip.dummynet.red_max_pkt_size : No 1500 Parameters used in the computations of the drop probability for the RED algorithm. .It Va net.inet.ip.dummynet.pie.alpha : No 125 The default .Ar alpha parameter (scaled by 1000) for .Cm pie AQM. The value must be in the range 1..7000. .It Va net.inet.ip.dummynet.pie.beta : No 1250 The default .Ar beta parameter (scaled by 1000) for .Cm pie AQM. The value must be in the range 1..7000. .It Va net.inet.ip.dummynet.pie.max_burst : No 150000 The default maximum period of microseconds that .Cm pie AQM does not drop/mark packets. The value must be in the range 1..10000000. .It Va net.inet.ip.dummynet.pie.max_ecnth : No 99 The default maximum ECN probability threshold (scaled by 1000) for .Cm pie AQM. The value must be in the range 1..7000. .It Va net.inet.ip.dummynet.pie.target : No 15000 The default .Cm target delay of .Cm pie AQM in unit of microsecond. The value must be in the range 1..5000000. .It Va net.inet.ip.dummynet.pie.tupdate : No 15000 The default .Cm tupdate of .Cm pie AQM in unit of microsecond. The value must be in the range 1..5000000. .It Va net.inet.ip.dummynet.pipe_byte_limit : No 1048576 .It Va net.inet.ip.dummynet.pipe_slot_limit : No 100 The maximum queue size that can be specified in bytes or packets. These limits prevent accidental exhaustion of resources such as mbufs. If you raise these limits, you should make sure the system is configured so that sufficient resources are available. .It Va net.inet.ip.fw.autoinc_step : No 100 Delta between rule numbers when auto-generating them. The value must be in the range 1..1000. 
.It Va net.inet.ip.fw.curr_dyn_buckets : Va net.inet.ip.fw.dyn_buckets The current number of buckets in the hash table for dynamic rules (readonly). .It Va net.inet.ip.fw.debug : No 1 Controls debugging messages produced by .Nm . .It Va net.inet.ip.fw.default_rule : No 65535 The default rule number (read-only). By the design of .Nm , the default rule is the last one, so its number can also serve as the highest number allowed for a rule. .It Va net.inet.ip.fw.dyn_buckets : No 256 The number of buckets in the hash table for dynamic rules. Must be a power of 2, up to 65536. It only takes effect when all dynamic rules have expired, so you are advised to use a .Cm flush command to make sure that the hash table is resized. .It Va net.inet.ip.fw.dyn_count : No 3 Current number of dynamic rules (read-only). .It Va net.inet.ip.fw.dyn_keepalive : No 1 Enables generation of keepalive packets for .Cm keep-state rules on TCP sessions. A keepalive is generated to both sides of the connection every 5 seconds for the last 20 seconds of the lifetime of the rule. .It Va net.inet.ip.fw.dyn_max : No 8192 Maximum number of dynamic rules. When you hit this limit, no more dynamic rules can be installed until old ones expire. .It Va net.inet.ip.fw.dyn_ack_lifetime : No 300 .It Va net.inet.ip.fw.dyn_syn_lifetime : No 20 .It Va net.inet.ip.fw.dyn_fin_lifetime : No 1 .It Va net.inet.ip.fw.dyn_rst_lifetime : No 1 .It Va net.inet.ip.fw.dyn_udp_lifetime : No 5 .It Va net.inet.ip.fw.dyn_short_lifetime : No 30 These variables control the lifetime, in seconds, of dynamic rules. Upon the initial SYN exchange the lifetime is kept short, then increased after both SYN have been seen, then decreased again during the final FIN exchange or when a RST is received. Both .Em dyn_fin_lifetime and .Em dyn_rst_lifetime must be strictly lower than 5 seconds, the period of repetition of keepalives. The firewall enforces that. .It Va net.inet.ip.fw.dyn_keep_states : No 0 Keep dynamic states on rule/set deletion. 
States are relinked to default rule (65535). This can be handy for ruleset reload. Turned off by default. .It Va net.inet.ip.fw.one_pass : No 1 When set, the packet exiting from the .Nm dummynet pipe or from .Xr ng_ipfw 4 node is not passed through the firewall again. Otherwise, after an action, the packet is reinjected into the firewall at the next rule. .It Va net.inet.ip.fw.tables_max : No 128 Maximum number of tables. .It Va net.inet.ip.fw.verbose : No 1 Enables verbose messages. .It Va net.inet.ip.fw.verbose_limit : No 0 Limits the number of messages produced by a verbose firewall. .It Va net.inet6.ip6.fw.deny_unknown_exthdrs : No 1 If enabled packets with unknown IPv6 Extension Headers will be denied. .It Va net.link.bridge.ipfw : No 0 Controls whether bridged packets are passed to .Nm . Default is no. .It Va net.inet.ip.fw.nat64_debug : No 0 Controls debugging messages produced by .Nm ipfw_nat64 module. .It Va net.inet.ip.fw.nat64_direct_output : No 0 Controls the output method used by .Nm ipfw_nat64 module: .Bl -tag -width indent .It Cm 0 A packet is handled by .Nm ipfw twice. First time an original packet is handled by .Nm ipfw and consumed by .Nm ipfw_nat64 translator. Then translated packet is queued via netisr to input processing again. .It Cm 1 A packet is handled by .Nm ipfw only once, and after translation it will be pushed directly to outgoing interface. .El .El .Sh INTERNAL DIAGNOSTICS There are some commands that may be useful to understand current state of certain subsystems inside kernel module. These commands provide debugging output which may change without notice. .Pp Currently the following commands are available as .Cm internal sub-options: .Bl -tag -width indent .It Cm iflist Lists all interfaces which are currently tracked by .Nm with their in-kernel status. .It Cm talist List all table lookup algorithms currently available. .El .Sh EXAMPLES There are far too many possible uses of .Nm so this Section will only give a small set of examples. 
.Ss BASIC PACKET FILTERING This command adds an entry which denies all tcp packets from .Em cracker.evil.org to the telnet port of .Em wolf.tambov.su from being forwarded by the host: .Pp .Dl "ipfw add deny tcp from cracker.evil.org to wolf.tambov.su telnet" .Pp This one disallows any connection from the entire cracker's network to my host: .Pp .Dl "ipfw add deny ip from 123.45.67.0/24 to my.host.org" .Pp A first and efficient way to limit access (not using dynamic rules) is the use of the following rules: .Pp .Dl "ipfw add allow tcp from any to any established" .Dl "ipfw add allow tcp from net1 portlist1 to net2 portlist2 setup" .Dl "ipfw add allow tcp from net3 portlist3 to net3 portlist3 setup" .Dl "..." .Dl "ipfw add deny tcp from any to any" .Pp The first rule will be a quick match for normal TCP packets, but it will not match the initial SYN packet, which will be matched by the .Cm setup rules only for selected source/destination pairs. All other SYN packets will be rejected by the final .Cm deny rule. .Pp If you administer one or more subnets, you can take advantage of the address sets and or-blocks and write extremely compact rulesets which selectively enable services to blocks of clients, as below: .Pp .Dl "goodguys=\*q{ 10.1.2.0/24{20,35,66,18} or 10.2.3.0/28{6,3,11} }\*q" .Dl "badguys=\*q10.1.2.0/24{8,38,60}\*q" .Dl "" .Dl "ipfw add allow ip from ${goodguys} to any" .Dl "ipfw add deny ip from ${badguys} to any" .Dl "... normal policies ..." .Pp Allow any transit packets coming from single vlan 10 and going out to vlans 100-1000: .Pp .Dl "ipfw add 10 allow out recv vlan10 \e" .Dl "{ xmit vlan1000 or xmit \*qvlan[1-9]??\*q }" .Pp The .Cm verrevpath option could be used to do automated anti-spoofing by adding the following to the top of a ruleset: .Pp .Dl "ipfw add deny ip from any to any not verrevpath in" .Pp This rule drops all incoming packets that appear to be coming to the system on the wrong interface. 
For example, a packet with a source address belonging to a host on a protected internal network would be dropped if it tried to enter the system from an external interface. .Pp The .Cm antispoof option could be used to do similar but more restricted anti-spoofing by adding the following to the top of a ruleset: .Pp .Dl "ipfw add deny ip from any to any not antispoof in" .Pp This rule drops all incoming packets that appear to be coming from another directly connected system but on the wrong interface. For example, a packet with a source address of .Li 192.168.0.0/24 , configured on .Li fxp0 , but coming in on .Li fxp1 would be dropped. .Pp The .Cm setdscp option could be used to (re)mark user traffic, by adding the following to the appropriate place in ruleset: .Pp .Dl "ipfw add setdscp be ip from any to any dscp af11,af21" .Ss SELECTIVE MIRRORING If your network has network traffic analyzer connected to your host directly via dedicated interface or remotely via RSPAN vlan, you can selectively mirror some Ethernet layer2 frames to the analyzer. .Pp First, make sure your firewall is already configured and runs. Then, enable layer2 processing if not already enabled: .Pp .Dl "sysctl net.link.ether.ipfw=1" .Pp Next, load needed additional kernel modules: .Pp .Dl "kldload ng_ether ng_ipfw" .Pp Optionally, make system load these modules automatically at startup: .Pp .Dl sysrc kld_list+="ng_ether ng_ipfw" .Pp Next, configure .Xr ng_ipfw 4 kernel module to transmit mirrored copies of layer2 frames out via vlan900 interface: .Pp .Dl "ngctl connect ipfw: vlan900: 1 lower" .Pp Think of "1" here as of "mirroring instance index" and vlan900 is its destination. You can have arbitrary number of instances. Refer to .Xr ng_ipfw 4 for details. .Pp At last, actually start mirroring of selected frames using "instance 1". 
For frames incoming from em0 interface: .Pp .Dl "ipfw add ngtee 1 ip from any to 192.168.0.1 layer2 in recv em0" .Pp For frames outgoing to em0 interface: .Pp .Dl "ipfw add ngtee 1 ip from any to 192.168.0.1 layer2 out xmit em0" .Pp For both incoming and outgoing frames while flowing through em0: .Pp .Dl "ipfw add ngtee 1 ip from any to 192.168.0.1 layer2 via em0" .Pp Make sure you do not perform mirroring for already duplicated frames or the kernel may hang as there is no safety net. .Ss DYNAMIC RULES In order to protect a site from flood attacks involving fake TCP packets, it is safer to use dynamic rules: .Pp .Dl "ipfw add check-state" .Dl "ipfw add deny tcp from any to any established" .Dl "ipfw add allow tcp from my-net to any setup keep-state" .Pp This will let the firewall install dynamic rules only for those connections which start with a regular SYN packet coming from the inside of our network. Dynamic rules are checked when encountering the first occurrence of a .Cm check-state , .Cm keep-state or .Cm limit rule. A .Cm check-state rule should usually be placed near the beginning of the ruleset to minimize the amount of work scanning the ruleset. Your mileage may vary. .Pp For more complex scenarios with dynamic rules .Cm record-state and .Cm defer-action can be used to precisely control creation and checking of dynamic rules. Examples of the usage of these options are provided in .Sx NETWORK ADDRESS TRANSLATION (NAT) Section. .Pp To limit the number of connections a user can open you can use the following type of rules: .Pp .Dl "ipfw add allow tcp from my-net/24 to any setup limit src-addr 10" .Dl "ipfw add allow tcp from any to me setup limit src-addr 4" .Pp The former (assuming it runs on a gateway) will allow each host on a /24 network to open at most 10 TCP connections. The latter can be placed on a server to make sure that a single client does not use more than 4 simultaneous connections. 
.Pp .Em BEWARE : stateful rules can be subject to denial-of-service attacks by a SYN-flood which opens a huge number of dynamic rules. The effects of such attacks can be partially limited by acting on a set of .Xr sysctl 8 variables which control the operation of the firewall. .Pp Here is a good usage of the .Cm list command to see accounting records and timestamp information: .Pp .Dl ipfw -at list .Pp or in short form without timestamps: .Pp .Dl ipfw -a list .Pp which is equivalent to: .Pp .Dl ipfw show .Pp Next rule diverts all incoming packets from 192.168.2.0/24 to divert port 5000: .Pp .Dl ipfw divert 5000 ip from 192.168.2.0/24 to any in .Ss TRAFFIC SHAPING The following rules show some of the applications of .Nm and .Nm dummynet for simulations and the like. .Pp This rule drops random incoming packets with a probability of 5%: .Pp .Dl "ipfw add prob 0.05 deny ip from any to any in" .Pp A similar effect can be achieved making use of .Nm dummynet pipes: .Pp .Dl "dnctl add pipe 10 ip from any to any" .Dl "dnctl pipe 10 config plr 0.05" .Pp We can use pipes to artificially limit bandwidth, e.g.\& on a machine acting as a router, if we want to limit traffic from local clients on 192.168.2.0/24 we do: .Pp .Dl "ipfw add pipe 1 ip from 192.168.2.0/24 to any out" .Dl "dnctl pipe 1 config bw 300Kbit/s queue 50KBytes" .Pp note that we use the .Cm out modifier so that the rule is not used twice. Remember in fact that .Nm rules are checked both on incoming and outgoing packets. .Pp Should we want to simulate a bidirectional link with bandwidth limitations, the correct way is the following: .Pp .Dl "ipfw add pipe 1 ip from any to any out" .Dl "ipfw add pipe 2 ip from any to any in" .Dl "dnctl pipe 1 config bw 64Kbit/s queue 10Kbytes" .Dl "dnctl pipe 2 config bw 64Kbit/s queue 10Kbytes" .Pp The above can be very useful, e.g.\& if you want to see how your fancy Web page will look for a residential user who is connected only through a slow link. 
You should not use only one pipe for both directions, unless you want to simulate a half-duplex medium (e.g.\& AppleTalk, Ethernet, IRDA). It is not necessary that both pipes have the same configuration, so we can also simulate asymmetric links. .Pp Should we want to verify network performance with the RED queue management algorithm: .Pp .Dl "ipfw add pipe 1 ip from any to any" .Dl "dnctl pipe 1 config bw 500Kbit/s queue 100 red 0.002/30/80/0.1" .Pp Another typical application of the traffic shaper is to introduce some delay in the communication. This can significantly affect applications which do a lot of Remote Procedure Calls, and where the round-trip-time of the connection often becomes a limiting factor much more than bandwidth: .Pp .Dl "ipfw add pipe 1 ip from any to any out" .Dl "ipfw add pipe 2 ip from any to any in" .Dl "dnctl pipe 1 config delay 250ms bw 1Mbit/s" .Dl "dnctl pipe 2 config delay 250ms bw 1Mbit/s" .Pp Per-flow queueing can be useful for a variety of purposes. A very simple one is counting traffic: .Pp .Dl "ipfw add pipe 1 tcp from any to any" .Dl "ipfw add pipe 1 udp from any to any" .Dl "ipfw add pipe 1 ip from any to any" .Dl "dnctl pipe 1 config mask all" .Pp The above set of rules will create queues (and collect statistics) for all traffic. Because the pipes have no limitations, the only effect is collecting statistics. Note that we need 3 rules, not just the last one, because when .Nm tries to match IP packets it will not consider ports, so we would not see connections on separate ports as different ones. 
.Pp A more sophisticated example is limiting the outbound traffic on a net with per-host limits, rather than per-network limits: .Pp .Dl "ipfw add pipe 1 ip from 192.168.2.0/24 to any out" .Dl "ipfw add pipe 2 ip from any to 192.168.2.0/24 in" .Dl "dnctl pipe 1 config mask src-ip 0x000000ff bw 200Kbit/s queue 20Kbytes" .Dl "dnctl pipe 2 config mask dst-ip 0x000000ff bw 200Kbit/s queue 20Kbytes" .Ss LOOKUP TABLES In the following example, we need to create several traffic bandwidth classes and we need different hosts/networks to fall into different classes. We create one pipe for each class and configure them accordingly. Then we create a single table and fill it with IP subnets and addresses. For each subnet/host we set the argument equal to the number of the pipe that it should use. Then we classify traffic using a single rule: .Pp .Dl "dnctl pipe 1 config bw 1000Kbyte/s" .Dl "dnctl pipe 4 config bw 4000Kbyte/s" .Dl "..." .Dl "ipfw table T1 create type addr" .Dl "ipfw table T1 add 192.168.2.0/24 1" .Dl "ipfw table T1 add 192.168.0.0/27 4" .Dl "ipfw table T1 add 192.168.0.2 1" .Dl "..." .Dl "ipfw add pipe tablearg ip from 'table(T1)' to any" .Pp Using the .Cm fwd action, the table entries may include hostnames and IP addresses. .Pp .Dl "ipfw table T2 create type addr valtype ipv4" .Dl "ipfw table T2 add 192.168.2.0/24 10.23.2.1" .Dl "ipfw table T2 add 192.168.0.0/27 router1.dmz" .Dl "..." .Dl "ipfw add 100 fwd tablearg ip from any to 'table(T2)'" .Pp In the following example per-interface firewall is created: .Pp .Dl "ipfw table IN create type iface valtype skipto,fib" .Dl "ipfw table IN add vlan20 12000,12" .Dl "ipfw table IN add vlan30 13000,13" .Dl "ipfw table OUT create type iface valtype skipto" .Dl "ipfw table OUT add vlan20 22000" .Dl "ipfw table OUT add vlan30 23000" .Dl ".." 
.Dl "ipfw add 100 setfib tablearg ip from any to any recv 'table(IN)' in" .Dl "ipfw add 200 skipto tablearg ip from any to any recv 'table(IN)' in" .Dl "ipfw add 300 skipto tablearg ip from any to any xmit 'table(OUT)' out" .Pp The following example illustrates the usage of flow tables: .Pp .Dl "ipfw table fl create type flow:src-ip,proto,dst-ip,dst-port" .Dl "ipfw table fl add 2a02:6b8:77::88,tcp,2a02:6b8:77::99,80 11" .Dl "ipfw table fl add 10.0.0.1,udp,10.0.0.2,53 12" .Dl ".." .Dl "ipfw add 100 allow ip from any to any flow 'table(fl,11)' recv ix0" .Ss SETS OF RULES To add a set of rules atomically, e.g.\& set 18: .Pp .Dl "ipfw set disable 18" .Dl "ipfw add NN set 18 ... # repeat as needed" .Dl "ipfw set enable 18" .Pp To delete a set of rules atomically the command is simply: .Pp .Dl "ipfw delete set 18" .Pp To test a ruleset and disable it and regain control if something goes wrong: .Pp .Dl "ipfw set disable 18" .Dl "ipfw add NN set 18 ... # repeat as needed" .Dl "ipfw set enable 18; echo done; sleep 30 && ipfw set disable 18" .Pp Here, if everything goes well, you press control-C before the "sleep" terminates, and your ruleset will be left active. Otherwise, e.g.\& if you cannot access your box, the ruleset will be disabled after the sleep terminates, thus restoring the previous situation. 
.Pp To show rules of the specific set: .Pp .Dl "ipfw set 18 show" .Pp To show rules of the disabled set: .Pp .Dl "ipfw -S set 18 show" .Pp To clear the counters of a specific rule of the specific set: .Pp .Dl "ipfw set 18 zero NN" .Pp To delete a specific rule of the specific set: .Pp .Dl "ipfw set 18 delete NN" .Ss NAT, REDIRECT AND LSNAT First redirect all the traffic to nat instance 123: .Pp .Dl "ipfw add nat 123 all from any to any" .Pp Then to configure nat instance 123 to alias all the outgoing traffic with ip 192.168.0.123, blocking all incoming connections, trying to keep same ports on both sides, clearing aliasing table on address change and keeping a log of traffic/link statistics: .Pp .Dl "ipfw nat 123 config ip 192.168.0.123 log deny_in reset same_ports" .Pp Or to change the address of instance 123; the aliasing table will be cleared (see the reset option): .Pp .Dl "ipfw nat 123 config ip 10.0.0.1" .Pp To see the configuration of nat instance 123: .Pp .Dl "ipfw nat 123 show config" .Pp To show logs of all instances: .Pp .Dl "ipfw nat show log" .Pp To see configurations of all instances: .Pp .Dl "ipfw nat show config" .Pp Or a redirect rule with mixed modes could look like: .Bd -literal -offset 2n ipfw nat 123 config redirect_addr 10.0.0.1 10.0.0.66 redirect_port tcp 192.168.0.1:80 500 redirect_proto udp 192.168.1.43 192.168.1.1 redirect_addr 192.168.0.10,192.168.0.11 10.0.0.100 # LSNAT redirect_port tcp 192.168.0.1:80,192.168.0.10:22 500 # LSNAT .Ed .Pp or it could be split into: .Bd -literal -offset 2n ipfw nat 1 config redirect_addr 10.0.0.1 10.0.0.66 ipfw nat 2 config redirect_port tcp 192.168.0.1:80 500 ipfw nat 3 config redirect_proto udp 192.168.1.43 192.168.1.1 ipfw nat 4 config redirect_addr 192.168.0.10,192.168.0.11,192.168.0.12 10.0.0.100 ipfw nat 5 config redirect_port tcp 192.168.0.1:80,192.168.0.10:22,192.168.0.20:25 500 .Ed .Pp Sometimes you may want to mix NAT and dynamic rules. This can be achieved with the .Cm record-state and .Cm defer-action options. 
The problem is that you need to create the dynamic rule before NAT and check it after NAT actions (or vice versa) to have consistent addresses and ports. A rule with the .Cm keep-state option will trigger activation of existing dynamic state, and the action of such a rule will be performed as soon as the rule is matched. In the case of NAT and an .Cm allow rule, the packet needs to be passed to NAT, not allowed as soon as possible. .Pp Here is an example set of rules to achieve this. Bear in mind that this is an example only and it is not very useful by itself. .Pp On the way out, after all checks, place these rules: .Pp .Dl "ipfw add allow record-state defer-action" .Dl "ipfw add nat 1" .Pp And on the way in there should be something like this: .Pp .Dl "ipfw add nat 1" .Dl "ipfw add check-state" .Pp Please note that the first rule on the way out doesn't allow the packet and doesn't execute existing dynamic rules. All it does is create a new dynamic rule with the .Cm allow action, if it is not created yet. Later, this dynamic rule is used on the way in by the .Cm check-state rule. .Ss CONFIGURING CODEL, PIE, FQ-CODEL and FQ-PIE AQM .Cm codel and .Cm pie AQM can be configured for .Nm dummynet .Cm pipe or .Cm queue . 
.Pp To configure a .Cm pipe with .Cm codel AQM using default configuration for traffic from 192.168.0.0/24 and 1Mbits/s rate limit, we do: .Pp .Dl "dnctl pipe 1 config bw 1mbits/s codel" .Dl "ipfw add 100 pipe 1 ip from 192.168.0.0/24 to any" .Pp To configure a .Cm queue with .Cm codel AQM using different configurations parameters for traffic from 192.168.0.0/24 and 1Mbits/s rate limit, we do: .Pp .Dl "dnctl pipe 1 config bw 1mbits/s" .Dl "dnctl queue 1 config pipe 1 codel target 8ms interval 160ms ecn" .Dl "ipfw add 100 queue 1 ip from 192.168.0.0/24 to any" .Pp To configure a .Cm pipe with .Cm pie AQM using default configuration for traffic from 192.168.0.0/24 and 1Mbits/s rate limit, we do: .Pp .Dl "dnctl pipe 1 config bw 1mbits/s pie" .Dl "ipfw add 100 pipe 1 ip from 192.168.0.0/24 to any" .Pp To configure a .Cm queue with .Cm pie AQM using different configuration parameters for traffic from 192.168.0.0/24 and 1Mbits/s rate limit, we do: .Pp .Dl "dnctl pipe 1 config bw 1mbits/s" .Dl "dnctl queue 1 config pipe 1 pie target 20ms tupdate 30ms ecn" .Dl "ipfw add 100 queue 1 ip from 192.168.0.0/24 to any" .Pp .Cm fq_codel and .Cm fq_pie AQM can be configured for .Nm dummynet schedulers. 
.Pp To configure .Cm fq_codel scheduler using different configurations parameters for traffic from 192.168.0.0/24 and 1Mbits/s rate limit, we do: .Pp .Dl "dnctl pipe 1 config bw 1mbits/s" .Dl "dnctl sched 1 config pipe 1 type fq_codel" .Dl "dnctl queue 1 config sched 1" .Dl "ipfw add 100 queue 1 ip from 192.168.0.0/24 to any" .Pp To change .Cm fq_codel default configuration for a .Cm sched such as disable ECN and change the .Ar target to 10ms, we do: .Pp .Dl "dnctl sched 1 config pipe 1 type fq_codel target 10ms noecn" .Pp Similar to .Cm fq_codel , to configure .Cm fq_pie scheduler using different configurations parameters for traffic from 192.168.0.0/24 and 1Mbits/s rate limit, we do: .Pp .Dl "dnctl pipe 1 config bw 1mbits/s" .Dl "dnctl sched 1 config pipe 1 type fq_pie" .Dl "dnctl queue 1 config sched 1" .Dl "ipfw add 100 queue 1 ip from 192.168.0.0/24 to any" .Pp The configurations of .Cm fq_pie .Cm sched can be changed in a similar way as for .Cm fq_codel .Sh SEE ALSO .Xr cpp 1 , .Xr m4 1 , .Xr fnmatch 3 , .Xr altq 4 , .Xr divert 4 , .Xr dummynet 4 , .Xr if_bridge 4 , .Xr ip 4 , .Xr ipfirewall 4 , .Xr ng_ether 4 , .Xr ng_ipfw 4 , .Xr protocols 5 , .Xr services 5 , .Xr init 8 , .Xr kldload 8 , .Xr reboot 8 , .Xr sysctl 8 , .Xr syslogd 8 , .Xr sysrc 8 .Sh HISTORY The .Nm utility first appeared in .Fx 2.0 . .Nm dummynet was introduced in .Fx 2.2.8 . Stateful extensions were introduced in .Fx 4.0 . .Nm ipfw2 was introduced in Summer 2002. .Sh AUTHORS .An Ugen J. S. Antsilevich , .An Poul-Henning Kamp , .An Alex Nash , .An Archie Cobbs , .An Luigi Rizzo , .An Rasool Al-Saadi . .Pp .An -nosplit API based upon code written by .An Daniel Boulet for BSDI. .Pp Dummynet has been introduced by Luigi Rizzo in 1997-1998. .Pp Some early work (1999-2000) on the .Nm dummynet traffic shaper supported by Akamba Corp. .Pp The ipfw core (ipfw2) has been completely redesigned and reimplemented by Luigi Rizzo in summer 2002. 
Further actions and options have been added by various developers over the years. .Pp .An -nosplit In-kernel NAT support written by .An Paolo Pisati Aq Mt piso@FreeBSD.org as part of a Summer of Code 2005 project. .Pp SCTP .Nm nat support has been developed by .An The Centre for Advanced Internet Architectures (CAIA) Aq http://www.caia.swin.edu.au . The primary developers and maintainers are David Hayes and Jason But. For further information visit: .Aq http://www.caia.swin.edu.au/urp/SONATA .Pp Delay profiles have been developed by Alessandro Cerri and Luigi Rizzo, supported by the European Commission within Projects Onelab and Onelab2. .Pp CoDel, PIE, FQ-CoDel and FQ-PIE AQM for Dummynet have been implemented by .An The Centre for Advanced Internet Architectures (CAIA) in 2016, supported by The Comcast Innovation Fund. The primary developer is Rasool Al-Saadi. .Sh BUGS The syntax has grown over the years and sometimes it might be confusing. Unfortunately, backward compatibility prevents cleaning up mistakes made in the definition of the syntax. .Pp .Em !!! WARNING !!! .Pp Misconfiguring the firewall can put your computer in an unusable state, possibly shutting down network services and requiring console access to regain control of it. .Pp Incoming packet fragments diverted by .Cm divert are reassembled before delivery to the socket. The action used on those packet is the one from the rule which matches the first fragment of the packet. .Pp Packets diverted to userland, and then reinserted by a userland process may lose various packet attributes. The packet source interface name will be preserved if it is shorter than 8 bytes and the userland process saves and reuses the sockaddr_in (as does .Xr natd 8 ) ; otherwise, it may be lost. If a packet is reinserted in this manner, later rules may be incorrectly applied, making the order of .Cm divert rules in the rule sequence very important. .Pp Dummynet drops all packets with IPv6 link-local addresses. 
.Pp Rules using .Cm uid or .Cm gid may not behave as expected. In particular, incoming SYN packets may have no uid or gid associated with them since they do not yet belong to a TCP connection, and the uid/gid associated with a packet may not be as expected if the associated process calls .Xr setuid 2 or similar system calls. .Pp Rule syntax is subject to the command line environment and some patterns may need to be escaped with the backslash character or quoted appropriately. .Pp Due to the architecture of .Xr libalias 3 , ipfw nat is not compatible with the TCP segmentation offloading (TSO). Thus, to reliably nat your network traffic, please disable TSO on your NICs using .Xr ifconfig 8 . .Pp ICMP error messages are not implicitly matched by dynamic rules for the respective conversations. To avoid failures of network error detection and path MTU discovery, ICMP error messages may need to be allowed explicitly through static rules. .Pp Rules using .Cm call and .Cm return actions may lead to confusing behaviour if ruleset has mistakes, and/or interaction with other subsystems (netgraph, dummynet, etc.) is used. One possible case for this is packet leaving .Nm in subroutine on the input pass, while later on output encountering unpaired .Cm return first. As the call stack is kept intact after input pass, packet will suddenly return to the rule number used on input pass, not on output one. Order of processing should be checked carefully to avoid such mistakes. 
diff --git a/sbin/ipfw/ipfw2.c b/sbin/ipfw/ipfw2.c index cc0970207749..be796808380f 100644 --- a/sbin/ipfw/ipfw2.c +++ b/sbin/ipfw/ipfw2.c @@ -1,5905 +1,6130 @@ /*- * Copyright (c) 2002-2003 Luigi Rizzo * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp * Copyright (c) 1994 Ugen J.S.Antsilevich * * Idea and grammar partially left from: * Copyright (c) 1993 Daniel Boulet * * Redistribution and use in source forms, with and without modification, * are permitted provided that this entire comment appears intact. * * Redistribution in binary form may occur without any restrictions. * Obviously, it would be nice if you gave credit where credit is due * but requiring it would be too onerous. * * This software is provided ``AS IS'' without any warranties of any kind. * * NEW command line interface for IP firewall facility */ #include #include #include #include #include #include "ipfw2.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* ctime */ #include /* _long_to_time */ #include #include #include /* offsetof */ #include #include /* only IFNAMSIZ */ #include #include /* only n_short, n_long */ #include #include #include #include #include struct cmdline_opts g_co; /* global options */ struct format_opts { int bcwidth; int pcwidth; int show_counters; int show_time; /* show timestamp */ uint32_t set_mask; /* enabled sets mask */ uint32_t flags; /* request flags */ uint32_t first; /* first rule to request */ uint32_t last; /* last rule to request */ uint32_t dcnt; /* number of dynamic states */ ipfw_obj_ctlv *tstate; /* table state data */ }; int resvd_set_number = RESVD_SET; static int ipfw_socket = -1; #define CHECK_LENGTH(v, len) do { \ if ((v) < (len)) \ errx(EX_DATAERR, "Rule too long"); \ } while (0) /* * Check if we have enough space in cmd buffer. Note that since * first 8? 
u32 words are reserved by reserved header, full cmd * buffer can't be used, so we need to protect from buffer overrun * only. At the beginning, cblen is less than actual buffer size by * size of ipfw_insn_u32 instruction + 1 u32 work. This eliminates need * for checking small instructions fitting in given range. * We also (ab)use the fact that ipfw_insn is always the first field * for any custom instruction. */ #define CHECK_CMDLEN CHECK_LENGTH(cblen, F_LEN((ipfw_insn *)cmd)) #define GET_UINT_ARG(arg, min, max, tok, s_x) do { \ if (!av[0]) \ errx(EX_USAGE, "%s: missing argument", match_value(s_x, tok)); \ if (_substrcmp(*av, "tablearg") == 0) { \ arg = IP_FW_TARG; \ break; \ } \ \ { \ long _xval; \ char *end; \ \ _xval = strtol(*av, &end, 10); \ \ if (!isdigit(**av) || *end != '\0' || (_xval == 0 && errno == EINVAL)) \ errx(EX_DATAERR, "%s: invalid argument: %s", \ match_value(s_x, tok), *av); \ \ if (errno == ERANGE || _xval < min || _xval > max) \ errx(EX_DATAERR, "%s: argument is out of range (%u..%u): %s", \ match_value(s_x, tok), min, max, *av); \ \ if (_xval == IP_FW_TARG) \ errx(EX_DATAERR, "%s: illegal argument value: %s", \ match_value(s_x, tok), *av); \ arg = _xval; \ } \ } while (0) static struct _s_x f_tcpflags[] = { { "syn", TH_SYN }, { "fin", TH_FIN }, { "ack", TH_ACK }, { "psh", TH_PUSH }, { "rst", TH_RST }, { "urg", TH_URG }, { "tcp flag", 0 }, { NULL, 0 } }; static struct _s_x f_tcpopts[] = { { "mss", IP_FW_TCPOPT_MSS }, { "maxseg", IP_FW_TCPOPT_MSS }, { "window", IP_FW_TCPOPT_WINDOW }, { "sack", IP_FW_TCPOPT_SACK }, { "ts", IP_FW_TCPOPT_TS }, { "timestamp", IP_FW_TCPOPT_TS }, { "cc", IP_FW_TCPOPT_CC }, { "tcp option", 0 }, { NULL, 0 } }; /* * IP options span the range 0 to 255 so we need to remap them * (though in fact only the low 5 bits are significant). 
*/ static struct _s_x f_ipopts[] = { { "ssrr", IP_FW_IPOPT_SSRR}, { "lsrr", IP_FW_IPOPT_LSRR}, { "rr", IP_FW_IPOPT_RR}, { "ts", IP_FW_IPOPT_TS}, { "ip option", 0 }, { NULL, 0 } }; static struct _s_x f_iptos[] = { { "lowdelay", IPTOS_LOWDELAY}, { "throughput", IPTOS_THROUGHPUT}, { "reliability", IPTOS_RELIABILITY}, { "mincost", IPTOS_MINCOST}, { "congestion", IPTOS_ECN_CE}, { "ecntransport", IPTOS_ECN_ECT0}, { "ip tos option", 0}, { NULL, 0 } }; static struct _s_x f_ipoff[] = { { "rf", IP_RF >> 8 }, { "df", IP_DF >> 8 }, { "mf", IP_MF >> 8 }, { "offset", 0x1 }, { NULL, 0} }; struct _s_x f_ipdscp[] = { { "af11", IPTOS_DSCP_AF11 >> 2 }, /* 001010 */ { "af12", IPTOS_DSCP_AF12 >> 2 }, /* 001100 */ { "af13", IPTOS_DSCP_AF13 >> 2 }, /* 001110 */ { "af21", IPTOS_DSCP_AF21 >> 2 }, /* 010010 */ { "af22", IPTOS_DSCP_AF22 >> 2 }, /* 010100 */ { "af23", IPTOS_DSCP_AF23 >> 2 }, /* 010110 */ { "af31", IPTOS_DSCP_AF31 >> 2 }, /* 011010 */ { "af32", IPTOS_DSCP_AF32 >> 2 }, /* 011100 */ { "af33", IPTOS_DSCP_AF33 >> 2 }, /* 011110 */ { "af41", IPTOS_DSCP_AF41 >> 2 }, /* 100010 */ { "af42", IPTOS_DSCP_AF42 >> 2 }, /* 100100 */ { "af43", IPTOS_DSCP_AF43 >> 2 }, /* 100110 */ { "be", IPTOS_DSCP_CS0 >> 2 }, /* 000000 */ { "va", IPTOS_DSCP_VA >> 2 }, /* 101100 */ { "ef", IPTOS_DSCP_EF >> 2 }, /* 101110 */ { "cs0", IPTOS_DSCP_CS0 >> 2 }, /* 000000 */ { "cs1", IPTOS_DSCP_CS1 >> 2 }, /* 001000 */ { "cs2", IPTOS_DSCP_CS2 >> 2 }, /* 010000 */ { "cs3", IPTOS_DSCP_CS3 >> 2 }, /* 011000 */ { "cs4", IPTOS_DSCP_CS4 >> 2 }, /* 100000 */ { "cs5", IPTOS_DSCP_CS5 >> 2 }, /* 101000 */ { "cs6", IPTOS_DSCP_CS6 >> 2 }, /* 110000 */ { "cs7", IPTOS_DSCP_CS7 >> 2 }, /* 100000 */ { NULL, 0 } }; static struct _s_x limit_masks[] = { {"all", DYN_SRC_ADDR|DYN_SRC_PORT|DYN_DST_ADDR|DYN_DST_PORT}, {"src-addr", DYN_SRC_ADDR}, {"src-port", DYN_SRC_PORT}, {"dst-addr", DYN_DST_ADDR}, {"dst-port", DYN_DST_PORT}, {NULL, 0} }; /* * we use IPPROTO_ETHERTYPE as a fake protocol id to call the print routines * This is only used 
in this code. */ #define IPPROTO_ETHERTYPE 0x1000 static struct _s_x ether_types[] = { /* * Note, we cannot use "-:&/" in the names because they are field * separators in the type specifications. Also, we use s = NULL as * end-delimiter, because a type of 0 can be legal. */ { "ip", 0x0800 }, { "ipv4", 0x0800 }, { "ipv6", 0x86dd }, { "arp", 0x0806 }, { "rarp", 0x8035 }, { "vlan", 0x8100 }, { "loop", 0x9000 }, { "trail", 0x1000 }, { "at", 0x809b }, { "atalk", 0x809b }, { "aarp", 0x80f3 }, { "pppoe_disc", 0x8863 }, { "pppoe_sess", 0x8864 }, { "ipx_8022", 0x00E0 }, { "ipx_8023", 0x0000 }, { "ipx_ii", 0x8137 }, { "ipx_snap", 0x8137 }, { "ipx", 0x8137 }, { "ns", 0x0600 }, { NULL, 0 } }; static struct _s_x rule_eactions[] = { { "nat64clat", TOK_NAT64CLAT }, { "nat64lsn", TOK_NAT64LSN }, { "nat64stl", TOK_NAT64STL }, { "nptv6", TOK_NPTV6 }, { "tcp-setmss", TOK_TCPSETMSS }, { NULL, 0 } /* terminator */ }; static struct _s_x rule_actions[] = { { "abort6", TOK_ABORT6 }, { "abort", TOK_ABORT }, { "accept", TOK_ACCEPT }, { "pass", TOK_ACCEPT }, { "allow", TOK_ACCEPT }, { "permit", TOK_ACCEPT }, { "count", TOK_COUNT }, { "pipe", TOK_PIPE }, { "queue", TOK_QUEUE }, { "divert", TOK_DIVERT }, { "tee", TOK_TEE }, { "netgraph", TOK_NETGRAPH }, { "ngtee", TOK_NGTEE }, { "fwd", TOK_FORWARD }, { "forward", TOK_FORWARD }, { "skipto", TOK_SKIPTO }, { "deny", TOK_DENY }, { "drop", TOK_DENY }, { "reject", TOK_REJECT }, { "reset6", TOK_RESET6 }, { "reset", TOK_RESET }, { "unreach6", TOK_UNREACH6 }, { "unreach", TOK_UNREACH }, { "check-state", TOK_CHECKSTATE }, { "//", TOK_COMMENT }, { "nat", TOK_NAT }, { "reass", TOK_REASS }, { "setfib", TOK_SETFIB }, { "setdscp", TOK_SETDSCP }, { "call", TOK_CALL }, { "return", TOK_RETURN }, { "eaction", TOK_EACTION }, { "tcp-setmss", TOK_TCPSETMSS }, { "setmark", TOK_SETMARK }, { NULL, 0 } /* terminator */ }; +static struct _s_x return_types[] = { + { "next-rulenum", RETURN_NEXT_RULENUM }, + { "next-rule", RETURN_NEXT_RULE }, + { NULL, 0 } /* terminator */ 
+}; + static struct _s_x rule_action_params[] = { { "altq", TOK_ALTQ }, { "log", TOK_LOG }, { "tag", TOK_TAG }, { "untag", TOK_UNTAG }, { NULL, 0 } /* terminator */ }; /* * The 'lookup' instruction accepts one of the following arguments. - * Arguments are passed as v[1] in O_DST_LOOKUP options. + * Arguments are passed as o.arg1 and o->value in O_DST_LOOKUP option. */ static struct _s_x lookup_keys[] = { { "dst-ip", LOOKUP_DST_IP }, { "src-ip", LOOKUP_SRC_IP }, { "dst-port", LOOKUP_DST_PORT }, { "src-port", LOOKUP_SRC_PORT }, { "dst-mac", LOOKUP_DST_MAC }, { "src-mac", LOOKUP_SRC_MAC }, { "uid", LOOKUP_UID }, { "jail", LOOKUP_JAIL }, { "dscp", LOOKUP_DSCP }, { "mark", LOOKUP_MARK }, + { "rulenum", LOOKUP_RULENUM }, { NULL, 0 }, }; +/* + * table(name,valuename=value) instruction accepts one of the + * following arguments as valuename. + */ +static struct _s_x tvalue_names[] = { + { "tag", TVALUE_TAG }, + { "pipe", TVALUE_PIPE }, + { "divert", TVALUE_DIVERT }, + { "skipto", TVALUE_SKIPTO }, + { "netgraph", TVALUE_NETGRAPH }, + { "fib", TVALUE_FIB }, + { "nat", TVALUE_NAT }, + { "nh4", TVALUE_NH4 }, + { "dscp", TVALUE_DSCP }, + { "limit", TVALUE_LIMIT }, + { "mark", TVALUE_MARK }, + { NULL, 0 } +}; + static struct _s_x rule_options[] = { { "tagged", TOK_TAGGED }, { "uid", TOK_UID }, { "gid", TOK_GID }, { "jail", TOK_JAIL }, { "in", TOK_IN }, { "limit", TOK_LIMIT }, { "set-limit", TOK_SETLIMIT }, { "keep-state", TOK_KEEPSTATE }, { "record-state", TOK_RECORDSTATE }, { "bridged", TOK_LAYER2 }, { "layer2", TOK_LAYER2 }, { "out", TOK_OUT }, { "diverted", TOK_DIVERTED }, { "diverted-loopback", TOK_DIVERTEDLOOPBACK }, { "diverted-output", TOK_DIVERTEDOUTPUT }, { "xmit", TOK_XMIT }, { "recv", TOK_RECV }, { "via", TOK_VIA }, { "fragment", TOK_FRAG }, { "frag", TOK_FRAG }, { "fib", TOK_FIB }, { "ipoptions", TOK_IPOPTS }, { "ipopts", TOK_IPOPTS }, { "iplen", TOK_IPLEN }, { "ipid", TOK_IPID }, { "ipprecedence", TOK_IPPRECEDENCE }, { "dscp", TOK_DSCP }, { "iptos", TOK_IPTOS }, { 
"ipttl", TOK_IPTTL }, { "ipversion", TOK_IPVER }, { "ipver", TOK_IPVER }, { "estab", TOK_ESTAB }, { "established", TOK_ESTAB }, { "setup", TOK_SETUP }, { "sockarg", TOK_SOCKARG }, { "tcpdatalen", TOK_TCPDATALEN }, { "tcpflags", TOK_TCPFLAGS }, { "tcpflgs", TOK_TCPFLAGS }, { "tcpmss", TOK_TCPMSS }, { "tcpoptions", TOK_TCPOPTS }, { "tcpopts", TOK_TCPOPTS }, { "tcpseq", TOK_TCPSEQ }, { "tcpack", TOK_TCPACK }, { "tcpwin", TOK_TCPWIN }, { "icmptype", TOK_ICMPTYPES }, { "icmptypes", TOK_ICMPTYPES }, { "dst-ip", TOK_DSTIP }, { "src-ip", TOK_SRCIP }, { "dst-port", TOK_DSTPORT }, { "src-port", TOK_SRCPORT }, { "dst-mac", TOK_DSTMAC }, { "src-mac", TOK_SRCMAC }, { "proto", TOK_PROTO }, { "MAC", TOK_MAC }, { "mac", TOK_MAC }, { "mac-type", TOK_MACTYPE }, { "verrevpath", TOK_VERREVPATH }, { "versrcreach", TOK_VERSRCREACH }, { "antispoof", TOK_ANTISPOOF }, { "ipsec", TOK_IPSEC }, { "icmp6type", TOK_ICMP6TYPES }, { "icmp6types", TOK_ICMP6TYPES }, { "ext6hdr", TOK_EXT6HDR }, { "flow-id", TOK_FLOWID }, { "ipv6", TOK_IPV6 }, { "ip6", TOK_IPV6 }, { "ipv4", TOK_IPV4 }, { "ip4", TOK_IPV4 }, { "dst-ipv6", TOK_DSTIP6 }, { "dst-ip6", TOK_DSTIP6 }, { "src-ipv6", TOK_SRCIP6 }, { "src-ip6", TOK_SRCIP6 }, { "lookup", TOK_LOOKUP }, { "flow", TOK_FLOW }, { "mark", TOK_MARK }, { "defer-action", TOK_SKIPACTION }, { "defer-immediate-action", TOK_SKIPACTION }, { "//", TOK_COMMENT }, { "not", TOK_NOT }, /* pseudo option */ { "!", /* escape ? 
*/ TOK_NOT }, /* pseudo option */ { "or", TOK_OR }, /* pseudo option */ { "|", /* escape */ TOK_OR }, /* pseudo option */ { "{", TOK_STARTBRACE }, /* pseudo option */ { "(", TOK_STARTBRACE }, /* pseudo option */ { "}", TOK_ENDBRACE }, /* pseudo option */ { ")", TOK_ENDBRACE }, /* pseudo option */ { NULL, 0 } /* terminator */ }; void bprint_uint_arg(struct buf_pr *bp, const char *str, uint32_t arg); static int ipfw_get_config(struct cmdline_opts *co, struct format_opts *fo, ipfw_cfg_lheader **pcfg, size_t *psize); static int ipfw_show_config(struct cmdline_opts *co, struct format_opts *fo, ipfw_cfg_lheader *cfg, size_t sz, int ac, char **av); static void ipfw_list_tifaces(void); struct tidx; -static uint16_t pack_object(struct tidx *tstate, const char *name, int otype); -static uint16_t pack_table(struct tidx *tstate, const char *name); +static uint32_t pack_object(struct tidx *tstate, const char *name, int otype); +static uint32_t pack_table(struct tidx *tstate, const char *name); -static char *table_search_ctlv(ipfw_obj_ctlv *ctlv, uint16_t idx); +static char *table_search_ctlv(ipfw_obj_ctlv *ctlv, uint32_t idx); static void object_sort_ctlv(ipfw_obj_ctlv *ctlv); -static char *object_search_ctlv(ipfw_obj_ctlv *ctlv, uint16_t idx, +static char *object_search_ctlv(ipfw_obj_ctlv *ctlv, uint32_t idx, uint16_t type); int is_ipfw(void) { return (g_co.prog == cmdline_prog_ipfw); } /* * Simple string buffer API. * Used to simplify buffer passing between function and for * transparent overrun handling. */ /* * Allocates new buffer of given size @sz. * * Returns 0 on success. */ int bp_alloc(struct buf_pr *b, size_t size) { memset(b, 0, sizeof(struct buf_pr)); if ((b->buf = calloc(1, size)) == NULL) return (ENOMEM); b->ptr = b->buf; b->size = size; b->avail = b->size; return (0); } void bp_free(struct buf_pr *b) { free(b->buf); } /* * Flushes buffer so new writer start from beginning. 
*/ void bp_flush(struct buf_pr *b) { b->ptr = b->buf; b->avail = b->size; b->buf[0] = '\0'; } /* * Print message specified by @format and args. * Automatically manage buffer space and transparently handle * buffer overruns. * * Returns number of bytes that should have been printed. */ int bprintf(struct buf_pr *b, const char *format, ...) { va_list args; int i; va_start(args, format); i = vsnprintf(b->ptr, b->avail, format, args); va_end(args); if (i < 0 || (size_t)i > b->avail) { /* Overflow or print error */ b->avail = 0; } else { b->ptr += i; b->avail -= i; } b->needed += i; return (i); } /* * Special values printer for tablearg-aware opcodes. */ void bprint_uint_arg(struct buf_pr *bp, const char *str, uint32_t arg) { if (str != NULL) bprintf(bp, "%s", str); if (arg == IP_FW_TARG) bprintf(bp, "tablearg"); else bprintf(bp, "%u", arg); } /* * Helper routine to print a possibly unaligned uint64_t on * various platform. If width > 0, print the value with * the desired width, followed by a space; * otherwise, return the required width. */ int pr_u64(struct buf_pr *b, void *pd, int width) { #ifdef TCC #define U64_FMT "I64" #else #define U64_FMT "llu" #endif uint64_t u; unsigned long long d; bcopy (pd, &u, sizeof(u)); d = u; return (width > 0) ? bprintf(b, "%*" U64_FMT " ", width, d) : snprintf(NULL, 0, "%" U64_FMT, d) ; #undef U64_FMT } void * safe_calloc(size_t number, size_t size) { void *ret = calloc(number, size); if (ret == NULL) err(EX_OSERR, "calloc"); return ret; } void * safe_realloc(void *ptr, size_t size) { void *ret = realloc(ptr, size); if (ret == NULL) err(EX_OSERR, "realloc"); return ret; } /* * Compare things like interface or table names. 
*/ int stringnum_cmp(const char *a, const char *b) { int la, lb; la = strlen(a); lb = strlen(b); if (la > lb) return (1); else if (la < lb) return (-01); return (strcmp(a, b)); } struct debug_header { uint16_t cmd_type; uint16_t spare1; uint32_t opt_name; uint32_t total_len; uint32_t spare2; }; /* * conditionally runs the command. * Selected options or negative -> getsockopt */ int do_cmd(int optname, void *optval, uintptr_t optlen) { int i; if (g_co.debug_only) { struct debug_header dbg = { .cmd_type = 1, .opt_name = optname, .total_len = optlen + sizeof(struct debug_header), }; write(1, &dbg, sizeof(dbg)); write(1, optval, optlen); } if (g_co.test_only) return (0); if (ipfw_socket == -1) ipfw_socket = socket(AF_INET, SOCK_RAW, IPPROTO_RAW); if (ipfw_socket < 0) err(EX_UNAVAILABLE, "socket"); if (optname == IP_FW_GET || optname == IP_DUMMYNET_GET || optname == IP_FW_ADD || optname == IP_FW3 || optname == IP_FW_NAT_GET_CONFIG || optname < 0 || optname == IP_FW_NAT_GET_LOG) { if (optname < 0) optname = -optname; i = getsockopt(ipfw_socket, IPPROTO_IP, optname, optval, (socklen_t *)optlen); } else { i = setsockopt(ipfw_socket, IPPROTO_IP, optname, optval, optlen); } return (i); } /* * do_set3 - pass ipfw control cmd to kernel * @optname: option name * @optval: pointer to option data * @optlen: option length * * Assumes op3 header is already embedded. * Calls setsockopt() with IP_FW3 as kernel-visible opcode. * Returns 0 on success or errno otherwise. 
*/ int do_set3(int optname, ip_fw3_opheader *op3, size_t optlen) { op3->opcode = optname; + op3->version = IP_FW3_OPVER; /* use last version */ if (g_co.debug_only) { struct debug_header dbg = { .cmd_type = 2, .opt_name = optname, .total_len = optlen, sizeof(struct debug_header), }; write(1, &dbg, sizeof(dbg)); write(1, op3, optlen); } if (g_co.test_only) return (0); if (ipfw_socket == -1) ipfw_socket = socket(AF_INET, SOCK_RAW, IPPROTO_RAW); if (ipfw_socket < 0) err(EX_UNAVAILABLE, "socket"); return (setsockopt(ipfw_socket, IPPROTO_IP, IP_FW3, op3, optlen)); } /* * do_get3 - pass ipfw control cmd to kernel * @optname: option name * @optval: pointer to option data * @optlen: pointer to option length * * Assumes op3 header is already embedded. * Calls getsockopt() with IP_FW3 as kernel-visible opcode. * Returns 0 on success or errno otherwise. */ int do_get3(int optname, ip_fw3_opheader *op3, size_t *optlen) { int error; socklen_t len; op3->opcode = optname; + op3->version = IP_FW3_OPVER; /* use last version */ if (g_co.debug_only) { struct debug_header dbg = { .cmd_type = 3, .opt_name = optname, .total_len = *optlen + sizeof(struct debug_header), }; write(1, &dbg, sizeof(dbg)); write(1, op3, *optlen); } if (g_co.test_only) return (0); if (ipfw_socket == -1) ipfw_socket = socket(AF_INET, SOCK_RAW, IPPROTO_RAW); if (ipfw_socket < 0) err(EX_UNAVAILABLE, "socket"); len = *optlen; error = getsockopt(ipfw_socket, IPPROTO_IP, IP_FW3, op3, &len); *optlen = len; return (error); } /** * match_token takes a table and a string, returns the value associated * with the string (-1 in case of failure). */ int match_token(struct _s_x *table, const char *string) { struct _s_x *pt; uint i = strlen(string); for (pt = table ; i && pt->s != NULL ; pt++) if (strlen(pt->s) == i && !bcmp(string, pt->s, i)) return pt->x; return (-1); } /** * match_token_relaxed takes a table and a string, returns the value associated * with the string for the best match. 
* * Returns: * value from @table for matched records * -1 for non-matched records * -2 if more than one records match @string. */ int match_token_relaxed(struct _s_x *table, const char *string) { struct _s_x *pt, *m; int i, c; i = strlen(string); c = 0; for (pt = table ; i != 0 && pt->s != NULL ; pt++) { if (strncmp(pt->s, string, i) != 0) continue; m = pt; c++; } if (c == 1) return (m->x); return (c > 0 ? -2: -1); } int get_token(struct _s_x *table, const char *string, const char *errbase) { int tcmd; if ((tcmd = match_token_relaxed(table, string)) < 0) errx(EX_USAGE, "%s %s %s", (tcmd == 0) ? "invalid" : "ambiguous", errbase, string); return (tcmd); } /** * match_value takes a table and a value, returns the string associated * with the value (NULL in case of failure). */ char const * match_value(struct _s_x *p, int value) { for (; p->s != NULL; p++) if (p->x == value) return p->s; return NULL; } size_t concat_tokens(char *buf, size_t bufsize, struct _s_x *table, const char *delimiter) { struct _s_x *pt; int l; size_t sz; for (sz = 0, pt = table ; pt->s != NULL; pt++) { l = snprintf(buf + sz, bufsize - sz, "%s%s", (sz == 0) ? "" : delimiter, pt->s); sz += l; bufsize += l; if (sz > bufsize) return (bufsize); } return (sz); } /* * helper function to process a set of flags and set bits in the * appropriate masks. 
*/ int fill_flags(struct _s_x *flags, char *p, char **e, uint32_t *set, uint32_t *clear) { char *q; /* points to the separator */ int val; uint32_t *which; /* mask we are working on */ while (p && *p) { if (*p == '!') { p++; which = clear; } else which = set; q = strchr(p, ','); if (q) *q++ = '\0'; val = match_token(flags, p); if (val <= 0) { if (e != NULL) *e = p; return (-1); } *which |= (uint32_t)val; p = q; } return (0); } void print_flags_buffer(char *buf, size_t sz, struct _s_x *list, uint32_t set) { char const *comma = ""; int i, l; for (i = 0; list[i].x != 0; i++) { if ((set & list[i].x) == 0) continue; set &= ~list[i].x; l = snprintf(buf, sz, "%s%s", comma, list[i].s); if (l < 0 || (size_t)l >= sz) return; comma = ","; buf += l; sz -=l; } } /* * _substrcmp takes two strings and returns 1 if they do not match, * and 0 if they match exactly or the first string is a sub-string * of the second. A warning is printed to stderr in the case that the * first string is a sub-string of the second. * * This function will be removed in the future through the usual * deprecation process. */ int _substrcmp(const char *str1, const char* str2) { if (strncmp(str1, str2, strlen(str1)) != 0) return 1; if (strlen(str1) != strlen(str2)) warnx("DEPRECATED: '%s' matched '%s' as a sub-string", str1, str2); return 0; } /* * _substrcmp2 takes three strings and returns 1 if the first two do not match, * and 0 if they match exactly or the second string is a sub-string * of the first. A warning is printed to stderr in the case that the * first string does not match the third. * * This function exists to warn about the bizarre construction * strncmp(str, "by", 2) which is used to allow people to use a shortcut * for "bytes". The problem is that in addition to accepting "by", * "byt", "byte", and "bytes", it also excepts "by_rabid_dogs" and any * other string beginning with "by". * * This function will be removed in the future through the usual * deprecation process. 
 */
int
_substrcmp2(const char *str1, const char* str2, const char* str3)
{

	if (strncmp(str1, str2, strlen(str2)) != 0)
		return 1;

	if (strcmp(str1, str3) != 0)
		warnx("DEPRECATED: '%s' matched '%s'",
		    str1, str3);
	return 0;
}

/*
 * prints one port, symbolic or numeric
 */
static void
print_port(struct buf_pr *bp, int proto, uint16_t port)
{

	if (proto == IPPROTO_ETHERTYPE) {
		char const *s;

		/* ether types print symbolically only under -N resolution */
		if (g_co.do_resolv && (s = match_value(ether_types, port)) )
			bprintf(bp, "%s", s);
		else
			bprintf(bp, "0x%04x", port);
	} else {
		struct servent *se = NULL;

		if (g_co.do_resolv) {
			struct protoent *pe = getprotobynumber(proto);

			se = getservbyport(htons(port), pe ? pe->p_name : NULL);
		}
		if (se)
			bprintf(bp, "%s", se->s_name);
		else
			bprintf(bp, "%d", port);
	}
}

/* Maps port-list opcodes to the keyword printed before the list. */
static struct _s_x _port_name[] = {
	{"dst-port",	O_IP_DSTPORT},
	{"src-port",	O_IP_SRCPORT},
	{"ipid",	O_IPID},
	{"iplen",	O_IPLEN},
	{"ipttl",	O_IPTTL},
	{"mac-type",	O_MAC_TYPE},
	{"tcpdatalen",	O_TCPDATALEN},
	{"tcpmss",	O_TCPMSS},
	{"tcpwin",	O_TCPWIN},
	{"tagged",	O_TAGGED},
	{NULL,		0}
};

/*
 * Print the values in a list 16-bit items of the types above.
 * XXX todo: add support for mask.
 */
static void
print_newports(struct buf_pr *bp, const ipfw_insn_u16 *cmd, int proto,
    int opcode)
{
	const uint16_t *p = cmd->ports;
	int i;
	char const *sep;

	if (opcode != 0) {
		/* prefix the list with its keyword, e.g. " dst-port" */
		sep = match_value(_port_name, opcode);
		if (sep == NULL)
			sep = "???";
		bprintf(bp, " %s", sep);
	}
	sep = " ";
	/* ports[] holds pairs: p[0]==p[1] is a single port, else a range */
	for (i = F_LEN((const ipfw_insn *)cmd) - 1; i > 0; i--, p += 2) {
		bprintf(bp, "%s", sep);
		print_port(bp, proto, p[0]);
		if (p[0] != p[1]) {
			bprintf(bp, "-");
			print_port(bp, proto, p[1]);
		}
		sep = ",";
	}
}

/*
 * Like strtol, but also translates service names into port numbers
 * for some protocols.
 * In particular:
 *	proto == -1 disables the protocol check;
 *	proto == IPPROTO_ETHERTYPE looks up an internal table
 *	proto == (others) matches the values there.
 * Returns *end == s in case the parameter is not found.
*/ static int strtoport(char *s, char **end, int base, int proto) { char *p, *buf; char *s1; int i; *end = s; /* default - not found */ if (*s == '\0') return 0; /* not found */ if (isdigit(*s)) return strtol(s, end, base); /* * find separator. '\\' escapes the next char. */ for (s1 = s; *s1 && (isalnum(*s1) || *s1 == '\\' || *s1 == '_' || *s1 == '.') ; s1++) if (*s1 == '\\' && s1[1] != '\0') s1++; buf = safe_calloc(s1 - s + 1, 1); /* * copy into a buffer skipping backslashes */ for (p = s, i = 0; p != s1 ; p++) if (*p != '\\') buf[i++] = *p; buf[i++] = '\0'; if (proto == IPPROTO_ETHERTYPE) { i = match_token(ether_types, buf); free(buf); if (i != -1) { /* found */ *end = s1; return i; } } else { struct protoent *pe = NULL; struct servent *se; if (proto != 0) pe = getprotobynumber(proto); setservent(1); se = getservbyname(buf, pe ? pe->p_name : NULL); free(buf); if (se != NULL) { *end = s1; return ntohs(se->s_port); } } return 0; /* not found */ } /* * Fill the body of the command with the list of port ranges. */ static int fill_newports(ipfw_insn_u16 *cmd, char *av, int proto, int cblen) { uint16_t a, b, *p = cmd->ports; int i = 0; char *s = av; while (*s) { a = strtoport(av, &s, 0, proto); if (s == av) /* empty or invalid argument */ return (0); CHECK_LENGTH(cblen, i + 2); switch (*s) { case '-': /* a range */ av = s + 1; b = strtoport(av, &s, 0, proto); /* Reject expressions like '1-abc' or '1-2-3'. */ if (s == av || (*s != ',' && *s != '\0')) return (0); p[0] = a; p[1] = b; break; case ',': /* comma separated list */ case '\0': p[0] = p[1] = a; break; default: warnx("port list: invalid separator <%c> in <%s>", *s, av); return (0); } i++; p += 2; av = s + 1; } if (i > 0) { if (i + 1 > F_LEN_MASK) errx(EX_DATAERR, "too many ports/ranges\n"); cmd->o.len |= i + 1; /* leave F_NOT and F_OR untouched */ } return (i); } /* * Fill the body of the command with the list of DiffServ codepoints. 
*/ static void fill_dscp(ipfw_insn *cmd, char *av, int cblen) { uint32_t *low, *high; char *s = av, *a; int code; cmd->opcode = O_DSCP; cmd->len |= F_INSN_SIZE(ipfw_insn_u32) + 1; CHECK_CMDLEN; low = (uint32_t *)(cmd + 1); high = low + 1; *low = 0; *high = 0; while (s != NULL) { a = strchr(s, ','); if (a != NULL) *a++ = '\0'; if (isalpha(*s)) { if ((code = match_token(f_ipdscp, s)) == -1) errx(EX_DATAERR, "Unknown DSCP code"); } else { code = strtoul(s, NULL, 10); if (code < 0 || code > 63) errx(EX_DATAERR, "Invalid DSCP value"); } if (code >= 32) *high |= 1 << (code - 32); else *low |= 1 << code; s = a; } } /* * Fill the body of the command with mark value and mask. */ static void fill_mark(ipfw_insn *cmd, char *av, int cblen) { uint32_t *value, *mask; char *value_str; cmd->opcode = O_MARK; cmd->len |= F_INSN_SIZE(ipfw_insn_u32) + 1; CHECK_CMDLEN; value = (uint32_t *)(cmd + 1); mask = value + 1; value_str = strsep(&av, ":"); if (strcmp(value_str, "tablearg") == 0) { cmd->arg1 = IP_FW_TARG; *value = 0; } else { /* This is not a tablearg */ cmd->arg1 |= 0x8000; *value = strtoul(value_str, NULL, 0); } if (av) *mask = strtoul(av, NULL, 0); else *mask = 0xFFFFFFFF; if ((*value & *mask) != *value) errx(EX_DATAERR, "Static mark value: some bits in value are" " set that will be masked out by mask " "(%#x & %#x) = %#x != %#x", *value, *mask, (*value & *mask), *value); } static struct _s_x icmpcodes[] = { { "net", ICMP_UNREACH_NET }, { "host", ICMP_UNREACH_HOST }, { "protocol", ICMP_UNREACH_PROTOCOL }, { "port", ICMP_UNREACH_PORT }, { "needfrag", ICMP_UNREACH_NEEDFRAG }, { "srcfail", ICMP_UNREACH_SRCFAIL }, { "net-unknown", ICMP_UNREACH_NET_UNKNOWN }, { "host-unknown", ICMP_UNREACH_HOST_UNKNOWN }, { "isolated", ICMP_UNREACH_ISOLATED }, { "net-prohib", ICMP_UNREACH_NET_PROHIB }, { "host-prohib", ICMP_UNREACH_HOST_PROHIB }, { "tosnet", ICMP_UNREACH_TOSNET }, { "toshost", ICMP_UNREACH_TOSHOST }, { "filter-prohib", ICMP_UNREACH_FILTER_PROHIB }, { "host-precedence", 
ICMP_UNREACH_HOST_PRECEDENCE }, { "precedence-cutoff", ICMP_UNREACH_PRECEDENCE_CUTOFF }, { NULL, 0 } }; static uint16_t get_reject_code(const char *str) { int val; char *s; val = strtoul(str, &s, 0); if (s == str || *s != '\0' || val >= 0x100) val = match_token(icmpcodes, str); if (val < 0) errx(EX_DATAERR, "unknown ICMP unreachable code ``%s''", str); return (val); } static void print_reject_code(struct buf_pr *bp, uint16_t code) { char const *s; if ((s = match_value(icmpcodes, code)) != NULL) bprintf(bp, "unreach %s", s); else bprintf(bp, "unreach %u", code); } /* * Returns the number of bits set (from left) in a contiguous bitmask, * or -1 if the mask is not contiguous. * XXX this needs a proper fix. * This effectively works on masks in big-endian (network) format. * when compiled on little endian architectures. * * First bit is bit 7 of the first byte -- note, for MAC addresses, * the first bit on the wire is bit 0 of the first byte. * len is the max length in bits. */ int contigmask(const uint8_t *p, int len) { int i, n; for (i=0; iarg1 & 0xff; uint8_t clear = (cmd->arg1 >> 8) & 0xff; if (list == f_tcpflags && set == TH_SYN && clear == TH_ACK) { bprintf(bp, " setup"); return; } bprintf(bp, " %s ", name); for (i=0; list[i].x != 0; i++) { if (set & list[i].x) { set &= ~list[i].x; bprintf(bp, "%s%s", comma, list[i].s); comma = ","; } if (clear & list[i].x) { clear &= ~list[i].x; bprintf(bp, "%s!%s", comma, list[i].s); comma = ","; } } } +static void +print_tvalue(struct buf_pr *bp, const ipfw_insn_table *cmd) +{ + const char *name; + + name = match_value(tvalue_names, IPFW_TVALUE_TYPE(&cmd->o)); + bprintf(bp, ",%s=%u", name != NULL ? name: "", cmd->value); +} + /* * Print the ip address contained in a command. 
*/ static void print_ip(struct buf_pr *bp, const struct format_opts *fo, const ipfw_insn_ip *cmd) { struct hostent *he = NULL; const struct in_addr *ia; const uint32_t *a = ((const ipfw_insn_u32 *)cmd)->d; - uint32_t len = F_LEN((const ipfw_insn *)cmd); + uint32_t len = F_LEN(&cmd->o); char *t; bprintf(bp, " "); - if (cmd->o.opcode == O_IP_DST_LOOKUP && len > F_INSN_SIZE(ipfw_insn_u32)) { - const char *arg; - - arg = match_value(lookup_keys, a[1]); - t = table_search_ctlv(fo->tstate, - ((const ipfw_insn *)cmd)->arg1); - bprintf(bp, "lookup %s %s", arg, t); - return; - } - if (cmd->o.opcode == O_IP_SRC_ME || cmd->o.opcode == O_IP_DST_ME) { + switch (cmd->o.opcode) { + case O_IP_SRC_ME: + case O_IP_DST_ME: bprintf(bp, "me"); return; - } - if (cmd->o.opcode == O_IP_SRC_LOOKUP || - cmd->o.opcode == O_IP_DST_LOOKUP) { + + case O_IP_DST_LOOKUP: + if ((len == F_INSN_SIZE(ipfw_insn_kidx) || + len == F_INSN_SIZE(ipfw_insn_table)) && + IPFW_LOOKUP_TYPE(&cmd->o) != LOOKUP_NONE) { + const char *key; + + key = match_value(lookup_keys, + IPFW_LOOKUP_TYPE(&cmd->o)); + t = table_search_ctlv(fo->tstate, + insntoc(&cmd->o, kidx)->kidx); + if (len == F_INSN_SIZE(ipfw_insn_table)) { + bprintf(bp, "lookup %s:%#x %s", + (key != NULL ? key : ""), + insntoc(&cmd->o, table)->value, t); + } else + bprintf(bp, "lookup %s %s", key != NULL ? 
key: + "", t); + return; + } + /* FALLTHROUGH */ + case O_IP_SRC_LOOKUP: t = table_search_ctlv(fo->tstate, - ((const ipfw_insn *)cmd)->arg1); + insntoc(&cmd->o, kidx)->kidx); bprintf(bp, "table(%s", t); - if (len == F_INSN_SIZE(ipfw_insn_u32)) - bprintf(bp, ",%u", *a); + if (len == F_INSN_SIZE(ipfw_insn_table)) + print_tvalue(bp, insntoc(&cmd->o, table)); bprintf(bp, ")"); return; } + if (cmd->o.opcode == O_IP_SRC_SET || cmd->o.opcode == O_IP_DST_SET) { const uint32_t *map = (const uint32_t *)&cmd->mask; struct in_addr addr; uint32_t x; int i, j; char comma = '{'; x = cmd->o.arg1 - 1; x = htonl(~x); addr.s_addr = htonl(cmd->addr.s_addr); bprintf(bp, "%s/%d", inet_ntoa(addr), contigmask((uint8_t *)&x, 32)); x = cmd->addr.s_addr; x &= 0xff; /* base */ /* * Print bits and ranges. * Locate first bit set (i), then locate first bit unset (j). * If we have 3+ consecutive bits set, then print them as a * range, otherwise only print the initial bit and rescan. */ for (i=0; i < cmd->o.arg1; i++) if (map[i/32] & (1<<(i & 31))) { for (j=i+1; j < cmd->o.arg1; j++) if (!(map[ j/32] & (1<<(j & 31)))) break; bprintf(bp, "%c%d", comma, i+x); if (j>i+2) { /* range has at least 3 elements */ bprintf(bp, "-%d", j-1+x); i = j-1; } comma = ','; } bprintf(bp, "}"); return; } /* * len == 2 indicates a single IP, whereas lists of 1 or more * addr/mask pairs have len = (2n+1). We convert len to n so we * use that to count the number of entries. */ for (len = len / 2; len > 0; len--, a += 2) { int mb = /* mask length */ (cmd->o.opcode == O_IP_SRC || cmd->o.opcode == O_IP_DST) ? 
32 : contigmask((const uint8_t *)&(a[1]), 32); if (mb == 32 && g_co.do_resolv) he = gethostbyaddr((const char *)&(a[0]), sizeof(in_addr_t), AF_INET); if (he != NULL) /* resolved to name */ bprintf(bp, "%s", he->h_name); else if (mb == 0) /* any */ bprintf(bp, "any"); else { /* numeric IP followed by some kind of mask */ ia = (const struct in_addr *)&a[0]; bprintf(bp, "%s", inet_ntoa(*ia)); if (mb < 0) { ia = (const struct in_addr *)&a[1]; bprintf(bp, ":%s", inet_ntoa(*ia)); } else if (mb < 32) bprintf(bp, "/%d", mb); } if (len > 1) bprintf(bp, ","); } } /* * prints a MAC address/mask pair */ static void format_mac(struct buf_pr *bp, const uint8_t *addr, const uint8_t *mask) { int l = contigmask(mask, 48); if (l == 0) bprintf(bp, " any"); else { bprintf(bp, " %02x:%02x:%02x:%02x:%02x:%02x", addr[0], addr[1], addr[2], addr[3], addr[4], addr[5]); if (l == -1) bprintf(bp, "&%02x:%02x:%02x:%02x:%02x:%02x", mask[0], mask[1], mask[2], mask[3], mask[4], mask[5]); else if (l < 48) bprintf(bp, "/%d", l); } } static void print_mac(struct buf_pr *bp, const ipfw_insn_mac *mac) { bprintf(bp, " MAC"); format_mac(bp, mac->addr, mac->mask); format_mac(bp, mac->addr + 6, mac->mask + 6); } static void print_mac_lookup(struct buf_pr *bp, const struct format_opts *fo, const ipfw_insn *cmd) { uint32_t len = F_LEN(cmd); char *t; bprintf(bp, " "); - t = table_search_ctlv(fo->tstate, cmd->arg1); + t = table_search_ctlv(fo->tstate, insntoc(cmd, kidx)->kidx); bprintf(bp, "table(%s", t); - if (len == F_INSN_SIZE(ipfw_insn_u32)) - bprintf(bp, ",%u", ((const ipfw_insn_u32 *)cmd)->d[0]); + if (len == F_INSN_SIZE(ipfw_insn_table)) + print_tvalue(bp, insntoc(cmd, table)); bprintf(bp, ")"); } static void fill_icmptypes(ipfw_insn_u32 *cmd, char *av) { uint8_t type; cmd->d[0] = 0; while (*av) { if (*av == ',') av++; type = strtoul(av, &av, 0); if (*av != ',' && *av != '\0') errx(EX_DATAERR, "invalid ICMP type"); if (type > 31) errx(EX_DATAERR, "ICMP type out of range"); cmd->d[0] |= 1 << type; } 
cmd->o.opcode = O_ICMPTYPE; cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32); } static void print_icmptypes(struct buf_pr *bp, const ipfw_insn_u32 *cmd) { int i; char sep= ' '; bprintf(bp, " icmptypes"); for (i = 0; i < 32; i++) { if ( (cmd->d[0] & (1 << (i))) == 0) continue; bprintf(bp, "%c%d", sep, i); sep = ','; } } static void print_dscp(struct buf_pr *bp, const ipfw_insn_u32 *cmd) { const uint32_t *v; const char *code; int i = 0; char sep= ' '; bprintf(bp, " dscp"); v = cmd->d; while (i < 64) { if (*v & (1 << i)) { if ((code = match_value(f_ipdscp, i)) != NULL) bprintf(bp, "%c%s", sep, code); else bprintf(bp, "%c%d", sep, i); sep = ','; } if ((++i % 32) == 0) v++; } } -#define insntod(cmd, type) ((const ipfw_insn_ ## type *)(cmd)) struct show_state { struct ip_fw_rule *rule; - const ipfw_insn *eaction; + const ipfw_insn_kidx *eaction; uint8_t *printed; int flags; #define HAVE_PROTO 0x0001 #define HAVE_SRCIP 0x0002 #define HAVE_DSTIP 0x0004 #define HAVE_PROBE_STATE 0x0008 int proto; int or_block; }; static int init_show_state(struct show_state *state, struct ip_fw_rule *rule) { state->printed = calloc(rule->cmd_len, sizeof(uint8_t)); if (state->printed == NULL) return (ENOMEM); state->rule = rule; state->eaction = NULL; state->flags = 0; state->proto = 0; state->or_block = 0; return (0); } static void free_show_state(struct show_state *state) { free(state->printed); } static uint8_t is_printed_opcode(struct show_state *state, const ipfw_insn *cmd) { return (state->printed[cmd - state->rule->cmd]); } static void mark_printed(struct show_state *state, const ipfw_insn *cmd) { state->printed[cmd - state->rule->cmd] = 1; } static void print_limit_mask(struct buf_pr *bp, const ipfw_insn_limit *limit) { struct _s_x *p = limit_masks; char const *comma = " "; uint8_t x; for (x = limit->limit_mask; p->x != 0; p++) { if ((x & p->x) == p->x) { x &= ~p->x; bprintf(bp, "%s%s", comma, p->s); comma = ","; } } bprint_uint_arg(bp, " ", limit->conn_limit); } static int 
print_instruction(struct buf_pr *bp, const struct format_opts *fo, struct show_state *state, const ipfw_insn *cmd) { struct protoent *pe; struct passwd *pwd; struct group *grp; const char *s; double d; if (is_printed_opcode(state, cmd)) return (0); if ((cmd->len & F_OR) != 0 && state->or_block == 0) bprintf(bp, " {"); if (cmd->opcode != O_IN && (cmd->len & F_NOT) != 0) bprintf(bp, " not"); switch (cmd->opcode) { case O_PROB: - d = 1.0 * insntod(cmd, u32)->d[0] / 0x7fffffff; + d = 1.0 * insntoc(cmd, u32)->d[0] / 0x7fffffff; bprintf(bp, "prob %f ", d); break; case O_PROBE_STATE: /* no need to print anything here */ state->flags |= HAVE_PROBE_STATE; break; case O_IP_SRC: case O_IP_SRC_LOOKUP: case O_IP_SRC_MASK: case O_IP_SRC_ME: case O_IP_SRC_SET: if (state->flags & HAVE_SRCIP) bprintf(bp, " src-ip"); - print_ip(bp, fo, insntod(cmd, ip)); + print_ip(bp, fo, insntoc(cmd, ip)); break; case O_IP_DST: - case O_IP_DST_LOOKUP: case O_IP_DST_MASK: case O_IP_DST_ME: case O_IP_DST_SET: - if (state->flags & HAVE_DSTIP) + case O_IP_DST_LOOKUP: + /* + * Special handling for O_IP_DST_LOOKUP when + * lookup type is not LOOKUP_NONE. 
+ */ + if ((state->flags & HAVE_DSTIP) != 0 && ( + cmd->opcode != O_IP_DST_LOOKUP || + IPFW_LOOKUP_TYPE(cmd) == LOOKUP_NONE)) bprintf(bp, " dst-ip"); - print_ip(bp, fo, insntod(cmd, ip)); + print_ip(bp, fo, insntoc(cmd, ip)); break; case O_IP6_SRC: case O_IP6_SRC_MASK: case O_IP6_SRC_ME: if (state->flags & HAVE_SRCIP) bprintf(bp, " src-ip6"); - print_ip6(bp, insntod(cmd, ip6)); + print_ip6(bp, insntoc(cmd, ip6)); break; case O_IP6_DST: case O_IP6_DST_MASK: case O_IP6_DST_ME: if (state->flags & HAVE_DSTIP) bprintf(bp, " dst-ip6"); - print_ip6(bp, insntod(cmd, ip6)); + print_ip6(bp, insntoc(cmd, ip6)); break; case O_MAC_SRC_LOOKUP: bprintf(bp, " src-mac"); print_mac_lookup(bp, fo, cmd); break; case O_MAC_DST_LOOKUP: bprintf(bp, " dst-mac"); print_mac_lookup(bp, fo, cmd); break; case O_FLOW6ID: - print_flow6id(bp, insntod(cmd, u32)); + print_flow6id(bp, insntoc(cmd, u32)); break; case O_IP_DSTPORT: case O_IP_SRCPORT: - print_newports(bp, insntod(cmd, u16), state->proto, + print_newports(bp, insntoc(cmd, u16), state->proto, (state->flags & (HAVE_SRCIP | HAVE_DSTIP)) == (HAVE_SRCIP | HAVE_DSTIP) ? cmd->opcode: 0); break; case O_PROTO: pe = getprotobynumber(cmd->arg1); if (state->flags & HAVE_PROTO) bprintf(bp, " proto"); if (pe != NULL) bprintf(bp, " %s", pe->p_name); else bprintf(bp, " %u", cmd->arg1); state->proto = cmd->arg1; break; case O_MACADDR2: - print_mac(bp, insntod(cmd, mac)); + print_mac(bp, insntoc(cmd, mac)); break; case O_MAC_TYPE: - print_newports(bp, insntod(cmd, u16), + print_newports(bp, insntoc(cmd, u16), IPPROTO_ETHERTYPE, cmd->opcode); break; case O_FRAG: print_flags(bp, "frag", cmd, f_ipoff); break; case O_FIB: bprintf(bp, " fib %u", cmd->arg1); break; case O_SOCKARG: bprintf(bp, " sockarg"); break; case O_IN: bprintf(bp, cmd->len & F_NOT ? 
" out" : " in"); break; case O_DIVERTED: switch (cmd->arg1) { case 3: bprintf(bp, " diverted"); break; case 2: bprintf(bp, " diverted-output"); break; case 1: bprintf(bp, " diverted-loopback"); break; default: bprintf(bp, " diverted-?<%u>", cmd->arg1); break; } break; case O_LAYER2: bprintf(bp, " layer2"); break; case O_XMIT: case O_RECV: case O_VIA: if (cmd->opcode == O_XMIT) s = "xmit"; else if (cmd->opcode == O_RECV) s = "recv"; else /* if (cmd->opcode == O_VIA) */ s = "via"; - switch (insntod(cmd, if)->name[0]) { + switch (insntoc(cmd, if)->name[0]) { case '\0': bprintf(bp, " %s %s", s, - inet_ntoa(insntod(cmd, if)->p.ip)); + inet_ntoa(insntoc(cmd, if)->p.ip)); break; case '\1': bprintf(bp, " %s table(%s)", s, table_search_ctlv(fo->tstate, - insntod(cmd, if)->p.kidx)); + insntoc(cmd, if)->p.kidx)); break; default: bprintf(bp, " %s %s", s, - insntod(cmd, if)->name); + insntoc(cmd, if)->name); } break; case O_IP_FLOW_LOOKUP: - s = table_search_ctlv(fo->tstate, cmd->arg1); + s = table_search_ctlv(fo->tstate, + insntoc(cmd, kidx)->kidx); bprintf(bp, " flow table(%s", s); - if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn_u32)) - bprintf(bp, ",%u", insntod(cmd, u32)->d[0]); + if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn_table)) + print_tvalue(bp, insntoc(cmd, table)); bprintf(bp, ")"); break; case O_IPID: case O_IPTTL: case O_IPLEN: case O_TCPDATALEN: case O_TCPMSS: case O_TCPWIN: if (F_LEN(cmd) == 1) { switch (cmd->opcode) { case O_IPID: s = "ipid"; break; case O_IPTTL: s = "ipttl"; break; case O_IPLEN: s = "iplen"; break; case O_TCPDATALEN: s = "tcpdatalen"; break; case O_TCPMSS: s = "tcpmss"; break; case O_TCPWIN: s = "tcpwin"; break; default: s = ""; break; } bprintf(bp, " %s %u", s, cmd->arg1); } else - print_newports(bp, insntod(cmd, u16), 0, + print_newports(bp, insntoc(cmd, u16), 0, cmd->opcode); break; case O_IPVER: bprintf(bp, " ipver %u", cmd->arg1); break; case O_IPPRECEDENCE: bprintf(bp, " ipprecedence %u", cmd->arg1 >> 5); break; case O_DSCP: - print_dscp(bp, 
insntod(cmd, u32)); + print_dscp(bp, insntoc(cmd, u32)); break; case O_IPOPT: print_flags(bp, "ipoptions", cmd, f_ipopts); break; case O_IPTOS: print_flags(bp, "iptos", cmd, f_iptos); break; case O_ICMPTYPE: - print_icmptypes(bp, insntod(cmd, u32)); + print_icmptypes(bp, insntoc(cmd, u32)); break; case O_ESTAB: bprintf(bp, " established"); break; case O_TCPFLAGS: print_flags(bp, "tcpflags", cmd, f_tcpflags); break; case O_TCPOPTS: print_flags(bp, "tcpoptions", cmd, f_tcpopts); break; case O_TCPACK: bprintf(bp, " tcpack %d", - ntohl(insntod(cmd, u32)->d[0])); + ntohl(insntoc(cmd, u32)->d[0])); break; case O_TCPSEQ: bprintf(bp, " tcpseq %d", - ntohl(insntod(cmd, u32)->d[0])); + ntohl(insntoc(cmd, u32)->d[0])); break; case O_UID: - pwd = getpwuid(insntod(cmd, u32)->d[0]); + pwd = getpwuid(insntoc(cmd, u32)->d[0]); if (pwd != NULL) bprintf(bp, " uid %s", pwd->pw_name); else bprintf(bp, " uid %u", - insntod(cmd, u32)->d[0]); + insntoc(cmd, u32)->d[0]); break; case O_GID: - grp = getgrgid(insntod(cmd, u32)->d[0]); + grp = getgrgid(insntoc(cmd, u32)->d[0]); if (grp != NULL) bprintf(bp, " gid %s", grp->gr_name); else bprintf(bp, " gid %u", - insntod(cmd, u32)->d[0]); + insntoc(cmd, u32)->d[0]); break; case O_JAIL: - bprintf(bp, " jail %d", insntod(cmd, u32)->d[0]); + bprintf(bp, " jail %d", insntoc(cmd, u32)->d[0]); break; case O_VERREVPATH: bprintf(bp, " verrevpath"); break; case O_VERSRCREACH: bprintf(bp, " versrcreach"); break; case O_ANTISPOOF: bprintf(bp, " antispoof"); break; case O_IPSEC: bprintf(bp, " ipsec"); break; case O_NOP: bprintf(bp, " // %s", (const char *)(cmd + 1)); break; case O_KEEP_STATE: if (state->flags & HAVE_PROBE_STATE) bprintf(bp, " keep-state"); else bprintf(bp, " record-state"); bprintf(bp, " :%s", - object_search_ctlv(fo->tstate, cmd->arg1, + object_search_ctlv(fo->tstate, + insntoc(cmd, kidx)->kidx, IPFW_TLV_STATE_NAME)); break; case O_LIMIT: if (state->flags & HAVE_PROBE_STATE) bprintf(bp, " limit"); else bprintf(bp, " set-limit"); - 
print_limit_mask(bp, insntod(cmd, limit)); + print_limit_mask(bp, insntoc(cmd, limit)); bprintf(bp, " :%s", - object_search_ctlv(fo->tstate, cmd->arg1, + object_search_ctlv(fo->tstate, + insntoc(cmd, kidx)->kidx, IPFW_TLV_STATE_NAME)); break; case O_IP6: if (state->flags & HAVE_PROTO) bprintf(bp, " proto"); bprintf(bp, " ip6"); break; case O_IP4: if (state->flags & HAVE_PROTO) bprintf(bp, " proto"); bprintf(bp, " ip4"); break; case O_ICMP6TYPE: - print_icmp6types(bp, insntod(cmd, u32)); + print_icmp6types(bp, insntoc(cmd, u32)); break; case O_EXT_HDR: print_ext6hdr(bp, cmd); break; case O_TAGGED: if (F_LEN(cmd) == 1) bprint_uint_arg(bp, " tagged ", cmd->arg1); else - print_newports(bp, insntod(cmd, u16), + print_newports(bp, insntoc(cmd, u16), 0, O_TAGGED); break; case O_SKIP_ACTION: bprintf(bp, " defer-immediate-action"); break; case O_MARK: bprintf(bp, " mark"); if (cmd->arg1 == IP_FW_TARG) bprintf(bp, " tablearg"); else - bprintf(bp, " %#x", - ((const ipfw_insn_u32 *)cmd)->d[0]); + bprintf(bp, " %#x", insntoc(cmd, u32)->d[0]); - if (((const ipfw_insn_u32 *)cmd)->d[1] != 0xFFFFFFFF) - bprintf(bp, ":%#x", - ((const ipfw_insn_u32 *)cmd)->d[1]); + if (insntoc(cmd, u32)->d[1] != 0xFFFFFFFF) + bprintf(bp, ":%#x", insntoc(cmd, u32)->d[1]); break; default: bprintf(bp, " [opcode %d len %d]", cmd->opcode, cmd->len); } if (cmd->len & F_OR) { bprintf(bp, " or"); state->or_block = 1; } else if (state->or_block != 0) { bprintf(bp, " }"); state->or_block = 0; } mark_printed(state, cmd); return (1); } static ipfw_insn * print_opcode(struct buf_pr *bp, struct format_opts *fo, struct show_state *state, int opcode) { ipfw_insn *cmd; int l; for (l = state->rule->act_ofs, cmd = state->rule->cmd; l > 0; l -= F_LEN(cmd), cmd += F_LEN(cmd)) { /* We use zero opcode to print the rest of options */ if (opcode >= 0 && cmd->opcode != opcode) continue; /* * Skip O_NOP, when we printing the rest * of options, it will be handled separately. 
*/ if (cmd->opcode == O_NOP && opcode != O_NOP) continue; if (!print_instruction(bp, fo, state, cmd)) continue; return (cmd); } return (NULL); } static void print_fwd(struct buf_pr *bp, const ipfw_insn *cmd) { char buf[INET6_ADDRSTRLEN + IF_NAMESIZE + 2]; const ipfw_insn_sa6 *sa6; const ipfw_insn_sa *sa; uint16_t port; if (cmd->opcode == O_FORWARD_IP) { - sa = insntod(cmd, sa); + sa = insntoc(cmd, sa); port = sa->sa.sin_port; if (sa->sa.sin_addr.s_addr == INADDR_ANY) bprintf(bp, "fwd tablearg"); else bprintf(bp, "fwd %s", inet_ntoa(sa->sa.sin_addr)); } else { - sa6 = insntod(cmd, sa6); + sa6 = insntoc(cmd, sa6); port = sa6->sa.sin6_port; bprintf(bp, "fwd "); if (getnameinfo((const struct sockaddr *)&sa6->sa, sizeof(struct sockaddr_in6), buf, sizeof(buf), NULL, 0, NI_NUMERICHOST) == 0) bprintf(bp, "%s", buf); } if (port != 0) bprintf(bp, ",%u", port); } +static void +print_logdst(struct buf_pr *bp, uint16_t arg1) +{ + char const *comma = ""; + + bprintf(bp, " logdst ", arg1); + if (arg1 & IPFW_LOG_SYSLOG) { + bprintf(bp, "%ssyslog", comma); + comma = ","; + } + if (arg1 & IPFW_LOG_IPFW0) { + bprintf(bp, "%sipfw0", comma); + comma = ","; + } + if (arg1 & IPFW_LOG_RTSOCK) { + bprintf(bp, "%srtsock", comma); + comma = ","; + } +} + static int print_action_instruction(struct buf_pr *bp, const struct format_opts *fo, struct show_state *state, const ipfw_insn *cmd) { const char *s; if (is_printed_opcode(state, cmd)) return (0); switch (cmd->opcode) { case O_CHECK_STATE: bprintf(bp, "check-state"); - if (cmd->arg1 != 0) - s = object_search_ctlv(fo->tstate, cmd->arg1, + if (insntoc(cmd, kidx)->kidx != 0) + s = object_search_ctlv(fo->tstate, + insntoc(cmd, kidx)->kidx, IPFW_TLV_STATE_NAME); else s = NULL; bprintf(bp, " :%s", s ? 
s: "any"); break; case O_ACCEPT: bprintf(bp, "allow"); break; case O_COUNT: bprintf(bp, "count"); break; case O_DENY: bprintf(bp, "deny"); break; case O_REJECT: if (cmd->arg1 == ICMP_REJECT_RST) bprintf(bp, "reset"); else if (cmd->arg1 == ICMP_REJECT_ABORT) bprintf(bp, "abort"); else if (cmd->arg1 == ICMP_UNREACH_HOST) bprintf(bp, "reject"); else if (cmd->arg1 == ICMP_UNREACH_NEEDFRAG && cmd->len == F_INSN_SIZE(ipfw_insn_u16)) bprintf(bp, "needfrag %u", - ((const ipfw_insn_u16 *)cmd)->ports[0]); + insntoc(cmd, u16)->ports[0]); else print_reject_code(bp, cmd->arg1); break; case O_UNREACH6: if (cmd->arg1 == ICMP6_UNREACH_RST) bprintf(bp, "reset6"); else if (cmd->arg1 == ICMP6_UNREACH_ABORT) bprintf(bp, "abort6"); else print_unreach6_code(bp, cmd->arg1); break; case O_SKIPTO: - bprint_uint_arg(bp, "skipto ", cmd->arg1); + bprint_uint_arg(bp, "skipto ", insntoc(cmd, u32)->d[0]); break; case O_PIPE: bprint_uint_arg(bp, "pipe ", cmd->arg1); break; case O_QUEUE: bprint_uint_arg(bp, "queue ", cmd->arg1); break; case O_DIVERT: bprint_uint_arg(bp, "divert ", cmd->arg1); break; case O_TEE: bprint_uint_arg(bp, "tee ", cmd->arg1); break; case O_NETGRAPH: bprint_uint_arg(bp, "netgraph ", cmd->arg1); break; case O_NGTEE: bprint_uint_arg(bp, "ngtee ", cmd->arg1); break; case O_FORWARD_IP: case O_FORWARD_IP6: print_fwd(bp, cmd); break; case O_LOG: - if (insntod(cmd, log)->max_log > 0) - bprintf(bp, " log logamount %d", - insntod(cmd, log)->max_log); - else - bprintf(bp, " log"); + bprintf(bp, " log"); + if (insntoc(cmd, log)->max_log > 0) + bprintf(bp, " logamount %d", + insntoc(cmd, log)->max_log); + if (cmd->arg1 != IPFW_LOG_DEFAULT) + print_logdst(bp, cmd->arg1); break; case O_ALTQ: #ifndef NO_ALTQ - print_altq_cmd(bp, insntod(cmd, altq)); + print_altq_cmd(bp, insntoc(cmd, altq)); #endif break; case O_TAG: bprint_uint_arg(bp, cmd->len & F_NOT ? 
" untag ": " tag ", cmd->arg1); break; case O_NAT: if (cmd->arg1 != IP_FW_NAT44_GLOBAL) bprint_uint_arg(bp, "nat ", cmd->arg1); else bprintf(bp, "nat global"); break; case O_SETFIB: if (cmd->arg1 == IP_FW_TARG) bprint_uint_arg(bp, "setfib ", cmd->arg1); else bprintf(bp, "setfib %u", cmd->arg1 & 0x7FFF); break; case O_EXTERNAL_ACTION: /* * The external action can consists of two following * each other opcodes - O_EXTERNAL_ACTION and * O_EXTERNAL_INSTANCE. The first contains the ID of * name of external action. The second contains the ID * of name of external action instance. * NOTE: in case when external action has no named * instances support, the second opcode isn't needed. */ - state->eaction = cmd; - s = object_search_ctlv(fo->tstate, cmd->arg1, + state->eaction = insntoc(cmd, kidx); + s = object_search_ctlv(fo->tstate, + state->eaction->kidx, IPFW_TLV_EACTION); if (match_token(rule_eactions, s) != -1) bprintf(bp, "%s", s); else bprintf(bp, "eaction %s", s); break; case O_EXTERNAL_INSTANCE: if (state->eaction == NULL) break; /* * XXX: we need to teach ipfw(9) to rewrite opcodes * in the user buffer on rule addition. When we add * the rule, we specify zero TLV type for * O_EXTERNAL_INSTANCE object. To show correct * rule after `ipfw add` we need to search instance * name with zero type. But when we do `ipfw show` * we calculate TLV type using IPFW_TLV_EACTION_NAME() * macro. */ - s = object_search_ctlv(fo->tstate, cmd->arg1, 0); + s = object_search_ctlv(fo->tstate, + insntoc(cmd, kidx)->kidx, 0); if (s == NULL) s = object_search_ctlv(fo->tstate, - cmd->arg1, IPFW_TLV_EACTION_NAME( - state->eaction->arg1)); + insntoc(cmd, kidx)->kidx, IPFW_TLV_EACTION_NAME( + state->eaction->kidx)); bprintf(bp, " %s", s); break; case O_EXTERNAL_DATA: if (state->eaction == NULL) break; /* * Currently we support data formatting only for * external data with datalen u16. For unknown data * print its size in bytes. 
*/ if (cmd->len == F_INSN_SIZE(ipfw_insn)) bprintf(bp, " %u", cmd->arg1); else bprintf(bp, " %ubytes", cmd->len * sizeof(uint32_t)); break; case O_SETDSCP: if (cmd->arg1 == IP_FW_TARG) { bprintf(bp, "setdscp tablearg"); break; } s = match_value(f_ipdscp, cmd->arg1 & 0x3F); if (s != NULL) bprintf(bp, "setdscp %s", s); else bprintf(bp, "setdscp %u", cmd->arg1 & 0x3F); break; case O_REASS: bprintf(bp, "reass"); break; case O_CALLRETURN: - if (cmd->len & F_NOT) - bprintf(bp, "return"); - else - bprint_uint_arg(bp, "call ", cmd->arg1); + if (cmd->len & F_NOT) { + s = match_value(return_types, cmd->arg1); + bprintf(bp, "return %s", s ? s: ""); + } else + bprint_uint_arg(bp, "call ", insntoc(cmd, u32)->d[0]); break; case O_SETMARK: if (cmd->arg1 == IP_FW_TARG) { bprintf(bp, "setmark tablearg"); break; } - bprintf(bp, "setmark %#x", ((const ipfw_insn_u32 *)cmd)->d[0]); + bprintf(bp, "setmark %#x", insntoc(cmd, u32)->d[0]); break; default: bprintf(bp, "** unrecognized action %d len %d ", cmd->opcode, cmd->len); } mark_printed(state, cmd); return (1); } static ipfw_insn * print_action(struct buf_pr *bp, struct format_opts *fo, struct show_state *state, uint8_t opcode) { ipfw_insn *cmd; int l; for (l = state->rule->cmd_len - state->rule->act_ofs, cmd = ACTION_PTR(state->rule); l > 0; l -= F_LEN(cmd), cmd += F_LEN(cmd)) { if (cmd->opcode != opcode) continue; if (!print_action_instruction(bp, fo, state, cmd)) continue; return (cmd); } return (NULL); } static void print_proto(struct buf_pr *bp, struct format_opts *fo, struct show_state *state) { ipfw_insn *cmd; int l, proto, ip4, ip6; /* Count all O_PROTO, O_IP4, O_IP6 instructions. 
*/ proto = ip4 = ip6 = 0; for (l = state->rule->act_ofs, cmd = state->rule->cmd; l > 0; l -= F_LEN(cmd), cmd += F_LEN(cmd)) { switch (cmd->opcode) { case O_PROTO: proto++; break; case O_IP4: ip4 = 1; if (cmd->len & F_OR) ip4++; break; case O_IP6: ip6 = 1; if (cmd->len & F_OR) ip6++; break; default: continue; } } if (proto == 0 && ip4 == 0 && ip6 == 0) { state->proto = IPPROTO_IP; state->flags |= HAVE_PROTO; bprintf(bp, " ip"); return; } /* To handle the case { ip4 or ip6 }, print opcode with F_OR first */ cmd = NULL; if (ip4 || ip6) cmd = print_opcode(bp, fo, state, ip4 > ip6 ? O_IP4: O_IP6); if (cmd != NULL && (cmd->len & F_OR)) cmd = print_opcode(bp, fo, state, ip4 > ip6 ? O_IP6: O_IP4); if (cmd == NULL || (cmd->len & F_OR)) for (l = proto; l > 0; l--) { cmd = print_opcode(bp, fo, state, O_PROTO); if (cmd == NULL || (cmd->len & F_OR) == 0) break; } /* Initialize proto, it is used by print_newports() */ state->flags |= HAVE_PROTO; if (state->proto == 0 && ip6 != 0) state->proto = IPPROTO_IPV6; } static int match_opcode(int opcode, const int opcodes[], size_t nops) { size_t i; for (i = 0; i < nops; i++) if (opcode == opcodes[i]) return (1); return (0); } static void print_address(struct buf_pr *bp, struct format_opts *fo, struct show_state *state, const int opcodes[], size_t nops, int portop, int flag) { ipfw_insn *cmd; int count, l, portcnt, pf; count = portcnt = 0; for (l = state->rule->act_ofs, cmd = state->rule->cmd; l > 0; l -= F_LEN(cmd), cmd += F_LEN(cmd)) { - if (match_opcode(cmd->opcode, opcodes, nops)) + if (match_opcode(cmd->opcode, opcodes, nops)) { + /* + * Special handling for O_IP_DST_LOOKUP when + * lookup type is not LOOKUP_NONE. 
+ */ + if (cmd->opcode == O_IP_DST_LOOKUP && + IPFW_LOOKUP_TYPE(cmd) != LOOKUP_NONE) + continue; count++; - else if (cmd->opcode == portop) + } else if (cmd->opcode == portop) portcnt++; } if (count == 0) bprintf(bp, " any"); for (l = state->rule->act_ofs, cmd = state->rule->cmd; l > 0 && count > 0; l -= F_LEN(cmd), cmd += F_LEN(cmd)) { if (!match_opcode(cmd->opcode, opcodes, nops)) continue; print_instruction(bp, fo, state, cmd); if ((cmd->len & F_OR) == 0) break; count--; } /* * If several O_IP_?PORT opcodes specified, leave them to the * options section. */ if (portcnt == 1) { for (l = state->rule->act_ofs, cmd = state->rule->cmd, pf = 0; l > 0; l -= F_LEN(cmd), cmd += F_LEN(cmd)) { if (cmd->opcode != portop) { pf = (cmd->len & F_OR); continue; } /* Print opcode iff it is not in OR block. */ if (pf == 0 && (cmd->len & F_OR) == 0) print_instruction(bp, fo, state, cmd); break; } } state->flags |= flag; } static const int action_opcodes[] = { O_CHECK_STATE, O_ACCEPT, O_COUNT, O_DENY, O_REJECT, O_UNREACH6, O_SKIPTO, O_PIPE, O_QUEUE, O_DIVERT, O_TEE, O_NETGRAPH, O_NGTEE, O_FORWARD_IP, O_FORWARD_IP6, O_NAT, O_SETFIB, O_SETDSCP, O_REASS, O_CALLRETURN, O_SETMARK, /* keep the following opcodes at the end of the list */ O_EXTERNAL_ACTION, O_EXTERNAL_INSTANCE, O_EXTERNAL_DATA }; static const int modifier_opcodes[] = { O_LOG, O_ALTQ, O_TAG }; static const int src_opcodes[] = { O_IP_SRC, O_IP_SRC_LOOKUP, O_IP_SRC_MASK, O_IP_SRC_ME, O_IP_SRC_SET, O_IP6_SRC, O_IP6_SRC_MASK, O_IP6_SRC_ME }; static const int dst_opcodes[] = { O_IP_DST, O_IP_DST_LOOKUP, O_IP_DST_MASK, O_IP_DST_ME, O_IP_DST_SET, O_IP6_DST, O_IP6_DST_MASK, O_IP6_DST_ME }; +#if IPFW_DEFAULT_RULE > 65535 +#define RULENUM_FORMAT "%06d" +#else +#define RULENUM_FORMAT "%05d" +#endif + static void show_static_rule(struct cmdline_opts *co, struct format_opts *fo, struct buf_pr *bp, struct ip_fw_rule *rule, struct ip_fw_bcounter *cntr) { static int twidth = 0; struct show_state state; ipfw_insn *cmd; size_t i; /* Print # 
DISABLED or skip the rule */ if ((fo->set_mask & (1 << rule->set)) == 0) { /* disabled mask */ if (!co->show_sets) return; else bprintf(bp, "# DISABLED "); } if (init_show_state(&state, rule) != 0) { warn("init_show_state() failed"); return; } - bprintf(bp, "%05u ", rule->rulenum); + + bprintf(bp, RULENUM_FORMAT " ", rule->rulenum); /* only if counters are available */ if (cntr != NULL) { /* Print counters if enabled */ if (fo->pcwidth > 0 || fo->bcwidth > 0) { pr_u64(bp, &cntr->pcnt, fo->pcwidth); pr_u64(bp, &cntr->bcnt, fo->bcwidth); } /* Print timestamp */ if (co->do_time == TIMESTAMP_NUMERIC) bprintf(bp, "%10u ", cntr->timestamp); else if (co->do_time == TIMESTAMP_STRING) { char timestr[30]; time_t t = (time_t)0; if (twidth == 0) { strcpy(timestr, ctime(&t)); *strchr(timestr, '\n') = '\0'; twidth = strlen(timestr); } if (cntr->timestamp > 0) { t = _long_to_time(cntr->timestamp); strcpy(timestr, ctime(&t)); *strchr(timestr, '\n') = '\0'; bprintf(bp, "%s ", timestr); } else { bprintf(bp, "%*s ", twidth, ""); } } } /* Print set number */ if (co->show_sets) bprintf(bp, "set %d ", rule->set); /* Print the optional "match probability" */ cmd = print_opcode(bp, fo, &state, O_PROB); /* Print rule action */ for (i = 0; i < nitems(action_opcodes); i++) { cmd = print_action(bp, fo, &state, action_opcodes[i]); if (cmd == NULL) continue; /* Handle special cases */ switch (cmd->opcode) { case O_CHECK_STATE: goto end; case O_EXTERNAL_ACTION: case O_EXTERNAL_INSTANCE: /* External action can have several instructions */ continue; } break; } /* Print rule modifiers */ for (i = 0; i < nitems(modifier_opcodes); i++) print_action(bp, fo, &state, modifier_opcodes[i]); /* * Print rule body */ if (co->comment_only != 0) goto end; if (rule->flags & IPFW_RULE_JUSTOPTS) { state.flags |= HAVE_PROTO | HAVE_SRCIP | HAVE_DSTIP; /* * Print `proto ip` if all opcodes has been already printed. 
*/ if (memchr(state.printed, 0, rule->act_ofs) == NULL) { bprintf(bp, " proto ip"); goto end; } goto justopts; } print_proto(bp, fo, &state); if (co->do_compact != 0 && (rule->flags & IPFW_RULE_NOOPT)) goto justopts; /* Print source */ bprintf(bp, " from"); print_address(bp, fo, &state, src_opcodes, nitems(src_opcodes), O_IP_SRCPORT, HAVE_SRCIP); /* Print destination */ bprintf(bp, " to"); print_address(bp, fo, &state, dst_opcodes, nitems(dst_opcodes), O_IP_DSTPORT, HAVE_DSTIP); justopts: /* Print the rest of options */ while (print_opcode(bp, fo, &state, -1)) ; end: /* Print comment at the end */ cmd = print_opcode(bp, fo, &state, O_NOP); if (co->comment_only != 0 && cmd == NULL) bprintf(bp, " // ..."); bprintf(bp, "\n"); free_show_state(&state); } static void show_dyn_state(struct cmdline_opts *co, struct format_opts *fo, struct buf_pr *bp, ipfw_dyn_rule *d) { struct protoent *pe; struct in_addr a; - uint16_t rulenum; char buf[INET6_ADDRSTRLEN]; - if (d->expire == 0 && d->dyn_type != O_LIMIT_PARENT) + if (!d->expire && !(d->type == O_LIMIT_PARENT)) return; - bcopy(&d->rule, &rulenum, sizeof(rulenum)); - bprintf(bp, "%05d", rulenum); + bprintf(bp, RULENUM_FORMAT, d->rulenum); if (fo->pcwidth > 0 || fo->bcwidth > 0) { bprintf(bp, " "); pr_u64(bp, &d->pcnt, fo->pcwidth); pr_u64(bp, &d->bcnt, fo->bcwidth); bprintf(bp, "(%ds)", d->expire); } - switch (d->dyn_type) { + switch (d->type) { case O_LIMIT_PARENT: - bprintf(bp, " PARENT %d", d->count); + bprintf(bp, " PARENT %u", d->count); break; case O_LIMIT: bprintf(bp, " LIMIT"); break; case O_KEEP_STATE: /* bidir, no mask */ bprintf(bp, " STATE"); break; } if ((pe = getprotobynumber(d->id.proto)) != NULL) bprintf(bp, " %s", pe->p_name); else bprintf(bp, " proto %u", d->id.proto); if (d->id.addr_type == 4) { a.s_addr = htonl(d->id.src_ip); bprintf(bp, " %s %d", inet_ntoa(a), d->id.src_port); a.s_addr = htonl(d->id.dst_ip); bprintf(bp, " <-> %s %d", inet_ntoa(a), d->id.dst_port); } else if (d->id.addr_type == 6) { 
bprintf(bp, " %s %d", inet_ntop(AF_INET6, &d->id.src_ip6, buf, sizeof(buf)), d->id.src_port); bprintf(bp, " <-> %s %d", inet_ntop(AF_INET6, &d->id.dst_ip6, buf, sizeof(buf)), d->id.dst_port); } else bprintf(bp, " UNKNOWN <-> UNKNOWN"); if (d->kidx != 0) bprintf(bp, " :%s", object_search_ctlv(fo->tstate, d->kidx, IPFW_TLV_STATE_NAME)); #define BOTH_SYN (TH_SYN | (TH_SYN << 8)) #define BOTH_FIN (TH_FIN | (TH_FIN << 8)) if (co->verbose) { bprintf(bp, " state 0x%08x%s", d->state, d->state ? " ": ","); if (d->state & IPFW_DYN_ORPHANED) bprintf(bp, "ORPHANED,"); if ((d->state & BOTH_SYN) == BOTH_SYN) bprintf(bp, "BOTH_SYN,"); else { if (d->state & TH_SYN) bprintf(bp, "F_SYN,"); if (d->state & (TH_SYN << 8)) bprintf(bp, "R_SYN,"); } if ((d->state & BOTH_FIN) == BOTH_FIN) bprintf(bp, "BOTH_FIN,"); else { if (d->state & TH_FIN) bprintf(bp, "F_FIN,"); if (d->state & (TH_FIN << 8)) bprintf(bp, "R_FIN,"); } bprintf(bp, " f_ack 0x%x, r_ack 0x%x", d->ack_fwd, d->ack_rev); } } static int do_range_cmd(int cmd, ipfw_range_tlv *rt) { ipfw_range_header rh; size_t sz; memset(&rh, 0, sizeof(rh)); memcpy(&rh.range, rt, sizeof(*rt)); rh.range.head.length = sizeof(*rt); rh.range.head.type = IPFW_TLV_RANGE; sz = sizeof(rh); if (do_get3(cmd, &rh.opheader, &sz) != 0) return (-1); /* Save number of matched objects */ rt->new_set = rh.range.new_set; return (0); } /* * This one handles all set-related commands * ipfw set { show | enable | disable } * ipfw set swap X Y * ipfw set move X to Y * ipfw set move rule X to Y */ void ipfw_sets_handler(char *av[]) { ipfw_range_tlv rt; const char *msg; size_t size; - uint32_t masks[2]; + uint32_t masks[2], rulenum; int i; - uint16_t rulenum; uint8_t cmd; av++; memset(&rt, 0, sizeof(rt)); if (av[0] == NULL) errx(EX_USAGE, "set needs command"); if (_substrcmp(*av, "show") == 0) { struct format_opts fo; ipfw_cfg_lheader *cfg; memset(&fo, 0, sizeof(fo)); if (ipfw_get_config(&g_co, &fo, &cfg, &size) != 0) err(EX_OSERR, "requesting config failed"); for (i = 0, 
msg = "disable"; i < RESVD_SET; i++) if ((cfg->set_mask & (1<set_mask != (uint32_t)-1) ? " enable" : "enable"; for (i = 0; i < RESVD_SET; i++) if ((cfg->set_mask & (1< RESVD_SET) errx(EX_DATAERR, "invalid set number %s\n", av[0]); if (!isdigit(*(av[1])) || rt.new_set > RESVD_SET) errx(EX_DATAERR, "invalid set number %s\n", av[1]); i = do_range_cmd(IP_FW_SET_SWAP, &rt); } else if (_substrcmp(*av, "move") == 0) { av++; if (av[0] && _substrcmp(*av, "rule") == 0) { rt.flags = IPFW_RCFLAG_RANGE; /* move rules to new set */ cmd = IP_FW_XMOVE; av++; } else cmd = IP_FW_SET_MOVE; /* Move set to new one */ if (av[0] == NULL || av[1] == NULL || av[2] == NULL || av[3] != NULL || _substrcmp(av[1], "to") != 0) errx(EX_USAGE, "syntax: set move [rule] X to Y\n"); - rulenum = atoi(av[0]); + rulenum = (uint32_t)strtoul(av[0], NULL, 10); rt.new_set = atoi(av[2]); if (cmd == IP_FW_XMOVE) { rt.start_rule = rulenum; rt.end_rule = rulenum; } else rt.set = rulenum; rt.new_set = atoi(av[2]); if (!isdigit(*(av[0])) || (cmd == 3 && rt.set > RESVD_SET) || (cmd == 2 && rt.start_rule == IPFW_DEFAULT_RULE) ) errx(EX_DATAERR, "invalid source number %s\n", av[0]); if (!isdigit(*(av[2])) || rt.new_set > RESVD_SET) errx(EX_DATAERR, "invalid dest. set %s\n", av[1]); i = do_range_cmd(cmd, &rt); if (i < 0) err(EX_OSERR, "failed to move %s", cmd == IP_FW_SET_MOVE ? "set": "rule"); } else if (_substrcmp(*av, "disable") == 0 || _substrcmp(*av, "enable") == 0 ) { int which = _substrcmp(*av, "enable") == 0 ? 
1 : 0; av++; masks[0] = masks[1] = 0; while (av[0]) { if (isdigit(**av)) { i = atoi(*av); if (i < 0 || i > RESVD_SET) errx(EX_DATAERR, "invalid set number %d\n", i); masks[which] |= (1<dcnt++; if (fo->show_counters == 0) return; - if (co->use_set) { - /* skip states from another set */ - bcopy((char *)&d->rule + sizeof(uint16_t), &set, - sizeof(uint8_t)); - if (set != co->use_set - 1) - return; - } + /* skip states from another set */ + if (co->use_set != 0 && d->set != co->use_set - 1) + return; width = pr_u64(NULL, &d->pcnt, 0); if (width > fo->pcwidth) fo->pcwidth = width; width = pr_u64(NULL, &d->bcnt, 0); if (width > fo->bcwidth) fo->bcwidth = width; } static int foreach_state(struct cmdline_opts *co, struct format_opts *fo, caddr_t base, size_t sz, state_cb dyn_bc, void *dyn_arg) { int ttype; state_cb *fptr; void *farg; ipfw_obj_tlv *tlv; ipfw_obj_ctlv *ctlv; fptr = NULL; ttype = 0; while (sz > 0) { ctlv = (ipfw_obj_ctlv *)base; switch (ctlv->head.type) { case IPFW_TLV_DYNSTATE_LIST: base += sizeof(*ctlv); sz -= sizeof(*ctlv); ttype = IPFW_TLV_DYN_ENT; fptr = dyn_bc; farg = dyn_arg; break; default: return (sz); } while (sz > 0) { tlv = (ipfw_obj_tlv *)base; if (tlv->type != ttype) break; fptr(co, fo, farg, tlv + 1); sz -= tlv->length; base += tlv->length; } } return (sz); } static void prepare_format_opts(struct cmdline_opts *co, struct format_opts *fo, ipfw_obj_tlv *rtlv, int rcnt, caddr_t dynbase, size_t dynsz) { int bcwidth, pcwidth, width; int n; struct ip_fw_bcounter *cntr; struct ip_fw_rule *r; bcwidth = 0; pcwidth = 0; if (fo->show_counters != 0) { for (n = 0; n < rcnt; n++, rtlv = (ipfw_obj_tlv *)((caddr_t)rtlv + rtlv->length)) { cntr = (struct ip_fw_bcounter *)(rtlv + 1); r = (struct ip_fw_rule *)((caddr_t)cntr + cntr->size); /* skip rules from another set */ if (co->use_set && r->set != co->use_set - 1) continue; /* packet counter */ width = pr_u64(NULL, &cntr->pcnt, 0); if (width > pcwidth) pcwidth = width; /* byte counter */ width = pr_u64(NULL, 
&cntr->bcnt, 0); if (width > bcwidth) bcwidth = width; } } fo->bcwidth = bcwidth; fo->pcwidth = pcwidth; fo->dcnt = 0; if (co->do_dynamic && dynsz > 0) foreach_state(co, fo, dynbase, dynsz, prepare_format_dyn, NULL); } static int list_static_range(struct cmdline_opts *co, struct format_opts *fo, struct buf_pr *bp, ipfw_obj_tlv *rtlv, int rcnt) { int n, seen; struct ip_fw_rule *r; struct ip_fw_bcounter *cntr; for (n = seen = 0; n < rcnt; n++, rtlv = (ipfw_obj_tlv *)((caddr_t)rtlv + rtlv->length)) { if ((fo->show_counters | fo->show_time) != 0) { cntr = (struct ip_fw_bcounter *)(rtlv + 1); r = (struct ip_fw_rule *)((caddr_t)cntr + cntr->size); } else { cntr = NULL; r = (struct ip_fw_rule *)(rtlv + 1); } if (r->rulenum > fo->last) break; if (co->use_set && r->set != co->use_set - 1) continue; if (r->rulenum >= fo->first && r->rulenum <= fo->last) { show_static_rule(co, fo, bp, r, cntr); printf("%s", bp->buf); bp_flush(bp); seen++; } } return (seen); } static void list_dyn_state(struct cmdline_opts *co, struct format_opts *fo, void *_arg, void *_state) { - uint16_t rulenum; - uint8_t set; ipfw_dyn_rule *d; struct buf_pr *bp; d = (ipfw_dyn_rule *)_state; bp = (struct buf_pr *)_arg; - bcopy(&d->rule, &rulenum, sizeof(rulenum)); - if (rulenum > fo->last) + if (d->rulenum > fo->last) return; - if (co->use_set) { - bcopy((char *)&d->rule + sizeof(uint16_t), - &set, sizeof(uint8_t)); - if (set != co->use_set - 1) - return; - } - if (rulenum >= fo->first) { + if (co->use_set != 0 && d->set != co->use_set - 1) + return; + if (d->rulenum >= fo->first) { show_dyn_state(co, fo, bp, d); printf("%s\n", bp->buf); bp_flush(bp); } } static int list_dyn_range(struct cmdline_opts *co, struct format_opts *fo, struct buf_pr *bp, caddr_t base, size_t sz) { sz = foreach_state(co, fo, base, sz, list_dyn_state, bp); return (sz); } void ipfw_list(int ac, char *av[], int show_counters) { ipfw_cfg_lheader *cfg; struct format_opts sfo; size_t sz; int error; int lac; char **lav; uint32_t rnum; 
char *endptr; if (g_co.test_only) { fprintf(stderr, "Testing only, list disabled\n"); return; } if (g_co.do_pipe) { dummynet_list(ac, av, show_counters); return; } ac--; av++; memset(&sfo, 0, sizeof(sfo)); /* Determine rule range to request */ if (ac > 0) { for (lac = ac, lav = av; lac != 0; lac--) { rnum = strtoul(*lav++, &endptr, 10); if (sfo.first == 0 || rnum < sfo.first) sfo.first = rnum; if (*endptr == '-') rnum = strtoul(endptr + 1, &endptr, 10); if (sfo.last == 0 || rnum > sfo.last) sfo.last = rnum; } } /* get configuration from kernel */ cfg = NULL; sfo.show_counters = show_counters; sfo.show_time = g_co.do_time; if (g_co.do_dynamic != 2) sfo.flags |= IPFW_CFG_GET_STATIC; if (g_co.do_dynamic != 0) sfo.flags |= IPFW_CFG_GET_STATES; if ((sfo.show_counters | sfo.show_time) != 0) sfo.flags |= IPFW_CFG_GET_COUNTERS; if (ipfw_get_config(&g_co, &sfo, &cfg, &sz) != 0) err(EX_OSERR, "retrieving config failed"); error = ipfw_show_config(&g_co, &sfo, cfg, sz, ac, av); free(cfg); if (error != EX_OK) exit(error); } static int ipfw_show_config(struct cmdline_opts *co, struct format_opts *fo, ipfw_cfg_lheader *cfg, size_t sz, int ac, char *av[]) { caddr_t dynbase; size_t dynsz; int rcnt; int exitval = EX_OK; int lac; char **lav; char *endptr; size_t readsz; struct buf_pr bp; ipfw_obj_ctlv *ctlv; ipfw_obj_tlv *rbase; /* * Handle tablenames TLV first, if any */ rbase = NULL; dynbase = NULL; dynsz = 0; readsz = sizeof(*cfg); rcnt = 0; fo->set_mask = cfg->set_mask; ctlv = (ipfw_obj_ctlv *)(cfg + 1); if (ctlv->head.type == IPFW_TLV_TBLNAME_LIST) { object_sort_ctlv(ctlv); fo->tstate = ctlv; readsz += ctlv->head.length; ctlv = (ipfw_obj_ctlv *)((caddr_t)ctlv + ctlv->head.length); } if (cfg->flags & IPFW_CFG_GET_STATIC) { /* We've requested static rules */ if (ctlv->head.type == IPFW_TLV_RULE_LIST) { rbase = (ipfw_obj_tlv *)(ctlv + 1); rcnt = ctlv->count; readsz += ctlv->head.length; ctlv = (ipfw_obj_ctlv *)((caddr_t)ctlv + ctlv->head.length); } } if ((cfg->flags & 
IPFW_CFG_GET_STATES) && (readsz != sz)) { /* We may have some dynamic states */ dynsz = sz - readsz; /* Skip empty header */ if (dynsz != sizeof(ipfw_obj_ctlv)) dynbase = (caddr_t)ctlv; else dynsz = 0; } prepare_format_opts(co, fo, rbase, rcnt, dynbase, dynsz); bp_alloc(&bp, 4096); /* if no rule numbers were specified, list all rules */ if (ac == 0) { fo->first = 0; fo->last = IPFW_DEFAULT_RULE; if (cfg->flags & IPFW_CFG_GET_STATIC) list_static_range(co, fo, &bp, rbase, rcnt); if (co->do_dynamic && dynsz > 0) { printf("## Dynamic rules (%d %zu):\n", fo->dcnt, dynsz); list_dyn_range(co, fo, &bp, dynbase, dynsz); } bp_free(&bp); return (EX_OK); } /* display specific rules requested on command line */ for (lac = ac, lav = av; lac != 0; lac--) { /* convert command line rule # */ fo->last = fo->first = strtoul(*lav++, &endptr, 10); if (*endptr == '-') fo->last = strtoul(endptr + 1, &endptr, 10); if (*endptr) { exitval = EX_USAGE; warnx("invalid rule number: %s", *(lav - 1)); continue; } if ((cfg->flags & IPFW_CFG_GET_STATIC) == 0) continue; if (list_static_range(co, fo, &bp, rbase, rcnt) == 0) { /* give precedence to other error(s) */ if (exitval == EX_OK) exitval = EX_UNAVAILABLE; if (fo->first == fo->last) warnx("rule %u does not exist", fo->first); else warnx("no rules in range %u-%u", fo->first, fo->last); } } if (co->do_dynamic && dynsz > 0) { printf("## Dynamic rules:\n"); for (lac = ac, lav = av; lac != 0; lac--) { fo->last = fo->first = strtoul(*lav++, &endptr, 10); if (*endptr == '-') fo->last = strtoul(endptr+1, &endptr, 10); if (*endptr) /* already warned */ continue; list_dyn_range(co, fo, &bp, dynbase, dynsz); } } bp_free(&bp); return (exitval); } /* * Retrieves current ipfw configuration of given type * and stores its pointer to @pcfg. * * Caller is responsible for freeing @pcfg. * * Returns 0 on success. 
*/ static int ipfw_get_config(struct cmdline_opts *co, struct format_opts *fo, ipfw_cfg_lheader **pcfg, size_t *psize) { ipfw_cfg_lheader *cfg; size_t sz; int i; if (co->test_only != 0) { fprintf(stderr, "Testing only, list disabled\n"); return (0); } /* Start with some data size */ sz = 4096; cfg = NULL; for (i = 0; i < 16; i++) { if (cfg != NULL) free(cfg); if ((cfg = calloc(1, sz)) == NULL) return (ENOMEM); cfg->flags = fo->flags; cfg->start_rule = fo->first; cfg->end_rule = fo->last; if (do_get3(IP_FW_XGET, &cfg->opheader, &sz) != 0) { if (errno != ENOMEM) { free(cfg); return (errno); } /* Buffer size is not enough. Try to increase */ sz = sz * 2; if (sz < cfg->size) sz = cfg->size; continue; } *pcfg = cfg; *psize = sz; return (0); } free(cfg); return (ENOMEM); } static int lookup_host (char *host, struct in_addr *ipaddr) { struct hostent *he; if (!inet_aton(host, ipaddr)) { if ((he = gethostbyname(host)) == NULL) return(-1); *ipaddr = *(struct in_addr *)he->h_addr_list[0]; } return(0); } struct tidx { ipfw_obj_ntlv *idx; uint32_t count; uint32_t size; uint16_t counter; uint8_t set; }; int ipfw_check_object_name(const char *name) { int c, i, l; /* * Check that name is null-terminated and contains * valid symbols only. 
Valid mask is: * [a-zA-Z0-9\-_\.]{1,63} */ l = strlen(name); if (l == 0 || l >= 64) return (EINVAL); for (i = 0; i < l; i++) { c = name[i]; if (isalpha(c) || isdigit(c) || c == '_' || c == '-' || c == '.') continue; return (EINVAL); } return (0); } static const char *default_state_name = "default"; static int state_check_name(const char *name) { if (ipfw_check_object_name(name) != 0) return (EINVAL); if (strcmp(name, "any") == 0) return (EINVAL); return (0); } static int eaction_check_name(const char *name) { if (ipfw_check_object_name(name) != 0) return (EINVAL); /* Restrict some 'special' names */ if (match_token(rule_actions, name) != -1 && match_token(rule_action_params, name) != -1) return (EINVAL); return (0); } -static uint16_t +static uint32_t pack_object(struct tidx *tstate, const char *name, int otype) { ipfw_obj_ntlv *ntlv; uint32_t i; for (i = 0; i < tstate->count; i++) { if (strcmp(tstate->idx[i].name, name) != 0) continue; if (tstate->idx[i].set != tstate->set) continue; if (tstate->idx[i].head.type != otype) continue; return (tstate->idx[i].idx); } if (tstate->count + 1 > tstate->size) { tstate->size += 4; tstate->idx = realloc(tstate->idx, tstate->size * sizeof(ipfw_obj_ntlv)); if (tstate->idx == NULL) return (0); } ntlv = &tstate->idx[i]; memset(ntlv, 0, sizeof(ipfw_obj_ntlv)); strlcpy(ntlv->name, name, sizeof(ntlv->name)); ntlv->head.type = otype; ntlv->head.length = sizeof(ipfw_obj_ntlv); ntlv->set = tstate->set; ntlv->idx = ++tstate->counter; tstate->count++; return (ntlv->idx); } -static uint16_t +static uint32_t pack_table(struct tidx *tstate, const char *name) { if (table_check_name(name) != 0) return (0); return (pack_object(tstate, name, IPFW_TLV_TBL_NAME)); } +static void +fill_table_value(ipfw_insn *cmd, char *s) +{ + char *p; + int i; + + p = strchr(s, '='); + if (p != NULL) { + *p++ = '\0'; + i = match_token(tvalue_names, s); + if (i == -1) + errx(EX_USAGE, + "format: unknown table value name %s", s); + } else { + i = TVALUE_TAG; + p = 
s; + } + + IPFW_SET_TVALUE_TYPE(cmd, i); + insntod(cmd, table)->value = strtoul(p, NULL, 0); +} + void -fill_table(struct _ipfw_insn *cmd, char *av, uint8_t opcode, - struct tidx *tstate) +fill_table(ipfw_insn *cmd, char *av, uint8_t opcode, struct tidx *tstate) { - uint32_t *d = ((ipfw_insn_u32 *)cmd)->d; - uint16_t uidx; + ipfw_insn_kidx *c = insntod(cmd, kidx); char *p; if ((p = strchr(av + 6, ')')) == NULL) errx(EX_DATAERR, "forgotten parenthesis: '%s'", av); *p = '\0'; p = strchr(av + 6, ','); if (p) *p++ = '\0'; - if ((uidx = pack_table(tstate, av + 6)) == 0) + if ((c->kidx = pack_table(tstate, av + 6)) == 0) errx(EX_DATAERR, "Invalid table name: %s", av + 6); cmd->opcode = opcode; - cmd->arg1 = uidx; if (p) { - cmd->len |= F_INSN_SIZE(ipfw_insn_u32); - d[0] = strtoul(p, NULL, 0); - } else - cmd->len |= F_INSN_SIZE(ipfw_insn); + cmd->len |= F_INSN_SIZE(ipfw_insn_table); + fill_table_value(cmd, p); + } else { + IPFW_SET_LOOKUP_TYPE(cmd, LOOKUP_NONE); + cmd->len |= F_INSN_SIZE(ipfw_insn_kidx); + } } - /* * fills the addr and mask fields in the instruction as appropriate from av. * Update length as appropriate. * The following formats are allowed: * me returns O_IP_*_ME * 1.2.3.4 single IP address * 1.2.3.4:5.6.7.8 address:mask * 1.2.3.4/24 address/mask * 1.2.3.4/26{1,6,5,4,23} set of addresses in a subnet * We can have multiple comma-separated address/mask entries. */ static void fill_ip(ipfw_insn_ip *cmd, char *av, int cblen, struct tidx *tstate) { int len = 0; uint32_t *d = ((ipfw_insn_u32 *)cmd)->d; cmd->o.len &= ~F_LEN_MASK; /* zero len */ if (_substrcmp(av, "any") == 0) return; if (_substrcmp(av, "me") == 0) { cmd->o.len |= F_INSN_SIZE(ipfw_insn); return; } if (strncmp(av, "table(", 6) == 0) { fill_table(&cmd->o, av, O_IP_DST_LOOKUP, tstate); return; } while (av) { /* * After the address we can have '/' or ':' indicating a mask, * ',' indicating another address follows, '{' indicating a * set of addresses of unspecified size. 
*/ char *t = NULL, *p = strpbrk(av, "/:,{"); int masklen; char md, nd = '\0'; CHECK_LENGTH(cblen, (int)F_INSN_SIZE(ipfw_insn) + 2 + len); if (p) { md = *p; *p++ = '\0'; if ((t = strpbrk(p, ",{")) != NULL) { nd = *t; *t = '\0'; } } else md = '\0'; if (lookup_host(av, (struct in_addr *)&d[0]) != 0) errx(EX_NOHOST, "hostname ``%s'' unknown", av); switch (md) { case ':': if (!inet_aton(p, (struct in_addr *)&d[1])) errx(EX_DATAERR, "bad netmask ``%s''", p); break; case '/': masklen = atoi(p); if (masklen == 0) d[1] = htonl(0U); /* mask */ else if (masklen > 32) errx(EX_DATAERR, "bad width ``%s''", p); else d[1] = htonl(~0U << (32 - masklen)); break; case '{': /* no mask, assume /24 and put back the '{' */ d[1] = htonl(~0U << (32 - 24)); *(--p) = md; break; case ',': /* single address plus continuation */ *(--p) = md; /* FALLTHROUGH */ case 0: /* initialization value */ default: d[1] = htonl(~0U); /* force /32 */ break; } d[0] &= d[1]; /* mask base address with mask */ if (t) *t = nd; /* find next separator */ if (p) p = strpbrk(p, ",{"); if (p && *p == '{') { /* * We have a set of addresses. They are stored as follows: * arg1 is the set size (powers of 2, 2..256) * addr is the base address IN HOST FORMAT * mask.. is an array of arg1 bits (rounded up to * the next multiple of 32) with bits set * for each host in the map. 
*/ uint32_t *map = (uint32_t *)&cmd->mask; int low, high; int i = contigmask((uint8_t *)&(d[1]), 32); if (len > 0) errx(EX_DATAERR, "address set cannot be in a list"); if (i < 24 || i > 31) errx(EX_DATAERR, "invalid set with mask %d\n", i); cmd->o.arg1 = 1<<(32-i); /* map length */ d[0] = ntohl(d[0]); /* base addr in host format */ cmd->o.opcode = O_IP_DST_SET; /* default */ cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32) + (cmd->o.arg1+31)/32; for (i = 0; i < (cmd->o.arg1+31)/32 ; i++) map[i] = 0; /* clear map */ av = p + 1; low = d[0] & 0xff; high = low + cmd->o.arg1 - 1; /* * Here, i stores the previous value when we specify a range * of addresses within a mask, e.g. 45-63. i = -1 means we * have no previous value. */ i = -1; /* previous value in a range */ while (isdigit(*av)) { char *s; int a = strtol(av, &s, 0); if (s == av) { /* no parameter */ if (*av != '}') errx(EX_DATAERR, "set not closed\n"); if (i != -1) errx(EX_DATAERR, "incomplete range %d-", i); break; } if (a < low || a > high) errx(EX_DATAERR, "addr %d out of range [%d-%d]\n", a, low, high); a -= low; if (i == -1) /* no previous in range */ i = a; else { /* check that range is valid */ if (i > a) errx(EX_DATAERR, "invalid range %d-%d", i+low, a+low); if (*s == '-') errx(EX_DATAERR, "double '-' in range"); } for (; i <= a; i++) map[i/32] |= 1<<(i & 31); i = -1; if (*s == '-') i = a; else if (*s == '}') break; av = s+1; } return; } av = p; if (av) /* then *av must be a ',' */ av++; /* Check this entry */ if (d[1] == 0) { /* "any", specified as x.x.x.x/0 */ /* * 'any' turns the entire list into a NOP. * 'not any' never matches, so it is removed from the * list unless it is the only item, in which case we * report an error. 
*/ if (cmd->o.len & F_NOT) { /* "not any" never matches */ if (av == NULL && len == 0) /* only this entry */ errx(EX_DATAERR, "not any never matches"); } /* else do nothing and skip this entry */ return; } /* A single IP can be stored in an optimized format */ if (d[1] == (uint32_t)~0 && av == NULL && len == 0) { cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32); return; } len += 2; /* two words... */ d += 2; } /* end while */ if (len + 1 > F_LEN_MASK) errx(EX_DATAERR, "address list too long"); cmd->o.len |= len+1; } /* n2mask sets n bits of the mask */ void n2mask(struct in6_addr *mask, int n) { static int minimask[9] = { 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff }; u_char *p; memset(mask, 0, sizeof(struct in6_addr)); p = (u_char *) mask; for (; n > 0; p++, n -= 8) { if (n >= 8) *p = 0xff; else *p = minimask[n]; } return; } static void fill_flags_cmd(ipfw_insn *cmd, enum ipfw_opcodes opcode, struct _s_x *flags, char *p) { char *e; uint32_t set = 0, clear = 0; if (fill_flags(flags, p, &e, &set, &clear) != 0) errx(EX_DATAERR, "invalid flag %s", e); cmd->opcode = opcode; cmd->len = (cmd->len & (F_NOT | F_OR)) | 1; cmd->arg1 = (set & 0xff) | ( (clear & 0xff) << 8); } void ipfw_delete(char *av[]) { ipfw_range_tlv rt; char *sep; - int i, j; + uint32_t i, j; int exitval = EX_OK; int do_set = 0; av++; NEED1("missing rule specification"); if ( *av && _substrcmp(*av, "set") == 0) { /* Do not allow using the following syntax: * ipfw set N delete set M */ if (g_co.use_set) errx(EX_DATAERR, "invalid syntax"); do_set = 1; /* delete set */ av++; } /* Rule number */ while (*av && isdigit(**av)) { i = strtol(*av, &sep, 10); j = i; if (*sep== '-') j = strtol(sep + 1, NULL, 10); av++; if (g_co.do_nat) { exitval = ipfw_delete_nat(i); } else if (g_co.do_pipe) { exitval = ipfw_delete_pipe(g_co.do_pipe, i); } else { memset(&rt, 0, sizeof(rt)); if (do_set != 0) { rt.set = i & 31; rt.flags = IPFW_RCFLAG_SET; } else { - rt.start_rule = i & 0xffff; - rt.end_rule = j & 0xffff; + 
rt.start_rule = i; + rt.end_rule = j; if (rt.start_rule == 0 && rt.end_rule == 0) rt.flags |= IPFW_RCFLAG_ALL; else rt.flags |= IPFW_RCFLAG_RANGE; if (g_co.use_set != 0) { rt.set = g_co.use_set - 1; rt.flags |= IPFW_RCFLAG_SET; } } if (g_co.do_dynamic == 2) rt.flags |= IPFW_RCFLAG_DYNAMIC; i = do_range_cmd(IP_FW_XDEL, &rt); if (i != 0) { exitval = EX_UNAVAILABLE; if (g_co.do_quiet) continue; warn("rule %u: setsockopt(IP_FW_XDEL)", rt.start_rule); } else if (rt.new_set == 0 && do_set == 0 && g_co.do_dynamic != 2) { exitval = EX_UNAVAILABLE; if (g_co.do_quiet) continue; if (rt.start_rule != rt.end_rule) warnx("no rules in %u-%u range", rt.start_rule, rt.end_rule); else warnx("rule %u not found", rt.start_rule); } } } if (exitval != EX_OK && g_co.do_force == 0) exit(exitval); } - /* * fill the interface structure. We do not check the name as we can * create interfaces dynamically, so checking them at insert time * makes relatively little sense. * Interface names containing '*', '?', or '[' are assumed to be shell * patterns which match interfaces. */ static void fill_iface(ipfw_insn_if *cmd, char *arg, int cblen, struct tidx *tstate) { char *p; - uint16_t uidx; + uint32_t uidx; cmd->name[0] = '\0'; cmd->o.len |= F_INSN_SIZE(ipfw_insn_if); CHECK_CMDLEN; /* Parse the interface or address */ if (strcmp(arg, "any") == 0) cmd->o.len = 0; /* effectively ignore this command */ else if (strncmp(arg, "table(", 6) == 0) { if ((p = strchr(arg + 6, ')')) == NULL) errx(EX_DATAERR, "forgotten parenthesis: '%s'", arg); *p = '\0'; p = strchr(arg + 6, ','); if (p) *p++ = '\0'; if ((uidx = pack_table(tstate, arg + 6)) == 0) errx(EX_DATAERR, "Invalid table name: %s", arg + 6); cmd->name[0] = '\1'; /* Special value indicating table */ cmd->p.kidx = uidx; } else if (!isdigit(*arg)) { strlcpy(cmd->name, arg, sizeof(cmd->name)); cmd->p.glob = strpbrk(arg, "*?[") != NULL ? 
1 : 0; } else if (!inet_aton(arg, &cmd->p.ip)) errx(EX_DATAERR, "bad ip address ``%s''", arg); } static void get_mac_addr_mask(const char *p, uint8_t *addr, uint8_t *mask) { int i; size_t l; char *ap, *ptr, *optr; struct ether_addr *mac; const char *macset = "0123456789abcdefABCDEF:"; if (strcmp(p, "any") == 0) { for (i = 0; i < ETHER_ADDR_LEN; i++) addr[i] = mask[i] = 0; return; } optr = ptr = strdup(p); if ((ap = strsep(&ptr, "&/")) != NULL && *ap != 0) { l = strlen(ap); if (strspn(ap, macset) != l || (mac = ether_aton(ap)) == NULL) errx(EX_DATAERR, "Incorrect MAC address"); bcopy(mac, addr, ETHER_ADDR_LEN); } else errx(EX_DATAERR, "Incorrect MAC address"); if (ptr != NULL) { /* we have mask? */ if (p[ptr - optr - 1] == '/') { /* mask len */ long ml = strtol(ptr, &ap, 10); if (*ap != 0 || ml > ETHER_ADDR_LEN * 8 || ml < 0) errx(EX_DATAERR, "Incorrect mask length"); for (i = 0; ml > 0 && i < ETHER_ADDR_LEN; ml -= 8, i++) mask[i] = (ml >= 8) ? 0xff: (~0) << (8 - ml); } else { /* mask */ l = strlen(ptr); if (strspn(ptr, macset) != l || (mac = ether_aton(ptr)) == NULL) errx(EX_DATAERR, "Incorrect mask"); bcopy(mac, mask, ETHER_ADDR_LEN); } } else { /* default mask: ff:ff:ff:ff:ff:ff */ for (i = 0; i < ETHER_ADDR_LEN; i++) mask[i] = 0xff; } for (i = 0; i < ETHER_ADDR_LEN; i++) addr[i] &= mask[i]; free(optr); } /* * helper function, updates the pointer to cmd with the length * of the current command, and also cleans up the first word of * the new command in case it has been clobbered before. */ static ipfw_insn * next_cmd(ipfw_insn *cmd, int *len) { *len -= F_LEN(cmd); CHECK_LENGTH(*len, 0); cmd += F_LEN(cmd); bzero(cmd, sizeof(*cmd)); return cmd; } /* * Takes arguments and copies them into a comment */ static void fill_comment(ipfw_insn *cmd, char **av, int cblen) { int i, l; char *p = (char *)(cmd + 1); cmd->opcode = O_NOP; cmd->len = (cmd->len & (F_NOT | F_OR)); /* Compute length of comment string. 
*/ for (i = 0, l = 0; av[i] != NULL; i++) l += strlen(av[i]) + 1; if (l == 0) return; if (l > 84) errx(EX_DATAERR, "comment too long (max 80 chars)"); l = 1 + (l+3)/4; cmd->len = (cmd->len & (F_NOT | F_OR)) | l; CHECK_CMDLEN; for (i = 0; av[i] != NULL; i++) { strcpy(p, av[i]); p += strlen(av[i]); *p++ = ' '; } *(--p) = '\0'; } /* * A function to fill simple commands of size 1. * Existing flags are preserved. */ static void fill_cmd(ipfw_insn *cmd, enum ipfw_opcodes opcode, int flags, uint16_t arg) { cmd->opcode = opcode; cmd->len = ((cmd->len | flags) & (F_NOT | F_OR)) | 1; cmd->arg1 = arg; } /* * Fetch and add the MAC address and type, with masks. This generates one or * two microinstructions, and returns the pointer to the last one. */ static ipfw_insn * add_mac(ipfw_insn *cmd, char *av[], int cblen) { ipfw_insn_mac *mac; if ( ( av[0] == NULL ) || ( av[1] == NULL ) ) errx(EX_DATAERR, "MAC dst src"); cmd->opcode = O_MACADDR2; cmd->len = (cmd->len & (F_NOT | F_OR)) | F_INSN_SIZE(ipfw_insn_mac); CHECK_CMDLEN; mac = (ipfw_insn_mac *)cmd; get_mac_addr_mask(av[0], mac->addr, mac->mask); /* dst */ get_mac_addr_mask(av[1], &(mac->addr[ETHER_ADDR_LEN]), &(mac->mask[ETHER_ADDR_LEN])); /* src */ return cmd; } static ipfw_insn * add_mactype(ipfw_insn *cmd, char *av, int cblen) { if (!av) errx(EX_DATAERR, "missing MAC type"); if (strcmp(av, "any") != 0) { /* we have a non-null type */ fill_newports((ipfw_insn_u16 *)cmd, av, IPPROTO_ETHERTYPE, cblen); cmd->opcode = O_MAC_TYPE; return cmd; } else return NULL; } static ipfw_insn * add_proto0(ipfw_insn *cmd, char *av, u_char *protop) { struct protoent *pe; char *ep; int proto; proto = strtol(av, &ep, 10); if (*ep != '\0' || proto <= 0) { if ((pe = getprotobyname(av)) == NULL) return NULL; proto = pe->p_proto; } fill_cmd(cmd, O_PROTO, 0, proto); *protop = proto; return cmd; } static ipfw_insn * add_proto(ipfw_insn *cmd, char *av, u_char *protop) { u_char proto = IPPROTO_IP; if (_substrcmp(av, "all") == 0 || strcmp(av, "ip") == 0) 
; /* do not set O_IP4 nor O_IP6 */ else if (strcmp(av, "ip4") == 0) /* explicit "just IPv4" rule */ fill_cmd(cmd, O_IP4, 0, 0); else if (strcmp(av, "ip6") == 0) { /* explicit "just IPv6" rule */ proto = IPPROTO_IPV6; fill_cmd(cmd, O_IP6, 0, 0); } else return add_proto0(cmd, av, protop); *protop = proto; return cmd; } static ipfw_insn * add_proto_compat(ipfw_insn *cmd, char *av, u_char *protop) { u_char proto = IPPROTO_IP; if (_substrcmp(av, "all") == 0 || strcmp(av, "ip") == 0) ; /* do not set O_IP4 nor O_IP6 */ else if (strcmp(av, "ipv4") == 0 || strcmp(av, "ip4") == 0) /* explicit "just IPv4" rule */ fill_cmd(cmd, O_IP4, 0, 0); else if (strcmp(av, "ipv6") == 0 || strcmp(av, "ip6") == 0) { /* explicit "just IPv6" rule */ proto = IPPROTO_IPV6; fill_cmd(cmd, O_IP6, 0, 0); } else return add_proto0(cmd, av, protop); *protop = proto; return cmd; } static ipfw_insn * add_srcip(ipfw_insn *cmd, char *av, int cblen, struct tidx *tstate) { fill_ip((ipfw_insn_ip *)cmd, av, cblen, tstate); if (cmd->opcode == O_IP_DST_SET) /* set */ cmd->opcode = O_IP_SRC_SET; else if (cmd->opcode == O_IP_DST_LOOKUP) /* table */ cmd->opcode = O_IP_SRC_LOOKUP; else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn)) /* me */ cmd->opcode = O_IP_SRC_ME; else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn_u32)) /* one IP */ cmd->opcode = O_IP_SRC; else /* addr/mask */ cmd->opcode = O_IP_SRC_MASK; return cmd; } static ipfw_insn * add_dstip(ipfw_insn *cmd, char *av, int cblen, struct tidx *tstate) { fill_ip((ipfw_insn_ip *)cmd, av, cblen, tstate); if (cmd->opcode == O_IP_DST_SET) /* set */ ; else if (cmd->opcode == O_IP_DST_LOOKUP) /* table */ ; else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn)) /* me */ cmd->opcode = O_IP_DST_ME; else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn_u32)) /* one IP */ cmd->opcode = O_IP_DST; else /* addr/mask */ cmd->opcode = O_IP_DST_MASK; return cmd; } static ipfw_insn * add_srcmac(ipfw_insn *cmd, char *av, struct tidx *tstate) { if (strncmp(av, "table(", 6) == 0) fill_table(cmd, av, 
O_MAC_SRC_LOOKUP, tstate); else errx(EX_DATAERR, "only mac table lookup is supported %s", av); return cmd; } static ipfw_insn * add_dstmac(ipfw_insn *cmd, char *av, struct tidx *tstate) { if (strncmp(av, "table(", 6) == 0) fill_table(cmd, av, O_MAC_DST_LOOKUP, tstate); else errx(EX_DATAERR, "only mac table lookup is supported %s", av); return cmd; } static struct _s_x f_reserved_keywords[] = { { "altq", TOK_OR }, { "//", TOK_OR }, { "diverted", TOK_OR }, { "dst-port", TOK_OR }, { "src-port", TOK_OR }, { "established", TOK_OR }, { "keep-state", TOK_OR }, { "frag", TOK_OR }, { "icmptypes", TOK_OR }, { "in", TOK_OR }, { "out", TOK_OR }, { "ip6", TOK_OR }, { "any", TOK_OR }, { "to", TOK_OR }, { "via", TOK_OR }, { "{", TOK_OR }, + { "lookup", TOK_OR }, + { "tagged", TOK_OR }, { NULL, 0 } /* terminator */ }; static ipfw_insn * add_ports(ipfw_insn *cmd, char *av, u_char proto, int opcode, int cblen) { if (match_token(f_reserved_keywords, av) != -1) return (NULL); if (fill_newports((ipfw_insn_u16 *)cmd, av, proto, cblen)) { /* XXX todo: check that we have a protocol with ports */ cmd->opcode = opcode; return cmd; } return NULL; } static ipfw_insn * add_src(ipfw_insn *cmd, char *av, u_char proto, int cblen, struct tidx *tstate) { struct in6_addr a; char *host, *ch, buf[INET6_ADDRSTRLEN]; ipfw_insn *ret = NULL; size_t len; /* Copy first address in set if needed */ if ((ch = strpbrk(av, "/,")) != NULL) { len = ch - av; strlcpy(buf, av, sizeof(buf)); if (len < sizeof(buf)) buf[len] = '\0'; host = buf; } else host = av; if (proto == IPPROTO_IPV6 || strcmp(av, "me6") == 0 || inet_pton(AF_INET6, host, &a) == 1) ret = add_srcip6(cmd, av, cblen, tstate); /* XXX: should check for IPv4, not !IPv6 */ if (ret == NULL && (proto == IPPROTO_IP || strcmp(av, "me") == 0 || inet_pton(AF_INET6, host, &a) != 1)) ret = add_srcip(cmd, av, cblen, tstate); if (ret == NULL && strcmp(av, "any") != 0) ret = cmd; return ret; } static ipfw_insn * add_dst(ipfw_insn *cmd, char *av, u_char proto, int 
cblen, struct tidx *tstate) { struct in6_addr a; char *host, *ch, buf[INET6_ADDRSTRLEN]; ipfw_insn *ret = NULL; size_t len; /* Copy first address in set if needed */ if ((ch = strpbrk(av, "/,")) != NULL) { len = ch - av; strlcpy(buf, av, sizeof(buf)); if (len < sizeof(buf)) buf[len] = '\0'; host = buf; } else host = av; if (proto == IPPROTO_IPV6 || strcmp(av, "me6") == 0 || inet_pton(AF_INET6, host, &a) == 1) ret = add_dstip6(cmd, av, cblen, tstate); /* XXX: should check for IPv4, not !IPv6 */ if (ret == NULL && (proto == IPPROTO_IP || strcmp(av, "me") == 0 || inet_pton(AF_INET6, host, &a) != 1)) ret = add_dstip(cmd, av, cblen, tstate); if (ret == NULL && strcmp(av, "any") != 0) ret = cmd; return ret; } -static inline int -arg_or_targ_relaxed(const char *arg, const char *action) +static uint16_t +parse_logdst(char *logdst_iter) +{ + char *token; + uint16_t ret; + + ret = IPFW_LOG_DEFAULT; + while ((token = strsep(&logdst_iter, ",")) != NULL) { + if (_substrcmp(token, "syslog") == 0) { + ret |= IPFW_LOG_SYSLOG; + continue; + } + if (_substrcmp(token, "ipfw0") == 0) { + /* XXX add multiple ipfw* */ + ret |= IPFW_LOG_IPFW0; + continue; + } + if (_substrcmp(token, "rtsock") == 0) { + ret |= IPFW_LOG_RTSOCK; + continue; + } + errx(EX_DATAERR, + "unsupported logdst token"); + } + return (ret); +} + +static inline uint32_t +arg_or_targ_relaxed(const char *arg, const char *action, uint32_t maxarg) { uint32_t arg1 = (uint32_t)(-1); if (arg == NULL) errx(EX_USAGE, "missing argument for %s", action); if (isdigit(arg[0])) { arg1 = strtoul(arg, NULL, 10); - if (arg1 <= 0 || arg1 >= IP_FW_TABLEARG) + if (arg1 < IPFW_ARG_MIN || arg1 > maxarg) errx(EX_DATAERR, "illegal argument %s(%u) for %s", arg, arg1, action); } else if (_substrcmp(arg, "tablearg") == 0) arg1 = IP_FW_TARG; return (arg1); } -static inline uint16_t +static inline uint32_t arg_or_targ(const char *arg, const char *action) { - uint32_t arg1 = arg_or_targ_relaxed(arg, action); + uint32_t arg1 = 
arg_or_targ_relaxed(arg, action, IPFW_ARG_MAX); if (arg1 == (uint32_t)(-1)) errx(EX_DATAERR, "illegal argument %s(%u) for %s", arg, arg1, action); return (arg1); } static uint16_t get_divert_port(const char *arg, const char *action) { - uint32_t arg1 = arg_or_targ_relaxed(arg, action); + uint32_t arg1 = arg_or_targ_relaxed(arg, action, IPFW_ARG_MAX); if (arg1 != (uint32_t)(-1)) return (arg1); struct servent *s; setservent(1); s = getservbyname(arg, "divert"); if (s == NULL) errx(EX_DATAERR, "illegal divert/tee port"); return (ntohs(s->s_port)); } /* * Parse arguments and assemble the microinstructions which make up a rule. * Rules are added into the 'rulebuf' and then copied in the correct order * into the actual rule. * * The syntax for a rule starts with the action, followed by * optional action parameters, and the various match patterns. * In the assembled microcode, the first opcode must be an O_PROBE_STATE * (generated if the rule includes a keep-state option), then the * various match patterns, log/altq actions, and the actual action. * */ static void compile_rule(char *av[], uint32_t *rbuf, int *rbufsize, struct tidx *tstate) { /* * rules are added into the 'rulebuf' and then copied in * the correct order into the actual rule. * Some things that need to go out of order (prob, action etc.) * go into actbuf[]. */ static uint32_t actbuf[255], cmdbuf[255]; int rblen, ablen, cblen; ipfw_insn *src, *dst, *cmd, *action, *prev=NULL; ipfw_insn *first_cmd; /* first match pattern */ struct ip_fw_rule *rule; /* * various flags used to record that we entered some fields. 
*/ ipfw_insn *have_state = NULL; /* any state-related option */ int have_rstate = 0; ipfw_insn *have_log = NULL, *have_altq = NULL, *have_tag = NULL; ipfw_insn *have_skipcmd = NULL; size_t len; int i; int open_par = 0; /* open parenthesis ( */ /* proto is here because it is used to fetch ports */ u_char proto = IPPROTO_IP; /* default protocol */ double match_prob = 1; /* match probability, default is always match */ bzero(actbuf, sizeof(actbuf)); /* actions go here */ bzero(cmdbuf, sizeof(cmdbuf)); bzero(rbuf, *rbufsize); rule = (struct ip_fw_rule *)rbuf; cmd = (ipfw_insn *)cmdbuf; action = (ipfw_insn *)actbuf; rblen = *rbufsize / sizeof(uint32_t); rblen -= sizeof(struct ip_fw_rule) / sizeof(uint32_t); ablen = nitems(actbuf); cblen = nitems(cmdbuf); cblen -= F_INSN_SIZE(ipfw_insn_u32) + 1; #define CHECK_RBUFLEN(len) { CHECK_LENGTH(rblen, len); rblen -= len; } #define CHECK_ACTLEN CHECK_LENGTH(ablen, action->len) av++; /* [rule N] -- Rule number optional */ if (av[0] && isdigit(**av)) { rule->rulenum = atoi(*av); av++; } /* [set N] -- set number (0..RESVD_SET), optional */ if (av[0] && av[1] && _substrcmp(*av, "set") == 0) { int set = strtoul(av[1], NULL, 10); if (set < 0 || set > RESVD_SET) errx(EX_DATAERR, "illegal set %s", av[1]); rule->set = set; tstate->set = set; av += 2; } /* [prob D] -- match probability, optional */ if (av[0] && av[1] && _substrcmp(*av, "prob") == 0) { match_prob = strtod(av[1], NULL); if (match_prob <= 0 || match_prob > 1) errx(EX_DATAERR, "illegal match prob. 
%s", av[1]); av += 2; } /* action -- mandatory */ NEED1("missing action"); i = match_token(rule_actions, *av); av++; action->len = 1; /* default */ CHECK_ACTLEN; switch(i) { case TOK_CHECKSTATE: have_state = action; action->opcode = O_CHECK_STATE; + action->len = F_INSN_SIZE(ipfw_insn_kidx); + CHECK_ACTLEN; if (*av == NULL || match_token(rule_options, *av) == TOK_COMMENT) { - action->arg1 = pack_object(tstate, + insntod(have_state, kidx)->kidx = pack_object(tstate, default_state_name, IPFW_TLV_STATE_NAME); break; } if (*av[0] == ':') { if (strcmp(*av + 1, "any") == 0) - action->arg1 = 0; + insntod(have_state, kidx)->kidx = 0; else if (state_check_name(*av + 1) == 0) - action->arg1 = pack_object(tstate, *av + 1, - IPFW_TLV_STATE_NAME); + insntod(have_state, kidx)->kidx = pack_object( + tstate, *av + 1, IPFW_TLV_STATE_NAME); else errx(EX_DATAERR, "Invalid state name %s", *av); av++; break; } errx(EX_DATAERR, "Invalid state name %s", *av); break; case TOK_ABORT: action->opcode = O_REJECT; action->arg1 = ICMP_REJECT_ABORT; break; case TOK_ABORT6: action->opcode = O_UNREACH6; action->arg1 = ICMP6_UNREACH_ABORT; break; case TOK_ACCEPT: action->opcode = O_ACCEPT; break; case TOK_DENY: action->opcode = O_DENY; action->arg1 = 0; break; case TOK_REJECT: action->opcode = O_REJECT; action->arg1 = ICMP_UNREACH_HOST; break; case TOK_RESET: action->opcode = O_REJECT; action->arg1 = ICMP_REJECT_RST; break; case TOK_RESET6: action->opcode = O_UNREACH6; action->arg1 = ICMP6_UNREACH_RST; break; case TOK_UNREACH: action->opcode = O_REJECT; NEED1("missing reject code"); action->arg1 = get_reject_code(*av); av++; if (action->arg1 == ICMP_UNREACH_NEEDFRAG && isdigit(**av)) { uint16_t mtu; mtu = strtoul(*av, NULL, 10); if (mtu < 68 || mtu >= IP_MAXPACKET) errx(EX_DATAERR, "illegal argument for %s", *(av - 1)); action->len = F_INSN_SIZE(ipfw_insn_u16); - ((ipfw_insn_u16 *)action)->ports[0] = mtu; + insntod(action, u16)->ports[0] = mtu; av++; } break; case TOK_UNREACH6: action->opcode = 
O_UNREACH6; NEED1("missing unreach code"); action->arg1 = get_unreach6_code(*av); av++; break; case TOK_COUNT: action->opcode = O_COUNT; break; case TOK_NAT: action->opcode = O_NAT; action->len = F_INSN_SIZE(ipfw_insn_nat); CHECK_ACTLEN; if (*av != NULL && _substrcmp(*av, "global") == 0) action->arg1 = IP_FW_NAT44_GLOBAL; else action->arg1 = arg_or_targ(av[0], *(av - 1)); av++; break; case TOK_QUEUE: action->opcode = O_QUEUE; action->arg1 = arg_or_targ(av[0], *(av - 1)); av++; break; case TOK_PIPE: action->opcode = O_PIPE; action->arg1 = arg_or_targ(av[0], *(av - 1)); av++; break; case TOK_SKIPTO: action->opcode = O_SKIPTO; - action->arg1 = arg_or_targ(av[0], *(av - 1)); + action->len = F_INSN_SIZE(ipfw_insn_u32); + CHECK_ACTLEN; + insntod(action, u32)->d[0] = + arg_or_targ_relaxed(av[0], *(av - 1), IPFW_DEFAULT_RULE); av++; break; case TOK_NETGRAPH: action->opcode = O_NETGRAPH; action->arg1 = arg_or_targ(av[0], *(av - 1)); av++; break; case TOK_NGTEE: action->opcode = O_NGTEE; action->arg1 = arg_or_targ(av[0], *(av - 1)); av++; break; case TOK_DIVERT: action->opcode = O_DIVERT; action->arg1 = get_divert_port(av[0], *(av - 1)); av++; break; case TOK_TEE: action->opcode = O_TEE; action->arg1 = get_divert_port(av[0], *(av - 1)); av++; break; case TOK_CALL: action->opcode = O_CALLRETURN; - action->arg1 = arg_or_targ(av[0], *(av - 1)); + action->len = F_INSN_SIZE(ipfw_insn_u32); + CHECK_ACTLEN; + insntod(action, u32)->d[0] = + arg_or_targ_relaxed(av[0], *(av - 1), IPFW_DEFAULT_RULE); av++; break; case TOK_FORWARD: { /* * Locate the address-port separator (':' or ','). 
* Could be one of the following: * hostname:port * IPv4 a.b.c.d,port * IPv4 a.b.c.d:port * IPv6 w:x:y::z,port * IPv6 [w:x:y::z]:port */ struct sockaddr_storage result; struct addrinfo *res; char *s, *end; int family; u_short port_number = 0; NEED1("missing forward address[:port]"); if (strncmp(*av, "tablearg", 8) == 0 && ((*av)[8] == '\0' || (*av)[8] == ',' || (*av)[8] == ':')) memcpy(++(*av), "0.0.0.0", 7); /* * Are we an bracket-enclosed IPv6 address? */ if (strchr(*av, '[')) (*av)++; /* * locate the address-port separator (':' or ',') */ s = strchr(*av, ','); if (s == NULL) { s = strchr(*av, ']'); /* Prevent erroneous parsing on brackets. */ if (s != NULL) *(s++) = '\0'; else s = *av; /* Distinguish between IPv4:port and IPv6 cases. */ s = strchr(s, ':'); if (s && strchr(s+1, ':')) s = NULL; /* no port */ } if (s != NULL) { /* Terminate host portion and set s to start of port. */ *(s++) = '\0'; i = strtoport(s, &end, 0 /* base */, 0 /* proto */); if (s == end) errx(EX_DATAERR, "illegal forwarding port ``%s''", s); port_number = (u_short)i; } /* * Resolve the host name or address to a family and a * network representation of the address. */ if (getaddrinfo(*av, NULL, NULL, &res)) errx(EX_DATAERR, NULL); /* Just use the first host in the answer. */ family = res->ai_family; memcpy(&result, res->ai_addr, res->ai_addrlen); freeaddrinfo(res); if (family == PF_INET) { ipfw_insn_sa *p = (ipfw_insn_sa *)action; action->opcode = O_FORWARD_IP; action->len = F_INSN_SIZE(ipfw_insn_sa); CHECK_ACTLEN; /* * In the kernel we assume AF_INET and use only * sin_port and sin_addr. Remember to set sin_len as * the routing code seems to use it too. 
*/ p->sa.sin_len = sizeof(struct sockaddr_in); p->sa.sin_family = AF_INET; p->sa.sin_port = port_number; p->sa.sin_addr.s_addr = ((struct sockaddr_in *)&result)->sin_addr.s_addr; } else if (family == PF_INET6) { ipfw_insn_sa6 *p = (ipfw_insn_sa6 *)action; action->opcode = O_FORWARD_IP6; action->len = F_INSN_SIZE(ipfw_insn_sa6); CHECK_ACTLEN; p->sa.sin6_len = sizeof(struct sockaddr_in6); p->sa.sin6_family = AF_INET6; p->sa.sin6_port = port_number; p->sa.sin6_flowinfo = 0; p->sa.sin6_scope_id = ((struct sockaddr_in6 *)&result)->sin6_scope_id; bcopy(&((struct sockaddr_in6*)&result)->sin6_addr, &p->sa.sin6_addr, sizeof(p->sa.sin6_addr)); } else { errx(EX_DATAERR, "Invalid address family in forward action"); } av++; break; } case TOK_COMMENT: /* pretend it is a 'count' rule followed by the comment */ action->opcode = O_COUNT; av--; /* go back... */ break; case TOK_SETFIB: { int numfibs; size_t intsize = sizeof(int); action->opcode = O_SETFIB; NEED1("missing fib number"); if (_substrcmp(*av, "tablearg") == 0) { action->arg1 = IP_FW_TARG; } else { action->arg1 = strtoul(*av, NULL, 10); if (sysctlbyname("net.fibs", &numfibs, &intsize, NULL, 0) == -1) errx(EX_DATAERR, "fibs not supported.\n"); if (action->arg1 >= numfibs) /* Temporary */ errx(EX_DATAERR, "fib too large.\n"); /* Add high-order bit to fib to make room for tablearg*/ action->arg1 |= 0x8000; } av++; break; } case TOK_SETDSCP: { int code; action->opcode = O_SETDSCP; NEED1("missing DSCP code"); if (_substrcmp(*av, "tablearg") == 0) { action->arg1 = IP_FW_TARG; } else { if (isalpha(*av[0])) { if ((code = match_token(f_ipdscp, *av)) == -1) errx(EX_DATAERR, "Unknown DSCP code"); action->arg1 = code; } else action->arg1 = strtoul(*av, NULL, 10); /* * Add high-order bit to DSCP to make room * for tablearg */ action->arg1 |= 0x8000; } av++; break; } case TOK_REASS: action->opcode = O_REASS; break; case TOK_RETURN: - fill_cmd(action, O_CALLRETURN, F_NOT, 0); + action->opcode = O_CALLRETURN; + action->len = 
F_INSN_SIZE(ipfw_insn_u32) | F_NOT; + CHECK_ACTLEN; + if (*av != NULL) { + /* + * Return type is optional. + * By default we use RETURN_NEXT_RULENUM. + */ + i = match_token(return_types, *av); + if (i >= 0) { + action->arg1 = i; + av++; + } else + action->arg1 = RETURN_NEXT_RULENUM; + } break; case TOK_SETMARK: { action->opcode = O_SETMARK; action->len = F_INSN_SIZE(ipfw_insn_u32); NEED1("missing mark"); if (strcmp(*av, "tablearg") == 0) { action->arg1 = IP_FW_TARG; } else { - ((ipfw_insn_u32 *)action)->d[0] = - strtoul(*av, NULL, 0); + insntod(action, u32)->d[0] = strtoul(*av, NULL, 0); /* This is not a tablearg */ action->arg1 |= 0x8000; } av++; CHECK_CMDLEN; break; } case TOK_TCPSETMSS: { u_long mss; - uint16_t idx; + uint32_t idx; idx = pack_object(tstate, "tcp-setmss", IPFW_TLV_EACTION); if (idx == 0) errx(EX_DATAERR, "pack_object failed"); - fill_cmd(action, O_EXTERNAL_ACTION, 0, idx); + action->opcode = O_EXTERNAL_ACTION; + action->len = F_INSN_SIZE(ipfw_insn_kidx); + CHECK_ACTLEN; + insntod(action, kidx)->kidx = idx; + NEED1("Missing MSS value"); action = next_cmd(action, &ablen); action->len = 1; CHECK_ACTLEN; mss = strtoul(*av, NULL, 10); if (mss == 0 || mss > UINT16_MAX) errx(EX_USAGE, "invalid MSS value %s", *av); fill_cmd(action, O_EXTERNAL_DATA, 0, (uint16_t)mss); av++; break; } default: av--; if (match_token(rule_eactions, *av) == -1) errx(EX_DATAERR, "invalid action %s\n", *av); /* * External actions support. * XXX: we support only syntax with instance name. * For known external actions (from rule_eactions list) * we can handle syntax directly. But with `eaction' * keyword we can use only `eaction ' * syntax. 
*/ case TOK_EACTION: { - uint16_t idx; + uint32_t idx; NEED1("Missing eaction name"); if (eaction_check_name(*av) != 0) errx(EX_DATAERR, "Invalid eaction name %s", *av); idx = pack_object(tstate, *av, IPFW_TLV_EACTION); if (idx == 0) errx(EX_DATAERR, "pack_object failed"); - fill_cmd(action, O_EXTERNAL_ACTION, 0, idx); + action->opcode = O_EXTERNAL_ACTION; + action->len = F_INSN_SIZE(ipfw_insn_kidx); + CHECK_ACTLEN; + insntod(action, kidx)->kidx = idx; + av++; NEED1("Missing eaction instance name"); - action = next_cmd(action, &ablen); - action->len = 1; - CHECK_ACTLEN; if (eaction_check_name(*av) != 0) errx(EX_DATAERR, "Invalid eaction instance name %s", *av); /* * External action instance object has TLV type depended * from the external action name object index. Since we * currently don't know this index, use zero as TLV type. */ idx = pack_object(tstate, *av, 0); if (idx == 0) errx(EX_DATAERR, "pack_object failed"); - fill_cmd(action, O_EXTERNAL_INSTANCE, 0, idx); + action = next_cmd(action, &ablen); + action->opcode = O_EXTERNAL_INSTANCE; + action->len = F_INSN_SIZE(ipfw_insn_kidx); + CHECK_ACTLEN; + insntod(action, kidx)->kidx = idx; av++; } } action = next_cmd(action, &ablen); /* * [altq queuename] -- altq tag, optional * [log [logamount N]] -- log, optional * * If they exist, it go first in the cmdbuf, but then it is * skipped in the copy section to the end of the buffer. 
*/ while (av[0] != NULL && (i = match_token(rule_action_params, *av)) != -1) { av++; switch (i) { case TOK_LOG: { ipfw_insn_log *c = (ipfw_insn_log *)cmd; int l; if (have_log) errx(EX_DATAERR, "log cannot be specified more than once"); have_log = (ipfw_insn *)c; cmd->len = F_INSN_SIZE(ipfw_insn_log); CHECK_CMDLEN; cmd->opcode = O_LOG; + cmd->arg1 = IPFW_LOG_DEFAULT; + /* logdst before logamount */ + if (av[0] && _substrcmp(*av, "logdst") == 0) { + av++; + NEED1("logdst requires argument"); + cmd->arg1 = parse_logdst(*av); + av++; + } + if (av[0] && _substrcmp(*av, "logamount") == 0) { av++; NEED1("logamount requires argument"); l = atoi(*av); if (l < 0) errx(EX_DATAERR, "logamount must be positive"); c->max_log = l; av++; } else { len = sizeof(c->max_log); if (sysctlbyname("net.inet.ip.fw.verbose_limit", &c->max_log, &len, NULL, 0) == -1) { if (g_co.test_only) { c->max_log = 0; break; } errx(1, "sysctlbyname(\"%s\")", "net.inet.ip.fw.verbose_limit"); } } + + /* logdst after logamount */ + if (av[0] && _substrcmp(*av, "logdst") == 0) { + av++; + NEED1("logdst requires argument"); + cmd->arg1 = parse_logdst(*av); + av++; + } } break; #ifndef NO_ALTQ case TOK_ALTQ: { ipfw_insn_altq *a = (ipfw_insn_altq *)cmd; NEED1("missing altq queue name"); if (have_altq) errx(EX_DATAERR, "altq cannot be specified more than once"); have_altq = (ipfw_insn *)a; cmd->len = F_INSN_SIZE(ipfw_insn_altq); CHECK_CMDLEN; cmd->opcode = O_ALTQ; a->qid = altq_name_to_qid(*av); av++; } break; #endif case TOK_TAG: case TOK_UNTAG: { uint16_t tag; if (have_tag) errx(EX_USAGE, "tag and untag cannot be " "specified more than once"); GET_UINT_ARG(tag, IPFW_ARG_MIN, IPFW_ARG_MAX, i, rule_action_params); have_tag = cmd; fill_cmd(cmd, O_TAG, (i == TOK_TAG) ? 
0: F_NOT, tag); av++; break; } default: abort(); } cmd = next_cmd(cmd, &cblen); } if (have_state) { /* must be a check-state, we are done */ if (*av != NULL && match_token(rule_options, *av) == TOK_COMMENT) { /* check-state has a comment */ av++; fill_comment(cmd, av, cblen); cmd = next_cmd(cmd, &cblen); av[0] = NULL; } goto done; } #define OR_START(target) \ if (av[0] && (*av[0] == '(' || *av[0] == '{')) { \ if (open_par) \ errx(EX_USAGE, "nested \"(\" not allowed\n"); \ prev = NULL; \ open_par = 1; \ if ( (av[0])[1] == '\0') { \ av++; \ } else \ (*av)++; \ } \ target: \ #define CLOSE_PAR \ if (open_par) { \ if (av[0] && ( \ strcmp(*av, ")") == 0 || \ strcmp(*av, "}") == 0)) { \ prev = NULL; \ open_par = 0; \ av++; \ } else \ errx(EX_USAGE, "missing \")\"\n"); \ } #define NOT_BLOCK \ if (av[0] && _substrcmp(*av, "not") == 0) { \ if (cmd->len & F_NOT) \ errx(EX_USAGE, "double \"not\" not allowed\n"); \ cmd->len |= F_NOT; \ av++; \ } #define OR_BLOCK(target) \ if (av[0] && _substrcmp(*av, "or") == 0) { \ if (prev == NULL || open_par == 0) \ errx(EX_DATAERR, "invalid OR block"); \ prev->len |= F_OR; \ av++; \ goto target; \ } \ CLOSE_PAR; first_cmd = cmd; #if 0 /* * MAC addresses, optional. * If we have this, we skip the part "proto from src to dst" * and jump straight to the option parsing. 
*/ NOT_BLOCK; NEED1("missing protocol"); if (_substrcmp(*av, "MAC") == 0 || _substrcmp(*av, "mac") == 0) { av++; /* the "MAC" keyword */ add_mac(cmd, av); /* exits in case of errors */ cmd = next_cmd(cmd); av += 2; /* dst-mac and src-mac */ NOT_BLOCK; NEED1("missing mac type"); if (add_mactype(cmd, av[0])) cmd = next_cmd(cmd); av++; /* any or mac-type */ goto read_options; } #endif /* * protocol, mandatory */ OR_START(get_proto); NOT_BLOCK; NEED1("missing protocol"); if (add_proto_compat(cmd, *av, &proto)) { av++; if (F_LEN(cmd) != 0) { prev = cmd; cmd = next_cmd(cmd, &cblen); } } else if (first_cmd != cmd) { errx(EX_DATAERR, "invalid protocol ``%s''", *av); } else { rule->flags |= IPFW_RULE_JUSTOPTS; goto read_options; } OR_BLOCK(get_proto); first_cmd = cmd; /* update pointer to use in compact form */ /* * "from", mandatory */ if ((av[0] == NULL) || _substrcmp(*av, "from") != 0) errx(EX_USAGE, "missing ``from''"); av++; /* * source IP, mandatory */ OR_START(source_ip); NOT_BLOCK; /* optional "not" */ NEED1("missing source address"); if (add_src(cmd, *av, proto, cblen, tstate)) { av++; if (F_LEN(cmd) != 0) { /* ! any */ prev = cmd; cmd = next_cmd(cmd, &cblen); } } else errx(EX_USAGE, "bad source address %s", *av); OR_BLOCK(source_ip); /* * source ports, optional */ NOT_BLOCK; /* optional "not" */ if ( av[0] != NULL ) { if (_substrcmp(*av, "any") == 0 || add_ports(cmd, *av, proto, O_IP_SRCPORT, cblen)) { av++; if (F_LEN(cmd) != 0) cmd = next_cmd(cmd, &cblen); } } /* * "to", mandatory */ if ( (av[0] == NULL) || _substrcmp(*av, "to") != 0 ) errx(EX_USAGE, "missing ``to''"); av++; /* * destination, mandatory */ OR_START(dest_ip); NOT_BLOCK; /* optional "not" */ NEED1("missing dst address"); if (add_dst(cmd, *av, proto, cblen, tstate)) { av++; if (F_LEN(cmd) != 0) { /* ! any */ prev = cmd; cmd = next_cmd(cmd, &cblen); } } else errx( EX_USAGE, "bad destination address %s", *av); OR_BLOCK(dest_ip); /* * dest. 
ports, optional */ NOT_BLOCK; /* optional "not" */ if (av[0]) { if (_substrcmp(*av, "any") == 0 || add_ports(cmd, *av, proto, O_IP_DSTPORT, cblen)) { av++; if (F_LEN(cmd) != 0) cmd = next_cmd(cmd, &cblen); } } if (first_cmd == cmd) rule->flags |= IPFW_RULE_NOOPT; read_options: prev = NULL; while ( av[0] != NULL ) { char *s; ipfw_insn_u32 *cmd32; /* alias for cmd */ s = *av; cmd32 = (ipfw_insn_u32 *)cmd; if (*s == '!') { /* alternate syntax for NOT */ if (cmd->len & F_NOT) errx(EX_USAGE, "double \"not\" not allowed\n"); cmd->len = F_NOT; s++; } i = match_token(rule_options, s); av++; switch(i) { case TOK_NOT: if (cmd->len & F_NOT) errx(EX_USAGE, "double \"not\" not allowed\n"); cmd->len = F_NOT; break; case TOK_OR: if (open_par == 0 || prev == NULL) errx(EX_USAGE, "invalid \"or\" block\n"); prev->len |= F_OR; break; case TOK_STARTBRACE: if (open_par) errx(EX_USAGE, "+nested \"(\" not allowed\n"); open_par = 1; break; case TOK_ENDBRACE: if (!open_par) errx(EX_USAGE, "+missing \")\"\n"); open_par = 0; prev = NULL; break; case TOK_IN: fill_cmd(cmd, O_IN, 0, 0); break; case TOK_OUT: cmd->len ^= F_NOT; /* toggle F_NOT */ fill_cmd(cmd, O_IN, 0, 0); break; case TOK_DIVERTED: fill_cmd(cmd, O_DIVERTED, 0, 3); break; case TOK_DIVERTEDLOOPBACK: fill_cmd(cmd, O_DIVERTED, 0, 1); break; case TOK_DIVERTEDOUTPUT: fill_cmd(cmd, O_DIVERTED, 0, 2); break; case TOK_FRAG: { uint32_t set = 0, clear = 0; if (*av != NULL && fill_flags(f_ipoff, *av, NULL, &set, &clear) == 0) av++; else { /* * Compatibility: no argument after "frag" * keyword equals to "frag offset". 
*/ set = 0x01; clear = 0; } fill_cmd(cmd, O_FRAG, 0, (set & 0xff) | ( (clear & 0xff) << 8)); break; } case TOK_LAYER2: fill_cmd(cmd, O_LAYER2, 0, 0); break; case TOK_XMIT: case TOK_RECV: case TOK_VIA: NEED1("recv, xmit, via require interface name" " or address"); - fill_iface((ipfw_insn_if *)cmd, av[0], cblen, tstate); + fill_iface(insntod(cmd, if), av[0], cblen, tstate); av++; if (F_LEN(cmd) == 0) /* not a valid address */ break; if (i == TOK_XMIT) cmd->opcode = O_XMIT; else if (i == TOK_RECV) cmd->opcode = O_RECV; else if (i == TOK_VIA) cmd->opcode = O_VIA; break; case TOK_ICMPTYPES: NEED1("icmptypes requires list of types"); fill_icmptypes((ipfw_insn_u32 *)cmd, *av); av++; break; case TOK_ICMP6TYPES: NEED1("icmptypes requires list of types"); fill_icmp6types((ipfw_insn_icmp6 *)cmd, *av, cblen); av++; break; case TOK_IPTTL: NEED1("ipttl requires TTL"); if (strpbrk(*av, "-,")) { if (!add_ports(cmd, *av, 0, O_IPTTL, cblen)) errx(EX_DATAERR, "invalid ipttl %s", *av); } else fill_cmd(cmd, O_IPTTL, 0, strtoul(*av, NULL, 0)); av++; break; case TOK_IPID: NEED1("ipid requires id"); if (strpbrk(*av, "-,")) { if (!add_ports(cmd, *av, 0, O_IPID, cblen)) errx(EX_DATAERR, "invalid ipid %s", *av); } else fill_cmd(cmd, O_IPID, 0, strtoul(*av, NULL, 0)); av++; break; case TOK_IPLEN: NEED1("iplen requires length"); if (strpbrk(*av, "-,")) { if (!add_ports(cmd, *av, 0, O_IPLEN, cblen)) errx(EX_DATAERR, "invalid ip len %s", *av); } else fill_cmd(cmd, O_IPLEN, 0, strtoul(*av, NULL, 0)); av++; break; case TOK_IPVER: NEED1("ipver requires version"); fill_cmd(cmd, O_IPVER, 0, strtoul(*av, NULL, 0)); av++; break; case TOK_IPPRECEDENCE: NEED1("ipprecedence requires value"); fill_cmd(cmd, O_IPPRECEDENCE, 0, (strtoul(*av, NULL, 0) & 7) << 5); av++; break; case TOK_DSCP: NEED1("missing DSCP code"); fill_dscp(cmd, *av, cblen); av++; break; case TOK_IPOPTS: NEED1("missing argument for ipoptions"); fill_flags_cmd(cmd, O_IPOPT, f_ipopts, *av); av++; break; case TOK_IPTOS: NEED1("missing 
argument for iptos"); fill_flags_cmd(cmd, O_IPTOS, f_iptos, *av); av++; break; case TOK_UID: NEED1("uid requires argument"); { char *end; uid_t uid; struct passwd *pwd; cmd->opcode = O_UID; uid = strtoul(*av, &end, 0); pwd = (*end == '\0') ? getpwuid(uid) : getpwnam(*av); if (pwd == NULL) errx(EX_DATAERR, "uid \"%s\" nonexistent", *av); cmd32->d[0] = pwd->pw_uid; cmd->len |= F_INSN_SIZE(ipfw_insn_u32); av++; } break; case TOK_GID: NEED1("gid requires argument"); { char *end; gid_t gid; struct group *grp; cmd->opcode = O_GID; gid = strtoul(*av, &end, 0); grp = (*end == '\0') ? getgrgid(gid) : getgrnam(*av); if (grp == NULL) errx(EX_DATAERR, "gid \"%s\" nonexistent", *av); cmd32->d[0] = grp->gr_gid; cmd->len |= F_INSN_SIZE(ipfw_insn_u32); av++; } break; case TOK_JAIL: NEED1("jail requires argument"); { char *end; int jid; cmd->opcode = O_JAIL; /* * If av is a number, then we'll just pass it as-is. If * it's a name, try to resolve that to a jid. * * We save the jail_getid(3) call for a fallback because * it entails an unconditional trip to the kernel to * either validate a jid or resolve a name to a jid. * This specific token doesn't currently require a * jid to be an active jail, so we save a transition * by simply using a number that we're given. 
*/ jid = strtoul(*av, &end, 10); if (*end != '\0') { jid = jail_getid(*av); if (jid < 0) errx(EX_DATAERR, "%s", jail_errmsg); } cmd32->d[0] = (uint32_t)jid; cmd->len |= F_INSN_SIZE(ipfw_insn_u32); av++; } break; case TOK_ESTAB: fill_cmd(cmd, O_ESTAB, 0, 0); break; case TOK_SETUP: fill_cmd(cmd, O_TCPFLAGS, 0, (TH_SYN) | ( (TH_ACK) & 0xff) <<8 ); break; case TOK_TCPDATALEN: NEED1("tcpdatalen requires length"); if (strpbrk(*av, "-,")) { if (!add_ports(cmd, *av, 0, O_TCPDATALEN, cblen)) errx(EX_DATAERR, "invalid tcpdata len %s", *av); } else fill_cmd(cmd, O_TCPDATALEN, 0, strtoul(*av, NULL, 0)); av++; break; case TOK_TCPOPTS: NEED1("missing argument for tcpoptions"); fill_flags_cmd(cmd, O_TCPOPTS, f_tcpopts, *av); av++; break; case TOK_TCPSEQ: case TOK_TCPACK: NEED1("tcpseq/tcpack requires argument"); cmd->len = F_INSN_SIZE(ipfw_insn_u32); cmd->opcode = (i == TOK_TCPSEQ) ? O_TCPSEQ : O_TCPACK; cmd32->d[0] = htonl(strtoul(*av, NULL, 0)); av++; break; case TOK_TCPMSS: case TOK_TCPWIN: NEED1("tcpmss/tcpwin requires size"); if (strpbrk(*av, "-,")) { if (add_ports(cmd, *av, 0, i == TOK_TCPWIN ? O_TCPWIN : O_TCPMSS, cblen) == NULL) errx(EX_DATAERR, "invalid %s size %s", s, *av); } else fill_cmd(cmd, i == TOK_TCPWIN ? 
O_TCPWIN : O_TCPMSS, 0, strtoul(*av, NULL, 0)); av++; break; case TOK_TCPFLAGS: NEED1("missing argument for tcpflags"); cmd->opcode = O_TCPFLAGS; fill_flags_cmd(cmd, O_TCPFLAGS, f_tcpflags, *av); av++; break; case TOK_KEEPSTATE: case TOK_RECORDSTATE: { - uint16_t uidx; + uint32_t uidx; if (open_par) errx(EX_USAGE, "keep-state or record-state cannot be part " "of an or block"); if (have_state) errx(EX_USAGE, "only one of keep-state, record-state, " " limit and set-limit is allowed"); if (*av != NULL && *av[0] == ':') { if (state_check_name(*av + 1) != 0) errx(EX_DATAERR, "Invalid state name %s", *av); uidx = pack_object(tstate, *av + 1, IPFW_TLV_STATE_NAME); av++; } else uidx = pack_object(tstate, default_state_name, IPFW_TLV_STATE_NAME); have_state = cmd; have_rstate = i == TOK_RECORDSTATE; - fill_cmd(cmd, O_KEEP_STATE, 0, uidx); + cmd->opcode = O_KEEP_STATE; + cmd->len = F_INSN_SIZE(ipfw_insn_kidx); + CHECK_CMDLEN; + insntod(have_state, kidx)->kidx = uidx; break; } case TOK_LIMIT: case TOK_SETLIMIT: { - ipfw_insn_limit *c = (ipfw_insn_limit *)cmd; + ipfw_insn_limit *c = insntod(cmd, limit); int val; if (open_par) errx(EX_USAGE, "limit or set-limit cannot be part of an or block"); if (have_state) errx(EX_USAGE, "only one of keep-state, record-state, " " limit and set-limit is allowed"); have_state = cmd; have_rstate = i == TOK_SETLIMIT; + cmd->opcode = O_LIMIT; cmd->len = F_INSN_SIZE(ipfw_insn_limit); CHECK_CMDLEN; - cmd->opcode = O_LIMIT; - c->limit_mask = c->conn_limit = 0; + c->limit_mask = c->conn_limit = c->kidx = 0; while ( av[0] != NULL ) { if ((val = match_token(limit_masks, *av)) <= 0) break; c->limit_mask |= val; av++; } if (c->limit_mask == 0) errx(EX_USAGE, "limit: missing limit mask"); GET_UINT_ARG(c->conn_limit, IPFW_ARG_MIN, IPFW_ARG_MAX, TOK_LIMIT, rule_options); av++; if (*av != NULL && *av[0] == ':') { if (state_check_name(*av + 1) != 0) errx(EX_DATAERR, "Invalid state name %s", *av); - cmd->arg1 = pack_object(tstate, *av + 1, + c->kidx = 
pack_object(tstate, *av + 1, IPFW_TLV_STATE_NAME); av++; } else - cmd->arg1 = pack_object(tstate, + c->kidx = pack_object(tstate, default_state_name, IPFW_TLV_STATE_NAME); break; } case TOK_PROTO: NEED1("missing protocol"); if (add_proto(cmd, *av, &proto)) { av++; } else errx(EX_DATAERR, "invalid protocol ``%s''", *av); break; case TOK_SRCIP: NEED1("missing source IP"); if (add_srcip(cmd, *av, cblen, tstate)) { av++; } break; case TOK_DSTIP: NEED1("missing destination IP"); if (add_dstip(cmd, *av, cblen, tstate)) { av++; } break; case TOK_SRCIP6: NEED1("missing source IP6"); if (add_srcip6(cmd, *av, cblen, tstate)) { av++; } break; case TOK_DSTIP6: NEED1("missing destination IP6"); if (add_dstip6(cmd, *av, cblen, tstate)) { av++; } break; case TOK_SRCMAC: NEED1("missing source MAC"); if (add_srcmac(cmd, *av, tstate)) { av++; } break; case TOK_DSTMAC: NEED1("missing destination MAC"); if (add_dstmac(cmd, *av, tstate)) { av++; } break; case TOK_SRCPORT: NEED1("missing source port"); if (_substrcmp(*av, "any") == 0 || add_ports(cmd, *av, proto, O_IP_SRCPORT, cblen)) { av++; } else errx(EX_DATAERR, "invalid source port %s", *av); break; case TOK_DSTPORT: NEED1("missing destination port"); if (_substrcmp(*av, "any") == 0 || add_ports(cmd, *av, proto, O_IP_DSTPORT, cblen)) { av++; } else errx(EX_DATAERR, "invalid destination port %s", *av); break; case TOK_MAC: if (add_mac(cmd, av, cblen)) av += 2; break; case TOK_MACTYPE: NEED1("missing mac type"); if (!add_mactype(cmd, *av, cblen)) errx(EX_DATAERR, "invalid mac type %s", *av); av++; break; case TOK_VERREVPATH: fill_cmd(cmd, O_VERREVPATH, 0, 0); break; case TOK_VERSRCREACH: fill_cmd(cmd, O_VERSRCREACH, 0, 0); break; case TOK_ANTISPOOF: fill_cmd(cmd, O_ANTISPOOF, 0, 0); break; case TOK_IPSEC: fill_cmd(cmd, O_IPSEC, 0, 0); break; case TOK_IPV6: fill_cmd(cmd, O_IP6, 0, 0); break; case TOK_IPV4: fill_cmd(cmd, O_IP4, 0, 0); break; case TOK_EXT6HDR: NEED1("missing extension header"); fill_ext6hdr( cmd, *av ); av++; break; 
case TOK_FLOWID: if (proto != IPPROTO_IPV6 ) errx( EX_USAGE, "flow-id filter is active " "only for ipv6 protocol\n"); fill_flow6( (ipfw_insn_u32 *) cmd, *av, cblen); av++; break; case TOK_COMMENT: fill_comment(cmd, av, cblen); av[0]=NULL; break; case TOK_TAGGED: if (av[0] && strpbrk(*av, "-,")) { if (!add_ports(cmd, *av, 0, O_TAGGED, cblen)) errx(EX_DATAERR, "tagged: invalid tag" " list: %s", *av); } else { uint16_t tag; GET_UINT_ARG(tag, IPFW_ARG_MIN, IPFW_ARG_MAX, TOK_TAGGED, rule_options); fill_cmd(cmd, O_TAGGED, 0, tag); } av++; break; case TOK_FIB: NEED1("fib requires fib number"); fill_cmd(cmd, O_FIB, 0, strtoul(*av, NULL, 0)); av++; break; case TOK_SOCKARG: fill_cmd(cmd, O_SOCKARG, 0, 0); break; case TOK_LOOKUP: { - ipfw_insn_u32 *c = (ipfw_insn_u32 *)cmd; + /* optional mask for some LOOKUP types */ + ipfw_insn_table *c = insntod(cmd, table); + char *lkey; if (!av[0] || !av[1]) - errx(EX_USAGE, "format: lookup argument tablenum"); + errx(EX_USAGE, + "format: lookup argument tablenum"); cmd->opcode = O_IP_DST_LOOKUP; - cmd->len |= F_INSN_SIZE(ipfw_insn) + 2; - i = match_token(lookup_keys, *av); - if (i == -1) - errx(EX_USAGE, "format: cannot lookup on %s", *av); - __PAST_END(c->d, 1) = i; - av++; - if ((i = pack_table(tstate, *av)) == 0) - errx(EX_DATAERR, "Invalid table name: %s", *av); + lkey = strsep(av, ":"); + i = match_token(lookup_keys, lkey); + if (i == -1) + errx(EX_USAGE, + "format: cannot lookup on %s", lkey); + /* masked lookup key */ + if (*av != NULL) { + switch (i) { + case LOOKUP_DST_PORT: + case LOOKUP_SRC_PORT: + case LOOKUP_UID: + case LOOKUP_JAIL: + case LOOKUP_DSCP: + case LOOKUP_MARK: + case LOOKUP_RULENUM: + break; + default: + errx(EX_USAGE, + "masked lookup is not supported " + "for %s", lkey); + } + cmd->len |= F_INSN_SIZE(ipfw_insn_table); + c->value = strtoul(*av, NULL, 0); + if (c->value == 0) + errx(EX_USAGE, + "all-zeroes bitmask for lookup " + "is meaningless"); + } else { + cmd->len |= F_INSN_SIZE(ipfw_insn_kidx); + } + 
CHECK_CMDLEN; - cmd->arg1 = i; + IPFW_SET_LOOKUP_TYPE(cmd, i); av++; - } + c->kidx = pack_table(tstate, *av); + if (c->kidx == 0) + errx(EX_DATAERR, + "Invalid table name: %s", *av); + av++; + } break; case TOK_FLOW: NEED1("missing table name"); if (strncmp(*av, "table(", 6) != 0) errx(EX_DATAERR, "enclose table name into \"table()\""); fill_table(cmd, *av, O_IP_FLOW_LOOKUP, tstate); av++; break; case TOK_SKIPACTION: if (have_skipcmd) errx(EX_USAGE, "only one defer-action " "is allowed"); have_skipcmd = cmd; fill_cmd(cmd, O_SKIP_ACTION, 0, 0); break; case TOK_MARK: NEED1("missing mark value:mask"); fill_mark(cmd, *av, cblen); av++; break; default: errx(EX_USAGE, "unrecognised option [%d] %s\n", i, s); } if (F_LEN(cmd) > 0) { /* prepare to advance */ prev = cmd; cmd = next_cmd(cmd, &cblen); } } done: if (!have_state && have_skipcmd) warnx("Rule contains \"defer-immediate-action\" " "and doesn't contain any state-related options."); /* * Now copy stuff into the rule. * If we have a keep-state option, the first instruction * must be a PROBE_STATE (which is generated here). * If we have a LOG option, it was stored as the first command, * and now must be moved to the top of the action part. */ dst = (ipfw_insn *)rule->cmd; /* * First thing to write into the command stream is the match probability. 
*/ if (match_prob != 1) { /* 1 means always match */ dst->opcode = O_PROB; dst->len = 2; *((int32_t *)(dst+1)) = (int32_t)(match_prob * 0x7fffffff); dst += dst->len; } /* * generate O_PROBE_STATE if necessary */ if (have_state && have_state->opcode != O_CHECK_STATE && !have_rstate) { - fill_cmd(dst, O_PROBE_STATE, 0, have_state->arg1); + dst->opcode = O_PROBE_STATE; + dst->len = F_INSN_SIZE(ipfw_insn_kidx); + insntod(dst, kidx)->kidx = insntod(have_state, kidx)->kidx; dst = next_cmd(dst, &rblen); } /* * copy all commands but O_LOG, O_KEEP_STATE, O_LIMIT, O_ALTQ, O_TAG, * O_SKIP_ACTION */ for (src = (ipfw_insn *)cmdbuf; src != cmd; src += i) { i = F_LEN(src); CHECK_RBUFLEN(i); switch (src->opcode) { case O_LOG: case O_KEEP_STATE: case O_LIMIT: case O_ALTQ: case O_TAG: case O_SKIP_ACTION: break; default: bcopy(src, dst, i * sizeof(uint32_t)); dst += i; } } /* * put back the have_state command as last opcode */ if (have_state && have_state->opcode != O_CHECK_STATE) { i = F_LEN(have_state); CHECK_RBUFLEN(i); bcopy(have_state, dst, i * sizeof(uint32_t)); dst += i; } /* * put back the have_skipcmd command as very last opcode */ if (have_skipcmd) { i = F_LEN(have_skipcmd); CHECK_RBUFLEN(i); bcopy(have_skipcmd, dst, i * sizeof(uint32_t)); dst += i; } /* * start action section */ rule->act_ofs = dst - rule->cmd; /* put back O_LOG, O_ALTQ, O_TAG if necessary */ if (have_log) { i = F_LEN(have_log); CHECK_RBUFLEN(i); bcopy(have_log, dst, i * sizeof(uint32_t)); dst += i; } if (have_altq) { i = F_LEN(have_altq); CHECK_RBUFLEN(i); bcopy(have_altq, dst, i * sizeof(uint32_t)); dst += i; } if (have_tag) { i = F_LEN(have_tag); CHECK_RBUFLEN(i); bcopy(have_tag, dst, i * sizeof(uint32_t)); dst += i; } /* * copy all other actions */ for (src = (ipfw_insn *)actbuf; src != action; src += i) { i = F_LEN(src); CHECK_RBUFLEN(i); bcopy(src, dst, i * sizeof(uint32_t)); dst += i; } rule->cmd_len = (uint32_t *)dst - (uint32_t *)(rule->cmd); *rbufsize = (char *)dst - (char *)rule; } static int 
compare_ntlv(const void *_a, const void *_b) { const ipfw_obj_ntlv *a, *b; a = (const ipfw_obj_ntlv *)_a; b = (const ipfw_obj_ntlv *)_b; if (a->set < b->set) return (-1); else if (a->set > b->set) return (1); if (a->idx < b->idx) return (-1); else if (a->idx > b->idx) return (1); if (a->head.type < b->head.type) return (-1); else if (a->head.type > b->head.type) return (1); return (0); } /* * Provide kernel with sorted list of referenced objects */ static void object_sort_ctlv(ipfw_obj_ctlv *ctlv) { qsort(ctlv + 1, ctlv->count, ctlv->objsize, compare_ntlv); } struct object_kt { - uint16_t uidx; + uint32_t uidx; uint16_t type; }; static int compare_object_kntlv(const void *k, const void *v) { const ipfw_obj_ntlv *ntlv; struct object_kt key; key = *((const struct object_kt *)k); ntlv = (const ipfw_obj_ntlv *)v; if (key.uidx < ntlv->idx) return (-1); else if (key.uidx > ntlv->idx) return (1); if (key.type < ntlv->head.type) return (-1); else if (key.type > ntlv->head.type) return (1); return (0); } /* * Finds object name in @ctlv by @idx and @type. * Uses the following facts: * 1) All TLVs are the same size * 2) Kernel implementation provides already sorted list. * * Returns table name or NULL. */ static char * -object_search_ctlv(ipfw_obj_ctlv *ctlv, uint16_t idx, uint16_t type) +object_search_ctlv(ipfw_obj_ctlv *ctlv, uint32_t idx, uint16_t type) { ipfw_obj_ntlv *ntlv; struct object_kt key; key.uidx = idx; key.type = type; ntlv = bsearch(&key, (ctlv + 1), ctlv->count, ctlv->objsize, compare_object_kntlv); if (ntlv != NULL) return (ntlv->name); return (NULL); } static char * -table_search_ctlv(ipfw_obj_ctlv *ctlv, uint16_t idx) +table_search_ctlv(ipfw_obj_ctlv *ctlv, uint32_t idx) { return (object_search_ctlv(ctlv, idx, IPFW_TLV_TBL_NAME)); } /* * Adds one or more rules to ipfw chain. 
* Data layout: * Request: * [ * ip_fw3_opheader * [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional *1) * [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST) [ ip_fw_rule ip_fw_insn ] x N ] (*2) (*3) * ] * Reply: * [ * ip_fw3_opheader * [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional) * [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST) [ ip_fw_rule ip_fw_insn ] x N ] * ] * * Rules in reply are modified to store their actual ruleset number. * * (*1) TLVs inside IPFW_TLV_TBL_LIST needs to be sorted ascending * according to their idx field and there has to be no duplicates. * (*2) Numbered rules inside IPFW_TLV_RULE_LIST needs to be sorted ascending. * (*3) Each ip_fw structure needs to be aligned to u64 boundary. */ void ipfw_add(char *av[]) { uint32_t rulebuf[1024]; int rbufsize, default_off, tlen, rlen; size_t sz; struct tidx ts; struct ip_fw_rule *rule; caddr_t tbuf; ip_fw3_opheader *op3; ipfw_obj_ctlv *ctlv, *tstate; rbufsize = sizeof(rulebuf); memset(rulebuf, 0, rbufsize); memset(&ts, 0, sizeof(ts)); /* Optimize case with no tables */ default_off = sizeof(ipfw_obj_ctlv) + sizeof(ip_fw3_opheader); op3 = (ip_fw3_opheader *)rulebuf; ctlv = (ipfw_obj_ctlv *)(op3 + 1); rule = (struct ip_fw_rule *)(ctlv + 1); rbufsize -= default_off; compile_rule(av, (uint32_t *)rule, &rbufsize, &ts); /* Align rule size to u64 boundary */ rlen = roundup2(rbufsize, sizeof(uint64_t)); tbuf = NULL; sz = 0; tstate = NULL; if (ts.count != 0) { /* Some tables. 
We have to alloc more data */ tlen = ts.count * sizeof(ipfw_obj_ntlv); sz = default_off + sizeof(ipfw_obj_ctlv) + tlen + rlen; if ((tbuf = calloc(1, sz)) == NULL) err(EX_UNAVAILABLE, "malloc() failed for IP_FW_ADD"); op3 = (ip_fw3_opheader *)tbuf; /* Tables first */ ctlv = (ipfw_obj_ctlv *)(op3 + 1); ctlv->head.type = IPFW_TLV_TBLNAME_LIST; ctlv->head.length = sizeof(ipfw_obj_ctlv) + tlen; ctlv->count = ts.count; ctlv->objsize = sizeof(ipfw_obj_ntlv); memcpy(ctlv + 1, ts.idx, tlen); object_sort_ctlv(ctlv); tstate = ctlv; /* Rule next */ ctlv = (ipfw_obj_ctlv *)((caddr_t)ctlv + ctlv->head.length); ctlv->head.type = IPFW_TLV_RULE_LIST; ctlv->head.length = sizeof(ipfw_obj_ctlv) + rlen; ctlv->count = 1; memcpy(ctlv + 1, rule, rbufsize); } else { /* Simply add header */ sz = rlen + default_off; memset(ctlv, 0, sizeof(*ctlv)); ctlv->head.type = IPFW_TLV_RULE_LIST; ctlv->head.length = sizeof(ipfw_obj_ctlv) + rlen; ctlv->count = 1; } if (do_get3(IP_FW_XADD, op3, &sz) != 0) err(EX_UNAVAILABLE, "getsockopt(%s)", "IP_FW_XADD"); if (!g_co.do_quiet) { struct format_opts sfo; struct buf_pr bp; memset(&sfo, 0, sizeof(sfo)); sfo.tstate = tstate; sfo.set_mask = (uint32_t)(-1); bp_alloc(&bp, 4096); show_static_rule(&g_co, &sfo, &bp, rule, NULL); printf("%s", bp.buf); bp_free(&bp); } if (tbuf != NULL) free(tbuf); if (ts.idx != NULL) free(ts.idx); } /* * clear the counters or the log counters. * optname has the following values: * 0 (zero both counters and logging) * 1 (zero logging only) */ void ipfw_zero(int ac, char *av[], int optname) { ipfw_range_tlv rt; char const *errstr; char const *name = optname ? "RESETLOG" : "ZERO"; uint32_t arg; int failed = EX_OK; optname = optname ? IP_FW_XRESETLOG : IP_FW_XZERO; av++; ac--; if (ac == 0) { /* clear all entries */ memset(&rt, 0, sizeof(rt)); rt.flags = IPFW_RCFLAG_ALL; if (do_range_cmd(optname, &rt) < 0) err(EX_UNAVAILABLE, "setsockopt(IP_FW_X%s)", name); if (!g_co.do_quiet) printf("%s.\n", optname == IP_FW_XZERO ? 
"Accounting cleared":"Logging counts reset"); return; } while (ac) { /* Rule number */ if (isdigit(**av)) { arg = strtonum(*av, 0, 0xffff, &errstr); if (errstr) errx(EX_DATAERR, "invalid rule number %s\n", *av); memset(&rt, 0, sizeof(rt)); rt.start_rule = arg; rt.end_rule = arg; rt.flags |= IPFW_RCFLAG_RANGE; if (g_co.use_set != 0) { rt.set = g_co.use_set - 1; rt.flags |= IPFW_RCFLAG_SET; } if (do_range_cmd(optname, &rt) != 0) { warn("rule %u: setsockopt(IP_FW_X%s)", arg, name); failed = EX_UNAVAILABLE; } else if (rt.new_set == 0) { printf("Entry %d not found\n", arg); failed = EX_UNAVAILABLE; } else if (!g_co.do_quiet) printf("Entry %d %s.\n", arg, optname == IP_FW_XZERO ? "cleared" : "logging count reset"); } else { errx(EX_USAGE, "invalid rule number ``%s''", *av); } av++; ac--; } if (failed != EX_OK) exit(failed); } void ipfw_flush(int force) { ipfw_range_tlv rt; if (!force && !g_co.do_quiet) { /* need to ask user */ int c; printf("Are you sure? [yn] "); fflush(stdout); do { c = toupper(getc(stdin)); while (c != '\n' && getc(stdin) != '\n') if (feof(stdin)) return; /* and do not flush */ } while (c != 'Y' && c != 'N'); printf("\n"); if (c == 'N') /* user said no */ return; } if (g_co.do_pipe) { dummynet_flush(); return; } /* `ipfw set N flush` - is the same that `ipfw delete set N` */ memset(&rt, 0, sizeof(rt)); if (g_co.use_set != 0) { rt.set = g_co.use_set - 1; rt.flags = IPFW_RCFLAG_SET; } else rt.flags = IPFW_RCFLAG_ALL; if (do_range_cmd(IP_FW_XDEL, &rt) != 0) err(EX_UNAVAILABLE, "setsockopt(IP_FW_XDEL)"); if (!g_co.do_quiet) printf("Flushed all %s.\n", g_co.do_pipe ? 
"pipes" : "rules"); } static struct _s_x intcmds[] = { { "talist", TOK_TALIST }, { "iflist", TOK_IFLIST }, { "olist", TOK_OLIST }, { "vlist", TOK_VLIST }, { NULL, 0 } }; static struct _s_x otypes[] = { { "EACTION", IPFW_TLV_EACTION }, { "DYNSTATE", IPFW_TLV_STATE_NAME }, { NULL, 0 } }; static const char* lookup_eaction_name(ipfw_obj_ntlv *ntlv, int cnt, uint16_t type) { const char *name; int i; name = NULL; for (i = 0; i < cnt; i++) { if (ntlv[i].head.type != IPFW_TLV_EACTION) continue; if (IPFW_TLV_EACTION_NAME(ntlv[i].idx) != type) continue; name = ntlv[i].name; break; } return (name); } static void ipfw_list_objects(int ac __unused, char *av[] __unused) { ipfw_obj_lheader req, *olh; ipfw_obj_ntlv *ntlv; const char *name; size_t sz; uint32_t i; memset(&req, 0, sizeof(req)); sz = sizeof(req); if (do_get3(IP_FW_DUMP_SRVOBJECTS, &req.opheader, &sz) != 0) if (errno != ENOMEM) return; sz = req.size; if ((olh = calloc(1, sz)) == NULL) return; olh->size = sz; if (do_get3(IP_FW_DUMP_SRVOBJECTS, &olh->opheader, &sz) != 0) { free(olh); return; } if (olh->count > 0) printf("Objects list:\n"); else printf("There are no objects\n"); ntlv = (ipfw_obj_ntlv *)(olh + 1); for (i = 0; i < olh->count; i++) { name = match_value(otypes, ntlv->head.type); if (name == NULL) name = lookup_eaction_name( (ipfw_obj_ntlv *)(olh + 1), olh->count, ntlv->head.type); if (name == NULL) printf(" kidx: %4d\ttype: %10d\tname: %s\n", ntlv->idx, ntlv->head.type, ntlv->name); else printf(" kidx: %4d\ttype: %10s\tname: %s\n", ntlv->idx, name, ntlv->name); ntlv++; } free(olh); } void ipfw_internal_handler(int ac, char *av[]) { int tcmd; ac--; av++; NEED1("internal cmd required"); if ((tcmd = match_token(intcmds, *av)) == -1) errx(EX_USAGE, "invalid internal sub-cmd: %s", *av); switch (tcmd) { case TOK_IFLIST: ipfw_list_tifaces(); break; case TOK_TALIST: ipfw_list_ta(ac, av); break; case TOK_OLIST: ipfw_list_objects(ac, av); break; case TOK_VLIST: ipfw_list_values(ac, av); break; } } static int 
ipfw_get_tracked_ifaces(ipfw_obj_lheader **polh) { ipfw_obj_lheader req, *olh; size_t sz; memset(&req, 0, sizeof(req)); sz = sizeof(req); if (do_get3(IP_FW_XIFLIST, &req.opheader, &sz) != 0) { if (errno != ENOMEM) return (errno); } sz = req.size; if ((olh = calloc(1, sz)) == NULL) return (ENOMEM); olh->size = sz; if (do_get3(IP_FW_XIFLIST, &olh->opheader, &sz) != 0) { free(olh); return (errno); } *polh = olh; return (0); } static int ifinfo_cmp(const void *a, const void *b) { const ipfw_iface_info *ia, *ib; ia = (const ipfw_iface_info *)a; ib = (const ipfw_iface_info *)b; return (stringnum_cmp(ia->ifname, ib->ifname)); } /* * Retrieves table list from kernel, * optionally sorts it and calls requested function for each table. * Returns 0 on success. */ static void ipfw_list_tifaces(void) { ipfw_obj_lheader *olh = NULL; ipfw_iface_info *info; uint32_t i; int error; if ((error = ipfw_get_tracked_ifaces(&olh)) != 0) err(EX_OSERR, "Unable to request ipfw tracked interface list"); qsort(olh + 1, olh->count, olh->objsize, ifinfo_cmp); info = (ipfw_iface_info *)(olh + 1); for (i = 0; i < olh->count; i++) { if (info->flags & IPFW_IFFLAG_RESOLVED) printf("%s ifindex: %d refcount: %u changes: %u\n", info->ifname, info->ifindex, info->refcnt, info->gencnt); else printf("%s ifindex: unresolved refcount: %u changes: %u\n", info->ifname, info->refcnt, info->gencnt); info = (ipfw_iface_info *)((caddr_t)info + olh->objsize); } free(olh); } diff --git a/sbin/ipfw/ipfw2.h b/sbin/ipfw/ipfw2.h index 0c0bf32e94d6..788e9fe365f3 100644 --- a/sbin/ipfw/ipfw2.h +++ b/sbin/ipfw/ipfw2.h @@ -1,467 +1,469 @@ /*- * Copyright (c) 2002-2003 Luigi Rizzo * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp * Copyright (c) 1994 Ugen J.S.Antsilevich * * Idea and grammar partially left from: * Copyright (c) 1993 Daniel Boulet * * Redistribution and use in source forms, with and without modification, * are permitted provided that this entire comment appears intact. 
* * Redistribution in binary form may occur without any restrictions. * Obviously, it would be nice if you gave credit where credit is due * but requiring it would be too onerous. * * This software is provided ``AS IS'' without any warranties of any kind. * * NEW command line interface for IP firewall facility */ enum cmdline_prog { cmdline_prog_ipfw, cmdline_prog_dnctl }; /* * Options that can be set on the command line. * When reading commands from a file, a subset of the options can also * be applied globally by specifying them before the file name. * After that, each line can contain its own option that changes * the global value. * XXX The context is not restored after each line. */ struct cmdline_opts { /* boolean options: */ int do_value_as_ip; /* show table value as IP */ int do_resolv; /* try to resolve all ip to names */ int do_time; /* Show time stamps */ int do_quiet; /* Be quiet in add and flush */ int do_pipe; /* this cmd refers to a pipe/queue/sched */ int do_nat; /* this cmd refers to a nat config */ int do_compact; /* show rules in compact mode */ int do_force; /* do not ask for confirmation */ int show_sets; /* display the set each rule belongs to */ int test_only; /* only check syntax */ int comment_only; /* only print action and comment */ int verbose; /* be verbose on some commands */ int debug_only; /* output ioctl i/o on stdout */ /* The options below can have multiple values. */ int do_dynamic; /* 1 - display dynamic rules */ /* 2 - display/delete only dynamic rules */ int do_sort; /* field to sort results (0 = no) */ /* valid fields are 1 and above */ uint32_t use_set; /* work with specified set number */ /* 0 means all sets, otherwise apply to set use_set - 1 */ enum cmdline_prog prog; /* Are we ipfw or dnctl? */ }; int is_ipfw(void); enum { TIMESTAMP_NONE = 0, TIMESTAMP_STRING, TIMESTAMP_NUMERIC, }; extern struct cmdline_opts g_co; /* * _s_x is a structure that stores a string <-> token pairs, used in * various places in the parser. 
Entries are stored in arrays, * with an entry with s=NULL as terminator. * The search routines are match_token() and match_value(). * Often, an element with x=0 contains an error string. * */ struct _s_x { char const *s; int x; }; extern struct _s_x f_ipdscp[]; enum tokens { TOK_NULL=0, TOK_OR, TOK_NOT, TOK_STARTBRACE, TOK_ENDBRACE, TOK_ABORT6, TOK_ABORT, TOK_ACCEPT, TOK_COUNT, TOK_EACTION, TOK_PIPE, TOK_LINK, TOK_QUEUE, TOK_FLOWSET, TOK_SCHED, TOK_DIVERT, TOK_TEE, TOK_NETGRAPH, TOK_NGTEE, TOK_FORWARD, TOK_SKIPTO, TOK_DENY, TOK_REJECT, TOK_RESET, TOK_UNREACH, TOK_CHECKSTATE, TOK_NAT, TOK_REASS, TOK_CALL, TOK_RETURN, TOK_ALTQ, TOK_LOG, TOK_TAG, TOK_UNTAG, TOK_TAGGED, TOK_UID, TOK_GID, TOK_JAIL, TOK_IN, TOK_LIMIT, TOK_SETLIMIT, TOK_KEEPSTATE, TOK_RECORDSTATE, TOK_LAYER2, TOK_OUT, TOK_DIVERTED, TOK_DIVERTEDLOOPBACK, TOK_DIVERTEDOUTPUT, TOK_XMIT, TOK_RECV, TOK_VIA, TOK_FRAG, TOK_IPOPTS, TOK_IPLEN, TOK_IPID, TOK_IPPRECEDENCE, TOK_DSCP, TOK_IPTOS, TOK_IPTTL, TOK_IPVER, TOK_ESTAB, TOK_SETUP, TOK_TCPDATALEN, TOK_TCPFLAGS, TOK_TCPOPTS, TOK_TCPSEQ, TOK_TCPACK, TOK_TCPMSS, TOK_TCPWIN, TOK_ICMPTYPES, TOK_MAC, TOK_MACTYPE, TOK_VERREVPATH, TOK_VERSRCREACH, TOK_ANTISPOOF, TOK_IPSEC, TOK_COMMENT, TOK_PLR, TOK_NOERROR, TOK_BUCKETS, TOK_DSTIP, TOK_SRCIP, TOK_DSTPORT, TOK_SRCPORT, TOK_DSTMAC, TOK_SRCMAC, TOK_ALL, TOK_MASK, TOK_FLOW_MASK, TOK_SCHED_MASK, TOK_BW, TOK_DELAY, TOK_PROFILE, TOK_BURST, TOK_RED, TOK_GRED, TOK_ECN, TOK_DROPTAIL, TOK_PROTO, #ifdef NEW_AQM /* AQM tokens*/ TOK_NO_ECN, TOK_CODEL, TOK_FQ_CODEL, TOK_TARGET, TOK_INTERVAL, TOK_FLOWS, TOK_QUANTUM, TOK_PIE, TOK_FQ_PIE, TOK_TUPDATE, TOK_MAX_BURST, TOK_MAX_ECNTH, TOK_ALPHA, TOK_BETA, TOK_CAPDROP, TOK_NO_CAPDROP, TOK_ONOFF, TOK_DRE, TOK_TS, TOK_DERAND, TOK_NO_DERAND, #endif /* dummynet tokens */ TOK_WEIGHT, TOK_LMAX, TOK_PRI, TOK_TYPE, TOK_SLOTSIZE, TOK_IP, TOK_IF, TOK_ALOG, TOK_DENY_INC, TOK_SAME_PORTS, TOK_UNREG_ONLY, TOK_UNREG_CGN, TOK_SKIP_GLOBAL, TOK_RESET_ADDR, TOK_ALIAS_REV, TOK_PROXY_ONLY, TOK_REDIR_ADDR, 
TOK_REDIR_PORT, TOK_REDIR_PROTO, TOK_IPV6, TOK_FLOWID, TOK_ICMP6TYPES, TOK_EXT6HDR, TOK_DSTIP6, TOK_SRCIP6, TOK_IPV4, TOK_UNREACH6, TOK_RESET6, TOK_FIB, TOK_SETFIB, TOK_LOOKUP, TOK_SOCKARG, TOK_SETDSCP, TOK_FLOW, TOK_IFLIST, /* Table tokens */ TOK_CREATE, TOK_DESTROY, TOK_LIST, TOK_INFO, TOK_DETAIL, TOK_MODIFY, TOK_FLUSH, TOK_SWAP, TOK_ADD, TOK_DEL, TOK_VALTYPE, TOK_ALGO, TOK_TALIST, TOK_ATOMIC, TOK_LOCK, TOK_UNLOCK, TOK_VLIST, TOK_OLIST, TOK_MISSING, TOK_ORFLUSH, /* NAT64 tokens */ TOK_NAT64STL, TOK_NAT64LSN, TOK_STATS, TOK_STATES, TOK_CONFIG, TOK_TABLE4, TOK_TABLE6, TOK_PREFIX4, TOK_PREFIX6, TOK_AGG_LEN, TOK_AGG_COUNT, TOK_MAX_PORTS, TOK_STATES_CHUNKS, TOK_JMAXLEN, TOK_PORT_RANGE, TOK_PORT_ALIAS, TOK_HOST_DEL_AGE, TOK_PG_DEL_AGE, TOK_TCP_SYN_AGE, TOK_TCP_CLOSE_AGE, TOK_TCP_EST_AGE, TOK_UDP_AGE, TOK_ICMP_AGE, TOK_LOGOFF, TOK_PRIVATE, TOK_PRIVATEOFF, + TOK_SWAPCONF, + TOK_SWAPCONFOFF, /* NAT64 CLAT tokens */ TOK_NAT64CLAT, TOK_PLAT_PREFIX, TOK_CLAT_PREFIX, /* NPTv6 tokens */ TOK_NPTV6, TOK_INTPREFIX, TOK_EXTPREFIX, TOK_PREFIXLEN, TOK_EXTIF, TOK_TCPSETMSS, TOK_MARK, TOK_SETMARK, TOK_SKIPACTION, TOK_UDP_EIM, }; /* * the following macro returns an error message if we run out of * arguments. 
*/ #define NEED(_p, msg) {if (!_p) errx(EX_USAGE, msg);} #define NEED1(msg) {if (!(*av)) errx(EX_USAGE, msg);} struct buf_pr { char *buf; /* allocated buffer */ char *ptr; /* current pointer */ size_t size; /* total buffer size */ size_t avail; /* available storage */ size_t needed; /* length needed */ }; int pr_u64(struct buf_pr *bp, void *pd, int width); int bp_alloc(struct buf_pr *b, size_t size); void bp_free(struct buf_pr *b); int bprintf(struct buf_pr *b, const char *format, ...); /* memory allocation support */ void *safe_calloc(size_t number, size_t size); void *safe_realloc(void *ptr, size_t size); /* string comparison functions used for historical compatibility */ int _substrcmp(const char *str1, const char* str2); int _substrcmp2(const char *str1, const char* str2, const char* str3); int stringnum_cmp(const char *a, const char *b); /* utility functions */ int match_token(struct _s_x *table, const char *string); int match_token_relaxed(struct _s_x *table, const char *string); int get_token(struct _s_x *table, const char *string, const char *errbase); char const *match_value(struct _s_x *p, int value); size_t concat_tokens(char *buf, size_t bufsize, struct _s_x *table, const char *delimiter); int fill_flags(struct _s_x *flags, char *p, char **e, uint32_t *set, uint32_t *clear); void print_flags_buffer(char *buf, size_t sz, struct _s_x *list, uint32_t set); struct _ip_fw3_opheader; int do_cmd(int optname, void *optval, uintptr_t optlen); int do_set3(int optname, struct _ip_fw3_opheader *op3, size_t optlen); int do_get3(int optname, struct _ip_fw3_opheader *op3, size_t *optlen); struct in6_addr; void n2mask(struct in6_addr *mask, int n); int contigmask(const uint8_t *p, int len); /* * Forward declarations to avoid include way too many headers. * C does not allow duplicated typedefs, so we use the base struct * that the typedef points to. 
* Should the typedefs use a different type, the compiler will * still detect the change when compiling the body of the * functions involved, so we do not lose error checking. */ struct _ipfw_insn; struct _ipfw_insn_altq; struct _ipfw_insn_u32; struct _ipfw_insn_ip6; struct _ipfw_insn_icmp6; /* * The reserved set numer. This is a constant in ip_fw.h * but we store it in a variable so other files do not depend * in that header just for one constant. */ extern int resvd_set_number; /* first-level command handlers */ void ipfw_add(char *av[]); void ipfw_show_nat(int ac, char **av); int ipfw_delete_nat(int i); void ipfw_config_pipe(int ac, char **av); void ipfw_config_nat(int ac, char **av); void ipfw_sets_handler(char *av[]); void ipfw_table_handler(int ac, char *av[]); void ipfw_sysctl_handler(char *av[], int which); void ipfw_delete(char *av[]); void ipfw_flush(int force); void ipfw_zero(int ac, char *av[], int optname); void ipfw_list(int ac, char *av[], int show_counters); void ipfw_internal_handler(int ac, char *av[]); void ipfw_nat64clat_handler(int ac, char *av[]); void ipfw_nat64lsn_handler(int ac, char *av[]); void ipfw_nat64stl_handler(int ac, char *av[]); void ipfw_nptv6_handler(int ac, char *av[]); int ipfw_check_object_name(const char *name); int ipfw_check_nat64prefix(const struct in6_addr *prefix, int length); #ifdef PF /* altq.c */ void altq_set_enabled(int enabled); u_int32_t altq_name_to_qid(const char *name); void print_altq_cmd(struct buf_pr *bp, const struct _ipfw_insn_altq *altqptr); #else #define NO_ALTQ #endif /* dummynet.c */ void dummynet_list(int ac, char *av[], int show_counters); void dummynet_flush(void); int ipfw_delete_pipe(int pipe_or_queue, int n); /* ipv6.c */ void print_unreach6_code(struct buf_pr *bp, uint16_t code); void print_ip6(struct buf_pr *bp, const struct _ipfw_insn_ip6 *cmd); void print_flow6id(struct buf_pr *bp, const struct _ipfw_insn_u32 *cmd); void print_icmp6types(struct buf_pr *bp, const struct _ipfw_insn_u32 *cmd); 
void print_ext6hdr(struct buf_pr *bp, const struct _ipfw_insn *cmd); struct tidx; struct _ipfw_insn *add_srcip6(struct _ipfw_insn *cmd, char *av, int cblen, struct tidx *tstate); struct _ipfw_insn *add_dstip6(struct _ipfw_insn *cmd, char *av, int cblen, struct tidx *tstate); void fill_flow6(struct _ipfw_insn_u32 *cmd, char *av, int cblen); uint16_t get_unreach6_code(const char *str); void fill_icmp6types(struct _ipfw_insn_icmp6 *cmd, char *av, int cblen); int fill_ext6hdr(struct _ipfw_insn *cmd, char *av); /* ipfw2.c */ void bp_flush(struct buf_pr *b); void fill_table(struct _ipfw_insn *cmd, char *av, uint8_t opcode, struct tidx *tstate); /* tables.c */ struct _ipfw_obj_ctlv; struct _ipfw_obj_ntlv; int table_check_name(const char *tablename); void ipfw_list_ta(int ac, char *av[]); void ipfw_list_values(int ac, char *av[]); void table_fill_ntlv(struct _ipfw_obj_ntlv *ntlv, const char *name, - uint8_t set, uint16_t uidx); + uint8_t set, uint32_t uidx); diff --git a/sbin/ipfw/nat64lsn.c b/sbin/ipfw/nat64lsn.c index 0a3e0415d459..fe3ff8e0fdb8 100644 --- a/sbin/ipfw/nat64lsn.c +++ b/sbin/ipfw/nat64lsn.c @@ -1,885 +1,901 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2015-2019 Yandex LLC * Copyright (c) 2015-2016 Alexander V. Chernikov * Copyright (c) 2015-2019 Andrey V. Elsukov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include "ipfw2.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void nat64lsn_fill_ntlv(ipfw_obj_ntlv *ntlv, const char *name, uint8_t set); typedef int (nat64lsn_cb_t)(ipfw_nat64lsn_cfg *cfg, const char *name, uint8_t set); static int nat64lsn_foreach(nat64lsn_cb_t *f, const char *name, uint8_t set, int sort); static void nat64lsn_create(const char *name, uint8_t set, int ac, char **av); static void nat64lsn_config(const char *name, uint8_t set, int ac, char **av); static void nat64lsn_destroy(const char *name, uint8_t set); static void nat64lsn_stats(const char *name, uint8_t set); static void nat64lsn_reset_stats(const char *name, uint8_t set); static int nat64lsn_show_cb(ipfw_nat64lsn_cfg *cfg, const char *name, uint8_t set); static int nat64lsn_destroy_cb(ipfw_nat64lsn_cfg *cfg, const char *name, uint8_t set); static int nat64lsn_states_cb(ipfw_nat64lsn_cfg *cfg, const char *name, uint8_t set); static struct _s_x nat64cmds[] = { { "create", TOK_CREATE }, { "config", TOK_CONFIG }, { "destroy", TOK_DESTROY }, { "list", TOK_LIST }, { "show", TOK_LIST }, { "stats", TOK_STATS }, { NULL, 0 } }; static uint64_t nat64lsn_print_states(void *buf) { 
char s[INET6_ADDRSTRLEN], a[INET_ADDRSTRLEN], f[INET_ADDRSTRLEN]; const char *proto; char sflags[4], *sf; ipfw_obj_header *oh; ipfw_obj_data *od; ipfw_nat64lsn_stg_v1 *stg; ipfw_nat64lsn_state_v1 *ste; uint64_t next_idx; uint32_t i; int sz; oh = (ipfw_obj_header *)buf; od = (ipfw_obj_data *)(oh + 1); stg = (ipfw_nat64lsn_stg_v1 *)(od + 1); sz = od->head.length - sizeof(*od); next_idx = 0; proto = NULL; while (sz > 0 && next_idx != 0xFF) { next_idx = stg->next.index; sz -= sizeof(*stg); if (stg->count == 0) { stg++; continue; } /* * NOTE: addresses are in network byte order, * ports are in host byte order. */ inet_ntop(AF_INET, &stg->alias4, a, sizeof(a)); ste = (ipfw_nat64lsn_state_v1 *)(stg + 1); for (i = 0; i < stg->count && sz > 0; i++) { sf = sflags; inet_ntop(AF_INET6, &ste->host6, s, sizeof(s)); inet_ntop(AF_INET, &ste->daddr, f, sizeof(f)); switch (ste->proto) { case IPPROTO_TCP: proto = "TCP"; if (ste->flags & 0x02) *sf++ = 'S'; if (ste->flags & 0x04) *sf++ = 'E'; if (ste->flags & 0x01) *sf++ = 'F'; break; case IPPROTO_UDP: proto = "UDP"; break; case IPPROTO_ICMP: proto = "ICMPv6"; break; } *sf = '\0'; switch (ste->proto) { case IPPROTO_TCP: case IPPROTO_UDP: printf("%s:%d\t%s:%d\t%s\t%s\t%d\t%s:%d\n", s, ste->sport, a, ste->aport, proto, sflags, ste->idle, f, ste->dport); break; case IPPROTO_ICMP: printf("%s\t%s\t%s\t\t%d\t%s\n", s, a, proto, ste->idle, f); break; default: printf("%s\t%s\t%d\t\t%d\t%s\n", s, a, ste->proto, ste->idle, f); } ste++; sz -= sizeof(*ste); } stg = (ipfw_nat64lsn_stg_v1 *)ste; } return (next_idx); } static int nat64lsn_states_cb(ipfw_nat64lsn_cfg *cfg, const char *name, uint8_t set) { ipfw_obj_header *oh; ipfw_obj_data *od; void *buf; uint64_t next_idx; size_t sz; if (name != NULL && strcmp(cfg->name, name) != 0) return (ESRCH); if (set != 0 && cfg->set != set) return (ESRCH); next_idx = 0; sz = 4096; if ((buf = calloc(1, sz)) == NULL) err(EX_OSERR, NULL); do { oh = (ipfw_obj_header *)buf; oh->opheader.version = 1; /* Force using 
of new API */ od = (ipfw_obj_data *)(oh + 1); nat64lsn_fill_ntlv(&oh->ntlv, cfg->name, set); od->head.type = IPFW_TLV_OBJDATA; od->head.length = sizeof(*od) + sizeof(next_idx); *((uint64_t *)(od + 1)) = next_idx; if (do_get3(IP_FW_NAT64LSN_LIST_STATES, &oh->opheader, &sz)) err(EX_OSERR, "Error reading nat64lsn states"); next_idx = nat64lsn_print_states(buf); sz = 4096; memset(buf, 0, sz); } while (next_idx != 0xFF); free(buf); return (0); } static struct _s_x nat64statscmds[] = { { "reset", TOK_RESET }, { NULL, 0 } }; static void ipfw_nat64lsn_stats_handler(const char *name, uint8_t set, int ac, char *av[]) { int tcmd; if (ac == 0) { nat64lsn_stats(name, set); return; } NEED1("nat64lsn stats needs command"); tcmd = get_token(nat64statscmds, *av, "nat64lsn stats command"); switch (tcmd) { case TOK_RESET: nat64lsn_reset_stats(name, set); } } static struct _s_x nat64listcmds[] = { { "states", TOK_STATES }, { "config", TOK_CONFIG }, { NULL, 0 } }; static void ipfw_nat64lsn_list_handler(const char *name, uint8_t set, int ac, char *av[]) { int tcmd; if (ac == 0) { nat64lsn_foreach(nat64lsn_show_cb, name, set, 1); return; } NEED1("nat64lsn list needs command"); tcmd = get_token(nat64listcmds, *av, "nat64lsn list command"); switch (tcmd) { case TOK_STATES: nat64lsn_foreach(nat64lsn_states_cb, name, set, 1); break; case TOK_CONFIG: nat64lsn_foreach(nat64lsn_show_cb, name, set, 1); } } /* * This one handles all nat64lsn-related commands * ipfw [set N] nat64lsn NAME {create | config} ... 
* ipfw [set N] nat64lsn NAME stats * ipfw [set N] nat64lsn {NAME | all} destroy * ipfw [set N] nat64lsn {NAME | all} {list | show} [config | states] */ #define nat64lsn_check_name table_check_name void ipfw_nat64lsn_handler(int ac, char *av[]) { const char *name; int tcmd; uint8_t set; if (g_co.use_set != 0) set = g_co.use_set - 1; else set = 0; ac--; av++; NEED1("nat64lsn needs instance name"); name = *av; if (nat64lsn_check_name(name) != 0) { if (strcmp(name, "all") == 0) name = NULL; else errx(EX_USAGE, "nat64lsn instance name %s is invalid", name); } ac--; av++; NEED1("nat64lsn needs command"); tcmd = get_token(nat64cmds, *av, "nat64lsn command"); if (name == NULL && tcmd != TOK_DESTROY && tcmd != TOK_LIST) errx(EX_USAGE, "nat64lsn instance name required"); switch (tcmd) { case TOK_CREATE: ac--; av++; nat64lsn_create(name, set, ac, av); break; case TOK_CONFIG: ac--; av++; nat64lsn_config(name, set, ac, av); break; case TOK_LIST: ac--; av++; ipfw_nat64lsn_list_handler(name, set, ac, av); break; case TOK_DESTROY: if (name == NULL) nat64lsn_foreach(nat64lsn_destroy_cb, NULL, set, 0); else nat64lsn_destroy(name, set); break; case TOK_STATS: ac--; av++; ipfw_nat64lsn_stats_handler(name, set, ac, av); } } static void nat64lsn_fill_ntlv(ipfw_obj_ntlv *ntlv, const char *name, uint8_t set) { ntlv->head.type = IPFW_TLV_EACTION_NAME(1); /* it doesn't matter */ ntlv->head.length = sizeof(ipfw_obj_ntlv); ntlv->idx = 1; ntlv->set = set; strlcpy(ntlv->name, name, sizeof(ntlv->name)); } static void nat64lsn_apply_mask(int af, void *prefix, uint16_t plen) { struct in6_addr mask6, *p6; struct in_addr mask4, *p4; if (af == AF_INET) { p4 = (struct in_addr *)prefix; mask4.s_addr = htonl(~((1 << (32 - plen)) - 1)); p4->s_addr &= mask4.s_addr; } else if (af == AF_INET6) { p6 = (struct in6_addr *)prefix; n2mask(&mask6, plen); APPLY_MASK(p6, &mask6); } } static void nat64lsn_parse_prefix(const char *arg, int af, void *prefix, uint16_t *plen) { char *p, *l; p = strdup(arg); if (p == 
NULL) err(EX_OSERR, NULL); if ((l = strchr(p, '/')) != NULL) *l++ = '\0'; if (l == NULL) errx(EX_USAGE, "Prefix length required"); if (inet_pton(af, p, prefix) != 1) errx(EX_USAGE, "Bad prefix: %s", p); *plen = (uint16_t)strtol(l, &l, 10); if (*l != '\0' || *plen == 0 || (af == AF_INET && *plen > 32) || (af == AF_INET6 && *plen > 96)) errx(EX_USAGE, "Bad prefix length: %s", arg); nat64lsn_apply_mask(af, prefix, *plen); free(p); } static uint32_t nat64lsn_parse_int(const char *arg, const char *desc) { char *p; uint32_t val; val = (uint32_t)strtol(arg, &p, 10); if (*p != '\0') errx(EX_USAGE, "Invalid %s value: %s\n", desc, arg); return (val); } static struct _s_x nat64newcmds[] = { { "prefix6", TOK_PREFIX6 }, { "jmaxlen", TOK_JMAXLEN }, { "prefix4", TOK_PREFIX4 }, { "host_del_age", TOK_HOST_DEL_AGE }, { "pg_del_age", TOK_PG_DEL_AGE }, { "tcp_syn_age", TOK_TCP_SYN_AGE }, { "tcp_close_age",TOK_TCP_CLOSE_AGE }, { "tcp_est_age", TOK_TCP_EST_AGE }, { "udp_age", TOK_UDP_AGE }, { "icmp_age", TOK_ICMP_AGE }, { "states_chunks",TOK_STATES_CHUNKS }, { "log", TOK_LOG }, { "-log", TOK_LOGOFF }, { "allow_private", TOK_PRIVATE }, { "-allow_private", TOK_PRIVATEOFF }, + { "swap_conf", TOK_SWAPCONF }, + { "-swap_conf", TOK_SWAPCONFOFF }, /* for compatibility with old configurations */ { "max_ports", TOK_MAX_PORTS }, /* unused */ { NULL, 0 } }; /* * Creates new nat64lsn instance * ipfw nat64lsn create * [ max_ports ] * Request: [ ipfw_obj_lheader ipfw_nat64lsn_cfg ] */ #define NAT64LSN_HAS_PREFIX4 0x01 #define NAT64LSN_HAS_PREFIX6 0x02 static void nat64lsn_create(const char *name, uint8_t set, int ac, char **av) { char buf[sizeof(ipfw_obj_lheader) + sizeof(ipfw_nat64lsn_cfg)]; ipfw_nat64lsn_cfg *cfg; ipfw_obj_lheader *olh; int tcmd, flags; char *opt; memset(&buf, 0, sizeof(buf)); olh = (ipfw_obj_lheader *)buf; cfg = (ipfw_nat64lsn_cfg *)(olh + 1); /* Some reasonable defaults */ inet_pton(AF_INET6, "64:ff9b::", &cfg->prefix6); cfg->plen6 = 96; cfg->set = set; cfg->max_ports = 
NAT64LSN_MAX_PORTS; cfg->jmaxlen = NAT64LSN_JMAXLEN; cfg->nh_delete_delay = NAT64LSN_HOST_AGE; cfg->pg_delete_delay = NAT64LSN_PG_AGE; cfg->st_syn_ttl = NAT64LSN_TCP_SYN_AGE; cfg->st_estab_ttl = NAT64LSN_TCP_EST_AGE; cfg->st_close_ttl = NAT64LSN_TCP_FIN_AGE; cfg->st_udp_ttl = NAT64LSN_UDP_AGE; cfg->st_icmp_ttl = NAT64LSN_ICMP_AGE; flags = NAT64LSN_HAS_PREFIX6; while (ac > 0) { tcmd = get_token(nat64newcmds, *av, "option"); opt = *av; ac--; av++; switch (tcmd) { case TOK_PREFIX4: NEED1("IPv4 prefix required"); nat64lsn_parse_prefix(*av, AF_INET, &cfg->prefix4, &cfg->plen4); flags |= NAT64LSN_HAS_PREFIX4; ac--; av++; break; case TOK_PREFIX6: NEED1("IPv6 prefix required"); nat64lsn_parse_prefix(*av, AF_INET6, &cfg->prefix6, &cfg->plen6); if (ipfw_check_nat64prefix(&cfg->prefix6, cfg->plen6) != 0 && !IN6_IS_ADDR_UNSPECIFIED(&cfg->prefix6)) errx(EX_USAGE, "Bad prefix6 %s", *av); ac--; av++; break; case TOK_JMAXLEN: NEED1("job queue length required"); cfg->jmaxlen = nat64lsn_parse_int(*av, opt); ac--; av++; break; case TOK_MAX_PORTS: NEED1("Max per-user ports required"); cfg->max_ports = nat64lsn_parse_int(*av, opt); ac--; av++; break; case TOK_HOST_DEL_AGE: NEED1("host delete delay required"); cfg->nh_delete_delay = (uint16_t)nat64lsn_parse_int( *av, opt); ac--; av++; break; case TOK_PG_DEL_AGE: NEED1("portgroup delete delay required"); cfg->pg_delete_delay = (uint16_t)nat64lsn_parse_int( *av, opt); ac--; av++; break; case TOK_TCP_SYN_AGE: NEED1("tcp syn age required"); cfg->st_syn_ttl = (uint16_t)nat64lsn_parse_int( *av, opt); ac--; av++; break; case TOK_TCP_CLOSE_AGE: NEED1("tcp close age required"); cfg->st_close_ttl = (uint16_t)nat64lsn_parse_int( *av, opt); ac--; av++; break; case TOK_TCP_EST_AGE: NEED1("tcp est age required"); cfg->st_estab_ttl = (uint16_t)nat64lsn_parse_int( *av, opt); ac--; av++; break; case TOK_UDP_AGE: NEED1("udp age required"); cfg->st_udp_ttl = (uint16_t)nat64lsn_parse_int( *av, opt); ac--; av++; break; case TOK_ICMP_AGE: NEED1("icmp age 
required"); cfg->st_icmp_ttl = (uint16_t)nat64lsn_parse_int( *av, opt); ac--; av++; break; case TOK_STATES_CHUNKS: NEED1("number of chunks required"); cfg->states_chunks = (uint8_t)nat64lsn_parse_int( *av, opt); ac--; av++; break; case TOK_LOG: cfg->flags |= NAT64_LOG; break; case TOK_LOGOFF: cfg->flags &= ~NAT64_LOG; break; case TOK_PRIVATE: cfg->flags |= NAT64_ALLOW_PRIVATE; break; case TOK_PRIVATEOFF: cfg->flags &= ~NAT64_ALLOW_PRIVATE; break; + case TOK_SWAPCONF: + cfg->flags |= NAT64LSN_ALLOW_SWAPCONF; + break; + case TOK_SWAPCONFOFF: + cfg->flags &= ~NAT64LSN_ALLOW_SWAPCONF; + break; } } /* Check validness */ if ((flags & NAT64LSN_HAS_PREFIX4) != NAT64LSN_HAS_PREFIX4) errx(EX_USAGE, "prefix4 required"); olh->count = 1; olh->objsize = sizeof(*cfg); olh->size = sizeof(buf); strlcpy(cfg->name, name, sizeof(cfg->name)); if (do_set3(IP_FW_NAT64LSN_CREATE, &olh->opheader, sizeof(buf)) != 0) err(EX_OSERR, "nat64lsn instance creation failed"); } /* * Configures existing nat64lsn instance * ipfw nat64lsn config * Request: [ ipfw_obj_header ipfw_nat64lsn_cfg ] */ static void nat64lsn_config(const char *name, uint8_t set, int ac, char **av) { char buf[sizeof(ipfw_obj_header) + sizeof(ipfw_nat64lsn_cfg)]; ipfw_nat64lsn_cfg *cfg; ipfw_obj_header *oh; size_t sz; char *opt; int tcmd; if (ac == 0) errx(EX_USAGE, "config options required"); memset(&buf, 0, sizeof(buf)); oh = (ipfw_obj_header *)buf; cfg = (ipfw_nat64lsn_cfg *)(oh + 1); sz = sizeof(buf); nat64lsn_fill_ntlv(&oh->ntlv, name, set); if (do_get3(IP_FW_NAT64LSN_CONFIG, &oh->opheader, &sz) != 0) err(EX_OSERR, "failed to get config for instance %s", name); while (ac > 0) { tcmd = get_token(nat64newcmds, *av, "option"); opt = *av; ac--; av++; switch (tcmd) { case TOK_MAX_PORTS: NEED1("Max per-user ports required"); cfg->max_ports = nat64lsn_parse_int(*av, opt); ac--; av++; break; case TOK_JMAXLEN: NEED1("job queue length required"); cfg->jmaxlen = nat64lsn_parse_int(*av, opt); ac--; av++; break; case TOK_HOST_DEL_AGE: 
NEED1("host delete delay required"); cfg->nh_delete_delay = (uint16_t)nat64lsn_parse_int( *av, opt); ac--; av++; break; case TOK_PG_DEL_AGE: NEED1("portgroup delete delay required"); cfg->pg_delete_delay = (uint16_t)nat64lsn_parse_int( *av, opt); ac--; av++; break; case TOK_TCP_SYN_AGE: NEED1("tcp syn age required"); cfg->st_syn_ttl = (uint16_t)nat64lsn_parse_int( *av, opt); ac--; av++; break; case TOK_TCP_CLOSE_AGE: NEED1("tcp close age required"); cfg->st_close_ttl = (uint16_t)nat64lsn_parse_int( *av, opt); ac--; av++; break; case TOK_TCP_EST_AGE: NEED1("tcp est age required"); cfg->st_estab_ttl = (uint16_t)nat64lsn_parse_int( *av, opt); ac--; av++; break; case TOK_UDP_AGE: NEED1("udp age required"); cfg->st_udp_ttl = (uint16_t)nat64lsn_parse_int( *av, opt); ac--; av++; break; case TOK_ICMP_AGE: NEED1("icmp age required"); cfg->st_icmp_ttl = (uint16_t)nat64lsn_parse_int( *av, opt); ac--; av++; break; case TOK_STATES_CHUNKS: NEED1("number of chunks required"); cfg->states_chunks = (uint8_t)nat64lsn_parse_int( *av, opt); ac--; av++; break; case TOK_LOG: cfg->flags |= NAT64_LOG; break; case TOK_LOGOFF: cfg->flags &= ~NAT64_LOG; break; case TOK_PRIVATE: cfg->flags |= NAT64_ALLOW_PRIVATE; break; case TOK_PRIVATEOFF: cfg->flags &= ~NAT64_ALLOW_PRIVATE; break; + case TOK_SWAPCONF: + cfg->flags |= NAT64LSN_ALLOW_SWAPCONF; + break; + case TOK_SWAPCONFOFF: + cfg->flags &= ~NAT64LSN_ALLOW_SWAPCONF; + break; default: errx(EX_USAGE, "Can't change %s option", opt); } } if (do_set3(IP_FW_NAT64LSN_CONFIG, &oh->opheader, sizeof(buf)) != 0) err(EX_OSERR, "nat64lsn instance configuration failed"); } /* * Reset nat64lsn instance statistics specified by @oh->ntlv. 
* Request: [ ipfw_obj_header ] */ static void nat64lsn_reset_stats(const char *name, uint8_t set) { ipfw_obj_header oh; memset(&oh, 0, sizeof(oh)); nat64lsn_fill_ntlv(&oh.ntlv, name, set); if (do_set3(IP_FW_NAT64LSN_RESET_STATS, &oh.opheader, sizeof(oh)) != 0) err(EX_OSERR, "failed to reset stats for instance %s", name); } /* * Destroys nat64lsn instance specified by @oh->ntlv. * Request: [ ipfw_obj_header ] */ static void nat64lsn_destroy(const char *name, uint8_t set) { ipfw_obj_header oh; memset(&oh, 0, sizeof(oh)); nat64lsn_fill_ntlv(&oh.ntlv, name, set); if (do_set3(IP_FW_NAT64LSN_DESTROY, &oh.opheader, sizeof(oh)) != 0) err(EX_OSERR, "failed to destroy nat instance %s", name); } /* * Get nat64lsn instance statistics. * Request: [ ipfw_obj_header ] * Reply: [ ipfw_obj_header ipfw_obj_ctlv [ uint64_t x N ] ] */ static int nat64lsn_get_stats(const char *name, uint8_t set, struct ipfw_nat64lsn_stats *stats) { ipfw_obj_header *oh; ipfw_obj_ctlv *oc; size_t sz; sz = sizeof(*oh) + sizeof(*oc) + sizeof(*stats); oh = calloc(1, sz); nat64lsn_fill_ntlv(&oh->ntlv, name, set); if (do_get3(IP_FW_NAT64LSN_STATS, &oh->opheader, &sz) == 0) { oc = (ipfw_obj_ctlv *)(oh + 1); memcpy(stats, oc + 1, sizeof(*stats)); free(oh); return (0); } free(oh); return (-1); } static void nat64lsn_stats(const char *name, uint8_t set) { struct ipfw_nat64lsn_stats stats; if (nat64lsn_get_stats(name, set, &stats) != 0) err(EX_OSERR, "Error retrieving stats"); if (g_co.use_set != 0 || set != 0) printf("set %u ", set); printf("nat64lsn %s\n", name); printf("\t%ju packets translated from IPv6 to IPv4\n", (uintmax_t)stats.opcnt64); printf("\t%ju packets translated from IPv4 to IPv6\n", (uintmax_t)stats.opcnt46); printf("\t%ju IPv6 fragments created\n", (uintmax_t)stats.ofrags); printf("\t%ju IPv4 fragments received\n", (uintmax_t)stats.ifrags); printf("\t%ju output packets dropped due to no bufs, etc.\n", (uintmax_t)stats.oerrors); printf("\t%ju output packets discarded due to no IPv4 route\n", 
(uintmax_t)stats.noroute4); printf("\t%ju output packets discarded due to no IPv6 route\n", (uintmax_t)stats.noroute6); printf("\t%ju packets discarded due to unsupported protocol\n", (uintmax_t)stats.noproto); printf("\t%ju packets discarded due to memory allocation problems\n", (uintmax_t)stats.nomem); printf("\t%ju packets discarded due to some errors\n", (uintmax_t)stats.dropped); printf("\t%ju packets not matched with IPv4 prefix\n", (uintmax_t)stats.nomatch4); printf("\t%ju mbufs queued for post processing\n", (uintmax_t)stats.jreinjected); printf("\t%ju times the job queue was processed\n", (uintmax_t)stats.jcalls); printf("\t%ju job requests queued\n", (uintmax_t)stats.jrequests); printf("\t%ju job requests queue limit reached\n", (uintmax_t)stats.jmaxlen); printf("\t%ju job requests failed due to memory allocation problems\n", (uintmax_t)stats.jnomem); printf("\t%ju hosts allocated\n", (uintmax_t)stats.hostcount); printf("\t%ju hosts requested\n", (uintmax_t)stats.jhostsreq); printf("\t%ju host requests failed\n", (uintmax_t)stats.jhostfails); printf("\t%ju portgroups requested\n", (uintmax_t)stats.jportreq); printf("\t%ju portgroups allocated\n", (uintmax_t)stats.spgcreated); printf("\t%ju portgroups deleted\n", (uintmax_t)stats.spgdeleted); printf("\t%ju portgroup requests failed\n", (uintmax_t)stats.jportfails); printf("\t%ju portgroups allocated for TCP\n", (uintmax_t)stats.tcpchunks); printf("\t%ju portgroups allocated for UDP\n", (uintmax_t)stats.udpchunks); printf("\t%ju portgroups allocated for ICMP\n", (uintmax_t)stats.icmpchunks); printf("\t%ju states created\n", (uintmax_t)stats.screated); printf("\t%ju states deleted\n", (uintmax_t)stats.sdeleted); } static int nat64lsn_show_cb(ipfw_nat64lsn_cfg *cfg, const char *name, uint8_t set) { char abuf[INET6_ADDRSTRLEN]; if (name != NULL && strcmp(cfg->name, name) != 0) return (ESRCH); if (g_co.use_set != 0 && cfg->set != set) return (ESRCH); if (g_co.use_set != 0 || cfg->set != 0) printf("set %u ", 
cfg->set); inet_ntop(AF_INET, &cfg->prefix4, abuf, sizeof(abuf)); printf("nat64lsn %s prefix4 %s/%u", cfg->name, abuf, cfg->plen4); inet_ntop(AF_INET6, &cfg->prefix6, abuf, sizeof(abuf)); printf(" prefix6 %s/%u", abuf, cfg->plen6); if (g_co.verbose || cfg->states_chunks > 1) printf(" states_chunks %u", cfg->states_chunks); if (g_co.verbose || cfg->nh_delete_delay != NAT64LSN_HOST_AGE) printf(" host_del_age %u", cfg->nh_delete_delay); if (g_co.verbose || cfg->pg_delete_delay != NAT64LSN_PG_AGE) printf(" pg_del_age %u", cfg->pg_delete_delay); if (g_co.verbose || cfg->st_syn_ttl != NAT64LSN_TCP_SYN_AGE) printf(" tcp_syn_age %u", cfg->st_syn_ttl); if (g_co.verbose || cfg->st_close_ttl != NAT64LSN_TCP_FIN_AGE) printf(" tcp_close_age %u", cfg->st_close_ttl); if (g_co.verbose || cfg->st_estab_ttl != NAT64LSN_TCP_EST_AGE) printf(" tcp_est_age %u", cfg->st_estab_ttl); if (g_co.verbose || cfg->st_udp_ttl != NAT64LSN_UDP_AGE) printf(" udp_age %u", cfg->st_udp_ttl); if (g_co.verbose || cfg->st_icmp_ttl != NAT64LSN_ICMP_AGE) printf(" icmp_age %u", cfg->st_icmp_ttl); if (g_co.verbose || cfg->jmaxlen != NAT64LSN_JMAXLEN) printf(" jmaxlen %u", cfg->jmaxlen); + if (cfg->flags & NAT64LSN_ALLOW_SWAPCONF) + printf(" swap_conf"); if (cfg->flags & NAT64_LOG) printf(" log"); if (cfg->flags & NAT64_ALLOW_PRIVATE) printf(" allow_private"); printf("\n"); return (0); } static int nat64lsn_destroy_cb(ipfw_nat64lsn_cfg *cfg, const char *name __unused, uint8_t set) { if (g_co.use_set != 0 && cfg->set != set) return (ESRCH); nat64lsn_destroy(cfg->name, cfg->set); return (0); } /* * Compare nat64lsn instances names. * Honor number comparison. 
*/ static int nat64name_cmp(const void *a, const void *b) { const ipfw_nat64lsn_cfg *ca, *cb; ca = (const ipfw_nat64lsn_cfg *)a; cb = (const ipfw_nat64lsn_cfg *)b; if (ca->set > cb->set) return (1); else if (ca->set < cb->set) return (-1); return (stringnum_cmp(ca->name, cb->name)); } /* * Retrieves nat64lsn instance list from kernel, * optionally sorts it and calls requested function for each instance. * * Request: [ ipfw_obj_lheader ] * Reply: [ ipfw_obj_lheader ipfw_nat64lsn_cfg x N ] */ static int nat64lsn_foreach(nat64lsn_cb_t *f, const char *name, uint8_t set, int sort) { ipfw_obj_lheader *olh; ipfw_nat64lsn_cfg *cfg; size_t sz; uint32_t i; /* Start with reasonable default */ sz = sizeof(*olh) + 16 * sizeof(ipfw_nat64lsn_cfg); for (;;) { if ((olh = calloc(1, sz)) == NULL) return (ENOMEM); olh->size = sz; if (do_get3(IP_FW_NAT64LSN_LIST, &olh->opheader, &sz) != 0) { sz = olh->size; free(olh); if (errno != ENOMEM) return (errno); continue; } if (sort != 0) qsort(olh + 1, olh->count, olh->objsize, nat64name_cmp); cfg = (ipfw_nat64lsn_cfg *)(olh + 1); for (i = 0; i < olh->count; i++) { (void)f(cfg, name, set); /* Ignore errors for now */ cfg = (ipfw_nat64lsn_cfg *)((caddr_t)cfg + olh->objsize); } free(olh); break; } return (0); } diff --git a/sbin/ipfw/tables.c b/sbin/ipfw/tables.c index 4c783e02f856..7c3b1bb35a01 100644 --- a/sbin/ipfw/tables.c +++ b/sbin/ipfw/tables.c @@ -1,2099 +1,2099 @@ /* * Copyright (c) 2014 Yandex LLC * Copyright (c) 2014 Alexander V. Chernikov * * Redistribution and use in source forms, with and without modification, * are permitted provided that this entire comment appears intact. * * Redistribution in binary form may occur without any restrictions. * Obviously, it would be nice if you gave credit where credit is due * but requiring it would be too onerous. * * This software is provided ``AS IS'' without any warranties of any kind. * * in-kernel ipfw tables support. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "ipfw2.h" static void table_modify_record(ipfw_obj_header *oh, int ac, char *av[], int add, int quiet, int update, int atomic); static int table_flush(ipfw_obj_header *oh); static int table_destroy(ipfw_obj_header *oh); static int table_do_create(ipfw_obj_header *oh, ipfw_xtable_info *i); static int table_do_modify(ipfw_obj_header *oh, ipfw_xtable_info *i); static int table_do_swap(ipfw_obj_header *oh, char *second); static void table_create(ipfw_obj_header *oh, int ac, char *av[]); static void table_modify(ipfw_obj_header *oh, int ac, char *av[]); static void table_lookup(ipfw_obj_header *oh, int ac, char *av[]); static void table_lock(ipfw_obj_header *oh, int lock); static int table_swap(ipfw_obj_header *oh, char *second); static int table_get_info(ipfw_obj_header *oh, ipfw_xtable_info *i); static int table_show_info(ipfw_xtable_info *i, void *arg); static int table_destroy_one(ipfw_xtable_info *i, void *arg); static int table_flush_one(ipfw_xtable_info *i, void *arg); static int table_show_one(ipfw_xtable_info *i, void *arg); static int table_do_get_list(ipfw_xtable_info *i, ipfw_obj_header **poh); static void table_show_list(ipfw_obj_header *oh, int need_header); static void table_show_entry(ipfw_xtable_info *i, ipfw_obj_tentry *tent); static void tentry_fill_key(ipfw_obj_header *oh, ipfw_obj_tentry *tent, char *key, int add, uint8_t *ptype, uint32_t *pvmask, ipfw_xtable_info *xi); static void tentry_fill_value(ipfw_obj_header *oh, ipfw_obj_tentry *tent, char *arg, uint8_t type, uint32_t vmask); static void table_show_value(char *buf, size_t bufsize, ipfw_table_value *v, uint32_t vmask, int print_ip); typedef int (table_cb_t)(ipfw_xtable_info *i, void *arg); static int tables_foreach(table_cb_t *f, void *arg, int sort); #ifndef s6_addr32 #define s6_addr32 __u6_addr.__u6_addr32 #endif static 
struct _s_x tabletypes[] = {
      { "addr",		IPFW_TABLE_ADDR },
      { "mac",		IPFW_TABLE_MAC },
      { "iface",	IPFW_TABLE_INTERFACE },
      { "number",	IPFW_TABLE_NUMBER },
      { "flow",		IPFW_TABLE_FLOW },
      { NULL, 0 }
};

/* Default algorithms for various table types */
static struct _s_x tablealgos[] = {
      { "addr:radix",	IPFW_TABLE_ADDR },
      { "flow:hash",	IPFW_TABLE_FLOW },
      { "iface:array",	IPFW_TABLE_INTERFACE },
      { "number:array",	IPFW_TABLE_NUMBER },
      { NULL, 0 }
};

/* Value fields selectable via the "valtype" create option */
static struct _s_x tablevaltypes[] = {
      { "skipto",	IPFW_VTYPE_SKIPTO },
      { "pipe",		IPFW_VTYPE_PIPE },
      { "fib",		IPFW_VTYPE_FIB },
      { "nat",		IPFW_VTYPE_NAT },
      { "dscp",		IPFW_VTYPE_DSCP },
      { "tag",		IPFW_VTYPE_TAG },
      { "divert",	IPFW_VTYPE_DIVERT },
      { "netgraph",	IPFW_VTYPE_NETGRAPH },
      { "limit",	IPFW_VTYPE_LIMIT },
      { "ipv4",		IPFW_VTYPE_NH4 },
      { "ipv6",		IPFW_VTYPE_NH6 },
      { "mark",		IPFW_VTYPE_MARK },
      { NULL, 0 }
};

/* Sub-commands accepted after "ipfw table NAME" */
static struct _s_x tablecmds[] = {
      { "add",		TOK_ADD },
      { "delete",	TOK_DEL },
      { "create",	TOK_CREATE },
      { "destroy",	TOK_DESTROY },
      { "flush",	TOK_FLUSH },
      { "modify",	TOK_MODIFY },
      { "swap",		TOK_SWAP },
      { "info",		TOK_INFO },
      { "detail",	TOK_DETAIL },
      { "list",		TOK_LIST },
      { "lookup",	TOK_LOOKUP },
      { "atomic",	TOK_ATOMIC },
      { "lock",		TOK_LOCK },
      { "unlock",	TOK_UNLOCK },
      { NULL, 0 }
};

/*
 * Resolves @host to an IPv4 address and stores it in @ipaddr.
 * Tries dotted-quad notation first, then falls back to a
 * gethostbyname(3) lookup (first A record wins).
 * Returns 0 on success, -1 if the name cannot be resolved.
 */
static int
lookup_host (char *host, struct in_addr *ipaddr)
{
	struct hostent *he;

	if (!inet_aton(host, ipaddr)) {
		if ((he = gethostbyname(host)) == NULL)
			return(-1);
		*ipaddr = *(struct in_addr *)he->h_addr_list[0];
	}
	return(0);
}

/*
 * This one handles all table-related commands
 * 	ipfw table NAME create ...
 * 	ipfw table NAME modify ...
 * 	ipfw table {NAME | all} destroy
 * 	ipfw table NAME swap NAME
 * 	ipfw table NAME lock
 * 	ipfw table NAME unlock
 * 	ipfw table NAME add addr[/masklen] [value]
 * 	ipfw table NAME add [addr[/masklen] value] [addr[/masklen] value] ..
 * 	ipfw table NAME delete addr[/masklen] [addr[/masklen]] ..
* ipfw table NAME lookup addr * ipfw table {NAME | all} flush * ipfw table {NAME | all} list * ipfw table {NAME | all} info * ipfw table {NAME | all} detail */ void ipfw_table_handler(int ac, char *av[]) { int do_add, is_all; int atomic, error, tcmd; ipfw_xtable_info i; ipfw_obj_header oh; char *tablename; uint8_t set; void *arg; memset(&oh, 0, sizeof(oh)); is_all = 0; if (g_co.use_set != 0) set = g_co.use_set - 1; else set = 0; ac--; av++; NEED1("table needs name"); tablename = *av; if (table_check_name(tablename) == 0) { table_fill_ntlv(&oh.ntlv, *av, set, 1); oh.idx = 1; } else { if (strcmp(tablename, "all") == 0) is_all = 1; else errx(EX_USAGE, "table name %s is invalid", tablename); } ac--; av++; NEED1("table needs command"); tcmd = get_token(tablecmds, *av, "table command"); /* Check if atomic operation was requested */ atomic = 0; if (tcmd == TOK_ATOMIC) { ac--; av++; NEED1("atomic needs command"); tcmd = get_token(tablecmds, *av, "table command"); switch (tcmd) { case TOK_ADD: break; default: errx(EX_USAGE, "atomic is not compatible with %s", *av); } atomic = 1; } switch (tcmd) { case TOK_LIST: case TOK_INFO: case TOK_DETAIL: case TOK_FLUSH: case TOK_DESTROY: break; default: if (is_all != 0) errx(EX_USAGE, "table name required"); } switch (tcmd) { case TOK_ADD: case TOK_DEL: do_add = **av == 'a'; ac--; av++; table_modify_record(&oh, ac, av, do_add, g_co.do_quiet, g_co.do_quiet, atomic); break; case TOK_CREATE: ac--; av++; table_create(&oh, ac, av); break; case TOK_MODIFY: ac--; av++; table_modify(&oh, ac, av); break; case TOK_DESTROY: if (is_all == 0) { if (table_destroy(&oh) == 0) break; if (errno != ESRCH) err(EX_OSERR, "failed to destroy table %s", tablename); /* ESRCH isn't fatal, warn if not quiet mode */ if (g_co.do_quiet == 0) warn("failed to destroy table %s", tablename); } else { error = tables_foreach(table_destroy_one, &oh, 1); if (error != 0) err(EX_OSERR, "failed to destroy tables list"); } break; case TOK_FLUSH: if (is_all == 0) { if ((error = 
table_flush(&oh)) == 0) break; if (errno != ESRCH) err(EX_OSERR, "failed to flush table %s info", tablename); /* ESRCH isn't fatal, warn if not quiet mode */ if (g_co.do_quiet == 0) warn("failed to flush table %s info", tablename); } else { error = tables_foreach(table_flush_one, &oh, 1); if (error != 0) err(EX_OSERR, "failed to flush tables list"); /* XXX: we ignore errors here */ } break; case TOK_SWAP: ac--; av++; NEED1("second table name required"); table_swap(&oh, *av); break; case TOK_LOCK: case TOK_UNLOCK: table_lock(&oh, (tcmd == TOK_LOCK)); break; case TOK_DETAIL: case TOK_INFO: arg = (tcmd == TOK_DETAIL) ? (void *)1 : NULL; if (is_all == 0) { if ((error = table_get_info(&oh, &i)) != 0) err(EX_OSERR, "failed to request table info"); table_show_info(&i, arg); } else { error = tables_foreach(table_show_info, arg, 1); if (error != 0) err(EX_OSERR, "failed to request tables list"); } break; case TOK_LIST: arg = is_all ? (void*)1 : NULL; if (is_all == 0) { if ((error = table_get_info(&oh, &i)) != 0) err(EX_OSERR, "failed to request table info"); table_show_one(&i, arg); } else { error = tables_foreach(table_show_one, arg, 1); if (error != 0) err(EX_OSERR, "failed to request tables list"); } break; case TOK_LOOKUP: ac--; av++; table_lookup(&oh, ac, av); break; } } void table_fill_ntlv(ipfw_obj_ntlv *ntlv, const char *name, uint8_t set, - uint16_t uidx) + uint32_t uidx) { ntlv->head.type = IPFW_TLV_TBL_NAME; ntlv->head.length = sizeof(ipfw_obj_ntlv); ntlv->idx = uidx; ntlv->set = set; strlcpy(ntlv->name, name, sizeof(ntlv->name)); } static void table_fill_objheader(ipfw_obj_header *oh, ipfw_xtable_info *i) { oh->idx = 1; table_fill_ntlv(&oh->ntlv, i->tablename, i->set, 1); } static struct _s_x tablenewcmds[] = { { "type", TOK_TYPE }, { "valtype", TOK_VALTYPE }, { "algo", TOK_ALGO }, { "limit", TOK_LIMIT }, { "locked", TOK_LOCK }, { "missing", TOK_MISSING }, { "or-flush", TOK_ORFLUSH }, { NULL, 0 } }; static struct _s_x flowtypecmds[] = { { "src-ip", 
IPFW_TFFLAG_SRCIP }, { "proto", IPFW_TFFLAG_PROTO }, { "src-port", IPFW_TFFLAG_SRCPORT }, { "dst-ip", IPFW_TFFLAG_DSTIP }, { "dst-port", IPFW_TFFLAG_DSTPORT }, { NULL, 0 } }; static int table_parse_type(uint8_t ttype, char *p, uint8_t *tflags) { uint32_t fset, fclear; char *e; /* Parse type options */ switch(ttype) { case IPFW_TABLE_FLOW: fset = fclear = 0; if (fill_flags(flowtypecmds, p, &e, &fset, &fclear) != 0) errx(EX_USAGE, "unable to parse flow option %s", e); *tflags = fset; break; default: return (EX_USAGE); } return (0); } static void table_print_type(char *tbuf, size_t size, uint8_t type, uint8_t tflags) { const char *tname; int l; if ((tname = match_value(tabletypes, type)) == NULL) tname = "unknown"; l = snprintf(tbuf, size, "%s", tname); tbuf += l; size -= l; switch(type) { case IPFW_TABLE_FLOW: if (tflags != 0) { *tbuf++ = ':'; l--; print_flags_buffer(tbuf, size, flowtypecmds, tflags); } break; } } /* * Creates new table * * ipfw table NAME create [ type { addr | iface | number | flow } ] * [ algo algoname ] [missing] [or-flush] */ static void table_create(ipfw_obj_header *oh, int ac, char *av[]) { ipfw_xtable_info xi, xie; int error, missing, orflush, tcmd, val; uint32_t fset, fclear; char *e, *p; char tbuf[128]; missing = orflush = 0; memset(&xi, 0, sizeof(xi)); while (ac > 0) { tcmd = get_token(tablenewcmds, *av, "option"); ac--; av++; switch (tcmd) { case TOK_LIMIT: NEED1("limit value required"); xi.limit = strtol(*av, NULL, 10); ac--; av++; break; case TOK_TYPE: NEED1("table type required"); /* Type may have suboptions after ':' */ if ((p = strchr(*av, ':')) != NULL) *p++ = '\0'; val = match_token(tabletypes, *av); if (val == -1) { concat_tokens(tbuf, sizeof(tbuf), tabletypes, ", "); errx(EX_USAGE, "Unknown tabletype: %s. 
Supported: %s", *av, tbuf); } xi.type = val; if (p != NULL) { error = table_parse_type(val, p, &xi.tflags); if (error != 0) errx(EX_USAGE, "Unsupported suboptions: %s", p); } ac--; av++; break; case TOK_VALTYPE: NEED1("table value type required"); fset = fclear = 0; val = fill_flags(tablevaltypes, *av, &e, &fset, &fclear); if (val != -1) { xi.vmask = fset; ac--; av++; break; } concat_tokens(tbuf, sizeof(tbuf), tablevaltypes, ", "); errx(EX_USAGE, "Unknown value type: %s. Supported: %s", e, tbuf); break; case TOK_ALGO: NEED1("table algorithm name required"); if (strlen(*av) > sizeof(xi.algoname)) errx(EX_USAGE, "algorithm name too long"); strlcpy(xi.algoname, *av, sizeof(xi.algoname)); ac--; av++; break; case TOK_LOCK: xi.flags |= IPFW_TGFLAGS_LOCKED; break; case TOK_ORFLUSH: orflush = 1; /* FALLTHROUGH */ case TOK_MISSING: missing = 1; break; } } /* Set some defaults to preserve compatibility. */ if (xi.algoname[0] == '\0') { const char *algo; if (xi.type == 0) xi.type = IPFW_TABLE_ADDR; algo = match_value(tablealgos, xi.type); if (algo != NULL) strlcpy(xi.algoname, algo, sizeof(xi.algoname)); } if (xi.vmask == 0) xi.vmask = IPFW_VTYPE_LEGACY; error = table_do_create(oh, &xi); if (error == 0) return; if (errno != EEXIST || missing == 0) err(EX_OSERR, "Table creation failed"); /* Check that existing table is the same we are trying to create */ if (table_get_info(oh, &xie) != 0) err(EX_OSERR, "Existing table check failed"); if (xi.limit != xie.limit || xi.type != xie.type || xi.tflags != xie.tflags || xi.vmask != xie.vmask || ( xi.algoname[0] != '\0' && strcmp(xi.algoname, xie.algoname) != 0) || xi.flags != xie.flags) errx(EX_DATAERR, "The existing table is not compatible " "with one you are creating."); /* Flush existing table if instructed to do so */ if (orflush != 0 && table_flush(oh) != 0) err(EX_OSERR, "Table flush on creation failed"); } /* * Creates new table * * Request: [ ipfw_obj_header ipfw_xtable_info ] * * Returns 0 on success. 
 */
static int
table_do_create(ipfw_obj_header *oh, ipfw_xtable_info *i)
{
	/* Request layout: header immediately followed by table info */
	char tbuf[sizeof(ipfw_obj_header) + sizeof(ipfw_xtable_info)];
	int error;

	memcpy(tbuf, oh, sizeof(*oh));
	memcpy(tbuf + sizeof(*oh), i, sizeof(*i));
	oh = (ipfw_obj_header *)tbuf;

	error = do_set3(IP_FW_TABLE_XCREATE, &oh->opheader, sizeof(tbuf));

	return (error);
}

/*
 * Modifies existing table
 *
 * ipfw table NAME modify [ limit number ]
 */
static void
table_modify(ipfw_obj_header *oh, int ac, char *av[])
{
	ipfw_xtable_info xi;
	int tcmd;

	memset(&xi, 0, sizeof(xi));

	while (ac > 0) {
		tcmd = get_token(tablenewcmds, *av, "option");
		ac--; av++;

		switch (tcmd) {
		case TOK_LIMIT:
			NEED1("limit value required");
			xi.limit = strtol(*av, NULL, 10);
			/* mflags selects which fields the kernel updates */
			xi.mflags |= IPFW_TMFLAGS_LIMIT;
			ac--; av++;
			break;
		default:
			errx(EX_USAGE, "cmd is not supported for modification");
		}
	}

	if (table_do_modify(oh, &xi) != 0)
		err(EX_OSERR, "Table modification failed");
}

/*
 * Modifies existing table.
 *
 * Request: [ ipfw_obj_header ipfw_xtable_info ]
 *
 * Returns 0 on success.
 */
static int
table_do_modify(ipfw_obj_header *oh, ipfw_xtable_info *i)
{
	char tbuf[sizeof(ipfw_obj_header) + sizeof(ipfw_xtable_info)];
	int error;

	memcpy(tbuf, oh, sizeof(*oh));
	memcpy(tbuf + sizeof(*oh), i, sizeof(*i));
	oh = (ipfw_obj_header *)tbuf;

	error = do_set3(IP_FW_TABLE_XMODIFY, &oh->opheader, sizeof(tbuf));

	return (error);
}

/*
 * Locks (@lock != 0) or unlocks given table via the modify request.
 */
static void
table_lock(ipfw_obj_header *oh, int lock)
{
	ipfw_xtable_info xi;

	memset(&xi, 0, sizeof(xi));

	xi.mflags |= IPFW_TMFLAGS_LOCK;
	xi.flags |= (lock != 0) ? IPFW_TGFLAGS_LOCKED : 0;

	if (table_do_modify(oh, &xi) != 0)
		err(EX_OSERR, "Table %s failed", lock != 0 ? "lock" : "unlock");
}

/*
 * Destroys given table specified by @oh->ntlv.
 * Returns 0 on success.
 */
static int
table_destroy(ipfw_obj_header *oh)
{

	if (do_set3(IP_FW_TABLE_XDESTROY, &oh->opheader, sizeof(*oh)) != 0)
		return (-1);

	return (0);
}

/*
 * tables_foreach() callback: destroys table @i, reusing the header
 * buffer passed via @arg. Failure is reported (unless quiet) but
 * iteration continues.
 */
static int
table_destroy_one(ipfw_xtable_info *i, void *arg)
{
	ipfw_obj_header *oh;

	oh = (ipfw_obj_header *)arg;
	table_fill_ntlv(&oh->ntlv, i->tablename, i->set, 1);
	if (table_destroy(oh) != 0) {
		if (g_co.do_quiet == 0)
			warn("failed to destroy table(%s) in set %u",
			    i->tablename, i->set);
		return (-1);
	}
	return (0);
}

/*
 * Flushes given table specified by @oh->ntlv.
 * Returns 0 on success.
 */
static int
table_flush(ipfw_obj_header *oh)
{

	if (do_set3(IP_FW_TABLE_XFLUSH, &oh->opheader, sizeof(*oh)) != 0)
		return (-1);

	return (0);
}

/*
 * Issues the swap request: header for the first table followed by a
 * name TLV for @second. Returns do_set3() result.
 */
static int
table_do_swap(ipfw_obj_header *oh, char *second)
{
	char tbuf[sizeof(ipfw_obj_header) + sizeof(ipfw_obj_ntlv)];
	int error;

	memset(tbuf, 0, sizeof(tbuf));
	memcpy(tbuf, oh, sizeof(*oh));
	oh = (ipfw_obj_header *)tbuf;
	table_fill_ntlv((ipfw_obj_ntlv *)(oh + 1), second, oh->ntlv.set, 1);

	error = do_set3(IP_FW_TABLE_XSWAP, &oh->opheader, sizeof(tbuf));

	return (error);
}

/*
 * Swaps given table with @second one.
 * Exits with a usage error on type (EINVAL) or limit (EFBIG) mismatch.
 */
static int
table_swap(ipfw_obj_header *oh, char *second)
{

	if (table_check_name(second) != 0)
		errx(EX_USAGE, "table name %s is invalid", second);

	if (table_do_swap(oh, second) == 0)
		return (0);

	switch (errno) {
	case EINVAL:
		errx(EX_USAGE, "Unable to swap table: check types");
	case EFBIG:
		errx(EX_USAGE, "Unable to swap table: check limits");
	}

	return (0);
}

/*
 * Retrieves info for table specified by @oh->ntlv and stores
 * it inside @i.
 * Returns 0 on success.
 */
static int
table_get_info(ipfw_obj_header *oh, ipfw_xtable_info *i)
{
	char tbuf[sizeof(ipfw_obj_header) + sizeof(ipfw_xtable_info)];
	size_t sz;

	sz = sizeof(tbuf);
	memset(tbuf, 0, sizeof(tbuf));
	memcpy(tbuf, oh, sizeof(*oh));
	oh = (ipfw_obj_header *)tbuf;

	if (do_get3(IP_FW_TABLE_XINFO, &oh->opheader, &sz) != 0)
		return (errno);

	/* Kernel must return at least header + info */
	if (sz < sizeof(tbuf))
		return (EINVAL);

	*i = *(ipfw_xtable_info *)(oh + 1);

	return (0);
}

/* Human-readable names for algorithm storage classes */
static struct _s_x tablealgoclass[] = {
      { "hash",		IPFW_TACLASS_HASH },
      { "array",	IPFW_TACLASS_ARRAY },
      { "radix",	IPFW_TACLASS_RADIX },
      { NULL, 0 }
};

/* Per-address-family algorithm statistics, unpacked for printing */
struct ta_cldata {
	uint8_t		taclass;
	uint8_t		spare4;
	uint16_t	itemsize;
	uint16_t	itemsize6;
	uint32_t	size;
	uint32_t	count;
};

/*
 * Print global/per-AF table @i algorithm info.
 */
static void
table_show_tainfo(ipfw_xtable_info *i __unused, struct ta_cldata *d,
    const char *af, const char *taclass)
{

	switch (d->taclass) {
	case IPFW_TACLASS_HASH:
	case IPFW_TACLASS_ARRAY:
		printf(" %salgorithm %s info\n", af, taclass);
		if (d->itemsize == d->itemsize6)
			printf(" size: %u items: %u itemsize: %u\n",
			    d->size, d->count, d->itemsize);
		else
			printf(" size: %u items: %u "
			    "itemsize4: %u itemsize6: %u\n",
			    d->size, d->count, d->itemsize, d->itemsize6);
		break;
	case IPFW_TACLASS_RADIX:
		printf(" %salgorithm %s info\n", af, taclass);
		if (d->itemsize == d->itemsize6)
			printf(" items: %u itemsize: %u\n",
			    d->count, d->itemsize);
		else
			printf(" items: %u "
			    "itemsize4: %u itemsize6: %u\n",
			    d->count, d->itemsize, d->itemsize6);
		break;
	default:
		printf(" algo class: %s\n", taclass);
	}
}

/*
 * Renders value mask @vmask into @buf: "legacy" for the compat mask,
 * otherwise a comma-separated list of value type names.
 */
static void
table_print_valheader(char *buf, size_t bufsize, uint32_t vmask)
{

	if (vmask == IPFW_VTYPE_LEGACY) {
		snprintf(buf, bufsize, "legacy");
		return;
	}

	memset(buf, 0, bufsize);
	print_flags_buffer(buf, bufsize, tablevaltypes, vmask);
}

/*
 * Prints table info struct @i in human-readable form.
 */
static int
table_show_info(ipfw_xtable_info *i, void *arg)
{
	const char *vtype;
	ipfw_ta_tinfo *tainfo;
	int afdata, afitem;
	struct ta_cldata d;
	char ttype[64], tvtype[64];

	table_print_type(ttype, sizeof(ttype), i->type, i->tflags);
	table_print_valheader(tvtype, sizeof(tvtype), i->vmask);

	printf("--- table(%s), set(%u) ---\n", i->tablename, i->set);
	if ((i->flags & IPFW_TGFLAGS_LOCKED) != 0)
		printf(" kindex: %d, type: %s, locked\n", i->kidx, ttype);
	else
		printf(" kindex: %d, type: %s\n", i->kidx, ttype);
	printf(" references: %u, valtype: %s\n", i->refcnt, tvtype);
	printf(" algorithm: %s\n", i->algoname);
	printf(" items: %u, size: %u\n", i->count, i->size);
	if (i->limit > 0)
		printf(" limit: %u\n", i->limit);

	/* Print algo-specific info if requested & set */
	if (arg == NULL)
		return (0);

	if ((i->ta_info.flags & IPFW_TATFLAGS_DATA) == 0)
		return (0);
	tainfo = &i->ta_info;
	/* afdata: separate per-AF stats; afitem: separate per-AF sizes */
	afdata = 0;
	afitem = 0;
	if (tainfo->flags & IPFW_TATFLAGS_AFDATA)
		afdata = 1;
	if (tainfo->flags & IPFW_TATFLAGS_AFITEM)
		afitem = 1;

	memset(&d, 0, sizeof(d));
	d.taclass = tainfo->taclass4;
	d.size = tainfo->size4;
	d.count = tainfo->count4;
	d.itemsize = tainfo->itemsize4;
	if (afdata == 0 && afitem != 0)
		d.itemsize6 = tainfo->itemsize6;
	else
		d.itemsize6 = d.itemsize;
	if ((vtype = match_value(tablealgoclass, d.taclass)) == NULL)
		vtype = "unknown";

	if (afdata == 0) {
		table_show_tainfo(i, &d, "", vtype);
	} else {
		table_show_tainfo(i, &d, "IPv4 ", vtype);
		memset(&d, 0, sizeof(d));
		d.taclass = tainfo->taclass6;
		if ((vtype = match_value(tablealgoclass, d.taclass)) == NULL)
			vtype = "unknown";
		d.size = tainfo->size6;
		d.count = tainfo->count6;
		d.itemsize = tainfo->itemsize6;
		d.itemsize6 = d.itemsize;
		table_show_tainfo(i, &d, "IPv6 ", vtype);
	}

	return (0);
}

/*
 * Function wrappers which can be used either
 * as is or as foreach function parameter.
 */

/* Lists all entries of table @i; non-NULL @arg adds a header line */
static int
table_show_one(ipfw_xtable_info *i, void *arg)
{
	ipfw_obj_header *oh = NULL;
	int error;
	int is_all;

	is_all = arg == NULL ? 0 : 1;

	if ((error = table_do_get_list(i, &oh)) != 0) {
		err(EX_OSERR, "Error requesting table %s list",
		    i->tablename);
		return (error);
	}

	table_show_list(oh, is_all);

	free(oh);
	return (0);
}

/* Flushes table @i, reusing the header buffer passed via @arg */
static int
table_flush_one(ipfw_xtable_info *i, void *arg)
{
	ipfw_obj_header *oh;

	oh = (ipfw_obj_header *)arg;

	table_fill_ntlv(&oh->ntlv, i->tablename, i->set, 1);

	return (table_flush(oh));
}

/*
 * Sends @count entries from @tent to the kernel as one @cmd request:
 * [ ipfw_obj_header ipfw_obj_ctlv ipfw_obj_tentry x count ].
 * A single entry uses the on-stack buffer; batches are heap-allocated.
 * Per-entry results are copied back into the caller's array.
 * Returns 0 or errno.
 */
static int
table_do_modify_record(int cmd, ipfw_obj_header *oh,
    ipfw_obj_tentry *tent, int count, int atomic)
{
	ipfw_obj_ctlv *ctlv;
	ipfw_obj_tentry *tent_base;
	caddr_t pbuf;
	char xbuf[sizeof(*oh) + sizeof(ipfw_obj_ctlv) + sizeof(*tent)];
	int error, i;
	size_t sz;

	sz = sizeof(*ctlv) + sizeof(*tent) * count;
	if (count == 1) {
		memset(xbuf, 0, sizeof(xbuf));
		pbuf = xbuf;
	} else {
		if ((pbuf = calloc(1, sizeof(*oh) + sz)) == NULL)
			return (ENOMEM);
	}

	memcpy(pbuf, oh, sizeof(*oh));
	oh = (ipfw_obj_header *)pbuf;
	oh->opheader.version = 1; /* Current version */

	ctlv = (ipfw_obj_ctlv *)(oh + 1);
	ctlv->count = count;
	ctlv->head.length = sz;
	if (atomic != 0)
		ctlv->flags |= IPFW_CTF_ATOMIC;

	tent_base = tent;
	memcpy(ctlv + 1, tent, sizeof(*tent) * count);
	tent = (ipfw_obj_tentry *)(ctlv + 1);
	for (i = 0; i < count; i++, tent++) {
		tent->head.length = sizeof(ipfw_obj_tentry);
		tent->idx = oh->idx;
	}

	sz += sizeof(*oh);
	error = do_get3(cmd, &oh->opheader, &sz);
	if (error != 0)
		error = errno;
	tent = (ipfw_obj_tentry *)(ctlv + 1);
	/* Copy result back to provided buffer */
	memcpy(tent_base, ctlv + 1, sizeof(*tent) * count);

	if (pbuf != xbuf)
		free(pbuf);

	return (error);
}

/*
 * Parses "key [value] ..." arguments from @av, batches them and issues
 * an add (@add != 0) or delete request, then reports per-entry results.
 * Auto-creates a missing table on add (deprecated compat behavior).
 */
static void
table_modify_record(ipfw_obj_header *oh, int ac, char *av[], int add,
    int quiet, int update, int atomic)
{
	ipfw_obj_tentry *ptent, tent, *tent_buf;
	ipfw_xtable_info xi;
	const char *etxt, *px, *texterr;
	uint8_t type;
	uint32_t vmask;
	int cmd, count, error, i, ignored;

	if (ac == 0)
		errx(EX_USAGE, "address required");

	if (add != 0) {
		cmd = IP_FW_TABLE_XADD;
		texterr = "Adding record failed";
	} else {
		cmd = IP_FW_TABLE_XDEL;
		texterr = "Deleting record failed";
	}

	/*
	 * Calculate number of entries:
	 * Assume [key val] x N for add
	 * and
	 * key x N for delete
	 */
	count = (add != 0) ? ac / 2 + 1 : ac;

	if (count <= 1) {
		/* Adding single entry with/without value */
		memset(&tent, 0, sizeof(tent));
		tent_buf = &tent;
	} else {
		if ((tent_buf = calloc(count, sizeof(tent))) == NULL)
			errx(EX_OSERR,
			    "Unable to allocate memory for all entries");
	}
	ptent = tent_buf;

	memset(&xi, 0, sizeof(xi));
	count = 0;
	while (ac > 0) {
		tentry_fill_key(oh, ptent, *av, add, &type, &vmask, &xi);

		/*
		 * Compatibility layer: auto-create table if not exists.
		 */
		if (xi.tablename[0] == '\0') {
			xi.type = type;
			xi.vmask = vmask;
			strlcpy(xi.tablename, oh->ntlv.name,
			    sizeof(xi.tablename));
			if (quiet == 0)
				warnx("DEPRECATED: inserting data into "
				    "non-existent table %s. (auto-created)",
				    xi.tablename);
			table_do_create(oh, &xi);
		}

		oh->ntlv.type = type;
		ac--; av++;

		if (add != 0 && ac > 0) {
			tentry_fill_value(oh, ptent, *av, type, vmask);
			ac--; av++;
		}

		if (update != 0)
			ptent->head.flags |= IPFW_TF_UPDATE;

		count++;
		ptent++;
	}

	error = table_do_modify_record(cmd, oh, tent_buf, count, atomic);

	/*
	 * Compatibility stuff: do not yell on duplicate keys or
	 * failed deletions.
	 */
	if (error == 0 || (error == EEXIST && add != 0) ||
	    (error == ENOENT && add == 0)) {
		if (quiet != 0) {
			if (tent_buf != &tent)
				free(tent_buf);
			return;
		}
	}

	/* Get real OS error */
	error = errno;

	/* Report results back */
	ptent = tent_buf;
	for (i = 0; i < count; ptent++, i++) {
		ignored = 0;
		switch (ptent->result) {
		case IPFW_TR_ADDED:
			px = "added";
			break;
		case IPFW_TR_DELETED:
			px = "deleted";
			break;
		case IPFW_TR_UPDATED:
			px = "updated";
			break;
		case IPFW_TR_LIMIT:
			px = "limit";
			ignored = 1;
			break;
		case IPFW_TR_ERROR:
			px = "error";
			ignored = 1;
			break;
		case IPFW_TR_NOTFOUND:
			px = "notfound";
			ignored = 1;
			break;
		case IPFW_TR_EXISTS:
			px = "exists";
			ignored = 1;
			break;
		case IPFW_TR_IGNORED:
			px = "ignored";
			ignored = 1;
			break;
		default:
			px = "unknown";
			ignored = 1;
		}

		/* Under atomic, applied entries get rolled back on error */
		if (error != 0 && atomic != 0 && ignored == 0)
			printf("%s(reverted): ", px);
		else
			printf("%s: ", px);

		table_show_entry(&xi, ptent);
	}

	if (tent_buf != &tent)
		free(tent_buf);

	if (error == 0)
		return;
	/* Try to provide more human-readable error */
	switch (error) {
	case EEXIST:
		etxt = "record already exists";
		break;
	case EFBIG:
		etxt = "limit hit";
		break;
	case ESRCH:
		etxt = "table not found";
		break;
	case ENOENT:
		etxt = "record not found";
		break;
	case EACCES:
		etxt = "table is locked";
		break;
	default:
		etxt = strerror(error);
	}

	errx(EX_OSERR, "%s: %s", texterr, etxt);
}

/*
 * Looks up @key in the table described by @oh; fills @xi with table
 * info (via tentry_fill_key) and @xtent with the found entry.
 * Returns 0 or errno from the kernel request.
 */
static int
table_do_lookup(ipfw_obj_header *oh, char *key, ipfw_xtable_info *xi,
    ipfw_obj_tentry *xtent)
{
	char xbuf[sizeof(ipfw_obj_header) + sizeof(ipfw_obj_tentry)];
	ipfw_obj_tentry *tent;
	uint8_t type;
	uint32_t vmask;
	size_t sz;

	memcpy(xbuf, oh, sizeof(*oh));
	oh = (ipfw_obj_header *)xbuf;
	tent = (ipfw_obj_tentry *)(oh + 1);

	memset(tent, 0, sizeof(*tent));
	tent->head.length = sizeof(*tent);
	tent->idx = 1;

	tentry_fill_key(oh, tent, key, 0, &type, &vmask, xi);
	oh->ntlv.type = type;

	sz = sizeof(xbuf);
	if (do_get3(IP_FW_TABLE_XFIND, &oh->opheader, &sz) != 0)
		return (errno);

	if (sz < sizeof(xbuf))
		return (EINVAL);

	*xtent = *tent;

	return (0);
}

/* Handles "ipfw table NAME lookup addr": prints matching entry */
static void
table_lookup(ipfw_obj_header *oh, int ac, char *av[])
{
	ipfw_obj_tentry xtent;
	ipfw_xtable_info xi;
	char key[64];
	int error;

	if (ac == 0)
		errx(EX_USAGE, "address required");

	strlcpy(key, *av, sizeof(key));

	memset(&xi, 0, sizeof(xi));
	error = table_do_lookup(oh, key, &xi, &xtent);

	switch (error) {
	case 0:
		break;
	case ESRCH:
		errx(EX_UNAVAILABLE, "Table %s not found", oh->ntlv.name);
	case ENOENT:
		errx(EX_UNAVAILABLE, "Entry %s not found", *av);
	case ENOTSUP:
		errx(EX_UNAVAILABLE, "Table %s algo does not support "
		    "\"lookup\" method", oh->ntlv.name);
	default:
		err(EX_OSERR, "getsockopt(IP_FW_TABLE_XFIND)");
	}

	table_show_entry(&xi, &xtent);
}

/*
 * Parses key string @arg according to table type @type (and, for flow
 * tables, component mask @tflags) into @tentry: sets the key bytes,
 * @tentry->masklen and @tentry->subtype (address family).
 * Exits on parse errors.
 */
static void
tentry_fill_key_type(char *arg, ipfw_obj_tentry *tentry, uint8_t type,
    uint8_t tflags)
{
	char *p, *pp;
	int mask, af;
	struct in6_addr *paddr, tmp;
	struct ether_addr *mac;
	struct tflow_entry *tfe;
	uint32_t key, *pkey;
	uint16_t port;
	struct protoent *pent;
	struct servent *sent;
	int masklen;

	mask = masklen = 0;
	af = 0;
	paddr = (struct in6_addr *)&tentry->k;

	switch (type) {
	case IPFW_TABLE_ADDR:
		/* Remove / if exists */
		if ((p = strchr(arg, '/')) != NULL) {
			*p = '\0';
			mask = atoi(p + 1);
		}

		if (inet_pton(AF_INET, arg, paddr) == 1) {
			if (p != NULL && mask > 32)
				errx(EX_DATAERR, "bad IPv4 mask width: %s",
				    p + 1);

			masklen = p ? mask : 32;
			af = AF_INET;
		} else if (inet_pton(AF_INET6, arg, paddr) == 1) {
			if (IN6_IS_ADDR_V4COMPAT(paddr))
				errx(EX_DATAERR,
				    "Use IPv4 instead of v4-compatible");
			if (p != NULL && mask > 128)
				errx(EX_DATAERR, "bad IPv6 mask width: %s",
				    p + 1);

			masklen = p ? mask : 128;
			af = AF_INET6;
		} else {
			/* Assume FQDN */
			if (lookup_host(arg, (struct in_addr *)paddr) != 0)
				errx(EX_NOHOST, "hostname ``%s'' unknown", arg);

			masklen = 32;
			type = IPFW_TABLE_ADDR;
			af = AF_INET;
		}
		break;
	case IPFW_TABLE_MAC:
		/* Remove / if exists */
		if ((p = strchr(arg, '/')) != NULL) {
			*p = '\0';
			mask = atoi(p + 1);
		}

		if (p != NULL && mask > 8 * ETHER_ADDR_LEN)
			errx(EX_DATAERR, "bad MAC mask width: %s",
			    p + 1);

		if ((mac = ether_aton(arg)) == NULL)
			errx(EX_DATAERR, "Incorrect MAC address");

		memcpy(tentry->k.mac, mac->octet, ETHER_ADDR_LEN);
		masklen = p ? mask : 8 * ETHER_ADDR_LEN;
		af = AF_LINK;
		break;
	case IPFW_TABLE_INTERFACE:
		/* Assume interface name. Copy significant data only */
		mask = MIN(strlen(arg), IF_NAMESIZE - 1);
		memcpy(paddr, arg, mask);
		/* Set mask to exact match */
		masklen = 8 * IF_NAMESIZE;
		break;
	case IPFW_TABLE_NUMBER:
		/* Port or any other key */
		key = strtol(arg, &p, 10);
		if (*p != '\0')
			errx(EX_DATAERR, "Invalid number: %s", arg);

		pkey = (uint32_t *)paddr;
		*pkey = key;
		masklen = 32;
		break;
	case IPFW_TABLE_FLOW:
		/* Assume [src-ip][,proto][,src-port][,dst-ip][,dst-port] */
		tfe = &tentry->k.flow;
		af = 0;

		/* Handle source IP, if requested */
		if ((tflags & IPFW_TFFLAG_SRCIP) != 0) {
			if ((p = strchr(arg, ',')) != NULL)
				*p++ = '\0';
			/* Determine family using temporary storage */
			if (inet_pton(AF_INET, arg, &tmp) == 1) {
				if (af != 0 && af != AF_INET)
					errx(EX_DATAERR,
					    "Inconsistent address family\n");
				af = AF_INET;
				memcpy(&tfe->a.a4.sip, &tmp, 4);
			} else if (inet_pton(AF_INET6, arg, &tmp) == 1) {
				if (af != 0 && af != AF_INET6)
					errx(EX_DATAERR,
					    "Inconsistent address family\n");
				af = AF_INET6;
				memcpy(&tfe->a.a6.sip6, &tmp, 16);
			}

			arg = p;
		}

		/* Handle protocol, if requested */
		if ((tflags & IPFW_TFFLAG_PROTO) != 0) {
			if (arg == NULL)
				errx(EX_DATAERR, "invalid key: proto missing");
			if ((p = strchr(arg, ',')) != NULL)
				*p++ = '\0';

			key = strtol(arg, &pp, 10);
			if (*pp != '\0') {
				if ((pent = getprotobyname(arg)) == NULL)
					errx(EX_DATAERR, "Unknown proto: %s",
					    arg);
				else
					key = pent->p_proto;
			}

			if (key > 255)
				errx(EX_DATAERR, "Bad protocol number: %u",
				    key);

			tfe->proto = key;

			arg = p;
		}

		/* Handle source port, if requested */
		if ((tflags & IPFW_TFFLAG_SRCPORT) != 0) {
			if (arg == NULL)
				errx(EX_DATAERR,
				    "invalid key: src port missing");
			if ((p = strchr(arg, ',')) != NULL)
				*p++ = '\0';

			port = htons(strtol(arg, &pp, 10));
			if (*pp != '\0') {
				if ((sent = getservbyname(arg, NULL)) == NULL)
					errx(EX_DATAERR, "Unknown service: %s",
					    arg);
				port = sent->s_port;
			}
			tfe->sport = port;

			arg = p;
		}

		/* Handle destination IP, if requested */
		if ((tflags & IPFW_TFFLAG_DSTIP) != 0) {
			if (arg == NULL)
				errx(EX_DATAERR, "invalid key: dst ip missing");
			if ((p = strchr(arg, ',')) != NULL)
				*p++ = '\0';
			/* Determine family using temporary storage */
			if (inet_pton(AF_INET, arg, &tmp) == 1) {
				if (af != 0 && af != AF_INET)
					errx(EX_DATAERR,
					    "Inconsistent address family");
				af = AF_INET;
				memcpy(&tfe->a.a4.dip, &tmp, 4);
			} else if (inet_pton(AF_INET6, arg, &tmp) == 1) {
				if (af != 0 && af != AF_INET6)
					errx(EX_DATAERR,
					    "Inconsistent address family");
				af = AF_INET6;
				memcpy(&tfe->a.a6.dip6, &tmp, 16);
			}

			arg = p;
		}

		/* Handle destination port, if requested */
		if ((tflags & IPFW_TFFLAG_DSTPORT) != 0) {
			if (arg == NULL)
				errx(EX_DATAERR,
				    "invalid key: dst port missing");
			if ((p = strchr(arg, ',')) != NULL)
				*p++ = '\0';

			port = htons(strtol(arg, &pp, 10));
			if (*pp != '\0') {
				if ((sent = getservbyname(arg, NULL)) == NULL)
					errx(EX_DATAERR, "Unknown service: %s",
					    arg);
				port = sent->s_port;
			}
			tfe->dport = port;

			arg = p;
		}

		tfe->af = af;

		break;

	default:
		errx(EX_DATAERR, "Unsupported table type: %d", type);
	}

	tentry->subtype = af;
	tentry->masklen = masklen;
}

/*
 * Tries to guess table key type.
 * This procedure is used in legacy table auto-create
 * code AND in `ipfw -n` ruleset checking.
 *
 * Imported from old table_fill_xentry() parse code.
 */
static int
guess_key_type(char *key, uint8_t *ptype)
{
	char *p;
	struct in6_addr addr;

	if (ishexnumber(*key) != 0 || *key == ':') {
		/* Remove / if exists */
		if ((p = strchr(key, '/')) != NULL)
			*p = '\0';
		if ((inet_pton(AF_INET, key, &addr) == 1) ||
		    (inet_pton(AF_INET6, key, &addr) == 1)) {
			*ptype = IPFW_TABLE_CIDR;
			if (p != NULL)
				*p = '/';
			return (0);
		} else {
			/* Port or any other key */
			/* Skip non-base 10 entries like 'fa1' */
			(void)strtol(key, &p, 10);
			if (*p == '\0') {
				*ptype = IPFW_TABLE_NUMBER;
				return (0);
			} else if ((p != key) && (*p == '.')) {
				/*
				 * Warn on IPv4 address strings
				 * which are "valid" for inet_aton() but not
				 * in inet_pton().
				 *
				 * Typical examples: '10.5' or '10.0.0.05'
				 */
				return (1);
			}
		}
	}

	if (strchr(key, '.') == NULL) {
		*ptype = IPFW_TABLE_INTERFACE;
		return (0);
	}

	if (lookup_host(key, (struct in_addr *)&addr) != 0)
		return (1);

	*ptype = IPFW_TABLE_CIDR;
	return (0);
}

/*
 * Determines key type/flags and value mask for table @oh (querying the
 * kernel and caching the result in @xi), then parses @key into @tent.
 * Falls back to key-type guessing for `ipfw -n` and for not-yet-created
 * tables on add.
 */
static void
tentry_fill_key(ipfw_obj_header *oh, ipfw_obj_tentry *tent, char *key,
    int add, uint8_t *ptype, uint32_t *pvmask, ipfw_xtable_info *xi)
{
	uint8_t type, tflags;
	uint32_t vmask;
	int error;

	type = 0;
	tflags = 0;
	vmask = 0;

	if (xi->tablename[0] == '\0')
		error = table_get_info(oh, xi);
	else
		error = 0;

	if (error == 0) {
		if (g_co.test_only == 0) {
			/* Table found */
			type = xi->type;
			tflags = xi->tflags;
			vmask = xi->vmask;
		} else {
			/*
			 * We're running `ipfw -n`
			 * Compatibility layer: try to guess key type
			 * before failing.
			 */
			if (guess_key_type(key, &type) != 0) {
				/* Unknown key */
				errx(EX_USAGE, "Cannot guess "
				    "key '%s' type", key);
			}
			vmask = IPFW_VTYPE_LEGACY;
		}
	} else {
		if (error != ESRCH)
			errx(EX_OSERR, "Error requesting table %s info",
			    oh->ntlv.name);
		if (add == 0)
			errx(EX_DATAERR, "Table %s does not exist",
			    oh->ntlv.name);
		/*
		 * Table does not exist
		 * Compatibility layer: try to guess key type before failing.
		 */
		if (guess_key_type(key, &type) != 0) {
			/* Unknown key */
			errx(EX_USAGE, "Table %s does not exist, cannot guess "
			    "key '%s' type", oh->ntlv.name, key);
		}
		vmask = IPFW_VTYPE_LEGACY;
	}

	tentry_fill_key_type(key, tent, type, tflags);

	*ptype = type;
	*pvmask = vmask;
}

/* Legacy tables: one numeric value populates every value field */
static void
set_legacy_value(uint32_t val, ipfw_table_value *v)
{
	v->tag = val;
	v->pipe = val;
	v->divert = val;
	v->skipto = val;
	v->netgraph = val;
	v->fib = val;
	v->nat = val;
	v->nh4 = val;
	v->dscp = (uint8_t)val;
	v->limit = val;
}

/*
 * Parses value string @arg (comma-separated fields, ordered by bit
 * position in @vmask) into @tent->v.value. Legacy tables accept a
 * single number, IPv4 address or hostname. Exits on parse errors.
 */
static void
tentry_fill_value(ipfw_obj_header *oh __unused, ipfw_obj_tentry *tent,
    char *arg, uint8_t type __unused, uint32_t vmask)
{
	struct addrinfo hints, *res;
	struct in_addr ipaddr;
	const char *etype;
	char *comma, *e, *n, *p;
	uint32_t a4, flag, val;
	ipfw_table_value *v;
	uint32_t i;
	int dval;

	v = &tent->v.value;

	/* Compat layer: keep old behavior for legacy value types */
	if (vmask == IPFW_VTYPE_LEGACY) {
		/* Try to interpret as number first */
		val = strtoul(arg, &p, 0);
		if (*p == '\0') {
			set_legacy_value(val, v);
			return;
		}
		if (inet_pton(AF_INET, arg, &val) == 1) {
			set_legacy_value(ntohl(val), v);
			return;
		}
		/* Try hostname */
		if (lookup_host(arg, &ipaddr) == 0) {
			set_legacy_value(ntohl(ipaddr.s_addr), v);
			return;
		}
		errx(EX_OSERR, "Unable to parse value %s", arg);
	}

	/*
	 * Shorthands: handle single value if vmask consists
	 * of numbers only. e.g.:
	 * vmask = "fib,skipto" -> treat input "1" as "1,1"
	 */

	n = arg;
	etype = NULL;
	for (i = 1; i < (1u << 31); i *= 2) {
		if ((flag = (vmask & i)) == 0)
			continue;
		vmask &= ~flag;

		if ((comma = strchr(n, ',')) != NULL)
			*comma = '\0';

		switch (flag) {
		case IPFW_VTYPE_TAG:
			v->tag = strtol(n, &e, 10);
			if (*e != '\0')
				etype = "tag";
			break;
		case IPFW_VTYPE_PIPE:
			v->pipe = strtol(n, &e, 10);
			if (*e != '\0')
				etype = "pipe";
			break;
		case IPFW_VTYPE_DIVERT:
			v->divert = strtol(n, &e, 10);
			if (*e != '\0')
				etype = "divert";
			break;
		case IPFW_VTYPE_SKIPTO:
			v->skipto = strtol(n, &e, 10);
			if (*e != '\0')
				etype = "skipto";
			break;
		case IPFW_VTYPE_NETGRAPH:
			v->netgraph = strtol(n, &e, 10);
			if (*e != '\0')
				etype = "netgraph";
			break;
		case IPFW_VTYPE_FIB:
			v->fib = strtol(n, &e, 10);
			if (*e != '\0')
				etype = "fib";
			break;
		case IPFW_VTYPE_NAT:
			v->nat = strtol(n, &e, 10);
			if (*e != '\0')
				etype = "nat";
			break;
		case IPFW_VTYPE_LIMIT:
			v->limit = strtol(n, &e, 10);
			if (*e != '\0')
				etype = "limit";
			break;
		case IPFW_VTYPE_NH4:
			if (strchr(n, '.') != NULL &&
			    inet_pton(AF_INET, n, &a4) == 1) {
				v->nh4 = ntohl(a4);
				break;
			}
			if (lookup_host(n, &ipaddr) == 0) {
				v->nh4 = ntohl(ipaddr.s_addr);
				break;
			}
			etype = "ipv4";
			break;
		case IPFW_VTYPE_DSCP:
			if (isalpha(*n)) {
				if ((dval = match_token(f_ipdscp, n)) != -1) {
					v->dscp = dval;
					break;
				} else
					etype = "DSCP code";
			} else {
				v->dscp = strtol(n, &e, 10);
				if (v->dscp > 63 || *e != '\0')
					etype = "DSCP value";
			}
			break;
		case IPFW_VTYPE_NH6:
			if (strchr(n, ':') != NULL) {
				memset(&hints, 0, sizeof(hints));
				hints.ai_family = AF_INET6;
				hints.ai_flags = AI_NUMERICHOST;
				if (getaddrinfo(n, NULL, &hints, &res) == 0) {
					v->nh6 = ((struct sockaddr_in6 *)
					    res->ai_addr)->sin6_addr;
					v->zoneid = ((struct sockaddr_in6 *)
					    res->ai_addr)->sin6_scope_id;
					freeaddrinfo(res);
					break;
				}
			}
			etype = "ipv6";
			break;
		case IPFW_VTYPE_MARK:
			/* NOTE: mark is parsed as hexadecimal */
			v->mark = strtol(n, &e, 16);
			if (*e != '\0')
				etype = "mark";
			break;
		}

		if (etype != NULL)
			errx(EX_USAGE, "Unable to parse %s as %s", n, etype);

		if (comma != NULL)
			*comma++ = ',';

		if ((n = comma) != NULL)
			continue;

		/* End of input. */
		if (vmask != 0)
			errx(EX_USAGE, "Not enough fields inside value");
	}
}

/*
 * Compare table names.
 * Honor number comparison.
 */
static int
tablename_cmp(const void *a, const void *b)
{
	const ipfw_xtable_info *ia, *ib;

	ia = (const ipfw_xtable_info *)a;
	ib = (const ipfw_xtable_info *)b;

	return (stringnum_cmp(ia->tablename, ib->tablename));
}

/*
 * Retrieves table list from kernel,
 * optionally sorts it and calls requested function for each table.
 * Returns 0 on success.
 */
static int
tables_foreach(table_cb_t *f, void *arg, int sort)
{
	ipfw_obj_lheader *olh;
	ipfw_xtable_info *info;
	size_t sz;
	uint32_t i;

	/* Start with reasonable default */
	sz = sizeof(*olh) + 16 * sizeof(ipfw_xtable_info);

	for (;;) {
		if ((olh = calloc(1, sz)) == NULL)
			return (ENOMEM);

		olh->size = sz;
		if (do_get3(IP_FW_TABLES_XLIST, &olh->opheader, &sz) != 0) {
			/* On ENOMEM kernel reports the needed size; retry */
			sz = olh->size;
			free(olh);
			if (errno != ENOMEM)
				return (errno);
			continue;
		}

		if (sort != 0)
			qsort(olh + 1, olh->count, olh->objsize,
			    tablename_cmp);

		info = (ipfw_xtable_info *)(olh + 1);
		for (i = 0; i < olh->count; i++) {
			/* Honor a set restriction given via ipfw "set N" */
			if (g_co.use_set == 0 ||
			    info->set == g_co.use_set - 1)
				(void)f(info, arg);
			info = (ipfw_xtable_info *)((caddr_t)info +
			    olh->objsize);
		}

		free(olh);
		break;
	}

	return (0);
}

/*
 * Retrieves all entries for given table @i in
 * eXtended format. Allocate buffer large enough
 * to store result. Caller needs to free it later.
 *
 * Returns 0 on success.
*/
static int
table_do_get_list(ipfw_xtable_info *i, ipfw_obj_header **poh)
{
	ipfw_obj_header *oh;
	size_t sz;
	int c;

	sz = 0;
	oh = NULL;
	/* Up to 8 attempts: the table may grow between size probe and dump. */
	for (c = 0; c < 8; c++) {
		if (sz < i->size)
			/*
			 * NOTE(review): 44 looks like slack for entries
			 * added while we retry -- confirm against the
			 * kernel-side opcode implementation.
			 */
			sz = i->size + 44;
		if (oh != NULL)
			free(oh);
		if ((oh = calloc(1, sz)) == NULL)
			continue;
		table_fill_objheader(oh, i);
		oh->opheader.version = 1; /* Current version */

		if (do_get3(IP_FW_TABLE_XLIST, &oh->opheader, &sz) == 0) {
			*poh = oh;
			return (0);
		}

		if (errno != ENOMEM)
			break;
	}
	free(oh);

	return (errno);
}

/*
 * Shows all entries from @oh in human-readable format
 */
static void
table_show_list(ipfw_obj_header *oh, int need_header)
{
	ipfw_obj_tentry *tent;
	uint32_t count;
	ipfw_xtable_info *i;

	/* Buffer layout: header, then table info, then packed entries. */
	i = (ipfw_xtable_info *)(oh + 1);
	tent = (ipfw_obj_tentry *)(i + 1);

	if (need_header)
		printf("--- table(%s), set(%u) ---\n", i->tablename, i->set);

	count = i->count;
	while (count > 0) {
		table_show_entry(i, tent);
		/* Entries are variable-length; step by each one's length. */
		tent = (ipfw_obj_tentry *)((caddr_t)tent + tent->head.length);
		count--;
	}
}

/*
 * Formats the value record @v into @buf as a comma-separated list,
 * one field per bit set in @vmask (ascending bit order).  With
 * IPFW_VTYPE_LEGACY all fields are assumed equal and only the first
 * is printed; @print_ip selects dotted-quad output in that case.
 */
static void
table_show_value(char *buf, size_t bufsize, ipfw_table_value *v,
    uint32_t vmask, int print_ip)
{
	char abuf[INET6_ADDRSTRLEN + IF_NAMESIZE + 2];
	struct sockaddr_in6 sa6;
	uint32_t flag, i, l;
	size_t sz;
	struct in_addr a4;

	sz = bufsize;

	/*
	 * Some shorthands for printing values:
	 * legacy assumes all values are equal, so keep the first one.
	 */
	if (vmask == IPFW_VTYPE_LEGACY) {
		if (print_ip != 0) {
			flag = htonl(v->tag);
			inet_ntop(AF_INET, &flag, buf, sz);
		} else
			snprintf(buf, sz, "%u", v->tag);
		return;
	}

	for (i = 1; i < (1u << 31); i *= 2) {
		if ((flag = (vmask & i)) == 0)
			continue;
		l = 0;

		switch (flag) {
		case IPFW_VTYPE_TAG:
			l = snprintf(buf, sz, "%u,", v->tag);
			break;
		case IPFW_VTYPE_PIPE:
			l = snprintf(buf, sz, "%u,", v->pipe);
			break;
		case IPFW_VTYPE_DIVERT:
			l = snprintf(buf, sz, "%d,", v->divert);
			break;
		case IPFW_VTYPE_SKIPTO:
			l = snprintf(buf, sz, "%d,", v->skipto);
			break;
		case IPFW_VTYPE_NETGRAPH:
			l = snprintf(buf, sz, "%u,", v->netgraph);
			break;
		case IPFW_VTYPE_FIB:
			l = snprintf(buf, sz, "%u,", v->fib);
			break;
		case IPFW_VTYPE_NAT:
			l = snprintf(buf, sz, "%u,", v->nat);
			break;
		case IPFW_VTYPE_LIMIT:
			l = snprintf(buf, sz, "%u,", v->limit);
			break;
		case IPFW_VTYPE_NH4:
			a4.s_addr = htonl(v->nh4);
			inet_ntop(AF_INET, &a4, abuf, sizeof(abuf));
			l = snprintf(buf, sz, "%s,", abuf);
			break;
		case IPFW_VTYPE_DSCP:
			l = snprintf(buf, sz, "%d,", v->dscp);
			break;
		case IPFW_VTYPE_NH6:
			sa6.sin6_family = AF_INET6;
			sa6.sin6_len = sizeof(sa6);
			sa6.sin6_addr = v->nh6;
			sa6.sin6_port = 0;
			sa6.sin6_scope_id = v->zoneid;
			if (getnameinfo((const struct sockaddr *)&sa6,
			    sa6.sin6_len, abuf, sizeof(abuf), NULL, 0,
			    NI_NUMERICHOST) == 0)
				l = snprintf(buf, sz, "%s,", abuf);
			break;
		case IPFW_VTYPE_MARK:
			l = snprintf(buf, sz, "%#x,", v->mark);
			break;
		}

		/*
		 * NOTE(review): snprintf() returns the would-be length,
		 * which can exceed the remaining @sz on truncation; the
		 * unchecked advance assumes callers size @buf generously
		 * enough -- confirm against the 128-byte callers.
		 */
		buf += l;
		sz -= l;
	}

	/* If anything was printed, overwrite the trailing comma. */
	if (sz != bufsize)
		*(buf - 1) = '\0';
}

/*
 * Prints a single table entry @tent (key, mask and formatted value)
 * in the text form matching table type @i->type.
 */
static void
table_show_entry(ipfw_xtable_info *i, ipfw_obj_tentry *tent)
{
	char tbuf[128], pval[128];
	const char *comma;
	const u_char *mac;
	void *paddr;
	struct tflow_entry *tfe;

	table_show_value(pval, sizeof(pval), &tent->v.value, i->vmask,
	    g_co.do_value_as_ip);

	switch (i->type) {
	case IPFW_TABLE_ADDR:
		/* IPv4 or IPv6 prefixes */
		inet_ntop(tent->subtype, &tent->k, tbuf, sizeof(tbuf));
		printf("%s/%u %s\n", tbuf, tent->masklen, pval);
		break;
	case IPFW_TABLE_MAC:
		/* MAC prefixes */
		mac = tent->k.mac;
		printf("%02x:%02x:%02x:%02x:%02x:%02x/%u %s\n",
		    mac[0], mac[1], mac[2], mac[3], mac[4], mac[5],
		    tent->masklen, pval);
		break;
	case IPFW_TABLE_INTERFACE:
		/* Interface names */
		printf("%s %s\n", tent->k.iface, pval);
		break;
	case IPFW_TABLE_NUMBER:
		/* numbers */
		printf("%u %s\n", tent->k.key, pval);
		break;
	case IPFW_TABLE_FLOW:
		/* flows: print only the components this table was keyed on */
		tfe = &tent->k.flow;
		comma = "";

		if ((i->tflags & IPFW_TFFLAG_SRCIP) != 0) {
			if (tfe->af == AF_INET)
				paddr = &tfe->a.a4.sip;
			else
				paddr = &tfe->a.a6.sip6;

			inet_ntop(tfe->af, paddr, tbuf, sizeof(tbuf));
			printf("%s%s", comma, tbuf);
			comma = ",";
		}

		if ((i->tflags & IPFW_TFFLAG_PROTO) != 0) {
			printf("%s%d", comma, tfe->proto);
			comma = ",";
		}

		if ((i->tflags & IPFW_TFFLAG_SRCPORT) != 0) {
			printf("%s%d", comma, ntohs(tfe->sport));
			comma = ",";
		}
		if ((i->tflags & IPFW_TFFLAG_DSTIP) != 0) {
			if (tfe->af == AF_INET)
				paddr = &tfe->a.a4.dip;
			else
				paddr = &tfe->a.a6.dip6;

			inet_ntop(tfe->af, paddr, tbuf, sizeof(tbuf));
			printf("%s%s", comma, tbuf);
			comma = ",";
		}

		if ((i->tflags & IPFW_TFFLAG_DSTPORT) != 0) {
			printf("%s%d", comma, ntohs(tfe->dport));
			comma = ",";
		}

		printf(" %s\n", pval);
	}
}

/*
 * Requests a list-type opcode twice: once to learn the size,
 * then again with a buffer of that size.  On success stores the
 * allocated header in @polh (caller frees).  Returns 0 or errno.
 */
static int
table_do_get_stdlist(uint16_t opcode, ipfw_obj_lheader **polh)
{
	ipfw_obj_lheader req, *olh;
	size_t sz;

	memset(&req, 0, sizeof(req));
	sz = sizeof(req);

	if (do_get3(opcode, &req.opheader, &sz) != 0)
		if (errno != ENOMEM)
			return (errno);

	sz = req.size;
	if ((olh = calloc(1, sz)) == NULL)
		return (ENOMEM);

	olh->size = sz;
	if (do_get3(opcode, &olh->opheader, &sz) != 0) {
		free(olh);
		return (errno);
	}

	*polh = olh;
	return (0);
}

/* Fetches the list of table lookup algorithms. */
static int
table_do_get_algolist(ipfw_obj_lheader **polh)
{

	return (table_do_get_stdlist(IP_FW_TABLES_ALIST, polh));
}

/* Fetches the list of table values currently referenced. */
static int
table_do_get_vlist(ipfw_obj_lheader **polh)
{

	return (table_do_get_stdlist(IP_FW_TABLE_VLIST, polh));
}

/* Lists all table lookup algorithms known to the kernel. */
void
ipfw_list_ta(int ac __unused, char *av[] __unused)
{
	ipfw_obj_lheader *olh;
	ipfw_ta_info *info;
	const char *atype;
	uint32_t i;
	int error;

	error = table_do_get_algolist(&olh);
	if (error != 0)
		err(EX_OSERR, "Unable to request algorithm list");

	info = (ipfw_ta_info *)(olh + 1);
	for (i = 0; i < olh->count; i++) {
		if ((atype = match_value(tabletypes, info->type)) == NULL)
			atype = "unknown";
		printf("--- %s ---\n", info->algoname);
		printf(" type: %s\n refcount: %u\n", atype, info->refcnt);

		info = (ipfw_ta_info *)((caddr_t)info + olh->objsize);
	}

	free(olh);
}

/* qsort(3) comparator: orders value records by kernel index. */
static int
compare_values(const void *_a, const void *_b)
{
	const ipfw_table_value *a, *b;

	a = (const ipfw_table_value *)_a;
	b = (const ipfw_table_value *)_b;

	if (a->kidx < b->kidx)
		return (-1);
	else if (a->kidx > b->kidx)
		return (1);

	return (0);
}

/* Lists all table values with their reference counts, sorted by index. */
void
ipfw_list_values(int ac __unused, char *av[] __unused)
{
	char buf[128];
	ipfw_obj_lheader *olh;
	ipfw_table_value *v;
	uint32_t i, vmask;
	int error;

	error = table_do_get_vlist(&olh);
	if (error != 0)
		err(EX_OSERR, "Unable to request value list");

	vmask = 0x7FFFFFFF; /* Similar to IPFW_VTYPE_LEGACY */

	table_print_valheader(buf, sizeof(buf), vmask);
	printf("HEADER: %s\n", buf);
	v = (ipfw_table_value *)(olh + 1);
	qsort(v, olh->count, olh->objsize, compare_values);
	for (i = 0; i < olh->count; i++) {
		table_show_value(buf, sizeof(buf), (ipfw_table_value *)v,
		    vmask, 0);
		printf("[%u] refs=%lu %s\n", v->kidx, (u_long)v->refcnt, buf);
		v = (ipfw_table_value *)((caddr_t)v + olh->objsize);
	}

	free(olh);
}

/*
 * Validates a user-supplied table name.
 * Returns 0 if acceptable, EINVAL otherwise.
 */
int
table_check_name(const char *tablename)
{

	if (ipfw_check_object_name(tablename) != 0)
		return (EINVAL);
	/* Restrict some 'special' names */
	if (strcmp(tablename, "all") == 0)
		return (EINVAL);
	return (0);
}
diff --git a/sbin/route/route.c b/sbin/route/route.c
index cea63df3aa11..f37d23d25c2c 100644
--- a/sbin/route/route.c
+++ b/sbin/route/route.c
@@ -1,2025 +1,2026 @@
/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1983, 1989, 1991, 1993
 * The Regents of the University of California. All rights reserved.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include #include #include #ifdef JAIL #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef JAIL #include #endif #include #include #include #include #include #include #include #include #include #include struct fibl { TAILQ_ENTRY(fibl) fl_next; int fl_num; int fl_error; int fl_errno; }; static struct keytab { const char *kt_cp; int kt_i; } const keywords[] = { #include "keywords.h" {0, 0} }; int verbose, debugonly; #ifdef JAIL char * jail_name; #endif static struct sockaddr_storage so[RTAX_MAX]; static int pid, rtm_addrs; static int nflag, af, aflen, qflag, tflag; static int locking, lockrest; static struct rt_metrics rt_metrics; static u_long rtm_inits; static uid_t uid; static int defaultfib; static int numfibs; static char domain_storage[MAXHOSTNAMELEN + 1]; static const char *domain; static char rt_line[NI_MAXHOST]; static char net_line[MAXHOSTNAMELEN + 1]; #ifdef WITHOUT_NETLINK static int s; static int rtm_seq; static struct { struct rt_msghdr m_rtm; char m_space[512]; } m_rtmsg; static int rtmsg_rtsock(int, int, int); static int flushroutes_fib_rtsock(int); static void monitor_rtsock(void); #else int rtmsg_nl(int, int, int, int, struct sockaddr_storage *, struct rt_metrics *); int flushroutes_fib_nl(int, int); void monitor_nl(int); #endif static TAILQ_HEAD(fibl_head_t, fibl) fibl_head; void printb(int, const char *); static void flushroutes(int argc, char *argv[]); static int flushroutes_fib(int); static int getaddr(int, char *, int); static int keyword(const char *); #ifdef INET static void inet_makemask(struct sockaddr_in *, u_long); #endif #ifdef INET6 static int inet6_makenetandmask(struct sockaddr_in6 *, const char *); #endif static void interfaces(void); static void monitor(int, char*[]); const char *netname(struct sockaddr *); static void newroute(int, char **); static int newroute_fib(int, char *, int); static void pmsg_addrs(char *, int, size_t); static 
void pmsg_common(struct rt_msghdr *, size_t);
static int prefixlen(const char *);
static void print_getmsg(struct rt_msghdr *, int, int);
static void print_rtmsg(struct rt_msghdr *, size_t);
const char *routename(struct sockaddr *);
static int rtmsg(int, int, int);
static void set_metric(char *, int);
static int set_sofib(int);
static void sockaddr(char *, struct sockaddr *, size_t);
static void sodump(struct sockaddr *, const char *);
static int fiboptlist_csv(const char *, struct fibl_head_t *);
static int fiboptlist_range(const char *, struct fibl_head_t *);
static void usage(const char *) __dead2;

#define	READ_TIMEOUT	10
static volatile sig_atomic_t stop_read;

/* SIGALRM handler: asks the monitor read loop to stop. */
static void
stopit(int sig __unused)
{

	stop_read = 1;
}

/*
 * Prints the usage message (optionally flagging the offending keyword
 * @cp) and exits with EX_USAGE.
 */
static void
usage(const char *cp)
{

	if (cp != NULL)
		warnx("bad keyword: %s", cp);
	errx(EX_USAGE, "usage: route [-j jail] [-46dnqtv] command [[modifiers] args]");
	/* NOTREACHED */
}

int
main(int argc, char **argv)
{
	int ch;
#ifdef JAIL
	int jid;
#endif
	size_t len;

	if (argc < 2)
		usage(NULL);

	while ((ch = getopt(argc, argv, "46nqdtvj:")) != -1)
		switch (ch) {
		case '4':
#ifdef INET
			af = AF_INET;
			aflen = sizeof(struct sockaddr_in);
#else
			errx(1, "IPv4 support is not compiled in");
#endif
			break;
		case '6':
#ifdef INET6
			af = AF_INET6;
			aflen = sizeof(struct sockaddr_in6);
#else
			errx(1, "IPv6 support is not compiled in");
#endif
			break;
		case 'n':
			nflag = 1;
			break;
		case 'q':
			qflag = 1;
			break;
		case 'v':
			verbose = 1;
			break;
		case 't':
			tflag = 1;
			break;
		case 'd':
			debugonly = 1;
			break;
		case 'j':
#ifdef JAIL
			if (optarg == NULL)
				usage(NULL);
			jail_name = optarg;
#else
			errx(1, "Jail support is not compiled in");
#endif
			break;
		case '?':
		default:
			usage(NULL);
		}
	argc -= optind;
	argv += optind;

	pid = getpid();
	uid = geteuid();

#ifdef JAIL
	/* Attach to the requested jail before touching any routing state. */
	if (jail_name != NULL) {
		jid = jail_getid(jail_name);
		if (jid == -1)
			errx(1, "Jail not found");
		if (jail_attach(jid) != 0)
			errx(1, "Cannot attach to jail");
	}
#endif

#ifdef WITHOUT_NETLINK
	/* With -t (test mode), write route messages to /dev/null instead. */
	if (tflag)
		s = open(_PATH_DEVNULL, O_WRONLY, 0);
	else
		s = socket(PF_ROUTE, SOCK_RAW, 0);
	if (s < 0)
		err(EX_OSERR, "socket");
#endif

	/* numfibs/defaultfib stay -1 when the sysctls are unavailable. */
	len = sizeof(numfibs);
	if (sysctlbyname("net.fibs", (void *)&numfibs, &len, NULL, 0) == -1)
		numfibs = -1;

	len = sizeof(defaultfib);
	if (numfibs != -1 &&
	    sysctlbyname("net.my_fibnum", (void *)&defaultfib, &len, NULL,
	    0) == -1)
		defaultfib = -1;

	if (*argv != NULL)
		switch (keyword(*argv)) {
		case K_GET:
		case K_SHOW:
			uid = 0;
			/* FALLTHROUGH */
		case K_CHANGE:
		case K_ADD:
		case K_DEL:
		case K_DELETE:
			newroute(argc, argv);
			/* NOTREACHED */
		case K_MONITOR:
			monitor(argc, argv);
			/* NOTREACHED */
		case K_FLUSH:
			flushroutes(argc, argv);
			exit(0);
			/* NOTREACHED */
		}
	usage(*argv);
	/* NOTREACHED */
}

/*
 * Binds the routing socket to fib @fib (rtsock builds only; netlink
 * passes the fib in each message).  Negative fibs are ignored.
 */
static int
set_sofib(int fib)
{

#ifdef WITHOUT_NETLINK
	if (fib < 0)
		return (0);
	return (setsockopt(s, SOL_SOCKET, SO_SETFIB, (void *)&fib,
	    sizeof(fib)));
#else
	return (0);
#endif
}

/*
 * Parses a "lo-hi" fib range from @arg and appends one fibl entry per
 * fib to @flh.  Returns 0 on success, 1 on any parse/allocation error.
 */
static int
fiboptlist_range(const char *arg, struct fibl_head_t *flh)
{
	struct fibl *fl;
	char *str0, *str, *token, *endptr;
	int fib[2], i, error;

	str0 = str = strdup(arg);
	error = 0;
	i = 0;
	while ((token = strsep(&str, "-")) != NULL) {
		switch (i) {
		case 0:
		case 1:
			errno = 0;
			fib[i] = strtol(token, &endptr, 0);
			if (errno == 0) {
				if (*endptr != '\0' ||
				    fib[i] < 0 ||
				    (numfibs != -1 &&
				    fib[i] > numfibs - 1))
					errno = EINVAL;
			}
			if (errno)
				error = 1;
			break;
		default:
			/* More than two dash-separated tokens. */
			error = 1;
		}
		if (error)
			goto fiboptlist_range_ret;
		i++;
	}
	if (fib[0] >= fib[1]) {
		error = 1;
		goto fiboptlist_range_ret;
	}
	for (i = fib[0]; i <= fib[1]; i++) {
		fl = calloc(1, sizeof(*fl));
		if (fl == NULL) {
			error = 1;
			goto fiboptlist_range_ret;
		}
		fl->fl_num = i;
		TAILQ_INSERT_TAIL(flh, fl, fl_next);
	}
fiboptlist_range_ret:
	free(str0);
	return (error);
}

#define	ALLSTRLEN	64
/*
 * Parses a comma-separated fib list from @arg ("all", "default",
 * numbers and lo-hi ranges) and appends fibl entries to @flh.
 * Returns 0 on success, 1 on any parse/allocation error.
 */
static int
fiboptlist_csv(const char *arg, struct fibl_head_t *flh)
{
	struct fibl *fl;
	char *str0, *str, *token, *endptr;
	int fib, error;

	str0 = str = NULL;
	if (strcmp("all", arg) == 0) {
		/*
		 * Keep the allocation in str0 as well: strsep() advances
		 * str, so freeing str0 is the only way to release the
		 * buffer (the original code left str0 NULL here and
		 * leaked it).
		 */
		str0 = str = calloc(1, ALLSTRLEN);
		if (str == NULL) {
			error = 1;
			goto fiboptlist_csv_ret;
		}
		if (numfibs > 1)
			snprintf(str, ALLSTRLEN - 1, "%d-%d", 0, numfibs - 1);
		else
			snprintf(str, ALLSTRLEN - 1, "%d", 0);
	} else if (strcmp("default", arg) == 0) {
		str0 = str = calloc(1, ALLSTRLEN);
		if (str == NULL) {
			error = 1;
			goto fiboptlist_csv_ret;
		}
		snprintf(str, ALLSTRLEN - 1, "%d", defaultfib);
	} else
		str0 = str = strdup(arg);

	error = 0;
	while ((token = strsep(&str, ",")) != NULL) {
		/* A dash past the first character means a range. */
		if (*token != '-' && strchr(token, '-') != NULL) {
			error = fiboptlist_range(token, flh);
			if (error)
				goto fiboptlist_csv_ret;
		} else {
			errno = 0;
			fib = strtol(token, &endptr, 0);
			if (errno == 0) {
				if (*endptr != '\0' ||
				    fib < 0 ||
				    (numfibs != -1 &&
				    fib > numfibs - 1))
					errno = EINVAL;
			}
			if (errno) {
				error = 1;
				goto fiboptlist_csv_ret;
			}
			fl = calloc(1, sizeof(*fl));
			if (fl == NULL) {
				error = 1;
				goto fiboptlist_csv_ret;
			}
			fl->fl_num = fib;
			TAILQ_INSERT_TAIL(flh, fl, fl_next);
		}
	}
fiboptlist_csv_ret:
	if (str0 != NULL)
		free(str0);
	return (error);
}

/*
 * Purge all entries in the routing tables not
 * associated with network interfaces.
 */
static void
flushroutes(int argc, char *argv[])
{
	struct fibl *fl;
	int error;

	if (uid != 0 && !debugonly && !tflag)
		errx(EX_NOPERM, "must be root to alter routing table");
#ifdef WITHOUT_NETLINK
	shutdown(s, SHUT_RD); /* Don't want to read back our messages */
#endif

	TAILQ_INIT(&fibl_head);
	while (argc > 1) {
		argc--;
		argv++;
		if (**argv != '-')
			usage(*argv);
		switch (keyword(*argv + 1)) {
#ifdef INET
		case K_4:
		case K_INET:
			af = AF_INET;
			break;
#endif
#ifdef INET6
		case K_6:
		case K_INET6:
			af = AF_INET6;
			break;
#endif
		case K_LINK:
			af = AF_LINK;
			break;
		case K_FIB:
			if (!--argc)
				usage(*argv);
			error = fiboptlist_csv(*++argv, &fibl_head);
			if (error)
				errx(EX_USAGE, "invalid fib number: %s",
				    *argv);
			break;
		default:
			usage(*argv);
		}
	}
	if (TAILQ_EMPTY(&fibl_head)) {
		error = fiboptlist_csv("default", &fibl_head);
		if (error)
			errx(EX_OSERR, "fiboptlist_csv failed.");
	}
	TAILQ_FOREACH(fl, &fibl_head, fl_next)
		flushroutes_fib(fl->fl_num);
}

/* Flushes one fib via the rtsock or netlink backend. */
static int
flushroutes_fib(int fib)
{
#ifdef WITHOUT_NETLINK
	return
(flushroutes_fib_rtsock(fib)); #else return (flushroutes_fib_nl(fib, af)); #endif } #ifdef WITHOUT_NETLINK static int flushroutes_fib_rtsock(int fib) { struct rt_msghdr *rtm; size_t needed; char *buf, *next, *lim; int mib[7], rlen, seqno, count = 0; int error; error = set_sofib(fib); if (error) { warn("fib number %d is ignored", fib); return (error); } retry: mib[0] = CTL_NET; mib[1] = PF_ROUTE; mib[2] = 0; /* protocol */ mib[3] = AF_UNSPEC; mib[4] = NET_RT_DUMP; mib[5] = 0; /* no flags */ mib[6] = fib; if (sysctl(mib, nitems(mib), NULL, &needed, NULL, 0) < 0) err(EX_OSERR, "route-sysctl-estimate"); if ((buf = malloc(needed)) == NULL) errx(EX_OSERR, "malloc failed"); if (sysctl(mib, nitems(mib), buf, &needed, NULL, 0) < 0) { if (errno == ENOMEM && count++ < 10) { warnx("Routing table grew, retrying"); sleep(1); free(buf); goto retry; } err(EX_OSERR, "route-sysctl-get"); } lim = buf + needed; if (verbose) (void)printf("Examining routing table from sysctl\n"); seqno = 0; /* ??? */ for (next = buf; next < lim; next += rtm->rtm_msglen) { rtm = (struct rt_msghdr *)(void *)next; if (verbose) print_rtmsg(rtm, rtm->rtm_msglen); if ((rtm->rtm_flags & RTF_GATEWAY) == 0) continue; if (af != 0) { struct sockaddr *sa = (struct sockaddr *)(rtm + 1); if (sa->sa_family != af) continue; } if (debugonly) continue; rtm->rtm_type = RTM_DELETE; rtm->rtm_seq = seqno; rlen = write(s, next, rtm->rtm_msglen); if (rlen < 0 && errno == EPERM) err(1, "write to routing socket"); if (rlen < (int)rtm->rtm_msglen) { warn("write to routing socket"); (void)printf("got only %d for rlen\n", rlen); free(buf); goto retry; break; } seqno++; if (qflag) continue; if (verbose) print_rtmsg(rtm, rlen); else { struct sockaddr *sa = (struct sockaddr *)(rtm + 1); printf("%-20.20s ", rtm->rtm_flags & RTF_HOST ? 
routename(sa) : netname(sa)); sa = (struct sockaddr *)(SA_SIZE(sa) + (char *)sa); printf("%-20.20s ", routename(sa)); if (fib >= 0) printf("-fib %-3d ", fib); printf("done\n"); } } free(buf); return (error); } #endif const char * routename(struct sockaddr *sa) { struct sockaddr_dl *sdl; const char *cp; int n; if (domain == NULL) { if (gethostname(domain_storage, sizeof(domain_storage) - 1) == 0 && (cp = strchr(domain_storage, '.')) != NULL) { domain_storage[sizeof(domain_storage) - 1] = '\0'; domain = cp + 1; } else { domain_storage[0] = '\0'; domain = domain_storage; } } /* If the address is zero-filled, use "default". */ if (sa->sa_len == 0 && nflag == 0) return ("default"); #if defined(INET) || defined(INET6) switch (sa->sa_family) { #ifdef INET case AF_INET: /* If the address is zero-filled, use "default". */ if (nflag == 0 && ((struct sockaddr_in *)(void *)sa)->sin_addr.s_addr == INADDR_ANY) return("default"); break; #endif #ifdef INET6 case AF_INET6: /* If the address is zero-filled, use "default". */ if (nflag == 0 && IN6_IS_ADDR_UNSPECIFIED(&((struct sockaddr_in6 *)(void *)sa)->sin6_addr)) return("default"); break; #endif } #endif switch (sa->sa_family) { #if defined(INET) || defined(INET6) #ifdef INET case AF_INET: #endif #ifdef INET6 case AF_INET6: #endif { struct sockaddr_storage ss; int error; char *p; memset(&ss, 0, sizeof(ss)); if (sa->sa_len == 0) ss.ss_family = sa->sa_family; else memcpy(&ss, sa, sa->sa_len); /* Expand sa->sa_len because it could be shortened. */ if (sa->sa_family == AF_INET) ss.ss_len = sizeof(struct sockaddr_in); else if (sa->sa_family == AF_INET6) ss.ss_len = sizeof(struct sockaddr_in6); error = getnameinfo((struct sockaddr *)&ss, ss.ss_len, rt_line, sizeof(rt_line), NULL, 0, (nflag == 0) ? 0 : NI_NUMERICHOST); if (error) { warnx("getnameinfo(): %s", gai_strerror(error)); strncpy(rt_line, "invalid", sizeof(rt_line)); } /* Remove the domain part if any. 
*/ p = strchr(rt_line, '.'); if (p != NULL && strcmp(p + 1, domain) == 0) *p = '\0'; return (rt_line); break; } #endif case AF_LINK: sdl = (struct sockaddr_dl *)(void *)sa; if (sdl->sdl_nlen == 0 && sdl->sdl_alen == 0 && sdl->sdl_slen == 0) { n = snprintf(rt_line, sizeof(rt_line), "link#%d", sdl->sdl_index); if (n > (int)sizeof(rt_line)) rt_line[0] = '\0'; return (rt_line); } else return (link_ntoa(sdl)); break; default: { u_short *sp = (u_short *)(void *)sa; u_short *splim = sp + ((sa->sa_len + 1) >> 1); char *cps = rt_line + sprintf(rt_line, "(%d)", sa->sa_family); char *cpe = rt_line + sizeof(rt_line); while (++sp < splim && cps < cpe) /* start with sa->sa_data */ if ((n = snprintf(cps, cpe - cps, " %x", *sp)) > 0) cps += n; else *cps = '\0'; break; } } return (rt_line); } /* * Return the name of the network whose address is given. * The address is assumed to be that of a net, not a host. */ const char * netname(struct sockaddr *sa) { struct sockaddr_dl *sdl; int n; #ifdef INET struct netent *np = NULL; const char *cp = NULL; u_long i; #endif switch (sa->sa_family) { #ifdef INET case AF_INET: { struct in_addr in; in = ((struct sockaddr_in *)(void *)sa)->sin_addr; i = in.s_addr = ntohl(in.s_addr); if (in.s_addr == 0) cp = "default"; else if (!nflag) { np = getnetbyaddr(i, AF_INET); if (np != NULL) cp = np->n_name; } #define C(x) (unsigned)((x) & 0xff) if (cp != NULL) strncpy(net_line, cp, sizeof(net_line)); else if ((in.s_addr & 0xffffff) == 0) (void)sprintf(net_line, "%u", C(in.s_addr >> 24)); else if ((in.s_addr & 0xffff) == 0) (void)sprintf(net_line, "%u.%u", C(in.s_addr >> 24), C(in.s_addr >> 16)); else if ((in.s_addr & 0xff) == 0) (void)sprintf(net_line, "%u.%u.%u", C(in.s_addr >> 24), C(in.s_addr >> 16), C(in.s_addr >> 8)); else (void)sprintf(net_line, "%u.%u.%u.%u", C(in.s_addr >> 24), C(in.s_addr >> 16), C(in.s_addr >> 8), C(in.s_addr)); #undef C break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 sin6; int niflags = 0; memset(&sin6, 0, 
sizeof(sin6)); memcpy(&sin6, sa, sa->sa_len); sin6.sin6_len = sizeof(sin6); sin6.sin6_family = AF_INET6; if (nflag) niflags |= NI_NUMERICHOST; if (getnameinfo((struct sockaddr *)&sin6, sin6.sin6_len, net_line, sizeof(net_line), NULL, 0, niflags) != 0) strncpy(net_line, "invalid", sizeof(net_line)); return(net_line); } #endif case AF_LINK: sdl = (struct sockaddr_dl *)(void *)sa; if (sdl->sdl_nlen == 0 && sdl->sdl_alen == 0 && sdl->sdl_slen == 0) { n = snprintf(net_line, sizeof(net_line), "link#%d", sdl->sdl_index); if (n > (int)sizeof(net_line)) net_line[0] = '\0'; return (net_line); } else return (link_ntoa(sdl)); break; default: { u_short *sp = (u_short *)(void *)sa->sa_data; u_short *splim = sp + ((sa->sa_len + 1)>>1); char *cps = net_line + sprintf(net_line, "af %d:", sa->sa_family); char *cpe = net_line + sizeof(net_line); while (sp < splim && cps < cpe) if ((n = snprintf(cps, cpe - cps, " %x", *sp++)) > 0) cps += n; else *cps = '\0'; break; } } return (net_line); } static void set_metric(char *value, int key) { int flag = 0; char *endptr; u_long noval, *valp = &noval; switch (key) { #define caseof(x, y, z) case x: valp = &rt_metrics.z; flag = y; break caseof(K_MTU, RTV_MTU, rmx_mtu); caseof(K_HOPCOUNT, RTV_HOPCOUNT, rmx_hopcount); caseof(K_EXPIRE, RTV_EXPIRE, rmx_expire); caseof(K_RECVPIPE, RTV_RPIPE, rmx_recvpipe); caseof(K_SENDPIPE, RTV_SPIPE, rmx_sendpipe); caseof(K_SSTHRESH, RTV_SSTHRESH, rmx_ssthresh); caseof(K_RTT, RTV_RTT, rmx_rtt); caseof(K_RTTVAR, RTV_RTTVAR, rmx_rttvar); caseof(K_WEIGHT, RTV_WEIGHT, rmx_weight); } rtm_inits |= flag; if (lockrest || locking) rt_metrics.rmx_locks |= flag; if (locking) locking = 0; errno = 0; *valp = strtol(value, &endptr, 0); if (errno == 0 && *endptr != '\0') errno = EINVAL; if (errno) err(EX_USAGE, "%s", value); if (flag & RTV_EXPIRE && (value[0] == '+' || value[0] == '-')) { struct timespec ts; clock_gettime(CLOCK_REALTIME_FAST, &ts); *valp += ts.tv_sec; } } #define F_ISHOST 0x01 #define F_FORCENET 0x02 #define 
F_FORCEHOST 0x04 #define F_PROXY 0x08 #define F_INTERFACE 0x10 static void newroute(int argc, char **argv) { struct sigaction sa; struct fibl *fl; char *cmd; const char *dest, *gateway, *errmsg; int key, error, flags, nrflags, fibnum; if (uid != 0 && !debugonly && !tflag) errx(EX_NOPERM, "must be root to alter routing table"); dest = NULL; gateway = NULL; flags = RTF_STATIC; nrflags = 0; TAILQ_INIT(&fibl_head); sigemptyset(&sa.sa_mask); sa.sa_flags = 0; sa.sa_handler = stopit; if (sigaction(SIGALRM, &sa, 0) == -1) warn("sigaction SIGALRM"); cmd = argv[0]; #ifdef WITHOUT_NETLINK if (*cmd != 'g' && *cmd != 's') shutdown(s, SHUT_RD); /* Don't want to read back our messages */ #endif while (--argc > 0) { if (**(++argv)== '-') { switch (key = keyword(1 + *argv)) { case K_LINK: af = AF_LINK; aflen = sizeof(struct sockaddr_dl); break; #ifdef INET case K_4: case K_INET: af = AF_INET; aflen = sizeof(struct sockaddr_in); break; #endif #ifdef INET6 case K_6: case K_INET6: af = AF_INET6; aflen = sizeof(struct sockaddr_in6); break; #endif case K_SA: af = PF_ROUTE; aflen = sizeof(struct sockaddr_storage); break; case K_IFACE: case K_INTERFACE: nrflags |= F_INTERFACE; break; case K_NOSTATIC: flags &= ~RTF_STATIC; break; case K_LOCK: locking = 1; break; case K_LOCKREST: lockrest = 1; break; case K_HOST: nrflags |= F_FORCEHOST; break; case K_REJECT: flags |= RTF_REJECT; break; case K_BLACKHOLE: flags |= RTF_BLACKHOLE; break; case K_PROTO1: flags |= RTF_PROTO1; break; case K_PROTO2: flags |= RTF_PROTO2; break; case K_PROXY: nrflags |= F_PROXY; break; case K_XRESOLVE: flags |= RTF_XRESOLVE; break; case K_STATIC: flags |= RTF_STATIC; break; case K_STICKY: flags |= RTF_STICKY; break; case K_NOSTICK: flags &= ~RTF_STICKY; break; case K_FIB: if (!--argc) usage(NULL); error = fiboptlist_csv(*++argv, &fibl_head); if (error) errx(EX_USAGE, "invalid fib number: %s", *argv); break; case K_IFA: if (!--argc) usage(NULL); getaddr(RTAX_IFA, *++argv, nrflags); break; case K_IFP: if (!--argc) 
usage(NULL); getaddr(RTAX_IFP, *++argv, nrflags); break; case K_GENMASK: if (!--argc) usage(NULL); getaddr(RTAX_GENMASK, *++argv, nrflags); break; case K_GATEWAY: if (!--argc) usage(NULL); getaddr(RTAX_GATEWAY, *++argv, nrflags); gateway = *argv; break; case K_DST: if (!--argc) usage(NULL); if (getaddr(RTAX_DST, *++argv, nrflags)) nrflags |= F_ISHOST; dest = *argv; break; case K_NETMASK: if (!--argc) usage(NULL); getaddr(RTAX_NETMASK, *++argv, nrflags); /* FALLTHROUGH */ case K_NET: nrflags |= F_FORCENET; break; case K_PREFIXLEN: if (!--argc) usage(NULL); if (prefixlen(*++argv) == -1) { nrflags &= ~F_FORCENET; nrflags |= F_ISHOST; } else { nrflags |= F_FORCENET; nrflags &= ~F_ISHOST; } break; case K_MTU: case K_HOPCOUNT: case K_EXPIRE: case K_RECVPIPE: case K_SENDPIPE: case K_SSTHRESH: case K_RTT: case K_RTTVAR: case K_WEIGHT: if (!--argc) usage(NULL); set_metric(*++argv, key); break; default: usage(1+*argv); } } else { if ((rtm_addrs & RTA_DST) == 0) { dest = *argv; if (getaddr(RTAX_DST, *argv, nrflags)) nrflags |= F_ISHOST; } else if ((rtm_addrs & RTA_GATEWAY) == 0) { gateway = *argv; getaddr(RTAX_GATEWAY, *argv, nrflags); } else { getaddr(RTAX_NETMASK, *argv, nrflags); nrflags |= F_FORCENET; } } } /* Do some sanity checks on resulting request */ if (so[RTAX_DST].ss_len == 0) { warnx("destination parameter required"); usage(NULL); } if (so[RTAX_NETMASK].ss_len != 0 && so[RTAX_DST].ss_family != so[RTAX_NETMASK].ss_family) { warnx("destination and netmask family need to be the same"); usage(NULL); } if (nrflags & F_FORCEHOST) { nrflags |= F_ISHOST; #ifdef INET6 if (af == AF_INET6) { rtm_addrs &= ~RTA_NETMASK; memset(&so[RTAX_NETMASK], 0, sizeof(so[RTAX_NETMASK])); } #endif } if (nrflags & F_FORCENET) nrflags &= ~F_ISHOST; flags |= RTF_UP; if (nrflags & F_ISHOST) flags |= RTF_HOST; if ((nrflags & F_INTERFACE) == 0) flags |= RTF_GATEWAY; if (nrflags & F_PROXY) flags |= RTF_ANNOUNCE; if (dest == NULL) dest = ""; if (gateway == NULL) gateway = ""; if 
(TAILQ_EMPTY(&fibl_head)) { error = fiboptlist_csv("default", &fibl_head); if (error) errx(EX_OSERR, "fiboptlist_csv failed."); } error = 0; TAILQ_FOREACH(fl, &fibl_head, fl_next) { fl->fl_error = newroute_fib(fl->fl_num, cmd, flags); if (fl->fl_error) fl->fl_errno = errno; error += fl->fl_error; } if (*cmd == 'g' || *cmd == 's') exit(error); error = 0; if (!qflag) { fibnum = 0; TAILQ_FOREACH(fl, &fibl_head, fl_next) { if (fl->fl_error == 0) fibnum++; } if (fibnum > 0) { int firstfib = 1; printf("%s %s %s", cmd, (nrflags & F_ISHOST) ? "host" : "net", dest); if (*gateway) printf(": gateway %s", gateway); if (numfibs > 1) { TAILQ_FOREACH(fl, &fibl_head, fl_next) { if (fl->fl_error == 0 && fl->fl_num >= 0) { if (firstfib) { printf(" fib "); firstfib = 0; } printf("%d", fl->fl_num); if (fibnum-- > 1) printf(","); } } } printf("\n"); } } fibnum = 0; TAILQ_FOREACH(fl, &fibl_head, fl_next) { if (fl->fl_error != 0) { error = 1; if (!qflag) { printf("%s %s %s", cmd, (nrflags & F_ISHOST) ? "host" : "net", dest); if (*gateway) printf(": gateway %s", gateway); if (fl->fl_num >= 0) printf(" fib %d", fl->fl_num); switch (fl->fl_errno) { case ESRCH: errmsg = "not in table"; break; case EBUSY: errmsg = "entry in use"; break; case ENOBUFS: errmsg = "not enough memory"; break; case EADDRINUSE: /* * handle recursion avoidance * in rt_setgate() */ errmsg = "gateway uses the same route"; break; case EEXIST: errmsg = "route already in table"; break; default: errmsg = strerror(fl->fl_errno); break; } printf(": %s\n", errmsg); } } } exit(error); } static int newroute_fib(int fib, char *cmd, int flags) { int error; error = set_sofib(fib); if (error) { warn("fib number %d is ignored", fib); return (error); } error = rtmsg(*cmd, flags, fib); return (error); } #ifdef INET static void inet_makemask(struct sockaddr_in *sin_mask, u_long bits) { u_long mask = 0; rtm_addrs |= RTA_NETMASK; if (bits != 0) mask = 0xffffffff << (32 - bits); sin_mask->sin_addr.s_addr = htonl(mask); sin_mask->sin_len = 
sizeof(struct sockaddr_in); sin_mask->sin_family = AF_INET; } #endif #ifdef INET6 /* * XXX the function may need more improvement... */ static int inet6_makenetandmask(struct sockaddr_in6 *sin6, const char *plen) { if (plen == NULL) { if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) && sin6->sin6_scope_id == 0) plen = "0"; } if (plen == NULL || strcmp(plen, "128") == 0) return (1); rtm_addrs |= RTA_NETMASK; prefixlen(plen); return (0); } #endif /* * Interpret an argument as a network address of some kind, * returning 1 if a host address, 0 if a network address. */ static int getaddr(int idx, char *str, int nrflags) { struct sockaddr *sa; #if defined(INET) struct sockaddr_in *sin; struct hostent *hp; char *q; #elif defined(INET6) char *q; #endif if (idx < 0 || idx >= RTAX_MAX) usage("internal error"); if (af == 0) { #if defined(INET) af = AF_INET; aflen = sizeof(struct sockaddr_in); #elif defined(INET6) af = AF_INET6; aflen = sizeof(struct sockaddr_in6); #else af = AF_LINK; aflen = sizeof(struct sockaddr_dl); #endif } rtm_addrs |= (1 << idx); sa = (struct sockaddr *)&so[idx]; sa->sa_family = af; sa->sa_len = aflen; switch (idx) { case RTAX_GATEWAY: if (nrflags & F_INTERFACE) { struct ifaddrs *ifap, *ifa; struct sockaddr_dl *sdl0 = (struct sockaddr_dl *)(void *)sa; struct sockaddr_dl *sdl = NULL; if (getifaddrs(&ifap)) err(EX_OSERR, "getifaddrs"); for (ifa = ifap; ifa != NULL; ifa = ifa->ifa_next) { if (ifa->ifa_addr->sa_family != AF_LINK) continue; if (strcmp(str, ifa->ifa_name) != 0) continue; sdl = (struct sockaddr_dl *)(void *)ifa->ifa_addr; } /* If we found it, then use it */ if (sdl != NULL) { /* * Note that we need to copy before calling * freeifaddrs(). 
*/ memcpy(sdl0, sdl, sdl->sdl_len); } freeifaddrs(ifap); if (sdl != NULL) return(1); else errx(EX_DATAERR, "interface '%s' does not exist", str); } break; case RTAX_IFP: sa->sa_family = AF_LINK; break; } if (strcmp(str, "default") == 0) { /* * Default is net 0.0.0.0/0 */ switch (idx) { case RTAX_DST: nrflags |= F_FORCENET; getaddr(RTAX_NETMASK, str, nrflags); break; } return (0); } switch (sa->sa_family) { #ifdef INET6 case AF_INET6: { struct addrinfo hints, *res; int ecode; q = NULL; if (idx == RTAX_DST && (q = strchr(str, '/')) != NULL) *q = '\0'; memset(&hints, 0, sizeof(hints)); hints.ai_family = sa->sa_family; hints.ai_socktype = SOCK_DGRAM; ecode = getaddrinfo(str, NULL, &hints, &res); if (ecode != 0 || res->ai_family != AF_INET6 || res->ai_addrlen != sizeof(struct sockaddr_in6)) errx(EX_OSERR, "%s: %s", str, gai_strerror(ecode)); memcpy(sa, res->ai_addr, res->ai_addrlen); freeaddrinfo(res); if (q != NULL) *q++ = '/'; if (idx == RTAX_DST) return (inet6_makenetandmask((struct sockaddr_in6 *)(void *)sa, q)); return (0); } #endif /* INET6 */ case AF_LINK: link_addr(str, (struct sockaddr_dl *)(void *)sa); return (1); case PF_ROUTE: sockaddr(str, sa, sizeof(struct sockaddr_storage)); return (1); #ifdef INET case AF_INET: #endif default: break; } #ifdef INET sin = (struct sockaddr_in *)(void *)sa; q = strchr(str,'/'); if (q != NULL && idx == RTAX_DST) { /* A.B.C.D/NUM */ struct sockaddr_in *mask; uint32_t mask_bits; *q = '\0'; if (inet_aton(str, &sin->sin_addr) == 0) errx(EX_NOHOST, "bad address: %s", str); int masklen = strtol(q + 1, NULL, 10); if (masklen < 0 || masklen > 32) errx(EX_NOHOST, "bad mask length: %s", q + 1); inet_makemask((struct sockaddr_in *)&so[RTAX_NETMASK],masklen); /* * Check for bogus destination such as "10/8"; heuristic is * that there are bits set in the host part, and no dot * is present. 
*/ mask = ((struct sockaddr_in *) &so[RTAX_NETMASK]); mask_bits = ntohl(mask->sin_addr.s_addr); if ((ntohl(sin->sin_addr.s_addr) & ~mask_bits) != 0 && strchr(str, '.') == NULL) errx(EX_NOHOST, "malformed address, bits set after mask;" " %s means %s", str, inet_ntoa(sin->sin_addr)); return (0); } if (inet_aton(str, &sin->sin_addr) != 0) return (1); hp = gethostbyname(str); if (hp != NULL) { sin->sin_family = hp->h_addrtype; memmove((char *)&sin->sin_addr, hp->h_addr, MIN((size_t)hp->h_length, sizeof(sin->sin_addr))); return (1); } #endif errx(EX_NOHOST, "bad address: %s", str); } static int prefixlen(const char *str) { int len = atoi(str), q, r; int max; char *p; rtm_addrs |= RTA_NETMASK; switch (af) { #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&so[RTAX_NETMASK]; max = 128; p = (char *)&sin6->sin6_addr; sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); break; } #endif #ifdef INET case AF_INET: { struct sockaddr_in *sin = (struct sockaddr_in *)&so[RTAX_NETMASK]; max = 32; p = (char *)&sin->sin_addr; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); break; } #endif default: errx(EX_OSERR, "prefixlen not supported in this af"); } if (len < 0 || max < len) errx(EX_USAGE, "%s: invalid prefixlen", str); q = len >> 3; r = len & 7; memset((void *)p, 0, max / 8); if (q > 0) memset((void *)p, 0xff, q); if (r > 0) *((u_char *)p + q) = (0xff00 >> r) & 0xff; if (len == max) return (-1); else return (len); } static void interfaces(void) { size_t needed; int mib[6]; char *buf, *lim, *next, count = 0; struct rt_msghdr *rtm; retry2: mib[0] = CTL_NET; mib[1] = PF_ROUTE; mib[2] = 0; /* protocol */ mib[3] = AF_UNSPEC; mib[4] = NET_RT_IFLIST; mib[5] = 0; /* no flags */ if (sysctl(mib, nitems(mib), NULL, &needed, NULL, 0) < 0) err(EX_OSERR, "route-sysctl-estimate"); if ((buf = malloc(needed)) == NULL) errx(EX_OSERR, "malloc failed"); if (sysctl(mib, nitems(mib), buf, &needed, NULL, 0) < 0) { if (errno == ENOMEM && count++ < 10) { 
warnx("Routing table grew, retrying"); sleep(1); free(buf); goto retry2; } err(EX_OSERR, "actual retrieval of interface table"); } lim = buf + needed; for (next = buf; next < lim; next += rtm->rtm_msglen) { rtm = (struct rt_msghdr *)(void *)next; print_rtmsg(rtm, rtm->rtm_msglen); } free(buf); } static void monitor(int argc, char *argv[]) { int fib, error; char *endptr; fib = defaultfib; while (argc > 1) { argc--; argv++; if (**argv != '-') usage(*argv); switch (keyword(*argv + 1)) { case K_FIB: if (!--argc) usage(*argv); errno = 0; fib = strtol(*++argv, &endptr, 0); if (errno == 0) { if (*endptr != '\0' || fib < 0 || (numfibs != -1 && fib > numfibs - 1)) errno = EINVAL; } if (errno) errx(EX_USAGE, "invalid fib number: %s", *argv); break; default: usage(*argv); } } error = set_sofib(fib); if (error) errx(EX_USAGE, "invalid fib number: %d", fib); verbose = 1; if (debugonly) { interfaces(); exit(0); } #ifdef WITHOUT_NETLINK monitor_rtsock(); #else monitor_nl(fib); #endif } #ifdef WITHOUT_NETLINK static void monitor_rtsock(void) { char msg[2048]; int n; #ifdef SO_RERROR n = 1; if (setsockopt(s, SOL_SOCKET, SO_RERROR, &n, sizeof(n)) == -1) warn("SO_RERROR"); #endif for (;;) { time_t now; n = read(s, msg, sizeof(msg)); if (n == -1) { warn("read"); continue; } now = time(NULL); (void)printf("\ngot message of size %d on %s", n, ctime(&now)); print_rtmsg((struct rt_msghdr *)(void *)msg, n); } } #endif static int rtmsg(int cmd, int flags, int fib) { errno = 0; if (cmd == 'a') cmd = RTM_ADD; else if (cmd == 'c') cmd = RTM_CHANGE; else if (cmd == 'g' || cmd == 's') { cmd = RTM_GET; if (so[RTAX_IFP].ss_family == 0) { so[RTAX_IFP].ss_family = AF_LINK; so[RTAX_IFP].ss_len = sizeof(struct sockaddr_dl); rtm_addrs |= RTA_IFP; } } else { cmd = RTM_DELETE; flags |= RTF_PINNED; } #ifdef WITHOUT_NETLINK return (rtmsg_rtsock(cmd, flags, fib)); #else errno = rtmsg_nl(cmd, flags, fib, rtm_addrs, so, &rt_metrics); return (errno == 0 ? 
0 : -1); #endif } #ifdef WITHOUT_NETLINK static int rtmsg_rtsock(int cmd, int flags, int fib) { int rlen; char *cp = m_rtmsg.m_space; int l; memset(&m_rtmsg, 0, sizeof(m_rtmsg)); #define NEXTADDR(w, u) \ if (rtm_addrs & (w)) { \ l = SA_SIZE(&(u)); \ memmove(cp, (char *)&(u), l); \ cp += l; \ if (verbose) \ sodump((struct sockaddr *)&(u), #w); \ } #define rtm m_rtmsg.m_rtm rtm.rtm_type = cmd; rtm.rtm_flags = flags; rtm.rtm_version = RTM_VERSION; rtm.rtm_seq = ++rtm_seq; rtm.rtm_addrs = rtm_addrs; rtm.rtm_rmx = rt_metrics; rtm.rtm_inits = rtm_inits; NEXTADDR(RTA_DST, so[RTAX_DST]); NEXTADDR(RTA_GATEWAY, so[RTAX_GATEWAY]); NEXTADDR(RTA_NETMASK, so[RTAX_NETMASK]); NEXTADDR(RTA_GENMASK, so[RTAX_GENMASK]); NEXTADDR(RTA_IFP, so[RTAX_IFP]); NEXTADDR(RTA_IFA, so[RTAX_IFA]); rtm.rtm_msglen = l = cp - (char *)&m_rtmsg; if (verbose) print_rtmsg(&rtm, l); if (debugonly) return (0); if ((rlen = write(s, (char *)&m_rtmsg, l)) < 0) { switch (errno) { case EPERM: err(1, "writing to routing socket"); break; case ESRCH: warnx("route has not been found"); break; case EEXIST: /* Handled by newroute() */ break; default: warn("writing to routing socket"); } return (-1); } if (cmd == RTM_GET) { stop_read = 0; alarm(READ_TIMEOUT); do { l = read(s, (char *)&m_rtmsg, sizeof(m_rtmsg)); } while (l > 0 && stop_read == 0 && (rtm.rtm_type != RTM_GET || rtm.rtm_seq != rtm_seq || rtm.rtm_pid != pid)); if (stop_read != 0) { warnx("read from routing socket timed out"); return (-1); } else alarm(0); if (l < 0) warn("read from routing socket"); else print_getmsg(&rtm, l, fib); } #undef rtm return (0); } #endif static const char *const msgtypes[] = { "", "RTM_ADD: Add Route", "RTM_DELETE: Delete Route", "RTM_CHANGE: Change Metrics or flags", "RTM_GET: Report Metrics", "RTM_LOSING: Kernel Suspects Partitioning", "RTM_REDIRECT: Told to use different route", "RTM_MISS: Lookup failed on this address", "RTM_LOCK: fix specified metrics", "RTM_OLDADD: caused by SIOCADDRT", "RTM_OLDDEL: caused by SIOCDELRT", 
"RTM_RESOLVE: Route created by cloning", "RTM_NEWADDR: address being added to iface", "RTM_DELADDR: address being removed from iface", "RTM_IFINFO: iface status change", "RTM_NEWMADDR: new multicast group membership on iface", "RTM_DELMADDR: multicast group membership removed from iface", "RTM_IFANNOUNCE: interface arrival/departure", "RTM_IEEE80211: IEEE 802.11 wireless event", + "RTM_IPFWLOG: IPFW log", }; static const char metricnames[] = "\011weight\010rttvar\7rtt\6ssthresh\5sendpipe\4recvpipe\3expire" "\1mtu"; const char routeflags[] = "\1UP\2GATEWAY\3HOST\4REJECT\5DYNAMIC\6MODIFIED\7DONE" "\012XRESOLVE\013LLINFO\014STATIC\015BLACKHOLE" "\017PROTO2\020PROTO1\021PRCLONING\022WASCLONED\023PROTO3" "\024FIXEDMTU\025PINNED\026LOCAL\027BROADCAST\030MULTICAST\035STICKY"; static const char ifnetflags[] = "\1UP\2BROADCAST\3DEBUG\4LOOPBACK\5PTP\6b6\7RUNNING\010NOARP" "\011PPROMISC\012ALLMULTI\013OACTIVE\014SIMPLEX\015LINK0\016LINK1" "\017LINK2\020MULTICAST"; static const char addrnames[] = "\1DST\2GATEWAY\3NETMASK\4GENMASK\5IFP\6IFA\7AUTHOR\010BRD"; static const char errfmt[] = "\n%s: truncated route message, only %zu bytes left\n"; static void print_rtmsg(struct rt_msghdr *rtm, size_t msglen) { struct if_msghdr *ifm; struct ifa_msghdr *ifam; #ifdef RTM_NEWMADDR struct ifma_msghdr *ifmam; #endif struct if_announcemsghdr *ifan; const char *state; if (verbose == 0) return; if (rtm->rtm_version != RTM_VERSION) { (void)printf("routing message version %d not understood\n", rtm->rtm_version); return; } if (rtm->rtm_type < nitems(msgtypes)) (void)printf("%s: ", msgtypes[rtm->rtm_type]); else (void)printf("unknown type %d: ", rtm->rtm_type); (void)printf("len %d, ", rtm->rtm_msglen); #define REQUIRE(x) do { \ if (msglen < sizeof(x)) \ goto badlen; \ else \ msglen -= sizeof(x); \ } while (0) switch (rtm->rtm_type) { case RTM_IFINFO: REQUIRE(struct if_msghdr); ifm = (struct if_msghdr *)rtm; (void)printf("if# %d, ", ifm->ifm_index); switch (ifm->ifm_data.ifi_link_state) { case 
LINK_STATE_DOWN: state = "down"; break; case LINK_STATE_UP: state = "up"; break; default: state = "unknown"; break; } (void)printf("link: %s, flags:", state); printb(ifm->ifm_flags, ifnetflags); pmsg_addrs((char *)(ifm + 1), ifm->ifm_addrs, msglen); break; case RTM_NEWADDR: case RTM_DELADDR: REQUIRE(struct ifa_msghdr); ifam = (struct ifa_msghdr *)rtm; (void)printf("metric %d, flags:", ifam->ifam_metric); printb(ifam->ifam_flags, routeflags); pmsg_addrs((char *)(ifam + 1), ifam->ifam_addrs, msglen); break; #ifdef RTM_NEWMADDR case RTM_NEWMADDR: case RTM_DELMADDR: REQUIRE(struct ifma_msghdr); ifmam = (struct ifma_msghdr *)rtm; pmsg_addrs((char *)(ifmam + 1), ifmam->ifmam_addrs, msglen); break; #endif case RTM_IFANNOUNCE: REQUIRE(struct if_announcemsghdr); ifan = (struct if_announcemsghdr *)rtm; (void)printf("if# %d, what: ", ifan->ifan_index); switch (ifan->ifan_what) { case IFAN_ARRIVAL: (void)printf("arrival"); break; case IFAN_DEPARTURE: printf("departure"); break; default: printf("#%d", ifan->ifan_what); break; } printf("\n"); fflush(stdout); break; default: if (rtm->rtm_type <= RTM_RESOLVE) { printf("pid: %ld, seq %d, errno %d, flags:", (long)rtm->rtm_pid, rtm->rtm_seq, rtm->rtm_errno); printb(rtm->rtm_flags, routeflags); pmsg_common(rtm, msglen); } else printf("type: %u, len: %zu\n", rtm->rtm_type, msglen); } return; badlen: (void)printf(errfmt, __func__, msglen); #undef REQUIRE } static void print_getmsg(struct rt_msghdr *rtm, int msglen, int fib) { struct sockaddr *sp[RTAX_MAX]; struct timespec ts; char *cp; int i; memset(sp, 0, sizeof(sp)); (void)printf(" route to: %s\n", routename((struct sockaddr *)&so[RTAX_DST])); if (rtm->rtm_version != RTM_VERSION) { warnx("routing message version %d not understood", rtm->rtm_version); return; } if (rtm->rtm_msglen > msglen) { warnx("message length mismatch, in packet %d, returned %d", rtm->rtm_msglen, msglen); return; } if (rtm->rtm_errno) { errno = rtm->rtm_errno; warn("message indicates error %d", errno); return; } 
cp = ((char *)(rtm + 1)); for (i = 0; i < RTAX_MAX; i++) if (rtm->rtm_addrs & (1 << i)) { sp[i] = (struct sockaddr *)cp; cp += SA_SIZE((struct sockaddr *)cp); } if ((rtm->rtm_addrs & RTA_IFP) && (sp[RTAX_IFP]->sa_family != AF_LINK || ((struct sockaddr_dl *)(void *)sp[RTAX_IFP])->sdl_nlen == 0)) sp[RTAX_IFP] = NULL; if (sp[RTAX_DST]) (void)printf("destination: %s\n", routename(sp[RTAX_DST])); if (sp[RTAX_NETMASK]) (void)printf(" mask: %s\n", routename(sp[RTAX_NETMASK])); if (sp[RTAX_GATEWAY] && (rtm->rtm_flags & RTF_GATEWAY)) (void)printf(" gateway: %s\n", routename(sp[RTAX_GATEWAY])); if (fib >= 0) (void)printf(" fib: %u\n", (unsigned int)fib); if (sp[RTAX_IFP]) (void)printf(" interface: %.*s\n", ((struct sockaddr_dl *)(void *)sp[RTAX_IFP])->sdl_nlen, ((struct sockaddr_dl *)(void *)sp[RTAX_IFP])->sdl_data); (void)printf(" flags: "); printb(rtm->rtm_flags, routeflags); #define lock(f) ((rtm->rtm_rmx.rmx_locks & __CONCAT(RTV_,f)) ? 'L' : ' ') #define msec(u) (((u) + 500) / 1000) /* usec to msec */ printf("\n%9s %9s %9s %9s %9s %10s %9s\n", "recvpipe", "sendpipe", "ssthresh", "rtt,msec", "mtu ", "weight", "expire"); printf("%8lu%c ", rtm->rtm_rmx.rmx_recvpipe, lock(RPIPE)); printf("%8lu%c ", rtm->rtm_rmx.rmx_sendpipe, lock(SPIPE)); printf("%8lu%c ", rtm->rtm_rmx.rmx_ssthresh, lock(SSTHRESH)); printf("%8lu%c ", msec(rtm->rtm_rmx.rmx_rtt), lock(RTT)); printf("%8lu%c ", rtm->rtm_rmx.rmx_mtu, lock(MTU)); printf("%8lu%c ", rtm->rtm_rmx.rmx_weight, lock(WEIGHT)); if (rtm->rtm_rmx.rmx_expire > 0) clock_gettime(CLOCK_REALTIME_FAST, &ts); else ts.tv_sec = 0; printf("%8ld%c\n", (long)(rtm->rtm_rmx.rmx_expire - ts.tv_sec), lock(EXPIRE)); #undef lock #undef msec #define RTA_IGN (RTA_DST|RTA_GATEWAY|RTA_NETMASK|RTA_IFP|RTA_IFA|RTA_BRD) if (verbose) pmsg_common(rtm, msglen); else if (rtm->rtm_addrs &~ RTA_IGN) { (void)printf("sockaddrs: "); printb(rtm->rtm_addrs, addrnames); putchar('\n'); } #undef RTA_IGN } static void pmsg_common(struct rt_msghdr *rtm, size_t msglen) { 
(void)printf("\nlocks: "); printb(rtm->rtm_rmx.rmx_locks, metricnames); (void)printf(" inits: "); printb(rtm->rtm_inits, metricnames); if (msglen > sizeof(struct rt_msghdr)) pmsg_addrs(((char *)(rtm + 1)), rtm->rtm_addrs, msglen - sizeof(struct rt_msghdr)); else (void)fflush(stdout); } static void pmsg_addrs(char *cp, int addrs, size_t len) { struct sockaddr *sa; int i; if (addrs == 0) { (void)putchar('\n'); return; } (void)printf("\nsockaddrs: "); printb(addrs, addrnames); putchar('\n'); for (i = 0; i < RTAX_MAX; i++) if (addrs & (1 << i)) { sa = (struct sockaddr *)cp; if (len == 0 || len < SA_SIZE(sa)) { (void)printf(errfmt, __func__, len); break; } (void)printf(" %s", routename(sa)); len -= SA_SIZE(sa); cp += SA_SIZE(sa); } (void)putchar('\n'); (void)fflush(stdout); } void printb(int b, const char *str) { int i; int gotsome = 0; if (b == 0) return; while ((i = *str++) != 0) { if (b & (1 << (i-1))) { if (gotsome == 0) i = '<'; else i = ','; putchar(i); gotsome = 1; for (; (i = *str) > 32; str++) putchar(i); } else while (*str > 32) str++; } if (gotsome) putchar('>'); } int keyword(const char *cp) { const struct keytab *kt = keywords; while (kt->kt_cp != NULL && strcmp(kt->kt_cp, cp) != 0) kt++; return (kt->kt_i); } static void sodump(struct sockaddr *sa, const char *which) { #ifdef INET6 char nbuf[INET6_ADDRSTRLEN]; #endif switch (sa->sa_family) { case AF_LINK: (void)printf("%s: link %s; ", which, link_ntoa((struct sockaddr_dl *)(void *)sa)); break; #ifdef INET case AF_INET: (void)printf("%s: inet %s; ", which, inet_ntoa(((struct sockaddr_in *)(void *)sa)->sin_addr)); break; #endif #ifdef INET6 case AF_INET6: (void)printf("%s: inet6 %s; ", which, inet_ntop(sa->sa_family, &((struct sockaddr_in6 *)(void *)sa)->sin6_addr, nbuf, sizeof(nbuf))); break; #endif } (void)fflush(stdout); } /* States*/ #define VIRGIN 0 #define GOTONE 1 #define GOTTWO 2 /* Inputs */ #define DIGIT (4*0) #define END (4*1) #define DELIM (4*2) static void sockaddr(char *addr, struct sockaddr 
*sa, size_t size) { char *cp = (char *)sa; char *cplim = cp + size; int byte = 0, state = VIRGIN, new = 0 /* foil gcc */; memset(cp, 0, size); cp++; do { if ((*addr >= '0') && (*addr <= '9')) { new = *addr - '0'; } else if ((*addr >= 'a') && (*addr <= 'f')) { new = *addr - 'a' + 10; } else if ((*addr >= 'A') && (*addr <= 'F')) { new = *addr - 'A' + 10; } else if (*addr == '\0') state |= END; else state |= DELIM; addr++; switch (state /* | INPUT */) { case GOTTWO | DIGIT: *cp++ = byte; /*FALLTHROUGH*/ case VIRGIN | DIGIT: state = GOTONE; byte = new; continue; case GOTONE | DIGIT: state = GOTTWO; byte = new + (byte << 4); continue; default: /* | DELIM */ state = VIRGIN; *cp++ = byte; byte = 0; continue; case GOTONE | END: case GOTTWO | END: *cp++ = byte; /* FALLTHROUGH */ case VIRGIN | END: break; } break; } while (cp < cplim); sa->sa_len = cp - (char *)sa; } diff --git a/sys/net/route.h b/sys/net/route.h index 603e4fac5354..8c713d65ec95 100644 --- a/sys/net/route.h +++ b/sys/net/route.h @@ -1,453 +1,454 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _NET_ROUTE_H_ #define _NET_ROUTE_H_ #include /* * Kernel resident routing tables. * * The routing tables are initialized when interface addresses * are set by making entries for all directly connected interfaces. */ /* * Struct route consiste of a destination address, * a route entry pointer, link-layer prepend data pointer along * with its length. */ struct route { struct nhop_object *ro_nh; struct llentry *ro_lle; /* * ro_prepend and ro_plen are only used for bpf to pass in a * preformed header. They are not cacheable. 
*/ char *ro_prepend; uint16_t ro_plen; uint16_t ro_flags; uint16_t ro_mtu; /* saved ro_rt mtu */ uint16_t spare; struct sockaddr ro_dst; }; #define RT_L2_ME_BIT 2 /* dst L2 addr is our address */ #define RT_MAY_LOOP_BIT 3 /* dst may require loop copy */ #define RT_HAS_HEADER_BIT 4 /* mbuf already have its header prepended */ #define RT_L2_ME (1 << RT_L2_ME_BIT) /* 0x0004 */ #define RT_MAY_LOOP (1 << RT_MAY_LOOP_BIT) /* 0x0008 */ #define RT_HAS_HEADER (1 << RT_HAS_HEADER_BIT) /* 0x0010 */ #define RT_REJECT 0x0020 /* Destination is reject */ #define RT_BLACKHOLE 0x0040 /* Destination is blackhole */ #define RT_HAS_GW 0x0080 /* Destination has GW */ #define RT_LLE_CACHE 0x0100 /* Cache link layer */ struct rt_metrics { u_long rmx_locks; /* Kernel must leave these values alone */ u_long rmx_mtu; /* MTU for this path */ u_long rmx_hopcount; /* max hops expected */ u_long rmx_expire; /* lifetime for route, e.g. redirect */ u_long rmx_recvpipe; /* inbound delay-bandwidth product */ u_long rmx_sendpipe; /* outbound delay-bandwidth product */ u_long rmx_ssthresh; /* outbound gateway buffer limit */ u_long rmx_rtt; /* estimated round trip time */ u_long rmx_rttvar; /* estimated rtt variance */ u_long rmx_pksent; /* packets sent using this route */ u_long rmx_weight; /* route weight */ u_long rmx_nhidx; /* route nexhop index */ u_long rmx_filler[2]; /* will be used for T/TCP later */ }; /* * rmx_rtt and rmx_rttvar are stored as microseconds; */ #define RTM_RTTUNIT 1000000 /* units for rtt, rttvar, as units per sec */ /* lle state is exported in rmx_state rt_metrics field */ #define rmx_state rmx_weight /* default route weight */ #define RT_DEFAULT_WEIGHT 1 #define RT_MAX_WEIGHT 16777215 /* 3 bytes */ /* * Keep a generation count of routing table, incremented on route addition, * so we can invalidate caches. This is accessed without a lock, as precision * is not required. 
*/ typedef volatile u_int rt_gen_t; /* tree generation (for adds) */ #define RT_GEN(fibnum, af) rt_tables_get_gen(fibnum, af) #define RT_DEFAULT_FIB 0 /* Explicitly mark fib=0 restricted cases */ #define RT_ALL_FIBS -1 /* Announce event for every fib */ #ifdef _KERNEL VNET_DECLARE(uint32_t, _rt_numfibs); /* number of existing route tables */ #define V_rt_numfibs VNET(_rt_numfibs) /* temporary compat arg */ #define rt_numfibs V_rt_numfibs VNET_DECLARE(u_int, rt_add_addr_allfibs); /* Announce interfaces to all fibs */ #define V_rt_add_addr_allfibs VNET(rt_add_addr_allfibs) /* Calculate flowid for locally-originated packets */ #define V_fib_hash_outbound VNET(fib_hash_outbound) VNET_DECLARE(u_int, fib_hash_outbound); /* Outbound flowid generation rules */ #ifdef RSS #define fib4_calc_packet_hash xps_proto_software_hash_v4 #define fib6_calc_packet_hash xps_proto_software_hash_v6 #define CALC_FLOWID_OUTBOUND_SENDTO true #ifdef ROUTE_MPATH #define CALC_FLOWID_OUTBOUND V_fib_hash_outbound #else #define CALC_FLOWID_OUTBOUND false #endif #else /* !RSS */ #define fib4_calc_packet_hash fib4_calc_software_hash #define fib6_calc_packet_hash fib6_calc_software_hash #ifdef ROUTE_MPATH #define CALC_FLOWID_OUTBOUND_SENDTO V_fib_hash_outbound #define CALC_FLOWID_OUTBOUND V_fib_hash_outbound #else #define CALC_FLOWID_OUTBOUND_SENDTO false #define CALC_FLOWID_OUTBOUND false #endif #endif /* RSS */ #endif /* _KERNEL */ /* * We distinguish between routes to hosts and routes to networks, * preferring the former if available. For each route we infer * the interface to use from the gateway address supplied when * the route was entered. Routes that forward packets through * gateways are marked so that the output routines know to address the * gateway rather than the ultimate destination. 
*/ #define RTF_UP 0x1 /* route usable */ #define RTF_GATEWAY 0x2 /* destination is a gateway */ #define RTF_HOST 0x4 /* host entry (net otherwise) */ #define RTF_REJECT 0x8 /* host or net unreachable */ #define RTF_DYNAMIC 0x10 /* created dynamically (by redirect) */ #define RTF_MODIFIED 0x20 /* modified dynamically (by redirect) */ #define RTF_DONE 0x40 /* message confirmed */ /* 0x80 unused, was RTF_DELCLONE */ /* 0x100 unused, was RTF_CLONING */ #define RTF_XRESOLVE 0x200 /* external daemon resolves name */ #define RTF_LLINFO 0x400 /* DEPRECATED - exists ONLY for backward compatibility */ #define RTF_LLDATA 0x400 /* used by apps to add/del L2 entries */ #define RTF_STATIC 0x800 /* manually added */ #define RTF_BLACKHOLE 0x1000 /* just discard pkts (during updates) */ #define RTF_PROTO2 0x4000 /* protocol specific routing flag */ #define RTF_PROTO1 0x8000 /* protocol specific routing flag */ /* 0x10000 unused, was RTF_PRCLONING */ /* 0x20000 unused, was RTF_WASCLONED */ #define RTF_PROTO3 0x40000 /* protocol specific routing flag */ #define RTF_FIXEDMTU 0x80000 /* MTU was explicitly specified */ #define RTF_PINNED 0x100000 /* route is immutable */ #define RTF_LOCAL 0x200000 /* route represents a local address */ #define RTF_BROADCAST 0x400000 /* route represents a bcast address */ #define RTF_MULTICAST 0x800000 /* route represents a mcast address */ /* 0x8000000 and up unassigned */ #define RTF_STICKY 0x10000000 /* always route dst->src */ /* 0x40000000 unused, was RTF_RNH_LOCKED */ #define RTF_GWFLAG_COMPAT 0x80000000 /* a compatibility bit for interacting with existing routing apps */ /* Mask of RTF flags that are allowed to be modified by RTM_CHANGE. */ #define RTF_FMASK \ (RTF_PROTO1 | RTF_PROTO2 | RTF_PROTO3 | RTF_BLACKHOLE | \ RTF_REJECT | RTF_STATIC | RTF_STICKY) /* * fib_ nexthop API flags. 
*/ /* Consumer-visible nexthop info flags */ #define NHF_MULTIPATH 0x0008 /* Nexhop is a nexthop group */ #define NHF_REJECT 0x0010 /* RTF_REJECT */ #define NHF_BLACKHOLE 0x0020 /* RTF_BLACKHOLE */ #define NHF_REDIRECT 0x0040 /* RTF_DYNAMIC|RTF_MODIFIED */ #define NHF_DEFAULT 0x0080 /* Default route */ #define NHF_BROADCAST 0x0100 /* RTF_BROADCAST */ #define NHF_GATEWAY 0x0200 /* RTF_GATEWAY */ #define NHF_HOST 0x0400 /* RTF_HOST */ /* Nexthop request flags */ #define NHR_NONE 0x00 /* empty flags field */ #define NHR_REF 0x01 /* reference nexhop */ #define NHR_NODEFAULT 0x02 /* uRPF: do not consider default route */ /* Control plane route request flags */ #define NHR_COPY 0x100 /* Copy rte data */ #define NHR_UNLOCKED 0x200 /* Do not lock table */ /* * Routing statistics. */ struct rtstat { uint64_t rts_badredirect; /* bogus redirect calls */ uint64_t rts_dynamic; /* routes created by redirects */ uint64_t rts_newgateway; /* routes modified by redirects */ uint64_t rts_unreach; /* lookups which failed */ uint64_t rts_wildcard; /* lookups satisfied by a wildcard */ uint64_t rts_nh_idx_alloc_failure; /* nexthop index alloc failure*/ uint64_t rts_nh_alloc_failure; /* nexthop allocation failure*/ uint64_t rts_add_failure; /* # of route addition failures */ uint64_t rts_add_retry; /* # of route addition retries */ uint64_t rts_del_failure; /* # of route deletion failure */ uint64_t rts_del_retry; /* # of route deletion retries */ uint64_t rts_spare[5]; }; /* * Structures for routing messages. */ struct rt_msghdr { u_short rtm_msglen; /* to skip over non-understood messages */ u_char rtm_version; /* future binary compatibility */ u_char rtm_type; /* message type */ u_short rtm_index; /* index for associated ifp */ u_short _rtm_spare1; int rtm_flags; /* flags, incl. kern & message, e.g. 
DONE */ int rtm_addrs; /* bitmask identifying sockaddrs in msg */ pid_t rtm_pid; /* identify sender */ int rtm_seq; /* for sender to identify action */ int rtm_errno; /* why failed */ int rtm_fmask; /* bitmask used in RTM_CHANGE message */ u_long rtm_inits; /* which metrics we are initializing */ struct rt_metrics rtm_rmx; /* metrics themselves */ }; #define RTM_VERSION 5 /* Up the ante and ignore older versions */ #ifndef NETLINK_COMPAT /* * Message types. * * The format for each message is annotated below using the following * identifiers: * * (1) struct rt_msghdr * (2) struct ifa_msghdr * (3) struct if_msghdr * (4) struct ifma_msghdr * (5) struct if_announcemsghdr * */ #define RTM_ADD 0x1 /* (1) Add Route */ #define RTM_DELETE 0x2 /* (1) Delete Route */ #define RTM_CHANGE 0x3 /* (1) Change Metrics or flags */ #define RTM_GET 0x4 /* (1) Report Metrics */ #define RTM_LOSING 0x5 /* (1) Kernel Suspects Partitioning */ #define RTM_REDIRECT 0x6 /* (1) Told to use different route */ #define RTM_MISS 0x7 /* (1) Lookup failed on this address */ #define RTM_LOCK 0x8 /* (1) fix specified metrics */ /* 0x9 */ /* 0xa */ #define RTM_RESOLVE 0xb /* (1) req to resolve dst to LL addr */ #define RTM_NEWADDR 0xc /* (2) address being added to iface */ #define RTM_DELADDR 0xd /* (2) address being removed from iface */ #define RTM_IFINFO 0xe /* (3) iface going up/down etc. */ #define RTM_NEWMADDR 0xf /* (4) mcast group membership being added to if */ #define RTM_DELMADDR 0x10 /* (4) mcast group membership being deleted */ #define RTM_IFANNOUNCE 0x11 /* (5) iface arrival/departure */ #define RTM_IEEE80211 0x12 /* (5) IEEE80211 wireless event */ +#define RTM_IPFWLOG 0x13 /* (1) IPFW rule match log event */ #endif /* NETLINK_COMPAT*/ /* * Bitmask values for rtm_inits and rmx_locks. 
*/ #define RTV_MTU 0x1 /* init or lock _mtu */ #define RTV_HOPCOUNT 0x2 /* init or lock _hopcount */ #define RTV_EXPIRE 0x4 /* init or lock _expire */ #define RTV_RPIPE 0x8 /* init or lock _recvpipe */ #define RTV_SPIPE 0x10 /* init or lock _sendpipe */ #define RTV_SSTHRESH 0x20 /* init or lock _ssthresh */ #define RTV_RTT 0x40 /* init or lock _rtt */ #define RTV_RTTVAR 0x80 /* init or lock _rttvar */ #define RTV_WEIGHT 0x100 /* init or lock _weight */ #ifndef NETLINK_COMPAT /* * Bitmask values for rtm_addrs. */ #define RTA_DST 0x1 /* destination sockaddr present */ #define RTA_GATEWAY 0x2 /* gateway sockaddr present */ #define RTA_NETMASK 0x4 /* netmask sockaddr present */ #define RTA_GENMASK 0x8 /* cloning mask sockaddr present */ #define RTA_IFP 0x10 /* interface name sockaddr present */ #define RTA_IFA 0x20 /* interface addr sockaddr present */ #define RTA_AUTHOR 0x40 /* sockaddr for author of redirect */ #define RTA_BRD 0x80 /* for NEWADDR, broadcast or p-p dest addr */ #endif /* NETLINK_COMPAT*/ /* * Index offsets for sockaddr array for alternate internal encoding. 
*/ #define RTAX_DST 0 /* destination sockaddr present */ #define RTAX_GATEWAY 1 /* gateway sockaddr present */ #define RTAX_NETMASK 2 /* netmask sockaddr present */ #define RTAX_GENMASK 3 /* cloning mask sockaddr present */ #define RTAX_IFP 4 /* interface name sockaddr present */ #define RTAX_IFA 5 /* interface addr sockaddr present */ #define RTAX_AUTHOR 6 /* sockaddr for author of redirect */ #define RTAX_BRD 7 /* for NEWADDR, broadcast or p-p dest addr */ #define RTAX_MAX 8 /* size of array to allocate */ struct rtentry; struct nhop_object; typedef int rib_filter_f_t(const struct rtentry *, const struct nhop_object *, void *); struct rt_addrinfo { int rti_addrs; /* Route RTF_ flags */ int rti_flags; /* Route RTF_ flags */ struct sockaddr *rti_info[RTAX_MAX]; /* Sockaddr data */ struct ifaddr *rti_ifa; /* value of rt_ifa addr */ struct ifnet *rti_ifp; /* route interface */ rib_filter_f_t *rti_filter; /* filter function */ void *rti_filterdata; /* filter parameters */ u_long rti_mflags; /* metrics RTV_ flags */ u_long rti_spare; /* Will be used for fib */ struct rt_metrics *rti_rmx; /* Pointer to route metrics */ }; /* * This macro returns the size of a struct sockaddr when passed * through a routing socket. Basically we round up sa_len to * a multiple of sizeof(long), with a minimum of sizeof(long). * The case sa_len == 0 should only apply to empty structures. */ #define SA_SIZE(sa) \ ( (((struct sockaddr *)(sa))->sa_len == 0) ? 
\ sizeof(long) : \ 1 + ( (((struct sockaddr *)(sa))->sa_len - 1) | (sizeof(long) - 1) ) ) #define sa_equal(a, b) ( \ (((const struct sockaddr *)(a))->sa_len == ((const struct sockaddr *)(b))->sa_len) && \ (bcmp((a), (b), ((const struct sockaddr *)(b))->sa_len) == 0)) #ifdef _KERNEL #define RT_LINK_IS_UP(ifp) (!((ifp)->if_capabilities & IFCAP_LINKSTATE) \ || (ifp)->if_link_state == LINK_STATE_UP) #define RO_NHFREE(_ro) do { \ if ((_ro)->ro_nh) { \ NH_FREE((_ro)->ro_nh); \ (_ro)->ro_nh = NULL; \ } \ } while (0) #define RO_INVALIDATE_CACHE(ro) do { \ if ((ro)->ro_lle != NULL) { \ LLE_FREE((ro)->ro_lle); \ (ro)->ro_lle = NULL; \ } \ if ((ro)->ro_nh != NULL) { \ NH_FREE((ro)->ro_nh); \ (ro)->ro_nh = NULL; \ } \ } while (0) #define RO_GET_FAMILY(ro, dst) ((ro) != NULL && \ (ro)->ro_flags & RT_HAS_GW \ ? (ro)->ro_dst.sa_family : (dst)->sa_family) /* * Validate a cached route based on a supplied cookie. If there is an * out-of-date cache, simply free it. Update the generation number * for the new allocation */ #define NH_VALIDATE(ro, cookiep, fibnum) do { \ rt_gen_t cookie = RT_GEN(fibnum, (ro)->ro_dst.sa_family); \ if (*(cookiep) != cookie) { \ RO_INVALIDATE_CACHE(ro); \ *(cookiep) = cookie; \ } \ } while (0) struct ifmultiaddr; struct rib_head; void rt_ieee80211msg(struct ifnet *, int, void *, size_t); void rt_ifmsg(struct ifnet *, int); void rt_missmsg(int, struct rt_addrinfo *, int, int); void rt_missmsg_fib(int, struct rt_addrinfo *, int, int, int); int rt_addrmsg(int, struct ifaddr *, int); int rt_routemsg(int, struct rtentry *, struct nhop_object *, int); int rt_routemsg_info(int, struct rt_addrinfo *, int); void rt_newmaddrmsg(int, struct ifmultiaddr *); void rt_maskedcopy(const struct sockaddr *, struct sockaddr *, const struct sockaddr *); struct rib_head *rt_table_init(int, int, u_int); void rt_table_destroy(struct rib_head *); u_int rt_tables_get_gen(uint32_t table, sa_family_t family); struct sockaddr *rtsock_fix_netmask(const struct sockaddr *dst, const 
struct sockaddr *smask, struct sockaddr_storage *dmask); void rt_updatemtu(struct ifnet *); void rt_flushifroutes(struct ifnet *ifp); /* XXX MRT NEW VERSIONS THAT USE FIBs * For now the protocol indepedent versions are the same as the AF_INET ones * but this will change.. */ int rtioctl_fib(u_long, caddr_t, u_int); /* New API */ void rib_flush_routes_family(int family); struct nhop_object *rib_lookup(uint32_t fibnum, const struct sockaddr *dst, uint32_t flags, uint32_t flowid); const char *rib_print_family(int family); #endif #endif diff --git a/sys/netinet/ip_fw.h b/sys/netinet/ip_fw.h index 937dc8fbbbc2..c440223b81f8 100644 --- a/sys/netinet/ip_fw.h +++ b/sys/netinet/ip_fw.h @@ -1,1097 +1,1172 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _IPFW2_H #define _IPFW2_H /* * The default rule number. By the design of ip_fw, the default rule * is the last one, so its number can also serve as the highest number * allowed for a rule. The ip_fw code relies on both meanings of this * constant. */ #define IPFW_DEFAULT_RULE 65535 #define RESVD_SET 31 /*set for default and persistent rules*/ #define IPFW_MAX_SETS 32 /* Number of sets supported by ipfw*/ /* * Compat values for old clients */ #ifndef _KERNEL #define IPFW_TABLES_MAX 65535 #define IPFW_TABLES_DEFAULT 128 #endif /* * Most commands (queue, pipe, tag, untag, limit...) can have a 16-bit * argument between 1 and 65534. The value 0 (IP_FW_TARG) is used * to represent 'tablearg' value, e.g. indicate the use of a 'tablearg' * result of the most recent table() lookup. * Note that 16bit is only a historical limit, resulting from * the use of a 16-bit fields for that value. In reality, we can have * 2^32 pipes, queues, tag values and so on. */ #define IPFW_ARG_MIN 1 #define IPFW_ARG_MAX 65534 #define IP_FW_TABLEARG 65535 /* Compat value for old clients */ #define IP_FW_TARG 0 /* Current tablearg value */ #define IP_FW_NAT44_GLOBAL 65535 /* arg1 value for "nat global" */ /* * Number of entries in the call stack of the call/return commands. * Call stack currently is an uint16_t array with rule numbers. 
*/ #define IPFW_CALLSTACK_SIZE 16 /* IP_FW3 header/opcodes */ typedef struct _ip_fw3_opheader { uint16_t opcode; /* Operation opcode */ uint16_t version; /* Opcode version */ uint16_t reserved[2]; /* Align to 64-bit boundary */ } ip_fw3_opheader; +#define IP_FW3_OPVER_0 0 +#define IP_FW3_OPVER_1 1 /* 32bit rulenum */ +#define IP_FW3_OPVER IP_FW3_OPVER_1 + /* IP_FW3 opcodes */ #define IP_FW_TABLE_XADD 86 /* add entry */ #define IP_FW_TABLE_XDEL 87 /* delete entry */ #define IP_FW_TABLE_XGETSIZE 88 /* get table size (deprecated) */ #define IP_FW_TABLE_XLIST 89 /* list table contents */ #define IP_FW_TABLE_XDESTROY 90 /* destroy table */ #define IP_FW_TABLES_XLIST 92 /* list all tables */ #define IP_FW_TABLE_XINFO 93 /* request info for one table */ #define IP_FW_TABLE_XFLUSH 94 /* flush table data */ #define IP_FW_TABLE_XCREATE 95 /* create new table */ #define IP_FW_TABLE_XMODIFY 96 /* modify existing table */ #define IP_FW_XGET 97 /* Retrieve configuration */ #define IP_FW_XADD 98 /* add rule */ #define IP_FW_XDEL 99 /* del rule */ #define IP_FW_XMOVE 100 /* move rules to different set */ #define IP_FW_XZERO 101 /* clear accounting */ #define IP_FW_XRESETLOG 102 /* zero rules logs */ #define IP_FW_SET_SWAP 103 /* Swap between 2 sets */ #define IP_FW_SET_MOVE 104 /* Move one set to another one */ #define IP_FW_SET_ENABLE 105 /* Enable/disable sets */ #define IP_FW_TABLE_XFIND 106 /* finds an entry */ #define IP_FW_XIFLIST 107 /* list tracked interfaces */ #define IP_FW_TABLES_ALIST 108 /* list table algorithms */ #define IP_FW_TABLE_XSWAP 109 /* swap two tables */ #define IP_FW_TABLE_VLIST 110 /* dump table value hash */ #define IP_FW_NAT44_XCONFIG 111 /* Create/modify NAT44 instance */ #define IP_FW_NAT44_DESTROY 112 /* Destroys NAT44 instance */ #define IP_FW_NAT44_XGETCONFIG 113 /* Get NAT44 instance config */ #define IP_FW_NAT44_LIST_NAT 114 /* List all NAT44 instances */ #define IP_FW_NAT44_XGETLOG 115 /* Get log from NAT44 instance */ #define 
IP_FW_DUMP_SOPTCODES 116 /* Dump available sopts/versions */ #define IP_FW_DUMP_SRVOBJECTS 117 /* Dump existing named objects */ +#define IP_FW_SKIPTO_CACHE 118 /* Manage skipto cache */ #define IP_FW_NAT64STL_CREATE 130 /* Create stateless NAT64 instance */ #define IP_FW_NAT64STL_DESTROY 131 /* Destroy stateless NAT64 instance */ #define IP_FW_NAT64STL_CONFIG 132 /* Modify stateless NAT64 instance */ #define IP_FW_NAT64STL_LIST 133 /* List stateless NAT64 instances */ #define IP_FW_NAT64STL_STATS 134 /* Get NAT64STL instance statistics */ #define IP_FW_NAT64STL_RESET_STATS 135 /* Reset NAT64STL instance statistics */ #define IP_FW_NAT64LSN_CREATE 140 /* Create stateful NAT64 instance */ #define IP_FW_NAT64LSN_DESTROY 141 /* Destroy stateful NAT64 instance */ #define IP_FW_NAT64LSN_CONFIG 142 /* Modify stateful NAT64 instance */ #define IP_FW_NAT64LSN_LIST 143 /* List stateful NAT64 instances */ #define IP_FW_NAT64LSN_STATS 144 /* Get NAT64LSN instance statistics */ #define IP_FW_NAT64LSN_LIST_STATES 145 /* Get stateful NAT64 states */ #define IP_FW_NAT64LSN_RESET_STATS 146 /* Reset NAT64LSN instance statistics */ #define IP_FW_NPTV6_CREATE 150 /* Create NPTv6 instance */ #define IP_FW_NPTV6_DESTROY 151 /* Destroy NPTv6 instance */ #define IP_FW_NPTV6_CONFIG 152 /* Modify NPTv6 instance */ #define IP_FW_NPTV6_LIST 153 /* List NPTv6 instances */ #define IP_FW_NPTV6_STATS 154 /* Get NPTv6 instance statistics */ #define IP_FW_NPTV6_RESET_STATS 155 /* Reset NPTv6 instance statistics */ #define IP_FW_NAT64CLAT_CREATE 160 /* Create clat NAT64 instance */ #define IP_FW_NAT64CLAT_DESTROY 161 /* Destroy clat NAT64 instance */ #define IP_FW_NAT64CLAT_CONFIG 162 /* Modify clat NAT64 instance */ #define IP_FW_NAT64CLAT_LIST 163 /* List clat NAT64 instances */ #define IP_FW_NAT64CLAT_STATS 164 /* Get NAT64CLAT instance statistics */ #define IP_FW_NAT64CLAT_RESET_STATS 165 /* Reset NAT64CLAT instance statistics */ /* * The kernel representation of ipfw rules is made of a list of 
* 'instructions' (for all practical purposes equivalent to BPF * instructions), which specify which fields of the packet * (or its metadata) should be analysed. * * Each instruction is stored in a structure which begins with * "ipfw_insn", and can contain extra fields depending on the * instruction type (listed below). * Note that the code is written so that individual instructions * have a size which is a multiple of 32 bits. This means that, if * such structures contain pointers or other 64-bit entities, * (there is just one instance now) they may end up unaligned on * 64-bit architectures, so the must be handled with care. * * "enum ipfw_opcodes" are the opcodes supported. We can have up * to 256 different opcodes. When adding new opcodes, they should * be appended to the end of the opcode list before O_LAST_OPCODE, * this will prevent the ABI from being broken, otherwise users * will have to recompile ipfw(8) when they update the kernel. */ enum ipfw_opcodes { /* arguments (4 byte each) */ O_NOP, O_IP_SRC, /* u32 = IP */ O_IP_SRC_MASK, /* ip = IP/mask */ O_IP_SRC_ME, /* none */ O_IP_SRC_SET, /* u32=base, arg1=len, bitmap */ O_IP_DST, /* u32 = IP */ O_IP_DST_MASK, /* ip = IP/mask */ O_IP_DST_ME, /* none */ O_IP_DST_SET, /* u32=base, arg1=len, bitmap */ O_IP_SRCPORT, /* (n)port list:mask 4 byte ea */ O_IP_DSTPORT, /* (n)port list:mask 4 byte ea */ O_PROTO, /* arg1=protocol */ O_MACADDR2, /* 2 mac addr:mask */ O_MAC_TYPE, /* same as srcport */ O_LAYER2, /* none */ O_IN, /* none */ O_FRAG, /* none */ O_RECV, /* none */ O_XMIT, /* none */ O_VIA, /* none */ O_IPOPT, /* arg1 = 2*u8 bitmap */ O_IPLEN, /* arg1 = len */ O_IPID, /* arg1 = id */ O_IPTOS, /* arg1 = id */ O_IPPRECEDENCE, /* arg1 = precedence << 5 */ O_IPTTL, /* arg1 = TTL */ O_IPVER, /* arg1 = version */ O_UID, /* u32 = id */ O_GID, /* u32 = id */ O_ESTAB, /* none (tcp established) */ O_TCPFLAGS, /* arg1 = 2*u8 bitmap */ O_TCPWIN, /* arg1 = desired win */ O_TCPSEQ, /* u32 = desired seq. 
*/ O_TCPACK, /* u32 = desired seq. */ O_ICMPTYPE, /* u32 = icmp bitmap */ O_TCPOPTS, /* arg1 = 2*u8 bitmap */ O_VERREVPATH, /* none */ O_VERSRCREACH, /* none */ - O_PROBE_STATE, /* none */ - O_KEEP_STATE, /* none */ + O_PROBE_STATE, /* v0:arg1=kidx, v1:kidx=kidx */ + O_KEEP_STATE, /* v0:arg1=kidx, v1:kidx=kidx */ O_LIMIT, /* ipfw_insn_limit */ O_LIMIT_PARENT, /* dyn_type, not an opcode. */ /* * These are really 'actions'. */ O_LOG, /* ipfw_insn_log */ O_PROB, /* u32 = match probability */ - O_CHECK_STATE, /* none */ + O_CHECK_STATE, /* v0:arg1=kidx, v1:kidx=kidx */ O_ACCEPT, /* none */ O_DENY, /* none */ O_REJECT, /* arg1=icmp arg (same as deny) */ O_COUNT, /* none */ - O_SKIPTO, /* arg1=next rule number */ + O_SKIPTO, /* v0:arg1=next rule number */ + /* v1:kidx= next rule number */ O_PIPE, /* arg1=pipe number */ O_QUEUE, /* arg1=queue number */ O_DIVERT, /* arg1=port number */ O_TEE, /* arg1=port number */ O_FORWARD_IP, /* fwd sockaddr */ O_FORWARD_MAC, /* fwd mac */ O_NAT, /* nope */ O_REASS, /* none */ /* * More opcodes. */ O_IPSEC, /* has ipsec history */ - O_IP_SRC_LOOKUP, /* arg1=table number, u32=value */ + O_IP_SRC_LOOKUP, /* v0:arg1=table number, u32=value */ + /* v1:kidx=name, u32=value, arg1=key */ O_IP_DST_LOOKUP, /* arg1=table number, u32=value */ + /* v1:kidx=name, u32=value, arg1=key */ O_ANTISPOOF, /* none */ O_JAIL, /* u32 = id */ O_ALTQ, /* u32 = altq classif. 
qid */ O_DIVERTED, /* arg1=bitmap (1:loop, 2:out) */ O_TCPDATALEN, /* arg1 = tcp data len */ O_IP6_SRC, /* address without mask */ O_IP6_SRC_ME, /* my addresses */ O_IP6_SRC_MASK, /* address with the mask */ O_IP6_DST, O_IP6_DST_ME, O_IP6_DST_MASK, O_FLOW6ID, /* for flow id tag in the ipv6 pkt */ O_ICMP6TYPE, /* icmp6 packet type filtering */ O_EXT_HDR, /* filtering for ipv6 extension header */ O_IP6, /* * actions for ng_ipfw */ O_NETGRAPH, /* send to ng_ipfw */ O_NGTEE, /* copy to ng_ipfw */ O_IP4, O_UNREACH6, /* arg1=icmpv6 code arg (deny) */ O_TAG, /* arg1=tag number */ O_TAGGED, /* arg1=tag number */ O_SETFIB, /* arg1=FIB number */ O_FIB, /* arg1=FIB desired fib number */ O_SOCKARG, /* socket argument */ - O_CALLRETURN, /* arg1=called rule number */ + O_CALLRETURN, /* v0:arg1=called rule number */ + /* v1:kidx=called rule number */ O_FORWARD_IP6, /* fwd sockaddr_in6 */ O_DSCP, /* 2 u32 = DSCP mask */ O_SETDSCP, /* arg1=DSCP value */ - O_IP_FLOW_LOOKUP, /* arg1=table number, u32=value */ + O_IP_FLOW_LOOKUP, /* v0:arg1=table number, u32=value */ + /* v1:kidx=name, u32=value */ - O_EXTERNAL_ACTION, /* arg1=id of external action handler */ - O_EXTERNAL_INSTANCE, /* arg1=id of eaction handler instance */ + O_EXTERNAL_ACTION, /* v0:arg1=id of external action handler */ + /* v1:kidx=id of external action handler */ + O_EXTERNAL_INSTANCE, /* v0:arg1=id of eaction handler instance */ + /* v1:kidx=id of eaction handler instance */ O_EXTERNAL_DATA, /* variable length data */ O_SKIP_ACTION, /* none */ O_TCPMSS, /* arg1=MSS value */ - O_MAC_SRC_LOOKUP, /* arg1=table number, u32=value */ - O_MAC_DST_LOOKUP, /* arg1=table number, u32=value */ + O_MAC_SRC_LOOKUP, /* kidx=name, u32=value, arg1=key */ + O_MAC_DST_LOOKUP, /* kidx=name, u32=value, arg1=key */ O_SETMARK, /* u32 = value */ O_MARK, /* 2 u32 = value, bitmask */ O_LAST_OPCODE /* not an opcode! 
*/ }; -/* - * Defines key types used by lookup instruction - */ -enum ipfw_table_lookup_type { - LOOKUP_DST_IP, - LOOKUP_SRC_IP, - LOOKUP_DST_PORT, - LOOKUP_SRC_PORT, - LOOKUP_UID, - LOOKUP_JAIL, - LOOKUP_DSCP, - LOOKUP_DST_MAC, - LOOKUP_SRC_MAC, - LOOKUP_MARK, -}; - /* * The extension header are filtered only for presence using a bit * vector with a flag for each header. */ #define EXT_FRAGMENT 0x1 #define EXT_HOPOPTS 0x2 #define EXT_ROUTING 0x4 #define EXT_AH 0x8 #define EXT_ESP 0x10 #define EXT_DSTOPTS 0x20 #define EXT_RTHDR0 0x40 #define EXT_RTHDR2 0x80 /* * Template for instructions. * * ipfw_insn is used for all instructions which require no operands, * a single 16-bit value (arg1), or a couple of 8-bit values. * * For other instructions which require different/larger arguments * we have derived structures, ipfw_insn_*. * * The size of the instruction (in 32-bit words) is in the low * 6 bits of "len". The 2 remaining bits are used to implement * NOT and OR on individual instructions. Given a type, you can * compute the length to be put in "len" using F_INSN_SIZE(t) * * F_NOT negates the match result of the instruction. * * F_OR is used to build or blocks. By default, instructions * are evaluated as part of a logical AND. An "or" block * { X or Y or Z } contains F_OR set in all but the last * instruction of the block. A match will cause the code * to skip past the last instruction of the block. * * NOTA BENE: in a couple of places we assume that * sizeof(ipfw_insn) == sizeof(u_int32_t) * this needs to be fixed. * */ typedef struct _ipfw_insn { /* template for instructions */ _Alignas(_Alignof(u_int32_t)) u_int8_t opcode; u_int8_t len; /* number of 32-bit words */ #define F_NOT 0x80 #define F_OR 0x40 #define F_LEN_MASK 0x3f #define F_LEN(cmd) ((cmd)->len & F_LEN_MASK) u_int16_t arg1; } ipfw_insn; /* * The F_INSN_SIZE(type) computes the size, in 4-byte words, of * a given type. 
*/ #define F_INSN_SIZE(t) ((sizeof (t))/sizeof(u_int32_t)) /* * This is used to store an array of 16-bit entries (ports etc.) */ typedef struct _ipfw_insn_u16 { ipfw_insn o; u_int16_t ports[2]; /* there may be more */ } ipfw_insn_u16; /* * This is used to store an array of 32-bit entries * (uid, single IPv4 addresses etc.) */ typedef struct _ipfw_insn_u32 { ipfw_insn o; u_int32_t d[1]; /* one or more */ } ipfw_insn_u32; +typedef struct _ipfw_insn_kidx { + ipfw_insn o; + uint32_t kidx; +} ipfw_insn_kidx; + /* * This is used to store IP addr-mask pairs. */ typedef struct _ipfw_insn_ip { ipfw_insn o; struct in_addr addr; struct in_addr mask; } ipfw_insn_ip; +typedef struct _ipfw_insn_table { + ipfw_insn o; /* arg1 is optional lookup key */ + uint32_t kidx; /* table name index */ + uint32_t value; /* table value */ +} ipfw_insn_table; + +#define IPFW_LOOKUP_TYPE_MASK 0x00FF +#define IPFW_LOOKUP_TYPE(insn) ((insn)->arg1 & IPFW_LOOKUP_TYPE_MASK) +#define IPFW_SET_LOOKUP_TYPE(insn, type) do { \ + (insn)->arg1 &= ~IPFW_LOOKUP_TYPE_MASK; \ + (insn)->arg1 |= (type) & IPFW_LOOKUP_TYPE_MASK; \ +} while (0) + +/* + * Defines key types used by lookup instruction + */ +enum ipfw_table_lookup_type { + LOOKUP_NONE = 0, + LOOKUP_DST_IP, + LOOKUP_SRC_IP, + LOOKUP_DST_PORT, + LOOKUP_SRC_PORT, + LOOKUP_UID, + LOOKUP_JAIL, + LOOKUP_DSCP, + LOOKUP_DST_MAC, + LOOKUP_SRC_MAC, + LOOKUP_MARK, + LOOKUP_RULENUM, +}; + +enum ipfw_return_type { + RETURN_NEXT_RULENUM = 0, + RETURN_NEXT_RULE, +}; + +enum ipfw_skipto_cache_op { + SKIPTO_CACHE_DISABLE = 0, + SKIPTO_CACHE_ENABLE, +}; + /* * This is used to forward to a given address (ip). */ typedef struct _ipfw_insn_sa { ipfw_insn o; struct sockaddr_in sa; } ipfw_insn_sa; /* * This is used to forward to a given address (ipv6). */ typedef struct _ipfw_insn_sa6 { ipfw_insn o; struct sockaddr_in6 sa; } ipfw_insn_sa6; /* * This is used for MAC addr-mask pairs. 
*/ typedef struct _ipfw_insn_mac { ipfw_insn o; u_char addr[12]; /* dst[6] + src[6] */ u_char mask[12]; /* dst[6] + src[6] */ } ipfw_insn_mac; /* * This is used for interface match rules (recv xx, xmit xx). */ typedef struct _ipfw_insn_if { ipfw_insn o; union { struct in_addr ip; int glob; - uint16_t kidx; + uint16_t kidx_v0; + uint32_t kidx; } p; char name[IFNAMSIZ]; } ipfw_insn_if; /* * This is used for storing an altq queue id number. */ typedef struct _ipfw_insn_altq { ipfw_insn o; u_int32_t qid; } ipfw_insn_altq; /* * This is used for limit rules. */ typedef struct _ipfw_insn_limit { ipfw_insn o; + u_int32_t kidx; u_int8_t _pad; u_int8_t limit_mask; /* combination of DYN_* below */ #define DYN_SRC_ADDR 0x1 #define DYN_SRC_PORT 0x2 #define DYN_DST_ADDR 0x4 #define DYN_DST_PORT 0x8 u_int16_t conn_limit; } ipfw_insn_limit; +/* MAC/InfiniBand/etc address length */ +#define IPFW_MAX_L2_ADDR_LEN 20 + /* * This is used for log instructions. */ typedef struct _ipfw_insn_log { ipfw_insn o; u_int32_t max_log; /* how many do we log -- 0 = all */ u_int32_t log_left; /* how many left to log */ } ipfw_insn_log; +/* ipfw_insn_log->o.arg1 bitmasks */ +#define IPFW_LOG_DEFAULT 0x0000 +#define IPFW_LOG_SYSLOG (1 << 15) +#define IPFW_LOG_IPFW0 (1 << 14) +#define IPFW_LOG_RTSOCK (1 << 13) + +typedef struct _ipfwlog_rtsock_hdr_v2 { + uint32_t rulenum; + uint32_t tablearg; + ipfw_insn cmd; + u_char ether_shost[IPFW_MAX_L2_ADDR_LEN]; + u_char ether_dhost[IPFW_MAX_L2_ADDR_LEN]; + uint32_t mark; + char comment[0]; +} ipfwlog_rtsock_hdr_v2; + /* Legacy NAT structures, compat only */ #ifndef _KERNEL /* * Data structures required by both ipfw(8) and ipfw(4) but not part of the * management API are protected by IPFW_INTERNAL. */ #ifdef IPFW_INTERNAL /* Server pool support (LSNAT). */ struct cfg_spool { LIST_ENTRY(cfg_spool) _next; /* chain of spool instances */ struct in_addr addr; u_short port; }; #endif /* Redirect modes id. 
*/ #define REDIR_ADDR 0x01 #define REDIR_PORT 0x02 #define REDIR_PROTO 0x04 #ifdef IPFW_INTERNAL /* Nat redirect configuration. */ struct cfg_redir { LIST_ENTRY(cfg_redir) _next; /* chain of redir instances */ u_int16_t mode; /* type of redirect mode */ struct in_addr laddr; /* local ip address */ struct in_addr paddr; /* public ip address */ struct in_addr raddr; /* remote ip address */ u_short lport; /* local port */ u_short pport; /* public port */ u_short rport; /* remote port */ u_short pport_cnt; /* number of public ports */ u_short rport_cnt; /* number of remote ports */ int proto; /* protocol: tcp/udp */ struct alias_link **alink; /* num of entry in spool chain */ u_int16_t spool_cnt; /* chain of spool instances */ LIST_HEAD(spool_chain, cfg_spool) spool_chain; }; #endif #ifdef IPFW_INTERNAL /* Nat configuration data struct. */ struct cfg_nat { /* chain of nat instances */ LIST_ENTRY(cfg_nat) _next; int id; /* nat id */ struct in_addr ip; /* nat ip address */ char if_name[IF_NAMESIZE]; /* interface name */ int mode; /* aliasing mode */ struct libalias *lib; /* libalias instance */ /* number of entry in spool chain */ int redir_cnt; /* chain of redir instances */ LIST_HEAD(redir_chain, cfg_redir) redir_chain; }; #endif #define SOF_NAT sizeof(struct cfg_nat) #define SOF_REDIR sizeof(struct cfg_redir) #define SOF_SPOOL sizeof(struct cfg_spool) #endif /* ifndef _KERNEL */ struct nat44_cfg_spool { struct in_addr addr; uint16_t port; uint16_t spare; }; #define NAT44_REDIR_ADDR 0x01 #define NAT44_REDIR_PORT 0x02 #define NAT44_REDIR_PROTO 0x04 /* Nat redirect configuration. 
*/ struct nat44_cfg_redir { struct in_addr laddr; /* local ip address */ struct in_addr paddr; /* public ip address */ struct in_addr raddr; /* remote ip address */ uint16_t lport; /* local port */ uint16_t pport; /* public port */ uint16_t rport; /* remote port */ uint16_t pport_cnt; /* number of public ports */ uint16_t rport_cnt; /* number of remote ports */ uint16_t mode; /* type of redirect mode */ uint16_t spool_cnt; /* num of entry in spool chain */ uint16_t spare; uint32_t proto; /* protocol: tcp/udp */ }; /* Nat configuration data struct. */ struct nat44_cfg_nat { char name[64]; /* nat name */ char if_name[64]; /* interface name */ uint32_t size; /* structure size incl. redirs */ struct in_addr ip; /* nat IPv4 address */ uint32_t mode; /* aliasing mode */ uint32_t redir_cnt; /* number of entry in spool chain */ u_short alias_port_lo; /* low range for port aliasing */ u_short alias_port_hi; /* high range for port aliasing */ }; /* Nat command. */ typedef struct _ipfw_insn_nat { ipfw_insn o; struct cfg_nat *nat; } ipfw_insn_nat; /* Apply ipv6 mask on ipv6 addr */ #define APPLY_MASK(addr,mask) do { \ (addr)->__u6_addr.__u6_addr32[0] &= (mask)->__u6_addr.__u6_addr32[0]; \ (addr)->__u6_addr.__u6_addr32[1] &= (mask)->__u6_addr.__u6_addr32[1]; \ (addr)->__u6_addr.__u6_addr32[2] &= (mask)->__u6_addr.__u6_addr32[2]; \ (addr)->__u6_addr.__u6_addr32[3] &= (mask)->__u6_addr.__u6_addr32[3]; \ } while (0) /* Structure for ipv6 */ typedef struct _ipfw_insn_ip6 { ipfw_insn o; struct in6_addr addr6; struct in6_addr mask6; } ipfw_insn_ip6; /* Used to support icmp6 types */ typedef struct _ipfw_insn_icmp6 { ipfw_insn o; uint32_t d[7]; /* XXX This number si related to the netinet/icmp6.h * define ICMP6_MAXTYPE * as follows: n = ICMP6_MAXTYPE/32 + 1 * Actually is 203 */ } ipfw_insn_icmp6; +/* Convert pointer to instruction with specified type */ +#define insntod(p, type) ((ipfw_insn_ ## type *)(p)) +#define insntoc(p, type) ((const ipfw_insn_ ## type *)(p)) + /* * Here we have 
the structure representing an ipfw rule. * * Layout: * struct ip_fw_rule * [ counter block, size = rule->cntr_len ] * [ one or more instructions, size = rule->cmd_len * 4 ] * * It starts with a general area (with link fields). * Counter block may be next (if rule->cntr_len > 0), * followed by an array of one or more instructions, which the code * accesses as an array of 32-bit values. rule->cmd_len represents * the total instructions legth in u32 worrd, while act_ofs represents * rule action offset in u32 words. * * When assembling instruction, remember the following: * * + if a rule has a "keep-state" (or "limit") option, then the * first instruction (at r->cmd) MUST BE an O_PROBE_STATE * + if a rule has a "log" option, then the first action * (at ACTION_PTR(r)) MUST be O_LOG * + if a rule has an "altq" option, it comes after "log" * + if a rule has an O_TAG option, it comes after "log" and "altq" * * * All structures (excluding instructions) are u64-aligned. * Please keep this. */ struct ip_fw_rule { uint16_t act_ofs; /* offset of action in 32-bit units */ uint16_t cmd_len; /* # of 32-bit words in cmd */ uint16_t spare; uint8_t set; /* rule set (0..31) */ uint8_t flags; /* rule flags */ uint32_t rulenum; /* rule number */ uint32_t id; /* rule id */ ipfw_insn cmd[1]; /* storage for commands */ }; #define IPFW_RULE_NOOPT 0x01 /* Has no options in body */ #define IPFW_RULE_JUSTOPTS 0x02 /* new format of rule body */ /* Unaligned version */ /* Base ipfw rule counter block. 
*/ struct ip_fw_bcounter { uint16_t size; /* Size of counter block, bytes */ uint8_t flags; /* flags for given block */ uint8_t spare; uint32_t timestamp; /* tv_sec of last match */ uint64_t pcnt; /* Packet counter */ uint64_t bcnt; /* Byte counter */ }; #ifndef _KERNEL /* * Legacy rule format */ struct ip_fw { struct ip_fw *x_next; /* linked list of rules */ struct ip_fw *next_rule; /* ptr to next [skipto] rule */ /* 'next_rule' is used to pass up 'set_disable' status */ uint16_t act_ofs; /* offset of action in 32-bit units */ uint16_t cmd_len; /* # of 32-bit words in cmd */ uint16_t rulenum; /* rule number */ uint8_t set; /* rule set (0..31) */ uint8_t _pad; /* padding */ uint32_t id; /* rule id */ /* These fields are present in all rules. */ uint64_t pcnt; /* Packet counter */ uint64_t bcnt; /* Byte counter */ uint32_t timestamp; /* tv_sec of last match */ ipfw_insn cmd[1]; /* storage for commands */ }; #endif #define ACTION_PTR(rule) \ (ipfw_insn *)( (u_int32_t *)((rule)->cmd) + ((rule)->act_ofs) ) #define RULESIZE(rule) (sizeof(*(rule)) + (rule)->cmd_len * 4 - 4) #if 1 // should be moved to in.h /* * This structure is used as a flow mask and a flow id for various * parts of the code. * addr_type is used in userland and kernel to mark the address type. * fib is used in the kernel to record the fib in use. * _flags is used in the kernel to store tcp flags for dynamic rules. */ struct ipfw_flow_id { uint32_t dst_ip; uint32_t src_ip; uint16_t dst_port; uint16_t src_port; uint8_t fib; /* XXX: must be uint16_t */ uint8_t proto; uint8_t _flags; /* protocol-specific flags */ uint8_t addr_type; /* 4=ip4, 6=ip6, 1=ether ? */ struct in6_addr dst_ip6; struct in6_addr src_ip6; uint32_t flow_id6; uint32_t extra; /* queue/pipe or frag_id */ }; #endif #define IS_IP4_FLOW_ID(id) ((id)->addr_type == 4) #define IS_IP6_FLOW_ID(id) ((id)->addr_type == 6) /* * Dynamic ipfw rule. 
*/ -typedef struct _ipfw_dyn_rule ipfw_dyn_rule; - -struct _ipfw_dyn_rule { - ipfw_dyn_rule *next; /* linked list of rules. */ - struct ip_fw *rule; /* pointer to rule */ - /* 'rule' is used to pass up the rule number (from the parent) */ +#define IPFW_DYN_ORPHANED 0x40000 /* state's parent rule was deleted */ - ipfw_dyn_rule *parent; /* pointer to parent rule */ - u_int64_t pcnt; /* packet match counter */ - u_int64_t bcnt; /* byte match counter */ +typedef struct _ipfw_dyn_rule { struct ipfw_flow_id id; /* (masked) flow id */ - u_int32_t expire; /* expire time */ - u_int32_t bucket; /* which bucket in hash table */ - u_int32_t state; /* state of this rule (typically a + uint8_t set; + uint8_t type; /* rule type */ + uint16_t pad; + uint32_t expire; /* expire time */ + uint32_t rulenum; /* parent's rule number */ + uint32_t kidx; /* index of named object */ + uint64_t pcnt; /* packet match counter */ + uint64_t bcnt; /* byte match counter */ + uint32_t hashval; /* hash value */ + union { + uint32_t state; /* state of this rule (typically a * combination of TCP flags) */ -#define IPFW_DYN_ORPHANED 0x40000 /* state's parent rule was deleted */ - u_int32_t ack_fwd; /* most recent ACKs in forward */ - u_int32_t ack_rev; /* and reverse directions (used */ + uint32_t count; /* number of linked states */ + }; + uint32_t ack_fwd; /* most recent ACKs in forward */ + uint32_t ack_rev; /* and reverse directions (used */ /* to generate keepalives) */ - u_int16_t dyn_type; /* rule type */ - u_int16_t count; /* refcount */ - u_int16_t kidx; /* index of named object */ -} __packed __aligned(8); +} __packed __aligned(8) ipfw_dyn_rule; /* * Definitions for IP option names. */ #define IP_FW_IPOPT_LSRR 0x01 #define IP_FW_IPOPT_SSRR 0x02 #define IP_FW_IPOPT_RR 0x04 #define IP_FW_IPOPT_TS 0x08 /* * Definitions for TCP option names. 
*/ #define IP_FW_TCPOPT_MSS 0x01 #define IP_FW_TCPOPT_WINDOW 0x02 #define IP_FW_TCPOPT_SACK 0x04 #define IP_FW_TCPOPT_TS 0x08 #define IP_FW_TCPOPT_CC 0x10 #define ICMP_REJECT_RST 0x100 /* fake ICMP code (send a TCP RST) */ #define ICMP6_UNREACH_RST 0x100 /* fake ICMPv6 code (send a TCP RST) */ #define ICMP_REJECT_ABORT 0x101 /* fake ICMP code (send an SCTP ABORT) */ #define ICMP6_UNREACH_ABORT 0x101 /* fake ICMPv6 code (send an SCTP ABORT) */ /* * These are used for lookup tables. */ #define IPFW_TABLE_ADDR 1 /* Table for holding IPv4/IPv6 prefixes */ #define IPFW_TABLE_INTERFACE 2 /* Table for holding interface names */ #define IPFW_TABLE_NUMBER 3 /* Table for holding ports/uid/gid/etc */ #define IPFW_TABLE_FLOW 4 /* Table for holding flow data */ #define IPFW_TABLE_MAC 5 /* Table for holding mac address prefixes */ #define IPFW_TABLE_MAXTYPE 5 /* Maximum valid number */ #define IPFW_TABLE_CIDR IPFW_TABLE_ADDR /* compat */ /* Value types */ #define IPFW_VTYPE_LEGACY 0xFFFFFFFF /* All data is filled in */ #define IPFW_VTYPE_SKIPTO 0x00000001 /* skipto/call/callreturn */ #define IPFW_VTYPE_PIPE 0x00000002 /* pipe/queue */ #define IPFW_VTYPE_FIB 0x00000004 /* setfib */ #define IPFW_VTYPE_NAT 0x00000008 /* nat */ #define IPFW_VTYPE_DSCP 0x00000010 /* dscp */ #define IPFW_VTYPE_TAG 0x00000020 /* tag/untag */ #define IPFW_VTYPE_DIVERT 0x00000040 /* divert/tee */ #define IPFW_VTYPE_NETGRAPH 0x00000080 /* netgraph/ngtee */ #define IPFW_VTYPE_LIMIT 0x00000100 /* limit */ #define IPFW_VTYPE_NH4 0x00000200 /* IPv4 nexthop */ #define IPFW_VTYPE_NH6 0x00000400 /* IPv6 nexthop */ #define IPFW_VTYPE_MARK 0x00000800 /* [fw]mark */ -/* MAC/InfiniBand/etc address length */ -#define IPFW_MAX_L2_ADDR_LEN 20 - -typedef struct _ipfw_table_entry { - in_addr_t addr; /* network address */ - u_int32_t value; /* value */ - u_int16_t tbl; /* table number */ - u_int8_t masklen; /* mask length */ -} ipfw_table_entry; - typedef struct _ipfw_table_xentry { uint16_t len; /* Total entry length */ 
uint8_t type; /* entry type */ uint8_t masklen; /* mask length */ uint16_t tbl; /* table number */ uint16_t flags; /* record flags */ uint32_t value; /* value */ union { /* Longest field needs to be aligned by 4-byte boundary */ struct in6_addr addr6; /* IPv6 address */ char iface[IF_NAMESIZE]; /* interface name */ } k; } ipfw_table_xentry; #define IPFW_TCF_INET 0x01 /* CIDR flags: IPv4 record */ -typedef struct _ipfw_table { - u_int32_t size; /* size of entries in bytes */ - u_int32_t cnt; /* # of entries */ - u_int16_t tbl; /* table number */ - ipfw_table_entry ent[0]; /* entries */ -} ipfw_table; - typedef struct _ipfw_xtable { ip_fw3_opheader opheader; /* IP_FW3 opcode */ uint32_t size; /* size of entries in bytes */ uint32_t cnt; /* # of entries */ uint16_t tbl; /* table number */ uint8_t type; /* table type */ ipfw_table_xentry xent[0]; /* entries */ } ipfw_xtable; typedef struct _ipfw_obj_tlv { uint16_t type; /* TLV type */ uint16_t flags; /* TLV-specific flags */ uint32_t length; /* Total length, aligned to u64 */ } ipfw_obj_tlv; #define IPFW_TLV_TBL_NAME 1 #define IPFW_TLV_TBLNAME_LIST 2 #define IPFW_TLV_RULE_LIST 3 #define IPFW_TLV_DYNSTATE_LIST 4 #define IPFW_TLV_TBL_ENT 5 #define IPFW_TLV_DYN_ENT 6 #define IPFW_TLV_RULE_ENT 7 #define IPFW_TLV_TBLENT_LIST 8 #define IPFW_TLV_RANGE 9 #define IPFW_TLV_EACTION 10 #define IPFW_TLV_COUNTERS 11 #define IPFW_TLV_OBJDATA 12 #define IPFW_TLV_STATE_NAME 14 #define IPFW_TLV_EACTION_BASE 1000 #define IPFW_TLV_EACTION_NAME(arg) (IPFW_TLV_EACTION_BASE + (arg)) typedef struct _ipfw_obj_data { ipfw_obj_tlv head; void *data[0]; } ipfw_obj_data; /* Object name TLV */ typedef struct _ipfw_obj_ntlv { ipfw_obj_tlv head; /* TLV header */ - uint16_t idx; /* Name index */ + uint32_t idx; /* Name index */ uint8_t set; /* set, if applicable */ uint8_t type; /* object type, if applicable */ - uint32_t spare; /* unused */ + uint16_t spare; /* unused */ char name[64]; /* Null-terminated name */ } ipfw_obj_ntlv; /* IPv4/IPv6 L4 flow 
description */ struct tflow_entry { uint8_t af; uint8_t proto; uint16_t spare; uint16_t sport; uint16_t dport; union { struct { struct in_addr sip; struct in_addr dip; } a4; struct { struct in6_addr sip6; struct in6_addr dip6; } a6; } a; }; +#define IPFW_TVALUE_TYPE_MASK 0xFF00 +#define IPFW_TVALUE_TYPE(insn) (((insn)->arg1 & IPFW_TVALUE_TYPE_MASK) >> 8) +#define IPFW_SET_TVALUE_TYPE(insn, type) do { \ + (insn)->arg1 &= ~IPFW_TVALUE_TYPE_MASK; \ + (insn)->arg1 |= ((type) << 8) & IPFW_TVALUE_TYPE_MASK; \ +} while (0) + +enum ipfw_table_value_type { + TVALUE_TAG = 0, + TVALUE_PIPE, + TVALUE_DIVERT, + TVALUE_SKIPTO, + TVALUE_NETGRAPH, + TVALUE_FIB, + TVALUE_NAT, + TVALUE_NH4, + TVALUE_DSCP, + TVALUE_LIMIT, + TVALUE_MARK, +}; + /* 64-byte structure representing multi-field table value */ typedef struct _ipfw_table_value { uint32_t tag; /* O_TAG/O_TAGGED */ - uint32_t pipe; /* O_PIPE/O_QUEUE */ + uint16_t pipe; /* O_PIPE/O_QUEUE */ uint16_t divert; /* O_DIVERT/O_TEE */ - uint16_t skipto; /* skipto, CALLRET */ + uint32_t skipto; /* skipto, CALLRET */ uint32_t netgraph; /* O_NETGRAPH/O_NGTEE */ - uint32_t fib; /* O_SETFIB */ uint32_t nat; /* O_NAT */ uint32_t nh4; + uint16_t fib; /* O_SETFIB */ uint8_t dscp; uint8_t spare0; - uint16_t kidx; /* value kernel index */ + uint32_t kidx; /* value kernel index */ struct in6_addr nh6; uint32_t limit; /* O_LIMIT */ uint32_t zoneid; /* scope zone id for nh6 */ uint32_t mark; /* O_SETMARK/O_MARK */ uint32_t refcnt; /* XXX 64-bit in kernel */ } ipfw_table_value; /* Table entry TLV */ typedef struct _ipfw_obj_tentry { ipfw_obj_tlv head; /* TLV header */ uint8_t subtype; /* subtype (IPv4,IPv6) */ uint8_t masklen; /* mask length */ uint8_t result; /* request result */ uint8_t spare0; - uint16_t idx; /* Table name index */ - uint16_t spare1; + uint32_t idx; /* Table name index */ union { /* Longest field needs to be aligned by 8-byte boundary */ struct in_addr addr; /* IPv4 address */ uint32_t key; /* uid/gid/port */ struct in6_addr 
addr6; /* IPv6 address */ char iface[IF_NAMESIZE]; /* interface name */ u_char mac[IPFW_MAX_L2_ADDR_LEN]; /* MAC address */ struct tflow_entry flow; } k; union { ipfw_table_value value; /* value data */ uint32_t kidx; /* value kernel index */ } v; } ipfw_obj_tentry; #define IPFW_TF_UPDATE 0x01 /* Update record if exists */ /* Container TLV */ #define IPFW_CTF_ATOMIC 0x01 /* Perform atomic operation */ /* Operation results */ #define IPFW_TR_IGNORED 0 /* Entry was ignored (rollback) */ #define IPFW_TR_ADDED 1 /* Entry was successfully added */ #define IPFW_TR_UPDATED 2 /* Entry was successfully updated*/ #define IPFW_TR_DELETED 3 /* Entry was successfully deleted*/ #define IPFW_TR_LIMIT 4 /* Entry was ignored (limit) */ #define IPFW_TR_NOTFOUND 5 /* Entry was not found */ #define IPFW_TR_EXISTS 6 /* Entry already exists */ #define IPFW_TR_ERROR 7 /* Request has failed (unknown) */ typedef struct _ipfw_obj_dyntlv { ipfw_obj_tlv head; ipfw_dyn_rule state; } ipfw_obj_dyntlv; #define IPFW_DF_LAST 0x01 /* Last state in chain */ /* Containter TLVs */ typedef struct _ipfw_obj_ctlv { ipfw_obj_tlv head; /* TLV header */ uint32_t count; /* Number of sub-TLVs */ uint16_t objsize; /* Single object size */ uint8_t version; /* TLV version */ uint8_t flags; /* TLV-specific flags */ } ipfw_obj_ctlv; /* Range TLV */ typedef struct _ipfw_range_tlv { ipfw_obj_tlv head; /* TLV header */ uint32_t flags; /* Range flags */ - uint16_t start_rule; /* Range start */ - uint16_t end_rule; /* Range end */ + uint32_t start_rule; /* Range start */ + uint32_t end_rule; /* Range end */ uint32_t set; /* Range set to match */ uint32_t new_set; /* New set to move/swap to */ } ipfw_range_tlv; #define IPFW_RCFLAG_RANGE 0x01 /* rule range is set */ #define IPFW_RCFLAG_ALL 0x02 /* match ALL rules */ #define IPFW_RCFLAG_SET 0x04 /* match rules in given set */ #define IPFW_RCFLAG_DYNAMIC 0x08 /* match only dynamic states */ /* User-settable flags */ #define IPFW_RCFLAG_USER (IPFW_RCFLAG_RANGE | 
IPFW_RCFLAG_ALL | \ IPFW_RCFLAG_SET | IPFW_RCFLAG_DYNAMIC) /* Internally used flags */ #define IPFW_RCFLAG_DEFAULT 0x0100 /* Do not skip default rule */ typedef struct _ipfw_ta_tinfo { uint32_t flags; /* Format flags */ uint32_t spare; uint8_t taclass4; /* algorithm class */ uint8_t spare4; uint16_t itemsize4; /* item size in runtime */ uint32_t size4; /* runtime structure size */ uint32_t count4; /* number of items in runtime */ uint8_t taclass6; /* algorithm class */ uint8_t spare6; uint16_t itemsize6; /* item size in runtime */ uint32_t size6; /* runtime structure size */ uint32_t count6; /* number of items in runtime */ } ipfw_ta_tinfo; #define IPFW_TACLASS_HASH 1 /* algo is based on hash */ #define IPFW_TACLASS_ARRAY 2 /* algo is based on array */ #define IPFW_TACLASS_RADIX 3 /* algo is based on radix tree */ #define IPFW_TATFLAGS_DATA 0x0001 /* Has data filled in */ #define IPFW_TATFLAGS_AFDATA 0x0002 /* Separate data per AF */ #define IPFW_TATFLAGS_AFITEM 0x0004 /* diff. items per AF */ typedef struct _ipfw_xtable_info { uint8_t type; /* table type (addr,iface,..) 
*/ uint8_t tflags; /* type flags */ uint16_t mflags; /* modification flags */ uint16_t flags; /* generic table flags */ uint16_t spare[3]; uint32_t vmask; /* bitmask with value types */ uint32_t set; /* set table is in */ uint32_t kidx; /* kernel index */ uint32_t refcnt; /* number of references */ uint32_t count; /* Number of records */ uint32_t size; /* Total size of records(export)*/ uint32_t limit; /* Max number of records */ char tablename[64]; /* table name */ char algoname[64]; /* algorithm name */ ipfw_ta_tinfo ta_info; /* additional algo stats */ } ipfw_xtable_info; /* Generic table flags */ #define IPFW_TGFLAGS_LOCKED 0x01 /* Tables is locked from changes*/ /* Table type-specific flags */ #define IPFW_TFFLAG_SRCIP 0x01 #define IPFW_TFFLAG_DSTIP 0x02 #define IPFW_TFFLAG_SRCPORT 0x04 #define IPFW_TFFLAG_DSTPORT 0x08 #define IPFW_TFFLAG_PROTO 0x10 /* Table modification flags */ #define IPFW_TMFLAGS_LIMIT 0x0002 /* Change limit value */ #define IPFW_TMFLAGS_LOCK 0x0004 /* Change table lock state */ typedef struct _ipfw_iface_info { char ifname[64]; /* interface name */ uint32_t ifindex; /* interface index */ uint32_t flags; /* flags */ uint32_t refcnt; /* number of references */ uint32_t gencnt; /* number of changes */ uint64_t spare; } ipfw_iface_info; #define IPFW_IFFLAG_RESOLVED 0x01 /* Interface exists */ typedef struct _ipfw_ta_info { char algoname[64]; /* algorithm name */ uint32_t type; /* lookup type */ uint32_t flags; uint32_t refcnt; uint32_t spare0; uint64_t spare1; } ipfw_ta_info; +typedef struct _ipfw_cmd_header { /* control command header */ + ip_fw3_opheader opheader; /* IP_FW3 opcode */ + uint32_t size; /* Total size (incl. 
header) */ + uint32_t cmd; /* command */ +} ipfw_cmd_header; + typedef struct _ipfw_obj_header { ip_fw3_opheader opheader; /* IP_FW3 opcode */ - uint32_t spare; - uint16_t idx; /* object name index */ + uint32_t idx; /* object name index */ + uint16_t spare; uint8_t objtype; /* object type */ uint8_t objsubtype; /* object subtype */ ipfw_obj_ntlv ntlv; /* object name tlv */ } ipfw_obj_header; typedef struct _ipfw_obj_lheader { ip_fw3_opheader opheader; /* IP_FW3 opcode */ uint32_t set_mask; /* disabled set mask */ uint32_t count; /* Total objects count */ uint32_t size; /* Total size (incl. header) */ uint32_t objsize; /* Size of one object */ } ipfw_obj_lheader; #define IPFW_CFG_GET_STATIC 0x01 #define IPFW_CFG_GET_STATES 0x02 #define IPFW_CFG_GET_COUNTERS 0x04 typedef struct _ipfw_cfg_lheader { ip_fw3_opheader opheader; /* IP_FW3 opcode */ uint32_t set_mask; /* enabled set mask */ uint32_t spare; uint32_t flags; /* Request flags */ uint32_t size; /* neded buffer size */ uint32_t start_rule; uint32_t end_rule; } ipfw_cfg_lheader; typedef struct _ipfw_range_header { ip_fw3_opheader opheader; /* IP_FW3 opcode */ ipfw_range_tlv range; } ipfw_range_header; typedef struct _ipfw_sopt_info { uint16_t opcode; uint8_t version; uint8_t dir; uint8_t spare; uint64_t refcnt; } ipfw_sopt_info; #endif /* _IPFW2_H */ diff --git a/sys/netinet6/ip_fw_nat64.h b/sys/netinet6/ip_fw_nat64.h index 06baa02e8c3a..6f113acca44e 100644 --- a/sys/netinet6/ip_fw_nat64.h +++ b/sys/netinet6/ip_fw_nat64.h @@ -1,208 +1,212 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2015-2019 Yandex LLC * Copyright (c) 2015 Alexander V. Chernikov * Copyright (c) 2015-2019 Andrey V. Elsukov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef _NETINET6_IP_FW_NAT64_H_ #define _NETINET6_IP_FW_NAT64_H_ struct ipfw_nat64stl_stats { uint64_t opcnt64; /* 6to4 of packets translated */ uint64_t opcnt46; /* 4to6 of packets translated */ uint64_t ofrags; /* number of fragments generated */ uint64_t ifrags; /* number of fragments received */ uint64_t oerrors; /* number of output errors */ uint64_t noroute4; uint64_t noroute6; uint64_t noproto; /* Protocol not supported */ uint64_t nomem; /* mbuf allocation failed */ uint64_t dropped; /* dropped due to some errors */ }; struct ipfw_nat64clat_stats { uint64_t opcnt64; /* 6to4 of packets translated */ uint64_t opcnt46; /* 4to6 of packets translated */ uint64_t ofrags; /* number of fragments generated */ uint64_t ifrags; /* number of fragments received */ uint64_t oerrors; /* number of output errors */ uint64_t noroute4; uint64_t noroute6; uint64_t noproto; /* Protocol not supported */ uint64_t nomem; /* mbuf allocation failed */ uint64_t dropped; /* dropped due to some errors */ }; struct ipfw_nat64lsn_stats { uint64_t opcnt64; /* 6to4 of 
packets translated */ uint64_t opcnt46; /* 4to6 of packets translated */ uint64_t ofrags; /* number of fragments generated */ uint64_t ifrags; /* number of fragments received */ uint64_t oerrors; /* number of output errors */ uint64_t noroute4; uint64_t noroute6; uint64_t noproto; /* Protocol not supported */ uint64_t nomem; /* mbuf allocation failed */ uint64_t dropped; /* dropped due to some errors */ uint64_t nomatch4; /* No addr/port match */ uint64_t jcalls; /* Number of job handler calls */ uint64_t jrequests; /* Number of job requests */ uint64_t jhostsreq; /* Number of job host requests */ uint64_t jportreq; /* Number of portgroup requests */ uint64_t jhostfails; /* Number of failed host allocs */ uint64_t jportfails; /* Number of failed portgroup allocs */ uint64_t jreinjected; /* Number of packets reinjected to q */ uint64_t jmaxlen; /* Max queue length reached */ uint64_t jnomem; /* No memory to alloc queue item */ uint64_t screated; /* Number of states created */ uint64_t sdeleted; /* Number of states deleted */ uint64_t spgcreated; /* Number of portgroups created */ uint64_t spgdeleted; /* Number of portgroups deleted */ uint64_t hostcount; /* Number of hosts */ - uint64_t tcpchunks; /* Number of TCP chunks */ - uint64_t udpchunks; /* Number of UDP chunks */ - uint64_t icmpchunks; /* Number of ICMP chunks */ + uint64_t tcpchunks; /* Number of TCP portgroups */ + uint64_t udpchunks; /* Number of UDP portgroups */ + uint64_t icmpchunks; /* Number of ICMP portgroups */ uint64_t _reserved[4]; }; #define NAT64_LOG 0x0001 /* Enable logging via BPF */ #define NAT64_ALLOW_PRIVATE 0x0002 /* Allow private IPv4 address * translation */ +#define NAT64LSN_ALLOW_SWAPCONF 0x0004 /* Allow configuration exchange + * between NAT64LSN instances + * during the sets swapping. 
+ */ typedef struct _ipfw_nat64stl_cfg { char name[64]; /* NAT name */ ipfw_obj_ntlv ntlv6; /* object name tlv */ ipfw_obj_ntlv ntlv4; /* object name tlv */ struct in6_addr prefix6; /* NAT64 prefix */ uint8_t plen6; /* Prefix length */ uint8_t set; /* Named instance set [0..31] */ uint8_t spare[2]; uint32_t flags; } ipfw_nat64stl_cfg; typedef struct _ipfw_nat64clat_cfg { char name[64]; /* NAT name */ struct in6_addr plat_prefix; /* NAT64 (PLAT) prefix */ struct in6_addr clat_prefix; /* Client (CLAT) prefix */ uint8_t plat_plen; /* PLAT Prefix length */ uint8_t clat_plen; /* CLAT Prefix length */ uint8_t set; /* Named instance set [0..31] */ uint8_t spare; uint32_t flags; } ipfw_nat64clat_cfg; /* * NAT64LSN default configuration values */ #define NAT64LSN_MAX_PORTS 2048 /* Unused */ #define NAT64LSN_JMAXLEN 2048 /* Max outstanding requests. */ #define NAT64LSN_TCP_SYN_AGE 10 /* State's TTL after SYN received. */ #define NAT64LSN_TCP_EST_AGE (2 * 3600) /* TTL for established connection */ #define NAT64LSN_TCP_FIN_AGE 180 /* State's TTL after FIN/RST received */ #define NAT64LSN_UDP_AGE 120 /* TTL for UDP states */ #define NAT64LSN_ICMP_AGE 60 /* TTL for ICMP states */ #define NAT64LSN_HOST_AGE 3600 /* TTL for stale host entry */ #define NAT64LSN_PG_AGE 900 /* TTL for stale ports groups */ typedef struct _ipfw_nat64lsn_cfg { char name[64]; /* NAT name */ uint32_t flags; uint32_t max_ports; /* Unused */ uint32_t agg_prefix_len; /* Unused */ uint32_t agg_prefix_max; /* Unused */ struct in_addr prefix4; uint16_t plen4; /* Prefix length */ uint16_t plen6; /* Prefix length */ struct in6_addr prefix6; /* NAT64 prefix */ uint32_t jmaxlen; /* Max jobqueue length */ uint16_t min_port; /* Unused */ uint16_t max_port; /* Unused */ uint16_t nh_delete_delay;/* Stale host delete delay */ uint16_t pg_delete_delay;/* Stale portgroup delete delay */ uint16_t st_syn_ttl; /* TCP syn expire */ uint16_t st_close_ttl; /* TCP fin expire */ uint16_t st_estab_ttl; /* TCP established expire */ 
uint16_t st_udp_ttl; /* UDP expire */ uint16_t st_icmp_ttl; /* ICMP expire */ uint8_t set; /* Named instance set [0..31] */ uint8_t states_chunks; /* Number of states chunks per PG */ } ipfw_nat64lsn_cfg; typedef struct _ipfw_nat64lsn_state { struct in_addr daddr; /* Remote IPv4 address */ uint16_t dport; /* Remote destination port */ uint16_t aport; /* Local alias port */ uint16_t sport; /* Source port */ uint8_t flags; /* State flags */ uint8_t spare[3]; uint16_t idle; /* Last used time */ } ipfw_nat64lsn_state; typedef struct _ipfw_nat64lsn_stg { uint64_t next_idx; /* next state index */ struct in_addr alias4; /* IPv4 alias address */ uint8_t proto; /* protocol */ uint8_t flags; uint16_t spare; struct in6_addr host6; /* Bound IPv6 host */ uint32_t count; /* Number of states */ uint32_t spare2; } ipfw_nat64lsn_stg; typedef struct _ipfw_nat64lsn_state_v1 { struct in6_addr host6; /* Bound IPv6 host */ struct in_addr daddr; /* Remote IPv4 address */ uint16_t dport; /* Remote destination port */ uint16_t aport; /* Local alias port */ uint16_t sport; /* Source port */ uint16_t spare; uint16_t idle; /* Last used time */ uint8_t flags; /* State flags */ uint8_t proto; /* protocol */ } ipfw_nat64lsn_state_v1; typedef struct _ipfw_nat64lsn_stg_v1 { union nat64lsn_pgidx { uint64_t index; struct { uint8_t chunk; /* states chunk */ uint8_t proto; /* protocol */ uint16_t port; /* base port */ in_addr_t addr; /* alias address */ }; } next; /* next state index */ struct in_addr alias4; /* IPv4 alias address */ uint32_t count; /* Number of states */ } ipfw_nat64lsn_stg_v1; #endif /* _NETINET6_IP_FW_NAT64_H_ */ diff --git a/sys/netpfil/ipfw/ip_fw2.c b/sys/netpfil/ipfw/ip_fw2.c index 44a29b5689bc..923633d76df7 100644 --- a/sys/netpfil/ipfw/ip_fw2.c +++ b/sys/netpfil/ipfw/ip_fw2.c @@ -1,3725 +1,3838 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * 
modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include /* * The FreeBSD IP packet firewall, main file */ #include "opt_ipfw.h" #include "opt_ipdivert.h" #include "opt_inet.h" #ifndef INET #error "IPFIREWALL requires INET" #endif /* INET */ #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for ETHERTYPE_IP */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #endif #include /* for struct grehdr */ #include #include /* XXX for in_cksum */ #ifdef MAC #include #endif #define IPFW_PROBE(probe, arg0, arg1, arg2, arg3, arg4, arg5) \ SDT_PROBE6(ipfw, , , probe, arg0, arg1, arg2, arg3, arg4, arg5) SDT_PROVIDER_DEFINE(ipfw); SDT_PROBE_DEFINE6(ipfw, , , rule__matched, "int", /* retval */ "int", /* af */ "void *", /* src addr */ "void *", /* dst addr */ "struct ip_fw_args *", /* args */ "struct ip_fw *" /* rule */); /* * static variables followed by global ones. * All ipfw global variables are here. 
*/ VNET_DEFINE_STATIC(int, fw_deny_unknown_exthdrs); #define V_fw_deny_unknown_exthdrs VNET(fw_deny_unknown_exthdrs) VNET_DEFINE_STATIC(int, fw_permit_single_frag6) = 1; #define V_fw_permit_single_frag6 VNET(fw_permit_single_frag6) #ifdef IPFIREWALL_DEFAULT_TO_ACCEPT static int default_to_accept = 1; #else static int default_to_accept; #endif VNET_DEFINE(int, autoinc_step); VNET_DEFINE(int, fw_one_pass) = 1; VNET_DEFINE(unsigned int, fw_tables_max); VNET_DEFINE(unsigned int, fw_tables_sets) = 0; /* Don't use set-aware tables */ /* Use 128 tables by default */ static unsigned int default_fw_tables = IPFW_TABLES_DEFAULT; -static int jump_lookup_pos(struct ip_fw_chain *chain, struct ip_fw *f, int num, - int tablearg, int jump_backwards); -#ifndef LINEAR_SKIPTO -static int jump_cached(struct ip_fw_chain *chain, struct ip_fw *f, int num, - int tablearg, int jump_backwards); -#define JUMP(ch, f, num, targ, back) jump_cached(ch, f, num, targ, back) +#ifndef IPFIREWALL_LINEAR_SKIPTO +VNET_DEFINE(int, skipto_cache) = 0; #else -#define JUMP(ch, f, num, targ, back) jump_lookup_pos(ch, f, num, targ, back) +VNET_DEFINE(int, skipto_cache) = 1; #endif +static uint32_t jump(struct ip_fw_chain *chain, struct ip_fw *f, + uint32_t num, int tablearg, bool jump_backwards); + /* * Each rule belongs to one of 32 different sets (0..31). * The variable set_disable contains one bit per set. * If the bit is set, all rules in the corresponding set * are disabled. Set RESVD_SET(31) is reserved for the default rule * and rules that are not deleted by the flush command, * and CANNOT be disabled. * Rules in set RESVD_SET can only be deleted individually. */ VNET_DEFINE(u_int32_t, set_disable); #define V_set_disable VNET(set_disable) VNET_DEFINE(int, fw_verbose); /* counter for ipfw_log(NULL...) 
*/ VNET_DEFINE(u_int64_t, norule_counter); VNET_DEFINE(int, verbose_limit); /* layer3_chain contains the list of rules for layer 3 */ VNET_DEFINE(struct ip_fw_chain, layer3_chain); /* ipfw_vnet_ready controls when we are open for business */ VNET_DEFINE(int, ipfw_vnet_ready) = 0; VNET_DEFINE(int, ipfw_nat_ready) = 0; ipfw_nat_t *ipfw_nat_ptr = NULL; struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int); ipfw_nat_cfg_t *ipfw_nat_cfg_ptr; ipfw_nat_cfg_t *ipfw_nat_del_ptr; ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; #ifdef SYSCTL_NODE uint32_t dummy_def = IPFW_DEFAULT_RULE; static int sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS); static int sysctl_ipfw_tables_sets(SYSCTL_HANDLER_ARGS); SYSBEGIN(f3) SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Firewall"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0, "Only do a single pass through ipfw when using dummynet(4)"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(autoinc_step), 0, "Rule number auto-increment step"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0, "Log matches to ipfw rules"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(verbose_limit), 0, "Set upper limit of matches of ipfw rules logged"); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, skipto_cache, + CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(skipto_cache), 0, + "Status of linear skipto cache: 1 - enabled, 0 - disabled."); SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD, &dummy_def, 0, "The default/max possible rule number."); SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, tables_max, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_ipfw_table_num, "IU", "Maximum number of concurrently used tables"); SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, 
tables_sets, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_ipfw_tables_sets, "IU", "Use per-set namespace for tables"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN, &default_to_accept, 0, "Make the default rule accept all packets."); TUNABLE_INT("net.inet.ip.fw.tables_max", (int *)&default_fw_tables); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0, "Number of static rules"); #ifdef INET6 SYSCTL_DECL(_net_inet6_ip6); SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Firewall"); SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_deny_unknown_exthdrs), 0, "Deny packets with unknown IPv6 Extension Headers"); SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, permit_single_frag6, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_permit_single_frag6), 0, "Permit single packet IPv6 fragments"); #endif /* INET6 */ SYSEND #endif /* SYSCTL_NODE */ /* * Some macros used in the various matching options. * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T * Other macros just cast void * into the appropriate type */ #define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl)) #define TCP(p) ((struct tcphdr *)(p)) #define SCTP(p) ((struct sctphdr *)(p)) #define UDP(p) ((struct udphdr *)(p)) #define ICMP(p) ((struct icmphdr *)(p)) #define ICMP6(p) ((struct icmp6_hdr *)(p)) static __inline int icmptype_match(struct icmphdr *icmp, ipfw_insn_u32 *cmd) { int type = icmp->icmp_type; return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1<icmp_type; return (type <= ICMP_MAXTYPE && (TT & (1<arg1 or cmd->d[0]. * * We scan options and store the bits we find set. We succeed if * * (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear * * The code is sometimes optimized not to store additional variables. 
*/ static int flags_match(ipfw_insn *cmd, u_int8_t bits) { u_char want_clear; bits = ~bits; if ( ((cmd->arg1 & 0xff) & bits) != 0) return 0; /* some bits we want set were clear */ want_clear = (cmd->arg1 >> 8) & 0xff; if ( (want_clear & bits) != want_clear) return 0; /* some bits we want clear were set */ return 1; } static int ipopts_match(struct ip *ip, ipfw_insn *cmd) { int optlen, bits = 0; u_char *cp = (u_char *)(ip + 1); int x = (ip->ip_hl << 2) - sizeof (struct ip); for (; x > 0; x -= optlen, cp += optlen) { int opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { optlen = cp[IPOPT_OLEN]; if (optlen <= 0 || optlen > x) return 0; /* invalid or truncated */ } switch (opt) { default: break; case IPOPT_LSRR: bits |= IP_FW_IPOPT_LSRR; break; case IPOPT_SSRR: bits |= IP_FW_IPOPT_SSRR; break; case IPOPT_RR: bits |= IP_FW_IPOPT_RR; break; case IPOPT_TS: bits |= IP_FW_IPOPT_TS; break; } } return (flags_match(cmd, bits)); } /* * Parse TCP options. The logic copied from tcp_dooptions(). 
*/ static int tcpopts_parse(const struct tcphdr *tcp, uint16_t *mss) { const u_char *cp = (const u_char *)(tcp + 1); int optlen, bits = 0; int cnt = (tcp->th_off << 2) - sizeof(struct tcphdr); for (; cnt > 0; cnt -= optlen, cp += optlen) { int opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { if (cnt < 2) break; optlen = cp[1]; if (optlen < 2 || optlen > cnt) break; } switch (opt) { default: break; case TCPOPT_MAXSEG: if (optlen != TCPOLEN_MAXSEG) break; bits |= IP_FW_TCPOPT_MSS; if (mss != NULL) *mss = be16dec(cp + 2); break; case TCPOPT_WINDOW: if (optlen == TCPOLEN_WINDOW) bits |= IP_FW_TCPOPT_WINDOW; break; case TCPOPT_SACK_PERMITTED: if (optlen == TCPOLEN_SACK_PERMITTED) bits |= IP_FW_TCPOPT_SACK; break; case TCPOPT_SACK: if (optlen > 2 && (optlen - 2) % TCPOLEN_SACK == 0) bits |= IP_FW_TCPOPT_SACK; break; case TCPOPT_TIMESTAMP: if (optlen == TCPOLEN_TIMESTAMP) bits |= IP_FW_TCPOPT_TS; break; } } return (bits); } static int tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd) { return (flags_match(cmd, tcpopts_parse(tcp, NULL))); } static int iface_match(struct ifnet *ifp, ipfw_insn_if *cmd, struct ip_fw_chain *chain, uint32_t *tablearg) { if (ifp == NULL) /* no iface with this packet, match fails */ return (0); /* Check by name or by IP address */ if (cmd->name[0] != '\0') { /* match by name */ if (cmd->name[0] == '\1') /* use tablearg to match */ return ipfw_lookup_table(chain, cmd->p.kidx, 0, &ifp->if_index, tablearg); /* Check name */ if (cmd->p.glob) { if (fnmatch(cmd->name, ifp->if_xname, 0) == 0) return(1); } else { if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0) return(1); } } else { #if !defined(USERSPACE) && defined(__FreeBSD__) /* and OSX too ? 
*/ struct ifaddr *ia; NET_EPOCH_ASSERT(); CK_STAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { if (ia->ifa_addr->sa_family != AF_INET) continue; if (cmd->p.ip.s_addr == ((struct sockaddr_in *) (ia->ifa_addr))->sin_addr.s_addr) return (1); /* match */ } #endif /* __FreeBSD__ */ } return(0); /* no match, fail ... */ } /* * The verify_path function checks if a route to the src exists and * if it is reachable via ifp (when provided). * * The 'verrevpath' option checks that the interface that an IP packet * arrives on is the same interface that traffic destined for the * packet's source address would be routed out of. * The 'versrcreach' option just checks that the source address is * reachable via any route (except default) in the routing table. * These two are a measure to block forged packets. This is also * commonly known as "anti-spoofing" or Unicast Reverse Path * Forwarding (Unicast RFP) in Cisco-ese. The name of the knobs * is purposely reminiscent of the Cisco IOS command, * * ip verify unicast reverse-path * ip verify unicast source reachable-via any * * which implements the same functionality. But note that the syntax * is misleading, and the check may be performed on all IP packets * whether unicast, multicast, or broadcast. */ static int verify_path(struct in_addr src, struct ifnet *ifp, u_int fib) { #if defined(USERSPACE) || !defined(__FreeBSD__) return 0; #else struct nhop_object *nh; nh = fib4_lookup(fib, src, 0, NHR_NONE, 0); if (nh == NULL) return (0); /* * If ifp is provided, check for equality with rtentry. * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp, * in order to pass packets injected back by if_simloop(): * routing entry (via lo0) for our own address * may exist, so we need to handle routing assymetry. 
*/ if (ifp != NULL && ifp != nh->nh_aifp) return (0); /* if no ifp provided, check if rtentry is not default route */ if (ifp == NULL && (nh->nh_flags & NHF_DEFAULT) != 0) return (0); /* or if this is a blackhole/reject route */ if (ifp == NULL && (nh->nh_flags & (NHF_REJECT|NHF_BLACKHOLE)) != 0) return (0); /* found valid route */ return 1; #endif /* __FreeBSD__ */ } /* * Generate an SCTP packet containing an ABORT chunk. The verification tag * is given by vtag. The T-bit is set in the ABORT chunk if and only if * reflected is not 0. */ static struct mbuf * ipfw_send_abort(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t vtag, int reflected) { struct mbuf *m; struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; #endif struct sctphdr *sctp; struct sctp_chunkhdr *chunk; u_int16_t hlen, plen, tlen; MGETHDR(m, M_NOWAIT, MT_DATA); if (m == NULL) return (NULL); M_SETFIB(m, id->fib); #ifdef MAC if (replyto != NULL) mac_netinet_firewall_reply(replyto, m); else mac_netinet_firewall_send(m); #else (void)replyto; /* don't warn about unused arg */ #endif switch (id->addr_type) { case 4: hlen = sizeof(struct ip); break; #ifdef INET6 case 6: hlen = sizeof(struct ip6_hdr); break; #endif default: /* XXX: log me?!? 
*/ FREE_PKT(m); return (NULL); } plen = sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr); tlen = hlen + plen; m->m_data += max_linkhdr; m->m_flags |= M_SKIP_FIREWALL; m->m_pkthdr.len = m->m_len = tlen; m->m_pkthdr.rcvif = NULL; bzero(m->m_data, tlen); switch (id->addr_type) { case 4: ip = mtod(m, struct ip *); ip->ip_v = 4; ip->ip_hl = sizeof(struct ip) >> 2; ip->ip_tos = IPTOS_LOWDELAY; ip->ip_len = htons(tlen); ip->ip_id = htons(0); ip->ip_off = htons(0); ip->ip_ttl = V_ip_defttl; ip->ip_p = IPPROTO_SCTP; ip->ip_sum = 0; ip->ip_src.s_addr = htonl(id->dst_ip); ip->ip_dst.s_addr = htonl(id->src_ip); sctp = (struct sctphdr *)(ip + 1); break; #ifdef INET6 case 6: ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_plen = htons(plen); ip6->ip6_nxt = IPPROTO_SCTP; ip6->ip6_hlim = IPV6_DEFHLIM; ip6->ip6_src = id->dst_ip6; ip6->ip6_dst = id->src_ip6; sctp = (struct sctphdr *)(ip6 + 1); break; #endif } sctp->src_port = htons(id->dst_port); sctp->dest_port = htons(id->src_port); sctp->v_tag = htonl(vtag); sctp->checksum = htonl(0); chunk = (struct sctp_chunkhdr *)(sctp + 1); chunk->chunk_type = SCTP_ABORT_ASSOCIATION; chunk->chunk_flags = 0; if (reflected != 0) { chunk->chunk_flags |= SCTP_HAD_NO_TCB; } chunk->chunk_length = htons(sizeof(struct sctp_chunkhdr)); sctp->checksum = sctp_calculate_cksum(m, hlen); return (m); } /* * Generate a TCP packet, containing either a RST or a keepalive. * When flags & TH_RST, we are sending a RST packet, because of a * "reset" action matched the packet. * Otherwise we are sending a keepalive, and flags & TH_ * The 'replyto' mbuf is the mbuf being replied to, if any, and is required * so that MAC can label the reply appropriately. 
*/ struct mbuf * ipfw_send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq, u_int32_t ack, int flags) { struct mbuf *m = NULL; /* stupid compiler */ struct ip *h = NULL; /* stupid compiler */ #ifdef INET6 struct ip6_hdr *h6 = NULL; #endif struct tcphdr *th = NULL; int len, dir; MGETHDR(m, M_NOWAIT, MT_DATA); if (m == NULL) return (NULL); M_SETFIB(m, id->fib); #ifdef MAC if (replyto != NULL) mac_netinet_firewall_reply(replyto, m); else mac_netinet_firewall_send(m); #else (void)replyto; /* don't warn about unused arg */ #endif switch (id->addr_type) { case 4: len = sizeof(struct ip) + sizeof(struct tcphdr); break; #ifdef INET6 case 6: len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); break; #endif default: /* XXX: log me?!? */ FREE_PKT(m); return (NULL); } dir = ((flags & (TH_SYN | TH_RST)) == TH_SYN); m->m_data += max_linkhdr; m->m_flags |= M_SKIP_FIREWALL; m->m_pkthdr.len = m->m_len = len; m->m_pkthdr.rcvif = NULL; bzero(m->m_data, len); switch (id->addr_type) { case 4: h = mtod(m, struct ip *); /* prepare for checksum */ h->ip_p = IPPROTO_TCP; h->ip_len = htons(sizeof(struct tcphdr)); if (dir) { h->ip_src.s_addr = htonl(id->src_ip); h->ip_dst.s_addr = htonl(id->dst_ip); } else { h->ip_src.s_addr = htonl(id->dst_ip); h->ip_dst.s_addr = htonl(id->src_ip); } th = (struct tcphdr *)(h + 1); break; #ifdef INET6 case 6: h6 = mtod(m, struct ip6_hdr *); /* prepare for checksum */ h6->ip6_nxt = IPPROTO_TCP; h6->ip6_plen = htons(sizeof(struct tcphdr)); if (dir) { h6->ip6_src = id->src_ip6; h6->ip6_dst = id->dst_ip6; } else { h6->ip6_src = id->dst_ip6; h6->ip6_dst = id->src_ip6; } th = (struct tcphdr *)(h6 + 1); break; #endif } if (dir) { th->th_sport = htons(id->src_port); th->th_dport = htons(id->dst_port); } else { th->th_sport = htons(id->dst_port); th->th_dport = htons(id->src_port); } th->th_off = sizeof(struct tcphdr) >> 2; if (flags & TH_RST) { if (flags & TH_ACK) { th->th_seq = htonl(ack); tcp_set_flags(th, TH_RST); } else { if (flags & TH_SYN) 
seq++; th->th_ack = htonl(seq); tcp_set_flags(th, TH_RST | TH_ACK); } } else { /* * Keepalive - use caller provided sequence numbers */ th->th_seq = htonl(seq); th->th_ack = htonl(ack); tcp_set_flags(th, TH_ACK); } switch (id->addr_type) { case 4: th->th_sum = in_cksum(m, len); /* finish the ip header */ h->ip_v = 4; h->ip_hl = sizeof(*h) >> 2; h->ip_tos = IPTOS_LOWDELAY; h->ip_off = htons(0); h->ip_len = htons(len); h->ip_ttl = V_ip_defttl; h->ip_sum = 0; break; #ifdef INET6 case 6: th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(*h6), sizeof(struct tcphdr)); /* finish the ip6 header */ h6->ip6_vfc |= IPV6_VERSION; h6->ip6_hlim = IPV6_DEFHLIM; break; #endif } return (m); } #ifdef INET6 /* * ipv6 specific rules here... */ static __inline int icmp6type_match(int type, ipfw_insn_u32 *cmd) { return (type <= ICMP6_MAXTYPE && (cmd->d[type/32] & (1<<(type%32)) ) ); } static int flow6id_match(int curr_flow, ipfw_insn_u32 *cmd) { int i; for (i=0; i <= cmd->o.arg1; ++i) if (curr_flow == cmd->d[i]) return 1; return 0; } /* support for IP6_*_ME opcodes */ static const struct in6_addr lla_mask = {{{ 0xff, 0xff, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }}}; static int ipfw_localip6(struct in6_addr *in6) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; if (IN6_IS_ADDR_MULTICAST(in6)) return (0); if (!IN6_IS_ADDR_LINKLOCAL(in6)) return (in6_localip(in6)); IN6_IFADDR_RLOCK(&in6_ifa_tracker); CK_STAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { if (!IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr)) continue; if (IN6_ARE_MASKED_ADDR_EQUAL(&ia->ia_addr.sin6_addr, in6, &lla_mask)) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (1); } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); } static int verify_path6(struct in6_addr *src, struct ifnet *ifp, u_int fib) { struct nhop_object *nh; if (IN6_IS_SCOPE_LINKLOCAL(src)) return (1); nh = fib6_lookup(fib, src, 0, NHR_NONE, 0); if (nh == NULL) return (0); /* If ifp is provided, check for 
equality with route table. */ if (ifp != NULL && ifp != nh->nh_aifp) return (0); /* if no ifp provided, check if rtentry is not default route */ if (ifp == NULL && (nh->nh_flags & NHF_DEFAULT) != 0) return (0); /* or if this is a blackhole/reject route */ if (ifp == NULL && (nh->nh_flags & (NHF_REJECT|NHF_BLACKHOLE)) != 0) return (0); /* found valid route */ return 1; } static int is_icmp6_query(int icmp6_type) { if ((icmp6_type <= ICMP6_MAXTYPE) && (icmp6_type == ICMP6_ECHO_REQUEST || icmp6_type == ICMP6_MEMBERSHIP_QUERY || icmp6_type == ICMP6_WRUREQUEST || icmp6_type == ICMP6_FQDN_QUERY || icmp6_type == ICMP6_NI_QUERY)) return (1); return (0); } static int map_icmp_unreach(int code) { /* RFC 7915 p4.2 */ switch (code) { case ICMP_UNREACH_NET: case ICMP_UNREACH_HOST: case ICMP_UNREACH_SRCFAIL: case ICMP_UNREACH_NET_UNKNOWN: case ICMP_UNREACH_HOST_UNKNOWN: case ICMP_UNREACH_TOSNET: case ICMP_UNREACH_TOSHOST: return (ICMP6_DST_UNREACH_NOROUTE); case ICMP_UNREACH_PORT: return (ICMP6_DST_UNREACH_NOPORT); default: /* * Map the rest of codes into admit prohibited. * XXX: unreach proto should be mapped into ICMPv6 * parameter problem, but we use only unreach type. 
*/ return (ICMP6_DST_UNREACH_ADMIN); } } static void send_reject6(struct ip_fw_args *args, int code, u_int hlen, const struct ip6_hdr *ip6) { struct mbuf *m; m = args->m; if (code == ICMP6_UNREACH_RST && args->f_id.proto == IPPROTO_TCP) { const struct tcphdr * tcp; tcp = (const struct tcphdr *)((const char *)ip6 + hlen); if ((tcp_get_flags(tcp) & TH_RST) == 0) { struct mbuf *m0; m0 = ipfw_send_pkt(args->m, &(args->f_id), ntohl(tcp->th_seq), ntohl(tcp->th_ack), tcp_get_flags(tcp) | TH_RST); if (m0 != NULL) ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL); } FREE_PKT(m); } else if (code == ICMP6_UNREACH_ABORT && args->f_id.proto == IPPROTO_SCTP) { struct mbuf *m0; const struct sctphdr *sctp; u_int32_t v_tag; int reflected; sctp = (const struct sctphdr *)((const char *)ip6 + hlen); reflected = 1; v_tag = ntohl(sctp->v_tag); /* Investigate the first chunk header if available */ if (m->m_len >= hlen + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr)) { const struct sctp_chunkhdr *chunk; chunk = (const struct sctp_chunkhdr *)(sctp + 1); switch (chunk->chunk_type) { case SCTP_INITIATION: /* * Packets containing an INIT chunk MUST have * a zero v-tag. */ if (v_tag != 0) { v_tag = 0; break; } /* INIT chunk MUST NOT be bundled */ if (m->m_pkthdr.len > hlen + sizeof(struct sctphdr) + ntohs(chunk->chunk_length) + 3) { break; } /* Use the initiate tag if available */ if ((m->m_len >= hlen + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + offsetof(struct sctp_init, a_rwnd))) { const struct sctp_init *init; init = (const struct sctp_init *)(chunk + 1); v_tag = ntohl(init->initiate_tag); reflected = 0; } break; case SCTP_ABORT_ASSOCIATION: /* * If the packet contains an ABORT chunk, don't * reply. * XXX: We should search through all chunks, * but do not do that to avoid attacks. 
*/ v_tag = 0; break; } } if (v_tag == 0) { m0 = NULL; } else { m0 = ipfw_send_abort(args->m, &(args->f_id), v_tag, reflected); } if (m0 != NULL) ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL); FREE_PKT(m); } else if (code != ICMP6_UNREACH_RST && code != ICMP6_UNREACH_ABORT) { /* Send an ICMPv6 unreach. */ #if 0 /* * Unlike above, the mbufs need to line up with the ip6 hdr, * as the contents are read. We need to m_adj() the * needed amount. * The mbuf will however be thrown away so we can adjust it. * Remember we did an m_pullup on it already so we * can make some assumptions about contiguousness. */ if (args->L3offset) m_adj(m, args->L3offset); #endif icmp6_error(m, ICMP6_DST_UNREACH, code, 0); } else FREE_PKT(m); args->m = NULL; } #endif /* INET6 */ /* * sends a reject message, consuming the mbuf passed as an argument. */ static void send_reject(struct ip_fw_args *args, int code, uint16_t mtu, int iplen, const struct ip *ip) { #if 0 /* XXX When ip is not guaranteed to be at mtod() we will * need to account for this */ * The mbuf will however be thrown away so we can adjust it. * Remember we did an m_pullup on it already so we * can make some assumptions about contiguousness. 
*/ if (args->L3offset) m_adj(m, args->L3offset); #endif if (code != ICMP_REJECT_RST && code != ICMP_REJECT_ABORT) { /* Send an ICMP unreach */ icmp_error(args->m, ICMP_UNREACH, code, 0L, mtu); } else if (code == ICMP_REJECT_RST && args->f_id.proto == IPPROTO_TCP) { struct tcphdr *const tcp = L3HDR(struct tcphdr, mtod(args->m, struct ip *)); if ( (tcp_get_flags(tcp) & TH_RST) == 0) { struct mbuf *m; m = ipfw_send_pkt(args->m, &(args->f_id), ntohl(tcp->th_seq), ntohl(tcp->th_ack), tcp_get_flags(tcp) | TH_RST); if (m != NULL) ip_output(m, NULL, NULL, 0, NULL, NULL); } FREE_PKT(args->m); } else if (code == ICMP_REJECT_ABORT && args->f_id.proto == IPPROTO_SCTP) { struct mbuf *m; struct sctphdr *sctp; struct sctp_chunkhdr *chunk; struct sctp_init *init; u_int32_t v_tag; int reflected; sctp = L3HDR(struct sctphdr, mtod(args->m, struct ip *)); reflected = 1; v_tag = ntohl(sctp->v_tag); if (iplen >= (ip->ip_hl << 2) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr)) { /* Look at the first chunk header if available */ chunk = (struct sctp_chunkhdr *)(sctp + 1); switch (chunk->chunk_type) { case SCTP_INITIATION: /* * Packets containing an INIT chunk MUST have * a zero v-tag. */ if (v_tag != 0) { v_tag = 0; break; } /* INIT chunk MUST NOT be bundled */ if (iplen > (ip->ip_hl << 2) + sizeof(struct sctphdr) + ntohs(chunk->chunk_length) + 3) { break; } /* Use the initiate tag if available */ if ((iplen >= (ip->ip_hl << 2) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + offsetof(struct sctp_init, a_rwnd))) { init = (struct sctp_init *)(chunk + 1); v_tag = ntohl(init->initiate_tag); reflected = 0; } break; case SCTP_ABORT_ASSOCIATION: /* * If the packet contains an ABORT chunk, don't * reply. * XXX: We should search through all chunks, * but do not do that to avoid attacks. 
*/ v_tag = 0; break; } } if (v_tag == 0) { m = NULL; } else { m = ipfw_send_abort(args->m, &(args->f_id), v_tag, reflected); } if (m != NULL) ip_output(m, NULL, NULL, 0, NULL, NULL); FREE_PKT(args->m); } else FREE_PKT(args->m); args->m = NULL; } /* * Support for uid/gid/jail lookup. These tests are expensive * (because we may need to look into the list of active sockets) * so we cache the results. ugid_lookupp is 0 if we have not * yet done a lookup, 1 if we succeeded, and -1 if we tried * and failed. The function always returns the match value. * We could actually spare the variable and use *uc, setting * it to '(void *)check_uidgid if we have no info, NULL if * we tried and failed, or any other value if successful. */ static int check_uidgid(ipfw_insn_u32 *insn, struct ip_fw_args *args, int *ugid_lookupp, struct ucred **uc) { #if defined(USERSPACE) return 0; // not supported in userspace #else #ifndef __FreeBSD__ /* XXX */ return cred_check(insn, proto, oif, dst_ip, dst_port, src_ip, src_port, (struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb); #else /* FreeBSD */ struct in_addr src_ip, dst_ip; struct inpcbinfo *pi; struct ipfw_flow_id *id; struct inpcb *pcb, *inp; int lookupflags; int match; id = &args->f_id; inp = args->inp; /* * Check to see if the UDP or TCP stack supplied us with * the PCB. If so, rather then holding a lock and looking * up the PCB, we can use the one that was supplied. */ if (inp && *ugid_lookupp == 0) { INP_LOCK_ASSERT(inp); if (inp->inp_socket != NULL) { *uc = crhold(inp->inp_cred); *ugid_lookupp = 1; } else *ugid_lookupp = -1; } /* * If we have already been here and the packet has no * PCB entry associated with it, then we can safely * assume that this is a no match. 
*/ if (*ugid_lookupp == -1) return (0); if (id->proto == IPPROTO_TCP) { lookupflags = 0; pi = &V_tcbinfo; } else if (id->proto == IPPROTO_UDP) { lookupflags = INPLOOKUP_WILDCARD; pi = &V_udbinfo; } else if (id->proto == IPPROTO_UDPLITE) { lookupflags = INPLOOKUP_WILDCARD; pi = &V_ulitecbinfo; } else return 0; lookupflags |= INPLOOKUP_RLOCKPCB; match = 0; if (*ugid_lookupp == 0) { if (id->addr_type == 6) { #ifdef INET6 if (args->flags & IPFW_ARGS_IN) pcb = in6_pcblookup_mbuf(pi, &id->src_ip6, htons(id->src_port), &id->dst_ip6, htons(id->dst_port), lookupflags, NULL, args->m); else pcb = in6_pcblookup_mbuf(pi, &id->dst_ip6, htons(id->dst_port), &id->src_ip6, htons(id->src_port), lookupflags, args->ifp, args->m); #else *ugid_lookupp = -1; return (0); #endif } else { src_ip.s_addr = htonl(id->src_ip); dst_ip.s_addr = htonl(id->dst_ip); if (args->flags & IPFW_ARGS_IN) pcb = in_pcblookup_mbuf(pi, src_ip, htons(id->src_port), dst_ip, htons(id->dst_port), lookupflags, NULL, args->m); else pcb = in_pcblookup_mbuf(pi, dst_ip, htons(id->dst_port), src_ip, htons(id->src_port), lookupflags, args->ifp, args->m); } if (pcb != NULL) { INP_RLOCK_ASSERT(pcb); *uc = crhold(pcb->inp_cred); *ugid_lookupp = 1; INP_RUNLOCK(pcb); } if (*ugid_lookupp == 0) { /* * We tried and failed, set the variable to -1 * so we will not try again on this packet. */ *ugid_lookupp = -1; return (0); } } if (insn->o.opcode == O_UID) match = ((*uc)->cr_uid == (uid_t)insn->d[0]); else if (insn->o.opcode == O_GID) match = groupmember((gid_t)insn->d[0], *uc); else if (insn->o.opcode == O_JAIL) match = ((*uc)->cr_prison->pr_id == (int)insn->d[0]); return (match); #endif /* __FreeBSD__ */ #endif /* not supported in userspace */ } /* * Helper function to set args with info on the rule after the matching * one. slot is precise, whereas we guess rule_id as they are * assigned sequentially. 
*/ static inline void set_match(struct ip_fw_args *args, int slot, struct ip_fw_chain *chain) { args->rule.chain_id = chain->id; args->rule.slot = slot + 1; /* we use 0 as a marker */ args->rule.rule_id = 1 + chain->map[slot]->id; args->rule.rulenum = chain->map[slot]->rulenum; args->flags |= IPFW_ARGS_REF; } -static int -jump_lookup_pos(struct ip_fw_chain *chain, struct ip_fw *f, int num, - int tablearg, int jump_backwards) +static uint32_t +jump_lookup_pos(struct ip_fw_chain *chain, struct ip_fw *f, uint32_t num, + int tablearg, bool jump_backwards) { int f_pos, i; + /* + * Make sure we do not jump backward. + */ i = IP_FW_ARG_TABLEARG(chain, num, skipto); - /* make sure we do not jump backward */ - if (jump_backwards == 0 && i <= f->rulenum) + if (!jump_backwards && i <= f->rulenum) i = f->rulenum + 1; -#ifndef LINEAR_SKIPTO - if (chain->idxmap != NULL) - f_pos = chain->idxmap[i]; - else + if (V_skipto_cache == 0) f_pos = ipfw_find_rule(chain, i, 0); -#else - f_pos = chain->idxmap[i]; -#endif /* LINEAR_SKIPTO */ + else { + /* + * Make sure we do not do out of bounds access. + */ + if (i >= IPFW_DEFAULT_RULE) + i = IPFW_DEFAULT_RULE - 1; + f_pos = chain->idxmap[i]; + } return (f_pos); } - -#ifndef LINEAR_SKIPTO -/* - * Helper function to enable cached rule lookups using - * cache.id and cache.pos fields in ipfw rule. - */ -static int -jump_cached(struct ip_fw_chain *chain, struct ip_fw *f, int num, - int tablearg, int jump_backwards) +static uint32_t +jump(struct ip_fw_chain *chain, struct ip_fw *f, uint32_t num, + int tablearg, bool jump_backwards) { int f_pos; /* Can't use cache with IP_FW_TARG */ if (num == IP_FW_TARG) return jump_lookup_pos(chain, f, num, tablearg, jump_backwards); /* * If possible use cached f_pos (in f->cache.pos), * whose version is written in f->cache.id (horrible hacks * to avoid changing the ABI). * * Multiple threads can execute the same rule simultaneously, * we need to ensure that cache.pos is updated before cache.id. 
*/ #ifdef __LP64__ struct ip_fw_jump_cache cache; cache.raw_value = f->cache.raw_value; if (cache.id == chain->id) return (cache.pos); f_pos = jump_lookup_pos(chain, f, num, tablearg, jump_backwards); cache.pos = f_pos; cache.id = chain->id; f->cache.raw_value = cache.raw_value; #else if (f->cache.id == chain->id) { /* Load pos after id */ atomic_thread_fence_acq(); return (f->cache.pos); } f_pos = jump_lookup_pos(chain, f, num, tablearg, jump_backwards); f->cache.pos = f_pos; /* Store id after pos */ atomic_thread_fence_rel(); f->cache.id = chain->id; #endif /* !__LP64__ */ return (f_pos); } -#endif /* !LINEAR_SKIPTO */ #define TARG(k, f) IP_FW_ARG_TABLEARG(chain, k, f) + +static inline int +tvalue_match(struct ip_fw_chain *ch, const ipfw_insn_table *cmd, + uint32_t tablearg) +{ + uint32_t tvalue; + + switch (IPFW_TVALUE_TYPE(&cmd->o)) { + case TVALUE_PIPE: + tvalue = TARG_VAL(ch, tablearg, pipe); + break; + case TVALUE_DIVERT: + tvalue = TARG_VAL(ch, tablearg, divert); + break; + case TVALUE_SKIPTO: + tvalue = TARG_VAL(ch, tablearg, skipto); + break; + case TVALUE_NETGRAPH: + tvalue = TARG_VAL(ch, tablearg, netgraph); + break; + case TVALUE_FIB: + tvalue = TARG_VAL(ch, tablearg, fib); + break; + case TVALUE_NAT: + tvalue = TARG_VAL(ch, tablearg, nat); + break; + case TVALUE_NH4: + tvalue = TARG_VAL(ch, tablearg, nh4); + break; + case TVALUE_DSCP: + tvalue = TARG_VAL(ch, tablearg, dscp); + break; + case TVALUE_LIMIT: + tvalue = TARG_VAL(ch, tablearg, limit); + break; + case TVALUE_MARK: + tvalue = TARG_VAL(ch, tablearg, mark); + break; + case TVALUE_TAG: + default: + tvalue = TARG_VAL(ch, tablearg, tag); + break; + } + return (tvalue == cmd->value); +} + /* * The main check routine for the firewall. * * All arguments are in args so we can modify them and return them * back to the caller. * * Parameters: * * args->m (in/out) The packet; we set to NULL when/if we nuke it. * Starts with the IP header. * args->L3offset Number of bytes bypassed if we came from L2. 
* e.g. often sizeof(eh) ** NOTYET ** * args->ifp Incoming or outgoing interface. * args->divert_rule (in/out) * Skip up to the first rule past this rule number; * upon return, non-zero port number for divert or tee. * * args->rule Pointer to the last matching rule (in/out) * args->next_hop Socket we are forwarding to (out). * args->next_hop6 IPv6 next hop we are forwarding to (out). * args->f_id Addresses grabbed from the packet (out) * args->rule.info a cookie depending on rule action * * Return value: * * IP_FW_PASS the packet must be accepted * IP_FW_DENY the packet must be dropped * IP_FW_DIVERT divert packet, port in m_tag * IP_FW_TEE tee packet, port in m_tag * IP_FW_DUMMYNET to dummynet, pipe in args->cookie * IP_FW_NETGRAPH into netgraph, cookie args->cookie * args->rule contains the matching rule, * args->rule.info has additional information. * */ int ipfw_chk(struct ip_fw_args *args) { /* * Local variables holding state while processing a packet: * * IMPORTANT NOTE: to speed up the processing of rules, there * are some assumption on the values of the variables, which * are documented here. Should you change them, please check * the implementation of the various instructions to make sure * that they still work. * * m | args->m Pointer to the mbuf, as received from the caller. * It may change if ipfw_chk() does an m_pullup, or if it * consumes the packet because it calls send_reject(). * XXX This has to change, so that ipfw_chk() never modifies * or consumes the buffer. * OR * args->mem Pointer to contigous memory chunk. * ip Is the beginning of the ip(4 or 6) header. * eh Ethernet header in case if input is Layer2. */ struct mbuf *m; struct ip *ip; struct ether_header *eh; /* * For rules which contain uid/gid or jail constraints, cache * a copy of the users credentials after the pcb lookup has been * executed. This will speed up the processing of rules with * these types of constraints, as well as decrease contention * on pcb related locks. 
*/ #ifndef __FreeBSD__ struct bsd_ucred ucred_cache; #else struct ucred *ucred_cache = NULL; #endif + uint32_t f_pos = 0; /* index of current rule in the array */ int ucred_lookup = 0; - int f_pos = 0; /* index of current rule in the array */ int retval = 0; struct ifnet *oif, *iif; /* * hlen The length of the IP header. */ u_int hlen = 0; /* hlen >0 means we have an IP pkt */ /* * offset The offset of a fragment. offset != 0 means that * we have a fragment at this offset of an IPv4 packet. * offset == 0 means that (if this is an IPv4 packet) * this is the first or only fragment. * For IPv6 offset|ip6f_mf == 0 means there is no Fragment Header * or there is a single packet fragment (fragment header added * without needed). We will treat a single packet fragment as if * there was no fragment header (or log/block depending on the * V_fw_permit_single_frag6 sysctl setting). */ u_short offset = 0; u_short ip6f_mf = 0; /* * Local copies of addresses. They are only valid if we have * an IP packet. * * proto The protocol. Set to 0 for non-ip packets, * or to the protocol read from the packet otherwise. * proto != 0 means that we have an IPv4 packet. * * src_port, dst_port port numbers, in HOST format. Only * valid for TCP and UDP packets. * * src_ip, dst_ip ip addresses, in NETWORK format. * Only valid for IPv4 packets. */ uint8_t proto; uint16_t src_port, dst_port; /* NOTE: host format */ struct in_addr src_ip, dst_ip; /* NOTE: network format */ int iplen = 0; int pktlen; struct ipfw_dyn_info dyn_info; struct ip_fw *q = NULL; struct ip_fw_chain *chain = &V_layer3_chain; /* * We store in ulp a pointer to the upper layer protocol header. * In the ipv4 case this is easy to determine from the header, * but for ipv6 we might have some additional headers in the middle. * ulp is NULL if not found. */ void *ulp = NULL; /* upper layer protocol pointer. 
*/ /* XXX ipv6 variables */ int is_ipv6 = 0; #ifdef INET6 uint8_t icmp6_type = 0; #endif uint16_t ext_hd = 0; /* bits vector for extension header filtering */ /* end of ipv6 variables */ int is_ipv4 = 0; int done = 0; /* flag to exit the outer loop */ IPFW_RLOCK_TRACKER; bool mem; bool need_send_reject = false; int reject_code; uint16_t reject_mtu; if ((mem = (args->flags & IPFW_ARGS_LENMASK))) { if (args->flags & IPFW_ARGS_ETHER) { eh = (struct ether_header *)args->mem; if (eh->ether_type == htons(ETHERTYPE_VLAN)) ip = (struct ip *) ((struct ether_vlan_header *)eh + 1); else ip = (struct ip *)(eh + 1); } else { eh = NULL; ip = (struct ip *)args->mem; } pktlen = IPFW_ARGS_LENGTH(args->flags); args->f_id.fib = args->ifp->if_fib; /* best guess */ } else { m = args->m; if (m->m_flags & M_SKIP_FIREWALL || (! V_ipfw_vnet_ready)) return (IP_FW_PASS); /* accept */ if (args->flags & IPFW_ARGS_ETHER) { /* We need some amount of data to be contiguous. */ if (m->m_len < min(m->m_pkthdr.len, max_protohdr) && (args->m = m = m_pullup(m, min(m->m_pkthdr.len, max_protohdr))) == NULL) goto pullup_failed; eh = mtod(m, struct ether_header *); ip = (struct ip *)(eh + 1); } else { eh = NULL; ip = mtod(m, struct ip *); } pktlen = m->m_pkthdr.len; args->f_id.fib = M_GETFIB(m); /* mbuf not altered */ } dst_ip.s_addr = 0; /* make sure it is initialized */ src_ip.s_addr = 0; /* make sure it is initialized */ src_port = dst_port = 0; DYN_INFO_INIT(&dyn_info); /* * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous, * then it sets p to point at the offset "len" in the mbuf. WARNING: the * pointer might become stale after other pullups (but we never use it * this way). */ #define PULLUP_TO(_len, p, T) PULLUP_LEN(_len, p, sizeof(T)) #define EHLEN (eh != NULL ? 
((char *)ip - (char *)eh) : 0) #define _PULLUP_LOCKED(_len, p, T, unlock) \ do { \ int x = (_len) + T + EHLEN; \ if (mem) { \ if (__predict_false(pktlen < x)) { \ unlock; \ goto pullup_failed; \ } \ p = (char *)args->mem + (_len) + EHLEN; \ } else { \ if (__predict_false((m)->m_len < x)) { \ args->m = m = m_pullup(m, x); \ if (m == NULL) { \ unlock; \ goto pullup_failed; \ } \ } \ p = mtod(m, char *) + (_len) + EHLEN; \ } \ } while (0) #define PULLUP_LEN(_len, p, T) _PULLUP_LOCKED(_len, p, T, ) #define PULLUP_LEN_LOCKED(_len, p, T) \ _PULLUP_LOCKED(_len, p, T, IPFW_PF_RUNLOCK(chain)); \ UPDATE_POINTERS() /* * In case pointers got stale after pullups, update them. */ #define UPDATE_POINTERS() \ do { \ if (!mem) { \ if (eh != NULL) { \ eh = mtod(m, struct ether_header *); \ ip = (struct ip *)(eh + 1); \ } else \ ip = mtod(m, struct ip *); \ args->m = m; \ } \ } while (0) /* Identify IP packets and fill up variables. */ if (pktlen >= sizeof(struct ip6_hdr) && (eh == NULL || eh->ether_type == htons(ETHERTYPE_IPV6)) && ip->ip_v == 6) { struct ip6_hdr *ip6 = (struct ip6_hdr *)ip; is_ipv6 = 1; args->flags |= IPFW_ARGS_IP6; hlen = sizeof(struct ip6_hdr); proto = ip6->ip6_nxt; /* Search extension headers to find upper layer protocols */ while (ulp == NULL && offset == 0) { switch (proto) { case IPPROTO_ICMPV6: PULLUP_TO(hlen, ulp, struct icmp6_hdr); #ifdef INET6 icmp6_type = ICMP6(ulp)->icmp6_type; #endif break; case IPPROTO_TCP: PULLUP_TO(hlen, ulp, struct tcphdr); dst_port = TCP(ulp)->th_dport; src_port = TCP(ulp)->th_sport; /* save flags for dynamic rules */ args->f_id._flags = tcp_get_flags(TCP(ulp)); break; case IPPROTO_SCTP: if (pktlen >= hlen + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + offsetof(struct sctp_init, a_rwnd)) PULLUP_LEN(hlen, ulp, sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + offsetof(struct sctp_init, a_rwnd)); else if (pktlen >= hlen + sizeof(struct sctphdr)) PULLUP_LEN(hlen, ulp, pktlen - hlen); else PULLUP_LEN(hlen, ulp, 
sizeof(struct sctphdr)); src_port = SCTP(ulp)->src_port; dst_port = SCTP(ulp)->dest_port; break; case IPPROTO_UDP: case IPPROTO_UDPLITE: PULLUP_TO(hlen, ulp, struct udphdr); dst_port = UDP(ulp)->uh_dport; src_port = UDP(ulp)->uh_sport; break; case IPPROTO_HOPOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); ext_hd |= EXT_HOPOPTS; hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_ROUTING: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_rthdr); switch (((struct ip6_rthdr *)ulp)->ip6r_type) { case 0: ext_hd |= EXT_RTHDR0; break; case 2: ext_hd |= EXT_RTHDR2; break; default: if (V_fw_verbose) printf("IPFW2: IPV6 - Unknown " "Routing Header type(%d)\n", ((struct ip6_rthdr *) ulp)->ip6r_type); if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); break; } ext_hd |= EXT_ROUTING; hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3; proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt; ulp = NULL; break; case IPPROTO_FRAGMENT: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_frag); ext_hd |= EXT_FRAGMENT; hlen += sizeof (struct ip6_frag); proto = ((struct ip6_frag *)ulp)->ip6f_nxt; offset = ((struct ip6_frag *)ulp)->ip6f_offlg & IP6F_OFF_MASK; ip6f_mf = ((struct ip6_frag *)ulp)->ip6f_offlg & IP6F_MORE_FRAG; if (V_fw_permit_single_frag6 == 0 && offset == 0 && ip6f_mf == 0) { if (V_fw_verbose) printf("IPFW2: IPV6 - Invalid " "Fragment Header\n"); if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); break; } args->f_id.extra = ntohl(((struct ip6_frag *)ulp)->ip6f_ident); ulp = NULL; break; case IPPROTO_DSTOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); ext_hd |= EXT_DSTOPTS; hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_AH: /* RFC 2402 */ PULLUP_TO(hlen, ulp, struct ip6_ext); ext_hd |= EXT_AH; hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2; proto = ((struct ip6_ext *)ulp)->ip6e_nxt; ulp = NULL; break; 
case IPPROTO_ESP: /* RFC 2406 */ PULLUP_TO(hlen, ulp, uint32_t); /* SPI, Seq# */ /* Anything past Seq# is variable length and * data past this ext. header is encrypted. */ ext_hd |= EXT_ESP; break; case IPPROTO_NONE: /* RFC 2460 */ /* * Packet ends here, and IPv6 header has * already been pulled up. If ip6e_len!=0 * then octets must be ignored. */ ulp = ip; /* non-NULL to get out of loop. */ break; case IPPROTO_OSPFIGP: /* XXX OSPF header check? */ PULLUP_TO(hlen, ulp, struct ip6_ext); break; case IPPROTO_PIM: /* XXX PIM header check? */ PULLUP_TO(hlen, ulp, struct pim); break; case IPPROTO_GRE: /* RFC 1701 */ /* XXX GRE header check? */ PULLUP_TO(hlen, ulp, struct grehdr); break; case IPPROTO_CARP: PULLUP_TO(hlen, ulp, offsetof( struct carp_header, carp_counter)); if (CARP_ADVERTISEMENT != ((struct carp_header *)ulp)->carp_type) return (IP_FW_DENY); break; case IPPROTO_IPV6: /* RFC 2893 */ PULLUP_TO(hlen, ulp, struct ip6_hdr); break; case IPPROTO_IPV4: /* RFC 2893 */ PULLUP_TO(hlen, ulp, struct ip); break; case IPPROTO_PFSYNC: PULLUP_TO(hlen, ulp, struct pfsync_header); break; default: if (V_fw_verbose) printf("IPFW2: IPV6 - Unknown " "Extension Header(%d), ext_hd=%x\n", proto, ext_hd); if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); PULLUP_TO(hlen, ulp, struct ip6_ext); break; } /*switch */ } UPDATE_POINTERS(); ip6 = (struct ip6_hdr *)ip; args->f_id.addr_type = 6; args->f_id.src_ip6 = ip6->ip6_src; args->f_id.dst_ip6 = ip6->ip6_dst; args->f_id.flow_id6 = ntohl(ip6->ip6_flow); iplen = ntohs(ip6->ip6_plen) + sizeof(*ip6); } else if (pktlen >= sizeof(struct ip) && (eh == NULL || eh->ether_type == htons(ETHERTYPE_IP)) && ip->ip_v == 4) { is_ipv4 = 1; args->flags |= IPFW_ARGS_IP4; hlen = ip->ip_hl << 2; /* * Collect parameters into local variables for faster * matching. 
*/ proto = ip->ip_p; src_ip = ip->ip_src; dst_ip = ip->ip_dst; offset = ntohs(ip->ip_off) & IP_OFFMASK; iplen = ntohs(ip->ip_len); if (offset == 0) { switch (proto) { case IPPROTO_TCP: PULLUP_TO(hlen, ulp, struct tcphdr); dst_port = TCP(ulp)->th_dport; src_port = TCP(ulp)->th_sport; /* save flags for dynamic rules */ args->f_id._flags = tcp_get_flags(TCP(ulp)); break; case IPPROTO_SCTP: if (pktlen >= hlen + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + offsetof(struct sctp_init, a_rwnd)) PULLUP_LEN(hlen, ulp, sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + offsetof(struct sctp_init, a_rwnd)); else if (pktlen >= hlen + sizeof(struct sctphdr)) PULLUP_LEN(hlen, ulp, pktlen - hlen); else PULLUP_LEN(hlen, ulp, sizeof(struct sctphdr)); src_port = SCTP(ulp)->src_port; dst_port = SCTP(ulp)->dest_port; break; case IPPROTO_UDP: case IPPROTO_UDPLITE: PULLUP_TO(hlen, ulp, struct udphdr); dst_port = UDP(ulp)->uh_dport; src_port = UDP(ulp)->uh_sport; break; case IPPROTO_ICMP: PULLUP_TO(hlen, ulp, struct icmphdr); //args->f_id.flags = ICMP(ulp)->icmp_type; break; default: break; } } else { if (offset == 1 && proto == IPPROTO_TCP) { /* RFC 3128 */ goto pullup_failed; } } UPDATE_POINTERS(); args->f_id.addr_type = 4; args->f_id.src_ip = ntohl(src_ip.s_addr); args->f_id.dst_ip = ntohl(dst_ip.s_addr); } else { proto = 0; dst_ip.s_addr = src_ip.s_addr = 0; args->f_id.addr_type = 1; /* XXX */ } #undef PULLUP_TO pktlen = iplen < pktlen ? iplen: pktlen; /* Properly initialize the rest of f_id */ args->f_id.proto = proto; args->f_id.src_port = src_port = ntohs(src_port); args->f_id.dst_port = dst_port = ntohs(dst_port); IPFW_PF_RLOCK(chain); if (! V_ipfw_vnet_ready) { /* shutting down, leave NOW. */ IPFW_PF_RUNLOCK(chain); return (IP_FW_PASS); /* accept */ } if (args->flags & IPFW_ARGS_REF) { /* * Packet has already been tagged as a result of a previous * match on rule args->rule aka args->rule_id (PIPE, QUEUE, * REASS, NETGRAPH, DIVERT/TEE...) 
* Validate the slot and continue from the next one * if still present, otherwise do a lookup. */ f_pos = (args->rule.chain_id == chain->id) ? args->rule.slot : ipfw_find_rule(chain, args->rule.rulenum, args->rule.rule_id); } else { f_pos = 0; } if (args->flags & IPFW_ARGS_IN) { iif = args->ifp; oif = NULL; } else { MPASS(args->flags & IPFW_ARGS_OUT); iif = mem ? NULL : m_rcvif(m); oif = args->ifp; } /* * Now scan the rules, and parse microinstructions for each rule. * We have two nested loops and an inner switch. Sometimes we * need to break out of one or both loops, or re-enter one of * the loops with updated variables. Loop variables are: * * f_pos (outer loop) points to the current rule. * On output it points to the matching rule. * done (outer loop) is used as a flag to break the loop. * l (inner loop) residual length of current rule. * cmd points to the current microinstruction. * * We break the inner loop by setting l=0 and possibly * cmdlen=0 if we don't want to advance cmd. * We break the outer loop by setting done=1 * We can restart the inner loop by setting l>0 and f_pos, f, cmd * as needed. */ for (; f_pos < chain->n_rules; f_pos++) { ipfw_insn *cmd; uint32_t tablearg = 0; int l, cmdlen, skip_or; /* skip rest of OR block */ struct ip_fw *f; f = chain->map[f_pos]; if (V_set_disable & (1 << f->set) ) continue; skip_or = 0; for (l = f->cmd_len, cmd = f->cmd ; l > 0 ; l -= cmdlen, cmd += cmdlen) { int match; /* * check_body is a jump target used when we find a * CHECK_STATE, and need to jump to the body of * the target rule. */ /* check_body: */ cmdlen = F_LEN(cmd); /* * An OR block (insn_1 || .. || insn_n) has the * F_OR bit set in all but the last instruction. * The first match will set "skip_or", and cause * the following instructions to be skipped until * past the one with the F_OR bit clear. 
*/ if (skip_or) { /* skip this instruction */ if ((cmd->len & F_OR) == 0) skip_or = 0; /* next one is good */ continue; } match = 0; /* set to 1 if we succeed */ switch (cmd->opcode) { /* * The first set of opcodes compares the packet's * fields with some pattern, setting 'match' if a * match is found. At the end of the loop there is * logic to deal with F_NOT and F_OR flags associated * with the opcode. */ case O_NOP: match = 1; break; case O_FORWARD_MAC: printf("ipfw: opcode %d unimplemented\n", cmd->opcode); break; case O_GID: case O_UID: case O_JAIL: /* * We only check offset == 0 && proto != 0, * as this ensures that we have a * packet with the ports info. */ if (offset != 0) break; if (proto == IPPROTO_TCP || proto == IPPROTO_UDP || proto == IPPROTO_UDPLITE) match = check_uidgid( (ipfw_insn_u32 *)cmd, args, &ucred_lookup, #ifdef __FreeBSD__ &ucred_cache); #else (void *)&ucred_cache); #endif break; case O_RECV: match = iface_match(iif, (ipfw_insn_if *)cmd, chain, &tablearg); break; case O_XMIT: match = iface_match(oif, (ipfw_insn_if *)cmd, chain, &tablearg); break; case O_VIA: match = iface_match(args->ifp, (ipfw_insn_if *)cmd, chain, &tablearg); break; case O_MACADDR2: if (args->flags & IPFW_ARGS_ETHER) { u_int32_t *want = (u_int32_t *) ((ipfw_insn_mac *)cmd)->addr; u_int32_t *mask = (u_int32_t *) ((ipfw_insn_mac *)cmd)->mask; u_int32_t *hdr = (u_int32_t *)eh; match = ( want[0] == (hdr[0] & mask[0]) && want[1] == (hdr[1] & mask[1]) && want[2] == (hdr[2] & mask[2]) ); } break; case O_MAC_TYPE: if (args->flags & IPFW_ARGS_ETHER) { u_int16_t *p = ((ipfw_insn_u16 *)cmd)->ports; int i; for (i = cmdlen - 1; !match && i>0; i--, p += 2) match = (ntohs(eh->ether_type) >= p[0] && ntohs(eh->ether_type) <= p[1]); } break; case O_FRAG: if (is_ipv4) { /* * Since flags_match() works with * uint8_t we pack ip_off into 8 bits. * For this match offset is a boolean. 
*/ match = flags_match(cmd, ((ntohs(ip->ip_off) & ~IP_OFFMASK) >> 8) | (offset != 0)); } else { /* * Compatibility: historically bare * "frag" would match IPv6 fragments. */ match = (cmd->arg1 == 0x1 && (offset != 0)); } break; case O_IN: /* "out" is "not in" */ match = (oif == NULL); break; case O_LAYER2: match = (args->flags & IPFW_ARGS_ETHER); break; case O_DIVERTED: if ((args->flags & IPFW_ARGS_REF) == 0) break; /* * For diverted packets, args->rule.info * contains the divert port (in host format) * reason and direction. */ match = ((args->rule.info & IPFW_IS_MASK) == IPFW_IS_DIVERT) && ( ((args->rule.info & IPFW_INFO_IN) ? 1: 2) & cmd->arg1); break; case O_PROTO: /* * We do not allow an arg of 0 so the * check of "proto" only suffices. */ match = (proto == cmd->arg1); break; case O_IP_SRC: match = is_ipv4 && (((ipfw_insn_ip *)cmd)->addr.s_addr == src_ip.s_addr); break; case O_IP_DST_LOOKUP: - { - if (cmdlen > F_INSN_SIZE(ipfw_insn_u32)) { - void *pkey; - uint32_t vidx, key; - uint16_t keylen = 0; /* zero if can't match the packet */ - - /* Determine lookup key type */ - vidx = ((ipfw_insn_u32 *)cmd)->d[1]; - switch (vidx) { - case LOOKUP_DST_IP: - case LOOKUP_SRC_IP: - /* Need IP frame */ - if (is_ipv6 == 0 && is_ipv4 == 0) - break; - if (vidx == LOOKUP_DST_IP) - pkey = is_ipv6 ? - (void *)&args->f_id.dst_ip6: - (void *)&dst_ip; + if (IPFW_LOOKUP_TYPE(cmd) != LOOKUP_NONE) { + void *pkey = NULL; + uint32_t key, vidx; + uint16_t keylen = 0; /* zero if can't match the packet */ + uint8_t lookup_type; + + lookup_type = IPFW_LOOKUP_TYPE(cmd); + + switch (lookup_type) { + case LOOKUP_DST_IP: + case LOOKUP_SRC_IP: + if (is_ipv4) { + keylen = sizeof(in_addr_t); + if (lookup_type == LOOKUP_DST_IP) + pkey = &dst_ip; else - pkey = is_ipv6 ? - (void *)&args->f_id.src_ip6: - (void *)&src_ip; - keylen = is_ipv6 ? 
- sizeof(struct in6_addr): - sizeof(in_addr_t); - break; - case LOOKUP_DST_PORT: - case LOOKUP_SRC_PORT: - /* Need IP frame */ - if (is_ipv6 == 0 && is_ipv4 == 0) - break; - /* Skip fragments */ - if (offset != 0) - break; - /* Skip proto without ports */ - if (proto != IPPROTO_TCP && - proto != IPPROTO_UDP && - proto != IPPROTO_UDPLITE && - proto != IPPROTO_SCTP) - break; - key = vidx == LOOKUP_DST_PORT ? - dst_port: - src_port; - pkey = &key; - keylen = sizeof(key); - break; - case LOOKUP_UID: - case LOOKUP_JAIL: - check_uidgid( - (ipfw_insn_u32 *)cmd, - args, &ucred_lookup, - &ucred_cache); - key = vidx == LOOKUP_UID ? - ucred_cache->cr_uid: - ucred_cache->cr_prison->pr_id; - pkey = &key; - keylen = sizeof(key); - break; - case LOOKUP_DSCP: - /* Need IP frame */ - if (is_ipv6 == 0 && is_ipv4 == 0) - break; - if (is_ipv6) - key = IPV6_DSCP( - (struct ip6_hdr *)ip) >> 2; + pkey = &src_ip; + } else if (is_ipv6) { + keylen = sizeof(struct in6_addr); + if (lookup_type == LOOKUP_DST_IP) + pkey = &args->f_id.dst_ip6; else - key = ip->ip_tos >> 2; - pkey = &key; - keylen = sizeof(key); + pkey = &args->f_id.src_ip6; + } else /* only for L3 */ break; - case LOOKUP_DST_MAC: - case LOOKUP_SRC_MAC: - /* Need ether frame */ - if ((args->flags & IPFW_ARGS_ETHER) == 0) - break; - pkey = vidx == LOOKUP_DST_MAC ? 
- eh->ether_dhost: - eh->ether_shost; - keylen = ETHER_ADDR_LEN; + case LOOKUP_DSCP: + if (is_ipv4) + key = ip->ip_tos >> 2; + else if (is_ipv6) + key = IPV6_DSCP( + (struct ip6_hdr *)ip) >> 2; + else + break; /* only for L3 */ + + key &= 0x3f; + if (cmdlen == F_INSN_SIZE(ipfw_insn_table)) + key &= insntod(cmd, table)->value; + pkey = &key; + keylen = sizeof(key); + break; + case LOOKUP_DST_PORT: + case LOOKUP_SRC_PORT: + /* only for L3 */ + if (is_ipv6 == 0 && is_ipv4 == 0) { break; - case LOOKUP_MARK: - key = args->rule.pkt_mark; - pkey = &key; - keylen = sizeof(key); + } + /* Skip fragments */ + if (offset != 0) { break; } - if (keylen == 0) + /* Skip proto without ports */ + if (proto != IPPROTO_TCP && + proto != IPPROTO_UDP && + proto != IPPROTO_UDPLITE && + proto != IPPROTO_SCTP) break; - match = ipfw_lookup_table(chain, - cmd->arg1, keylen, pkey, &vidx); - if (!match) + if (lookup_type == LOOKUP_DST_PORT) + key = dst_port; + else + key = src_port; + pkey = &key; + if (cmdlen == F_INSN_SIZE(ipfw_insn_table)) + key &= insntod(cmd, table)->value; + keylen = sizeof(key); + break; + case LOOKUP_DST_MAC: + case LOOKUP_SRC_MAC: + /* only for L2 */ + if ((args->flags & IPFW_ARGS_ETHER) == 0) break; - tablearg = vidx; + + pkey = lookup_type == LOOKUP_DST_MAC ? 
+ eh->ether_dhost : eh->ether_shost; + keylen = ETHER_ADDR_LEN; + break; +#ifndef USERSPACE + case LOOKUP_UID: + case LOOKUP_JAIL: + check_uidgid(insntod(cmd, u32), + args, &ucred_lookup, +#ifdef __FreeBSD__ + &ucred_cache); + if (lookup_type == LOOKUP_UID) + key = ucred_cache->cr_uid; + else if (lookup_type == LOOKUP_JAIL) + key = ucred_cache->cr_prison->pr_id; +#else /* !__FreeBSD__ */ + (void *)&ucred_cache); + if (lookup_type == LOOKUP_UID) + key = ucred_cache.uid; + else if (lookup_type == LOOKUP_JAIL) + key = ucred_cache.xid; +#endif /* !__FreeBSD__ */ + pkey = &key; + if (cmdlen == F_INSN_SIZE(ipfw_insn_table)) + key &= insntod(cmd, table)->value; + keylen = sizeof(key); + break; +#endif /* !USERSPACE */ + case LOOKUP_MARK: + key = args->rule.pkt_mark; + if (cmdlen == F_INSN_SIZE(ipfw_insn_table)) + key &= insntod(cmd, table)->value; + pkey = &key; + keylen = sizeof(key); + break; + case LOOKUP_RULENUM: + key = f->rulenum; + if (cmdlen == F_INSN_SIZE(ipfw_insn_table)) + key &= insntod(cmd, table)->value; + pkey = &key; + keylen = sizeof(key); break; } - /* cmdlen =< F_INSN_SIZE(ipfw_insn_u32) */ - /* FALLTHROUGH */ + /* unknown key type */ + if (keylen == 0) + break; + match = ipfw_lookup_table(chain, + insntod(cmd, kidx)->kidx, keylen, + pkey, &vidx); + + if (match) + tablearg = vidx; + break; } + /* LOOKUP_NONE */ + /* FALLTHROUGH */ case O_IP_SRC_LOOKUP: { void *pkey; uint32_t vidx; uint16_t keylen; if (is_ipv4) { keylen = sizeof(in_addr_t); if (cmd->opcode == O_IP_DST_LOOKUP) pkey = &dst_ip; else pkey = &src_ip; } else if (is_ipv6) { keylen = sizeof(struct in6_addr); if (cmd->opcode == O_IP_DST_LOOKUP) pkey = &args->f_id.dst_ip6; else pkey = &args->f_id.src_ip6; } else break; - match = ipfw_lookup_table(chain, cmd->arg1, + match = ipfw_lookup_table(chain, + insntod(cmd, kidx)->kidx, keylen, pkey, &vidx); if (!match) break; - if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) { - match = ((ipfw_insn_u32 *)cmd)->d[0] == - TARG_VAL(chain, vidx, tag); + if (cmdlen == 
F_INSN_SIZE(ipfw_insn_table)) { + match = tvalue_match(chain, + insntod(cmd, table), vidx); if (!match) break; } tablearg = vidx; break; } case O_MAC_SRC_LOOKUP: case O_MAC_DST_LOOKUP: { void *pkey; uint32_t vidx; uint16_t keylen = ETHER_ADDR_LEN; /* Need ether frame */ if ((args->flags & IPFW_ARGS_ETHER) == 0) break; if (cmd->opcode == O_MAC_DST_LOOKUP) pkey = eh->ether_dhost; else pkey = eh->ether_shost; - match = ipfw_lookup_table(chain, cmd->arg1, + match = ipfw_lookup_table(chain, + insntod(cmd, kidx)->kidx, keylen, pkey, &vidx); if (!match) break; - if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) { - match = ((ipfw_insn_u32 *)cmd)->d[0] == - TARG_VAL(chain, vidx, tag); + if (cmdlen == F_INSN_SIZE(ipfw_insn_table)) { + match = tvalue_match(chain, + insntod(cmd, table), vidx); if (!match) break; } tablearg = vidx; break; } case O_IP_FLOW_LOOKUP: - { - uint32_t v = 0; - match = ipfw_lookup_table(chain, - cmd->arg1, 0, &args->f_id, &v); - if (!match) - break; - if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) - match = ((ipfw_insn_u32 *)cmd)->d[0] == - TARG_VAL(chain, v, tag); - if (match) - tablearg = v; - } + { + uint32_t vidx = 0; + + match = ipfw_lookup_table(chain, + insntod(cmd, kidx)->kidx, 0, + &args->f_id, &vidx); + if (!match) + break; + if (cmdlen == F_INSN_SIZE(ipfw_insn_table)) + match = tvalue_match(chain, + insntod(cmd, table), vidx); + if (match) + tablearg = vidx; break; + } + case O_IP_SRC_MASK: case O_IP_DST_MASK: if (is_ipv4) { uint32_t a = (cmd->opcode == O_IP_DST_MASK) ? dst_ip.s_addr : src_ip.s_addr; uint32_t *p = ((ipfw_insn_u32 *)cmd)->d; int i = cmdlen-1; for (; !match && i>0; i-= 2, p+= 2) match = (p[0] == (a & p[1])); } break; case O_IP_SRC_ME: if (is_ipv4) { match = in_localip(src_ip); break; } #ifdef INET6 /* FALLTHROUGH */ case O_IP6_SRC_ME: match = is_ipv6 && ipfw_localip6(&args->f_id.src_ip6); #endif break; case O_IP_DST_SET: case O_IP_SRC_SET: if (is_ipv4) { u_int32_t *d = (u_int32_t *)(cmd+1); u_int32_t addr = cmd->opcode == O_IP_DST_SET ? 
args->f_id.dst_ip : args->f_id.src_ip; if (addr < d[0]) break; addr -= d[0]; /* subtract base */ match = (addr < cmd->arg1) && ( d[ 1 + (addr>>5)] & (1<<(addr & 0x1f)) ); } break; case O_IP_DST: match = is_ipv4 && (((ipfw_insn_ip *)cmd)->addr.s_addr == dst_ip.s_addr); break; case O_IP_DST_ME: if (is_ipv4) { match = in_localip(dst_ip); break; } #ifdef INET6 /* FALLTHROUGH */ case O_IP6_DST_ME: match = is_ipv6 && ipfw_localip6(&args->f_id.dst_ip6); #endif break; case O_IP_SRCPORT: case O_IP_DSTPORT: /* * offset == 0 && proto != 0 is enough * to guarantee that we have a * packet with port info. */ if ((proto == IPPROTO_UDP || proto == IPPROTO_UDPLITE || proto == IPPROTO_TCP || proto == IPPROTO_SCTP) && offset == 0) { u_int16_t x = (cmd->opcode == O_IP_SRCPORT) ? src_port : dst_port ; u_int16_t *p = ((ipfw_insn_u16 *)cmd)->ports; int i; for (i = cmdlen - 1; !match && i>0; i--, p += 2) match = (x>=p[0] && x<=p[1]); } break; case O_ICMPTYPE: match = (offset == 0 && proto==IPPROTO_ICMP && icmptype_match(ICMP(ulp), (ipfw_insn_u32 *)cmd) ); break; #ifdef INET6 case O_ICMP6TYPE: match = is_ipv6 && offset == 0 && proto==IPPROTO_ICMPV6 && icmp6type_match( ICMP6(ulp)->icmp6_type, (ipfw_insn_u32 *)cmd); break; #endif /* INET6 */ case O_IPOPT: match = (is_ipv4 && ipopts_match(ip, cmd) ); break; case O_IPVER: match = ((is_ipv4 || is_ipv6) && cmd->arg1 == ip->ip_v); break; case O_IPID: case O_IPTTL: if (!is_ipv4) break; case O_IPLEN: { /* only for IP packets */ uint16_t x; uint16_t *p; int i; if (cmd->opcode == O_IPLEN) x = iplen; else if (cmd->opcode == O_IPTTL) x = ip->ip_ttl; else /* must be IPID */ x = ntohs(ip->ip_id); if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* otherwise we have ranges */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i>0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_IPPRECEDENCE: match = (is_ipv4 && (cmd->arg1 == (ip->ip_tos & 0xe0)) ); break; case O_IPTOS: match = (is_ipv4 && flags_match(cmd, ip->ip_tos)); 
break; case O_DSCP: { uint32_t *p; uint16_t x; p = ((ipfw_insn_u32 *)cmd)->d; if (is_ipv4) x = ip->ip_tos >> 2; else if (is_ipv6) { x = IPV6_DSCP( (struct ip6_hdr *)ip) >> 2; x &= 0x3f; } else break; /* DSCP bitmask is stored as low_u32 high_u32 */ if (x >= 32) match = *(p + 1) & (1 << (x - 32)); else match = *p & (1 << x); } break; case O_TCPDATALEN: if (proto == IPPROTO_TCP && offset == 0) { struct tcphdr *tcp; uint16_t x; uint16_t *p; int i; #ifdef INET6 if (is_ipv6) { struct ip6_hdr *ip6; ip6 = (struct ip6_hdr *)ip; if (ip6->ip6_plen == 0) { /* * Jumbo payload is not * supported by this * opcode. */ break; } x = iplen - hlen; } else #endif /* INET6 */ x = iplen - (ip->ip_hl << 2); tcp = TCP(ulp); x -= tcp->th_off << 2; if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* otherwise we have ranges */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i>0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_TCPFLAGS: /* * Note that this is currently only set up to * match the lower 8 TCP header flag bits, not * the full compliment of all 12 flags. */ match = (proto == IPPROTO_TCP && offset == 0 && flags_match(cmd, tcp_get_flags(TCP(ulp)))); break; case O_TCPOPTS: if (proto == IPPROTO_TCP && offset == 0 && ulp){ PULLUP_LEN_LOCKED(hlen, ulp, (TCP(ulp)->th_off << 2)); match = tcpopts_match(TCP(ulp), cmd); } break; case O_TCPSEQ: match = (proto == IPPROTO_TCP && offset == 0 && ((ipfw_insn_u32 *)cmd)->d[0] == TCP(ulp)->th_seq); break; case O_TCPACK: match = (proto == IPPROTO_TCP && offset == 0 && ((ipfw_insn_u32 *)cmd)->d[0] == TCP(ulp)->th_ack); break; case O_TCPMSS: if (proto == IPPROTO_TCP && (args->f_id._flags & TH_SYN) != 0 && ulp != NULL) { uint16_t mss, *p; int i; PULLUP_LEN_LOCKED(hlen, ulp, (TCP(ulp)->th_off << 2)); if ((tcpopts_parse(TCP(ulp), &mss) & IP_FW_TCPOPT_MSS) == 0) break; if (cmdlen == 1) { match = (cmd->arg1 == mss); break; } /* Otherwise we have ranges. 
*/ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i > 0; i--, p += 2) match = (mss >= p[0] && mss <= p[1]); } break; case O_TCPWIN: if (proto == IPPROTO_TCP && offset == 0) { uint16_t x; uint16_t *p; int i; x = ntohs(TCP(ulp)->th_win); if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* Otherwise we have ranges. */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i > 0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_ESTAB: /* reject packets which have SYN only */ /* XXX should i also check for TH_ACK ? */ match = (proto == IPPROTO_TCP && offset == 0 && (tcp_get_flags(TCP(ulp)) & (TH_RST | TH_ACK | TH_SYN)) != TH_SYN); break; case O_ALTQ: { struct pf_mtag *at; struct m_tag *mtag; ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; /* * ALTQ uses mbuf tags from another * packet filtering system - pf(4). * We allocate a tag in its format * and fill it in, pretending to be pf(4). */ match = 1; at = pf_find_mtag(m); if (at != NULL && at->qid != 0) break; mtag = m_tag_get(PACKET_TAG_PF, sizeof(struct pf_mtag), M_NOWAIT | M_ZERO); if (mtag == NULL) { /* * Let the packet fall back to the * default ALTQ. */ break; } m_tag_prepend(m, mtag); at = (struct pf_mtag *)(mtag + 1); at->qid = altq->qid; at->hdr = ip; break; } case O_LOG: ipfw_log(chain, f, hlen, args, - offset | ip6f_mf, tablearg, ip); + offset | ip6f_mf, tablearg, ip, eh); match = 1; break; case O_PROB: match = (random()<((ipfw_insn_u32 *)cmd)->d[0]); break; case O_VERREVPATH: /* Outgoing packets automatically pass/match */ match = (args->flags & IPFW_ARGS_OUT || ( #ifdef INET6 is_ipv6 ? verify_path6(&(args->f_id.src_ip6), iif, args->f_id.fib) : #endif verify_path(src_ip, iif, args->f_id.fib))); break; case O_VERSRCREACH: /* Outgoing packets automatically pass/match */ match = (hlen > 0 && ((oif != NULL) || ( #ifdef INET6 is_ipv6 ? 
verify_path6(&(args->f_id.src_ip6), NULL, args->f_id.fib) : #endif verify_path(src_ip, NULL, args->f_id.fib)))); break; case O_ANTISPOOF: /* Outgoing packets automatically pass/match */ if (oif == NULL && hlen > 0 && ( (is_ipv4 && in_localaddr(src_ip)) #ifdef INET6 || (is_ipv6 && in6_localaddr(&(args->f_id.src_ip6))) #endif )) match = #ifdef INET6 is_ipv6 ? verify_path6( &(args->f_id.src_ip6), iif, args->f_id.fib) : #endif verify_path(src_ip, iif, args->f_id.fib); else match = 1; break; case O_IPSEC: match = (m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL); /* otherwise no match */ break; #ifdef INET6 case O_IP6_SRC: match = is_ipv6 && IN6_ARE_ADDR_EQUAL(&args->f_id.src_ip6, &((ipfw_insn_ip6 *)cmd)->addr6); break; case O_IP6_DST: match = is_ipv6 && IN6_ARE_ADDR_EQUAL(&args->f_id.dst_ip6, &((ipfw_insn_ip6 *)cmd)->addr6); break; case O_IP6_SRC_MASK: case O_IP6_DST_MASK: if (is_ipv6) { int i = cmdlen - 1; struct in6_addr p; struct in6_addr *d = &((ipfw_insn_ip6 *)cmd)->addr6; for (; !match && i > 0; d += 2, i -= F_INSN_SIZE(struct in6_addr) * 2) { p = (cmd->opcode == O_IP6_SRC_MASK) ? args->f_id.src_ip6: args->f_id.dst_ip6; APPLY_MASK(&p, &d[1]); match = IN6_ARE_ADDR_EQUAL(&d[0], &p); } } break; case O_FLOW6ID: match = is_ipv6 && flow6id_match(args->f_id.flow_id6, (ipfw_insn_u32 *) cmd); break; case O_EXT_HDR: match = is_ipv6 && (ext_hd & ((ipfw_insn *) cmd)->arg1); break; case O_IP6: match = is_ipv6; break; #endif case O_IP4: match = is_ipv4; break; case O_TAG: { struct m_tag *mtag; uint32_t tag = TARG(cmd->arg1, tag); /* Packet is already tagged with this tag? */ mtag = m_tag_locate(m, MTAG_IPFW, tag, NULL); /* We have `untag' action when F_NOT flag is * present. And we must remove this mtag from * mbuf and reset `match' to zero (`match' will * be inversed later). * Otherwise we should allocate new mtag and * push it into mbuf. 
*/ if (cmd->len & F_NOT) { /* `untag' action */ if (mtag != NULL) m_tag_delete(m, mtag); match = 0; } else { if (mtag == NULL) { mtag = m_tag_alloc( MTAG_IPFW, tag, 0, M_NOWAIT); if (mtag != NULL) m_tag_prepend(m, mtag); } match = 1; } break; } case O_FIB: /* try match the specified fib */ if (args->f_id.fib == cmd->arg1) match = 1; break; case O_SOCKARG: { #ifndef USERSPACE /* not supported in userspace */ struct inpcb *inp = args->inp; struct inpcbinfo *pi; bool inp_locked = false; if (proto == IPPROTO_TCP) pi = &V_tcbinfo; else if (proto == IPPROTO_UDP) pi = &V_udbinfo; else if (proto == IPPROTO_UDPLITE) pi = &V_ulitecbinfo; else break; /* * XXXRW: so_user_cookie should almost * certainly be inp_user_cookie? */ /* * For incoming packet lookup the inpcb * using the src/dest ip/port tuple. */ if (is_ipv4 && inp == NULL) { inp = in_pcblookup(pi, src_ip, htons(src_port), dst_ip, htons(dst_port), INPLOOKUP_RLOCKPCB, NULL); inp_locked = true; } #ifdef INET6 if (is_ipv6 && inp == NULL) { inp = in6_pcblookup(pi, &args->f_id.src_ip6, htons(src_port), &args->f_id.dst_ip6, htons(dst_port), INPLOOKUP_RLOCKPCB, NULL); inp_locked = true; } #endif /* INET6 */ if (inp != NULL) { if (inp->inp_socket) { tablearg = inp->inp_socket->so_user_cookie; if (tablearg) match = 1; } if (inp_locked) INP_RUNLOCK(inp); } #endif /* !USERSPACE */ break; } case O_TAGGED: { struct m_tag *mtag; uint32_t tag = TARG(cmd->arg1, tag); if (cmdlen == 1) { match = m_tag_locate(m, MTAG_IPFW, tag, NULL) != NULL; break; } /* we have ranges */ for (mtag = m_tag_first(m); mtag != NULL && !match; mtag = m_tag_next(m, mtag)) { uint16_t *p; int i; if (mtag->m_tag_cookie != MTAG_IPFW) continue; p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for(; !match && i > 0; i--, p += 2) match = mtag->m_tag_id >= p[0] && mtag->m_tag_id <= p[1]; } break; } case O_MARK: { uint32_t mark; if (cmd->arg1 == IP_FW_TARG) mark = TARG_VAL(chain, tablearg, mark); else - mark = ((ipfw_insn_u32 *)cmd)->d[0]; + mark = insntoc(cmd, 
u32)->d[0]; match = (args->rule.pkt_mark & - ((ipfw_insn_u32 *)cmd)->d[1]) == - (mark & ((ipfw_insn_u32 *)cmd)->d[1]); + insntoc(cmd, u32)->d[1]) == + (mark & insntoc(cmd, u32)->d[1]); break; } - + /* * The second set of opcodes represents 'actions', * i.e. the terminal part of a rule once the packet * matches all previous patterns. * Typically there is only one action for each rule, * and the opcode is stored at the end of the rule * (but there are exceptions -- see below). * * In general, here we set retval and terminate the * outer loop (would be a 'break 3' in some language, * but we need to set l=0, done=1) * * Exceptions: * O_COUNT and O_SKIPTO actions: * instead of terminating, we jump to the next rule * (setting l=0), or to the SKIPTO target (setting * f/f_len, cmd and l as needed), respectively. * * O_TAG, O_LOG and O_ALTQ action parameters: * perform some action and set match = 1; * * O_LIMIT and O_KEEP_STATE: these opcodes are * not real 'actions', and are stored right * before the 'action' part of the rule (one * exception is O_SKIP_ACTION which could be * between these opcodes and 'action' one). * These opcodes try to install an entry in the * state tables; if successful, we continue with * the next opcode (match=1; break;), otherwise * the packet must be dropped (set retval, * break loops with l=0, done=1) * * O_PROBE_STATE and O_CHECK_STATE: these opcodes * cause a lookup of the state table, and a jump * to the 'action' part of the parent rule * if an entry is found, or * (CHECK_STATE only) a jump to the next rule if * the entry is not found. * The result of the lookup is cached so that * further instances of these opcodes become NOPs. * The jump to the next rule is done by setting * l=0, cmdlen=0. * * O_SKIP_ACTION: this opcode is not a real 'action' * either, and is stored right before the 'action' * part of the rule, right after the O_KEEP_STATE * opcode. 
It causes match failure so the real * 'action' could be executed only if the rule * is checked via dynamic rule from the state * table, as in such case execution starts * from the true 'action' opcode directly. * */ case O_LIMIT: case O_KEEP_STATE: if (ipfw_dyn_install_state(chain, f, (ipfw_insn_limit *)cmd, args, ulp, pktlen, &dyn_info, tablearg)) { /* error or limit violation */ retval = IP_FW_DENY; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ } match = 1; break; case O_PROBE_STATE: case O_CHECK_STATE: /* * dynamic rules are checked at the first * keep-state or check-state occurrence, * with the result being stored in dyn_info. * The compiler introduces a PROBE_STATE * instruction for us when we have a * KEEP_STATE (because PROBE_STATE needs * to be run first). */ if (DYN_LOOKUP_NEEDED(&dyn_info, cmd) && (q = ipfw_dyn_lookup_state(args, ulp, pktlen, cmd, &dyn_info)) != NULL) { /* * Found dynamic entry, jump to the * 'action' part of the parent rule * by setting f, cmd, l and clearing * cmdlen. */ f = q; f_pos = dyn_info.f_pos; cmd = ACTION_PTR(f); l = f->cmd_len - f->act_ofs; cmdlen = 0; continue; } /* * Dynamic entry not found. If CHECK_STATE, * skip to next rule, if PROBE_STATE just * ignore and continue with next opcode. 
*/ if (cmd->opcode == O_CHECK_STATE) l = 0; /* exit inner loop */ match = 1; break; case O_SKIP_ACTION: match = 0; /* skip to the next rule */ l = 0; /* exit inner loop */ break; case O_ACCEPT: retval = 0; /* accept */ l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_PIPE: case O_QUEUE: set_match(args, f_pos, chain); args->rule.info = TARG(cmd->arg1, pipe); if (cmd->opcode == O_PIPE) args->rule.info |= IPFW_IS_PIPE; if (V_fw_one_pass) args->rule.info |= IPFW_ONEPASS; retval = IP_FW_DUMMYNET; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_DIVERT: case O_TEE: if (args->flags & IPFW_ARGS_ETHER) break; /* not on layer 2 */ /* otherwise this is terminal */ l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ retval = (cmd->opcode == O_DIVERT) ? IP_FW_DIVERT : IP_FW_TEE; set_match(args, f_pos, chain); args->rule.info = TARG(cmd->arg1, divert); break; case O_COUNT: IPFW_INC_RULE_COUNTER(f, pktlen); l = 0; /* exit inner loop */ break; case O_SKIPTO: IPFW_INC_RULE_COUNTER(f, pktlen); - f_pos = JUMP(chain, f, cmd->arg1, tablearg, 0); + f_pos = jump(chain, f, + insntod(cmd, u32)->d[0], tablearg, false); /* * Skip disabled rules, and re-enter * the inner loop with the correct * f_pos, f, l and cmd. * Also clear cmdlen and skip_or */ for (; f_pos < chain->n_rules - 1 && (V_set_disable & (1 << chain->map[f_pos]->set)); f_pos++) ; /* Re-enter the inner loop at the skipto rule. */ f = chain->map[f_pos]; l = f->cmd_len; cmd = f->cmd; match = 1; cmdlen = 0; skip_or = 0; continue; break; /* not reached */ case O_CALLRETURN: { /* * Implementation of `subroutine' call/return, * in the stack carried in an mbuf tag. This * is different from `skipto' in that any call * address is possible (`skipto' must prevent * backward jumps to avoid endless loops). * We have `return' action when F_NOT flag is * present. The `m_tag_id' field is used as * stack pointer. 
*/ struct m_tag *mtag; - uint16_t jmpto, *stack; + uint32_t jmpto, *stack; #define IS_CALL ((cmd->len & F_NOT) == 0) #define IS_RETURN ((cmd->len & F_NOT) != 0) /* * Hand-rolled version of m_tag_locate() with * wildcard `type'. * If not already tagged, allocate new tag. */ mtag = m_tag_first(m); while (mtag != NULL) { if (mtag->m_tag_cookie == MTAG_IPFW_CALL) break; mtag = m_tag_next(m, mtag); } - if (mtag == NULL && IS_CALL) { - mtag = m_tag_alloc(MTAG_IPFW_CALL, 0, - IPFW_CALLSTACK_SIZE * - sizeof(uint16_t), M_NOWAIT); - if (mtag != NULL) - m_tag_prepend(m, mtag); + + /* + * We keep ruleset id in the first element + * of stack. If it doesn't match chain->id, + * then we can't trust information in the + * stack, since rules were changed. + * We reset stack pointer to be able reuse + * tag if it will be needed. + */ + if (mtag != NULL) { + stack = (uint32_t *)(mtag + 1); + if (stack[0] != chain->id) { + stack[0] = chain->id; + mtag->m_tag_id = 0; + } } /* - * On error both `call' and `return' just - * continue with next rule. + * If there is no mtag or stack is empty, + * `return` continues with next rule. */ if (IS_RETURN && (mtag == NULL || mtag->m_tag_id == 0)) { l = 0; /* exit inner loop */ break; } - if (IS_CALL && (mtag == NULL || - mtag->m_tag_id >= IPFW_CALLSTACK_SIZE)) { - printf("ipfw: call stack error, " - "go to next rule\n"); + + if (mtag == NULL) { + MPASS(IS_CALL); + mtag = m_tag_alloc(MTAG_IPFW_CALL, 0, + IPFW_CALLSTACK_SIZE * + sizeof(uint32_t), M_NOWAIT); + if (mtag != NULL) { + m_tag_prepend(m, mtag); + stack = (uint32_t *)(mtag + 1); + stack[0] = chain->id; + } + } + + if (mtag == NULL) { + printf("ipfw: rule %u: failed to " + "allocate call stack. " + "Denying packet.\n", + f->rulenum); + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + retval = IP_FW_DENY; /* drop packet */ + break; + } + + if (IS_CALL && mtag->m_tag_id >= + IPFW_CALLSTACK_SIZE - 1) { + printf("ipfw: rule %u: call stack " + "overflow. 
Denying packet.\n", + f->rulenum); l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + retval = IP_FW_DENY; /* drop packet */ break; } + MPASS(stack == (uint32_t *)(mtag + 1)); IPFW_INC_RULE_COUNTER(f, pktlen); - stack = (uint16_t *)(mtag + 1); - /* - * The `call' action may use cached f_pos - * (in f->next_rule), whose version is written - * in f->next_rule. - * The `return' action, however, doesn't have - * fixed jump address in cmd->arg1 and can't use - * cache. - */ if (IS_CALL) { - stack[mtag->m_tag_id] = f->rulenum; - mtag->m_tag_id++; - f_pos = JUMP(chain, f, cmd->arg1, - tablearg, 1); + stack[++mtag->m_tag_id] = f_pos; + f_pos = jump(chain, f, + insntod(cmd, u32)->d[0], + tablearg, true); } else { /* `return' action */ - mtag->m_tag_id--; - jmpto = stack[mtag->m_tag_id] + 1; - f_pos = ipfw_find_rule(chain, jmpto, 0); + jmpto = stack[mtag->m_tag_id--]; + if (cmd->arg1 == RETURN_NEXT_RULE) + f_pos = jmpto + 1; + else /* RETURN_NEXT_RULENUM */ + f_pos = ipfw_find_rule(chain, + chain->map[ + jmpto]->rulenum + 1, 0); } /* * Skip disabled rules, and re-enter * the inner loop with the correct * f_pos, f, l and cmd. * Also clear cmdlen and skip_or */ + MPASS(f_pos < chain->n_rules - 1); for (; f_pos < chain->n_rules - 1 && (V_set_disable & (1 << chain->map[f_pos]->set)); f_pos++) ; - /* Re-enter the inner loop at the dest rule. */ + /* + * Re-enter the inner loop at the dest + * rule. + */ f = chain->map[f_pos]; l = f->cmd_len; cmd = f->cmd; cmdlen = 0; skip_or = 0; continue; break; /* NOTREACHED */ } #undef IS_CALL #undef IS_RETURN case O_REJECT: /* * Drop the packet and send a reject notice * if the packet is not ICMP (or is an ICMP * query), and it is not multicast/broadcast. 
*/ if (hlen > 0 && is_ipv4 && offset == 0 && (proto != IPPROTO_ICMP || is_icmp_query(ICMP(ulp))) && !(m->m_flags & (M_BCAST|M_MCAST)) && !IN_MULTICAST(ntohl(dst_ip.s_addr))) { KASSERT(!need_send_reject, ("o_reject - need_send_reject was set previously")); if ((reject_code = cmd->arg1) == ICMP_UNREACH_NEEDFRAG && cmd->len == F_INSN_SIZE(ipfw_insn_u16)) { reject_mtu = ((ipfw_insn_u16 *)cmd)->ports[0]; } else { reject_mtu = 0; } need_send_reject = true; } /* FALLTHROUGH */ #ifdef INET6 case O_UNREACH6: if (hlen > 0 && is_ipv6 && ((offset & IP6F_OFF_MASK) == 0) && (proto != IPPROTO_ICMPV6 || (is_icmp6_query(icmp6_type) == 1)) && !(m->m_flags & (M_BCAST|M_MCAST)) && !IN6_IS_ADDR_MULTICAST( &args->f_id.dst_ip6)) { KASSERT(!need_send_reject, ("o_unreach6 - need_send_reject was set previously")); reject_code = cmd->arg1; if (cmd->opcode == O_REJECT) { reject_code = map_icmp_unreach(reject_code); } need_send_reject = true; } /* FALLTHROUGH */ #endif case O_DENY: retval = IP_FW_DENY; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_FORWARD_IP: if (args->flags & IPFW_ARGS_ETHER) break; /* not valid on layer2 pkts */ if (q != f || dyn_info.direction == MATCH_FORWARD) { struct sockaddr_in *sa; sa = &(((ipfw_insn_sa *)cmd)->sa); if (sa->sin_addr.s_addr == INADDR_ANY) { #ifdef INET6 /* * We use O_FORWARD_IP opcode for * fwd rule with tablearg, but tables * now support IPv6 addresses. And * when we are inspecting IPv6 packet, * we can use nh6 field from * table_value as next_hop6 address. 
*/ if (is_ipv6) { struct ip_fw_nh6 *nh6; args->flags |= IPFW_ARGS_NH6; nh6 = &args->hopstore6; nh6->sin6_addr = TARG_VAL( chain, tablearg, nh6); nh6->sin6_port = sa->sin_port; nh6->sin6_scope_id = TARG_VAL( chain, tablearg, zoneid); } else #endif { args->flags |= IPFW_ARGS_NH4; args->hopstore.sin_port = sa->sin_port; sa = &args->hopstore; sa->sin_family = AF_INET; sa->sin_len = sizeof(*sa); sa->sin_addr.s_addr = htonl( TARG_VAL(chain, tablearg, nh4)); } } else { args->flags |= IPFW_ARGS_NH4PTR; args->next_hop = sa; } } retval = IP_FW_PASS; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; #ifdef INET6 case O_FORWARD_IP6: if (args->flags & IPFW_ARGS_ETHER) break; /* not valid on layer2 pkts */ if (q != f || dyn_info.direction == MATCH_FORWARD) { struct sockaddr_in6 *sin6; sin6 = &(((ipfw_insn_sa6 *)cmd)->sa); args->flags |= IPFW_ARGS_NH6PTR; args->next_hop6 = sin6; } retval = IP_FW_PASS; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; #endif case O_NETGRAPH: case O_NGTEE: set_match(args, f_pos, chain); args->rule.info = TARG(cmd->arg1, netgraph); if (V_fw_one_pass) args->rule.info |= IPFW_ONEPASS; retval = (cmd->opcode == O_NETGRAPH) ? 
IP_FW_NETGRAPH : IP_FW_NGTEE; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_SETFIB: { uint32_t fib; IPFW_INC_RULE_COUNTER(f, pktlen); fib = TARG(cmd->arg1, fib) & 0x7FFF; if (fib >= rt_numfibs) fib = 0; M_SETFIB(m, fib); args->f_id.fib = fib; /* XXX */ l = 0; /* exit inner loop */ break; } case O_SETDSCP: { uint16_t code; code = TARG(cmd->arg1, dscp) & 0x3F; l = 0; /* exit inner loop */ if (is_ipv4) { uint16_t old; old = *(uint16_t *)ip; ip->ip_tos = (code << 2) | (ip->ip_tos & 0x03); ip->ip_sum = cksum_adjust(ip->ip_sum, old, *(uint16_t *)ip); } else if (is_ipv6) { /* update cached value */ args->f_id.flow_id6 = ntohl(*(uint32_t *)ip) & ~0x0FC00000; args->f_id.flow_id6 |= code << 22; *((uint32_t *)ip) = htonl(args->f_id.flow_id6); } else break; IPFW_INC_RULE_COUNTER(f, pktlen); break; } case O_NAT: l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ /* * Ensure that we do not invoke NAT handler for * non IPv4 packets. Libalias expects only IPv4. */ if (!is_ipv4 || !IPFW_NAT_LOADED) { retval = IP_FW_DENY; break; } struct cfg_nat *t; int nat_id; args->rule.info = 0; set_match(args, f_pos, chain); /* Check if this is 'global' nat rule */ if (cmd->arg1 == IP_FW_NAT44_GLOBAL) { retval = ipfw_nat_ptr(args, NULL, m); break; } t = ((ipfw_insn_nat *)cmd)->nat; if (t == NULL) { nat_id = TARG(cmd->arg1, nat); t = (*lookup_nat_ptr)(&chain->nat, nat_id); if (t == NULL) { retval = IP_FW_DENY; break; } if (cmd->arg1 != IP_FW_TARG) ((ipfw_insn_nat *)cmd)->nat = t; } retval = ipfw_nat_ptr(args, t, m); break; case O_REASS: { int ip_off; l = 0; /* in any case exit inner loop */ if (is_ipv6) /* IPv6 is not supported yet */ break; IPFW_INC_RULE_COUNTER(f, pktlen); ip_off = ntohs(ip->ip_off); /* if not fragmented, go to next rule */ if ((ip_off & (IP_MF | IP_OFFMASK)) == 0) break; args->m = m = ip_reass(m); /* * do IP header checksum fixup. 
*/ if (m == NULL) { /* fragment got swallowed */ retval = IP_FW_DENY; } else { /* good, packet complete */ int hlen; ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; ip->ip_sum = 0; if (hlen == sizeof(struct ip)) ip->ip_sum = in_cksum_hdr(ip); else ip->ip_sum = in_cksum(m, hlen); retval = IP_FW_REASS; args->rule.info = 0; set_match(args, f_pos, chain); } done = 1; /* exit outer loop */ break; } case O_SETMARK: { l = 0; /* exit inner loop */ args->rule.pkt_mark = ( (cmd->arg1 == IP_FW_TARG) ? TARG_VAL(chain, tablearg, mark) : - ((ipfw_insn_u32 *)cmd)->d[0]); + insntoc(cmd, u32)->d[0]); IPFW_INC_RULE_COUNTER(f, pktlen); break; } case O_EXTERNAL_ACTION: l = 0; /* in any case exit inner loop */ retval = ipfw_run_eaction(chain, args, cmd, &done); /* * If both @retval and @done are zero, * consider this as rule matching and * update counters. */ if (retval == 0 && done == 0) { IPFW_INC_RULE_COUNTER(f, pktlen); /* * Reset the result of the last * dynamic state lookup. * External action can change * @args content, and it may be * used for new state lookup later. */ DYN_INFO_INIT(&dyn_info); } break; default: - panic("-- unknown opcode %d\n", cmd->opcode); + panic("ipfw: rule %u: unknown opcode %d\n", + f->rulenum, cmd->opcode); } /* end of switch() on opcodes */ /* * if we get here with l=0, then match is irrelevant. */ if (cmd->len & F_NOT) match = !match; if (match) { if (cmd->len & F_OR) skip_or = 1; } else { if (!(cmd->len & F_OR)) /* not an OR block, */ break; /* try next rule */ } } /* end of inner loop, scan opcodes */ #undef PULLUP_LEN #undef PULLUP_LEN_LOCKED if (done) break; /* next_rule:; */ /* try next rule */ } /* end of outer for, scan rules */ if (done) { struct ip_fw *rule = chain->map[f_pos]; /* Update statistics */ IPFW_INC_RULE_COUNTER(rule, pktlen); IPFW_PROBE(rule__matched, retval, is_ipv4 ? AF_INET : AF_INET6, is_ipv4 ? (uintptr_t)&src_ip : (uintptr_t)&args->f_id.src_ip6, is_ipv4 ? 
(uintptr_t)&dst_ip : (uintptr_t)&args->f_id.dst_ip6, args, rule); } else { retval = IP_FW_DENY; printf("ipfw: ouch!, skip past end of rules, denying packet\n"); } IPFW_PF_RUNLOCK(chain); if (need_send_reject) { #ifdef INET6 if (is_ipv6) send_reject6(args, reject_code, hlen, (struct ip6_hdr *)ip); else #endif send_reject(args, reject_code, reject_mtu, iplen, ip); } #ifdef __FreeBSD__ if (ucred_cache != NULL) crfree(ucred_cache); #endif return (retval); pullup_failed: if (V_fw_verbose) printf("ipfw: pullup failed\n"); return (IP_FW_DENY); } /* * Set maximum number of tables that can be used in given VNET ipfw instance. */ #ifdef SYSCTL_NODE static int sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS) { int error; unsigned int ntables; ntables = V_fw_tables_max; error = sysctl_handle_int(oidp, &ntables, 0, req); /* Read operation or some error */ if ((error != 0) || (req->newptr == NULL)) return (error); return (ipfw_resize_tables(&V_layer3_chain, ntables)); } /* * Switches table namespace between global and per-set. */ static int sysctl_ipfw_tables_sets(SYSCTL_HANDLER_ARGS) { int error; unsigned int sets; sets = V_fw_tables_sets; error = sysctl_handle_int(oidp, &sets, 0, req); /* Read operation or some error */ if ((error != 0) || (req->newptr == NULL)) return (error); return (ipfw_switch_tables_namespace(&V_layer3_chain, sets)); } #endif /* * Module and VNET glue */ /* * Stuff that must be initialised only on boot or module load */ static int ipfw_init(void) { int error = 0; /* * Only print out this stuff the first time around, * when called from the sysinit code. */ printf("ipfw2 " #ifdef INET6 "(+ipv6) " #endif "initialized, divert %s, nat %s, " "default to %s, logging ", #ifdef IPDIVERT "enabled", #else "loadable", #endif #ifdef IPFIREWALL_NAT "enabled", #else "loadable", #endif default_to_accept ? "accept" : "deny"); /* * Note: V_xxx variables can be accessed here but the vnet specific * initializer may not have been called yet for the VIMAGE case. 
* Tuneables will have been processed. We will print out values for * the default vnet. * XXX This should all be rationalized AFTER 8.0 */ if (V_fw_verbose == 0) printf("disabled\n"); else if (V_verbose_limit == 0) printf("unlimited\n"); else printf("limited to %d packets/entry by default\n", V_verbose_limit); /* Check user-supplied table count for validness */ if (default_fw_tables > IPFW_TABLES_MAX) default_fw_tables = IPFW_TABLES_MAX; ipfw_init_sopt_handler(); ipfw_init_obj_rewriter(); ipfw_iface_init(); return (error); } /* * Called for the removal of the last instance only on module unload. */ static void ipfw_destroy(void) { ipfw_iface_destroy(); ipfw_destroy_sopt_handler(); ipfw_destroy_obj_rewriter(); printf("IP firewall unloaded\n"); } /* * Stuff that must be initialized for every instance * (including the first of course). */ static int vnet_ipfw_init(const void *unused) { int error, first; struct ip_fw *rule = NULL; struct ip_fw_chain *chain; chain = &V_layer3_chain; first = IS_DEFAULT_VNET(curvnet) ? 1 : 0; /* First set up some values that are compile time options */ V_autoinc_step = 100; /* bounded to 1..1000 in add_rule() */ V_fw_deny_unknown_exthdrs = 1; #ifdef IPFIREWALL_VERBOSE V_fw_verbose = 1; #endif #ifdef IPFIREWALL_VERBOSE_LIMIT V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT; #endif #ifdef IPFIREWALL_NAT LIST_INIT(&chain->nat); #endif /* Init shared services hash table */ ipfw_init_srv(chain); ipfw_init_counters(); /* Set initial number of tables */ V_fw_tables_max = default_fw_tables; error = ipfw_init_tables(chain, first); if (error) { printf("ipfw2: setting up tables failed\n"); free(chain->map, M_IPFW); free(rule, M_IPFW); return (ENOSPC); } IPFW_LOCK_INIT(chain); /* fill and insert the default rule */ rule = ipfw_alloc_rule(chain, sizeof(struct ip_fw)); rule->flags |= IPFW_RULE_NOOPT; rule->cmd_len = 1; rule->cmd[0].len = 1; rule->cmd[0].opcode = default_to_accept ? 
O_ACCEPT : O_DENY; chain->default_rule = rule; ipfw_add_protected_rule(chain, rule, 0); ipfw_dyn_init(chain); ipfw_eaction_init(chain, first); -#ifdef LINEAR_SKIPTO ipfw_init_skipto_cache(chain); -#endif ipfw_bpf_init(first); /* First set up some values that are compile time options */ V_ipfw_vnet_ready = 1; /* Open for business */ /* * Hook the sockopt handler and pfil hooks for ipv4 and ipv6. * Even if the latter two fail we still keep the module alive * because the sockopt and layer2 paths are still useful. * ipfw[6]_hook return 0 on success, ENOENT on failure, * so we can ignore the exact return value and just set a flag. * * Note that V_fw[6]_enable are manipulated by a SYSCTL_PROC so * changes in the underlying (per-vnet) variables trigger * immediate hook()/unhook() calls. * In layer2 we have the same behaviour, except that V_ether_ipfw * is checked on each packet because there are no pfil hooks. */ V_ip_fw_ctl_ptr = ipfw_ctl3; error = ipfw_attach_hooks(); return (error); } /* * Called for the removal of each instance. */ static int vnet_ipfw_uninit(const void *unused) { struct ip_fw *reap; struct ip_fw_chain *chain = &V_layer3_chain; int i, last; V_ipfw_vnet_ready = 0; /* tell new callers to go away */ /* * disconnect from ipv4, ipv6, layer2 and sockopt. * Then grab, release and grab again the WLOCK so we make * sure the update is propagated and nobody will be in. */ ipfw_detach_hooks(); V_ip_fw_ctl_ptr = NULL; last = IS_DEFAULT_VNET(curvnet) ? 
1 : 0; IPFW_UH_WLOCK(chain); IPFW_UH_WUNLOCK(chain); ipfw_dyn_uninit(0); /* run the callout_drain */ IPFW_UH_WLOCK(chain); reap = NULL; IPFW_WLOCK(chain); for (i = 0; i < chain->n_rules; i++) ipfw_reap_add(chain, &reap, chain->map[i]); free(chain->map, M_IPFW); -#ifdef LINEAR_SKIPTO ipfw_destroy_skipto_cache(chain); -#endif IPFW_WUNLOCK(chain); IPFW_UH_WUNLOCK(chain); ipfw_destroy_tables(chain, last); ipfw_eaction_uninit(chain, last); if (reap != NULL) ipfw_reap_rules(reap); vnet_ipfw_iface_destroy(chain); ipfw_destroy_srv(chain); IPFW_LOCK_DESTROY(chain); ipfw_dyn_uninit(1); /* free the remaining parts */ ipfw_destroy_counters(); ipfw_bpf_uninit(last); return (0); } /* * Module event handler. * In general we have the choice of handling most of these events by the * event handler or by the (VNET_)SYS(UN)INIT handlers. I have chosen to * use the SYSINIT handlers as they are more capable of expressing the * flow of control during module and vnet operations, so this is just * a skeleton. Note there is no SYSINIT equivalent of the module * SHUTDOWN handler, but we don't have anything to do in that case anyhow. */ static int ipfw_modevent(module_t mod, int type, void *unused) { int err = 0; switch (type) { case MOD_LOAD: /* Called once at module load or * system boot if compiled in. */ break; case MOD_QUIESCE: /* Called before unload. May veto unloading. */ break; case MOD_UNLOAD: /* Called during unload. */ break; case MOD_SHUTDOWN: /* Called during system shutdown. */ break; default: err = EOPNOTSUPP; break; } return err; } static moduledata_t ipfwmod = { "ipfw", ipfw_modevent, 0 }; /* Define startup order. */ #define IPFW_SI_SUB_FIREWALL SI_SUB_PROTO_FIREWALL #define IPFW_MODEVENT_ORDER (SI_ORDER_ANY - 255) /* On boot slot in here. */ #define IPFW_MODULE_ORDER (IPFW_MODEVENT_ORDER + 1) /* A little later. */ #define IPFW_VNET_ORDER (IPFW_MODEVENT_ORDER + 2) /* Later still. 
*/ DECLARE_MODULE(ipfw, ipfwmod, IPFW_SI_SUB_FIREWALL, IPFW_MODEVENT_ORDER); FEATURE(ipfw_ctl3, "ipfw new sockopt calls"); MODULE_VERSION(ipfw, 3); /* should declare some dependencies here */ /* * Starting up. Done in order after ipfwmod() has been called. * VNET_SYSINIT is also called for each existing vnet and each new vnet. */ SYSINIT(ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER, ipfw_init, NULL); VNET_SYSINIT(vnet_ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER, vnet_ipfw_init, NULL); /* * Closing up shop. These are done in REVERSE ORDER, but still * after ipfwmod() has been called. Not called on reboot. * VNET_SYSUNINIT is also called for each exiting vnet as it exits. * or when the module is unloaded. */ SYSUNINIT(ipfw_destroy, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER, ipfw_destroy, NULL); VNET_SYSUNINIT(vnet_ipfw_uninit, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER, vnet_ipfw_uninit, NULL); /* end of file */ diff --git a/sys/netpfil/ipfw/ip_fw_compat.c b/sys/netpfil/ipfw/ip_fw_compat.c new file mode 100644 index 000000000000..a1c5349c4656 --- /dev/null +++ b/sys/netpfil/ipfw/ip_fw_compat.c @@ -0,0 +1,714 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 Yandex LLC + * Copyright (c) 2025 Andrey V. Elsukov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +/* + * Example of compatibility layer for ipfw's rule management routines. + */ + +#include "opt_inet.h" +#include "opt_inet6.h" +#include "opt_ipfw.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include /* hooks */ +#include + +#include +#include + +#ifdef MAC +#include +#endif + +/* + * These structures were used by IP_FW3 socket option with version 0. + */ +typedef struct _ipfw_dyn_rule_v0 { + ipfw_dyn_rule *next; /* linked list of rules. 
*/ + struct ip_fw *rule; /* pointer to rule */ + /* 'rule' is used to pass up the rule number (from the parent) */ + + ipfw_dyn_rule *parent; /* pointer to parent rule */ + u_int64_t pcnt; /* packet match counter */ + u_int64_t bcnt; /* byte match counter */ + struct ipfw_flow_id id; /* (masked) flow id */ + u_int32_t expire; /* expire time */ + u_int32_t bucket; /* which bucket in hash table */ + u_int32_t state; /* state of this rule (typically a + * combination of TCP flags) + */ + u_int32_t ack_fwd; /* most recent ACKs in forward */ + u_int32_t ack_rev; /* and reverse directions (used */ + /* to generate keepalives) */ + u_int16_t dyn_type; /* rule type */ + u_int16_t count; /* refcount */ + u_int16_t kidx; /* index of named object */ +} __packed __aligned(8) ipfw_dyn_rule_v0; + +typedef struct _ipfw_obj_dyntlv_v0 { + ipfw_obj_tlv head; + ipfw_dyn_rule_v0 state; +} ipfw_obj_dyntlv_v0; + +typedef struct _ipfw_obj_ntlv_v0 { + ipfw_obj_tlv head; /* TLV header */ + uint16_t idx; /* Name index */ + uint8_t set; /* set, if applicable */ + uint8_t type; /* object type, if applicable */ + uint32_t spare; /* unused */ + char name[64]; /* Null-terminated name */ +} ipfw_obj_ntlv_v0; + +typedef struct _ipfw_range_tlv_v0 { + ipfw_obj_tlv head; /* TLV header */ + uint32_t flags; /* Range flags */ + uint16_t start_rule; /* Range start */ + uint16_t end_rule; /* Range end */ + uint32_t set; /* Range set to match */ + uint32_t new_set; /* New set to move/swap to */ +} ipfw_range_tlv_v0; + +typedef struct _ipfw_range_header_v0 { + ip_fw3_opheader opheader; /* IP_FW3 opcode */ + ipfw_range_tlv_v0 range; +} ipfw_range_header_v0; + +typedef struct _ipfw_insn_limit_v0 { + ipfw_insn o; + uint8_t _pad; + uint8_t limit_mask; + uint16_t conn_limit; +} ipfw_insn_limit_v0; + +typedef struct _ipfw_obj_tentry_v0 { + ipfw_obj_tlv head; /* TLV header */ + uint8_t subtype; /* subtype (IPv4,IPv6) */ + uint8_t masklen; /* mask length */ + uint8_t result; /* request result */ + uint8_t spare0; + 
uint16_t idx; /* Table name index */ + uint16_t spare1; + union { + /* Longest field needs to be aligned by 8-byte boundary */ + struct in_addr addr; /* IPv4 address */ + uint32_t key; /* uid/gid/port */ + struct in6_addr addr6; /* IPv6 address */ + char iface[IF_NAMESIZE]; /* interface name */ + struct tflow_entry flow; + } k; + union { + ipfw_table_value value; /* value data */ + uint32_t kidx; /* value kernel index */ + } v; +} ipfw_obj_tentry_v0; + +static sopt_handler_f dump_config_v0, add_rules_v0, del_rules_v0, + clear_rules_v0, move_rules_v0, manage_sets_v0, dump_soptcodes_v0, + dump_srvobjects_v0; + +static struct ipfw_sopt_handler scodes[] = { + { IP_FW_XGET, IP_FW3_OPVER_0, HDIR_GET, dump_config_v0 }, + { IP_FW_XADD, IP_FW3_OPVER_0, HDIR_BOTH, add_rules_v0 }, + { IP_FW_XDEL, IP_FW3_OPVER_0, HDIR_BOTH, del_rules_v0 }, + { IP_FW_XZERO, IP_FW3_OPVER_0, HDIR_SET, clear_rules_v0 }, + { IP_FW_XRESETLOG, IP_FW3_OPVER_0, HDIR_SET, clear_rules_v0 }, + { IP_FW_XMOVE, IP_FW3_OPVER_0, HDIR_SET, move_rules_v0 }, + { IP_FW_SET_SWAP, IP_FW3_OPVER_0, HDIR_SET, manage_sets_v0 }, + { IP_FW_SET_MOVE, IP_FW3_OPVER_0, HDIR_SET, manage_sets_v0 }, + { IP_FW_SET_ENABLE, IP_FW3_OPVER_0, HDIR_SET, manage_sets_v0 }, + { IP_FW_DUMP_SOPTCODES, IP_FW3_OPVER_0, HDIR_GET, dump_soptcodes_v0 }, + { IP_FW_DUMP_SRVOBJECTS, IP_FW3_OPVER_0, HDIR_GET, dump_srvobjects_v0 }, +}; + +static int +dump_config_v0(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + return (EOPNOTSUPP); +} + +/* + * Calculate the size adjust needed to store opcodes converted from v0 + * to v1. 
+ */ +static int +adjust_size_v0(ipfw_insn *cmd) +{ + int cmdlen, adjust; + + cmdlen = F_LEN(cmd); + switch (cmd->opcode) { + case O_CHECK_STATE: + case O_KEEP_STATE: + case O_PROBE_STATE: + case O_EXTERNAL_ACTION: + case O_EXTERNAL_INSTANCE: + adjust = F_INSN_SIZE(ipfw_insn_kidx) - cmdlen; + break; + case O_LIMIT: + adjust = F_INSN_SIZE(ipfw_insn_limit) - cmdlen; + break; + case O_IP_SRC_LOOKUP: + case O_IP_DST_LOOKUP: + case O_IP_FLOW_LOOKUP: + case O_MAC_SRC_LOOKUP: + case O_MAC_DST_LOOKUP: + if (cmdlen == F_INSN_SIZE(ipfw_insn)) + adjust = F_INSN_SIZE(ipfw_insn_kidx) - cmdlen; + else + adjust = F_INSN_SIZE(ipfw_insn_table) - cmdlen; + break; + case O_SKIPTO: + case O_CALLRETURN: + adjust = F_INSN_SIZE(ipfw_insn_u32) - cmdlen; + break; + default: + adjust = 0; + } + return (adjust); +} + +static int +parse_rules_v0(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd, ipfw_obj_ctlv **prtlv, + struct rule_check_info **pci) +{ + ipfw_obj_ctlv *ctlv, *rtlv, *tstate; + ipfw_obj_ntlv_v0 *ntlv; + struct rule_check_info *ci, *cbuf; + struct ip_fw_rule *r; + size_t count, clen, read, rsize; + uint32_t rulenum; + int idx, error; + + op3 = (ip_fw3_opheader *)ipfw_get_sopt_space(sd, sd->valsize); + ctlv = (ipfw_obj_ctlv *)(op3 + 1); + read = sizeof(ip_fw3_opheader); + if (read + sizeof(*ctlv) > sd->valsize) + return (EINVAL); + + rtlv = NULL; + tstate = NULL; + cbuf = NULL; + /* Table names or other named objects. */ + if (ctlv->head.type == IPFW_TLV_TBLNAME_LIST) { + /* Check size and alignment. */ + clen = ctlv->head.length; + if (read + clen > sd->valsize || clen < sizeof(*ctlv) || + (clen % sizeof(uint64_t)) != 0) + return (EINVAL); + /* Check for validness. */ + count = (ctlv->head.length - sizeof(*ctlv)) / sizeof(*ntlv); + if (ctlv->count != count || ctlv->objsize != sizeof(*ntlv)) + return (EINVAL); + /* + * Check each TLV. + * Ensure TLVs are sorted ascending and + * there are no duplicates. 
+ */ + idx = -1; + ntlv = (ipfw_obj_ntlv_v0 *)(ctlv + 1); + while (count > 0) { + if (ntlv->head.length != sizeof(ipfw_obj_ntlv_v0)) + return (EINVAL); + + error = ipfw_check_object_name_generic(ntlv->name); + if (error != 0) + return (error); + + if (ntlv->idx <= idx) + return (EINVAL); + + idx = ntlv->idx; + count--; + ntlv++; + } + + tstate = ctlv; + read += ctlv->head.length; + ctlv = (ipfw_obj_ctlv *)((caddr_t)ctlv + ctlv->head.length); + + if (read + sizeof(*ctlv) > sd->valsize) + return (EINVAL); + } + + /* List of rules. */ + if (ctlv->head.type == IPFW_TLV_RULE_LIST) { + clen = ctlv->head.length; + if (read + clen > sd->valsize || clen < sizeof(*ctlv) || + (clen % sizeof(uint64_t)) != 0) + return (EINVAL); + + clen -= sizeof(*ctlv); + if (ctlv->count == 0 || + ctlv->count > clen / sizeof(struct ip_fw_rule)) + return (EINVAL); + + /* Allocate state for each rule */ + cbuf = malloc(ctlv->count * sizeof(struct rule_check_info), + M_TEMP, M_WAITOK | M_ZERO); + + /* + * Check each rule for validness. 
+ * Ensure numbered rules are sorted ascending + * and properly aligned + */ + rulenum = 0; + count = 0; + error = 0; + ci = cbuf; + r = (struct ip_fw_rule *)(ctlv + 1); + while (clen > 0) { + rsize = RULEUSIZE1(r); + if (rsize > clen || count > ctlv->count) { + error = EINVAL; + break; + } + ci->ctlv = tstate; + ci->version = IP_FW3_OPVER_0; + error = ipfw_check_rule(r, rsize, ci); + if (error != 0) + break; + + /* Check sorting */ + if (r->rulenum != 0 && r->rulenum < rulenum) { + printf("ipfw: wrong order: rulenum %u" + " vs %u\n", r->rulenum, rulenum); + error = EINVAL; + break; + } + rulenum = r->rulenum; + ci->urule = (caddr_t)r; + clen -= rsize; + r = (struct ip_fw_rule *)((caddr_t)r + rsize); + count++; + ci++; + } + + if (ctlv->count != count || error != 0) { + free(cbuf, M_TEMP); + return (EINVAL); + } + + rtlv = ctlv; + read += ctlv->head.length; + ctlv = (ipfw_obj_ctlv *)((caddr_t)ctlv + ctlv->head.length); + } + + if (read != sd->valsize || rtlv == NULL) { + free(cbuf, M_TEMP); + return (EINVAL); + } + + *prtlv = rtlv; + *pci = cbuf; + return (0); +} + +static void +convert_v0_to_v1(struct rule_check_info *ci, int rule_len) +{ + struct ip_fw_rule *urule; + struct ip_fw *krule; + ipfw_insn *src, *dst; + int l, cmdlen, newlen; + + urule = (struct ip_fw_rule *)ci->urule; + krule = ci->krule; + for (l = urule->cmd_len, src = urule->cmd, dst = krule->cmd; + l > 0 && rule_len > 0; + l -= cmdlen, src += cmdlen, + rule_len -= newlen, dst += newlen) { + cmdlen = F_LEN(src); + switch (src->opcode) { + case O_CHECK_STATE: + case O_KEEP_STATE: + case O_PROBE_STATE: + case O_EXTERNAL_ACTION: + case O_EXTERNAL_INSTANCE: + newlen = F_INSN_SIZE(ipfw_insn_kidx); + insntod(dst, kidx)->kidx = src->arg1; + break; + case O_LIMIT: + newlen = F_INSN_SIZE(ipfw_insn_limit); + insntod(dst, limit)->kidx = src->arg1; + insntod(dst, limit)->limit_mask = + insntoc(src, limit)->limit_mask; + insntod(dst, limit)->conn_limit = + insntoc(src, limit)->conn_limit; + break; + case 
O_IP_DST_LOOKUP: + if (cmdlen == F_INSN_SIZE(ipfw_insn) + 2) { + /* lookup type stored in d[1] */ + dst->arg1 = insntoc(src, table)->value; + } + case O_IP_SRC_LOOKUP: + case O_IP_FLOW_LOOKUP: + case O_MAC_SRC_LOOKUP: + case O_MAC_DST_LOOKUP: + if (cmdlen == F_INSN_SIZE(ipfw_insn)) { + newlen = F_INSN_SIZE(ipfw_insn_kidx); + insntod(dst, kidx)->kidx = src->arg1; + } else { + newlen = F_INSN_SIZE(ipfw_insn_table); + insntod(dst, table)->kidx = src->arg1; + insntod(dst, table)->value = + insntoc(src, u32)->d[0]; + } + break; + case O_CALLRETURN: + case O_SKIPTO: + newlen = F_INSN_SIZE(ipfw_insn_u32); + insntod(dst, u32)->d[0] = src->arg1; + break; + default: + newlen = cmdlen; + memcpy(dst, src, sizeof(uint32_t) * newlen); + continue; + } + dst->opcode = src->opcode; + dst->len = (src->len & (F_NOT | F_OR)) | newlen; + } +} + +/* + * Copy rule @urule from v0 userland format to kernel @krule. + */ +static void +import_rule_v0(struct ip_fw_chain *chain, struct rule_check_info *ci) +{ + struct ip_fw_rule *urule; + struct ip_fw *krule; + ipfw_insn *cmd; + int l, cmdlen, adjust, aadjust; + + urule = (struct ip_fw_rule *)ci->urule; + l = urule->cmd_len; + cmd = urule->cmd; + adjust = aadjust = 0; + + /* Scan all opcodes and determine the needed size */ + while (l > 0) { + adjust += adjust_size_v0(cmd); + if (ACTION_PTR(urule) < cmd) + aadjust = adjust; + cmdlen = F_LEN(cmd); + l -= cmdlen; + cmd += cmdlen; + } + + cmdlen = urule->cmd_len + adjust; + krule = ci->krule = ipfw_alloc_rule(chain, /* RULEKSIZE1(cmdlen) */ + roundup2(sizeof(struct ip_fw) + cmdlen * 4 - 4, 8)); + + krule->act_ofs = urule->act_ofs + aadjust; + krule->cmd_len = urule->cmd_len + adjust; + + if (adjust != 0) + printf("%s: converted rule %u: cmd_len %u -> %u, " + "act_ofs %u -> %u\n", __func__, urule->rulenum, + urule->cmd_len, krule->cmd_len, urule->act_ofs, + krule->act_ofs); + + krule->rulenum = urule->rulenum; + krule->set = urule->set; + krule->flags = urule->flags; + + /* Save rulenum offset */ + 
ci->urule_numoff = offsetof(struct ip_fw_rule, rulenum); + convert_v0_to_v1(ci, cmdlen); +} + +static int +add_rules_v0(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + ipfw_obj_ctlv *rtlv; + struct rule_check_info *ci, *nci; + int i, ret; + + /* + * Check rules buffer for validness. + */ + ret = parse_rules_v0(chain, op3, sd, &rtlv, &nci); + if (ret != 0) + return (ret); + /* + * Allocate storage for the kernel representation of rules. + */ + for (i = 0, ci = nci; i < rtlv->count; i++, ci++) + import_rule_v0(chain, ci); + /* + * Try to add new rules to the chain. + */ + if ((ret = ipfw_commit_rules(chain, nci, rtlv->count)) != 0) { + for (i = 0, ci = nci; i < rtlv->count; i++, ci++) + ipfw_free_rule(ci->krule); + } + /* Cleanup after parse_rules_v0() */ + free(nci, M_TEMP); + return (ret); +} + +static int +check_range_tlv_v0(const ipfw_range_tlv_v0 *rt, ipfw_range_tlv *crt) +{ + if (rt->head.length != sizeof(*rt)) + return (1); + if (rt->start_rule > rt->end_rule) + return (1); + if (rt->set >= IPFW_MAX_SETS || rt->new_set >= IPFW_MAX_SETS) + return (1); + if ((rt->flags & IPFW_RCFLAG_USER) != rt->flags) + return (1); + + crt->head = rt->head; + crt->head.length = sizeof(*crt); + crt->flags = rt->flags; + crt->start_rule = rt->start_rule; + crt->end_rule = rt->end_rule; + crt->set = rt->set; + crt->new_set = rt->new_set; + return (0); +} + +static int +del_rules_v0(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + ipfw_range_tlv rv; + ipfw_range_header_v0 *rh; + int error, ndel; + + if (sd->valsize != sizeof(*rh)) + return (EINVAL); + + rh = (ipfw_range_header_v0 *)ipfw_get_sopt_space(sd, sd->valsize); + if (check_range_tlv_v0(&rh->range, &rv) != 0) + return (EINVAL); + + ndel = 0; + if ((error = delete_range(chain, &rv, &ndel)) != 0) + return (error); + + /* Save number of rules deleted */ + rh->range.new_set = ndel; + return (0); +} + +static int +clear_rules_v0(struct ip_fw_chain *chain, 
ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + return (EOPNOTSUPP); +} + +static int +move_rules_v0(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + return (EOPNOTSUPP); +} + +static int +manage_sets_v0(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + return (EOPNOTSUPP); +} + +static int +dump_soptcodes_v0(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + return (EOPNOTSUPP); +} + +static int +dump_srvobjects_v0(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + return (EOPNOTSUPP); +} + +static enum ipfw_opcheck_result +check_opcode_compat(ipfw_insn **pcmd, int *plen, struct rule_check_info *ci) +{ + ipfw_insn *cmd; + size_t cmdlen; + + if (ci->version != IP_FW3_OPVER_0) + return (FAILED); + + cmd = *pcmd; + cmdlen = F_LEN(cmd); + switch (cmd->opcode) { + case O_PROBE_STATE: + case O_KEEP_STATE: + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + return (BAD_SIZE); + ci->object_opcodes++; + break; + case O_LIMIT: + if (cmdlen != F_INSN_SIZE(ipfw_insn_limit_v0)) + return (BAD_SIZE); + ci->object_opcodes++; + break; + case O_IP_SRC_LOOKUP: + if (cmdlen > F_INSN_SIZE(ipfw_insn_u32)) + return (BAD_SIZE); + /* FALLTHROUGH */ + case O_IP_DST_LOOKUP: + if (cmdlen != F_INSN_SIZE(ipfw_insn) && + cmdlen != F_INSN_SIZE(ipfw_insn_u32) + 1 && + cmdlen != F_INSN_SIZE(ipfw_insn_u32)) + return (BAD_SIZE); + if (cmd->arg1 >= V_fw_tables_max) { + printf("ipfw: invalid table number %u\n", + cmd->arg1); + return (FAILED); + } + ci->object_opcodes++; + break; + case O_IP_FLOW_LOOKUP: + if (cmdlen != F_INSN_SIZE(ipfw_insn) && + cmdlen != F_INSN_SIZE(ipfw_insn_u32)) + return (BAD_SIZE); + if (cmd->arg1 >= V_fw_tables_max) { + printf("ipfw: invalid table number %u\n", + cmd->arg1); + return (FAILED); + } + ci->object_opcodes++; + break; + case O_CHECK_STATE: + ci->object_opcodes++; + /* FALLTHROUGH */ + case O_SKIPTO: + case O_CALLRETURN: + if (cmdlen != 
F_INSN_SIZE(ipfw_insn)) + return (BAD_SIZE); + return (CHECK_ACTION); + + case O_EXTERNAL_ACTION: + if (cmd->arg1 == 0 || + cmdlen != F_INSN_SIZE(ipfw_insn)) { + printf("ipfw: invalid external " + "action opcode\n"); + return (FAILED); + } + ci->object_opcodes++; + /* + * Do we have O_EXTERNAL_INSTANCE or O_EXTERNAL_DATA + * opcode? + */ + if (*plen != cmdlen) { + *plen -= cmdlen; + *pcmd = cmd += cmdlen; + cmdlen = F_LEN(cmd); + if (cmd->opcode == O_EXTERNAL_DATA) + return (CHECK_ACTION); + if (cmd->opcode != O_EXTERNAL_INSTANCE) { + printf("ipfw: invalid opcode " + "next to external action %u\n", + cmd->opcode); + return (FAILED); + } + if (cmd->arg1 == 0 || + cmdlen != F_INSN_SIZE(ipfw_insn)) { + printf("ipfw: invalid external " + "action instance opcode\n"); + return (FAILED); + } + ci->object_opcodes++; + } + return (CHECK_ACTION); + + default: + return (ipfw_check_opcode(pcmd, plen, ci)); + } + return (SUCCESS); +} + +static int +ipfw_compat_modevent(module_t mod, int type, void *unused) +{ + switch (type) { + case MOD_LOAD: + IPFW_ADD_SOPT_HANDLER(1, scodes); + ipfw_register_compat(check_opcode_compat); + break; + case MOD_UNLOAD: + ipfw_unregister_compat(); + IPFW_DEL_SOPT_HANDLER(1, scodes); + break; + default: + return (EOPNOTSUPP); + } + return (0); +} + +static moduledata_t ipfw_compat_mod = { + "ipfw_compat", + ipfw_compat_modevent, + 0 +}; + +/* Define startup order. 
*/ +#define IPFW_COMPAT_SI_SUB_FIREWALL SI_SUB_PROTO_FIREWALL +#define IPFW_COMPAT_MODEVENT_ORDER (SI_ORDER_ANY - 128) /* after ipfw */ +#define IPFW_COMPAT_MODULE_ORDER (IPFW_COMPAT_MODEVENT_ORDER + 1) + +DECLARE_MODULE(ipfw_compat, ipfw_compat_mod, IPFW_COMPAT_SI_SUB_FIREWALL, + IPFW_COMPAT_MODULE_ORDER); +MODULE_DEPEND(ipfw_compat, ipfw, 3, 3, 3); +MODULE_VERSION(ipfw_compat, 1); diff --git a/sys/netpfil/ipfw/ip_fw_dynamic.c b/sys/netpfil/ipfw/ip_fw_dynamic.c index ff55e3360c13..40598cef8076 100644 --- a/sys/netpfil/ipfw/ip_fw_dynamic.c +++ b/sys/netpfil/ipfw/ip_fw_dynamic.c @@ -1,3300 +1,3241 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * - * Copyright (c) 2017-2018 Yandex LLC - * Copyright (c) 2017-2018 Andrey V. Elsukov + * Copyright (c) 2017-2025 Yandex LLC + * Copyright (c) 2017-2025 Andrey V. Elsukov * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipfw.h" #ifndef INET #error IPFIREWALL requires INET. #endif /* INET */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include +#include #include /* IN6_ARE_ADDR_EQUAL */ #ifdef INET6 #include #include #include #endif #include #include /* XXX for in_cksum */ #ifdef MAC #include #endif /* * Description of dynamic states. * * Dynamic states are stored in lists accessed through a hash tables * whose size is curr_dyn_buckets. This value can be modified through * the sysctl variable dyn_buckets. * * Currently there are four tables: dyn_ipv4, dyn_ipv6, dyn_ipv4_parent, * and dyn_ipv6_parent. * * When a packet is received, its address fields hashed, then matched * against the entries in the corresponding list by addr_type. * Dynamic states can be used for different purposes: * + stateful rules; * + enforcing limits on the number of sessions; * + in-kernel NAT (not implemented yet) * * The lifetime of dynamic states is regulated by dyn_*_lifetime, * measured in seconds and depending on the flags. * * The total number of dynamic states is equal to UMA zone items count. * The max number of dynamic states is dyn_max. When we reach * the maximum number of rules we do not create anymore. 
This is * done to avoid consuming too much memory, but also too much * time when searching on each packet (ideally, we should try instead * to put a limit on the length of the list on each bucket...). * * Each state holds a pointer to the parent ipfw rule so we know what * action to perform. Dynamic rules are removed when the parent rule is * deleted. * * There are some limitations with dynamic rules -- we do not * obey the 'randomized match', and we do not do multiple * passes through the firewall. XXX check the latter!!! */ /* By default use jenkins hash function */ #define IPFIREWALL_JENKINSHASH #define DYN_COUNTER_INC(d, dir, pktlen) do { \ (d)->pcnt_ ## dir++; \ (d)->bcnt_ ## dir += pktlen; \ } while (0) #define DYN_REFERENCED 0x01 /* * DYN_REFERENCED flag is used to show that state keeps reference to named * object, and this reference should be released when state becomes expired. */ struct dyn_data { void *parent; /* pointer to parent rule */ uint32_t chain_id; /* cached ruleset id */ uint32_t f_pos; /* cached rule index */ uint32_t hashval; /* hash value used for hash resize */ uint16_t fibnum; /* fib used to send keepalives */ - uint8_t _pad[3]; + uint8_t _pad; uint8_t flags; /* internal flags */ - uint16_t rulenum; /* parent rule number */ + uint32_t rulenum; /* parent rule number */ uint32_t ruleid; /* parent rule id */ uint32_t state; /* TCP session state and flags */ uint32_t ack_fwd; /* most recent ACKs in forward */ uint32_t ack_rev; /* and reverse direction (used */ /* to generate keepalives) */ uint32_t sync; /* synchronization time */ uint32_t expire; /* expire time */ uint64_t pcnt_fwd; /* packets counter in forward */ uint64_t bcnt_fwd; /* bytes counter in forward */ uint64_t pcnt_rev; /* packets counter in reverse */ uint64_t bcnt_rev; /* bytes counter in reverse */ }; #define DPARENT_COUNT_DEC(p) do { \ MPASS(p->count > 0); \ ck_pr_dec_32(&(p)->count); \ } while (0) #define DPARENT_COUNT_INC(p) ck_pr_inc_32(&(p)->count) #define 
DPARENT_COUNT(p) ck_pr_load_32(&(p)->count) struct dyn_parent { void *parent; /* pointer to parent rule */ uint32_t count; /* number of linked states */ - uint8_t _pad[2]; - uint16_t rulenum; /* parent rule number */ + uint32_t rulenum; /* parent rule number */ uint32_t ruleid; /* parent rule id */ uint32_t hashval; /* hash value used for hash resize */ uint32_t expire; /* expire time */ }; struct dyn_ipv4_state { uint8_t type; /* State type */ uint8_t proto; /* UL Protocol */ - uint16_t kidx; /* named object index */ + uint16_t spare; + uint32_t kidx; /* named object index */ uint16_t sport, dport; /* ULP source and destination ports */ in_addr_t src, dst; /* IPv4 source and destination */ union { struct dyn_data *data; struct dyn_parent *limit; }; CK_SLIST_ENTRY(dyn_ipv4_state) entry; SLIST_ENTRY(dyn_ipv4_state) expired; }; CK_SLIST_HEAD(dyn_ipv4ck_slist, dyn_ipv4_state); VNET_DEFINE_STATIC(struct dyn_ipv4ck_slist *, dyn_ipv4); VNET_DEFINE_STATIC(struct dyn_ipv4ck_slist *, dyn_ipv4_parent); SLIST_HEAD(dyn_ipv4_slist, dyn_ipv4_state); VNET_DEFINE_STATIC(struct dyn_ipv4_slist, dyn_expired_ipv4); #define V_dyn_ipv4 VNET(dyn_ipv4) #define V_dyn_ipv4_parent VNET(dyn_ipv4_parent) #define V_dyn_expired_ipv4 VNET(dyn_expired_ipv4) #ifdef INET6 struct dyn_ipv6_state { uint8_t type; /* State type */ uint8_t proto; /* UL Protocol */ uint16_t kidx; /* named object index */ uint16_t sport, dport; /* ULP source and destination ports */ struct in6_addr src, dst; /* IPv6 source and destination */ uint32_t zoneid; /* IPv6 scope zone id */ union { struct dyn_data *data; struct dyn_parent *limit; }; CK_SLIST_ENTRY(dyn_ipv6_state) entry; SLIST_ENTRY(dyn_ipv6_state) expired; }; CK_SLIST_HEAD(dyn_ipv6ck_slist, dyn_ipv6_state); VNET_DEFINE_STATIC(struct dyn_ipv6ck_slist *, dyn_ipv6); VNET_DEFINE_STATIC(struct dyn_ipv6ck_slist *, dyn_ipv6_parent); SLIST_HEAD(dyn_ipv6_slist, dyn_ipv6_state); VNET_DEFINE_STATIC(struct dyn_ipv6_slist, dyn_expired_ipv6); #define V_dyn_ipv6 VNET(dyn_ipv6) 
#define V_dyn_ipv6_parent VNET(dyn_ipv6_parent) #define V_dyn_expired_ipv6 VNET(dyn_expired_ipv6) #endif /* INET6 */ /* * Per-CPU pointer indicates that specified state is currently in use * and must not be reclaimed by expiration callout. */ static void **dyn_hp_cache; DPCPU_DEFINE_STATIC(void *, dyn_hp); #define DYNSTATE_GET(cpu) ck_pr_load_ptr(DPCPU_ID_PTR((cpu), dyn_hp)) #define DYNSTATE_PROTECT(v) ck_pr_store_ptr(DPCPU_PTR(dyn_hp), (v)) #define DYNSTATE_RELEASE() DYNSTATE_PROTECT(NULL) #define DYNSTATE_CRITICAL_ENTER() critical_enter() #define DYNSTATE_CRITICAL_EXIT() do { \ DYNSTATE_RELEASE(); \ critical_exit(); \ } while (0); /* * We keep two version numbers, one is updated when new entry added to * the list. Second is updated when an entry deleted from the list. * Versions are updated under bucket lock. * * Bucket "add" version number is used to know, that in the time between * state lookup (i.e. ipfw_dyn_lookup_state()) and the followed state * creation (i.e. ipfw_dyn_install_state()) another concurrent thread did * not install some state in this bucket. Using this info we can avoid * additional state lookup, because we are sure that we will not install * the state twice. * * Also doing the tracking of bucket "del" version during lookup we can * be sure, that state entry was not unlinked and freed in time between * we read the state pointer and protect it with hazard pointer. * * An entry unlinked from CK list keeps unchanged until it is freed. * Unlinked entries are linked into expired lists using "expired" field. */ /* * dyn_expire_lock is used to protect access to dyn_expired_xxx lists. * dyn_bucket_lock is used to get write access to lists in specific bucket. * Currently one dyn_bucket_lock is used for all ipv4, ipv4_parent, ipv6, * and ipv6_parent lists. 
*/ VNET_DEFINE_STATIC(struct mtx, dyn_expire_lock); VNET_DEFINE_STATIC(struct mtx *, dyn_bucket_lock); #define V_dyn_expire_lock VNET(dyn_expire_lock) #define V_dyn_bucket_lock VNET(dyn_bucket_lock) /* * Bucket's add/delete generation versions. */ VNET_DEFINE_STATIC(uint32_t *, dyn_ipv4_add); VNET_DEFINE_STATIC(uint32_t *, dyn_ipv4_del); VNET_DEFINE_STATIC(uint32_t *, dyn_ipv4_parent_add); VNET_DEFINE_STATIC(uint32_t *, dyn_ipv4_parent_del); #define V_dyn_ipv4_add VNET(dyn_ipv4_add) #define V_dyn_ipv4_del VNET(dyn_ipv4_del) #define V_dyn_ipv4_parent_add VNET(dyn_ipv4_parent_add) #define V_dyn_ipv4_parent_del VNET(dyn_ipv4_parent_del) #ifdef INET6 VNET_DEFINE_STATIC(uint32_t *, dyn_ipv6_add); VNET_DEFINE_STATIC(uint32_t *, dyn_ipv6_del); VNET_DEFINE_STATIC(uint32_t *, dyn_ipv6_parent_add); VNET_DEFINE_STATIC(uint32_t *, dyn_ipv6_parent_del); #define V_dyn_ipv6_add VNET(dyn_ipv6_add) #define V_dyn_ipv6_del VNET(dyn_ipv6_del) #define V_dyn_ipv6_parent_add VNET(dyn_ipv6_parent_add) #define V_dyn_ipv6_parent_del VNET(dyn_ipv6_parent_del) #endif /* INET6 */ #define DYN_BUCKET(h, b) ((h) & (b - 1)) #define DYN_BUCKET_VERSION(b, v) ck_pr_load_32(&V_dyn_ ## v[(b)]) #define DYN_BUCKET_VERSION_BUMP(b, v) ck_pr_inc_32(&V_dyn_ ## v[(b)]) #define DYN_BUCKET_LOCK_INIT(lock, b) \ mtx_init(&lock[(b)], "IPFW dynamic bucket", NULL, MTX_DEF) #define DYN_BUCKET_LOCK_DESTROY(lock, b) mtx_destroy(&lock[(b)]) #define DYN_BUCKET_LOCK(b) mtx_lock(&V_dyn_bucket_lock[(b)]) #define DYN_BUCKET_UNLOCK(b) mtx_unlock(&V_dyn_bucket_lock[(b)]) #define DYN_BUCKET_ASSERT(b) mtx_assert(&V_dyn_bucket_lock[(b)], MA_OWNED) #define DYN_EXPIRED_LOCK_INIT() \ mtx_init(&V_dyn_expire_lock, "IPFW expired states list", NULL, MTX_DEF) #define DYN_EXPIRED_LOCK_DESTROY() mtx_destroy(&V_dyn_expire_lock) #define DYN_EXPIRED_LOCK() mtx_lock(&V_dyn_expire_lock) #define DYN_EXPIRED_UNLOCK() mtx_unlock(&V_dyn_expire_lock) VNET_DEFINE_STATIC(uint32_t, dyn_buckets_max); VNET_DEFINE_STATIC(uint32_t, curr_dyn_buckets); 
VNET_DEFINE_STATIC(struct callout, dyn_timeout); #define V_dyn_buckets_max VNET(dyn_buckets_max) #define V_curr_dyn_buckets VNET(curr_dyn_buckets) #define V_dyn_timeout VNET(dyn_timeout) /* Maximum length of states chain in a bucket */ VNET_DEFINE_STATIC(uint32_t, curr_max_length); #define V_curr_max_length VNET(curr_max_length) VNET_DEFINE_STATIC(uint32_t, dyn_keep_states); #define V_dyn_keep_states VNET(dyn_keep_states) VNET_DEFINE_STATIC(uma_zone_t, dyn_data_zone); VNET_DEFINE_STATIC(uma_zone_t, dyn_parent_zone); VNET_DEFINE_STATIC(uma_zone_t, dyn_ipv4_zone); #ifdef INET6 VNET_DEFINE_STATIC(uma_zone_t, dyn_ipv6_zone); #define V_dyn_ipv6_zone VNET(dyn_ipv6_zone) #endif /* INET6 */ #define V_dyn_data_zone VNET(dyn_data_zone) #define V_dyn_parent_zone VNET(dyn_parent_zone) #define V_dyn_ipv4_zone VNET(dyn_ipv4_zone) /* * Timeouts for various events in handing dynamic rules. */ VNET_DEFINE_STATIC(uint32_t, dyn_ack_lifetime); VNET_DEFINE_STATIC(uint32_t, dyn_syn_lifetime); VNET_DEFINE_STATIC(uint32_t, dyn_fin_lifetime); VNET_DEFINE_STATIC(uint32_t, dyn_rst_lifetime); VNET_DEFINE_STATIC(uint32_t, dyn_udp_lifetime); VNET_DEFINE_STATIC(uint32_t, dyn_short_lifetime); #define V_dyn_ack_lifetime VNET(dyn_ack_lifetime) #define V_dyn_syn_lifetime VNET(dyn_syn_lifetime) #define V_dyn_fin_lifetime VNET(dyn_fin_lifetime) #define V_dyn_rst_lifetime VNET(dyn_rst_lifetime) #define V_dyn_udp_lifetime VNET(dyn_udp_lifetime) #define V_dyn_short_lifetime VNET(dyn_short_lifetime) /* * Keepalives are sent if dyn_keepalive is set. They are sent every * dyn_keepalive_period seconds, in the last dyn_keepalive_interval * seconds of lifetime of a rule. * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower * than dyn_keepalive_period. 
*/ VNET_DEFINE_STATIC(uint32_t, dyn_keepalive_interval); VNET_DEFINE_STATIC(uint32_t, dyn_keepalive_period); VNET_DEFINE_STATIC(uint32_t, dyn_keepalive); VNET_DEFINE_STATIC(time_t, dyn_keepalive_last); #define V_dyn_keepalive_interval VNET(dyn_keepalive_interval) #define V_dyn_keepalive_period VNET(dyn_keepalive_period) #define V_dyn_keepalive VNET(dyn_keepalive) #define V_dyn_keepalive_last VNET(dyn_keepalive_last) VNET_DEFINE_STATIC(uint32_t, dyn_max); /* max # of dynamic states */ VNET_DEFINE_STATIC(uint32_t, dyn_count); /* number of states */ VNET_DEFINE_STATIC(uint32_t, dyn_parent_max); /* max # of parent states */ VNET_DEFINE_STATIC(uint32_t, dyn_parent_count); /* number of parent states */ #define V_dyn_max VNET(dyn_max) #define V_dyn_count VNET(dyn_count) #define V_dyn_parent_max VNET(dyn_parent_max) #define V_dyn_parent_count VNET(dyn_parent_count) #define DYN_COUNT_DEC(name) do { \ MPASS((V_ ## name) > 0); \ ck_pr_dec_32(&(V_ ## name)); \ } while (0) #define DYN_COUNT_INC(name) ck_pr_inc_32(&(V_ ## name)) #define DYN_COUNT(name) ck_pr_load_32(&(V_ ## name)) static time_t last_log; /* Log ratelimiting */ /* * Get/set maximum number of dynamic states in given VNET instance. 
*/ static int sysctl_dyn_max(SYSCTL_HANDLER_ARGS) { uint32_t nstates; int error; nstates = V_dyn_max; error = sysctl_handle_32(oidp, &nstates, 0, req); /* Read operation or some error */ if ((error != 0) || (req->newptr == NULL)) return (error); V_dyn_max = nstates; uma_zone_set_max(V_dyn_data_zone, V_dyn_max); return (0); } static int sysctl_dyn_parent_max(SYSCTL_HANDLER_ARGS) { uint32_t nstates; int error; nstates = V_dyn_parent_max; error = sysctl_handle_32(oidp, &nstates, 0, req); /* Read operation or some error */ if ((error != 0) || (req->newptr == NULL)) return (error); V_dyn_parent_max = nstates; uma_zone_set_max(V_dyn_parent_zone, V_dyn_parent_max); return (0); } static int sysctl_dyn_buckets(SYSCTL_HANDLER_ARGS) { uint32_t nbuckets; int error; nbuckets = V_dyn_buckets_max; error = sysctl_handle_32(oidp, &nbuckets, 0, req); /* Read operation or some error */ if ((error != 0) || (req->newptr == NULL)) return (error); if (nbuckets > 256) V_dyn_buckets_max = 1 << fls(nbuckets - 1); else return (EINVAL); return (0); } SYSCTL_DECL(_net_inet_ip_fw); SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_count, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(dyn_count), 0, "Current number of dynamic states."); SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_parent_count, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(dyn_parent_count), 0, "Current number of parent states. 
"); SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(curr_dyn_buckets), 0, "Current number of buckets for states hash table."); SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, curr_max_length, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(curr_max_length), 0, "Current maximum length of states chains in hash buckets."); SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_buckets, CTLFLAG_VNET | CTLTYPE_U32 | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 0, 0, sysctl_dyn_buckets, "IU", "Max number of buckets for dynamic states hash table."); SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_max, CTLFLAG_VNET | CTLTYPE_U32 | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 0, 0, sysctl_dyn_max, "IU", "Max number of dynamic states."); SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_parent_max, CTLFLAG_VNET | CTLTYPE_U32 | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 0, 0, sysctl_dyn_parent_max, "IU", "Max number of parent dynamic states."); SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_ack_lifetime), 0, "Lifetime of dynamic states for TCP ACK."); SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_syn_lifetime), 0, "Lifetime of dynamic states for TCP SYN."); SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_fin_lifetime), 0, "Lifetime of dynamic states for TCP FIN."); SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_rst_lifetime), 0, "Lifetime of dynamic states for TCP RST."); SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_udp_lifetime), 0, "Lifetime of dynamic states for UDP."); SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_short_lifetime), 0, "Lifetime of dynamic states for other situations."); SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_keepalive), 0, "Enable 
keepalives for dynamic states."); SYSCTL_U32(_net_inet_ip_fw, OID_AUTO, dyn_keep_states, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_keep_states), 0, "Do not flush dynamic states on rule deletion"); #ifdef IPFIREWALL_DYNDEBUG #define DYN_DEBUG(fmt, ...) do { \ printf("%s: " fmt "\n", __func__, __VA_ARGS__); \ } while (0) #else #define DYN_DEBUG(fmt, ...) #endif /* !IPFIREWALL_DYNDEBUG */ #ifdef INET6 /* Functions to work with IPv6 states */ static struct dyn_ipv6_state *dyn_lookup_ipv6_state( const struct ipfw_flow_id *, uint32_t, const void *, struct ipfw_dyn_info *, int); static int dyn_lookup_ipv6_state_locked(const struct ipfw_flow_id *, - uint32_t, const void *, int, uint32_t, uint16_t); + uint32_t, const void *, int, uint32_t, uint32_t); static struct dyn_ipv6_state *dyn_alloc_ipv6_state( - const struct ipfw_flow_id *, uint32_t, uint16_t, uint8_t); -static int dyn_add_ipv6_state(void *, uint32_t, uint16_t, + const struct ipfw_flow_id *, uint32_t, uint32_t, uint8_t); +static int dyn_add_ipv6_state(void *, uint32_t, uint32_t, const struct ipfw_flow_id *, uint32_t, const void *, int, uint32_t, - struct ipfw_dyn_info *, uint16_t, uint16_t, uint8_t); + struct ipfw_dyn_info *, uint16_t, uint32_t, uint8_t); static void dyn_export_ipv6_state(const struct dyn_ipv6_state *, ipfw_dyn_rule *); static uint32_t dyn_getscopeid(const struct ip_fw_args *); static void dyn_make_keepalive_ipv6(struct mbuf *, const struct in6_addr *, const struct in6_addr *, uint32_t, uint32_t, uint32_t, uint16_t, uint16_t); static void dyn_enqueue_keepalive_ipv6(struct mbufq *, const struct dyn_ipv6_state *); static void dyn_send_keepalive_ipv6(struct ip_fw_chain *); static struct dyn_ipv6_state *dyn_lookup_ipv6_parent( - const struct ipfw_flow_id *, uint32_t, const void *, uint32_t, uint16_t, + const struct ipfw_flow_id *, uint32_t, const void *, uint32_t, uint32_t, uint32_t); static struct dyn_ipv6_state *dyn_lookup_ipv6_parent_locked( - const struct ipfw_flow_id *, uint32_t, const void *, 
uint32_t, uint16_t, + const struct ipfw_flow_id *, uint32_t, const void *, uint32_t, uint32_t, uint32_t); -static struct dyn_ipv6_state *dyn_add_ipv6_parent(void *, uint32_t, uint16_t, - const struct ipfw_flow_id *, uint32_t, uint32_t, uint32_t, uint16_t); +static struct dyn_ipv6_state *dyn_add_ipv6_parent(void *, uint32_t, uint32_t, + const struct ipfw_flow_id *, uint32_t, uint32_t, uint32_t, uint32_t); #endif /* INET6 */ /* Functions to work with limit states */ static void *dyn_get_parent_state(const struct ipfw_flow_id *, uint32_t, - struct ip_fw *, uint32_t, uint32_t, uint16_t); + struct ip_fw *, uint32_t, uint32_t, uint32_t); static struct dyn_ipv4_state *dyn_lookup_ipv4_parent( - const struct ipfw_flow_id *, const void *, uint32_t, uint16_t, uint32_t); + const struct ipfw_flow_id *, const void *, uint32_t, uint32_t, uint32_t); static struct dyn_ipv4_state *dyn_lookup_ipv4_parent_locked( - const struct ipfw_flow_id *, const void *, uint32_t, uint16_t, uint32_t); -static struct dyn_parent *dyn_alloc_parent(void *, uint32_t, uint16_t, + const struct ipfw_flow_id *, const void *, uint32_t, uint32_t, uint32_t); +static struct dyn_parent *dyn_alloc_parent(void *, uint32_t, uint32_t, uint32_t); -static struct dyn_ipv4_state *dyn_add_ipv4_parent(void *, uint32_t, uint16_t, - const struct ipfw_flow_id *, uint32_t, uint32_t, uint16_t); +static struct dyn_ipv4_state *dyn_add_ipv4_parent(void *, uint32_t, uint32_t, + const struct ipfw_flow_id *, uint32_t, uint32_t, uint32_t); static void dyn_tick(void *); static void dyn_expire_states(struct ip_fw_chain *, ipfw_range_tlv *); static void dyn_free_states(struct ip_fw_chain *); -static void dyn_export_parent(const struct dyn_parent *, uint16_t, uint8_t, +static void dyn_export_parent(const struct dyn_parent *, uint32_t, uint8_t, ipfw_dyn_rule *); -static void dyn_export_data(const struct dyn_data *, uint16_t, uint8_t, +static void dyn_export_data(const struct dyn_data *, uint32_t, uint8_t, uint8_t, ipfw_dyn_rule *); static 
uint32_t dyn_update_tcp_state(struct dyn_data *, const struct ipfw_flow_id *, const struct tcphdr *, int); static void dyn_update_proto_state(struct dyn_data *, const struct ipfw_flow_id *, const void *, int, int); /* Functions to work with IPv4 states */ struct dyn_ipv4_state *dyn_lookup_ipv4_state(const struct ipfw_flow_id *, const void *, struct ipfw_dyn_info *, int); static int dyn_lookup_ipv4_state_locked(const struct ipfw_flow_id *, - const void *, int, uint32_t, uint16_t); + const void *, int, uint32_t, uint32_t); static struct dyn_ipv4_state *dyn_alloc_ipv4_state( - const struct ipfw_flow_id *, uint16_t, uint8_t); -static int dyn_add_ipv4_state(void *, uint32_t, uint16_t, + const struct ipfw_flow_id *, uint32_t, uint8_t); +static int dyn_add_ipv4_state(void *, uint32_t, uint32_t, const struct ipfw_flow_id *, const void *, int, uint32_t, - struct ipfw_dyn_info *, uint16_t, uint16_t, uint8_t); + struct ipfw_dyn_info *, uint16_t, uint32_t, uint8_t); static void dyn_export_ipv4_state(const struct dyn_ipv4_state *, ipfw_dyn_rule *); /* * Named states support. */ static char *default_state_name = "default"; struct dyn_state_obj { struct named_object no; char name[64]; }; -#define DYN_STATE_OBJ(ch, cmd) \ - ((struct dyn_state_obj *)SRV_OBJECT(ch, (cmd)->arg1)) /* * Classifier callback. * Return 0 if opcode contains object that should be referenced * or rewritten. */ static int -dyn_classify(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype) +dyn_classify(ipfw_insn *cmd0, uint32_t *puidx, uint8_t *ptype) { + ipfw_insn_kidx *cmd; - DYN_DEBUG("opcode %d, arg1 %d", cmd->opcode, cmd->arg1); + if (F_LEN(cmd0) < 2) + return (EINVAL); + + /* + * NOTE: ipfw_insn_kidx and ipfw_insn_limit has overlapped kidx + * field, so we can use one type to get access to kidx field. 
+ */ + cmd = insntod(cmd0, kidx); + DYN_DEBUG("opcode %u, kidx %u", cmd0->opcode, cmd->kidx); /* Don't rewrite "check-state any" */ - if (cmd->arg1 == 0 && - cmd->opcode == O_CHECK_STATE) + if (cmd->kidx == 0 && + cmd0->opcode == O_CHECK_STATE) return (1); - *puidx = cmd->arg1; + *puidx = cmd->kidx; *ptype = 0; return (0); } static void -dyn_update(ipfw_insn *cmd, uint16_t idx) +dyn_update(ipfw_insn *cmd0, uint32_t idx) { - cmd->arg1 = idx; - DYN_DEBUG("opcode %d, arg1 %d", cmd->opcode, cmd->arg1); + insntod(cmd0, kidx)->kidx = idx; + DYN_DEBUG("opcode %u, kidx %u", cmd0->opcode, idx); } static int dyn_findbyname(struct ip_fw_chain *ch, struct tid_info *ti, struct named_object **pno) { ipfw_obj_ntlv *ntlv; const char *name; - DYN_DEBUG("uidx %d", ti->uidx); + DYN_DEBUG("uidx %u", ti->uidx); if (ti->uidx != 0) { if (ti->tlvs == NULL) return (EINVAL); /* Search ntlv in the buffer provided by user */ ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx, IPFW_TLV_STATE_NAME); if (ntlv == NULL) return (EINVAL); name = ntlv->name; } else name = default_state_name; /* * Search named object with corresponding name. * Since states objects are global - ignore the set value * and use zero instead. */ *pno = ipfw_objhash_lookup_name_type(CHAIN_TO_SRV(ch), 0, IPFW_TLV_STATE_NAME, name); /* * We always return success here. * The caller will check *pno and mark object as unresolved, * then it will automatically create "default" object. 
*/ return (0); } static struct named_object * -dyn_findbykidx(struct ip_fw_chain *ch, uint16_t idx) +dyn_findbykidx(struct ip_fw_chain *ch, uint32_t idx) { - DYN_DEBUG("kidx %d", idx); + DYN_DEBUG("kidx %u", idx); return (ipfw_objhash_lookup_kidx(CHAIN_TO_SRV(ch), idx)); } static int dyn_create(struct ip_fw_chain *ch, struct tid_info *ti, - uint16_t *pkidx) + uint32_t *pkidx) { struct namedobj_instance *ni; struct dyn_state_obj *obj; struct named_object *no; ipfw_obj_ntlv *ntlv; char *name; - DYN_DEBUG("uidx %d", ti->uidx); + DYN_DEBUG("uidx %u", ti->uidx); if (ti->uidx != 0) { if (ti->tlvs == NULL) return (EINVAL); ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx, IPFW_TLV_STATE_NAME); if (ntlv == NULL) return (EINVAL); name = ntlv->name; } else name = default_state_name; ni = CHAIN_TO_SRV(ch); obj = malloc(sizeof(*obj), M_IPFW, M_WAITOK | M_ZERO); obj->no.name = obj->name; obj->no.etlv = IPFW_TLV_STATE_NAME; strlcpy(obj->name, name, sizeof(obj->name)); IPFW_UH_WLOCK(ch); no = ipfw_objhash_lookup_name_type(ni, 0, IPFW_TLV_STATE_NAME, name); if (no != NULL) { /* * Object is already created. * Just return its kidx and bump refcount. 
*/ *pkidx = no->kidx; no->refcnt++; IPFW_UH_WUNLOCK(ch); free(obj, M_IPFW); - DYN_DEBUG("\tfound kidx %d", *pkidx); + DYN_DEBUG("\tfound kidx %u for name '%s'", *pkidx, no->name); return (0); } if (ipfw_objhash_alloc_idx(ni, &obj->no.kidx) != 0) { DYN_DEBUG("\talloc_idx failed for %s", name); IPFW_UH_WUNLOCK(ch); free(obj, M_IPFW); return (ENOSPC); } ipfw_objhash_add(ni, &obj->no); SRV_OBJECT(ch, obj->no.kidx) = obj; obj->no.refcnt++; *pkidx = obj->no.kidx; IPFW_UH_WUNLOCK(ch); - DYN_DEBUG("\tcreated kidx %d", *pkidx); + DYN_DEBUG("\tcreated kidx %u for name '%s'", *pkidx, name); return (0); } static void dyn_destroy(struct ip_fw_chain *ch, struct named_object *no) { struct dyn_state_obj *obj; IPFW_UH_WLOCK_ASSERT(ch); KASSERT(no->etlv == IPFW_TLV_STATE_NAME, ("%s: wrong object type %u", __func__, no->etlv)); KASSERT(no->refcnt == 1, ("Destroying object '%s' (type %u, idx %u) with refcnt %u", no->name, no->etlv, no->kidx, no->refcnt)); - DYN_DEBUG("kidx %d", no->kidx); + DYN_DEBUG("kidx %u", no->kidx); obj = SRV_OBJECT(ch, no->kidx); SRV_OBJECT(ch, no->kidx) = NULL; ipfw_objhash_del(CHAIN_TO_SRV(ch), no); ipfw_objhash_free_idx(CHAIN_TO_SRV(ch), no->kidx); free(obj, M_IPFW); } static struct opcode_obj_rewrite dyn_opcodes[] = { { O_KEEP_STATE, IPFW_TLV_STATE_NAME, dyn_classify, dyn_update, dyn_findbyname, dyn_findbykidx, dyn_create, dyn_destroy }, { O_CHECK_STATE, IPFW_TLV_STATE_NAME, dyn_classify, dyn_update, dyn_findbyname, dyn_findbykidx, dyn_create, dyn_destroy }, { O_PROBE_STATE, IPFW_TLV_STATE_NAME, dyn_classify, dyn_update, dyn_findbyname, dyn_findbykidx, dyn_create, dyn_destroy }, { O_LIMIT, IPFW_TLV_STATE_NAME, dyn_classify, dyn_update, dyn_findbyname, dyn_findbykidx, dyn_create, dyn_destroy }, }; /* * IMPORTANT: the hash function for dynamic rules must be commutative * in source and destination (ip,port), because rules are bidirectional * and we want to find both in the same bucket. 
*/ #ifndef IPFIREWALL_JENKINSHASH static __inline uint32_t hash_packet(const struct ipfw_flow_id *id) { uint32_t i; #ifdef INET6 if (IS_IP6_FLOW_ID(id)) i = ntohl((id->dst_ip6.__u6_addr.__u6_addr32[2]) ^ (id->dst_ip6.__u6_addr.__u6_addr32[3]) ^ (id->src_ip6.__u6_addr.__u6_addr32[2]) ^ (id->src_ip6.__u6_addr.__u6_addr32[3])); else #endif /* INET6 */ i = (id->dst_ip) ^ (id->src_ip); i ^= (id->dst_port) ^ (id->src_port); return (i); } static __inline uint32_t hash_parent(const struct ipfw_flow_id *id, const void *rule) { return (hash_packet(id) ^ ((uintptr_t)rule)); } #else /* IPFIREWALL_JENKINSHASH */ VNET_DEFINE_STATIC(uint32_t, dyn_hashseed); #define V_dyn_hashseed VNET(dyn_hashseed) static __inline int addrcmp4(const struct ipfw_flow_id *id) { if (id->src_ip < id->dst_ip) return (0); if (id->src_ip > id->dst_ip) return (1); if (id->src_port <= id->dst_port) return (0); return (1); } #ifdef INET6 static __inline int addrcmp6(const struct ipfw_flow_id *id) { int ret; ret = memcmp(&id->src_ip6, &id->dst_ip6, sizeof(struct in6_addr)); if (ret < 0) return (0); if (ret > 0) return (1); if (id->src_port <= id->dst_port) return (0); return (1); } static __inline uint32_t hash_packet6(const struct ipfw_flow_id *id) { struct tuple6 { struct in6_addr addr[2]; uint16_t port[2]; } t6; if (addrcmp6(id) == 0) { t6.addr[0] = id->src_ip6; t6.addr[1] = id->dst_ip6; t6.port[0] = id->src_port; t6.port[1] = id->dst_port; } else { t6.addr[0] = id->dst_ip6; t6.addr[1] = id->src_ip6; t6.port[0] = id->dst_port; t6.port[1] = id->src_port; } return (jenkins_hash32((const uint32_t *)&t6, sizeof(t6) / sizeof(uint32_t), V_dyn_hashseed)); } #endif static __inline uint32_t hash_packet(const struct ipfw_flow_id *id) { struct tuple4 { in_addr_t addr[2]; uint16_t port[2]; } t4; if (IS_IP4_FLOW_ID(id)) { /* All fields are in host byte order */ if (addrcmp4(id) == 0) { t4.addr[0] = id->src_ip; t4.addr[1] = id->dst_ip; t4.port[0] = id->src_port; t4.port[1] = id->dst_port; } else { t4.addr[0] = 
id->dst_ip; t4.addr[1] = id->src_ip; t4.port[0] = id->dst_port; t4.port[1] = id->src_port; } return (jenkins_hash32((const uint32_t *)&t4, sizeof(t4) / sizeof(uint32_t), V_dyn_hashseed)); } else #ifdef INET6 if (IS_IP6_FLOW_ID(id)) return (hash_packet6(id)); #endif return (0); } static __inline uint32_t hash_parent(const struct ipfw_flow_id *id, const void *rule) { return (jenkins_hash32((const uint32_t *)&rule, sizeof(rule) / sizeof(uint32_t), hash_packet(id))); } #endif /* IPFIREWALL_JENKINSHASH */ /* * Print customizable flow id description via log(9) facility. */ static void print_dyn_rule_flags(const struct ipfw_flow_id *id, int dyn_type, int log_flags, char *prefix, char *postfix) { struct in_addr da; #ifdef INET6 char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN]; #else char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN]; #endif #ifdef INET6 if (IS_IP6_FLOW_ID(id)) { ip6_sprintf(src, &id->src_ip6); ip6_sprintf(dst, &id->dst_ip6); } else #endif { da.s_addr = htonl(id->src_ip); inet_ntop(AF_INET, &da, src, sizeof(src)); da.s_addr = htonl(id->dst_ip); inet_ntop(AF_INET, &da, dst, sizeof(dst)); } log(log_flags, "ipfw: %s type %d %s %d -> %s %d, %d %s\n", prefix, dyn_type, src, id->src_port, dst, id->dst_port, V_dyn_count, postfix); } #define print_dyn_rule(id, dtype, prefix, postfix) \ print_dyn_rule_flags(id, dtype, LOG_DEBUG, prefix, postfix) #define TIME_LEQ(a,b) ((int)((a)-(b)) <= 0) #define TIME_LE(a,b) ((int)((a)-(b)) < 0) #define _SEQ_GE(a,b) ((int)((a)-(b)) >= 0) #define BOTH_SYN (TH_SYN | (TH_SYN << 8)) #define BOTH_FIN (TH_FIN | (TH_FIN << 8)) #define BOTH_RST (TH_RST | (TH_RST << 8)) #define TCP_FLAGS (BOTH_SYN | BOTH_FIN | BOTH_RST) #define ACK_FWD 0x00010000 /* fwd ack seen */ #define ACK_REV 0x00020000 /* rev ack seen */ #define ACK_BOTH (ACK_FWD | ACK_REV) static uint32_t dyn_update_tcp_state(struct dyn_data *data, const struct ipfw_flow_id *pkt, const struct tcphdr *tcp, int dir) { uint32_t ack, expire; uint32_t state, old; uint8_t th_flags; expire = 
data->expire; old = state = data->state; th_flags = pkt->_flags & (TH_FIN | TH_SYN | TH_RST); state |= (dir == MATCH_FORWARD) ? th_flags: (th_flags << 8); switch (state & TCP_FLAGS) { case TH_SYN: /* opening */ expire = time_uptime + V_dyn_syn_lifetime; break; case BOTH_SYN: /* move to established */ case BOTH_SYN | TH_FIN: /* one side tries to close */ case BOTH_SYN | (TH_FIN << 8): if (tcp == NULL) break; ack = ntohl(tcp->th_ack); if (dir == MATCH_FORWARD) { if (data->ack_fwd == 0 || _SEQ_GE(ack, data->ack_fwd)) { state |= ACK_FWD; if (data->ack_fwd != ack) ck_pr_store_32(&data->ack_fwd, ack); } } else { if (data->ack_rev == 0 || _SEQ_GE(ack, data->ack_rev)) { state |= ACK_REV; if (data->ack_rev != ack) ck_pr_store_32(&data->ack_rev, ack); } } if ((state & ACK_BOTH) == ACK_BOTH) { /* * Set expire time to V_dyn_ack_lifetime only if * we got ACKs for both directions. * We use XOR here to avoid possible state * overwriting in concurrent thread. */ expire = time_uptime + V_dyn_ack_lifetime; ck_pr_xor_32(&data->state, ACK_BOTH); } else if ((data->state & ACK_BOTH) != (state & ACK_BOTH)) ck_pr_or_32(&data->state, state & ACK_BOTH); break; case BOTH_SYN | BOTH_FIN: /* both sides closed */ if (V_dyn_fin_lifetime >= V_dyn_keepalive_period) V_dyn_fin_lifetime = V_dyn_keepalive_period - 1; expire = time_uptime + V_dyn_fin_lifetime; break; default: if (V_dyn_keepalive != 0 && V_dyn_rst_lifetime >= V_dyn_keepalive_period) V_dyn_rst_lifetime = V_dyn_keepalive_period - 1; expire = time_uptime + V_dyn_rst_lifetime; } /* Save TCP state if it was changed */ if ((state & TCP_FLAGS) != (old & TCP_FLAGS)) ck_pr_or_32(&data->state, state & TCP_FLAGS); return (expire); } /* * Update ULP specific state. * For TCP we keep sequence numbers and flags. For other protocols * currently we update only expire time. Packets and bytes counters * are also updated here. 
*/ static void dyn_update_proto_state(struct dyn_data *data, const struct ipfw_flow_id *pkt, const void *ulp, int pktlen, int dir) { uint32_t expire; /* NOTE: we are in critical section here. */ switch (pkt->proto) { case IPPROTO_UDP: case IPPROTO_UDPLITE: expire = time_uptime + V_dyn_udp_lifetime; break; case IPPROTO_TCP: expire = dyn_update_tcp_state(data, pkt, ulp, dir); break; default: expire = time_uptime + V_dyn_short_lifetime; } /* * Expiration timer has the per-second granularity, no need to update * it every time when state is matched. */ if (data->expire != expire) ck_pr_store_32(&data->expire, expire); if (dir == MATCH_FORWARD) DYN_COUNTER_INC(data, fwd, pktlen); else DYN_COUNTER_INC(data, rev, pktlen); } /* * Lookup IPv4 state. * Must be called in critical section. */ struct dyn_ipv4_state * dyn_lookup_ipv4_state(const struct ipfw_flow_id *pkt, const void *ulp, struct ipfw_dyn_info *info, int pktlen) { struct dyn_ipv4_state *s; uint32_t version, bucket; bucket = DYN_BUCKET(info->hashval, V_curr_dyn_buckets); info->version = DYN_BUCKET_VERSION(bucket, ipv4_add); restart: version = DYN_BUCKET_VERSION(bucket, ipv4_del); CK_SLIST_FOREACH(s, &V_dyn_ipv4[bucket], entry) { DYNSTATE_PROTECT(s); if (version != DYN_BUCKET_VERSION(bucket, ipv4_del)) goto restart; if (s->proto != pkt->proto) continue; if (info->kidx != 0 && s->kidx != info->kidx) continue; if (s->sport == pkt->src_port && s->dport == pkt->dst_port && s->src == pkt->src_ip && s->dst == pkt->dst_ip) { info->direction = MATCH_FORWARD; break; } if (s->sport == pkt->dst_port && s->dport == pkt->src_port && s->src == pkt->dst_ip && s->dst == pkt->src_ip) { info->direction = MATCH_REVERSE; break; } } if (s != NULL) dyn_update_proto_state(s->data, pkt, ulp, pktlen, info->direction); return (s); } /* * Lookup IPv4 state. * Simplifed version is used to check that matching state doesn't exist. 
*/ static int dyn_lookup_ipv4_state_locked(const struct ipfw_flow_id *pkt, - const void *ulp, int pktlen, uint32_t bucket, uint16_t kidx) + const void *ulp, int pktlen, uint32_t bucket, uint32_t kidx) { struct dyn_ipv4_state *s; int dir; dir = MATCH_NONE; DYN_BUCKET_ASSERT(bucket); CK_SLIST_FOREACH(s, &V_dyn_ipv4[bucket], entry) { if (s->proto != pkt->proto || s->kidx != kidx) continue; if (s->sport == pkt->src_port && s->dport == pkt->dst_port && s->src == pkt->src_ip && s->dst == pkt->dst_ip) { dir = MATCH_FORWARD; break; } if (s->sport == pkt->dst_port && s->dport == pkt->src_port && s->src == pkt->dst_ip && s->dst == pkt->src_ip) { dir = MATCH_REVERSE; break; } } if (s != NULL) dyn_update_proto_state(s->data, pkt, ulp, pktlen, dir); return (s != NULL); } struct dyn_ipv4_state * dyn_lookup_ipv4_parent(const struct ipfw_flow_id *pkt, const void *rule, - uint32_t ruleid, uint16_t rulenum, uint32_t hashval) + uint32_t ruleid, uint32_t rulenum, uint32_t hashval) { struct dyn_ipv4_state *s; uint32_t version, bucket; bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets); restart: version = DYN_BUCKET_VERSION(bucket, ipv4_parent_del); CK_SLIST_FOREACH(s, &V_dyn_ipv4_parent[bucket], entry) { DYNSTATE_PROTECT(s); if (version != DYN_BUCKET_VERSION(bucket, ipv4_parent_del)) goto restart; /* * NOTE: we do not need to check kidx, because parent rule * can not create states with different kidx. * And parent rule always created for forward direction. 
*/ if (s->limit->parent == rule && s->limit->ruleid == ruleid && s->limit->rulenum == rulenum && s->proto == pkt->proto && s->sport == pkt->src_port && s->dport == pkt->dst_port && s->src == pkt->src_ip && s->dst == pkt->dst_ip) { if (s->limit->expire != time_uptime + V_dyn_short_lifetime) ck_pr_store_32(&s->limit->expire, time_uptime + V_dyn_short_lifetime); break; } } return (s); } static struct dyn_ipv4_state * dyn_lookup_ipv4_parent_locked(const struct ipfw_flow_id *pkt, - const void *rule, uint32_t ruleid, uint16_t rulenum, uint32_t bucket) + const void *rule, uint32_t ruleid, uint32_t rulenum, uint32_t bucket) { struct dyn_ipv4_state *s; DYN_BUCKET_ASSERT(bucket); CK_SLIST_FOREACH(s, &V_dyn_ipv4_parent[bucket], entry) { if (s->limit->parent == rule && s->limit->ruleid == ruleid && s->limit->rulenum == rulenum && s->proto == pkt->proto && s->sport == pkt->src_port && s->dport == pkt->dst_port && s->src == pkt->src_ip && s->dst == pkt->dst_ip) break; } return (s); } #ifdef INET6 static uint32_t dyn_getscopeid(const struct ip_fw_args *args) { /* * If source or destination address is an scopeid address, we need * determine the scope zone id to resolve address scope ambiguity. */ if (IN6_IS_ADDR_LINKLOCAL(&args->f_id.src_ip6) || IN6_IS_ADDR_LINKLOCAL(&args->f_id.dst_ip6)) return (in6_getscopezone(args->ifp, IPV6_ADDR_SCOPE_LINKLOCAL)); return (0); } /* * Lookup IPv6 state. * Must be called in critical section. 
*/ static struct dyn_ipv6_state * dyn_lookup_ipv6_state(const struct ipfw_flow_id *pkt, uint32_t zoneid, const void *ulp, struct ipfw_dyn_info *info, int pktlen) { struct dyn_ipv6_state *s; uint32_t version, bucket; bucket = DYN_BUCKET(info->hashval, V_curr_dyn_buckets); info->version = DYN_BUCKET_VERSION(bucket, ipv6_add); restart: version = DYN_BUCKET_VERSION(bucket, ipv6_del); CK_SLIST_FOREACH(s, &V_dyn_ipv6[bucket], entry) { DYNSTATE_PROTECT(s); if (version != DYN_BUCKET_VERSION(bucket, ipv6_del)) goto restart; if (s->proto != pkt->proto || s->zoneid != zoneid) continue; if (info->kidx != 0 && s->kidx != info->kidx) continue; if (s->sport == pkt->src_port && s->dport == pkt->dst_port && IN6_ARE_ADDR_EQUAL(&s->src, &pkt->src_ip6) && IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->dst_ip6)) { info->direction = MATCH_FORWARD; break; } if (s->sport == pkt->dst_port && s->dport == pkt->src_port && IN6_ARE_ADDR_EQUAL(&s->src, &pkt->dst_ip6) && IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->src_ip6)) { info->direction = MATCH_REVERSE; break; } } if (s != NULL) dyn_update_proto_state(s->data, pkt, ulp, pktlen, info->direction); return (s); } /* * Lookup IPv6 state. * Simplifed version is used to check that matching state doesn't exist. 
*/ static int dyn_lookup_ipv6_state_locked(const struct ipfw_flow_id *pkt, uint32_t zoneid, - const void *ulp, int pktlen, uint32_t bucket, uint16_t kidx) + const void *ulp, int pktlen, uint32_t bucket, uint32_t kidx) { struct dyn_ipv6_state *s; int dir; dir = MATCH_NONE; DYN_BUCKET_ASSERT(bucket); CK_SLIST_FOREACH(s, &V_dyn_ipv6[bucket], entry) { if (s->proto != pkt->proto || s->kidx != kidx || s->zoneid != zoneid) continue; if (s->sport == pkt->src_port && s->dport == pkt->dst_port && IN6_ARE_ADDR_EQUAL(&s->src, &pkt->src_ip6) && IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->dst_ip6)) { dir = MATCH_FORWARD; break; } if (s->sport == pkt->dst_port && s->dport == pkt->src_port && IN6_ARE_ADDR_EQUAL(&s->src, &pkt->dst_ip6) && IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->src_ip6)) { dir = MATCH_REVERSE; break; } } if (s != NULL) dyn_update_proto_state(s->data, pkt, ulp, pktlen, dir); return (s != NULL); } static struct dyn_ipv6_state * dyn_lookup_ipv6_parent(const struct ipfw_flow_id *pkt, uint32_t zoneid, - const void *rule, uint32_t ruleid, uint16_t rulenum, uint32_t hashval) + const void *rule, uint32_t ruleid, uint32_t rulenum, uint32_t hashval) { struct dyn_ipv6_state *s; uint32_t version, bucket; bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets); restart: version = DYN_BUCKET_VERSION(bucket, ipv6_parent_del); CK_SLIST_FOREACH(s, &V_dyn_ipv6_parent[bucket], entry) { DYNSTATE_PROTECT(s); if (version != DYN_BUCKET_VERSION(bucket, ipv6_parent_del)) goto restart; /* * NOTE: we do not need to check kidx, because parent rule * can not create states with different kidx. * Also parent rule always created for forward direction. 
*/ if (s->limit->parent == rule && s->limit->ruleid == ruleid && s->limit->rulenum == rulenum && s->proto == pkt->proto && s->sport == pkt->src_port && s->dport == pkt->dst_port && s->zoneid == zoneid && IN6_ARE_ADDR_EQUAL(&s->src, &pkt->src_ip6) && IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->dst_ip6)) { if (s->limit->expire != time_uptime + V_dyn_short_lifetime) ck_pr_store_32(&s->limit->expire, time_uptime + V_dyn_short_lifetime); break; } } return (s); } static struct dyn_ipv6_state * dyn_lookup_ipv6_parent_locked(const struct ipfw_flow_id *pkt, uint32_t zoneid, - const void *rule, uint32_t ruleid, uint16_t rulenum, uint32_t bucket) + const void *rule, uint32_t ruleid, uint32_t rulenum, uint32_t bucket) { struct dyn_ipv6_state *s; DYN_BUCKET_ASSERT(bucket); CK_SLIST_FOREACH(s, &V_dyn_ipv6_parent[bucket], entry) { if (s->limit->parent == rule && s->limit->ruleid == ruleid && s->limit->rulenum == rulenum && s->proto == pkt->proto && s->sport == pkt->src_port && s->dport == pkt->dst_port && s->zoneid == zoneid && IN6_ARE_ADDR_EQUAL(&s->src, &pkt->src_ip6) && IN6_ARE_ADDR_EQUAL(&s->dst, &pkt->dst_ip6)) break; } return (s); } #endif /* INET6 */ /* * Lookup dynamic state. * pkt - filled by ipfw_chk() ipfw_flow_id; * ulp - determined by ipfw_chk() upper level protocol header; * dyn_info - info about matched state to return back; * Returns pointer to state's parent rule and dyn_info. If there is * no state, NULL is returned. * On match ipfw_dyn_lookup() updates state's counters. 
*/ struct ip_fw * ipfw_dyn_lookup_state(const struct ip_fw_args *args, const void *ulp, int pktlen, const ipfw_insn *cmd, struct ipfw_dyn_info *info) { struct dyn_data *data; struct ip_fw *rule; IPFW_RLOCK_ASSERT(&V_layer3_chain); + MPASS(F_LEN(cmd) >= F_INSN_SIZE(ipfw_insn_kidx)); data = NULL; rule = NULL; - info->kidx = cmd->arg1; + info->kidx = insntoc(cmd, kidx)->kidx; info->direction = MATCH_NONE; info->hashval = hash_packet(&args->f_id); DYNSTATE_CRITICAL_ENTER(); if (IS_IP4_FLOW_ID(&args->f_id)) { struct dyn_ipv4_state *s; s = dyn_lookup_ipv4_state(&args->f_id, ulp, info, pktlen); if (s != NULL) { /* * Dynamic states are created using the same 5-tuple, * so it is assumed, that parent rule for O_LIMIT * state has the same address family. */ data = s->data; if (s->type == O_LIMIT) { s = data->parent; rule = s->limit->parent; } else rule = data->parent; } } #ifdef INET6 else if (IS_IP6_FLOW_ID(&args->f_id)) { struct dyn_ipv6_state *s; s = dyn_lookup_ipv6_state(&args->f_id, dyn_getscopeid(args), ulp, info, pktlen); if (s != NULL) { data = s->data; if (s->type == O_LIMIT) { s = data->parent; rule = s->limit->parent; } else rule = data->parent; } } #endif if (data != NULL) { /* * If cached chain id is the same, we can avoid rule index * lookup. Otherwise do lookup and update chain_id and f_pos. * It is safe even if there is concurrent thread that want * update the same state, because chain->id can be changed * only under IPFW_WLOCK(). */ if (data->chain_id != V_layer3_chain.id) { data->f_pos = ipfw_find_rule(&V_layer3_chain, data->rulenum, data->ruleid); /* * Check that found state has not orphaned. * When chain->id being changed the parent * rule can be deleted. If found rule doesn't * match the parent pointer, consider this * result as MATCH_NONE and return NULL. * * This will lead to creation of new similar state * that will be added into head of this bucket. * And the state that we currently have matched * should be deleted by dyn_expire_states(). 
* * In case when dyn_keep_states is enabled, return * pointer to deleted rule and f_pos value * corresponding to penultimate rule. * When we have enabled V_dyn_keep_states, states * that become orphaned will get the DYN_REFERENCED * flag and rule will keep around. So we can return * it. But since it is not in the rules map, we need * return such f_pos value, so after the state * handling if the search will continue, the next rule * will be the last one - the default rule. */ if (V_layer3_chain.map[data->f_pos] == rule) { data->chain_id = V_layer3_chain.id; - info->f_pos = data->f_pos; } else if (V_dyn_keep_states != 0) { /* * The original rule pointer is still usable. * So, we return it, but f_pos need to be * changed to point to the penultimate rule. */ MPASS(V_layer3_chain.n_rules > 1); data->chain_id = V_layer3_chain.id; data->f_pos = V_layer3_chain.n_rules - 2; - info->f_pos = data->f_pos; } else { rule = NULL; info->direction = MATCH_NONE; DYN_DEBUG("rule %p [%u, %u] is considered " "invalid in data %p", rule, data->ruleid, data->rulenum, data); /* info->f_pos doesn't matter here. */ } - } else - info->f_pos = data->f_pos; + } + info->f_pos = data->f_pos; } DYNSTATE_CRITICAL_EXIT(); #if 0 /* * Return MATCH_NONE if parent rule is in disabled set. * This will lead to creation of new similar state that * will be added into head of this bucket. * * XXXAE: we need to be able update state's set when parent * rule set is changed. 
*/ if (rule != NULL && (V_set_disable & (1 << rule->set))) { rule = NULL; info->direction = MATCH_NONE; } #endif return (rule); } static struct dyn_parent * -dyn_alloc_parent(void *parent, uint32_t ruleid, uint16_t rulenum, +dyn_alloc_parent(void *parent, uint32_t ruleid, uint32_t rulenum, uint32_t hashval) { struct dyn_parent *limit; limit = uma_zalloc(V_dyn_parent_zone, M_NOWAIT | M_ZERO); if (limit == NULL) { if (last_log != time_uptime) { last_log = time_uptime; log(LOG_DEBUG, "ipfw: Cannot allocate parent dynamic state, " "consider increasing " "net.inet.ip.fw.dyn_parent_max\n"); } return (NULL); } limit->parent = parent; limit->ruleid = ruleid; limit->rulenum = rulenum; limit->hashval = hashval; limit->expire = time_uptime + V_dyn_short_lifetime; return (limit); } static struct dyn_data * -dyn_alloc_dyndata(void *parent, uint32_t ruleid, uint16_t rulenum, +dyn_alloc_dyndata(void *parent, uint32_t ruleid, uint32_t rulenum, const struct ipfw_flow_id *pkt, const void *ulp, int pktlen, uint32_t hashval, uint16_t fibnum) { struct dyn_data *data; data = uma_zalloc(V_dyn_data_zone, M_NOWAIT | M_ZERO); if (data == NULL) { if (last_log != time_uptime) { last_log = time_uptime; log(LOG_DEBUG, "ipfw: Cannot allocate dynamic state, " "consider increasing net.inet.ip.fw.dyn_max\n"); } return (NULL); } data->parent = parent; data->ruleid = ruleid; data->rulenum = rulenum; data->fibnum = fibnum; data->hashval = hashval; data->expire = time_uptime + V_dyn_syn_lifetime; dyn_update_proto_state(data, pkt, ulp, pktlen, MATCH_FORWARD); return (data); } static struct dyn_ipv4_state * -dyn_alloc_ipv4_state(const struct ipfw_flow_id *pkt, uint16_t kidx, +dyn_alloc_ipv4_state(const struct ipfw_flow_id *pkt, uint32_t kidx, uint8_t type) { struct dyn_ipv4_state *s; s = uma_zalloc(V_dyn_ipv4_zone, M_NOWAIT | M_ZERO); if (s == NULL) return (NULL); s->type = type; s->kidx = kidx; s->proto = pkt->proto; s->sport = pkt->src_port; s->dport = pkt->dst_port; s->src = pkt->src_ip; s->dst = 
pkt->dst_ip; return (s); } /* * Add IPv4 parent state. * Returns pointer to parent state. When it is not NULL we are in * critical section and pointer protected by hazard pointer. * When some error occurs, it returns NULL and exit from critical section * is not needed. */ static struct dyn_ipv4_state * -dyn_add_ipv4_parent(void *rule, uint32_t ruleid, uint16_t rulenum, +dyn_add_ipv4_parent(void *rule, uint32_t ruleid, uint32_t rulenum, const struct ipfw_flow_id *pkt, uint32_t hashval, uint32_t version, - uint16_t kidx) + uint32_t kidx) { struct dyn_ipv4_state *s; struct dyn_parent *limit; uint32_t bucket; bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets); DYN_BUCKET_LOCK(bucket); if (version != DYN_BUCKET_VERSION(bucket, ipv4_parent_add)) { /* * Bucket version has been changed since last lookup, * do lookup again to be sure that state does not exist. */ s = dyn_lookup_ipv4_parent_locked(pkt, rule, ruleid, rulenum, bucket); if (s != NULL) { /* * Simultaneous thread has already created this * state. Just return it. 
*/ DYNSTATE_CRITICAL_ENTER(); DYNSTATE_PROTECT(s); DYN_BUCKET_UNLOCK(bucket); return (s); } } limit = dyn_alloc_parent(rule, ruleid, rulenum, hashval); if (limit == NULL) { DYN_BUCKET_UNLOCK(bucket); return (NULL); } s = dyn_alloc_ipv4_state(pkt, kidx, O_LIMIT_PARENT); if (s == NULL) { DYN_BUCKET_UNLOCK(bucket); uma_zfree(V_dyn_parent_zone, limit); return (NULL); } s->limit = limit; CK_SLIST_INSERT_HEAD(&V_dyn_ipv4_parent[bucket], s, entry); DYN_COUNT_INC(dyn_parent_count); DYN_BUCKET_VERSION_BUMP(bucket, ipv4_parent_add); DYNSTATE_CRITICAL_ENTER(); DYNSTATE_PROTECT(s); DYN_BUCKET_UNLOCK(bucket); return (s); } static int -dyn_add_ipv4_state(void *parent, uint32_t ruleid, uint16_t rulenum, +dyn_add_ipv4_state(void *parent, uint32_t ruleid, uint32_t rulenum, const struct ipfw_flow_id *pkt, const void *ulp, int pktlen, uint32_t hashval, struct ipfw_dyn_info *info, uint16_t fibnum, - uint16_t kidx, uint8_t type) + uint32_t kidx, uint8_t type) { struct dyn_ipv4_state *s; void *data; uint32_t bucket; bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets); DYN_BUCKET_LOCK(bucket); if (info->direction == MATCH_UNKNOWN || info->kidx != kidx || info->hashval != hashval || info->version != DYN_BUCKET_VERSION(bucket, ipv4_add)) { /* * Bucket version has been changed since last lookup, * do lookup again to be sure that state does not exist. 
*/ if (dyn_lookup_ipv4_state_locked(pkt, ulp, pktlen, bucket, kidx) != 0) { DYN_BUCKET_UNLOCK(bucket); return (EEXIST); } } data = dyn_alloc_dyndata(parent, ruleid, rulenum, pkt, ulp, pktlen, hashval, fibnum); if (data == NULL) { DYN_BUCKET_UNLOCK(bucket); return (ENOMEM); } s = dyn_alloc_ipv4_state(pkt, kidx, type); if (s == NULL) { DYN_BUCKET_UNLOCK(bucket); uma_zfree(V_dyn_data_zone, data); return (ENOMEM); } s->data = data; CK_SLIST_INSERT_HEAD(&V_dyn_ipv4[bucket], s, entry); DYN_COUNT_INC(dyn_count); DYN_BUCKET_VERSION_BUMP(bucket, ipv4_add); DYN_BUCKET_UNLOCK(bucket); return (0); } #ifdef INET6 static struct dyn_ipv6_state * dyn_alloc_ipv6_state(const struct ipfw_flow_id *pkt, uint32_t zoneid, - uint16_t kidx, uint8_t type) + uint32_t kidx, uint8_t type) { struct dyn_ipv6_state *s; s = uma_zalloc(V_dyn_ipv6_zone, M_NOWAIT | M_ZERO); if (s == NULL) return (NULL); s->type = type; s->kidx = kidx; s->zoneid = zoneid; s->proto = pkt->proto; s->sport = pkt->src_port; s->dport = pkt->dst_port; s->src = pkt->src_ip6; s->dst = pkt->dst_ip6; return (s); } /* * Add IPv6 parent state. * Returns pointer to parent state. When it is not NULL we are in * critical section and pointer protected by hazard pointer. * When some error occurs, it return NULL and exit from critical section * is not needed. */ static struct dyn_ipv6_state * -dyn_add_ipv6_parent(void *rule, uint32_t ruleid, uint16_t rulenum, +dyn_add_ipv6_parent(void *rule, uint32_t ruleid, uint32_t rulenum, const struct ipfw_flow_id *pkt, uint32_t zoneid, uint32_t hashval, - uint32_t version, uint16_t kidx) + uint32_t version, uint32_t kidx) { struct dyn_ipv6_state *s; struct dyn_parent *limit; uint32_t bucket; bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets); DYN_BUCKET_LOCK(bucket); if (version != DYN_BUCKET_VERSION(bucket, ipv6_parent_add)) { /* * Bucket version has been changed since last lookup, * do lookup again to be sure that state does not exist. 
*/ s = dyn_lookup_ipv6_parent_locked(pkt, zoneid, rule, ruleid, rulenum, bucket); if (s != NULL) { /* * Simultaneous thread has already created this * state. Just return it. */ DYNSTATE_CRITICAL_ENTER(); DYNSTATE_PROTECT(s); DYN_BUCKET_UNLOCK(bucket); return (s); } } limit = dyn_alloc_parent(rule, ruleid, rulenum, hashval); if (limit == NULL) { DYN_BUCKET_UNLOCK(bucket); return (NULL); } s = dyn_alloc_ipv6_state(pkt, zoneid, kidx, O_LIMIT_PARENT); if (s == NULL) { DYN_BUCKET_UNLOCK(bucket); uma_zfree(V_dyn_parent_zone, limit); return (NULL); } s->limit = limit; CK_SLIST_INSERT_HEAD(&V_dyn_ipv6_parent[bucket], s, entry); DYN_COUNT_INC(dyn_parent_count); DYN_BUCKET_VERSION_BUMP(bucket, ipv6_parent_add); DYNSTATE_CRITICAL_ENTER(); DYNSTATE_PROTECT(s); DYN_BUCKET_UNLOCK(bucket); return (s); } static int -dyn_add_ipv6_state(void *parent, uint32_t ruleid, uint16_t rulenum, +dyn_add_ipv6_state(void *parent, uint32_t ruleid, uint32_t rulenum, const struct ipfw_flow_id *pkt, uint32_t zoneid, const void *ulp, int pktlen, uint32_t hashval, struct ipfw_dyn_info *info, - uint16_t fibnum, uint16_t kidx, uint8_t type) + uint16_t fibnum, uint32_t kidx, uint8_t type) { struct dyn_ipv6_state *s; struct dyn_data *data; uint32_t bucket; bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets); DYN_BUCKET_LOCK(bucket); if (info->direction == MATCH_UNKNOWN || info->kidx != kidx || info->hashval != hashval || info->version != DYN_BUCKET_VERSION(bucket, ipv6_add)) { /* * Bucket version has been changed since last lookup, * do lookup again to be sure that state does not exist. 
*/ if (dyn_lookup_ipv6_state_locked(pkt, zoneid, ulp, pktlen, bucket, kidx) != 0) { DYN_BUCKET_UNLOCK(bucket); return (EEXIST); } } data = dyn_alloc_dyndata(parent, ruleid, rulenum, pkt, ulp, pktlen, hashval, fibnum); if (data == NULL) { DYN_BUCKET_UNLOCK(bucket); return (ENOMEM); } s = dyn_alloc_ipv6_state(pkt, zoneid, kidx, type); if (s == NULL) { DYN_BUCKET_UNLOCK(bucket); uma_zfree(V_dyn_data_zone, data); return (ENOMEM); } s->data = data; CK_SLIST_INSERT_HEAD(&V_dyn_ipv6[bucket], s, entry); DYN_COUNT_INC(dyn_count); DYN_BUCKET_VERSION_BUMP(bucket, ipv6_add); DYN_BUCKET_UNLOCK(bucket); return (0); } #endif /* INET6 */ static void * dyn_get_parent_state(const struct ipfw_flow_id *pkt, uint32_t zoneid, - struct ip_fw *rule, uint32_t hashval, uint32_t limit, uint16_t kidx) + struct ip_fw *rule, uint32_t hashval, uint32_t limit, uint32_t kidx) { char sbuf[24]; struct dyn_parent *p; void *ret; uint32_t bucket, version; p = NULL; ret = NULL; bucket = DYN_BUCKET(hashval, V_curr_dyn_buckets); DYNSTATE_CRITICAL_ENTER(); if (IS_IP4_FLOW_ID(pkt)) { struct dyn_ipv4_state *s; version = DYN_BUCKET_VERSION(bucket, ipv4_parent_add); s = dyn_lookup_ipv4_parent(pkt, rule, rule->id, rule->rulenum, bucket); if (s == NULL) { /* * Exit from critical section because dyn_add_parent() * will acquire bucket lock. */ DYNSTATE_CRITICAL_EXIT(); s = dyn_add_ipv4_parent(rule, rule->id, rule->rulenum, pkt, hashval, version, kidx); if (s == NULL) return (NULL); /* Now we are in critical section again. */ } ret = s; p = s->limit; } #ifdef INET6 else if (IS_IP6_FLOW_ID(pkt)) { struct dyn_ipv6_state *s; version = DYN_BUCKET_VERSION(bucket, ipv6_parent_add); s = dyn_lookup_ipv6_parent(pkt, zoneid, rule, rule->id, rule->rulenum, bucket); if (s == NULL) { /* * Exit from critical section because dyn_add_parent() * can acquire bucket mutex. 
*/ DYNSTATE_CRITICAL_EXIT(); s = dyn_add_ipv6_parent(rule, rule->id, rule->rulenum, pkt, zoneid, hashval, version, kidx); if (s == NULL) return (NULL); /* Now we are in critical section again. */ } ret = s; p = s->limit; } #endif else { DYNSTATE_CRITICAL_EXIT(); return (NULL); } /* Check the limit */ if (DPARENT_COUNT(p) >= limit) { DYNSTATE_CRITICAL_EXIT(); if (V_fw_verbose && last_log != time_uptime) { last_log = time_uptime; snprintf(sbuf, sizeof(sbuf), "%u drop session", rule->rulenum); print_dyn_rule_flags(pkt, O_LIMIT, LOG_SECURITY | LOG_DEBUG, sbuf, "too many entries"); } return (NULL); } /* Take new session into account. */ DPARENT_COUNT_INC(p); /* * We must exit from critical section because the following code * can acquire bucket mutex. * We rely on the 'count' field. The state will not expire * until it has some child states, i.e. 'count' field is not zero. * Return state pointer, it will be used by child states as parent. */ DYNSTATE_CRITICAL_EXIT(); return (ret); } static int dyn_install_state(const struct ipfw_flow_id *pkt, uint32_t zoneid, uint16_t fibnum, const void *ulp, int pktlen, struct ip_fw *rule, struct ipfw_dyn_info *info, uint32_t limit, uint16_t limit_mask, - uint16_t kidx, uint8_t type) + uint32_t kidx, uint8_t type) { struct ipfw_flow_id id; uint32_t hashval, parent_hashval, ruleid, rulenum; int ret; MPASS(type == O_LIMIT || type == O_KEEP_STATE); ruleid = rule->id; rulenum = rule->rulenum; if (type == O_LIMIT) { /* Create masked flow id and calculate bucket */ id.addr_type = pkt->addr_type; id.proto = pkt->proto; id.fib = fibnum; /* unused */ id.src_port = (limit_mask & DYN_SRC_PORT) ? pkt->src_port: 0; id.dst_port = (limit_mask & DYN_DST_PORT) ? pkt->dst_port: 0; if (IS_IP4_FLOW_ID(pkt)) { id.src_ip = (limit_mask & DYN_SRC_ADDR) ? pkt->src_ip: 0; id.dst_ip = (limit_mask & DYN_DST_ADDR) ? 
pkt->dst_ip: 0; } #ifdef INET6 else if (IS_IP6_FLOW_ID(pkt)) { if (limit_mask & DYN_SRC_ADDR) id.src_ip6 = pkt->src_ip6; else memset(&id.src_ip6, 0, sizeof(id.src_ip6)); if (limit_mask & DYN_DST_ADDR) id.dst_ip6 = pkt->dst_ip6; else memset(&id.dst_ip6, 0, sizeof(id.dst_ip6)); } #endif else return (EAFNOSUPPORT); parent_hashval = hash_parent(&id, rule); rule = dyn_get_parent_state(&id, zoneid, rule, parent_hashval, limit, kidx); if (rule == NULL) { #if 0 if (V_fw_verbose && last_log != time_uptime) { last_log = time_uptime; snprintf(sbuf, sizeof(sbuf), "%u drop session", rule->rulenum); print_dyn_rule_flags(pkt, O_LIMIT, LOG_SECURITY | LOG_DEBUG, sbuf, "too many entries"); } #endif return (EACCES); } /* * Limit is not reached, create new state. * Now rule points to parent state. */ } hashval = hash_packet(pkt); if (IS_IP4_FLOW_ID(pkt)) ret = dyn_add_ipv4_state(rule, ruleid, rulenum, pkt, ulp, pktlen, hashval, info, fibnum, kidx, type); #ifdef INET6 else if (IS_IP6_FLOW_ID(pkt)) ret = dyn_add_ipv6_state(rule, ruleid, rulenum, pkt, zoneid, ulp, pktlen, hashval, info, fibnum, kidx, type); #endif /* INET6 */ else ret = EAFNOSUPPORT; if (type == O_LIMIT) { if (ret != 0) { /* * We failed to create child state for O_LIMIT * opcode. Since we already counted it in the parent, * we must revert counter back. The 'rule' points to * parent state, use it to get dyn_parent. * * XXXAE: it should be safe to use 'rule' pointer * without extra lookup, parent state is referenced * and should not be freed. */ if (IS_IP4_FLOW_ID(&id)) DPARENT_COUNT_DEC( ((struct dyn_ipv4_state *)rule)->limit); #ifdef INET6 else if (IS_IP6_FLOW_ID(&id)) DPARENT_COUNT_DEC( ((struct dyn_ipv6_state *)rule)->limit); #endif } } /* * EEXIST means that simultaneous thread has created this * state. Consider this as success. * * XXXAE: should we invalidate 'info' content here? */ if (ret == EEXIST) return (0); return (ret); } /* * Install dynamic state. 
* chain - ipfw's instance; * rule - the parent rule that installs the state; * cmd - opcode that installs the state; * args - ipfw arguments; * ulp - upper level protocol header; * pktlen - packet length; * info - dynamic state lookup info; * tablearg - tablearg id. * * Returns non-zero value (failure) if state is not installed because * of errors or because session limitations are enforced. */ int ipfw_dyn_install_state(struct ip_fw_chain *chain, struct ip_fw *rule, const ipfw_insn_limit *cmd, const struct ip_fw_args *args, const void *ulp, int pktlen, struct ipfw_dyn_info *info, uint32_t tablearg) { uint32_t limit; uint16_t limit_mask; if (cmd->o.opcode == O_LIMIT) { limit = IP_FW_ARG_TABLEARG(chain, cmd->conn_limit, limit); limit_mask = cmd->limit_mask; } else { limit = 0; limit_mask = 0; } + /* + * NOTE: we assume that kidx field of struct ipfw_insn_kidx + * located in the same place as kidx field of ipfw_insn_limit. + */ return (dyn_install_state(&args->f_id, #ifdef INET6 IS_IP6_FLOW_ID(&args->f_id) ? dyn_getscopeid(args): #endif 0, M_GETFIB(args->m), ulp, pktlen, rule, info, limit, - limit_mask, cmd->o.arg1, cmd->o.opcode)); + limit_mask, cmd->kidx, cmd->o.opcode)); } /* * Free safe to remove state entries from expired lists. */ static void dyn_free_states(struct ip_fw_chain *chain) { struct dyn_ipv4_state *s4, *s4n; #ifdef INET6 struct dyn_ipv6_state *s6, *s6n; #endif int cached_count, i; /* * We keep pointers to objects that are in use on each CPU * in the per-cpu dyn_hp pointer. When object is going to be * removed, first of it is unlinked from the corresponding * list. This leads to changing of dyn_bucket_xxx_delver version. * Unlinked objects is placed into corresponding dyn_expired_xxx * list. Reader that is going to dereference object pointer checks * dyn_bucket_xxx_delver version before and after storing pointer * into dyn_hp. If version is the same, the object is protected * from freeing and it is safe to dereference. 
Othervise reader * tries to iterate list again from the beginning, but this object * now unlinked and thus will not be accessible. * * Copy dyn_hp pointers for each CPU into dyn_hp_cache array. * It does not matter that some pointer can be changed in * time while we are copying. We need to check, that objects * removed in the previous pass are not in use. And if dyn_hp * pointer does not contain it in the time when we are copying, * it will not appear there, because it is already unlinked. * And for new pointers we will not free objects that will be * unlinked in this pass. */ cached_count = 0; CPU_FOREACH(i) { dyn_hp_cache[cached_count] = DYNSTATE_GET(i); if (dyn_hp_cache[cached_count] != NULL) cached_count++; } /* * Free expired states that are safe to free. * Check each entry from previous pass in the dyn_expired_xxx * list, if pointer to the object is in the dyn_hp_cache array, * keep it until next pass. Otherwise it is safe to free the * object. * * XXXAE: optimize this to use SLIST_REMOVE_AFTER. */ #define DYN_FREE_STATES(s, next, name) do { \ s = SLIST_FIRST(&V_dyn_expired_ ## name); \ while (s != NULL) { \ next = SLIST_NEXT(s, expired); \ for (i = 0; i < cached_count; i++) \ if (dyn_hp_cache[i] == s) \ break; \ if (i == cached_count) { \ if (s->type == O_LIMIT_PARENT && \ s->limit->count != 0) { \ s = next; \ continue; \ } \ SLIST_REMOVE(&V_dyn_expired_ ## name, \ s, dyn_ ## name ## _state, expired); \ if (s->type == O_LIMIT_PARENT) \ uma_zfree(V_dyn_parent_zone, s->limit); \ else \ uma_zfree(V_dyn_data_zone, s->data); \ uma_zfree(V_dyn_ ## name ## _zone, s); \ } \ s = next; \ } \ } while (0) /* * Protect access to expired lists with DYN_EXPIRED_LOCK. * Userland can invoke ipfw_expire_dyn_states() to delete * specific states, this will lead to modification of expired * lists. * * XXXAE: do we need DYN_EXPIRED_LOCK? We can just use * IPFW_UH_WLOCK to protect access to these lists. 
*/ DYN_EXPIRED_LOCK(); DYN_FREE_STATES(s4, s4n, ipv4); #ifdef INET6 DYN_FREE_STATES(s6, s6n, ipv6); #endif DYN_EXPIRED_UNLOCK(); #undef DYN_FREE_STATES } /* * Returns: * 0 when state is not matched by specified range; * 1 when state is matched by specified range; * 2 when state is matched by specified range and requested deletion of * dynamic states. */ static int -dyn_match_range(uint16_t rulenum, uint8_t set, const ipfw_range_tlv *rt) +dyn_match_range(uint32_t rulenum, uint8_t set, const ipfw_range_tlv *rt) { MPASS(rt != NULL); /* flush all states */ if (rt->flags & IPFW_RCFLAG_ALL) { if (rt->flags & IPFW_RCFLAG_DYNAMIC) return (2); /* forced */ return (1); } if ((rt->flags & IPFW_RCFLAG_SET) != 0 && set != rt->set) return (0); if ((rt->flags & IPFW_RCFLAG_RANGE) != 0 && (rulenum < rt->start_rule || rulenum > rt->end_rule)) return (0); if (rt->flags & IPFW_RCFLAG_DYNAMIC) return (2); return (1); } static void dyn_acquire_rule(struct ip_fw_chain *ch, struct dyn_data *data, - struct ip_fw *rule, uint16_t kidx) + struct ip_fw *rule, uint32_t kidx) { struct dyn_state_obj *obj; /* * Do not acquire reference twice. * This can happen when rule deletion executed for * the same range, but different ruleset id. */ if (data->flags & DYN_REFERENCED) return; IPFW_UH_WLOCK_ASSERT(ch); MPASS(kidx != 0); data->flags |= DYN_REFERENCED; /* Reference the named object */ obj = SRV_OBJECT(ch, kidx); obj->no.refcnt++; MPASS(obj->no.etlv == IPFW_TLV_STATE_NAME); /* Reference the parent rule */ rule->refcnt++; } static void dyn_release_rule(struct ip_fw_chain *ch, struct dyn_data *data, - struct ip_fw *rule, uint16_t kidx) + struct ip_fw *rule, uint32_t kidx) { struct dyn_state_obj *obj; IPFW_UH_WLOCK_ASSERT(ch); MPASS(kidx != 0); obj = SRV_OBJECT(ch, kidx); if (obj->no.refcnt == 1) dyn_destroy(ch, &obj->no); else obj->no.refcnt--; if (--rule->refcnt == 1) ipfw_free_rule(rule); } /* * We do not keep O_LIMIT_PARENT states when V_dyn_keep_states is enabled. 
* O_LIMIT state is created when new connection is going to be established * and there is no matching state. So, since the old parent rule was deleted * we can't create new states with old parent, and thus we can not account * new connections with already established connections, and can not do * proper limiting. */ static int dyn_match_ipv4_state(struct ip_fw_chain *ch, struct dyn_ipv4_state *s, const ipfw_range_tlv *rt) { struct ip_fw *rule; int ret; if (s->type == O_LIMIT_PARENT) { rule = s->limit->parent; return (dyn_match_range(s->limit->rulenum, rule->set, rt)); } rule = s->data->parent; if (s->type == O_LIMIT) rule = ((struct dyn_ipv4_state *)rule)->limit->parent; ret = dyn_match_range(s->data->rulenum, rule->set, rt); if (ret == 0 || V_dyn_keep_states == 0 || ret > 1) return (ret); dyn_acquire_rule(ch, s->data, rule, s->kidx); return (0); } #ifdef INET6 static int dyn_match_ipv6_state(struct ip_fw_chain *ch, struct dyn_ipv6_state *s, const ipfw_range_tlv *rt) { struct ip_fw *rule; int ret; if (s->type == O_LIMIT_PARENT) { rule = s->limit->parent; return (dyn_match_range(s->limit->rulenum, rule->set, rt)); } rule = s->data->parent; if (s->type == O_LIMIT) rule = ((struct dyn_ipv6_state *)rule)->limit->parent; ret = dyn_match_range(s->data->rulenum, rule->set, rt); if (ret == 0 || V_dyn_keep_states == 0 || ret > 1) return (ret); dyn_acquire_rule(ch, s->data, rule, s->kidx); return (0); } #endif /* * Unlink expired entries from states lists. * @rt can be used to specify the range of states for deletion. */ static void dyn_expire_states(struct ip_fw_chain *ch, ipfw_range_tlv *rt) { struct dyn_ipv4_slist expired_ipv4; #ifdef INET6 struct dyn_ipv6_slist expired_ipv6; struct dyn_ipv6_state *s6, *s6n, *s6p; #endif struct dyn_ipv4_state *s4, *s4n, *s4p; void *rule; int bucket, removed, length, max_length; IPFW_UH_WLOCK_ASSERT(ch); /* * Unlink expired states from each bucket. 
* With acquired bucket lock iterate entries of each lists: * ipv4, ipv4_parent, ipv6, and ipv6_parent. Check expired time * and unlink entry from the list, link entry into temporary * expired_xxx lists then bump "del" bucket version. * * When an entry is removed, corresponding states counter is * decremented. If entry has O_LIMIT type, parent's reference * counter is decremented. * * NOTE: this function can be called from userspace context * when user deletes rules. In this case all matched states * will be forcedly unlinked. O_LIMIT_PARENT states will be kept * in the expired lists until reference counter become zero. */ #define DYN_UNLINK_STATES(s, prev, next, exp, af, name, extra) do { \ length = 0; \ removed = 0; \ prev = NULL; \ s = CK_SLIST_FIRST(&V_dyn_ ## name [bucket]); \ while (s != NULL) { \ next = CK_SLIST_NEXT(s, entry); \ if ((TIME_LEQ((s)->exp, time_uptime) && extra) || \ (rt != NULL && \ dyn_match_ ## af ## _state(ch, s, rt))) { \ if (prev != NULL) \ CK_SLIST_REMOVE_AFTER(prev, entry); \ else \ CK_SLIST_REMOVE_HEAD( \ &V_dyn_ ## name [bucket], entry); \ removed++; \ SLIST_INSERT_HEAD(&expired_ ## af, s, expired); \ if (s->type == O_LIMIT_PARENT) \ DYN_COUNT_DEC(dyn_parent_count); \ else { \ DYN_COUNT_DEC(dyn_count); \ if (s->data->flags & DYN_REFERENCED) { \ rule = s->data->parent; \ if (s->type == O_LIMIT) \ rule = ((__typeof(s)) \ rule)->limit->parent;\ dyn_release_rule(ch, s->data, \ rule, s->kidx); \ } \ if (s->type == O_LIMIT) { \ s = s->data->parent; \ DPARENT_COUNT_DEC(s->limit); \ } \ } \ } else { \ prev = s; \ length++; \ } \ s = next; \ } \ if (removed != 0) \ DYN_BUCKET_VERSION_BUMP(bucket, name ## _del); \ if (length > max_length) \ max_length = length; \ } while (0) SLIST_INIT(&expired_ipv4); #ifdef INET6 SLIST_INIT(&expired_ipv6); #endif max_length = 0; for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) { DYN_BUCKET_LOCK(bucket); DYN_UNLINK_STATES(s4, s4p, s4n, data->expire, ipv4, ipv4, 1); DYN_UNLINK_STATES(s4, s4p, s4n, 
limit->expire, ipv4, ipv4_parent, (s4->limit->count == 0)); #ifdef INET6 DYN_UNLINK_STATES(s6, s6p, s6n, data->expire, ipv6, ipv6, 1); DYN_UNLINK_STATES(s6, s6p, s6n, limit->expire, ipv6, ipv6_parent, (s6->limit->count == 0)); #endif DYN_BUCKET_UNLOCK(bucket); } /* Update curr_max_length for statistics. */ V_curr_max_length = max_length; /* * Concatenate temporary lists with global expired lists. */ DYN_EXPIRED_LOCK(); SLIST_CONCAT(&V_dyn_expired_ipv4, &expired_ipv4, dyn_ipv4_state, expired); #ifdef INET6 SLIST_CONCAT(&V_dyn_expired_ipv6, &expired_ipv6, dyn_ipv6_state, expired); #endif DYN_EXPIRED_UNLOCK(); #undef DYN_UNLINK_STATES #undef DYN_UNREF_STATES } static struct mbuf * dyn_mgethdr(int len, uint16_t fibnum) { struct mbuf *m; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (NULL); #ifdef MAC mac_netinet_firewall_send(m); #endif M_SETFIB(m, fibnum); m->m_data += max_linkhdr; m->m_flags |= M_SKIP_FIREWALL; m->m_len = m->m_pkthdr.len = len; bzero(m->m_data, len); return (m); } static void dyn_make_keepalive_ipv4(struct mbuf *m, in_addr_t src, in_addr_t dst, uint32_t seq, uint32_t ack, uint16_t sport, uint16_t dport) { struct tcphdr *tcp; struct ip *ip; ip = mtod(m, struct ip *); ip->ip_v = 4; ip->ip_hl = sizeof(*ip) >> 2; ip->ip_tos = IPTOS_LOWDELAY; ip->ip_len = htons(m->m_len); ip->ip_off |= htons(IP_DF); ip->ip_ttl = V_ip_defttl; ip->ip_p = IPPROTO_TCP; ip->ip_src.s_addr = htonl(src); ip->ip_dst.s_addr = htonl(dst); tcp = mtodo(m, sizeof(struct ip)); tcp->th_sport = htons(sport); tcp->th_dport = htons(dport); tcp->th_off = sizeof(struct tcphdr) >> 2; tcp->th_seq = htonl(seq); tcp->th_ack = htonl(ack); tcp_set_flags(tcp, TH_ACK); tcp->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + IPPROTO_TCP)); m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); m->m_pkthdr.csum_flags = CSUM_TCP; } static void dyn_enqueue_keepalive_ipv4(struct mbufq *q, const struct dyn_ipv4_state *s) { struct mbuf *m; if ((s->data->state & 
ACK_FWD) == 0 && s->data->ack_fwd > 0) { m = dyn_mgethdr(sizeof(struct ip) + sizeof(struct tcphdr), s->data->fibnum); if (m != NULL) { dyn_make_keepalive_ipv4(m, s->dst, s->src, s->data->ack_fwd - 1, s->data->ack_rev, s->dport, s->sport); if (mbufq_enqueue(q, m)) { m_freem(m); log(LOG_DEBUG, "ipfw: limit for IPv4 " "keepalive queue is reached.\n"); return; } } } if ((s->data->state & ACK_REV) == 0 && s->data->ack_rev > 0) { m = dyn_mgethdr(sizeof(struct ip) + sizeof(struct tcphdr), s->data->fibnum); if (m != NULL) { dyn_make_keepalive_ipv4(m, s->src, s->dst, s->data->ack_rev - 1, s->data->ack_fwd, s->sport, s->dport); if (mbufq_enqueue(q, m)) { m_freem(m); log(LOG_DEBUG, "ipfw: limit for IPv4 " "keepalive queue is reached.\n"); return; } } } } /* * Prepare and send keep-alive packets. */ static void dyn_send_keepalive_ipv4(struct ip_fw_chain *chain) { struct mbufq q; struct mbuf *m; struct dyn_ipv4_state *s; uint32_t bucket; mbufq_init(&q, INT_MAX); IPFW_UH_RLOCK(chain); /* * It is safe to not use hazard pointer and just do lockless * access to the lists, because states entries can not be deleted * while we hold IPFW_UH_RLOCK. */ for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) { CK_SLIST_FOREACH(s, &V_dyn_ipv4[bucket], entry) { /* * Only established TCP connections that will * become expired within dyn_keepalive_interval. 
*/ if (s->proto != IPPROTO_TCP || (s->data->state & BOTH_SYN) != BOTH_SYN || TIME_LEQ(time_uptime + V_dyn_keepalive_interval, s->data->expire)) continue; dyn_enqueue_keepalive_ipv4(&q, s); } } IPFW_UH_RUNLOCK(chain); while ((m = mbufq_dequeue(&q)) != NULL) ip_output(m, NULL, NULL, 0, NULL, NULL); } #ifdef INET6 static void dyn_make_keepalive_ipv6(struct mbuf *m, const struct in6_addr *src, const struct in6_addr *dst, uint32_t zoneid, uint32_t seq, uint32_t ack, uint16_t sport, uint16_t dport) { struct tcphdr *tcp; struct ip6_hdr *ip6; ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_plen = htons(sizeof(struct tcphdr)); ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_hlim = IPV6_DEFHLIM; ip6->ip6_src = *src; if (IN6_IS_ADDR_LINKLOCAL(src)) ip6->ip6_src.s6_addr16[1] = htons(zoneid & 0xffff); ip6->ip6_dst = *dst; if (IN6_IS_ADDR_LINKLOCAL(dst)) ip6->ip6_dst.s6_addr16[1] = htons(zoneid & 0xffff); tcp = mtodo(m, sizeof(struct ip6_hdr)); tcp->th_sport = htons(sport); tcp->th_dport = htons(dport); tcp->th_off = sizeof(struct tcphdr) >> 2; tcp->th_seq = htonl(seq); tcp->th_ack = htonl(ack); tcp_set_flags(tcp, TH_ACK); tcp->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr), IPPROTO_TCP, 0); m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; } static void dyn_enqueue_keepalive_ipv6(struct mbufq *q, const struct dyn_ipv6_state *s) { struct mbuf *m; if ((s->data->state & ACK_FWD) == 0 && s->data->ack_fwd > 0) { m = dyn_mgethdr(sizeof(struct ip6_hdr) + sizeof(struct tcphdr), s->data->fibnum); if (m != NULL) { dyn_make_keepalive_ipv6(m, &s->dst, &s->src, s->zoneid, s->data->ack_fwd - 1, s->data->ack_rev, s->dport, s->sport); if (mbufq_enqueue(q, m)) { m_freem(m); log(LOG_DEBUG, "ipfw: limit for IPv6 " "keepalive queue is reached.\n"); return; } } } if ((s->data->state & ACK_REV) == 0 && s->data->ack_rev > 0) { m = dyn_mgethdr(sizeof(struct ip6_hdr) + sizeof(struct tcphdr), s->data->fibnum); if (m != NULL) { 
dyn_make_keepalive_ipv6(m, &s->src, &s->dst, s->zoneid, s->data->ack_rev - 1, s->data->ack_fwd, s->sport, s->dport); if (mbufq_enqueue(q, m)) { m_freem(m); log(LOG_DEBUG, "ipfw: limit for IPv6 " "keepalive queue is reached.\n"); return; } } } } static void dyn_send_keepalive_ipv6(struct ip_fw_chain *chain) { struct mbufq q; struct mbuf *m; struct dyn_ipv6_state *s; uint32_t bucket; mbufq_init(&q, INT_MAX); IPFW_UH_RLOCK(chain); /* * It is safe to not use hazard pointer and just do lockless * access to the lists, because states entries can not be deleted * while we hold IPFW_UH_RLOCK. */ for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) { CK_SLIST_FOREACH(s, &V_dyn_ipv6[bucket], entry) { /* * Only established TCP connections that will * become expired within dyn_keepalive_interval. */ if (s->proto != IPPROTO_TCP || (s->data->state & BOTH_SYN) != BOTH_SYN || TIME_LEQ(time_uptime + V_dyn_keepalive_interval, s->data->expire)) continue; dyn_enqueue_keepalive_ipv6(&q, s); } } IPFW_UH_RUNLOCK(chain); while ((m = mbufq_dequeue(&q)) != NULL) ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); } #endif /* INET6 */ static void dyn_grow_hashtable(struct ip_fw_chain *chain, uint32_t new, int flags) { #ifdef INET6 struct dyn_ipv6ck_slist *ipv6, *ipv6_parent; uint32_t *ipv6_add, *ipv6_del, *ipv6_parent_add, *ipv6_parent_del; struct dyn_ipv6_state *s6; #endif struct dyn_ipv4ck_slist *ipv4, *ipv4_parent; uint32_t *ipv4_add, *ipv4_del, *ipv4_parent_add, *ipv4_parent_del; struct dyn_ipv4_state *s4; struct mtx *bucket_lock; void *tmp; uint32_t bucket; MPASS(powerof2(new)); DYN_DEBUG("grow hash size %u -> %u", V_curr_dyn_buckets, new); /* * Allocate and initialize new lists. 
*/ bucket_lock = malloc(new * sizeof(struct mtx), M_IPFW, flags | M_ZERO); if (bucket_lock == NULL) return; ipv4 = ipv4_parent = NULL; ipv4_add = ipv4_del = ipv4_parent_add = ipv4_parent_del = NULL; #ifdef INET6 ipv6 = ipv6_parent = NULL; ipv6_add = ipv6_del = ipv6_parent_add = ipv6_parent_del = NULL; #endif ipv4 = malloc(new * sizeof(struct dyn_ipv4ck_slist), M_IPFW, flags | M_ZERO); if (ipv4 == NULL) goto bad; ipv4_parent = malloc(new * sizeof(struct dyn_ipv4ck_slist), M_IPFW, flags | M_ZERO); if (ipv4_parent == NULL) goto bad; ipv4_add = malloc(new * sizeof(uint32_t), M_IPFW, flags | M_ZERO); if (ipv4_add == NULL) goto bad; ipv4_del = malloc(new * sizeof(uint32_t), M_IPFW, flags | M_ZERO); if (ipv4_del == NULL) goto bad; ipv4_parent_add = malloc(new * sizeof(uint32_t), M_IPFW, flags | M_ZERO); if (ipv4_parent_add == NULL) goto bad; ipv4_parent_del = malloc(new * sizeof(uint32_t), M_IPFW, flags | M_ZERO); if (ipv4_parent_del == NULL) goto bad; #ifdef INET6 ipv6 = malloc(new * sizeof(struct dyn_ipv6ck_slist), M_IPFW, flags | M_ZERO); if (ipv6 == NULL) goto bad; ipv6_parent = malloc(new * sizeof(struct dyn_ipv6ck_slist), M_IPFW, flags | M_ZERO); if (ipv6_parent == NULL) goto bad; ipv6_add = malloc(new * sizeof(uint32_t), M_IPFW, flags | M_ZERO); if (ipv6_add == NULL) goto bad; ipv6_del = malloc(new * sizeof(uint32_t), M_IPFW, flags | M_ZERO); if (ipv6_del == NULL) goto bad; ipv6_parent_add = malloc(new * sizeof(uint32_t), M_IPFW, flags | M_ZERO); if (ipv6_parent_add == NULL) goto bad; ipv6_parent_del = malloc(new * sizeof(uint32_t), M_IPFW, flags | M_ZERO); if (ipv6_parent_del == NULL) goto bad; #endif for (bucket = 0; bucket < new; bucket++) { DYN_BUCKET_LOCK_INIT(bucket_lock, bucket); CK_SLIST_INIT(&ipv4[bucket]); CK_SLIST_INIT(&ipv4_parent[bucket]); #ifdef INET6 CK_SLIST_INIT(&ipv6[bucket]); CK_SLIST_INIT(&ipv6_parent[bucket]); #endif } #define DYN_RELINK_STATES(s, hval, i, head, ohead) do { \ while ((s = CK_SLIST_FIRST(&V_dyn_ ## ohead[i])) != NULL) { \ 
CK_SLIST_REMOVE_HEAD(&V_dyn_ ## ohead[i], entry); \ CK_SLIST_INSERT_HEAD(&head[DYN_BUCKET(s->hval, new)], \ s, entry); \ } \ } while (0) /* * Prevent rules changing from userland. */ IPFW_UH_WLOCK(chain); /* * Hold traffic processing until we finish resize to * prevent access to states lists. */ IPFW_WLOCK(chain); /* Re-link all dynamic states */ for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) { DYN_RELINK_STATES(s4, data->hashval, bucket, ipv4, ipv4); DYN_RELINK_STATES(s4, limit->hashval, bucket, ipv4_parent, ipv4_parent); #ifdef INET6 DYN_RELINK_STATES(s6, data->hashval, bucket, ipv6, ipv6); DYN_RELINK_STATES(s6, limit->hashval, bucket, ipv6_parent, ipv6_parent); #endif } #define DYN_SWAP_PTR(old, new, tmp) do { \ tmp = old; \ old = new; \ new = tmp; \ } while (0) /* Swap pointers */ DYN_SWAP_PTR(V_dyn_bucket_lock, bucket_lock, tmp); DYN_SWAP_PTR(V_dyn_ipv4, ipv4, tmp); DYN_SWAP_PTR(V_dyn_ipv4_parent, ipv4_parent, tmp); DYN_SWAP_PTR(V_dyn_ipv4_add, ipv4_add, tmp); DYN_SWAP_PTR(V_dyn_ipv4_parent_add, ipv4_parent_add, tmp); DYN_SWAP_PTR(V_dyn_ipv4_del, ipv4_del, tmp); DYN_SWAP_PTR(V_dyn_ipv4_parent_del, ipv4_parent_del, tmp); #ifdef INET6 DYN_SWAP_PTR(V_dyn_ipv6, ipv6, tmp); DYN_SWAP_PTR(V_dyn_ipv6_parent, ipv6_parent, tmp); DYN_SWAP_PTR(V_dyn_ipv6_add, ipv6_add, tmp); DYN_SWAP_PTR(V_dyn_ipv6_parent_add, ipv6_parent_add, tmp); DYN_SWAP_PTR(V_dyn_ipv6_del, ipv6_del, tmp); DYN_SWAP_PTR(V_dyn_ipv6_parent_del, ipv6_parent_del, tmp); #endif bucket = V_curr_dyn_buckets; V_curr_dyn_buckets = new; IPFW_WUNLOCK(chain); IPFW_UH_WUNLOCK(chain); /* Release old resources */ while (bucket-- != 0) DYN_BUCKET_LOCK_DESTROY(bucket_lock, bucket); bad: free(bucket_lock, M_IPFW); free(ipv4, M_IPFW); free(ipv4_parent, M_IPFW); free(ipv4_add, M_IPFW); free(ipv4_parent_add, M_IPFW); free(ipv4_del, M_IPFW); free(ipv4_parent_del, M_IPFW); #ifdef INET6 free(ipv6, M_IPFW); free(ipv6_parent, M_IPFW); free(ipv6_add, M_IPFW); free(ipv6_parent_add, M_IPFW); free(ipv6_del, M_IPFW); 
free(ipv6_parent_del, M_IPFW); #endif } /* * This function is used to perform various maintenance * on dynamic hash lists. Currently it is called every second. */ static void dyn_tick(void *vnetx) { struct epoch_tracker et; uint32_t buckets; CURVNET_SET((struct vnet *)vnetx); /* * First free states unlinked in previous passes. */ dyn_free_states(&V_layer3_chain); /* * Now unlink others expired states. * We use IPFW_UH_WLOCK to avoid concurrent call of * dyn_expire_states(). It is the only function that does * deletion of state entries from states lists. */ IPFW_UH_WLOCK(&V_layer3_chain); dyn_expire_states(&V_layer3_chain, NULL); IPFW_UH_WUNLOCK(&V_layer3_chain); /* * Send keepalives if they are enabled and the time has come. */ if (V_dyn_keepalive != 0 && V_dyn_keepalive_last + V_dyn_keepalive_period <= time_uptime) { V_dyn_keepalive_last = time_uptime; NET_EPOCH_ENTER(et); dyn_send_keepalive_ipv4(&V_layer3_chain); #ifdef INET6 dyn_send_keepalive_ipv6(&V_layer3_chain); #endif NET_EPOCH_EXIT(et); } /* * Check if we need to resize the hash: * if current number of states exceeds number of buckets in hash, * and dyn_buckets_max permits to grow the number of buckets, then * do it. Grow hash size to the minimum power of 2 which is bigger * than current states count. */ if (V_curr_dyn_buckets < V_dyn_buckets_max && (V_curr_dyn_buckets < V_dyn_count / 2 || ( V_curr_dyn_buckets < V_dyn_count && V_curr_max_length > 8))) { buckets = 1 << fls(V_dyn_count); if (buckets > V_dyn_buckets_max) buckets = V_dyn_buckets_max; dyn_grow_hashtable(&V_layer3_chain, buckets, M_NOWAIT); } callout_reset_on(&V_dyn_timeout, hz, dyn_tick, vnetx, 0); CURVNET_RESTORE(); } void ipfw_expire_dyn_states(struct ip_fw_chain *chain, ipfw_range_tlv *rt) { /* * Do not perform any checks if we currently have no dynamic states */ if (V_dyn_count == 0) return; IPFW_UH_WLOCK_ASSERT(chain); dyn_expire_states(chain, rt); } /* * Pass through all states and reset eaction for orphaned rules. 
*/ void -ipfw_dyn_reset_eaction(struct ip_fw_chain *ch, uint16_t eaction_id, - uint16_t default_id, uint16_t instance_id) +ipfw_dyn_reset_eaction(struct ip_fw_chain *ch, uint32_t eaction_id, + uint32_t default_id, uint32_t instance_id) { #ifdef INET6 struct dyn_ipv6_state *s6; #endif struct dyn_ipv4_state *s4; struct ip_fw *rule; uint32_t bucket; #define DYN_RESET_EACTION(s, h, b) \ CK_SLIST_FOREACH(s, &V_dyn_ ## h[b], entry) { \ if ((s->data->flags & DYN_REFERENCED) == 0) \ continue; \ rule = s->data->parent; \ if (s->type == O_LIMIT) \ rule = ((__typeof(s))rule)->limit->parent; \ ipfw_reset_eaction(ch, rule, eaction_id, \ default_id, instance_id); \ } IPFW_UH_WLOCK_ASSERT(ch); if (V_dyn_count == 0) return; for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) { DYN_RESET_EACTION(s4, ipv4, bucket); #ifdef INET6 DYN_RESET_EACTION(s6, ipv6, bucket); #endif } } /* * Returns size of dynamic states in legacy format */ int ipfw_dyn_len(void) { return ((V_dyn_count + V_dyn_parent_count) * sizeof(ipfw_dyn_rule)); } /* * Returns number of dynamic states. * Marks every named object index used by dynamic states with bit in @bmask. * Returns number of named objects accounted in bmask via @nocnt. * Used by dump format v1 (current). */ uint32_t ipfw_dyn_get_count(uint32_t *bmask, int *nocnt) { #ifdef INET6 struct dyn_ipv6_state *s6; #endif struct dyn_ipv4_state *s4; uint32_t bucket; #define DYN_COUNT_OBJECTS(s, h, b) \ CK_SLIST_FOREACH(s, &V_dyn_ ## h[b], entry) { \ MPASS(s->kidx != 0); \ if (ipfw_mark_object_kidx(bmask, IPFW_TLV_STATE_NAME, \ s->kidx) != 0) \ (*nocnt)++; \ } IPFW_UH_RLOCK_ASSERT(&V_layer3_chain); /* No need to pass through all the buckets. 
*/ *nocnt = 0; if (V_dyn_count + V_dyn_parent_count == 0) return (0); for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) { DYN_COUNT_OBJECTS(s4, ipv4, bucket); #ifdef INET6 DYN_COUNT_OBJECTS(s6, ipv6, bucket); #endif } return (V_dyn_count + V_dyn_parent_count); } /* * Check if rule contains at least one dynamic opcode. * * Returns 1 if such opcode is found, 0 otherwise. */ int ipfw_is_dyn_rule(struct ip_fw *rule) { int cmdlen, l; ipfw_insn *cmd; l = rule->cmd_len; cmd = rule->cmd; cmdlen = 0; for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); switch (cmd->opcode) { case O_LIMIT: case O_KEEP_STATE: case O_PROBE_STATE: case O_CHECK_STATE: return (1); } } return (0); } static void -dyn_export_parent(const struct dyn_parent *p, uint16_t kidx, uint8_t set, +dyn_export_parent(const struct dyn_parent *p, uint32_t kidx, uint8_t set, ipfw_dyn_rule *dst) { - dst->dyn_type = O_LIMIT_PARENT; + dst->type = O_LIMIT_PARENT; + dst->set = set; dst->kidx = kidx; - dst->count = (uint16_t)DPARENT_COUNT(p); + dst->rulenum = p->rulenum; + dst->count = DPARENT_COUNT(p); dst->expire = TIME_LEQ(p->expire, time_uptime) ? 0: p->expire - time_uptime; - - /* 'rule' is used to pass up the rule number and set */ - memcpy(&dst->rule, &p->rulenum, sizeof(p->rulenum)); - - /* store set number into high word of dst->rule pointer. */ - memcpy((char *)&dst->rule + sizeof(p->rulenum), &set, sizeof(set)); + dst->hashval = p->hashval; /* unused fields */ + dst->pad = 0; dst->pcnt = 0; dst->bcnt = 0; - dst->parent = NULL; - dst->state = 0; dst->ack_fwd = 0; dst->ack_rev = 0; - dst->bucket = p->hashval; - /* - * The legacy userland code will interpret a NULL here as a marker - * for the last dynamic rule. 
- */ - dst->next = (ipfw_dyn_rule *)1; } static void -dyn_export_data(const struct dyn_data *data, uint16_t kidx, uint8_t type, +dyn_export_data(const struct dyn_data *data, uint32_t kidx, uint8_t type, uint8_t set, ipfw_dyn_rule *dst) { - dst->dyn_type = type; + dst->type = type; + dst->set = set; dst->kidx = kidx; + dst->rulenum = data->rulenum; dst->pcnt = data->pcnt_fwd + data->pcnt_rev; dst->bcnt = data->bcnt_fwd + data->bcnt_rev; dst->expire = TIME_LEQ(data->expire, time_uptime) ? 0: data->expire - time_uptime; - - /* 'rule' is used to pass up the rule number and set */ - memcpy(&dst->rule, &data->rulenum, sizeof(data->rulenum)); - - /* store set number into high word of dst->rule pointer. */ - memcpy((char *)&dst->rule + sizeof(data->rulenum), &set, sizeof(set)); - dst->state = data->state; if (data->flags & DYN_REFERENCED) dst->state |= IPFW_DYN_ORPHANED; - /* unused fields */ - dst->parent = NULL; dst->ack_fwd = data->ack_fwd; dst->ack_rev = data->ack_rev; - dst->count = 0; - dst->bucket = data->hashval; - /* - * The legacy userland code will interpret a NULL here as a marker - * for the last dynamic rule. 
- */ - dst->next = (ipfw_dyn_rule *)1; + dst->hashval = data->hashval; } static void dyn_export_ipv4_state(const struct dyn_ipv4_state *s, ipfw_dyn_rule *dst) { struct ip_fw *rule; switch (s->type) { case O_LIMIT_PARENT: rule = s->limit->parent; dyn_export_parent(s->limit, s->kidx, rule->set, dst); break; default: rule = s->data->parent; if (s->type == O_LIMIT) rule = ((struct dyn_ipv4_state *)rule)->limit->parent; dyn_export_data(s->data, s->kidx, s->type, rule->set, dst); } dst->id.dst_ip = s->dst; dst->id.src_ip = s->src; dst->id.dst_port = s->dport; dst->id.src_port = s->sport; dst->id.fib = s->data->fibnum; dst->id.proto = s->proto; dst->id._flags = 0; dst->id.addr_type = 4; memset(&dst->id.dst_ip6, 0, sizeof(dst->id.dst_ip6)); memset(&dst->id.src_ip6, 0, sizeof(dst->id.src_ip6)); dst->id.flow_id6 = dst->id.extra = 0; } #ifdef INET6 static void dyn_export_ipv6_state(const struct dyn_ipv6_state *s, ipfw_dyn_rule *dst) { struct ip_fw *rule; switch (s->type) { case O_LIMIT_PARENT: rule = s->limit->parent; dyn_export_parent(s->limit, s->kidx, rule->set, dst); break; default: rule = s->data->parent; if (s->type == O_LIMIT) rule = ((struct dyn_ipv6_state *)rule)->limit->parent; dyn_export_data(s->data, s->kidx, s->type, rule->set, dst); } dst->id.src_ip6 = s->src; dst->id.dst_ip6 = s->dst; dst->id.dst_port = s->dport; dst->id.src_port = s->sport; dst->id.fib = s->data->fibnum; dst->id.proto = s->proto; dst->id._flags = 0; dst->id.addr_type = 6; dst->id.dst_ip = dst->id.src_ip = 0; dst->id.flow_id6 = dst->id.extra = 0; } #endif /* INET6 */ /* * Fills the buffer given by @sd with dynamic states. * Used by dump format v1 (current). * * Returns 0 on success. 
*/ int ipfw_dump_states(struct ip_fw_chain *chain, struct sockopt_data *sd) { #ifdef INET6 struct dyn_ipv6_state *s6; #endif struct dyn_ipv4_state *s4; ipfw_obj_dyntlv *dst, *last; ipfw_obj_ctlv *ctlv; uint32_t bucket; if (V_dyn_count == 0) return (0); /* * IPFW_UH_RLOCK garantees that another userland request * and callout thread will not delete entries from states * lists. */ IPFW_UH_RLOCK_ASSERT(chain); ctlv = (ipfw_obj_ctlv *)ipfw_get_sopt_space(sd, sizeof(*ctlv)); if (ctlv == NULL) return (ENOMEM); ctlv->head.type = IPFW_TLV_DYNSTATE_LIST; ctlv->objsize = sizeof(ipfw_obj_dyntlv); last = NULL; #define DYN_EXPORT_STATES(s, af, h, b) \ CK_SLIST_FOREACH(s, &V_dyn_ ## h[b], entry) { \ dst = (ipfw_obj_dyntlv *)ipfw_get_sopt_space(sd, \ sizeof(ipfw_obj_dyntlv)); \ if (dst == NULL) \ return (ENOMEM); \ dyn_export_ ## af ## _state(s, &dst->state); \ dst->head.length = sizeof(ipfw_obj_dyntlv); \ dst->head.type = IPFW_TLV_DYN_ENT; \ last = dst; \ } for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) { DYN_EXPORT_STATES(s4, ipv4, ipv4_parent, bucket); DYN_EXPORT_STATES(s4, ipv4, ipv4, bucket); #ifdef INET6 DYN_EXPORT_STATES(s6, ipv6, ipv6_parent, bucket); DYN_EXPORT_STATES(s6, ipv6, ipv6, bucket); #endif /* INET6 */ } /* mark last dynamic rule */ if (last != NULL) last->head.flags = IPFW_DF_LAST; /* XXX: unused */ return (0); #undef DYN_EXPORT_STATES } -/* - * Fill given buffer with dynamic states (legacy format). - * IPFW_UH_RLOCK has to be held while calling. 
- */ -void -ipfw_get_dynamic(struct ip_fw_chain *chain, char **pbp, const char *ep) -{ -#ifdef INET6 - struct dyn_ipv6_state *s6; -#endif - struct dyn_ipv4_state *s4; - ipfw_dyn_rule *p, *last = NULL; - char *bp; - uint32_t bucket; - - if (V_dyn_count == 0) - return; - bp = *pbp; - - IPFW_UH_RLOCK_ASSERT(chain); - -#define DYN_EXPORT_STATES(s, af, head, b) \ - CK_SLIST_FOREACH(s, &V_dyn_ ## head[b], entry) { \ - if (bp + sizeof(*p) > ep) \ - break; \ - p = (ipfw_dyn_rule *)bp; \ - dyn_export_ ## af ## _state(s, p); \ - last = p; \ - bp += sizeof(*p); \ - } - - for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) { - DYN_EXPORT_STATES(s4, ipv4, ipv4_parent, bucket); - DYN_EXPORT_STATES(s4, ipv4, ipv4, bucket); -#ifdef INET6 - DYN_EXPORT_STATES(s6, ipv6, ipv6_parent, bucket); - DYN_EXPORT_STATES(s6, ipv6, ipv6, bucket); -#endif /* INET6 */ - } - - if (last != NULL) /* mark last dynamic rule */ - last->next = NULL; - *pbp = bp; -#undef DYN_EXPORT_STATES -} - void ipfw_dyn_init(struct ip_fw_chain *chain) { #ifdef IPFIREWALL_JENKINSHASH V_dyn_hashseed = arc4random(); #endif V_dyn_max = 16384; /* max # of states */ V_dyn_parent_max = 4096; /* max # of parent states */ V_dyn_buckets_max = 8192; /* must be power of 2 */ V_dyn_ack_lifetime = 300; V_dyn_syn_lifetime = 20; V_dyn_fin_lifetime = 1; V_dyn_rst_lifetime = 1; V_dyn_udp_lifetime = 10; V_dyn_short_lifetime = 5; V_dyn_keepalive_interval = 20; V_dyn_keepalive_period = 5; V_dyn_keepalive = 1; /* send keepalives */ V_dyn_keepalive_last = time_uptime; V_dyn_data_zone = uma_zcreate("IPFW dynamic states data", sizeof(struct dyn_data), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_zone_set_max(V_dyn_data_zone, V_dyn_max); V_dyn_parent_zone = uma_zcreate("IPFW parent dynamic states", sizeof(struct dyn_parent), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_zone_set_max(V_dyn_parent_zone, V_dyn_parent_max); SLIST_INIT(&V_dyn_expired_ipv4); V_dyn_ipv4 = NULL; V_dyn_ipv4_parent = NULL; V_dyn_ipv4_zone = uma_zcreate("IPFW 
IPv4 dynamic states", sizeof(struct dyn_ipv4_state), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); #ifdef INET6 SLIST_INIT(&V_dyn_expired_ipv6); V_dyn_ipv6 = NULL; V_dyn_ipv6_parent = NULL; V_dyn_ipv6_zone = uma_zcreate("IPFW IPv6 dynamic states", sizeof(struct dyn_ipv6_state), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); #endif /* Initialize buckets. */ V_curr_dyn_buckets = 0; V_dyn_bucket_lock = NULL; dyn_grow_hashtable(chain, 256, M_WAITOK); if (IS_DEFAULT_VNET(curvnet)) dyn_hp_cache = malloc(mp_ncpus * sizeof(void *), M_IPFW, M_WAITOK | M_ZERO); DYN_EXPIRED_LOCK_INIT(); callout_init(&V_dyn_timeout, 1); callout_reset(&V_dyn_timeout, hz, dyn_tick, curvnet); IPFW_ADD_OBJ_REWRITER(IS_DEFAULT_VNET(curvnet), dyn_opcodes); } void ipfw_dyn_uninit(int pass) { #ifdef INET6 struct dyn_ipv6_state *s6; #endif struct dyn_ipv4_state *s4; int bucket; if (pass == 0) { callout_drain(&V_dyn_timeout); return; } IPFW_DEL_OBJ_REWRITER(IS_DEFAULT_VNET(curvnet), dyn_opcodes); DYN_EXPIRED_LOCK_DESTROY(); #define DYN_FREE_STATES_FORCED(CK, s, af, name, en) do { \ while ((s = CK ## SLIST_FIRST(&V_dyn_ ## name)) != NULL) { \ CK ## SLIST_REMOVE_HEAD(&V_dyn_ ## name, en); \ if (s->type == O_LIMIT_PARENT) \ uma_zfree(V_dyn_parent_zone, s->limit); \ else \ uma_zfree(V_dyn_data_zone, s->data); \ uma_zfree(V_dyn_ ## af ## _zone, s); \ } \ } while (0) for (bucket = 0; bucket < V_curr_dyn_buckets; bucket++) { DYN_BUCKET_LOCK_DESTROY(V_dyn_bucket_lock, bucket); DYN_FREE_STATES_FORCED(CK_, s4, ipv4, ipv4[bucket], entry); DYN_FREE_STATES_FORCED(CK_, s4, ipv4, ipv4_parent[bucket], entry); #ifdef INET6 DYN_FREE_STATES_FORCED(CK_, s6, ipv6, ipv6[bucket], entry); DYN_FREE_STATES_FORCED(CK_, s6, ipv6, ipv6_parent[bucket], entry); #endif /* INET6 */ } DYN_FREE_STATES_FORCED(, s4, ipv4, expired_ipv4, expired); #ifdef INET6 DYN_FREE_STATES_FORCED(, s6, ipv6, expired_ipv6, expired); #endif #undef DYN_FREE_STATES_FORCED uma_zdestroy(V_dyn_ipv4_zone); uma_zdestroy(V_dyn_data_zone); uma_zdestroy(V_dyn_parent_zone); 
#ifdef INET6 uma_zdestroy(V_dyn_ipv6_zone); free(V_dyn_ipv6, M_IPFW); free(V_dyn_ipv6_parent, M_IPFW); free(V_dyn_ipv6_add, M_IPFW); free(V_dyn_ipv6_parent_add, M_IPFW); free(V_dyn_ipv6_del, M_IPFW); free(V_dyn_ipv6_parent_del, M_IPFW); #endif free(V_dyn_bucket_lock, M_IPFW); free(V_dyn_ipv4, M_IPFW); free(V_dyn_ipv4_parent, M_IPFW); free(V_dyn_ipv4_add, M_IPFW); free(V_dyn_ipv4_parent_add, M_IPFW); free(V_dyn_ipv4_del, M_IPFW); free(V_dyn_ipv4_parent_del, M_IPFW); if (IS_DEFAULT_VNET(curvnet)) free(dyn_hp_cache, M_IPFW); } diff --git a/sys/netpfil/ipfw/ip_fw_eaction.c b/sys/netpfil/ipfw/ip_fw_eaction.c index 0dfa3df625d8..4abe85879ff7 100644 --- a/sys/netpfil/ipfw/ip_fw_eaction.c +++ b/sys/netpfil/ipfw/ip_fw_eaction.c @@ -1,452 +1,463 @@ /*- - * Copyright (c) 2016-2017 Yandex LLC - * Copyright (c) 2016-2017 Andrey V. Elsukov + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2016-2025 Yandex LLC + * Copyright (c) 2016-2025 Andrey V. Elsukov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include /* ip_fw.h requires IFNAMSIZ */ #include #include #include /* struct ipfw_rule_ref */ #include #include #include "opt_ipfw.h" /* * External actions support for ipfw. * * This code provides KPI for implementing loadable modules, that * can provide handlers for external action opcodes in the ipfw's * rules. * Module should implement opcode handler with type ipfw_eaction_t. * This handler will be called by ipfw_chk() function when * O_EXTERNAL_ACTION opcode is matched. The handler must return * value used as return value in ipfw_chk(), i.e. IP_FW_PASS, * IP_FW_DENY (see ip_fw_private.h). * Also the last argument must be set by handler. If it is zero, * the search continues to the next rule. If it has non zero value, * the search terminates. * * The module that implements external action should register its * handler and name with ipfw_add_eaction() function. * This function will return eaction_id, that can be used by module. * * It is possible to pass some additional information to external * action handler using O_EXTERNAL_INSTANCE and O_EXTERNAL_DATA opcodes. * Such opcodes should be next after the O_EXTERNAL_ACTION opcode. - * For the O_EXTERNAL_INSTANCE opcode the cmd->arg1 contains index of named + * For the O_EXTERNAL_INSTANCE opcode the cmd->kidx contains index of named * object related to an instance of external action. 
* For the O_EXTERNAL_DATA opcode the cmd contains the data that can be used * by external action handler without needing to create named instance. * * In case when eaction module uses named instances, it should register * opcode rewriting routines for O_EXTERNAL_INSTANCE opcode. The * classifier callback can look back into O_EXTERNAL_ACTION opcode (it - * must be in the (ipfw_insn *)(cmd - 1)). By arg1 from O_EXTERNAL_ACTION + * must be in the (ipfw_insn *)(cmd - 2)). By kidx from O_EXTERNAL_ACTION * it can deteremine eaction_id and compare it with its own. * The macro IPFW_TLV_EACTION_NAME(eaction_id) can be used to deteremine * the type of named_object related to external action instance. * * On module unload handler should be deregistered with ipfw_del_eaction() * function using known eaction_id. */ struct eaction_obj { struct named_object no; ipfw_eaction_t *handler; char name[64]; }; #define EACTION_OBJ(ch, cmd) \ - ((struct eaction_obj *)SRV_OBJECT((ch), (cmd)->arg1)) + ((struct eaction_obj *)SRV_OBJECT((ch), insntod((cmd), kidx)->kidx)) #if 0 #define EACTION_DEBUG(fmt, ...) do { \ printf("%s: " fmt "\n", __func__, ## __VA_ARGS__); \ } while (0) #else #define EACTION_DEBUG(fmt, ...) #endif const char *default_eaction_typename = "drop"; static int default_eaction(struct ip_fw_chain *ch, struct ip_fw_args *args, ipfw_insn *cmd, int *done) { *done = 1; /* terminate the search */ return (IP_FW_DENY); } /* * Opcode rewriting callbacks. 
*/ static int -eaction_classify(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype) +eaction_classify(ipfw_insn *cmd0, uint32_t *puidx, uint8_t *ptype) { + ipfw_insn_kidx *cmd; + + if (F_LEN(cmd0) <= 1) + return (EINVAL); - EACTION_DEBUG("opcode %d, arg1 %d", cmd->opcode, cmd->arg1); - *puidx = cmd->arg1; + cmd = insntod(cmd0, kidx); + EACTION_DEBUG("opcode %u, kidx %u", cmd0->opcode, cmd->kidx); + *puidx = cmd->kidx; *ptype = 0; return (0); } static void -eaction_update(ipfw_insn *cmd, uint16_t idx) +eaction_update(ipfw_insn *cmd0, uint32_t idx) { + ipfw_insn_kidx *cmd; - cmd->arg1 = idx; - EACTION_DEBUG("opcode %d, arg1 -> %d", cmd->opcode, cmd->arg1); + cmd = insntod(cmd0, kidx); + cmd->kidx = idx; + EACTION_DEBUG("opcode %u, kidx -> %u", cmd0->opcode, cmd->kidx); } static int eaction_findbyname(struct ip_fw_chain *ch, struct tid_info *ti, struct named_object **pno) { ipfw_obj_ntlv *ntlv; if (ti->tlvs == NULL) return (EINVAL); /* Search ntlv in the buffer provided by user */ ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx, IPFW_TLV_EACTION); if (ntlv == NULL) return (EINVAL); EACTION_DEBUG("name %s, uidx %u, type %u", ntlv->name, ti->uidx, ti->type); /* * Search named object with corresponding name. * Since eaction objects are global - ignore the set value * and use zero instead. 
*/ *pno = ipfw_objhash_lookup_name_type(CHAIN_TO_SRV(ch), 0, IPFW_TLV_EACTION, ntlv->name); if (*pno == NULL) return (ESRCH); return (0); } static struct named_object * -eaction_findbykidx(struct ip_fw_chain *ch, uint16_t idx) +eaction_findbykidx(struct ip_fw_chain *ch, uint32_t idx) { EACTION_DEBUG("kidx %u", idx); return (ipfw_objhash_lookup_kidx(CHAIN_TO_SRV(ch), idx)); } static struct opcode_obj_rewrite eaction_opcodes[] = { { .opcode = O_EXTERNAL_ACTION, .etlv = IPFW_TLV_EACTION, .classifier = eaction_classify, .update = eaction_update, .find_byname = eaction_findbyname, .find_bykidx = eaction_findbykidx, }, }; static int create_eaction_obj(struct ip_fw_chain *ch, ipfw_eaction_t handler, - const char *name, uint16_t *eaction_id) + const char *name, uint32_t *eaction_id) { struct namedobj_instance *ni; struct eaction_obj *obj; IPFW_UH_UNLOCK_ASSERT(ch); ni = CHAIN_TO_SRV(ch); obj = malloc(sizeof(*obj), M_IPFW, M_WAITOK | M_ZERO); obj->no.name = obj->name; obj->no.etlv = IPFW_TLV_EACTION; obj->handler = handler; strlcpy(obj->name, name, sizeof(obj->name)); IPFW_UH_WLOCK(ch); if (ipfw_objhash_lookup_name_type(ni, 0, IPFW_TLV_EACTION, name) != NULL) { /* * Object is already created. * We don't allow eactions with the same name. 
*/ IPFW_UH_WUNLOCK(ch); free(obj, M_IPFW); EACTION_DEBUG("External action with typename " "'%s' already exists", name); return (EEXIST); } if (ipfw_objhash_alloc_idx(ni, &obj->no.kidx) != 0) { IPFW_UH_WUNLOCK(ch); free(obj, M_IPFW); EACTION_DEBUG("alloc_idx failed"); return (ENOSPC); } ipfw_objhash_add(ni, &obj->no); IPFW_WLOCK(ch); SRV_OBJECT(ch, obj->no.kidx) = obj; IPFW_WUNLOCK(ch); obj->no.refcnt++; IPFW_UH_WUNLOCK(ch); if (eaction_id != NULL) *eaction_id = obj->no.kidx; return (0); } static void destroy_eaction_obj(struct ip_fw_chain *ch, struct named_object *no) { struct namedobj_instance *ni; struct eaction_obj *obj; IPFW_UH_WLOCK_ASSERT(ch); ni = CHAIN_TO_SRV(ch); IPFW_WLOCK(ch); obj = SRV_OBJECT(ch, no->kidx); SRV_OBJECT(ch, no->kidx) = NULL; IPFW_WUNLOCK(ch); ipfw_objhash_del(ni, no); ipfw_objhash_free_idx(ni, no->kidx); free(obj, M_IPFW); } /* * Resets all eaction opcodes to default handlers. */ static void -reset_eaction_rules(struct ip_fw_chain *ch, uint16_t eaction_id, - uint16_t instance_id, bool reset_rules) +reset_eaction_rules(struct ip_fw_chain *ch, uint32_t eaction_id, + uint32_t instance_id, bool reset_rules) { struct named_object *no; int i; IPFW_UH_WLOCK_ASSERT(ch); no = ipfw_objhash_lookup_name_type(CHAIN_TO_SRV(ch), 0, IPFW_TLV_EACTION, default_eaction_typename); if (no == NULL) panic("Default external action handler is not found"); if (eaction_id == no->kidx) panic("Wrong eaction_id"); EACTION_DEBUG("Going to replace id %u with %u", eaction_id, no->kidx); IPFW_WLOCK(ch); /* * Reset eaction objects only if it is referenced by rules. * But always reset objects for orphaned dynamic states. */ if (reset_rules) { for (i = 0; i < ch->n_rules; i++) { /* * Refcount on the original object will be just * ignored on destroy. Refcount on default_eaction * will be decremented on rule deletion, thus we * need to reference default_eaction object. 
*/ if (ipfw_reset_eaction(ch, ch->map[i], eaction_id, no->kidx, instance_id) != 0) no->refcnt++; } } /* * Reset eaction opcodes for orphaned dynamic states. * Since parent rules are already deleted, we don't need to * reference named object of default_eaction. */ ipfw_dyn_reset_eaction(ch, eaction_id, no->kidx, instance_id); IPFW_WUNLOCK(ch); } /* * Initialize external actions framework. * Create object with default eaction handler "drop". */ int ipfw_eaction_init(struct ip_fw_chain *ch, int first) { int error; error = create_eaction_obj(ch, default_eaction, default_eaction_typename, NULL); if (error != 0) return (error); IPFW_ADD_OBJ_REWRITER(first, eaction_opcodes); EACTION_DEBUG("External actions support initialized"); return (0); } void ipfw_eaction_uninit(struct ip_fw_chain *ch, int last) { struct namedobj_instance *ni; struct named_object *no; ni = CHAIN_TO_SRV(ch); IPFW_UH_WLOCK(ch); no = ipfw_objhash_lookup_name_type(ni, 0, IPFW_TLV_EACTION, default_eaction_typename); if (no != NULL) destroy_eaction_obj(ch, no); IPFW_UH_WUNLOCK(ch); IPFW_DEL_OBJ_REWRITER(last, eaction_opcodes); EACTION_DEBUG("External actions support uninitialized"); } /* * Registers external action handler to the global array. * On success it returns eaction id, otherwise - zero. */ -uint16_t +uint32_t ipfw_add_eaction(struct ip_fw_chain *ch, ipfw_eaction_t handler, const char *name) { - uint16_t eaction_id; + uint32_t eaction_id; eaction_id = 0; if (ipfw_check_object_name_generic(name) == 0) { create_eaction_obj(ch, handler, name, &eaction_id); EACTION_DEBUG("Registered external action '%s' with id %u", name, eaction_id); } return (eaction_id); } /* * Deregisters external action handler with id eaction_id. 
*/ int -ipfw_del_eaction(struct ip_fw_chain *ch, uint16_t eaction_id) +ipfw_del_eaction(struct ip_fw_chain *ch, uint32_t eaction_id) { struct named_object *no; IPFW_UH_WLOCK(ch); no = ipfw_objhash_lookup_kidx(CHAIN_TO_SRV(ch), eaction_id); if (no == NULL || no->etlv != IPFW_TLV_EACTION) { IPFW_UH_WUNLOCK(ch); return (EINVAL); } reset_eaction_rules(ch, eaction_id, 0, (no->refcnt > 1)); EACTION_DEBUG("External action '%s' with id %u unregistered", no->name, eaction_id); destroy_eaction_obj(ch, no); IPFW_UH_WUNLOCK(ch); return (0); } int ipfw_reset_eaction(struct ip_fw_chain *ch, struct ip_fw *rule, - uint16_t eaction_id, uint16_t default_id, uint16_t instance_id) + uint32_t eaction_id, uint32_t default_id, uint32_t instance_id) { ipfw_insn *cmd, *icmd; int l; IPFW_UH_WLOCK_ASSERT(ch); IPFW_WLOCK_ASSERT(ch); /* * Return if there is not O_EXTERNAL_ACTION or its id is * different. */ cmd = ipfw_get_action(rule); if (cmd->opcode != O_EXTERNAL_ACTION || - cmd->arg1 != eaction_id) + insntod(cmd, kidx)->kidx != eaction_id) return (0); /* * Check if there is O_EXTERNAL_INSTANCE opcode, we need * to truncate the rule length. * - * NOTE: F_LEN(cmd) must be 1 for O_EXTERNAL_ACTION opcode, + * NOTE: F_LEN(cmd) must be 2 for O_EXTERNAL_ACTION opcode, * and rule length should be enough to keep O_EXTERNAL_INSTANCE - * opcode, thus we do check for l > 1. + * opcode, thus we do check for l > 2. */ l = rule->cmd + rule->cmd_len - cmd; - if (l > 1) { - MPASS(F_LEN(cmd) == 1); - icmd = cmd + 1; + if (l > 2) { + MPASS(F_LEN(cmd) == 2); + icmd = cmd + F_LEN(cmd); if (icmd->opcode == O_EXTERNAL_INSTANCE && - instance_id != 0 && icmd->arg1 != instance_id) + instance_id != 0 && + insntod(icmd, kidx)->kidx != instance_id) return (0); /* * Since named_object related to this instance will be * destroyed, truncate the chain of opcodes to remove * the rest of cmd chain just after O_EXTERNAL_ACTION * opcode. 
*/ - EACTION_DEBUG("truncate rule %d: len %u -> %u", + EACTION_DEBUG("truncate rule %u: len %u -> %u", rule->rulenum, rule->cmd_len, rule->cmd_len - F_LEN(icmd)); rule->cmd_len -= F_LEN(icmd); MPASS(((uint32_t *)icmd - (uint32_t *)rule->cmd) == rule->cmd_len); } - cmd->arg1 = default_id; /* Set to default id */ + insntod(cmd, kidx)->kidx = default_id; /* Set to default id */ /* * Return 1 when reset successfully happened. */ return (1); } /* * This function should be called before external action instance is * destroyed. It will reset eaction_id to default_id for rules, where * eaction has instance with id == kidx. */ int -ipfw_reset_eaction_instance(struct ip_fw_chain *ch, uint16_t eaction_id, - uint16_t kidx) +ipfw_reset_eaction_instance(struct ip_fw_chain *ch, uint32_t eaction_id, + uint32_t kidx) { struct named_object *no; IPFW_UH_WLOCK_ASSERT(ch); no = ipfw_objhash_lookup_kidx(CHAIN_TO_SRV(ch), eaction_id); if (no == NULL || no->etlv != IPFW_TLV_EACTION) return (EINVAL); reset_eaction_rules(ch, eaction_id, kidx, 0); return (0); } int ipfw_run_eaction(struct ip_fw_chain *ch, struct ip_fw_args *args, ipfw_insn *cmd, int *done) { + MPASS(F_LEN(cmd) == 2); return (EACTION_OBJ(ch, cmd)->handler(ch, args, cmd, done)); } diff --git a/sys/netpfil/ipfw/ip_fw_iface.c b/sys/netpfil/ipfw/ip_fw_iface.c index 92692637b9e3..3c648feecba4 100644 --- a/sys/netpfil/ipfw/ip_fw_iface.c +++ b/sys/netpfil/ipfw/ip_fw_iface.c @@ -1,536 +1,538 @@ /*- - * Copyright (c) 2014 Yandex LLC. + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2014-2025 Yandex LLC. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include /* * Kernel interface tracking API. * */ #include "opt_ipfw.h" #include "opt_inet.h" #ifndef INET #error IPFIREWALL requires INET. #endif /* INET */ #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* struct ipfw_rule_ref */ #include #include #define CHAIN_TO_II(ch) ((struct namedobj_instance *)ch->ifcfg) #define DEFAULT_IFACES 128 static void handle_ifdetach(struct ip_fw_chain *ch, struct ipfw_iface *iif, uint16_t ifindex); static void handle_ifattach(struct ip_fw_chain *ch, struct ipfw_iface *iif, uint16_t ifindex); static int list_ifaces(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd); static struct ipfw_sopt_handler scodes[] = { - { IP_FW_XIFLIST, 0, HDIR_GET, list_ifaces }, + { IP_FW_XIFLIST, IP_FW3_OPVER, HDIR_GET, list_ifaces }, }; /* * FreeBSD Kernel interface. 
*/ static void ipfw_kifhandler(void *arg, struct ifnet *ifp); static int ipfw_kiflookup(char *name); static void iface_khandler_register(void); static void iface_khandler_deregister(void); static eventhandler_tag ipfw_ifdetach_event, ipfw_ifattach_event; static int num_vnets = 0; static struct mtx vnet_mtx; /* * Checks if kernel interface is contained in our tracked * interface list and calls attach/detach handler. */ static void ipfw_kifhandler(void *arg, struct ifnet *ifp) { struct ip_fw_chain *ch; struct ipfw_iface *iif; struct namedobj_instance *ii; uintptr_t htype; if (V_ipfw_vnet_ready == 0) return; ch = &V_layer3_chain; htype = (uintptr_t)arg; IPFW_UH_WLOCK(ch); ii = CHAIN_TO_II(ch); if (ii == NULL) { IPFW_UH_WUNLOCK(ch); return; } iif = (struct ipfw_iface*)ipfw_objhash_lookup_name(ii, 0, if_name(ifp)); if (iif != NULL) { if (htype == 1) handle_ifattach(ch, iif, ifp->if_index); else handle_ifdetach(ch, iif, ifp->if_index); } IPFW_UH_WUNLOCK(ch); } /* * Reference current VNET as iface tracking API user. * Registers interface tracking handlers for first VNET. */ static void iface_khandler_register(void) { int create; create = 0; mtx_lock(&vnet_mtx); if (num_vnets == 0) create = 1; num_vnets++; mtx_unlock(&vnet_mtx); if (create == 0) return; printf("IPFW: starting up interface tracker\n"); ipfw_ifdetach_event = EVENTHANDLER_REGISTER( ifnet_departure_event, ipfw_kifhandler, NULL, EVENTHANDLER_PRI_ANY); ipfw_ifattach_event = EVENTHANDLER_REGISTER( ifnet_arrival_event, ipfw_kifhandler, (void*)((uintptr_t)1), EVENTHANDLER_PRI_ANY); } /* * * Detach interface event handlers on last VNET instance * detach. */ static void iface_khandler_deregister(void) { int destroy; destroy = 0; mtx_lock(&vnet_mtx); if (num_vnets == 1) destroy = 1; num_vnets--; mtx_unlock(&vnet_mtx); if (destroy == 0) return; EVENTHANDLER_DEREGISTER(ifnet_arrival_event, ipfw_ifattach_event); EVENTHANDLER_DEREGISTER(ifnet_departure_event, ipfw_ifdetach_event); } /* * Retrieves ifindex for given @name. 
* * Returns ifindex or 0. */ static int ipfw_kiflookup(char *name) { struct ifnet *ifp; int ifindex; ifindex = 0; if ((ifp = ifunit_ref(name)) != NULL) { ifindex = ifp->if_index; if_rele(ifp); } return (ifindex); } /* * Global ipfw startup hook. * Since we perform lazy initialization, do nothing except * mutex init. */ int ipfw_iface_init(void) { mtx_init(&vnet_mtx, "IPFW ifhandler mtx", NULL, MTX_DEF); IPFW_ADD_SOPT_HANDLER(1, scodes); return (0); } /* * Global ipfw destroy hook. * Unregister khandlers iff init has been done. */ void ipfw_iface_destroy(void) { IPFW_DEL_SOPT_HANDLER(1, scodes); mtx_destroy(&vnet_mtx); } /* * Perform actual init on internal request. * Inits both namehash and global khandler. */ static void vnet_ipfw_iface_init(struct ip_fw_chain *ch) { struct namedobj_instance *ii; - ii = ipfw_objhash_create(DEFAULT_IFACES); + ii = ipfw_objhash_create(DEFAULT_IFACES, DEFAULT_OBJHASH_SIZE); IPFW_UH_WLOCK(ch); if (ch->ifcfg == NULL) { ch->ifcfg = ii; ii = NULL; } IPFW_UH_WUNLOCK(ch); if (ii != NULL) { /* Already initialized. Free namehash. */ ipfw_objhash_destroy(ii); } else { /* We're the first ones. Init kernel hooks. */ iface_khandler_register(); } } static int destroy_iface(struct namedobj_instance *ii, struct named_object *no, void *arg) { /* Assume all consumers have been already detached */ free(no, M_IPFW); return (0); } /* * Per-VNET ipfw detach hook. * */ void vnet_ipfw_iface_destroy(struct ip_fw_chain *ch) { struct namedobj_instance *ii; IPFW_UH_WLOCK(ch); ii = CHAIN_TO_II(ch); ch->ifcfg = NULL; IPFW_UH_WUNLOCK(ch); if (ii != NULL) { ipfw_objhash_foreach(ii, destroy_iface, ch); ipfw_objhash_destroy(ii); iface_khandler_deregister(); } } /* * Notify the subsystem that we are interested in tracking * interface @name. This function has to be called without * holding any locks to permit allocating the necessary states * for proper interface tracking. * * Returns 0 on success. 
*/ int ipfw_iface_ref(struct ip_fw_chain *ch, char *name, struct ipfw_ifc *ic) { struct namedobj_instance *ii; struct ipfw_iface *iif, *tmp; if (strlen(name) >= sizeof(iif->ifname)) return (EINVAL); IPFW_UH_WLOCK(ch); ii = CHAIN_TO_II(ch); if (ii == NULL) { /* * First request to subsystem. * Let's perform init. */ IPFW_UH_WUNLOCK(ch); vnet_ipfw_iface_init(ch); IPFW_UH_WLOCK(ch); ii = CHAIN_TO_II(ch); } iif = (struct ipfw_iface *)ipfw_objhash_lookup_name(ii, 0, name); if (iif != NULL) { iif->no.refcnt++; ic->iface = iif; IPFW_UH_WUNLOCK(ch); return (0); } IPFW_UH_WUNLOCK(ch); /* Not found. Let's create one */ iif = malloc(sizeof(struct ipfw_iface), M_IPFW, M_WAITOK | M_ZERO); TAILQ_INIT(&iif->consumers); iif->no.name = iif->ifname; strlcpy(iif->ifname, name, sizeof(iif->ifname)); /* * Ref & link to the list. * * We assume ifnet_arrival_event / ifnet_departure_event * are not holding any locks. */ iif->no.refcnt = 1; IPFW_UH_WLOCK(ch); tmp = (struct ipfw_iface *)ipfw_objhash_lookup_name(ii, 0, name); if (tmp != NULL) { /* Interface has been created since unlock. Ref and return */ tmp->no.refcnt++; ic->iface = tmp; IPFW_UH_WUNLOCK(ch); free(iif, M_IPFW); return (0); } iif->ifindex = ipfw_kiflookup(name); if (iif->ifindex != 0) iif->resolved = 1; ipfw_objhash_add(ii, &iif->no); ic->iface = iif; IPFW_UH_WUNLOCK(ch); return (0); } /* * Adds @ic to the list of iif interface consumers. * Must be called with holding both UH+WLOCK. * Callback may be immediately called (if interface exists). */ void ipfw_iface_add_notify(struct ip_fw_chain *ch, struct ipfw_ifc *ic) { struct ipfw_iface *iif; IPFW_UH_WLOCK_ASSERT(ch); IPFW_WLOCK_ASSERT(ch); iif = ic->iface; TAILQ_INSERT_TAIL(&iif->consumers, ic, next); if (iif->resolved != 0) ic->cb(ch, ic->cbdata, iif->ifindex); } /* * Unlinks interface tracker object @ic from interface. * Must be called while holding UH lock. 
*/ void ipfw_iface_del_notify(struct ip_fw_chain *ch, struct ipfw_ifc *ic) { struct ipfw_iface *iif; IPFW_UH_WLOCK_ASSERT(ch); iif = ic->iface; TAILQ_REMOVE(&iif->consumers, ic, next); } /* * Unreference interface specified by @ic. * Must be called while holding UH lock. */ void ipfw_iface_unref(struct ip_fw_chain *ch, struct ipfw_ifc *ic) { struct ipfw_iface *iif; IPFW_UH_WLOCK_ASSERT(ch); iif = ic->iface; ic->iface = NULL; iif->no.refcnt--; /* TODO: check for references & delete */ } /* * Interface arrival handler. */ static void handle_ifattach(struct ip_fw_chain *ch, struct ipfw_iface *iif, uint16_t ifindex) { struct ipfw_ifc *ic; IPFW_UH_WLOCK_ASSERT(ch); iif->gencnt++; iif->resolved = 1; iif->ifindex = ifindex; IPFW_WLOCK(ch); TAILQ_FOREACH(ic, &iif->consumers, next) ic->cb(ch, ic->cbdata, iif->ifindex); IPFW_WUNLOCK(ch); } /* * Interface departure handler. */ static void handle_ifdetach(struct ip_fw_chain *ch, struct ipfw_iface *iif, uint16_t ifindex) { struct ipfw_ifc *ic; IPFW_UH_WLOCK_ASSERT(ch); IPFW_WLOCK(ch); TAILQ_FOREACH(ic, &iif->consumers, next) ic->cb(ch, ic->cbdata, 0); IPFW_WUNLOCK(ch); iif->gencnt++; iif->resolved = 0; iif->ifindex = 0; } struct dump_iface_args { struct ip_fw_chain *ch; struct sockopt_data *sd; }; static int export_iface_internal(struct namedobj_instance *ii, struct named_object *no, void *arg) { ipfw_iface_info *i; struct dump_iface_args *da; struct ipfw_iface *iif; da = (struct dump_iface_args *)arg; i = (ipfw_iface_info *)ipfw_get_sopt_space(da->sd, sizeof(*i)); KASSERT(i != NULL, ("previously checked buffer is not enough")); iif = (struct ipfw_iface *)no; strlcpy(i->ifname, iif->ifname, sizeof(i->ifname)); if (iif->resolved) i->flags |= IPFW_IFFLAG_RESOLVED; i->ifindex = iif->ifindex; i->refcnt = iif->no.refcnt; i->gencnt = iif->gencnt; return (0); } /* * Lists all interface currently tracked by ipfw. 
- * Data layout (v0)(current): + * Data layout (v1)(current): * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size * Reply: [ ipfw_obj_lheader ipfw_iface_info x N ] * * Returns 0 on success */ static int list_ifaces(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { struct namedobj_instance *ii; struct _ipfw_obj_lheader *olh; struct dump_iface_args da; uint32_t count, size; olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh)); if (olh == NULL) return (EINVAL); if (sd->valsize < olh->size) return (EINVAL); IPFW_UH_RLOCK(ch); ii = CHAIN_TO_II(ch); if (ii != NULL) count = ipfw_objhash_count(ii); else count = 0; size = count * sizeof(ipfw_iface_info) + sizeof(ipfw_obj_lheader); /* Fill in header regardless of buffer size */ olh->count = count; olh->objsize = sizeof(ipfw_iface_info); if (size > olh->size) { olh->size = size; IPFW_UH_RUNLOCK(ch); return (ENOMEM); } olh->size = size; da.ch = ch; da.sd = sd; if (ii != NULL) ipfw_objhash_foreach(ii, export_iface_internal, &da); IPFW_UH_RUNLOCK(ch); return (0); } diff --git a/sys/netpfil/ipfw/ip_fw_log.c b/sys/netpfil/ipfw/ip_fw_log.c index 1cc39d02ea79..389b04ccbace 100644 --- a/sys/netpfil/ipfw/ip_fw_log.c +++ b/sys/netpfil/ipfw/ip_fw_log.c @@ -1,441 +1,688 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution.
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include /* * Logging support for ipfw */ #include "opt_ipfw.h" #include "opt_inet.h" #ifndef INET #error IPFIREWALL requires INET. #endif /* INET */ #include "opt_inet6.h" #include #include #include #include #include #include #include #include /* for ETHERTYPE_IP */ #include +#include #include #include #include +#include +#include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include /* ip6_sprintf() */ #endif #include #ifdef MAC #include #endif /* * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T * Other macros just cast void * into the appropriate type */ #define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl)) #define TCP(p) ((struct tcphdr *)(p)) #define SCTP(p) ((struct sctphdr *)(p)) #define UDP(p) ((struct udphdr *)(p)) #define ICMP(p) ((struct icmphdr *)(p)) #define ICMP6(p) ((struct icmp6_hdr *)(p)) #ifdef __APPLE__ #undef snprintf #define snprintf sprintf #define SNPARGS(buf, len) buf + len #define SNP(buf) buf #else /* !__APPLE__ */ #define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? 
sizeof(buf) - len : 0 #define SNP(buf) buf, sizeof(buf) #endif /* !__APPLE__ */ #define TARG(k, f) IP_FW_ARG_TABLEARG(chain, k, f) + +static void +ipfw_log_ipfw0(struct ip_fw_args *args, struct ip *ip) +{ + if (args->flags & IPFW_ARGS_LENMASK) + ipfw_bpf_tap(args->mem, IPFW_ARGS_LENGTH(args->flags)); + else if (args->flags & IPFW_ARGS_ETHER) + /* layer2, use orig hdr */ + ipfw_bpf_mtap(args->m); + else { + /* Add fake header. Later we will store + * more info in the header. + */ + if (ip->ip_v == 4) + ipfw_bpf_mtap2("DDDDDDSSSSSS\x08\x00", + ETHER_HDR_LEN, args->m); + else if (ip->ip_v == 6) + ipfw_bpf_mtap2("DDDDDDSSSSSS\x86\xdd", + ETHER_HDR_LEN, args->m); + else + /* Obviously bogus EtherType. */ + ipfw_bpf_mtap2("DDDDDDSSSSSS\xff\xff", + ETHER_HDR_LEN, args->m); + } +} + /* - * We enter here when we have a rule with O_LOG. * XXX this function alone takes about 2Kbytes of code! */ -void -ipfw_log(struct ip_fw_chain *chain, struct ip_fw *f, u_int hlen, +static void +ipfw_log_syslog(struct ip_fw_chain *chain, struct ip_fw *f, u_int hlen, struct ip_fw_args *args, u_short offset, uint32_t tablearg, struct ip *ip) { char *action; int limit_reached = 0; char action2[92], proto[128], fragment[32], mark_str[24]; - if (V_fw_verbose == 0) { - if (args->flags & IPFW_ARGS_LENMASK) - ipfw_bpf_tap(args->mem, IPFW_ARGS_LENGTH(args->flags)); - else if (args->flags & IPFW_ARGS_ETHER) - /* layer2, use orig hdr */ - ipfw_bpf_mtap(args->m); - else { - /* Add fake header. Later we will store - * more info in the header. - */ - if (ip->ip_v == 4) - ipfw_bpf_mtap2("DDDDDDSSSSSS\x08\x00", - ETHER_HDR_LEN, args->m); - else if (ip->ip_v == 6) - ipfw_bpf_mtap2("DDDDDDSSSSSS\x86\xdd", - ETHER_HDR_LEN, args->m); - else - /* Obviously bogus EtherType. 
*/ - ipfw_bpf_mtap2("DDDDDDSSSSSS\xff\xff", - ETHER_HDR_LEN, args->m); - } - return; - } - /* the old 'log' function */ fragment[0] = '\0'; proto[0] = '\0'; if (f == NULL) { /* bogus pkt */ if (V_verbose_limit != 0 && V_norule_counter >= V_verbose_limit) return; V_norule_counter++; if (V_norule_counter == V_verbose_limit) limit_reached = V_verbose_limit; action = "Refuse"; } else { /* O_LOG is the first action, find the real one */ ipfw_insn *cmd = ACTION_PTR(f); ipfw_insn_log *l = (ipfw_insn_log *)cmd; if (l->max_log != 0 && l->log_left == 0) return; l->log_left--; if (l->log_left == 0) limit_reached = l->max_log; cmd += F_LEN(cmd); /* point to first action */ if (cmd->opcode == O_ALTQ) { ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; snprintf(SNPARGS(action2, 0), "Altq %d", altq->qid); cmd += F_LEN(cmd); } if (cmd->opcode == O_PROB || cmd->opcode == O_TAG) cmd += F_LEN(cmd); action = action2; switch (cmd->opcode) { case O_DENY: action = "Deny"; break; case O_REJECT: if (cmd->arg1==ICMP_REJECT_RST) action = "Reset"; else if (cmd->arg1==ICMP_REJECT_ABORT) action = "Abort"; else if (cmd->arg1==ICMP_UNREACH_HOST) action = "Reject"; else snprintf(SNPARGS(action2, 0), "Unreach %d", cmd->arg1); break; case O_UNREACH6: if (cmd->arg1==ICMP6_UNREACH_RST) action = "Reset"; else if (cmd->arg1==ICMP6_UNREACH_ABORT) action = "Abort"; else snprintf(SNPARGS(action2, 0), "Unreach %d", cmd->arg1); break; case O_ACCEPT: action = "Accept"; break; case O_COUNT: action = "Count"; break; case O_DIVERT: snprintf(SNPARGS(action2, 0), "Divert %d", TARG(cmd->arg1, divert)); break; case O_TEE: snprintf(SNPARGS(action2, 0), "Tee %d", TARG(cmd->arg1, divert)); break; case O_SETDSCP: snprintf(SNPARGS(action2, 0), "SetDscp %d", TARG(cmd->arg1, dscp) & 0x3F); break; case O_SETFIB: snprintf(SNPARGS(action2, 0), "SetFib %d", TARG(cmd->arg1, fib) & 0x7FFF); break; case O_SKIPTO: snprintf(SNPARGS(action2, 0), "SkipTo %d", - TARG(cmd->arg1, skipto)); + TARG(insntod(cmd, u32)->d[0], skipto)); break; 
case O_PIPE: snprintf(SNPARGS(action2, 0), "Pipe %d", TARG(cmd->arg1, pipe)); break; case O_QUEUE: snprintf(SNPARGS(action2, 0), "Queue %d", TARG(cmd->arg1, pipe)); break; case O_FORWARD_IP: { char buf[INET_ADDRSTRLEN]; ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd; int len; struct in_addr dummyaddr; if (sa->sa.sin_addr.s_addr == INADDR_ANY) dummyaddr.s_addr = htonl(tablearg); else dummyaddr.s_addr = sa->sa.sin_addr.s_addr; len = snprintf(SNPARGS(action2, 0), "Forward to %s", inet_ntoa_r(dummyaddr, buf)); if (sa->sa.sin_port) snprintf(SNPARGS(action2, len), ":%d", sa->sa.sin_port); } break; #ifdef INET6 case O_FORWARD_IP6: { char buf[INET6_ADDRSTRLEN]; ipfw_insn_sa6 *sa = (ipfw_insn_sa6 *)cmd; int len; len = snprintf(SNPARGS(action2, 0), "Forward to [%s]", ip6_sprintf(buf, &sa->sa.sin6_addr)); if (sa->sa.sin6_port) snprintf(SNPARGS(action2, len), ":%u", sa->sa.sin6_port); } break; #endif case O_NETGRAPH: snprintf(SNPARGS(action2, 0), "Netgraph %d", cmd->arg1); break; case O_NGTEE: snprintf(SNPARGS(action2, 0), "Ngtee %d", cmd->arg1); break; case O_NAT: action = "Nat"; break; case O_REASS: action = "Reass"; break; case O_CALLRETURN: if (cmd->len & F_NOT) - action = "Return"; + snprintf(SNPARGS(action2, 0), "Return %s", + cmd->arg1 == RETURN_NEXT_RULENUM ? 
+ "next-rulenum": "next-rule"); else snprintf(SNPARGS(action2, 0), "Call %d", - cmd->arg1); + TARG(insntod(cmd, u32)->d[0], skipto)); break; case O_SETMARK: if (cmd->arg1 == IP_FW_TARG) - snprintf(SNPARGS(action2, 0), "SetMark %#x", + snprintf(SNPARGS(action2, 0), "SetMark %#010x", TARG(cmd->arg1, mark)); else - snprintf(SNPARGS(action2, 0), "SetMark %#x", - ((ipfw_insn_u32 *)cmd)->d[0]); + snprintf(SNPARGS(action2, 0), "SetMark %#010x", + insntoc(cmd, u32)->d[0]); break; case O_EXTERNAL_ACTION: snprintf(SNPARGS(action2, 0), "Eaction %s", ((struct named_object *)SRV_OBJECT(chain, - cmd->arg1))->name); + insntod(cmd, kidx)->kidx))->name); break; default: action = "UNKNOWN"; break; } } if (hlen == 0) { /* non-ip */ snprintf(SNPARGS(proto, 0), "MAC"); } else { int len; #ifdef INET6 char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2]; #else char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN]; #endif struct icmphdr *icmp; struct tcphdr *tcp; struct udphdr *udp; #ifdef INET6 struct ip6_hdr *ip6 = NULL; struct icmp6_hdr *icmp6; u_short ip6f_mf; #endif src[0] = '\0'; dst[0] = '\0'; #ifdef INET6 ip6f_mf = offset & IP6F_MORE_FRAG; offset &= IP6F_OFF_MASK; if (IS_IP6_FLOW_ID(&(args->f_id))) { char ip6buf[INET6_ADDRSTRLEN]; snprintf(src, sizeof(src), "[%s]", ip6_sprintf(ip6buf, &args->f_id.src_ip6)); snprintf(dst, sizeof(dst), "[%s]", ip6_sprintf(ip6buf, &args->f_id.dst_ip6)); ip6 = (struct ip6_hdr *)ip; tcp = (struct tcphdr *)(((char *)ip) + hlen); udp = (struct udphdr *)(((char *)ip) + hlen); } else #endif { tcp = L3HDR(struct tcphdr, ip); udp = L3HDR(struct udphdr, ip); inet_ntop(AF_INET, &ip->ip_src, src, sizeof(src)); inet_ntop(AF_INET, &ip->ip_dst, dst, sizeof(dst)); } switch (args->f_id.proto) { case IPPROTO_TCP: len = snprintf(SNPARGS(proto, 0), "TCP %s", src); if (offset == 0) snprintf(SNPARGS(proto, len), ":%d %s:%d", ntohs(tcp->th_sport), dst, ntohs(tcp->th_dport)); else snprintf(SNPARGS(proto, len), " %s", dst); break; case IPPROTO_UDP: case IPPROTO_UDPLITE: len 
= snprintf(SNPARGS(proto, 0), "UDP%s%s", args->f_id.proto == IPPROTO_UDP ? " ": "Lite ", src); if (offset == 0) snprintf(SNPARGS(proto, len), ":%d %s:%d", ntohs(udp->uh_sport), dst, ntohs(udp->uh_dport)); else snprintf(SNPARGS(proto, len), " %s", dst); break; case IPPROTO_ICMP: icmp = L3HDR(struct icmphdr, ip); if (offset == 0) len = snprintf(SNPARGS(proto, 0), "ICMP:%u.%u ", icmp->icmp_type, icmp->icmp_code); else len = snprintf(SNPARGS(proto, 0), "ICMP "); len += snprintf(SNPARGS(proto, len), "%s", src); snprintf(SNPARGS(proto, len), " %s", dst); break; #ifdef INET6 case IPPROTO_ICMPV6: icmp6 = (struct icmp6_hdr *)(((char *)ip) + hlen); if (offset == 0) len = snprintf(SNPARGS(proto, 0), "ICMPv6:%u.%u ", icmp6->icmp6_type, icmp6->icmp6_code); else len = snprintf(SNPARGS(proto, 0), "ICMPv6 "); len += snprintf(SNPARGS(proto, len), "%s", src); snprintf(SNPARGS(proto, len), " %s", dst); break; #endif default: len = snprintf(SNPARGS(proto, 0), "P:%d %s", args->f_id.proto, src); snprintf(SNPARGS(proto, len), " %s", dst); break; } #ifdef INET6 if (IS_IP6_FLOW_ID(&(args->f_id))) { if (offset || ip6f_mf) snprintf(SNPARGS(fragment, 0), " (frag %08x:%d@%d%s)", args->f_id.extra, ntohs(ip6->ip6_plen) - hlen, ntohs(offset) << 3, ip6f_mf ? "+" : ""); } else #endif { int ipoff, iplen; ipoff = ntohs(ip->ip_off); iplen = ntohs(ip->ip_len); if (ipoff & (IP_MF | IP_OFFMASK)) snprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)", ntohs(ip->ip_id), iplen - (ip->ip_hl << 2), offset << 3, (ipoff & IP_MF) ? "+" : ""); } } /* [fw]mark */ if (args->rule.pkt_mark) snprintf(SNPARGS(mark_str, 0), " mark:%#x", args->rule.pkt_mark); else mark_str[0] = '\0'; #ifdef __FreeBSD__ log(LOG_SECURITY | LOG_INFO, "ipfw: %d %s %s%s %s via %s%s\n", f ? f->rulenum : -1, action, proto, mark_str, args->flags & IPFW_ARGS_OUT ? "out" : "in", args->ifp->if_xname, fragment); #else log(LOG_SECURITY | LOG_INFO, "ipfw: %d %s %s%s [no if info]%s\n", f ? 
f->rulenum : -1, action, proto, mark_str, fragment); #endif if (limit_reached) log(LOG_SECURITY | LOG_NOTICE, "ipfw: limit %d reached on entry %d\n", limit_reached, f ? f->rulenum : -1); } + +static void +ipfw_rtsocklog_fill_l3(struct ip_fw_args *args, + char **buf, struct sockaddr **src, struct sockaddr **dst) +{ + struct sockaddr_in *v4src, *v4dst; +#ifdef INET6 + struct sockaddr_in6 *v6src, *v6dst; + + if (IS_IP6_FLOW_ID(&(args->f_id))) { + v6src = (struct sockaddr_in6 *)*buf; + *buf += sizeof(*v6src); + v6dst = (struct sockaddr_in6 *)*buf; + *buf += sizeof(*v6dst); + v6src->sin6_len = v6dst->sin6_len = sizeof(*v6src); + v6src->sin6_family = v6dst->sin6_family = AF_INET6; + v6src->sin6_addr = args->f_id.src_ip6; + v6dst->sin6_addr = args->f_id.dst_ip6; + + *src = (struct sockaddr *)v6src; + *dst = (struct sockaddr *)v6dst; + } else +#endif + { + v4src = (struct sockaddr_in *)*buf; + *buf += sizeof(*v4src); + v4dst = (struct sockaddr_in *)*buf; + *buf += sizeof(*v4dst); + v4src->sin_len = v4dst->sin_len = sizeof(*v4src); + v4src->sin_family = v4dst->sin_family = AF_INET; + v4src->sin_addr.s_addr = htonl(args->f_id.src_ip); + v4dst->sin_addr.s_addr = htonl(args->f_id.dst_ip); + + *src = (struct sockaddr *)v4src; + *dst = (struct sockaddr *)v4dst; + } +} + +static struct sockaddr * +ipfw_rtsocklog_handle_tablearg(struct ip_fw_chain *chain, ipfw_insn *cmd, + uint32_t tablearg, uint32_t *targ_value, char **buf) +{ + struct sockaddr_in *v4nh = NULL; + + /* handle tablearg now */ + switch (cmd->opcode) { + case O_DIVERT: + case O_TEE: + *targ_value = TARG(cmd->arg1, divert); + break; + case O_NETGRAPH: + case O_NGTEE: + *targ_value = TARG(cmd->arg1, netgraph); + break; + case O_SETDSCP: + *targ_value = (TARG(cmd->arg1, dscp) & 0x3F); + break; + case O_SETFIB: + *targ_value = (TARG(cmd->arg1, fib) & 0x7FFF); + break; + case O_SKIPTO: + case O_CALLRETURN: + if (cmd->opcode == O_CALLRETURN && (cmd->len & F_NOT)) + break; + *targ_value = (TARG(insntod(cmd, u32)->d[0], 
skipto)); + break; + case O_PIPE: + case O_QUEUE: + *targ_value = TARG(cmd->arg1, pipe); + break; + case O_MARK: + *targ_value = TARG(cmd->arg1, mark); + break; + case O_FORWARD_IP: + v4nh = (struct sockaddr_in *)buf; + buf += sizeof(*v4nh); + *v4nh = ((ipfw_insn_sa *)cmd)->sa; + if (v4nh->sin_addr.s_addr == INADDR_ANY) + v4nh->sin_addr.s_addr = htonl(tablearg); + + return (struct sockaddr *)v4nh; +#ifdef INET6 + case O_FORWARD_IP6: + return (struct sockaddr *)&(((ipfw_insn_sa6 *)cmd)->sa); +#endif + default: + break; + } + + return (NULL); +} + +#define MAX_COMMENT_LEN 80 + +static size_t +ipfw_copy_rule_comment(struct ip_fw *f, char *dst) +{ + ipfw_insn *cmd; + size_t rcomment_len = 0; + int l, cmdlen; + + for (l = f->cmd_len, cmd = f->cmd; l > 0; l -= cmdlen, cmd += cmdlen) { + cmdlen = F_LEN(cmd); + if (cmd->opcode != O_NOP) { + continue; + } else if (cmd->len == 1) { + return (0); + } + break; + } + if (l <= 0) { + return (0); + } + rcomment_len = strnlen((char *)(cmd + 1), MAX_COMMENT_LEN - 1) + 1; + strlcpy(dst, (char *)(cmd + 1), rcomment_len); + return (rcomment_len); +} + +static void +ipfw_log_rtsock(struct ip_fw_chain *chain, struct ip_fw *f, u_int hlen, + struct ip_fw_args *args, u_short offset, uint32_t tablearg, + void *_eh) +{ + struct sockaddr_dl *sdl_ipfwcmd; + struct ether_header *eh = _eh; + struct rt_addrinfo *info; + uint32_t *targ_value; + ipfwlog_rtsock_hdr_v2 *hdr; + ipfw_insn *cmd; + ipfw_insn_log *l; + char *buf, *orig_buf; + /* at least 4 x sizeof(struct sockaddr_dl) + rule comment (80) */ + size_t buflen = 512; + + /* Should we log? O_LOG is the first one */ + cmd = ACTION_PTR(f); + l = (ipfw_insn_log *)cmd; + + if (l->max_log != 0 && l->log_left == 0) + return; + + l->log_left--; + if (V_fw_verbose != 0 && l->log_left == 0) { + log(LOG_SECURITY | LOG_NOTICE, + "ipfw: limit %d reached on entry %d\n", + l->max_log, f ? 
f->rulenum : -1); + } + + buf = orig_buf = malloc(buflen, M_TEMP, M_NOWAIT | M_ZERO); + if (buf == NULL) + return; + + info = (struct rt_addrinfo *)buf; + buf += sizeof (*info); + + cmd = ipfw_get_action(f); + sdl_ipfwcmd = (struct sockaddr_dl *)buf; + sdl_ipfwcmd->sdl_family = AF_IPFWLOG; + sdl_ipfwcmd->sdl_index = f->set; + sdl_ipfwcmd->sdl_type = 2; /* version */ + sdl_ipfwcmd->sdl_alen = sizeof(*hdr); + hdr = (ipfwlog_rtsock_hdr_v2 *)(sdl_ipfwcmd->sdl_data); + /* fill rule comment in if any */ + sdl_ipfwcmd->sdl_nlen = ipfw_copy_rule_comment(f, hdr->comment); + targ_value = &hdr->tablearg; + hdr->rulenum = f->rulenum; + hdr->mark = args->rule.pkt_mark; + hdr->cmd = *cmd; + + sdl_ipfwcmd->sdl_len = sizeof(*sdl_ipfwcmd); + if (sizeof(*hdr) + sdl_ipfwcmd->sdl_nlen > sizeof(sdl_ipfwcmd->sdl_data)) { + sdl_ipfwcmd->sdl_len += sizeof(*hdr) + sdl_ipfwcmd->sdl_nlen - + sizeof(sdl_ipfwcmd->sdl_data); + } + buf += sdl_ipfwcmd->sdl_len; + + /* fill L2 in if present */ + if (args->flags & IPFW_ARGS_ETHER && eh != NULL) { + sdl_ipfwcmd->sdl_slen = sizeof(eh->ether_shost); + memcpy(hdr->ether_shost, eh->ether_shost, + sdl_ipfwcmd->sdl_slen); + memcpy(hdr->ether_dhost, eh->ether_dhost, + sdl_ipfwcmd->sdl_slen); + } + + info->rti_info[RTAX_DST] = (struct sockaddr *)sdl_ipfwcmd; + + /* Warn if we're about to stop sending messages */ + if (l->max_log != 0 && l->log_left < (l->max_log >> 1)) { + info->rti_flags |= RTF_PROTO1; + } + + /* handle tablearg */ + info->rti_info[RTAX_GENMASK] = ipfw_rtsocklog_handle_tablearg( + chain, cmd, tablearg, targ_value, &buf); + + /* L3 */ + ipfw_rtsocklog_fill_l3(args, &buf, + &info->rti_info[RTAX_GATEWAY], + &info->rti_info[RTAX_NETMASK]); + + info->rti_ifp = args->ifp; + rtsock_routemsg_info(RTM_IPFWLOG, info, RT_ALL_FIBS); + + free(orig_buf, M_TEMP); +} + +/* + * We enter here when we have a rule with O_LOG. 
+ */ +void +ipfw_log(struct ip_fw_chain *chain, struct ip_fw *f, u_int hlen, + struct ip_fw_args *args, u_short offset, uint32_t tablearg, + struct ip *ip, void *eh) +{ + ipfw_insn *cmd; + + if (f == NULL || hlen == 0) + return; + + /* O_LOG is the first action */ + cmd = ACTION_PTR(f); + + if (cmd->arg1 == IPFW_LOG_DEFAULT) { + if (V_fw_verbose == 0) { + ipfw_log_ipfw0(args, ip); + return; + } + ipfw_log_syslog(chain, f, hlen, args, offset, tablearg, ip); + return; + } + + if (cmd->arg1 & IPFW_LOG_SYSLOG) + ipfw_log_syslog(chain, f, hlen, args, offset, tablearg, ip); + + if (cmd->arg1 & IPFW_LOG_RTSOCK) + ipfw_log_rtsock(chain, f, hlen, args, offset, tablearg, eh); + + if (cmd->arg1 & IPFW_LOG_IPFW0) + ipfw_log_ipfw0(args, ip); +} /* end of file */ diff --git a/sys/netpfil/ipfw/ip_fw_nat.c b/sys/netpfil/ipfw/ip_fw_nat.c index 0f595279fec9..1e2ff1bca290 100644 --- a/sys/netpfil/ipfw/ip_fw_nat.c +++ b/sys/netpfil/ipfw/ip_fw_nat.c @@ -1,1246 +1,1246 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2008 Paolo Pisati * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* XXX for in_cksum */ struct cfg_spool { LIST_ENTRY(cfg_spool) _next; /* chain of spool instances */ struct in_addr addr; uint16_t port; }; /* Nat redirect configuration. */ struct cfg_redir { LIST_ENTRY(cfg_redir) _next; /* chain of redir instances */ uint16_t mode; /* type of redirect mode */ uint16_t proto; /* protocol: tcp/udp */ struct in_addr laddr; /* local ip address */ struct in_addr paddr; /* public ip address */ struct in_addr raddr; /* remote ip address */ uint16_t lport; /* local port */ uint16_t pport; /* public port */ uint16_t rport; /* remote port */ uint16_t pport_cnt; /* number of public ports */ uint16_t rport_cnt; /* number of remote ports */ struct alias_link **alink; u_int16_t spool_cnt; /* num of entry in spool chain */ /* chain of spool instances */ LIST_HEAD(spool_chain, cfg_spool) spool_chain; }; /* Nat configuration data struct. 
*/ struct cfg_nat { /* chain of nat instances */ LIST_ENTRY(cfg_nat) _next; int id; /* nat id */ struct in_addr ip; /* nat ip address */ struct libalias *lib; /* libalias instance */ int mode; /* aliasing mode */ int redir_cnt; /* number of entry in spool chain */ /* chain of redir instances */ LIST_HEAD(redir_chain, cfg_redir) redir_chain; char if_name[IF_NAMESIZE]; /* interface name */ u_short alias_port_lo; /* low range for port aliasing */ u_short alias_port_hi; /* high range for port aliasing */ }; static eventhandler_tag ifaddr_event_tag; static void ifaddr_change(void *arg __unused, struct ifnet *ifp) { struct cfg_nat *ptr; struct ifaddr *ifa; struct ip_fw_chain *chain; KASSERT(curvnet == ifp->if_vnet, ("curvnet(%p) differs from iface vnet(%p)", curvnet, ifp->if_vnet)); if (V_ipfw_vnet_ready == 0 || V_ipfw_nat_ready == 0) return; chain = &V_layer3_chain; IPFW_UH_WLOCK(chain); /* Check every nat entry... */ LIST_FOREACH(ptr, &chain->nat, _next) { struct epoch_tracker et; /* ...using nic 'ifp->if_xname' as dynamic alias address. 
*/ if (strncmp(ptr->if_name, ifp->if_xname, IF_NAMESIZE) != 0) continue; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr == NULL) continue; if (ifa->ifa_addr->sa_family != AF_INET) continue; IPFW_WLOCK(chain); ptr->ip = ((struct sockaddr_in *) (ifa->ifa_addr))->sin_addr; LibAliasSetAddress(ptr->lib, ptr->ip); IPFW_WUNLOCK(chain); } NET_EPOCH_EXIT(et); } IPFW_UH_WUNLOCK(chain); } /* * delete the pointers for nat entry ix, or all of them if ix < 0 */ static void flush_nat_ptrs(struct ip_fw_chain *chain, const int ix) { ipfw_insn_nat *cmd; int i; IPFW_WLOCK_ASSERT(chain); for (i = 0; i < chain->n_rules; i++) { cmd = (ipfw_insn_nat *)ipfw_get_action(chain->map[i]); if (cmd->o.opcode == O_NAT && cmd->nat != NULL && (ix < 0 || cmd->nat->id == ix)) cmd->nat = NULL; } } static void del_redir_spool_cfg(struct cfg_nat *n, struct redir_chain *head) { struct cfg_redir *r, *tmp_r; struct cfg_spool *s, *tmp_s; int i, num; LIST_FOREACH_SAFE(r, head, _next, tmp_r) { num = 1; /* Number of alias_link to delete. */ switch (r->mode) { case NAT44_REDIR_PORT: num = r->pport_cnt; /* FALLTHROUGH */ case NAT44_REDIR_ADDR: case NAT44_REDIR_PROTO: /* Delete all libalias redirect entry. */ for (i = 0; i < num; i++) LibAliasRedirectDelete(n->lib, r->alink[i]); /* Del spool cfg if any. */ LIST_FOREACH_SAFE(s, &r->spool_chain, _next, tmp_s) { LIST_REMOVE(s, _next); free(s, M_IPFW); } free(r->alink, M_IPFW); LIST_REMOVE(r, _next); free(r, M_IPFW); break; default: printf("unknown redirect mode: %u\n", r->mode); /* XXX - panic?!?!? 
*/ break; } } } static int add_redir_spool_cfg(char *buf, struct cfg_nat *ptr) { struct cfg_redir *r; struct cfg_spool *s; struct nat44_cfg_redir *ser_r; struct nat44_cfg_spool *ser_s; int cnt, off, i; for (cnt = 0, off = 0; cnt < ptr->redir_cnt; cnt++) { ser_r = (struct nat44_cfg_redir *)&buf[off]; r = malloc(sizeof(*r), M_IPFW, M_WAITOK | M_ZERO); r->mode = ser_r->mode; r->laddr = ser_r->laddr; r->paddr = ser_r->paddr; r->raddr = ser_r->raddr; r->lport = ser_r->lport; r->pport = ser_r->pport; r->rport = ser_r->rport; r->pport_cnt = ser_r->pport_cnt; r->rport_cnt = ser_r->rport_cnt; r->proto = ser_r->proto; r->spool_cnt = ser_r->spool_cnt; //memcpy(r, ser_r, SOF_REDIR); LIST_INIT(&r->spool_chain); off += sizeof(struct nat44_cfg_redir); r->alink = malloc(sizeof(struct alias_link *) * r->pport_cnt, M_IPFW, M_WAITOK | M_ZERO); switch (r->mode) { case NAT44_REDIR_ADDR: r->alink[0] = LibAliasRedirectAddr(ptr->lib, r->laddr, r->paddr); break; case NAT44_REDIR_PORT: for (i = 0 ; i < r->pport_cnt; i++) { /* If remotePort is all ports, set it to 0. */ u_short remotePortCopy = r->rport + i; if (r->rport_cnt == 1 && r->rport == 0) remotePortCopy = 0; r->alink[i] = LibAliasRedirectPort(ptr->lib, r->laddr, htons(r->lport + i), r->raddr, htons(remotePortCopy), r->paddr, htons(r->pport + i), r->proto); if (r->alink[i] == NULL) { r->alink[0] = NULL; break; } } break; case NAT44_REDIR_PROTO: r->alink[0] = LibAliasRedirectProto(ptr->lib ,r->laddr, r->raddr, r->paddr, r->proto); break; default: printf("unknown redirect mode: %u\n", r->mode); break; } if (r->alink[0] == NULL) { printf("LibAliasRedirect* returned NULL\n"); free(r->alink, M_IPFW); free(r, M_IPFW); return (EINVAL); } /* LSNAT handling. 
*/ for (i = 0; i < r->spool_cnt; i++) { ser_s = (struct nat44_cfg_spool *)&buf[off]; s = malloc(sizeof(*s), M_IPFW, M_WAITOK | M_ZERO); s->addr = ser_s->addr; s->port = ser_s->port; LibAliasAddServer(ptr->lib, r->alink[0], s->addr, htons(s->port)); off += sizeof(struct nat44_cfg_spool); /* Hook spool entry. */ LIST_INSERT_HEAD(&r->spool_chain, s, _next); } /* And finally hook this redir entry. */ LIST_INSERT_HEAD(&ptr->redir_chain, r, _next); } return (0); } static void free_nat_instance(struct cfg_nat *ptr) { del_redir_spool_cfg(ptr, &ptr->redir_chain); LibAliasUninit(ptr->lib); free(ptr, M_IPFW); } /* * ipfw_nat - perform mbuf header translation. * * Note V_layer3_chain has to be locked while calling ipfw_nat() in * 'global' operation mode (t == NULL). * */ static int ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m) { struct mbuf *mcl; struct ip *ip; /* XXX - libalias duct tape */ int ldt, retval, found; struct ip_fw_chain *chain; char *c; ldt = 0; retval = 0; mcl = m_megapullup(m, m->m_pkthdr.len); if (mcl == NULL) { args->m = NULL; return (IP_FW_DENY); } M_ASSERTMAPPED(mcl); ip = mtod(mcl, struct ip *); /* * XXX - Libalias checksum offload 'duct tape': * * locally generated packets have only pseudo-header checksum * calculated and libalias will break it[1], so mark them for * later fix. Moreover there are cases when libalias modifies * tcp packet data[2], mark them for later fix too. * * [1] libalias was never meant to run in kernel, so it does * not have any knowledge about checksum offloading, and * expects a packet with a full internet checksum. * Unfortunately, packets generated locally will have just the * pseudo header calculated, and when libalias tries to adjust * the checksum it will actually compute a wrong value. * * [2] when libalias modifies tcp's data content, full TCP * checksum has to be recomputed: the problem is that * libalias does not have any idea about checksum offloading. 
* To work around this, we do not do checksumming in LibAlias, * but only mark the packets in th_x2 field. If we receive a * marked packet, we calculate correct checksum for it * aware of offloading. Why such a terrible hack instead of * recalculating checksum for each packet? * Because the previous checksum was not checked! * Recalculating checksums for EVERY packet will hide ALL * transmission errors. Yes, marked packets still suffer from * this problem. But, sigh, natd(8) has this problem, too. * * TODO: -make libalias mbuf aware (so * it can handle delayed checksum and tso) */ if (mcl->m_pkthdr.rcvif == NULL && mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA) ldt = 1; c = mtod(mcl, char *); /* Check if this is 'global' instance */ if (t == NULL) { if (args->flags & IPFW_ARGS_IN) { /* Wrong direction, skip processing */ args->m = mcl; return (IP_FW_NAT); } found = 0; chain = &V_layer3_chain; IPFW_RLOCK_ASSERT(chain); /* Check every nat entry... */ LIST_FOREACH(t, &chain->nat, _next) { if ((t->mode & PKT_ALIAS_SKIP_GLOBAL) != 0) continue; retval = LibAliasOutTry(t->lib, c, mcl->m_len + M_TRAILINGSPACE(mcl), 0); if (retval == PKT_ALIAS_OK) { /* Nat instance recognises state */ found = 1; break; } } if (found != 1) { /* No instance found, return ignore */ args->m = mcl; return (IP_FW_NAT); } } else { if (args->flags & IPFW_ARGS_IN) retval = LibAliasIn(t->lib, c, mcl->m_len + M_TRAILINGSPACE(mcl)); else retval = LibAliasOut(t->lib, c, mcl->m_len + M_TRAILINGSPACE(mcl)); } /* * We drop packet when: * 1. libalias returns PKT_ALIAS_ERROR; * 2. For incoming packets: * a) for unresolved fragments; * b) libalias returns PKT_ALIAS_IGNORED and * PKT_ALIAS_DENY_INCOMING flag is set. */ if (retval == PKT_ALIAS_ERROR || ((args->flags & IPFW_ARGS_IN) && (retval == PKT_ALIAS_UNRESOLVED_FRAGMENT || (retval == PKT_ALIAS_IGNORED && (t->mode & PKT_ALIAS_DENY_INCOMING) != 0)))) { /* XXX - should i add some logging? 
*/ m_free(mcl); args->m = NULL; return (IP_FW_DENY); } if (retval == PKT_ALIAS_RESPOND) mcl->m_flags |= M_SKIP_FIREWALL; mcl->m_pkthdr.len = mcl->m_len = ntohs(ip->ip_len); /* * XXX - libalias checksum offload * 'duct tape' (see above) */ if ((ip->ip_off & htons(IP_OFFMASK)) == 0 && ip->ip_p == IPPROTO_TCP) { struct tcphdr *th; th = (struct tcphdr *)(ip + 1); if (tcp_get_flags(th) & TH_RES1) ldt = 1; } if (ldt) { struct tcphdr *th; struct udphdr *uh; uint16_t ip_len, cksum; ip_len = ntohs(ip->ip_len); cksum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(ip->ip_p + ip_len - (ip->ip_hl << 2))); switch (ip->ip_p) { case IPPROTO_TCP: th = (struct tcphdr *)(ip + 1); /* * Maybe it was set in * libalias... */ tcp_set_flags(th, tcp_get_flags(th) & ~TH_RES1); th->th_sum = cksum; mcl->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); break; case IPPROTO_UDP: uh = (struct udphdr *)(ip + 1); uh->uh_sum = cksum; mcl->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); break; } /* No hw checksum offloading: do it ourselves */ if ((mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA) == 0) { in_delayed_cksum(mcl); mcl->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } } args->m = mcl; return (IP_FW_NAT); } static struct cfg_nat * lookup_nat(struct nat_list *l, int nat_id) { struct cfg_nat *res; LIST_FOREACH(res, l, _next) { if (res->id == nat_id) break; } return res; } static struct cfg_nat * lookup_nat_name(struct nat_list *l, char *name) { struct cfg_nat *res; int id; char *errptr; id = strtol(name, &errptr, 10); if (id == 0 || *errptr != '\0') return (NULL); LIST_FOREACH(res, l, _next) { if (res->id == id) break; } return (res); } /* IP_FW3 configuration routines */ static void nat44_config(struct ip_fw_chain *chain, struct nat44_cfg_nat *ucfg) { struct cfg_nat *ptr, *tcfg; int gencnt; /* * Find/create nat rule. 
*/ IPFW_UH_WLOCK(chain); gencnt = chain->gencnt; ptr = lookup_nat_name(&chain->nat, ucfg->name); if (ptr == NULL) { IPFW_UH_WUNLOCK(chain); /* New rule: allocate and init new instance. */ ptr = malloc(sizeof(struct cfg_nat), M_IPFW, M_WAITOK | M_ZERO); ptr->lib = LibAliasInit(NULL); LIST_INIT(&ptr->redir_chain); } else { /* Entry already present: temporarily unhook it. */ IPFW_WLOCK(chain); LIST_REMOVE(ptr, _next); flush_nat_ptrs(chain, ptr->id); IPFW_WUNLOCK(chain); IPFW_UH_WUNLOCK(chain); } /* * Basic nat (re)configuration. */ ptr->id = strtol(ucfg->name, NULL, 10); /* * XXX - what if this rule doesn't nat any ip and just * redirect? * do we set aliasaddress to 0.0.0.0? */ ptr->ip = ucfg->ip; ptr->redir_cnt = ucfg->redir_cnt; ptr->mode = ucfg->mode; ptr->alias_port_lo = ucfg->alias_port_lo; ptr->alias_port_hi = ucfg->alias_port_hi; strlcpy(ptr->if_name, ucfg->if_name, sizeof(ptr->if_name)); LibAliasSetMode(ptr->lib, ptr->mode, ~0); LibAliasSetAddress(ptr->lib, ptr->ip); LibAliasSetAliasPortRange(ptr->lib, ptr->alias_port_lo, ptr->alias_port_hi); /* * Redir and LSNAT configuration. */ /* Delete old cfgs. */ del_redir_spool_cfg(ptr, &ptr->redir_chain); /* Add new entries. */ add_redir_spool_cfg((char *)(ucfg + 1), ptr); IPFW_UH_WLOCK(chain); /* Extra check to avoid race with another ipfw_nat_cfg() */ tcfg = NULL; if (gencnt != chain->gencnt) tcfg = lookup_nat_name(&chain->nat, ucfg->name); IPFW_WLOCK(chain); if (tcfg != NULL) LIST_REMOVE(tcfg, _next); LIST_INSERT_HEAD(&chain->nat, ptr, _next); IPFW_WUNLOCK(chain); chain->gencnt++; IPFW_UH_WUNLOCK(chain); if (tcfg != NULL) free_nat_instance(ptr); } /* * Creates/configure nat44 instance * Data layout (v0)(current): * Request: [ ipfw_obj_header nat44_cfg_nat .. 
] * * Returns 0 on success */ static int nat44_cfg(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_header *oh; struct nat44_cfg_nat *ucfg; int id; size_t read; char *errptr; /* Check minimum header size */ if (sd->valsize < (sizeof(*oh) + sizeof(*ucfg))) return (EINVAL); oh = (ipfw_obj_header *)sd->kbuf; /* Basic length checks for TLVs */ if (oh->ntlv.head.length != sizeof(oh->ntlv)) return (EINVAL); ucfg = (struct nat44_cfg_nat *)(oh + 1); /* Check if name is properly terminated and looks like number */ if (strnlen(ucfg->name, sizeof(ucfg->name)) == sizeof(ucfg->name)) return (EINVAL); id = strtol(ucfg->name, &errptr, 10); if (id == 0 || *errptr != '\0') return (EINVAL); read = sizeof(*oh) + sizeof(*ucfg); /* Check number of redirs */ if (sd->valsize < read + ucfg->redir_cnt*sizeof(struct nat44_cfg_redir)) return (EINVAL); nat44_config(chain, ucfg); return (0); } /* * Destroys given nat instances. * Data layout (v0)(current): * Request: [ ipfw_obj_header ] * * Returns 0 on success */ static int nat44_destroy(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_header *oh; struct cfg_nat *ptr; ipfw_obj_ntlv *ntlv; /* Check minimum header size */ if (sd->valsize < sizeof(*oh)) return (EINVAL); oh = (ipfw_obj_header *)sd->kbuf; /* Basic length checks for TLVs */ if (oh->ntlv.head.length != sizeof(oh->ntlv)) return (EINVAL); ntlv = &oh->ntlv; /* Check if name is properly terminated */ if (strnlen(ntlv->name, sizeof(ntlv->name)) == sizeof(ntlv->name)) return (EINVAL); IPFW_UH_WLOCK(chain); ptr = lookup_nat_name(&chain->nat, ntlv->name); if (ptr == NULL) { IPFW_UH_WUNLOCK(chain); return (ESRCH); } IPFW_WLOCK(chain); LIST_REMOVE(ptr, _next); flush_nat_ptrs(chain, ptr->id); IPFW_WUNLOCK(chain); IPFW_UH_WUNLOCK(chain); free_nat_instance(ptr); return (0); } static void export_nat_cfg(struct cfg_nat *ptr, struct nat44_cfg_nat *ucfg) { snprintf(ucfg->name, sizeof(ucfg->name), "%d", ptr->id); ucfg->ip = 
ptr->ip; ucfg->redir_cnt = ptr->redir_cnt; ucfg->mode = ptr->mode; ucfg->alias_port_lo = ptr->alias_port_lo; ucfg->alias_port_hi = ptr->alias_port_hi; strlcpy(ucfg->if_name, ptr->if_name, sizeof(ucfg->if_name)); } /* * Gets config for given nat instance * Data layout (v0)(current): * Request: [ ipfw_obj_header nat44_cfg_nat .. ] * * Returns 0 on success */ static int nat44_get_cfg(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_header *oh; struct nat44_cfg_nat *ucfg; struct cfg_nat *ptr; struct cfg_redir *r; struct cfg_spool *s; struct nat44_cfg_redir *ser_r; struct nat44_cfg_spool *ser_s; size_t sz; sz = sizeof(*oh) + sizeof(*ucfg); /* Check minimum header size */ if (sd->valsize < sz) return (EINVAL); oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz); /* Basic length checks for TLVs */ if (oh->ntlv.head.length != sizeof(oh->ntlv)) return (EINVAL); ucfg = (struct nat44_cfg_nat *)(oh + 1); /* Check if name is properly terminated */ if (strnlen(ucfg->name, sizeof(ucfg->name)) == sizeof(ucfg->name)) return (EINVAL); IPFW_UH_RLOCK(chain); ptr = lookup_nat_name(&chain->nat, ucfg->name); if (ptr == NULL) { IPFW_UH_RUNLOCK(chain); return (ESRCH); } export_nat_cfg(ptr, ucfg); /* Estimate memory amount */ sz = sizeof(ipfw_obj_header) + sizeof(struct nat44_cfg_nat); LIST_FOREACH(r, &ptr->redir_chain, _next) { sz += sizeof(struct nat44_cfg_redir); LIST_FOREACH(s, &r->spool_chain, _next) sz += sizeof(struct nat44_cfg_spool); } ucfg->size = sz; if (sd->valsize < sz) { /* * Submitted buffer size is not enough. * WE've already filled in @ucfg structure with * relevant info including size, so we * can return. Buffer will be flushed automatically. 
*/ IPFW_UH_RUNLOCK(chain); return (ENOMEM); } /* Size OK, let's copy data */ LIST_FOREACH(r, &ptr->redir_chain, _next) { ser_r = (struct nat44_cfg_redir *)ipfw_get_sopt_space(sd, sizeof(*ser_r)); ser_r->mode = r->mode; ser_r->laddr = r->laddr; ser_r->paddr = r->paddr; ser_r->raddr = r->raddr; ser_r->lport = r->lport; ser_r->pport = r->pport; ser_r->rport = r->rport; ser_r->pport_cnt = r->pport_cnt; ser_r->rport_cnt = r->rport_cnt; ser_r->proto = r->proto; ser_r->spool_cnt = r->spool_cnt; LIST_FOREACH(s, &r->spool_chain, _next) { ser_s = (struct nat44_cfg_spool *)ipfw_get_sopt_space( sd, sizeof(*ser_s)); ser_s->addr = s->addr; ser_s->port = s->port; } } IPFW_UH_RUNLOCK(chain); return (0); } /* * Lists all nat44 instances currently available in kernel. * Data layout (v0)(current): * Request: [ ipfw_obj_lheader ] * Reply: [ ipfw_obj_lheader nat44_cfg_nat x N ] * * Returns 0 on success */ static int nat44_list_nat(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_lheader *olh; struct nat44_cfg_nat *ucfg; struct cfg_nat *ptr; int nat_count; /* Check minimum header size */ if (sd->valsize < sizeof(ipfw_obj_lheader)) return (EINVAL); olh = (ipfw_obj_lheader *)ipfw_get_sopt_header(sd, sizeof(*olh)); IPFW_UH_RLOCK(chain); nat_count = 0; LIST_FOREACH(ptr, &chain->nat, _next) nat_count++; olh->count = nat_count; olh->objsize = sizeof(struct nat44_cfg_nat); olh->size = sizeof(*olh) + olh->count * olh->objsize; if (sd->valsize < olh->size) { IPFW_UH_RUNLOCK(chain); return (ENOMEM); } LIST_FOREACH(ptr, &chain->nat, _next) { ucfg = (struct nat44_cfg_nat *)ipfw_get_sopt_space(sd, sizeof(*ucfg)); export_nat_cfg(ptr, ucfg); } IPFW_UH_RUNLOCK(chain); return (0); } /* * Gets log for given nat instance * Data layout (v0)(current): * Request: [ ipfw_obj_header nat44_cfg_nat ] * Reply: [ ipfw_obj_header nat44_cfg_nat LOGBUFFER ] * * Returns 0 on success */ static int nat44_get_log(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) 
{ ipfw_obj_header *oh; struct nat44_cfg_nat *ucfg; struct cfg_nat *ptr; void *pbuf; size_t sz; sz = sizeof(*oh) + sizeof(*ucfg); /* Check minimum header size */ if (sd->valsize < sz) return (EINVAL); oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz); /* Basic length checks for TLVs */ if (oh->ntlv.head.length != sizeof(oh->ntlv)) return (EINVAL); ucfg = (struct nat44_cfg_nat *)(oh + 1); /* Check if name is properly terminated */ if (strnlen(ucfg->name, sizeof(ucfg->name)) == sizeof(ucfg->name)) return (EINVAL); IPFW_UH_RLOCK(chain); ptr = lookup_nat_name(&chain->nat, ucfg->name); if (ptr == NULL) { IPFW_UH_RUNLOCK(chain); return (ESRCH); } if (ptr->lib->logDesc == NULL) { IPFW_UH_RUNLOCK(chain); return (ENOENT); } export_nat_cfg(ptr, ucfg); /* Estimate memory amount */ ucfg->size = sizeof(struct nat44_cfg_nat) + LIBALIAS_BUF_SIZE; if (sd->valsize < sz + sizeof(*oh)) { /* * Submitted buffer size is not enough. * WE've already filled in @ucfg structure with * relevant info including size, so we * can return. Buffer will be flushed automatically. 
*/ IPFW_UH_RUNLOCK(chain); return (ENOMEM); } pbuf = (void *)ipfw_get_sopt_space(sd, LIBALIAS_BUF_SIZE); memcpy(pbuf, ptr->lib->logDesc, LIBALIAS_BUF_SIZE); IPFW_UH_RUNLOCK(chain); return (0); } static struct ipfw_sopt_handler scodes[] = { - { IP_FW_NAT44_XCONFIG, 0, HDIR_SET, nat44_cfg }, - { IP_FW_NAT44_DESTROY, 0, HDIR_SET, nat44_destroy }, - { IP_FW_NAT44_XGETCONFIG, 0, HDIR_GET, nat44_get_cfg }, - { IP_FW_NAT44_LIST_NAT, 0, HDIR_GET, nat44_list_nat }, - { IP_FW_NAT44_XGETLOG, 0, HDIR_GET, nat44_get_log }, + { IP_FW_NAT44_XCONFIG, IP_FW3_OPVER, HDIR_SET, nat44_cfg }, + { IP_FW_NAT44_DESTROY, IP_FW3_OPVER, HDIR_SET, nat44_destroy }, + { IP_FW_NAT44_XGETCONFIG, IP_FW3_OPVER, HDIR_GET, nat44_get_cfg }, + { IP_FW_NAT44_LIST_NAT, IP_FW3_OPVER, HDIR_GET, nat44_list_nat }, + { IP_FW_NAT44_XGETLOG, IP_FW3_OPVER, HDIR_GET, nat44_get_log }, }; /* * Legacy configuration routines */ struct cfg_spool_legacy { LIST_ENTRY(cfg_spool_legacy) _next; struct in_addr addr; u_short port; }; struct cfg_redir_legacy { LIST_ENTRY(cfg_redir) _next; u_int16_t mode; struct in_addr laddr; struct in_addr paddr; struct in_addr raddr; u_short lport; u_short pport; u_short rport; u_short pport_cnt; u_short rport_cnt; int proto; struct alias_link **alink; u_int16_t spool_cnt; LIST_HEAD(, cfg_spool_legacy) spool_chain; }; struct cfg_nat_legacy { LIST_ENTRY(cfg_nat_legacy) _next; int id; struct in_addr ip; char if_name[IF_NAMESIZE]; int mode; struct libalias *lib; int redir_cnt; LIST_HEAD(, cfg_redir_legacy) redir_chain; }; static int ipfw_nat_cfg(struct sockopt *sopt) { struct cfg_nat_legacy *cfg; struct nat44_cfg_nat *ucfg; struct cfg_redir_legacy *rdir; struct nat44_cfg_redir *urdir; char *buf; size_t len, len2; int error, i; len = sopt->sopt_valsize; len2 = len + 128; /* * Allocate 2x buffer to store converted structures. * new redir_cfg has shrunk, so we're sure that * new buffer size is enough. 
*/ buf = malloc(roundup2(len, 8) + len2, M_TEMP, M_WAITOK | M_ZERO); error = sooptcopyin(sopt, buf, len, sizeof(struct cfg_nat_legacy)); if (error != 0) goto out; cfg = (struct cfg_nat_legacy *)buf; if (cfg->id < 0) { error = EINVAL; goto out; } ucfg = (struct nat44_cfg_nat *)&buf[roundup2(len, 8)]; snprintf(ucfg->name, sizeof(ucfg->name), "%d", cfg->id); strlcpy(ucfg->if_name, cfg->if_name, sizeof(ucfg->if_name)); ucfg->ip = cfg->ip; ucfg->mode = cfg->mode; ucfg->redir_cnt = cfg->redir_cnt; if (len < sizeof(*cfg) + cfg->redir_cnt * sizeof(*rdir)) { error = EINVAL; goto out; } urdir = (struct nat44_cfg_redir *)(ucfg + 1); rdir = (struct cfg_redir_legacy *)(cfg + 1); for (i = 0; i < cfg->redir_cnt; i++) { urdir->mode = rdir->mode; urdir->laddr = rdir->laddr; urdir->paddr = rdir->paddr; urdir->raddr = rdir->raddr; urdir->lport = rdir->lport; urdir->pport = rdir->pport; urdir->rport = rdir->rport; urdir->pport_cnt = rdir->pport_cnt; urdir->rport_cnt = rdir->rport_cnt; urdir->proto = rdir->proto; urdir->spool_cnt = rdir->spool_cnt; urdir++; rdir++; } nat44_config(&V_layer3_chain, ucfg); out: free(buf, M_TEMP); return (error); } static int ipfw_nat_del(struct sockopt *sopt) { struct cfg_nat *ptr; struct ip_fw_chain *chain = &V_layer3_chain; int i; sooptcopyin(sopt, &i, sizeof i, sizeof i); /* XXX validate i */ IPFW_UH_WLOCK(chain); ptr = lookup_nat(&chain->nat, i); if (ptr == NULL) { IPFW_UH_WUNLOCK(chain); return (EINVAL); } IPFW_WLOCK(chain); LIST_REMOVE(ptr, _next); flush_nat_ptrs(chain, i); IPFW_WUNLOCK(chain); IPFW_UH_WUNLOCK(chain); free_nat_instance(ptr); return (0); } static int ipfw_nat_get_cfg(struct sockopt *sopt) { struct ip_fw_chain *chain = &V_layer3_chain; struct cfg_nat *n; struct cfg_nat_legacy *ucfg; struct cfg_redir *r; struct cfg_spool *s; struct cfg_redir_legacy *ser_r; struct cfg_spool_legacy *ser_s; char *data; int gencnt, nat_cnt, len, error; nat_cnt = 0; len = sizeof(nat_cnt); IPFW_UH_RLOCK(chain); retry: gencnt = chain->gencnt; /* Estimate 
memory amount */ LIST_FOREACH(n, &chain->nat, _next) { nat_cnt++; len += sizeof(struct cfg_nat_legacy); LIST_FOREACH(r, &n->redir_chain, _next) { len += sizeof(struct cfg_redir_legacy); LIST_FOREACH(s, &r->spool_chain, _next) len += sizeof(struct cfg_spool_legacy); } } IPFW_UH_RUNLOCK(chain); data = malloc(len, M_TEMP, M_WAITOK | M_ZERO); bcopy(&nat_cnt, data, sizeof(nat_cnt)); nat_cnt = 0; len = sizeof(nat_cnt); IPFW_UH_RLOCK(chain); if (gencnt != chain->gencnt) { free(data, M_TEMP); goto retry; } /* Serialize all the data. */ LIST_FOREACH(n, &chain->nat, _next) { ucfg = (struct cfg_nat_legacy *)&data[len]; ucfg->id = n->id; ucfg->ip = n->ip; ucfg->redir_cnt = n->redir_cnt; ucfg->mode = n->mode; strlcpy(ucfg->if_name, n->if_name, sizeof(ucfg->if_name)); len += sizeof(struct cfg_nat_legacy); LIST_FOREACH(r, &n->redir_chain, _next) { ser_r = (struct cfg_redir_legacy *)&data[len]; ser_r->mode = r->mode; ser_r->laddr = r->laddr; ser_r->paddr = r->paddr; ser_r->raddr = r->raddr; ser_r->lport = r->lport; ser_r->pport = r->pport; ser_r->rport = r->rport; ser_r->pport_cnt = r->pport_cnt; ser_r->rport_cnt = r->rport_cnt; ser_r->proto = r->proto; ser_r->spool_cnt = r->spool_cnt; len += sizeof(struct cfg_redir_legacy); LIST_FOREACH(s, &r->spool_chain, _next) { ser_s = (struct cfg_spool_legacy *)&data[len]; ser_s->addr = s->addr; ser_s->port = s->port; len += sizeof(struct cfg_spool_legacy); } } } IPFW_UH_RUNLOCK(chain); error = sooptcopyout(sopt, data, len); free(data, M_TEMP); return (error); } static int ipfw_nat_get_log(struct sockopt *sopt) { uint8_t *data; struct cfg_nat *ptr; int i, size; struct ip_fw_chain *chain; IPFW_RLOCK_TRACKER; chain = &V_layer3_chain; IPFW_RLOCK(chain); /* one pass to count, one to copy the data */ i = 0; LIST_FOREACH(ptr, &chain->nat, _next) { if (ptr->lib->logDesc == NULL) continue; i++; } size = i * (LIBALIAS_BUF_SIZE + sizeof(int)); data = malloc(size, M_IPFW, M_NOWAIT | M_ZERO); if (data == NULL) { IPFW_RUNLOCK(chain); return (ENOSPC); } i 
= 0; LIST_FOREACH(ptr, &chain->nat, _next) { if (ptr->lib->logDesc == NULL) continue; bcopy(&ptr->id, &data[i], sizeof(int)); i += sizeof(int); bcopy(ptr->lib->logDesc, &data[i], LIBALIAS_BUF_SIZE); i += LIBALIAS_BUF_SIZE; } IPFW_RUNLOCK(chain); sooptcopyout(sopt, data, size); free(data, M_IPFW); return(0); } static int vnet_ipfw_nat_init(const void *arg __unused) { V_ipfw_nat_ready = 1; return (0); } static int vnet_ipfw_nat_uninit(const void *arg __unused) { struct cfg_nat *ptr, *ptr_temp; struct ip_fw_chain *chain; chain = &V_layer3_chain; IPFW_WLOCK(chain); V_ipfw_nat_ready = 0; LIST_FOREACH_SAFE(ptr, &chain->nat, _next, ptr_temp) { LIST_REMOVE(ptr, _next); free_nat_instance(ptr); } flush_nat_ptrs(chain, -1 /* flush all */); IPFW_WUNLOCK(chain); return (0); } static void ipfw_nat_init(void) { /* init ipfw hooks */ ipfw_nat_ptr = ipfw_nat; lookup_nat_ptr = lookup_nat; ipfw_nat_cfg_ptr = ipfw_nat_cfg; ipfw_nat_del_ptr = ipfw_nat_del; ipfw_nat_get_cfg_ptr = ipfw_nat_get_cfg; ipfw_nat_get_log_ptr = ipfw_nat_get_log; IPFW_ADD_SOPT_HANDLER(1, scodes); ifaddr_event_tag = EVENTHANDLER_REGISTER(ifaddr_event, ifaddr_change, NULL, EVENTHANDLER_PRI_ANY); } static void ipfw_nat_destroy(void) { EVENTHANDLER_DEREGISTER(ifaddr_event, ifaddr_event_tag); /* deregister ipfw_nat */ IPFW_DEL_SOPT_HANDLER(1, scodes); ipfw_nat_ptr = NULL; lookup_nat_ptr = NULL; ipfw_nat_cfg_ptr = NULL; ipfw_nat_del_ptr = NULL; ipfw_nat_get_cfg_ptr = NULL; ipfw_nat_get_log_ptr = NULL; } static int ipfw_nat_modevent(module_t mod, int type, void *unused) { int err = 0; switch (type) { case MOD_LOAD: break; case MOD_UNLOAD: break; default: return EOPNOTSUPP; break; } return err; } static moduledata_t ipfw_nat_mod = { "ipfw_nat", ipfw_nat_modevent, 0 }; /* Define startup order. 
*/ #define IPFW_NAT_SI_SUB_FIREWALL SI_SUB_PROTO_FIREWALL #define IPFW_NAT_MODEVENT_ORDER (SI_ORDER_ANY - 128) /* after ipfw */ #define IPFW_NAT_MODULE_ORDER (IPFW_NAT_MODEVENT_ORDER + 1) #define IPFW_NAT_VNET_ORDER (IPFW_NAT_MODEVENT_ORDER + 2) DECLARE_MODULE(ipfw_nat, ipfw_nat_mod, IPFW_NAT_SI_SUB_FIREWALL, SI_ORDER_ANY); MODULE_DEPEND(ipfw_nat, libalias, 1, 1, 1); MODULE_DEPEND(ipfw_nat, ipfw, 3, 3, 3); MODULE_VERSION(ipfw_nat, 1); SYSINIT(ipfw_nat_init, IPFW_NAT_SI_SUB_FIREWALL, IPFW_NAT_MODULE_ORDER, ipfw_nat_init, NULL); VNET_SYSINIT(vnet_ipfw_nat_init, IPFW_NAT_SI_SUB_FIREWALL, IPFW_NAT_VNET_ORDER, vnet_ipfw_nat_init, NULL); SYSUNINIT(ipfw_nat_destroy, IPFW_NAT_SI_SUB_FIREWALL, IPFW_NAT_MODULE_ORDER, ipfw_nat_destroy, NULL); VNET_SYSUNINIT(vnet_ipfw_nat_uninit, IPFW_NAT_SI_SUB_FIREWALL, IPFW_NAT_VNET_ORDER, vnet_ipfw_nat_uninit, NULL); /* end of file */ diff --git a/sys/netpfil/ipfw/ip_fw_private.h b/sys/netpfil/ipfw/ip_fw_private.h index 12cc8008f5df..79b3ed43f63b 100644 --- a/sys/netpfil/ipfw/ip_fw_private.h +++ b/sys/netpfil/ipfw/ip_fw_private.h @@ -1,828 +1,820 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _IPFW2_PRIVATE_H #define _IPFW2_PRIVATE_H /* * Internal constants and data structures used by ipfw components * and not meant to be exported outside the kernel. */ #ifdef _KERNEL /* * For platforms that do not have SYSCTL support, we wrap the * SYSCTL_* into a function (one per file) to collect the values * into an array at module initialization. The wrapping macros, * SYSBEGIN() and SYSEND, are empty in the default case. */ #ifndef SYSBEGIN #define SYSBEGIN(x) #endif #ifndef SYSEND #define SYSEND #endif /* Return values from ipfw_chk() */ enum { IP_FW_PASS = 0, IP_FW_DENY, IP_FW_DIVERT, IP_FW_TEE, IP_FW_DUMMYNET, IP_FW_NETGRAPH, IP_FW_NGTEE, IP_FW_NAT, IP_FW_REASS, IP_FW_NAT64, }; /* * Structure for collecting parameters to dummynet for ip6_output forwarding */ struct _ip6dn_args { struct ip6_pktopts *opt_or; int flags_or; struct ip6_moptions *im6o_or; struct ifnet *origifp_or; struct ifnet *ifp_or; struct sockaddr_in6 dst_or; u_long mtu_or; }; /* * Arguments for calling ipfw_chk() and dummynet_io(). We put them * all into a structure because this way it is easier and more * efficient to pass variables around and extend the interface. 
*/ struct ip_fw_args { uint32_t flags; #define IPFW_ARGS_ETHER 0x00010000 /* valid ethernet header */ #define IPFW_ARGS_NH4 0x00020000 /* IPv4 next hop in hopstore */ #define IPFW_ARGS_NH6 0x00040000 /* IPv6 next hop in hopstore */ #define IPFW_ARGS_NH4PTR 0x00080000 /* IPv4 next hop in next_hop */ #define IPFW_ARGS_NH6PTR 0x00100000 /* IPv6 next hop in next_hop6 */ #define IPFW_ARGS_REF 0x00200000 /* valid ipfw_rule_ref */ #define IPFW_ARGS_IN 0x00400000 /* called on input */ #define IPFW_ARGS_OUT 0x00800000 /* called on output */ #define IPFW_ARGS_IP4 0x01000000 /* belongs to v4 ISR */ #define IPFW_ARGS_IP6 0x02000000 /* belongs to v6 ISR */ #define IPFW_ARGS_DROP 0x04000000 /* drop it (dummynet) */ #define IPFW_ARGS_LENMASK 0x0000ffff /* length of data in *mem */ #define IPFW_ARGS_LENGTH(f) ((f) & IPFW_ARGS_LENMASK) /* * On return, it points to the matching rule. * On entry, rule.slot > 0 means the info is valid and * contains the starting rule for an ipfw search. * If chain_id == chain->id && slot >0 then jump to that slot. * Otherwise, we locate the first rule >= rulenum:rule_id */ struct ipfw_rule_ref rule; /* match/restart info */ struct ifnet *ifp; /* input/output interface */ struct inpcb *inp; union { /* * next_hop[6] pointers can be used to point to next hop * stored in rule's opcode to avoid copying into hopstore. * Also, it is expected that all 0x1-0x10 flags are mutually * exclusive. 
*/ struct sockaddr_in *next_hop; struct sockaddr_in6 *next_hop6; /* ipfw next hop storage */ struct sockaddr_in hopstore; struct ip_fw_nh6 { struct in6_addr sin6_addr; uint32_t sin6_scope_id; uint16_t sin6_port; } hopstore6; }; union { struct mbuf *m; /* the mbuf chain */ void *mem; /* or memory pointer */ }; struct ipfw_flow_id f_id; /* grabbed from IP header */ }; MALLOC_DECLARE(M_IPFW); /* wrapper for freeing a packet, in case we need to do more work */ #ifndef FREE_PKT #if defined(__linux__) || defined(_WIN32) #define FREE_PKT(m) netisr_dispatch(-1, m) #else #define FREE_PKT(m) m_freem(m) #endif #endif /* !FREE_PKT */ /* * Function definitions. */ int ipfw_chk(struct ip_fw_args *args); struct mbuf *ipfw_send_pkt(struct mbuf *, struct ipfw_flow_id *, u_int32_t, u_int32_t, int); int ipfw_attach_hooks(void); void ipfw_detach_hooks(void); #ifdef NOTYET void ipfw_nat_destroy(void); #endif /* In ip_fw_log.c */ struct ip; +struct ip_fw; struct ip_fw_chain; void ipfw_bpf_init(int); void ipfw_bpf_uninit(int); void ipfw_bpf_tap(u_char *, u_int); void ipfw_bpf_mtap(struct mbuf *); void ipfw_bpf_mtap2(void *, u_int, struct mbuf *); void ipfw_log(struct ip_fw_chain *chain, struct ip_fw *f, u_int hlen, - struct ip_fw_args *args, u_short offset, uint32_t tablearg, struct ip *ip); + struct ip_fw_args *args, u_short offset, uint32_t tablearg, struct ip *ip, + void *eh); VNET_DECLARE(u_int64_t, norule_counter); #define V_norule_counter VNET(norule_counter) VNET_DECLARE(int, verbose_limit); #define V_verbose_limit VNET(verbose_limit) /* In ip_fw_dynamic.c */ struct sockopt_data; enum { /* result for matching dynamic rules */ MATCH_REVERSE = 0, MATCH_FORWARD, MATCH_NONE, MATCH_UNKNOWN, }; /* * Macro to determine that we need to do or redo dynamic state lookup. * direction == MATCH_UNKNOWN means that this is first lookup, then we need * to do lookup. * Otherwise check the state name, if previous lookup was for "any" name, * this means there is no state with specific name. 
Thus no need to do * lookup. If previous name was not "any", redo lookup for specific name. */ #define DYN_LOOKUP_NEEDED(p, cmd) \ ((p)->direction == MATCH_UNKNOWN || \ ((p)->kidx != 0 && (p)->kidx != (cmd)->arg1)) #define DYN_INFO_INIT(p) do { \ (p)->direction = MATCH_UNKNOWN; \ (p)->kidx = 0; \ } while (0) struct ipfw_dyn_info { - uint16_t direction; /* match direction */ - uint16_t kidx; /* state name kidx */ + uint32_t direction; /* match direction */ + uint32_t kidx; /* state name kidx */ uint32_t hashval; /* hash value */ uint32_t version; /* bucket version */ uint32_t f_pos; }; int ipfw_dyn_install_state(struct ip_fw_chain *chain, struct ip_fw *rule, const ipfw_insn_limit *cmd, const struct ip_fw_args *args, const void *ulp, int pktlen, struct ipfw_dyn_info *info, uint32_t tablearg); struct ip_fw *ipfw_dyn_lookup_state(const struct ip_fw_args *args, const void *ulp, int pktlen, const ipfw_insn *cmd, struct ipfw_dyn_info *info); int ipfw_is_dyn_rule(struct ip_fw *rule); void ipfw_expire_dyn_states(struct ip_fw_chain *, ipfw_range_tlv *); void ipfw_get_dynamic(struct ip_fw_chain *chain, char **bp, const char *ep); int ipfw_dump_states(struct ip_fw_chain *chain, struct sockopt_data *sd); void ipfw_dyn_init(struct ip_fw_chain *); /* per-vnet initialization */ void ipfw_dyn_uninit(int); /* per-vnet deinitialization */ int ipfw_dyn_len(void); uint32_t ipfw_dyn_get_count(uint32_t *, int *); -void ipfw_dyn_reset_eaction(struct ip_fw_chain *ch, uint16_t eaction_id, - uint16_t default_id, uint16_t instance_id); +void ipfw_dyn_reset_eaction(struct ip_fw_chain *ch, uint32_t eaction_id, + uint32_t default_id, uint32_t instance_id); /* common variables */ VNET_DECLARE(int, fw_one_pass); #define V_fw_one_pass VNET(fw_one_pass) VNET_DECLARE(int, fw_verbose); #define V_fw_verbose VNET(fw_verbose) VNET_DECLARE(struct ip_fw_chain, layer3_chain); #define V_layer3_chain VNET(layer3_chain) VNET_DECLARE(int, ipfw_vnet_ready); #define V_ipfw_vnet_ready VNET(ipfw_vnet_ready) 
+VNET_DECLARE(int, skipto_cache); +#define V_skipto_cache VNET(skipto_cache) + VNET_DECLARE(u_int32_t, set_disable); #define V_set_disable VNET(set_disable) VNET_DECLARE(int, autoinc_step); #define V_autoinc_step VNET(autoinc_step) VNET_DECLARE(unsigned int, fw_tables_max); #define V_fw_tables_max VNET(fw_tables_max) VNET_DECLARE(unsigned int, fw_tables_sets); #define V_fw_tables_sets VNET(fw_tables_sets) struct tables_config; #ifdef _KERNEL /* * Here we have the structure representing an ipfw rule. * * It starts with a general area * followed by an array of one or more instructions, which the code * accesses as an array of 32-bit values. * * Given a rule pointer r: * * r->cmd is the start of the first instruction. * ACTION_PTR(r) is the start of the first action (things to do * once a rule matched). */ struct ip_fw_jump_cache { union { struct { uint32_t id; uint32_t pos; }; uint64_t raw_value; }; }; struct ip_fw { uint16_t act_ofs; /* offset of action in 32-bit units */ uint16_t cmd_len; /* # of 32-bit words in cmd */ - uint16_t rulenum; /* rule number */ + uint32_t rulenum; /* rule number */ uint8_t set; /* rule set (0..31) */ uint8_t flags; /* currently unused */ + uint16_t _pad; counter_u64_t cntr; /* Pointer to rule counters */ struct ip_fw_jump_cache cache; /* used by jump_fast */ uint32_t timestamp; /* tv_sec of last match */ uint32_t id; /* rule id */ uint32_t refcnt; /* number of references */ struct ip_fw *next; /* linked list of deleted rules */ ipfw_insn cmd[1]; /* storage for commands */ }; #define IPFW_RULE_CNTR_SIZE (2 * sizeof(uint64_t)) #endif struct ip_fw_chain { struct ip_fw **map; /* array of rule ptrs to ease lookup */ uint32_t id; /* ruleset id */ int n_rules; /* number of static rules */ void *tablestate; /* runtime table info */ void *valuestate; /* runtime table value info */ int *idxmap; /* skipto array of rules */ void **srvstate; /* runtime service mappings */ #if defined( __linux__ ) || defined( _WIN32 ) spinlock_t rwmtx; #else struct 
rmlock rwmtx; #endif - int static_len; /* total len of static rules (v0) */ uint32_t gencnt; /* NAT generation count */ LIST_HEAD(nat_list, cfg_nat) nat; /* list of nat entries */ struct ip_fw *default_rule; struct tables_config *tblcfg; /* tables module data */ void *ifcfg; /* interface module data */ int *idxmap_back; /* standby skipto array of rules */ struct namedobj_instance *srvmap; /* cfg name->number mappings */ #if defined( __linux__ ) || defined( _WIN32 ) spinlock_t uh_lock; #else struct rwlock uh_lock; /* lock for upper half */ #endif }; /* 64-byte structure representing multi-field table value */ struct table_value { uint32_t tag; /* O_TAG/O_TAGGED */ - uint32_t pipe; /* O_PIPE/O_QUEUE */ + uint16_t pipe; /* O_PIPE/O_QUEUE */ uint16_t divert; /* O_DIVERT/O_TEE */ - uint16_t skipto; /* skipto, CALLRET */ + uint32_t skipto; /* skipto, CALLRET */ uint32_t netgraph; /* O_NETGRAPH/O_NGTEE */ uint16_t fib; /* O_SETFIB */ uint16_t nat; /* O_NAT */ uint32_t mark; /* O_SETMARK/O_MARK */ uint32_t nh4; uint8_t dscp; uint8_t spare0; uint16_t kidx; /* value kernel index */ /* -- 32 bytes -- */ struct in6_addr nh6; uint32_t limit; /* O_LIMIT */ uint32_t zoneid; /* scope zone id for nh6 */ uint64_t refcnt; /* Number of references */ }; struct named_object { TAILQ_ENTRY(named_object) nn_next; /* namehash */ TAILQ_ENTRY(named_object) nv_next; /* valuehash */ char *name; /* object name */ uint16_t etlv; /* Export TLV id */ uint8_t subtype;/* object subtype within class */ uint8_t set; /* set object belongs to */ - uint16_t kidx; /* object kernel index */ - uint16_t spare; + uint32_t kidx; /* object kernel index */ uint32_t ocnt; /* object counter for internal use */ uint32_t refcnt; /* number of references */ }; TAILQ_HEAD(namedobjects_head, named_object); struct sockopt; /* used by tcp_var.h */ struct sockopt_data { caddr_t kbuf; /* allocated buffer */ size_t ksize; /* given buffer size */ size_t koff; /* data already used */ size_t kavail; /* number of bytes available 
*/ size_t ktotal; /* total bytes pushed */ struct sockopt *sopt; /* socket data */ caddr_t sopt_val; /* sopt user buffer */ size_t valsize; /* original data size */ }; struct ipfw_ifc; typedef void (ipfw_ifc_cb)(struct ip_fw_chain *ch, void *cbdata, uint16_t ifindex); struct ipfw_iface { struct named_object no; char ifname[64]; int resolved; uint16_t ifindex; uint16_t spare; uint64_t gencnt; TAILQ_HEAD(, ipfw_ifc) consumers; }; struct ipfw_ifc { TAILQ_ENTRY(ipfw_ifc) next; struct ipfw_iface *iface; ipfw_ifc_cb *cb; void *cbdata; }; /* Macro for working with various counters */ #define IPFW_INC_RULE_COUNTER(_cntr, _bytes) do { \ counter_u64_add((_cntr)->cntr, 1); \ counter_u64_add((_cntr)->cntr + 1, _bytes); \ if ((_cntr)->timestamp != time_uptime) \ (_cntr)->timestamp = time_uptime; \ } while (0) #define IPFW_INC_DYN_COUNTER(_cntr, _bytes) do { \ (_cntr)->pcnt++; \ (_cntr)->bcnt += _bytes; \ } while (0) #define IPFW_ZERO_RULE_COUNTER(_cntr) do { \ counter_u64_zero((_cntr)->cntr); \ counter_u64_zero((_cntr)->cntr + 1); \ (_cntr)->timestamp = 0; \ } while (0) #define IPFW_ZERO_DYN_COUNTER(_cntr) do { \ (_cntr)->pcnt = 0; \ (_cntr)->bcnt = 0; \ } while (0) #define TARG_VAL(ch, k, f) ((struct table_value *)((ch)->valuestate))[k].f #define IP_FW_ARG_TABLEARG(ch, a, f) \ (((a) == IP_FW_TARG) ? TARG_VAL(ch, tablearg, f) : (a)) /* * The lock is heavily used by ip_fw2.c (the main file) and ip_fw_nat.c * so the variable and the macros must be here. 
*/ #if defined( __linux__ ) || defined( _WIN32 ) #define IPFW_LOCK_INIT(_chain) do { \ rw_init(&(_chain)->rwmtx, "IPFW static rules"); \ rw_init(&(_chain)->uh_lock, "IPFW UH lock"); \ } while (0) #define IPFW_LOCK_DESTROY(_chain) do { \ rw_destroy(&(_chain)->rwmtx); \ rw_destroy(&(_chain)->uh_lock); \ } while (0) #define IPFW_RLOCK_ASSERT(_chain) rw_assert(&(_chain)->rwmtx, RA_RLOCKED) #define IPFW_WLOCK_ASSERT(_chain) rw_assert(&(_chain)->rwmtx, RA_WLOCKED) #define IPFW_RLOCK_TRACKER #define IPFW_RLOCK(p) rw_rlock(&(p)->rwmtx) #define IPFW_RUNLOCK(p) rw_runlock(&(p)->rwmtx) #define IPFW_WLOCK(p) rw_wlock(&(p)->rwmtx) #define IPFW_WUNLOCK(p) rw_wunlock(&(p)->rwmtx) #define IPFW_PF_RLOCK(p) IPFW_RLOCK(p) #define IPFW_PF_RUNLOCK(p) IPFW_RUNLOCK(p) #else /* FreeBSD */ #define IPFW_LOCK_INIT(_chain) do { \ rm_init_flags(&(_chain)->rwmtx, "IPFW static rules", RM_RECURSE); \ rw_init(&(_chain)->uh_lock, "IPFW UH lock"); \ } while (0) #define IPFW_LOCK_DESTROY(_chain) do { \ rm_destroy(&(_chain)->rwmtx); \ rw_destroy(&(_chain)->uh_lock); \ } while (0) #define IPFW_RLOCK_ASSERT(_chain) rm_assert(&(_chain)->rwmtx, RA_RLOCKED) #define IPFW_WLOCK_ASSERT(_chain) rm_assert(&(_chain)->rwmtx, RA_WLOCKED) #define IPFW_RLOCK_TRACKER struct rm_priotracker _tracker #define IPFW_RLOCK(p) rm_rlock(&(p)->rwmtx, &_tracker) #define IPFW_RUNLOCK(p) rm_runlock(&(p)->rwmtx, &_tracker) #define IPFW_WLOCK(p) rm_wlock(&(p)->rwmtx) #define IPFW_WUNLOCK(p) rm_wunlock(&(p)->rwmtx) #define IPFW_PF_RLOCK(p) IPFW_RLOCK(p) #define IPFW_PF_RUNLOCK(p) IPFW_RUNLOCK(p) #endif #define IPFW_UH_RLOCK_ASSERT(_chain) rw_assert(&(_chain)->uh_lock, RA_RLOCKED) #define IPFW_UH_WLOCK_ASSERT(_chain) rw_assert(&(_chain)->uh_lock, RA_WLOCKED) #define IPFW_UH_UNLOCK_ASSERT(_chain) rw_assert(&(_chain)->uh_lock, RA_UNLOCKED) #define IPFW_UH_RLOCK(p) rw_rlock(&(p)->uh_lock) #define IPFW_UH_RUNLOCK(p) rw_runlock(&(p)->uh_lock) #define IPFW_UH_WLOCK(p) rw_wlock(&(p)->uh_lock) #define IPFW_UH_WUNLOCK(p) 
rw_wunlock(&(p)->uh_lock) struct obj_idx { - uint16_t uidx; /* internal index supplied by userland */ - uint16_t kidx; /* kernel object index */ + uint32_t uidx; /* internal index supplied by userland */ + uint32_t kidx; /* kernel object index */ uint16_t off; /* tlv offset from rule end in 4-byte words */ uint8_t spare; uint8_t type; /* object type within its category */ }; struct rule_check_info { uint16_t flags; /* rule-specific check flags */ uint16_t object_opcodes; /* num of opcodes referencing objects */ uint16_t urule_numoff; /* offset of rulenum in bytes */ uint8_t version; /* rule version */ uint8_t spare; ipfw_obj_ctlv *ctlv; /* name TLV containter */ struct ip_fw *krule; /* resulting rule pointer */ caddr_t urule; /* original rule pointer */ struct obj_idx obuf[8]; /* table references storage */ }; -/* Legacy interface support */ -/* - * FreeBSD 8 export rule format - */ -struct ip_fw_rule0 { - struct ip_fw *x_next; /* linked list of rules */ - struct ip_fw *next_rule; /* ptr to next [skipto] rule */ - /* 'next_rule' is used to pass up 'set_disable' status */ - - uint16_t act_ofs; /* offset of action in 32-bit units */ - uint16_t cmd_len; /* # of 32-bit words in cmd */ - uint16_t rulenum; /* rule number */ - uint8_t set; /* rule set (0..31) */ - uint8_t _pad; /* padding */ - uint32_t id; /* rule id */ - - /* These fields are present in all rules. */ - uint64_t pcnt; /* Packet counter */ - uint64_t bcnt; /* Byte counter */ - uint32_t timestamp; /* tv_sec of last match */ - - ipfw_insn cmd[1]; /* storage for commands */ -}; - -struct ip_fw_bcounter0 { - uint64_t pcnt; /* Packet counter */ - uint64_t bcnt; /* Byte counter */ - uint32_t timestamp; /* tv_sec of last match */ -}; - /* Kernel rule length */ /* * RULE _K_ SIZE _V_ -> * get kernel size from userland rool version _V_. 
* RULE _U_ SIZE _V_ -> * get user size version _V_ from kernel rule * RULESIZE _V_ -> * get user size rule length */ -/* FreeBSD8 <> current kernel format */ -#define RULEUSIZE0(r) (sizeof(struct ip_fw_rule0) + (r)->cmd_len * 4 - 4) -#define RULEKSIZE0(r) roundup2((sizeof(struct ip_fw) + (r)->cmd_len*4 - 4), 8) /* FreeBSD11 <> current kernel format */ #define RULEUSIZE1(r) (roundup2(sizeof(struct ip_fw_rule) + \ (r)->cmd_len * 4 - 4, 8)) #define RULEKSIZE1(r) roundup2((sizeof(struct ip_fw) + (r)->cmd_len*4 - 4), 8) /* * Tables/Objects index rewriting code */ /* Default and maximum number of ipfw tables/objects. */ #define IPFW_TABLES_MAX 65536 #define IPFW_TABLES_DEFAULT 128 #define IPFW_OBJECTS_MAX 65536 -#define IPFW_OBJECTS_DEFAULT 1024 +#define IPFW_OBJECTS_DEFAULT 4096 #define CHAIN_TO_SRV(ch) ((ch)->srvmap) #define SRV_OBJECT(ch, idx) ((ch)->srvstate[(idx)]) struct tid_info { uint32_t set; /* table set */ - uint16_t uidx; /* table index */ + uint32_t uidx; /* table index */ uint8_t type; /* table type */ uint8_t atype; - uint8_t spare; + uint16_t spare; int tlen; /* Total TLV size block */ void *tlvs; /* Pointer to first TLV */ }; /* * Classifier callback. Checks if @cmd opcode contains kernel object reference. * If true, returns its index and type. * Returns 0 if match is found, 1 overwise. */ -typedef int (ipfw_obj_rw_cl)(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype); +typedef int (ipfw_obj_rw_cl)(ipfw_insn *cmd, uint32_t *puidx, uint8_t *ptype); /* * Updater callback. Sets kernel object reference index to @puidx */ -typedef void (ipfw_obj_rw_upd)(ipfw_insn *cmd, uint16_t puidx); +typedef void (ipfw_obj_rw_upd)(ipfw_insn *cmd, uint32_t puidx); /* * Finder callback. Tries to find named object by name (specified via @ti). * Stores found named object pointer in @pno. * If object was not found, NULL is stored. * * Return 0 if input data was valid. 
*/ typedef int (ipfw_obj_fname_cb)(struct ip_fw_chain *ch, struct tid_info *ti, struct named_object **pno); /* * Another finder callback. Tries to findex named object by kernel index. * * Returns pointer to named object or NULL. */ typedef struct named_object *(ipfw_obj_fidx_cb)(struct ip_fw_chain *ch, - uint16_t kidx); + uint32_t kidx); /* * Object creator callback. Tries to create object specified by @ti. * Stores newly-allocated object index in @pkidx. * * Returns 0 on success. */ typedef int (ipfw_obj_create_cb)(struct ip_fw_chain *ch, struct tid_info *ti, - uint16_t *pkidx); + uint32_t *pkidx); /* * Object destroy callback. Intended to free resources allocated by * create_object callback. */ typedef void (ipfw_obj_destroy_cb)(struct ip_fw_chain *ch, struct named_object *no); /* * Sets handler callback. Handles moving and swaping set of named object. * SWAP_ALL moves all named objects from set `set' to `new_set' and vise versa; * TEST_ALL checks that there aren't any named object with conflicting names; * MOVE_ALL moves all named objects from set `set' to `new_set'; * COUNT_ONE used to count number of references used by object with kidx `set'; * TEST_ONE checks that named object with kidx `set' can be moved to `new_set`; * MOVE_ONE moves named object with kidx `set' to set `new_set'. 
*/ enum ipfw_sets_cmd { SWAP_ALL = 0, TEST_ALL, MOVE_ALL, COUNT_ONE, TEST_ONE, MOVE_ONE }; typedef int (ipfw_obj_sets_cb)(struct ip_fw_chain *ch, - uint16_t set, uint8_t new_set, enum ipfw_sets_cmd cmd); + uint32_t set, uint8_t new_set, enum ipfw_sets_cmd cmd); struct opcode_obj_rewrite { uint32_t opcode; /* Opcode to act upon */ uint32_t etlv; /* Relevant export TLV id */ ipfw_obj_rw_cl *classifier; /* Check if rewrite is needed */ ipfw_obj_rw_upd *update; /* update cmd with new value */ ipfw_obj_fname_cb *find_byname; /* Find named object by name */ ipfw_obj_fidx_cb *find_bykidx; /* Find named object by kidx */ ipfw_obj_create_cb *create_object; /* Create named object */ ipfw_obj_destroy_cb *destroy_object;/* Destroy named object */ ipfw_obj_sets_cb *manage_sets; /* Swap or move sets */ }; #define IPFW_ADD_OBJ_REWRITER(f, c) do { \ if ((f) != 0) \ ipfw_add_obj_rewriter(c, \ sizeof(c) / sizeof(c[0])); \ } while(0) #define IPFW_DEL_OBJ_REWRITER(l, c) do { \ if ((l) != 0) \ ipfw_del_obj_rewriter(c, \ sizeof(c) / sizeof(c[0])); \ } while(0) /* In ip_fw_iface.c */ int ipfw_iface_init(void); void ipfw_iface_destroy(void); void vnet_ipfw_iface_destroy(struct ip_fw_chain *ch); int ipfw_iface_ref(struct ip_fw_chain *ch, char *name, struct ipfw_ifc *ic); void ipfw_iface_unref(struct ip_fw_chain *ch, struct ipfw_ifc *ic); void ipfw_iface_add_notify(struct ip_fw_chain *ch, struct ipfw_ifc *ic); void ipfw_iface_del_notify(struct ip_fw_chain *ch, struct ipfw_ifc *ic); /* In ip_fw_sockopt.c */ +enum ipfw_opcheck_result { + SUCCESS = 0, + FAILED, + BAD_SIZE, + CHECK_ACTION, +}; +typedef enum ipfw_opcheck_result (*ipfw_check_opcode_t)(ipfw_insn **, + int *, struct rule_check_info *); + +void ipfw_register_compat(ipfw_check_opcode_t); +void ipfw_unregister_compat(void); + +enum ipfw_opcheck_result ipfw_check_opcode(ipfw_insn **, int *, + struct rule_check_info *); void ipfw_init_skipto_cache(struct ip_fw_chain *chain); void ipfw_destroy_skipto_cache(struct ip_fw_chain *chain); 
+void ipfw_enable_skipto_cache(struct ip_fw_chain *chain); int ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id); int ipfw_ctl3(struct sockopt *sopt); int ipfw_add_protected_rule(struct ip_fw_chain *chain, struct ip_fw *rule, int locked); void ipfw_reap_add(struct ip_fw_chain *chain, struct ip_fw **head, struct ip_fw *rule); void ipfw_reap_rules(struct ip_fw *head); void ipfw_init_counters(void); void ipfw_destroy_counters(void); +int ipfw_commit_rules(struct ip_fw_chain *chain, struct rule_check_info *rci, + int count); +int delete_range(struct ip_fw_chain *chain, ipfw_range_tlv *rt, int *ndel); struct ip_fw *ipfw_alloc_rule(struct ip_fw_chain *chain, size_t rulesize); void ipfw_free_rule(struct ip_fw *rule); int ipfw_match_range(struct ip_fw *rule, ipfw_range_tlv *rt); -int ipfw_mark_object_kidx(uint32_t *bmask, uint16_t etlv, uint16_t kidx); +int ipfw_mark_object_kidx(uint32_t *bmask, uint16_t etlv, uint32_t kidx); ipfw_insn *ipfw_get_action(struct ip_fw *); +int ipfw_check_rule(struct ip_fw_rule *rule, size_t size, + struct rule_check_info *ci); typedef int (sopt_handler_f)(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd); struct ipfw_sopt_handler { uint16_t opcode; uint8_t version; uint8_t dir; sopt_handler_f *handler; uint64_t refcnt; }; #define HDIR_SET 0x01 /* Handler is used to set some data */ #define HDIR_GET 0x02 /* Handler is used to retrieve data */ #define HDIR_BOTH HDIR_GET|HDIR_SET void ipfw_init_sopt_handler(void); void ipfw_destroy_sopt_handler(void); void ipfw_add_sopt_handler(struct ipfw_sopt_handler *sh, size_t count); int ipfw_del_sopt_handler(struct ipfw_sopt_handler *sh, size_t count); caddr_t ipfw_get_sopt_space(struct sockopt_data *sd, size_t needed); caddr_t ipfw_get_sopt_header(struct sockopt_data *sd, size_t needed); #define IPFW_ADD_SOPT_HANDLER(f, c) do { \ if ((f) != 0) \ ipfw_add_sopt_handler(c, \ sizeof(c) / sizeof(c[0])); \ } while(0) #define IPFW_DEL_SOPT_HANDLER(l, c) do { \ if ((l) != 
0) \ ipfw_del_sopt_handler(c, \ sizeof(c) / sizeof(c[0])); \ } while(0) +#define DEFAULT_OBJHASH_SIZE 32 struct namedobj_instance; typedef int (objhash_cb_t)(struct namedobj_instance *ni, struct named_object *, void *arg); typedef uint32_t (objhash_hash_f)(struct namedobj_instance *ni, const void *key, uint32_t kopt); typedef int (objhash_cmp_f)(struct named_object *no, const void *key, uint32_t kopt); -struct namedobj_instance *ipfw_objhash_create(uint32_t items); +struct namedobj_instance *ipfw_objhash_create(uint32_t items, size_t hash_size); void ipfw_objhash_destroy(struct namedobj_instance *); void ipfw_objhash_bitmap_alloc(uint32_t items, void **idx, int *pblocks); void ipfw_objhash_bitmap_merge(struct namedobj_instance *ni, void **idx, int *blocks); void ipfw_objhash_bitmap_swap(struct namedobj_instance *ni, void **idx, int *blocks); void ipfw_objhash_bitmap_free(void *idx, int blocks); void ipfw_objhash_set_hashf(struct namedobj_instance *ni, objhash_hash_f *f); struct named_object *ipfw_objhash_lookup_name(struct namedobj_instance *ni, uint32_t set, const char *name); struct named_object *ipfw_objhash_lookup_name_type(struct namedobj_instance *ni, uint32_t set, uint32_t type, const char *name); struct named_object *ipfw_objhash_lookup_kidx(struct namedobj_instance *ni, - uint16_t idx); + uint32_t idx); int ipfw_objhash_same_name(struct namedobj_instance *ni, struct named_object *a, struct named_object *b); void ipfw_objhash_add(struct namedobj_instance *ni, struct named_object *no); void ipfw_objhash_del(struct namedobj_instance *ni, struct named_object *no); uint32_t ipfw_objhash_count(struct namedobj_instance *ni); uint32_t ipfw_objhash_count_type(struct namedobj_instance *ni, uint16_t type); int ipfw_objhash_foreach(struct namedobj_instance *ni, objhash_cb_t *f, void *arg); int ipfw_objhash_foreach_type(struct namedobj_instance *ni, objhash_cb_t *f, void *arg, uint16_t type); -int ipfw_objhash_free_idx(struct namedobj_instance *ni, uint16_t idx); -int 
ipfw_objhash_alloc_idx(void *n, uint16_t *pidx); +int ipfw_objhash_free_idx(struct namedobj_instance *ni, uint32_t idx); +int ipfw_objhash_alloc_idx(void *n, uint32_t *pidx); void ipfw_objhash_set_funcs(struct namedobj_instance *ni, objhash_hash_f *hash_f, objhash_cmp_f *cmp_f); int ipfw_objhash_find_type(struct namedobj_instance *ni, struct tid_info *ti, uint32_t etlv, struct named_object **pno); void ipfw_export_obj_ntlv(struct named_object *no, ipfw_obj_ntlv *ntlv); -ipfw_obj_ntlv *ipfw_find_name_tlv_type(void *tlvs, int len, uint16_t uidx, +ipfw_obj_ntlv *ipfw_find_name_tlv_type(void *tlvs, int len, uint32_t uidx, uint32_t etlv); void ipfw_init_obj_rewriter(void); void ipfw_destroy_obj_rewriter(void); void ipfw_add_obj_rewriter(struct opcode_obj_rewrite *rw, size_t count); int ipfw_del_obj_rewriter(struct opcode_obj_rewrite *rw, size_t count); int create_objects_compat(struct ip_fw_chain *ch, ipfw_insn *cmd, struct obj_idx *oib, struct obj_idx *pidx, struct tid_info *ti); -void update_opcode_kidx(ipfw_insn *cmd, uint16_t idx); -int classify_opcode_kidx(ipfw_insn *cmd, uint16_t *puidx); +void update_opcode_kidx(ipfw_insn *cmd, uint32_t idx); +int classify_opcode_kidx(ipfw_insn *cmd, uint32_t *puidx); void ipfw_init_srv(struct ip_fw_chain *ch); void ipfw_destroy_srv(struct ip_fw_chain *ch); int ipfw_check_object_name_generic(const char *name); int ipfw_obj_manage_sets(struct namedobj_instance *ni, uint16_t type, - uint16_t set, uint8_t new_set, enum ipfw_sets_cmd cmd); + uint32_t set, uint8_t new_set, enum ipfw_sets_cmd cmd); /* In ip_fw_eaction.c */ typedef int (ipfw_eaction_t)(struct ip_fw_chain *ch, struct ip_fw_args *args, ipfw_insn *cmd, int *done); int ipfw_eaction_init(struct ip_fw_chain *ch, int first); void ipfw_eaction_uninit(struct ip_fw_chain *ch, int last); -uint16_t ipfw_add_eaction(struct ip_fw_chain *ch, ipfw_eaction_t handler, +uint32_t ipfw_add_eaction(struct ip_fw_chain *ch, ipfw_eaction_t handler, const char *name); -int 
ipfw_del_eaction(struct ip_fw_chain *ch, uint16_t eaction_id); +int ipfw_del_eaction(struct ip_fw_chain *ch, uint32_t eaction_id); int ipfw_run_eaction(struct ip_fw_chain *ch, struct ip_fw_args *args, ipfw_insn *cmd, int *done); int ipfw_reset_eaction(struct ip_fw_chain *ch, struct ip_fw *rule, - uint16_t eaction_id, uint16_t default_id, uint16_t instance_id); -int ipfw_reset_eaction_instance(struct ip_fw_chain *ch, uint16_t eaction_id, - uint16_t instance_id); + uint32_t eaction_id, uint32_t default_id, uint32_t instance_id); +int ipfw_reset_eaction_instance(struct ip_fw_chain *ch, uint32_t eaction_id, + uint32_t instance_id); /* In ip_fw_table.c */ struct table_info; typedef int (table_lookup_t)(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val); -int ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, uint16_t plen, +int ipfw_lookup_table(struct ip_fw_chain *ch, uint32_t tbl, uint16_t plen, void *paddr, uint32_t *val); struct named_object *ipfw_objhash_lookup_table_kidx(struct ip_fw_chain *ch, - uint16_t kidx); -int ipfw_ref_table(struct ip_fw_chain *ch, ipfw_obj_ntlv *ntlv, uint16_t *kidx); -void ipfw_unref_table(struct ip_fw_chain *ch, uint16_t kidx); + uint32_t kidx); +int ipfw_ref_table(struct ip_fw_chain *ch, ipfw_obj_ntlv *ntlv, uint32_t *kidx); +void ipfw_unref_table(struct ip_fw_chain *ch, uint32_t kidx); int ipfw_init_tables(struct ip_fw_chain *ch, int first); int ipfw_resize_tables(struct ip_fw_chain *ch, unsigned int ntables); int ipfw_switch_tables_namespace(struct ip_fw_chain *ch, unsigned int nsets); void ipfw_destroy_tables(struct ip_fw_chain *ch, int last); /* In ip_fw_nat.c -- XXX to be moved to ip_var.h */ extern struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int); typedef int ipfw_nat_t(struct ip_fw_args *, struct cfg_nat *, struct mbuf *); typedef int ipfw_nat_cfg_t(struct sockopt *); VNET_DECLARE(int, ipfw_nat_ready); #define V_ipfw_nat_ready VNET(ipfw_nat_ready) #define IPFW_NAT_LOADED (V_ipfw_nat_ready) extern 
ipfw_nat_t *ipfw_nat_ptr; extern ipfw_nat_cfg_t *ipfw_nat_cfg_ptr; extern ipfw_nat_cfg_t *ipfw_nat_del_ptr; extern ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; extern ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; /* Helper functions for IP checksum adjustment */ static __inline uint16_t cksum_add(uint16_t sum, uint16_t a) { uint16_t res; res = sum + a; return (res + (res < a)); } static __inline uint16_t cksum_adjust(uint16_t oldsum, uint16_t old, uint16_t new) { return (~cksum_add(cksum_add(~oldsum, ~old), new)); } #endif /* _KERNEL */ #endif /* _IPFW2_PRIVATE_H */ diff --git a/sys/netpfil/ipfw/ip_fw_sockopt.c b/sys/netpfil/ipfw/ip_fw_sockopt.c index 2d9a1c6d62fd..907baafd878d 100644 --- a/sys/netpfil/ipfw/ip_fw_sockopt.c +++ b/sys/netpfil/ipfw/ip_fw_sockopt.c @@ -1,4708 +1,3751 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa - * Copyright (c) 2014 Yandex LLC + * Copyright (c) 2014-2025 Yandex LLC * Copyright (c) 2014 Alexander V. Chernikov * * Supported by: Valeria Paoli * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include /* * Control socket and rule management routines for ipfw. * Control is currently implemented via IP_FW3 setsockopt() code. */ #include "opt_ipfw.h" #include "opt_inet.h" #ifndef INET #error IPFIREWALL requires INET. #endif /* INET */ #include "opt_inet6.h" #include #include #include #include /* struct m_tag used by nested headers */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* hooks */ #include #include #include #ifdef MAC #include #endif -static int ipfw_ctl(struct sockopt *sopt); +static enum ipfw_opcheck_result +check_opcode_compat_nop(ipfw_insn **pcmd, int *plen, + struct rule_check_info *ci) +{ + /* Compatibility code is not registered */ + return (FAILED); +} + +static ipfw_check_opcode_t check_opcode_f = check_opcode_compat_nop; + static int check_ipfw_rule_body(ipfw_insn *cmd, int cmd_len, struct rule_check_info *ci); -static int check_ipfw_rule1(struct ip_fw_rule *rule, int size, - struct rule_check_info *ci); -static int check_ipfw_rule0(struct ip_fw_rule0 *rule, int size, - struct rule_check_info *ci); static int rewrite_rule_uidx(struct ip_fw_chain *chain, struct rule_check_info *ci); -#define NAMEDOBJ_HASH_SIZE 32 - struct namedobj_instance { struct namedobjects_head *names; struct namedobjects_head *values; uint32_t nn_size; /* names hash size */ uint32_t nv_size; /* number hash size */ u_long *idx_mask; /* used 
items bitmask */ uint32_t max_blocks; /* number of "long" blocks in bitmask */ uint32_t count; /* number of items */ uint16_t free_off[IPFW_MAX_SETS]; /* first possible free offset */ objhash_hash_f *hash_f; objhash_cmp_f *cmp_f; }; #define BLOCK_ITEMS (8 * sizeof(u_long)) /* Number of items for ffsl() */ static uint32_t objhash_hash_name(struct namedobj_instance *ni, const void *key, uint32_t kopt); static uint32_t objhash_hash_idx(struct namedobj_instance *ni, uint32_t val); static int objhash_cmp_name(struct named_object *no, const void *name, uint32_t set); MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's"); -static int dump_config(struct ip_fw_chain *chain, ip_fw3_opheader *op3, - struct sockopt_data *sd); -static int add_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, - struct sockopt_data *sd); -static int del_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, - struct sockopt_data *sd); -static int clear_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, - struct sockopt_data *sd); -static int move_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, - struct sockopt_data *sd); -static int manage_sets(struct ip_fw_chain *chain, ip_fw3_opheader *op3, - struct sockopt_data *sd); -static int dump_soptcodes(struct ip_fw_chain *chain, ip_fw3_opheader *op3, - struct sockopt_data *sd); -static int dump_srvobjects(struct ip_fw_chain *chain, ip_fw3_opheader *op3, - struct sockopt_data *sd); - /* ctl3 handler data */ -struct mtx ctl3_lock; +static struct mtx ctl3_lock; #define CTL3_LOCK_INIT() mtx_init(&ctl3_lock, "ctl3_lock", NULL, MTX_DEF) #define CTL3_LOCK_DESTROY() mtx_destroy(&ctl3_lock) #define CTL3_LOCK() mtx_lock(&ctl3_lock) #define CTL3_UNLOCK() mtx_unlock(&ctl3_lock) static struct ipfw_sopt_handler *ctl3_handlers; static size_t ctl3_hsize; static uint64_t ctl3_refct, ctl3_gencnt; #define CTL3_SMALLBUF 4096 /* small page-size write buffer */ -#define CTL3_LARGEBUF 16 * 1024 * 1024 /* handle large rulesets */ +#define CTL3_LARGEBUF 
(16 * 1024 * 1024) /* handle large rulesets */ static int ipfw_flush_sopt_data(struct sockopt_data *sd); -static struct ipfw_sopt_handler scodes[] = { - { IP_FW_XGET, 0, HDIR_GET, dump_config }, - { IP_FW_XADD, 0, HDIR_BOTH, add_rules }, - { IP_FW_XDEL, 0, HDIR_BOTH, del_rules }, - { IP_FW_XZERO, 0, HDIR_SET, clear_rules }, - { IP_FW_XRESETLOG, 0, HDIR_SET, clear_rules }, - { IP_FW_XMOVE, 0, HDIR_SET, move_rules }, - { IP_FW_SET_SWAP, 0, HDIR_SET, manage_sets }, - { IP_FW_SET_MOVE, 0, HDIR_SET, manage_sets }, - { IP_FW_SET_ENABLE, 0, HDIR_SET, manage_sets }, - { IP_FW_DUMP_SOPTCODES, 0, HDIR_GET, dump_soptcodes }, - { IP_FW_DUMP_SRVOBJECTS,0, HDIR_GET, dump_srvobjects }, +static sopt_handler_f dump_config, add_rules, del_rules, clear_rules, + move_rules, manage_sets, dump_soptcodes, dump_srvobjects, + manage_skiptocache; + +static struct ipfw_sopt_handler scodes[] = { + { IP_FW_XGET, IP_FW3_OPVER, HDIR_GET, dump_config }, + { IP_FW_XADD, IP_FW3_OPVER, HDIR_BOTH, add_rules }, + { IP_FW_XDEL, IP_FW3_OPVER, HDIR_BOTH, del_rules }, + { IP_FW_XZERO, IP_FW3_OPVER, HDIR_SET, clear_rules }, + { IP_FW_XRESETLOG, IP_FW3_OPVER, HDIR_SET, clear_rules }, + { IP_FW_XMOVE, IP_FW3_OPVER, HDIR_SET, move_rules }, + { IP_FW_SET_SWAP, IP_FW3_OPVER, HDIR_SET, manage_sets }, + { IP_FW_SET_MOVE, IP_FW3_OPVER, HDIR_SET, manage_sets }, + { IP_FW_SET_ENABLE, IP_FW3_OPVER, HDIR_SET, manage_sets }, + { IP_FW_DUMP_SOPTCODES, IP_FW3_OPVER, HDIR_GET, dump_soptcodes }, + { IP_FW_DUMP_SRVOBJECTS, IP_FW3_OPVER, HDIR_GET, dump_srvobjects }, + { IP_FW_SKIPTO_CACHE, IP_FW3_OPVER, HDIR_BOTH, manage_skiptocache }, }; -static int -set_legacy_obj_kidx(struct ip_fw_chain *ch, struct ip_fw_rule0 *rule); static struct opcode_obj_rewrite *find_op_rw(ipfw_insn *cmd, - uint16_t *puidx, uint8_t *ptype); + uint32_t *puidx, uint8_t *ptype); static int ref_rule_objects(struct ip_fw_chain *ch, struct ip_fw *rule, struct rule_check_info *ci, struct obj_idx *oib, struct tid_info *ti); static int 
ref_opcode_object(struct ip_fw_chain *ch, ipfw_insn *cmd, struct tid_info *ti, struct obj_idx *pidx, int *unresolved); static void unref_rule_objects(struct ip_fw_chain *chain, struct ip_fw *rule); static void unref_oib_objects(struct ip_fw_chain *ch, ipfw_insn *cmd, struct obj_idx *oib, struct obj_idx *end); -static int export_objhash_ntlv(struct namedobj_instance *ni, uint16_t kidx, +static int export_objhash_ntlv(struct namedobj_instance *ni, uint32_t kidx, struct sockopt_data *sd); /* * Opcode object rewriter variables */ struct opcode_obj_rewrite *ctl3_rewriters; static size_t ctl3_rsize; /* * static variables followed by global ones */ VNET_DEFINE_STATIC(uma_zone_t, ipfw_cntr_zone); #define V_ipfw_cntr_zone VNET(ipfw_cntr_zone) void ipfw_init_counters(void) { V_ipfw_cntr_zone = uma_zcreate("IPFW counters", IPFW_RULE_CNTR_SIZE, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_PCPU); } void ipfw_destroy_counters(void) { uma_zdestroy(V_ipfw_cntr_zone); } struct ip_fw * ipfw_alloc_rule(struct ip_fw_chain *chain, size_t rulesize) { struct ip_fw *rule; rule = malloc(rulesize, M_IPFW, M_WAITOK | M_ZERO); rule->cntr = uma_zalloc_pcpu(V_ipfw_cntr_zone, M_WAITOK | M_ZERO); rule->refcnt = 1; return (rule); } void ipfw_free_rule(struct ip_fw *rule) { /* * We don't release refcnt here, since this function * can be called without any locks held. The caller * must release reference under IPFW_UH_WLOCK, and then * call this function if refcount becomes 1. */ if (rule->refcnt > 1) return; uma_zfree_pcpu(V_ipfw_cntr_zone, rule->cntr); free(rule, M_IPFW); } /* * Find the smallest rule >= key, id. 
* We could use bsearch but it is so simple that we code it directly */ int ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id) { int i, lo, hi; struct ip_fw *r; for (lo = 0, hi = chain->n_rules - 1; lo < hi;) { i = (lo + hi) / 2; r = chain->map[i]; if (r->rulenum < key) lo = i + 1; /* continue from the next one */ else if (r->rulenum > key) hi = i; /* this might be good */ else if (r->id < id) lo = i + 1; /* continue from the next one */ else /* r->id >= id */ hi = i; /* this might be good */ } return hi; } /* * Builds skipto cache on rule set @map. */ static void update_skipto_cache(struct ip_fw_chain *chain, struct ip_fw **map) { - int *smap, rulenum; + uint32_t *smap, rulenum; int i, mi; IPFW_UH_WLOCK_ASSERT(chain); mi = 0; rulenum = map[mi]->rulenum; smap = chain->idxmap_back; if (smap == NULL) return; - for (i = 0; i < 65536; i++) { + for (i = 0; i <= IPFW_DEFAULT_RULE; i++) { smap[i] = mi; /* Use the same rule index until i < rulenum */ - if (i != rulenum || i == 65535) + if (i != rulenum || i == IPFW_DEFAULT_RULE) continue; /* Find next rule with num > i */ rulenum = map[++mi]->rulenum; while (rulenum == i) rulenum = map[++mi]->rulenum; } } /* * Swaps prepared (backup) index with current one. */ static void swap_skipto_cache(struct ip_fw_chain *chain) { - int *map; + uint32_t *map; IPFW_UH_WLOCK_ASSERT(chain); IPFW_WLOCK_ASSERT(chain); map = chain->idxmap; chain->idxmap = chain->idxmap_back; chain->idxmap_back = map; } /* * Allocate and initialize skipto cache. 
*/ void ipfw_init_skipto_cache(struct ip_fw_chain *chain) { - int *idxmap, *idxmap_back; + uint32_t *idxmap, *idxmap_back; - idxmap = malloc(65536 * sizeof(int), M_IPFW, M_WAITOK | M_ZERO); - idxmap_back = malloc(65536 * sizeof(int), M_IPFW, M_WAITOK); + idxmap = malloc((IPFW_DEFAULT_RULE + 1) * sizeof(uint32_t), + M_IPFW, M_WAITOK | M_ZERO); + idxmap_back = malloc((IPFW_DEFAULT_RULE + 1) * sizeof(uint32_t), + M_IPFW, M_WAITOK | M_ZERO); /* * Note we may be called at any time after initialization, * for example, on first skipto rule, so we need to * provide valid chain->idxmap on return */ IPFW_UH_WLOCK(chain); if (chain->idxmap != NULL) { IPFW_UH_WUNLOCK(chain); free(idxmap, M_IPFW); free(idxmap_back, M_IPFW); return; } /* Set backup pointer first to permit building cache */ chain->idxmap_back = idxmap_back; - update_skipto_cache(chain, chain->map); + if (V_skipto_cache != 0) + update_skipto_cache(chain, chain->map); IPFW_WLOCK(chain); /* It is now safe to set chain->idxmap ptr */ chain->idxmap = idxmap; swap_skipto_cache(chain); IPFW_WUNLOCK(chain); IPFW_UH_WUNLOCK(chain); } /* * Destroys skipto cache. */ void ipfw_destroy_skipto_cache(struct ip_fw_chain *chain) { - - if (chain->idxmap != NULL) - free(chain->idxmap, M_IPFW); - if (chain->idxmap != NULL) - free(chain->idxmap_back, M_IPFW); + free(chain->idxmap, M_IPFW); + free(chain->idxmap_back, M_IPFW); } /* * allocate a new map, returns the chain locked. extra is the number * of entries to add or delete. */ static struct ip_fw ** get_map(struct ip_fw_chain *chain, int extra, int locked) { for (;;) { struct ip_fw **map; u_int i, mflags; mflags = M_ZERO | ((locked != 0) ? 
M_NOWAIT : M_WAITOK); i = chain->n_rules + extra; map = malloc(i * sizeof(struct ip_fw *), M_IPFW, mflags); if (map == NULL) { printf("%s: cannot allocate map\n", __FUNCTION__); return NULL; } if (!locked) IPFW_UH_WLOCK(chain); if (i >= chain->n_rules + extra) /* good */ return map; /* otherwise we lost the race, free and retry */ if (!locked) IPFW_UH_WUNLOCK(chain); free(map, M_IPFW); } } /* * swap the maps. It is supposed to be called with IPFW_UH_WLOCK */ static struct ip_fw ** swap_map(struct ip_fw_chain *chain, struct ip_fw **new_map, int new_len) { struct ip_fw **old_map; IPFW_WLOCK(chain); chain->id++; chain->n_rules = new_len; old_map = chain->map; chain->map = new_map; swap_skipto_cache(chain); IPFW_WUNLOCK(chain); return old_map; } static void export_cntr1_base(struct ip_fw *krule, struct ip_fw_bcounter *cntr) { struct timeval boottime; cntr->size = sizeof(*cntr); if (krule->cntr != NULL) { cntr->pcnt = counter_u64_fetch(krule->cntr); cntr->bcnt = counter_u64_fetch(krule->cntr + 1); cntr->timestamp = krule->timestamp; } if (cntr->timestamp > 0) { getboottime(&boottime); cntr->timestamp += boottime.tv_sec; } } -static void -export_cntr0_base(struct ip_fw *krule, struct ip_fw_bcounter0 *cntr) -{ - struct timeval boottime; - - if (krule->cntr != NULL) { - cntr->pcnt = counter_u64_fetch(krule->cntr); - cntr->bcnt = counter_u64_fetch(krule->cntr + 1); - cntr->timestamp = krule->timestamp; - } - if (cntr->timestamp > 0) { - getboottime(&boottime); - cntr->timestamp += boottime.tv_sec; - } -} - -/* - * Copies rule @urule from v1 userland format (current). - * to kernel @krule. - * Assume @krule is zeroed. 
- */ -static void -import_rule1(struct rule_check_info *ci) -{ - struct ip_fw_rule *urule; - struct ip_fw *krule; - - urule = (struct ip_fw_rule *)ci->urule; - krule = (struct ip_fw *)ci->krule; - - /* copy header */ - krule->act_ofs = urule->act_ofs; - krule->cmd_len = urule->cmd_len; - krule->rulenum = urule->rulenum; - krule->set = urule->set; - krule->flags = urule->flags; - - /* Save rulenum offset */ - ci->urule_numoff = offsetof(struct ip_fw_rule, rulenum); - - /* Copy opcodes */ - memcpy(krule->cmd, urule->cmd, krule->cmd_len * sizeof(uint32_t)); -} - /* * Export rule into v1 format (Current). * Layout: * [ ipfw_obj_tlv(IPFW_TLV_RULE_ENT) * [ ip_fw_rule ] OR * [ ip_fw_bcounter ip_fw_rule] (depends on rcntrs). * ] * Assume @data is zeroed. */ static void export_rule1(struct ip_fw *krule, caddr_t data, int len, int rcntrs) { struct ip_fw_bcounter *cntr; struct ip_fw_rule *urule; ipfw_obj_tlv *tlv; /* Fill in TLV header */ tlv = (ipfw_obj_tlv *)data; tlv->type = IPFW_TLV_RULE_ENT; tlv->length = len; if (rcntrs != 0) { /* Copy counters */ cntr = (struct ip_fw_bcounter *)(tlv + 1); urule = (struct ip_fw_rule *)(cntr + 1); export_cntr1_base(krule, cntr); } else urule = (struct ip_fw_rule *)(tlv + 1); /* copy header */ urule->act_ofs = krule->act_ofs; urule->cmd_len = krule->cmd_len; urule->rulenum = krule->rulenum; urule->set = krule->set; urule->flags = krule->flags; urule->id = krule->id; /* Copy opcodes */ memcpy(urule->cmd, krule->cmd, krule->cmd_len * sizeof(uint32_t)); } -/* - * Copies rule @urule from FreeBSD8 userland format (v0) - * to kernel @krule. - * Assume @krule is zeroed. 
- */ -static void -import_rule0(struct rule_check_info *ci) -{ - struct ip_fw_rule0 *urule; - struct ip_fw *krule; - int cmdlen, l; - ipfw_insn *cmd; - ipfw_insn_limit *lcmd; - ipfw_insn_if *cmdif; - - urule = (struct ip_fw_rule0 *)ci->urule; - krule = (struct ip_fw *)ci->krule; - - /* copy header */ - krule->act_ofs = urule->act_ofs; - krule->cmd_len = urule->cmd_len; - krule->rulenum = urule->rulenum; - krule->set = urule->set; - if ((urule->_pad & 1) != 0) - krule->flags |= IPFW_RULE_NOOPT; - - /* Save rulenum offset */ - ci->urule_numoff = offsetof(struct ip_fw_rule0, rulenum); - - /* Copy opcodes */ - memcpy(krule->cmd, urule->cmd, krule->cmd_len * sizeof(uint32_t)); - - /* - * Alter opcodes: - * 1) convert tablearg value from 65535 to 0 - * 2) Add high bit to O_SETFIB/O_SETDSCP values (to make room - * for targ). - * 3) convert table number in iface opcodes to u16 - * 4) convert old `nat global` into new 65535 - */ - l = krule->cmd_len; - cmd = krule->cmd; - cmdlen = 0; - - for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { - cmdlen = F_LEN(cmd); - - switch (cmd->opcode) { - /* Opcodes supporting tablearg */ - case O_TAG: - case O_TAGGED: - case O_PIPE: - case O_QUEUE: - case O_DIVERT: - case O_TEE: - case O_SKIPTO: - case O_CALLRETURN: - case O_NETGRAPH: - case O_NGTEE: - case O_NAT: - if (cmd->arg1 == IP_FW_TABLEARG) - cmd->arg1 = IP_FW_TARG; - else if (cmd->arg1 == 0) - cmd->arg1 = IP_FW_NAT44_GLOBAL; - break; - case O_SETFIB: - case O_SETDSCP: - case O_SETMARK: - case O_MARK: - if (cmd->arg1 == IP_FW_TABLEARG) - cmd->arg1 = IP_FW_TARG; - else - cmd->arg1 |= 0x8000; - break; - case O_LIMIT: - lcmd = (ipfw_insn_limit *)cmd; - if (lcmd->conn_limit == IP_FW_TABLEARG) - lcmd->conn_limit = IP_FW_TARG; - break; - /* Interface tables */ - case O_XMIT: - case O_RECV: - case O_VIA: - /* Interface table, possibly */ - cmdif = (ipfw_insn_if *)cmd; - if (cmdif->name[0] != '\1') - break; - - cmdif->p.kidx = (uint16_t)cmdif->p.glob; - break; - } - } -} - -/* - * Copies rule 
@krule from kernel to FreeBSD8 userland format (v0) - */ -static void -export_rule0(struct ip_fw *krule, struct ip_fw_rule0 *urule, int len) -{ - int cmdlen, l; - ipfw_insn *cmd; - ipfw_insn_limit *lcmd; - ipfw_insn_if *cmdif; - - /* copy header */ - memset(urule, 0, len); - urule->act_ofs = krule->act_ofs; - urule->cmd_len = krule->cmd_len; - urule->rulenum = krule->rulenum; - urule->set = krule->set; - if ((krule->flags & IPFW_RULE_NOOPT) != 0) - urule->_pad |= 1; - - /* Copy opcodes */ - memcpy(urule->cmd, krule->cmd, krule->cmd_len * sizeof(uint32_t)); - - /* Export counters */ - export_cntr0_base(krule, (struct ip_fw_bcounter0 *)&urule->pcnt); - - /* - * Alter opcodes: - * 1) convert tablearg value from 0 to 65535 - * 2) Remove highest bit from O_SETFIB/O_SETDSCP values. - * 3) convert table number in iface opcodes to int - */ - l = urule->cmd_len; - cmd = urule->cmd; - cmdlen = 0; - - for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { - cmdlen = F_LEN(cmd); - - switch (cmd->opcode) { - /* Opcodes supporting tablearg */ - case O_TAG: - case O_TAGGED: - case O_PIPE: - case O_QUEUE: - case O_DIVERT: - case O_TEE: - case O_SKIPTO: - case O_CALLRETURN: - case O_NETGRAPH: - case O_NGTEE: - case O_NAT: - if (cmd->arg1 == IP_FW_TARG) - cmd->arg1 = IP_FW_TABLEARG; - else if (cmd->arg1 == IP_FW_NAT44_GLOBAL) - cmd->arg1 = 0; - break; - case O_SETFIB: - case O_SETDSCP: - case O_SETMARK: - case O_MARK: - if (cmd->arg1 == IP_FW_TARG) - cmd->arg1 = IP_FW_TABLEARG; - else - cmd->arg1 &= ~0x8000; - break; - case O_LIMIT: - lcmd = (ipfw_insn_limit *)cmd; - if (lcmd->conn_limit == IP_FW_TARG) - lcmd->conn_limit = IP_FW_TABLEARG; - break; - /* Interface tables */ - case O_XMIT: - case O_RECV: - case O_VIA: - /* Interface table, possibly */ - cmdif = (ipfw_insn_if *)cmd; - if (cmdif->name[0] != '\1') - break; - - cmdif->p.glob = cmdif->p.kidx; - break; - } - } -} - /* * Add new rule(s) to the list possibly creating rule number for each. 
* Update the rule_number in the input struct so the caller knows it as well. * Must be called without IPFW_UH held */ -static int -commit_rules(struct ip_fw_chain *chain, struct rule_check_info *rci, int count) +int +ipfw_commit_rules(struct ip_fw_chain *chain, struct rule_check_info *rci, + int count) { - int error, i, insert_before, tcount; - uint16_t rulenum, *pnum; + int error, i, insert_before, tcount, rule_idx, last_rule_idx; + uint32_t rulenum; struct rule_check_info *ci; struct ip_fw *krule; struct ip_fw **map; /* the new array of pointers */ /* Check if we need to do table/obj index remap */ tcount = 0; for (ci = rci, i = 0; i < count; ci++, i++) { if (ci->object_opcodes == 0) continue; /* * Rule has some object opcodes. * We need to find (and create non-existing) * kernel objects, and reference existing ones. */ error = rewrite_rule_uidx(chain, ci); if (error != 0) { + /* * rewrite failed, state for current rule * has been reverted. Check if we need to * revert more. */ if (tcount > 0) { + /* * We have some more table rules * we need to rollback. */ IPFW_UH_WLOCK(chain); while (ci != rci) { ci--; if (ci->object_opcodes == 0) continue; unref_rule_objects(chain,ci->krule); + } IPFW_UH_WUNLOCK(chain); + } return (error); } tcount++; } /* get_map returns with IPFW_UH_WLOCK if successful */ map = get_map(chain, count, 0 /* not locked */); if (map == NULL) { if (tcount > 0) { /* Unbind tables */ IPFW_UH_WLOCK(chain); for (ci = rci, i = 0; i < count; ci++, i++) { if (ci->object_opcodes == 0) continue; unref_rule_objects(chain, ci->krule); } IPFW_UH_WUNLOCK(chain); } return (ENOSPC); } if (V_autoinc_step < 1) V_autoinc_step = 1; else if (V_autoinc_step > 1000) V_autoinc_step = 1000; - /* FIXME: Handle count > 1 */ - ci = rci; - krule = ci->krule; - rulenum = krule->rulenum; - - /* find the insertion point, we will insert before */ - insert_before = rulenum ? 
rulenum + 1 : IPFW_DEFAULT_RULE; - i = ipfw_find_rule(chain, insert_before, 0); - /* duplicate first part */ - if (i > 0) - bcopy(chain->map, map, i * sizeof(struct ip_fw *)); - map[i] = krule; - /* duplicate remaining part, we always have the default rule */ - bcopy(chain->map + i, map + i + 1, - sizeof(struct ip_fw *) *(chain->n_rules - i)); - if (rulenum == 0) { - /* Compute rule number and write it back */ - rulenum = i > 0 ? map[i-1]->rulenum : 0; - if (rulenum < IPFW_DEFAULT_RULE - V_autoinc_step) - rulenum += V_autoinc_step; - krule->rulenum = rulenum; - /* Save number to userland rule */ - pnum = (uint16_t *)((caddr_t)ci->urule + ci->urule_numoff); - *pnum = rulenum; + last_rule_idx = 0; + for (ci = rci, i = 0; i < count; ci++, i++) { + krule = ci->krule; + rulenum = krule->rulenum; + + krule->id = chain->id + 1; + + /* find the insertion point, we will insert before */ + insert_before = rulenum ? rulenum + 1 : IPFW_DEFAULT_RULE; + rule_idx = ipfw_find_rule(chain, insert_before, 0); + /* duplicate the previous part */ + if (last_rule_idx < rule_idx) + bcopy(chain->map + last_rule_idx, map + last_rule_idx + i, + (rule_idx - last_rule_idx) * sizeof(struct ip_fw *)); + last_rule_idx = rule_idx; + map[rule_idx + i] = krule; + if (rulenum == 0) { + /* Compute rule number and write it back */ + rulenum = rule_idx + i > 0 ? 
map[rule_idx + i - 1]->rulenum : 0; + if (rulenum < IPFW_DEFAULT_RULE - V_autoinc_step) + rulenum += V_autoinc_step; + krule->rulenum = rulenum; + /* Save number to userland rule */ + memcpy((char *)ci->urule + ci->urule_numoff, &rulenum, + sizeof(rulenum)); + } } - krule->id = chain->id + 1; - update_skipto_cache(chain, map); - map = swap_map(chain, map, chain->n_rules + 1); - chain->static_len += RULEUSIZE0(krule); + /* duplicate the remaining part, we always have the default rule */ + bcopy(chain->map + last_rule_idx, map + last_rule_idx + count, + (chain->n_rules - last_rule_idx) * sizeof(struct ip_fw *)); + + if (V_skipto_cache != 0) + update_skipto_cache(chain, map); + map = swap_map(chain, map, chain->n_rules + count); IPFW_UH_WUNLOCK(chain); if (map) free(map, M_IPFW); return (0); } int ipfw_add_protected_rule(struct ip_fw_chain *chain, struct ip_fw *rule, int locked) { struct ip_fw **map; map = get_map(chain, 1, locked); if (map == NULL) return (ENOMEM); if (chain->n_rules > 0) bcopy(chain->map, map, chain->n_rules * sizeof(struct ip_fw *)); map[chain->n_rules] = rule; rule->rulenum = IPFW_DEFAULT_RULE; rule->set = RESVD_SET; rule->id = chain->id + 1; /* We add rule in the end of chain, no need to update skipto cache */ map = swap_map(chain, map, chain->n_rules + 1); - chain->static_len += RULEUSIZE0(rule); IPFW_UH_WUNLOCK(chain); free(map, M_IPFW); return (0); } /* * Adds @rule to the list of rules to reap */ void ipfw_reap_add(struct ip_fw_chain *chain, struct ip_fw **head, struct ip_fw *rule) { IPFW_UH_WLOCK_ASSERT(chain); /* Unlink rule from everywhere */ unref_rule_objects(chain, rule); rule->next = *head; *head = rule; } /* * Reclaim storage associated with a list of rules. This is * typically the list created using remove_rule. * A NULL pointer on input is handled correctly. 
*/ void ipfw_reap_rules(struct ip_fw *head) { struct ip_fw *rule; while ((rule = head) != NULL) { head = head->next; ipfw_free_rule(rule); } } /* * Rules to keep are * (default || reserved || !match_set || !match_number) * where * default ::= (rule->rulenum == IPFW_DEFAULT_RULE) * // the default rule is always protected * * reserved ::= (cmd == 0 && n == 0 && rule->set == RESVD_SET) * // RESVD_SET is protected only if cmd == 0 and n == 0 ("ipfw flush") * * match_set ::= (cmd == 0 || rule->set == set) * // set number is ignored for cmd == 0 * * match_number ::= (cmd == 1 || n == 0 || n == rule->rulenum) * // number is ignored for cmd == 1 or n == 0 * */ int ipfw_match_range(struct ip_fw *rule, ipfw_range_tlv *rt) { /* Don't match default rule for modification queries */ if (rule->rulenum == IPFW_DEFAULT_RULE && (rt->flags & IPFW_RCFLAG_DEFAULT) == 0) return (0); /* Don't match rules in reserved set for flush requests */ if ((rt->flags & IPFW_RCFLAG_ALL) != 0 && rule->set == RESVD_SET) return (0); /* If we're filtering by set, don't match other sets */ if ((rt->flags & IPFW_RCFLAG_SET) != 0 && rule->set != rt->set) return (0); if ((rt->flags & IPFW_RCFLAG_RANGE) != 0 && (rule->rulenum < rt->start_rule || rule->rulenum > rt->end_rule)) return (0); return (1); } struct manage_sets_args { - uint16_t set; + uint32_t set; uint8_t new_set; }; static int swap_sets_cb(struct namedobj_instance *ni, struct named_object *no, void *arg) { struct manage_sets_args *args; args = (struct manage_sets_args *)arg; if (no->set == (uint8_t)args->set) no->set = args->new_set; else if (no->set == args->new_set) no->set = (uint8_t)args->set; return (0); } static int move_sets_cb(struct namedobj_instance *ni, struct named_object *no, void *arg) { struct manage_sets_args *args; args = (struct manage_sets_args *)arg; if (no->set == (uint8_t)args->set) no->set = args->new_set; return (0); } static int test_sets_cb(struct namedobj_instance *ni, struct named_object *no, void *arg) { struct 
manage_sets_args *args; args = (struct manage_sets_args *)arg; if (no->set != (uint8_t)args->set) return (0); if (ipfw_objhash_lookup_name_type(ni, args->new_set, no->etlv, no->name) != NULL) return (EEXIST); return (0); } /* * Generic function to handler moving and swapping sets. */ int ipfw_obj_manage_sets(struct namedobj_instance *ni, uint16_t type, - uint16_t set, uint8_t new_set, enum ipfw_sets_cmd cmd) + uint32_t set, uint8_t new_set, enum ipfw_sets_cmd cmd) { struct manage_sets_args args; struct named_object *no; args.set = set; args.new_set = new_set; switch (cmd) { case SWAP_ALL: return (ipfw_objhash_foreach_type(ni, swap_sets_cb, &args, type)); case TEST_ALL: return (ipfw_objhash_foreach_type(ni, test_sets_cb, &args, type)); case MOVE_ALL: return (ipfw_objhash_foreach_type(ni, move_sets_cb, &args, type)); case COUNT_ONE: /* * @set used to pass kidx. * When @new_set is zero - reset object counter, * otherwise increment it. */ no = ipfw_objhash_lookup_kidx(ni, set); if (new_set != 0) no->ocnt++; else no->ocnt = 0; return (0); case TEST_ONE: /* @set used to pass kidx */ no = ipfw_objhash_lookup_kidx(ni, set); /* * First check number of references: * when it differs, this mean other rules are holding * reference to given object, so it is not possible to * change its set. Note that refcnt may account references * to some going-to-be-added rules. Since we don't know * their numbers (and even if they will be added) it is * perfectly OK to return error here. */ if (no->ocnt != no->refcnt) return (EBUSY); if (ipfw_objhash_lookup_name_type(ni, new_set, type, no->name) != NULL) return (EEXIST); return (0); case MOVE_ONE: /* @set used to pass kidx */ no = ipfw_objhash_lookup_kidx(ni, set); no->set = new_set; return (0); } return (EINVAL); } /* * Delete rules matching range @rt. * Saves number of deleted rules in @ndel. * * Returns 0 on success. 
*/ -static int +int delete_range(struct ip_fw_chain *chain, ipfw_range_tlv *rt, int *ndel) { struct ip_fw *reap, *rule, **map; - int end, start; + uint32_t end, start; int i, n, ndyn, ofs; reap = NULL; IPFW_UH_WLOCK(chain); /* arbitrate writers */ /* * Stage 1: Determine range to inspect. * Range is half-inclusive, e.g [start, end). */ start = 0; end = chain->n_rules - 1; if ((rt->flags & IPFW_RCFLAG_RANGE) != 0) { start = ipfw_find_rule(chain, rt->start_rule, 0); if (rt->end_rule >= IPFW_DEFAULT_RULE) rt->end_rule = IPFW_DEFAULT_RULE - 1; end = ipfw_find_rule(chain, rt->end_rule, UINT32_MAX); } if (rt->flags & IPFW_RCFLAG_DYNAMIC) { /* * Requested deleting only for dynamic states. */ *ndel = 0; ipfw_expire_dyn_states(chain, rt); IPFW_UH_WUNLOCK(chain); return (0); } /* Allocate new map of the same size */ map = get_map(chain, 0, 1 /* locked */); if (map == NULL) { IPFW_UH_WUNLOCK(chain); return (ENOMEM); } n = 0; ndyn = 0; ofs = start; /* 1. bcopy the initial part of the map */ if (start > 0) bcopy(chain->map, map, start * sizeof(struct ip_fw *)); /* 2. copy active rules between start and end */ for (i = start; i < end; i++) { rule = chain->map[i]; if (ipfw_match_range(rule, rt) == 0) { map[ofs++] = rule; continue; } n++; if (ipfw_is_dyn_rule(rule) != 0) ndyn++; } /* 3. copy the final part of the map */ bcopy(chain->map + end, map + ofs, (chain->n_rules - end) * sizeof(struct ip_fw *)); /* 4. recalculate skipto cache */ update_skipto_cache(chain, map); /* 5. swap the maps (under UH_WLOCK + WHLOCK) */ map = swap_map(chain, map, chain->n_rules - n); /* 6. Remove all dynamic states originated by deleted rules */ if (ndyn > 0) ipfw_expire_dyn_states(chain, rt); /* 7. 
now remove the rules deleted from the old map */ for (i = start; i < end; i++) { rule = map[i]; if (ipfw_match_range(rule, rt) == 0) continue; - chain->static_len -= RULEUSIZE0(rule); ipfw_reap_add(chain, &reap, rule); } IPFW_UH_WUNLOCK(chain); ipfw_reap_rules(reap); if (map != NULL) free(map, M_IPFW); *ndel = n; return (0); } static int move_objects(struct ip_fw_chain *ch, ipfw_range_tlv *rt) { struct opcode_obj_rewrite *rw; struct ip_fw *rule; ipfw_insn *cmd; + uint32_t kidx; int cmdlen, i, l, c; - uint16_t kidx; IPFW_UH_WLOCK_ASSERT(ch); /* Stage 1: count number of references by given rules */ for (c = 0, i = 0; i < ch->n_rules - 1; i++) { rule = ch->map[i]; if (ipfw_match_range(rule, rt) == 0) continue; if (rule->set == rt->new_set) /* nothing to do */ continue; /* Search opcodes with named objects */ for (l = rule->cmd_len, cmdlen = 0, cmd = rule->cmd; l > 0; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); rw = find_op_rw(cmd, &kidx, NULL); if (rw == NULL || rw->manage_sets == NULL) continue; /* * When manage_sets() returns non-zero value to * COUNT_ONE command, consider this as an object * doesn't support sets (e.g. disabled with sysctl). * So, skip checks for this object. 
*/ if (rw->manage_sets(ch, kidx, 1, COUNT_ONE) != 0) continue; c++; } } if (c == 0) /* No objects found */ return (0); /* Stage 2: verify "ownership" */ for (c = 0, i = 0; (i < ch->n_rules - 1) && c == 0; i++) { rule = ch->map[i]; if (ipfw_match_range(rule, rt) == 0) continue; if (rule->set == rt->new_set) /* nothing to do */ continue; /* Search opcodes with named objects */ for (l = rule->cmd_len, cmdlen = 0, cmd = rule->cmd; l > 0 && c == 0; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); rw = find_op_rw(cmd, &kidx, NULL); if (rw == NULL || rw->manage_sets == NULL) continue; /* Test for ownership and conflicting names */ c = rw->manage_sets(ch, kidx, (uint8_t)rt->new_set, TEST_ONE); } } /* Stage 3: change set and cleanup */ for (i = 0; i < ch->n_rules - 1; i++) { rule = ch->map[i]; if (ipfw_match_range(rule, rt) == 0) continue; if (rule->set == rt->new_set) /* nothing to do */ continue; /* Search opcodes with named objects */ for (l = rule->cmd_len, cmdlen = 0, cmd = rule->cmd; l > 0; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); rw = find_op_rw(cmd, &kidx, NULL); if (rw == NULL || rw->manage_sets == NULL) continue; /* cleanup object counter */ rw->manage_sets(ch, kidx, 0 /* reset counter */, COUNT_ONE); if (c != 0) continue; /* change set */ rw->manage_sets(ch, kidx, (uint8_t)rt->new_set, MOVE_ONE); } } return (c); } /* * Changes set of given rule rannge @rt * with each other. * * Returns 0 on success. */ static int move_range(struct ip_fw_chain *chain, ipfw_range_tlv *rt) { struct ip_fw *rule; int i; IPFW_UH_WLOCK(chain); /* * Move rules with matching paramenerts to a new set. * This one is much more complex. We have to ensure * that all referenced tables (if any) are referenced * by given rule subset only. Otherwise, we can't move * them to new set and have to return error. 
*/ if ((i = move_objects(chain, rt)) != 0) { IPFW_UH_WUNLOCK(chain); return (i); } /* XXX: We have to do swap holding WLOCK */ for (i = 0; i < chain->n_rules; i++) { rule = chain->map[i]; if (ipfw_match_range(rule, rt) == 0) continue; rule->set = rt->new_set; } IPFW_UH_WUNLOCK(chain); return (0); } /* * Returns pointer to action instruction, skips all possible rule * modifiers like O_LOG, O_TAG, O_ALTQ. */ ipfw_insn * ipfw_get_action(struct ip_fw *rule) { ipfw_insn *cmd; int l, cmdlen; cmd = ACTION_PTR(rule); l = rule->cmd_len - rule->act_ofs; while (l > 0) { switch (cmd->opcode) { case O_ALTQ: case O_LOG: case O_TAG: break; default: return (cmd); } cmdlen = F_LEN(cmd); l -= cmdlen; cmd += cmdlen; } panic("%s: rule (%p) has not action opcode", __func__, rule); return (NULL); } /* * Clear counters for a specific rule. * Normally run under IPFW_UH_RLOCK, but these are idempotent ops * so we only care that rules do not disappear. */ static void clear_counters(struct ip_fw *rule, int log_only) { ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule); if (log_only == 0) IPFW_ZERO_RULE_COUNTER(rule); if (l->o.opcode == O_LOG) l->log_left = l->max_log; } /* * Flushes rules counters and/or log values on matching range. * * Returns number of items cleared. 
*/ static int clear_range(struct ip_fw_chain *chain, ipfw_range_tlv *rt, int log_only) { struct ip_fw *rule; int num; int i; num = 0; rt->flags |= IPFW_RCFLAG_DEFAULT; IPFW_UH_WLOCK(chain); /* arbitrate writers */ for (i = 0; i < chain->n_rules; i++) { rule = chain->map[i]; if (ipfw_match_range(rule, rt) == 0) continue; clear_counters(rule, log_only); num++; } IPFW_UH_WUNLOCK(chain); return (num); } static int check_range_tlv(ipfw_range_tlv *rt) { if (rt->head.length != sizeof(*rt)) return (1); if (rt->start_rule > rt->end_rule) return (1); if (rt->set >= IPFW_MAX_SETS || rt->new_set >= IPFW_MAX_SETS) return (1); if ((rt->flags & IPFW_RCFLAG_USER) != rt->flags) return (1); return (0); } /* * Delete rules matching specified parameters * Data layout (v0)(current): * Request: [ ipfw_obj_header ipfw_range_tlv ] * Reply: [ ipfw_obj_header ipfw_range_tlv ] * * Saves number of deleted rules in ipfw_range_tlv->new_set. * * Returns 0 on success. */ static int del_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_range_header *rh; int error, ndel; if (sd->valsize != sizeof(*rh)) return (EINVAL); rh = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize); if (check_range_tlv(&rh->range) != 0) return (EINVAL); ndel = 0; if ((error = delete_range(chain, &rh->range, &ndel)) != 0) return (error); /* Save number of rules deleted */ rh->range.new_set = ndel; return (0); } /* * Move rules/sets matching specified parameters * Data layout (v0)(current): * Request: [ ipfw_obj_header ipfw_range_tlv ] * * Returns 0 on success. 
*/ static int move_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_range_header *rh; if (sd->valsize != sizeof(*rh)) return (EINVAL); rh = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize); if (check_range_tlv(&rh->range) != 0) return (EINVAL); return (move_range(chain, &rh->range)); } /* * Clear rule accounting data matching specified parameters * Data layout (v0)(current): * Request: [ ipfw_obj_header ipfw_range_tlv ] * Reply: [ ipfw_obj_header ipfw_range_tlv ] * * Saves number of cleared rules in ipfw_range_tlv->new_set. * * Returns 0 on success. */ static int clear_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_range_header *rh; int log_only, num; char *msg; if (sd->valsize != sizeof(*rh)) return (EINVAL); rh = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize); if (check_range_tlv(&rh->range) != 0) return (EINVAL); log_only = (op3->opcode == IP_FW_XRESETLOG); num = clear_range(chain, &rh->range, log_only); if (rh->range.flags & IPFW_RCFLAG_ALL) msg = log_only ? "All logging counts reset" : "Accounting cleared"; else msg = log_only ? 
"logging count reset" : "cleared"; if (V_fw_verbose) { int lev = LOG_SECURITY | LOG_NOTICE; log(lev, "ipfw: %s.\n", msg); } /* Save number of rules cleared */ rh->range.new_set = num; return (0); } static void enable_sets(struct ip_fw_chain *chain, ipfw_range_tlv *rt) { uint32_t v_set; IPFW_UH_WLOCK_ASSERT(chain); /* Change enabled/disabled sets mask */ v_set = (V_set_disable | rt->set) & ~rt->new_set; v_set &= ~(1 << RESVD_SET); /* set RESVD_SET always enabled */ IPFW_WLOCK(chain); V_set_disable = v_set; IPFW_WUNLOCK(chain); } static int swap_sets(struct ip_fw_chain *chain, ipfw_range_tlv *rt, int mv) { struct opcode_obj_rewrite *rw; struct ip_fw *rule; int i; IPFW_UH_WLOCK_ASSERT(chain); if (rt->set == rt->new_set) /* nothing to do */ return (0); if (mv != 0) { /* * Berfore moving the rules we need to check that * there aren't any conflicting named objects. */ for (rw = ctl3_rewriters; rw < ctl3_rewriters + ctl3_rsize; rw++) { if (rw->manage_sets == NULL) continue; i = rw->manage_sets(chain, (uint8_t)rt->set, (uint8_t)rt->new_set, TEST_ALL); if (i != 0) return (EEXIST); } } /* Swap or move two sets */ for (i = 0; i < chain->n_rules - 1; i++) { rule = chain->map[i]; if (rule->set == (uint8_t)rt->set) rule->set = (uint8_t)rt->new_set; else if (rule->set == (uint8_t)rt->new_set && mv == 0) rule->set = (uint8_t)rt->set; } for (rw = ctl3_rewriters; rw < ctl3_rewriters + ctl3_rsize; rw++) { if (rw->manage_sets == NULL) continue; rw->manage_sets(chain, (uint8_t)rt->set, (uint8_t)rt->new_set, mv != 0 ? MOVE_ALL: SWAP_ALL); } return (0); } /* * Swaps or moves set * Data layout (v0)(current): * Request: [ ipfw_obj_header ipfw_range_tlv ] * * Returns 0 on success. 
*/ static int manage_sets(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_range_header *rh; int ret; if (sd->valsize != sizeof(*rh)) return (EINVAL); rh = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize); if (rh->range.head.length != sizeof(ipfw_range_tlv)) return (1); /* enable_sets() expects bitmasks. */ if (op3->opcode != IP_FW_SET_ENABLE && (rh->range.set >= IPFW_MAX_SETS || rh->range.new_set >= IPFW_MAX_SETS)) return (EINVAL); ret = 0; IPFW_UH_WLOCK(chain); switch (op3->opcode) { case IP_FW_SET_SWAP: case IP_FW_SET_MOVE: ret = swap_sets(chain, &rh->range, op3->opcode == IP_FW_SET_MOVE); break; case IP_FW_SET_ENABLE: enable_sets(chain, &rh->range); break; } IPFW_UH_WUNLOCK(chain); return (ret); } -/** - * Remove all rules with given number, or do set manipulation. - * Assumes chain != NULL && *chain != NULL. - * - * The argument is an uint32_t. The low 16 bit are the rule or set number; - * the next 8 bits are the new set; the top 8 bits indicate the command: - * - * 0 delete rules numbered "rulenum" - * 1 delete rules in set "rulenum" - * 2 move rules "rulenum" to set "new_set" - * 3 move rules from set "rulenum" to set "new_set" - * 4 swap sets "rulenum" and "new_set" - * 5 delete rules "rulenum" and set "new_set" - */ -static int -del_entry(struct ip_fw_chain *chain, uint32_t arg) -{ - uint32_t num; /* rule number or old_set */ - uint8_t cmd, new_set; - int do_del, ndel; - int error = 0; - ipfw_range_tlv rt; - - num = arg & 0xffff; - cmd = (arg >> 24) & 0xff; - new_set = (arg >> 16) & 0xff; - - if (cmd > 5 || new_set > RESVD_SET) - return EINVAL; - if (cmd == 0 || cmd == 2 || cmd == 5) { - if (num >= IPFW_DEFAULT_RULE) - return EINVAL; - } else { - if (num > RESVD_SET) /* old_set */ - return EINVAL; - } - - /* Convert old requests into new representation */ - memset(&rt, 0, sizeof(rt)); - rt.start_rule = num; - rt.end_rule = num; - rt.set = num; - rt.new_set = new_set; - do_del = 0; - - switch (cmd) { - case 0: /* 
delete rules numbered "rulenum" */ - if (num == 0) - rt.flags |= IPFW_RCFLAG_ALL; - else - rt.flags |= IPFW_RCFLAG_RANGE; - do_del = 1; - break; - case 1: /* delete rules in set "rulenum" */ - rt.flags |= IPFW_RCFLAG_SET; - do_del = 1; - break; - case 5: /* delete rules "rulenum" and set "new_set" */ - rt.flags |= IPFW_RCFLAG_RANGE | IPFW_RCFLAG_SET; - rt.set = new_set; - rt.new_set = 0; - do_del = 1; - break; - case 2: /* move rules "rulenum" to set "new_set" */ - rt.flags |= IPFW_RCFLAG_RANGE; - break; - case 3: /* move rules from set "rulenum" to set "new_set" */ - IPFW_UH_WLOCK(chain); - error = swap_sets(chain, &rt, 1); - IPFW_UH_WUNLOCK(chain); - return (error); - case 4: /* swap sets "rulenum" and "new_set" */ - IPFW_UH_WLOCK(chain); - error = swap_sets(chain, &rt, 0); - IPFW_UH_WUNLOCK(chain); - return (error); - default: - return (ENOTSUP); - } - - if (do_del != 0) { - if ((error = delete_range(chain, &rt, &ndel)) != 0) - return (error); - - if (ndel == 0 && (cmd != 1 && num != 0)) - return (EINVAL); - - return (0); - } - - return (move_range(chain, &rt)); -} - -/** - * Reset some or all counters on firewall rules. - * The argument `arg' is an u_int32_t. The low 16 bit are the rule number, - * the next 8 bits are the set number, the top 8 bits are the command: - * 0 work with rules from all set's; - * 1 work with rules only from specified set. - * Specified rule number is zero if we want to clear all entries. - * log_only is 1 if we only want to reset logs, zero otherwise. 
- */ -static int -zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only) -{ - struct ip_fw *rule; - char *msg; - int i; - - uint16_t rulenum = arg & 0xffff; - uint8_t set = (arg >> 16) & 0xff; - uint8_t cmd = (arg >> 24) & 0xff; - - if (cmd > 1) - return (EINVAL); - if (cmd == 1 && set > RESVD_SET) - return (EINVAL); - - IPFW_UH_RLOCK(chain); - if (rulenum == 0) { - V_norule_counter = 0; - for (i = 0; i < chain->n_rules; i++) { - rule = chain->map[i]; - /* Skip rules not in our set. */ - if (cmd == 1 && rule->set != set) - continue; - clear_counters(rule, log_only); - } - msg = log_only ? "All logging counts reset" : - "Accounting cleared"; - } else { - int cleared = 0; - for (i = 0; i < chain->n_rules; i++) { - rule = chain->map[i]; - if (rule->rulenum == rulenum) { - if (cmd == 0 || rule->set == set) - clear_counters(rule, log_only); - cleared = 1; - } - if (rule->rulenum > rulenum) - break; - } - if (!cleared) { /* we did not find any matching rules */ - IPFW_UH_RUNLOCK(chain); - return (EINVAL); - } - msg = log_only ? 
"logging count reset" : "cleared"; - } - IPFW_UH_RUNLOCK(chain); - - if (V_fw_verbose) { - int lev = LOG_SECURITY | LOG_NOTICE; - - if (rulenum) - log(lev, "ipfw: Entry %d %s.\n", rulenum, msg); - else - log(lev, "ipfw: %s.\n", msg); - } - return (0); -} - -/* - * Check rule head in FreeBSD11 format - * - */ -static int -check_ipfw_rule1(struct ip_fw_rule *rule, int size, +/* Check rule format */ +int +ipfw_check_rule(struct ip_fw_rule *rule, size_t size, struct rule_check_info *ci) { int l; if (size < sizeof(*rule)) { printf("ipfw: rule too short\n"); return (EINVAL); } /* Check for valid cmd_len */ l = roundup2(RULESIZE(rule), sizeof(uint64_t)); if (l != size) { - printf("ipfw: size mismatch (have %d want %d)\n", size, l); + printf("ipfw: size mismatch (have %zu want %d)\n", size, l); return (EINVAL); } if (rule->act_ofs >= rule->cmd_len) { printf("ipfw: bogus action offset (%u > %u)\n", rule->act_ofs, rule->cmd_len - 1); return (EINVAL); } if (rule->rulenum > IPFW_DEFAULT_RULE - 1) return (EINVAL); return (check_ipfw_rule_body(rule->cmd, rule->cmd_len, ci)); } -/* - * Check rule head in FreeBSD8 format - * - */ -static int -check_ipfw_rule0(struct ip_fw_rule0 *rule, int size, - struct rule_check_info *ci) +enum ipfw_opcheck_result +ipfw_check_opcode(ipfw_insn **pcmd, int *plen, struct rule_check_info *ci) { - int l; - - if (size < sizeof(*rule)) { - printf("ipfw: rule too short\n"); - return (EINVAL); - } - - /* Check for valid cmd_len */ - l = sizeof(*rule) + rule->cmd_len * 4 - 4; - if (l != size) { - printf("ipfw: size mismatch (have %d want %d)\n", size, l); - return (EINVAL); - } - if (rule->act_ofs >= rule->cmd_len) { - printf("ipfw: bogus action offset (%u > %u)\n", - rule->act_ofs, rule->cmd_len - 1); - return (EINVAL); - } - - if (rule->rulenum > IPFW_DEFAULT_RULE - 1) - return (EINVAL); - - return (check_ipfw_rule_body(rule->cmd, rule->cmd_len, ci)); -} - -static int -check_ipfw_rule_body(ipfw_insn *cmd, int cmd_len, struct rule_check_info *ci) -{ - 
int cmdlen, l; - int have_action; + ipfw_insn *cmd; + size_t cmdlen; - have_action = 0; + cmd = *pcmd; + cmdlen = F_LEN(cmd); - /* - * Now go for the individual checks. Very simple ones, basically only - * instruction sizes. - */ - for (l = cmd_len; l > 0 ; l -= cmdlen, cmd += cmdlen) { - cmdlen = F_LEN(cmd); - if (cmdlen > l) { - printf("ipfw: opcode %d size truncated\n", - cmd->opcode); - return EINVAL; - } - switch (cmd->opcode) { - case O_PROBE_STATE: - case O_KEEP_STATE: - if (cmdlen != F_INSN_SIZE(ipfw_insn)) - goto bad_size; - ci->object_opcodes++; - break; - case O_PROTO: - case O_IP_SRC_ME: - case O_IP_DST_ME: - case O_LAYER2: - case O_IN: - case O_FRAG: - case O_DIVERTED: - case O_IPOPT: - case O_IPTOS: - case O_IPPRECEDENCE: - case O_IPVER: - case O_SOCKARG: - case O_TCPFLAGS: - case O_TCPOPTS: - case O_ESTAB: - case O_VERREVPATH: - case O_VERSRCREACH: - case O_ANTISPOOF: - case O_IPSEC: + switch (cmd->opcode) { + case O_PROBE_STATE: + case O_KEEP_STATE: + if (cmdlen != F_INSN_SIZE(ipfw_insn_kidx)) + return (BAD_SIZE); + ci->object_opcodes++; + break; + case O_PROTO: + case O_IP_SRC_ME: + case O_IP_DST_ME: + case O_LAYER2: + case O_IN: + case O_FRAG: + case O_DIVERTED: + case O_IPOPT: + case O_IPTOS: + case O_IPPRECEDENCE: + case O_IPVER: + case O_SOCKARG: + case O_TCPFLAGS: + case O_TCPOPTS: + case O_ESTAB: + case O_VERREVPATH: + case O_VERSRCREACH: + case O_ANTISPOOF: + case O_IPSEC: #ifdef INET6 - case O_IP6_SRC_ME: - case O_IP6_DST_ME: - case O_EXT_HDR: - case O_IP6: + case O_IP6_SRC_ME: + case O_IP6_DST_ME: + case O_EXT_HDR: + case O_IP6: #endif - case O_IP4: - case O_TAG: - case O_SKIP_ACTION: - if (cmdlen != F_INSN_SIZE(ipfw_insn)) - goto bad_size; - break; + case O_IP4: + case O_TAG: + case O_SKIP_ACTION: + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + return (BAD_SIZE); + break; - case O_EXTERNAL_ACTION: - if (cmd->arg1 == 0 || - cmdlen != F_INSN_SIZE(ipfw_insn)) { - printf("ipfw: invalid external " - "action opcode\n"); - return (EINVAL); - } - 
ci->object_opcodes++; - /* - * Do we have O_EXTERNAL_INSTANCE or O_EXTERNAL_DATA - * opcode? - */ - if (l != cmdlen) { - l -= cmdlen; - cmd += cmdlen; - cmdlen = F_LEN(cmd); - if (cmd->opcode == O_EXTERNAL_DATA) - goto check_action; - if (cmd->opcode != O_EXTERNAL_INSTANCE) { - printf("ipfw: invalid opcode " - "next to external action %u\n", - cmd->opcode); - return (EINVAL); - } - if (cmd->arg1 == 0 || - cmdlen != F_INSN_SIZE(ipfw_insn)) { - printf("ipfw: invalid external " - "action instance opcode\n"); - return (EINVAL); - } - ci->object_opcodes++; - } - goto check_action; - - case O_FIB: - if (cmdlen != F_INSN_SIZE(ipfw_insn)) - goto bad_size; - if (cmd->arg1 >= rt_numfibs) { - printf("ipfw: invalid fib number %d\n", - cmd->arg1); - return EINVAL; - } - break; + case O_EXTERNAL_ACTION: + if (cmdlen != F_INSN_SIZE(ipfw_insn_kidx)) + return (BAD_SIZE); - case O_SETFIB: - if (cmdlen != F_INSN_SIZE(ipfw_insn)) - goto bad_size; - if ((cmd->arg1 != IP_FW_TARG) && - ((cmd->arg1 & 0x7FFF) >= rt_numfibs)) { - printf("ipfw: invalid fib number %d\n", - cmd->arg1 & 0x7FFF); - return EINVAL; + if (insntod(cmd, kidx)->kidx == 0) + return (FAILED); + ci->object_opcodes++; + /* + * Do we have O_EXTERNAL_INSTANCE or O_EXTERNAL_DATA + * opcode? 
+ */ + if (*plen != cmdlen) { + *plen -= cmdlen; + cmd += cmdlen; + *pcmd = cmd; + cmdlen = F_LEN(cmd); + if (cmd->opcode == O_EXTERNAL_DATA) + return (CHECK_ACTION); + if (cmd->opcode != O_EXTERNAL_INSTANCE) { + printf("ipfw: invalid opcode " + "next to external action %u\n", + cmd->opcode); + return (FAILED); } - goto check_action; - - case O_UID: - case O_GID: - case O_JAIL: - case O_IP_SRC: - case O_IP_DST: - case O_TCPSEQ: - case O_TCPACK: - case O_PROB: - case O_ICMPTYPE: - if (cmdlen != F_INSN_SIZE(ipfw_insn_u32)) - goto bad_size; - break; - - case O_LIMIT: - if (cmdlen != F_INSN_SIZE(ipfw_insn_limit)) - goto bad_size; + if (cmdlen != F_INSN_SIZE(ipfw_insn_kidx)) + return (BAD_SIZE); + if (insntod(cmd, kidx)->kidx == 0) + return (FAILED); ci->object_opcodes++; - break; + } + return (CHECK_ACTION); + + case O_FIB: + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + return (BAD_SIZE); + if (cmd->arg1 >= rt_numfibs) { + printf("ipfw: invalid fib number %d\n", + cmd->arg1); + return (FAILED); + } + break; - case O_LOG: - if (cmdlen != F_INSN_SIZE(ipfw_insn_log)) - goto bad_size; + case O_SETFIB: + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + return (BAD_SIZE); + if ((cmd->arg1 != IP_FW_TARG) && + ((cmd->arg1 & 0x7FFF) >= rt_numfibs)) { + printf("ipfw: invalid fib number %d\n", + cmd->arg1 & 0x7FFF); + return (FAILED); + } + return (CHECK_ACTION); + + case O_UID: + case O_GID: + case O_JAIL: + case O_IP_SRC: + case O_IP_DST: + case O_TCPSEQ: + case O_TCPACK: + case O_PROB: + case O_ICMPTYPE: + if (cmdlen != F_INSN_SIZE(ipfw_insn_u32)) + return (BAD_SIZE); + break; - ((ipfw_insn_log *)cmd)->log_left = - ((ipfw_insn_log *)cmd)->max_log; + case O_LIMIT: + if (cmdlen != F_INSN_SIZE(ipfw_insn_limit)) + return (BAD_SIZE); + ci->object_opcodes++; + break; - break; + case O_LOG: + if (cmdlen != F_INSN_SIZE(ipfw_insn_log)) + return (BAD_SIZE); + insntod(cmd, log)->log_left = insntod(cmd, log)->max_log; + break; - case O_IP_SRC_MASK: - case O_IP_DST_MASK: - /* only odd command lengths */ 
- if ((cmdlen & 1) == 0) - goto bad_size; - break; + case O_IP_SRC_MASK: + case O_IP_DST_MASK: + /* only odd command lengths */ + if ((cmdlen & 1) == 0) + return (BAD_SIZE); + break; - case O_IP_SRC_SET: - case O_IP_DST_SET: - if (cmd->arg1 == 0 || cmd->arg1 > 256) { - printf("ipfw: invalid set size %d\n", - cmd->arg1); - return EINVAL; - } - if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + - (cmd->arg1+31)/32 ) - goto bad_size; - break; + case O_IP_SRC_SET: + case O_IP_DST_SET: + if (cmd->arg1 == 0 || cmd->arg1 > 256) { + printf("ipfw: invalid set size %d\n", + cmd->arg1); + return (FAILED); + } + if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + + (cmd->arg1+31)/32 ) + return (BAD_SIZE); + break; - case O_IP_SRC_LOOKUP: - if (cmdlen > F_INSN_SIZE(ipfw_insn_u32)) - goto bad_size; - case O_IP_DST_LOOKUP: - if (cmd->arg1 >= V_fw_tables_max) { - printf("ipfw: invalid table number %d\n", - cmd->arg1); - return (EINVAL); - } - if (cmdlen != F_INSN_SIZE(ipfw_insn) && - cmdlen != F_INSN_SIZE(ipfw_insn_u32) + 1 && - cmdlen != F_INSN_SIZE(ipfw_insn_u32)) - goto bad_size; - ci->object_opcodes++; - break; - case O_IP_FLOW_LOOKUP: - case O_MAC_DST_LOOKUP: - case O_MAC_SRC_LOOKUP: - if (cmd->arg1 >= V_fw_tables_max) { - printf("ipfw: invalid table number %d\n", - cmd->arg1); - return (EINVAL); - } - if (cmdlen != F_INSN_SIZE(ipfw_insn) && - cmdlen != F_INSN_SIZE(ipfw_insn_u32)) - goto bad_size; - ci->object_opcodes++; - break; - case O_MACADDR2: - if (cmdlen != F_INSN_SIZE(ipfw_insn_mac)) - goto bad_size; - break; + case O_IP_SRC_LOOKUP: + case O_IP_DST_LOOKUP: + case O_IP_FLOW_LOOKUP: + case O_MAC_SRC_LOOKUP: + case O_MAC_DST_LOOKUP: + if (cmdlen != F_INSN_SIZE(ipfw_insn_kidx) && + cmdlen != F_INSN_SIZE(ipfw_insn_table)) + return (BAD_SIZE); + if (insntod(cmd, kidx)->kidx >= V_fw_tables_max) { + printf("ipfw: invalid table index %u\n", + insntod(cmd, kidx)->kidx); + return (FAILED); + } + ci->object_opcodes++; + break; + case O_MACADDR2: + if (cmdlen != F_INSN_SIZE(ipfw_insn_mac)) + 
return (BAD_SIZE); + break; - case O_NOP: - case O_IPID: - case O_IPTTL: - case O_IPLEN: - case O_TCPDATALEN: - case O_TCPMSS: - case O_TCPWIN: - case O_TAGGED: - if (cmdlen < 1 || cmdlen > 31) - goto bad_size; - break; + case O_NOP: + case O_IPID: + case O_IPTTL: + case O_IPLEN: + case O_TCPDATALEN: + case O_TCPMSS: + case O_TCPWIN: + case O_TAGGED: + if (cmdlen < 1 || cmdlen > 31) + return (BAD_SIZE); + break; - case O_DSCP: - case O_MARK: - if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + 1) - goto bad_size; - break; + case O_DSCP: + case O_MARK: + if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + 1) + return (BAD_SIZE); + break; - case O_MAC_TYPE: - case O_IP_SRCPORT: - case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */ - if (cmdlen < 2 || cmdlen > 31) - goto bad_size; - break; + case O_MAC_TYPE: + case O_IP_SRCPORT: + case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */ + if (cmdlen < 2 || cmdlen > 31) + return (BAD_SIZE); + break; - case O_RECV: - case O_XMIT: - case O_VIA: - if (cmdlen != F_INSN_SIZE(ipfw_insn_if)) - goto bad_size; - ci->object_opcodes++; - break; + case O_RECV: + case O_XMIT: + case O_VIA: + if (cmdlen != F_INSN_SIZE(ipfw_insn_if)) + return (BAD_SIZE); + ci->object_opcodes++; + break; - case O_ALTQ: - if (cmdlen != F_INSN_SIZE(ipfw_insn_altq)) - goto bad_size; - break; + case O_ALTQ: + if (cmdlen != F_INSN_SIZE(ipfw_insn_altq)) + return (BAD_SIZE); + break; - case O_PIPE: - case O_QUEUE: - if (cmdlen != F_INSN_SIZE(ipfw_insn)) - goto bad_size; - goto check_action; + case O_PIPE: + case O_QUEUE: + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + return (BAD_SIZE); + return (CHECK_ACTION); - case O_FORWARD_IP: - if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) - goto bad_size; - goto check_action; + case O_FORWARD_IP: + if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) + return (BAD_SIZE); + return (CHECK_ACTION); #ifdef INET6 - case O_FORWARD_IP6: - if (cmdlen != F_INSN_SIZE(ipfw_insn_sa6)) - goto bad_size; - goto check_action; + case O_FORWARD_IP6: + if (cmdlen 
!= F_INSN_SIZE(ipfw_insn_sa6)) + return (BAD_SIZE); + return (CHECK_ACTION); #endif /* INET6 */ - case O_DIVERT: - case O_TEE: - if (ip_divert_ptr == NULL) - return EINVAL; - else - goto check_size; - case O_NETGRAPH: - case O_NGTEE: - if (ng_ipfw_input_p == NULL) - return EINVAL; - else - goto check_size; - case O_NAT: - if (!IPFW_NAT_LOADED) - return EINVAL; - if (cmdlen != F_INSN_SIZE(ipfw_insn_nat)) - goto bad_size; - goto check_action; - case O_CHECK_STATE: - ci->object_opcodes++; - goto check_size; - case O_SETMARK: - if (cmdlen != F_INSN_SIZE(ipfw_insn_u32)) - goto bad_size; - goto check_action; - case O_REJECT: - /* "unreach needfrag" has variable len. */ - if ((cmdlen == F_INSN_SIZE(ipfw_insn) || - cmdlen == F_INSN_SIZE(ipfw_insn_u16))) - goto check_action; - /* FALLTHROUGH */ - case O_FORWARD_MAC: /* XXX not implemented yet */ - case O_COUNT: - case O_ACCEPT: - case O_DENY: - case O_SETDSCP: + case O_DIVERT: + case O_TEE: + if (ip_divert_ptr == NULL) + return (FAILED); + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + return (BAD_SIZE); + return (CHECK_ACTION); + case O_NETGRAPH: + case O_NGTEE: + if (ng_ipfw_input_p == NULL) + return (FAILED); + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + return (BAD_SIZE); + return (CHECK_ACTION); + case O_NAT: + if (!IPFW_NAT_LOADED) + return (FAILED); + if (cmdlen != F_INSN_SIZE(ipfw_insn_nat)) + return (BAD_SIZE); + return (CHECK_ACTION); + + case O_SKIPTO: + case O_CALLRETURN: + case O_SETMARK: + if (cmdlen != F_INSN_SIZE(ipfw_insn_u32)) + return (BAD_SIZE); + return (CHECK_ACTION); + + case O_CHECK_STATE: + if (cmdlen != F_INSN_SIZE(ipfw_insn_kidx)) + return (BAD_SIZE); + ci->object_opcodes++; + return (CHECK_ACTION); + + case O_FORWARD_MAC: /* XXX not implemented yet */ + case O_COUNT: + case O_ACCEPT: + case O_DENY: + case O_REJECT: + case O_SETDSCP: #ifdef INET6 - case O_UNREACH6: + case O_UNREACH6: #endif - case O_SKIPTO: - case O_REASS: - case O_CALLRETURN: -check_size: - if (cmdlen != F_INSN_SIZE(ipfw_insn)) - goto 
bad_size; -check_action: - if (have_action) { - printf("ipfw: opcode %d, multiple actions" - " not allowed\n", - cmd->opcode); - return (EINVAL); - } - have_action = 1; - if (l != cmdlen) { - printf("ipfw: opcode %d, action must be" - " last opcode\n", - cmd->opcode); - return (EINVAL); - } - break; + case O_REASS: + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + return (BAD_SIZE); + return (CHECK_ACTION); #ifdef INET6 + case O_IP6_SRC: + case O_IP6_DST: + if (cmdlen != F_INSN_SIZE(struct in6_addr) + + F_INSN_SIZE(ipfw_insn)) + return (BAD_SIZE); + break; + + case O_FLOW6ID: + if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + + ((ipfw_insn_u32 *)cmd)->o.arg1) + return (BAD_SIZE); + break; + + case O_IP6_SRC_MASK: + case O_IP6_DST_MASK: + if ( !(cmdlen & 1) || cmdlen > 127) + return (BAD_SIZE); + break; + case O_ICMP6TYPE: + if( cmdlen != F_INSN_SIZE( ipfw_insn_icmp6 ) ) + return (BAD_SIZE); + break; +#endif + + default: + switch (cmd->opcode) { +#ifndef INET6 + case O_IP6_SRC_ME: + case O_IP6_DST_ME: + case O_EXT_HDR: + case O_IP6: + case O_UNREACH6: case O_IP6_SRC: case O_IP6_DST: - if (cmdlen != F_INSN_SIZE(struct in6_addr) + - F_INSN_SIZE(ipfw_insn)) - goto bad_size; - break; - case O_FLOW6ID: - if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + - ((ipfw_insn_u32 *)cmd)->o.arg1) - goto bad_size; - break; - case O_IP6_SRC_MASK: case O_IP6_DST_MASK: - if ( !(cmdlen & 1) || cmdlen > 127) - goto bad_size; - break; case O_ICMP6TYPE: - if( cmdlen != F_INSN_SIZE( ipfw_insn_icmp6 ) ) - goto bad_size; - break; + printf("ipfw: no IPv6 support in kernel\n"); + return (FAILED); #endif - default: - switch (cmd->opcode) { -#ifndef INET6 - case O_IP6_SRC_ME: - case O_IP6_DST_ME: - case O_EXT_HDR: - case O_IP6: - case O_UNREACH6: - case O_IP6_SRC: - case O_IP6_DST: - case O_FLOW6ID: - case O_IP6_SRC_MASK: - case O_IP6_DST_MASK: - case O_ICMP6TYPE: - printf("ipfw: no IPv6 support in kernel\n"); - return (EPROTONOSUPPORT); -#endif - default: - printf("ipfw: opcode %d, unknown opcode\n", - 
cmd->opcode); - return (EINVAL); - } + printf("ipfw: opcode %d: unknown opcode\n", + cmd->opcode); + return (FAILED); } } - if (have_action == 0) { - printf("ipfw: missing action\n"); - return (EINVAL); - } - return 0; - -bad_size: - printf("ipfw: opcode %d size %d wrong\n", - cmd->opcode, cmdlen); - return (EINVAL); + return (SUCCESS); } -/* - * Translation of requests for compatibility with FreeBSD 7.2/8. - * a static variable tells us if we have an old client from userland, - * and if necessary we translate requests and responses between the - * two formats. - */ -static int is7 = 0; - -struct ip_fw7 { - struct ip_fw7 *next; /* linked list of rules */ - struct ip_fw7 *next_rule; /* ptr to next [skipto] rule */ - /* 'next_rule' is used to pass up 'set_disable' status */ - - uint16_t act_ofs; /* offset of action in 32-bit units */ - uint16_t cmd_len; /* # of 32-bit words in cmd */ - uint16_t rulenum; /* rule number */ - uint8_t set; /* rule set (0..31) */ - // #define RESVD_SET 31 /* set for default and persistent rules */ - uint8_t _pad; /* padding */ - // uint32_t id; /* rule id, only in v.8 */ - /* These fields are present in all rules. */ - uint64_t pcnt; /* Packet counter */ - uint64_t bcnt; /* Byte counter */ - uint32_t timestamp; /* tv_sec of last match */ - - ipfw_insn cmd[1]; /* storage for commands */ -}; - -static int convert_rule_to_7(struct ip_fw_rule0 *rule); -static int convert_rule_to_8(struct ip_fw_rule0 *rule); - -#ifndef RULESIZE7 -#define RULESIZE7(rule) (sizeof(struct ip_fw7) + \ - ((struct ip_fw7 *)(rule))->cmd_len * 4 - 4) -#endif - -/* - * Copy the static and dynamic rules to the supplied buffer - * and return the amount of space actually used. 
- * Must be run under IPFW_UH_RLOCK - */ -static size_t -ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space) +static __noinline int +check_ipfw_rule_body(ipfw_insn *cmd, int cmd_len, struct rule_check_info *ci) { - char *bp = buf; - char *ep = bp + space; - struct ip_fw *rule; - struct ip_fw_rule0 *dst; - struct timeval boottime; - int error, i, l, warnflag; - time_t boot_seconds; - - warnflag = 0; - - getboottime(&boottime); - boot_seconds = boottime.tv_sec; - for (i = 0; i < chain->n_rules; i++) { - rule = chain->map[i]; - - if (is7) { - /* Convert rule to FreeBSd 7.2 format */ - l = RULESIZE7(rule); - if (bp + l + sizeof(uint32_t) <= ep) { - bcopy(rule, bp, l + sizeof(uint32_t)); - error = set_legacy_obj_kidx(chain, - (struct ip_fw_rule0 *)bp); - if (error != 0) - return (0); - error = convert_rule_to_7((struct ip_fw_rule0 *) bp); - if (error) - return 0; /*XXX correct? */ - /* - * XXX HACK. Store the disable mask in the "next" - * pointer in a wild attempt to keep the ABI the same. - * Why do we do this on EVERY rule? - */ - bcopy(&V_set_disable, - &(((struct ip_fw7 *)bp)->next_rule), - sizeof(V_set_disable)); - if (((struct ip_fw7 *)bp)->timestamp) - ((struct ip_fw7 *)bp)->timestamp += boot_seconds; - bp += l; - } - continue; /* go to next rule */ - } + int cmdlen, l; + int have_action, ret; - l = RULEUSIZE0(rule); - if (bp + l > ep) { /* should not happen */ - printf("overflow dumping static rules\n"); - break; + /* + * Now go for the individual checks. Very simple ones, basically only + * instruction sizes. + */ + have_action = 0; + for (l = cmd_len; l > 0 ; l -= cmdlen, cmd += cmdlen) { + cmdlen = F_LEN(cmd); + if (cmdlen > l) { + printf("ipfw: opcode %d: size truncated\n", + cmd->opcode); + return (EINVAL); } - dst = (struct ip_fw_rule0 *)bp; - export_rule0(rule, dst, l); - error = set_legacy_obj_kidx(chain, dst); - - /* - * XXX HACK. Store the disable mask in the "next" - * pointer in a wild attempt to keep the ABI the same. 
- * Why do we do this on EVERY rule? - * - * XXX: "ipfw set show" (ab)uses IP_FW_GET to read disabled mask - * so we need to fail _after_ saving at least one mask. - */ - bcopy(&V_set_disable, &dst->next_rule, sizeof(V_set_disable)); - if (dst->timestamp) - dst->timestamp += boot_seconds; - bp += l; - - if (error != 0) { - if (error == 2) { - /* Non-fatal table rewrite error. */ - warnflag = 1; - continue; + if (ci->version != IP_FW3_OPVER) + ret = (*check_opcode_f)(&cmd, &l, ci); + else + ret = ipfw_check_opcode(&cmd, &l, ci); + + if (ret == CHECK_ACTION) { + if (have_action != 0) { + printf("ipfw: opcode %d: multiple actions" + " not allowed\n", cmd->opcode); + ret = FAILED; + } else + have_action = 1; + + if (l != F_LEN(cmd)) { + printf("ipfw: opcode %d: action must be" + " last opcode\n", cmd->opcode); + ret = FAILED; } - printf("Stop on rule %d. Fail to convert table\n", - rule->rulenum); - break; + } + switch (ret) { + case SUCCESS: + continue; + case BAD_SIZE: + printf("ipfw: opcode %d: wrong size %d\n", + cmd->opcode, cmdlen); + /* FALLTHROUGH */ + case FAILED: + return (EINVAL); } } - if (warnflag != 0) - printf("ipfw: process %s is using legacy interfaces," - " consider rebuilding\n", ""); - ipfw_get_dynamic(chain, &bp, ep); /* protected by the dynamic lock */ - return (bp - (char *)buf); + if (have_action == 0) { + printf("ipfw: missing action\n"); + return (EINVAL); + } + return (0); } struct dump_args { uint32_t b; /* start rule */ uint32_t e; /* end rule */ uint32_t rcount; /* number of rules */ uint32_t rsize; /* rules size */ uint32_t tcount; /* number of tables */ int rcounters; /* counters */ uint32_t *bmask; /* index bitmask of used named objects */ }; void ipfw_export_obj_ntlv(struct named_object *no, ipfw_obj_ntlv *ntlv) { ntlv->head.type = no->etlv; ntlv->head.length = sizeof(*ntlv); ntlv->idx = no->kidx; strlcpy(ntlv->name, no->name, sizeof(ntlv->name)); } /* * Export named object info in instance @ni, identified by @kidx * to ipfw_obj_ntlv. 
TLV is allocated from @sd space. * * Returns 0 on success. */ static int -export_objhash_ntlv(struct namedobj_instance *ni, uint16_t kidx, +export_objhash_ntlv(struct namedobj_instance *ni, uint32_t kidx, struct sockopt_data *sd) { struct named_object *no; ipfw_obj_ntlv *ntlv; no = ipfw_objhash_lookup_kidx(ni, kidx); KASSERT(no != NULL, ("invalid object kernel index passed")); ntlv = (ipfw_obj_ntlv *)ipfw_get_sopt_space(sd, sizeof(*ntlv)); if (ntlv == NULL) return (ENOMEM); ipfw_export_obj_ntlv(no, ntlv); return (0); } static int export_named_objects(struct namedobj_instance *ni, struct dump_args *da, struct sockopt_data *sd) { - int error, i; + uint32_t i; + int error; for (i = 0; i < IPFW_TABLES_MAX && da->tcount > 0; i++) { if ((da->bmask[i / 32] & (1 << (i % 32))) == 0) continue; if ((error = export_objhash_ntlv(ni, i, sd)) != 0) return (error); da->tcount--; } return (0); } static int dump_named_objects(struct ip_fw_chain *ch, struct dump_args *da, struct sockopt_data *sd) { ipfw_obj_ctlv *ctlv; int error; MPASS(da->tcount > 0); /* Header first */ ctlv = (ipfw_obj_ctlv *)ipfw_get_sopt_space(sd, sizeof(*ctlv)); if (ctlv == NULL) return (ENOMEM); ctlv->head.type = IPFW_TLV_TBLNAME_LIST; ctlv->head.length = da->tcount * sizeof(ipfw_obj_ntlv) + sizeof(*ctlv); ctlv->count = da->tcount; ctlv->objsize = sizeof(ipfw_obj_ntlv); /* Dump table names first (if any) */ error = export_named_objects(ipfw_get_table_objhash(ch), da, sd); if (error != 0) return (error); /* Then dump another named objects */ da->bmask += IPFW_TABLES_MAX / 32; return (export_named_objects(CHAIN_TO_SRV(ch), da, sd)); } /* * Dumps static rules with table TLVs in buffer @sd. * * Returns 0 on success. 
*/ static int dump_static_rules(struct ip_fw_chain *chain, struct dump_args *da, struct sockopt_data *sd) { ipfw_obj_ctlv *ctlv; struct ip_fw *krule; caddr_t dst; int i, l; /* Dump rules */ ctlv = (ipfw_obj_ctlv *)ipfw_get_sopt_space(sd, sizeof(*ctlv)); if (ctlv == NULL) return (ENOMEM); ctlv->head.type = IPFW_TLV_RULE_LIST; ctlv->head.length = da->rsize + sizeof(*ctlv); ctlv->count = da->rcount; for (i = da->b; i < da->e; i++) { krule = chain->map[i]; l = RULEUSIZE1(krule) + sizeof(ipfw_obj_tlv); if (da->rcounters != 0) l += sizeof(struct ip_fw_bcounter); dst = (caddr_t)ipfw_get_sopt_space(sd, l); if (dst == NULL) return (ENOMEM); export_rule1(krule, dst, l, da->rcounters); } return (0); } int -ipfw_mark_object_kidx(uint32_t *bmask, uint16_t etlv, uint16_t kidx) +ipfw_mark_object_kidx(uint32_t *bmask, uint16_t etlv, uint32_t kidx) { uint32_t bidx; /* * Maintain separate bitmasks for table and non-table objects. */ bidx = (etlv == IPFW_TLV_TBL_NAME) ? 0: IPFW_TABLES_MAX / 32; bidx += kidx / 32; if ((bmask[bidx] & (1 << (kidx % 32))) != 0) return (0); bmask[bidx] |= 1 << (kidx % 32); return (1); } /* * Marks every object index used in @rule with bit in @bmask. * Used to generate bitmask of referenced tables/objects for given ruleset * or its part. 
*/ static void mark_rule_objects(struct ip_fw_chain *ch, struct ip_fw *rule, struct dump_args *da) { struct opcode_obj_rewrite *rw; ipfw_insn *cmd; + uint32_t kidx; int cmdlen, l; - uint16_t kidx; uint8_t subtype; l = rule->cmd_len; cmd = rule->cmd; cmdlen = 0; for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); rw = find_op_rw(cmd, &kidx, &subtype); if (rw == NULL) continue; if (ipfw_mark_object_kidx(da->bmask, rw->etlv, kidx)) da->tcount++; } } /* * Dumps requested objects data * Data layout (version 0)(current): * Request: [ ipfw_cfg_lheader ] + IPFW_CFG_GET_* flags * size = ipfw_cfg_lheader.size * Reply: [ ipfw_cfg_lheader * [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional) * [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST) * ipfw_obj_tlv(IPFW_TLV_RULE_ENT) [ ip_fw_bcounter (optional) ip_fw_rule ] * ] (optional) * [ ipfw_obj_ctlv(IPFW_TLV_STATE_LIST) ipfw_obj_dyntlv x N ] (optional) * ] * * NOTE IPFW_TLV_STATE_LIST has the single valid field: objsize. * The rest (size, count) are set to zero and needs to be ignored. * * Returns 0 on success. */ static int dump_config(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { struct dump_args da; ipfw_cfg_lheader *hdr; struct ip_fw *rule; size_t sz, rnum; uint32_t hdr_flags, *bmask; int error, i; hdr = (ipfw_cfg_lheader *)ipfw_get_sopt_header(sd, sizeof(*hdr)); if (hdr == NULL) return (EINVAL); error = 0; bmask = NULL; memset(&da, 0, sizeof(da)); /* * Allocate needed state. * Note we allocate 2xspace mask, for table & srv */ if (hdr->flags & (IPFW_CFG_GET_STATIC | IPFW_CFG_GET_STATES)) da.bmask = bmask = malloc( sizeof(uint32_t) * IPFW_TABLES_MAX * 2 / 32, M_TEMP, M_WAITOK | M_ZERO); IPFW_UH_RLOCK(chain); /* * STAGE 1: Determine size/count for objects in range. * Prepare used tables bitmask. 
*/ sz = sizeof(ipfw_cfg_lheader); da.e = chain->n_rules; if (hdr->end_rule != 0) { /* Handle custom range */ if ((rnum = hdr->start_rule) > IPFW_DEFAULT_RULE) rnum = IPFW_DEFAULT_RULE; da.b = ipfw_find_rule(chain, rnum, 0); rnum = (hdr->end_rule < IPFW_DEFAULT_RULE) ? hdr->end_rule + 1: IPFW_DEFAULT_RULE; da.e = ipfw_find_rule(chain, rnum, UINT32_MAX) + 1; } if (hdr->flags & IPFW_CFG_GET_STATIC) { for (i = da.b; i < da.e; i++) { rule = chain->map[i]; da.rsize += RULEUSIZE1(rule) + sizeof(ipfw_obj_tlv); da.rcount++; /* Update bitmask of used objects for given range */ mark_rule_objects(chain, rule, &da); } /* Add counters if requested */ if (hdr->flags & IPFW_CFG_GET_COUNTERS) { da.rsize += sizeof(struct ip_fw_bcounter) * da.rcount; da.rcounters = 1; } sz += da.rsize + sizeof(ipfw_obj_ctlv); } if (hdr->flags & IPFW_CFG_GET_STATES) { sz += sizeof(ipfw_obj_ctlv) + ipfw_dyn_get_count(bmask, &i) * sizeof(ipfw_obj_dyntlv); da.tcount += i; } if (da.tcount > 0) sz += da.tcount * sizeof(ipfw_obj_ntlv) + sizeof(ipfw_obj_ctlv); /* * Fill header anyway. * Note we have to save header fields to stable storage * buffer inside @sd can be flushed after dumping rules */ hdr->size = sz; hdr->set_mask = ~V_set_disable; hdr_flags = hdr->flags; hdr = NULL; if (sd->valsize < sz) { error = ENOMEM; goto cleanup; } /* STAGE2: Store actual data */ if (da.tcount > 0) { error = dump_named_objects(chain, &da, sd); if (error != 0) goto cleanup; } if (hdr_flags & IPFW_CFG_GET_STATIC) { error = dump_static_rules(chain, &da, sd); if (error != 0) goto cleanup; } if (hdr_flags & IPFW_CFG_GET_STATES) error = ipfw_dump_states(chain, sd); cleanup: IPFW_UH_RUNLOCK(chain); if (bmask != NULL) free(bmask, M_TEMP); return (error); } int ipfw_check_object_name_generic(const char *name) { int nsize; nsize = sizeof(((ipfw_obj_ntlv *)0)->name); if (strnlen(name, nsize) == nsize) return (EINVAL); if (name[0] == '\0') return (EINVAL); return (0); } /* * Creates non-existent objects referenced by rule. 
* * Return 0 on success. */ int create_objects_compat(struct ip_fw_chain *ch, ipfw_insn *cmd, struct obj_idx *oib, struct obj_idx *pidx, struct tid_info *ti) { struct opcode_obj_rewrite *rw; struct obj_idx *p; - uint16_t kidx; + uint32_t kidx; int error; /* * Compatibility stuff: do actual creation for non-existing, * but referenced objects. */ for (p = oib; p < pidx; p++) { if (p->kidx != 0) continue; ti->uidx = p->uidx; ti->type = p->type; ti->atype = 0; rw = find_op_rw(cmd + p->off, NULL, NULL); KASSERT(rw != NULL, ("Unable to find handler for op %d", (cmd + p->off)->opcode)); if (rw->create_object == NULL) error = EOPNOTSUPP; else error = rw->create_object(ch, ti, &kidx); if (error == 0) { p->kidx = kidx; continue; } /* * Error happened. We have to rollback everything. * Drop all already acquired references. */ IPFW_UH_WLOCK(ch); unref_oib_objects(ch, cmd, oib, pidx); IPFW_UH_WUNLOCK(ch); - return (error); - } - - return (0); -} - -/* - * Compatibility function for old ipfw(8) binaries. - * Rewrites table/nat kernel indices with userland ones. - * Convert tables matching '/^\d+$/' to their atoi() value. - * Use number 65535 for other tables. - * - * Returns 0 on success. - */ -static int -set_legacy_obj_kidx(struct ip_fw_chain *ch, struct ip_fw_rule0 *rule) -{ - struct opcode_obj_rewrite *rw; - struct named_object *no; - ipfw_insn *cmd; - char *end; - long val; - int cmdlen, error, l; - uint16_t kidx, uidx; - uint8_t subtype; - - error = 0; - - l = rule->cmd_len; - cmd = rule->cmd; - cmdlen = 0; - for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { - cmdlen = F_LEN(cmd); - - /* Check if is index in given opcode */ - rw = find_op_rw(cmd, &kidx, &subtype); - if (rw == NULL) - continue; - - /* Try to find referenced kernel object */ - no = rw->find_bykidx(ch, kidx); - if (no == NULL) - continue; - - val = strtol(no->name, &end, 10); - if (*end == '\0' && val < 65535) { - uidx = val; - } else { - /* - * We are called via legacy opcode. 
- * Save error and show table as fake number - * not to make ipfw(8) hang. - */ - uidx = 65535; - error = 2; - } - - rw->update(cmd, uidx); + return (error); } - return (error); + return (0); } /* * Unreferences all already-referenced objects in given @cmd rule, * using information in @oib. * * Used to rollback partially converted rule on error. */ static void unref_oib_objects(struct ip_fw_chain *ch, ipfw_insn *cmd, struct obj_idx *oib, struct obj_idx *end) { struct opcode_obj_rewrite *rw; struct named_object *no; struct obj_idx *p; IPFW_UH_WLOCK_ASSERT(ch); for (p = oib; p < end; p++) { if (p->kidx == 0) continue; rw = find_op_rw(cmd + p->off, NULL, NULL); KASSERT(rw != NULL, ("Unable to find handler for op %d", (cmd + p->off)->opcode)); /* Find & unref by existing idx */ no = rw->find_bykidx(ch, p->kidx); KASSERT(no != NULL, ("Ref'd object %d disappeared", p->kidx)); no->refcnt--; } } /* * Remove references from every object used in @rule. * Used at rule removal code. */ static void unref_rule_objects(struct ip_fw_chain *ch, struct ip_fw *rule) { struct opcode_obj_rewrite *rw; struct named_object *no; ipfw_insn *cmd; + uint32_t kidx; int cmdlen, l; - uint16_t kidx; uint8_t subtype; IPFW_UH_WLOCK_ASSERT(ch); l = rule->cmd_len; cmd = rule->cmd; cmdlen = 0; for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); rw = find_op_rw(cmd, &kidx, &subtype); if (rw == NULL) continue; no = rw->find_bykidx(ch, kidx); KASSERT(no != NULL, ("object id %d not found", kidx)); KASSERT(no->subtype == subtype, ("wrong type %d (%d) for object id %d", no->subtype, subtype, kidx)); KASSERT(no->refcnt > 0, ("refcount for object %d is %d", kidx, no->refcnt)); if (no->refcnt == 1 && rw->destroy_object != NULL) rw->destroy_object(ch, no); else no->refcnt--; } } /* * Find and reference object (if any) stored in instruction @cmd. * * Saves object info in @pidx, sets * - @unresolved to 1 if object should exists but not found * * Returns non-zero value in case of error. 
*/ static int ref_opcode_object(struct ip_fw_chain *ch, ipfw_insn *cmd, struct tid_info *ti, struct obj_idx *pidx, int *unresolved) { struct named_object *no; struct opcode_obj_rewrite *rw; int error; /* Check if this opcode is candidate for rewrite */ rw = find_op_rw(cmd, &ti->uidx, &ti->type); if (rw == NULL) return (0); /* Need to rewrite. Save necessary fields */ pidx->uidx = ti->uidx; pidx->type = ti->type; /* Try to find referenced kernel object */ error = rw->find_byname(ch, ti, &no); if (error != 0) return (error); if (no == NULL) { /* * Report about unresolved object for automaic * creation. */ *unresolved = 1; return (0); } /* * Object is already exist. * Its subtype should match with expected value. */ if (ti->type != no->subtype) return (EINVAL); /* Bump refcount and update kidx. */ no->refcnt++; rw->update(cmd, no->kidx); return (0); } /* * Finds and bumps refcount for objects referenced by given @rule. * Auto-creates non-existing tables. * Fills in @oib array with userland/kernel indexes. * * Returns 0 on success. */ static int ref_rule_objects(struct ip_fw_chain *ch, struct ip_fw *rule, struct rule_check_info *ci, struct obj_idx *oib, struct tid_info *ti) { struct obj_idx *pidx; ipfw_insn *cmd; int cmdlen, error, l, unresolved; pidx = oib; l = rule->cmd_len; cmd = rule->cmd; cmdlen = 0; error = 0; IPFW_UH_WLOCK(ch); /* Increase refcount on each existing referenced table. */ for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); unresolved = 0; error = ref_opcode_object(ch, cmd, ti, pidx, &unresolved); if (error != 0) break; /* * Compatibility stuff for old clients: * prepare to automaitcally create non-existing objects. 
*/ if (unresolved != 0) { pidx->off = rule->cmd_len - l; pidx++; } } if (error != 0) { /* Unref everything we have already done */ unref_oib_objects(ch, rule->cmd, oib, pidx); IPFW_UH_WUNLOCK(ch); return (error); } IPFW_UH_WUNLOCK(ch); /* Perform auto-creation for non-existing objects */ if (pidx != oib) error = create_objects_compat(ch, rule->cmd, oib, pidx, ti); /* Calculate real number of dynamic objects */ ci->object_opcodes = (uint16_t)(pidx - oib); return (error); } /* * Checks is opcode is referencing table of appropriate type. * Adds reference count for found table if true. * Rewrites user-supplied opcode values with kernel ones. * * Returns 0 on success and appropriate error code otherwise. */ static int rewrite_rule_uidx(struct ip_fw_chain *chain, struct rule_check_info *ci) { int error; ipfw_insn *cmd; struct obj_idx *p, *pidx_first, *pidx_last; struct tid_info ti; /* * Prepare an array for storing opcode indices. * Use stack allocation by default. */ if (ci->object_opcodes <= (sizeof(ci->obuf)/sizeof(ci->obuf[0]))) { /* Stack */ pidx_first = ci->obuf; } else pidx_first = malloc( ci->object_opcodes * sizeof(struct obj_idx), M_IPFW, M_WAITOK | M_ZERO); error = 0; memset(&ti, 0, sizeof(ti)); /* Use set rule is assigned to. */ ti.set = ci->krule->set; if (ci->ctlv != NULL) { ti.tlvs = (void *)(ci->ctlv + 1); ti.tlen = ci->ctlv->head.length - sizeof(ipfw_obj_ctlv); } /* Reference all used tables and other objects */ error = ref_rule_objects(chain, ci->krule, ci, pidx_first, &ti); if (error != 0) goto free; /* * Note that ref_rule_objects() might have updated ci->object_opcodes * to reflect actual number of object opcodes. */ /* Perform rewrite of remaining opcodes */ p = pidx_first; pidx_last = pidx_first + ci->object_opcodes; for (p = pidx_first; p < pidx_last; p++) { cmd = ci->krule->cmd + p->off; update_opcode_kidx(cmd, p->kidx); } free: if (pidx_first != ci->obuf) free(pidx_first, M_IPFW); return (error); } /* - * Adds one or more rules to ipfw @chain. 
- * Data layout (version 0)(current): + * Parses one or more rules from userland. + * Data layout (version 1)(current): * Request: * [ * ip_fw3_opheader * [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional *1) * [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST) ip_fw x N ] (*2) (*3) * ] * Reply: * [ * ip_fw3_opheader * [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional) * [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST) ip_fw x N ] * ] * * Rules in reply are modified to store their actual ruleset number. * * (*1) TLVs inside IPFW_TLV_TBL_LIST needs to be sorted ascending * according to their idx field and there has to be no duplicates. * (*2) Numbered rules inside IPFW_TLV_RULE_LIST needs to be sorted ascending. * (*3) Each ip_fw structure needs to be aligned to u64 boundary. * * Returns 0 on success. */ -static int -add_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, - struct sockopt_data *sd) +static __noinline int +parse_rules_v1(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd, ipfw_obj_ctlv **prtlv, + struct rule_check_info **pci) { ipfw_obj_ctlv *ctlv, *rtlv, *tstate; ipfw_obj_ntlv *ntlv; - int clen, error, idx; - uint32_t count, read; + struct rule_check_info *ci, *cbuf; struct ip_fw_rule *r; - struct rule_check_info rci, *ci, *cbuf; - int i, rsize; + size_t count, clen, read, rsize; + uint32_t idx, rulenum; + int error; op3 = (ip_fw3_opheader *)ipfw_get_sopt_space(sd, sd->valsize); ctlv = (ipfw_obj_ctlv *)(op3 + 1); - read = sizeof(ip_fw3_opheader); - rtlv = NULL; - tstate = NULL; - cbuf = NULL; - memset(&rci, 0, sizeof(struct rule_check_info)); - if (read + sizeof(*ctlv) > sd->valsize) return (EINVAL); + rtlv = NULL; + tstate = NULL; + cbuf = NULL; + /* Table names or other named objects. */ if (ctlv->head.type == IPFW_TLV_TBLNAME_LIST) { + /* Check size and alignment. 
*/ clen = ctlv->head.length; - /* Check size and alignment */ - if (clen > sd->valsize || clen < sizeof(*ctlv)) - return (EINVAL); - if ((clen % sizeof(uint64_t)) != 0) + if (read + clen > sd->valsize || clen < sizeof(*ctlv) || + (clen % sizeof(uint64_t)) != 0) return (EINVAL); - - /* - * Some table names or other named objects. - * Check for validness. - */ + /* Check for validness. */ count = (ctlv->head.length - sizeof(*ctlv)) / sizeof(*ntlv); if (ctlv->count != count || ctlv->objsize != sizeof(*ntlv)) return (EINVAL); - /* * Check each TLV. * Ensure TLVs are sorted ascending and * there are no duplicates. */ - idx = -1; + idx = 0; ntlv = (ipfw_obj_ntlv *)(ctlv + 1); while (count > 0) { if (ntlv->head.length != sizeof(ipfw_obj_ntlv)) return (EINVAL); error = ipfw_check_object_name_generic(ntlv->name); if (error != 0) return (error); if (ntlv->idx <= idx) return (EINVAL); idx = ntlv->idx; count--; ntlv++; } tstate = ctlv; read += ctlv->head.length; ctlv = (ipfw_obj_ctlv *)((caddr_t)ctlv + ctlv->head.length); - } - if (read + sizeof(*ctlv) > sd->valsize) - return (EINVAL); + if (read + sizeof(*ctlv) > sd->valsize) + return (EINVAL); + } + /* List of rules. 
*/ if (ctlv->head.type == IPFW_TLV_RULE_LIST) { clen = ctlv->head.length; - if (clen + read > sd->valsize || clen < sizeof(*ctlv)) + if (read + clen > sd->valsize || clen < sizeof(*ctlv) || + (clen % sizeof(uint64_t)) != 0) return (EINVAL); - if ((clen % sizeof(uint64_t)) != 0) - return (EINVAL); - - /* - * TODO: Permit adding multiple rules at once - */ - if (ctlv->count != 1) - return (ENOTSUP); clen -= sizeof(*ctlv); - - if (ctlv->count > clen / sizeof(struct ip_fw_rule)) + if (ctlv->count == 0 || + ctlv->count > clen / sizeof(struct ip_fw_rule)) return (EINVAL); - /* Allocate state for each rule or use stack */ - if (ctlv->count == 1) { - memset(&rci, 0, sizeof(struct rule_check_info)); - cbuf = &rci; - } else - cbuf = malloc(ctlv->count * sizeof(*ci), M_TEMP, - M_WAITOK | M_ZERO); - ci = cbuf; + /* Allocate state for each rule */ + cbuf = malloc(ctlv->count * sizeof(struct rule_check_info), + M_TEMP, M_WAITOK | M_ZERO); /* * Check each rule for validness. * Ensure numbered rules are sorted ascending * and properly aligned */ - idx = 0; - r = (struct ip_fw_rule *)(ctlv + 1); + rulenum = 0; count = 0; error = 0; + ci = cbuf; + r = (struct ip_fw_rule *)(ctlv + 1); while (clen > 0) { - rsize = roundup2(RULESIZE(r), sizeof(uint64_t)); - if (rsize > clen || ctlv->count <= count) { + rsize = RULEUSIZE1(r); + if (rsize > clen || count > ctlv->count) { error = EINVAL; break; } - ci->ctlv = tstate; - error = check_ipfw_rule1(r, rsize, ci); + ci->version = IP_FW3_OPVER; + error = ipfw_check_rule(r, rsize, ci); if (error != 0) break; /* Check sorting */ - if (r->rulenum != 0 && r->rulenum < idx) { - printf("rulenum %d idx %d\n", r->rulenum, idx); + if (count != 0 && ((rulenum == 0) != (r->rulenum == 0) || + r->rulenum < rulenum)) { + printf("ipfw: wrong order: rulenum %u" + " vs %u\n", r->rulenum, rulenum); error = EINVAL; break; } - idx = r->rulenum; - + rulenum = r->rulenum; ci->urule = (caddr_t)r; - - rsize = roundup2(rsize, sizeof(uint64_t)); clen -= rsize; r = 
(struct ip_fw_rule *)((caddr_t)r + rsize); count++; ci++; } if (ctlv->count != count || error != 0) { - if (cbuf != &rci) - free(cbuf, M_TEMP); + free(cbuf, M_TEMP); return (EINVAL); } rtlv = ctlv; read += ctlv->head.length; ctlv = (ipfw_obj_ctlv *)((caddr_t)ctlv + ctlv->head.length); } - if (read != sd->valsize || rtlv == NULL || rtlv->count == 0) { - if (cbuf != NULL && cbuf != &rci) - free(cbuf, M_TEMP); + if (read != sd->valsize || rtlv == NULL) { + free(cbuf, M_TEMP); return (EINVAL); } + *prtlv = rtlv; + *pci = cbuf; + return (0); +} + +/* + * Copy rule @urule from v1 userland format (current) to kernel @krule. + */ +static void +import_rule_v1(struct ip_fw_chain *chain, struct rule_check_info *ci) +{ + struct ip_fw_rule *urule; + struct ip_fw *krule; + + urule = (struct ip_fw_rule *)ci->urule; + krule = ci->krule = ipfw_alloc_rule(chain, RULEKSIZE1(urule)); + + krule->act_ofs = urule->act_ofs; + krule->cmd_len = urule->cmd_len; + krule->rulenum = urule->rulenum; + krule->set = urule->set; + krule->flags = urule->flags; + + /* Save rulenum offset */ + ci->urule_numoff = offsetof(struct ip_fw_rule, rulenum); + + /* Copy opcodes */ + memcpy(krule->cmd, urule->cmd, krule->cmd_len * sizeof(uint32_t)); +} + +/* + * Adds one or more rules to ipfw @chain. + */ +static int +add_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + ipfw_obj_ctlv *rtlv; + struct rule_check_info *ci, *nci; + int i, ret; + /* - * Passed rules seems to be valid. - * Allocate storage and try to add them to chain. + * Check rules buffer for validness. 
*/ - for (i = 0, ci = cbuf; i < rtlv->count; i++, ci++) { - clen = RULEKSIZE1((struct ip_fw_rule *)ci->urule); - ci->krule = ipfw_alloc_rule(chain, clen); - import_rule1(ci); - } - - if ((error = commit_rules(chain, cbuf, rtlv->count)) != 0) { - /* Free allocate krules */ - for (i = 0, ci = cbuf; i < rtlv->count; i++, ci++) + ret = parse_rules_v1(chain, op3, sd, &rtlv, &nci); + if (ret != 0) + return (ret); + /* + * Allocate storage for the kernel representation of rules. + */ + for (i = 0, ci = nci; i < rtlv->count; i++, ci++) + import_rule_v1(chain, ci); + /* + * Try to add new rules to the chain. + */ + if ((ret = ipfw_commit_rules(chain, nci, rtlv->count)) != 0) { + for (i = 0, ci = nci; i < rtlv->count; i++, ci++) ipfw_free_rule(ci->krule); } - - if (cbuf != NULL && cbuf != &rci) - free(cbuf, M_TEMP); - - return (error); + /* Cleanup after parse_rules() */ + free(nci, M_TEMP); + return (ret); } /* * Lists all sopts currently registered. - * Data layout (v0)(current): + * Data layout (v1)(current): * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size * Reply: [ ipfw_obj_lheader ipfw_sopt_info x N ] * * Returns 0 on success */ static int dump_soptcodes(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { struct _ipfw_obj_lheader *olh; ipfw_sopt_info *i; struct ipfw_sopt_handler *sh; uint32_t count, n, size; - olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh)); + olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd, + sizeof(*olh)); if (olh == NULL) return (EINVAL); if (sd->valsize < olh->size) return (EINVAL); CTL3_LOCK(); count = ctl3_hsize; size = count * sizeof(ipfw_sopt_info) + sizeof(ipfw_obj_lheader); /* Fill in header regadless of buffer size */ olh->count = count; olh->objsize = sizeof(ipfw_sopt_info); if (size > olh->size) { olh->size = size; CTL3_UNLOCK(); return (ENOMEM); } olh->size = size; for (n = 1; n <= count; n++) { i = (ipfw_sopt_info *)ipfw_get_sopt_space(sd, sizeof(*i)); KASSERT(i != 
NULL, ("previously checked buffer is not enough")); sh = &ctl3_handlers[n]; i->opcode = sh->opcode; i->version = sh->version; i->refcnt = sh->refcnt; } CTL3_UNLOCK(); return (0); } /* * Compares two opcodes. * Used both in qsort() and bsearch(). * * Returns 0 if match is found. */ static int compare_opcodes(const void *_a, const void *_b) { const struct opcode_obj_rewrite *a, *b; a = (const struct opcode_obj_rewrite *)_a; b = (const struct opcode_obj_rewrite *)_b; if (a->opcode < b->opcode) return (-1); else if (a->opcode > b->opcode) return (1); return (0); } /* * XXX: Rewrite bsearch() */ static int find_op_rw_range(uint16_t op, struct opcode_obj_rewrite **plo, struct opcode_obj_rewrite **phi) { struct opcode_obj_rewrite *ctl3_max, *lo, *hi, h, *rw; memset(&h, 0, sizeof(h)); h.opcode = op; rw = (struct opcode_obj_rewrite *)bsearch(&h, ctl3_rewriters, ctl3_rsize, sizeof(h), compare_opcodes); if (rw == NULL) return (1); /* Find the first element matching the same opcode */ lo = rw; for ( ; lo > ctl3_rewriters && (lo - 1)->opcode == op; lo--) ; /* Find the last element matching the same opcode */ hi = rw; ctl3_max = ctl3_rewriters + ctl3_rsize; for ( ; (hi + 1) < ctl3_max && (hi + 1)->opcode == op; hi++) ; *plo = lo; *phi = hi; return (0); } /* * Finds opcode object rewriter based on @code. * * Returns pointer to handler or NULL. 
*/ static struct opcode_obj_rewrite * -find_op_rw(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype) +find_op_rw(ipfw_insn *cmd, uint32_t *puidx, uint8_t *ptype) { struct opcode_obj_rewrite *rw, *lo, *hi; - uint16_t uidx; + uint32_t uidx; uint8_t subtype; if (find_op_rw_range(cmd->opcode, &lo, &hi) != 0) return (NULL); for (rw = lo; rw <= hi; rw++) { if (rw->classifier(cmd, &uidx, &subtype) == 0) { if (puidx != NULL) *puidx = uidx; if (ptype != NULL) *ptype = subtype; return (rw); } } return (NULL); } int -classify_opcode_kidx(ipfw_insn *cmd, uint16_t *puidx) +classify_opcode_kidx(ipfw_insn *cmd, uint32_t *puidx) { if (find_op_rw(cmd, puidx, NULL) == NULL) return (1); return (0); } void -update_opcode_kidx(ipfw_insn *cmd, uint16_t idx) +update_opcode_kidx(ipfw_insn *cmd, uint32_t idx) { struct opcode_obj_rewrite *rw; rw = find_op_rw(cmd, NULL, NULL); KASSERT(rw != NULL, ("No handler to update opcode %d", cmd->opcode)); rw->update(cmd, idx); } void ipfw_init_obj_rewriter(void) { - ctl3_rewriters = NULL; ctl3_rsize = 0; } void ipfw_destroy_obj_rewriter(void) { - if (ctl3_rewriters != NULL) free(ctl3_rewriters, M_IPFW); ctl3_rewriters = NULL; ctl3_rsize = 0; } /* * Adds one or more opcode object rewrite handlers to the global array. * Function may sleep. 
*/ void ipfw_add_obj_rewriter(struct opcode_obj_rewrite *rw, size_t count) { size_t sz; struct opcode_obj_rewrite *tmp; CTL3_LOCK(); for (;;) { sz = ctl3_rsize + count; CTL3_UNLOCK(); tmp = malloc(sizeof(*rw) * sz, M_IPFW, M_WAITOK | M_ZERO); CTL3_LOCK(); if (ctl3_rsize + count <= sz) break; /* Retry */ free(tmp, M_IPFW); } /* Merge old & new arrays */ sz = ctl3_rsize + count; memcpy(tmp, ctl3_rewriters, ctl3_rsize * sizeof(*rw)); memcpy(&tmp[ctl3_rsize], rw, count * sizeof(*rw)); qsort(tmp, sz, sizeof(*rw), compare_opcodes); /* Switch new and free old */ if (ctl3_rewriters != NULL) free(ctl3_rewriters, M_IPFW); ctl3_rewriters = tmp; ctl3_rsize = sz; CTL3_UNLOCK(); } /* * Removes one or more object rewrite handlers from the global array. */ int ipfw_del_obj_rewriter(struct opcode_obj_rewrite *rw, size_t count) { size_t sz; struct opcode_obj_rewrite *ctl3_max, *ktmp, *lo, *hi; int i; CTL3_LOCK(); for (i = 0; i < count; i++) { if (find_op_rw_range(rw[i].opcode, &lo, &hi) != 0) continue; for (ktmp = lo; ktmp <= hi; ktmp++) { if (ktmp->classifier != rw[i].classifier) continue; ctl3_max = ctl3_rewriters + ctl3_rsize; sz = (ctl3_max - (ktmp + 1)) * sizeof(*ktmp); memmove(ktmp, ktmp + 1, sz); ctl3_rsize--; break; } } if (ctl3_rsize == 0) { if (ctl3_rewriters != NULL) free(ctl3_rewriters, M_IPFW); ctl3_rewriters = NULL; } CTL3_UNLOCK(); return (0); } static int export_objhash_ntlv_internal(struct namedobj_instance *ni, struct named_object *no, void *arg) { struct sockopt_data *sd; ipfw_obj_ntlv *ntlv; sd = (struct sockopt_data *)arg; ntlv = (ipfw_obj_ntlv *)ipfw_get_sopt_space(sd, sizeof(*ntlv)); if (ntlv == NULL) return (ENOMEM); ipfw_export_obj_ntlv(no, ntlv); return (0); } /* * Lists all service objects. 
* Data layout (v0)(current): * Request: [ ipfw_obj_lheader ] size = ipfw_obj_lheader.size * Reply: [ ipfw_obj_lheader [ ipfw_obj_ntlv x N ] (optional) ] * Returns 0 on success */ static int dump_srvobjects(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_lheader *hdr; int count; hdr = (ipfw_obj_lheader *)ipfw_get_sopt_header(sd, sizeof(*hdr)); if (hdr == NULL) return (EINVAL); IPFW_UH_RLOCK(chain); count = ipfw_objhash_count(CHAIN_TO_SRV(chain)); hdr->size = sizeof(ipfw_obj_lheader) + count * sizeof(ipfw_obj_ntlv); if (sd->valsize < hdr->size) { IPFW_UH_RUNLOCK(chain); return (ENOMEM); } hdr->count = count; hdr->objsize = sizeof(ipfw_obj_ntlv); if (count > 0) ipfw_objhash_foreach(CHAIN_TO_SRV(chain), export_objhash_ntlv_internal, sd); IPFW_UH_RUNLOCK(chain); return (0); } +void +ipfw_enable_skipto_cache(struct ip_fw_chain *chain) +{ + + IPFW_UH_WLOCK_ASSERT(chain); + update_skipto_cache(chain, chain->map); + + IPFW_WLOCK(chain); + swap_skipto_cache(chain); + V_skipto_cache = 1; + IPFW_WUNLOCK(chain); +} + +/* + * Enables or disable skipto cache. + * Request: [ ipfw_cmd_header ] size = ipfw_cmd_header.size + * Reply: [ ipfw_cmd_header ] + * Returns 0 on success + */ +static int +manage_skiptocache(struct ip_fw_chain *chain, ip_fw3_opheader *op3, + struct sockopt_data *sd) +{ + ipfw_cmd_header *hdr; + + if (sd->valsize != sizeof(*hdr)) + return (EINVAL); + + hdr = (ipfw_cmd_header *)ipfw_get_sopt_space(sd, sd->valsize); + if (hdr->cmd != SKIPTO_CACHE_DISABLE && + hdr->cmd != SKIPTO_CACHE_ENABLE) + return (EOPNOTSUPP); + + IPFW_UH_WLOCK(chain); + if (hdr->cmd != V_skipto_cache) { + if (hdr->cmd == SKIPTO_CACHE_ENABLE) + ipfw_enable_skipto_cache(chain); + V_skipto_cache = hdr->cmd; + } + IPFW_UH_WUNLOCK(chain); + return (0); +} + /* * Compares two sopt handlers (code, version and handler ptr). * Used both as qsort() and bsearch(). * Does not compare handler for latter case. * * Returns 0 if match is found. 
*/ static int compare_sh(const void *_a, const void *_b) { const struct ipfw_sopt_handler *a, *b; a = (const struct ipfw_sopt_handler *)_a; b = (const struct ipfw_sopt_handler *)_b; if (a->opcode < b->opcode) return (-1); else if (a->opcode > b->opcode) return (1); if (a->version < b->version) return (-1); else if (a->version > b->version) return (1); /* bsearch helper */ if (a->handler == NULL) return (0); if ((uintptr_t)a->handler < (uintptr_t)b->handler) return (-1); else if ((uintptr_t)a->handler > (uintptr_t)b->handler) return (1); return (0); } /* * Finds sopt handler based on @code and @version. * * Returns pointer to handler or NULL. */ static struct ipfw_sopt_handler * find_sh(uint16_t code, uint8_t version, sopt_handler_f *handler) { struct ipfw_sopt_handler *sh, h; memset(&h, 0, sizeof(h)); h.opcode = code; h.version = version; h.handler = handler; sh = (struct ipfw_sopt_handler *)bsearch(&h, ctl3_handlers, ctl3_hsize, sizeof(h), compare_sh); return (sh); } static int find_ref_sh(uint16_t opcode, uint8_t version, struct ipfw_sopt_handler *psh) { struct ipfw_sopt_handler *sh; CTL3_LOCK(); if ((sh = find_sh(opcode, version, NULL)) == NULL) { CTL3_UNLOCK(); printf("ipfw: ipfw_ctl3 invalid option %d""v""%d\n", opcode, version); return (EINVAL); } sh->refcnt++; ctl3_refct++; /* Copy handler data to requested buffer */ *psh = *sh; CTL3_UNLOCK(); return (0); } static void find_unref_sh(struct ipfw_sopt_handler *psh) { struct ipfw_sopt_handler *sh; CTL3_LOCK(); sh = find_sh(psh->opcode, psh->version, NULL); KASSERT(sh != NULL, ("ctl3 handler disappeared")); sh->refcnt--; ctl3_refct--; CTL3_UNLOCK(); } void ipfw_init_sopt_handler(void) { - CTL3_LOCK_INIT(); IPFW_ADD_SOPT_HANDLER(1, scodes); } void ipfw_destroy_sopt_handler(void) { - IPFW_DEL_SOPT_HANDLER(1, scodes); CTL3_LOCK_DESTROY(); } +void +ipfw_register_compat(ipfw_check_opcode_t f) +{ + check_opcode_f = f; +} + +void +ipfw_unregister_compat(void) +{ + check_opcode_f = check_opcode_compat_nop; +} + /* * 
Adds one or more sockopt handlers to the global array. * Function may sleep. */ void ipfw_add_sopt_handler(struct ipfw_sopt_handler *sh, size_t count) { size_t sz; struct ipfw_sopt_handler *tmp; CTL3_LOCK(); for (;;) { sz = ctl3_hsize + count; CTL3_UNLOCK(); tmp = malloc(sizeof(*sh) * sz, M_IPFW, M_WAITOK | M_ZERO); CTL3_LOCK(); if (ctl3_hsize + count <= sz) break; /* Retry */ free(tmp, M_IPFW); } /* Merge old & new arrays */ sz = ctl3_hsize + count; memcpy(tmp, ctl3_handlers, ctl3_hsize * sizeof(*sh)); memcpy(&tmp[ctl3_hsize], sh, count * sizeof(*sh)); qsort(tmp, sz, sizeof(*sh), compare_sh); /* Switch new and free old */ if (ctl3_handlers != NULL) free(ctl3_handlers, M_IPFW); ctl3_handlers = tmp; ctl3_hsize = sz; ctl3_gencnt++; CTL3_UNLOCK(); } /* * Removes one or more sockopt handlers from the global array. */ int ipfw_del_sopt_handler(struct ipfw_sopt_handler *sh, size_t count) { size_t sz; struct ipfw_sopt_handler *tmp, *h; int i; CTL3_LOCK(); for (i = 0; i < count; i++) { tmp = &sh[i]; h = find_sh(tmp->opcode, tmp->version, tmp->handler); if (h == NULL) continue; sz = (ctl3_handlers + ctl3_hsize - (h + 1)) * sizeof(*h); memmove(h, h + 1, sz); ctl3_hsize--; } if (ctl3_hsize == 0) { if (ctl3_handlers != NULL) free(ctl3_handlers, M_IPFW); ctl3_handlers = NULL; } ctl3_gencnt++; CTL3_UNLOCK(); return (0); } /* * Writes data accumulated in @sd to sockopt buffer. * Zeroes internal @sd buffer. 
*/ static int ipfw_flush_sopt_data(struct sockopt_data *sd) { struct sockopt *sopt; int error; size_t sz; sz = sd->koff; if (sz == 0) return (0); sopt = sd->sopt; if (sopt->sopt_dir == SOPT_GET) { error = copyout(sd->kbuf, sopt->sopt_val, sz); if (error != 0) return (error); } memset(sd->kbuf, 0, sd->ksize); sd->ktotal += sz; sd->koff = 0; if (sd->ktotal + sd->ksize < sd->valsize) sd->kavail = sd->ksize; else sd->kavail = sd->valsize - sd->ktotal; /* Update sopt buffer data */ sopt->sopt_valsize = sd->ktotal; sopt->sopt_val = sd->sopt_val + sd->ktotal; return (0); } /* * Ensures that @sd buffer has contiguous @neeeded number of * bytes. * * Returns pointer to requested space or NULL. */ caddr_t ipfw_get_sopt_space(struct sockopt_data *sd, size_t needed) { int error; caddr_t addr; if (sd->kavail < needed) { /* * Flush data and try another time. */ error = ipfw_flush_sopt_data(sd); if (sd->kavail < needed || error != 0) return (NULL); } addr = sd->kbuf + sd->koff; sd->koff += needed; sd->kavail -= needed; return (addr); } /* * Requests @needed contiguous bytes from @sd buffer. * Function is used to notify subsystem that we are * interesed in first @needed bytes (request header) * and the rest buffer can be safely zeroed. * * Returns pointer to requested space or NULL. */ caddr_t ipfw_get_sopt_header(struct sockopt_data *sd, size_t needed) { caddr_t addr; if ((addr = ipfw_get_sopt_space(sd, needed)) == NULL) return (NULL); if (sd->kavail > 0) memset(sd->kbuf + sd->koff, 0, sd->kavail); return (addr); } /* * New sockopt handler. 
*/ int ipfw_ctl3(struct sockopt *sopt) { int error, locked; size_t size, valsize; struct ip_fw_chain *chain; char xbuf[256]; struct sockopt_data sdata; struct ipfw_sopt_handler h; ip_fw3_opheader *op3 = NULL; error = priv_check(sopt->sopt_td, PRIV_NETINET_IPFW); if (error != 0) return (error); if (sopt->sopt_name != IP_FW3) - return (ipfw_ctl(sopt)); + return (EOPNOTSUPP); chain = &V_layer3_chain; error = 0; /* Save original valsize before it is altered via sooptcopyin() */ valsize = sopt->sopt_valsize; memset(&sdata, 0, sizeof(sdata)); /* Read op3 header first to determine actual operation */ op3 = (ip_fw3_opheader *)xbuf; error = sooptcopyin(sopt, op3, sizeof(*op3), sizeof(*op3)); if (error != 0) return (error); sopt->sopt_valsize = valsize; /* * Find and reference command. */ error = find_ref_sh(op3->opcode, op3->version, &h); if (error != 0) return (error); /* * Disallow modifications in really-really secure mode, but still allow * the logging counters to be reset. */ if ((h.dir & HDIR_SET) != 0 && h.opcode != IP_FW_XRESETLOG) { error = securelevel_ge(sopt->sopt_td->td_ucred, 3); if (error != 0) { find_unref_sh(&h); return (error); } } /* * Fill in sockopt_data structure that may be useful for * IP_FW3 get requests. */ locked = 0; if (valsize <= sizeof(xbuf)) { /* use on-stack buffer */ sdata.kbuf = xbuf; sdata.ksize = sizeof(xbuf); sdata.kavail = valsize; } else { /* * Determine opcode type/buffer size: * allocate sliding-window buf for data export or * contiguous buffer for special ops. */ if ((h.dir & HDIR_SET) != 0) { /* Set request. Allocate contigous buffer. */ if (valsize > CTL3_LARGEBUF) { find_unref_sh(&h); return (EFBIG); } size = valsize; } else { /* Get request. 
Allocate sliding window buffer */ size = (valsizesopt_val, valsize); if (error != 0) return (error); locked = 1; } } sdata.kbuf = malloc(size, M_TEMP, M_WAITOK | M_ZERO); sdata.ksize = size; sdata.kavail = size; } sdata.sopt = sopt; sdata.sopt_val = sopt->sopt_val; sdata.valsize = valsize; /* * Copy either all request (if valsize < bsize_max) * or first bsize_max bytes to guarantee most consumers * that all necessary data has been copied). * Anyway, copy not less than sizeof(ip_fw3_opheader). */ if ((error = sooptcopyin(sopt, sdata.kbuf, sdata.ksize, sizeof(ip_fw3_opheader))) != 0) return (error); op3 = (ip_fw3_opheader *)sdata.kbuf; /* Finally, run handler */ error = h.handler(chain, op3, &sdata); find_unref_sh(&h); /* Flush state and free buffers */ if (error == 0) error = ipfw_flush_sopt_data(&sdata); else ipfw_flush_sopt_data(&sdata); if (locked != 0) vsunlock(sdata.sopt_val, valsize); /* Restore original pointer and set number of bytes written */ sopt->sopt_val = sdata.sopt_val; sopt->sopt_valsize = sdata.ktotal; if (sdata.kbuf != xbuf) free(sdata.kbuf, M_TEMP); return (error); } -/** - * {set|get}sockopt parser. - */ -int -ipfw_ctl(struct sockopt *sopt) -{ -#define RULE_MAXSIZE (512*sizeof(u_int32_t)) - int error; - size_t size; - struct ip_fw *buf; - struct ip_fw_rule0 *rule; - struct ip_fw_chain *chain; - u_int32_t rulenum[2]; - uint32_t opt; - struct rule_check_info ci; - IPFW_RLOCK_TRACKER; - - chain = &V_layer3_chain; - error = 0; - - opt = sopt->sopt_name; - - /* - * Disallow modifications in really-really secure mode, but still allow - * the logging counters to be reset. - */ - if (opt == IP_FW_ADD || - (sopt->sopt_dir == SOPT_SET && opt != IP_FW_RESETLOG)) { - error = securelevel_ge(sopt->sopt_td->td_ucred, 3); - if (error != 0) - return (error); - } - - switch (opt) { - case IP_FW_GET: - /* - * pass up a copy of the current rules. 
Static rules - * come first (the last of which has number IPFW_DEFAULT_RULE), - * followed by a possibly empty list of dynamic rule. - * The last dynamic rule has NULL in the "next" field. - * - * Note that the calculated size is used to bound the - * amount of data returned to the user. The rule set may - * change between calculating the size and returning the - * data in which case we'll just return what fits. - */ - for (;;) { - int len = 0, want; - - size = chain->static_len; - size += ipfw_dyn_len(); - if (size >= sopt->sopt_valsize) - break; - buf = malloc(size, M_TEMP, M_WAITOK | M_ZERO); - IPFW_UH_RLOCK(chain); - /* check again how much space we need */ - want = chain->static_len + ipfw_dyn_len(); - if (size >= want) - len = ipfw_getrules(chain, buf, size); - IPFW_UH_RUNLOCK(chain); - if (size >= want) - error = sooptcopyout(sopt, buf, len); - free(buf, M_TEMP); - if (size >= want) - break; - } - break; - - case IP_FW_FLUSH: - /* locking is done within del_entry() */ - error = del_entry(chain, 0); /* special case, rule=0, cmd=0 means all */ - break; - - case IP_FW_ADD: - rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK); - error = sooptcopyin(sopt, rule, RULE_MAXSIZE, - sizeof(struct ip_fw7) ); - - memset(&ci, 0, sizeof(struct rule_check_info)); - - /* - * If the size of commands equals RULESIZE7 then we assume - * a FreeBSD7.2 binary is talking to us (set is7=1). - * is7 is persistent so the next 'ipfw list' command - * will use this format. - * NOTE: If wrong version is guessed (this can happen if - * the first ipfw command is 'ipfw [pipe] list') - * the ipfw binary may crash or loop infinitly... 
- */ - size = sopt->sopt_valsize; - if (size == RULESIZE7(rule)) { - is7 = 1; - error = convert_rule_to_8(rule); - if (error) { - free(rule, M_TEMP); - return error; - } - size = RULESIZE(rule); - } else - is7 = 0; - if (error == 0) - error = check_ipfw_rule0(rule, size, &ci); - if (error == 0) { - /* locking is done within add_rule() */ - struct ip_fw *krule; - krule = ipfw_alloc_rule(chain, RULEKSIZE0(rule)); - ci.urule = (caddr_t)rule; - ci.krule = krule; - import_rule0(&ci); - error = commit_rules(chain, &ci, 1); - if (error != 0) - ipfw_free_rule(ci.krule); - else if (sopt->sopt_dir == SOPT_GET) { - if (is7) { - error = convert_rule_to_7(rule); - size = RULESIZE7(rule); - if (error) { - free(rule, M_TEMP); - return error; - } - } - error = sooptcopyout(sopt, rule, size); - } - } - free(rule, M_TEMP); - break; - - case IP_FW_DEL: - /* - * IP_FW_DEL is used for deleting single rules or sets, - * and (ab)used to atomically manipulate sets. Argument size - * is used to distinguish between the two: - * sizeof(u_int32_t) - * delete single rule or set of rules, - * or reassign rules (or sets) to a different set. - * 2*sizeof(u_int32_t) - * atomic disable/enable sets. - * first u_int32_t contains sets to be disabled, - * second u_int32_t contains sets to be enabled. 
- */ - error = sooptcopyin(sopt, rulenum, - 2*sizeof(u_int32_t), sizeof(u_int32_t)); - if (error) - break; - size = sopt->sopt_valsize; - if (size == sizeof(u_int32_t) && rulenum[0] != 0) { - /* delete or reassign, locking done in del_entry() */ - error = del_entry(chain, rulenum[0]); - } else if (size == 2*sizeof(u_int32_t)) { /* set enable/disable */ - IPFW_UH_WLOCK(chain); - V_set_disable = - (V_set_disable | rulenum[0]) & ~rulenum[1] & - ~(1<sopt_val != 0) { - error = sooptcopyin(sopt, rulenum, - sizeof(u_int32_t), sizeof(u_int32_t)); - if (error) - break; - } - error = zero_entry(chain, rulenum[0], - sopt->sopt_name == IP_FW_RESETLOG); - break; - - /*--- TABLE opcodes ---*/ - case IP_FW_TABLE_ADD: - case IP_FW_TABLE_DEL: - { - ipfw_table_entry ent; - struct tentry_info tei; - struct tid_info ti; - struct table_value v; - - error = sooptcopyin(sopt, &ent, - sizeof(ent), sizeof(ent)); - if (error) - break; - - memset(&tei, 0, sizeof(tei)); - tei.paddr = &ent.addr; - tei.subtype = AF_INET; - tei.masklen = ent.masklen; - ipfw_import_table_value_legacy(ent.value, &v); - tei.pvalue = &v; - memset(&ti, 0, sizeof(ti)); - ti.uidx = ent.tbl; - ti.type = IPFW_TABLE_CIDR; - - error = (opt == IP_FW_TABLE_ADD) ? 
- add_table_entry(chain, &ti, &tei, 0, 1) : - del_table_entry(chain, &ti, &tei, 0, 1); - } - break; - - case IP_FW_TABLE_FLUSH: - { - u_int16_t tbl; - struct tid_info ti; - - error = sooptcopyin(sopt, &tbl, - sizeof(tbl), sizeof(tbl)); - if (error) - break; - memset(&ti, 0, sizeof(ti)); - ti.uidx = tbl; - error = flush_table(chain, &ti); - } - break; - - case IP_FW_TABLE_GETSIZE: - { - u_int32_t tbl, cnt; - struct tid_info ti; - - if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl), - sizeof(tbl)))) - break; - memset(&ti, 0, sizeof(ti)); - ti.uidx = tbl; - IPFW_RLOCK(chain); - error = ipfw_count_table(chain, &ti, &cnt); - IPFW_RUNLOCK(chain); - if (error) - break; - error = sooptcopyout(sopt, &cnt, sizeof(cnt)); - } - break; - - case IP_FW_TABLE_LIST: - { - ipfw_table *tbl; - struct tid_info ti; - - if (sopt->sopt_valsize < sizeof(*tbl)) { - error = EINVAL; - break; - } - size = sopt->sopt_valsize; - tbl = malloc(size, M_TEMP, M_WAITOK); - error = sooptcopyin(sopt, tbl, size, sizeof(*tbl)); - if (error) { - free(tbl, M_TEMP); - break; - } - tbl->size = (size - sizeof(*tbl)) / - sizeof(ipfw_table_entry); - memset(&ti, 0, sizeof(ti)); - ti.uidx = tbl->tbl; - IPFW_RLOCK(chain); - error = ipfw_dump_table_legacy(chain, &ti, tbl); - IPFW_RUNLOCK(chain); - if (error) { - free(tbl, M_TEMP); - break; - } - error = sooptcopyout(sopt, tbl, size); - free(tbl, M_TEMP); - } - break; - - /*--- NAT operations are protected by the IPFW_LOCK ---*/ - case IP_FW_NAT_CFG: - if (IPFW_NAT_LOADED) - error = ipfw_nat_cfg_ptr(sopt); - else { - printf("IP_FW_NAT_CFG: %s\n", - "ipfw_nat not present, please load it"); - error = EINVAL; - } - break; - - case IP_FW_NAT_DEL: - if (IPFW_NAT_LOADED) - error = ipfw_nat_del_ptr(sopt); - else { - printf("IP_FW_NAT_DEL: %s\n", - "ipfw_nat not present, please load it"); - error = EINVAL; - } - break; - - case IP_FW_NAT_GET_CONFIG: - if (IPFW_NAT_LOADED) - error = ipfw_nat_get_cfg_ptr(sopt); - else { - printf("IP_FW_NAT_GET_CFG: %s\n", - "ipfw_nat not 
present, please load it"); - error = EINVAL; - } - break; - - case IP_FW_NAT_GET_LOG: - if (IPFW_NAT_LOADED) - error = ipfw_nat_get_log_ptr(sopt); - else { - printf("IP_FW_NAT_GET_LOG: %s\n", - "ipfw_nat not present, please load it"); - error = EINVAL; - } - break; - - default: - printf("ipfw: ipfw_ctl invalid option %d\n", sopt->sopt_name); - error = EINVAL; - } - - return (error); -#undef RULE_MAXSIZE -} -#define RULE_MAXSIZE (256*sizeof(u_int32_t)) - -/* Functions to convert rules 7.2 <==> 8.0 */ -static int -convert_rule_to_7(struct ip_fw_rule0 *rule) -{ - /* Used to modify original rule */ - struct ip_fw7 *rule7 = (struct ip_fw7 *)rule; - /* copy of original rule, version 8 */ - struct ip_fw_rule0 *tmp; - - /* Used to copy commands */ - ipfw_insn *ccmd, *dst; - int ll = 0, ccmdlen = 0; - - tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO); - if (tmp == NULL) { - return 1; //XXX error - } - bcopy(rule, tmp, RULE_MAXSIZE); - - /* Copy fields */ - //rule7->_pad = tmp->_pad; - rule7->set = tmp->set; - rule7->rulenum = tmp->rulenum; - rule7->cmd_len = tmp->cmd_len; - rule7->act_ofs = tmp->act_ofs; - rule7->next_rule = (struct ip_fw7 *)tmp->next_rule; - rule7->cmd_len = tmp->cmd_len; - rule7->pcnt = tmp->pcnt; - rule7->bcnt = tmp->bcnt; - rule7->timestamp = tmp->timestamp; - - /* Copy commands */ - for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule7->cmd ; - ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) { - ccmdlen = F_LEN(ccmd); - - bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t)); - - if (dst->opcode > O_NAT) - /* O_REASS doesn't exists in 7.2 version, so - * decrement opcode if it is after O_REASS - */ - dst->opcode--; - - if (ccmdlen > ll) { - printf("ipfw: opcode %d size truncated\n", - ccmd->opcode); - return EINVAL; - } - } - free(tmp, M_TEMP); - - return 0; -} - -static int -convert_rule_to_8(struct ip_fw_rule0 *rule) -{ - /* Used to modify original rule */ - struct ip_fw7 *rule7 = (struct ip_fw7 *) rule; - - /* Used to copy commands */ - 
ipfw_insn *ccmd, *dst; - int ll = 0, ccmdlen = 0; - - /* Copy of original rule */ - struct ip_fw7 *tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO); - if (tmp == NULL) { - return 1; //XXX error - } - - bcopy(rule7, tmp, RULE_MAXSIZE); - - for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule->cmd ; - ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) { - ccmdlen = F_LEN(ccmd); - - bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t)); - - if (dst->opcode > O_NAT) - /* O_REASS doesn't exists in 7.2 version, so - * increment opcode if it is after O_REASS - */ - dst->opcode++; - - if (ccmdlen > ll) { - printf("ipfw: opcode %d size truncated\n", - ccmd->opcode); - return EINVAL; - } - } - - rule->_pad = tmp->_pad; - rule->set = tmp->set; - rule->rulenum = tmp->rulenum; - rule->cmd_len = tmp->cmd_len; - rule->act_ofs = tmp->act_ofs; - rule->next_rule = (struct ip_fw *)tmp->next_rule; - rule->cmd_len = tmp->cmd_len; - rule->id = 0; /* XXX see if is ok = 0 */ - rule->pcnt = tmp->pcnt; - rule->bcnt = tmp->bcnt; - rule->timestamp = tmp->timestamp; - - free (tmp, M_TEMP); - return 0; -} - /* * Named object api * */ void ipfw_init_srv(struct ip_fw_chain *ch) { - - ch->srvmap = ipfw_objhash_create(IPFW_OBJECTS_DEFAULT); + ch->srvmap = ipfw_objhash_create(IPFW_OBJECTS_DEFAULT, + DEFAULT_OBJHASH_SIZE); ch->srvstate = malloc(sizeof(void *) * IPFW_OBJECTS_DEFAULT, M_IPFW, M_WAITOK | M_ZERO); } void ipfw_destroy_srv(struct ip_fw_chain *ch) { - free(ch->srvstate, M_IPFW); ipfw_objhash_destroy(ch->srvmap); } /* * Allocate new bitmask which can be used to enlarge/shrink * named instance index. 
*/ void ipfw_objhash_bitmap_alloc(uint32_t items, void **idx, int *pblocks) { size_t size; int max_blocks; u_long *idx_mask; KASSERT((items % BLOCK_ITEMS) == 0, ("bitmask size needs to power of 2 and greater or equal to %zu", BLOCK_ITEMS)); max_blocks = items / BLOCK_ITEMS; size = items / 8; idx_mask = malloc(size * IPFW_MAX_SETS, M_IPFW, M_WAITOK); /* Mark all as free */ memset(idx_mask, 0xFF, size * IPFW_MAX_SETS); *idx_mask &= ~(u_long)1; /* Skip index 0 */ *idx = idx_mask; *pblocks = max_blocks; } /* * Copy current bitmask index to new one. */ void ipfw_objhash_bitmap_merge(struct namedobj_instance *ni, void **idx, int *blocks) { int old_blocks, new_blocks; u_long *old_idx, *new_idx; int i; old_idx = ni->idx_mask; old_blocks = ni->max_blocks; new_idx = *idx; new_blocks = *blocks; for (i = 0; i < IPFW_MAX_SETS; i++) { memcpy(&new_idx[new_blocks * i], &old_idx[old_blocks * i], old_blocks * sizeof(u_long)); } } /* * Swaps current @ni index with new one. */ void ipfw_objhash_bitmap_swap(struct namedobj_instance *ni, void **idx, int *blocks) { int old_blocks; u_long *old_idx; old_idx = ni->idx_mask; old_blocks = ni->max_blocks; ni->idx_mask = *idx; ni->max_blocks = *blocks; /* Save old values */ *idx = old_idx; *blocks = old_blocks; } void ipfw_objhash_bitmap_free(void *idx, int blocks) { - free(idx, M_IPFW); } /* * Creates named hash instance. * Must be called without holding any locks. * Return pointer to new instance. 
*/ struct namedobj_instance * -ipfw_objhash_create(uint32_t items) +ipfw_objhash_create(uint32_t items, size_t hash_size) { struct namedobj_instance *ni; int i; size_t size; size = sizeof(struct namedobj_instance) + - sizeof(struct namedobjects_head) * NAMEDOBJ_HASH_SIZE + - sizeof(struct namedobjects_head) * NAMEDOBJ_HASH_SIZE; + sizeof(struct namedobjects_head) * hash_size + + sizeof(struct namedobjects_head) * hash_size; ni = malloc(size, M_IPFW, M_WAITOK | M_ZERO); - ni->nn_size = NAMEDOBJ_HASH_SIZE; - ni->nv_size = NAMEDOBJ_HASH_SIZE; + ni->nn_size = hash_size; + ni->nv_size = hash_size; ni->names = (struct namedobjects_head *)(ni +1); ni->values = &ni->names[ni->nn_size]; for (i = 0; i < ni->nn_size; i++) TAILQ_INIT(&ni->names[i]); for (i = 0; i < ni->nv_size; i++) TAILQ_INIT(&ni->values[i]); /* Set default hashing/comparison functions */ ni->hash_f = objhash_hash_name; ni->cmp_f = objhash_cmp_name; /* Allocate bitmask separately due to possible resize */ ipfw_objhash_bitmap_alloc(items, (void*)&ni->idx_mask, &ni->max_blocks); return (ni); } void ipfw_objhash_destroy(struct namedobj_instance *ni) { - free(ni->idx_mask, M_IPFW); free(ni, M_IPFW); } void ipfw_objhash_set_funcs(struct namedobj_instance *ni, objhash_hash_f *hash_f, objhash_cmp_f *cmp_f) { ni->hash_f = hash_f; ni->cmp_f = cmp_f; } static uint32_t objhash_hash_name(struct namedobj_instance *ni, const void *name, uint32_t set) { return (fnv_32_str((const char *)name, FNV1_32_INIT)); } static int objhash_cmp_name(struct named_object *no, const void *name, uint32_t set) { if ((strcmp(no->name, (const char *)name) == 0) && (no->set == set)) return (0); return (1); } static uint32_t objhash_hash_idx(struct namedobj_instance *ni, uint32_t val) { uint32_t v; v = val % (ni->nv_size - 1); return (v); } struct named_object * ipfw_objhash_lookup_name(struct namedobj_instance *ni, uint32_t set, const char *name) { struct named_object *no; uint32_t hash; hash = ni->hash_f(ni, name, set) % ni->nn_size; 
TAILQ_FOREACH(no, &ni->names[hash], nn_next) { if (ni->cmp_f(no, name, set) == 0) return (no); } return (NULL); } /* * Find named object by @uid. * Check @tlvs for valid data inside. * * Returns pointer to found TLV or NULL. */ ipfw_obj_ntlv * -ipfw_find_name_tlv_type(void *tlvs, int len, uint16_t uidx, uint32_t etlv) +ipfw_find_name_tlv_type(void *tlvs, int len, uint32_t uidx, uint32_t etlv) { ipfw_obj_ntlv *ntlv; uintptr_t pa, pe; int l; pa = (uintptr_t)tlvs; pe = pa + len; l = 0; for (; pa < pe; pa += l) { ntlv = (ipfw_obj_ntlv *)pa; l = ntlv->head.length; if (l != sizeof(*ntlv)) return (NULL); if (ntlv->idx != uidx) continue; /* * When userland has specified zero TLV type, do * not compare it with eltv. In some cases userland * doesn't know what type should it have. Use only * uidx and name for search named_object. */ if (ntlv->head.type != 0 && ntlv->head.type != (uint16_t)etlv) continue; if (ipfw_check_object_name_generic(ntlv->name) != 0) return (NULL); return (ntlv); } return (NULL); } /* * Finds object config based on either legacy index * or name in ntlv. * Note @ti structure contains unchecked data from userland. * * Returns 0 in success and fills in @pno with found config */ int ipfw_objhash_find_type(struct namedobj_instance *ni, struct tid_info *ti, uint32_t etlv, struct named_object **pno) { char *name; ipfw_obj_ntlv *ntlv; uint32_t set; if (ti->tlvs == NULL) return (EINVAL); ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx, etlv); if (ntlv == NULL) return (EINVAL); name = ntlv->name; /* * Use set provided by @ti instead of @ntlv one. * This is needed due to different sets behavior * controlled by V_fw_tables_sets. */ set = ti->set; *pno = ipfw_objhash_lookup_name(ni, set, name); if (*pno == NULL) return (ESRCH); return (0); } /* * Find named object by name, considering also its TLV type. 
*/ struct named_object * ipfw_objhash_lookup_name_type(struct namedobj_instance *ni, uint32_t set, uint32_t type, const char *name) { struct named_object *no; uint32_t hash; hash = ni->hash_f(ni, name, set) % ni->nn_size; TAILQ_FOREACH(no, &ni->names[hash], nn_next) { if (ni->cmp_f(no, name, set) == 0 && no->etlv == (uint16_t)type) return (no); } return (NULL); } struct named_object * -ipfw_objhash_lookup_kidx(struct namedobj_instance *ni, uint16_t kidx) +ipfw_objhash_lookup_kidx(struct namedobj_instance *ni, uint32_t kidx) { struct named_object *no; uint32_t hash; hash = objhash_hash_idx(ni, kidx); TAILQ_FOREACH(no, &ni->values[hash], nv_next) { if (no->kidx == kidx) return (no); } return (NULL); } int ipfw_objhash_same_name(struct namedobj_instance *ni, struct named_object *a, struct named_object *b) { if ((strcmp(a->name, b->name) == 0) && a->set == b->set) return (1); return (0); } void ipfw_objhash_add(struct namedobj_instance *ni, struct named_object *no) { uint32_t hash; hash = ni->hash_f(ni, no->name, no->set) % ni->nn_size; TAILQ_INSERT_HEAD(&ni->names[hash], no, nn_next); hash = objhash_hash_idx(ni, no->kidx); TAILQ_INSERT_HEAD(&ni->values[hash], no, nv_next); ni->count++; } void ipfw_objhash_del(struct namedobj_instance *ni, struct named_object *no) { uint32_t hash; hash = ni->hash_f(ni, no->name, no->set) % ni->nn_size; TAILQ_REMOVE(&ni->names[hash], no, nn_next); hash = objhash_hash_idx(ni, no->kidx); TAILQ_REMOVE(&ni->values[hash], no, nv_next); ni->count--; } uint32_t ipfw_objhash_count(struct namedobj_instance *ni) { return (ni->count); } uint32_t ipfw_objhash_count_type(struct namedobj_instance *ni, uint16_t type) { struct named_object *no; uint32_t count; int i; count = 0; for (i = 0; i < ni->nn_size; i++) { TAILQ_FOREACH(no, &ni->names[i], nn_next) { if (no->etlv == type) count++; } } return (count); } /* * Runs @func for each found named object. 
* It is safe to delete objects from callback */ int ipfw_objhash_foreach(struct namedobj_instance *ni, objhash_cb_t *f, void *arg) { struct named_object *no, *no_tmp; int i, ret; for (i = 0; i < ni->nn_size; i++) { TAILQ_FOREACH_SAFE(no, &ni->names[i], nn_next, no_tmp) { ret = f(ni, no, arg); if (ret != 0) return (ret); } } return (0); } /* * Runs @f for each found named object with type @type. * It is safe to delete objects from callback */ int ipfw_objhash_foreach_type(struct namedobj_instance *ni, objhash_cb_t *f, void *arg, uint16_t type) { struct named_object *no, *no_tmp; int i, ret; for (i = 0; i < ni->nn_size; i++) { TAILQ_FOREACH_SAFE(no, &ni->names[i], nn_next, no_tmp) { if (no->etlv != type) continue; ret = f(ni, no, arg); if (ret != 0) return (ret); } } return (0); } /* * Removes index from given set. * Returns 0 on success. */ int -ipfw_objhash_free_idx(struct namedobj_instance *ni, uint16_t idx) +ipfw_objhash_free_idx(struct namedobj_instance *ni, uint32_t idx) { u_long *mask; int i, v; i = idx / BLOCK_ITEMS; v = idx % BLOCK_ITEMS; if (i >= ni->max_blocks) return (1); mask = &ni->idx_mask[i]; if ((*mask & ((u_long)1 << v)) != 0) return (1); /* Mark as free */ *mask |= (u_long)1 << v; /* Update free offset */ if (ni->free_off[0] > i) ni->free_off[0] = i; return (0); } /* * Allocate new index in given instance and stores in in @pidx. * Returns 0 on success. 
*/ int -ipfw_objhash_alloc_idx(void *n, uint16_t *pidx) +ipfw_objhash_alloc_idx(void *n, uint32_t *pidx) { struct namedobj_instance *ni; u_long *mask; int i, off, v; ni = (struct namedobj_instance *)n; off = ni->free_off[0]; mask = &ni->idx_mask[off]; for (i = off; i < ni->max_blocks; i++, mask++) { if ((v = ffsl(*mask)) == 0) continue; /* Mark as busy */ *mask &= ~ ((u_long)1 << (v - 1)); ni->free_off[0] = i; v = BLOCK_ITEMS * i + v - 1; *pidx = v; return (0); } return (1); } /* end of file */ diff --git a/sys/netpfil/ipfw/ip_fw_table.c b/sys/netpfil/ipfw/ip_fw_table.c index 92bb7a63fd5a..da52166f8062 100644 --- a/sys/netpfil/ipfw/ip_fw_table.c +++ b/sys/netpfil/ipfw/ip_fw_table.c @@ -1,3360 +1,3045 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2004 Ruslan Ermilov and Vsevolod Lobko. - * Copyright (c) 2014 Yandex LLC + * Copyright (c) 2014-2024 Yandex LLC * Copyright (c) 2014 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include /* * Lookup table support for ipfw. * * This file contains handlers for all generic tables' operations: * add/del/flush entries, list/dump tables etc.. * * Table data modification is protected by both UH and runtime lock * while reading configuration/data is protected by UH lock. * * Lookup algorithms for all table types are located in ip_fw_table_algo.c */ #include "opt_ipfw.h" #include #include #include #include #include #include #include #include #include #include #include /* ip_fw.h requires IFNAMSIZ */ #include #include /* struct ipfw_rule_ref */ #include #include #include /* * Table has the following `type` concepts: * * `no.type` represents lookup key type (addr, ifp, uid, etc..) * vmask represents bitmask of table values which are present at the moment. * Special IPFW_VTYPE_LEGACY ( (uint32_t)-1 ) represents old * single-value-for-all approach. 
*/ struct table_config { struct named_object no; uint8_t tflags; /* type flags */ uint8_t locked; /* 1 if locked from changes */ uint8_t linked; /* 1 if already linked */ uint8_t ochanged; /* used by set swapping */ uint8_t vshared; /* 1 if using shared value array */ uint8_t spare[3]; uint32_t count; /* Number of records */ uint32_t limit; /* Max number of records */ uint32_t vmask; /* bitmask with supported values */ uint32_t ocount; /* used by set swapping */ uint64_t gencnt; /* generation count */ char tablename[64]; /* table name */ struct table_algo *ta; /* Callbacks for given algo */ void *astate; /* algorithm state */ struct table_info ti_copy; /* data to put to table_info */ struct namedobj_instance *vi; }; static int find_table_err(struct namedobj_instance *ni, struct tid_info *ti, struct table_config **tc); static struct table_config *find_table(struct namedobj_instance *ni, struct tid_info *ti); static struct table_config *alloc_table_config(struct ip_fw_chain *ch, struct tid_info *ti, struct table_algo *ta, char *adata, uint8_t tflags); static void free_table_config(struct namedobj_instance *ni, struct table_config *tc); static int create_table_internal(struct ip_fw_chain *ch, struct tid_info *ti, - char *aname, ipfw_xtable_info *i, uint16_t *pkidx, int ref); + char *aname, ipfw_xtable_info *i, uint32_t *pkidx, int ref); static void link_table(struct ip_fw_chain *ch, struct table_config *tc); static void unlink_table(struct ip_fw_chain *ch, struct table_config *tc); static int find_ref_table(struct ip_fw_chain *ch, struct tid_info *ti, struct tentry_info *tei, uint32_t count, int op, struct table_config **ptc); #define OP_ADD 1 #define OP_DEL 0 static int export_tables(struct ip_fw_chain *ch, ipfw_obj_lheader *olh, struct sockopt_data *sd); static void export_table_info(struct ip_fw_chain *ch, struct table_config *tc, ipfw_xtable_info *i); static int dump_table_tentry(void *e, void *arg); -static int dump_table_xentry(void *e, void *arg); static int 
swap_tables(struct ip_fw_chain *ch, struct tid_info *a, struct tid_info *b); static int check_table_name(const char *name); static int check_table_space(struct ip_fw_chain *ch, struct tableop_state *ts, struct table_config *tc, struct table_info *ti, uint32_t count); static int destroy_table(struct ip_fw_chain *ch, struct tid_info *ti); static struct table_algo *find_table_algo(struct tables_config *tableconf, struct tid_info *ti, char *name); static void objheader_to_ti(struct _ipfw_obj_header *oh, struct tid_info *ti); static void ntlv_to_ti(struct _ipfw_obj_ntlv *ntlv, struct tid_info *ti); #define CHAIN_TO_NI(chain) (CHAIN_TO_TCFG(chain)->namehash) #define KIDX_TO_TI(ch, k) (&(((struct table_info *)(ch)->tablestate)[k])) #define TA_BUF_SZ 128 /* On-stack buffer for add/delete state */ void rollback_toperation_state(struct ip_fw_chain *ch, void *object) { struct tables_config *tcfg; struct op_state *os; tcfg = CHAIN_TO_TCFG(ch); TAILQ_FOREACH(os, &tcfg->state_list, next) os->func(object, os); } void add_toperation_state(struct ip_fw_chain *ch, struct tableop_state *ts) { struct tables_config *tcfg; tcfg = CHAIN_TO_TCFG(ch); TAILQ_INSERT_HEAD(&tcfg->state_list, &ts->opstate, next); } void del_toperation_state(struct ip_fw_chain *ch, struct tableop_state *ts) { struct tables_config *tcfg; tcfg = CHAIN_TO_TCFG(ch); TAILQ_REMOVE(&tcfg->state_list, &ts->opstate, next); } void tc_ref(struct table_config *tc) { tc->no.refcnt++; } void tc_unref(struct table_config *tc) { tc->no.refcnt--; } static struct table_value * get_table_value(struct ip_fw_chain *ch, struct table_config *tc, uint32_t kidx) { struct table_value *pval; pval = (struct table_value *)ch->valuestate; return (&pval[kidx]); } /* * Checks if we're able to insert/update entry @tei into table * w.r.t @tc limits. * May alter @tei to indicate insertion error / insert * options. 
* * Returns 0 if operation can be performed/ */ static int check_table_limit(struct table_config *tc, struct tentry_info *tei) { if (tc->limit == 0 || tc->count < tc->limit) return (0); if ((tei->flags & TEI_FLAGS_UPDATE) == 0) { /* Notify userland on error cause */ tei->flags |= TEI_FLAGS_LIMIT; return (EFBIG); } /* * We have UPDATE flag set. * Permit updating record (if found), * but restrict adding new one since we've * already hit the limit. */ tei->flags |= TEI_FLAGS_DONTADD; return (0); } /* * Convert algorithm callback return code into * one of pre-defined states known by userland. */ static void store_tei_result(struct tentry_info *tei, int op, int error, uint32_t num) { int flag; flag = 0; switch (error) { case 0: if (op == OP_ADD && num != 0) flag = TEI_FLAGS_ADDED; if (op == OP_DEL) flag = TEI_FLAGS_DELETED; break; case ENOENT: flag = TEI_FLAGS_NOTFOUND; break; case EEXIST: flag = TEI_FLAGS_EXISTS; break; default: flag = TEI_FLAGS_ERROR; } tei->flags |= flag; } /* * Creates and references table with default parameters. * Saves table config, algo and allocated kidx info @ptc, @pta and * @pkidx if non-zero. * Used for table auto-creation to support old binaries. * * Returns 0 on success. */ static int create_table_compat(struct ip_fw_chain *ch, struct tid_info *ti, - uint16_t *pkidx) + uint32_t *pkidx) { ipfw_xtable_info xi; int error; memset(&xi, 0, sizeof(xi)); /* Set default value mask for legacy clients */ xi.vmask = IPFW_VTYPE_LEGACY; error = create_table_internal(ch, ti, NULL, &xi, pkidx, 1); if (error != 0) return (error); return (0); } /* * Find and reference existing table optionally * creating new one. * * Saves found table config into @ptc. * Note function may drop/acquire UH_WLOCK. * Returns 0 if table was found/created and referenced * or non-zero return code. 
*/ static int find_ref_table(struct ip_fw_chain *ch, struct tid_info *ti, struct tentry_info *tei, uint32_t count, int op, struct table_config **ptc) { struct namedobj_instance *ni; struct table_config *tc; - uint16_t kidx; + uint32_t kidx; int error; IPFW_UH_WLOCK_ASSERT(ch); ni = CHAIN_TO_NI(ch); tc = NULL; if ((tc = find_table(ni, ti)) != NULL) { /* check table type */ if (tc->no.subtype != ti->type) return (EINVAL); if (tc->locked != 0) return (EACCES); /* Try to exit early on limit hit */ if (op == OP_ADD && count == 1 && check_table_limit(tc, tei) != 0) return (EFBIG); /* Reference and return */ tc->no.refcnt++; *ptc = tc; return (0); } if (op == OP_DEL) return (ESRCH); /* Compatibility mode: create new table for old clients */ if ((tei->flags & TEI_FLAGS_COMPAT) == 0) return (ESRCH); IPFW_UH_WUNLOCK(ch); error = create_table_compat(ch, ti, &kidx); IPFW_UH_WLOCK(ch); if (error != 0) return (error); tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, kidx); - KASSERT(tc != NULL, ("create_table_compat returned bad idx %d", kidx)); + KASSERT(tc != NULL, ("create_table_compat returned bad idx %u", kidx)); /* OK, now we've got referenced table. */ *ptc = tc; return (0); } /* * Rolls back already @added to @tc entries using state array @ta_buf_m. * Assume the following layout: * 1) ADD state (ta_buf_m[0] ... t_buf_m[added - 1]) for handling update cases * 2) DEL state (ta_buf_m[count[ ... 
t_buf_m[count + added - 1]) * for storing deleted state */ static void rollback_added_entries(struct ip_fw_chain *ch, struct table_config *tc, struct table_info *tinfo, struct tentry_info *tei, caddr_t ta_buf_m, uint32_t count, uint32_t added) { struct table_algo *ta; struct tentry_info *ptei; caddr_t v, vv; size_t ta_buf_sz; int error __diagused, i; uint32_t num; IPFW_UH_WLOCK_ASSERT(ch); ta = tc->ta; ta_buf_sz = ta->ta_buf_size; v = ta_buf_m; vv = v + count * ta_buf_sz; for (i = 0; i < added; i++, v += ta_buf_sz, vv += ta_buf_sz) { ptei = &tei[i]; if ((ptei->flags & TEI_FLAGS_UPDATED) != 0) { /* * We have old value stored by previous * call in @ptei->value. Do add once again * to restore it. */ error = ta->add(tc->astate, tinfo, ptei, v, &num); KASSERT(error == 0, ("rollback UPDATE fail")); KASSERT(num == 0, ("rollback UPDATE fail2")); continue; } error = ta->prepare_del(ch, ptei, vv); KASSERT(error == 0, ("pre-rollback INSERT failed")); error = ta->del(tc->astate, tinfo, ptei, vv, &num); KASSERT(error == 0, ("rollback INSERT failed")); tc->count -= num; } } /* * Prepares add/del state for all @count entries in @tei. * Uses either stack buffer (@ta_buf) or allocates a new one. * Stores pointer to allocated buffer back to @ta_buf. * * Returns 0 on success. */ static int prepare_batch_buffer(struct ip_fw_chain *ch, struct table_algo *ta, struct tentry_info *tei, uint32_t count, int op, caddr_t *ta_buf) { caddr_t ta_buf_m, v; size_t ta_buf_sz, sz; struct tentry_info *ptei; int error, i; error = 0; ta_buf_sz = ta->ta_buf_size; if (count == 1) { /* Single add/delete, use on-stack buffer */ memset(*ta_buf, 0, TA_BUF_SZ); ta_buf_m = *ta_buf; } else { /* * Multiple adds/deletes, allocate larger buffer * * Note we need 2xcount buffer for add case: * we have hold both ADD state * and DELETE state (this may be needed * if we need to rollback all changes) */ sz = count * ta_buf_sz; ta_buf_m = malloc((op == OP_ADD) ? 
sz * 2 : sz, M_TEMP, M_WAITOK | M_ZERO); } v = ta_buf_m; for (i = 0; i < count; i++, v += ta_buf_sz) { ptei = &tei[i]; error = (op == OP_ADD) ? ta->prepare_add(ch, ptei, v) : ta->prepare_del(ch, ptei, v); /* * Some syntax error (incorrect mask, or address, or * anything). Return error regardless of atomicity * settings. */ if (error != 0) break; } *ta_buf = ta_buf_m; return (error); } /* * Flushes allocated state for each @count entries in @tei. * Frees @ta_buf_m if differs from stack buffer @ta_buf. */ static void flush_batch_buffer(struct ip_fw_chain *ch, struct table_algo *ta, struct tentry_info *tei, uint32_t count, int rollback, caddr_t ta_buf_m, caddr_t ta_buf) { caddr_t v; struct tentry_info *ptei; size_t ta_buf_sz; int i; ta_buf_sz = ta->ta_buf_size; /* Run cleaning callback anyway */ v = ta_buf_m; for (i = 0; i < count; i++, v += ta_buf_sz) { ptei = &tei[i]; ta->flush_entry(ch, ptei, v); if (ptei->ptv != NULL) { free(ptei->ptv, M_IPFW); ptei->ptv = NULL; } } /* Clean up "deleted" state in case of rollback */ if (rollback != 0) { v = ta_buf_m + count * ta_buf_sz; for (i = 0; i < count; i++, v += ta_buf_sz) ta->flush_entry(ch, &tei[i], v); } if (ta_buf_m != ta_buf) free(ta_buf_m, M_TEMP); } static void rollback_add_entry(void *object, struct op_state *_state) { struct ip_fw_chain *ch __diagused; struct tableop_state *ts; ts = (struct tableop_state *)_state; if (ts->tc != object && ts->ch != object) return; ch = ts->ch; IPFW_UH_WLOCK_ASSERT(ch); /* Call specifid unlockers */ rollback_table_values(ts); /* Indicate we've called */ ts->modified = 1; } /* * Adds/updates one or more entries in table @ti. * * Function may drop/reacquire UH wlock multiple times due to * items alloc, algorithm callbacks (check_space), value linkage * (new values, value storage realloc), etc.. 
* Other processes like other adds (which may involve storage resize), * table swaps (which changes table data and may change algo type), * table modify (which may change value mask) may be executed * simultaneously so we need to deal with it. * * The following approach was implemented: * we have per-chain linked list, protected with UH lock. * add_table_entry prepares special on-stack structure wthich is passed * to its descendants. Users add this structure to this list before unlock. * After performing needed operations and acquiring UH lock back, each user * checks if structure has changed. If true, it rolls local state back and * returns without error to the caller. * add_table_entry() on its own checks if structure has changed and restarts * its operation from the beginning (goto restart). * * Functions which are modifying fields of interest (currently * resize_shared_value_storage() and swap_tables() ) * traverses given list while holding UH lock immediately before * performing their operations calling function provided be list entry * ( currently rollback_add_entry ) which performs rollback for all necessary * state and sets appropriate values in structure indicating rollback * has happened. * * Algo interaction: * Function references @ti first to ensure table won't * disappear or change its type. * After that, prepare_add callback is called for each @tei entry. * Next, we try to add each entry under UH+WHLOCK * using add() callback. * Finally, we free all state by calling flush_entry callback * for each @tei. * * Returns 0 on success. 
*/ int add_table_entry(struct ip_fw_chain *ch, struct tid_info *ti, struct tentry_info *tei, uint8_t flags, uint32_t count) { struct table_config *tc; struct table_algo *ta; - uint16_t kidx; - int error, first_error, i, rollback; - uint32_t num, numadd; struct tentry_info *ptei; struct tableop_state ts; char ta_buf[TA_BUF_SZ]; caddr_t ta_buf_m, v; + uint32_t kidx, num, numadd; + int error, first_error, i, rollback; memset(&ts, 0, sizeof(ts)); ta = NULL; IPFW_UH_WLOCK(ch); /* * Find and reference existing table. */ restart: if (ts.modified != 0) { IPFW_UH_WUNLOCK(ch); flush_batch_buffer(ch, ta, tei, count, rollback, ta_buf_m, ta_buf); memset(&ts, 0, sizeof(ts)); ta = NULL; IPFW_UH_WLOCK(ch); } error = find_ref_table(ch, ti, tei, count, OP_ADD, &tc); if (error != 0) { IPFW_UH_WUNLOCK(ch); return (error); } ta = tc->ta; /* Fill in tablestate */ ts.ch = ch; ts.opstate.func = rollback_add_entry; ts.tc = tc; ts.vshared = tc->vshared; ts.vmask = tc->vmask; ts.ta = ta; ts.tei = tei; ts.count = count; rollback = 0; add_toperation_state(ch, &ts); IPFW_UH_WUNLOCK(ch); /* Allocate memory and prepare record(s) */ /* Pass stack buffer by default */ ta_buf_m = ta_buf; error = prepare_batch_buffer(ch, ta, tei, count, OP_ADD, &ta_buf_m); IPFW_UH_WLOCK(ch); del_toperation_state(ch, &ts); /* Drop reference we've used in first search */ tc->no.refcnt--; /* Check prepare_batch_buffer() error */ if (error != 0) goto cleanup; /* * Check if table swap has happened. * (so table algo might be changed). * Restart operation to achieve consistent behavior. */ if (ts.modified != 0) goto restart; /* * Link all values values to shared/per-table value array. * * May release/reacquire UH_WLOCK. */ error = ipfw_link_table_values(ch, &ts, flags); if (error != 0) goto cleanup; if (ts.modified != 0) goto restart; /* * Ensure we are able to add all entries without additional * memory allocations. May release/reacquire UH_WLOCK. 
*/ kidx = tc->no.kidx; error = check_table_space(ch, &ts, tc, KIDX_TO_TI(ch, kidx), count); if (error != 0) goto cleanup; if (ts.modified != 0) goto restart; /* We've got valid table in @tc. Let's try to add data */ kidx = tc->no.kidx; ta = tc->ta; numadd = 0; first_error = 0; IPFW_WLOCK(ch); v = ta_buf_m; for (i = 0; i < count; i++, v += ta->ta_buf_size) { ptei = &tei[i]; num = 0; /* check limit before adding */ if ((error = check_table_limit(tc, ptei)) == 0) { /* * It should be safe to insert a record w/o * a properly-linked value if atomicity is * not required. * * If the added item does not have a valid value * index, it would get rejected by ta->add(). * */ error = ta->add(tc->astate, KIDX_TO_TI(ch, kidx), ptei, v, &num); /* Set status flag to inform userland */ store_tei_result(ptei, OP_ADD, error, num); } if (error == 0) { /* Update number of records to ease limit checking */ tc->count += num; numadd += num; continue; } if (first_error == 0) first_error = error; /* * Some error have happened. Check our atomicity * settings: continue if atomicity is not required, * rollback changes otherwise. */ if ((flags & IPFW_CTF_ATOMIC) == 0) continue; rollback_added_entries(ch, tc, KIDX_TO_TI(ch, kidx), tei, ta_buf_m, count, i); rollback = 1; break; } IPFW_WUNLOCK(ch); ipfw_garbage_table_values(ch, tc, tei, count, rollback); /* Permit post-add algorithm grow/rehash. */ if (numadd != 0) check_table_space(ch, NULL, tc, KIDX_TO_TI(ch, kidx), 0); /* Return first error to user, if any */ error = first_error; cleanup: IPFW_UH_WUNLOCK(ch); flush_batch_buffer(ch, ta, tei, count, rollback, ta_buf_m, ta_buf); return (error); } /* * Deletes one or more entries in table @ti. * * Returns 0 on success. 
*/ int del_table_entry(struct ip_fw_chain *ch, struct tid_info *ti, struct tentry_info *tei, uint8_t flags, uint32_t count) { struct table_config *tc; struct table_algo *ta; struct tentry_info *ptei; - uint16_t kidx; - int error, first_error, i; - uint32_t num, numdel; char ta_buf[TA_BUF_SZ]; caddr_t ta_buf_m, v; + uint32_t kidx, num, numdel; + int error, first_error, i; /* * Find and reference existing table. */ IPFW_UH_WLOCK(ch); error = find_ref_table(ch, ti, tei, count, OP_DEL, &tc); if (error != 0) { IPFW_UH_WUNLOCK(ch); return (error); } ta = tc->ta; IPFW_UH_WUNLOCK(ch); /* Allocate memory and prepare record(s) */ /* Pass stack buffer by default */ ta_buf_m = ta_buf; error = prepare_batch_buffer(ch, ta, tei, count, OP_DEL, &ta_buf_m); if (error != 0) goto cleanup; IPFW_UH_WLOCK(ch); /* Drop reference we've used in first search */ tc->no.refcnt--; /* * Check if table algo is still the same. * (changed ta may be the result of table swap). */ if (ta != tc->ta) { IPFW_UH_WUNLOCK(ch); error = EINVAL; goto cleanup; } kidx = tc->no.kidx; numdel = 0; first_error = 0; IPFW_WLOCK(ch); v = ta_buf_m; for (i = 0; i < count; i++, v += ta->ta_buf_size) { ptei = &tei[i]; num = 0; error = ta->del(tc->astate, KIDX_TO_TI(ch, kidx), ptei, v, &num); /* Save state for userland */ store_tei_result(ptei, OP_DEL, error, num); if (error != 0 && first_error == 0) first_error = error; tc->count -= num; numdel += num; } IPFW_WUNLOCK(ch); /* Unlink non-used values */ ipfw_garbage_table_values(ch, tc, tei, count, 0); if (numdel != 0) { /* Run post-del hook to permit shrinking */ check_table_space(ch, NULL, tc, KIDX_TO_TI(ch, kidx), 0); } IPFW_UH_WUNLOCK(ch); /* Return first error to user, if any */ error = first_error; cleanup: flush_batch_buffer(ch, ta, tei, count, 0, ta_buf_m, ta_buf); return (error); } /* * Ensure that table @tc has enough space to add @count entries without * need for reallocation. 
* * Callbacks order: * 0) need_modify() (UH_WLOCK) - checks if @count items can be added w/o resize. * * 1) alloc_modify (no locks, M_WAITOK) - alloc new state based on @pflags. * 2) prepare_modifyt (UH_WLOCK) - copy old data into new storage * 3) modify (UH_WLOCK + WLOCK) - switch pointers * 4) flush_modify (UH_WLOCK) - free state, if needed * * Returns 0 on success. */ static int check_table_space(struct ip_fw_chain *ch, struct tableop_state *ts, struct table_config *tc, struct table_info *ti, uint32_t count) { struct table_algo *ta; uint64_t pflags; char ta_buf[TA_BUF_SZ]; int error; IPFW_UH_WLOCK_ASSERT(ch); error = 0; ta = tc->ta; if (ta->need_modify == NULL) return (0); /* Acquire reference not to loose @tc between locks/unlocks */ tc->no.refcnt++; /* * TODO: think about avoiding race between large add/large delete * operation on algorithm which implements shrinking along with * growing. */ while (true) { pflags = 0; if (ta->need_modify(tc->astate, ti, count, &pflags) == 0) { error = 0; break; } /* We have to shrink/grow table */ if (ts != NULL) add_toperation_state(ch, ts); IPFW_UH_WUNLOCK(ch); memset(&ta_buf, 0, sizeof(ta_buf)); error = ta->prepare_mod(ta_buf, &pflags); IPFW_UH_WLOCK(ch); if (ts != NULL) del_toperation_state(ch, ts); if (error != 0) break; if (ts != NULL && ts->modified != 0) { /* * Swap operation has happened * so we're currently operating on other * table data. Stop doing this. */ ta->flush_mod(ta_buf); break; } /* Check if we still need to alter table */ ti = KIDX_TO_TI(ch, tc->no.kidx); if (ta->need_modify(tc->astate, ti, count, &pflags) == 0) { IPFW_UH_WUNLOCK(ch); /* * Other thread has already performed resize. * Flush our state and return. 
*/ ta->flush_mod(ta_buf); break; } error = ta->fill_mod(tc->astate, ti, ta_buf, &pflags); if (error == 0) { /* Do actual modification */ IPFW_WLOCK(ch); ta->modify(tc->astate, ti, ta_buf, pflags); IPFW_WUNLOCK(ch); } /* Anyway, flush data and retry */ ta->flush_mod(ta_buf); } tc->no.refcnt--; return (error); } -/* - * Adds or deletes record in table. - * Data layout (v0): - * Request: [ ip_fw3_opheader ipfw_table_xentry ] - * - * Returns 0 on success - */ -static int -manage_table_ent_v0(struct ip_fw_chain *ch, ip_fw3_opheader *op3, - struct sockopt_data *sd) -{ - ipfw_table_xentry *xent; - struct tentry_info tei; - struct tid_info ti; - struct table_value v; - int error, hdrlen, read; - - hdrlen = offsetof(ipfw_table_xentry, k); - - /* Check minimum header size */ - if (sd->valsize < (sizeof(*op3) + hdrlen)) - return (EINVAL); - - read = sizeof(ip_fw3_opheader); - - /* Check if xentry len field is valid */ - xent = (ipfw_table_xentry *)(op3 + 1); - if (xent->len < hdrlen || xent->len + read > sd->valsize) - return (EINVAL); - - memset(&tei, 0, sizeof(tei)); - tei.paddr = &xent->k; - tei.masklen = xent->masklen; - ipfw_import_table_value_legacy(xent->value, &v); - tei.pvalue = &v; - /* Old requests compatibility */ - tei.flags = TEI_FLAGS_COMPAT; - if (xent->type == IPFW_TABLE_ADDR) { - if (xent->len - hdrlen == sizeof(in_addr_t)) - tei.subtype = AF_INET; - else - tei.subtype = AF_INET6; - } - - memset(&ti, 0, sizeof(ti)); - ti.uidx = xent->tbl; - ti.type = xent->type; - - error = (op3->opcode == IP_FW_TABLE_XADD) ? - add_table_entry(ch, &ti, &tei, 0, 1) : - del_table_entry(ch, &ti, &tei, 0, 1); - - return (error); -} - /* * Adds or deletes record in table. 
* Data layout (v1)(current): * Request: [ ipfw_obj_header * ipfw_obj_ctlv(IPFW_TLV_TBLENT_LIST) [ ipfw_obj_tentry x N ] * ] * * Returns 0 on success */ static int manage_table_ent_v1(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_tentry *tent, *ptent; ipfw_obj_ctlv *ctlv; ipfw_obj_header *oh; struct tentry_info *ptei, tei, *tei_buf; struct tid_info ti; - int error, i, kidx, read; + uint32_t kidx; + int error, i, read; /* Check minimum header size */ if (sd->valsize < (sizeof(*oh) + sizeof(*ctlv))) return (EINVAL); /* Check if passed data is too long */ if (sd->valsize != sd->kavail) return (EINVAL); oh = (ipfw_obj_header *)sd->kbuf; /* Basic length checks for TLVs */ if (oh->ntlv.head.length != sizeof(oh->ntlv)) return (EINVAL); read = sizeof(*oh); ctlv = (ipfw_obj_ctlv *)(oh + 1); if (ctlv->head.length + read != sd->valsize) return (EINVAL); read += sizeof(*ctlv); tent = (ipfw_obj_tentry *)(ctlv + 1); if (ctlv->count * sizeof(*tent) + read != sd->valsize) return (EINVAL); if (ctlv->count == 0) return (0); /* * Mark entire buffer as "read". * This instructs sopt api write it back * after function return. 
*/ ipfw_get_sopt_header(sd, sd->valsize); /* Perform basic checks for each entry */ ptent = tent; kidx = tent->idx; for (i = 0; i < ctlv->count; i++, ptent++) { if (ptent->head.length != sizeof(*ptent)) return (EINVAL); if (ptent->idx != kidx) return (ENOTSUP); } /* Convert data into kernel request objects */ objheader_to_ti(oh, &ti); ti.type = oh->ntlv.type; ti.uidx = kidx; /* Use on-stack buffer for single add/del */ if (ctlv->count == 1) { memset(&tei, 0, sizeof(tei)); tei_buf = &tei; } else tei_buf = malloc(ctlv->count * sizeof(tei), M_TEMP, M_WAITOK | M_ZERO); ptei = tei_buf; ptent = tent; for (i = 0; i < ctlv->count; i++, ptent++, ptei++) { ptei->paddr = &ptent->k; ptei->subtype = ptent->subtype; ptei->masklen = ptent->masklen; if (ptent->head.flags & IPFW_TF_UPDATE) ptei->flags |= TEI_FLAGS_UPDATE; ipfw_import_table_value_v1(&ptent->v.value); ptei->pvalue = (struct table_value *)&ptent->v.value; } error = (oh->opheader.opcode == IP_FW_TABLE_XADD) ? add_table_entry(ch, &ti, tei_buf, ctlv->flags, ctlv->count) : del_table_entry(ch, &ti, tei_buf, ctlv->flags, ctlv->count); /* Translate result back to userland */ ptei = tei_buf; ptent = tent; for (i = 0; i < ctlv->count; i++, ptent++, ptei++) { if (ptei->flags & TEI_FLAGS_ADDED) ptent->result = IPFW_TR_ADDED; else if (ptei->flags & TEI_FLAGS_DELETED) ptent->result = IPFW_TR_DELETED; else if (ptei->flags & TEI_FLAGS_UPDATED) ptent->result = IPFW_TR_UPDATED; else if (ptei->flags & TEI_FLAGS_LIMIT) ptent->result = IPFW_TR_LIMIT; else if (ptei->flags & TEI_FLAGS_ERROR) ptent->result = IPFW_TR_ERROR; else if (ptei->flags & TEI_FLAGS_NOTFOUND) ptent->result = IPFW_TR_NOTFOUND; else if (ptei->flags & TEI_FLAGS_EXISTS) ptent->result = IPFW_TR_EXISTS; ipfw_export_table_value_v1(ptei->pvalue, &ptent->v.value); } if (tei_buf != &tei) free(tei_buf, M_TEMP); return (error); } /* * Looks up an entry in given table. 
* Data layout (v0)(current): * Request: [ ipfw_obj_header ipfw_obj_tentry ] * Reply: [ ipfw_obj_header ipfw_obj_tentry ] * * Returns 0 on success */ static int find_table_entry(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_tentry *tent; ipfw_obj_header *oh; struct tid_info ti; struct table_config *tc; struct table_algo *ta; struct table_info *kti; struct table_value *pval; struct namedobj_instance *ni; int error; size_t sz; /* Check minimum header size */ sz = sizeof(*oh) + sizeof(*tent); if (sd->valsize != sz) return (EINVAL); oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz); tent = (ipfw_obj_tentry *)(oh + 1); /* Basic length checks for TLVs */ if (oh->ntlv.head.length != sizeof(oh->ntlv)) return (EINVAL); objheader_to_ti(oh, &ti); ti.type = oh->ntlv.type; ti.uidx = tent->idx; IPFW_UH_RLOCK(ch); ni = CHAIN_TO_NI(ch); /* * Find existing table and check its type . */ ta = NULL; if ((tc = find_table(ni, &ti)) == NULL) { IPFW_UH_RUNLOCK(ch); return (ESRCH); } /* check table type */ if (tc->no.subtype != ti.type) { IPFW_UH_RUNLOCK(ch); return (EINVAL); } kti = KIDX_TO_TI(ch, tc->no.kidx); ta = tc->ta; if (ta->find_tentry == NULL) return (ENOTSUP); error = ta->find_tentry(tc->astate, kti, tent); if (error == 0) { pval = get_table_value(ch, tc, tent->v.kidx); ipfw_export_table_value_v1(pval, &tent->v.value); } IPFW_UH_RUNLOCK(ch); return (error); } /* * Flushes all entries or destroys given table. 
* Data layout (v0)(current): * Request: [ ipfw_obj_header ] * * Returns 0 on success */ static int flush_table_v0(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { int error; struct _ipfw_obj_header *oh; struct tid_info ti; if (sd->valsize != sizeof(*oh)) return (EINVAL); oh = (struct _ipfw_obj_header *)op3; objheader_to_ti(oh, &ti); if (op3->opcode == IP_FW_TABLE_XDESTROY) error = destroy_table(ch, &ti); else if (op3->opcode == IP_FW_TABLE_XFLUSH) error = flush_table(ch, &ti); else return (ENOTSUP); return (error); } static void restart_flush(void *object, struct op_state *_state) { struct tableop_state *ts; ts = (struct tableop_state *)_state; if (ts->tc != object) return; /* Indicate we've called */ ts->modified = 1; } /* * Flushes given table. * * Function create new table instance with the same * parameters, swaps it with old one and * flushes state without holding runtime WLOCK. * * Returns 0 on success. */ int flush_table(struct ip_fw_chain *ch, struct tid_info *ti) { struct namedobj_instance *ni; struct table_config *tc; struct table_algo *ta; struct table_info ti_old, ti_new, *tablestate; void *astate_old, *astate_new; char algostate[64], *pstate; struct tableop_state ts; int error, need_gc; - uint16_t kidx; + uint32_t kidx; uint8_t tflags; /* * Stage 1: save table algorithm. * Reference found table to ensure it won't disappear. 
*/ IPFW_UH_WLOCK(ch); ni = CHAIN_TO_NI(ch); if ((tc = find_table(ni, ti)) == NULL) { IPFW_UH_WUNLOCK(ch); return (ESRCH); } need_gc = 0; astate_new = NULL; memset(&ti_new, 0, sizeof(ti_new)); restart: /* Set up swap handler */ memset(&ts, 0, sizeof(ts)); ts.opstate.func = restart_flush; ts.tc = tc; ta = tc->ta; /* Do not flush readonly tables */ if ((ta->flags & TA_FLAG_READONLY) != 0) { IPFW_UH_WUNLOCK(ch); return (EACCES); } /* Save startup algo parameters */ if (ta->print_config != NULL) { ta->print_config(tc->astate, KIDX_TO_TI(ch, tc->no.kidx), algostate, sizeof(algostate)); pstate = algostate; } else pstate = NULL; tflags = tc->tflags; tc->no.refcnt++; add_toperation_state(ch, &ts); IPFW_UH_WUNLOCK(ch); /* * Stage 1.5: if this is not the first attempt, destroy previous state */ if (need_gc != 0) { ta->destroy(astate_new, &ti_new); need_gc = 0; } /* * Stage 2: allocate new table instance using same algo. */ memset(&ti_new, 0, sizeof(struct table_info)); error = ta->init(ch, &astate_new, &ti_new, pstate, tflags); /* * Stage 3: swap old state pointers with newly-allocated ones. * Decrease refcount. */ IPFW_UH_WLOCK(ch); tc->no.refcnt--; del_toperation_state(ch, &ts); if (error != 0) { IPFW_UH_WUNLOCK(ch); return (error); } /* * Restart operation if table swap has happened: * even if algo may be the same, algo init parameters * may change. Restart operation instead of doing * complex checks. */ if (ts.modified != 0) { /* Delay destroying data since we're holding UH lock */ need_gc = 1; goto restart; } ni = CHAIN_TO_NI(ch); kidx = tc->no.kidx; tablestate = (struct table_info *)ch->tablestate; IPFW_WLOCK(ch); ti_old = tablestate[kidx]; tablestate[kidx] = ti_new; IPFW_WUNLOCK(ch); astate_old = tc->astate; tc->astate = astate_new; tc->ti_copy = ti_new; tc->count = 0; /* Notify algo on real @ti address */ if (ta->change_ti != NULL) ta->change_ti(tc->astate, &tablestate[kidx]); /* * Stage 4: unref values. 
*/ ipfw_unref_table_values(ch, tc, ta, astate_old, &ti_old); IPFW_UH_WUNLOCK(ch); /* * Stage 5: perform real flush/destroy. */ ta->destroy(astate_old, &ti_old); return (0); } /* * Swaps two tables. * Data layout (v0)(current): * Request: [ ipfw_obj_header ipfw_obj_ntlv ] * * Returns 0 on success */ static int swap_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { int error; struct _ipfw_obj_header *oh; struct tid_info ti_a, ti_b; if (sd->valsize != sizeof(*oh) + sizeof(ipfw_obj_ntlv)) return (EINVAL); oh = (struct _ipfw_obj_header *)op3; ntlv_to_ti(&oh->ntlv, &ti_a); ntlv_to_ti((ipfw_obj_ntlv *)(oh + 1), &ti_b); error = swap_tables(ch, &ti_a, &ti_b); return (error); } /* * Swaps two tables of the same type/valtype. * * Checks if tables are compatible and limits * permits swap, than actually perform swap. * * Each table consists of 2 different parts: * config: * @tc (with name, set, kidx) and rule bindings, which is "stable". * number of items * table algo * runtime: * runtime data @ti (ch->tablestate) * runtime cache in @tc * algo-specific data (@tc->astate) * * So we switch: * all runtime data * number of items * table algo * * After that we call @ti change handler for each table. * * Note that referencing @tc won't protect tc->ta from change. * XXX: Do we need to restrict swap between locked tables? * XXX: Do we need to exchange ftype? * * Returns 0 on success. */ static int swap_tables(struct ip_fw_chain *ch, struct tid_info *a, struct tid_info *b) { struct namedobj_instance *ni; struct table_config *tc_a, *tc_b; struct table_algo *ta; struct table_info ti, *tablestate; void *astate; uint32_t count; /* * Stage 1: find both tables and ensure they are of * the same type. 
*/ IPFW_UH_WLOCK(ch); ni = CHAIN_TO_NI(ch); if ((tc_a = find_table(ni, a)) == NULL) { IPFW_UH_WUNLOCK(ch); return (ESRCH); } if ((tc_b = find_table(ni, b)) == NULL) { IPFW_UH_WUNLOCK(ch); return (ESRCH); } /* It is very easy to swap between the same table */ if (tc_a == tc_b) { IPFW_UH_WUNLOCK(ch); return (0); } /* Check type and value are the same */ if (tc_a->no.subtype!=tc_b->no.subtype || tc_a->tflags!=tc_b->tflags) { IPFW_UH_WUNLOCK(ch); return (EINVAL); } /* Check limits before swap */ if ((tc_a->limit != 0 && tc_b->count > tc_a->limit) || (tc_b->limit != 0 && tc_a->count > tc_b->limit)) { IPFW_UH_WUNLOCK(ch); return (EFBIG); } /* Check if one of the tables is readonly */ if (((tc_a->ta->flags | tc_b->ta->flags) & TA_FLAG_READONLY) != 0) { IPFW_UH_WUNLOCK(ch); return (EACCES); } /* Notify we're going to swap */ rollback_toperation_state(ch, tc_a); rollback_toperation_state(ch, tc_b); /* Everything is fine, prepare to swap */ tablestate = (struct table_info *)ch->tablestate; ti = tablestate[tc_a->no.kidx]; ta = tc_a->ta; astate = tc_a->astate; count = tc_a->count; IPFW_WLOCK(ch); /* a <- b */ tablestate[tc_a->no.kidx] = tablestate[tc_b->no.kidx]; tc_a->ta = tc_b->ta; tc_a->astate = tc_b->astate; tc_a->count = tc_b->count; /* b <- a */ tablestate[tc_b->no.kidx] = ti; tc_b->ta = ta; tc_b->astate = astate; tc_b->count = count; IPFW_WUNLOCK(ch); /* Ensure tc.ti copies are in sync */ tc_a->ti_copy = tablestate[tc_a->no.kidx]; tc_b->ti_copy = tablestate[tc_b->no.kidx]; /* Notify both tables on @ti change */ if (tc_a->ta->change_ti != NULL) tc_a->ta->change_ti(tc_a->astate, &tablestate[tc_a->no.kidx]); if (tc_b->ta->change_ti != NULL) tc_b->ta->change_ti(tc_b->astate, &tablestate[tc_b->no.kidx]); IPFW_UH_WUNLOCK(ch); return (0); } /* * Destroys table specified by @ti. 
* Data layout (v0)(current): * Request: [ ip_fw3_opheader ] * * Returns 0 on success */ static int destroy_table(struct ip_fw_chain *ch, struct tid_info *ti) { struct namedobj_instance *ni; struct table_config *tc; IPFW_UH_WLOCK(ch); ni = CHAIN_TO_NI(ch); if ((tc = find_table(ni, ti)) == NULL) { IPFW_UH_WUNLOCK(ch); return (ESRCH); } /* Do not permit destroying referenced tables */ if (tc->no.refcnt > 0) { IPFW_UH_WUNLOCK(ch); return (EBUSY); } IPFW_WLOCK(ch); unlink_table(ch, tc); IPFW_WUNLOCK(ch); /* Free obj index */ if (ipfw_objhash_free_idx(ni, tc->no.kidx) != 0) - printf("Error unlinking kidx %d from table %s\n", + printf("Error unlinking kidx %u from table %s\n", tc->no.kidx, tc->tablename); /* Unref values used in tables while holding UH lock */ ipfw_unref_table_values(ch, tc, tc->ta, tc->astate, &tc->ti_copy); IPFW_UH_WUNLOCK(ch); free_table_config(ni, tc); return (0); } /* * Grow tables index. * * Returns 0 on success. */ int ipfw_resize_tables(struct ip_fw_chain *ch, unsigned int ntables) { unsigned int tbl; struct namedobj_instance *ni; void *new_idx, *old_tablestate, *tablestate; struct table_info *ti; struct table_config *tc; int i, new_blocks; /* Check new value for validity */ if (ntables == 0) return (EINVAL); if (ntables > IPFW_TABLES_MAX) ntables = IPFW_TABLES_MAX; /* Alight to nearest power of 2 */ - ntables = roundup_pow_of_two(ntables); + ntables = roundup_pow_of_two(ntables); /* Allocate new pointers */ tablestate = malloc(ntables * sizeof(struct table_info), M_IPFW, M_WAITOK | M_ZERO); ipfw_objhash_bitmap_alloc(ntables, (void *)&new_idx, &new_blocks); IPFW_UH_WLOCK(ch); tbl = (ntables >= V_fw_tables_max) ? 
V_fw_tables_max : ntables; ni = CHAIN_TO_NI(ch); /* Temporary restrict decreasing max_tables */ if (ntables < V_fw_tables_max) { /* * FIXME: Check if we really can shrink */ IPFW_UH_WUNLOCK(ch); return (EINVAL); } /* Copy table info/indices */ memcpy(tablestate, ch->tablestate, sizeof(struct table_info) * tbl); ipfw_objhash_bitmap_merge(ni, &new_idx, &new_blocks); IPFW_WLOCK(ch); /* Change pointers */ old_tablestate = ch->tablestate; ch->tablestate = tablestate; ipfw_objhash_bitmap_swap(ni, &new_idx, &new_blocks); V_fw_tables_max = ntables; IPFW_WUNLOCK(ch); /* Notify all consumers that their @ti pointer has changed */ ti = (struct table_info *)ch->tablestate; for (i = 0; i < tbl; i++, ti++) { if (ti->lookup == NULL) continue; tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, i); if (tc == NULL || tc->ta->change_ti == NULL) continue; tc->ta->change_ti(tc->astate, ti); } IPFW_UH_WUNLOCK(ch); /* Free old pointers */ free(old_tablestate, M_IPFW); ipfw_objhash_bitmap_free(new_idx, new_blocks); return (0); } /* * Lookup table's named object by its @kidx. */ struct named_object * -ipfw_objhash_lookup_table_kidx(struct ip_fw_chain *ch, uint16_t kidx) +ipfw_objhash_lookup_table_kidx(struct ip_fw_chain *ch, uint32_t kidx) { return (ipfw_objhash_lookup_kidx(CHAIN_TO_NI(ch), kidx)); } /* * Take reference to table specified in @ntlv. * On success return its @kidx. 
*/ int -ipfw_ref_table(struct ip_fw_chain *ch, ipfw_obj_ntlv *ntlv, uint16_t *kidx) +ipfw_ref_table(struct ip_fw_chain *ch, ipfw_obj_ntlv *ntlv, uint32_t *kidx) { struct tid_info ti; struct table_config *tc; int error; IPFW_UH_WLOCK_ASSERT(ch); ntlv_to_ti(ntlv, &ti); error = find_table_err(CHAIN_TO_NI(ch), &ti, &tc); if (error != 0) return (error); if (tc == NULL) return (ESRCH); tc_ref(tc); *kidx = tc->no.kidx; return (0); } void -ipfw_unref_table(struct ip_fw_chain *ch, uint16_t kidx) +ipfw_unref_table(struct ip_fw_chain *ch, uint32_t kidx) { struct namedobj_instance *ni; struct named_object *no; IPFW_UH_WLOCK_ASSERT(ch); ni = CHAIN_TO_NI(ch); no = ipfw_objhash_lookup_kidx(ni, kidx); - KASSERT(no != NULL, ("Table with index %d not found", kidx)); + KASSERT(no != NULL, ("Table with index %u not found", kidx)); no->refcnt--; } /* * Lookup an arbitrary key @paddr of length @plen in table @tbl. * Stores found value in @val. * * Returns 1 if key was found. */ int -ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, uint16_t plen, +ipfw_lookup_table(struct ip_fw_chain *ch, uint32_t tbl, uint16_t plen, void *paddr, uint32_t *val) { struct table_info *ti; ti = KIDX_TO_TI(ch, tbl); return (ti->lookup(ti, paddr, plen, val)); } /* * Info/List/dump support for tables. * */ /* * High-level 'get' cmds sysctl handlers */ /* * Lists all tables currently available in kernel. * Data layout (v0)(current): * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size * Reply: [ ipfw_obj_lheader ipfw_xtable_info x N ] * * Returns 0 on success */ static int list_tables(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { struct _ipfw_obj_lheader *olh; int error; olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh)); if (olh == NULL) return (EINVAL); if (sd->valsize < olh->size) return (EINVAL); IPFW_UH_RLOCK(ch); error = export_tables(ch, olh, sd); IPFW_UH_RUNLOCK(ch); return (error); } /* * Store table info to buffer provided by @sd. 
 * Data layout (v0)(current):
 * Request: [ ipfw_obj_header ipfw_xtable_info(empty)]
 * Reply: [ ipfw_obj_header ipfw_xtable_info ]
 *
 * Returns 0 on success.
 */
static int
describe_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
    struct sockopt_data *sd)
{
	struct _ipfw_obj_header *oh;
	struct table_config *tc;
	struct tid_info ti;
	size_t sz;

	/* Request and reply have the same fixed size; reserve it up front */
	sz = sizeof(*oh) + sizeof(ipfw_xtable_info);
	oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz);
	if (oh == NULL)
		return (EINVAL);
	objheader_to_ti(oh, &ti);
	/* Look up the table and export its info under the UH read lock */
	IPFW_UH_RLOCK(ch);
	if ((tc = find_table(CHAIN_TO_NI(ch), &ti)) == NULL) {
		IPFW_UH_RUNLOCK(ch);
		return (ESRCH);
	}
	export_table_info(ch, tc, (ipfw_xtable_info *)(oh + 1));
	IPFW_UH_RUNLOCK(ch);

	return (0);
}

/*
 * Modifies existing table.
 * Data layout (v0)(current):
 * Request: [ ipfw_obj_header ipfw_xtable_info ]
 *
 * Returns 0 on success
 */
static int
modify_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
    struct sockopt_data *sd)
{
	struct _ipfw_obj_header *oh;
	ipfw_xtable_info *i;
	char *tname;
	struct tid_info ti;
	struct namedobj_instance *ni;
	struct table_config *tc;

	if (sd->valsize != sizeof(*oh) + sizeof(ipfw_xtable_info))
		return (EINVAL);

	oh = (struct _ipfw_obj_header *)sd->kbuf;
	i = (ipfw_xtable_info *)(oh + 1);

	/*
	 * Verify user-supplied strings.
	 * Check for null-terminated/zero-length strings.
	 */
	tname = oh->ntlv.name;
	if (check_table_name(tname) != 0)
		return (EINVAL);

	objheader_to_ti(oh, &ti);
	ti.type = i->type;

	IPFW_UH_WLOCK(ch);
	ni = CHAIN_TO_NI(ch);
	if ((tc = find_table(ni, &ti)) == NULL) {
		IPFW_UH_WUNLOCK(ch);
		return (ESRCH);
	}

	/* Do not support any modifications for readonly tables */
	if ((tc->ta->flags & TA_FLAG_READONLY) != 0) {
		IPFW_UH_WUNLOCK(ch);
		return (EACCES);
	}

	/* Only the fields selected by i->mflags are updated */
	if ((i->mflags & IPFW_TMFLAGS_LIMIT) != 0)
		tc->limit = i->limit;
	if ((i->mflags & IPFW_TMFLAGS_LOCK) != 0)
		tc->locked = ((i->flags & IPFW_TGFLAGS_LOCKED) != 0);
	IPFW_UH_WUNLOCK(ch);

	return (0);
}

/*
 * Creates new table.
 * Data layout (v0)(current):
 * Request: [ ipfw_obj_header ipfw_xtable_info ]
 *
 * Returns 0 on success
 */
static int
create_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
    struct sockopt_data *sd)
{
	struct _ipfw_obj_header *oh;
	ipfw_xtable_info *i;
	char *tname, *aname;
	struct tid_info ti;
	struct namedobj_instance *ni;

	if (sd->valsize != sizeof(*oh) + sizeof(ipfw_xtable_info))
		return (EINVAL);

	oh = (struct _ipfw_obj_header *)sd->kbuf;
	i = (ipfw_xtable_info *)(oh + 1);

	/*
	 * Verify user-supplied strings.
	 * Check for null-terminated/zero-length strings.
	 */
	tname = oh->ntlv.name;
	aname = i->algoname;
	if (check_table_name(tname) != 0 ||
	    strnlen(aname, sizeof(i->algoname)) == sizeof(i->algoname))
		return (EINVAL);

	if (aname[0] == '\0') {
		/* Use default algorithm */
		aname = NULL;
	}

	objheader_to_ti(oh, &ti);
	ti.type = i->type;

	ni = CHAIN_TO_NI(ch);

	/*
	 * Pre-check for an existing table under the read lock; the race
	 * with a concurrent create is re-checked under the write lock
	 * inside create_table_internal().
	 */
	IPFW_UH_RLOCK(ch);
	if (find_table(ni, &ti) != NULL) {
		IPFW_UH_RUNLOCK(ch);
		return (EEXIST);
	}
	IPFW_UH_RUNLOCK(ch);

	return (create_table_internal(ch, &ti, aname, i, NULL, 0));
}

/*
 * Creates new table based on @ti and @aname.
 *
 * Assumes @aname to be checked and valid.
 * Stores allocated table kidx inside @pkidx (if non-NULL).
 * Reference created table if @compat is non-zero.
 *
 * Returns 0 on success.
*/ static int create_table_internal(struct ip_fw_chain *ch, struct tid_info *ti, - char *aname, ipfw_xtable_info *i, uint16_t *pkidx, int compat) + char *aname, ipfw_xtable_info *i, uint32_t *pkidx, int compat) { struct namedobj_instance *ni; struct table_config *tc, *tc_new, *tmp; struct table_algo *ta; - uint16_t kidx; + uint32_t kidx; ni = CHAIN_TO_NI(ch); ta = find_table_algo(CHAIN_TO_TCFG(ch), ti, aname); if (ta == NULL) return (ENOTSUP); tc = alloc_table_config(ch, ti, ta, aname, i->tflags); if (tc == NULL) return (ENOMEM); tc->vmask = i->vmask; tc->limit = i->limit; if (ta->flags & TA_FLAG_READONLY) tc->locked = 1; else tc->locked = (i->flags & IPFW_TGFLAGS_LOCKED) != 0; IPFW_UH_WLOCK(ch); /* Check if table has been already created */ tc_new = find_table(ni, ti); if (tc_new != NULL) { /* * Compat: do not fail if we're * requesting to create existing table * which has the same type */ if (compat == 0 || tc_new->no.subtype != tc->no.subtype) { IPFW_UH_WUNLOCK(ch); free_table_config(ni, tc); return (EEXIST); } /* Exchange tc and tc_new for proper refcounting & freeing */ tmp = tc; tc = tc_new; tc_new = tmp; } else { /* New table */ if (ipfw_objhash_alloc_idx(ni, &kidx) != 0) { IPFW_UH_WUNLOCK(ch); printf("Unable to allocate table index." 
" Consider increasing net.inet.ip.fw.tables_max"); free_table_config(ni, tc); return (EBUSY); } tc->no.kidx = kidx; tc->no.etlv = IPFW_TLV_TBL_NAME; link_table(ch, tc); } if (compat != 0) tc->no.refcnt++; if (pkidx != NULL) *pkidx = tc->no.kidx; IPFW_UH_WUNLOCK(ch); if (tc_new != NULL) free_table_config(ni, tc_new); return (0); } static void ntlv_to_ti(ipfw_obj_ntlv *ntlv, struct tid_info *ti) { memset(ti, 0, sizeof(struct tid_info)); ti->set = ntlv->set; ti->uidx = ntlv->idx; ti->tlvs = ntlv; ti->tlen = ntlv->head.length; } static void objheader_to_ti(struct _ipfw_obj_header *oh, struct tid_info *ti) { ntlv_to_ti(&oh->ntlv, ti); } struct namedobj_instance * ipfw_get_table_objhash(struct ip_fw_chain *ch) { return (CHAIN_TO_NI(ch)); } /* * Exports basic table info as name TLV. * Used inside dump_static_rules() to provide info * about all tables referenced by current ruleset. * * Returns 0 on success. */ int -ipfw_export_table_ntlv(struct ip_fw_chain *ch, uint16_t kidx, +ipfw_export_table_ntlv(struct ip_fw_chain *ch, uint32_t kidx, struct sockopt_data *sd) { struct namedobj_instance *ni; struct named_object *no; ipfw_obj_ntlv *ntlv; ni = CHAIN_TO_NI(ch); no = ipfw_objhash_lookup_kidx(ni, kidx); KASSERT(no != NULL, ("invalid table kidx passed")); ntlv = (ipfw_obj_ntlv *)ipfw_get_sopt_space(sd, sizeof(*ntlv)); if (ntlv == NULL) return (ENOMEM); ntlv->head.type = IPFW_TLV_TBL_NAME; ntlv->head.length = sizeof(*ntlv); ntlv->idx = no->kidx; strlcpy(ntlv->name, no->name, sizeof(ntlv->name)); return (0); } struct dump_args { struct ip_fw_chain *ch; struct table_info *ti; struct table_config *tc; struct sockopt_data *sd; uint32_t cnt; uint16_t uidx; int error; uint32_t size; - ipfw_table_entry *ent; ta_foreach_f *f; void *farg; ipfw_obj_tentry tent; }; static int count_ext_entries(void *e, void *arg) { struct dump_args *da; da = (struct dump_args *)arg; da->cnt++; return (0); } /* * Gets number of items from table either using * internal counter or calling algo callback for * 
externally-managed tables.
 *
 * Returns number of records.
 */
static uint32_t
table_get_count(struct ip_fw_chain *ch, struct table_config *tc)
{
	struct table_info *ti;
	struct table_algo *ta;
	struct dump_args da;

	ti = KIDX_TO_TI(ch, tc->no.kidx);
	ta = tc->ta;

	/* Use internal counter for self-managed tables */
	if ((ta->flags & TA_FLAG_READONLY) == 0)
		return (tc->count);

	/* Use callback to quickly get number of items */
	if ((ta->flags & TA_FLAG_EXTCOUNTER) != 0)
		return (ta->get_count(tc->astate, ti));

	/* Count number of items ourselves, by iterating every entry */
	memset(&da, 0, sizeof(da));
	ta->foreach(tc->astate, ti, count_ext_entries, &da);

	return (da.cnt);
}

/*
 * Exports table @tc info into standard ipfw_xtable_info format.
 */
static void
export_table_info(struct ip_fw_chain *ch, struct table_config *tc,
    ipfw_xtable_info *i)
{
	struct table_info *ti;
	struct table_algo *ta;

	i->type = tc->no.subtype;
	i->tflags = tc->tflags;
	i->vmask = tc->vmask;
	i->set = tc->no.set;
	i->kidx = tc->no.kidx;
	i->refcnt = tc->no.refcnt;
	i->count = table_get_count(ch, tc);
	i->limit = tc->limit;
	i->flags |= (tc->locked != 0) ?
IPFW_TGFLAGS_LOCKED : 0; i->size = i->count * sizeof(ipfw_obj_tentry); i->size += sizeof(ipfw_obj_header) + sizeof(ipfw_xtable_info); strlcpy(i->tablename, tc->tablename, sizeof(i->tablename)); ti = KIDX_TO_TI(ch, tc->no.kidx); ta = tc->ta; if (ta->print_config != NULL) { /* Use algo function to print table config to string */ ta->print_config(tc->astate, ti, i->algoname, sizeof(i->algoname)); } else strlcpy(i->algoname, ta->name, sizeof(i->algoname)); /* Dump algo-specific data, if possible */ if (ta->dump_tinfo != NULL) { ta->dump_tinfo(tc->astate, ti, &i->ta_info); i->ta_info.flags |= IPFW_TATFLAGS_DATA; } } struct dump_table_args { struct ip_fw_chain *ch; struct sockopt_data *sd; }; static int export_table_internal(struct namedobj_instance *ni, struct named_object *no, void *arg) { ipfw_xtable_info *i; struct dump_table_args *dta; dta = (struct dump_table_args *)arg; i = (ipfw_xtable_info *)ipfw_get_sopt_space(dta->sd, sizeof(*i)); KASSERT(i != NULL, ("previously checked buffer is not enough")); export_table_info(dta->ch, (struct table_config *)no, i); return (0); } /* * Export all tables as ipfw_xtable_info structures to * storage provided by @sd. * * If supplied buffer is too small, fills in required size * and returns ENOMEM. * Returns 0 on success. 
*/ static int export_tables(struct ip_fw_chain *ch, ipfw_obj_lheader *olh, struct sockopt_data *sd) { uint32_t size; uint32_t count; struct dump_table_args dta; count = ipfw_objhash_count(CHAIN_TO_NI(ch)); size = count * sizeof(ipfw_xtable_info) + sizeof(ipfw_obj_lheader); /* Fill in header regadless of buffer size */ olh->count = count; olh->objsize = sizeof(ipfw_xtable_info); if (size > olh->size) { olh->size = size; return (ENOMEM); } olh->size = size; dta.ch = ch; dta.sd = sd; ipfw_objhash_foreach(CHAIN_TO_NI(ch), export_table_internal, &dta); return (0); } /* * Dumps all table data * Data layout (v1)(current): * Request: [ ipfw_obj_header ], size = ipfw_xtable_info.size * Reply: [ ipfw_obj_header ipfw_xtable_info ipfw_obj_tentry x N ] * * Returns 0 on success */ static int dump_table_v1(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { struct _ipfw_obj_header *oh; ipfw_xtable_info *i; struct tid_info ti; struct table_config *tc; struct table_algo *ta; struct dump_args da; uint32_t sz; sz = sizeof(ipfw_obj_header) + sizeof(ipfw_xtable_info); oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz); if (oh == NULL) return (EINVAL); i = (ipfw_xtable_info *)(oh + 1); objheader_to_ti(oh, &ti); IPFW_UH_RLOCK(ch); if ((tc = find_table(CHAIN_TO_NI(ch), &ti)) == NULL) { IPFW_UH_RUNLOCK(ch); return (ESRCH); } export_table_info(ch, tc, i); if (sd->valsize < i->size) { /* * Submitted buffer size is not enough. * WE've already filled in @i structure with * relevant table info including size, so we * can return. Buffer will be flushed automatically. 
*/ IPFW_UH_RUNLOCK(ch); return (ENOMEM); } /* * Do the actual dump in eXtended format */ memset(&da, 0, sizeof(da)); da.ch = ch; da.ti = KIDX_TO_TI(ch, tc->no.kidx); da.tc = tc; da.sd = sd; ta = tc->ta; ta->foreach(tc->astate, da.ti, dump_table_tentry, &da); IPFW_UH_RUNLOCK(ch); return (da.error); } -/* - * Dumps all table data - * Data layout (version 0)(legacy): - * Request: [ ipfw_xtable ], size = IP_FW_TABLE_XGETSIZE() - * Reply: [ ipfw_xtable ipfw_table_xentry x N ] - * - * Returns 0 on success - */ -static int -dump_table_v0(struct ip_fw_chain *ch, ip_fw3_opheader *op3, - struct sockopt_data *sd) -{ - ipfw_xtable *xtbl; - struct tid_info ti; - struct table_config *tc; - struct table_algo *ta; - struct dump_args da; - size_t sz, count; - - xtbl = (ipfw_xtable *)ipfw_get_sopt_header(sd, sizeof(ipfw_xtable)); - if (xtbl == NULL) - return (EINVAL); - - memset(&ti, 0, sizeof(ti)); - ti.uidx = xtbl->tbl; - - IPFW_UH_RLOCK(ch); - if ((tc = find_table(CHAIN_TO_NI(ch), &ti)) == NULL) { - IPFW_UH_RUNLOCK(ch); - return (0); - } - count = table_get_count(ch, tc); - sz = count * sizeof(ipfw_table_xentry) + sizeof(ipfw_xtable); - - xtbl->cnt = count; - xtbl->size = sz; - xtbl->type = tc->no.subtype; - xtbl->tbl = ti.uidx; - - if (sd->valsize < sz) { - /* - * Submitted buffer size is not enough. - * WE've already filled in @i structure with - * relevant table info including size, so we - * can return. Buffer will be flushed automatically. - */ - IPFW_UH_RUNLOCK(ch); - return (ENOMEM); - } - - /* Do the actual dump in eXtended format */ - memset(&da, 0, sizeof(da)); - da.ch = ch; - da.ti = KIDX_TO_TI(ch, tc->no.kidx); - da.tc = tc; - da.sd = sd; - - ta = tc->ta; - - ta->foreach(tc->astate, da.ti, dump_table_xentry, &da); - IPFW_UH_RUNLOCK(ch); - - return (0); -} - -/* - * Legacy function to retrieve number of items in table. 
- */ -static int -get_table_size(struct ip_fw_chain *ch, ip_fw3_opheader *op3, - struct sockopt_data *sd) -{ - uint32_t *tbl; - struct tid_info ti; - size_t sz; - int error; - - sz = sizeof(*op3) + sizeof(uint32_t); - op3 = (ip_fw3_opheader *)ipfw_get_sopt_header(sd, sz); - if (op3 == NULL) - return (EINVAL); - - tbl = (uint32_t *)(op3 + 1); - memset(&ti, 0, sizeof(ti)); - ti.uidx = *tbl; - IPFW_UH_RLOCK(ch); - error = ipfw_count_xtable(ch, &ti, tbl); - IPFW_UH_RUNLOCK(ch); - return (error); -} - -/* - * Legacy IP_FW_TABLE_GETSIZE handler - */ -int -ipfw_count_table(struct ip_fw_chain *ch, struct tid_info *ti, uint32_t *cnt) -{ - struct table_config *tc; - - if ((tc = find_table(CHAIN_TO_NI(ch), ti)) == NULL) - return (ESRCH); - *cnt = table_get_count(ch, tc); - return (0); -} - -/* - * Legacy IP_FW_TABLE_XGETSIZE handler - */ -int -ipfw_count_xtable(struct ip_fw_chain *ch, struct tid_info *ti, uint32_t *cnt) -{ - struct table_config *tc; - uint32_t count; - - if ((tc = find_table(CHAIN_TO_NI(ch), ti)) == NULL) { - *cnt = 0; - return (0); /* 'table all list' requires success */ - } - - count = table_get_count(ch, tc); - *cnt = count * sizeof(ipfw_table_xentry); - if (count > 0) - *cnt += sizeof(ipfw_xtable); - return (0); -} - -static int -dump_table_entry(void *e, void *arg) -{ - struct dump_args *da; - struct table_config *tc; - struct table_algo *ta; - ipfw_table_entry *ent; - struct table_value *pval; - int error; - - da = (struct dump_args *)arg; - - tc = da->tc; - ta = tc->ta; - - /* Out of memory, returning */ - if (da->cnt == da->size) - return (1); - ent = da->ent++; - ent->tbl = da->uidx; - da->cnt++; - - error = ta->dump_tentry(tc->astate, da->ti, e, &da->tent); - if (error != 0) - return (error); - - ent->addr = da->tent.k.addr.s_addr; - ent->masklen = da->tent.masklen; - pval = get_table_value(da->ch, da->tc, da->tent.v.kidx); - ent->value = ipfw_export_table_value_legacy(pval); - - return (0); -} - -/* - * Dumps table in pre-8.1 legacy format. 
- */ -int -ipfw_dump_table_legacy(struct ip_fw_chain *ch, struct tid_info *ti, - ipfw_table *tbl) -{ - struct table_config *tc; - struct table_algo *ta; - struct dump_args da; - - tbl->cnt = 0; - - if ((tc = find_table(CHAIN_TO_NI(ch), ti)) == NULL) - return (0); /* XXX: We should return ESRCH */ - - ta = tc->ta; - - /* This dump format supports IPv4 only */ - if (tc->no.subtype != IPFW_TABLE_ADDR) - return (0); - - memset(&da, 0, sizeof(da)); - da.ch = ch; - da.ti = KIDX_TO_TI(ch, tc->no.kidx); - da.tc = tc; - da.ent = &tbl->ent[0]; - da.size = tbl->size; - - tbl->cnt = 0; - ta->foreach(tc->astate, da.ti, dump_table_entry, &da); - tbl->cnt = da.cnt; - - return (0); -} - /* * Dumps table entry in eXtended format (v1)(current). */ static int dump_table_tentry(void *e, void *arg) { struct dump_args *da; struct table_config *tc; struct table_algo *ta; struct table_value *pval; ipfw_obj_tentry *tent; int error; da = (struct dump_args *)arg; tc = da->tc; ta = tc->ta; tent = (ipfw_obj_tentry *)ipfw_get_sopt_space(da->sd, sizeof(*tent)); /* Out of memory, returning */ if (tent == NULL) { da->error = ENOMEM; return (1); } tent->head.length = sizeof(ipfw_obj_tentry); tent->idx = da->uidx; error = ta->dump_tentry(tc->astate, da->ti, e, tent); if (error != 0) return (error); pval = get_table_value(da->ch, da->tc, tent->v.kidx); ipfw_export_table_value_v1(pval, &tent->v.value); return (0); } -/* - * Dumps table entry in eXtended format (v0). 
- */ -static int -dump_table_xentry(void *e, void *arg) -{ - struct dump_args *da; - struct table_config *tc; - struct table_algo *ta; - ipfw_table_xentry *xent; - ipfw_obj_tentry *tent; - struct table_value *pval; - int error; - - da = (struct dump_args *)arg; - - tc = da->tc; - ta = tc->ta; - - xent = (ipfw_table_xentry *)ipfw_get_sopt_space(da->sd, sizeof(*xent)); - /* Out of memory, returning */ - if (xent == NULL) - return (1); - xent->len = sizeof(ipfw_table_xentry); - xent->tbl = da->uidx; - - memset(&da->tent, 0, sizeof(da->tent)); - tent = &da->tent; - error = ta->dump_tentry(tc->astate, da->ti, e, tent); - if (error != 0) - return (error); - - /* Convert current format to previous one */ - xent->masklen = tent->masklen; - pval = get_table_value(da->ch, da->tc, da->tent.v.kidx); - xent->value = ipfw_export_table_value_legacy(pval); - /* Apply some hacks */ - if (tc->no.subtype == IPFW_TABLE_ADDR && tent->subtype == AF_INET) { - xent->k.addr6.s6_addr32[3] = tent->k.addr.s_addr; - xent->flags = IPFW_TCF_INET; - } else - memcpy(&xent->k, &tent->k, sizeof(xent->k)); - - return (0); -} - /* * Helper function to export table algo data * to tentry format before calling user function. * * Returns 0 on success. */ static int prepare_table_tentry(void *e, void *arg) { struct dump_args *da; struct table_config *tc; struct table_algo *ta; int error; da = (struct dump_args *)arg; tc = da->tc; ta = tc->ta; error = ta->dump_tentry(tc->astate, da->ti, e, &da->tent); if (error != 0) return (error); da->f(&da->tent, da->farg); return (0); } /* * Allow external consumers to read table entries in standard format. 
*/ int -ipfw_foreach_table_tentry(struct ip_fw_chain *ch, uint16_t kidx, +ipfw_foreach_table_tentry(struct ip_fw_chain *ch, uint32_t kidx, ta_foreach_f *f, void *arg) { struct namedobj_instance *ni; struct table_config *tc; struct table_algo *ta; struct dump_args da; ni = CHAIN_TO_NI(ch); tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, kidx); if (tc == NULL) return (ESRCH); ta = tc->ta; memset(&da, 0, sizeof(da)); da.ch = ch; da.ti = KIDX_TO_TI(ch, tc->no.kidx); da.tc = tc; da.f = f; da.farg = arg; ta->foreach(tc->astate, da.ti, prepare_table_tentry, &da); return (0); } /* * Table algorithms - */ + */ /* * Finds algorithm by index, table type or supplied name. * * Returns pointer to algo or NULL. */ static struct table_algo * find_table_algo(struct tables_config *tcfg, struct tid_info *ti, char *name) { int i, l; struct table_algo *ta; if (ti->type > IPFW_TABLE_MAXTYPE) return (NULL); /* Search by index */ if (ti->atype != 0) { if (ti->atype > tcfg->algo_count) return (NULL); return (tcfg->algo[ti->atype]); } if (name == NULL) { /* Return default algorithm for given type if set */ return (tcfg->def_algo[ti->type]); } /* Search by name */ /* TODO: better search */ for (i = 1; i <= tcfg->algo_count; i++) { ta = tcfg->algo[i]; /* * One can supply additional algorithm * parameters so we compare only the first word * of supplied name: * 'addr:chash hsize=32' * '^^^^^^^^^' * */ l = strlen(ta->name); if (strncmp(name, ta->name, l) != 0) continue; if (name[l] != '\0' && name[l] != ' ') continue; /* Check if we're requesting proper table type */ if (ti->type != 0 && ti->type != ta->type) return (NULL); return (ta); } return (NULL); } /* * Register new table algo @ta. * Stores algo id inside @idx. * * Returns 0 on success. 
*/ int ipfw_add_table_algo(struct ip_fw_chain *ch, struct table_algo *ta, size_t size, int *idx) { struct tables_config *tcfg; struct table_algo *ta_new; size_t sz; if (size > sizeof(struct table_algo)) return (EINVAL); /* Check for the required on-stack size for add/del */ sz = roundup2(ta->ta_buf_size, sizeof(void *)); if (sz > TA_BUF_SZ) return (EINVAL); KASSERT(ta->type <= IPFW_TABLE_MAXTYPE,("Increase IPFW_TABLE_MAXTYPE")); /* Copy algorithm data to stable storage. */ ta_new = malloc(sizeof(struct table_algo), M_IPFW, M_WAITOK | M_ZERO); memcpy(ta_new, ta, size); tcfg = CHAIN_TO_TCFG(ch); KASSERT(tcfg->algo_count < 255, ("Increase algo array size")); tcfg->algo[++tcfg->algo_count] = ta_new; ta_new->idx = tcfg->algo_count; /* Set algorithm as default one for given type */ if ((ta_new->flags & TA_FLAG_DEFAULT) != 0 && tcfg->def_algo[ta_new->type] == NULL) tcfg->def_algo[ta_new->type] = ta_new; *idx = ta_new->idx; return (0); } /* * Unregisters table algo using @idx as id. * XXX: It is NOT safe to call this function in any place * other than ipfw instance destroy handler. */ void ipfw_del_table_algo(struct ip_fw_chain *ch, int idx) { struct tables_config *tcfg; struct table_algo *ta; tcfg = CHAIN_TO_TCFG(ch); KASSERT(idx <= tcfg->algo_count, ("algo idx %d out of range 1..%d", idx, tcfg->algo_count)); ta = tcfg->algo[idx]; KASSERT(ta != NULL, ("algo idx %d is NULL", idx)); if (tcfg->def_algo[ta->type] == ta) tcfg->def_algo[ta->type] = NULL; free(ta, M_IPFW); } /* * Lists all table algorithms currently available. 
* Data layout (v0)(current): * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size * Reply: [ ipfw_obj_lheader ipfw_ta_info x N ] * * Returns 0 on success */ static int list_table_algo(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { struct _ipfw_obj_lheader *olh; struct tables_config *tcfg; ipfw_ta_info *i; struct table_algo *ta; uint32_t count, n, size; olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh)); if (olh == NULL) return (EINVAL); if (sd->valsize < olh->size) return (EINVAL); IPFW_UH_RLOCK(ch); tcfg = CHAIN_TO_TCFG(ch); count = tcfg->algo_count; size = count * sizeof(ipfw_ta_info) + sizeof(ipfw_obj_lheader); /* Fill in header regadless of buffer size */ olh->count = count; olh->objsize = sizeof(ipfw_ta_info); if (size > olh->size) { olh->size = size; IPFW_UH_RUNLOCK(ch); return (ENOMEM); } olh->size = size; for (n = 1; n <= count; n++) { i = (ipfw_ta_info *)ipfw_get_sopt_space(sd, sizeof(*i)); KASSERT(i != NULL, ("previously checked buffer is not enough")); ta = tcfg->algo[n]; strlcpy(i->algoname, ta->name, sizeof(i->algoname)); i->type = ta->type; i->refcnt = ta->refcnt; } IPFW_UH_RUNLOCK(ch); return (0); } static int -classify_srcdst(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype) +classify_srcdst(ipfw_insn *cmd0, uint32_t *puidx, uint8_t *ptype) { + ipfw_insn_table *cmd; + /* Basic IPv4/IPv6 or u32 lookups */ - *puidx = cmd->arg1; - /* Assume ADDR by default */ - *ptype = IPFW_TABLE_ADDR; - int v; - - if (F_LEN(cmd) > F_INSN_SIZE(ipfw_insn_u32)) { - /* - * generic lookup. The key must be - * in 32bit big-endian format. 
- */ - v = ((ipfw_insn_u32 *)cmd)->d[1]; - switch (v) { - case LOOKUP_DST_IP: - case LOOKUP_SRC_IP: - break; - case LOOKUP_DST_PORT: - case LOOKUP_SRC_PORT: - case LOOKUP_UID: - case LOOKUP_JAIL: - case LOOKUP_DSCP: - case LOOKUP_MARK: - *ptype = IPFW_TABLE_NUMBER; - break; - case LOOKUP_DST_MAC: - case LOOKUP_SRC_MAC: - *ptype = IPFW_TABLE_MAC; - break; - } + cmd = insntod(cmd0, table); + *puidx = cmd->kidx; + switch(cmd0->arg1) { + case LOOKUP_DST_IP: + case LOOKUP_SRC_IP: + default: + /* IPv4 src/dst */ + *ptype = IPFW_TABLE_ADDR; + break; + case LOOKUP_DST_PORT: + case LOOKUP_SRC_PORT: + case LOOKUP_UID: + case LOOKUP_JAIL: + case LOOKUP_DSCP: + case LOOKUP_MARK: + case LOOKUP_RULENUM: + *ptype = IPFW_TABLE_NUMBER; + break; + case LOOKUP_DST_MAC: + case LOOKUP_SRC_MAC: + *ptype = IPFW_TABLE_MAC; + break; } - return (0); } static int -classify_via(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype) +classify_via(ipfw_insn *cmd0, uint32_t *puidx, uint8_t *ptype) { ipfw_insn_if *cmdif; /* Interface table, possibly */ - cmdif = (ipfw_insn_if *)cmd; + cmdif = insntod(cmd0, if); if (cmdif->name[0] != '\1') return (1); *ptype = IPFW_TABLE_INTERFACE; - *puidx = cmdif->p.kidx; - + *puidx = cmdif->p.kidx; /* XXXAE */ return (0); } static int -classify_flow(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype) +classify_flow(ipfw_insn *cmd0, uint32_t *puidx, uint8_t *ptype) { - - *puidx = cmd->arg1; + *puidx = insntod(cmd0, table)->kidx; *ptype = IPFW_TABLE_FLOW; - return (0); } static int -classify_mac_lookup(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype) +classify_mac_lookup(ipfw_insn *cmd0, uint32_t *puidx, uint8_t *ptype) { - *puidx = cmd->arg1; + *puidx = insntod(cmd0, table)->kidx; *ptype = IPFW_TABLE_MAC; return (0); } static void -update_arg1(ipfw_insn *cmd, uint16_t idx) +update_kidx(ipfw_insn *cmd0, uint32_t idx) { - - cmd->arg1 = idx; + insntod(cmd0, table)->kidx = idx; } static void -update_via(ipfw_insn *cmd, uint16_t idx) +update_via(ipfw_insn *cmd0, uint32_t idx) 
{ - ipfw_insn_if *cmdif; - - cmdif = (ipfw_insn_if *)cmd; - cmdif->p.kidx = idx; + insntod(cmd0, if)->p.kidx = idx; } static int table_findbyname(struct ip_fw_chain *ch, struct tid_info *ti, struct named_object **pno) { struct table_config *tc; int error; IPFW_UH_WLOCK_ASSERT(ch); error = find_table_err(CHAIN_TO_NI(ch), ti, &tc); if (error != 0) return (error); *pno = &tc->no; return (0); } /* XXX: sets-sets! */ static struct named_object * -table_findbykidx(struct ip_fw_chain *ch, uint16_t idx) +table_findbykidx(struct ip_fw_chain *ch, uint32_t idx) { struct namedobj_instance *ni; struct table_config *tc; IPFW_UH_WLOCK_ASSERT(ch); ni = CHAIN_TO_NI(ch); tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, idx); - KASSERT(tc != NULL, ("Table with index %d not found", idx)); + KASSERT(tc != NULL, ("Table with index %u not found", idx)); return (&tc->no); } static int -table_manage_sets(struct ip_fw_chain *ch, uint16_t set, uint8_t new_set, +table_manage_sets(struct ip_fw_chain *ch, uint32_t set, uint8_t new_set, enum ipfw_sets_cmd cmd) { switch (cmd) { case SWAP_ALL: case TEST_ALL: case MOVE_ALL: /* * Always return success, the real action and decision * should make table_manage_sets_all(). */ return (0); case TEST_ONE: case MOVE_ONE: /* * NOTE: we need to use ipfw_objhash_del/ipfw_objhash_add * if set number will be used in hash function. Currently * we can just use generic handler that replaces set value. */ if (V_fw_tables_sets == 0) return (0); break; case COUNT_ONE: /* * Return EOPNOTSUPP for COUNT_ONE when per-set sysctl is * disabled. This allow skip table's opcodes from additional * checks when specific rules moved to another set. */ if (V_fw_tables_sets == 0) return (EOPNOTSUPP); } /* Use generic sets handler when per-set sysctl is enabled. */ return (ipfw_obj_manage_sets(CHAIN_TO_NI(ch), IPFW_TLV_TBL_NAME, set, new_set, cmd)); } /* * We register several opcode rewriters for lookup tables. 
* All tables opcodes have the same ETLV type, but different subtype. * To avoid invoking sets handler several times for XXX_ALL commands, * we use separate manage_sets handler. O_RECV has the lowest value, * so it should be called first. */ static int -table_manage_sets_all(struct ip_fw_chain *ch, uint16_t set, uint8_t new_set, +table_manage_sets_all(struct ip_fw_chain *ch, uint32_t set, uint8_t new_set, enum ipfw_sets_cmd cmd) { switch (cmd) { case SWAP_ALL: case TEST_ALL: /* * Return success for TEST_ALL, since nothing prevents * move rules from one set to another. All tables are * accessible from all sets when per-set tables sysctl * is disabled. */ case MOVE_ALL: if (V_fw_tables_sets == 0) return (0); break; default: return (table_manage_sets(ch, set, new_set, cmd)); } /* Use generic sets handler when per-set sysctl is enabled. */ return (ipfw_obj_manage_sets(CHAIN_TO_NI(ch), IPFW_TLV_TBL_NAME, set, new_set, cmd)); } static struct opcode_obj_rewrite opcodes[] = { { .opcode = O_IP_SRC_LOOKUP, .etlv = IPFW_TLV_TBL_NAME, .classifier = classify_srcdst, - .update = update_arg1, + .update = update_kidx, .find_byname = table_findbyname, .find_bykidx = table_findbykidx, .create_object = create_table_compat, .manage_sets = table_manage_sets, }, { .opcode = O_IP_DST_LOOKUP, .etlv = IPFW_TLV_TBL_NAME, .classifier = classify_srcdst, - .update = update_arg1, + .update = update_kidx, .find_byname = table_findbyname, .find_bykidx = table_findbykidx, .create_object = create_table_compat, .manage_sets = table_manage_sets, }, { .opcode = O_IP_FLOW_LOOKUP, .etlv = IPFW_TLV_TBL_NAME, .classifier = classify_flow, - .update = update_arg1, + .update = update_kidx, .find_byname = table_findbyname, .find_bykidx = table_findbykidx, .create_object = create_table_compat, .manage_sets = table_manage_sets, }, { .opcode = O_MAC_SRC_LOOKUP, .etlv = IPFW_TLV_TBL_NAME, .classifier = classify_mac_lookup, - .update = update_arg1, + .update = update_kidx, .find_byname = table_findbyname, 
.find_bykidx = table_findbykidx, .create_object = create_table_compat, .manage_sets = table_manage_sets, }, { .opcode = O_MAC_DST_LOOKUP, .etlv = IPFW_TLV_TBL_NAME, .classifier = classify_mac_lookup, - .update = update_arg1, + .update = update_kidx, .find_byname = table_findbyname, .find_bykidx = table_findbykidx, .create_object = create_table_compat, .manage_sets = table_manage_sets, }, { .opcode = O_XMIT, .etlv = IPFW_TLV_TBL_NAME, .classifier = classify_via, .update = update_via, .find_byname = table_findbyname, .find_bykidx = table_findbykidx, .create_object = create_table_compat, .manage_sets = table_manage_sets, }, { .opcode = O_RECV, .etlv = IPFW_TLV_TBL_NAME, .classifier = classify_via, .update = update_via, .find_byname = table_findbyname, .find_bykidx = table_findbykidx, .create_object = create_table_compat, .manage_sets = table_manage_sets_all, }, { .opcode = O_VIA, .etlv = IPFW_TLV_TBL_NAME, .classifier = classify_via, .update = update_via, .find_byname = table_findbyname, .find_bykidx = table_findbykidx, .create_object = create_table_compat, .manage_sets = table_manage_sets, }, }; static int test_sets_cb(struct namedobj_instance *ni __unused, struct named_object *no, void *arg __unused) { /* Check that there aren't any tables in not default set */ if (no->set != 0) return (EBUSY); return (0); } /* * Switch between "set 0" and "rule's set" table binding, * Check all ruleset bindings and permits changing * IFF each binding has both rule AND table in default set (set 0). * * Returns 0 on success. 
*/ int ipfw_switch_tables_namespace(struct ip_fw_chain *ch, unsigned int sets) { struct opcode_obj_rewrite *rw; struct namedobj_instance *ni; struct named_object *no; struct ip_fw *rule; ipfw_insn *cmd; int cmdlen, i, l; - uint16_t kidx; + uint32_t kidx; uint8_t subtype; IPFW_UH_WLOCK(ch); if (V_fw_tables_sets == sets) { IPFW_UH_WUNLOCK(ch); return (0); } ni = CHAIN_TO_NI(ch); if (sets == 0) { /* * Prevent disabling sets support if we have some tables * in not default sets. */ if (ipfw_objhash_foreach_type(ni, test_sets_cb, NULL, IPFW_TLV_TBL_NAME) != 0) { IPFW_UH_WUNLOCK(ch); return (EBUSY); } } /* * Scan all rules and examine tables opcodes. */ for (i = 0; i < ch->n_rules; i++) { rule = ch->map[i]; l = rule->cmd_len; cmd = rule->cmd; cmdlen = 0; for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); /* Check only tables opcodes */ for (kidx = 0, rw = opcodes; rw < opcodes + nitems(opcodes); rw++) { if (rw->opcode != cmd->opcode) continue; if (rw->classifier(cmd, &kidx, &subtype) == 0) break; } if (kidx == 0) continue; no = ipfw_objhash_lookup_kidx(ni, kidx); /* Check if both table object and rule has the set 0 */ if (no->set != 0 || rule->set != 0) { IPFW_UH_WUNLOCK(ch); return (EBUSY); } } } V_fw_tables_sets = sets; IPFW_UH_WUNLOCK(ch); return (0); } /* * Checks table name for validity. * Enforce basic length checks, the rest * should be done in userland. * * Returns 0 if name is considered valid. */ static int check_table_name(const char *name) { /* * TODO: do some more complicated checks */ return (ipfw_check_object_name_generic(name)); } /* * Finds table config based on either legacy index * or name in ntlv. * Note @ti structure contains unchecked data from userland. 
* * Returns 0 in success and fills in @tc with found config */ static int find_table_err(struct namedobj_instance *ni, struct tid_info *ti, struct table_config **tc) { char *name, bname[16]; struct named_object *no; ipfw_obj_ntlv *ntlv; uint32_t set; if (ti->tlvs != NULL) { ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx, IPFW_TLV_TBL_NAME); if (ntlv == NULL) return (EINVAL); name = ntlv->name; /* * Use set provided by @ti instead of @ntlv one. * This is needed due to different sets behavior * controlled by V_fw_tables_sets. */ set = (V_fw_tables_sets != 0) ? ti->set : 0; } else { snprintf(bname, sizeof(bname), "%d", ti->uidx); name = bname; set = 0; } no = ipfw_objhash_lookup_name(ni, set, name); *tc = (struct table_config *)no; return (0); } /* * Finds table config based on either legacy index * or name in ntlv. * Note @ti structure contains unchecked data from userland. * * Returns pointer to table_config or NULL. */ static struct table_config * find_table(struct namedobj_instance *ni, struct tid_info *ti) { struct table_config *tc; if (find_table_err(ni, ti, &tc) != 0) return (NULL); return (tc); } /* * Allocate new table config structure using * specified @algo and @aname. * * Returns pointer to config or NULL. */ static struct table_config * alloc_table_config(struct ip_fw_chain *ch, struct tid_info *ti, struct table_algo *ta, char *aname, uint8_t tflags) { char *name, bname[16]; struct table_config *tc; int error; ipfw_obj_ntlv *ntlv; uint32_t set; if (ti->tlvs != NULL) { ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx, IPFW_TLV_TBL_NAME); if (ntlv == NULL) return (NULL); name = ntlv->name; set = (V_fw_tables_sets == 0) ? 
0 : ntlv->set; } else { /* Compat part: convert number to string representation */ snprintf(bname, sizeof(bname), "%d", ti->uidx); name = bname; set = 0; } tc = malloc(sizeof(struct table_config), M_IPFW, M_WAITOK | M_ZERO); tc->no.name = tc->tablename; tc->no.subtype = ta->type; tc->no.set = set; tc->tflags = tflags; tc->ta = ta; strlcpy(tc->tablename, name, sizeof(tc->tablename)); /* Set "shared" value type by default */ tc->vshared = 1; /* Preallocate data structures for new tables */ error = ta->init(ch, &tc->astate, &tc->ti_copy, aname, tflags); if (error != 0) { free(tc, M_IPFW); return (NULL); } return (tc); } /* * Destroys table state and config. */ static void free_table_config(struct namedobj_instance *ni, struct table_config *tc) { KASSERT(tc->linked == 0, ("free() on linked config")); /* UH lock MUST NOT be held */ /* * We're using ta without any locking/referencing. * TODO: fix this if we're going to use unloadable algos. */ tc->ta->destroy(tc->astate, &tc->ti_copy); free(tc, M_IPFW); } /* * Links @tc to @chain table named instance. * Sets appropriate type/states in @chain table info. */ static void link_table(struct ip_fw_chain *ch, struct table_config *tc) { struct namedobj_instance *ni; struct table_info *ti; uint16_t kidx; IPFW_UH_WLOCK_ASSERT(ch); ni = CHAIN_TO_NI(ch); kidx = tc->no.kidx; ipfw_objhash_add(ni, &tc->no); ti = KIDX_TO_TI(ch, kidx); *ti = tc->ti_copy; /* Notify algo on real @ti address */ if (tc->ta->change_ti != NULL) tc->ta->change_ti(tc->astate, ti); tc->linked = 1; tc->ta->refcnt++; } /* * Unlinks @tc from @chain table named instance. * Zeroes states in @chain and stores them in @tc. */ static void unlink_table(struct ip_fw_chain *ch, struct table_config *tc) { struct namedobj_instance *ni; struct table_info *ti; uint16_t kidx; IPFW_UH_WLOCK_ASSERT(ch); IPFW_WLOCK_ASSERT(ch); ni = CHAIN_TO_NI(ch); kidx = tc->no.kidx; /* Clear state. 
@ti copy is already saved inside @tc */ ipfw_objhash_del(ni, &tc->no); ti = KIDX_TO_TI(ch, kidx); memset(ti, 0, sizeof(struct table_info)); tc->linked = 0; tc->ta->refcnt--; /* Notify algo on real @ti address */ if (tc->ta->change_ti != NULL) tc->ta->change_ti(tc->astate, NULL); } static struct ipfw_sopt_handler scodes[] = { - { IP_FW_TABLE_XCREATE, 0, HDIR_SET, create_table }, - { IP_FW_TABLE_XDESTROY, 0, HDIR_SET, flush_table_v0 }, - { IP_FW_TABLE_XFLUSH, 0, HDIR_SET, flush_table_v0 }, - { IP_FW_TABLE_XMODIFY, 0, HDIR_BOTH, modify_table }, - { IP_FW_TABLE_XINFO, 0, HDIR_GET, describe_table }, - { IP_FW_TABLES_XLIST, 0, HDIR_GET, list_tables }, - { IP_FW_TABLE_XLIST, 0, HDIR_GET, dump_table_v0 }, - { IP_FW_TABLE_XLIST, 1, HDIR_GET, dump_table_v1 }, - { IP_FW_TABLE_XADD, 0, HDIR_BOTH, manage_table_ent_v0 }, - { IP_FW_TABLE_XADD, 1, HDIR_BOTH, manage_table_ent_v1 }, - { IP_FW_TABLE_XDEL, 0, HDIR_BOTH, manage_table_ent_v0 }, - { IP_FW_TABLE_XDEL, 1, HDIR_BOTH, manage_table_ent_v1 }, - { IP_FW_TABLE_XFIND, 0, HDIR_GET, find_table_entry }, - { IP_FW_TABLE_XSWAP, 0, HDIR_SET, swap_table }, - { IP_FW_TABLES_ALIST, 0, HDIR_GET, list_table_algo }, - { IP_FW_TABLE_XGETSIZE, 0, HDIR_GET, get_table_size }, + { IP_FW_TABLE_XCREATE, IP_FW3_OPVER, HDIR_SET, create_table }, + { IP_FW_TABLE_XDESTROY, IP_FW3_OPVER, HDIR_SET, flush_table_v0 }, + { IP_FW_TABLE_XFLUSH, IP_FW3_OPVER, HDIR_SET, flush_table_v0 }, + { IP_FW_TABLE_XMODIFY, IP_FW3_OPVER, HDIR_BOTH, modify_table }, + { IP_FW_TABLE_XINFO, IP_FW3_OPVER, HDIR_GET, describe_table }, + { IP_FW_TABLES_XLIST, IP_FW3_OPVER, HDIR_GET, list_tables }, + { IP_FW_TABLE_XLIST, IP_FW3_OPVER, HDIR_GET, dump_table_v1 }, + { IP_FW_TABLE_XADD, IP_FW3_OPVER, HDIR_BOTH, manage_table_ent_v1 }, + { IP_FW_TABLE_XDEL, IP_FW3_OPVER, HDIR_BOTH, manage_table_ent_v1 }, + { IP_FW_TABLE_XFIND, IP_FW3_OPVER, HDIR_GET, find_table_entry }, + { IP_FW_TABLE_XSWAP, IP_FW3_OPVER, HDIR_SET, swap_table }, + { IP_FW_TABLES_ALIST, IP_FW3_OPVER, HDIR_GET, 
list_table_algo }, }; static int destroy_table_locked(struct namedobj_instance *ni, struct named_object *no, void *arg) { unlink_table((struct ip_fw_chain *)arg, (struct table_config *)no); if (ipfw_objhash_free_idx(ni, no->kidx) != 0) printf("Error unlinking kidx %d from table %s\n", no->kidx, no->name); free_table_config(ni, (struct table_config *)no); return (0); } /* * Shuts tables module down. */ void ipfw_destroy_tables(struct ip_fw_chain *ch, int last) { IPFW_DEL_SOPT_HANDLER(last, scodes); IPFW_DEL_OBJ_REWRITER(last, opcodes); /* Remove all tables from working set */ IPFW_UH_WLOCK(ch); IPFW_WLOCK(ch); ipfw_objhash_foreach(CHAIN_TO_NI(ch), destroy_table_locked, ch); IPFW_WUNLOCK(ch); IPFW_UH_WUNLOCK(ch); /* Free pointers itself */ free(ch->tablestate, M_IPFW); ipfw_table_value_destroy(ch, last); ipfw_table_algo_destroy(ch); ipfw_objhash_destroy(CHAIN_TO_NI(ch)); free(CHAIN_TO_TCFG(ch), M_IPFW); } /* * Starts tables module. */ int ipfw_init_tables(struct ip_fw_chain *ch, int first) { struct tables_config *tcfg; /* Allocate pointers */ ch->tablestate = malloc(V_fw_tables_max * sizeof(struct table_info), M_IPFW, M_WAITOK | M_ZERO); tcfg = malloc(sizeof(struct tables_config), M_IPFW, M_WAITOK | M_ZERO); - tcfg->namehash = ipfw_objhash_create(V_fw_tables_max); + tcfg->namehash = ipfw_objhash_create(V_fw_tables_max, + DEFAULT_OBJHASH_SIZE); ch->tblcfg = tcfg; ipfw_table_value_init(ch, first); ipfw_table_algo_init(ch); IPFW_ADD_OBJ_REWRITER(first, opcodes); IPFW_ADD_SOPT_HANDLER(first, scodes); return (0); } diff --git a/sys/netpfil/ipfw/ip_fw_table.h b/sys/netpfil/ipfw/ip_fw_table.h index 9fb845815ffa..1dd7b198236d 100644 --- a/sys/netpfil/ipfw/ip_fw_table.h +++ b/sys/netpfil/ipfw/ip_fw_table.h @@ -1,232 +1,220 @@ /*- * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _IPFW2_TABLE_H #define _IPFW2_TABLE_H /* * Internal constants and data structures used by ipfw tables * not meant to be exported outside the kernel. 
*/ #ifdef _KERNEL struct table_algo; struct tables_config { struct namedobj_instance *namehash; struct namedobj_instance *valhash; uint32_t val_size; uint32_t algo_count; struct table_algo *algo[256]; struct table_algo *def_algo[IPFW_TABLE_MAXTYPE + 1]; TAILQ_HEAD(op_state_l,op_state) state_list; }; #define CHAIN_TO_TCFG(chain) ((struct tables_config *)(chain)->tblcfg) struct table_info { table_lookup_t *lookup; /* Lookup function */ void *state; /* Lookup radix/other structure */ void *xstate; /* eXtended state */ u_long data; /* Hints for given func */ }; struct table_value; struct tentry_info { void *paddr; struct table_value *pvalue; void *ptv; /* Temporary field to hold obj */ uint8_t masklen; /* mask length */ uint8_t subtype; uint16_t flags; /* record flags */ uint32_t value; /* value index */ }; #define TEI_FLAGS_UPDATE 0x0001 /* Add or update rec if exists */ #define TEI_FLAGS_UPDATED 0x0002 /* Entry has been updated */ #define TEI_FLAGS_COMPAT 0x0004 /* Called from old ABI */ #define TEI_FLAGS_DONTADD 0x0008 /* Do not create new rec */ #define TEI_FLAGS_ADDED 0x0010 /* Entry was added */ #define TEI_FLAGS_DELETED 0x0020 /* Entry was deleted */ #define TEI_FLAGS_LIMIT 0x0040 /* Limit was hit */ #define TEI_FLAGS_ERROR 0x0080 /* Unknown request error */ #define TEI_FLAGS_NOTFOUND 0x0100 /* Entry was not found */ #define TEI_FLAGS_EXISTS 0x0200 /* Entry already exists */ typedef int (ta_init)(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags); typedef void (ta_destroy)(void *ta_state, struct table_info *ti); typedef int (ta_prepare_add)(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); typedef int (ta_prepare_del)(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); typedef int (ta_add)(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); typedef int (ta_del)(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); 
typedef void (ta_flush_entry)(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); typedef int (ta_need_modify)(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags); typedef int (ta_prepare_mod)(void *ta_buf, uint64_t *pflags); typedef int (ta_fill_mod)(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t *pflags); typedef void (ta_modify)(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t pflags); typedef void (ta_flush_mod)(void *ta_buf); typedef void (ta_change_ti)(void *ta_state, struct table_info *ti); typedef void (ta_print_config)(void *ta_state, struct table_info *ti, char *buf, size_t bufsize); typedef int ta_foreach_f(void *node, void *arg); typedef void ta_foreach(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg); typedef int ta_dump_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent); typedef int ta_find_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent); typedef void ta_dump_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo); typedef uint32_t ta_get_count(void *ta_state, struct table_info *ti); struct table_algo { char name[16]; uint32_t idx; uint32_t type; uint32_t refcnt; uint32_t flags; uint32_t vlimit; size_t ta_buf_size; ta_init *init; ta_destroy *destroy; ta_prepare_add *prepare_add; ta_prepare_del *prepare_del; ta_add *add; ta_del *del; ta_flush_entry *flush_entry; ta_find_tentry *find_tentry; ta_need_modify *need_modify; ta_prepare_mod *prepare_mod; ta_fill_mod *fill_mod; ta_modify *modify; ta_flush_mod *flush_mod; ta_change_ti *change_ti; ta_foreach *foreach; ta_dump_tentry *dump_tentry; ta_print_config *print_config; ta_dump_tinfo *dump_tinfo; ta_get_count *get_count; }; #define TA_FLAG_DEFAULT 0x01 /* Algo is default for given type */ #define TA_FLAG_READONLY 0x02 /* Algo does not support modifications*/ #define TA_FLAG_EXTCOUNTER 0x04 /* Algo has external counter available*/ int ipfw_add_table_algo(struct 
ip_fw_chain *ch, struct table_algo *ta, size_t size, int *idx); void ipfw_del_table_algo(struct ip_fw_chain *ch, int idx); void ipfw_table_algo_init(struct ip_fw_chain *chain); void ipfw_table_algo_destroy(struct ip_fw_chain *chain); MALLOC_DECLARE(M_IPFW_TBL); /* Exported to support legacy opcodes */ int add_table_entry(struct ip_fw_chain *ch, struct tid_info *ti, struct tentry_info *tei, uint8_t flags, uint32_t count); int del_table_entry(struct ip_fw_chain *ch, struct tid_info *ti, struct tentry_info *tei, uint8_t flags, uint32_t count); int flush_table(struct ip_fw_chain *ch, struct tid_info *ti); -void ipfw_import_table_value_legacy(uint32_t value, struct table_value *v); -uint32_t ipfw_export_table_value_legacy(struct table_value *v); -int ipfw_get_table_size(struct ip_fw_chain *ch, ip_fw3_opheader *op3, - struct sockopt_data *sd); /* ipfw_table_value.c functions */ struct table_config; struct tableop_state; void ipfw_table_value_init(struct ip_fw_chain *ch, int first); void ipfw_table_value_destroy(struct ip_fw_chain *ch, int last); int ipfw_link_table_values(struct ip_fw_chain *ch, struct tableop_state *ts, uint8_t flags); void ipfw_garbage_table_values(struct ip_fw_chain *ch, struct table_config *tc, struct tentry_info *tei, uint32_t count, int rollback); void ipfw_import_table_value_v1(ipfw_table_value *iv); void ipfw_export_table_value_v1(struct table_value *v, ipfw_table_value *iv); void ipfw_unref_table_values(struct ip_fw_chain *ch, struct table_config *tc, struct table_algo *ta, void *astate, struct table_info *ti); void rollback_table_values(struct tableop_state *ts); int ipfw_rewrite_table_uidx(struct ip_fw_chain *chain, struct rule_check_info *ci); int ipfw_mark_table_kidx(struct ip_fw_chain *chain, struct ip_fw *rule, uint32_t *bmask); -int ipfw_export_table_ntlv(struct ip_fw_chain *ch, uint16_t kidx, +int ipfw_export_table_ntlv(struct ip_fw_chain *ch, uint32_t kidx, struct sockopt_data *sd); void ipfw_unref_rule_tables(struct ip_fw_chain *chain, 
struct ip_fw *rule); struct namedobj_instance *ipfw_get_table_objhash(struct ip_fw_chain *ch); /* utility functions */ int ipfw_move_tables_sets(struct ip_fw_chain *ch, ipfw_range_tlv *rt, uint32_t new_set); void ipfw_swap_tables_sets(struct ip_fw_chain *ch, uint32_t old_set, uint32_t new_set, int mv); -int ipfw_foreach_table_tentry(struct ip_fw_chain *ch, uint16_t kidx, +int ipfw_foreach_table_tentry(struct ip_fw_chain *ch, uint32_t kidx, ta_foreach_f f, void *arg); /* internal functions */ void tc_ref(struct table_config *tc); void tc_unref(struct table_config *tc); struct op_state; typedef void (op_rollback_f)(void *object, struct op_state *state); struct op_state { TAILQ_ENTRY(op_state) next; /* chain link */ op_rollback_f *func; }; struct tableop_state { struct op_state opstate; struct ip_fw_chain *ch; struct table_config *tc; struct table_algo *ta; struct tentry_info *tei; uint32_t count; uint32_t vmask; int vshared; int modified; }; void add_toperation_state(struct ip_fw_chain *ch, struct tableop_state *ts); void del_toperation_state(struct ip_fw_chain *ch, struct tableop_state *ts); void rollback_toperation_state(struct ip_fw_chain *ch, void *object); -/* Legacy interfaces */ -int ipfw_count_table(struct ip_fw_chain *ch, struct tid_info *ti, - uint32_t *cnt); -int ipfw_count_xtable(struct ip_fw_chain *ch, struct tid_info *ti, - uint32_t *cnt); -int ipfw_dump_table_legacy(struct ip_fw_chain *ch, struct tid_info *ti, - ipfw_table *tbl); - #endif /* _KERNEL */ #endif /* _IPFW2_TABLE_H */ diff --git a/sys/netpfil/ipfw/ip_fw_table_algo.c b/sys/netpfil/ipfw/ip_fw_table_algo.c index ccc1a8098585..515c420ed663 100644 --- a/sys/netpfil/ipfw/ip_fw_table_algo.c +++ b/sys/netpfil/ipfw/ip_fw_table_algo.c @@ -1,4370 +1,4371 @@ /*- - * Copyright (c) 2014 Yandex LLC + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2014-2025 Yandex LLC * Copyright (c) 2014 Alexander V. 
Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include /* * Lookup table algorithms. * */ #include "opt_ipfw.h" #include "opt_inet.h" #ifndef INET #error IPFIREWALL requires INET. #endif /* INET */ #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include /* ip_fw.h requires IFNAMSIZ */ #include #include #include #include #include #include #include /* struct ipfw_rule_ref */ #include #include #include #include /* * IPFW table lookup algorithms. * * What is needed to add another table algo? * * Algo init: * * struct table_algo has to be filled with: * name: "type:algoname" format, e.g. "addr:radix". Currently * there are the following types: "addr", "iface", "number" and "flow". 
* type: one of IPFW_TABLE_* types * flags: one or more TA_FLAGS_* * ta_buf_size: size of structure used to store add/del item state. * Needs to be less than TA_BUF_SZ. * callbacks: see below for description. * * ipfw_add_table_algo / ipfw_del_table_algo has to be called * * Callbacks description: * * -init: request to initialize new table instance. * typedef int (ta_init)(struct ip_fw_chain *ch, void **ta_state, * struct table_info *ti, char *data, uint8_t tflags); * MANDATORY, unlocked. (M_WAITOK). Returns 0 on success. * * Allocate all structures needed for normal operations. * * Caller may want to parse @data for some algo-specific * options provided by userland. * * Caller may want to save configuration state pointer to @ta_state * * Caller needs to save desired runtime structure pointer(s) * inside @ti fields. Note that it is not correct to save * @ti pointer at this moment. Use -change_ti hook for that. * * Caller has to fill in ti->lookup to appropriate function * pointer. * * * * -destroy: request to destroy table instance. * typedef void (ta_destroy)(void *ta_state, struct table_info *ti); * MANDATORY, unlocked. (M_WAITOK). * * Frees all table entries and all tables structures allocated by -init. * * * * -prepare_add: request to allocate state for adding new entry. * typedef int (ta_prepare_add)(struct ip_fw_chain *ch, struct tentry_info *tei, * void *ta_buf); * MANDATORY, unlocked. (M_WAITOK). Returns 0 on success. * * Allocates state and fills it in with all necessary data (EXCEPT value) * from @tei to minimize operations needed to be done under WLOCK. * "value" field has to be copied to new entry in @add callback. * Buffer ta_buf of size ta->ta_buf_sz may be used to store * allocated state. * * * * -prepare_del: request to set state for deleting existing entry. * typedef int (ta_prepare_del)(struct ip_fw_chain *ch, struct tentry_info *tei, * void *ta_buf); * MANDATORY, locked, UH. (M_NOWAIT). Returns 0 on success. 
* * Buffer ta_buf of size ta->ta_buf_sz may be used to store * allocated state. Caller should use on-stack ta_buf allocation * instead of doing malloc(). * * * * -add: request to insert new entry into runtime/config structures. * typedef int (ta_add)(void *ta_state, struct table_info *ti, * struct tentry_info *tei, void *ta_buf, uint32_t *pnum); * MANDATORY, UH+WLOCK. (M_NOWAIT). Returns 0 on success. * * Insert new entry using previously-allocated state in @ta_buf. * * @tei may have the following flags: * TEI_FLAGS_UPDATE: request to add or update entry. * TEI_FLAGS_DONTADD: request to update (but not add) entry. * * Caller is required to do the following: * copy real entry value from @tei * entry added: return 0, set 1 to @pnum * entry updated: return 0, store 0 to @pnum, store old value in @tei, * add TEI_FLAGS_UPDATED flag to @tei. * entry exists: return EEXIST * entry not found: return ENOENT * other error: return non-zero error code. * * * * -del: request to delete existing entry from runtime/config structures. * typedef int (ta_del)(void *ta_state, struct table_info *ti, * struct tentry_info *tei, void *ta_buf, uint32_t *pnum); * MANDATORY, UH+WLOCK. (M_NOWAIT). Returns 0 on success. * * Delete entry using previously set up in @ta_buf. * * Caller is required to do the following: * entry deleted: return 0, set 1 to @pnum, store old value in @tei. * entry not found: return ENOENT * other error: return non-zero error code. * * * * -flush_entry: flush entry state created by -prepare_add / -del / others * typedef void (ta_flush_entry)(struct ip_fw_chain *ch, * struct tentry_info *tei, void *ta_buf); * MANDATORY, may be locked. (M_NOWAIT). * * Delete state allocated by: * -prepare_add (-add returned EEXIST|UPDATED) * -prepare_del (if any) * -del * * Caller is required to handle empty @ta_buf correctly. 
* * * -find_tentry: finds entry specified by key @tei * typedef int ta_find_tentry(void *ta_state, struct table_info *ti, * ipfw_obj_tentry *tent); * OPTIONAL, locked (UH). (M_NOWAIT). Returns 0 on success. * * Finds entry specified by given key. * * Caller is required to do the following: * entry found: returns 0, export entry to @tent * entry not found: returns ENOENT * * * -need_modify: checks if @ti has enough space to hold another @count items. * typedef int (ta_need_modify)(void *ta_state, struct table_info *ti, * uint32_t count, uint64_t *pflags); * OPTIONAL, locked (UH). (M_NOWAIT). Returns 0 if has. * * Checks if given table has enough space to add @count items without * resize. Caller may use @pflags to store desired modification data. * * * * -prepare_mod: allocate structures for table modification. * typedef int (ta_prepare_mod)(void *ta_buf, uint64_t *pflags); * OPTIONAL(need_modify), unlocked. (M_WAITOK). Returns 0 on success. * * Allocate all needed state for table modification. Caller * should use `struct mod_item` to store new state in @ta_buf. * Up to TA_BUF_SZ (128 bytes) can be stored in @ta_buf. * * * * -fill_mod: copy some data to new state/ * typedef int (ta_fill_mod)(void *ta_state, struct table_info *ti, * void *ta_buf, uint64_t *pflags); * OPTIONAL(need_modify), locked (UH). (M_NOWAIT). Returns 0 on success. * * Copy as much data as we can to minimize changes under WLOCK. * For example, array can be merged inside this callback. * * * * -modify: perform final modification. * typedef void (ta_modify)(void *ta_state, struct table_info *ti, * void *ta_buf, uint64_t pflags); * OPTIONAL(need_modify), locked (UH+WLOCK). (M_NOWAIT). * * Performs all changes necessary to switch to new structures. * * Caller should save old pointers to @ta_buf storage. * * * * -flush_mod: flush table modification state. * typedef void (ta_flush_mod)(void *ta_buf); * OPTIONAL(need_modify), unlocked. (M_WAITOK). 
* * Performs flush for the following: * - prepare_mod (modification was not necessary) * - modify (for the old state) * * * * -change_gi: monitor table info pointer changes * typedef void (ta_change_ti)(void *ta_state, struct table_info *ti); * OPTIONAL, locked (UH). (M_NOWAIT). * * Called on @ti pointer changed. Called immediately after -init * to set initial state. * * * * -foreach: calls @f for each table entry * typedef void ta_foreach(void *ta_state, struct table_info *ti, * ta_foreach_f *f, void *arg); * MANDATORY, locked(UH). (M_NOWAIT). * * Runs callback with specified argument for each table entry, * Typically used for dumping table entries. * * * * -dump_tentry: dump table entry in current @tentry format. * typedef int ta_dump_tentry(void *ta_state, struct table_info *ti, void *e, * ipfw_obj_tentry *tent); * MANDATORY, locked(UH). (M_NOWAIT). Returns 0 on success. * * Dumps entry @e to @tent. * * * -print_config: prints custom algorithm options into buffer. * typedef void (ta_print_config)(void *ta_state, struct table_info *ti, * char *buf, size_t bufsize); * OPTIONAL. locked(UH). (M_NOWAIT). * * Prints custom algorithm options in the format suitable to pass * back to -init callback. * * * * -dump_tinfo: dumps algo-specific info. * typedef void ta_dump_tinfo(void *ta_state, struct table_info *ti, * ipfw_ta_tinfo *tinfo); * OPTIONAL. locked(UH). (M_NOWAIT). * * Dumps options like items size/hash size, etc. 
*/ MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables"); /* * Utility structures/functions common to more than one algo */ struct mod_item { void *main_ptr; size_t size; void *main_ptr6; size_t size6; }; static int badd(const void *key, void *item, void *base, size_t nmemb, size_t size, int (*compar) (const void *, const void *)); static int bdel(const void *key, void *base, size_t nmemb, size_t size, int (*compar) (const void *, const void *)); /* * ADDR implementation using radix * */ /* * The radix code expects addr and mask to be array of bytes, * with the first byte being the length of the array. rn_inithead * is called with the offset in bits of the lookup key within the * array. If we use a sockaddr_in as the underlying type, * sin_len is conveniently located at offset 0, sin_addr is at * offset 4 and normally aligned. * But for portability, let's avoid assumption and make the code explicit */ #define KEY_LEN(v) *((uint8_t *)&(v)) /* * Do not require radix to compare more than actual IPv4/IPv6/MAC address */ #define KEY_LEN_INET (offsetof(struct sockaddr_in, sin_addr) + sizeof(in_addr_t)) #define KEY_LEN_INET6 (offsetof(struct sa_in6, sin6_addr) + sizeof(struct in6_addr)) #define KEY_LEN_MAC (offsetof(struct sa_mac, mac_addr) + ETHER_ADDR_LEN) #define OFF_LEN_INET (8 * offsetof(struct sockaddr_in, sin_addr)) #define OFF_LEN_INET6 (8 * offsetof(struct sa_in6, sin6_addr)) #define OFF_LEN_MAC (8 * offsetof(struct sa_mac, mac_addr)) struct addr_radix_entry { struct radix_node rn[2]; struct sockaddr_in addr; uint32_t value; uint8_t masklen; }; struct sa_in6 { uint8_t sin6_len; uint8_t sin6_family; uint8_t pad[2]; struct in6_addr sin6_addr; }; struct addr_radix_xentry { struct radix_node rn[2]; struct sa_in6 addr6; uint32_t value; uint8_t masklen; }; struct addr_radix_cfg { struct radix_node_head *head4; struct radix_node_head *head6; size_t count4; size_t count6; }; struct sa_mac { uint8_t mac_len; struct ether_addr mac_addr; }; struct ta_buf_radix { void 
*ent_ptr; struct sockaddr *addr_ptr; struct sockaddr *mask_ptr; union { struct { struct sockaddr_in sa; struct sockaddr_in ma; } a4; struct { struct sa_in6 sa; struct sa_in6 ma; } a6; struct { struct sa_mac sa; struct sa_mac ma; } mac; } addr; }; static int ta_lookup_addr_radix(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val); static int ta_init_addr_radix(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags); static int flush_radix_entry(struct radix_node *rn, void *arg); static void ta_destroy_addr_radix(void *ta_state, struct table_info *ti); static void ta_dump_addr_radix_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo); static int ta_dump_addr_radix_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent); static int ta_find_addr_radix_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent); static void ta_foreach_addr_radix(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg); static void tei_to_sockaddr_ent_addr(struct tentry_info *tei, struct sockaddr *sa, struct sockaddr *ma, int *set_mask); static int ta_prepare_add_addr_radix(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_add_addr_radix(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static int ta_prepare_del_addr_radix(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_del_addr_radix(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static void ta_flush_radix_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_need_modify_radix(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags); static int ta_lookup_addr_radix(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val) { struct radix_node_head *rnh; if (keylen == sizeof(in_addr_t)) { struct addr_radix_entry *ent; struct 
sockaddr_in sa; KEY_LEN(sa) = KEY_LEN_INET; sa.sin_addr.s_addr = *((in_addr_t *)key); rnh = (struct radix_node_head *)ti->state; ent = (struct addr_radix_entry *)(rnh->rnh_matchaddr(&sa, &rnh->rh)); if (ent != NULL) { *val = ent->value; return (1); } } else if (keylen == sizeof(struct in6_addr)) { struct addr_radix_xentry *xent; struct sa_in6 sa6; KEY_LEN(sa6) = KEY_LEN_INET6; memcpy(&sa6.sin6_addr, key, sizeof(struct in6_addr)); rnh = (struct radix_node_head *)ti->xstate; xent = (struct addr_radix_xentry *)(rnh->rnh_matchaddr(&sa6, &rnh->rh)); if (xent != NULL) { *val = xent->value; return (1); } } return (0); } /* * New table */ static int ta_init_addr_radix(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags) { struct addr_radix_cfg *cfg; if (!rn_inithead(&ti->state, OFF_LEN_INET)) return (ENOMEM); if (!rn_inithead(&ti->xstate, OFF_LEN_INET6)) { rn_detachhead(&ti->state); return (ENOMEM); } cfg = malloc(sizeof(struct addr_radix_cfg), M_IPFW, M_WAITOK | M_ZERO); *ta_state = cfg; ti->lookup = ta_lookup_addr_radix; return (0); } static int flush_radix_entry(struct radix_node *rn, void *arg) { struct radix_node_head * const rnh = arg; struct addr_radix_entry *ent; ent = (struct addr_radix_entry *) rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, &rnh->rh); if (ent != NULL) free(ent, M_IPFW_TBL); return (0); } static void ta_destroy_addr_radix(void *ta_state, struct table_info *ti) { struct addr_radix_cfg *cfg; struct radix_node_head *rnh; cfg = (struct addr_radix_cfg *)ta_state; rnh = (struct radix_node_head *)(ti->state); rnh->rnh_walktree(&rnh->rh, flush_radix_entry, rnh); rn_detachhead(&ti->state); rnh = (struct radix_node_head *)(ti->xstate); rnh->rnh_walktree(&rnh->rh, flush_radix_entry, rnh); rn_detachhead(&ti->xstate); free(cfg, M_IPFW); } /* * Provide algo-specific table info */ static void ta_dump_addr_radix_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) { struct addr_radix_cfg *cfg; cfg = (struct 
addr_radix_cfg *)ta_state;
    tinfo->flags = IPFW_TATFLAGS_AFDATA | IPFW_TATFLAGS_AFITEM;
    tinfo->taclass4 = IPFW_TACLASS_RADIX;
    tinfo->count4 = cfg->count4;
    tinfo->itemsize4 = sizeof(struct addr_radix_entry);
    tinfo->taclass6 = IPFW_TACLASS_RADIX;
    tinfo->count6 = cfg->count6;
    tinfo->itemsize6 = sizeof(struct addr_radix_xentry);
}

/*
 * Exports one radix node @e into the export record @tent.
 * The address family is taken from the stored sockaddr, so a single
 * callback serves both the IPv4 and the IPv6 tree.
 */
static int
ta_dump_addr_radix_tentry(void *ta_state, struct table_info *ti, void *e,
    ipfw_obj_tentry *tent)
{
    struct addr_radix_entry *n;
#ifdef INET6
    struct addr_radix_xentry *xn;
#endif

    n = (struct addr_radix_entry *)e;

    /* Guess IPv4/IPv6 radix by sockaddr family */
    if (n->addr.sin_family == AF_INET) {
        tent->k.addr.s_addr = n->addr.sin_addr.s_addr;
        tent->masklen = n->masklen;
        tent->subtype = AF_INET;
        tent->v.kidx = n->value;
#ifdef INET6
    } else {
        xn = (struct addr_radix_xentry *)e;
        memcpy(&tent->k.addr6, &xn->addr6.sin6_addr,
            sizeof(struct in6_addr));
        tent->masklen = xn->masklen;
        tent->subtype = AF_INET6;
        tent->v.kidx = xn->value;
#endif
    }

    return (0);
}

/*
 * Looks up the entry matching the address in @tent and, if found,
 * exports it back into @tent.  Returns 0 on success, ENOENT otherwise.
 */
static int
ta_find_addr_radix_tentry(void *ta_state, struct table_info *ti,
    ipfw_obj_tentry *tent)
{
    struct radix_node_head *rnh;
    void *e;

    e = NULL;
    if (tent->subtype == AF_INET) {
        struct sockaddr_in sa;
        KEY_LEN(sa) = KEY_LEN_INET;
        sa.sin_addr.s_addr = tent->k.addr.s_addr;
        rnh = (struct radix_node_head *)ti->state;
        e = rnh->rnh_matchaddr(&sa, &rnh->rh);
    } else if (tent->subtype == AF_INET6) {
        struct sa_in6 sa6;
        KEY_LEN(sa6) = KEY_LEN_INET6;
        memcpy(&sa6.sin6_addr, &tent->k.addr6, sizeof(struct in6_addr));
        rnh = (struct radix_node_head *)ti->xstate;
        e = rnh->rnh_matchaddr(&sa6, &rnh->rh);
    }

    if (e != NULL) {
        ta_dump_addr_radix_tentry(ta_state, ti, e, tent);
        return (0);
    }

    return (ENOENT);
}

/*
 * Walks both the IPv4 (ti->state) and IPv6 (ti->xstate) radix trees,
 * invoking @f on every node.
 */
static void
ta_foreach_addr_radix(void *ta_state, struct table_info *ti, ta_foreach_f *f,
    void *arg)
{
    struct radix_node_head *rnh;

    rnh = (struct radix_node_head *)(ti->state);
    rnh->rnh_walktree(&rnh->rh, (walktree_f_t *)f, arg);

    rnh = (struct radix_node_head *)(ti->xstate);
    rnh->rnh_walktree(&rnh->rh, (walktree_f_t *)f, arg);
}

#ifdef INET6
static inline void ipv6_writemask(struct in6_addr *addr6, uint8_t mask);

/*
 * Writes a /mask prefix netmask into @addr6 (network byte order).
 * NOTE(review): only the words covered by @mask are written; the
 * remaining words of @addr6 are left untouched — callers must start
 * from zeroed storage.  TODO confirm against all call sites.
 */
static inline void
ipv6_writemask(struct in6_addr *addr6, uint8_t mask)
{
    uint32_t *cp;

    for (cp = (uint32_t *)addr6; mask >= 32; mask -= 32)
        *cp++ = 0xFFFFFFFF;
    if (mask > 0)
        *cp = htonl(mask ? ~((1 << (32 - mask)) - 1) : 0);
}
#endif

/*
 * Converts @tei into a (masked) sockaddr address/mask pair suitable for
 * radix insertion.  *@set_mask is set to 1 unless the prefix is a full
 * host route (/32 or /128), in which case the mask is omitted.
 */
static void
tei_to_sockaddr_ent_addr(struct tentry_info *tei, struct sockaddr *sa,
    struct sockaddr *ma, int *set_mask)
{
    int mlen;
#ifdef INET
    struct sockaddr_in *addr, *mask;
#endif
#ifdef INET6
    struct sa_in6 *addr6, *mask6;
#endif
    in_addr_t a4;

    mlen = tei->masklen;

    if (tei->subtype == AF_INET) {
#ifdef INET
        addr = (struct sockaddr_in *)sa;
        mask = (struct sockaddr_in *)ma;
        /* Set 'total' structure length */
        KEY_LEN(*addr) = KEY_LEN_INET;
        KEY_LEN(*mask) = KEY_LEN_INET;
        addr->sin_family = AF_INET;
        mask->sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0);
        a4 = *((in_addr_t *)tei->paddr);
        addr->sin_addr.s_addr = a4 & mask->sin_addr.s_addr;
        if (mlen != 32)
            *set_mask = 1;
        else
            *set_mask = 0;
#endif
#ifdef INET6
    } else if (tei->subtype == AF_INET6) {
        /* IPv6 case */
        addr6 = (struct sa_in6 *)sa;
        mask6 = (struct sa_in6 *)ma;
        /* Set 'total' structure length */
        KEY_LEN(*addr6) = KEY_LEN_INET6;
        KEY_LEN(*mask6) = KEY_LEN_INET6;
        addr6->sin6_family = AF_INET6;
        ipv6_writemask(&mask6->sin6_addr, mlen);
        memcpy(&addr6->sin6_addr, tei->paddr, sizeof(struct in6_addr));
        APPLY_MASK(&addr6->sin6_addr, &mask6->sin6_addr);
        if (mlen != 128)
            *set_mask = 1;
        else
            *set_mask = 0;
#endif
    }
}

/*
 * Allocates a new radix entry for @tei and prepares the address/mask
 * pointers inside @ta_buf for the subsequent locked add.
 */
static int
ta_prepare_add_addr_radix(struct ip_fw_chain *ch, struct tentry_info *tei,
    void *ta_buf)
{
    struct ta_buf_radix *tb;
    struct addr_radix_entry *ent;
#ifdef INET6
    struct addr_radix_xentry *xent;
#endif
    struct sockaddr *addr, *mask;
    int mlen, set_mask;

    tb = (struct ta_buf_radix *)ta_buf;

    mlen = tei->masklen;
    set_mask = 0;

    if (tei->subtype == AF_INET) {
#ifdef INET
        if (mlen > 32)
            return (EINVAL);
        ent = malloc(sizeof(*ent), M_IPFW_TBL, M_WAITOK | M_ZERO);
        ent->masklen = mlen;

        addr = (struct sockaddr *)&ent->addr;
        mask = (struct sockaddr *)&tb->addr.a4.ma;
        tb->ent_ptr = ent;
#endif
#ifdef INET6
    } else if (tei->subtype == AF_INET6) {
        /* IPv6 case */
        if (mlen > 128)
            return (EINVAL);
        xent = malloc(sizeof(*xent), M_IPFW_TBL, M_WAITOK | M_ZERO);
        xent->masklen = mlen;

        addr = (struct sockaddr *)&xent->addr6;
        mask = (struct sockaddr *)&tb->addr.a6.ma;
        tb->ent_ptr = xent;
#endif
    } else {
        /* Unknown CIDR type */
        return (EINVAL);
    }

    tei_to_sockaddr_ent_addr(tei, addr, mask, &set_mask);
    /* Set pointers */
    tb->addr_ptr = addr;
    if (set_mask != 0)
        tb->mask_ptr = mask;

    return (0);
}

/*
 * Inserts the pre-allocated entry from @ta_buf into the proper tree.
 * On key collision either fails with EEXIST or, with TEI_FLAGS_UPDATE,
 * swaps the stored value with tei->value and reports TEI_FLAGS_UPDATED.
 * *@pnum is the number of entries actually added (0 on update).
 */
static int
ta_add_addr_radix(void *ta_state, struct table_info *ti, struct tentry_info *tei,
    void *ta_buf, uint32_t *pnum)
{
    struct addr_radix_cfg *cfg;
    struct radix_node_head *rnh;
    struct radix_node *rn;
    struct ta_buf_radix *tb;
    uint32_t *old_value, value;

    cfg = (struct addr_radix_cfg *)ta_state;
    tb = (struct ta_buf_radix *)ta_buf;

    /* Save current entry value from @tei */
    if (tei->subtype == AF_INET) {
        rnh = ti->state;
        ((struct addr_radix_entry *)tb->ent_ptr)->value = tei->value;
    } else {
        rnh = ti->xstate;
        ((struct addr_radix_xentry *)tb->ent_ptr)->value = tei->value;
    }

    /* Search for an entry first */
    rn = rnh->rnh_lookup(tb->addr_ptr, tb->mask_ptr, &rnh->rh);
    if (rn != NULL) {
        if ((tei->flags & TEI_FLAGS_UPDATE) == 0)
            return (EEXIST);
        /* Record already exists. Update value if we're asked to */
        if (tei->subtype == AF_INET)
            old_value = &((struct addr_radix_entry *)rn)->value;
        else
            old_value = &((struct addr_radix_xentry *)rn)->value;

        value = *old_value;
        *old_value = tei->value;
        tei->value = value;

        /* Indicate that update has happened instead of addition */
        tei->flags |= TEI_FLAGS_UPDATED;
        *pnum = 0;

        return (0);
    }

    if ((tei->flags & TEI_FLAGS_DONTADD) != 0)
        return (EFBIG);

    rn = rnh->rnh_addaddr(tb->addr_ptr, tb->mask_ptr, &rnh->rh, tb->ent_ptr);
    if (rn == NULL) {
        /* Unknown error */
        return (EINVAL);
    }

    if (tei->subtype == AF_INET)
        cfg->count4++;
    else
        cfg->count6++;
    /* Ownership of the entry passed to the tree */
    tb->ent_ptr = NULL;
    *pnum = 1;

    return (0);
}

/*
 * Prepares the address/mask sockaddrs inside @ta_buf for a delete;
 * no allocation is needed since deletion returns the node to free.
 */
static int
ta_prepare_del_addr_radix(struct ip_fw_chain *ch, struct tentry_info *tei,
    void *ta_buf)
{
    struct ta_buf_radix *tb;
    struct sockaddr *addr, *mask;
    int mlen, set_mask;

    tb = (struct ta_buf_radix *)ta_buf;

    mlen = tei->masklen;
    set_mask = 0;

    if (tei->subtype == AF_INET) {
        if (mlen > 32)
            return (EINVAL);

        addr = (struct sockaddr *)&tb->addr.a4.sa;
        mask = (struct sockaddr *)&tb->addr.a4.ma;
#ifdef INET6
    } else if (tei->subtype == AF_INET6) {
        if (mlen > 128)
            return (EINVAL);

        addr = (struct sockaddr *)&tb->addr.a6.sa;
        mask = (struct sockaddr *)&tb->addr.a6.ma;
#endif
    } else
        return (EINVAL);

    tei_to_sockaddr_ent_addr(tei, addr, mask, &set_mask);
    tb->addr_ptr = addr;
    if (set_mask != 0)
        tb->mask_ptr = mask;

    return (0);
}

/*
 * Removes the entry described by @ta_buf from the proper tree.
 * The detached node is stashed in tb->ent_ptr for the flush callback
 * to free outside the lock.  Returns ENOENT when no such key exists.
 */
static int
ta_del_addr_radix(void *ta_state, struct table_info *ti, struct tentry_info *tei,
    void *ta_buf, uint32_t *pnum)
{
    struct addr_radix_cfg *cfg;
    struct radix_node_head *rnh;
    struct radix_node *rn;
    struct ta_buf_radix *tb;

    cfg = (struct addr_radix_cfg *)ta_state;
    tb = (struct ta_buf_radix *)ta_buf;

    if (tei->subtype == AF_INET)
        rnh = ti->state;
    else
        rnh = ti->xstate;

    rn = rnh->rnh_deladdr(tb->addr_ptr, tb->mask_ptr, &rnh->rh);

    if (rn == NULL)
        return (ENOENT);

    /* Save entry value to @tei */
    if (tei->subtype == AF_INET)
        tei->value = ((struct addr_radix_entry *)rn)->value;
    else
        tei->value = ((struct addr_radix_xentry *)rn)->value;

    tb->ent_ptr = rn;

    if (tei->subtype == AF_INET)
        cfg->count4--;
    else
        cfg->count6--;
    *pnum = 1;

    return (0);
}

/*
 * Frees the entry left over in @ta_buf after a failed add or a
 * successful delete.
 */
static void
ta_flush_radix_entry(struct ip_fw_chain *ch, struct tentry_info *tei,
    void *ta_buf)
{
    struct ta_buf_radix *tb;

    tb = (struct ta_buf_radix *)ta_buf;

    if (tb->ent_ptr != NULL)
        free(tb->ent_ptr, M_IPFW_TBL);
}

static int
ta_need_modify_radix(void *ta_state, struct table_info *ti, uint32_t count,
    uint64_t *pflags)
{

    /*
     * radix does not require additional memory allocations
     * other than nodes itself.  Adding new masks to the tree may,
     * but we don't have any API to call (and we don't know which
     * sizes we would need).
     */
    return (0);
}

struct table_algo addr_radix = {
    .name = "addr:radix",
    .type = IPFW_TABLE_ADDR,
    .flags = TA_FLAG_DEFAULT,
    .ta_buf_size = sizeof(struct ta_buf_radix),
    .init = ta_init_addr_radix,
    .destroy = ta_destroy_addr_radix,
    .prepare_add = ta_prepare_add_addr_radix,
    .prepare_del = ta_prepare_del_addr_radix,
    .add = ta_add_addr_radix,
    .del = ta_del_addr_radix,
    .flush_entry = ta_flush_radix_entry,
    .foreach = ta_foreach_addr_radix,
    .dump_tentry = ta_dump_addr_radix_tentry,
    .find_tentry = ta_find_addr_radix_tentry,
    .dump_tinfo = ta_dump_addr_radix_tinfo,
    .need_modify = ta_need_modify_radix,
};

/*
 * addr:hash cmds
 *
 *
 * ti->data:
 * [inv.mask4][inv.mask6][log2hsize4][log2hsize6]
 * [        8][        8][         8][         8]
 *
 * inv.mask4: 32 - mask
 * inv.mask6:
 * 1) _slow lookup: mask
 * 2) _aligned: (128 - mask) / 8
 * 3) _64: 8
 *
 *
 * pflags:
 * [v4=1/v6=0][hsize]
 * [       32][   32]
 */

struct chashentry;

SLIST_HEAD(chashbhead, chashentry);

struct chash_cfg {
    struct chashbhead *head4;
    struct chashbhead *head6;
    size_t size4;    /* Number of IPv4 buckets */
    size_t size6;    /* Number of IPv6 buckets */
    size_t items4;   /* Number of IPv4 entries */
    size_t items6;   /* Number of IPv6 entries */
    uint8_t mask4;   /* Fixed IPv4 prefix length */
    uint8_t mask6;   /* Fixed IPv6 prefix length */
};

struct chashentry {
    SLIST_ENTRY(chashentry) next;
    uint32_t value;
    uint32_t type;
    union {
        uint32_t a4;        /* Host format */
        struct in6_addr a6; /* Network format */
    } a;
};

struct ta_buf_chash {
    void *ent_ptr;
    struct chashentry ent;
};

#ifdef INET
static __inline
uint32_t hash_ip(uint32_t addr, int hsize);
#endif
#ifdef INET6
static __inline uint32_t hash_ip6(struct in6_addr *addr6, int hsize);
static __inline uint16_t hash_ip64(struct in6_addr *addr6, int hsize);
static __inline uint32_t hash_ip6_slow(struct in6_addr *addr6, void *key,
    int mask, int hsize);
static __inline uint32_t hash_ip6_al(struct in6_addr *addr6, void *key,
    int mask, int hsize);
#endif
static int ta_lookup_chash_slow(struct table_info *ti, void *key,
    uint32_t keylen, uint32_t *val);
static int ta_lookup_chash_aligned(struct table_info *ti, void *key,
    uint32_t keylen, uint32_t *val);
static int ta_lookup_chash_64(struct table_info *ti, void *key,
    uint32_t keylen, uint32_t *val);
static int chash_parse_opts(struct chash_cfg *cfg, char *data);
static void ta_print_chash_config(void *ta_state, struct table_info *ti,
    char *buf, size_t bufsize);
static int ta_log2(uint32_t v);
static int ta_init_chash(struct ip_fw_chain *ch, void **ta_state,
    struct table_info *ti, char *data, uint8_t tflags);
static void ta_destroy_chash(void *ta_state, struct table_info *ti);
static void ta_dump_chash_tinfo(void *ta_state, struct table_info *ti,
    ipfw_ta_tinfo *tinfo);
static int ta_dump_chash_tentry(void *ta_state, struct table_info *ti,
    void *e, ipfw_obj_tentry *tent);
static uint32_t hash_ent(struct chashentry *ent, int af, int mlen,
    uint32_t size);
static int tei_to_chash_ent(struct tentry_info *tei, struct chashentry *ent);
static int ta_find_chash_tentry(void *ta_state, struct table_info *ti,
    ipfw_obj_tentry *tent);
static void ta_foreach_chash(void *ta_state, struct table_info *ti,
    ta_foreach_f *f, void *arg);
static int ta_prepare_add_chash(struct ip_fw_chain *ch,
    struct tentry_info *tei, void *ta_buf);
static int ta_add_chash(void *ta_state, struct table_info *ti,
    struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
static int ta_prepare_del_chash(struct ip_fw_chain *ch,
    struct tentry_info *tei, void *ta_buf);
static int ta_del_chash(void *ta_state, struct table_info *ti,
    struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
static void ta_flush_chash_entry(struct ip_fw_chain *ch,
    struct tentry_info *tei, void *ta_buf);
static int ta_need_modify_chash(void *ta_state, struct table_info *ti,
    uint32_t count, uint64_t *pflags);
static int ta_prepare_mod_chash(void *ta_buf, uint64_t *pflags);
static int ta_fill_mod_chash(void *ta_state, struct table_info *ti,
    void *ta_buf, uint64_t *pflags);
static void ta_modify_chash(void *ta_state, struct table_info *ti,
    void *ta_buf, uint64_t pflags);
static void ta_flush_mod_chash(void *ta_buf);

#ifdef INET
/* Bucket index for a host-order, mask-shifted IPv4 address. */
static __inline uint32_t
hash_ip(uint32_t addr, int hsize)
{

    return (addr % (hsize - 1));
}
#endif

#ifdef INET6
/* Bucket index for a full (masked) IPv6 address: XOR of all 4 words. */
static __inline uint32_t
hash_ip6(struct in6_addr *addr6, int hsize)
{
    uint32_t i;

    i = addr6->s6_addr32[0] ^ addr6->s6_addr32[1] ^
        addr6->s6_addr32[2] ^ addr6->s6_addr32[3];

    return (i % (hsize - 1));
}

/* Bucket index for a /64 prefix: only the upper 64 bits are hashed. */
static __inline uint16_t
hash_ip64(struct in6_addr *addr6, int hsize)
{
    uint32_t i;

    i = addr6->s6_addr32[0] ^ addr6->s6_addr32[1];

    return (i % (hsize - 1));
}

/*
 * Masks @key by a non-byte-aligned /mask into @addr6, then hashes.
 * NOTE(review): ipv6_writemask() does not clear the words of @mask6
 * beyond the prefix, so the trailing words are uninitialized here —
 * verify against the upstream implementation.
 */
static __inline uint32_t
hash_ip6_slow(struct in6_addr *addr6, void *key, int mask, int hsize)
{
    struct in6_addr mask6;

    ipv6_writemask(&mask6, mask);
    memcpy(addr6, key, sizeof(struct in6_addr));
    APPLY_MASK(addr6, &mask6);

    return (hash_ip6(addr6, hsize));
}

/*
 * Masks @key by a byte-aligned prefix (@mask is a byte count) into a
 * zeroed @addr6, then hashes.
 */
static __inline uint32_t
hash_ip6_al(struct in6_addr *addr6, void *key, int mask, int hsize)
{
    uint64_t *paddr;

    paddr = (uint64_t *)addr6;
    *paddr = 0;
    *(paddr + 1) = 0;
    memcpy(addr6, key, mask);

    return (hash_ip6(addr6, hsize));
}
#endif

/*
 * Runtime lookup, slow path: IPv6 mask is not byte-aligned.
 * ti->data layout is described in the "addr:hash cmds" comment above.
 * Returns 1 and fills *@val on hit, 0 otherwise.
 */
static int
ta_lookup_chash_slow(struct table_info *ti, void *key, uint32_t keylen,
    uint32_t *val)
{
    struct chashbhead *head;
    struct chashentry *ent;
    uint16_t hash, hsize;
    uint8_t imask;

    if (keylen == sizeof(in_addr_t)) {
#ifdef INET
        head = (struct chashbhead *)ti->state;
        imask = ti->data >> 24;
        hsize = 1 << ((ti->data & 0xFFFF) >> 8);
        uint32_t a;
        a = ntohl(*((in_addr_t *)key));
        a = a >> imask;
        hash = hash_ip(a, hsize);
        SLIST_FOREACH(ent, &head[hash], next) {
            if (ent->a.a4 == a) {
                *val = ent->value;
                return (1);
            }
        }
#endif
    } else {
#ifdef INET6
        /* IPv6: worst scenario: non-round mask */
        struct in6_addr addr6;
        head = (struct chashbhead *)ti->xstate;
        imask = (ti->data & 0xFF0000) >> 16;
        hsize = 1 << (ti->data & 0xFF);
        hash = hash_ip6_slow(&addr6, key, imask, hsize);
        SLIST_FOREACH(ent, &head[hash], next) {
            if (memcmp(&ent->a.a6, &addr6, 16) == 0) {
                *val = ent->value;
                return (1);
            }
        }
#endif
    }

    return (0);
}

/*
 * Runtime lookup: IPv6 mask is byte-aligned, so masking is a memcpy of
 * (128 - mask) / 8 bytes and comparison is two 64-bit word compares.
 */
static int
ta_lookup_chash_aligned(struct table_info *ti, void *key, uint32_t keylen,
    uint32_t *val)
{
    struct chashbhead *head;
    struct chashentry *ent;
    uint16_t hash, hsize;
    uint8_t imask;

    if (keylen == sizeof(in_addr_t)) {
#ifdef INET
        head = (struct chashbhead *)ti->state;
        imask = ti->data >> 24;
        hsize = 1 << ((ti->data & 0xFFFF) >> 8);
        uint32_t a;
        a = ntohl(*((in_addr_t *)key));
        a = a >> imask;
        hash = hash_ip(a, hsize);
        SLIST_FOREACH(ent, &head[hash], next) {
            if (ent->a.a4 == a) {
                *val = ent->value;
                return (1);
            }
        }
#endif
    } else {
#ifdef INET6
        /* IPv6: aligned to 8bit mask */
        struct in6_addr addr6;
        uint64_t *paddr, *ptmp;
        head = (struct chashbhead *)ti->xstate;
        imask = (ti->data & 0xFF0000) >> 16;
        hsize = 1 << (ti->data & 0xFF);

        hash = hash_ip6_al(&addr6, key, imask, hsize);
        paddr = (uint64_t *)&addr6;
        SLIST_FOREACH(ent, &head[hash], next) {
            ptmp = (uint64_t *)&ent->a.a6;
            if (paddr[0] == ptmp[0] && paddr[1] == ptmp[1]) {
                *val = ent->value;
                return (1);
            }
        }
#endif
    }

    return (0);
}

/*
 * Runtime lookup, fastest IPv6 path: /64 prefixes, a single 64-bit
 * load and compare per bucket entry.
 */
static int
ta_lookup_chash_64(struct table_info *ti, void *key, uint32_t keylen,
    uint32_t *val)
{
    struct chashbhead *head;
    struct chashentry *ent;
    uint16_t hash, hsize;
    uint8_t imask;

    if (keylen == sizeof(in_addr_t)) {
#ifdef INET
        head = (struct chashbhead *)ti->state;
        imask = ti->data >> 24;
        hsize = 1 << ((ti->data & 0xFFFF) >> 8);
        uint32_t a;
        a = ntohl(*((in_addr_t *)key));
        a = a >> imask;
        hash = hash_ip(a, hsize);
        SLIST_FOREACH(ent, &head[hash], next) {
            if (ent->a.a4 == a) {
                *val = ent->value;
                return (1);
            }
        }
#endif
    } else {
#ifdef INET6
        /* IPv6: /64 */
        uint64_t a6, *paddr;
        head = (struct chashbhead *)ti->xstate;
        paddr = (uint64_t *)key;
        hsize = 1 << (ti->data & 0xFF);
        a6 = *paddr;
        hash = hash_ip64((struct in6_addr *)key, hsize);
        SLIST_FOREACH(ent, &head[hash], next) {
            paddr = (uint64_t *)&ent->a.a6;
            if (a6 == *paddr) {
                *val = ent->value;
                return (1);
            }
        }
#endif
    }

    return (0);
}

/*
 * Parses an optional "masks=/XX[,/YY]" clause from @data into
 * cfg->mask4 / cfg->mask6.  Absent or empty options leave the
 * defaults untouched; malformed input yields EINVAL.
 */
static int
chash_parse_opts(struct chash_cfg *cfg, char *data)
{
    char *pdel, *pend, *s;
    int mask4, mask6;

    mask4 = cfg->mask4;
    mask6 = cfg->mask6;

    if (data == NULL)
        return (0);
    if ((pdel = strchr(data, ' ')) == NULL)
        return (0);
    while (*pdel == ' ')
        pdel++;
    if (strncmp(pdel, "masks=", 6) != 0)
        return (EINVAL);
    if ((s = strchr(pdel, ' ')) != NULL)
        *s++ = '\0';

    pdel += 6;
    /* Need /XX[,/YY] */
    if (*pdel++ != '/')
        return (EINVAL);

    mask4 = strtol(pdel, &pend, 10);
    if (*pend == ',') {
        /* ,/YY */
        pdel = pend + 1;
        if (*pdel++ != '/')
            return (EINVAL);
        mask6 = strtol(pdel, &pend, 10);
        if (*pend != '\0')
            return (EINVAL);
    } else if (*pend != '\0')
        return (EINVAL);

    if (mask4 < 0 || mask4 > 32 || mask6 < 0 || mask6 > 128)
        return (EINVAL);

    cfg->mask4 = mask4;
    cfg->mask6 = mask6;

    return (0);
}

/*
 * Prints the algorithm config line; masks are emitted only when they
 * differ from the /32,/128 defaults.
 */
static void
ta_print_chash_config(void *ta_state, struct table_info *ti, char *buf,
    size_t bufsize)
{
    struct chash_cfg *cfg;

    cfg = (struct chash_cfg *)ta_state;

    if (cfg->mask4 != 32 || cfg->mask6 != 128)
        snprintf(buf, bufsize, "%s masks=/%d,/%d", "addr:hash",
            cfg->mask4, cfg->mask6);
    else
        snprintf(buf, bufsize, "%s", "addr:hash");
}

/* Integer floor(log2(v)); returns 0 for v == 0. */
static int
ta_log2(uint32_t v)
{
    uint32_t r;

    r = 0;
    while (v >>= 1)
        r++;

    return (r);
}

/*
 * New table.
 * We assume 'data' to be either NULL or the following format:
 * 'addr:hash [masks=/32[,/128]]'
 */
static int
ta_init_chash(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti,
    char *data, uint8_t tflags)
{
    int error, i;
    uint32_t hsize;
    struct chash_cfg *cfg;

    cfg = malloc(sizeof(struct chash_cfg), M_IPFW, M_WAITOK | M_ZERO);

    cfg->mask4 = 32;
    cfg->mask6 = 128;

    if ((error = chash_parse_opts(cfg, data)) != 0) {
        free(cfg, M_IPFW);
        return (error);
    }

    cfg->size4 = 128;
    cfg->size6 = 128;

    cfg->head4 = malloc(sizeof(struct chashbhead) * cfg->size4, M_IPFW,
        M_WAITOK | M_ZERO);
    cfg->head6 = malloc(sizeof(struct chashbhead) * cfg->size6, M_IPFW,
        M_WAITOK | M_ZERO);
    for (i = 0; i < cfg->size4; i++)
        SLIST_INIT(&cfg->head4[i]);
    for (i = 0; i < cfg->size6; i++)
        SLIST_INIT(&cfg->head6[i]);

    *ta_state = cfg;
    ti->state = cfg->head4;
    ti->xstate = cfg->head6;

    /* Store data depending on v6 mask length */
    hsize = ta_log2(cfg->size4) << 8 | ta_log2(cfg->size6);
    if (cfg->mask6 == 64) {
        ti->data = (32 - cfg->mask4) << 24 | (128 - cfg->mask6) << 16 |
            hsize;
        ti->lookup = ta_lookup_chash_64;
    } else if ((cfg->mask6 % 8) == 0) {
        /* mask6 << 13 == (mask6 / 8) << 16 for byte-aligned masks */
        ti->data = (32 - cfg->mask4) << 24 | cfg->mask6 << 13 | hsize;
        ti->lookup = ta_lookup_chash_aligned;
    } else {
        /* don't do that! */
        ti->data = (32 - cfg->mask4) << 24 | cfg->mask6 << 16 | hsize;
        ti->lookup = ta_lookup_chash_slow;
    }

    return (0);
}

/*
 * Frees every entry in both hash tables, then the bucket arrays and
 * the config itself.  Called with the table already unlinked.
 */
static void
ta_destroy_chash(void *ta_state, struct table_info *ti)
{
    struct chash_cfg *cfg;
    struct chashentry *ent, *ent_next;
    int i;

    cfg = (struct chash_cfg *)ta_state;

    for (i = 0; i < cfg->size4; i++)
        SLIST_FOREACH_SAFE(ent, &cfg->head4[i], next, ent_next)
            free(ent, M_IPFW_TBL);

    for (i = 0; i < cfg->size6; i++)
        SLIST_FOREACH_SAFE(ent, &cfg->head6[i], next, ent_next)
            free(ent, M_IPFW_TBL);

    free(cfg->head4, M_IPFW);
    free(cfg->head6, M_IPFW);

    free(cfg, M_IPFW);
}

/* Exports per-family bucket counts and item sizes for `table info`. */
static void
ta_dump_chash_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo)
{
    struct chash_cfg *cfg;

    cfg = (struct chash_cfg *)ta_state;

    tinfo->flags = IPFW_TATFLAGS_AFDATA | IPFW_TATFLAGS_AFITEM;
    tinfo->taclass4 = IPFW_TACLASS_HASH;
    tinfo->size4 = cfg->size4;
    tinfo->count4 = cfg->items4;
    tinfo->itemsize4 = sizeof(struct chashentry);
    tinfo->taclass6 = IPFW_TACLASS_HASH;
    tinfo->size6 = cfg->size6;
    tinfo->count6 = cfg->items6;
    tinfo->itemsize6 = sizeof(struct chashentry);
}

/*
 * Exports one hash entry @e to @tent.  The stored IPv4 address is kept
 * shifted right by (32 - mask4), so it is shifted back before export.
 */
static int
ta_dump_chash_tentry(void *ta_state, struct table_info *ti, void *e,
    ipfw_obj_tentry *tent)
{
    struct chash_cfg *cfg;
    struct chashentry *ent;

    cfg = (struct chash_cfg *)ta_state;
    ent = (struct chashentry *)e;

    if (ent->type == AF_INET) {
        tent->k.addr.s_addr = htonl(ent->a.a4 << (32 - cfg->mask4));
        tent->masklen = cfg->mask4;
        tent->subtype = AF_INET;
        tent->v.kidx = ent->value;
#ifdef INET6
    } else {
        memcpy(&tent->k.addr6, &ent->a.a6, sizeof(struct in6_addr));
        tent->masklen = cfg->mask6;
        tent->subtype = AF_INET6;
        tent->v.kidx = ent->value;
#endif
    }

    return (0);
}

/*
 * Computes the bucket index of @ent for a table of @size buckets;
 * selects the per-family hash (and /64 shortcut) as appropriate.
 */
static uint32_t
hash_ent(struct chashentry *ent, int af, int mlen, uint32_t size)
{
    uint32_t hash;

    hash = 0;

    if (af == AF_INET) {
#ifdef INET
        hash = hash_ip(ent->a.a4, size);
#endif
    } else {
#ifdef INET6
        if (mlen == 64)
            hash = hash_ip64(&ent->a.a6, size);
        else
            hash = hash_ip6(&ent->a.a6, size);
#endif
    }

    return (hash);
}

/*
 * Fills @ent from @tei: sets the family tag and stores the address
 * masked (IPv6) or shifted into host order (IPv4).
 */
static int
tei_to_chash_ent(struct tentry_info *tei, struct chashentry *ent)
{
    int mlen;
#ifdef INET6
    struct in6_addr mask6;
#endif

    mlen = tei->masklen;

    if (tei->subtype == AF_INET) {
#ifdef INET
        if (mlen > 32)
            return (EINVAL);
        ent->type = AF_INET;

        /* Calculate masked address */
        ent->a.a4 = ntohl(*((in_addr_t *)tei->paddr)) >> (32 - mlen);
#endif
#ifdef INET6
    } else if (tei->subtype == AF_INET6) {
        /* IPv6 case */
        if (mlen > 128)
            return (EINVAL);
        ent->type = AF_INET6;

        ipv6_writemask(&mask6, mlen);
        memcpy(&ent->a.a6, tei->paddr, sizeof(struct in6_addr));
        APPLY_MASK(&ent->a.a6, &mask6);
#endif
    } else {
        /* Unknown CIDR type */
        return (EINVAL);
    }

    return (0);
}

/*
 * Looks up the entry matching @tent in the proper hash and exports it
 * back into @tent.  Returns 0 on success, ENOENT otherwise.
 */
static int
ta_find_chash_tentry(void *ta_state, struct table_info *ti,
    ipfw_obj_tentry *tent)
{
    struct chash_cfg *cfg;
    struct chashbhead *head;
    struct chashentry ent, *tmp;
    struct tentry_info tei;
    int error;
    uint32_t hash;

    cfg = (struct chash_cfg *)ta_state;

    memset(&ent, 0, sizeof(ent));
    memset(&tei, 0, sizeof(tei));

    if (tent->subtype == AF_INET) {
        tei.paddr = &tent->k.addr;
        tei.masklen = cfg->mask4;
        tei.subtype = AF_INET;

        if ((error = tei_to_chash_ent(&tei, &ent)) != 0)
            return (error);

        head = cfg->head4;
        hash = hash_ent(&ent, AF_INET, cfg->mask4, cfg->size4);
        /* Check for existence */
        SLIST_FOREACH(tmp, &head[hash], next) {
            if (tmp->a.a4 != ent.a.a4)
                continue;
            ta_dump_chash_tentry(ta_state, ti, tmp, tent);
            return (0);
        }
    } else {
        tei.paddr = &tent->k.addr6;
        tei.masklen = cfg->mask6;
        tei.subtype = AF_INET6;

        if ((error = tei_to_chash_ent(&tei, &ent)) != 0)
            return (error);

        head = cfg->head6;
        hash = hash_ent(&ent, AF_INET6, cfg->mask6, cfg->size6);
        /* Check for existence */
        SLIST_FOREACH(tmp, &head[hash], next) {
            if (memcmp(&tmp->a.a6, &ent.a.a6, 16) != 0)
                continue;
            ta_dump_chash_tentry(ta_state, ti, tmp, tent);
            return (0);
        }
    }

    return (ENOENT);
}

/* Invokes @f on every entry of both hash tables. */
static void
ta_foreach_chash(void *ta_state, struct table_info *ti, ta_foreach_f *f,
    void *arg)
{
    struct chash_cfg *cfg;
    struct chashentry *ent, *ent_next;
    int i;

    cfg = (struct chash_cfg *)ta_state;

    for (i = 0; i < cfg->size4; i++)
        SLIST_FOREACH_SAFE(ent, &cfg->head4[i], next, ent_next)
            f(ent, arg);

    for (i = 0; i < cfg->size6; i++)
        SLIST_FOREACH_SAFE(ent, &cfg->head6[i], next, ent_next)
            f(ent, arg);
}

/*
 * Allocates and pre-fills a new hash entry for @tei outside the lock.
 */
static int
ta_prepare_add_chash(struct ip_fw_chain *ch, struct tentry_info *tei,
    void *ta_buf)
{
    struct ta_buf_chash *tb;
    struct chashentry *ent;
    int error;

    tb = (struct ta_buf_chash *)ta_buf;

    ent = malloc(sizeof(*ent), M_IPFW_TBL, M_WAITOK | M_ZERO);

    error = tei_to_chash_ent(tei, ent);
    if (error != 0) {
        free(ent, M_IPFW_TBL);
        return (error);
    }
    tb->ent_ptr = ent;

    return (0);
}

/*
 * Links the pre-allocated entry into the proper bucket.  On collision
 * either fails with EEXIST or, with TEI_FLAGS_UPDATE, swaps the stored
 * value with tei->value and reports TEI_FLAGS_UPDATED.  The entry's
 * mask length must match the table-wide fixed mask.
 */
static int
ta_add_chash(void *ta_state, struct table_info *ti, struct tentry_info *tei,
    void *ta_buf, uint32_t *pnum)
{
    struct chash_cfg *cfg;
    struct chashbhead *head;
    struct chashentry *ent, *tmp;
    struct ta_buf_chash *tb;
    int exists;
    uint32_t hash, value;

    cfg = (struct chash_cfg *)ta_state;
    tb = (struct ta_buf_chash *)ta_buf;
    ent = (struct chashentry *)tb->ent_ptr;
    hash = 0;
    exists = 0;

    /* Read current value from @tei */
    ent->value = tei->value;

    /* Read current value */
    if (tei->subtype == AF_INET) {
        if (tei->masklen != cfg->mask4)
            return (EINVAL);
        head = cfg->head4;
        hash = hash_ent(ent, AF_INET, cfg->mask4, cfg->size4);

        /* Check for existence */
        SLIST_FOREACH(tmp, &head[hash], next) {
            if (tmp->a.a4 == ent->a.a4) {
                exists = 1;
                break;
            }
        }
    } else {
        if (tei->masklen != cfg->mask6)
            return (EINVAL);
        head = cfg->head6;
        hash = hash_ent(ent, AF_INET6, cfg->mask6, cfg->size6);
        /* Check for existence */
        SLIST_FOREACH(tmp, &head[hash], next) {
            if (memcmp(&tmp->a.a6, &ent->a.a6, 16) == 0) {
                exists = 1;
                break;
            }
        }
    }

    if (exists == 1) {
        if ((tei->flags & TEI_FLAGS_UPDATE) == 0)
            return (EEXIST);
        /* Record already exists. Update value if we're asked to */
        value = tmp->value;
        tmp->value = tei->value;
        tei->value = value;

        /* Indicate that update has happened instead of addition */
        tei->flags |= TEI_FLAGS_UPDATED;
        *pnum = 0;
    } else {
        if ((tei->flags & TEI_FLAGS_DONTADD) != 0)
            return (EFBIG);

        SLIST_INSERT_HEAD(&head[hash], ent, next);
        tb->ent_ptr = NULL;
        *pnum = 1;

        /* Update counters */
        if (tei->subtype == AF_INET)
            cfg->items4++;
        else
            cfg->items6++;
    }

    return (0);
}

/*
 * Converts @tei into the stack-local entry used as a delete key;
 * no allocation needed.
 */
static int
ta_prepare_del_chash(struct ip_fw_chain *ch, struct tentry_info *tei,
    void *ta_buf)
{
    struct ta_buf_chash *tb;

    tb = (struct ta_buf_chash *)ta_buf;

    return (tei_to_chash_ent(tei, &tb->ent));
}

/*
 * Unlinks the matching entry and stashes it in tb->ent_ptr for the
 * flush callback to free outside the lock.  ENOENT when absent.
 */
static int
ta_del_chash(void *ta_state, struct table_info *ti, struct tentry_info *tei,
    void *ta_buf, uint32_t *pnum)
{
    struct chash_cfg *cfg;
    struct chashbhead *head;
    struct chashentry *tmp, *tmp_next, *ent;
    struct ta_buf_chash *tb;
    uint32_t hash;

    cfg = (struct chash_cfg *)ta_state;
    tb = (struct ta_buf_chash *)ta_buf;
    ent = &tb->ent;

    if (tei->subtype == AF_INET) {
        if (tei->masklen != cfg->mask4)
            return (EINVAL);
        head = cfg->head4;
        hash = hash_ent(ent, AF_INET, cfg->mask4, cfg->size4);

        SLIST_FOREACH_SAFE(tmp, &head[hash], next, tmp_next) {
            if (tmp->a.a4 != ent->a.a4)
                continue;

            SLIST_REMOVE(&head[hash], tmp, chashentry, next);
            cfg->items4--;
            tb->ent_ptr = tmp;
            tei->value = tmp->value;
            *pnum = 1;
            return (0);
        }
    } else {
        if (tei->masklen != cfg->mask6)
            return (EINVAL);
        head = cfg->head6;
        hash = hash_ent(ent, AF_INET6, cfg->mask6, cfg->size6);
        SLIST_FOREACH_SAFE(tmp, &head[hash], next, tmp_next) {
            if (memcmp(&tmp->a.a6, &ent->a.a6, 16) != 0)
                continue;

            SLIST_REMOVE(&head[hash], tmp, chashentry, next);
            cfg->items6--;
            tb->ent_ptr = tmp;
            tei->value = tmp->value;
            *pnum = 1;
            return (0);
        }
    }

    return (ENOENT);
}

/*
 * Frees the entry left over in @ta_buf after a failed add or a
 * successful delete.
 */
static void
ta_flush_chash_entry(struct ip_fw_chain *ch, struct tentry_info *tei,
    void *ta_buf)
{
    struct ta_buf_chash *tb;

    tb = (struct ta_buf_chash *)ta_buf;

    if (tb->ent_ptr != NULL)
        free(tb->ent_ptr, M_IPFW_TBL);
}

/*
 * Hash growing
 * callbacks.
 */
static int
ta_need_modify_chash(void *ta_state, struct table_info *ti, uint32_t count,
    uint64_t *pflags)
{
    struct chash_cfg *cfg;
    uint64_t data;

    /*
     * Since we don't know exact number of IPv4/IPv6 records in @count,
     * ignore non-zero @count value at all. Check current hash sizes
     * and return appropriate data.
     */
    cfg = (struct chash_cfg *)ta_state;
    data = 0;
    if (cfg->items4 > cfg->size4 && cfg->size4 < 65536)
        data |= (cfg->size4 * 2) << 16;
    if (cfg->items6 > cfg->size6 && cfg->size6 < 65536)
        data |= cfg->size6 * 2;

    if (data != 0) {
        *pflags = data;
        return (1);
    }

    return (0);
}

/*
 * Allocate new, larger chash.
 */
static int
ta_prepare_mod_chash(void *ta_buf, uint64_t *pflags)
{
    struct mod_item *mi;
    struct chashbhead *head;
    int i;

    mi = (struct mod_item *)ta_buf;

    memset(mi, 0, sizeof(struct mod_item));
    mi->size = (*pflags >> 16) & 0xFFFF;
    mi->size6 = *pflags & 0xFFFF;
    if (mi->size > 0) {
        head = malloc(sizeof(struct chashbhead) * mi->size,
            M_IPFW, M_WAITOK | M_ZERO);
        for (i = 0; i < mi->size; i++)
            SLIST_INIT(&head[i]);
        mi->main_ptr = head;
    }

    if (mi->size6 > 0) {
        head = malloc(sizeof(struct chashbhead) * mi->size6,
            M_IPFW, M_WAITOK | M_ZERO);
        for (i = 0; i < mi->size6; i++)
            SLIST_INIT(&head[i]);
        mi->main_ptr6 = head;
    }

    return (0);
}

/*
 * Copy data from old runtime array to new one.
 */
static int
ta_fill_mod_chash(void *ta_state, struct table_info *ti, void *ta_buf,
    uint64_t *pflags)
{

    /* It is not possible to do a rehash if we're not holding WLOCK. */
    return (0);
}

/*
 * Switch old & new arrays.
 */
static void
ta_modify_chash(void *ta_state, struct table_info *ti, void *ta_buf,
    uint64_t pflags)
{
    struct mod_item *mi;
    struct chash_cfg *cfg;
    struct chashbhead *old_head, *new_head;
    struct chashentry *ent, *ent_next;
    int af, i, mlen;
    uint32_t nhash;
    size_t old_size, new_size;

    mi = (struct mod_item *)ta_buf;
    cfg = (struct chash_cfg *)ta_state;

    /* Check which hash we need to grow and do we still need that */
    if (mi->size > 0 && cfg->size4 < mi->size) {
        new_head = (struct chashbhead *)mi->main_ptr;
        new_size = mi->size;
        old_size = cfg->size4;
        old_head = ti->state;
        mlen = cfg->mask4;
        af = AF_INET;

        for (i = 0; i < old_size; i++) {
            SLIST_FOREACH_SAFE(ent, &old_head[i], next, ent_next) {
                nhash = hash_ent(ent, af, mlen, new_size);
                SLIST_INSERT_HEAD(&new_head[nhash], ent, next);
            }
        }

        ti->state = new_head;
        cfg->head4 = new_head;
        cfg->size4 = mi->size;
        /* Hand the old array back for ta_flush_mod_chash() to free */
        mi->main_ptr = old_head;
    }

    if (mi->size6 > 0 && cfg->size6 < mi->size6) {
        new_head = (struct chashbhead *)mi->main_ptr6;
        new_size = mi->size6;
        old_size = cfg->size6;
        old_head = ti->xstate;
        mlen = cfg->mask6;
        af = AF_INET6;

        for (i = 0; i < old_size; i++) {
            SLIST_FOREACH_SAFE(ent, &old_head[i], next, ent_next) {
                nhash = hash_ent(ent, af, mlen, new_size);
                SLIST_INSERT_HEAD(&new_head[nhash], ent, next);
            }
        }

        ti->xstate = new_head;
        cfg->head6 = new_head;
        cfg->size6 = mi->size6;
        mi->main_ptr6 = old_head;
    }

    /* Update lower 32 bits with new values */
    ti->data &= 0xFFFFFFFF00000000;
    ti->data |= ta_log2(cfg->size4) << 8 | ta_log2(cfg->size6);
}

/*
 * Free unneeded array.
 */
static void
ta_flush_mod_chash(void *ta_buf)
{
    struct mod_item *mi;

    mi = (struct mod_item *)ta_buf;
    if (mi->main_ptr != NULL)
        free(mi->main_ptr, M_IPFW);
    if (mi->main_ptr6 != NULL)
        free(mi->main_ptr6, M_IPFW);
}

struct table_algo addr_hash = {
    .name = "addr:hash",
    .type = IPFW_TABLE_ADDR,
    .ta_buf_size = sizeof(struct ta_buf_chash),
    .init = ta_init_chash,
    .destroy = ta_destroy_chash,
    .prepare_add = ta_prepare_add_chash,
    .prepare_del = ta_prepare_del_chash,
    .add = ta_add_chash,
    .del = ta_del_chash,
    .flush_entry = ta_flush_chash_entry,
    .foreach = ta_foreach_chash,
    .dump_tentry = ta_dump_chash_tentry,
    .find_tentry = ta_find_chash_tentry,
    .print_config = ta_print_chash_config,
    .dump_tinfo = ta_dump_chash_tinfo,
    .need_modify = ta_need_modify_chash,
    .prepare_mod = ta_prepare_mod_chash,
    .fill_mod = ta_fill_mod_chash,
    .modify = ta_modify_chash,
    .flush_mod = ta_flush_mod_chash,
};

/*
 * Iface table cmds.
 *
 * Implementation:
 *
 * Runtime part:
 * - sorted array of "struct ifidx" pointed by ti->state.
 *   Array is allocated with rounding up to IFIDX_CHUNK. Only existing
 *   interfaces are stored in array, however its allocated size is
 *   sufficient to hold all table records if needed.
 * - current array size is stored in ti->data
 *
 * Table data:
 * - "struct iftable_cfg" is allocated to store table state (ta_state).
 * - All table records are stored inside namedobj instance.
 *
 */

struct ifidx {
    uint16_t kidx;
    uint16_t spare;
    uint32_t value;
};
#define DEFAULT_IFIDX_SIZE 64

struct iftable_cfg;

struct ifentry {
    struct named_object no;
    struct ipfw_ifc ic;
    struct iftable_cfg *icfg;
    uint32_t value;
    int linked;
};

struct iftable_cfg {
    struct namedobj_instance *ii;
    struct ip_fw_chain *ch;
    struct table_info *ti;
    void *main_ptr;
    size_t size;  /* Number of items allocated in array */
    size_t count; /* Number of all items */
    size_t used;  /* Number of items _active_ now */
};

struct ta_buf_ifidx {
    struct ifentry *ife;
    uint32_t value;
};

int compare_ifidx(const void *k, const void *v);
static struct ifidx * ifidx_find(struct table_info *ti, void *key);
static int ta_lookup_ifidx(struct table_info *ti, void *key, uint32_t keylen,
    uint32_t *val);
static int ta_init_ifidx(struct ip_fw_chain *ch, void **ta_state,
    struct table_info *ti, char *data, uint8_t tflags);
static void ta_change_ti_ifidx(void *ta_state, struct table_info *ti);
static int destroy_ifidx_locked(struct namedobj_instance *ii,
    struct named_object *no, void *arg);
static void ta_destroy_ifidx(void *ta_state, struct table_info *ti);
static void ta_dump_ifidx_tinfo(void *ta_state, struct table_info *ti,
    ipfw_ta_tinfo *tinfo);
static int ta_prepare_add_ifidx(struct ip_fw_chain *ch,
    struct tentry_info *tei, void *ta_buf);
static int ta_add_ifidx(void *ta_state, struct table_info *ti,
    struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
static int ta_prepare_del_ifidx(struct ip_fw_chain *ch,
    struct tentry_info *tei, void *ta_buf);
static int ta_del_ifidx(void *ta_state, struct table_info *ti,
    struct tentry_info *tei, void *ta_buf, uint32_t *pnum);
static void ta_flush_ifidx_entry(struct ip_fw_chain *ch,
    struct tentry_info *tei, void *ta_buf);
static void if_notifier(struct ip_fw_chain *ch, void *cbdata,
    uint16_t ifindex);
static int ta_need_modify_ifidx(void *ta_state, struct table_info *ti,
    uint32_t count, uint64_t *pflags);
static int ta_prepare_mod_ifidx(void *ta_buf, uint64_t *pflags);
static int ta_fill_mod_ifidx(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t *pflags); static void ta_modify_ifidx(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t pflags); static void ta_flush_mod_ifidx(void *ta_buf); static int ta_dump_ifidx_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent); static int ta_find_ifidx_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent); static int foreach_ifidx(struct namedobj_instance *ii, struct named_object *no, void *arg); static void ta_foreach_ifidx(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg); int compare_ifidx(const void *k, const void *v) { const struct ifidx *ifidx; uint16_t key; key = *((const uint16_t *)k); ifidx = (const struct ifidx *)v; if (key < ifidx->kidx) return (-1); else if (key > ifidx->kidx) return (1); return (0); } /* * Adds item @item with key @key into ascending-sorted array @base. * Assumes @base has enough additional storage. * * Returns 1 on success, 0 on duplicate key. */ static int badd(const void *key, void *item, void *base, size_t nmemb, size_t size, int (*compar) (const void *, const void *)) { int min, max, mid, shift, res; caddr_t paddr; if (nmemb == 0) { memcpy(base, item, size); return (1); } /* Binary search */ min = 0; max = nmemb - 1; mid = 0; while (min <= max) { mid = (min + max) / 2; res = compar(key, (const void *)((caddr_t)base + mid * size)); if (res == 0) return (0); if (res > 0) min = mid + 1; else max = mid - 1; } /* Item not found. */ res = compar(key, (const void *)((caddr_t)base + mid * size)); if (res > 0) shift = mid + 1; else shift = mid; paddr = (caddr_t)base + shift * size; if (nmemb > shift) memmove(paddr + size, paddr, (nmemb - shift) * size); memcpy(paddr, item, size); return (1); } /* * Deletes item with key @key from ascending-sorted array @base. * * Returns 1 on success, 0 for non-existent key. 
*/ static int bdel(const void *key, void *base, size_t nmemb, size_t size, int (*compar) (const void *, const void *)) { caddr_t item; size_t sz; item = (caddr_t)bsearch(key, base, nmemb, size, compar); if (item == NULL) return (0); sz = (caddr_t)base + nmemb * size - item; if (sz > 0) memmove(item, item + size, sz); return (1); } static struct ifidx * ifidx_find(struct table_info *ti, void *key) { struct ifidx *ifi; ifi = bsearch(key, ti->state, ti->data, sizeof(struct ifidx), compare_ifidx); return (ifi); } static int ta_lookup_ifidx(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val) { struct ifidx *ifi; ifi = ifidx_find(ti, key); if (ifi != NULL) { *val = ifi->value; return (1); } return (0); } static int ta_init_ifidx(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags) { struct iftable_cfg *icfg; icfg = malloc(sizeof(struct iftable_cfg), M_IPFW, M_WAITOK | M_ZERO); - icfg->ii = ipfw_objhash_create(DEFAULT_IFIDX_SIZE); + icfg->ii = ipfw_objhash_create(DEFAULT_IFIDX_SIZE, DEFAULT_OBJHASH_SIZE); icfg->size = DEFAULT_IFIDX_SIZE; icfg->main_ptr = malloc(sizeof(struct ifidx) * icfg->size, M_IPFW, M_WAITOK | M_ZERO); icfg->ch = ch; *ta_state = icfg; ti->state = icfg->main_ptr; ti->lookup = ta_lookup_ifidx; return (0); } /* * Handle tableinfo @ti pointer change (on table array resize). 
*/ static void ta_change_ti_ifidx(void *ta_state, struct table_info *ti) { struct iftable_cfg *icfg; icfg = (struct iftable_cfg *)ta_state; icfg->ti = ti; } static int destroy_ifidx_locked(struct namedobj_instance *ii, struct named_object *no, void *arg) { struct ifentry *ife; struct ip_fw_chain *ch; ch = (struct ip_fw_chain *)arg; ife = (struct ifentry *)no; ipfw_iface_del_notify(ch, &ife->ic); ipfw_iface_unref(ch, &ife->ic); free(ife, M_IPFW_TBL); return (0); } /* * Destroys table @ti */ static void ta_destroy_ifidx(void *ta_state, struct table_info *ti) { struct iftable_cfg *icfg; struct ip_fw_chain *ch; icfg = (struct iftable_cfg *)ta_state; ch = icfg->ch; if (icfg->main_ptr != NULL) free(icfg->main_ptr, M_IPFW); IPFW_UH_WLOCK(ch); ipfw_objhash_foreach(icfg->ii, destroy_ifidx_locked, ch); IPFW_UH_WUNLOCK(ch); ipfw_objhash_destroy(icfg->ii); free(icfg, M_IPFW); } /* * Provide algo-specific table info */ static void ta_dump_ifidx_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) { struct iftable_cfg *cfg; cfg = (struct iftable_cfg *)ta_state; tinfo->taclass4 = IPFW_TACLASS_ARRAY; tinfo->size4 = cfg->size; tinfo->count4 = cfg->used; tinfo->itemsize4 = sizeof(struct ifidx); } /* * Prepare state to add to the table: * allocate ifentry and reference needed interface. 
*/ static int ta_prepare_add_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_ifidx *tb; char *ifname; struct ifentry *ife; tb = (struct ta_buf_ifidx *)ta_buf; /* Check if string is terminated */ ifname = (char *)tei->paddr; if (strnlen(ifname, IF_NAMESIZE) == IF_NAMESIZE) return (EINVAL); ife = malloc(sizeof(struct ifentry), M_IPFW_TBL, M_WAITOK | M_ZERO); ife->ic.cb = if_notifier; ife->ic.cbdata = ife; if (ipfw_iface_ref(ch, ifname, &ife->ic) != 0) { free(ife, M_IPFW_TBL); return (EINVAL); } /* Use ipfw_iface 'ifname' field as stable storage */ ife->no.name = ife->ic.iface->ifname; tb->ife = ife; return (0); } static int ta_add_ifidx(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct iftable_cfg *icfg; struct ifentry *ife, *tmp; struct ta_buf_ifidx *tb; struct ipfw_iface *iif; struct ifidx *ifi; char *ifname; uint32_t value; tb = (struct ta_buf_ifidx *)ta_buf; ifname = (char *)tei->paddr; icfg = (struct iftable_cfg *)ta_state; ife = tb->ife; ife->icfg = icfg; ife->value = tei->value; tmp = (struct ifentry *)ipfw_objhash_lookup_name(icfg->ii, 0, ifname); if (tmp != NULL) { if ((tei->flags & TEI_FLAGS_UPDATE) == 0) return (EEXIST); /* Exchange values in @tmp and @tei */ value = tmp->value; tmp->value = tei->value; tei->value = value; iif = tmp->ic.iface; if (iif->resolved != 0) { /* We have to update runtime value, too */ ifi = ifidx_find(ti, &iif->ifindex); ifi->value = ife->value; } /* Indicate that update has happened instead of addition */ tei->flags |= TEI_FLAGS_UPDATED; *pnum = 0; return (0); } if ((tei->flags & TEI_FLAGS_DONTADD) != 0) return (EFBIG); /* Link to internal list */ ipfw_objhash_add(icfg->ii, &ife->no); /* Link notifier (possible running its callback) */ ipfw_iface_add_notify(icfg->ch, &ife->ic); icfg->count++; tb->ife = NULL; *pnum = 1; return (0); } /* * Prepare to delete key from table. * Do basic interface name checks. 
*/ static int ta_prepare_del_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { char *ifname; /* Check if string is terminated */ ifname = (char *)tei->paddr; if (strnlen(ifname, IF_NAMESIZE) == IF_NAMESIZE) return (EINVAL); return (0); } /* * Remove key from both configuration list and * runtime array. Removed interface notification. */ static int ta_del_ifidx(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct iftable_cfg *icfg; struct ifentry *ife; struct ta_buf_ifidx *tb; char *ifname; uint16_t ifindex; int res __diagused; tb = (struct ta_buf_ifidx *)ta_buf; ifname = (char *)tei->paddr; icfg = (struct iftable_cfg *)ta_state; ife = (struct ifentry *)ipfw_objhash_lookup_name(icfg->ii, 0, ifname); if (ife == NULL) return (ENOENT); if (ife->linked != 0) { /* We have to remove item from runtime */ ifindex = ife->ic.iface->ifindex; res = bdel(&ifindex, icfg->main_ptr, icfg->used, sizeof(struct ifidx), compare_ifidx); KASSERT(res == 1, ("index %d does not exist", ifindex)); icfg->used--; ti->data = icfg->used; ife->linked = 0; } /* Unlink from local list */ ipfw_objhash_del(icfg->ii, &ife->no); /* Unlink notifier and deref */ ipfw_iface_del_notify(icfg->ch, &ife->ic); ipfw_iface_unref(icfg->ch, &ife->ic); icfg->count--; tei->value = ife->value; tb->ife = ife; *pnum = 1; return (0); } /* * Flush deleted entry. * Drops interface reference and frees entry. */ static void ta_flush_ifidx_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_ifidx *tb; tb = (struct ta_buf_ifidx *)ta_buf; if (tb->ife != NULL) free(tb->ife, M_IPFW_TBL); } /* * Handle interface announce/withdrawal for particular table. * Every real runtime array modification happens here. 
 */
static void
if_notifier(struct ip_fw_chain *ch, void *cbdata, uint16_t ifindex)
{
	struct ifentry *ife;
	struct ifidx ifi;
	struct iftable_cfg *icfg;
	struct table_info *ti;
	int res __diagused;

	ife = (struct ifentry *)cbdata;
	icfg = ife->icfg;
	ti = icfg->ti;

	/* change_ti callback must have stored @ti before any notification. */
	KASSERT(ti != NULL, ("ti=NULL, check change_ti handler"));

	if (ife->linked == 0 && ifindex != 0) {
		/* Interface announce */
		ifi.kidx = ifindex;
		ifi.spare = 0;
		ifi.value = ife->value;
		/* Insert into the sorted runtime array (keyed by kidx). */
		res = badd(&ifindex, &ifi, icfg->main_ptr, icfg->used,
		    sizeof(struct ifidx), compare_ifidx);
		KASSERT(res == 1, ("index %d already exists", ifindex));
		icfg->used++;
		ti->data = icfg->used;	/* publish new entry count */
		ife->linked = 1;
	} else if (ife->linked != 0 && ifindex == 0) {
		/* Interface withdrawal */
		ifindex = ife->ic.iface->ifindex;
		res = bdel(&ifindex, icfg->main_ptr, icfg->used,
		    sizeof(struct ifidx), compare_ifidx);
		KASSERT(res == 1, ("index %d does not exist", ifindex));
		icfg->used--;
		ti->data = icfg->used;	/* publish new entry count */
		ife->linked = 0;
	}
}

/*
 * Table growing callbacks.
 */
static int
ta_need_modify_ifidx(void *ta_state, struct table_info *ti, uint32_t count,
    uint64_t *pflags)
{
	struct iftable_cfg *cfg;
	uint32_t size;

	cfg = (struct iftable_cfg *)ta_state;

	/*
	 * Double the array size until it can hold all configured
	 * entries (cfg->count, not just the resolved ones) plus the
	 * @count entries about to be added.
	 */
	size = cfg->size;
	while (size < cfg->count + count)
		size *= 2;

	if (size != cfg->size) {
		*pflags = size;		/* requested new size */
		return (1);
	}

	return (0);
}

/*
 * Allocate new, larger runtime ifidx array.
 */
static int
ta_prepare_mod_ifidx(void *ta_buf, uint64_t *pflags)
{
	struct mod_item *mi;

	mi = (struct mod_item *)ta_buf;

	memset(mi, 0, sizeof(struct mod_item));
	mi->size = *pflags;	/* size chosen by ta_need_modify_ifidx() */
	mi->main_ptr = malloc(sizeof(struct ifidx) * mi->size, M_IPFW,
	    M_WAITOK | M_ZERO);

	return (0);
}

/*
 * Copy data from old runtime array to new one.
*/ static int ta_fill_mod_ifidx(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t *pflags) { struct mod_item *mi; struct iftable_cfg *icfg; mi = (struct mod_item *)ta_buf; icfg = (struct iftable_cfg *)ta_state; /* Check if we still need to grow array */ if (icfg->size >= mi->size) { *pflags = 0; return (0); } memcpy(mi->main_ptr, icfg->main_ptr, icfg->used * sizeof(struct ifidx)); return (0); } /* * Switch old & new arrays. */ static void ta_modify_ifidx(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t pflags) { struct mod_item *mi; struct iftable_cfg *icfg; void *old_ptr; mi = (struct mod_item *)ta_buf; icfg = (struct iftable_cfg *)ta_state; old_ptr = icfg->main_ptr; icfg->main_ptr = mi->main_ptr; icfg->size = mi->size; ti->state = icfg->main_ptr; mi->main_ptr = old_ptr; } /* * Free unneded array. */ static void ta_flush_mod_ifidx(void *ta_buf) { struct mod_item *mi; mi = (struct mod_item *)ta_buf; if (mi->main_ptr != NULL) free(mi->main_ptr, M_IPFW); } static int ta_dump_ifidx_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent) { struct ifentry *ife; ife = (struct ifentry *)e; tent->masklen = 8 * IF_NAMESIZE; memcpy(&tent->k, ife->no.name, IF_NAMESIZE); tent->v.kidx = ife->value; return (0); } static int ta_find_ifidx_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent) { struct iftable_cfg *icfg; struct ifentry *ife; char *ifname; icfg = (struct iftable_cfg *)ta_state; ifname = tent->k.iface; if (strnlen(ifname, IF_NAMESIZE) == IF_NAMESIZE) return (EINVAL); ife = (struct ifentry *)ipfw_objhash_lookup_name(icfg->ii, 0, ifname); if (ife != NULL) { ta_dump_ifidx_tentry(ta_state, ti, ife, tent); return (0); } return (ENOENT); } struct wa_ifidx { ta_foreach_f *f; void *arg; }; static int foreach_ifidx(struct namedobj_instance *ii, struct named_object *no, void *arg) { struct ifentry *ife; struct wa_ifidx *wa; ife = (struct ifentry *)no; wa = (struct wa_ifidx *)arg; wa->f(ife, wa->arg); return (0); } 
static void ta_foreach_ifidx(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg) { struct iftable_cfg *icfg; struct wa_ifidx wa; icfg = (struct iftable_cfg *)ta_state; wa.f = f; wa.arg = arg; ipfw_objhash_foreach(icfg->ii, foreach_ifidx, &wa); } struct table_algo iface_idx = { .name = "iface:array", .type = IPFW_TABLE_INTERFACE, .flags = TA_FLAG_DEFAULT, .ta_buf_size = sizeof(struct ta_buf_ifidx), .init = ta_init_ifidx, .destroy = ta_destroy_ifidx, .prepare_add = ta_prepare_add_ifidx, .prepare_del = ta_prepare_del_ifidx, .add = ta_add_ifidx, .del = ta_del_ifidx, .flush_entry = ta_flush_ifidx_entry, .foreach = ta_foreach_ifidx, .dump_tentry = ta_dump_ifidx_tentry, .find_tentry = ta_find_ifidx_tentry, .dump_tinfo = ta_dump_ifidx_tinfo, .need_modify = ta_need_modify_ifidx, .prepare_mod = ta_prepare_mod_ifidx, .fill_mod = ta_fill_mod_ifidx, .modify = ta_modify_ifidx, .flush_mod = ta_flush_mod_ifidx, .change_ti = ta_change_ti_ifidx, }; /* * Number array cmds. * * Implementation: * * Runtime part: * - sorted array of "struct numarray" pointed by ti->state. * Array is allocated with rounding up to NUMARRAY_CHUNK. 
* - current array size is stored in ti->data * */ struct numarray { uint32_t number; uint32_t value; }; struct numarray_cfg { void *main_ptr; size_t size; /* Number of items allocated in array */ size_t used; /* Number of items _active_ now */ }; struct ta_buf_numarray { struct numarray na; }; int compare_numarray(const void *k, const void *v); static struct numarray *numarray_find(struct table_info *ti, void *key); static int ta_lookup_numarray(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val); static int ta_init_numarray(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags); static void ta_destroy_numarray(void *ta_state, struct table_info *ti); static void ta_dump_numarray_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo); static int ta_prepare_add_numarray(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_add_numarray(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static int ta_del_numarray(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static void ta_flush_numarray_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_need_modify_numarray(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags); static int ta_prepare_mod_numarray(void *ta_buf, uint64_t *pflags); static int ta_fill_mod_numarray(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t *pflags); static void ta_modify_numarray(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t pflags); static void ta_flush_mod_numarray(void *ta_buf); static int ta_dump_numarray_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent); static int ta_find_numarray_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent); static void ta_foreach_numarray(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg); int 
compare_numarray(const void *k, const void *v) { const struct numarray *na; uint32_t key; key = *((const uint32_t *)k); na = (const struct numarray *)v; if (key < na->number) return (-1); else if (key > na->number) return (1); return (0); } static struct numarray * numarray_find(struct table_info *ti, void *key) { struct numarray *ri; ri = bsearch(key, ti->state, ti->data, sizeof(struct numarray), compare_numarray); return (ri); } static int ta_lookup_numarray(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val) { struct numarray *ri; ri = numarray_find(ti, key); if (ri != NULL) { *val = ri->value; return (1); } return (0); } static int ta_init_numarray(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags) { struct numarray_cfg *cfg; cfg = malloc(sizeof(*cfg), M_IPFW, M_WAITOK | M_ZERO); cfg->size = 16; cfg->main_ptr = malloc(sizeof(struct numarray) * cfg->size, M_IPFW, M_WAITOK | M_ZERO); *ta_state = cfg; ti->state = cfg->main_ptr; ti->lookup = ta_lookup_numarray; return (0); } /* * Destroys table @ti */ static void ta_destroy_numarray(void *ta_state, struct table_info *ti) { struct numarray_cfg *cfg; cfg = (struct numarray_cfg *)ta_state; if (cfg->main_ptr != NULL) free(cfg->main_ptr, M_IPFW); free(cfg, M_IPFW); } /* * Provide algo-specific table info */ static void ta_dump_numarray_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) { struct numarray_cfg *cfg; cfg = (struct numarray_cfg *)ta_state; tinfo->taclass4 = IPFW_TACLASS_ARRAY; tinfo->size4 = cfg->size; tinfo->count4 = cfg->used; tinfo->itemsize4 = sizeof(struct numarray); } /* * Prepare for addition/deletion to an array. 
*/ static int ta_prepare_add_numarray(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_numarray *tb; tb = (struct ta_buf_numarray *)ta_buf; tb->na.number = *((uint32_t *)tei->paddr); return (0); } static int ta_add_numarray(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct numarray_cfg *cfg; struct ta_buf_numarray *tb; struct numarray *ri; int res __diagused; uint32_t value; tb = (struct ta_buf_numarray *)ta_buf; cfg = (struct numarray_cfg *)ta_state; /* Read current value from @tei */ tb->na.value = tei->value; ri = numarray_find(ti, &tb->na.number); if (ri != NULL) { if ((tei->flags & TEI_FLAGS_UPDATE) == 0) return (EEXIST); /* Exchange values between ri and @tei */ value = ri->value; ri->value = tei->value; tei->value = value; /* Indicate that update has happened instead of addition */ tei->flags |= TEI_FLAGS_UPDATED; *pnum = 0; return (0); } if ((tei->flags & TEI_FLAGS_DONTADD) != 0) return (EFBIG); res = badd(&tb->na.number, &tb->na, cfg->main_ptr, cfg->used, sizeof(struct numarray), compare_numarray); KASSERT(res == 1, ("number %d already exists", tb->na.number)); cfg->used++; ti->data = cfg->used; *pnum = 1; return (0); } /* * Remove key from both configuration list and * runtime array. Removed interface notification. 
*/ static int ta_del_numarray(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct numarray_cfg *cfg; struct ta_buf_numarray *tb; struct numarray *ri; int res __diagused; tb = (struct ta_buf_numarray *)ta_buf; cfg = (struct numarray_cfg *)ta_state; ri = numarray_find(ti, &tb->na.number); if (ri == NULL) return (ENOENT); tei->value = ri->value; res = bdel(&tb->na.number, cfg->main_ptr, cfg->used, sizeof(struct numarray), compare_numarray); KASSERT(res == 1, ("number %u does not exist", tb->na.number)); cfg->used--; ti->data = cfg->used; *pnum = 1; return (0); } static void ta_flush_numarray_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { /* We don't have any state, do nothing */ } /* * Table growing callbacks. */ static int ta_need_modify_numarray(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags) { struct numarray_cfg *cfg; size_t size; cfg = (struct numarray_cfg *)ta_state; size = cfg->size; while (size < cfg->used + count) size *= 2; if (size != cfg->size) { *pflags = size; return (1); } return (0); } /* * Allocate new, larger runtime array. */ static int ta_prepare_mod_numarray(void *ta_buf, uint64_t *pflags) { struct mod_item *mi; mi = (struct mod_item *)ta_buf; memset(mi, 0, sizeof(struct mod_item)); mi->size = *pflags; mi->main_ptr = malloc(sizeof(struct numarray) * mi->size, M_IPFW, M_WAITOK | M_ZERO); return (0); } /* * Copy data from old runtime array to new one. */ static int ta_fill_mod_numarray(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t *pflags) { struct mod_item *mi; struct numarray_cfg *cfg; mi = (struct mod_item *)ta_buf; cfg = (struct numarray_cfg *)ta_state; /* Check if we still need to grow array */ if (cfg->size >= mi->size) { *pflags = 0; return (0); } memcpy(mi->main_ptr, cfg->main_ptr, cfg->used * sizeof(struct numarray)); return (0); } /* * Switch old & new arrays. 
*/ static void ta_modify_numarray(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t pflags) { struct mod_item *mi; struct numarray_cfg *cfg; void *old_ptr; mi = (struct mod_item *)ta_buf; cfg = (struct numarray_cfg *)ta_state; old_ptr = cfg->main_ptr; cfg->main_ptr = mi->main_ptr; cfg->size = mi->size; ti->state = cfg->main_ptr; mi->main_ptr = old_ptr; } /* * Free unneded array. */ static void ta_flush_mod_numarray(void *ta_buf) { struct mod_item *mi; mi = (struct mod_item *)ta_buf; if (mi->main_ptr != NULL) free(mi->main_ptr, M_IPFW); } static int ta_dump_numarray_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent) { struct numarray *na; na = (struct numarray *)e; tent->k.key = na->number; tent->v.kidx = na->value; return (0); } static int ta_find_numarray_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent) { struct numarray *ri; ri = numarray_find(ti, &tent->k.key); if (ri != NULL) { ta_dump_numarray_tentry(ta_state, ti, ri, tent); return (0); } return (ENOENT); } static void ta_foreach_numarray(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg) { struct numarray_cfg *cfg; struct numarray *array; int i; cfg = (struct numarray_cfg *)ta_state; array = cfg->main_ptr; for (i = 0; i < cfg->used; i++) f(&array[i], arg); } struct table_algo number_array = { .name = "number:array", .type = IPFW_TABLE_NUMBER, .ta_buf_size = sizeof(struct ta_buf_numarray), .init = ta_init_numarray, .destroy = ta_destroy_numarray, .prepare_add = ta_prepare_add_numarray, .prepare_del = ta_prepare_add_numarray, .add = ta_add_numarray, .del = ta_del_numarray, .flush_entry = ta_flush_numarray_entry, .foreach = ta_foreach_numarray, .dump_tentry = ta_dump_numarray_tentry, .find_tentry = ta_find_numarray_tentry, .dump_tinfo = ta_dump_numarray_tinfo, .need_modify = ta_need_modify_numarray, .prepare_mod = ta_prepare_mod_numarray, .fill_mod = ta_fill_mod_numarray, .modify = ta_modify_numarray, .flush_mod = ta_flush_mod_numarray, 
}; /* * flow:hash cmds * * * ti->data: * [inv.mask4][inv.mask6][log2hsize4][log2hsize6] * [ 8][ 8[ 8][ 8] * * inv.mask4: 32 - mask * inv.mask6: * 1) _slow lookup: mask * 2) _aligned: (128 - mask) / 8 * 3) _64: 8 * * * pflags: * [hsize4][hsize6] * [ 16][ 16] */ struct fhashentry; SLIST_HEAD(fhashbhead, fhashentry); struct fhashentry { SLIST_ENTRY(fhashentry) next; uint8_t af; uint8_t proto; uint16_t spare0; uint16_t dport; uint16_t sport; uint32_t value; uint32_t spare1; }; struct fhashentry4 { struct fhashentry e; struct in_addr dip; struct in_addr sip; }; struct fhashentry6 { struct fhashentry e; struct in6_addr dip6; struct in6_addr sip6; }; struct fhash_cfg { struct fhashbhead *head; size_t size; size_t items; struct fhashentry4 fe4; struct fhashentry6 fe6; }; struct ta_buf_fhash { void *ent_ptr; struct fhashentry6 fe6; }; static __inline int cmp_flow_ent(struct fhashentry *a, struct fhashentry *b, size_t sz); static __inline uint32_t hash_flow4(struct fhashentry4 *f, int hsize); static __inline uint32_t hash_flow6(struct fhashentry6 *f, int hsize); static uint32_t hash_flow_ent(struct fhashentry *ent, uint32_t size); static int ta_lookup_fhash(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val); static int ta_init_fhash(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags); static void ta_destroy_fhash(void *ta_state, struct table_info *ti); static void ta_dump_fhash_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo); static int ta_dump_fhash_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent); static int tei_to_fhash_ent(struct tentry_info *tei, struct fhashentry *ent); static int ta_find_fhash_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent); static void ta_foreach_fhash(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg); static int ta_prepare_add_fhash(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int 
ta_add_fhash(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static int ta_prepare_del_fhash(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_del_fhash(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static void ta_flush_fhash_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_need_modify_fhash(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags); static int ta_prepare_mod_fhash(void *ta_buf, uint64_t *pflags); static int ta_fill_mod_fhash(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t *pflags); static void ta_modify_fhash(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t pflags); static void ta_flush_mod_fhash(void *ta_buf); static __inline int cmp_flow_ent(struct fhashentry *a, struct fhashentry *b, size_t sz) { uint64_t *ka, *kb; ka = (uint64_t *)(&a->next + 1); kb = (uint64_t *)(&b->next + 1); if (*ka == *kb && (memcmp(a + 1, b + 1, sz) == 0)) return (1); return (0); } static __inline uint32_t hash_flow4(struct fhashentry4 *f, int hsize) { uint32_t i; i = (f->dip.s_addr) ^ (f->sip.s_addr) ^ (f->e.dport) ^ (f->e.sport); return (i % (hsize - 1)); } static __inline uint32_t hash_flow6(struct fhashentry6 *f, int hsize) { uint32_t i; i = (f->dip6.__u6_addr.__u6_addr32[2]) ^ (f->dip6.__u6_addr.__u6_addr32[3]) ^ (f->sip6.__u6_addr.__u6_addr32[2]) ^ (f->sip6.__u6_addr.__u6_addr32[3]) ^ (f->e.dport) ^ (f->e.sport); return (i % (hsize - 1)); } static uint32_t hash_flow_ent(struct fhashentry *ent, uint32_t size) { uint32_t hash; if (ent->af == AF_INET) { hash = hash_flow4((struct fhashentry4 *)ent, size); } else { hash = hash_flow6((struct fhashentry6 *)ent, size); } return (hash); } static int ta_lookup_fhash(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val) { struct fhashbhead *head; struct fhashentry *ent; struct fhashentry4 *m4; struct ipfw_flow_id *id; 
- uint32_t hsize; - uint16_t hash; + uint32_t hash, hsize; id = (struct ipfw_flow_id *)key; head = (struct fhashbhead *)ti->state; hsize = ti->data; m4 = (struct fhashentry4 *)ti->xstate; if (id->addr_type == 4) { struct fhashentry4 f; /* Copy hash mask */ f = *m4; f.dip.s_addr &= id->dst_ip; f.sip.s_addr &= id->src_ip; f.e.dport &= id->dst_port; f.e.sport &= id->src_port; f.e.proto &= id->proto; hash = hash_flow4(&f, hsize); SLIST_FOREACH(ent, &head[hash], next) { if (cmp_flow_ent(ent, &f.e, 2 * 4) != 0) { *val = ent->value; return (1); } } } else if (id->addr_type == 6) { struct fhashentry6 f; uint64_t *fp, *idp; /* Copy hash mask */ f = *((struct fhashentry6 *)(m4 + 1)); /* Handle lack of __u6_addr.__u6_addr64 */ fp = (uint64_t *)&f.dip6; idp = (uint64_t *)&id->dst_ip6; /* src IPv6 is stored after dst IPv6 */ *fp++ &= *idp++; *fp++ &= *idp++; *fp++ &= *idp++; *fp &= *idp; f.e.dport &= id->dst_port; f.e.sport &= id->src_port; f.e.proto &= id->proto; hash = hash_flow6(&f, hsize); SLIST_FOREACH(ent, &head[hash], next) { if (cmp_flow_ent(ent, &f.e, 2 * 16) != 0) { *val = ent->value; return (1); } } } return (0); } /* * New table. 
*/ static int ta_init_fhash(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags) { struct fhash_cfg *cfg; struct fhashentry4 *fe4; struct fhashentry6 *fe6; u_int i; cfg = malloc(sizeof(struct fhash_cfg), M_IPFW, M_WAITOK | M_ZERO); cfg->size = 512; cfg->head = malloc(sizeof(struct fhashbhead) * cfg->size, M_IPFW, M_WAITOK | M_ZERO); for (i = 0; i < cfg->size; i++) SLIST_INIT(&cfg->head[i]); /* Fill in fe masks based on @tflags */ fe4 = &cfg->fe4; fe6 = &cfg->fe6; if (tflags & IPFW_TFFLAG_SRCIP) { memset(&fe4->sip, 0xFF, sizeof(fe4->sip)); memset(&fe6->sip6, 0xFF, sizeof(fe6->sip6)); } if (tflags & IPFW_TFFLAG_DSTIP) { memset(&fe4->dip, 0xFF, sizeof(fe4->dip)); memset(&fe6->dip6, 0xFF, sizeof(fe6->dip6)); } if (tflags & IPFW_TFFLAG_SRCPORT) { memset(&fe4->e.sport, 0xFF, sizeof(fe4->e.sport)); memset(&fe6->e.sport, 0xFF, sizeof(fe6->e.sport)); } if (tflags & IPFW_TFFLAG_DSTPORT) { memset(&fe4->e.dport, 0xFF, sizeof(fe4->e.dport)); memset(&fe6->e.dport, 0xFF, sizeof(fe6->e.dport)); } if (tflags & IPFW_TFFLAG_PROTO) { memset(&fe4->e.proto, 0xFF, sizeof(fe4->e.proto)); memset(&fe6->e.proto, 0xFF, sizeof(fe6->e.proto)); } fe4->e.af = AF_INET; fe6->e.af = AF_INET6; *ta_state = cfg; ti->state = cfg->head; ti->xstate = &cfg->fe4; ti->data = cfg->size; ti->lookup = ta_lookup_fhash; return (0); } static void ta_destroy_fhash(void *ta_state, struct table_info *ti) { struct fhash_cfg *cfg; struct fhashentry *ent, *ent_next; int i; cfg = (struct fhash_cfg *)ta_state; for (i = 0; i < cfg->size; i++) SLIST_FOREACH_SAFE(ent, &cfg->head[i], next, ent_next) free(ent, M_IPFW_TBL); free(cfg->head, M_IPFW); free(cfg, M_IPFW); } /* * Provide algo-specific table info */ static void ta_dump_fhash_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) { struct fhash_cfg *cfg; cfg = (struct fhash_cfg *)ta_state; tinfo->flags = IPFW_TATFLAGS_AFITEM; tinfo->taclass4 = IPFW_TACLASS_HASH; tinfo->size4 = cfg->size; tinfo->count4 = cfg->items; 
tinfo->itemsize4 = sizeof(struct fhashentry4); tinfo->itemsize6 = sizeof(struct fhashentry6); } static int ta_dump_fhash_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent) { struct fhashentry *ent; struct fhashentry4 *fe4; #ifdef INET6 struct fhashentry6 *fe6; #endif struct tflow_entry *tfe; ent = (struct fhashentry *)e; tfe = &tent->k.flow; tfe->af = ent->af; tfe->proto = ent->proto; tfe->dport = htons(ent->dport); tfe->sport = htons(ent->sport); tent->v.kidx = ent->value; tent->subtype = ent->af; if (ent->af == AF_INET) { fe4 = (struct fhashentry4 *)ent; tfe->a.a4.sip.s_addr = htonl(fe4->sip.s_addr); tfe->a.a4.dip.s_addr = htonl(fe4->dip.s_addr); tent->masklen = 32; #ifdef INET6 } else { fe6 = (struct fhashentry6 *)ent; tfe->a.a6.sip6 = fe6->sip6; tfe->a.a6.dip6 = fe6->dip6; tent->masklen = 128; #endif } return (0); } static int tei_to_fhash_ent(struct tentry_info *tei, struct fhashentry *ent) { #ifdef INET struct fhashentry4 *fe4; #endif #ifdef INET6 struct fhashentry6 *fe6; #endif struct tflow_entry *tfe; tfe = (struct tflow_entry *)tei->paddr; ent->af = tei->subtype; ent->proto = tfe->proto; ent->dport = ntohs(tfe->dport); ent->sport = ntohs(tfe->sport); if (tei->subtype == AF_INET) { #ifdef INET fe4 = (struct fhashentry4 *)ent; fe4->sip.s_addr = ntohl(tfe->a.a4.sip.s_addr); fe4->dip.s_addr = ntohl(tfe->a.a4.dip.s_addr); #endif #ifdef INET6 } else if (tei->subtype == AF_INET6) { fe6 = (struct fhashentry6 *)ent; fe6->sip6 = tfe->a.a6.sip6; fe6->dip6 = tfe->a.a6.dip6; #endif } else { /* Unknown CIDR type */ return (EINVAL); } return (0); } static int ta_find_fhash_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent) { struct fhash_cfg *cfg; struct fhashbhead *head; struct fhashentry *ent, *tmp; struct fhashentry6 fe6; struct tentry_info tei; int error; uint32_t hash; size_t sz; cfg = (struct fhash_cfg *)ta_state; ent = &fe6.e; memset(&fe6, 0, sizeof(fe6)); memset(&tei, 0, sizeof(tei)); tei.paddr = &tent->k.flow; 
tei.subtype = tent->subtype; if ((error = tei_to_fhash_ent(&tei, ent)) != 0) return (error); head = cfg->head; hash = hash_flow_ent(ent, cfg->size); if (tei.subtype == AF_INET) sz = 2 * sizeof(struct in_addr); else sz = 2 * sizeof(struct in6_addr); /* Check for existence */ SLIST_FOREACH(tmp, &head[hash], next) { if (cmp_flow_ent(tmp, ent, sz) != 0) { ta_dump_fhash_tentry(ta_state, ti, tmp, tent); return (0); } } return (ENOENT); } static void ta_foreach_fhash(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg) { struct fhash_cfg *cfg; struct fhashentry *ent, *ent_next; int i; cfg = (struct fhash_cfg *)ta_state; for (i = 0; i < cfg->size; i++) SLIST_FOREACH_SAFE(ent, &cfg->head[i], next, ent_next) f(ent, arg); } static int ta_prepare_add_fhash(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_fhash *tb; struct fhashentry *ent; size_t sz; int error; tb = (struct ta_buf_fhash *)ta_buf; if (tei->subtype == AF_INET) sz = sizeof(struct fhashentry4); else if (tei->subtype == AF_INET6) sz = sizeof(struct fhashentry6); else return (EINVAL); ent = malloc(sz, M_IPFW_TBL, M_WAITOK | M_ZERO); error = tei_to_fhash_ent(tei, ent); if (error != 0) { free(ent, M_IPFW_TBL); return (error); } tb->ent_ptr = ent; return (0); } static int ta_add_fhash(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct fhash_cfg *cfg; struct fhashbhead *head; struct fhashentry *ent, *tmp; struct ta_buf_fhash *tb; int exists; uint32_t hash, value; size_t sz; cfg = (struct fhash_cfg *)ta_state; tb = (struct ta_buf_fhash *)ta_buf; ent = (struct fhashentry *)tb->ent_ptr; exists = 0; /* Read current value from @tei */ ent->value = tei->value; head = cfg->head; hash = hash_flow_ent(ent, cfg->size); if (tei->subtype == AF_INET) sz = 2 * sizeof(struct in_addr); else sz = 2 * sizeof(struct in6_addr); /* Check for existence */ SLIST_FOREACH(tmp, &head[hash], next) { if (cmp_flow_ent(tmp, ent, sz) != 0) { exists = 1; 
break; } } if (exists == 1) { if ((tei->flags & TEI_FLAGS_UPDATE) == 0) return (EEXIST); /* Record already exists. Update value if we're asked to */ /* Exchange values between tmp and @tei */ value = tmp->value; tmp->value = tei->value; tei->value = value; /* Indicate that update has happened instead of addition */ tei->flags |= TEI_FLAGS_UPDATED; *pnum = 0; } else { if ((tei->flags & TEI_FLAGS_DONTADD) != 0) return (EFBIG); SLIST_INSERT_HEAD(&head[hash], ent, next); tb->ent_ptr = NULL; *pnum = 1; /* Update counters and check if we need to grow hash */ cfg->items++; } return (0); } static int ta_prepare_del_fhash(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_fhash *tb; tb = (struct ta_buf_fhash *)ta_buf; return (tei_to_fhash_ent(tei, &tb->fe6.e)); } static int ta_del_fhash(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct fhash_cfg *cfg; struct fhashbhead *head; struct fhashentry *ent, *tmp; struct ta_buf_fhash *tb; uint32_t hash; size_t sz; cfg = (struct fhash_cfg *)ta_state; tb = (struct ta_buf_fhash *)ta_buf; ent = &tb->fe6.e; head = cfg->head; hash = hash_flow_ent(ent, cfg->size); if (tei->subtype == AF_INET) sz = 2 * sizeof(struct in_addr); else sz = 2 * sizeof(struct in6_addr); /* Check for existence */ SLIST_FOREACH(tmp, &head[hash], next) { if (cmp_flow_ent(tmp, ent, sz) == 0) continue; SLIST_REMOVE(&head[hash], tmp, fhashentry, next); tei->value = tmp->value; *pnum = 1; cfg->items--; tb->ent_ptr = tmp; return (0); } return (ENOENT); } static void ta_flush_fhash_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_fhash *tb; tb = (struct ta_buf_fhash *)ta_buf; if (tb->ent_ptr != NULL) free(tb->ent_ptr, M_IPFW_TBL); } /* * Hash growing callbacks. 
*/ static int ta_need_modify_fhash(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags) { struct fhash_cfg *cfg; cfg = (struct fhash_cfg *)ta_state; if (cfg->items > cfg->size && cfg->size < 65536) { *pflags = cfg->size * 2; return (1); } return (0); } /* * Allocate new, larger fhash. */ static int ta_prepare_mod_fhash(void *ta_buf, uint64_t *pflags) { struct mod_item *mi; struct fhashbhead *head; u_int i; mi = (struct mod_item *)ta_buf; memset(mi, 0, sizeof(struct mod_item)); mi->size = *pflags; head = malloc(sizeof(struct fhashbhead) * mi->size, M_IPFW, M_WAITOK | M_ZERO); for (i = 0; i < mi->size; i++) SLIST_INIT(&head[i]); mi->main_ptr = head; return (0); } /* * Copy data from old runtime array to new one. */ static int ta_fill_mod_fhash(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t *pflags) { /* In is not possible to do rehash if we're not holidng WLOCK. */ return (0); } /* * Switch old & new arrays. */ static void ta_modify_fhash(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t pflags) { struct mod_item *mi; struct fhash_cfg *cfg; struct fhashbhead *old_head, *new_head; struct fhashentry *ent, *ent_next; int i; uint32_t nhash; size_t old_size; mi = (struct mod_item *)ta_buf; cfg = (struct fhash_cfg *)ta_state; old_size = cfg->size; old_head = ti->state; new_head = (struct fhashbhead *)mi->main_ptr; for (i = 0; i < old_size; i++) { SLIST_FOREACH_SAFE(ent, &old_head[i], next, ent_next) { nhash = hash_flow_ent(ent, mi->size); SLIST_INSERT_HEAD(&new_head[nhash], ent, next); } } ti->state = new_head; ti->data = mi->size; cfg->head = new_head; cfg->size = mi->size; mi->main_ptr = old_head; } /* * Free unneded array. 
*/ static void ta_flush_mod_fhash(void *ta_buf) { struct mod_item *mi; mi = (struct mod_item *)ta_buf; if (mi->main_ptr != NULL) free(mi->main_ptr, M_IPFW); } struct table_algo flow_hash = { .name = "flow:hash", .type = IPFW_TABLE_FLOW, .flags = TA_FLAG_DEFAULT, .ta_buf_size = sizeof(struct ta_buf_fhash), .init = ta_init_fhash, .destroy = ta_destroy_fhash, .prepare_add = ta_prepare_add_fhash, .prepare_del = ta_prepare_del_fhash, .add = ta_add_fhash, .del = ta_del_fhash, .flush_entry = ta_flush_fhash_entry, .foreach = ta_foreach_fhash, .dump_tentry = ta_dump_fhash_tentry, .find_tentry = ta_find_fhash_tentry, .dump_tinfo = ta_dump_fhash_tinfo, .need_modify = ta_need_modify_fhash, .prepare_mod = ta_prepare_mod_fhash, .fill_mod = ta_fill_mod_fhash, .modify = ta_modify_fhash, .flush_mod = ta_flush_mod_fhash, }; /* * Kernel fibs bindings. * * Implementation: * * Runtime part: * - fully relies on route API * - fib number is stored in ti->data * */ static int ta_lookup_kfib(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val); static int kfib_parse_opts(int *pfib, char *data); static void ta_print_kfib_config(void *ta_state, struct table_info *ti, char *buf, size_t bufsize); static int ta_init_kfib(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags); static void ta_destroy_kfib(void *ta_state, struct table_info *ti); static void ta_dump_kfib_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo); static int ta_dump_kfib_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent); static int ta_dump_kfib_tentry_int(int familt, const struct rtentry *rt, ipfw_obj_tentry *tent); static int ta_find_kfib_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent); static void ta_foreach_kfib(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg); static int ta_lookup_kfib(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val) { #ifdef INET struct in_addr in; #endif 
int error; error = ENOENT; #ifdef INET if (keylen == 4) { in.s_addr = *(in_addr_t *)key; NET_EPOCH_ASSERT(); error = fib4_lookup(ti->data, in, 0, NHR_NONE, 0) != NULL; } #endif #ifdef INET6 if (keylen == 6) error = fib6_lookup(ti->data, (struct in6_addr *)key, 0, NHR_NONE, 0) != NULL; #endif if (error != 0) return (0); *val = 0; return (1); } /* Parse 'fib=%d' */ static int kfib_parse_opts(int *pfib, char *data) { char *pdel, *pend, *s; int fibnum; if (data == NULL) return (0); if ((pdel = strchr(data, ' ')) == NULL) return (0); while (*pdel == ' ') pdel++; if (strncmp(pdel, "fib=", 4) != 0) return (EINVAL); if ((s = strchr(pdel, ' ')) != NULL) *s++ = '\0'; pdel += 4; /* Need \d+ */ fibnum = strtol(pdel, &pend, 10); if (*pend != '\0') return (EINVAL); *pfib = fibnum; return (0); } static void ta_print_kfib_config(void *ta_state, struct table_info *ti, char *buf, size_t bufsize) { if (ti->data != 0) snprintf(buf, bufsize, "%s fib=%lu", "addr:kfib", ti->data); else snprintf(buf, bufsize, "%s", "addr:kfib"); } static int ta_init_kfib(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags) { int error, fibnum; fibnum = 0; if ((error = kfib_parse_opts(&fibnum, data)) != 0) return (error); if (fibnum >= rt_numfibs) return (E2BIG); ti->data = fibnum; ti->lookup = ta_lookup_kfib; return (0); } /* * Destroys table @ti */ static void ta_destroy_kfib(void *ta_state, struct table_info *ti) { } /* * Provide algo-specific table info */ static void ta_dump_kfib_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) { tinfo->flags = IPFW_TATFLAGS_AFDATA; tinfo->taclass4 = IPFW_TACLASS_RADIX; tinfo->count4 = 0; tinfo->itemsize4 = 128; /* table is readonly, value does not matter */ tinfo->taclass6 = IPFW_TACLASS_RADIX; tinfo->count6 = 0; tinfo->itemsize6 = 128; } static int ta_dump_kfib_tentry_int(int family, const struct rtentry *rt, ipfw_obj_tentry *tent) { uint32_t scopeid; int plen; #ifdef INET if (family == AF_INET) { 
rt_get_inet_prefix_plen(rt, &tent->k.addr, &plen, &scopeid); tent->masklen = plen; tent->subtype = AF_INET; tent->v.kidx = 0; } #endif #ifdef INET6 if (family == AF_INET6) { rt_get_inet6_prefix_plen(rt, &tent->k.addr6, &plen, &scopeid); tent->masklen = plen; tent->subtype = AF_INET6; tent->v.kidx = 0; } #endif return (0); } static int ta_find_kfib_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent) { struct rtentry *rt = NULL; struct route_nhop_data rnd; struct epoch_tracker et; int error; NET_EPOCH_ENTER(et); switch (tent->subtype) { #ifdef INET case AF_INET: rt = fib4_lookup_rt(ti->data, tent->k.addr, 0, 0, &rnd); break; #endif #ifdef INET6 case AF_INET6: rt = fib6_lookup_rt(ti->data, &tent->k.addr6, 0, 0, &rnd); break; #endif } if (rt != NULL) error = ta_dump_kfib_tentry_int(tent->subtype, rt, tent); else error = ENOENT; NET_EPOCH_EXIT(et); return (error); } struct kfib_dump_arg { struct rtentry *rt; int family; ta_foreach_f *f; void *arg; }; static int ta_dump_kfib_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent) { struct kfib_dump_arg *karg = (struct kfib_dump_arg *)e; return (ta_dump_kfib_tentry_int(karg->family, karg->rt, tent)); } static int walk_wrapper_f(struct rtentry *rt, void *arg) { struct kfib_dump_arg *karg = (struct kfib_dump_arg *)arg; karg->rt = rt; return (karg->f(karg, karg->arg)); } static void ta_foreach_kfib(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg) { struct kfib_dump_arg karg = { .f = f, .arg = arg }; karg.family = AF_INET; rib_walk(ti->data, AF_INET, false, walk_wrapper_f, &karg); karg.family = AF_INET6; rib_walk(ti->data, AF_INET6, false, walk_wrapper_f, &karg); } struct table_algo addr_kfib = { .name = "addr:kfib", .type = IPFW_TABLE_ADDR, .flags = TA_FLAG_READONLY, .ta_buf_size = 0, .init = ta_init_kfib, .destroy = ta_destroy_kfib, .foreach = ta_foreach_kfib, .dump_tentry = ta_dump_kfib_tentry, .find_tentry = ta_find_kfib_tentry, .dump_tinfo = ta_dump_kfib_tinfo, 
.print_config = ta_print_kfib_config, }; struct mac_radix_entry { struct radix_node rn[2]; + struct sa_mac sa; uint32_t value; uint8_t masklen; - struct sa_mac sa; }; struct mac_radix_cfg { struct radix_node_head *head; size_t count; }; static int ta_lookup_mac_radix(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val) { struct radix_node_head *rnh; if (keylen == ETHER_ADDR_LEN) { struct mac_radix_entry *ent; struct sa_mac sa; KEY_LEN(sa) = KEY_LEN_MAC; memcpy(sa.mac_addr.octet, key, ETHER_ADDR_LEN); rnh = (struct radix_node_head *)ti->state; ent = (struct mac_radix_entry *)(rnh->rnh_matchaddr(&sa, &rnh->rh)); if (ent != NULL) { *val = ent->value; return (1); } } return (0); } static int ta_init_mac_radix(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags) { struct mac_radix_cfg *cfg; if (!rn_inithead(&ti->state, OFF_LEN_MAC)) return (ENOMEM); cfg = malloc(sizeof(struct mac_radix_cfg), M_IPFW, M_WAITOK | M_ZERO); *ta_state = cfg; ti->lookup = ta_lookup_mac_radix; return (0); } static void ta_destroy_mac_radix(void *ta_state, struct table_info *ti) { struct mac_radix_cfg *cfg; struct radix_node_head *rnh; cfg = (struct mac_radix_cfg *)ta_state; rnh = (struct radix_node_head *)(ti->state); rnh->rnh_walktree(&rnh->rh, flush_radix_entry, rnh); rn_detachhead(&ti->state); free(cfg, M_IPFW); } static void tei_to_sockaddr_ent_mac(struct tentry_info *tei, struct sockaddr *sa, struct sockaddr *ma, int *set_mask) { int mlen, i; struct sa_mac *addr, *mask; u_char *cp; mlen = tei->masklen; addr = (struct sa_mac *)sa; mask = (struct sa_mac *)ma; /* Set 'total' structure length */ KEY_LEN(*addr) = KEY_LEN_MAC; KEY_LEN(*mask) = KEY_LEN_MAC; for (i = mlen, cp = mask->mac_addr.octet; i >= 8; i -= 8) *cp++ = 0xFF; if (i > 0) *cp = ~((1 << (8 - i)) - 1); addr->mac_addr = *((struct ether_addr *)tei->paddr); for (i = 0; i < ETHER_ADDR_LEN; ++i) addr->mac_addr.octet[i] &= mask->mac_addr.octet[i]; if (mlen != 8 * ETHER_ADDR_LEN) 
*set_mask = 1; else *set_mask = 0; } static int ta_prepare_add_mac_radix(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_radix *tb; struct mac_radix_entry *ent; struct sockaddr *addr, *mask; int mlen, set_mask; tb = (struct ta_buf_radix *)ta_buf; mlen = tei->masklen; set_mask = 0; if (tei->subtype == AF_LINK) { if (mlen > 8 * ETHER_ADDR_LEN) return (EINVAL); ent = malloc(sizeof(*ent), M_IPFW_TBL, M_WAITOK | M_ZERO); ent->masklen = mlen; addr = (struct sockaddr *)&ent->sa; mask = (struct sockaddr *)&tb->addr.mac.ma; tb->ent_ptr = ent; } else { /* Unknown CIDR type */ return (EINVAL); } tei_to_sockaddr_ent_mac(tei, addr, mask, &set_mask); /* Set pointers */ tb->addr_ptr = addr; if (set_mask != 0) tb->mask_ptr = mask; return (0); } static int ta_add_mac_radix(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct mac_radix_cfg *cfg; struct radix_node_head *rnh; struct radix_node *rn; struct ta_buf_radix *tb; uint32_t *old_value, value; cfg = (struct mac_radix_cfg *)ta_state; tb = (struct ta_buf_radix *)ta_buf; /* Save current entry value from @tei */ rnh = ti->state; ((struct mac_radix_entry *)tb->ent_ptr)->value = tei->value; /* Search for an entry first */ rn = rnh->rnh_lookup(tb->addr_ptr, tb->mask_ptr, &rnh->rh); if (rn != NULL) { if ((tei->flags & TEI_FLAGS_UPDATE) == 0) return (EEXIST); /* Record already exists. 
Update value if we're asked to */ old_value = &((struct mac_radix_entry *)rn)->value; value = *old_value; *old_value = tei->value; tei->value = value; /* Indicate that update has happened instead of addition */ tei->flags |= TEI_FLAGS_UPDATED; *pnum = 0; return (0); } if ((tei->flags & TEI_FLAGS_DONTADD) != 0) return (EFBIG); rn = rnh->rnh_addaddr(tb->addr_ptr, tb->mask_ptr, &rnh->rh, tb->ent_ptr); if (rn == NULL) { /* Unknown error */ return (EINVAL); } cfg->count++; tb->ent_ptr = NULL; *pnum = 1; return (0); } static int ta_prepare_del_mac_radix(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_radix *tb; struct sockaddr *addr, *mask; int mlen, set_mask; tb = (struct ta_buf_radix *)ta_buf; mlen = tei->masklen; set_mask = 0; if (tei->subtype == AF_LINK) { if (mlen > 8 * ETHER_ADDR_LEN) return (EINVAL); addr = (struct sockaddr *)&tb->addr.mac.sa; mask = (struct sockaddr *)&tb->addr.mac.ma; } else return (EINVAL); tei_to_sockaddr_ent_mac(tei, addr, mask, &set_mask); tb->addr_ptr = addr; if (set_mask != 0) tb->mask_ptr = mask; return (0); } static int ta_del_mac_radix(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct mac_radix_cfg *cfg; struct radix_node_head *rnh; struct radix_node *rn; struct ta_buf_radix *tb; cfg = (struct mac_radix_cfg *)ta_state; tb = (struct ta_buf_radix *)ta_buf; rnh = ti->state; rn = rnh->rnh_deladdr(tb->addr_ptr, tb->mask_ptr, &rnh->rh); if (rn == NULL) return (ENOENT); /* Save entry value to @tei */ tei->value = ((struct mac_radix_entry *)rn)->value; tb->ent_ptr = rn; cfg->count--; *pnum = 1; return (0); } static void ta_foreach_mac_radix(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg) { struct radix_node_head *rnh; rnh = (struct radix_node_head *)(ti->state); rnh->rnh_walktree(&rnh->rh, (walktree_f_t *)f, arg); } static void ta_dump_mac_radix_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) { struct mac_radix_cfg *cfg; cfg 
= (struct mac_radix_cfg *)ta_state; tinfo->flags = IPFW_TATFLAGS_AFDATA | IPFW_TATFLAGS_AFITEM; tinfo->taclass4 = IPFW_TACLASS_RADIX; tinfo->count4 = cfg->count; tinfo->itemsize4 = sizeof(struct mac_radix_entry); } static int ta_dump_mac_radix_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent) { struct mac_radix_entry *n = (struct mac_radix_entry *)e; memcpy(tent->k.mac, n->sa.mac_addr.octet, ETHER_ADDR_LEN); tent->masklen = n->masklen; tent->subtype = AF_LINK; tent->v.kidx = n->value; return (0); } static int ta_find_mac_radix_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent) { struct radix_node_head *rnh; void *e; e = NULL; if (tent->subtype == AF_LINK) { struct sa_mac sa; KEY_LEN(sa) = KEY_LEN_MAC; memcpy(sa.mac_addr.octet, tent->k.mac, ETHER_ADDR_LEN); rnh = (struct radix_node_head *)ti->state; e = rnh->rnh_matchaddr(&sa, &rnh->rh); } if (e != NULL) { ta_dump_mac_radix_tentry(ta_state, ti, e, tent); return (0); } return (ENOENT); } struct table_algo mac_radix = { .name = "mac:radix", .type = IPFW_TABLE_MAC, .flags = TA_FLAG_DEFAULT, .ta_buf_size = sizeof(struct ta_buf_radix), .init = ta_init_mac_radix, .destroy = ta_destroy_mac_radix, .prepare_add = ta_prepare_add_mac_radix, .prepare_del = ta_prepare_del_mac_radix, .add = ta_add_mac_radix, .del = ta_del_mac_radix, .flush_entry = ta_flush_radix_entry, .foreach = ta_foreach_mac_radix, .dump_tentry = ta_dump_mac_radix_tentry, .find_tentry = ta_find_mac_radix_tentry, .dump_tinfo = ta_dump_mac_radix_tinfo, .need_modify = ta_need_modify_radix, }; void ipfw_table_algo_init(struct ip_fw_chain *ch) { size_t sz; /* * Register all algorithms presented here. 
*/ sz = sizeof(struct table_algo); ipfw_add_table_algo(ch, &addr_radix, sz, &addr_radix.idx); ipfw_add_table_algo(ch, &addr_hash, sz, &addr_hash.idx); ipfw_add_table_algo(ch, &iface_idx, sz, &iface_idx.idx); ipfw_add_table_algo(ch, &number_array, sz, &number_array.idx); ipfw_add_table_algo(ch, &flow_hash, sz, &flow_hash.idx); ipfw_add_table_algo(ch, &addr_kfib, sz, &addr_kfib.idx); ipfw_add_table_algo(ch, &mac_radix, sz, &mac_radix.idx); } void ipfw_table_algo_destroy(struct ip_fw_chain *ch) { ipfw_del_table_algo(ch, addr_radix.idx); ipfw_del_table_algo(ch, addr_hash.idx); ipfw_del_table_algo(ch, iface_idx.idx); ipfw_del_table_algo(ch, number_array.idx); ipfw_del_table_algo(ch, flow_hash.idx); ipfw_del_table_algo(ch, addr_kfib.idx); ipfw_del_table_algo(ch, mac_radix.idx); } diff --git a/sys/netpfil/ipfw/ip_fw_table_value.c b/sys/netpfil/ipfw/ip_fw_table_value.c index 431707ae5d74..e09323cd02c3 100644 --- a/sys/netpfil/ipfw/ip_fw_table_value.c +++ b/sys/netpfil/ipfw/ip_fw_table_value.c @@ -1,804 +1,770 @@ /*- - * Copyright (c) 2014 Yandex LLC + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2014-2025 Yandex LLC * Copyright (c) 2014 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include /* * Multi-field value support for ipfw tables. * * This file contains necessary functions to convert * large multi-field values into u32 indices suitable to be fed * to various table algorithms. Other machinery like proper refcounting, * internal structures resizing are also kept here. */ #include "opt_ipfw.h" #include #include #include #include #include #include #include #include #include #include #include #include /* ip_fw.h requires IFNAMSIZ */ #include #include /* struct ipfw_rule_ref */ #include #include #include static uint32_t hash_table_value(struct namedobj_instance *ni, const void *key, uint32_t kopt); static int cmp_table_value(struct named_object *no, const void *key, uint32_t kopt); static int list_table_values(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd); static struct ipfw_sopt_handler scodes[] = { - { IP_FW_TABLE_VLIST, 0, HDIR_GET, list_table_values }, + { IP_FW_TABLE_VLIST, IP_FW3_OPVER, HDIR_GET, list_table_values }, }; #define CHAIN_TO_VI(chain) (CHAIN_TO_TCFG(chain)->valhash) struct table_val_link { struct named_object no; struct table_value *pval; /* Pointer to real table value */ }; #define VALDATA_START_SIZE 64 /* Allocate 64-items array by default */ +#define VALDATA_HASH_SIZE 65536 struct vdump_args { struct ip_fw_chain *ch; struct sockopt_data *sd; struct table_value *pval; int error; }; static uint32_t hash_table_value(struct namedobj_instance *ni, const void *key, uint32_t kopt) { return 
(hash32_buf(key, 56, 0)); } static int cmp_table_value(struct named_object *no, const void *key, uint32_t kopt) { return (memcmp(((struct table_val_link *)no)->pval, key, 56)); } static void mask_table_value(struct table_value *src, struct table_value *dst, uint32_t mask) { #define _MCPY(f, b) if ((mask & (b)) != 0) { dst->f = src->f; } memset(dst, 0, sizeof(*dst)); _MCPY(tag, IPFW_VTYPE_TAG); _MCPY(pipe, IPFW_VTYPE_PIPE); _MCPY(divert, IPFW_VTYPE_DIVERT); _MCPY(skipto, IPFW_VTYPE_SKIPTO); _MCPY(netgraph, IPFW_VTYPE_NETGRAPH); _MCPY(fib, IPFW_VTYPE_FIB); _MCPY(nat, IPFW_VTYPE_NAT); _MCPY(limit, IPFW_VTYPE_LIMIT); _MCPY(mark, IPFW_VTYPE_MARK); _MCPY(dscp, IPFW_VTYPE_DSCP); _MCPY(nh4, IPFW_VTYPE_NH4); _MCPY(nh6, IPFW_VTYPE_NH6); _MCPY(zoneid, IPFW_VTYPE_NH6); #undef _MCPY } static void get_value_ptrs(struct ip_fw_chain *ch, struct table_config *tc, int vshared, struct table_value **ptv, struct namedobj_instance **pvi) { struct table_value *pval; struct namedobj_instance *vi; if (vshared != 0) { pval = (struct table_value *)ch->valuestate; vi = CHAIN_TO_VI(ch); } else { pval = NULL; vi = NULL; //pval = (struct table_value *)&tc->ti.data; } if (ptv != NULL) *ptv = pval; if (pvi != NULL) *pvi = vi; } /* * Update pointers to real vaues after @pval change. */ static int update_tvalue(struct namedobj_instance *ni, struct named_object *no, void *arg) { struct vdump_args *da; struct table_val_link *ptv; struct table_value *pval; da = (struct vdump_args *)arg; ptv = (struct table_val_link *)no; pval = da->pval; ptv->pval = &pval[ptv->no.kidx]; ptv->no.name = (char *)&pval[ptv->no.kidx]; return (0); } /* * Grows value storage shared among all tables. * Drops/reacquires UH locks. * Notifies other running adds on @ch shared storage resize. * Note function does not guarantee that free space * will be available after invocation, so one caller needs * to roll cycle himself. * * Returns 0 if case of no errors. 
*/ static int resize_shared_value_storage(struct ip_fw_chain *ch) { struct tables_config *tcfg; struct namedobj_instance *vi; struct table_value *pval, *valuestate, *old_valuestate; void *new_idx; struct vdump_args da; int new_blocks; int val_size, val_size_old; IPFW_UH_WLOCK_ASSERT(ch); valuestate = NULL; new_idx = NULL; pval = (struct table_value *)ch->valuestate; vi = CHAIN_TO_VI(ch); tcfg = CHAIN_TO_TCFG(ch); val_size = tcfg->val_size * 2; if (val_size == (1 << 30)) return (ENOSPC); IPFW_UH_WUNLOCK(ch); valuestate = malloc(sizeof(struct table_value) * val_size, M_IPFW, M_WAITOK | M_ZERO); ipfw_objhash_bitmap_alloc(val_size, (void *)&new_idx, &new_blocks); IPFW_UH_WLOCK(ch); /* * Check if we still need to resize */ if (tcfg->val_size >= val_size) goto done; /* Update pointers and notify everyone we're changing @ch */ pval = (struct table_value *)ch->valuestate; rollback_toperation_state(ch, ch); /* Good. Let's merge */ memcpy(valuestate, pval, sizeof(struct table_value) * tcfg->val_size); ipfw_objhash_bitmap_merge(CHAIN_TO_VI(ch), &new_idx, &new_blocks); IPFW_WLOCK(ch); /* Change pointers */ old_valuestate = ch->valuestate; ch->valuestate = valuestate; valuestate = old_valuestate; ipfw_objhash_bitmap_swap(CHAIN_TO_VI(ch), &new_idx, &new_blocks); val_size_old = tcfg->val_size; tcfg->val_size = val_size; val_size = val_size_old; IPFW_WUNLOCK(ch); /* Update pointers to reflect resize */ memset(&da, 0, sizeof(da)); da.pval = (struct table_value *)ch->valuestate; ipfw_objhash_foreach(vi, update_tvalue, &da); done: free(valuestate, M_IPFW); ipfw_objhash_bitmap_free(new_idx, new_blocks); return (0); } /* * Drops reference for table value with index @kidx, stored in @pval and * @vi. Frees value if it has no references. 
*/ static void unref_table_value(struct namedobj_instance *vi, struct table_value *pval, uint32_t kidx) { struct table_val_link *ptvl; KASSERT(pval[kidx].refcnt > 0, ("Refcount is 0 on kidx %d", kidx)); if (--pval[kidx].refcnt > 0) return; /* Last reference, delete item */ ptvl = (struct table_val_link *)ipfw_objhash_lookup_kidx(vi, kidx); KASSERT(ptvl != NULL, ("lookup on value kidx %d failed", kidx)); ipfw_objhash_del(vi, &ptvl->no); ipfw_objhash_free_idx(vi, kidx); free(ptvl, M_IPFW); } struct flush_args { struct ip_fw_chain *ch; struct table_algo *ta; struct table_info *ti; void *astate; ipfw_obj_tentry tent; }; static int unref_table_value_cb(void *e, void *arg) { struct flush_args *fa; struct ip_fw_chain *ch; struct table_algo *ta; ipfw_obj_tentry *tent; int error; fa = (struct flush_args *)arg; ta = fa->ta; memset(&fa->tent, 0, sizeof(fa->tent)); tent = &fa->tent; error = ta->dump_tentry(fa->astate, fa->ti, e, tent); if (error != 0) return (error); ch = fa->ch; unref_table_value(CHAIN_TO_VI(ch), (struct table_value *)ch->valuestate, tent->v.kidx); return (0); } /* * Drop references for each value used in @tc. */ void ipfw_unref_table_values(struct ip_fw_chain *ch, struct table_config *tc, struct table_algo *ta, void *astate, struct table_info *ti) { struct flush_args fa; IPFW_UH_WLOCK_ASSERT(ch); memset(&fa, 0, sizeof(fa)); fa.ch = ch; fa.ta = ta; fa.astate = astate; fa.ti = ti; ta->foreach(astate, ti, unref_table_value_cb, &fa); } /* * Table operation state handler. * Called when we are going to change something in @tc which * may lead to inconsistencies in on-going table data addition. * * Here we rollback all already committed state (table values, currently) * and set "modified" field to non-zero value to indicate * that we need to restart original operation. 
*/ void rollback_table_values(struct tableop_state *ts) { struct ip_fw_chain *ch; struct table_value *pval; struct tentry_info *ptei; struct namedobj_instance *vi; int i; ch = ts->ch; IPFW_UH_WLOCK_ASSERT(ch); /* Get current table value pointer */ get_value_ptrs(ch, ts->tc, ts->vshared, &pval, &vi); for (i = 0; i < ts->count; i++) { ptei = &ts->tei[i]; if (ptei->value == 0) continue; unref_table_value(vi, pval, ptei->value); } } /* * Allocate new value index in either shared or per-table array. * Function may drop/reacquire UH lock. * * Returns 0 on success. */ static int alloc_table_vidx(struct ip_fw_chain *ch, struct tableop_state *ts, - struct namedobj_instance *vi, uint16_t *pvidx, uint8_t flags) + struct namedobj_instance *vi, uint32_t *pvidx, uint8_t flags) { int error, vlimit; - uint16_t vidx; + uint32_t vidx; IPFW_UH_WLOCK_ASSERT(ch); error = ipfw_objhash_alloc_idx(vi, &vidx); if (error != 0) { /* * We need to resize array. This involves * lock/unlock, so we need to check "modified" * state. */ ts->opstate.func(ts->tc, &ts->opstate); error = resize_shared_value_storage(ch); return (error); /* ts->modified should be set, we will restart */ } vlimit = ts->ta->vlimit; if (vlimit != 0 && vidx >= vlimit && !(flags & IPFW_CTF_ATOMIC)) { /* * Algorithm is not able to store given index. * We have to rollback state, start using * per-table value array or return error * if we're already using it. */ if (ts->vshared != 0) { /* shared -> per-table */ return (ENOSPC); /* TODO: proper error */ } /* per-table. Fail for now. */ return (ENOSPC); /* TODO: proper error */ } *pvidx = vidx; return (0); } /* * Drops value reference for unused values (updates, deletes, partially * successful adds or rollbacks). 
*/ void ipfw_garbage_table_values(struct ip_fw_chain *ch, struct table_config *tc, struct tentry_info *tei, uint32_t count, int rollback) { int i; struct tentry_info *ptei; struct table_value *pval; struct namedobj_instance *vi; /* * We have two slightly different ADD cases here: * either (1) we are successful / partially successful, * in that case we need * * to ignore ADDED entries values * * rollback every other values if atomicity is not * * required (either UPDATED since old value has been * stored there, or some failure like EXISTS or LIMIT * or simply "ignored" case. * * (2): atomic rollback of partially successful operation * in that case we simply need to unref all entries. * * DELETE case is simpler: no atomic support there, so * we simply unref all non-zero values. */ /* * Get current table value pointers. * XXX: Properly read vshared */ get_value_ptrs(ch, tc, 1, &pval, &vi); for (i = 0; i < count; i++) { ptei = &tei[i]; if (ptei->value == 0) { /* * We may be deleting non-existing record. * Skip. */ continue; } if ((ptei->flags & TEI_FLAGS_ADDED) != 0 && rollback == 0) { ptei->value = 0; continue; } unref_table_value(vi, pval, ptei->value); ptei->value = 0; } } /* * Main function used to link values of entries going to be added, * to the index. Since we may perform many UH locks drops/acquires, * handle changes by checking tablestate "modified" field. * * Success: return 0. */ int ipfw_link_table_values(struct ip_fw_chain *ch, struct tableop_state *ts, uint8_t flags) { int error, i, found; struct namedobj_instance *vi; struct table_config *tc; struct tentry_info *tei, *ptei; - uint32_t count, vlimit; - uint16_t vidx; + uint32_t count, vidx, vlimit; struct table_val_link *ptv; struct table_value tval, *pval; /* * Stage 1: reference all existing values and * save their indices. 
*/ IPFW_UH_WLOCK_ASSERT(ch); get_value_ptrs(ch, ts->tc, ts->vshared, &pval, &vi); error = 0; found = 0; vlimit = ts->ta->vlimit; vidx = 0; tc = ts->tc; tei = ts->tei; count = ts->count; for (i = 0; i < count; i++) { ptei = &tei[i]; ptei->value = 0; /* Ensure value is always 0 in the beginning */ mask_table_value(ptei->pvalue, &tval, ts->vmask); ptv = (struct table_val_link *)ipfw_objhash_lookup_name(vi, 0, (char *)&tval); if (ptv == NULL) continue; /* Deal with vlimit later */ if (vlimit > 0 && vlimit <= ptv->no.kidx) continue; /* Value found. Bump refcount */ ptv->pval->refcnt++; ptei->value = ptv->no.kidx; found++; } if (ts->count == found) { /* We've found all values , no need ts create new ones */ return (0); } /* * we have added some state here, let's attach operation * state ts the list ts be able ts rollback if necessary. */ add_toperation_state(ch, ts); /* Ensure table won't disappear */ tc_ref(tc); IPFW_UH_WUNLOCK(ch); /* * Stage 2: allocate objects for non-existing values. */ for (i = 0; i < count; i++) { ptei = &tei[i]; if (ptei->value != 0) continue; if (ptei->ptv != NULL) continue; ptei->ptv = malloc(sizeof(struct table_val_link), M_IPFW, M_WAITOK | M_ZERO); } /* * Stage 3: allocate index numbers for new values * and link them to index. */ IPFW_UH_WLOCK(ch); tc_unref(tc); del_toperation_state(ch, ts); if (ts->modified != 0) { /* * In general, we should free all state/indexes here * and return. However, we keep allocated state instead * to ensure we achieve some progress on each restart. 
*/ return (0); } KASSERT(pval == ch->valuestate, ("resize_storage() notify failure")); /* Let's try to link values */ for (i = 0; i < count; i++) { ptei = &tei[i]; /* Check if record has appeared */ mask_table_value(ptei->pvalue, &tval, ts->vmask); ptv = (struct table_val_link *)ipfw_objhash_lookup_name(vi, 0, (char *)&tval); if (ptv != NULL) { ptv->pval->refcnt++; ptei->value = ptv->no.kidx; continue; } /* May perform UH unlock/lock */ error = alloc_table_vidx(ch, ts, vi, &vidx, flags); if (error != 0) { ts->opstate.func(ts->tc, &ts->opstate); return (error); } /* value storage resize has happened, return */ if (ts->modified != 0) return (0); /* Finally, we have allocated valid index, let's add entry */ ptei->value = vidx; ptv = (struct table_val_link *)ptei->ptv; ptei->ptv = NULL; ptv->no.kidx = vidx; ptv->no.name = (char *)&pval[vidx]; ptv->pval = &pval[vidx]; memcpy(ptv->pval, &tval, sizeof(struct table_value)); pval[vidx].refcnt = 1; ipfw_objhash_add(vi, &ptv->no); } return (0); } -/* - * Compatibility function used to import data from old - * IP_FW_TABLE_ADD / IP_FW_TABLE_XADD opcodes. - */ -void -ipfw_import_table_value_legacy(uint32_t value, struct table_value *v) -{ - - memset(v, 0, sizeof(*v)); - v->tag = value; - v->pipe = value; - v->divert = value; - v->skipto = value; - v->netgraph = value; - v->fib = value; - v->nat = value; - v->nh4 = value; /* host format */ - v->dscp = value; - v->limit = value; - v->mark = value; -} - -/* - * Export data to legacy table dumps opcodes. - */ -uint32_t -ipfw_export_table_value_legacy(struct table_value *v) -{ - - /* - * TODO: provide more compatibility depending on - * vmask value. - */ - return (v->tag); -} - /* * Imports table value from current userland format. * Saves value in kernel format to the same place. 
*/ void ipfw_import_table_value_v1(ipfw_table_value *iv) { struct table_value v; memset(&v, 0, sizeof(v)); v.tag = iv->tag; v.pipe = iv->pipe; v.divert = iv->divert; v.skipto = iv->skipto; v.netgraph = iv->netgraph; v.fib = iv->fib; v.nat = iv->nat; v.dscp = iv->dscp; v.nh4 = iv->nh4; v.nh6 = iv->nh6; v.limit = iv->limit; v.zoneid = iv->zoneid; v.mark = iv->mark; memcpy(iv, &v, sizeof(ipfw_table_value)); } /* * Export real table value @v to current userland format. * Note that @v and @piv may point to the same memory. */ void ipfw_export_table_value_v1(struct table_value *v, ipfw_table_value *piv) { ipfw_table_value iv; memset(&iv, 0, sizeof(iv)); iv.tag = v->tag; iv.pipe = v->pipe; iv.divert = v->divert; iv.skipto = v->skipto; iv.netgraph = v->netgraph; iv.fib = v->fib; iv.nat = v->nat; iv.dscp = v->dscp; iv.limit = v->limit; iv.nh4 = v->nh4; iv.nh6 = v->nh6; iv.zoneid = v->zoneid; iv.mark = v->mark; memcpy(piv, &iv, sizeof(iv)); } /* * Exports real value data into ipfw_table_value structure including refcnt. 
*/ static int dump_tvalue(struct namedobj_instance *ni, struct named_object *no, void *arg) { struct vdump_args *da; struct table_val_link *ptv; ipfw_table_value *v; da = (struct vdump_args *)arg; ptv = (struct table_val_link *)no; v = (ipfw_table_value *)ipfw_get_sopt_space(da->sd, sizeof(*v)); /* Out of memory, returning */ if (v == NULL) { da->error = ENOMEM; return (ENOMEM); } ipfw_export_table_value_v1(ptv->pval, v); v->refcnt = ptv->pval->refcnt; v->kidx = ptv->no.kidx; return (0); } /* * Dumps all shared/table value data * Data layout (v1)(current): * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size * Reply: [ ipfw_obj_lheader ipfw_table_value x N ] * * Returns 0 on success */ static int list_table_values(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { struct _ipfw_obj_lheader *olh; struct namedobj_instance *vi; struct vdump_args da; uint32_t count, size; olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh)); if (olh == NULL) return (EINVAL); if (sd->valsize < olh->size) return (EINVAL); IPFW_UH_RLOCK(ch); vi = CHAIN_TO_VI(ch); count = ipfw_objhash_count(vi); size = count * sizeof(ipfw_table_value) + sizeof(ipfw_obj_lheader); /* Fill in header regadless of buffer size */ olh->count = count; olh->objsize = sizeof(ipfw_table_value); if (size > olh->size) { olh->size = size; IPFW_UH_RUNLOCK(ch); return (ENOMEM); } olh->size = size; /* * Do the actual value dump */ memset(&da, 0, sizeof(da)); da.ch = ch; da.sd = sd; ipfw_objhash_foreach(vi, dump_tvalue, &da); IPFW_UH_RUNLOCK(ch); return (0); } void ipfw_table_value_init(struct ip_fw_chain *ch, int first) { struct tables_config *tcfg; ch->valuestate = malloc(VALDATA_START_SIZE * sizeof(struct table_value), M_IPFW, M_WAITOK | M_ZERO); tcfg = ch->tblcfg; tcfg->val_size = VALDATA_START_SIZE; - tcfg->valhash = ipfw_objhash_create(tcfg->val_size); + tcfg->valhash = ipfw_objhash_create(tcfg->val_size, VALDATA_HASH_SIZE); ipfw_objhash_set_funcs(tcfg->valhash, 
hash_table_value, cmp_table_value); IPFW_ADD_SOPT_HANDLER(first, scodes); } static int destroy_value(struct namedobj_instance *ni, struct named_object *no, void *arg) { free(no, M_IPFW); return (0); } void ipfw_table_value_destroy(struct ip_fw_chain *ch, int last) { IPFW_DEL_SOPT_HANDLER(last, scodes); free(ch->valuestate, M_IPFW); ipfw_objhash_foreach(CHAIN_TO_VI(ch), destroy_value, ch); ipfw_objhash_destroy(CHAIN_TO_VI(ch)); } diff --git a/sys/netpfil/ipfw/nat64/ip_fw_nat64.h b/sys/netpfil/ipfw/nat64/ip_fw_nat64.h index da5d0dc2dfce..b6645383a4ac 100644 --- a/sys/netpfil/ipfw/nat64/ip_fw_nat64.h +++ b/sys/netpfil/ipfw/nat64/ip_fw_nat64.h @@ -1,58 +1,106 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2015-2019 Yandex LLC * Copyright (c) 2015-2019 Andrey V. Elsukov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef _IP_FW_NAT64_H_ #define _IP_FW_NAT64_H_ #define DPRINTF(mask, fmt, ...) \ if (V_nat64_debug & (mask)) \ printf("NAT64: %s: " fmt "\n", __func__, ## __VA_ARGS__) #define DP_GENERIC 0x0001 #define DP_OBJ 0x0002 #define DP_JQUEUE 0x0004 #define DP_STATE 0x0008 #define DP_DROPS 0x0010 #define DP_ALL 0xFFFF VNET_DECLARE(int, nat64_debug); #define V_nat64_debug VNET(nat64_debug) #if 0 #define NAT64NOINLINE __noinline #else #define NAT64NOINLINE #endif int nat64stl_init(struct ip_fw_chain *ch, int first); void nat64stl_uninit(struct ip_fw_chain *ch, int last); int nat64lsn_init(struct ip_fw_chain *ch, int first); void nat64lsn_uninit(struct ip_fw_chain *ch, int last); int nat64clat_init(struct ip_fw_chain *ch, int first); void nat64clat_uninit(struct ip_fw_chain *ch, int last); +#define NAT64_DEFINE_OPCODE_REWRITER(mod, name, ops) \ +static int \ +mod ## _classify(ipfw_insn *cmd0, uint32_t *puidx, uint8_t *ptype) \ +{ \ + ipfw_insn *icmd; \ + icmd = cmd0 - F_LEN(cmd0); \ + if (icmd->opcode != O_EXTERNAL_ACTION || \ + insntod(icmd, kidx)->kidx != V_ ## mod ## _eid) \ + return (1); \ + *puidx = insntod(cmd0, kidx)->kidx; \ + *ptype = 0; \ + return (0); \ +} \ +static void \ +mod ## _update_kidx(ipfw_insn *cmd0, uint32_t idx) \ +{ \ + insntod(cmd0, kidx)->kidx = idx; \ +} \ +static int \ +mod ## _findbyname(struct ip_fw_chain *ch, struct tid_info *ti, \ + struct named_object **pno) \ +{ \ + return (ipfw_objhash_find_type(CHAIN_TO_SRV(ch), ti, \ + IPFW_TLV_## name ## 
_NAME, pno)); \ +} \ +static struct named_object * \ +mod ## _findbykidx(struct ip_fw_chain *ch, uint32_t idx) \ +{ \ + struct namedobj_instance *ni; \ + struct named_object *no; \ + IPFW_UH_WLOCK_ASSERT(ch); \ + ni = CHAIN_TO_SRV(ch); \ + no = ipfw_objhash_lookup_kidx(ni, idx); \ + KASSERT(no != NULL, ("NAT with index %u not found", idx)); \ + return (no); \ +} \ +static struct opcode_obj_rewrite ops[] = { \ + { \ + .opcode = O_EXTERNAL_INSTANCE, \ + .etlv = IPFW_TLV_EACTION /* just show it isn't table */,\ + .classifier = mod ## _classify, \ + .update = mod ## _update_kidx, \ + .find_byname = mod ## _findbyname, \ + .find_bykidx = mod ## _findbykidx, \ + .manage_sets = mod ## _manage_sets, \ + }, \ +} + #endif /* _IP_FW_NAT64_H_ */ diff --git a/sys/netpfil/ipfw/nat64/nat64clat.c b/sys/netpfil/ipfw/nat64/nat64clat.c index 88f576025d5f..d524652e9a99 100644 --- a/sys/netpfil/ipfw/nat64/nat64clat.c +++ b/sys/netpfil/ipfw/nat64/nat64clat.c @@ -1,252 +1,252 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2019 Yandex LLC * Copyright (c) 2019 Andrey V. Elsukov * Copyright (c) 2019 Boris N. Lytochkin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "nat64clat.h" #define NAT64_LOOKUP(chain, cmd) \ - (struct nat64clat_cfg *)SRV_OBJECT((chain), (cmd)->arg1) + (struct nat64clat_cfg *)SRV_OBJECT((chain), insntod(cmd, kidx)->kidx) static void nat64clat_log(struct pfloghdr *plog, struct mbuf *m, sa_family_t family, uint32_t kidx) { static uint32_t pktid = 0; memset(plog, 0, sizeof(*plog)); - plog->length = PFLOG_HDRLEN; + plog->length = PFLOG_REAL_HDRLEN; plog->af = family; plog->action = PF_NAT; plog->dir = PF_IN; plog->rulenr = htonl(kidx); pktid++; plog->subrulenr = htonl(pktid); plog->ruleset[0] = '\0'; strlcpy(plog->ifname, "NAT64CLAT", sizeof(plog->ifname)); ipfw_bpf_mtap2(plog, PFLOG_HDRLEN, m); } static int nat64clat_handle_ip4(struct ip_fw_chain *chain, struct nat64clat_cfg *cfg, struct mbuf *m) { struct pfloghdr loghdr, *logdata; struct in6_addr saddr, daddr; struct ip *ip; ip = mtod(m, struct ip*); /* source address for CLAT may be private with no harm */ if (nat64_check_ip4(ip->ip_src.s_addr) != 0 || nat64_check_ip4(ip->ip_dst.s_addr) != 0 || nat64_check_private_ip4(&cfg->base, ip->ip_dst.s_addr) != 0) return (NAT64SKIP); memcpy(&saddr, &cfg->base.clat_prefix, sizeof(saddr)); nat64_embed_ip4(&saddr, cfg->base.clat_plen, ip->ip_src.s_addr); memcpy(&daddr, &cfg->base.plat_prefix, 
sizeof(daddr)); nat64_embed_ip4(&daddr, cfg->base.plat_plen, ip->ip_dst.s_addr); if (cfg->base.flags & NAT64_LOG) { logdata = &loghdr; nat64clat_log(logdata, m, AF_INET, cfg->no.kidx); } else logdata = NULL; return (nat64_do_handle_ip4(m, &saddr, &daddr, 0, &cfg->base, logdata)); } static int nat64clat_handle_ip6(struct ip_fw_chain *chain, struct nat64clat_cfg *cfg, struct mbuf *m) { struct pfloghdr loghdr, *logdata; struct ip6_hdr *ip6; uint32_t aaddr; /* * NOTE: we expect ipfw_chk() did m_pullup() up to upper level * protocol's headers. Also we skip some checks, that ip6_input(), * ip6_forward(), ip6_fastfwd() and ipfw_chk() already did. */ ip6 = mtod(m, struct ip6_hdr *); /* Check ip6_dst matches configured prefix */ if (memcmp(&ip6->ip6_dst, &cfg->base.clat_prefix, cfg->base.clat_plen / 8) != 0) return (NAT64SKIP); /* Check ip6_src matches configured prefix */ if (memcmp(&ip6->ip6_src, &cfg->base.plat_prefix, cfg->base.plat_plen / 8) != 0) return (NAT64SKIP); if (cfg->base.flags & NAT64_LOG) { logdata = &loghdr; nat64clat_log(logdata, m, AF_INET6, cfg->no.kidx); } else logdata = NULL; aaddr = nat64_extract_ip4(&ip6->ip6_src, cfg->base.plat_plen); return (nat64_do_handle_ip6(m, aaddr, 0, &cfg->base, logdata)); } static int nat64clat_handle_icmp6(struct ip_fw_chain *chain, struct nat64clat_cfg *cfg, struct mbuf *m) { struct pfloghdr loghdr, *logdata; struct nat64_counters *stats; struct ip6_hdr *ip6i; struct icmp6_hdr *icmp6; uint32_t daddr; int hlen, proto; hlen = 0; stats = &cfg->base.stats; proto = nat64_getlasthdr(m, &hlen); if (proto != IPPROTO_ICMPV6) { NAT64STAT_INC(stats, dropped); return (NAT64MFREE); } icmp6 = mtodo(m, hlen); switch (icmp6->icmp6_type) { case ICMP6_DST_UNREACH: case ICMP6_PACKET_TOO_BIG: case ICMP6_TIME_EXCEED_TRANSIT: case ICMP6_PARAM_PROB: break; default: NAT64STAT_INC(stats, dropped); return (NAT64MFREE); } hlen += sizeof(struct icmp6_hdr); if (m->m_pkthdr.len < hlen + sizeof(struct ip6_hdr) + ICMP_MINLEN) { NAT64STAT_INC(stats, 
dropped); return (NAT64MFREE); } if (m->m_len < hlen + sizeof(struct ip6_hdr) + ICMP_MINLEN) m = m_pullup(m, hlen + sizeof(struct ip6_hdr) + ICMP_MINLEN); if (m == NULL) { NAT64STAT_INC(stats, nomem); return (NAT64RETURN); } /* * Use destination address from inner IPv6 header to determine * IPv4 mapped address. */ ip6i = mtodo(m, hlen); daddr = nat64_extract_ip4(&ip6i->ip6_dst, cfg->base.clat_plen); if (daddr == 0) { NAT64STAT_INC(stats, dropped); return (NAT64MFREE); } if (cfg->base.flags & NAT64_LOG) { logdata = &loghdr; nat64clat_log(logdata, m, AF_INET6, cfg->no.kidx); } else logdata = NULL; return (nat64_handle_icmp6(m, 0, daddr, 0, &cfg->base, logdata)); } int ipfw_nat64clat(struct ip_fw_chain *chain, struct ip_fw_args *args, ipfw_insn *cmd, int *done) { ipfw_insn *icmd; struct nat64clat_cfg *cfg; int ret; IPFW_RLOCK_ASSERT(chain); *done = 0; /* try next rule if not matched */ - icmd = cmd + 1; + icmd = cmd + F_LEN(cmd); if (cmd->opcode != O_EXTERNAL_ACTION || - cmd->arg1 != V_nat64clat_eid || + insntod(cmd, kidx)->kidx != V_nat64clat_eid || icmd->opcode != O_EXTERNAL_INSTANCE || (cfg = NAT64_LOOKUP(chain, icmd)) == NULL) return (0); switch (args->f_id.addr_type) { case 4: ret = nat64clat_handle_ip4(chain, cfg, args->m); break; case 6: ret = nat64clat_handle_ip6(chain, cfg, args->m); break; default: return (0); } if (ret == NAT64SKIP) { /* * In case when packet is ICMPv6 message from an intermediate * router, the source address of message will not match the * addresses from configured prefixes. 
*/ if (args->f_id.proto != IPPROTO_ICMPV6) return (0); ret = nat64clat_handle_icmp6(chain, cfg, args->m); } if (ret == NAT64SKIP) return (0); *done = 1; /* terminate the search */ if (ret == NAT64MFREE) m_freem(args->m); args->m = NULL; return (IP_FW_NAT64); } diff --git a/sys/netpfil/ipfw/nat64/nat64clat.h b/sys/netpfil/ipfw/nat64/nat64clat.h index 25560fe07b8d..4b90363c8ecd 100644 --- a/sys/netpfil/ipfw/nat64/nat64clat.h +++ b/sys/netpfil/ipfw/nat64/nat64clat.h @@ -1,52 +1,52 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2019 Yandex LLC * Copyright (c) 2019 Andrey V. Elsukov * Copyright (c) 2019 Boris N. Lytochkin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef _IP_FW_NAT64CLAT_H_ #define _IP_FW_NAT64CLAT_H_ #include "ip_fw_nat64.h" #include "nat64_translate.h" struct nat64clat_cfg { struct named_object no; struct nat64_config base; #define NAT64CLAT_FLAGSMASK \ (NAT64_LOG | NAT64_ALLOW_PRIVATE) /* flags to pass to userland */ char name[64]; }; -VNET_DECLARE(uint16_t, nat64clat_eid); +VNET_DECLARE(uint32_t, nat64clat_eid); #define V_nat64clat_eid VNET(nat64clat_eid) #define IPFW_TLV_NAT64CLAT_NAME IPFW_TLV_EACTION_NAME(V_nat64clat_eid) int ipfw_nat64clat(struct ip_fw_chain *chain, struct ip_fw_args *args, ipfw_insn *cmd, int *done); #endif diff --git a/sys/netpfil/ipfw/nat64/nat64clat_control.c b/sys/netpfil/ipfw/nat64/nat64clat_control.c index 27f7e09c2317..63327646f506 100644 --- a/sys/netpfil/ipfw/nat64/nat64clat_control.c +++ b/sys/netpfil/ipfw/nat64/nat64clat_control.c @@ -1,609 +1,551 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2019 Yandex LLC * Copyright (c) 2019 Andrey V. Elsukov * Copyright (c) 2019 Boris N. Lytochkin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "nat64clat.h" -VNET_DEFINE(uint16_t, nat64clat_eid) = 0; +VNET_DEFINE(uint32_t, nat64clat_eid) = 0; static struct nat64clat_cfg *nat64clat_alloc_config(const char *name, uint8_t set); static void nat64clat_free_config(struct nat64clat_cfg *cfg); static struct nat64clat_cfg *nat64clat_find(struct namedobj_instance *ni, const char *name, uint8_t set); static struct nat64clat_cfg * nat64clat_alloc_config(const char *name, uint8_t set) { struct nat64clat_cfg *cfg; cfg = malloc(sizeof(struct nat64clat_cfg), M_IPFW, M_WAITOK | M_ZERO); COUNTER_ARRAY_ALLOC(cfg->base.stats.cnt, NAT64STATS, M_WAITOK); cfg->no.name = cfg->name; cfg->no.etlv = IPFW_TLV_NAT64CLAT_NAME; cfg->no.set = set; strlcpy(cfg->name, name, sizeof(cfg->name)); return (cfg); } static void nat64clat_free_config(struct nat64clat_cfg *cfg) { COUNTER_ARRAY_FREE(cfg->base.stats.cnt, NAT64STATS); free(cfg, M_IPFW); } static void nat64clat_export_config(struct ip_fw_chain *ch, struct nat64clat_cfg *cfg, ipfw_nat64clat_cfg *uc) { uc->plat_prefix = cfg->base.plat_prefix; uc->plat_plen = cfg->base.plat_plen; uc->clat_prefix = cfg->base.clat_prefix; uc->clat_plen = cfg->base.clat_plen; uc->flags = cfg->base.flags & NAT64CLAT_FLAGSMASK; uc->set = cfg->no.set; strlcpy(uc->name, 
cfg->no.name, sizeof(uc->name)); } struct nat64clat_dump_arg { struct ip_fw_chain *ch; struct sockopt_data *sd; }; static int export_config_cb(struct namedobj_instance *ni, struct named_object *no, void *arg) { struct nat64clat_dump_arg *da = (struct nat64clat_dump_arg *)arg; ipfw_nat64clat_cfg *uc; uc = (ipfw_nat64clat_cfg *)ipfw_get_sopt_space(da->sd, sizeof(*uc)); nat64clat_export_config(da->ch, (struct nat64clat_cfg *)no, uc); return (0); } static struct nat64clat_cfg * nat64clat_find(struct namedobj_instance *ni, const char *name, uint8_t set) { struct nat64clat_cfg *cfg; cfg = (struct nat64clat_cfg *)ipfw_objhash_lookup_name_type(ni, set, IPFW_TLV_NAT64CLAT_NAME, name); return (cfg); } /* * Creates new consumer-side nat64 translator instance. * Data layout (v0)(current): * Request: [ ipfw_obj_lheader ipfw_nat64clat_cfg ] * * Returns 0 on success */ static int nat64clat_create(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_lheader *olh; ipfw_nat64clat_cfg *uc; struct namedobj_instance *ni; struct nat64clat_cfg *cfg; if (sd->valsize != sizeof(*olh) + sizeof(*uc)) return (EINVAL); olh = (ipfw_obj_lheader *)sd->kbuf; uc = (ipfw_nat64clat_cfg *)(olh + 1); if (ipfw_check_object_name_generic(uc->name) != 0) return (EINVAL); if (uc->set >= IPFW_MAX_SETS || nat64_check_prefix6(&uc->plat_prefix, uc->plat_plen) != 0 || nat64_check_prefix6(&uc->clat_prefix, uc->clat_plen) != 0) return (EINVAL); ni = CHAIN_TO_SRV(ch); IPFW_UH_RLOCK(ch); if (nat64clat_find(ni, uc->name, uc->set) != NULL) { IPFW_UH_RUNLOCK(ch); return (EEXIST); } IPFW_UH_RUNLOCK(ch); cfg = nat64clat_alloc_config(uc->name, uc->set); cfg->base.plat_prefix = uc->plat_prefix; cfg->base.plat_plen = uc->plat_plen; cfg->base.clat_prefix = uc->clat_prefix; cfg->base.clat_plen = uc->clat_plen; cfg->base.flags = (uc->flags & NAT64CLAT_FLAGSMASK) | NAT64_CLATPFX | NAT64_PLATPFX; if (IN6_IS_ADDR_WKPFX(&cfg->base.plat_prefix)) cfg->base.flags |= NAT64_WKPFX; IPFW_UH_WLOCK(ch); if 
(nat64clat_find(ni, uc->name, uc->set) != NULL) { IPFW_UH_WUNLOCK(ch); nat64clat_free_config(cfg); return (EEXIST); } if (ipfw_objhash_alloc_idx(ni, &cfg->no.kidx) != 0) { IPFW_UH_WUNLOCK(ch); nat64clat_free_config(cfg); return (ENOSPC); } ipfw_objhash_add(CHAIN_TO_SRV(ch), &cfg->no); /* Okay, let's link data */ SRV_OBJECT(ch, cfg->no.kidx) = cfg; IPFW_UH_WUNLOCK(ch); return (0); } /* * Change existing nat64clat instance configuration. * Data layout (v0)(current): * Request: [ ipfw_obj_header ipfw_nat64clat_cfg ] * Reply: [ ipfw_obj_header ipfw_nat64clat_cfg ] * * Returns 0 on success */ static int nat64clat_config(struct ip_fw_chain *ch, ip_fw3_opheader *op, struct sockopt_data *sd) { ipfw_obj_header *oh; ipfw_nat64clat_cfg *uc; struct nat64clat_cfg *cfg; struct namedobj_instance *ni; uint32_t flags; if (sd->valsize != sizeof(*oh) + sizeof(*uc)) return (EINVAL); oh = (ipfw_obj_header *)ipfw_get_sopt_space(sd, sizeof(*oh) + sizeof(*uc)); uc = (ipfw_nat64clat_cfg *)(oh + 1); if (ipfw_check_object_name_generic(oh->ntlv.name) != 0 || oh->ntlv.set >= IPFW_MAX_SETS) return (EINVAL); ni = CHAIN_TO_SRV(ch); if (sd->sopt->sopt_dir == SOPT_GET) { IPFW_UH_RLOCK(ch); cfg = nat64clat_find(ni, oh->ntlv.name, oh->ntlv.set); if (cfg == NULL) { IPFW_UH_RUNLOCK(ch); return (ENOENT); } nat64clat_export_config(ch, cfg, uc); IPFW_UH_RUNLOCK(ch); return (0); } IPFW_UH_WLOCK(ch); cfg = nat64clat_find(ni, oh->ntlv.name, oh->ntlv.set); if (cfg == NULL) { IPFW_UH_WUNLOCK(ch); return (ENOENT); } /* * For now allow to change only following values: * plat_prefix, plat_plen, clat_prefix, clat_plen, flags. 
*/ flags = 0; if (uc->plat_plen != cfg->base.plat_plen || !IN6_ARE_ADDR_EQUAL(&uc->plat_prefix, &cfg->base.plat_prefix)) { if (nat64_check_prefix6(&uc->plat_prefix, uc->plat_plen) != 0) { IPFW_UH_WUNLOCK(ch); return (EINVAL); } flags |= NAT64_PLATPFX; } if (uc->clat_plen != cfg->base.clat_plen || !IN6_ARE_ADDR_EQUAL(&uc->clat_prefix, &cfg->base.clat_prefix)) { if (nat64_check_prefix6(&uc->clat_prefix, uc->clat_plen) != 0) { IPFW_UH_WUNLOCK(ch); return (EINVAL); } flags |= NAT64_CLATPFX; } if (flags != 0) { IPFW_WLOCK(ch); if (flags & NAT64_PLATPFX) { cfg->base.plat_prefix = uc->plat_prefix; cfg->base.plat_plen = uc->plat_plen; } if (flags & NAT64_CLATPFX) { cfg->base.clat_prefix = uc->clat_prefix; cfg->base.clat_plen = uc->clat_plen; } IPFW_WUNLOCK(ch); } cfg->base.flags &= ~NAT64CLAT_FLAGSMASK; cfg->base.flags |= uc->flags & NAT64CLAT_FLAGSMASK; IPFW_UH_WUNLOCK(ch); return (0); } static void nat64clat_detach_config(struct ip_fw_chain *ch, struct nat64clat_cfg *cfg) { IPFW_UH_WLOCK_ASSERT(ch); ipfw_objhash_del(CHAIN_TO_SRV(ch), &cfg->no); ipfw_objhash_free_idx(CHAIN_TO_SRV(ch), cfg->no.kidx); } /* * Destroys nat64 instance. 
* Data layout (v0)(current): * Request: [ ipfw_obj_header ] * * Returns 0 on success */ static int nat64clat_destroy(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_header *oh; struct nat64clat_cfg *cfg; if (sd->valsize != sizeof(*oh)) return (EINVAL); oh = (ipfw_obj_header *)sd->kbuf; if (ipfw_check_object_name_generic(oh->ntlv.name) != 0) return (EINVAL); IPFW_UH_WLOCK(ch); cfg = nat64clat_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set); if (cfg == NULL) { IPFW_UH_WUNLOCK(ch); return (ENOENT); } if (cfg->no.refcnt > 0) { IPFW_UH_WUNLOCK(ch); return (EBUSY); } ipfw_reset_eaction_instance(ch, V_nat64clat_eid, cfg->no.kidx); SRV_OBJECT(ch, cfg->no.kidx) = NULL; nat64clat_detach_config(ch, cfg); IPFW_UH_WUNLOCK(ch); nat64clat_free_config(cfg); return (0); } /* * Lists all nat64clat instances currently available in kernel. * Data layout (v0)(current): * Request: [ ipfw_obj_lheader ] * Reply: [ ipfw_obj_lheader ipfw_nat64clat_cfg x N ] * * Returns 0 on success */ static int nat64clat_list(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_lheader *olh; struct nat64clat_dump_arg da; /* Check minimum header size */ if (sd->valsize < sizeof(ipfw_obj_lheader)) return (EINVAL); olh = (ipfw_obj_lheader *)ipfw_get_sopt_header(sd, sizeof(*olh)); IPFW_UH_RLOCK(ch); olh->count = ipfw_objhash_count_type(CHAIN_TO_SRV(ch), IPFW_TLV_NAT64CLAT_NAME); olh->objsize = sizeof(ipfw_nat64clat_cfg); olh->size = sizeof(*olh) + olh->count * olh->objsize; if (sd->valsize < olh->size) { IPFW_UH_RUNLOCK(ch); return (ENOMEM); } memset(&da, 0, sizeof(da)); da.ch = ch; da.sd = sd; ipfw_objhash_foreach_type(CHAIN_TO_SRV(ch), export_config_cb, &da, IPFW_TLV_NAT64CLAT_NAME); IPFW_UH_RUNLOCK(ch); return (0); } #define __COPY_STAT_FIELD(_cfg, _stats, _field) \ (_stats)->_field = NAT64STAT_FETCH(&(_cfg)->base.stats, _field) static void export_stats(struct ip_fw_chain *ch, struct nat64clat_cfg *cfg, struct ipfw_nat64clat_stats *stats) { 
__COPY_STAT_FIELD(cfg, stats, opcnt64); __COPY_STAT_FIELD(cfg, stats, opcnt46); __COPY_STAT_FIELD(cfg, stats, ofrags); __COPY_STAT_FIELD(cfg, stats, ifrags); __COPY_STAT_FIELD(cfg, stats, oerrors); __COPY_STAT_FIELD(cfg, stats, noroute4); __COPY_STAT_FIELD(cfg, stats, noroute6); __COPY_STAT_FIELD(cfg, stats, noproto); __COPY_STAT_FIELD(cfg, stats, nomem); __COPY_STAT_FIELD(cfg, stats, dropped); } /* * Get nat64clat statistics. * Data layout (v0)(current): * Request: [ ipfw_obj_header ] * Reply: [ ipfw_obj_header ipfw_obj_ctlv [ uint64_t x N ]] * * Returns 0 on success */ static int nat64clat_stats(struct ip_fw_chain *ch, ip_fw3_opheader *op, struct sockopt_data *sd) { struct ipfw_nat64clat_stats stats; struct nat64clat_cfg *cfg; ipfw_obj_header *oh; ipfw_obj_ctlv *ctlv; size_t sz; sz = sizeof(ipfw_obj_header) + sizeof(ipfw_obj_ctlv) + sizeof(stats); if (sd->valsize % sizeof(uint64_t)) return (EINVAL); if (sd->valsize < sz) return (ENOMEM); oh = (ipfw_obj_header *)ipfw_get_sopt_header(sd, sz); if (oh == NULL) return (EINVAL); memset(&stats, 0, sizeof(stats)); IPFW_UH_RLOCK(ch); cfg = nat64clat_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set); if (cfg == NULL) { IPFW_UH_RUNLOCK(ch); return (ENOENT); } export_stats(ch, cfg, &stats); IPFW_UH_RUNLOCK(ch); ctlv = (ipfw_obj_ctlv *)(oh + 1); memset(ctlv, 0, sizeof(*ctlv)); ctlv->head.type = IPFW_TLV_COUNTERS; ctlv->head.length = sz - sizeof(ipfw_obj_header); ctlv->count = sizeof(stats) / sizeof(uint64_t); ctlv->objsize = sizeof(uint64_t); ctlv->version = IPFW_NAT64_VERSION; memcpy(ctlv + 1, &stats, sizeof(stats)); return (0); } /* * Reset nat64clat statistics. 
* Data layout (v0)(current): * Request: [ ipfw_obj_header ] * * Returns 0 on success */ static int nat64clat_reset_stats(struct ip_fw_chain *ch, ip_fw3_opheader *op, struct sockopt_data *sd) { struct nat64clat_cfg *cfg; ipfw_obj_header *oh; if (sd->valsize != sizeof(*oh)) return (EINVAL); oh = (ipfw_obj_header *)sd->kbuf; if (ipfw_check_object_name_generic(oh->ntlv.name) != 0 || oh->ntlv.set >= IPFW_MAX_SETS) return (EINVAL); IPFW_UH_WLOCK(ch); cfg = nat64clat_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set); if (cfg == NULL) { IPFW_UH_WUNLOCK(ch); return (ENOENT); } COUNTER_ARRAY_ZERO(cfg->base.stats.cnt, NAT64STATS); IPFW_UH_WUNLOCK(ch); return (0); } static struct ipfw_sopt_handler scodes[] = { - { IP_FW_NAT64CLAT_CREATE, 0, HDIR_SET, nat64clat_create }, - { IP_FW_NAT64CLAT_DESTROY,0, HDIR_SET, nat64clat_destroy }, - { IP_FW_NAT64CLAT_CONFIG, 0, HDIR_BOTH, nat64clat_config }, - { IP_FW_NAT64CLAT_LIST, 0, HDIR_GET, nat64clat_list }, - { IP_FW_NAT64CLAT_STATS, 0, HDIR_GET, nat64clat_stats }, - { IP_FW_NAT64CLAT_RESET_STATS,0, HDIR_SET, nat64clat_reset_stats }, + { IP_FW_NAT64CLAT_CREATE, IP_FW3_OPVER, HDIR_SET, nat64clat_create }, + { IP_FW_NAT64CLAT_DESTROY, IP_FW3_OPVER, HDIR_SET, nat64clat_destroy }, + { IP_FW_NAT64CLAT_CONFIG, IP_FW3_OPVER, HDIR_BOTH, nat64clat_config }, + { IP_FW_NAT64CLAT_LIST, IP_FW3_OPVER, HDIR_GET, nat64clat_list }, + { IP_FW_NAT64CLAT_STATS, IP_FW3_OPVER, HDIR_GET, nat64clat_stats }, + { IP_FW_NAT64CLAT_RESET_STATS, IP_FW3_OPVER, HDIR_SET, nat64clat_reset_stats }, }; static int -nat64clat_classify(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype) -{ - ipfw_insn *icmd; - - icmd = cmd - 1; - if (icmd->opcode != O_EXTERNAL_ACTION || - icmd->arg1 != V_nat64clat_eid) - return (1); - - *puidx = cmd->arg1; - *ptype = 0; - return (0); -} - -static void -nat64clat_update_arg1(ipfw_insn *cmd, uint16_t idx) -{ - - cmd->arg1 = idx; -} - -static int -nat64clat_findbyname(struct ip_fw_chain *ch, struct tid_info *ti, - struct named_object **pno) -{ - 
int err; - - err = ipfw_objhash_find_type(CHAIN_TO_SRV(ch), ti, - IPFW_TLV_NAT64CLAT_NAME, pno); - return (err); -} - -static struct named_object * -nat64clat_findbykidx(struct ip_fw_chain *ch, uint16_t idx) -{ - struct namedobj_instance *ni; - struct named_object *no; - - IPFW_UH_WLOCK_ASSERT(ch); - ni = CHAIN_TO_SRV(ch); - no = ipfw_objhash_lookup_kidx(ni, idx); - KASSERT(no != NULL, ("NAT with index %d not found", idx)); - - return (no); -} - -static int -nat64clat_manage_sets(struct ip_fw_chain *ch, uint16_t set, uint8_t new_set, +nat64clat_manage_sets(struct ip_fw_chain *ch, uint32_t set, uint8_t new_set, enum ipfw_sets_cmd cmd) { - return (ipfw_obj_manage_sets(CHAIN_TO_SRV(ch), IPFW_TLV_NAT64CLAT_NAME, - set, new_set, cmd)); + return (ipfw_obj_manage_sets(CHAIN_TO_SRV(ch), + IPFW_TLV_NAT64CLAT_NAME, set, new_set, cmd)); } - -static struct opcode_obj_rewrite opcodes[] = { - { - .opcode = O_EXTERNAL_INSTANCE, - .etlv = IPFW_TLV_EACTION /* just show it isn't table */, - .classifier = nat64clat_classify, - .update = nat64clat_update_arg1, - .find_byname = nat64clat_findbyname, - .find_bykidx = nat64clat_findbykidx, - .manage_sets = nat64clat_manage_sets, - }, -}; +NAT64_DEFINE_OPCODE_REWRITER(nat64clat, NAT64CLAT, opcodes); static int destroy_config_cb(struct namedobj_instance *ni, struct named_object *no, void *arg) { struct nat64clat_cfg *cfg; struct ip_fw_chain *ch; ch = (struct ip_fw_chain *)arg; cfg = (struct nat64clat_cfg *)SRV_OBJECT(ch, no->kidx); SRV_OBJECT(ch, no->kidx) = NULL; nat64clat_detach_config(ch, cfg); nat64clat_free_config(cfg); return (0); } int nat64clat_init(struct ip_fw_chain *ch, int first) { V_nat64clat_eid = ipfw_add_eaction(ch, ipfw_nat64clat, "nat64clat"); if (V_nat64clat_eid == 0) return (ENXIO); IPFW_ADD_SOPT_HANDLER(first, scodes); IPFW_ADD_OBJ_REWRITER(first, opcodes); return (0); } void nat64clat_uninit(struct ip_fw_chain *ch, int last) { IPFW_DEL_OBJ_REWRITER(last, opcodes); IPFW_DEL_SOPT_HANDLER(last, scodes); 
ipfw_del_eaction(ch, V_nat64clat_eid); /* * Since we already have deregistered external action, * our named objects become unaccessible via rules, because * all rules were truncated by ipfw_del_eaction(). * So, we can unlink and destroy our named objects without holding * IPFW_WLOCK(). */ IPFW_UH_WLOCK(ch); ipfw_objhash_foreach_type(CHAIN_TO_SRV(ch), destroy_config_cb, ch, IPFW_TLV_NAT64CLAT_NAME); V_nat64clat_eid = 0; IPFW_UH_WUNLOCK(ch); } diff --git a/sys/netpfil/ipfw/nat64/nat64lsn.c b/sys/netpfil/ipfw/nat64/nat64lsn.c index e6d0167b52a1..5d2ee7ee3b34 100644 --- a/sys/netpfil/ipfw/nat64/nat64lsn.c +++ b/sys/netpfil/ipfw/nat64/nat64lsn.c @@ -1,1807 +1,1959 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * - * Copyright (c) 2015-2019 Yandex LLC + * Copyright (c) 2015-2020 Yandex LLC * Copyright (c) 2015 Alexander V. Chernikov - * Copyright (c) 2016-2019 Andrey V. Elsukov + * Copyright (c) 2016-2020 Andrey V. Elsukov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "nat64lsn.h" MALLOC_DEFINE(M_NAT64LSN, "NAT64LSN", "NAT64LSN"); #define NAT64LSN_EPOCH_ENTER(et) NET_EPOCH_ENTER(et) #define NAT64LSN_EPOCH_EXIT(et) NET_EPOCH_EXIT(et) #define NAT64LSN_EPOCH_ASSERT() NET_EPOCH_ASSERT() #define NAT64LSN_EPOCH_CALL(c, f) NET_EPOCH_CALL((f), (c)) static uma_zone_t nat64lsn_host_zone; static uma_zone_t nat64lsn_pgchunk_zone; static uma_zone_t nat64lsn_pg_zone; static uma_zone_t nat64lsn_aliaslink_zone; static uma_zone_t nat64lsn_state_zone; static uma_zone_t nat64lsn_job_zone; static void nat64lsn_periodic(void *data); #define PERIODIC_DELAY 4 #define NAT64_LOOKUP(chain, cmd) \ - (struct nat64lsn_cfg *)SRV_OBJECT((chain), (cmd)->arg1) + (struct nat64lsn_instance *)SRV_OBJECT((chain), insntod(cmd, kidx)->kidx) /* * Delayed job queue, used to create new hosts * and new portgroups */ enum nat64lsn_jtype { JTYPE_NEWHOST = 1, JTYPE_NEWPORTGROUP, JTYPE_DESTROY, }; struct nat64lsn_job_item { STAILQ_ENTRY(nat64lsn_job_item) entries; enum nat64lsn_jtype jtype; union { struct { /* used by JTYPE_NEWHOST, JTYPE_NEWPORTGROUP */ struct mbuf *m; struct nat64lsn_host *host; struct nat64lsn_state *state; uint32_t src6_hval; uint32_t state_hval; 
struct ipfw_flow_id f_id; in_addr_t faddr; uint16_t port; uint8_t proto; uint8_t done; }; struct { /* used by JTYPE_DESTROY */ struct nat64lsn_hosts_slist hosts; struct nat64lsn_pg_slist portgroups; struct nat64lsn_pgchunk *pgchunk; struct epoch_context epoch_ctx; }; }; }; static struct mtx jmtx; #define JQUEUE_LOCK_INIT() mtx_init(&jmtx, "qlock", NULL, MTX_DEF) #define JQUEUE_LOCK_DESTROY() mtx_destroy(&jmtx) #define JQUEUE_LOCK() mtx_lock(&jmtx) #define JQUEUE_UNLOCK() mtx_unlock(&jmtx) static int nat64lsn_alloc_host(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji); static int nat64lsn_alloc_pg(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji); static struct nat64lsn_job_item *nat64lsn_create_job( struct nat64lsn_cfg *cfg, int jtype); static void nat64lsn_enqueue_job(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji); static void nat64lsn_job_destroy(epoch_context_t ctx); static void nat64lsn_destroy_host(struct nat64lsn_host *host); static void nat64lsn_destroy_pg(struct nat64lsn_pg *pg); static int nat64lsn_translate4(struct nat64lsn_cfg *cfg, const struct ipfw_flow_id *f_id, struct mbuf **mp); static int nat64lsn_translate6(struct nat64lsn_cfg *cfg, struct ipfw_flow_id *f_id, struct mbuf **mp); static int nat64lsn_translate6_internal(struct nat64lsn_cfg *cfg, struct mbuf **mp, struct nat64lsn_state *state, uint8_t flags); #define NAT64_BIT_TCP_FIN 0 /* FIN was seen */ #define NAT64_BIT_TCP_SYN 1 /* First syn in->out */ #define NAT64_BIT_TCP_ESTAB 2 /* Packet with Ack */ #define NAT64_BIT_READY_IPV4 6 /* state is ready for translate4 */ #define NAT64_BIT_STALE 7 /* state is going to be expired */ #define NAT64_FLAG_FIN (1 << NAT64_BIT_TCP_FIN) #define NAT64_FLAG_SYN (1 << NAT64_BIT_TCP_SYN) #define NAT64_FLAG_ESTAB (1 << NAT64_BIT_TCP_ESTAB) #define NAT64_FLAGS_TCP (NAT64_FLAG_SYN|NAT64_FLAG_ESTAB|NAT64_FLAG_FIN) #define NAT64_FLAG_READY (1 << NAT64_BIT_READY_IPV4) #define NAT64_FLAG_STALE (1 << NAT64_BIT_STALE) static inline uint8_t 
convert_tcp_flags(uint8_t flags) { uint8_t result; result = flags & (TH_FIN|TH_SYN); result |= (flags & TH_RST) >> 2; /* Treat RST as FIN */ result |= (flags & TH_ACK) >> 2; /* Treat ACK as estab */ return (result); } static void nat64lsn_log(struct pfloghdr *plog, struct mbuf *m, sa_family_t family, struct nat64lsn_state *state) { memset(plog, 0, sizeof(*plog)); - plog->length = PFLOG_HDRLEN; + plog->length = PFLOG_REAL_HDRLEN; plog->af = family; plog->action = PF_NAT; plog->dir = PF_IN; plog->rulenr = htonl(state->ip_src); plog->subrulenr = htonl((uint32_t)(state->aport << 16) | (state->proto << 8) | (state->ip_dst & 0xff)); plog->ruleset[0] = '\0'; strlcpy(plog->ifname, "NAT64LSN", sizeof(plog->ifname)); ipfw_bpf_mtap2(plog, PFLOG_HDRLEN, m); } #define HVAL(p, n, s) jenkins_hash32((const uint32_t *)(p), (n), (s)) #define HOST_HVAL(c, a) HVAL((a),\ sizeof(struct in6_addr) / sizeof(uint32_t), (c)->hash_seed) #define HOSTS(c, v) ((c)->hosts_hash[(v) & ((c)->hosts_hashsize - 1)]) #define ALIASLINK_HVAL(c, f) HVAL(&(f)->dst_ip6,\ sizeof(struct in6_addr) * 2 / sizeof(uint32_t), (c)->hash_seed) #define ALIAS_BYHASH(c, v) \ ((c)->aliases[(v) & ((1 << (32 - (c)->plen4)) - 1)]) static struct nat64lsn_aliaslink* nat64lsn_get_aliaslink(struct nat64lsn_cfg *cfg __unused, struct nat64lsn_host *host, const struct ipfw_flow_id *f_id __unused) { /* * We can implement some different algorithms how * select an alias address. * XXX: for now we use first available. */ return (CK_SLIST_FIRST(&host->aliases)); } +static struct nat64lsn_alias* +nat64lsn_get_alias(struct nat64lsn_cfg *cfg, + const struct ipfw_flow_id *f_id __unused) +{ + static uint32_t idx = 0; + + /* + * We can choose alias by number of allocated PGs, + * not used yet by other hosts, or some static configured + * by user. + * XXX: for now we choose it using round robin. 
+ */ + return (&ALIAS_BYHASH(cfg, idx++)); +} + #define STATE_HVAL(c, d) HVAL((d), 2, (c)->hash_seed) #define STATE_HASH(h, v) \ ((h)->states_hash[(v) & ((h)->states_hashsize - 1)]) #define STATES_CHUNK(p, v) \ ((p)->chunks_count == 1 ? (p)->states : \ ((p)->states_chunk[CHUNK_BY_FADDR(p, v)])) #ifdef __LP64__ #define FREEMASK_FFSLL(pg, faddr) \ ffsll(*FREEMASK_CHUNK((pg), (faddr))) #define FREEMASK_BTR(pg, faddr, bit) \ ck_pr_btr_64(FREEMASK_CHUNK((pg), (faddr)), (bit)) #define FREEMASK_BTS(pg, faddr, bit) \ ck_pr_bts_64(FREEMASK_CHUNK((pg), (faddr)), (bit)) #define FREEMASK_ISSET(pg, faddr, bit) \ ISSET64(*FREEMASK_CHUNK((pg), (faddr)), (bit)) #define FREEMASK_COPY(pg, n, out) \ (out) = ck_pr_load_64(FREEMASK_CHUNK((pg), (n))) #else static inline int freemask_ffsll(uint32_t *freemask) { int i; if ((i = ffsl(freemask[0])) != 0) return (i); if ((i = ffsl(freemask[1])) != 0) return (i + 32); return (0); } #define FREEMASK_FFSLL(pg, faddr) \ freemask_ffsll(FREEMASK_CHUNK((pg), (faddr))) #define FREEMASK_BTR(pg, faddr, bit) \ ck_pr_btr_32(FREEMASK_CHUNK((pg), (faddr)) + (bit) / 32, (bit) % 32) #define FREEMASK_BTS(pg, faddr, bit) \ ck_pr_bts_32(FREEMASK_CHUNK((pg), (faddr)) + (bit) / 32, (bit) % 32) #define FREEMASK_ISSET(pg, faddr, bit) \ ISSET32(*(FREEMASK_CHUNK((pg), (faddr)) + (bit) / 32), (bit) % 32) #define FREEMASK_COPY(pg, n, out) \ (out) = ck_pr_load_32(FREEMASK_CHUNK((pg), (n))) | \ ((uint64_t)ck_pr_load_32(FREEMASK_CHUNK((pg), (n)) + 1) << 32) #endif /* !__LP64__ */ -#define NAT64LSN_TRY_PGCNT 32 + +#define NAT64LSN_TRY_PGCNT 36 static struct nat64lsn_pg* nat64lsn_get_pg(uint32_t *chunkmask, uint32_t *pgmask, - struct nat64lsn_pgchunk **chunks, struct nat64lsn_pg **pgptr, - uint32_t *pgidx, in_addr_t faddr) + struct nat64lsn_pgchunk **chunks, uint32_t *pgidx, in_addr_t faddr) { - struct nat64lsn_pg *pg, *oldpg; + struct nat64lsn_pg *pg; uint32_t idx, oldidx; int cnt; - cnt = 0; - /* First try last used PG */ - oldpg = pg = ck_pr_load_ptr(pgptr); + /* First 
try last used PG. */ idx = oldidx = ck_pr_load_32(pgidx); - /* If pgidx is out of range, reset it to the first pgchunk */ - if (!ISSET32(*chunkmask, idx / 32)) - idx = 0; + MPASS(idx < 1024); + cnt = 0; do { ck_pr_fence_load(); - if (pg != NULL && FREEMASK_BITCOUNT(pg, faddr) > 0) { - /* - * If last used PG has not free states, - * try to update pointer. - * NOTE: it can be already updated by jobs handler, - * thus we use CAS operation. - */ + if (idx > 1023 || !ISSET32(*chunkmask, idx / 32)) { + /* If it is first try, reset idx to first PG */ + idx = 0; + /* Stop if idx is out of range */ if (cnt > 0) - ck_pr_cas_ptr(pgptr, oldpg, pg); - return (pg); + break; } - /* Stop if idx is out of range */ - if (!ISSET32(*chunkmask, idx / 32)) - break; - - if (ISSET32(pgmask[idx / 32], idx % 32)) + if (ISSET32(pgmask[idx / 32], idx % 32)) { pg = ck_pr_load_ptr( &chunks[idx / 32]->pgptr[idx % 32]); - else - pg = NULL; - + ck_pr_fence_load(); + /* + * Make sure that pg did not become DEAD. + */ + if ((pg->flags & NAT64LSN_DEADPG) == 0 && + FREEMASK_BITCOUNT(pg, faddr) > 0) { + if (cnt > 0) + ck_pr_cas_32(pgidx, oldidx, idx); + return (pg); + } + } idx++; } while (++cnt < NAT64LSN_TRY_PGCNT); - - /* If pgidx is out of range, reset it to the first pgchunk */ - if (!ISSET32(*chunkmask, idx / 32)) - idx = 0; - ck_pr_cas_32(pgidx, oldidx, idx); + if (oldidx != idx) + ck_pr_cas_32(pgidx, oldidx, idx); return (NULL); } static struct nat64lsn_state* nat64lsn_get_state6to4(struct nat64lsn_cfg *cfg, struct nat64lsn_host *host, const struct ipfw_flow_id *f_id, uint32_t hval, in_addr_t faddr, uint16_t port, uint8_t proto) { struct nat64lsn_aliaslink *link; struct nat64lsn_state *state; struct nat64lsn_pg *pg; int i, offset; NAT64LSN_EPOCH_ASSERT(); /* Check that we already have state for given arguments */ CK_SLIST_FOREACH(state, &STATE_HASH(host, hval), entries) { if (state->proto == proto && state->ip_dst == faddr && state->sport == port && state->dport == f_id->dst_port) return 
(state); } link = nat64lsn_get_aliaslink(cfg, host, f_id); if (link == NULL) return (NULL); switch (proto) { case IPPROTO_TCP: - pg = nat64lsn_get_pg( - &link->alias->tcp_chunkmask, link->alias->tcp_pgmask, - link->alias->tcp, &link->alias->tcp_pg, + pg = nat64lsn_get_pg(&link->alias->tcp_chunkmask, + link->alias->tcp_pgmask, link->alias->tcp, &link->alias->tcp_pgidx, faddr); break; case IPPROTO_UDP: - pg = nat64lsn_get_pg( - &link->alias->udp_chunkmask, link->alias->udp_pgmask, - link->alias->udp, &link->alias->udp_pg, + pg = nat64lsn_get_pg(&link->alias->udp_chunkmask, + link->alias->udp_pgmask, link->alias->udp, &link->alias->udp_pgidx, faddr); break; case IPPROTO_ICMP: - pg = nat64lsn_get_pg( - &link->alias->icmp_chunkmask, link->alias->icmp_pgmask, - link->alias->icmp, &link->alias->icmp_pg, + pg = nat64lsn_get_pg(&link->alias->icmp_chunkmask, + link->alias->icmp_pgmask, link->alias->icmp, &link->alias->icmp_pgidx, faddr); break; default: panic("%s: wrong proto %d", __func__, proto); } - if (pg == NULL) + if (pg == NULL || (pg->flags & NAT64LSN_DEADPG) != 0) return (NULL); /* Check that PG has some free states */ state = NULL; i = FREEMASK_BITCOUNT(pg, faddr); while (i-- > 0) { offset = FREEMASK_FFSLL(pg, faddr); if (offset == 0) { /* * We lost the race. * No more free states in this PG. */ break; } /* Lets try to atomically grab the state */ if (FREEMASK_BTR(pg, faddr, offset - 1)) { state = &STATES_CHUNK(pg, faddr)->state[offset - 1]; /* Initialize */ state->flags = proto != IPPROTO_TCP ? 
0 : convert_tcp_flags(f_id->_flags); state->proto = proto; state->aport = pg->base_port + offset - 1; state->dport = f_id->dst_port; state->sport = port; state->ip6_dst = f_id->dst_ip6; state->ip_dst = faddr; state->ip_src = link->alias->addr; state->hval = hval; state->host = host; SET_AGE(state->timestamp); /* Insert new state into host's hash table */ HOST_LOCK(host); + SET_AGE(host->timestamp); CK_SLIST_INSERT_HEAD(&STATE_HASH(host, hval), state, entries); host->states_count++; - /* - * XXX: In case if host is going to be expired, - * reset NAT64LSN_DEADHOST flag. - */ - host->flags &= ~NAT64LSN_DEADHOST; HOST_UNLOCK(host); NAT64STAT_INC(&cfg->base.stats, screated); /* Mark the state as ready for translate4 */ ck_pr_fence_store(); ck_pr_bts_32(&state->flags, NAT64_BIT_READY_IPV4); break; } } return (state); } /* * Inspects icmp packets to see if the message contains different * packet header so we need to alter @addr and @port. */ static int inspect_icmp_mbuf(struct mbuf **mp, uint8_t *proto, uint32_t *addr, uint16_t *port) { struct icmp *icmp; struct ip *ip; int off; uint8_t inner_proto; ip = mtod(*mp, struct ip *); /* Outer IP header */ off = (ip->ip_hl << 2) + ICMP_MINLEN; if ((*mp)->m_len < off) *mp = m_pullup(*mp, off); if (*mp == NULL) return (ENOMEM); ip = mtod(*mp, struct ip *); /* Outer IP header */ icmp = L3HDR(ip, struct icmp *); switch (icmp->icmp_type) { case ICMP_ECHO: case ICMP_ECHOREPLY: /* Use icmp ID as distinguisher */ *port = ntohs(icmp->icmp_id); return (0); case ICMP_UNREACH: case ICMP_TIMXCEED: break; default: return (EOPNOTSUPP); } /* * ICMP_UNREACH and ICMP_TIMXCEED contains IP header + 64 bits * of ULP header. 
*/ if ((*mp)->m_pkthdr.len < off + sizeof(struct ip) + ICMP_MINLEN) return (EINVAL); if ((*mp)->m_len < off + sizeof(struct ip) + ICMP_MINLEN) *mp = m_pullup(*mp, off + sizeof(struct ip) + ICMP_MINLEN); if (*mp == NULL) return (ENOMEM); ip = mtodo(*mp, off); /* Inner IP header */ inner_proto = ip->ip_p; off += ip->ip_hl << 2; /* Skip inner IP header */ *addr = ntohl(ip->ip_src.s_addr); if ((*mp)->m_len < off + ICMP_MINLEN) *mp = m_pullup(*mp, off + ICMP_MINLEN); if (*mp == NULL) return (ENOMEM); switch (inner_proto) { case IPPROTO_TCP: case IPPROTO_UDP: /* Copy source port from the header */ *port = ntohs(*((uint16_t *)mtodo(*mp, off))); *proto = inner_proto; return (0); case IPPROTO_ICMP: /* * We will translate only ICMP errors for our ICMP * echo requests. */ icmp = mtodo(*mp, off); if (icmp->icmp_type != ICMP_ECHO) return (EOPNOTSUPP); *port = ntohs(icmp->icmp_id); return (0); }; return (EOPNOTSUPP); } static struct nat64lsn_state* nat64lsn_get_state4to6(struct nat64lsn_cfg *cfg, struct nat64lsn_alias *alias, in_addr_t faddr, uint16_t port, uint8_t proto) { struct nat64lsn_state *state; struct nat64lsn_pg *pg; int chunk_idx, pg_idx, state_idx; NAT64LSN_EPOCH_ASSERT(); if (port < NAT64_MIN_PORT) return (NULL); /* * Alias keeps 32 pgchunks for each protocol. * Each pgchunk has 32 pointers to portgroup. * Each portgroup has 64 states for ports. */ port -= NAT64_MIN_PORT; chunk_idx = port / 2048; port -= chunk_idx * 2048; pg_idx = port / 64; state_idx = port % 64; /* * First check in proto_chunkmask that we have allocated PG chunk. * Then check in proto_pgmask that we have valid PG pointer. 
*/ pg = NULL; switch (proto) { case IPPROTO_TCP: if (ISSET32(alias->tcp_chunkmask, chunk_idx) && ISSET32(alias->tcp_pgmask[chunk_idx], pg_idx)) { pg = alias->tcp[chunk_idx]->pgptr[pg_idx]; break; } return (NULL); case IPPROTO_UDP: if (ISSET32(alias->udp_chunkmask, chunk_idx) && ISSET32(alias->udp_pgmask[chunk_idx], pg_idx)) { pg = alias->udp[chunk_idx]->pgptr[pg_idx]; break; } return (NULL); case IPPROTO_ICMP: if (ISSET32(alias->icmp_chunkmask, chunk_idx) && ISSET32(alias->icmp_pgmask[chunk_idx], pg_idx)) { pg = alias->icmp[chunk_idx]->pgptr[pg_idx]; break; } return (NULL); default: panic("%s: wrong proto %d", __func__, proto); } if (pg == NULL) return (NULL); if (FREEMASK_ISSET(pg, faddr, state_idx)) return (NULL); state = &STATES_CHUNK(pg, faddr)->state[state_idx]; ck_pr_fence_load(); if (ck_pr_load_32(&state->flags) & NAT64_FLAG_READY) return (state); return (NULL); } /* * Reassemble IPv4 fragments, make PULLUP if needed, get some ULP fields * that might be unknown until reassembling is completed. 
*/ static struct mbuf* nat64lsn_reassemble4(struct nat64lsn_cfg *cfg, struct mbuf *m, uint16_t *port) { struct ip *ip; int len; m = ip_reass(m); if (m == NULL) return (NULL); /* IP header must be contigious after ip_reass() */ ip = mtod(m, struct ip *); len = ip->ip_hl << 2; switch (ip->ip_p) { case IPPROTO_ICMP: - len += ICMP_MINLEN; /* Enough to get icmp_id */ + len += ICMP_MINLEN; break; case IPPROTO_TCP: len += sizeof(struct tcphdr); break; case IPPROTO_UDP: len += sizeof(struct udphdr); break; default: m_freem(m); NAT64STAT_INC(&cfg->base.stats, noproto); return (NULL); } if (m->m_len < len) { m = m_pullup(m, len); if (m == NULL) { NAT64STAT_INC(&cfg->base.stats, nomem); return (NULL); } ip = mtod(m, struct ip *); } switch (ip->ip_p) { case IPPROTO_TCP: *port = ntohs(L3HDR(ip, struct tcphdr *)->th_dport); break; case IPPROTO_UDP: *port = ntohs(L3HDR(ip, struct udphdr *)->uh_dport); break; } return (m); } static int nat64lsn_translate4(struct nat64lsn_cfg *cfg, const struct ipfw_flow_id *f_id, struct mbuf **mp) { struct pfloghdr loghdr, *logdata; struct in6_addr src6; struct nat64lsn_state *state; struct nat64lsn_alias *alias; uint32_t addr, flags; uint16_t port, ts; int ret; uint8_t proto; addr = f_id->dst_ip; port = f_id->dst_port; proto = f_id->proto; if (addr < cfg->prefix4 || addr > cfg->pmask4) { NAT64STAT_INC(&cfg->base.stats, nomatch4); return (cfg->nomatch_verdict); } /* Reassemble fragments if needed */ ret = ntohs(mtod(*mp, struct ip *)->ip_off); if ((ret & (IP_MF | IP_OFFMASK)) != 0) { *mp = nat64lsn_reassemble4(cfg, *mp, &port); if (*mp == NULL) return (IP_FW_DENY); } /* Check if protocol is supported */ switch (proto) { case IPPROTO_ICMP: ret = inspect_icmp_mbuf(mp, &proto, &addr, &port); if (ret != 0) { if (ret == ENOMEM) { NAT64STAT_INC(&cfg->base.stats, nomem); return (IP_FW_DENY); } NAT64STAT_INC(&cfg->base.stats, noproto); return (cfg->nomatch_verdict); } if (addr < cfg->prefix4 || addr > cfg->pmask4) { NAT64STAT_INC(&cfg->base.stats, 
nomatch4); return (cfg->nomatch_verdict); } /* FALLTHROUGH */ case IPPROTO_TCP: case IPPROTO_UDP: break; default: NAT64STAT_INC(&cfg->base.stats, noproto); return (cfg->nomatch_verdict); } alias = &ALIAS_BYHASH(cfg, addr); MPASS(addr == alias->addr); /* Check that we have state for this port */ state = nat64lsn_get_state4to6(cfg, alias, f_id->src_ip, port, proto); if (state == NULL) { NAT64STAT_INC(&cfg->base.stats, nomatch4); return (cfg->nomatch_verdict); } /* TODO: Check flags to see if we need to do some static mapping */ /* Update some state fields if need */ SET_AGE(ts); if (f_id->proto == IPPROTO_TCP) flags = convert_tcp_flags(f_id->_flags); else flags = 0; if (state->timestamp != ts) state->timestamp = ts; if ((state->flags & flags) != flags) state->flags |= flags; port = htons(state->sport); src6 = state->ip6_dst; if (cfg->base.flags & NAT64_LOG) { logdata = &loghdr; nat64lsn_log(logdata, *mp, AF_INET, state); } else logdata = NULL; /* * We already have src6 with embedded address, but it is possible, * that src_ip is different than state->ip_dst, this is why we * do embedding again. */ nat64_embed_ip4(&src6, cfg->base.plat_plen, htonl(f_id->src_ip)); ret = nat64_do_handle_ip4(*mp, &src6, &state->host->addr, port, &cfg->base, logdata); if (ret == NAT64SKIP) return (cfg->nomatch_verdict); if (ret == NAT64RETURN) *mp = NULL; return (IP_FW_DENY); } /* * Check if particular state is stale and should be deleted. * Return 1 if true, 0 otherwise. */ static int nat64lsn_check_state(struct nat64lsn_cfg *cfg, struct nat64lsn_state *state) { int age, ttl; /* State was marked as stale in previous pass. 
*/ if (ISSET32(state->flags, NAT64_BIT_STALE)) return (1); /* State is not yet initialized, it is going to be READY */ if (!ISSET32(state->flags, NAT64_BIT_READY_IPV4)) return (0); age = GET_AGE(state->timestamp); switch (state->proto) { case IPPROTO_TCP: if (ISSET32(state->flags, NAT64_BIT_TCP_FIN)) ttl = cfg->st_close_ttl; else if (ISSET32(state->flags, NAT64_BIT_TCP_ESTAB)) ttl = cfg->st_estab_ttl; else if (ISSET32(state->flags, NAT64_BIT_TCP_SYN)) ttl = cfg->st_syn_ttl; else ttl = cfg->st_syn_ttl; if (age > ttl) return (1); break; case IPPROTO_UDP: if (age > cfg->st_udp_ttl) return (1); break; case IPPROTO_ICMP: if (age > cfg->st_icmp_ttl) return (1); break; } return (0); } +#define PGCOUNT_ADD(alias, proto, value) \ + switch (proto) { \ + case IPPROTO_TCP: (alias)->tcp_pgcount += (value); break; \ + case IPPROTO_UDP: (alias)->udp_pgcount += (value); break; \ + case IPPROTO_ICMP: (alias)->icmp_pgcount += (value); break; \ + } +#define PGCOUNT_INC(alias, proto) PGCOUNT_ADD(alias, proto, 1) +#define PGCOUNT_DEC(alias, proto) PGCOUNT_ADD(alias, proto, -1) + +static inline void +nat64lsn_state_cleanup(struct nat64lsn_state *state) +{ + + /* + * Reset READY flag and wait until it become + * safe for translate4. + */ + ck_pr_btr_32(&state->flags, NAT64_BIT_READY_IPV4); + /* + * And set STALE flag for deferred deletion in the + * next pass of nat64lsn_maintain_pg(). + */ + ck_pr_bts_32(&state->flags, NAT64_BIT_STALE); + ck_pr_fence_store(); +} + static int nat64lsn_maintain_pg(struct nat64lsn_cfg *cfg, struct nat64lsn_pg *pg) { struct nat64lsn_state *state; struct nat64lsn_host *host; uint64_t freemask; int c, i, update_age; update_age = 0; for (c = 0; c < pg->chunks_count; c++) { FREEMASK_COPY(pg, c, freemask); for (i = 0; i < 64; i++) { if (ISSET64(freemask, i)) continue; state = &STATES_CHUNK(pg, c)->state[i]; if (nat64lsn_check_state(cfg, state) == 0) { update_age = 1; continue; } /* * Expire state: * 1. Mark as STALE and unlink from host's hash. * 2. 
Set bit in freemask. */ if (ISSET32(state->flags, NAT64_BIT_STALE)) { /* * State was marked as STALE in previous * pass. Now it is safe to release it. */ state->flags = 0; ck_pr_fence_store(); FREEMASK_BTS(pg, c, i); NAT64STAT_INC(&cfg->base.stats, sdeleted); continue; } MPASS(state->flags & NAT64_FLAG_READY); host = state->host; HOST_LOCK(host); CK_SLIST_REMOVE(&STATE_HASH(host, state->hval), state, nat64lsn_state, entries); - host->states_count--; - HOST_UNLOCK(host); - - /* Reset READY flag */ - ck_pr_btr_32(&state->flags, NAT64_BIT_READY_IPV4); - /* And set STALE flag */ - ck_pr_bts_32(&state->flags, NAT64_BIT_STALE); - ck_pr_fence_store(); /* - * Now translate6 will not use this state, wait - * until it become safe for translate4, then mark - * state as free. + * Now translate6 will not use this state. */ + host->states_count--; + HOST_UNLOCK(host); + nat64lsn_state_cleanup(state); } } /* * We have some alive states, update timestamp. */ if (update_age) SET_AGE(pg->timestamp); if (GET_AGE(pg->timestamp) < cfg->pg_delete_delay) return (0); return (1); } static void nat64lsn_expire_portgroups(struct nat64lsn_cfg *cfg, struct nat64lsn_pg_slist *portgroups) { struct nat64lsn_alias *alias; - struct nat64lsn_pg *pg, *tpg, *firstpg, **pgptr; + struct nat64lsn_pg *pg, *tpg; uint32_t *pgmask, *pgidx; int i, idx; for (i = 0; i < 1 << (32 - cfg->plen4); i++) { alias = &cfg->aliases[i]; CK_SLIST_FOREACH_SAFE(pg, &alias->portgroups, entries, tpg) { if (nat64lsn_maintain_pg(cfg, pg) == 0) continue; /* Always keep first PG */ if (pg->base_port == NAT64_MIN_PORT) continue; /* - * PG is expired, unlink it and schedule for - * deferred destroying. + * PG expires in two passes: + * 1. Reset bit in pgmask, mark it as DEAD. + * 2. Unlink it and schedule for deferred destroying. 
*/ idx = (pg->base_port - NAT64_MIN_PORT) / 64; switch (pg->proto) { case IPPROTO_TCP: pgmask = alias->tcp_pgmask; - pgptr = &alias->tcp_pg; pgidx = &alias->tcp_pgidx; - firstpg = alias->tcp[0]->pgptr[0]; break; case IPPROTO_UDP: pgmask = alias->udp_pgmask; - pgptr = &alias->udp_pg; pgidx = &alias->udp_pgidx; - firstpg = alias->udp[0]->pgptr[0]; break; case IPPROTO_ICMP: pgmask = alias->icmp_pgmask; - pgptr = &alias->icmp_pg; pgidx = &alias->icmp_pgidx; - firstpg = alias->icmp[0]->pgptr[0]; break; } + if (pg->flags & NAT64LSN_DEADPG) { + /* Unlink PG from alias's chain */ + ALIAS_LOCK(alias); + CK_SLIST_REMOVE(&alias->portgroups, pg, + nat64lsn_pg, entries); + PGCOUNT_DEC(alias, pg->proto); + ALIAS_UNLOCK(alias); + /* + * Link it to job's chain for deferred + * destroying. + */ + NAT64STAT_INC(&cfg->base.stats, spgdeleted); + CK_SLIST_INSERT_HEAD(portgroups, pg, entries); + continue; + } + /* Reset the corresponding bit in pgmask array. */ ck_pr_btr_32(&pgmask[idx / 32], idx % 32); + pg->flags |= NAT64LSN_DEADPG; ck_pr_fence_store(); /* If last used PG points to this PG, reset it. */ - ck_pr_cas_ptr(pgptr, pg, firstpg); ck_pr_cas_32(pgidx, idx, 0); - /* Unlink PG from alias's chain */ - ALIAS_LOCK(alias); - CK_SLIST_REMOVE(&alias->portgroups, pg, - nat64lsn_pg, entries); - alias->portgroups_count--; - ALIAS_UNLOCK(alias); - /* And link to job's chain for deferred destroying */ - NAT64STAT_INC(&cfg->base.stats, spgdeleted); - CK_SLIST_INSERT_HEAD(portgroups, pg, entries); } } } static void nat64lsn_expire_hosts(struct nat64lsn_cfg *cfg, struct nat64lsn_hosts_slist *hosts) { struct nat64lsn_host *host, *tmp; int i; for (i = 0; i < cfg->hosts_hashsize; i++) { CK_SLIST_FOREACH_SAFE(host, &cfg->hosts_hash[i], entries, tmp) { /* Is host was marked in previous call? 
*/ if (host->flags & NAT64LSN_DEADHOST) { - if (host->states_count > 0) { + if (host->states_count > 0 || + GET_AGE(host->timestamp) < + cfg->host_delete_delay) { host->flags &= ~NAT64LSN_DEADHOST; continue; } /* * Unlink host from hash table and schedule * it for deferred destroying. */ CFG_LOCK(cfg); CK_SLIST_REMOVE(&cfg->hosts_hash[i], host, nat64lsn_host, entries); cfg->hosts_count--; CFG_UNLOCK(cfg); CK_SLIST_INSERT_HEAD(hosts, host, entries); continue; } - if (GET_AGE(host->timestamp) < cfg->host_delete_delay) - continue; - if (host->states_count > 0) + if (host->states_count > 0 || + GET_AGE(host->timestamp) < cfg->host_delete_delay) continue; /* Mark host as going to be expired in next pass */ host->flags |= NAT64LSN_DEADHOST; ck_pr_fence_store(); } } } static struct nat64lsn_pgchunk* nat64lsn_expire_pgchunk(struct nat64lsn_cfg *cfg) { #if 0 struct nat64lsn_alias *alias; struct nat64lsn_pgchunk *chunk; uint32_t pgmask; int i, c; for (i = 0; i < 1 << (32 - cfg->plen4); i++) { alias = &cfg->aliases[i]; if (GET_AGE(alias->timestamp) < cfg->pgchunk_delete_delay) continue; /* Always keep single chunk allocated */ for (c = 1; c < 32; c++) { if ((alias->tcp_chunkmask & (1 << c)) == 0) break; chunk = ck_pr_load_ptr(&alias->tcp[c]); if (ck_pr_load_32(&alias->tcp_pgmask[c]) != 0) continue; ck_pr_btr_32(&alias->tcp_chunkmask, c); ck_pr_fence_load(); if (ck_pr_load_32(&alias->tcp_pgmask[c]) != 0) continue; } } #endif return (NULL); } #if 0 static void nat64lsn_maintain_hosts(struct nat64lsn_cfg *cfg) { struct nat64lsn_host *h; struct nat64lsn_states_slist *hash; int i, j, hsize; for (i = 0; i < cfg->hosts_hashsize; i++) { CK_SLIST_FOREACH(h, &cfg->hosts_hash[i], entries) { if (h->states_count / 2 < h->states_hashsize || h->states_hashsize >= NAT64LSN_MAX_HSIZE) continue; hsize = h->states_hashsize * 2; hash = malloc(sizeof(*hash)* hsize, M_NOWAIT); if (hash == NULL) continue; for (j = 0; j < hsize; j++) CK_SLIST_INIT(&hash[i]); ck_pr_bts_32(&h->flags, 
NAT64LSN_GROWHASH); } } } #endif /* - * This procedure is used to perform various maintance + * This procedure is used to perform various maintenance * on dynamic hash list. Currently it is called every 4 seconds. */ static void nat64lsn_periodic(void *data) { struct nat64lsn_job_item *ji; struct nat64lsn_cfg *cfg; cfg = (struct nat64lsn_cfg *) data; CURVNET_SET(cfg->vp); if (cfg->hosts_count > 0) { ji = uma_zalloc(nat64lsn_job_zone, M_NOWAIT); if (ji != NULL) { ji->jtype = JTYPE_DESTROY; CK_SLIST_INIT(&ji->hosts); CK_SLIST_INIT(&ji->portgroups); nat64lsn_expire_hosts(cfg, &ji->hosts); nat64lsn_expire_portgroups(cfg, &ji->portgroups); ji->pgchunk = nat64lsn_expire_pgchunk(cfg); NAT64LSN_EPOCH_CALL(&ji->epoch_ctx, nat64lsn_job_destroy); } else NAT64STAT_INC(&cfg->base.stats, jnomem); } callout_schedule(&cfg->periodic, hz * PERIODIC_DELAY); CURVNET_RESTORE(); } #define ALLOC_ERROR(stage, type) ((stage) ? 10 * (type) + (stage): 0) #define HOST_ERROR(stage) ALLOC_ERROR(stage, 1) #define PG_ERROR(stage) ALLOC_ERROR(stage, 2) static int nat64lsn_alloc_host(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji) { char a[INET6_ADDRSTRLEN]; struct nat64lsn_aliaslink *link; struct nat64lsn_host *host; struct nat64lsn_state *state; uint32_t hval, data[2]; int i; /* Check that host was not yet added. */ NAT64LSN_EPOCH_ASSERT(); CK_SLIST_FOREACH(host, &HOSTS(cfg, ji->src6_hval), entries) { if (IN6_ARE_ADDR_EQUAL(&ji->f_id.src_ip6, &host->addr)) { /* The host was allocated in previous call.
*/ ji->host = host; goto get_state; } } host = ji->host = uma_zalloc(nat64lsn_host_zone, M_NOWAIT); if (ji->host == NULL) return (HOST_ERROR(1)); host->states_hashsize = NAT64LSN_HSIZE; host->states_hash = malloc(sizeof(struct nat64lsn_states_slist) * host->states_hashsize, M_NAT64LSN, M_NOWAIT); if (host->states_hash == NULL) { uma_zfree(nat64lsn_host_zone, host); return (HOST_ERROR(2)); } link = uma_zalloc(nat64lsn_aliaslink_zone, M_NOWAIT); if (link == NULL) { free(host->states_hash, M_NAT64LSN); uma_zfree(nat64lsn_host_zone, host); return (HOST_ERROR(3)); } /* Initialize */ HOST_LOCK_INIT(host); SET_AGE(host->timestamp); host->addr = ji->f_id.src_ip6; host->hval = ji->src6_hval; host->flags = 0; host->states_count = 0; - host->states_hashsize = NAT64LSN_HSIZE; CK_SLIST_INIT(&host->aliases); for (i = 0; i < host->states_hashsize; i++) CK_SLIST_INIT(&host->states_hash[i]); - /* Determine alias from flow hash. */ - hval = ALIASLINK_HVAL(cfg, &ji->f_id); - link->alias = &ALIAS_BYHASH(cfg, hval); + link->alias = nat64lsn_get_alias(cfg, &ji->f_id); CK_SLIST_INSERT_HEAD(&host->aliases, link, host_entries); ALIAS_LOCK(link->alias); CK_SLIST_INSERT_HEAD(&link->alias->hosts, link, alias_entries); link->alias->hosts_count++; ALIAS_UNLOCK(link->alias); CFG_LOCK(cfg); CK_SLIST_INSERT_HEAD(&HOSTS(cfg, ji->src6_hval), host, entries); cfg->hosts_count++; CFG_UNLOCK(cfg); get_state: data[0] = ji->faddr; data[1] = (ji->f_id.dst_port << 16) | ji->port; ji->state_hval = hval = STATE_HVAL(cfg, data); state = nat64lsn_get_state6to4(cfg, host, &ji->f_id, hval, ji->faddr, ji->port, ji->proto); /* * We failed to obtain new state, used alias needs new PG. * XXX: or another alias should be used. 
*/ if (state == NULL) { /* Try to allocate new PG */ if (nat64lsn_alloc_pg(cfg, ji) != PG_ERROR(0)) return (HOST_ERROR(4)); /* We assume that nat64lsn_alloc_pg() got state */ } else ji->state = state; ji->done = 1; DPRINTF(DP_OBJ, "ALLOC HOST %s %p", inet_ntop(AF_INET6, &host->addr, a, sizeof(a)), host); return (HOST_ERROR(0)); } static int nat64lsn_find_pg_place(uint32_t *data) { int i; for (i = 0; i < 32; i++) { if (~data[i] == 0) continue; return (i * 32 + ffs(~data[i]) - 1); } return (-1); } static int nat64lsn_alloc_proto_pg(struct nat64lsn_cfg *cfg, - struct nat64lsn_alias *alias, uint32_t *chunkmask, - uint32_t *pgmask, struct nat64lsn_pgchunk **chunks, - struct nat64lsn_pg **pgptr, uint8_t proto) + struct nat64lsn_alias *alias, uint32_t *chunkmask, uint32_t *pgmask, + struct nat64lsn_pgchunk **chunks, uint32_t *pgidx, uint8_t proto) { struct nat64lsn_pg *pg; int i, pg_idx, chunk_idx; /* Find place in pgchunk where PG can be added */ pg_idx = nat64lsn_find_pg_place(pgmask); if (pg_idx < 0) /* no more PGs */ return (PG_ERROR(1)); /* Check that we have allocated pgchunk for given PG index */ chunk_idx = pg_idx / 32; if (!ISSET32(*chunkmask, chunk_idx)) { chunks[chunk_idx] = uma_zalloc(nat64lsn_pgchunk_zone, M_NOWAIT); if (chunks[chunk_idx] == NULL) return (PG_ERROR(2)); ck_pr_bts_32(chunkmask, chunk_idx); ck_pr_fence_store(); } /* Allocate PG and states chunks */ pg = uma_zalloc(nat64lsn_pg_zone, M_NOWAIT); if (pg == NULL) return (PG_ERROR(3)); pg->chunks_count = cfg->states_chunks; if (pg->chunks_count > 1) { pg->freemask_chunk = malloc(pg->chunks_count * sizeof(uint64_t), M_NAT64LSN, M_NOWAIT); if (pg->freemask_chunk == NULL) { uma_zfree(nat64lsn_pg_zone, pg); return (PG_ERROR(4)); } pg->states_chunk = malloc(pg->chunks_count * sizeof(struct nat64lsn_states_chunk *), M_NAT64LSN, M_NOWAIT | M_ZERO); if (pg->states_chunk == NULL) { free(pg->freemask_chunk, M_NAT64LSN); uma_zfree(nat64lsn_pg_zone, pg); return (PG_ERROR(5)); } for (i = 0; i < pg->chunks_count; 
i++) { pg->states_chunk[i] = uma_zalloc( nat64lsn_state_zone, M_NOWAIT); if (pg->states_chunk[i] == NULL) goto states_failed; } memset(pg->freemask_chunk, 0xff, sizeof(uint64_t) * pg->chunks_count); } else { pg->states = uma_zalloc(nat64lsn_state_zone, M_NOWAIT); if (pg->states == NULL) { uma_zfree(nat64lsn_pg_zone, pg); return (PG_ERROR(6)); } memset(&pg->freemask64, 0xff, sizeof(uint64_t)); } /* Initialize PG and hook it to pgchunk */ SET_AGE(pg->timestamp); + pg->flags = 0; pg->proto = proto; pg->base_port = NAT64_MIN_PORT + 64 * pg_idx; ck_pr_store_ptr(&chunks[chunk_idx]->pgptr[pg_idx % 32], pg); ck_pr_fence_store(); - ck_pr_bts_32(&pgmask[pg_idx / 32], pg_idx % 32); - ck_pr_store_ptr(pgptr, pg); + + /* Set bit in pgmask and set index of last used PG */ + ck_pr_bts_32(&pgmask[chunk_idx], pg_idx % 32); + ck_pr_store_32(pgidx, pg_idx); ALIAS_LOCK(alias); CK_SLIST_INSERT_HEAD(&alias->portgroups, pg, entries); SET_AGE(alias->timestamp); - alias->portgroups_count++; + PGCOUNT_INC(alias, proto); ALIAS_UNLOCK(alias); NAT64STAT_INC(&cfg->base.stats, spgcreated); return (PG_ERROR(0)); states_failed: for (i = 0; i < pg->chunks_count; i++) uma_zfree(nat64lsn_state_zone, pg->states_chunk[i]); free(pg->freemask_chunk, M_NAT64LSN); free(pg->states_chunk, M_NAT64LSN); uma_zfree(nat64lsn_pg_zone, pg); return (PG_ERROR(7)); } static int nat64lsn_alloc_pg(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji) { struct nat64lsn_aliaslink *link; struct nat64lsn_alias *alias; int ret; link = nat64lsn_get_aliaslink(cfg, ji->host, &ji->f_id); if (link == NULL) return (PG_ERROR(1)); /* * TODO: check that we did not already allocated PG in * previous call. 
*/ ret = 0; alias = link->alias; /* Find place in pgchunk where PG can be added */ switch (ji->proto) { case IPPROTO_TCP: ret = nat64lsn_alloc_proto_pg(cfg, alias, &alias->tcp_chunkmask, alias->tcp_pgmask, - alias->tcp, &alias->tcp_pg, ji->proto); + alias->tcp, &alias->tcp_pgidx, ji->proto); break; case IPPROTO_UDP: ret = nat64lsn_alloc_proto_pg(cfg, alias, &alias->udp_chunkmask, alias->udp_pgmask, - alias->udp, &alias->udp_pg, ji->proto); + alias->udp, &alias->udp_pgidx, ji->proto); break; case IPPROTO_ICMP: ret = nat64lsn_alloc_proto_pg(cfg, alias, &alias->icmp_chunkmask, alias->icmp_pgmask, - alias->icmp, &alias->icmp_pg, ji->proto); + alias->icmp, &alias->icmp_pgidx, ji->proto); break; default: panic("%s: wrong proto %d", __func__, ji->proto); } if (ret == PG_ERROR(1)) { /* * PG_ERROR(1) means that alias lacks free PGs * XXX: try next alias. */ printf("NAT64LSN: %s: failed to obtain PG\n", __func__); return (ret); } if (ret == PG_ERROR(0)) { ji->state = nat64lsn_get_state6to4(cfg, ji->host, &ji->f_id, ji->state_hval, ji->faddr, ji->port, ji->proto); if (ji->state == NULL) ret = PG_ERROR(8); else ji->done = 1; } return (ret); } static void nat64lsn_do_request(void *data) { struct epoch_tracker et; struct nat64lsn_job_head jhead; struct nat64lsn_job_item *ji, *ji2; struct nat64lsn_cfg *cfg; int jcount; uint8_t flags; cfg = (struct nat64lsn_cfg *)data; if (cfg->jlen == 0) return; CURVNET_SET(cfg->vp); STAILQ_INIT(&jhead); /* Grab queue */ JQUEUE_LOCK(); STAILQ_SWAP(&jhead, &cfg->jhead, nat64lsn_job_item); jcount = cfg->jlen; cfg->jlen = 0; JQUEUE_UNLOCK(); /* TODO: check if we need to resize hash */ NAT64STAT_INC(&cfg->base.stats, jcalls); DPRINTF(DP_JQUEUE, "count=%d", jcount); /* * TODO: * What we should do here is to build a hash * to ensure we don't have lots of duplicate requests. * Skip this for now. 
* * TODO: Limit per-call number of items */ NAT64LSN_EPOCH_ENTER(et); STAILQ_FOREACH(ji, &jhead, entries) { switch (ji->jtype) { case JTYPE_NEWHOST: if (nat64lsn_alloc_host(cfg, ji) != HOST_ERROR(0)) NAT64STAT_INC(&cfg->base.stats, jhostfails); break; case JTYPE_NEWPORTGROUP: if (nat64lsn_alloc_pg(cfg, ji) != PG_ERROR(0)) NAT64STAT_INC(&cfg->base.stats, jportfails); break; default: continue; } if (ji->done != 0) { flags = ji->proto != IPPROTO_TCP ? 0 : convert_tcp_flags(ji->f_id._flags); nat64lsn_translate6_internal(cfg, &ji->m, ji->state, flags); NAT64STAT_INC(&cfg->base.stats, jreinjected); } } NAT64LSN_EPOCH_EXIT(et); ji = STAILQ_FIRST(&jhead); while (ji != NULL) { ji2 = STAILQ_NEXT(ji, entries); /* * In any case we must free mbuf if * translator did not consumed it. */ m_freem(ji->m); uma_zfree(nat64lsn_job_zone, ji); ji = ji2; } CURVNET_RESTORE(); } static struct nat64lsn_job_item * nat64lsn_create_job(struct nat64lsn_cfg *cfg, int jtype) { struct nat64lsn_job_item *ji; /* * Do not try to lock possibly contested mutex if we're near the * limit. Drop packet instead. */ ji = NULL; if (cfg->jlen >= cfg->jmaxlen) NAT64STAT_INC(&cfg->base.stats, jmaxlen); else { ji = uma_zalloc(nat64lsn_job_zone, M_NOWAIT); if (ji == NULL) NAT64STAT_INC(&cfg->base.stats, jnomem); } if (ji == NULL) { NAT64STAT_INC(&cfg->base.stats, dropped); DPRINTF(DP_DROPS, "failed to create job"); } else { ji->jtype = jtype; ji->done = 0; } return (ji); } static void nat64lsn_enqueue_job(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji) { JQUEUE_LOCK(); STAILQ_INSERT_TAIL(&cfg->jhead, ji, entries); NAT64STAT_INC(&cfg->base.stats, jrequests); cfg->jlen++; if (callout_pending(&cfg->jcallout) == 0) callout_reset(&cfg->jcallout, 1, nat64lsn_do_request, cfg); JQUEUE_UNLOCK(); } +/* + * This function is used to clean up the result of less likely possible + * race condition, when host object was deleted, but some translation + * state was created before it is destroyed. 
+ * + * Since the state expiration removes state from host's hash table, + * we need to be sure, that there will not any states, that are linked + * with this host entry. + */ +static void +nat64lsn_host_cleanup(struct nat64lsn_host *host) +{ + struct nat64lsn_state *state, *ts; + int i; + + printf("NAT64LSN: %s: race condition has been detected for host %p\n", + __func__, host); + for (i = 0; i < host->states_hashsize; i++) { + CK_SLIST_FOREACH_SAFE(state, &host->states_hash[i], + entries, ts) { + /* + * We can remove the state without lock, + * because this host entry is unlinked and will + * be destroyed. + */ + CK_SLIST_REMOVE(&host->states_hash[i], state, + nat64lsn_state, entries); + host->states_count--; + nat64lsn_state_cleanup(state); + } + } + MPASS(host->states_count == 0); +} + +/* + * This function is used to clean up the result of less likely possible + * race condition, when portgroup was deleted, but some translation state + * was created before it is destroyed. + * + * Since states entries are accessible via host's hash table, we need + * to be sure, that there will not any states from this PG, that are + * linked with any host entries. + */ +static void +nat64lsn_pg_cleanup(struct nat64lsn_pg *pg) +{ + struct nat64lsn_state *state; + uint64_t usedmask; + int c, i; + + printf("NAT64LSN: %s: race condition has been detected for pg %p\n", + __func__, pg); + for (c = 0; c < pg->chunks_count; c++) { + /* + * Use inverted freemask to find what state was created. + */ + usedmask = ~(*FREEMASK_CHUNK(pg, c)); + if (usedmask == 0) + continue; + for (i = 0; i < 64; i++) { + if (!ISSET64(usedmask, i)) + continue; + state = &STATES_CHUNK(pg, c)->state[i]; + /* + * If we have STALE bit, this means that state + * is already unlinked from host's hash table. + * Thus we can just reset the bit in mask and + * schedule destroying in the next epoch call. 
+ */ + if (ISSET32(state->flags, NAT64_BIT_STALE)) { + FREEMASK_BTS(pg, c, i); + continue; + } + /* + * There is small window, when we have bit + * grabbed from freemask, but state is not yet + * linked into host's hash table. + * Check for READY flag, it is set just after + * linking. If it is not set, defer cleanup + * for next call. + */ + if (ISSET32(state->flags, NAT64_BIT_READY_IPV4)) { + struct nat64lsn_host *host; + + host = state->host; + HOST_LOCK(host); + CK_SLIST_REMOVE(&STATE_HASH(host, + state->hval), state, nat64lsn_state, + entries); + host->states_count--; + HOST_UNLOCK(host); + nat64lsn_state_cleanup(state); + } + } + } +} + static void nat64lsn_job_destroy(epoch_context_t ctx) { + struct nat64lsn_hosts_slist hosts; + struct nat64lsn_pg_slist portgroups; struct nat64lsn_job_item *ji; struct nat64lsn_host *host; struct nat64lsn_pg *pg; int i; + CK_SLIST_INIT(&hosts); + CK_SLIST_INIT(&portgroups); ji = __containerof(ctx, struct nat64lsn_job_item, epoch_ctx); MPASS(ji->jtype == JTYPE_DESTROY); while (!CK_SLIST_EMPTY(&ji->hosts)) { host = CK_SLIST_FIRST(&ji->hosts); CK_SLIST_REMOVE_HEAD(&ji->hosts, entries); if (host->states_count > 0) { /* - * XXX: The state has been created - * during host deletion. + * The state has been created during host deletion. */ printf("NAT64LSN: %s: destroying host with %d " "states\n", __func__, host->states_count); + /* + * We need to cleanup these states to avoid + * possible access to already deleted host in + * the state expiration code. + */ + nat64lsn_host_cleanup(host); + CK_SLIST_INSERT_HEAD(&hosts, host, entries); + /* + * Keep host entry for next deferred destroying. + * In the next epoch its states will be not + * accessible. 
+ */ + continue; } nat64lsn_destroy_host(host); } while (!CK_SLIST_EMPTY(&ji->portgroups)) { pg = CK_SLIST_FIRST(&ji->portgroups); CK_SLIST_REMOVE_HEAD(&ji->portgroups, entries); for (i = 0; i < pg->chunks_count; i++) { if (FREEMASK_BITCOUNT(pg, i) != 64) { /* - * XXX: The state has been created during + * A state has been created during * PG deletion. */ printf("NAT64LSN: %s: destroying PG %p " "with non-empty chunk %d\n", __func__, pg, i); + nat64lsn_pg_cleanup(pg); + CK_SLIST_INSERT_HEAD(&portgroups, + pg, entries); + i = -1; + break; } } - nat64lsn_destroy_pg(pg); + if (i != -1) + nat64lsn_destroy_pg(pg); } - uma_zfree(nat64lsn_pgchunk_zone, ji->pgchunk); - uma_zfree(nat64lsn_job_zone, ji); + if (CK_SLIST_EMPTY(&hosts) && + CK_SLIST_EMPTY(&portgroups)) { + uma_zfree(nat64lsn_pgchunk_zone, ji->pgchunk); + uma_zfree(nat64lsn_job_zone, ji); + return; + } + + /* Schedule job item again */ + CK_SLIST_MOVE(&ji->hosts, &hosts, entries); + CK_SLIST_MOVE(&ji->portgroups, &portgroups, entries); + NAT64LSN_EPOCH_CALL(&ji->epoch_ctx, nat64lsn_job_destroy); } static int nat64lsn_request_host(struct nat64lsn_cfg *cfg, const struct ipfw_flow_id *f_id, struct mbuf **mp, uint32_t hval, in_addr_t faddr, uint16_t port, uint8_t proto) { struct nat64lsn_job_item *ji; ji = nat64lsn_create_job(cfg, JTYPE_NEWHOST); if (ji != NULL) { ji->m = *mp; ji->f_id = *f_id; ji->faddr = faddr; ji->port = port; ji->proto = proto; ji->src6_hval = hval; nat64lsn_enqueue_job(cfg, ji); NAT64STAT_INC(&cfg->base.stats, jhostsreq); *mp = NULL; } return (IP_FW_DENY); } static int nat64lsn_request_pg(struct nat64lsn_cfg *cfg, struct nat64lsn_host *host, const struct ipfw_flow_id *f_id, struct mbuf **mp, uint32_t hval, in_addr_t faddr, uint16_t port, uint8_t proto) { struct nat64lsn_job_item *ji; ji = nat64lsn_create_job(cfg, JTYPE_NEWPORTGROUP); if (ji != NULL) { ji->m = *mp; ji->f_id = *f_id; ji->faddr = faddr; ji->port = port; ji->proto = proto; ji->state_hval = hval; ji->host = host; 
nat64lsn_enqueue_job(cfg, ji); NAT64STAT_INC(&cfg->base.stats, jportreq); *mp = NULL; } return (IP_FW_DENY); } static int nat64lsn_translate6_internal(struct nat64lsn_cfg *cfg, struct mbuf **mp, struct nat64lsn_state *state, uint8_t flags) { struct pfloghdr loghdr, *logdata; int ret; uint16_t ts; /* Update timestamp and flags if needed */ SET_AGE(ts); if (state->timestamp != ts) state->timestamp = ts; if ((state->flags & flags) != 0) state->flags |= flags; if (cfg->base.flags & NAT64_LOG) { logdata = &loghdr; nat64lsn_log(logdata, *mp, AF_INET6, state); } else logdata = NULL; ret = nat64_do_handle_ip6(*mp, htonl(state->ip_src), htons(state->aport), &cfg->base, logdata); if (ret == NAT64SKIP) return (cfg->nomatch_verdict); if (ret == NAT64RETURN) *mp = NULL; return (IP_FW_DENY); } static int nat64lsn_translate6(struct nat64lsn_cfg *cfg, struct ipfw_flow_id *f_id, struct mbuf **mp) { struct nat64lsn_state *state; struct nat64lsn_host *host; struct icmp6_hdr *icmp6; uint32_t addr, hval, data[2]; int offset, proto; uint16_t port; uint8_t flags; /* Check if protocol is supported */ port = f_id->src_port; proto = f_id->proto; switch (f_id->proto) { case IPPROTO_ICMPV6: /* * For ICMPv6 echo reply/request we use icmp6_id as * local port. 
*/ offset = 0; proto = nat64_getlasthdr(*mp, &offset); if (proto < 0) { NAT64STAT_INC(&cfg->base.stats, dropped); DPRINTF(DP_DROPS, "mbuf isn't contigious"); return (IP_FW_DENY); } if (proto == IPPROTO_ICMPV6) { icmp6 = mtodo(*mp, offset); if (icmp6->icmp6_type == ICMP6_ECHO_REQUEST || icmp6->icmp6_type == ICMP6_ECHO_REPLY) port = ntohs(icmp6->icmp6_id); } proto = IPPROTO_ICMP; /* FALLTHROUGH */ case IPPROTO_TCP: case IPPROTO_UDP: break; default: NAT64STAT_INC(&cfg->base.stats, noproto); return (cfg->nomatch_verdict); } /* Extract IPv4 from destination IPv6 address */ addr = nat64_extract_ip4(&f_id->dst_ip6, cfg->base.plat_plen); if (addr == 0 || nat64_check_private_ip4(&cfg->base, addr) != 0) { char a[INET_ADDRSTRLEN]; NAT64STAT_INC(&cfg->base.stats, dropped); DPRINTF(DP_DROPS, "dropped due to embedded IPv4 address %s", inet_ntop(AF_INET, &addr, a, sizeof(a))); return (IP_FW_DENY); /* XXX: add extra stats? */ } /* Try to find host */ hval = HOST_HVAL(cfg, &f_id->src_ip6); CK_SLIST_FOREACH(host, &HOSTS(cfg, hval), entries) { if (IN6_ARE_ADDR_EQUAL(&f_id->src_ip6, &host->addr)) break; } /* We use IPv4 address in host byte order */ addr = ntohl(addr); if (host == NULL) return (nat64lsn_request_host(cfg, f_id, mp, hval, addr, port, proto)); flags = proto != IPPROTO_TCP ? 0 : convert_tcp_flags(f_id->_flags); data[0] = addr; data[1] = (f_id->dst_port << 16) | port; hval = STATE_HVAL(cfg, data); state = nat64lsn_get_state6to4(cfg, host, f_id, hval, addr, port, proto); if (state == NULL) return (nat64lsn_request_pg(cfg, host, f_id, mp, hval, addr, port, proto)); return (nat64lsn_translate6_internal(cfg, mp, state, flags)); } /* * Main dataplane entry point. 
*/ int ipfw_nat64lsn(struct ip_fw_chain *ch, struct ip_fw_args *args, ipfw_insn *cmd, int *done) { - struct nat64lsn_cfg *cfg; + struct nat64lsn_instance *i; ipfw_insn *icmd; int ret; IPFW_RLOCK_ASSERT(ch); *done = 0; /* continue the search in case of failure */ - icmd = cmd + 1; + icmd = cmd + F_LEN(cmd); if (cmd->opcode != O_EXTERNAL_ACTION || - cmd->arg1 != V_nat64lsn_eid || + insntod(cmd, kidx)->kidx != V_nat64lsn_eid || icmd->opcode != O_EXTERNAL_INSTANCE || - (cfg = NAT64_LOOKUP(ch, icmd)) == NULL) + (i = NAT64_LOOKUP(ch, icmd)) == NULL) return (IP_FW_DENY); *done = 1; /* terminate the search */ switch (args->f_id.addr_type) { case 4: - ret = nat64lsn_translate4(cfg, &args->f_id, &args->m); + ret = nat64lsn_translate4(i->cfg, &args->f_id, &args->m); break; case 6: /* * Check that destination IPv6 address matches our prefix6. */ - if ((cfg->base.flags & NAT64LSN_ANYPREFIX) == 0 && - memcmp(&args->f_id.dst_ip6, &cfg->base.plat_prefix, - cfg->base.plat_plen / 8) != 0) { - ret = cfg->nomatch_verdict; + if ((i->cfg->base.flags & NAT64LSN_ANYPREFIX) == 0 && + memcmp(&args->f_id.dst_ip6, &i->cfg->base.plat_prefix, + i->cfg->base.plat_plen / 8) != 0) { + ret = i->cfg->nomatch_verdict; break; } - ret = nat64lsn_translate6(cfg, &args->f_id, &args->m); + ret = nat64lsn_translate6(i->cfg, &args->f_id, &args->m); break; default: - ret = cfg->nomatch_verdict; + ret = i->cfg->nomatch_verdict; } if (ret != IP_FW_PASS && args->m != NULL) { m_freem(args->m); args->m = NULL; } return (ret); } static int nat64lsn_state_ctor(void *mem, int size, void *arg, int flags) { struct nat64lsn_states_chunk *chunk; int i; chunk = (struct nat64lsn_states_chunk *)mem; for (i = 0; i < 64; i++) chunk->state[i].flags = 0; return (0); } void nat64lsn_init_internal(void) { nat64lsn_host_zone = uma_zcreate("NAT64LSN hosts", sizeof(struct nat64lsn_host), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); nat64lsn_pgchunk_zone = uma_zcreate("NAT64LSN portgroup chunks", sizeof(struct nat64lsn_pgchunk), NULL, 
NULL, NULL, NULL, UMA_ALIGN_PTR, 0); nat64lsn_pg_zone = uma_zcreate("NAT64LSN portgroups", sizeof(struct nat64lsn_pg), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); nat64lsn_aliaslink_zone = uma_zcreate("NAT64LSN links", sizeof(struct nat64lsn_aliaslink), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); nat64lsn_state_zone = uma_zcreate("NAT64LSN states", sizeof(struct nat64lsn_states_chunk), nat64lsn_state_ctor, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); nat64lsn_job_zone = uma_zcreate("NAT64LSN jobs", sizeof(struct nat64lsn_job_item), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); JQUEUE_LOCK_INIT(); } void nat64lsn_uninit_internal(void) { /* XXX: epoch_task drain */ JQUEUE_LOCK_DESTROY(); uma_zdestroy(nat64lsn_host_zone); uma_zdestroy(nat64lsn_pgchunk_zone); uma_zdestroy(nat64lsn_pg_zone); uma_zdestroy(nat64lsn_aliaslink_zone); uma_zdestroy(nat64lsn_state_zone); uma_zdestroy(nat64lsn_job_zone); } void nat64lsn_start_instance(struct nat64lsn_cfg *cfg) { CALLOUT_LOCK(cfg); callout_reset(&cfg->periodic, hz * PERIODIC_DELAY, nat64lsn_periodic, cfg); CALLOUT_UNLOCK(cfg); } struct nat64lsn_cfg * -nat64lsn_init_instance(struct ip_fw_chain *ch, in_addr_t prefix, int plen) +nat64lsn_init_config(struct ip_fw_chain *ch, in_addr_t prefix, int plen) { struct nat64lsn_cfg *cfg; struct nat64lsn_alias *alias; int i, naddr; cfg = malloc(sizeof(struct nat64lsn_cfg), M_NAT64LSN, M_WAITOK | M_ZERO); CFG_LOCK_INIT(cfg); CALLOUT_LOCK_INIT(cfg); STAILQ_INIT(&cfg->jhead); cfg->vp = curvnet; COUNTER_ARRAY_ALLOC(cfg->base.stats.cnt, NAT64STATS, M_WAITOK); cfg->hash_seed = arc4random(); cfg->hosts_hashsize = NAT64LSN_HOSTS_HSIZE; cfg->hosts_hash = malloc(sizeof(struct nat64lsn_hosts_slist) * cfg->hosts_hashsize, M_NAT64LSN, M_WAITOK | M_ZERO); for (i = 0; i < cfg->hosts_hashsize; i++) CK_SLIST_INIT(&cfg->hosts_hash[i]); naddr = 1 << (32 - plen); cfg->prefix4 = prefix; cfg->pmask4 = prefix | (naddr - 1); cfg->plen4 = plen; cfg->aliases = malloc(sizeof(struct nat64lsn_alias) * naddr, M_NAT64LSN, M_WAITOK | 
M_ZERO); for (i = 0; i < naddr; i++) { alias = &cfg->aliases[i]; alias->addr = prefix + i; /* host byte order */ CK_SLIST_INIT(&alias->hosts); ALIAS_LOCK_INIT(alias); } callout_init_mtx(&cfg->periodic, &cfg->periodic_lock, 0); callout_init(&cfg->jcallout, CALLOUT_MPSAFE); return (cfg); } static void nat64lsn_destroy_pg(struct nat64lsn_pg *pg) { int i; if (pg->chunks_count == 1) { uma_zfree(nat64lsn_state_zone, pg->states); } else { for (i = 0; i < pg->chunks_count; i++) uma_zfree(nat64lsn_state_zone, pg->states_chunk[i]); free(pg->states_chunk, M_NAT64LSN); free(pg->freemask_chunk, M_NAT64LSN); } uma_zfree(nat64lsn_pg_zone, pg); } static void nat64lsn_destroy_alias(struct nat64lsn_cfg *cfg, struct nat64lsn_alias *alias) { struct nat64lsn_pg *pg; int i; while (!CK_SLIST_EMPTY(&alias->portgroups)) { pg = CK_SLIST_FIRST(&alias->portgroups); CK_SLIST_REMOVE_HEAD(&alias->portgroups, entries); nat64lsn_destroy_pg(pg); } for (i = 0; i < 32; i++) { if (ISSET32(alias->tcp_chunkmask, i)) uma_zfree(nat64lsn_pgchunk_zone, alias->tcp[i]); if (ISSET32(alias->udp_chunkmask, i)) uma_zfree(nat64lsn_pgchunk_zone, alias->udp[i]); if (ISSET32(alias->icmp_chunkmask, i)) uma_zfree(nat64lsn_pgchunk_zone, alias->icmp[i]); } ALIAS_LOCK_DESTROY(alias); } static void nat64lsn_destroy_host(struct nat64lsn_host *host) { struct nat64lsn_aliaslink *link; while (!CK_SLIST_EMPTY(&host->aliases)) { link = CK_SLIST_FIRST(&host->aliases); CK_SLIST_REMOVE_HEAD(&host->aliases, host_entries); ALIAS_LOCK(link->alias); CK_SLIST_REMOVE(&link->alias->hosts, link, nat64lsn_aliaslink, alias_entries); link->alias->hosts_count--; ALIAS_UNLOCK(link->alias); uma_zfree(nat64lsn_aliaslink_zone, link); } HOST_LOCK_DESTROY(host); free(host->states_hash, M_NAT64LSN); uma_zfree(nat64lsn_host_zone, host); } void -nat64lsn_destroy_instance(struct nat64lsn_cfg *cfg) +nat64lsn_destroy_config(struct nat64lsn_cfg *cfg) { struct nat64lsn_host *host; int i; CALLOUT_LOCK(cfg); callout_drain(&cfg->periodic); CALLOUT_UNLOCK(cfg); 
callout_drain(&cfg->jcallout); for (i = 0; i < cfg->hosts_hashsize; i++) { while (!CK_SLIST_EMPTY(&cfg->hosts_hash[i])) { host = CK_SLIST_FIRST(&cfg->hosts_hash[i]); CK_SLIST_REMOVE_HEAD(&cfg->hosts_hash[i], entries); nat64lsn_destroy_host(host); } } for (i = 0; i < (1 << (32 - cfg->plen4)); i++) nat64lsn_destroy_alias(cfg, &cfg->aliases[i]); CALLOUT_LOCK_DESTROY(cfg); CFG_LOCK_DESTROY(cfg); COUNTER_ARRAY_FREE(cfg->base.stats.cnt, NAT64STATS); free(cfg->hosts_hash, M_NAT64LSN); free(cfg->aliases, M_NAT64LSN); free(cfg, M_NAT64LSN); } + diff --git a/sys/netpfil/ipfw/nat64/nat64lsn.h b/sys/netpfil/ipfw/nat64/nat64lsn.h index f255eca22c27..996136b760a2 100644 --- a/sys/netpfil/ipfw/nat64/nat64lsn.h +++ b/sys/netpfil/ipfw/nat64/nat64lsn.h @@ -1,253 +1,275 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * - * Copyright (c) 2015-2019 Yandex LLC + * Copyright (c) 2015-2020 Yandex LLC * Copyright (c) 2015 Alexander V. Chernikov - * Copyright (c) 2015-2019 Andrey V. Elsukov + * Copyright (c) 2015-2020 Andrey V. Elsukov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef _IP_FW_NAT64LSN_H_ #define _IP_FW_NAT64LSN_H_ #include "ip_fw_nat64.h" #include "nat64_translate.h" #define NAT64_MIN_PORT 1024 struct nat64lsn_host; struct nat64lsn_alias; struct nat64lsn_state { /* IPv6 host entry keeps hash table to speedup state lookup */ CK_SLIST_ENTRY(nat64lsn_state) entries; struct nat64lsn_host *host; struct in6_addr ip6_dst; /* Destination IPv6 address */ in_addr_t ip_src; /* Alias IPv4 address */ in_addr_t ip_dst; /* Destination IPv4 address */ uint16_t dport; /* Destination port */ uint16_t sport; /* Source port */ uint32_t hval; uint32_t flags; /* Internal flags */ uint16_t aport; uint16_t timestamp; /* last used */ uint8_t proto; uint8_t _spare[7]; }; struct nat64lsn_states_chunk { struct nat64lsn_state state[64]; }; #define ISSET64(mask, bit) ((mask) & ((uint64_t)1 << (bit))) #define ISSET32(mask, bit) ((mask) & ((uint32_t)1 << (bit))) struct nat64lsn_pg { - CK_SLIST_ENTRY(nat64lsn_pg) entries; - uint16_t base_port; uint16_t timestamp; uint8_t proto; uint8_t chunks_count; - uint8_t spare[2]; + uint16_t flags; +#define NAT64LSN_DEADPG 1 union { uint64_t freemask64; uint32_t freemask32[2]; uint64_t *freemask64_chunk; uint32_t *freemask32_chunk; void *freemask_chunk; }; union { struct nat64lsn_states_chunk *states; struct nat64lsn_states_chunk **states_chunk; }; + /* + * An alias object holds chain of all allocated PGs. + * The chain is used mostly by expiration code. 
+ */ + CK_SLIST_ENTRY(nat64lsn_pg) entries; }; #define CHUNK_BY_FADDR(p, a) ((a) & ((p)->chunks_count - 1)) #ifdef __LP64__ #define FREEMASK_CHUNK(p, v) \ ((p)->chunks_count == 1 ? &(p)->freemask64 : \ &(p)->freemask64_chunk[CHUNK_BY_FADDR(p, v)]) #define FREEMASK_BITCOUNT(pg, faddr) \ bitcount64(*FREEMASK_CHUNK((pg), (faddr))) #else #define FREEMASK_CHUNK(p, v) \ ((p)->chunks_count == 1 ? &(p)->freemask32[0] : \ &(p)->freemask32_chunk[CHUNK_BY_FADDR(p, v) * 2]) #define FREEMASK_BITCOUNT(pg, faddr) \ bitcount64(*(uint64_t *)FREEMASK_CHUNK((pg), (faddr))) #endif /* !__LP64__ */ struct nat64lsn_pgchunk { struct nat64lsn_pg *pgptr[32]; }; struct nat64lsn_aliaslink { CK_SLIST_ENTRY(nat64lsn_aliaslink) alias_entries; CK_SLIST_ENTRY(nat64lsn_aliaslink) host_entries; struct nat64lsn_alias *alias; }; CK_SLIST_HEAD(nat64lsn_aliaslink_slist, nat64lsn_aliaslink); CK_SLIST_HEAD(nat64lsn_states_slist, nat64lsn_state); CK_SLIST_HEAD(nat64lsn_hosts_slist, nat64lsn_host); CK_SLIST_HEAD(nat64lsn_pg_slist, nat64lsn_pg); struct nat64lsn_alias { struct nat64lsn_aliaslink_slist hosts; struct nat64lsn_pg_slist portgroups; struct mtx lock; in_addr_t addr; /* host byte order */ uint32_t hosts_count; - uint32_t portgroups_count; - uint32_t tcp_chunkmask; - uint32_t udp_chunkmask; - uint32_t icmp_chunkmask; + uint16_t timestamp; + uint16_t tcp_pgcount; + uint16_t udp_pgcount; + uint16_t icmp_pgcount; + /* + * We keep PGs in chunks by 32 PGs in each. + * Each chunk allocated by demand, and then corresponding bit + * is set in chunkmask. + * + * Also we keep last used PG's index for each protocol. + * pgidx / 32 = index of pgchunk; + * pgidx % 32 = index of pgptr in pgchunk. + */ + uint32_t tcp_chunkmask; uint32_t tcp_pgidx; + + uint32_t udp_chunkmask; uint32_t udp_pgidx; - uint32_t icmp_pgidx; - uint16_t timestamp; - uint16_t spare; + uint32_t icmp_chunkmask; + uint32_t icmp_pgidx; + /* + * Each pgchunk keeps 32 pointers to PGs. 
If pgptr pointer is + * valid, we have corresponding bit set in the pgmask. + */ uint32_t tcp_pgmask[32]; uint32_t udp_pgmask[32]; uint32_t icmp_pgmask[32]; + struct nat64lsn_pgchunk *tcp[32]; struct nat64lsn_pgchunk *udp[32]; struct nat64lsn_pgchunk *icmp[32]; - - /* pointer to PG that can be used for faster state allocation */ - struct nat64lsn_pg *tcp_pg; - struct nat64lsn_pg *udp_pg; - struct nat64lsn_pg *icmp_pg; }; #define ALIAS_LOCK_INIT(p) \ mtx_init(&(p)->lock, "alias_lock", NULL, MTX_DEF) #define ALIAS_LOCK_DESTROY(p) mtx_destroy(&(p)->lock) #define ALIAS_LOCK(p) mtx_lock(&(p)->lock) #define ALIAS_UNLOCK(p) mtx_unlock(&(p)->lock) #define NAT64LSN_HSIZE 256 #define NAT64LSN_MAX_HSIZE 4096 #define NAT64LSN_HOSTS_HSIZE 1024 struct nat64lsn_host { struct in6_addr addr; struct nat64lsn_aliaslink_slist aliases; struct nat64lsn_states_slist *states_hash; CK_SLIST_ENTRY(nat64lsn_host) entries; uint32_t states_count; uint32_t hval; uint32_t flags; #define NAT64LSN_DEADHOST 1 #define NAT64LSN_GROWHASH 2 uint16_t states_hashsize; uint16_t timestamp; struct mtx lock; }; #define HOST_LOCK_INIT(p) \ mtx_init(&(p)->lock, "host_lock", NULL, MTX_DEF|MTX_NEW) #define HOST_LOCK_DESTROY(p) mtx_destroy(&(p)->lock) #define HOST_LOCK(p) mtx_lock(&(p)->lock) #define HOST_UNLOCK(p) mtx_unlock(&(p)->lock) -VNET_DECLARE(uint16_t, nat64lsn_eid); +VNET_DECLARE(uint32_t, nat64lsn_eid); #define V_nat64lsn_eid VNET(nat64lsn_eid) #define IPFW_TLV_NAT64LSN_NAME IPFW_TLV_EACTION_NAME(V_nat64lsn_eid) /* Timestamp macro */ #define _CT ((int)time_uptime % 65536) #define SET_AGE(x) (x) = _CT #define GET_AGE(x) ((_CT >= (x)) ? 
_CT - (x): (int)65536 + _CT - (x)) STAILQ_HEAD(nat64lsn_job_head, nat64lsn_job_item); struct nat64lsn_cfg { - struct named_object no; - struct nat64lsn_hosts_slist *hosts_hash; struct nat64lsn_alias *aliases; /* array of aliases */ struct mtx lock; uint32_t hosts_hashsize; uint32_t hash_seed; uint32_t prefix4; /* IPv4 prefix */ uint32_t pmask4; /* IPv4 prefix mask */ uint8_t plen4; uint8_t nomatch_verdict;/* Return value on no-match */ uint32_t hosts_count; /* Number of items in host hash */ uint32_t states_chunks; /* Number of states chunks per PG */ uint32_t jmaxlen; /* Max jobqueue length */ uint16_t host_delete_delay; /* Stale host delete delay */ uint16_t pgchunk_delete_delay; uint16_t pg_delete_delay; /* Stale portgroup del delay */ uint16_t st_syn_ttl; /* TCP syn expire */ uint16_t st_close_ttl; /* TCP fin expire */ uint16_t st_estab_ttl; /* TCP established expire */ uint16_t st_udp_ttl; /* UDP expire */ uint16_t st_icmp_ttl; /* ICMP expire */ struct nat64_config base; -#define NAT64LSN_FLAGSMASK (NAT64_LOG | NAT64_ALLOW_PRIVATE) +#define NAT64LSN_FLAGSMASK (NAT64_LOG | NAT64_ALLOW_PRIVATE | \ + NAT64LSN_ALLOW_SWAPCONF) #define NAT64LSN_ANYPREFIX 0x00000100 struct mtx periodic_lock; struct callout periodic; struct callout jcallout; struct vnet *vp; struct nat64lsn_job_head jhead; int jlen; char name[64]; /* Nat instance name */ }; +struct nat64lsn_instance { + struct named_object no; + struct nat64lsn_cfg *cfg; + char name[64]; /* Nat instance name */ +}; + /* CFG_LOCK protects cfg->hosts_hash from modification */ #define CFG_LOCK_INIT(p) \ mtx_init(&(p)->lock, "cfg_lock", NULL, MTX_DEF) #define CFG_LOCK_DESTROY(p) mtx_destroy(&(p)->lock) #define CFG_LOCK(p) mtx_lock(&(p)->lock) #define CFG_UNLOCK(p) mtx_unlock(&(p)->lock) #define CALLOUT_LOCK_INIT(p) \ mtx_init(&(p)->periodic_lock, "periodic_lock", NULL, MTX_DEF) #define CALLOUT_LOCK_DESTROY(p) mtx_destroy(&(p)->periodic_lock) #define CALLOUT_LOCK(p) mtx_lock(&(p)->periodic_lock) #define CALLOUT_UNLOCK(p) 
mtx_unlock(&(p)->periodic_lock) -struct nat64lsn_cfg *nat64lsn_init_instance(struct ip_fw_chain *ch, +MALLOC_DECLARE(M_NAT64LSN); + +struct nat64lsn_cfg *nat64lsn_init_config(struct ip_fw_chain *ch, in_addr_t prefix, int plen); -void nat64lsn_destroy_instance(struct nat64lsn_cfg *cfg); +void nat64lsn_destroy_config(struct nat64lsn_cfg *cfg); void nat64lsn_start_instance(struct nat64lsn_cfg *cfg); void nat64lsn_init_internal(void); void nat64lsn_uninit_internal(void); int ipfw_nat64lsn(struct ip_fw_chain *ch, struct ip_fw_args *args, ipfw_insn *cmd, int *done); #endif /* _IP_FW_NAT64LSN_H_ */ diff --git a/sys/netpfil/ipfw/nat64/nat64lsn_control.c b/sys/netpfil/ipfw/nat64/nat64lsn_control.c index 70a67ac72f73..50d95b547510 100644 --- a/sys/netpfil/ipfw/nat64/nat64lsn_control.c +++ b/sys/netpfil/ipfw/nat64/nat64lsn_control.c @@ -1,940 +1,951 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2015-2019 Yandex LLC * Copyright (c) 2015 Alexander V. Chernikov * Copyright (c) 2015-2019 Andrey V. Elsukov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "nat64lsn.h" -VNET_DEFINE(uint16_t, nat64lsn_eid) = 0; +VNET_DEFINE(uint32_t, nat64lsn_eid) = 0; -static struct nat64lsn_cfg * +static struct nat64lsn_instance * nat64lsn_find(struct namedobj_instance *ni, const char *name, uint8_t set) { - struct nat64lsn_cfg *cfg; + struct named_object *no; - cfg = (struct nat64lsn_cfg *)ipfw_objhash_lookup_name_type(ni, set, + no = ipfw_objhash_lookup_name_type(ni, set, IPFW_TLV_NAT64LSN_NAME, name); - - return (cfg); + if (no == NULL) + return (NULL); + return (__containerof(no, struct nat64lsn_instance, no)); } static void nat64lsn_default_config(ipfw_nat64lsn_cfg *uc) { if (uc->jmaxlen == 0) uc->jmaxlen = NAT64LSN_JMAXLEN; if (uc->jmaxlen > 65536) uc->jmaxlen = 65536; if (uc->nh_delete_delay == 0) uc->nh_delete_delay = NAT64LSN_HOST_AGE; if (uc->pg_delete_delay == 0) uc->pg_delete_delay = NAT64LSN_PG_AGE; if (uc->st_syn_ttl == 0) uc->st_syn_ttl = NAT64LSN_TCP_SYN_AGE; if (uc->st_close_ttl == 0) uc->st_close_ttl = NAT64LSN_TCP_FIN_AGE; if (uc->st_estab_ttl == 0) uc->st_estab_ttl = NAT64LSN_TCP_EST_AGE; if (uc->st_udp_ttl == 0) uc->st_udp_ttl = NAT64LSN_UDP_AGE; if (uc->st_icmp_ttl == 0) uc->st_icmp_ttl = NAT64LSN_ICMP_AGE; if (uc->states_chunks == 0) uc->states_chunks = 1; else if (uc->states_chunks >= 128) uc->states_chunks = 
128; else if (!powerof2(uc->states_chunks)) uc->states_chunks = 1 << fls(uc->states_chunks); } /* * Creates new nat64lsn instance. * Data layout (v0)(current): * Request: [ ipfw_obj_lheader ipfw_nat64lsn_cfg ] * * Returns 0 on success */ static int nat64lsn_create(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_lheader *olh; ipfw_nat64lsn_cfg *uc; + struct nat64lsn_instance *i; struct nat64lsn_cfg *cfg; struct namedobj_instance *ni; uint32_t addr4, mask4; if (sd->valsize != sizeof(*olh) + sizeof(*uc)) return (EINVAL); olh = (ipfw_obj_lheader *)sd->kbuf; uc = (ipfw_nat64lsn_cfg *)(olh + 1); if (ipfw_check_object_name_generic(uc->name) != 0) return (EINVAL); if (uc->set >= IPFW_MAX_SETS) return (EINVAL); if (uc->plen4 > 32) return (EINVAL); /* * Unspecified address has special meaning. But it must * have valid prefix length. This length will be used to * correctly extract and embedd IPv4 address into IPv6. */ if (nat64_check_prefix6(&uc->prefix6, uc->plen6) != 0 && IN6_IS_ADDR_UNSPECIFIED(&uc->prefix6) && nat64_check_prefixlen(uc->plen6) != 0) return (EINVAL); /* XXX: Check prefix4 to be global */ addr4 = ntohl(uc->prefix4.s_addr); mask4 = ~((1 << (32 - uc->plen4)) - 1); if ((addr4 & mask4) != addr4) return (EINVAL); nat64lsn_default_config(uc); ni = CHAIN_TO_SRV(ch); IPFW_UH_RLOCK(ch); if (nat64lsn_find(ni, uc->name, uc->set) != NULL) { IPFW_UH_RUNLOCK(ch); return (EEXIST); } IPFW_UH_RUNLOCK(ch); - cfg = nat64lsn_init_instance(ch, addr4, uc->plen4); - strlcpy(cfg->name, uc->name, sizeof(cfg->name)); - cfg->no.name = cfg->name; - cfg->no.etlv = IPFW_TLV_NAT64LSN_NAME; - cfg->no.set = uc->set; + i = malloc(sizeof(struct nat64lsn_instance), M_NAT64LSN, + M_WAITOK | M_ZERO); + strlcpy(i->name, uc->name, sizeof(i->name)); + i->no.name = i->name; + i->no.etlv = IPFW_TLV_NAT64LSN_NAME; + i->no.set = uc->set; + cfg = nat64lsn_init_config(ch, addr4, uc->plen4); cfg->base.plat_prefix = uc->prefix6; cfg->base.plat_plen = uc->plen6; 
cfg->base.flags = (uc->flags & NAT64LSN_FLAGSMASK) | NAT64_PLATPFX; if (IN6_IS_ADDR_WKPFX(&cfg->base.plat_prefix)) cfg->base.flags |= NAT64_WKPFX; else if (IN6_IS_ADDR_UNSPECIFIED(&cfg->base.plat_prefix)) cfg->base.flags |= NAT64LSN_ANYPREFIX; cfg->states_chunks = uc->states_chunks; cfg->jmaxlen = uc->jmaxlen; cfg->host_delete_delay = uc->nh_delete_delay; cfg->pg_delete_delay = uc->pg_delete_delay; cfg->st_syn_ttl = uc->st_syn_ttl; cfg->st_close_ttl = uc->st_close_ttl; cfg->st_estab_ttl = uc->st_estab_ttl; cfg->st_udp_ttl = uc->st_udp_ttl; cfg->st_icmp_ttl = uc->st_icmp_ttl; - cfg->nomatch_verdict = IP_FW_DENY; IPFW_UH_WLOCK(ch); if (nat64lsn_find(ni, uc->name, uc->set) != NULL) { IPFW_UH_WUNLOCK(ch); - nat64lsn_destroy_instance(cfg); + nat64lsn_destroy_config(cfg); + free(i, M_NAT64LSN); return (EEXIST); } - if (ipfw_objhash_alloc_idx(CHAIN_TO_SRV(ch), &cfg->no.kidx) != 0) { + if (ipfw_objhash_alloc_idx(ni, &i->no.kidx) != 0) { IPFW_UH_WUNLOCK(ch); - nat64lsn_destroy_instance(cfg); + nat64lsn_destroy_config(cfg); + free(i, M_NAT64LSN); return (ENOSPC); } - ipfw_objhash_add(CHAIN_TO_SRV(ch), &cfg->no); + ipfw_objhash_add(ni, &i->no); /* Okay, let's link data */ - SRV_OBJECT(ch, cfg->no.kidx) = cfg; + i->cfg = cfg; + SRV_OBJECT(ch, i->no.kidx) = i; nat64lsn_start_instance(cfg); IPFW_UH_WUNLOCK(ch); return (0); } static void -nat64lsn_detach_config(struct ip_fw_chain *ch, struct nat64lsn_cfg *cfg) +nat64lsn_detach_instance(struct ip_fw_chain *ch, + struct nat64lsn_instance *i) { IPFW_UH_WLOCK_ASSERT(ch); - - ipfw_objhash_del(CHAIN_TO_SRV(ch), &cfg->no); - ipfw_objhash_free_idx(CHAIN_TO_SRV(ch), cfg->no.kidx); + SRV_OBJECT(ch, i->no.kidx) = NULL; + ipfw_objhash_del(CHAIN_TO_SRV(ch), &i->no); + ipfw_objhash_free_idx(CHAIN_TO_SRV(ch), i->no.kidx); } /* * Destroys nat64 instance. 
* Data layout (v0)(current): * Request: [ ipfw_obj_header ] * * Returns 0 on success */ static int nat64lsn_destroy(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { - struct nat64lsn_cfg *cfg; + struct nat64lsn_instance *i; ipfw_obj_header *oh; if (sd->valsize != sizeof(*oh)) return (EINVAL); oh = (ipfw_obj_header *)op3; IPFW_UH_WLOCK(ch); - cfg = nat64lsn_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set); - if (cfg == NULL) { + i = nat64lsn_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set); + if (i == NULL) { IPFW_UH_WUNLOCK(ch); return (ENOENT); } - if (cfg->no.refcnt > 0) { + if (i->no.refcnt > 0) { IPFW_UH_WUNLOCK(ch); return (EBUSY); } - ipfw_reset_eaction_instance(ch, V_nat64lsn_eid, cfg->no.kidx); - SRV_OBJECT(ch, cfg->no.kidx) = NULL; - nat64lsn_detach_config(ch, cfg); + ipfw_reset_eaction_instance(ch, V_nat64lsn_eid, i->no.kidx); + nat64lsn_detach_instance(ch, i); IPFW_UH_WUNLOCK(ch); - nat64lsn_destroy_instance(cfg); + nat64lsn_destroy_config(i->cfg); + free(i, M_NAT64LSN); return (0); } #define __COPY_STAT_FIELD(_cfg, _stats, _field) \ (_stats)->_field = NAT64STAT_FETCH(&(_cfg)->base.stats, _field) static void export_stats(struct ip_fw_chain *ch, struct nat64lsn_cfg *cfg, struct ipfw_nat64lsn_stats *stats) { struct nat64lsn_alias *alias; - int i, j; + int i; __COPY_STAT_FIELD(cfg, stats, opcnt64); __COPY_STAT_FIELD(cfg, stats, opcnt46); __COPY_STAT_FIELD(cfg, stats, ofrags); __COPY_STAT_FIELD(cfg, stats, ifrags); __COPY_STAT_FIELD(cfg, stats, oerrors); __COPY_STAT_FIELD(cfg, stats, noroute4); __COPY_STAT_FIELD(cfg, stats, noroute6); __COPY_STAT_FIELD(cfg, stats, nomatch4); __COPY_STAT_FIELD(cfg, stats, noproto); __COPY_STAT_FIELD(cfg, stats, nomem); __COPY_STAT_FIELD(cfg, stats, dropped); __COPY_STAT_FIELD(cfg, stats, jcalls); __COPY_STAT_FIELD(cfg, stats, jrequests); __COPY_STAT_FIELD(cfg, stats, jhostsreq); __COPY_STAT_FIELD(cfg, stats, jportreq); __COPY_STAT_FIELD(cfg, stats, jhostfails); __COPY_STAT_FIELD(cfg, stats, 
jportfails); __COPY_STAT_FIELD(cfg, stats, jmaxlen); __COPY_STAT_FIELD(cfg, stats, jnomem); __COPY_STAT_FIELD(cfg, stats, jreinjected); __COPY_STAT_FIELD(cfg, stats, screated); __COPY_STAT_FIELD(cfg, stats, sdeleted); __COPY_STAT_FIELD(cfg, stats, spgcreated); __COPY_STAT_FIELD(cfg, stats, spgdeleted); stats->hostcount = cfg->hosts_count; for (i = 0; i < (1 << (32 - cfg->plen4)); i++) { alias = &cfg->aliases[i]; - for (j = 0; j < 32 && ISSET32(alias->tcp_chunkmask, j); j++) - stats->tcpchunks += bitcount32(alias->tcp_pgmask[j]); - for (j = 0; j < 32 && ISSET32(alias->udp_chunkmask, j); j++) - stats->udpchunks += bitcount32(alias->udp_pgmask[j]); - for (j = 0; j < 32 && ISSET32(alias->icmp_chunkmask, j); j++) - stats->icmpchunks += bitcount32(alias->icmp_pgmask[j]); + stats->tcpchunks += alias->tcp_pgcount; + stats->udpchunks += alias->udp_pgcount; + stats->icmpchunks += alias->icmp_pgcount; } } #undef __COPY_STAT_FIELD static void -nat64lsn_export_config(struct ip_fw_chain *ch, struct nat64lsn_cfg *cfg, +nat64lsn_export_config(struct ip_fw_chain *ch, struct nat64lsn_instance *i, ipfw_nat64lsn_cfg *uc) { + struct nat64lsn_cfg *cfg; + + strlcpy(uc->name, i->no.name, sizeof(uc->name)); + uc->set = i->no.set; + cfg = i->cfg; uc->flags = cfg->base.flags & NAT64LSN_FLAGSMASK; uc->states_chunks = cfg->states_chunks; uc->jmaxlen = cfg->jmaxlen; uc->nh_delete_delay = cfg->host_delete_delay; uc->pg_delete_delay = cfg->pg_delete_delay; uc->st_syn_ttl = cfg->st_syn_ttl; uc->st_close_ttl = cfg->st_close_ttl; uc->st_estab_ttl = cfg->st_estab_ttl; uc->st_udp_ttl = cfg->st_udp_ttl; uc->st_icmp_ttl = cfg->st_icmp_ttl; uc->prefix4.s_addr = htonl(cfg->prefix4); uc->prefix6 = cfg->base.plat_prefix; uc->plen4 = cfg->plen4; uc->plen6 = cfg->base.plat_plen; - uc->set = cfg->no.set; - strlcpy(uc->name, cfg->no.name, sizeof(uc->name)); } struct nat64_dump_arg { struct ip_fw_chain *ch; struct sockopt_data *sd; }; static int export_config_cb(struct namedobj_instance *ni, struct named_object 
*no, void *arg) { - struct nat64_dump_arg *da = (struct nat64_dump_arg *)arg; + struct nat64_dump_arg *da; ipfw_nat64lsn_cfg *uc; + da = (struct nat64_dump_arg *)arg; uc = (struct _ipfw_nat64lsn_cfg *)ipfw_get_sopt_space(da->sd, sizeof(*uc)); - nat64lsn_export_config(da->ch, (struct nat64lsn_cfg *)no, uc); + nat64lsn_export_config(da->ch, + __containerof(no, struct nat64lsn_instance, no), uc); return (0); } /* * Lists all nat64 lsn instances currently available in kernel. * Data layout (v0)(current): * Request: [ ipfw_obj_lheader ] * Reply: [ ipfw_obj_lheader ipfw_nat64lsn_cfg x N ] * * Returns 0 on success */ static int nat64lsn_list(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_lheader *olh; struct nat64_dump_arg da; /* Check minimum header size */ if (sd->valsize < sizeof(ipfw_obj_lheader)) return (EINVAL); olh = (ipfw_obj_lheader *)ipfw_get_sopt_header(sd, sizeof(*olh)); IPFW_UH_RLOCK(ch); olh->count = ipfw_objhash_count_type(CHAIN_TO_SRV(ch), IPFW_TLV_NAT64LSN_NAME); olh->objsize = sizeof(ipfw_nat64lsn_cfg); olh->size = sizeof(*olh) + olh->count * olh->objsize; if (sd->valsize < olh->size) { IPFW_UH_RUNLOCK(ch); return (ENOMEM); } memset(&da, 0, sizeof(da)); da.ch = ch; da.sd = sd; ipfw_objhash_foreach_type(CHAIN_TO_SRV(ch), export_config_cb, &da, IPFW_TLV_NAT64LSN_NAME); IPFW_UH_RUNLOCK(ch); return (0); } /* * Change existing nat64lsn instance configuration. 
* Data layout (v0)(current): * Request: [ ipfw_obj_header ipfw_nat64lsn_cfg ] * Reply: [ ipfw_obj_header ipfw_nat64lsn_cfg ] * * Returns 0 on success */ static int nat64lsn_config(struct ip_fw_chain *ch, ip_fw3_opheader *op, struct sockopt_data *sd) { ipfw_obj_header *oh; ipfw_nat64lsn_cfg *uc; + struct nat64lsn_instance *i; struct nat64lsn_cfg *cfg; struct namedobj_instance *ni; if (sd->valsize != sizeof(*oh) + sizeof(*uc)) return (EINVAL); oh = (ipfw_obj_header *)ipfw_get_sopt_space(sd, sizeof(*oh) + sizeof(*uc)); uc = (ipfw_nat64lsn_cfg *)(oh + 1); if (ipfw_check_object_name_generic(oh->ntlv.name) != 0 || oh->ntlv.set >= IPFW_MAX_SETS) return (EINVAL); ni = CHAIN_TO_SRV(ch); if (sd->sopt->sopt_dir == SOPT_GET) { IPFW_UH_RLOCK(ch); - cfg = nat64lsn_find(ni, oh->ntlv.name, oh->ntlv.set); - if (cfg == NULL) { + i = nat64lsn_find(ni, oh->ntlv.name, oh->ntlv.set); + if (i == NULL) { IPFW_UH_RUNLOCK(ch); return (ENOENT); } - nat64lsn_export_config(ch, cfg, uc); + nat64lsn_export_config(ch, i, uc); IPFW_UH_RUNLOCK(ch); return (0); } nat64lsn_default_config(uc); IPFW_UH_WLOCK(ch); - cfg = nat64lsn_find(ni, oh->ntlv.name, oh->ntlv.set); - if (cfg == NULL) { + i = nat64lsn_find(ni, oh->ntlv.name, oh->ntlv.set); + if (i == NULL) { IPFW_UH_WUNLOCK(ch); return (ENOENT); } /* * For now allow to change only following values: * jmaxlen, nh_del_age, pg_del_age, tcp_syn_age, tcp_close_age, * tcp_est_age, udp_age, icmp_age, flags, states_chunks. */ - + cfg = i->cfg; cfg->states_chunks = uc->states_chunks; cfg->jmaxlen = uc->jmaxlen; cfg->host_delete_delay = uc->nh_delete_delay; cfg->pg_delete_delay = uc->pg_delete_delay; cfg->st_syn_ttl = uc->st_syn_ttl; cfg->st_close_ttl = uc->st_close_ttl; cfg->st_estab_ttl = uc->st_estab_ttl; cfg->st_udp_ttl = uc->st_udp_ttl; cfg->st_icmp_ttl = uc->st_icmp_ttl; cfg->base.flags &= ~NAT64LSN_FLAGSMASK; cfg->base.flags |= uc->flags & NAT64LSN_FLAGSMASK; IPFW_UH_WUNLOCK(ch); return (0); } /* * Get nat64lsn statistics. 
* Data layout (v0)(current): * Request: [ ipfw_obj_header ] * Reply: [ ipfw_obj_header ipfw_counter_tlv ] * * Returns 0 on success */ static int nat64lsn_stats(struct ip_fw_chain *ch, ip_fw3_opheader *op, struct sockopt_data *sd) { struct ipfw_nat64lsn_stats stats; - struct nat64lsn_cfg *cfg; + struct nat64lsn_instance *i; ipfw_obj_header *oh; ipfw_obj_ctlv *ctlv; size_t sz; sz = sizeof(ipfw_obj_header) + sizeof(ipfw_obj_ctlv) + sizeof(stats); if (sd->valsize % sizeof(uint64_t)) return (EINVAL); if (sd->valsize < sz) return (ENOMEM); oh = (ipfw_obj_header *)ipfw_get_sopt_header(sd, sz); if (oh == NULL) return (EINVAL); memset(&stats, 0, sizeof(stats)); IPFW_UH_RLOCK(ch); - cfg = nat64lsn_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set); - if (cfg == NULL) { + i = nat64lsn_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set); + if (i == NULL) { IPFW_UH_RUNLOCK(ch); return (ENOENT); } - export_stats(ch, cfg, &stats); + export_stats(ch, i->cfg, &stats); IPFW_UH_RUNLOCK(ch); ctlv = (ipfw_obj_ctlv *)(oh + 1); memset(ctlv, 0, sizeof(*ctlv)); ctlv->head.type = IPFW_TLV_COUNTERS; ctlv->head.length = sz - sizeof(ipfw_obj_header); ctlv->count = sizeof(stats) / sizeof(uint64_t); ctlv->objsize = sizeof(uint64_t); ctlv->version = IPFW_NAT64_VERSION; memcpy(ctlv + 1, &stats, sizeof(stats)); return (0); } /* * Reset nat64lsn statistics. 
* Data layout (v0)(current): * Request: [ ipfw_obj_header ] * * Returns 0 on success */ static int nat64lsn_reset_stats(struct ip_fw_chain *ch, ip_fw3_opheader *op, struct sockopt_data *sd) { - struct nat64lsn_cfg *cfg; + struct nat64lsn_instance *i; ipfw_obj_header *oh; if (sd->valsize != sizeof(*oh)) return (EINVAL); oh = (ipfw_obj_header *)sd->kbuf; if (ipfw_check_object_name_generic(oh->ntlv.name) != 0 || oh->ntlv.set >= IPFW_MAX_SETS) return (EINVAL); IPFW_UH_WLOCK(ch); - cfg = nat64lsn_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set); - if (cfg == NULL) { + i = nat64lsn_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set); + if (i == NULL) { IPFW_UH_WUNLOCK(ch); return (ENOENT); } - COUNTER_ARRAY_ZERO(cfg->base.stats.cnt, NAT64STATS); + COUNTER_ARRAY_ZERO(i->cfg->base.stats.cnt, NAT64STATS); IPFW_UH_WUNLOCK(ch); return (0); } #ifdef __LP64__ #define FREEMASK_COPY(pg, n, out) (out) = *FREEMASK_CHUNK((pg), (n)) #else #define FREEMASK_COPY(pg, n, out) (out) = *FREEMASK_CHUNK((pg), (n)) | \ ((uint64_t)*(FREEMASK_CHUNK((pg), (n)) + 1) << 32) #endif /* * Reply: [ ipfw_obj_header ipfw_obj_data [ ipfw_nat64lsn_stg * ipfw_nat64lsn_state x count, ... 
] ] */ static int -nat64lsn_export_states_v1(struct nat64lsn_cfg *cfg, union nat64lsn_pgidx *idx, +nat64lsn_export_states(struct nat64lsn_cfg *cfg, union nat64lsn_pgidx *idx, struct nat64lsn_pg *pg, struct sockopt_data *sd, uint32_t *ret_count) { ipfw_nat64lsn_state_v1 *s; struct nat64lsn_state *state; uint64_t freemask; uint32_t i, count; /* validate user input */ if (idx->chunk > pg->chunks_count - 1) return (EINVAL); FREEMASK_COPY(pg, idx->chunk, freemask); count = 64 - bitcount64(freemask); if (count == 0) return (0); /* Try next PG/chunk */ DPRINTF(DP_STATE, "EXPORT PG 0x%16jx, count %d", (uintmax_t)idx->index, count); s = (ipfw_nat64lsn_state_v1 *)ipfw_get_sopt_space(sd, count * sizeof(ipfw_nat64lsn_state_v1)); if (s == NULL) return (ENOMEM); for (i = 0; i < 64; i++) { if (ISSET64(freemask, i)) continue; state = pg->chunks_count == 1 ? &pg->states->state[i] : &pg->states_chunk[idx->chunk]->state[i]; s->host6 = state->host->addr; s->daddr.s_addr = htonl(state->ip_dst); s->dport = state->dport; s->sport = state->sport; s->aport = state->aport; s->flags = (uint8_t)(state->flags & 7); s->proto = state->proto; s->idle = GET_AGE(state->timestamp); s++; } *ret_count = count; return (0); } #define LAST_IDX 0xFF static int nat64lsn_next_pgidx(struct nat64lsn_cfg *cfg, struct nat64lsn_pg *pg, union nat64lsn_pgidx *idx) { /* First iterate over chunks */ if (pg != NULL) { if (idx->chunk < pg->chunks_count - 1) { idx->chunk++; return (0); } } idx->chunk = 0; /* Then over PGs */ if (idx->port < UINT16_MAX - 64) { idx->port += 64; return (0); } idx->port = NAT64_MIN_PORT; /* Then over supported protocols */ switch (idx->proto) { case IPPROTO_ICMP: idx->proto = IPPROTO_TCP; return (0); case IPPROTO_TCP: idx->proto = IPPROTO_UDP; return (0); default: idx->proto = IPPROTO_ICMP; } /* And then over IPv4 alias addresses */ if (idx->addr < cfg->pmask4) { idx->addr++; return (1); /* New states group is needed */ } idx->index = LAST_IDX; return (-1); /* No more states */ } static 
struct nat64lsn_pg* nat64lsn_get_pg_byidx(struct nat64lsn_cfg *cfg, union nat64lsn_pgidx *idx) { struct nat64lsn_alias *alias; int pg_idx; alias = &cfg->aliases[idx->addr & ((1 << (32 - cfg->plen4)) - 1)]; MPASS(alias->addr == idx->addr); pg_idx = (idx->port - NAT64_MIN_PORT) / 64; switch (idx->proto) { case IPPROTO_ICMP: if (ISSET32(alias->icmp_pgmask[pg_idx / 32], pg_idx % 32)) return (alias->icmp[pg_idx / 32]->pgptr[pg_idx % 32]); break; case IPPROTO_TCP: if (ISSET32(alias->tcp_pgmask[pg_idx / 32], pg_idx % 32)) return (alias->tcp[pg_idx / 32]->pgptr[pg_idx % 32]); break; case IPPROTO_UDP: if (ISSET32(alias->udp_pgmask[pg_idx / 32], pg_idx % 32)) return (alias->udp[pg_idx / 32]->pgptr[pg_idx % 32]); break; } return (NULL); } -/* - * Lists nat64lsn states. - * Data layout (v0): - * Request: [ ipfw_obj_header ipfw_obj_data [ uint64_t ]] - * Reply: [ ipfw_obj_header ipfw_obj_data [ - * ipfw_nat64lsn_stg ipfw_nat64lsn_state x N] ] - * - * Returns 0 on success - */ -static int -nat64lsn_states_v0(struct ip_fw_chain *ch, ip_fw3_opheader *op3, - struct sockopt_data *sd) -{ - - /* TODO: implement states listing for old ipfw(8) binaries */ - return (EOPNOTSUPP); -} - /* * Lists nat64lsn states. 
* Data layout (v1)(current): * Request: [ ipfw_obj_header ipfw_obj_data [ uint64_t ]] * Reply: [ ipfw_obj_header ipfw_obj_data [ * ipfw_nat64lsn_stg_v1 ipfw_nat64lsn_state_v1 x N] ] * * Returns 0 on success */ static int -nat64lsn_states_v1(struct ip_fw_chain *ch, ip_fw3_opheader *op3, +nat64lsn_states(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_header *oh; ipfw_obj_data *od; ipfw_nat64lsn_stg_v1 *stg; + struct nat64lsn_instance *i; struct nat64lsn_cfg *cfg; struct nat64lsn_pg *pg; union nat64lsn_pgidx idx; size_t sz; uint32_t count, total; int ret; sz = sizeof(ipfw_obj_header) + sizeof(ipfw_obj_data) + sizeof(uint64_t); /* Check minimum header size */ if (sd->valsize < sz) return (EINVAL); oh = (ipfw_obj_header *)sd->kbuf; od = (ipfw_obj_data *)(oh + 1); if (od->head.type != IPFW_TLV_OBJDATA || od->head.length != sz - sizeof(ipfw_obj_header)) return (EINVAL); idx.index = *(uint64_t *)(od + 1); if (idx.index != 0 && idx.proto != IPPROTO_ICMP && idx.proto != IPPROTO_TCP && idx.proto != IPPROTO_UDP) return (EINVAL); if (idx.index == LAST_IDX) return (EINVAL); IPFW_UH_RLOCK(ch); - cfg = nat64lsn_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set); - if (cfg == NULL) { + i = nat64lsn_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set); + if (i == NULL) { IPFW_UH_RUNLOCK(ch); return (ENOENT); } + cfg = i->cfg; if (idx.index == 0) { /* Fill in starting point */ idx.addr = cfg->prefix4; idx.proto = IPPROTO_ICMP; idx.port = NAT64_MIN_PORT; } if (idx.addr < cfg->prefix4 || idx.addr > cfg->pmask4 || idx.port < NAT64_MIN_PORT) { IPFW_UH_RUNLOCK(ch); return (EINVAL); } sz = sizeof(ipfw_obj_header) + sizeof(ipfw_obj_data) + sizeof(ipfw_nat64lsn_stg_v1); if (sd->valsize < sz) { IPFW_UH_RUNLOCK(ch); return (ENOMEM); } oh = (ipfw_obj_header *)ipfw_get_sopt_space(sd, sz); od = (ipfw_obj_data *)(oh + 1); od->head.type = IPFW_TLV_OBJDATA; od->head.length = sz - sizeof(ipfw_obj_header); stg = (ipfw_nat64lsn_stg_v1 *)(od + 1); stg->count = total = 
0; stg->next.index = idx.index; /* * Acquire CALLOUT_LOCK to avoid races with expiration code. * Thus states, hosts and PGs will not expire while we hold it. */ CALLOUT_LOCK(cfg); ret = 0; do { pg = nat64lsn_get_pg_byidx(cfg, &idx); if (pg != NULL) { count = 0; - ret = nat64lsn_export_states_v1(cfg, &idx, pg, + ret = nat64lsn_export_states(cfg, &idx, pg, sd, &count); if (ret != 0) break; if (count > 0) { stg->count += count; total += count; /* Update total size of reply */ od->head.length += count * sizeof(ipfw_nat64lsn_state_v1); sz += count * sizeof(ipfw_nat64lsn_state_v1); } stg->alias4.s_addr = htonl(idx.addr); } /* Determine new index */ switch (nat64lsn_next_pgidx(cfg, pg, &idx)) { case -1: ret = ENOENT; /* End of search */ break; case 1: /* * Next alias address, new group may be needed. * If states count is zero, use this group. */ if (stg->count == 0) continue; /* Otherwise try to create new group */ sz += sizeof(ipfw_nat64lsn_stg_v1); if (sd->valsize < sz) { ret = ENOMEM; break; } /* Save next index in current group */ stg->next.index = idx.index; stg = (ipfw_nat64lsn_stg_v1 *)ipfw_get_sopt_space(sd, sizeof(ipfw_nat64lsn_stg_v1)); od->head.length += sizeof(ipfw_nat64lsn_stg_v1); stg->count = 0; break; } stg->next.index = idx.index; } while (ret == 0); CALLOUT_UNLOCK(cfg); IPFW_UH_RUNLOCK(ch); return ((total > 0 || idx.index == LAST_IDX) ? 
0: ret); } static struct ipfw_sopt_handler scodes[] = { - { IP_FW_NAT64LSN_CREATE, 0, HDIR_BOTH, nat64lsn_create }, - { IP_FW_NAT64LSN_DESTROY,0, HDIR_SET, nat64lsn_destroy }, - { IP_FW_NAT64LSN_CONFIG, 0, HDIR_BOTH, nat64lsn_config }, - { IP_FW_NAT64LSN_LIST, 0, HDIR_GET, nat64lsn_list }, - { IP_FW_NAT64LSN_STATS, 0, HDIR_GET, nat64lsn_stats }, - { IP_FW_NAT64LSN_RESET_STATS,0, HDIR_SET, nat64lsn_reset_stats }, - { IP_FW_NAT64LSN_LIST_STATES,0, HDIR_GET, nat64lsn_states_v0 }, - { IP_FW_NAT64LSN_LIST_STATES,1, HDIR_GET, nat64lsn_states_v1 }, + { IP_FW_NAT64LSN_CREATE, IP_FW3_OPVER, HDIR_BOTH, nat64lsn_create }, + { IP_FW_NAT64LSN_DESTROY, IP_FW3_OPVER, HDIR_SET, nat64lsn_destroy }, + { IP_FW_NAT64LSN_CONFIG, IP_FW3_OPVER, HDIR_BOTH, nat64lsn_config }, + { IP_FW_NAT64LSN_LIST, IP_FW3_OPVER, HDIR_GET, nat64lsn_list }, + { IP_FW_NAT64LSN_STATS, IP_FW3_OPVER, HDIR_GET, nat64lsn_stats }, + { IP_FW_NAT64LSN_RESET_STATS, IP_FW3_OPVER, HDIR_SET, nat64lsn_reset_stats }, + { IP_FW_NAT64LSN_LIST_STATES, IP_FW3_OPVER, HDIR_GET, nat64lsn_states }, }; +#define NAT64LSN_ARE_EQUAL(v) (cfg0->v == cfg1->v) static int -nat64lsn_classify(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype) +nat64lsn_cmp_configs(struct nat64lsn_cfg *cfg0, struct nat64lsn_cfg *cfg1) { - ipfw_insn *icmd; - - icmd = cmd - 1; - if (icmd->opcode != O_EXTERNAL_ACTION || - icmd->arg1 != V_nat64lsn_eid) - return (1); - *puidx = cmd->arg1; - *ptype = 0; - return (0); + if ((cfg0->base.flags & cfg1->base.flags & NAT64LSN_ALLOW_SWAPCONF) && + NAT64LSN_ARE_EQUAL(prefix4) && + NAT64LSN_ARE_EQUAL(pmask4) && + NAT64LSN_ARE_EQUAL(plen4) && + NAT64LSN_ARE_EQUAL(base.plat_plen) && + IN6_ARE_ADDR_EQUAL(&cfg0->base.plat_prefix, + &cfg1->base.plat_prefix)) + return (0); + return (1); } +#undef NAT64LSN_ARE_EQUAL static void -nat64lsn_update_arg1(ipfw_insn *cmd, uint16_t idx) +nat64lsn_swap_configs(struct nat64lsn_instance *i0, + struct nat64lsn_instance *i1) { + struct nat64lsn_cfg *cfg; - cmd->arg1 = idx; + cfg = i0->cfg; + 
i0->cfg = i1->cfg; + i1->cfg = cfg; } +/* + * NAT64LSN sets swap handler. + * + * When two sets have NAT64LSN instance with the same name, we check + * most important configuration parameters, and if there are no difference, + * and both instances have NAT64LSN_ALLOW_SWAPCONF flag, we will exchange + * configs between instances. This allows to keep NAT64 states when ipfw's + * rules are reloaded using new set. + * + * XXX: since manage_sets caller doesn't hold IPFW_WLOCK(), it is possible + * that some states will be created during switching, because set of rules + * is changed a bit earley than named objects. + */ static int -nat64lsn_findbyname(struct ip_fw_chain *ch, struct tid_info *ti, - struct named_object **pno) -{ - int err; - - err = ipfw_objhash_find_type(CHAIN_TO_SRV(ch), ti, - IPFW_TLV_NAT64LSN_NAME, pno); - return (err); -} - -static struct named_object * -nat64lsn_findbykidx(struct ip_fw_chain *ch, uint16_t idx) +nat64lsn_swap_sets_cb(struct namedobj_instance *ni, struct named_object *no, + void *arg) { - struct namedobj_instance *ni; - struct named_object *no; - - IPFW_UH_WLOCK_ASSERT(ch); - ni = CHAIN_TO_SRV(ch); - no = ipfw_objhash_lookup_kidx(ni, idx); - KASSERT(no != NULL, ("NAT64LSN with index %d not found", idx)); - - return (no); + struct nat64lsn_instance *i0, *i1; + uint8_t *sets; + + sets = arg; + if (no->set == sets[0]) { + /* + * Check if we have instance in new set with the same + * config that is sets aware and ready to swap configs. 
+ */ + i0 = __containerof(no, struct nat64lsn_instance, no); + if ((i0->cfg->base.flags & NAT64LSN_ALLOW_SWAPCONF) && + (i1 = nat64lsn_find(ni, no->name, sets[1])) != NULL) { + /* Compare configs */ + if (nat64lsn_cmp_configs(i0->cfg, i1->cfg) == 0) { + IPFW_UH_WLOCK_ASSERT(&V_layer3_chain); + IPFW_WLOCK(&V_layer3_chain); + nat64lsn_swap_configs(i0, i1); + IPFW_WUNLOCK(&V_layer3_chain); + } + } + } + return (0); } static int -nat64lsn_manage_sets(struct ip_fw_chain *ch, uint16_t set, uint8_t new_set, +nat64lsn_manage_sets(struct ip_fw_chain *ch, uint32_t set, uint8_t new_set, enum ipfw_sets_cmd cmd) { + uint8_t sets[2]; + if (cmd == SWAP_ALL) { + sets[0] = (uint8_t)set; + sets[1] = new_set; + ipfw_objhash_foreach_type(CHAIN_TO_SRV(ch), + nat64lsn_swap_sets_cb, &sets, IPFW_TLV_NAT64LSN_NAME); + } return (ipfw_obj_manage_sets(CHAIN_TO_SRV(ch), IPFW_TLV_NAT64LSN_NAME, set, new_set, cmd)); } - -static struct opcode_obj_rewrite opcodes[] = { - { - .opcode = O_EXTERNAL_INSTANCE, - .etlv = IPFW_TLV_EACTION /* just show it isn't table */, - .classifier = nat64lsn_classify, - .update = nat64lsn_update_arg1, - .find_byname = nat64lsn_findbyname, - .find_bykidx = nat64lsn_findbykidx, - .manage_sets = nat64lsn_manage_sets, - }, -}; +NAT64_DEFINE_OPCODE_REWRITER(nat64lsn, NAT64LSN, opcodes); static int destroy_config_cb(struct namedobj_instance *ni, struct named_object *no, void *arg) { - struct nat64lsn_cfg *cfg; + struct nat64lsn_instance *i; struct ip_fw_chain *ch; ch = (struct ip_fw_chain *)arg; - cfg = (struct nat64lsn_cfg *)SRV_OBJECT(ch, no->kidx); - SRV_OBJECT(ch, no->kidx) = NULL; - nat64lsn_detach_config(ch, cfg); - nat64lsn_destroy_instance(cfg); + i = (struct nat64lsn_instance *)SRV_OBJECT(ch, no->kidx); + nat64lsn_detach_instance(ch, i); + nat64lsn_destroy_config(i->cfg); + free(i, M_NAT64LSN); return (0); } int nat64lsn_init(struct ip_fw_chain *ch, int first) { if (first != 0) nat64lsn_init_internal(); V_nat64lsn_eid = ipfw_add_eaction(ch, ipfw_nat64lsn, 
"nat64lsn"); if (V_nat64lsn_eid == 0) return (ENXIO); IPFW_ADD_SOPT_HANDLER(first, scodes); IPFW_ADD_OBJ_REWRITER(first, opcodes); return (0); } void nat64lsn_uninit(struct ip_fw_chain *ch, int last) { IPFW_DEL_OBJ_REWRITER(last, opcodes); IPFW_DEL_SOPT_HANDLER(last, scodes); ipfw_del_eaction(ch, V_nat64lsn_eid); /* * Since we already have deregistered external action, * our named objects become unaccessible via rules, because * all rules were truncated by ipfw_del_eaction(). * So, we can unlink and destroy our named objects without holding * IPFW_WLOCK(). */ IPFW_UH_WLOCK(ch); ipfw_objhash_foreach_type(CHAIN_TO_SRV(ch), destroy_config_cb, ch, IPFW_TLV_NAT64LSN_NAME); V_nat64lsn_eid = 0; IPFW_UH_WUNLOCK(ch); if (last != 0) nat64lsn_uninit_internal(); } diff --git a/sys/netpfil/ipfw/nat64/nat64stl.c b/sys/netpfil/ipfw/nat64/nat64stl.c index ac479cb78d2c..ad1f2b3ec378 100644 --- a/sys/netpfil/ipfw/nat64/nat64stl.c +++ b/sys/netpfil/ipfw/nat64/nat64stl.c @@ -1,262 +1,262 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2015-2019 Yandex LLC * Copyright (c) 2015-2019 Andrey V. Elsukov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "nat64stl.h" #define NAT64_LOOKUP(chain, cmd) \ - (struct nat64stl_cfg *)SRV_OBJECT((chain), (cmd)->arg1) + (struct nat64stl_cfg *)SRV_OBJECT((chain), insntod(cmd, kidx)->kidx) static void nat64stl_log(struct pfloghdr *plog, struct mbuf *m, sa_family_t family, uint32_t kidx) { static uint32_t pktid = 0; memset(plog, 0, sizeof(*plog)); - plog->length = PFLOG_HDRLEN; + plog->length = PFLOG_REAL_HDRLEN; plog->af = family; plog->action = PF_NAT; plog->dir = PF_IN; plog->rulenr = htonl(kidx); pktid++; plog->subrulenr = htonl(pktid); plog->ruleset[0] = '\0'; strlcpy(plog->ifname, "NAT64STL", sizeof(plog->ifname)); ipfw_bpf_mtap2(plog, PFLOG_HDRLEN, m); } static int nat64stl_handle_ip4(struct ip_fw_chain *chain, struct nat64stl_cfg *cfg, struct mbuf *m, uint32_t tablearg) { struct pfloghdr loghdr, *logdata; struct in6_addr saddr, daddr; struct ip *ip; ip = mtod(m, struct ip*); if (nat64_check_ip4(ip->ip_src.s_addr) != 0 || nat64_check_ip4(ip->ip_dst.s_addr) != 0 || nat64_check_private_ip4(&cfg->base, ip->ip_src.s_addr) != 0 || nat64_check_private_ip4(&cfg->base, ip->ip_dst.s_addr) != 0) return (NAT64SKIP); daddr = TARG_VAL(chain, tablearg, nh6); if (nat64_check_ip6(&daddr) != 0) return (NAT64MFREE); saddr = cfg->base.plat_prefix; 
nat64_embed_ip4(&saddr, cfg->base.plat_plen, ip->ip_src.s_addr); if (cfg->base.flags & NAT64_LOG) { logdata = &loghdr; nat64stl_log(logdata, m, AF_INET, cfg->no.kidx); } else logdata = NULL; return (nat64_do_handle_ip4(m, &saddr, &daddr, 0, &cfg->base, logdata)); } static int nat64stl_handle_ip6(struct ip_fw_chain *chain, struct nat64stl_cfg *cfg, struct mbuf *m, uint32_t tablearg) { struct pfloghdr loghdr, *logdata; struct ip6_hdr *ip6; uint32_t aaddr; aaddr = htonl(TARG_VAL(chain, tablearg, nh4)); if (nat64_check_private_ip4(&cfg->base, aaddr) != 0) { NAT64STAT_INC(&cfg->base.stats, dropped); return (NAT64MFREE); } /* * NOTE: we expect ipfw_chk() did m_pullup() up to upper level * protocol's headers. Also we skip some checks, that ip6_input(), * ip6_forward(), ip6_fastfwd() and ipfw_chk() already did. */ ip6 = mtod(m, struct ip6_hdr *); /* Check ip6_dst matches configured prefix */ if (memcmp(&ip6->ip6_dst, &cfg->base.plat_prefix, cfg->base.plat_plen / 8) != 0) return (NAT64SKIP); if (cfg->base.flags & NAT64_LOG) { logdata = &loghdr; nat64stl_log(logdata, m, AF_INET6, cfg->no.kidx); } else logdata = NULL; return (nat64_do_handle_ip6(m, aaddr, 0, &cfg->base, logdata)); } static int nat64stl_handle_icmp6(struct ip_fw_chain *chain, struct nat64stl_cfg *cfg, struct mbuf *m) { struct pfloghdr loghdr, *logdata; struct nat64_counters *stats; struct ip6_hdr *ip6i; struct icmp6_hdr *icmp6; uint32_t tablearg; int hlen, proto; hlen = 0; stats = &cfg->base.stats; proto = nat64_getlasthdr(m, &hlen); if (proto != IPPROTO_ICMPV6) { NAT64STAT_INC(stats, dropped); return (NAT64MFREE); } icmp6 = mtodo(m, hlen); switch (icmp6->icmp6_type) { case ICMP6_DST_UNREACH: case ICMP6_PACKET_TOO_BIG: case ICMP6_TIME_EXCEED_TRANSIT: case ICMP6_PARAM_PROB: break; default: NAT64STAT_INC(stats, dropped); return (NAT64MFREE); } hlen += sizeof(struct icmp6_hdr); if (m->m_pkthdr.len < hlen + sizeof(struct ip6_hdr) + ICMP_MINLEN) { NAT64STAT_INC(stats, dropped); return (NAT64MFREE); } if (m->m_len < 
hlen + sizeof(struct ip6_hdr) + ICMP_MINLEN) m = m_pullup(m, hlen + sizeof(struct ip6_hdr) + ICMP_MINLEN); if (m == NULL) { NAT64STAT_INC(stats, nomem); return (NAT64RETURN); } /* * Use destination address from inner IPv6 header to determine * IPv4 mapped address. */ ip6i = mtodo(m, hlen); if (ipfw_lookup_table(chain, cfg->map64, sizeof(struct in6_addr), &ip6i->ip6_dst, &tablearg) == 0) { m_freem(m); return (NAT64RETURN); } if (cfg->base.flags & NAT64_LOG) { logdata = &loghdr; nat64stl_log(logdata, m, AF_INET6, cfg->no.kidx); } else logdata = NULL; return (nat64_handle_icmp6(m, 0, htonl(TARG_VAL(chain, tablearg, nh4)), 0, &cfg->base, logdata)); } int ipfw_nat64stl(struct ip_fw_chain *chain, struct ip_fw_args *args, ipfw_insn *cmd, int *done) { - ipfw_insn *icmd; struct nat64stl_cfg *cfg; + ipfw_insn *icmd; in_addr_t dst4; uint32_t tablearg; int ret; IPFW_RLOCK_ASSERT(chain); *done = 0; /* try next rule if not matched */ - icmd = cmd + 1; + icmd = cmd + F_LEN(cmd); if (cmd->opcode != O_EXTERNAL_ACTION || - cmd->arg1 != V_nat64stl_eid || + insntod(cmd, kidx)->kidx != V_nat64stl_eid || icmd->opcode != O_EXTERNAL_INSTANCE || (cfg = NAT64_LOOKUP(chain, icmd)) == NULL) return (0); switch (args->f_id.addr_type) { case 4: dst4 = htonl(args->f_id.dst_ip); ret = ipfw_lookup_table(chain, cfg->map46, sizeof(in_addr_t), &dst4, &tablearg); break; case 6: ret = ipfw_lookup_table(chain, cfg->map64, sizeof(struct in6_addr), &args->f_id.src_ip6, &tablearg); break; default: return (0); } if (ret == 0) { /* * In case when packet is ICMPv6 message from an intermediate * router, the source address of message will not match the * addresses from our map64 table. 
*/ if (args->f_id.proto != IPPROTO_ICMPV6) return (0); ret = nat64stl_handle_icmp6(chain, cfg, args->m); } else { if (args->f_id.addr_type == 4) ret = nat64stl_handle_ip4(chain, cfg, args->m, tablearg); else ret = nat64stl_handle_ip6(chain, cfg, args->m, tablearg); } if (ret == NAT64SKIP) return (0); *done = 1; /* terminate the search */ if (ret == NAT64MFREE) m_freem(args->m); args->m = NULL; return (IP_FW_NAT64); } diff --git a/sys/netpfil/ipfw/nat64/nat64stl.h b/sys/netpfil/ipfw/nat64/nat64stl.h index 016345775fcf..95a59ca2185f 100644 --- a/sys/netpfil/ipfw/nat64/nat64stl.h +++ b/sys/netpfil/ipfw/nat64/nat64stl.h @@ -1,57 +1,57 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2015-2019 Yandex LLC * Copyright (c) 2015-2019 Andrey V. Elsukov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef _IP_FW_NAT64STL_H_ #define _IP_FW_NAT64STL_H_ #include "ip_fw_nat64.h" #include "nat64_translate.h" struct nat64stl_cfg { struct named_object no; - uint16_t map64; /* table with 6to4 mapping */ - uint16_t map46; /* table with 4to6 mapping */ + uint32_t map64; /* table with 6to4 mapping */ + uint32_t map46; /* table with 4to6 mapping */ struct nat64_config base; #define NAT64STL_KIDX 0x0100 #define NAT64STL_46T 0x0200 #define NAT64STL_64T 0x0400 /* flags to pass to userland */ #define NAT64STL_FLAGSMASK (NAT64_LOG | NAT64_ALLOW_PRIVATE) char name[64]; }; -VNET_DECLARE(uint16_t, nat64stl_eid); +VNET_DECLARE(uint32_t, nat64stl_eid); #define V_nat64stl_eid VNET(nat64stl_eid) #define IPFW_TLV_NAT64STL_NAME IPFW_TLV_EACTION_NAME(V_nat64stl_eid) int ipfw_nat64stl(struct ip_fw_chain *chain, struct ip_fw_args *args, ipfw_insn *cmd, int *done); #endif diff --git a/sys/netpfil/ipfw/nat64/nat64stl_control.c b/sys/netpfil/ipfw/nat64/nat64stl_control.c index b614bbc1c59c..e66894dc65d4 100644 --- a/sys/netpfil/ipfw/nat64/nat64stl_control.c +++ b/sys/netpfil/ipfw/nat64/nat64stl_control.c @@ -1,614 +1,556 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2015-2019 Yandex LLC * Copyright (c) 2015 Alexander V. Chernikov * Copyright (c) 2015-2019 Andrey V. Elsukov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "nat64stl.h" -VNET_DEFINE(uint16_t, nat64stl_eid) = 0; +VNET_DEFINE(uint32_t, nat64stl_eid) = 0; static struct nat64stl_cfg *nat64stl_alloc_config(const char *name, uint8_t set); static void nat64stl_free_config(struct nat64stl_cfg *cfg); static struct nat64stl_cfg *nat64stl_find(struct namedobj_instance *ni, const char *name, uint8_t set); static struct nat64stl_cfg * nat64stl_alloc_config(const char *name, uint8_t set) { struct nat64stl_cfg *cfg; cfg = malloc(sizeof(struct nat64stl_cfg), M_IPFW, M_WAITOK | M_ZERO); COUNTER_ARRAY_ALLOC(cfg->base.stats.cnt, NAT64STATS, M_WAITOK); cfg->no.name = cfg->name; cfg->no.etlv = IPFW_TLV_NAT64STL_NAME; cfg->no.set = set; strlcpy(cfg->name, name, sizeof(cfg->name)); return (cfg); } static void nat64stl_free_config(struct nat64stl_cfg *cfg) { COUNTER_ARRAY_FREE(cfg->base.stats.cnt, NAT64STATS); free(cfg, M_IPFW); } static void nat64stl_export_config(struct ip_fw_chain *ch, struct nat64stl_cfg *cfg, ipfw_nat64stl_cfg *uc) { struct named_object *no; 
uc->prefix6 = cfg->base.plat_prefix; uc->plen6 = cfg->base.plat_plen; uc->flags = cfg->base.flags & NAT64STL_FLAGSMASK; uc->set = cfg->no.set; strlcpy(uc->name, cfg->no.name, sizeof(uc->name)); no = ipfw_objhash_lookup_table_kidx(ch, cfg->map64); ipfw_export_obj_ntlv(no, &uc->ntlv6); no = ipfw_objhash_lookup_table_kidx(ch, cfg->map46); ipfw_export_obj_ntlv(no, &uc->ntlv4); } struct nat64stl_dump_arg { struct ip_fw_chain *ch; struct sockopt_data *sd; }; static int export_config_cb(struct namedobj_instance *ni, struct named_object *no, void *arg) { struct nat64stl_dump_arg *da = (struct nat64stl_dump_arg *)arg; ipfw_nat64stl_cfg *uc; uc = (ipfw_nat64stl_cfg *)ipfw_get_sopt_space(da->sd, sizeof(*uc)); nat64stl_export_config(da->ch, (struct nat64stl_cfg *)no, uc); return (0); } static struct nat64stl_cfg * nat64stl_find(struct namedobj_instance *ni, const char *name, uint8_t set) { struct nat64stl_cfg *cfg; cfg = (struct nat64stl_cfg *)ipfw_objhash_lookup_name_type(ni, set, IPFW_TLV_NAT64STL_NAME, name); return (cfg); } static int nat64stl_create_internal(struct ip_fw_chain *ch, struct nat64stl_cfg *cfg, ipfw_nat64stl_cfg *i) { IPFW_UH_WLOCK_ASSERT(ch); if (ipfw_objhash_alloc_idx(CHAIN_TO_SRV(ch), &cfg->no.kidx) != 0) return (ENOSPC); cfg->base.flags |= NAT64STL_KIDX; if (ipfw_ref_table(ch, &i->ntlv4, &cfg->map46) != 0) return (EINVAL); cfg->base.flags |= NAT64STL_46T; if (ipfw_ref_table(ch, &i->ntlv6, &cfg->map64) != 0) return (EINVAL); cfg->base.flags |= NAT64STL_64T; ipfw_objhash_add(CHAIN_TO_SRV(ch), &cfg->no); return (0); } /* * Creates new nat64 instance. 
* Data layout (v0)(current): * Request: [ ipfw_obj_lheader ipfw_nat64stl_cfg ] * * Returns 0 on success */ static int nat64stl_create(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_lheader *olh; ipfw_nat64stl_cfg *uc; struct namedobj_instance *ni; struct nat64stl_cfg *cfg; int error; if (sd->valsize != sizeof(*olh) + sizeof(*uc)) return (EINVAL); olh = (ipfw_obj_lheader *)sd->kbuf; uc = (ipfw_nat64stl_cfg *)(olh + 1); if (ipfw_check_object_name_generic(uc->name) != 0) return (EINVAL); if (uc->set >= IPFW_MAX_SETS || nat64_check_prefix6(&uc->prefix6, uc->plen6) != 0) return (EINVAL); /* XXX: check types of tables */ ni = CHAIN_TO_SRV(ch); error = 0; IPFW_UH_RLOCK(ch); if (nat64stl_find(ni, uc->name, uc->set) != NULL) { IPFW_UH_RUNLOCK(ch); return (EEXIST); } IPFW_UH_RUNLOCK(ch); cfg = nat64stl_alloc_config(uc->name, uc->set); cfg->base.plat_prefix = uc->prefix6; cfg->base.plat_plen = uc->plen6; cfg->base.flags = (uc->flags & NAT64STL_FLAGSMASK) | NAT64_PLATPFX; if (IN6_IS_ADDR_WKPFX(&cfg->base.plat_prefix)) cfg->base.flags |= NAT64_WKPFX; IPFW_UH_WLOCK(ch); if (nat64stl_find(ni, uc->name, uc->set) != NULL) { IPFW_UH_WUNLOCK(ch); nat64stl_free_config(cfg); return (EEXIST); } error = nat64stl_create_internal(ch, cfg, uc); if (error == 0) { /* Okay, let's link data */ SRV_OBJECT(ch, cfg->no.kidx) = cfg; IPFW_UH_WUNLOCK(ch); return (0); } if (cfg->base.flags & NAT64STL_KIDX) ipfw_objhash_free_idx(ni, cfg->no.kidx); if (cfg->base.flags & NAT64STL_46T) ipfw_unref_table(ch, cfg->map46); if (cfg->base.flags & NAT64STL_64T) ipfw_unref_table(ch, cfg->map64); IPFW_UH_WUNLOCK(ch); nat64stl_free_config(cfg); return (error); } /* * Change existing nat64stl instance configuration. 
* Data layout (v0)(current): * Request: [ ipfw_obj_header ipfw_nat64stl_cfg ] * Reply: [ ipfw_obj_header ipfw_nat64stl_cfg ] * * Returns 0 on success */ static int nat64stl_config(struct ip_fw_chain *ch, ip_fw3_opheader *op, struct sockopt_data *sd) { ipfw_obj_header *oh; ipfw_nat64stl_cfg *uc; struct nat64stl_cfg *cfg; struct namedobj_instance *ni; if (sd->valsize != sizeof(*oh) + sizeof(*uc)) return (EINVAL); oh = (ipfw_obj_header *)ipfw_get_sopt_space(sd, sizeof(*oh) + sizeof(*uc)); uc = (ipfw_nat64stl_cfg *)(oh + 1); if (ipfw_check_object_name_generic(oh->ntlv.name) != 0 || oh->ntlv.set >= IPFW_MAX_SETS) return (EINVAL); ni = CHAIN_TO_SRV(ch); if (sd->sopt->sopt_dir == SOPT_GET) { IPFW_UH_RLOCK(ch); cfg = nat64stl_find(ni, oh->ntlv.name, oh->ntlv.set); if (cfg == NULL) { IPFW_UH_RUNLOCK(ch); return (EEXIST); } nat64stl_export_config(ch, cfg, uc); IPFW_UH_RUNLOCK(ch); return (0); } IPFW_UH_WLOCK(ch); cfg = nat64stl_find(ni, oh->ntlv.name, oh->ntlv.set); if (cfg == NULL) { IPFW_UH_WUNLOCK(ch); return (EEXIST); } /* * For now allow to change only following values: * flags. */ cfg->base.flags &= ~NAT64STL_FLAGSMASK; cfg->base.flags |= uc->flags & NAT64STL_FLAGSMASK; IPFW_UH_WUNLOCK(ch); return (0); } static void nat64stl_detach_config(struct ip_fw_chain *ch, struct nat64stl_cfg *cfg) { IPFW_UH_WLOCK_ASSERT(ch); ipfw_objhash_del(CHAIN_TO_SRV(ch), &cfg->no); ipfw_objhash_free_idx(CHAIN_TO_SRV(ch), cfg->no.kidx); ipfw_unref_table(ch, cfg->map46); ipfw_unref_table(ch, cfg->map64); } /* * Destroys nat64 instance. 
* Data layout (v0)(current): * Request: [ ipfw_obj_header ] * * Returns 0 on success */ static int nat64stl_destroy(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_header *oh; struct nat64stl_cfg *cfg; if (sd->valsize != sizeof(*oh)) return (EINVAL); oh = (ipfw_obj_header *)sd->kbuf; if (ipfw_check_object_name_generic(oh->ntlv.name) != 0) return (EINVAL); IPFW_UH_WLOCK(ch); cfg = nat64stl_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set); if (cfg == NULL) { IPFW_UH_WUNLOCK(ch); return (ESRCH); } if (cfg->no.refcnt > 0) { IPFW_UH_WUNLOCK(ch); return (EBUSY); } ipfw_reset_eaction_instance(ch, V_nat64stl_eid, cfg->no.kidx); SRV_OBJECT(ch, cfg->no.kidx) = NULL; nat64stl_detach_config(ch, cfg); IPFW_UH_WUNLOCK(ch); nat64stl_free_config(cfg); return (0); } /* * Lists all nat64stl instances currently available in kernel. * Data layout (v0)(current): * Request: [ ipfw_obj_lheader ] * Reply: [ ipfw_obj_lheader ipfw_nat64stl_cfg x N ] * * Returns 0 on success */ static int nat64stl_list(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_lheader *olh; struct nat64stl_dump_arg da; /* Check minimum header size */ if (sd->valsize < sizeof(ipfw_obj_lheader)) return (EINVAL); olh = (ipfw_obj_lheader *)ipfw_get_sopt_header(sd, sizeof(*olh)); IPFW_UH_RLOCK(ch); olh->count = ipfw_objhash_count_type(CHAIN_TO_SRV(ch), IPFW_TLV_NAT64STL_NAME); olh->objsize = sizeof(ipfw_nat64stl_cfg); olh->size = sizeof(*olh) + olh->count * olh->objsize; if (sd->valsize < olh->size) { IPFW_UH_RUNLOCK(ch); return (ENOMEM); } memset(&da, 0, sizeof(da)); da.ch = ch; da.sd = sd; ipfw_objhash_foreach_type(CHAIN_TO_SRV(ch), export_config_cb, &da, IPFW_TLV_NAT64STL_NAME); IPFW_UH_RUNLOCK(ch); return (0); } #define __COPY_STAT_FIELD(_cfg, _stats, _field) \ (_stats)->_field = NAT64STAT_FETCH(&(_cfg)->base.stats, _field) static void export_stats(struct ip_fw_chain *ch, struct nat64stl_cfg *cfg, struct ipfw_nat64stl_stats *stats) { 
__COPY_STAT_FIELD(cfg, stats, opcnt64); __COPY_STAT_FIELD(cfg, stats, opcnt46); __COPY_STAT_FIELD(cfg, stats, ofrags); __COPY_STAT_FIELD(cfg, stats, ifrags); __COPY_STAT_FIELD(cfg, stats, oerrors); __COPY_STAT_FIELD(cfg, stats, noroute4); __COPY_STAT_FIELD(cfg, stats, noroute6); __COPY_STAT_FIELD(cfg, stats, noproto); __COPY_STAT_FIELD(cfg, stats, nomem); __COPY_STAT_FIELD(cfg, stats, dropped); } /* * Get nat64stl statistics. * Data layout (v0)(current): * Request: [ ipfw_obj_header ] * Reply: [ ipfw_obj_header ipfw_obj_ctlv [ uint64_t x N ]] * * Returns 0 on success */ static int nat64stl_stats(struct ip_fw_chain *ch, ip_fw3_opheader *op, struct sockopt_data *sd) { struct ipfw_nat64stl_stats stats; struct nat64stl_cfg *cfg; ipfw_obj_header *oh; ipfw_obj_ctlv *ctlv; size_t sz; sz = sizeof(ipfw_obj_header) + sizeof(ipfw_obj_ctlv) + sizeof(stats); if (sd->valsize % sizeof(uint64_t)) return (EINVAL); if (sd->valsize < sz) return (ENOMEM); oh = (ipfw_obj_header *)ipfw_get_sopt_header(sd, sz); if (oh == NULL) return (EINVAL); memset(&stats, 0, sizeof(stats)); IPFW_UH_RLOCK(ch); cfg = nat64stl_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set); if (cfg == NULL) { IPFW_UH_RUNLOCK(ch); return (ESRCH); } export_stats(ch, cfg, &stats); IPFW_UH_RUNLOCK(ch); ctlv = (ipfw_obj_ctlv *)(oh + 1); memset(ctlv, 0, sizeof(*ctlv)); ctlv->head.type = IPFW_TLV_COUNTERS; ctlv->head.length = sz - sizeof(ipfw_obj_header); ctlv->count = sizeof(stats) / sizeof(uint64_t); ctlv->objsize = sizeof(uint64_t); ctlv->version = IPFW_NAT64_VERSION; memcpy(ctlv + 1, &stats, sizeof(stats)); return (0); } /* * Reset nat64stl statistics. 
* Data layout (v0)(current): * Request: [ ipfw_obj_header ] * * Returns 0 on success */ static int nat64stl_reset_stats(struct ip_fw_chain *ch, ip_fw3_opheader *op, struct sockopt_data *sd) { struct nat64stl_cfg *cfg; ipfw_obj_header *oh; if (sd->valsize != sizeof(*oh)) return (EINVAL); oh = (ipfw_obj_header *)sd->kbuf; if (ipfw_check_object_name_generic(oh->ntlv.name) != 0 || oh->ntlv.set >= IPFW_MAX_SETS) return (EINVAL); IPFW_UH_WLOCK(ch); cfg = nat64stl_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set); if (cfg == NULL) { IPFW_UH_WUNLOCK(ch); return (ESRCH); } COUNTER_ARRAY_ZERO(cfg->base.stats.cnt, NAT64STATS); IPFW_UH_WUNLOCK(ch); return (0); } static struct ipfw_sopt_handler scodes[] = { - { IP_FW_NAT64STL_CREATE, 0, HDIR_SET, nat64stl_create }, - { IP_FW_NAT64STL_DESTROY,0, HDIR_SET, nat64stl_destroy }, - { IP_FW_NAT64STL_CONFIG, 0, HDIR_BOTH, nat64stl_config }, - { IP_FW_NAT64STL_LIST, 0, HDIR_GET, nat64stl_list }, - { IP_FW_NAT64STL_STATS, 0, HDIR_GET, nat64stl_stats }, - { IP_FW_NAT64STL_RESET_STATS,0, HDIR_SET, nat64stl_reset_stats }, + { IP_FW_NAT64STL_CREATE, IP_FW3_OPVER, HDIR_SET, nat64stl_create }, + { IP_FW_NAT64STL_DESTROY, IP_FW3_OPVER, HDIR_SET, nat64stl_destroy }, + { IP_FW_NAT64STL_CONFIG, IP_FW3_OPVER, HDIR_BOTH,nat64stl_config }, + { IP_FW_NAT64STL_LIST, IP_FW3_OPVER, HDIR_GET, nat64stl_list }, + { IP_FW_NAT64STL_STATS, IP_FW3_OPVER, HDIR_GET, nat64stl_stats }, + { IP_FW_NAT64STL_RESET_STATS, IP_FW3_OPVER, HDIR_SET, nat64stl_reset_stats }, }; static int -nat64stl_classify(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype) -{ - ipfw_insn *icmd; - - icmd = cmd - 1; - if (icmd->opcode != O_EXTERNAL_ACTION || - icmd->arg1 != V_nat64stl_eid) - return (1); - - *puidx = cmd->arg1; - *ptype = 0; - return (0); -} - -static void -nat64stl_update_arg1(ipfw_insn *cmd, uint16_t idx) -{ - - cmd->arg1 = idx; -} - -static int -nat64stl_findbyname(struct ip_fw_chain *ch, struct tid_info *ti, - struct named_object **pno) -{ - int err; - - err = 
ipfw_objhash_find_type(CHAIN_TO_SRV(ch), ti, - IPFW_TLV_NAT64STL_NAME, pno); - return (err); -} - -static struct named_object * -nat64stl_findbykidx(struct ip_fw_chain *ch, uint16_t idx) -{ - struct namedobj_instance *ni; - struct named_object *no; - - IPFW_UH_WLOCK_ASSERT(ch); - ni = CHAIN_TO_SRV(ch); - no = ipfw_objhash_lookup_kidx(ni, idx); - KASSERT(no != NULL, ("NAT with index %d not found", idx)); - - return (no); -} - -static int -nat64stl_manage_sets(struct ip_fw_chain *ch, uint16_t set, uint8_t new_set, +nat64stl_manage_sets(struct ip_fw_chain *ch, uint32_t set, uint8_t new_set, enum ipfw_sets_cmd cmd) { return (ipfw_obj_manage_sets(CHAIN_TO_SRV(ch), IPFW_TLV_NAT64STL_NAME, set, new_set, cmd)); } - -static struct opcode_obj_rewrite opcodes[] = { - { - .opcode = O_EXTERNAL_INSTANCE, - .etlv = IPFW_TLV_EACTION /* just show it isn't table */, - .classifier = nat64stl_classify, - .update = nat64stl_update_arg1, - .find_byname = nat64stl_findbyname, - .find_bykidx = nat64stl_findbykidx, - .manage_sets = nat64stl_manage_sets, - }, -}; +NAT64_DEFINE_OPCODE_REWRITER(nat64stl, NAT64STL, opcodes); static int destroy_config_cb(struct namedobj_instance *ni, struct named_object *no, void *arg) { struct nat64stl_cfg *cfg; struct ip_fw_chain *ch; ch = (struct ip_fw_chain *)arg; cfg = (struct nat64stl_cfg *)SRV_OBJECT(ch, no->kidx); SRV_OBJECT(ch, no->kidx) = NULL; nat64stl_detach_config(ch, cfg); nat64stl_free_config(cfg); return (0); } int nat64stl_init(struct ip_fw_chain *ch, int first) { V_nat64stl_eid = ipfw_add_eaction(ch, ipfw_nat64stl, "nat64stl"); if (V_nat64stl_eid == 0) return (ENXIO); IPFW_ADD_SOPT_HANDLER(first, scodes); IPFW_ADD_OBJ_REWRITER(first, opcodes); return (0); } void nat64stl_uninit(struct ip_fw_chain *ch, int last) { IPFW_DEL_OBJ_REWRITER(last, opcodes); IPFW_DEL_SOPT_HANDLER(last, scodes); ipfw_del_eaction(ch, V_nat64stl_eid); /* * Since we already have deregistered external action, * our named objects become unaccessible via rules, because * all 
rules were truncated by ipfw_del_eaction(). * So, we can unlink and destroy our named objects without holding * IPFW_WLOCK(). */ IPFW_UH_WLOCK(ch); ipfw_objhash_foreach_type(CHAIN_TO_SRV(ch), destroy_config_cb, ch, IPFW_TLV_NAT64STL_NAME); V_nat64stl_eid = 0; IPFW_UH_WUNLOCK(ch); } diff --git a/sys/netpfil/ipfw/nptv6/nptv6.c b/sys/netpfil/ipfw/nptv6/nptv6.c index 898545bd36fa..bfe28d75a01d 100644 --- a/sys/netpfil/ipfw/nptv6/nptv6.c +++ b/sys/netpfil/ipfw/nptv6/nptv6.c @@ -1,1039 +1,1040 @@ /*- * Copyright (c) 2016 Yandex LLC * Copyright (c) 2016 Andrey V. Elsukov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -VNET_DEFINE_STATIC(uint16_t, nptv6_eid) = 0; +VNET_DEFINE_STATIC(uint32_t, nptv6_eid) = 0; #define V_nptv6_eid VNET(nptv6_eid) #define IPFW_TLV_NPTV6_NAME IPFW_TLV_EACTION_NAME(V_nptv6_eid) static eventhandler_tag nptv6_ifaddr_event; static struct nptv6_cfg *nptv6_alloc_config(const char *name, uint8_t set); static void nptv6_free_config(struct nptv6_cfg *cfg); static struct nptv6_cfg *nptv6_find(struct namedobj_instance *ni, const char *name, uint8_t set); static int nptv6_rewrite_internal(struct nptv6_cfg *cfg, struct mbuf **mp, int offset); static int nptv6_rewrite_external(struct nptv6_cfg *cfg, struct mbuf **mp, int offset); #define NPTV6_LOOKUP(chain, cmd) \ - (struct nptv6_cfg *)SRV_OBJECT((chain), (cmd)->arg1) + (struct nptv6_cfg *)SRV_OBJECT((chain), insntod(cmd, kidx)->kidx) #ifndef IN6_MASK_ADDR #define IN6_MASK_ADDR(a, m) do { \ (a)->s6_addr32[0] &= (m)->s6_addr32[0]; \ (a)->s6_addr32[1] &= (m)->s6_addr32[1]; \ (a)->s6_addr32[2] &= (m)->s6_addr32[2]; \ (a)->s6_addr32[3] &= (m)->s6_addr32[3]; \ } while (0) #endif #ifndef IN6_ARE_MASKED_ADDR_EQUAL #define IN6_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \ (((d)->s6_addr32[0] ^ (a)->s6_addr32[0]) & (m)->s6_addr32[0]) == 0 && \ (((d)->s6_addr32[1] ^ (a)->s6_addr32[1]) & (m)->s6_addr32[1]) == 0 && \ (((d)->s6_addr32[2] ^ (a)->s6_addr32[2]) & (m)->s6_addr32[2]) == 0 && \ (((d)->s6_addr32[3] ^ (a)->s6_addr32[3]) & (m)->s6_addr32[3]) == 0 ) #endif #if 0 #define NPTV6_DEBUG(fmt, ...) do { \ printf("%s: " fmt "\n", __func__, ## __VA_ARGS__); \ } while (0) #define NPTV6_IPDEBUG(fmt, ...) do { \ char _s[INET6_ADDRSTRLEN], _d[INET6_ADDRSTRLEN]; \ printf("%s: " fmt "\n", __func__, ## __VA_ARGS__); \ } while (0) #else #define NPTV6_DEBUG(fmt, ...) 
#define NPTV6_IPDEBUG(fmt, ...) #endif static int nptv6_getlasthdr(struct nptv6_cfg *cfg, struct mbuf *m, int *offset) { struct ip6_hdr *ip6; struct ip6_hbh *hbh; int proto, hlen; hlen = (offset == NULL) ? 0: *offset; if (m->m_len < hlen) return (-1); ip6 = mtodo(m, hlen); hlen += sizeof(*ip6); proto = ip6->ip6_nxt; while (proto == IPPROTO_HOPOPTS || proto == IPPROTO_ROUTING || proto == IPPROTO_DSTOPTS) { hbh = mtodo(m, hlen); if (m->m_len < hlen) return (-1); proto = hbh->ip6h_nxt; hlen += (hbh->ip6h_len + 1) << 3; } if (offset != NULL) *offset = hlen; return (proto); } static int nptv6_translate_icmpv6(struct nptv6_cfg *cfg, struct mbuf **mp, int offset) { struct icmp6_hdr *icmp6; struct ip6_hdr *ip6; struct mbuf *m; m = *mp; if (offset > m->m_len) return (-1); icmp6 = mtodo(m, offset); NPTV6_DEBUG("ICMPv6 type %d", icmp6->icmp6_type); switch (icmp6->icmp6_type) { case ICMP6_DST_UNREACH: case ICMP6_PACKET_TOO_BIG: case ICMP6_TIME_EXCEEDED: case ICMP6_PARAM_PROB: break; case ICMP6_ECHO_REQUEST: case ICMP6_ECHO_REPLY: /* nothing to translate */ return (0); default: /* * XXX: We can add some checks to not translate NDP and MLD * messages. Currently user must explicitly allow these message * types, otherwise packets will be dropped. */ return (-1); } offset += sizeof(*icmp6); if (offset + sizeof(*ip6) > m->m_pkthdr.len) return (-1); if (offset + sizeof(*ip6) > m->m_len) *mp = m = m_pullup(m, offset + sizeof(*ip6)); if (m == NULL) return (-1); ip6 = mtodo(m, offset); NPTV6_IPDEBUG("offset %d, %s -> %s %d", offset, inet_ntop(AF_INET6, &ip6->ip6_src, _s, sizeof(_s)), inet_ntop(AF_INET6, &ip6->ip6_dst, _d, sizeof(_d)), ip6->ip6_nxt); if (IN6_ARE_MASKED_ADDR_EQUAL(&ip6->ip6_src, &cfg->external, &cfg->mask)) return (nptv6_rewrite_external(cfg, mp, offset)); else if (IN6_ARE_MASKED_ADDR_EQUAL(&ip6->ip6_dst, &cfg->internal, &cfg->mask)) return (nptv6_rewrite_internal(cfg, mp, offset)); /* * Addresses in the inner IPv6 header doesn't matched to * our prefixes. 
*/ return (-1); } static int nptv6_search_index(struct nptv6_cfg *cfg, struct in6_addr *a) { int idx; if (cfg->flags & NPTV6_48PLEN) return (3); /* Search suitable word index for adjustment */ for (idx = 4; idx < 8; idx++) if (a->s6_addr16[idx] != 0xffff) break; /* * RFC 6296 p3.7: If an NPTv6 Translator discovers a datagram with * an IID of all-zeros while performing address mapping, that * datagram MUST be dropped, and an ICMPv6 Parameter Problem error * SHOULD be generated. */ if (idx == 8 || (a->s6_addr32[2] == 0 && a->s6_addr32[3] == 0)) return (-1); return (idx); } static void nptv6_copy_addr(struct in6_addr *src, struct in6_addr *dst, struct in6_addr *mask) { int i; for (i = 0; i < 8 && mask->s6_addr8[i] != 0; i++) { dst->s6_addr8[i] &= ~mask->s6_addr8[i]; dst->s6_addr8[i] |= src->s6_addr8[i] & mask->s6_addr8[i]; } } static int nptv6_rewrite_internal(struct nptv6_cfg *cfg, struct mbuf **mp, int offset) { struct in6_addr *addr; struct ip6_hdr *ip6; int idx, proto; uint16_t adj; ip6 = mtodo(*mp, offset); NPTV6_IPDEBUG("offset %d, %s -> %s %d", offset, inet_ntop(AF_INET6, &ip6->ip6_src, _s, sizeof(_s)), inet_ntop(AF_INET6, &ip6->ip6_dst, _d, sizeof(_d)), ip6->ip6_nxt); if (offset == 0) addr = &ip6->ip6_src; else { /* * When we rewriting inner IPv6 header, we need to rewrite * destination address back to external prefix. The datagram in * the ICMPv6 payload should looks like it was send from * external prefix. */ addr = &ip6->ip6_dst; } idx = nptv6_search_index(cfg, addr); if (idx < 0) { /* * Do not send ICMPv6 error when offset isn't zero. * This means we are rewriting inner IPv6 header in the * ICMPv6 error message. 
*/ if (offset == 0) { icmp6_error2(*mp, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 0, (*mp)->m_pkthdr.rcvif); *mp = NULL; } return (IP_FW_DENY); } adj = addr->s6_addr16[idx]; nptv6_copy_addr(&cfg->external, addr, &cfg->mask); adj = cksum_add(adj, cfg->adjustment); if (adj == 0xffff) adj = 0; addr->s6_addr16[idx] = adj; if (offset == 0) { /* * We may need to translate addresses in the inner IPv6 * header for ICMPv6 error messages. */ proto = nptv6_getlasthdr(cfg, *mp, &offset); if (proto < 0 || (proto == IPPROTO_ICMPV6 && nptv6_translate_icmpv6(cfg, mp, offset) != 0)) return (IP_FW_DENY); NPTV6STAT_INC(cfg, in2ex); } return (0); } static int nptv6_rewrite_external(struct nptv6_cfg *cfg, struct mbuf **mp, int offset) { struct in6_addr *addr; struct ip6_hdr *ip6; int idx, proto; uint16_t adj; ip6 = mtodo(*mp, offset); NPTV6_IPDEBUG("offset %d, %s -> %s %d", offset, inet_ntop(AF_INET6, &ip6->ip6_src, _s, sizeof(_s)), inet_ntop(AF_INET6, &ip6->ip6_dst, _d, sizeof(_d)), ip6->ip6_nxt); if (offset == 0) addr = &ip6->ip6_dst; else { /* * When we rewriting inner IPv6 header, we need to rewrite * source address back to internal prefix. The datagram in * the ICMPv6 payload should looks like it was send from * internal prefix. */ addr = &ip6->ip6_src; } idx = nptv6_search_index(cfg, addr); if (idx < 0) { /* * Do not send ICMPv6 error when offset isn't zero. * This means we are rewriting inner IPv6 header in the * ICMPv6 error message. */ if (offset == 0) { icmp6_error2(*mp, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 0, (*mp)->m_pkthdr.rcvif); *mp = NULL; } return (IP_FW_DENY); } adj = addr->s6_addr16[idx]; nptv6_copy_addr(&cfg->internal, addr, &cfg->mask); adj = cksum_add(adj, ~cfg->adjustment); if (adj == 0xffff) adj = 0; addr->s6_addr16[idx] = adj; if (offset == 0) { /* * We may need to translate addresses in the inner IPv6 * header for ICMPv6 error messages. 
*/ proto = nptv6_getlasthdr(cfg, *mp, &offset); if (proto < 0 || (proto == IPPROTO_ICMPV6 && nptv6_translate_icmpv6(cfg, mp, offset) != 0)) return (IP_FW_DENY); NPTV6STAT_INC(cfg, ex2in); } return (0); } /* * ipfw external action handler. */ static int ipfw_nptv6(struct ip_fw_chain *chain, struct ip_fw_args *args, ipfw_insn *cmd, int *done) { struct ip6_hdr *ip6; struct nptv6_cfg *cfg; ipfw_insn *icmd; int ret; *done = 0; /* try next rule if not matched */ ret = IP_FW_DENY; - icmd = cmd + 1; + icmd = cmd + F_LEN(cmd); if (cmd->opcode != O_EXTERNAL_ACTION || - cmd->arg1 != V_nptv6_eid || + insntod(cmd, kidx)->kidx != V_nptv6_eid || icmd->opcode != O_EXTERNAL_INSTANCE || (cfg = NPTV6_LOOKUP(chain, icmd)) == NULL || (cfg->flags & NPTV6_READY) == 0) return (ret); /* * We need act as router, so when forwarding is disabled - * do nothing. */ if (V_ip6_forwarding == 0 || args->f_id.addr_type != 6) return (ret); /* * NOTE: we expect ipfw_chk() did m_pullup() up to upper level * protocol's headers. Also we skip some checks, that ip6_input(), * ip6_forward(), ip6_fastfwd() and ipfw_chk() already did. */ ip6 = mtod(args->m, struct ip6_hdr *); NPTV6_IPDEBUG("eid %u, oid %u, %s -> %s %d", - cmd->arg1, icmd->arg1, + insntod(cmd, kidx)->kidx, insntod(icmd, kidx)->kidx, inet_ntop(AF_INET6, &ip6->ip6_src, _s, sizeof(_s)), inet_ntop(AF_INET6, &ip6->ip6_dst, _d, sizeof(_d)), ip6->ip6_nxt); if (IN6_ARE_MASKED_ADDR_EQUAL(&ip6->ip6_src, &cfg->internal, &cfg->mask)) { /* * XXX: Do not translate packets when both src and dst * are from internal prefix. */ if (IN6_ARE_MASKED_ADDR_EQUAL(&ip6->ip6_dst, &cfg->internal, &cfg->mask)) return (ret); ret = nptv6_rewrite_internal(cfg, &args->m, 0); } else if (IN6_ARE_MASKED_ADDR_EQUAL(&ip6->ip6_dst, &cfg->external, &cfg->mask)) ret = nptv6_rewrite_external(cfg, &args->m, 0); else return (ret); /* * If address wasn't rewrited - free mbuf and terminate the search. 
*/ if (ret != 0) { if (args->m != NULL) { m_freem(args->m); args->m = NULL; /* mark mbuf as consumed */ } NPTV6STAT_INC(cfg, dropped); *done = 1; } else { /* Terminate the search if one_pass is set */ *done = V_fw_one_pass; /* Update args->f_id when one_pass is off */ if (*done == 0) { ip6 = mtod(args->m, struct ip6_hdr *); args->f_id.src_ip6 = ip6->ip6_src; args->f_id.dst_ip6 = ip6->ip6_dst; } } return (ret); } static struct nptv6_cfg * nptv6_alloc_config(const char *name, uint8_t set) { struct nptv6_cfg *cfg; cfg = malloc(sizeof(struct nptv6_cfg), M_IPFW, M_WAITOK | M_ZERO); COUNTER_ARRAY_ALLOC(cfg->stats, NPTV6STATS, M_WAITOK); cfg->no.name = cfg->name; cfg->no.etlv = IPFW_TLV_NPTV6_NAME; cfg->no.set = set; strlcpy(cfg->name, name, sizeof(cfg->name)); return (cfg); } static void nptv6_free_config(struct nptv6_cfg *cfg) { COUNTER_ARRAY_FREE(cfg->stats, NPTV6STATS); free(cfg, M_IPFW); } static void nptv6_export_config(struct ip_fw_chain *ch, struct nptv6_cfg *cfg, ipfw_nptv6_cfg *uc) { uc->internal = cfg->internal; if (cfg->flags & NPTV6_DYNAMIC_PREFIX) memcpy(uc->if_name, cfg->if_name, IF_NAMESIZE); else uc->external = cfg->external; uc->plen = cfg->plen; uc->flags = cfg->flags & NPTV6_FLAGSMASK; uc->set = cfg->no.set; strlcpy(uc->name, cfg->no.name, sizeof(uc->name)); } struct nptv6_dump_arg { struct ip_fw_chain *ch; struct sockopt_data *sd; }; static int export_config_cb(struct namedobj_instance *ni, struct named_object *no, void *arg) { struct nptv6_dump_arg *da = (struct nptv6_dump_arg *)arg; ipfw_nptv6_cfg *uc; uc = (ipfw_nptv6_cfg *)ipfw_get_sopt_space(da->sd, sizeof(*uc)); nptv6_export_config(da->ch, (struct nptv6_cfg *)no, uc); return (0); } static struct nptv6_cfg * nptv6_find(struct namedobj_instance *ni, const char *name, uint8_t set) { struct nptv6_cfg *cfg; cfg = (struct nptv6_cfg *)ipfw_objhash_lookup_name_type(ni, set, IPFW_TLV_NPTV6_NAME, name); return (cfg); } static void nptv6_calculate_adjustment(struct nptv6_cfg *cfg) { uint16_t i, e; uint16_t 
*p; /* Calculate checksum of internal prefix */ for (i = 0, p = (uint16_t *)&cfg->internal; p < (uint16_t *)(&cfg->internal + 1); p++) i = cksum_add(i, *p); /* Calculate checksum of external prefix */ for (e = 0, p = (uint16_t *)&cfg->external; p < (uint16_t *)(&cfg->external + 1); p++) e = cksum_add(e, *p); /* Adjustment value for Int->Ext direction */ cfg->adjustment = cksum_add(~e, i); } static int nptv6_check_prefix(const struct in6_addr *addr) { if (IN6_IS_ADDR_MULTICAST(addr) || IN6_IS_ADDR_LINKLOCAL(addr) || IN6_IS_ADDR_LOOPBACK(addr) || IN6_IS_ADDR_UNSPECIFIED(addr)) return (EINVAL); return (0); } static void nptv6_set_external(struct nptv6_cfg *cfg, struct in6_addr *addr) { cfg->external = *addr; IN6_MASK_ADDR(&cfg->external, &cfg->mask); nptv6_calculate_adjustment(cfg); cfg->flags |= NPTV6_READY; } /* * Try to determine what prefix to use as external for * configured interface name. */ static void nptv6_find_prefix(struct ip_fw_chain *ch, struct nptv6_cfg *cfg, struct ifnet *ifp) { struct epoch_tracker et; struct ifaddr *ifa; struct in6_ifaddr *ia; MPASS(cfg->flags & NPTV6_DYNAMIC_PREFIX); IPFW_UH_WLOCK_ASSERT(ch); if (ifp == NULL) { ifp = ifunit_ref(cfg->if_name); if (ifp == NULL) return; } NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if (nptv6_check_prefix(&ia->ia_addr.sin6_addr) || IN6_ARE_MASKED_ADDR_EQUAL(&ia->ia_addr.sin6_addr, &cfg->internal, &cfg->mask)) continue; /* Suitable address is found. 
*/ nptv6_set_external(cfg, &ia->ia_addr.sin6_addr); break; } NET_EPOCH_EXIT(et); if_rele(ifp); } struct ifaddr_event_args { struct ifnet *ifp; const struct in6_addr *addr; int event; }; static int ifaddr_cb(struct namedobj_instance *ni, struct named_object *no, void *arg) { struct ifaddr_event_args *args; struct ip_fw_chain *ch; struct nptv6_cfg *cfg; ch = &V_layer3_chain; cfg = (struct nptv6_cfg *)SRV_OBJECT(ch, no->kidx); if ((cfg->flags & NPTV6_DYNAMIC_PREFIX) == 0) return (0); args = arg; /* If interface name doesn't match, ignore */ if (strncmp(args->ifp->if_xname, cfg->if_name, IF_NAMESIZE)) return (0); if (args->ifp->if_flags & IFF_DYING) { /* XXX: is it possible? */ cfg->flags &= ~NPTV6_READY; return (0); } if (args->event == IFADDR_EVENT_DEL) { /* If instance is not ready, ignore */ if ((cfg->flags & NPTV6_READY) == 0) return (0); /* If address does not match the external prefix, ignore */ if (IN6_ARE_MASKED_ADDR_EQUAL(&cfg->external, args->addr, &cfg->mask) != 0) return (0); /* Otherwise clear READY flag */ cfg->flags &= ~NPTV6_READY; } else {/* IFADDR_EVENT_ADD */ /* If instance is already ready, ignore */ if (cfg->flags & NPTV6_READY) return (0); /* If address is not suitable for prefix, ignore */ if (nptv6_check_prefix(args->addr) || IN6_ARE_MASKED_ADDR_EQUAL(args->addr, &cfg->internal, &cfg->mask)) return (0); /* FALLTHROUGH */ } MPASS(!(cfg->flags & NPTV6_READY)); /* Try to determine the prefix */ if_ref(args->ifp); nptv6_find_prefix(ch, cfg, args->ifp); return (0); } static void nptv6_ifaddrevent_handler(void *arg __unused, struct ifnet *ifp, struct ifaddr *ifa, int event) { struct ifaddr_event_args args; struct ip_fw_chain *ch; if (ifa->ifa_addr->sa_family != AF_INET6) return; args.ifp = ifp; args.addr = &((struct sockaddr_in6 *)ifa->ifa_addr)->sin6_addr; args.event = event; ch = &V_layer3_chain; IPFW_UH_WLOCK(ch); ipfw_objhash_foreach_type(CHAIN_TO_SRV(ch), ifaddr_cb, &args, IPFW_TLV_NPTV6_NAME); IPFW_UH_WUNLOCK(ch); } /* * Creates new NPTv6 
instance. * Data layout (v0)(current): * Request: [ ipfw_obj_lheader ipfw_nptv6_cfg ] * * Returns 0 on success */ static int nptv6_create(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { struct in6_addr mask; ipfw_obj_lheader *olh; ipfw_nptv6_cfg *uc; struct namedobj_instance *ni; struct nptv6_cfg *cfg; if (sd->valsize != sizeof(*olh) + sizeof(*uc)) return (EINVAL); olh = (ipfw_obj_lheader *)sd->kbuf; uc = (ipfw_nptv6_cfg *)(olh + 1); if (ipfw_check_object_name_generic(uc->name) != 0) return (EINVAL); if (uc->plen < 8 || uc->plen > 64 || uc->set >= IPFW_MAX_SETS) return (EINVAL); if (nptv6_check_prefix(&uc->internal)) return (EINVAL); in6_prefixlen2mask(&mask, uc->plen); if ((uc->flags & NPTV6_DYNAMIC_PREFIX) == 0 && ( nptv6_check_prefix(&uc->external) || IN6_ARE_MASKED_ADDR_EQUAL(&uc->external, &uc->internal, &mask))) return (EINVAL); ni = CHAIN_TO_SRV(ch); IPFW_UH_RLOCK(ch); if (nptv6_find(ni, uc->name, uc->set) != NULL) { IPFW_UH_RUNLOCK(ch); return (EEXIST); } IPFW_UH_RUNLOCK(ch); cfg = nptv6_alloc_config(uc->name, uc->set); cfg->plen = uc->plen; cfg->flags = uc->flags & NPTV6_FLAGSMASK; if (cfg->plen <= 48) cfg->flags |= NPTV6_48PLEN; cfg->mask = mask; cfg->internal = uc->internal; IN6_MASK_ADDR(&cfg->internal, &mask); if (cfg->flags & NPTV6_DYNAMIC_PREFIX) memcpy(cfg->if_name, uc->if_name, IF_NAMESIZE); else nptv6_set_external(cfg, &uc->external); if ((uc->flags & NPTV6_DYNAMIC_PREFIX) != 0 && nptv6_ifaddr_event == NULL) nptv6_ifaddr_event = EVENTHANDLER_REGISTER( ifaddr_event_ext, nptv6_ifaddrevent_handler, NULL, EVENTHANDLER_PRI_ANY); IPFW_UH_WLOCK(ch); if (ipfw_objhash_alloc_idx(ni, &cfg->no.kidx) != 0) { IPFW_UH_WUNLOCK(ch); nptv6_free_config(cfg); return (ENOSPC); } ipfw_objhash_add(ni, &cfg->no); SRV_OBJECT(ch, cfg->no.kidx) = cfg; if (cfg->flags & NPTV6_DYNAMIC_PREFIX) nptv6_find_prefix(ch, cfg, NULL); IPFW_UH_WUNLOCK(ch); return (0); } /* * Destroys NPTv6 instance. 
* Data layout (v0)(current): * Request: [ ipfw_obj_header ] * * Returns 0 on success */ static int nptv6_destroy(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_header *oh; struct nptv6_cfg *cfg; if (sd->valsize != sizeof(*oh)) return (EINVAL); oh = (ipfw_obj_header *)sd->kbuf; if (ipfw_check_object_name_generic(oh->ntlv.name) != 0) return (EINVAL); IPFW_UH_WLOCK(ch); cfg = nptv6_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set); if (cfg == NULL) { IPFW_UH_WUNLOCK(ch); return (ESRCH); } if (cfg->no.refcnt > 0) { IPFW_UH_WUNLOCK(ch); return (EBUSY); } ipfw_reset_eaction_instance(ch, V_nptv6_eid, cfg->no.kidx); SRV_OBJECT(ch, cfg->no.kidx) = NULL; ipfw_objhash_del(CHAIN_TO_SRV(ch), &cfg->no); ipfw_objhash_free_idx(CHAIN_TO_SRV(ch), cfg->no.kidx); IPFW_UH_WUNLOCK(ch); nptv6_free_config(cfg); return (0); } /* * Get or change nptv6 instance config. * Request: [ ipfw_obj_header [ ipfw_nptv6_cfg ] ] */ static int nptv6_config(struct ip_fw_chain *chain, ip_fw3_opheader *op, struct sockopt_data *sd) { return (EOPNOTSUPP); } /* * Lists all NPTv6 instances currently available in kernel. 
* Data layout (v0)(current): * Request: [ ipfw_obj_lheader ] * Reply: [ ipfw_obj_lheader ipfw_nptv6_cfg x N ] * * Returns 0 on success */ static int nptv6_list(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_lheader *olh; struct nptv6_dump_arg da; /* Check minimum header size */ if (sd->valsize < sizeof(ipfw_obj_lheader)) return (EINVAL); olh = (ipfw_obj_lheader *)ipfw_get_sopt_header(sd, sizeof(*olh)); IPFW_UH_RLOCK(ch); olh->count = ipfw_objhash_count_type(CHAIN_TO_SRV(ch), IPFW_TLV_NPTV6_NAME); olh->objsize = sizeof(ipfw_nptv6_cfg); olh->size = sizeof(*olh) + olh->count * olh->objsize; if (sd->valsize < olh->size) { IPFW_UH_RUNLOCK(ch); return (ENOMEM); } memset(&da, 0, sizeof(da)); da.ch = ch; da.sd = sd; ipfw_objhash_foreach_type(CHAIN_TO_SRV(ch), export_config_cb, &da, IPFW_TLV_NPTV6_NAME); IPFW_UH_RUNLOCK(ch); return (0); } #define __COPY_STAT_FIELD(_cfg, _stats, _field) \ (_stats)->_field = NPTV6STAT_FETCH(_cfg, _field) static void export_stats(struct ip_fw_chain *ch, struct nptv6_cfg *cfg, struct ipfw_nptv6_stats *stats) { __COPY_STAT_FIELD(cfg, stats, in2ex); __COPY_STAT_FIELD(cfg, stats, ex2in); __COPY_STAT_FIELD(cfg, stats, dropped); } /* * Get NPTv6 statistics. 
* Data layout (v0)(current): * Request: [ ipfw_obj_header ] * Reply: [ ipfw_obj_header ipfw_obj_ctlv [ uint64_t x N ]] * * Returns 0 on success */ static int nptv6_stats(struct ip_fw_chain *ch, ip_fw3_opheader *op, struct sockopt_data *sd) { struct ipfw_nptv6_stats stats; struct nptv6_cfg *cfg; ipfw_obj_header *oh; ipfw_obj_ctlv *ctlv; size_t sz; sz = sizeof(ipfw_obj_header) + sizeof(ipfw_obj_ctlv) + sizeof(stats); if (sd->valsize % sizeof(uint64_t)) return (EINVAL); if (sd->valsize < sz) return (ENOMEM); oh = (ipfw_obj_header *)ipfw_get_sopt_header(sd, sz); if (oh == NULL) return (EINVAL); if (ipfw_check_object_name_generic(oh->ntlv.name) != 0 || oh->ntlv.set >= IPFW_MAX_SETS) return (EINVAL); memset(&stats, 0, sizeof(stats)); IPFW_UH_RLOCK(ch); cfg = nptv6_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set); if (cfg == NULL) { IPFW_UH_RUNLOCK(ch); return (ESRCH); } export_stats(ch, cfg, &stats); IPFW_UH_RUNLOCK(ch); ctlv = (ipfw_obj_ctlv *)(oh + 1); memset(ctlv, 0, sizeof(*ctlv)); ctlv->head.type = IPFW_TLV_COUNTERS; ctlv->head.length = sz - sizeof(ipfw_obj_header); ctlv->count = sizeof(stats) / sizeof(uint64_t); ctlv->objsize = sizeof(uint64_t); ctlv->version = 1; memcpy(ctlv + 1, &stats, sizeof(stats)); return (0); } /* * Reset NPTv6 statistics. 
* Data layout (v0)(current): * Request: [ ipfw_obj_header ] * * Returns 0 on success */ static int nptv6_reset_stats(struct ip_fw_chain *ch, ip_fw3_opheader *op, struct sockopt_data *sd) { struct nptv6_cfg *cfg; ipfw_obj_header *oh; if (sd->valsize != sizeof(*oh)) return (EINVAL); oh = (ipfw_obj_header *)sd->kbuf; if (ipfw_check_object_name_generic(oh->ntlv.name) != 0 || oh->ntlv.set >= IPFW_MAX_SETS) return (EINVAL); IPFW_UH_WLOCK(ch); cfg = nptv6_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set); if (cfg == NULL) { IPFW_UH_WUNLOCK(ch); return (ESRCH); } COUNTER_ARRAY_ZERO(cfg->stats, NPTV6STATS); IPFW_UH_WUNLOCK(ch); return (0); } static struct ipfw_sopt_handler scodes[] = { - { IP_FW_NPTV6_CREATE, 0, HDIR_SET, nptv6_create }, - { IP_FW_NPTV6_DESTROY,0, HDIR_SET, nptv6_destroy }, - { IP_FW_NPTV6_CONFIG, 0, HDIR_BOTH, nptv6_config }, - { IP_FW_NPTV6_LIST, 0, HDIR_GET, nptv6_list }, - { IP_FW_NPTV6_STATS, 0, HDIR_GET, nptv6_stats }, - { IP_FW_NPTV6_RESET_STATS,0, HDIR_SET, nptv6_reset_stats }, + { IP_FW_NPTV6_CREATE, IP_FW3_OPVER, HDIR_SET, nptv6_create }, + { IP_FW_NPTV6_DESTROY, IP_FW3_OPVER, HDIR_SET, nptv6_destroy }, + { IP_FW_NPTV6_CONFIG, IP_FW3_OPVER, HDIR_BOTH,nptv6_config }, + { IP_FW_NPTV6_LIST, IP_FW3_OPVER, HDIR_GET, nptv6_list }, + { IP_FW_NPTV6_STATS, IP_FW3_OPVER, HDIR_GET, nptv6_stats }, + { IP_FW_NPTV6_RESET_STATS, IP_FW3_OPVER, HDIR_SET, nptv6_reset_stats }, }; static int -nptv6_classify(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype) +nptv6_classify(ipfw_insn *cmd0, uint32_t *puidx, uint8_t *ptype) { ipfw_insn *icmd; - icmd = cmd - 1; - NPTV6_DEBUG("opcode %d, arg1 %d, opcode0 %d, arg1 %d", - cmd->opcode, cmd->arg1, icmd->opcode, icmd->arg1); + icmd = cmd0 - F_LEN(cmd0); + NPTV6_DEBUG("opcode %u, kidx %u, opcode0 %u, kidx %u", + cmd0->opcode, insntod(cmd0, kidx)->kidx, + icmd->opcode, insntod(icmd, kidx)->kidx); if (icmd->opcode != O_EXTERNAL_ACTION || - icmd->arg1 != V_nptv6_eid) + insntod(icmd, kidx)->kidx != V_nptv6_eid) return (1); - *puidx 
= cmd->arg1; + *puidx = insntod(cmd0, kidx)->kidx; *ptype = 0; return (0); } static void -nptv6_update_arg1(ipfw_insn *cmd, uint16_t idx) +nptv6_update_kidx(ipfw_insn *cmd0, uint32_t idx) { - cmd->arg1 = idx; - NPTV6_DEBUG("opcode %d, arg1 -> %d", cmd->opcode, cmd->arg1); + insntod(cmd0, kidx)->kidx = idx; + NPTV6_DEBUG("opcode %u, kidx -> %u", cmd0->opcode, idx); } static int nptv6_findbyname(struct ip_fw_chain *ch, struct tid_info *ti, struct named_object **pno) { int err; err = ipfw_objhash_find_type(CHAIN_TO_SRV(ch), ti, IPFW_TLV_NPTV6_NAME, pno); NPTV6_DEBUG("uidx %u, type %u, err %d", ti->uidx, ti->type, err); return (err); } static struct named_object * -nptv6_findbykidx(struct ip_fw_chain *ch, uint16_t idx) +nptv6_findbykidx(struct ip_fw_chain *ch, uint32_t idx) { struct namedobj_instance *ni; struct named_object *no; IPFW_UH_WLOCK_ASSERT(ch); ni = CHAIN_TO_SRV(ch); no = ipfw_objhash_lookup_kidx(ni, idx); - KASSERT(no != NULL, ("NPT with index %d not found", idx)); + KASSERT(no != NULL, ("NPT with index %u not found", idx)); NPTV6_DEBUG("kidx %u -> %s", idx, no->name); return (no); } static int -nptv6_manage_sets(struct ip_fw_chain *ch, uint16_t set, uint8_t new_set, +nptv6_manage_sets(struct ip_fw_chain *ch, uint32_t set, uint8_t new_set, enum ipfw_sets_cmd cmd) { return (ipfw_obj_manage_sets(CHAIN_TO_SRV(ch), IPFW_TLV_NPTV6_NAME, set, new_set, cmd)); } static struct opcode_obj_rewrite opcodes[] = { { .opcode = O_EXTERNAL_INSTANCE, .etlv = IPFW_TLV_EACTION /* just show it isn't table */, .classifier = nptv6_classify, - .update = nptv6_update_arg1, + .update = nptv6_update_kidx, .find_byname = nptv6_findbyname, .find_bykidx = nptv6_findbykidx, .manage_sets = nptv6_manage_sets, }, }; static int destroy_config_cb(struct namedobj_instance *ni, struct named_object *no, void *arg) { struct nptv6_cfg *cfg; struct ip_fw_chain *ch; ch = (struct ip_fw_chain *)arg; IPFW_UH_WLOCK_ASSERT(ch); cfg = (struct nptv6_cfg *)SRV_OBJECT(ch, no->kidx); SRV_OBJECT(ch, no->kidx) = 
NULL; ipfw_objhash_del(ni, &cfg->no); ipfw_objhash_free_idx(ni, cfg->no.kidx); nptv6_free_config(cfg); return (0); } int nptv6_init(struct ip_fw_chain *ch, int first) { V_nptv6_eid = ipfw_add_eaction(ch, ipfw_nptv6, "nptv6"); if (V_nptv6_eid == 0) return (ENXIO); IPFW_ADD_SOPT_HANDLER(first, scodes); IPFW_ADD_OBJ_REWRITER(first, opcodes); return (0); } void nptv6_uninit(struct ip_fw_chain *ch, int last) { if (last && nptv6_ifaddr_event != NULL) EVENTHANDLER_DEREGISTER(ifaddr_event_ext, nptv6_ifaddr_event); IPFW_DEL_OBJ_REWRITER(last, opcodes); IPFW_DEL_SOPT_HANDLER(last, scodes); ipfw_del_eaction(ch, V_nptv6_eid); /* * Since we already have deregistered external action, * our named objects become unaccessible via rules, because * all rules were truncated by ipfw_del_eaction(). * So, we can unlink and destroy our named objects without holding * IPFW_WLOCK(). */ IPFW_UH_WLOCK(ch); ipfw_objhash_foreach_type(CHAIN_TO_SRV(ch), destroy_config_cb, ch, IPFW_TLV_NPTV6_NAME); V_nptv6_eid = 0; IPFW_UH_WUNLOCK(ch); } diff --git a/sys/netpfil/ipfw/pmod/tcpmod.c b/sys/netpfil/ipfw/pmod/tcpmod.c index e978394924e1..0338dc792c64 100644 --- a/sys/netpfil/ipfw/pmod/tcpmod.c +++ b/sys/netpfil/ipfw/pmod/tcpmod.c @@ -1,242 +1,242 @@ /*- * Copyright (c) 2017 Yandex LLC * Copyright (c) 2017 Andrey V. Elsukov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -VNET_DEFINE_STATIC(uint16_t, tcpmod_setmss_eid) = 0; +VNET_DEFINE_STATIC(uint32_t, tcpmod_setmss_eid) = 0; #define V_tcpmod_setmss_eid VNET(tcpmod_setmss_eid) static int tcpmod_setmss(struct mbuf **mp, struct tcphdr *tcp, int tlen, uint16_t mss) { struct mbuf *m; u_char *cp; int optlen, ret; uint16_t oldmss, csum; m = *mp; ret = IP_FW_DENY; if (m->m_len < m->m_pkthdr.len) { /* * We shouldn't have any data, IP packet contains only * TCP header with options. */ *mp = m = m_pullup(m, m->m_pkthdr.len); if (m == NULL) return (ret); } /* Parse TCP options. 
*/ for (tlen -= sizeof(struct tcphdr), cp = (u_char *)(tcp + 1); tlen > 0; tlen -= optlen, cp += optlen) { if (cp[0] == TCPOPT_EOL) break; if (cp[0] == TCPOPT_NOP) { optlen = 1; continue; } if (tlen < 2) break; optlen = cp[1]; if (optlen < 2 || optlen > tlen) break; if (cp[0] == TCPOPT_MAXSEG) { if (optlen != TCPOLEN_MAXSEG) break; ret = 0; /* report success */ bcopy(cp + 2, &oldmss, sizeof(oldmss)); /* Do not update lower MSS value */ if (ntohs(oldmss) <= ntohs(mss)) break; bcopy(&mss, cp + 2, sizeof(mss)); /* Update checksum if it is not delayed. */ if ((m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_TCP_IPV6)) == 0) { bcopy(&tcp->th_sum, &csum, sizeof(csum)); csum = cksum_adjust(csum, oldmss, mss); bcopy(&csum, &tcp->th_sum, sizeof(csum)); } break; } } return (ret); } #ifdef INET6 static int tcpmod_ipv6_setmss(struct mbuf **mp, uint16_t mss) { struct ip6_hdr *ip6; struct ip6_hbh *hbh; struct tcphdr *tcp; int hlen, plen, proto; ip6 = mtod(*mp, struct ip6_hdr *); hlen = sizeof(*ip6); proto = ip6->ip6_nxt; /* * Skip IPv6 extension headers and get the TCP header. * ipfw_chk() has already done this work. So we are sure that * we will not do an access to the out of bounds. For this * reason we skip some checks here. */ while (proto == IPPROTO_HOPOPTS || proto == IPPROTO_ROUTING || proto == IPPROTO_DSTOPTS) { hbh = mtodo(*mp, hlen); proto = hbh->ip6h_nxt; hlen += (hbh->ip6h_len + 1) << 3; } tcp = mtodo(*mp, hlen); plen = (*mp)->m_pkthdr.len - hlen; hlen = tcp->th_off << 2; /* We must have TCP options and enough data in a packet. 
*/ if (hlen <= sizeof(struct tcphdr) || hlen > plen) return (IP_FW_DENY); return (tcpmod_setmss(mp, tcp, hlen, mss)); } #endif /* INET6 */ #ifdef INET static int tcpmod_ipv4_setmss(struct mbuf **mp, uint16_t mss) { struct tcphdr *tcp; struct ip *ip; int hlen, plen; ip = mtod(*mp, struct ip *); hlen = ip->ip_hl << 2; tcp = mtodo(*mp, hlen); plen = (*mp)->m_pkthdr.len - hlen; hlen = tcp->th_off << 2; /* We must have TCP options and enough data in a packet. */ if (hlen <= sizeof(struct tcphdr) || hlen > plen) return (IP_FW_DENY); return (tcpmod_setmss(mp, tcp, hlen, mss)); } #endif /* INET */ /* * ipfw external action handler. */ static int ipfw_tcpmod(struct ip_fw_chain *chain, struct ip_fw_args *args, ipfw_insn *cmd, int *done) { ipfw_insn *icmd; int ret; *done = 0; /* try next rule if not matched */ ret = IP_FW_DENY; - icmd = cmd + 1; + icmd = cmd + F_LEN(cmd); if (cmd->opcode != O_EXTERNAL_ACTION || - cmd->arg1 != V_tcpmod_setmss_eid || + insntod(cmd, kidx)->kidx != V_tcpmod_setmss_eid || icmd->opcode != O_EXTERNAL_DATA || icmd->len != F_INSN_SIZE(ipfw_insn)) return (ret); /* * NOTE: ipfw_chk() can set f_id.proto from IPv6 fragment header, * but f_id._flags can be filled only from real TCP header. * * NOTE: ipfw_chk() drops very short packets in the PULLUP_TO() * macro. But we need to check that mbuf is contiguous more than * IP+IP_options/IP_extensions+tcphdr length, because TCP header * must have TCP options, and ipfw_chk() does PULLUP_TO() size of * struct tcphdr. * * NOTE: we require only the presence of SYN flag. User should * properly configure the rule to select the direction of packets, * that should be modified. 
*/ if (args->f_id.proto != IPPROTO_TCP || (args->f_id._flags & TH_SYN) == 0) return (ret); switch (args->f_id.addr_type) { #ifdef INET case 4: ret = tcpmod_ipv4_setmss(&args->m, htons(icmd->arg1)); break; #endif #ifdef INET6 case 6: ret = tcpmod_ipv6_setmss(&args->m, htons(icmd->arg1)); break; #endif } /* * We return zero in both @ret and @done on success, and ipfw_chk() * will update rule counters. Otherwise a packet will not be matched * by rule. */ return (ret); } int tcpmod_init(struct ip_fw_chain *ch, int first) { V_tcpmod_setmss_eid = ipfw_add_eaction(ch, ipfw_tcpmod, "tcp-setmss"); if (V_tcpmod_setmss_eid == 0) return (ENXIO); return (0); } void tcpmod_uninit(struct ip_fw_chain *ch, int last) { ipfw_del_eaction(ch, V_tcpmod_setmss_eid); V_tcpmod_setmss_eid = 0; } diff --git a/sys/sys/socket.h b/sys/sys/socket.h index 2ef455991491..064da1cb95be 100644 --- a/sys/sys/socket.h +++ b/sys/sys/socket.h @@ -1,742 +1,744 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1985, 1986, 1988, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _SYS_SOCKET_H_ #define _SYS_SOCKET_H_ #include #include #include #include #include /* * Definitions related to sockets: types, address families, options. */ /* * Data types. */ #if __BSD_VISIBLE #ifndef _GID_T_DECLARED typedef __gid_t gid_t; #define _GID_T_DECLARED #endif #ifndef _OFF_T_DECLARED typedef __off_t off_t; #define _OFF_T_DECLARED #endif #ifndef _PID_T_DECLARED typedef __pid_t pid_t; #define _PID_T_DECLARED #endif #endif #ifndef _SA_FAMILY_T_DECLARED typedef __sa_family_t sa_family_t; #define _SA_FAMILY_T_DECLARED #endif #ifndef _SOCKLEN_T_DECLARED typedef __socklen_t socklen_t; #define _SOCKLEN_T_DECLARED #endif #ifndef _SSIZE_T_DECLARED typedef __ssize_t ssize_t; #define _SSIZE_T_DECLARED #endif #if __BSD_VISIBLE #ifndef _UID_T_DECLARED typedef __uid_t uid_t; #define _UID_T_DECLARED #endif #endif #ifndef _UINT32_T_DECLARED typedef __uint32_t uint32_t; #define _UINT32_T_DECLARED #endif #ifndef _UINTPTR_T_DECLARED typedef __uintptr_t uintptr_t; #define _UINTPTR_T_DECLARED #endif /* * Types */ #define SOCK_STREAM 1 /* stream socket */ #define SOCK_DGRAM 2 /* datagram socket */ #define SOCK_RAW 3 /* raw-protocol interface */ #if __BSD_VISIBLE #define SOCK_RDM 4 /* reliably-delivered message */ 
#endif #define SOCK_SEQPACKET 5 /* sequenced packet stream */ #if __BSD_VISIBLE /* * Creation flags, OR'ed into socket() and socketpair() type argument. */ #define SOCK_CLOEXEC 0x10000000 #define SOCK_NONBLOCK 0x20000000 #ifdef _KERNEL /* * Flags for accept1(), kern_accept4() and solisten_dequeue, in addition * to SOCK_CLOEXEC and SOCK_NONBLOCK. */ #define ACCEPT4_INHERIT 0x1 #define ACCEPT4_COMPAT 0x2 #endif /* _KERNEL */ #endif /* __BSD_VISIBLE */ /* * Option flags per-socket. */ #define SO_DEBUG 0x00000001 /* turn on debugging info recording */ #define SO_ACCEPTCONN 0x00000002 /* socket has had listen() */ #define SO_REUSEADDR 0x00000004 /* allow local address reuse */ #define SO_KEEPALIVE 0x00000008 /* keep connections alive */ #define SO_DONTROUTE 0x00000010 /* just use interface addresses */ #define SO_BROADCAST 0x00000020 /* permit sending of broadcast msgs */ #if __BSD_VISIBLE #define SO_USELOOPBACK 0x00000040 /* bypass hardware when possible */ #endif #define SO_LINGER 0x00000080 /* linger on close if data present */ #define SO_OOBINLINE 0x00000100 /* leave received OOB data in line */ #if __BSD_VISIBLE #define SO_REUSEPORT 0x00000200 /* allow local address & port reuse */ #define SO_TIMESTAMP 0x00000400 /* timestamp received dgram traffic */ #define SO_NOSIGPIPE 0x00000800 /* no SIGPIPE from EPIPE */ #define SO_ACCEPTFILTER 0x00001000 /* there is an accept filter */ #define SO_BINTIME 0x00002000 /* timestamp received dgram traffic */ #endif #define SO_NO_OFFLOAD 0x00004000 /* socket cannot be offloaded */ #define SO_NO_DDP 0x00008000 /* disable direct data placement */ #define SO_REUSEPORT_LB 0x00010000 /* reuse with load balancing */ #define SO_RERROR 0x00020000 /* keep track of receive errors */ /* * Additional options, not kept in so_options. 
*/ #define SO_SNDBUF 0x1001 /* send buffer size */ #define SO_RCVBUF 0x1002 /* receive buffer size */ #define SO_SNDLOWAT 0x1003 /* send low-water mark */ #define SO_RCVLOWAT 0x1004 /* receive low-water mark */ #define SO_SNDTIMEO 0x1005 /* send timeout */ #define SO_RCVTIMEO 0x1006 /* receive timeout */ #define SO_ERROR 0x1007 /* get error status and clear */ #define SO_TYPE 0x1008 /* get socket type */ #if __BSD_VISIBLE #define SO_LABEL 0x1009 /* socket's MAC label */ #define SO_PEERLABEL 0x1010 /* socket's peer's MAC label */ #define SO_LISTENQLIMIT 0x1011 /* socket's backlog limit */ #define SO_LISTENQLEN 0x1012 /* socket's complete queue length */ #define SO_LISTENINCQLEN 0x1013 /* socket's incomplete queue length */ #define SO_FIB 0x1014 /* get or set socket FIB */ #define SO_SETFIB SO_FIB /* backward compat alias */ #define SO_USER_COOKIE 0x1015 /* user cookie (dummynet etc.) */ #define SO_PROTOCOL 0x1016 /* get socket protocol (Linux name) */ #define SO_PROTOTYPE SO_PROTOCOL /* alias for SO_PROTOCOL (SunOS name) */ #define SO_TS_CLOCK 0x1017 /* clock type used for SO_TIMESTAMP */ #define SO_MAX_PACING_RATE 0x1018 /* socket's max TX pacing rate (Linux name) */ #define SO_DOMAIN 0x1019 /* get socket domain */ #define SO_SPLICE 0x1023 /* splice data to other socket */ #endif #if __BSD_VISIBLE #define SO_TS_REALTIME_MICRO 0 /* microsecond resolution, realtime */ #define SO_TS_BINTIME 1 /* sub-nanosecond resolution, realtime */ #define SO_TS_REALTIME 2 /* nanosecond resolution, realtime */ #define SO_TS_MONOTONIC 3 /* nanosecond resolution, monotonic */ #define SO_TS_DEFAULT SO_TS_REALTIME_MICRO #define SO_TS_CLOCK_MAX SO_TS_MONOTONIC #endif /* * Space reserved for new socket options added by third-party vendors. * This range applies to all socket option levels. New socket options * in FreeBSD should always use an option value less than SO_VENDOR. */ #if __BSD_VISIBLE #define SO_VENDOR 0x80000000 #endif /* * Structure used for manipulating linger option. 
*/ struct linger { int l_onoff; /* option on/off */ int l_linger; /* linger time */ }; #if __BSD_VISIBLE struct accept_filter_arg { char af_name[16]; char af_arg[256-16]; }; #endif /* * Level number for (get/set)sockopt() to apply to socket itself. */ #define SOL_SOCKET 0xffff /* options for socket level */ /* * Address families. */ #define AF_UNSPEC 0 /* unspecified */ #if __BSD_VISIBLE #define AF_LOCAL AF_UNIX /* local to host (pipes, portals) */ #endif #define AF_UNIX 1 /* standardized name for AF_LOCAL */ #define AF_INET 2 /* internetwork: UDP, TCP, etc. */ #if __BSD_VISIBLE #define AF_IMPLINK 3 /* arpanet imp addresses */ #define AF_PUP 4 /* pup protocols: e.g. BSP */ #define AF_CHAOS 5 /* mit CHAOS protocols */ #define AF_NETBIOS 6 /* SMB protocols */ #define AF_ISO 7 /* ISO protocols */ #define AF_OSI AF_ISO #define AF_ECMA 8 /* European computer manufacturers */ #define AF_DATAKIT 9 /* datakit protocols */ #define AF_CCITT 10 /* CCITT protocols, X.25 etc */ #define AF_SNA 11 /* IBM SNA */ #define AF_DECnet 12 /* DECnet */ #define AF_DLI 13 /* DEC Direct data link interface */ #define AF_LAT 14 /* LAT */ #define AF_HYLINK 15 /* NSC Hyperchannel */ #define AF_APPLETALK 16 /* Apple Talk */ #define AF_ROUTE 17 /* Internal Routing Protocol */ #define AF_LINK 18 /* Link layer interface */ #define pseudo_AF_XTP 19 /* eXpress Transfer Protocol (no AF) */ #define AF_COIP 20 /* connection-oriented IP, aka ST II */ #define AF_CNT 21 /* Computer Network Technology */ #define pseudo_AF_RTIP 22 /* Help Identify RTIP packets */ #define AF_IPX 23 /* Novell Internet Protocol */ #define AF_SIP 24 /* Simple Internet Protocol */ #define pseudo_AF_PIP 25 /* Help Identify PIP packets */ #define AF_ISDN 26 /* Integrated Services Digital Network*/ #define AF_E164 AF_ISDN /* CCITT E.164 recommendation */ #define pseudo_AF_KEY 27 /* Internal key-management function */ #endif #define AF_INET6 28 /* IPv6 */ #if __BSD_VISIBLE #define AF_NATM 29 /* native ATM access */ #define AF_ATM 30 
/* ATM */ #define pseudo_AF_HDRCMPLT 31 /* Used by BPF to not rewrite headers * in interface output routine */ #define AF_NETGRAPH 32 /* Netgraph sockets */ #define AF_SLOW 33 /* 802.3ad slow protocol */ #define AF_SCLUSTER 34 /* Sitara cluster protocol */ #define AF_ARP 35 #define AF_BLUETOOTH 36 /* Bluetooth sockets */ #define AF_IEEE80211 37 /* IEEE 802.11 protocol */ #define AF_NETLINK 38 /* Netlink protocol */ #define AF_INET_SDP 40 /* OFED Socket Direct Protocol ipv4 */ #define AF_INET6_SDP 42 /* OFED Socket Direct Protocol ipv6 */ #define AF_HYPERV 43 /* HyperV sockets */ #define AF_DIVERT 44 /* divert(4) */ -#define AF_MAX 44 +#define AF_IPFWLOG 46 +#define AF_MAX 46 /* * When allocating a new AF_ constant, please only allocate * even numbered constants for FreeBSD until 134 as odd numbered AF_ * constants 39-133 are now reserved for vendors. */ #define AF_VENDOR00 39 #define AF_VENDOR01 41 #define AF_VENDOR03 45 #define AF_VENDOR04 47 #define AF_VENDOR05 49 #define AF_VENDOR06 51 #define AF_VENDOR07 53 #define AF_VENDOR08 55 #define AF_VENDOR09 57 #define AF_VENDOR10 59 #define AF_VENDOR11 61 #define AF_VENDOR12 63 #define AF_VENDOR13 65 #define AF_VENDOR14 67 #define AF_VENDOR15 69 #define AF_VENDOR16 71 #define AF_VENDOR17 73 #define AF_VENDOR18 75 #define AF_VENDOR19 77 #define AF_VENDOR20 79 #define AF_VENDOR21 81 #define AF_VENDOR22 83 #define AF_VENDOR23 85 #define AF_VENDOR24 87 #define AF_VENDOR25 89 #define AF_VENDOR26 91 #define AF_VENDOR27 93 #define AF_VENDOR28 95 #define AF_VENDOR29 97 #define AF_VENDOR30 99 #define AF_VENDOR31 101 #define AF_VENDOR32 103 #define AF_VENDOR33 105 #define AF_VENDOR34 107 #define AF_VENDOR35 109 #define AF_VENDOR36 111 #define AF_VENDOR37 113 #define AF_VENDOR38 115 #define AF_VENDOR39 117 #define AF_VENDOR40 119 #define AF_VENDOR41 121 #define AF_VENDOR42 123 #define AF_VENDOR43 125 #define AF_VENDOR44 127 #define AF_VENDOR45 129 #define AF_VENDOR46 131 #define AF_VENDOR47 133 #endif /* * Structure used by 
kernel to store most * addresses. */ struct sockaddr { unsigned char sa_len; /* total length */ sa_family_t sa_family; /* address family */ char sa_data[14]; /* actually longer; address value */ }; #if __BSD_VISIBLE #define SOCK_MAXADDRLEN 255 /* longest possible addresses */ /* * Structure used by kernel to pass protocol * information in raw sockets. */ struct sockproto { unsigned short sp_family; /* address family */ unsigned short sp_protocol; /* protocol */ }; #endif #include #if __BSD_VISIBLE /* * Protocol families, same as address families for now. */ #define PF_UNSPEC AF_UNSPEC #define PF_LOCAL AF_LOCAL #define PF_UNIX PF_LOCAL /* backward compatibility */ #define PF_INET AF_INET #define PF_IMPLINK AF_IMPLINK #define PF_PUP AF_PUP #define PF_CHAOS AF_CHAOS #define PF_NETBIOS AF_NETBIOS #define PF_ISO AF_ISO #define PF_OSI AF_ISO #define PF_ECMA AF_ECMA #define PF_DATAKIT AF_DATAKIT #define PF_CCITT AF_CCITT #define PF_SNA AF_SNA #define PF_DECnet AF_DECnet #define PF_DLI AF_DLI #define PF_LAT AF_LAT #define PF_HYLINK AF_HYLINK #define PF_APPLETALK AF_APPLETALK #define PF_ROUTE AF_ROUTE #define PF_LINK AF_LINK #define PF_XTP pseudo_AF_XTP /* really just proto family, no AF */ #define PF_COIP AF_COIP #define PF_CNT AF_CNT #define PF_SIP AF_SIP #define PF_IPX AF_IPX #define PF_RTIP pseudo_AF_RTIP /* same format as AF_INET */ #define PF_PIP pseudo_AF_PIP #define PF_ISDN AF_ISDN #define PF_KEY pseudo_AF_KEY #define PF_INET6 AF_INET6 #define PF_NATM AF_NATM #define PF_ATM AF_ATM #define PF_NETGRAPH AF_NETGRAPH #define PF_SLOW AF_SLOW #define PF_SCLUSTER AF_SCLUSTER #define PF_ARP AF_ARP #define PF_BLUETOOTH AF_BLUETOOTH #define PF_IEEE80211 AF_IEEE80211 #define PF_NETLINK AF_NETLINK #define PF_INET_SDP AF_INET_SDP #define PF_INET6_SDP AF_INET6_SDP #define PF_DIVERT AF_DIVERT +#define PF_IPFWLOG AF_IPFWLOG #define PF_MAX AF_MAX /* * Definitions for network related sysctl, CTL_NET. * * Second level is protocol family. * Third level is protocol number. 
* * Further levels are defined by the individual families. */ /* * PF_ROUTE - Routing table * * Three additional levels are defined: * Fourth: address family, 0 is wildcard * Fifth: type of info, defined below * Sixth: flag(s) to mask with for NET_RT_FLAGS */ #define NET_RT_DUMP 1 /* dump; may limit to a.f. */ #define NET_RT_FLAGS 2 /* by flags, e.g. RESOLVING */ #define NET_RT_IFLIST 3 /* survey interface list */ #define NET_RT_IFMALIST 4 /* return multicast address list */ #define NET_RT_IFLISTL 5 /* Survey interface list, using 'l'en * versions of msghdr structs. */ #define NET_RT_NHOP 6 /* dump routing nexthops */ #define NET_RT_NHGRP 7 /* dump routing nexthop groups */ #endif /* __BSD_VISIBLE */ /* * Maximum queue length specifiable by listen. */ #define SOMAXCONN 128 /* * Message header for recvmsg and sendmsg calls. * Used value-result for recvmsg, value only for sendmsg. */ struct msghdr { void *msg_name; /* optional address */ socklen_t msg_namelen; /* size of address */ struct iovec *msg_iov; /* scatter/gather array */ int msg_iovlen; /* # elements in msg_iov */ void *msg_control; /* ancillary data, see below */ socklen_t msg_controllen; /* ancillary data buffer len */ int msg_flags; /* flags on received message */ }; #define MSG_OOB 0x00000001 /* process out-of-band data */ #define MSG_PEEK 0x00000002 /* peek at incoming message */ #define MSG_DONTROUTE 0x00000004 /* send without using routing tables */ #define MSG_EOR 0x00000008 /* data completes record */ #define MSG_TRUNC 0x00000010 /* data discarded before delivery */ #define MSG_CTRUNC 0x00000020 /* control data lost before delivery */ #define MSG_WAITALL 0x00000040 /* wait for full request or error */ #if __BSD_VISIBLE #define MSG_DONTWAIT 0x00000080 /* this message should be nonblocking */ #define MSG_EOF 0x00000100 /* data completes connection */ /* 0x00000200 unused */ /* 0x00000400 unused */ /* 0x00000800 unused */ /* 0x00001000 unused */ #define MSG_NOTIFICATION 0x00002000 /* SCTP notification 
 */
#define	MSG_NBIO	 0x00004000	/* FIONBIO mode, used by fifofs */
#define	MSG_COMPAT	 0x00008000	/* used in sendit() */
#endif
#ifdef _KERNEL
#define	MSG_SOCALLBCK	 0x00010000	/* for use by socket callbacks - soreceive (TCP) */
#endif
#if __POSIX_VISIBLE >= 200809
#define	MSG_NOSIGNAL	 0x00020000	/* do not generate SIGPIPE on EOF */
#endif
#if __BSD_VISIBLE
#define	MSG_CMSG_CLOEXEC 0x00040000	/* make received fds close-on-exec */
#define	MSG_WAITFORONE	 0x00080000	/* for recvmmsg() */
#endif
#ifdef _KERNEL
#define	MSG_MORETOCOME	 0x00100000	/* additional data pending */
#define	MSG_TLSAPPDATA	 0x00200000	/* do not soreceive() alert rec. (TLS) */
#endif

/*
 * Header for ancillary data objects in msg_control buffer.
 * Used for additional information with/about a datagram
 * not expressible by flags.  The format is a sequence
 * of message elements headed by cmsghdr structures.
 */
struct cmsghdr {
	socklen_t	cmsg_len;	/* data byte count, including hdr */
	int		cmsg_level;	/* originating protocol */
	int		cmsg_type;	/* protocol-specific type */
/* followed by	u_char  cmsg_data[]; */
};

#if __BSD_VISIBLE
/*
 * While we may have more groups than this, the cmsgcred struct must
 * be able to fit in an mbuf and we have historically supported a
 * maximum of 16 groups.
 */
#define	CMGROUP_MAX	16

/*
 * Credentials structure, used to verify the identity of a peer
 * process that has sent us a message.  This is allocated by the
 * peer process but filled in by the kernel.  This prevents the
 * peer from lying about its identity.  (Note that cmcred_groups[0]
 * is the effective GID.)
 */
struct cmsgcred {
	pid_t	cmcred_pid;		/* PID of sending process */
	uid_t	cmcred_uid;		/* real UID of sending process */
	uid_t	cmcred_euid;		/* effective UID of sending process */
	gid_t	cmcred_gid;		/* real GID of sending process */
	short	cmcred_ngroups;		/* number of groups */
	gid_t	cmcred_groups[CMGROUP_MAX];	/* groups */
};

/*
 * Socket credentials (LOCAL_CREDS).
 */
struct sockcred {
	uid_t	sc_uid;			/* real user id */
	uid_t	sc_euid;		/* effective user id */
	gid_t	sc_gid;			/* real group id */
	gid_t	sc_egid;		/* effective group id */
	int	sc_ngroups;		/* number of supplemental groups */
	gid_t	sc_groups[1];		/* variable length */
};

/*
 * Compute size of a sockcred structure with groups.
 * The "- 1" accounts for the one gid_t already included in sc_groups[1].
 */
#define	SOCKCREDSIZE(ngrps) \
	(sizeof(struct sockcred) + (sizeof(gid_t) * ((ngrps) - 1)))

/*
 * Socket credentials (LOCAL_CREDS_PERSISTENT).
 */
struct sockcred2 {
	int	sc_version;		/* version of this structure */
	pid_t	sc_pid;			/* PID of sending process */
	uid_t	sc_uid;			/* real user id */
	uid_t	sc_euid;		/* effective user id */
	gid_t	sc_gid;			/* real group id */
	gid_t	sc_egid;		/* effective group id */
	int	sc_ngroups;		/* number of supplemental groups */
	gid_t	sc_groups[1];		/* variable length */
};
/* As with SOCKCREDSIZE, sc_groups[1] already holds the first gid_t. */
#define	SOCKCRED2SIZE(ngrps) \
	(sizeof(struct sockcred2) + (sizeof(gid_t) * ((ngrps) - 1)))

#endif /* __BSD_VISIBLE */

/* given pointer to struct cmsghdr, return pointer to data */
#define	CMSG_DATA(cmsg)		((unsigned char *)(cmsg) + \
				    _ALIGN(sizeof(struct cmsghdr)))

/*
 * Given pointer to struct cmsghdr, return pointer to next cmsghdr.
 * Yields NULL when advancing past the current element would step
 * beyond msg_control + msg_controllen (i.e. no further element fits).
 */
#define	CMSG_NXTHDR(mhdr, cmsg)	\
	((char *)(cmsg) == (char *)0 ? CMSG_FIRSTHDR(mhdr) : \
	    ((char *)(cmsg) + _ALIGN(((struct cmsghdr *)(cmsg))->cmsg_len) + \
	    _ALIGN(sizeof(struct cmsghdr)) > \
	    (char *)(mhdr)->msg_control + (mhdr)->msg_controllen) ? \
	    (struct cmsghdr *)0 : \
	    (struct cmsghdr *)(void *)((char *)(cmsg) + \
	    _ALIGN(((struct cmsghdr *)(cmsg))->cmsg_len)))

/*
 * RFC 2292 requires to check msg_controllen, in case that the kernel returns
 * an empty list for some reasons.
 */
#define	CMSG_FIRSTHDR(mhdr) \
	((mhdr)->msg_controllen >= sizeof(struct cmsghdr) ? \
	    (struct cmsghdr *)(mhdr)->msg_control : \
	    (struct cmsghdr *)0)

#if __BSD_VISIBLE
/* RFC 2292 additions */
#define	CMSG_SPACE(l)		(_ALIGN(sizeof(struct cmsghdr)) + _ALIGN(l))
#define	CMSG_LEN(l)		(_ALIGN(sizeof(struct cmsghdr)) + (l))
#endif

#ifdef _KERNEL
#define	CMSG_ALIGN(n)	_ALIGN(n)
#endif

/* "Socket"-level control message types: */
#define	SCM_RIGHTS	0x01		/* access rights (array of int) */
#if __BSD_VISIBLE
#define	SCM_TIMESTAMP	0x02		/* timestamp (struct timeval) */
#define	SCM_CREDS	0x03		/* process creds (struct cmsgcred) */
#define	SCM_BINTIME	0x04		/* timestamp (struct bintime) */
#define	SCM_REALTIME	0x05		/* timestamp (struct timespec) */
#define	SCM_MONOTONIC	0x06		/* timestamp (struct timespec) */
#define	SCM_TIME_INFO	0x07		/* timestamp info */
#define	SCM_CREDS2	0x08		/* process creds (struct sockcred2) */

struct sock_timestamp_info {
	__uint32_t	st_info_flags;
	__uint32_t	st_info_pad0;
	__uint64_t	st_info_rsv[7];
};
#define	ST_INFO_HW		0x0001	/* SCM_TIMESTAMP was hw */
#define	ST_INFO_HW_HPREC	0x0002	/* SCM_TIMESTAMP was hw-assisted on entrance */
#endif

#if __BSD_VISIBLE
/*
 * 4.3 compat sockaddr, move to compat file later
 */
struct osockaddr {
	unsigned short sa_family;	/* address family */
	char	sa_data[14];		/* up to 14 bytes of direct address */
};

/*
 * 4.3-compat message header (move to compat file later).
 */
struct omsghdr {
	char	*msg_name;		/* optional address */
	int	msg_namelen;		/* size of address */
	struct	iovec *msg_iov;		/* scatter/gather array */
	int	msg_iovlen;		/* # elements in msg_iov */
	char	*msg_accrights;		/* access rights sent/received */
	int	msg_accrightslen;
};
#endif

/*
 * howto arguments for shutdown(2), specified by Posix.1g.
*/ enum shutdown_how { SHUT_RD = 0, /* shut down the reading side */ #define SHUT_RD SHUT_RD SHUT_WR, /* shut down the writing side */ #define SHUT_WR SHUT_WR SHUT_RDWR /* shut down both sides */ #define SHUT_RDWR SHUT_RDWR }; #if __BSD_VISIBLE /* * sendfile(2) header/trailer struct */ struct sf_hdtr { struct iovec *headers; /* pointer to an array of header struct iovec's */ int hdr_cnt; /* number of header iovec's */ struct iovec *trailers; /* pointer to an array of trailer struct iovec's */ int trl_cnt; /* number of trailer iovec's */ }; /* * Sendfile-specific flag(s) */ #define SF_NODISKIO 0x00000001 #define SF_MNOWAIT 0x00000002 /* obsolete */ #define SF_SYNC 0x00000004 #define SF_USER_READAHEAD 0x00000008 #define SF_NOCACHE 0x00000010 #define SF_FLAGS(rh, flags) (((rh) << 16) | (flags)) #ifdef _KERNEL #define SF_READAHEAD(flags) ((flags) >> 16) #endif /* _KERNEL */ /* * Sendmmsg/recvmmsg specific structure(s) */ struct mmsghdr { struct msghdr msg_hdr; /* message header */ ssize_t msg_len; /* message length */ }; /* * Structure used for manipulating splice option. 
*/ struct splice { int sp_fd; /* drain socket file descriptor */ off_t sp_max; /* if set, maximum bytes to splice */ struct timeval sp_idle; /* idle timeout */ }; #endif /* __BSD_VISIBLE */ #if defined(_FORTIFY_SOURCE) && _FORTIFY_SOURCE > 0 #include #endif #ifndef _KERNEL #include __BEGIN_DECLS int accept(int, struct sockaddr * __restrict, socklen_t * __restrict); int bind(int, const struct sockaddr *, socklen_t); int connect(int, const struct sockaddr *, socklen_t); #if __BSD_VISIBLE int accept4(int, struct sockaddr * __restrict, socklen_t * __restrict, int); int bindat(int, int, const struct sockaddr *, socklen_t); int connectat(int, int, const struct sockaddr *, socklen_t); #endif int getpeername(int, struct sockaddr * __restrict, socklen_t * __restrict); int getsockname(int, struct sockaddr * __restrict, socklen_t * __restrict); int getsockopt(int, int, int, void * __restrict, socklen_t * __restrict); int listen(int, int); ssize_t recv(int, void *, size_t, int); ssize_t recvfrom(int, void *, size_t, int, struct sockaddr * __restrict, socklen_t * __restrict); ssize_t recvmsg(int, struct msghdr *, int); #if __BSD_VISIBLE struct timespec; ssize_t recvmmsg(int, struct mmsghdr * __restrict, size_t, int, const struct timespec * __restrict); #endif ssize_t send(int, const void *, size_t, int); ssize_t sendto(int, const void *, size_t, int, const struct sockaddr *, socklen_t); ssize_t sendmsg(int, const struct msghdr *, int); #if __BSD_VISIBLE int sendfile(int, int, off_t, size_t, struct sf_hdtr *, off_t *, int); ssize_t sendmmsg(int, struct mmsghdr * __restrict, size_t, int); int setfib(int); #endif int setsockopt(int, int, int, const void *, socklen_t); int shutdown(int, int); int sockatmark(int); int socket(int, int, int); int socketpair(int, int, int, int *); __END_DECLS #endif /* !_KERNEL */ #ifdef _KERNEL struct socket; int so_options_get(const struct socket *); void so_options_set(struct socket *, int); int so_error_get(const struct socket *); void 
so_error_set(struct socket *, int); #endif /* _KERNEL */ #endif /* !_SYS_SOCKET_H_ */