diff --git a/sys/dev/xen/netback/netback_unit_tests.c b/sys/dev/xen/netback/netback_unit_tests.c index e5a000c872d8..af88d5ced498 100644 --- a/sys/dev/xen/netback/netback_unit_tests.c +++ b/sys/dev/xen/netback/netback_unit_tests.c @@ -1,2523 +1,2523 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2009-2011 Spectra Logic Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. * * Authors: Justin T. Gibbs (Spectra Logic Corporation) * Alan Somers (Spectra Logic Corporation) * John Suykerbuyk (Spectra Logic Corporation) */ #include +#include /** * \file netback_unit_tests.c * * \brief Unit tests for the Xen netback driver. * * Due to the driver's use of static functions, these tests cannot be compiled * standalone; they must be #include'd from the driver's .c file. */ /** Helper macro used to snprintf to a buffer and update the buffer pointer */ #define SNCATF(buffer, buflen, ...) do { \ size_t new_chars = snprintf(buffer, buflen, __VA_ARGS__); \ buffer += new_chars; \ /* be careful; snprintf's return value can be > buflen */ \ buflen -= MIN(buflen, new_chars); \ } while (0) /* STRINGIFY and TOSTRING are used only to help turn __LINE__ into a string */ #define STRINGIFY(x) #x #define TOSTRING(x) STRINGIFY(x) /** * Writes an error message to buffer if cond is false * Note the implied parameters buffer and * buflen */ #define XNB_ASSERT(cond) ({ \ int passed = (cond); \ char *_buffer = (buffer); \ size_t _buflen = (buflen); \ if (! passed) { \ strlcat(_buffer, __func__, _buflen); \ strlcat(_buffer, ":" TOSTRING(__LINE__) \ " Assertion Error: " #cond "\n", _buflen); \ } \ }) /** * The signature used by all testcases. 
If the test writes anything * to buffer, then it will be considered a failure * \param buffer Return storage for error messages * \param buflen The space available in the buffer */ typedef void testcase_t(char *buffer, size_t buflen); /** * Signature used by setup functions * \return nonzero on error */ typedef int setup_t(void); typedef void teardown_t(void); /** A simple test fixture comprising setup, teardown, and test */ struct test_fixture { /** Will be run before the test to allocate and initialize variables */ setup_t *setup; /** Will be run if setup succeeds */ testcase_t *test; /** Cleans up test data whether or not the setup succeeded */ teardown_t *teardown; }; typedef struct test_fixture test_fixture_t; static int xnb_get1pkt(struct xnb_pkt *pkt, size_t size, uint16_t flags); static int xnb_unit_test_runner(test_fixture_t const tests[], int ntests, char *buffer, size_t buflen); static int __unused null_setup(void) { return 0; } static void __unused null_teardown(void) { } static setup_t setup_pvt_data; static teardown_t teardown_pvt_data; static testcase_t xnb_ring2pkt_emptyring; static testcase_t xnb_ring2pkt_1req; static testcase_t xnb_ring2pkt_2req; static testcase_t xnb_ring2pkt_3req; static testcase_t xnb_ring2pkt_extra; static testcase_t xnb_ring2pkt_partial; static testcase_t xnb_ring2pkt_wraps; static testcase_t xnb_txpkt2rsp_emptypkt; static testcase_t xnb_txpkt2rsp_1req; static testcase_t xnb_txpkt2rsp_extra; static testcase_t xnb_txpkt2rsp_long; static testcase_t xnb_txpkt2rsp_invalid; static testcase_t xnb_txpkt2rsp_error; static testcase_t xnb_txpkt2rsp_wraps; static testcase_t xnb_pkt2mbufc_empty; static testcase_t xnb_pkt2mbufc_short; static testcase_t xnb_pkt2mbufc_csum; static testcase_t xnb_pkt2mbufc_1cluster; static testcase_t xnb_pkt2mbufc_largecluster; static testcase_t xnb_pkt2mbufc_2cluster; static testcase_t xnb_txpkt2gnttab_empty; static testcase_t xnb_txpkt2gnttab_short; static testcase_t xnb_txpkt2gnttab_2req; static testcase_t xnb_txpkt2gnttab_2cluster; static testcase_t xnb_update_mbufc_short; static testcase_t xnb_update_mbufc_2req; static testcase_t xnb_update_mbufc_2cluster; static testcase_t xnb_mbufc2pkt_empty; static testcase_t xnb_mbufc2pkt_short; static testcase_t xnb_mbufc2pkt_1cluster; static testcase_t xnb_mbufc2pkt_2short; static testcase_t xnb_mbufc2pkt_long; static testcase_t xnb_mbufc2pkt_extra; static testcase_t xnb_mbufc2pkt_nospace; static testcase_t xnb_rxpkt2gnttab_empty; static testcase_t xnb_rxpkt2gnttab_short; static testcase_t xnb_rxpkt2gnttab_2req; static testcase_t xnb_rxpkt2rsp_empty; static testcase_t xnb_rxpkt2rsp_short; static testcase_t xnb_rxpkt2rsp_extra; static testcase_t xnb_rxpkt2rsp_2short; static testcase_t xnb_rxpkt2rsp_2slots; static testcase_t xnb_rxpkt2rsp_copyerror; static testcase_t xnb_sscanf_llu; static testcase_t xnb_sscanf_lld; static testcase_t xnb_sscanf_hhu; static testcase_t xnb_sscanf_hhd; static testcase_t xnb_sscanf_hhn; #if defined(INET) || defined(INET6) /* TODO: add test cases for xnb_add_mbuf_cksum for IPV6 tcp and udp */ static testcase_t xnb_add_mbuf_cksum_arp; static testcase_t xnb_add_mbuf_cksum_tcp; static testcase_t xnb_add_mbuf_cksum_udp; static testcase_t xnb_add_mbuf_cksum_icmp; static testcase_t xnb_add_mbuf_cksum_tcp_swcksum; static void xnb_fill_eh_and_ip(struct mbuf *m, uint16_t ip_len, uint16_t ip_id, uint16_t ip_p, uint16_t ip_off, uint16_t ip_sum); static void xnb_fill_tcp(struct mbuf *m); #endif /* INET || INET6 */ /** Private data used by unit tests */ static struct { 
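	/*
	 * Descriptive note (added for clarity): a fake grant-copy table plus
	 * front and back views of shared TX and RX rings.  setup_pvt_data()
	 * initializes both ends of each ring, so a test can play the roles
	 * of netfront and netback without a real guest.
	 */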
gnttab_copy_table gnttab; netif_rx_back_ring_t rxb; netif_rx_front_ring_t rxf; netif_tx_back_ring_t txb; netif_tx_front_ring_t txf; struct ifnet* ifp; netif_rx_sring_t* rxs; netif_tx_sring_t* txs; } xnb_unit_pvt; static inline void safe_m_freem(struct mbuf **ppMbuf) { if (*ppMbuf != NULL) { m_freem(*ppMbuf); *ppMbuf = NULL; } } /** * The unit test runner. It will run every supplied test and return an * output message as a string * \param tests An array of tests. Every test will be attempted. * \param ntests The length of tests * \param buffer Return storage for the result string * \param buflen The length of buffer * \return The number of tests that failed */ static int xnb_unit_test_runner(test_fixture_t const tests[], int ntests, char *buffer, size_t buflen) { int i; int n_passes; int n_failures = 0; for (i = 0; i < ntests; i++) { int error = tests[i].setup(); if (error != 0) { SNCATF(buffer, buflen, "Setup failed for test idx %d\n", i); n_failures++; } else { size_t new_chars; tests[i].test(buffer, buflen); new_chars = strnlen(buffer, buflen); buffer += new_chars; buflen -= new_chars; if (new_chars > 0) { n_failures++; } } tests[i].teardown(); } n_passes = ntests - n_failures; if (n_passes > 0) { SNCATF(buffer, buflen, "%d Tests Passed\n", n_passes); } if (n_failures > 0) { SNCATF(buffer, buflen, "%d Tests FAILED\n", n_failures); } return n_failures; } /** Number of unit tests. Must match the length of the tests array below */ #define TOTAL_TESTS (53) /** * Max memory available for returning results. 400 chars/test should give * enough space for a five line error message for every test */ #define TOTAL_BUFLEN (400 * TOTAL_TESTS + 2) /** * Called from userspace by a sysctl. Runs all internal unit tests, and * returns the results to userspace as a string * \param oidp unused * \param arg1 pointer to an xnb_softc for a specific xnb device * \param arg2 unused * \param req sysctl access structure * \return a string via the special SYSCTL_OUT macro. 
*/ static int xnb_unit_test_main(SYSCTL_HANDLER_ARGS) { test_fixture_t const tests[TOTAL_TESTS] = { {setup_pvt_data, xnb_ring2pkt_emptyring, teardown_pvt_data}, {setup_pvt_data, xnb_ring2pkt_1req, teardown_pvt_data}, {setup_pvt_data, xnb_ring2pkt_2req, teardown_pvt_data}, {setup_pvt_data, xnb_ring2pkt_3req, teardown_pvt_data}, {setup_pvt_data, xnb_ring2pkt_extra, teardown_pvt_data}, {setup_pvt_data, xnb_ring2pkt_partial, teardown_pvt_data}, {setup_pvt_data, xnb_ring2pkt_wraps, teardown_pvt_data}, {setup_pvt_data, xnb_txpkt2rsp_emptypkt, teardown_pvt_data}, {setup_pvt_data, xnb_txpkt2rsp_1req, teardown_pvt_data}, {setup_pvt_data, xnb_txpkt2rsp_extra, teardown_pvt_data}, {setup_pvt_data, xnb_txpkt2rsp_long, teardown_pvt_data}, {setup_pvt_data, xnb_txpkt2rsp_invalid, teardown_pvt_data}, {setup_pvt_data, xnb_txpkt2rsp_error, teardown_pvt_data}, {setup_pvt_data, xnb_txpkt2rsp_wraps, teardown_pvt_data}, {setup_pvt_data, xnb_pkt2mbufc_empty, teardown_pvt_data}, {setup_pvt_data, xnb_pkt2mbufc_short, teardown_pvt_data}, {setup_pvt_data, xnb_pkt2mbufc_csum, teardown_pvt_data}, {setup_pvt_data, xnb_pkt2mbufc_1cluster, teardown_pvt_data}, {setup_pvt_data, xnb_pkt2mbufc_largecluster, teardown_pvt_data}, {setup_pvt_data, xnb_pkt2mbufc_2cluster, teardown_pvt_data}, {setup_pvt_data, xnb_txpkt2gnttab_empty, teardown_pvt_data}, {setup_pvt_data, xnb_txpkt2gnttab_short, teardown_pvt_data}, {setup_pvt_data, xnb_txpkt2gnttab_2req, teardown_pvt_data}, {setup_pvt_data, xnb_txpkt2gnttab_2cluster, teardown_pvt_data}, {setup_pvt_data, xnb_update_mbufc_short, teardown_pvt_data}, {setup_pvt_data, xnb_update_mbufc_2req, teardown_pvt_data}, {setup_pvt_data, xnb_update_mbufc_2cluster, teardown_pvt_data}, {setup_pvt_data, xnb_mbufc2pkt_empty, teardown_pvt_data}, {setup_pvt_data, xnb_mbufc2pkt_short, teardown_pvt_data}, {setup_pvt_data, xnb_mbufc2pkt_1cluster, teardown_pvt_data}, {setup_pvt_data, xnb_mbufc2pkt_2short, teardown_pvt_data}, {setup_pvt_data, xnb_mbufc2pkt_long, teardown_pvt_data}, {setup_pvt_data, xnb_mbufc2pkt_extra, teardown_pvt_data}, {setup_pvt_data, xnb_mbufc2pkt_nospace, teardown_pvt_data}, {setup_pvt_data, xnb_rxpkt2gnttab_empty, teardown_pvt_data}, {setup_pvt_data, xnb_rxpkt2gnttab_short, teardown_pvt_data}, {setup_pvt_data, xnb_rxpkt2gnttab_2req, teardown_pvt_data}, {setup_pvt_data, xnb_rxpkt2rsp_empty, teardown_pvt_data}, {setup_pvt_data, xnb_rxpkt2rsp_short, teardown_pvt_data}, {setup_pvt_data, xnb_rxpkt2rsp_extra, teardown_pvt_data}, {setup_pvt_data, xnb_rxpkt2rsp_2short, teardown_pvt_data}, {setup_pvt_data, xnb_rxpkt2rsp_2slots, teardown_pvt_data}, {setup_pvt_data, xnb_rxpkt2rsp_copyerror, teardown_pvt_data}, #if defined(INET) || defined(INET6) {null_setup, xnb_add_mbuf_cksum_arp, null_teardown}, {null_setup, xnb_add_mbuf_cksum_icmp, null_teardown}, {null_setup, xnb_add_mbuf_cksum_tcp, null_teardown}, {null_setup, xnb_add_mbuf_cksum_tcp_swcksum, null_teardown}, {null_setup, xnb_add_mbuf_cksum_udp, null_teardown}, #endif {null_setup, xnb_sscanf_hhd, null_teardown}, {null_setup, xnb_sscanf_hhu, null_teardown}, {null_setup, xnb_sscanf_lld, null_teardown}, {null_setup, xnb_sscanf_llu, null_teardown}, {null_setup, xnb_sscanf_hhn, null_teardown}, }; /** * results is static so that the data will persist after this function * returns. The sysctl code expects us to return a constant string. * \todo: the static variable is not thread safe. Put a mutex around * it. 
*/ static char results[TOTAL_BUFLEN]; /* empty the result strings */ results[0] = 0; xnb_unit_test_runner(tests, TOTAL_TESTS, results, TOTAL_BUFLEN); return (SYSCTL_OUT(req, results, strnlen(results, TOTAL_BUFLEN))); } static int setup_pvt_data(void) { int error = 0; bzero(xnb_unit_pvt.gnttab, sizeof(xnb_unit_pvt.gnttab)); xnb_unit_pvt.txs = malloc(PAGE_SIZE, M_XENNETBACK, M_WAITOK|M_ZERO); if (xnb_unit_pvt.txs != NULL) { SHARED_RING_INIT(xnb_unit_pvt.txs); BACK_RING_INIT(&xnb_unit_pvt.txb, xnb_unit_pvt.txs, PAGE_SIZE); FRONT_RING_INIT(&xnb_unit_pvt.txf, xnb_unit_pvt.txs, PAGE_SIZE); } else { error = 1; } xnb_unit_pvt.ifp = if_alloc(IFT_ETHER); if (xnb_unit_pvt.ifp == NULL) { error = 1; } xnb_unit_pvt.rxs = malloc(PAGE_SIZE, M_XENNETBACK, M_WAITOK|M_ZERO); if (xnb_unit_pvt.rxs != NULL) { SHARED_RING_INIT(xnb_unit_pvt.rxs); BACK_RING_INIT(&xnb_unit_pvt.rxb, xnb_unit_pvt.rxs, PAGE_SIZE); FRONT_RING_INIT(&xnb_unit_pvt.rxf, xnb_unit_pvt.rxs, PAGE_SIZE); } else { error = 1; } return error; } static void teardown_pvt_data(void) { if (xnb_unit_pvt.txs != NULL) { free(xnb_unit_pvt.txs, M_XENNETBACK); } if (xnb_unit_pvt.rxs != NULL) { free(xnb_unit_pvt.rxs, M_XENNETBACK); } if (xnb_unit_pvt.ifp != NULL) { if_free(xnb_unit_pvt.ifp); } } /** * Verify that xnb_ring2pkt will not consume any requests from an empty ring */ static void xnb_ring2pkt_emptyring(char *buffer, size_t buflen) { struct xnb_pkt pkt; int num_consumed; num_consumed = xnb_ring2pkt(&pkt, &xnb_unit_pvt.txb, xnb_unit_pvt.txb.req_cons); XNB_ASSERT(num_consumed == 0); } /** * Verify that xnb_ring2pkt can convert a single request packet correctly */ static void xnb_ring2pkt_1req(char *buffer, size_t buflen) { struct xnb_pkt pkt; int num_consumed; struct netif_tx_request *req; req = RING_GET_REQUEST(&xnb_unit_pvt.txf, xnb_unit_pvt.txf.req_prod_pvt); req->flags = 0; req->size = 69; /* arbitrary number for test */ xnb_unit_pvt.txf.req_prod_pvt++; RING_PUSH_REQUESTS(&xnb_unit_pvt.txf); num_consumed = xnb_ring2pkt(&pkt, &xnb_unit_pvt.txb, xnb_unit_pvt.txb.req_cons); XNB_ASSERT(num_consumed == 1); XNB_ASSERT(pkt.size == 69); XNB_ASSERT(pkt.car_size == 69); XNB_ASSERT(pkt.flags == 0); XNB_ASSERT(xnb_pkt_is_valid(&pkt)); XNB_ASSERT(pkt.list_len == 1); XNB_ASSERT(pkt.car == 0); } /** * Verify that xnb_ring2pkt can convert a two request packet correctly. 
 * This tests handling of the MORE_DATA flag and cdr
 */
static void
xnb_ring2pkt_2req(char *buffer, size_t buflen)
{
        struct xnb_pkt pkt;
        int num_consumed;
        struct netif_tx_request *req;
        RING_IDX start_idx = xnb_unit_pvt.txf.req_prod_pvt;

        req = RING_GET_REQUEST(&xnb_unit_pvt.txf,
            xnb_unit_pvt.txf.req_prod_pvt);
        req->flags = NETTXF_more_data;
        req->size = 100;
        xnb_unit_pvt.txf.req_prod_pvt++;

        req = RING_GET_REQUEST(&xnb_unit_pvt.txf,
            xnb_unit_pvt.txf.req_prod_pvt);
        req->flags = 0;
        req->size = 40;
        xnb_unit_pvt.txf.req_prod_pvt++;

        RING_PUSH_REQUESTS(&xnb_unit_pvt.txf);

        num_consumed = xnb_ring2pkt(&pkt, &xnb_unit_pvt.txb,
            xnb_unit_pvt.txb.req_cons);
        XNB_ASSERT(num_consumed == 2);
        XNB_ASSERT(pkt.size == 100);
        XNB_ASSERT(pkt.car_size == 60);
        XNB_ASSERT(pkt.flags == 0);
        XNB_ASSERT(xnb_pkt_is_valid(&pkt));
        XNB_ASSERT(pkt.list_len == 2);
        XNB_ASSERT(pkt.car == start_idx);
        XNB_ASSERT(pkt.cdr == start_idx + 1);
}

/**
 * Verify that xnb_ring2pkt can convert a three request packet correctly
 */
static void
xnb_ring2pkt_3req(char *buffer, size_t buflen)
{
        struct xnb_pkt pkt;
        int num_consumed;
        struct netif_tx_request *req;
        RING_IDX start_idx = xnb_unit_pvt.txf.req_prod_pvt;

        req = RING_GET_REQUEST(&xnb_unit_pvt.txf,
            xnb_unit_pvt.txf.req_prod_pvt);
        req->flags = NETTXF_more_data;
        req->size = 200;
        xnb_unit_pvt.txf.req_prod_pvt++;

        req = RING_GET_REQUEST(&xnb_unit_pvt.txf,
            xnb_unit_pvt.txf.req_prod_pvt);
        req->flags = NETTXF_more_data;
        req->size = 40;
        xnb_unit_pvt.txf.req_prod_pvt++;

        req = RING_GET_REQUEST(&xnb_unit_pvt.txf,
            xnb_unit_pvt.txf.req_prod_pvt);
        req->flags = 0;
        req->size = 50;
        xnb_unit_pvt.txf.req_prod_pvt++;

        RING_PUSH_REQUESTS(&xnb_unit_pvt.txf);

        num_consumed = xnb_ring2pkt(&pkt, &xnb_unit_pvt.txb,
            xnb_unit_pvt.txb.req_cons);
        XNB_ASSERT(num_consumed == 3);
        XNB_ASSERT(pkt.size == 200);
        XNB_ASSERT(pkt.car_size == 110);
        XNB_ASSERT(pkt.flags == 0);
        XNB_ASSERT(xnb_pkt_is_valid(&pkt));
        XNB_ASSERT(pkt.list_len == 3);
        XNB_ASSERT(pkt.car == start_idx);
        XNB_ASSERT(pkt.cdr == start_idx + 1);
        XNB_ASSERT(RING_GET_REQUEST(&xnb_unit_pvt.txb, pkt.cdr + 1) == req);
}

/**
 * Verify that xnb_ring2pkt can read extra info
 */
static void
xnb_ring2pkt_extra(char *buffer, size_t buflen)
{
        struct xnb_pkt pkt;
        int num_consumed;
        struct netif_tx_request *req;
        struct netif_extra_info *ext;
        RING_IDX start_idx = xnb_unit_pvt.txf.req_prod_pvt;

        req = RING_GET_REQUEST(&xnb_unit_pvt.txf,
            xnb_unit_pvt.txf.req_prod_pvt);
        req->flags = NETTXF_extra_info | NETTXF_more_data;
        req->size = 150;
        xnb_unit_pvt.txf.req_prod_pvt++;

        ext = (struct netif_extra_info*) RING_GET_REQUEST(&xnb_unit_pvt.txf,
            xnb_unit_pvt.txf.req_prod_pvt);
        ext->flags = 0;
        ext->type = XEN_NETIF_EXTRA_TYPE_GSO;
        ext->u.gso.size = 250;
        ext->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
        ext->u.gso.features = 0;
        xnb_unit_pvt.txf.req_prod_pvt++;

        req = RING_GET_REQUEST(&xnb_unit_pvt.txf,
            xnb_unit_pvt.txf.req_prod_pvt);
        req->flags = 0;
        req->size = 50;
        xnb_unit_pvt.txf.req_prod_pvt++;

        RING_PUSH_REQUESTS(&xnb_unit_pvt.txf);

        num_consumed = xnb_ring2pkt(&pkt, &xnb_unit_pvt.txb,
            xnb_unit_pvt.txb.req_cons);
        XNB_ASSERT(num_consumed == 3);
        XNB_ASSERT(pkt.extra.flags == 0);
        XNB_ASSERT(pkt.extra.type == XEN_NETIF_EXTRA_TYPE_GSO);
        XNB_ASSERT(pkt.extra.u.gso.size == 250);
        XNB_ASSERT(pkt.extra.u.gso.type == XEN_NETIF_GSO_TYPE_TCPV4);
        XNB_ASSERT(pkt.size == 150);
        XNB_ASSERT(pkt.car_size == 100);
        XNB_ASSERT(pkt.flags == NETTXF_extra_info);
        XNB_ASSERT(xnb_pkt_is_valid(&pkt));
        XNB_ASSERT(pkt.list_len == 2);
        XNB_ASSERT(pkt.car == start_idx);
        XNB_ASSERT(pkt.cdr == start_idx + 2);
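        /*
         * The cdr index should skip over the extra info slot and land on
         * the second data request, which is the request most recently
         * written above (still pointed to by req).
         */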
        XNB_ASSERT(RING_GET_REQUEST(&xnb_unit_pvt.txb, pkt.cdr) == req);
}

/**
 * Verify that xnb_ring2pkt will consume no requests if the entire packet is
 * not yet in the ring
 */
static void
xnb_ring2pkt_partial(char *buffer, size_t buflen)
{
        struct xnb_pkt pkt;
        int num_consumed;
        struct netif_tx_request *req;

        req = RING_GET_REQUEST(&xnb_unit_pvt.txf,
            xnb_unit_pvt.txf.req_prod_pvt);
        req->flags = NETTXF_more_data;
        req->size = 150;
        xnb_unit_pvt.txf.req_prod_pvt++;

        RING_PUSH_REQUESTS(&xnb_unit_pvt.txf);

        num_consumed = xnb_ring2pkt(&pkt, &xnb_unit_pvt.txb,
            xnb_unit_pvt.txb.req_cons);
        XNB_ASSERT(num_consumed == 0);
        XNB_ASSERT(! xnb_pkt_is_valid(&pkt));
}

/**
 * Verify that xnb_ring2pkt can read a packet whose requests wrap around
 * the end of the ring
 */
static void
xnb_ring2pkt_wraps(char *buffer, size_t buflen)
{
        struct xnb_pkt pkt;
        int num_consumed;
        struct netif_tx_request *req;
        unsigned int rsize;

        /*
         * Manually tweak the ring indices to create a ring with no responses
         * and the next request slot at position 2 from the end
         */
        rsize = RING_SIZE(&xnb_unit_pvt.txf);
        xnb_unit_pvt.txf.req_prod_pvt = rsize - 2;
        xnb_unit_pvt.txf.rsp_cons = rsize - 2;
        xnb_unit_pvt.txs->req_prod = rsize - 2;
        xnb_unit_pvt.txs->req_event = rsize - 1;
        xnb_unit_pvt.txs->rsp_prod = rsize - 2;
        xnb_unit_pvt.txs->rsp_event = rsize - 1;
        xnb_unit_pvt.txb.rsp_prod_pvt = rsize - 2;
        xnb_unit_pvt.txb.req_cons = rsize - 2;

        req = RING_GET_REQUEST(&xnb_unit_pvt.txf,
            xnb_unit_pvt.txf.req_prod_pvt);
        req->flags = NETTXF_more_data;
        req->size = 550;
        xnb_unit_pvt.txf.req_prod_pvt++;

        req = RING_GET_REQUEST(&xnb_unit_pvt.txf,
            xnb_unit_pvt.txf.req_prod_pvt);
        req->flags = NETTXF_more_data;
        req->size = 100;
        xnb_unit_pvt.txf.req_prod_pvt++;

        req = RING_GET_REQUEST(&xnb_unit_pvt.txf,
            xnb_unit_pvt.txf.req_prod_pvt);
        req->flags = 0;
        req->size = 50;
        xnb_unit_pvt.txf.req_prod_pvt++;

        RING_PUSH_REQUESTS(&xnb_unit_pvt.txf);

        num_consumed = xnb_ring2pkt(&pkt, &xnb_unit_pvt.txb,
            xnb_unit_pvt.txb.req_cons);
        XNB_ASSERT(num_consumed == 3);
        XNB_ASSERT(xnb_pkt_is_valid(&pkt));
        XNB_ASSERT(pkt.list_len == 3);
        XNB_ASSERT(RING_GET_REQUEST(&xnb_unit_pvt.txb, pkt.cdr + 1) == req);
}

/**
 * xnb_txpkt2rsp should do nothing for an empty packet
 */
static void
xnb_txpkt2rsp_emptypkt(char *buffer, size_t buflen)
{
        struct xnb_pkt pkt;
        netif_tx_back_ring_t txb_backup = xnb_unit_pvt.txb;
        netif_tx_sring_t txs_backup = *xnb_unit_pvt.txs;

        pkt.list_len = 0;
        /* must call xnb_ring2pkt just to initialize pkt */
        xnb_ring2pkt(&pkt, &xnb_unit_pvt.txb, xnb_unit_pvt.txb.req_cons);

        xnb_txpkt2rsp(&pkt, &xnb_unit_pvt.txb, 0);
        XNB_ASSERT(
            memcmp(&txb_backup, &xnb_unit_pvt.txb, sizeof(txb_backup)) == 0);
        XNB_ASSERT(
            memcmp(&txs_backup, xnb_unit_pvt.txs, sizeof(txs_backup)) == 0);
}

/**
 * xnb_txpkt2rsp responding to one request
 */
static void
xnb_txpkt2rsp_1req(char *buffer, size_t buflen)
{
        uint16_t num_consumed;
        struct xnb_pkt pkt;
        struct netif_tx_request *req;
        struct netif_tx_response *rsp;

        req = RING_GET_REQUEST(&xnb_unit_pvt.txf,
            xnb_unit_pvt.txf.req_prod_pvt);
        req->size = 1000;
        req->flags = 0;
        xnb_unit_pvt.txf.req_prod_pvt++;

        RING_PUSH_REQUESTS(&xnb_unit_pvt.txf);

        num_consumed = xnb_ring2pkt(&pkt, &xnb_unit_pvt.txb,
            xnb_unit_pvt.txb.req_cons);
        xnb_unit_pvt.txb.req_cons += num_consumed;

        xnb_txpkt2rsp(&pkt, &xnb_unit_pvt.txb, 0);
        rsp = RING_GET_RESPONSE(&xnb_unit_pvt.txb, xnb_unit_pvt.txf.rsp_cons);

        XNB_ASSERT(
            xnb_unit_pvt.txb.rsp_prod_pvt == xnb_unit_pvt.txs->req_prod);
        XNB_ASSERT(rsp->id == req->id);
        XNB_ASSERT(rsp->status == NETIF_RSP_OKAY);
};

/**
 * xnb_txpkt2rsp responding to 1 data request and
1 extra info */ static void xnb_txpkt2rsp_extra(char *buffer, size_t buflen) { uint16_t num_consumed; struct xnb_pkt pkt; struct netif_tx_request *req; netif_extra_info_t *ext; struct netif_tx_response *rsp; req = RING_GET_REQUEST(&xnb_unit_pvt.txf, xnb_unit_pvt.txf.req_prod_pvt); req->size = 1000; req->flags = NETTXF_extra_info; req->id = 69; xnb_unit_pvt.txf.req_prod_pvt++; ext = (netif_extra_info_t*) RING_GET_REQUEST(&xnb_unit_pvt.txf, xnb_unit_pvt.txf.req_prod_pvt); ext->type = XEN_NETIF_EXTRA_TYPE_GSO; ext->flags = 0; xnb_unit_pvt.txf.req_prod_pvt++; RING_PUSH_REQUESTS(&xnb_unit_pvt.txf); num_consumed = xnb_ring2pkt(&pkt, &xnb_unit_pvt.txb, xnb_unit_pvt.txb.req_cons); xnb_unit_pvt.txb.req_cons += num_consumed; xnb_txpkt2rsp(&pkt, &xnb_unit_pvt.txb, 0); XNB_ASSERT( xnb_unit_pvt.txb.rsp_prod_pvt == xnb_unit_pvt.txs->req_prod); rsp = RING_GET_RESPONSE(&xnb_unit_pvt.txb, xnb_unit_pvt.txf.rsp_cons); XNB_ASSERT(rsp->id == req->id); XNB_ASSERT(rsp->status == NETIF_RSP_OKAY); rsp = RING_GET_RESPONSE(&xnb_unit_pvt.txb, xnb_unit_pvt.txf.rsp_cons + 1); XNB_ASSERT(rsp->status == NETIF_RSP_NULL); }; /** * xnb_pkg2rsp responding to 3 data requests and 1 extra info */ static void xnb_txpkt2rsp_long(char *buffer, size_t buflen) { uint16_t num_consumed; struct xnb_pkt pkt; struct netif_tx_request *req; netif_extra_info_t *ext; struct netif_tx_response *rsp; req = RING_GET_REQUEST(&xnb_unit_pvt.txf, xnb_unit_pvt.txf.req_prod_pvt); req->size = 1000; req->flags = NETTXF_extra_info | NETTXF_more_data; req->id = 254; xnb_unit_pvt.txf.req_prod_pvt++; ext = (netif_extra_info_t*) RING_GET_REQUEST(&xnb_unit_pvt.txf, xnb_unit_pvt.txf.req_prod_pvt); ext->type = XEN_NETIF_EXTRA_TYPE_GSO; ext->flags = 0; xnb_unit_pvt.txf.req_prod_pvt++; req = RING_GET_REQUEST(&xnb_unit_pvt.txf, xnb_unit_pvt.txf.req_prod_pvt); req->size = 300; req->flags = NETTXF_more_data; req->id = 1034; xnb_unit_pvt.txf.req_prod_pvt++; req = RING_GET_REQUEST(&xnb_unit_pvt.txf, xnb_unit_pvt.txf.req_prod_pvt); req->size = 400; req->flags = 0; req->id = 34; xnb_unit_pvt.txf.req_prod_pvt++; RING_PUSH_REQUESTS(&xnb_unit_pvt.txf); num_consumed = xnb_ring2pkt(&pkt, &xnb_unit_pvt.txb, xnb_unit_pvt.txb.req_cons); xnb_unit_pvt.txb.req_cons += num_consumed; xnb_txpkt2rsp(&pkt, &xnb_unit_pvt.txb, 0); XNB_ASSERT( xnb_unit_pvt.txb.rsp_prod_pvt == xnb_unit_pvt.txs->req_prod); rsp = RING_GET_RESPONSE(&xnb_unit_pvt.txb, xnb_unit_pvt.txf.rsp_cons); XNB_ASSERT(rsp->id == RING_GET_REQUEST(&xnb_unit_pvt.txf, 0)->id); XNB_ASSERT(rsp->status == NETIF_RSP_OKAY); rsp = RING_GET_RESPONSE(&xnb_unit_pvt.txb, xnb_unit_pvt.txf.rsp_cons + 1); XNB_ASSERT(rsp->status == NETIF_RSP_NULL); rsp = RING_GET_RESPONSE(&xnb_unit_pvt.txb, xnb_unit_pvt.txf.rsp_cons + 2); XNB_ASSERT(rsp->id == RING_GET_REQUEST(&xnb_unit_pvt.txf, 2)->id); XNB_ASSERT(rsp->status == NETIF_RSP_OKAY); rsp = RING_GET_RESPONSE(&xnb_unit_pvt.txb, xnb_unit_pvt.txf.rsp_cons + 3); XNB_ASSERT(rsp->id == RING_GET_REQUEST(&xnb_unit_pvt.txf, 3)->id); XNB_ASSERT(rsp->status == NETIF_RSP_OKAY); } /** * xnb_txpkt2rsp responding to an invalid packet. * Note: this test will result in an error message being printed to the console * such as: * xnb(xnb_ring2pkt:1306): Unknown extra info type 255. 
Discarding packet */ static void xnb_txpkt2rsp_invalid(char *buffer, size_t buflen) { uint16_t num_consumed; struct xnb_pkt pkt; struct netif_tx_request *req; netif_extra_info_t *ext; struct netif_tx_response *rsp; req = RING_GET_REQUEST(&xnb_unit_pvt.txf, xnb_unit_pvt.txf.req_prod_pvt); req->size = 1000; req->flags = NETTXF_extra_info; req->id = 69; xnb_unit_pvt.txf.req_prod_pvt++; ext = (netif_extra_info_t*) RING_GET_REQUEST(&xnb_unit_pvt.txf, xnb_unit_pvt.txf.req_prod_pvt); ext->type = 0xFF; /* Invalid extra type */ ext->flags = 0; xnb_unit_pvt.txf.req_prod_pvt++; RING_PUSH_REQUESTS(&xnb_unit_pvt.txf); num_consumed = xnb_ring2pkt(&pkt, &xnb_unit_pvt.txb, xnb_unit_pvt.txb.req_cons); xnb_unit_pvt.txb.req_cons += num_consumed; XNB_ASSERT(! xnb_pkt_is_valid(&pkt)); xnb_txpkt2rsp(&pkt, &xnb_unit_pvt.txb, 0); XNB_ASSERT( xnb_unit_pvt.txb.rsp_prod_pvt == xnb_unit_pvt.txs->req_prod); rsp = RING_GET_RESPONSE(&xnb_unit_pvt.txb, xnb_unit_pvt.txf.rsp_cons); XNB_ASSERT(rsp->id == req->id); XNB_ASSERT(rsp->status == NETIF_RSP_ERROR); rsp = RING_GET_RESPONSE(&xnb_unit_pvt.txb, xnb_unit_pvt.txf.rsp_cons + 1); XNB_ASSERT(rsp->status == NETIF_RSP_NULL); }; /** * xnb_txpkt2rsp responding to one request which caused an error */ static void xnb_txpkt2rsp_error(char *buffer, size_t buflen) { uint16_t num_consumed; struct xnb_pkt pkt; struct netif_tx_request *req; struct netif_tx_response *rsp; req = RING_GET_REQUEST(&xnb_unit_pvt.txf, xnb_unit_pvt.txf.req_prod_pvt); req->size = 1000; req->flags = 0; xnb_unit_pvt.txf.req_prod_pvt++; RING_PUSH_REQUESTS(&xnb_unit_pvt.txf); num_consumed = xnb_ring2pkt(&pkt, &xnb_unit_pvt.txb, xnb_unit_pvt.txb.req_cons); xnb_unit_pvt.txb.req_cons += num_consumed; xnb_txpkt2rsp(&pkt, &xnb_unit_pvt.txb, 1); rsp = RING_GET_RESPONSE(&xnb_unit_pvt.txb, xnb_unit_pvt.txf.rsp_cons); XNB_ASSERT( xnb_unit_pvt.txb.rsp_prod_pvt == xnb_unit_pvt.txs->req_prod); XNB_ASSERT(rsp->id == req->id); XNB_ASSERT(rsp->status == NETIF_RSP_ERROR); }; /** * xnb_txpkt2rsp's responses wrap around the end of the ring */ static void xnb_txpkt2rsp_wraps(char *buffer, size_t buflen) { struct xnb_pkt pkt; struct netif_tx_request *req; struct netif_tx_response *rsp; unsigned int rsize; /* * Manually tweak the ring indices to create a ring with no responses * and the next request slot at position 2 from the end */ rsize = RING_SIZE(&xnb_unit_pvt.txf); xnb_unit_pvt.txf.req_prod_pvt = rsize - 2; xnb_unit_pvt.txf.rsp_cons = rsize - 2; xnb_unit_pvt.txs->req_prod = rsize - 2; xnb_unit_pvt.txs->req_event = rsize - 1; xnb_unit_pvt.txs->rsp_prod = rsize - 2; xnb_unit_pvt.txs->rsp_event = rsize - 1; xnb_unit_pvt.txb.rsp_prod_pvt = rsize - 2; xnb_unit_pvt.txb.req_cons = rsize - 2; req = RING_GET_REQUEST(&xnb_unit_pvt.txf, xnb_unit_pvt.txf.req_prod_pvt); req->flags = NETTXF_more_data; req->size = 550; req->id = 1; xnb_unit_pvt.txf.req_prod_pvt++; req = RING_GET_REQUEST(&xnb_unit_pvt.txf, xnb_unit_pvt.txf.req_prod_pvt); req->flags = NETTXF_more_data; req->size = 100; req->id = 2; xnb_unit_pvt.txf.req_prod_pvt++; req = RING_GET_REQUEST(&xnb_unit_pvt.txf, xnb_unit_pvt.txf.req_prod_pvt); req->flags = 0; req->size = 50; req->id = 3; xnb_unit_pvt.txf.req_prod_pvt++; RING_PUSH_REQUESTS(&xnb_unit_pvt.txf); xnb_ring2pkt(&pkt, &xnb_unit_pvt.txb, xnb_unit_pvt.txb.req_cons); xnb_txpkt2rsp(&pkt, &xnb_unit_pvt.txb, 0); XNB_ASSERT( xnb_unit_pvt.txb.rsp_prod_pvt == xnb_unit_pvt.txs->req_prod); rsp = RING_GET_RESPONSE(&xnb_unit_pvt.txb, xnb_unit_pvt.txf.rsp_cons + 2); XNB_ASSERT(rsp->id == req->id); XNB_ASSERT(rsp->status == NETIF_RSP_OKAY); 
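        /*
         * Note: txf.rsp_cons was left at rsize - 2, so rsp_cons + 2 equals
         * RING_SIZE and the response checked above wrapped around to slot 0
         * of the shared ring.
         */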
}

/**
 * Helper function used to set up pkt2mbufc tests
 * \param size size in bytes of the single request to push to the ring
 * \param flags optional flags to put in the netif request
 * \param[out] pkt the returned packet object
 * \return number of requests consumed from the ring
 */
static int
xnb_get1pkt(struct xnb_pkt *pkt, size_t size, uint16_t flags)
{
        struct netif_tx_request *req;

        req = RING_GET_REQUEST(&xnb_unit_pvt.txf,
            xnb_unit_pvt.txf.req_prod_pvt);
        req->flags = flags;
        req->size = size;
        xnb_unit_pvt.txf.req_prod_pvt++;

        RING_PUSH_REQUESTS(&xnb_unit_pvt.txf);

        return xnb_ring2pkt(pkt, &xnb_unit_pvt.txb,
            xnb_unit_pvt.txb.req_cons);
}

/**
 * xnb_pkt2mbufc on an empty packet
 */
static void
xnb_pkt2mbufc_empty(char *buffer, size_t buflen)
{
        struct xnb_pkt pkt;
        struct mbuf *pMbuf;
        pkt.list_len = 0;

        /* must call xnb_ring2pkt just to initialize pkt */
        xnb_ring2pkt(&pkt, &xnb_unit_pvt.txb, xnb_unit_pvt.txb.req_cons);
        pkt.size = 0;

        pMbuf = xnb_pkt2mbufc(&pkt, xnb_unit_pvt.ifp);
        safe_m_freem(&pMbuf);
}

/**
 * xnb_pkt2mbufc on short packet that can fit in an mbuf internal buffer
 */
static void
xnb_pkt2mbufc_short(char *buffer, size_t buflen)
{
        const size_t size = MINCLSIZE - 1;
        struct xnb_pkt pkt;
        struct mbuf *pMbuf;

        xnb_get1pkt(&pkt, size, 0);

        pMbuf = xnb_pkt2mbufc(&pkt, xnb_unit_pvt.ifp);
        XNB_ASSERT(M_TRAILINGSPACE(pMbuf) >= size);
        safe_m_freem(&pMbuf);
}

/**
 * xnb_pkt2mbufc on short packet whose checksum was validated by the netfront
 */
static void
xnb_pkt2mbufc_csum(char *buffer, size_t buflen)
{
        const size_t size = MINCLSIZE - 1;
        struct xnb_pkt pkt;
        struct mbuf *pMbuf;

        xnb_get1pkt(&pkt, size, NETTXF_data_validated);

        pMbuf = xnb_pkt2mbufc(&pkt, xnb_unit_pvt.ifp);
        XNB_ASSERT(M_TRAILINGSPACE(pMbuf) >= size);
        XNB_ASSERT(pMbuf->m_pkthdr.csum_flags & CSUM_IP_CHECKED);
        XNB_ASSERT(pMbuf->m_pkthdr.csum_flags & CSUM_IP_VALID);
        XNB_ASSERT(pMbuf->m_pkthdr.csum_flags & CSUM_DATA_VALID);
        XNB_ASSERT(pMbuf->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR);
        safe_m_freem(&pMbuf);
}

/**
 * xnb_pkt2mbufc on packet that can fit in one cluster
 */
static void
xnb_pkt2mbufc_1cluster(char *buffer, size_t buflen)
{
        const size_t size = MINCLSIZE;
        struct xnb_pkt pkt;
        struct mbuf *pMbuf;

        xnb_get1pkt(&pkt, size, 0);

        pMbuf = xnb_pkt2mbufc(&pkt, xnb_unit_pvt.ifp);
        XNB_ASSERT(M_TRAILINGSPACE(pMbuf) >= size);
        safe_m_freem(&pMbuf);
}

/**
 * xnb_pkt2mbufc on packet that cannot fit in one regular cluster
 */
static void
xnb_pkt2mbufc_largecluster(char *buffer, size_t buflen)
{
        const size_t size = MCLBYTES + 1;
        struct xnb_pkt pkt;
        struct mbuf *pMbuf;

        xnb_get1pkt(&pkt, size, 0);

        pMbuf = xnb_pkt2mbufc(&pkt, xnb_unit_pvt.ifp);
        XNB_ASSERT(M_TRAILINGSPACE(pMbuf) >= size);
        safe_m_freem(&pMbuf);
}

/**
 * xnb_pkt2mbufc on packet that cannot fit in two clusters
 */
static void
xnb_pkt2mbufc_2cluster(char *buffer, size_t buflen)
{
        const size_t size = 2 * MCLBYTES + 1;
        size_t space = 0;
        struct xnb_pkt pkt;
        struct mbuf *pMbuf;
        struct mbuf *m;

        xnb_get1pkt(&pkt, size, 0);

        pMbuf = xnb_pkt2mbufc(&pkt, xnb_unit_pvt.ifp);

        for (m = pMbuf; m != NULL; m = m->m_next) {
                space += M_TRAILINGSPACE(m);
        }
        XNB_ASSERT(space >= size);
        safe_m_freem(&pMbuf);
}

/**
 * xnb_txpkt2gnttab on an empty packet.
Should return empty gnttab */ static void xnb_txpkt2gnttab_empty(char *buffer, size_t buflen) { int n_entries; struct xnb_pkt pkt; struct mbuf *pMbuf; pkt.list_len = 0; /* must call xnb_ring2pkt just to intialize pkt */ xnb_ring2pkt(&pkt, &xnb_unit_pvt.txb, xnb_unit_pvt.txb.req_cons); pkt.size = 0; pMbuf = xnb_pkt2mbufc(&pkt, xnb_unit_pvt.ifp); n_entries = xnb_txpkt2gnttab(&pkt, pMbuf, xnb_unit_pvt.gnttab, &xnb_unit_pvt.txb, DOMID_FIRST_RESERVED); XNB_ASSERT(n_entries == 0); safe_m_freem(&pMbuf); } /** * xnb_txpkt2gnttab on a short packet, that can fit in one mbuf internal buffer * and has one request */ static void xnb_txpkt2gnttab_short(char *buffer, size_t buflen) { const size_t size = MINCLSIZE - 1; int n_entries; struct xnb_pkt pkt; struct mbuf *pMbuf; struct netif_tx_request *req = RING_GET_REQUEST(&xnb_unit_pvt.txf, xnb_unit_pvt.txf.req_prod_pvt); req->flags = 0; req->size = size; req->gref = 7; req->offset = 17; xnb_unit_pvt.txf.req_prod_pvt++; RING_PUSH_REQUESTS(&xnb_unit_pvt.txf); xnb_ring2pkt(&pkt, &xnb_unit_pvt.txb, xnb_unit_pvt.txb.req_cons); pMbuf = xnb_pkt2mbufc(&pkt, xnb_unit_pvt.ifp); n_entries = xnb_txpkt2gnttab(&pkt, pMbuf, xnb_unit_pvt.gnttab, &xnb_unit_pvt.txb, DOMID_FIRST_RESERVED); XNB_ASSERT(n_entries == 1); XNB_ASSERT(xnb_unit_pvt.gnttab[0].len == size); /* flags should indicate gref's for source */ XNB_ASSERT(xnb_unit_pvt.gnttab[0].flags & GNTCOPY_source_gref); XNB_ASSERT(xnb_unit_pvt.gnttab[0].source.offset == req->offset); XNB_ASSERT(xnb_unit_pvt.gnttab[0].source.domid == DOMID_SELF); XNB_ASSERT(xnb_unit_pvt.gnttab[0].dest.offset == virt_to_offset( mtod(pMbuf, vm_offset_t))); XNB_ASSERT(xnb_unit_pvt.gnttab[0].dest.u.gmfn == virt_to_mfn(mtod(pMbuf, vm_offset_t))); XNB_ASSERT(xnb_unit_pvt.gnttab[0].dest.domid == DOMID_FIRST_RESERVED); safe_m_freem(&pMbuf); } /** * xnb_txpkt2gnttab on a packet with two requests, that can fit into a single * mbuf cluster */ static void xnb_txpkt2gnttab_2req(char *buffer, size_t buflen) { int n_entries; struct xnb_pkt pkt; struct mbuf *pMbuf; struct netif_tx_request *req = RING_GET_REQUEST(&xnb_unit_pvt.txf, xnb_unit_pvt.txf.req_prod_pvt); req->flags = NETTXF_more_data; req->size = 1900; req->gref = 7; req->offset = 0; xnb_unit_pvt.txf.req_prod_pvt++; req = RING_GET_REQUEST(&xnb_unit_pvt.txf, xnb_unit_pvt.txf.req_prod_pvt); req->flags = 0; req->size = 500; req->gref = 8; req->offset = 0; xnb_unit_pvt.txf.req_prod_pvt++; RING_PUSH_REQUESTS(&xnb_unit_pvt.txf); xnb_ring2pkt(&pkt, &xnb_unit_pvt.txb, xnb_unit_pvt.txb.req_cons); pMbuf = xnb_pkt2mbufc(&pkt, xnb_unit_pvt.ifp); n_entries = xnb_txpkt2gnttab(&pkt, pMbuf, xnb_unit_pvt.gnttab, &xnb_unit_pvt.txb, DOMID_FIRST_RESERVED); XNB_ASSERT(n_entries == 2); XNB_ASSERT(xnb_unit_pvt.gnttab[0].len == 1400); XNB_ASSERT(xnb_unit_pvt.gnttab[0].dest.offset == virt_to_offset( mtod(pMbuf, vm_offset_t))); XNB_ASSERT(xnb_unit_pvt.gnttab[1].len == 500); XNB_ASSERT(xnb_unit_pvt.gnttab[1].dest.offset == virt_to_offset( mtod(pMbuf, vm_offset_t) + 1400)); safe_m_freem(&pMbuf); } /** * xnb_txpkt2gnttab on a single request that spans two mbuf clusters */ static void xnb_txpkt2gnttab_2cluster(char *buffer, size_t buflen) { int n_entries; struct xnb_pkt pkt; struct mbuf *pMbuf; const uint16_t data_this_transaction = (MCLBYTES*2) + 1; struct netif_tx_request *req = RING_GET_REQUEST(&xnb_unit_pvt.txf, xnb_unit_pvt.txf.req_prod_pvt); req->flags = 0; req->size = data_this_transaction; req->gref = 8; req->offset = 0; xnb_unit_pvt.txf.req_prod_pvt++; RING_PUSH_REQUESTS(&xnb_unit_pvt.txf); xnb_ring2pkt(&pkt, 
&xnb_unit_pvt.txb, xnb_unit_pvt.txb.req_cons); pMbuf = xnb_pkt2mbufc(&pkt, xnb_unit_pvt.ifp); XNB_ASSERT(pMbuf != NULL); if (pMbuf == NULL) return; n_entries = xnb_txpkt2gnttab(&pkt, pMbuf, xnb_unit_pvt.gnttab, &xnb_unit_pvt.txb, DOMID_FIRST_RESERVED); if (M_TRAILINGSPACE(pMbuf) == MCLBYTES) { /* there should be three mbufs and three gnttab entries */ XNB_ASSERT(n_entries == 3); XNB_ASSERT(xnb_unit_pvt.gnttab[0].len == MCLBYTES); XNB_ASSERT( xnb_unit_pvt.gnttab[0].dest.offset == virt_to_offset( mtod(pMbuf, vm_offset_t))); XNB_ASSERT(xnb_unit_pvt.gnttab[0].source.offset == 0); XNB_ASSERT(xnb_unit_pvt.gnttab[1].len == MCLBYTES); XNB_ASSERT( xnb_unit_pvt.gnttab[1].dest.offset == virt_to_offset( mtod(pMbuf->m_next, vm_offset_t))); XNB_ASSERT(xnb_unit_pvt.gnttab[1].source.offset == MCLBYTES); XNB_ASSERT(xnb_unit_pvt.gnttab[2].len == 1); XNB_ASSERT( xnb_unit_pvt.gnttab[2].dest.offset == virt_to_offset( mtod(pMbuf->m_next, vm_offset_t))); XNB_ASSERT(xnb_unit_pvt.gnttab[2].source.offset == 2 * MCLBYTES); } else if (M_TRAILINGSPACE(pMbuf) == 2 * MCLBYTES) { /* there should be two mbufs and two gnttab entries */ XNB_ASSERT(n_entries == 2); XNB_ASSERT(xnb_unit_pvt.gnttab[0].len == 2 * MCLBYTES); XNB_ASSERT( xnb_unit_pvt.gnttab[0].dest.offset == virt_to_offset( mtod(pMbuf, vm_offset_t))); XNB_ASSERT(xnb_unit_pvt.gnttab[0].source.offset == 0); XNB_ASSERT(xnb_unit_pvt.gnttab[1].len == 1); XNB_ASSERT( xnb_unit_pvt.gnttab[1].dest.offset == virt_to_offset( mtod(pMbuf->m_next, vm_offset_t))); XNB_ASSERT( xnb_unit_pvt.gnttab[1].source.offset == 2 * MCLBYTES); } else { /* should never get here */ XNB_ASSERT(0); } m_freem(pMbuf); } /** * xnb_update_mbufc on a short packet that only has one gnttab entry */ static void xnb_update_mbufc_short(char *buffer, size_t buflen) { const size_t size = MINCLSIZE - 1; int n_entries; struct xnb_pkt pkt; struct mbuf *pMbuf; struct netif_tx_request *req = RING_GET_REQUEST(&xnb_unit_pvt.txf, xnb_unit_pvt.txf.req_prod_pvt); req->flags = 0; req->size = size; req->gref = 7; req->offset = 17; xnb_unit_pvt.txf.req_prod_pvt++; RING_PUSH_REQUESTS(&xnb_unit_pvt.txf); xnb_ring2pkt(&pkt, &xnb_unit_pvt.txb, xnb_unit_pvt.txb.req_cons); pMbuf = xnb_pkt2mbufc(&pkt, xnb_unit_pvt.ifp); n_entries = xnb_txpkt2gnttab(&pkt, pMbuf, xnb_unit_pvt.gnttab, &xnb_unit_pvt.txb, DOMID_FIRST_RESERVED); /* Update grant table's status fields as the hypervisor call would */ xnb_unit_pvt.gnttab[0].status = GNTST_okay; xnb_update_mbufc(pMbuf, xnb_unit_pvt.gnttab, n_entries); XNB_ASSERT(pMbuf->m_len == size); XNB_ASSERT(pMbuf->m_pkthdr.len == size); safe_m_freem(&pMbuf); } /** * xnb_update_mbufc on a packet with two requests, that can fit into a single * mbuf cluster */ static void xnb_update_mbufc_2req(char *buffer, size_t buflen) { int n_entries; struct xnb_pkt pkt; struct mbuf *pMbuf; struct netif_tx_request *req = RING_GET_REQUEST(&xnb_unit_pvt.txf, xnb_unit_pvt.txf.req_prod_pvt); req->flags = NETTXF_more_data; req->size = 1900; req->gref = 7; req->offset = 0; xnb_unit_pvt.txf.req_prod_pvt++; req = RING_GET_REQUEST(&xnb_unit_pvt.txf, xnb_unit_pvt.txf.req_prod_pvt); req->flags = 0; req->size = 500; req->gref = 8; req->offset = 0; xnb_unit_pvt.txf.req_prod_pvt++; RING_PUSH_REQUESTS(&xnb_unit_pvt.txf); xnb_ring2pkt(&pkt, &xnb_unit_pvt.txb, xnb_unit_pvt.txb.req_cons); pMbuf = xnb_pkt2mbufc(&pkt, xnb_unit_pvt.ifp); n_entries = xnb_txpkt2gnttab(&pkt, pMbuf, xnb_unit_pvt.gnttab, &xnb_unit_pvt.txb, DOMID_FIRST_RESERVED); /* Update grant table's status fields as the hypervisor call would */ 
xnb_unit_pvt.gnttab[0].status = GNTST_okay; xnb_unit_pvt.gnttab[1].status = GNTST_okay; xnb_update_mbufc(pMbuf, xnb_unit_pvt.gnttab, n_entries); XNB_ASSERT(n_entries == 2); XNB_ASSERT(pMbuf->m_pkthdr.len == 1900); XNB_ASSERT(pMbuf->m_len == 1900); safe_m_freem(&pMbuf); } /** * xnb_update_mbufc on a single request that spans two mbuf clusters */ static void xnb_update_mbufc_2cluster(char *buffer, size_t buflen) { int i; int n_entries; struct xnb_pkt pkt; struct mbuf *pMbuf; const uint16_t data_this_transaction = (MCLBYTES*2) + 1; struct netif_tx_request *req = RING_GET_REQUEST(&xnb_unit_pvt.txf, xnb_unit_pvt.txf.req_prod_pvt); req->flags = 0; req->size = data_this_transaction; req->gref = 8; req->offset = 0; xnb_unit_pvt.txf.req_prod_pvt++; RING_PUSH_REQUESTS(&xnb_unit_pvt.txf); xnb_ring2pkt(&pkt, &xnb_unit_pvt.txb, xnb_unit_pvt.txb.req_cons); pMbuf = xnb_pkt2mbufc(&pkt, xnb_unit_pvt.ifp); n_entries = xnb_txpkt2gnttab(&pkt, pMbuf, xnb_unit_pvt.gnttab, &xnb_unit_pvt.txb, DOMID_FIRST_RESERVED); /* Update grant table's status fields */ for (i = 0; i < n_entries; i++) { xnb_unit_pvt.gnttab[0].status = GNTST_okay; } xnb_update_mbufc(pMbuf, xnb_unit_pvt.gnttab, n_entries); if (n_entries == 3) { /* there should be three mbufs and three gnttab entries */ XNB_ASSERT(pMbuf->m_pkthdr.len == data_this_transaction); XNB_ASSERT(pMbuf->m_len == MCLBYTES); XNB_ASSERT(pMbuf->m_next->m_len == MCLBYTES); XNB_ASSERT(pMbuf->m_next->m_next->m_len == 1); } else if (n_entries == 2) { /* there should be two mbufs and two gnttab entries */ XNB_ASSERT(n_entries == 2); XNB_ASSERT(pMbuf->m_pkthdr.len == data_this_transaction); XNB_ASSERT(pMbuf->m_len == 2 * MCLBYTES); XNB_ASSERT(pMbuf->m_next->m_len == 1); } else { /* should never get here */ XNB_ASSERT(0); } safe_m_freem(&pMbuf); } /** xnb_mbufc2pkt on an empty mbufc */ static void xnb_mbufc2pkt_empty(char *buffer, size_t buflen) { struct xnb_pkt pkt; int free_slots = 64; struct mbuf *mbuf; mbuf = m_get(M_WAITOK, MT_DATA); /* * note: it is illegal to set M_PKTHDR on a mbuf with no data. Doing so * will cause m_freem to segfault */ XNB_ASSERT(mbuf->m_len == 0); xnb_mbufc2pkt(mbuf, &pkt, 0, free_slots); XNB_ASSERT(! xnb_pkt_is_valid(&pkt)); safe_m_freem(&mbuf); } /** xnb_mbufc2pkt on a short mbufc */ static void xnb_mbufc2pkt_short(char *buffer, size_t buflen) { struct xnb_pkt pkt; size_t size = 128; int free_slots = 64; RING_IDX start = 9; struct mbuf *mbuf; mbuf = m_getm(NULL, size, M_WAITOK, MT_DATA); mbuf->m_flags |= M_PKTHDR; mbuf->m_pkthdr.len = size; mbuf->m_len = size; xnb_mbufc2pkt(mbuf, &pkt, start, free_slots); XNB_ASSERT(xnb_pkt_is_valid(&pkt)); XNB_ASSERT(pkt.size == size); XNB_ASSERT(pkt.car_size == size); XNB_ASSERT(! (pkt.flags & (NETRXF_more_data | NETRXF_extra_info))); XNB_ASSERT(pkt.list_len == 1); XNB_ASSERT(pkt.car == start); safe_m_freem(&mbuf); } /** xnb_mbufc2pkt on a single mbuf with an mbuf cluster */ static void xnb_mbufc2pkt_1cluster(char *buffer, size_t buflen) { struct xnb_pkt pkt; size_t size = MCLBYTES; int free_slots = 32; RING_IDX start = 12; struct mbuf *mbuf; mbuf = m_getm(NULL, size, M_WAITOK, MT_DATA); mbuf->m_flags |= M_PKTHDR; mbuf->m_pkthdr.len = size; mbuf->m_len = size; xnb_mbufc2pkt(mbuf, &pkt, start, free_slots); XNB_ASSERT(xnb_pkt_is_valid(&pkt)); XNB_ASSERT(pkt.size == size); XNB_ASSERT(pkt.car_size == size); XNB_ASSERT(! 
(pkt.flags & (NETRXF_more_data | NETRXF_extra_info))); XNB_ASSERT(pkt.list_len == 1); XNB_ASSERT(pkt.car == start); safe_m_freem(&mbuf); } /** xnb_mbufc2pkt on a two-mbuf chain with short data regions */ static void xnb_mbufc2pkt_2short(char *buffer, size_t buflen) { struct xnb_pkt pkt; size_t size1 = MHLEN - 5; size_t size2 = MHLEN - 15; int free_slots = 32; RING_IDX start = 14; struct mbuf *mbufc, *mbufc2; mbufc = m_getm(NULL, size1, M_WAITOK, MT_DATA); XNB_ASSERT(mbufc != NULL); if (mbufc == NULL) return; mbufc->m_flags |= M_PKTHDR; mbufc2 = m_getm(mbufc, size2, M_WAITOK, MT_DATA); XNB_ASSERT(mbufc2 != NULL); if (mbufc2 == NULL) { safe_m_freem(&mbufc); return; } mbufc2->m_pkthdr.len = size1 + size2; mbufc2->m_len = size1; xnb_mbufc2pkt(mbufc2, &pkt, start, free_slots); XNB_ASSERT(xnb_pkt_is_valid(&pkt)); XNB_ASSERT(pkt.size == size1 + size2); XNB_ASSERT(pkt.car == start); /* * The second m_getm may allocate a new mbuf and append * it to the chain, or it may simply extend the first mbuf. */ if (mbufc2->m_next != NULL) { XNB_ASSERT(pkt.car_size == size1); XNB_ASSERT(pkt.list_len == 1); XNB_ASSERT(pkt.cdr == start + 1); } safe_m_freem(&mbufc2); } /** xnb_mbufc2pkt on a mbuf chain with >1 mbuf cluster */ static void xnb_mbufc2pkt_long(char *buffer, size_t buflen) { struct xnb_pkt pkt; size_t size = 14 * MCLBYTES / 3; size_t size_remaining; int free_slots = 15; RING_IDX start = 3; struct mbuf *mbufc, *m; mbufc = m_getm(NULL, size, M_WAITOK, MT_DATA); XNB_ASSERT(mbufc != NULL); if (mbufc == NULL) return; mbufc->m_flags |= M_PKTHDR; mbufc->m_pkthdr.len = size; size_remaining = size; for (m = mbufc; m != NULL; m = m->m_next) { m->m_len = MAX(M_TRAILINGSPACE(m), size_remaining); size_remaining -= m->m_len; } xnb_mbufc2pkt(mbufc, &pkt, start, free_slots); XNB_ASSERT(xnb_pkt_is_valid(&pkt)); XNB_ASSERT(pkt.size == size); XNB_ASSERT(pkt.car == start); XNB_ASSERT(pkt.car_size = mbufc->m_len); /* * There should be >1 response in the packet, and there is no * extra info. */ XNB_ASSERT(! (pkt.flags & NETRXF_extra_info)); XNB_ASSERT(pkt.cdr == pkt.car + 1); safe_m_freem(&mbufc); } /** xnb_mbufc2pkt on a mbuf chain with >1 mbuf cluster and extra info */ static void xnb_mbufc2pkt_extra(char *buffer, size_t buflen) { struct xnb_pkt pkt; size_t size = 14 * MCLBYTES / 3; size_t size_remaining; int free_slots = 15; RING_IDX start = 3; struct mbuf *mbufc, *m; mbufc = m_getm(NULL, size, M_WAITOK, MT_DATA); XNB_ASSERT(mbufc != NULL); if (mbufc == NULL) return; mbufc->m_flags |= M_PKTHDR; mbufc->m_pkthdr.len = size; mbufc->m_pkthdr.csum_flags |= CSUM_TSO; mbufc->m_pkthdr.tso_segsz = TCP_MSS - 40; size_remaining = size; for (m = mbufc; m != NULL; m = m->m_next) { m->m_len = MAX(M_TRAILINGSPACE(m), size_remaining); size_remaining -= m->m_len; } xnb_mbufc2pkt(mbufc, &pkt, start, free_slots); XNB_ASSERT(xnb_pkt_is_valid(&pkt)); XNB_ASSERT(pkt.size == size); XNB_ASSERT(pkt.car == start); XNB_ASSERT(pkt.car_size = mbufc->m_len); /* There should be >1 response in the packet, there is extra info */ XNB_ASSERT(pkt.flags & NETRXF_extra_info); XNB_ASSERT(pkt.flags & NETRXF_data_validated); XNB_ASSERT(pkt.cdr == pkt.car + 2); XNB_ASSERT(pkt.extra.u.gso.size = mbufc->m_pkthdr.tso_segsz); XNB_ASSERT(pkt.extra.type == XEN_NETIF_EXTRA_TYPE_GSO); XNB_ASSERT(! 
(pkt.extra.flags & XEN_NETIF_EXTRA_FLAG_MORE)); safe_m_freem(&mbufc); } /** xnb_mbufc2pkt with insufficient space in the ring */ static void xnb_mbufc2pkt_nospace(char *buffer, size_t buflen) { struct xnb_pkt pkt; size_t size = 14 * MCLBYTES / 3; size_t size_remaining; int free_slots = 2; RING_IDX start = 3; struct mbuf *mbufc, *m; int error; mbufc = m_getm(NULL, size, M_WAITOK, MT_DATA); XNB_ASSERT(mbufc != NULL); if (mbufc == NULL) return; mbufc->m_flags |= M_PKTHDR; mbufc->m_pkthdr.len = size; size_remaining = size; for (m = mbufc; m != NULL; m = m->m_next) { m->m_len = MAX(M_TRAILINGSPACE(m), size_remaining); size_remaining -= m->m_len; } error = xnb_mbufc2pkt(mbufc, &pkt, start, free_slots); XNB_ASSERT(error == EAGAIN); XNB_ASSERT(! xnb_pkt_is_valid(&pkt)); safe_m_freem(&mbufc); } /** * xnb_rxpkt2gnttab on an empty packet. Should return empty gnttab */ static void xnb_rxpkt2gnttab_empty(char *buffer, size_t buflen) { struct xnb_pkt pkt; int nr_entries; int free_slots = 60; struct mbuf *mbuf; mbuf = m_get(M_WAITOK, MT_DATA); xnb_mbufc2pkt(mbuf, &pkt, 0, free_slots); nr_entries = xnb_rxpkt2gnttab(&pkt, mbuf, xnb_unit_pvt.gnttab, &xnb_unit_pvt.rxb, DOMID_FIRST_RESERVED); XNB_ASSERT(nr_entries == 0); safe_m_freem(&mbuf); } /** xnb_rxpkt2gnttab on a short packet without extra data */ static void xnb_rxpkt2gnttab_short(char *buffer, size_t buflen) { struct xnb_pkt pkt; int nr_entries; size_t size = 128; int free_slots = 60; RING_IDX start = 9; struct netif_rx_request *req; struct mbuf *mbuf; mbuf = m_getm(NULL, size, M_WAITOK, MT_DATA); mbuf->m_flags |= M_PKTHDR; mbuf->m_pkthdr.len = size; mbuf->m_len = size; xnb_mbufc2pkt(mbuf, &pkt, start, free_slots); req = RING_GET_REQUEST(&xnb_unit_pvt.rxf, xnb_unit_pvt.txf.req_prod_pvt); req->gref = 7; nr_entries = xnb_rxpkt2gnttab(&pkt, mbuf, xnb_unit_pvt.gnttab, &xnb_unit_pvt.rxb, DOMID_FIRST_RESERVED); XNB_ASSERT(nr_entries == 1); XNB_ASSERT(xnb_unit_pvt.gnttab[0].len == size); /* flags should indicate gref's for dest */ XNB_ASSERT(xnb_unit_pvt.gnttab[0].flags & GNTCOPY_dest_gref); XNB_ASSERT(xnb_unit_pvt.gnttab[0].dest.offset == 0); XNB_ASSERT(xnb_unit_pvt.gnttab[0].source.domid == DOMID_SELF); XNB_ASSERT(xnb_unit_pvt.gnttab[0].source.offset == virt_to_offset( mtod(mbuf, vm_offset_t))); XNB_ASSERT(xnb_unit_pvt.gnttab[0].source.u.gmfn == virt_to_mfn(mtod(mbuf, vm_offset_t))); XNB_ASSERT(xnb_unit_pvt.gnttab[0].dest.domid == DOMID_FIRST_RESERVED); safe_m_freem(&mbuf); } /** * xnb_rxpkt2gnttab on a packet with two different mbufs in a single chai */ static void xnb_rxpkt2gnttab_2req(char *buffer, size_t buflen) { struct xnb_pkt pkt; int nr_entries; int i, num_mbufs; size_t total_granted_size = 0; size_t size = MJUMPAGESIZE + 1; int free_slots = 60; RING_IDX start = 11; struct netif_rx_request *req; struct mbuf *mbuf, *m; mbuf = m_getm(NULL, size, M_WAITOK, MT_DATA); mbuf->m_flags |= M_PKTHDR; mbuf->m_pkthdr.len = size; mbuf->m_len = size; xnb_mbufc2pkt(mbuf, &pkt, start, free_slots); for (i = 0, m=mbuf; m != NULL; i++, m = m->m_next) { req = RING_GET_REQUEST(&xnb_unit_pvt.rxf, xnb_unit_pvt.txf.req_prod_pvt); req->gref = i; req->id = 5; } num_mbufs = i; nr_entries = xnb_rxpkt2gnttab(&pkt, mbuf, xnb_unit_pvt.gnttab, &xnb_unit_pvt.rxb, DOMID_FIRST_RESERVED); XNB_ASSERT(nr_entries >= num_mbufs); for (i = 0; i < nr_entries; i++) { int end_offset = xnb_unit_pvt.gnttab[i].len + xnb_unit_pvt.gnttab[i].dest.offset; XNB_ASSERT(end_offset <= PAGE_SIZE); total_granted_size += xnb_unit_pvt.gnttab[i].len; } XNB_ASSERT(total_granted_size == size); } /** * 
xnb_rxpkt2rsp on an empty packet. Shouldn't make any response */ static void xnb_rxpkt2rsp_empty(char *buffer, size_t buflen) { struct xnb_pkt pkt; int nr_entries; int nr_reqs; int free_slots = 60; netif_rx_back_ring_t rxb_backup = xnb_unit_pvt.rxb; netif_rx_sring_t rxs_backup = *xnb_unit_pvt.rxs; struct mbuf *mbuf; mbuf = m_get(M_WAITOK, MT_DATA); xnb_mbufc2pkt(mbuf, &pkt, 0, free_slots); nr_entries = xnb_rxpkt2gnttab(&pkt, mbuf, xnb_unit_pvt.gnttab, &xnb_unit_pvt.rxb, DOMID_FIRST_RESERVED); nr_reqs = xnb_rxpkt2rsp(&pkt, xnb_unit_pvt.gnttab, nr_entries, &xnb_unit_pvt.rxb); XNB_ASSERT(nr_reqs == 0); XNB_ASSERT( memcmp(&rxb_backup, &xnb_unit_pvt.rxb, sizeof(rxb_backup)) == 0); XNB_ASSERT( memcmp(&rxs_backup, xnb_unit_pvt.rxs, sizeof(rxs_backup)) == 0); safe_m_freem(&mbuf); } /** * xnb_rxpkt2rsp on a short packet with no extras */ static void xnb_rxpkt2rsp_short(char *buffer, size_t buflen) { struct xnb_pkt pkt; int nr_entries, nr_reqs; size_t size = 128; int free_slots = 60; RING_IDX start = 5; struct netif_rx_request *req; struct netif_rx_response *rsp; struct mbuf *mbuf; mbuf = m_getm(NULL, size, M_WAITOK, MT_DATA); mbuf->m_flags |= M_PKTHDR; mbuf->m_pkthdr.len = size; mbuf->m_len = size; xnb_mbufc2pkt(mbuf, &pkt, start, free_slots); req = RING_GET_REQUEST(&xnb_unit_pvt.rxf, start); req->gref = 7; xnb_unit_pvt.rxb.req_cons = start; xnb_unit_pvt.rxb.rsp_prod_pvt = start; xnb_unit_pvt.rxs->req_prod = start + 1; xnb_unit_pvt.rxs->rsp_prod = start; nr_entries = xnb_rxpkt2gnttab(&pkt, mbuf, xnb_unit_pvt.gnttab, &xnb_unit_pvt.rxb, DOMID_FIRST_RESERVED); nr_reqs = xnb_rxpkt2rsp(&pkt, xnb_unit_pvt.gnttab, nr_entries, &xnb_unit_pvt.rxb); XNB_ASSERT(nr_reqs == 1); XNB_ASSERT(xnb_unit_pvt.rxb.rsp_prod_pvt == start + 1); rsp = RING_GET_RESPONSE(&xnb_unit_pvt.rxb, start); XNB_ASSERT(rsp->id == req->id); XNB_ASSERT(rsp->offset == 0); XNB_ASSERT((rsp->flags & (NETRXF_more_data | NETRXF_extra_info)) == 0); XNB_ASSERT(rsp->status == size); safe_m_freem(&mbuf); } /** * xnb_rxpkt2rsp with extra data */ static void xnb_rxpkt2rsp_extra(char *buffer, size_t buflen) { struct xnb_pkt pkt; int nr_entries, nr_reqs; size_t size = 14; int free_slots = 15; RING_IDX start = 3; uint16_t id = 49; uint16_t gref = 65; uint16_t mss = TCP_MSS - 40; struct mbuf *mbufc; struct netif_rx_request *req; struct netif_rx_response *rsp; struct netif_extra_info *ext; mbufc = m_getm(NULL, size, M_WAITOK, MT_DATA); XNB_ASSERT(mbufc != NULL); if (mbufc == NULL) return; mbufc->m_flags |= M_PKTHDR; mbufc->m_pkthdr.len = size; mbufc->m_pkthdr.csum_flags |= CSUM_TSO; mbufc->m_pkthdr.tso_segsz = mss; mbufc->m_len = size; xnb_mbufc2pkt(mbufc, &pkt, start, free_slots); req = RING_GET_REQUEST(&xnb_unit_pvt.rxf, start); req->id = id; req->gref = gref; req = RING_GET_REQUEST(&xnb_unit_pvt.rxf, start + 1); req->id = id + 1; req->gref = gref + 1; xnb_unit_pvt.rxb.req_cons = start; xnb_unit_pvt.rxb.rsp_prod_pvt = start; xnb_unit_pvt.rxs->req_prod = start + 2; xnb_unit_pvt.rxs->rsp_prod = start; nr_entries = xnb_rxpkt2gnttab(&pkt, mbufc, xnb_unit_pvt.gnttab, &xnb_unit_pvt.rxb, DOMID_FIRST_RESERVED); nr_reqs = xnb_rxpkt2rsp(&pkt, xnb_unit_pvt.gnttab, nr_entries, &xnb_unit_pvt.rxb); XNB_ASSERT(nr_reqs == 2); XNB_ASSERT(xnb_unit_pvt.rxb.rsp_prod_pvt == start + 2); rsp = RING_GET_RESPONSE(&xnb_unit_pvt.rxb, start); XNB_ASSERT(rsp->id == id); XNB_ASSERT((rsp->flags & NETRXF_more_data) == 0); XNB_ASSERT((rsp->flags & NETRXF_extra_info)); XNB_ASSERT((rsp->flags & NETRXF_data_validated)); XNB_ASSERT((rsp->flags & NETRXF_csum_blank)); XNB_ASSERT(rsp->status 
== size); ext = (struct netif_extra_info*) RING_GET_RESPONSE(&xnb_unit_pvt.rxb, start + 1); XNB_ASSERT(ext->type == XEN_NETIF_EXTRA_TYPE_GSO); XNB_ASSERT(! (ext->flags & XEN_NETIF_EXTRA_FLAG_MORE)); XNB_ASSERT(ext->u.gso.size == mss); XNB_ASSERT(ext->u.gso.type == XEN_NETIF_EXTRA_TYPE_GSO); safe_m_freem(&mbufc); } /** * xnb_rxpkt2rsp on a packet with more than a pages's worth of data. It should * generate two response slot */ static void xnb_rxpkt2rsp_2slots(char *buffer, size_t buflen) { struct xnb_pkt pkt; int nr_entries, nr_reqs; size_t size = PAGE_SIZE + 100; int free_slots = 3; uint16_t id1 = 17; uint16_t id2 = 37; uint16_t gref1 = 24; uint16_t gref2 = 34; RING_IDX start = 15; struct netif_rx_request *req; struct netif_rx_response *rsp; struct mbuf *mbuf; mbuf = m_getm(NULL, size, M_WAITOK, MT_DATA); mbuf->m_flags |= M_PKTHDR; mbuf->m_pkthdr.len = size; if (mbuf->m_next != NULL) { size_t first_len = MIN(M_TRAILINGSPACE(mbuf), size); mbuf->m_len = first_len; mbuf->m_next->m_len = size - first_len; } else { mbuf->m_len = size; } xnb_mbufc2pkt(mbuf, &pkt, start, free_slots); req = RING_GET_REQUEST(&xnb_unit_pvt.rxf, start); req->gref = gref1; req->id = id1; req = RING_GET_REQUEST(&xnb_unit_pvt.rxf, start + 1); req->gref = gref2; req->id = id2; xnb_unit_pvt.rxb.req_cons = start; xnb_unit_pvt.rxb.rsp_prod_pvt = start; xnb_unit_pvt.rxs->req_prod = start + 2; xnb_unit_pvt.rxs->rsp_prod = start; nr_entries = xnb_rxpkt2gnttab(&pkt, mbuf, xnb_unit_pvt.gnttab, &xnb_unit_pvt.rxb, DOMID_FIRST_RESERVED); nr_reqs = xnb_rxpkt2rsp(&pkt, xnb_unit_pvt.gnttab, nr_entries, &xnb_unit_pvt.rxb); XNB_ASSERT(nr_reqs == 2); XNB_ASSERT(xnb_unit_pvt.rxb.rsp_prod_pvt == start + 2); rsp = RING_GET_RESPONSE(&xnb_unit_pvt.rxb, start); XNB_ASSERT(rsp->id == id1); XNB_ASSERT(rsp->offset == 0); XNB_ASSERT((rsp->flags & NETRXF_extra_info) == 0); XNB_ASSERT(rsp->flags & NETRXF_more_data); XNB_ASSERT(rsp->status == PAGE_SIZE); rsp = RING_GET_RESPONSE(&xnb_unit_pvt.rxb, start + 1); XNB_ASSERT(rsp->id == id2); XNB_ASSERT(rsp->offset == 0); XNB_ASSERT((rsp->flags & NETRXF_extra_info) == 0); XNB_ASSERT(! 
(rsp->flags & NETRXF_more_data)); XNB_ASSERT(rsp->status == size - PAGE_SIZE); safe_m_freem(&mbuf); } /** xnb_rxpkt2rsp on a grant table with two sub-page entries */ static void xnb_rxpkt2rsp_2short(char *buffer, size_t buflen) { struct xnb_pkt pkt; int nr_reqs, nr_entries; size_t size1 = MHLEN - 5; size_t size2 = MHLEN - 15; int free_slots = 32; RING_IDX start = 14; uint16_t id = 47; uint16_t gref = 54; struct netif_rx_request *req; struct netif_rx_response *rsp; struct mbuf *mbufc; mbufc = m_getm(NULL, size1, M_WAITOK, MT_DATA); XNB_ASSERT(mbufc != NULL); if (mbufc == NULL) return; mbufc->m_flags |= M_PKTHDR; m_getm(mbufc, size2, M_WAITOK, MT_DATA); XNB_ASSERT(mbufc->m_next != NULL); mbufc->m_pkthdr.len = size1 + size2; mbufc->m_len = size1; mbufc->m_next->m_len = size2; xnb_mbufc2pkt(mbufc, &pkt, start, free_slots); req = RING_GET_REQUEST(&xnb_unit_pvt.rxf, start); req->gref = gref; req->id = id; xnb_unit_pvt.rxb.req_cons = start; xnb_unit_pvt.rxb.rsp_prod_pvt = start; xnb_unit_pvt.rxs->req_prod = start + 1; xnb_unit_pvt.rxs->rsp_prod = start; nr_entries = xnb_rxpkt2gnttab(&pkt, mbufc, xnb_unit_pvt.gnttab, &xnb_unit_pvt.rxb, DOMID_FIRST_RESERVED); nr_reqs = xnb_rxpkt2rsp(&pkt, xnb_unit_pvt.gnttab, nr_entries, &xnb_unit_pvt.rxb); XNB_ASSERT(nr_entries == 2); XNB_ASSERT(nr_reqs == 1); rsp = RING_GET_RESPONSE(&xnb_unit_pvt.rxb, start); XNB_ASSERT(rsp->id == id); XNB_ASSERT(rsp->status == size1 + size2); XNB_ASSERT(rsp->offset == 0); XNB_ASSERT(! (rsp->flags & (NETRXF_more_data | NETRXF_extra_info))); safe_m_freem(&mbufc); } /** * xnb_rxpkt2rsp on a long packet with a hypervisor gnttab_copy error * Note: this test will result in an error message being printed to the console * such as: * xnb(xnb_rxpkt2rsp:1720): Got error -1 for hypervisor gnttab_copy status */ static void xnb_rxpkt2rsp_copyerror(char *buffer, size_t buflen) { struct xnb_pkt pkt; int nr_entries, nr_reqs; int id = 7; int gref = 42; uint16_t canary = 6859; size_t size = 7 * MCLBYTES; int free_slots = 9; RING_IDX start = 2; struct netif_rx_request *req; struct netif_rx_response *rsp; struct mbuf *mbuf; mbuf = m_getm(NULL, size, M_WAITOK, MT_DATA); mbuf->m_flags |= M_PKTHDR; mbuf->m_pkthdr.len = size; mbuf->m_len = size; xnb_mbufc2pkt(mbuf, &pkt, start, free_slots); req = RING_GET_REQUEST(&xnb_unit_pvt.rxf, start); req->gref = gref; req->id = id; xnb_unit_pvt.rxb.req_cons = start; xnb_unit_pvt.rxb.rsp_prod_pvt = start; xnb_unit_pvt.rxs->req_prod = start + 1; xnb_unit_pvt.rxs->rsp_prod = start; req = RING_GET_REQUEST(&xnb_unit_pvt.rxf, start + 1); req->gref = canary; req->id = canary; nr_entries = xnb_rxpkt2gnttab(&pkt, mbuf, xnb_unit_pvt.gnttab, &xnb_unit_pvt.rxb, DOMID_FIRST_RESERVED); /* Inject the error*/ xnb_unit_pvt.gnttab[2].status = GNTST_general_error; nr_reqs = xnb_rxpkt2rsp(&pkt, xnb_unit_pvt.gnttab, nr_entries, &xnb_unit_pvt.rxb); XNB_ASSERT(nr_reqs == 1); XNB_ASSERT(xnb_unit_pvt.rxb.rsp_prod_pvt == start + 1); rsp = RING_GET_RESPONSE(&xnb_unit_pvt.rxb, start); XNB_ASSERT(rsp->id == id); XNB_ASSERT(rsp->status == NETIF_RSP_ERROR); req = RING_GET_REQUEST(&xnb_unit_pvt.rxf, start + 1); XNB_ASSERT(req->gref == canary); XNB_ASSERT(req->id == canary); safe_m_freem(&mbuf); } #if defined(INET) || defined(INET6) /** * xnb_add_mbuf_cksum on an ARP request packet */ static void xnb_add_mbuf_cksum_arp(char *buffer, size_t buflen) { const size_t pkt_len = sizeof(struct ether_header) + sizeof(struct ether_arp); struct mbuf *mbufc; struct ether_header *eh; struct ether_arp *ep; unsigned char pkt_orig[pkt_len]; mbufc = m_getm(NULL, 
pkt_len, M_WAITOK, MT_DATA); /* Fill in an example arp request */ eh = mtod(mbufc, struct ether_header*); eh->ether_dhost[0] = 0xff; eh->ether_dhost[1] = 0xff; eh->ether_dhost[2] = 0xff; eh->ether_dhost[3] = 0xff; eh->ether_dhost[4] = 0xff; eh->ether_dhost[5] = 0xff; eh->ether_shost[0] = 0x00; eh->ether_shost[1] = 0x15; eh->ether_shost[2] = 0x17; eh->ether_shost[3] = 0xe9; eh->ether_shost[4] = 0x30; eh->ether_shost[5] = 0x68; eh->ether_type = htons(ETHERTYPE_ARP); ep = (struct ether_arp*)(eh + 1); ep->ea_hdr.ar_hrd = htons(ARPHRD_ETHER); ep->ea_hdr.ar_pro = htons(ETHERTYPE_IP); ep->ea_hdr.ar_hln = 6; ep->ea_hdr.ar_pln = 4; ep->ea_hdr.ar_op = htons(ARPOP_REQUEST); ep->arp_sha[0] = 0x00; ep->arp_sha[1] = 0x15; ep->arp_sha[2] = 0x17; ep->arp_sha[3] = 0xe9; ep->arp_sha[4] = 0x30; ep->arp_sha[5] = 0x68; ep->arp_spa[0] = 0xc0; ep->arp_spa[1] = 0xa8; ep->arp_spa[2] = 0x0a; ep->arp_spa[3] = 0x04; bzero(&(ep->arp_tha), ETHER_ADDR_LEN); ep->arp_tpa[0] = 0xc0; ep->arp_tpa[1] = 0xa8; ep->arp_tpa[2] = 0x0a; ep->arp_tpa[3] = 0x06; /* fill in the length field */ mbufc->m_len = pkt_len; mbufc->m_pkthdr.len = pkt_len; /* indicate that the netfront uses hw-assisted checksums */ mbufc->m_pkthdr.csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR; /* Make a backup copy of the packet */ bcopy(mtod(mbufc, const void*), pkt_orig, pkt_len); /* Function under test */ xnb_add_mbuf_cksum(mbufc); /* Verify that the packet's data did not change */ XNB_ASSERT(bcmp(mtod(mbufc, const void*), pkt_orig, pkt_len) == 0); m_freem(mbufc); } /** * Helper function that populates the ethernet header and IP header used by * some of the xnb_add_mbuf_cksum unit tests. m must already be allocated * and must be large enough */ static void xnb_fill_eh_and_ip(struct mbuf *m, uint16_t ip_len, uint16_t ip_id, uint16_t ip_p, uint16_t ip_off, uint16_t ip_sum) { struct ether_header *eh; struct ip *iph; eh = mtod(m, struct ether_header*); eh->ether_dhost[0] = 0x00; eh->ether_dhost[1] = 0x16; eh->ether_dhost[2] = 0x3e; eh->ether_dhost[3] = 0x23; eh->ether_dhost[4] = 0x50; eh->ether_dhost[5] = 0x0b; eh->ether_shost[0] = 0x00; eh->ether_shost[1] = 0x16; eh->ether_shost[2] = 0x30; eh->ether_shost[3] = 0x00; eh->ether_shost[4] = 0x00; eh->ether_shost[5] = 0x00; eh->ether_type = htons(ETHERTYPE_IP); iph = (struct ip*)(eh + 1); iph->ip_hl = 0x5; /* 5 dwords == 20 bytes */ iph->ip_v = 4; /* IP v4 */ iph->ip_tos = 0; iph->ip_len = htons(ip_len); iph->ip_id = htons(ip_id); iph->ip_off = htons(ip_off); iph->ip_ttl = 64; iph->ip_p = ip_p; iph->ip_sum = htons(ip_sum); iph->ip_src.s_addr = htonl(0xc0a80a04); iph->ip_dst.s_addr = htonl(0xc0a80a05); } /** * xnb_add_mbuf_cksum on an ICMP packet, based on a tcpdump of an actual * ICMP packet */ static void xnb_add_mbuf_cksum_icmp(char *buffer, size_t buflen) { const size_t icmp_len = 64; /* set by ping(1) */ const size_t pkt_len = sizeof(struct ether_header) + sizeof(struct ip) + icmp_len; struct mbuf *mbufc; struct ether_header *eh; struct ip *iph; struct icmp *icmph; unsigned char pkt_orig[icmp_len]; uint32_t *tv_field; uint8_t *data_payload; int i; const uint16_t ICMP_CSUM = 0xaed7; const uint16_t IP_CSUM = 0xe533; mbufc = m_getm(NULL, pkt_len, M_WAITOK, MT_DATA); /* Fill in an example ICMP ping request */ eh = mtod(mbufc, struct ether_header*); xnb_fill_eh_and_ip(mbufc, 84, 28, IPPROTO_ICMP, 0, 0); iph = (struct ip*)(eh + 1); icmph = (struct icmp*)(iph + 1); icmph->icmp_type = ICMP_ECHO; icmph->icmp_code = 0; icmph->icmp_cksum = htons(ICMP_CSUM); icmph->icmp_id = htons(31492); 
icmph->icmp_seq = htons(0); /* * ping(1) uses bcopy to insert a native-endian timeval after icmp_seq. * For this test, we will set the bytes individually for portability. */ tv_field = (uint32_t*)(&(icmph->icmp_hun)); tv_field[0] = 0x4f02cfac; tv_field[1] = 0x0007c46a; /* * Remainder of packet is an incrmenting 8 bit integer, starting with 8 */ data_payload = (uint8_t*)(&tv_field[2]); for (i = 8; i < 37; i++) { *data_payload++ = i; } /* fill in the length field */ mbufc->m_len = pkt_len; mbufc->m_pkthdr.len = pkt_len; /* indicate that the netfront uses hw-assisted checksums */ mbufc->m_pkthdr.csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR; bcopy(mtod(mbufc, const void*), pkt_orig, icmp_len); /* Function under test */ xnb_add_mbuf_cksum(mbufc); /* Check the IP checksum */ XNB_ASSERT(iph->ip_sum == htons(IP_CSUM)); /* Check that the ICMP packet did not change */ XNB_ASSERT(bcmp(icmph, pkt_orig, icmp_len)); m_freem(mbufc); } /** * xnb_add_mbuf_cksum on a UDP packet, based on a tcpdump of an actual * UDP packet */ static void xnb_add_mbuf_cksum_udp(char *buffer, size_t buflen) { const size_t udp_len = 16; const size_t pkt_len = sizeof(struct ether_header) + sizeof(struct ip) + udp_len; struct mbuf *mbufc; struct ether_header *eh; struct ip *iph; struct udphdr *udp; uint8_t *data_payload; const uint16_t IP_CSUM = 0xe56b; const uint16_t UDP_CSUM = 0xdde2; mbufc = m_getm(NULL, pkt_len, M_WAITOK, MT_DATA); /* Fill in an example UDP packet made by 'uname | nc -u 2222 */ eh = mtod(mbufc, struct ether_header*); xnb_fill_eh_and_ip(mbufc, 36, 4, IPPROTO_UDP, 0, 0xbaad); iph = (struct ip*)(eh + 1); udp = (struct udphdr*)(iph + 1); udp->uh_sport = htons(0x51ae); udp->uh_dport = htons(0x08ae); udp->uh_ulen = htons(udp_len); udp->uh_sum = htons(0xbaad); /* xnb_add_mbuf_cksum will fill this in */ data_payload = (uint8_t*)(udp + 1); data_payload[0] = 'F'; data_payload[1] = 'r'; data_payload[2] = 'e'; data_payload[3] = 'e'; data_payload[4] = 'B'; data_payload[5] = 'S'; data_payload[6] = 'D'; data_payload[7] = '\n'; /* fill in the length field */ mbufc->m_len = pkt_len; mbufc->m_pkthdr.len = pkt_len; /* indicate that the netfront uses hw-assisted checksums */ mbufc->m_pkthdr.csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR; /* Function under test */ xnb_add_mbuf_cksum(mbufc); /* Check the checksums */ XNB_ASSERT(iph->ip_sum == htons(IP_CSUM)); XNB_ASSERT(udp->uh_sum == htons(UDP_CSUM)); m_freem(mbufc); } /** * Helper function that populates a TCP packet used by all of the * xnb_add_mbuf_cksum tcp unit tests. 
m must already be allocated and must be * large enough */ static void xnb_fill_tcp(struct mbuf *m) { struct ether_header *eh; struct ip *iph; struct tcphdr *tcp; uint32_t *options; uint8_t *data_payload; /* Fill in an example TCP packet made by 'uname | nc 2222' */ eh = mtod(m, struct ether_header*); xnb_fill_eh_and_ip(m, 60, 8, IPPROTO_TCP, IP_DF, 0); iph = (struct ip*)(eh + 1); tcp = (struct tcphdr*)(iph + 1); tcp->th_sport = htons(0x9cd9); tcp->th_dport = htons(2222); tcp->th_seq = htonl(0x00f72b10); tcp->th_ack = htonl(0x7f37ba6c); - tcp->th_x2 = 0; + tcp_set_flags(tcp, TH_ACK | TH_PUSH); tcp->th_off = 8; - tcp->th_flags = 0x18; tcp->th_win = htons(0x410); /* th_sum is incorrect; will be inserted by function under test */ tcp->th_sum = htons(0xbaad); tcp->th_urp = htons(0); /* * The following 12 bytes of options encode: * [nop, nop, TS val 33247 ecr 3457687679] */ options = (uint32_t*)(tcp + 1); options[0] = htonl(0x0101080a); options[1] = htonl(0x000081df); options[2] = htonl(0xce18207f); data_payload = (uint8_t*)(&options[3]); data_payload[0] = 'F'; data_payload[1] = 'r'; data_payload[2] = 'e'; data_payload[3] = 'e'; data_payload[4] = 'B'; data_payload[5] = 'S'; data_payload[6] = 'D'; data_payload[7] = '\n'; } /** * xnb_add_mbuf_cksum on a TCP packet, based on a tcpdump of an actual TCP * packet */ static void xnb_add_mbuf_cksum_tcp(char *buffer, size_t buflen) { const size_t payload_len = 8; const size_t tcp_options_len = 12; const size_t pkt_len = sizeof(struct ether_header) + sizeof(struct ip) + sizeof(struct tcphdr) + tcp_options_len + payload_len; struct mbuf *mbufc; struct ether_header *eh; struct ip *iph; struct tcphdr *tcp; const uint16_t IP_CSUM = 0xa55a; const uint16_t TCP_CSUM = 0x2f64; mbufc = m_getm(NULL, pkt_len, M_WAITOK, MT_DATA); /* Fill in an example TCP packet made by 'uname | nc 2222' */ xnb_fill_tcp(mbufc); eh = mtod(mbufc, struct ether_header*); iph = (struct ip*)(eh + 1); tcp = (struct tcphdr*)(iph + 1); /* fill in the length field */ mbufc->m_len = pkt_len; mbufc->m_pkthdr.len = pkt_len; /* indicate that the netfront uses hw-assisted checksums */ mbufc->m_pkthdr.csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR; /* Function under test */ xnb_add_mbuf_cksum(mbufc); /* Check the checksums */ XNB_ASSERT(iph->ip_sum == htons(IP_CSUM)); XNB_ASSERT(tcp->th_sum == htons(TCP_CSUM)); m_freem(mbufc); } /** * xnb_add_mbuf_cksum on a TCP packet that does not use HW assisted checksums */ static void xnb_add_mbuf_cksum_tcp_swcksum(char *buffer, size_t buflen) { const size_t payload_len = 8; const size_t tcp_options_len = 12; const size_t pkt_len = sizeof(struct ether_header) + sizeof(struct ip) + sizeof(struct tcphdr) + tcp_options_len + payload_len; struct mbuf *mbufc; struct ether_header *eh; struct ip *iph; struct tcphdr *tcp; /* Use deliberately bad checksums, and verify that they don't get */ /* corrected by xnb_add_mbuf_cksum */ const uint16_t IP_CSUM = 0xdead; const uint16_t TCP_CSUM = 0xbeef; mbufc = m_getm(NULL, pkt_len, M_WAITOK, MT_DATA); /* Fill in an example TCP packet made by 'uname | nc 2222' */ xnb_fill_tcp(mbufc); eh = mtod(mbufc, struct ether_header*); iph = (struct ip*)(eh + 1); iph->ip_sum = htons(IP_CSUM); tcp = (struct tcphdr*)(iph + 1); tcp->th_sum = htons(TCP_CSUM); /* fill in the length field */ mbufc->m_len = pkt_len; mbufc->m_pkthdr.len = pkt_len; /* indicate that the netfront does not use hw-assisted checksums */ mbufc->m_pkthdr.csum_flags = 0; /* Function under test */ xnb_add_mbuf_cksum(mbufc); /* Check that the 
checksums didn't change */ XNB_ASSERT(iph->ip_sum == htons(IP_CSUM)); XNB_ASSERT(tcp->th_sum == htons(TCP_CSUM)); m_freem(mbufc); } #endif /* INET || INET6 */ /** * sscanf on unsigned chars */ static void xnb_sscanf_hhu(char *buffer, size_t buflen) { const char mystr[] = "137"; uint8_t dest[12]; int i; for (i = 0; i < 12; i++) dest[i] = 'X'; XNB_ASSERT(sscanf(mystr, "%hhu", &dest[4]) == 1); for (i = 0; i < 12; i++) XNB_ASSERT(dest[i] == (i == 4 ? 137 : 'X')); } /** * sscanf on signed chars */ static void xnb_sscanf_hhd(char *buffer, size_t buflen) { const char mystr[] = "-27"; int8_t dest[12]; int i; for (i = 0; i < 12; i++) dest[i] = 'X'; XNB_ASSERT(sscanf(mystr, "%hhd", &dest[4]) == 1); for (i = 0; i < 12; i++) XNB_ASSERT(dest[i] == (i == 4 ? -27 : 'X')); } /** * sscanf on signed long longs */ static void xnb_sscanf_lld(char *buffer, size_t buflen) { const char mystr[] = "-123456789012345"; /* about -2**47 */ long long dest[3]; int i; for (i = 0; i < 3; i++) dest[i] = (long long)0xdeadbeefdeadbeef; XNB_ASSERT(sscanf(mystr, "%lld", &dest[1]) == 1); for (i = 0; i < 3; i++) XNB_ASSERT(dest[i] == (i != 1 ? (long long)0xdeadbeefdeadbeef : -123456789012345)); } /** * sscanf on unsigned long longs */ static void xnb_sscanf_llu(char *buffer, size_t buflen) { const char mystr[] = "12802747070103273189"; unsigned long long dest[3]; int i; for (i = 0; i < 3; i++) dest[i] = (long long)0xdeadbeefdeadbeef; XNB_ASSERT(sscanf(mystr, "%llu", &dest[1]) == 1); for (i = 0; i < 3; i++) XNB_ASSERT(dest[i] == (i != 1 ? (long long)0xdeadbeefdeadbeef : 12802747070103273189ull)); } /** * sscanf on unsigned short short n's */ static void xnb_sscanf_hhn(char *buffer, size_t buflen) { const char mystr[] = "000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f" "202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f" "404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f"; unsigned char dest[12]; int i; for (i = 0; i < 12; i++) dest[i] = (unsigned char)'X'; XNB_ASSERT(sscanf(mystr, "000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f" "202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f" "404142434445464748494a4b4c4d4e4f%hhn", &dest[4]) == 0); for (i = 0; i < 12; i++) XNB_ASSERT(dest[i] == (i == 4 ? 160 : 'X')); } diff --git a/sys/netgraph/ng_nat.c b/sys/netgraph/ng_nat.c index ae083608a199..d7492b71e07c 100644 --- a/sys/netgraph/ng_nat.c +++ b/sys/netgraph/ng_nat.c @@ -1,966 +1,966 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright 2005, Gleb Smirnoff * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static ng_constructor_t ng_nat_constructor; static ng_rcvmsg_t ng_nat_rcvmsg; static ng_shutdown_t ng_nat_shutdown; static ng_newhook_t ng_nat_newhook; static ng_rcvdata_t ng_nat_rcvdata; static ng_disconnect_t ng_nat_disconnect; static unsigned int ng_nat_translate_flags(unsigned int x); /* Parse type for struct ng_nat_mode. */ static const struct ng_parse_struct_field ng_nat_mode_fields[] = NG_NAT_MODE_INFO; static const struct ng_parse_type ng_nat_mode_type = { &ng_parse_struct_type, &ng_nat_mode_fields }; /* Parse type for 'description' field in structs. */ static const struct ng_parse_fixedstring_info ng_nat_description_info = { NG_NAT_DESC_LENGTH }; static const struct ng_parse_type ng_nat_description_type = { &ng_parse_fixedstring_type, &ng_nat_description_info }; /* Parse type for struct ng_nat_redirect_port. */ static const struct ng_parse_struct_field ng_nat_redirect_port_fields[] = NG_NAT_REDIRECT_PORT_TYPE_INFO(&ng_nat_description_type); static const struct ng_parse_type ng_nat_redirect_port_type = { &ng_parse_struct_type, &ng_nat_redirect_port_fields }; /* Parse type for struct ng_nat_redirect_addr. */ static const struct ng_parse_struct_field ng_nat_redirect_addr_fields[] = NG_NAT_REDIRECT_ADDR_TYPE_INFO(&ng_nat_description_type); static const struct ng_parse_type ng_nat_redirect_addr_type = { &ng_parse_struct_type, &ng_nat_redirect_addr_fields }; /* Parse type for struct ng_nat_redirect_proto. */ static const struct ng_parse_struct_field ng_nat_redirect_proto_fields[] = NG_NAT_REDIRECT_PROTO_TYPE_INFO(&ng_nat_description_type); static const struct ng_parse_type ng_nat_redirect_proto_type = { &ng_parse_struct_type, &ng_nat_redirect_proto_fields }; /* Parse type for struct ng_nat_add_server. */ static const struct ng_parse_struct_field ng_nat_add_server_fields[] = NG_NAT_ADD_SERVER_TYPE_INFO; static const struct ng_parse_type ng_nat_add_server_type = { &ng_parse_struct_type, &ng_nat_add_server_fields }; /* Parse type for one struct ng_nat_listrdrs_entry. */ static const struct ng_parse_struct_field ng_nat_listrdrs_entry_fields[] = NG_NAT_LISTRDRS_ENTRY_TYPE_INFO(&ng_nat_description_type); static const struct ng_parse_type ng_nat_listrdrs_entry_type = { &ng_parse_struct_type, &ng_nat_listrdrs_entry_fields }; /* Parse type for 'redirects' array in struct ng_nat_list_redirects. 
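 * The element count comes from the enclosing reply's total_count field,
 * located by stepping back from the start of the redirects array with
 * offsetof().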
*/ static int ng_nat_listrdrs_ary_getLength(const struct ng_parse_type *type, const u_char *start, const u_char *buf) { const struct ng_nat_list_redirects *lr; lr = (const struct ng_nat_list_redirects *) (buf - offsetof(struct ng_nat_list_redirects, redirects)); return lr->total_count; } static const struct ng_parse_array_info ng_nat_listrdrs_ary_info = { &ng_nat_listrdrs_entry_type, &ng_nat_listrdrs_ary_getLength, NULL }; static const struct ng_parse_type ng_nat_listrdrs_ary_type = { &ng_parse_array_type, &ng_nat_listrdrs_ary_info }; /* Parse type for struct ng_nat_list_redirects. */ static const struct ng_parse_struct_field ng_nat_list_redirects_fields[] = NG_NAT_LIST_REDIRECTS_TYPE_INFO(&ng_nat_listrdrs_ary_type); static const struct ng_parse_type ng_nat_list_redirects_type = { &ng_parse_struct_type, &ng_nat_list_redirects_fields }; /* Parse type for struct ng_nat_libalias_info. */ static const struct ng_parse_struct_field ng_nat_libalias_info_fields[] = NG_NAT_LIBALIAS_INFO; static const struct ng_parse_type ng_nat_libalias_info_type = { &ng_parse_struct_type, &ng_nat_libalias_info_fields }; /* List of commands and how to convert arguments to/from ASCII. */ static const struct ng_cmdlist ng_nat_cmdlist[] = { { NGM_NAT_COOKIE, NGM_NAT_SET_IPADDR, "setaliasaddr", &ng_parse_ipaddr_type, NULL }, { NGM_NAT_COOKIE, NGM_NAT_SET_MODE, "setmode", &ng_nat_mode_type, NULL }, { NGM_NAT_COOKIE, NGM_NAT_SET_TARGET, "settarget", &ng_parse_ipaddr_type, NULL }, { NGM_NAT_COOKIE, NGM_NAT_REDIRECT_PORT, "redirectport", &ng_nat_redirect_port_type, &ng_parse_uint32_type }, { NGM_NAT_COOKIE, NGM_NAT_REDIRECT_ADDR, "redirectaddr", &ng_nat_redirect_addr_type, &ng_parse_uint32_type }, { NGM_NAT_COOKIE, NGM_NAT_REDIRECT_PROTO, "redirectproto", &ng_nat_redirect_proto_type, &ng_parse_uint32_type }, { NGM_NAT_COOKIE, NGM_NAT_REDIRECT_DYNAMIC, "redirectdynamic", &ng_parse_uint32_type, NULL }, { NGM_NAT_COOKIE, NGM_NAT_REDIRECT_DELETE, "redirectdelete", &ng_parse_uint32_type, NULL }, { NGM_NAT_COOKIE, NGM_NAT_ADD_SERVER, "addserver", &ng_nat_add_server_type, NULL }, { NGM_NAT_COOKIE, NGM_NAT_LIST_REDIRECTS, "listredirects", NULL, &ng_nat_list_redirects_type }, { NGM_NAT_COOKIE, NGM_NAT_PROXY_RULE, "proxyrule", &ng_parse_string_type, NULL }, { NGM_NAT_COOKIE, NGM_NAT_LIBALIAS_INFO, "libaliasinfo", NULL, &ng_nat_libalias_info_type }, { NGM_NAT_COOKIE, NGM_NAT_SET_DLT, "setdlt", &ng_parse_uint8_type, NULL }, { NGM_NAT_COOKIE, NGM_NAT_GET_DLT, "getdlt", NULL, &ng_parse_uint8_type }, { 0 } }; /* Netgraph node type descriptor. */ static struct ng_type typestruct = { .version = NG_ABI_VERSION, .name = NG_NAT_NODE_TYPE, .constructor = ng_nat_constructor, .rcvmsg = ng_nat_rcvmsg, .shutdown = ng_nat_shutdown, .newhook = ng_nat_newhook, .rcvdata = ng_nat_rcvdata, .disconnect = ng_nat_disconnect, .cmdlist = ng_nat_cmdlist, }; NETGRAPH_INIT(nat, &typestruct); MODULE_DEPEND(ng_nat, libalias, 1, 1, 1); /* Element for list of redirects. */ struct ng_nat_rdr_lst { STAILQ_ENTRY(ng_nat_rdr_lst) entries; struct alias_link *lnk; struct ng_nat_listrdrs_entry rdr; }; STAILQ_HEAD(rdrhead, ng_nat_rdr_lst); /* Information we store for each node. 
*/ struct ng_nat_priv { node_p node; /* back pointer to node */ hook_p in; /* hook for demasquerading */ hook_p out; /* hook for masquerading */ struct libalias *lib; /* libalias handler */ uint32_t flags; /* status flags */ uint32_t rdrcount; /* number or redirects in list */ uint32_t nextid; /* for next in turn in list */ struct rdrhead redirhead; /* redirect list header */ uint8_t dlt; /* DLT_XXX from bpf.h */ }; typedef struct ng_nat_priv *priv_p; /* Values of flags */ #define NGNAT_CONNECTED 0x1 /* We have both hooks connected */ #define NGNAT_ADDR_DEFINED 0x2 /* NGM_NAT_SET_IPADDR happened */ static int ng_nat_constructor(node_p node) { priv_p priv; /* Initialize private descriptor. */ priv = malloc(sizeof(*priv), M_NETGRAPH, M_WAITOK | M_ZERO); /* Init aliasing engine. */ priv->lib = LibAliasInit(NULL); /* Set same ports on. */ (void )LibAliasSetMode(priv->lib, PKT_ALIAS_SAME_PORTS, PKT_ALIAS_SAME_PORTS); /* Init redirects housekeeping. */ priv->rdrcount = 0; priv->nextid = 1; priv->dlt = DLT_RAW; STAILQ_INIT(&priv->redirhead); /* Link structs together. */ NG_NODE_SET_PRIVATE(node, priv); priv->node = node; /* * libalias is not thread safe, so our node * must be single threaded. */ NG_NODE_FORCE_WRITER(node); return (0); } static int ng_nat_newhook(node_p node, hook_p hook, const char *name) { const priv_p priv = NG_NODE_PRIVATE(node); if (strcmp(name, NG_NAT_HOOK_IN) == 0) { priv->in = hook; } else if (strcmp(name, NG_NAT_HOOK_OUT) == 0) { priv->out = hook; } else return (EINVAL); if (priv->out != NULL && priv->in != NULL) priv->flags |= NGNAT_CONNECTED; return(0); } static int ng_nat_rcvmsg(node_p node, item_p item, hook_p lasthook) { const priv_p priv = NG_NODE_PRIVATE(node); struct ng_mesg *resp = NULL; struct ng_mesg *msg; int error = 0; NGI_GET_MSG(item, msg); switch (msg->header.typecookie) { case NGM_NAT_COOKIE: switch (msg->header.cmd) { case NGM_NAT_SET_IPADDR: { struct in_addr *const ia = (struct in_addr *)msg->data; if (msg->header.arglen < sizeof(*ia)) { error = EINVAL; break; } LibAliasSetAddress(priv->lib, *ia); priv->flags |= NGNAT_ADDR_DEFINED; } break; case NGM_NAT_SET_MODE: { struct ng_nat_mode *const mode = (struct ng_nat_mode *)msg->data; if (msg->header.arglen < sizeof(*mode)) { error = EINVAL; break; } if (LibAliasSetMode(priv->lib, ng_nat_translate_flags(mode->flags), ng_nat_translate_flags(mode->mask)) < 0) { error = ENOMEM; break; } } break; case NGM_NAT_SET_TARGET: { struct in_addr *const ia = (struct in_addr *)msg->data; if (msg->header.arglen < sizeof(*ia)) { error = EINVAL; break; } LibAliasSetTarget(priv->lib, *ia); } break; case NGM_NAT_REDIRECT_PORT: { struct ng_nat_rdr_lst *entry; struct ng_nat_redirect_port *const rp = (struct ng_nat_redirect_port *)msg->data; if (msg->header.arglen < sizeof(*rp)) { error = EINVAL; break; } if ((entry = malloc(sizeof(struct ng_nat_rdr_lst), M_NETGRAPH, M_NOWAIT | M_ZERO)) == NULL) { error = ENOMEM; break; } /* Try actual redirect. */ entry->lnk = LibAliasRedirectPort(priv->lib, rp->local_addr, htons(rp->local_port), rp->remote_addr, htons(rp->remote_port), rp->alias_addr, htons(rp->alias_port), rp->proto); if (entry->lnk == NULL) { error = ENOMEM; free(entry, M_NETGRAPH); break; } /* Successful, save info in our internal list. 
*/ entry->rdr.local_addr = rp->local_addr; entry->rdr.alias_addr = rp->alias_addr; entry->rdr.remote_addr = rp->remote_addr; entry->rdr.local_port = rp->local_port; entry->rdr.alias_port = rp->alias_port; entry->rdr.remote_port = rp->remote_port; entry->rdr.proto = rp->proto; bcopy(rp->description, entry->rdr.description, NG_NAT_DESC_LENGTH); /* Safety precaution. */ entry->rdr.description[NG_NAT_DESC_LENGTH-1] = '\0'; entry->rdr.id = priv->nextid++; priv->rdrcount++; /* Link to list of redirects. */ STAILQ_INSERT_TAIL(&priv->redirhead, entry, entries); /* Response with id of newly added entry. */ NG_MKRESPONSE(resp, msg, sizeof(entry->rdr.id), M_NOWAIT); if (resp == NULL) { error = ENOMEM; break; } bcopy(&entry->rdr.id, resp->data, sizeof(entry->rdr.id)); } break; case NGM_NAT_REDIRECT_ADDR: { struct ng_nat_rdr_lst *entry; struct ng_nat_redirect_addr *const ra = (struct ng_nat_redirect_addr *)msg->data; if (msg->header.arglen < sizeof(*ra)) { error = EINVAL; break; } if ((entry = malloc(sizeof(struct ng_nat_rdr_lst), M_NETGRAPH, M_NOWAIT | M_ZERO)) == NULL) { error = ENOMEM; break; } /* Try actual redirect. */ entry->lnk = LibAliasRedirectAddr(priv->lib, ra->local_addr, ra->alias_addr); if (entry->lnk == NULL) { error = ENOMEM; free(entry, M_NETGRAPH); break; } /* Successful, save info in our internal list. */ entry->rdr.local_addr = ra->local_addr; entry->rdr.alias_addr = ra->alias_addr; entry->rdr.proto = NG_NAT_REDIRPROTO_ADDR; bcopy(ra->description, entry->rdr.description, NG_NAT_DESC_LENGTH); /* Safety precaution. */ entry->rdr.description[NG_NAT_DESC_LENGTH-1] = '\0'; entry->rdr.id = priv->nextid++; priv->rdrcount++; /* Link to list of redirects. */ STAILQ_INSERT_TAIL(&priv->redirhead, entry, entries); /* Response with id of newly added entry. */ NG_MKRESPONSE(resp, msg, sizeof(entry->rdr.id), M_NOWAIT); if (resp == NULL) { error = ENOMEM; break; } bcopy(&entry->rdr.id, resp->data, sizeof(entry->rdr.id)); } break; case NGM_NAT_REDIRECT_PROTO: { struct ng_nat_rdr_lst *entry; struct ng_nat_redirect_proto *const rp = (struct ng_nat_redirect_proto *)msg->data; if (msg->header.arglen < sizeof(*rp)) { error = EINVAL; break; } if ((entry = malloc(sizeof(struct ng_nat_rdr_lst), M_NETGRAPH, M_NOWAIT | M_ZERO)) == NULL) { error = ENOMEM; break; } /* Try actual redirect. */ entry->lnk = LibAliasRedirectProto(priv->lib, rp->local_addr, rp->remote_addr, rp->alias_addr, rp->proto); if (entry->lnk == NULL) { error = ENOMEM; free(entry, M_NETGRAPH); break; } /* Successful, save info in our internal list. */ entry->rdr.local_addr = rp->local_addr; entry->rdr.alias_addr = rp->alias_addr; entry->rdr.remote_addr = rp->remote_addr; entry->rdr.proto = rp->proto; bcopy(rp->description, entry->rdr.description, NG_NAT_DESC_LENGTH); /* Safety precaution. */ entry->rdr.description[NG_NAT_DESC_LENGTH-1] = '\0'; entry->rdr.id = priv->nextid++; priv->rdrcount++; /* Link to list of redirects. */ STAILQ_INSERT_TAIL(&priv->redirhead, entry, entries); /* Response with id of newly added entry. */ NG_MKRESPONSE(resp, msg, sizeof(entry->rdr.id), M_NOWAIT); if (resp == NULL) { error = ENOMEM; break; } bcopy(&entry->rdr.id, resp->data, sizeof(entry->rdr.id)); } break; case NGM_NAT_REDIRECT_DYNAMIC: case NGM_NAT_REDIRECT_DELETE: { struct ng_nat_rdr_lst *entry; uint32_t *const id = (uint32_t *)msg->data; if (msg->header.arglen < sizeof(*id)) { error = EINVAL; break; } /* Find entry with supplied id. */ STAILQ_FOREACH(entry, &priv->redirhead, entries) { if (entry->rdr.id == *id) break; } /* Not found. 
*/ if (entry == NULL) { error = ENOENT; break; } if (msg->header.cmd == NGM_NAT_REDIRECT_DYNAMIC) { if (LibAliasRedirectDynamic(priv->lib, entry->lnk) == -1) { error = ENOTTY; /* XXX Something better? */ break; } } else { /* NGM_NAT_REDIRECT_DELETE */ LibAliasRedirectDelete(priv->lib, entry->lnk); } /* Delete entry from our internal list. */ priv->rdrcount--; STAILQ_REMOVE(&priv->redirhead, entry, ng_nat_rdr_lst, entries); free(entry, M_NETGRAPH); } break; case NGM_NAT_ADD_SERVER: { struct ng_nat_rdr_lst *entry; struct ng_nat_add_server *const as = (struct ng_nat_add_server *)msg->data; if (msg->header.arglen < sizeof(*as)) { error = EINVAL; break; } /* Find entry with supplied id. */ STAILQ_FOREACH(entry, &priv->redirhead, entries) { if (entry->rdr.id == as->id) break; } /* Not found. */ if (entry == NULL) { error = ENOENT; break; } if (LibAliasAddServer(priv->lib, entry->lnk, as->addr, htons(as->port)) == -1) { error = ENOMEM; break; } entry->rdr.lsnat++; } break; case NGM_NAT_LIST_REDIRECTS: { struct ng_nat_rdr_lst *entry; struct ng_nat_list_redirects *ary; int i = 0; NG_MKRESPONSE(resp, msg, sizeof(*ary) + (priv->rdrcount) * sizeof(*entry), M_NOWAIT); if (resp == NULL) { error = ENOMEM; break; } ary = (struct ng_nat_list_redirects *)resp->data; ary->total_count = priv->rdrcount; STAILQ_FOREACH(entry, &priv->redirhead, entries) { bcopy(&entry->rdr, &ary->redirects[i++], sizeof(struct ng_nat_listrdrs_entry)); } } break; case NGM_NAT_PROXY_RULE: { char *cmd = (char *)msg->data; if (msg->header.arglen < 6) { error = EINVAL; break; } if (LibAliasProxyRule(priv->lib, cmd) != 0) error = ENOMEM; } break; case NGM_NAT_LIBALIAS_INFO: { struct ng_nat_libalias_info *i; NG_MKRESPONSE(resp, msg, sizeof(struct ng_nat_libalias_info), M_NOWAIT); if (resp == NULL) { error = ENOMEM; break; } i = (struct ng_nat_libalias_info *)resp->data; #define COPY(F) do { \ if (priv->lib->F >= 0 && priv->lib->F < UINT32_MAX) \ i->F = priv->lib->F; \ else \ i->F = UINT32_MAX; \ } while (0) COPY(icmpLinkCount); COPY(udpLinkCount); COPY(tcpLinkCount); COPY(pptpLinkCount); COPY(sctpLinkCount); COPY(protoLinkCount); COPY(fragmentIdLinkCount); COPY(fragmentPtrLinkCount); COPY(sockCount); #undef COPY } break; case NGM_NAT_SET_DLT: if (msg->header.arglen != sizeof(uint8_t)) { error = EINVAL; break; } switch (*(uint8_t *) msg->data) { case DLT_EN10MB: case DLT_RAW: priv->dlt = *(uint8_t *) msg->data; break; default: error = EINVAL; break; } break; default: error = EINVAL; /* unknown command */ break; } break; case NGM_NAT_GET_DLT: NG_MKRESPONSE(resp, msg, sizeof(uint8_t), M_WAITOK); if (resp == NULL) { error = ENOMEM; break; } *((uint8_t *) resp->data) = priv->dlt; break; default: error = EINVAL; /* unknown cookie type */ break; } NG_RESPOND_MSG(error, node, item, resp); NG_FREE_MSG(msg); return (error); } static int ng_nat_rcvdata(hook_p hook, item_p item ) { const priv_p priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hook)); struct mbuf *m; struct ip *ip; int rval, ipofs, error = 0; char *c; /* We have no required hooks. */ if (!(priv->flags & NGNAT_CONNECTED)) { NG_FREE_ITEM(item); return (ENXIO); } /* We have no alias address yet to do anything. 
*/ if (!(priv->flags & NGNAT_ADDR_DEFINED)) goto send; m = NGI_M(item); if ((m = m_megapullup(m, m->m_pkthdr.len)) == NULL) { NGI_M(item) = NULL; /* avoid double free */ NG_FREE_ITEM(item); return (ENOBUFS); } NGI_M(item) = m; switch (priv->dlt) { case DLT_RAW: ipofs = 0; break; case DLT_EN10MB: { struct ether_header *eh; if (m->m_pkthdr.len < sizeof(struct ether_header)) { NG_FREE_ITEM(item); return (ENXIO); } eh = mtod(m, struct ether_header *); switch (ntohs(eh->ether_type)) { case ETHERTYPE_IP: ipofs = sizeof(struct ether_header); break; default: goto send; } break; } default: panic("Corrupted priv->dlt: %u", priv->dlt); } if (m->m_pkthdr.len < ipofs + sizeof(struct ip)) goto send; /* packet too short to hold IP */ c = (char *)mtodo(m, ipofs); ip = (struct ip *)mtodo(m, ipofs); if (ip->ip_v != IPVERSION) goto send; /* other IP version, let it pass */ if (m->m_pkthdr.len < ipofs + ntohs(ip->ip_len)) goto send; /* packet too short (i.e. fragmented or broken) */ /* * We drop packet when: * 1. libalias returns PKT_ALIAS_ERROR; * 2. For incoming packets: * a) for unresolved fragments; * b) libalias returns PKT_ALIAS_IGNORED and * PKT_ALIAS_DENY_INCOMING flag is set. */ if (hook == priv->in) { rval = LibAliasIn(priv->lib, c, m->m_len - ipofs + M_TRAILINGSPACE(m)); if (rval == PKT_ALIAS_ERROR || rval == PKT_ALIAS_UNRESOLVED_FRAGMENT || (rval == PKT_ALIAS_IGNORED && (priv->lib->packetAliasMode & PKT_ALIAS_DENY_INCOMING) != 0)) { NG_FREE_ITEM(item); return (EINVAL); } } else if (hook == priv->out) { rval = LibAliasOut(priv->lib, c, m->m_len - ipofs + M_TRAILINGSPACE(m)); if (rval == PKT_ALIAS_ERROR) { NG_FREE_ITEM(item); return (EINVAL); } } else panic("ng_nat: unknown hook!\n"); if (rval == PKT_ALIAS_RESPOND) m->m_flags |= M_SKIP_FIREWALL; m->m_pkthdr.len = m->m_len = ntohs(ip->ip_len) + ipofs; if ((ip->ip_off & htons(IP_OFFMASK)) == 0 && ip->ip_p == IPPROTO_TCP) { struct tcphdr *th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); /* * Here is our terrible HACK. * * Sometimes LibAlias edits contents of TCP packet. * In this case it needs to recompute full TCP * checksum. However, the problem is that LibAlias * doesn't have any idea about checksum offloading * in kernel. To workaround this, we do not do * checksumming in LibAlias, but only mark the - * packets in th_x2 field. If we receive a marked - * packet, we calculate correct checksum for it - * aware of offloading. + * packets with TH_RES1 in the th_x2 field. If we + * receive a marked packet, we calculate correct + * checksum for it aware of offloading. * * Why do I do such a terrible hack instead of * recalculating checksum for each packet? * Because the previous checksum was not checked! * Recalculating checksums for EVERY packet will * hide ALL transmission errors. Yes, marked packets * still suffer from this problem. But, sigh, natd(8) * has this problem, too. 
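 * For reference: with the 12-bit flag layout in netinet/tcp.h, TH_RES1 is
 * 0x800 and therefore occupies bit 0x08 of th_x2, so the
 * tcp_get_flags(th) & TH_RES1 test below reads the same reserved-bit
 * storage that the old "if (th->th_x2)" check did, only for that one bit.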
*/ - if (th->th_x2) { + if (tcp_get_flags(th) & TH_RES1) { uint16_t ip_len = ntohs(ip->ip_len); - th->th_x2 = 0; + tcp_set_flags(th, tcp_get_flags(th) & ~TH_RES1); th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(IPPROTO_TCP + ip_len - (ip->ip_hl << 2))); if ((m->m_pkthdr.csum_flags & CSUM_TCP) == 0) { m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); in_delayed_cksum(m); } } } send: if (hook == priv->in) NG_FWD_ITEM_HOOK(error, item, priv->out); else NG_FWD_ITEM_HOOK(error, item, priv->in); return (error); } static int ng_nat_shutdown(node_p node) { const priv_p priv = NG_NODE_PRIVATE(node); NG_NODE_SET_PRIVATE(node, NULL); NG_NODE_UNREF(node); /* Free redirects list. */ while (!STAILQ_EMPTY(&priv->redirhead)) { struct ng_nat_rdr_lst *entry = STAILQ_FIRST(&priv->redirhead); STAILQ_REMOVE_HEAD(&priv->redirhead, entries); free(entry, M_NETGRAPH); } /* Final free. */ LibAliasUninit(priv->lib); free(priv, M_NETGRAPH); return (0); } static int ng_nat_disconnect(hook_p hook) { const priv_p priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hook)); priv->flags &= ~NGNAT_CONNECTED; if (hook == priv->out) priv->out = NULL; if (hook == priv->in) priv->in = NULL; if (priv->out == NULL && priv->in == NULL) ng_rmnode_self(NG_HOOK_NODE(hook)); return (0); } static unsigned int ng_nat_translate_flags(unsigned int x) { unsigned int res = 0; if (x & NG_NAT_LOG) res |= PKT_ALIAS_LOG; if (x & NG_NAT_DENY_INCOMING) res |= PKT_ALIAS_DENY_INCOMING; if (x & NG_NAT_SAME_PORTS) res |= PKT_ALIAS_SAME_PORTS; if (x & NG_NAT_UNREGISTERED_ONLY) res |= PKT_ALIAS_UNREGISTERED_ONLY; if (x & NG_NAT_RESET_ON_ADDR_CHANGE) res |= PKT_ALIAS_RESET_ON_ADDR_CHANGE; if (x & NG_NAT_PROXY_ONLY) res |= PKT_ALIAS_PROXY_ONLY; if (x & NG_NAT_REVERSE) res |= PKT_ALIAS_REVERSE; if (x & NG_NAT_UNREGISTERED_CGN) res |= PKT_ALIAS_UNREGISTERED_CGN; return (res); } diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h index 44f8a67a1e64..209b89c9a427 100644 --- a/sys/netinet/tcp.h +++ b/sys/netinet/tcp.h @@ -1,532 +1,545 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _NETINET_TCP_H_ #define _NETINET_TCP_H_ #include #if __BSD_VISIBLE typedef u_int32_t tcp_seq; #define tcp6_seq tcp_seq /* for KAME src sync over BSD*'s */ #define tcp6hdr tcphdr /* for KAME src sync over BSD*'s */ /* * TCP header. * Per RFC 793, September, 1981. */ struct tcphdr { u_short th_sport; /* source port */ u_short th_dport; /* destination port */ tcp_seq th_seq; /* sequence number */ tcp_seq th_ack; /* acknowledgement number */ #if BYTE_ORDER == LITTLE_ENDIAN u_char th_x2:4, /* upper 4 (reserved) flags */ th_off:4; /* data offset */ #endif #if BYTE_ORDER == BIG_ENDIAN u_char th_off:4, /* data offset */ th_x2:4; /* upper 4 (reserved) flags */ #endif u_char th_flags; #define TH_FIN 0x01 #define TH_SYN 0x02 #define TH_RST 0x04 #define TH_PUSH 0x08 #define TH_ACK 0x10 #define TH_URG 0x20 #define TH_ECE 0x40 #define TH_CWR 0x80 #define TH_AE 0x100 /* maps into th_x2 */ #define TH_RES3 0x200 #define TH_RES2 0x400 #define TH_RES1 0x800 #define TH_FLAGS (TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG|TH_ECE|TH_CWR) #define PRINT_TH_FLAGS "\20\1FIN\2SYN\3RST\4PUSH\5ACK\6URG\7ECE\10CWR\11AE" u_short th_win; /* window */ u_short th_sum; /* checksum */ u_short th_urp; /* urgent pointer */ }; +static inline uint16_t +tcp_get_flags(const struct tcphdr *th) +{ + return (((uint16_t)th->th_x2 << 8) | th->th_flags); +} + +static inline void +tcp_set_flags(struct tcphdr *th, uint16_t flags) +{ + th->th_x2 = (flags >> 8) & 0x0f; + th->th_flags = flags & 0xff; +} + #define PADTCPOLEN(len) ((((len) / 4) + !!((len) % 4)) * 4) #define TCPOPT_EOL 0 #define TCPOLEN_EOL 1 #define TCPOPT_PAD 0 /* padding after EOL */ #define TCPOLEN_PAD 1 #define TCPOPT_NOP 1 #define TCPOLEN_NOP 1 #define TCPOPT_MAXSEG 2 #define TCPOLEN_MAXSEG 4 #define TCPOPT_WINDOW 3 #define TCPOLEN_WINDOW 3 #define TCPOPT_SACK_PERMITTED 4 #define TCPOLEN_SACK_PERMITTED 2 #define TCPOPT_SACK 5 #define TCPOLEN_SACKHDR 2 #define TCPOLEN_SACK 8 /* 2*sizeof(tcp_seq) */ #define TCPOPT_TIMESTAMP 8 #define TCPOLEN_TIMESTAMP 10 #define TCPOLEN_TSTAMP_APPA (TCPOLEN_TIMESTAMP+2) /* appendix A */ #define TCPOPT_SIGNATURE 19 /* Keyed MD5: RFC 2385 */ #define TCPOLEN_SIGNATURE 18 #define TCPOPT_FAST_OPEN 34 #define TCPOLEN_FAST_OPEN_EMPTY 2 #define MAX_TCPOPTLEN 40 /* Absolute maximum TCP options len */ /* Miscellaneous constants */ #define MAX_SACK_BLKS 6 /* Max # SACK blocks stored at receiver side */ #define TCP_MAX_SACK 4 /* MAX # SACKs sent in any segment */ /* * The default maximum segment size (MSS) to be used for new TCP connections * when path MTU discovery is not enabled. * * RFC879 derives the default MSS from the largest datagram size hosts are * minimally required to handle directly or through IP reassembly minus the * size of the IP and TCP header. With IPv6 the minimum MTU is specified * in RFC2460. * * For IPv4 the MSS is 576 - sizeof(struct tcpiphdr) * For IPv6 the MSS is IPV6_MMTU - sizeof(struct ip6_hdr) - sizeof(struct tcphdr) * * We use explicit numerical definition here to avoid header pollution. 
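 * That works out to 576 - 40 = 536 for IPv4 and 1280 - 40 - 20 = 1220 for IPv6.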
*/ #define TCP_MSS 536 #define TCP6_MSS 1220 /* * Limit the lowest MSS we accept for path MTU discovery and the TCP SYN MSS * option. Allowing low values of MSS can consume significant resources and * be used to mount a resource exhaustion attack. * Connections requesting lower MSS values will be rounded up to this value * and the IP_DF flag will be cleared to allow fragmentation along the path. * * See tcp_subr.c tcp_minmss SYSCTL declaration for more comments. Setting * it to "0" disables the minmss check. * * The default value is fine for TCP across the Internet's smallest official * link MTU (256 bytes for AX.25 packet radio). However, a connection is very * unlikely to come across such low MTU interfaces these days (anno domini 2003). */ #define TCP_MINMSS 216 #define TCP_MAXWIN 65535 /* largest value for (unscaled) window */ #define TTCP_CLIENT_SND_WND 4096 /* dflt send window for T/TCP client */ #define TCP_MAX_WINSHIFT 14 /* maximum window shift */ #define TCP_MAXBURST 4 /* maximum segments in a burst */ #define TCP_MAXHLEN (0xf<<2) /* max length of header in bytes */ #define TCP_MAXOLEN (TCP_MAXHLEN - sizeof(struct tcphdr)) /* max space left for options */ #define TCP_FASTOPEN_MIN_COOKIE_LEN 4 /* Per RFC7413 */ #define TCP_FASTOPEN_MAX_COOKIE_LEN 16 /* Per RFC7413 */ #define TCP_FASTOPEN_PSK_LEN 16 /* Same as TCP_FASTOPEN_KEY_LEN */ #endif /* __BSD_VISIBLE */ /* * User-settable options (used with setsockopt). These are discrete * values and are not masked together. Some values appear to be * bitmasks for historical reasons. */ #define TCP_NODELAY 1 /* don't delay send to coalesce packets */ #if __BSD_VISIBLE #define TCP_MAXSEG 2 /* set maximum segment size */ #define TCP_NOPUSH 4 /* don't push last block of write */ #define TCP_NOOPT 8 /* don't use TCP options */ #define TCP_MD5SIG 16 /* use MD5 digests (RFC2385) */ #define TCP_INFO 32 /* retrieve tcp_info structure */ #define TCP_STATS 33 /* retrieve stats blob structure */ #define TCP_LOG 34 /* configure event logging for connection */ #define TCP_LOGBUF 35 /* retrieve event log for connection */ #define TCP_LOGID 36 /* configure log ID to correlate connections */ #define TCP_LOGDUMP 37 /* dump connection log events to device */ #define TCP_LOGDUMPID 38 /* dump events from connections with same ID to device */ #define TCP_TXTLS_ENABLE 39 /* TLS framing and encryption for transmit */ #define TCP_TXTLS_MODE 40 /* Transmit TLS mode */ #define TCP_RXTLS_ENABLE 41 /* TLS framing and encryption for receive */ #define TCP_RXTLS_MODE 42 /* Receive TLS mode */ #define TCP_IWND_NB 43 /* Override initial window (units: bytes) */ #define TCP_IWND_NSEG 44 /* Override initial window (units: MSS segs) */ #define TCP_LOGID_CNT 46 /* get number of connections with the same ID */ #define TCP_LOG_TAG 47 /* configure tag for grouping logs */ #define TCP_USER_LOG 48 /* userspace log event */ #define TCP_CONGESTION 64 /* get/set congestion control algorithm */ #define TCP_CCALGOOPT 65 /* get/set cc algorithm specific options */ #define TCP_MAXUNACKTIME 68 /* maximum time without making progress (sec) */ #define TCP_MAXPEAKRATE 69 /* maximum peak rate allowed (kbps) */ #define TCP_IDLE_REDUCE 70 /* Reduce cwnd on idle input */ #define TCP_REMOTE_UDP_ENCAPS_PORT 71 /* Enable TCP over UDP tunneling via the specified port */ #define TCP_DELACK 72 /* socket option for delayed ack */ #define TCP_FIN_IS_RST 73 /* A fin from the peer is treated has a RST */ #define TCP_LOG_LIMIT 74 /* Limit to number of records in tcp-log */ #define TCP_SHARED_CWND_ALLOWED 
75 /* Use of a shared cwnd is allowed */ #define TCP_PROC_ACCOUNTING 76 /* Do accounting on tcp cpu usage and counts */ #define TCP_USE_CMP_ACKS 77 /* The transport can handle the Compressed mbuf acks */ #define TCP_PERF_INFO 78 /* retrieve accounting counters */ #define TCP_KEEPINIT 128 /* N, time to establish connection */ #define TCP_KEEPIDLE 256 /* L,N,X start keeplives after this period */ #define TCP_KEEPINTVL 512 /* L,N interval between keepalives */ #define TCP_KEEPCNT 1024 /* L,N number of keepalives before close */ #define TCP_FASTOPEN 1025 /* enable TFO / was created via TFO */ #define TCP_PCAP_OUT 2048 /* number of output packets to keep */ #define TCP_PCAP_IN 4096 /* number of input packets to keep */ #define TCP_FUNCTION_BLK 8192 /* Set the tcp function pointers to the specified stack */ #define TCP_FUNCTION_ALIAS 8193 /* Get the current tcp function pointer name alias */ /* Options for Rack and BBR */ #define TCP_REUSPORT_LB_NUMA 1026 /* set listen socket numa domain */ #define TCP_RACK_MBUF_QUEUE 1050 /* Do we allow mbuf queuing if supported */ #define TCP_RACK_PROP 1051 /* Not used */ #define TCP_RACK_TLP_REDUCE 1052 /* RACK TLP cwnd reduction (bool) */ #define TCP_RACK_PACE_REDUCE 1053 /* RACK Pacingv reduction factor (divisor) */ #define TCP_RACK_PACE_MAX_SEG 1054 /* Max TSO size we will send */ #define TCP_RACK_PACE_ALWAYS 1055 /* Use the always pace method */ #define TCP_RACK_PROP_RATE 1056 /* Not used */ #define TCP_RACK_PRR_SENDALOT 1057 /* Allow PRR to send more than one seg */ #define TCP_RACK_MIN_TO 1058 /* Minimum time between rack t-o's in ms */ #define TCP_RACK_EARLY_RECOV 1059 /* Not used */ #define TCP_RACK_EARLY_SEG 1060 /* If early recovery max segments */ #define TCP_RACK_REORD_THRESH 1061 /* RACK reorder threshold (shift amount) */ #define TCP_RACK_REORD_FADE 1062 /* Does reordering fade after ms time */ #define TCP_RACK_TLP_THRESH 1063 /* RACK TLP theshold i.e. srtt+(srtt/N) */ #define TCP_RACK_PKT_DELAY 1064 /* RACK added ms i.e. 
rack-rtt + reord + N */ #define TCP_RACK_TLP_INC_VAR 1065 /* Does TLP include rtt variance in t-o */ #define TCP_BBR_IWINTSO 1067 /* Initial TSO window for BBRs first sends */ #define TCP_BBR_RECFORCE 1068 /* Enter recovery force out a segment disregard pacer no longer valid */ #define TCP_BBR_STARTUP_PG 1069 /* Startup pacing gain */ #define TCP_BBR_DRAIN_PG 1070 /* Drain pacing gain */ #define TCP_BBR_RWND_IS_APP 1071 /* Rwnd limited is considered app limited */ #define TCP_BBR_PROBE_RTT_INT 1072 /* How long in useconds between probe-rtt */ #define TCP_BBR_ONE_RETRAN 1073 /* Is only one segment allowed out during retran */ #define TCP_BBR_STARTUP_LOSS_EXIT 1074 /* Do we exit a loss during startup if not 20% incr */ #define TCP_BBR_USE_LOWGAIN 1075 /* lower the gain in PROBE_BW enable */ #define TCP_BBR_LOWGAIN_THRESH 1076 /* Unused after 2.3 morphs to TSLIMITS >= 2.3 */ #define TCP_BBR_TSLIMITS 1076 /* Do we use experimental Timestamp limiting for our algo */ #define TCP_BBR_LOWGAIN_HALF 1077 /* Unused after 2.3 */ #define TCP_BBR_PACE_OH 1077 /* Reused in 4.2 for pacing overhead setting */ #define TCP_BBR_LOWGAIN_FD 1078 /* Unused after 2.3 */ #define TCP_BBR_HOLD_TARGET 1078 /* For 4.3 on */ #define TCP_BBR_USEDEL_RATE 1079 /* Enable use of delivery rate for loss recovery */ #define TCP_BBR_MIN_RTO 1080 /* Min RTO in milliseconds */ #define TCP_BBR_MAX_RTO 1081 /* Max RTO in milliseconds */ #define TCP_BBR_REC_OVER_HPTS 1082 /* Recovery override htps settings 0/1/3 */ #define TCP_BBR_UNLIMITED 1083 /* Not used before 2.3 and morphs to algorithm >= 2.3 */ #define TCP_BBR_ALGORITHM 1083 /* What measurement algo does BBR use netflix=0, google=1 */ #define TCP_BBR_DRAIN_INC_EXTRA 1084 /* Does the 3/4 drain target include the extra gain */ #define TCP_BBR_STARTUP_EXIT_EPOCH 1085 /* what epoch gets us out of startup */ #define TCP_BBR_PACE_PER_SEC 1086 #define TCP_BBR_PACE_DEL_TAR 1087 #define TCP_BBR_PACE_SEG_MAX 1088 #define TCP_BBR_PACE_SEG_MIN 1089 #define TCP_BBR_PACE_CROSS 1090 #define TCP_RACK_IDLE_REDUCE_HIGH 1092 /* Reduce the highest cwnd seen to IW on idle */ #define TCP_RACK_MIN_PACE 1093 /* Do we enforce rack min pace time */ #define TCP_RACK_MIN_PACE_SEG 1094 /* If so what is the seg threshould */ #define TCP_RACK_GP_INCREASE 1094 /* After 4.1 its the GP increase in older rack */ #define TCP_RACK_TLP_USE 1095 #define TCP_BBR_ACK_COMP_ALG 1096 /* Not used */ #define TCP_BBR_TMR_PACE_OH 1096 /* Recycled in 4.2 */ #define TCP_BBR_EXTRA_GAIN 1097 #define TCP_RACK_DO_DETECTION 1097 /* Recycle of extra gain for rack, attack detection */ #define TCP_BBR_RACK_RTT_USE 1098 /* what RTT should we use 0, 1, or 2? */ #define TCP_BBR_RETRAN_WTSO 1099 #define TCP_DATA_AFTER_CLOSE 1100 #define TCP_BBR_PROBE_RTT_GAIN 1101 #define TCP_BBR_PROBE_RTT_LEN 1102 #define TCP_BBR_SEND_IWND_IN_TSO 1103 /* Do we burst out whole iwin size chunks at start? */ #define TCP_BBR_USE_RACK_RR 1104 /* Do we use the rack rapid recovery for pacing rxt's */ #define TCP_BBR_USE_RACK_CHEAT TCP_BBR_USE_RACK_RR /* Compat. 
*/ #define TCP_BBR_HDWR_PACE 1105 /* Enable/disable hardware pacing */ #define TCP_BBR_UTTER_MAX_TSO 1106 /* Do we enforce an utter max TSO size */ #define TCP_BBR_EXTRA_STATE 1107 /* Special exit-persist catch up */ #define TCP_BBR_FLOOR_MIN_TSO 1108 /* The min tso size */ #define TCP_BBR_MIN_TOPACEOUT 1109 /* Do we suspend pacing until */ #define TCP_BBR_TSTMP_RAISES 1110 /* Can a timestamp measurement raise the b/w */ #define TCP_BBR_POLICER_DETECT 1111 /* Turn on/off google mode policer detection */ #define TCP_BBR_RACK_INIT_RATE 1112 /* Set an initial pacing rate for when we have no b/w in kbits per sec */ #define TCP_RACK_RR_CONF 1113 /* Rack rapid recovery configuration control*/ #define TCP_RACK_CHEAT_NOT_CONF_RATE TCP_RACK_RR_CONF #define TCP_RACK_GP_INCREASE_CA 1114 /* GP increase for Congestion Avoidance */ #define TCP_RACK_GP_INCREASE_SS 1115 /* GP increase for Slow Start */ #define TCP_RACK_GP_INCREASE_REC 1116 /* GP increase for Recovery */ #define TCP_RACK_FORCE_MSEG 1117 /* Override to use the user set max-seg value */ #define TCP_RACK_PACE_RATE_CA 1118 /* Pacing rate for Congestion Avoidance */ #define TCP_RACK_PACE_RATE_SS 1119 /* Pacing rate for Slow Start */ #define TCP_RACK_PACE_RATE_REC 1120 /* Pacing rate for Recovery */ #define TCP_NO_PRR 1122 /* If pacing, don't use prr */ #define TCP_RACK_NONRXT_CFG_RATE 1123 /* In recovery does a non-rxt use the cfg rate */ #define TCP_SHARED_CWND_ENABLE 1124 /* Use a shared cwnd if allowed */ #define TCP_TIMELY_DYN_ADJ 1125 /* Do we attempt dynamic multipler adjustment with timely. */ #define TCP_RACK_NO_PUSH_AT_MAX 1126 /* For timely do not push if we are over max rtt */ #define TCP_RACK_PACE_TO_FILL 1127 /* If we are not in recovery, always pace to fill the cwnd in 1 RTT */ #define TCP_SHARED_CWND_TIME_LIMIT 1128 /* we should limit to low time values the scwnd life */ #define TCP_RACK_PROFILE 1129 /* Select a profile that sets multiple options */ #define TCP_HDWR_RATE_CAP 1130 /* Allow hardware rates to cap pacing rate */ #define TCP_PACING_RATE_CAP 1131 /* Highest rate allowed in pacing in bytes per second (uint64_t) */ #define TCP_HDWR_UP_ONLY 1132 /* Allow the pacing rate to climb but not descend (with the exception of fill-cw */ #define TCP_RACK_ABC_VAL 1133 /* Set a local ABC value different then the system default */ #define TCP_REC_ABC_VAL 1134 /* Do we use the ABC value for recovery or the override one from sysctl */ #define TCP_RACK_MEASURE_CNT 1135 /* How many measurements are required in GP pacing */ #define TCP_DEFER_OPTIONS 1136 /* Defer options until the proper number of measurements occur, does not defer TCP_RACK_MEASURE_CNT */ #define TCP_FAST_RSM_HACK 1137 /* Not used in modern stacks */ #define TCP_RACK_PACING_BETA 1138 /* Changing the beta for pacing */ #define TCP_RACK_PACING_BETA_ECN 1139 /* Changing the beta for ecn with pacing */ #define TCP_RACK_TIMER_SLOP 1140 /* Set or get the timer slop used */ #define TCP_RACK_DSACK_OPT 1141 /* How do we setup rack timer DSACK options bit 1/2 */ #define TCP_RACK_ENABLE_HYSTART 1142 /* Do we allow hystart in the CC modules */ #define TCP_RACK_SET_RXT_OPTIONS 1143 /* Set the bits in the retransmit options */ #define TCP_RACK_HI_BETA 1144 /* Turn on/off high beta */ #define TCP_RACK_SPLIT_LIMIT 1145 /* Set a split limit for split allocations */ #define TCP_RACK_PACING_DIVISOR 1146 /* Pacing divisor given to rate-limit code for burst sizing */ #define TCP_RACK_PACE_MIN_SEG 1147 /* Pacing min seg size rack will use */ #define TCP_RACK_DGP_IN_REC 1148 /* Do we use full 
DGP in recovery? */ #define TCP_RXT_CLAMP 1149 /* Do we apply a threshold to rack so if excess rxt clamp cwnd? */ #define TCP_HYBRID_PACING 1150 /* Hybrid pacing enablement */ #define TCP_PACING_DND 1151 /* When pacing with rr_config=3 can sacks disturb us */ /* Start of reserved space for third-party user-settable options. */ #define TCP_VENDOR SO_VENDOR #define TCP_CA_NAME_MAX 16 /* max congestion control name length */ #define TCPI_OPT_TIMESTAMPS 0x01 #define TCPI_OPT_SACK 0x02 #define TCPI_OPT_WSCALE 0x04 #define TCPI_OPT_ECN 0x08 #define TCPI_OPT_TOE 0x10 #define TCPI_OPT_TFO 0x20 #define TCPI_OPT_ACE 0x40 /* Maximum length of log ID. */ #define TCP_LOG_ID_LEN 64 /* TCP accounting counters */ #define TCP_NUM_PROC_COUNTERS 11 #define TCP_NUM_CNT_COUNTERS 13 /* Must match counter array sizes in tcpcb */ struct tcp_perf_info { uint64_t tcp_cnt_counters[TCP_NUM_CNT_COUNTERS]; uint64_t tcp_proc_time[TCP_NUM_CNT_COUNTERS]; uint64_t timebase; /* timebase for tcp_proc_time */ uint8_t tb_is_stable; /* timebase is stable/invariant */ }; /* * The TCP_INFO socket option comes from the Linux 2.6 TCP API, and permits * the caller to query certain information about the state of a TCP * connection. We provide an overlapping set of fields with the Linux * implementation, but since this is a fixed size structure, room has been * left for growth. In order to maximize potential future compatibility with * the Linux API, the same variable names and order have been adopted, and * padding left to make room for omitted fields in case they are added later. * * XXX: This is currently an unstable ABI/API, in that it is expected to * change. */ struct tcp_info { u_int8_t tcpi_state; /* TCP FSM state. */ u_int8_t __tcpi_ca_state; u_int8_t __tcpi_retransmits; u_int8_t __tcpi_probes; u_int8_t __tcpi_backoff; u_int8_t tcpi_options; /* Options enabled on conn. */ u_int8_t tcpi_snd_wscale:4, /* RFC1323 send shift value. */ tcpi_rcv_wscale:4; /* RFC1323 recv shift value. */ u_int32_t tcpi_rto; /* Retransmission timeout (usec). */ u_int32_t __tcpi_ato; u_int32_t tcpi_snd_mss; /* Max segment size for send. */ u_int32_t tcpi_rcv_mss; /* Max segment size for receive. */ u_int32_t __tcpi_unacked; u_int32_t __tcpi_sacked; u_int32_t __tcpi_lost; u_int32_t __tcpi_retrans; u_int32_t __tcpi_fackets; /* Times; measurements in usecs. */ u_int32_t __tcpi_last_data_sent; u_int32_t __tcpi_last_ack_sent; /* Also unimpl. on Linux? */ u_int32_t tcpi_last_data_recv; /* Time since last recv data. */ u_int32_t __tcpi_last_ack_recv; /* Metrics; variable units. */ u_int32_t __tcpi_pmtu; u_int32_t __tcpi_rcv_ssthresh; u_int32_t tcpi_rtt; /* Smoothed RTT in usecs. */ u_int32_t tcpi_rttvar; /* RTT variance in usecs. */ u_int32_t tcpi_snd_ssthresh; /* Slow start threshold. */ u_int32_t tcpi_snd_cwnd; /* Send congestion window. */ u_int32_t __tcpi_advmss; u_int32_t __tcpi_reordering; u_int32_t __tcpi_rcv_rtt; u_int32_t tcpi_rcv_space; /* Advertised recv window. */ /* FreeBSD extensions to tcp_info. */ u_int32_t tcpi_snd_wnd; /* Advertised send window. */ u_int32_t tcpi_snd_bwnd; /* No longer used. */ u_int32_t tcpi_snd_nxt; /* Next egress seqno */ u_int32_t tcpi_rcv_nxt; /* Next ingress seqno */ u_int32_t tcpi_toe_tid; /* HWTID for TOE endpoints */ u_int32_t tcpi_snd_rexmitpack; /* Retransmitted packets */ u_int32_t tcpi_rcv_ooopack; /* Out-of-order packets */ u_int32_t tcpi_snd_zerowin; /* Zero-sized windows sent */ /* Accurate ECN counters. 
*/ u_int32_t tcpi_delivered_ce; u_int32_t tcpi_received_ce; /* # of CE marks received */ u_int32_t __tcpi_delivered_e1_bytes; u_int32_t __tcpi_delivered_e0_bytes; u_int32_t __tcpi_delivered_ce_bytes; u_int32_t __tcpi_received_e1_bytes; u_int32_t __tcpi_received_e0_bytes; u_int32_t __tcpi_received_ce_bytes; u_int32_t tcpi_total_tlp; /* tail loss probes sent */ u_int64_t tcpi_total_tlp_bytes; /* tail loss probe bytes sent */ u_int32_t tcpi_snd_una; /* Unacked seqno sent */ u_int32_t tcpi_snd_max; /* Highest seqno sent */ u_int32_t tcpi_rcv_numsacks; /* Distinct SACK blks present */ u_int32_t tcpi_rcv_adv; /* Peer advertised window */ u_int32_t tcpi_dupacks; /* Consecutive dup ACKs recvd */ /* Padding to grow without breaking ABI. */ u_int32_t __tcpi_pad[14]; /* Padding. */ }; /* * If this structure is provided when setting the TCP_FASTOPEN socket * option, and the enable member is non-zero, a subsequent connect will use * pre-shared key (PSK) mode using the provided key. */ struct tcp_fastopen { int enable; uint8_t psk[TCP_FASTOPEN_PSK_LEN]; }; #endif #define TCP_FUNCTION_NAME_LEN_MAX 32 struct tcp_function_set { char function_set_name[TCP_FUNCTION_NAME_LEN_MAX]; uint32_t pcbcnt; }; /* TLS modes for TCP_TXTLS_MODE */ #define TCP_TLS_MODE_NONE 0 #define TCP_TLS_MODE_SW 1 #define TCP_TLS_MODE_IFNET 2 #define TCP_TLS_MODE_TOE 3 /* * TCP Control message types */ #define TLS_SET_RECORD_TYPE 1 #define TLS_GET_RECORD 2 /* * TCP log user opaque */ struct tcp_snd_req { uint64_t timestamp; uint64_t start; uint64_t end; uint32_t flags; }; union tcp_log_userdata { struct tcp_snd_req tcp_req; }; struct tcp_log_user { uint32_t type; uint32_t subtype; union tcp_log_userdata data; }; /* user types, i.e. apps */ #define TCP_LOG_USER_HTTPD 1 /* user subtypes */ #define TCP_LOG_HTTPD_TS 1 /* client timestamp */ #define TCP_LOG_HTTPD_TS_REQ 2 /* client timestamp and request info */ /* HTTPD REQ flags */ #define TCP_LOG_HTTPD_RANGE_START 0x0001 #define TCP_LOG_HTTPD_RANGE_END 0x0002 /* Flags for hybrid pacing */ #define TCP_HYBRID_PACING_CU 0x0001 /* Enable catch-up mode */ #define TCP_HYBRID_PACING_DTL 0x0002 /* Enable Detailed logging */ #define TCP_HYBRID_PACING_CSPR 0x0004 /* A client suggested rate is present */ #define TCP_HYBRID_PACING_H_MS 0x0008 /* A client hint for maxseg is present */ #define TCP_HYBRID_PACING_ENABLE 0x0010 /* We are enabling hybrid pacing else disable */ #define TCP_HYBRID_PACING_S_MSS 0x0020 /* Clent wants us to set the mss overriding gp est in CU */ #define TCP_HYBRID_PACING_SETMSS 0x1000 /* Internal flag that tellsus we set the mss on this entry */ #define TCP_HYBRID_PACING_WASSET 0x2000 /* We init to this to know if a hybrid command was issued */ struct tcp_hybrid_req { struct tcp_snd_req req; uint64_t cspr; uint32_t hint_maxseg; uint32_t hybrid_flags; }; /* * TCP specific variables of interest for tp->t_stats stats(9) accounting. 
*/ #define VOI_TCP_TXPB 0 /* Transmit payload bytes */ #define VOI_TCP_RETXPB 1 /* Retransmit payload bytes */ #define VOI_TCP_FRWIN 2 /* Foreign receive window */ #define VOI_TCP_LCWIN 3 /* Local congestion window */ #define VOI_TCP_RTT 4 /* Round trip time */ #define VOI_TCP_CSIG 5 /* Congestion signal */ #define VOI_TCP_GPUT 6 /* Goodput */ #define VOI_TCP_CALCFRWINDIFF 7 /* Congestion avoidance LCWIN - FRWIN */ #define VOI_TCP_GPUT_ND 8 /* Goodput normalised delta */ #define VOI_TCP_ACKLEN 9 /* Average ACKed bytes per ACK */ #define VOI_TCP_PATHRTT 10 /* The path RTT based on ACK arrival */ #define TCP_REUSPORT_LB_NUMA_NODOM (-2) /* remove numa binding */ #define TCP_REUSPORT_LB_NUMA_CURDOM (-1) /* bind to current domain */ #endif /* !_NETINET_TCP_H_ */ diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index af441b4fc7d7..c2b15526c15b 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -1,1593 +1,1581 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
*/ #ifndef _NETINET_TCP_VAR_H_ #define _NETINET_TCP_VAR_H_ #include #include #ifdef _KERNEL #include #include #include #endif #define TCP_END_BYTE_INFO 8 /* Bytes that makeup the "end information array" */ /* Types of ending byte info */ #define TCP_EI_EMPTY_SLOT 0 #define TCP_EI_STATUS_CLIENT_FIN 0x1 #define TCP_EI_STATUS_CLIENT_RST 0x2 #define TCP_EI_STATUS_SERVER_FIN 0x3 #define TCP_EI_STATUS_SERVER_RST 0x4 #define TCP_EI_STATUS_RETRAN 0x5 #define TCP_EI_STATUS_PROGRESS 0x6 #define TCP_EI_STATUS_PERSIST_MAX 0x7 #define TCP_EI_STATUS_KEEP_MAX 0x8 #define TCP_EI_STATUS_DATA_A_CLOSE 0x9 #define TCP_EI_STATUS_RST_IN_FRONT 0xa #define TCP_EI_STATUS_2MSL 0xb #define TCP_EI_STATUS_MAX_VALUE 0xb #define TCP_TRK_REQ_LOG_NEW 0x01 #define TCP_TRK_REQ_LOG_COMPLETE 0x02 #define TCP_TRK_REQ_LOG_FREED 0x03 #define TCP_TRK_REQ_LOG_ALLOCFAIL 0x04 #define TCP_TRK_REQ_LOG_MOREYET 0x05 #define TCP_TRK_REQ_LOG_FORCEFREE 0x06 #define TCP_TRK_REQ_LOG_STALE 0x07 #define TCP_TRK_REQ_LOG_SEARCH 0x08 /************************************************/ /* Status bits we track to assure no duplicates, * the bits here are not used by the code but * for human representation. To check a bit we * take and shift over by 1 minus the value (1-8). */ /************************************************/ #define TCP_EI_BITS_CLIENT_FIN 0x001 #define TCP_EI_BITS_CLIENT_RST 0x002 #define TCP_EI_BITS_SERVER_FIN 0x004 #define TCP_EI_BITS_SERVER_RST 0x008 #define TCP_EI_BITS_RETRAN 0x010 #define TCP_EI_BITS_PROGRESS 0x020 #define TCP_EI_BITS_PRESIST_MAX 0x040 #define TCP_EI_BITS_KEEP_MAX 0x080 #define TCP_EI_BITS_DATA_A_CLO 0x100 #define TCP_EI_BITS_RST_IN_FR 0x200 /* a front state reset */ #define TCP_EI_BITS_2MS_TIMER 0x400 /* 2 MSL timer expired */ #if defined(_KERNEL) || defined(_WANT_TCPCB) #include /* TCP segment queue entry */ struct tseg_qent { TAILQ_ENTRY(tseg_qent) tqe_q; struct mbuf *tqe_m; /* mbuf contains packet */ struct mbuf *tqe_last; /* last mbuf in chain */ tcp_seq tqe_start; /* TCP Sequence number start */ int tqe_len; /* TCP segment data length */ uint32_t tqe_flags; /* The flags from tcp_get_flags() */ uint32_t tqe_mbuf_cnt; /* Count of mbuf overhead */ }; TAILQ_HEAD(tsegqe_head, tseg_qent); struct sackblk { tcp_seq start; /* start seq no. of sack block */ tcp_seq end; /* end seq no. */ }; struct sackhole { tcp_seq start; /* start seq no. of hole */ tcp_seq end; /* end seq no. */ tcp_seq rxmit; /* next seq. 
no in hole to be retransmitted */ TAILQ_ENTRY(sackhole) scblink; /* scoreboard linkage */ }; struct sackhint { struct sackhole *nexthole; int32_t sack_bytes_rexmit; tcp_seq last_sack_ack; /* Most recent/largest sacked ack */ int32_t delivered_data; /* Newly acked data from last SACK */ int32_t sacked_bytes; /* Total sacked bytes reported by the * receiver via sack option */ uint32_t recover_fs; /* Flight Size at the start of Loss recovery */ uint32_t prr_delivered; /* Total bytes delivered using PRR */ uint32_t prr_out; /* Bytes sent during IN_RECOVERY */ int32_t hole_bytes; /* current number of bytes in scoreboard holes */ int32_t lost_bytes; /* number of rfc6675 IsLost() bytes */ }; #define SEGQ_EMPTY(tp) TAILQ_EMPTY(&(tp)->t_segq) STAILQ_HEAD(tcp_log_stailq, tcp_log_mem); #define TCP_TRK_TRACK_FLG_EMPTY 0x00 /* Available */ #define TCP_TRK_TRACK_FLG_USED 0x01 /* In use */ #define TCP_TRK_TRACK_FLG_OPEN 0x02 /* End is not valid (open range request) */ #define TCP_TRK_TRACK_FLG_SEQV 0x04 /* We had a sendfile that touched it */ #define TCP_TRK_TRACK_FLG_COMP 0x08 /* Sendfile as placed the last bits (range req only) */ #define TCP_TRK_TRACK_FLG_FSND 0x10 /* First send has been done into the seq space */ #define MAX_TCP_TRK_REQ 5 /* Max we will have at once */ struct tcp_sendfile_track { uint64_t timestamp; /* User sent timestamp */ uint64_t start; /* Start of sendfile offset */ uint64_t end; /* End if not open-range req */ uint64_t localtime; /* Time we actually got the req */ uint64_t deadline; /* If in CU mode, deadline to delivery */ uint64_t first_send; /* Time of first send in the range */ uint64_t cspr; /* Client suggested pace rate */ uint64_t sent_at_fs; /* What was t_sndbytes as we begun sending */ uint64_t rxt_at_fs; /* What was t_snd_rxt_bytes as we begun sending */ tcp_seq start_seq; /* First TCP Seq assigned */ tcp_seq end_seq; /* If range req last seq */ uint32_t flags; /* Type of request open etc */ uint32_t sbcc_at_s; /* When we allocate what is the sb_cc */ uint32_t hint_maxseg; /* Client hinted maxseg */ uint32_t hybrid_flags; /* Hybrid flags on this request */ }; /* * Change Query responses for a stack switch we create a structure * that allows query response from the new stack to the old, if * supported. * * There are three queries currently defined. * - sendmap * - timers * - rack_times * * For the sendmap query the caller fills in the * req and the req_param as the first seq (usually * snd_una). When the response comes back indicating * that there was data (return value 1), then the caller * can build a sendmap entry based on the range and the * times. The next query would then be done at the * newly created sendmap_end. Repeated until sendmap_end == snd_max. * * Flags in sendmap_flags are defined below as well. * * For timers the standard PACE_TMR_XXXX flags are returned indicating * a pacing timer (possibly) and one other timer. If pacing timer then * the expiration timeout time in microseconds is in timer_pacing_to. * And the value used with whatever timer (if a flag is set) is in * timer_rxt. If no timers are running a 0 is returned and of * course no flags are set in timer_hpts_flags. * * The rack_times are a misc collection of information that * the old stack might possibly fill in. Of course its possible * that an old stack may not have a piece of information. If so * then setting that value to zero is advised. Setting any * timestamp passed should only place a zero in it when it * is unfilled. 
This may mean that a time is off by a micro-second * but this is ok in the grand scheme of things. * * When switching stacks it is desireable to get as much information * from the old stack to the new stack as possible. Though not always * will the stack be compatible in the types of information. The * init() function needs to take care when it begins changing * things such as inp_flags2 and the timer units to position these * changes at a point where it is unlikely they will fail after * making such changes. A stack optionally can have an "undo" * function * * To transfer information to the old stack from the new in * respect to LRO and the inp_flags2, the new stack should set * the inp_flags2 to what it supports. The old stack in its * fini() function should call the tcp_handle_orphaned_packets() * to clean up any packets. Note that a new stack should attempt */ /* Query types */ #define TCP_QUERY_SENDMAP 1 #define TCP_QUERY_TIMERS_UP 2 #define TCP_QUERY_RACK_TIMES 3 /* Flags returned in sendmap_flags */ #define SNDMAP_ACKED 0x000001/* The remote endpoint acked this */ #define SNDMAP_OVERMAX 0x000008/* We have more retran's then we can fit */ #define SNDMAP_SACK_PASSED 0x000010/* A sack was done above this block */ #define SNDMAP_HAS_FIN 0x000040/* segment is sent with fin */ #define SNDMAP_TLP 0x000080/* segment sent as tail-loss-probe */ #define SNDMAP_HAS_SYN 0x000800/* SYN is on this guy */ #define SNDMAP_HAD_PUSH 0x008000/* Push was sent on original send */ #define SNDMAP_MASK (SNDMAP_ACKED|SNDMAP_OVERMAX|SNDMAP_SACK_PASSED|SNDMAP_HAS_FIN\ |SNDMAP_TLP|SNDMAP_HAS_SYN|SNDMAP_HAD_PUSH) #define SNDMAP_NRTX 3 struct tcp_query_resp { int req; uint32_t req_param; union { struct { tcp_seq sendmap_start; tcp_seq sendmap_end; int sendmap_send_cnt; uint64_t sendmap_time[SNDMAP_NRTX]; uint64_t sendmap_ack_arrival; int sendmap_flags; uint32_t sendmap_r_rtr_bytes; /* If FAS is available if not 0 */ uint32_t sendmap_fas; uint8_t sendmap_dupacks; }; struct { uint32_t timer_hpts_flags; uint32_t timer_pacing_to; uint32_t timer_timer_exp; }; struct { /* Timestamps and rtt's */ uint32_t rack_reorder_ts; /* Last uscts that reordering was seen */ uint32_t rack_num_dsacks; /* Num of dsacks seen */ uint32_t rack_rxt_last_time; /* Last time a RXT/TLP or rack tmr went off */ uint32_t rack_min_rtt; /* never 0 smallest rtt seen */ uint32_t rack_rtt; /* Last rtt used by rack */ uint32_t rack_tmit_time; /* The time the rtt seg was tmited */ uint32_t rack_time_went_idle; /* If in persist the time we went idle */ /* Prr data */ uint32_t rack_sacked; uint32_t rack_holes_rxt; uint32_t rack_prr_delivered; uint32_t rack_prr_recovery_fs; uint32_t rack_prr_out; uint32_t rack_prr_sndcnt; /* TLP data */ uint16_t rack_tlp_cnt_out; /* How many tlp's have been sent */ /* Various bits */ uint8_t rack_tlp_out; /* Is a TLP outstanding */ uint8_t rack_srtt_measured; /* The previous stack has measured srtt */ uint8_t rack_in_persist; /* Is the old stack in persists? */ uint8_t rack_wanted_output; /* Did the prevous stack have a want output set */ }; }; }; #define TCP_TMR_GRANULARITY_TICKS 1 /* TCP timers are in ticks (msec if hz=1000) */ #define TCP_TMR_GRANULARITY_USEC 2 /* TCP timers are in microseconds */ typedef enum { TT_REXMT = 0, TT_PERSIST, TT_KEEP, TT_2MSL, TT_DELACK, TT_N, } tt_which; typedef enum { TT_PROCESSING = 0, TT_PROCESSED, TT_STARTING, TT_STOPPING, } tt_what; /* * Tcp control block, one per tcp connection. 
*/ struct tcpcb { struct inpcb t_inpcb; /* embedded protocol independent cb */ #define t_start_zero t_fb #define t_zero_size (sizeof(struct tcpcb) - \ offsetof(struct tcpcb, t_start_zero)) struct tcp_function_block *t_fb;/* TCP function call block */ void *t_fb_ptr; /* Pointer to t_fb specific data */ struct callout t_callout; sbintime_t t_timers[TT_N]; sbintime_t t_precisions[TT_N]; /* HPTS. Used by BBR and Rack stacks. See tcp_hpts.c for more info. */ TAILQ_ENTRY(tcpcb) t_hpts; /* linkage to HPTS ring */ STAILQ_HEAD(, mbuf) t_inqueue; /* HPTS input packets queue */ uint32_t t_hpts_request; /* Current hpts request, zero if * fits in the pacing window. */ uint32_t t_hpts_slot; /* HPTS wheel slot this tcb is. */ uint32_t t_hpts_drop_reas; /* Reason we are dropping the pcb. */ uint32_t t_hpts_gencnt; uint16_t t_hpts_cpu; /* CPU chosen by hpts_cpuid(). */ uint16_t t_lro_cpu; /* CPU derived from LRO. */ #define HPTS_CPU_NONE ((uint16_t)-1) enum { IHPTS_NONE = 0, IHPTS_ONQUEUE, IHPTS_MOVING, } t_in_hpts; /* Is it linked into HPTS? */ uint32_t t_maxseg:24, /* maximum segment size */ _t_logstate:8; /* State of "black box" logging */ uint32_t t_port:16, /* Tunneling (over udp) port */ t_state:4, /* state of this connection */ t_idle_reduce : 1, t_delayed_ack: 7, /* Delayed ack variable */ t_fin_is_rst: 1, /* Are fin's treated as resets */ t_log_state_set: 1, bits_spare : 2; u_int t_flags; tcp_seq snd_una; /* sent but unacknowledged */ tcp_seq snd_max; /* highest sequence number sent; * used to recognize retransmits */ tcp_seq snd_nxt; /* send next */ tcp_seq snd_up; /* send urgent pointer */ uint32_t snd_wnd; /* send window */ uint32_t snd_cwnd; /* congestion-controlled window */ uint32_t ts_offset; /* our timestamp offset */ uint32_t rfbuf_ts; /* recv buffer autoscaling timestamp */ int rcv_numsacks; /* # distinct sack blks present */ u_int t_tsomax; /* TSO total burst length limit */ u_int t_tsomaxsegcount; /* TSO maximum segment count */ u_int t_tsomaxsegsize; /* TSO maximum segment size in bytes */ tcp_seq rcv_nxt; /* receive next */ tcp_seq rcv_adv; /* advertised window */ uint32_t rcv_wnd; /* receive window */ u_int t_flags2; /* More tcpcb flags storage */ int t_srtt; /* smoothed round-trip time */ int t_rttvar; /* variance in round-trip time */ uint32_t ts_recent; /* timestamp echo data */ u_char snd_scale; /* window scaling for send window */ u_char rcv_scale; /* window scaling for recv window */ u_char snd_limited; /* segments limited transmitted */ u_char request_r_scale; /* pending window scaling */ tcp_seq last_ack_sent; u_int t_rcvtime; /* inactivity time */ tcp_seq rcv_up; /* receive urgent pointer */ int t_segqlen; /* segment reassembly queue length */ uint32_t t_segqmbuflen; /* total reassembly queue byte length */ struct tsegqe_head t_segq; /* segment reassembly queue */ uint32_t snd_ssthresh; /* snd_cwnd size threshold for * for slow start exponential to * linear switch */ tcp_seq snd_wl1; /* window update seg seq number */ tcp_seq snd_wl2; /* window update seg ack number */ tcp_seq irs; /* initial receive sequence number */ tcp_seq iss; /* initial send sequence number */ u_int t_acktime; /* RACK and BBR incoming new data was acked */ u_int t_sndtime; /* time last data was sent */ u_int ts_recent_age; /* when last updated */ tcp_seq snd_recover; /* for use in NewReno Fast Recovery */ char t_oobflags; /* have some */ char t_iobc; /* input character */ uint8_t t_nic_ktls_xmit:1, /* active nic ktls xmit sessions */ t_nic_ktls_xmit_dis:1, /* disabled nic xmit ktls? 
*/ t_nic_ktls_spare:6; /* spare nic ktls */ int t_rxtcur; /* current retransmit value (ticks) */ int t_rxtshift; /* log(2) of rexmt exp. backoff */ u_int t_rtttime; /* RTT measurement start time */ tcp_seq t_rtseq; /* sequence number being timed */ u_int t_starttime; /* time connection was established */ u_int t_fbyte_in; /* ticks time first byte queued in */ u_int t_fbyte_out; /* ticks time first byte queued out */ u_int t_pmtud_saved_maxseg; /* pre-blackhole MSS */ int t_blackhole_enter; /* when to enter blackhole detection */ int t_blackhole_exit; /* when to exit blackhole detection */ u_int t_rttmin; /* minimum rtt allowed */ int t_softerror; /* possible error not yet reported */ uint32_t max_sndwnd; /* largest window peer has offered */ uint32_t snd_cwnd_prev; /* cwnd prior to retransmit */ uint32_t snd_ssthresh_prev; /* ssthresh prior to retransmit */ tcp_seq snd_recover_prev; /* snd_recover prior to retransmit */ int t_sndzerowin; /* zero-window updates sent */ int snd_numholes; /* number of holes seen by sender */ u_int t_badrxtwin; /* window for retransmit recovery */ TAILQ_HEAD(sackhole_head, sackhole) snd_holes; /* SACK scoreboard (sorted) */ tcp_seq snd_fack; /* last seq number(+1) sack'd by rcv'r*/ struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */ struct sackhint sackhint; /* SACK scoreboard hint */ int t_rttlow; /* smallest observerved RTT */ int rfbuf_cnt; /* recv buffer autoscaling byte count */ struct toedev *tod; /* toedev handling this connection */ int t_sndrexmitpack; /* retransmit packets sent */ int t_rcvoopack; /* out-of-order packets received */ void *t_toe; /* TOE pcb pointer */ struct cc_algo *t_cc; /* congestion control algorithm */ struct cc_var t_ccv; /* congestion control specific vars */ int t_bytes_acked; /* # bytes acked during current RTT */ u_int t_maxunacktime; u_int t_keepinit; /* time to establish connection */ u_int t_keepidle; /* time before keepalive probes begin */ u_int t_keepintvl; /* interval between keepalives */ u_int t_keepcnt; /* number of keepalives before close */ int t_dupacks; /* consecutive dup acks recd */ int t_lognum; /* Number of log entries */ int t_loglimit; /* Maximum number of log entries */ uint32_t t_rcep; /* Number of received CE marked pkts */ uint32_t t_scep; /* Synced number of delivered CE pkts */ int64_t t_pacing_rate; /* bytes / sec, -1 => unlimited */ struct tcp_log_stailq t_logs; /* Log buffer */ struct tcp_log_id_node *t_lin; struct tcp_log_id_bucket *t_lib; const char *t_output_caller; /* Function that called tcp_output */ struct statsblob *t_stats; /* Per-connection stats */ /* Should these be a pointer to the arrays or an array? 
*/ uint32_t t_logsn; /* Log "serial number" */ uint32_t gput_ts; /* Time goodput measurement started */ tcp_seq gput_seq; /* Outbound measurement seq */ tcp_seq gput_ack; /* Inbound measurement ack */ int32_t t_stats_gput_prev; /* XXXLAS: Prev gput measurement */ uint32_t t_maxpeakrate; /* max peak rate set by user, bytes/s */ uint32_t t_sndtlppack; /* tail loss probe packets sent */ uint64_t t_sndtlpbyte; /* total tail loss probe bytes sent */ uint64_t t_sndbytes; /* total bytes sent */ uint64_t t_snd_rxt_bytes; /* total bytes retransmitted */ uint32_t t_dsack_bytes; /* dsack bytes received */ uint32_t t_dsack_tlp_bytes; /* dsack bytes received for TLPs sent */ uint32_t t_dsack_pack; /* dsack packets we have eceived */ uint8_t t_tmr_granularity; /* Granularity of all timers srtt etc */ uint8_t t_rttupdated; /* number of times rtt sampled */ /* TCP Fast Open */ uint8_t t_tfo_client_cookie_len; /* TFO client cookie length */ uint32_t t_end_info_status; /* Status flag of end info */ unsigned int *t_tfo_pending; /* TFO server pending counter */ union { uint8_t client[TCP_FASTOPEN_MAX_COOKIE_LEN]; uint64_t server; } t_tfo_cookie; /* TCP Fast Open cookie to send */ union { uint8_t t_end_info_bytes[TCP_END_BYTE_INFO]; uint64_t t_end_info; }; struct osd t_osd; /* storage for Khelp module data */ uint8_t _t_logpoint; /* Used when a BB log points is enabled */ /* * Keep all #ifdef'ed components at the end of the structure! * This is important to minimize problems when compiling modules * using this structure from within the modules' directory. */ #ifdef TCP_REQUEST_TRK /* Response tracking addons. */ uint8_t t_tcpreq_req; /* Request count */ uint8_t t_tcpreq_open; /* Number of open range requests */ uint8_t t_tcpreq_closed; /* Number of closed range requests */ uint32_t tcp_hybrid_start; /* Num of times we started hybrid pacing */ uint32_t tcp_hybrid_stop; /* Num of times we stopped hybrid pacing */ uint32_t tcp_hybrid_error; /* Num of times we failed to start hybrid pacing */ struct tcp_sendfile_track t_tcpreq_info[MAX_TCP_TRK_REQ]; #endif #ifdef TCP_ACCOUNTING uint64_t tcp_cnt_counters[TCP_NUM_CNT_COUNTERS]; uint64_t tcp_proc_time[TCP_NUM_CNT_COUNTERS]; #endif #ifdef TCPPCAP struct mbufq t_inpkts; /* List of saved input packets. */ struct mbufq t_outpkts; /* List of saved output packets. */ #endif }; #endif /* _KERNEL || _WANT_TCPCB */ #ifdef _KERNEL struct tcptemp { u_char tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */ struct tcphdr tt_t; }; /* SACK scoreboard update status */ typedef enum { SACK_NOCHANGE = 0, SACK_CHANGE, SACK_NEWLOSS } sackstatus_t; /* Enable TCP/UDP tunneling port */ #define TCP_TUNNELING_PORT_MIN 0 #define TCP_TUNNELING_PORT_MAX 65535 #define TCP_TUNNELING_PORT_DEFAULT 0 /* Enable TCP/UDP tunneling port */ #define TCP_TUNNELING_OVERHEAD_MIN sizeof(struct udphdr) #define TCP_TUNNELING_OVERHEAD_MAX 1024 #define TCP_TUNNELING_OVERHEAD_DEFAULT TCP_TUNNELING_OVERHEAD_MIN /* Minimum map entries limit value, if set */ #define TCP_MIN_MAP_ENTRIES_LIMIT 128 /* * TODO: We yet need to brave plowing in * to tcp_input() and the pru_usrreq() block. * Right now these go to the old standards which * are somewhat ok, but in the long term may * need to be changed. If we do tackle tcp_input() * then we need to get rid of the tcp_do_segment() * function below. 
*/ /* Flags for tcp functions */ #define TCP_FUNC_BEING_REMOVED 0x01 /* Can no longer be referenced */ #define TCP_FUNC_OUTPUT_CANDROP 0x02 /* tfb_tcp_output may ask tcp_drop */ /** * If defining the optional tcp_timers, in the * tfb_tcp_timer_stop call you must use the * callout_async_drain() function with the * tcp_timer_discard callback. You should check * the return of callout_async_drain() and if 0 * increment tt_draincnt. Since the timer sub-system * does not know your callbacks you must provide a * stop_all function that loops through and calls * tcp_timer_stop() with each of your defined timers. * * Adding a tfb_tcp_handoff_ok function allows the socket * option to change stacks to query you even if the * connection is in a later stage. You return 0 to * say you can take over and run your stack, you return * non-zero (an error number) to say no you can't. * If the function is undefined you can only change * in the early states (before connect or listen). * * tfb_tcp_fb_init is used to allow the new stack to * setup its control block. Among the things it must * do is: * a) Make sure that the inp_flags2 is setup correctly * for LRO. There are two flags that the previous * stack may have set INP_MBUF_ACKCMP and * INP_SUPPORTS_MBUFQ. If the new stack does not * support these it *should* clear the flags. * b) Make sure that the timers are in the proper * granularity that the stack wants. The stack * should check the t_tmr_granularity field. Currently * there are two values that it may hold * TCP_TMR_GRANULARITY_TICKS and TCP_TMR_GRANULARITY_USEC. * Use the functions tcp_timer_convert(tp, granularity); * to move the timers to the correct format for your stack. * * The new stack may also optionally query the tfb_chg_query * function if the old stack has one. The new stack may ask * for one of three entries and can also state to the old * stack its support for the INP_MBUF_ACKCMP and * INP_SUPPORTS_MBUFQ. This is important since if there are * queued ack's without that statement the old stack will * be forced to discard the queued acks. The requests that * can be made for information by the new stacks are: * * Note also that the tfb_tcp_fb_init() when called can * determine if a query is needed by looking at the * value passed in the ptr. The ptr is designed to be * set in with any allocated memory, but the address * of the condtion (ptr == &tp->t_fb_ptr) will be * true if this is not a stack switch but the initial * setup of a tcb (which means no query would be needed). * If, however, the value is not t_fb_ptr, then the caller * is in the middle of a stack switch and is the new stack. * A query would be appropriate (if the new stack support * the query mechanism). * * TCP_QUERY_SENDMAP - Query of outstanding data. * TCP_QUERY_TIMERS_UP - Query about running timers. * TCP_SUPPORTED_LRO - Declaration in req_param of * the inp_flags2 supported by * the new stack. * TCP_QUERY_RACK_TIMES - Enquire about various timestamps * and states the old stack may be in. * * tfb_tcp_fb_fini is changed to add a flag to tell * the old stack if the tcb is being destroyed or * not. A one in the flag means the TCB is being * destroyed, a zero indicates its transitioning to * another stack (via socket option). The * tfb_tcp_fb_fini() function itself should not change timers * or inp_flags2 (the tfb_tcp_fb_init() must do that). 
However * if the old stack supports the LRO mbuf queuing, and the new * stack does not communicate via chg messages that it too does, * it must assume it does not and free any queued mbufs. * */ struct tcp_function_block { char tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX]; int (*tfb_tcp_output)(struct tcpcb *); void (*tfb_tcp_do_segment)(struct tcpcb *, struct mbuf *, struct tcphdr *, int, int, uint8_t); int (*tfb_do_segment_nounlock)(struct tcpcb *, struct mbuf *, struct tcphdr *, int, int, uint8_t, int, struct timeval *); int (*tfb_do_queued_segments)(struct tcpcb *, int); int (*tfb_tcp_ctloutput)(struct tcpcb *, struct sockopt *); /* Optional memory allocation/free routine */ int (*tfb_tcp_fb_init)(struct tcpcb *, void **); void (*tfb_tcp_fb_fini)(struct tcpcb *, int); /* Optional timers, must define all if you define one */ int (*tfb_tcp_timer_stop_all)(struct tcpcb *); void (*tfb_tcp_rexmit_tmr)(struct tcpcb *); int (*tfb_tcp_handoff_ok)(struct tcpcb *); void (*tfb_tcp_mtu_chg)(struct tcpcb *tp); int (*tfb_pru_options)(struct tcpcb *, int); void (*tfb_hwtls_change)(struct tcpcb *, int); int (*tfb_chg_query)(struct tcpcb *, struct tcp_query_resp *); void (*tfb_switch_failed)(struct tcpcb *); bool (*tfb_early_wake_check)(struct tcpcb *); int (*tfb_compute_pipe)(struct tcpcb *tp); volatile uint32_t tfb_refcnt; uint32_t tfb_flags; uint8_t tfb_id; }; struct tcp_function { TAILQ_ENTRY(tcp_function) tf_next; char tf_name[TCP_FUNCTION_NAME_LEN_MAX]; struct tcp_function_block *tf_fb; }; TAILQ_HEAD(tcp_funchead, tcp_function); struct tcpcb * tcp_drop(struct tcpcb *, int); #ifdef _NETINET_IN_PCB_H_ #define intotcpcb(inp) __containerof((inp), struct tcpcb, t_inpcb) #define sototcpcb(so) intotcpcb(sotoinpcb(so)) #define tptoinpcb(tp) (&(tp)->t_inpcb) #define tptosocket(tp) (tp)->t_inpcb.inp_socket /* * tcp_output() * Handles tcp_drop request from advanced stacks and reports that inpcb is * gone with negative return code. * Drop in replacement for the default stack. */ static inline int tcp_output(struct tcpcb *tp) { struct inpcb *inp = tptoinpcb(tp); int rv; INP_WLOCK_ASSERT(inp); rv = tp->t_fb->tfb_tcp_output(tp); if (rv < 0) { KASSERT(tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP, ("TCP stack %s requested tcp_drop(%p)", tp->t_fb->tfb_tcp_block_name, tp)); tp = tcp_drop(tp, -rv); if (tp) INP_WUNLOCK(inp); } return (rv); } /* * tcp_output_unlock() * Always returns unlocked, handles drop request from advanced stacks. * Always returns positive error code. */ static inline int tcp_output_unlock(struct tcpcb *tp) { struct inpcb *inp = tptoinpcb(tp); int rv; INP_WLOCK_ASSERT(inp); rv = tp->t_fb->tfb_tcp_output(tp); if (rv < 0) { KASSERT(tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP, ("TCP stack %s requested tcp_drop(%p)", tp->t_fb->tfb_tcp_block_name, tp)); rv = -rv; tp = tcp_drop(tp, rv); if (tp) INP_WUNLOCK(inp); } else INP_WUNLOCK(inp); return (rv); } /* * tcp_output_nodrop() * Always returns locked. It is caller's responsibility to run tcp_drop()! * Useful in syscall implementations, when we want to perform some logging * and/or tracing with tcpcb before calling tcp_drop(). To be used with * tcp_unlock_or_drop() later. * * XXXGL: maybe don't allow stacks to return a drop request at certain * TCP states? Why would it do in connect(2)? In recv(2)? 
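 *
 * Hypothetical usage sketch (illustrative only; it assumes the caller
 * already holds the inpcb write lock as required above and keeps the
 * usual "tp"/"error" locals):
 *
 *	error = tcp_output_nodrop(tp);
 *	... logging/tracing while tp remains locked ...
 *	error = tcp_unlock_or_drop(tp, error);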
*/ static inline int tcp_output_nodrop(struct tcpcb *tp) { int rv; INP_WLOCK_ASSERT(tptoinpcb(tp)); rv = tp->t_fb->tfb_tcp_output(tp); KASSERT(rv >= 0 || tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP, ("TCP stack %s requested tcp_drop(%p)", tp->t_fb->tfb_tcp_block_name, tp)); return (rv); } /* * tcp_unlock_or_drop() * Handle return code from tfb_tcp_output() after we have logged/traced, * to be used with tcp_output_nodrop(). */ static inline int tcp_unlock_or_drop(struct tcpcb *tp, int tcp_output_retval) { struct inpcb *inp = tptoinpcb(tp); INP_WLOCK_ASSERT(inp); if (tcp_output_retval < 0) { tcp_output_retval = -tcp_output_retval; if (tcp_drop(tp, tcp_output_retval) != NULL) INP_WUNLOCK(inp); } else INP_WUNLOCK(inp); return (tcp_output_retval); } #endif /* _NETINET_IN_PCB_H_ */ static int inline tcp_packets_this_ack(struct tcpcb *tp, tcp_seq ack) { return ((ack - tp->snd_una) / tp->t_maxseg + ((((ack - tp->snd_una) % tp->t_maxseg) != 0) ? 1 : 0)); } #endif /* _KERNEL */ /* * Flags and utility macros for the t_flags field. */ #define TF_ACKNOW 0x00000001 /* ack peer immediately */ #define TF_DELACK 0x00000002 /* ack, but try to delay it */ #define TF_NODELAY 0x00000004 /* don't delay packets to coalesce */ #define TF_NOOPT 0x00000008 /* don't use tcp options */ #define TF_SENTFIN 0x00000010 /* have sent FIN */ #define TF_REQ_SCALE 0x00000020 /* have/will request window scaling */ #define TF_RCVD_SCALE 0x00000040 /* other side has requested scaling */ #define TF_REQ_TSTMP 0x00000080 /* have/will request timestamps */ #define TF_RCVD_TSTMP 0x00000100 /* a timestamp was received in SYN */ #define TF_SACK_PERMIT 0x00000200 /* other side said I could SACK */ #define TF_NEEDSYN 0x00000400 /* send SYN (implicit state) */ #define TF_NEEDFIN 0x00000800 /* send FIN (implicit state) */ #define TF_NOPUSH 0x00001000 /* don't push */ #define TF_PREVVALID 0x00002000 /* saved values for bad rxmit valid * Note: accessing and restoring from * these may only be done in the 1st * RTO recovery round (t_rxtshift == 1) */ #define TF_WAKESOR 0x00004000 /* wake up receive socket */ #define TF_GPUTINPROG 0x00008000 /* Goodput measurement in progress */ #define TF_MORETOCOME 0x00010000 /* More data to be appended to sock */ #define TF_SONOTCONN 0x00020000 /* needs soisconnected() on ESTAB */ #define TF_LASTIDLE 0x00040000 /* connection was previously idle */ #define TF_RXWIN0SENT 0x00080000 /* sent a receiver win 0 in response */ #define TF_FASTRECOVERY 0x00100000 /* in NewReno Fast Recovery */ #define TF_WASFRECOVERY 0x00200000 /* was in NewReno Fast Recovery */ #define TF_SIGNATURE 0x00400000 /* require MD5 digests (RFC2385) */ #define TF_FORCEDATA 0x00800000 /* force out a byte */ #define TF_TSO 0x01000000 /* TSO enabled on this connection */ #define TF_TOE 0x02000000 /* this connection is offloaded */ #define TF_CLOSED 0x04000000 /* close(2) called on socket */ #define TF_UNUSED1 0x08000000 /* unused */ #define TF_LRD 0x10000000 /* Lost Retransmission Detection */ #define TF_CONGRECOVERY 0x20000000 /* congestion recovery mode */ #define TF_WASCRECOVERY 0x40000000 /* was in congestion recovery */ #define TF_FASTOPEN 0x80000000 /* TCP Fast Open indication */ #define IN_FASTRECOVERY(t_flags) (t_flags & TF_FASTRECOVERY) #define ENTER_FASTRECOVERY(t_flags) t_flags |= TF_FASTRECOVERY #define EXIT_FASTRECOVERY(t_flags) t_flags &= ~TF_FASTRECOVERY #define IN_CONGRECOVERY(t_flags) (t_flags & TF_CONGRECOVERY) #define ENTER_CONGRECOVERY(t_flags) t_flags |= TF_CONGRECOVERY #define EXIT_CONGRECOVERY(t_flags) t_flags &= 
~TF_CONGRECOVERY #define IN_RECOVERY(t_flags) (t_flags & (TF_CONGRECOVERY | TF_FASTRECOVERY)) #define ENTER_RECOVERY(t_flags) t_flags |= (TF_CONGRECOVERY | TF_FASTRECOVERY) #define EXIT_RECOVERY(t_flags) t_flags &= ~(TF_CONGRECOVERY | TF_FASTRECOVERY) #if defined(_KERNEL) && !defined(TCP_RFC7413) #define IS_FASTOPEN(t_flags) (false) #else #define IS_FASTOPEN(t_flags) (t_flags & TF_FASTOPEN) #endif #define BYTES_THIS_ACK(tp, th) (th->th_ack - tp->snd_una) /* * Flags for the t_oobflags field. */ #define TCPOOB_HAVEDATA 0x01 #define TCPOOB_HADDATA 0x02 /* * Flags for the extended TCP flags field, t_flags2 */ #define TF2_PLPMTU_BLACKHOLE 0x00000001 /* Possible PLPMTUD Black Hole. */ #define TF2_PLPMTU_PMTUD 0x00000002 /* Allowed to attempt PLPMTUD. */ #define TF2_PLPMTU_MAXSEGSNT 0x00000004 /* Last seg sent was full seg. */ #define TF2_LOG_AUTO 0x00000008 /* Session is auto-logging. */ #define TF2_DROP_AF_DATA 0x00000010 /* Drop after all data ack'd */ #define TF2_ECN_PERMIT 0x00000020 /* connection ECN-ready */ #define TF2_ECN_SND_CWR 0x00000040 /* ECN CWR in queue */ #define TF2_ECN_SND_ECE 0x00000080 /* ECN ECE in queue */ #define TF2_ACE_PERMIT 0x00000100 /* Accurate ECN mode */ #define TF2_HPTS_CPU_SET 0x00000200 /* t_hpts_cpu is not random */ #define TF2_FBYTES_COMPLETE 0x00000400 /* We have first bytes in and out */ #define TF2_ECN_USE_ECT1 0x00000800 /* Use ECT(1) marking on session */ #define TF2_TCP_ACCOUNTING 0x00001000 /* Do TCP accounting */ #define TF2_HPTS_CALLS 0x00002000 /* tcp_output() called via HPTS */ #define TF2_MBUF_L_ACKS 0x00004000 /* large mbufs for ack compression */ #define TF2_MBUF_ACKCMP 0x00008000 /* mbuf ack compression ok */ #define TF2_SUPPORTS_MBUFQ 0x00010000 /* Supports the mbuf queue method */ #define TF2_MBUF_QUEUE_READY 0x00020000 /* Inputs can be queued */ #define TF2_DONT_SACK_QUEUE 0x00040000 /* Don't wake on sack */ #define TF2_CANNOT_DO_ECN 0x00080000 /* The stack does not do ECN */ /* * Structure to hold TCP options that are only used during segment * processing (in tcp_input), but not held in the tcpcb. * It's basically used to reduce the number of parameters * to tcp_dooptions and tcp_addoptions. * The binary order of the to_flags is relevant for packing of the * options in tcp_addoptions. */ struct tcpopt { u_int32_t to_flags; /* which options are present */ #define TOF_MSS 0x0001 /* maximum segment size */ #define TOF_SCALE 0x0002 /* window scaling */ #define TOF_SACKPERM 0x0004 /* SACK permitted */ #define TOF_TS 0x0010 /* timestamp */ #define TOF_SIGNATURE 0x0040 /* TCP-MD5 signature option (RFC2385) */ #define TOF_SACK 0x0080 /* Peer sent SACK option */ #define TOF_FASTOPEN 0x0100 /* TCP Fast Open (TFO) cookie */ #define TOF_MAXOPT 0x0200 u_int32_t to_tsval; /* new timestamp */ u_int32_t to_tsecr; /* reflected timestamp */ u_char *to_sacks; /* pointer to the first SACK blocks */ u_char *to_signature; /* pointer to the TCP-MD5 signature */ u_int8_t *to_tfo_cookie; /* pointer to the TFO cookie */ u_int16_t to_mss; /* maximum segment size */ u_int8_t to_wscale; /* window scaling */ u_int8_t to_nsacks; /* number of SACK blocks */ u_int8_t to_tfo_len; /* TFO cookie length */ u_int32_t to_spare; /* UTO */ }; /* * Flags for tcp_dooptions. 
*/ #define TO_SYN 0x01 /* parse SYN-only options */ struct hc_metrics_lite { /* must stay in sync with hc_metrics */ uint32_t rmx_mtu; /* MTU for this path */ uint32_t rmx_ssthresh; /* outbound gateway buffer limit */ uint32_t rmx_rtt; /* estimated round trip time */ uint32_t rmx_rttvar; /* estimated rtt variance */ uint32_t rmx_cwnd; /* congestion window */ uint32_t rmx_sendpipe; /* outbound delay-bandwidth product */ uint32_t rmx_recvpipe; /* inbound delay-bandwidth product */ }; /* * Used by tcp_maxmtu() to communicate interface specific features * and limits at the time of connection setup. */ struct tcp_ifcap { int ifcap; u_int tsomax; u_int tsomaxsegcount; u_int tsomaxsegsize; }; #ifndef _NETINET_IN_PCB_H_ struct in_conninfo; #endif /* _NETINET_IN_PCB_H_ */ /* * The smoothed round-trip time and estimated variance * are stored as fixed point numbers scaled by the values below. * For convenience, these scales are also used in smoothing the average * (smoothed = (1/scale)sample + ((scale-1)/scale)smoothed). * With these scales, srtt has 3 bits to the right of the binary point, * and thus an "ALPHA" of 0.875. rttvar has 2 bits to the right of the * binary point, and is smoothed with an ALPHA of 0.75. */ #define TCP_RTT_SCALE 32 /* multiplier for srtt; 3 bits frac. */ #define TCP_RTT_SHIFT 5 /* shift for srtt; 3 bits frac. */ #define TCP_RTTVAR_SCALE 16 /* multiplier for rttvar; 2 bits */ #define TCP_RTTVAR_SHIFT 4 /* shift for rttvar; 2 bits */ #define TCP_DELTA_SHIFT 2 /* see tcp_input.c */ /* * The initial retransmission should happen at rtt + 4 * rttvar. * Because of the way we do the smoothing, srtt and rttvar * will each average +1/2 tick of bias. When we compute * the retransmit timer, we want 1/2 tick of rounding and * 1 extra tick because of +-1/2 tick uncertainty in the * firing of the timer. The bias will give us exactly the * 1.5 tick we need. But, because the bias is * statistical, we have to test that we don't drop below * the minimum feasible timer (which is 2 ticks). * This version of the macro adapted from a paper by Lawrence * Brakmo and Larry Peterson which outlines a problem caused * by insufficient precision in the original implementation, * which results in inappropriately large RTO values for very * fast networks. */ #define TCP_REXMTVAL(tp) \ max((tp)->t_rttmin, (((tp)->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)) \ + (tp)->t_rttvar) >> TCP_DELTA_SHIFT) /* * TCP statistics. * Many of these should be kept per connection, * but that's inconvenient at the moment. */ struct tcpstat { uint64_t tcps_connattempt; /* connections initiated */ uint64_t tcps_accepts; /* connections accepted */ uint64_t tcps_connects; /* connections established */ uint64_t tcps_drops; /* connections dropped */ uint64_t tcps_conndrops; /* embryonic connections dropped */ uint64_t tcps_minmssdrops; /* average minmss too low drops */ uint64_t tcps_closed; /* conn. closed (includes drops) */ uint64_t tcps_segstimed; /* segs where we tried to get rtt */ uint64_t tcps_rttupdated; /* times we succeeded */ uint64_t tcps_delack; /* delayed acks sent */ uint64_t tcps_timeoutdrop; /* conn. 
dropped in rxmt timeout */ uint64_t tcps_rexmttimeo; /* retransmit timeouts */ uint64_t tcps_persisttimeo; /* persist timeouts */ uint64_t tcps_keeptimeo; /* keepalive timeouts */ uint64_t tcps_keepprobe; /* keepalive probes sent */ uint64_t tcps_keepdrops; /* connections dropped in keepalive */ uint64_t tcps_progdrops; /* drops due to no progress */ uint64_t tcps_sndtotal; /* total packets sent */ uint64_t tcps_sndpack; /* data packets sent */ uint64_t tcps_sndbyte; /* data bytes sent */ uint64_t tcps_sndrexmitpack; /* data packets retransmitted */ uint64_t tcps_sndrexmitbyte; /* data bytes retransmitted */ uint64_t tcps_sndrexmitbad; /* unnecessary packet retransmissions */ uint64_t tcps_sndacks; /* ack-only packets sent */ uint64_t tcps_sndprobe; /* window probes sent */ uint64_t tcps_sndurg; /* packets sent with URG only */ uint64_t tcps_sndwinup; /* window update-only packets sent */ uint64_t tcps_sndctrl; /* control (SYN|FIN|RST) packets sent */ uint64_t tcps_rcvtotal; /* total packets received */ uint64_t tcps_rcvpack; /* packets received in sequence */ uint64_t tcps_rcvbyte; /* bytes received in sequence */ uint64_t tcps_rcvbadsum; /* packets received with ccksum errs */ uint64_t tcps_rcvbadoff; /* packets received with bad offset */ uint64_t tcps_rcvreassfull; /* packets dropped for no reass space */ uint64_t tcps_rcvshort; /* packets received too short */ uint64_t tcps_rcvduppack; /* duplicate-only packets received */ uint64_t tcps_rcvdupbyte; /* duplicate-only bytes received */ uint64_t tcps_rcvpartduppack; /* packets with some duplicate data */ uint64_t tcps_rcvpartdupbyte; /* dup. bytes in part-dup. packets */ uint64_t tcps_rcvoopack; /* out-of-order packets received */ uint64_t tcps_rcvoobyte; /* out-of-order bytes received */ uint64_t tcps_rcvpackafterwin; /* packets with data after window */ uint64_t tcps_rcvbyteafterwin; /* bytes rcvd after window */ uint64_t tcps_rcvafterclose; /* packets rcvd after "close" */ uint64_t tcps_rcvwinprobe; /* rcvd window probe packets */ uint64_t tcps_rcvdupack; /* rcvd duplicate acks */ uint64_t tcps_rcvacktoomuch; /* rcvd acks for unsent data */ uint64_t tcps_rcvackpack; /* rcvd ack packets */ uint64_t tcps_rcvackbyte; /* bytes acked by rcvd acks */ uint64_t tcps_rcvwinupd; /* rcvd window update packets */ uint64_t tcps_pawsdrop; /* segments dropped due to PAWS */ uint64_t tcps_predack; /* times hdr predict ok for acks */ uint64_t tcps_preddat; /* times hdr predict ok for data pkts */ uint64_t tcps_pcbcachemiss; uint64_t tcps_cachedrtt; /* times cached RTT in route updated */ uint64_t tcps_cachedrttvar; /* times cached rttvar updated */ uint64_t tcps_cachedssthresh; /* times cached ssthresh updated */ uint64_t tcps_usedrtt; /* times RTT initialized from route */ uint64_t tcps_usedrttvar; /* times RTTVAR initialized from rt */ uint64_t tcps_usedssthresh; /* times ssthresh initialized from rt*/ uint64_t tcps_persistdrop; /* timeout in persist state */ uint64_t tcps_badsyn; /* bogus SYN, e.g. 
premature ACK */ uint64_t tcps_mturesent; /* resends due to MTU discovery */ uint64_t tcps_listendrop; /* listen queue overflows */ uint64_t tcps_badrst; /* ignored RSTs in the window */ uint64_t tcps_sc_added; /* entry added to syncache */ uint64_t tcps_sc_retransmitted; /* syncache entry was retransmitted */ uint64_t tcps_sc_dupsyn; /* duplicate SYN packet */ uint64_t tcps_sc_dropped; /* could not reply to packet */ uint64_t tcps_sc_completed; /* successful extraction of entry */ uint64_t tcps_sc_bucketoverflow;/* syncache per-bucket limit hit */ uint64_t tcps_sc_cacheoverflow; /* syncache cache limit hit */ uint64_t tcps_sc_reset; /* RST removed entry from syncache */ uint64_t tcps_sc_stale; /* timed out or listen socket gone */ uint64_t tcps_sc_aborted; /* syncache entry aborted */ uint64_t tcps_sc_badack; /* removed due to bad ACK */ uint64_t tcps_sc_unreach; /* ICMP unreachable received */ uint64_t tcps_sc_zonefail; /* zalloc() failed */ uint64_t tcps_sc_sendcookie; /* SYN cookie sent */ uint64_t tcps_sc_recvcookie; /* SYN cookie received */ uint64_t tcps_hc_added; /* entry added to hostcache */ uint64_t tcps_hc_bucketoverflow;/* hostcache per bucket limit hit */ uint64_t tcps_finwait2_drops; /* Drop FIN_WAIT_2 connection after time limit */ /* SACK related stats */ uint64_t tcps_sack_recovery_episode; /* SACK recovery episodes */ uint64_t tcps_sack_rexmits; /* SACK rexmit segments */ uint64_t tcps_sack_rexmit_bytes; /* SACK rexmit bytes */ uint64_t tcps_sack_rcv_blocks; /* SACK blocks (options) received */ uint64_t tcps_sack_send_blocks; /* SACK blocks (options) sent */ uint64_t tcps_sack_lostrexmt; /* SACK lost retransmission recovered */ uint64_t tcps_sack_sboverflow; /* times scoreboard overflowed */ /* ECN related stats */ uint64_t tcps_ecn_rcvce; /* ECN Congestion Experienced */ uint64_t tcps_ecn_rcvect0; /* ECN Capable Transport */ uint64_t tcps_ecn_rcvect1; /* ECN Capable Transport */ uint64_t tcps_ecn_shs; /* ECN successful handshakes */ uint64_t tcps_ecn_rcwnd; /* # times ECN reduced the cwnd */ /* TCP_SIGNATURE related stats */ uint64_t tcps_sig_rcvgoodsig; /* Total matching signature received */ uint64_t tcps_sig_rcvbadsig; /* Total bad signature received */ uint64_t tcps_sig_err_buildsig; /* Failed to make signature */ uint64_t tcps_sig_err_sigopt; /* No signature expected by socket */ uint64_t tcps_sig_err_nosigopt; /* No signature provided by segment */ /* Path MTU Discovery Black Hole Detection related stats */ uint64_t tcps_pmtud_blackhole_activated; /* Black Hole Count */ uint64_t tcps_pmtud_blackhole_activated_min_mss; /* BH at min MSS Count */ uint64_t tcps_pmtud_blackhole_failed; /* Black Hole Failure Count */ uint64_t tcps_tunneled_pkts; /* Packets encap's in UDP received */ uint64_t tcps_tunneled_errs; /* Packets that had errors that were UDP encaped */ /* Dsack related stats */ uint64_t tcps_dsack_count; /* Number of ACKs arriving with DSACKs */ uint64_t tcps_dsack_bytes; /* Number of bytes DSACK'ed no TLP */ uint64_t tcps_dsack_tlp_bytes; /* Number of bytes DSACK'ed due to TLPs */ /* TCPS_TIME_WAIT usage stats */ uint64_t tcps_tw_recycles; /* Times time-wait was recycled. */ uint64_t tcps_tw_resets; /* Times time-wait sent a reset. */ uint64_t tcps_tw_responds; /* Times time-wait sent a valid ack. 
*/ /* Accurate ECN Handshake stats */ uint64_t tcps_ace_nect; /* ACE SYN packet with Non-ECT */ uint64_t tcps_ace_ect1; /* ACE SYN packet with ECT1 */ uint64_t tcps_ace_ect0; /* ACE SYN packet with ECT0 */ uint64_t tcps_ace_ce; /* ACE SYN packet with CE */ /* ECN related stats */ uint64_t tcps_ecn_sndect0; /* ECN Capable Transport */ uint64_t tcps_ecn_sndect1; /* ECN Capable Transport */ /* * BBR and Rack implement TLPs; these values count TLP bytes in * two categories, bytes that were retransmitted and bytes that * were newly transmitted. Both types can serve as TLPs but they * are accounted differently. */ uint64_t tcps_tlpresends; /* number of tlp resends */ uint64_t tcps_tlpresend_bytes; /* number of bytes resent by tlp */ uint64_t _pad[4]; /* 4 TBD placeholder for STABLE */ }; #define tcps_rcvmemdrop tcps_rcvreassfull /* compat */ #ifdef _KERNEL #define TI_UNLOCKED 1 #define TI_RLOCKED 2 #include VNET_PCPUSTAT_DECLARE(struct tcpstat, tcpstat); /* tcp statistics */ /* * In-kernel consumers can use these accessor macros directly to update * stats. */ #define TCPSTAT_ADD(name, val) \ VNET_PCPUSTAT_ADD(struct tcpstat, tcpstat, name, (val)) #define TCPSTAT_INC(name) TCPSTAT_ADD(name, 1) /* * Kernel module consumers must use this accessor macro. */ void kmod_tcpstat_add(int statnum, int val); #define KMOD_TCPSTAT_ADD(name, val) \ kmod_tcpstat_add(offsetof(struct tcpstat, name) / sizeof(uint64_t), val) #define KMOD_TCPSTAT_INC(name) KMOD_TCPSTAT_ADD(name, 1) /* * Running TCP connection count by state. */ VNET_DECLARE(counter_u64_t, tcps_states[TCP_NSTATES]); #define V_tcps_states VNET(tcps_states) #define TCPSTATES_INC(state) counter_u64_add(V_tcps_states[state], 1) #define TCPSTATES_DEC(state) counter_u64_add(V_tcps_states[state], -1) /* * TCP specific helper hook point identifiers. */ #define HHOOK_TCP_EST_IN 0 #define HHOOK_TCP_EST_OUT 1 #define HHOOK_TCP_LAST HHOOK_TCP_EST_OUT struct tcp_hhook_data { struct tcpcb *tp; struct tcphdr *th; struct tcpopt *to; uint32_t len; int tso; tcp_seq curack; }; #ifdef TCP_HHOOK void hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, uint32_t len, int tso); #endif #endif /* * TCB structure exported to user-land via sysctl(3). * * Fields prefixed with "xt_" are unique to the export structure, and fields * with "t_" or other prefixes match corresponding fields of 'struct tcpcb'. * * Legend: * (s) - used by userland utilities in src * (p) - used by utilities in ports * (3) - is known to be used by third party software not in ports * (n) - no known usage * * Evil hack: declare only if in_pcb.h and sys/socketvar.h have been * included. Not all of our clients do.
*/ #if defined(_NETINET_IN_PCB_H_) && defined(_SYS_SOCKETVAR_H_) struct xtcpcb { ksize_t xt_len; /* length of this structure */ struct xinpcb xt_inp; char xt_stack[TCP_FUNCTION_NAME_LEN_MAX]; /* (s) */ char xt_logid[TCP_LOG_ID_LEN]; /* (s) */ char xt_cc[TCP_CA_NAME_MAX]; /* (s) */ int64_t spare64[6]; int32_t t_state; /* (s,p) */ uint32_t t_flags; /* (s,p) */ int32_t t_sndzerowin; /* (s) */ int32_t t_sndrexmitpack; /* (s) */ int32_t t_rcvoopack; /* (s) */ int32_t t_rcvtime; /* (s) */ int32_t tt_rexmt; /* (s) */ int32_t tt_persist; /* (s) */ int32_t tt_keep; /* (s) */ int32_t tt_2msl; /* (s) */ int32_t tt_delack; /* (s) */ int32_t t_logstate; /* (3) */ uint32_t t_snd_cwnd; /* (s) */ uint32_t t_snd_ssthresh; /* (s) */ uint32_t t_maxseg; /* (s) */ uint32_t t_rcv_wnd; /* (s) */ uint32_t t_snd_wnd; /* (s) */ uint32_t xt_ecn; /* (s) */ uint32_t t_dsack_bytes; /* (n) */ uint32_t t_dsack_tlp_bytes; /* (n) */ uint32_t t_dsack_pack; /* (n) */ uint16_t xt_encaps_port; /* (s) */ int16_t spare16; int32_t spare32[22]; } __aligned(8); #ifdef _KERNEL void tcp_inptoxtp(const struct inpcb *, struct xtcpcb *); #endif #endif /* * TCP function information (name-to-id mapping, aliases, and refcnt) * exported to user-land via sysctl(3). */ struct tcp_function_info { uint32_t tfi_refcnt; uint8_t tfi_id; char tfi_name[TCP_FUNCTION_NAME_LEN_MAX]; char tfi_alias[TCP_FUNCTION_NAME_LEN_MAX]; }; /* * Identifiers for TCP sysctl nodes */ #define TCPCTL_DO_RFC1323 1 /* use RFC-1323 extensions */ #define TCPCTL_MSSDFLT 3 /* MSS default */ #define TCPCTL_STATS 4 /* statistics */ #define TCPCTL_RTTDFLT 5 /* default RTT estimate */ #define TCPCTL_KEEPIDLE 6 /* keepalive idle timer */ #define TCPCTL_KEEPINTVL 7 /* interval to send keepalives */ #define TCPCTL_SENDSPACE 8 /* send buffer space */ #define TCPCTL_RECVSPACE 9 /* receive buffer space */ #define TCPCTL_KEEPINIT 10 /* timeout for establishing syn */ #define TCPCTL_PCBLIST 11 /* list of all outstanding PCBs */ #define TCPCTL_DELACKTIME 12 /* time before sending delayed ACK */ #define TCPCTL_V6MSSDFLT 13 /* MSS default for IPv6 */ #define TCPCTL_SACK 14 /* Selective Acknowledgement,rfc 2018 */ #define TCPCTL_DROP 15 /* drop tcp connection */ #define TCPCTL_STATES 16 /* connection counts by TCP state */ #ifdef _KERNEL #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet_tcp); SYSCTL_DECL(_net_inet_tcp_sack); MALLOC_DECLARE(M_TCPLOG); #endif VNET_DECLARE(int, tcp_log_in_vain); #define V_tcp_log_in_vain VNET(tcp_log_in_vain) /* * Global TCP tunables shared between different stacks. * Please keep the list sorted. 
*/ VNET_DECLARE(int, drop_synfin); VNET_DECLARE(int, path_mtu_discovery); VNET_DECLARE(int, tcp_abc_l_var); VNET_DECLARE(int, tcp_autorcvbuf_max); VNET_DECLARE(int, tcp_autosndbuf_inc); VNET_DECLARE(int, tcp_autosndbuf_max); VNET_DECLARE(int, tcp_delack_enabled); VNET_DECLARE(int, tcp_do_autorcvbuf); VNET_DECLARE(int, tcp_do_autosndbuf); VNET_DECLARE(int, tcp_do_ecn); VNET_DECLARE(int, tcp_do_lrd); VNET_DECLARE(int, tcp_do_prr); VNET_DECLARE(int, tcp_do_prr_conservative); VNET_DECLARE(int, tcp_do_newcwv); VNET_DECLARE(int, tcp_do_rfc1323); VNET_DECLARE(int, tcp_tolerate_missing_ts); VNET_DECLARE(int, tcp_do_rfc3042); VNET_DECLARE(int, tcp_do_rfc3390); VNET_DECLARE(int, tcp_do_rfc3465); VNET_DECLARE(int, tcp_do_newsack); VNET_DECLARE(int, tcp_do_sack); VNET_DECLARE(int, tcp_do_tso); VNET_DECLARE(int, tcp_ecn_maxretries); VNET_DECLARE(int, tcp_initcwnd_segments); VNET_DECLARE(int, tcp_insecure_rst); VNET_DECLARE(int, tcp_insecure_syn); VNET_DECLARE(uint32_t, tcp_map_entries_limit); VNET_DECLARE(uint32_t, tcp_map_split_limit); VNET_DECLARE(int, tcp_minmss); VNET_DECLARE(int, tcp_mssdflt); #ifdef STATS VNET_DECLARE(int, tcp_perconn_stats_dflt_tpl); VNET_DECLARE(int, tcp_perconn_stats_enable); #endif /* STATS */ VNET_DECLARE(int, tcp_recvspace); VNET_DECLARE(int, tcp_retries); VNET_DECLARE(int, tcp_sack_globalholes); VNET_DECLARE(int, tcp_sack_globalmaxholes); VNET_DECLARE(int, tcp_sack_maxholes); VNET_DECLARE(int, tcp_sc_rst_sock_fail); VNET_DECLARE(int, tcp_sendspace); VNET_DECLARE(int, tcp_udp_tunneling_overhead); VNET_DECLARE(int, tcp_udp_tunneling_port); VNET_DECLARE(struct inpcbinfo, tcbinfo); #define V_tcp_do_lrd VNET(tcp_do_lrd) #define V_tcp_do_prr VNET(tcp_do_prr) #define V_tcp_do_newcwv VNET(tcp_do_newcwv) #define V_drop_synfin VNET(drop_synfin) #define V_path_mtu_discovery VNET(path_mtu_discovery) #define V_tcbinfo VNET(tcbinfo) #define V_tcp_abc_l_var VNET(tcp_abc_l_var) #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) #define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc) #define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max) #define V_tcp_delack_enabled VNET(tcp_delack_enabled) #define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) #define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf) #define V_tcp_do_ecn VNET(tcp_do_ecn) #define V_tcp_do_rfc1323 VNET(tcp_do_rfc1323) #define V_tcp_tolerate_missing_ts VNET(tcp_tolerate_missing_ts) #define V_tcp_ts_offset_per_conn VNET(tcp_ts_offset_per_conn) #define V_tcp_do_rfc3042 VNET(tcp_do_rfc3042) #define V_tcp_do_rfc3390 VNET(tcp_do_rfc3390) #define V_tcp_do_rfc3465 VNET(tcp_do_rfc3465) #define V_tcp_do_newsack VNET(tcp_do_newsack) #define V_tcp_do_sack VNET(tcp_do_sack) #define V_tcp_do_tso VNET(tcp_do_tso) #define V_tcp_ecn_maxretries VNET(tcp_ecn_maxretries) #define V_tcp_initcwnd_segments VNET(tcp_initcwnd_segments) #define V_tcp_insecure_rst VNET(tcp_insecure_rst) #define V_tcp_insecure_syn VNET(tcp_insecure_syn) #define V_tcp_map_entries_limit VNET(tcp_map_entries_limit) #define V_tcp_map_split_limit VNET(tcp_map_split_limit) #define V_tcp_minmss VNET(tcp_minmss) #define V_tcp_mssdflt VNET(tcp_mssdflt) #ifdef STATS #define V_tcp_perconn_stats_dflt_tpl VNET(tcp_perconn_stats_dflt_tpl) #define V_tcp_perconn_stats_enable VNET(tcp_perconn_stats_enable) #endif /* STATS */ #define V_tcp_recvspace VNET(tcp_recvspace) #define V_tcp_retries VNET(tcp_retries) #define V_tcp_sack_globalholes VNET(tcp_sack_globalholes) #define V_tcp_sack_globalmaxholes VNET(tcp_sack_globalmaxholes) #define V_tcp_sack_maxholes VNET(tcp_sack_maxholes) #define 
V_tcp_sc_rst_sock_fail VNET(tcp_sc_rst_sock_fail) #define V_tcp_sendspace VNET(tcp_sendspace) #define V_tcp_udp_tunneling_overhead VNET(tcp_udp_tunneling_overhead) #define V_tcp_udp_tunneling_port VNET(tcp_udp_tunneling_port) #ifdef TCP_HHOOK VNET_DECLARE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST + 1]); #define V_tcp_hhh VNET(tcp_hhh) #endif void tcp_account_for_send(struct tcpcb *, uint32_t, uint8_t, uint8_t, bool); int tcp_addoptions(struct tcpopt *, u_char *); struct tcpcb * tcp_close(struct tcpcb *); void tcp_discardcb(struct tcpcb *); void tcp_twstart(struct tcpcb *); int tcp_ctloutput(struct socket *, struct sockopt *); void tcp_fini(void *); char *tcp_log_addrs(struct in_conninfo *, struct tcphdr *, const void *, const void *); char *tcp_log_vain(struct in_conninfo *, struct tcphdr *, const void *, const void *); int tcp_reass(struct tcpcb *, struct tcphdr *, tcp_seq *, int *, struct mbuf *); void tcp_reass_global_init(void); void tcp_reass_flush(struct tcpcb *); void tcp_dooptions(struct tcpopt *, u_char *, int, int); void tcp_dropwithreset(struct mbuf *, struct tcphdr *, struct tcpcb *, int, int); void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); void tcp_xmit_timer(struct tcpcb *, int); void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); void cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t nsegs, uint16_t type); void cc_conn_init(struct tcpcb *tp); void cc_post_recovery(struct tcpcb *tp, struct tcphdr *th); void cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos); void cc_ecnpkt_handler_flags(struct tcpcb *tp, uint16_t flags, uint8_t iptos); void cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type); #ifdef TCP_HHOOK void hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to); #endif int tcp_input(struct mbuf **, int *, int); int tcp_autorcvbuf(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int); int tcp_input_with_port(struct mbuf **, int *, int, uint16_t); void tcp_do_segment(struct tcpcb *, struct mbuf *, struct tcphdr *, int, int, uint8_t); int register_tcp_functions(struct tcp_function_block *blk, int wait); int register_tcp_functions_as_names(struct tcp_function_block *blk, int wait, const char *names[], int *num_names); int register_tcp_functions_as_name(struct tcp_function_block *blk, const char *name, int wait); int deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce, bool force); struct tcp_function_block *find_and_ref_tcp_functions(struct tcp_function_set *fs); int find_tcp_function_alias(struct tcp_function_block *blk, struct tcp_function_set *fs); uint32_t tcp_get_srtt(struct tcpcb *tp, int granularity); void tcp_switch_back_to_default(struct tcpcb *tp); struct tcp_function_block * find_and_ref_tcp_fb(struct tcp_function_block *fs); int tcp_default_ctloutput(struct tcpcb *tp, struct sockopt *sopt); int tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt); void tcp_log_socket_option(struct tcpcb *tp, uint32_t option_num, uint32_t option_val, int err); extern counter_u64_t tcp_inp_lro_direct_queue; extern counter_u64_t tcp_inp_lro_wokeup_queue; extern counter_u64_t tcp_inp_lro_compressed; extern counter_u64_t tcp_inp_lro_locks_taken; extern counter_u64_t tcp_extra_mbuf; extern counter_u64_t tcp_would_have_but; extern counter_u64_t tcp_comp_total; extern counter_u64_t tcp_uncomp_total; extern counter_u64_t tcp_bad_csums; #ifdef TCP_SAD_DETECTION /* Various SACK attack thresholds */ extern int32_t tcp_force_detection; 
extern int32_t tcp_sad_limit; extern int32_t tcp_sack_to_ack_thresh; extern int32_t tcp_sack_to_move_thresh; extern int32_t tcp_restoral_thresh; extern int32_t tcp_sad_decay_val; extern int32_t tcp_sad_pacing_interval; extern int32_t tcp_sad_low_pps; extern int32_t tcp_map_minimum; extern int32_t tcp_attack_on_turns_on_logging; #endif extern uint32_t tcp_ack_war_time_window; extern uint32_t tcp_ack_war_cnt; uint32_t tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *); uint32_t tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *); void tcp6_use_min_mtu(struct tcpcb *); u_int tcp_maxseg(const struct tcpcb *); u_int tcp_fixed_maxseg(const struct tcpcb *); void tcp_mss_update(struct tcpcb *, int, int, struct hc_metrics_lite *, struct tcp_ifcap *); void tcp_mss(struct tcpcb *, int); int tcp_mssopt(struct in_conninfo *); struct tcpcb * tcp_newtcpcb(struct inpcb *); int tcp_default_output(struct tcpcb *); void tcp_state_change(struct tcpcb *, int); void tcp_respond(struct tcpcb *, void *, struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, uint16_t); bool tcp_twcheck(struct inpcb *, struct tcpopt *, struct tcphdr *, struct mbuf *, int); void tcp_setpersist(struct tcpcb *); void tcp_record_dsack(struct tcpcb *tp, tcp_seq start, tcp_seq end, int tlp); struct tcptemp * tcpip_maketemplate(struct inpcb *); void tcpip_fillheaders(struct inpcb *, uint16_t, void *, void *); void tcp_timer_activate(struct tcpcb *, tt_which, u_int); bool tcp_timer_active(struct tcpcb *, tt_which); void tcp_timer_stop(struct tcpcb *); int inp_to_cpuid(struct inpcb *inp); /* * All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo) */ void tcp_hc_init(void); #ifdef VIMAGE void tcp_hc_destroy(void); #endif void tcp_hc_get(struct in_conninfo *, struct hc_metrics_lite *); uint32_t tcp_hc_getmtu(struct in_conninfo *); void tcp_hc_updatemtu(struct in_conninfo *, uint32_t); void tcp_hc_update(struct in_conninfo *, struct hc_metrics_lite *); void cc_after_idle(struct tcpcb *tp); extern struct protosw tcp_protosw; /* shared for TOE */ extern struct protosw tcp6_protosw; /* shared for TOE */ uint32_t tcp_new_ts_offset(struct in_conninfo *); tcp_seq tcp_new_isn(struct in_conninfo *); sackstatus_t tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq); int tcp_dsack_block_exists(struct tcpcb *); void tcp_update_dsack_list(struct tcpcb *, tcp_seq, tcp_seq); void tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, tcp_seq rcv_lastend); void tcp_clean_dsack_blocks(struct tcpcb *tp); void tcp_clean_sackreport(struct tcpcb *tp); void tcp_sack_adjust(struct tcpcb *tp); struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt); void tcp_do_prr_ack(struct tcpcb *, struct tcphdr *, struct tcpopt *, sackstatus_t); void tcp_lost_retransmission(struct tcpcb *, struct tcphdr *); void tcp_sack_partialack(struct tcpcb *, struct tcphdr *); void tcp_free_sackholes(struct tcpcb *tp); void tcp_sack_lost_retransmission(struct tcpcb *, struct tcphdr *); int tcp_newreno(struct tcpcb *, struct tcphdr *); int tcp_compute_pipe(struct tcpcb *); uint32_t tcp_compute_initwnd(uint32_t); void tcp_sndbuf_autoscale(struct tcpcb *, struct socket *, uint32_t); int tcp_stats_sample_rollthedice(struct tcpcb *tp, void *seed_bytes, size_t seed_len); int tcp_can_enable_pacing(void); void tcp_decrement_paced_conn(void); void tcp_change_time_units(struct tcpcb *, int); void tcp_handle_orphaned_packets(struct tcpcb *); struct mbuf * tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen, int32_t seglimit, int32_t segsize, struct sockbuf *sb, 
bool hw_tls); int tcp_stats_init(void); void tcp_log_end_status(struct tcpcb *tp, uint8_t status); #ifdef TCP_REQUEST_TRK void tcp_req_free_a_slot(struct tcpcb *tp, struct tcp_sendfile_track *ent); struct tcp_sendfile_track * tcp_req_find_a_req_that_is_completed_by(struct tcpcb *tp, tcp_seq th_ack, int *ip); int tcp_req_check_for_comp(struct tcpcb *tp, tcp_seq ack_point); int tcp_req_is_entry_comp(struct tcpcb *tp, struct tcp_sendfile_track *ent, tcp_seq ack_point); struct tcp_sendfile_track * tcp_req_find_req_for_seq(struct tcpcb *tp, tcp_seq seq); void tcp_req_log_req_info(struct tcpcb *tp, struct tcp_sendfile_track *req, uint16_t slot, uint8_t val, uint64_t offset, uint64_t nbytes); uint32_t tcp_estimate_tls_overhead(struct socket *so, uint64_t tls_usr_bytes); void tcp_req_alloc_req(struct tcpcb *tp, union tcp_log_userdata *user, uint64_t ts); struct tcp_sendfile_track * tcp_req_alloc_req_full(struct tcpcb *tp, struct tcp_snd_req *req, uint64_t ts, int rec_dups); #endif #ifdef TCP_ACCOUNTING int tcp_do_ack_accounting(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, uint32_t tiwin, int mss); #endif static inline void tcp_lro_features_off(struct tcpcb *tp) { tp->t_flags2 &= ~(TF2_SUPPORTS_MBUFQ| TF2_MBUF_QUEUE_READY| TF2_DONT_SACK_QUEUE| TF2_MBUF_ACKCMP| TF2_MBUF_L_ACKS); } static inline void tcp_fields_to_host(struct tcphdr *th) { th->th_seq = ntohl(th->th_seq); th->th_ack = ntohl(th->th_ack); th->th_win = ntohs(th->th_win); th->th_urp = ntohs(th->th_urp); } static inline void tcp_fields_to_net(struct tcphdr *th) { th->th_seq = htonl(th->th_seq); th->th_ack = htonl(th->th_ack); th->th_win = htons(th->th_win); th->th_urp = htons(th->th_urp); } #endif /* _KERNEL */ -static inline uint16_t -tcp_get_flags(const struct tcphdr *th) -{ - return (((uint16_t)th->th_x2 << 8) | th->th_flags); -} - -static inline void -tcp_set_flags(struct tcphdr *th, uint16_t flags) -{ - th->th_x2 = (flags >> 8) & 0x0f; - th->th_flags = flags & 0xff; -} #endif /* _NETINET_TCP_VAR_H_ */ diff --git a/sys/netpfil/ipfilter/netinet/ip_compat.h b/sys/netpfil/ipfilter/netinet/ip_compat.h index 08ce4b572d43..c73af315b132 100644 --- a/sys/netpfil/ipfilter/netinet/ip_compat.h +++ b/sys/netpfil/ipfilter/netinet/ip_compat.h @@ -1,1241 +1,1235 @@ /* * Copyright (C) 2012 by Darren Reed. * * See the IPFILTER.LICENCE file for details on licencing. * Id: ip_compat.h,v 2.142.2.57 2007/10/10 09:51:42 darrenr Exp $ */ #ifndef __IP_COMPAT_H__ #define __IP_COMPAT_H__ #if defined(_KERNEL) || defined(KERNEL) || defined(__KERNEL__) # undef KERNEL # undef _KERNEL # undef __KERNEL__ # define KERNEL # define _KERNEL # define __KERNEL__ #endif #ifndef SOLARIS # if defined(sun) && defined(__SVR4) # define SOLARIS 1 # else # define SOLARIS 0 # endif #endif #if defined(__SVR4) # define index strchr # if !defined(_KERNEL) # define bzero(a,b) memset(a,0,b) # define bcmp memcmp # define bcopy(a,b,c) memmove(b,a,c) # endif #endif #ifndef LIFNAMSIZ # ifdef IF_NAMESIZE # define LIFNAMSIZ IF_NAMESIZE # else # ifdef IFNAMSIZ # define LIFNAMSIZ IFNAMSIZ # else # define LIFNAMSIZ 16 # endif # endif #endif # define IPL_EXTERN(ep) ipl##ep /* * This is a workaround for troubles on FreeBSD and OpenBSD. 
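 * When this file is built outside the kernel, _KERNEL and KERNEL are defined
 * temporarily around the include that follows (ADD_KERNEL records that they
 * were added here) and are undefined again straight away, so the kernel-only
 * declarations in that header become visible to the userland ipfilter code
 * without the rest of the compilation unit believing it is a kernel build.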
*/ #ifndef _KERNEL # define ADD_KERNEL # define _KERNEL # define KERNEL #endif #include #ifdef ADD_KERNEL # undef _KERNEL # undef KERNEL #endif #define NETBSD_GE_REV(x) (defined(__NetBSD_Version__) && \ (__NetBSD_Version__ >= (x))) #define NETBSD_GT_REV(x) (defined(__NetBSD_Version__) && \ (__NetBSD_Version__ > (x))) #define NETBSD_LT_REV(x) (defined(__NetBSD_Version__) && \ (__NetBSD_Version__ < (x))) /* ----------------------------------------------------------------------- */ /* F R E E B S D */ /* ----------------------------------------------------------------------- */ #define HAS_SYS_MD5_H 1 #if defined(_KERNEL) # include "opt_bpf.h" # include "opt_inet6.h" # if defined(INET6) && !defined(USE_INET6) # define USE_INET6 # endif #else # if !defined(USE_INET6) && !defined(NOINET6) # define USE_INET6 # endif #endif #if defined(_KERNEL) # include # define p_cred td_ucred # define p_uid td_ucred->cr_ruid /* * When #define'd, the 5.2.1 kernel panics when used with the ftp proxy. * There may be other, safe, kernels but this is not extensively tested yet. */ # define HAVE_M_PULLDOWN # if !defined(IPFILTER_LKM) && defined(__FreeBSD__) # include "opt_ipfilter.h" # endif # define COPYIN(a,b,c) copyin((caddr_t)(a), (caddr_t)(b), (c)) # define COPYOUT(a,b,c) copyout((caddr_t)(a), (caddr_t)(b), (c)) #else # include #endif /* _KERNEL */ #include #include #include #include #define KRWLOCK_FILL_SZ 56 #define KMUTEX_FILL_SZ 56 #include #define KMUTEX_T struct mtx #define KRWLOCK_T struct rwlock #ifdef _KERNEL # define READ_ENTER(x) rw_rlock(&(x)->ipf_lk) # define WRITE_ENTER(x) rw_wlock(&(x)->ipf_lk) # define MUTEX_DOWNGRADE(x) rw_downgrade(&(x)->ipf_lk) # define MUTEX_TRY_UPGRADE(x) rw_try_upgrade(&(x)->ipf_lk) # define RWLOCK_INIT(x,y) rw_init(&(x)->ipf_lk, (y)) # define RW_DESTROY(x) rw_destroy(&(x)->ipf_lk) # define RWLOCK_EXIT(x) do { \ if (rw_wowned(&(x)->ipf_lk)) \ rw_wunlock(&(x)->ipf_lk); \ else \ rw_runlock(&(x)->ipf_lk); \ } while (0) # include # include # define GETKTIME(x) microtime((struct timeval *)x) # define if_addrlist if_addrhead # include # include # include # define USE_MUTEXES # define MUTEX_ENTER(x) mtx_lock(&(x)->ipf_lk) # define MUTEX_EXIT(x) mtx_unlock(&(x)->ipf_lk) # define MUTEX_INIT(x,y) mtx_init(&(x)->ipf_lk, (y), NULL,\ MTX_DEF) # define MUTEX_DESTROY(x) mtx_destroy(&(x)->ipf_lk) # define MUTEX_NUKE(x) bzero((x), sizeof(*(x))) /* * Whilst the sx(9) locks on FreeBSD have the right semantics and interface * for what we want to use them for, despite testing showing they work - * with a WITNESS kernel, it generates LOR messages. 
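 * Because of those LOR reports the FreeBSD build stays with rwlock(9) and
 * mutex(9), and the ATOMIC_INC()/ATOMIC_DEC() macros below simply perform the
 * increment or decrement while holding the softc's ipf_rw mutex rather than
 * using a dedicated atomic primitive.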
*/ # include # define ATOMIC_INC(x) { mtx_lock(&softc->ipf_rw.ipf_lk); (x)++; \ mtx_unlock(&softc->ipf_rw.ipf_lk); } # define ATOMIC_DEC(x) { mtx_lock(&softc->ipf_rw.ipf_lk); (x)--; \ mtx_unlock(&softc->ipf_rw.ipf_lk); } # define ATOMIC_INCL(x) atomic_add_long(&(x), 1) # define ATOMIC_INC64(x) ATOMIC_INC(x) # define ATOMIC_INC32(x) atomic_add_32((u_int *)&(x), 1) # define ATOMIC_DECL(x) atomic_add_long(&(x), -1) # define ATOMIC_DEC64(x) ATOMIC_DEC(x) # define ATOMIC_DEC32(x) atomic_add_32((u_int *)&(x), -1) # define SPL_X(x) ; # define SPL_NET(x) ; # define SPL_IMP(x) ; # define SPL_SCHED(x) ; # define GET_MINOR dev2unit # define MSGDSIZE(m) mbufchainlen(m) # define M_LEN(m) (m)->m_len # define M_ADJ(m,x) m_adj(m, x) # define M_COPY(x) m_copym((x), 0, M_COPYALL, M_NOWAIT) # define M_DUP(m) m_dup(m, M_NOWAIT) # define IPF_PANIC(x,y) if (x) { printf y; panic("ipf_panic"); } typedef struct mbuf mb_t; #else /* !_KERNEL */ # ifndef _NET_IF_VAR_H_ /* * Userland emulation of struct ifnet. */ struct route; struct mbuf; struct ifnet { char if_xname[IFNAMSIZ]; STAILQ_HEAD(, ifaddr) if_addrlist; int (*if_output)(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); }; # endif /* _NET_IF_VAR_H_ */ #endif /* _KERNEL */ #define IFNAME(x) ((struct ifnet *)x)->if_xname #define COPYIFNAME(v, x, b) \ (void) strncpy(b, \ ((struct ifnet *)x)->if_xname, \ LIFNAMSIZ) typedef u_long ioctlcmd_t; typedef struct uio uio_t; typedef int minor_t; typedef u_int32_t u_32_t; # define U_32_T 1 /* ----------------------------------------------------------------------- */ /* G E N E R I C */ /* ----------------------------------------------------------------------- */ /* * For BSD kernels, if bpf is in the kernel, enable ipfilter to use bpf in * filter rules. */ #if !defined(IPFILTER_BPF) # if (defined(NBPF) && (NBPF > 0)) || (defined(DEV_BPF) && (DEV_BPF > 0)) || \ (defined(NBPFILTER) && (NBPFILTER > 0)) # define IPFILTER_BPF # endif #endif /* * Userland locking primitives */ #ifndef _KERNEL # if !defined(KMUTEX_FILL_SZ) # define KMUTEX_FILL_SZ 1 # endif # if !defined(KRWLOCK_FILL_SZ) # define KRWLOCK_FILL_SZ 1 # endif #endif typedef struct { char *eMm_owner; char *eMm_heldin; u_int eMm_magic; int eMm_held; int eMm_heldat; } eMmutex_t; typedef struct { char *eMrw_owner; char *eMrw_heldin; u_int eMrw_magic; short eMrw_read; short eMrw_write; int eMrw_heldat; } eMrwlock_t; typedef union { char _fill[KMUTEX_FILL_SZ]; #ifdef KMUTEX_T struct { KMUTEX_T ipf_slk; const char *ipf_lname; } ipf_lkun_s; #endif eMmutex_t ipf_emu; } ipfmutex_t; typedef union { char _fill[KRWLOCK_FILL_SZ]; #ifdef KRWLOCK_T struct { KRWLOCK_T ipf_slk; const char *ipf_lname; int ipf_sr; int ipf_sw; u_int ipf_magic; } ipf_lkun_s; #endif eMrwlock_t ipf_emu; } ipfrwlock_t; #define ipf_lk ipf_lkun_s.ipf_slk #define ipf_lname ipf_lkun_s.ipf_lname #define ipf_isr ipf_lkun_s.ipf_sr #define ipf_isw ipf_lkun_s.ipf_sw #define ipf_magic ipf_lkun_s.ipf_magic #if defined(__FreeBSD__) && defined(_KERNEL) CTASSERT(sizeof(ipfrwlock_t) == KRWLOCK_FILL_SZ); CTASSERT(sizeof(ipfmutex_t) == KMUTEX_FILL_SZ); #endif /* * In a non-kernel environment, there are a lot of macros that need to be * filled in to be null-ops or to point to some compatibility function, * somewhere in userland. 
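 * In the userland build, mbufs are emulated by the flat struct mb_s defined
 * below, and the MUTEX_*, READ_ENTER/WRITE_ENTER and RWLOCK_* macros resolve
 * to the eMmutex/eMrwlock debugging wrappers instead of kernel locking
 * primitives.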
*/ #ifndef _KERNEL typedef struct mb_s { struct mb_s *mb_next; char *mb_data; void *mb_ifp; int mb_len; int mb_flags; u_long mb_buf[2048]; } mb_t; # undef m_next # define m_next mb_next # undef m_len # define m_len mb_len # undef m_flags # define m_flags mb_flags # undef m_data # define m_data mb_data # undef M_MCAST # define M_MCAST 0x01 # undef M_BCAST # define M_BCAST 0x02 # undef M_MBCAST # define M_MBCAST 0x04 # define MSGDSIZE(m) msgdsize(m) # define M_LEN(m) (m)->mb_len # define M_ADJ(m,x) (m)->mb_len += x # define M_COPY(m) dupmbt(m) # define M_DUP(m) dupmbt(m) # define GETKTIME(x) gettimeofday((struct timeval *)(x), NULL) # define MTOD(m, t) ((t)(m)->mb_data) # define FREE_MB_T(m) freembt(m) # define ALLOC_MB_T(m,l) (m) = allocmbt(l) # define PREP_MB_T(f, m) do { \ (m)->mb_next = *(f)->fin_mp; \ *(fin)->fin_mp = (m); \ (f)->fin_m = (m); \ } while (0) # define SLEEP(x,y) 1; # define WAKEUP(x,y) ; # define POLLWAKEUP(y) ; # define IPF_PANIC(x,y) ; # define PANIC(x,y) ; # define SPL_SCHED(x) ; # define SPL_NET(x) ; # define SPL_IMP(x) ; # define SPL_X(x) ; # define KMALLOC(a,b) (a) = (b)malloc(sizeof(*a)) # define KMALLOCS(a,b,c) (a) = (b)malloc(c) # define KFREE(x) free(x) # define KFREES(x,s) free(x) # define GETIFP(x, v) get_unit(x,v) # define GETIFMTU_4(x) 2048 # define GETIFMTU_6(x) 2048 # define COPYIN(a,b,c) bcopywrap((a), (b), (c)) # define COPYOUT(a,b,c) bcopywrap((a), (b), (c)) # define COPYDATA(m, o, l, b) bcopy(MTOD((mb_t *)m, char *) + (o), \ (b), (l)) # define COPYBACK(m, o, l, b) bcopy((b), \ MTOD((mb_t *)m, char *) + (o), \ (l)) # define UIOMOVE(a,b,c,d) ipfuiomove((caddr_t)a,b,c,d) extern void m_copydata(mb_t *, int, int, caddr_t); extern int ipfuiomove(caddr_t, int, int, struct uio *); extern int bcopywrap(void *, void *, size_t); extern mb_t *allocmbt(size_t); extern mb_t *dupmbt(mb_t *); extern void freembt(mb_t *); # define MUTEX_DESTROY(x) eMmutex_destroy(&(x)->ipf_emu, \ __FILE__, __LINE__) # define MUTEX_ENTER(x) eMmutex_enter(&(x)->ipf_emu, \ __FILE__, __LINE__) # define MUTEX_EXIT(x) eMmutex_exit(&(x)->ipf_emu, \ __FILE__, __LINE__) # define MUTEX_INIT(x,y) eMmutex_init(&(x)->ipf_emu, y, \ __FILE__, __LINE__) # define MUTEX_NUKE(x) bzero((x), sizeof(*(x))) # define MUTEX_DOWNGRADE(x) eMrwlock_downgrade(&(x)->ipf_emu, \ __FILE__, __LINE__) # define MUTEX_TRY_UPGRADE(x) eMrwlock_try_upgrade(&(x)->ipf_emu, \ __FILE__, __LINE__) # define READ_ENTER(x) eMrwlock_read_enter(&(x)->ipf_emu, \ __FILE__, __LINE__) # define RWLOCK_INIT(x, y) eMrwlock_init(&(x)->ipf_emu, y) # define RWLOCK_EXIT(x) eMrwlock_exit(&(x)->ipf_emu) # define RW_DESTROY(x) eMrwlock_destroy(&(x)->ipf_emu) # define WRITE_ENTER(x) eMrwlock_write_enter(&(x)->ipf_emu, \ __FILE__, \ __LINE__) # define USE_MUTEXES 1 extern void eMmutex_destroy(eMmutex_t *, char *, int); extern void eMmutex_enter(eMmutex_t *, char *, int); extern void eMmutex_exit(eMmutex_t *, char *, int); extern void eMmutex_init(eMmutex_t *, char *, char *, int); extern void eMrwlock_destroy(eMrwlock_t *); extern void eMrwlock_exit(eMrwlock_t *); extern void eMrwlock_init(eMrwlock_t *, char *); extern void eMrwlock_read_enter(eMrwlock_t *, char *, int); extern void eMrwlock_write_enter(eMrwlock_t *, char *, int); extern void eMrwlock_downgrade(eMrwlock_t *, char *, int); #endif extern mb_t *allocmbt(size_t); #define MAX_IPV4HDR ((0xf << 2) + sizeof(struct icmp) + sizeof(ip_t) + 8) #ifndef IP_OFFMASK # define IP_OFFMASK 0x1fff #endif /* * On BSD's use quad_t as a guarantee for getting at least a 64bit sized * object. 
*/ #if !defined(__amd64__) && !SOLARIS # define USE_QUAD_T # define U_QUAD_T unsigned long long # define QUAD_T long long #else /* BSD */ # if !defined(U_QUAD_T) # define U_QUAD_T u_long # define QUAD_T long # endif #endif /* BSD */ #ifdef USE_INET6 # if defined(__NetBSD__) || defined(__FreeBSD__) # include # include # if defined(_KERNEL) # include # endif typedef struct ip6_hdr ip6_t; # endif #endif #ifndef MAX # define MAX(a,b) (((a) > (b)) ? (a) : (b)) #endif #if defined(_KERNEL) # if SOLARIS && !defined(INSTANCES) # define COPYDATA mb_copydata # define COPYBACK mb_copyback # else # define COPYDATA m_copydata # define COPYBACK m_copyback # endif # if (defined(__NetBSD_Version__) && (__NetBSD_Version__ < 105180000)) || \ defined(__FreeBSD__) # include # endif # if NETBSD_GE_REV(105180000) # include # else # include extern vm_map_t kmem_map; # endif # include # ifdef IPFILTER_M_IPFILTER # include MALLOC_DECLARE(M_IPFILTER); # define _M_IPF M_IPFILTER # else /* IPFILTER_M_IPFILTER */ # ifdef M_PFIL # define _M_IPF M_PFIL # else # ifdef M_IPFILTER # define _M_IPF M_IPFILTER # else # define _M_IPF M_TEMP # endif /* M_IPFILTER */ # endif /* M_PFIL */ # endif /* IPFILTER_M_IPFILTER */ # if !defined(KMALLOC) # define KMALLOC(a, b) (a) = (b)malloc(sizeof(*(a)), _M_IPF, M_NOWAIT) # endif # if !defined(KMALLOCS) # define KMALLOCS(a, b, c) (a) = (b)malloc((c), _M_IPF, M_NOWAIT) # endif # if !defined(KFREE) # define KFREE(x) free((x), _M_IPF) # endif # if !defined(KFREES) # define KFREES(x,s) free((x), _M_IPF) # endif # define UIOMOVE(a,b,c,d) uiomove((caddr_t)a,b,d) # define SLEEP(id, n) tsleep((id), PPAUSE|PCATCH, n, 0) # define WAKEUP(id,x) wakeup(id+x) # if !defined(POLLWAKEUP) # define POLLWAKEUP(x) selwakeup(softc->ipf_selwait+x) # endif # define GETIFP(n, v) ifunit(n) # define GETIFMTU_4(x) ((struct ifnet *)x)->if_mtu # define GETIFMTU_6(x) ((struct ifnet *)x)->if_mtu # if !defined(USE_MUTEXES) && !defined(SPL_NET) # define SPL_IMP(x) x = splimp() # define SPL_NET(x) x = splnet() # if !defined(SPL_SCHED) # define SPL_SCHED(x) x = splsched() # endif # define SPL_X(x) (void) splx(x) # endif /* !USE_MUTEXES */ # ifndef FREE_MB_T # define FREE_MB_T(m) m_freem(m) # endif # ifndef ALLOC_MB_T # ifdef MGETHDR # define ALLOC_MB_T(m,l) do { \ MGETHDR((m), M_NOWAIT, MT_HEADER); \ if ((m) != NULL) { \ (m)->m_len = (l); \ (m)->m_pkthdr.len = (l); \ } \ } while (0) # else # define ALLOC_MB_T(m,l) do { \ MGET((m), M_NOWAIT, MT_HEADER); \ if ((m) != NULL) { \ (m)->m_len = (l); \ (m)->m_pkthdr.len = (l); \ } \ } while (0) # endif # endif # ifndef PREP_MB_T # define PREP_MB_T(f, m) do { \ mb_t *_o = *(f)->fin_mp; \ (m)->m_next = _o; \ *(fin)->fin_mp = (m); \ if (_o->m_flags & M_PKTHDR) { \ (m)->m_pkthdr.len += \ _o->m_pkthdr.len; \ (m)->m_pkthdr.rcvif = \ _o->m_pkthdr.rcvif; \ } \ } while (0) # endif # ifndef M_DUP # ifdef M_COPYALL # define M_DUP(m) m_dup(m, 0, M_COPYALL, 0) # else # define M_DUP(m) m_dup(m) # endif # endif # ifndef MTOD # define MTOD(m,t) mtod(m,t) # endif # ifndef COPYIN # define COPYIN(a,b,c) (bcopy((caddr_t)(a), (caddr_t)(b), (c)), 0) # define COPYOUT(a,b,c) (bcopy((caddr_t)(a), (caddr_t)(b), (c)), 0) # endif # if SOLARIS && !defined(KMALLOC) # define KMALLOC(a,b) (a) = (b)new_kmem_alloc(sizeof(*(a)), \ KMEM_NOSLEEP) # define KMALLOCS(a,b,c) (a) = (b)new_kmem_alloc((c), KMEM_NOSLEEP) # endif # ifndef GET_MINOR # define GET_MINOR(x) dev2unit(x) # endif # define PANIC(x,y) if (x) panic y #endif /* _KERNEL */ #if !defined(IFNAME) && !defined(_KERNEL) # define IFNAME(x) get_ifname((struct 
ifnet *)x) #endif #ifndef COPYIFNAME # define NEED_FRGETIFNAME extern char *ipf_getifname(struct ifnet *, char *); # define COPYIFNAME(v, x, b) \ ipf_getifname((struct ifnet *)x, b) #endif #ifndef ASSERT # ifdef _KERNEL # define ASSERT(x) # else # define ASSERT(x) do { if (!(x)) abort(); } while (0) # endif #endif #ifndef BCOPYIN # define BCOPYIN(a,b,c) (bcopy((caddr_t)(a), (caddr_t)(b), (c)), 0) # define BCOPYOUT(a,b,c) (bcopy((caddr_t)(a), (caddr_t)(b), (c)), 0) #endif /* * Because the ctype(3) posix definition, if used "safely" in code everywhere, * would mean all normal code that walks through strings needed casts. Yuck. */ #define ISALNUM(x) isalnum((u_char)(x)) #define ISALPHA(x) isalpha((u_char)(x)) #define ISDIGIT(x) isdigit((u_char)(x)) #define ISSPACE(x) isspace((u_char)(x)) #define ISUPPER(x) isupper((u_char)(x)) #define ISXDIGIT(x) isxdigit((u_char)(x)) #define ISLOWER(x) islower((u_char)(x)) #define TOUPPER(x) toupper((u_char)(x)) #define TOLOWER(x) tolower((u_char)(x)) /* * If mutexes aren't being used, turn all the mutex functions into null-ops. */ #if !defined(USE_MUTEXES) # define USE_SPL 1 # undef RW_DESTROY # undef MUTEX_INIT # undef MUTEX_NUKE # undef MUTEX_DESTROY # define MUTEX_ENTER(x) ; # define READ_ENTER(x) ; # define WRITE_ENTER(x) ; # define MUTEX_DOWNGRADE(x) ; # define MUTEX_TRY_UPGRADE(x) ; # define RWLOCK_INIT(x, y) ; # define RWLOCK_EXIT(x) ; # define RW_DESTROY(x) ; # define MUTEX_EXIT(x) ; # define MUTEX_INIT(x,y) ; # define MUTEX_DESTROY(x) ; # define MUTEX_NUKE(x) ; #endif /* !USE_MUTEXES */ #ifndef ATOMIC_INC # define ATOMIC_INC(x) (x)++ # define ATOMIC_DEC(x) (x)-- #endif #if defined(USE_SPL) && defined(_KERNEL) # define SPL_INT(x) int x #else # define SPL_INT(x) #endif /* * If there are no atomic operations for bit sizes defined, define them to all * use a generic one that works for all sizes. */ #ifndef ATOMIC_INCL # define ATOMIC_INCL ATOMIC_INC # define ATOMIC_INC64 ATOMIC_INC # define ATOMIC_INC32 ATOMIC_INC # define ATOMIC_DECL ATOMIC_DEC # define ATOMIC_DEC64 ATOMIC_DEC # define ATOMIC_DEC32 ATOMIC_DEC #endif #ifndef HDR_T_PRIVATE typedef struct tcphdr tcphdr_t; typedef struct udphdr udphdr_t; #endif typedef struct icmp icmphdr_t; typedef struct ip ip_t; typedef struct ether_header ether_header_t; typedef struct tcpiphdr tcpiphdr_t; #ifndef FR_GROUPLEN # define FR_GROUPLEN 16 #endif #ifndef offsetof # define offsetof(t,m) (size_t)((&((t *)0L)->m)) #endif #ifndef stsizeof # define stsizeof(t,m) sizeof(((t *)0L)->m) #endif /* * This set of macros has been brought about because on Tru64 it is not * possible to easily assign or examine values in a structure that are * bit fields. */ #ifndef IP_V # define IP_V(x) (x)->ip_v #endif #ifndef IP_V_A # define IP_V_A(x,y) (x)->ip_v = (y) #endif #ifndef IP_HL # define IP_HL(x) (x)->ip_hl #endif #ifndef IP_HL_A # define IP_HL_A(x,y) (x)->ip_hl = ((y) & 0xf) #endif -#ifndef TCP_X2 -# define TCP_X2(x) (x)->th_x2 -#endif -#ifndef TCP_X2_A -# define TCP_X2_A(x,y) (x)->th_x2 = (y) -#endif #ifndef TCP_OFF # define TCP_OFF(x) (x)->th_off #endif #ifndef TCP_OFF_A # define TCP_OFF_A(x,y) (x)->th_off = (y) #endif #define IPMINLEN(i, h) ((i)->ip_len >= (IP_HL(i) * 4 + sizeof(struct h))) #define TCPF_ALL (TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG|\ TH_ECN|TH_CWR) #if !SOLARIS && !defined(m_act) # define m_act m_nextpkt #endif /* * Security Options for Intenet Protocol (IPSO) as defined in RFC 1108. 
* * Basic Option * * 00000001 - (Reserved 4) * 00111101 - Top Secret * 01011010 - Secret * 10010110 - Confidential * 01100110 - (Reserved 3) * 11001100 - (Reserved 2) * 10101011 - Unclassified * 11110001 - (Reserved 1) */ #define IPSO_CLASS_RES4 0x01 #define IPSO_CLASS_TOPS 0x3d #define IPSO_CLASS_SECR 0x5a #define IPSO_CLASS_CONF 0x96 #define IPSO_CLASS_RES3 0x66 #define IPSO_CLASS_RES2 0xcc #define IPSO_CLASS_UNCL 0xab #define IPSO_CLASS_RES1 0xf1 #define IPSO_AUTH_GENSER 0x80 #define IPSO_AUTH_ESI 0x40 #define IPSO_AUTH_SCI 0x20 #define IPSO_AUTH_NSA 0x10 #define IPSO_AUTH_DOE 0x08 #define IPSO_AUTH_UN 0x06 #define IPSO_AUTH_FTE 0x01 /* * IP option #defines */ #undef IPOPT_RR #define IPOPT_RR 7 #undef IPOPT_ZSU #define IPOPT_ZSU 10 /* ZSU */ #undef IPOPT_MTUP #define IPOPT_MTUP 11 /* MTUP */ #undef IPOPT_MTUR #define IPOPT_MTUR 12 /* MTUR */ #undef IPOPT_ENCODE #define IPOPT_ENCODE 15 /* ENCODE */ #undef IPOPT_TS #define IPOPT_TS 68 #undef IPOPT_TR #define IPOPT_TR 82 /* TR */ #undef IPOPT_SECURITY #define IPOPT_SECURITY 130 #undef IPOPT_LSRR #define IPOPT_LSRR 131 #undef IPOPT_E_SEC #define IPOPT_E_SEC 133 /* E-SEC */ #undef IPOPT_CIPSO #define IPOPT_CIPSO 134 /* CIPSO */ #undef IPOPT_SATID #define IPOPT_SATID 136 #ifndef IPOPT_SID # define IPOPT_SID IPOPT_SATID #endif #undef IPOPT_SSRR #define IPOPT_SSRR 137 #undef IPOPT_ADDEXT #define IPOPT_ADDEXT 147 /* ADDEXT */ #undef IPOPT_VISA #define IPOPT_VISA 142 /* VISA */ #undef IPOPT_IMITD #define IPOPT_IMITD 144 /* IMITD */ #undef IPOPT_EIP #define IPOPT_EIP 145 /* EIP */ #undef IPOPT_RTRALRT #define IPOPT_RTRALRT 148 /* RTRALRT */ #undef IPOPT_SDB #define IPOPT_SDB 149 #undef IPOPT_NSAPA #define IPOPT_NSAPA 150 #undef IPOPT_DPS #define IPOPT_DPS 151 #undef IPOPT_UMP #define IPOPT_UMP 152 #undef IPOPT_FINN #define IPOPT_FINN 205 /* FINN */ #undef IPOPT_AH #define IPOPT_AH 256+IPPROTO_AH #define ICMP_UNREACH_ADMIN_PROHIBIT ICMP_UNREACH_FILTER_PROHIB #define ICMP_UNREACH_FILTER ICMP_UNREACH_FILTER_PROHIB #ifndef IPVERSION # define IPVERSION 4 #endif #ifndef IPOPT_MINOFF # define IPOPT_MINOFF 4 #endif #ifndef IPOPT_COPIED # define IPOPT_COPIED(x) ((x)&0x80) #endif #ifndef IPOPT_EOL # define IPOPT_EOL 0 #endif #ifndef IPOPT_NOP # define IPOPT_NOP 1 #endif #ifndef IP_MF # define IP_MF ((u_short)0x2000) #endif #ifndef ETHERTYPE_IP # define ETHERTYPE_IP ((u_short)0x0800) #endif #ifndef TH_FIN # define TH_FIN 0x01 #endif #ifndef TH_SYN # define TH_SYN 0x02 #endif #ifndef TH_RST # define TH_RST 0x04 #endif #ifndef TH_PUSH # define TH_PUSH 0x08 #endif #ifndef TH_ACK # define TH_ACK 0x10 #endif #ifndef TH_URG # define TH_URG 0x20 #endif #undef TH_ACKMASK #define TH_ACKMASK (TH_FIN|TH_SYN|TH_RST|TH_ACK) #ifndef IPOPT_EOL # define IPOPT_EOL 0 #endif #ifndef IPOPT_NOP # define IPOPT_NOP 1 #endif #ifndef IPOPT_RR # define IPOPT_RR 7 #endif #ifndef IPOPT_TS # define IPOPT_TS 68 #endif #ifndef IPOPT_SECURITY # define IPOPT_SECURITY 130 #endif #ifndef IPOPT_LSRR # define IPOPT_LSRR 131 #endif #ifndef IPOPT_SATID # define IPOPT_SATID 136 #endif #ifndef IPOPT_SSRR # define IPOPT_SSRR 137 #endif #ifndef IPOPT_SECUR_UNCLASS # define IPOPT_SECUR_UNCLASS ((u_short)0x0000) #endif #ifndef IPOPT_SECUR_CONFID # define IPOPT_SECUR_CONFID ((u_short)0xf135) #endif #ifndef IPOPT_SECUR_EFTO # define IPOPT_SECUR_EFTO ((u_short)0x789a) #endif #ifndef IPOPT_SECUR_MMMM # define IPOPT_SECUR_MMMM ((u_short)0xbc4d) #endif #ifndef IPOPT_SECUR_RESTR # define IPOPT_SECUR_RESTR ((u_short)0xaf13) #endif #ifndef IPOPT_SECUR_SECRET # define IPOPT_SECUR_SECRET ((u_short)0xd788) #endif 
#ifndef IPOPT_SECUR_TOPSECRET # define IPOPT_SECUR_TOPSECRET ((u_short)0x6bc5) #endif #ifndef IPOPT_OLEN # define IPOPT_OLEN 1 #endif #ifndef IPPROTO_HOPOPTS # define IPPROTO_HOPOPTS 0 #endif #ifndef IPPROTO_IPIP # define IPPROTO_IPIP 4 #endif #ifndef IPPROTO_ENCAP # define IPPROTO_ENCAP 98 #endif #ifndef IPPROTO_IPV6 # define IPPROTO_IPV6 41 #endif #ifndef IPPROTO_ROUTING # define IPPROTO_ROUTING 43 #endif #ifndef IPPROTO_FRAGMENT # define IPPROTO_FRAGMENT 44 #endif #ifndef IPPROTO_GRE # define IPPROTO_GRE 47 /* GRE encaps RFC 1701 */ #endif #ifndef IPPROTO_ESP # define IPPROTO_ESP 50 #endif #ifndef IPPROTO_AH # define IPPROTO_AH 51 #endif #ifndef IPPROTO_ICMPV6 # define IPPROTO_ICMPV6 58 #endif #ifndef IPPROTO_NONE # define IPPROTO_NONE 59 #endif #ifndef IPPROTO_DSTOPTS # define IPPROTO_DSTOPTS 60 #endif #ifndef IPPROTO_MOBILITY # define IPPROTO_MOBILITY 135 #endif #ifndef ICMP_ROUTERADVERT # define ICMP_ROUTERADVERT 9 #endif #ifndef ICMP_ROUTERSOLICIT # define ICMP_ROUTERSOLICIT 10 #endif #ifndef ICMP6_DST_UNREACH # define ICMP6_DST_UNREACH 1 #endif #ifndef ICMP6_PACKET_TOO_BIG # define ICMP6_PACKET_TOO_BIG 2 #endif #ifndef ICMP6_TIME_EXCEEDED # define ICMP6_TIME_EXCEEDED 3 #endif #ifndef ICMP6_PARAM_PROB # define ICMP6_PARAM_PROB 4 #endif #ifndef ICMP6_ECHO_REQUEST # define ICMP6_ECHO_REQUEST 128 #endif #ifndef ICMP6_ECHO_REPLY # define ICMP6_ECHO_REPLY 129 #endif #ifndef ICMP6_MEMBERSHIP_QUERY # define ICMP6_MEMBERSHIP_QUERY 130 #endif #ifndef MLD6_LISTENER_QUERY # define MLD6_LISTENER_QUERY 130 #endif #ifndef ICMP6_MEMBERSHIP_REPORT # define ICMP6_MEMBERSHIP_REPORT 131 #endif #ifndef MLD6_LISTENER_REPORT # define MLD6_LISTENER_REPORT 131 #endif #ifndef ICMP6_MEMBERSHIP_REDUCTION # define ICMP6_MEMBERSHIP_REDUCTION 132 #endif #ifndef MLD6_LISTENER_DONE # define MLD6_LISTENER_DONE 132 #endif #ifndef ND_ROUTER_SOLICIT # define ND_ROUTER_SOLICIT 133 #endif #ifndef ND_ROUTER_ADVERT # define ND_ROUTER_ADVERT 134 #endif #ifndef ND_NEIGHBOR_SOLICIT # define ND_NEIGHBOR_SOLICIT 135 #endif #ifndef ND_NEIGHBOR_ADVERT # define ND_NEIGHBOR_ADVERT 136 #endif #ifndef ND_REDIRECT # define ND_REDIRECT 137 #endif #ifndef ICMP6_ROUTER_RENUMBERING # define ICMP6_ROUTER_RENUMBERING 138 #endif #ifndef ICMP6_WRUREQUEST # define ICMP6_WRUREQUEST 139 #endif #ifndef ICMP6_WRUREPLY # define ICMP6_WRUREPLY 140 #endif #ifndef ICMP6_FQDN_QUERY # define ICMP6_FQDN_QUERY 139 #endif #ifndef ICMP6_FQDN_REPLY # define ICMP6_FQDN_REPLY 140 #endif #ifndef ICMP6_NI_QUERY # define ICMP6_NI_QUERY 139 #endif #ifndef ICMP6_NI_REPLY # define ICMP6_NI_REPLY 140 #endif #ifndef MLD6_MTRACE_RESP # define MLD6_MTRACE_RESP 200 #endif #ifndef MLD6_MTRACE # define MLD6_MTRACE 201 #endif #ifndef ICMP6_HADISCOV_REQUEST # define ICMP6_HADISCOV_REQUEST 202 #endif #ifndef ICMP6_HADISCOV_REPLY # define ICMP6_HADISCOV_REPLY 203 #endif #ifndef ICMP6_MOBILEPREFIX_SOLICIT # define ICMP6_MOBILEPREFIX_SOLICIT 204 #endif #ifndef ICMP6_MOBILEPREFIX_ADVERT # define ICMP6_MOBILEPREFIX_ADVERT 205 #endif #ifndef ICMP6_MAXTYPE # define ICMP6_MAXTYPE 205 #endif #ifndef ICMP6_DST_UNREACH_NOROUTE # define ICMP6_DST_UNREACH_NOROUTE 0 #endif #ifndef ICMP6_DST_UNREACH_ADMIN # define ICMP6_DST_UNREACH_ADMIN 1 #endif #ifndef ICMP6_DST_UNREACH_NOTNEIGHBOR # define ICMP6_DST_UNREACH_NOTNEIGHBOR 2 #endif #ifndef ICMP6_DST_UNREACH_BEYONDSCOPE # define ICMP6_DST_UNREACH_BEYONDSCOPE 2 #endif #ifndef ICMP6_DST_UNREACH_ADDR # define ICMP6_DST_UNREACH_ADDR 3 #endif #ifndef ICMP6_DST_UNREACH_NOPORT # define ICMP6_DST_UNREACH_NOPORT 4 #endif #ifndef 
ICMP6_TIME_EXCEED_TRANSIT # define ICMP6_TIME_EXCEED_TRANSIT 0 #endif #ifndef ICMP6_TIME_EXCEED_REASSEMBLY # define ICMP6_TIME_EXCEED_REASSEMBLY 1 #endif #ifndef ICMP6_NI_SUCCESS # define ICMP6_NI_SUCCESS 0 #endif #ifndef ICMP6_NI_REFUSED # define ICMP6_NI_REFUSED 1 #endif #ifndef ICMP6_NI_UNKNOWN # define ICMP6_NI_UNKNOWN 2 #endif #ifndef ICMP6_ROUTER_RENUMBERING_COMMAND # define ICMP6_ROUTER_RENUMBERING_COMMAND 0 #endif #ifndef ICMP6_ROUTER_RENUMBERING_RESULT # define ICMP6_ROUTER_RENUMBERING_RESULT 1 #endif #ifndef ICMP6_ROUTER_RENUMBERING_SEQNUM_RESET # define ICMP6_ROUTER_RENUMBERING_SEQNUM_RESET 255 #endif #ifndef ICMP6_PARAMPROB_HEADER # define ICMP6_PARAMPROB_HEADER 0 #endif #ifndef ICMP6_PARAMPROB_NEXTHEADER # define ICMP6_PARAMPROB_NEXTHEADER 1 #endif #ifndef ICMP6_PARAMPROB_OPTION # define ICMP6_PARAMPROB_OPTION 2 #endif #ifndef ICMP6_NI_SUBJ_IPV6 # define ICMP6_NI_SUBJ_IPV6 0 #endif #ifndef ICMP6_NI_SUBJ_FQDN # define ICMP6_NI_SUBJ_FQDN 1 #endif #ifndef ICMP6_NI_SUBJ_IPV4 # define ICMP6_NI_SUBJ_IPV4 2 #endif #ifndef MLD_MTRACE_RESP # define MLD_MTRACE_RESP 200 #endif #ifndef MLD_MTRACE # define MLD_MTRACE 201 #endif #ifndef MLD6_MTRACE_RESP # define MLD6_MTRACE_RESP MLD_MTRACE_RESP #endif #ifndef MLD6_MTRACE # define MLD6_MTRACE MLD_MTRACE #endif #if !defined(IPV6_FLOWINFO_MASK) # if (BYTE_ORDER == BIG_ENDIAN) || defined(_BIG_ENDIAN) # define IPV6_FLOWINFO_MASK 0x0fffffff /* flow info (28 bits) */ # else # if(BYTE_ORDER == LITTLE_ENDIAN) || !defined(_BIG_ENDIAN) # define IPV6_FLOWINFO_MASK 0xffffff0f /* flow info (28 bits) */ # endif /* LITTLE_ENDIAN */ # endif #endif #if !defined(IPV6_FLOWLABEL_MASK) # if (BYTE_ORDER == BIG_ENDIAN) || defined(_BIG_ENDIAN) # define IPV6_FLOWLABEL_MASK 0x000fffff /* flow label (20 bits) */ # else # if (BYTE_ORDER == LITTLE_ENDIAN) || !defined(_BIG_ENDIAN) # define IPV6_FLOWLABEL_MASK 0xffff0f00 /* flow label (20 bits) */ # endif /* LITTLE_ENDIAN */ # endif #endif /* * ECN is a new addition to TCP - RFC 2481 */ #ifndef TH_ECN # define TH_ECN 0x40 #endif #ifndef TH_CWR # define TH_CWR 0x80 #endif #define TH_ECNALL (TH_ECN|TH_CWR) /* * TCP States */ #define IPF_TCPS_LISTEN 0 /* listening for connection */ #define IPF_TCPS_SYN_SENT 1 /* active, have sent syn */ #define IPF_TCPS_SYN_RECEIVED 2 /* have send and received syn */ #define IPF_TCPS_HALF_ESTAB 3 /* for connections not fully "up" */ /* states < IPF_TCPS_ESTABLISHED are those where connections not established */ #define IPF_TCPS_ESTABLISHED 4 /* established */ #define IPF_TCPS_CLOSE_WAIT 5 /* rcvd fin, waiting for close */ /* states > IPF_TCPS_CLOSE_WAIT are those where user has closed */ #define IPF_TCPS_FIN_WAIT_1 6 /* have closed, sent fin */ #define IPF_TCPS_CLOSING 7 /* closed xchd FIN; await FIN ACK */ #define IPF_TCPS_LAST_ACK 8 /* had fin and close; await FIN ACK */ /* states > IPF_TCPS_CLOSE_WAIT && < IPF_TCPS_FIN_WAIT_2 await ACK of FIN */ #define IPF_TCPS_FIN_WAIT_2 9 /* have closed, fin is acked */ #define IPF_TCPS_TIME_WAIT 10 /* in 2*msl quiet wait after close */ #define IPF_TCPS_CLOSED 11 /* closed */ #define IPF_TCP_NSTATES 12 #define TCP_MSL 120 #undef ICMP_MAX_UNREACH #define ICMP_MAX_UNREACH 14 #undef ICMP_MAXTYPE #define ICMP_MAXTYPE 18 #ifndef LOG_FTP # define LOG_FTP (11<<3) #endif #ifndef LOG_AUTHPRIV # define LOG_AUTHPRIV (10<<3) #endif #ifndef LOG_AUDIT # define LOG_AUDIT (13<<3) #endif #ifndef LOG_NTP # define LOG_NTP (12<<3) #endif #ifndef LOG_SECURITY # define LOG_SECURITY (13<<3) #endif #ifndef LOG_LFMT # define LOG_LFMT (14<<3) #endif #ifndef LOG_CONSOLE # define 
LOG_CONSOLE (14<<3) #endif /* * ICMP error replies have an IP header (20 bytes), 8 bytes of ICMP data, * another IP header and then 64 bits of data, totalling 56. Of course, * the last 64 bits is dependent on that being available. */ #define ICMPERR_ICMPHLEN 8 #define ICMPERR_IPICMPHLEN (20 + 8) #define ICMPERR_MINPKTLEN (20 + 8 + 20) #define ICMPERR_MAXPKTLEN (20 + 8 + 20 + 8) #define ICMP6ERR_MINPKTLEN (40 + 8) #define ICMP6ERR_IPICMPHLEN (40 + 8 + 40) #ifndef MIN # define MIN(a,b) (((a)<(b))?(a):(b)) #endif #ifdef RESCUE # undef IPFILTER_BPF #endif #ifdef IPF_DEBUG # define DPRINT(x) printf x #else # define DPRINT(x) #endif #if defined(DTRACE_PROBE) && defined(_KERNEL) # define DT(_n) DTRACE_PROBE(_n) # define DT1(_n,_a,_b) DTRACE_PROBE1(_n,_a,_b) # define DT2(_n,_a,_b,_c,_d) DTRACE_PROBE2(_n,_a,_b,_c,_d) # define DT3(_n,_a,_b,_c,_d,_e,_f) \ DTRACE_PROBE3(_n,_a,_b,_c,_d,_e,_f) # define DT4(_n,_a,_b,_c,_d,_e,_f,_g,_h) \ DTRACE_PROBE4(_n,_a,_b,_c,_d,_e,_f,_g,_h) # define DT5(_n,_a,_b,_c,_d,_e,_f,_g,_h,_i,_j) \ DTRACE_PROBE5(_n,_a,_b,_c,_d,_e,_f,_g,_h,_i,_j) #else # define DT(_n) # define DT1(_n,_a,_b) # define DT2(_n,_a,_b,_c,_d) # define DT3(_n,_a,_b,_c,_d,_e,_f) # define DT4(_n,_a,_b,_c,_d,_e,_f,_g,_h) # define DT5(_n,_a,_b,_c,_d,_e,_f,_g,_h,_i,_j) #endif struct ip6_routing { u_char ip6r_nxt; /* next header */ u_char ip6r_len; /* length in units of 8 octets */ u_char ip6r_type; /* always zero */ u_char ip6r_segleft; /* segments left */ u_32_t ip6r_reserved; /* reserved field */ }; #endif /* __IP_COMPAT_H__ */ diff --git a/sys/netpfil/ipfilter/netinet/ip_fil_freebsd.c b/sys/netpfil/ipfilter/netinet/ip_fil_freebsd.c index 139aff9b1c73..1922880e90df 100644 --- a/sys/netpfil/ipfilter/netinet/ip_fil_freebsd.c +++ b/sys/netpfil/ipfilter/netinet/ip_fil_freebsd.c @@ -1,1459 +1,1458 @@ /* * Copyright (C) 2012 by Darren Reed. * * See the IPFILTER.LICENCE file for details on licencing. 
*/ #if defined(KERNEL) || defined(_KERNEL) # undef KERNEL # undef _KERNEL # define KERNEL 1 # define _KERNEL 1 #endif #if defined(__FreeBSD__) && \ !defined(KLD_MODULE) && !defined(IPFILTER_LKM) # include "opt_inet6.h" #endif #include #include #include #include #include #include #include #include #include #include #include #if defined(__FreeBSD__) # include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "netinet/ip_compat.h" #ifdef USE_INET6 # include #endif #include "netinet/ip_fil.h" #include "netinet/ip_nat.h" #include "netinet/ip_frag.h" #include "netinet/ip_state.h" #include "netinet/ip_proxy.h" #include "netinet/ip_auth.h" #include "netinet/ip_sync.h" #include "netinet/ip_lookup.h" #include "netinet/ip_dstlist.h" #ifdef IPFILTER_SCAN # include "netinet/ip_scan.h" #endif #include "netinet/ip_pool.h" #include #include #ifdef CSUM_DATA_VALID # include #endif extern int ip_optcopy(struct ip *, struct ip *); #ifdef IPFILTER_M_IPFILTER MALLOC_DEFINE(M_IPFILTER, "ipfilter", "IP Filter packet filter data structures"); #endif static int ipf_send_ip(fr_info_t *, mb_t *); static void ipf_timer_func(void *arg); VNET_DEFINE(ipf_main_softc_t, ipfmain) = { .ipf_running = -2, }; #define V_ipfmain VNET(ipfmain) #include #include VNET_DEFINE_STATIC(eventhandler_tag, ipf_arrivetag); VNET_DEFINE_STATIC(eventhandler_tag, ipf_departtag); #define V_ipf_arrivetag VNET(ipf_arrivetag) #define V_ipf_departtag VNET(ipf_departtag) #if 0 /* * Disable the "cloner" event handler; we are getting interface * events before the firewall is fully initiallized and also no vnet * information thus leading to uninitialised memory accesses. * In addition it is unclear why we need it in first place. * If it turns out to be needed, well need a dedicated event handler * for it to deal with the ifc and the correct vnet. */ VNET_DEFINE_STATIC(eventhandler_tag, ipf_clonetag); #define V_ipf_clonetag VNET(ipf_clonetag) #endif static void ipf_ifevent(void *arg, struct ifnet *ifp); static void ipf_ifevent(void *arg, struct ifnet *ifp) { CURVNET_SET(ifp->if_vnet); if (V_ipfmain.ipf_running > 0) ipf_sync(&V_ipfmain, NULL); CURVNET_RESTORE(); } static pfil_return_t ipf_check_wrapper(struct mbuf **mp, struct ifnet *ifp, int flags, void *ruleset __unused, struct inpcb *inp) { struct ip *ip = mtod(*mp, struct ip *); pfil_return_t rv; CURVNET_SET(ifp->if_vnet); rv = ipf_check(&V_ipfmain, ip, ip->ip_hl << 2, ifp, !!(flags & PFIL_OUT), mp); CURVNET_RESTORE(); return (rv == 0 ? PFIL_PASS : PFIL_DROPPED); } #ifdef USE_INET6 static pfil_return_t ipf_check_wrapper6(struct mbuf **mp, struct ifnet *ifp, int flags, void *ruleset __unused, struct inpcb *inp) { pfil_return_t rv; CURVNET_SET(ifp->if_vnet); rv = ipf_check(&V_ipfmain, mtod(*mp, struct ip *), sizeof(struct ip6_hdr), ifp, !!(flags & PFIL_OUT), mp); CURVNET_RESTORE(); return (rv == 0 ? 
PFIL_PASS : PFIL_DROPPED); } # endif #if defined(IPFILTER_LKM) int ipf_identify(char *s) { if (strcmp(s, "ipl") == 0) return (1); return (0); } #endif /* IPFILTER_LKM */ static void ipf_timer_func(void *arg) { ipf_main_softc_t *softc = arg; SPL_INT(s); SPL_NET(s); READ_ENTER(&softc->ipf_global); if (softc->ipf_running > 0) ipf_slowtimer(softc); if (softc->ipf_running == -1 || softc->ipf_running == 1) { #if 0 softc->ipf_slow_ch = timeout(ipf_timer_func, softc, hz/2); #endif callout_init(&softc->ipf_slow_ch, 1); callout_reset(&softc->ipf_slow_ch, (hz / IPF_HZ_DIVIDE) * IPF_HZ_MULT, ipf_timer_func, softc); } RWLOCK_EXIT(&softc->ipf_global); SPL_X(s); } int ipfattach(ipf_main_softc_t *softc) { #ifdef USE_SPL int s; #endif SPL_NET(s); if (softc->ipf_running > 0) { SPL_X(s); return (EBUSY); } if (ipf_init_all(softc) < 0) { SPL_X(s); return (EIO); } bzero((char *)V_ipfmain.ipf_selwait, sizeof(V_ipfmain.ipf_selwait)); softc->ipf_running = 1; if (softc->ipf_control_forwarding & 1) V_ipforwarding = 1; SPL_X(s); #if 0 softc->ipf_slow_ch = timeout(ipf_timer_func, softc, (hz / IPF_HZ_DIVIDE) * IPF_HZ_MULT); #endif callout_init(&softc->ipf_slow_ch, 1); callout_reset(&softc->ipf_slow_ch, (hz / IPF_HZ_DIVIDE) * IPF_HZ_MULT, ipf_timer_func, softc); return (0); } /* * Disable the filter by removing the hooks from the IP input/output * stream. */ int ipfdetach(ipf_main_softc_t *softc) { #ifdef USE_SPL int s; #endif if (softc->ipf_control_forwarding & 2) V_ipforwarding = 0; SPL_NET(s); #if 0 if (softc->ipf_slow_ch.callout != NULL) untimeout(ipf_timer_func, softc, softc->ipf_slow_ch); bzero(&softc->ipf_slow, sizeof(softc->ipf_slow)); #endif callout_drain(&softc->ipf_slow_ch); ipf_fini_all(softc); softc->ipf_running = -2; SPL_X(s); return (0); } /* * Filter ioctl interface. */ int ipfioctl(struct cdev *dev, ioctlcmd_t cmd, caddr_t data, int mode, struct thread *p) #define p_cred td_ucred #define p_uid td_ucred->cr_ruid { int error = 0, unit = 0; SPL_INT(s); CURVNET_SET(TD_TO_VNET(p)); if (securelevel_ge(p->p_cred, 3) && (mode & FWRITE)) { V_ipfmain.ipf_interror = 130001; CURVNET_RESTORE(); return (EPERM); } if (jailed_without_vnet(p->p_cred)) { V_ipfmain.ipf_interror = 130018; CURVNET_RESTORE(); return (EOPNOTSUPP); } unit = GET_MINOR(dev); if ((IPL_LOGMAX < unit) || (unit < 0)) { V_ipfmain.ipf_interror = 130002; CURVNET_RESTORE(); return (ENXIO); } if (V_ipfmain.ipf_running <= 0) { if (unit != IPL_LOGIPF && cmd != SIOCIPFINTERROR) { V_ipfmain.ipf_interror = 130003; CURVNET_RESTORE(); return (EIO); } if (cmd != SIOCIPFGETNEXT && cmd != SIOCIPFGET && cmd != SIOCIPFSET && cmd != SIOCFRENB && cmd != SIOCGETFS && cmd != SIOCGETFF && cmd != SIOCIPFINTERROR) { V_ipfmain.ipf_interror = 130004; CURVNET_RESTORE(); return (EIO); } } SPL_NET(s); error = ipf_ioctlswitch(&V_ipfmain, unit, data, cmd, mode, p->p_uid, p); CURVNET_RESTORE(); if (error != -1) { SPL_X(s); return (error); } SPL_X(s); return (error); } /* * ipf_send_reset - this could conceivably be a call to tcp_respond(), but that * requires a large amount of setting up and isn't any more efficient. */ int ipf_send_reset(fr_info_t *fin) { struct tcphdr *tcp, *tcp2; int tlen = 0, hlen; struct mbuf *m; #ifdef USE_INET6 ip6_t *ip6; #endif ip_t *ip; tcp = fin->fin_dp; if (tcp->th_flags & TH_RST) return (-1); /* feedback loop */ if (ipf_checkl4sum(fin) == -1) return (-1); tlen = fin->fin_dlen - (TCP_OFF(tcp) << 2) + ((tcp->th_flags & TH_SYN) ? 1 : 0) + ((tcp->th_flags & TH_FIN) ? 1 : 0); #ifdef USE_INET6 hlen = (fin->fin_v == 6) ? 
sizeof(ip6_t) : sizeof(ip_t); #else hlen = sizeof(ip_t); #endif #ifdef MGETHDR MGETHDR(m, M_NOWAIT, MT_HEADER); #else MGET(m, M_NOWAIT, MT_HEADER); #endif if (m == NULL) return (-1); if (sizeof(*tcp2) + hlen > MLEN) { if (!(MCLGET(m, M_NOWAIT))) { FREE_MB_T(m); return (-1); } } m->m_len = sizeof(*tcp2) + hlen; m->m_data += max_linkhdr; m->m_pkthdr.len = m->m_len; m->m_pkthdr.rcvif = (struct ifnet *)0; ip = mtod(m, struct ip *); bzero((char *)ip, hlen); #ifdef USE_INET6 ip6 = (ip6_t *)ip; #endif tcp2 = (struct tcphdr *)((char *)ip + hlen); tcp2->th_sport = tcp->th_dport; tcp2->th_dport = tcp->th_sport; - if (tcp->th_flags & TH_ACK) { + if (tcp_get_flags(tcp) & TH_ACK) { tcp2->th_seq = tcp->th_ack; - tcp2->th_flags = TH_RST; + tcp_set_flags(tcp2, TH_RST); tcp2->th_ack = 0; } else { tcp2->th_seq = 0; tcp2->th_ack = ntohl(tcp->th_seq); tcp2->th_ack += tlen; tcp2->th_ack = htonl(tcp2->th_ack); - tcp2->th_flags = TH_RST|TH_ACK; + tcp_set_flags(tcp2, TH_RST|TH_ACK); } - TCP_X2_A(tcp2, 0); TCP_OFF_A(tcp2, sizeof(*tcp2) >> 2); tcp2->th_win = tcp->th_win; tcp2->th_sum = 0; tcp2->th_urp = 0; #ifdef USE_INET6 if (fin->fin_v == 6) { ip6->ip6_flow = ((ip6_t *)fin->fin_ip)->ip6_flow; ip6->ip6_plen = htons(sizeof(struct tcphdr)); ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_hlim = 0; ip6->ip6_src = fin->fin_dst6.in6; ip6->ip6_dst = fin->fin_src6.in6; tcp2->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(*ip6), sizeof(*tcp2)); return (ipf_send_ip(fin, m)); } #endif ip->ip_p = IPPROTO_TCP; ip->ip_len = htons(sizeof(struct tcphdr)); ip->ip_src.s_addr = fin->fin_daddr; ip->ip_dst.s_addr = fin->fin_saddr; tcp2->th_sum = in_cksum(m, hlen + sizeof(*tcp2)); ip->ip_len = htons(hlen + sizeof(*tcp2)); return (ipf_send_ip(fin, m)); } /* * ip_len must be in network byte order when called. */ static int ipf_send_ip(fr_info_t *fin, mb_t *m) { fr_info_t fnew; ip_t *ip, *oip; int hlen; ip = mtod(m, ip_t *); bzero((char *)&fnew, sizeof(fnew)); fnew.fin_main_soft = fin->fin_main_soft; IP_V_A(ip, fin->fin_v); switch (fin->fin_v) { case 4 : oip = fin->fin_ip; hlen = sizeof(*oip); fnew.fin_v = 4; fnew.fin_p = ip->ip_p; fnew.fin_plen = ntohs(ip->ip_len); IP_HL_A(ip, sizeof(*oip) >> 2); ip->ip_tos = oip->ip_tos; ip->ip_id = fin->fin_ip->ip_id; ip->ip_off = htons(V_path_mtu_discovery ? 
IP_DF : 0); ip->ip_ttl = V_ip_defttl; ip->ip_sum = 0; break; #ifdef USE_INET6 case 6 : { ip6_t *ip6 = (ip6_t *)ip; ip6->ip6_vfc = 0x60; ip6->ip6_hlim = IPDEFTTL; hlen = sizeof(*ip6); fnew.fin_p = ip6->ip6_nxt; fnew.fin_v = 6; fnew.fin_plen = ntohs(ip6->ip6_plen) + hlen; break; } #endif default : return (EINVAL); } #ifdef IPSEC_SUPPORT m->m_pkthdr.rcvif = NULL; #endif fnew.fin_ifp = fin->fin_ifp; fnew.fin_flx = FI_NOCKSUM; fnew.fin_m = m; fnew.fin_ip = ip; fnew.fin_mp = &m; fnew.fin_hlen = hlen; fnew.fin_dp = (char *)ip + hlen; (void) ipf_makefrip(hlen, ip, &fnew); return (ipf_fastroute(m, &m, &fnew, NULL)); } int ipf_send_icmp_err(int type, fr_info_t *fin, int dst) { int err, hlen, xtra, iclen, ohlen, avail, code; struct in_addr dst4; struct icmp *icmp; struct mbuf *m; i6addr_t dst6; void *ifp; #ifdef USE_INET6 ip6_t *ip6; #endif ip_t *ip, *ip2; if ((type < 0) || (type >= ICMP_MAXTYPE)) return (-1); code = fin->fin_icode; #ifdef USE_INET6 /* See NetBSD ip_fil_netbsd.c r1.4: */ if ((code < 0) || (code >= sizeof(icmptoicmp6unreach)/sizeof(int))) return (-1); #endif if (ipf_checkl4sum(fin) == -1) return (-1); #ifdef MGETHDR MGETHDR(m, M_NOWAIT, MT_HEADER); #else MGET(m, M_NOWAIT, MT_HEADER); #endif if (m == NULL) return (-1); avail = MHLEN; xtra = 0; hlen = 0; ohlen = 0; dst4.s_addr = 0; ifp = fin->fin_ifp; if (fin->fin_v == 4) { if ((fin->fin_p == IPPROTO_ICMP) && !(fin->fin_flx & FI_SHORT)) switch (ntohs(fin->fin_data[0]) >> 8) { case ICMP_ECHO : case ICMP_TSTAMP : case ICMP_IREQ : case ICMP_MASKREQ : break; default : FREE_MB_T(m); return (0); } if (dst == 0) { if (ipf_ifpaddr(&V_ipfmain, 4, FRI_NORMAL, ifp, &dst6, NULL) == -1) { FREE_MB_T(m); return (-1); } dst4 = dst6.in4; } else dst4.s_addr = fin->fin_daddr; hlen = sizeof(ip_t); ohlen = fin->fin_hlen; iclen = hlen + offsetof(struct icmp, icmp_ip) + ohlen; if (fin->fin_hlen < fin->fin_plen) xtra = MIN(fin->fin_dlen, 8); else xtra = 0; } #ifdef USE_INET6 else if (fin->fin_v == 6) { hlen = sizeof(ip6_t); ohlen = sizeof(ip6_t); iclen = hlen + offsetof(struct icmp, icmp_ip) + ohlen; type = icmptoicmp6types[type]; if (type == ICMP6_DST_UNREACH) code = icmptoicmp6unreach[code]; if (iclen + max_linkhdr + fin->fin_plen > avail) { if (!(MCLGET(m, M_NOWAIT))) { FREE_MB_T(m); return (-1); } avail = MCLBYTES; } xtra = MIN(fin->fin_plen, avail - iclen - max_linkhdr); xtra = MIN(xtra, IPV6_MMTU - iclen); if (dst == 0) { if (ipf_ifpaddr(&V_ipfmain, 6, FRI_NORMAL, ifp, &dst6, NULL) == -1) { FREE_MB_T(m); return (-1); } } else dst6 = fin->fin_dst6; } #endif else { FREE_MB_T(m); return (-1); } avail -= (max_linkhdr + iclen); if (avail < 0) { FREE_MB_T(m); return (-1); } if (xtra > avail) xtra = avail; iclen += xtra; m->m_data += max_linkhdr; m->m_pkthdr.rcvif = (struct ifnet *)0; m->m_pkthdr.len = iclen; m->m_len = iclen; ip = mtod(m, ip_t *); icmp = (struct icmp *)((char *)ip + hlen); ip2 = (ip_t *)&icmp->icmp_ip; icmp->icmp_type = type; icmp->icmp_code = fin->fin_icode; icmp->icmp_cksum = 0; #ifdef icmp_nextmtu if (type == ICMP_UNREACH && fin->fin_icode == ICMP_UNREACH_NEEDFRAG) { if (fin->fin_mtu != 0) { icmp->icmp_nextmtu = htons(fin->fin_mtu); } else if (ifp != NULL) { icmp->icmp_nextmtu = htons(GETIFMTU_4(ifp)); } else { /* make up a number... 
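 * neither fin_mtu nor an interface MTU is available here, so advertise the
 * original packet length minus a 20 byte IPv4 header as a plausible
 * next-hop MTU for the needfrag reply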
*/ icmp->icmp_nextmtu = htons(fin->fin_plen - 20); } } #endif bcopy((char *)fin->fin_ip, (char *)ip2, ohlen); #ifdef USE_INET6 ip6 = (ip6_t *)ip; if (fin->fin_v == 6) { ip6->ip6_flow = ((ip6_t *)fin->fin_ip)->ip6_flow; ip6->ip6_plen = htons(iclen - hlen); ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 0; ip6->ip6_src = dst6.in6; ip6->ip6_dst = fin->fin_src6.in6; if (xtra > 0) bcopy((char *)fin->fin_ip + ohlen, (char *)&icmp->icmp_ip + ohlen, xtra); icmp->icmp_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(*ip6), iclen - hlen); } else #endif { ip->ip_p = IPPROTO_ICMP; ip->ip_src.s_addr = dst4.s_addr; ip->ip_dst.s_addr = fin->fin_saddr; if (xtra > 0) bcopy((char *)fin->fin_ip + ohlen, (char *)&icmp->icmp_ip + ohlen, xtra); icmp->icmp_cksum = ipf_cksum((u_short *)icmp, sizeof(*icmp) + 8); ip->ip_len = htons(iclen); ip->ip_p = IPPROTO_ICMP; } err = ipf_send_ip(fin, m); return (err); } /* * m0 - pointer to mbuf where the IP packet starts * mpp - pointer to the mbuf pointer that is the start of the mbuf chain */ int ipf_fastroute(mb_t *m0, mb_t **mpp, fr_info_t *fin, frdest_t *fdp) { register struct ip *ip, *mhip; register struct mbuf *m = *mpp; int len, off, error = 0, hlen, code; struct ifnet *ifp, *sifp; struct route ro; struct sockaddr_in *dst; const struct sockaddr *gw; struct nhop_object *nh; u_long fibnum = 0; u_short ip_off; frdest_t node; frentry_t *fr; #ifdef M_WRITABLE /* * HOT FIX/KLUDGE: * * If the mbuf we're about to send is not writable (because of * a cluster reference, for example) we'll need to make a copy * of it since this routine modifies the contents. * * If you have non-crappy network hardware that can transmit data * from the mbuf, rather than making a copy, this is gonna be a * problem. */ if (M_WRITABLE(m) == 0) { m0 = m_dup(m, M_NOWAIT); if (m0 != NULL) { FREE_MB_T(m); m = m0; *mpp = m; } else { error = ENOBUFS; FREE_MB_T(m); goto done; } } #endif #ifdef USE_INET6 if (fin->fin_v == 6) { /* * currently "to " and "to :ip#" are not supported * for IPv6 */ return (ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL)); } #endif hlen = fin->fin_hlen; ip = mtod(m0, struct ip *); ifp = NULL; /* * Route packet. */ bzero(&ro, sizeof (ro)); dst = (struct sockaddr_in *)&ro.ro_dst; dst->sin_family = AF_INET; dst->sin_addr = ip->ip_dst; dst->sin_len = sizeof(dst); gw = (const struct sockaddr *)dst; fr = fin->fin_fr; if ((fr != NULL) && !(fr->fr_flags & FR_KEEPSTATE) && (fdp != NULL) && (fdp->fd_type == FRD_DSTLIST)) { if (ipf_dstlist_select_node(fin, fdp->fd_ptr, NULL, &node) == 0) fdp = &node; } if (fdp != NULL) ifp = fdp->fd_ptr; else ifp = fin->fin_ifp; if ((ifp == NULL) && ((fr == NULL) || !(fr->fr_flags & FR_FASTROUTE))) { error = -2; goto bad; } if ((fdp != NULL) && (fdp->fd_ip.s_addr != 0)) dst->sin_addr = fdp->fd_ip; fibnum = M_GETFIB(m0); NET_EPOCH_ASSERT(); nh = fib4_lookup(fibnum, dst->sin_addr, 0, NHR_NONE, 0); if (nh == NULL) { if (in_localaddr(ip->ip_dst)) error = EHOSTUNREACH; else error = ENETUNREACH; goto bad; } if (ifp == NULL) ifp = nh->nh_ifp; if (nh->nh_flags & NHF_GATEWAY) { gw = &nh->gw_sa; ro.ro_flags |= RT_HAS_GW; } /* * For input packets which are being "fastrouted", they won't * go back through output filtering and miss their chance to get * NAT'd and counted. Duplicated packets aren't considered to be * part of the normal packet stream, so do not NAT them or pass * them through stateful checking, etc. 
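 * To give them that chance, inbound packets that are not duplicates are
 * briefly treated as outbound below: fin_out is set, accounting, the state
 * table check and ipf_nat_checkout() are run against the destination
 * interface, and the original interface and direction are restored before
 * the packet is transmitted.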
*/ if ((fdp != &fr->fr_dif) && (fin->fin_out == 0)) { sifp = fin->fin_ifp; fin->fin_ifp = ifp; fin->fin_out = 1; (void) ipf_acctpkt(fin, NULL); fin->fin_fr = NULL; if (!fr || !(fr->fr_flags & FR_RETMASK)) { u_32_t pass; (void) ipf_state_check(fin, &pass); } switch (ipf_nat_checkout(fin, NULL)) { case 0 : break; case 1 : ip->ip_sum = 0; break; case -1 : error = -1; goto bad; break; } fin->fin_ifp = sifp; fin->fin_out = 0; } else ip->ip_sum = 0; /* * If small enough for interface, can just send directly. */ if (ntohs(ip->ip_len) <= ifp->if_mtu) { if (!ip->ip_sum) ip->ip_sum = in_cksum(m, hlen); error = (*ifp->if_output)(ifp, m, gw, &ro); goto done; } /* * Too large for interface; fragment if possible. * Must be able to put at least 8 bytes per fragment. */ ip_off = ntohs(ip->ip_off); if (ip_off & IP_DF) { error = EMSGSIZE; goto bad; } len = (ifp->if_mtu - hlen) &~ 7; if (len < 8) { error = EMSGSIZE; goto bad; } { int mhlen, firstlen = len; struct mbuf **mnext = &m->m_act; /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto chain. */ m0 = m; mhlen = sizeof (struct ip); for (off = hlen + len; off < ntohs(ip->ip_len); off += len) { #ifdef MGETHDR MGETHDR(m, M_NOWAIT, MT_HEADER); #else MGET(m, M_NOWAIT, MT_HEADER); #endif if (m == NULL) { m = m0; error = ENOBUFS; goto bad; } m->m_data += max_linkhdr; mhip = mtod(m, struct ip *); bcopy((char *)ip, (char *)mhip, sizeof(*ip)); if (hlen > sizeof (struct ip)) { mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); IP_HL_A(mhip, mhlen >> 2); } m->m_len = mhlen; mhip->ip_off = ((off - hlen) >> 3) + ip_off; if (off + len >= ntohs(ip->ip_len)) len = ntohs(ip->ip_len) - off; else mhip->ip_off |= IP_MF; mhip->ip_len = htons((u_short)(len + mhlen)); *mnext = m; m->m_next = m_copym(m0, off, len, M_NOWAIT); if (m->m_next == 0) { error = ENOBUFS; /* ??? */ goto sendorfree; } m->m_pkthdr.len = mhlen + len; m->m_pkthdr.rcvif = NULL; mhip->ip_off = htons((u_short)mhip->ip_off); mhip->ip_sum = 0; mhip->ip_sum = in_cksum(m, mhlen); mnext = &m->m_act; } /* * Update first fragment by trimming what's been copied out * and updating header, then send each fragment (in order). 
*/ m_adj(m0, hlen + firstlen - ip->ip_len); ip->ip_len = htons((u_short)(hlen + firstlen)); ip->ip_off = htons((u_short)IP_MF); ip->ip_sum = 0; ip->ip_sum = in_cksum(m0, hlen); sendorfree: for (m = m0; m; m = m0) { m0 = m->m_act; m->m_act = 0; if (error == 0) error = (*ifp->if_output)(ifp, m, gw, &ro); else FREE_MB_T(m); } } done: if (!error) V_ipfmain.ipf_frouteok[0]++; else V_ipfmain.ipf_frouteok[1]++; return (0); bad: if (error == EMSGSIZE) { sifp = fin->fin_ifp; code = fin->fin_icode; fin->fin_icode = ICMP_UNREACH_NEEDFRAG; fin->fin_ifp = ifp; (void) ipf_send_icmp_err(ICMP_UNREACH, fin, 1); fin->fin_ifp = sifp; fin->fin_icode = code; } FREE_MB_T(m); goto done; } int ipf_verifysrc(fr_info_t *fin) { struct nhop_object *nh; NET_EPOCH_ASSERT(); nh = fib4_lookup(RT_DEFAULT_FIB, fin->fin_src, 0, NHR_NONE, 0); if (nh == NULL) return (0); return (fin->fin_ifp == nh->nh_ifp); } /* * return the first IP Address associated with an interface */ int ipf_ifpaddr(ipf_main_softc_t *softc, int v, int atype, void *ifptr, i6addr_t *inp, i6addr_t *inpmask) { #ifdef USE_INET6 struct in6_addr *ia6 = NULL; #endif struct sockaddr *sock, *mask; struct sockaddr_in *sin; struct ifaddr *ifa; struct ifnet *ifp; if ((ifptr == NULL) || (ifptr == (void *)-1)) return (-1); sin = NULL; ifp = ifptr; if (v == 4) inp->in4.s_addr = 0; #ifdef USE_INET6 else if (v == 6) bzero((char *)inp, sizeof(*inp)); #endif ifa = CK_STAILQ_FIRST(&ifp->if_addrhead); sock = ifa->ifa_addr; while (sock != NULL && ifa != NULL) { sin = (struct sockaddr_in *)sock; if ((v == 4) && (sin->sin_family == AF_INET)) break; #ifdef USE_INET6 if ((v == 6) && (sin->sin_family == AF_INET6)) { ia6 = &((struct sockaddr_in6 *)sin)->sin6_addr; if (!IN6_IS_ADDR_LINKLOCAL(ia6) && !IN6_IS_ADDR_LOOPBACK(ia6)) break; } #endif ifa = CK_STAILQ_NEXT(ifa, ifa_link); if (ifa != NULL) sock = ifa->ifa_addr; } if (ifa == NULL || sin == NULL) return (-1); mask = ifa->ifa_netmask; if (atype == FRI_BROADCAST) sock = ifa->ifa_broadaddr; else if (atype == FRI_PEERADDR) sock = ifa->ifa_dstaddr; if (sock == NULL) return (-1); #ifdef USE_INET6 if (v == 6) { return (ipf_ifpfillv6addr(atype, (struct sockaddr_in6 *)sock, (struct sockaddr_in6 *)mask, inp, inpmask)); } #endif return (ipf_ifpfillv4addr(atype, (struct sockaddr_in *)sock, (struct sockaddr_in *)mask, &inp->in4, &inpmask->in4)); } u_32_t ipf_newisn(fr_info_t *fin) { u_32_t newiss; newiss = arc4random(); return (newiss); } int ipf_checkv4sum(fr_info_t *fin) { #ifdef CSUM_DATA_VALID int manual = 0; u_short sum; ip_t *ip; mb_t *m; if ((fin->fin_flx & FI_NOCKSUM) != 0) return (0); if ((fin->fin_flx & FI_SHORT) != 0) return (1); if (fin->fin_cksum != FI_CK_NEEDED) return (fin->fin_cksum > FI_CK_NEEDED) ? 
0 : -1; m = fin->fin_m; if (m == NULL) { manual = 1; goto skipauto; } ip = fin->fin_ip; if ((m->m_pkthdr.csum_flags & (CSUM_IP_CHECKED|CSUM_IP_VALID)) == CSUM_IP_CHECKED) { fin->fin_cksum = FI_CK_BAD; fin->fin_flx |= FI_BAD; DT2(ipf_fi_bad_checkv4sum_csum_ip_checked, fr_info_t *, fin, u_int, m->m_pkthdr.csum_flags & (CSUM_IP_CHECKED|CSUM_IP_VALID)); return (-1); } if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { /* Depending on the driver, UDP may have zero checksum */ if (fin->fin_p == IPPROTO_UDP && (fin->fin_flx & (FI_FRAG|FI_SHORT|FI_BAD)) == 0) { udphdr_t *udp = fin->fin_dp; if (udp->uh_sum == 0) { /* * we're good no matter what the hardware * checksum flags and csum_data say (handling * of csum_data for zero UDP checksum is not * consistent across all drivers) */ fin->fin_cksum = 1; return (0); } } if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) sum = m->m_pkthdr.csum_data; else sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data + fin->fin_dlen + fin->fin_p)); sum ^= 0xffff; if (sum != 0) { fin->fin_cksum = FI_CK_BAD; fin->fin_flx |= FI_BAD; DT2(ipf_fi_bad_checkv4sum_sum, fr_info_t *, fin, u_int, sum); } else { fin->fin_cksum = FI_CK_SUMOK; return (0); } } else { if (m->m_pkthdr.csum_flags == CSUM_DELAY_DATA) { fin->fin_cksum = FI_CK_L4FULL; return (0); } else if (m->m_pkthdr.csum_flags == CSUM_TCP || m->m_pkthdr.csum_flags == CSUM_UDP || m->m_pkthdr.csum_flags == CSUM_IP) { fin->fin_cksum = FI_CK_L4PART; return (0); } else { manual = 1; } } skipauto: if (manual != 0) { if (ipf_checkl4sum(fin) == -1) { fin->fin_flx |= FI_BAD; DT2(ipf_fi_bad_checkv4sum_manual, fr_info_t *, fin, u_int, manual); return (-1); } } #else if (ipf_checkl4sum(fin) == -1) { fin->fin_flx |= FI_BAD; DT2(ipf_fi_bad_checkv4sum_checkl4sum, fr_info_t *, fin, u_int, -1); return (-1); } #endif return (0); } #ifdef USE_INET6 int ipf_checkv6sum(fr_info_t *fin) { if ((fin->fin_flx & FI_NOCKSUM) != 0) { DT(ipf_checkv6sum_fi_nocksum); return (0); } if ((fin->fin_flx & FI_SHORT) != 0) { DT(ipf_checkv6sum_fi_short); return (1); } if (fin->fin_cksum != FI_CK_NEEDED) { DT(ipf_checkv6sum_fi_ck_needed); return (fin->fin_cksum > FI_CK_NEEDED) ? 0 : -1; } if (ipf_checkl4sum(fin) == -1) { fin->fin_flx |= FI_BAD; DT2(ipf_fi_bad_checkv6sum_checkl4sum, fr_info_t *, fin, u_int, -1); return (-1); } return (0); } #endif /* USE_INET6 */ size_t mbufchainlen(struct mbuf *m0) { size_t len; if ((m0->m_flags & M_PKTHDR) != 0) { len = m0->m_pkthdr.len; } else { struct mbuf *m; for (m = m0, len = 0; m != NULL; m = m->m_next) len += m->m_len; } return (len); } /* ------------------------------------------------------------------------ */ /* Function: ipf_pullup */ /* Returns: NULL == pullup failed, else pointer to protocol header */ /* Parameters: xmin(I)- pointer to buffer where data packet starts */ /* fin(I) - pointer to packet information */ /* len(I) - number of bytes to pullup */ /* */ /* Attempt to move at least len bytes (from the start of the buffer) into a */ /* single buffer for ease of access. Operating system native functions are */ /* used to manage buffers - if necessary. If the entire packet ends up in */ /* a single buffer, set the FI_COALESCE flag even though ipf_coalesce() has */ /* not been called. Both fin_ip and fin_dp are updated before exiting _IF_ */ /* and ONLY if the pullup succeeds. */ /* */ /* We assume that 'xmin' is a pointer to a buffer that is part of the chain */ /* of buffers that starts at *fin->fin_mp. 
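 If the pullup fails, the remaining mbuf chain is released and both
 *fin->fin_mp and fin_m are set to NULL, so the caller sees the packet
 as having been dropped.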
*/ /* ------------------------------------------------------------------------ */ ip_t * ipf_pullup(mb_t *xmin, fr_info_t *fin, int len) { int dpoff, ipoff; mb_t *m = xmin; ip_t *ip; if (m == NULL) return (NULL); ip = fin->fin_ip; if ((fin->fin_flx & FI_COALESCE) != 0) return (ip); ipoff = fin->fin_ipoff; if (fin->fin_dp != NULL) dpoff = (char *)fin->fin_dp - (char *)ip; else dpoff = 0; if (M_LEN(m) < len) { mb_t *n = *fin->fin_mp; /* * Assume that M_PKTHDR is set and just work with what is left * rather than check.. * Should not make any real difference, anyway. */ if (m != n) { /* * Record the mbuf that points to the mbuf that we're * about to go to work on so that we can update the * m_next appropriately later. */ for (; n->m_next != m; n = n->m_next) ; } else { n = NULL; } #ifdef MHLEN if (len > MHLEN) #else if (len > MLEN) #endif { #ifdef HAVE_M_PULLDOWN if (m_pulldown(m, 0, len, NULL) == NULL) m = NULL; #else FREE_MB_T(*fin->fin_mp); m = NULL; n = NULL; #endif } else { m = m_pullup(m, len); } if (n != NULL) n->m_next = m; if (m == NULL) { /* * When n is non-NULL, it indicates that m pointed to * a sub-chain (tail) of the mbuf and that the head * of this chain has not yet been free'd. */ if (n != NULL) { FREE_MB_T(*fin->fin_mp); } *fin->fin_mp = NULL; fin->fin_m = NULL; return (NULL); } if (n == NULL) *fin->fin_mp = m; while (M_LEN(m) == 0) { m = m->m_next; } fin->fin_m = m; ip = MTOD(m, ip_t *) + ipoff; fin->fin_ip = ip; if (fin->fin_dp != NULL) fin->fin_dp = (char *)fin->fin_ip + dpoff; if (fin->fin_fraghdr != NULL) fin->fin_fraghdr = (char *)ip + ((char *)fin->fin_fraghdr - (char *)fin->fin_ip); } if (len == fin->fin_plen) fin->fin_flx |= FI_COALESCE; return (ip); } int ipf_inject(fr_info_t *fin, mb_t *m) { struct epoch_tracker et; int error = 0; NET_EPOCH_ENTER(et); if (fin->fin_out == 0) { netisr_dispatch(NETISR_IP, m); } else { fin->fin_ip->ip_len = ntohs(fin->fin_ip->ip_len); fin->fin_ip->ip_off = ntohs(fin->fin_ip->ip_off); error = ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL); } NET_EPOCH_EXIT(et); return (error); } VNET_DEFINE_STATIC(pfil_hook_t, ipf_inet_hook); VNET_DEFINE_STATIC(pfil_hook_t, ipf_inet6_hook); #define V_ipf_inet_hook VNET(ipf_inet_hook) #define V_ipf_inet6_hook VNET(ipf_inet6_hook) int ipf_pfil_unhook(void) { pfil_remove_hook(V_ipf_inet_hook); #ifdef USE_INET6 pfil_remove_hook(V_ipf_inet6_hook); #endif return (0); } int ipf_pfil_hook(void) { int error, error6; struct pfil_hook_args pha = { .pa_version = PFIL_VERSION, .pa_flags = PFIL_IN | PFIL_OUT, .pa_modname = "ipfilter", .pa_rulname = "default-ip4", .pa_mbuf_chk = ipf_check_wrapper, .pa_type = PFIL_TYPE_IP4, }; V_ipf_inet_hook = pfil_add_hook(&pha); #ifdef USE_INET6 pha.pa_rulname = "default-ip6"; pha.pa_mbuf_chk = ipf_check_wrapper6; pha.pa_type = PFIL_TYPE_IP6; V_ipf_inet6_hook = pfil_add_hook(&pha); #endif struct pfil_link_args pla = { .pa_version = PFIL_VERSION, .pa_flags = PFIL_IN | PFIL_OUT | PFIL_HEADPTR | PFIL_HOOKPTR, .pa_head = V_inet_pfil_head, .pa_hook = V_ipf_inet_hook, }; error = pfil_link(&pla); error6 = 0; #ifdef USE_INET6 pla.pa_head = V_inet6_pfil_head; pla.pa_hook = V_ipf_inet6_hook; error6 = pfil_link(&pla); #endif if (error || error6) error = ENODEV; else error = 0; return (error); } void ipf_event_reg(void) { V_ipf_arrivetag = EVENTHANDLER_REGISTER(ifnet_arrival_event, \ ipf_ifevent, NULL, \ EVENTHANDLER_PRI_ANY); V_ipf_departtag = EVENTHANDLER_REGISTER(ifnet_departure_event, \ ipf_ifevent, NULL, \ EVENTHANDLER_PRI_ANY); #if 0 V_ipf_clonetag = 
EVENTHANDLER_REGISTER(if_clone_event, ipf_ifevent, \ NULL, EVENTHANDLER_PRI_ANY); #endif } void ipf_event_dereg(void) { if (V_ipf_arrivetag != NULL) { EVENTHANDLER_DEREGISTER(ifnet_arrival_event, V_ipf_arrivetag); } if (V_ipf_departtag != NULL) { EVENTHANDLER_DEREGISTER(ifnet_departure_event, V_ipf_departtag); } #if 0 if (V_ipf_clonetag != NULL) { EVENTHANDLER_DEREGISTER(if_clone_event, V_ipf_clonetag); } #endif } u_32_t ipf_random(void) { return (arc4random()); } u_int ipf_pcksum(fr_info_t *fin, int hlen, u_int sum) { struct mbuf *m; u_int sum2; int off; m = fin->fin_m; off = (char *)fin->fin_dp - (char *)fin->fin_ip; m->m_data += hlen; m->m_len -= hlen; sum2 = in_cksum(fin->fin_m, fin->fin_plen - off); m->m_len += hlen; m->m_data -= hlen; /* * Both sum and sum2 are partial sums, so combine them together. */ sum += ~sum2 & 0xffff; while (sum > 0xffff) sum = (sum & 0xffff) + (sum >> 16); sum2 = ~sum & 0xffff; return (sum2); } #ifdef USE_INET6 u_int ipf_pcksum6(struct mbuf *m, ip6_t *ip6, u_int32_t off, u_int32_t len) { #ifdef _KERNEL int sum; if (m->m_len < sizeof(struct ip6_hdr)) { return (0xffff); } sum = in6_cksum(m, ip6->ip6_nxt, off, len); return (sum); #else u_short *sp; u_int sum; sp = (u_short *)&ip6->ip6_src; sum = *sp++; /* ip6_src */ sum += *sp++; sum += *sp++; sum += *sp++; sum += *sp++; sum += *sp++; sum += *sp++; sum += *sp++; sum += *sp++; /* ip6_dst */ sum += *sp++; sum += *sp++; sum += *sp++; sum += *sp++; sum += *sp++; sum += *sp++; sum += *sp++; return (ipf_pcksum(fin, off, sum)); #endif } #endif void ipf_fbsd_kenv_get(ipf_main_softc_t *softc) { TUNABLE_INT_FETCH("net.inet.ipf.large_nat", &softc->ipf_large_nat); } diff --git a/sys/netpfil/pf/pf_norm.c b/sys/netpfil/pf/pf_norm.c index a92462c53f15..a119d85f806e 100644 --- a/sys/netpfil/pf/pf_norm.c +++ b/sys/netpfil/pf/pf_norm.c @@ -1,2320 +1,2321 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright 2001 Niels Provos * Copyright 2011-2018 Alexander Bluhm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* * $OpenBSD: pf_norm.c,v 1.114 2009/01/29 14:11:45 henning Exp $ */ #include #include "opt_inet.h" #include "opt_inet6.h" #include "opt_pf.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif /* INET6 */ struct pf_frent { TAILQ_ENTRY(pf_frent) fr_next; struct mbuf *fe_m; uint16_t fe_hdrlen; /* ipv4 header length with ip options ipv6, extension, fragment header */ uint16_t fe_extoff; /* last extension header offset or 0 */ uint16_t fe_len; /* fragment length */ uint16_t fe_off; /* fragment offset */ uint16_t fe_mff; /* more fragment flag */ }; struct pf_fragment_cmp { struct pf_addr frc_src; struct pf_addr frc_dst; uint32_t frc_id; sa_family_t frc_af; uint8_t frc_proto; }; struct pf_fragment { struct pf_fragment_cmp fr_key; #define fr_src fr_key.frc_src #define fr_dst fr_key.frc_dst #define fr_id fr_key.frc_id #define fr_af fr_key.frc_af #define fr_proto fr_key.frc_proto /* pointers to queue element */ struct pf_frent *fr_firstoff[PF_FRAG_ENTRY_POINTS]; /* count entries between pointers */ uint8_t fr_entries[PF_FRAG_ENTRY_POINTS]; RB_ENTRY(pf_fragment) fr_entry; TAILQ_ENTRY(pf_fragment) frag_next; uint32_t fr_timeout; uint16_t fr_maxlen; /* maximum length of single fragment */ u_int16_t fr_holes; /* number of holes in the queue */ TAILQ_HEAD(pf_fragq, pf_frent) fr_queue; }; struct pf_fragment_tag { uint16_t ft_hdrlen; /* header length of reassembled pkt */ uint16_t ft_extoff; /* last extension header offset or 0 */ uint16_t ft_maxlen; /* maximum fragment payload length */ uint32_t ft_id; /* fragment id */ }; VNET_DEFINE_STATIC(struct mtx, pf_frag_mtx); #define V_pf_frag_mtx VNET(pf_frag_mtx) #define PF_FRAG_LOCK() mtx_lock(&V_pf_frag_mtx) #define PF_FRAG_UNLOCK() mtx_unlock(&V_pf_frag_mtx) #define PF_FRAG_ASSERT() mtx_assert(&V_pf_frag_mtx, MA_OWNED) VNET_DEFINE(uma_zone_t, pf_state_scrub_z); /* XXX: shared with pfsync */ VNET_DEFINE_STATIC(uma_zone_t, pf_frent_z); #define V_pf_frent_z VNET(pf_frent_z) VNET_DEFINE_STATIC(uma_zone_t, pf_frag_z); #define V_pf_frag_z VNET(pf_frag_z) TAILQ_HEAD(pf_fragqueue, pf_fragment); TAILQ_HEAD(pf_cachequeue, pf_fragment); VNET_DEFINE_STATIC(struct pf_fragqueue, pf_fragqueue); #define V_pf_fragqueue VNET(pf_fragqueue) RB_HEAD(pf_frag_tree, pf_fragment); VNET_DEFINE_STATIC(struct pf_frag_tree, pf_frag_tree); #define V_pf_frag_tree VNET(pf_frag_tree) static int pf_frag_compare(struct pf_fragment *, struct pf_fragment *); static RB_PROTOTYPE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare); static RB_GENERATE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare); static void pf_flush_fragments(void); static void pf_free_fragment(struct pf_fragment *); static void pf_remove_fragment(struct pf_fragment *); static struct pf_frent *pf_create_fragment(u_short *); static int pf_frent_holes(struct pf_frent *frent); static struct pf_fragment *pf_find_fragment(struct pf_fragment_cmp *key, struct pf_frag_tree *tree); static inline int pf_frent_index(struct pf_frent *); static int pf_frent_insert(struct pf_fragment *, struct pf_frent *, struct pf_frent *); void pf_frent_remove(struct pf_fragment *, struct pf_frent *); struct pf_frent *pf_frent_previous(struct pf_fragment *, struct pf_frent *); static struct pf_fragment *pf_fillup_fragment(struct pf_fragment_cmp *, struct pf_frent *, u_short *); static struct mbuf *pf_join_fragment(struct pf_fragment *); #ifdef INET static int pf_reassemble(struct 
mbuf **, struct ip *, int, u_short *); #endif /* INET */ #ifdef INET6 static int pf_reassemble6(struct mbuf **, struct ip6_hdr *, struct ip6_frag *, uint16_t, uint16_t, u_short *); #endif /* INET6 */ #define DPFPRINTF(x) do { \ if (V_pf_status.debug >= PF_DEBUG_MISC) { \ printf("%s: ", __func__); \ printf x ; \ } \ } while(0) #ifdef INET static void pf_ip2key(struct ip *ip, int dir, struct pf_fragment_cmp *key) { key->frc_src.v4 = ip->ip_src; key->frc_dst.v4 = ip->ip_dst; key->frc_af = AF_INET; key->frc_proto = ip->ip_p; key->frc_id = ip->ip_id; } #endif /* INET */ void pf_normalize_init(void) { V_pf_frag_z = uma_zcreate("pf frags", sizeof(struct pf_fragment), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); V_pf_frent_z = uma_zcreate("pf frag entries", sizeof(struct pf_frent), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); V_pf_state_scrub_z = uma_zcreate("pf state scrubs", sizeof(struct pf_state_scrub), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mtx_init(&V_pf_frag_mtx, "pf fragments", NULL, MTX_DEF); V_pf_limits[PF_LIMIT_FRAGS].zone = V_pf_frent_z; V_pf_limits[PF_LIMIT_FRAGS].limit = PFFRAG_FRENT_HIWAT; uma_zone_set_max(V_pf_frent_z, PFFRAG_FRENT_HIWAT); uma_zone_set_warning(V_pf_frent_z, "PF frag entries limit reached"); TAILQ_INIT(&V_pf_fragqueue); } void pf_normalize_cleanup(void) { uma_zdestroy(V_pf_state_scrub_z); uma_zdestroy(V_pf_frent_z); uma_zdestroy(V_pf_frag_z); mtx_destroy(&V_pf_frag_mtx); } static int pf_frag_compare(struct pf_fragment *a, struct pf_fragment *b) { int diff; if ((diff = a->fr_id - b->fr_id) != 0) return (diff); if ((diff = a->fr_proto - b->fr_proto) != 0) return (diff); if ((diff = a->fr_af - b->fr_af) != 0) return (diff); if ((diff = pf_addr_cmp(&a->fr_src, &b->fr_src, a->fr_af)) != 0) return (diff); if ((diff = pf_addr_cmp(&a->fr_dst, &b->fr_dst, a->fr_af)) != 0) return (diff); return (0); } void pf_purge_expired_fragments(void) { u_int32_t expire = time_uptime - V_pf_default_rule.timeout[PFTM_FRAG]; pf_purge_fragments(expire); } void pf_purge_fragments(uint32_t expire) { struct pf_fragment *frag; PF_FRAG_LOCK(); while ((frag = TAILQ_LAST(&V_pf_fragqueue, pf_fragqueue)) != NULL) { if (frag->fr_timeout > expire) break; DPFPRINTF(("expiring %d(%p)\n", frag->fr_id, frag)); pf_free_fragment(frag); } PF_FRAG_UNLOCK(); } /* * Try to flush old fragments to make space for new ones */ static void pf_flush_fragments(void) { struct pf_fragment *frag; int goal; PF_FRAG_ASSERT(); goal = uma_zone_get_cur(V_pf_frent_z) * 9 / 10; DPFPRINTF(("trying to free %d frag entriess\n", goal)); while (goal < uma_zone_get_cur(V_pf_frent_z)) { frag = TAILQ_LAST(&V_pf_fragqueue, pf_fragqueue); if (frag) pf_free_fragment(frag); else break; } } /* Frees the fragments and all associated entries */ static void pf_free_fragment(struct pf_fragment *frag) { struct pf_frent *frent; PF_FRAG_ASSERT(); /* Free all fragments */ for (frent = TAILQ_FIRST(&frag->fr_queue); frent; frent = TAILQ_FIRST(&frag->fr_queue)) { TAILQ_REMOVE(&frag->fr_queue, frent, fr_next); m_freem(frent->fe_m); uma_zfree(V_pf_frent_z, frent); } pf_remove_fragment(frag); } static struct pf_fragment * pf_find_fragment(struct pf_fragment_cmp *key, struct pf_frag_tree *tree) { struct pf_fragment *frag; PF_FRAG_ASSERT(); frag = RB_FIND(pf_frag_tree, tree, (struct pf_fragment *)key); if (frag != NULL) { /* XXX Are we sure we want to update the timeout? 
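 * (Aside: refreshing fr_timeout and re-inserting the fragment at the head
 * of V_pf_fragqueue keeps an actively used reassembly from expiring, since
 * pf_purge_fragments() walks the queue from the tail and stops at the
 * first entry that is still recent enough.)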
*/ frag->fr_timeout = time_uptime; TAILQ_REMOVE(&V_pf_fragqueue, frag, frag_next); TAILQ_INSERT_HEAD(&V_pf_fragqueue, frag, frag_next); } return (frag); } /* Removes a fragment from the fragment queue and frees the fragment */ static void pf_remove_fragment(struct pf_fragment *frag) { PF_FRAG_ASSERT(); KASSERT(frag, ("frag != NULL")); RB_REMOVE(pf_frag_tree, &V_pf_frag_tree, frag); TAILQ_REMOVE(&V_pf_fragqueue, frag, frag_next); uma_zfree(V_pf_frag_z, frag); } static struct pf_frent * pf_create_fragment(u_short *reason) { struct pf_frent *frent; PF_FRAG_ASSERT(); frent = uma_zalloc(V_pf_frent_z, M_NOWAIT); if (frent == NULL) { pf_flush_fragments(); frent = uma_zalloc(V_pf_frent_z, M_NOWAIT); if (frent == NULL) { REASON_SET(reason, PFRES_MEMORY); return (NULL); } } return (frent); } /* * Calculate the additional holes that were created in the fragment * queue by inserting this fragment. A fragment in the middle * creates one more hole by splitting. For each connected side, * it loses one hole. * Fragment entry must be in the queue when calling this function. */ static int pf_frent_holes(struct pf_frent *frent) { struct pf_frent *prev = TAILQ_PREV(frent, pf_fragq, fr_next); struct pf_frent *next = TAILQ_NEXT(frent, fr_next); int holes = 1; if (prev == NULL) { if (frent->fe_off == 0) holes--; } else { KASSERT(frent->fe_off != 0, ("frent->fe_off != 0")); if (frent->fe_off == prev->fe_off + prev->fe_len) holes--; } if (next == NULL) { if (!frent->fe_mff) holes--; } else { KASSERT(frent->fe_mff, ("frent->fe_mff")); if (next->fe_off == frent->fe_off + frent->fe_len) holes--; } return holes; } static inline int pf_frent_index(struct pf_frent *frent) { /* * We have an array of 16 entry points to the queue. A full size * 65535 octet IP packet can have 8192 fragments. So the queue * traversal length is at most 512 and at most 16 entry points are * checked. We need 128 additional bytes on a 64 bit architecture. */ CTASSERT(((u_int16_t)0xffff &~ 7) / (0x10000 / PF_FRAG_ENTRY_POINTS) == 16 - 1); CTASSERT(((u_int16_t)0xffff >> 3) / PF_FRAG_ENTRY_POINTS == 512 - 1); return frent->fe_off / (0x10000 / PF_FRAG_ENTRY_POINTS); } static int pf_frent_insert(struct pf_fragment *frag, struct pf_frent *frent, struct pf_frent *prev) { int index; CTASSERT(PF_FRAG_ENTRY_LIMIT <= 0xff); /* * A packet has at most 65536 octets. With 16 entry points, each one * spawns 4096 octets. We limit these to 64 fragments each, which * means on average every fragment must have at least 64 octets. 
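 *
 * Worked out from the figures above: 65536 octets / 16 entry points =
 * 4096 octets covered per entry point; with at most PF_FRAG_ENTRY_LIMIT
 * (64, per the text above) fragments per entry point, the average
 * fragment must therefore carry at least 4096 / 64 = 64 octets.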
*/ index = pf_frent_index(frent); if (frag->fr_entries[index] >= PF_FRAG_ENTRY_LIMIT) return ENOBUFS; frag->fr_entries[index]++; if (prev == NULL) { TAILQ_INSERT_HEAD(&frag->fr_queue, frent, fr_next); } else { KASSERT(prev->fe_off + prev->fe_len <= frent->fe_off, ("overlapping fragment")); TAILQ_INSERT_AFTER(&frag->fr_queue, prev, frent, fr_next); } if (frag->fr_firstoff[index] == NULL) { KASSERT(prev == NULL || pf_frent_index(prev) < index, ("prev == NULL || pf_frent_index(pref) < index")); frag->fr_firstoff[index] = frent; } else { if (frent->fe_off < frag->fr_firstoff[index]->fe_off) { KASSERT(prev == NULL || pf_frent_index(prev) < index, ("prev == NULL || pf_frent_index(pref) < index")); frag->fr_firstoff[index] = frent; } else { KASSERT(prev != NULL, ("prev != NULL")); KASSERT(pf_frent_index(prev) == index, ("pf_frent_index(prev) == index")); } } frag->fr_holes += pf_frent_holes(frent); return 0; } void pf_frent_remove(struct pf_fragment *frag, struct pf_frent *frent) { #ifdef INVARIANTS struct pf_frent *prev = TAILQ_PREV(frent, pf_fragq, fr_next); #endif struct pf_frent *next = TAILQ_NEXT(frent, fr_next); int index; frag->fr_holes -= pf_frent_holes(frent); index = pf_frent_index(frent); KASSERT(frag->fr_firstoff[index] != NULL, ("frent not found")); if (frag->fr_firstoff[index]->fe_off == frent->fe_off) { if (next == NULL) { frag->fr_firstoff[index] = NULL; } else { KASSERT(frent->fe_off + frent->fe_len <= next->fe_off, ("overlapping fragment")); if (pf_frent_index(next) == index) { frag->fr_firstoff[index] = next; } else { frag->fr_firstoff[index] = NULL; } } } else { KASSERT(frag->fr_firstoff[index]->fe_off < frent->fe_off, ("frag->fr_firstoff[index]->fe_off < frent->fe_off")); KASSERT(prev != NULL, ("prev != NULL")); KASSERT(prev->fe_off + prev->fe_len <= frent->fe_off, ("overlapping fragment")); KASSERT(pf_frent_index(prev) == index, ("pf_frent_index(prev) == index")); } TAILQ_REMOVE(&frag->fr_queue, frent, fr_next); KASSERT(frag->fr_entries[index] > 0, ("No fragments remaining")); frag->fr_entries[index]--; } struct pf_frent * pf_frent_previous(struct pf_fragment *frag, struct pf_frent *frent) { struct pf_frent *prev, *next; int index; /* * If there are no fragments after frag, take the final one. Assume * that the global queue is not empty. */ prev = TAILQ_LAST(&frag->fr_queue, pf_fragq); KASSERT(prev != NULL, ("prev != NULL")); if (prev->fe_off <= frent->fe_off) return prev; /* * We want to find a fragment entry that is before frag, but still * close to it. Find the first fragment entry that is in the same * entry point or in the first entry point after that. As we have * already checked that there are entries behind frag, this will * succeed. */ for (index = pf_frent_index(frent); index < PF_FRAG_ENTRY_POINTS; index++) { prev = frag->fr_firstoff[index]; if (prev != NULL) break; } KASSERT(prev != NULL, ("prev != NULL")); /* * In prev we may have a fragment from the same entry point that is * before frent, or one that is just one position behind frent. * In the latter case, we go back one step and have the predecessor. * There may be none if the new fragment will be the first one. */ if (prev->fe_off > frent->fe_off) { prev = TAILQ_PREV(prev, pf_fragq, fr_next); if (prev == NULL) return NULL; KASSERT(prev->fe_off <= frent->fe_off, ("prev->fe_off <= frent->fe_off")); return prev; } /* * In prev is the first fragment of the entry point. The offset * of frag is behind it. Find the closest previous fragment. 
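 * For illustration (offsets invented): with 16 entry points each bucket
 * covers 4096 octets, so a new fragment at offset 12345 belongs to bucket
 * 3; starting from that bucket's first queued entry, the loop below
 * advances while queued fragments still begin at or before offset 12345,
 * leaving 'prev' as the closest predecessor.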
*/ for (next = TAILQ_NEXT(prev, fr_next); next != NULL; next = TAILQ_NEXT(next, fr_next)) { if (next->fe_off > frent->fe_off) break; prev = next; } return prev; } static struct pf_fragment * pf_fillup_fragment(struct pf_fragment_cmp *key, struct pf_frent *frent, u_short *reason) { struct pf_frent *after, *next, *prev; struct pf_fragment *frag; uint16_t total; int old_index, new_index; PF_FRAG_ASSERT(); /* No empty fragments. */ if (frent->fe_len == 0) { DPFPRINTF(("bad fragment: len 0\n")); goto bad_fragment; } /* All fragments are 8 byte aligned. */ if (frent->fe_mff && (frent->fe_len & 0x7)) { DPFPRINTF(("bad fragment: mff and len %d\n", frent->fe_len)); goto bad_fragment; } /* Respect maximum length, IP_MAXPACKET == IPV6_MAXPACKET. */ if (frent->fe_off + frent->fe_len > IP_MAXPACKET) { DPFPRINTF(("bad fragment: max packet %d\n", frent->fe_off + frent->fe_len)); goto bad_fragment; } DPFPRINTF((key->frc_af == AF_INET ? "reass frag %d @ %d-%d\n" : "reass frag %#08x @ %d-%d\n", key->frc_id, frent->fe_off, frent->fe_off + frent->fe_len)); /* Fully buffer all of the fragments in this fragment queue. */ frag = pf_find_fragment(key, &V_pf_frag_tree); /* Create a new reassembly queue for this packet. */ if (frag == NULL) { frag = uma_zalloc(V_pf_frag_z, M_NOWAIT); if (frag == NULL) { pf_flush_fragments(); frag = uma_zalloc(V_pf_frag_z, M_NOWAIT); if (frag == NULL) { REASON_SET(reason, PFRES_MEMORY); goto drop_fragment; } } *(struct pf_fragment_cmp *)frag = *key; memset(frag->fr_firstoff, 0, sizeof(frag->fr_firstoff)); memset(frag->fr_entries, 0, sizeof(frag->fr_entries)); frag->fr_timeout = time_uptime; frag->fr_maxlen = frent->fe_len; frag->fr_holes = 1; TAILQ_INIT(&frag->fr_queue); RB_INSERT(pf_frag_tree, &V_pf_frag_tree, frag); TAILQ_INSERT_HEAD(&V_pf_fragqueue, frag, frag_next); /* We do not have a previous fragment, cannot fail. */ pf_frent_insert(frag, frent, NULL); return (frag); } KASSERT(!TAILQ_EMPTY(&frag->fr_queue), ("!TAILQ_EMPTY()->fr_queue")); /* Remember maximum fragment len for refragmentation. */ if (frent->fe_len > frag->fr_maxlen) frag->fr_maxlen = frent->fe_len; /* Maximum data we have seen already. */ total = TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_off + TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_len; /* Non terminal fragments must have more fragments flag. */ if (frent->fe_off + frent->fe_len < total && !frent->fe_mff) goto bad_fragment; /* Check if we saw the last fragment already. 
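 * In other words: once a fragment without the more-fragments flag is
 * queued, 'total' is the definitive packet length, so a new fragment may
 * neither extend past it nor end exactly there while still announcing
 * more fragments; conversely, while the final fragment is outstanding, a
 * fragment that ends exactly at 'total' but claims to be final
 * contradicts the queued fragment ending there and is rejected.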
*/ if (!TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_mff) { if (frent->fe_off + frent->fe_len > total || (frent->fe_off + frent->fe_len == total && frent->fe_mff)) goto bad_fragment; } else { if (frent->fe_off + frent->fe_len == total && !frent->fe_mff) goto bad_fragment; } /* Find neighbors for newly inserted fragment */ prev = pf_frent_previous(frag, frent); if (prev == NULL) { after = TAILQ_FIRST(&frag->fr_queue); KASSERT(after != NULL, ("after != NULL")); } else { after = TAILQ_NEXT(prev, fr_next); } if (prev != NULL && prev->fe_off + prev->fe_len > frent->fe_off) { uint16_t precut; precut = prev->fe_off + prev->fe_len - frent->fe_off; if (precut >= frent->fe_len) goto bad_fragment; DPFPRINTF(("overlap -%d\n", precut)); m_adj(frent->fe_m, precut); frent->fe_off += precut; frent->fe_len -= precut; } for (; after != NULL && frent->fe_off + frent->fe_len > after->fe_off; after = next) { uint16_t aftercut; aftercut = frent->fe_off + frent->fe_len - after->fe_off; DPFPRINTF(("adjust overlap %d\n", aftercut)); if (aftercut < after->fe_len) { m_adj(after->fe_m, aftercut); old_index = pf_frent_index(after); after->fe_off += aftercut; after->fe_len -= aftercut; new_index = pf_frent_index(after); if (old_index != new_index) { DPFPRINTF(("frag index %d, new %d", old_index, new_index)); /* Fragment switched queue as fe_off changed */ after->fe_off -= aftercut; after->fe_len += aftercut; /* Remove restored fragment from old queue */ pf_frent_remove(frag, after); after->fe_off += aftercut; after->fe_len -= aftercut; /* Insert into correct queue */ if (pf_frent_insert(frag, after, prev)) { DPFPRINTF( ("fragment requeue limit exceeded")); m_freem(after->fe_m); uma_zfree(V_pf_frent_z, after); /* There is not way to recover */ goto bad_fragment; } } break; } /* This fragment is completely overlapped, lose it. */ next = TAILQ_NEXT(after, fr_next); pf_frent_remove(frag, after); m_freem(after->fe_m); uma_zfree(V_pf_frent_z, after); } /* If part of the queue gets too long, there is not way to recover. */ if (pf_frent_insert(frag, frent, prev)) { DPFPRINTF(("fragment queue limit exceeded\n")); goto bad_fragment; } return (frag); bad_fragment: REASON_SET(reason, PFRES_FRAG); drop_fragment: uma_zfree(V_pf_frent_z, frent); return (NULL); } static struct mbuf * pf_join_fragment(struct pf_fragment *frag) { struct mbuf *m, *m2; struct pf_frent *frent, *next; frent = TAILQ_FIRST(&frag->fr_queue); next = TAILQ_NEXT(frent, fr_next); m = frent->fe_m; m_adj(m, (frent->fe_hdrlen + frent->fe_len) - m->m_pkthdr.len); uma_zfree(V_pf_frent_z, frent); for (frent = next; frent != NULL; frent = next) { next = TAILQ_NEXT(frent, fr_next); m2 = frent->fe_m; /* Strip off ip header. */ m_adj(m2, frent->fe_hdrlen); /* Strip off any trailing bytes. */ m_adj(m2, frent->fe_len - m2->m_pkthdr.len); uma_zfree(V_pf_frent_z, frent); m_cat(m, m2); } /* Remove from fragment queue. 
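 * A note on the m_adj(9) calls above: a positive length trims bytes from
 * the front of the chain and a negative length trims from the end, so the
 * first call keeps only header plus payload of the leading fragment,
 * while every later fragment first loses its leading fe_hdrlen bytes and
 * then any trailing slack before being concatenated with m_cat(9).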
*/ pf_remove_fragment(frag); return (m); } #ifdef INET static int pf_reassemble(struct mbuf **m0, struct ip *ip, int dir, u_short *reason) { struct mbuf *m = *m0; struct pf_frent *frent; struct pf_fragment *frag; struct pf_fragment_cmp key; uint16_t total, hdrlen; /* Get an entry for the fragment queue */ if ((frent = pf_create_fragment(reason)) == NULL) return (PF_DROP); frent->fe_m = m; frent->fe_hdrlen = ip->ip_hl << 2; frent->fe_extoff = 0; frent->fe_len = ntohs(ip->ip_len) - (ip->ip_hl << 2); frent->fe_off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3; frent->fe_mff = ntohs(ip->ip_off) & IP_MF; pf_ip2key(ip, dir, &key); if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL) return (PF_DROP); /* The mbuf is part of the fragment entry, no direct free or access */ m = *m0 = NULL; if (frag->fr_holes) { DPFPRINTF(("frag %d, holes %d\n", frag->fr_id, frag->fr_holes)); return (PF_PASS); /* drop because *m0 is NULL, no error */ } /* We have all the data */ frent = TAILQ_FIRST(&frag->fr_queue); KASSERT(frent != NULL, ("frent != NULL")); total = TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_off + TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_len; hdrlen = frent->fe_hdrlen; m = *m0 = pf_join_fragment(frag); frag = NULL; if (m->m_flags & M_PKTHDR) { int plen = 0; for (m = *m0; m; m = m->m_next) plen += m->m_len; m = *m0; m->m_pkthdr.len = plen; } ip = mtod(m, struct ip *); ip->ip_sum = pf_cksum_fixup(ip->ip_sum, ip->ip_len, htons(hdrlen + total), 0); ip->ip_len = htons(hdrlen + total); ip->ip_sum = pf_cksum_fixup(ip->ip_sum, ip->ip_off, ip->ip_off & ~(IP_MF|IP_OFFMASK), 0); ip->ip_off &= ~(IP_MF|IP_OFFMASK); if (hdrlen + total > IP_MAXPACKET) { DPFPRINTF(("drop: too big: %d\n", total)); ip->ip_len = 0; REASON_SET(reason, PFRES_SHORT); /* PF_DROP requires a valid mbuf *m0 in pf_test() */ return (PF_DROP); } DPFPRINTF(("complete: %p(%d)\n", m, ntohs(ip->ip_len))); return (PF_PASS); } #endif /* INET */ #ifdef INET6 static int pf_reassemble6(struct mbuf **m0, struct ip6_hdr *ip6, struct ip6_frag *fraghdr, uint16_t hdrlen, uint16_t extoff, u_short *reason) { struct mbuf *m = *m0; struct pf_frent *frent; struct pf_fragment *frag; struct pf_fragment_cmp key; struct m_tag *mtag; struct pf_fragment_tag *ftag; int off; uint32_t frag_id; uint16_t total, maxlen; uint8_t proto; PF_FRAG_LOCK(); /* Get an entry for the fragment queue. */ if ((frent = pf_create_fragment(reason)) == NULL) { PF_FRAG_UNLOCK(); return (PF_DROP); } frent->fe_m = m; frent->fe_hdrlen = hdrlen; frent->fe_extoff = extoff; frent->fe_len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen) - hdrlen; frent->fe_off = ntohs(fraghdr->ip6f_offlg & IP6F_OFF_MASK); frent->fe_mff = fraghdr->ip6f_offlg & IP6F_MORE_FRAG; key.frc_src.v6 = ip6->ip6_src; key.frc_dst.v6 = ip6->ip6_dst; key.frc_af = AF_INET6; /* Only the first fragment's protocol is relevant. */ key.frc_proto = 0; key.frc_id = fraghdr->ip6f_ident; if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL) { PF_FRAG_UNLOCK(); return (PF_DROP); } /* The mbuf is part of the fragment entry, no direct free or access. */ m = *m0 = NULL; if (frag->fr_holes) { DPFPRINTF(("frag %d, holes %d\n", frag->fr_id, frag->fr_holes)); PF_FRAG_UNLOCK(); return (PF_PASS); /* Drop because *m0 is NULL, no error. */ } /* We have all the data. 
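 * What follows, in short: the fragments are joined, the fragment header
 * is deleted, and the reassembled packet is tagged with
 * PACKET_TAG_PF_REASSEMBLED carrying ft_hdrlen/ft_extoff/ft_maxlen/ft_id,
 * so that pf_refragment6() can later split it again on output using the
 * original maximum fragment size and identification.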
*/ frent = TAILQ_FIRST(&frag->fr_queue); KASSERT(frent != NULL, ("frent != NULL")); extoff = frent->fe_extoff; maxlen = frag->fr_maxlen; frag_id = frag->fr_id; total = TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_off + TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_len; hdrlen = frent->fe_hdrlen - sizeof(struct ip6_frag); m = *m0 = pf_join_fragment(frag); frag = NULL; PF_FRAG_UNLOCK(); /* Take protocol from first fragment header. */ m = m_getptr(m, hdrlen + offsetof(struct ip6_frag, ip6f_nxt), &off); KASSERT(m, ("%s: short mbuf chain", __func__)); proto = *(mtod(m, uint8_t *) + off); m = *m0; /* Delete frag6 header */ if (ip6_deletefraghdr(m, hdrlen, M_NOWAIT) != 0) goto fail; if (m->m_flags & M_PKTHDR) { int plen = 0; for (m = *m0; m; m = m->m_next) plen += m->m_len; m = *m0; m->m_pkthdr.len = plen; } if ((mtag = m_tag_get(PACKET_TAG_PF_REASSEMBLED, sizeof(struct pf_fragment_tag), M_NOWAIT)) == NULL) goto fail; ftag = (struct pf_fragment_tag *)(mtag + 1); ftag->ft_hdrlen = hdrlen; ftag->ft_extoff = extoff; ftag->ft_maxlen = maxlen; ftag->ft_id = frag_id; m_tag_prepend(m, mtag); ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_plen = htons(hdrlen - sizeof(struct ip6_hdr) + total); if (extoff) { /* Write protocol into next field of last extension header. */ m = m_getptr(m, extoff + offsetof(struct ip6_ext, ip6e_nxt), &off); KASSERT(m, ("%s: short mbuf chain", __func__)); *(mtod(m, char *) + off) = proto; m = *m0; } else ip6->ip6_nxt = proto; if (hdrlen - sizeof(struct ip6_hdr) + total > IPV6_MAXPACKET) { DPFPRINTF(("drop: too big: %d\n", total)); ip6->ip6_plen = 0; REASON_SET(reason, PFRES_SHORT); /* PF_DROP requires a valid mbuf *m0 in pf_test6(). */ return (PF_DROP); } DPFPRINTF(("complete: %p(%d)\n", m, ntohs(ip6->ip6_plen))); return (PF_PASS); fail: REASON_SET(reason, PFRES_MEMORY); /* PF_DROP requires a valid mbuf *m0 in pf_test6(), will free later. */ return (PF_DROP); } #endif /* INET6 */ #ifdef INET6 int pf_refragment6(struct ifnet *ifp, struct mbuf **m0, struct m_tag *mtag, bool forward) { struct mbuf *m = *m0, *t; struct ip6_hdr *hdr; struct pf_fragment_tag *ftag = (struct pf_fragment_tag *)(mtag + 1); struct pf_pdesc pd; uint32_t frag_id; uint16_t hdrlen, extoff, maxlen; uint8_t proto; int error, action; hdrlen = ftag->ft_hdrlen; extoff = ftag->ft_extoff; maxlen = ftag->ft_maxlen; frag_id = ftag->ft_id; m_tag_delete(m, mtag); mtag = NULL; ftag = NULL; if (extoff) { int off; /* Use protocol from next field of last extension header */ m = m_getptr(m, extoff + offsetof(struct ip6_ext, ip6e_nxt), &off); KASSERT((m != NULL), ("pf_refragment6: short mbuf chain")); proto = *(mtod(m, uint8_t *) + off); *(mtod(m, char *) + off) = IPPROTO_FRAGMENT; m = *m0; } else { hdr = mtod(m, struct ip6_hdr *); proto = hdr->ip6_nxt; hdr->ip6_nxt = IPPROTO_FRAGMENT; } /* In case of link-local traffic we'll need a scope set. */ hdr = mtod(m, struct ip6_hdr *); in6_setscope(&hdr->ip6_src, ifp, NULL); in6_setscope(&hdr->ip6_dst, ifp, NULL); /* The MTU must be a multiple of 8 bytes, or we risk doing the * fragmentation wrong. */ maxlen = maxlen & ~7; /* * Maxlen may be less than 8 if there was only a single * fragment. As it was fragmented before, add a fragment * header also for a single fragment. If total or maxlen * is less than 8, ip6_fragment() will return EMSGSIZE and * we drop the packet. */ error = ip6_fragment(ifp, m, hdrlen, proto, maxlen, frag_id); m = (*m0)->m_nextpkt; (*m0)->m_nextpkt = NULL; if (error == 0) { /* The first mbuf contains the unfragmented packet. 
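 * (Summary: as the code above relies on, ip6_fragment() chains the
 * generated fragments off m_nextpkt of the original packet, detached just
 * above; on success the unfragmented head is freed here, and the loop
 * below marks each fragment M_SKIP_FIREWALL so it is not normalized again
 * before being forwarded or output individually.)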
*/ m_freem(*m0); *m0 = NULL; action = PF_PASS; } else { /* Drop expects an mbuf to free. */ DPFPRINTF(("refragment error %d\n", error)); action = PF_DROP; } for (; m; m = t) { t = m->m_nextpkt; m->m_nextpkt = NULL; m->m_flags |= M_SKIP_FIREWALL; memset(&pd, 0, sizeof(pd)); pd.pf_mtag = pf_find_mtag(m); if (error == 0) if (forward) { MPASS(m->m_pkthdr.rcvif != NULL); ip6_forward(m, 0); } else { (void)ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); } else m_freem(m); } return (action); } #endif /* INET6 */ #ifdef INET int pf_normalize_ip(struct mbuf **m0, struct pfi_kkif *kif, u_short *reason, struct pf_pdesc *pd) { struct mbuf *m = *m0; struct pf_krule *r; struct ip *h = mtod(m, struct ip *); int mff = (ntohs(h->ip_off) & IP_MF); int hlen = h->ip_hl << 2; u_int16_t fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3; u_int16_t max; int ip_len; int tag = -1; int verdict; bool scrub_compat; PF_RULES_RASSERT(); r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr); /* * Check if there are any scrub rules, matching or not. * Lack of scrub rules means: * - enforced packet normalization operation just like in OpenBSD * - fragment reassembly depends on V_pf_status.reass * With scrub rules: * - packet normalization is performed if there is a matching scrub rule * - fragment reassembly is performed if the matching rule has no * PFRULE_FRAGMENT_NOREASS flag */ scrub_compat = (r != NULL); while (r != NULL) { pf_counter_u64_add(&r->evaluations, 1); if (pfi_kkif_match(r->kif, kif) == r->ifnot) r = r->skip[PF_SKIP_IFP].ptr; else if (r->direction && r->direction != pd->dir) r = r->skip[PF_SKIP_DIR].ptr; else if (r->af && r->af != AF_INET) r = r->skip[PF_SKIP_AF].ptr; else if (r->proto && r->proto != h->ip_p) r = r->skip[PF_SKIP_PROTO].ptr; else if (PF_MISMATCHAW(&r->src.addr, (struct pf_addr *)&h->ip_src.s_addr, AF_INET, r->src.neg, kif, M_GETFIB(m))) r = r->skip[PF_SKIP_SRC_ADDR].ptr; else if (PF_MISMATCHAW(&r->dst.addr, (struct pf_addr *)&h->ip_dst.s_addr, AF_INET, r->dst.neg, NULL, M_GETFIB(m))) r = r->skip[PF_SKIP_DST_ADDR].ptr; else if (r->match_tag && !pf_match_tag(m, r, &tag, pd->pf_mtag ? pd->pf_mtag->tag : 0)) r = TAILQ_NEXT(r, entries); else break; } if (scrub_compat) { /* With scrub rules present IPv4 normalization happens only * if one of rules has matched and it's not a "no scrub" rule */ if (r == NULL || r->action == PF_NOSCRUB) return (PF_PASS); pf_counter_u64_critical_enter(); pf_counter_u64_add_protected(&r->packets[pd->dir == PF_OUT], 1); pf_counter_u64_add_protected(&r->bytes[pd->dir == PF_OUT], pd->tot_len); pf_counter_u64_critical_exit(); pf_rule_to_actions(r, &pd->act); } /* Check for illegal packets */ if (hlen < (int)sizeof(struct ip)) { REASON_SET(reason, PFRES_NORM); goto drop; } if (hlen > ntohs(h->ip_len)) { REASON_SET(reason, PFRES_NORM); goto drop; } /* Clear IP_DF if the rule uses the no-df option or we're in no-df mode */ if (((!scrub_compat && V_pf_status.reass & PF_REASS_NODF) || (r != NULL && r->rule_flag & PFRULE_NODF)) && (h->ip_off & htons(IP_DF)) ) { u_int16_t ip_off = h->ip_off; h->ip_off &= htons(~IP_DF); h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0); } /* We will need other tests here */ if (!fragoff && !mff) goto no_fragment; /* We're dealing with a fragment now. Don't allow fragments * with IP_DF to enter the cache. If the flag was cleared by * no-df above, fine. Otherwise drop it. 
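 * Aside on the no-df handling above: clearing IP_DF changes the 16-bit
 * flags/offset word of the header, and pf_cksum_fixup() patches ip_sum
 * with the usual incremental update for one's complement checksums (in
 * the spirit of RFC 1624), so the whole header need not be re-summed.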
*/ if (h->ip_off & htons(IP_DF)) { DPFPRINTF(("IP_DF\n")); goto bad; } ip_len = ntohs(h->ip_len) - hlen; /* All fragments are 8 byte aligned */ if (mff && (ip_len & 0x7)) { DPFPRINTF(("mff and %d\n", ip_len)); goto bad; } /* Respect maximum length */ if (fragoff + ip_len > IP_MAXPACKET) { DPFPRINTF(("max packet %d\n", fragoff + ip_len)); goto bad; } if ((!scrub_compat && V_pf_status.reass) || (r != NULL && !(r->rule_flag & PFRULE_FRAGMENT_NOREASS)) ) { max = fragoff + ip_len; /* Fully buffer all of the fragments * Might return a completely reassembled mbuf, or NULL */ PF_FRAG_LOCK(); DPFPRINTF(("reass frag %d @ %d-%d\n", h->ip_id, fragoff, max)); verdict = pf_reassemble(m0, h, pd->dir, reason); PF_FRAG_UNLOCK(); if (verdict != PF_PASS) return (PF_DROP); m = *m0; if (m == NULL) return (PF_DROP); h = mtod(m, struct ip *); no_fragment: /* At this point, only IP_DF is allowed in ip_off */ if (h->ip_off & ~htons(IP_DF)) { u_int16_t ip_off = h->ip_off; h->ip_off &= htons(IP_DF); h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0); } } return (PF_PASS); bad: DPFPRINTF(("dropping bad fragment\n")); REASON_SET(reason, PFRES_FRAG); drop: if (r != NULL && r->log) PFLOG_PACKET(kif, m, AF_INET, *reason, r, NULL, NULL, pd, 1); return (PF_DROP); } #endif #ifdef INET6 int pf_normalize_ip6(struct mbuf **m0, struct pfi_kkif *kif, u_short *reason, struct pf_pdesc *pd) { struct mbuf *m = *m0; struct pf_krule *r; struct ip6_hdr *h = mtod(m, struct ip6_hdr *); int extoff; int off; struct ip6_ext ext; struct ip6_opt opt; struct ip6_frag frag; u_int32_t plen; int optend; int ooff; u_int8_t proto; int terminal; bool scrub_compat; PF_RULES_RASSERT(); r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr); /* * Check if there are any scrub rules, matching or not. * Lack of scrub rules means: * - enforced packet normalization operation just like in OpenBSD * With scrub rules: * - packet normalization is performed if there is a matching scrub rule * XXX: Fragment reassembly always performed for IPv6! */ scrub_compat = (r != NULL); while (r != NULL) { pf_counter_u64_add(&r->evaluations, 1); if (pfi_kkif_match(r->kif, kif) == r->ifnot) r = r->skip[PF_SKIP_IFP].ptr; else if (r->direction && r->direction != pd->dir) r = r->skip[PF_SKIP_DIR].ptr; else if (r->af && r->af != AF_INET6) r = r->skip[PF_SKIP_AF].ptr; #if 0 /* header chain! 
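 * (Presumably disabled because ip6_nxt names only the first header in the
 * chain, so matching the rule's protocol against it would misclassify any
 * packet carrying extension headers; the header chain would have to be
 * walked first, as the parser further down in this function does.)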
*/ else if (r->proto && r->proto != h->ip6_nxt) r = r->skip[PF_SKIP_PROTO].ptr; #endif else if (PF_MISMATCHAW(&r->src.addr, (struct pf_addr *)&h->ip6_src, AF_INET6, r->src.neg, kif, M_GETFIB(m))) r = r->skip[PF_SKIP_SRC_ADDR].ptr; else if (PF_MISMATCHAW(&r->dst.addr, (struct pf_addr *)&h->ip6_dst, AF_INET6, r->dst.neg, NULL, M_GETFIB(m))) r = r->skip[PF_SKIP_DST_ADDR].ptr; else break; } if (scrub_compat) { /* With scrub rules present IPv6 normalization happens only * if one of rules has matched and it's not a "no scrub" rule */ if (r == NULL || r->action == PF_NOSCRUB) return (PF_PASS); pf_counter_u64_critical_enter(); pf_counter_u64_add_protected(&r->packets[pd->dir == PF_OUT], 1); pf_counter_u64_add_protected(&r->bytes[pd->dir == PF_OUT], pd->tot_len); pf_counter_u64_critical_exit(); pf_rule_to_actions(r, &pd->act); } /* Check for illegal packets */ if (sizeof(struct ip6_hdr) + IPV6_MAXPACKET < m->m_pkthdr.len) goto drop; again: h = mtod(m, struct ip6_hdr *); plen = ntohs(h->ip6_plen); /* jumbo payload option not supported */ if (plen == 0) goto drop; extoff = 0; off = sizeof(struct ip6_hdr); proto = h->ip6_nxt; terminal = 0; do { switch (proto) { case IPPROTO_FRAGMENT: goto fragment; break; case IPPROTO_AH: case IPPROTO_ROUTING: case IPPROTO_DSTOPTS: if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL, NULL, AF_INET6)) goto shortpkt; extoff = off; if (proto == IPPROTO_AH) off += (ext.ip6e_len + 2) * 4; else off += (ext.ip6e_len + 1) * 8; proto = ext.ip6e_nxt; break; case IPPROTO_HOPOPTS: if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL, NULL, AF_INET6)) goto shortpkt; extoff = off; optend = off + (ext.ip6e_len + 1) * 8; ooff = off + sizeof(ext); do { if (!pf_pull_hdr(m, ooff, &opt.ip6o_type, sizeof(opt.ip6o_type), NULL, NULL, AF_INET6)) goto shortpkt; if (opt.ip6o_type == IP6OPT_PAD1) { ooff++; continue; } if (!pf_pull_hdr(m, ooff, &opt, sizeof(opt), NULL, NULL, AF_INET6)) goto shortpkt; if (ooff + sizeof(opt) + opt.ip6o_len > optend) goto drop; if (opt.ip6o_type == IP6OPT_JUMBO) goto drop; ooff += sizeof(opt) + opt.ip6o_len; } while (ooff < optend); off = optend; proto = ext.ip6e_nxt; break; default: terminal = 1; break; } } while (!terminal); if (sizeof(struct ip6_hdr) + plen > m->m_pkthdr.len) goto shortpkt; return (PF_PASS); fragment: if (pd->flags & PFDESC_IP_REAS) return (PF_DROP); if (sizeof(struct ip6_hdr) + plen > m->m_pkthdr.len) goto shortpkt; if (!pf_pull_hdr(m, off, &frag, sizeof(frag), NULL, NULL, AF_INET6)) goto shortpkt; /* Offset now points to data portion. */ off += sizeof(frag); /* Returns PF_DROP or *m0 is NULL or completely reassembled mbuf. */ if (pf_reassemble6(m0, h, &frag, off, extoff, reason) != PF_PASS) return (PF_DROP); m = *m0; if (m == NULL) return (PF_DROP); pd->flags |= PFDESC_IP_REAS; goto again; shortpkt: REASON_SET(reason, PFRES_SHORT); if (r != NULL && r->log) PFLOG_PACKET(kif, m, AF_INET6, *reason, r, NULL, NULL, pd, 1); return (PF_DROP); drop: REASON_SET(reason, PFRES_NORM); if (r != NULL && r->log) PFLOG_PACKET(kif, m, AF_INET6, *reason, r, NULL, NULL, pd, 1); return (PF_DROP); } #endif /* INET6 */ int pf_normalize_tcp(struct pfi_kkif *kif, struct mbuf *m, int ipoff, int off, void *h, struct pf_pdesc *pd) { struct pf_krule *r, *rm = NULL; struct tcphdr *th = &pd->hdr.tcp; int rewrite = 0; u_short reason; - u_int8_t flags; + u_int16_t flags; sa_family_t af = pd->af; int srs; PF_RULES_RASSERT(); r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr); /* Check if there any scrub rules. 
Lack of scrub rules means enforced * packet normalization operation just like in OpenBSD. */ srs = (r != NULL); while (r != NULL) { pf_counter_u64_add(&r->evaluations, 1); if (pfi_kkif_match(r->kif, kif) == r->ifnot) r = r->skip[PF_SKIP_IFP].ptr; else if (r->direction && r->direction != pd->dir) r = r->skip[PF_SKIP_DIR].ptr; else if (r->af && r->af != af) r = r->skip[PF_SKIP_AF].ptr; else if (r->proto && r->proto != pd->proto) r = r->skip[PF_SKIP_PROTO].ptr; else if (PF_MISMATCHAW(&r->src.addr, pd->src, af, r->src.neg, kif, M_GETFIB(m))) r = r->skip[PF_SKIP_SRC_ADDR].ptr; else if (r->src.port_op && !pf_match_port(r->src.port_op, r->src.port[0], r->src.port[1], th->th_sport)) r = r->skip[PF_SKIP_SRC_PORT].ptr; else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af, r->dst.neg, NULL, M_GETFIB(m))) r = r->skip[PF_SKIP_DST_ADDR].ptr; else if (r->dst.port_op && !pf_match_port(r->dst.port_op, r->dst.port[0], r->dst.port[1], th->th_dport)) r = r->skip[PF_SKIP_DST_PORT].ptr; else if (r->os_fingerprint != PF_OSFP_ANY && !pf_osfp_match( pf_osfp_fingerprint(pd, m, off, th), r->os_fingerprint)) r = TAILQ_NEXT(r, entries); else { rm = r; break; } } if (srs) { /* With scrub rules present TCP normalization happens only * if one of rules has matched and it's not a "no scrub" rule */ if (rm == NULL || rm->action == PF_NOSCRUB) return (PF_PASS); pf_counter_u64_critical_enter(); pf_counter_u64_add_protected(&r->packets[pd->dir == PF_OUT], 1); pf_counter_u64_add_protected(&r->bytes[pd->dir == PF_OUT], pd->tot_len); pf_counter_u64_critical_exit(); pf_rule_to_actions(rm, &pd->act); } if (rm && rm->rule_flag & PFRULE_REASSEMBLE_TCP) pd->flags |= PFDESC_TCP_NORM; - flags = th->th_flags; + flags = tcp_get_flags(th); if (flags & TH_SYN) { /* Illegal packet */ if (flags & TH_RST) goto tcp_drop; if (flags & TH_FIN) goto tcp_drop; } else { /* Illegal packet */ if (!(flags & (TH_ACK|TH_RST))) goto tcp_drop; } if (!(flags & TH_ACK)) { /* These flags are only valid if ACK is set */ if ((flags & TH_FIN) || (flags & TH_PUSH) || (flags & TH_URG)) goto tcp_drop; } /* Check for illegal header length */ if (th->th_off < (sizeof(struct tcphdr) >> 2)) goto tcp_drop; /* If flags changed, or reserved data set, then adjust */ - if (flags != th->th_flags || th->th_x2 != 0) { + if (flags != tcp_get_flags(th) || + (tcp_get_flags(th) & (TH_RES1|TH_RES2|TH_RES2)) != 0) { u_int16_t ov, nv; ov = *(u_int16_t *)(&th->th_ack + 1); - th->th_flags = flags; - th->th_x2 = 0; + flags &= ~(TH_RES1 | TH_RES2 | TH_RES3); + tcp_set_flags(th, flags); nv = *(u_int16_t *)(&th->th_ack + 1); th->th_sum = pf_proto_cksum_fixup(m, th->th_sum, ov, nv, 0); rewrite = 1; } /* Remove urgent pointer, if TH_URG is not set */ if (!(flags & TH_URG) && th->th_urp) { th->th_sum = pf_proto_cksum_fixup(m, th->th_sum, th->th_urp, 0, 0); th->th_urp = 0; rewrite = 1; } /* copy back packet headers if we sanitized */ if (rewrite) m_copyback(m, off, sizeof(*th), (caddr_t)th); return (PF_PASS); tcp_drop: REASON_SET(&reason, PFRES_NORM); if (rm != NULL && r->log) PFLOG_PACKET(kif, m, AF_INET, reason, r, NULL, NULL, pd, 1); return (PF_DROP); } int pf_normalize_tcp_init(struct mbuf *m, int off, struct pf_pdesc *pd, struct tcphdr *th, struct pf_state_peer *src, struct pf_state_peer *dst) { u_int32_t tsval, tsecr; u_int8_t hdr[60]; u_int8_t *opt; KASSERT((src->scrub == NULL), ("pf_normalize_tcp_init: src->scrub != NULL")); src->scrub = uma_zalloc(V_pf_state_scrub_z, M_ZERO | M_NOWAIT); if (src->scrub == NULL) return (1); switch (pd->af) { #ifdef INET case AF_INET: { struct ip *h = mtod(m, 
struct ip *); src->scrub->pfss_ttl = h->ip_ttl; break; } #endif /* INET */ #ifdef INET6 case AF_INET6: { struct ip6_hdr *h = mtod(m, struct ip6_hdr *); src->scrub->pfss_ttl = h->ip6_hlim; break; } #endif /* INET6 */ } /* * All normalizations below are only begun if we see the start of * the connections. They must all set an enabled bit in pfss_flags */ if ((th->th_flags & TH_SYN) == 0) return (0); if (th->th_off > (sizeof(struct tcphdr) >> 2) && src->scrub && pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) { /* Diddle with TCP options */ int hlen; opt = hdr + sizeof(struct tcphdr); hlen = (th->th_off << 2) - sizeof(struct tcphdr); while (hlen >= TCPOLEN_TIMESTAMP) { switch (*opt) { case TCPOPT_EOL: /* FALLTHROUGH */ case TCPOPT_NOP: opt++; hlen--; break; case TCPOPT_TIMESTAMP: if (opt[1] >= TCPOLEN_TIMESTAMP) { src->scrub->pfss_flags |= PFSS_TIMESTAMP; src->scrub->pfss_ts_mod = htonl(arc4random()); /* note PFSS_PAWS not set yet */ memcpy(&tsval, &opt[2], sizeof(u_int32_t)); memcpy(&tsecr, &opt[6], sizeof(u_int32_t)); src->scrub->pfss_tsval0 = ntohl(tsval); src->scrub->pfss_tsval = ntohl(tsval); src->scrub->pfss_tsecr = ntohl(tsecr); getmicrouptime(&src->scrub->pfss_last); } /* FALLTHROUGH */ default: hlen -= MAX(opt[1], 2); opt += MAX(opt[1], 2); break; } } } return (0); } void pf_normalize_tcp_cleanup(struct pf_kstate *state) { /* XXX Note: this also cleans up SCTP. */ uma_zfree(V_pf_state_scrub_z, state->src.scrub); uma_zfree(V_pf_state_scrub_z, state->dst.scrub); /* Someday... flush the TCP segment reassembly descriptors. */ } int pf_normalize_sctp_init(struct mbuf *m, int off, struct pf_pdesc *pd, struct pf_state_peer *src, struct pf_state_peer *dst) { src->scrub = uma_zalloc(V_pf_state_scrub_z, M_ZERO | M_NOWAIT); if (src->scrub == NULL) return (1); dst->scrub = uma_zalloc(V_pf_state_scrub_z, M_ZERO | M_NOWAIT); if (dst->scrub == NULL) { uma_zfree(V_pf_state_scrub_z, src); return (1); } dst->scrub->pfss_v_tag = pd->sctp_initiate_tag; return (0); } int pf_normalize_tcp_stateful(struct mbuf *m, int off, struct pf_pdesc *pd, u_short *reason, struct tcphdr *th, struct pf_kstate *state, struct pf_state_peer *src, struct pf_state_peer *dst, int *writeback) { struct timeval uptime; u_int32_t tsval, tsecr; u_int tsval_from_last; u_int8_t hdr[60]; u_int8_t *opt; int copyback = 0; int got_ts = 0; size_t startoff; KASSERT((src->scrub || dst->scrub), ("%s: src->scrub && dst->scrub!", __func__)); /* * Enforce the minimum TTL seen for this connection. Negate a common * technique to evade an intrusion detection system and confuse * firewall state code. 
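 * Concretely: a sender can craft segments whose TTL/hop limit is just
 * large enough to reach this firewall but too small to reach the end
 * host, so the firewall tracks data the destination never sees. Raising
 * every packet to the largest TTL observed so far on that side of the
 * connection blunts that trick.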
*/ switch (pd->af) { #ifdef INET case AF_INET: { if (src->scrub) { struct ip *h = mtod(m, struct ip *); if (h->ip_ttl > src->scrub->pfss_ttl) src->scrub->pfss_ttl = h->ip_ttl; h->ip_ttl = src->scrub->pfss_ttl; } break; } #endif /* INET */ #ifdef INET6 case AF_INET6: { if (src->scrub) { struct ip6_hdr *h = mtod(m, struct ip6_hdr *); if (h->ip6_hlim > src->scrub->pfss_ttl) src->scrub->pfss_ttl = h->ip6_hlim; h->ip6_hlim = src->scrub->pfss_ttl; } break; } #endif /* INET6 */ } if (th->th_off > (sizeof(struct tcphdr) >> 2) && ((src->scrub && (src->scrub->pfss_flags & PFSS_TIMESTAMP)) || (dst->scrub && (dst->scrub->pfss_flags & PFSS_TIMESTAMP))) && pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) { /* Diddle with TCP options */ int hlen; opt = hdr + sizeof(struct tcphdr); hlen = (th->th_off << 2) - sizeof(struct tcphdr); while (hlen >= TCPOLEN_TIMESTAMP) { startoff = opt - (hdr + sizeof(struct tcphdr)); switch (*opt) { case TCPOPT_EOL: /* FALLTHROUGH */ case TCPOPT_NOP: opt++; hlen--; break; case TCPOPT_TIMESTAMP: /* Modulate the timestamps. Can be used for * NAT detection, OS uptime determination or * reboot detection. */ if (got_ts) { /* Huh? Multiple timestamps!? */ if (V_pf_status.debug >= PF_DEBUG_MISC) { DPFPRINTF(("multiple TS??\n")); pf_print_state(state); printf("\n"); } REASON_SET(reason, PFRES_TS); return (PF_DROP); } if (opt[1] >= TCPOLEN_TIMESTAMP) { memcpy(&tsval, &opt[2], sizeof(u_int32_t)); if (tsval && src->scrub && (src->scrub->pfss_flags & PFSS_TIMESTAMP)) { tsval = ntohl(tsval); pf_patch_32_unaligned(m, &th->th_sum, &opt[2], htonl(tsval + src->scrub->pfss_ts_mod), PF_ALGNMNT(startoff), 0); copyback = 1; } /* Modulate TS reply iff valid (!0) */ memcpy(&tsecr, &opt[6], sizeof(u_int32_t)); if (tsecr && dst->scrub && (dst->scrub->pfss_flags & PFSS_TIMESTAMP)) { tsecr = ntohl(tsecr) - dst->scrub->pfss_ts_mod; pf_patch_32_unaligned(m, &th->th_sum, &opt[6], htonl(tsecr), PF_ALGNMNT(startoff), 0); copyback = 1; } got_ts = 1; } /* FALLTHROUGH */ default: hlen -= MAX(opt[1], 2); opt += MAX(opt[1], 2); break; } } if (copyback) { /* Copyback the options, caller copys back header */ *writeback = 1; m_copyback(m, off + sizeof(struct tcphdr), (th->th_off << 2) - sizeof(struct tcphdr), hdr + sizeof(struct tcphdr)); } } /* * Must invalidate PAWS checks on connections idle for too long. * The fastest allowed timestamp clock is 1ms. That turns out to * be about 24 days before it wraps. 
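 * (Arithmetic behind that figure: at the fastest allowed rate of one tick
 * per millisecond the 32-bit timestamp space lasts 2^32 ms, about 49.7
 * days, so half of it -- the window in which wrap-around can still be
 * detected -- is roughly 24.8 days, which TS_MAX_IDLE below rounds down
 * to 24 days.)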
XXX Right now our lowerbound * TS echo check only works for the first 12 days of a connection * when the TS has exhausted half its 32bit space */ #define TS_MAX_IDLE (24*24*60*60) #define TS_MAX_CONN (12*24*60*60) /* XXX remove when better tsecr check */ getmicrouptime(&uptime); if (src->scrub && (src->scrub->pfss_flags & PFSS_PAWS) && (uptime.tv_sec - src->scrub->pfss_last.tv_sec > TS_MAX_IDLE || time_uptime - state->creation > TS_MAX_CONN)) { if (V_pf_status.debug >= PF_DEBUG_MISC) { DPFPRINTF(("src idled out of PAWS\n")); pf_print_state(state); printf("\n"); } src->scrub->pfss_flags = (src->scrub->pfss_flags & ~PFSS_PAWS) | PFSS_PAWS_IDLED; } if (dst->scrub && (dst->scrub->pfss_flags & PFSS_PAWS) && uptime.tv_sec - dst->scrub->pfss_last.tv_sec > TS_MAX_IDLE) { if (V_pf_status.debug >= PF_DEBUG_MISC) { DPFPRINTF(("dst idled out of PAWS\n")); pf_print_state(state); printf("\n"); } dst->scrub->pfss_flags = (dst->scrub->pfss_flags & ~PFSS_PAWS) | PFSS_PAWS_IDLED; } if (got_ts && src->scrub && dst->scrub && (src->scrub->pfss_flags & PFSS_PAWS) && (dst->scrub->pfss_flags & PFSS_PAWS)) { /* Validate that the timestamps are "in-window". * RFC1323 describes TCP Timestamp options that allow * measurement of RTT (round trip time) and PAWS * (protection against wrapped sequence numbers). PAWS * gives us a set of rules for rejecting packets on * long fat pipes (packets that were somehow delayed * in transit longer than the time it took to send the * full TCP sequence space of 4Gb). We can use these * rules and infer a few others that will let us treat * the 32bit timestamp and the 32bit echoed timestamp * as sequence numbers to prevent a blind attacker from * inserting packets into a connection. * * RFC1323 tells us: * - The timestamp on this packet must be greater than * or equal to the last value echoed by the other * endpoint. The RFC says those will be discarded * since it is a dup that has already been acked. * This gives us a lowerbound on the timestamp. * timestamp >= other last echoed timestamp * - The timestamp will be less than or equal to * the last timestamp plus the time between the * last packet and now. The RFC defines the max * clock rate as 1ms. We will allow clocks to be * up to 10% fast and will allow a total difference * or 30 seconds due to a route change. And this * gives us an upperbound on the timestamp. * timestamp <= last timestamp + max ticks * We have to be careful here. Windows will send an * initial timestamp of zero and then initialize it * to a random value after the 3whs; presumably to * avoid a DoS by having to call an expensive RNG * during a SYN flood. Proof MS has at least one * good security geek. * * - The TCP timestamp option must also echo the other * endpoints timestamp. The timestamp echoed is the * one carried on the earliest unacknowledged segment * on the left edge of the sequence window. The RFC * states that the host will reject any echoed * timestamps that were larger than any ever sent. * This gives us an upperbound on the TS echo. * tescr <= largest_tsval * - The lowerbound on the TS echo is a little more * tricky to determine. The other endpoint's echoed * values will not decrease. But there may be * network conditions that re-order packets and * cause our view of them to decrease. For now the * only lowerbound we can safely determine is that * the TS echo will never be less than the original * TS. XXX There is probably a better lowerbound. * Remove TS_MAX_CONN with better lowerbound check. 
* tescr >= other original TS * * It is also important to note that the fastest * timestamp clock of 1ms will wrap its 32bit space in * 24 days. So we just disable TS checking after 24 * days of idle time. We actually must use a 12d * connection limit until we can come up with a better * lowerbound to the TS echo check. */ struct timeval delta_ts; int ts_fudge; /* * PFTM_TS_DIFF is how many seconds of leeway to allow * a host's timestamp. This can happen if the previous * packet got delayed in transit for much longer than * this packet. */ if ((ts_fudge = state->rule.ptr->timeout[PFTM_TS_DIFF]) == 0) ts_fudge = V_pf_default_rule.timeout[PFTM_TS_DIFF]; /* Calculate max ticks since the last timestamp */ #define TS_MAXFREQ 1100 /* RFC max TS freq of 1Khz + 10% skew */ #define TS_MICROSECS 1000000 /* microseconds per second */ delta_ts = uptime; timevalsub(&delta_ts, &src->scrub->pfss_last); tsval_from_last = (delta_ts.tv_sec + ts_fudge) * TS_MAXFREQ; tsval_from_last += delta_ts.tv_usec / (TS_MICROSECS/TS_MAXFREQ); if ((src->state >= TCPS_ESTABLISHED && dst->state >= TCPS_ESTABLISHED) && (SEQ_LT(tsval, dst->scrub->pfss_tsecr) || SEQ_GT(tsval, src->scrub->pfss_tsval + tsval_from_last) || (tsecr && (SEQ_GT(tsecr, dst->scrub->pfss_tsval) || SEQ_LT(tsecr, dst->scrub->pfss_tsval0))))) { /* Bad RFC1323 implementation or an insertion attack. * * - Solaris 2.6 and 2.7 are known to send another ACK * after the FIN,FIN|ACK,ACK closing that carries * an old timestamp. */ DPFPRINTF(("Timestamp failed %c%c%c%c\n", SEQ_LT(tsval, dst->scrub->pfss_tsecr) ? '0' : ' ', SEQ_GT(tsval, src->scrub->pfss_tsval + tsval_from_last) ? '1' : ' ', SEQ_GT(tsecr, dst->scrub->pfss_tsval) ? '2' : ' ', SEQ_LT(tsecr, dst->scrub->pfss_tsval0)? '3' : ' ')); DPFPRINTF((" tsval: %u tsecr: %u +ticks: %u " "idle: %jus %lums\n", tsval, tsecr, tsval_from_last, (uintmax_t)delta_ts.tv_sec, delta_ts.tv_usec / 1000)); DPFPRINTF((" src->tsval: %u tsecr: %u\n", src->scrub->pfss_tsval, src->scrub->pfss_tsecr)); DPFPRINTF((" dst->tsval: %u tsecr: %u tsval0: %u" "\n", dst->scrub->pfss_tsval, dst->scrub->pfss_tsecr, dst->scrub->pfss_tsval0)); if (V_pf_status.debug >= PF_DEBUG_MISC) { pf_print_state(state); pf_print_flags(th->th_flags); printf("\n"); } REASON_SET(reason, PFRES_TS); return (PF_DROP); } /* XXX I'd really like to require tsecr but it's optional */ } else if (!got_ts && (th->th_flags & TH_RST) == 0 && ((src->state == TCPS_ESTABLISHED && dst->state == TCPS_ESTABLISHED) || pd->p_len > 0 || (th->th_flags & TH_SYN)) && src->scrub && dst->scrub && (src->scrub->pfss_flags & PFSS_PAWS) && (dst->scrub->pfss_flags & PFSS_PAWS)) { /* Didn't send a timestamp. Timestamps aren't really useful * when: * - connection opening or closing (often not even sent). * but we must not let an attacker to put a FIN on a * data packet to sneak it through our ESTABLISHED check. * - on a TCP reset. RFC suggests not even looking at TS. * - on an empty ACK. The TS will not be echoed so it will * probably not help keep the RTT calculation in sync and * there isn't as much danger when the sequence numbers * got wrapped. So some stacks don't include TS on empty * ACKs :-( * * To minimize the disruption to mostly RFC1323 conformant * stacks, we will only require timestamps on data packets. * * And what do ya know, we cannot require timestamps on data * packets. There appear to be devices that do legitimate * TCP connection hijacking. There are HTTP devices that allow * a 3whs (with timestamps) and then buffer the HTTP request. 
* If the intermediate device has the HTTP response cache, it * will spoof the response but not bother timestamping its * packets. So we can look for the presence of a timestamp in * the first data packet and if there, require it in all future * packets. */ if (pd->p_len > 0 && (src->scrub->pfss_flags & PFSS_DATA_TS)) { /* * Hey! Someone tried to sneak a packet in. Or the * stack changed its RFC1323 behavior?!?! */ if (V_pf_status.debug >= PF_DEBUG_MISC) { DPFPRINTF(("Did not receive expected RFC1323 " "timestamp\n")); pf_print_state(state); pf_print_flags(th->th_flags); printf("\n"); } REASON_SET(reason, PFRES_TS); return (PF_DROP); } } /* * We will note if a host sends his data packets with or without * timestamps. And require all data packets to contain a timestamp * if the first does. PAWS implicitly requires that all data packets be * timestamped. But I think there are middle-man devices that hijack * TCP streams immediately after the 3whs and don't timestamp their * packets (seen in a WWW accelerator or cache). */ if (pd->p_len > 0 && src->scrub && (src->scrub->pfss_flags & (PFSS_TIMESTAMP|PFSS_DATA_TS|PFSS_DATA_NOTS)) == PFSS_TIMESTAMP) { if (got_ts) src->scrub->pfss_flags |= PFSS_DATA_TS; else { src->scrub->pfss_flags |= PFSS_DATA_NOTS; if (V_pf_status.debug >= PF_DEBUG_MISC && dst->scrub && (dst->scrub->pfss_flags & PFSS_TIMESTAMP)) { /* Don't warn if other host rejected RFC1323 */ DPFPRINTF(("Broken RFC1323 stack did not " "timestamp data packet. Disabled PAWS " "security.\n")); pf_print_state(state); pf_print_flags(th->th_flags); printf("\n"); } } } /* * Update PAWS values */ if (got_ts && src->scrub && PFSS_TIMESTAMP == (src->scrub->pfss_flags & (PFSS_PAWS_IDLED|PFSS_TIMESTAMP))) { getmicrouptime(&src->scrub->pfss_last); if (SEQ_GEQ(tsval, src->scrub->pfss_tsval) || (src->scrub->pfss_flags & PFSS_PAWS) == 0) src->scrub->pfss_tsval = tsval; if (tsecr) { if (SEQ_GEQ(tsecr, src->scrub->pfss_tsecr) || (src->scrub->pfss_flags & PFSS_PAWS) == 0) src->scrub->pfss_tsecr = tsecr; if ((src->scrub->pfss_flags & PFSS_PAWS) == 0 && (SEQ_LT(tsval, src->scrub->pfss_tsval0) || src->scrub->pfss_tsval0 == 0)) { /* tsval0 MUST be the lowest timestamp */ src->scrub->pfss_tsval0 = tsval; } /* Only fully initialized after a TS gets echoed */ if ((src->scrub->pfss_flags & PFSS_PAWS) == 0) src->scrub->pfss_flags |= PFSS_PAWS; } } /* I have a dream.... TCP segment reassembly.... 
*/ return (0); } int pf_normalize_mss(struct mbuf *m, int off, struct pf_pdesc *pd) { struct tcphdr *th = &pd->hdr.tcp; u_int16_t *mss; int thoff; int opt, cnt, optlen = 0; u_char opts[TCP_MAXOLEN]; u_char *optp = opts; size_t startoff; thoff = th->th_off << 2; cnt = thoff - sizeof(struct tcphdr); if (cnt > 0 && !pf_pull_hdr(m, off + sizeof(*th), opts, cnt, NULL, NULL, pd->af)) return (0); for (; cnt > 0; cnt -= optlen, optp += optlen) { startoff = optp - opts; opt = optp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { if (cnt < 2) break; optlen = optp[1]; if (optlen < 2 || optlen > cnt) break; } switch (opt) { case TCPOPT_MAXSEG: mss = (u_int16_t *)(optp + 2); if ((ntohs(*mss)) > pd->act.max_mss) { pf_patch_16_unaligned(m, &th->th_sum, mss, htons(pd->act.max_mss), PF_ALGNMNT(startoff), 0); m_copyback(m, off + sizeof(*th), thoff - sizeof(*th), opts); m_copyback(m, off, sizeof(*th), (caddr_t)th); } break; default: break; } } return (0); } static int pf_scan_sctp(struct mbuf *m, int ipoff, int off, struct pf_pdesc *pd, struct pfi_kkif *kif) { struct sctp_chunkhdr ch = { }; int chunk_off = sizeof(struct sctphdr); int chunk_start; int ret; while (off + chunk_off < pd->tot_len) { if (!pf_pull_hdr(m, off + chunk_off, &ch, sizeof(ch), NULL, NULL, pd->af)) return (PF_DROP); /* Length includes the header, this must be at least 4. */ if (ntohs(ch.chunk_length) < 4) return (PF_DROP); chunk_start = chunk_off; chunk_off += roundup(ntohs(ch.chunk_length), 4); switch (ch.chunk_type) { case SCTP_INITIATION: case SCTP_INITIATION_ACK: { struct sctp_init_chunk init; if (!pf_pull_hdr(m, off + chunk_start, &init, sizeof(init), NULL, NULL, pd->af)) return (PF_DROP); /* * RFC 9620, Section 3.3.2, "The Initiate Tag is allowed to have * any value except 0." */ if (init.init.initiate_tag == 0) return (PF_DROP); if (init.init.num_inbound_streams == 0) return (PF_DROP); if (init.init.num_outbound_streams == 0) return (PF_DROP); if (ntohl(init.init.a_rwnd) < SCTP_MIN_RWND) return (PF_DROP); /* * RFC 9260, Section 3.1, INIT chunks MUST have zero * verification tag. */ if (ch.chunk_type == SCTP_INITIATION && pd->hdr.sctp.v_tag != 0) return (PF_DROP); pd->sctp_initiate_tag = init.init.initiate_tag; if (ch.chunk_type == SCTP_INITIATION) pd->sctp_flags |= PFDESC_SCTP_INIT; else pd->sctp_flags |= PFDESC_SCTP_INIT_ACK; ret = pf_multihome_scan_init(m, off + chunk_start, ntohs(init.ch.chunk_length), pd, kif); if (ret != PF_PASS) return (ret); break; } case SCTP_ABORT_ASSOCIATION: pd->sctp_flags |= PFDESC_SCTP_ABORT; break; case SCTP_SHUTDOWN: case SCTP_SHUTDOWN_ACK: pd->sctp_flags |= PFDESC_SCTP_SHUTDOWN; break; case SCTP_SHUTDOWN_COMPLETE: pd->sctp_flags |= PFDESC_SCTP_SHUTDOWN_COMPLETE; break; case SCTP_COOKIE_ECHO: pd->sctp_flags |= PFDESC_SCTP_COOKIE; break; case SCTP_COOKIE_ACK: pd->sctp_flags |= PFDESC_SCTP_COOKIE_ACK; break; case SCTP_DATA: pd->sctp_flags |= PFDESC_SCTP_DATA; break; case SCTP_HEARTBEAT_REQUEST: pd->sctp_flags |= PFDESC_SCTP_HEARTBEAT; break; case SCTP_HEARTBEAT_ACK: pd->sctp_flags |= PFDESC_SCTP_HEARTBEAT_ACK; break; case SCTP_ASCONF: pd->sctp_flags |= PFDESC_SCTP_ASCONF; ret = pf_multihome_scan_asconf(m, off + chunk_start, ntohs(ch.chunk_length), pd, kif); if (ret != PF_PASS) return (ret); break; default: pd->sctp_flags |= PFDESC_SCTP_OTHER; break; } } /* Validate chunk lengths vs. packet length. */ if (off + chunk_off != pd->tot_len) return (PF_DROP); /* * INIT, INIT_ACK or SHUTDOWN_COMPLETE chunks must always be the only * one in a packet. 
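 * The checks below rely on each chunk type setting one dedicated
 * PFDESC_SCTP_ bit in pf_scan_sctp(): "the INIT bit is set and any bit
 * outside it is also set" is a compact way of saying that an INIT,
 * INIT_ACK or SHUTDOWN_COMPLETE chunk was bundled with some other chunk
 * type, which is what gets dropped.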
	 */
	if ((pd->sctp_flags & PFDESC_SCTP_INIT) &&
	    (pd->sctp_flags & ~PFDESC_SCTP_INIT))
		return (PF_DROP);
	if ((pd->sctp_flags & PFDESC_SCTP_INIT_ACK) &&
	    (pd->sctp_flags & ~PFDESC_SCTP_INIT_ACK))
		return (PF_DROP);
	if ((pd->sctp_flags & PFDESC_SCTP_SHUTDOWN_COMPLETE) &&
	    (pd->sctp_flags & ~PFDESC_SCTP_SHUTDOWN_COMPLETE))
		return (PF_DROP);

	return (PF_PASS);
}

int
pf_normalize_sctp(int dir, struct pfi_kkif *kif, struct mbuf *m, int ipoff,
    int off, void *h, struct pf_pdesc *pd)
{
	struct pf_krule	*r, *rm = NULL;
	struct sctphdr	*sh = &pd->hdr.sctp;
	u_short		 reason;
	sa_family_t	 af = pd->af;
	int		 srs;

	PF_RULES_RASSERT();

	/* Unconditionally scan the SCTP packet, because we need to look for
	 * things like shutdown and asconf chunks. */
	if (pf_scan_sctp(m, ipoff, off, pd, kif) != PF_PASS)
		goto sctp_drop;

	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
	/* Check if there are any scrub rules. Lack of scrub rules means
	 * enforced packet normalization operation, just like in OpenBSD. */
	srs = (r != NULL);
	while (r != NULL) {
		pf_counter_u64_add(&r->evaluations, 1);
		if (pfi_kkif_match(r->kif, kif) == r->ifnot)
			r = r->skip[PF_SKIP_IFP].ptr;
		else if (r->direction && r->direction != dir)
			r = r->skip[PF_SKIP_DIR].ptr;
		else if (r->af && r->af != af)
			r = r->skip[PF_SKIP_AF].ptr;
		else if (r->proto && r->proto != pd->proto)
			r = r->skip[PF_SKIP_PROTO].ptr;
		else if (PF_MISMATCHAW(&r->src.addr, pd->src, af,
		    r->src.neg, kif, M_GETFIB(m)))
			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
		else if (r->src.port_op && !pf_match_port(r->src.port_op,
		    r->src.port[0], r->src.port[1], sh->src_port))
			r = r->skip[PF_SKIP_SRC_PORT].ptr;
		else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af,
		    r->dst.neg, NULL, M_GETFIB(m)))
			r = r->skip[PF_SKIP_DST_ADDR].ptr;
		else if (r->dst.port_op && !pf_match_port(r->dst.port_op,
		    r->dst.port[0], r->dst.port[1], sh->dest_port))
			r = r->skip[PF_SKIP_DST_PORT].ptr;
		else {
			rm = r;
			break;
		}
	}

	if (srs) {
		/* With scrub rules present SCTP normalization happens only
		 * if one of the rules has matched and it's not a "no scrub"
		 * rule */
		if (rm == NULL || rm->action == PF_NOSCRUB)
			return (PF_PASS);

		pf_counter_u64_critical_enter();
		pf_counter_u64_add_protected(&r->packets[dir == PF_OUT], 1);
		pf_counter_u64_add_protected(&r->bytes[dir == PF_OUT],
		    pd->tot_len);
		pf_counter_u64_critical_exit();
	}

	/* Verify we're a multiple of 4 bytes long */
	if ((pd->tot_len - off - sizeof(struct sctphdr)) % 4)
		goto sctp_drop;

	/* INIT chunk needs to be the only chunk */
	if (pd->sctp_flags & PFDESC_SCTP_INIT)
		if (pd->sctp_flags & ~PFDESC_SCTP_INIT)
			goto sctp_drop;

	return (PF_PASS);

sctp_drop:
	REASON_SET(&reason, PFRES_NORM);
	if (rm != NULL && r->log)
		PFLOG_PACKET(kif, m, AF_INET, reason, r, NULL, NULL, pd,
		    1);

	return (PF_DROP);
}

#ifdef INET
void
pf_scrub_ip(struct mbuf **m0, struct pf_pdesc *pd)
{
	struct mbuf	*m = *m0;
	struct ip	*h = mtod(m, struct ip *);

	/* Clear IP_DF if no-df was requested */
	if (pd->act.flags & PFSTATE_NODF && h->ip_off & htons(IP_DF)) {
		u_int16_t ip_off = h->ip_off;

		h->ip_off &= htons(~IP_DF);
		h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0);
	}

	/* Enforce a minimum ttl, may cause endless packet loops */
	if (pd->act.min_ttl && h->ip_ttl < pd->act.min_ttl) {
		u_int16_t ip_ttl = h->ip_ttl;

		h->ip_ttl = pd->act.min_ttl;
		h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_ttl, h->ip_ttl, 0);
	}

	/* Enforce tos */
	if (pd->act.flags & PFSTATE_SETTOS) {
		u_int16_t	ov, nv;

		ov = *(u_int16_t *)h;
		h->ip_tos = pd->act.set_tos | (h->ip_tos & IPTOS_ECN_MASK);
		nv = *(u_int16_t *)h;

		h->ip_sum = pf_cksum_fixup(h->ip_sum, ov, nv, 0);
	}

	/* random-id, but not for fragments */
	if (pd->act.flags & PFSTATE_RANDOMID && !(h->ip_off & ~htons(IP_DF))) {
		uint16_t ip_id = h->ip_id;

		ip_fillid(h);
		h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_id, h->ip_id, 0);
	}
}
#endif /* INET */

#ifdef INET6
void
pf_scrub_ip6(struct mbuf **m0, struct pf_pdesc *pd)
{
	struct mbuf	*m = *m0;
	struct ip6_hdr	*h = mtod(m, struct ip6_hdr *);

	/* Enforce a minimum ttl, may cause endless packet loops */
	if (pd->act.min_ttl && h->ip6_hlim < pd->act.min_ttl)
		h->ip6_hlim = pd->act.min_ttl;

	/* Enforce tos. Set traffic class bits */
	if (pd->act.flags & PFSTATE_SETTOS) {
		h->ip6_flow &= IPV6_FLOWLABEL_MASK | IPV6_VERSION_MASK;
		h->ip6_flow |= htonl((pd->act.set_tos | IPV6_ECN(h)) << 20);
	}
}
#endif