Index: user/dchagin/lemul/gnu/lib/libdialog/Makefile =================================================================== --- user/dchagin/lemul/gnu/lib/libdialog/Makefile (revision 274240) +++ user/dchagin/lemul/gnu/lib/libdialog/Makefile (revision 274241) @@ -1,20 +1,23 @@ # $FreeBSD$ DIALOG= ${.CURDIR}/../../../contrib/dialog LIB= dialog SHLIB_MAJOR= 8 SRCS= argv.c arrows.c buildlist.c buttons.c calendar.c checklist.c \ columns.c dlg_keys.c editbox.c fselect.c formbox.c guage.c \ help.c inputbox.c inputstr.c menubox.c mixedform.c \ mixedgauge.c mouse.c mousewget.c msgbox.c pause.c prgbox.c \ progressbox.c rangebox.c rc.c tailbox.c textbox.c timebox.c \ trace.c treeview.c ui_getc.c util.c version.c yesno.c INCS= dialog.h dlg_colors.h dlg_config.h dlg_keys.h MAN= dialog.3 +DPADD= ${LIBNCURSESW} ${LIBM} +LDADD= -lncursesw -lm + CFLAGS+= -I${.CURDIR} -I${DIALOG} -D_XOPEN_SOURCE_EXTENDED -DGCC_UNUSED=__unused .PATH: ${DIALOG} WARNS?= 1 .include Index: user/dchagin/lemul/gnu/lib =================================================================== --- user/dchagin/lemul/gnu/lib (revision 274240) +++ user/dchagin/lemul/gnu/lib (revision 274241) Property changes on: user/dchagin/lemul/gnu/lib ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/gnu/lib:r273766-274240 Index: user/dchagin/lemul/sys/dev/hyperv/netvsc/hv_net_vsc.c =================================================================== --- user/dchagin/lemul/sys/dev/hyperv/netvsc/hv_net_vsc.c (revision 274240) +++ user/dchagin/lemul/sys/dev/hyperv/netvsc/hv_net_vsc.c (revision 274241) @@ -1,1144 +1,1144 @@ /*- * Copyright (c) 2009-2012 Microsoft Corp. * Copyright (c) 2010-2012 Citrix Inc. * Copyright (c) 2012 NetApp Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ /** * HyperV vmbus network VSC (virtual services client) module * */ #include #include #include #include #include #include #include #include #include #include #include "hv_net_vsc.h" #include "hv_rndis.h" #include "hv_rndis_filter.h" /* * Forward declarations */ static void hv_nv_on_channel_callback(void *context); static int hv_nv_init_send_buffer_with_net_vsp(struct hv_device *device); static int hv_nv_init_rx_buffer_with_net_vsp(struct hv_device *device); static int hv_nv_destroy_send_buffer(netvsc_dev *net_dev); static int hv_nv_destroy_rx_buffer(netvsc_dev *net_dev); static int hv_nv_connect_to_vsp(struct hv_device *device); static void hv_nv_on_send_completion(struct hv_device *device, hv_vm_packet_descriptor *pkt); static void hv_nv_on_receive(struct hv_device *device, hv_vm_packet_descriptor *pkt); static void hv_nv_send_receive_completion(struct hv_device *device, uint64_t tid); /* * */ static inline netvsc_dev * hv_nv_alloc_net_device(struct hv_device *device) { netvsc_dev *net_dev; hn_softc_t *sc = device_get_softc(device->device); net_dev = malloc(sizeof(netvsc_dev), M_DEVBUF, M_NOWAIT | M_ZERO); if (net_dev == NULL) { return (NULL); } net_dev->dev = device; net_dev->destroy = FALSE; sc->net_dev = net_dev; return (net_dev); } /* * */ static inline netvsc_dev * hv_nv_get_outbound_net_device(struct hv_device *device) { hn_softc_t *sc = device_get_softc(device->device); netvsc_dev *net_dev = sc->net_dev;; if ((net_dev != NULL) && net_dev->destroy) { return (NULL); } return (net_dev); } /* * */ static inline netvsc_dev * hv_nv_get_inbound_net_device(struct hv_device *device) { hn_softc_t *sc = device_get_softc(device->device); netvsc_dev *net_dev = sc->net_dev;; if (net_dev == NULL) { return (net_dev); } /* * When the device is being destroyed; we only * permit incoming packets if and only if there * are outstanding sends. */ if (net_dev->destroy && net_dev->num_outstanding_sends == 0) { return (NULL); } return (net_dev); } /* * Net VSC initialize receive buffer with net VSP * * Net VSP: Network virtual services client, also known as the * Hyper-V extensible switch and the synthetic data path. */ static int hv_nv_init_rx_buffer_with_net_vsp(struct hv_device *device) { netvsc_dev *net_dev; nvsp_msg *init_pkt; int ret = 0; net_dev = hv_nv_get_outbound_net_device(device); if (!net_dev) { return (ENODEV); } net_dev->rx_buf = contigmalloc(net_dev->rx_buf_size, M_DEVBUF, M_ZERO, 0UL, BUS_SPACE_MAXADDR, PAGE_SIZE, 0); if (net_dev->rx_buf == NULL) { ret = ENOMEM; goto cleanup; } /* * Establish the GPADL handle for this buffer on this channel. * Note: This call uses the vmbus connection rather than the * channel to establish the gpadl handle. * GPADL: Guest physical address descriptor list. */ ret = hv_vmbus_channel_establish_gpadl( device->channel, net_dev->rx_buf, net_dev->rx_buf_size, &net_dev->rx_buf_gpadl_handle); if (ret != 0) { goto cleanup; } /* sema_wait(&ext->channel_init_sema); KYS CHECK */ /* Notify the NetVsp of the gpadl handle */ init_pkt = &net_dev->channel_init_packet; memset(init_pkt, 0, sizeof(nvsp_msg)); init_pkt->hdr.msg_type = nvsp_msg_1_type_send_rx_buf; init_pkt->msgs.vers_1_msgs.send_rx_buf.gpadl_handle = net_dev->rx_buf_gpadl_handle; init_pkt->msgs.vers_1_msgs.send_rx_buf.id = NETVSC_RECEIVE_BUFFER_ID; /* Send the gpadl notification request */ ret = hv_vmbus_channel_send_packet(device->channel, init_pkt, sizeof(nvsp_msg), (uint64_t)(uintptr_t)init_pkt, HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); if (ret != 0) { goto cleanup; } sema_wait(&net_dev->channel_init_sema); /* Check the response */ if (init_pkt->msgs.vers_1_msgs.send_rx_buf_complete.status != nvsp_status_success) { ret = EINVAL; goto cleanup; } net_dev->rx_section_count = init_pkt->msgs.vers_1_msgs.send_rx_buf_complete.num_sections; net_dev->rx_sections = malloc(net_dev->rx_section_count * sizeof(nvsp_1_rx_buf_section), M_DEVBUF, M_NOWAIT); if (net_dev->rx_sections == NULL) { ret = EINVAL; goto cleanup; } memcpy(net_dev->rx_sections, init_pkt->msgs.vers_1_msgs.send_rx_buf_complete.sections, net_dev->rx_section_count * sizeof(nvsp_1_rx_buf_section)); /* * For first release, there should only be 1 section that represents * the entire receive buffer */ if (net_dev->rx_section_count != 1 || net_dev->rx_sections->offset != 0) { ret = EINVAL; goto cleanup; } goto exit; cleanup: hv_nv_destroy_rx_buffer(net_dev); exit: return (ret); } /* * Net VSC initialize send buffer with net VSP */ static int hv_nv_init_send_buffer_with_net_vsp(struct hv_device *device) { netvsc_dev *net_dev; nvsp_msg *init_pkt; int ret = 0; net_dev = hv_nv_get_outbound_net_device(device); if (!net_dev) { return (ENODEV); } net_dev->send_buf = contigmalloc(net_dev->send_buf_size, M_DEVBUF, M_ZERO, 0UL, BUS_SPACE_MAXADDR, PAGE_SIZE, 0); if (net_dev->send_buf == NULL) { ret = ENOMEM; goto cleanup; } /* * Establish the gpadl handle for this buffer on this channel. * Note: This call uses the vmbus connection rather than the * channel to establish the gpadl handle. */ ret = hv_vmbus_channel_establish_gpadl(device->channel, net_dev->send_buf, net_dev->send_buf_size, &net_dev->send_buf_gpadl_handle); if (ret != 0) { goto cleanup; } /* Notify the NetVsp of the gpadl handle */ init_pkt = &net_dev->channel_init_packet; memset(init_pkt, 0, sizeof(nvsp_msg)); init_pkt->hdr.msg_type = nvsp_msg_1_type_send_send_buf; init_pkt->msgs.vers_1_msgs.send_rx_buf.gpadl_handle = net_dev->send_buf_gpadl_handle; init_pkt->msgs.vers_1_msgs.send_rx_buf.id = NETVSC_SEND_BUFFER_ID; /* Send the gpadl notification request */ ret = hv_vmbus_channel_send_packet(device->channel, init_pkt, sizeof(nvsp_msg), (uint64_t)(uintptr_t)init_pkt, HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); if (ret != 0) { goto cleanup; } sema_wait(&net_dev->channel_init_sema); /* Check the response */ if (init_pkt->msgs.vers_1_msgs.send_send_buf_complete.status != nvsp_status_success) { ret = EINVAL; goto cleanup; } net_dev->send_section_size = init_pkt->msgs.vers_1_msgs.send_send_buf_complete.section_size; goto exit; cleanup: hv_nv_destroy_send_buffer(net_dev); exit: return (ret); } /* * Net VSC destroy receive buffer */ static int hv_nv_destroy_rx_buffer(netvsc_dev *net_dev) { nvsp_msg *revoke_pkt; int ret = 0; /* * If we got a section count, it means we received a * send_rx_buf_complete msg * (ie sent nvsp_msg_1_type_send_rx_buf msg) therefore, * we need to send a revoke msg here */ if (net_dev->rx_section_count) { /* Send the revoke receive buffer */ revoke_pkt = &net_dev->revoke_packet; memset(revoke_pkt, 0, sizeof(nvsp_msg)); revoke_pkt->hdr.msg_type = nvsp_msg_1_type_revoke_rx_buf; revoke_pkt->msgs.vers_1_msgs.revoke_rx_buf.id = NETVSC_RECEIVE_BUFFER_ID; ret = hv_vmbus_channel_send_packet(net_dev->dev->channel, revoke_pkt, sizeof(nvsp_msg), (uint64_t)(uintptr_t)revoke_pkt, HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0); /* * If we failed here, we might as well return and have a leak * rather than continue and a bugchk */ if (ret != 0) { return (ret); } } /* Tear down the gpadl on the vsp end */ if (net_dev->rx_buf_gpadl_handle) { ret = hv_vmbus_channel_teardown_gpdal(net_dev->dev->channel, net_dev->rx_buf_gpadl_handle); /* * If we failed here, we might as well return and have a leak * rather than continue and a bugchk */ if (ret != 0) { return (ret); } net_dev->rx_buf_gpadl_handle = 0; } if (net_dev->rx_buf) { /* Free up the receive buffer */ contigfree(net_dev->rx_buf, net_dev->rx_buf_size, M_DEVBUF); net_dev->rx_buf = NULL; } if (net_dev->rx_sections) { free(net_dev->rx_sections, M_DEVBUF); net_dev->rx_sections = NULL; net_dev->rx_section_count = 0; } return (ret); } /* * Net VSC destroy send buffer */ static int hv_nv_destroy_send_buffer(netvsc_dev *net_dev) { nvsp_msg *revoke_pkt; int ret = 0; /* * If we got a section count, it means we received a * send_rx_buf_complete msg * (ie sent nvsp_msg_1_type_send_rx_buf msg) therefore, * we need to send a revoke msg here */ if (net_dev->send_section_size) { /* Send the revoke send buffer */ revoke_pkt = &net_dev->revoke_packet; memset(revoke_pkt, 0, sizeof(nvsp_msg)); revoke_pkt->hdr.msg_type = nvsp_msg_1_type_revoke_send_buf; revoke_pkt->msgs.vers_1_msgs.revoke_send_buf.id = NETVSC_SEND_BUFFER_ID; ret = hv_vmbus_channel_send_packet(net_dev->dev->channel, revoke_pkt, sizeof(nvsp_msg), (uint64_t)(uintptr_t)revoke_pkt, HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0); /* * If we failed here, we might as well return and have a leak * rather than continue and a bugchk */ if (ret != 0) { return (ret); } } /* Tear down the gpadl on the vsp end */ if (net_dev->send_buf_gpadl_handle) { ret = hv_vmbus_channel_teardown_gpdal(net_dev->dev->channel, net_dev->send_buf_gpadl_handle); /* * If we failed here, we might as well return and have a leak * rather than continue and a bugchk */ if (ret != 0) { return (ret); } net_dev->send_buf_gpadl_handle = 0; } if (net_dev->send_buf) { /* Free up the receive buffer */ contigfree(net_dev->send_buf, net_dev->send_buf_size, M_DEVBUF); net_dev->send_buf = NULL; } return (ret); } /* * Attempt to negotiate the caller-specified NVSP version * * For NVSP v2, Server 2008 R2 does not set * init_pkt->msgs.init_msgs.init_compl.negotiated_prot_vers * to the negotiated version, so we cannot rely on that. */ static int hv_nv_negotiate_nvsp_protocol(struct hv_device *device, netvsc_dev *net_dev, uint32_t nvsp_ver) { nvsp_msg *init_pkt; int ret; init_pkt = &net_dev->channel_init_packet; memset(init_pkt, 0, sizeof(nvsp_msg)); init_pkt->hdr.msg_type = nvsp_msg_type_init; /* * Specify parameter as the only acceptable protocol version */ init_pkt->msgs.init_msgs.init.p1.protocol_version = nvsp_ver; init_pkt->msgs.init_msgs.init.protocol_version_2 = nvsp_ver; /* Send the init request */ ret = hv_vmbus_channel_send_packet(device->channel, init_pkt, sizeof(nvsp_msg), (uint64_t)(uintptr_t)init_pkt, HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); if (ret != 0) return (-1); sema_wait(&net_dev->channel_init_sema); if (init_pkt->msgs.init_msgs.init_compl.status != nvsp_status_success) return (EINVAL); return (0); } /* * Send NDIS version 2 config packet containing MTU. * * Not valid for NDIS version 1. */ static int hv_nv_send_ndis_config(struct hv_device *device, uint32_t mtu) { netvsc_dev *net_dev; nvsp_msg *init_pkt; int ret; net_dev = hv_nv_get_outbound_net_device(device); if (!net_dev) return (-ENODEV); /* * Set up configuration packet, write MTU * Indicate we are capable of handling VLAN tags */ init_pkt = &net_dev->channel_init_packet; memset(init_pkt, 0, sizeof(nvsp_msg)); init_pkt->hdr.msg_type = nvsp_msg_2_type_send_ndis_config; init_pkt->msgs.vers_2_msgs.send_ndis_config.mtu = mtu; init_pkt-> msgs.vers_2_msgs.send_ndis_config.capabilities.u1.u2.ieee8021q = 1; /* Send the configuration packet */ ret = hv_vmbus_channel_send_packet(device->channel, init_pkt, sizeof(nvsp_msg), (uint64_t)(uintptr_t)init_pkt, HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0); if (ret != 0) return (-EINVAL); return (0); } /* * Net VSC connect to VSP */ static int hv_nv_connect_to_vsp(struct hv_device *device) { netvsc_dev *net_dev; nvsp_msg *init_pkt; uint32_t nvsp_vers; uint32_t ndis_version; int ret = 0; device_t dev = device->device; hn_softc_t *sc = device_get_softc(dev); - struct ifnet *ifp = sc->arpcom.ac_ifp; + struct ifnet *ifp = sc->hn_ifp; net_dev = hv_nv_get_outbound_net_device(device); if (!net_dev) { return (ENODEV); } /* * Negotiate the NVSP version. Try NVSP v2 first. */ nvsp_vers = NVSP_PROTOCOL_VERSION_2; ret = hv_nv_negotiate_nvsp_protocol(device, net_dev, nvsp_vers); if (ret != 0) { /* NVSP v2 failed, try NVSP v1 */ nvsp_vers = NVSP_PROTOCOL_VERSION_1; ret = hv_nv_negotiate_nvsp_protocol(device, net_dev, nvsp_vers); if (ret != 0) { /* NVSP v1 failed, return bad status */ return (ret); } } net_dev->nvsp_version = nvsp_vers; /* * Set the MTU if supported by this NVSP protocol version * This needs to be right after the NVSP init message per Haiyang */ if (nvsp_vers >= NVSP_PROTOCOL_VERSION_2) ret = hv_nv_send_ndis_config(device, ifp->if_mtu); /* * Send the NDIS version */ init_pkt = &net_dev->channel_init_packet; memset(init_pkt, 0, sizeof(nvsp_msg)); /* * Updated to version 5.1, minimum, for VLAN per Haiyang */ ndis_version = NDIS_VERSION; init_pkt->hdr.msg_type = nvsp_msg_1_type_send_ndis_vers; init_pkt->msgs.vers_1_msgs.send_ndis_vers.ndis_major_vers = (ndis_version & 0xFFFF0000) >> 16; init_pkt->msgs.vers_1_msgs.send_ndis_vers.ndis_minor_vers = ndis_version & 0xFFFF; /* Send the init request */ ret = hv_vmbus_channel_send_packet(device->channel, init_pkt, sizeof(nvsp_msg), (uint64_t)(uintptr_t)init_pkt, HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, 0); if (ret != 0) { goto cleanup; } /* * TODO: BUGBUG - We have to wait for the above msg since the netvsp * uses KMCL which acknowledges packet (completion packet) * since our Vmbus always set the * HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED flag */ /* sema_wait(&NetVscChannel->channel_init_sema); */ /* Post the big receive buffer to NetVSP */ ret = hv_nv_init_rx_buffer_with_net_vsp(device); if (ret == 0) ret = hv_nv_init_send_buffer_with_net_vsp(device); cleanup: return (ret); } /* * Net VSC disconnect from VSP */ static void hv_nv_disconnect_from_vsp(netvsc_dev *net_dev) { hv_nv_destroy_rx_buffer(net_dev); hv_nv_destroy_send_buffer(net_dev); } /* * Net VSC on device add * * Callback when the device belonging to this driver is added */ netvsc_dev * hv_nv_on_device_add(struct hv_device *device, void *additional_info) { netvsc_dev *net_dev; netvsc_packet *packet; netvsc_packet *next_packet; int i, ret = 0; net_dev = hv_nv_alloc_net_device(device); if (!net_dev) goto cleanup; /* Initialize the NetVSC channel extension */ net_dev->rx_buf_size = NETVSC_RECEIVE_BUFFER_SIZE; mtx_init(&net_dev->rx_pkt_list_lock, "HV-RPL", NULL, MTX_SPIN | MTX_RECURSE); net_dev->send_buf_size = NETVSC_SEND_BUFFER_SIZE; /* Same effect as STAILQ_HEAD_INITIALIZER() static initializer */ STAILQ_INIT(&net_dev->myrx_packet_list); /* * malloc a sufficient number of netvsc_packet buffers to hold * a packet list. Add them to the netvsc device packet queue. */ for (i=0; i < NETVSC_RECEIVE_PACKETLIST_COUNT; i++) { packet = malloc(sizeof(netvsc_packet) + (NETVSC_RECEIVE_SG_COUNT * sizeof(hv_vmbus_page_buffer)), M_DEVBUF, M_NOWAIT | M_ZERO); if (!packet) { break; } STAILQ_INSERT_TAIL(&net_dev->myrx_packet_list, packet, mylist_entry); } sema_init(&net_dev->channel_init_sema, 0, "netdev_sema"); /* * Open the channel */ ret = hv_vmbus_channel_open(device->channel, NETVSC_DEVICE_RING_BUFFER_SIZE, NETVSC_DEVICE_RING_BUFFER_SIZE, NULL, 0, hv_nv_on_channel_callback, device); if (ret != 0) goto cleanup; /* * Connect with the NetVsp */ ret = hv_nv_connect_to_vsp(device); if (ret != 0) goto close; return (net_dev); close: /* Now, we can close the channel safely */ hv_vmbus_channel_close(device->channel); cleanup: /* * Free the packet buffers on the netvsc device packet queue. * Release other resources. */ if (net_dev) { sema_destroy(&net_dev->channel_init_sema); packet = STAILQ_FIRST(&net_dev->myrx_packet_list); while (packet != NULL) { next_packet = STAILQ_NEXT(packet, mylist_entry); free(packet, M_DEVBUF); packet = next_packet; } /* Reset the list to initial state */ STAILQ_INIT(&net_dev->myrx_packet_list); mtx_destroy(&net_dev->rx_pkt_list_lock); free(net_dev, M_DEVBUF); } return (NULL); } /* * Net VSC on device remove */ int hv_nv_on_device_remove(struct hv_device *device, boolean_t destroy_channel) { netvsc_packet *net_vsc_pkt; netvsc_packet *next_net_vsc_pkt; hn_softc_t *sc = device_get_softc(device->device); netvsc_dev *net_dev = sc->net_dev;; /* Stop outbound traffic ie sends and receives completions */ mtx_lock(&device->channel->inbound_lock); net_dev->destroy = TRUE; mtx_unlock(&device->channel->inbound_lock); /* Wait for all send completions */ while (net_dev->num_outstanding_sends) { DELAY(100); } hv_nv_disconnect_from_vsp(net_dev); /* At this point, no one should be accessing net_dev except in here */ /* Now, we can close the channel safely */ if (!destroy_channel) { device->channel->state = HV_CHANNEL_CLOSING_NONDESTRUCTIVE_STATE; } hv_vmbus_channel_close(device->channel); /* Release all resources */ net_vsc_pkt = STAILQ_FIRST(&net_dev->myrx_packet_list); while (net_vsc_pkt != NULL) { next_net_vsc_pkt = STAILQ_NEXT(net_vsc_pkt, mylist_entry); free(net_vsc_pkt, M_DEVBUF); net_vsc_pkt = next_net_vsc_pkt; } /* Reset the list to initial state */ STAILQ_INIT(&net_dev->myrx_packet_list); mtx_destroy(&net_dev->rx_pkt_list_lock); sema_destroy(&net_dev->channel_init_sema); free(net_dev, M_DEVBUF); return (0); } /* * Net VSC on send completion */ static void hv_nv_on_send_completion(struct hv_device *device, hv_vm_packet_descriptor *pkt) { netvsc_dev *net_dev; nvsp_msg *nvsp_msg_pkt; netvsc_packet *net_vsc_pkt; net_dev = hv_nv_get_inbound_net_device(device); if (!net_dev) { return; } nvsp_msg_pkt = (nvsp_msg *)((unsigned long)pkt + (pkt->data_offset8 << 3)); if (nvsp_msg_pkt->hdr.msg_type == nvsp_msg_type_init_complete || nvsp_msg_pkt->hdr.msg_type == nvsp_msg_1_type_send_rx_buf_complete || nvsp_msg_pkt->hdr.msg_type == nvsp_msg_1_type_send_send_buf_complete) { /* Copy the response back */ memcpy(&net_dev->channel_init_packet, nvsp_msg_pkt, sizeof(nvsp_msg)); sema_post(&net_dev->channel_init_sema); } else if (nvsp_msg_pkt->hdr.msg_type == nvsp_msg_1_type_send_rndis_pkt_complete) { /* Get the send context */ net_vsc_pkt = (netvsc_packet *)(unsigned long)pkt->transaction_id; /* Notify the layer above us */ net_vsc_pkt->compl.send.on_send_completion( net_vsc_pkt->compl.send.send_completion_context); atomic_subtract_int(&net_dev->num_outstanding_sends, 1); } } /* * Net VSC on send * Sends a packet on the specified Hyper-V device. * Returns 0 on success, non-zero on failure. */ int hv_nv_on_send(struct hv_device *device, netvsc_packet *pkt) { netvsc_dev *net_dev; nvsp_msg send_msg; int ret; net_dev = hv_nv_get_outbound_net_device(device); if (!net_dev) return (ENODEV); send_msg.hdr.msg_type = nvsp_msg_1_type_send_rndis_pkt; if (pkt->is_data_pkt) { /* 0 is RMC_DATA */ send_msg.msgs.vers_1_msgs.send_rndis_pkt.chan_type = 0; } else { /* 1 is RMC_CONTROL */ send_msg.msgs.vers_1_msgs.send_rndis_pkt.chan_type = 1; } /* Not using send buffer section */ send_msg.msgs.vers_1_msgs.send_rndis_pkt.send_buf_section_idx = 0xFFFFFFFF; send_msg.msgs.vers_1_msgs.send_rndis_pkt.send_buf_section_size = 0; if (pkt->page_buf_count) { ret = hv_vmbus_channel_send_packet_pagebuffer(device->channel, pkt->page_buffers, pkt->page_buf_count, &send_msg, sizeof(nvsp_msg), (uint64_t)(uintptr_t)pkt); } else { ret = hv_vmbus_channel_send_packet(device->channel, &send_msg, sizeof(nvsp_msg), (uint64_t)(uintptr_t)pkt, HV_VMBUS_PACKET_TYPE_DATA_IN_BAND, HV_VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); } /* Record outstanding send only if send_packet() succeeded */ if (ret == 0) atomic_add_int(&net_dev->num_outstanding_sends, 1); return (ret); } /* * Net VSC on receive * * In the FreeBSD Hyper-V virtual world, this function deals exclusively * with virtual addresses. */ static void hv_nv_on_receive(struct hv_device *device, hv_vm_packet_descriptor *pkt) { netvsc_dev *net_dev; hv_vm_transfer_page_packet_header *vm_xfer_page_pkt; nvsp_msg *nvsp_msg_pkt; netvsc_packet *net_vsc_pkt = NULL; unsigned long start; xfer_page_packet *xfer_page_pkt = NULL; STAILQ_HEAD(PKT_LIST, netvsc_packet_) mylist_head = STAILQ_HEAD_INITIALIZER(mylist_head); int count = 0; int i = 0; net_dev = hv_nv_get_inbound_net_device(device); if (!net_dev) return; /* * All inbound packets other than send completion should be * xfer page packet. */ if (pkt->type != HV_VMBUS_PACKET_TYPE_DATA_USING_TRANSFER_PAGES) return; nvsp_msg_pkt = (nvsp_msg *)((unsigned long)pkt + (pkt->data_offset8 << 3)); /* Make sure this is a valid nvsp packet */ if (nvsp_msg_pkt->hdr.msg_type != nvsp_msg_1_type_send_rndis_pkt) return; vm_xfer_page_pkt = (hv_vm_transfer_page_packet_header *)pkt; if (vm_xfer_page_pkt->transfer_page_set_id != NETVSC_RECEIVE_BUFFER_ID) { return; } STAILQ_INIT(&mylist_head); /* * Grab free packets (range count + 1) to represent this xfer page * packet. +1 to represent the xfer page packet itself. We grab it * here so that we know exactly how many we can fulfill. */ mtx_lock_spin(&net_dev->rx_pkt_list_lock); while (!STAILQ_EMPTY(&net_dev->myrx_packet_list)) { net_vsc_pkt = STAILQ_FIRST(&net_dev->myrx_packet_list); STAILQ_REMOVE_HEAD(&net_dev->myrx_packet_list, mylist_entry); STAILQ_INSERT_TAIL(&mylist_head, net_vsc_pkt, mylist_entry); if (++count == vm_xfer_page_pkt->range_count + 1) break; } mtx_unlock_spin(&net_dev->rx_pkt_list_lock); /* * We need at least 2 netvsc pkts (1 to represent the xfer page * and at least 1 for the range) i.e. we can handle some of the * xfer page packet ranges... */ if (count < 2) { /* Return netvsc packet to the freelist */ mtx_lock_spin(&net_dev->rx_pkt_list_lock); for (i=count; i != 0; i--) { net_vsc_pkt = STAILQ_FIRST(&mylist_head); STAILQ_REMOVE_HEAD(&mylist_head, mylist_entry); STAILQ_INSERT_TAIL(&net_dev->myrx_packet_list, net_vsc_pkt, mylist_entry); } mtx_unlock_spin(&net_dev->rx_pkt_list_lock); hv_nv_send_receive_completion(device, vm_xfer_page_pkt->d.transaction_id); return; } /* Take the first packet in the list */ xfer_page_pkt = (xfer_page_packet *)STAILQ_FIRST(&mylist_head); STAILQ_REMOVE_HEAD(&mylist_head, mylist_entry); /* This is how many data packets we can supply */ xfer_page_pkt->count = count - 1; /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ for (i=0; i < (count - 1); i++) { net_vsc_pkt = STAILQ_FIRST(&mylist_head); STAILQ_REMOVE_HEAD(&mylist_head, mylist_entry); /* * Initialize the netvsc packet */ net_vsc_pkt->xfer_page_pkt = xfer_page_pkt; net_vsc_pkt->compl.rx.rx_completion_context = net_vsc_pkt; net_vsc_pkt->device = device; /* Save this so that we can send it back */ net_vsc_pkt->compl.rx.rx_completion_tid = vm_xfer_page_pkt->d.transaction_id; net_vsc_pkt->tot_data_buf_len = vm_xfer_page_pkt->ranges[i].byte_count; net_vsc_pkt->page_buf_count = 1; net_vsc_pkt->page_buffers[0].length = vm_xfer_page_pkt->ranges[i].byte_count; /* The virtual address of the packet in the receive buffer */ start = ((unsigned long)net_dev->rx_buf + vm_xfer_page_pkt->ranges[i].byte_offset); start = ((unsigned long)start) & ~(PAGE_SIZE - 1); /* Page number of the virtual page containing packet start */ net_vsc_pkt->page_buffers[0].pfn = start >> PAGE_SHIFT; /* Calculate the page relative offset */ net_vsc_pkt->page_buffers[0].offset = vm_xfer_page_pkt->ranges[i].byte_offset & (PAGE_SIZE - 1); /* * In this implementation, we are dealing with virtual * addresses exclusively. Since we aren't using physical * addresses at all, we don't care if a packet crosses a * page boundary. For this reason, the original code to * check for and handle page crossings has been removed. */ /* * Pass it to the upper layer. The receive completion call * has been moved into this function. */ hv_rf_on_receive(device, net_vsc_pkt); /* * Moved completion call back here so that all received * messages (not just data messages) will trigger a response * message back to the host. */ hv_nv_on_receive_completion(net_vsc_pkt); } } /* * Net VSC send receive completion */ static void hv_nv_send_receive_completion(struct hv_device *device, uint64_t tid) { nvsp_msg rx_comp_msg; int retries = 0; int ret = 0; rx_comp_msg.hdr.msg_type = nvsp_msg_1_type_send_rndis_pkt_complete; /* Pass in the status */ rx_comp_msg.msgs.vers_1_msgs.send_rndis_pkt_complete.status = nvsp_status_success; retry_send_cmplt: /* Send the completion */ ret = hv_vmbus_channel_send_packet(device->channel, &rx_comp_msg, sizeof(nvsp_msg), tid, HV_VMBUS_PACKET_TYPE_COMPLETION, 0); if (ret == 0) { /* success */ /* no-op */ } else if (ret == EAGAIN) { /* no more room... wait a bit and attempt to retry 3 times */ retries++; if (retries < 4) { DELAY(100); goto retry_send_cmplt; } } } /* * Net VSC on receive completion * * Send a receive completion packet to RNDIS device (ie NetVsp) */ void hv_nv_on_receive_completion(void *context) { netvsc_packet *packet = (netvsc_packet *)context; struct hv_device *device = (struct hv_device *)packet->device; netvsc_dev *net_dev; uint64_t tid = 0; boolean_t send_rx_completion = FALSE; /* * Even though it seems logical to do a hv_nv_get_outbound_net_device() * here to send out receive completion, we are using * hv_nv_get_inbound_net_device() since we may have disabled * outbound traffic already. */ net_dev = hv_nv_get_inbound_net_device(device); if (net_dev == NULL) return; /* Overloading use of the lock. */ mtx_lock_spin(&net_dev->rx_pkt_list_lock); packet->xfer_page_pkt->count--; /* * Last one in the line that represent 1 xfer page packet. * Return the xfer page packet itself to the free list. */ if (packet->xfer_page_pkt->count == 0) { send_rx_completion = TRUE; tid = packet->compl.rx.rx_completion_tid; STAILQ_INSERT_TAIL(&net_dev->myrx_packet_list, (netvsc_packet *)(packet->xfer_page_pkt), mylist_entry); } /* Put the packet back on the free list */ STAILQ_INSERT_TAIL(&net_dev->myrx_packet_list, packet, mylist_entry); mtx_unlock_spin(&net_dev->rx_pkt_list_lock); /* Send a receive completion for the xfer page packet */ if (send_rx_completion) hv_nv_send_receive_completion(device, tid); } /* * Net VSC on channel callback */ static void hv_nv_on_channel_callback(void *context) { /* Fixme: Magic number */ const int net_pkt_size = 2048; struct hv_device *device = (struct hv_device *)context; netvsc_dev *net_dev; uint32_t bytes_rxed; uint64_t request_id; uint8_t *packet; hv_vm_packet_descriptor *desc; uint8_t *buffer; int bufferlen = net_pkt_size; int ret = 0; packet = malloc(net_pkt_size * sizeof(uint8_t), M_DEVBUF, M_NOWAIT); if (!packet) return; buffer = packet; net_dev = hv_nv_get_inbound_net_device(device); if (net_dev == NULL) goto out; do { ret = hv_vmbus_channel_recv_packet_raw(device->channel, buffer, bufferlen, &bytes_rxed, &request_id); if (ret == 0) { if (bytes_rxed > 0) { desc = (hv_vm_packet_descriptor *)buffer; switch (desc->type) { case HV_VMBUS_PACKET_TYPE_COMPLETION: hv_nv_on_send_completion(device, desc); break; case HV_VMBUS_PACKET_TYPE_DATA_USING_TRANSFER_PAGES: hv_nv_on_receive(device, desc); break; default: break; } } else { break; } } else if (ret == ENOBUFS) { /* Handle large packet */ free(buffer, M_DEVBUF); buffer = malloc(bytes_rxed, M_DEVBUF, M_NOWAIT); if (buffer == NULL) { break; } bufferlen = bytes_rxed; } } while (1); out: free(buffer, M_DEVBUF); } Index: user/dchagin/lemul/sys/dev/hyperv/netvsc/hv_net_vsc.h =================================================================== --- user/dchagin/lemul/sys/dev/hyperv/netvsc/hv_net_vsc.h (revision 274240) +++ user/dchagin/lemul/sys/dev/hyperv/netvsc/hv_net_vsc.h (revision 274241) @@ -1,999 +1,998 @@ /*- * Copyright (c) 2009-2012 Microsoft Corp. * Copyright (c) 2010-2012 Citrix Inc. * Copyright (c) 2012 NetApp Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ /* * HyperV vmbus (virtual machine bus) network VSC (virtual services client) * header file * * (Updated from unencumbered NvspProtocol.h) */ #ifndef __HV_NET_VSC_H__ #define __HV_NET_VSC_H__ #include #include #include #include #include #define NVSP_INVALID_PROTOCOL_VERSION (0xFFFFFFFF) #define NVSP_PROTOCOL_VERSION_1 2 #define NVSP_PROTOCOL_VERSION_2 0x30002 #define NVSP_MIN_PROTOCOL_VERSION (NVSP_PROTOCOL_VERSION_1) #define NVSP_MAX_PROTOCOL_VERSION (NVSP_PROTOCOL_VERSION_2) #define NVSP_PROTOCOL_VERSION_CURRENT NVSP_PROTOCOL_VERSION_2 #define NVSP_OPERATIONAL_STATUS_OK (0x00000000) #define NVSP_OPERATIONAL_STATUS_DEGRADED (0x00000001) #define NVSP_OPERATIONAL_STATUS_NONRECOVERABLE (0x00000002) #define NVSP_OPERATIONAL_STATUS_NO_CONTACT (0x00000003) #define NVSP_OPERATIONAL_STATUS_LOST_COMMUNICATION (0x00000004) /* * Maximun number of transfer pages (packets) the VSP will use on a receive */ #define NVSP_MAX_PACKETS_PER_RECEIVE 375 typedef enum nvsp_msg_type_ { nvsp_msg_type_none = 0, /* * Init Messages */ nvsp_msg_type_init = 1, nvsp_msg_type_init_complete = 2, nvsp_version_msg_start = 100, /* * Version 1 Messages */ nvsp_msg_1_type_send_ndis_vers = nvsp_version_msg_start, nvsp_msg_1_type_send_rx_buf, nvsp_msg_1_type_send_rx_buf_complete, nvsp_msg_1_type_revoke_rx_buf, nvsp_msg_1_type_send_send_buf, nvsp_msg_1_type_send_send_buf_complete, nvsp_msg_1_type_revoke_send_buf, nvsp_msg_1_type_send_rndis_pkt, nvsp_msg_1_type_send_rndis_pkt_complete, /* * Version 2 Messages */ nvsp_msg_2_type_send_chimney_delegated_buf, nvsp_msg_2_type_send_chimney_delegated_buf_complete, nvsp_msg_2_type_revoke_chimney_delegated_buf, nvsp_msg_2_type_resume_chimney_rx_indication, nvsp_msg_2_type_terminate_chimney, nvsp_msg_2_type_terminate_chimney_complete, nvsp_msg_2_type_indicate_chimney_event, nvsp_msg_2_type_send_chimney_packet, nvsp_msg_2_type_send_chimney_packet_complete, nvsp_msg_2_type_post_chimney_rx_request, nvsp_msg_2_type_post_chimney_rx_request_complete, nvsp_msg_2_type_alloc_rx_buf, nvsp_msg_2_type_alloc_rx_buf_complete, nvsp_msg_2_type_free_rx_buf, nvsp_msg_2_send_vmq_rndis_pkt, nvsp_msg_2_send_vmq_rndis_pkt_complete, nvsp_msg_2_type_send_ndis_config, nvsp_msg_2_type_alloc_chimney_handle, nvsp_msg_2_type_alloc_chimney_handle_complete, } nvsp_msg_type; typedef enum nvsp_status_ { nvsp_status_none = 0, nvsp_status_success, nvsp_status_failure, /* Deprecated */ nvsp_status_prot_vers_range_too_new, /* Deprecated */ nvsp_status_prot_vers_range_too_old, nvsp_status_invalid_rndis_pkt, nvsp_status_busy, nvsp_status_max, } nvsp_status; typedef struct nvsp_msg_hdr_ { uint32_t msg_type; } __packed nvsp_msg_hdr; /* * Init Messages */ /* * This message is used by the VSC to initialize the channel * after the channels has been opened. This message should * never include anything other then versioning (i.e. this * message will be the same for ever). * * Forever is a long time. The values have been redefined * in Win7 to indicate major and minor protocol version * number. */ typedef struct nvsp_msg_init_ { union { struct { uint16_t minor_protocol_version; uint16_t major_protocol_version; } s; /* Formerly min_protocol_version */ uint32_t protocol_version; } p1; /* Formerly max_protocol_version */ uint32_t protocol_version_2; } __packed nvsp_msg_init; /* * This message is used by the VSP to complete the initialization * of the channel. This message should never include anything other * then versioning (i.e. this message will be the same forever). */ typedef struct nvsp_msg_init_complete_ { /* Deprecated */ uint32_t negotiated_prot_vers; uint32_t max_mdl_chain_len; uint32_t status; } __packed nvsp_msg_init_complete; typedef union nvsp_msg_init_uber_ { nvsp_msg_init init; nvsp_msg_init_complete init_compl; } __packed nvsp_msg_init_uber; /* * Version 1 Messages */ /* * This message is used by the VSC to send the NDIS version * to the VSP. The VSP can use this information when handling * OIDs sent by the VSC. */ typedef struct nvsp_1_msg_send_ndis_version_ { uint32_t ndis_major_vers; /* Deprecated */ uint32_t ndis_minor_vers; } __packed nvsp_1_msg_send_ndis_version; /* * This message is used by the VSC to send a receive buffer * to the VSP. The VSP can then use the receive buffer to * send data to the VSC. */ typedef struct nvsp_1_msg_send_rx_buf_ { uint32_t gpadl_handle; uint16_t id; } __packed nvsp_1_msg_send_rx_buf; typedef struct nvsp_1_rx_buf_section_ { uint32_t offset; uint32_t sub_allocation_size; uint32_t num_sub_allocations; uint32_t end_offset; } __packed nvsp_1_rx_buf_section; /* * This message is used by the VSP to acknowledge a receive * buffer send by the VSC. This message must be sent by the * VSP before the VSP uses the receive buffer. */ typedef struct nvsp_1_msg_send_rx_buf_complete_ { uint32_t status; uint32_t num_sections; /* * The receive buffer is split into two parts, a large * suballocation section and a small suballocation * section. These sections are then suballocated by a * certain size. * * For example, the following break up of the receive * buffer has 6 large suballocations and 10 small * suballocations. * * | Large Section | | Small Section | * ------------------------------------------------------------ * | | | | | | | | | | | | | | | | | | * | | * LargeOffset SmallOffset */ nvsp_1_rx_buf_section sections[1]; } __packed nvsp_1_msg_send_rx_buf_complete; /* * This message is sent by the VSC to revoke the receive buffer. * After the VSP completes this transaction, the VSP should never * use the receive buffer again. */ typedef struct nvsp_1_msg_revoke_rx_buf_ { uint16_t id; } __packed nvsp_1_msg_revoke_rx_buf; /* * This message is used by the VSC to send a send buffer * to the VSP. The VSC can then use the send buffer to * send data to the VSP. */ typedef struct nvsp_1_msg_send_send_buf_ { uint32_t gpadl_handle; uint16_t id; } __packed nvsp_1_msg_send_send_buf; /* * This message is used by the VSP to acknowledge a send * buffer sent by the VSC. This message must be sent by the * VSP before the VSP uses the sent buffer. */ typedef struct nvsp_1_msg_send_send_buf_complete_ { uint32_t status; /* * The VSC gets to choose the size of the send buffer and * the VSP gets to choose the sections size of the buffer. * This was done to enable dynamic reconfigurations when * the cost of GPA-direct buffers decreases. */ uint32_t section_size; } __packed nvsp_1_msg_send_send_buf_complete; /* * This message is sent by the VSC to revoke the send buffer. * After the VSP completes this transaction, the vsp should never * use the send buffer again. */ typedef struct nvsp_1_msg_revoke_send_buf_ { uint16_t id; } __packed nvsp_1_msg_revoke_send_buf; /* * This message is used by both the VSP and the VSC to send * an RNDIS message to the opposite channel endpoint. */ typedef struct nvsp_1_msg_send_rndis_pkt_ { /* * This field is specified by RNIDS. They assume there's * two different channels of communication. However, * the Network VSP only has one. Therefore, the channel * travels with the RNDIS packet. */ uint32_t chan_type; /* * This field is used to send part or all of the data * through a send buffer. This values specifies an * index into the send buffer. If the index is * 0xFFFFFFFF, then the send buffer is not being used * and all of the data was sent through other VMBus * mechanisms. */ uint32_t send_buf_section_idx; uint32_t send_buf_section_size; } __packed nvsp_1_msg_send_rndis_pkt; /* * This message is used by both the VSP and the VSC to complete * a RNDIS message to the opposite channel endpoint. At this * point, the initiator of this message cannot use any resources * associated with the original RNDIS packet. */ typedef struct nvsp_1_msg_send_rndis_pkt_complete_ { uint32_t status; } __packed nvsp_1_msg_send_rndis_pkt_complete; /* * Version 2 Messages */ /* * This message is used by the VSC to send the NDIS version * to the VSP. The VSP can use this information when handling * OIDs sent by the VSC. */ typedef struct nvsp_2_netvsc_capabilities_ { union { uint64_t as_uint64; struct { uint64_t vmq : 1; uint64_t chimney : 1; uint64_t sriov : 1; uint64_t ieee8021q : 1; uint64_t correlationid : 1; uint64_t teaming : 1; } u2; } u1; } __packed nvsp_2_netvsc_capabilities; typedef struct nvsp_2_msg_send_ndis_config_ { uint32_t mtu; uint32_t reserved; nvsp_2_netvsc_capabilities capabilities; } __packed nvsp_2_msg_send_ndis_config; /* * NvspMessage2TypeSendChimneyDelegatedBuffer */ typedef struct nvsp_2_msg_send_chimney_buf_ { /* * On WIN7 beta, delegated_obj_max_size is defined as a uint32_t * Since WIN7 RC, it was split into two uint16_t. To have the same * struct layout, delegated_obj_max_size shall be the first field. */ uint16_t delegated_obj_max_size; /* * The revision # of chimney protocol used between NVSC and NVSP. * * This revision is NOT related to the chimney revision between * NDIS protocol and miniport drivers. */ uint16_t revision; uint32_t gpadl_handle; } __packed nvsp_2_msg_send_chimney_buf; /* Unsupported chimney revision 0 (only present in WIN7 beta) */ #define NVSP_CHIMNEY_REVISION_0 0 /* WIN7 Beta Chimney QFE */ #define NVSP_CHIMNEY_REVISION_1 1 /* The chimney revision since WIN7 RC */ #define NVSP_CHIMNEY_REVISION_2 2 /* * NvspMessage2TypeSendChimneyDelegatedBufferComplete */ typedef struct nvsp_2_msg_send_chimney_buf_complete_ { uint32_t status; /* * Maximum number outstanding sends and pre-posted receives. * * NVSC should not post more than SendQuota/ReceiveQuota packets. * Otherwise, it can block the non-chimney path for an indefinite * amount of time. * (since chimney sends/receives are affected by the remote peer). * * Note: NVSP enforces the quota restrictions on a per-VMBCHANNEL * basis. It doesn't enforce the restriction separately for chimney * send/receive. If NVSC doesn't voluntarily enforce "SendQuota", * it may kill its own network connectivity. */ uint32_t send_quota; uint32_t rx_quota; } __packed nvsp_2_msg_send_chimney_buf_complete; /* * NvspMessage2TypeRevokeChimneyDelegatedBuffer */ typedef struct nvsp_2_msg_revoke_chimney_buf_ { uint32_t gpadl_handle; } __packed nvsp_2_msg_revoke_chimney_buf; #define NVSP_CHIMNEY_OBJECT_TYPE_NEIGHBOR 0 #define NVSP_CHIMNEY_OBJECT_TYPE_PATH4 1 #define NVSP_CHIMNEY_OBJECT_TYPE_PATH6 2 #define NVSP_CHIMNEY_OBJECT_TYPE_TCP 3 /* * NvspMessage2TypeAllocateChimneyHandle */ typedef struct nvsp_2_msg_alloc_chimney_handle_ { uint64_t vsc_context; uint32_t object_type; } __packed nvsp_2_msg_alloc_chimney_handle; /* * NvspMessage2TypeAllocateChimneyHandleComplete */ typedef struct nvsp_2_msg_alloc_chimney_handle_complete_ { uint32_t vsp_handle; } __packed nvsp_2_msg_alloc_chimney_handle_complete; /* * NvspMessage2TypeResumeChimneyRXIndication */ typedef struct nvsp_2_msg_resume_chimney_rx_indication { /* * Handle identifying the offloaded connection */ uint32_t vsp_tcp_handle; } __packed nvsp_2_msg_resume_chimney_rx_indication; #define NVSP_2_MSG_TERMINATE_CHIMNEY_FLAGS_FIRST_STAGE (0x01u) #define NVSP_2_MSG_TERMINATE_CHIMNEY_FLAGS_RESERVED (~(0x01u)) /* * NvspMessage2TypeTerminateChimney */ typedef struct nvsp_2_msg_terminate_chimney_ { /* * Handle identifying the offloaded object */ uint32_t vsp_handle; /* * Terminate Offload Flags * Bit 0: * When set to 0, terminate the offload at the destination NIC * Bit 1-31: Reserved, shall be zero */ uint32_t flags; union { /* * This field is valid only when bit 0 of flags is clear. * It specifies the index into the premapped delegated * object buffer. The buffer was sent through the * NvspMessage2TypeSendChimneyDelegatedBuffer * message at initialization time. * * NVSP will write the delegated state into the delegated * buffer upon upload completion. */ uint32_t index; /* * This field is valid only when bit 0 of flags is set. * * The seqence number of the most recently accepted RX * indication when VSC sets its TCP context into * "terminating" state. * * This allows NVSP to determines if there are any in-flight * RX indications for which the acceptance state is still * undefined. */ uint64_t last_accepted_rx_seq_no; } f0; } __packed nvsp_2_msg_terminate_chimney; #define NVSP_TERMINATE_CHIMNEY_COMPLETE_FLAG_DATA_CORRUPTED 0x0000001u /* * NvspMessage2TypeTerminateChimneyComplete */ typedef struct nvsp_2_msg_terminate_chimney_complete_ { uint64_t vsc_context; uint32_t flags; } __packed nvsp_2_msg_terminate_chimney_complete; /* * NvspMessage2TypeIndicateChimneyEvent */ typedef struct nvsp_2_msg_indicate_chimney_event_ { /* * When VscTcpContext is 0, event_type is an NDIS_STATUS event code * Otherwise, EventType is an TCP connection event (defined in * NdisTcpOffloadEventHandler chimney DDK document). */ uint32_t event_type; /* * When VscTcpContext is 0, EventType is an NDIS_STATUS event code * Otherwise, EventType is an TCP connection event specific information * (defined in NdisTcpOffloadEventHandler chimney DDK document). */ uint32_t event_specific_info; /* * If not 0, the event is per-TCP connection event. This field * contains the VSC's TCP context. * If 0, the event indication is global. */ uint64_t vsc_tcp_context; } __packed nvsp_2_msg_indicate_chimney_event; #define NVSP_1_CHIMNEY_SEND_INVALID_OOB_INDEX 0xffffu #define NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX 0xffffu /* * NvspMessage2TypeSendChimneyPacket */ typedef struct nvsp_2_msg_send_chimney_pkt_ { /* * Identify the TCP connection for which this chimney send is */ uint32_t vsp_tcp_handle; /* * This field is used to send part or all of the data * through a send buffer. This values specifies an * index into the send buffer. If the index is * 0xFFFF, then the send buffer is not being used * and all of the data was sent through other VMBus * mechanisms. */ uint16_t send_buf_section_index; uint16_t send_buf_section_size; /* * OOB Data Index * This an index to the OOB data buffer. If the index is 0xFFFFFFFF, * then there is no OOB data. * * This field shall be always 0xFFFFFFFF for now. It is reserved for * the future. */ uint16_t oob_data_index; /* * DisconnectFlags = 0 * Normal chimney send. See MiniportTcpOffloadSend for details. * * DisconnectFlags = TCP_DISCONNECT_GRACEFUL_CLOSE (0x01) * Graceful disconnect. See MiniportTcpOffloadDisconnect for details. * * DisconnectFlags = TCP_DISCONNECT_ABORTIVE_CLOSE (0x02) * Abortive disconnect. See MiniportTcpOffloadDisconnect for details. */ uint16_t disconnect_flags; uint32_t seq_no; } __packed nvsp_2_msg_send_chimney_pkt; /* * NvspMessage2TypeSendChimneyPacketComplete */ typedef struct nvsp_2_msg_send_chimney_pkt_complete_ { /* * The NDIS_STATUS for the chimney send */ uint32_t status; /* * Number of bytes that have been sent to the peer (and ACKed by the peer). */ uint32_t bytes_transferred; } __packed nvsp_2_msg_send_chimney_pkt_complete; #define NVSP_1_CHIMNEY_RECV_FLAG_NO_PUSH 0x0001u #define NVSP_1_CHIMNEY_RECV_INVALID_OOB_INDEX 0xffffu /* * NvspMessage2TypePostChimneyRecvRequest */ typedef struct nvsp_2_msg_post_chimney_rx_request_ { /* * Identify the TCP connection which this chimney receive request * is for. */ uint32_t vsp_tcp_handle; /* * OOB Data Index * This an index to the OOB data buffer. If the index is 0xFFFFFFFF, * then there is no OOB data. * * This field shall be always 0xFFFFFFFF for now. It is reserved for * the future. */ uint32_t oob_data_index; /* * Bit 0 * When it is set, this is a "no-push" receive. * When it is clear, this is a "push" receive. * * Bit 1-15: Reserved and shall be zero */ uint16_t flags; /* * For debugging and diagnoses purpose. * The SeqNo is per TCP connection and starts from 0. */ uint32_t seq_no; } __packed nvsp_2_msg_post_chimney_rx_request; /* * NvspMessage2TypePostChimneyRecvRequestComplete */ typedef struct nvsp_2_msg_post_chimney_rx_request_complete_ { /* * The NDIS_STATUS for the chimney send */ uint32_t status; /* * Number of bytes that have been sent to the peer (and ACKed by * the peer). */ uint32_t bytes_xferred; } __packed nvsp_2_msg_post_chimney_rx_request_complete; /* * NvspMessage2TypeAllocateReceiveBuffer */ typedef struct nvsp_2_msg_alloc_rx_buf_ { /* * Allocation ID to match the allocation request and response */ uint32_t allocation_id; /* * Length of the VM shared memory receive buffer that needs to * be allocated */ uint32_t length; } __packed nvsp_2_msg_alloc_rx_buf; /* * NvspMessage2TypeAllocateReceiveBufferComplete */ typedef struct nvsp_2_msg_alloc_rx_buf_complete_ { /* * The NDIS_STATUS code for buffer allocation */ uint32_t status; /* * Allocation ID from NVSP_2_MESSAGE_ALLOCATE_RECEIVE_BUFFER */ uint32_t allocation_id; /* * GPADL handle for the allocated receive buffer */ uint32_t gpadl_handle; /* * Receive buffer ID that is further used in * NvspMessage2SendVmqRndisPacket */ uint64_t rx_buf_id; } __packed nvsp_2_msg_alloc_rx_buf_complete; /* * NvspMessage2TypeFreeReceiveBuffer */ typedef struct nvsp_2_msg_free_rx_buf_ { /* * Receive buffer ID previous returned in * NvspMessage2TypeAllocateReceiveBufferComplete message */ uint64_t rx_buf_id; } __packed nvsp_2_msg_free_rx_buf; /* * This structure is used in defining the buffers in * NVSP_2_MESSAGE_SEND_VMQ_RNDIS_PACKET structure */ typedef struct nvsp_xfer_page_range_ { /* * Specifies the ID of the receive buffer that has the buffer. This * ID can be the general receive buffer ID specified in * NvspMessage1TypeSendReceiveBuffer or it can be the shared memory * receive buffer ID allocated by the VSC and specified in * NvspMessage2TypeAllocateReceiveBufferComplete message */ uint64_t xfer_page_set_id; /* * Number of bytes */ uint32_t byte_count; /* * Offset in bytes from the beginning of the buffer */ uint32_t byte_offset; } __packed nvsp_xfer_page_range; /* * NvspMessage2SendVmqRndisPacket */ typedef struct nvsp_2_msg_send_vmq_rndis_pkt_ { /* * This field is specified by RNIDS. They assume there's * two different channels of communication. However, * the Network VSP only has one. Therefore, the channel * travels with the RNDIS packet. It must be RMC_DATA */ uint32_t channel_type; /* * Only the Range element corresponding to the RNDIS header of * the first RNDIS message in the multiple RNDIS messages sent * in one NVSP message. Information about the data portions as well * as the subsequent RNDIS messages in the same NVSP message are * embedded in the RNDIS header itself */ nvsp_xfer_page_range range; } __packed nvsp_2_msg_send_vmq_rndis_pkt; /* * This message is used by the VSC to complete * a RNDIS VMQ message to the VSP. At this point, * the initiator of this message can use any resources * associated with the original RNDIS VMQ packet. */ typedef struct nvsp_2_msg_send_vmq_rndis_pkt_complete_ { uint32_t status; } __packed nvsp_2_msg_send_vmq_rndis_pkt_complete; typedef union nvsp_1_msg_uber_ { nvsp_1_msg_send_ndis_version send_ndis_vers; nvsp_1_msg_send_rx_buf send_rx_buf; nvsp_1_msg_send_rx_buf_complete send_rx_buf_complete; nvsp_1_msg_revoke_rx_buf revoke_rx_buf; nvsp_1_msg_send_send_buf send_send_buf; nvsp_1_msg_send_send_buf_complete send_send_buf_complete; nvsp_1_msg_revoke_send_buf revoke_send_buf; nvsp_1_msg_send_rndis_pkt send_rndis_pkt; nvsp_1_msg_send_rndis_pkt_complete send_rndis_pkt_complete; } __packed nvsp_1_msg_uber; typedef union nvsp_2_msg_uber_ { nvsp_2_msg_send_ndis_config send_ndis_config; nvsp_2_msg_send_chimney_buf send_chimney_buf; nvsp_2_msg_send_chimney_buf_complete send_chimney_buf_complete; nvsp_2_msg_revoke_chimney_buf revoke_chimney_buf; nvsp_2_msg_resume_chimney_rx_indication resume_chimney_rx_indication; nvsp_2_msg_terminate_chimney terminate_chimney; nvsp_2_msg_terminate_chimney_complete terminate_chimney_complete; nvsp_2_msg_indicate_chimney_event indicate_chimney_event; nvsp_2_msg_send_chimney_pkt send_chimney_packet; nvsp_2_msg_send_chimney_pkt_complete send_chimney_packet_complete; nvsp_2_msg_post_chimney_rx_request post_chimney_rx_request; nvsp_2_msg_post_chimney_rx_request_complete post_chimney_rx_request_complete; nvsp_2_msg_alloc_rx_buf alloc_rx_buffer; nvsp_2_msg_alloc_rx_buf_complete alloc_rx_buffer_complete; nvsp_2_msg_free_rx_buf free_rx_buffer; nvsp_2_msg_send_vmq_rndis_pkt send_vmq_rndis_pkt; nvsp_2_msg_send_vmq_rndis_pkt_complete send_vmq_rndis_pkt_complete; nvsp_2_msg_alloc_chimney_handle alloc_chimney_handle; nvsp_2_msg_alloc_chimney_handle_complete alloc_chimney_handle_complete; } __packed nvsp_2_msg_uber; typedef union nvsp_all_msgs_ { nvsp_msg_init_uber init_msgs; nvsp_1_msg_uber vers_1_msgs; nvsp_2_msg_uber vers_2_msgs; } __packed nvsp_all_msgs; /* * ALL Messages */ typedef struct nvsp_msg_ { nvsp_msg_hdr hdr; nvsp_all_msgs msgs; } __packed nvsp_msg; /* * The following arguably belongs in a separate header file */ /* * Defines */ #define NETVSC_SEND_BUFFER_SIZE (64*1024) /* 64K */ #define NETVSC_SEND_BUFFER_ID 0xface #define NETVSC_RECEIVE_BUFFER_SIZE (1024*1024) /* 1MB */ #define NETVSC_RECEIVE_BUFFER_ID 0xcafe #define NETVSC_RECEIVE_SG_COUNT 1 /* Preallocated receive packets */ #define NETVSC_RECEIVE_PACKETLIST_COUNT 256 /* * Maximum MTU we permit to be configured for a netvsc interface. * When the code was developed, a max MTU of 12232 was tested and * proven to work. 9K is a reasonable maximum for an Ethernet. */ #define NETVSC_MAX_CONFIGURABLE_MTU (9 * 1024) /* * Data types */ /* * Per netvsc channel-specific */ typedef struct netvsc_dev_ { struct hv_device *dev; int num_outstanding_sends; /* List of free preallocated NETVSC_PACKET to represent RX packet */ STAILQ_HEAD(PQ, netvsc_packet_) myrx_packet_list; struct mtx rx_pkt_list_lock; /* Send buffer allocated by us but manages by NetVSP */ void *send_buf; uint32_t send_buf_size; uint32_t send_buf_gpadl_handle; uint32_t send_section_size; /* Receive buffer allocated by us but managed by NetVSP */ void *rx_buf; uint32_t rx_buf_size; uint32_t rx_buf_gpadl_handle; uint32_t rx_section_count; nvsp_1_rx_buf_section *rx_sections; /* Used for NetVSP initialization protocol */ struct sema channel_init_sema; nvsp_msg channel_init_packet; nvsp_msg revoke_packet; /*uint8_t hw_mac_addr[HW_MACADDR_LEN];*/ /* Holds rndis device info */ void *extension; hv_bool_uint8_t destroy; /* Negotiated NVSP version */ uint32_t nvsp_version; } netvsc_dev; typedef void (*pfn_on_send_rx_completion)(void *); #define NETVSC_DEVICE_RING_BUFFER_SIZE (64 * PAGE_SIZE) #define NETVSC_PACKET_MAXPAGE 16 typedef struct xfer_page_packet_ { /* * This needs to be here because the network RX code casts * an instantiation of this structure to a netvsc_packet. */ STAILQ_ENTRY(netvsc_packet_) mylist_entry; uint32_t count; } xfer_page_packet; typedef struct netvsc_packet_ { /* * List used when enqueued on &net_dev->rx_packet_list, * and when enqueued within the netvsc code */ STAILQ_ENTRY(netvsc_packet_) mylist_entry; struct hv_device *device; hv_bool_uint8_t is_data_pkt; /* One byte */ uint16_t vlan_tci; xfer_page_packet *xfer_page_pkt; /* Completion */ union { struct { uint64_t rx_completion_tid; void *rx_completion_context; /* This is no longer used */ pfn_on_send_rx_completion on_rx_completion; } rx; struct { uint64_t send_completion_tid; void *send_completion_context; /* Still used in netvsc and filter code */ pfn_on_send_rx_completion on_send_completion; } send; } compl; void *extension; uint32_t tot_data_buf_len; uint32_t page_buf_count; hv_vmbus_page_buffer page_buffers[NETVSC_PACKET_MAXPAGE]; } netvsc_packet; typedef struct { uint8_t mac_addr[6]; /* Assumption unsigned long */ hv_bool_uint8_t link_state; } netvsc_device_info; /* * Device-specific softc structure */ typedef struct hn_softc { struct ifnet *hn_ifp; - struct arpcom arpcom; device_t hn_dev; uint8_t hn_unit; int hn_carrier; int hn_if_flags; struct mtx hn_lock; int hn_initdone; /* See hv_netvsc_drv_freebsd.c for rules on how to use */ int temp_unusable; struct hv_device *hn_dev_obj; netvsc_dev *net_dev; } hn_softc_t; /* * Externs */ extern int hv_promisc_mode; extern void netvsc_linkstatus_callback(struct hv_device *device_obj, uint32_t status); extern int netvsc_recv(struct hv_device *device_obj, netvsc_packet *packet); extern void netvsc_xmit_completion(void *context); extern void hv_nv_on_receive_completion(void *context); extern netvsc_dev *hv_nv_on_device_add(struct hv_device *device, void *additional_info); extern int hv_nv_on_device_remove(struct hv_device *device, boolean_t destroy_channel); extern int hv_nv_on_send(struct hv_device *device, netvsc_packet *pkt); #endif /* __HV_NET_VSC_H__ */ Index: user/dchagin/lemul/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c =================================================================== --- user/dchagin/lemul/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c (revision 274240) +++ user/dchagin/lemul/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c (revision 274241) @@ -1,1018 +1,1016 @@ /*- * Copyright (c) 2010-2012 Citrix Inc. * Copyright (c) 2009-2012 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 2004-2006 Kip Macy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "hv_net_vsc.h" #include "hv_rndis.h" #include "hv_rndis_filter.h" /* Short for Hyper-V network interface */ #define NETVSC_DEVNAME "hn" /* * It looks like offset 0 of buf is reserved to hold the softc pointer. * The sc pointer evidently not needed, and is not presently populated. * The packet offset is where the netvsc_packet starts in the buffer. */ #define HV_NV_SC_PTR_OFFSET_IN_BUF 0 #define HV_NV_PACKET_OFFSET_IN_BUF 16 /* * Data types */ struct hv_netvsc_driver_context { uint32_t drv_inited; }; /* * Be aware that this sleepable mutex will exhibit WITNESS errors when * certain TCP and ARP code paths are taken. This appears to be a * well-known condition, as all other drivers checked use a sleeping * mutex to protect their transmit paths. * Also Be aware that mutexes do not play well with semaphores, and there * is a conflicting semaphore in a certain channel code path. */ #define NV_LOCK_INIT(_sc, _name) \ mtx_init(&(_sc)->hn_lock, _name, MTX_NETWORK_LOCK, MTX_DEF) #define NV_LOCK(_sc) mtx_lock(&(_sc)->hn_lock) #define NV_LOCK_ASSERT(_sc) mtx_assert(&(_sc)->hn_lock, MA_OWNED) #define NV_UNLOCK(_sc) mtx_unlock(&(_sc)->hn_lock) #define NV_LOCK_DESTROY(_sc) mtx_destroy(&(_sc)->hn_lock) /* * Globals */ int hv_promisc_mode = 0; /* normal mode by default */ /* The one and only one */ static struct hv_netvsc_driver_context g_netvsc_drv; /* * Forward declarations */ static void hn_stop(hn_softc_t *sc); static void hn_ifinit_locked(hn_softc_t *sc); static void hn_ifinit(void *xsc); static int hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data); static int hn_start_locked(struct ifnet *ifp); static void hn_start(struct ifnet *ifp); /* * NetVsc driver initialization * Note: Filter init is no longer required */ static int netvsc_drv_init(void) { return (0); } /* * NetVsc global initialization entry point */ static void netvsc_init(void) { if (bootverbose) printf("Netvsc initializing... "); /* * XXXKYS: cleanup initialization */ if (!cold && !g_netvsc_drv.drv_inited) { g_netvsc_drv.drv_inited = 1; netvsc_drv_init(); if (bootverbose) printf("done!\n"); } else if (bootverbose) printf("Already initialized!\n"); } /* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */ static const hv_guid g_net_vsc_device_type = { .data = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46, 0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E} }; /* * Standard probe entry point. * */ static int netvsc_probe(device_t dev) { const char *p; p = vmbus_get_type(dev); if (!memcmp(p, &g_net_vsc_device_type.data, sizeof(hv_guid))) { device_set_desc(dev, "Synthetic Network Interface"); if (bootverbose) printf("Netvsc probe... DONE \n"); return (BUS_PROBE_DEFAULT); } return (ENXIO); } /* * Standard attach entry point. * * Called when the driver is loaded. It allocates needed resources, * and initializes the "hardware" and software. */ static int netvsc_attach(device_t dev) { struct hv_device *device_ctx = vmbus_get_devctx(dev); netvsc_device_info device_info; hn_softc_t *sc; int unit = device_get_unit(dev); struct ifnet *ifp; int ret; netvsc_init(); sc = device_get_softc(dev); if (sc == NULL) { return (ENOMEM); } bzero(sc, sizeof(hn_softc_t)); sc->hn_unit = unit; sc->hn_dev = dev; NV_LOCK_INIT(sc, "NetVSCLock"); sc->hn_dev_obj = device_ctx; - ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER); + ifp = sc->hn_ifp = if_alloc(IFT_ETHER); ifp->if_softc = sc; if_initname(ifp, device_get_name(dev), device_get_unit(dev)); ifp->if_dunit = unit; ifp->if_dname = NETVSC_DEVNAME; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = hn_ioctl; ifp->if_start = hn_start; ifp->if_init = hn_ifinit; /* needed by hv_rf_on_device_add() code */ ifp->if_mtu = ETHERMTU; IFQ_SET_MAXLEN(&ifp->if_snd, 512); ifp->if_snd.ifq_drv_maxlen = 511; IFQ_SET_READY(&ifp->if_snd); /* * Tell upper layers that we support full VLAN capability. */ ifp->if_hdrlen = sizeof(struct ether_vlan_header); ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU; ifp->if_capenable |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU; ret = hv_rf_on_device_add(device_ctx, &device_info); if (ret != 0) { if_free(ifp); return (ret); } if (device_info.link_state == 0) { sc->hn_carrier = 1; } ether_ifattach(ifp, device_info.mac_addr); return (0); } /* * Standard detach entry point */ static int netvsc_detach(device_t dev) { struct hv_device *hv_device = vmbus_get_devctx(dev); if (bootverbose) printf("netvsc_detach\n"); /* * XXXKYS: Need to clean up all our * driver state; this is the driver * unloading. */ /* * XXXKYS: Need to stop outgoing traffic and unregister * the netdevice. */ hv_rf_on_device_remove(hv_device, HV_RF_NV_DESTROY_CHANNEL); return (0); } /* * Standard shutdown entry point */ static int netvsc_shutdown(device_t dev) { return (0); } /* * Send completion processing * * Note: It looks like offset 0 of buf is reserved to hold the softc * pointer. The sc pointer is not currently needed in this function, and * it is not presently populated by the TX function. */ void netvsc_xmit_completion(void *context) { netvsc_packet *packet = (netvsc_packet *)context; struct mbuf *mb; uint8_t *buf; mb = (struct mbuf *)(uintptr_t)packet->compl.send.send_completion_tid; buf = ((uint8_t *)packet) - HV_NV_PACKET_OFFSET_IN_BUF; free(buf, M_DEVBUF); if (mb != NULL) { m_freem(mb); } } /* * Start a transmit of one or more packets */ static int hn_start_locked(struct ifnet *ifp) { hn_softc_t *sc = ifp->if_softc; struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev); uint8_t *buf; netvsc_packet *packet; struct mbuf *m_head, *m; struct mbuf *mc_head = NULL; int i; int num_frags; int len; int xlen; int rppi_size; int retries = 0; int ret = 0; while (!IFQ_DRV_IS_EMPTY(&sc->hn_ifp->if_snd)) { IFQ_DRV_DEQUEUE(&sc->hn_ifp->if_snd, m_head); if (m_head == NULL) { break; } len = 0; num_frags = 0; xlen = 0; /* Walk the mbuf list computing total length and num frags */ for (m = m_head; m != NULL; m = m->m_next) { if (m->m_len != 0) { num_frags++; len += m->m_len; } } /* * Reserve the number of pages requested. Currently, * one page is reserved for the message in the RNDIS * filter packet */ num_frags += HV_RF_NUM_TX_RESERVED_PAGE_BUFS; /* If exceeds # page_buffers in netvsc_packet */ if (num_frags > NETVSC_PACKET_MAXPAGE) { m_freem(m); return (EINVAL); } rppi_size = 0; if (m_head->m_flags & M_VLANTAG) { rppi_size = sizeof(rndis_per_packet_info) + sizeof(ndis_8021q_info); } /* * Allocate a buffer with space for a netvsc packet plus a * number of reserved areas. First comes a (currently 16 * bytes, currently unused) reserved data area. Second is * the netvsc_packet, which includes (currently 4) page * buffers. Third (optional) is a rndis_per_packet_info * struct, but only if a VLAN tag should be inserted into the * Ethernet frame by the Hyper-V infrastructure. Fourth is * an area reserved for an rndis_filter_packet struct. * Changed malloc to M_NOWAIT to avoid sleep under spin lock. * No longer reserving extra space for page buffers, as they * are already part of the netvsc_packet. */ buf = malloc(HV_NV_PACKET_OFFSET_IN_BUF + sizeof(netvsc_packet) + rppi_size + sizeof(rndis_filter_packet), M_DEVBUF, M_ZERO | M_NOWAIT); if (buf == NULL) { m_freem(m); return (ENOMEM); } packet = (netvsc_packet *)(buf + HV_NV_PACKET_OFFSET_IN_BUF); *(vm_offset_t *)buf = HV_NV_SC_PTR_OFFSET_IN_BUF; /* * extension points to the area reserved for the * rndis_filter_packet, which is placed just after * the netvsc_packet (and rppi struct, if present; * length is updated later). */ packet->extension = packet + 1; /* Set up the rndis header */ packet->page_buf_count = num_frags; /* Initialize it from the mbuf */ packet->tot_data_buf_len = len; /* * If the Hyper-V infrastructure needs to embed a VLAN tag, * initialize netvsc_packet and rppi struct values as needed. */ if (rppi_size) { /* Lower layers need the VLAN TCI */ packet->vlan_tci = m_head->m_pkthdr.ether_vtag; } /* * Fill the page buffers with mbuf info starting at index * HV_RF_NUM_TX_RESERVED_PAGE_BUFS. */ i = HV_RF_NUM_TX_RESERVED_PAGE_BUFS; for (m = m_head; m != NULL; m = m->m_next) { if (m->m_len) { vm_offset_t paddr = vtophys(mtod(m, vm_offset_t)); packet->page_buffers[i].pfn = paddr >> PAGE_SHIFT; packet->page_buffers[i].offset = paddr & (PAGE_SIZE - 1); packet->page_buffers[i].length = m->m_len; i++; } } /* * If bpf, copy the mbuf chain. This is less expensive than * it appears; the mbuf clusters are not copied, only their * reference counts are incremented. * Needed to avoid a race condition where the completion * callback is invoked, freeing the mbuf chain, before the * bpf_mtap code has a chance to run. */ if (ifp->if_bpf) { mc_head = m_copypacket(m_head, M_NOWAIT); } retry_send: /* Set the completion routine */ packet->compl.send.on_send_completion = netvsc_xmit_completion; packet->compl.send.send_completion_context = packet; packet->compl.send.send_completion_tid = (uint64_t)(uintptr_t)m_head; /* Removed critical_enter(), does not appear necessary */ ret = hv_rf_on_send(device_ctx, packet); if (ret == 0) { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* if bpf && mc_head, call bpf_mtap code */ if (mc_head) { ETHER_BPF_MTAP(ifp, mc_head); } } else { retries++; if (retries < 4) { goto retry_send; } IF_PREPEND(&ifp->if_snd, m_head); ifp->if_drv_flags |= IFF_DRV_OACTIVE; /* * Null the mbuf pointer so the completion function * does not free the mbuf chain. We just pushed the * mbuf chain back on the if_snd queue. */ packet->compl.send.send_completion_tid = 0; /* * Release the resources since we will not get any * send completion */ netvsc_xmit_completion(packet); } /* if bpf && mc_head, free the mbuf chain copy */ if (mc_head) { m_freem(mc_head); } } return (ret); } /* * Link up/down notification */ void netvsc_linkstatus_callback(struct hv_device *device_obj, uint32_t status) { hn_softc_t *sc = device_get_softc(device_obj->device); if (sc == NULL) { return; } if (status == 1) { sc->hn_carrier = 1; } else { sc->hn_carrier = 0; } } /* * Append the specified data to the indicated mbuf chain, * Extend the mbuf chain if the new data does not fit in * existing space. * * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. * There should be an equivalent in the kernel mbuf code, * but there does not appear to be one yet. * * Differs from m_append() in that additional mbufs are * allocated with cluster size MJUMPAGESIZE, and filled * accordingly. * * Return 1 if able to complete the job; otherwise 0. */ static int hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) { struct mbuf *m, *n; int remainder, space; for (m = m0; m->m_next != NULL; m = m->m_next) ; remainder = len; space = M_TRAILINGSPACE(m); if (space > 0) { /* * Copy into available space. */ if (space > remainder) space = remainder; bcopy(cp, mtod(m, caddr_t) + m->m_len, space); m->m_len += space; cp += space; remainder -= space; } while (remainder > 0) { /* * Allocate a new mbuf; could check space * and allocate a cluster instead. */ n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE); if (n == NULL) break; n->m_len = min(MJUMPAGESIZE, remainder); bcopy(cp, mtod(n, caddr_t), n->m_len); cp += n->m_len; remainder -= n->m_len; m->m_next = n; m = n; } if (m0->m_flags & M_PKTHDR) m0->m_pkthdr.len += len - remainder; return (remainder == 0); } /* * Called when we receive a data packet from the "wire" on the * specified device * * Note: This is no longer used as a callback */ int netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet) { hn_softc_t *sc = (hn_softc_t *)device_get_softc(device_ctx->device); struct mbuf *m_new; struct ifnet *ifp; int size; int i; if (sc == NULL) { return (0); /* TODO: KYS how can this be! */ } ifp = sc->hn_ifp; - - ifp = sc->arpcom.ac_ifp; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { return (0); } /* * Bail out if packet contains more data than configured MTU. */ if (packet->tot_data_buf_len > (ifp->if_mtu + ETHER_HDR_LEN)) { return (0); } /* * Get an mbuf with a cluster. For packets 2K or less, * get a standard 2K cluster. For anything larger, get a * 4K cluster. Any buffers larger than 4K can cause problems * if looped around to the Hyper-V TX channel, so avoid them. */ size = MCLBYTES; if (packet->tot_data_buf_len > MCLBYTES) { /* 4096 */ size = MJUMPAGESIZE; } m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); if (m_new == NULL) return (0); /* * Remove trailing junk from RX data buffer. * Fixme: This will not work for multiple Hyper-V RX buffers. * Fortunately, the channel gathers all RX data into one buffer. * * L2 frame length, with L2 header, not including CRC */ packet->page_buffers[0].length = packet->tot_data_buf_len; /* * Copy the received packet to one or more mbufs. * The copy is required since the memory pointed to by netvsc_packet * cannot be deallocated */ for (i=0; i < packet->page_buf_count; i++) { /* Shift virtual page number to form virtual page address */ uint8_t *vaddr = (uint8_t *)(uintptr_t) (packet->page_buffers[i].pfn << PAGE_SHIFT); hv_m_append(m_new, packet->page_buffers[i].length, vaddr + packet->page_buffers[i].offset); } m_new->m_pkthdr.rcvif = ifp; if ((packet->vlan_tci != 0) && (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0) { m_new->m_pkthdr.ether_vtag = packet->vlan_tci; m_new->m_flags |= M_VLANTAG; } /* * Note: Moved RX completion back to hv_nv_on_receive() so all * messages (not just data messages) will trigger a response. */ if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); /* We're not holding the lock here, so don't release it */ (*ifp->if_input)(ifp, m_new); return (0); } /* * Rules for using sc->temp_unusable: * 1. sc->temp_unusable can only be read or written while holding NV_LOCK() * 2. code reading sc->temp_unusable under NV_LOCK(), and finding * sc->temp_unusable set, must release NV_LOCK() and exit * 3. to retain exclusive control of the interface, * sc->temp_unusable must be set by code before releasing NV_LOCK() * 4. only code setting sc->temp_unusable can clear sc->temp_unusable * 5. code setting sc->temp_unusable must eventually clear sc->temp_unusable */ /* * Standard ioctl entry point. Called when the user wants to configure * the interface. */ static int hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { hn_softc_t *sc = ifp->if_softc; struct ifreq *ifr = (struct ifreq *)data; netvsc_device_info device_info; struct hv_device *hn_dev; int mask, error = 0; int retry_cnt = 500; switch(cmd) { case SIOCSIFADDR: case SIOCGIFADDR: error = ether_ioctl(ifp, cmd, data); break; case SIOCSIFMTU: hn_dev = vmbus_get_devctx(sc->hn_dev); /* Check MTU value change */ if (ifp->if_mtu == ifr->ifr_mtu) break; if (ifr->ifr_mtu > NETVSC_MAX_CONFIGURABLE_MTU) { error = EINVAL; break; } /* Obtain and record requested MTU */ ifp->if_mtu = ifr->ifr_mtu; do { NV_LOCK(sc); if (!sc->temp_unusable) { sc->temp_unusable = TRUE; retry_cnt = -1; } NV_UNLOCK(sc); if (retry_cnt > 0) { retry_cnt--; DELAY(5 * 1000); } } while (retry_cnt > 0); if (retry_cnt == 0) { error = EINVAL; break; } /* We must remove and add back the device to cause the new * MTU to take effect. This includes tearing down, but not * deleting the channel, then bringing it back up. */ error = hv_rf_on_device_remove(hn_dev, HV_RF_NV_RETAIN_CHANNEL); if (error) { NV_LOCK(sc); sc->temp_unusable = FALSE; NV_UNLOCK(sc); break; } error = hv_rf_on_device_add(hn_dev, &device_info); if (error) { NV_LOCK(sc); sc->temp_unusable = FALSE; NV_UNLOCK(sc); break; } hn_ifinit_locked(sc); NV_LOCK(sc); sc->temp_unusable = FALSE; NV_UNLOCK(sc); break; case SIOCSIFFLAGS: do { NV_LOCK(sc); if (!sc->temp_unusable) { sc->temp_unusable = TRUE; retry_cnt = -1; } NV_UNLOCK(sc); if (retry_cnt > 0) { retry_cnt--; DELAY(5 * 1000); } } while (retry_cnt > 0); if (retry_cnt == 0) { error = EINVAL; break; } if (ifp->if_flags & IFF_UP) { /* * If only the state of the PROMISC flag changed, * then just use the 'set promisc mode' command * instead of reinitializing the entire NIC. Doing * a full re-init means reloading the firmware and * waiting for it to start up, which may take a * second or two. */ #ifdef notyet /* Fixme: Promiscuous mode? */ if (ifp->if_drv_flags & IFF_DRV_RUNNING && ifp->if_flags & IFF_PROMISC && !(sc->hn_if_flags & IFF_PROMISC)) { /* do something here for Hyper-V */ } else if (ifp->if_drv_flags & IFF_DRV_RUNNING && !(ifp->if_flags & IFF_PROMISC) && sc->hn_if_flags & IFF_PROMISC) { /* do something here for Hyper-V */ } else #endif hn_ifinit_locked(sc); } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) { hn_stop(sc); } } NV_LOCK(sc); sc->temp_unusable = FALSE; NV_UNLOCK(sc); sc->hn_if_flags = ifp->if_flags; error = 0; break; case SIOCSIFCAP: mask = ifr->ifr_reqcap ^ ifp->if_capenable; if (mask & IFCAP_HWCSUM) { if (IFCAP_HWCSUM & ifp->if_capenable) { ifp->if_capenable &= ~IFCAP_HWCSUM; } else { ifp->if_capenable |= IFCAP_HWCSUM; } } error = 0; break; case SIOCADDMULTI: case SIOCDELMULTI: #ifdef notyet /* Fixme: Multicast mode? */ if (ifp->if_drv_flags & IFF_DRV_RUNNING) { NV_LOCK(sc); netvsc_setmulti(sc); NV_UNLOCK(sc); error = 0; } #endif /* FALLTHROUGH */ case SIOCSIFMEDIA: case SIOCGIFMEDIA: error = EINVAL; break; default: error = ether_ioctl(ifp, cmd, data); break; } return (error); } /* * */ static void hn_stop(hn_softc_t *sc) { struct ifnet *ifp; int ret; struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev); ifp = sc->hn_ifp; if (bootverbose) printf(" Closing Device ...\n"); ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); sc->hn_initdone = 0; ret = hv_rf_on_close(device_ctx); } /* * FreeBSD transmit entry point */ static void hn_start(struct ifnet *ifp) { hn_softc_t *sc; sc = ifp->if_softc; NV_LOCK(sc); if (sc->temp_unusable) { NV_UNLOCK(sc); return; } hn_start_locked(ifp); NV_UNLOCK(sc); } /* * */ static void hn_ifinit_locked(hn_softc_t *sc) { struct ifnet *ifp; struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev); int ret; ifp = sc->hn_ifp; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { return; } hv_promisc_mode = 1; ret = hv_rf_on_open(device_ctx); if (ret != 0) { return; } else { sc->hn_initdone = 1; } ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; } /* * */ static void hn_ifinit(void *xsc) { hn_softc_t *sc = xsc; NV_LOCK(sc); if (sc->temp_unusable) { NV_UNLOCK(sc); return; } sc->temp_unusable = TRUE; NV_UNLOCK(sc); hn_ifinit_locked(sc); NV_LOCK(sc); sc->temp_unusable = FALSE; NV_UNLOCK(sc); } #ifdef LATER /* * */ static void hn_watchdog(struct ifnet *ifp) { hn_softc_t *sc; sc = ifp->if_softc; printf("hn%d: watchdog timeout -- resetting\n", sc->hn_unit); hn_ifinit(sc); /*???*/ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } #endif static device_method_t netvsc_methods[] = { /* Device interface */ DEVMETHOD(device_probe, netvsc_probe), DEVMETHOD(device_attach, netvsc_attach), DEVMETHOD(device_detach, netvsc_detach), DEVMETHOD(device_shutdown, netvsc_shutdown), { 0, 0 } }; static driver_t netvsc_driver = { NETVSC_DEVNAME, netvsc_methods, sizeof(hn_softc_t) }; static devclass_t netvsc_devclass; DRIVER_MODULE(hn, vmbus, netvsc_driver, netvsc_devclass, 0, 0); MODULE_VERSION(hn, 1); MODULE_DEPEND(hn, vmbus, 1, 1, 1); SYSINIT(netvsc_initx, SI_SUB_KTHREAD_IDLE, SI_ORDER_MIDDLE + 1, netvsc_init, NULL); Index: user/dchagin/lemul/sys/dev/hyperv =================================================================== --- user/dchagin/lemul/sys/dev/hyperv (revision 274240) +++ user/dchagin/lemul/sys/dev/hyperv (revision 274241) Property changes on: user/dchagin/lemul/sys/dev/hyperv ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sys/dev/hyperv:r273766-274240 Index: user/dchagin/lemul/sys/dev/ixl/ixl_txrx.c =================================================================== --- user/dchagin/lemul/sys/dev/ixl/ixl_txrx.c (revision 274240) +++ user/dchagin/lemul/sys/dev/ixl/ixl_txrx.c (revision 274241) @@ -1,1673 +1,1673 @@ /****************************************************************************** Copyright (c) 2013-2014, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************/ /*$FreeBSD$*/ /* ** IXL driver TX/RX Routines: ** This was seperated to allow usage by ** both the BASE and the VF drivers. */ #include "opt_inet.h" #include "opt_inet6.h" #include "ixl.h" /* Local Prototypes */ static void ixl_rx_checksum(struct mbuf *, u32, u32, u8); static void ixl_refresh_mbufs(struct ixl_queue *, int); static int ixl_xmit(struct ixl_queue *, struct mbuf **); static int ixl_tx_setup_offload(struct ixl_queue *, struct mbuf *, u32 *, u32 *); static bool ixl_tso_setup(struct ixl_queue *, struct mbuf *); static __inline void ixl_rx_discard(struct rx_ring *, int); static __inline void ixl_rx_input(struct rx_ring *, struct ifnet *, struct mbuf *, u8); /* ** Multiqueue Transmit driver ** */ int ixl_mq_start(struct ifnet *ifp, struct mbuf *m) { struct ixl_vsi *vsi = ifp->if_softc; struct ixl_queue *que; struct tx_ring *txr; int err, i; /* Which queue to use */ if ((m->m_flags & M_FLOWID) != 0) i = m->m_pkthdr.flowid % vsi->num_queues; else i = curcpu % vsi->num_queues; /* Check for a hung queue and pick alternative */ if (((1 << i) & vsi->active_queues) == 0) i = ffsl(vsi->active_queues); que = &vsi->queues[i]; txr = &que->txr; err = drbr_enqueue(ifp, txr->br, m); if (err) return(err); if (IXL_TX_TRYLOCK(txr)) { ixl_mq_start_locked(ifp, txr); IXL_TX_UNLOCK(txr); } else taskqueue_enqueue(que->tq, &que->tx_task); return (0); } int ixl_mq_start_locked(struct ifnet *ifp, struct tx_ring *txr) { struct ixl_queue *que = txr->que; struct ixl_vsi *vsi = que->vsi; struct mbuf *next; int err = 0; if (((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) || vsi->link_active == 0) return (ENETDOWN); /* Process the transmit queue */ while ((next = drbr_peek(ifp, txr->br)) != NULL) { if ((err = ixl_xmit(que, &next)) != 0) { if (next == NULL) drbr_advance(ifp, txr->br); else drbr_putback(ifp, txr->br, next); break; } drbr_advance(ifp, txr->br); /* Send a copy of the frame to the BPF listener */ ETHER_BPF_MTAP(ifp, next); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) break; } if (txr->avail < IXL_TX_CLEANUP_THRESHOLD) ixl_txeof(que); return (err); } /* * Called from a taskqueue to drain queued transmit packets. */ void ixl_deferred_mq_start(void *arg, int pending) { struct ixl_queue *que = arg; struct tx_ring *txr = &que->txr; struct ixl_vsi *vsi = que->vsi; struct ifnet *ifp = vsi->ifp; IXL_TX_LOCK(txr); if (!drbr_empty(ifp, txr->br)) ixl_mq_start_locked(ifp, txr); IXL_TX_UNLOCK(txr); } /* ** Flush all queue ring buffers */ void ixl_qflush(struct ifnet *ifp) { struct ixl_vsi *vsi = ifp->if_softc; for (int i = 0; i < vsi->num_queues; i++) { struct ixl_queue *que = &vsi->queues[i]; struct tx_ring *txr = &que->txr; struct mbuf *m; IXL_TX_LOCK(txr); while ((m = buf_ring_dequeue_sc(txr->br)) != NULL) m_freem(m); IXL_TX_UNLOCK(txr); } if_qflush(ifp); } /* ** Find mbuf chains passed to the driver ** that are 'sparse', using more than 8 ** mbufs to deliver an mss-size chunk of data */ static inline bool ixl_tso_detect_sparse(struct mbuf *mp) { struct mbuf *m; int num = 0, mss; bool ret = FALSE; mss = mp->m_pkthdr.tso_segsz; for (m = mp->m_next; m != NULL; m = m->m_next) { num++; mss -= m->m_len; if (mss < 1) break; if (m->m_next == NULL) break; } if (num > IXL_SPARSE_CHAIN) ret = TRUE; return (ret); } /********************************************************************* * * This routine maps the mbufs to tx descriptors, allowing the * TX engine to transmit the packets. * - return 0 on success, positive on failure * **********************************************************************/ #define IXL_TXD_CMD (I40E_TX_DESC_CMD_EOP | I40E_TX_DESC_CMD_RS) static int ixl_xmit(struct ixl_queue *que, struct mbuf **m_headp) { struct ixl_vsi *vsi = que->vsi; struct i40e_hw *hw = vsi->hw; struct tx_ring *txr = &que->txr; struct ixl_tx_buf *buf; struct i40e_tx_desc *txd = NULL; struct mbuf *m_head, *m; int i, j, error, nsegs, maxsegs; int first, last = 0; u16 vtag = 0; u32 cmd, off; bus_dmamap_t map; bus_dma_tag_t tag; bus_dma_segment_t segs[IXL_MAX_TSO_SEGS]; cmd = off = 0; m_head = *m_headp; /* * Important to capture the first descriptor * used because it will contain the index of * the one we tell the hardware to report back */ first = txr->next_avail; buf = &txr->buffers[first]; map = buf->map; tag = txr->tx_tag; maxsegs = IXL_MAX_TX_SEGS; if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { /* Use larger mapping for TSO */ tag = txr->tso_tag; maxsegs = IXL_MAX_TSO_SEGS; if (ixl_tso_detect_sparse(m_head)) { m = m_defrag(m_head, M_NOWAIT); if (m == NULL) { m_freem(*m_headp); *m_headp = NULL; return (ENOBUFS); } *m_headp = m; } } /* * Map the packet for DMA. */ error = bus_dmamap_load_mbuf_sg(tag, map, *m_headp, segs, &nsegs, BUS_DMA_NOWAIT); if (error == EFBIG) { struct mbuf *m; m = m_collapse(*m_headp, M_NOWAIT, maxsegs); if (m == NULL) { que->mbuf_defrag_failed++; m_freem(*m_headp); *m_headp = NULL; return (ENOBUFS); } *m_headp = m; /* Try it again */ error = bus_dmamap_load_mbuf_sg(tag, map, *m_headp, segs, &nsegs, BUS_DMA_NOWAIT); if (error == ENOMEM) { que->tx_dma_setup++; return (error); } else if (error != 0) { que->tx_dma_setup++; m_freem(*m_headp); *m_headp = NULL; return (error); } } else if (error == ENOMEM) { que->tx_dma_setup++; return (error); } else if (error != 0) { que->tx_dma_setup++; m_freem(*m_headp); *m_headp = NULL; return (error); } /* Make certain there are enough descriptors */ if (nsegs > txr->avail - 2) { txr->no_desc++; error = ENOBUFS; goto xmit_fail; } m_head = *m_headp; /* Set up the TSO/CSUM offload */ if (m_head->m_pkthdr.csum_flags & CSUM_OFFLOAD) { error = ixl_tx_setup_offload(que, m_head, &cmd, &off); if (error) goto xmit_fail; } cmd |= I40E_TX_DESC_CMD_ICRC; /* Grab the VLAN tag */ if (m_head->m_flags & M_VLANTAG) { cmd |= I40E_TX_DESC_CMD_IL2TAG1; vtag = htole16(m_head->m_pkthdr.ether_vtag); } i = txr->next_avail; for (j = 0; j < nsegs; j++) { bus_size_t seglen; buf = &txr->buffers[i]; buf->tag = tag; /* Keep track of the type tag */ txd = &txr->base[i]; seglen = segs[j].ds_len; txd->buffer_addr = htole64(segs[j].ds_addr); txd->cmd_type_offset_bsz = htole64(I40E_TX_DESC_DTYPE_DATA | ((u64)cmd << I40E_TXD_QW1_CMD_SHIFT) | ((u64)off << I40E_TXD_QW1_OFFSET_SHIFT) | ((u64)seglen << I40E_TXD_QW1_TX_BUF_SZ_SHIFT) | ((u64)vtag << I40E_TXD_QW1_L2TAG1_SHIFT)); last = i; /* descriptor that will get completion IRQ */ if (++i == que->num_desc) i = 0; buf->m_head = NULL; buf->eop_index = -1; } /* Set the last descriptor for report */ txd->cmd_type_offset_bsz |= htole64(((u64)IXL_TXD_CMD << I40E_TXD_QW1_CMD_SHIFT)); txr->avail -= nsegs; txr->next_avail = i; buf->m_head = m_head; /* Swap the dma map between the first and last descriptor */ txr->buffers[first].map = buf->map; buf->map = map; bus_dmamap_sync(tag, map, BUS_DMASYNC_PREWRITE); /* Set the index of the descriptor that will be marked done */ buf = &txr->buffers[first]; buf->eop_index = last; bus_dmamap_sync(txr->dma.tag, txr->dma.map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* * Advance the Transmit Descriptor Tail (Tdt), this tells the * hardware that this frame is available to transmit. */ ++txr->total_packets; wr32(hw, txr->tail, i); ixl_flush(hw); /* Mark outstanding work */ if (que->busy == 0) que->busy = 1; return (0); xmit_fail: bus_dmamap_unload(tag, buf->map); return (error); } /********************************************************************* * * Allocate memory for tx_buffer structures. The tx_buffer stores all * the information needed to transmit a packet on the wire. This is * called only once at attach, setup is done every reset. * **********************************************************************/ int ixl_allocate_tx_data(struct ixl_queue *que) { struct tx_ring *txr = &que->txr; struct ixl_vsi *vsi = que->vsi; device_t dev = vsi->dev; struct ixl_tx_buf *buf; int error = 0; /* * Setup DMA descriptor areas. */ if ((error = bus_dma_tag_create(NULL, /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ IXL_TSO_SIZE, /* maxsize */ IXL_MAX_TX_SEGS, /* nsegments */ PAGE_SIZE, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &txr->tx_tag))) { device_printf(dev,"Unable to allocate TX DMA tag\n"); goto fail; } /* Make a special tag for TSO */ if ((error = bus_dma_tag_create(NULL, /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ IXL_TSO_SIZE, /* maxsize */ IXL_MAX_TSO_SEGS, /* nsegments */ PAGE_SIZE, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &txr->tso_tag))) { device_printf(dev,"Unable to allocate TX TSO DMA tag\n"); goto fail; } if (!(txr->buffers = (struct ixl_tx_buf *) malloc(sizeof(struct ixl_tx_buf) * que->num_desc, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate tx_buffer memory\n"); error = ENOMEM; goto fail; } /* Create the descriptor buffer default dma maps */ buf = txr->buffers; for (int i = 0; i < que->num_desc; i++, buf++) { buf->tag = txr->tx_tag; error = bus_dmamap_create(buf->tag, 0, &buf->map); if (error != 0) { device_printf(dev, "Unable to create TX DMA map\n"); goto fail; } } fail: return (error); } /********************************************************************* * * (Re)Initialize a queue transmit ring. * - called by init, it clears the descriptor ring, * and frees any stale mbufs * **********************************************************************/ void ixl_init_tx_ring(struct ixl_queue *que) { struct tx_ring *txr = &que->txr; struct ixl_tx_buf *buf; /* Clear the old ring contents */ IXL_TX_LOCK(txr); bzero((void *)txr->base, (sizeof(struct i40e_tx_desc)) * que->num_desc); /* Reset indices */ txr->next_avail = 0; txr->next_to_clean = 0; #ifdef IXL_FDIR /* Initialize flow director */ txr->atr_rate = ixl_atr_rate; txr->atr_count = 0; #endif /* Free any existing tx mbufs. */ buf = txr->buffers; for (int i = 0; i < que->num_desc; i++, buf++) { if (buf->m_head != NULL) { bus_dmamap_sync(buf->tag, buf->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(buf->tag, buf->map); m_freem(buf->m_head); buf->m_head = NULL; } /* Clear the EOP index */ buf->eop_index = -1; } /* Set number of descriptors available */ txr->avail = que->num_desc; bus_dmamap_sync(txr->dma.tag, txr->dma.map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); IXL_TX_UNLOCK(txr); } /********************************************************************* * * Free transmit ring related data structures. * **********************************************************************/ void ixl_free_que_tx(struct ixl_queue *que) { struct tx_ring *txr = &que->txr; struct ixl_tx_buf *buf; INIT_DBG_IF(que->vsi->ifp, "queue %d: begin", que->me); for (int i = 0; i < que->num_desc; i++) { buf = &txr->buffers[i]; if (buf->m_head != NULL) { bus_dmamap_sync(buf->tag, buf->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(buf->tag, buf->map); m_freem(buf->m_head); buf->m_head = NULL; if (buf->map != NULL) { bus_dmamap_destroy(buf->tag, buf->map); buf->map = NULL; } } else if (buf->map != NULL) { bus_dmamap_unload(buf->tag, buf->map); bus_dmamap_destroy(buf->tag, buf->map); buf->map = NULL; } } if (txr->br != NULL) buf_ring_free(txr->br, M_DEVBUF); if (txr->buffers != NULL) { free(txr->buffers, M_DEVBUF); txr->buffers = NULL; } if (txr->tx_tag != NULL) { bus_dma_tag_destroy(txr->tx_tag); txr->tx_tag = NULL; } if (txr->tso_tag != NULL) { bus_dma_tag_destroy(txr->tso_tag); txr->tso_tag = NULL; } INIT_DBG_IF(que->vsi->ifp, "queue %d: end", que->me); return; } /********************************************************************* * * Setup descriptor for hw offloads * **********************************************************************/ static int ixl_tx_setup_offload(struct ixl_queue *que, struct mbuf *mp, u32 *cmd, u32 *off) { struct ether_vlan_header *eh; #ifdef INET struct ip *ip = NULL; #endif struct tcphdr *th = NULL; #ifdef INET6 struct ip6_hdr *ip6; #endif int elen, ip_hlen = 0, tcp_hlen; u16 etype; u8 ipproto = 0; bool tso = FALSE; /* Set up the TSO context descriptor if required */ if (mp->m_pkthdr.csum_flags & CSUM_TSO) { tso = ixl_tso_setup(que, mp); if (tso) ++que->tso; else return (ENXIO); } /* * Determine where frame payload starts. * Jump over vlan headers if already present, * helpful for QinQ too. */ eh = mtod(mp, struct ether_vlan_header *); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { etype = ntohs(eh->evl_proto); elen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; } else { etype = ntohs(eh->evl_encap_proto); elen = ETHER_HDR_LEN; } switch (etype) { #ifdef INET case ETHERTYPE_IP: ip = (struct ip *)(mp->m_data + elen); ip_hlen = ip->ip_hl << 2; ipproto = ip->ip_p; th = (struct tcphdr *)((caddr_t)ip + ip_hlen); /* The IP checksum must be recalculated with TSO */ if (tso) *cmd |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM; else *cmd |= I40E_TX_DESC_CMD_IIPT_IPV4; break; #endif #ifdef INET6 case ETHERTYPE_IPV6: ip6 = (struct ip6_hdr *)(mp->m_data + elen); ip_hlen = sizeof(struct ip6_hdr); ipproto = ip6->ip6_nxt; th = (struct tcphdr *)((caddr_t)ip6 + ip_hlen); *cmd |= I40E_TX_DESC_CMD_IIPT_IPV6; break; #endif default: break; } *off |= (elen >> 1) << I40E_TX_DESC_LENGTH_MACLEN_SHIFT; *off |= (ip_hlen >> 2) << I40E_TX_DESC_LENGTH_IPLEN_SHIFT; switch (ipproto) { case IPPROTO_TCP: tcp_hlen = th->th_off << 2; if (mp->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) { *cmd |= I40E_TX_DESC_CMD_L4T_EOFT_TCP; *off |= (tcp_hlen >> 2) << I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT; } #ifdef IXL_FDIR ixl_atr(que, th, etype); #endif break; case IPPROTO_UDP: if (mp->m_pkthdr.csum_flags & (CSUM_UDP|CSUM_UDP_IPV6)) { *cmd |= I40E_TX_DESC_CMD_L4T_EOFT_UDP; *off |= (sizeof(struct udphdr) >> 2) << I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT; } break; case IPPROTO_SCTP: if (mp->m_pkthdr.csum_flags & (CSUM_SCTP|CSUM_SCTP_IPV6)) { *cmd |= I40E_TX_DESC_CMD_L4T_EOFT_SCTP; *off |= (sizeof(struct sctphdr) >> 2) << I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT; } /* Fall Thru */ default: break; } return (0); } /********************************************************************** * * Setup context for hardware segmentation offload (TSO) * **********************************************************************/ static bool ixl_tso_setup(struct ixl_queue *que, struct mbuf *mp) { struct tx_ring *txr = &que->txr; struct i40e_tx_context_desc *TXD; struct ixl_tx_buf *buf; u32 cmd, mss, type, tsolen; u16 etype; int idx, elen, ip_hlen, tcp_hlen; struct ether_vlan_header *eh; #ifdef INET struct ip *ip; #endif #ifdef INET6 struct ip6_hdr *ip6; #endif #if defined(INET6) || defined(INET) struct tcphdr *th; #endif u64 type_cmd_tso_mss; /* * Determine where frame payload starts. * Jump over vlan headers if already present */ eh = mtod(mp, struct ether_vlan_header *); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { elen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; etype = eh->evl_proto; } else { elen = ETHER_HDR_LEN; etype = eh->evl_encap_proto; } switch (ntohs(etype)) { #ifdef INET6 case ETHERTYPE_IPV6: ip6 = (struct ip6_hdr *)(mp->m_data + elen); if (ip6->ip6_nxt != IPPROTO_TCP) return (ENXIO); ip_hlen = sizeof(struct ip6_hdr); th = (struct tcphdr *)((caddr_t)ip6 + ip_hlen); th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); tcp_hlen = th->th_off << 2; break; #endif #ifdef INET case ETHERTYPE_IP: ip = (struct ip *)(mp->m_data + elen); if (ip->ip_p != IPPROTO_TCP) return (ENXIO); ip->ip_sum = 0; ip_hlen = ip->ip_hl << 2; th = (struct tcphdr *)((caddr_t)ip + ip_hlen); th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(IPPROTO_TCP)); tcp_hlen = th->th_off << 2; break; #endif default: printf("%s: CSUM_TSO but no supported IP version (0x%04x)", __func__, ntohs(etype)); return FALSE; } /* Ensure we have at least the IP+TCP header in the first mbuf. */ if (mp->m_len < elen + ip_hlen + sizeof(struct tcphdr)) return FALSE; idx = txr->next_avail; buf = &txr->buffers[idx]; TXD = (struct i40e_tx_context_desc *) &txr->base[idx]; tsolen = mp->m_pkthdr.len - (elen + ip_hlen + tcp_hlen); type = I40E_TX_DESC_DTYPE_CONTEXT; cmd = I40E_TX_CTX_DESC_TSO; mss = mp->m_pkthdr.tso_segsz; type_cmd_tso_mss = ((u64)type << I40E_TXD_CTX_QW1_DTYPE_SHIFT) | ((u64)cmd << I40E_TXD_CTX_QW1_CMD_SHIFT) | ((u64)tsolen << I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) | ((u64)mss << I40E_TXD_CTX_QW1_MSS_SHIFT); TXD->type_cmd_tso_mss = htole64(type_cmd_tso_mss); TXD->tunneling_params = htole32(0); buf->m_head = NULL; buf->eop_index = -1; if (++idx == que->num_desc) idx = 0; txr->avail--; txr->next_avail = idx; return TRUE; } /* ** ixl_get_tx_head - Retrieve the value from the ** location the HW records its HEAD index */ static inline u32 ixl_get_tx_head(struct ixl_queue *que) { struct tx_ring *txr = &que->txr; void *head = &txr->base[que->num_desc]; return LE32_TO_CPU(*(volatile __le32 *)head); } /********************************************************************** * * Examine each tx_buffer in the used queue. If the hardware is done * processing the packet then free associated resources. The * tx_buffer is put back on the free queue. * **********************************************************************/ bool ixl_txeof(struct ixl_queue *que) { struct tx_ring *txr = &que->txr; u32 first, last, head, done, processed; struct ixl_tx_buf *buf; struct i40e_tx_desc *tx_desc, *eop_desc; mtx_assert(&txr->mtx, MA_OWNED); /* These are not the descriptors you seek, move along :) */ if (txr->avail == que->num_desc) { que->busy = 0; return FALSE; } processed = 0; first = txr->next_to_clean; buf = &txr->buffers[first]; tx_desc = (struct i40e_tx_desc *)&txr->base[first]; last = buf->eop_index; if (last == -1) return FALSE; eop_desc = (struct i40e_tx_desc *)&txr->base[last]; /* Get the Head WB value */ head = ixl_get_tx_head(que); /* ** Get the index of the first descriptor ** BEYOND the EOP and call that 'done'. ** I do this so the comparison in the ** inner while loop below can be simple */ if (++last == que->num_desc) last = 0; done = last; bus_dmamap_sync(txr->dma.tag, txr->dma.map, BUS_DMASYNC_POSTREAD); /* ** The HEAD index of the ring is written in a ** defined location, this rather than a done bit ** is what is used to keep track of what must be ** 'cleaned'. */ while (first != head) { /* We clean the range of the packet */ while (first != done) { ++txr->avail; ++processed; if (buf->m_head) { txr->bytes += /* for ITR adjustment */ buf->m_head->m_pkthdr.len; txr->tx_bytes += /* for TX stats */ buf->m_head->m_pkthdr.len; bus_dmamap_sync(buf->tag, buf->map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(buf->tag, buf->map); m_freem(buf->m_head); buf->m_head = NULL; buf->map = NULL; } buf->eop_index = -1; if (++first == que->num_desc) first = 0; buf = &txr->buffers[first]; tx_desc = &txr->base[first]; } ++txr->packets; /* See if there is more work now */ last = buf->eop_index; if (last != -1) { eop_desc = &txr->base[last]; /* Get next done point */ if (++last == que->num_desc) last = 0; done = last; } else break; } bus_dmamap_sync(txr->dma.tag, txr->dma.map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); txr->next_to_clean = first; /* ** Hang detection, we know there's ** work outstanding or the first return ** would have been taken, so indicate an ** unsuccessful pass, in local_timer if ** the value is too great the queue will ** be considered hung. If anything has been ** cleaned then reset the state. */ if ((processed == 0) && (que->busy != IXL_QUEUE_HUNG)) ++que->busy; if (processed) que->busy = 1; /* Note this turns off HUNG */ /* * If there are no pending descriptors, clear the timeout. */ if (txr->avail == que->num_desc) { que->busy = 0; return FALSE; } return TRUE; } /********************************************************************* * * Refresh mbuf buffers for RX descriptor rings * - now keeps its own state so discards due to resource * exhaustion are unnecessary, if an mbuf cannot be obtained * it just returns, keeping its placeholder, thus it can simply * be recalled to try again. * **********************************************************************/ static void ixl_refresh_mbufs(struct ixl_queue *que, int limit) { struct ixl_vsi *vsi = que->vsi; struct rx_ring *rxr = &que->rxr; bus_dma_segment_t hseg[1]; bus_dma_segment_t pseg[1]; struct ixl_rx_buf *buf; struct mbuf *mh, *mp; int i, j, nsegs, error; bool refreshed = FALSE; i = j = rxr->next_refresh; /* Control the loop with one beyond */ if (++j == que->num_desc) j = 0; while (j != limit) { buf = &rxr->buffers[i]; if (rxr->hdr_split == FALSE) goto no_split; if (buf->m_head == NULL) { mh = m_gethdr(M_NOWAIT, MT_DATA); if (mh == NULL) goto update; } else mh = buf->m_head; mh->m_pkthdr.len = mh->m_len = MHLEN; mh->m_len = MHLEN; mh->m_flags |= M_PKTHDR; /* Get the memory mapping */ error = bus_dmamap_load_mbuf_sg(rxr->htag, buf->hmap, mh, hseg, &nsegs, BUS_DMA_NOWAIT); if (error != 0) { printf("Refresh mbufs: hdr dmamap load" " failure - %d\n", error); m_free(mh); buf->m_head = NULL; goto update; } buf->m_head = mh; bus_dmamap_sync(rxr->htag, buf->hmap, BUS_DMASYNC_PREREAD); rxr->base[i].read.hdr_addr = htole64(hseg[0].ds_addr); no_split: if (buf->m_pack == NULL) { mp = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rxr->mbuf_sz); if (mp == NULL) goto update; } else mp = buf->m_pack; mp->m_pkthdr.len = mp->m_len = rxr->mbuf_sz; /* Get the memory mapping */ error = bus_dmamap_load_mbuf_sg(rxr->ptag, buf->pmap, mp, pseg, &nsegs, BUS_DMA_NOWAIT); if (error != 0) { printf("Refresh mbufs: payload dmamap load" " failure - %d\n", error); m_free(mp); buf->m_pack = NULL; goto update; } buf->m_pack = mp; bus_dmamap_sync(rxr->ptag, buf->pmap, BUS_DMASYNC_PREREAD); rxr->base[i].read.pkt_addr = htole64(pseg[0].ds_addr); /* Used only when doing header split */ rxr->base[i].read.hdr_addr = 0; refreshed = TRUE; /* Next is precalculated */ i = j; rxr->next_refresh = i; if (++j == que->num_desc) j = 0; } update: if (refreshed) /* Update hardware tail index */ wr32(vsi->hw, rxr->tail, rxr->next_refresh); return; } /********************************************************************* * * Allocate memory for rx_buffer structures. Since we use one * rx_buffer per descriptor, the maximum number of rx_buffer's * that we'll need is equal to the number of receive descriptors * that we've defined. * **********************************************************************/ int ixl_allocate_rx_data(struct ixl_queue *que) { struct rx_ring *rxr = &que->rxr; struct ixl_vsi *vsi = que->vsi; device_t dev = vsi->dev; struct ixl_rx_buf *buf; int i, bsize, error; bsize = sizeof(struct ixl_rx_buf) * que->num_desc; if (!(rxr->buffers = (struct ixl_rx_buf *) malloc(bsize, M_DEVBUF, M_NOWAIT | M_ZERO))) { device_printf(dev, "Unable to allocate rx_buffer memory\n"); error = ENOMEM; return (error); } if ((error = bus_dma_tag_create(NULL, /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MSIZE, /* maxsize */ 1, /* nsegments */ MSIZE, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &rxr->htag))) { device_printf(dev, "Unable to create RX DMA htag\n"); return (error); } if ((error = bus_dma_tag_create(NULL, /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MJUM16BYTES, /* maxsize */ 1, /* nsegments */ MJUM16BYTES, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &rxr->ptag))) { device_printf(dev, "Unable to create RX DMA ptag\n"); return (error); } for (i = 0; i < que->num_desc; i++) { buf = &rxr->buffers[i]; error = bus_dmamap_create(rxr->htag, BUS_DMA_NOWAIT, &buf->hmap); if (error) { device_printf(dev, "Unable to create RX head map\n"); break; } error = bus_dmamap_create(rxr->ptag, BUS_DMA_NOWAIT, &buf->pmap); if (error) { device_printf(dev, "Unable to create RX pkt map\n"); break; } } return (error); } /********************************************************************* * * (Re)Initialize the queue receive ring and its buffers. * **********************************************************************/ int ixl_init_rx_ring(struct ixl_queue *que) { struct rx_ring *rxr = &que->rxr; -#if defined(INET6) || defined(INET) struct ixl_vsi *vsi = que->vsi; +#if defined(INET6) || defined(INET) struct ifnet *ifp = vsi->ifp; struct lro_ctrl *lro = &rxr->lro; #endif struct ixl_rx_buf *buf; bus_dma_segment_t pseg[1], hseg[1]; int rsize, nsegs, error = 0; IXL_RX_LOCK(rxr); /* Clear the ring contents */ rsize = roundup2(que->num_desc * sizeof(union i40e_rx_desc), DBA_ALIGN); bzero((void *)rxr->base, rsize); /* Cleanup any existing buffers */ for (int i = 0; i < que->num_desc; i++) { buf = &rxr->buffers[i]; if (buf->m_head != NULL) { bus_dmamap_sync(rxr->htag, buf->hmap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rxr->htag, buf->hmap); buf->m_head->m_flags |= M_PKTHDR; m_freem(buf->m_head); } if (buf->m_pack != NULL) { bus_dmamap_sync(rxr->ptag, buf->pmap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rxr->ptag, buf->pmap); buf->m_pack->m_flags |= M_PKTHDR; m_freem(buf->m_pack); } buf->m_head = NULL; buf->m_pack = NULL; } /* header split is off */ rxr->hdr_split = FALSE; /* Now replenish the mbufs */ for (int j = 0; j != que->num_desc; ++j) { struct mbuf *mh, *mp; buf = &rxr->buffers[j]; /* ** Don't allocate mbufs if not ** doing header split, its wasteful */ if (rxr->hdr_split == FALSE) goto skip_head; /* First the header */ buf->m_head = m_gethdr(M_NOWAIT, MT_DATA); if (buf->m_head == NULL) { error = ENOBUFS; goto fail; } m_adj(buf->m_head, ETHER_ALIGN); mh = buf->m_head; mh->m_len = mh->m_pkthdr.len = MHLEN; mh->m_flags |= M_PKTHDR; /* Get the memory mapping */ error = bus_dmamap_load_mbuf_sg(rxr->htag, buf->hmap, buf->m_head, hseg, &nsegs, BUS_DMA_NOWAIT); if (error != 0) /* Nothing elegant to do here */ goto fail; bus_dmamap_sync(rxr->htag, buf->hmap, BUS_DMASYNC_PREREAD); /* Update descriptor */ rxr->base[j].read.hdr_addr = htole64(hseg[0].ds_addr); skip_head: /* Now the payload cluster */ buf->m_pack = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rxr->mbuf_sz); if (buf->m_pack == NULL) { error = ENOBUFS; goto fail; } mp = buf->m_pack; mp->m_pkthdr.len = mp->m_len = rxr->mbuf_sz; /* Get the memory mapping */ error = bus_dmamap_load_mbuf_sg(rxr->ptag, buf->pmap, mp, pseg, &nsegs, BUS_DMA_NOWAIT); if (error != 0) goto fail; bus_dmamap_sync(rxr->ptag, buf->pmap, BUS_DMASYNC_PREREAD); /* Update descriptor */ rxr->base[j].read.pkt_addr = htole64(pseg[0].ds_addr); rxr->base[j].read.hdr_addr = 0; } /* Setup our descriptor indices */ rxr->next_check = 0; rxr->next_refresh = 0; rxr->lro_enabled = FALSE; rxr->split = 0; rxr->bytes = 0; rxr->discard = FALSE; wr32(vsi->hw, rxr->tail, que->num_desc - 1); ixl_flush(vsi->hw); #if defined(INET6) || defined(INET) /* ** Now set up the LRO interface: */ if (ifp->if_capenable & IFCAP_LRO) { int err = tcp_lro_init(lro); if (err) { if_printf(ifp, "queue %d: LRO Initialization failed!\n", que->me); goto fail; } INIT_DBG_IF(ifp, "queue %d: RX Soft LRO Initialized", que->me); rxr->lro_enabled = TRUE; lro->ifp = vsi->ifp; } #endif bus_dmamap_sync(rxr->dma.tag, rxr->dma.map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); fail: IXL_RX_UNLOCK(rxr); return (error); } /********************************************************************* * * Free station receive ring data structures * **********************************************************************/ void ixl_free_que_rx(struct ixl_queue *que) { struct rx_ring *rxr = &que->rxr; struct ixl_rx_buf *buf; INIT_DBG_IF(que->vsi->ifp, "queue %d: begin", que->me); /* Cleanup any existing buffers */ if (rxr->buffers != NULL) { for (int i = 0; i < que->num_desc; i++) { buf = &rxr->buffers[i]; if (buf->m_head != NULL) { bus_dmamap_sync(rxr->htag, buf->hmap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rxr->htag, buf->hmap); buf->m_head->m_flags |= M_PKTHDR; m_freem(buf->m_head); } if (buf->m_pack != NULL) { bus_dmamap_sync(rxr->ptag, buf->pmap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rxr->ptag, buf->pmap); buf->m_pack->m_flags |= M_PKTHDR; m_freem(buf->m_pack); } buf->m_head = NULL; buf->m_pack = NULL; if (buf->hmap != NULL) { bus_dmamap_destroy(rxr->htag, buf->hmap); buf->hmap = NULL; } if (buf->pmap != NULL) { bus_dmamap_destroy(rxr->ptag, buf->pmap); buf->pmap = NULL; } } if (rxr->buffers != NULL) { free(rxr->buffers, M_DEVBUF); rxr->buffers = NULL; } } if (rxr->htag != NULL) { bus_dma_tag_destroy(rxr->htag); rxr->htag = NULL; } if (rxr->ptag != NULL) { bus_dma_tag_destroy(rxr->ptag); rxr->ptag = NULL; } INIT_DBG_IF(que->vsi->ifp, "queue %d: end", que->me); return; } static __inline void ixl_rx_input(struct rx_ring *rxr, struct ifnet *ifp, struct mbuf *m, u8 ptype) { #if defined(INET6) || defined(INET) /* * ATM LRO is only for IPv4/TCP packets and TCP checksum of the packet * should be computed by hardware. Also it should not have VLAN tag in * ethernet header. */ if (rxr->lro_enabled && (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0 && (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) == (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) { /* * Send to the stack if: ** - LRO not enabled, or ** - no LRO resources, or ** - lro enqueue fails */ if (rxr->lro.lro_cnt != 0) if (tcp_lro_rx(&rxr->lro, m, 0) == 0) return; } #endif IXL_RX_UNLOCK(rxr); (*ifp->if_input)(ifp, m); IXL_RX_LOCK(rxr); } static __inline void ixl_rx_discard(struct rx_ring *rxr, int i) { struct ixl_rx_buf *rbuf; rbuf = &rxr->buffers[i]; if (rbuf->fmp != NULL) {/* Partial chain ? */ rbuf->fmp->m_flags |= M_PKTHDR; m_freem(rbuf->fmp); rbuf->fmp = NULL; } /* ** With advanced descriptors the writeback ** clobbers the buffer addrs, so its easier ** to just free the existing mbufs and take ** the normal refresh path to get new buffers ** and mapping. */ if (rbuf->m_head) { m_free(rbuf->m_head); rbuf->m_head = NULL; } if (rbuf->m_pack) { m_free(rbuf->m_pack); rbuf->m_pack = NULL; } return; } /********************************************************************* * * This routine executes in interrupt context. It replenishes * the mbufs in the descriptor and sends data which has been * dma'ed into host memory to upper layer. * * We loop at most count times if count is > 0, or until done if * count < 0. * * Return TRUE for more work, FALSE for all clean. *********************************************************************/ bool ixl_rxeof(struct ixl_queue *que, int count) { struct ixl_vsi *vsi = que->vsi; struct rx_ring *rxr = &que->rxr; struct ifnet *ifp = vsi->ifp; #if defined(INET6) || defined(INET) struct lro_ctrl *lro = &rxr->lro; struct lro_entry *queued; #endif int i, nextp, processed = 0; union i40e_rx_desc *cur; struct ixl_rx_buf *rbuf, *nbuf; IXL_RX_LOCK(rxr); for (i = rxr->next_check; count != 0;) { struct mbuf *sendmp, *mh, *mp; u32 rsc, status, error; u16 hlen, plen, vtag; u64 qword; u8 ptype; bool eop; /* Sync the ring. */ bus_dmamap_sync(rxr->dma.tag, rxr->dma.map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); cur = &rxr->base[i]; qword = le64toh(cur->wb.qword1.status_error_len); status = (qword & I40E_RXD_QW1_STATUS_MASK) >> I40E_RXD_QW1_STATUS_SHIFT; error = (qword & I40E_RXD_QW1_ERROR_MASK) >> I40E_RXD_QW1_ERROR_SHIFT; plen = (qword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >> I40E_RXD_QW1_LENGTH_PBUF_SHIFT; hlen = (qword & I40E_RXD_QW1_LENGTH_HBUF_MASK) >> I40E_RXD_QW1_LENGTH_HBUF_SHIFT; ptype = (qword & I40E_RXD_QW1_PTYPE_MASK) >> I40E_RXD_QW1_PTYPE_SHIFT; if ((status & (1 << I40E_RX_DESC_STATUS_DD_SHIFT)) == 0) { ++rxr->not_done; break; } if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) break; count--; sendmp = NULL; nbuf = NULL; rsc = 0; cur->wb.qword1.status_error_len = 0; rbuf = &rxr->buffers[i]; mh = rbuf->m_head; mp = rbuf->m_pack; eop = (status & (1 << I40E_RX_DESC_STATUS_EOF_SHIFT)); if (status & (1 << I40E_RX_DESC_STATUS_L2TAG1P_SHIFT)) vtag = le16toh(cur->wb.qword0.lo_dword.l2tag1); else vtag = 0; /* ** Make sure bad packets are discarded, ** note that only EOP descriptor has valid ** error results. */ if (eop && (error & (1 << I40E_RX_DESC_ERROR_RXE_SHIFT))) { rxr->discarded++; ixl_rx_discard(rxr, i); goto next_desc; } /* Prefetch the next buffer */ if (!eop) { nextp = i + 1; if (nextp == que->num_desc) nextp = 0; nbuf = &rxr->buffers[nextp]; prefetch(nbuf); } /* ** The header mbuf is ONLY used when header ** split is enabled, otherwise we get normal ** behavior, ie, both header and payload ** are DMA'd into the payload buffer. ** ** Rather than using the fmp/lmp global pointers ** we now keep the head of a packet chain in the ** buffer struct and pass this along from one ** descriptor to the next, until we get EOP. */ if (rxr->hdr_split && (rbuf->fmp == NULL)) { if (hlen > IXL_RX_HDR) hlen = IXL_RX_HDR; mh->m_len = hlen; mh->m_flags |= M_PKTHDR; mh->m_next = NULL; mh->m_pkthdr.len = mh->m_len; /* Null buf pointer so it is refreshed */ rbuf->m_head = NULL; /* ** Check the payload length, this ** could be zero if its a small ** packet. */ if (plen > 0) { mp->m_len = plen; mp->m_next = NULL; mp->m_flags &= ~M_PKTHDR; mh->m_next = mp; mh->m_pkthdr.len += mp->m_len; /* Null buf pointer so it is refreshed */ rbuf->m_pack = NULL; rxr->split++; } /* ** Now create the forward ** chain so when complete ** we wont have to. */ if (eop == 0) { /* stash the chain head */ nbuf->fmp = mh; /* Make forward chain */ if (plen) mp->m_next = nbuf->m_pack; else mh->m_next = nbuf->m_pack; } else { /* Singlet, prepare to send */ sendmp = mh; if (vtag) { sendmp->m_pkthdr.ether_vtag = vtag; sendmp->m_flags |= M_VLANTAG; } } } else { /* ** Either no header split, or a ** secondary piece of a fragmented ** split packet. */ mp->m_len = plen; /* ** See if there is a stored head ** that determines what we are */ sendmp = rbuf->fmp; rbuf->m_pack = rbuf->fmp = NULL; if (sendmp != NULL) /* secondary frag */ sendmp->m_pkthdr.len += mp->m_len; else { /* first desc of a non-ps chain */ sendmp = mp; sendmp->m_flags |= M_PKTHDR; sendmp->m_pkthdr.len = mp->m_len; if (vtag) { sendmp->m_pkthdr.ether_vtag = vtag; sendmp->m_flags |= M_VLANTAG; } } /* Pass the head pointer on */ if (eop == 0) { nbuf->fmp = sendmp; sendmp = NULL; mp->m_next = nbuf->m_pack; } } ++processed; /* Sending this frame? */ if (eop) { sendmp->m_pkthdr.rcvif = ifp; /* gather stats */ rxr->rx_packets++; rxr->rx_bytes += sendmp->m_pkthdr.len; /* capture data for dynamic ITR adjustment */ rxr->packets++; rxr->bytes += sendmp->m_pkthdr.len; if ((ifp->if_capenable & IFCAP_RXCSUM) != 0) ixl_rx_checksum(sendmp, status, error, ptype); sendmp->m_pkthdr.flowid = que->msix; sendmp->m_flags |= M_FLOWID; } next_desc: bus_dmamap_sync(rxr->dma.tag, rxr->dma.map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* Advance our pointers to the next descriptor. */ if (++i == que->num_desc) i = 0; /* Now send to the stack or do LRO */ if (sendmp != NULL) { rxr->next_check = i; ixl_rx_input(rxr, ifp, sendmp, ptype); i = rxr->next_check; } /* Every 8 descriptors we go to refresh mbufs */ if (processed == 8) { ixl_refresh_mbufs(que, i); processed = 0; } } /* Refresh any remaining buf structs */ if (ixl_rx_unrefreshed(que)) ixl_refresh_mbufs(que, i); rxr->next_check = i; #if defined(INET6) || defined(INET) /* * Flush any outstanding LRO work */ while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) { SLIST_REMOVE_HEAD(&lro->lro_active, next); tcp_lro_flush(lro, queued); } #endif IXL_RX_UNLOCK(rxr); return (FALSE); } /********************************************************************* * * Verify that the hardware indicated that the checksum is valid. * Inform the stack about the status of checksum so that stack * doesn't spend time verifying the checksum. * *********************************************************************/ static void ixl_rx_checksum(struct mbuf * mp, u32 status, u32 error, u8 ptype) { struct i40e_rx_ptype_decoded decoded; decoded = decode_rx_desc_ptype(ptype); /* Errors? */ if (error & ((1 << I40E_RX_DESC_ERROR_IPE_SHIFT) | (1 << I40E_RX_DESC_ERROR_L4E_SHIFT))) { mp->m_pkthdr.csum_flags = 0; return; } /* IPv6 with extension headers likely have bad csum */ if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP && decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6) if (status & (1 << I40E_RX_DESC_STATUS_IPV6EXADD_SHIFT)) { mp->m_pkthdr.csum_flags = 0; return; } /* IP Checksum Good */ mp->m_pkthdr.csum_flags = CSUM_IP_CHECKED; mp->m_pkthdr.csum_flags |= CSUM_IP_VALID; if (status & (1 << I40E_RX_DESC_STATUS_L3L4P_SHIFT)) { mp->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); mp->m_pkthdr.csum_data |= htons(0xffff); } return; } #if __FreeBSD_version >= 1100000 uint64_t ixl_get_counter(if_t ifp, ift_counter cnt) { struct ixl_vsi *vsi; vsi = if_getsoftc(ifp); switch (cnt) { case IFCOUNTER_IPACKETS: return (vsi->ipackets); case IFCOUNTER_IERRORS: return (vsi->ierrors); case IFCOUNTER_OPACKETS: return (vsi->opackets); case IFCOUNTER_OERRORS: return (vsi->oerrors); case IFCOUNTER_COLLISIONS: /* Collisions are by standard impossible in 40G/10G Ethernet */ return (0); case IFCOUNTER_IBYTES: return (vsi->ibytes); case IFCOUNTER_OBYTES: return (vsi->obytes); case IFCOUNTER_IMCASTS: return (vsi->imcasts); case IFCOUNTER_OMCASTS: return (vsi->omcasts); case IFCOUNTER_IQDROPS: return (vsi->iqdrops); case IFCOUNTER_OQDROPS: return (vsi->oqdrops); case IFCOUNTER_NOPROTO: return (vsi->noproto); default: return (if_get_counter_default(ifp, cnt)); } } #endif Index: user/dchagin/lemul/sys/dev/usb/serial/u3g.c =================================================================== --- user/dchagin/lemul/sys/dev/usb/serial/u3g.c (revision 274240) +++ user/dchagin/lemul/sys/dev/usb/serial/u3g.c (revision 274241) @@ -1,1211 +1,1213 @@ /* * Copyright (c) 2008 AnyWi Technologies * Author: Andrea Guzzo * * based on uark.c 1.1 2006/08/14 08:30:22 jsg * * * parts from ubsa.c 183348 2008-09-25 12:00:56Z phk * * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. * * $FreeBSD$ */ /* * NOTE: * * - The detour through the tty layer is ridiculously expensive wrt * buffering due to the high speeds. * * We should consider adding a simple r/w device which allows * attaching of PPP in a more efficient way. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "usbdevs.h" #define USB_DEBUG_VAR u3g_debug #include #include #include #include #include #ifdef USB_DEBUG static int u3g_debug = 0; static SYSCTL_NODE(_hw_usb, OID_AUTO, u3g, CTLFLAG_RW, 0, "USB 3g"); SYSCTL_INT(_hw_usb_u3g, OID_AUTO, debug, CTLFLAG_RW, &u3g_debug, 0, "Debug level"); #endif #define U3G_MAXPORTS 12 #define U3G_CONFIG_INDEX 0 #define U3G_BSIZE 2048 #define U3G_TXSIZE (U3G_BSIZE / U3G_TXFRAMES) #define U3G_TXFRAMES 4 /* Eject methods; See also usb_quirks.h:UQ_MSC_EJECT_* */ #define U3GINIT_HUAWEI 1 /* Requires Huawei init command */ #define U3GINIT_SIERRA 2 /* Requires Sierra init command */ #define U3GINIT_SCSIEJECT 3 /* Requires SCSI eject command */ #define U3GINIT_REZERO 4 /* Requires SCSI rezero command */ #define U3GINIT_ZTESTOR 5 /* Requires ZTE SCSI command */ #define U3GINIT_CMOTECH 6 /* Requires CMOTECH SCSI command */ #define U3GINIT_WAIT 7 /* Device reappears after a delay */ #define U3GINIT_SAEL_M460 8 /* Requires vendor init */ #define U3GINIT_HUAWEISCSI 9 /* Requires Huawei SCSI init command */ #define U3GINIT_HUAWEISCSI2 10 /* Requires Huawei SCSI init command (2) */ #define U3GINIT_TCT 11 /* Requires TCT Mobile init command */ enum { U3G_BULK_WR, U3G_BULK_RD, U3G_INTR, U3G_N_TRANSFER, }; struct u3g_softc { struct ucom_super_softc sc_super_ucom; struct ucom_softc sc_ucom[U3G_MAXPORTS]; struct usb_xfer *sc_xfer[U3G_MAXPORTS][U3G_N_TRANSFER]; uint8_t sc_iface[U3G_MAXPORTS]; /* local status register */ uint8_t sc_lsr[U3G_MAXPORTS]; /* local status register */ uint8_t sc_msr[U3G_MAXPORTS]; /* u3g status register */ uint16_t sc_line[U3G_MAXPORTS]; /* line status */ struct usb_device *sc_udev; struct mtx sc_mtx; uint8_t sc_numports; }; static device_probe_t u3g_probe; static device_attach_t u3g_attach; static device_detach_t u3g_detach; static void u3g_free_softc(struct u3g_softc *); static usb_callback_t u3g_write_callback; static usb_callback_t u3g_read_callback; static usb_callback_t u3g_intr_callback; static void u3g_cfg_get_status(struct ucom_softc *, uint8_t *, uint8_t *); static void u3g_cfg_set_dtr(struct ucom_softc *, uint8_t); static void u3g_cfg_set_rts(struct ucom_softc *, uint8_t); static void u3g_start_read(struct ucom_softc *ucom); static void u3g_stop_read(struct ucom_softc *ucom); static void u3g_start_write(struct ucom_softc *ucom); static void u3g_stop_write(struct ucom_softc *ucom); static void u3g_poll(struct ucom_softc *ucom); static void u3g_free(struct ucom_softc *ucom); static void u3g_test_autoinst(void *, struct usb_device *, struct usb_attach_arg *); static int u3g_driver_loaded(struct module *mod, int what, void *arg); static eventhandler_tag u3g_etag; static const struct usb_config u3g_config[U3G_N_TRANSFER] = { [U3G_BULK_WR] = { .type = UE_BULK, .endpoint = UE_ADDR_ANY, .direction = UE_DIR_OUT, .bufsize = U3G_BSIZE,/* bytes */ .frames = U3G_TXFRAMES, .flags = {.pipe_bof = 1,.force_short_xfer = 1,}, .callback = &u3g_write_callback, }, [U3G_BULK_RD] = { .type = UE_BULK, .endpoint = UE_ADDR_ANY, .direction = UE_DIR_IN, .bufsize = U3G_BSIZE,/* bytes */ .flags = {.pipe_bof = 1,.short_xfer_ok = 1,}, .callback = &u3g_read_callback, }, [U3G_INTR] = { .type = UE_INTERRUPT, .endpoint = UE_ADDR_ANY, .direction = UE_DIR_IN, .flags = {.pipe_bof = 1,.short_xfer_ok = 1,.no_pipe_ok = 1,}, .bufsize = 0, /* use wMaxPacketSize */ .callback = &u3g_intr_callback, }, }; static const struct ucom_callback u3g_callback = { .ucom_cfg_get_status = &u3g_cfg_get_status, .ucom_cfg_set_dtr = &u3g_cfg_set_dtr, .ucom_cfg_set_rts = &u3g_cfg_set_rts, .ucom_start_read = &u3g_start_read, .ucom_stop_read = &u3g_stop_read, .ucom_start_write = &u3g_start_write, .ucom_stop_write = &u3g_stop_write, .ucom_poll = &u3g_poll, .ucom_free = &u3g_free, }; static device_method_t u3g_methods[] = { DEVMETHOD(device_probe, u3g_probe), DEVMETHOD(device_attach, u3g_attach), DEVMETHOD(device_detach, u3g_detach), DEVMETHOD_END }; static devclass_t u3g_devclass; static driver_t u3g_driver = { .name = "u3g", .methods = u3g_methods, .size = sizeof(struct u3g_softc), }; DRIVER_MODULE(u3g, uhub, u3g_driver, u3g_devclass, u3g_driver_loaded, 0); MODULE_DEPEND(u3g, ucom, 1, 1, 1); MODULE_DEPEND(u3g, usb, 1, 1, 1); MODULE_VERSION(u3g, 1); static const STRUCT_USB_HOST_ID u3g_devs[] = { #define U3G_DEV(v,p,i) { USB_VPI(USB_VENDOR_##v, USB_PRODUCT_##v##_##p, i) } U3G_DEV(ACERP, H10, 0), U3G_DEV(AIRPLUS, MCD650, 0), U3G_DEV(AIRPRIME, PC5220, 0), U3G_DEV(AIRPRIME, AC313U, 0), U3G_DEV(ALINK, 3G, 0), U3G_DEV(ALINK, 3GU, 0), U3G_DEV(ALINK, DWM652U5, 0), U3G_DEV(AMOI, H01, 0), U3G_DEV(AMOI, H01A, 0), U3G_DEV(AMOI, H02, 0), U3G_DEV(ANYDATA, ADU_500A, 0), U3G_DEV(ANYDATA, ADU_620UW, 0), U3G_DEV(ANYDATA, ADU_E100X, 0), U3G_DEV(AXESSTEL, DATAMODEM, 0), U3G_DEV(CMOTECH, CDMA_MODEM1, 0), U3G_DEV(CMOTECH, CGU628, U3GINIT_CMOTECH), U3G_DEV(DELL, U5500, 0), U3G_DEV(DELL, U5505, 0), U3G_DEV(DELL, U5510, 0), U3G_DEV(DELL, U5520, 0), U3G_DEV(DELL, U5520_2, 0), U3G_DEV(DELL, U5520_3, 0), U3G_DEV(DELL, U5700, 0), U3G_DEV(DELL, U5700_2, 0), U3G_DEV(DELL, U5700_3, 0), U3G_DEV(DELL, U5700_4, 0), U3G_DEV(DELL, U5720, 0), U3G_DEV(DELL, U5720_2, 0), U3G_DEV(DELL, U5730, 0), U3G_DEV(DELL, U5730_2, 0), U3G_DEV(DELL, U5730_3, 0), U3G_DEV(DELL, U740, 0), U3G_DEV(DLINK, DWR510_CD, U3GINIT_SCSIEJECT), U3G_DEV(DLINK, DWR510, 0), U3G_DEV(DLINK3, DWM652, 0), U3G_DEV(HP, EV2200, 0), U3G_DEV(HP, HS2300, 0), U3G_DEV(HP, UN2420_QDL, 0), U3G_DEV(HP, UN2420, 0), U3G_DEV(HUAWEI, E1401, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1402, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1403, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1404, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1405, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1406, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1407, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1408, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1409, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E140A, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E140B, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E140D, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E140E, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E140F, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1410, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1411, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1412, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1413, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1414, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1415, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1416, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1417, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1418, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1419, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E141A, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E141B, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E141C, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E141D, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E141E, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E141F, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1420, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1421, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1422, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1423, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1424, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1425, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1426, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1427, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1428, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1429, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E142A, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E142B, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E142C, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E142D, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E142E, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E142F, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1430, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1431, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1432, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1433, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1434, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1435, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1436, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1437, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1438, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1439, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E143A, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E143B, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E143C, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E143D, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E143E, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E143F, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E173, 0), U3G_DEV(HUAWEI, E173_INIT, U3GINIT_HUAWEISCSI), U3G_DEV(HUAWEI, E3131, 0), U3G_DEV(HUAWEI, E3131_INIT, U3GINIT_HUAWEISCSI), U3G_DEV(HUAWEI, E180V, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E220, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E220BIS, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E392, U3GINIT_HUAWEISCSI), U3G_DEV(HUAWEI, MOBILE, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, E1752, U3GINIT_HUAWEISCSI), U3G_DEV(HUAWEI, E1820, U3GINIT_HUAWEISCSI), U3G_DEV(HUAWEI, K3772, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, K3772_INIT, U3GINIT_HUAWEISCSI2), U3G_DEV(HUAWEI, K3765, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, K3765_INIT, U3GINIT_HUAWEISCSI), U3G_DEV(HUAWEI, K3770, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, K3770_INIT, U3GINIT_HUAWEISCSI), U3G_DEV(HUAWEI, K4505, U3GINIT_HUAWEI), U3G_DEV(HUAWEI, K4505_INIT, U3GINIT_HUAWEISCSI), U3G_DEV(HUAWEI, ETS2055, U3GINIT_HUAWEI), U3G_DEV(KYOCERA2, CDMA_MSM_K, 0), U3G_DEV(KYOCERA2, KPC680, 0), U3G_DEV(LONGCHEER, WM66, U3GINIT_HUAWEI), U3G_DEV(LONGCHEER, DISK, U3GINIT_TCT), U3G_DEV(LONGCHEER, W14, 0), U3G_DEV(LONGCHEER, XSSTICK, 0), U3G_DEV(MERLIN, V620, 0), U3G_DEV(NEOTEL, PRIME, 0), U3G_DEV(NOVATEL, E725, 0), U3G_DEV(NOVATEL, ES620, 0), U3G_DEV(NOVATEL, ES620_2, 0), U3G_DEV(NOVATEL, EU730, 0), U3G_DEV(NOVATEL, EU740, 0), U3G_DEV(NOVATEL, EU870D, 0), U3G_DEV(NOVATEL, MC760, 0), U3G_DEV(NOVATEL, MC547, 0), U3G_DEV(NOVATEL, MC679, 0), U3G_DEV(NOVATEL, MC950D, 0), U3G_DEV(NOVATEL, MC990D, 0), U3G_DEV(NOVATEL, MIFI2200, U3GINIT_SCSIEJECT), U3G_DEV(NOVATEL, MIFI2200V, U3GINIT_SCSIEJECT), U3G_DEV(NOVATEL, U720, 0), U3G_DEV(NOVATEL, U727, 0), U3G_DEV(NOVATEL, U727_2, 0), U3G_DEV(NOVATEL, U740, 0), U3G_DEV(NOVATEL, U740_2, 0), U3G_DEV(NOVATEL, U760, U3GINIT_SCSIEJECT), U3G_DEV(NOVATEL, U870, 0), U3G_DEV(NOVATEL, V620, 0), U3G_DEV(NOVATEL, V640, 0), U3G_DEV(NOVATEL, V720, 0), U3G_DEV(NOVATEL, V740, 0), U3G_DEV(NOVATEL, X950D, 0), U3G_DEV(NOVATEL, XU870, 0), U3G_DEV(MOTOROLA2, MB886, U3GINIT_SCSIEJECT), U3G_DEV(OPTION, E6500, 0), U3G_DEV(OPTION, E6501, 0), U3G_DEV(OPTION, E6601, 0), U3G_DEV(OPTION, E6721, 0), U3G_DEV(OPTION, E6741, 0), U3G_DEV(OPTION, E6761, 0), U3G_DEV(OPTION, E6800, 0), U3G_DEV(OPTION, E7021, 0), U3G_DEV(OPTION, E7041, 0), U3G_DEV(OPTION, E7061, 0), U3G_DEV(OPTION, E7100, 0), U3G_DEV(OPTION, GE40X, 0), U3G_DEV(OPTION, GT3G, 0), U3G_DEV(OPTION, GT3GPLUS, 0), U3G_DEV(OPTION, GT3GQUAD, 0), U3G_DEV(OPTION, GT3G_1, 0), U3G_DEV(OPTION, GT3G_2, 0), U3G_DEV(OPTION, GT3G_3, 0), U3G_DEV(OPTION, GT3G_4, 0), U3G_DEV(OPTION, GT3G_5, 0), U3G_DEV(OPTION, GT3G_6, 0), U3G_DEV(OPTION, GTHSDPA, 0), U3G_DEV(OPTION, GTM380, 0), U3G_DEV(OPTION, GTMAX36, 0), U3G_DEV(OPTION, GTMAX380HSUPAE, 0), U3G_DEV(OPTION, GTMAXHSUPA, 0), U3G_DEV(OPTION, GTMAXHSUPAE, 0), U3G_DEV(OPTION, VODAFONEMC3G, 0), U3G_DEV(QISDA, H20_1, 0), U3G_DEV(QISDA, H20_2, 0), U3G_DEV(QISDA, H21_1, 0), U3G_DEV(QISDA, H21_2, 0), U3G_DEV(QUALCOMM, NTT_L02C_MODEM, U3GINIT_SCSIEJECT), U3G_DEV(QUALCOMM2, AC8700, 0), U3G_DEV(QUALCOMM2, MF330, 0), U3G_DEV(QUALCOMM2, SIM5218, 0), U3G_DEV(QUALCOMM2, VW110L, U3GINIT_SCSIEJECT), U3G_DEV(QUALCOMM2, GOBI2000_QDL, 0), U3G_DEV(QUALCOMM2, GOBI2000, 0), U3G_DEV(QUALCOMMINC, AC2726, 0), U3G_DEV(QUALCOMMINC, AC8700, 0), U3G_DEV(QUALCOMMINC, AC8710, 0), U3G_DEV(QUALCOMMINC, CDMA_MSM, U3GINIT_SCSIEJECT), U3G_DEV(QUALCOMMINC, E0002, 0), U3G_DEV(QUALCOMMINC, E0003, 0), U3G_DEV(QUALCOMMINC, E0004, 0), U3G_DEV(QUALCOMMINC, E0005, 0), U3G_DEV(QUALCOMMINC, E0006, 0), U3G_DEV(QUALCOMMINC, E0007, 0), U3G_DEV(QUALCOMMINC, E0008, 0), U3G_DEV(QUALCOMMINC, E0009, 0), U3G_DEV(QUALCOMMINC, E000A, 0), U3G_DEV(QUALCOMMINC, E000B, 0), U3G_DEV(QUALCOMMINC, E000C, 0), U3G_DEV(QUALCOMMINC, E000D, 0), U3G_DEV(QUALCOMMINC, E000E, 0), U3G_DEV(QUALCOMMINC, E000F, 0), U3G_DEV(QUALCOMMINC, E0010, 0), U3G_DEV(QUALCOMMINC, E0011, 0), U3G_DEV(QUALCOMMINC, E0012, 0), U3G_DEV(QUALCOMMINC, E0013, 0), U3G_DEV(QUALCOMMINC, E0014, 0), U3G_DEV(QUALCOMMINC, E0017, 0), U3G_DEV(QUALCOMMINC, E0018, 0), U3G_DEV(QUALCOMMINC, E0019, 0), U3G_DEV(QUALCOMMINC, E0020, 0), U3G_DEV(QUALCOMMINC, E0021, 0), U3G_DEV(QUALCOMMINC, E0022, 0), U3G_DEV(QUALCOMMINC, E0023, 0), U3G_DEV(QUALCOMMINC, E0024, 0), U3G_DEV(QUALCOMMINC, E0025, 0), U3G_DEV(QUALCOMMINC, E0026, 0), U3G_DEV(QUALCOMMINC, E0027, 0), U3G_DEV(QUALCOMMINC, E0028, 0), U3G_DEV(QUALCOMMINC, E0029, 0), U3G_DEV(QUALCOMMINC, E0030, 0), U3G_DEV(QUALCOMMINC, E0032, 0), U3G_DEV(QUALCOMMINC, E0033, 0), U3G_DEV(QUALCOMMINC, E0037, 0), U3G_DEV(QUALCOMMINC, E0039, 0), U3G_DEV(QUALCOMMINC, E0042, 0), U3G_DEV(QUALCOMMINC, E0043, 0), U3G_DEV(QUALCOMMINC, E0048, 0), U3G_DEV(QUALCOMMINC, E0049, 0), U3G_DEV(QUALCOMMINC, E0051, 0), U3G_DEV(QUALCOMMINC, E0052, 0), U3G_DEV(QUALCOMMINC, E0054, 0), U3G_DEV(QUALCOMMINC, E0055, 0), U3G_DEV(QUALCOMMINC, E0057, 0), U3G_DEV(QUALCOMMINC, E0058, 0), U3G_DEV(QUALCOMMINC, E0059, 0), U3G_DEV(QUALCOMMINC, E0060, 0), U3G_DEV(QUALCOMMINC, E0061, 0), U3G_DEV(QUALCOMMINC, E0062, 0), U3G_DEV(QUALCOMMINC, E0063, 0), U3G_DEV(QUALCOMMINC, E0064, 0), U3G_DEV(QUALCOMMINC, E0066, 0), U3G_DEV(QUALCOMMINC, E0069, 0), U3G_DEV(QUALCOMMINC, E0070, 0), U3G_DEV(QUALCOMMINC, E0073, 0), U3G_DEV(QUALCOMMINC, E0076, 0), U3G_DEV(QUALCOMMINC, E0078, 0), U3G_DEV(QUALCOMMINC, E0082, 0), U3G_DEV(QUALCOMMINC, E0086, 0), U3G_DEV(QUALCOMMINC, SURFSTICK, 0), U3G_DEV(QUALCOMMINC, E2002, 0), U3G_DEV(QUALCOMMINC, E2003, 0), U3G_DEV(QUALCOMMINC, K3772_Z, 0), U3G_DEV(QUALCOMMINC, K3772_Z_INIT, U3GINIT_SCSIEJECT), + U3G_DEV(QUALCOMMINC, MF195E, 0), + U3G_DEV(QUALCOMMINC, MF195E_INIT, U3GINIT_SCSIEJECT), U3G_DEV(QUALCOMMINC, MF626, 0), U3G_DEV(QUALCOMMINC, MF628, 0), U3G_DEV(QUALCOMMINC, MF633R, 0), U3G_DEV(QUANTA, GKE, 0), U3G_DEV(QUANTA, GLE, 0), U3G_DEV(QUANTA, GLX, 0), U3G_DEV(QUANTA, Q101, 0), U3G_DEV(QUANTA, Q111, 0), U3G_DEV(SIERRA, AC402, 0), U3G_DEV(SIERRA, AC595U, 0), U3G_DEV(SIERRA, AC313U, 0), U3G_DEV(SIERRA, AC597E, 0), U3G_DEV(SIERRA, AC875E, 0), U3G_DEV(SIERRA, AC875U, 0), U3G_DEV(SIERRA, AC875U_2, 0), U3G_DEV(SIERRA, AC880, 0), U3G_DEV(SIERRA, AC880E, 0), U3G_DEV(SIERRA, AC880U, 0), U3G_DEV(SIERRA, AC881, 0), U3G_DEV(SIERRA, AC881E, 0), U3G_DEV(SIERRA, AC881U, 0), U3G_DEV(SIERRA, AC885E, 0), U3G_DEV(SIERRA, AC885E_2, 0), U3G_DEV(SIERRA, AC885U, 0), U3G_DEV(SIERRA, AIRCARD580, 0), U3G_DEV(SIERRA, AIRCARD595, 0), U3G_DEV(SIERRA, AIRCARD875, 0), U3G_DEV(SIERRA, C22, 0), U3G_DEV(SIERRA, C597, 0), U3G_DEV(SIERRA, C888, 0), U3G_DEV(SIERRA, E0029, 0), U3G_DEV(SIERRA, E6892, 0), U3G_DEV(SIERRA, E6893, 0), U3G_DEV(SIERRA, EM5625, 0), U3G_DEV(SIERRA, EM5725, 0), U3G_DEV(SIERRA, MC5720, 0), U3G_DEV(SIERRA, MC5720_2, 0), U3G_DEV(SIERRA, MC5725, 0), U3G_DEV(SIERRA, MC5727, 0), U3G_DEV(SIERRA, MC5727_2, 0), U3G_DEV(SIERRA, MC5728, 0), U3G_DEV(SIERRA, MC8700, 0), U3G_DEV(SIERRA, MC8755, 0), U3G_DEV(SIERRA, MC8755_2, 0), U3G_DEV(SIERRA, MC8755_3, 0), U3G_DEV(SIERRA, MC8755_4, 0), U3G_DEV(SIERRA, MC8765, 0), U3G_DEV(SIERRA, MC8765_2, 0), U3G_DEV(SIERRA, MC8765_3, 0), U3G_DEV(SIERRA, MC8775, 0), U3G_DEV(SIERRA, MC8775_2, 0), U3G_DEV(SIERRA, MC8780, 0), U3G_DEV(SIERRA, MC8780_2, 0), U3G_DEV(SIERRA, MC8780_3, 0), U3G_DEV(SIERRA, MC8781, 0), U3G_DEV(SIERRA, MC8781_2, 0), U3G_DEV(SIERRA, MC8781_3, 0), U3G_DEV(SIERRA, MC8785, 0), U3G_DEV(SIERRA, MC8785_2, 0), U3G_DEV(SIERRA, MC8790, 0), U3G_DEV(SIERRA, MC8791, 0), U3G_DEV(SIERRA, MC8792, 0), U3G_DEV(SIERRA, MINI5725, 0), U3G_DEV(SIERRA, T11, 0), U3G_DEV(SIERRA, T598, 0), U3G_DEV(SILABS, SAEL, U3GINIT_SAEL_M460), U3G_DEV(STELERA, C105, 0), U3G_DEV(STELERA, E1003, 0), U3G_DEV(STELERA, E1004, 0), U3G_DEV(STELERA, E1005, 0), U3G_DEV(STELERA, E1006, 0), U3G_DEV(STELERA, E1007, 0), U3G_DEV(STELERA, E1008, 0), U3G_DEV(STELERA, E1009, 0), U3G_DEV(STELERA, E100A, 0), U3G_DEV(STELERA, E100B, 0), U3G_DEV(STELERA, E100C, 0), U3G_DEV(STELERA, E100D, 0), U3G_DEV(STELERA, E100E, 0), U3G_DEV(STELERA, E100F, 0), U3G_DEV(STELERA, E1010, 0), U3G_DEV(STELERA, E1011, 0), U3G_DEV(STELERA, E1012, 0), U3G_DEV(TCTMOBILE, X060S, 0), U3G_DEV(TCTMOBILE, X080S, U3GINIT_TCT), U3G_DEV(TELIT, UC864E, 0), U3G_DEV(TELIT, UC864G, 0), U3G_DEV(TLAYTECH, TEU800, 0), U3G_DEV(TOSHIBA, G450, 0), U3G_DEV(TOSHIBA, HSDPA, 0), U3G_DEV(YISO, C893, 0), U3G_DEV(WETELECOM, WM_D200, 0), /* Autoinstallers */ U3G_DEV(NOVATEL, ZEROCD, U3GINIT_SCSIEJECT), U3G_DEV(OPTION, GTICON322, U3GINIT_REZERO), U3G_DEV(QUALCOMMINC, ZTE_STOR, U3GINIT_ZTESTOR), U3G_DEV(QUALCOMMINC, ZTE_STOR2, U3GINIT_SCSIEJECT), U3G_DEV(QUANTA, Q101_STOR, U3GINIT_SCSIEJECT), U3G_DEV(SIERRA, TRUINSTALL, U3GINIT_SIERRA), #undef U3G_DEV }; static int u3g_sierra_init(struct usb_device *udev) { struct usb_device_request req; req.bmRequestType = UT_VENDOR; req.bRequest = UR_SET_INTERFACE; USETW(req.wValue, UF_DEVICE_REMOTE_WAKEUP); USETW(req.wIndex, UHF_PORT_CONNECTION); USETW(req.wLength, 0); if (usbd_do_request_flags(udev, NULL, &req, NULL, 0, NULL, USB_MS_HZ)) { /* ignore any errors */ } return (0); } static int u3g_huawei_init(struct usb_device *udev) { struct usb_device_request req; req.bmRequestType = UT_WRITE_DEVICE; req.bRequest = UR_SET_FEATURE; USETW(req.wValue, UF_DEVICE_REMOTE_WAKEUP); USETW(req.wIndex, UHF_PORT_SUSPEND); USETW(req.wLength, 0); if (usbd_do_request_flags(udev, NULL, &req, NULL, 0, NULL, USB_MS_HZ)) { /* ignore any errors */ } return (0); } static void u3g_sael_m460_init(struct usb_device *udev) { static const uint8_t setup[][24] = { { 0x41, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x41, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x41, 0x13, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x01, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0xc1, 0x0f, 0x00, 0x00, 0x00, 0x00, 0x40, 0x02 }, { 0xc1, 0x08, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00 }, { 0x41, 0x07, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00 }, { 0xc1, 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02 }, { 0x41, 0x01, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x41, 0x07, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00 }, { 0x41, 0x03, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00 }, { 0x41, 0x19, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x13 }, { 0x41, 0x13, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x09, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00 }, { 0x41, 0x12, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x41, 0x01, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00 }, { 0x41, 0x07, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00 }, { 0x41, 0x03, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00 }, { 0x41, 0x19, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x13 }, { 0x41, 0x13, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x09, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00 }, { 0x41, 0x07, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00 }, }; struct usb_device_request req; usb_error_t err; uint16_t len; uint8_t buf[0x300]; uint8_t n; DPRINTFN(1, "\n"); if (usbd_req_set_alt_interface_no(udev, NULL, 0, 0)) { DPRINTFN(0, "Alt setting 0 failed\n"); return; } for (n = 0; n != (sizeof(setup)/sizeof(setup[0])); n++) { memcpy(&req, setup[n], sizeof(req)); len = UGETW(req.wLength); if (req.bmRequestType & UE_DIR_IN) { if (len > sizeof(buf)) { DPRINTFN(0, "too small buffer\n"); continue; } err = usbd_do_request(udev, NULL, &req, buf); } else { if (len > (sizeof(setup[0]) - 8)) { DPRINTFN(0, "too small buffer\n"); continue; } err = usbd_do_request(udev, NULL, &req, __DECONST(uint8_t *, &setup[n][8])); } if (err) { DPRINTFN(1, "request %u failed\n", (unsigned int)n); /* * Some of the requests will fail. Stop doing * requests when we are getting timeouts so * that we don't block the explore/attach * thread forever. */ if (err == USB_ERR_TIMEOUT) break; } } } /* * The following function handles 3G modem devices (E220, Mobile, * etc.) with auto-install flash disks for Windows/MacOSX on the first * interface. After some command or some delay they change appearance * to a modem. */ static void u3g_test_autoinst(void *arg, struct usb_device *udev, struct usb_attach_arg *uaa) { struct usb_interface *iface; struct usb_interface_descriptor *id; int error; unsigned long method; if (uaa->dev_state != UAA_DEV_READY) return; iface = usbd_get_iface(udev, 0); if (iface == NULL) return; id = iface->idesc; if (id == NULL || id->bInterfaceClass != UICLASS_MASS) return; if (usb_test_quirk(uaa, UQ_MSC_EJECT_HUAWEI)) method = U3GINIT_HUAWEI; else if (usb_test_quirk(uaa, UQ_MSC_EJECT_SIERRA)) method = U3GINIT_SIERRA; else if (usb_test_quirk(uaa, UQ_MSC_EJECT_SCSIEJECT)) method = U3GINIT_SCSIEJECT; else if (usb_test_quirk(uaa, UQ_MSC_EJECT_REZERO)) method = U3GINIT_REZERO; else if (usb_test_quirk(uaa, UQ_MSC_EJECT_ZTESTOR)) method = U3GINIT_ZTESTOR; else if (usb_test_quirk(uaa, UQ_MSC_EJECT_CMOTECH)) method = U3GINIT_CMOTECH; else if (usb_test_quirk(uaa, UQ_MSC_EJECT_WAIT)) method = U3GINIT_WAIT; else if (usb_test_quirk(uaa, UQ_MSC_EJECT_HUAWEISCSI)) method = U3GINIT_HUAWEISCSI; else if (usb_test_quirk(uaa, UQ_MSC_EJECT_HUAWEISCSI2)) method = U3GINIT_HUAWEISCSI2; else if (usb_test_quirk(uaa, UQ_MSC_EJECT_TCT)) method = U3GINIT_TCT; else if (usbd_lookup_id_by_uaa(u3g_devs, sizeof(u3g_devs), uaa) == 0) method = USB_GET_DRIVER_INFO(uaa); else return; /* no device match */ if (bootverbose) { printf("Ejecting %s %s using method %ld\n", usb_get_manufacturer(udev), usb_get_product(udev), method); } switch (method) { case U3GINIT_HUAWEI: error = u3g_huawei_init(udev); break; case U3GINIT_HUAWEISCSI: error = usb_msc_eject(udev, 0, MSC_EJECT_HUAWEI); break; case U3GINIT_HUAWEISCSI2: error = usb_msc_eject(udev, 0, MSC_EJECT_HUAWEI2); break; case U3GINIT_SCSIEJECT: error = usb_msc_eject(udev, 0, MSC_EJECT_STOPUNIT); break; case U3GINIT_REZERO: error = usb_msc_eject(udev, 0, MSC_EJECT_REZERO); break; case U3GINIT_ZTESTOR: error = usb_msc_eject(udev, 0, MSC_EJECT_STOPUNIT); if (error == 0) error = usb_msc_eject(udev, 0, MSC_EJECT_ZTESTOR); break; case U3GINIT_CMOTECH: error = usb_msc_eject(udev, 0, MSC_EJECT_CMOTECH); break; case U3GINIT_TCT: error = usb_msc_eject(udev, 0, MSC_EJECT_TCT); break; case U3GINIT_SIERRA: error = u3g_sierra_init(udev); break; case U3GINIT_WAIT: /* Just pretend we ejected, the card will timeout */ error = 0; break; default: /* no 3G eject quirks */ error = EOPNOTSUPP; break; } if (error == 0) { /* success, mark the udev as disappearing */ uaa->dev_state = UAA_DEV_EJECTING; } } static int u3g_driver_loaded(struct module *mod, int what, void *arg) { switch (what) { case MOD_LOAD: /* register our autoinstall handler */ u3g_etag = EVENTHANDLER_REGISTER(usb_dev_configured, u3g_test_autoinst, NULL, EVENTHANDLER_PRI_ANY); break; case MOD_UNLOAD: EVENTHANDLER_DEREGISTER(usb_dev_configured, u3g_etag); break; default: return (EOPNOTSUPP); } return (0); } static int u3g_probe(device_t self) { struct usb_attach_arg *uaa = device_get_ivars(self); if (uaa->usb_mode != USB_MODE_HOST) { return (ENXIO); } if (uaa->info.bConfigIndex != U3G_CONFIG_INDEX) { return (ENXIO); } if (uaa->info.bInterfaceClass != UICLASS_VENDOR) { return (ENXIO); } return (usbd_lookup_id_by_uaa(u3g_devs, sizeof(u3g_devs), uaa)); } static int u3g_attach(device_t dev) { struct usb_config u3g_config_tmp[U3G_N_TRANSFER]; struct usb_attach_arg *uaa = device_get_ivars(dev); struct u3g_softc *sc = device_get_softc(dev); struct usb_interface *iface; struct usb_interface_descriptor *id; uint32_t iface_valid; int error, type, nports; int ep, n; uint8_t i; DPRINTF("sc=%p\n", sc); type = USB_GET_DRIVER_INFO(uaa); if (type == U3GINIT_SAEL_M460 || usb_test_quirk(uaa, UQ_MSC_EJECT_SAEL_M460)) { u3g_sael_m460_init(uaa->device); } /* copy in USB config */ for (n = 0; n != U3G_N_TRANSFER; n++) u3g_config_tmp[n] = u3g_config[n]; device_set_usb_desc(dev); mtx_init(&sc->sc_mtx, "u3g", NULL, MTX_DEF); ucom_ref(&sc->sc_super_ucom); sc->sc_udev = uaa->device; /* Claim all interfaces on the device */ iface_valid = 0; for (i = uaa->info.bIfaceIndex; i < USB_IFACE_MAX; i++) { iface = usbd_get_iface(uaa->device, i); if (iface == NULL) break; id = usbd_get_interface_descriptor(iface); if (id == NULL || id->bInterfaceClass != UICLASS_VENDOR) continue; usbd_set_parent_iface(uaa->device, i, uaa->info.bIfaceIndex); iface_valid |= (1<device, &i, sc->sc_xfer[nports], u3g_config_tmp, U3G_N_TRANSFER, &sc->sc_ucom[nports], &sc->sc_mtx); if (error) { /* next interface */ i++; ep = 0; continue; } iface = usbd_get_iface(uaa->device, i); id = usbd_get_interface_descriptor(iface); sc->sc_iface[nports] = id->bInterfaceNumber; if (bootverbose && sc->sc_xfer[nports][U3G_INTR]) { device_printf(dev, "port %d supports modem control\n", nports); } /* set stall by default */ mtx_lock(&sc->sc_mtx); usbd_xfer_set_stall(sc->sc_xfer[nports][U3G_BULK_WR]); usbd_xfer_set_stall(sc->sc_xfer[nports][U3G_BULK_RD]); mtx_unlock(&sc->sc_mtx); nports++; /* found one port */ ep++; if (nports == U3G_MAXPORTS) break; } if (nports == 0) { device_printf(dev, "no ports found\n"); goto detach; } sc->sc_numports = nports; error = ucom_attach(&sc->sc_super_ucom, sc->sc_ucom, sc->sc_numports, sc, &u3g_callback, &sc->sc_mtx); if (error) { DPRINTF("ucom_attach failed\n"); goto detach; } ucom_set_pnpinfo_usb(&sc->sc_super_ucom, dev); device_printf(dev, "Found %u port%s.\n", sc->sc_numports, sc->sc_numports > 1 ? "s":""); return (0); detach: u3g_detach(dev); return (ENXIO); } static int u3g_detach(device_t dev) { struct u3g_softc *sc = device_get_softc(dev); uint8_t subunit; DPRINTF("sc=%p\n", sc); /* NOTE: It is not dangerous to detach more ports than attached! */ ucom_detach(&sc->sc_super_ucom, sc->sc_ucom); for (subunit = 0; subunit != U3G_MAXPORTS; subunit++) usbd_transfer_unsetup(sc->sc_xfer[subunit], U3G_N_TRANSFER); device_claim_softc(dev); u3g_free_softc(sc); return (0); } UCOM_UNLOAD_DRAIN(u3g); static void u3g_free_softc(struct u3g_softc *sc) { if (ucom_unref(&sc->sc_super_ucom)) { mtx_destroy(&sc->sc_mtx); device_free_softc(sc); } } static void u3g_free(struct ucom_softc *ucom) { u3g_free_softc(ucom->sc_parent); } static void u3g_start_read(struct ucom_softc *ucom) { struct u3g_softc *sc = ucom->sc_parent; /* start interrupt endpoint (if configured) */ usbd_transfer_start(sc->sc_xfer[ucom->sc_subunit][U3G_INTR]); /* start read endpoint */ usbd_transfer_start(sc->sc_xfer[ucom->sc_subunit][U3G_BULK_RD]); } static void u3g_stop_read(struct ucom_softc *ucom) { struct u3g_softc *sc = ucom->sc_parent; /* stop interrupt endpoint (if configured) */ usbd_transfer_stop(sc->sc_xfer[ucom->sc_subunit][U3G_INTR]); /* stop read endpoint */ usbd_transfer_stop(sc->sc_xfer[ucom->sc_subunit][U3G_BULK_RD]); } static void u3g_start_write(struct ucom_softc *ucom) { struct u3g_softc *sc = ucom->sc_parent; usbd_transfer_start(sc->sc_xfer[ucom->sc_subunit][U3G_BULK_WR]); } static void u3g_stop_write(struct ucom_softc *ucom) { struct u3g_softc *sc = ucom->sc_parent; usbd_transfer_stop(sc->sc_xfer[ucom->sc_subunit][U3G_BULK_WR]); } static void u3g_write_callback(struct usb_xfer *xfer, usb_error_t error) { struct ucom_softc *ucom = usbd_xfer_softc(xfer); struct usb_page_cache *pc; uint32_t actlen; uint32_t frame; switch (USB_GET_STATE(xfer)) { case USB_ST_TRANSFERRED: case USB_ST_SETUP: tr_setup: for (frame = 0; frame != U3G_TXFRAMES; frame++) { usbd_xfer_set_frame_offset(xfer, frame * U3G_TXSIZE, frame); pc = usbd_xfer_get_frame(xfer, frame); if (ucom_get_data(ucom, pc, 0, U3G_TXSIZE, &actlen) == 0) break; usbd_xfer_set_frame_len(xfer, frame, actlen); } if (frame != 0) { usbd_xfer_set_frames(xfer, frame); usbd_transfer_submit(xfer); } break; default: /* Error */ if (error != USB_ERR_CANCELLED) { /* do a builtin clear-stall */ usbd_xfer_set_stall(xfer); goto tr_setup; } break; } } static void u3g_read_callback(struct usb_xfer *xfer, usb_error_t error) { struct ucom_softc *ucom = usbd_xfer_softc(xfer); struct usb_page_cache *pc; int actlen; usbd_xfer_status(xfer, &actlen, NULL, NULL, NULL); switch (USB_GET_STATE(xfer)) { case USB_ST_TRANSFERRED: pc = usbd_xfer_get_frame(xfer, 0); ucom_put_data(ucom, pc, 0, actlen); case USB_ST_SETUP: tr_setup: usbd_xfer_set_frame_len(xfer, 0, usbd_xfer_max_len(xfer)); usbd_transfer_submit(xfer); break; default: /* Error */ if (error != USB_ERR_CANCELLED) { /* do a builtin clear-stall */ usbd_xfer_set_stall(xfer); goto tr_setup; } break; } } static void u3g_cfg_get_status(struct ucom_softc *ucom, uint8_t *lsr, uint8_t *msr) { struct u3g_softc *sc = ucom->sc_parent; *lsr = sc->sc_lsr[ucom->sc_subunit]; *msr = sc->sc_msr[ucom->sc_subunit]; } static void u3g_cfg_set_line(struct ucom_softc *ucom) { struct u3g_softc *sc = ucom->sc_parent; struct usb_device_request req; req.bmRequestType = UT_WRITE_CLASS_INTERFACE; req.bRequest = UCDC_SET_CONTROL_LINE_STATE; USETW(req.wValue, sc->sc_line[ucom->sc_subunit]); req.wIndex[0] = sc->sc_iface[ucom->sc_subunit]; req.wIndex[1] = 0; USETW(req.wLength, 0); ucom_cfg_do_request(sc->sc_udev, ucom, &req, NULL, 0, 1000); } static void u3g_cfg_set_dtr(struct ucom_softc *ucom, uint8_t onoff) { struct u3g_softc *sc = ucom->sc_parent; DPRINTF("onoff = %d\n", onoff); if (onoff) sc->sc_line[ucom->sc_subunit] |= UCDC_LINE_DTR; else sc->sc_line[ucom->sc_subunit] &= ~UCDC_LINE_DTR; u3g_cfg_set_line(ucom); } static void u3g_cfg_set_rts(struct ucom_softc *ucom, uint8_t onoff) { struct u3g_softc *sc = ucom->sc_parent; DPRINTF("onoff = %d\n", onoff); if (onoff) sc->sc_line[ucom->sc_subunit] |= UCDC_LINE_RTS; else sc->sc_line[ucom->sc_subunit] &= ~UCDC_LINE_RTS; u3g_cfg_set_line(ucom); } static void u3g_intr_callback(struct usb_xfer *xfer, usb_error_t error) { struct ucom_softc *ucom = usbd_xfer_softc(xfer); struct u3g_softc *sc = ucom->sc_parent; struct usb_page_cache *pc; struct usb_cdc_notification pkt; int actlen; uint16_t wLen; uint8_t mstatus; usbd_xfer_status(xfer, &actlen, NULL, NULL, NULL); switch (USB_GET_STATE(xfer)) { case USB_ST_TRANSFERRED: if (actlen < 8) { /* usb_cdc_notification with 2 data bytes */ DPRINTF("message too short (expected 8, received %d)\n", actlen); goto tr_setup; } pc = usbd_xfer_get_frame(xfer, 0); usbd_copy_out(pc, 0, &pkt, actlen); wLen = UGETW(pkt.wLength); if (wLen < 2) { DPRINTF("message too short (expected 2 data bytes, received %d)\n", wLen); goto tr_setup; } if (pkt.bmRequestType == UCDC_NOTIFICATION && pkt.bNotification == UCDC_N_SERIAL_STATE) { /* * Set the serial state in ucom driver based on * the bits from the notify message */ DPRINTF("notify bytes = 0x%02x, 0x%02x\n", pkt.data[0], pkt.data[1]); /* currently, lsr is always zero. */ sc->sc_lsr[ucom->sc_subunit] = 0; sc->sc_msr[ucom->sc_subunit] = 0; mstatus = pkt.data[0]; if (mstatus & UCDC_N_SERIAL_RI) sc->sc_msr[ucom->sc_subunit] |= SER_RI; if (mstatus & UCDC_N_SERIAL_DSR) sc->sc_msr[ucom->sc_subunit] |= SER_DSR; if (mstatus & UCDC_N_SERIAL_DCD) sc->sc_msr[ucom->sc_subunit] |= SER_DCD; ucom_status_change(ucom); } case USB_ST_SETUP: tr_setup: usbd_xfer_set_frame_len(xfer, 0, usbd_xfer_max_len(xfer)); usbd_transfer_submit(xfer); return; default: /* Error */ if (error != USB_ERR_CANCELLED) { /* try to clear stall first */ usbd_xfer_set_stall(xfer); goto tr_setup; } return; } } static void u3g_poll(struct ucom_softc *ucom) { struct u3g_softc *sc = ucom->sc_parent; usbd_transfer_poll(sc->sc_xfer[ucom->sc_subunit], U3G_N_TRANSFER); } Index: user/dchagin/lemul/sys/dev/usb/usbdevs =================================================================== --- user/dchagin/lemul/sys/dev/usb/usbdevs (revision 274240) +++ user/dchagin/lemul/sys/dev/usb/usbdevs (revision 274241) @@ -1,4587 +1,4589 @@ $FreeBSD$ /* $NetBSD: usbdevs,v 1.392 2004/12/29 08:38:44 imp Exp $ */ /*- * Copyright (c) 1998-2004 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Lennart Augustsson (lennart@augustsson.net) at * Carlstedt Research & Technology. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * List of known USB vendors * * USB.org publishes a VID list of USB-IF member companies at * http://www.usb.org/developers/tools * Note that it does not show companies that have obtained a Vendor ID * without becoming full members. * * Please note that these IDs do not do anything. Adding an ID here and * regenerating the usbdevs.h and usbdevs_data.h only makes a symbolic name * available to the source code and does not change any functionality, nor * does it make your device available to a specific driver. * It will however make the descriptive string available if a device does not * provide the string itself. * * After adding a vendor ID VNDR and a product ID PRDCT you will have the * following extra defines: * #define USB_VENDOR_VNDR 0x???? * #define USB_PRODUCT_VNDR_PRDCT 0x???? * * You may have to add these defines to the respective probe routines to * make the device recognised by the appropriate device driver. */ vendor UNKNOWN1 0x0053 Unknown vendor vendor UNKNOWN2 0x0105 Unknown vendor vendor EGALAX2 0x0123 eGalax, Inc. vendor CHIPSBANK 0x0204 Chipsbank Microelectronics Co. vendor HUMAX 0x02ad HUMAX vendor LTS 0x0386 LTS vendor BWCT 0x03da Bernd Walter Computer Technology vendor AOX 0x03e8 AOX vendor THESYS 0x03e9 Thesys vendor DATABROADCAST 0x03ea Data Broadcasting vendor ATMEL 0x03eb Atmel vendor IWATSU 0x03ec Iwatsu America vendor MITSUMI 0x03ee Mitsumi vendor HP 0x03f0 Hewlett Packard vendor GENOA 0x03f1 Genoa vendor OAK 0x03f2 Oak vendor ADAPTEC 0x03f3 Adaptec vendor DIEBOLD 0x03f4 Diebold vendor SIEMENSELECTRO 0x03f5 Siemens Electromechanical vendor EPSONIMAGING 0x03f8 Epson Imaging vendor KEYTRONIC 0x03f9 KeyTronic vendor OPTI 0x03fb OPTi vendor ELITEGROUP 0x03fc Elitegroup vendor XILINX 0x03fd Xilinx vendor FARALLON 0x03fe Farallon Communications vendor NATIONAL 0x0400 National Semiconductor vendor NATIONALREG 0x0401 National Registry vendor ACERLABS 0x0402 Acer Labs vendor FTDI 0x0403 Future Technology Devices vendor NCR 0x0404 NCR vendor SYNOPSYS2 0x0405 Synopsys vendor FUJITSUICL 0x0406 Fujitsu-ICL vendor FUJITSU2 0x0407 Fujitsu Personal Systems vendor QUANTA 0x0408 Quanta vendor NEC 0x0409 NEC vendor KODAK 0x040a Eastman Kodak vendor WELTREND 0x040b Weltrend vendor VIA 0x040d VIA vendor MCCI 0x040e MCCI vendor MELCO 0x0411 Melco vendor LEADTEK 0x0413 Leadtek vendor WINBOND 0x0416 Winbond vendor PHOENIX 0x041a Phoenix vendor CREATIVE 0x041e Creative Labs vendor NOKIA 0x0421 Nokia vendor ADI 0x0422 ADI Systems vendor CATC 0x0423 Computer Access Technology vendor SMC2 0x0424 Standard Microsystems vendor MOTOROLA_HK 0x0425 Motorola HK vendor GRAVIS 0x0428 Advanced Gravis Computer vendor CIRRUSLOGIC 0x0429 Cirrus Logic vendor INNOVATIVE 0x042c Innovative Semiconductors vendor MOLEX 0x042f Molex vendor SUN 0x0430 Sun Microsystems vendor UNISYS 0x0432 Unisys vendor TAUGA 0x0436 Taugagreining HF vendor AMD 0x0438 Advanced Micro Devices vendor LEXMARK 0x043d Lexmark International vendor LG 0x043e LG Electronics vendor NANAO 0x0440 NANAO vendor GATEWAY 0x0443 Gateway 2000 vendor NMB 0x0446 NMB vendor ALPS 0x044e Alps Electric vendor THRUST 0x044f Thrustmaster vendor TI 0x0451 Texas Instruments vendor ANALOGDEVICES 0x0456 Analog Devices vendor SIS 0x0457 Silicon Integrated Systems Corp. vendor KYE 0x0458 KYE Systems vendor DIAMOND2 0x045a Diamond (Supra) vendor RENESAS 0x045b Renesas vendor MICROSOFT 0x045e Microsoft vendor PRIMAX 0x0461 Primax Electronics vendor MGE 0x0463 MGE UPS Systems vendor AMP 0x0464 AMP vendor CHERRY 0x046a Cherry Mikroschalter vendor MEGATRENDS 0x046b American Megatrends vendor LOGITECH 0x046d Logitech vendor BTC 0x046e Behavior Tech. Computer vendor PHILIPS 0x0471 Philips vendor SUN2 0x0472 Sun Microsystems (offical) vendor SANYO 0x0474 Sanyo Electric vendor SEAGATE 0x0477 Seagate vendor CONNECTIX 0x0478 Connectix vendor SEMTECH 0x047a Semtech vendor KENSINGTON 0x047d Kensington vendor LUCENT 0x047e Lucent vendor PLANTRONICS 0x047f Plantronics vendor KYOCERA 0x0482 Kyocera Wireless Corp. vendor STMICRO 0x0483 STMicroelectronics vendor FOXCONN 0x0489 Foxconn vendor MEIZU 0x0492 Meizu Electronics vendor YAMAHA 0x0499 YAMAHA vendor COMPAQ 0x049f Compaq vendor HITACHI 0x04a4 Hitachi vendor ACERP 0x04a5 Acer Peripherals vendor DAVICOM 0x04a6 Davicom vendor VISIONEER 0x04a7 Visioneer vendor CANON 0x04a9 Canon vendor NIKON 0x04b0 Nikon vendor PAN 0x04b1 Pan International vendor IBM 0x04b3 IBM vendor CYPRESS 0x04b4 Cypress Semiconductor vendor ROHM 0x04b5 ROHM vendor COMPAL 0x04b7 Compal vendor EPSON 0x04b8 Seiko Epson vendor RAINBOW 0x04b9 Rainbow Technologies vendor IODATA 0x04bb I-O Data vendor TDK 0x04bf TDK vendor 3COMUSR 0x04c1 U.S. Robotics vendor METHODE 0x04c2 Methode Electronics Far East vendor MAXISWITCH 0x04c3 Maxi Switch vendor LOCKHEEDMER 0x04c4 Lockheed Martin Energy Research vendor FUJITSU 0x04c5 Fujitsu vendor TOSHIBAAM 0x04c6 Toshiba America vendor MICROMACRO 0x04c7 Micro Macro Technologies vendor KONICA 0x04c8 Konica vendor LITEON 0x04ca Lite-On Technology vendor FUJIPHOTO 0x04cb Fuji Photo Film vendor PHILIPSSEMI 0x04cc Philips Semiconductors vendor TATUNG 0x04cd Tatung Co. Of America vendor SCANLOGIC 0x04ce ScanLogic vendor MYSON 0x04cf Myson Technology vendor DIGI2 0x04d0 Digi vendor ITTCANON 0x04d1 ITT Canon vendor ALTEC 0x04d2 Altec Lansing vendor LSI 0x04d4 LSI vendor MENTORGRAPHICS 0x04d6 Mentor Graphics vendor ITUNERNET 0x04d8 I-Tuner Networks vendor HOLTEK 0x04d9 Holtek Semiconductor, Inc. vendor PANASONIC 0x04da Panasonic (Matsushita) vendor HUANHSIN 0x04dc Huan Hsin vendor SHARP 0x04dd Sharp vendor IIYAMA 0x04e1 Iiyama vendor SHUTTLE 0x04e6 Shuttle Technology vendor ELO 0x04e7 Elo TouchSystems vendor SAMSUNG 0x04e8 Samsung Electronics vendor NORTHSTAR 0x04eb Northstar vendor TOKYOELECTRON 0x04ec Tokyo Electron vendor ANNABOOKS 0x04ed Annabooks vendor JVC 0x04f1 JVC vendor CHICONY 0x04f2 Chicony Electronics vendor ELAN 0x04f3 Elan vendor NEWNEX 0x04f7 Newnex vendor BROTHER 0x04f9 Brother Industries vendor DALLAS 0x04fa Dallas Semiconductor vendor AIPTEK2 0x04fc AIPTEK International vendor PFU 0x04fe PFU vendor FUJIKURA 0x0501 Fujikura/DDK vendor ACER 0x0502 Acer vendor 3COM 0x0506 3Com vendor HOSIDEN 0x0507 Hosiden Corporation vendor AZTECH 0x0509 Aztech Systems vendor BELKIN 0x050d Belkin Components vendor KAWATSU 0x050f Kawatsu Semiconductor vendor FCI 0x0514 FCI vendor LONGWELL 0x0516 Longwell vendor COMPOSITE 0x0518 Composite vendor STAR 0x0519 Star Micronics vendor APC 0x051d American Power Conversion vendor SCIATLANTA 0x051e Scientific Atlanta vendor TSM 0x0520 TSM vendor CONNECTEK 0x0522 Advanced Connectek USA vendor NETCHIP 0x0525 NetChip Technology vendor ALTRA 0x0527 ALTRA vendor ATI 0x0528 ATI Technologies vendor AKS 0x0529 Aladdin Knowledge Systems vendor TEKOM 0x052b Tekom vendor CANONDEV 0x052c Canon vendor WACOMTECH 0x0531 Wacom vendor INVENTEC 0x0537 Inventec vendor SHYHSHIUN 0x0539 Shyh Shiun Terminals vendor PREHWERKE 0x053a Preh Werke Gmbh & Co. KG vendor SYNOPSYS 0x053f Synopsys vendor UNIACCESS 0x0540 Universal Access vendor VIEWSONIC 0x0543 ViewSonic vendor XIRLINK 0x0545 Xirlink vendor ANCHOR 0x0547 Anchor Chips vendor SONY 0x054c Sony vendor FUJIXEROX 0x0550 Fuji Xerox vendor VISION 0x0553 VLSI Vision vendor ASAHIKASEI 0x0556 Asahi Kasei Microsystems vendor ATEN 0x0557 ATEN International vendor SAMSUNG2 0x055d Samsung Electronics vendor MUSTEK 0x055f Mustek Systems vendor TELEX 0x0562 Telex Communications vendor CHINON 0x0564 Chinon vendor PERACOM 0x0565 Peracom Networks vendor ALCOR2 0x0566 Alcor Micro vendor XYRATEX 0x0567 Xyratex vendor WACOM 0x056a WACOM vendor ETEK 0x056c e-TEK Labs vendor EIZO 0x056d EIZO vendor ELECOM 0x056e Elecom vendor CONEXANT 0x0572 Conexant vendor HAUPPAUGE 0x0573 Hauppauge Computer Works vendor BAFO 0x0576 BAFO/Quality Computer Accessories vendor YEDATA 0x057b Y-E Data vendor AVM 0x057c AVM vendor QUICKSHOT 0x057f Quickshot vendor ROLAND 0x0582 Roland vendor ROCKFIRE 0x0583 Rockfire vendor RATOC 0x0584 RATOC Systems vendor ZYXEL 0x0586 ZyXEL Communication vendor INFINEON 0x058b Infineon vendor MICREL 0x058d Micrel vendor ALCOR 0x058f Alcor Micro vendor OMRON 0x0590 OMRON vendor ZORAN 0x0595 Zoran Microelectronics vendor NIIGATA 0x0598 Niigata vendor IOMEGA 0x059b Iomega vendor ATREND 0x059c A-Trend Technology vendor AID 0x059d Advanced Input Devices vendor LACIE 0x059f LaCie vendor FUJIFILM 0x05a2 Fuji Film vendor ARC 0x05a3 ARC vendor ORTEK 0x05a4 Ortek vendor CISCOLINKSYS3 0x05a6 Cisco-Linksys vendor BOSE 0x05a7 Bose vendor OMNIVISION 0x05a9 OmniVision vendor INSYSTEM 0x05ab In-System Design vendor APPLE 0x05ac Apple Computer vendor YCCABLE 0x05ad Y.C. Cable vendor DIGITALPERSONA 0x05ba DigitalPersona vendor 3G 0x05bc 3G Green Green Globe vendor RAFI 0x05bd RAFI vendor TYCO 0x05be Tyco vendor KAWASAKI 0x05c1 Kawasaki vendor DIGI 0x05c5 Digi International vendor QUALCOMM2 0x05c6 Qualcomm vendor QTRONIX 0x05c7 Qtronix vendor FOXLINK 0x05c8 Foxlink vendor RICOH 0x05ca Ricoh vendor ELSA 0x05cc ELSA vendor SCIWORX 0x05ce sci-worx vendor BRAINBOXES 0x05d1 Brainboxes Limited vendor ULTIMA 0x05d8 Ultima vendor AXIOHM 0x05d9 Axiohm Transaction Solutions vendor MICROTEK 0x05da Microtek vendor SUNTAC 0x05db SUN Corporation vendor LEXAR 0x05dc Lexar Media vendor ADDTRON 0x05dd Addtron vendor SYMBOL 0x05e0 Symbol Technologies vendor SYNTEK 0x05e1 Syntek vendor GENESYS 0x05e3 Genesys Logic vendor FUJI 0x05e5 Fuji Electric vendor KEITHLEY 0x05e6 Keithley Instruments vendor EIZONANAO 0x05e7 EIZO Nanao vendor KLSI 0x05e9 Kawasaki LSI vendor FFC 0x05eb FFC vendor ANKO 0x05ef Anko Electronic vendor PIENGINEERING 0x05f3 P.I. Engineering vendor AOC 0x05f6 AOC International vendor CHIC 0x05fe Chic Technology vendor BARCO 0x0600 Barco Display Systems vendor BRIDGE 0x0607 Bridge Information vendor SOLIDYEAR 0x060b Solid Year vendor BIORAD 0x0614 Bio-Rad Laboratories vendor MACALLY 0x0618 Macally vendor ACTLABS 0x061c Act Labs vendor ALARIS 0x0620 Alaris vendor APEX 0x0624 Apex vendor CREATIVE3 0x062a Creative Labs vendor MICRON 0x0634 Micron Technology vendor VIVITAR 0x0636 Vivitar vendor GUNZE 0x0637 Gunze Electronics USA vendor AVISION 0x0638 Avision vendor TEAC 0x0644 TEAC vendor ACTON 0x0647 Acton Research Corp. vendor OPTO 0x065a Optoelectronics Co., Ltd vendor SGI 0x065e Silicon Graphics vendor SANWASUPPLY 0x0663 Sanwa Supply vendor MEGATEC 0x0665 Megatec vendor LINKSYS 0x066b Linksys vendor ACERSA 0x066e Acer Semiconductor America vendor SIGMATEL 0x066f Sigmatel vendor DRAYTEK 0x0675 DrayTek vendor AIWA 0x0677 Aiwa vendor ACARD 0x0678 ACARD Technology vendor PROLIFIC 0x067b Prolific Technology vendor SIEMENS 0x067c Siemens vendor AVANCELOGIC 0x0680 Avance Logic vendor SIEMENS2 0x0681 Siemens vendor MINOLTA 0x0686 Minolta vendor CHPRODUCTS 0x068e CH Products vendor HAGIWARA 0x0693 Hagiwara Sys-Com vendor CTX 0x0698 Chuntex vendor ASKEY 0x069a Askey Computer vendor SAITEK 0x06a3 Saitek vendor ALCATELT 0x06b9 Alcatel Telecom vendor AGFA 0x06bd AGFA-Gevaert vendor ASIAMD 0x06be Asia Microelectronic Development vendor BIZLINK 0x06c4 Bizlink International vendor KEYSPAN 0x06cd Keyspan / InnoSys Inc. vendor CONTEC 0x06ce Contec products vendor AASHIMA 0x06d6 Aashima Technology vendor LIEBERT 0x06da Liebert vendor MULTITECH 0x06e0 MultiTech vendor ADS 0x06e1 ADS Technologies vendor ALCATELM 0x06e4 Alcatel Microelectronics vendor SIRIUS 0x06ea Sirius Technologies vendor GUILLEMOT 0x06f8 Guillemot vendor BOSTON 0x06fd Boston Acoustics vendor SMC 0x0707 Standard Microsystems vendor PUTERCOM 0x0708 Putercom vendor MCT 0x0711 MCT vendor IMATION 0x0718 Imation vendor TECLAST 0x071b Teclast vendor SONYERICSSON 0x0731 Sony Ericsson vendor EICON 0x0734 Eicon Networks vendor SYNTECH 0x0745 Syntech Information vendor DIGITALSTREAM 0x074e Digital Stream vendor AUREAL 0x0755 Aureal Semiconductor vendor MAUDIO 0x0763 M-Audio vendor CYBERPOWER 0x0764 Cyber Power Systems, Inc. vendor SURECOM 0x0769 Surecom Technology vendor HIDGLOBAL 0x076b HID Global vendor LINKSYS2 0x077b Linksys vendor GRIFFIN 0x077d Griffin Technology vendor SANDISK 0x0781 SanDisk vendor JENOPTIK 0x0784 Jenoptik vendor LOGITEC 0x0789 Logitec vendor NOKIA2 0x078b Nokia vendor BRIMAX 0x078e Brimax vendor AXIS 0x0792 Axis Communications vendor ABL 0x0794 ABL Electronics vendor SAGEM 0x079b Sagem vendor SUNCOMM 0x079c Sun Communications, Inc. vendor ALFADATA 0x079d Alfadata Computer vendor NATIONALTECH 0x07a2 National Technical Systems vendor ONNTO 0x07a3 Onnto vendor BE 0x07a4 Be vendor ADMTEK 0x07a6 ADMtek vendor COREGA 0x07aa Corega vendor FREECOM 0x07ab Freecom vendor MICROTECH 0x07af Microtech vendor GENERALINSTMNTS 0x07b2 General Instruments (Motorola) vendor OLYMPUS 0x07b4 Olympus vendor ABOCOM 0x07b8 AboCom Systems vendor KEISOKUGIKEN 0x07c1 Keisokugiken vendor ONSPEC 0x07c4 OnSpec vendor APG 0x07c5 APG Cash Drawer vendor BUG 0x07c8 B.U.G. vendor ALLIEDTELESYN 0x07c9 Allied Telesyn International vendor AVERMEDIA 0x07ca AVerMedia Technologies vendor SIIG 0x07cc SIIG vendor CASIO 0x07cf CASIO vendor DLINK2 0x07d1 D-Link vendor APTIO 0x07d2 Aptio Products vendor ARASAN 0x07da Arasan Chip Systems vendor ALLIEDCABLE 0x07e6 Allied Cable vendor STSN 0x07ef STSN vendor CENTURY 0x07f7 Century Corp vendor NEWLINK 0x07ff NEWlink vendor MAGTEK 0x0801 Mag-Tek vendor ZOOM 0x0803 Zoom Telephonics vendor PCS 0x0810 Personal Communication Systems vendor ALPHASMART 0x081e AlphaSmart, Inc. vendor BROADLOGIC 0x0827 BroadLogic vendor HANDSPRING 0x082d Handspring vendor PALM 0x0830 Palm Computing vendor SOURCENEXT 0x0833 SOURCENEXT vendor ACTIONSTAR 0x0835 Action Star Enterprise vendor SAMSUNG_TECHWIN 0x0839 Samsung Techwin vendor ACCTON 0x083a Accton Technology vendor DIAMOND 0x0841 Diamond vendor NETGEAR 0x0846 BayNETGEAR vendor TOPRE 0x0853 Topre Corporation vendor ACTIVEWIRE 0x0854 ActiveWire vendor BBELECTRONICS 0x0856 B&B Electronics vendor PORTGEAR 0x085a PortGear vendor NETGEAR2 0x0864 Netgear vendor SYSTEMTALKS 0x086e System Talks vendor METRICOM 0x0870 Metricom vendor ADESSOKBTEK 0x087c ADESSO/Kbtek America vendor JATON 0x087d Jaton vendor APT 0x0880 APT Technologies vendor BOCARESEARCH 0x0885 Boca Research vendor ANDREA 0x08a8 Andrea Electronics vendor BURRBROWN 0x08bb Burr-Brown Japan vendor 2WIRE 0x08c8 2Wire vendor AIPTEK 0x08ca AIPTEK International vendor SMARTBRIDGES 0x08d1 SmartBridges vendor FUJITSUSIEMENS 0x08d4 Fujitsu-Siemens vendor BILLIONTON 0x08dd Billionton Systems vendor GEMALTO 0x08e6 Gemalto SA vendor EXTENDED 0x08e9 Extended Systems vendor MSYSTEMS 0x08ec M-Systems vendor DIGIANSWER 0x08fd Digianswer vendor AUTHENTEC 0x08ff AuthenTec vendor AUDIOTECHNICA 0x0909 Audio-Technica vendor TRUMPION 0x090a Trumpion Microelectronics vendor FEIYA 0x090c Feiya vendor ALATION 0x0910 Alation Systems vendor GLOBESPAN 0x0915 Globespan vendor CONCORDCAMERA 0x0919 Concord Camera vendor GARMIN 0x091e Garmin International vendor GOHUBS 0x0921 GoHubs vendor XEROX 0x0924 Xerox vendor BIOMETRIC 0x0929 American Biometric Company vendor TOSHIBA 0x0930 Toshiba vendor PLEXTOR 0x093b Plextor vendor INTREPIDCS 0x093c Intrepid vendor YANO 0x094f Yano vendor KINGSTON 0x0951 Kingston Technology vendor BLUEWATER 0x0956 BlueWater Systems vendor AGILENT 0x0957 Agilent Technologies vendor GUDE 0x0959 Gude ADS vendor PORTSMITH 0x095a Portsmith vendor ACERW 0x0967 Acer vendor ADIRONDACK 0x0976 Adirondack Wire & Cable vendor BECKHOFF 0x0978 Beckhoff vendor MINDSATWORK 0x097a Minds At Work vendor POINTCHIPS 0x09a6 PointChips vendor INTERSIL 0x09aa Intersil vendor ALTIUS 0x09b3 Altius Solutions vendor ARRIS 0x09c1 Arris Interactive vendor ACTIVCARD 0x09c3 ACTIVCARD vendor ACTISYS 0x09c4 ACTiSYS vendor NOVATEL2 0x09d7 Novatel Wireless vendor AFOURTECH 0x09da A-FOUR TECH vendor AIMEX 0x09dc AIMEX vendor ADDONICS 0x09df Addonics Technologies vendor AKAI 0x09e8 AKAI professional M.I. vendor ARESCOM 0x09f5 ARESCOM vendor BAY 0x09f9 Bay Associates vendor ALTERA 0x09fb Altera vendor CSR 0x0a12 Cambridge Silicon Radio vendor TREK 0x0a16 Trek Technology vendor ASAHIOPTICAL 0x0a17 Asahi Optical vendor BOCASYSTEMS 0x0a43 Boca Systems vendor SHANTOU 0x0a46 ShanTou vendor MEDIAGEAR 0x0a48 MediaGear vendor BROADCOM 0x0a5c Broadcom vendor GREENHOUSE 0x0a6b GREENHOUSE vendor MEDELI 0x0a67 Medeli vendor GEOCAST 0x0a79 Geocast Network Systems vendor EGO 0x0a92 EGO systems vendor IDQUANTIQUE 0x0aba ID Quantique vendor IDTECH 0x0acd ID TECH vendor ZYDAS 0x0ace Zydas Technology Corporation vendor NEODIO 0x0aec Neodio vendor OPTION 0x0af0 Option N.V. vendor ASUS 0x0b05 ASUSTeK Computer vendor TODOS 0x0b0c Todos Data System vendor SIIG2 0x0b39 SIIG vendor TEKRAM 0x0b3b Tekram Technology vendor HAL 0x0b41 HAL Corporation vendor EMS 0x0b43 EMS Production vendor NEC2 0x0b62 NEC vendor ADLINK 0x0b63 ADLINK Technoligy, Inc. vendor ATI2 0x0b6f ATI vendor ZEEVO 0x0b7a Zeevo, Inc. vendor KURUSUGAWA 0x0b7e Kurusugawa Electronics, Inc. vendor SMART 0x0b8c Smart Technologies vendor ASIX 0x0b95 ASIX Electronics vendor O2MICRO 0x0b97 O2 Micro, Inc. vendor USR 0x0baf U.S. Robotics vendor AMBIT 0x0bb2 Ambit Microsystems vendor HTC 0x0bb4 HTC vendor REALTEK 0x0bda Realtek vendor ERICSSON2 0x0bdb Ericsson vendor MEI 0x0bed MEI vendor ADDONICS2 0x0bf6 Addonics Technology vendor FSC 0x0bf8 Fujitsu Siemens Computers vendor AGATE 0x0c08 Agate Technologies vendor DMI 0x0c0b DMI vendor CANYON 0x0c10 Canyon vendor ICOM 0x0c26 Icom Inc. vendor GNOTOMETRICS 0x0c33 GN Otometrics vendor CHICONY2 0x0c45 Chicony vendor REINERSCT 0x0c4b Reiner-SCT vendor SEALEVEL 0x0c52 Sealevel System vendor JETI 0x0c6c Jeti vendor LUWEN 0x0c76 Luwen vendor ELEKTOR 0x0c7d ELEKTOR Electronics vendor KYOCERA2 0x0c88 Kyocera Wireless Corp. vendor ZCOM 0x0cde Z-Com vendor ATHEROS2 0x0cf3 Atheros Communications vendor POSIFLEX 0x0d3a POSIFLEX vendor TANGTOP 0x0d3d Tangtop vendor KOBIL 0x0d46 KOBIL vendor SMC3 0x0d5c Standard Microsystems vendor ADDON 0x0d7d Add-on Technology vendor ACDC 0x0d7e American Computer & Digital Components vendor CMEDIA 0x0d8c CMEDIA vendor CONCEPTRONIC 0x0d8e Conceptronic vendor SKANHEX 0x0d96 Skanhex Technology, Inc. vendor MSI 0x0db0 Micro Star International vendor ELCON 0x0db7 ELCON Systemtechnik vendor UNKNOWN4 0x0dcd Unknown vendor vendor NETAC 0x0dd8 Netac vendor SITECOMEU 0x0df6 Sitecom Europe vendor MOBILEACTION 0x0df7 Mobile Action vendor AMIGO 0x0e0b Amigo Technology vendor SPEEDDRAGON 0x0e55 Speed Dragon Multimedia vendor HAWKING 0x0e66 Hawking vendor FOSSIL 0x0e67 Fossil, Inc vendor GMATE 0x0e7e G.Mate, Inc vendor MEDIATEK 0x0e8d MediaTek, Inc. vendor OTI 0x0ea0 Ours Technology vendor YISO 0x0eab Yiso Wireless Co. vendor PILOTECH 0x0eaf Pilotech vendor NOVATECH 0x0eb0 NovaTech vendor ITEGNO 0x0eba iTegno vendor WINMAXGROUP 0x0ed1 WinMaxGroup vendor TOD 0x0ede TOD vendor EGALAX 0x0eef eGalax, Inc. vendor AIRPRIME 0x0f3d AirPrime, Inc. vendor MICROTUNE 0x0f4d Microtune vendor VTECH 0x0f88 VTech vendor FALCOM 0x0f94 Falcom Wireless Communications GmbH vendor RIM 0x0fca Research In Motion vendor DYNASTREAM 0x0fcf Dynastream Innovations vendor LARSENBRUSGAARD 0x0fd8 Larsen and Brusgaard vendor OWL 0x0fde OWL vendor KONTRON 0x0fe6 Kontron AG vendor QUALCOMM 0x1004 Qualcomm vendor APACER 0x1005 Apacer vendor MOTOROLA4 0x100d Motorola vendor HP3 0x103c Hewlett Packard vendor AIRPLUS 0x1011 Airplus vendor DESKNOTE 0x1019 Desknote vendor NEC3 0x1033 NEC vendor TTI 0x103e Thurlby Thandar Instruments vendor GIGABYTE 0x1044 GIGABYTE vendor WESTERN 0x1058 Western Digital vendor MOTOROLA 0x1063 Motorola vendor CCYU 0x1065 CCYU Technology vendor CURITEL 0x106c Curitel Communications Inc vendor SILABS2 0x10a6 SILABS2 vendor USI 0x10ab USI vendor LIEBERT2 0x10af Liebert vendor PLX 0x10b5 PLX vendor ASANTE 0x10bd Asante vendor SILABS 0x10c4 Silicon Labs vendor SILABS3 0x10c5 Silicon Labs vendor SILABS4 0x10ce Silicon Labs vendor ACTIONS 0x10d6 Actions vendor ANALOG 0x1110 Analog Devices vendor TENX 0x1130 Ten X Technology, Inc. vendor ISSC 0x1131 Integrated System Solution Corp. vendor JRC 0x1145 Japan Radio Company vendor SPHAIRON 0x114b Sphairon Access Systems GmbH vendor DELORME 0x1163 DeLorme vendor SERVERWORKS 0x1166 ServerWorks vendor DLINK3 0x1186 Dlink vendor ACERCM 0x1189 Acer Communications & Multimedia vendor SIERRA 0x1199 Sierra Wireless vendor SANWA 0x11ad Sanwa Electric Instrument Co., Ltd. vendor TOPFIELD 0x11db Topfield Co., Ltd vendor SIEMENS3 0x11f5 Siemens vendor NETINDEX 0x11f6 NetIndex vendor ALCATEL 0x11f7 Alcatel vendor INTERBIOMETRICS 0x1209 Interbiometrics vendor UNKNOWN3 0x1233 Unknown vendor vendor TSUNAMI 0x1241 Tsunami vendor PHEENET 0x124a Pheenet vendor TARGUS 0x1267 Targus vendor TWINMOS 0x126f TwinMOS vendor TENDA 0x1286 Tenda vendor TESTO 0x128d Testo products vendor CREATIVE2 0x1292 Creative Labs vendor BELKIN2 0x1293 Belkin Components vendor CYBERTAN 0x129b CyberTAN Technology vendor HUAWEI 0x12d1 Huawei Technologies vendor ARANEUS 0x12d8 Araneus Information Systems vendor TAPWAVE 0x12ef Tapwave vendor AINCOMM 0x12fd Aincomm vendor MOBILITY 0x1342 Mobility vendor DICKSMITH 0x1371 Dick Smith Electronics vendor NETGEAR3 0x1385 Netgear vendor BALTECH 0x13ad Baltech vendor CISCOLINKSYS 0x13b1 Cisco-Linksys vendor SHARK 0x13d2 Shark vendor AZUREWAVE 0x13d3 AsureWave vendor INITIO 0x13fd Initio Corporation vendor EMTEC 0x13fe Emtec vendor NOVATEL 0x1410 Novatel Wireless vendor MERLIN 0x1416 Merlin vendor REDOCTANE 0x1430 RedOctane vendor WISTRONNEWEB 0x1435 Wistron NeWeb vendor RADIOSHACK 0x1453 Radio Shack vendor FIC 0x1457 FIC / OpenMoko vendor HUAWEI3COM 0x1472 Huawei-3Com vendor ABOCOM2 0x1482 AboCom Systems vendor SILICOM 0x1485 Silicom vendor RALINK 0x148f Ralink Technology vendor IMAGINATION 0x149a Imagination Technologies vendor ATP 0x14af ATP Electronics vendor CONCEPTRONIC2 0x14b2 Conceptronic vendor SUPERTOP 0x14cd Super Top vendor PLANEX3 0x14ea Planex Communications vendor SILICONPORTALS 0x1527 Silicon Portals vendor UBIQUAM 0x1529 UBIQUAM Co., Ltd. vendor JMICRON 0x152d JMicron vendor UBLOX 0x1546 U-blox vendor PNY 0x154b PNY vendor OWEN 0x1555 Owen vendor OQO 0x1557 OQO vendor UMEDIA 0x157e U-MEDIA Communications vendor FIBERLINE 0x1582 Fiberline vendor FREESCALE 0x15a2 Freescale Semiconductor, Inc. vendor AFATECH 0x15a4 Afatech Technologies, Inc. vendor SPARKLAN 0x15a9 SparkLAN vendor OLIMEX 0x15ba Olimex vendor SOUNDGRAPH 0x15c2 Soundgraph, Inc. vendor AMIT2 0x15c5 AMIT vendor TEXTECH 0x15ca Textech International Ltd. vendor SOHOWARE 0x15e8 SOHOware vendor UMAX 0x1606 UMAX Data Systems vendor INSIDEOUT 0x1608 Inside Out Networks vendor AMOI 0x1614 Amoi Electronics vendor GOODWAY 0x1631 Good Way Technology vendor ENTREGA 0x1645 Entrega vendor ACTIONTEC 0x1668 Actiontec Electronics vendor CLIPSAL 0x166a Clipsal vendor CISCOLINKSYS2 0x167b Cisco-Linksys vendor ATHEROS 0x168c Atheros Communications vendor GIGASET 0x1690 Gigaset vendor GLOBALSUN 0x16ab Global Sun Technology vendor ANYDATA 0x16d5 AnyDATA Corporation vendor JABLOTRON 0x16d6 Jablotron vendor CMOTECH 0x16d8 C-motech vendor WIENERPLEINBAUS 0x16dc WIENER Plein & Baus GmbH. vendor AXESSTEL 0x1726 Axesstel Co., Ltd. vendor LINKSYS4 0x1737 Linksys vendor SENAO 0x1740 Senao vendor ASUS2 0x1761 ASUS vendor SWEEX2 0x177f Sweex vendor METAGEEK 0x1781 MetaGeek vendor KAMSTRUP 0x17a8 Kamstrup A/S vendor LENOVO 0x17ef Lenovo vendor WAVESENSE 0x17f4 WaveSense vendor VAISALA 0x1843 Vaisala vendor AMIT 0x18c5 AMIT vendor GOOGLE 0x18d1 Google vendor QCOM 0x18e8 Qcom vendor ELV 0x18ef ELV vendor LINKSYS3 0x1915 Linksys vendor QUALCOMMINC 0x19d2 Qualcomm, Incorporated vendor BAYER 0x1a79 Bayer vendor WCH2 0x1a86 QinHeng Electronics vendor STELERA 0x1a8d Stelera Wireless vendor SEL 0x1adb Schweitzer Engineering Laboratories vendor CORSAIR 0x1b1c Corsair vendor MATRIXORBITAL 0x1b3d Matrix Orbital vendor OVISLINK 0x1b75 OvisLink vendor TML 0x1b91 The Mobility Lab vendor TCTMOBILE 0x1bbb TCT Mobile vendor ALTI2 0x1bc9 Alti-2 products vendor SUNPLUS 0x1bcf Sunplus Innovation Technology Inc. vendor WAGO 0x1be3 WAGO Kontakttechnik GmbH. vendor TELIT 0x1bc7 Telit vendor IONICS 0x1c0c Ionics PlugComputer vendor LONGCHEER 0x1c9e Longcheer Holdings, Ltd. vendor MPMAN 0x1cae MpMan vendor DRESDENELEKTRONIK 0x1cf1 dresden elektronik vendor NEOTEL 0x1d09 Neotel vendor DREAMLINK 0x1d34 Dream Link vendor PEGATRON 0x1d4d Pegatron vendor QISDA 0x1da5 Qisda vendor METAGEEK2 0x1dd5 MetaGeek vendor ALINK 0x1e0e Alink vendor AIRTIES 0x1eda AirTies vendor FESTO 0x1e29 Festo vendor LAKESHORE 0x1fb9 Lake Shore Cryotronics, Inc. vendor VERTEX 0x1fe7 Vertex Wireless Co., Ltd. vendor DLINK 0x2001 D-Link vendor PLANEX2 0x2019 Planex Communications vendor HAUPPAUGE2 0x2040 Hauppauge Computer Works vendor TLAYTECH 0x20b9 Tlay Tech vendor ENCORE 0x203d Encore vendor QIHARDWARE 0x20b7 QI-hardware vendor PARA 0x20b8 PARA Industrial vendor SIMTEC 0x20df Simtec Electronics vendor TRENDNET 0x20f4 TRENDnet vendor RTSYSTEMS 0x2100 RTSYSTEMS vendor VIALABS 0x2109 VIA Labs vendor ERICSSON 0x2282 Ericsson vendor MOTOROLA2 0x22b8 Motorola vendor WETELECOM 0x22de WeTelecom vendor WESTMOUNTAIN 0x2405 West Mountain Radio vendor TRIPPLITE 0x2478 Tripp-Lite vendor HIROSE 0x2631 Hirose Electric vendor NHJ 0x2770 NHJ vendor PLANEX 0x2c02 Planex Communications vendor VIDZMEDIA 0x3275 VidzMedia Pte Ltd vendor LINKINSTRUMENTS 0x3195 Link Instruments Inc. vendor AEI 0x3334 AEI vendor HANK 0x3353 Hank Connection vendor PQI 0x3538 PQI vendor DAISY 0x3579 Daisy Technology vendor NI 0x3923 National Instruments vendor MICRONET 0x3980 Micronet Communications vendor IODATA2 0x40bb I-O Data vendor IRIVER 0x4102 iRiver vendor DELL 0x413c Dell vendor WCH 0x4348 QinHeng Electronics vendor ACEECA 0x4766 Aceeca vendor FEIXUN 0x4855 FeiXun Communication vendor PAPOUCH 0x5050 Papouch products vendor AVERATEC 0x50c2 Averatec vendor SWEEX 0x5173 Sweex vendor PROLIFIC2 0x5372 Prolific Technologies vendor ONSPEC2 0x55aa OnSpec Electronic Inc. vendor ZINWELL 0x5a57 Zinwell vendor SITECOM 0x6189 Sitecom vendor ARKMICRO 0x6547 Arkmicro Technologies Inc. vendor 3COM2 0x6891 3Com vendor EDIMAX 0x7392 Edimax vendor INTEL 0x8086 Intel vendor INTEL2 0x8087 Intel vendor ALLWIN 0x8516 ALLWIN Tech vendor SITECOM2 0x9016 Sitecom vendor MOSCHIP 0x9710 MosChip Semiconductor vendor NETGEAR4 0x9846 Netgear vendor MARVELL 0x9e88 Marvell Technology Group Ltd. vendor 3COM3 0xa727 3Com vendor EVOLUTION 0xdeee Evolution Robotics products vendor DATAAPEX 0xdaae DataApex vendor HP2 0xf003 Hewlett Packard vendor LOGILINK 0xfc08 LogiLink vendor USRP 0xfffe GNU Radio USRP /* * List of known products. Grouped by vendor. */ /* 3Com products */ product 3COM HOMECONN 0x009d HomeConnect Camera product 3COM 3CREB96 0x00a0 Bluetooth USB Adapter product 3COM 3C19250 0x03e8 3C19250 Ethernet Adapter product 3COM 3CRSHEW696 0x0a01 3CRSHEW696 Wireless Adapter product 3COM 3C460 0x11f8 HomeConnect 3C460 product 3COM USR56K 0x3021 U.S.Robotics 56000 Voice FaxModem Pro product 3COM 3C460B 0x4601 HomeConnect 3C460B product 3COM2 3CRUSB10075 0xa727 3CRUSB10075 product 3COM3 AR5523_1 0x6893 AR5523 product 3COM3 AR5523_2 0x6895 AR5523 product 3COM3 AR5523_3 0x6897 AR5523 product 3COMUSR OFFICECONN 0x0082 3Com OfficeConnect Analog Modem product 3COMUSR USRISDN 0x008f 3Com U.S. Robotics Pro ISDN TA product 3COMUSR HOMECONN 0x009d 3Com HomeConnect Camera product 3COMUSR USR56K 0x3021 U.S. Robotics 56000 Voice FaxModem Pro /* AboCom products */ product ABOCOM XX1 0x110c XX1 product ABOCOM XX2 0x200c XX2 product ABOCOM RT2770 0x2770 RT2770 product ABOCOM RT2870 0x2870 RT2870 product ABOCOM RT3070 0x3070 RT3070 product ABOCOM RT3071 0x3071 RT3071 product ABOCOM RT3072 0x3072 RT3072 product ABOCOM2 RT2870_1 0x3c09 RT2870 product ABOCOM URE450 0x4000 URE450 Ethernet Adapter product ABOCOM UFE1000 0x4002 UFE1000 Fast Ethernet Adapter product ABOCOM DSB650TX_PNA 0x4003 1/10/100 Ethernet Adapter product ABOCOM XX4 0x4004 XX4 product ABOCOM XX5 0x4007 XX5 product ABOCOM XX6 0x400b XX6 product ABOCOM XX7 0x400c XX7 product ABOCOM RTL8151 0x401a RTL8151 product ABOCOM XX8 0x4102 XX8 product ABOCOM XX9 0x4104 XX9 product ABOCOM UF200 0x420a UF200 Ethernet product ABOCOM WL54 0x6001 WL54 product ABOCOM XX10 0xabc1 XX10 product ABOCOM BWU613 0xb000 BWU613 product ABOCOM HWU54DM 0xb21b HWU54DM product ABOCOM RT2573_2 0xb21c RT2573 product ABOCOM RT2573_3 0xb21d RT2573 product ABOCOM RT2573_4 0xb21e RT2573 product ABOCOM RTL8188CU_1 0x8188 RTL8188CU product ABOCOM RTL8188CU_2 0x8189 RTL8188CU product ABOCOM RTL8192CU 0x8178 RTL8192CU product ABOCOM WUG2700 0xb21f WUG2700 /* Acton Research Corp. */ product ACTON SPECTRAPRO 0x0100 FTDI compatible adapter /* Accton products */ product ACCTON USB320_EC 0x1046 USB320-EC Ethernet Adapter product ACCTON 2664W 0x3501 2664W product ACCTON 111 0x3503 T-Sinus 111 Wireless Adapter product ACCTON SMCWUSBG_NF 0x4505 SMCWUSB-G (no firmware) product ACCTON SMCWUSBG 0x4506 SMCWUSB-G product ACCTON SMCWUSBTG2_NF 0x4507 SMCWUSBT-G2 (no firmware) product ACCTON SMCWUSBTG2 0x4508 SMCWUSBT-G2 product ACCTON PRISM_GT 0x4521 PrismGT USB 2.0 WLAN product ACCTON SS1001 0x5046 SpeedStream Ethernet Adapter product ACCTON RT2870_2 0x6618 RT2870 product ACCTON RT3070 0x7511 RT3070 product ACCTON RT2770 0x7512 RT2770 product ACCTON RT2870_3 0x7522 RT2870 product ACCTON RT2870_5 0x8522 RT2870 product ACCTON RT3070_4 0xa512 RT3070 product ACCTON RT2870_4 0xa618 RT2870 product ACCTON RT3070_1 0xa701 RT3070 product ACCTON RT3070_2 0xa702 RT3070 product ACCTON RT2870_1 0xb522 RT2870 product ACCTON RT3070_3 0xc522 RT3070 product ACCTON RT3070_5 0xd522 RT3070 product ACCTON RTL8192SU 0xc512 RTL8192SU product ACCTON ZD1211B 0xe501 ZD1211B /* Aceeca products */ product ACEECA MEZ1000 0x0001 MEZ1000 RDA /* Acer Communications & Multimedia (oemd by Surecom) */ product ACERCM EP1427X2 0x0893 EP-1427X-2 Ethernet Adapter /* Acer Labs products */ product ACERLABS M5632 0x5632 USB 2.0 Data Link /* Acer Peripherals, Inc. products */ product ACERP ACERSCAN_C310U 0x12a6 Acerscan C310U product ACERP ACERSCAN_320U 0x2022 Acerscan 320U product ACERP ACERSCAN_640U 0x2040 Acerscan 640U product ACERP ACERSCAN_620U 0x2060 Acerscan 620U product ACERP ACERSCAN_4300U 0x20b0 Benq 3300U/4300U product ACERP ACERSCAN_640BT 0x20be Acerscan 640BT product ACERP ACERSCAN_1240U 0x20c0 Acerscan 1240U product ACERP S81 0x4027 BenQ S81 phone product ACERP H10 0x4068 AWL400 Wireless Adapter product ACERP ATAPI 0x6003 ATA/ATAPI Adapter product ACERP AWL300 0x9000 AWL300 Wireless Adapter product ACERP AWL400 0x9001 AWL400 Wireless Adapter /* Acer Warp products */ product ACERW WARPLINK 0x0204 Warplink /* Actions products */ product ACTIONS MP4 0x1101 Actions MP4 Player /* Actiontec, Inc. products */ product ACTIONTEC PRISM_25 0x0408 Prism2.5 Wireless Adapter product ACTIONTEC PRISM_25A 0x0421 Prism2.5 Wireless Adapter A product ACTIONTEC FREELAN 0x6106 ROPEX FreeLan 802.11b product ACTIONTEC UAT1 0x7605 UAT1 Wireless Ethernet Adapter /* ACTiSYS products */ product ACTISYS IR2000U 0x0011 ACT-IR2000U FIR /* ActiveWire, Inc. products */ product ACTIVEWIRE IOBOARD 0x0100 I/O Board product ACTIVEWIRE IOBOARD_FW1 0x0101 I/O Board, rev. 1 firmware /* Adaptec products */ product ADAPTEC AWN8020 0x0020 AWN-8020 WLAN /* Addtron products */ product ADDTRON AWU120 0xff31 AWU-120 /* ADLINK Texhnology products */ product ADLINK ND6530 0x6530 ND-6530 USB-Serial /* ADMtek products */ product ADMTEK PEGASUSII_4 0x07c2 AN986A Ethernet product ADMTEK PEGASUS 0x0986 AN986 Ethernet product ADMTEK PEGASUSII 0x8511 AN8511 Ethernet product ADMTEK PEGASUSII_2 0x8513 AN8513 Ethernet product ADMTEK PEGASUSII_3 0x8515 AN8515 Ethernet /* ADDON products */ /* PNY OEMs these */ product ADDON ATTACHE 0x1300 USB 2.0 Flash Drive product ADDON ATTACHE 0x1300 USB 2.0 Flash Drive product ADDON A256MB 0x1400 Attache 256MB USB 2.0 Flash Drive product ADDON DISKPRO512 0x1420 USB 2.0 Flash Drive (DANE-ELEC zMate 512MB USB flash drive) /* Addonics products */ product ADDONICS2 CABLE_205 0xa001 Cable 205 /* ADS products */ product ADS UBS10BT 0x0008 UBS-10BT Ethernet product ADS UBS10BTX 0x0009 UBS-10BT Ethernet /* AEI products */ product AEI FASTETHERNET 0x1701 Fast Ethernet /* Afatech Technologies, Inc. */ product AFATECH AFATECH1336 0x1336 Flash Card Reader /* Agate Technologies products */ product AGATE QDRIVE 0x0378 Q-Drive /* AGFA products */ product AGFA SNAPSCAN1212U 0x0001 SnapScan 1212U product AGFA SNAPSCAN1236U 0x0002 SnapScan 1236U product AGFA SNAPSCANTOUCH 0x0100 SnapScan Touch product AGFA SNAPSCAN1212U2 0x2061 SnapScan 1212U product AGFA SNAPSCANE40 0x208d SnapScan e40 product AGFA SNAPSCANE50 0x208f SnapScan e50 product AGFA SNAPSCANE20 0x2091 SnapScan e20 product AGFA SNAPSCANE25 0x2095 SnapScan e25 product AGFA SNAPSCANE26 0x2097 SnapScan e26 product AGFA SNAPSCANE52 0x20fd SnapScan e52 /* Ain Communication Technology products */ product AINCOMM AWU2000B 0x1001 AWU2000B Wireless Adapter /* AIPTEK products */ product AIPTEK POCKETCAM3M 0x2011 PocketCAM 3Mega product AIPTEK2 PENCAM_MEGA_1_3 0x504a PenCam Mega 1.3 product AIPTEK2 SUNPLUS_TECH 0x0c15 Sunplus Technology Inc. /* AirPlis products */ product AIRPLUS MCD650 0x3198 MCD650 modem /* AirPrime products */ product AIRPRIME PC5220 0x0112 CDMA Wireless PC Card product AIRPRIME USB308 0x68A3 USB308 HSPA+ USB Modem product AIRPRIME AC313U 0x68aa Sierra Wireless AirCard 313U /* AirTies products */ product AIRTIES RT3070 0x2310 RT3070 /* AKS products */ product AKS USBHASP 0x0001 USB-HASP 0.06 /* Alcatel products */ product ALCATEL OT535 0x02df One Touch 535/735 /* Alcor Micro, Inc. products */ product ALCOR2 KBD_HUB 0x2802 Kbd Hub product ALCOR DUMMY 0x0000 Dummy product product ALCOR SDCR_6335 0x6335 SD/MMC Card Reader product ALCOR SDCR_6362 0x6362 SD/MMC Card Reader product ALCOR SDCR_6366 0x6366 SD/MMC Card Reader product ALCOR TRANSCEND 0x6387 Transcend JetFlash Drive product ALCOR MA_KBD_HUB 0x9213 MacAlly Kbd Hub product ALCOR AU9814 0x9215 AU9814 Hub product ALCOR UMCR_9361 0x9361 USB Multimedia Card Reader product ALCOR SM_KBD 0x9410 MicroConnectors/StrongMan Keyboard product ALCOR NEC_KBD_HUB 0x9472 NEC Kbd Hub product ALCOR AU9720 0x9720 USB2 - RS-232 product ALCOR AU6390 0x6390 AU6390 USB-IDE converter /* Alink products */ product ALINK DWM652U5 0xce16 DWM-652 product ALINK 3G 0x9000 3G modem product ALINK 3GU 0x9200 3G modem /* Altec Lansing products */ product ALTEC ADA70 0x0070 ADA70 Speakers product ALTEC ASC495 0xff05 ASC495 Speakers /* Alti-2 products */ product ALTI2 N3 0x6001 FTDI compatible adapter /* Allied Telesyn International products */ product ALLIEDTELESYN ATUSB100 0xb100 AT-USB100 /* ALLWIN Tech products */ product ALLWIN RT2070 0x2070 RT2070 product ALLWIN RT2770 0x2770 RT2770 product ALLWIN RT2870 0x2870 RT2870 product ALLWIN RT3070 0x3070 RT3070 product ALLWIN RT3071 0x3071 RT3071 product ALLWIN RT3072 0x3072 RT3072 product ALLWIN RT3572 0x3572 RT3572 /* AlphaSmart, Inc. products */ product ALPHASMART DANA_KB 0xdbac AlphaSmart Dana Keyboard product ALPHASMART DANA_SYNC 0xdf00 AlphaSmart Dana HotSync /* Amoi products */ product AMOI H01 0x0800 H01 3G modem product AMOI H01A 0x7002 H01A 3G modem product AMOI H02 0x0802 H02 3G modem /* American Power Conversion products */ product APC UPS 0x0002 Uninterruptible Power Supply /* Ambit Microsystems products */ product AMBIT WLAN 0x0302 WLAN product AMBIT NTL_250 0x6098 NTL 250 cable modem /* Apacer products */ product APACER HT202 0xb113 USB 2.0 Flash Drive /* American Power Conversion products */ product APC UPS 0x0002 Uninterruptible Power Supply /* Amigo Technology products */ product AMIGO RT2870_1 0x9031 RT2870 product AMIGO RT2870_2 0x9041 RT2870 /* AMIT products */ product AMIT CGWLUSB2GO 0x0002 CG-WLUSB2GO product AMIT CGWLUSB2GNR 0x0008 CG-WLUSB2GNR product AMIT RT2870_1 0x0012 RT2870 /* AMIT(2) products */ product AMIT2 RT2870 0x0008 RT2870 /* Analog Devices products */ product ANALOGDEVICES GNICE 0xf000 FTDI compatible adapter product ANALOGDEVICES GNICEPLUS 0xf001 FTDI compatible adapter /* Anchor products */ product ANCHOR SERIAL 0x2008 Serial product ANCHOR EZUSB 0x2131 EZUSB product ANCHOR EZLINK 0x2720 EZLINK /* AnyData products */ product ANYDATA ADU_620UW 0x6202 CDMA 2000 EV-DO USB Modem product ANYDATA ADU_E100X 0x6501 CDMA 2000 1xRTT/EV-DO USB Modem product ANYDATA ADU_500A 0x6502 CDMA 2000 EV-DO USB Modem /* AOX, Inc. products */ product AOX USB101 0x0008 Ethernet /* American Power Conversion products */ product APC UPS 0x0002 Uninterruptible Power Supply /* Apple Computer products */ product APPLE DUMMY 0x0000 Dummy product product APPLE IMAC_KBD 0x0201 USB iMac Keyboard product APPLE KBD 0x0202 USB Keyboard M2452 product APPLE EXT_KBD 0x020c Apple Extended USB Keyboard /* MacbookAir, aka wellspring */ product APPLE WELLSPRING_ANSI 0x0223 Apple Internal Keyboard/Trackpad product APPLE WELLSPRING_ISO 0x0224 Apple Internal Keyboard/Trackpad product APPLE WELLSPRING_JIS 0x0225 Apple Internal Keyboard/Trackpad /* MacbookProPenryn, aka wellspring2 */ product APPLE WELLSPRING2_ANSI 0x0230 Apple Internal Keyboard/Trackpad product APPLE WELLSPRING2_ISO 0x0231 Apple Internal Keyboard/Trackpad product APPLE WELLSPRING2_JIS 0x0232 Apple Internal Keyboard/Trackpad /* Macbook5,1 (unibody), aka wellspring3 */ product APPLE WELLSPRING3_ANSI 0x0236 Apple Internal Keyboard/Trackpad product APPLE WELLSPRING3_ISO 0x0237 Apple Internal Keyboard/Trackpad product APPLE WELLSPRING3_JIS 0x0238 Apple Internal Keyboard/Trackpad /* MacbookAir3,2 (unibody), aka wellspring4 */ product APPLE WELLSPRING4_ANSI 0x023f Apple Internal Keyboard/Trackpad product APPLE WELLSPRING4_ISO 0x0240 Apple Internal Keyboard/Trackpad product APPLE WELLSPRING4_JIS 0x0241 Apple Internal Keyboard/Trackpad /* MacbookAir3,1 (unibody), aka wellspring4 */ product APPLE WELLSPRING4A_ANSI 0x0242 Apple Internal Keyboard/Trackpad product APPLE WELLSPRING4A_ISO 0x0243 Apple Internal Keyboard/Trackpad product APPLE WELLSPRING4A_JIS 0x0244 Apple Internal Keyboard/Trackpad /* Macbook8 (unibody, March 2011) */ product APPLE WELLSPRING5_ANSI 0x0245 Apple Internal Keyboard/Trackpad product APPLE WELLSPRING5_ISO 0x0246 Apple Internal Keyboard/Trackpad product APPLE WELLSPRING5_JIS 0x0247 Apple Internal Keyboard/Trackpad /* MacbookAir4,1 (unibody, July 2011) */ product APPLE WELLSPRING6A_ANSI 0x0249 Apple Internal Keyboard/Trackpad product APPLE WELLSPRING6A_ISO 0x024a Apple Internal Keyboard/Trackpad product APPLE WELLSPRING6A_JIS 0x024b Apple Internal Keyboard/Trackpad /* MacbookAir4,2 (unibody, July 2011) */ product APPLE WELLSPRING6_ANSI 0x024c Apple Internal Keyboard/Trackpad product APPLE WELLSPRING6_ISO 0x024d Apple Internal Keyboard/Trackpad product APPLE WELLSPRING6_JIS 0x024e Apple Internal Keyboard/Trackpad /* Macbook8,2 (unibody) */ product APPLE WELLSPRING5A_ANSI 0x0252 Apple Internal Keyboard/Trackpad product APPLE WELLSPRING5A_ISO 0x0253 Apple Internal Keyboard/Trackpad product APPLE WELLSPRING5A_JIS 0x0254 Apple Internal Keyboard/Trackpad /* MacbookPro10,1 (unibody, June 2012) */ product APPLE WELLSPRING7_ANSI 0x0262 Apple Internal Keyboard/Trackpad product APPLE WELLSPRING7_ISO 0x0263 Apple Internal Keyboard/Trackpad product APPLE WELLSPRING7_JIS 0x0264 Apple Internal Keyboard/Trackpad /* MacbookPro10,2 (unibody, October 2012) */ product APPLE WELLSPRING7A_ANSI 0x0259 Apple Internal Keyboard/Trackpad product APPLE WELLSPRING7A_ISO 0x025a Apple Internal Keyboard/Trackpad product APPLE WELLSPRING7A_JIS 0x025b Apple Internal Keyboard/Trackpad /* MacbookAir6,2 (unibody, June 2013) */ product APPLE WELLSPRING8_ANSI 0x0290 Apple Internal Keyboard/Trackpad product APPLE WELLSPRING8_ISO 0x0291 Apple Internal Keyboard/Trackpad product APPLE WELLSPRING8_JIS 0x0292 Apple Internal Keyboard/Trackpad product APPLE MOUSE 0x0301 Mouse M4848 product APPLE OPTMOUSE 0x0302 Optical mouse product APPLE MIGHTYMOUSE 0x0304 Mighty Mouse product APPLE KBD_HUB 0x1001 Hub in Apple USB Keyboard product APPLE EXT_KBD_HUB 0x1003 Hub in Apple Extended USB Keyboard product APPLE SPEAKERS 0x1101 Speakers product APPLE IPOD 0x1201 iPod product APPLE IPOD2G 0x1202 iPod 2G product APPLE IPOD3G 0x1203 iPod 3G product APPLE IPOD_04 0x1204 iPod '04' product APPLE IPODMINI 0x1205 iPod Mini product APPLE IPOD_06 0x1206 iPod '06' product APPLE IPOD_07 0x1207 iPod '07' product APPLE IPOD_08 0x1208 iPod '08' product APPLE IPODVIDEO 0x1209 iPod Video product APPLE IPODNANO 0x120a iPod Nano product APPLE IPHONE 0x1290 iPhone product APPLE IPOD_TOUCH 0x1291 iPod Touch product APPLE IPHONE_3G 0x1292 iPhone 3G product APPLE IPHONE_3GS 0x1294 iPhone 3GS product APPLE IPHONE_4 0x1297 iPhone 4 product APPLE IPHONE_4S 0x12a0 iPhone 4S product APPLE IPHONE_5 0x12a8 iPhone 5 product APPLE IPAD 0x129a iPad product APPLE ETHERNET 0x1402 Ethernet A1277 /* Arkmicro Technologies */ product ARKMICRO ARK3116 0x0232 ARK3116 Serial /* Asahi Optical products */ product ASAHIOPTICAL OPTIO230 0x0004 Digital camera product ASAHIOPTICAL OPTIO330 0x0006 Digital camera /* Asante products */ product ASANTE EA 0x1427 Ethernet /* ASIX Electronics products */ product ASIX AX88172 0x1720 10/100 Ethernet product ASIX AX88178 0x1780 AX88178 product ASIX AX88178A 0x178a AX88178A USB 2.0 10/100/1000 Ethernet product ASIX AX88179 0x1790 AX88179 USB 3.0 10/100/1000 Ethernet product ASIX AX88772 0x7720 AX88772 product ASIX AX88772A 0x772a AX88772A USB 2.0 10/100 Ethernet product ASIX AX88772B 0x772b AX88772B USB 2.0 10/100 Ethernet product ASIX AX88772B_1 0x7e2b AX88772B USB 2.0 10/100 Ethernet /* ASUS products */ product ASUS2 USBN11 0x0b05 USB-N11 product ASUS RT2570 0x1706 RT2500USB Wireless Adapter product ASUS WL167G 0x1707 WL-167g Wireless Adapter product ASUS WL159G 0x170c WL-159g product ASUS A9T_WIFI 0x171b A9T wireless product ASUS P5B_WIFI 0x171d P5B wireless product ASUS RT2573_1 0x1723 RT2573 product ASUS RT2573_2 0x1724 RT2573 product ASUS LCM 0x1726 LCM display product ASUS RT2870_1 0x1731 RT2870 product ASUS RT2870_2 0x1732 RT2870 product ASUS RT2870_3 0x1742 RT2870 product ASUS RT2870_4 0x1760 RT2870 product ASUS RT2870_5 0x1761 RT2870 product ASUS USBN13 0x1784 USB-N13 product ASUS USBN10 0x1786 USB-N10 product ASUS RT3070_1 0x1790 RT3070 product ASUS RTL8192SU 0x1791 RTL8192SU product ASUS USB_N53 0x179d ASUS Black Diamond Dual Band USB-N53 product ASUS RTL8192CU 0x17ab RTL8192CU product ASUS USBN66 0x17ad USB-N66 product ASUS USBN10NANO 0x17ba USB-N10 Nano product ASUS USBAC51 0x17d1 USB-AC51 product ASUS A730W 0x4202 ASUS MyPal A730W product ASUS P535 0x420f ASUS P535 PDA product ASUS GMSC 0x422f ASUS Generic Mass Storage /* ATen products */ product ATEN UC1284 0x2001 Parallel printer product ATEN UC10T 0x2002 10Mbps Ethernet product ATEN UC110T 0x2007 UC-110T Ethernet product ATEN UC232A 0x2008 Serial product ATEN UC210T 0x2009 UC-210T Ethernet product ATEN DSB650C 0x4000 DSB-650C /* ATP Electronics products */ product ATP EUSB 0xaf01 ATP IG eUSB SSD /* Atheros Communications products */ product ATHEROS AR5523 0x0001 AR5523 product ATHEROS AR5523_NF 0x0002 AR5523 (no firmware) product ATHEROS2 AR5523_1 0x0001 AR5523 product ATHEROS2 AR5523_1_NF 0x0002 AR5523 (no firmware) product ATHEROS2 AR5523_2 0x0003 AR5523 product ATHEROS2 AR5523_2_NF 0x0004 AR5523 (no firmware) product ATHEROS2 AR5523_3 0x0005 AR5523 product ATHEROS2 AR5523_3_NF 0x0006 AR5523 (no firmware) /* Atmel Comp. products */ product ATMEL STK541 0x2109 Zigbee Controller product ATMEL UHB124 0x3301 UHB124 hub product ATMEL DWL120 0x7603 DWL-120 Wireless Adapter product ATMEL BW002 0x7605 BW002 Wireless Adapter product ATMEL WL1130USB 0x7613 WL-1130 USB product ATMEL AT76C505A 0x7614 AT76c505a Wireless Adapter /* AuthenTec products */ product AUTHENTEC AES1610 0x1600 AES1610 Fingerprint Sensor /* Avision products */ product AVISION 1200U 0x0268 1200U scanner /* Axesstel products */ product AXESSTEL DATAMODEM 0x1000 Data Modem /* AsureWave products */ product AZUREWAVE RT2870_1 0x3247 RT2870 product AZUREWAVE RT2870_2 0x3262 RT2870 product AZUREWAVE RT3070_1 0x3273 RT3070 product AZUREWAVE RT3070_2 0x3284 RT3070 product AZUREWAVE RT3070_3 0x3305 RT3070 product AZUREWAVE RTL8188CU 0x3357 RTL8188CU product AZUREWAVE RTL8188CE_1 0x3358 RTL8188CE product AZUREWAVE RTL8188CE_2 0x3359 RTL8188CE product AZUREWAVE RTL8192SU_1 0x3306 RTL8192SU product AZUREWAVE RTL8192SU_2 0x3309 RTL8192SU product AZUREWAVE RTL8192SU_3 0x3310 RTL8192SU product AZUREWAVE RTL8192SU_4 0x3311 RTL8192SU product AZUREWAVE RTL8192SU_5 0x3325 RTL8192SU /* Baltech products */ product BALTECH CARDREADER 0x9999 Card reader /* Bayer products */ product BAYER CONTOUR_CABLE 0x6001 FTDI compatible adapter /* B&B Electronics products */ product BBELECTRONICS USOTL4 0xAC01 RS-422/485 product BBELECTRONICS 232USB9M 0xac27 FTDI compatible adapter product BBELECTRONICS 485USB9F_2W 0xac25 FTDI compatible adapter product BBELECTRONICS 485USB9F_4W 0xac26 FTDI compatible adapter product BBELECTRONICS 485USBTB_2W 0xac33 FTDI compatible adapter product BBELECTRONICS 485USBTB_4W 0xac34 FTDI compatible adapter product BBELECTRONICS TTL3USB9M 0xac50 FTDI compatible adapter product BBELECTRONICS TTL5USB9M 0xac49 FTDI compatible adapter product BBELECTRONICS USO9ML2 0xac03 FTDI compatible adapter product BBELECTRONICS USO9ML2DR 0xac17 FTDI compatible adapter product BBELECTRONICS USO9ML2DR_2 0xac16 FTDI compatible adapter product BBELECTRONICS USOPTL4 0xac11 FTDI compatible adapter product BBELECTRONICS USOPTL4DR 0xac19 FTDI compatible adapter product BBELECTRONICS USOPTL4DR2 0xac18 FTDI compatible adapter product BBELECTRONICS USPTL4 0xac12 FTDI compatible adapter product BBELECTRONICS USTL4 0xac02 FTDI compatible adapter product BBELECTRONICS ZZ_PROG1_USB 0xba02 FTDI compatible adapter /* Belkin products */ /*product BELKIN F5U111 0x???? F5U111 Ethernet*/ product BELKIN F5D6050 0x0050 F5D6050 802.11b Wireless Adapter product BELKIN FBT001V 0x0081 FBT001v2 Bluetooth product BELKIN FBT003V 0x0084 FBT003v2 Bluetooth product BELKIN F5U103 0x0103 F5U103 Serial product BELKIN F5U109 0x0109 F5U109 Serial product BELKIN USB2SCSI 0x0115 USB to SCSI product BELKIN F8T012 0x0121 F8T012xx1 Bluetooth USB Adapter product BELKIN USB2LAN 0x0121 USB to LAN product BELKIN F5U208 0x0208 F5U208 VideoBus II product BELKIN F5U237 0x0237 F5U237 USB 2.0 7-Port Hub product BELKIN F5U257 0x0257 F5U257 Serial product BELKIN F5U409 0x0409 F5U409 Serial product BELKIN F6C550AVR 0x0551 F6C550-AVR UPS product BELKIN F5U120 0x1203 F5U120-PC Hub product BELKIN RTL8188CU 0x1102 RTL8188CU Wireless Adapter product BELKIN F9L1103 0x1103 F9L1103 Wireless Adapter product BELKIN RTL8192CU 0x2102 RTL8192CU Wireless Adapter product BELKIN F7D2102 0x2103 F7D2102 Wireless Adapter product BELKIN ZD1211B 0x4050 ZD1211B product BELKIN F5D5055 0x5055 F5D5055 product BELKIN F5D7050 0x7050 F5D7050 Wireless Adapter product BELKIN F5D7051 0x7051 F5D7051 54g USB Network Adapter product BELKIN F5D7050A 0x705a F5D7050A Wireless Adapter /* Also sold as 'Ativa 802.11g wireless card' */ product BELKIN F5D7050_V4000 0x705c F5D7050 v4000 Wireless Adapter product BELKIN F5D7050E 0x705e F5D7050E Wireless Adapter product BELKIN RT2870_1 0x8053 RT2870 product BELKIN RT2870_2 0x805c RT2870 product BELKIN F5D8053V3 0x815c F5D8053 v3 product BELKIN RTL8192SU_1 0x815f RTL8192SU product BELKIN RTL8192SU_2 0x845a RTL8192SU product BELKIN RTL8192SU_3 0x945a RTL8192SU product BELKIN F5D8055 0x825a F5D8055 product BELKIN F5D8055V2 0x825b F5D8055 v2 product BELKIN F5D9050V3 0x905b F5D9050 ver 3 Wireless Adapter product BELKIN2 F5U002 0x0002 F5U002 Parallel printer product BELKIN F6D4050V1 0x935a F6D4050 v1 product BELKIN F6D4050V2 0x935b F6D4050 v2 /* Billionton products */ product BILLIONTON USB100 0x0986 USB100N 10/100 FastEthernet product BILLIONTON USBLP100 0x0987 USB100LP product BILLIONTON USBEL100 0x0988 USB100EL product BILLIONTON USBE100 0x8511 USBE100 product BILLIONTON USB2AR 0x90ff USB2AR Ethernet /* Broadcom products */ product BROADCOM BCM2033 0x2033 BCM2033 Bluetooth USB dongle /* Brother Industries products */ product BROTHER HL1050 0x0002 HL-1050 laser printer product BROTHER MFC8600_9650 0x0100 MFC8600/9650 multifunction device /* Behavior Technology Computer products */ product BTC BTC6100 0x5550 6100C Keyboard product BTC BTC7932 0x6782 Keyboard with mouse port /* Canon, Inc. products */ product CANON N656U 0x2206 CanoScan N656U product CANON N1220U 0x2207 CanoScan N1220U product CANON D660U 0x2208 CanoScan D660U product CANON N676U 0x220d CanoScan N676U product CANON N1240U 0x220e CanoScan N1240U product CANON LIDE25 0x2220 CanoScan LIDE 25 product CANON S10 0x3041 PowerShot S10 product CANON S100 0x3045 PowerShot S100 product CANON S200 0x3065 PowerShot S200 product CANON REBELXT 0x30ef Digital Rebel XT /* CATC products */ product CATC NETMATE 0x000a Netmate Ethernet product CATC NETMATE2 0x000c Netmate2 Ethernet product CATC CHIEF 0x000d USB Chief Bus & Protocol Analyzer product CATC ANDROMEDA 0x1237 Andromeda hub /* CASIO products */ product CASIO QV_DIGICAM 0x1001 QV DigiCam product CASIO EXS880 0x1105 Exilim EX-S880 product CASIO BE300 0x2002 BE-300 PDA product CASIO NAMELAND 0x4001 CASIO Nameland EZ-USB /* CCYU products */ product CCYU ED1064 0x2136 EasyDisk ED1064 /* Century products */ product CENTURY EX35QUAT 0x011e Century USB Disk Enclosure product CENTURY EX35SW4_SB4 0x011f Century USB Disk Enclosure /* Cherry products */ product CHERRY MY3000KBD 0x0001 My3000 keyboard product CHERRY MY3000HUB 0x0003 My3000 hub product CHERRY CYBOARD 0x0004 CyBoard Keyboard /* Chic Technology products */ product CHIC MOUSE1 0x0001 mouse product CHIC CYPRESS 0x0003 Cypress USB Mouse /* Chicony products */ product CHICONY KB8933 0x0001 KB-8933 keyboard product CHICONY KU0325 0x0116 KU-0325 keyboard product CHICONY CNF7129 0xb071 Notebook Web Camera product CHICONY HDUVCCAM 0xb40a HD UVC WebCam product CHICONY RTL8188CUS_1 0xaff7 RTL8188CUS product CHICONY RTL8188CUS_2 0xaff8 RTL8188CUS product CHICONY RTL8188CUS_3 0xaff9 RTL8188CUS product CHICONY RTL8188CUS_4 0xaffa RTL8188CUS product CHICONY RTL8188CUS_5 0xaffa RTL8188CUS product CHICONY2 TWINKLECAM 0x600d TwinkleCam USB camera /* CH Products */ product CHPRODUCTS PROTHROTTLE 0x00f1 Pro Throttle product CHPRODUCTS PROPEDALS 0x00f2 Pro Pedals product CHPRODUCTS FIGHTERSTICK 0x00f3 Fighterstick product CHPRODUCTS FLIGHTYOKE 0x00ff Flight Sim Yoke /* Cisco-Linksys products */ product CISCOLINKSYS WUSB54AG 0x000c WUSB54AG Wireless Adapter product CISCOLINKSYS WUSB54G 0x000d WUSB54G Wireless Adapter product CISCOLINKSYS WUSB54GP 0x0011 WUSB54GP Wireless Adapter product CISCOLINKSYS USB200MV2 0x0018 USB200M v2 product CISCOLINKSYS HU200TS 0x001a HU200TS Wireless Adapter product CISCOLINKSYS WUSB54GC 0x0020 WUSB54GC product CISCOLINKSYS WUSB54GR 0x0023 WUSB54GR product CISCOLINKSYS WUSBF54G 0x0024 WUSBF54G product CISCOLINKSYS AE1000 0x002f AE1000 product CISCOLINKSYS2 RT3070 0x4001 RT3070 product CISCOLINKSYS3 RT3070 0x0101 RT3070 /* Clipsal products */ product CLIPSAL 560884 0x0101 560884 C-Bus Audio Matrix Switch product CLIPSAL 5500PACA 0x0201 5500PACA C-Bus Pascal Automation Controller product CLIPSAL 5800PC 0x0301 5800PC C-Bus Wireless Interface product CLIPSAL 5500PCU 0x0303 5500PCU C-Bus Interface product CLIPSAL 5000CT2 0x0304 5000CT2 C-Bus Touch Screen product CLIPSAL C5000CT2 0x0305 C5000CT2 C-Bus Touch Screen product CLIPSAL L51xx 0x0401 L51xx C-Bus Dimmer /* CMOTECH products */ product CMOTECH CNU510 0x5141 CDMA Technologies USB modem product CMOTECH CNU550 0x5543 CDMA 2000 1xRTT/1xEVDO USB modem product CMOTECH CGU628 0x6006 CGU-628 product CMOTECH CDMA_MODEM1 0x6280 CDMA Technologies USB modem product CMOTECH DISK 0xf000 disk mode /* Compaq products */ product COMPAQ IPAQPOCKETPC 0x0003 iPAQ PocketPC product COMPAQ PJB100 0x504a Personal Jukebox PJB100 product COMPAQ IPAQLINUX 0x505a iPAQ Linux /* Composite Corp products looks the same as "TANGTOP" */ product COMPOSITE USBPS2 0x0001 USB to PS2 Adaptor /* Conceptronic products */ product CONCEPTRONIC PRISM_GT 0x3762 PrismGT USB 2.0 WLAN product CONCEPTRONIC C11U 0x7100 C11U product CONCEPTRONIC WL210 0x7110 WL-210 product CONCEPTRONIC AR5523_1 0x7801 AR5523 product CONCEPTRONIC AR5523_1_NF 0x7802 AR5523 (no firmware) product CONCEPTRONIC AR5523_2 0x7811 AR5523 product CONCEPTRONIC AR5523_2_NF 0x7812 AR5523 (no firmware) product CONCEPTRONIC2 RTL8192SU_1 0x3300 RTL8192SU product CONCEPTRONIC2 RTL8192SU_2 0x3301 RTL8192SU product CONCEPTRONIC2 RTL8192SU_3 0x3302 RTL8192SU product CONCEPTRONIC2 C54RU 0x3c02 C54RU WLAN product CONCEPTRONIC2 C54RU2 0x3c22 C54RU product CONCEPTRONIC2 RT3070_1 0x3c08 RT3070 product CONCEPTRONIC2 RT3070_2 0x3c11 RT3070 product CONCEPTRONIC2 VIGORN61 0x3c25 VIGORN61 product CONCEPTRONIC2 RT2870_1 0x3c06 RT2870 product CONCEPTRONIC2 RT2870_2 0x3c07 RT2870 product CONCEPTRONIC2 RT2870_7 0x3c09 RT2870 product CONCEPTRONIC2 RT2870_8 0x3c12 RT2870 product CONCEPTRONIC2 RT2870_3 0x3c23 RT2870 product CONCEPTRONIC2 RT2870_4 0x3c25 RT2870 product CONCEPTRONIC2 RT2870_5 0x3c27 RT2870 product CONCEPTRONIC2 RT2870_6 0x3c28 RT2870 /* Connectix products */ product CONNECTIX QUICKCAM 0x0001 QuickCam /* Conect products */ product CONTEC COM1USBH 0x8311 FTDI compatible adapter /* Corega products */ product COREGA ETHER_USB_T 0x0001 Ether USB-T product COREGA FETHER_USB_TX 0x0004 FEther USB-TX product COREGA WLAN_USB_USB_11 0x000c WirelessLAN USB-11 product COREGA FETHER_USB_TXS 0x000d FEther USB-TXS product COREGA WLANUSB 0x0012 Wireless LAN Stick-11 product COREGA FETHER_USB2_TX 0x0017 FEther USB2-TX product COREGA WLUSB_11_KEY 0x001a ULUSB-11 Key product COREGA CGUSBRS232R 0x002a CG-USBRS232R product COREGA CGWLUSB2GL 0x002d CG-WLUSB2GL product COREGA CGWLUSB2GPX 0x002e CG-WLUSB2GPX product COREGA RT2870_1 0x002f RT2870 product COREGA RT2870_2 0x003c RT2870 product COREGA RT2870_3 0x003f RT2870 product COREGA RT3070 0x0041 RT3070 product COREGA CGWLUSB300GNM 0x0042 CG-WLUSB300GNM product COREGA RTL8192SU 0x0047 RTL8192SU product COREGA RTL8192CU 0x0056 RTL8192CU product COREGA WLUSB_11_STICK 0x7613 WLAN USB Stick 11 product COREGA FETHER_USB_TXC 0x9601 FEther USB-TXC /* Corsair products */ product CORSAIR K60 0x0a60 Corsair Vengeance K60 keyboard product CORSAIR K70 0x1b09 Corsair Vengeance K70 keyboard /* Creative products */ product CREATIVE NOMAD_II 0x1002 Nomad II MP3 player product CREATIVE NOMAD_IIMG 0x4004 Nomad II MG product CREATIVE NOMAD 0x4106 Nomad product CREATIVE2 VOIP_BLASTER 0x0258 Voip Blaster product CREATIVE3 OPTICAL_MOUSE 0x0001 Notebook Optical Mouse /* Cambridge Silicon Radio Ltd. products */ product CSR BT_DONGLE 0x0001 Bluetooth USB dongle product CSR CSRDFU 0xffff USB Bluetooth Device in DFU State /* Chipsbank Microelectronics Co., Ltd */ product CHIPSBANK USBMEMSTICK 0x6025 CBM2080 Flash drive controller product CHIPSBANK USBMEMSTICK1 0x6026 CBM1180 Flash drive controller /* CTX products */ product CTX EX1300 0x9999 Ex1300 hub /* Curitel products */ product CURITEL HX550C 0x1101 CDMA 2000 1xRTT USB modem (HX-550C) product CURITEL HX57XB 0x2101 CDMA 2000 1xRTT USB modem (HX-570/575B/PR-600) product CURITEL PC5740 0x3701 Broadband Wireless modem product CURITEL UM150 0x3711 EVDO modem product CURITEL UM175 0x3714 EVDO modem /* CyberPower products */ product CYBERPOWER 1500CAVRLCD 0x0501 1500CAVRLCD /* CyberTAN Technology products */ product CYBERTAN TG54USB 0x1666 TG54USB product CYBERTAN RT2870 0x1828 RT2870 /* Cypress Semiconductor products */ product CYPRESS MOUSE 0x0001 mouse product CYPRESS THERMO 0x0002 thermometer product CYPRESS WISPY1A 0x0bad MetaGeek Wi-Spy product CYPRESS KBDHUB 0x0101 Keyboard/Hub product CYPRESS FMRADIO 0x1002 FM Radio product CYPRESS IKARILASER 0x121f Ikari Laser SteelSeries ApS product CYPRESS USBRS232 0x5500 USB-RS232 Interface product CYPRESS SLIM_HUB 0x6560 Slim Hub product CYPRESS XX6830XX 0x6830 PATA Storage Device product CYPRESS SILVERSHIELD 0xfd13 Gembird Silver Shield PM /* Daisy Technology products */ product DAISY DMC 0x6901 USB MultiMedia Reader /* Dallas Semiconductor products */ product DALLAS J6502 0x4201 J-6502 speakers /* DataApex products */ product DATAAPEX MULTICOM 0xead6 MultiCom /* Dell products */ product DELL PORT 0x0058 Port Replicator product DELL AIO926 0x5115 Photo AIO Printer 926 product DELL BC02 0x8000 BC02 Bluetooth USB Adapter product DELL PRISM_GT_1 0x8102 PrismGT USB 2.0 WLAN product DELL TM350 0x8103 TrueMobile 350 Bluetooth USB Adapter product DELL PRISM_GT_2 0x8104 PrismGT USB 2.0 WLAN product DELL U5700 0x8114 Dell 5700 3G product DELL U5500 0x8115 Dell 5500 3G product DELL U5505 0x8116 Dell 5505 3G product DELL U5700_2 0x8117 Dell 5700 3G product DELL U5510 0x8118 Dell 5510 3G product DELL U5700_3 0x8128 Dell 5700 3G product DELL U5700_4 0x8129 Dell 5700 3G product DELL U5720 0x8133 Dell 5720 3G product DELL U5720_2 0x8134 Dell 5720 3G product DELL U740 0x8135 Dell U740 CDMA product DELL U5520 0x8136 Dell 5520 3G product DELL U5520_2 0x8137 Dell 5520 3G product DELL U5520_3 0x8138 Dell 5520 3G product DELL U5730 0x8180 Dell 5730 3G product DELL U5730_2 0x8181 Dell 5730 3G product DELL U5730_3 0x8182 Dell 5730 3G product DELL DW700 0x9500 Dell DW700 GPS /* Delorme Paublishing products */ product DELORME EARTHMATE 0x0100 Earthmate GPS /* Desknote products */ product DESKNOTE UCR_61S2B 0x0c55 UCR-61S2B /* Diamond products */ product DIAMOND RIO500USB 0x0001 Rio 500 USB /* Dick Smith Electronics (really C-Net) products */ product DICKSMITH RT2573 0x9022 RT2573 product DICKSMITH CWD854F 0x9032 C-Net CWD-854 rev F /* Digi International products */ product DIGI ACCELEPORT2 0x0002 AccelePort USB 2 product DIGI ACCELEPORT4 0x0004 AccelePort USB 4 product DIGI ACCELEPORT8 0x0008 AccelePort USB 8 /* Digianswer A/S products */ product DIGIANSWER ZIGBEE802154 0x000a ZigBee/802.15.4 MAC /* D-Link products */ /*product DLINK DSBS25 0x0100 DSB-S25 serial*/ product DLINK DUBE100 0x1a00 10/100 Ethernet product DLINK DUBE100C1 0x1a02 DUB-E100 rev C1 product DLINK DSB650TX4 0x200c 10/100 Ethernet product DLINK DWL120E 0x3200 DWL-120 rev E product DLINK DWA125D1 0x330f DWA-125 rev D1 product DLINK DWA123D1 0x3310 DWA-123 rev D1 product DLINK DWL122 0x3700 DWL-122 product DLINK DWLG120 0x3701 DWL-G120 product DLINK DWL120F 0x3702 DWL-120 rev F product DLINK DWLAG132 0x3a00 DWL-AG132 product DLINK DWLAG132_NF 0x3a01 DWL-AG132 (no firmware) product DLINK DWLG132 0x3a02 DWL-G132 product DLINK DWLG132_NF 0x3a03 DWL-G132 (no firmware) product DLINK DWLAG122 0x3a04 DWL-AG122 product DLINK DWLAG122_NF 0x3a05 DWL-AG122 (no firmware) product DLINK DWLG122 0x3c00 DWL-G122 b1 Wireless Adapter product DLINK DUBE100B1 0x3c05 DUB-E100 rev B1 product DLINK RT2870 0x3c09 RT2870 product DLINK RT3072 0x3c0a RT3072 product DLINK DWA140B3 0x3c15 DWA-140 rev B3 product DLINK DWA160B2 0x3c1a DWA-160 rev B2 product DLINK DWA127 0x3c1b DWA-127 Wireless Adapter product DLINK DWA162 0x3c1f DWA-162 Wireless Adapter product DLINK DWA140D1 0x3c20 DWA-140 rev D1 product DLINK DSB650C 0x4000 10Mbps Ethernet product DLINK DSB650TX1 0x4001 10/100 Ethernet product DLINK DSB650TX 0x4002 10/100 Ethernet product DLINK DSB650TX_PNA 0x4003 1/10/100 Ethernet product DLINK DSB650TX3 0x400b 10/100 Ethernet product DLINK DSB650TX2 0x4102 10/100 Ethernet product DLINK DUB1312 0x4a00 10/100/1000 Ethernet product DLINK DSB650 0xabc1 10/100 Ethernet product DLINK DUBH7 0xf103 DUB-H7 USB 2.0 7-Port Hub product DLINK DWR510_CD 0xa805 DWR-510 CD-ROM Mode product DLINK DWR510 0x7e12 DWR-510 product DLINK RTL8188CU 0x3308 RTL8188CU product DLINK RTL8192CU_1 0x3307 RTL8192CU product DLINK RTL8192CU_2 0x3309 RTL8192CU product DLINK RTL8192CU_3 0x330a RTL8192CU product DLINK DWA131B 0x330d DWA-131 rev B product DLINK2 RTL8192SU_1 0x3300 RTL8192SU product DLINK2 RTL8192SU_2 0x3302 RTL8192SU product DLINK2 DWA131A1 0x3303 DWA-131 A1 product DLINK2 DWA120 0x3a0c DWA-120 product DLINK2 DWA120_NF 0x3a0d DWA-120 (no firmware) product DLINK2 DWLG122C1 0x3c03 DWL-G122 c1 product DLINK2 WUA1340 0x3c04 WUA-1340 product DLINK2 DWA111 0x3c06 DWA-111 product DLINK2 RT2870_1 0x3c09 RT2870 product DLINK2 DWA110 0x3c07 DWA-110 product DLINK2 RT3072 0x3c0a RT3072 product DLINK2 RT3072_1 0x3c0b RT3072 product DLINK2 RT3070_1 0x3c0d RT3070 product DLINK2 RT3070_2 0x3c0e RT3070 product DLINK2 RT3070_3 0x3c0f RT3070 product DLINK2 RT2870_2 0x3c11 RT2870 product DLINK2 DWA130 0x3c13 DWA-130 product DLINK2 RT3070_4 0x3c15 RT3070 product DLINK2 RT3070_5 0x3c16 RT3070 product DLINK3 DWM652 0x3e04 DWM-652 /* DMI products */ product DMI CFSM_RW 0xa109 CF/SM Reader/Writer product DMI DISK 0x2bcf Generic Disk /* DrayTek products */ product DRAYTEK VIGOR550 0x0550 Vigor550 /* Dream Link products */ product DREAMLINK DL100B 0x0004 USB Webmail Notifier /* dresden elektronik products */ product DRESDENELEKTRONIK SENSORTERMINALBOARD 0x0001 SensorTerminalBoard product DRESDENELEKTRONIK WIRELESSHANDHELDTERMINAL 0x0004 Wireless Handheld Terminal product DRESDENELEKTRONIK DE_RFNODE 0x001c deRFnode product DRESDENELEKTRONIK LEVELSHIFTERSTICKLOWCOST 0x0022 Levelshifter Stick Low Cost /* Dynastream Innovations */ product DYNASTREAM ANTDEVBOARD 0x1003 ANT dev board product DYNASTREAM ANT2USB 0x1004 ANT2USB product DYNASTREAM ANTDEVBOARD2 0x1006 ANT dev board /* Edimax products */ product EDIMAX EW7318USG 0x7318 USB Wireless dongle product EDIMAX RTL8192SU_1 0x7611 RTL8192SU product EDIMAX RTL8192SU_2 0x7612 RTL8192SU product EDIMAX EW7622UMN 0x7622 EW-7622UMn product EDIMAX RT2870_1 0x7711 RT2870 product EDIMAX EW7717 0x7717 EW-7717 product EDIMAX EW7718 0x7718 EW-7718 product EDIMAX EW7733UND 0x7733 EW-7733UnD product EDIMAX EW7811UN 0x7811 EW-7811Un product EDIMAX RTL8192CU 0x7822 RTL8192CU /* eGalax Products */ product EGALAX TPANEL 0x0001 Touch Panel product EGALAX TPANEL2 0x0002 Touch Panel product EGALAX2 TPANEL 0x0001 Touch Panel /* EGO Products */ product EGO DUMMY 0x0000 Dummy Product product EGO M4U 0x1020 ESI M4U /* Eicon Networks */ product EICON DIVA852 0x4905 Diva 852 ISDN TA /* EIZO products */ product EIZO HUB 0x0000 hub product EIZO MONITOR 0x0001 monitor /* ELCON Systemtechnik products */ product ELCON PLAN 0x0002 Goldpfeil P-LAN /* Elecom products */ product ELECOM MOUSE29UO 0x0002 mouse 29UO product ELECOM LDUSBTX0 0x200c LD-USB/TX product ELECOM LDUSBTX1 0x4002 LD-USB/TX product ELECOM LDUSBLTX 0x4005 LD-USBL/TX product ELECOM WDC150SU2M 0x4008 WDC-150SU2M product ELECOM LDUSBTX2 0x400b LD-USB/TX product ELECOM LDUSB20 0x4010 LD-USB20 product ELECOM UCSGT 0x5003 UC-SGT product ELECOM UCSGT0 0x5004 UC-SGT product ELECOM LDUSBTX3 0xabc1 LD-USB/TX /* Elektor products */ product ELEKTOR FT323R 0x0005 FTDI compatible adapter /* Elsa products */ product ELSA MODEM1 0x2265 ELSA Modem Board product ELSA USB2ETHERNET 0x3000 Microlink USB2Ethernet /* ELV products */ product ELV USBI2C 0xe00f USB-I2C interface /* EMS products */ product EMS DUAL_SHOOTER 0x0003 PSX gun controller converter /* Emtec products */ product EMTEC RUF2PS 0x2240 Flash Drive /* Encore products */ product ENCORE RT3070_1 0x1480 RT3070 product ENCORE RT3070_2 0x14a1 RT3070 product ENCORE RT3070_3 0x14a9 RT3070 /* Entrega products */ product ENTREGA 1S 0x0001 1S serial product ENTREGA 2S 0x0002 2S serial product ENTREGA 1S25 0x0003 1S25 serial product ENTREGA 4S 0x0004 4S serial product ENTREGA E45 0x0005 E45 Ethernet product ENTREGA CENTRONICS 0x0006 Parallel Port product ENTREGA XX1 0x0008 Ethernet product ENTREGA 1S9 0x0093 1S9 serial product ENTREGA EZUSB 0x8000 EZ-USB /*product ENTREGA SERIAL 0x8001 DB25 Serial*/ product ENTREGA 2U4S 0x8004 2U4S serial/usb hub product ENTREGA XX2 0x8005 Ethernet /*product ENTREGA SERIAL_DB9 0x8093 DB9 Serial*/ /* Epson products */ product EPSON PRINTER1 0x0001 USB Printer product EPSON PRINTER2 0x0002 ISD USB Smart Cable for Mac product EPSON PRINTER3 0x0003 ISD USB Smart Cable product EPSON PRINTER5 0x0005 USB Printer product EPSON 636 0x0101 Perfection 636U / 636Photo scanner product EPSON 610 0x0103 Perfection 610 scanner product EPSON 1200 0x0104 Perfection 1200U / 1200Photo scanner product EPSON 1600 0x0107 Expression 1600 scanner product EPSON 1640 0x010a Perfection 1640SU scanner product EPSON 1240 0x010b Perfection 1240U / 1240Photo scanner product EPSON 640U 0x010c Perfection 640U scanner product EPSON 1250 0x010f Perfection 1250U / 1250Photo scanner product EPSON 1650 0x0110 Perfection 1650 scanner product EPSON GT9700F 0x0112 GT-9700F scanner product EPSON GT9300UF 0x011b GT-9300UF scanner product EPSON 3200 0x011c Perfection 3200 scanner product EPSON 1260 0x011d Perfection 1260 scanner product EPSON 1660 0x011e Perfection 1660 scanner product EPSON 1670 0x011f Perfection 1670 scanner product EPSON 1270 0x0120 Perfection 1270 scanner product EPSON 2480 0x0121 Perfection 2480 scanner product EPSON 3590 0x0122 Perfection 3590 scanner product EPSON 4990 0x012a Perfection 4990 Photo scanner product EPSON CRESSI_EDY 0x0521 Cressi Edy diving computer product EPSON N2ITION3 0x0522 Zeagle N2iTion3 diving computer product EPSON STYLUS_875DC 0x0601 Stylus Photo 875DC Card Reader product EPSON STYLUS_895 0x0602 Stylus Photo 895 Card Reader product EPSON CX5400 0x0808 CX5400 scanner product EPSON 3500 0x080e CX-3500/3600/3650 MFP product EPSON RX425 0x080f Stylus Photo RX425 scanner product EPSON DX3800 0x0818 CX3700/CX3800/DX38x0 MFP scanner product EPSON 4800 0x0819 CX4700/CX4800/DX48x0 MFP scanner product EPSON 4200 0x0820 CX4100/CX4200/DX4200 MFP scanner product EPSON 5000 0x082b CX4900/CX5000/DX50x0 MFP scanner product EPSON 6000 0x082e CX5900/CX6000/DX60x0 MFP scanner product EPSON DX4000 0x082f DX4000 MFP scanner product EPSON DX7400 0x0838 CX7300/CX7400/DX7400 MFP scanner product EPSON DX8400 0x0839 CX8300/CX8400/DX8400 MFP scanner product EPSON SX100 0x0841 SX100/NX100 MFP scanner product EPSON NX300 0x0848 NX300 MFP scanner product EPSON SX200 0x0849 SX200/SX205 MFP scanner product EPSON SX400 0x084a SX400/NX400/TX400 MFP scanner /* e-TEK Labs products */ product ETEK 1COM 0x8007 Serial /* Evolution products */ product EVOLUTION ER1 0x0300 FTDI compatible adapter product EVOLUTION HYBRID 0x0302 FTDI compatible adapter product EVOLUTION RCM4 0x0303 FTDI compatible adapter /* Extended Systems products */ product EXTENDED XTNDACCESS 0x0100 XTNDAccess IrDA /* Falcom products */ product FALCOM TWIST 0x0001 USB GSM/GPRS Modem product FALCOM SAMBA 0x0005 FTDI compatible adapter /* FEIYA products */ product FEIYA DUMMY 0x0000 Dummy product product FEIYA 5IN1 0x1132 5-in-1 Card Reader product FEIYA ELANGO 0x6200 MicroSDHC Card Reader product FEIYA AC110 0x6300 AC-110 Card Reader /* FeiXun Communication products */ product FEIXUN RTL8188CU 0x0090 RTL8188CU product FEIXUN RTL8192CU 0x0091 RTL8192CU /* Festo */ product FESTO CPX_USB 0x0102 CPX-USB product FESTO CMSP 0x0501 CMSP /* Fiberline */ product FIBERLINE WL430U 0x6003 WL-430U /* FIC / OpenMoko */ product FIC NEO1973_DEBUG 0x5118 FTDI compatible adapter /* Fossil, Inc products */ product FOSSIL WRISTPDA 0x0002 Wrist PDA /* Foxconn products */ product FOXCONN TCOM_TC_300 0xe000 T-Com TC 300 product FOXCONN PIRELLI_DP_L10 0xe003 Pirelli DP-L10 /* Freecom products */ product FREECOM DVD 0xfc01 DVD drive product FREECOM HDD 0xfc05 Classic SL Hard Drive /* Fujitsu Siemens Computers products */ product FSC E5400 0x1009 PrismGT USB 2.0 WLAN /* Future Technology Devices products */ product FTDI SERIAL_8U100AX 0x8372 8U100AX Serial product FTDI SERIAL_8U232AM 0x6001 8U232AM Serial product FTDI SERIAL_8U232AM4 0x6004 8U232AM Serial product FTDI SERIAL_232RL 0x6006 FT232RL Serial product FTDI SERIAL_2232C 0x6010 FT2232C Dual port Serial product FTDI 232H 0x6014 FTDI compatible adapter product FTDI 232EX 0x6015 FTDI compatible adapter product FTDI SERIAL_2232D 0x9e90 FT2232D Dual port Serial product FTDI SERIAL_4232H 0x6011 FT4232H Quad port Serial product FTDI XDS100V2 0xa6d0 TI XDS100V1/V2 and early Beaglebones product FTDI XDS100V3 0xa6d1 TI XDS100V3 product FTDI KTLINK 0xbbe2 KT-LINK Embedded Hackers Multitool product FTDI TURTELIZER2 0xbdc8 egnite Turtelizer 2 JTAG/RS232 Adapter /* Gude Analog- und Digitalsysteme products also uses FTDI's id: */ product FTDI TACTRIX_OPENPORT_13M 0xcc48 OpenPort 1.3 Mitsubishi product FTDI TACTRIX_OPENPORT_13S 0xcc49 OpenPort 1.3 Subaru product FTDI TACTRIX_OPENPORT_13U 0xcc4a OpenPort 1.3 Universal product FTDI GAMMASCOUT 0xd678 Gamma-Scout product FTDI KBS 0xe6c8 Pyramid KBS USB LCD product FTDI EISCOU 0xe888 Expert ISDN Control USB product FTDI UOPTBR 0xe889 USB-RS232 OptoBridge product FTDI EMCU2D 0xe88a Expert mouseCLOCK USB II product FTDI PCMSFU 0xe88b Precision Clock MSF USB product FTDI EMCU2H 0xe88c Expert mouseCLOCK USB II HBG product FTDI MAXSTREAM 0xee18 Maxstream PKG-U product FTDI USB_UIRT 0xf850 USB-UIRT product FTDI USBSERIAL 0xfa00 Matrix Orbital USB Serial product FTDI MX2_3 0xfa01 Matrix Orbital MX2 or MX3 product FTDI MX4_5 0xfa02 Matrix Orbital MX4 or MX5 product FTDI LK202 0xfa03 Matrix Orbital VK/LK202 Family product FTDI LK204 0xfa04 Matrix Orbital VK/LK204 Family product FTDI CFA_632 0xfc08 Crystalfontz CFA-632 USB LCD product FTDI CFA_634 0xfc09 Crystalfontz CFA-634 USB LCD product FTDI CFA_633 0xfc0b Crystalfontz CFA-633 USB LCD product FTDI CFA_631 0xfc0c Crystalfontz CFA-631 USB LCD product FTDI CFA_635 0xfc0d Crystalfontz CFA-635 USB LCD product FTDI SEMC_DSS20 0xfc82 SEMC DSS-20 SyncStation /* Commerzielle und Technische Informationssysteme GmbH products */ product FTDI CTI_USB_NANO_485 0xf60b CTI USB-Nano 485 product FTDI CTI_USB_MINI_485 0xf608 CTI USB-Mini 485 /* Other products */ product FTDI 232RL 0xfbfa FTDI compatible adapter product FTDI 4N_GALAXY_DE_1 0xf3c0 FTDI compatible adapter product FTDI 4N_GALAXY_DE_2 0xf3c1 FTDI compatible adapter product FTDI 4N_GALAXY_DE_3 0xf3c2 FTDI compatible adapter product FTDI 8U232AM_ALT 0x6006 FTDI compatible adapter product FTDI ACCESSO 0xfad0 FTDI compatible adapter product FTDI ACG_HFDUAL 0xdd20 FTDI compatible adapter product FTDI ACTIVE_ROBOTS 0xe548 FTDI compatible adapter product FTDI ACTZWAVE 0xf2d0 FTDI compatible adapter product FTDI AMC232 0xff00 FTDI compatible adapter product FTDI ARTEMIS 0xdf28 FTDI compatible adapter product FTDI ASK_RDR400 0xc991 FTDI compatible adapter product FTDI ATIK_ATK16 0xdf30 FTDI compatible adapter product FTDI ATIK_ATK16C 0xdf32 FTDI compatible adapter product FTDI ATIK_ATK16HR 0xdf31 FTDI compatible adapter product FTDI ATIK_ATK16HRC 0xdf33 FTDI compatible adapter product FTDI ATIK_ATK16IC 0xdf35 FTDI compatible adapter product FTDI BCS_SE923 0xfb99 FTDI compatible adapter product FTDI CANDAPTER 0x9f80 FTDI compatible adapter product FTDI CANUSB 0xffa8 FTDI compatible adapter product FTDI CCSICDU20_0 0xf9d0 FTDI compatible adapter product FTDI CCSICDU40_1 0xf9d1 FTDI compatible adapter product FTDI CCSICDU64_4 0xf9d4 FTDI compatible adapter product FTDI CCSLOAD_N_GO_3 0xf9d3 FTDI compatible adapter product FTDI CCSMACHX_2 0xf9d2 FTDI compatible adapter product FTDI CCSPRIME8_5 0xf9d5 FTDI compatible adapter product FTDI CHAMSYS_24_MASTER_WING 0xdaf8 FTDI compatible adapter product FTDI CHAMSYS_MAXI_WING 0xdafd FTDI compatible adapter product FTDI CHAMSYS_MEDIA_WING 0xdafe FTDI compatible adapter product FTDI CHAMSYS_MIDI_TIMECODE 0xdafb FTDI compatible adapter product FTDI CHAMSYS_MINI_WING 0xdafc FTDI compatible adapter product FTDI CHAMSYS_PC_WING 0xdaf9 FTDI compatible adapter product FTDI CHAMSYS_USB_DMX 0xdafa FTDI compatible adapter product FTDI CHAMSYS_WING 0xdaff FTDI compatible adapter product FTDI COM4SM 0xd578 FTDI compatible adapter product FTDI CONVERTER_0 0xd388 FTDI compatible adapter product FTDI CONVERTER_1 0xd389 FTDI compatible adapter product FTDI CONVERTER_2 0xd38a FTDI compatible adapter product FTDI CONVERTER_3 0xd38b FTDI compatible adapter product FTDI CONVERTER_4 0xd38c FTDI compatible adapter product FTDI CONVERTER_5 0xd38d FTDI compatible adapter product FTDI CONVERTER_6 0xd38e FTDI compatible adapter product FTDI CONVERTER_7 0xd38f FTDI compatible adapter product FTDI DMX4ALL 0xc850 FTDI compatible adapter product FTDI DOMINTELL_DGQG 0xef50 FTDI compatible adapter product FTDI DOMINTELL_DUSB 0xef51 FTDI compatible adapter product FTDI DOTEC 0x9868 FTDI compatible adapter product FTDI ECLO_COM_1WIRE 0xea90 FTDI compatible adapter product FTDI ECO_PRO_CDS 0xe520 FTDI compatible adapter product FTDI ELSTER_UNICOM 0xe700 FTDI compatible adapter product FTDI ELV_ALC8500 0xf06e FTDI compatible adapter product FTDI ELV_CLI7000 0xfb59 FTDI compatible adapter product FTDI ELV_CSI8 0xe0f0 FTDI compatible adapter product FTDI ELV_EC3000 0xe006 FTDI compatible adapter product FTDI ELV_EM1000DL 0xe0f1 FTDI compatible adapter product FTDI ELV_EM1010PC 0xe0ef FTDI compatible adapter product FTDI ELV_FEM 0xe00a FTDI compatible adapter product FTDI ELV_FHZ1000PC 0xf06f FTDI compatible adapter product FTDI ELV_FHZ1300PC 0xe0e8 FTDI compatible adapter product FTDI ELV_FM3RX 0xe0ed FTDI compatible adapter product FTDI ELV_FS20SIG 0xe0f4 FTDI compatible adapter product FTDI ELV_HS485 0xe0ea FTDI compatible adapter product FTDI ELV_KL100 0xe002 FTDI compatible adapter product FTDI ELV_MSM1 0xe001 FTDI compatible adapter product FTDI ELV_PCD200 0xf06c FTDI compatible adapter product FTDI ELV_PCK100 0xe0f2 FTDI compatible adapter product FTDI ELV_PPS7330 0xfb5c FTDI compatible adapter product FTDI ELV_RFP500 0xe0f3 FTDI compatible adapter product FTDI ELV_T1100 0xf06b FTDI compatible adapter product FTDI ELV_TFD128 0xe0ec FTDI compatible adapter product FTDI ELV_TFM100 0xfb5d FTDI compatible adapter product FTDI ELV_TWS550 0xe009 FTDI compatible adapter product FTDI ELV_UAD8 0xf068 FTDI compatible adapter product FTDI ELV_UDA7 0xf069 FTDI compatible adapter product FTDI ELV_UDF77 0xfb5e FTDI compatible adapter product FTDI ELV_UIO88 0xfb5f FTDI compatible adapter product FTDI ELV_ULA200 0xf06d FTDI compatible adapter product FTDI ELV_UM100 0xfb5a FTDI compatible adapter product FTDI ELV_UMS100 0xe0eb FTDI compatible adapter product FTDI ELV_UO100 0xfb5b FTDI compatible adapter product FTDI ELV_UR100 0xfb58 FTDI compatible adapter product FTDI ELV_USI2 0xf06a FTDI compatible adapter product FTDI ELV_USR 0xe000 FTDI compatible adapter product FTDI ELV_UTP8 0xe0f5 FTDI compatible adapter product FTDI ELV_WS300PC 0xe0f6 FTDI compatible adapter product FTDI ELV_WS444PC 0xe0f7 FTDI compatible adapter product FTDI ELV_WS500 0xe0e9 FTDI compatible adapter product FTDI ELV_WS550 0xe004 FTDI compatible adapter product FTDI ELV_WS777 0xe0ee FTDI compatible adapter product FTDI ELV_WS888 0xe008 FTDI compatible adapter product FTDI FUTURE_0 0xf44a FTDI compatible adapter product FTDI FUTURE_1 0xf44b FTDI compatible adapter product FTDI FUTURE_2 0xf44c FTDI compatible adapter product FTDI GENERIC 0x9378 FTDI compatible adapter product FTDI GUDEADS_E808 0xe808 FTDI compatible adapter product FTDI GUDEADS_E809 0xe809 FTDI compatible adapter product FTDI GUDEADS_E80A 0xe80a FTDI compatible adapter product FTDI GUDEADS_E80B 0xe80b FTDI compatible adapter product FTDI GUDEADS_E80C 0xe80c FTDI compatible adapter product FTDI GUDEADS_E80D 0xe80d FTDI compatible adapter product FTDI GUDEADS_E80E 0xe80e FTDI compatible adapter product FTDI GUDEADS_E80F 0xe80f FTDI compatible adapter product FTDI GUDEADS_E88D 0xe88d FTDI compatible adapter product FTDI GUDEADS_E88E 0xe88e FTDI compatible adapter product FTDI GUDEADS_E88F 0xe88f FTDI compatible adapter product FTDI HD_RADIO 0x937c FTDI compatible adapter product FTDI HO720 0xed72 FTDI compatible adapter product FTDI HO730 0xed73 FTDI compatible adapter product FTDI HO820 0xed74 FTDI compatible adapter product FTDI HO870 0xed71 FTDI compatible adapter product FTDI IBS_APP70 0xff3d FTDI compatible adapter product FTDI IBS_PCMCIA 0xff3a FTDI compatible adapter product FTDI IBS_PEDO 0xff3e FTDI compatible adapter product FTDI IBS_PICPRO 0xff39 FTDI compatible adapter product FTDI IBS_PK1 0xff3b FTDI compatible adapter product FTDI IBS_PROD 0xff3f FTDI compatible adapter product FTDI IBS_RS232MON 0xff3c FTDI compatible adapter product FTDI IBS_US485 0xff38 FTDI compatible adapter product FTDI IPLUS 0xd070 FTDI compatible adapter product FTDI IPLUS2 0xd071 FTDI compatible adapter product FTDI IRTRANS 0xfc60 FTDI compatible adapter product FTDI LENZ_LIUSB 0xd780 FTDI compatible adapter product FTDI LM3S_DEVEL_BOARD 0xbcd8 FTDI compatible adapter product FTDI LM3S_EVAL_BOARD 0xbcd9 FTDI compatible adapter product FTDI LM3S_ICDI_B_BOARD 0xbcda FTDI compatible adapter product FTDI MASTERDEVEL2 0xf449 FTDI compatible adapter product FTDI MHAM_DB9 0xeeed FTDI compatible adapter product FTDI MHAM_IC 0xeeec FTDI compatible adapter product FTDI MHAM_KW 0xeee8 FTDI compatible adapter product FTDI MHAM_RS232 0xeeee FTDI compatible adapter product FTDI MHAM_Y6 0xeeea FTDI compatible adapter product FTDI MHAM_Y8 0xeeeb FTDI compatible adapter product FTDI MHAM_Y9 0xeeef FTDI compatible adapter product FTDI MHAM_YS 0xeee9 FTDI compatible adapter product FTDI MICRO_CHAMELEON 0xcaa0 FTDI compatible adapter product FTDI MTXORB_5 0xfa05 FTDI compatible adapter product FTDI MTXORB_6 0xfa06 FTDI compatible adapter product FTDI NXTCAM 0xabb8 FTDI compatible adapter product FTDI OCEANIC 0xf460 FTDI compatible adapter product FTDI OOCDLINK 0xbaf8 FTDI compatible adapter product FTDI OPENDCC 0xbfd8 FTDI compatible adapter product FTDI OPENDCC_GATEWAY 0xbfdb FTDI compatible adapter product FTDI OPENDCC_GBM 0xbfdc FTDI compatible adapter product FTDI OPENDCC_SNIFFER 0xbfd9 FTDI compatible adapter product FTDI OPENDCC_THROTTLE 0xbfda FTDI compatible adapter product FTDI PCDJ_DAC2 0xfa88 FTDI compatible adapter product FTDI PERLE_ULTRAPORT 0xf0c0 FTDI compatible adapter product FTDI PHI_FISCO 0xe40b FTDI compatible adapter product FTDI PIEGROUP 0xf208 FTDI compatible adapter product FTDI PROPOX_JTAGCABLEII 0xd738 FTDI compatible adapter product FTDI R2000KU_TRUE_RNG 0xfb80 FTDI compatible adapter product FTDI R2X0 0xfc71 FTDI compatible adapter product FTDI RELAIS 0xfa10 FTDI compatible adapter product FTDI REU_TINY 0xed22 FTDI compatible adapter product FTDI RMP200 0xe729 FTDI compatible adapter product FTDI RM_CANVIEW 0xfd60 FTDI compatible adapter product FTDI RRCIRKITS_LOCOBUFFER 0xc7d0 FTDI compatible adapter product FTDI SCIENCESCOPE_HS_LOGBOOK 0xff1d FTDI compatible adapter product FTDI SCIENCESCOPE_LOGBOOKML 0xff18 FTDI compatible adapter product FTDI SCIENCESCOPE_LS_LOGBOOK 0xff1c FTDI compatible adapter product FTDI SCS_DEVICE_0 0xd010 FTDI compatible adapter product FTDI SCS_DEVICE_1 0xd011 FTDI compatible adapter product FTDI SCS_DEVICE_2 0xd012 FTDI compatible adapter product FTDI SCS_DEVICE_3 0xd013 FTDI compatible adapter product FTDI SCS_DEVICE_4 0xd014 FTDI compatible adapter product FTDI SCS_DEVICE_5 0xd015 FTDI compatible adapter product FTDI SCS_DEVICE_6 0xd016 FTDI compatible adapter product FTDI SCS_DEVICE_7 0xd017 FTDI compatible adapter product FTDI SDMUSBQSS 0xf448 FTDI compatible adapter product FTDI SIGNALYZER_SH2 0xbca2 FTDI compatible adapter product FTDI SIGNALYZER_SH4 0xbca4 FTDI compatible adapter product FTDI SIGNALYZER_SLITE 0xbca1 FTDI compatible adapter product FTDI SIGNALYZER_ST 0xbca0 FTDI compatible adapter product FTDI SPECIAL_1 0xfc70 FTDI compatible adapter product FTDI SPECIAL_3 0xfc72 FTDI compatible adapter product FTDI SPECIAL_4 0xfc73 FTDI compatible adapter product FTDI SPROG_II 0xf0c8 FTDI compatible adapter product FTDI SR_RADIO 0x9379 FTDI compatible adapter product FTDI SUUNTO_SPORTS 0xf680 FTDI compatible adapter product FTDI TAVIR_STK500 0xfa33 FTDI compatible adapter product FTDI TERATRONIK_D2XX 0xec89 FTDI compatible adapter product FTDI TERATRONIK_VCP 0xec88 FTDI compatible adapter product FTDI THORLABS 0xfaf0 FTDI compatible adapter product FTDI TNC_X 0xebe0 FTDI compatible adapter product FTDI TTUSB 0xff20 FTDI compatible adapter product FTDI USBX_707 0xf857 FTDI compatible adapter product FTDI USINT_CAT 0xb810 FTDI compatible adapter product FTDI USINT_RS232 0xb812 FTDI compatible adapter product FTDI USINT_WKEY 0xb811 FTDI compatible adapter product FTDI VARDAAN 0xf070 FTDI compatible adapter product FTDI VNHCPCUSB_D 0xfe38 FTDI compatible adapter product FTDI WESTREX_MODEL_777 0xdc00 FTDI compatible adapter product FTDI WESTREX_MODEL_8900F 0xdc01 FTDI compatible adapter product FTDI XF_547 0xfc0a FTDI compatible adapter product FTDI XF_640 0xfc0e FTDI compatible adapter product FTDI XF_642 0xfc0f FTDI compatible adapter product FTDI XM_RADIO 0x937a FTDI compatible adapter product FTDI YEI_SERVOCENTER31 0xe050 FTDI compatible adapter /* Fuji photo products */ product FUJIPHOTO MASS0100 0x0100 Mass Storage /* Fujitsu protducts */ product FUJITSU AH_F401U 0x105b AH-F401U Air H device /* Fujitsu-Siemens protducts */ product FUJITSUSIEMENS SCR 0x0009 Fujitsu-Siemens SCR USB Reader /* Garmin products */ product GARMIN IQUE_3600 0x0004 iQue 3600 /* Gemalto products */ product GEMALTO PROXPU 0x5501 Prox-PU/CU RFID Card Reader /* General Instruments (Motorola) products */ product GENERALINSTMNTS SB5100 0x5100 SURFboard SB5100 Cable modem /* Genesys Logic products */ product GENESYS GL620USB 0x0501 GL620USB Host-Host interface product GENESYS GL650 0x0604 GL650 HUB product GENESYS GL606 0x0606 USB 2.0 HUB product GENESYS GL641USB 0x0700 GL641USB CompactFlash Card Reader product GENESYS GL641USB2IDE_2 0x0701 GL641USB USB-IDE Bridge No 2 product GENESYS GL641USB2IDE 0x0702 GL641USB USB-IDE Bridge product GENESYS GL641USB_2 0x0760 GL641USB 6-in-1 Card Reader /* GIGABYTE products */ product GIGABYTE GN54G 0x8001 GN-54G product GIGABYTE GNBR402W 0x8002 GN-BR402W product GIGABYTE GNWLBM101 0x8003 GN-WLBM101 product GIGABYTE GNWBKG 0x8007 GN-WBKG product GIGABYTE GNWB01GS 0x8008 GN-WB01GS product GIGABYTE GNWI05GS 0x800a GN-WI05GS /* Gigaset products */ product GIGASET WLAN 0x0701 WLAN product GIGASET SMCWUSBTG 0x0710 SMCWUSBT-G product GIGASET SMCWUSBTG_NF 0x0711 SMCWUSBT-G (no firmware) product GIGASET AR5523 0x0712 AR5523 product GIGASET AR5523_NF 0x0713 AR5523 (no firmware) product GIGASET RT2573 0x0722 RT2573 product GIGASET RT3070_1 0x0740 RT3070 product GIGASET RT3070_2 0x0744 RT3070 product GIGABYTE RT2870_1 0x800b RT2870 product GIGABYTE GNWB31N 0x800c GN-WB31N product GIGABYTE GNWB32L 0x800d GN-WB32L /* Global Sun Technology product */ product GLOBALSUN AR5523_1 0x7801 AR5523 product GLOBALSUN AR5523_1_NF 0x7802 AR5523 (no firmware) product GLOBALSUN AR5523_2 0x7811 AR5523 product GLOBALSUN AR5523_2_NF 0x7812 AR5523 (no firmware) /* Globespan products */ product GLOBESPAN PRISM_GT_1 0x2000 PrismGT USB 2.0 WLAN product GLOBESPAN PRISM_GT_2 0x2002 PrismGT USB 2.0 WLAN /* G.Mate, Inc products */ product GMATE YP3X00 0x1001 YP3X00 PDA /* GN Otometrics */ product GNOTOMETRICS USB 0x0010 FTDI compatible adapter /* GoHubs products */ product GOHUBS GOCOM232 0x1001 GoCOM232 Serial /* Good Way Technology products */ product GOODWAY GWUSB2E 0x6200 GWUSB2E product GOODWAY RT2573 0xc019 RT2573 /* Google products */ product GOOGLE NEXUSONE 0x4e11 Nexus One /* Gravis products */ product GRAVIS GAMEPADPRO 0x4001 GamePad Pro /* GREENHOUSE products */ product GREENHOUSE KANA21 0x0001 CF-writer with MP3 /* Griffin Technology */ product GRIFFIN IMATE 0x0405 iMate, ADB Adapter /* Guillemot Corporation */ product GUILLEMOT DALEADER 0xa300 DA Leader product GUILLEMOT HWGUSB254 0xe000 HWGUSB2-54 WLAN product GUILLEMOT HWGUSB254LB 0xe010 HWGUSB2-54-LB product GUILLEMOT HWGUSB254V2AP 0xe020 HWGUSB2-54V2-AP product GUILLEMOT HWNU300 0xe030 HWNU-300 product GUILLEMOT HWNUM300 0xe031 HWNUm-300 product GUILLEMOT HWGUN54 0xe032 HWGUn-54 product GUILLEMOT HWNUP150 0xe033 HWNUP-150 /* Hagiwara products */ product HAGIWARA FGSM 0x0002 FlashGate SmartMedia Card Reader product HAGIWARA FGCF 0x0003 FlashGate CompactFlash Card Reader product HAGIWARA FG 0x0005 FlashGate /* HAL Corporation products */ product HAL IMR001 0x0011 Crossam2+USB IR commander /* Handspring, Inc. */ product HANDSPRING VISOR 0x0100 Handspring Visor product HANDSPRING TREO 0x0200 Handspring Treo product HANDSPRING TREO600 0x0300 Handspring Treo 600 /* Hauppauge Computer Works */ product HAUPPAUGE WINTV_USB_FM 0x4d12 WinTV USB FM product HAUPPAUGE2 NOVAT500 0x9580 NovaT 500Stick /* Hawking Technologies products */ product HAWKING RT2870_1 0x0001 RT2870 product HAWKING RT2870_2 0x0003 RT2870 product HAWKING HWUN2 0x0009 HWUN2 product HAWKING RT3070 0x000b RT3070 product HAWKING RTL8192CU 0x0019 RTL8192CU product HAWKING UF100 0x400c 10/100 USB Ethernet product HAWKING RTL8192SU_1 0x0015 RTL8192SU product HAWKING RTL8192SU_2 0x0016 RTL8192SU /* HID Global GmbH products */ product HIDGLOBAL CM2020 0x0596 Omnikey Cardman 2020 product HIDGLOBAL CM6020 0x1784 Omnikey Cardman 6020 /* Hitachi, Ltd. products */ product HITACHI DVDCAM_DZ_MV100A 0x0004 DVD-CAM DZ-MV100A Camcorder product HITACHI DVDCAM_USB 0x001e DVDCAM USB HS Interface /* HP products */ product HP 895C 0x0004 DeskJet 895C product HP 4100C 0x0101 Scanjet 4100C product HP S20 0x0102 Photosmart S20 product HP 880C 0x0104 DeskJet 880C product HP 4200C 0x0105 ScanJet 4200C product HP CDWRITERPLUS 0x0107 CD-Writer Plus product HP KBDHUB 0x010c Multimedia Keyboard Hub product HP G55XI 0x0111 OfficeJet G55xi product HP HN210W 0x011c HN210W 802.11b WLAN product HP 49GPLUS 0x0121 49g+ graphing calculator product HP 6200C 0x0201 ScanJet 6200C product HP S20b 0x0202 PhotoSmart S20 product HP 815C 0x0204 DeskJet 815C product HP 3300C 0x0205 ScanJet 3300C product HP CDW8200 0x0207 CD-Writer Plus 8200e product HP MMKEYB 0x020c Multimedia keyboard product HP 1220C 0x0212 DeskJet 1220C product HP UN2420_QDL 0x241d UN2420 QDL Firmware Loader product HP UN2420 0x251d UN2420 WWAN/GPS Module product HP 810C 0x0304 DeskJet 810C/812C product HP 4300C 0x0305 Scanjet 4300C product HP CDW4E 0x0307 CD-Writer+ CD-4e product HP G85XI 0x0311 OfficeJet G85xi product HP 1200 0x0317 LaserJet 1200 product HP 5200C 0x0401 Scanjet 5200C product HP 830C 0x0404 DeskJet 830C product HP 3400CSE 0x0405 ScanJet 3400cse product HP 6300C 0x0601 Scanjet 6300C product HP 840C 0x0604 DeskJet 840c product HP 2200C 0x0605 ScanJet 2200C product HP 5300C 0x0701 Scanjet 5300C product HP 4400C 0x0705 Scanjet 4400C product HP 4470C 0x0805 Scanjet 4470C product HP 82x0C 0x0b01 Scanjet 82x0C product HP 2300D 0x0b17 Laserjet 2300d product HP 970CSE 0x1004 Deskjet 970Cse product HP 5400C 0x1005 Scanjet 5400C product HP 2215 0x1016 iPAQ 22xx/Jornada 548 product HP 568J 0x1116 Jornada 568 product HP 930C 0x1204 DeskJet 930c product HP3 RTL8188CU 0x1629 RTL8188CU product HP P2000U 0x1801 Inkjet P-2000U product HP HS2300 0x1e1d HS2300 HSDPA (aka MC8775) product HP 640C 0x2004 DeskJet 640c product HP 4670V 0x3005 ScanJet 4670v product HP P1100 0x3102 Photosmart P1100 product HP LD220 0x3524 LD220 POS Display product HP OJ4215 0x3d11 OfficeJet 4215 product HP HN210E 0x811c Ethernet HN210E product HP2 C500 0x6002 PhotoSmart C500 product HP EV2200 0x1b1d ev2200 HSDPA (aka MC5720) product HP HS2300 0x1e1d hs2300 HSDPA (aka MC8775) /* HTC products */ product HTC WINMOBILE 0x00ce HTC USB Sync product HTC PPC6700MODEM 0x00cf PPC6700 Modem product HTC SMARTPHONE 0x0a51 SmartPhone USB Sync product HTC WIZARD 0x0bce HTC Wizard USB Sync product HTC LEGENDSYNC 0x0c97 HTC Legend USB Sync product HTC LEGEND 0x0ff9 HTC Legend product HTC LEGENDINTERNET 0x0ffe HTC Legend Internet Sharing /* HUAWEI products */ product HUAWEI MOBILE 0x1001 Huawei Mobile product HUAWEI E220 0x1003 HSDPA modem product HUAWEI E220BIS 0x1004 HSDPA modem product HUAWEI E1401 0x1401 3G modem product HUAWEI E1402 0x1402 3G modem product HUAWEI E1403 0x1403 3G modem product HUAWEI E1404 0x1404 3G modem product HUAWEI E1405 0x1405 3G modem product HUAWEI E1406 0x1406 3G modem product HUAWEI E1407 0x1407 3G modem product HUAWEI E1408 0x1408 3G modem product HUAWEI E1409 0x1409 3G modem product HUAWEI E140A 0x140a 3G modem product HUAWEI E140B 0x140b 3G modem product HUAWEI E180V 0x140c E180V product HUAWEI E140D 0x140d 3G modem product HUAWEI E140E 0x140e 3G modem product HUAWEI E140F 0x140f 3G modem product HUAWEI E1410 0x1410 3G modem product HUAWEI E1411 0x1411 3G modem product HUAWEI E1412 0x1412 3G modem product HUAWEI E1413 0x1413 3G modem product HUAWEI E1414 0x1414 3G modem product HUAWEI E1415 0x1415 3G modem product HUAWEI E1416 0x1416 3G modem product HUAWEI E1417 0x1417 3G modem product HUAWEI E1418 0x1418 3G modem product HUAWEI E1419 0x1419 3G modem product HUAWEI E141A 0x141a 3G modem product HUAWEI E141B 0x141b 3G modem product HUAWEI E141C 0x141c 3G modem product HUAWEI E141D 0x141d 3G modem product HUAWEI E141E 0x141e 3G modem product HUAWEI E141F 0x141f 3G modem product HUAWEI E1420 0x1420 3G modem product HUAWEI E1421 0x1421 3G modem product HUAWEI E1422 0x1422 3G modem product HUAWEI E1423 0x1423 3G modem product HUAWEI E1424 0x1424 3G modem product HUAWEI E1425 0x1425 3G modem product HUAWEI E1426 0x1426 3G modem product HUAWEI E1427 0x1427 3G modem product HUAWEI E1428 0x1428 3G modem product HUAWEI E1429 0x1429 3G modem product HUAWEI E142A 0x142a 3G modem product HUAWEI E142B 0x142b 3G modem product HUAWEI E142C 0x142c 3G modem product HUAWEI E142D 0x142d 3G modem product HUAWEI E142E 0x142e 3G modem product HUAWEI E142F 0x142f 3G modem product HUAWEI E1430 0x1430 3G modem product HUAWEI E1431 0x1431 3G modem product HUAWEI E1432 0x1432 3G modem product HUAWEI E1433 0x1433 3G modem product HUAWEI E1434 0x1434 3G modem product HUAWEI E1435 0x1435 3G modem product HUAWEI E1436 0x1436 3G modem product HUAWEI E1437 0x1437 3G modem product HUAWEI E1438 0x1438 3G modem product HUAWEI E1439 0x1439 3G modem product HUAWEI E143A 0x143a 3G modem product HUAWEI E143B 0x143b 3G modem product HUAWEI E143C 0x143c 3G modem product HUAWEI E143D 0x143d 3G modem product HUAWEI E143E 0x143e 3G modem product HUAWEI E143F 0x143f 3G modem product HUAWEI E1752 0x1446 3G modem product HUAWEI K4505 0x1464 3G modem product HUAWEI K3765 0x1465 3G modem product HUAWEI E1820 0x14ac E1820 HSPA+ USB Slider product HUAWEI K3770 0x14c9 3G modem product HUAWEI K3772 0x14cf K3772 product HUAWEI K3770_INIT 0x14d1 K3770 Initial product HUAWEI E3131_INIT 0x14fe 3G modem initial product HUAWEI E392 0x1505 LTE modem product HUAWEI E3131 0x1506 3G modem product HUAWEI K3765_INIT 0x1520 K3765 Initial product HUAWEI K4505_INIT 0x1521 K4505 Initial product HUAWEI K3772_INIT 0x1526 K3772 Initial product HUAWEI E3272_INIT 0x155b LTE modem initial product HUAWEI R215_INIT 0x1582 LTE modem initial product HUAWEI R215 0x1588 LTE modem product HUAWEI ETS2055 0x1803 CDMA modem product HUAWEI E173 0x1c05 3G modem product HUAWEI E173_INIT 0x1c0b 3G modem initial product HUAWEI E3272 0x1c1e LTE modem /* HUAWEI 3com products */ product HUAWEI3COM WUB320G 0x0009 Aolynk WUB320g /* IBM Corporation */ product IBM USBCDROMDRIVE 0x4427 USB CD-ROM Drive /* Icom products */ product ICOM SP1 0x0004 FTDI compatible adapter product ICOM OPC_U_UC 0x0018 FTDI compatible adapter product ICOM RP2C1 0x0009 FTDI compatible adapter product ICOM RP2C2 0x000a FTDI compatible adapter product ICOM RP2D 0x000b FTDI compatible adapter product ICOM RP2KVR 0x0013 FTDI compatible adapter product ICOM RP2KVT 0x0012 FTDI compatible adapter product ICOM RP2VR 0x000d FTDI compatible adapter product ICOM RP2VT 0x000c FTDI compatible adapter product ICOM RP4KVR 0x0011 FTDI compatible adapter product ICOM RP4KVT 0x0010 FTDI compatible adapter /* ID-tech products */ product IDTECH IDT1221U 0x0300 FTDI compatible adapter /* Imagination Technologies products */ product IMAGINATION DBX1 0x2107 DBX1 DSP core /* Initio Corporation products */ product INITIO DUMMY 0x0000 Dummy product product INITIO INIC_1610P 0x1e40 USB to SATA Bridge /* Inside Out Networks products */ product INSIDEOUT EDGEPORT4 0x0001 EdgePort/4 serial ports /* In-System products */ product INSYSTEM F5U002 0x0002 Parallel printer product INSYSTEM ATAPI 0x0031 ATAPI Adapter product INSYSTEM ISD110 0x0200 IDE Adapter ISD110 product INSYSTEM ISD105 0x0202 IDE Adapter ISD105 product INSYSTEM USBCABLE 0x081a USB cable product INSYSTEM STORAGE_V2 0x5701 USB Storage Adapter V2 /* Intel products */ product INTEL EASYPC_CAMERA 0x0110 Easy PC Camera product INTEL TESTBOARD 0x9890 82930 test board product INTEL2 IRMH 0x0020 Integrated Rate Matching Hub product INTEL2 IRMH2 0x0024 Integrated Rate Matching Hub /* Interbiometric products */ product INTERBIOMETRICS IOBOARD 0x1002 FTDI compatible adapter product INTERBIOMETRICS MINI_IOBOARD 0x1006 FTDI compatible adapter /* Intersil products */ product INTERSIL PRISM_GT 0x1000 PrismGT USB 2.0 WLAN product INTERSIL PRISM_2X 0x3642 Prism2.x or Atmel WLAN /* Interpid Control Systems products */ product INTREPIDCS VALUECAN 0x0601 ValueCAN CAN bus interface product INTREPIDCS NEOVI 0x0701 NeoVI Blue vehicle bus interface /* I/O DATA products */ product IODATA IU_CD2 0x0204 DVD Multi-plus unit iU-CD2 product IODATA DVR_UEH8 0x0206 DVD Multi-plus unit DVR-UEH8 product IODATA USBSSMRW 0x0314 USB-SSMRW SD-card product IODATA USBSDRW 0x031e USB-SDRW SD-card product IODATA USBETT 0x0901 USB ETT product IODATA USBETTX 0x0904 USB ETTX product IODATA USBETTXS 0x0913 USB ETTX product IODATA USBWNB11A 0x0919 USB WN-B11 product IODATA USBWNB11 0x0922 USB Airport WN-B11 product IODATA ETGUS2 0x0930 ETG-US2 product IODATA RT3072_1 0x0944 RT3072 product IODATA RT3072_2 0x0945 RT3072 product IODATA RT3072_3 0x0947 RT3072 product IODATA RT3072_4 0x0948 RT3072 product IODATA USBRSAQ 0x0a03 Serial USB-RSAQ1 product IODATA USBRSAQ5 0x0a0e Serial USB-RSAQ5 product IODATA2 USB2SC 0x0a09 USB2.0-SCSI Bridge USB2-SC /* Iomega products */ product IOMEGA ZIP100 0x0001 Zip 100 product IOMEGA ZIP250 0x0030 Zip 250 /* Ionic products */ product IONICS PLUGCOMPUTER 0x0102 FTDI compatible adapter /* Integrated System Solution Corp. products */ product ISSC ISSCBTA 0x1001 Bluetooth USB Adapter /* iTegno products */ product ITEGNO WM1080A 0x1080 WM1080A GSM/GPRS modem product ITEGNO WM2080A 0x2080 WM2080A CDMA modem /* Ituner networks products */ product ITUNERNET USBLCD2X20 0x0002 USB-LCD 2x20 product ITUNERNET USBLCD4X20 0xc001 USB-LCD 4x20 /* Jablotron products */ product JABLOTRON PC60B 0x0001 PC-60B /* Jaton products */ product JATON EDA 0x5704 Ethernet /* Jeti products */ product JETI SPC1201 0x04b2 FTDI compatible adapter /* JMicron products */ product JMICRON JM20336 0x2336 USB to SATA Bridge product JMICRON JM20337 0x2338 USB to ATA/ATAPI Bridge /* JVC products */ product JVC GR_DX95 0x000a GR-DX95 product JVC MP_PRX1 0x3008 MP-PRX1 Ethernet /* JRC products */ product JRC AH_J3001V_J3002V 0x0001 AirH PHONE AH-J3001V/J3002V /* Kamstrrup products */ product KAMSTRUP OPTICALEYE 0x0001 Optical Eye/3-wire product KAMSTRUP MBUS_250D 0x0005 M-Bus Master MultiPort 250D /* Kawatsu products */ product KAWATSU MH4000P 0x0003 MiniHub 4000P /* Keisokugiken Corp. products */ product KEISOKUGIKEN USBDAQ 0x0068 HKS-0200 USBDAQ /* Kensington products */ product KENSINGTON ORBIT 0x1003 Orbit USB/PS2 trackball product KENSINGTON TURBOBALL 0x1005 TurboBall /* Keyspan products */ product KEYSPAN USA28_NF 0x0101 USA-28 serial Adapter (no firmware) product KEYSPAN USA28X_NF 0x0102 USA-28X serial Adapter (no firmware) product KEYSPAN USA19_NF 0x0103 USA-19 serial Adapter (no firmware) product KEYSPAN USA18_NF 0x0104 USA-18 serial Adapter (no firmware) product KEYSPAN USA18X_NF 0x0105 USA-18X serial Adapter (no firmware) product KEYSPAN USA19W_NF 0x0106 USA-19W serial Adapter (no firmware) product KEYSPAN USA19 0x0107 USA-19 serial Adapter product KEYSPAN USA19W 0x0108 USA-19W serial Adapter product KEYSPAN USA49W_NF 0x0109 USA-49W serial Adapter (no firmware) product KEYSPAN USA49W 0x010a USA-49W serial Adapter product KEYSPAN USA19QI_NF 0x010b USA-19QI serial Adapter (no firmware) product KEYSPAN USA19QI 0x010c USA-19QI serial Adapter product KEYSPAN USA19Q_NF 0x010d USA-19Q serial Adapter (no firmware) product KEYSPAN USA19Q 0x010e USA-19Q serial Adapter product KEYSPAN USA28 0x010f USA-28 serial Adapter product KEYSPAN USA28XXB 0x0110 USA-28X/XB serial Adapter product KEYSPAN USA18 0x0111 USA-18 serial Adapter product KEYSPAN USA18X 0x0112 USA-18X serial Adapter product KEYSPAN USA28XB_NF 0x0113 USA-28XB serial Adapter (no firmware) product KEYSPAN USA28XA_NF 0x0114 USA-28XB serial Adapter (no firmware) product KEYSPAN USA28XA 0x0115 USA-28XA serial Adapter product KEYSPAN USA18XA_NF 0x0116 USA-18XA serial Adapter (no firmware) product KEYSPAN USA18XA 0x0117 USA-18XA serial Adapter product KEYSPAN USA19QW_NF 0x0118 USA-19WQ serial Adapter (no firmware) product KEYSPAN USA19QW 0x0119 USA-19WQ serial Adapter product KEYSPAN USA19HA 0x0121 USA-19HS serial Adapter product KEYSPAN UIA10 0x0201 UIA-10 remote control product KEYSPAN UIA11 0x0202 UIA-11 remote control /* Kingston products */ product KINGSTON XX1 0x0008 Ethernet product KINGSTON KNU101TX 0x000a KNU101TX USB Ethernet product KINGSTON HYPERX3_0 0x162b DT HyperX 3.0 /* Kawasaki products */ product KLSI DUH3E10BT 0x0008 USB Ethernet product KLSI DUH3E10BTN 0x0009 USB Ethernet /* Kobil products */ product KOBIL CONV_B1 0x2020 FTDI compatible adapter product KOBIL CONV_KAAN 0x2021 FTDI compatible adapter /* Kodak products */ product KODAK DC220 0x0100 Digital Science DC220 product KODAK DC260 0x0110 Digital Science DC260 product KODAK DC265 0x0111 Digital Science DC265 product KODAK DC290 0x0112 Digital Science DC290 product KODAK DC240 0x0120 Digital Science DC240 product KODAK DC280 0x0130 Digital Science DC280 /* Kontron AG products */ product KONTRON DM9601 0x8101 USB Ethernet product KONTRON JP1082 0x9700 USB Ethernet /* Konica Corp. Products */ product KONICA CAMERA 0x0720 Digital Color Camera /* KYE products */ product KYE NICHE 0x0001 Niche mouse product KYE NETSCROLL 0x0003 Genius NetScroll mouse product KYE FLIGHT2000 0x1004 Flight 2000 joystick product KYE VIVIDPRO 0x2001 ColorPage Vivid-Pro scanner /* Kyocera products */ product KYOCERA FINECAM_S3X 0x0100 Finecam S3x product KYOCERA FINECAM_S4 0x0101 Finecam S4 product KYOCERA FINECAM_S5 0x0103 Finecam S5 product KYOCERA FINECAM_L3 0x0105 Finecam L3 product KYOCERA AHK3001V 0x0203 AH-K3001V product KYOCERA2 CDMA_MSM_K 0x17da Qualcomm Kyocera CDMA Technologies MSM product KYOCERA2 KPC680 0x180a Qualcomm Kyocera CDMA Technologies MSM /* LaCie products */ product LACIE HD 0xa601 Hard Disk product LACIE CDRW 0xa602 CD R/W /* Lake Shore Cryotronics products */ product LAKESHORE 121 0x0100 121 Current Source product LAKESHORE 218A 0x0200 218A Temperature Monitor product LAKESHORE 219 0x0201 219 Temperature Monitor product LAKESHORE 233 0x0202 233 Temperature Transmitter product LAKESHORE 235 0x0203 235 Temperature Transmitter product LAKESHORE 335 0x0300 335 Temperature Controller product LAKESHORE 336 0x0301 336 Temperature Controller product LAKESHORE 350 0x0302 350 Temperature Controller product LAKESHORE 371 0x0303 371 AC Bridge product LAKESHORE 411 0x0400 411 Handheld Gaussmeter product LAKESHORE 425 0x0401 425 Gaussmeter product LAKESHORE 455A 0x0402 455A DSP Gaussmeter product LAKESHORE 475A 0x0403 475A DSP Gaussmeter product LAKESHORE 465 0x0404 465 Gaussmeter product LAKESHORE 625A 0x0600 625A Magnet PSU product LAKESHORE 642A 0x0601 642A Magnet PSU product LAKESHORE 648 0x0602 648 Magnet PSU product LAKESHORE 737 0x0700 737 VSM Controller product LAKESHORE 776 0x0701 776 Matrix Switch /* Larsen and Brusgaard products */ product LARSENBRUSGAARD ALTITRACK 0x0001 FTDI compatible adapter /* Leadtek products */ product LEADTEK 9531 0x2101 9531 GPS /* Lenovo products */ product LENOVO ETHERNET 0x7203 USB 2.0 Ethernet /* Lexar products */ product LEXAR JUMPSHOT 0x0001 jumpSHOT CompactFlash Reader product LEXAR CF_READER 0xb002 USB CF Reader /* Lexmark products */ product LEXMARK S2450 0x0009 Optra S 2450 /* Liebert products */ product LIEBERT POWERSURE_PXT 0xffff PowerSure Personal XT product LIEBERT2 PSI1000 0x0004 UPS PSI 1000 FW:08 /* Link Instruments Inc. products */ product LINKINSTRUMENTS MSO19 0xf190 Link Instruments MSO-19 product LINKINSTRUMENTS MSO28 0xf280 Link Instruments MSO-28 product LINKINSTRUMENTS MSO28_2 0xf281 Link Instruments MSO-28 /* Linksys products */ product LINKSYS MAUSB2 0x0105 Camedia MAUSB-2 product LINKSYS USB10TX1 0x200c USB10TX product LINKSYS USB10T 0x2202 USB10T Ethernet product LINKSYS USB100TX 0x2203 USB100TX Ethernet product LINKSYS USB100H1 0x2204 USB100H1 Ethernet/HPNA product LINKSYS USB10TA 0x2206 USB10TA Ethernet product LINKSYS USB10TX2 0x400b USB10TX product LINKSYS2 WUSB11 0x2219 WUSB11 Wireless Adapter product LINKSYS2 USB200M 0x2226 USB 2.0 10/100 Ethernet product LINKSYS3 WUSB11v28 0x2233 WUSB11 v2.8 Wireless Adapter product LINKSYS4 USB1000 0x0039 USB1000 product LINKSYS4 WUSB100 0x0070 WUSB100 product LINKSYS4 WUSB600N 0x0071 WUSB600N product LINKSYS4 WUSB54GCV2 0x0073 WUSB54GC v2 product LINKSYS4 WUSB54GCV3 0x0077 WUSB54GC v3 product LINKSYS4 RT3070 0x0078 RT3070 product LINKSYS4 WUSB600NV2 0x0079 WUSB600N v2 /* Logilink products */ product LOGILINK DUMMY 0x0000 Dummy product product LOGILINK U2M 0x0101 LogiLink USB MIDI Cable /* Logitech products */ product LOGITECH M2452 0x0203 M2452 keyboard product LOGITECH M4848 0x0301 M4848 mouse product LOGITECH PAGESCAN 0x040f PageScan product LOGITECH QUICKCAMWEB 0x0801 QuickCam Web product LOGITECH QUICKCAMPRO 0x0810 QuickCam Pro product LOGITECH WEBCAMC100 0X0817 Webcam C100 product LOGITECH QUICKCAMEXP 0x0840 QuickCam Express product LOGITECH QUICKCAM 0x0850 QuickCam product LOGITECH QUICKCAMPRO3 0x0990 QuickCam Pro 9000 product LOGITECH N43 0xc000 N43 product LOGITECH N48 0xc001 N48 mouse product LOGITECH MBA47 0xc002 M-BA47 mouse product LOGITECH WMMOUSE 0xc004 WingMan Gaming Mouse product LOGITECH BD58 0xc00c BD58 mouse product LOGITECH UN58A 0xc030 iFeel Mouse product LOGITECH UN53B 0xc032 iFeel MouseMan product LOGITECH WMPAD 0xc208 WingMan GamePad Extreme product LOGITECH WMRPAD 0xc20a WingMan RumblePad product LOGITECH WMJOY 0xc281 WingMan Force joystick product LOGITECH BB13 0xc401 USB-PS/2 Trackball product LOGITECH RK53 0xc501 Cordless mouse product LOGITECH RB6 0xc503 Cordless keyboard product LOGITECH MX700 0xc506 Cordless optical mouse product LOGITECH QUICKCAMPRO2 0xd001 QuickCam Pro /* Logitec Corp. products */ product LOGITEC LDR_H443SU2 0x0033 DVD Multi-plus unit LDR-H443SU2 product LOGITEC LDR_H443U2 0x00b3 DVD Multi-plus unit LDR-H443U2 product LOGITEC LAN_GTJU2A 0x0160 LAN-GTJ/U2A Ethernet product LOGITEC RT2870_1 0x0162 RT2870 product LOGITEC RT2870_2 0x0163 RT2870 product LOGITEC RT2870_3 0x0164 RT2870 product LOGITEC LANW300NU2 0x0166 LAN-W300N/U2 product LOGITEC LANW150NU2 0x0168 LAN-W150N/U2 product LOGITEC LANW300NU2S 0x0169 LAN-W300N/U2S /* Longcheer Holdings, Ltd. products */ product LONGCHEER WM66 0x6061 Longcheer WM66 HSDPA product LONGCHEER W14 0x9603 Mobilcom W14 product LONGCHEER DISK 0xf000 Driver disk product LONGCHEER XSSTICK 0x9605 4G Systems XSStick P14 /* Lucent products */ product LUCENT EVALKIT 0x1001 USS-720 evaluation kit /* Luwen products */ product LUWEN EASYDISK 0x0005 EasyDisc /* Macally products */ product MACALLY MOUSE1 0x0101 mouse /* Mag-Tek products */ product MAGTEK USBSWIPE 0x0002 USB Mag Stripe Swipe Reader /* Marvell Technology Group, Ltd. products */ product MARVELL SHEEVAPLUG 0x9e8f SheevaPlug serial interface /* Matrix Orbital products */ product MATRIXORBITAL FTDI_RANGE_0100 0x0100 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0101 0x0101 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0102 0x0102 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0103 0x0103 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0104 0x0104 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0105 0x0105 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0106 0x0106 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0107 0x0107 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0108 0x0108 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0109 0x0109 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_010A 0x010a FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_010B 0x010b FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_010C 0x010c FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_010D 0x010d FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_010E 0x010e FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_010F 0x010f FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0110 0x0110 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0111 0x0111 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0112 0x0112 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0113 0x0113 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0114 0x0114 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0115 0x0115 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0116 0x0116 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0117 0x0117 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0118 0x0118 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0119 0x0119 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_011A 0x011a FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_011B 0x011b FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_011C 0x011c FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_011D 0x011d FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_011E 0x011e FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_011F 0x011f FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0120 0x0120 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0121 0x0121 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0122 0x0122 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0123 0x0123 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0124 0x0124 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0125 0x0125 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0126 0x0126 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0128 0x0128 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0129 0x0129 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_012A 0x012a FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_012B 0x012b FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_012D 0x012d FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_012E 0x012e FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_012F 0x012f FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0130 0x0130 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0131 0x0131 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0132 0x0132 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0133 0x0133 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0134 0x0134 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0135 0x0135 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0136 0x0136 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0137 0x0137 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0138 0x0138 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0139 0x0139 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_013A 0x013a FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_013B 0x013b FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_013C 0x013c FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_013D 0x013d FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_013E 0x013e FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_013F 0x013f FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0140 0x0140 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0141 0x0141 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0142 0x0142 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0143 0x0143 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0144 0x0144 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0145 0x0145 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0146 0x0146 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0147 0x0147 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0148 0x0148 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0149 0x0149 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_014A 0x014a FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_014B 0x014b FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_014C 0x014c FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_014D 0x014d FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_014E 0x014e FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_014F 0x014f FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0150 0x0150 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0151 0x0151 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0152 0x0152 FTDI compatible adapter product MATRIXORBITAL MOUA 0x0153 Martrix Orbital MOU-Axxxx LCD displays product MATRIXORBITAL FTDI_RANGE_0159 0x0159 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_015A 0x015a FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_015B 0x015b FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_015C 0x015c FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_015D 0x015d FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_015E 0x015e FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_015F 0x015f FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0160 0x0160 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0161 0x0161 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0162 0x0162 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0163 0x0163 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0164 0x0164 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0165 0x0165 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0166 0x0166 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0167 0x0167 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0168 0x0168 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0169 0x0169 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_016A 0x016a FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_016B 0x016b FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_016C 0x016c FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_016D 0x016d FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_016E 0x016e FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_016F 0x016f FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0170 0x0170 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0171 0x0171 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0172 0x0172 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0173 0x0173 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0174 0x0174 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0175 0x0175 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0176 0x0176 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0177 0x0177 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0178 0x0178 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0179 0x0179 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_017A 0x017a FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_017B 0x017b FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_017C 0x017c FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_017D 0x017d FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_017E 0x017e FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_017F 0x017f FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0180 0x0180 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0181 0x0181 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0182 0x0182 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0183 0x0183 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0184 0x0184 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0185 0x0185 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0186 0x0186 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0187 0x0187 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0188 0x0188 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0189 0x0189 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_018A 0x018a FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_018B 0x018b FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_018C 0x018c FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_018D 0x018d FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_018E 0x018e FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_018F 0x018f FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0190 0x0190 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0191 0x0191 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0192 0x0192 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0193 0x0193 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0194 0x0194 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0195 0x0195 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0196 0x0196 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0197 0x0197 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0198 0x0198 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_0199 0x0199 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_019A 0x019a FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_019B 0x019b FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_019C 0x019c FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_019D 0x019d FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_019E 0x019e FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_019F 0x019f FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01A0 0x01a0 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01A1 0x01a1 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01A2 0x01a2 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01A3 0x01a3 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01A4 0x01a4 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01A5 0x01a5 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01A6 0x01a6 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01A7 0x01a7 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01A8 0x01a8 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01A9 0x01a9 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01AA 0x01aa FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01AB 0x01ab FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01AC 0x01ac FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01AD 0x01ad FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01AE 0x01ae FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01AF 0x01af FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01B0 0x01b0 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01B1 0x01b1 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01B2 0x01b2 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01B3 0x01b3 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01B4 0x01b4 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01B5 0x01b5 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01B6 0x01b6 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01B7 0x01b7 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01B8 0x01b8 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01B9 0x01b9 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01BA 0x01ba FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01BB 0x01bb FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01BC 0x01bc FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01BD 0x01bd FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01BE 0x01be FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01BF 0x01bf FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01C0 0x01c0 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01C1 0x01c1 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01C2 0x01c2 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01C3 0x01c3 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01C4 0x01c4 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01C5 0x01c5 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01C6 0x01c6 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01C7 0x01c7 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01C8 0x01c8 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01C9 0x01c9 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01CA 0x01ca FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01CB 0x01cb FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01CC 0x01cc FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01CD 0x01cd FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01CE 0x01ce FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01CF 0x01cf FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01D0 0x01d0 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01D1 0x01d1 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01D2 0x01d2 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01D3 0x01d3 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01D4 0x01d4 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01D5 0x01d5 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01D6 0x01d6 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01D7 0x01d7 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01D8 0x01d8 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01D9 0x01d9 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01DA 0x01da FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01DB 0x01db FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01DC 0x01dc FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01DD 0x01dd FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01DE 0x01de FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01DF 0x01df FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01E0 0x01e0 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01E1 0x01e1 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01E2 0x01e2 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01E3 0x01e3 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01E4 0x01e4 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01E5 0x01e5 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01E6 0x01e6 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01E7 0x01e7 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01E8 0x01e8 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01E9 0x01e9 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01EA 0x01ea FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01EB 0x01eb FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01EC 0x01ec FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01ED 0x01ed FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01EE 0x01ee FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01EF 0x01ef FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01F0 0x01f0 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01F1 0x01f1 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01F2 0x01f2 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01F3 0x01f3 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01F4 0x01f4 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01F5 0x01f5 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01F6 0x01f6 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01F7 0x01f7 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01F8 0x01f8 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01F9 0x01f9 FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01FA 0x01fa FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01FB 0x01fb FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01FC 0x01fc FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01FD 0x01fd FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01FE 0x01fe FTDI compatible adapter product MATRIXORBITAL FTDI_RANGE_01FF 0x01ff FTDI compatible adapter /* MCT Corp. */ product MCT HUB0100 0x0100 Hub product MCT DU_H3SP_USB232 0x0200 D-Link DU-H3SP USB BAY Hub product MCT USB232 0x0210 USB-232 Interface product MCT SITECOM_USB232 0x0230 Sitecom USB-232 Products /* Medeli */ product MEDELI DD305 0x5011 DD305 Digital Drum Set /* MediaTek, Inc. */ product MEDIATEK MTK3329 0x3329 MTK II GPS Receiver /* Meizu Electronics */ product MEIZU M6_SL 0x0140 MiniPlayer M6 (SL) /* Melco, Inc products */ product MELCO LUATX1 0x0001 LUA-TX Ethernet product MELCO LUATX5 0x0005 LUA-TX Ethernet product MELCO LUA2TX5 0x0009 LUA2-TX Ethernet product MELCO LUAKTX 0x0012 LUA-KTX Ethernet product MELCO DUBPXXG 0x001c DUB-PxxG product MELCO LUAU2KTX 0x003d LUA-U2-KTX Ethernet product MELCO KG54YB 0x005e WLI-U2-KG54-YB WLAN product MELCO KG54 0x0066 WLI-U2-KG54 WLAN product MELCO KG54AI 0x0067 WLI-U2-KG54-AI WLAN product MELCO LUA3U2AGT 0x006e LUA3-U2-AGT product MELCO NINWIFI 0x008b Nintendo Wi-Fi product MELCO PCOPRS1 0x00b3 PC-OP-RS1 RemoteStation product MELCO SG54HP 0x00d8 WLI-U2-SG54HP product MELCO G54HP 0x00d9 WLI-U2-G54HP product MELCO KG54L 0x00da WLI-U2-KG54L product MELCO WLIUCG300N 0x00e8 WLI-UC-G300N product MELCO SG54HG 0x00f4 WLI-U2-SG54HG product MELCO WLRUCG 0x0116 WLR-UC-G product MELCO WLRUCGAOSS 0x0119 WLR-UC-G-AOSS product MELCO WLIUCAG300N 0x012e WLI-UC-AG300N product MELCO WLIUCG 0x0137 WLI-UC-G product MELCO RT2870_1 0x0148 RT2870 product MELCO RT2870_2 0x0150 RT2870 product MELCO WLIUCGN 0x015d WLI-UC-GN product MELCO WLIUCG301N 0x016f WLI-UC-G301N product MELCO WLIUCGNM 0x01a2 WLI-UC-GNM product MELCO WLIUCGNM2 0x01ee WLI-UC-GNM2 /* Merlin products */ product MERLIN V620 0x1110 Merlin V620 /* MetaGeek products */ product METAGEEK TELLSTICK 0x0c30 FTDI compatible adapter product METAGEEK WISPY1B 0x083e MetaGeek Wi-Spy product METAGEEK WISPY24X 0x083f MetaGeek Wi-Spy 2.4x product METAGEEK2 WISPYDBX 0x5000 MetaGeek Wi-Spy DBx /* Metricom products */ product METRICOM RICOCHET_GS 0x0001 Ricochet GS /* MGE UPS Systems */ product MGE UPS1 0x0001 MGE UPS SYSTEMS PROTECTIONCENTER 1 product MGE UPS2 0xffff MGE UPS SYSTEMS PROTECTIONCENTER 2 /* MEI products */ product MEI CASHFLOW_SC 0x1100 Cashflow-SC Cash Acceptor product MEI S2000 0x1101 Series 2000 Combo Acceptor /* Micro Star International products */ product MSI BT_DONGLE 0x1967 Bluetooth USB dongle product MSI RT3070_1 0x3820 RT3070 product MSI RT3070_2 0x3821 RT3070 product MSI RT3070_8 0x3822 RT3070 product MSI RT3070_3 0x3870 RT3070 product MSI RT3070_9 0x3871 RT3070 product MSI UB11B 0x6823 UB11B product MSI RT2570 0x6861 RT2570 product MSI RT2570_2 0x6865 RT2570 product MSI RT2570_3 0x6869 RT2570 product MSI RT2573_1 0x6874 RT2573 product MSI RT2573_2 0x6877 RT2573 product MSI RT3070_4 0x6899 RT3070 product MSI RT3070_5 0x821a RT3070 product MSI RT3070_10 0x822a RT3070 product MSI RT3070_6 0x870a RT3070 product MSI RT3070_11 0x871a RT3070 product MSI RT3070_7 0x899a RT3070 product MSI RT2573_3 0xa861 RT2573 product MSI RT2573_4 0xa874 RT2573 /* Micron products */ product MICRON REALSSD 0x0655 Real SSD eUSB /* Microsoft products */ product MICROSOFT SIDEPREC 0x0008 SideWinder Precision Pro product MICROSOFT INTELLIMOUSE 0x0009 IntelliMouse product MICROSOFT NATURALKBD 0x000b Natural Keyboard Elite product MICROSOFT DDS80 0x0014 Digital Sound System 80 product MICROSOFT SIDEWINDER 0x001a Sidewinder Precision Racing Wheel product MICROSOFT INETPRO 0x001c Internet Keyboard Pro product MICROSOFT TBEXPLORER 0x0024 Trackball Explorer product MICROSOFT INTELLIEYE 0x0025 IntelliEye mouse product MICROSOFT INETPRO2 0x002b Internet Keyboard Pro product MICROSOFT INTELLIMOUSE5 0x0039 IntelliMouse 1.1 5-Button Mouse product MICROSOFT WHEELMOUSE 0x0040 Wheel Mouse Optical product MICROSOFT MN510 0x006e MN510 Wireless product MICROSOFT 700WX 0x0079 Palm 700WX product MICROSOFT MN110 0x007a 10/100 USB NIC product MICROSOFT WLINTELLIMOUSE 0x008c Wireless Optical IntelliMouse product MICROSOFT WLNOTEBOOK 0x00b9 Wireless Optical Mouse (Model 1023) product MICROSOFT COMFORT3000 0x00d1 Comfort Optical Mouse 3000 (Model 1043) product MICROSOFT WLNOTEBOOK3 0x00d2 Wireless Optical Mouse 3000 (Model 1049) product MICROSOFT NATURAL4000 0x00db Natural Ergonomic Keyboard 4000 product MICROSOFT WLNOTEBOOK2 0x00e1 Wireless Optical Mouse 3000 (Model 1056) product MICROSOFT XBOX360 0x0292 XBOX 360 WLAN /* Microtech products */ product MICROTECH SCSIDB25 0x0004 USB-SCSI-DB25 product MICROTECH SCSIHD50 0x0005 USB-SCSI-HD50 product MICROTECH DPCM 0x0006 USB CameraMate product MICROTECH FREECOM 0xfc01 Freecom USB-IDE /* Microtek products */ product MICROTEK 336CX 0x0094 Phantom 336CX - C3 scanner product MICROTEK X6U 0x0099 ScanMaker X6 - X6U product MICROTEK C6 0x009a Phantom C6 scanner product MICROTEK 336CX2 0x00a0 Phantom 336CX - C3 scanner product MICROTEK V6USL 0x00a3 ScanMaker V6USL product MICROTEK V6USL2 0x80a3 ScanMaker V6USL product MICROTEK V6UL 0x80ac ScanMaker V6UL /* Microtune, Inc. products */ product MICROTUNE BT_DONGLE 0x1000 Bluetooth USB dongle /* Midiman products */ product MAUDIO MIDISPORT2X2 0x1001 Midisport 2x2 product MAUDIO FASTTRACKULTRA 0x2080 Fast Track Ultra product MAUDIO FASTTRACKULTRA8R 0x2081 Fast Track Ultra 8R /* MindsAtWork products */ product MINDSATWORK WALLET 0x0001 Digital Wallet /* Minolta Co., Ltd. */ product MINOLTA 2300 0x4001 Dimage 2300 product MINOLTA S304 0x4007 Dimage S304 product MINOLTA X 0x4009 Dimage X product MINOLTA 5400 0x400e Dimage 5400 product MINOLTA F300 0x4011 Dimage F300 product MINOLTA E223 0x4017 Dimage E223 /* Mitsumi products */ product MITSUMI CDRRW 0x0000 CD-R/RW Drive product MITSUMI BT_DONGLE 0x641f Bluetooth USB dongle product MITSUMI FDD 0x6901 USB FDD /* Mobile Action products */ product MOBILEACTION MA620 0x0620 MA-620 Infrared Adapter /* Mobility products */ product MOBILITY USB_SERIAL 0x0202 FTDI compatible adapter product MOBILITY EA 0x0204 Ethernet product MOBILITY EASIDOCK 0x0304 EasiDock Ethernet /* MosChip products */ product MOSCHIP MCS7703 0x7703 MCS7703 Serial Port Adapter product MOSCHIP MCS7730 0x7730 MCS7730 Ethernet product MOSCHIP MCS7820 0x7820 MCS7820 Serial Port Adapter product MOSCHIP MCS7830 0x7830 MCS7830 Ethernet product MOSCHIP MCS7832 0x7832 MCS7832 Ethernet product MOSCHIP MCS7840 0x7840 MCS7840 Serial Port Adapter /* Motorola products */ product MOTOROLA MC141555 0x1555 MC141555 hub controller product MOTOROLA SB4100 0x4100 SB4100 USB Cable Modem product MOTOROLA2 T720C 0x2822 T720c product MOTOROLA2 A41XV32X 0x2a22 A41x/V32x Mobile Phones product MOTOROLA2 E398 0x4810 E398 Mobile Phone product MOTOROLA2 USBLAN 0x600c USBLAN product MOTOROLA2 USBLAN2 0x6027 USBLAN product MOTOROLA2 MB886 0x710f MB886 Mobile Phone (Atria HD) product MOTOROLA4 RT2770 0x9031 RT2770 product MOTOROLA4 RT3070 0x9032 RT3070 /* MpMan products */ product MPMAN MPF400_2 0x25a8 MPF400 Music Player 2Go product MPMAN MPF400_1 0x36d0 MPF400 Music Player 1Go /* MultiTech products */ product MULTITECH ATLAS 0xf101 MT5634ZBA-USB modem /* Mustek products */ product MUSTEK 1200CU 0x0001 1200 CU scanner product MUSTEK 600CU 0x0002 600 CU scanner product MUSTEK 1200USB 0x0003 1200 USB scanner product MUSTEK 1200UB 0x0006 1200 UB scanner product MUSTEK 1200USBPLUS 0x0007 1200 USB Plus scanner product MUSTEK 1200CUPLUS 0x0008 1200 CU Plus scanner product MUSTEK BEARPAW1200F 0x0010 BearPaw 1200F scanner product MUSTEK BEARPAW2400TA 0x0218 BearPaw 2400TA scanner product MUSTEK BEARPAW1200TA 0x021e BearPaw 1200TA scanner product MUSTEK 600USB 0x0873 600 USB scanner product MUSTEK MDC800 0xa800 MDC-800 digital camera /* M-Systems products */ product MSYSTEMS DISKONKEY 0x0010 DiskOnKey product MSYSTEMS DISKONKEY2 0x0011 DiskOnKey /* Myson products */ product MYSON HEDEN_8813 0x8813 USB-IDE product MYSON HEDEN 0x8818 USB-IDE product MYSON HUBREADER 0x8819 COMBO Card reader with USB HUB product MYSON STARREADER 0x9920 USB flash card adapter /* National Semiconductor */ product NATIONAL BEARPAW1200 0x1000 BearPaw 1200 product NATIONAL BEARPAW2400 0x1001 BearPaw 2400 /* NEC products */ product NEC HUB_0050 0x0050 USB 2.0 7-Port Hub product NEC HUB_005A 0x005a USB 2.0 4-Port Hub product NEC HUB 0x55aa hub product NEC HUB_B 0x55ab hub /* NEODIO products */ product NEODIO ND3260 0x3260 8-in-1 Multi-format Flash Controller product NEODIO ND5010 0x5010 Multi-format Flash Controller /* Neotel products */ product NEOTEL PRIME 0x4000 Prime USB modem /* Netac products */ product NETAC CF_CARD 0x1060 USB-CF-Card product NETAC ONLYDISK 0x0003 OnlyDisk /* NetChip Technology Products */ product NETCHIP TURBOCONNECT 0x1080 Turbo-Connect product NETCHIP CLIK_40 0xa140 USB Clik! 40 product NETCHIP GADGETZERO 0xa4a0 Linux Gadget Zero product NETCHIP ETHERNETGADGET 0xa4a2 Linux Ethernet/RNDIS gadget on pxa210/25x/26x product NETCHIP POCKETBOOK 0xa4a5 PocketBook /* Netgear products */ product NETGEAR EA101 0x1001 Ethernet product NETGEAR EA101X 0x1002 Ethernet product NETGEAR FA101 0x1020 Ethernet 10/100, USB1.1 product NETGEAR FA120 0x1040 USB 2.0 Ethernet product NETGEAR M4100 0x1100 M4100/M5300/M7100 series switch product NETGEAR WG111V2_2 0x4240 PrismGT USB 2.0 WLAN product NETGEAR WG111V3 0x4260 WG111v3 product NETGEAR WG111U 0x4300 WG111U product NETGEAR WG111U_NF 0x4301 WG111U (no firmware) product NETGEAR WG111V2 0x6a00 WG111V2 product NETGEAR RTL8192CU 0x9021 RTL8192CU product NETGEAR WNA1000M 0x9041 WNA1000M product NETGEAR2 MA101 0x4100 MA101 product NETGEAR2 MA101B 0x4102 MA101 Rev B product NETGEAR3 WG111T 0x4250 WG111T product NETGEAR3 WG111T_NF 0x4251 WG111T (no firmware) product NETGEAR3 WPN111 0x5f00 WPN111 product NETGEAR3 WPN111_NF 0x5f01 WPN111 (no firmware) product NETGEAR3 WPN111_2 0x5f02 WPN111 product NETGEAR4 RTL8188CU 0x9041 RTL8188CU /* NetIndex products */ product NETINDEX WS002IN 0x2001 Willcom WS002IN /* NEWlink */ product NEWLINK USB2IDEBRIDGE 0x00ff USB 2.0 Hard Drive Enclosure /* Nikon products */ product NIKON E990 0x0102 Digital Camera E990 product NIKON LS40 0x4000 CoolScan LS40 ED product NIKON D300 0x041a Digital Camera D300 /* NovaTech Products */ product NOVATECH NV902 0x9020 NovaTech NV-902W product NOVATECH RT2573 0x9021 RT2573 product NOVATECH RTL8188CU 0x9071 RTL8188CU /* Nokia products */ product NOKIA N958GB 0x0070 Nokia N95 8GBc product NOKIA2 CA42 0x1234 CA-42 cable /* Novatel Wireless products */ product NOVATEL V640 0x1100 Merlin V620 product NOVATEL CDMA_MODEM 0x1110 Novatel Wireless Merlin CDMA product NOVATEL V620 0x1110 Merlin V620 product NOVATEL V740 0x1120 Merlin V740 product NOVATEL V720 0x1130 Merlin V720 product NOVATEL U740 0x1400 Merlin U740 product NOVATEL U740_2 0x1410 Merlin U740 product NOVATEL U870 0x1420 Merlin U870 product NOVATEL XU870 0x1430 Merlin XU870 product NOVATEL X950D 0x1450 Merlin X950D product NOVATEL ES620 0x2100 Expedite ES620 product NOVATEL E725 0x2120 Expedite E725 product NOVATEL ES620_2 0x2130 Expedite ES620 product NOVATEL ES620 0x2100 ES620 CDMA product NOVATEL U720 0x2110 Merlin U720 product NOVATEL EU730 0x2400 Expedite EU730 product NOVATEL EU740 0x2410 Expedite EU740 product NOVATEL EU870D 0x2420 Expedite EU870D product NOVATEL U727 0x4100 Merlin U727 CDMA product NOVATEL MC950D 0x4400 Novatel MC950D HSUPA product NOVATEL MC990D 0x7001 Novatel MC990D product NOVATEL ZEROCD 0x5010 Novatel ZeroCD product NOVATEL MIFI2200V 0x5020 Novatel MiFi 2200 CDMA Virgin Mobile product NOVATEL ZEROCD2 0x5030 Novatel ZeroCD product NOVATEL MIFI2200 0x5041 Novatel MiFi 2200 CDMA product NOVATEL U727_2 0x5100 Merlin U727 CDMA product NOVATEL U760 0x6000 Novatel U760 product NOVATEL MC760 0x6002 Novatel MC760 product NOVATEL MC547 0x7042 Novatel MC547 product NOVATEL MC679 0x7031 Novatel MC679 product NOVATEL2 FLEXPACKGPS 0x0100 NovAtel FlexPack GPS receiver /* Merlin products */ product MERLIN V620 0x1110 Merlin V620 /* O2Micro products */ product O2MICRO OZ776_HUB 0x7761 OZ776 hub product O2MICRO OZ776_CCID_SC 0x7772 OZ776 CCID SC Reader /* Olimex products */ product OLIMEX ARM_USB_OCD 0x0003 FTDI compatible adapter product OLIMEX ARM_USB_OCD_H 0x002b FTDI compatible adapter /* Olympus products */ product OLYMPUS C1 0x0102 C-1 Digital Camera product OLYMPUS C700 0x0105 C-700 Ultra Zoom /* OmniVision Technologies, Inc. products */ product OMNIVISION OV511 0x0511 OV511 Camera product OMNIVISION OV511PLUS 0xa511 OV511+ Camera /* OnSpec Electronic, Inc. */ product ONSPEC SDS_HOTFIND_D 0x0400 SDS-infrared.com Hotfind-D Infrared Camera product ONSPEC MDCFE_B_CF_READER 0xa000 MDCFE-B USB CF Reader product ONSPEC CFMS_RW 0xa001 SIIG/Datafab Memory Stick+CF Reader/Writer product ONSPEC READER 0xa003 Datafab-based Reader product ONSPEC CFSM_READER 0xa005 PNY/Datafab CF+SM Reader product ONSPEC CFSM_READER2 0xa006 Simple Tech/Datafab CF+SM Reader product ONSPEC MDSM_B_READER 0xa103 MDSM-B reader product ONSPEC CFSM_COMBO 0xa109 USB to CF + SM Combo (LC1) product ONSPEC UCF100 0xa400 FlashLink UCF-100 CompactFlash Reader product ONSPEC2 IMAGEMATE_SDDR55 0xa103 ImageMate SDDR55 /* Option products */ product OPTION VODAFONEMC3G 0x5000 Vodafone Mobile Connect 3G datacard product OPTION GT3G 0x6000 GlobeTrotter 3G datacard product OPTION GT3GQUAD 0x6300 GlobeTrotter 3G QUAD datacard product OPTION GT3GPLUS 0x6600 GlobeTrotter 3G+ datacard product OPTION GTICON322 0xd033 GlobeTrotter Icon322 storage product OPTION GTMAX36 0x6701 GlobeTrotter Max 3.6 Modem product OPTION GTMAX72 0x6711 GlobeTrotter Max 7.2 HSDPA product OPTION GTHSDPA 0x6971 GlobeTrotter HSDPA product OPTION GTMAXHSUPA 0x7001 GlobeTrotter HSUPA product OPTION GTMAXHSUPAE 0x6901 GlobeTrotter HSUPA PCIe product OPTION GTMAX380HSUPAE 0x7211 GlobeTrotter 380HSUPA PCIe product OPTION GT3G_1 0x6050 3G modem product OPTION GT3G_2 0x6100 3G modem product OPTION GT3G_3 0x6150 3G modem product OPTION GT3G_4 0x6200 3G modem product OPTION GT3G_5 0x6250 3G modem product OPTION GT3G_6 0x6350 3G modem product OPTION E6500 0x6500 3G modem product OPTION E6501 0x6501 3G modem product OPTION E6601 0x6601 3G modem product OPTION E6721 0x6721 3G modem product OPTION E6741 0x6741 3G modem product OPTION E6761 0x6761 3G modem product OPTION E6800 0x6800 3G modem product OPTION E7021 0x7021 3G modem product OPTION E7041 0x7041 3G modem product OPTION E7061 0x7061 3G modem product OPTION E7100 0x7100 3G modem product OPTION GTM380 0x7201 3G modem product OPTION GE40X 0x7601 Globetrotter HSUPA product OPTION GSICON72 0x6911 GlobeSurfer iCON product OPTION GSICONHSUPA 0x7251 Globetrotter HSUPA product OPTION ICON401 0x7401 GlobeSurfer iCON 401 product OPTION GTHSUPA 0x7011 Globetrotter HSUPA product OPTION GMT382 0x7501 Globetrotter HSUPA product OPTION GE40X_1 0x7301 Globetrotter HSUPA product OPTION GE40X_2 0x7361 Globetrotter HSUPA product OPTION GE40X_3 0x7381 Globetrotter HSUPA product OPTION GTM661W 0x9000 GTM661W product OPTION ICONEDGE 0xc031 GlobeSurfer iCON EDGE product OPTION MODHSXPA 0xd013 Globetrotter HSUPA product OPTION ICON321 0xd031 Globetrotter HSUPA product OPTION ICON505 0xd055 Globetrotter iCON 505 product OPTION ICON452 0x7901 Globetrotter iCON 452 /* Optoelectronics Co., Ltd */ product OPTO BARCODE 0x0001 Barcode Reader product OPTO OPTICONCODE 0x0009 Opticon Code Reader product OPTO BARCODE_1 0xa002 Barcode Reader product OPTO CRD7734 0xc000 USB Cradle CRD-7734-RU product OPTO CRD7734_1 0xc001 USB Cradle CRD-7734-RU /* OvisLink product */ product OVISLINK RT3072 0x3072 RT3072 /* OQO */ product OQO WIFI01 0x0002 model 01 WiFi interface product OQO BT01 0x0003 model 01 Bluetooth interface product OQO ETHER01PLUS 0x7720 model 01+ Ethernet product OQO ETHER01 0x8150 model 01 Ethernet interface /* Ours Technology Inc. */ product OTI DKU5 0x6858 DKU-5 Serial /* Owen.ru products */ product OWEN AC4 0x0004 AC4 USB-RS485 converter /* OWL producs */ product OWL CM_160 0xca05 OWL CM-160 power monitor /* Palm Computing, Inc. product */ product PALM SERIAL 0x0080 USB Serial product PALM M500 0x0001 Palm m500 product PALM M505 0x0002 Palm m505 product PALM M515 0x0003 Palm m515 product PALM I705 0x0020 Palm i705 product PALM TUNGSTEN_Z 0x0031 Palm Tungsten Z product PALM M125 0x0040 Palm m125 product PALM M130 0x0050 Palm m130 product PALM TUNGSTEN_T 0x0060 Palm Tungsten T product PALM ZIRE31 0x0061 Palm Zire 31 product PALM ZIRE 0x0070 Palm Zire /* Panasonic products */ product PANASONIC LS120CAM 0x0901 LS-120 Camera product PANASONIC KXL840AN 0x0d01 CD-R Drive KXL-840AN product PANASONIC KXLRW32AN 0x0d09 CD-R Drive KXL-RW32AN product PANASONIC KXLCB20AN 0x0d0a CD-R Drive KXL-CB20AN product PANASONIC KXLCB35AN 0x0d0e DVD-ROM & CD-R/RW product PANASONIC SDCAAE 0x1b00 MultiMediaCard product PANASONIC TYTP50P6S 0x3900 TY-TP50P6-S 50in Touch Panel /* Papouch products */ product PAPOUCH AD4USB 0x8003 FTDI compatible adapter product PAPOUCH AP485 0x0101 FTDI compatible adapter product PAPOUCH AP485_2 0x0104 FTDI compatible adapter product PAPOUCH DRAK5 0x0700 FTDI compatible adapter product PAPOUCH DRAK6 0x1000 FTDI compatible adapter product PAPOUCH GMSR 0x8005 FTDI compatible adapter product PAPOUCH GMUX 0x8004 FTDI compatible adapter product PAPOUCH IRAMP 0x0500 FTDI compatible adapter product PAPOUCH LEC 0x0300 FTDI compatible adapter product PAPOUCH MU 0x8001 FTDI compatible adapter product PAPOUCH QUIDO10X1 0x0b00 FTDI compatible adapter product PAPOUCH QUIDO2X16 0x0e00 FTDI compatible adapter product PAPOUCH QUIDO2X2 0x0a00 FTDI compatible adapter product PAPOUCH QUIDO30X3 0x0c00 FTDI compatible adapter product PAPOUCH QUIDO3X32 0x0f00 FTDI compatible adapter product PAPOUCH QUIDO4X4 0x0900 FTDI compatible adapter product PAPOUCH QUIDO60X3 0x0d00 FTDI compatible adapter product PAPOUCH QUIDO8X8 0x0800 FTDI compatible adapter product PAPOUCH SB232 0x0301 FTDI compatible adapter product PAPOUCH SB422 0x0102 FTDI compatible adapter product PAPOUCH SB422_2 0x0105 FTDI compatible adapter product PAPOUCH SB485 0x0100 FTDI compatible adapter product PAPOUCH SB485C 0x0107 FTDI compatible adapter product PAPOUCH SB485S 0x0106 FTDI compatible adapter product PAPOUCH SB485_2 0x0103 FTDI compatible adapter product PAPOUCH SIMUKEY 0x8002 FTDI compatible adapter product PAPOUCH TMU 0x0400 FTDI compatible adapter product PAPOUCH UPSUSB 0x8000 FTDI compatible adapter /* PARA Industrial products */ product PARA RT3070 0x8888 RT3070 /* Simtec Electronics products */ product SIMTEC ENTROPYKEY 0x0001 Entropy Key /* Pegatron products */ product PEGATRON RT2870 0x0002 RT2870 product PEGATRON RT3070 0x000c RT3070 product PEGATRON RT3070_2 0x000e RT3070 product PEGATRON RT3070_3 0x0010 RT3070 /* Peracom products */ product PERACOM SERIAL1 0x0001 Serial product PERACOM ENET 0x0002 Ethernet product PERACOM ENET3 0x0003 At Home Ethernet product PERACOM ENET2 0x0005 Ethernet /* Philips products */ product PHILIPS DSS350 0x0101 DSS 350 Digital Speaker System product PHILIPS DSS 0x0104 DSS XXX Digital Speaker System product PHILIPS HUB 0x0201 hub product PHILIPS PCA646VC 0x0303 PCA646VC PC Camera product PHILIPS PCVC680K 0x0308 PCVC680K Vesta Pro PC Camera product PHILIPS DSS150 0x0471 DSS 150 Digital Speaker System product PHILIPS ACE1001 0x066a AKTAKOM ACE-1001 cable product PHILIPS SPE3030CC 0x083a USB 2.0 External Disk product PHILIPS SNU5600 0x1236 SNU5600 product PHILIPS UM10016 0x1552 ISP 1581 Hi-Speed USB MPEG2 Encoder Reference Kit product PHILIPS DIVAUSB 0x1801 DIVA USB mp3 player product PHILIPS RT2870 0x200f RT2870 /* Philips Semiconductor products */ product PHILIPSSEMI HUB1122 0x1122 HUB /* Megatec */ product MEGATEC UPS 0x5161 Phoenixtec protocol based UPS /* P.I. Engineering products */ product PIENGINEERING PS2USB 0x020b PS2 to Mac USB Adapter /* Planex Communications products */ product PLANEX GW_US11H 0x14ea GW-US11H WLAN product PLANEX2 RTL8188CUS 0x1201 RTL8188CUS product PLANEX2 GW_US11S 0x3220 GW-US11S WLAN product PLANEX2 GW_US54GXS 0x5303 GW-US54GXS WLAN product PLANEX2 RTL8188CU_1 0xab2a RTL8188CU product PLANEX2 RTL8188CU_2 0xed17 RTL8188CU product PLANEX2 RTL8188CU_3 0x4902 RTL8188CU product PLANEX2 RTL8188CU_4 0xab2e RTL8188CU product PLANEX2 RTL8192CU 0xab2b RTL8192CU product PLANEX2 GWUS54HP 0xab01 GW-US54HP product PLANEX2 GWUS300MINIS 0xab24 GW-US300MiniS product PLANEX2 RT3070 0xab25 RT3070 product PLANEX2 MZKUE150N 0xab2f MZK-UE150N product PLANEX2 GWUS54MINI2 0xab50 GW-US54Mini2 product PLANEX2 GWUS54SG 0xc002 GW-US54SG product PLANEX2 GWUS54GZL 0xc007 GW-US54GZL product PLANEX2 GWUS54GD 0xed01 GW-US54GD product PLANEX2 GWUSMM 0xed02 GW-USMM product PLANEX2 RT2870 0xed06 RT2870 product PLANEX2 GWUSMICRON 0xed14 GW-USMicroN product PLANEX2 GWUSVALUEEZ 0xed17 GW-USValue-EZ product PLANEX3 GWUS54GZ 0xab10 GW-US54GZ product PLANEX3 GU1000T 0xab11 GU-1000T product PLANEX3 GWUS54MINI 0xab13 GW-US54Mini product PLANEX2 GWUSNANO 0xab28 GW-USNano /* Plextor Corp. */ product PLEXTOR 40_12_40U 0x0011 PlexWriter 40/12/40U /* PLX products */ product PLX TESTBOARD 0x9060 test board product PLX CA42 0xac70 CA-42 /* PNY products */ product PNY ATTACHE2 0x0010 USB 2.0 Flash Drive /* PortGear products */ product PORTGEAR EA8 0x0008 Ethernet product PORTGEAR EA9 0x0009 Ethernet /* Portsmith products */ product PORTSMITH EEA 0x3003 Express Ethernet /* Posiflex products */ product POSIFLEX PP7000 0x0300 FTDI compatible adapter /* Primax products */ product PRIMAX G2X300 0x0300 G2-200 scanner product PRIMAX G2E300 0x0301 G2E-300 scanner product PRIMAX G2300 0x0302 G2-300 scanner product PRIMAX G2E3002 0x0303 G2E-300 scanner product PRIMAX 9600 0x0340 Colorado USB 9600 scanner product PRIMAX 600U 0x0341 Colorado 600u scanner product PRIMAX 6200 0x0345 Visioneer 6200 scanner product PRIMAX 19200 0x0360 Colorado USB 19200 scanner product PRIMAX 1200U 0x0361 Colorado 1200u scanner product PRIMAX G600 0x0380 G2-600 scanner product PRIMAX 636I 0x0381 ReadyScan 636i product PRIMAX G2600 0x0382 G2-600 scanner product PRIMAX G2E600 0x0383 G2E-600 scanner product PRIMAX COMFORT 0x4d01 Comfort product PRIMAX MOUSEINABOX 0x4d02 Mouse-in-a-Box product PRIMAX PCGAUMS1 0x4d04 Sony PCGA-UMS1 product PRIMAX HP_RH304AA 0x4d17 HP RH304AA mouse /* Prolific products */ product PROLIFIC PL2301 0x0000 PL2301 Host-Host interface product PROLIFIC PL2302 0x0001 PL2302 Host-Host interface product PROLIFIC MOTOROLA 0x0307 Motorola Cable product PROLIFIC RSAQ2 0x04bb PL2303 Serial (IODATA USB-RSAQ2) product PROLIFIC ALLTRONIX_GPRS 0x0609 Alltronix ACM003U00 modem product PROLIFIC ALDIGA_AL11U 0x0611 AlDiga AL-11U modem product PROLIFIC MICROMAX_610U 0x0612 Micromax 610U product PROLIFIC DCU11 0x1234 DCU-11 Phone Cable product PROLIFIC UIC_MSR206 0x206a UIC MSR206 Card Reader product PROLIFIC PL2303 0x2303 PL2303 Serial (ATEN/IOGEAR UC232A) product PROLIFIC PL2305 0x2305 Parallel printer product PROLIFIC ATAPI4 0x2307 ATAPI-4 Controller product PROLIFIC PL2501 0x2501 PL2501 Host-Host interface product PROLIFIC PL2506 0x2506 PL2506 USB to IDE Bridge product PROLIFIC HCR331 0x331a HCR331 Hybrid Card Reader product PROLIFIC PHAROS 0xaaa0 Prolific Pharos product PROLIFIC RSAQ3 0xaaa2 PL2303 Serial Adapter (IODATA USB-RSAQ3) product PROLIFIC2 PL2303 0x2303 PL2303 Serial Adapter /* Putercom products */ product PUTERCOM UPA100 0x047e USB-1284 BRIDGE /* Qcom products */ product QCOM RT2573 0x6196 RT2573 product QCOM RT2573_2 0x6229 RT2573 product QCOM RT2573_3 0x6238 RT2573 product QCOM RT2870 0x6259 RT2870 /* QI-hardware */ product QIHARDWARE JTAGSERIAL 0x0713 FTDI compatible adapter /* Qisda products */ product QISDA H21_1 0x4512 3G modem product QISDA H21_2 0x4523 3G modem product QISDA H20_1 0x4515 3G modem product QISDA H20_2 0x4519 3G modem /* Qualcomm products */ product QUALCOMM CDMA_MSM 0x6000 CDMA Technologies MSM phone product QUALCOMM NTT_L02C_MODEM 0x618f NTT DOCOMO L-02C product QUALCOMM NTT_L02C_STORAGE 0x61dd NTT DOCOMO L-02C product QUALCOMM2 MF330 0x6613 MF330 product QUALCOMM2 RWT_FCT 0x3100 RWT FCT-CDMA 2000 1xRTT modem product QUALCOMM2 CDMA_MSM 0x3196 CDMA Technologies MSM modem product QUALCOMM2 AC8700 0x6000 AC8700 product QUALCOMM2 VW110L 0x1000 Vertex Wireless 110L modem product QUALCOMM2 SIM5218 0x9000 SIM5218 product QUALCOMM2 GOBI2000_QDL 0x9204 Qualcomm Gobi 2000 QDL product QUALCOMM2 GOBI2000 0x9205 Qualcomm Gobi 2000 modem product QUALCOMMINC CDMA_MSM 0x0001 CDMA Technologies MSM modem product QUALCOMMINC E0002 0x0002 3G modem product QUALCOMMINC E0003 0x0003 3G modem product QUALCOMMINC E0004 0x0004 3G modem product QUALCOMMINC E0005 0x0005 3G modem product QUALCOMMINC E0006 0x0006 3G modem product QUALCOMMINC E0007 0x0007 3G modem product QUALCOMMINC E0008 0x0008 3G modem product QUALCOMMINC E0009 0x0009 3G modem product QUALCOMMINC E000A 0x000a 3G modem product QUALCOMMINC E000B 0x000b 3G modem product QUALCOMMINC E000C 0x000c 3G modem product QUALCOMMINC E000D 0x000d 3G modem product QUALCOMMINC E000E 0x000e 3G modem product QUALCOMMINC E000F 0x000f 3G modem product QUALCOMMINC E0010 0x0010 3G modem product QUALCOMMINC E0011 0x0011 3G modem product QUALCOMMINC E0012 0x0012 3G modem product QUALCOMMINC E0013 0x0013 3G modem product QUALCOMMINC E0014 0x0014 3G modem product QUALCOMMINC MF628 0x0015 3G modem product QUALCOMMINC MF633R 0x0016 ZTE WCDMA modem product QUALCOMMINC E0017 0x0017 3G modem product QUALCOMMINC E0018 0x0018 3G modem product QUALCOMMINC E0019 0x0019 3G modem product QUALCOMMINC E0020 0x0020 3G modem product QUALCOMMINC E0021 0x0021 3G modem product QUALCOMMINC E0022 0x0022 3G modem product QUALCOMMINC E0023 0x0023 3G modem product QUALCOMMINC E0024 0x0024 3G modem product QUALCOMMINC E0025 0x0025 3G modem product QUALCOMMINC E0026 0x0026 3G modem product QUALCOMMINC E0027 0x0027 3G modem product QUALCOMMINC E0028 0x0028 3G modem product QUALCOMMINC E0029 0x0029 3G modem product QUALCOMMINC E0030 0x0030 3G modem product QUALCOMMINC MF626 0x0031 3G modem product QUALCOMMINC E0032 0x0032 3G modem product QUALCOMMINC E0033 0x0033 3G modem product QUALCOMMINC E0037 0x0037 3G modem product QUALCOMMINC E0039 0x0039 3G modem product QUALCOMMINC E0042 0x0042 3G modem product QUALCOMMINC E0043 0x0043 3G modem product QUALCOMMINC E0048 0x0048 3G modem product QUALCOMMINC E0049 0x0049 3G modem product QUALCOMMINC E0051 0x0051 3G modem product QUALCOMMINC E0052 0x0052 3G modem product QUALCOMMINC ZTE_STOR2 0x0053 USB ZTE Storage product QUALCOMMINC E0054 0x0054 3G modem product QUALCOMMINC E0055 0x0055 3G modem product QUALCOMMINC E0057 0x0057 3G modem product QUALCOMMINC E0058 0x0058 3G modem product QUALCOMMINC E0059 0x0059 3G modem product QUALCOMMINC E0060 0x0060 3G modem product QUALCOMMINC E0061 0x0061 3G modem product QUALCOMMINC E0062 0x0062 3G modem product QUALCOMMINC E0063 0x0063 3G modem product QUALCOMMINC E0064 0x0064 3G modem product QUALCOMMINC E0066 0x0066 3G modem product QUALCOMMINC E0069 0x0069 3G modem product QUALCOMMINC E0070 0x0070 3G modem product QUALCOMMINC E0073 0x0073 3G modem product QUALCOMMINC E0076 0x0076 3G modem product QUALCOMMINC E0078 0x0078 3G modem product QUALCOMMINC E0082 0x0082 3G modem product QUALCOMMINC E0086 0x0086 3G modem product QUALCOMMINC SURFSTICK 0x0117 1&1 Surf Stick product QUALCOMMINC K3772_Z_INIT 0x1179 K3772-Z Initial product QUALCOMMINC K3772_Z 0x1181 K3772-Z +product QUALCOMMINC MF195E_INIT 0x1514 MF195E initial +product QUALCOMMINC MF195E 0x1516 MF195E product QUALCOMMINC ZTE_STOR 0x2000 USB ZTE Storage product QUALCOMMINC E2002 0x2002 3G modem product QUALCOMMINC E2003 0x2003 3G modem product QUALCOMMINC AC8710 0xfff1 3G modem product QUALCOMMINC AC2726 0xfff5 3G modem product QUALCOMMINC AC8700 0xfffe CDMA 1xEVDO USB modem /* Quanta products */ product QUANTA RW6815_1 0x00ce HP iPAQ rw6815 product QUANTA RT3070 0x0304 RT3070 product QUANTA Q101_STOR 0x1000 USB Q101 Storage product QUANTA Q101 0xea02 HSDPA modem product QUANTA Q111 0xea03 HSDPA modem product QUANTA GLX 0xea04 HSDPA modem product QUANTA GKE 0xea05 HSDPA modem product QUANTA GLE 0xea06 HSDPA modem product QUANTA RW6815R 0xf003 HP iPAQ rw6815 RNDIS /* Qtronix products */ product QTRONIX 980N 0x2011 Scorpion-980N keyboard /* Quickshot products */ product QUICKSHOT STRIKEPAD 0x6238 USB StrikePad /* Radio Shack */ product RADIOSHACK USBCABLE 0x4026 USB to Serial Cable /* Rainbow Technologies products */ product RAINBOW IKEY2000 0x1200 i-Key 2000 /* Ralink Technology products */ product RALINK RT2570 0x1706 RT2500USB Wireless Adapter product RALINK RT2070 0x2070 RT2070 product RALINK RT2570_2 0x2570 RT2500USB Wireless Adapter product RALINK RT2573 0x2573 RT2501USB Wireless Adapter product RALINK RT2671 0x2671 RT2601USB Wireless Adapter product RALINK RT2770 0x2770 RT2770 product RALINK RT2870 0x2870 RT2870 product RALINK RT_STOR 0x2878 USB Storage product RALINK RT3070 0x3070 RT3070 product RALINK RT3071 0x3071 RT3071 product RALINK RT3072 0x3072 RT3072 product RALINK RT3370 0x3370 RT3370 product RALINK RT3572 0x3572 RT3572 product RALINK RT3573 0x3573 RT3573 product RALINK RT5370 0x5370 RT5370 product RALINK RT5572 0x5572 RT5572 product RALINK RT8070 0x8070 RT8070 product RALINK RT2570_3 0x9020 RT2500USB Wireless Adapter product RALINK RT2573_2 0x9021 RT2501USB Wireless Adapter /* RATOC Systems products */ product RATOC REXUSB60 0xb000 USB serial adapter REX-USB60 product RATOC REXUSB60F 0xb020 USB serial adapter REX-USB60F /* Realtek products */ /* Green House and CompUSA OEM this part */ product REALTEK DUMMY 0x0000 Dummy product product REALTEK USB20CRW 0x0158 USB20CRW Card Reader product REALTEK RTL8188ETV 0x0179 RTL8188ETV product REALTEK RTL8188CTV 0x018a RTL8188CTV product REALTEK USBKR100 0x8150 USBKR100 USB Ethernet product REALTEK RTL8188CE_0 0x8170 RTL8188CE product REALTEK RTL8171 0x8171 RTL8171 product REALTEK RTL8172 0x8172 RTL8172 product REALTEK RTL8173 0x8173 RTL8173 product REALTEK RTL8174 0x8174 RTL8174 product REALTEK RTL8188CU_0 0x8176 RTL8188CU product REALTEK RTL8188EU 0x8179 RTL8188EU product REALTEK RTL8188CE_1 0x817e RTL8188CE product REALTEK RTL8188CU_1 0x817a RTL8188CU product REALTEK RTL8188CU_2 0x817b RTL8188CU product REALTEK RTL8187 0x8187 RTL8187 Wireless Adapter product REALTEK RTL8187B_0 0x8189 RTL8187B Wireless Adapter product REALTEK RTL8196EU 0x8196 RTL8196EU product REALTEK RTL8187B_1 0x8197 RTL8187B Wireless Adapter product REALTEK RTL8187B_2 0x8198 RTL8187B Wireless Adapter product REALTEK RTL8188CUS 0x818a RTL8188CUS product REALTEK RTL8188CU_COMBO 0x8754 RTL8188CU product REALTEK RTL8191CU 0x8177 RTL8191CU product REALTEK RTL8192CU 0x8178 RTL8192CU product REALTEK RTL8192CE 0x817c RTL8192CE product REALTEK RTL8188RU_1 0x817d RTL8188RU product REALTEK RTL8188RU_3 0x817f RTL8188RU product REALTEK RTL8712 0x8712 RTL8712 product REALTEK RTL8713 0x8712 RTL8713 product REALTEK RTL8188RU_2 0x317f RTL8188RU product REALTEK RTL8192SU 0xc512 RTL8192SU /* RedOctane products */ product REDOCTANE DUMMY 0x0000 Dummy product product REDOCTANE GHMIDI 0x474b GH MIDI INTERFACE /* Renesas products */ product RENESAS RX610 0x0053 RX610 RX-Stick /* Ricoh products */ product RICOH VGPVCC2 0x1830 VGP-VCC2 Camera product RICOH VGPVCC3 0x1832 VGP-VCC3 Camera product RICOH VGPVCC2_2 0x1833 VGP-VCC2 Camera product RICOH VGPVCC2_3 0x1834 VGP-VCC2 Camera product RICOH VGPVCC7 0x183a VGP-VCC7 Camera product RICOH VGPVCC8 0x183b VGP-VCC8 Camera /* Reiner-SCT products */ product REINERSCT CYBERJACK_ECOM 0x0100 e-com cyberJack /* Roland products */ product ROLAND UA100 0x0000 UA-100 Audio I/F product ROLAND UM4 0x0002 UM-4 MIDI I/F product ROLAND SC8850 0x0003 SC-8850 MIDI Synth product ROLAND U8 0x0004 U-8 Audio I/F product ROLAND UM2 0x0005 UM-2 MIDI I/F product ROLAND SC8820 0x0007 SC-8820 MIDI Synth product ROLAND PC300 0x0008 PC-300 MIDI Keyboard product ROLAND UM1 0x0009 UM-1 MIDI I/F product ROLAND SK500 0x000b SK-500 MIDI Keyboard product ROLAND SCD70 0x000c SC-D70 MIDI Synth product ROLAND UM880N 0x0014 EDIROL UM-880 MIDI I/F (native) product ROLAND UM880G 0x0015 EDIROL UM-880 MIDI I/F (generic) product ROLAND SD90 0x0016 SD-90 MIDI Synth product ROLAND UM550 0x0023 UM-550 MIDI I/F product ROLAND SD20 0x0027 SD-20 MIDI Synth product ROLAND SD80 0x0029 SD-80 MIDI Synth product ROLAND UA700 0x002b UA-700 Audio I/F /* Rockfire products */ product ROCKFIRE GAMEPAD 0x2033 gamepad 203USB /* RATOC Systems products */ product RATOC REXUSB60 0xb000 REX-USB60 product RATOC REXUSB60F 0xb020 REX-USB60F /* RT system products */ product RTSYSTEMS CT29B 0x9e54 FTDI compatible adapter product RTSYSTEMS SERIAL_VX7 0x9e52 FTDI compatible adapter /* Sagem products */ product SAGEM USBSERIAL 0x0027 USB-Serial Controller product SAGEM XG760A 0x004a XG-760A product SAGEM XG76NA 0x0062 XG-76NA /* Samsung products */ product SAMSUNG WIS09ABGN 0x2018 WIS09ABGN Wireless LAN adapter product SAMSUNG ML6060 0x3008 ML-6060 laser printer product SAMSUNG YP_U2 0x5050 YP-U2 MP3 Player product SAMSUNG YP_U4 0x5092 YP-U4 MP3 Player product SAMSUNG I500 0x6601 I500 Palm USB Phone product SAMSUNG I330 0x8001 I330 phone cradle product SAMSUNG2 RT2870_1 0x2018 RT2870 /* Samsung Techwin products */ product SAMSUNG_TECHWIN DIGIMAX_410 0x000a Digimax 410 /* SanDisk products */ product SANDISK SDDR05A 0x0001 ImageMate SDDR-05a product SANDISK SDDR31 0x0002 ImageMate SDDR-31 product SANDISK SDDR05 0x0005 ImageMate SDDR-05 product SANDISK SDDR12 0x0100 ImageMate SDDR-12 product SANDISK SDDR09 0x0200 ImageMate SDDR-09 product SANDISK SDDR75 0x0810 ImageMate SDDR-75 product SANDISK SDCZ2_128 0x7100 Cruzer Mini 128MB product SANDISK SDCZ2_256 0x7104 Cruzer Mini 256MB product SANDISK SDCZ4_128 0x7112 Cruzer Micro 128MB product SANDISK SDCZ4_256 0x7113 Cruzer Micro 256MB product SANDISK IMAGEMATE_SDDR289 0xb6ba ImageMate SDDR-289 /* Sanwa Electric Instrument Co., Ltd. products */ product SANWA KB_USB2 0x0701 KB-USB2 multimeter cable /* Sanyo Electric products */ product SANYO SCP4900 0x0701 Sanyo SCP-4900 USB Phone /* ScanLogic products */ product SCANLOGIC SL11R 0x0002 SL11R IDE Adapter product SCANLOGIC 336CX 0x0300 Phantom 336CX - C3 scanner /* Schweitzer Engineering Laboratories products */ product SEL C662 0x0001 C662 Cable /* Sealevel products */ product SEALEVEL 2101 0x2101 FTDI compatible adapter product SEALEVEL 2102 0x2102 FTDI compatible adapter product SEALEVEL 2103 0x2103 FTDI compatible adapter product SEALEVEL 2104 0x2104 FTDI compatible adapter product SEALEVEL 2106 0x9020 FTDI compatible adapter product SEALEVEL 2201_1 0x2211 FTDI compatible adapter product SEALEVEL 2201_2 0x2221 FTDI compatible adapter product SEALEVEL 2202_1 0x2212 FTDI compatible adapter product SEALEVEL 2202_2 0x2222 FTDI compatible adapter product SEALEVEL 2203_1 0x2213 FTDI compatible adapter product SEALEVEL 2203_2 0x2223 FTDI compatible adapter product SEALEVEL 2401_1 0x2411 FTDI compatible adapter product SEALEVEL 2401_2 0x2421 FTDI compatible adapter product SEALEVEL 2401_3 0x2431 FTDI compatible adapter product SEALEVEL 2401_4 0x2441 FTDI compatible adapter product SEALEVEL 2402_1 0x2412 FTDI compatible adapter product SEALEVEL 2402_2 0x2422 FTDI compatible adapter product SEALEVEL 2402_3 0x2432 FTDI compatible adapter product SEALEVEL 2402_4 0x2442 FTDI compatible adapter product SEALEVEL 2403_1 0x2413 FTDI compatible adapter product SEALEVEL 2403_2 0x2423 FTDI compatible adapter product SEALEVEL 2403_3 0x2433 FTDI compatible adapter product SEALEVEL 2403_4 0x2443 FTDI compatible adapter product SEALEVEL 2801_1 0x2811 FTDI compatible adapter product SEALEVEL 2801_2 0x2821 FTDI compatible adapter product SEALEVEL 2801_3 0x2831 FTDI compatible adapter product SEALEVEL 2801_4 0x2841 FTDI compatible adapter product SEALEVEL 2801_5 0x2851 FTDI compatible adapter product SEALEVEL 2801_6 0x2861 FTDI compatible adapter product SEALEVEL 2801_7 0x2871 FTDI compatible adapter product SEALEVEL 2801_8 0x2881 FTDI compatible adapter product SEALEVEL 2802_1 0x2812 FTDI compatible adapter product SEALEVEL 2802_2 0x2822 FTDI compatible adapter product SEALEVEL 2802_3 0x2832 FTDI compatible adapter product SEALEVEL 2802_4 0x2842 FTDI compatible adapter product SEALEVEL 2802_5 0x2852 FTDI compatible adapter product SEALEVEL 2802_6 0x2862 FTDI compatible adapter product SEALEVEL 2802_7 0x2872 FTDI compatible adapter product SEALEVEL 2802_8 0x2882 FTDI compatible adapter product SEALEVEL 2803_1 0x2813 FTDI compatible adapter product SEALEVEL 2803_2 0x2823 FTDI compatible adapter product SEALEVEL 2803_3 0x2833 FTDI compatible adapter product SEALEVEL 2803_4 0x2843 FTDI compatible adapter product SEALEVEL 2803_5 0x2853 FTDI compatible adapter product SEALEVEL 2803_6 0x2863 FTDI compatible adapter product SEALEVEL 2803_7 0x2873 FTDI compatible adapter product SEALEVEL 2803_8 0x2883 FTDI compatible adapter /* Senao products */ product SENAO RT2870_3 0x0605 RT2870 product SENAO RT2870_4 0x0615 RT2870 product SENAO NUB8301 0x2000 NUB-8301 product SENAO RT2870_1 0x9701 RT2870 product SENAO RT2870_2 0x9702 RT2870 product SENAO RT3070 0x9703 RT3070 product SENAO RT3071 0x9705 RT3071 product SENAO RT3072_1 0x9706 RT3072 product SENAO RT3072_2 0x9707 RT3072 product SENAO RT3072_3 0x9708 RT3072 product SENAO RT3072_4 0x9709 RT3072 product SENAO RT3072_5 0x9801 RT3072 product SENAO RTL8192SU_1 0x9603 RTL8192SU product SENAO RTL8192SU_2 0x9605 RTL8192SU /* ShanTou products */ product SHANTOU ST268 0x0268 ST268 product SHANTOU DM9601 0x9601 DM 9601 product SHANTOU ADM8515 0x8515 ADM8515 /* Shark products */ product SHARK PA 0x0400 Pocket Adapter /* Sharp products */ product SHARP SL5500 0x8004 Zaurus SL-5500 PDA product SHARP SLA300 0x8005 Zaurus SL-A300 PDA product SHARP SL5600 0x8006 Zaurus SL-5600 PDA product SHARP SLC700 0x8007 Zaurus SL-C700 PDA product SHARP SLC750 0x9031 Zaurus SL-C750 PDA product SHARP WZERO3ES 0x9123 W-ZERO3 ES Smartphone product SHARP WZERO3ADES 0x91ac Advanced W-ZERO3 ES Smartphone product SHARP WILLCOM03 0x9242 WILLCOM03 /* Shuttle Technology products */ product SHUTTLE EUSB 0x0001 E-USB Bridge product SHUTTLE EUSCSI 0x0002 eUSCSI Bridge product SHUTTLE SDDR09 0x0003 ImageMate SDDR09 product SHUTTLE EUSBCFSM 0x0005 eUSB SmartMedia / CompactFlash Adapter product SHUTTLE ZIOMMC 0x0006 eUSB MultiMediaCard Adapter product SHUTTLE HIFD 0x0007 Sony Hifd product SHUTTLE EUSBATAPI 0x0009 eUSB ATA/ATAPI Adapter product SHUTTLE CF 0x000a eUSB CompactFlash Adapter product SHUTTLE EUSCSI_B 0x000b eUSCSI Bridge product SHUTTLE EUSCSI_C 0x000c eUSCSI Bridge product SHUTTLE CDRW 0x0101 CD-RW Device product SHUTTLE EUSBORCA 0x0325 eUSB ORCA Quad Reader /* Siemens products */ product SIEMENS SPEEDSTREAM 0x1001 SpeedStream product SIEMENS SPEEDSTREAM22 0x1022 SpeedStream 1022 product SIEMENS2 WLL013 0x001b WLL013 product SIEMENS2 ES75 0x0034 GSM module MC35 product SIEMENS2 WL54G 0x3c06 54g USB Network Adapter product SIEMENS3 SX1 0x0001 SX1 product SIEMENS3 X65 0x0003 X65 product SIEMENS3 X75 0x0004 X75 product SIEMENS3 EF81 0x0005 EF81 /* Sierra Wireless products */ product SIERRA EM5625 0x0017 EM5625 product SIERRA MC5720_2 0x0018 MC5720 product SIERRA MC5725 0x0020 MC5725 product SIERRA AIRCARD580 0x0112 Sierra Wireless AirCard 580 product SIERRA AIRCARD595 0x0019 Sierra Wireless AirCard 595 product SIERRA AC595U 0x0120 Sierra Wireless AirCard 595U product SIERRA AC597E 0x0021 Sierra Wireless AirCard 597E product SIERRA EM5725 0x0022 EM5725 product SIERRA C597 0x0023 Sierra Wireless Compass 597 product SIERRA MC5727 0x0024 MC5727 product SIERRA T598 0x0025 T598 product SIERRA T11 0x0026 T11 product SIERRA AC402 0x0027 AC402 product SIERRA MC5728 0x0028 MC5728 product SIERRA E0029 0x0029 E0029 product SIERRA AIRCARD580 0x0112 Sierra Wireless AirCard 580 product SIERRA AC595U 0x0120 Sierra Wireless AirCard 595U product SIERRA MC5720 0x0218 MC5720 Wireless Modem product SIERRA MINI5725 0x0220 Sierra Wireless miniPCI 5275 product SIERRA MC5727_2 0x0224 MC5727 product SIERRA MC8755_2 0x6802 MC8755 product SIERRA MC8765 0x6803 MC8765 product SIERRA MC8755 0x6804 MC8755 product SIERRA MC8765_2 0x6805 MC8765 product SIERRA MC8755_4 0x6808 MC8755 product SIERRA MC8765_3 0x6809 MC8765 product SIERRA AC875U 0x6812 AC875U HSDPA USB Modem product SIERRA MC8755_3 0x6813 MC8755 HSDPA product SIERRA MC8775_2 0x6815 MC8775 product SIERRA MC8775 0x6816 MC8775 product SIERRA AC875 0x6820 Sierra Wireless AirCard 875 product SIERRA AC875U_2 0x6821 AC875U product SIERRA AC875E 0x6822 AC875E product SIERRA MC8780 0x6832 MC8780 product SIERRA MC8781 0x6833 MC8781 product SIERRA MC8780_2 0x6834 MC8780 product SIERRA MC8781_2 0x6835 MC8781 product SIERRA MC8780_3 0x6838 MC8780 product SIERRA MC8781_3 0x6839 MC8781 product SIERRA MC8785 0x683A MC8785 product SIERRA MC8785_2 0x683B MC8785 product SIERRA MC8790 0x683C MC8790 product SIERRA MC8791 0x683D MC8791 product SIERRA MC8792 0x683E MC8792 product SIERRA AC880 0x6850 Sierra Wireless AirCard 880 product SIERRA AC881 0x6851 Sierra Wireless AirCard 881 product SIERRA AC880E 0x6852 Sierra Wireless AirCard 880E product SIERRA AC881E 0x6853 Sierra Wireless AirCard 881E product SIERRA AC880U 0x6855 Sierra Wireless AirCard 880U product SIERRA AC881U 0x6856 Sierra Wireless AirCard 881U product SIERRA AC885E 0x6859 AC885E product SIERRA AC885E_2 0x685A AC885E product SIERRA AC885U 0x6880 Sierra Wireless AirCard 885U product SIERRA C888 0x6890 C888 product SIERRA C22 0x6891 C22 product SIERRA E6892 0x6892 E6892 product SIERRA E6893 0x6893 E6893 product SIERRA MC8700 0x68A3 MC8700 product SIERRA AIRCARD875 0x6820 Aircard 875 HSDPA product SIERRA AC313U 0x68aa Sierra Wireless AirCard 313U product SIERRA TRUINSTALL 0x0fff Aircard Tru Installer /* Sigmatel products */ product SIGMATEL WBT_3052 0x4200 WBT-3052 IrDA/USB Bridge product SIGMATEL I_BEAD100 0x8008 i-Bead 100 MP3 Player /* SIIG products */ /* Also: Omnidirectional Control Technology products */ product SIIG DIGIFILMREADER 0x0004 DigiFilm-Combo Reader product SIIG WINTERREADER 0x0330 WINTERREADER Reader product SIIG2 DK201 0x0103 FTDI compatible adapter product SIIG2 USBTOETHER 0x0109 USB TO Ethernet product SIIG2 US2308 0x0421 Serial /* Silicom products */ product SILICOM U2E 0x0001 U2E product SILICOM GPE 0x0002 Psion Gold Port Ethernet /* SI Labs */ product SILABS VSTABI 0x0f91 VStabi Controller product SILABS ARKHAM_DS101_M 0x1101 Arkham DS101 Monitor product SILABS ARKHAM_DS101_A 0x1601 Arkham DS101 Adapter product SILABS BSM7DUSB 0x800a SPORTident BSM7-D USB product SILABS POLOLU 0x803b Pololu Serial product SILABS CYGNAL_DEBUG 0x8044 Cygnal Debug Adapter product SILABS SB_PARAMOUNT_ME 0x8043 Software Bisque Paramount ME product SILABS SAEL 0x8053 SA-EL USB product SILABS GSM2228 0x8054 Enfora GSM2228 USB product SILABS ARGUSISP 0x8066 Argussoft ISP product SILABS IMS_USB_RS422 0x806f IMS USB-RS422 product SILABS CRUMB128 0x807a Crumb128 board product SILABS OPTRIS_MSPRO 0x80c4 Optris MSpro LT Thermometer product SILABS DEGREE 0x80ca Degree Controls Inc product SILABS TRACIENT 0x80dd Tracient RFID product SILABS TRAQMATE 0x80ed Track Systems Traqmate product SILABS SUUNTO 0x80f6 Suunto Sports Instrument product SILABS ARYGON_MIFARE 0x8115 Arygon Mifare RFID reader product SILABS BURNSIDE 0x813d Burnside Telecon Deskmobile product SILABS TAMSMASTER 0x813f Tams Master Easy Control product SILABS WMRBATT 0x814a WMR RIGblaster Plug&Play product SILABS WMRRIGBLASTER 0x814a WMR RIGblaster Plug&Play product SILABS WMRRIGTALK 0x814b WMR RIGtalk RT1 product SILABS B_G_H3000 0x8156 B&G H3000 Data Cable product SILABS HELICOM 0x815e Helicomm IP-Link 1220-DVM product SILABS HAMLINKUSB 0x815f Timewave HamLinkUSB product SILABS AVIT_USB_TTL 0x818b AVIT Research USB-TTL product SILABS MJS_TOSLINK 0x819f MJS USB-TOSLINK product SILABS WAVIT 0x81a6 ThinkOptics WavIt product SILABS MULTIPLEX_RC 0x81a9 Multiplex RC adapter product SILABS MSD_DASHHAWK 0x81ac MSD DashHawk product SILABS INSYS_MODEM 0x81ad INSYS Modem product SILABS LIPOWSKY_JTAG 0x81c8 Lipowsky Baby-JTAG product SILABS LIPOWSKY_LIN 0x81e2 Lipowsky Baby-LIN product SILABS AEROCOMM 0x81e7 Aerocomm Radio product SILABS ZEPHYR_BIO 0x81e8 Zephyr Bioharness product SILABS EMS_C1007 0x81f2 EMS C1007 HF RFID controller product SILABS LIPOWSKY_HARP 0x8218 Lipowsky HARP-1 product SILABS C2_EDGE_MODEM 0x822b Commander 2 EDGE(GSM) Modem product SILABS CYGNAL_GPS 0x826b Cygnal Fasttrax GPS product SILABS TELEGESIS_ETRX2 0x8293 Telegesis ETRX2USB product SILABS PROCYON_AVS 0x82f9 Procyon AVS product SILABS MC35PU 0x8341 MC35pu product SILABS CYGNAL 0x8382 Cygnal product SILABS AMBER_AMB2560 0x83a8 Amber Wireless AMB2560 product SILABS DEKTEK_DTAPLUS 0x83d8 DekTec DTA Plus VHF/UHF Booster product SILABS KYOCERA_GPS 0x8411 Kyocera GPS product SILABS IRZ_SG10 0x8418 IRZ SG-10 GSM/GPRS Modem product SILABS BEI_VCP 0x846e BEI USB Sensor (VCP) product SILABS BALLUFF_RFID 0x8477 Balluff RFID reader product SILABS AC_SERV_IBUS 0x85ea AC-Services IBUS Interface product SILABS AC_SERV_CIS 0x85eb AC-Services CIS-IBUS product SILABS V_PREON32 0x85f8 Virtenio Preon32 product SILABS AC_SERV_CAN 0x8664 AC-Services CAN Interface product SILABS AC_SERV_OBD 0x8665 AC-Services OBD Interface product SILABS MMB_ZIGBEE 0x88a4 MMB Networks ZigBee product SILABS INGENI_ZIGBEE 0x88a5 Planet Innovation Ingeni ZigBee product SILABS CP2102 0xea60 SILABS USB UART product SILABS CP210X_2 0xea61 CP210x Serial product SILABS CP210X_3 0xea70 CP210x Serial product SILABS CP210X_4 0xea80 CP210x Serial product SILABS INFINITY_MIC 0xea71 Infinity GPS-MIC-1 Radio Monophone product SILABS USBSCOPE50 0xf001 USBscope50 product SILABS USBWAVE12 0xf002 USBwave12 product SILABS USBPULSE100 0xf003 USBpulse100 product SILABS USBCOUNT50 0xf004 USBcount50 product SILABS2 DCU11CLONE 0xaa26 DCU-11 clone product SILABS3 GPRS_MODEM 0xea61 GPRS Modem product SILABS4 100EU_MODEM 0xea61 GPRS Modem 100EU /* Silicon Portals Inc. */ product SILICONPORTALS YAPPH_NF 0x0200 YAP Phone (no firmware) product SILICONPORTALS YAPPHONE 0x0201 YAP Phone /* Sirius Technologies products */ product SIRIUS ROADSTER 0x0001 NetComm Roadster II 56 USB /* Sitecom products */ product SITECOM LN029 0x182d USB 2.0 Ethernet product SITECOM SERIAL 0x2068 USB to serial cable (v2) product SITECOM2 WL022 0x182d WL-022 /* Sitecom Europe products */ product SITECOMEU RT2870_1 0x0017 RT2870 product SITECOMEU WL168V1 0x000d WL-168 v1 product SITECOMEU LN030 0x0021 MCS7830 product SITECOMEU WL168V4 0x0028 WL-168 v4 product SITECOMEU RT2870_2 0x002b RT2870 product SITECOMEU RT2870_3 0x002c RT2870 product SITECOMEU RT2870_4 0x002d RT2870 product SITECOMEU RT2770 0x0039 RT2770 product SITECOMEU RT3070_2 0x003b RT3070 product SITECOMEU RT3070_3 0x003c RT3070 product SITECOMEU RT3070_4 0x003d RT3070 product SITECOMEU RT3070 0x003e RT3070 product SITECOMEU WL608 0x003f WL-608 product SITECOMEU RT3071 0x0040 RT3071 product SITECOMEU RT3072_1 0x0041 RT3072 product SITECOMEU RT3072_2 0x0042 RT3072 product SITECOMEU WL353 0x0045 WL-353 product SITECOMEU RT3072_3 0x0047 RT3072 product SITECOMEU RT3072_4 0x0048 RT3072 product SITECOMEU RT3072_5 0x004a RT3072 product SITECOMEU WL349V1 0x004b WL-349 v1 product SITECOMEU RT3072_6 0x004d RT3072 product SITECOMEU RTL8188CU_1 0x0052 RTL8188CU product SITECOMEU RTL8188CU_2 0x005c RTL8188CU product SITECOMEU RTL8192CU 0x0061 RTL8192CU product SITECOMEU LN032 0x0072 LN-032 product SITECOMEU LN028 0x061c LN-028 product SITECOMEU WL113 0x9071 WL-113 product SITECOMEU ZD1211B 0x9075 ZD1211B product SITECOMEU WL172 0x90ac WL-172 product SITECOMEU WL113R2 0x9712 WL-113 rev 2 /* Skanhex Technology products */ product SKANHEX MD_7425 0x410a MD 7425 Camera product SKANHEX SX_520Z 0x5200 SX 520z Camera /* Smart Technologies products */ product SMART PL2303 0x2303 Serial adapter /* SmartBridges products */ product SMARTBRIDGES SMARTLINK 0x0001 SmartLink USB Ethernet product SMARTBRIDGES SMARTNIC 0x0003 smartNIC 2 PnP Ethernet /* SMC products */ product SMC 2102USB 0x0100 10Mbps Ethernet product SMC 2202USB 0x0200 10/100 Ethernet product SMC 2206USB 0x0201 EZ Connect USB Ethernet product SMC 2862WG 0xee13 EZ Connect Wireless Adapter product SMC2 2020HUB 0x2020 USB Hub product SMC2 2514HUB 0x2514 USB Hub product SMC3 2662WUSB 0xa002 2662W-AR Wireless product SMC2 LAN9500_ETH 0x9500 USB/Ethernet product SMC2 LAN9505_ETH 0x9505 USB/Ethernet product SMC2 LAN9530_ETH 0x9530 USB/Ethernet product SMC2 LAN9730_ETH 0x9730 USB/Ethernet product SMC2 LAN9500_SAL10 0x9900 USB/Ethernet product SMC2 LAN9505_SAL10 0x9901 USB/Ethernet product SMC2 LAN9500A_SAL10 0x9902 USB/Ethernet product SMC2 LAN9505A_SAL10 0x9903 USB/Ethernet product SMC2 LAN9514_SAL10 0x9904 USB/Ethernet product SMC2 LAN9500A_HAL 0x9905 USB/Ethernet product SMC2 LAN9505A_HAL 0x9906 USB/Ethernet product SMC2 LAN9500_ETH_2 0x9907 USB/Ethernet product SMC2 LAN9500A_ETH_2 0x9908 USB/Ethernet product SMC2 LAN9514_ETH_2 0x9909 USB/Ethernet product SMC2 LAN9500A_ETH 0x9e00 USB/Ethernet product SMC2 LAN9505A_ETH 0x9e01 USB/Ethernet product SMC2 LAN89530_ETH 0x9e08 USB/Ethernet product SMC2 LAN9514_ETH 0xec00 USB/Ethernet /* SOHOware products */ product SOHOWARE NUB100 0x9100 10/100 USB Ethernet product SOHOWARE NUB110 0x9110 10/100 USB Ethernet /* SOLID YEAR products */ product SOLIDYEAR KEYBOARD 0x2101 Solid Year USB keyboard /* SONY products */ product SONY DSC 0x0010 DSC cameras product SONY MS_NW_MS7 0x0025 Memorystick NW-MS7 product SONY PORTABLE_HDD_V2 0x002b Portable USB Harddrive V2 product SONY MSACUS1 0x002d Memorystick MSAC-US1 product SONY HANDYCAM 0x002e Handycam product SONY MSC 0x0032 MSC memory stick slot product SONY CLIE_35 0x0038 Sony Clie v3.5 product SONY MS_PEG_N760C 0x0058 PEG N760c Memorystick product SONY CLIE_40 0x0066 Sony Clie v4.0 product SONY MS_MSC_U03 0x0069 Memorystick MSC-U03 product SONY CLIE_40_MS 0x006d Sony Clie v4.0 Memory Stick slot product SONY CLIE_S360 0x0095 Sony Clie s360 product SONY CLIE_41_MS 0x0099 Sony Clie v4.1 Memory Stick slot product SONY CLIE_41 0x009a Sony Clie v4.1 product SONY CLIE_NX60 0x00da Sony Clie nx60 product SONY CLIE_TH55 0x0144 Sony Clie th55 product SONY CLIE_TJ37 0x0169 Sony Clie tj37 product SONY RF_RECEIVER 0x01db Sony RF mouse/kbd Receiver VGP-WRC1 product SONY QN3 0x0437 Sony QN3 CMD-Jxx phone cable /* Sony Ericsson products */ product SONYERICSSON DCU10 0x0528 DCU-10 Phone Data Cable product SONYERICSSON DATAPILOT 0x2003 Datapilot Phone Cable /* SOURCENEXT products */ product SOURCENEXT KEIKAI8 0x039f KeikaiDenwa 8 product SOURCENEXT KEIKAI8_CHG 0x012e KeikaiDenwa 8 with charger /* SparkLAN products */ product SPARKLAN RT2573 0x0004 RT2573 product SPARKLAN RT2870_1 0x0006 RT2870 product SPARKLAN RT3070 0x0010 RT3070 /* Soundgraph products */ product SOUNDGRAPH IMON_VFD 0x0044 Antec Veris Elite VFD Panel, Knob, and Remote product SOUNDGRAPH SSTONE_LC16 0xffdc Silverstone LC16 VFD Panel, Knob, and Remote /* Speed Dragon Multimedia products */ product SPEEDDRAGON MS3303H 0x110b MS3303H Serial /* Sphairon Access Systems GmbH products */ product SPHAIRON UB801R 0x0110 UB801R /* Stelera Wireless products */ product STELERA ZEROCD 0x1000 Zerocd Installer product STELERA C105 0x1002 Stelera/Bandrish C105 USB product STELERA E1003 0x1003 3G modem product STELERA E1004 0x1004 3G modem product STELERA E1005 0x1005 3G modem product STELERA E1006 0x1006 3G modem product STELERA E1007 0x1007 3G modem product STELERA E1008 0x1008 3G modem product STELERA E1009 0x1009 3G modem product STELERA E100A 0x100a 3G modem product STELERA E100B 0x100b 3G modem product STELERA E100C 0x100c 3G modem product STELERA E100D 0x100d 3G modem product STELERA E100E 0x100e 3G modem product STELERA E100F 0x100f 3G modem product STELERA E1010 0x1010 3G modem product STELERA E1011 0x1011 3G modem product STELERA E1012 0x1012 3G modem /* STMicroelectronics products */ product STMICRO BIOCPU 0x2016 Biometric Coprocessor product STMICRO COMMUNICATOR 0x7554 USB Communicator product STMICRO ST72682 0xfada USB 2.0 Flash drive controller /* STSN products */ product STSN STSN0001 0x0001 Internet Access Device /* SUN Corporation products */ product SUNTAC DS96L 0x0003 SUNTAC U-Cable type D2 product SUNTAC PS64P1 0x0005 SUNTAC U-Cable type P1 product SUNTAC VS10U 0x0009 SUNTAC Slipper U product SUNTAC IS96U 0x000a SUNTAC Ir-Trinity product SUNTAC AS64LX 0x000b SUNTAC U-Cable type A3 product SUNTAC AS144L4 0x0011 SUNTAC U-Cable type A4 /* Sun Microsystems products */ product SUN KEYBOARD_TYPE_6 0x0005 Type 6 USB keyboard product SUN KEYBOARD_TYPE_7 0x00a2 Type 7 USB keyboard /* XXX The above is a North American PC style keyboard possibly */ product SUN MOUSE 0x0100 Type 6 USB mouse product SUN KBD_HUB 0x100e Kbd Hub /* Sunplus Innovation Technology Inc. products */ product SUNPLUS USBMOUSE 0x0007 USB Optical Mouse /* Super Top products */ product SUPERTOP IDE 0x6600 USB-IDE product SUPERTOP FLASHDRIVE 0x121c extrememory Snippy /* Syntech products */ product SYNTECH CPT8001C 0x0001 CPT-8001C Barcode scanner product SYNTECH CYPHERLAB100 0x1000 CipherLab USB Barcode Scanner /* Teclast products */ product TECLAST TLC300 0x3203 USB Media Player /* Testo products */ product TESTO USB_INTERFACE 0x0001 FTDI compatible adapter /* TexTech products */ product TEXTECH DUMMY 0x0000 Dummy product product TEXTECH U2M_1 0x0101 Textech USB MIDI cable product TEXTECH U2M_2 0x1806 Textech USB MIDI cable /* The Mobility Lab products */ product TML USB_SERIAL 0x0064 FTDI compatible adapter /* Thurlby Thandar Instrument products */ product TTI QL355P 0x03e8 FTDI compatible adapter /* Supra products */ product DIAMOND2 SUPRAEXPRESS56K 0x07da Supra Express 56K modem product DIAMOND2 SUPRA2890 0x0b4a SupraMax 2890 56K Modem product DIAMOND2 RIO600USB 0x5001 Rio 600 USB product DIAMOND2 RIO800USB 0x5002 Rio 800 USB /* Surecom Technology products */ product SURECOM EP9001G2A 0x11f2 EP-9001-G rev 2A product SURECOM RT2570 0x11f3 RT2570 product SURECOM RT2573 0x31f3 RT2573 /* Sweex products */ product SWEEX ZD1211 0x1809 ZD1211 product SWEEX2 LW153 0x0153 LW153 product SWEEX2 LW154 0x0154 LW154 product SWEEX2 LW303 0x0302 LW303 product SWEEX2 LW313 0x0313 LW313 /* System TALKS, Inc. */ product SYSTEMTALKS SGCX2UL 0x1920 SGC-X2UL /* Tapwave products */ product TAPWAVE ZODIAC 0x0100 Zodiac /* Taugagreining products */ product TAUGA CAMERAMATE 0x0005 CameraMate (DPCM_USB) /* TCTMobile products */ product TCTMOBILE X060S 0x0000 X060S 3G modem product TCTMOBILE X080S 0xf000 X080S 3G modem /* TDK products */ product TDK UPA9664 0x0115 USB-PDC Adapter UPA9664 product TDK UCA1464 0x0116 USB-cdmaOne Adapter UCA1464 product TDK UHA6400 0x0117 USB-PHS Adapter UHA6400 product TDK UPA6400 0x0118 USB-PHS Adapter UPA6400 product TDK BT_DONGLE 0x0309 Bluetooth USB dongle /* TEAC products */ product TEAC FD05PUB 0x0000 FD-05PUB floppy /* Tekram Technology products */ product TEKRAM QUICKWLAN 0x1630 QuickWLAN product TEKRAM ZD1211_1 0x5630 ZD1211 product TEKRAM ZD1211_2 0x6630 ZD1211 /* Telex Communications products */ product TELEX MIC1 0x0001 Enhanced USB Microphone /* Telit products */ product TELIT UC864E 0x1003 UC864E 3G modem product TELIT UC864G 0x1004 UC864G 3G modem /* Ten X Technology, Inc. */ product TENX UAUDIO0 0xf211 USB audio headset /* Texas Intel products */ product TI UTUSB41 0x1446 UT-USB41 hub product TI TUSB2046 0x2046 TUSB2046 hub /* Thrustmaster products */ product THRUST FUSION_PAD 0xa0a3 Fusion Digital Gamepad /* TLayTech products */ product TLAYTECH TEU800 0x1682 TEU800 3G modem /* Topre Corporation products */ product TOPRE HHKB 0x0100 HHKB Professional /* Toshiba Corporation products */ product TOSHIBA POCKETPC_E740 0x0706 PocketPC e740 product TOSHIBA RT3070 0x0a07 RT3070 product TOSHIBA G450 0x0d45 G450 modem product TOSHIBA HSDPA 0x1302 G450 modem product TOSHIBA TRANSMEMORY 0x6545 USB ThumbDrive /* Trek Technology products */ product TREK THUMBDRIVE 0x1111 ThumbDrive product TREK MEMKEY 0x8888 IBM USB Memory Key product TREK THUMBDRIVE_8MB 0x9988 ThumbDrive_8MB /* TRENDnet products */ product TRENDNET RTL8192CU 0x624d RTL8192CU product TRENDNET RTL8188CU 0x648b RTL8188CU /* Tripp-Lite products */ product TRIPPLITE U209 0x2008 Serial /* Trumpion products */ product TRUMPION T33520 0x1001 T33520 USB Flash Card Controller product TRUMPION C3310 0x1100 Comotron C3310 MP3 player product TRUMPION MP3 0x1200 MP3 player /* TwinMOS */ product TWINMOS G240 0xa006 G240 product TWINMOS MDIV 0x1325 Memory Disk IV /* Ubiquam products */ product UBIQUAM UALL 0x3100 CDMA 1xRTT USB Modem (U-100/105/200/300/520) /* Ultima products */ product ULTIMA 1200UBPLUS 0x4002 1200 UB Plus scanner /* UMAX products */ product UMAX ASTRA1236U 0x0002 Astra 1236U Scanner product UMAX ASTRA1220U 0x0010 Astra 1220U Scanner product UMAX ASTRA2000U 0x0030 Astra 2000U Scanner product UMAX ASTRA2100U 0x0130 Astra 2100U Scanner product UMAX ASTRA2200U 0x0230 Astra 2200U Scanner product UMAX ASTRA3400 0x0060 Astra 3400 Scanner /* U-MEDIA Communications products */ product UMEDIA TEW444UBEU 0x3006 TEW-444UB EU product UMEDIA TEW444UBEU_NF 0x3007 TEW-444UB EU (no firmware) product UMEDIA TEW429UB_A 0x300a TEW-429UB_A product UMEDIA TEW429UB 0x300b TEW-429UB product UMEDIA TEW429UBC1 0x300d TEW-429UB C1 product UMEDIA RT2870_1 0x300e RT2870 product UMEDIA ALL0298V2 0x3204 ALL0298 v2 product UMEDIA AR5523_2 0x3205 AR5523 product UMEDIA AR5523_2_NF 0x3206 AR5523 (no firmware) /* Universal Access products */ product UNIACCESS PANACHE 0x0101 Panache Surf USB ISDN Adapter /* Unknown products */ product UNKNOWN4 NF_RIC 0x0001 FTDI compatible adapter /* USI products */ product USI MC60 0x10c5 MC60 Serial /* U.S. Robotics products */ product USR USR5422 0x0118 USR5422 WLAN product USR USR5423 0x0121 USR5423 WLAN /* VIA Technologies products */ product VIA USB2IDEBRIDGE 0x6204 USB 2.0 IDE Bridge /* VIA Labs */ product VIALABS USB30SATABRIDGE 0x0700 USB 3.0 SATA Bridge /* Vaisala products */ product VAISALA CABLE 0x0200 USB Interface cable /* Vertex products */ product VERTEX VW110L 0x0100 Vertex VW110L modem /* VidzMedia products */ product VIDZMEDIA MONSTERTV 0x4fb1 MonsterTV P2H /* Vision products */ product VISION VC6452V002 0x0002 CPiA Camera /* Visioneer products */ product VISIONEER 7600 0x0211 OneTouch 7600 product VISIONEER 5300 0x0221 OneTouch 5300 product VISIONEER 3000 0x0224 Scanport 3000 product VISIONEER 6100 0x0231 OneTouch 6100 product VISIONEER 6200 0x0311 OneTouch 6200 product VISIONEER 8100 0x0321 OneTouch 8100 product VISIONEER 8600 0x0331 OneTouch 8600 /* Vivitar products */ product VIVITAR 35XX 0x0003 Vivicam 35Xx /* VTech products */ product VTECH RT2570 0x3012 RT2570 product VTECH ZD1211B 0x3014 ZD1211B /* Wacom products */ product WACOM CT0405U 0x0000 CT-0405-U Tablet product WACOM GRAPHIRE 0x0010 Graphire product WACOM GRAPHIRE3_4X5 0x0013 Graphire 3 4x5 product WACOM INTUOSA5 0x0021 Intuos A5 product WACOM GD0912U 0x0022 Intuos 9x12 Graphics Tablet /* WAGO Kontakttechnik GmbH products */ product WAGO SERVICECABLE 0x07a6 USB Service Cable 750-923 /* WaveSense products */ product WAVESENSE JAZZ 0xaaaa Jazz blood glucose meter /* WCH products */ product WCH CH341SER 0x5523 CH341/CH340 USB-Serial Bridge product WCH2 DUMMY 0x0000 Dummy product product WCH2 CH341SER_2 0x5523 CH341/CH340 USB-Serial Bridge product WCH2 CH341SER 0x7523 CH341/CH340 USB-Serial Bridge product WCH2 U2M 0X752d CH345 USB2.0-MIDI /* West Mountain Radio products */ product WESTMOUNTAIN RIGBLASTER_ADVANTAGE 0x0003 RIGblaster Advantage /* Western Digital products */ product WESTERN COMBO 0x0200 Firewire USB Combo product WESTERN EXTHDD 0x0400 External HDD product WESTERN HUB 0x0500 USB HUB product WESTERN MYBOOK 0x0901 MyBook External HDD product WESTERN MYPASSWORD 0x0704 MyPassword External HDD product WESTERN MYPASSPORT 0x0748 MyPassport External HDD /* WeTelecom products */ product WETELECOM WM_D200 0x6801 WM-D200 /* WIENER Plein & Baus GmbH products */ product WIENERPLEINBAUS PL512 0x0010 PL512 PSU product WIENERPLEINBAUS RCM 0x0011 RCM Remote Control product WIENERPLEINBAUS MPOD 0x0012 MPOD PSU product WIENERPLEINBAUS CML 0x0015 CML Data Logger /* Windbond Electronics */ product WINBOND UH104 0x5518 4-port USB Hub /* WinMaxGroup products */ product WINMAXGROUP FLASH64MC 0x6660 USB Flash Disk 64M-C /* Wistron NeWeb products */ product WISTRONNEWEB UR045G 0x0427 PrismGT USB 2.0 WLAN product WISTRONNEWEB UR055G 0x0711 UR055G product WISTRONNEWEB AR5523_1 0x0826 AR5523 product WISTRONNEWEB AR5523_1_NF 0x0827 AR5523 (no firmware) product WISTRONNEWEB AR5523_2 0x082a AR5523 product WISTRONNEWEB AR5523_2_NF 0x0829 AR5523 (no firmware) /* Xerox products */ product XEROX WCM15 0xffef WorkCenter M15 /* Xirlink products */ product XIRLINK PCCAM 0x8080 IBM PC Camera /* Xyratex products */ product XYRATEX PRISM_GT_1 0x2000 PrismGT USB 2.0 WLAN product XYRATEX PRISM_GT_2 0x2002 PrismGT USB 2.0 WLAN /* Yamaha products */ product YAMAHA UX256 0x1000 UX256 MIDI I/F product YAMAHA UX96 0x1008 UX96 MIDI I/F product YAMAHA RPU200 0x3104 RP-U200 product YAMAHA RTA54I 0x4000 NetVolante RTA54i Broadband&ISDN Router product YAMAHA RTW65B 0x4001 NetVolante RTW65b Broadband Wireless Router product YAMAHA RTW65I 0x4002 NetVolante RTW65i Broadband&ISDN Wireless Router product YAMAHA RTA55I 0x4004 NetVolante RTA55i Broadband VoIP Router /* Yano products */ product YANO U640MO 0x0101 U640MO-03 product YANO FW800HD 0x05fc METALWEAR-HDD /* Y.C. Cable products */ product YCCABLE PL2303 0x0fba PL2303 Serial /* Y-E Data products */ product YEDATA FLASHBUSTERU 0x0000 Flashbuster-U /* Yiso Wireless Co. products */ product YISO C893 0xc893 CDMA 2000 1xEVDO PC Card /* Z-Com products */ product ZCOM M4Y750 0x0001 M4Y-750 product ZCOM XI725 0x0002 XI-725/726 product ZCOM XI735 0x0005 XI-735 product ZCOM XG703A 0x0008 PrismGT USB 2.0 WLAN product ZCOM ZD1211 0x0011 ZD1211 product ZCOM AR5523 0x0012 AR5523 product ZCOM AR5523_NF 0x0013 AR5523 driver (no firmware) product ZCOM XM142 0x0015 XM-142 product ZCOM ZD1211B 0x001a ZD1211B product ZCOM RT2870_1 0x0022 RT2870 product ZCOM RT2870_2 0x0025 RT2870 /* Zinwell products */ product ZINWELL RT2570 0x0260 RT2570 product ZINWELL RT2870_1 0x0280 RT2870 product ZINWELL RT2870_2 0x0282 RT2870 product ZINWELL RT3072_1 0x0283 RT3072 product ZINWELL RT3072_2 0x0284 RT3072 product ZINWELL RT3070 0x5257 RT3070 /* Zoom Telephonics, Inc. products */ product ZOOM 2986L 0x9700 2986L Fax modem /* Zoran Microelectronics products */ product ZORAN EX20DSC 0x4343 Digital Camera EX-20 DSC /* Zydas Technology Corporation products */ product ZYDAS ZD1211 0x1211 ZD1211 WLAN abg product ZYDAS ZD1211B 0x1215 ZD1211B /* ZyXEL Communication Co. products */ product ZYXEL OMNI56K 0x1500 Omni 56K Plus product ZYXEL 980N 0x2011 Scorpion-980N keyboard product ZYXEL ZYAIRG220 0x3401 ZyAIR G-220 product ZYXEL G200V2 0x3407 G-200 v2 product ZYXEL AG225H 0x3409 AG-225H product ZYXEL M202 0x340a M-202 product ZYXEL G220V2 0x340f G-220 v2 product ZYXEL G202 0x3410 G-202 product ZYXEL RT2870_1 0x3416 RT2870 product ZYXEL RT2870_2 0x341a RT2870 product ZYXEL RT3070 0x341e NWD2105 product ZYXEL RTL8192CU 0x341f RTL8192CU product ZYXEL NWD2705 0x3421 NWD2705 Index: user/dchagin/lemul/sys/net/bpf.c =================================================================== --- user/dchagin/lemul/sys/net/bpf.c (revision 274240) +++ user/dchagin/lemul/sys/net/bpf.c (revision 274241) @@ -1,2934 +1,2934 @@ /*- * Copyright (c) 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from the Stanford/CMU enet packet filter, * (net/enet.c) distributed as part of 4.3BSD, and code contributed * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence * Berkeley Laboratory. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)bpf.c 8.4 (Berkeley) 1/9/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_bpf.h" #include "opt_compat.h" #include "opt_netgraph.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define BPF_INTERNAL #include #include #ifdef BPF_JITTER #include #endif #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_BPF, "BPF", "BPF data"); #if defined(DEV_BPF) || defined(NETGRAPH_BPF) #define PRINET 26 /* interruptible */ #define SIZEOF_BPF_HDR(type) \ (offsetof(type, bh_hdrlen) + sizeof(((type *)0)->bh_hdrlen)) #ifdef COMPAT_FREEBSD32 #include #include #define BPF_ALIGNMENT32 sizeof(int32_t) #define BPF_WORDALIGN32(x) (((x)+(BPF_ALIGNMENT32-1))&~(BPF_ALIGNMENT32-1)) #ifndef BURN_BRIDGES /* * 32-bit version of structure prepended to each packet. We use this header * instead of the standard one for 32-bit streams. We mark the a stream as * 32-bit the first time we see a 32-bit compat ioctl request. */ struct bpf_hdr32 { struct timeval32 bh_tstamp; /* time stamp */ uint32_t bh_caplen; /* length of captured portion */ uint32_t bh_datalen; /* original length of packet */ uint16_t bh_hdrlen; /* length of bpf header (this struct plus alignment padding) */ }; #endif struct bpf_program32 { u_int bf_len; uint32_t bf_insns; }; struct bpf_dltlist32 { u_int bfl_len; u_int bfl_list; }; #define BIOCSETF32 _IOW('B', 103, struct bpf_program32) #define BIOCSRTIMEOUT32 _IOW('B', 109, struct timeval32) #define BIOCGRTIMEOUT32 _IOR('B', 110, struct timeval32) #define BIOCGDLTLIST32 _IOWR('B', 121, struct bpf_dltlist32) #define BIOCSETWF32 _IOW('B', 123, struct bpf_program32) #define BIOCSETFNR32 _IOW('B', 130, struct bpf_program32) #endif /* * bpf_iflist is a list of BPF interface structures, each corresponding to a * specific DLT. The same network interface might have several BPF interface * structures registered by different layers in the stack (i.e., 802.11 * frames, ethernet frames, etc). */ static LIST_HEAD(, bpf_if) bpf_iflist, bpf_freelist; static struct mtx bpf_mtx; /* bpf global lock */ static int bpf_bpfd_cnt; static void bpf_attachd(struct bpf_d *, struct bpf_if *); static void bpf_detachd(struct bpf_d *); static void bpf_detachd_locked(struct bpf_d *); static void bpf_freed(struct bpf_d *); static int bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **, struct sockaddr *, int *, struct bpf_insn *); static int bpf_setif(struct bpf_d *, struct ifreq *); static void bpf_timed_out(void *); static __inline void bpf_wakeup(struct bpf_d *); static void catchpacket(struct bpf_d *, u_char *, u_int, u_int, void (*)(struct bpf_d *, caddr_t, u_int, void *, u_int), struct bintime *); static void reset_d(struct bpf_d *); static int bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd); static int bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *); static int bpf_setdlt(struct bpf_d *, u_int); static void filt_bpfdetach(struct knote *); static int filt_bpfread(struct knote *, long); static void bpf_drvinit(void *); static int bpf_stats_sysctl(SYSCTL_HANDLER_ARGS); SYSCTL_NODE(_net, OID_AUTO, bpf, CTLFLAG_RW, 0, "bpf sysctl"); int bpf_maxinsns = BPF_MAXINSNS; SYSCTL_INT(_net_bpf, OID_AUTO, maxinsns, CTLFLAG_RW, &bpf_maxinsns, 0, "Maximum bpf program instructions"); static int bpf_zerocopy_enable = 0; SYSCTL_INT(_net_bpf, OID_AUTO, zerocopy_enable, CTLFLAG_RW, &bpf_zerocopy_enable, 0, "Enable new zero-copy BPF buffer sessions"); static SYSCTL_NODE(_net_bpf, OID_AUTO, stats, CTLFLAG_MPSAFE | CTLFLAG_RW, bpf_stats_sysctl, "bpf statistics portal"); static VNET_DEFINE(int, bpf_optimize_writers) = 0; #define V_bpf_optimize_writers VNET(bpf_optimize_writers) -SYSCTL_VNET_INT(_net_bpf, OID_AUTO, optimize_writers, - CTLFLAG_RW, &VNET_NAME(bpf_optimize_writers), 0, +SYSCTL_INT(_net_bpf, OID_AUTO, optimize_writers, CTLFLAG_VNET | CTLFLAG_RW, + &VNET_NAME(bpf_optimize_writers), 0, "Do not send packets until BPF program is set"); static d_open_t bpfopen; static d_read_t bpfread; static d_write_t bpfwrite; static d_ioctl_t bpfioctl; static d_poll_t bpfpoll; static d_kqfilter_t bpfkqfilter; static struct cdevsw bpf_cdevsw = { .d_version = D_VERSION, .d_open = bpfopen, .d_read = bpfread, .d_write = bpfwrite, .d_ioctl = bpfioctl, .d_poll = bpfpoll, .d_name = "bpf", .d_kqfilter = bpfkqfilter, }; static struct filterops bpfread_filtops = { .f_isfd = 1, .f_detach = filt_bpfdetach, .f_event = filt_bpfread, }; eventhandler_tag bpf_ifdetach_cookie = NULL; /* * LOCKING MODEL USED BY BPF: * Locks: * 1) global lock (BPF_LOCK). Mutex, used to protect interface addition/removal, * some global counters and every bpf_if reference. * 2) Interface lock. Rwlock, used to protect list of BPF descriptors and their filters. * 3) Descriptor lock. Mutex, used to protect BPF buffers and various structure fields * used by bpf_mtap code. * * Lock order: * * Global lock, interface lock, descriptor lock * * We have to acquire interface lock before descriptor main lock due to BPF_MTAP[2] * working model. In many places (like bpf_detachd) we start with BPF descriptor * (and we need to at least rlock it to get reliable interface pointer). This * gives us potential LOR. As a result, we use global lock to protect from bpf_if * change in every such place. * * Changing d->bd_bif is protected by 1) global lock, 2) interface lock and * 3) descriptor main wlock. * Reading bd_bif can be protected by any of these locks, typically global lock. * * Changing read/write BPF filter is protected by the same three locks, * the same applies for reading. * * Sleeping in global lock is not allowed due to bpfdetach() using it. */ /* * Wrapper functions for various buffering methods. If the set of buffer * modes expands, we will probably want to introduce a switch data structure * similar to protosw, et. */ static void bpf_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, void *src, u_int len) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return (bpf_buffer_append_bytes(d, buf, offset, src, len)); case BPF_BUFMODE_ZBUF: d->bd_zcopy++; return (bpf_zerocopy_append_bytes(d, buf, offset, src, len)); default: panic("bpf_buf_append_bytes"); } } static void bpf_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src, u_int len) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return (bpf_buffer_append_mbuf(d, buf, offset, src, len)); case BPF_BUFMODE_ZBUF: d->bd_zcopy++; return (bpf_zerocopy_append_mbuf(d, buf, offset, src, len)); default: panic("bpf_buf_append_mbuf"); } } /* * This function gets called when the free buffer is re-assigned. */ static void bpf_buf_reclaimed(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return; case BPF_BUFMODE_ZBUF: bpf_zerocopy_buf_reclaimed(d); return; default: panic("bpf_buf_reclaimed"); } } /* * If the buffer mechanism has a way to decide that a held buffer can be made * free, then it is exposed via the bpf_canfreebuf() interface. (1) is * returned if the buffer can be discarded, (0) is returned if it cannot. */ static int bpf_canfreebuf(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: return (bpf_zerocopy_canfreebuf(d)); } return (0); } /* * Allow the buffer model to indicate that the current store buffer is * immutable, regardless of the appearance of space. Return (1) if the * buffer is writable, and (0) if not. */ static int bpf_canwritebuf(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: return (bpf_zerocopy_canwritebuf(d)); } return (1); } /* * Notify buffer model that an attempt to write to the store buffer has * resulted in a dropped packet, in which case the buffer may be considered * full. */ static void bpf_buffull(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: bpf_zerocopy_buffull(d); break; } } /* * Notify the buffer model that a buffer has moved into the hold position. */ void bpf_bufheld(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: bpf_zerocopy_bufheld(d); break; } } static void bpf_free(struct bpf_d *d) { switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return (bpf_buffer_free(d)); case BPF_BUFMODE_ZBUF: return (bpf_zerocopy_free(d)); default: panic("bpf_buf_free"); } } static int bpf_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio) { if (d->bd_bufmode != BPF_BUFMODE_BUFFER) return (EOPNOTSUPP); return (bpf_buffer_uiomove(d, buf, len, uio)); } static int bpf_ioctl_sblen(struct bpf_d *d, u_int *i) { if (d->bd_bufmode != BPF_BUFMODE_BUFFER) return (EOPNOTSUPP); return (bpf_buffer_ioctl_sblen(d, i)); } static int bpf_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i) { if (d->bd_bufmode != BPF_BUFMODE_ZBUF) return (EOPNOTSUPP); return (bpf_zerocopy_ioctl_getzmax(td, d, i)); } static int bpf_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz) { if (d->bd_bufmode != BPF_BUFMODE_ZBUF) return (EOPNOTSUPP); return (bpf_zerocopy_ioctl_rotzbuf(td, d, bz)); } static int bpf_ioctl_setzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz) { if (d->bd_bufmode != BPF_BUFMODE_ZBUF) return (EOPNOTSUPP); return (bpf_zerocopy_ioctl_setzbuf(td, d, bz)); } /* * General BPF functions. */ static int bpf_movein(struct uio *uio, int linktype, struct ifnet *ifp, struct mbuf **mp, struct sockaddr *sockp, int *hdrlen, struct bpf_insn *wfilter) { const struct ieee80211_bpf_params *p; struct ether_header *eh; struct mbuf *m; int error; int len; int hlen; int slen; /* * Build a sockaddr based on the data link layer type. * We do this at this level because the ethernet header * is copied directly into the data field of the sockaddr. * In the case of SLIP, there is no header and the packet * is forwarded as is. * Also, we are careful to leave room at the front of the mbuf * for the link level header. */ switch (linktype) { case DLT_SLIP: sockp->sa_family = AF_INET; hlen = 0; break; case DLT_EN10MB: sockp->sa_family = AF_UNSPEC; /* XXX Would MAXLINKHDR be better? */ hlen = ETHER_HDR_LEN; break; case DLT_FDDI: sockp->sa_family = AF_IMPLINK; hlen = 0; break; case DLT_RAW: sockp->sa_family = AF_UNSPEC; hlen = 0; break; case DLT_NULL: /* * null interface types require a 4 byte pseudo header which * corresponds to the address family of the packet. */ sockp->sa_family = AF_UNSPEC; hlen = 4; break; case DLT_ATM_RFC1483: /* * en atm driver requires 4-byte atm pseudo header. * though it isn't standard, vpi:vci needs to be * specified anyway. */ sockp->sa_family = AF_UNSPEC; hlen = 12; /* XXX 4(ATM_PH) + 3(LLC) + 5(SNAP) */ break; case DLT_PPP: sockp->sa_family = AF_UNSPEC; hlen = 4; /* This should match PPP_HDRLEN */ break; case DLT_IEEE802_11: /* IEEE 802.11 wireless */ sockp->sa_family = AF_IEEE80211; hlen = 0; break; case DLT_IEEE802_11_RADIO: /* IEEE 802.11 wireless w/ phy params */ sockp->sa_family = AF_IEEE80211; sockp->sa_len = 12; /* XXX != 0 */ hlen = sizeof(struct ieee80211_bpf_params); break; default: return (EIO); } len = uio->uio_resid; if (len < hlen || len - hlen > ifp->if_mtu) return (EMSGSIZE); m = m_get2(len, M_WAITOK, MT_DATA, M_PKTHDR); if (m == NULL) return (EIO); m->m_pkthdr.len = m->m_len = len; *mp = m; error = uiomove(mtod(m, u_char *), len, uio); if (error) goto bad; slen = bpf_filter(wfilter, mtod(m, u_char *), len, len); if (slen == 0) { error = EPERM; goto bad; } /* Check for multicast destination */ switch (linktype) { case DLT_EN10MB: eh = mtod(m, struct ether_header *); if (ETHER_IS_MULTICAST(eh->ether_dhost)) { if (bcmp(ifp->if_broadcastaddr, eh->ether_dhost, ETHER_ADDR_LEN) == 0) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; } break; } /* * Make room for link header, and copy it to sockaddr */ if (hlen != 0) { if (sockp->sa_family == AF_IEEE80211) { /* * Collect true length from the parameter header * NB: sockp is known to be zero'd so if we do a * short copy unspecified parameters will be * zero. * NB: packet may not be aligned after stripping * bpf params * XXX check ibp_vers */ p = mtod(m, const struct ieee80211_bpf_params *); hlen = p->ibp_len; if (hlen > sizeof(sockp->sa_data)) { error = EINVAL; goto bad; } } bcopy(mtod(m, const void *), sockp->sa_data, hlen); } *hdrlen = hlen; return (0); bad: m_freem(m); return (error); } /* * Attach file to the bpf interface, i.e. make d listen on bp. */ static void bpf_attachd(struct bpf_d *d, struct bpf_if *bp) { int op_w; BPF_LOCK_ASSERT(); /* * Save sysctl value to protect from sysctl change * between reads */ op_w = V_bpf_optimize_writers; if (d->bd_bif != NULL) bpf_detachd_locked(d); /* * Point d at bp, and add d to the interface's list. * Since there are many applicaiotns using BPF for * sending raw packets only (dhcpd, cdpd are good examples) * we can delay adding d to the list of active listeners until * some filter is configured. */ BPFIF_WLOCK(bp); BPFD_LOCK(d); d->bd_bif = bp; if (op_w != 0) { /* Add to writers-only list */ LIST_INSERT_HEAD(&bp->bif_wlist, d, bd_next); /* * We decrement bd_writer on every filter set operation. * First BIOCSETF is done by pcap_open_live() to set up * snap length. After that appliation usually sets its own filter */ d->bd_writer = 2; } else LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next); BPFD_UNLOCK(d); BPFIF_WUNLOCK(bp); bpf_bpfd_cnt++; CTR3(KTR_NET, "%s: bpf_attach called by pid %d, adding to %s list", __func__, d->bd_pid, d->bd_writer ? "writer" : "active"); if (op_w == 0) EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1); } /* * Check if we need to upgrade our descriptor @d from write-only mode. */ static int bpf_check_upgrade(u_long cmd, struct bpf_d *d, struct bpf_insn *fcode, int flen) { int is_snap, need_upgrade; /* * Check if we've already upgraded or new filter is empty. */ if (d->bd_writer == 0 || fcode == NULL) return (0); need_upgrade = 0; /* * Check if cmd looks like snaplen setting from * pcap_bpf.c:pcap_open_live(). * Note we're not checking .k value here: * while pcap_open_live() definitely sets to to non-zero value, * we'd prefer to treat k=0 (deny ALL) case the same way: e.g. * do not consider upgrading immediately */ if (cmd == BIOCSETF && flen == 1 && fcode[0].code == (BPF_RET | BPF_K)) is_snap = 1; else is_snap = 0; if (is_snap == 0) { /* * We're setting first filter and it doesn't look like * setting snaplen. We're probably using bpf directly. * Upgrade immediately. */ need_upgrade = 1; } else { /* * Do not require upgrade by first BIOCSETF * (used to set snaplen) by pcap_open_live(). */ if (--d->bd_writer == 0) { /* * First snaplen filter has already * been set. This is probably catch-all * filter */ need_upgrade = 1; } } CTR5(KTR_NET, "%s: filter function set by pid %d, " "bd_writer counter %d, snap %d upgrade %d", __func__, d->bd_pid, d->bd_writer, is_snap, need_upgrade); return (need_upgrade); } /* * Add d to the list of active bp filters. * Reuqires bpf_attachd() to be called before */ static void bpf_upgraded(struct bpf_d *d) { struct bpf_if *bp; BPF_LOCK_ASSERT(); bp = d->bd_bif; /* * Filter can be set several times without specifying interface. * Mark d as reader and exit. */ if (bp == NULL) { BPFD_LOCK(d); d->bd_writer = 0; BPFD_UNLOCK(d); return; } BPFIF_WLOCK(bp); BPFD_LOCK(d); /* Remove from writers-only list */ LIST_REMOVE(d, bd_next); LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next); /* Mark d as reader */ d->bd_writer = 0; BPFD_UNLOCK(d); BPFIF_WUNLOCK(bp); CTR2(KTR_NET, "%s: upgrade required by pid %d", __func__, d->bd_pid); EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1); } /* * Detach a file from its interface. */ static void bpf_detachd(struct bpf_d *d) { BPF_LOCK(); bpf_detachd_locked(d); BPF_UNLOCK(); } static void bpf_detachd_locked(struct bpf_d *d) { int error; struct bpf_if *bp; struct ifnet *ifp; CTR2(KTR_NET, "%s: detach required by pid %d", __func__, d->bd_pid); BPF_LOCK_ASSERT(); /* Check if descriptor is attached */ if ((bp = d->bd_bif) == NULL) return; BPFIF_WLOCK(bp); BPFD_LOCK(d); /* Save bd_writer value */ error = d->bd_writer; /* * Remove d from the interface's descriptor list. */ LIST_REMOVE(d, bd_next); ifp = bp->bif_ifp; d->bd_bif = NULL; BPFD_UNLOCK(d); BPFIF_WUNLOCK(bp); bpf_bpfd_cnt--; /* Call event handler iff d is attached */ if (error == 0) EVENTHANDLER_INVOKE(bpf_track, ifp, bp->bif_dlt, 0); /* * Check if this descriptor had requested promiscuous mode. * If so, turn it off. */ if (d->bd_promisc) { d->bd_promisc = 0; CURVNET_SET(ifp->if_vnet); error = ifpromisc(ifp, 0); CURVNET_RESTORE(); if (error != 0 && error != ENXIO) { /* * ENXIO can happen if a pccard is unplugged * Something is really wrong if we were able to put * the driver into promiscuous mode, but can't * take it out. */ if_printf(bp->bif_ifp, "bpf_detach: ifpromisc failed (%d)\n", error); } } } /* * Close the descriptor by detaching it from its interface, * deallocating its buffers, and marking it free. */ static void bpf_dtor(void *data) { struct bpf_d *d = data; BPFD_LOCK(d); if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); d->bd_state = BPF_IDLE; BPFD_UNLOCK(d); funsetown(&d->bd_sigio); bpf_detachd(d); #ifdef MAC mac_bpfdesc_destroy(d); #endif /* MAC */ seldrain(&d->bd_sel); knlist_destroy(&d->bd_sel.si_note); callout_drain(&d->bd_callout); bpf_freed(d); free(d, M_BPF); } /* * Open ethernet device. Returns ENXIO for illegal minor device number, * EBUSY if file is open by another process. */ /* ARGSUSED */ static int bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td) { struct bpf_d *d; int error, size; d = malloc(sizeof(*d), M_BPF, M_WAITOK | M_ZERO); error = devfs_set_cdevpriv(d, bpf_dtor); if (error != 0) { free(d, M_BPF); return (error); } /* * For historical reasons, perform a one-time initialization call to * the buffer routines, even though we're not yet committed to a * particular buffer method. */ bpf_buffer_init(d); d->bd_hbuf_in_use = 0; d->bd_bufmode = BPF_BUFMODE_BUFFER; d->bd_sig = SIGIO; d->bd_direction = BPF_D_INOUT; BPF_PID_REFRESH(d, td); #ifdef MAC mac_bpfdesc_init(d); mac_bpfdesc_create(td->td_ucred, d); #endif mtx_init(&d->bd_lock, devtoname(dev), "bpf cdev lock", MTX_DEF); callout_init_mtx(&d->bd_callout, &d->bd_lock, 0); knlist_init_mtx(&d->bd_sel.si_note, &d->bd_lock); /* Allocate default buffers */ size = d->bd_bufsize; bpf_buffer_ioctl_sblen(d, &size); return (0); } /* * bpfread - read next chunk of packets from buffers */ static int bpfread(struct cdev *dev, struct uio *uio, int ioflag) { struct bpf_d *d; int error; int non_block; int timed_out; error = devfs_get_cdevpriv((void **)&d); if (error != 0) return (error); /* * Restrict application to use a buffer the same size as * as kernel buffers. */ if (uio->uio_resid != d->bd_bufsize) return (EINVAL); non_block = ((ioflag & O_NONBLOCK) != 0); BPFD_LOCK(d); BPF_PID_REFRESH_CUR(d); if (d->bd_bufmode != BPF_BUFMODE_BUFFER) { BPFD_UNLOCK(d); return (EOPNOTSUPP); } if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); timed_out = (d->bd_state == BPF_TIMED_OUT); d->bd_state = BPF_IDLE; while (d->bd_hbuf_in_use) { error = mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, PRINET|PCATCH, "bd_hbuf", 0); if (error != 0) { BPFD_UNLOCK(d); return (error); } } /* * If the hold buffer is empty, then do a timed sleep, which * ends when the timeout expires or when enough packets * have arrived to fill the store buffer. */ while (d->bd_hbuf == NULL) { if (d->bd_slen != 0) { /* * A packet(s) either arrived since the previous * read or arrived while we were asleep. */ if (d->bd_immediate || non_block || timed_out) { /* * Rotate the buffers and return what's here * if we are in immediate mode, non-blocking * flag is set, or this descriptor timed out. */ ROTATE_BUFFERS(d); break; } } /* * No data is available, check to see if the bpf device * is still pointed at a real interface. If not, return * ENXIO so that the userland process knows to rebind * it before using it again. */ if (d->bd_bif == NULL) { BPFD_UNLOCK(d); return (ENXIO); } if (non_block) { BPFD_UNLOCK(d); return (EWOULDBLOCK); } error = msleep(d, &d->bd_lock, PRINET|PCATCH, "bpf", d->bd_rtout); if (error == EINTR || error == ERESTART) { BPFD_UNLOCK(d); return (error); } if (error == EWOULDBLOCK) { /* * On a timeout, return what's in the buffer, * which may be nothing. If there is something * in the store buffer, we can rotate the buffers. */ if (d->bd_hbuf) /* * We filled up the buffer in between * getting the timeout and arriving * here, so we don't need to rotate. */ break; if (d->bd_slen == 0) { BPFD_UNLOCK(d); return (0); } ROTATE_BUFFERS(d); break; } } /* * At this point, we know we have something in the hold slot. */ d->bd_hbuf_in_use = 1; BPFD_UNLOCK(d); /* * Move data from hold buffer into user space. * We know the entire buffer is transferred since * we checked above that the read buffer is bpf_bufsize bytes. * * We do not have to worry about simultaneous reads because * we waited for sole access to the hold buffer above. */ error = bpf_uiomove(d, d->bd_hbuf, d->bd_hlen, uio); BPFD_LOCK(d); KASSERT(d->bd_hbuf != NULL, ("bpfread: lost bd_hbuf")); d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; d->bd_hlen = 0; bpf_buf_reclaimed(d); d->bd_hbuf_in_use = 0; wakeup(&d->bd_hbuf_in_use); BPFD_UNLOCK(d); return (error); } /* * If there are processes sleeping on this descriptor, wake them up. */ static __inline void bpf_wakeup(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); if (d->bd_state == BPF_WAITING) { callout_stop(&d->bd_callout); d->bd_state = BPF_IDLE; } wakeup(d); if (d->bd_async && d->bd_sig && d->bd_sigio) pgsigio(&d->bd_sigio, d->bd_sig, 0); selwakeuppri(&d->bd_sel, PRINET); KNOTE_LOCKED(&d->bd_sel.si_note, 0); } static void bpf_timed_out(void *arg) { struct bpf_d *d = (struct bpf_d *)arg; BPFD_LOCK_ASSERT(d); if (callout_pending(&d->bd_callout) || !callout_active(&d->bd_callout)) return; if (d->bd_state == BPF_WAITING) { d->bd_state = BPF_TIMED_OUT; if (d->bd_slen != 0) bpf_wakeup(d); } } static int bpf_ready(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); if (!bpf_canfreebuf(d) && d->bd_hlen != 0) return (1); if ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) && d->bd_slen != 0) return (1); return (0); } static int bpfwrite(struct cdev *dev, struct uio *uio, int ioflag) { struct bpf_d *d; struct ifnet *ifp; struct mbuf *m, *mc; struct sockaddr dst; int error, hlen; error = devfs_get_cdevpriv((void **)&d); if (error != 0) return (error); BPF_PID_REFRESH_CUR(d); d->bd_wcount++; /* XXX: locking required */ if (d->bd_bif == NULL) { d->bd_wdcount++; return (ENXIO); } ifp = d->bd_bif->bif_ifp; if ((ifp->if_flags & IFF_UP) == 0) { d->bd_wdcount++; return (ENETDOWN); } if (uio->uio_resid == 0) { d->bd_wdcount++; return (0); } bzero(&dst, sizeof(dst)); m = NULL; hlen = 0; /* XXX: bpf_movein() can sleep */ error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, ifp, &m, &dst, &hlen, d->bd_wfilter); if (error) { d->bd_wdcount++; return (error); } d->bd_wfcount++; if (d->bd_hdrcmplt) dst.sa_family = pseudo_AF_HDRCMPLT; if (d->bd_feedback) { mc = m_dup(m, M_NOWAIT); if (mc != NULL) mc->m_pkthdr.rcvif = ifp; /* Set M_PROMISC for outgoing packets to be discarded. */ if (d->bd_direction == BPF_D_INOUT) m->m_flags |= M_PROMISC; } else mc = NULL; m->m_pkthdr.len -= hlen; m->m_len -= hlen; m->m_data += hlen; /* XXX */ CURVNET_SET(ifp->if_vnet); #ifdef MAC BPFD_LOCK(d); mac_bpfdesc_create_mbuf(d, m); if (mc != NULL) mac_bpfdesc_create_mbuf(d, mc); BPFD_UNLOCK(d); #endif error = (*ifp->if_output)(ifp, m, &dst, NULL); if (error) d->bd_wdcount++; if (mc != NULL) { if (error == 0) (*ifp->if_input)(ifp, mc); else m_freem(mc); } CURVNET_RESTORE(); return (error); } /* * Reset a descriptor by flushing its packet buffer and clearing the receive * and drop counts. This is doable for kernel-only buffers, but with * zero-copy buffers, we can't write to (or rotate) buffers that are * currently owned by userspace. It would be nice if we could encapsulate * this logic in the buffer code rather than here. */ static void reset_d(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); while (d->bd_hbuf_in_use) mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, PRINET, "bd_hbuf", 0); if ((d->bd_hbuf != NULL) && (d->bd_bufmode != BPF_BUFMODE_ZBUF || bpf_canfreebuf(d))) { /* Free the hold buffer. */ d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; d->bd_hlen = 0; bpf_buf_reclaimed(d); } if (bpf_canwritebuf(d)) d->bd_slen = 0; d->bd_rcount = 0; d->bd_dcount = 0; d->bd_fcount = 0; d->bd_wcount = 0; d->bd_wfcount = 0; d->bd_wdcount = 0; d->bd_zcopy = 0; } /* * FIONREAD Check for read packet available. * SIOCGIFADDR Get interface address - convenient hook to driver. * BIOCGBLEN Get buffer len [for read()]. * BIOCSETF Set read filter. * BIOCSETFNR Set read filter without resetting descriptor. * BIOCSETWF Set write filter. * BIOCFLUSH Flush read packet buffer. * BIOCPROMISC Put interface into promiscuous mode. * BIOCGDLT Get link layer type. * BIOCGETIF Get interface name. * BIOCSETIF Set interface. * BIOCSRTIMEOUT Set read timeout. * BIOCGRTIMEOUT Get read timeout. * BIOCGSTATS Get packet stats. * BIOCIMMEDIATE Set immediate mode. * BIOCVERSION Get filter language version. * BIOCGHDRCMPLT Get "header already complete" flag * BIOCSHDRCMPLT Set "header already complete" flag * BIOCGDIRECTION Get packet direction flag * BIOCSDIRECTION Set packet direction flag * BIOCGTSTAMP Get time stamp format and resolution. * BIOCSTSTAMP Set time stamp format and resolution. * BIOCLOCK Set "locked" flag * BIOCFEEDBACK Set packet feedback mode. * BIOCSETZBUF Set current zero-copy buffer locations. * BIOCGETZMAX Get maximum zero-copy buffer size. * BIOCROTZBUF Force rotation of zero-copy buffer * BIOCSETBUFMODE Set buffer mode. * BIOCGETBUFMODE Get current buffer mode. */ /* ARGSUSED */ static int bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td) { struct bpf_d *d; int error; error = devfs_get_cdevpriv((void **)&d); if (error != 0) return (error); /* * Refresh PID associated with this descriptor. */ BPFD_LOCK(d); BPF_PID_REFRESH(d, td); if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); d->bd_state = BPF_IDLE; BPFD_UNLOCK(d); if (d->bd_locked == 1) { switch (cmd) { case BIOCGBLEN: case BIOCFLUSH: case BIOCGDLT: case BIOCGDLTLIST: #ifdef COMPAT_FREEBSD32 case BIOCGDLTLIST32: #endif case BIOCGETIF: case BIOCGRTIMEOUT: #if defined(COMPAT_FREEBSD32) && !defined(__mips__) case BIOCGRTIMEOUT32: #endif case BIOCGSTATS: case BIOCVERSION: case BIOCGRSIG: case BIOCGHDRCMPLT: case BIOCSTSTAMP: case BIOCFEEDBACK: case FIONREAD: case BIOCLOCK: case BIOCSRTIMEOUT: #if defined(COMPAT_FREEBSD32) && !defined(__mips__) case BIOCSRTIMEOUT32: #endif case BIOCIMMEDIATE: case TIOCGPGRP: case BIOCROTZBUF: break; default: return (EPERM); } } #ifdef COMPAT_FREEBSD32 /* * If we see a 32-bit compat ioctl, mark the stream as 32-bit so * that it will get 32-bit packet headers. */ switch (cmd) { case BIOCSETF32: case BIOCSETFNR32: case BIOCSETWF32: case BIOCGDLTLIST32: case BIOCGRTIMEOUT32: case BIOCSRTIMEOUT32: BPFD_LOCK(d); d->bd_compat32 = 1; BPFD_UNLOCK(d); } #endif CURVNET_SET(TD_TO_VNET(td)); switch (cmd) { default: error = EINVAL; break; /* * Check for read packet available. */ case FIONREAD: { int n; BPFD_LOCK(d); n = d->bd_slen; while (d->bd_hbuf_in_use) mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, PRINET, "bd_hbuf", 0); if (d->bd_hbuf) n += d->bd_hlen; BPFD_UNLOCK(d); *(int *)addr = n; break; } case SIOCGIFADDR: { struct ifnet *ifp; if (d->bd_bif == NULL) error = EINVAL; else { ifp = d->bd_bif->bif_ifp; error = (*ifp->if_ioctl)(ifp, cmd, addr); } break; } /* * Get buffer len [for read()]. */ case BIOCGBLEN: BPFD_LOCK(d); *(u_int *)addr = d->bd_bufsize; BPFD_UNLOCK(d); break; /* * Set buffer length. */ case BIOCSBLEN: error = bpf_ioctl_sblen(d, (u_int *)addr); break; /* * Set link layer read filter. */ case BIOCSETF: case BIOCSETFNR: case BIOCSETWF: #ifdef COMPAT_FREEBSD32 case BIOCSETF32: case BIOCSETFNR32: case BIOCSETWF32: #endif error = bpf_setf(d, (struct bpf_program *)addr, cmd); break; /* * Flush read packet buffer. */ case BIOCFLUSH: BPFD_LOCK(d); reset_d(d); BPFD_UNLOCK(d); break; /* * Put interface into promiscuous mode. */ case BIOCPROMISC: if (d->bd_bif == NULL) { /* * No interface attached yet. */ error = EINVAL; break; } if (d->bd_promisc == 0) { error = ifpromisc(d->bd_bif->bif_ifp, 1); if (error == 0) d->bd_promisc = 1; } break; /* * Get current data link type. */ case BIOCGDLT: BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else *(u_int *)addr = d->bd_bif->bif_dlt; BPF_UNLOCK(); break; /* * Get a list of supported data link types. */ #ifdef COMPAT_FREEBSD32 case BIOCGDLTLIST32: { struct bpf_dltlist32 *list32; struct bpf_dltlist dltlist; list32 = (struct bpf_dltlist32 *)addr; dltlist.bfl_len = list32->bfl_len; dltlist.bfl_list = PTRIN(list32->bfl_list); BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else { error = bpf_getdltlist(d, &dltlist); if (error == 0) list32->bfl_len = dltlist.bfl_len; } BPF_UNLOCK(); break; } #endif case BIOCGDLTLIST: BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else error = bpf_getdltlist(d, (struct bpf_dltlist *)addr); BPF_UNLOCK(); break; /* * Set data link type. */ case BIOCSDLT: BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else error = bpf_setdlt(d, *(u_int *)addr); BPF_UNLOCK(); break; /* * Get interface name. */ case BIOCGETIF: BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else { struct ifnet *const ifp = d->bd_bif->bif_ifp; struct ifreq *const ifr = (struct ifreq *)addr; strlcpy(ifr->ifr_name, ifp->if_xname, sizeof(ifr->ifr_name)); } BPF_UNLOCK(); break; /* * Set interface. */ case BIOCSETIF: BPF_LOCK(); error = bpf_setif(d, (struct ifreq *)addr); BPF_UNLOCK(); break; /* * Set read timeout. */ case BIOCSRTIMEOUT: #if defined(COMPAT_FREEBSD32) && !defined(__mips__) case BIOCSRTIMEOUT32: #endif { struct timeval *tv = (struct timeval *)addr; #if defined(COMPAT_FREEBSD32) && !defined(__mips__) struct timeval32 *tv32; struct timeval tv64; if (cmd == BIOCSRTIMEOUT32) { tv32 = (struct timeval32 *)addr; tv = &tv64; tv->tv_sec = tv32->tv_sec; tv->tv_usec = tv32->tv_usec; } else #endif tv = (struct timeval *)addr; /* * Subtract 1 tick from tvtohz() since this isn't * a one-shot timer. */ if ((error = itimerfix(tv)) == 0) d->bd_rtout = tvtohz(tv) - 1; break; } /* * Get read timeout. */ case BIOCGRTIMEOUT: #if defined(COMPAT_FREEBSD32) && !defined(__mips__) case BIOCGRTIMEOUT32: #endif { struct timeval *tv; #if defined(COMPAT_FREEBSD32) && !defined(__mips__) struct timeval32 *tv32; struct timeval tv64; if (cmd == BIOCGRTIMEOUT32) tv = &tv64; else #endif tv = (struct timeval *)addr; tv->tv_sec = d->bd_rtout / hz; tv->tv_usec = (d->bd_rtout % hz) * tick; #if defined(COMPAT_FREEBSD32) && !defined(__mips__) if (cmd == BIOCGRTIMEOUT32) { tv32 = (struct timeval32 *)addr; tv32->tv_sec = tv->tv_sec; tv32->tv_usec = tv->tv_usec; } #endif break; } /* * Get packet stats. */ case BIOCGSTATS: { struct bpf_stat *bs = (struct bpf_stat *)addr; /* XXXCSJP overflow */ bs->bs_recv = d->bd_rcount; bs->bs_drop = d->bd_dcount; break; } /* * Set immediate mode. */ case BIOCIMMEDIATE: BPFD_LOCK(d); d->bd_immediate = *(u_int *)addr; BPFD_UNLOCK(d); break; case BIOCVERSION: { struct bpf_version *bv = (struct bpf_version *)addr; bv->bv_major = BPF_MAJOR_VERSION; bv->bv_minor = BPF_MINOR_VERSION; break; } /* * Get "header already complete" flag */ case BIOCGHDRCMPLT: BPFD_LOCK(d); *(u_int *)addr = d->bd_hdrcmplt; BPFD_UNLOCK(d); break; /* * Set "header already complete" flag */ case BIOCSHDRCMPLT: BPFD_LOCK(d); d->bd_hdrcmplt = *(u_int *)addr ? 1 : 0; BPFD_UNLOCK(d); break; /* * Get packet direction flag */ case BIOCGDIRECTION: BPFD_LOCK(d); *(u_int *)addr = d->bd_direction; BPFD_UNLOCK(d); break; /* * Set packet direction flag */ case BIOCSDIRECTION: { u_int direction; direction = *(u_int *)addr; switch (direction) { case BPF_D_IN: case BPF_D_INOUT: case BPF_D_OUT: BPFD_LOCK(d); d->bd_direction = direction; BPFD_UNLOCK(d); break; default: error = EINVAL; } } break; /* * Get packet timestamp format and resolution. */ case BIOCGTSTAMP: BPFD_LOCK(d); *(u_int *)addr = d->bd_tstamp; BPFD_UNLOCK(d); break; /* * Set packet timestamp format and resolution. */ case BIOCSTSTAMP: { u_int func; func = *(u_int *)addr; if (BPF_T_VALID(func)) d->bd_tstamp = func; else error = EINVAL; } break; case BIOCFEEDBACK: BPFD_LOCK(d); d->bd_feedback = *(u_int *)addr; BPFD_UNLOCK(d); break; case BIOCLOCK: BPFD_LOCK(d); d->bd_locked = 1; BPFD_UNLOCK(d); break; case FIONBIO: /* Non-blocking I/O */ break; case FIOASYNC: /* Send signal on receive packets */ BPFD_LOCK(d); d->bd_async = *(int *)addr; BPFD_UNLOCK(d); break; case FIOSETOWN: /* * XXX: Add some sort of locking here? * fsetown() can sleep. */ error = fsetown(*(int *)addr, &d->bd_sigio); break; case FIOGETOWN: BPFD_LOCK(d); *(int *)addr = fgetown(&d->bd_sigio); BPFD_UNLOCK(d); break; /* This is deprecated, FIOSETOWN should be used instead. */ case TIOCSPGRP: error = fsetown(-(*(int *)addr), &d->bd_sigio); break; /* This is deprecated, FIOGETOWN should be used instead. */ case TIOCGPGRP: *(int *)addr = -fgetown(&d->bd_sigio); break; case BIOCSRSIG: /* Set receive signal */ { u_int sig; sig = *(u_int *)addr; if (sig >= NSIG) error = EINVAL; else { BPFD_LOCK(d); d->bd_sig = sig; BPFD_UNLOCK(d); } break; } case BIOCGRSIG: BPFD_LOCK(d); *(u_int *)addr = d->bd_sig; BPFD_UNLOCK(d); break; case BIOCGETBUFMODE: BPFD_LOCK(d); *(u_int *)addr = d->bd_bufmode; BPFD_UNLOCK(d); break; case BIOCSETBUFMODE: /* * Allow the buffering mode to be changed as long as we * haven't yet committed to a particular mode. Our * definition of commitment, for now, is whether or not a * buffer has been allocated or an interface attached, since * that's the point where things get tricky. */ switch (*(u_int *)addr) { case BPF_BUFMODE_BUFFER: break; case BPF_BUFMODE_ZBUF: if (bpf_zerocopy_enable) break; /* FALLSTHROUGH */ default: CURVNET_RESTORE(); return (EINVAL); } BPFD_LOCK(d); if (d->bd_sbuf != NULL || d->bd_hbuf != NULL || d->bd_fbuf != NULL || d->bd_bif != NULL) { BPFD_UNLOCK(d); CURVNET_RESTORE(); return (EBUSY); } d->bd_bufmode = *(u_int *)addr; BPFD_UNLOCK(d); break; case BIOCGETZMAX: error = bpf_ioctl_getzmax(td, d, (size_t *)addr); break; case BIOCSETZBUF: error = bpf_ioctl_setzbuf(td, d, (struct bpf_zbuf *)addr); break; case BIOCROTZBUF: error = bpf_ioctl_rotzbuf(td, d, (struct bpf_zbuf *)addr); break; } CURVNET_RESTORE(); return (error); } /* * Set d's packet filter program to fp. If this file already has a filter, * free it and replace it. Returns EINVAL for bogus requests. * * Note we need global lock here to serialize bpf_setf() and bpf_setif() calls * since reading d->bd_bif can't be protected by d or interface lock due to * lock order. * * Additionally, we have to acquire interface write lock due to bpf_mtap() uses * interface read lock to read all filers. * */ static int bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd) { #ifdef COMPAT_FREEBSD32 struct bpf_program fp_swab; struct bpf_program32 *fp32; #endif struct bpf_insn *fcode, *old; #ifdef BPF_JITTER bpf_jit_filter *jfunc, *ofunc; #endif size_t size; u_int flen; int need_upgrade; #ifdef COMPAT_FREEBSD32 switch (cmd) { case BIOCSETF32: case BIOCSETWF32: case BIOCSETFNR32: fp32 = (struct bpf_program32 *)fp; fp_swab.bf_len = fp32->bf_len; fp_swab.bf_insns = (struct bpf_insn *)(uintptr_t)fp32->bf_insns; fp = &fp_swab; switch (cmd) { case BIOCSETF32: cmd = BIOCSETF; break; case BIOCSETWF32: cmd = BIOCSETWF; break; } break; } #endif fcode = NULL; #ifdef BPF_JITTER jfunc = ofunc = NULL; #endif need_upgrade = 0; /* * Check new filter validness before acquiring any locks. * Allocate memory for new filter, if needed. */ flen = fp->bf_len; if (flen > bpf_maxinsns || (fp->bf_insns == NULL && flen != 0)) return (EINVAL); size = flen * sizeof(*fp->bf_insns); if (size > 0) { /* We're setting up new filter. Copy and check actual data. */ fcode = malloc(size, M_BPF, M_WAITOK); if (copyin(fp->bf_insns, fcode, size) != 0 || !bpf_validate(fcode, flen)) { free(fcode, M_BPF); return (EINVAL); } #ifdef BPF_JITTER /* Filter is copied inside fcode and is perfectly valid. */ jfunc = bpf_jitter(fcode, flen); #endif } BPF_LOCK(); /* * Set up new filter. * Protect filter change by interface lock. * Additionally, we are protected by global lock here. */ if (d->bd_bif != NULL) BPFIF_WLOCK(d->bd_bif); BPFD_LOCK(d); if (cmd == BIOCSETWF) { old = d->bd_wfilter; d->bd_wfilter = fcode; } else { old = d->bd_rfilter; d->bd_rfilter = fcode; #ifdef BPF_JITTER ofunc = d->bd_bfilter; d->bd_bfilter = jfunc; #endif if (cmd == BIOCSETF) reset_d(d); need_upgrade = bpf_check_upgrade(cmd, d, fcode, flen); } BPFD_UNLOCK(d); if (d->bd_bif != NULL) BPFIF_WUNLOCK(d->bd_bif); if (old != NULL) free(old, M_BPF); #ifdef BPF_JITTER if (ofunc != NULL) bpf_destroy_jit_filter(ofunc); #endif /* Move d to active readers list. */ if (need_upgrade != 0) bpf_upgraded(d); BPF_UNLOCK(); return (0); } /* * Detach a file from its current interface (if attached at all) and attach * to the interface indicated by the name stored in ifr. * Return an errno or 0. */ static int bpf_setif(struct bpf_d *d, struct ifreq *ifr) { struct bpf_if *bp; struct ifnet *theywant; BPF_LOCK_ASSERT(); theywant = ifunit(ifr->ifr_name); if (theywant == NULL || theywant->if_bpf == NULL) return (ENXIO); bp = theywant->if_bpf; /* Check if interface is not being detached from BPF */ BPFIF_RLOCK(bp); if (bp->flags & BPFIF_FLAG_DYING) { BPFIF_RUNLOCK(bp); return (ENXIO); } BPFIF_RUNLOCK(bp); /* * Behavior here depends on the buffering model. If we're using * kernel memory buffers, then we can allocate them here. If we're * using zero-copy, then the user process must have registered * buffers by the time we get here. If not, return an error. */ switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: case BPF_BUFMODE_ZBUF: if (d->bd_sbuf == NULL) return (EINVAL); break; default: panic("bpf_setif: bufmode %d", d->bd_bufmode); } if (bp != d->bd_bif) bpf_attachd(d, bp); BPFD_LOCK(d); reset_d(d); BPFD_UNLOCK(d); return (0); } /* * Support for select() and poll() system calls * * Return true iff the specific operation will not block indefinitely. * Otherwise, return false but make a note that a selwakeup() must be done. */ static int bpfpoll(struct cdev *dev, int events, struct thread *td) { struct bpf_d *d; int revents; if (devfs_get_cdevpriv((void **)&d) != 0 || d->bd_bif == NULL) return (events & (POLLHUP|POLLIN|POLLRDNORM|POLLOUT|POLLWRNORM)); /* * Refresh PID associated with this descriptor. */ revents = events & (POLLOUT | POLLWRNORM); BPFD_LOCK(d); BPF_PID_REFRESH(d, td); if (events & (POLLIN | POLLRDNORM)) { if (bpf_ready(d)) revents |= events & (POLLIN | POLLRDNORM); else { selrecord(td, &d->bd_sel); /* Start the read timeout if necessary. */ if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) { callout_reset(&d->bd_callout, d->bd_rtout, bpf_timed_out, d); d->bd_state = BPF_WAITING; } } } BPFD_UNLOCK(d); return (revents); } /* * Support for kevent() system call. Register EVFILT_READ filters and * reject all others. */ int bpfkqfilter(struct cdev *dev, struct knote *kn) { struct bpf_d *d; if (devfs_get_cdevpriv((void **)&d) != 0 || kn->kn_filter != EVFILT_READ) return (1); /* * Refresh PID associated with this descriptor. */ BPFD_LOCK(d); BPF_PID_REFRESH_CUR(d); kn->kn_fop = &bpfread_filtops; kn->kn_hook = d; knlist_add(&d->bd_sel.si_note, kn, 1); BPFD_UNLOCK(d); return (0); } static void filt_bpfdetach(struct knote *kn) { struct bpf_d *d = (struct bpf_d *)kn->kn_hook; knlist_remove(&d->bd_sel.si_note, kn, 0); } static int filt_bpfread(struct knote *kn, long hint) { struct bpf_d *d = (struct bpf_d *)kn->kn_hook; int ready; BPFD_LOCK_ASSERT(d); ready = bpf_ready(d); if (ready) { kn->kn_data = d->bd_slen; while (d->bd_hbuf_in_use) mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, PRINET, "bd_hbuf", 0); if (d->bd_hbuf) kn->kn_data += d->bd_hlen; } else if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) { callout_reset(&d->bd_callout, d->bd_rtout, bpf_timed_out, d); d->bd_state = BPF_WAITING; } return (ready); } #define BPF_TSTAMP_NONE 0 #define BPF_TSTAMP_FAST 1 #define BPF_TSTAMP_NORMAL 2 #define BPF_TSTAMP_EXTERN 3 static int bpf_ts_quality(int tstype) { if (tstype == BPF_T_NONE) return (BPF_TSTAMP_NONE); if ((tstype & BPF_T_FAST) != 0) return (BPF_TSTAMP_FAST); return (BPF_TSTAMP_NORMAL); } static int bpf_gettime(struct bintime *bt, int tstype, struct mbuf *m) { struct m_tag *tag; int quality; quality = bpf_ts_quality(tstype); if (quality == BPF_TSTAMP_NONE) return (quality); if (m != NULL) { tag = m_tag_locate(m, MTAG_BPF, MTAG_BPF_TIMESTAMP, NULL); if (tag != NULL) { *bt = *(struct bintime *)(tag + 1); return (BPF_TSTAMP_EXTERN); } } if (quality == BPF_TSTAMP_NORMAL) binuptime(bt); else getbinuptime(bt); return (quality); } /* * Incoming linkage from device drivers. Process the packet pkt, of length * pktlen, which is stored in a contiguous buffer. The packet is parsed * by each process' filter, and if accepted, stashed into the corresponding * buffer. */ void bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) { struct bintime bt; struct bpf_d *d; #ifdef BPF_JITTER bpf_jit_filter *bf; #endif u_int slen; int gottime; gottime = BPF_TSTAMP_NONE; BPFIF_RLOCK(bp); LIST_FOREACH(d, &bp->bif_dlist, bd_next) { /* * We are not using any locks for d here because: * 1) any filter change is protected by interface * write lock * 2) destroying/detaching d is protected by interface * write lock, too */ /* XXX: Do not protect counter for the sake of performance. */ ++d->bd_rcount; /* * NB: We dont call BPF_CHECK_DIRECTION() here since there is no * way for the caller to indiciate to us whether this packet * is inbound or outbound. In the bpf_mtap() routines, we use * the interface pointers on the mbuf to figure it out. */ #ifdef BPF_JITTER bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL; if (bf != NULL) slen = (*(bf->func))(pkt, pktlen, pktlen); else #endif slen = bpf_filter(d->bd_rfilter, pkt, pktlen, pktlen); if (slen != 0) { /* * Filter matches. Let's to acquire write lock. */ BPFD_LOCK(d); d->bd_fcount++; if (gottime < bpf_ts_quality(d->bd_tstamp)) gottime = bpf_gettime(&bt, d->bd_tstamp, NULL); #ifdef MAC if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0) #endif catchpacket(d, pkt, pktlen, slen, bpf_append_bytes, &bt); BPFD_UNLOCK(d); } } BPFIF_RUNLOCK(bp); } #define BPF_CHECK_DIRECTION(d, r, i) \ (((d)->bd_direction == BPF_D_IN && (r) != (i)) || \ ((d)->bd_direction == BPF_D_OUT && (r) == (i))) /* * Incoming linkage from device drivers, when packet is in an mbuf chain. * Locking model is explained in bpf_tap(). */ void bpf_mtap(struct bpf_if *bp, struct mbuf *m) { struct bintime bt; struct bpf_d *d; #ifdef BPF_JITTER bpf_jit_filter *bf; #endif u_int pktlen, slen; int gottime; /* Skip outgoing duplicate packets. */ if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) { m->m_flags &= ~M_PROMISC; return; } pktlen = m_length(m, NULL); gottime = BPF_TSTAMP_NONE; BPFIF_RLOCK(bp); LIST_FOREACH(d, &bp->bif_dlist, bd_next) { if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp)) continue; ++d->bd_rcount; #ifdef BPF_JITTER bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL; /* XXX We cannot handle multiple mbufs. */ if (bf != NULL && m->m_next == NULL) slen = (*(bf->func))(mtod(m, u_char *), pktlen, pktlen); else #endif slen = bpf_filter(d->bd_rfilter, (u_char *)m, pktlen, 0); if (slen != 0) { BPFD_LOCK(d); d->bd_fcount++; if (gottime < bpf_ts_quality(d->bd_tstamp)) gottime = bpf_gettime(&bt, d->bd_tstamp, m); #ifdef MAC if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0) #endif catchpacket(d, (u_char *)m, pktlen, slen, bpf_append_mbuf, &bt); BPFD_UNLOCK(d); } } BPFIF_RUNLOCK(bp); } /* * Incoming linkage from device drivers, when packet is in * an mbuf chain and to be prepended by a contiguous header. */ void bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m) { struct bintime bt; struct mbuf mb; struct bpf_d *d; u_int pktlen, slen; int gottime; /* Skip outgoing duplicate packets. */ if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) { m->m_flags &= ~M_PROMISC; return; } pktlen = m_length(m, NULL); /* * Craft on-stack mbuf suitable for passing to bpf_filter. * Note that we cut corners here; we only setup what's * absolutely needed--this mbuf should never go anywhere else. */ mb.m_next = m; mb.m_data = data; mb.m_len = dlen; pktlen += dlen; gottime = BPF_TSTAMP_NONE; BPFIF_RLOCK(bp); LIST_FOREACH(d, &bp->bif_dlist, bd_next) { if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp)) continue; ++d->bd_rcount; slen = bpf_filter(d->bd_rfilter, (u_char *)&mb, pktlen, 0); if (slen != 0) { BPFD_LOCK(d); d->bd_fcount++; if (gottime < bpf_ts_quality(d->bd_tstamp)) gottime = bpf_gettime(&bt, d->bd_tstamp, m); #ifdef MAC if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0) #endif catchpacket(d, (u_char *)&mb, pktlen, slen, bpf_append_mbuf, &bt); BPFD_UNLOCK(d); } } BPFIF_RUNLOCK(bp); } #undef BPF_CHECK_DIRECTION #undef BPF_TSTAMP_NONE #undef BPF_TSTAMP_FAST #undef BPF_TSTAMP_NORMAL #undef BPF_TSTAMP_EXTERN static int bpf_hdrlen(struct bpf_d *d) { int hdrlen; hdrlen = d->bd_bif->bif_hdrlen; #ifndef BURN_BRIDGES if (d->bd_tstamp == BPF_T_NONE || BPF_T_FORMAT(d->bd_tstamp) == BPF_T_MICROTIME) #ifdef COMPAT_FREEBSD32 if (d->bd_compat32) hdrlen += SIZEOF_BPF_HDR(struct bpf_hdr32); else #endif hdrlen += SIZEOF_BPF_HDR(struct bpf_hdr); else #endif hdrlen += SIZEOF_BPF_HDR(struct bpf_xhdr); #ifdef COMPAT_FREEBSD32 if (d->bd_compat32) hdrlen = BPF_WORDALIGN32(hdrlen); else #endif hdrlen = BPF_WORDALIGN(hdrlen); return (hdrlen - d->bd_bif->bif_hdrlen); } static void bpf_bintime2ts(struct bintime *bt, struct bpf_ts *ts, int tstype) { struct bintime bt2; struct timeval tsm; struct timespec tsn; if ((tstype & BPF_T_MONOTONIC) == 0) { bt2 = *bt; bintime_add(&bt2, &boottimebin); bt = &bt2; } switch (BPF_T_FORMAT(tstype)) { case BPF_T_MICROTIME: bintime2timeval(bt, &tsm); ts->bt_sec = tsm.tv_sec; ts->bt_frac = tsm.tv_usec; break; case BPF_T_NANOTIME: bintime2timespec(bt, &tsn); ts->bt_sec = tsn.tv_sec; ts->bt_frac = tsn.tv_nsec; break; case BPF_T_BINTIME: ts->bt_sec = bt->sec; ts->bt_frac = bt->frac; break; } } /* * Move the packet data from interface memory (pkt) into the * store buffer. "cpfn" is the routine called to do the actual data * transfer. bcopy is passed in to copy contiguous chunks, while * bpf_append_mbuf is passed in to copy mbuf chains. In the latter case, * pkt is really an mbuf. */ static void catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen, void (*cpfn)(struct bpf_d *, caddr_t, u_int, void *, u_int), struct bintime *bt) { struct bpf_xhdr hdr; #ifndef BURN_BRIDGES struct bpf_hdr hdr_old; #ifdef COMPAT_FREEBSD32 struct bpf_hdr32 hdr32_old; #endif #endif int caplen, curlen, hdrlen, totlen; int do_wakeup = 0; int do_timestamp; int tstype; BPFD_LOCK_ASSERT(d); /* * Detect whether user space has released a buffer back to us, and if * so, move it from being a hold buffer to a free buffer. This may * not be the best place to do it (for example, we might only want to * run this check if we need the space), but for now it's a reliable * spot to do it. */ if (d->bd_fbuf == NULL && bpf_canfreebuf(d)) { while (d->bd_hbuf_in_use) mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, PRINET, "bd_hbuf", 0); d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; d->bd_hlen = 0; bpf_buf_reclaimed(d); } /* * Figure out how many bytes to move. If the packet is * greater or equal to the snapshot length, transfer that * much. Otherwise, transfer the whole packet (unless * we hit the buffer size limit). */ hdrlen = bpf_hdrlen(d); totlen = hdrlen + min(snaplen, pktlen); if (totlen > d->bd_bufsize) totlen = d->bd_bufsize; /* * Round up the end of the previous packet to the next longword. * * Drop the packet if there's no room and no hope of room * If the packet would overflow the storage buffer or the storage * buffer is considered immutable by the buffer model, try to rotate * the buffer and wakeup pending processes. */ #ifdef COMPAT_FREEBSD32 if (d->bd_compat32) curlen = BPF_WORDALIGN32(d->bd_slen); else #endif curlen = BPF_WORDALIGN(d->bd_slen); if (curlen + totlen > d->bd_bufsize || !bpf_canwritebuf(d)) { if (d->bd_fbuf == NULL) { /* * There's no room in the store buffer, and no * prospect of room, so drop the packet. Notify the * buffer model. */ bpf_buffull(d); ++d->bd_dcount; return; } while (d->bd_hbuf_in_use) mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, PRINET, "bd_hbuf", 0); ROTATE_BUFFERS(d); do_wakeup = 1; curlen = 0; } else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) /* * Immediate mode is set, or the read timeout has already * expired during a select call. A packet arrived, so the * reader should be woken up. */ do_wakeup = 1; caplen = totlen - hdrlen; tstype = d->bd_tstamp; do_timestamp = tstype != BPF_T_NONE; #ifndef BURN_BRIDGES if (tstype == BPF_T_NONE || BPF_T_FORMAT(tstype) == BPF_T_MICROTIME) { struct bpf_ts ts; if (do_timestamp) bpf_bintime2ts(bt, &ts, tstype); #ifdef COMPAT_FREEBSD32 if (d->bd_compat32) { bzero(&hdr32_old, sizeof(hdr32_old)); if (do_timestamp) { hdr32_old.bh_tstamp.tv_sec = ts.bt_sec; hdr32_old.bh_tstamp.tv_usec = ts.bt_frac; } hdr32_old.bh_datalen = pktlen; hdr32_old.bh_hdrlen = hdrlen; hdr32_old.bh_caplen = caplen; bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr32_old, sizeof(hdr32_old)); goto copy; } #endif bzero(&hdr_old, sizeof(hdr_old)); if (do_timestamp) { hdr_old.bh_tstamp.tv_sec = ts.bt_sec; hdr_old.bh_tstamp.tv_usec = ts.bt_frac; } hdr_old.bh_datalen = pktlen; hdr_old.bh_hdrlen = hdrlen; hdr_old.bh_caplen = caplen; bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr_old, sizeof(hdr_old)); goto copy; } #endif /* * Append the bpf header. Note we append the actual header size, but * move forward the length of the header plus padding. */ bzero(&hdr, sizeof(hdr)); if (do_timestamp) bpf_bintime2ts(bt, &hdr.bh_tstamp, tstype); hdr.bh_datalen = pktlen; hdr.bh_hdrlen = hdrlen; hdr.bh_caplen = caplen; bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr, sizeof(hdr)); /* * Copy the packet data into the store buffer and update its length. */ #ifndef BURN_BRIDGES copy: #endif (*cpfn)(d, d->bd_sbuf, curlen + hdrlen, pkt, caplen); d->bd_slen = curlen + totlen; if (do_wakeup) bpf_wakeup(d); } /* * Free buffers currently in use by a descriptor. * Called on close. */ static void bpf_freed(struct bpf_d *d) { /* * We don't need to lock out interrupts since this descriptor has * been detached from its interface and it yet hasn't been marked * free. */ bpf_free(d); if (d->bd_rfilter != NULL) { free((caddr_t)d->bd_rfilter, M_BPF); #ifdef BPF_JITTER if (d->bd_bfilter != NULL) bpf_destroy_jit_filter(d->bd_bfilter); #endif } if (d->bd_wfilter != NULL) free((caddr_t)d->bd_wfilter, M_BPF); mtx_destroy(&d->bd_lock); } /* * Attach an interface to bpf. dlt is the link layer type; hdrlen is the * fixed size of the link header (variable length headers not yet supported). */ void bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen) { bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf); } /* * Attach an interface to bpf. ifp is a pointer to the structure * defining the interface to be attached, dlt is the link layer type, * and hdrlen is the fixed size of the link header (variable length * headers are not yet supporrted). */ void bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp) { struct bpf_if *bp; bp = malloc(sizeof(*bp), M_BPF, M_NOWAIT | M_ZERO); if (bp == NULL) panic("bpfattach"); LIST_INIT(&bp->bif_dlist); LIST_INIT(&bp->bif_wlist); bp->bif_ifp = ifp; bp->bif_dlt = dlt; rw_init(&bp->bif_lock, "bpf interface lock"); KASSERT(*driverp == NULL, ("bpfattach2: driverp already initialized")); *driverp = bp; BPF_LOCK(); LIST_INSERT_HEAD(&bpf_iflist, bp, bif_next); BPF_UNLOCK(); bp->bif_hdrlen = hdrlen; if (bootverbose) if_printf(ifp, "bpf attached\n"); } /* * Detach bpf from an interface. This involves detaching each descriptor * associated with the interface. Notify each descriptor as it's detached * so that any sleepers wake up and get ENXIO. */ void bpfdetach(struct ifnet *ifp) { struct bpf_if *bp, *bp_temp; struct bpf_d *d; int ndetached; ndetached = 0; BPF_LOCK(); /* Find all bpf_if struct's which reference ifp and detach them. */ LIST_FOREACH_SAFE(bp, &bpf_iflist, bif_next, bp_temp) { if (ifp != bp->bif_ifp) continue; LIST_REMOVE(bp, bif_next); /* Add to to-be-freed list */ LIST_INSERT_HEAD(&bpf_freelist, bp, bif_next); ndetached++; /* * Delay freeing bp till interface is detached * and all routes through this interface are removed. * Mark bp as detached to restrict new consumers. */ BPFIF_WLOCK(bp); bp->flags |= BPFIF_FLAG_DYING; BPFIF_WUNLOCK(bp); CTR4(KTR_NET, "%s: sheduling free for encap %d (%p) for if %p", __func__, bp->bif_dlt, bp, ifp); /* Free common descriptors */ while ((d = LIST_FIRST(&bp->bif_dlist)) != NULL) { bpf_detachd_locked(d); BPFD_LOCK(d); bpf_wakeup(d); BPFD_UNLOCK(d); } /* Free writer-only descriptors */ while ((d = LIST_FIRST(&bp->bif_wlist)) != NULL) { bpf_detachd_locked(d); BPFD_LOCK(d); bpf_wakeup(d); BPFD_UNLOCK(d); } } BPF_UNLOCK(); #ifdef INVARIANTS if (ndetached == 0) printf("bpfdetach: %s was not attached\n", ifp->if_xname); #endif } /* * Interface departure handler. * Note departure event does not guarantee interface is going down. * Interface renaming is currently done via departure/arrival event set. * * Departure handled is called after all routes pointing to * given interface are removed and interface is in down state * restricting any packets to be sent/received. We assume it is now safe * to free data allocated by BPF. */ static void bpf_ifdetach(void *arg __unused, struct ifnet *ifp) { struct bpf_if *bp, *bp_temp; int nmatched = 0; BPF_LOCK(); /* * Find matching entries in free list. * Nothing should be found if bpfdetach() was not called. */ LIST_FOREACH_SAFE(bp, &bpf_freelist, bif_next, bp_temp) { if (ifp != bp->bif_ifp) continue; CTR3(KTR_NET, "%s: freeing BPF instance %p for interface %p", __func__, bp, ifp); LIST_REMOVE(bp, bif_next); rw_destroy(&bp->bif_lock); free(bp, M_BPF); nmatched++; } BPF_UNLOCK(); /* * Note that we cannot zero other pointers to * custom DLTs possibly used by given interface. */ if (nmatched != 0) ifp->if_bpf = NULL; } /* * Get a list of available data link type of the interface. */ static int bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl) { int n, error; struct ifnet *ifp; struct bpf_if *bp; BPF_LOCK_ASSERT(); ifp = d->bd_bif->bif_ifp; n = 0; error = 0; LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp != ifp) continue; if (bfl->bfl_list != NULL) { if (n >= bfl->bfl_len) return (ENOMEM); error = copyout(&bp->bif_dlt, bfl->bfl_list + n, sizeof(u_int)); } n++; } bfl->bfl_len = n; return (error); } /* * Set the data link type of a BPF instance. */ static int bpf_setdlt(struct bpf_d *d, u_int dlt) { int error, opromisc; struct ifnet *ifp; struct bpf_if *bp; BPF_LOCK_ASSERT(); if (d->bd_bif->bif_dlt == dlt) return (0); ifp = d->bd_bif->bif_ifp; LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp == ifp && bp->bif_dlt == dlt) break; } if (bp != NULL) { opromisc = d->bd_promisc; bpf_attachd(d, bp); BPFD_LOCK(d); reset_d(d); BPFD_UNLOCK(d); if (opromisc) { error = ifpromisc(bp->bif_ifp, 1); if (error) if_printf(bp->bif_ifp, "bpf_setdlt: ifpromisc failed (%d)\n", error); else d->bd_promisc = 1; } } return (bp == NULL ? EINVAL : 0); } static void bpf_drvinit(void *unused) { struct cdev *dev; mtx_init(&bpf_mtx, "bpf global lock", NULL, MTX_DEF); LIST_INIT(&bpf_iflist); LIST_INIT(&bpf_freelist); dev = make_dev(&bpf_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "bpf"); /* For compatibility */ make_dev_alias(dev, "bpf0"); /* Register interface departure handler */ bpf_ifdetach_cookie = EVENTHANDLER_REGISTER( ifnet_departure_event, bpf_ifdetach, NULL, EVENTHANDLER_PRI_ANY); } /* * Zero out the various packet counters associated with all of the bpf * descriptors. At some point, we will probably want to get a bit more * granular and allow the user to specify descriptors to be zeroed. */ static void bpf_zero_counters(void) { struct bpf_if *bp; struct bpf_d *bd; BPF_LOCK(); LIST_FOREACH(bp, &bpf_iflist, bif_next) { BPFIF_RLOCK(bp); LIST_FOREACH(bd, &bp->bif_dlist, bd_next) { BPFD_LOCK(bd); bd->bd_rcount = 0; bd->bd_dcount = 0; bd->bd_fcount = 0; bd->bd_wcount = 0; bd->bd_wfcount = 0; bd->bd_zcopy = 0; BPFD_UNLOCK(bd); } BPFIF_RUNLOCK(bp); } BPF_UNLOCK(); } /* * Fill filter statistics */ static void bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd) { bzero(d, sizeof(*d)); BPFD_LOCK_ASSERT(bd); d->bd_structsize = sizeof(*d); /* XXX: reading should be protected by global lock */ d->bd_immediate = bd->bd_immediate; d->bd_promisc = bd->bd_promisc; d->bd_hdrcmplt = bd->bd_hdrcmplt; d->bd_direction = bd->bd_direction; d->bd_feedback = bd->bd_feedback; d->bd_async = bd->bd_async; d->bd_rcount = bd->bd_rcount; d->bd_dcount = bd->bd_dcount; d->bd_fcount = bd->bd_fcount; d->bd_sig = bd->bd_sig; d->bd_slen = bd->bd_slen; d->bd_hlen = bd->bd_hlen; d->bd_bufsize = bd->bd_bufsize; d->bd_pid = bd->bd_pid; strlcpy(d->bd_ifname, bd->bd_bif->bif_ifp->if_xname, IFNAMSIZ); d->bd_locked = bd->bd_locked; d->bd_wcount = bd->bd_wcount; d->bd_wdcount = bd->bd_wdcount; d->bd_wfcount = bd->bd_wfcount; d->bd_zcopy = bd->bd_zcopy; d->bd_bufmode = bd->bd_bufmode; } /* * Handle `netstat -B' stats request */ static int bpf_stats_sysctl(SYSCTL_HANDLER_ARGS) { static const struct xbpf_d zerostats; struct xbpf_d *xbdbuf, *xbd, tempstats; int index, error; struct bpf_if *bp; struct bpf_d *bd; /* * XXX This is not technically correct. It is possible for non * privileged users to open bpf devices. It would make sense * if the users who opened the devices were able to retrieve * the statistics for them, too. */ error = priv_check(req->td, PRIV_NET_BPF); if (error) return (error); /* * Check to see if the user is requesting that the counters be * zeroed out. Explicitly check that the supplied data is zeroed, * as we aren't allowing the user to set the counters currently. */ if (req->newptr != NULL) { if (req->newlen != sizeof(tempstats)) return (EINVAL); memset(&tempstats, 0, sizeof(tempstats)); error = SYSCTL_IN(req, &tempstats, sizeof(tempstats)); if (error) return (error); if (bcmp(&tempstats, &zerostats, sizeof(tempstats)) != 0) return (EINVAL); bpf_zero_counters(); return (0); } if (req->oldptr == NULL) return (SYSCTL_OUT(req, 0, bpf_bpfd_cnt * sizeof(*xbd))); if (bpf_bpfd_cnt == 0) return (SYSCTL_OUT(req, 0, 0)); xbdbuf = malloc(req->oldlen, M_BPF, M_WAITOK); BPF_LOCK(); if (req->oldlen < (bpf_bpfd_cnt * sizeof(*xbd))) { BPF_UNLOCK(); free(xbdbuf, M_BPF); return (ENOMEM); } index = 0; LIST_FOREACH(bp, &bpf_iflist, bif_next) { BPFIF_RLOCK(bp); /* Send writers-only first */ LIST_FOREACH(bd, &bp->bif_wlist, bd_next) { xbd = &xbdbuf[index++]; BPFD_LOCK(bd); bpfstats_fill_xbpf(xbd, bd); BPFD_UNLOCK(bd); } LIST_FOREACH(bd, &bp->bif_dlist, bd_next) { xbd = &xbdbuf[index++]; BPFD_LOCK(bd); bpfstats_fill_xbpf(xbd, bd); BPFD_UNLOCK(bd); } BPFIF_RUNLOCK(bp); } BPF_UNLOCK(); error = SYSCTL_OUT(req, xbdbuf, index * sizeof(*xbd)); free(xbdbuf, M_BPF); return (error); } SYSINIT(bpfdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE,bpf_drvinit,NULL); #else /* !DEV_BPF && !NETGRAPH_BPF */ /* * NOP stubs to allow bpf-using drivers to load and function. * * A 'better' implementation would allow the core bpf functionality * to be loaded at runtime. */ static struct bpf_if bp_null; void bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) { } void bpf_mtap(struct bpf_if *bp, struct mbuf *m) { } void bpf_mtap2(struct bpf_if *bp, void *d, u_int l, struct mbuf *m) { } void bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen) { bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf); } void bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp) { *driverp = &bp_null; } void bpfdetach(struct ifnet *ifp) { } u_int bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen) { return -1; /* "no filter" behaviour */ } int bpf_validate(const struct bpf_insn *f, int len) { return 0; /* false */ } #endif /* !DEV_BPF && !NETGRAPH_BPF */ Index: user/dchagin/lemul/sys/net/flowtable.c =================================================================== --- user/dchagin/lemul/sys/net/flowtable.c (revision 274240) +++ user/dchagin/lemul/sys/net/flowtable.c (revision 274241) @@ -1,1189 +1,1189 @@ /*- * Copyright (c) 2014 Gleb Smirnoff * Copyright (c) 2008-2010, BitGravity Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Neither the name of the BitGravity Corporation nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "opt_route.h" #include "opt_mpath.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #ifdef FLOWTABLE_HASH_ALL #include #include #include #endif #include #ifdef FLOWTABLE_HASH_ALL #define KEY_PORTS (sizeof(uint16_t) * 2) #define KEY_ADDRS 2 #else #define KEY_PORTS 0 #define KEY_ADDRS 1 #endif #ifdef INET6 #define KEY_ADDR_LEN sizeof(struct in6_addr) #else #define KEY_ADDR_LEN sizeof(struct in_addr) #endif #define KEYLEN ((KEY_ADDR_LEN * KEY_ADDRS + KEY_PORTS) / sizeof(uint32_t)) struct flentry { uint32_t f_hash; /* hash flowing forward */ uint32_t f_key[KEYLEN]; /* address(es and ports) */ uint32_t f_uptime; /* uptime at last access */ uint16_t f_fibnum; /* fib index */ #ifdef FLOWTABLE_HASH_ALL uint8_t f_proto; /* protocol */ uint8_t f_flags; /* stale? */ #define FL_STALE 1 #endif SLIST_ENTRY(flentry) f_next; /* pointer to collision entry */ struct rtentry *f_rt; /* rtentry for flow */ struct llentry *f_lle; /* llentry for flow */ }; #undef KEYLEN SLIST_HEAD(flist, flentry); /* Make sure we can use pcpu_zone_ptr for struct flist. */ CTASSERT(sizeof(struct flist) == sizeof(void *)); struct flowtable { counter_u64_t *ft_stat; int ft_size; /* * ft_table is a malloc(9)ed array of pointers. Pointers point to * memory from UMA_ZONE_PCPU zone. * ft_masks is per-cpu pointer itself. Each instance points * to a malloc(9)ed bitset, that is private to corresponding CPU. */ struct flist **ft_table; bitstr_t **ft_masks; bitstr_t *ft_tmpmask; }; #define FLOWSTAT_ADD(ft, name, v) \ counter_u64_add((ft)->ft_stat[offsetof(struct flowtable_stat, name) / sizeof(uint64_t)], (v)) #define FLOWSTAT_INC(ft, name) FLOWSTAT_ADD(ft, name, 1) static struct proc *flowcleanerproc; static uint32_t flow_hashjitter; static struct cv flowclean_f_cv; static struct cv flowclean_c_cv; static struct mtx flowclean_lock; static uint32_t flowclean_cycles; /* * TODO: * - add sysctls to resize && flush flow tables * - Add per flowtable sysctls for statistics and configuring timeouts * - add saturation counter to rtentry to support per-packet load-balancing * add flag to indicate round-robin flow, add list lookup from head for flows * - add sysctl / device node / syscall to support exporting and importing * of flows with flag to indicate that a flow was imported so should * not be considered for auto-cleaning * - support explicit connection state (currently only ad-hoc for DSR) * - idetach() cleanup for options VIMAGE builds. */ #ifdef INET static VNET_DEFINE(struct flowtable, ip4_ft); #define V_ip4_ft VNET(ip4_ft) #endif #ifdef INET6 static VNET_DEFINE(struct flowtable, ip6_ft); #define V_ip6_ft VNET(ip6_ft) #endif static uma_zone_t flow_zone; static VNET_DEFINE(int, flowtable_enable) = 1; #define V_flowtable_enable VNET(flowtable_enable) static SYSCTL_NODE(_net, OID_AUTO, flowtable, CTLFLAG_RD, NULL, "flowtable"); -SYSCTL_VNET_INT(_net_flowtable, OID_AUTO, enable, CTLFLAG_RW, +SYSCTL_INT(_net_flowtable, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(flowtable_enable), 0, "enable flowtable caching."); SYSCTL_UMA_MAX(_net_flowtable, OID_AUTO, maxflows, CTLFLAG_RW, &flow_zone, "Maximum number of flows allowed"); static MALLOC_DEFINE(M_FTABLE, "flowtable", "flowtable hashes and bitstrings"); static struct flentry * flowtable_lookup_common(struct flowtable *, uint32_t *, int, uint32_t); #ifdef INET static struct flentry * flowtable_lookup_ipv4(struct mbuf *m, struct route *ro) { struct flentry *fle; struct sockaddr_in *sin; struct ip *ip; uint32_t fibnum; #ifdef FLOWTABLE_HASH_ALL uint32_t key[3]; int iphlen; uint16_t sport, dport; uint8_t proto; #endif ip = mtod(m, struct ip *); if (ip->ip_src.s_addr == ip->ip_dst.s_addr || (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) return (NULL); fibnum = M_GETFIB(m); #ifdef FLOWTABLE_HASH_ALL iphlen = ip->ip_hl << 2; proto = ip->ip_p; switch (proto) { case IPPROTO_TCP: { struct tcphdr *th; th = (struct tcphdr *)((char *)ip + iphlen); sport = th->th_sport; dport = th->th_dport; if (th->th_flags & (TH_RST|TH_FIN)) fibnum |= (FL_STALE << 24); break; } case IPPROTO_UDP: { struct udphdr *uh; uh = (struct udphdr *)((char *)ip + iphlen); sport = uh->uh_sport; dport = uh->uh_dport; break; } case IPPROTO_SCTP: { struct sctphdr *sh; sh = (struct sctphdr *)((char *)ip + iphlen); sport = sh->src_port; dport = sh->dest_port; /* XXXGL: handle stale? */ break; } default: sport = dport = 0; break; } key[0] = ip->ip_dst.s_addr; key[1] = ip->ip_src.s_addr; key[2] = (dport << 16) | sport; fibnum |= proto << 16; fle = flowtable_lookup_common(&V_ip4_ft, key, 3 * sizeof(uint32_t), fibnum); #else /* !FLOWTABLE_HASH_ALL */ fle = flowtable_lookup_common(&V_ip4_ft, (uint32_t *)&ip->ip_dst, sizeof(struct in_addr), fibnum); #endif /* FLOWTABLE_HASH_ALL */ if (fle == NULL) return (NULL); sin = (struct sockaddr_in *)&ro->ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = ip->ip_dst; return (fle); } #endif /* INET */ #ifdef INET6 /* * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous, * then it sets p to point at the offset "len" in the mbuf. WARNING: the * pointer might become stale after other pullups (but we never use it * this way). */ #define PULLUP_TO(_len, p, T) \ do { \ int x = (_len) + sizeof(T); \ if ((m)->m_len < x) \ return (NULL); \ p = (mtod(m, char *) + (_len)); \ } while (0) #define TCP(p) ((struct tcphdr *)(p)) #define SCTP(p) ((struct sctphdr *)(p)) #define UDP(p) ((struct udphdr *)(p)) static struct flentry * flowtable_lookup_ipv6(struct mbuf *m, struct route *ro) { struct flentry *fle; struct sockaddr_in6 *sin6; struct ip6_hdr *ip6; uint32_t fibnum; #ifdef FLOWTABLE_HASH_ALL uint32_t key[9]; void *ulp; int hlen; uint16_t sport, dport; u_short offset; uint8_t proto; #else uint32_t key[4]; #endif ip6 = mtod(m, struct ip6_hdr *); if (in6_localaddr(&ip6->ip6_dst)) return (NULL); fibnum = M_GETFIB(m); #ifdef FLOWTABLE_HASH_ALL hlen = sizeof(struct ip6_hdr); proto = ip6->ip6_nxt; offset = sport = dport = 0; ulp = NULL; while (ulp == NULL) { switch (proto) { case IPPROTO_ICMPV6: case IPPROTO_OSPFIGP: case IPPROTO_PIM: case IPPROTO_CARP: case IPPROTO_ESP: case IPPROTO_NONE: ulp = ip6; break; case IPPROTO_TCP: PULLUP_TO(hlen, ulp, struct tcphdr); dport = TCP(ulp)->th_dport; sport = TCP(ulp)->th_sport; if (TCP(ulp)->th_flags & (TH_RST|TH_FIN)) fibnum |= (FL_STALE << 24); break; case IPPROTO_SCTP: PULLUP_TO(hlen, ulp, struct sctphdr); dport = SCTP(ulp)->src_port; sport = SCTP(ulp)->dest_port; /* XXXGL: handle stale? */ break; case IPPROTO_UDP: PULLUP_TO(hlen, ulp, struct udphdr); dport = UDP(ulp)->uh_dport; sport = UDP(ulp)->uh_sport; break; case IPPROTO_HOPOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_ROUTING: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_rthdr); hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3; proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt; ulp = NULL; break; case IPPROTO_FRAGMENT: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_frag); hlen += sizeof (struct ip6_frag); proto = ((struct ip6_frag *)ulp)->ip6f_nxt; offset = ((struct ip6_frag *)ulp)->ip6f_offlg & IP6F_OFF_MASK; ulp = NULL; break; case IPPROTO_DSTOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_AH: /* RFC 2402 */ PULLUP_TO(hlen, ulp, struct ip6_ext); hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2; proto = ((struct ip6_ext *)ulp)->ip6e_nxt; ulp = NULL; break; default: PULLUP_TO(hlen, ulp, struct ip6_ext); break; } } bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr)); bcopy(&ip6->ip6_src, &key[4], sizeof(struct in6_addr)); key[8] = (dport << 16) | sport; fibnum |= proto << 16; fle = flowtable_lookup_common(&V_ip6_ft, key, 9 * sizeof(uint32_t), fibnum); #else /* !FLOWTABLE_HASH_ALL */ bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr)); fle = flowtable_lookup_common(&V_ip6_ft, key, sizeof(struct in6_addr), fibnum); #endif /* FLOWTABLE_HASH_ALL */ if (fle == NULL) return (NULL); sin6 = (struct sockaddr_in6 *)&ro->ro_dst; sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); bcopy(&ip6->ip6_dst, &sin6->sin6_addr, sizeof(struct in6_addr)); return (fle); } #endif /* INET6 */ static bitstr_t * flowtable_mask(struct flowtable *ft) { /* * flowtable_free_stale() calls w/o critical section, but * with sched_bind(). Since pointer is stable throughout * ft lifetime, it is safe, otherwise... * * CRITICAL_ASSERT(curthread); */ return (*(bitstr_t **)zpcpu_get(ft->ft_masks)); } static struct flist * flowtable_list(struct flowtable *ft, uint32_t hash) { CRITICAL_ASSERT(curthread); return (zpcpu_get(ft->ft_table[hash % ft->ft_size])); } static int flow_stale(struct flowtable *ft, struct flentry *fle, int maxidle) { if (((fle->f_rt->rt_flags & RTF_HOST) && ((fle->f_rt->rt_flags & (RTF_UP)) != (RTF_UP))) || (fle->f_rt->rt_ifp == NULL) || !RT_LINK_IS_UP(fle->f_rt->rt_ifp) || (fle->f_lle->la_flags & LLE_VALID) == 0) return (1); if (time_uptime - fle->f_uptime > maxidle) return (1); #ifdef FLOWTABLE_HASH_ALL if (fle->f_flags & FL_STALE) return (1); #endif return (0); } static int flow_full(void) { int count, max; count = uma_zone_get_cur(flow_zone); max = uma_zone_get_max(flow_zone); return (count > (max - (max >> 3))); } static int flow_matches(struct flentry *fle, uint32_t *key, int keylen, uint32_t fibnum) { #ifdef FLOWTABLE_HASH_ALL uint8_t proto; proto = (fibnum >> 16) & 0xff; fibnum &= 0xffff; #endif CRITICAL_ASSERT(curthread); /* Microoptimization for IPv4: don't use bcmp(). */ if (((keylen == sizeof(uint32_t) && (fle->f_key[0] != key[0])) || (bcmp(fle->f_key, key, keylen) == 0)) && fibnum == fle->f_fibnum && #ifdef FLOWTABLE_HASH_ALL proto == fle->f_proto && #endif (fle->f_rt->rt_flags & RTF_UP) && fle->f_rt->rt_ifp != NULL && (fle->f_lle->la_flags & LLE_VALID)) return (1); return (0); } static struct flentry * flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key, int keylen, uint32_t fibnum0) { #ifdef INET6 struct route_in6 sro6; #endif #ifdef INET struct route sro; #endif struct route *ro = NULL; struct rtentry *rt; struct lltable *lt = NULL; struct llentry *lle; struct sockaddr_storage *l3addr; struct ifnet *ifp; struct flist *flist; struct flentry *fle, *iter; bitstr_t *mask; uint16_t fibnum = fibnum0; #ifdef FLOWTABLE_HASH_ALL uint8_t proto; proto = (fibnum0 >> 16) & 0xff; fibnum = fibnum0 & 0xffff; #endif /* * This bit of code ends up locking the * same route 3 times (just like ip_output + ether_output) * - at lookup * - in rt_check when called by arpresolve * - dropping the refcount for the rtentry * * This could be consolidated to one if we wrote a variant * of arpresolve with an rt_check variant that expected to * receive the route locked */ #ifdef INET if (ft == &V_ip4_ft) { struct sockaddr_in *sin; ro = &sro; bzero(&sro.ro_dst, sizeof(sro.ro_dst)); sin = (struct sockaddr_in *)&sro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr.s_addr = key[0]; } #endif #ifdef INET6 if (ft == &V_ip6_ft) { struct sockaddr_in6 *sin6; ro = (struct route *)&sro6; sin6 = &sro6.ro_dst; bzero(sin6, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); bcopy(key, &sin6->sin6_addr, sizeof(struct in6_addr)); } #endif ro->ro_rt = NULL; #ifdef RADIX_MPATH rtalloc_mpath_fib(ro, hash, fibnum); #else rtalloc_ign_fib(ro, 0, fibnum); #endif if (ro->ro_rt == NULL) return (NULL); rt = ro->ro_rt; ifp = rt->rt_ifp; if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) { RTFREE(rt); return (NULL); } #ifdef INET if (ft == &V_ip4_ft) lt = LLTABLE(ifp); #endif #ifdef INET6 if (ft == &V_ip6_ft) lt = LLTABLE6(ifp); #endif if (rt->rt_flags & RTF_GATEWAY) l3addr = (struct sockaddr_storage *)rt->rt_gateway; else l3addr = (struct sockaddr_storage *)&ro->ro_dst; lle = llentry_alloc(ifp, lt, l3addr); if (lle == NULL) { RTFREE(rt); return (NULL); } /* Don't insert the entry if the ARP hasn't yet finished resolving. */ if ((lle->la_flags & LLE_VALID) == 0) { RTFREE(rt); LLE_FREE(lle); FLOWSTAT_INC(ft, ft_fail_lle_invalid); return (NULL); } fle = uma_zalloc(flow_zone, M_NOWAIT | M_ZERO); if (fle == NULL) { RTFREE(rt); LLE_FREE(lle); return (NULL); } fle->f_hash = hash; bcopy(key, &fle->f_key, keylen); fle->f_rt = rt; fle->f_lle = lle; fle->f_fibnum = fibnum; fle->f_uptime = time_uptime; #ifdef FLOWTABLE_HASH_ALL fle->f_proto = proto; fle->f_flags = fibnum0 >> 24; #endif critical_enter(); mask = flowtable_mask(ft); flist = flowtable_list(ft, hash); if (SLIST_EMPTY(flist)) { bit_set(mask, (hash % ft->ft_size)); SLIST_INSERT_HEAD(flist, fle, f_next); goto skip; } /* * find end of list and make sure that we were not * preempted by another thread handling this flow */ SLIST_FOREACH(iter, flist, f_next) { KASSERT(iter->f_hash % ft->ft_size == hash % ft->ft_size, ("%s: wrong hash", __func__)); if (flow_matches(iter, key, keylen, fibnum)) { /* * We probably migrated to an other CPU after * lookup in flowtable_lookup_common() failed. * It appeared that this CPU already has flow * entry. */ iter->f_uptime = time_uptime; #ifdef FLOWTABLE_HASH_ALL iter->f_flags |= fibnum >> 24; #endif critical_exit(); FLOWSTAT_INC(ft, ft_collisions); uma_zfree(flow_zone, fle); return (iter); } } SLIST_INSERT_HEAD(flist, fle, f_next); skip: critical_exit(); FLOWSTAT_INC(ft, ft_inserts); return (fle); } int flowtable_lookup(sa_family_t sa, struct mbuf *m, struct route *ro) { struct flentry *fle; if (V_flowtable_enable == 0) return (ENXIO); switch (sa) { #ifdef INET case AF_INET: fle = flowtable_lookup_ipv4(m, ro); break; #endif #ifdef INET6 case AF_INET6: fle = flowtable_lookup_ipv6(m, ro); break; #endif default: panic("%s: sa %d", __func__, sa); } if (fle == NULL) return (EHOSTUNREACH); if (!(m->m_flags & M_FLOWID)) { m->m_flags |= M_FLOWID; m->m_pkthdr.flowid = fle->f_hash; } ro->ro_rt = fle->f_rt; ro->ro_lle = fle->f_lle; ro->ro_flags |= RT_NORTREF; return (0); } static struct flentry * flowtable_lookup_common(struct flowtable *ft, uint32_t *key, int keylen, uint32_t fibnum) { struct flist *flist; struct flentry *fle; uint32_t hash; FLOWSTAT_INC(ft, ft_lookups); hash = jenkins_hash32(key, keylen / sizeof(uint32_t), flow_hashjitter); critical_enter(); flist = flowtable_list(ft, hash); SLIST_FOREACH(fle, flist, f_next) { KASSERT(fle->f_hash % ft->ft_size == hash % ft->ft_size, ("%s: wrong hash", __func__)); if (flow_matches(fle, key, keylen, fibnum)) { fle->f_uptime = time_uptime; #ifdef FLOWTABLE_HASH_ALL fle->f_flags |= fibnum >> 24; #endif critical_exit(); FLOWSTAT_INC(ft, ft_hits); return (fle); } } critical_exit(); FLOWSTAT_INC(ft, ft_misses); return (flowtable_insert(ft, hash, key, keylen, fibnum)); } /* * used by the bit_alloc macro */ #define calloc(count, size) malloc((count)*(size), M_FTABLE, M_WAITOK | M_ZERO) static void flowtable_alloc(struct flowtable *ft) { ft->ft_table = malloc(ft->ft_size * sizeof(struct flist), M_FTABLE, M_WAITOK); for (int i = 0; i < ft->ft_size; i++) ft->ft_table[i] = uma_zalloc(pcpu_zone_ptr, M_WAITOK | M_ZERO); ft->ft_masks = uma_zalloc(pcpu_zone_ptr, M_WAITOK); for (int i = 0; i < mp_ncpus; i++) { bitstr_t **b; b = zpcpu_get_cpu(ft->ft_masks, i); *b = bit_alloc(ft->ft_size); } ft->ft_tmpmask = bit_alloc(ft->ft_size); } #undef calloc static void flowtable_free_stale(struct flowtable *ft, struct rtentry *rt, int maxidle) { struct flist *flist, freelist; struct flentry *fle, *fle1, *fleprev; bitstr_t *mask, *tmpmask; int curbit, tmpsize; SLIST_INIT(&freelist); mask = flowtable_mask(ft); tmpmask = ft->ft_tmpmask; tmpsize = ft->ft_size; memcpy(tmpmask, mask, ft->ft_size/8); curbit = 0; fleprev = NULL; /* pacify gcc */ /* * XXX Note to self, bit_ffs operates at the byte level * and thus adds gratuitous overhead */ bit_ffs(tmpmask, ft->ft_size, &curbit); while (curbit != -1) { if (curbit >= ft->ft_size || curbit < -1) { log(LOG_ALERT, "warning: bad curbit value %d \n", curbit); break; } FLOWSTAT_INC(ft, ft_free_checks); critical_enter(); flist = flowtable_list(ft, curbit); #ifdef DIAGNOSTIC if (SLIST_EMPTY(flist) && curbit > 0) { log(LOG_ALERT, "warning bit=%d set, but no fle found\n", curbit); } #endif SLIST_FOREACH_SAFE(fle, flist, f_next, fle1) { if (rt != NULL && fle->f_rt != rt) { fleprev = fle; continue; } if (!flow_stale(ft, fle, maxidle)) { fleprev = fle; continue; } if (fle == SLIST_FIRST(flist)) SLIST_REMOVE_HEAD(flist, f_next); else SLIST_REMOVE_AFTER(fleprev, f_next); SLIST_INSERT_HEAD(&freelist, fle, f_next); } if (SLIST_EMPTY(flist)) bit_clear(mask, curbit); critical_exit(); bit_clear(tmpmask, curbit); tmpmask += (curbit / 8); tmpsize -= (curbit / 8) * 8; bit_ffs(tmpmask, tmpsize, &curbit); } SLIST_FOREACH_SAFE(fle, &freelist, f_next, fle1) { FLOWSTAT_INC(ft, ft_frees); if (fle->f_rt != NULL) RTFREE(fle->f_rt); if (fle->f_lle != NULL) LLE_FREE(fle->f_lle); uma_zfree(flow_zone, fle); } } static void flowtable_clean_vnet(struct flowtable *ft, struct rtentry *rt, int maxidle) { int i; CPU_FOREACH(i) { if (smp_started == 1) { thread_lock(curthread); sched_bind(curthread, i); thread_unlock(curthread); } flowtable_free_stale(ft, rt, maxidle); if (smp_started == 1) { thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); } } } void flowtable_route_flush(sa_family_t sa, struct rtentry *rt) { struct flowtable *ft; switch (sa) { #ifdef INET case AF_INET: ft = &V_ip4_ft; break; #endif #ifdef INET6 case AF_INET6: ft = &V_ip6_ft; break; #endif default: panic("%s: sa %d", __func__, sa); } flowtable_clean_vnet(ft, rt, 0); } static void flowtable_cleaner(void) { VNET_ITERATOR_DECL(vnet_iter); struct thread *td; if (bootverbose) log(LOG_INFO, "flowtable cleaner started\n"); td = curthread; while (1) { uint32_t flowclean_freq, maxidle; /* * The maximum idle time, as well as frequency are arbitrary. */ if (flow_full()) maxidle = 5; else maxidle = 30; VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); #ifdef INET flowtable_clean_vnet(&V_ip4_ft, NULL, maxidle); #endif #ifdef INET6 flowtable_clean_vnet(&V_ip6_ft, NULL, maxidle); #endif CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); if (flow_full()) flowclean_freq = 4*hz; else flowclean_freq = 20*hz; mtx_lock(&flowclean_lock); thread_lock(td); sched_prio(td, PPAUSE); thread_unlock(td); flowclean_cycles++; cv_broadcast(&flowclean_f_cv); cv_timedwait(&flowclean_c_cv, &flowclean_lock, flowclean_freq); mtx_unlock(&flowclean_lock); } } static void flowtable_flush(void *unused __unused) { uint64_t start; mtx_lock(&flowclean_lock); start = flowclean_cycles; while (start == flowclean_cycles) { cv_broadcast(&flowclean_c_cv); cv_wait(&flowclean_f_cv, &flowclean_lock); } mtx_unlock(&flowclean_lock); } static struct kproc_desc flow_kp = { "flowcleaner", flowtable_cleaner, &flowcleanerproc }; SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp); static int flowtable_get_size(char *name) { int size; if (TUNABLE_INT_FETCH(name, &size)) { if (size < 256) size = 256; if (!powerof2(size)) { printf("%s must be power of 2\n", name); size = 2048; } } else { /* * round up to the next power of 2 */ size = 1 << fls((1024 + maxusers * 64) - 1); } return (size); } static void flowtable_init(const void *unused __unused) { flow_hashjitter = arc4random(); flow_zone = uma_zcreate("flows", sizeof(struct flentry), NULL, NULL, NULL, NULL, (64-1), UMA_ZONE_MAXBUCKET); uma_zone_set_max(flow_zone, 1024 + maxusers * 64 * mp_ncpus); cv_init(&flowclean_c_cv, "c_flowcleanwait"); cv_init(&flowclean_f_cv, "f_flowcleanwait"); mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF); EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL, EVENTHANDLER_PRI_ANY); } SYSINIT(flowtable_init, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST, flowtable_init, NULL); #ifdef INET static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip4, CTLFLAG_RD, NULL, "Flowtable for IPv4"); static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip4_ftstat); VNET_PCPUSTAT_SYSINIT(ip4_ftstat); VNET_PCPUSTAT_SYSUNINIT(ip4_ftstat); SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip4, OID_AUTO, stat, struct flowtable_stat, ip4_ftstat, "Flowtable statistics for IPv4 " "(struct flowtable_stat, net/flowtable.h)"); static void flowtable_init_vnet_v4(const void *unused __unused) { V_ip4_ft.ft_size = flowtable_get_size("net.flowtable.ip4.size"); V_ip4_ft.ft_stat = VNET(ip4_ftstat); flowtable_alloc(&V_ip4_ft); } VNET_SYSINIT(ft_vnet_v4, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, flowtable_init_vnet_v4, NULL); #endif /* INET */ #ifdef INET6 static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip6, CTLFLAG_RD, NULL, "Flowtable for IPv6"); static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip6_ftstat); VNET_PCPUSTAT_SYSINIT(ip6_ftstat); VNET_PCPUSTAT_SYSUNINIT(ip6_ftstat); SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip6, OID_AUTO, stat, struct flowtable_stat, ip6_ftstat, "Flowtable statistics for IPv6 " "(struct flowtable_stat, net/flowtable.h)"); static void flowtable_init_vnet_v6(const void *unused __unused) { V_ip6_ft.ft_size = flowtable_get_size("net.flowtable.ip6.size"); V_ip6_ft.ft_stat = VNET(ip6_ftstat); flowtable_alloc(&V_ip6_ft); } VNET_SYSINIT(flowtable_init_vnet_v6, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, flowtable_init_vnet_v6, NULL); #endif /* INET6 */ #ifdef DDB static bitstr_t * flowtable_mask_pcpu(struct flowtable *ft, int cpuid) { return (zpcpu_get_cpu(*ft->ft_masks, cpuid)); } static struct flist * flowtable_list_pcpu(struct flowtable *ft, uint32_t hash, int cpuid) { return (zpcpu_get_cpu(&ft->ft_table[hash % ft->ft_size], cpuid)); } static void flow_show(struct flowtable *ft, struct flentry *fle) { int idle_time; int rt_valid, ifp_valid; volatile struct rtentry *rt; struct ifnet *ifp = NULL; uint32_t *hashkey = fle->f_key; idle_time = (int)(time_uptime - fle->f_uptime); rt = fle->f_rt; rt_valid = rt != NULL; if (rt_valid) ifp = rt->rt_ifp; ifp_valid = ifp != NULL; #ifdef INET if (ft == &V_ip4_ft) { char daddr[4*sizeof "123"]; #ifdef FLOWTABLE_HASH_ALL char saddr[4*sizeof "123"]; uint16_t sport, dport; #endif inet_ntoa_r(*(struct in_addr *) &hashkey[0], daddr); #ifdef FLOWTABLE_HASH_ALL inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr); dport = ntohs((uint16_t)(hashkey[2] >> 16)); sport = ntohs((uint16_t)(hashkey[2] & 0xffff)); db_printf("%s:%d->%s:%d", saddr, sport, daddr, dport); #else db_printf("%s ", daddr); #endif } #endif /* INET */ #ifdef INET6 if (ft == &V_ip6_ft) { #ifdef FLOWTABLE_HASH_ALL db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x", hashkey[0], hashkey[1], hashkey[2], hashkey[3], hashkey[4], hashkey[5], hashkey[6], hashkey[7], hashkey[8]); #else db_printf("\n\tkey=%08x:%08x:%08x ", hashkey[0], hashkey[1], hashkey[2]); #endif } #endif /* INET6 */ db_printf("hash=%08x idle_time=%03d" "\n\tfibnum=%02d rt=%p", fle->f_hash, idle_time, fle->f_fibnum, fle->f_rt); #ifdef FLOWTABLE_HASH_ALL if (fle->f_flags & FL_STALE) db_printf(" FL_STALE "); #endif if (rt_valid) { if (rt->rt_flags & RTF_UP) db_printf(" RTF_UP "); } if (ifp_valid) { if (ifp->if_flags & IFF_LOOPBACK) db_printf(" IFF_LOOPBACK "); if (ifp->if_flags & IFF_UP) db_printf(" IFF_UP "); if (ifp->if_flags & IFF_POINTOPOINT) db_printf(" IFF_POINTOPOINT "); } db_printf("\n"); } static void flowtable_show(struct flowtable *ft, int cpuid) { int curbit = 0; bitstr_t *mask, *tmpmask; if (cpuid != -1) db_printf("cpu: %d\n", cpuid); mask = flowtable_mask_pcpu(ft, cpuid); tmpmask = ft->ft_tmpmask; memcpy(tmpmask, mask, ft->ft_size/8); /* * XXX Note to self, bit_ffs operates at the byte level * and thus adds gratuitous overhead */ bit_ffs(tmpmask, ft->ft_size, &curbit); while (curbit != -1) { struct flist *flist; struct flentry *fle; if (curbit >= ft->ft_size || curbit < -1) { db_printf("warning: bad curbit value %d \n", curbit); break; } flist = flowtable_list_pcpu(ft, curbit, cpuid); SLIST_FOREACH(fle, flist, f_next) flow_show(ft, fle); bit_clear(tmpmask, curbit); bit_ffs(tmpmask, ft->ft_size, &curbit); } } static void flowtable_show_vnet(struct flowtable *ft) { int i; CPU_FOREACH(i) flowtable_show(ft, i); } DB_SHOW_COMMAND(flowtables, db_show_flowtables) { VNET_ITERATOR_DECL(vnet_iter); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); #ifdef VIMAGE db_printf("vnet %p\n", vnet_iter); #endif #ifdef INET printf("IPv4:\n"); flowtable_show_vnet(&V_ip4_ft); #endif #ifdef INET6 printf("IPv6:\n"); flowtable_show_vnet(&V_ip6_ft); #endif CURVNET_RESTORE(); } } #endif Index: user/dchagin/lemul/sys/net/if.c =================================================================== --- user/dchagin/lemul/sys/net/if.c (revision 274240) +++ user/dchagin/lemul/sys/net/if.c (revision 274241) @@ -1,4030 +1,4026 @@ /*- * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if.c 8.5 (Berkeley) 1/9/95 * $FreeBSD$ */ #include "opt_compat.h" #include "opt_inet6.h" #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #ifdef INET #include #endif /* INET */ #ifdef INET6 #include #include #endif /* INET6 */ #endif /* INET || INET6 */ #include #ifdef COMPAT_FREEBSD32 #include #include #endif -struct ifindex_entry { - struct ifnet *ife_ifnet; -}; - SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers"); SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management"); SYSCTL_INT(_net_link, OID_AUTO, ifqmaxlen, CTLFLAG_RDTUN, &ifqmaxlen, 0, "max send queue size"); /* Log link state change events */ static int log_link_state_change = 1; SYSCTL_INT(_net_link, OID_AUTO, log_link_state_change, CTLFLAG_RW, &log_link_state_change, 0, "log interface link state change events"); /* Interface description */ static unsigned int ifdescr_maxlen = 1024; SYSCTL_UINT(_net, OID_AUTO, ifdescr_maxlen, CTLFLAG_RW, &ifdescr_maxlen, 0, "administrative maximum length for interface description"); static MALLOC_DEFINE(M_IFDESCR, "ifdescr", "ifnet descriptions"); /* global sx for non-critical path ifdescr */ static struct sx ifdescr_sx; SX_SYSINIT(ifdescr_sx, &ifdescr_sx, "ifnet descr"); void (*bridge_linkstate_p)(struct ifnet *ifp); void (*ng_ether_link_state_p)(struct ifnet *ifp, int state); void (*lagg_linkstate_p)(struct ifnet *ifp, int state); /* These are external hooks for CARP. */ void (*carp_linkstate_p)(struct ifnet *ifp); void (*carp_demote_adj_p)(int, char *); int (*carp_master_p)(struct ifaddr *); #if defined(INET) || defined(INET6) int (*carp_forus_p)(struct ifnet *ifp, u_char *dhost); int (*carp_output_p)(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa); int (*carp_ioctl_p)(struct ifreq *, u_long, struct thread *); int (*carp_attach_p)(struct ifaddr *, int); void (*carp_detach_p)(struct ifaddr *); #endif #ifdef INET int (*carp_iamatch_p)(struct ifaddr *, uint8_t **); #endif #ifdef INET6 struct ifaddr *(*carp_iamatch6_p)(struct ifnet *ifp, struct in6_addr *taddr6); caddr_t (*carp_macmatch6_p)(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr); #endif struct mbuf *(*tbr_dequeue_ptr)(struct ifaltq *, int) = NULL; /* * XXX: Style; these should be sorted alphabetically, and unprototyped * static functions should be prototyped. Currently they are sorted by * declaration order. */ static void if_attachdomain(void *); static void if_attachdomain1(struct ifnet *); static int ifconf(u_long, caddr_t); static void if_freemulti(struct ifmultiaddr *); static void if_init(void *); static void if_grow(void); static void if_route(struct ifnet *, int flag, int fam); static int if_setflag(struct ifnet *, int, int, int *, int); static int if_transmit(struct ifnet *ifp, struct mbuf *m); static void if_unroute(struct ifnet *, int flag, int fam); static void link_rtrequest(int, struct rtentry *, struct rt_addrinfo *); static int if_rtdel(struct radix_node *, void *); static int ifhwioctl(u_long, struct ifnet *, caddr_t, struct thread *); static int if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int); static void do_link_state_change(void *, int); static int if_getgroup(struct ifgroupreq *, struct ifnet *); static int if_getgroupmembers(struct ifgroupreq *); static void if_delgroups(struct ifnet *); static void if_attach_internal(struct ifnet *, int); static void if_detach_internal(struct ifnet *, int); #ifdef INET6 /* * XXX: declare here to avoid to include many inet6 related files.. * should be more generalized? */ extern void nd6_setmtu(struct ifnet *); #endif VNET_DEFINE(int, if_index); int ifqmaxlen = IFQ_MAXLEN; VNET_DEFINE(struct ifnethead, ifnet); /* depend on static init XXX */ VNET_DEFINE(struct ifgrouphead, ifg_head); static VNET_DEFINE(int, if_indexlim) = 8; /* Table of ifnet by index. */ -VNET_DEFINE(struct ifindex_entry *, ifindex_table); +VNET_DEFINE(struct ifnet **, ifindex_table); #define V_if_indexlim VNET(if_indexlim) #define V_ifindex_table VNET(ifindex_table) /* * The global network interface list (V_ifnet) and related state (such as * if_index, if_indexlim, and ifindex_table) are protected by an sxlock and * an rwlock. Either may be acquired shared to stablize the list, but both * must be acquired writable to modify the list. This model allows us to * both stablize the interface list during interrupt thread processing, but * also to stablize it over long-running ioctls, without introducing priority * inversions and deadlocks. */ struct rwlock ifnet_rwlock; struct sx ifnet_sxlock; /* * The allocation of network interfaces is a rather non-atomic affair; we * need to select an index before we are ready to expose the interface for * use, so will use this pointer value to indicate reservation. */ #define IFNET_HOLD (void *)(uintptr_t)(-1) static if_com_alloc_t *if_com_alloc[256]; static if_com_free_t *if_com_free[256]; static MALLOC_DEFINE(M_IFNET, "ifnet", "interface internals"); MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address"); MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address"); struct ifnet * ifnet_byindex_locked(u_short idx) { if (idx > V_if_index) return (NULL); - if (V_ifindex_table[idx].ife_ifnet == IFNET_HOLD) + if (V_ifindex_table[idx] == IFNET_HOLD) return (NULL); - return (V_ifindex_table[idx].ife_ifnet); + return (V_ifindex_table[idx]); } struct ifnet * ifnet_byindex(u_short idx) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); ifp = ifnet_byindex_locked(idx); IFNET_RUNLOCK_NOSLEEP(); return (ifp); } struct ifnet * ifnet_byindex_ref(u_short idx) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); ifp = ifnet_byindex_locked(idx); if (ifp == NULL || (ifp->if_flags & IFF_DYING)) { IFNET_RUNLOCK_NOSLEEP(); return (NULL); } if_ref(ifp); IFNET_RUNLOCK_NOSLEEP(); return (ifp); } /* * Allocate an ifindex array entry; return 0 on success or an error on * failure. */ static int ifindex_alloc_locked(u_short *idxp) { u_short idx; IFNET_WLOCK_ASSERT(); retry: /* * Try to find an empty slot below V_if_index. If we fail, take the * next slot. */ for (idx = 1; idx <= V_if_index; idx++) { - if (V_ifindex_table[idx].ife_ifnet == NULL) + if (V_ifindex_table[idx] == NULL) break; } /* Catch if_index overflow. */ if (idx >= V_if_indexlim) { if_grow(); goto retry; } if (idx > V_if_index) V_if_index = idx; *idxp = idx; return (0); } static void ifindex_free_locked(u_short idx) { IFNET_WLOCK_ASSERT(); - V_ifindex_table[idx].ife_ifnet = NULL; + V_ifindex_table[idx] = NULL; while (V_if_index > 0 && - V_ifindex_table[V_if_index].ife_ifnet == NULL) + V_ifindex_table[V_if_index] == NULL) V_if_index--; } static void ifindex_free(u_short idx) { IFNET_WLOCK(); ifindex_free_locked(idx); IFNET_WUNLOCK(); } static void ifnet_setbyindex_locked(u_short idx, struct ifnet *ifp) { IFNET_WLOCK_ASSERT(); - V_ifindex_table[idx].ife_ifnet = ifp; + V_ifindex_table[idx] = ifp; } static void ifnet_setbyindex(u_short idx, struct ifnet *ifp) { IFNET_WLOCK(); ifnet_setbyindex_locked(idx, ifp); IFNET_WUNLOCK(); } struct ifaddr * ifaddr_byindex(u_short idx) { struct ifaddr *ifa; IFNET_RLOCK_NOSLEEP(); ifa = ifnet_byindex_locked(idx)->if_addr; if (ifa != NULL) ifa_ref(ifa); IFNET_RUNLOCK_NOSLEEP(); return (ifa); } /* * Network interface utility routines. * * Routines with ifa_ifwith* names take sockaddr *'s as * parameters. */ static void vnet_if_init(const void *unused __unused) { TAILQ_INIT(&V_ifnet); TAILQ_INIT(&V_ifg_head); IFNET_WLOCK(); if_grow(); /* create initial table */ IFNET_WUNLOCK(); vnet_if_clone_init(); } VNET_SYSINIT(vnet_if_init, SI_SUB_INIT_IF, SI_ORDER_SECOND, vnet_if_init, NULL); /* ARGSUSED*/ static void if_init(void *dummy __unused) { IFNET_LOCK_INIT(); if_clone_init(); } SYSINIT(interfaces, SI_SUB_INIT_IF, SI_ORDER_FIRST, if_init, NULL); #ifdef VIMAGE static void vnet_if_uninit(const void *unused __unused) { VNET_ASSERT(TAILQ_EMPTY(&V_ifnet), ("%s:%d tailq &V_ifnet=%p " "not empty", __func__, __LINE__, &V_ifnet)); VNET_ASSERT(TAILQ_EMPTY(&V_ifg_head), ("%s:%d tailq &V_ifg_head=%p " "not empty", __func__, __LINE__, &V_ifg_head)); free((caddr_t)V_ifindex_table, M_IFNET); } VNET_SYSUNINIT(vnet_if_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST, vnet_if_uninit, NULL); #endif static void if_grow(void) { int oldlim; u_int n; - struct ifindex_entry *e; + struct ifnet **e; IFNET_WLOCK_ASSERT(); oldlim = V_if_indexlim; IFNET_WUNLOCK(); n = (oldlim << 1) * sizeof(*e); e = malloc(n, M_IFNET, M_WAITOK | M_ZERO); IFNET_WLOCK(); if (V_if_indexlim != oldlim) { free(e, M_IFNET); return; } if (V_ifindex_table != NULL) { memcpy((caddr_t)e, (caddr_t)V_ifindex_table, n/2); free((caddr_t)V_ifindex_table, M_IFNET); } V_if_indexlim <<= 1; V_ifindex_table = e; } /* * Allocate a struct ifnet and an index for an interface. A layer 2 * common structure will also be allocated if an allocation routine is * registered for the passed type. */ struct ifnet * if_alloc(u_char type) { struct ifnet *ifp; u_short idx; ifp = malloc(sizeof(struct ifnet), M_IFNET, M_WAITOK|M_ZERO); IFNET_WLOCK(); if (ifindex_alloc_locked(&idx) != 0) { IFNET_WUNLOCK(); free(ifp, M_IFNET); return (NULL); } ifnet_setbyindex_locked(idx, IFNET_HOLD); IFNET_WUNLOCK(); ifp->if_index = idx; ifp->if_type = type; ifp->if_alloctype = type; if (if_com_alloc[type] != NULL) { ifp->if_l2com = if_com_alloc[type](type, ifp); if (ifp->if_l2com == NULL) { free(ifp, M_IFNET); ifindex_free(idx); return (NULL); } } IF_ADDR_LOCK_INIT(ifp); TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp); ifp->if_afdata_initialized = 0; IF_AFDATA_LOCK_INIT(ifp); TAILQ_INIT(&ifp->if_addrhead); TAILQ_INIT(&ifp->if_multiaddrs); TAILQ_INIT(&ifp->if_groups); #ifdef MAC mac_ifnet_init(ifp); #endif ifq_init(&ifp->if_snd, ifp); refcount_init(&ifp->if_refcount, 1); /* Index reference. */ for (int i = 0; i < IFCOUNTERS; i++) ifp->if_counters[i] = counter_u64_alloc(M_WAITOK); ifp->if_get_counter = if_get_counter_default; ifnet_setbyindex(ifp->if_index, ifp); return (ifp); } /* * Do the actual work of freeing a struct ifnet, and layer 2 common * structure. This call is made when the last reference to an * interface is released. */ static void if_free_internal(struct ifnet *ifp) { KASSERT((ifp->if_flags & IFF_DYING), ("if_free_internal: interface not dying")); if (if_com_free[ifp->if_alloctype] != NULL) if_com_free[ifp->if_alloctype](ifp->if_l2com, ifp->if_alloctype); #ifdef MAC mac_ifnet_destroy(ifp); #endif /* MAC */ if (ifp->if_description != NULL) free(ifp->if_description, M_IFDESCR); IF_AFDATA_DESTROY(ifp); IF_ADDR_LOCK_DESTROY(ifp); ifq_delete(&ifp->if_snd); for (int i = 0; i < IFCOUNTERS; i++) counter_u64_free(ifp->if_counters[i]); free(ifp, M_IFNET); } /* * Deregister an interface and free the associated storage. */ void if_free(struct ifnet *ifp) { ifp->if_flags |= IFF_DYING; /* XXX: Locking */ CURVNET_SET_QUIET(ifp->if_vnet); IFNET_WLOCK(); KASSERT(ifp == ifnet_byindex_locked(ifp->if_index), ("%s: freeing unallocated ifnet", ifp->if_xname)); ifindex_free_locked(ifp->if_index); IFNET_WUNLOCK(); if (refcount_release(&ifp->if_refcount)) if_free_internal(ifp); CURVNET_RESTORE(); } /* * Interfaces to keep an ifnet type-stable despite the possibility of the * driver calling if_free(). If there are additional references, we defer * freeing the underlying data structure. */ void if_ref(struct ifnet *ifp) { /* We don't assert the ifnet list lock here, but arguably should. */ refcount_acquire(&ifp->if_refcount); } void if_rele(struct ifnet *ifp) { if (!refcount_release(&ifp->if_refcount)) return; if_free_internal(ifp); } void ifq_init(struct ifaltq *ifq, struct ifnet *ifp) { mtx_init(&ifq->ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF); if (ifq->ifq_maxlen == 0) ifq->ifq_maxlen = ifqmaxlen; ifq->altq_type = 0; ifq->altq_disc = NULL; ifq->altq_flags &= ALTQF_CANTCHANGE; ifq->altq_tbr = NULL; ifq->altq_ifp = ifp; } void ifq_delete(struct ifaltq *ifq) { mtx_destroy(&ifq->ifq_mtx); } /* * Perform generic interface initalization tasks and attach the interface * to the list of "active" interfaces. If vmove flag is set on entry * to if_attach_internal(), perform only a limited subset of initialization * tasks, given that we are moving from one vnet to another an ifnet which * has already been fully initialized. * * XXX: * - The decision to return void and thus require this function to * succeed is questionable. * - We should probably do more sanity checking. For instance we don't * do anything to insure if_xname is unique or non-empty. */ void if_attach(struct ifnet *ifp) { if_attach_internal(ifp, 0); } /* * Compute the least common TSO limit. */ void if_hw_tsomax_common(if_t ifp, struct ifnet_hw_tsomax *pmax) { /* * 1) If there is no limit currently, take the limit from * the network adapter. * * 2) If the network adapter has a limit below the current * limit, apply it. */ if (pmax->tsomaxbytes == 0 || (ifp->if_hw_tsomax != 0 && ifp->if_hw_tsomax < pmax->tsomaxbytes)) { pmax->tsomaxbytes = ifp->if_hw_tsomax; } if (pmax->tsomaxsegcount == 0 || (ifp->if_hw_tsomaxsegcount != 0 && ifp->if_hw_tsomaxsegcount < pmax->tsomaxsegcount)) { pmax->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; } if (pmax->tsomaxsegsize == 0 || (ifp->if_hw_tsomaxsegsize != 0 && ifp->if_hw_tsomaxsegsize < pmax->tsomaxsegsize)) { pmax->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } /* * Update TSO limit of a network adapter. * * Returns zero if no change. Else non-zero. */ int if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *pmax) { int retval = 0; if (ifp->if_hw_tsomax != pmax->tsomaxbytes) { ifp->if_hw_tsomax = pmax->tsomaxbytes; retval++; } if (ifp->if_hw_tsomaxsegsize != pmax->tsomaxsegsize) { ifp->if_hw_tsomaxsegsize = pmax->tsomaxsegsize; retval++; } if (ifp->if_hw_tsomaxsegcount != pmax->tsomaxsegcount) { ifp->if_hw_tsomaxsegcount = pmax->tsomaxsegcount; retval++; } return (retval); } static void if_attach_internal(struct ifnet *ifp, int vmove) { unsigned socksize, ifasize; int namelen, masklen; struct sockaddr_dl *sdl; struct ifaddr *ifa; if (ifp->if_index == 0 || ifp != ifnet_byindex(ifp->if_index)) panic ("%s: BUG: if_attach called without if_alloc'd input()\n", ifp->if_xname); #ifdef VIMAGE ifp->if_vnet = curvnet; if (ifp->if_home_vnet == NULL) ifp->if_home_vnet = curvnet; #endif if_addgroup(ifp, IFG_ALL); getmicrotime(&ifp->if_lastchange); ifp->if_epoch = time_uptime; KASSERT((ifp->if_transmit == NULL && ifp->if_qflush == NULL) || (ifp->if_transmit != NULL && ifp->if_qflush != NULL), ("transmit and qflush must both either be set or both be NULL")); if (ifp->if_transmit == NULL) { ifp->if_transmit = if_transmit; ifp->if_qflush = if_qflush; } if (!vmove) { #ifdef MAC mac_ifnet_create(ifp); #endif /* * Create a Link Level name for this device. */ namelen = strlen(ifp->if_xname); /* * Always save enough space for any possiable name so we * can do a rename in place later. */ masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + IFNAMSIZ; socksize = masklen + ifp->if_addrlen; if (socksize < sizeof(*sdl)) socksize = sizeof(*sdl); socksize = roundup2(socksize, sizeof(long)); ifasize = sizeof(*ifa) + 2 * socksize; ifa = ifa_alloc(ifasize, M_WAITOK); sdl = (struct sockaddr_dl *)(ifa + 1); sdl->sdl_len = socksize; sdl->sdl_family = AF_LINK; bcopy(ifp->if_xname, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; ifp->if_addr = ifa; ifa->ifa_ifp = ifp; ifa->ifa_rtrequest = link_rtrequest; ifa->ifa_addr = (struct sockaddr *)sdl; sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl); ifa->ifa_netmask = (struct sockaddr *)sdl; sdl->sdl_len = masklen; while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; TAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link); /* Reliably crash if used uninitialized. */ ifp->if_broadcastaddr = NULL; #if defined(INET) || defined(INET6) /* Use defaults for TSO, if nothing is set */ if (ifp->if_hw_tsomax == 0 && ifp->if_hw_tsomaxsegcount == 0 && ifp->if_hw_tsomaxsegsize == 0) { /* * The TSO defaults needs to be such that an * NFS mbuf list of 35 mbufs totalling just * below 64K works and that a chain of mbufs * can be defragged into at most 32 segments: */ ifp->if_hw_tsomax = min(IP_MAXPACKET, (32 * MCLBYTES) - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN)); ifp->if_hw_tsomaxsegcount = 35; ifp->if_hw_tsomaxsegsize = 2048; /* 2K */ /* XXX some drivers set IFCAP_TSO after ethernet attach */ if (ifp->if_capabilities & IFCAP_TSO) { if_printf(ifp, "Using defaults for TSO: %u/%u/%u\n", ifp->if_hw_tsomax, ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); } } /* * If the "if_hw_tsomax" limit is set, check if it is * too small: */ KASSERT(ifp->if_hw_tsomax == 0 || ifp->if_hw_tsomax >= (IP_MAXPACKET / 8), ("%s: if_hw_tsomax is outside of range", __func__)); #endif } #ifdef VIMAGE else { /* * Update the interface index in the link layer address * of the interface. */ for (ifa = ifp->if_addr; ifa != NULL; ifa = TAILQ_NEXT(ifa, ifa_link)) { if (ifa->ifa_addr->sa_family == AF_LINK) { sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_index = ifp->if_index; } } } #endif IFNET_WLOCK(); TAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link); #ifdef VIMAGE curvnet->vnet_ifcnt++; #endif IFNET_WUNLOCK(); if (domain_init_status >= 2) if_attachdomain1(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL); /* Announce the interface. */ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); } static void if_attachdomain(void *dummy) { struct ifnet *ifp; TAILQ_FOREACH(ifp, &V_ifnet, if_link) if_attachdomain1(ifp); } SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND, if_attachdomain, NULL); static void if_attachdomain1(struct ifnet *ifp) { struct domain *dp; /* * Since dp->dom_ifattach calls malloc() with M_WAITOK, we * cannot lock ifp->if_afdata initialization, entirely. */ if (IF_AFDATA_TRYLOCK(ifp) == 0) return; if (ifp->if_afdata_initialized >= domain_init_status) { IF_AFDATA_UNLOCK(ifp); log(LOG_WARNING, "%s called more than once on %s\n", __func__, ifp->if_xname); return; } ifp->if_afdata_initialized = domain_init_status; IF_AFDATA_UNLOCK(ifp); /* address family dependent data region */ bzero(ifp->if_afdata, sizeof(ifp->if_afdata)); for (dp = domains; dp; dp = dp->dom_next) { if (dp->dom_ifattach) ifp->if_afdata[dp->dom_family] = (*dp->dom_ifattach)(ifp); } } /* * Remove any unicast or broadcast network addresses from an interface. */ void if_purgeaddrs(struct ifnet *ifp) { struct ifaddr *ifa, *next; TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) { if (ifa->ifa_addr->sa_family == AF_LINK) continue; #ifdef INET /* XXX: Ugly!! ad hoc just for INET */ if (ifa->ifa_addr->sa_family == AF_INET) { struct ifaliasreq ifr; bzero(&ifr, sizeof(ifr)); ifr.ifra_addr = *ifa->ifa_addr; if (ifa->ifa_dstaddr) ifr.ifra_broadaddr = *ifa->ifa_dstaddr; if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp, NULL) == 0) continue; } #endif /* INET */ #ifdef INET6 if (ifa->ifa_addr->sa_family == AF_INET6) { in6_purgeaddr(ifa); /* ifp_addrhead is already updated */ continue; } #endif /* INET6 */ TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link); ifa_free(ifa); } } /* * Remove any multicast network addresses from an interface when an ifnet * is going away. */ static void if_purgemaddrs(struct ifnet *ifp) { struct ifmultiaddr *ifma; struct ifmultiaddr *next; IF_ADDR_WLOCK(ifp); TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) if_delmulti_locked(ifp, ifma, 1); IF_ADDR_WUNLOCK(ifp); } /* * Detach an interface, removing it from the list of "active" interfaces. * If vmove flag is set on entry to if_detach_internal(), perform only a * limited subset of cleanup tasks, given that we are moving an ifnet from * one vnet to another, where it must be fully operational. * * XXXRW: There are some significant questions about event ordering, and * how to prevent things from starting to use the interface during detach. */ void if_detach(struct ifnet *ifp) { CURVNET_SET_QUIET(ifp->if_vnet); if_detach_internal(ifp, 0); CURVNET_RESTORE(); } static void if_detach_internal(struct ifnet *ifp, int vmove) { struct ifaddr *ifa; struct radix_node_head *rnh; int i, j; struct domain *dp; struct ifnet *iter; int found = 0; IFNET_WLOCK(); TAILQ_FOREACH(iter, &V_ifnet, if_link) if (iter == ifp) { TAILQ_REMOVE(&V_ifnet, ifp, if_link); found = 1; break; } #ifdef VIMAGE if (found) curvnet->vnet_ifcnt--; #endif IFNET_WUNLOCK(); if (!found) { if (vmove) panic("%s: ifp=%p not on the ifnet tailq %p", __func__, ifp, &V_ifnet); else return; /* XXX this should panic as well? */ } /* * Remove/wait for pending events. */ taskqueue_drain(taskqueue_swi, &ifp->if_linktask); /* * Remove routes and flush queues. */ if_down(ifp); #ifdef ALTQ if (ALTQ_IS_ENABLED(&ifp->if_snd)) altq_disable(&ifp->if_snd); if (ALTQ_IS_ATTACHED(&ifp->if_snd)) altq_detach(&ifp->if_snd); #endif if_purgeaddrs(ifp); #ifdef INET in_ifdetach(ifp); #endif #ifdef INET6 /* * Remove all IPv6 kernel structs related to ifp. This should be done * before removing routing entries below, since IPv6 interface direct * routes are expected to be removed by the IPv6-specific kernel API. * Otherwise, the kernel will detect some inconsistency and bark it. */ in6_ifdetach(ifp); #endif if_purgemaddrs(ifp); /* Announce that the interface is gone. */ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL); if (!vmove) { /* * Prevent further calls into the device driver via ifnet. */ if_dead(ifp); /* * Remove link ifaddr pointer and maybe decrement if_index. * Clean up all addresses. */ ifp->if_addr = NULL; /* We can now free link ifaddr. */ if (!TAILQ_EMPTY(&ifp->if_addrhead)) { ifa = TAILQ_FIRST(&ifp->if_addrhead); TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link); ifa_free(ifa); } } /* * Delete all remaining routes using this interface * Unfortuneatly the only way to do this is to slog through * the entire routing table looking for routes which point * to this interface...oh well... */ for (i = 1; i <= AF_MAX; i++) { for (j = 0; j < rt_numfibs; j++) { rnh = rt_tables_get_rnh(j, i); if (rnh == NULL) continue; RADIX_NODE_HEAD_LOCK(rnh); (void) rnh->rnh_walktree(rnh, if_rtdel, ifp); RADIX_NODE_HEAD_UNLOCK(rnh); } } if_delgroups(ifp); /* * We cannot hold the lock over dom_ifdetach calls as they might * sleep, for example trying to drain a callout, thus open up the * theoretical race with re-attaching. */ IF_AFDATA_LOCK(ifp); i = ifp->if_afdata_initialized; ifp->if_afdata_initialized = 0; IF_AFDATA_UNLOCK(ifp); for (dp = domains; i > 0 && dp; dp = dp->dom_next) { if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family]) (*dp->dom_ifdetach)(ifp, ifp->if_afdata[dp->dom_family]); } } #ifdef VIMAGE /* * if_vmove() performs a limited version of if_detach() in current * vnet and if_attach()es the ifnet to the vnet specified as 2nd arg. * An attempt is made to shrink if_index in current vnet, find an * unused if_index in target vnet and calls if_grow() if necessary, * and finally find an unused if_xname for the target vnet. */ void if_vmove(struct ifnet *ifp, struct vnet *new_vnet) { u_short idx; /* * Detach from current vnet, but preserve LLADDR info, do not * mark as dead etc. so that the ifnet can be reattached later. */ if_detach_internal(ifp, 1); /* * Unlink the ifnet from ifindex_table[] in current vnet, and shrink * the if_index for that vnet if possible. * * NOTE: IFNET_WLOCK/IFNET_WUNLOCK() are assumed to be unvirtualized, * or we'd lock on one vnet and unlock on another. */ IFNET_WLOCK(); ifindex_free_locked(ifp->if_index); IFNET_WUNLOCK(); /* * Perform interface-specific reassignment tasks, if provided by * the driver. */ if (ifp->if_reassign != NULL) ifp->if_reassign(ifp, new_vnet, NULL); /* * Switch to the context of the target vnet. */ CURVNET_SET_QUIET(new_vnet); IFNET_WLOCK(); if (ifindex_alloc_locked(&idx) != 0) { IFNET_WUNLOCK(); panic("if_index overflow"); } ifp->if_index = idx; ifnet_setbyindex_locked(ifp->if_index, ifp); IFNET_WUNLOCK(); if_attach_internal(ifp, 1); CURVNET_RESTORE(); } /* * Move an ifnet to or from another child prison/vnet, specified by the jail id. */ static int if_vmove_loan(struct thread *td, struct ifnet *ifp, char *ifname, int jid) { struct prison *pr; struct ifnet *difp; /* Try to find the prison within our visibility. */ sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, jid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENXIO); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); /* Do not try to move the iface from and to the same prison. */ if (pr->pr_vnet == ifp->if_vnet) { prison_free(pr); return (EEXIST); } /* Make sure the named iface does not exists in the dst. prison/vnet. */ /* XXX Lock interfaces to avoid races. */ CURVNET_SET_QUIET(pr->pr_vnet); difp = ifunit(ifname); CURVNET_RESTORE(); if (difp != NULL) { prison_free(pr); return (EEXIST); } /* Move the interface into the child jail/vnet. */ if_vmove(ifp, pr->pr_vnet); /* Report the new if_xname back to the userland. */ sprintf(ifname, "%s", ifp->if_xname); prison_free(pr); return (0); } static int if_vmove_reclaim(struct thread *td, char *ifname, int jid) { struct prison *pr; struct vnet *vnet_dst; struct ifnet *ifp; /* Try to find the prison within our visibility. */ sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, jid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENXIO); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); /* Make sure the named iface exists in the source prison/vnet. */ CURVNET_SET(pr->pr_vnet); ifp = ifunit(ifname); /* XXX Lock to avoid races. */ if (ifp == NULL) { CURVNET_RESTORE(); prison_free(pr); return (ENXIO); } /* Do not try to move the iface from and to the same prison. */ vnet_dst = TD_TO_VNET(td); if (vnet_dst == ifp->if_vnet) { CURVNET_RESTORE(); prison_free(pr); return (EEXIST); } /* Get interface back from child jail/vnet. */ if_vmove(ifp, vnet_dst); CURVNET_RESTORE(); /* Report the new if_xname back to the userland. */ sprintf(ifname, "%s", ifp->if_xname); prison_free(pr); return (0); } #endif /* VIMAGE */ /* * Add a group to an interface */ int if_addgroup(struct ifnet *ifp, const char *groupname) { struct ifg_list *ifgl; struct ifg_group *ifg = NULL; struct ifg_member *ifgm; int new = 0; if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' && groupname[strlen(groupname) - 1] <= '9') return (EINVAL); IFNET_WLOCK(); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) { IFNET_WUNLOCK(); return (EEXIST); } if ((ifgl = (struct ifg_list *)malloc(sizeof(struct ifg_list), M_TEMP, M_NOWAIT)) == NULL) { IFNET_WUNLOCK(); return (ENOMEM); } if ((ifgm = (struct ifg_member *)malloc(sizeof(struct ifg_member), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, groupname)) break; if (ifg == NULL) { if ((ifg = (struct ifg_group *)malloc(sizeof(struct ifg_group), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); free(ifgm, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group)); ifg->ifg_refcnt = 0; TAILQ_INIT(&ifg->ifg_members); TAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next); new = 1; } ifg->ifg_refcnt++; ifgl->ifgl_group = ifg; ifgm->ifgm_ifp = ifp; IF_ADDR_WLOCK(ifp); TAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next); TAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_WUNLOCK(ifp); IFNET_WUNLOCK(); if (new) EVENTHANDLER_INVOKE(group_attach_event, ifg); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); } /* * Remove a group from an interface */ int if_delgroup(struct ifnet *ifp, const char *groupname) { struct ifg_list *ifgl; struct ifg_member *ifgm; IFNET_WLOCK(); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) break; if (ifgl == NULL) { IFNET_WUNLOCK(); return (ENOENT); } IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_WUNLOCK(ifp); TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) if (ifgm->ifgm_ifp == ifp) break; if (ifgm != NULL) { TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next); free(ifgm, M_TEMP); } if (--ifgl->ifgl_group->ifg_refcnt == 0) { TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next); IFNET_WUNLOCK(); EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); } else IFNET_WUNLOCK(); free(ifgl, M_TEMP); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); } /* * Remove an interface from all groups */ static void if_delgroups(struct ifnet *ifp) { struct ifg_list *ifgl; struct ifg_member *ifgm; char groupname[IFNAMSIZ]; IFNET_WLOCK(); while (!TAILQ_EMPTY(&ifp->if_groups)) { ifgl = TAILQ_FIRST(&ifp->if_groups); strlcpy(groupname, ifgl->ifgl_group->ifg_group, IFNAMSIZ); IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_WUNLOCK(ifp); TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) if (ifgm->ifgm_ifp == ifp) break; if (ifgm != NULL) { TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next); free(ifgm, M_TEMP); } if (--ifgl->ifgl_group->ifg_refcnt == 0) { TAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_next); IFNET_WUNLOCK(); EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); } else IFNET_WUNLOCK(); free(ifgl, M_TEMP); EVENTHANDLER_INVOKE(group_change_event, groupname); IFNET_WLOCK(); } IFNET_WUNLOCK(); } /* * Stores all groups from an interface in memory pointed * to by data */ static int if_getgroup(struct ifgroupreq *data, struct ifnet *ifp) { int len, error; struct ifg_list *ifgl; struct ifg_req ifgrq, *ifgp; struct ifgroupreq *ifgr = data; if (ifgr->ifgr_len == 0) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) ifgr->ifgr_len += sizeof(struct ifg_req); IF_ADDR_RUNLOCK(ifp); return (0); } len = ifgr->ifgr_len; ifgp = ifgr->ifgr_groups; /* XXX: wire */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) { if (len < sizeof(ifgrq)) { IF_ADDR_RUNLOCK(ifp); return (EINVAL); } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group, sizeof(ifgrq.ifgrq_group)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { IF_ADDR_RUNLOCK(ifp); return (error); } len -= sizeof(ifgrq); ifgp++; } IF_ADDR_RUNLOCK(ifp); return (0); } /* * Stores all members of a group in memory pointed to by data */ static int if_getgroupmembers(struct ifgroupreq *data) { struct ifgroupreq *ifgr = data; struct ifg_group *ifg; struct ifg_member *ifgm; struct ifg_req ifgrq, *ifgp; int len, error; IFNET_RLOCK(); TAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, ifgr->ifgr_name)) break; if (ifg == NULL) { IFNET_RUNLOCK(); return (ENOENT); } if (ifgr->ifgr_len == 0) { TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) ifgr->ifgr_len += sizeof(ifgrq); IFNET_RUNLOCK(); return (0); } len = ifgr->ifgr_len; ifgp = ifgr->ifgr_groups; TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) { if (len < sizeof(ifgrq)) { IFNET_RUNLOCK(); return (EINVAL); } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname, sizeof(ifgrq.ifgrq_member)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { IFNET_RUNLOCK(); return (error); } len -= sizeof(ifgrq); ifgp++; } IFNET_RUNLOCK(); return (0); } /* * Delete Routes for a Network Interface * * Called for each routing entry via the rnh->rnh_walktree() call above * to delete all route entries referencing a detaching network interface. * * Arguments: * rn pointer to node in the routing table * arg argument passed to rnh->rnh_walktree() - detaching interface * * Returns: * 0 successful * errno failed - reason indicated * */ static int if_rtdel(struct radix_node *rn, void *arg) { struct rtentry *rt = (struct rtentry *)rn; struct ifnet *ifp = arg; int err; if (rt->rt_ifp == ifp) { /* * Protect (sorta) against walktree recursion problems * with cloned routes */ if ((rt->rt_flags & RTF_UP) == 0) return (0); err = rtrequest_fib(RTM_DELETE, rt_key(rt), rt->rt_gateway, rt_mask(rt), rt->rt_flags|RTF_RNH_LOCKED|RTF_PINNED, (struct rtentry **) NULL, rt->rt_fibnum); if (err) { log(LOG_WARNING, "if_rtdel: error %d\n", err); } } return (0); } /* * Return counter values from counter(9)s stored in ifnet. */ uint64_t if_get_counter_default(struct ifnet *ifp, ift_counter cnt) { KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); return (counter_u64_fetch(ifp->if_counters[cnt])); } /* * Increase an ifnet counter. Usually used for counters shared * between the stack and a driver, but function supports them all. */ void if_inc_counter(struct ifnet *ifp, ift_counter cnt, int64_t inc) { KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); counter_u64_add(ifp->if_counters[cnt], inc); } /* * Copy data from ifnet to userland API structure if_data. */ void if_data_copy(struct ifnet *ifp, struct if_data *ifd) { ifd->ifi_type = ifp->if_type; ifd->ifi_physical = 0; ifd->ifi_addrlen = ifp->if_addrlen; ifd->ifi_hdrlen = ifp->if_hdrlen; ifd->ifi_link_state = ifp->if_link_state; ifd->ifi_vhid = 0; ifd->ifi_datalen = sizeof(struct if_data); ifd->ifi_mtu = ifp->if_mtu; ifd->ifi_metric = ifp->if_metric; ifd->ifi_baudrate = ifp->if_baudrate; ifd->ifi_hwassist = ifp->if_hwassist; ifd->ifi_epoch = ifp->if_epoch; ifd->ifi_lastchange = ifp->if_lastchange; ifd->ifi_ipackets = ifp->if_get_counter(ifp, IFCOUNTER_IPACKETS); ifd->ifi_ierrors = ifp->if_get_counter(ifp, IFCOUNTER_IERRORS); ifd->ifi_opackets = ifp->if_get_counter(ifp, IFCOUNTER_OPACKETS); ifd->ifi_oerrors = ifp->if_get_counter(ifp, IFCOUNTER_OERRORS); ifd->ifi_collisions = ifp->if_get_counter(ifp, IFCOUNTER_COLLISIONS); ifd->ifi_ibytes = ifp->if_get_counter(ifp, IFCOUNTER_IBYTES); ifd->ifi_obytes = ifp->if_get_counter(ifp, IFCOUNTER_OBYTES); ifd->ifi_imcasts = ifp->if_get_counter(ifp, IFCOUNTER_IMCASTS); ifd->ifi_omcasts = ifp->if_get_counter(ifp, IFCOUNTER_OMCASTS); ifd->ifi_iqdrops = ifp->if_get_counter(ifp, IFCOUNTER_IQDROPS); ifd->ifi_oqdrops = ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS); ifd->ifi_noproto = ifp->if_get_counter(ifp, IFCOUNTER_NOPROTO); } /* * Wrapper functions for struct ifnet address list locking macros. These are * used by kernel modules to avoid encoding programming interface or binary * interface assumptions that may be violated when kernel-internal locking * approaches change. */ void if_addr_rlock(struct ifnet *ifp) { IF_ADDR_RLOCK(ifp); } void if_addr_runlock(struct ifnet *ifp) { IF_ADDR_RUNLOCK(ifp); } void if_maddr_rlock(if_t ifp) { IF_ADDR_RLOCK((struct ifnet *)ifp); } void if_maddr_runlock(if_t ifp) { IF_ADDR_RUNLOCK((struct ifnet *)ifp); } /* * Initialization, destruction and refcounting functions for ifaddrs. */ struct ifaddr * ifa_alloc(size_t size, int flags) { struct ifaddr *ifa; KASSERT(size >= sizeof(struct ifaddr), ("%s: invalid size %zu", __func__, size)); ifa = malloc(size, M_IFADDR, M_ZERO | flags); if (ifa == NULL) return (NULL); if ((ifa->ifa_opackets = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_ipackets = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_obytes = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_ibytes = counter_u64_alloc(flags)) == NULL) goto fail; refcount_init(&ifa->ifa_refcnt, 1); return (ifa); fail: /* free(NULL) is okay */ counter_u64_free(ifa->ifa_opackets); counter_u64_free(ifa->ifa_ipackets); counter_u64_free(ifa->ifa_obytes); counter_u64_free(ifa->ifa_ibytes); free(ifa, M_IFADDR); return (NULL); } void ifa_ref(struct ifaddr *ifa) { refcount_acquire(&ifa->ifa_refcnt); } void ifa_free(struct ifaddr *ifa) { if (refcount_release(&ifa->ifa_refcnt)) { counter_u64_free(ifa->ifa_opackets); counter_u64_free(ifa->ifa_ipackets); counter_u64_free(ifa->ifa_obytes); counter_u64_free(ifa->ifa_ibytes); free(ifa, M_IFADDR); } } int ifa_add_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { int error = 0; struct rtentry *rt = NULL; struct rt_addrinfo info; static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; bzero(&info, sizeof(info)); info.rti_ifp = V_loif; info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC; info.rti_info[RTAX_DST] = ia; info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl; error = rtrequest1_fib(RTM_ADD, &info, &rt, ifa->ifa_ifp->if_fib); if (error == 0 && rt != NULL) { RT_LOCK(rt); ((struct sockaddr_dl *)rt->rt_gateway)->sdl_type = ifa->ifa_ifp->if_type; ((struct sockaddr_dl *)rt->rt_gateway)->sdl_index = ifa->ifa_ifp->if_index; RT_REMREF(rt); RT_UNLOCK(rt); } else if (error != 0) log(LOG_DEBUG, "%s: insertion failed: %u\n", __func__, error); return (error); } int ifa_del_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { int error = 0; struct rt_addrinfo info; struct sockaddr_dl null_sdl; bzero(&null_sdl, sizeof(null_sdl)); null_sdl.sdl_len = sizeof(null_sdl); null_sdl.sdl_family = AF_LINK; null_sdl.sdl_type = ifa->ifa_ifp->if_type; null_sdl.sdl_index = ifa->ifa_ifp->if_index; bzero(&info, sizeof(info)); info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC; info.rti_info[RTAX_DST] = ia; info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl; error = rtrequest1_fib(RTM_DELETE, &info, NULL, ifa->ifa_ifp->if_fib); if (error != 0) log(LOG_DEBUG, "%s: deletion failed: %u\n", __func__, error); return (error); } int ifa_switch_loopback_route(struct ifaddr *ifa, struct sockaddr *sa, int fib) { struct rtentry *rt; rt = rtalloc1_fib(sa, 0, 0, fib); if (rt == NULL) { log(LOG_DEBUG, "%s: fail", __func__); return (EHOSTUNREACH); } ((struct sockaddr_dl *)rt->rt_gateway)->sdl_type = ifa->ifa_ifp->if_type; ((struct sockaddr_dl *)rt->rt_gateway)->sdl_index = ifa->ifa_ifp->if_index; RTFREE_LOCKED(rt); return (0); } /* * XXX: Because sockaddr_dl has deeper structure than the sockaddr * structs used to represent other address families, it is necessary * to perform a different comparison. */ #define sa_dl_equal(a1, a2) \ ((((struct sockaddr_dl *)(a1))->sdl_len == \ ((struct sockaddr_dl *)(a2))->sdl_len) && \ (bcmp(LLADDR((struct sockaddr_dl *)(a1)), \ LLADDR((struct sockaddr_dl *)(a2)), \ ((struct sockaddr_dl *)(a1))->sdl_alen) == 0)) /* * Locate an interface based on a complete address. */ /*ARGSUSED*/ static struct ifaddr * ifa_ifwithaddr_internal(struct sockaddr *addr, int getref) { struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (sa_equal(addr, ifa->ifa_addr)) { if (getref) ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } /* IP6 doesn't have broadcast */ if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) { if (getref) ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } } IF_ADDR_RUNLOCK(ifp); } ifa = NULL; done: IFNET_RUNLOCK_NOSLEEP(); return (ifa); } struct ifaddr * ifa_ifwithaddr(struct sockaddr *addr) { return (ifa_ifwithaddr_internal(addr, 1)); } int ifa_ifwithaddr_check(struct sockaddr *addr) { return (ifa_ifwithaddr_internal(addr, 0) != NULL); } /* * Locate an interface based on the broadcast address. */ /* ARGSUSED */ struct ifaddr * ifa_ifwithbroadaddr(struct sockaddr *addr, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) { ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } } IF_ADDR_RUNLOCK(ifp); } ifa = NULL; done: IFNET_RUNLOCK_NOSLEEP(); return (ifa); } /* * Locate the point to point interface with a given destination address. */ /*ARGSUSED*/ struct ifaddr * ifa_ifwithdstaddr(struct sockaddr *addr, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((ifp->if_flags & IFF_POINTOPOINT) == 0) continue; if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } } IF_ADDR_RUNLOCK(ifp); } ifa = NULL; done: IFNET_RUNLOCK_NOSLEEP(); return (ifa); } /* * Find an interface on a specific network. If many, choice * is most specific found. */ struct ifaddr * ifa_ifwithnet(struct sockaddr *addr, int ignore_ptp, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; char *addr_data = addr->sa_data, *cplim; /* * AF_LINK addresses can be looked up directly by their index number, * so do that if we can. */ if (af == AF_LINK) { struct sockaddr_dl *sdl = (struct sockaddr_dl *)addr; if (sdl->sdl_index && sdl->sdl_index <= V_if_index) return (ifaddr_byindex(sdl->sdl_index)); } /* * Scan though each interface, looking for ones that have addresses * in this address family and the requested fib. Maintain a reference * on ifa_maybe once we find one, as we release the IF_ADDR_RLOCK() that * kept it stable when we move onto the next interface. */ IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { char *cp, *cp2, *cp3; if (ifa->ifa_addr->sa_family != af) next: continue; if (af == AF_INET && ifp->if_flags & IFF_POINTOPOINT && !ignore_ptp) { /* * This is a bit broken as it doesn't * take into account that the remote end may * be a single node in the network we are * looking for. * The trouble is that we don't know the * netmask for the remote end. */ if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto done; } } else { /* * Scan all the bits in the ifa's address. * If a bit dissagrees with what we are * looking for, mask it with the netmask * to see if it really matters. * (A byte at a time) */ if (ifa->ifa_netmask == 0) continue; cp = addr_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; while (cp3 < cplim) if ((*cp++ ^ *cp2++) & *cp3++) goto next; /* next address! */ /* * If the netmask of what we just found * is more specific than what we had before * (if we had one), or if the virtual status * of new prefix is better than of the old one, * then remember the new one before continuing * to search for an even better one. */ if (ifa_maybe == NULL || ifa_preferred(ifa_maybe, ifa) || rn_refines((caddr_t)ifa->ifa_netmask, (caddr_t)ifa_maybe->ifa_netmask)) { if (ifa_maybe != NULL) ifa_free(ifa_maybe); ifa_maybe = ifa; ifa_ref(ifa_maybe); } } } IF_ADDR_RUNLOCK(ifp); } ifa = ifa_maybe; ifa_maybe = NULL; done: IFNET_RUNLOCK_NOSLEEP(); if (ifa_maybe != NULL) ifa_free(ifa_maybe); return (ifa); } /* * Find an interface address specific to an interface best matching * a given address. */ struct ifaddr * ifaof_ifpforaddr(struct sockaddr *addr, struct ifnet *ifp) { struct ifaddr *ifa; char *cp, *cp2, *cp3; char *cplim; struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; if (af >= AF_MAX) return (NULL); IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != af) continue; if (ifa_maybe == NULL) ifa_maybe = ifa; if (ifa->ifa_netmask == 0) { if (sa_equal(addr, ifa->ifa_addr) || (ifa->ifa_dstaddr && sa_equal(addr, ifa->ifa_dstaddr))) goto done; continue; } if (ifp->if_flags & IFF_POINTOPOINT) { if (sa_equal(addr, ifa->ifa_dstaddr)) goto done; } else { cp = addr->sa_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; for (; cp3 < cplim; cp3++) if ((*cp++ ^ *cp2++) & *cp3) break; if (cp3 == cplim) goto done; } } ifa = ifa_maybe; done: if (ifa != NULL) ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); return (ifa); } /* * See whether new ifa is better than current one: * 1) A non-virtual one is preferred over virtual. * 2) A virtual in master state preferred over any other state. * * Used in several address selecting functions. */ int ifa_preferred(struct ifaddr *cur, struct ifaddr *next) { return (cur->ifa_carp && (!next->ifa_carp || ((*carp_master_p)(next) && !(*carp_master_p)(cur)))); } #include /* * Default action when installing a route with a Link Level gateway. * Lookup an appropriate real ifa to point to. * This should be moved to /sys/net/link.c eventually. */ static void link_rtrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info) { struct ifaddr *ifa, *oifa; struct sockaddr *dst; struct ifnet *ifp; if (cmd != RTM_ADD || ((ifa = rt->rt_ifa) == 0) || ((ifp = ifa->ifa_ifp) == 0) || ((dst = rt_key(rt)) == 0)) return; ifa = ifaof_ifpforaddr(dst, ifp); if (ifa) { oifa = rt->rt_ifa; rt->rt_ifa = ifa; ifa_free(oifa); if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest) ifa->ifa_rtrequest(cmd, rt, info); } } struct sockaddr_dl * link_alloc_sdl(size_t size, int flags) { return (malloc(size, M_TEMP, flags)); } void link_free_sdl(struct sockaddr *sa) { free(sa, M_TEMP); } /* * Fills in given sdl with interface basic info. * Returns pointer to filled sdl. */ struct sockaddr_dl * link_init_sdl(struct ifnet *ifp, struct sockaddr *paddr, u_char iftype) { struct sockaddr_dl *sdl; sdl = (struct sockaddr_dl *)paddr; memset(sdl, 0, sizeof(struct sockaddr_dl)); sdl->sdl_len = sizeof(struct sockaddr_dl); sdl->sdl_family = AF_LINK; sdl->sdl_index = ifp->if_index; sdl->sdl_type = iftype; return (sdl); } /* * Mark an interface down and notify protocols of * the transition. */ static void if_unroute(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_unroute: flag != IFF_UP")); ifp->if_flags &= ~flag; getmicrotime(&ifp->if_lastchange); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFDOWN, ifa->ifa_addr); ifp->if_qflush(ifp); if (ifp->if_carp) (*carp_linkstate_p)(ifp); rt_ifmsg(ifp); } /* * Mark an interface up and notify protocols of * the transition. */ static void if_route(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_route: flag != IFF_UP")); ifp->if_flags |= flag; getmicrotime(&ifp->if_lastchange); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFUP, ifa->ifa_addr); if (ifp->if_carp) (*carp_linkstate_p)(ifp); rt_ifmsg(ifp); #ifdef INET6 in6_if_up(ifp); #endif } void (*vlan_link_state_p)(struct ifnet *); /* XXX: private from if_vlan */ void (*vlan_trunk_cap_p)(struct ifnet *); /* XXX: private from if_vlan */ struct ifnet *(*vlan_trunkdev_p)(struct ifnet *); struct ifnet *(*vlan_devat_p)(struct ifnet *, uint16_t); int (*vlan_tag_p)(struct ifnet *, uint16_t *); int (*vlan_setcookie_p)(struct ifnet *, void *); void *(*vlan_cookie_p)(struct ifnet *); /* * Handle a change in the interface link state. To avoid LORs * between driver lock and upper layer locks, as well as possible * recursions, we post event to taskqueue, and all job * is done in static do_link_state_change(). */ void if_link_state_change(struct ifnet *ifp, int link_state) { /* Return if state hasn't changed. */ if (ifp->if_link_state == link_state) return; ifp->if_link_state = link_state; taskqueue_enqueue(taskqueue_swi, &ifp->if_linktask); } static void do_link_state_change(void *arg, int pending) { struct ifnet *ifp = (struct ifnet *)arg; int link_state = ifp->if_link_state; CURVNET_SET(ifp->if_vnet); /* Notify that the link state has changed. */ rt_ifmsg(ifp); if (ifp->if_vlantrunk != NULL) (*vlan_link_state_p)(ifp); if ((ifp->if_type == IFT_ETHER || ifp->if_type == IFT_L2VLAN) && - IFP2AC(ifp)->ac_netgraph != NULL) + ifp->if_l2com != NULL) (*ng_ether_link_state_p)(ifp, link_state); if (ifp->if_carp) (*carp_linkstate_p)(ifp); if (ifp->if_bridge) (*bridge_linkstate_p)(ifp); if (ifp->if_lagg) (*lagg_linkstate_p)(ifp, link_state); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN", NULL); if (pending > 1) if_printf(ifp, "%d link states coalesced\n", pending); if (log_link_state_change) log(LOG_NOTICE, "%s: link state changed to %s\n", ifp->if_xname, (link_state == LINK_STATE_UP) ? "UP" : "DOWN" ); EVENTHANDLER_INVOKE(ifnet_link_event, ifp, ifp->if_link_state); CURVNET_RESTORE(); } /* * Mark an interface down and notify protocols of * the transition. */ void if_down(struct ifnet *ifp) { if_unroute(ifp, IFF_UP, AF_UNSPEC); } /* * Mark an interface up and notify protocols of * the transition. */ void if_up(struct ifnet *ifp) { if_route(ifp, IFF_UP, AF_UNSPEC); } /* * Flush an interface queue. */ void if_qflush(struct ifnet *ifp) { struct mbuf *m, *n; struct ifaltq *ifq; ifq = &ifp->if_snd; IFQ_LOCK(ifq); #ifdef ALTQ if (ALTQ_IS_ENABLED(ifq)) ALTQ_PURGE(ifq); #endif n = ifq->ifq_head; while ((m = n) != 0) { n = m->m_nextpkt; m_freem(m); } ifq->ifq_head = 0; ifq->ifq_tail = 0; ifq->ifq_len = 0; IFQ_UNLOCK(ifq); } /* * Map interface name to interface structure pointer, with or without * returning a reference. */ struct ifnet * ifunit_ref(const char *name) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0 && !(ifp->if_flags & IFF_DYING)) break; } if (ifp != NULL) if_ref(ifp); IFNET_RUNLOCK_NOSLEEP(); return (ifp); } struct ifnet * ifunit(const char *name) { struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0) break; } IFNET_RUNLOCK_NOSLEEP(); return (ifp); } /* * Hardware specific interface ioctls. */ static int ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) { struct ifreq *ifr; int error = 0; int new_flags, temp_flags; size_t namelen, onamelen; size_t descrlen; char *descrbuf, *odescrbuf; char new_name[IFNAMSIZ]; struct ifaddr *ifa; struct sockaddr_dl *sdl; ifr = (struct ifreq *)data; switch (cmd) { case SIOCGIFINDEX: ifr->ifr_index = ifp->if_index; break; case SIOCGIFFLAGS: temp_flags = ifp->if_flags | ifp->if_drv_flags; ifr->ifr_flags = temp_flags & 0xffff; ifr->ifr_flagshigh = temp_flags >> 16; break; case SIOCGIFCAP: ifr->ifr_reqcap = ifp->if_capabilities; ifr->ifr_curcap = ifp->if_capenable; break; #ifdef MAC case SIOCGIFMAC: error = mac_ifnet_ioctl_get(td->td_ucred, ifr, ifp); break; #endif case SIOCGIFMETRIC: ifr->ifr_metric = ifp->if_metric; break; case SIOCGIFMTU: ifr->ifr_mtu = ifp->if_mtu; break; case SIOCGIFPHYS: /* XXXGL: did this ever worked? */ ifr->ifr_phys = 0; break; case SIOCGIFDESCR: error = 0; sx_slock(&ifdescr_sx); if (ifp->if_description == NULL) error = ENOMSG; else { /* space for terminating nul */ descrlen = strlen(ifp->if_description) + 1; if (ifr->ifr_buffer.length < descrlen) ifr->ifr_buffer.buffer = NULL; else error = copyout(ifp->if_description, ifr->ifr_buffer.buffer, descrlen); ifr->ifr_buffer.length = descrlen; } sx_sunlock(&ifdescr_sx); break; case SIOCSIFDESCR: error = priv_check(td, PRIV_NET_SETIFDESCR); if (error) return (error); /* * Copy only (length-1) bytes to make sure that * if_description is always nul terminated. The * length parameter is supposed to count the * terminating nul in. */ if (ifr->ifr_buffer.length > ifdescr_maxlen) return (ENAMETOOLONG); else if (ifr->ifr_buffer.length == 0) descrbuf = NULL; else { descrbuf = malloc(ifr->ifr_buffer.length, M_IFDESCR, M_WAITOK | M_ZERO); error = copyin(ifr->ifr_buffer.buffer, descrbuf, ifr->ifr_buffer.length - 1); if (error) { free(descrbuf, M_IFDESCR); break; } } sx_xlock(&ifdescr_sx); odescrbuf = ifp->if_description; ifp->if_description = descrbuf; sx_xunlock(&ifdescr_sx); getmicrotime(&ifp->if_lastchange); free(odescrbuf, M_IFDESCR); break; case SIOCGIFFIB: ifr->ifr_fib = ifp->if_fib; break; case SIOCSIFFIB: error = priv_check(td, PRIV_NET_SETIFFIB); if (error) return (error); if (ifr->ifr_fib >= rt_numfibs) return (EINVAL); ifp->if_fib = ifr->ifr_fib; break; case SIOCSIFFLAGS: error = priv_check(td, PRIV_NET_SETIFFLAGS); if (error) return (error); /* * Currently, no driver owned flags pass the IFF_CANTCHANGE * check, so we don't need special handling here yet. */ new_flags = (ifr->ifr_flags & 0xffff) | (ifr->ifr_flagshigh << 16); if (ifp->if_flags & IFF_UP && (new_flags & IFF_UP) == 0) { if_down(ifp); } else if (new_flags & IFF_UP && (ifp->if_flags & IFF_UP) == 0) { if_up(ifp); } /* See if permanently promiscuous mode bit is about to flip */ if ((ifp->if_flags ^ new_flags) & IFF_PPROMISC) { if (new_flags & IFF_PPROMISC) ifp->if_flags |= IFF_PROMISC; else if (ifp->if_pcount == 0) ifp->if_flags &= ~IFF_PROMISC; log(LOG_INFO, "%s: permanently promiscuous mode %s\n", ifp->if_xname, (new_flags & IFF_PPROMISC) ? "enabled" : "disabled"); } ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) | (new_flags &~ IFF_CANTCHANGE); if (ifp->if_ioctl) { (void) (*ifp->if_ioctl)(ifp, cmd, data); } getmicrotime(&ifp->if_lastchange); break; case SIOCSIFCAP: error = priv_check(td, PRIV_NET_SETIFCAP); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); if (ifr->ifr_reqcap & ~ifp->if_capabilities) return (EINVAL); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; #ifdef MAC case SIOCSIFMAC: error = mac_ifnet_ioctl_set(td->td_ucred, ifr, ifp); break; #endif case SIOCSIFNAME: error = priv_check(td, PRIV_NET_SETIFNAME); if (error) return (error); error = copyinstr(ifr->ifr_data, new_name, IFNAMSIZ, NULL); if (error != 0) return (error); if (new_name[0] == '\0') return (EINVAL); if (ifunit(new_name) != NULL) return (EEXIST); /* * XXX: Locking. Nothing else seems to lock if_flags, * and there are numerous other races with the * ifunit() checks not being atomic with namespace * changes (renames, vmoves, if_attach, etc). */ ifp->if_flags |= IFF_RENAMING; /* Announce the departure of the interface. */ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); log(LOG_INFO, "%s: changing name to '%s'\n", ifp->if_xname, new_name); IF_ADDR_WLOCK(ifp); strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname)); ifa = ifp->if_addr; sdl = (struct sockaddr_dl *)ifa->ifa_addr; namelen = strlen(new_name); onamelen = sdl->sdl_nlen; /* * Move the address if needed. This is safe because we * allocate space for a name of length IFNAMSIZ when we * create this in if_attach(). */ if (namelen != onamelen) { bcopy(sdl->sdl_data + onamelen, sdl->sdl_data + namelen, sdl->sdl_alen); } bcopy(new_name, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl = (struct sockaddr_dl *)ifa->ifa_netmask; bzero(sdl->sdl_data, onamelen); while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; IF_ADDR_WUNLOCK(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); /* Announce the return of the interface. */ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); ifp->if_flags &= ~IFF_RENAMING; break; #ifdef VIMAGE case SIOCSIFVNET: error = priv_check(td, PRIV_NET_SETIFVNET); if (error) return (error); error = if_vmove_loan(td, ifp, ifr->ifr_name, ifr->ifr_jid); break; #endif case SIOCSIFMETRIC: error = priv_check(td, PRIV_NET_SETIFMETRIC); if (error) return (error); ifp->if_metric = ifr->ifr_metric; getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYS: error = priv_check(td, PRIV_NET_SETIFPHYS); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFMTU: { u_long oldmtu = ifp->if_mtu; error = priv_check(td, PRIV_NET_SETIFMTU); if (error) return (error); if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU) return (EINVAL); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) { getmicrotime(&ifp->if_lastchange); rt_ifmsg(ifp); } /* * If the link MTU changed, do network layer specific procedure. */ if (ifp->if_mtu != oldmtu) { #ifdef INET6 nd6_setmtu(ifp); #endif } break; } case SIOCADDMULTI: case SIOCDELMULTI: if (cmd == SIOCADDMULTI) error = priv_check(td, PRIV_NET_ADDMULTI); else error = priv_check(td, PRIV_NET_DELMULTI); if (error) return (error); /* Don't allow group membership on non-multicast interfaces. */ if ((ifp->if_flags & IFF_MULTICAST) == 0) return (EOPNOTSUPP); /* Don't let users screw up protocols' entries. */ if (ifr->ifr_addr.sa_family != AF_LINK) return (EINVAL); if (cmd == SIOCADDMULTI) { struct ifmultiaddr *ifma; /* * Userland is only permitted to join groups once * via the if_addmulti() KPI, because it cannot hold * struct ifmultiaddr * between calls. It may also * lose a race while we check if the membership * already exists. */ IF_ADDR_RLOCK(ifp); ifma = if_findmulti(ifp, &ifr->ifr_addr); IF_ADDR_RUNLOCK(ifp); if (ifma != NULL) error = EADDRINUSE; else error = if_addmulti(ifp, &ifr->ifr_addr, &ifma); } else { error = if_delmulti(ifp, &ifr->ifr_addr); } if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYADDR: case SIOCDIFPHYADDR: #ifdef INET6 case SIOCSIFPHYADDR_IN6: #endif case SIOCSIFMEDIA: case SIOCSIFGENERIC: error = priv_check(td, PRIV_NET_HWIOCTL); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCGIFSTATUS: case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: case SIOCGIFMEDIA: case SIOCGIFGENERIC: if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); break; case SIOCSIFLLADDR: error = priv_check(td, PRIV_NET_SETLLADDR); if (error) return (error); error = if_setlladdr(ifp, ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len); EVENTHANDLER_INVOKE(iflladdr_event, ifp); break; case SIOCAIFGROUP: { struct ifgroupreq *ifgr = (struct ifgroupreq *)ifr; error = priv_check(td, PRIV_NET_ADDIFGROUP); if (error) return (error); if ((error = if_addgroup(ifp, ifgr->ifgr_group))) return (error); break; } case SIOCGIFGROUP: if ((error = if_getgroup((struct ifgroupreq *)ifr, ifp))) return (error); break; case SIOCDIFGROUP: { struct ifgroupreq *ifgr = (struct ifgroupreq *)ifr; error = priv_check(td, PRIV_NET_DELIFGROUP); if (error) return (error); if ((error = if_delgroup(ifp, ifgr->ifgr_group))) return (error); break; } default: error = ENOIOCTL; break; } return (error); } #ifdef COMPAT_FREEBSD32 struct ifconf32 { int32_t ifc_len; union { uint32_t ifcu_buf; uint32_t ifcu_req; } ifc_ifcu; }; #define SIOCGIFCONF32 _IOWR('i', 36, struct ifconf32) #endif /* * Interface ioctls. */ int ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td) { struct ifnet *ifp; struct ifreq *ifr; int error; int oif_flags; CURVNET_SET(so->so_vnet); switch (cmd) { case SIOCGIFCONF: error = ifconf(cmd, data); CURVNET_RESTORE(); return (error); #ifdef COMPAT_FREEBSD32 case SIOCGIFCONF32: { struct ifconf32 *ifc32; struct ifconf ifc; ifc32 = (struct ifconf32 *)data; ifc.ifc_len = ifc32->ifc_len; ifc.ifc_buf = PTRIN(ifc32->ifc_buf); error = ifconf(SIOCGIFCONF, (void *)&ifc); CURVNET_RESTORE(); if (error == 0) ifc32->ifc_len = ifc.ifc_len; return (error); } #endif } ifr = (struct ifreq *)data; switch (cmd) { #ifdef VIMAGE case SIOCSIFRVNET: error = priv_check(td, PRIV_NET_SETIFVNET); if (error == 0) error = if_vmove_reclaim(td, ifr->ifr_name, ifr->ifr_jid); CURVNET_RESTORE(); return (error); #endif case SIOCIFCREATE: case SIOCIFCREATE2: error = priv_check(td, PRIV_NET_IFCREATE); if (error == 0) error = if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name), cmd == SIOCIFCREATE2 ? ifr->ifr_data : NULL); CURVNET_RESTORE(); return (error); case SIOCIFDESTROY: error = priv_check(td, PRIV_NET_IFDESTROY); if (error == 0) error = if_clone_destroy(ifr->ifr_name); CURVNET_RESTORE(); return (error); case SIOCIFGCLONERS: error = if_clone_list((struct if_clonereq *)data); CURVNET_RESTORE(); return (error); case SIOCGIFGMEMB: error = if_getgroupmembers((struct ifgroupreq *)data); CURVNET_RESTORE(); return (error); #if defined(INET) || defined(INET6) case SIOCSVH: case SIOCGVH: if (carp_ioctl_p == NULL) error = EPROTONOSUPPORT; else error = (*carp_ioctl_p)(ifr, cmd, td); CURVNET_RESTORE(); return (error); #endif } ifp = ifunit_ref(ifr->ifr_name); if (ifp == NULL) { CURVNET_RESTORE(); return (ENXIO); } error = ifhwioctl(cmd, ifp, data, td); if (error != ENOIOCTL) { if_rele(ifp); CURVNET_RESTORE(); return (error); } oif_flags = ifp->if_flags; if (so->so_proto == NULL) { if_rele(ifp); CURVNET_RESTORE(); return (EOPNOTSUPP); } /* * Pass the request on to the socket control method, and if the * latter returns EOPNOTSUPP, directly to the interface. * * Make an exception for the legacy SIOCSIF* requests. Drivers * trust SIOCSIFADDR et al to come from an already privileged * layer, and do not perform any credentials checks or input * validation. */ error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data, ifp, td)); if (error == EOPNOTSUPP && ifp != NULL && ifp->if_ioctl != NULL && cmd != SIOCSIFADDR && cmd != SIOCSIFBRDADDR && cmd != SIOCSIFDSTADDR && cmd != SIOCSIFNETMASK) error = (*ifp->if_ioctl)(ifp, cmd, data); if ((oif_flags ^ ifp->if_flags) & IFF_UP) { #ifdef INET6 if (ifp->if_flags & IFF_UP) in6_if_up(ifp); #endif } if_rele(ifp); CURVNET_RESTORE(); return (error); } /* * The code common to handling reference counted flags, * e.g., in ifpromisc() and if_allmulti(). * The "pflag" argument can specify a permanent mode flag to check, * such as IFF_PPROMISC for promiscuous mode; should be 0 if none. * * Only to be used on stack-owned flags, not driver-owned flags. */ static int if_setflag(struct ifnet *ifp, int flag, int pflag, int *refcount, int onswitch) { struct ifreq ifr; int error; int oldflags, oldcount; /* Sanity checks to catch programming errors */ KASSERT((flag & (IFF_DRV_OACTIVE|IFF_DRV_RUNNING)) == 0, ("%s: setting driver-owned flag %d", __func__, flag)); if (onswitch) KASSERT(*refcount >= 0, ("%s: increment negative refcount %d for flag %d", __func__, *refcount, flag)); else KASSERT(*refcount > 0, ("%s: decrement non-positive refcount %d for flag %d", __func__, *refcount, flag)); /* In case this mode is permanent, just touch refcount */ if (ifp->if_flags & pflag) { *refcount += onswitch ? 1 : -1; return (0); } /* Save ifnet parameters for if_ioctl() may fail */ oldcount = *refcount; oldflags = ifp->if_flags; /* * See if we aren't the only and touching refcount is enough. * Actually toggle interface flag if we are the first or last. */ if (onswitch) { if ((*refcount)++) return (0); ifp->if_flags |= flag; } else { if (--(*refcount)) return (0); ifp->if_flags &= ~flag; } /* Call down the driver since we've changed interface flags */ if (ifp->if_ioctl == NULL) { error = EOPNOTSUPP; goto recover; } ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; error = (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); if (error) goto recover; /* Notify userland that interface flags have changed */ rt_ifmsg(ifp); return (0); recover: /* Recover after driver error */ *refcount = oldcount; ifp->if_flags = oldflags; return (error); } /* * Set/clear promiscuous mode on interface ifp based on the truth value * of pswitch. The calls are reference counted so that only the first * "on" request actually has an effect, as does the final "off" request. * Results are undefined if the "off" and "on" requests are not matched. */ int ifpromisc(struct ifnet *ifp, int pswitch) { int error; int oldflags = ifp->if_flags; error = if_setflag(ifp, IFF_PROMISC, IFF_PPROMISC, &ifp->if_pcount, pswitch); /* If promiscuous mode status has changed, log a message */ if (error == 0 && ((ifp->if_flags ^ oldflags) & IFF_PROMISC)) log(LOG_INFO, "%s: promiscuous mode %s\n", ifp->if_xname, (ifp->if_flags & IFF_PROMISC) ? "enabled" : "disabled"); return (error); } /* * Return interface configuration * of system. List may be used * in later ioctl's (above) to get * other information. */ /*ARGSUSED*/ static int ifconf(u_long cmd, caddr_t data) { struct ifconf *ifc = (struct ifconf *)data; struct ifnet *ifp; struct ifaddr *ifa; struct ifreq ifr; struct sbuf *sb; int error, full = 0, valid_len, max_len; /* Limit initial buffer size to MAXPHYS to avoid DoS from userspace. */ max_len = MAXPHYS - 1; /* Prevent hostile input from being able to crash the system */ if (ifc->ifc_len <= 0) return (EINVAL); again: if (ifc->ifc_len <= max_len) { max_len = ifc->ifc_len; full = 1; } sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN); max_len = 0; valid_len = 0; IFNET_RLOCK(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { int addrs; /* * Zero the ifr_name buffer to make sure we don't * disclose the contents of the stack. */ memset(ifr.ifr_name, 0, sizeof(ifr.ifr_name)); if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name)) >= sizeof(ifr.ifr_name)) { sbuf_delete(sb); IFNET_RUNLOCK(); return (ENAMETOOLONG); } addrs = 0; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa = ifa->ifa_addr; if (prison_if(curthread->td_ucred, sa) != 0) continue; addrs++; if (sa->sa_len <= sizeof(*sa)) { ifr.ifr_addr = *sa; sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); } else { sbuf_bcat(sb, &ifr, offsetof(struct ifreq, ifr_addr)); max_len += offsetof(struct ifreq, ifr_addr); sbuf_bcat(sb, sa, sa->sa_len); max_len += sa->sa_len; } if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } IF_ADDR_RUNLOCK(ifp); if (addrs == 0) { bzero((caddr_t)&ifr.ifr_addr, sizeof(ifr.ifr_addr)); sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } } IFNET_RUNLOCK(); /* * If we didn't allocate enough space (uncommon), try again. If * we have already allocated as much space as we are allowed, * return what we've got. */ if (valid_len != max_len && !full) { sbuf_delete(sb); goto again; } ifc->ifc_len = valid_len; sbuf_finish(sb); error = copyout(sbuf_data(sb), ifc->ifc_req, ifc->ifc_len); sbuf_delete(sb); return (error); } /* * Just like ifpromisc(), but for all-multicast-reception mode. */ int if_allmulti(struct ifnet *ifp, int onswitch) { return (if_setflag(ifp, IFF_ALLMULTI, 0, &ifp->if_amcount, onswitch)); } struct ifmultiaddr * if_findmulti(struct ifnet *ifp, struct sockaddr *sa) { struct ifmultiaddr *ifma; IF_ADDR_LOCK_ASSERT(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (sa->sa_family == AF_LINK) { if (sa_dl_equal(ifma->ifma_addr, sa)) break; } else { if (sa_equal(ifma->ifma_addr, sa)) break; } } return ifma; } /* * Allocate a new ifmultiaddr and initialize based on passed arguments. We * make copies of passed sockaddrs. The ifmultiaddr will not be added to * the ifnet multicast address list here, so the caller must do that and * other setup work (such as notifying the device driver). The reference * count is initialized to 1. */ static struct ifmultiaddr * if_allocmulti(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr *llsa, int mflags) { struct ifmultiaddr *ifma; struct sockaddr *dupsa; ifma = malloc(sizeof *ifma, M_IFMADDR, mflags | M_ZERO); if (ifma == NULL) return (NULL); dupsa = malloc(sa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma, M_IFMADDR); return (NULL); } bcopy(sa, dupsa, sa->sa_len); ifma->ifma_addr = dupsa; ifma->ifma_ifp = ifp; ifma->ifma_refcount = 1; ifma->ifma_protospec = NULL; if (llsa == NULL) { ifma->ifma_lladdr = NULL; return (ifma); } dupsa = malloc(llsa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); return (NULL); } bcopy(llsa, dupsa, llsa->sa_len); ifma->ifma_lladdr = dupsa; return (ifma); } /* * if_freemulti: free ifmultiaddr structure and possibly attached related * addresses. The caller is responsible for implementing reference * counting, notifying the driver, handling routing messages, and releasing * any dependent link layer state. */ static void if_freemulti(struct ifmultiaddr *ifma) { KASSERT(ifma->ifma_refcount == 0, ("if_freemulti: refcount %d", ifma->ifma_refcount)); if (ifma->ifma_lladdr != NULL) free(ifma->ifma_lladdr, M_IFMADDR); free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); } /* * Register an additional multicast address with a network interface. * * - If the address is already present, bump the reference count on the * address and return. * - If the address is not link-layer, look up a link layer address. * - Allocate address structures for one or both addresses, and attach to the * multicast address list on the interface. If automatically adding a link * layer address, the protocol address will own a reference to the link * layer address, to be freed when it is freed. * - Notify the network device driver of an addition to the multicast address * list. * * 'sa' points to caller-owned memory with the desired multicast address. * * 'retifma' will be used to return a pointer to the resulting multicast * address reference, if desired. */ int if_addmulti(struct ifnet *ifp, struct sockaddr *sa, struct ifmultiaddr **retifma) { struct ifmultiaddr *ifma, *ll_ifma; struct sockaddr *llsa; struct sockaddr_dl sdl; int error; /* * If the address is already present, return a new reference to it; * otherwise, allocate storage and set up a new address. */ IF_ADDR_WLOCK(ifp); ifma = if_findmulti(ifp, sa); if (ifma != NULL) { ifma->ifma_refcount++; if (retifma != NULL) *retifma = ifma; IF_ADDR_WUNLOCK(ifp); return (0); } /* * The address isn't already present; resolve the protocol address * into a link layer address, and then look that up, bump its * refcount or allocate an ifma for that also. * Most link layer resolving functions returns address data which * fits inside default sockaddr_dl structure. However callback * can allocate another sockaddr structure, in that case we need to * free it later. */ llsa = NULL; ll_ifma = NULL; if (ifp->if_resolvemulti != NULL) { /* Provide called function with buffer size information */ sdl.sdl_len = sizeof(sdl); llsa = (struct sockaddr *)&sdl; error = ifp->if_resolvemulti(ifp, &llsa, sa); if (error) goto unlock_out; } /* * Allocate the new address. Don't hook it up yet, as we may also * need to allocate a link layer multicast address. */ ifma = if_allocmulti(ifp, sa, llsa, M_NOWAIT); if (ifma == NULL) { error = ENOMEM; goto free_llsa_out; } /* * If a link layer address is found, we'll need to see if it's * already present in the address list, or allocate is as well. * When this block finishes, the link layer address will be on the * list. */ if (llsa != NULL) { ll_ifma = if_findmulti(ifp, llsa); if (ll_ifma == NULL) { ll_ifma = if_allocmulti(ifp, llsa, NULL, M_NOWAIT); if (ll_ifma == NULL) { --ifma->ifma_refcount; if_freemulti(ifma); error = ENOMEM; goto free_llsa_out; } TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ll_ifma, ifma_link); } else ll_ifma->ifma_refcount++; ifma->ifma_llifma = ll_ifma; } /* * We now have a new multicast address, ifma, and possibly a new or * referenced link layer address. Add the primary address to the * ifnet address list. */ TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link); if (retifma != NULL) *retifma = ifma; /* * Must generate the message while holding the lock so that 'ifma' * pointer is still valid. */ rt_newmaddrmsg(RTM_NEWMADDR, ifma); IF_ADDR_WUNLOCK(ifp); /* * We are certain we have added something, so call down to the * interface to let them know about it. */ if (ifp->if_ioctl != NULL) { (void) (*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0); } if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl)) link_free_sdl(llsa); return (0); free_llsa_out: if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl)) link_free_sdl(llsa); unlock_out: IF_ADDR_WUNLOCK(ifp); return (error); } /* * Delete a multicast group membership by network-layer group address. * * Returns ENOENT if the entry could not be found. If ifp no longer * exists, results are undefined. This entry point should only be used * from subsystems which do appropriate locking to hold ifp for the * duration of the call. * Network-layer protocol domains must use if_delmulti_ifma(). */ int if_delmulti(struct ifnet *ifp, struct sockaddr *sa) { struct ifmultiaddr *ifma; int lastref; #ifdef INVARIANTS struct ifnet *oifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; if (ifp != oifp) ifp = NULL; IFNET_RUNLOCK_NOSLEEP(); KASSERT(ifp != NULL, ("%s: ifnet went away", __func__)); #endif if (ifp == NULL) return (ENOENT); IF_ADDR_WLOCK(ifp); lastref = 0; ifma = if_findmulti(ifp, sa); if (ifma != NULL) lastref = if_delmulti_locked(ifp, ifma, 0); IF_ADDR_WUNLOCK(ifp); if (ifma == NULL) return (ENOENT); if (lastref && ifp->if_ioctl != NULL) { (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); } return (0); } /* * Delete all multicast group membership for an interface. * Should be used to quickly flush all multicast filters. */ void if_delallmulti(struct ifnet *ifp) { struct ifmultiaddr *ifma; struct ifmultiaddr *next; IF_ADDR_WLOCK(ifp); TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) if_delmulti_locked(ifp, ifma, 0); IF_ADDR_WUNLOCK(ifp); } /* * Delete a multicast group membership by group membership pointer. * Network-layer protocol domains must use this routine. * * It is safe to call this routine if the ifp disappeared. */ void if_delmulti_ifma(struct ifmultiaddr *ifma) { struct ifnet *ifp; int lastref; ifp = ifma->ifma_ifp; #ifdef DIAGNOSTIC if (ifp == NULL) { printf("%s: ifma_ifp seems to be detached\n", __func__); } else { struct ifnet *oifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; if (ifp != oifp) { printf("%s: ifnet %p disappeared\n", __func__, ifp); ifp = NULL; } IFNET_RUNLOCK_NOSLEEP(); } #endif /* * If and only if the ifnet instance exists: Acquire the address lock. */ if (ifp != NULL) IF_ADDR_WLOCK(ifp); lastref = if_delmulti_locked(ifp, ifma, 0); if (ifp != NULL) { /* * If and only if the ifnet instance exists: * Release the address lock. * If the group was left: update the hardware hash filter. */ IF_ADDR_WUNLOCK(ifp); if (lastref && ifp->if_ioctl != NULL) { (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); } } } /* * Perform deletion of network-layer and/or link-layer multicast address. * * Return 0 if the reference count was decremented. * Return 1 if the final reference was released, indicating that the * hardware hash filter should be reprogrammed. */ static int if_delmulti_locked(struct ifnet *ifp, struct ifmultiaddr *ifma, int detaching) { struct ifmultiaddr *ll_ifma; if (ifp != NULL && ifma->ifma_ifp != NULL) { KASSERT(ifma->ifma_ifp == ifp, ("%s: inconsistent ifp %p", __func__, ifp)); IF_ADDR_WLOCK_ASSERT(ifp); } ifp = ifma->ifma_ifp; /* * If the ifnet is detaching, null out references to ifnet, * so that upper protocol layers will notice, and not attempt * to obtain locks for an ifnet which no longer exists. The * routing socket announcement must happen before the ifnet * instance is detached from the system. */ if (detaching) { #ifdef DIAGNOSTIC printf("%s: detaching ifnet instance %p\n", __func__, ifp); #endif /* * ifp may already be nulled out if we are being reentered * to delete the ll_ifma. */ if (ifp != NULL) { rt_newmaddrmsg(RTM_DELMADDR, ifma); ifma->ifma_ifp = NULL; } } if (--ifma->ifma_refcount > 0) return 0; /* * If this ifma is a network-layer ifma, a link-layer ifma may * have been associated with it. Release it first if so. */ ll_ifma = ifma->ifma_llifma; if (ll_ifma != NULL) { KASSERT(ifma->ifma_lladdr != NULL, ("%s: llifma w/o lladdr", __func__)); if (detaching) ll_ifma->ifma_ifp = NULL; /* XXX */ if (--ll_ifma->ifma_refcount == 0) { if (ifp != NULL) { TAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifma_link); } if_freemulti(ll_ifma); } } if (ifp != NULL) TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link); if_freemulti(ifma); /* * The last reference to this instance of struct ifmultiaddr * was released; the hardware should be notified of this change. */ return 1; } /* * Set the link layer address on an interface. * * At this time we only support certain types of interfaces, * and we don't allow the length of the address to change. */ int if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len) { struct sockaddr_dl *sdl; struct ifaddr *ifa; struct ifreq ifr; IF_ADDR_RLOCK(ifp); ifa = ifp->if_addr; if (ifa == NULL) { IF_ADDR_RUNLOCK(ifp); return (EINVAL); } ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); sdl = (struct sockaddr_dl *)ifa->ifa_addr; if (sdl == NULL) { ifa_free(ifa); return (EINVAL); } if (len != sdl->sdl_alen) { /* don't allow length to change */ ifa_free(ifa); return (EINVAL); } switch (ifp->if_type) { case IFT_ETHER: case IFT_FDDI: case IFT_XETHER: case IFT_ISO88025: case IFT_L2VLAN: case IFT_BRIDGE: case IFT_ARCNET: case IFT_IEEE8023ADLAG: case IFT_IEEE80211: bcopy(lladdr, LLADDR(sdl), len); ifa_free(ifa); break; default: ifa_free(ifa); return (ENODEV); } /* * If the interface is already up, we need * to re-init it in order to reprogram its * address filter. */ if ((ifp->if_flags & IFF_UP) != 0) { if (ifp->if_ioctl) { ifp->if_flags &= ~IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); ifp->if_flags |= IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); } #ifdef INET /* * Also send gratuitous ARPs to notify other nodes about * the address change. */ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == AF_INET) arp_ifinit(ifp, ifa); } #endif } return (0); } /* * The name argument must be a pointer to storage which will last as * long as the interface does. For physical devices, the result of * device_get_name(dev) is a good choice and for pseudo-devices a * static string works well. */ void if_initname(struct ifnet *ifp, const char *name, int unit) { ifp->if_dname = name; ifp->if_dunit = unit; if (unit != IF_DUNIT_NONE) snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit); else strlcpy(ifp->if_xname, name, IFNAMSIZ); } int if_printf(struct ifnet *ifp, const char * fmt, ...) { va_list ap; int retval; retval = printf("%s: ", ifp->if_xname); va_start(ap, fmt); retval += vprintf(fmt, ap); va_end(ap); return (retval); } void if_start(struct ifnet *ifp) { (*(ifp)->if_start)(ifp); } /* * Backwards compatibility interface for drivers * that have not implemented it */ static int if_transmit(struct ifnet *ifp, struct mbuf *m) { int error; IFQ_HANDOFF(ifp, m, error); return (error); } int if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust) { int active = 0; IF_LOCK(ifq); if (_IF_QFULL(ifq)) { IF_UNLOCK(ifq); if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); m_freem(m); return (0); } if (ifp != NULL) { if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len + adjust); if (m->m_flags & (M_BCAST|M_MCAST)) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); active = ifp->if_drv_flags & IFF_DRV_OACTIVE; } _IF_ENQUEUE(ifq, m); IF_UNLOCK(ifq); if (ifp != NULL && !active) (*(ifp)->if_start)(ifp); return (1); } void if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f) { KASSERT(if_com_alloc[type] == NULL, ("if_register_com_alloc: %d already registered", type)); KASSERT(if_com_free[type] == NULL, ("if_register_com_alloc: %d free already registered", type)); if_com_alloc[type] = a; if_com_free[type] = f; } void if_deregister_com_alloc(u_char type) { KASSERT(if_com_alloc[type] != NULL, ("if_deregister_com_alloc: %d not registered", type)); KASSERT(if_com_free[type] != NULL, ("if_deregister_com_alloc: %d free not registered", type)); if_com_alloc[type] = NULL; if_com_free[type] = NULL; } /* API for driver access to network stack owned ifnet.*/ uint64_t if_setbaudrate(struct ifnet *ifp, uint64_t baudrate) { uint64_t oldbrate; oldbrate = ifp->if_baudrate; ifp->if_baudrate = baudrate; return (oldbrate); } uint64_t if_getbaudrate(if_t ifp) { return (((struct ifnet *)ifp)->if_baudrate); } int if_setcapabilities(if_t ifp, int capabilities) { ((struct ifnet *)ifp)->if_capabilities = capabilities; return (0); } int if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit) { ((struct ifnet *)ifp)->if_capabilities |= setbit; ((struct ifnet *)ifp)->if_capabilities &= ~clearbit; return (0); } int if_getcapabilities(if_t ifp) { return ((struct ifnet *)ifp)->if_capabilities; } int if_setcapenable(if_t ifp, int capabilities) { ((struct ifnet *)ifp)->if_capenable = capabilities; return (0); } int if_setcapenablebit(if_t ifp, int setcap, int clearcap) { if(setcap) ((struct ifnet *)ifp)->if_capenable |= setcap; if(clearcap) ((struct ifnet *)ifp)->if_capenable &= ~clearcap; return (0); } const char * if_getdname(if_t ifp) { return ((struct ifnet *)ifp)->if_dname; } int if_togglecapenable(if_t ifp, int togglecap) { ((struct ifnet *)ifp)->if_capenable ^= togglecap; return (0); } int if_getcapenable(if_t ifp) { return ((struct ifnet *)ifp)->if_capenable; } /* * This is largely undesirable because it ties ifnet to a device, but does * provide flexiblity for an embedded product vendor. Should be used with * the understanding that it violates the interface boundaries, and should be * a last resort only. */ int if_setdev(if_t ifp, void *dev) { return (0); } int if_setdrvflagbits(if_t ifp, int set_flags, int clear_flags) { ((struct ifnet *)ifp)->if_drv_flags |= set_flags; ((struct ifnet *)ifp)->if_drv_flags &= ~clear_flags; return (0); } int if_getdrvflags(if_t ifp) { return ((struct ifnet *)ifp)->if_drv_flags; } int if_setdrvflags(if_t ifp, int flags) { ((struct ifnet *)ifp)->if_drv_flags = flags; return (0); } int if_setflags(if_t ifp, int flags) { ((struct ifnet *)ifp)->if_flags = flags; return (0); } int if_setflagbits(if_t ifp, int set, int clear) { ((struct ifnet *)ifp)->if_flags |= set; ((struct ifnet *)ifp)->if_flags &= ~clear; return (0); } int if_getflags(if_t ifp) { return ((struct ifnet *)ifp)->if_flags; } int if_clearhwassist(if_t ifp) { ((struct ifnet *)ifp)->if_hwassist = 0; return (0); } int if_sethwassistbits(if_t ifp, int toset, int toclear) { ((struct ifnet *)ifp)->if_hwassist |= toset; ((struct ifnet *)ifp)->if_hwassist &= ~toclear; return (0); } int if_sethwassist(if_t ifp, int hwassist_bit) { ((struct ifnet *)ifp)->if_hwassist = hwassist_bit; return (0); } int if_gethwassist(if_t ifp) { return ((struct ifnet *)ifp)->if_hwassist; } int if_setmtu(if_t ifp, int mtu) { ((struct ifnet *)ifp)->if_mtu = mtu; return (0); } int if_getmtu(if_t ifp) { return ((struct ifnet *)ifp)->if_mtu; } int if_getmtu_family(if_t ifp, int family) { struct domain *dp; for (dp = domains; dp; dp = dp->dom_next) { if (dp->dom_family == family && dp->dom_ifmtu != NULL) return (dp->dom_ifmtu((struct ifnet *)ifp)); } return (((struct ifnet *)ifp)->if_mtu); } int if_setsoftc(if_t ifp, void *softc) { ((struct ifnet *)ifp)->if_softc = softc; return (0); } void * if_getsoftc(if_t ifp) { return ((struct ifnet *)ifp)->if_softc; } void if_setrcvif(struct mbuf *m, if_t ifp) { m->m_pkthdr.rcvif = (struct ifnet *)ifp; } void if_setvtag(struct mbuf *m, uint16_t tag) { m->m_pkthdr.ether_vtag = tag; } uint16_t if_getvtag(struct mbuf *m) { return (m->m_pkthdr.ether_vtag); } int if_sendq_empty(if_t ifp) { return IFQ_DRV_IS_EMPTY(&((struct ifnet *)ifp)->if_snd); } struct ifaddr * if_getifaddr(if_t ifp) { return ((struct ifnet *)ifp)->if_addr; } int if_getamcount(if_t ifp) { return ((struct ifnet *)ifp)->if_amcount; } int if_setsendqready(if_t ifp) { IFQ_SET_READY(&((struct ifnet *)ifp)->if_snd); return (0); } int if_setsendqlen(if_t ifp, int tx_desc_count) { IFQ_SET_MAXLEN(&((struct ifnet *)ifp)->if_snd, tx_desc_count); ((struct ifnet *)ifp)->if_snd.ifq_drv_maxlen = tx_desc_count; return (0); } int if_vlantrunkinuse(if_t ifp) { return ((struct ifnet *)ifp)->if_vlantrunk != NULL?1:0; } int if_input(if_t ifp, struct mbuf* sendmp) { (*((struct ifnet *)ifp)->if_input)((struct ifnet *)ifp, sendmp); return (0); } /* XXX */ #ifndef ETH_ADDR_LEN #define ETH_ADDR_LEN 6 #endif int if_setupmultiaddr(if_t ifp, void *mta, int *cnt, int max) { struct ifmultiaddr *ifma; uint8_t *lmta = (uint8_t *)mta; int mcnt = 0; TAILQ_FOREACH(ifma, &((struct ifnet *)ifp)->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (mcnt == max) break; bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), &lmta[mcnt * ETH_ADDR_LEN], ETH_ADDR_LEN); mcnt++; } *cnt = mcnt; return (0); } int if_multiaddr_array(if_t ifp, void *mta, int *cnt, int max) { int error; if_maddr_rlock(ifp); error = if_setupmultiaddr(ifp, mta, cnt, max); if_maddr_runlock(ifp); return (error); } int if_multiaddr_count(if_t ifp, int max) { struct ifmultiaddr *ifma; int count; count = 0; if_maddr_rlock(ifp); TAILQ_FOREACH(ifma, &((struct ifnet *)ifp)->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; count++; if (count == max) break; } if_maddr_runlock(ifp); return (count); } struct mbuf * if_dequeue(if_t ifp) { struct mbuf *m; IFQ_DRV_DEQUEUE(&((struct ifnet *)ifp)->if_snd, m); return (m); } int if_sendq_prepend(if_t ifp, struct mbuf *m) { IFQ_DRV_PREPEND(&((struct ifnet *)ifp)->if_snd, m); return (0); } int if_setifheaderlen(if_t ifp, int len) { ((struct ifnet *)ifp)->if_hdrlen = len; return (0); } caddr_t if_getlladdr(if_t ifp) { return (IF_LLADDR((struct ifnet *)ifp)); } void * if_gethandle(u_char type) { return (if_alloc(type)); } void if_bpfmtap(if_t ifh, struct mbuf *m) { struct ifnet *ifp = (struct ifnet *)ifh; BPF_MTAP(ifp, m); } void if_etherbpfmtap(if_t ifh, struct mbuf *m) { struct ifnet *ifp = (struct ifnet *)ifh; ETHER_BPF_MTAP(ifp, m); } void if_vlancap(if_t ifh) { struct ifnet *ifp = (struct ifnet *)ifh; VLAN_CAPABILITIES(ifp); } void if_setinitfn(if_t ifp, void (*init_fn)(void *)) { ((struct ifnet *)ifp)->if_init = init_fn; } void if_setioctlfn(if_t ifp, int (*ioctl_fn)(if_t, u_long, caddr_t)) { ((struct ifnet *)ifp)->if_ioctl = (void *)ioctl_fn; } void if_setstartfn(if_t ifp, void (*start_fn)(if_t)) { ((struct ifnet *)ifp)->if_start = (void *)start_fn; } void if_settransmitfn(if_t ifp, if_transmit_fn_t start_fn) { ((struct ifnet *)ifp)->if_transmit = start_fn; } void if_setqflushfn(if_t ifp, if_qflush_fn_t flush_fn) { ((struct ifnet *)ifp)->if_qflush = flush_fn; } void if_setgetcounterfn(if_t ifp, if_get_counter_t fn) { ifp->if_get_counter = fn; } /* Revisit these - These are inline functions originally. */ int drbr_inuse_drv(if_t ifh, struct buf_ring *br) { return drbr_inuse_drv(ifh, br); } struct mbuf* drbr_dequeue_drv(if_t ifh, struct buf_ring *br) { return drbr_dequeue(ifh, br); } int drbr_needs_enqueue_drv(if_t ifh, struct buf_ring *br) { return drbr_needs_enqueue(ifh, br); } int drbr_enqueue_drv(if_t ifh, struct buf_ring *br, struct mbuf *m) { return drbr_enqueue(ifh, br, m); } Index: user/dchagin/lemul/sys/net/if_arp.h =================================================================== --- user/dchagin/lemul/sys/net/if_arp.h (revision 274240) +++ user/dchagin/lemul/sys/net/if_arp.h (revision 274241) @@ -1,147 +1,133 @@ /*- * Copyright (c) 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if_arp.h 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #ifndef _NET_IF_ARP_H_ #define _NET_IF_ARP_H_ /* * Address Resolution Protocol. * * See RFC 826 for protocol description. ARP packets are variable * in size; the arphdr structure defines the fixed-length portion. * Protocol type values are the same as those for 10 Mb/s Ethernet. * It is followed by the variable-sized fields ar_sha, arp_spa, * arp_tha and arp_tpa in that order, according to the lengths * specified. Field names used correspond to RFC 826. */ struct arphdr { u_short ar_hrd; /* format of hardware address */ #define ARPHRD_ETHER 1 /* ethernet hardware format */ #define ARPHRD_IEEE802 6 /* token-ring hardware format */ #define ARPHRD_ARCNET 7 /* arcnet hardware format */ #define ARPHRD_FRELAY 15 /* frame relay hardware format */ #define ARPHRD_IEEE1394 24 /* firewire hardware format */ #define ARPHRD_INFINIBAND 32 /* infiniband hardware format */ u_short ar_pro; /* format of protocol address */ u_char ar_hln; /* length of hardware address */ u_char ar_pln; /* length of protocol address */ u_short ar_op; /* one of: */ #define ARPOP_REQUEST 1 /* request to resolve address */ #define ARPOP_REPLY 2 /* response to previous request */ #define ARPOP_REVREQUEST 3 /* request protocol address given hardware */ #define ARPOP_REVREPLY 4 /* response giving protocol address */ #define ARPOP_INVREQUEST 8 /* request to identify peer */ #define ARPOP_INVREPLY 9 /* response identifying peer */ /* * The remaining fields are variable in size, * according to the sizes above. */ #ifdef COMMENT_ONLY u_char ar_sha[]; /* sender hardware address */ u_char ar_spa[]; /* sender protocol address */ u_char ar_tha[]; /* target hardware address */ u_char ar_tpa[]; /* target protocol address */ #endif }; #define ar_sha(ap) (((caddr_t)((ap)+1)) + 0) #define ar_spa(ap) (((caddr_t)((ap)+1)) + (ap)->ar_hln) #define ar_tha(ap) (((caddr_t)((ap)+1)) + (ap)->ar_hln + (ap)->ar_pln) #define ar_tpa(ap) (((caddr_t)((ap)+1)) + 2*(ap)->ar_hln + (ap)->ar_pln) #define arphdr_len2(ar_hln, ar_pln) \ (sizeof(struct arphdr) + 2*(ar_hln) + 2*(ar_pln)) #define arphdr_len(ap) (arphdr_len2((ap)->ar_hln, (ap)->ar_pln)) /* * ARP ioctl request */ struct arpreq { struct sockaddr arp_pa; /* protocol address */ struct sockaddr arp_ha; /* hardware address */ int arp_flags; /* flags */ }; /* arp_flags and at_flags field values */ #define ATF_INUSE 0x01 /* entry in use */ #define ATF_COM 0x02 /* completed entry (enaddr valid) */ #define ATF_PERM 0x04 /* permanent entry */ #define ATF_PUBL 0x08 /* publish entry (respond for other host) */ #define ATF_USETRAILERS 0x10 /* has requested trailers */ -#ifdef _KERNEL -/* - * Structure shared between the ethernet driver modules and - * the address resolution code. - */ -struct arpcom { - struct ifnet *ac_ifp; /* network-visible interface */ - void *ac_netgraph; /* ng_ether(4) netgraph node info */ -}; -#define IFP2AC(ifp) ((struct arpcom *)(ifp->if_l2com)) -#define AC2IFP(ac) ((ac)->ac_ifp) - -#endif /* _KERNEL */ - struct arpstat { /* Normal things that happen: */ uint64_t txrequests; /* # of ARP requests sent by this host. */ uint64_t txreplies; /* # of ARP replies sent by this host. */ uint64_t rxrequests; /* # of ARP requests received by this host. */ uint64_t rxreplies; /* # of ARP replies received by this host. */ uint64_t received; /* # of ARP packets received by this host. */ uint64_t arp_spares[4]; /* For either the upper or lower half. */ /* Abnormal event and error counting: */ uint64_t dropped; /* # of packets dropped waiting for a reply. */ uint64_t timeouts; /* # of times with entries removed */ /* due to timeout. */ uint64_t dupips; /* # of duplicate IPs detected. */ }; #ifdef _KERNEL #include #include VNET_PCPUSTAT_DECLARE(struct arpstat, arpstat); /* * In-kernel consumers can use these accessor macros directly to update * stats. */ #define ARPSTAT_ADD(name, val) \ VNET_PCPUSTAT_ADD(struct arpstat, arpstat, name, (val)) #define ARPSTAT_SUB(name, val) ARPSTAT_ADD(name, -(val)) #define ARPSTAT_INC(name) ARPSTAT_ADD(name, 1) #define ARPSTAT_DEC(name) ARPSTAT_SUB(name, 1) #endif /* _KERNEL */ #endif /* !_NET_IF_ARP_H_ */ Index: user/dchagin/lemul/sys/net/if_bridge.c =================================================================== --- user/dchagin/lemul/sys/net/if_bridge.c (revision 274240) +++ user/dchagin/lemul/sys/net/if_bridge.c (revision 274241) @@ -1,3562 +1,3562 @@ /* $NetBSD: if_bridge.c,v 1.31 2005/06/01 19:45:34 jdc Exp $ */ /* * Copyright 2001 Wasabi Systems, Inc. * All rights reserved. * * Written by Jason R. Thorpe for Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the NetBSD Project by * Wasabi Systems, Inc. * 4. The name of Wasabi Systems, Inc. may not be used to endorse * or promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1999, 2000 Jason L. Wright (jason@thought.net) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * OpenBSD: if_bridge.c,v 1.60 2001/06/15 03:38:33 itojun Exp */ /* * Network interface bridge support. * * TODO: * * - Currently only supports Ethernet-like interfaces (Ethernet, * 802.11, VLANs on Ethernet, etc.) Figure out a nice way * to bridge other types of interfaces (FDDI-FDDI, and maybe * consider heterogenous bridges). */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include /* for net/if.h */ #include #include /* string functions */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include /* for struct arpcom */ +#include #include #include #include #include #ifdef INET6 #include #include #include #endif #if defined(INET) || defined(INET6) #include #endif #include -#include /* for struct arpcom */ +#include #include #include #include #include #include /* * Size of the route hash table. Must be a power of two. */ #ifndef BRIDGE_RTHASH_SIZE #define BRIDGE_RTHASH_SIZE 1024 #endif #define BRIDGE_RTHASH_MASK (BRIDGE_RTHASH_SIZE - 1) /* * Default maximum number of addresses to cache. */ #ifndef BRIDGE_RTABLE_MAX #define BRIDGE_RTABLE_MAX 2000 #endif /* * Timeout (in seconds) for entries learned dynamically. */ #ifndef BRIDGE_RTABLE_TIMEOUT #define BRIDGE_RTABLE_TIMEOUT (20 * 60) /* same as ARP */ #endif /* * Number of seconds between walks of the route list. */ #ifndef BRIDGE_RTABLE_PRUNE_PERIOD #define BRIDGE_RTABLE_PRUNE_PERIOD (5 * 60) #endif /* * List of capabilities to possibly mask on the member interface. */ #define BRIDGE_IFCAPS_MASK (IFCAP_TOE|IFCAP_TSO|IFCAP_TXCSUM) /* * List of capabilities to strip */ #define BRIDGE_IFCAPS_STRIP IFCAP_LRO /* * Bridge interface list entry. */ struct bridge_iflist { LIST_ENTRY(bridge_iflist) bif_next; struct ifnet *bif_ifp; /* member if */ struct bstp_port bif_stp; /* STP state */ uint32_t bif_flags; /* member if flags */ int bif_savedcaps; /* saved capabilities */ uint32_t bif_addrmax; /* max # of addresses */ uint32_t bif_addrcnt; /* cur. # of addresses */ uint32_t bif_addrexceeded;/* # of address violations */ }; /* * Bridge route node. */ struct bridge_rtnode { LIST_ENTRY(bridge_rtnode) brt_hash; /* hash table linkage */ LIST_ENTRY(bridge_rtnode) brt_list; /* list linkage */ struct bridge_iflist *brt_dst; /* destination if */ unsigned long brt_expire; /* expiration time */ uint8_t brt_flags; /* address flags */ uint8_t brt_addr[ETHER_ADDR_LEN]; uint16_t brt_vlan; /* vlan id */ }; #define brt_ifp brt_dst->bif_ifp /* * Software state for each bridge. */ struct bridge_softc { struct ifnet *sc_ifp; /* make this an interface */ LIST_ENTRY(bridge_softc) sc_list; struct mtx sc_mtx; struct cv sc_cv; uint32_t sc_brtmax; /* max # of addresses */ uint32_t sc_brtcnt; /* cur. # of addresses */ uint32_t sc_brttimeout; /* rt timeout in seconds */ struct callout sc_brcallout; /* bridge callout */ uint32_t sc_iflist_ref; /* refcount for sc_iflist */ uint32_t sc_iflist_xcnt; /* refcount for sc_iflist */ LIST_HEAD(, bridge_iflist) sc_iflist; /* member interface list */ LIST_HEAD(, bridge_rtnode) *sc_rthash; /* our forwarding table */ LIST_HEAD(, bridge_rtnode) sc_rtlist; /* list version of above */ uint32_t sc_rthash_key; /* key for hash */ LIST_HEAD(, bridge_iflist) sc_spanlist; /* span ports list */ struct bstp_state sc_stp; /* STP state */ uint32_t sc_brtexceeded; /* # of cache drops */ struct ifnet *sc_ifaddr; /* member mac copied from */ u_char sc_defaddr[6]; /* Default MAC address */ }; static VNET_DEFINE(struct mtx, bridge_list_mtx); #define V_bridge_list_mtx VNET(bridge_list_mtx) eventhandler_tag bridge_detach_cookie = NULL; int bridge_rtable_prune_period = BRIDGE_RTABLE_PRUNE_PERIOD; uma_zone_t bridge_rtnode_zone; static int bridge_clone_create(struct if_clone *, int, caddr_t); static void bridge_clone_destroy(struct ifnet *); static int bridge_ioctl(struct ifnet *, u_long, caddr_t); static void bridge_mutecaps(struct bridge_softc *); static void bridge_set_ifcap(struct bridge_softc *, struct bridge_iflist *, int); static void bridge_ifdetach(void *arg __unused, struct ifnet *); static void bridge_init(void *); static void bridge_dummynet(struct mbuf *, struct ifnet *); static void bridge_stop(struct ifnet *, int); static int bridge_transmit(struct ifnet *, struct mbuf *); static void bridge_qflush(struct ifnet *); static struct mbuf *bridge_input(struct ifnet *, struct mbuf *); static int bridge_output(struct ifnet *, struct mbuf *, struct sockaddr *, struct rtentry *); static int bridge_enqueue(struct bridge_softc *, struct ifnet *, struct mbuf *); static void bridge_rtdelete(struct bridge_softc *, struct ifnet *ifp, int); static void bridge_forward(struct bridge_softc *, struct bridge_iflist *, struct mbuf *m); static void bridge_timer(void *); static void bridge_broadcast(struct bridge_softc *, struct ifnet *, struct mbuf *, int); static void bridge_span(struct bridge_softc *, struct mbuf *); static int bridge_rtupdate(struct bridge_softc *, const uint8_t *, uint16_t, struct bridge_iflist *, int, uint8_t); static struct ifnet *bridge_rtlookup(struct bridge_softc *, const uint8_t *, uint16_t); static void bridge_rttrim(struct bridge_softc *); static void bridge_rtage(struct bridge_softc *); static void bridge_rtflush(struct bridge_softc *, int); static int bridge_rtdaddr(struct bridge_softc *, const uint8_t *, uint16_t); static void bridge_rtable_init(struct bridge_softc *); static void bridge_rtable_fini(struct bridge_softc *); static int bridge_rtnode_addr_cmp(const uint8_t *, const uint8_t *); static struct bridge_rtnode *bridge_rtnode_lookup(struct bridge_softc *, const uint8_t *, uint16_t); static int bridge_rtnode_insert(struct bridge_softc *, struct bridge_rtnode *); static void bridge_rtnode_destroy(struct bridge_softc *, struct bridge_rtnode *); static void bridge_rtable_expire(struct ifnet *, int); static void bridge_state_change(struct ifnet *, int); static struct bridge_iflist *bridge_lookup_member(struct bridge_softc *, const char *name); static struct bridge_iflist *bridge_lookup_member_if(struct bridge_softc *, struct ifnet *ifp); static void bridge_delete_member(struct bridge_softc *, struct bridge_iflist *, int); static void bridge_delete_span(struct bridge_softc *, struct bridge_iflist *); static int bridge_ioctl_add(struct bridge_softc *, void *); static int bridge_ioctl_del(struct bridge_softc *, void *); static int bridge_ioctl_gifflags(struct bridge_softc *, void *); static int bridge_ioctl_sifflags(struct bridge_softc *, void *); static int bridge_ioctl_scache(struct bridge_softc *, void *); static int bridge_ioctl_gcache(struct bridge_softc *, void *); static int bridge_ioctl_gifs(struct bridge_softc *, void *); static int bridge_ioctl_rts(struct bridge_softc *, void *); static int bridge_ioctl_saddr(struct bridge_softc *, void *); static int bridge_ioctl_sto(struct bridge_softc *, void *); static int bridge_ioctl_gto(struct bridge_softc *, void *); static int bridge_ioctl_daddr(struct bridge_softc *, void *); static int bridge_ioctl_flush(struct bridge_softc *, void *); static int bridge_ioctl_gpri(struct bridge_softc *, void *); static int bridge_ioctl_spri(struct bridge_softc *, void *); static int bridge_ioctl_ght(struct bridge_softc *, void *); static int bridge_ioctl_sht(struct bridge_softc *, void *); static int bridge_ioctl_gfd(struct bridge_softc *, void *); static int bridge_ioctl_sfd(struct bridge_softc *, void *); static int bridge_ioctl_gma(struct bridge_softc *, void *); static int bridge_ioctl_sma(struct bridge_softc *, void *); static int bridge_ioctl_sifprio(struct bridge_softc *, void *); static int bridge_ioctl_sifcost(struct bridge_softc *, void *); static int bridge_ioctl_sifmaxaddr(struct bridge_softc *, void *); static int bridge_ioctl_addspan(struct bridge_softc *, void *); static int bridge_ioctl_delspan(struct bridge_softc *, void *); static int bridge_ioctl_gbparam(struct bridge_softc *, void *); static int bridge_ioctl_grte(struct bridge_softc *, void *); static int bridge_ioctl_gifsstp(struct bridge_softc *, void *); static int bridge_ioctl_sproto(struct bridge_softc *, void *); static int bridge_ioctl_stxhc(struct bridge_softc *, void *); static int bridge_pfil(struct mbuf **, struct ifnet *, struct ifnet *, int); static int bridge_ip_checkbasic(struct mbuf **mp); #ifdef INET6 static int bridge_ip6_checkbasic(struct mbuf **mp); #endif /* INET6 */ static int bridge_fragment(struct ifnet *, struct mbuf *, struct ether_header *, int, struct llc *); static void bridge_linkstate(struct ifnet *ifp); static void bridge_linkcheck(struct bridge_softc *sc); extern void (*bridge_linkstate_p)(struct ifnet *ifp); /* The default bridge vlan is 1 (IEEE 802.1Q-2003 Table 9-2) */ #define VLANTAGOF(_m) \ (_m->m_flags & M_VLANTAG) ? EVL_VLANOFTAG(_m->m_pkthdr.ether_vtag) : 1 static struct bstp_cb_ops bridge_ops = { .bcb_state = bridge_state_change, .bcb_rtage = bridge_rtable_expire }; SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, IFT_BRIDGE, bridge, CTLFLAG_RW, 0, "Bridge"); /* only pass IP[46] packets when pfil is enabled */ static VNET_DEFINE(int, pfil_onlyip) = 1; #define V_pfil_onlyip VNET(pfil_onlyip) SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_onlyip, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_onlyip), 0, "Only pass IP packets when pfil is enabled"); /* run pfil hooks on the bridge interface */ static VNET_DEFINE(int, pfil_bridge) = 1; #define V_pfil_bridge VNET(pfil_bridge) SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_bridge, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_bridge), 0, "Packet filter on the bridge interface"); /* layer2 filter with ipfw */ static VNET_DEFINE(int, pfil_ipfw); #define V_pfil_ipfw VNET(pfil_ipfw) /* layer2 ARP filter with ipfw */ static VNET_DEFINE(int, pfil_ipfw_arp); #define V_pfil_ipfw_arp VNET(pfil_ipfw_arp) SYSCTL_INT(_net_link_bridge, OID_AUTO, ipfw_arp, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_ipfw_arp), 0, "Filter ARP packets through IPFW layer2"); /* run pfil hooks on the member interface */ static VNET_DEFINE(int, pfil_member) = 1; #define V_pfil_member VNET(pfil_member) SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_member, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_member), 0, "Packet filter on the member interface"); /* run pfil hooks on the physical interface for locally destined packets */ static VNET_DEFINE(int, pfil_local_phys); #define V_pfil_local_phys VNET(pfil_local_phys) SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_local_phys, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(pfil_local_phys), 0, "Packet filter on the physical interface for locally destined packets"); /* log STP state changes */ static VNET_DEFINE(int, log_stp); #define V_log_stp VNET(log_stp) SYSCTL_INT(_net_link_bridge, OID_AUTO, log_stp, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(log_stp), 0, "Log STP state changes"); /* share MAC with first bridge member */ static VNET_DEFINE(int, bridge_inherit_mac); #define V_bridge_inherit_mac VNET(bridge_inherit_mac) SYSCTL_INT(_net_link_bridge, OID_AUTO, inherit_mac, CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(bridge_inherit_mac), 0, "Inherit MAC address from the first bridge member"); static VNET_DEFINE(int, allow_llz_overlap) = 0; #define V_allow_llz_overlap VNET(allow_llz_overlap) -SYSCTL_VNET_INT(_net_link_bridge, OID_AUTO, allow_llz_overlap, - CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(allow_llz_overlap), 0, +SYSCTL_INT(_net_link_bridge, OID_AUTO, allow_llz_overlap, + CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(allow_llz_overlap), 0, "Allow overlap of link-local scope " "zones of a bridge interface and the member interfaces"); struct bridge_control { int (*bc_func)(struct bridge_softc *, void *); int bc_argsize; int bc_flags; }; #define BC_F_COPYIN 0x01 /* copy arguments in */ #define BC_F_COPYOUT 0x02 /* copy arguments out */ #define BC_F_SUSER 0x04 /* do super-user check */ const struct bridge_control bridge_control_table[] = { { bridge_ioctl_add, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_del, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_gifflags, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_COPYOUT }, { bridge_ioctl_sifflags, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_scache, sizeof(struct ifbrparam), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_gcache, sizeof(struct ifbrparam), BC_F_COPYOUT }, { bridge_ioctl_gifs, sizeof(struct ifbifconf), BC_F_COPYIN|BC_F_COPYOUT }, { bridge_ioctl_rts, sizeof(struct ifbaconf), BC_F_COPYIN|BC_F_COPYOUT }, { bridge_ioctl_saddr, sizeof(struct ifbareq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_sto, sizeof(struct ifbrparam), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_gto, sizeof(struct ifbrparam), BC_F_COPYOUT }, { bridge_ioctl_daddr, sizeof(struct ifbareq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_flush, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_gpri, sizeof(struct ifbrparam), BC_F_COPYOUT }, { bridge_ioctl_spri, sizeof(struct ifbrparam), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_ght, sizeof(struct ifbrparam), BC_F_COPYOUT }, { bridge_ioctl_sht, sizeof(struct ifbrparam), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_gfd, sizeof(struct ifbrparam), BC_F_COPYOUT }, { bridge_ioctl_sfd, sizeof(struct ifbrparam), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_gma, sizeof(struct ifbrparam), BC_F_COPYOUT }, { bridge_ioctl_sma, sizeof(struct ifbrparam), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_sifprio, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_sifcost, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_addspan, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_delspan, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_gbparam, sizeof(struct ifbropreq), BC_F_COPYOUT }, { bridge_ioctl_grte, sizeof(struct ifbrparam), BC_F_COPYOUT }, { bridge_ioctl_gifsstp, sizeof(struct ifbpstpconf), BC_F_COPYIN|BC_F_COPYOUT }, { bridge_ioctl_sproto, sizeof(struct ifbrparam), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_stxhc, sizeof(struct ifbrparam), BC_F_COPYIN|BC_F_SUSER }, { bridge_ioctl_sifmaxaddr, sizeof(struct ifbreq), BC_F_COPYIN|BC_F_SUSER }, }; const int bridge_control_table_size = nitems(bridge_control_table); static VNET_DEFINE(LIST_HEAD(, bridge_softc), bridge_list); #define V_bridge_list VNET(bridge_list) #define BRIDGE_LIST_LOCK_INIT(x) mtx_init(&V_bridge_list_mtx, \ "if_bridge list", NULL, MTX_DEF) #define BRIDGE_LIST_LOCK_DESTROY(x) mtx_destroy(&V_bridge_list_mtx) #define BRIDGE_LIST_LOCK(x) mtx_lock(&V_bridge_list_mtx) #define BRIDGE_LIST_UNLOCK(x) mtx_unlock(&V_bridge_list_mtx) static VNET_DEFINE(struct if_clone *, bridge_cloner); #define V_bridge_cloner VNET(bridge_cloner) static const char bridge_name[] = "bridge"; static void vnet_bridge_init(const void *unused __unused) { BRIDGE_LIST_LOCK_INIT(); LIST_INIT(&V_bridge_list); V_bridge_cloner = if_clone_simple(bridge_name, bridge_clone_create, bridge_clone_destroy, 0); } VNET_SYSINIT(vnet_bridge_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_bridge_init, NULL); static void vnet_bridge_uninit(const void *unused __unused) { if_clone_detach(V_bridge_cloner); BRIDGE_LIST_LOCK_DESTROY(); } VNET_SYSUNINIT(vnet_bridge_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_bridge_uninit, NULL); static int bridge_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: bridge_rtnode_zone = uma_zcreate("bridge_rtnode", sizeof(struct bridge_rtnode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); bridge_input_p = bridge_input; bridge_output_p = bridge_output; bridge_dn_p = bridge_dummynet; bridge_linkstate_p = bridge_linkstate; bridge_detach_cookie = EVENTHANDLER_REGISTER( ifnet_departure_event, bridge_ifdetach, NULL, EVENTHANDLER_PRI_ANY); break; case MOD_UNLOAD: EVENTHANDLER_DEREGISTER(ifnet_departure_event, bridge_detach_cookie); uma_zdestroy(bridge_rtnode_zone); bridge_input_p = NULL; bridge_output_p = NULL; bridge_dn_p = NULL; bridge_linkstate_p = NULL; break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t bridge_mod = { "if_bridge", bridge_modevent, 0 }; DECLARE_MODULE(if_bridge, bridge_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_DEPEND(if_bridge, bridgestp, 1, 1, 1); /* * handler for net.link.bridge.ipfw */ static int sysctl_pfil_ipfw(SYSCTL_HANDLER_ARGS) { int enable = V_pfil_ipfw; int error; error = sysctl_handle_int(oidp, &enable, 0, req); enable &= 1; if (enable != V_pfil_ipfw) { V_pfil_ipfw = enable; /* * Disable pfil so that ipfw doesnt run twice, if the user * really wants both then they can re-enable pfil_bridge and/or * pfil_member. Also allow non-ip packets as ipfw can filter by * layer2 type. */ if (V_pfil_ipfw) { V_pfil_onlyip = 0; V_pfil_bridge = 0; V_pfil_member = 0; } } return (error); } SYSCTL_PROC(_net_link_bridge, OID_AUTO, ipfw, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(pfil_ipfw), 0, &sysctl_pfil_ipfw, "I", "Layer2 filter with IPFW"); /* * bridge_clone_create: * * Create a new bridge instance. */ static int bridge_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct bridge_softc *sc, *sc2; struct ifnet *bifp, *ifp; int fb, retry; unsigned long hostid; sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO); ifp = sc->sc_ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { free(sc, M_DEVBUF); return (ENOSPC); } BRIDGE_LOCK_INIT(sc); sc->sc_brtmax = BRIDGE_RTABLE_MAX; sc->sc_brttimeout = BRIDGE_RTABLE_TIMEOUT; /* Initialize our routing table. */ bridge_rtable_init(sc); callout_init_mtx(&sc->sc_brcallout, &sc->sc_mtx, 0); LIST_INIT(&sc->sc_iflist); LIST_INIT(&sc->sc_spanlist); ifp->if_softc = sc; if_initname(ifp, bridge_name, unit); ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = bridge_ioctl; ifp->if_transmit = bridge_transmit; ifp->if_qflush = bridge_qflush; ifp->if_init = bridge_init; ifp->if_type = IFT_BRIDGE; /* * Generate an ethernet address with a locally administered address. * * Since we are using random ethernet addresses for the bridge, it is * possible that we might have address collisions, so make sure that * this hardware address isn't already in use on another bridge. * The first try uses the hostid and falls back to arc4rand(). */ fb = 0; getcredhostid(curthread->td_ucred, &hostid); do { if (fb || hostid == 0) { arc4rand(sc->sc_defaddr, ETHER_ADDR_LEN, 1); sc->sc_defaddr[0] &= ~1;/* clear multicast bit */ sc->sc_defaddr[0] |= 2; /* set the LAA bit */ } else { sc->sc_defaddr[0] = 0x2; sc->sc_defaddr[1] = (hostid >> 24) & 0xff; sc->sc_defaddr[2] = (hostid >> 16) & 0xff; sc->sc_defaddr[3] = (hostid >> 8 ) & 0xff; sc->sc_defaddr[4] = hostid & 0xff; sc->sc_defaddr[5] = ifp->if_dunit & 0xff; } fb = 1; retry = 0; BRIDGE_LIST_LOCK(); LIST_FOREACH(sc2, &V_bridge_list, sc_list) { bifp = sc2->sc_ifp; if (memcmp(sc->sc_defaddr, IF_LLADDR(bifp), ETHER_ADDR_LEN) == 0) { retry = 1; break; } } BRIDGE_LIST_UNLOCK(); } while (retry == 1); bstp_attach(&sc->sc_stp, &bridge_ops); ether_ifattach(ifp, sc->sc_defaddr); /* Now undo some of the damage... */ ifp->if_baudrate = 0; ifp->if_type = IFT_BRIDGE; BRIDGE_LIST_LOCK(); LIST_INSERT_HEAD(&V_bridge_list, sc, sc_list); BRIDGE_LIST_UNLOCK(); return (0); } /* * bridge_clone_destroy: * * Destroy a bridge instance. */ static void bridge_clone_destroy(struct ifnet *ifp) { struct bridge_softc *sc = ifp->if_softc; struct bridge_iflist *bif; BRIDGE_LOCK(sc); bridge_stop(ifp, 1); ifp->if_flags &= ~IFF_UP; while ((bif = LIST_FIRST(&sc->sc_iflist)) != NULL) bridge_delete_member(sc, bif, 0); while ((bif = LIST_FIRST(&sc->sc_spanlist)) != NULL) { bridge_delete_span(sc, bif); } BRIDGE_UNLOCK(sc); callout_drain(&sc->sc_brcallout); BRIDGE_LIST_LOCK(); LIST_REMOVE(sc, sc_list); BRIDGE_LIST_UNLOCK(); bstp_detach(&sc->sc_stp); ether_ifdetach(ifp); if_free(ifp); /* Tear down the routing table. */ bridge_rtable_fini(sc); BRIDGE_LOCK_DESTROY(sc); free(sc, M_DEVBUF); } /* * bridge_ioctl: * * Handle a control request from the operator. */ static int bridge_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct bridge_softc *sc = ifp->if_softc; struct ifreq *ifr = (struct ifreq *)data; struct bridge_iflist *bif; struct thread *td = curthread; union { struct ifbreq ifbreq; struct ifbifconf ifbifconf; struct ifbareq ifbareq; struct ifbaconf ifbaconf; struct ifbrparam ifbrparam; struct ifbropreq ifbropreq; } args; struct ifdrv *ifd = (struct ifdrv *) data; const struct bridge_control *bc; int error = 0; switch (cmd) { case SIOCADDMULTI: case SIOCDELMULTI: break; case SIOCGDRVSPEC: case SIOCSDRVSPEC: if (ifd->ifd_cmd >= bridge_control_table_size) { error = EINVAL; break; } bc = &bridge_control_table[ifd->ifd_cmd]; if (cmd == SIOCGDRVSPEC && (bc->bc_flags & BC_F_COPYOUT) == 0) { error = EINVAL; break; } else if (cmd == SIOCSDRVSPEC && (bc->bc_flags & BC_F_COPYOUT) != 0) { error = EINVAL; break; } if (bc->bc_flags & BC_F_SUSER) { error = priv_check(td, PRIV_NET_BRIDGE); if (error) break; } if (ifd->ifd_len != bc->bc_argsize || ifd->ifd_len > sizeof(args)) { error = EINVAL; break; } bzero(&args, sizeof(args)); if (bc->bc_flags & BC_F_COPYIN) { error = copyin(ifd->ifd_data, &args, ifd->ifd_len); if (error) break; } BRIDGE_LOCK(sc); error = (*bc->bc_func)(sc, &args); BRIDGE_UNLOCK(sc); if (error) break; if (bc->bc_flags & BC_F_COPYOUT) error = copyout(&args, ifd->ifd_data, ifd->ifd_len); break; case SIOCSIFFLAGS: if (!(ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING)) { /* * If interface is marked down and it is running, * then stop and disable it. */ BRIDGE_LOCK(sc); bridge_stop(ifp, 1); BRIDGE_UNLOCK(sc); } else if ((ifp->if_flags & IFF_UP) && !(ifp->if_drv_flags & IFF_DRV_RUNNING)) { /* * If interface is marked up and it is stopped, then * start it. */ (*ifp->if_init)(sc); } break; case SIOCSIFMTU: if (ifr->ifr_mtu < 576) { error = EINVAL; break; } if (LIST_EMPTY(&sc->sc_iflist)) { sc->sc_ifp->if_mtu = ifr->ifr_mtu; break; } BRIDGE_LOCK(sc); LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { if (bif->bif_ifp->if_mtu != ifr->ifr_mtu) { log(LOG_NOTICE, "%s: invalid MTU: %u(%s)" " != %d\n", sc->sc_ifp->if_xname, bif->bif_ifp->if_mtu, bif->bif_ifp->if_xname, ifr->ifr_mtu); error = EINVAL; break; } } if (!error) sc->sc_ifp->if_mtu = ifr->ifr_mtu; BRIDGE_UNLOCK(sc); break; default: /* * drop the lock as ether_ioctl() will call bridge_start() and * cause the lock to be recursed. */ error = ether_ioctl(ifp, cmd, data); break; } return (error); } /* * bridge_mutecaps: * * Clear or restore unwanted capabilities on the member interface */ static void bridge_mutecaps(struct bridge_softc *sc) { struct bridge_iflist *bif; int enabled, mask; /* Initial bitmask of capabilities to test */ mask = BRIDGE_IFCAPS_MASK; LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { /* Every member must support it or its disabled */ mask &= bif->bif_savedcaps; } LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { enabled = bif->bif_ifp->if_capenable; enabled &= ~BRIDGE_IFCAPS_STRIP; /* strip off mask bits and enable them again if allowed */ enabled &= ~BRIDGE_IFCAPS_MASK; enabled |= mask; bridge_set_ifcap(sc, bif, enabled); } } static void bridge_set_ifcap(struct bridge_softc *sc, struct bridge_iflist *bif, int set) { struct ifnet *ifp = bif->bif_ifp; struct ifreq ifr; int error; bzero(&ifr, sizeof(ifr)); ifr.ifr_reqcap = set; if (ifp->if_capenable != set) { error = (*ifp->if_ioctl)(ifp, SIOCSIFCAP, (caddr_t)&ifr); if (error) if_printf(sc->sc_ifp, "error setting interface capabilities on %s\n", ifp->if_xname); } } /* * bridge_lookup_member: * * Lookup a bridge member interface. */ static struct bridge_iflist * bridge_lookup_member(struct bridge_softc *sc, const char *name) { struct bridge_iflist *bif; struct ifnet *ifp; BRIDGE_LOCK_ASSERT(sc); LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { ifp = bif->bif_ifp; if (strcmp(ifp->if_xname, name) == 0) return (bif); } return (NULL); } /* * bridge_lookup_member_if: * * Lookup a bridge member interface by ifnet*. */ static struct bridge_iflist * bridge_lookup_member_if(struct bridge_softc *sc, struct ifnet *member_ifp) { struct bridge_iflist *bif; BRIDGE_LOCK_ASSERT(sc); LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { if (bif->bif_ifp == member_ifp) return (bif); } return (NULL); } /* * bridge_delete_member: * * Delete the specified member interface. */ static void bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif, int gone) { struct ifnet *ifs = bif->bif_ifp; struct ifnet *fif = NULL; BRIDGE_LOCK_ASSERT(sc); if (bif->bif_flags & IFBIF_STP) bstp_disable(&bif->bif_stp); ifs->if_bridge = NULL; BRIDGE_XLOCK(sc); LIST_REMOVE(bif, bif_next); BRIDGE_XDROP(sc); /* * If removing the interface that gave the bridge its mac address, set * the mac address of the bridge to the address of the next member, or * to its default address if no members are left. */ if (V_bridge_inherit_mac && sc->sc_ifaddr == ifs) { if (LIST_EMPTY(&sc->sc_iflist)) { bcopy(sc->sc_defaddr, IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN); sc->sc_ifaddr = NULL; } else { fif = LIST_FIRST(&sc->sc_iflist)->bif_ifp; bcopy(IF_LLADDR(fif), IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN); sc->sc_ifaddr = fif; } EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp); } bridge_linkcheck(sc); bridge_mutecaps(sc); /* recalcuate now this interface is removed */ bridge_rtdelete(sc, ifs, IFBF_FLUSHALL); KASSERT(bif->bif_addrcnt == 0, ("%s: %d bridge routes referenced", __func__, bif->bif_addrcnt)); BRIDGE_UNLOCK(sc); if (!gone) { switch (ifs->if_type) { case IFT_ETHER: case IFT_L2VLAN: /* * Take the interface out of promiscuous mode. */ (void) ifpromisc(ifs, 0); break; case IFT_GIF: break; default: #ifdef DIAGNOSTIC panic("bridge_delete_member: impossible"); #endif break; } /* reneable any interface capabilities */ bridge_set_ifcap(sc, bif, bif->bif_savedcaps); } bstp_destroy(&bif->bif_stp); /* prepare to free */ BRIDGE_LOCK(sc); free(bif, M_DEVBUF); } /* * bridge_delete_span: * * Delete the specified span interface. */ static void bridge_delete_span(struct bridge_softc *sc, struct bridge_iflist *bif) { BRIDGE_LOCK_ASSERT(sc); KASSERT(bif->bif_ifp->if_bridge == NULL, ("%s: not a span interface", __func__)); LIST_REMOVE(bif, bif_next); free(bif, M_DEVBUF); } static int bridge_ioctl_add(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif = NULL; struct ifnet *ifs; int error = 0; ifs = ifunit(req->ifbr_ifsname); if (ifs == NULL) return (ENOENT); if (ifs->if_ioctl == NULL) /* must be supported */ return (EINVAL); /* If it's in the span list, it can't be a member. */ LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) if (ifs == bif->bif_ifp) return (EBUSY); if (ifs->if_bridge == sc) return (EEXIST); if (ifs->if_bridge != NULL) return (EBUSY); switch (ifs->if_type) { case IFT_ETHER: case IFT_L2VLAN: case IFT_GIF: /* permitted interface types */ break; default: return (EINVAL); } #ifdef INET6 /* * Two valid inet6 addresses with link-local scope must not be * on the parent interface and the member interfaces at the * same time. This restriction is needed to prevent violation * of link-local scope zone. Attempts to add a member * interface which has inet6 addresses when the parent has * inet6 triggers removal of all inet6 addresses on the member * interface. */ /* Check if the parent interface has a link-local scope addr. */ if (V_allow_llz_overlap == 0 && in6ifa_llaonifp(sc->sc_ifp) != NULL) { /* * If any, remove all inet6 addresses from the member * interfaces. */ BRIDGE_XLOCK(sc); LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { if (in6ifa_llaonifp(bif->bif_ifp)) { BRIDGE_UNLOCK(sc); in6_ifdetach(bif->bif_ifp); BRIDGE_LOCK(sc); if_printf(sc->sc_ifp, "IPv6 addresses on %s have been removed " "before adding it as a member to prevent " "IPv6 address scope violation.\n", bif->bif_ifp->if_xname); } } BRIDGE_XDROP(sc); if (in6ifa_llaonifp(ifs)) { BRIDGE_UNLOCK(sc); in6_ifdetach(ifs); BRIDGE_LOCK(sc); if_printf(sc->sc_ifp, "IPv6 addresses on %s have been removed " "before adding it as a member to prevent " "IPv6 address scope violation.\n", ifs->if_xname); } } #endif /* Allow the first Ethernet member to define the MTU */ if (LIST_EMPTY(&sc->sc_iflist)) sc->sc_ifp->if_mtu = ifs->if_mtu; else if (sc->sc_ifp->if_mtu != ifs->if_mtu) { if_printf(sc->sc_ifp, "invalid MTU: %u(%s) != %u\n", ifs->if_mtu, ifs->if_xname, sc->sc_ifp->if_mtu); return (EINVAL); } bif = malloc(sizeof(*bif), M_DEVBUF, M_NOWAIT|M_ZERO); if (bif == NULL) return (ENOMEM); bif->bif_ifp = ifs; bif->bif_flags = IFBIF_LEARNING | IFBIF_DISCOVER; bif->bif_savedcaps = ifs->if_capenable; /* * Assign the interface's MAC address to the bridge if it's the first * member and the MAC address of the bridge has not been changed from * the default randomly generated one. */ if (V_bridge_inherit_mac && LIST_EMPTY(&sc->sc_iflist) && !memcmp(IF_LLADDR(sc->sc_ifp), sc->sc_defaddr, ETHER_ADDR_LEN)) { bcopy(IF_LLADDR(ifs), IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN); sc->sc_ifaddr = ifs; EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp); } ifs->if_bridge = sc; bstp_create(&sc->sc_stp, &bif->bif_stp, bif->bif_ifp); /* * XXX: XLOCK HERE!?! * * NOTE: insert_***HEAD*** should be safe for the traversals. */ LIST_INSERT_HEAD(&sc->sc_iflist, bif, bif_next); /* Set interface capabilities to the intersection set of all members */ bridge_mutecaps(sc); bridge_linkcheck(sc); /* Place the interface into promiscuous mode */ switch (ifs->if_type) { case IFT_ETHER: case IFT_L2VLAN: BRIDGE_UNLOCK(sc); error = ifpromisc(ifs, 1); BRIDGE_LOCK(sc); break; } if (error) { bridge_delete_member(sc, bif, 0); free(bif, M_DEVBUF); } return (error); } static int bridge_ioctl_del(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif; bif = bridge_lookup_member(sc, req->ifbr_ifsname); if (bif == NULL) return (ENOENT); bridge_delete_member(sc, bif, 0); return (0); } static int bridge_ioctl_gifflags(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif; struct bstp_port *bp; bif = bridge_lookup_member(sc, req->ifbr_ifsname); if (bif == NULL) return (ENOENT); bp = &bif->bif_stp; req->ifbr_ifsflags = bif->bif_flags; req->ifbr_state = bp->bp_state; req->ifbr_priority = bp->bp_priority; req->ifbr_path_cost = bp->bp_path_cost; req->ifbr_portno = bif->bif_ifp->if_index & 0xfff; req->ifbr_proto = bp->bp_protover; req->ifbr_role = bp->bp_role; req->ifbr_stpflags = bp->bp_flags; req->ifbr_addrcnt = bif->bif_addrcnt; req->ifbr_addrmax = bif->bif_addrmax; req->ifbr_addrexceeded = bif->bif_addrexceeded; /* Copy STP state options as flags */ if (bp->bp_operedge) req->ifbr_ifsflags |= IFBIF_BSTP_EDGE; if (bp->bp_flags & BSTP_PORT_AUTOEDGE) req->ifbr_ifsflags |= IFBIF_BSTP_AUTOEDGE; if (bp->bp_ptp_link) req->ifbr_ifsflags |= IFBIF_BSTP_PTP; if (bp->bp_flags & BSTP_PORT_AUTOPTP) req->ifbr_ifsflags |= IFBIF_BSTP_AUTOPTP; if (bp->bp_flags & BSTP_PORT_ADMEDGE) req->ifbr_ifsflags |= IFBIF_BSTP_ADMEDGE; if (bp->bp_flags & BSTP_PORT_ADMCOST) req->ifbr_ifsflags |= IFBIF_BSTP_ADMCOST; return (0); } static int bridge_ioctl_sifflags(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif; struct bstp_port *bp; int error; bif = bridge_lookup_member(sc, req->ifbr_ifsname); if (bif == NULL) return (ENOENT); bp = &bif->bif_stp; if (req->ifbr_ifsflags & IFBIF_SPAN) /* SPAN is readonly */ return (EINVAL); if (req->ifbr_ifsflags & IFBIF_STP) { if ((bif->bif_flags & IFBIF_STP) == 0) { error = bstp_enable(&bif->bif_stp); if (error) return (error); } } else { if ((bif->bif_flags & IFBIF_STP) != 0) bstp_disable(&bif->bif_stp); } /* Pass on STP flags */ bstp_set_edge(bp, req->ifbr_ifsflags & IFBIF_BSTP_EDGE ? 1 : 0); bstp_set_autoedge(bp, req->ifbr_ifsflags & IFBIF_BSTP_AUTOEDGE ? 1 : 0); bstp_set_ptp(bp, req->ifbr_ifsflags & IFBIF_BSTP_PTP ? 1 : 0); bstp_set_autoptp(bp, req->ifbr_ifsflags & IFBIF_BSTP_AUTOPTP ? 1 : 0); /* Save the bits relating to the bridge */ bif->bif_flags = req->ifbr_ifsflags & IFBIFMASK; return (0); } static int bridge_ioctl_scache(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; sc->sc_brtmax = param->ifbrp_csize; bridge_rttrim(sc); return (0); } static int bridge_ioctl_gcache(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; param->ifbrp_csize = sc->sc_brtmax; return (0); } static int bridge_ioctl_gifs(struct bridge_softc *sc, void *arg) { struct ifbifconf *bifc = arg; struct bridge_iflist *bif; struct ifbreq breq; char *buf, *outbuf; int count, buflen, len, error = 0; count = 0; LIST_FOREACH(bif, &sc->sc_iflist, bif_next) count++; LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) count++; buflen = sizeof(breq) * count; if (bifc->ifbic_len == 0) { bifc->ifbic_len = buflen; return (0); } BRIDGE_UNLOCK(sc); outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO); BRIDGE_LOCK(sc); count = 0; buf = outbuf; len = min(bifc->ifbic_len, buflen); bzero(&breq, sizeof(breq)); LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { if (len < sizeof(breq)) break; strlcpy(breq.ifbr_ifsname, bif->bif_ifp->if_xname, sizeof(breq.ifbr_ifsname)); /* Fill in the ifbreq structure */ error = bridge_ioctl_gifflags(sc, &breq); if (error) break; memcpy(buf, &breq, sizeof(breq)); count++; buf += sizeof(breq); len -= sizeof(breq); } LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) { if (len < sizeof(breq)) break; strlcpy(breq.ifbr_ifsname, bif->bif_ifp->if_xname, sizeof(breq.ifbr_ifsname)); breq.ifbr_ifsflags = bif->bif_flags; breq.ifbr_portno = bif->bif_ifp->if_index & 0xfff; memcpy(buf, &breq, sizeof(breq)); count++; buf += sizeof(breq); len -= sizeof(breq); } BRIDGE_UNLOCK(sc); bifc->ifbic_len = sizeof(breq) * count; error = copyout(outbuf, bifc->ifbic_req, bifc->ifbic_len); BRIDGE_LOCK(sc); free(outbuf, M_TEMP); return (error); } static int bridge_ioctl_rts(struct bridge_softc *sc, void *arg) { struct ifbaconf *bac = arg; struct bridge_rtnode *brt; struct ifbareq bareq; char *buf, *outbuf; int count, buflen, len, error = 0; if (bac->ifbac_len == 0) return (0); count = 0; LIST_FOREACH(brt, &sc->sc_rtlist, brt_list) count++; buflen = sizeof(bareq) * count; BRIDGE_UNLOCK(sc); outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO); BRIDGE_LOCK(sc); count = 0; buf = outbuf; len = min(bac->ifbac_len, buflen); bzero(&bareq, sizeof(bareq)); LIST_FOREACH(brt, &sc->sc_rtlist, brt_list) { if (len < sizeof(bareq)) goto out; strlcpy(bareq.ifba_ifsname, brt->brt_ifp->if_xname, sizeof(bareq.ifba_ifsname)); memcpy(bareq.ifba_dst, brt->brt_addr, sizeof(brt->brt_addr)); bareq.ifba_vlan = brt->brt_vlan; if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC && time_uptime < brt->brt_expire) bareq.ifba_expire = brt->brt_expire - time_uptime; else bareq.ifba_expire = 0; bareq.ifba_flags = brt->brt_flags; memcpy(buf, &bareq, sizeof(bareq)); count++; buf += sizeof(bareq); len -= sizeof(bareq); } out: BRIDGE_UNLOCK(sc); bac->ifbac_len = sizeof(bareq) * count; error = copyout(outbuf, bac->ifbac_req, bac->ifbac_len); BRIDGE_LOCK(sc); free(outbuf, M_TEMP); return (error); } static int bridge_ioctl_saddr(struct bridge_softc *sc, void *arg) { struct ifbareq *req = arg; struct bridge_iflist *bif; int error; bif = bridge_lookup_member(sc, req->ifba_ifsname); if (bif == NULL) return (ENOENT); error = bridge_rtupdate(sc, req->ifba_dst, req->ifba_vlan, bif, 1, req->ifba_flags); return (error); } static int bridge_ioctl_sto(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; sc->sc_brttimeout = param->ifbrp_ctime; return (0); } static int bridge_ioctl_gto(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; param->ifbrp_ctime = sc->sc_brttimeout; return (0); } static int bridge_ioctl_daddr(struct bridge_softc *sc, void *arg) { struct ifbareq *req = arg; return (bridge_rtdaddr(sc, req->ifba_dst, req->ifba_vlan)); } static int bridge_ioctl_flush(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; bridge_rtflush(sc, req->ifbr_ifsflags); return (0); } static int bridge_ioctl_gpri(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; struct bstp_state *bs = &sc->sc_stp; param->ifbrp_prio = bs->bs_bridge_priority; return (0); } static int bridge_ioctl_spri(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; return (bstp_set_priority(&sc->sc_stp, param->ifbrp_prio)); } static int bridge_ioctl_ght(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; struct bstp_state *bs = &sc->sc_stp; param->ifbrp_hellotime = bs->bs_bridge_htime >> 8; return (0); } static int bridge_ioctl_sht(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; return (bstp_set_htime(&sc->sc_stp, param->ifbrp_hellotime)); } static int bridge_ioctl_gfd(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; struct bstp_state *bs = &sc->sc_stp; param->ifbrp_fwddelay = bs->bs_bridge_fdelay >> 8; return (0); } static int bridge_ioctl_sfd(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; return (bstp_set_fdelay(&sc->sc_stp, param->ifbrp_fwddelay)); } static int bridge_ioctl_gma(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; struct bstp_state *bs = &sc->sc_stp; param->ifbrp_maxage = bs->bs_bridge_max_age >> 8; return (0); } static int bridge_ioctl_sma(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; return (bstp_set_maxage(&sc->sc_stp, param->ifbrp_maxage)); } static int bridge_ioctl_sifprio(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif; bif = bridge_lookup_member(sc, req->ifbr_ifsname); if (bif == NULL) return (ENOENT); return (bstp_set_port_priority(&bif->bif_stp, req->ifbr_priority)); } static int bridge_ioctl_sifcost(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif; bif = bridge_lookup_member(sc, req->ifbr_ifsname); if (bif == NULL) return (ENOENT); return (bstp_set_path_cost(&bif->bif_stp, req->ifbr_path_cost)); } static int bridge_ioctl_sifmaxaddr(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif; bif = bridge_lookup_member(sc, req->ifbr_ifsname); if (bif == NULL) return (ENOENT); bif->bif_addrmax = req->ifbr_addrmax; return (0); } static int bridge_ioctl_addspan(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif = NULL; struct ifnet *ifs; ifs = ifunit(req->ifbr_ifsname); if (ifs == NULL) return (ENOENT); LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) if (ifs == bif->bif_ifp) return (EBUSY); if (ifs->if_bridge != NULL) return (EBUSY); switch (ifs->if_type) { case IFT_ETHER: case IFT_GIF: case IFT_L2VLAN: break; default: return (EINVAL); } bif = malloc(sizeof(*bif), M_DEVBUF, M_NOWAIT|M_ZERO); if (bif == NULL) return (ENOMEM); bif->bif_ifp = ifs; bif->bif_flags = IFBIF_SPAN; LIST_INSERT_HEAD(&sc->sc_spanlist, bif, bif_next); return (0); } static int bridge_ioctl_delspan(struct bridge_softc *sc, void *arg) { struct ifbreq *req = arg; struct bridge_iflist *bif; struct ifnet *ifs; ifs = ifunit(req->ifbr_ifsname); if (ifs == NULL) return (ENOENT); LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) if (ifs == bif->bif_ifp) break; if (bif == NULL) return (ENOENT); bridge_delete_span(sc, bif); return (0); } static int bridge_ioctl_gbparam(struct bridge_softc *sc, void *arg) { struct ifbropreq *req = arg; struct bstp_state *bs = &sc->sc_stp; struct bstp_port *root_port; req->ifbop_maxage = bs->bs_bridge_max_age >> 8; req->ifbop_hellotime = bs->bs_bridge_htime >> 8; req->ifbop_fwddelay = bs->bs_bridge_fdelay >> 8; root_port = bs->bs_root_port; if (root_port == NULL) req->ifbop_root_port = 0; else req->ifbop_root_port = root_port->bp_ifp->if_index; req->ifbop_holdcount = bs->bs_txholdcount; req->ifbop_priority = bs->bs_bridge_priority; req->ifbop_protocol = bs->bs_protover; req->ifbop_root_path_cost = bs->bs_root_pv.pv_cost; req->ifbop_bridgeid = bs->bs_bridge_pv.pv_dbridge_id; req->ifbop_designated_root = bs->bs_root_pv.pv_root_id; req->ifbop_designated_bridge = bs->bs_root_pv.pv_dbridge_id; req->ifbop_last_tc_time.tv_sec = bs->bs_last_tc_time.tv_sec; req->ifbop_last_tc_time.tv_usec = bs->bs_last_tc_time.tv_usec; return (0); } static int bridge_ioctl_grte(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; param->ifbrp_cexceeded = sc->sc_brtexceeded; return (0); } static int bridge_ioctl_gifsstp(struct bridge_softc *sc, void *arg) { struct ifbpstpconf *bifstp = arg; struct bridge_iflist *bif; struct bstp_port *bp; struct ifbpstpreq bpreq; char *buf, *outbuf; int count, buflen, len, error = 0; count = 0; LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { if ((bif->bif_flags & IFBIF_STP) != 0) count++; } buflen = sizeof(bpreq) * count; if (bifstp->ifbpstp_len == 0) { bifstp->ifbpstp_len = buflen; return (0); } BRIDGE_UNLOCK(sc); outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO); BRIDGE_LOCK(sc); count = 0; buf = outbuf; len = min(bifstp->ifbpstp_len, buflen); bzero(&bpreq, sizeof(bpreq)); LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { if (len < sizeof(bpreq)) break; if ((bif->bif_flags & IFBIF_STP) == 0) continue; bp = &bif->bif_stp; bpreq.ifbp_portno = bif->bif_ifp->if_index & 0xfff; bpreq.ifbp_fwd_trans = bp->bp_forward_transitions; bpreq.ifbp_design_cost = bp->bp_desg_pv.pv_cost; bpreq.ifbp_design_port = bp->bp_desg_pv.pv_port_id; bpreq.ifbp_design_bridge = bp->bp_desg_pv.pv_dbridge_id; bpreq.ifbp_design_root = bp->bp_desg_pv.pv_root_id; memcpy(buf, &bpreq, sizeof(bpreq)); count++; buf += sizeof(bpreq); len -= sizeof(bpreq); } BRIDGE_UNLOCK(sc); bifstp->ifbpstp_len = sizeof(bpreq) * count; error = copyout(outbuf, bifstp->ifbpstp_req, bifstp->ifbpstp_len); BRIDGE_LOCK(sc); free(outbuf, M_TEMP); return (error); } static int bridge_ioctl_sproto(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; return (bstp_set_protocol(&sc->sc_stp, param->ifbrp_proto)); } static int bridge_ioctl_stxhc(struct bridge_softc *sc, void *arg) { struct ifbrparam *param = arg; return (bstp_set_holdcount(&sc->sc_stp, param->ifbrp_txhc)); } /* * bridge_ifdetach: * * Detach an interface from a bridge. Called when a member * interface is detaching. */ static void bridge_ifdetach(void *arg __unused, struct ifnet *ifp) { struct bridge_softc *sc = ifp->if_bridge; struct bridge_iflist *bif; if (ifp->if_flags & IFF_RENAMING) return; /* Check if the interface is a bridge member */ if (sc != NULL) { BRIDGE_LOCK(sc); bif = bridge_lookup_member_if(sc, ifp); if (bif != NULL) bridge_delete_member(sc, bif, 1); BRIDGE_UNLOCK(sc); return; } /* Check if the interface is a span port */ BRIDGE_LIST_LOCK(); LIST_FOREACH(sc, &V_bridge_list, sc_list) { BRIDGE_LOCK(sc); LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) if (ifp == bif->bif_ifp) { bridge_delete_span(sc, bif); break; } BRIDGE_UNLOCK(sc); } BRIDGE_LIST_UNLOCK(); } /* * bridge_init: * * Initialize a bridge interface. */ static void bridge_init(void *xsc) { struct bridge_softc *sc = (struct bridge_softc *)xsc; struct ifnet *ifp = sc->sc_ifp; if (ifp->if_drv_flags & IFF_DRV_RUNNING) return; BRIDGE_LOCK(sc); callout_reset(&sc->sc_brcallout, bridge_rtable_prune_period * hz, bridge_timer, sc); ifp->if_drv_flags |= IFF_DRV_RUNNING; bstp_init(&sc->sc_stp); /* Initialize Spanning Tree */ BRIDGE_UNLOCK(sc); } /* * bridge_stop: * * Stop the bridge interface. */ static void bridge_stop(struct ifnet *ifp, int disable) { struct bridge_softc *sc = ifp->if_softc; BRIDGE_LOCK_ASSERT(sc); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; callout_stop(&sc->sc_brcallout); bstp_stop(&sc->sc_stp); bridge_rtflush(sc, IFBF_FLUSHDYN); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; } /* * bridge_enqueue: * * Enqueue a packet on a bridge member interface. * */ static int bridge_enqueue(struct bridge_softc *sc, struct ifnet *dst_ifp, struct mbuf *m) { int len, err = 0; short mflags; struct mbuf *m0; /* We may be sending a fragment so traverse the mbuf */ for (; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = NULL; len = m->m_pkthdr.len; mflags = m->m_flags; /* * If underlying interface can not do VLAN tag insertion itself * then attach a packet tag that holds it. */ if ((m->m_flags & M_VLANTAG) && (dst_ifp->if_capenable & IFCAP_VLAN_HWTAGGING) == 0) { m = ether_vlanencap(m, m->m_pkthdr.ether_vtag); if (m == NULL) { if_printf(dst_ifp, "unable to prepend VLAN header\n"); if_inc_counter(dst_ifp, IFCOUNTER_OERRORS, 1); continue; } m->m_flags &= ~M_VLANTAG; } if ((err = dst_ifp->if_transmit(dst_ifp, m))) { m_freem(m0); if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); break; } if_inc_counter(sc->sc_ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(sc->sc_ifp, IFCOUNTER_OBYTES, len); if (mflags & M_MCAST) if_inc_counter(sc->sc_ifp, IFCOUNTER_OMCASTS, 1); } return (err); } /* * bridge_dummynet: * * Receive a queued packet from dummynet and pass it on to the output * interface. * * The mbuf has the Ethernet header already attached. */ static void bridge_dummynet(struct mbuf *m, struct ifnet *ifp) { struct bridge_softc *sc; sc = ifp->if_bridge; /* * The packet didnt originate from a member interface. This should only * ever happen if a member interface is removed while packets are * queued for it. */ if (sc == NULL) { m_freem(m); return; } if (PFIL_HOOKED(&V_inet_pfil_hook) #ifdef INET6 || PFIL_HOOKED(&V_inet6_pfil_hook) #endif ) { if (bridge_pfil(&m, sc->sc_ifp, ifp, PFIL_OUT) != 0) return; if (m == NULL) return; } bridge_enqueue(sc, ifp, m); } /* * bridge_output: * * Send output from a bridge member interface. This * performs the bridging function for locally originated * packets. * * The mbuf has the Ethernet header already attached. We must * enqueue or free the mbuf before returning. */ static int bridge_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa, struct rtentry *rt) { struct ether_header *eh; struct ifnet *dst_if; struct bridge_softc *sc; uint16_t vlan; if (m->m_len < ETHER_HDR_LEN) { m = m_pullup(m, ETHER_HDR_LEN); if (m == NULL) return (0); } eh = mtod(m, struct ether_header *); sc = ifp->if_bridge; vlan = VLANTAGOF(m); BRIDGE_LOCK(sc); /* * If bridge is down, but the original output interface is up, * go ahead and send out that interface. Otherwise, the packet * is dropped below. */ if ((sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { dst_if = ifp; goto sendunicast; } /* * If the packet is a multicast, or we don't know a better way to * get there, send to all interfaces. */ if (ETHER_IS_MULTICAST(eh->ether_dhost)) dst_if = NULL; else dst_if = bridge_rtlookup(sc, eh->ether_dhost, vlan); if (dst_if == NULL) { struct bridge_iflist *bif; struct mbuf *mc; int error = 0, used = 0; bridge_span(sc, m); BRIDGE_LOCK2REF(sc, error); if (error) { m_freem(m); return (0); } LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { dst_if = bif->bif_ifp; if (dst_if->if_type == IFT_GIF) continue; if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0) continue; /* * If this is not the original output interface, * and the interface is participating in spanning * tree, make sure the port is in a state that * allows forwarding. */ if (dst_if != ifp && (bif->bif_flags & IFBIF_STP) && bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) continue; if (LIST_NEXT(bif, bif_next) == NULL) { used = 1; mc = m; } else { mc = m_copypacket(m, M_NOWAIT); if (mc == NULL) { if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); continue; } } bridge_enqueue(sc, dst_if, mc); } if (used == 0) m_freem(m); BRIDGE_UNREF(sc); return (0); } sendunicast: /* * XXX Spanning tree consideration here? */ bridge_span(sc, m); if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0) { m_freem(m); BRIDGE_UNLOCK(sc); return (0); } BRIDGE_UNLOCK(sc); bridge_enqueue(sc, dst_if, m); return (0); } /* * bridge_transmit: * * Do output on a bridge. * */ static int bridge_transmit(struct ifnet *ifp, struct mbuf *m) { struct bridge_softc *sc; struct ether_header *eh; struct ifnet *dst_if; int error = 0; sc = ifp->if_softc; ETHER_BPF_MTAP(ifp, m); eh = mtod(m, struct ether_header *); BRIDGE_LOCK(sc); if (((m->m_flags & (M_BCAST|M_MCAST)) == 0) && (dst_if = bridge_rtlookup(sc, eh->ether_dhost, 1)) != NULL) { BRIDGE_UNLOCK(sc); error = bridge_enqueue(sc, dst_if, m); } else bridge_broadcast(sc, ifp, m, 0); return (error); } /* * The ifp->if_qflush entry point for if_bridge(4) is no-op. */ static void bridge_qflush(struct ifnet *ifp __unused) { } /* * bridge_forward: * * The forwarding function of the bridge. * * NOTE: Releases the lock on return. */ static void bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif, struct mbuf *m) { struct bridge_iflist *dbif; struct ifnet *src_if, *dst_if, *ifp; struct ether_header *eh; uint16_t vlan; uint8_t *dst; int error; src_if = m->m_pkthdr.rcvif; ifp = sc->sc_ifp; if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); vlan = VLANTAGOF(m); if ((sbif->bif_flags & IFBIF_STP) && sbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) goto drop; eh = mtod(m, struct ether_header *); dst = eh->ether_dhost; /* If the interface is learning, record the address. */ if (sbif->bif_flags & IFBIF_LEARNING) { error = bridge_rtupdate(sc, eh->ether_shost, vlan, sbif, 0, IFBAF_DYNAMIC); /* * If the interface has addresses limits then deny any source * that is not in the cache. */ if (error && sbif->bif_addrmax) goto drop; } if ((sbif->bif_flags & IFBIF_STP) != 0 && sbif->bif_stp.bp_state == BSTP_IFSTATE_LEARNING) goto drop; /* * At this point, the port either doesn't participate * in spanning tree or it is in the forwarding state. */ /* * If the packet is unicast, destined for someone on * "this" side of the bridge, drop it. */ if ((m->m_flags & (M_BCAST|M_MCAST)) == 0) { dst_if = bridge_rtlookup(sc, dst, vlan); if (src_if == dst_if) goto drop; } else { /* * Check if its a reserved multicast address, any address * listed in 802.1D section 7.12.6 may not be forwarded by the * bridge. * This is currently 01-80-C2-00-00-00 to 01-80-C2-00-00-0F */ if (dst[0] == 0x01 && dst[1] == 0x80 && dst[2] == 0xc2 && dst[3] == 0x00 && dst[4] == 0x00 && dst[5] <= 0x0f) goto drop; /* ...forward it to all interfaces. */ if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); dst_if = NULL; } /* * If we have a destination interface which is a member of our bridge, * OR this is a unicast packet, push it through the bpf(4) machinery. * For broadcast or multicast packets, don't bother because it will * be reinjected into ether_input. We do this before we pass the packets * through the pfil(9) framework, as it is possible that pfil(9) will * drop the packet, or possibly modify it, making it difficult to debug * firewall issues on the bridge. */ if (dst_if != NULL || (m->m_flags & (M_BCAST | M_MCAST)) == 0) ETHER_BPF_MTAP(ifp, m); /* run the packet filter */ if (PFIL_HOOKED(&V_inet_pfil_hook) #ifdef INET6 || PFIL_HOOKED(&V_inet6_pfil_hook) #endif ) { BRIDGE_UNLOCK(sc); if (bridge_pfil(&m, ifp, src_if, PFIL_IN) != 0) return; if (m == NULL) return; BRIDGE_LOCK(sc); } if (dst_if == NULL) { bridge_broadcast(sc, src_if, m, 1); return; } /* * At this point, we're dealing with a unicast frame * going to a different interface. */ if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0) goto drop; dbif = bridge_lookup_member_if(sc, dst_if); if (dbif == NULL) /* Not a member of the bridge (anymore?) */ goto drop; /* Private segments can not talk to each other */ if (sbif->bif_flags & dbif->bif_flags & IFBIF_PRIVATE) goto drop; if ((dbif->bif_flags & IFBIF_STP) && dbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) goto drop; BRIDGE_UNLOCK(sc); if (PFIL_HOOKED(&V_inet_pfil_hook) #ifdef INET6 || PFIL_HOOKED(&V_inet6_pfil_hook) #endif ) { if (bridge_pfil(&m, ifp, dst_if, PFIL_OUT) != 0) return; if (m == NULL) return; } bridge_enqueue(sc, dst_if, m); return; drop: BRIDGE_UNLOCK(sc); m_freem(m); } /* * bridge_input: * * Receive input from a member interface. Queue the packet for * bridging if it is not for us. */ static struct mbuf * bridge_input(struct ifnet *ifp, struct mbuf *m) { struct bridge_softc *sc = ifp->if_bridge; struct bridge_iflist *bif, *bif2; struct ifnet *bifp; struct ether_header *eh; struct mbuf *mc, *mc2; uint16_t vlan; int error; if ((sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return (m); bifp = sc->sc_ifp; vlan = VLANTAGOF(m); /* * Implement support for bridge monitoring. If this flag has been * set on this interface, discard the packet once we push it through * the bpf(4) machinery, but before we do, increment the byte and * packet counters associated with this interface. */ if ((bifp->if_flags & IFF_MONITOR) != 0) { m->m_pkthdr.rcvif = bifp; ETHER_BPF_MTAP(bifp, m); if_inc_counter(bifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(bifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); m_freem(m); return (NULL); } BRIDGE_LOCK(sc); bif = bridge_lookup_member_if(sc, ifp); if (bif == NULL) { BRIDGE_UNLOCK(sc); return (m); } eh = mtod(m, struct ether_header *); bridge_span(sc, m); if (m->m_flags & (M_BCAST|M_MCAST)) { /* Tap off 802.1D packets; they do not get forwarded. */ if (memcmp(eh->ether_dhost, bstp_etheraddr, ETHER_ADDR_LEN) == 0) { bstp_input(&bif->bif_stp, ifp, m); /* consumes mbuf */ BRIDGE_UNLOCK(sc); return (NULL); } if ((bif->bif_flags & IFBIF_STP) && bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) { BRIDGE_UNLOCK(sc); return (m); } /* * Make a deep copy of the packet and enqueue the copy * for bridge processing; return the original packet for * local processing. */ mc = m_dup(m, M_NOWAIT); if (mc == NULL) { BRIDGE_UNLOCK(sc); return (m); } /* Perform the bridge forwarding function with the copy. */ bridge_forward(sc, bif, mc); /* * Reinject the mbuf as arriving on the bridge so we have a * chance at claiming multicast packets. We can not loop back * here from ether_input as a bridge is never a member of a * bridge. */ KASSERT(bifp->if_bridge == NULL, ("loop created in bridge_input")); mc2 = m_dup(m, M_NOWAIT); if (mc2 != NULL) { /* Keep the layer3 header aligned */ int i = min(mc2->m_pkthdr.len, max_protohdr); mc2 = m_copyup(mc2, i, ETHER_ALIGN); } if (mc2 != NULL) { mc2->m_pkthdr.rcvif = bifp; (*bifp->if_input)(bifp, mc2); } /* Return the original packet for local processing. */ return (m); } if ((bif->bif_flags & IFBIF_STP) && bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) { BRIDGE_UNLOCK(sc); return (m); } #if (defined(INET) || defined(INET6)) # define OR_CARP_CHECK_WE_ARE_DST(iface) \ || ((iface)->if_carp \ && (*carp_forus_p)((iface), eh->ether_dhost)) # define OR_CARP_CHECK_WE_ARE_SRC(iface) \ || ((iface)->if_carp \ && (*carp_forus_p)((iface), eh->ether_shost)) #else # define OR_CARP_CHECK_WE_ARE_DST(iface) # define OR_CARP_CHECK_WE_ARE_SRC(iface) #endif #ifdef INET6 # define OR_PFIL_HOOKED_INET6 \ || PFIL_HOOKED(&V_inet6_pfil_hook) #else # define OR_PFIL_HOOKED_INET6 #endif #define GRAB_OUR_PACKETS(iface) \ if ((iface)->if_type == IFT_GIF) \ continue; \ /* It is destined for us. */ \ if (memcmp(IF_LLADDR((iface)), eh->ether_dhost, ETHER_ADDR_LEN) == 0 \ OR_CARP_CHECK_WE_ARE_DST((iface)) \ ) { \ if ((iface)->if_type == IFT_BRIDGE) { \ ETHER_BPF_MTAP(iface, m); \ if_inc_counter(iface, IFCOUNTER_IPACKETS, 1); \ if_inc_counter(iface, IFCOUNTER_IBYTES, m->m_pkthdr.len); \ /* Filter on the physical interface. */ \ if (V_pfil_local_phys && \ (PFIL_HOOKED(&V_inet_pfil_hook) \ OR_PFIL_HOOKED_INET6)) { \ if (bridge_pfil(&m, NULL, ifp, \ PFIL_IN) != 0 || m == NULL) { \ BRIDGE_UNLOCK(sc); \ return (NULL); \ } \ eh = mtod(m, struct ether_header *); \ } \ } \ if (bif->bif_flags & IFBIF_LEARNING) { \ error = bridge_rtupdate(sc, eh->ether_shost, \ vlan, bif, 0, IFBAF_DYNAMIC); \ if (error && bif->bif_addrmax) { \ BRIDGE_UNLOCK(sc); \ m_freem(m); \ return (NULL); \ } \ } \ m->m_pkthdr.rcvif = iface; \ BRIDGE_UNLOCK(sc); \ return (m); \ } \ \ /* We just received a packet that we sent out. */ \ if (memcmp(IF_LLADDR((iface)), eh->ether_shost, ETHER_ADDR_LEN) == 0 \ OR_CARP_CHECK_WE_ARE_SRC((iface)) \ ) { \ BRIDGE_UNLOCK(sc); \ m_freem(m); \ return (NULL); \ } /* * Unicast. Make sure it's not for the bridge. */ do { GRAB_OUR_PACKETS(bifp) } while (0); /* * Give a chance for ifp at first priority. This will help when the * packet comes through the interface like VLAN's with the same MACs * on several interfaces from the same bridge. This also will save * some CPU cycles in case the destination interface and the input * interface (eq ifp) are the same. */ do { GRAB_OUR_PACKETS(ifp) } while (0); /* Now check the all bridge members. */ LIST_FOREACH(bif2, &sc->sc_iflist, bif_next) { GRAB_OUR_PACKETS(bif2->bif_ifp) } #undef OR_CARP_CHECK_WE_ARE_DST #undef OR_CARP_CHECK_WE_ARE_SRC #undef OR_PFIL_HOOKED_INET6 #undef GRAB_OUR_PACKETS /* Perform the bridge forwarding function. */ bridge_forward(sc, bif, m); return (NULL); } /* * bridge_broadcast: * * Send a frame to all interfaces that are members of * the bridge, except for the one on which the packet * arrived. * * NOTE: Releases the lock on return. */ static void bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if, struct mbuf *m, int runfilt) { struct bridge_iflist *dbif, *sbif; struct mbuf *mc; struct ifnet *dst_if; int error = 0, used = 0, i; sbif = bridge_lookup_member_if(sc, src_if); BRIDGE_LOCK2REF(sc, error); if (error) { m_freem(m); return; } /* Filter on the bridge interface before broadcasting */ if (runfilt && (PFIL_HOOKED(&V_inet_pfil_hook) #ifdef INET6 || PFIL_HOOKED(&V_inet6_pfil_hook) #endif )) { if (bridge_pfil(&m, sc->sc_ifp, NULL, PFIL_OUT) != 0) goto out; if (m == NULL) goto out; } LIST_FOREACH(dbif, &sc->sc_iflist, bif_next) { dst_if = dbif->bif_ifp; if (dst_if == src_if) continue; /* Private segments can not talk to each other */ if (sbif && (sbif->bif_flags & dbif->bif_flags & IFBIF_PRIVATE)) continue; if ((dbif->bif_flags & IFBIF_STP) && dbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) continue; if ((dbif->bif_flags & IFBIF_DISCOVER) == 0 && (m->m_flags & (M_BCAST|M_MCAST)) == 0) continue; if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0) continue; if (LIST_NEXT(dbif, bif_next) == NULL) { mc = m; used = 1; } else { mc = m_dup(m, M_NOWAIT); if (mc == NULL) { if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); continue; } } /* * Filter on the output interface. Pass a NULL bridge interface * pointer so we do not redundantly filter on the bridge for * each interface we broadcast on. */ if (runfilt && (PFIL_HOOKED(&V_inet_pfil_hook) #ifdef INET6 || PFIL_HOOKED(&V_inet6_pfil_hook) #endif )) { if (used == 0) { /* Keep the layer3 header aligned */ i = min(mc->m_pkthdr.len, max_protohdr); mc = m_copyup(mc, i, ETHER_ALIGN); if (mc == NULL) { if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); continue; } } if (bridge_pfil(&mc, NULL, dst_if, PFIL_OUT) != 0) continue; if (mc == NULL) continue; } bridge_enqueue(sc, dst_if, mc); } if (used == 0) m_freem(m); out: BRIDGE_UNREF(sc); } /* * bridge_span: * * Duplicate a packet out one or more interfaces that are in span mode, * the original mbuf is unmodified. */ static void bridge_span(struct bridge_softc *sc, struct mbuf *m) { struct bridge_iflist *bif; struct ifnet *dst_if; struct mbuf *mc; if (LIST_EMPTY(&sc->sc_spanlist)) return; LIST_FOREACH(bif, &sc->sc_spanlist, bif_next) { dst_if = bif->bif_ifp; if ((dst_if->if_drv_flags & IFF_DRV_RUNNING) == 0) continue; mc = m_copypacket(m, M_NOWAIT); if (mc == NULL) { if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); continue; } bridge_enqueue(sc, dst_if, mc); } } /* * bridge_rtupdate: * * Add a bridge routing entry. */ static int bridge_rtupdate(struct bridge_softc *sc, const uint8_t *dst, uint16_t vlan, struct bridge_iflist *bif, int setflags, uint8_t flags) { struct bridge_rtnode *brt; int error; BRIDGE_LOCK_ASSERT(sc); /* Check the source address is valid and not multicast. */ if (ETHER_IS_MULTICAST(dst) || (dst[0] == 0 && dst[1] == 0 && dst[2] == 0 && dst[3] == 0 && dst[4] == 0 && dst[5] == 0) != 0) return (EINVAL); /* 802.1p frames map to vlan 1 */ if (vlan == 0) vlan = 1; /* * A route for this destination might already exist. If so, * update it, otherwise create a new one. */ if ((brt = bridge_rtnode_lookup(sc, dst, vlan)) == NULL) { if (sc->sc_brtcnt >= sc->sc_brtmax) { sc->sc_brtexceeded++; return (ENOSPC); } /* Check per interface address limits (if enabled) */ if (bif->bif_addrmax && bif->bif_addrcnt >= bif->bif_addrmax) { bif->bif_addrexceeded++; return (ENOSPC); } /* * Allocate a new bridge forwarding node, and * initialize the expiration time and Ethernet * address. */ brt = uma_zalloc(bridge_rtnode_zone, M_NOWAIT | M_ZERO); if (brt == NULL) return (ENOMEM); if (bif->bif_flags & IFBIF_STICKY) brt->brt_flags = IFBAF_STICKY; else brt->brt_flags = IFBAF_DYNAMIC; memcpy(brt->brt_addr, dst, ETHER_ADDR_LEN); brt->brt_vlan = vlan; if ((error = bridge_rtnode_insert(sc, brt)) != 0) { uma_zfree(bridge_rtnode_zone, brt); return (error); } brt->brt_dst = bif; bif->bif_addrcnt++; } if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC && brt->brt_dst != bif) { brt->brt_dst->bif_addrcnt--; brt->brt_dst = bif; brt->brt_dst->bif_addrcnt++; } if ((flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) brt->brt_expire = time_uptime + sc->sc_brttimeout; if (setflags) brt->brt_flags = flags; return (0); } /* * bridge_rtlookup: * * Lookup the destination interface for an address. */ static struct ifnet * bridge_rtlookup(struct bridge_softc *sc, const uint8_t *addr, uint16_t vlan) { struct bridge_rtnode *brt; BRIDGE_LOCK_ASSERT(sc); if ((brt = bridge_rtnode_lookup(sc, addr, vlan)) == NULL) return (NULL); return (brt->brt_ifp); } /* * bridge_rttrim: * * Trim the routine table so that we have a number * of routing entries less than or equal to the * maximum number. */ static void bridge_rttrim(struct bridge_softc *sc) { struct bridge_rtnode *brt, *nbrt; BRIDGE_LOCK_ASSERT(sc); /* Make sure we actually need to do this. */ if (sc->sc_brtcnt <= sc->sc_brtmax) return; /* Force an aging cycle; this might trim enough addresses. */ bridge_rtage(sc); if (sc->sc_brtcnt <= sc->sc_brtmax) return; LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) { if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) { bridge_rtnode_destroy(sc, brt); if (sc->sc_brtcnt <= sc->sc_brtmax) return; } } } /* * bridge_timer: * * Aging timer for the bridge. */ static void bridge_timer(void *arg) { struct bridge_softc *sc = arg; BRIDGE_LOCK_ASSERT(sc); bridge_rtage(sc); if (sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) callout_reset(&sc->sc_brcallout, bridge_rtable_prune_period * hz, bridge_timer, sc); } /* * bridge_rtage: * * Perform an aging cycle. */ static void bridge_rtage(struct bridge_softc *sc) { struct bridge_rtnode *brt, *nbrt; BRIDGE_LOCK_ASSERT(sc); LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) { if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) { if (time_uptime >= brt->brt_expire) bridge_rtnode_destroy(sc, brt); } } } /* * bridge_rtflush: * * Remove all dynamic addresses from the bridge. */ static void bridge_rtflush(struct bridge_softc *sc, int full) { struct bridge_rtnode *brt, *nbrt; BRIDGE_LOCK_ASSERT(sc); LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) { if (full || (brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) bridge_rtnode_destroy(sc, brt); } } /* * bridge_rtdaddr: * * Remove an address from the table. */ static int bridge_rtdaddr(struct bridge_softc *sc, const uint8_t *addr, uint16_t vlan) { struct bridge_rtnode *brt; int found = 0; BRIDGE_LOCK_ASSERT(sc); /* * If vlan is zero then we want to delete for all vlans so the lookup * may return more than one. */ while ((brt = bridge_rtnode_lookup(sc, addr, vlan)) != NULL) { bridge_rtnode_destroy(sc, brt); found = 1; } return (found ? 0 : ENOENT); } /* * bridge_rtdelete: * * Delete routes to a speicifc member interface. */ static void bridge_rtdelete(struct bridge_softc *sc, struct ifnet *ifp, int full) { struct bridge_rtnode *brt, *nbrt; BRIDGE_LOCK_ASSERT(sc); LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) { if (brt->brt_ifp == ifp && (full || (brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC)) bridge_rtnode_destroy(sc, brt); } } /* * bridge_rtable_init: * * Initialize the route table for this bridge. */ static void bridge_rtable_init(struct bridge_softc *sc) { int i; sc->sc_rthash = malloc(sizeof(*sc->sc_rthash) * BRIDGE_RTHASH_SIZE, M_DEVBUF, M_WAITOK); for (i = 0; i < BRIDGE_RTHASH_SIZE; i++) LIST_INIT(&sc->sc_rthash[i]); sc->sc_rthash_key = arc4random(); LIST_INIT(&sc->sc_rtlist); } /* * bridge_rtable_fini: * * Deconstruct the route table for this bridge. */ static void bridge_rtable_fini(struct bridge_softc *sc) { KASSERT(sc->sc_brtcnt == 0, ("%s: %d bridge routes referenced", __func__, sc->sc_brtcnt)); free(sc->sc_rthash, M_DEVBUF); } /* * The following hash function is adapted from "Hash Functions" by Bob Jenkins * ("Algorithm Alley", Dr. Dobbs Journal, September 1997). */ #define mix(a, b, c) \ do { \ a -= b; a -= c; a ^= (c >> 13); \ b -= c; b -= a; b ^= (a << 8); \ c -= a; c -= b; c ^= (b >> 13); \ a -= b; a -= c; a ^= (c >> 12); \ b -= c; b -= a; b ^= (a << 16); \ c -= a; c -= b; c ^= (b >> 5); \ a -= b; a -= c; a ^= (c >> 3); \ b -= c; b -= a; b ^= (a << 10); \ c -= a; c -= b; c ^= (b >> 15); \ } while (/*CONSTCOND*/0) static __inline uint32_t bridge_rthash(struct bridge_softc *sc, const uint8_t *addr) { uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = sc->sc_rthash_key; b += addr[5] << 8; b += addr[4]; a += addr[3] << 24; a += addr[2] << 16; a += addr[1] << 8; a += addr[0]; mix(a, b, c); return (c & BRIDGE_RTHASH_MASK); } #undef mix static int bridge_rtnode_addr_cmp(const uint8_t *a, const uint8_t *b) { int i, d; for (i = 0, d = 0; i < ETHER_ADDR_LEN && d == 0; i++) { d = ((int)a[i]) - ((int)b[i]); } return (d); } /* * bridge_rtnode_lookup: * * Look up a bridge route node for the specified destination. Compare the * vlan id or if zero then just return the first match. */ static struct bridge_rtnode * bridge_rtnode_lookup(struct bridge_softc *sc, const uint8_t *addr, uint16_t vlan) { struct bridge_rtnode *brt; uint32_t hash; int dir; BRIDGE_LOCK_ASSERT(sc); hash = bridge_rthash(sc, addr); LIST_FOREACH(brt, &sc->sc_rthash[hash], brt_hash) { dir = bridge_rtnode_addr_cmp(addr, brt->brt_addr); if (dir == 0 && (brt->brt_vlan == vlan || vlan == 0)) return (brt); if (dir > 0) return (NULL); } return (NULL); } /* * bridge_rtnode_insert: * * Insert the specified bridge node into the route table. We * assume the entry is not already in the table. */ static int bridge_rtnode_insert(struct bridge_softc *sc, struct bridge_rtnode *brt) { struct bridge_rtnode *lbrt; uint32_t hash; int dir; BRIDGE_LOCK_ASSERT(sc); hash = bridge_rthash(sc, brt->brt_addr); lbrt = LIST_FIRST(&sc->sc_rthash[hash]); if (lbrt == NULL) { LIST_INSERT_HEAD(&sc->sc_rthash[hash], brt, brt_hash); goto out; } do { dir = bridge_rtnode_addr_cmp(brt->brt_addr, lbrt->brt_addr); if (dir == 0 && brt->brt_vlan == lbrt->brt_vlan) return (EEXIST); if (dir > 0) { LIST_INSERT_BEFORE(lbrt, brt, brt_hash); goto out; } if (LIST_NEXT(lbrt, brt_hash) == NULL) { LIST_INSERT_AFTER(lbrt, brt, brt_hash); goto out; } lbrt = LIST_NEXT(lbrt, brt_hash); } while (lbrt != NULL); #ifdef DIAGNOSTIC panic("bridge_rtnode_insert: impossible"); #endif out: LIST_INSERT_HEAD(&sc->sc_rtlist, brt, brt_list); sc->sc_brtcnt++; return (0); } /* * bridge_rtnode_destroy: * * Destroy a bridge rtnode. */ static void bridge_rtnode_destroy(struct bridge_softc *sc, struct bridge_rtnode *brt) { BRIDGE_LOCK_ASSERT(sc); LIST_REMOVE(brt, brt_hash); LIST_REMOVE(brt, brt_list); sc->sc_brtcnt--; brt->brt_dst->bif_addrcnt--; uma_zfree(bridge_rtnode_zone, brt); } /* * bridge_rtable_expire: * * Set the expiry time for all routes on an interface. */ static void bridge_rtable_expire(struct ifnet *ifp, int age) { struct bridge_softc *sc = ifp->if_bridge; struct bridge_rtnode *brt; BRIDGE_LOCK(sc); /* * If the age is zero then flush, otherwise set all the expiry times to * age for the interface */ if (age == 0) bridge_rtdelete(sc, ifp, IFBF_FLUSHDYN); else { LIST_FOREACH(brt, &sc->sc_rtlist, brt_list) { /* Cap the expiry time to 'age' */ if (brt->brt_ifp == ifp && brt->brt_expire > time_uptime + age && (brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) brt->brt_expire = time_uptime + age; } } BRIDGE_UNLOCK(sc); } /* * bridge_state_change: * * Callback from the bridgestp code when a port changes states. */ static void bridge_state_change(struct ifnet *ifp, int state) { struct bridge_softc *sc = ifp->if_bridge; static const char *stpstates[] = { "disabled", "listening", "learning", "forwarding", "blocking", "discarding" }; if (V_log_stp) log(LOG_NOTICE, "%s: state changed to %s on %s\n", sc->sc_ifp->if_xname, stpstates[state], ifp->if_xname); } /* * Send bridge packets through pfil if they are one of the types pfil can deal * with, or if they are ARP or REVARP. (pfil will pass ARP and REVARP without * question.) If *bifp or *ifp are NULL then packet filtering is skipped for * that interface. */ static int bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir) { int snap, error, i, hlen; struct ether_header *eh1, eh2; struct ip *ip; struct llc llc1; u_int16_t ether_type; snap = 0; error = -1; /* Default error if not error == 0 */ #if 0 /* we may return with the IP fields swapped, ensure its not shared */ KASSERT(M_WRITABLE(*mp), ("%s: modifying a shared mbuf", __func__)); #endif if (V_pfil_bridge == 0 && V_pfil_member == 0 && V_pfil_ipfw == 0) return (0); /* filtering is disabled */ i = min((*mp)->m_pkthdr.len, max_protohdr); if ((*mp)->m_len < i) { *mp = m_pullup(*mp, i); if (*mp == NULL) { printf("%s: m_pullup failed\n", __func__); return (-1); } } eh1 = mtod(*mp, struct ether_header *); ether_type = ntohs(eh1->ether_type); /* * Check for SNAP/LLC. */ if (ether_type < ETHERMTU) { struct llc *llc2 = (struct llc *)(eh1 + 1); if ((*mp)->m_len >= ETHER_HDR_LEN + 8 && llc2->llc_dsap == LLC_SNAP_LSAP && llc2->llc_ssap == LLC_SNAP_LSAP && llc2->llc_control == LLC_UI) { ether_type = htons(llc2->llc_un.type_snap.ether_type); snap = 1; } } /* * If we're trying to filter bridge traffic, don't look at anything * other than IP and ARP traffic. If the filter doesn't understand * IPv6, don't allow IPv6 through the bridge either. This is lame * since if we really wanted, say, an AppleTalk filter, we are hosed, * but of course we don't have an AppleTalk filter to begin with. * (Note that since pfil doesn't understand ARP it will pass *ALL* * ARP traffic.) */ switch (ether_type) { case ETHERTYPE_ARP: case ETHERTYPE_REVARP: if (V_pfil_ipfw_arp == 0) return (0); /* Automatically pass */ break; case ETHERTYPE_IP: #ifdef INET6 case ETHERTYPE_IPV6: #endif /* INET6 */ break; default: /* * Check to see if the user wants to pass non-ip * packets, these will not be checked by pfil(9) and * passed unconditionally so the default is to drop. */ if (V_pfil_onlyip) goto bad; } /* Run the packet through pfil before stripping link headers */ if (PFIL_HOOKED(&V_link_pfil_hook) && V_pfil_ipfw != 0 && dir == PFIL_OUT && ifp != NULL) { error = pfil_run_hooks(&V_link_pfil_hook, mp, ifp, dir, NULL); if (*mp == NULL || error != 0) /* packet consumed by filter */ return (error); } /* Strip off the Ethernet header and keep a copy. */ m_copydata(*mp, 0, ETHER_HDR_LEN, (caddr_t) &eh2); m_adj(*mp, ETHER_HDR_LEN); /* Strip off snap header, if present */ if (snap) { m_copydata(*mp, 0, sizeof(struct llc), (caddr_t) &llc1); m_adj(*mp, sizeof(struct llc)); } /* * Check the IP header for alignment and errors */ if (dir == PFIL_IN) { switch (ether_type) { case ETHERTYPE_IP: error = bridge_ip_checkbasic(mp); break; #ifdef INET6 case ETHERTYPE_IPV6: error = bridge_ip6_checkbasic(mp); break; #endif /* INET6 */ default: error = 0; } if (error) goto bad; } error = 0; /* * Run the packet through pfil */ switch (ether_type) { case ETHERTYPE_IP: /* * Run pfil on the member interface and the bridge, both can * be skipped by clearing pfil_member or pfil_bridge. * * Keep the order: * in_if -> bridge_if -> out_if */ if (V_pfil_bridge && dir == PFIL_OUT && bifp != NULL) error = pfil_run_hooks(&V_inet_pfil_hook, mp, bifp, dir, NULL); if (*mp == NULL || error != 0) /* filter may consume */ break; if (V_pfil_member && ifp != NULL) error = pfil_run_hooks(&V_inet_pfil_hook, mp, ifp, dir, NULL); if (*mp == NULL || error != 0) /* filter may consume */ break; if (V_pfil_bridge && dir == PFIL_IN && bifp != NULL) error = pfil_run_hooks(&V_inet_pfil_hook, mp, bifp, dir, NULL); if (*mp == NULL || error != 0) /* filter may consume */ break; /* check if we need to fragment the packet */ if (V_pfil_member && ifp != NULL && dir == PFIL_OUT) { i = (*mp)->m_pkthdr.len; if (i > ifp->if_mtu) { error = bridge_fragment(ifp, *mp, &eh2, snap, &llc1); return (error); } } /* Recalculate the ip checksum. */ ip = mtod(*mp, struct ip *); hlen = ip->ip_hl << 2; if (hlen < sizeof(struct ip)) goto bad; if (hlen > (*mp)->m_len) { if ((*mp = m_pullup(*mp, hlen)) == 0) goto bad; ip = mtod(*mp, struct ip *); if (ip == NULL) goto bad; } ip->ip_sum = 0; if (hlen == sizeof(struct ip)) ip->ip_sum = in_cksum_hdr(ip); else ip->ip_sum = in_cksum(*mp, hlen); break; #ifdef INET6 case ETHERTYPE_IPV6: if (V_pfil_bridge && dir == PFIL_OUT && bifp != NULL) error = pfil_run_hooks(&V_inet6_pfil_hook, mp, bifp, dir, NULL); if (*mp == NULL || error != 0) /* filter may consume */ break; if (V_pfil_member && ifp != NULL) error = pfil_run_hooks(&V_inet6_pfil_hook, mp, ifp, dir, NULL); if (*mp == NULL || error != 0) /* filter may consume */ break; if (V_pfil_bridge && dir == PFIL_IN && bifp != NULL) error = pfil_run_hooks(&V_inet6_pfil_hook, mp, bifp, dir, NULL); break; #endif default: error = 0; break; } if (*mp == NULL) return (error); if (error != 0) goto bad; error = -1; /* * Finally, put everything back the way it was and return */ if (snap) { M_PREPEND(*mp, sizeof(struct llc), M_NOWAIT); if (*mp == NULL) return (error); bcopy(&llc1, mtod(*mp, caddr_t), sizeof(struct llc)); } M_PREPEND(*mp, ETHER_HDR_LEN, M_NOWAIT); if (*mp == NULL) return (error); bcopy(&eh2, mtod(*mp, caddr_t), ETHER_HDR_LEN); return (0); bad: m_freem(*mp); *mp = NULL; return (error); } /* * Perform basic checks on header size since * pfil assumes ip_input has already processed * it for it. Cut-and-pasted from ip_input.c. * Given how simple the IPv6 version is, * does the IPv4 version really need to be * this complicated? * * XXX Should we update ipstat here, or not? * XXX Right now we update ipstat but not * XXX csum_counter. */ static int bridge_ip_checkbasic(struct mbuf **mp) { struct mbuf *m = *mp; struct ip *ip; int len, hlen; u_short sum; if (*mp == NULL) return (-1); if (IP_HDR_ALIGNED_P(mtod(m, caddr_t)) == 0) { if ((m = m_copyup(m, sizeof(struct ip), (max_linkhdr + 3) & ~3)) == NULL) { /* XXXJRT new stat, please */ KMOD_IPSTAT_INC(ips_toosmall); goto bad; } } else if (__predict_false(m->m_len < sizeof (struct ip))) { if ((m = m_pullup(m, sizeof (struct ip))) == NULL) { KMOD_IPSTAT_INC(ips_toosmall); goto bad; } } ip = mtod(m, struct ip *); if (ip == NULL) goto bad; if (ip->ip_v != IPVERSION) { KMOD_IPSTAT_INC(ips_badvers); goto bad; } hlen = ip->ip_hl << 2; if (hlen < sizeof(struct ip)) { /* minimum header length */ KMOD_IPSTAT_INC(ips_badhlen); goto bad; } if (hlen > m->m_len) { if ((m = m_pullup(m, hlen)) == 0) { KMOD_IPSTAT_INC(ips_badhlen); goto bad; } ip = mtod(m, struct ip *); if (ip == NULL) goto bad; } if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); } else { if (hlen == sizeof(struct ip)) { sum = in_cksum_hdr(ip); } else { sum = in_cksum(m, hlen); } } if (sum) { KMOD_IPSTAT_INC(ips_badsum); goto bad; } /* Retrieve the packet length. */ len = ntohs(ip->ip_len); /* * Check for additional length bogosity */ if (len < hlen) { KMOD_IPSTAT_INC(ips_badlen); goto bad; } /* * Check that the amount of data in the buffers * is as at least much as the IP header would have us expect. * Drop packet if shorter than we expect. */ if (m->m_pkthdr.len < len) { KMOD_IPSTAT_INC(ips_tooshort); goto bad; } /* Checks out, proceed */ *mp = m; return (0); bad: *mp = m; return (-1); } #ifdef INET6 /* * Same as above, but for IPv6. * Cut-and-pasted from ip6_input.c. * XXX Should we update ip6stat, or not? */ static int bridge_ip6_checkbasic(struct mbuf **mp) { struct mbuf *m = *mp; struct ip6_hdr *ip6; /* * If the IPv6 header is not aligned, slurp it up into a new * mbuf with space for link headers, in the event we forward * it. Otherwise, if it is aligned, make sure the entire base * IPv6 header is in the first mbuf of the chain. */ if (IP6_HDR_ALIGNED_P(mtod(m, caddr_t)) == 0) { struct ifnet *inifp = m->m_pkthdr.rcvif; if ((m = m_copyup(m, sizeof(struct ip6_hdr), (max_linkhdr + 3) & ~3)) == NULL) { /* XXXJRT new stat, please */ IP6STAT_INC(ip6s_toosmall); in6_ifstat_inc(inifp, ifs6_in_hdrerr); goto bad; } } else if (__predict_false(m->m_len < sizeof(struct ip6_hdr))) { struct ifnet *inifp = m->m_pkthdr.rcvif; if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) { IP6STAT_INC(ip6s_toosmall); in6_ifstat_inc(inifp, ifs6_in_hdrerr); goto bad; } } ip6 = mtod(m, struct ip6_hdr *); if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { IP6STAT_INC(ip6s_badvers); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr); goto bad; } /* Checks out, proceed */ *mp = m; return (0); bad: *mp = m; return (-1); } #endif /* INET6 */ /* * bridge_fragment: * * Return a fragmented mbuf chain. */ static int bridge_fragment(struct ifnet *ifp, struct mbuf *m, struct ether_header *eh, int snap, struct llc *llc) { struct mbuf *m0; struct ip *ip; int error = -1; if (m->m_len < sizeof(struct ip) && (m = m_pullup(m, sizeof(struct ip))) == NULL) goto out; ip = mtod(m, struct ip *); m->m_pkthdr.csum_flags |= CSUM_IP; error = ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist); if (error) goto out; /* walk the chain and re-add the Ethernet header */ for (m0 = m; m0; m0 = m0->m_nextpkt) { if (error == 0) { if (snap) { M_PREPEND(m0, sizeof(struct llc), M_NOWAIT); if (m0 == NULL) { error = ENOBUFS; continue; } bcopy(llc, mtod(m0, caddr_t), sizeof(struct llc)); } M_PREPEND(m0, ETHER_HDR_LEN, M_NOWAIT); if (m0 == NULL) { error = ENOBUFS; continue; } bcopy(eh, mtod(m0, caddr_t), ETHER_HDR_LEN); } else m_freem(m); } if (error == 0) KMOD_IPSTAT_INC(ips_fragmented); return (error); out: if (m != NULL) m_freem(m); return (error); } static void bridge_linkstate(struct ifnet *ifp) { struct bridge_softc *sc = ifp->if_bridge; struct bridge_iflist *bif; BRIDGE_LOCK(sc); bif = bridge_lookup_member_if(sc, ifp); if (bif == NULL) { BRIDGE_UNLOCK(sc); return; } bridge_linkcheck(sc); BRIDGE_UNLOCK(sc); bstp_linkstate(&bif->bif_stp); } static void bridge_linkcheck(struct bridge_softc *sc) { struct bridge_iflist *bif; int new_link, hasls; BRIDGE_LOCK_ASSERT(sc); new_link = LINK_STATE_DOWN; hasls = 0; /* Our link is considered up if at least one of our ports is active */ LIST_FOREACH(bif, &sc->sc_iflist, bif_next) { if (bif->bif_ifp->if_capabilities & IFCAP_LINKSTATE) hasls++; if (bif->bif_ifp->if_link_state == LINK_STATE_UP) { new_link = LINK_STATE_UP; break; } } if (!LIST_EMPTY(&sc->sc_iflist) && !hasls) { /* If no interfaces support link-state then we default to up */ new_link = LINK_STATE_UP; } if_link_state_change(sc->sc_ifp, new_link); } Index: user/dchagin/lemul/sys/net/if_ethersubr.c =================================================================== --- user/dchagin/lemul/sys/net/if_ethersubr.c (revision 274240) +++ user/dchagin/lemul/sys/net/if_ethersubr.c (revision 274241) @@ -1,1206 +1,1165 @@ /*- * Copyright (c) 1982, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if_ethersubr.c 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_netgraph.h" #include "opt_mbuf_profiling.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #include #endif #ifdef INET6 #include #endif int (*ef_inputp)(struct ifnet*, struct ether_header *eh, struct mbuf *m); int (*ef_outputp)(struct ifnet *ifp, struct mbuf **mp, const struct sockaddr *dst, short *tp, int *hlen); #include #ifdef CTASSERT CTASSERT(sizeof (struct ether_header) == ETHER_ADDR_LEN * 2 + 2); CTASSERT(sizeof (struct ether_addr) == ETHER_ADDR_LEN); #endif VNET_DEFINE(struct pfil_head, link_pfil_hook); /* Packet filter hooks */ /* netgraph node hooks for ng_ether(4) */ void (*ng_ether_input_p)(struct ifnet *ifp, struct mbuf **mp); void (*ng_ether_input_orphan_p)(struct ifnet *ifp, struct mbuf *m); int (*ng_ether_output_p)(struct ifnet *ifp, struct mbuf **mp); void (*ng_ether_attach_p)(struct ifnet *ifp); void (*ng_ether_detach_p)(struct ifnet *ifp); void (*vlan_input_p)(struct ifnet *, struct mbuf *); /* if_bridge(4) support */ struct mbuf *(*bridge_input_p)(struct ifnet *, struct mbuf *); int (*bridge_output_p)(struct ifnet *, struct mbuf *, struct sockaddr *, struct rtentry *); void (*bridge_dn_p)(struct mbuf *, struct ifnet *); /* if_lagg(4) support */ struct mbuf *(*lagg_input_p)(struct ifnet *, struct mbuf *); static const u_char etherbroadcastaddr[ETHER_ADDR_LEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; static int ether_resolvemulti(struct ifnet *, struct sockaddr **, struct sockaddr *); #ifdef VIMAGE static void ether_reassign(struct ifnet *, struct vnet *, char *); #endif -/* XXX: should be in an arp support file, not here */ -static MALLOC_DEFINE(M_ARPCOM, "arpcom", "802.* interface internals"); - #define ETHER_IS_BROADCAST(addr) \ (bcmp(etherbroadcastaddr, (addr), ETHER_ADDR_LEN) == 0) #define senderr(e) do { error = (e); goto bad;} while (0) static void update_mbuf_csumflags(struct mbuf *src, struct mbuf *dst) { int csum_flags = 0; if (src->m_pkthdr.csum_flags & CSUM_IP) csum_flags |= (CSUM_IP_CHECKED|CSUM_IP_VALID); if (src->m_pkthdr.csum_flags & CSUM_DELAY_DATA) csum_flags |= (CSUM_DATA_VALID|CSUM_PSEUDO_HDR); if (src->m_pkthdr.csum_flags & CSUM_SCTP) csum_flags |= CSUM_SCTP_VALID; dst->m_pkthdr.csum_flags |= csum_flags; if (csum_flags & CSUM_DATA_VALID) dst->m_pkthdr.csum_data = 0xffff; } /* * Ethernet output routine. * Encapsulate a packet of type family for the local net. * Use trailer local net encapsulation if enough data in first * packet leaves a multiple of 512 bytes of data in remainder. */ int ether_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { short type; int error = 0, hdrcmplt = 0; u_char esrc[ETHER_ADDR_LEN], edst[ETHER_ADDR_LEN]; struct llentry *lle = NULL; struct rtentry *rt0 = NULL; struct ether_header *eh; struct pf_mtag *t; int loop_copy = 1; int hlen; /* link layer header length */ if (ro != NULL) { if (!(m->m_flags & (M_BCAST | M_MCAST))) lle = ro->ro_lle; rt0 = ro->ro_rt; } #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) senderr(error); #endif M_PROFILE(m); if (ifp->if_flags & IFF_MONITOR) senderr(ENETDOWN); if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) senderr(ENETDOWN); hlen = ETHER_HDR_LEN; switch (dst->sa_family) { #ifdef INET case AF_INET: if (lle != NULL && (lle->la_flags & LLE_VALID)) memcpy(edst, &lle->ll_addr.mac16, sizeof(edst)); else error = arpresolve(ifp, rt0, m, dst, edst, &lle); if (error) return (error == EWOULDBLOCK ? 0 : error); type = htons(ETHERTYPE_IP); break; case AF_ARP: { struct arphdr *ah; ah = mtod(m, struct arphdr *); ah->ar_hrd = htons(ARPHRD_ETHER); loop_copy = 0; /* if this is for us, don't do it */ switch(ntohs(ah->ar_op)) { case ARPOP_REVREQUEST: case ARPOP_REVREPLY: type = htons(ETHERTYPE_REVARP); break; case ARPOP_REQUEST: case ARPOP_REPLY: default: type = htons(ETHERTYPE_ARP); break; } if (m->m_flags & M_BCAST) bcopy(ifp->if_broadcastaddr, edst, ETHER_ADDR_LEN); else bcopy(ar_tha(ah), edst, ETHER_ADDR_LEN); } break; #endif #ifdef INET6 case AF_INET6: if (lle != NULL && (lle->la_flags & LLE_VALID)) memcpy(edst, &lle->ll_addr.mac16, sizeof(edst)); else error = nd6_storelladdr(ifp, m, dst, (u_char *)edst, &lle); if (error) return error; type = htons(ETHERTYPE_IPV6); break; #endif case pseudo_AF_HDRCMPLT: { const struct ether_header *eh; hdrcmplt = 1; eh = (const struct ether_header *)dst->sa_data; (void)memcpy(esrc, eh->ether_shost, sizeof (esrc)); /* FALLTHROUGH */ case AF_UNSPEC: loop_copy = 0; /* if this is for us, don't do it */ eh = (const struct ether_header *)dst->sa_data; (void)memcpy(edst, eh->ether_dhost, sizeof (edst)); type = eh->ether_type; break; } default: if_printf(ifp, "can't handle af%d\n", dst->sa_family); senderr(EAFNOSUPPORT); } if (lle != NULL && (lle->la_flags & LLE_IFADDR)) { update_mbuf_csumflags(m, m); return (if_simloop(ifp, m, dst->sa_family, 0)); } /* * Add local net header. If no space in first mbuf, * allocate another. */ M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT); if (m == NULL) senderr(ENOBUFS); eh = mtod(m, struct ether_header *); (void)memcpy(&eh->ether_type, &type, sizeof(eh->ether_type)); (void)memcpy(eh->ether_dhost, edst, sizeof (edst)); if (hdrcmplt) (void)memcpy(eh->ether_shost, esrc, sizeof(eh->ether_shost)); else (void)memcpy(eh->ether_shost, IF_LLADDR(ifp), sizeof(eh->ether_shost)); /* * If a simplex interface, and the packet is being sent to our * Ethernet address or a broadcast address, loopback a copy. * XXX To make a simplex device behave exactly like a duplex * device, we should copy in the case of sending to our own * ethernet address (thus letting the original actually appear * on the wire). However, we don't do that here for security * reasons and compatibility with the original behavior. */ if ((ifp->if_flags & IFF_SIMPLEX) && loop_copy && ((t = pf_find_mtag(m)) == NULL || !t->routed)) { if (m->m_flags & M_BCAST) { struct mbuf *n; /* * Because if_simloop() modifies the packet, we need a * writable copy through m_dup() instead of a readonly * one as m_copy[m] would give us. The alternative would * be to modify if_simloop() to handle the readonly mbuf, * but performancewise it is mostly equivalent (trading * extra data copying vs. extra locking). * * XXX This is a local workaround. A number of less * often used kernel parts suffer from the same bug. * See PR kern/105943 for a proposed general solution. */ if ((n = m_dup(m, M_NOWAIT)) != NULL) { update_mbuf_csumflags(m, n); (void)if_simloop(ifp, n, dst->sa_family, hlen); } else if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); } else if (bcmp(eh->ether_dhost, eh->ether_shost, ETHER_ADDR_LEN) == 0) { update_mbuf_csumflags(m, m); (void) if_simloop(ifp, m, dst->sa_family, hlen); return (0); /* XXX */ } } /* * Bridges require special output handling. */ if (ifp->if_bridge) { BRIDGE_OUTPUT(ifp, m, error); return (error); } #if defined(INET) || defined(INET6) if (ifp->if_carp && (error = (*carp_output_p)(ifp, m, dst))) goto bad; #endif /* Handle ng_ether(4) processing, if any */ - if (IFP2AC(ifp)->ac_netgraph != NULL) { + if (ifp->if_l2com != NULL) { KASSERT(ng_ether_output_p != NULL, ("ng_ether_output_p is NULL")); if ((error = (*ng_ether_output_p)(ifp, &m)) != 0) { bad: if (m != NULL) m_freem(m); return (error); } if (m == NULL) return (0); } /* Continue with link-layer output */ return ether_output_frame(ifp, m); } /* * Ethernet link layer output routine to send a raw frame to the device. * * This assumes that the 14 byte Ethernet header is present and contiguous * in the first mbuf (if BRIDGE'ing). */ int ether_output_frame(struct ifnet *ifp, struct mbuf *m) { int i; if (PFIL_HOOKED(&V_link_pfil_hook)) { i = pfil_run_hooks(&V_link_pfil_hook, &m, ifp, PFIL_OUT, NULL); if (i != 0) return (EACCES); if (m == NULL) return (0); } /* * Queue message on interface, update output statistics if * successful, and start output if interface not yet active. */ return ((ifp->if_transmit)(ifp, m)); } #if defined(INET) || defined(INET6) #endif /* * Process a received Ethernet packet; the packet is in the * mbuf chain m with the ethernet header at the front. */ static void ether_input_internal(struct ifnet *ifp, struct mbuf *m) { struct ether_header *eh; u_short etype; if ((ifp->if_flags & IFF_UP) == 0) { m_freem(m); return; } #ifdef DIAGNOSTIC if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { if_printf(ifp, "discard frame at !IFF_DRV_RUNNING\n"); m_freem(m); return; } #endif /* * Do consistency checks to verify assumptions * made by code past this point. */ if ((m->m_flags & M_PKTHDR) == 0) { if_printf(ifp, "discard frame w/o packet header\n"); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return; } if (m->m_len < ETHER_HDR_LEN) { /* XXX maybe should pullup? */ if_printf(ifp, "discard frame w/o leading ethernet " "header (len %u pkt len %u)\n", m->m_len, m->m_pkthdr.len); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return; } eh = mtod(m, struct ether_header *); etype = ntohs(eh->ether_type); if (m->m_pkthdr.rcvif == NULL) { if_printf(ifp, "discard frame w/o interface pointer\n"); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return; } #ifdef DIAGNOSTIC if (m->m_pkthdr.rcvif != ifp) { if_printf(ifp, "Warning, frame marked as received on %s\n", m->m_pkthdr.rcvif->if_xname); } #endif CURVNET_SET_QUIET(ifp->if_vnet); if (ETHER_IS_MULTICAST(eh->ether_dhost)) { if (ETHER_IS_BROADCAST(eh->ether_dhost)) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); } #ifdef MAC /* * Tag the mbuf with an appropriate MAC label before any other * consumers can get to it. */ mac_ifnet_create_mbuf(ifp, m); #endif /* * Give bpf a chance at the packet. */ ETHER_BPF_MTAP(ifp, m); /* * If the CRC is still on the packet, trim it off. We do this once * and once only in case we are re-entered. Nothing else on the * Ethernet receive path expects to see the FCS. */ if (m->m_flags & M_HASFCS) { m_adj(m, -ETHER_CRC_LEN); m->m_flags &= ~M_HASFCS; } if (!(ifp->if_capenable & IFCAP_HWSTATS)) if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); /* Allow monitor mode to claim this frame, after stats are updated. */ if (ifp->if_flags & IFF_MONITOR) { m_freem(m); CURVNET_RESTORE(); return; } /* Handle input from a lagg(4) port */ if (ifp->if_type == IFT_IEEE8023ADLAG) { KASSERT(lagg_input_p != NULL, ("%s: if_lagg not loaded!", __func__)); m = (*lagg_input_p)(ifp, m); if (m != NULL) ifp = m->m_pkthdr.rcvif; else { CURVNET_RESTORE(); return; } } /* * If the hardware did not process an 802.1Q tag, do this now, * to allow 802.1P priority frames to be passed to the main input * path correctly. * TODO: Deal with Q-in-Q frames, but not arbitrary nesting levels. */ if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_VLAN) { struct ether_vlan_header *evl; if (m->m_len < sizeof(*evl) && (m = m_pullup(m, sizeof(*evl))) == NULL) { #ifdef DIAGNOSTIC if_printf(ifp, "cannot pullup VLAN header\n"); #endif if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); CURVNET_RESTORE(); return; } evl = mtod(m, struct ether_vlan_header *); m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag); m->m_flags |= M_VLANTAG; bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN, ETHER_HDR_LEN - ETHER_TYPE_LEN); m_adj(m, ETHER_VLAN_ENCAP_LEN); eh = mtod(m, struct ether_header *); } M_SETFIB(m, ifp->if_fib); /* Allow ng_ether(4) to claim this frame. */ - if (IFP2AC(ifp)->ac_netgraph != NULL) { + if (ifp->if_l2com != NULL) { KASSERT(ng_ether_input_p != NULL, ("%s: ng_ether_input_p is NULL", __func__)); m->m_flags &= ~M_PROMISC; (*ng_ether_input_p)(ifp, &m); if (m == NULL) { CURVNET_RESTORE(); return; } eh = mtod(m, struct ether_header *); } /* * Allow if_bridge(4) to claim this frame. * The BRIDGE_INPUT() macro will update ifp if the bridge changed it * and the frame should be delivered locally. */ if (ifp->if_bridge != NULL) { m->m_flags &= ~M_PROMISC; BRIDGE_INPUT(ifp, m); if (m == NULL) { CURVNET_RESTORE(); return; } eh = mtod(m, struct ether_header *); } #if defined(INET) || defined(INET6) /* * Clear M_PROMISC on frame so that carp(4) will see it when the * mbuf flows up to Layer 3. * FreeBSD's implementation of carp(4) uses the inprotosw * to dispatch IPPROTO_CARP. carp(4) also allocates its own * Ethernet addresses of the form 00:00:5e:00:01:xx, which * is outside the scope of the M_PROMISC test below. * TODO: Maintain a hash table of ethernet addresses other than * ether_dhost which may be active on this ifp. */ if (ifp->if_carp && (*carp_forus_p)(ifp, eh->ether_dhost)) { m->m_flags &= ~M_PROMISC; } else #endif { /* * If the frame received was not for our MAC address, set the * M_PROMISC flag on the mbuf chain. The frame may need to * be seen by the rest of the Ethernet input path in case of * re-entry (e.g. bridge, vlan, netgraph) but should not be * seen by upper protocol layers. */ if (!ETHER_IS_MULTICAST(eh->ether_dhost) && bcmp(IF_LLADDR(ifp), eh->ether_dhost, ETHER_ADDR_LEN) != 0) m->m_flags |= M_PROMISC; } random_harvest(&(m->m_data), 12, 2, RANDOM_NET_ETHER); ether_demux(ifp, m); CURVNET_RESTORE(); } /* * Ethernet input dispatch; by default, direct dispatch here regardless of * global configuration. However, if RSS is enabled, hook up RSS affinity * so that when deferred or hybrid dispatch is enabled, we can redistribute * load based on RSS. * * XXXRW: Would be nice if the ifnet passed up a flag indicating whether or * not it had already done work distribution via multi-queue. Then we could * direct dispatch in the event load balancing was already complete and * handle the case of interfaces with different capabilities better. * * XXXRW: Sort of want an M_DISTRIBUTED flag to avoid multiple distributions * at multiple layers? * * XXXRW: For now, enable all this only if RSS is compiled in, although it * works fine without RSS. Need to characterise the performance overhead * of the detour through the netisr code in the event the result is always * direct dispatch. */ static void ether_nh_input(struct mbuf *m) { ether_input_internal(m->m_pkthdr.rcvif, m); } static struct netisr_handler ether_nh = { .nh_name = "ether", .nh_handler = ether_nh_input, .nh_proto = NETISR_ETHER, #ifdef RSS .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_DIRECT, .nh_m2cpuid = rss_m2cpuid, #else .nh_policy = NETISR_POLICY_SOURCE, .nh_dispatch = NETISR_DISPATCH_DIRECT, #endif }; static void ether_init(__unused void *arg) { netisr_register(ðer_nh); } SYSINIT(ether, SI_SUB_INIT_IF, SI_ORDER_ANY, ether_init, NULL); static void vnet_ether_init(__unused void *arg) { int i; /* Initialize packet filter hooks. */ V_link_pfil_hook.ph_type = PFIL_TYPE_AF; V_link_pfil_hook.ph_af = AF_LINK; if ((i = pfil_head_register(&V_link_pfil_hook)) != 0) printf("%s: WARNING: unable to register pfil link hook, " "error %d\n", __func__, i); } VNET_SYSINIT(vnet_ether_init, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_ether_init, NULL); static void vnet_ether_destroy(__unused void *arg) { int i; if ((i = pfil_head_unregister(&V_link_pfil_hook)) != 0) printf("%s: WARNING: unable to unregister pfil link hook, " "error %d\n", __func__, i); } VNET_SYSUNINIT(vnet_ether_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_ether_destroy, NULL); static void ether_input(struct ifnet *ifp, struct mbuf *m) { struct mbuf *mn; /* * The drivers are allowed to pass in a chain of packets linked with * m_nextpkt. We split them up into separate packets here and pass * them up. This allows the drivers to amortize the receive lock. */ while (m) { mn = m->m_nextpkt; m->m_nextpkt = NULL; /* * We will rely on rcvif being set properly in the deferred context, * so assert it is correct here. */ KASSERT(m->m_pkthdr.rcvif == ifp, ("%s: ifnet mismatch", __func__)); netisr_dispatch(NETISR_ETHER, m); m = mn; } } /* * Upper layer processing for a received Ethernet packet. */ void ether_demux(struct ifnet *ifp, struct mbuf *m) { struct ether_header *eh; int i, isr; u_short ether_type; KASSERT(ifp != NULL, ("%s: NULL interface pointer", __func__)); /* Do not grab PROMISC frames in case we are re-entered. */ if (PFIL_HOOKED(&V_link_pfil_hook) && !(m->m_flags & M_PROMISC)) { i = pfil_run_hooks(&V_link_pfil_hook, &m, ifp, PFIL_IN, NULL); if (i != 0 || m == NULL) return; } eh = mtod(m, struct ether_header *); ether_type = ntohs(eh->ether_type); /* * If this frame has a VLAN tag other than 0, call vlan_input() * if its module is loaded. Otherwise, drop. */ if ((m->m_flags & M_VLANTAG) && EVL_VLANOFTAG(m->m_pkthdr.ether_vtag) != 0) { if (ifp->if_vlantrunk == NULL) { if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); m_freem(m); return; } KASSERT(vlan_input_p != NULL,("%s: VLAN not loaded!", __func__)); /* Clear before possibly re-entering ether_input(). */ m->m_flags &= ~M_PROMISC; (*vlan_input_p)(ifp, m); return; } /* * Pass promiscuously received frames to the upper layer if the user * requested this by setting IFF_PPROMISC. Otherwise, drop them. */ if ((ifp->if_flags & IFF_PPROMISC) == 0 && (m->m_flags & M_PROMISC)) { m_freem(m); return; } /* * Reset layer specific mbuf flags to avoid confusing upper layers. * Strip off Ethernet header. */ m->m_flags &= ~M_VLANTAG; m_clrprotoflags(m); m_adj(m, ETHER_HDR_LEN); /* * Dispatch frame to upper layer. */ switch (ether_type) { #ifdef INET case ETHERTYPE_IP: if ((m = ip_fastforward(m)) == NULL) return; isr = NETISR_IP; break; case ETHERTYPE_ARP: if (ifp->if_flags & IFF_NOARP) { /* Discard packet if ARP is disabled on interface */ m_freem(m); return; } isr = NETISR_ARP; break; #endif #ifdef INET6 case ETHERTYPE_IPV6: isr = NETISR_IPV6; break; #endif default: goto discard; } netisr_dispatch(isr, m); return; discard: /* * Packet is to be discarded. If netgraph is present, * hand the packet to it for last chance processing; * otherwise dispose of it. */ - if (IFP2AC(ifp)->ac_netgraph != NULL) { + if (ifp->if_l2com != NULL) { KASSERT(ng_ether_input_orphan_p != NULL, ("ng_ether_input_orphan_p is NULL")); /* * Put back the ethernet header so netgraph has a * consistent view of inbound packets. */ M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT); (*ng_ether_input_orphan_p)(ifp, m); return; } m_freem(m); } /* * Convert Ethernet address to printable (loggable) representation. * This routine is for compatibility; it's better to just use * * printf("%6D", , ":"); * * since there's no static buffer involved. */ char * ether_sprintf(const u_char *ap) { static char etherbuf[18]; snprintf(etherbuf, sizeof (etherbuf), "%6D", ap, ":"); return (etherbuf); } /* * Perform common duties while attaching to interface list */ void ether_ifattach(struct ifnet *ifp, const u_int8_t *lla) { int i; struct ifaddr *ifa; struct sockaddr_dl *sdl; ifp->if_addrlen = ETHER_ADDR_LEN; ifp->if_hdrlen = ETHER_HDR_LEN; if_attach(ifp); ifp->if_mtu = ETHERMTU; ifp->if_output = ether_output; ifp->if_input = ether_input; ifp->if_resolvemulti = ether_resolvemulti; #ifdef VIMAGE ifp->if_reassign = ether_reassign; #endif if (ifp->if_baudrate == 0) ifp->if_baudrate = IF_Mbps(10); /* just a default */ ifp->if_broadcastaddr = etherbroadcastaddr; ifa = ifp->if_addr; KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__)); sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_type = IFT_ETHER; sdl->sdl_alen = ifp->if_addrlen; bcopy(lla, LLADDR(sdl), ifp->if_addrlen); bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN); if (ng_ether_attach_p != NULL) (*ng_ether_attach_p)(ifp); /* Announce Ethernet MAC address if non-zero. */ for (i = 0; i < ifp->if_addrlen; i++) if (lla[i] != 0) break; if (i != ifp->if_addrlen) if_printf(ifp, "Ethernet address: %6D\n", lla, ":"); uuid_ether_add(LLADDR(sdl)); } /* * Perform common duties while detaching an Ethernet interface */ void ether_ifdetach(struct ifnet *ifp) { struct sockaddr_dl *sdl; sdl = (struct sockaddr_dl *)(ifp->if_addr->ifa_addr); uuid_ether_del(LLADDR(sdl)); - if (IFP2AC(ifp)->ac_netgraph != NULL) { + if (ifp->if_l2com != NULL) { KASSERT(ng_ether_detach_p != NULL, ("ng_ether_detach_p is NULL")); (*ng_ether_detach_p)(ifp); } bpfdetach(ifp); if_detach(ifp); } #ifdef VIMAGE void ether_reassign(struct ifnet *ifp, struct vnet *new_vnet, char *unused __unused) { - if (IFP2AC(ifp)->ac_netgraph != NULL) { + if (ifp->if_l2com != NULL) { KASSERT(ng_ether_detach_p != NULL, ("ng_ether_detach_p is NULL")); (*ng_ether_detach_p)(ifp); } if (ng_ether_attach_p != NULL) { CURVNET_SET_QUIET(new_vnet); (*ng_ether_attach_p)(ifp); CURVNET_RESTORE(); } } #endif SYSCTL_DECL(_net_link); SYSCTL_NODE(_net_link, IFT_ETHER, ether, CTLFLAG_RW, 0, "Ethernet"); #if 0 /* * This is for reference. We have a table-driven version * of the little-endian crc32 generator, which is faster * than the double-loop. */ uint32_t ether_crc32_le(const uint8_t *buf, size_t len) { size_t i; uint32_t crc; int bit; uint8_t data; crc = 0xffffffff; /* initial value */ for (i = 0; i < len; i++) { for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) { carry = (crc ^ data) & 1; crc >>= 1; if (carry) crc = (crc ^ ETHER_CRC_POLY_LE); } } return (crc); } #else uint32_t ether_crc32_le(const uint8_t *buf, size_t len) { static const uint32_t crctab[] = { 0x00000000, 0x1db71064, 0x3b6e20c8, 0x26d930ac, 0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c, 0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c, 0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c }; size_t i; uint32_t crc; crc = 0xffffffff; /* initial value */ for (i = 0; i < len; i++) { crc ^= buf[i]; crc = (crc >> 4) ^ crctab[crc & 0xf]; crc = (crc >> 4) ^ crctab[crc & 0xf]; } return (crc); } #endif uint32_t ether_crc32_be(const uint8_t *buf, size_t len) { size_t i; uint32_t crc, carry; int bit; uint8_t data; crc = 0xffffffff; /* initial value */ for (i = 0; i < len; i++) { for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) { carry = ((crc & 0x80000000) ? 1 : 0) ^ (data & 0x01); crc <<= 1; if (carry) crc = (crc ^ ETHER_CRC_POLY_BE) | carry; } } return (crc); } int ether_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct ifaddr *ifa = (struct ifaddr *) data; struct ifreq *ifr = (struct ifreq *) data; int error = 0; switch (command) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: ifp->if_init(ifp->if_softc); /* before arpwhohas */ arp_ifinit(ifp, ifa); break; #endif default: ifp->if_init(ifp->if_softc); break; } break; case SIOCGIFADDR: { struct sockaddr *sa; sa = (struct sockaddr *) & ifr->ifr_data; bcopy(IF_LLADDR(ifp), (caddr_t) sa->sa_data, ETHER_ADDR_LEN); } break; case SIOCSIFMTU: /* * Set the interface MTU. */ if (ifr->ifr_mtu > ETHERMTU) { error = EINVAL; } else { ifp->if_mtu = ifr->ifr_mtu; } break; default: error = EINVAL; /* XXX netbsd has ENOTTY??? */ break; } return (error); } static int ether_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, struct sockaddr *sa) { struct sockaddr_dl *sdl; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif u_char *e_addr; switch(sa->sa_family) { case AF_LINK: /* * No mapping needed. Just check that it's a valid MC address. */ sdl = (struct sockaddr_dl *)sa; e_addr = LLADDR(sdl); if (!ETHER_IS_MULTICAST(e_addr)) return EADDRNOTAVAIL; *llsa = 0; return 0; #ifdef INET case AF_INET: sin = (struct sockaddr_in *)sa; if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) return EADDRNOTAVAIL; sdl = link_init_sdl(ifp, *llsa, IFT_ETHER); sdl->sdl_alen = ETHER_ADDR_LEN; e_addr = LLADDR(sdl); ETHER_MAP_IP_MULTICAST(&sin->sin_addr, e_addr); *llsa = (struct sockaddr *)sdl; return 0; #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* * An IP6 address of 0 means listen to all * of the Ethernet multicast address used for IP6. * (This is used for multicast routers.) */ ifp->if_flags |= IFF_ALLMULTI; *llsa = 0; return 0; } if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return EADDRNOTAVAIL; sdl = link_init_sdl(ifp, *llsa, IFT_ETHER); sdl->sdl_alen = ETHER_ADDR_LEN; e_addr = LLADDR(sdl); ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, e_addr); *llsa = (struct sockaddr *)sdl; return 0; #endif default: /* * Well, the text isn't quite right, but it's the name * that counts... */ return EAFNOSUPPORT; } } -static void* -ether_alloc(u_char type, struct ifnet *ifp) -{ - struct arpcom *ac; - - ac = malloc(sizeof(struct arpcom), M_ARPCOM, M_WAITOK | M_ZERO); - ac->ac_ifp = ifp; - - return (ac); -} - -static void -ether_free(void *com, u_char type) -{ - - free(com, M_ARPCOM); -} - -static int -ether_modevent(module_t mod, int type, void *data) -{ - - switch (type) { - case MOD_LOAD: - if_register_com_alloc(IFT_ETHER, ether_alloc, ether_free); - break; - case MOD_UNLOAD: - if_deregister_com_alloc(IFT_ETHER); - break; - default: - return EOPNOTSUPP; - } - - return (0); -} - static moduledata_t ether_mod = { - "ether", - ether_modevent, - 0 + .name = "ether", }; void ether_vlan_mtap(struct bpf_if *bp, struct mbuf *m, void *data, u_int dlen) { struct ether_vlan_header vlan; struct mbuf mv, mb; KASSERT((m->m_flags & M_VLANTAG) != 0, ("%s: vlan information not present", __func__)); KASSERT(m->m_len >= sizeof(struct ether_header), ("%s: mbuf not large enough for header", __func__)); bcopy(mtod(m, char *), &vlan, sizeof(struct ether_header)); vlan.evl_proto = vlan.evl_encap_proto; vlan.evl_encap_proto = htons(ETHERTYPE_VLAN); vlan.evl_tag = htons(m->m_pkthdr.ether_vtag); m->m_len -= sizeof(struct ether_header); m->m_data += sizeof(struct ether_header); /* * If a data link has been supplied by the caller, then we will need to * re-create a stack allocated mbuf chain with the following structure: * * (1) mbuf #1 will contain the supplied data link * (2) mbuf #2 will contain the vlan header * (3) mbuf #3 will contain the original mbuf's packet data * * Otherwise, submit the packet and vlan header via bpf_mtap2(). */ if (data != NULL) { mv.m_next = m; mv.m_data = (caddr_t)&vlan; mv.m_len = sizeof(vlan); mb.m_next = &mv; mb.m_data = data; mb.m_len = dlen; bpf_mtap(bp, &mb); } else bpf_mtap2(bp, &vlan, sizeof(vlan), m); m->m_len += sizeof(struct ether_header); m->m_data -= sizeof(struct ether_header); } struct mbuf * ether_vlanencap(struct mbuf *m, uint16_t tag) { struct ether_vlan_header *evl; M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT); if (m == NULL) return (NULL); /* M_PREPEND takes care of m_len, m_pkthdr.len for us */ if (m->m_len < sizeof(*evl)) { m = m_pullup(m, sizeof(*evl)); if (m == NULL) return (NULL); } /* * Transform the Ethernet header into an Ethernet header * with 802.1Q encapsulation. */ evl = mtod(m, struct ether_vlan_header *); bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN, (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN); evl->evl_encap_proto = htons(ETHERTYPE_VLAN); evl->evl_tag = htons(tag); return (m); } DECLARE_MODULE(ether, ether_mod, SI_SUB_INIT_IF, SI_ORDER_ANY); MODULE_VERSION(ether, 1); Index: user/dchagin/lemul/sys/net/if_fddisubr.c =================================================================== --- user/dchagin/lemul/sys/net/if_fddisubr.c (revision 274240) +++ user/dchagin/lemul/sys/net/if_fddisubr.c (revision 274241) @@ -1,672 +1,671 @@ /*- * Copyright (c) 1995, 1996 * Matt Thomas . All rights reserved. * Copyright (c) 1982, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: if_ethersubr.c,v 1.5 1994/12/13 22:31:45 wollman Exp * $FreeBSD$ */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #endif #ifdef INET6 #include #endif #ifdef DECNET #include #endif #include static const u_char fddibroadcastaddr[FDDI_ADDR_LEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; static int fddi_resolvemulti(struct ifnet *, struct sockaddr **, struct sockaddr *); static int fddi_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static void fddi_input(struct ifnet *ifp, struct mbuf *m); #define senderr(e) do { error = (e); goto bad; } while (0) /* * FDDI output routine. * Encapsulate a packet of type family for the local net. * Use trailer local net encapsulation if enough data in first * packet leaves a multiple of 512 bytes of data in remainder. - * Assumes that ifp is actually pointer to arpcom structure. */ static int fddi_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { u_int16_t type; int loop_copy = 0, error = 0, hdrcmplt = 0; u_char esrc[FDDI_ADDR_LEN], edst[FDDI_ADDR_LEN]; struct fddi_header *fh; #if defined(INET) || defined(INET6) struct llentry *lle; #endif #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) senderr(error); #endif if (ifp->if_flags & IFF_MONITOR) senderr(ENETDOWN); if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) senderr(ENETDOWN); getmicrotime(&ifp->if_lastchange); switch (dst->sa_family) { #ifdef INET case AF_INET: { struct rtentry *rt0 = NULL; if (ro != NULL) rt0 = ro->ro_rt; error = arpresolve(ifp, rt0, m, dst, edst, &lle); if (error) return (error == EWOULDBLOCK ? 0 : error); type = htons(ETHERTYPE_IP); break; } case AF_ARP: { struct arphdr *ah; ah = mtod(m, struct arphdr *); ah->ar_hrd = htons(ARPHRD_ETHER); loop_copy = -1; /* if this is for us, don't do it */ switch (ntohs(ah->ar_op)) { case ARPOP_REVREQUEST: case ARPOP_REVREPLY: type = htons(ETHERTYPE_REVARP); break; case ARPOP_REQUEST: case ARPOP_REPLY: default: type = htons(ETHERTYPE_ARP); break; } if (m->m_flags & M_BCAST) bcopy(ifp->if_broadcastaddr, edst, FDDI_ADDR_LEN); else bcopy(ar_tha(ah), edst, FDDI_ADDR_LEN); } break; #endif /* INET */ #ifdef INET6 case AF_INET6: error = nd6_storelladdr(ifp, m, dst, (u_char *)edst, &lle); if (error) return (error); /* Something bad happened */ type = htons(ETHERTYPE_IPV6); break; #endif /* INET6 */ case pseudo_AF_HDRCMPLT: { const struct ether_header *eh; hdrcmplt = 1; eh = (const struct ether_header *)dst->sa_data; bcopy(eh->ether_shost, esrc, FDDI_ADDR_LEN); /* FALLTHROUGH */ } case AF_UNSPEC: { const struct ether_header *eh; loop_copy = -1; eh = (const struct ether_header *)dst->sa_data; bcopy(eh->ether_dhost, edst, FDDI_ADDR_LEN); if (*edst & 1) m->m_flags |= (M_BCAST|M_MCAST); type = eh->ether_type; break; } case AF_IMPLINK: { fh = mtod(m, struct fddi_header *); error = EPROTONOSUPPORT; switch (fh->fddi_fc & (FDDIFC_C|FDDIFC_L|FDDIFC_F)) { case FDDIFC_LLC_ASYNC: { /* legal priorities are 0 through 7 */ if ((fh->fddi_fc & FDDIFC_Z) > 7) goto bad; break; } case FDDIFC_LLC_SYNC: { /* FDDIFC_Z bits reserved, must be zero */ if (fh->fddi_fc & FDDIFC_Z) goto bad; break; } case FDDIFC_SMT: { /* FDDIFC_Z bits must be non zero */ if ((fh->fddi_fc & FDDIFC_Z) == 0) goto bad; break; } default: { /* anything else is too dangerous */ goto bad; } } error = 0; if (fh->fddi_dhost[0] & 1) m->m_flags |= (M_BCAST|M_MCAST); goto queue_it; } default: if_printf(ifp, "can't handle af%d\n", dst->sa_family); senderr(EAFNOSUPPORT); } /* * Add LLC header. */ if (type != 0) { struct llc *l; M_PREPEND(m, LLC_SNAPFRAMELEN, M_NOWAIT); if (m == 0) senderr(ENOBUFS); l = mtod(m, struct llc *); l->llc_control = LLC_UI; l->llc_dsap = l->llc_ssap = LLC_SNAP_LSAP; l->llc_snap.org_code[0] = l->llc_snap.org_code[1] = l->llc_snap.org_code[2] = 0; l->llc_snap.ether_type = htons(type); } /* * Add local net header. If no space in first mbuf, * allocate another. */ M_PREPEND(m, FDDI_HDR_LEN, M_NOWAIT); if (m == 0) senderr(ENOBUFS); fh = mtod(m, struct fddi_header *); fh->fddi_fc = FDDIFC_LLC_ASYNC|FDDIFC_LLC_PRIO4; bcopy((caddr_t)edst, (caddr_t)fh->fddi_dhost, FDDI_ADDR_LEN); queue_it: if (hdrcmplt) bcopy((caddr_t)esrc, (caddr_t)fh->fddi_shost, FDDI_ADDR_LEN); else bcopy(IF_LLADDR(ifp), (caddr_t)fh->fddi_shost, FDDI_ADDR_LEN); /* * If a simplex interface, and the packet is being sent to our * Ethernet address or a broadcast address, loopback a copy. * XXX To make a simplex device behave exactly like a duplex * device, we should copy in the case of sending to our own * ethernet address (thus letting the original actually appear * on the wire). However, we don't do that here for security * reasons and compatibility with the original behavior. */ if ((ifp->if_flags & IFF_SIMPLEX) && (loop_copy != -1)) { if ((m->m_flags & M_BCAST) || (loop_copy > 0)) { struct mbuf *n; n = m_copy(m, 0, (int)M_COPYALL); (void) if_simloop(ifp, n, dst->sa_family, FDDI_HDR_LEN); } else if (bcmp(fh->fddi_dhost, fh->fddi_shost, FDDI_ADDR_LEN) == 0) { (void) if_simloop(ifp, m, dst->sa_family, FDDI_HDR_LEN); return (0); /* XXX */ } } error = (ifp->if_transmit)(ifp, m); if (error) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (error); bad: if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); if (m) m_freem(m); return (error); } /* * Process a received FDDI packet. */ static void fddi_input(ifp, m) struct ifnet *ifp; struct mbuf *m; { int isr; struct llc *l; struct fddi_header *fh; /* * Do consistency checks to verify assumptions * made by code past this point. */ if ((m->m_flags & M_PKTHDR) == 0) { if_printf(ifp, "discard frame w/o packet header\n"); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return; } if (m->m_pkthdr.rcvif == NULL) { if_printf(ifp, "discard frame w/o interface pointer\n"); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return; } m = m_pullup(m, FDDI_HDR_LEN); if (m == NULL) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); goto dropanyway; } fh = mtod(m, struct fddi_header *); /* * Discard packet if interface is not up. */ if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) goto dropanyway; /* * Give bpf a chance at the packet. */ BPF_MTAP(ifp, m); /* * Interface marked for monitoring; discard packet. */ if (ifp->if_flags & IFF_MONITOR) { m_freem(m); return; } #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif /* * Update interface statistics. */ if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); getmicrotime(&ifp->if_lastchange); /* * Discard non local unicast packets when interface * is in promiscuous mode. */ if ((ifp->if_flags & IFF_PROMISC) && ((fh->fddi_dhost[0] & 1) == 0) && (bcmp(IF_LLADDR(ifp), (caddr_t)fh->fddi_dhost, FDDI_ADDR_LEN) != 0)) goto dropanyway; /* * Set mbuf flags for bcast/mcast. */ if (fh->fddi_dhost[0] & 1) { if (bcmp(ifp->if_broadcastaddr, fh->fddi_dhost, FDDI_ADDR_LEN) == 0) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); } #ifdef M_LINK0 /* * If this has a LLC priority of 0, then mark it so upper * layers have a hint that it really came via a FDDI/Ethernet * bridge. */ if ((fh->fddi_fc & FDDIFC_LLC_PRIO7) == FDDIFC_LLC_PRIO0) m->m_flags |= M_LINK0; #endif /* Strip off FDDI header. */ m_adj(m, FDDI_HDR_LEN); m = m_pullup(m, LLC_SNAPFRAMELEN); if (m == 0) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); goto dropanyway; } l = mtod(m, struct llc *); switch (l->llc_dsap) { case LLC_SNAP_LSAP: { u_int16_t type; if ((l->llc_control != LLC_UI) || (l->llc_ssap != LLC_SNAP_LSAP)) { if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; } if (l->llc_snap.org_code[0] != 0 || l->llc_snap.org_code[1] != 0 || l->llc_snap.org_code[2] != 0) { if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; } type = ntohs(l->llc_snap.ether_type); m_adj(m, LLC_SNAPFRAMELEN); switch (type) { #ifdef INET case ETHERTYPE_IP: if ((m = ip_fastforward(m)) == NULL) return; isr = NETISR_IP; break; case ETHERTYPE_ARP: if (ifp->if_flags & IFF_NOARP) goto dropanyway; isr = NETISR_ARP; break; #endif #ifdef INET6 case ETHERTYPE_IPV6: isr = NETISR_IPV6; break; #endif #ifdef DECNET case ETHERTYPE_DECNET: isr = NETISR_DECNET; break; #endif default: /* printf("fddi_input: unknown protocol 0x%x\n", type); */ if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; } break; } default: /* printf("fddi_input: unknown dsap 0x%x\n", l->llc_dsap); */ if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; } M_SETFIB(m, ifp->if_fib); netisr_dispatch(isr, m); return; dropanyway: if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); if (m) m_freem(m); return; } /* * Perform common duties while attaching to interface list */ void fddi_ifattach(ifp, lla, bpf) struct ifnet *ifp; const u_int8_t *lla; int bpf; { struct ifaddr *ifa; struct sockaddr_dl *sdl; ifp->if_type = IFT_FDDI; ifp->if_addrlen = FDDI_ADDR_LEN; ifp->if_hdrlen = 21; if_attach(ifp); /* Must be called before additional assignments */ ifp->if_mtu = FDDIMTU; ifp->if_output = fddi_output; ifp->if_input = fddi_input; ifp->if_resolvemulti = fddi_resolvemulti; ifp->if_broadcastaddr = fddibroadcastaddr; ifp->if_baudrate = 100000000; #ifdef IFF_NOTRAILERS ifp->if_flags |= IFF_NOTRAILERS; #endif ifa = ifp->if_addr; KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__)); sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_type = IFT_FDDI; sdl->sdl_alen = ifp->if_addrlen; bcopy(lla, LLADDR(sdl), ifp->if_addrlen); if (bpf) bpfattach(ifp, DLT_FDDI, FDDI_HDR_LEN); return; } void fddi_ifdetach(ifp, bpf) struct ifnet *ifp; int bpf; { if (bpf) bpfdetach(ifp); if_detach(ifp); return; } int fddi_ioctl (ifp, command, data) struct ifnet *ifp; u_long command; caddr_t data; { struct ifaddr *ifa; struct ifreq *ifr; int error; ifa = (struct ifaddr *) data; ifr = (struct ifreq *) data; error = 0; switch (command) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: /* before arpwhohas */ ifp->if_init(ifp->if_softc); arp_ifinit(ifp, ifa); break; #endif default: ifp->if_init(ifp->if_softc); break; } break; case SIOCGIFADDR: { struct sockaddr *sa; sa = (struct sockaddr *) & ifr->ifr_data; bcopy(IF_LLADDR(ifp), (caddr_t) sa->sa_data, FDDI_ADDR_LEN); } break; case SIOCSIFMTU: /* * Set the interface MTU. */ if (ifr->ifr_mtu > FDDIMTU) { error = EINVAL; } else { ifp->if_mtu = ifr->ifr_mtu; } break; default: error = EINVAL; break; } return (error); } static int fddi_resolvemulti(ifp, llsa, sa) struct ifnet *ifp; struct sockaddr **llsa; struct sockaddr *sa; { struct sockaddr_dl *sdl; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif u_char *e_addr; switch(sa->sa_family) { case AF_LINK: /* * No mapping needed. Just check that it's a valid MC address. */ sdl = (struct sockaddr_dl *)sa; e_addr = LLADDR(sdl); if ((e_addr[0] & 1) != 1) return (EADDRNOTAVAIL); *llsa = 0; return (0); #ifdef INET case AF_INET: sin = (struct sockaddr_in *)sa; if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) return (EADDRNOTAVAIL); sdl = link_init_sdl(ifp, *llsa, IFT_FDDI); sdl->sdl_nlen = 0; sdl->sdl_alen = FDDI_ADDR_LEN; sdl->sdl_slen = 0; e_addr = LLADDR(sdl); ETHER_MAP_IP_MULTICAST(&sin->sin_addr, e_addr); *llsa = (struct sockaddr *)sdl; return (0); #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* * An IP6 address of 0 means listen to all * of the Ethernet multicast address used for IP6. * (This is used for multicast routers.) */ ifp->if_flags |= IFF_ALLMULTI; *llsa = 0; return (0); } if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return (EADDRNOTAVAIL); sdl = link_init_sdl(ifp, *llsa, IFT_FDDI); sdl->sdl_nlen = 0; sdl->sdl_alen = FDDI_ADDR_LEN; sdl->sdl_slen = 0; e_addr = LLADDR(sdl); ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, e_addr); *llsa = (struct sockaddr *)sdl; return (0); #endif default: /* * Well, the text isn't quite right, but it's the name * that counts... */ return (EAFNOSUPPORT); } return (0); } static moduledata_t fddi_mod = { "fddi", /* module name */ NULL, /* event handler */ 0 /* extra data */ }; DECLARE_MODULE(fddi, fddi_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(fddi, 1); Index: user/dchagin/lemul/sys/net/if_gif.c =================================================================== --- user/dchagin/lemul/sys/net/if_gif.c (revision 274240) +++ user/dchagin/lemul/sys/net/if_gif.c (revision 274241) @@ -1,1090 +1,1091 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: if_gif.c,v 1.87 2001/10/19 08:50:27 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #include #endif /* INET */ #ifdef INET6 #ifndef INET #include #endif #include #include #include #include #include #include #include #endif /* INET6 */ #include #include #include #include #include static const char gifname[] = "gif"; /* * gif_mtx protects a per-vnet gif_softc_list. */ static VNET_DEFINE(struct mtx, gif_mtx); #define V_gif_mtx VNET(gif_mtx) static MALLOC_DEFINE(M_GIF, "gif", "Generic Tunnel Interface"); static VNET_DEFINE(LIST_HEAD(, gif_softc), gif_softc_list); #define V_gif_softc_list VNET(gif_softc_list) static struct sx gif_ioctl_sx; SX_SYSINIT(gif_ioctl_sx, &gif_ioctl_sx, "gif_ioctl"); #define GIF_LIST_LOCK_INIT(x) mtx_init(&V_gif_mtx, "gif_mtx", \ NULL, MTX_DEF) #define GIF_LIST_LOCK_DESTROY(x) mtx_destroy(&V_gif_mtx) #define GIF_LIST_LOCK(x) mtx_lock(&V_gif_mtx) #define GIF_LIST_UNLOCK(x) mtx_unlock(&V_gif_mtx) void (*ng_gif_input_p)(struct ifnet *ifp, struct mbuf **mp, int af); void (*ng_gif_input_orphan_p)(struct ifnet *ifp, struct mbuf *m, int af); void (*ng_gif_attach_p)(struct ifnet *ifp); void (*ng_gif_detach_p)(struct ifnet *ifp); static int gif_set_tunnel(struct ifnet *, struct sockaddr *, struct sockaddr *); static void gif_delete_tunnel(struct ifnet *); static int gif_ioctl(struct ifnet *, u_long, caddr_t); static int gif_transmit(struct ifnet *, struct mbuf *); static void gif_qflush(struct ifnet *); static int gif_clone_create(struct if_clone *, int, caddr_t); static void gif_clone_destroy(struct ifnet *); static VNET_DEFINE(struct if_clone *, gif_cloner); #define V_gif_cloner VNET(gif_cloner) static int gifmodevent(module_t, int, void *); SYSCTL_DECL(_net_link); static SYSCTL_NODE(_net_link, IFT_GIF, gif, CTLFLAG_RW, 0, "Generic Tunnel Interface"); #ifndef MAX_GIF_NEST /* * This macro controls the default upper limitation on nesting of gif tunnels. * Since, setting a large value to this macro with a careless configuration * may introduce system crash, we don't allow any nestings by default. * If you need to configure nested gif tunnels, you can define this macro * in your kernel configuration file. However, if you do so, please be * careful to configure the tunnels so that it won't make a loop. */ #define MAX_GIF_NEST 1 #endif static VNET_DEFINE(int, max_gif_nesting) = MAX_GIF_NEST; #define V_max_gif_nesting VNET(max_gif_nesting) -SYSCTL_VNET_INT(_net_link_gif, OID_AUTO, max_nesting, CTLFLAG_RW, +SYSCTL_INT(_net_link_gif, OID_AUTO, max_nesting, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(max_gif_nesting), 0, "Max nested tunnels"); /* * By default, we disallow creation of multiple tunnels between the same * pair of addresses. Some applications require this functionality so * we allow control over this check here. */ #ifdef XBONEHACK static VNET_DEFINE(int, parallel_tunnels) = 1; #else static VNET_DEFINE(int, parallel_tunnels) = 0; #endif #define V_parallel_tunnels VNET(parallel_tunnels) -SYSCTL_VNET_INT(_net_link_gif, OID_AUTO, parallel_tunnels, CTLFLAG_RW, - &VNET_NAME(parallel_tunnels), 0, "Allow parallel tunnels?"); +SYSCTL_INT(_net_link_gif, OID_AUTO, parallel_tunnels, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(parallel_tunnels), 0, + "Allow parallel tunnels?"); /* copy from src/sys/net/if_ethersubr.c */ static const u_char etherbroadcastaddr[ETHER_ADDR_LEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; #ifndef ETHER_IS_BROADCAST #define ETHER_IS_BROADCAST(addr) \ (bcmp(etherbroadcastaddr, (addr), ETHER_ADDR_LEN) == 0) #endif static int gif_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct gif_softc *sc; sc = malloc(sizeof(struct gif_softc), M_GIF, M_WAITOK | M_ZERO); sc->gif_fibnum = curthread->td_proc->p_fibnum; GIF2IFP(sc) = if_alloc(IFT_GIF); GIF_LOCK_INIT(sc); GIF2IFP(sc)->if_softc = sc; if_initname(GIF2IFP(sc), gifname, unit); GIF2IFP(sc)->if_addrlen = 0; GIF2IFP(sc)->if_mtu = GIF_MTU; GIF2IFP(sc)->if_flags = IFF_POINTOPOINT | IFF_MULTICAST; #if 0 /* turn off ingress filter */ GIF2IFP(sc)->if_flags |= IFF_LINK2; #endif GIF2IFP(sc)->if_ioctl = gif_ioctl; GIF2IFP(sc)->if_transmit = gif_transmit; GIF2IFP(sc)->if_qflush = gif_qflush; GIF2IFP(sc)->if_output = gif_output; if_attach(GIF2IFP(sc)); bpfattach(GIF2IFP(sc), DLT_NULL, sizeof(u_int32_t)); if (ng_gif_attach_p != NULL) (*ng_gif_attach_p)(GIF2IFP(sc)); GIF_LIST_LOCK(); LIST_INSERT_HEAD(&V_gif_softc_list, sc, gif_list); GIF_LIST_UNLOCK(); return (0); } static void gif_clone_destroy(struct ifnet *ifp) { struct gif_softc *sc; sx_xlock(&gif_ioctl_sx); sc = ifp->if_softc; gif_delete_tunnel(ifp); GIF_LIST_LOCK(); LIST_REMOVE(sc, gif_list); GIF_LIST_UNLOCK(); if (ng_gif_detach_p != NULL) (*ng_gif_detach_p)(ifp); bpfdetach(ifp); if_detach(ifp); ifp->if_softc = NULL; sx_xunlock(&gif_ioctl_sx); if_free(ifp); GIF_LOCK_DESTROY(sc); free(sc, M_GIF); } static void vnet_gif_init(const void *unused __unused) { LIST_INIT(&V_gif_softc_list); GIF_LIST_LOCK_INIT(); V_gif_cloner = if_clone_simple(gifname, gif_clone_create, gif_clone_destroy, 0); } VNET_SYSINIT(vnet_gif_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_gif_init, NULL); static void vnet_gif_uninit(const void *unused __unused) { if_clone_detach(V_gif_cloner); GIF_LIST_LOCK_DESTROY(); } VNET_SYSUNINIT(vnet_gif_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_gif_uninit, NULL); static int gifmodevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: case MOD_UNLOAD: break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t gif_mod = { "if_gif", gifmodevent, 0 }; DECLARE_MODULE(if_gif, gif_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_gif, 1); int gif_encapcheck(const struct mbuf *m, int off, int proto, void *arg) { GIF_RLOCK_TRACKER; struct gif_softc *sc; int ret; uint8_t ver; sc = (struct gif_softc *)arg; if (sc == NULL || (GIF2IFP(sc)->if_flags & IFF_UP) == 0) return (0); ret = 0; GIF_RLOCK(sc); /* no physical address */ if (sc->gif_family == 0) goto done; switch (proto) { #ifdef INET case IPPROTO_IPV4: #endif #ifdef INET6 case IPPROTO_IPV6: #endif case IPPROTO_ETHERIP: break; default: goto done; } /* Bail on short packets */ if (m->m_pkthdr.len < sizeof(struct ip)) goto done; m_copydata(m, 0, 1, &ver); switch (ver >> 4) { #ifdef INET case 4: if (sc->gif_family != AF_INET) goto done; ret = in_gif_encapcheck(m, off, proto, arg); break; #endif #ifdef INET6 case 6: if (m->m_pkthdr.len < sizeof(struct ip6_hdr)) goto done; if (sc->gif_family != AF_INET6) goto done; ret = in6_gif_encapcheck(m, off, proto, arg); break; #endif } done: GIF_RUNLOCK(sc); return (ret); } static int gif_transmit(struct ifnet *ifp, struct mbuf *m) { struct gif_softc *sc; struct etherip_header *eth; #ifdef INET struct ip *ip; #endif #ifdef INET6 struct ip6_hdr *ip6; uint32_t t; #endif uint32_t af; uint8_t proto, ecn; int error; error = ENETDOWN; sc = ifp->if_softc; if (sc->gif_family == 0) { m_freem(m); goto err; } /* Now pull back the af that we stashed in the csum_data. */ af = m->m_pkthdr.csum_data; BPF_MTAP2(ifp, &af, sizeof(af), m); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len); M_SETFIB(m, sc->gif_fibnum); /* inner AF-specific encapsulation */ ecn = 0; switch (af) { #ifdef INET case AF_INET: proto = IPPROTO_IPV4; if (m->m_len < sizeof(struct ip)) m = m_pullup(m, sizeof(struct ip)); if (m == NULL) { error = ENOBUFS; goto err; } ip = mtod(m, struct ip *); ip_ecn_ingress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED: ECN_NOCARE, &ecn, &ip->ip_tos); break; #endif #ifdef INET6 case AF_INET6: proto = IPPROTO_IPV6; if (m->m_len < sizeof(struct ip6_hdr)) m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) { error = ENOBUFS; goto err; } t = 0; ip6 = mtod(m, struct ip6_hdr *); ip6_ecn_ingress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED: ECN_NOCARE, &t, &ip6->ip6_flow); ecn = (ntohl(t) >> 20) & 0xff; break; #endif case AF_LINK: proto = IPPROTO_ETHERIP; M_PREPEND(m, sizeof(struct etherip_header), M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto err; } eth = mtod(m, struct etherip_header *); eth->eip_resvh = 0; if ((sc->gif_options & GIF_SEND_REVETHIP) != 0) { eth->eip_ver = 0; eth->eip_resvl = ETHERIP_VERSION; } else { eth->eip_ver = ETHERIP_VERSION; eth->eip_resvl = 0; } break; default: error = EAFNOSUPPORT; m_freem(m); goto err; } /* XXX should we check if our outer source is legal? */ /* dispatch to output logic based on outer AF */ switch (sc->gif_family) { #ifdef INET case AF_INET: error = in_gif_output(ifp, m, proto, ecn); break; #endif #ifdef INET6 case AF_INET6: error = in6_gif_output(ifp, m, proto, ecn); break; #endif default: m_freem(m); } err: if (error) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (error); } static void gif_qflush(struct ifnet *ifp __unused) { } int gif_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { struct m_tag *mtag; uint32_t af; int gif_called; int error = 0; #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) goto err; #endif if ((ifp->if_flags & IFF_MONITOR) != 0 || (ifp->if_flags & IFF_UP) == 0) { error = ENETDOWN; goto err; } /* * gif may cause infinite recursion calls when misconfigured. * We'll prevent this by detecting loops. * * High nesting level may cause stack exhaustion. * We'll prevent this by introducing upper limit. */ gif_called = 1; mtag = m_tag_locate(m, MTAG_GIF, MTAG_GIF_CALLED, NULL); while (mtag != NULL) { if (*(struct ifnet **)(mtag + 1) == ifp) { log(LOG_NOTICE, "gif_output: loop detected on %s\n", (*(struct ifnet **)(mtag + 1))->if_xname); error = EIO; /* is there better errno? */ goto err; } mtag = m_tag_locate(m, MTAG_GIF, MTAG_GIF_CALLED, mtag); gif_called++; } if (gif_called > V_max_gif_nesting) { log(LOG_NOTICE, "gif_output: recursively called too many times(%d)\n", gif_called); error = EIO; /* is there better errno? */ goto err; } mtag = m_tag_alloc(MTAG_GIF, MTAG_GIF_CALLED, sizeof(struct ifnet *), M_NOWAIT); if (mtag == NULL) { error = ENOMEM; goto err; } *(struct ifnet **)(mtag + 1) = ifp; m_tag_prepend(m, mtag); m->m_flags &= ~(M_BCAST|M_MCAST); if (dst->sa_family == AF_UNSPEC) bcopy(dst->sa_data, &af, sizeof(af)); else af = dst->sa_family; if (ifp->if_bridge) af = AF_LINK; /* * Now save the af in the inbound pkt csum data, this is a cheat since * we are using the inbound csum_data field to carry the af over to * the gif_transmit() routine, avoiding using yet another mtag. */ m->m_pkthdr.csum_data = af; return (ifp->if_transmit(ifp, m)); err: if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); m_freem(m); return (error); } void gif_input(struct mbuf *m, struct ifnet *ifp, int proto, uint8_t ecn) { struct etherip_header *eip; #ifdef INET struct ip *ip; #endif #ifdef INET6 struct ip6_hdr *ip6; uint32_t t; #endif struct gif_softc *sc; struct ether_header *eh; struct ifnet *oldifp; uint32_t gif_options; int isr, n, af; if (ifp == NULL) { /* just in case */ m_freem(m); return; } sc = ifp->if_softc; gif_options = sc->gif_options; m->m_pkthdr.rcvif = ifp; m_clrprotoflags(m); switch (proto) { #ifdef INET case IPPROTO_IPV4: af = AF_INET; if (m->m_len < sizeof(struct ip)) m = m_pullup(m, sizeof(struct ip)); if (m == NULL) goto drop; ip = mtod(m, struct ip *); if (ip_ecn_egress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED: ECN_NOCARE, &ecn, &ip->ip_tos) == 0) { m_freem(m); goto drop; } break; #endif #ifdef INET6 case IPPROTO_IPV6: af = AF_INET6; if (m->m_len < sizeof(struct ip6_hdr)) m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) goto drop; t = htonl((uint32_t)ecn << 20); ip6 = mtod(m, struct ip6_hdr *); if (ip6_ecn_egress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED: ECN_NOCARE, &t, &ip6->ip6_flow) == 0) { m_freem(m); goto drop; } break; #endif case IPPROTO_ETHERIP: af = AF_LINK; break; default: m_freem(m); goto drop; } #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif if (bpf_peers_present(ifp->if_bpf)) { uint32_t af1 = af; bpf_mtap2(ifp->if_bpf, &af1, sizeof(af1), m); } if ((ifp->if_flags & IFF_MONITOR) != 0) { if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); m_freem(m); return; } if (ng_gif_input_p != NULL) { (*ng_gif_input_p)(ifp, &m, af); if (m == NULL) goto drop; } /* * Put the packet to the network layer input queue according to the * specified address family. * Note: older versions of gif_input directly called network layer * input functions, e.g. ip6_input, here. We changed the policy to * prevent too many recursive calls of such input functions, which * might cause kernel panic. But the change may introduce another * problem; if the input queue is full, packets are discarded. * The kernel stack overflow really happened, and we believed * queue-full rarely occurs, so we changed the policy. */ switch (af) { #ifdef INET case AF_INET: isr = NETISR_IP; break; #endif #ifdef INET6 case AF_INET6: isr = NETISR_IPV6; break; #endif case AF_LINK: n = sizeof(struct etherip_header) + sizeof(struct ether_header); if (n > m->m_len) m = m_pullup(m, n); if (m == NULL) goto drop; eip = mtod(m, struct etherip_header *); /* * GIF_ACCEPT_REVETHIP (enabled by default) intentionally * accepts an EtherIP packet with revered version field in * the header. This is a knob for backward compatibility * with FreeBSD 7.2R or prior. */ if (eip->eip_ver != ETHERIP_VERSION) { if ((gif_options & GIF_ACCEPT_REVETHIP) == 0 || eip->eip_resvl != ETHERIP_VERSION) { /* discard unknown versions */ m_freem(m); goto drop; } } m_adj(m, sizeof(struct etherip_header)); m->m_flags &= ~(M_BCAST|M_MCAST); m->m_pkthdr.rcvif = ifp; if (ifp->if_bridge) { oldifp = ifp; eh = mtod(m, struct ether_header *); if (ETHER_IS_MULTICAST(eh->ether_dhost)) { if (ETHER_IS_BROADCAST(eh->ether_dhost)) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); } BRIDGE_INPUT(ifp, m); if (m != NULL && ifp != oldifp) { /* * The bridge gave us back itself or one of the * members for which the frame is addressed. */ ether_demux(ifp, m); return; } } if (m != NULL) m_freem(m); return; default: if (ng_gif_input_orphan_p != NULL) (*ng_gif_input_orphan_p)(ifp, m, af); else m_freem(m); return; } if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); M_SETFIB(m, ifp->if_fib); netisr_dispatch(isr, m); return; drop: if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); } /* XXX how should we handle IPv6 scope on SIOC[GS]IFPHYADDR? */ int gif_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { GIF_RLOCK_TRACKER; struct ifreq *ifr = (struct ifreq*)data; struct sockaddr *dst, *src; struct gif_softc *sc; #ifdef INET struct sockaddr_in *sin = NULL; #endif #ifdef INET6 struct sockaddr_in6 *sin6 = NULL; #endif u_int options; int error; switch (cmd) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; case SIOCADDMULTI: case SIOCDELMULTI: case SIOCGIFMTU: case SIOCSIFFLAGS: return (0); case SIOCSIFMTU: if (ifr->ifr_mtu < GIF_MTU_MIN || ifr->ifr_mtu > GIF_MTU_MAX) return (EINVAL); else ifp->if_mtu = ifr->ifr_mtu; return (0); } sx_xlock(&gif_ioctl_sx); sc = ifp->if_softc; if (sc == NULL) { error = ENXIO; goto bad; } error = 0; switch (cmd) { case SIOCSIFPHYADDR: #ifdef INET6 case SIOCSIFPHYADDR_IN6: #endif error = EINVAL; switch (cmd) { #ifdef INET case SIOCSIFPHYADDR: src = (struct sockaddr *) &(((struct in_aliasreq *)data)->ifra_addr); dst = (struct sockaddr *) &(((struct in_aliasreq *)data)->ifra_dstaddr); break; #endif #ifdef INET6 case SIOCSIFPHYADDR_IN6: src = (struct sockaddr *) &(((struct in6_aliasreq *)data)->ifra_addr); dst = (struct sockaddr *) &(((struct in6_aliasreq *)data)->ifra_dstaddr); break; #endif default: goto bad; } /* sa_family must be equal */ if (src->sa_family != dst->sa_family || src->sa_len != dst->sa_len) goto bad; /* validate sa_len */ switch (src->sa_family) { #ifdef INET case AF_INET: if (src->sa_len != sizeof(struct sockaddr_in)) goto bad; break; #endif #ifdef INET6 case AF_INET6: if (src->sa_len != sizeof(struct sockaddr_in6)) goto bad; break; #endif default: error = EAFNOSUPPORT; goto bad; } /* check sa_family looks sane for the cmd */ error = EAFNOSUPPORT; switch (cmd) { #ifdef INET case SIOCSIFPHYADDR: if (src->sa_family == AF_INET) break; goto bad; #endif #ifdef INET6 case SIOCSIFPHYADDR_IN6: if (src->sa_family == AF_INET6) break; goto bad; #endif } error = EADDRNOTAVAIL; switch (src->sa_family) { #ifdef INET case AF_INET: if (satosin(src)->sin_addr.s_addr == INADDR_ANY || satosin(dst)->sin_addr.s_addr == INADDR_ANY) goto bad; break; #endif #ifdef INET6 case AF_INET6: if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(src)->sin6_addr) || IN6_IS_ADDR_UNSPECIFIED(&satosin6(dst)->sin6_addr)) goto bad; /* * Check validity of the scope zone ID of the * addresses, and convert it into the kernel * internal form if necessary. */ error = sa6_embedscope(satosin6(src), 0); if (error != 0) goto bad; error = sa6_embedscope(satosin6(dst), 0); if (error != 0) goto bad; #endif }; error = gif_set_tunnel(ifp, src, dst); break; case SIOCDIFPHYADDR: gif_delete_tunnel(ifp); break; case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: #ifdef INET6 case SIOCGIFPSRCADDR_IN6: case SIOCGIFPDSTADDR_IN6: #endif if (sc->gif_family == 0) { error = EADDRNOTAVAIL; break; } GIF_RLOCK(sc); switch (cmd) { #ifdef INET case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: if (sc->gif_family != AF_INET) { error = EADDRNOTAVAIL; break; } sin = (struct sockaddr_in *)&ifr->ifr_addr; memset(sin, 0, sizeof(*sin)); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); break; #endif #ifdef INET6 case SIOCGIFPSRCADDR_IN6: case SIOCGIFPDSTADDR_IN6: if (sc->gif_family != AF_INET6) { error = EADDRNOTAVAIL; break; } sin6 = (struct sockaddr_in6 *) &(((struct in6_ifreq *)data)->ifr_addr); memset(sin6, 0, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); break; #endif default: error = EAFNOSUPPORT; } if (error == 0) { switch (cmd) { #ifdef INET case SIOCGIFPSRCADDR: sin->sin_addr = sc->gif_iphdr->ip_src; break; case SIOCGIFPDSTADDR: sin->sin_addr = sc->gif_iphdr->ip_dst; break; #endif #ifdef INET6 case SIOCGIFPSRCADDR_IN6: sin6->sin6_addr = sc->gif_ip6hdr->ip6_src; break; case SIOCGIFPDSTADDR_IN6: sin6->sin6_addr = sc->gif_ip6hdr->ip6_dst; break; #endif } } GIF_RUNLOCK(sc); if (error != 0) break; switch (cmd) { #ifdef INET case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: error = prison_if(curthread->td_ucred, (struct sockaddr *)sin); if (error != 0) memset(sin, 0, sizeof(*sin)); break; #endif #ifdef INET6 case SIOCGIFPSRCADDR_IN6: case SIOCGIFPDSTADDR_IN6: error = prison_if(curthread->td_ucred, (struct sockaddr *)sin6); if (error == 0) error = sa6_recoverscope(sin6); if (error != 0) memset(sin6, 0, sizeof(*sin6)); #endif } break; case GIFGOPTS: options = sc->gif_options; error = copyout(&options, ifr->ifr_data, sizeof(options)); break; case GIFSOPTS: if ((error = priv_check(curthread, PRIV_NET_GIF)) != 0) break; error = copyin(ifr->ifr_data, &options, sizeof(options)); if (error) break; if (options & ~GIF_OPTMASK) error = EINVAL; else sc->gif_options = options; break; default: error = EINVAL; break; } bad: sx_xunlock(&gif_ioctl_sx); return (error); } static void gif_detach(struct gif_softc *sc) { sx_assert(&gif_ioctl_sx, SA_XLOCKED); if (sc->gif_ecookie != NULL) encap_detach(sc->gif_ecookie); sc->gif_ecookie = NULL; } static int gif_attach(struct gif_softc *sc, int af) { sx_assert(&gif_ioctl_sx, SA_XLOCKED); switch (af) { #ifdef INET case AF_INET: return (in_gif_attach(sc)); #endif #ifdef INET6 case AF_INET6: return (in6_gif_attach(sc)); #endif } return (EAFNOSUPPORT); } static int gif_set_tunnel(struct ifnet *ifp, struct sockaddr *src, struct sockaddr *dst) { struct gif_softc *sc = ifp->if_softc; struct gif_softc *tsc; #ifdef INET struct ip *ip; #endif #ifdef INET6 struct ip6_hdr *ip6; #endif void *hdr; int error = 0; if (sc == NULL) return (ENXIO); /* Disallow parallel tunnels unless instructed otherwise. */ if (V_parallel_tunnels == 0) { GIF_LIST_LOCK(); LIST_FOREACH(tsc, &V_gif_softc_list, gif_list) { if (tsc == sc || tsc->gif_family != src->sa_family) continue; #ifdef INET if (tsc->gif_family == AF_INET && tsc->gif_iphdr->ip_src.s_addr == satosin(src)->sin_addr.s_addr && tsc->gif_iphdr->ip_dst.s_addr == satosin(dst)->sin_addr.s_addr) { error = EADDRNOTAVAIL; GIF_LIST_UNLOCK(); goto bad; } #endif #ifdef INET6 if (tsc->gif_family == AF_INET6 && IN6_ARE_ADDR_EQUAL(&tsc->gif_ip6hdr->ip6_src, &satosin6(src)->sin6_addr) && IN6_ARE_ADDR_EQUAL(&tsc->gif_ip6hdr->ip6_dst, &satosin6(dst)->sin6_addr)) { error = EADDRNOTAVAIL; GIF_LIST_UNLOCK(); goto bad; } #endif } GIF_LIST_UNLOCK(); } switch (src->sa_family) { #ifdef INET case AF_INET: hdr = ip = malloc(sizeof(struct ip), M_GIF, M_WAITOK | M_ZERO); ip->ip_src.s_addr = satosin(src)->sin_addr.s_addr; ip->ip_dst.s_addr = satosin(dst)->sin_addr.s_addr; break; #endif #ifdef INET6 case AF_INET6: hdr = ip6 = malloc(sizeof(struct ip6_hdr), M_GIF, M_WAITOK | M_ZERO); ip6->ip6_src = satosin6(src)->sin6_addr; ip6->ip6_dst = satosin6(dst)->sin6_addr; ip6->ip6_vfc = IPV6_VERSION; break; #endif default: return (EAFNOSUPPORT); }; if (sc->gif_family != src->sa_family) gif_detach(sc); if (sc->gif_family == 0 || sc->gif_family != src->sa_family) error = gif_attach(sc, src->sa_family); GIF_WLOCK(sc); if (sc->gif_family != 0) free(sc->gif_hdr, M_GIF); sc->gif_family = src->sa_family; sc->gif_hdr = hdr; GIF_WUNLOCK(sc); #if defined(INET) || defined(INET6) bad: #endif if (error == 0 && sc->gif_family != 0) ifp->if_drv_flags |= IFF_DRV_RUNNING; else ifp->if_drv_flags &= ~IFF_DRV_RUNNING; return (error); } static void gif_delete_tunnel(struct ifnet *ifp) { struct gif_softc *sc = ifp->if_softc; int family; if (sc == NULL) return; GIF_WLOCK(sc); family = sc->gif_family; sc->gif_family = 0; GIF_WUNLOCK(sc); if (family != 0) { gif_detach(sc); free(sc->gif_hdr, M_GIF); } ifp->if_drv_flags &= ~IFF_DRV_RUNNING; } Index: user/dchagin/lemul/sys/net/if_iso88025subr.c =================================================================== --- user/dchagin/lemul/sys/net/if_iso88025subr.c (revision 274240) +++ user/dchagin/lemul/sys/net/if_iso88025subr.c (revision 274241) @@ -1,744 +1,702 @@ /*- * Copyright (c) 1998, Larry Lile * All rights reserved. * * For latest sources and information on this driver, please * go to http://anarchy.stdio.com. * * Questions, comments or suggestions should be directed to * Larry Lile . * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ * */ /* * * General ISO 802.5 (Token Ring) support routines * */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #endif #ifdef INET6 #include #endif #include static const u_char iso88025_broadcastaddr[ISO88025_ADDR_LEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; static int iso88025_resolvemulti (struct ifnet *, struct sockaddr **, struct sockaddr *); #define senderr(e) do { error = (e); goto bad; } while (0) /* * Perform common duties while attaching to interface list */ void iso88025_ifattach(struct ifnet *ifp, const u_int8_t *lla, int bpf) { struct ifaddr *ifa; struct sockaddr_dl *sdl; ifa = NULL; ifp->if_type = IFT_ISO88025; ifp->if_addrlen = ISO88025_ADDR_LEN; ifp->if_hdrlen = ISO88025_HDR_LEN; if_attach(ifp); /* Must be called before additional assignments */ ifp->if_output = iso88025_output; ifp->if_input = iso88025_input; ifp->if_resolvemulti = iso88025_resolvemulti; ifp->if_broadcastaddr = iso88025_broadcastaddr; if (ifp->if_baudrate == 0) ifp->if_baudrate = TR_16MBPS; /* 16Mbit should be a safe default */ if (ifp->if_mtu == 0) ifp->if_mtu = ISO88025_DEFAULT_MTU; ifa = ifp->if_addr; KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__)); sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_type = IFT_ISO88025; sdl->sdl_alen = ifp->if_addrlen; bcopy(lla, LLADDR(sdl), ifp->if_addrlen); if (bpf) bpfattach(ifp, DLT_IEEE802, ISO88025_HDR_LEN); return; } /* * Perform common duties while detaching a Token Ring interface */ void iso88025_ifdetach(ifp, bpf) struct ifnet *ifp; int bpf; { if (bpf) bpfdetach(ifp); if_detach(ifp); return; } int iso88025_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct ifaddr *ifa; struct ifreq *ifr; int error; ifa = (struct ifaddr *) data; ifr = (struct ifreq *) data; error = 0; switch (command) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: ifp->if_init(ifp->if_softc); /* before arpwhohas */ arp_ifinit(ifp, ifa); break; #endif /* INET */ default: ifp->if_init(ifp->if_softc); break; } break; case SIOCGIFADDR: { struct sockaddr *sa; sa = (struct sockaddr *) & ifr->ifr_data; bcopy(IF_LLADDR(ifp), (caddr_t) sa->sa_data, ISO88025_ADDR_LEN); } break; case SIOCSIFMTU: /* * Set the interface MTU. */ if (ifr->ifr_mtu > ISO88025_MAX_MTU) { error = EINVAL; } else { ifp->if_mtu = ifr->ifr_mtu; } break; default: error = EINVAL; /* XXX netbsd has ENOTTY??? */ break; } return (error); } /* * ISO88025 encapsulation */ int iso88025_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { u_int16_t snap_type = 0; int loop_copy = 0, error = 0, rif_len = 0; u_char edst[ISO88025_ADDR_LEN]; struct iso88025_header *th; struct iso88025_header gen_th; struct sockaddr_dl *sdl = NULL; struct rtentry *rt0 = NULL; #if defined(INET) || defined(INET6) struct llentry *lle; #endif if (ro != NULL) rt0 = ro->ro_rt; #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) senderr(error); #endif if (ifp->if_flags & IFF_MONITOR) senderr(ENETDOWN); if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) senderr(ENETDOWN); getmicrotime(&ifp->if_lastchange); /* Calculate routing info length based on arp table entry */ /* XXX any better way to do this ? */ if (rt0 && (sdl = (struct sockaddr_dl *)rt0->rt_gateway)) if (SDL_ISO88025(sdl)->trld_rcf != 0) rif_len = TR_RCF_RIFLEN(SDL_ISO88025(sdl)->trld_rcf); /* Generate a generic 802.5 header for the packet */ gen_th.ac = TR_AC; gen_th.fc = TR_LLC_FRAME; (void)memcpy((caddr_t)gen_th.iso88025_shost, IF_LLADDR(ifp), ISO88025_ADDR_LEN); if (rif_len) { gen_th.iso88025_shost[0] |= TR_RII; if (rif_len > 2) { gen_th.rcf = SDL_ISO88025(sdl)->trld_rcf; (void)memcpy((caddr_t)gen_th.rd, (caddr_t)SDL_ISO88025(sdl)->trld_route, rif_len - 2); } } switch (dst->sa_family) { #ifdef INET case AF_INET: error = arpresolve(ifp, rt0, m, dst, edst, &lle); if (error) return (error == EWOULDBLOCK ? 0 : error); snap_type = ETHERTYPE_IP; break; case AF_ARP: { struct arphdr *ah; ah = mtod(m, struct arphdr *); ah->ar_hrd = htons(ARPHRD_IEEE802); loop_copy = -1; /* if this is for us, don't do it */ switch(ntohs(ah->ar_op)) { case ARPOP_REVREQUEST: case ARPOP_REVREPLY: snap_type = ETHERTYPE_REVARP; break; case ARPOP_REQUEST: case ARPOP_REPLY: default: snap_type = ETHERTYPE_ARP; break; } if (m->m_flags & M_BCAST) bcopy(ifp->if_broadcastaddr, edst, ISO88025_ADDR_LEN); else bcopy(ar_tha(ah), edst, ISO88025_ADDR_LEN); } break; #endif /* INET */ #ifdef INET6 case AF_INET6: error = nd6_storelladdr(ifp, m, dst, (u_char *)edst, &lle); if (error) return (error); snap_type = ETHERTYPE_IPV6; break; #endif /* INET6 */ case AF_UNSPEC: { const struct iso88025_sockaddr_data *sd; /* * For AF_UNSPEC sockaddr.sa_data must contain all of the * mac information needed to send the packet. This allows * full mac, llc, and source routing function to be controlled. * llc and source routing information must already be in the * mbuf provided, ac/fc are set in sa_data. sockaddr.sa_data * should be an iso88025_sockaddr_data structure see iso88025.h */ loop_copy = -1; sd = (const struct iso88025_sockaddr_data *)dst->sa_data; gen_th.ac = sd->ac; gen_th.fc = sd->fc; (void)memcpy(edst, sd->ether_dhost, ISO88025_ADDR_LEN); (void)memcpy(gen_th.iso88025_shost, sd->ether_shost, ISO88025_ADDR_LEN); rif_len = 0; break; } default: if_printf(ifp, "can't handle af%d\n", dst->sa_family); senderr(EAFNOSUPPORT); break; } /* * Add LLC header. */ if (snap_type != 0) { struct llc *l; M_PREPEND(m, LLC_SNAPFRAMELEN, M_NOWAIT); if (m == 0) senderr(ENOBUFS); l = mtod(m, struct llc *); l->llc_control = LLC_UI; l->llc_dsap = l->llc_ssap = LLC_SNAP_LSAP; l->llc_snap.org_code[0] = l->llc_snap.org_code[1] = l->llc_snap.org_code[2] = 0; l->llc_snap.ether_type = htons(snap_type); } /* * Add local net header. If no space in first mbuf, * allocate another. */ M_PREPEND(m, ISO88025_HDR_LEN + rif_len, M_NOWAIT); if (m == 0) senderr(ENOBUFS); th = mtod(m, struct iso88025_header *); bcopy((caddr_t)edst, (caddr_t)&gen_th.iso88025_dhost, ISO88025_ADDR_LEN); /* Copy as much of the generic header as is needed into the mbuf */ memcpy(th, &gen_th, ISO88025_HDR_LEN + rif_len); /* * If a simplex interface, and the packet is being sent to our * Ethernet address or a broadcast address, loopback a copy. * XXX To make a simplex device behave exactly like a duplex * device, we should copy in the case of sending to our own * ethernet address (thus letting the original actually appear * on the wire). However, we don't do that here for security * reasons and compatibility with the original behavior. */ if ((ifp->if_flags & IFF_SIMPLEX) && (loop_copy != -1)) { if ((m->m_flags & M_BCAST) || (loop_copy > 0)) { struct mbuf *n; n = m_copy(m, 0, (int)M_COPYALL); (void) if_simloop(ifp, n, dst->sa_family, ISO88025_HDR_LEN); } else if (bcmp(th->iso88025_dhost, th->iso88025_shost, ETHER_ADDR_LEN) == 0) { (void) if_simloop(ifp, m, dst->sa_family, ISO88025_HDR_LEN); return(0); /* XXX */ } } IFQ_HANDOFF_ADJ(ifp, m, ISO88025_HDR_LEN + LLC_SNAPFRAMELEN, error); if (error) { printf("iso88025_output: packet dropped QFULL.\n"); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } return (error); bad: if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); if (m) m_freem(m); return (error); } /* * ISO 88025 de-encapsulation */ void iso88025_input(ifp, m) struct ifnet *ifp; struct mbuf *m; { struct iso88025_header *th; struct llc *l; int isr; int mac_hdr_len; /* * Do consistency checks to verify assumptions * made by code past this point. */ if ((m->m_flags & M_PKTHDR) == 0) { if_printf(ifp, "discard frame w/o packet header\n"); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return; } if (m->m_pkthdr.rcvif == NULL) { if_printf(ifp, "discard frame w/o interface pointer\n"); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return; } m = m_pullup(m, ISO88025_HDR_LEN); if (m == NULL) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); goto dropanyway; } th = mtod(m, struct iso88025_header *); /* * Discard packet if interface is not up. */ if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) goto dropanyway; /* * Give bpf a chance at the packet. */ BPF_MTAP(ifp, m); /* * Interface marked for monitoring; discard packet. */ if (ifp->if_flags & IFF_MONITOR) { m_freem(m); return; } #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif /* * Update interface statistics. */ if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); getmicrotime(&ifp->if_lastchange); /* * Discard non local unicast packets when interface * is in promiscuous mode. */ if ((ifp->if_flags & IFF_PROMISC) && ((th->iso88025_dhost[0] & 1) == 0) && (bcmp(IF_LLADDR(ifp), (caddr_t) th->iso88025_dhost, ISO88025_ADDR_LEN) != 0)) goto dropanyway; /* * Set mbuf flags for bcast/mcast. */ if (th->iso88025_dhost[0] & 1) { if (bcmp(iso88025_broadcastaddr, th->iso88025_dhost, ISO88025_ADDR_LEN) == 0) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); } mac_hdr_len = ISO88025_HDR_LEN; /* Check for source routing info */ if (th->iso88025_shost[0] & TR_RII) mac_hdr_len += TR_RCF_RIFLEN(th->rcf); /* Strip off ISO88025 header. */ m_adj(m, mac_hdr_len); m = m_pullup(m, LLC_SNAPFRAMELEN); if (m == 0) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); goto dropanyway; } l = mtod(m, struct llc *); switch (l->llc_dsap) { case LLC_SNAP_LSAP: { u_int16_t type; if ((l->llc_control != LLC_UI) || (l->llc_ssap != LLC_SNAP_LSAP)) { if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; } if (l->llc_snap.org_code[0] != 0 || l->llc_snap.org_code[1] != 0 || l->llc_snap.org_code[2] != 0) { if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; } type = ntohs(l->llc_snap.ether_type); m_adj(m, LLC_SNAPFRAMELEN); switch (type) { #ifdef INET case ETHERTYPE_IP: th->iso88025_shost[0] &= ~(TR_RII); if ((m = ip_fastforward(m)) == NULL) return; isr = NETISR_IP; break; case ETHERTYPE_ARP: if (ifp->if_flags & IFF_NOARP) goto dropanyway; isr = NETISR_ARP; break; #endif /* INET */ #ifdef INET6 case ETHERTYPE_IPV6: th->iso88025_shost[0] &= ~(TR_RII); isr = NETISR_IPV6; break; #endif /* INET6 */ default: printf("iso88025_input: unexpected llc_snap ether_type 0x%02x\n", type); if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; } break; } #ifdef ISO case LLC_ISO_LSAP: switch (l->llc_control) { case LLC_UI: if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; break; case LLC_XID: case LLC_XID_P: if(m->m_len < ISO88025_ADDR_LEN) goto dropanyway; l->llc_window = 0; l->llc_fid = 9; l->llc_class = 1; l->llc_dsap = l->llc_ssap = 0; /* Fall through to */ case LLC_TEST: case LLC_TEST_P: { struct sockaddr sa; - struct arpcom *ac; struct iso88025_sockaddr_data *th2; int i; u_char c; c = l->llc_dsap; if (th->iso88025_shost[0] & TR_RII) { /* XXX */ printf("iso88025_input: dropping source routed LLC_TEST\n"); goto dropanyway; } l->llc_dsap = l->llc_ssap; l->llc_ssap = c; if (m->m_flags & (M_BCAST | M_MCAST)) bcopy((caddr_t)IF_LLADDR(ifp), (caddr_t)th->iso88025_dhost, ISO88025_ADDR_LEN); sa.sa_family = AF_UNSPEC; sa.sa_len = sizeof(sa); th2 = (struct iso88025_sockaddr_data *)sa.sa_data; for (i = 0; i < ISO88025_ADDR_LEN; i++) { th2->ether_shost[i] = c = th->iso88025_dhost[i]; th2->ether_dhost[i] = th->iso88025_dhost[i] = th->iso88025_shost[i]; th->iso88025_shost[i] = c; } th2->ac = TR_AC; th2->fc = TR_LLC_FRAME; ifp->if_output(ifp, m, &sa, NULL); return; } default: printf("iso88025_input: unexpected llc control 0x%02x\n", l->llc_control); if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; break; } break; #endif /* ISO */ default: printf("iso88025_input: unknown dsap 0x%x\n", l->llc_dsap); if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); goto dropanyway; break; } M_SETFIB(m, ifp->if_fib); netisr_dispatch(isr, m); return; dropanyway: if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); if (m) m_freem(m); return; } static int iso88025_resolvemulti (ifp, llsa, sa) struct ifnet *ifp; struct sockaddr **llsa; struct sockaddr *sa; { struct sockaddr_dl *sdl; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif u_char *e_addr; switch(sa->sa_family) { case AF_LINK: /* * No mapping needed. Just check that it's a valid MC address. */ sdl = (struct sockaddr_dl *)sa; e_addr = LLADDR(sdl); if ((e_addr[0] & 1) != 1) { return (EADDRNOTAVAIL); } *llsa = 0; return (0); #ifdef INET case AF_INET: sin = (struct sockaddr_in *)sa; if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { return (EADDRNOTAVAIL); } sdl = link_init_sdl(ifp, *llsa, IFT_ISO88025); sdl->sdl_alen = ISO88025_ADDR_LEN; e_addr = LLADDR(sdl); ETHER_MAP_IP_MULTICAST(&sin->sin_addr, e_addr); *llsa = (struct sockaddr *)sdl; return (0); #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* * An IP6 address of 0 means listen to all * of the Ethernet multicast address used for IP6. * (This is used for multicast routers.) */ ifp->if_flags |= IFF_ALLMULTI; *llsa = 0; return (0); } if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { return (EADDRNOTAVAIL); } sdl = link_init_sdl(ifp, *llsa, IFT_ISO88025); sdl->sdl_alen = ISO88025_ADDR_LEN; e_addr = LLADDR(sdl); ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, e_addr); *llsa = (struct sockaddr *)sdl; return (0); #endif default: /* * Well, the text isn't quite right, but it's the name * that counts... */ return (EAFNOSUPPORT); } return (0); } -static MALLOC_DEFINE(M_ISO88025, "arpcom", "802.5 interface internals"); - -static void* -iso88025_alloc(u_char type, struct ifnet *ifp) -{ - struct arpcom *ac; - - ac = malloc(sizeof(struct arpcom), M_ISO88025, M_WAITOK | M_ZERO); - ac->ac_ifp = ifp; - - return (ac); -} - -static void -iso88025_free(void *com, u_char type) -{ - - free(com, M_ISO88025); -} - -static int -iso88025_modevent(module_t mod, int type, void *data) -{ - - switch (type) { - case MOD_LOAD: - if_register_com_alloc(IFT_ISO88025, iso88025_alloc, - iso88025_free); - break; - case MOD_UNLOAD: - if_deregister_com_alloc(IFT_ISO88025); - break; - default: - return EOPNOTSUPP; - } - - return (0); -} - static moduledata_t iso88025_mod = { - "iso88025", - iso88025_modevent, - 0 + .name = "iso88025", }; DECLARE_MODULE(iso88025, iso88025_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(iso88025, 1); Index: user/dchagin/lemul/sys/net/if_mib.c =================================================================== --- user/dchagin/lemul/sys/net/if_mib.c (revision 274240) +++ user/dchagin/lemul/sys/net/if_mib.c (revision 274241) @@ -1,151 +1,151 @@ /*- * Copyright 1996 Massachusetts Institute of Technology * * Permission to use, copy, modify, and distribute this software and * its documentation for any purpose and without fee is hereby * granted, provided that both the above copyright notice and this * permission notice appear in all copies, that both the above * copyright notice and this permission notice appear in all * supporting documentation, and that the name of M.I.T. not be used * in advertising or publicity pertaining to distribution of the * software without specific, written prior permission. M.I.T. makes * no representations about the suitability of this software for any * purpose. It is provided "as is" without express or implied * warranty. * * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include /* * A sysctl(3) MIB for generic interface information. This information * is exported in the net.link.generic branch, which has the following * structure: * * net.link.generic .system - system-wide control variables * and statistics (node) * .ifdata..general * - what's in `struct ifdata' * plus some other info * .ifdata..linkspecific * - a link-type-specific data * structure (as might be used * by an SNMP agent * * Perhaps someday we will make addresses accessible via this interface * as well (then there will be four such...). The reason that the * index comes before the last element in the name is because it * seems more orthogonal that way, particularly with the possibility * of other per-interface data living down here as well (e.g., integrated * services stuff). */ SYSCTL_DECL(_net_link_generic); static SYSCTL_NODE(_net_link_generic, IFMIB_SYSTEM, system, CTLFLAG_RW, 0, "Variables global to all interfaces"); -SYSCTL_VNET_INT(_net_link_generic_system, IFMIB_IFCOUNT, ifcount, CTLFLAG_RD, - &VNET_NAME(if_index), 0, - "Number of configured interfaces"); +SYSCTL_INT(_net_link_generic_system, IFMIB_IFCOUNT, ifcount, + CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(if_index), 0, + "Number of configured interfaces"); static int sysctl_ifdata(SYSCTL_HANDLER_ARGS) /* XXX bad syntax! */ { int *name = (int *)arg1; int error; u_int namelen = arg2; struct ifnet *ifp; struct ifmibdata ifmd; size_t dlen; char *dbuf; if (namelen != 2) return EINVAL; if (name[0] <= 0) return (ENOENT); ifp = ifnet_byindex_ref(name[0]); if (ifp == NULL) return (ENOENT); switch(name[1]) { default: error = ENOENT; goto out; case IFDATA_GENERAL: bzero(&ifmd, sizeof(ifmd)); strlcpy(ifmd.ifmd_name, ifp->if_xname, sizeof(ifmd.ifmd_name)); ifmd.ifmd_pcount = ifp->if_pcount; if_data_copy(ifp, &ifmd.ifmd_data); ifmd.ifmd_flags = ifp->if_flags | ifp->if_drv_flags; ifmd.ifmd_snd_len = ifp->if_snd.ifq_len; ifmd.ifmd_snd_maxlen = ifp->if_snd.ifq_maxlen; ifmd.ifmd_snd_drops = ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS); error = SYSCTL_OUT(req, &ifmd, sizeof ifmd); if (error) goto out; break; case IFDATA_LINKSPECIFIC: error = SYSCTL_OUT(req, ifp->if_linkmib, ifp->if_linkmiblen); if (error || !req->newptr) goto out; error = SYSCTL_IN(req, ifp->if_linkmib, ifp->if_linkmiblen); if (error) goto out; break; case IFDATA_DRIVERNAME: /* 20 is enough for 64bit ints */ dlen = strlen(ifp->if_dname) + 20 + 1; if ((dbuf = malloc(dlen, M_TEMP, M_NOWAIT)) == NULL) { error = ENOMEM; goto out; } if (ifp->if_dunit == IF_DUNIT_NONE) strcpy(dbuf, ifp->if_dname); else sprintf(dbuf, "%s%d", ifp->if_dname, ifp->if_dunit); error = SYSCTL_OUT(req, dbuf, strlen(dbuf) + 1); if (error == 0 && req->newptr != NULL) error = EPERM; free(dbuf, M_TEMP); goto out; } out: if_rele(ifp); return error; } static SYSCTL_NODE(_net_link_generic, IFMIB_IFDATA, ifdata, CTLFLAG_RW, sysctl_ifdata, "Interface table"); Index: user/dchagin/lemul/sys/net/vnet.h =================================================================== --- user/dchagin/lemul/sys/net/vnet.h (revision 274240) +++ user/dchagin/lemul/sys/net/vnet.h (revision 274241) @@ -1,491 +1,428 @@ /*- * Copyright (c) 2006-2009 University of Zagreb * Copyright (c) 2006-2009 FreeBSD Foundation * All rights reserved. * * This software was developed by the University of Zagreb and the * FreeBSD Foundation under sponsorship by the Stichting NLnet and the * FreeBSD Foundation. * * Copyright (c) 2009 Jeffrey Roberson * Copyright (c) 2009 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /*- * This header file defines several sets of interfaces supporting virtualized * network stacks: * * - Definition of 'struct vnet' and functions and macros to allocate/free/ * manipulate it. * * - A virtual network stack memory allocator, which provides support for * virtualized global variables via a special linker set, set_vnet. * * - Virtualized sysinits/sysuninits, which allow constructors and * destructors to be run for each network stack subsystem as virtual * instances are created and destroyed. * * If VIMAGE isn't compiled into the kernel, virtualized global variables * compile to normal global variables, and virtualized sysinits to regular * sysinits. */ #ifndef _NET_VNET_H_ #define _NET_VNET_H_ /* * struct vnet describes a virtualized network stack, and is primarily a * pointer to storage for virtualized global variables. Expose to userspace * as required for libkvm. */ #if defined(_KERNEL) || defined(_WANT_VNET) #include struct vnet { LIST_ENTRY(vnet) vnet_le; /* all vnets list */ u_int vnet_magic_n; u_int vnet_ifcnt; u_int vnet_sockcnt; void *vnet_data_mem; uintptr_t vnet_data_base; }; #define VNET_MAGIC_N 0x3e0d8f29 /* * These two virtual network stack allocator definitions are also required * for libkvm so that it can evaluate virtualized global variables. */ #define VNET_SETNAME "set_vnet" #define VNET_SYMPREFIX "vnet_entry_" #endif #ifdef _KERNEL #define VNET_PCPUSTAT_DECLARE(type, name) \ VNET_DECLARE(counter_u64_t, name[sizeof(type) / sizeof(uint64_t)]) #define VNET_PCPUSTAT_DEFINE(type, name) \ VNET_DEFINE(counter_u64_t, name[sizeof(type) / sizeof(uint64_t)]) #define VNET_PCPUSTAT_ALLOC(name, wait) \ COUNTER_ARRAY_ALLOC(VNET(name), \ sizeof(VNET(name)) / sizeof(counter_u64_t), (wait)) #define VNET_PCPUSTAT_FREE(name) \ COUNTER_ARRAY_FREE(VNET(name), sizeof(VNET(name)) / sizeof(counter_u64_t)) #define VNET_PCPUSTAT_ADD(type, name, f, v) \ counter_u64_add(VNET(name)[offsetof(type, f) / sizeof(uint64_t)], (v)) #define VNET_PCPUSTAT_SYSINIT(name) \ static void \ vnet_##name##_init(const void *unused) \ { \ VNET_PCPUSTAT_ALLOC(name, M_WAITOK); \ } \ VNET_SYSINIT(vnet_ ## name ## _init, SI_SUB_PROTO_IFATTACHDOMAIN, \ SI_ORDER_ANY, vnet_ ## name ## _init, NULL) #define VNET_PCPUSTAT_SYSUNINIT(name) \ static void \ vnet_##name##_uninit(const void *unused) \ { \ VNET_PCPUSTAT_FREE(name); \ } \ VNET_SYSUNINIT(vnet_ ## name ## _uninit, SI_SUB_PROTO_IFATTACHDOMAIN, \ SI_ORDER_ANY, vnet_ ## name ## _uninit, NULL) +#ifdef SYSCTL_OID #define SYSCTL_VNET_PCPUSTAT(parent, nbr, name, type, array, desc) \ static int \ array##_sysctl(SYSCTL_HANDLER_ARGS) \ { \ type s; \ CTASSERT((sizeof(type) / sizeof(uint64_t)) == \ (sizeof(VNET(array)) / sizeof(counter_u64_t))); \ COUNTER_ARRAY_COPY(VNET(array), &s, sizeof(type) / sizeof(uint64_t));\ if (req->newptr) \ COUNTER_ARRAY_ZERO(VNET(array), \ sizeof(type) / sizeof(uint64_t)); \ return (SYSCTL_OUT(req, &s, sizeof(type))); \ } \ -SYSCTL_VNET_PROC(parent, nbr, name, CTLTYPE_OPAQUE | CTLFLAG_RW, NULL, \ - 0, array ## _sysctl, "I", desc) +SYSCTL_PROC(parent, nbr, name, CTLFLAG_VNET | CTLTYPE_OPAQUE | CTLFLAG_RW, \ + NULL, 0, array ## _sysctl, "I", desc) +#endif /* SYSCTL_OID */ #ifdef VIMAGE #include #include /* for struct thread */ #include #include /* * Location of the kernel's 'set_vnet' linker set. */ extern uintptr_t *__start_set_vnet; __GLOBL(__start_set_vnet); extern uintptr_t *__stop_set_vnet; __GLOBL(__stop_set_vnet); #define VNET_START (uintptr_t)&__start_set_vnet #define VNET_STOP (uintptr_t)&__stop_set_vnet /* * Functions to allocate and destroy virtual network stacks. */ struct vnet *vnet_alloc(void); void vnet_destroy(struct vnet *vnet); /* * The current virtual network stack -- we may wish to move this to struct * pcpu in the future. */ #define curvnet curthread->td_vnet /* * Various macros -- get and set the current network stack, but also * assertions. */ #if defined(INVARIANTS) || defined(VNET_DEBUG) #define VNET_ASSERT(exp, msg) do { \ if (!(exp)) \ panic msg; \ } while (0) #else #define VNET_ASSERT(exp, msg) do { \ } while (0) #endif #ifdef VNET_DEBUG void vnet_log_recursion(struct vnet *, const char *, int); #define CURVNET_SET_QUIET(arg) \ VNET_ASSERT((arg) != NULL && (arg)->vnet_magic_n == VNET_MAGIC_N, \ ("CURVNET_SET at %s:%d %s() curvnet=%p vnet=%p", \ __FILE__, __LINE__, __func__, curvnet, (arg))); \ struct vnet *saved_vnet = curvnet; \ const char *saved_vnet_lpush = curthread->td_vnet_lpush; \ curvnet = arg; \ curthread->td_vnet_lpush = __func__; #define CURVNET_SET_VERBOSE(arg) \ CURVNET_SET_QUIET(arg) \ if (saved_vnet) \ vnet_log_recursion(saved_vnet, saved_vnet_lpush, __LINE__); #define CURVNET_SET(arg) CURVNET_SET_VERBOSE(arg) #define CURVNET_RESTORE() \ VNET_ASSERT(curvnet != NULL && (saved_vnet == NULL || \ saved_vnet->vnet_magic_n == VNET_MAGIC_N), \ ("CURVNET_RESTORE at %s:%d %s() curvnet=%p saved_vnet=%p", \ __FILE__, __LINE__, __func__, curvnet, saved_vnet)); \ curvnet = saved_vnet; \ curthread->td_vnet_lpush = saved_vnet_lpush; #else /* !VNET_DEBUG */ #define CURVNET_SET_QUIET(arg) \ VNET_ASSERT((arg) != NULL && (arg)->vnet_magic_n == VNET_MAGIC_N, \ ("CURVNET_SET at %s:%d %s() curvnet=%p vnet=%p", \ __FILE__, __LINE__, __func__, curvnet, (arg))); \ struct vnet *saved_vnet = curvnet; \ curvnet = arg; #define CURVNET_SET_VERBOSE(arg) \ CURVNET_SET_QUIET(arg) #define CURVNET_SET(arg) CURVNET_SET_VERBOSE(arg) #define CURVNET_RESTORE() \ VNET_ASSERT(curvnet != NULL && (saved_vnet == NULL || \ saved_vnet->vnet_magic_n == VNET_MAGIC_N), \ ("CURVNET_RESTORE at %s:%d %s() curvnet=%p saved_vnet=%p", \ __FILE__, __LINE__, __func__, curvnet, saved_vnet)); \ curvnet = saved_vnet; #endif /* VNET_DEBUG */ extern struct vnet *vnet0; #define IS_DEFAULT_VNET(arg) ((arg) == vnet0) #define CRED_TO_VNET(cr) (cr)->cr_prison->pr_vnet #define TD_TO_VNET(td) CRED_TO_VNET((td)->td_ucred) #define P_TO_VNET(p) CRED_TO_VNET((p)->p_ucred) /* * Global linked list of all virtual network stacks, along with read locks to * access it. If a caller may sleep while accessing the list, it must use * the sleepable lock macros. */ LIST_HEAD(vnet_list_head, vnet); extern struct vnet_list_head vnet_head; extern struct rwlock vnet_rwlock; extern struct sx vnet_sxlock; #define VNET_LIST_RLOCK() sx_slock(&vnet_sxlock) #define VNET_LIST_RLOCK_NOSLEEP() rw_rlock(&vnet_rwlock) #define VNET_LIST_RUNLOCK() sx_sunlock(&vnet_sxlock) #define VNET_LIST_RUNLOCK_NOSLEEP() rw_runlock(&vnet_rwlock) /* * Iteration macros to walk the global list of virtual network stacks. */ #define VNET_ITERATOR_DECL(arg) struct vnet *arg #define VNET_FOREACH(arg) LIST_FOREACH((arg), &vnet_head, vnet_le) /* * Virtual network stack memory allocator, which allows global variables to * be automatically instantiated for each network stack instance. */ #define VNET_NAME(n) vnet_entry_##n #define VNET_DECLARE(t, n) extern t VNET_NAME(n) #define VNET_DEFINE(t, n) t VNET_NAME(n) __section(VNET_SETNAME) __used #define _VNET_PTR(b, n) (__typeof(VNET_NAME(n))*) \ ((b) + (uintptr_t)&VNET_NAME(n)) #define _VNET(b, n) (*_VNET_PTR(b, n)) /* * Virtualized global variable accessor macros. */ #define VNET_VNET_PTR(vnet, n) _VNET_PTR((vnet)->vnet_data_base, n) #define VNET_VNET(vnet, n) (*VNET_VNET_PTR((vnet), n)) #define VNET_PTR(n) VNET_VNET_PTR(curvnet, n) #define VNET(n) VNET_VNET(curvnet, n) /* * Virtual network stack allocator interfaces from the kernel linker. */ void *vnet_data_alloc(int size); void vnet_data_copy(void *start, int size); void vnet_data_free(void *start_arg, int size); /* - * Sysctl variants for vnet-virtualized global variables. Include - * to expose these definitions. - * - * Note: SYSCTL_PROC() handler functions will need to resolve pointer - * arguments themselves, if required. - */ -#ifdef SYSCTL_OID -#define SYSCTL_VNET_INT(parent, nbr, name, access, ptr, val, descr) \ - SYSCTL_OID(parent, nbr, name, \ - CTLTYPE_INT|CTLFLAG_MPSAFE|CTLFLAG_VNET|(access), \ - ptr, val, sysctl_handle_int, "I", descr) -#define SYSCTL_VNET_PROC(parent, nbr, name, access, ptr, arg, handler, \ - fmt, descr) \ - CTASSERT(((access) & CTLTYPE) != 0); \ - SYSCTL_OID(parent, nbr, name, CTLFLAG_VNET|(access), ptr, arg, \ - handler, fmt, descr) -#define SYSCTL_VNET_OPAQUE(parent, nbr, name, access, ptr, len, fmt, \ - descr) \ - SYSCTL_OID(parent, nbr, name, \ - CTLTYPE_OPAQUE|CTLFLAG_VNET|(access), ptr, len, \ - sysctl_handle_opaque, fmt, descr) -#define SYSCTL_VNET_STRING(parent, nbr, name, access, arg, len, descr) \ - SYSCTL_OID(parent, nbr, name, \ - CTLTYPE_STRING|CTLFLAG_VNET|(access), \ - arg, len, sysctl_handle_string, "A", descr) -#define SYSCTL_VNET_STRUCT(parent, nbr, name, access, ptr, type, descr) \ - SYSCTL_OID(parent, nbr, name, \ - CTLTYPE_OPAQUE|CTLFLAG_VNET|(access), ptr, \ - sizeof(struct type), sysctl_handle_opaque, "S," #type, \ - descr) -#define SYSCTL_VNET_UINT(parent, nbr, name, access, ptr, val, descr) \ - SYSCTL_OID(parent, nbr, name, \ - CTLTYPE_UINT|CTLFLAG_MPSAFE|CTLFLAG_VNET|(access), \ - ptr, val, sysctl_handle_int, "IU", descr) -#define VNET_SYSCTL_ARG(req, arg1) do { \ - if (arg1 != NULL) \ - arg1 = (void *)(TD_TO_VNET((req)->td)->vnet_data_base + \ - (uintptr_t)(arg1)); \ -} while (0) -#endif /* SYSCTL_OID */ - -/* * Virtual sysinit mechanism, allowing network stack components to declare * startup and shutdown methods to be run when virtual network stack * instances are created and destroyed. */ #include /* * SYSINIT/SYSUNINIT variants that provide per-vnet constructors and * destructors. */ struct vnet_sysinit { enum sysinit_sub_id subsystem; enum sysinit_elem_order order; sysinit_cfunc_t func; const void *arg; TAILQ_ENTRY(vnet_sysinit) link; }; #define VNET_SYSINIT(ident, subsystem, order, func, arg) \ static struct vnet_sysinit ident ## _vnet_init = { \ subsystem, \ order, \ (sysinit_cfunc_t)(sysinit_nfunc_t)func, \ (arg) \ }; \ SYSINIT(vnet_init_ ## ident, subsystem, order, \ vnet_register_sysinit, &ident ## _vnet_init); \ SYSUNINIT(vnet_init_ ## ident, subsystem, order, \ vnet_deregister_sysinit, &ident ## _vnet_init) #define VNET_SYSUNINIT(ident, subsystem, order, func, arg) \ static struct vnet_sysinit ident ## _vnet_uninit = { \ subsystem, \ order, \ (sysinit_cfunc_t)(sysinit_nfunc_t)func, \ (arg) \ }; \ SYSINIT(vnet_uninit_ ## ident, subsystem, order, \ vnet_register_sysuninit, &ident ## _vnet_uninit); \ SYSUNINIT(vnet_uninit_ ## ident, subsystem, order, \ vnet_deregister_sysuninit, &ident ## _vnet_uninit) /* * Run per-vnet sysinits or sysuninits during vnet creation/destruction. */ void vnet_sysinit(void); void vnet_sysuninit(void); /* * Interfaces for managing per-vnet constructors and destructors. */ void vnet_register_sysinit(void *arg); void vnet_register_sysuninit(void *arg); void vnet_deregister_sysinit(void *arg); void vnet_deregister_sysuninit(void *arg); /* * EVENTHANDLER(9) extensions. */ #include void vnet_global_eventhandler_iterator_func(void *, ...); #define VNET_GLOBAL_EVENTHANDLER_REGISTER_TAG(tag, name, func, arg, priority) \ do { \ if (IS_DEFAULT_VNET(curvnet)) { \ (tag) = vimage_eventhandler_register(NULL, #name, func, \ arg, priority, \ vnet_global_eventhandler_iterator_func); \ } \ } while(0) #define VNET_GLOBAL_EVENTHANDLER_REGISTER(name, func, arg, priority) \ do { \ if (IS_DEFAULT_VNET(curvnet)) { \ vimage_eventhandler_register(NULL, #name, func, \ arg, priority, \ vnet_global_eventhandler_iterator_func); \ } \ } while(0) #else /* !VIMAGE */ /* * Various virtual network stack macros compile to no-ops without VIMAGE. */ #define curvnet NULL #define VNET_ASSERT(exp, msg) #define CURVNET_SET(arg) #define CURVNET_SET_QUIET(arg) #define CURVNET_RESTORE() #define VNET_LIST_RLOCK() #define VNET_LIST_RLOCK_NOSLEEP() #define VNET_LIST_RUNLOCK() #define VNET_LIST_RUNLOCK_NOSLEEP() #define VNET_ITERATOR_DECL(arg) #define VNET_FOREACH(arg) #define IS_DEFAULT_VNET(arg) 1 #define CRED_TO_VNET(cr) NULL #define TD_TO_VNET(td) NULL #define P_TO_VNET(p) NULL /* * Versions of the VNET macros that compile to normal global variables and * standard sysctl definitions. */ #define VNET_NAME(n) n #define VNET_DECLARE(t, n) extern t n #define VNET_DEFINE(t, n) t n #define _VNET_PTR(b, n) &VNET_NAME(n) /* * Virtualized global variable accessor macros. */ #define VNET_VNET_PTR(vnet, n) (&(n)) #define VNET_VNET(vnet, n) (n) #define VNET_PTR(n) (&(n)) #define VNET(n) (n) - -/* - * When VIMAGE isn't compiled into the kernel, virtaulized SYSCTLs simply - * become normal SYSCTLs. - */ -#ifdef SYSCTL_OID -#define SYSCTL_VNET_INT(parent, nbr, name, access, ptr, val, descr) \ - SYSCTL_INT(parent, nbr, name, access, ptr, val, descr) -#define SYSCTL_VNET_PROC(parent, nbr, name, access, ptr, arg, handler, \ - fmt, descr) \ - SYSCTL_PROC(parent, nbr, name, access, ptr, arg, handler, fmt, \ - descr) -#define SYSCTL_VNET_OPAQUE(parent, nbr, name, access, ptr, len, fmt, \ - descr) \ - SYSCTL_OPAQUE(parent, nbr, name, access, ptr, len, fmt, descr) -#define SYSCTL_VNET_STRING(parent, nbr, name, access, arg, len, descr) \ - SYSCTL_STRING(parent, nbr, name, access, arg, len, descr) -#define SYSCTL_VNET_STRUCT(parent, nbr, name, access, ptr, type, descr) \ - SYSCTL_STRUCT(parent, nbr, name, access, ptr, type, descr) -#define SYSCTL_VNET_UINT(parent, nbr, name, access, ptr, val, descr) \ - SYSCTL_UINT(parent, nbr, name, access, ptr, val, descr) -#define VNET_SYSCTL_ARG(req, arg1) -#endif /* SYSCTL_OID */ /* * When VIMAGE isn't compiled into the kernel, VNET_SYSINIT/VNET_SYSUNINIT * map into normal sysinits, which have the same ordering properties. */ #define VNET_SYSINIT(ident, subsystem, order, func, arg) \ SYSINIT(ident, subsystem, order, func, arg) #define VNET_SYSUNINIT(ident, subsystem, order, func, arg) \ SYSUNINIT(ident, subsystem, order, func, arg) /* * Without VIMAGE revert to the default implementation. */ #define VNET_GLOBAL_EVENTHANDLER_REGISTER_TAG(tag, name, func, arg, priority) \ (tag) = eventhandler_register(NULL, #name, func, arg, priority) #define VNET_GLOBAL_EVENTHANDLER_REGISTER(name, func, arg, priority) \ eventhandler_register(NULL, #name, func, arg, priority) #endif /* VIMAGE */ #endif /* _KERNEL */ #endif /* !_NET_VNET_H_ */ Index: user/dchagin/lemul/sys/netgraph/ng_ether.c =================================================================== --- user/dchagin/lemul/sys/netgraph/ng_ether.c (revision 274240) +++ user/dchagin/lemul/sys/netgraph/ng_ether.c (revision 274241) @@ -1,880 +1,880 @@ /* * ng_ether.c */ /*- * Copyright (c) 1996-2000 Whistle Communications, Inc. * All rights reserved. * * Subject to the following obligations and disclaimer of warranty, use and * redistribution of this software, in source or object code forms, with or * without modifications are expressly permitted by Whistle Communications; * provided, however, that: * 1. Any and all reproductions of the source or object code must include the * copyright notice above and the following disclaimer of warranties; and * 2. No rights are granted, in any manner or form, to use Whistle * Communications, Inc. trademarks, including the mark "WHISTLE * COMMUNICATIONS" on advertising, endorsements, or otherwise except as * such appears in the above copyright notice or in the software. * * THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND * TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO * REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE, * INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. * WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY * REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS * SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE. * IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES * RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING * WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, * PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY * OF SUCH DAMAGE. * * Authors: Archie Cobbs * Julian Elischer * * $FreeBSD$ */ /* * ng_ether(4) netgraph node type */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MODULE_VERSION(ng_ether, 1); -#define IFP2NG(ifp) (IFP2AC((ifp))->ac_netgraph) +#define IFP2NG(ifp) ((ifp)->if_l2com) /* Per-node private data */ struct private { struct ifnet *ifp; /* associated interface */ hook_p upper; /* upper hook connection */ hook_p lower; /* lower hook connection */ hook_p orphan; /* orphan hook connection */ u_char autoSrcAddr; /* always overwrite source address */ u_char promisc; /* promiscuous mode enabled */ u_long hwassist; /* hardware checksum capabilities */ u_int flags; /* flags e.g. really die */ }; typedef struct private *priv_p; /* Hook pointers used by if_ethersubr.c to callback to netgraph */ extern void (*ng_ether_input_p)(struct ifnet *ifp, struct mbuf **mp); extern void (*ng_ether_input_orphan_p)(struct ifnet *ifp, struct mbuf *m); extern int (*ng_ether_output_p)(struct ifnet *ifp, struct mbuf **mp); extern void (*ng_ether_attach_p)(struct ifnet *ifp); extern void (*ng_ether_detach_p)(struct ifnet *ifp); extern void (*ng_ether_link_state_p)(struct ifnet *ifp, int state); /* Functional hooks called from if_ethersubr.c */ static void ng_ether_input(struct ifnet *ifp, struct mbuf **mp); static void ng_ether_input_orphan(struct ifnet *ifp, struct mbuf *m); static int ng_ether_output(struct ifnet *ifp, struct mbuf **mp); static void ng_ether_attach(struct ifnet *ifp); static void ng_ether_detach(struct ifnet *ifp); static void ng_ether_link_state(struct ifnet *ifp, int state); /* Other functions */ static int ng_ether_rcv_lower(hook_p node, item_p item); static int ng_ether_rcv_upper(hook_p node, item_p item); /* Netgraph node methods */ static ng_constructor_t ng_ether_constructor; static ng_rcvmsg_t ng_ether_rcvmsg; static ng_shutdown_t ng_ether_shutdown; static ng_newhook_t ng_ether_newhook; static ng_rcvdata_t ng_ether_rcvdata; static ng_disconnect_t ng_ether_disconnect; static int ng_ether_mod_event(module_t mod, int event, void *data); static eventhandler_tag ng_ether_ifnet_arrival_cookie; /* List of commands and how to convert arguments to/from ASCII */ static const struct ng_cmdlist ng_ether_cmdlist[] = { { NGM_ETHER_COOKIE, NGM_ETHER_GET_IFNAME, "getifname", NULL, &ng_parse_string_type }, { NGM_ETHER_COOKIE, NGM_ETHER_GET_IFINDEX, "getifindex", NULL, &ng_parse_int32_type }, { NGM_ETHER_COOKIE, NGM_ETHER_GET_ENADDR, "getenaddr", NULL, &ng_parse_enaddr_type }, { NGM_ETHER_COOKIE, NGM_ETHER_SET_ENADDR, "setenaddr", &ng_parse_enaddr_type, NULL }, { NGM_ETHER_COOKIE, NGM_ETHER_GET_PROMISC, "getpromisc", NULL, &ng_parse_int32_type }, { NGM_ETHER_COOKIE, NGM_ETHER_SET_PROMISC, "setpromisc", &ng_parse_int32_type, NULL }, { NGM_ETHER_COOKIE, NGM_ETHER_GET_AUTOSRC, "getautosrc", NULL, &ng_parse_int32_type }, { NGM_ETHER_COOKIE, NGM_ETHER_SET_AUTOSRC, "setautosrc", &ng_parse_int32_type, NULL }, { NGM_ETHER_COOKIE, NGM_ETHER_ADD_MULTI, "addmulti", &ng_parse_enaddr_type, NULL }, { NGM_ETHER_COOKIE, NGM_ETHER_DEL_MULTI, "delmulti", &ng_parse_enaddr_type, NULL }, { NGM_ETHER_COOKIE, NGM_ETHER_DETACH, "detach", NULL, NULL }, { 0 } }; static struct ng_type ng_ether_typestruct = { .version = NG_ABI_VERSION, .name = NG_ETHER_NODE_TYPE, .mod_event = ng_ether_mod_event, .constructor = ng_ether_constructor, .rcvmsg = ng_ether_rcvmsg, .shutdown = ng_ether_shutdown, .newhook = ng_ether_newhook, .rcvdata = ng_ether_rcvdata, .disconnect = ng_ether_disconnect, .cmdlist = ng_ether_cmdlist, }; NETGRAPH_INIT(ether, &ng_ether_typestruct); /****************************************************************** UTILITY FUNCTIONS ******************************************************************/ static void ng_ether_sanitize_ifname(const char *ifname, char *name) { int i; for (i = 0; i < IFNAMSIZ; i++) { if (ifname[i] == '.' || ifname[i] == ':') name[i] = '_'; else name[i] = ifname[i]; if (name[i] == '\0') break; } } /****************************************************************** ETHERNET FUNCTION HOOKS ******************************************************************/ /* * Handle a packet that has come in on an interface. We get to * look at it here before any upper layer protocols do. */ static void ng_ether_input(struct ifnet *ifp, struct mbuf **mp) { const node_p node = IFP2NG(ifp); const priv_p priv = NG_NODE_PRIVATE(node); int error; /* If "lower" hook not connected, let packet continue */ if (priv->lower == NULL) return; NG_SEND_DATA_ONLY(error, priv->lower, *mp); /* sets *mp = NULL */ } /* * Handle a packet that has come in on an interface, and which * does not match any of our known protocols (an ``orphan''). */ static void ng_ether_input_orphan(struct ifnet *ifp, struct mbuf *m) { const node_p node = IFP2NG(ifp); const priv_p priv = NG_NODE_PRIVATE(node); int error; /* If "orphan" hook not connected, discard packet */ if (priv->orphan == NULL) { m_freem(m); return; } NG_SEND_DATA_ONLY(error, priv->orphan, m); } /* * Handle a packet that is going out on an interface. * The Ethernet header is already attached to the mbuf. */ static int ng_ether_output(struct ifnet *ifp, struct mbuf **mp) { const node_p node = IFP2NG(ifp); const priv_p priv = NG_NODE_PRIVATE(node); int error = 0; /* If "upper" hook not connected, let packet continue */ if (priv->upper == NULL) return (0); /* Send it out "upper" hook */ NG_OUTBOUND_THREAD_REF(); NG_SEND_DATA_ONLY(error, priv->upper, *mp); NG_OUTBOUND_THREAD_UNREF(); return (error); } /* * A new Ethernet interface has been attached. * Create a new node for it, etc. */ static void ng_ether_attach(struct ifnet *ifp) { char name[IFNAMSIZ]; priv_p priv; node_p node; /* * Do not create / attach an ether node to this ifnet if * a netgraph node with the same name already exists. * This should prevent ether nodes to become attached to * eiface nodes, which may be problematic due to naming * clashes. */ if ((node = ng_name2noderef(NULL, ifp->if_xname)) != NULL) { NG_NODE_UNREF(node); return; } /* Create node */ KASSERT(!IFP2NG(ifp), ("%s: node already exists?", __func__)); if (ng_make_node_common(&ng_ether_typestruct, &node) != 0) { log(LOG_ERR, "%s: can't %s for %s\n", __func__, "create node", ifp->if_xname); return; } /* Allocate private data */ priv = malloc(sizeof(*priv), M_NETGRAPH, M_NOWAIT | M_ZERO); if (priv == NULL) { log(LOG_ERR, "%s: can't %s for %s\n", __func__, "allocate memory", ifp->if_xname); NG_NODE_UNREF(node); return; } NG_NODE_SET_PRIVATE(node, priv); priv->ifp = ifp; IFP2NG(ifp) = node; priv->hwassist = ifp->if_hwassist; /* Try to give the node the same name as the interface */ ng_ether_sanitize_ifname(ifp->if_xname, name); if (ng_name_node(node, name) != 0) log(LOG_WARNING, "%s: can't name node %s\n", __func__, name); } /* * An Ethernet interface is being detached. * REALLY Destroy its node. */ static void ng_ether_detach(struct ifnet *ifp) { const node_p node = IFP2NG(ifp); const priv_p priv = NG_NODE_PRIVATE(node); taskqueue_drain(taskqueue_swi, &ifp->if_linktask); NG_NODE_REALLY_DIE(node); /* Force real removal of node */ /* * We can't assume the ifnet is still around when we run shutdown * So zap it now. XXX We HOPE that anything running at this time * handles it (as it should in the non netgraph case). */ IFP2NG(ifp) = NULL; priv->ifp = NULL; /* XXX race if interrupted an output packet */ ng_rmnode_self(node); /* remove all netgraph parts */ } /* * Notify graph about link event. * if_link_state_change() has already checked that the state has changed. */ static void ng_ether_link_state(struct ifnet *ifp, int state) { const node_p node = IFP2NG(ifp); const priv_p priv = NG_NODE_PRIVATE(node); struct ng_mesg *msg; int cmd, dummy_error = 0; if (state == LINK_STATE_UP) cmd = NGM_LINK_IS_UP; else if (state == LINK_STATE_DOWN) cmd = NGM_LINK_IS_DOWN; else return; if (priv->lower != NULL) { NG_MKMESSAGE(msg, NGM_FLOW_COOKIE, cmd, 0, M_NOWAIT); if (msg != NULL) NG_SEND_MSG_HOOK(dummy_error, node, msg, priv->lower, 0); } if (priv->orphan != NULL) { NG_MKMESSAGE(msg, NGM_FLOW_COOKIE, cmd, 0, M_NOWAIT); if (msg != NULL) NG_SEND_MSG_HOOK(dummy_error, node, msg, priv->orphan, 0); } } /* * Interface arrival notification handler. * The notification is produced in two cases: * o a new interface arrives * o an existing interface got renamed * Currently the first case is handled by ng_ether_attach via special * hook ng_ether_attach_p. */ static void ng_ether_ifnet_arrival_event(void *arg __unused, struct ifnet *ifp) { char name[IFNAMSIZ]; node_p node; /* Only ethernet interfaces are of interest. */ if (ifp->if_type != IFT_ETHER && ifp->if_type != IFT_L2VLAN) return; /* * Just return if it's a new interface without an ng_ether companion. */ node = IFP2NG(ifp); if (node == NULL) return; /* Try to give the node the same name as the new interface name */ ng_ether_sanitize_ifname(ifp->if_xname, name); if (ng_name_node(node, name) != 0) log(LOG_WARNING, "%s: can't re-name node %s\n", __func__, name); } /****************************************************************** NETGRAPH NODE METHODS ******************************************************************/ /* * It is not possible or allowable to create a node of this type. * Nodes get created when the interface is attached (or, when * this node type's KLD is loaded). */ static int ng_ether_constructor(node_p node) { return (EINVAL); } /* * Check for attaching a new hook. */ static int ng_ether_newhook(node_p node, hook_p hook, const char *name) { const priv_p priv = NG_NODE_PRIVATE(node); hook_p *hookptr; /* Divert hook is an alias for lower */ if (strcmp(name, NG_ETHER_HOOK_DIVERT) == 0) name = NG_ETHER_HOOK_LOWER; /* Which hook? */ if (strcmp(name, NG_ETHER_HOOK_UPPER) == 0) { hookptr = &priv->upper; NG_HOOK_SET_RCVDATA(hook, ng_ether_rcv_upper); NG_HOOK_SET_TO_INBOUND(hook); } else if (strcmp(name, NG_ETHER_HOOK_LOWER) == 0) { hookptr = &priv->lower; NG_HOOK_SET_RCVDATA(hook, ng_ether_rcv_lower); } else if (strcmp(name, NG_ETHER_HOOK_ORPHAN) == 0) { hookptr = &priv->orphan; NG_HOOK_SET_RCVDATA(hook, ng_ether_rcv_lower); } else return (EINVAL); /* Check if already connected (shouldn't be, but doesn't hurt) */ if (*hookptr != NULL) return (EISCONN); /* Disable hardware checksums while 'upper' hook is connected */ if (hookptr == &priv->upper) priv->ifp->if_hwassist = 0; NG_HOOK_HI_STACK(hook); /* OK */ *hookptr = hook; return (0); } /* * Receive an incoming control message. */ static int ng_ether_rcvmsg(node_p node, item_p item, hook_p lasthook) { const priv_p priv = NG_NODE_PRIVATE(node); struct ng_mesg *resp = NULL; int error = 0; struct ng_mesg *msg; NGI_GET_MSG(item, msg); switch (msg->header.typecookie) { case NGM_ETHER_COOKIE: switch (msg->header.cmd) { case NGM_ETHER_GET_IFNAME: NG_MKRESPONSE(resp, msg, IFNAMSIZ, M_NOWAIT); if (resp == NULL) { error = ENOMEM; break; } strlcpy(resp->data, priv->ifp->if_xname, IFNAMSIZ); break; case NGM_ETHER_GET_IFINDEX: NG_MKRESPONSE(resp, msg, sizeof(u_int32_t), M_NOWAIT); if (resp == NULL) { error = ENOMEM; break; } *((u_int32_t *)resp->data) = priv->ifp->if_index; break; case NGM_ETHER_GET_ENADDR: NG_MKRESPONSE(resp, msg, ETHER_ADDR_LEN, M_NOWAIT); if (resp == NULL) { error = ENOMEM; break; } bcopy(IF_LLADDR(priv->ifp), resp->data, ETHER_ADDR_LEN); break; case NGM_ETHER_SET_ENADDR: { if (msg->header.arglen != ETHER_ADDR_LEN) { error = EINVAL; break; } error = if_setlladdr(priv->ifp, (u_char *)msg->data, ETHER_ADDR_LEN); EVENTHANDLER_INVOKE(iflladdr_event, priv->ifp); break; } case NGM_ETHER_GET_PROMISC: NG_MKRESPONSE(resp, msg, sizeof(u_int32_t), M_NOWAIT); if (resp == NULL) { error = ENOMEM; break; } *((u_int32_t *)resp->data) = priv->promisc; break; case NGM_ETHER_SET_PROMISC: { u_char want; if (msg->header.arglen != sizeof(u_int32_t)) { error = EINVAL; break; } want = !!*((u_int32_t *)msg->data); if (want ^ priv->promisc) { if ((error = ifpromisc(priv->ifp, want)) != 0) break; priv->promisc = want; } break; } case NGM_ETHER_GET_AUTOSRC: NG_MKRESPONSE(resp, msg, sizeof(u_int32_t), M_NOWAIT); if (resp == NULL) { error = ENOMEM; break; } *((u_int32_t *)resp->data) = priv->autoSrcAddr; break; case NGM_ETHER_SET_AUTOSRC: if (msg->header.arglen != sizeof(u_int32_t)) { error = EINVAL; break; } priv->autoSrcAddr = !!*((u_int32_t *)msg->data); break; case NGM_ETHER_ADD_MULTI: { struct sockaddr_dl sa_dl; struct ifmultiaddr *ifma; if (msg->header.arglen != ETHER_ADDR_LEN) { error = EINVAL; break; } bzero(&sa_dl, sizeof(struct sockaddr_dl)); sa_dl.sdl_len = sizeof(struct sockaddr_dl); sa_dl.sdl_family = AF_LINK; sa_dl.sdl_alen = ETHER_ADDR_LEN; bcopy((void *)msg->data, LLADDR(&sa_dl), ETHER_ADDR_LEN); /* * Netgraph is only permitted to join groups once * via the if_addmulti() KPI, because it cannot hold * struct ifmultiaddr * between calls. It may also * lose a race while we check if the membership * already exists. */ if_maddr_rlock(priv->ifp); ifma = if_findmulti(priv->ifp, (struct sockaddr *)&sa_dl); if_maddr_runlock(priv->ifp); if (ifma != NULL) { error = EADDRINUSE; } else { error = if_addmulti(priv->ifp, (struct sockaddr *)&sa_dl, &ifma); } break; } case NGM_ETHER_DEL_MULTI: { struct sockaddr_dl sa_dl; if (msg->header.arglen != ETHER_ADDR_LEN) { error = EINVAL; break; } bzero(&sa_dl, sizeof(struct sockaddr_dl)); sa_dl.sdl_len = sizeof(struct sockaddr_dl); sa_dl.sdl_family = AF_LINK; sa_dl.sdl_alen = ETHER_ADDR_LEN; bcopy((void *)msg->data, LLADDR(&sa_dl), ETHER_ADDR_LEN); error = if_delmulti(priv->ifp, (struct sockaddr *)&sa_dl); break; } case NGM_ETHER_DETACH: ng_ether_detach(priv->ifp); break; default: error = EINVAL; break; } break; default: error = EINVAL; break; } NG_RESPOND_MSG(error, node, item, resp); NG_FREE_MSG(msg); return (error); } /* * Receive data on a hook. * Since we use per-hook recveive methods this should never be called. */ static int ng_ether_rcvdata(hook_p hook, item_p item) { NG_FREE_ITEM(item); panic("%s: weird hook", __func__); } /* * Handle an mbuf received on the "lower" or "orphan" hook. */ static int ng_ether_rcv_lower(hook_p hook, item_p item) { struct mbuf *m; const node_p node = NG_HOOK_NODE(hook); const priv_p priv = NG_NODE_PRIVATE(node); struct ifnet *const ifp = priv->ifp; NGI_GET_M(item, m); NG_FREE_ITEM(item); /* Check whether interface is ready for packets */ if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) { NG_FREE_M(m); return (ENETDOWN); } /* Make sure header is fully pulled up */ if (m->m_pkthdr.len < sizeof(struct ether_header)) { NG_FREE_M(m); return (EINVAL); } if (m->m_len < sizeof(struct ether_header) && (m = m_pullup(m, sizeof(struct ether_header))) == NULL) return (ENOBUFS); /* Drop in the MAC address if desired */ if (priv->autoSrcAddr) { /* Make the mbuf writable if it's not already */ if (!M_WRITABLE(m) && (m = m_pullup(m, sizeof(struct ether_header))) == NULL) return (ENOBUFS); /* Overwrite source MAC address */ bcopy(IF_LLADDR(ifp), mtod(m, struct ether_header *)->ether_shost, ETHER_ADDR_LEN); } /* Send it on its way */ return ether_output_frame(ifp, m); } /* * Handle an mbuf received on the "upper" hook. */ static int ng_ether_rcv_upper(hook_p hook, item_p item) { struct mbuf *m; const node_p node = NG_HOOK_NODE(hook); const priv_p priv = NG_NODE_PRIVATE(node); struct ifnet *ifp = priv->ifp; NGI_GET_M(item, m); NG_FREE_ITEM(item); /* Check length and pull off header */ if (m->m_pkthdr.len < sizeof(struct ether_header)) { NG_FREE_M(m); return (EINVAL); } if (m->m_len < sizeof(struct ether_header) && (m = m_pullup(m, sizeof(struct ether_header))) == NULL) return (ENOBUFS); m->m_pkthdr.rcvif = ifp; /* Pass the packet to the bridge, it may come back to us */ if (ifp->if_bridge) { BRIDGE_INPUT(ifp, m); if (m == NULL) return (0); } /* Route packet back in */ ether_demux(ifp, m); return (0); } /* * Shutdown node. This resets the node but does not remove it * unless the REALLY_DIE flag is set. */ static int ng_ether_shutdown(node_p node) { const priv_p priv = NG_NODE_PRIVATE(node); if (node->nd_flags & NGF_REALLY_DIE) { /* * WE came here because the ethernet card is being unloaded, * so stop being persistant. * Actually undo all the things we did on creation. * Assume the ifp has already been freed. */ NG_NODE_SET_PRIVATE(node, NULL); free(priv, M_NETGRAPH); NG_NODE_UNREF(node); /* free node itself */ return (0); } if (priv->promisc) { /* disable promiscuous mode */ (void)ifpromisc(priv->ifp, 0); priv->promisc = 0; } NG_NODE_REVIVE(node); /* Signal ng_rmnode we are persisant */ return (0); } /* * Hook disconnection. */ static int ng_ether_disconnect(hook_p hook) { const priv_p priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hook)); if (hook == priv->upper) { priv->upper = NULL; if (priv->ifp != NULL) /* restore h/w csum */ priv->ifp->if_hwassist = priv->hwassist; } else if (hook == priv->lower) priv->lower = NULL; else if (hook == priv->orphan) priv->orphan = NULL; else panic("%s: weird hook", __func__); if ((NG_NODE_NUMHOOKS(NG_HOOK_NODE(hook)) == 0) && (NG_NODE_IS_VALID(NG_HOOK_NODE(hook)))) ng_rmnode_self(NG_HOOK_NODE(hook)); /* reset node */ return (0); } /****************************************************************** INITIALIZATION ******************************************************************/ /* * Handle loading and unloading for this node type. */ static int ng_ether_mod_event(module_t mod, int event, void *data) { int error = 0; switch (event) { case MOD_LOAD: /* Register function hooks */ if (ng_ether_attach_p != NULL) { error = EEXIST; break; } ng_ether_attach_p = ng_ether_attach; ng_ether_detach_p = ng_ether_detach; ng_ether_output_p = ng_ether_output; ng_ether_input_p = ng_ether_input; ng_ether_input_orphan_p = ng_ether_input_orphan; ng_ether_link_state_p = ng_ether_link_state; ng_ether_ifnet_arrival_cookie = EVENTHANDLER_REGISTER(ifnet_arrival_event, ng_ether_ifnet_arrival_event, NULL, EVENTHANDLER_PRI_ANY); break; case MOD_UNLOAD: /* * Note that the base code won't try to unload us until * all nodes have been removed, and that can't happen * until all Ethernet interfaces are removed. In any * case, we know there are no nodes left if the action * is MOD_UNLOAD, so there's no need to detach any nodes. */ EVENTHANDLER_DEREGISTER(ifnet_arrival_event, ng_ether_ifnet_arrival_cookie); /* Unregister function hooks */ ng_ether_attach_p = NULL; ng_ether_detach_p = NULL; ng_ether_output_p = NULL; ng_ether_input_p = NULL; ng_ether_input_orphan_p = NULL; ng_ether_link_state_p = NULL; break; default: error = EOPNOTSUPP; break; } return (error); } static void vnet_ng_ether_init(const void *unused) { struct ifnet *ifp; /* If module load was rejected, don't attach to vnets. */ if (ng_ether_attach_p != ng_ether_attach) return; /* Create nodes for any already-existing Ethernet interfaces. */ IFNET_RLOCK(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp->if_type == IFT_ETHER || ifp->if_type == IFT_L2VLAN) ng_ether_attach(ifp); } IFNET_RUNLOCK(); } VNET_SYSINIT(vnet_ng_ether_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_ng_ether_init, NULL); Index: user/dchagin/lemul/sys/netinet/cc/cc.c =================================================================== --- user/dchagin/lemul/sys/netinet/cc/cc.c (revision 274240) +++ user/dchagin/lemul/sys/netinet/cc/cc.c (revision 274241) @@ -1,326 +1,327 @@ /*- * Copyright (c) 2007-2008 * Swinburne University of Technology, Melbourne, Australia. * Copyright (c) 2009-2010 Lawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation * All rights reserved. * * This software was developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, by Lawrence Stewart and * James Healy, made possible in part by a grant from the Cisco University * Research Program Fund at Community Foundation Silicon Valley. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by David Hayes under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This software was first released in 2007 by James Healy and Lawrence Stewart * whilst working on the NewTCP research project at Swinburne University of * Technology's Centre for Advanced Internet Architectures, Melbourne, * Australia, which was made possible in part by a grant from the Cisco * University Research Program Fund at Community Foundation Silicon Valley. * More details are available at: * http://caia.swin.edu.au/urp/newtcp/ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * List of available cc algorithms on the current system. First element * is used as the system default CC algorithm. */ struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list); /* Protects the cc_list TAILQ. */ struct rwlock cc_list_lock; VNET_DEFINE(struct cc_algo *, default_cc_ptr) = &newreno_cc_algo; /* * Sysctl handler to show and change the default CC algorithm. */ static int cc_default_algo(SYSCTL_HANDLER_ARGS) { char default_cc[TCP_CA_NAME_MAX]; struct cc_algo *funcs; int error; /* Get the current default: */ CC_LIST_RLOCK(); strlcpy(default_cc, CC_DEFAULT()->name, sizeof(default_cc)); CC_LIST_RUNLOCK(); error = sysctl_handle_string(oidp, default_cc, sizeof(default_cc), req); /* Check for error or no change */ if (error != 0 || req->newptr == NULL) goto done; error = ESRCH; /* Find algo with specified name and set it to default. */ CC_LIST_RLOCK(); STAILQ_FOREACH(funcs, &cc_list, entries) { if (strncmp(default_cc, funcs->name, sizeof(default_cc))) continue; V_default_cc_ptr = funcs; error = 0; break; } CC_LIST_RUNLOCK(); done: return (error); } /* * Sysctl handler to display the list of available CC algorithms. */ static int cc_list_available(SYSCTL_HANDLER_ARGS) { struct cc_algo *algo; struct sbuf *s; int err, first, nalgos; err = nalgos = 0; first = 1; CC_LIST_RLOCK(); STAILQ_FOREACH(algo, &cc_list, entries) { nalgos++; } CC_LIST_RUNLOCK(); s = sbuf_new(NULL, NULL, nalgos * TCP_CA_NAME_MAX, SBUF_FIXEDLEN); if (s == NULL) return (ENOMEM); /* * It is theoretically possible for the CC list to have grown in size * since the call to sbuf_new() and therefore for the sbuf to be too * small. If this were to happen (incredibly unlikely), the sbuf will * reach an overflow condition, sbuf_printf() will return an error and * the sysctl will fail gracefully. */ CC_LIST_RLOCK(); STAILQ_FOREACH(algo, &cc_list, entries) { err = sbuf_printf(s, first ? "%s" : ", %s", algo->name); if (err) { /* Sbuf overflow condition. */ err = EOVERFLOW; break; } first = 0; } CC_LIST_RUNLOCK(); if (!err) { sbuf_finish(s); err = sysctl_handle_string(oidp, sbuf_data(s), 0, req); } sbuf_delete(s); return (err); } /* * Reset the default CC algo to NewReno for any netstack which is using the algo * that is about to go away as its default. */ static void cc_checkreset_default(struct cc_algo *remove_cc) { VNET_ITERATOR_DECL(vnet_iter); CC_LIST_LOCK_ASSERT(); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); if (strncmp(CC_DEFAULT()->name, remove_cc->name, TCP_CA_NAME_MAX) == 0) V_default_cc_ptr = &newreno_cc_algo; CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Initialise CC subsystem on system boot. */ static void cc_init(void) { CC_LIST_LOCK_INIT(); STAILQ_INIT(&cc_list); } /* * Returns non-zero on success, 0 on failure. */ int cc_deregister_algo(struct cc_algo *remove_cc) { struct cc_algo *funcs, *tmpfuncs; int err; err = ENOENT; /* Never allow newreno to be deregistered. */ if (&newreno_cc_algo == remove_cc) return (EPERM); /* Remove algo from cc_list so that new connections can't use it. */ CC_LIST_WLOCK(); STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) { if (funcs == remove_cc) { cc_checkreset_default(remove_cc); STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries); err = 0; break; } } CC_LIST_WUNLOCK(); if (!err) /* * XXXLAS: * - We may need to handle non-zero return values in future. * - If we add CC framework support for protocols other than * TCP, we may want a more generic way to handle this step. */ tcp_ccalgounload(remove_cc); return (err); } /* * Returns 0 on success, non-zero on failure. */ int cc_register_algo(struct cc_algo *add_cc) { struct cc_algo *funcs; int err; err = 0; /* * Iterate over list of registered CC algorithms and make sure * we're not trying to add a duplicate. */ CC_LIST_WLOCK(); STAILQ_FOREACH(funcs, &cc_list, entries) { if (funcs == add_cc || strncmp(funcs->name, add_cc->name, TCP_CA_NAME_MAX) == 0) err = EEXIST; } if (!err) STAILQ_INSERT_TAIL(&cc_list, add_cc, entries); CC_LIST_WUNLOCK(); return (err); } /* * Handles kld related events. Returns 0 on success, non-zero on failure. */ int cc_modevent(module_t mod, int event_type, void *data) { struct cc_algo *algo; int err; err = 0; algo = (struct cc_algo *)data; switch(event_type) { case MOD_LOAD: if (algo->mod_init != NULL) err = algo->mod_init(); if (!err) err = cc_register_algo(algo); break; case MOD_QUIESCE: case MOD_SHUTDOWN: case MOD_UNLOAD: err = cc_deregister_algo(algo); if (!err && algo->mod_destroy != NULL) algo->mod_destroy(); if (err == ENOENT) err = 0; break; default: err = EINVAL; break; } return (err); } SYSINIT(cc, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, cc_init, NULL); /* Declare sysctl tree and populate it. */ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW, NULL, "congestion control related settings"); -SYSCTL_VNET_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm, CTLTYPE_STRING|CTLFLAG_RW, +SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm, + CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW, NULL, 0, cc_default_algo, "A", "default congestion control algorithm"); SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available, CTLTYPE_STRING|CTLFLAG_RD, NULL, 0, cc_list_available, "A", "list available congestion control algorithms"); Index: user/dchagin/lemul/sys/netinet/cc/cc_cdg.c =================================================================== --- user/dchagin/lemul/sys/netinet/cc/cc_cdg.c (revision 274240) +++ user/dchagin/lemul/sys/netinet/cc/cc_cdg.c (revision 274241) @@ -1,700 +1,700 @@ /*- * Copyright (c) 2009-2013 * Swinburne University of Technology, Melbourne, Australia * All rights reserved. * * This software was developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, by David Hayes, made * possible in part by a gift from The Cisco University Research Program Fund, * a corporate advised fund of Silicon Valley Community Foundation. Development * and testing were further assisted by a grant from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * CAIA Delay-Gradient (CDG) congestion control algorithm * * An implemention of the delay-gradient congestion control algorithm proposed * in the following paper: * * D. A. Hayes and G. Armitage, "Revisiting TCP Congestion Control using Delay * Gradients", in IFIP Networking, Valencia, Spain, 9-13 May 2011. * * Developed as part of the NewTCP research project at Swinburne University of * Technology's Centre for Advanced Internet Architectures, Melbourne, * Australia. More details are available at: * http://caia.swin.edu.au/urp/newtcp/ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define CDG_VERSION "0.1" #define CAST_PTR_INT(X) (*((int*)(X))) /* Private delay-gradient induced congestion control signal. */ #define CC_CDG_DELAY 0x01000000 /* NewReno window deflation factor on loss (as a percentage). */ #define RENO_BETA 50 /* Queue states. */ #define CDG_Q_EMPTY 1 #define CDG_Q_RISING 2 #define CDG_Q_FALLING 3 #define CDG_Q_FULL 4 #define CDG_Q_UNKNOWN 9999 /* Number of bit shifts used in probexp lookup table. */ #define EXP_PREC 15 /* Largest gradient represented in probexp lookup table. */ #define MAXGRAD 5 /* * Delay Precision Enhance - number of bit shifts used for qtrend related * integer arithmetic precision. */ #define D_P_E 7 struct qdiff_sample { long qdiff; STAILQ_ENTRY(qdiff_sample) qdiff_lnk; }; struct cdg { long max_qtrend; long min_qtrend; STAILQ_HEAD(minrtts_head, qdiff_sample) qdiffmin_q; STAILQ_HEAD(maxrtts_head, qdiff_sample) qdiffmax_q; long window_incr; /* rttcount for window increase when in congestion avoidance */ long rtt_count; /* maximum measured rtt within an rtt period */ int maxrtt_in_rtt; /* maximum measured rtt within prev rtt period */ int maxrtt_in_prevrtt; /* minimum measured rtt within an rtt period */ int minrtt_in_rtt; /* minimum measured rtt within prev rtt period */ int minrtt_in_prevrtt; /* consecutive congestion episode counter */ uint32_t consec_cong_cnt; /* when tracking a new reno type loss window */ uint32_t shadow_w; /* maximum number of samples in the moving average queue */ int sample_q_size; /* number of samples in the moving average queue */ int num_samples; /* estimate of the queue state of the path */ int queue_state; }; /* * Lookup table for: * (1 - exp(-x)) << EXP_PREC, where x = [0,MAXGRAD] in 2^-7 increments * * Note: probexp[0] is set to 10 (not 0) as a safety for very low increase * gradients. */ static const int probexp[641] = { 10,255,508,759,1008,1255,1501,1744,1985,2225,2463,2698,2932,3165,3395,3624, 3850,4075,4299,4520,4740,4958,5175,5389,5602,5814,6024,6232,6438,6643,6846, 7048,7248,7447,7644,7839,8033,8226,8417,8606,8794,8981,9166,9350,9532,9713, 9892,10070,10247,10422,10596,10769,10940,11110,11278,11445,11611,11776,11939, 12101,12262,12422,12580,12737,12893,13048,13201,13354,13505,13655,13803,13951, 14097,14243,14387,14530,14672,14813,14952,15091,15229,15365,15500,15635,15768, 15900,16032,16162,16291,16419,16547,16673,16798,16922,17046,17168,17289,17410, 17529,17648,17766,17882,17998,18113,18227,18340,18453,18564,18675,18784,18893, 19001,19108,19215,19320,19425,19529,19632,19734,19835,19936,20036,20135,20233, 20331,20427,20523,20619,20713,20807,20900,20993,21084,21175,21265,21355,21444, 21532,21619,21706,21792,21878,21962,22046,22130,22213,22295,22376,22457,22537, 22617,22696,22774,22852,22929,23006,23082,23157,23232,23306,23380,23453,23525, 23597,23669,23739,23810,23879,23949,24017,24085,24153,24220,24286,24352,24418, 24483,24547,24611,24675,24738,24800,24862,24924,24985,25045,25106,25165,25224, 25283,25341,25399,25456,25513,25570,25626,25681,25737,25791,25846,25899,25953, 26006,26059,26111,26163,26214,26265,26316,26366,26416,26465,26514,26563,26611, 26659,26707,26754,26801,26847,26893,26939,26984,27029,27074,27118,27162,27206, 27249,27292,27335,27377,27419,27460,27502,27543,27583,27624,27664,27703,27743, 27782,27821,27859,27897,27935,27973,28010,28047,28084,28121,28157,28193,28228, 28263,28299,28333,28368,28402,28436,28470,28503,28536,28569,28602,28634,28667, 28699,28730,28762,28793,28824,28854,28885,28915,28945,28975,29004,29034,29063, 29092,29120,29149,29177,29205,29232,29260,29287,29314,29341,29368,29394,29421, 29447,29472,29498,29524,29549,29574,29599,29623,29648,29672,29696,29720,29744, 29767,29791,29814,29837,29860,29882,29905,29927,29949,29971,29993,30014,30036, 30057,30078,30099,30120,30141,30161,30181,30201,30221,30241,30261,30280,30300, 30319,30338,30357,30376,30394,30413,30431,30449,30467,30485,30503,30521,30538, 30555,30573,30590,30607,30624,30640,30657,30673,30690,30706,30722,30738,30753, 30769,30785,30800,30815,30831,30846,30861,30876,30890,30905,30919,30934,30948, 30962,30976,30990,31004,31018,31031,31045,31058,31072,31085,31098,31111,31124, 31137,31149,31162,31174,31187,31199,31211,31223,31235,31247,31259,31271,31283, 31294,31306,31317,31328,31339,31351,31362,31373,31383,31394,31405,31416,31426, 31436,31447,31457,31467,31477,31487,31497,31507,31517,31527,31537,31546,31556, 31565,31574,31584,31593,31602,31611,31620,31629,31638,31647,31655,31664,31673, 31681,31690,31698,31706,31715,31723,31731,31739,31747,31755,31763,31771,31778, 31786,31794,31801,31809,31816,31824,31831,31838,31846,31853,31860,31867,31874, 31881,31888,31895,31902,31908,31915,31922,31928,31935,31941,31948,31954,31960, 31967,31973,31979,31985,31991,31997,32003,32009,32015,32021,32027,32033,32038, 32044,32050,32055,32061,32066,32072,32077,32083,32088,32093,32098,32104,32109, 32114,32119,32124,32129,32134,32139,32144,32149,32154,32158,32163,32168,32173, 32177,32182,32186,32191,32195,32200,32204,32209,32213,32217,32222,32226,32230, 32234,32238,32242,32247,32251,32255,32259,32263,32267,32270,32274,32278,32282, 32286,32290,32293,32297,32301,32304,32308,32311,32315,32318,32322,32325,32329, 32332,32336,32339,32342,32346,32349,32352,32356,32359,32362,32365,32368,32371, 32374,32377,32381,32384,32387,32389,32392,32395,32398,32401,32404,32407,32410, 32412,32415,32418,32421,32423,32426,32429,32431,32434,32437,32439,32442,32444, 32447,32449,32452,32454,32457,32459,32461,32464,32466,32469,32471,32473,32476, 32478,32480,32482,32485,32487,32489,32491,32493,32495,32497,32500,32502,32504, 32506,32508,32510,32512,32514,32516,32518,32520,32522,32524,32526,32527,32529, 32531,32533,32535,32537,32538,32540,32542,32544,32545,32547}; static uma_zone_t qdiffsample_zone; static MALLOC_DEFINE(M_CDG, "cdg data", "Per connection data required for the CDG congestion control algorithm"); static int ertt_id; static VNET_DEFINE(uint32_t, cdg_alpha_inc); static VNET_DEFINE(uint32_t, cdg_beta_delay); static VNET_DEFINE(uint32_t, cdg_beta_loss); static VNET_DEFINE(uint32_t, cdg_smoothing_factor); static VNET_DEFINE(uint32_t, cdg_exp_backoff_scale); static VNET_DEFINE(uint32_t, cdg_consec_cong); static VNET_DEFINE(uint32_t, cdg_hold_backoff); #define V_cdg_alpha_inc VNET(cdg_alpha_inc) #define V_cdg_beta_delay VNET(cdg_beta_delay) #define V_cdg_beta_loss VNET(cdg_beta_loss) #define V_cdg_smoothing_factor VNET(cdg_smoothing_factor) #define V_cdg_exp_backoff_scale VNET(cdg_exp_backoff_scale) #define V_cdg_consec_cong VNET(cdg_consec_cong) #define V_cdg_hold_backoff VNET(cdg_hold_backoff) /* Function prototypes. */ static int cdg_mod_init(void); static int cdg_mod_destroy(void); static void cdg_conn_init(struct cc_var *ccv); static int cdg_cb_init(struct cc_var *ccv); static void cdg_cb_destroy(struct cc_var *ccv); static void cdg_cong_signal(struct cc_var *ccv, uint32_t signal_type); static void cdg_ack_received(struct cc_var *ccv, uint16_t ack_type); struct cc_algo cdg_cc_algo = { .name = "cdg", .mod_init = cdg_mod_init, .ack_received = cdg_ack_received, .cb_destroy = cdg_cb_destroy, .cb_init = cdg_cb_init, .conn_init = cdg_conn_init, .cong_signal = cdg_cong_signal, .mod_destroy = cdg_mod_destroy }; /* Vnet created and being initialised. */ static void cdg_init_vnet(const void *unused __unused) { V_cdg_alpha_inc = 0; V_cdg_beta_delay = 70; V_cdg_beta_loss = 50; V_cdg_smoothing_factor = 8; V_cdg_exp_backoff_scale = 3; V_cdg_consec_cong = 5; V_cdg_hold_backoff = 5; } static int cdg_mod_init(void) { VNET_ITERATOR_DECL(v); ertt_id = khelp_get_id("ertt"); if (ertt_id <= 0) return (EINVAL); qdiffsample_zone = uma_zcreate("cdg_qdiffsample", sizeof(struct qdiff_sample), NULL, NULL, NULL, NULL, 0, 0); VNET_LIST_RLOCK(); VNET_FOREACH(v) { CURVNET_SET(v); cdg_init_vnet(NULL); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); cdg_cc_algo.post_recovery = newreno_cc_algo.post_recovery; cdg_cc_algo.after_idle = newreno_cc_algo.after_idle; return (0); } static int cdg_mod_destroy(void) { uma_zdestroy(qdiffsample_zone); return (0); } static int cdg_cb_init(struct cc_var *ccv) { struct cdg *cdg_data; cdg_data = malloc(sizeof(struct cdg), M_CDG, M_NOWAIT); if (cdg_data == NULL) return (ENOMEM); cdg_data->shadow_w = 0; cdg_data->max_qtrend = 0; cdg_data->min_qtrend = 0; cdg_data->queue_state = CDG_Q_UNKNOWN; cdg_data->maxrtt_in_rtt = 0; cdg_data->maxrtt_in_prevrtt = 0; cdg_data->minrtt_in_rtt = INT_MAX; cdg_data->minrtt_in_prevrtt = 0; cdg_data->window_incr = 0; cdg_data->rtt_count = 0; cdg_data->consec_cong_cnt = 0; cdg_data->sample_q_size = V_cdg_smoothing_factor; cdg_data->num_samples = 0; STAILQ_INIT(&cdg_data->qdiffmin_q); STAILQ_INIT(&cdg_data->qdiffmax_q); ccv->cc_data = cdg_data; return (0); } static void cdg_conn_init(struct cc_var *ccv) { struct cdg *cdg_data = ccv->cc_data; /* * Initialise the shadow_cwnd in case we are competing with loss based * flows from the start */ cdg_data->shadow_w = CCV(ccv, snd_cwnd); } static void cdg_cb_destroy(struct cc_var *ccv) { struct cdg *cdg_data; struct qdiff_sample *qds, *qds_n; cdg_data = ccv->cc_data; qds = STAILQ_FIRST(&cdg_data->qdiffmin_q); while (qds != NULL) { qds_n = STAILQ_NEXT(qds, qdiff_lnk); uma_zfree(qdiffsample_zone,qds); qds = qds_n; } qds = STAILQ_FIRST(&cdg_data->qdiffmax_q); while (qds != NULL) { qds_n = STAILQ_NEXT(qds, qdiff_lnk); uma_zfree(qdiffsample_zone,qds); qds = qds_n; } free(ccv->cc_data, M_CDG); } static int cdg_beta_handler(SYSCTL_HANDLER_ARGS) { if (req->newptr != NULL && (CAST_PTR_INT(req->newptr) == 0 || CAST_PTR_INT(req->newptr) > 100)) return (EINVAL); return (sysctl_handle_int(oidp, arg1, arg2, req)); } static int cdg_exp_backoff_scale_handler(SYSCTL_HANDLER_ARGS) { if (req->newptr != NULL && CAST_PTR_INT(req->newptr) < 1) return (EINVAL); return (sysctl_handle_int(oidp, arg1, arg2, req)); } static inline unsigned long cdg_window_decrease(struct cc_var *ccv, unsigned long owin, unsigned int beta) { return ((ulmin(CCV(ccv, snd_wnd), owin) * beta) / 100); } /* * Window increase function * This window increase function is independent of the initial window size * to ensure small window flows are not discriminated against (i.e. fairness). * It increases at 1pkt/rtt like Reno for alpha_inc rtts, and then 2pkts/rtt for * the next alpha_inc rtts, etc. */ static void cdg_window_increase(struct cc_var *ccv, int new_measurement) { struct cdg *cdg_data; int incr, s_w_incr; cdg_data = ccv->cc_data; incr = s_w_incr = 0; if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh)) { /* Slow start. */ incr = CCV(ccv, t_maxseg); s_w_incr = incr; cdg_data->window_incr = cdg_data->rtt_count = 0; } else { /* Congestion avoidance. */ if (new_measurement) { s_w_incr = CCV(ccv, t_maxseg); if (V_cdg_alpha_inc == 0) { incr = CCV(ccv, t_maxseg); } else { if (++cdg_data->rtt_count >= V_cdg_alpha_inc) { cdg_data->window_incr++; cdg_data->rtt_count = 0; } incr = CCV(ccv, t_maxseg) * cdg_data->window_incr; } } } if (cdg_data->shadow_w > 0) cdg_data->shadow_w = ulmin(cdg_data->shadow_w + s_w_incr, TCP_MAXWIN << CCV(ccv, snd_scale)); CCV(ccv, snd_cwnd) = ulmin(CCV(ccv, snd_cwnd) + incr, TCP_MAXWIN << CCV(ccv, snd_scale)); } static void cdg_cong_signal(struct cc_var *ccv, uint32_t signal_type) { struct cdg *cdg_data = ccv->cc_data; switch(signal_type) { case CC_CDG_DELAY: CCV(ccv, snd_ssthresh) = cdg_window_decrease(ccv, CCV(ccv, snd_cwnd), V_cdg_beta_delay); CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh); CCV(ccv, snd_recover) = CCV(ccv, snd_max); cdg_data->window_incr = cdg_data->rtt_count = 0; ENTER_CONGRECOVERY(CCV(ccv, t_flags)); break; case CC_NDUPACK: /* * If already responding to congestion OR we have guessed no * queue in the path is full. */ if (IN_CONGRECOVERY(CCV(ccv, t_flags)) || cdg_data->queue_state < CDG_Q_FULL) { CCV(ccv, snd_ssthresh) = CCV(ccv, snd_cwnd); CCV(ccv, snd_recover) = CCV(ccv, snd_max); } else { /* * Loss is likely to be congestion related. We have * inferred a queue full state, so have shadow window * react to loss as NewReno would. */ if (cdg_data->shadow_w > 0) cdg_data->shadow_w = cdg_window_decrease(ccv, cdg_data->shadow_w, RENO_BETA); CCV(ccv, snd_ssthresh) = ulmax(cdg_data->shadow_w, cdg_window_decrease(ccv, CCV(ccv, snd_cwnd), V_cdg_beta_loss)); cdg_data->window_incr = cdg_data->rtt_count = 0; } ENTER_RECOVERY(CCV(ccv, t_flags)); break; default: newreno_cc_algo.cong_signal(ccv, signal_type); break; } } /* * Using a negative exponential probabilistic backoff so that sources with * varying RTTs which share the same link will, on average, have the same * probability of backoff over time. * * Prob_backoff = 1 - exp(-qtrend / V_cdg_exp_backoff_scale), where * V_cdg_exp_backoff_scale is the average qtrend for the exponential backoff. */ static inline int prob_backoff(long qtrend) { int backoff, idx, p; backoff = (qtrend > ((MAXGRAD * V_cdg_exp_backoff_scale) << D_P_E)); if (!backoff) { if (V_cdg_exp_backoff_scale > 1) idx = (qtrend + V_cdg_exp_backoff_scale / 2) / V_cdg_exp_backoff_scale; else idx = qtrend; /* Backoff probability proportional to rate of queue growth. */ p = (INT_MAX / (1 << EXP_PREC)) * probexp[idx]; backoff = (random() < p); } return (backoff); } static inline void calc_moving_average(struct cdg *cdg_data, long qdiff_max, long qdiff_min) { struct qdiff_sample *qds; ++cdg_data->num_samples; if (cdg_data->num_samples > cdg_data->sample_q_size) { /* Minimum RTT. */ qds = STAILQ_FIRST(&cdg_data->qdiffmin_q); cdg_data->min_qtrend = cdg_data->min_qtrend + (qdiff_min - qds->qdiff) / cdg_data->sample_q_size; STAILQ_REMOVE_HEAD(&cdg_data->qdiffmin_q, qdiff_lnk); qds->qdiff = qdiff_min; STAILQ_INSERT_TAIL(&cdg_data->qdiffmin_q, qds, qdiff_lnk); /* Maximum RTT. */ qds = STAILQ_FIRST(&cdg_data->qdiffmax_q); cdg_data->max_qtrend = cdg_data->max_qtrend + (qdiff_max - qds->qdiff) / cdg_data->sample_q_size; STAILQ_REMOVE_HEAD(&cdg_data->qdiffmax_q, qdiff_lnk); qds->qdiff = qdiff_max; STAILQ_INSERT_TAIL(&cdg_data->qdiffmax_q, qds, qdiff_lnk); --cdg_data->num_samples; } else { qds = uma_zalloc(qdiffsample_zone, M_NOWAIT); if (qds != NULL) { cdg_data->min_qtrend = cdg_data->min_qtrend + qdiff_min / cdg_data->sample_q_size; qds->qdiff = qdiff_min; STAILQ_INSERT_TAIL(&cdg_data->qdiffmin_q, qds, qdiff_lnk); } qds = uma_zalloc(qdiffsample_zone, M_NOWAIT); if (qds) { cdg_data->max_qtrend = cdg_data->max_qtrend + qdiff_max / cdg_data->sample_q_size; qds->qdiff = qdiff_max; STAILQ_INSERT_TAIL(&cdg_data->qdiffmax_q, qds, qdiff_lnk); } } } static void cdg_ack_received(struct cc_var *ccv, uint16_t ack_type) { struct cdg *cdg_data; struct ertt *e_t; long qdiff_max, qdiff_min; int congestion, new_measurement, slowstart; cdg_data = ccv->cc_data; e_t = (struct ertt *)khelp_get_osd(CCV(ccv, osd), ertt_id); new_measurement = e_t->flags & ERTT_NEW_MEASUREMENT; congestion = 0; cdg_data->maxrtt_in_rtt = imax(e_t->rtt, cdg_data->maxrtt_in_rtt); cdg_data->minrtt_in_rtt = imin(e_t->rtt, cdg_data->minrtt_in_rtt); if (new_measurement) { slowstart = (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh)); /* * Update smoothed gradient measurements. Since we are only * using one measurement per RTT, use max or min rtt_in_rtt. * This is also less noisy than a sample RTT measurement. Max * RTT measurements can have trouble due to OS issues. */ if (cdg_data->maxrtt_in_prevrtt) { qdiff_max = ((long)(cdg_data->maxrtt_in_rtt - cdg_data->maxrtt_in_prevrtt) << D_P_E ); qdiff_min = ((long)(cdg_data->minrtt_in_rtt - cdg_data->minrtt_in_prevrtt) << D_P_E ); calc_moving_average(cdg_data, qdiff_max, qdiff_min); /* Probabilistic backoff with respect to gradient. */ if (slowstart && qdiff_min > 0) congestion = prob_backoff(qdiff_min); else if (cdg_data->min_qtrend > 0) congestion = prob_backoff(cdg_data->min_qtrend); else if (slowstart && qdiff_max > 0) congestion = prob_backoff(qdiff_max); else if (cdg_data->max_qtrend > 0) congestion = prob_backoff(cdg_data->max_qtrend); /* Update estimate of queue state. */ if (cdg_data->min_qtrend > 0 && cdg_data->max_qtrend <= 0) { cdg_data->queue_state = CDG_Q_FULL; } else if (cdg_data->min_qtrend >= 0 && cdg_data->max_qtrend < 0) { cdg_data->queue_state = CDG_Q_EMPTY; cdg_data->shadow_w = 0; } else if (cdg_data->min_qtrend > 0 && cdg_data->max_qtrend > 0) { cdg_data->queue_state = CDG_Q_RISING; } else if (cdg_data->min_qtrend < 0 && cdg_data->max_qtrend < 0) { cdg_data->queue_state = CDG_Q_FALLING; } if (cdg_data->min_qtrend < 0 || cdg_data->max_qtrend < 0) cdg_data->consec_cong_cnt = 0; } cdg_data->minrtt_in_prevrtt = cdg_data->minrtt_in_rtt; cdg_data->minrtt_in_rtt = INT_MAX; cdg_data->maxrtt_in_prevrtt = cdg_data->maxrtt_in_rtt; cdg_data->maxrtt_in_rtt = 0; e_t->flags &= ~ERTT_NEW_MEASUREMENT; } if (congestion) { cdg_data->consec_cong_cnt++; if (!IN_RECOVERY(CCV(ccv, t_flags))) { if (cdg_data->consec_cong_cnt <= V_cdg_consec_cong) cdg_cong_signal(ccv, CC_CDG_DELAY); else /* * We have been backing off but the queue is not * falling. Assume we are competing with * loss-based flows and don't back off for the * next V_cdg_hold_backoff RTT periods. */ if (cdg_data->consec_cong_cnt >= V_cdg_consec_cong + V_cdg_hold_backoff) cdg_data->consec_cong_cnt = 0; /* Won't see effect until 2nd RTT. */ cdg_data->maxrtt_in_prevrtt = 0; /* * Resync shadow window in case we are competing with a * loss based flow */ cdg_data->shadow_w = ulmax(CCV(ccv, snd_cwnd), cdg_data->shadow_w); } } else if (ack_type == CC_ACK) cdg_window_increase(ccv, new_measurement); } /* When a vnet is created and being initialised, init the per-stack CDG vars. */ VNET_SYSINIT(cdg_init_vnet, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST, cdg_init_vnet, NULL); SYSCTL_DECL(_net_inet_tcp_cc_cdg); SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, cdg, CTLFLAG_RW, NULL, "CAIA delay-gradient congestion control related settings"); SYSCTL_STRING(_net_inet_tcp_cc_cdg, OID_AUTO, version, CTLFLAG_RD, CDG_VERSION, sizeof(CDG_VERSION) - 1, "Current algorithm/implementation version number"); -SYSCTL_VNET_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, alpha_inc, - CTLFLAG_RW, &VNET_NAME(cdg_alpha_inc), 0, +SYSCTL_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, alpha_inc, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cdg_alpha_inc), 0, "Increment the window increase factor alpha by 1 MSS segment every " "alpha_inc RTTs during congestion avoidance mode."); -SYSCTL_VNET_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, beta_delay, - CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(cdg_beta_delay), 70, +SYSCTL_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, beta_delay, + CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, &VNET_NAME(cdg_beta_delay), 70, &cdg_beta_handler, "IU", "Delay-based window decrease factor as a percentage " "(on delay-based backoff, w = w * beta_delay / 100)"); -SYSCTL_VNET_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, beta_loss, - CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(cdg_beta_loss), 50, +SYSCTL_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, beta_loss, + CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, &VNET_NAME(cdg_beta_loss), 50, &cdg_beta_handler, "IU", "Loss-based window decrease factor as a percentage " "(on loss-based backoff, w = w * beta_loss / 100)"); -SYSCTL_VNET_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, exp_backoff_scale, - CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(cdg_exp_backoff_scale), 2, - &cdg_exp_backoff_scale_handler, "IU", +SYSCTL_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, exp_backoff_scale, + CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, + &VNET_NAME(cdg_exp_backoff_scale), 2, &cdg_exp_backoff_scale_handler, "IU", "Scaling parameter for the probabilistic exponential backoff"); -SYSCTL_VNET_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, smoothing_factor, - CTLFLAG_RW, &VNET_NAME(cdg_smoothing_factor), 8, +SYSCTL_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, smoothing_factor, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cdg_smoothing_factor), 8, "Number of samples used for moving average smoothing (0 = no smoothing)"); -SYSCTL_VNET_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, loss_compete_consec_cong, - CTLFLAG_RW, &VNET_NAME(cdg_consec_cong), 5, +SYSCTL_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, loss_compete_consec_cong, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cdg_consec_cong), 5, "Number of consecutive delay-gradient based congestion episodes which will " "trigger loss based CC compatibility"); -SYSCTL_VNET_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, loss_compete_hold_backoff, - CTLFLAG_RW, &VNET_NAME(cdg_hold_backoff), 5, +SYSCTL_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, loss_compete_hold_backoff, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cdg_hold_backoff), 5, "Number of consecutive delay-gradient based congestion episodes to hold " "the window backoff for loss based CC compatibility"); DECLARE_CC_MODULE(cdg, &cdg_cc_algo); MODULE_DEPEND(cdg, ertt, 1, 1, 1); Index: user/dchagin/lemul/sys/netinet/cc/cc_chd.c =================================================================== --- user/dchagin/lemul/sys/netinet/cc/cc_chd.c (revision 274240) +++ user/dchagin/lemul/sys/netinet/cc/cc_chd.c (revision 274241) @@ -1,496 +1,499 @@ /*- * Copyright (c) 2009-2010 * Swinburne University of Technology, Melbourne, Australia * Copyright (c) 2010-2011 The FreeBSD Foundation * All rights reserved. * * This software was developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, by David Hayes and * Lawrence Stewart, made possible in part by a grant from the Cisco University * Research Program Fund at Community Foundation Silicon Valley. * * Portions of this software were developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, Melbourne, Australia by * David Hayes under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * An implementation of the CAIA-Hamilton delay based congestion control * algorithm, based on "Improved coexistence and loss tolerance for delay based * TCP congestion control" by D. A. Hayes and G. Armitage., in 35th Annual IEEE * Conference on Local Computer Networks (LCN 2010), Denver, Colorado, USA, * 11-14 October 2010. * * Originally released as part of the NewTCP research project at Swinburne * University of Technology's Centre for Advanced Internet Architectures, * Melbourne, Australia, which was made possible in part by a grant from the * Cisco University Research Program Fund at Community Foundation Silicon * Valley. More details are available at: * http://caia.swin.edu.au/urp/newtcp/ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define CAST_PTR_INT(X) (*((int*)(X))) /* * Private signal type for rate based congestion signal. * See for appropriate bit-range to use for private signals. */ #define CC_CHD_DELAY 0x02000000 /* Largest possible number returned by random(). */ #define RANDOM_MAX INT_MAX static void chd_ack_received(struct cc_var *ccv, uint16_t ack_type); static void chd_cb_destroy(struct cc_var *ccv); static int chd_cb_init(struct cc_var *ccv); static void chd_cong_signal(struct cc_var *ccv, uint32_t signal_type); static void chd_conn_init(struct cc_var *ccv); static int chd_mod_init(void); struct chd { /* * Shadow window - keeps track of what the NewReno congestion window * would have been if delay-based cwnd backoffs had not been made. This * functionality aids coexistence with loss-based TCP flows which may be * sharing links along the path. */ unsigned long shadow_w; /* * Loss-based TCP compatibility flag - When set, it turns on the shadow * window functionality. */ int loss_compete; /* The maximum round trip time seen within a measured rtt period. */ int maxrtt_in_rtt; /* The previous qdly that caused cwnd to backoff. */ int prev_backoff_qdly; }; static int ertt_id; static VNET_DEFINE(uint32_t, chd_qmin) = 5; static VNET_DEFINE(uint32_t, chd_pmax) = 50; static VNET_DEFINE(uint32_t, chd_loss_fair) = 1; static VNET_DEFINE(uint32_t, chd_use_max) = 1; static VNET_DEFINE(uint32_t, chd_qthresh) = 20; #define V_chd_qthresh VNET(chd_qthresh) #define V_chd_qmin VNET(chd_qmin) #define V_chd_pmax VNET(chd_pmax) #define V_chd_loss_fair VNET(chd_loss_fair) #define V_chd_use_max VNET(chd_use_max) static MALLOC_DEFINE(M_CHD, "chd data", "Per connection data required for the CHD congestion control algorithm"); struct cc_algo chd_cc_algo = { .name = "chd", .ack_received = chd_ack_received, .cb_destroy = chd_cb_destroy, .cb_init = chd_cb_init, .cong_signal = chd_cong_signal, .conn_init = chd_conn_init, .mod_init = chd_mod_init }; static __inline void chd_window_decrease(struct cc_var *ccv) { unsigned long win; win = min(CCV(ccv, snd_wnd), CCV(ccv, snd_cwnd)) / CCV(ccv, t_maxseg); win -= max((win / 2), 1); CCV(ccv, snd_ssthresh) = max(win, 2) * CCV(ccv, t_maxseg); } /* * Probabilistic backoff function. Returns 1 if we should backoff or 0 * otherwise. The calculation of p is similar to the calculation of p in cc_hd. */ static __inline int should_backoff(int qdly, int maxqdly, struct chd *chd_data) { unsigned long p, rand; rand = random(); if (qdly < V_chd_qthresh) { chd_data->loss_compete = 0; p = (((RANDOM_MAX / 100) * V_chd_pmax) / (V_chd_qthresh - V_chd_qmin)) * (qdly - V_chd_qmin); } else { if (qdly > V_chd_qthresh) { p = (((RANDOM_MAX / 100) * V_chd_pmax) / (maxqdly - V_chd_qthresh)) * (maxqdly - qdly); if (V_chd_loss_fair && rand < p) chd_data->loss_compete = 1; } else { p = (RANDOM_MAX / 100) * V_chd_pmax; chd_data->loss_compete = 0; } } return (rand < p); } static __inline void chd_window_increase(struct cc_var *ccv, int new_measurement) { struct chd *chd_data; int incr; chd_data = ccv->cc_data; incr = 0; if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh)) { /* Adapted from NewReno slow start. */ if (V_tcp_do_rfc3465) { /* In slow-start with ABC enabled. */ if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max)) { /* Not due to RTO. */ incr = min(ccv->bytes_this_ack, V_tcp_abc_l_var * CCV(ccv, t_maxseg)); } else { /* Due to RTO. */ incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg)); } } else incr = CCV(ccv, t_maxseg); } else { /* Congestion avoidance. */ if (V_tcp_do_rfc3465) { if (ccv->flags & CCF_ABC_SENTAWND) { ccv->flags &= ~CCF_ABC_SENTAWND; incr = CCV(ccv, t_maxseg); } } else if (new_measurement) incr = CCV(ccv, t_maxseg); } if (chd_data->shadow_w > 0) { /* Track NewReno window. */ chd_data->shadow_w = min(chd_data->shadow_w + incr, TCP_MAXWIN << CCV(ccv, snd_scale)); } CCV(ccv,snd_cwnd) = min(CCV(ccv, snd_cwnd) + incr, TCP_MAXWIN << CCV(ccv, snd_scale)); } /* * All ACK signals are used for timing measurements to determine delay-based * congestion. However, window increases are only performed when * ack_type == CC_ACK. */ static void chd_ack_received(struct cc_var *ccv, uint16_t ack_type) { struct chd *chd_data; struct ertt *e_t; int backoff, new_measurement, qdly, rtt; e_t = khelp_get_osd(CCV(ccv, osd), ertt_id); chd_data = ccv->cc_data; new_measurement = e_t->flags & ERTT_NEW_MEASUREMENT; backoff = qdly = 0; chd_data->maxrtt_in_rtt = imax(e_t->rtt, chd_data->maxrtt_in_rtt); if (new_measurement) { /* * There is a new per RTT measurement, so check to see if there * is delay based congestion. */ rtt = V_chd_use_max ? chd_data->maxrtt_in_rtt : e_t->rtt; chd_data->maxrtt_in_rtt = 0; if (rtt && e_t->minrtt && !IN_RECOVERY(CCV(ccv, t_flags))) { qdly = rtt - e_t->minrtt; if (qdly > V_chd_qmin) { /* * Probabilistic delay based congestion * indication. */ backoff = should_backoff(qdly, e_t->maxrtt - e_t->minrtt, chd_data); } else chd_data->loss_compete = 0; } /* Reset per RTT measurement flag to start a new measurement. */ e_t->flags &= ~ERTT_NEW_MEASUREMENT; } if (backoff) { /* * Update shadow_w before delay based backoff. */ if (chd_data->loss_compete || qdly > chd_data->prev_backoff_qdly) { /* * Delay is higher than when we backed off previously, * so it is possible that this flow is competing with * loss based flows. */ chd_data->shadow_w = max(CCV(ccv, snd_cwnd), chd_data->shadow_w); } else { /* * Reset shadow_w, as it is probable that this flow is * not competing with loss based flows at the moment. */ chd_data->shadow_w = 0; } chd_data->prev_backoff_qdly = qdly; /* * Send delay-based congestion signal to the congestion signal * handler. */ chd_cong_signal(ccv, CC_CHD_DELAY); } else if (ack_type == CC_ACK) chd_window_increase(ccv, new_measurement); } static void chd_cb_destroy(struct cc_var *ccv) { if (ccv->cc_data != NULL) free(ccv->cc_data, M_CHD); } static int chd_cb_init(struct cc_var *ccv) { struct chd *chd_data; chd_data = malloc(sizeof(struct chd), M_CHD, M_NOWAIT); if (chd_data == NULL) return (ENOMEM); chd_data->shadow_w = 0; ccv->cc_data = chd_data; return (0); } static void chd_cong_signal(struct cc_var *ccv, uint32_t signal_type) { struct ertt *e_t; struct chd *chd_data; int qdly; e_t = khelp_get_osd(CCV(ccv, osd), ertt_id); chd_data = ccv->cc_data; qdly = imax(e_t->rtt, chd_data->maxrtt_in_rtt) - e_t->minrtt; switch(signal_type) { case CC_CHD_DELAY: chd_window_decrease(ccv); /* Set new ssthresh. */ CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh); CCV(ccv, snd_recover) = CCV(ccv, snd_max); ENTER_CONGRECOVERY(CCV(ccv, t_flags)); break; case CC_NDUPACK: /* Packet loss. */ /* * Only react to loss as a congestion signal if qdly > * V_chd_qthresh. If qdly is less than qthresh, presume that * this is a non congestion related loss. If qdly is greater * than qthresh, assume that we are competing with loss based * tcp flows and restore window from any unnecessary backoffs, * before the decrease. */ if (!IN_RECOVERY(CCV(ccv, t_flags)) && qdly > V_chd_qthresh) { if (chd_data->loss_compete) { CCV(ccv, snd_cwnd) = max(CCV(ccv, snd_cwnd), chd_data->shadow_w); } chd_window_decrease(ccv); } else { /* * This loss isn't congestion related, or already * recovering from congestion. */ CCV(ccv, snd_ssthresh) = CCV(ccv, snd_cwnd); CCV(ccv, snd_recover) = CCV(ccv, snd_max); } if (chd_data->shadow_w > 0) { chd_data->shadow_w = max(chd_data->shadow_w / CCV(ccv, t_maxseg) / 2, 2) * CCV(ccv, t_maxseg); } ENTER_FASTRECOVERY(CCV(ccv, t_flags)); break; default: newreno_cc_algo.cong_signal(ccv, signal_type); } } static void chd_conn_init(struct cc_var *ccv) { struct chd *chd_data; chd_data = ccv->cc_data; chd_data->prev_backoff_qdly = 0; chd_data->maxrtt_in_rtt = 0; chd_data->loss_compete = 0; /* * Initialise the shadow_cwnd to be equal to snd_cwnd in case we are * competing with loss based flows from the start. */ chd_data->shadow_w = CCV(ccv, snd_cwnd); } static int chd_mod_init(void) { ertt_id = khelp_get_id("ertt"); if (ertt_id <= 0) { printf("%s: h_ertt module not found\n", __func__); return (ENOENT); } chd_cc_algo.after_idle = newreno_cc_algo.after_idle; chd_cc_algo.post_recovery = newreno_cc_algo.post_recovery; return (0); } static int chd_loss_fair_handler(SYSCTL_HANDLER_ARGS) { int error; uint32_t new; new = V_chd_loss_fair; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr != NULL) { if (CAST_PTR_INT(req->newptr) > 1) error = EINVAL; else V_chd_loss_fair = new; } return (error); } static int chd_pmax_handler(SYSCTL_HANDLER_ARGS) { int error; uint32_t new; new = V_chd_pmax; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr != NULL) { if (CAST_PTR_INT(req->newptr) == 0 || CAST_PTR_INT(req->newptr) > 100) error = EINVAL; else V_chd_pmax = new; } return (error); } static int chd_qthresh_handler(SYSCTL_HANDLER_ARGS) { int error; uint32_t new; new = V_chd_qthresh; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr != NULL) { if (CAST_PTR_INT(req->newptr) <= V_chd_qmin) error = EINVAL; else V_chd_qthresh = new; } return (error); } SYSCTL_DECL(_net_inet_tcp_cc_chd); SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, chd, CTLFLAG_RW, NULL, "CAIA Hamilton delay-based congestion control related settings"); -SYSCTL_VNET_PROC(_net_inet_tcp_cc_chd, OID_AUTO, loss_fair, - CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(chd_loss_fair), 1, &chd_loss_fair_handler, +SYSCTL_PROC(_net_inet_tcp_cc_chd, OID_AUTO, loss_fair, + CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, + &VNET_NAME(chd_loss_fair), 1, &chd_loss_fair_handler, "IU", "Flag to enable shadow window functionality."); -SYSCTL_VNET_PROC(_net_inet_tcp_cc_chd, OID_AUTO, pmax, - CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(chd_pmax), 5, &chd_pmax_handler, +SYSCTL_PROC(_net_inet_tcp_cc_chd, OID_AUTO, pmax, + CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, + &VNET_NAME(chd_pmax), 5, &chd_pmax_handler, "IU", "Per RTT maximum backoff probability as a percentage"); -SYSCTL_VNET_PROC(_net_inet_tcp_cc_chd, OID_AUTO, queue_threshold, - CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(chd_qthresh), 20, &chd_qthresh_handler, +SYSCTL_PROC(_net_inet_tcp_cc_chd, OID_AUTO, queue_threshold, + CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, + &VNET_NAME(chd_qthresh), 20, &chd_qthresh_handler, "IU", "Queueing congestion threshold in ticks"); -SYSCTL_VNET_UINT(_net_inet_tcp_cc_chd, OID_AUTO, queue_min, - CTLFLAG_RW, &VNET_NAME(chd_qmin), 5, +SYSCTL_UINT(_net_inet_tcp_cc_chd, OID_AUTO, queue_min, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(chd_qmin), 5, "Minimum queueing delay threshold in ticks"); -SYSCTL_VNET_UINT(_net_inet_tcp_cc_chd, OID_AUTO, use_max, - CTLFLAG_RW, &VNET_NAME(chd_use_max), 1, +SYSCTL_UINT(_net_inet_tcp_cc_chd, OID_AUTO, use_max, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(chd_use_max), 1, "Use the maximum RTT seen within the measurement period (RTT) " "as the basic delay measurement for the algorithm."); DECLARE_CC_MODULE(chd, &chd_cc_algo); MODULE_DEPEND(chd, ertt, 1, 1, 1); Index: user/dchagin/lemul/sys/netinet/cc/cc_hd.c =================================================================== --- user/dchagin/lemul/sys/netinet/cc/cc_hd.c (revision 274240) +++ user/dchagin/lemul/sys/netinet/cc/cc_hd.c (revision 274241) @@ -1,254 +1,255 @@ /*- * Copyright (c) 2009-2010 * Swinburne University of Technology, Melbourne, Australia * Copyright (c) 2010 Lawrence Stewart * Copyright (c) 2010-2011 The FreeBSD Foundation * All rights reserved. * * This software was developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, by David Hayes and * Lawrence Stewart, made possible in part by a grant from the Cisco University * Research Program Fund at Community Foundation Silicon Valley. * * Portions of this software were developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, Melbourne, Australia by * David Hayes under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * An implementation of the Hamilton Institute's delay-based congestion control * algorithm for FreeBSD, based on "A strategy for fair coexistence of loss and * delay-based congestion control algorithms," by L. Budzisz, R. Stanojevic, R. * Shorten, and F. Baker, IEEE Commun. Lett., vol. 13, no. 7, pp. 555--557, Jul. * 2009. * * Originally released as part of the NewTCP research project at Swinburne * University of Technology's Centre for Advanced Internet Architectures, * Melbourne, Australia, which was made possible in part by a grant from the * Cisco University Research Program Fund at Community Foundation Silicon * Valley. More details are available at: * http://caia.swin.edu.au/urp/newtcp/ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define CAST_PTR_INT(X) (*((int*)(X))) /* Largest possible number returned by random(). */ #define RANDOM_MAX INT_MAX static void hd_ack_received(struct cc_var *ccv, uint16_t ack_type); static int hd_mod_init(void); static int ertt_id; static VNET_DEFINE(uint32_t, hd_qthresh) = 20; static VNET_DEFINE(uint32_t, hd_qmin) = 5; static VNET_DEFINE(uint32_t, hd_pmax) = 5; #define V_hd_qthresh VNET(hd_qthresh) #define V_hd_qmin VNET(hd_qmin) #define V_hd_pmax VNET(hd_pmax) struct cc_algo hd_cc_algo = { .name = "hd", .ack_received = hd_ack_received, .mod_init = hd_mod_init }; /* * Hamilton backoff function. Returns 1 if we should backoff or 0 otherwise. */ static __inline int should_backoff(int qdly, int maxqdly) { unsigned long p; if (qdly < V_hd_qthresh) { p = (((RANDOM_MAX / 100) * V_hd_pmax) / (V_hd_qthresh - V_hd_qmin)) * (qdly - V_hd_qmin); } else { if (qdly > V_hd_qthresh) p = (((RANDOM_MAX / 100) * V_hd_pmax) / (maxqdly - V_hd_qthresh)) * (maxqdly - qdly); else p = (RANDOM_MAX / 100) * V_hd_pmax; } return (random() < p); } /* * If the ack type is CC_ACK, and the inferred queueing delay is greater than * the Qmin threshold, cwnd is reduced probabilistically. When backing off due * to delay, HD behaves like NewReno when an ECN signal is received. HD behaves * as NewReno in all other circumstances. */ static void hd_ack_received(struct cc_var *ccv, uint16_t ack_type) { struct ertt *e_t; int qdly; if (ack_type == CC_ACK) { e_t = khelp_get_osd(CCV(ccv, osd), ertt_id); if (e_t->rtt && e_t->minrtt && V_hd_qthresh > 0) { qdly = e_t->rtt - e_t->minrtt; if (qdly > V_hd_qmin && !IN_RECOVERY(CCV(ccv, t_flags))) { /* Probabilistic backoff of cwnd. */ if (should_backoff(qdly, e_t->maxrtt - e_t->minrtt)) { /* * Update cwnd and ssthresh update to * half cwnd and behave like an ECN (ie * not a packet loss). */ newreno_cc_algo.cong_signal(ccv, CC_ECN); return; } } } } newreno_cc_algo.ack_received(ccv, ack_type); /* As for NewReno. */ } static int hd_mod_init(void) { ertt_id = khelp_get_id("ertt"); if (ertt_id <= 0) { printf("%s: h_ertt module not found\n", __func__); return (ENOENT); } hd_cc_algo.after_idle = newreno_cc_algo.after_idle; hd_cc_algo.cong_signal = newreno_cc_algo.cong_signal; hd_cc_algo.post_recovery = newreno_cc_algo.post_recovery; return (0); } static int hd_pmax_handler(SYSCTL_HANDLER_ARGS) { int error; uint32_t new; new = V_hd_pmax; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr != NULL) { if (CAST_PTR_INT(req->newptr) == 0 || CAST_PTR_INT(req->newptr) > 100) error = EINVAL; else V_hd_pmax = new; } return (error); } static int hd_qmin_handler(SYSCTL_HANDLER_ARGS) { int error; uint32_t new; new = V_hd_qmin; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr != NULL) { if (CAST_PTR_INT(req->newptr) > V_hd_qthresh) error = EINVAL; else V_hd_qmin = new; } return (error); } static int hd_qthresh_handler(SYSCTL_HANDLER_ARGS) { int error; uint32_t new; new = V_hd_qthresh; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr != NULL) { if (CAST_PTR_INT(req->newptr) < 1 || CAST_PTR_INT(req->newptr) < V_hd_qmin) error = EINVAL; else V_hd_qthresh = new; } return (error); } SYSCTL_DECL(_net_inet_tcp_cc_hd); SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, hd, CTLFLAG_RW, NULL, "Hamilton delay-based congestion control related settings"); -SYSCTL_VNET_PROC(_net_inet_tcp_cc_hd, OID_AUTO, queue_threshold, - CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(hd_qthresh), 20, &hd_qthresh_handler, - "IU", "queueing congestion threshold (qth) in ticks"); +SYSCTL_PROC(_net_inet_tcp_cc_hd, OID_AUTO, queue_threshold, + CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, &VNET_NAME(hd_qthresh), 20, + &hd_qthresh_handler, "IU", "queueing congestion threshold (qth) in ticks"); -SYSCTL_VNET_PROC(_net_inet_tcp_cc_hd, OID_AUTO, pmax, - CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(hd_pmax), 5, &hd_pmax_handler, - "IU", "per packet maximum backoff probability as a percentage"); +SYSCTL_PROC(_net_inet_tcp_cc_hd, OID_AUTO, pmax, + CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, &VNET_NAME(hd_pmax), 5, + &hd_pmax_handler, "IU", + "per packet maximum backoff probability as a percentage"); -SYSCTL_VNET_PROC(_net_inet_tcp_cc_hd, OID_AUTO, queue_min, - CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(hd_qmin), 5, &hd_qmin_handler, - "IU", "minimum queueing delay threshold (qmin) in ticks"); +SYSCTL_PROC(_net_inet_tcp_cc_hd, OID_AUTO, queue_min, + CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, &VNET_NAME(hd_qmin), 5, + &hd_qmin_handler, "IU", "minimum queueing delay threshold (qmin) in ticks"); DECLARE_CC_MODULE(hd, &hd_cc_algo); MODULE_DEPEND(hd, ertt, 1, 1, 1); Index: user/dchagin/lemul/sys/netinet/cc/cc_htcp.c =================================================================== --- user/dchagin/lemul/sys/netinet/cc/cc_htcp.c (revision 274240) +++ user/dchagin/lemul/sys/netinet/cc/cc_htcp.c (revision 274241) @@ -1,520 +1,522 @@ /*- * Copyright (c) 2007-2008 * Swinburne University of Technology, Melbourne, Australia * Copyright (c) 2009-2010 Lawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation * All rights reserved. * * This software was developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, by Lawrence Stewart and * James Healy, made possible in part by a grant from the Cisco University * Research Program Fund at Community Foundation Silicon Valley. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by David Hayes under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * An implementation of the H-TCP congestion control algorithm for FreeBSD, * based on the Internet Draft "draft-leith-tcp-htcp-06.txt" by Leith and * Shorten. Originally released as part of the NewTCP research project at * Swinburne University of Technology's Centre for Advanced Internet * Architectures, Melbourne, Australia, which was made possible in part by a * grant from the Cisco University Research Program Fund at Community Foundation * Silicon Valley. More details are available at: * http://caia.swin.edu.au/urp/newtcp/ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Fixed point math shifts. */ #define HTCP_SHIFT 8 #define HTCP_ALPHA_INC_SHIFT 4 #define HTCP_INIT_ALPHA 1 #define HTCP_DELTA_L hz /* 1 sec in ticks. */ #define HTCP_MINBETA 128 /* 0.5 << HTCP_SHIFT. */ #define HTCP_MAXBETA 204 /* ~0.8 << HTCP_SHIFT. */ #define HTCP_MINROWE 26 /* ~0.1 << HTCP_SHIFT. */ #define HTCP_MAXROWE 512 /* 2 << HTCP_SHIFT. */ /* RTT_ref (ms) used in the calculation of alpha if RTT scaling is enabled. */ #define HTCP_RTT_REF 100 /* Don't trust SRTT until this many samples have been taken. */ #define HTCP_MIN_RTT_SAMPLES 8 /* * HTCP_CALC_ALPHA performs a fixed point math calculation to determine the * value of alpha, based on the function defined in the HTCP spec. * * i.e. 1 + 10(delta - delta_l) + ((delta - delta_l) / 2) ^ 2 * * "diff" is passed in to the macro as "delta - delta_l" and is expected to be * in units of ticks. * * The joyousnous of fixed point maths means our function implementation looks a * little funky... * * In order to maintain some precision in the calculations, a fixed point shift * HTCP_ALPHA_INC_SHIFT is used to ensure the integer divisions don't * truncate the results too badly. * * The "16" value is the "1" term in the alpha function shifted up by * HTCP_ALPHA_INC_SHIFT * * The "160" value is the "10" multiplier in the alpha function multiplied by * 2^HTCP_ALPHA_INC_SHIFT * * Specifying these as constants reduces the computations required. After * up-shifting all the terms in the function and performing the required * calculations, we down-shift the final result by HTCP_ALPHA_INC_SHIFT to * ensure it is back in the correct range. * * The "hz" terms are required as kernels can be configured to run with * different tick timers, which we have to adjust for in the alpha calculation * (which originally was defined in terms of seconds). * * We also have to be careful to constrain the value of diff such that it won't * overflow whilst performing the calculation. The middle term i.e. (160 * diff) * / hz is the limiting factor in the calculation. We must constrain diff to be * less than the max size of an int divided by the constant 160 figure * i.e. diff < INT_MAX / 160 * * NB: Changing HTCP_ALPHA_INC_SHIFT will require you to MANUALLY update the * constants used in this function! */ #define HTCP_CALC_ALPHA(diff) \ ((\ (16) + \ ((160 * (diff)) / hz) + \ (((diff) / hz) * (((diff) << HTCP_ALPHA_INC_SHIFT) / (4 * hz))) \ ) >> HTCP_ALPHA_INC_SHIFT) static void htcp_ack_received(struct cc_var *ccv, uint16_t type); static void htcp_cb_destroy(struct cc_var *ccv); static int htcp_cb_init(struct cc_var *ccv); static void htcp_cong_signal(struct cc_var *ccv, uint32_t type); static int htcp_mod_init(void); static void htcp_post_recovery(struct cc_var *ccv); static void htcp_recalc_alpha(struct cc_var *ccv); static void htcp_recalc_beta(struct cc_var *ccv); static void htcp_record_rtt(struct cc_var *ccv); static void htcp_ssthresh_update(struct cc_var *ccv); struct htcp { /* cwnd before entering cong recovery. */ unsigned long prev_cwnd; /* cwnd additive increase parameter. */ int alpha; /* cwnd multiplicative decrease parameter. */ int beta; /* Largest rtt seen for the flow. */ int maxrtt; /* Shortest rtt seen for the flow. */ int minrtt; /* Time of last congestion event in ticks. */ int t_last_cong; }; static int htcp_rtt_ref; /* * The maximum number of ticks the value of diff can reach in * htcp_recalc_alpha() before alpha will stop increasing due to overflow. * See comment above HTCP_CALC_ALPHA for more info. */ static int htcp_max_diff = INT_MAX / ((1 << HTCP_ALPHA_INC_SHIFT) * 10); /* Per-netstack vars. */ static VNET_DEFINE(u_int, htcp_adaptive_backoff) = 0; static VNET_DEFINE(u_int, htcp_rtt_scaling) = 0; #define V_htcp_adaptive_backoff VNET(htcp_adaptive_backoff) #define V_htcp_rtt_scaling VNET(htcp_rtt_scaling) static MALLOC_DEFINE(M_HTCP, "htcp data", "Per connection data required for the HTCP congestion control algorithm"); struct cc_algo htcp_cc_algo = { .name = "htcp", .ack_received = htcp_ack_received, .cb_destroy = htcp_cb_destroy, .cb_init = htcp_cb_init, .cong_signal = htcp_cong_signal, .mod_init = htcp_mod_init, .post_recovery = htcp_post_recovery, }; static void htcp_ack_received(struct cc_var *ccv, uint16_t type) { struct htcp *htcp_data; htcp_data = ccv->cc_data; htcp_record_rtt(ccv); /* * Regular ACK and we're not in cong/fast recovery and we're cwnd * limited and we're either not doing ABC or are slow starting or are * doing ABC and we've sent a cwnd's worth of bytes. */ if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) && (ccv->flags & CCF_CWND_LIMITED) && (!V_tcp_do_rfc3465 || CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh) || (V_tcp_do_rfc3465 && ccv->flags & CCF_ABC_SENTAWND))) { htcp_recalc_beta(ccv); htcp_recalc_alpha(ccv); /* * Use the logic in NewReno ack_received() for slow start and * for the first HTCP_DELTA_L ticks after either the flow starts * or a congestion event (when alpha equals 1). */ if (htcp_data->alpha == 1 || CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh)) newreno_cc_algo.ack_received(ccv, type); else { if (V_tcp_do_rfc3465) { /* Increment cwnd by alpha segments. */ CCV(ccv, snd_cwnd) += htcp_data->alpha * CCV(ccv, t_maxseg); ccv->flags &= ~CCF_ABC_SENTAWND; } else /* * Increment cwnd by alpha/cwnd segments to * approximate an increase of alpha segments * per RTT. */ CCV(ccv, snd_cwnd) += (((htcp_data->alpha << HTCP_SHIFT) / (CCV(ccv, snd_cwnd) / CCV(ccv, t_maxseg))) * CCV(ccv, t_maxseg)) >> HTCP_SHIFT; } } } static void htcp_cb_destroy(struct cc_var *ccv) { if (ccv->cc_data != NULL) free(ccv->cc_data, M_HTCP); } static int htcp_cb_init(struct cc_var *ccv) { struct htcp *htcp_data; htcp_data = malloc(sizeof(struct htcp), M_HTCP, M_NOWAIT); if (htcp_data == NULL) return (ENOMEM); /* Init some key variables with sensible defaults. */ htcp_data->alpha = HTCP_INIT_ALPHA; htcp_data->beta = HTCP_MINBETA; htcp_data->maxrtt = TCPTV_SRTTBASE; htcp_data->minrtt = TCPTV_SRTTBASE; htcp_data->prev_cwnd = 0; htcp_data->t_last_cong = ticks; ccv->cc_data = htcp_data; return (0); } /* * Perform any necessary tasks before we enter congestion recovery. */ static void htcp_cong_signal(struct cc_var *ccv, uint32_t type) { struct htcp *htcp_data; htcp_data = ccv->cc_data; switch (type) { case CC_NDUPACK: if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) { if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) { /* * Apply hysteresis to maxrtt to ensure * reductions in the RTT are reflected in our * measurements. */ htcp_data->maxrtt = (htcp_data->minrtt + (htcp_data->maxrtt - htcp_data->minrtt) * 95) / 100; htcp_ssthresh_update(ccv); htcp_data->t_last_cong = ticks; htcp_data->prev_cwnd = CCV(ccv, snd_cwnd); } ENTER_RECOVERY(CCV(ccv, t_flags)); } break; case CC_ECN: if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) { /* * Apply hysteresis to maxrtt to ensure reductions in * the RTT are reflected in our measurements. */ htcp_data->maxrtt = (htcp_data->minrtt + (htcp_data->maxrtt - htcp_data->minrtt) * 95) / 100; htcp_ssthresh_update(ccv); CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh); htcp_data->t_last_cong = ticks; htcp_data->prev_cwnd = CCV(ccv, snd_cwnd); ENTER_CONGRECOVERY(CCV(ccv, t_flags)); } break; case CC_RTO: /* * Grab the current time and record it so we know when the * most recent congestion event was. Only record it when the * timeout has fired more than once, as there is a reasonable * chance the first one is a false alarm and may not indicate * congestion. */ if (CCV(ccv, t_rxtshift) >= 2) htcp_data->t_last_cong = ticks; break; } } static int htcp_mod_init(void) { htcp_cc_algo.after_idle = newreno_cc_algo.after_idle; /* * HTCP_RTT_REF is defined in ms, and t_srtt in the tcpcb is stored in * units of TCP_RTT_SCALE*hz. Scale HTCP_RTT_REF to be in the same units * as t_srtt. */ htcp_rtt_ref = (HTCP_RTT_REF * TCP_RTT_SCALE * hz) / 1000; return (0); } /* * Perform any necessary tasks before we exit congestion recovery. */ static void htcp_post_recovery(struct cc_var *ccv) { struct htcp *htcp_data; htcp_data = ccv->cc_data; if (IN_FASTRECOVERY(CCV(ccv, t_flags))) { /* * If inflight data is less than ssthresh, set cwnd * conservatively to avoid a burst of data, as suggested in the * NewReno RFC. Otherwise, use the HTCP method. * * XXXLAS: Find a way to do this without needing curack */ if (SEQ_GT(ccv->curack + CCV(ccv, snd_ssthresh), CCV(ccv, snd_max))) CCV(ccv, snd_cwnd) = CCV(ccv, snd_max) - ccv->curack + CCV(ccv, t_maxseg); else CCV(ccv, snd_cwnd) = max(1, ((htcp_data->beta * htcp_data->prev_cwnd / CCV(ccv, t_maxseg)) >> HTCP_SHIFT)) * CCV(ccv, t_maxseg); } } static void htcp_recalc_alpha(struct cc_var *ccv) { struct htcp *htcp_data; int alpha, diff, now; htcp_data = ccv->cc_data; now = ticks; /* * If ticks has wrapped around (will happen approximately once every 49 * days on a machine with the default kern.hz=1000) and a flow straddles * the wrap point, our alpha calcs will be completely wrong. We cut our * losses and restart alpha from scratch by setting t_last_cong = now - * HTCP_DELTA_L. * * This does not deflate our cwnd at all. It simply slows the rate cwnd * is growing by until alpha regains the value it held prior to taking * this drastic measure. */ if (now < htcp_data->t_last_cong) htcp_data->t_last_cong = now - HTCP_DELTA_L; diff = now - htcp_data->t_last_cong - HTCP_DELTA_L; /* Cap alpha if the value of diff would overflow HTCP_CALC_ALPHA(). */ if (diff < htcp_max_diff) { /* * If it has been more than HTCP_DELTA_L ticks since congestion, * increase alpha according to the function defined in the spec. */ if (diff > 0) { alpha = HTCP_CALC_ALPHA(diff); /* * Adaptive backoff fairness adjustment: * 2 * (1 - beta) * alpha_raw */ if (V_htcp_adaptive_backoff) alpha = max(1, (2 * ((1 << HTCP_SHIFT) - htcp_data->beta) * alpha) >> HTCP_SHIFT); /* * RTT scaling: (RTT / RTT_ref) * alpha * alpha will be the raw value from HTCP_CALC_ALPHA() if * adaptive backoff is off, or the adjusted value if * adaptive backoff is on. */ if (V_htcp_rtt_scaling) alpha = max(1, (min(max(HTCP_MINROWE, (CCV(ccv, t_srtt) << HTCP_SHIFT) / htcp_rtt_ref), HTCP_MAXROWE) * alpha) >> HTCP_SHIFT); } else alpha = 1; htcp_data->alpha = alpha; } } static void htcp_recalc_beta(struct cc_var *ccv) { struct htcp *htcp_data; htcp_data = ccv->cc_data; /* * TCPTV_SRTTBASE is the initialised value of each connection's SRTT, so * we only calc beta if the connection's SRTT has been changed from its * inital value. beta is bounded to ensure it is always between * HTCP_MINBETA and HTCP_MAXBETA. */ if (V_htcp_adaptive_backoff && htcp_data->minrtt != TCPTV_SRTTBASE && htcp_data->maxrtt != TCPTV_SRTTBASE) htcp_data->beta = min(max(HTCP_MINBETA, (htcp_data->minrtt << HTCP_SHIFT) / htcp_data->maxrtt), HTCP_MAXBETA); else htcp_data->beta = HTCP_MINBETA; } /* * Record the minimum and maximum RTT seen for the connection. These are used in * the calculation of beta if adaptive backoff is enabled. */ static void htcp_record_rtt(struct cc_var *ccv) { struct htcp *htcp_data; htcp_data = ccv->cc_data; /* XXXLAS: Should there be some hysteresis for minrtt? */ /* * Record the current SRTT as our minrtt if it's the smallest we've seen * or minrtt is currently equal to its initialised value. Ignore SRTT * until a min number of samples have been taken. */ if ((CCV(ccv, t_srtt) < htcp_data->minrtt || htcp_data->minrtt == TCPTV_SRTTBASE) && (CCV(ccv, t_rttupdated) >= HTCP_MIN_RTT_SAMPLES)) htcp_data->minrtt = CCV(ccv, t_srtt); /* * Record the current SRTT as our maxrtt if it's the largest we've * seen. Ignore SRTT until a min number of samples have been taken. */ if (CCV(ccv, t_srtt) > htcp_data->maxrtt && CCV(ccv, t_rttupdated) >= HTCP_MIN_RTT_SAMPLES) htcp_data->maxrtt = CCV(ccv, t_srtt); } /* * Update the ssthresh in the event of congestion. */ static void htcp_ssthresh_update(struct cc_var *ccv) { struct htcp *htcp_data; htcp_data = ccv->cc_data; /* * On the first congestion event, set ssthresh to cwnd * 0.5, on * subsequent congestion events, set it to cwnd * beta. */ if (CCV(ccv, snd_ssthresh) == TCP_MAXWIN << TCP_MAX_WINSHIFT) CCV(ccv, snd_ssthresh) = (CCV(ccv, snd_cwnd) * HTCP_MINBETA) >> HTCP_SHIFT; else { htcp_recalc_beta(ccv); CCV(ccv, snd_ssthresh) = (CCV(ccv, snd_cwnd) * htcp_data->beta) >> HTCP_SHIFT; } } SYSCTL_DECL(_net_inet_tcp_cc_htcp); SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, htcp, CTLFLAG_RW, NULL, "H-TCP related settings"); -SYSCTL_VNET_UINT(_net_inet_tcp_cc_htcp, OID_AUTO, adaptive_backoff, CTLFLAG_RW, - &VNET_NAME(htcp_adaptive_backoff), 0, "enable H-TCP adaptive backoff"); -SYSCTL_VNET_UINT(_net_inet_tcp_cc_htcp, OID_AUTO, rtt_scaling, CTLFLAG_RW, - &VNET_NAME(htcp_rtt_scaling), 0, "enable H-TCP RTT scaling"); +SYSCTL_UINT(_net_inet_tcp_cc_htcp, OID_AUTO, adaptive_backoff, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(htcp_adaptive_backoff), 0, + "enable H-TCP adaptive backoff"); +SYSCTL_UINT(_net_inet_tcp_cc_htcp, OID_AUTO, rtt_scaling, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(htcp_rtt_scaling), 0, + "enable H-TCP RTT scaling"); DECLARE_CC_MODULE(htcp, &htcp_cc_algo); Index: user/dchagin/lemul/sys/netinet/cc/cc_vegas.c =================================================================== --- user/dchagin/lemul/sys/netinet/cc/cc_vegas.c (revision 274240) +++ user/dchagin/lemul/sys/netinet/cc/cc_vegas.c (revision 274241) @@ -1,307 +1,309 @@ /*- * Copyright (c) 2009-2010 * Swinburne University of Technology, Melbourne, Australia * Copyright (c) 2010 Lawrence Stewart * Copyright (c) 2010-2011 The FreeBSD Foundation * All rights reserved. * * This software was developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, by David Hayes and * Lawrence Stewart, made possible in part by a grant from the Cisco University * Research Program Fund at Community Foundation Silicon Valley. * * Portions of this software were developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, Melbourne, Australia by * David Hayes under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * An implementation of the Vegas congestion control algorithm for FreeBSD, * based on L. S. Brakmo and L. L. Peterson, "TCP Vegas: end to end congestion * avoidance on a global internet", IEEE J. Sel. Areas Commun., vol. 13, no. 8, * pp. 1465-1480, Oct. 1995. The original Vegas duplicate ack policy has not * been implemented, since clock ticks are not as coarse as they were (i.e. * 500ms) when Vegas was designed. Also, packets are timed once per RTT as in * the original paper. * * Originally released as part of the NewTCP research project at Swinburne * University of Technology's Centre for Advanced Internet Architectures, * Melbourne, Australia, which was made possible in part by a grant from the * Cisco University Research Program Fund at Community Foundation Silicon * Valley. More details are available at: * http://caia.swin.edu.au/urp/newtcp/ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define CAST_PTR_INT(X) (*((int*)(X))) /* * Private signal type for rate based congestion signal. * See for appropriate bit-range to use for private signals. */ #define CC_VEGAS_RATE 0x01000000 static void vegas_ack_received(struct cc_var *ccv, uint16_t ack_type); static void vegas_cb_destroy(struct cc_var *ccv); static int vegas_cb_init(struct cc_var *ccv); static void vegas_cong_signal(struct cc_var *ccv, uint32_t signal_type); static void vegas_conn_init(struct cc_var *ccv); static int vegas_mod_init(void); struct vegas { int slow_start_toggle; }; static int32_t ertt_id; static VNET_DEFINE(uint32_t, vegas_alpha) = 1; static VNET_DEFINE(uint32_t, vegas_beta) = 3; #define V_vegas_alpha VNET(vegas_alpha) #define V_vegas_beta VNET(vegas_beta) static MALLOC_DEFINE(M_VEGAS, "vegas data", "Per connection data required for the Vegas congestion control algorithm"); struct cc_algo vegas_cc_algo = { .name = "vegas", .ack_received = vegas_ack_received, .cb_destroy = vegas_cb_destroy, .cb_init = vegas_cb_init, .cong_signal = vegas_cong_signal, .conn_init = vegas_conn_init, .mod_init = vegas_mod_init }; /* * The vegas window adjustment is done once every RTT, as indicated by the * ERTT_NEW_MEASUREMENT flag. This flag is reset once the new measurment data * has been used. */ static void vegas_ack_received(struct cc_var *ccv, uint16_t ack_type) { struct ertt *e_t; struct vegas *vegas_data; long actual_tx_rate, expected_tx_rate, ndiff; e_t = khelp_get_osd(CCV(ccv, osd), ertt_id); vegas_data = ccv->cc_data; if (e_t->flags & ERTT_NEW_MEASUREMENT) { /* Once per RTT. */ if (e_t->minrtt && e_t->markedpkt_rtt) { expected_tx_rate = e_t->marked_snd_cwnd / e_t->minrtt; actual_tx_rate = e_t->bytes_tx_in_marked_rtt / e_t->markedpkt_rtt; ndiff = (expected_tx_rate - actual_tx_rate) * e_t->minrtt / CCV(ccv, t_maxseg); if (ndiff < V_vegas_alpha) { if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh)) { vegas_data->slow_start_toggle = vegas_data->slow_start_toggle ? 0 : 1; } else { vegas_data->slow_start_toggle = 0; CCV(ccv, snd_cwnd) = min(CCV(ccv, snd_cwnd) + CCV(ccv, t_maxseg), TCP_MAXWIN << CCV(ccv, snd_scale)); } } else if (ndiff > V_vegas_beta) { /* Rate-based congestion. */ vegas_cong_signal(ccv, CC_VEGAS_RATE); vegas_data->slow_start_toggle = 0; } } e_t->flags &= ~ERTT_NEW_MEASUREMENT; } if (vegas_data->slow_start_toggle) newreno_cc_algo.ack_received(ccv, ack_type); } static void vegas_cb_destroy(struct cc_var *ccv) { if (ccv->cc_data != NULL) free(ccv->cc_data, M_VEGAS); } static int vegas_cb_init(struct cc_var *ccv) { struct vegas *vegas_data; vegas_data = malloc(sizeof(struct vegas), M_VEGAS, M_NOWAIT); if (vegas_data == NULL) return (ENOMEM); vegas_data->slow_start_toggle = 1; ccv->cc_data = vegas_data; return (0); } /* * If congestion has been triggered triggered by the Vegas measured rates, it is * handled here, otherwise it falls back to newreno's congestion handling. */ static void vegas_cong_signal(struct cc_var *ccv, uint32_t signal_type) { struct vegas *vegas_data; int presignalrecov; vegas_data = ccv->cc_data; if (IN_RECOVERY(CCV(ccv, t_flags))) presignalrecov = 1; else presignalrecov = 0; switch(signal_type) { case CC_VEGAS_RATE: if (!IN_RECOVERY(CCV(ccv, t_flags))) { CCV(ccv, snd_cwnd) = max(2 * CCV(ccv, t_maxseg), CCV(ccv, snd_cwnd) - CCV(ccv, t_maxseg)); if (CCV(ccv, snd_cwnd) < CCV(ccv, snd_ssthresh)) /* Exit slow start. */ CCV(ccv, snd_ssthresh) = CCV(ccv, snd_cwnd); } break; default: newreno_cc_algo.cong_signal(ccv, signal_type); } if (IN_RECOVERY(CCV(ccv, t_flags)) && !presignalrecov) vegas_data->slow_start_toggle = (CCV(ccv, snd_cwnd) < CCV(ccv, snd_ssthresh)) ? 1 : 0; } static void vegas_conn_init(struct cc_var *ccv) { struct vegas *vegas_data; vegas_data = ccv->cc_data; vegas_data->slow_start_toggle = 1; } static int vegas_mod_init(void) { ertt_id = khelp_get_id("ertt"); if (ertt_id <= 0) { printf("%s: h_ertt module not found\n", __func__); return (ENOENT); } vegas_cc_algo.after_idle = newreno_cc_algo.after_idle; vegas_cc_algo.post_recovery = newreno_cc_algo.post_recovery; return (0); } static int vegas_alpha_handler(SYSCTL_HANDLER_ARGS) { int error; uint32_t new; new = V_vegas_alpha; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr != NULL) { if (CAST_PTR_INT(req->newptr) < 1 || CAST_PTR_INT(req->newptr) > V_vegas_beta) error = EINVAL; else V_vegas_alpha = new; } return (error); } static int vegas_beta_handler(SYSCTL_HANDLER_ARGS) { int error; uint32_t new; new = V_vegas_beta; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr != NULL) { if (CAST_PTR_INT(req->newptr) < 1 || CAST_PTR_INT(req->newptr) < V_vegas_alpha) error = EINVAL; else V_vegas_beta = new; } return (error); } SYSCTL_DECL(_net_inet_tcp_cc_vegas); SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, vegas, CTLFLAG_RW, NULL, "Vegas related settings"); -SYSCTL_VNET_PROC(_net_inet_tcp_cc_vegas, OID_AUTO, alpha, - CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(vegas_alpha), 1, &vegas_alpha_handler, - "IU", "vegas alpha, specified as number of \"buffers\" (0 < alpha < beta)"); +SYSCTL_PROC(_net_inet_tcp_cc_vegas, OID_AUTO, alpha, + CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, + &VNET_NAME(vegas_alpha), 1, &vegas_alpha_handler, "IU", + "vegas alpha, specified as number of \"buffers\" (0 < alpha < beta)"); -SYSCTL_VNET_PROC(_net_inet_tcp_cc_vegas, OID_AUTO, beta, - CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(vegas_beta), 3, &vegas_beta_handler, - "IU", "vegas beta, specified as number of \"buffers\" (0 < alpha < beta)"); +SYSCTL_PROC(_net_inet_tcp_cc_vegas, OID_AUTO, beta, + CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, + &VNET_NAME(vegas_beta), 3, &vegas_beta_handler, "IU", + "vegas beta, specified as number of \"buffers\" (0 < alpha < beta)"); DECLARE_CC_MODULE(vegas, &vegas_cc_algo); MODULE_DEPEND(vegas, ertt, 1, 1, 1); Index: user/dchagin/lemul/sys/netinet/if_ether.c =================================================================== --- user/dchagin/lemul/sys/netinet/if_ether.c (revision 274240) +++ user/dchagin/lemul/sys/netinet/if_ether.c (revision 274241) @@ -1,944 +1,944 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if_ether.c 8.1 (Berkeley) 6/10/93 */ /* * Ethernet address resolution protocol. * TODO: * add "inuse/lock" bit (or ref. count) along with valid bit */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #endif #include #include #include #define SIN(s) ((const struct sockaddr_in *)(s)) #define SDL(s) ((struct sockaddr_dl *)s) SYSCTL_DECL(_net_link_ether); static SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW, 0, ""); static SYSCTL_NODE(_net_link_ether, PF_ARP, arp, CTLFLAG_RW, 0, ""); /* timer values */ static VNET_DEFINE(int, arpt_keep) = (20*60); /* once resolved, good for 20 * minutes */ static VNET_DEFINE(int, arp_maxtries) = 5; static VNET_DEFINE(int, arp_proxyall) = 0; static VNET_DEFINE(int, arpt_down) = 20; /* keep incomplete entries for * 20 seconds */ VNET_PCPUSTAT_DEFINE(struct arpstat, arpstat); /* ARP statistics, see if_arp.h */ VNET_PCPUSTAT_SYSINIT(arpstat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(arpstat); #endif /* VIMAGE */ static VNET_DEFINE(int, arp_maxhold) = 1; #define V_arpt_keep VNET(arpt_keep) #define V_arpt_down VNET(arpt_down) #define V_arp_maxtries VNET(arp_maxtries) #define V_arp_proxyall VNET(arp_proxyall) #define V_arp_maxhold VNET(arp_maxhold) -SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_RW, +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arpt_keep), 0, "ARP entry lifetime in seconds"); -SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_RW, +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arp_maxtries), 0, "ARP resolution attempts before returning error"); -SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_RW, +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arp_proxyall), 0, "Enable proxy ARP for all suitable requests"); -SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, wait, CTLFLAG_RW, +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, wait, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arpt_down), 0, "Incomplete ARP entry lifetime in seconds"); SYSCTL_VNET_PCPUSTAT(_net_link_ether_arp, OID_AUTO, stats, struct arpstat, arpstat, "ARP statistics (struct arpstat, net/if_arp.h)"); -SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, maxhold, CTLFLAG_RW, +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxhold, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arp_maxhold), 0, "Number of packets to hold per ARP entry"); static void arp_init(void); static void arpintr(struct mbuf *); static void arptimer(void *); #ifdef INET static void in_arpinput(struct mbuf *); #endif static const struct netisr_handler arp_nh = { .nh_name = "arp", .nh_handler = arpintr, .nh_proto = NETISR_ARP, .nh_policy = NETISR_POLICY_SOURCE, }; #ifdef AF_INET /* * called by in_scrubprefix() to remove entry from the table when * the interface goes away */ void arp_ifscrub(struct ifnet *ifp, uint32_t addr) { struct sockaddr_in addr4; bzero((void *)&addr4, sizeof(addr4)); addr4.sin_len = sizeof(addr4); addr4.sin_family = AF_INET; addr4.sin_addr.s_addr = addr; IF_AFDATA_RLOCK(ifp); lla_lookup(LLTABLE(ifp), (LLE_DELETE | LLE_IFADDR), (struct sockaddr *)&addr4); IF_AFDATA_RUNLOCK(ifp); } #endif /* * Timeout routine. Age arp_tab entries periodically. */ static void arptimer(void *arg) { struct llentry *lle = (struct llentry *)arg; struct ifnet *ifp; if (lle->la_flags & LLE_STATIC) { LLE_WUNLOCK(lle); return; } ifp = lle->lle_tbl->llt_ifp; CURVNET_SET(ifp->if_vnet); if ((lle->la_flags & LLE_DELETED) == 0) { int evt; if (lle->la_flags & LLE_VALID) evt = LLENTRY_EXPIRED; else evt = LLENTRY_TIMEDOUT; EVENTHANDLER_INVOKE(lle_event, lle, evt); } callout_stop(&lle->la_timer); /* XXX: LOR avoidance. We still have ref on lle. */ LLE_WUNLOCK(lle); IF_AFDATA_LOCK(ifp); LLE_WLOCK(lle); /* Guard against race with other llentry_free(). */ if (lle->la_flags & LLE_LINKED) { size_t pkts_dropped; LLE_REMREF(lle); pkts_dropped = llentry_free(lle); ARPSTAT_ADD(dropped, pkts_dropped); } else LLE_FREE_LOCKED(lle); IF_AFDATA_UNLOCK(ifp); ARPSTAT_INC(timeouts); CURVNET_RESTORE(); } /* * Broadcast an ARP request. Caller specifies: * - arp header source ip address * - arp header target ip address * - arp header source ethernet address */ void arprequest(struct ifnet *ifp, const struct in_addr *sip, const struct in_addr *tip, u_char *enaddr) { struct mbuf *m; struct arphdr *ah; struct sockaddr sa; u_char *carpaddr = NULL; if (sip == NULL) { /* * The caller did not supply a source address, try to find * a compatible one among those assigned to this interface. */ struct ifaddr *ifa; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; if (ifa->ifa_carp) { if ((*carp_iamatch_p)(ifa, &carpaddr) == 0) continue; sip = &IA_SIN(ifa)->sin_addr; } else { carpaddr = NULL; sip = &IA_SIN(ifa)->sin_addr; } if (0 == ((sip->s_addr ^ tip->s_addr) & IA_MASKSIN(ifa)->sin_addr.s_addr)) break; /* found it. */ } IF_ADDR_RUNLOCK(ifp); if (sip == NULL) { printf("%s: cannot find matching address\n", __func__); return; } } if (enaddr == NULL) enaddr = carpaddr ? carpaddr : (u_char *)IF_LLADDR(ifp); if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) return; m->m_len = sizeof(*ah) + 2 * sizeof(struct in_addr) + 2 * ifp->if_addrlen; m->m_pkthdr.len = m->m_len; MH_ALIGN(m, m->m_len); ah = mtod(m, struct arphdr *); bzero((caddr_t)ah, m->m_len); #ifdef MAC mac_netinet_arp_send(ifp, m); #endif ah->ar_pro = htons(ETHERTYPE_IP); ah->ar_hln = ifp->if_addrlen; /* hardware address length */ ah->ar_pln = sizeof(struct in_addr); /* protocol address length */ ah->ar_op = htons(ARPOP_REQUEST); bcopy(enaddr, ar_sha(ah), ah->ar_hln); bcopy(sip, ar_spa(ah), ah->ar_pln); bcopy(tip, ar_tpa(ah), ah->ar_pln); sa.sa_family = AF_ARP; sa.sa_len = 2; m->m_flags |= M_BCAST; m_clrprotoflags(m); /* Avoid confusing lower layers. */ (*ifp->if_output)(ifp, m, &sa, NULL); ARPSTAT_INC(txrequests); } /* * Resolve an IP address into an ethernet address. * On input: * ifp is the interface we use * rt0 is the route to the final destination (possibly useless) * m is the mbuf. May be NULL if we don't have a packet. * dst is the next hop, * desten is where we want the address. * * On success, desten is filled in and the function returns 0; * If the packet must be held pending resolution, we return EWOULDBLOCK * On other errors, we return the corresponding error code. * Note that m_freem() handles NULL. */ int arpresolve(struct ifnet *ifp, struct rtentry *rt0, struct mbuf *m, const struct sockaddr *dst, u_char *desten, struct llentry **lle) { struct llentry *la = 0; u_int flags = 0; struct mbuf *curr = NULL; struct mbuf *next = NULL; int error, renew; *lle = NULL; if (m != NULL) { if (m->m_flags & M_BCAST) { /* broadcast */ (void)memcpy(desten, ifp->if_broadcastaddr, ifp->if_addrlen); return (0); } if (m->m_flags & M_MCAST && ifp->if_type != IFT_ARCNET) { /* multicast */ ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten); return (0); } } retry: IF_AFDATA_RLOCK(ifp); la = lla_lookup(LLTABLE(ifp), flags, dst); IF_AFDATA_RUNLOCK(ifp); if ((la == NULL) && ((flags & LLE_EXCLUSIVE) == 0) && ((ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0)) { flags |= (LLE_CREATE | LLE_EXCLUSIVE); IF_AFDATA_WLOCK(ifp); la = lla_lookup(LLTABLE(ifp), flags, dst); IF_AFDATA_WUNLOCK(ifp); } if (la == NULL) { if (flags & LLE_CREATE) log(LOG_DEBUG, "arpresolve: can't allocate llinfo for %s on %s\n", inet_ntoa(SIN(dst)->sin_addr), ifp->if_xname); m_freem(m); return (EINVAL); } if ((la->la_flags & LLE_VALID) && ((la->la_flags & LLE_STATIC) || la->la_expire > time_uptime)) { bcopy(&la->ll_addr, desten, ifp->if_addrlen); /* * If entry has an expiry time and it is approaching, * see if we need to send an ARP request within this * arpt_down interval. */ if (!(la->la_flags & LLE_STATIC) && time_uptime + la->la_preempt > la->la_expire) { arprequest(ifp, NULL, &SIN(dst)->sin_addr, NULL); la->la_preempt--; } *lle = la; error = 0; goto done; } if (la->la_flags & LLE_STATIC) { /* should not happen! */ log(LOG_DEBUG, "arpresolve: ouch, empty static llinfo for %s\n", inet_ntoa(SIN(dst)->sin_addr)); m_freem(m); error = EINVAL; goto done; } renew = (la->la_asked == 0 || la->la_expire != time_uptime); if ((renew || m != NULL) && (flags & LLE_EXCLUSIVE) == 0) { flags |= LLE_EXCLUSIVE; LLE_RUNLOCK(la); goto retry; } /* * There is an arptab entry, but no ethernet address * response yet. Add the mbuf to the list, dropping * the oldest packet if we have exceeded the system * setting. */ if (m != NULL) { if (la->la_numheld >= V_arp_maxhold) { if (la->la_hold != NULL) { next = la->la_hold->m_nextpkt; m_freem(la->la_hold); la->la_hold = next; la->la_numheld--; ARPSTAT_INC(dropped); } } if (la->la_hold != NULL) { curr = la->la_hold; while (curr->m_nextpkt != NULL) curr = curr->m_nextpkt; curr->m_nextpkt = m; } else la->la_hold = m; la->la_numheld++; if (renew == 0 && (flags & LLE_EXCLUSIVE)) { flags &= ~LLE_EXCLUSIVE; LLE_DOWNGRADE(la); } } /* * Return EWOULDBLOCK if we have tried less than arp_maxtries. It * will be masked by ether_output(). Return EHOSTDOWN/EHOSTUNREACH * if we have already sent arp_maxtries ARP requests. Retransmit the * ARP request, but not faster than one request per second. */ if (la->la_asked < V_arp_maxtries) error = EWOULDBLOCK; /* First request. */ else error = rt0 != NULL && (rt0->rt_flags & RTF_GATEWAY) ? EHOSTUNREACH : EHOSTDOWN; if (renew) { int canceled; LLE_ADDREF(la); la->la_expire = time_uptime; canceled = callout_reset(&la->la_timer, hz * V_arpt_down, arptimer, la); if (canceled) LLE_REMREF(la); la->la_asked++; LLE_WUNLOCK(la); arprequest(ifp, NULL, &SIN(dst)->sin_addr, NULL); return (error); } done: if (flags & LLE_EXCLUSIVE) LLE_WUNLOCK(la); else LLE_RUNLOCK(la); return (error); } /* * Common length and type checks are done here, * then the protocol-specific routine is called. */ static void arpintr(struct mbuf *m) { struct arphdr *ar; if (m->m_len < sizeof(struct arphdr) && ((m = m_pullup(m, sizeof(struct arphdr))) == NULL)) { log(LOG_NOTICE, "arp: runt packet -- m_pullup failed\n"); return; } ar = mtod(m, struct arphdr *); if (ntohs(ar->ar_hrd) != ARPHRD_ETHER && ntohs(ar->ar_hrd) != ARPHRD_IEEE802 && ntohs(ar->ar_hrd) != ARPHRD_ARCNET && ntohs(ar->ar_hrd) != ARPHRD_IEEE1394 && ntohs(ar->ar_hrd) != ARPHRD_INFINIBAND) { log(LOG_NOTICE, "arp: unknown hardware address format (0x%2D)" " (from %*D to %*D)\n", (unsigned char *)&ar->ar_hrd, "", ETHER_ADDR_LEN, (u_char *)ar_sha(ar), ":", ETHER_ADDR_LEN, (u_char *)ar_tha(ar), ":"); m_freem(m); return; } if (m->m_len < arphdr_len(ar)) { if ((m = m_pullup(m, arphdr_len(ar))) == NULL) { log(LOG_NOTICE, "arp: runt packet\n"); m_freem(m); return; } ar = mtod(m, struct arphdr *); } ARPSTAT_INC(received); switch (ntohs(ar->ar_pro)) { #ifdef INET case ETHERTYPE_IP: in_arpinput(m); return; #endif } m_freem(m); } #ifdef INET /* * ARP for Internet protocols on 10 Mb/s Ethernet. * Algorithm is that given in RFC 826. * In addition, a sanity check is performed on the sender * protocol address, to catch impersonators. * We no longer handle negotiations for use of trailer protocol: * Formerly, ARP replied for protocol type ETHERTYPE_TRAIL sent * along with IP replies if we wanted trailers sent to us, * and also sent them in response to IP replies. * This allowed either end to announce the desire to receive * trailer packets. * We no longer reply to requests for ETHERTYPE_TRAIL protocol either, * but formerly didn't normally send requests. */ static int log_arp_wrong_iface = 1; static int log_arp_movements = 1; static int log_arp_permanent_modify = 1; static int allow_multicast = 0; static struct timeval arp_lastlog; static int arp_curpps; static int arp_maxpps = 1; SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_wrong_iface, CTLFLAG_RW, &log_arp_wrong_iface, 0, "log arp packets arriving on the wrong interface"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_movements, CTLFLAG_RW, &log_arp_movements, 0, "log arp replies from MACs different than the one in the cache"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_permanent_modify, CTLFLAG_RW, &log_arp_permanent_modify, 0, "log arp replies from MACs different than the one in the permanent arp entry"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, allow_multicast, CTLFLAG_RW, &allow_multicast, 0, "accept multicast addresses"); SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_log_per_second, CTLFLAG_RW, &arp_maxpps, 0, "Maximum number of remotely triggered ARP messages that can be " "logged per second"); #define ARP_LOG(pri, ...) do { \ if (ppsratecheck(&arp_lastlog, &arp_curpps, arp_maxpps)) \ log((pri), "arp: " __VA_ARGS__); \ } while (0) static void in_arpinput(struct mbuf *m) { struct arphdr *ah; struct ifnet *ifp = m->m_pkthdr.rcvif; struct llentry *la = NULL; struct rtentry *rt; struct ifaddr *ifa; struct in_ifaddr *ia; struct sockaddr sa; struct in_addr isaddr, itaddr, myaddr; u_int8_t *enaddr = NULL; int op, flags; int req_len; int bridged = 0, is_bridge = 0; int carped; struct sockaddr_in sin; sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; sin.sin_addr.s_addr = 0; if (ifp->if_bridge) bridged = 1; if (ifp->if_type == IFT_BRIDGE) is_bridge = 1; req_len = arphdr_len2(ifp->if_addrlen, sizeof(struct in_addr)); if (m->m_len < req_len && (m = m_pullup(m, req_len)) == NULL) { ARP_LOG(LOG_NOTICE, "runt packet -- m_pullup failed\n"); return; } ah = mtod(m, struct arphdr *); /* * ARP is only for IPv4 so we can reject packets with * a protocol length not equal to an IPv4 address. */ if (ah->ar_pln != sizeof(struct in_addr)) { ARP_LOG(LOG_NOTICE, "requested protocol length != %zu\n", sizeof(struct in_addr)); goto drop; } if (allow_multicast == 0 && ETHER_IS_MULTICAST(ar_sha(ah))) { ARP_LOG(LOG_NOTICE, "%*D is multicast\n", ifp->if_addrlen, (u_char *)ar_sha(ah), ":"); goto drop; } op = ntohs(ah->ar_op); (void)memcpy(&isaddr, ar_spa(ah), sizeof (isaddr)); (void)memcpy(&itaddr, ar_tpa(ah), sizeof (itaddr)); if (op == ARPOP_REPLY) ARPSTAT_INC(rxreplies); /* * For a bridge, we want to check the address irrespective * of the receive interface. (This will change slightly * when we have clusters of interfaces). */ IN_IFADDR_RLOCK(); LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) { if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) || ia->ia_ifp == ifp) && itaddr.s_addr == ia->ia_addr.sin_addr.s_addr && (ia->ia_ifa.ifa_carp == NULL || (*carp_iamatch_p)(&ia->ia_ifa, &enaddr))) { ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(); goto match; } } LIST_FOREACH(ia, INADDR_HASH(isaddr.s_addr), ia_hash) if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) || ia->ia_ifp == ifp) && isaddr.s_addr == ia->ia_addr.sin_addr.s_addr) { ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(); goto match; } #define BDG_MEMBER_MATCHES_ARP(addr, ifp, ia) \ (ia->ia_ifp->if_bridge == ifp->if_softc && \ !bcmp(IF_LLADDR(ia->ia_ifp), IF_LLADDR(ifp), ifp->if_addrlen) && \ addr == ia->ia_addr.sin_addr.s_addr) /* * Check the case when bridge shares its MAC address with * some of its children, so packets are claimed by bridge * itself (bridge_input() does it first), but they are really * meant to be destined to the bridge member. */ if (is_bridge) { LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) { if (BDG_MEMBER_MATCHES_ARP(itaddr.s_addr, ifp, ia)) { ifa_ref(&ia->ia_ifa); ifp = ia->ia_ifp; IN_IFADDR_RUNLOCK(); goto match; } } } #undef BDG_MEMBER_MATCHES_ARP IN_IFADDR_RUNLOCK(); /* * No match, use the first inet address on the receive interface * as a dummy address for the rest of the function. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET && (ifa->ifa_carp == NULL || (*carp_iamatch_p)(ifa, &enaddr))) { ia = ifatoia(ifa); ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); goto match; } IF_ADDR_RUNLOCK(ifp); /* * If bridging, fall back to using any inet address. */ IN_IFADDR_RLOCK(); if (!bridged || (ia = TAILQ_FIRST(&V_in_ifaddrhead)) == NULL) { IN_IFADDR_RUNLOCK(); goto drop; } ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(); match: if (!enaddr) enaddr = (u_int8_t *)IF_LLADDR(ifp); carped = (ia->ia_ifa.ifa_carp != NULL); myaddr = ia->ia_addr.sin_addr; ifa_free(&ia->ia_ifa); if (!bcmp(ar_sha(ah), enaddr, ifp->if_addrlen)) goto drop; /* it's from me, ignore it. */ if (!bcmp(ar_sha(ah), ifp->if_broadcastaddr, ifp->if_addrlen)) { ARP_LOG(LOG_NOTICE, "link address is broadcast for IP address " "%s!\n", inet_ntoa(isaddr)); goto drop; } /* * Warn if another host is using the same IP address, but only if the * IP address isn't 0.0.0.0, which is used for DHCP only, in which * case we suppress the warning to avoid false positive complaints of * potential misconfiguration. */ if (!bridged && !carped && isaddr.s_addr == myaddr.s_addr && myaddr.s_addr != 0) { ARP_LOG(LOG_ERR, "%*D is using my IP address %s on %s!\n", ifp->if_addrlen, (u_char *)ar_sha(ah), ":", inet_ntoa(isaddr), ifp->if_xname); itaddr = myaddr; ARPSTAT_INC(dupips); goto reply; } if (ifp->if_flags & IFF_STATICARP) goto reply; bzero(&sin, sizeof(sin)); sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; sin.sin_addr = isaddr; flags = (itaddr.s_addr == myaddr.s_addr) ? LLE_CREATE : 0; flags |= LLE_EXCLUSIVE; IF_AFDATA_LOCK(ifp); la = lla_lookup(LLTABLE(ifp), flags, (struct sockaddr *)&sin); IF_AFDATA_UNLOCK(ifp); if (la != NULL) { /* the following is not an error when doing bridging */ if (!bridged && la->lle_tbl->llt_ifp != ifp) { if (log_arp_wrong_iface) ARP_LOG(LOG_WARNING, "%s is on %s " "but got reply from %*D on %s\n", inet_ntoa(isaddr), la->lle_tbl->llt_ifp->if_xname, ifp->if_addrlen, (u_char *)ar_sha(ah), ":", ifp->if_xname); LLE_WUNLOCK(la); goto reply; } if ((la->la_flags & LLE_VALID) && bcmp(ar_sha(ah), &la->ll_addr, ifp->if_addrlen)) { if (la->la_flags & LLE_STATIC) { LLE_WUNLOCK(la); if (log_arp_permanent_modify) ARP_LOG(LOG_ERR, "%*D attempts to modify " "permanent entry for %s on %s\n", ifp->if_addrlen, (u_char *)ar_sha(ah), ":", inet_ntoa(isaddr), ifp->if_xname); goto reply; } if (log_arp_movements) { ARP_LOG(LOG_INFO, "%s moved from %*D " "to %*D on %s\n", inet_ntoa(isaddr), ifp->if_addrlen, (u_char *)&la->ll_addr, ":", ifp->if_addrlen, (u_char *)ar_sha(ah), ":", ifp->if_xname); } } if (ifp->if_addrlen != ah->ar_hln) { LLE_WUNLOCK(la); ARP_LOG(LOG_WARNING, "from %*D: addr len: new %d, " "i/f %d (ignored)\n", ifp->if_addrlen, (u_char *) ar_sha(ah), ":", ah->ar_hln, ifp->if_addrlen); goto drop; } (void)memcpy(&la->ll_addr, ar_sha(ah), ifp->if_addrlen); la->la_flags |= LLE_VALID; EVENTHANDLER_INVOKE(lle_event, la, LLENTRY_RESOLVED); if (!(la->la_flags & LLE_STATIC)) { int canceled; LLE_ADDREF(la); la->la_expire = time_uptime + V_arpt_keep; canceled = callout_reset(&la->la_timer, hz * V_arpt_keep, arptimer, la); if (canceled) LLE_REMREF(la); } la->la_asked = 0; la->la_preempt = V_arp_maxtries; /* * The packets are all freed within the call to the output * routine. * * NB: The lock MUST be released before the call to the * output routine. */ if (la->la_hold != NULL) { struct mbuf *m_hold, *m_hold_next; m_hold = la->la_hold; la->la_hold = NULL; la->la_numheld = 0; memcpy(&sa, L3_ADDR(la), sizeof(sa)); LLE_WUNLOCK(la); for (; m_hold != NULL; m_hold = m_hold_next) { m_hold_next = m_hold->m_nextpkt; m_hold->m_nextpkt = NULL; /* Avoid confusing lower layers. */ m_clrprotoflags(m_hold); (*ifp->if_output)(ifp, m_hold, &sa, NULL); } } else LLE_WUNLOCK(la); } reply: if (op != ARPOP_REQUEST) goto drop; ARPSTAT_INC(rxrequests); if (itaddr.s_addr == myaddr.s_addr) { /* Shortcut.. the receiving interface is the target. */ (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); (void)memcpy(ar_sha(ah), enaddr, ah->ar_hln); } else { struct llentry *lle = NULL; sin.sin_addr = itaddr; IF_AFDATA_RLOCK(ifp); lle = lla_lookup(LLTABLE(ifp), 0, (struct sockaddr *)&sin); IF_AFDATA_RUNLOCK(ifp); if ((lle != NULL) && (lle->la_flags & LLE_PUB)) { (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); (void)memcpy(ar_sha(ah), &lle->ll_addr, ah->ar_hln); LLE_RUNLOCK(lle); } else { if (lle != NULL) LLE_RUNLOCK(lle); if (!V_arp_proxyall) goto drop; sin.sin_addr = itaddr; /* XXX MRT use table 0 for arp reply */ rt = in_rtalloc1((struct sockaddr *)&sin, 0, 0UL, 0); if (!rt) goto drop; /* * Don't send proxies for nodes on the same interface * as this one came out of, or we'll get into a fight * over who claims what Ether address. */ if (!rt->rt_ifp || rt->rt_ifp == ifp) { RTFREE_LOCKED(rt); goto drop; } RTFREE_LOCKED(rt); (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); (void)memcpy(ar_sha(ah), enaddr, ah->ar_hln); /* * Also check that the node which sent the ARP packet * is on the interface we expect it to be on. This * avoids ARP chaos if an interface is connected to the * wrong network. */ sin.sin_addr = isaddr; /* XXX MRT use table 0 for arp checks */ rt = in_rtalloc1((struct sockaddr *)&sin, 0, 0UL, 0); if (!rt) goto drop; if (rt->rt_ifp != ifp) { ARP_LOG(LOG_INFO, "proxy: ignoring request" " from %s via %s, expecting %s\n", inet_ntoa(isaddr), ifp->if_xname, rt->rt_ifp->if_xname); RTFREE_LOCKED(rt); goto drop; } RTFREE_LOCKED(rt); #ifdef DEBUG_PROXY printf("arp: proxying for %s\n", inet_ntoa(itaddr)); #endif } } if (itaddr.s_addr == myaddr.s_addr && IN_LINKLOCAL(ntohl(itaddr.s_addr))) { /* RFC 3927 link-local IPv4; always reply by broadcast. */ #ifdef DEBUG_LINKLOCAL printf("arp: sending reply for link-local addr %s\n", inet_ntoa(itaddr)); #endif m->m_flags |= M_BCAST; m->m_flags &= ~M_MCAST; } else { /* default behaviour; never reply by broadcast. */ m->m_flags &= ~(M_BCAST|M_MCAST); } (void)memcpy(ar_tpa(ah), ar_spa(ah), ah->ar_pln); (void)memcpy(ar_spa(ah), &itaddr, ah->ar_pln); ah->ar_op = htons(ARPOP_REPLY); ah->ar_pro = htons(ETHERTYPE_IP); /* let's be sure! */ m->m_len = sizeof(*ah) + (2 * ah->ar_pln) + (2 * ah->ar_hln); m->m_pkthdr.len = m->m_len; m->m_pkthdr.rcvif = NULL; sa.sa_family = AF_ARP; sa.sa_len = 2; m_clrprotoflags(m); /* Avoid confusing lower layers. */ (*ifp->if_output)(ifp, m, &sa, NULL); ARPSTAT_INC(txreplies); return; drop: m_freem(m); } #endif void arp_ifinit(struct ifnet *ifp, struct ifaddr *ifa) { struct llentry *lle; if (ifa->ifa_carp != NULL) return; if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY) { arprequest(ifp, &IA_SIN(ifa)->sin_addr, &IA_SIN(ifa)->sin_addr, IF_LLADDR(ifp)); /* * interface address is considered static entry * because the output of the arp utility shows * that L2 entry as permanent */ IF_AFDATA_LOCK(ifp); lle = lla_lookup(LLTABLE(ifp), (LLE_CREATE | LLE_IFADDR | LLE_STATIC), (struct sockaddr *)IA_SIN(ifa)); IF_AFDATA_UNLOCK(ifp); if (lle == NULL) log(LOG_INFO, "arp_ifinit: cannot create arp " "entry for interface address\n"); else LLE_RUNLOCK(lle); } ifa->ifa_rtrequest = NULL; } void arp_ifinit2(struct ifnet *ifp, struct ifaddr *ifa, u_char *enaddr) { if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY) arprequest(ifp, &IA_SIN(ifa)->sin_addr, &IA_SIN(ifa)->sin_addr, enaddr); ifa->ifa_rtrequest = NULL; } static void arp_init(void) { netisr_register(&arp_nh); } SYSINIT(arp, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, arp_init, 0); Index: user/dchagin/lemul/sys/netinet/igmp.c =================================================================== --- user/dchagin/lemul/sys/netinet/igmp.c (revision 274240) +++ user/dchagin/lemul/sys/netinet/igmp.c (revision 274241) @@ -1,3655 +1,3655 @@ /*- * Copyright (c) 2007-2009 Bruce Simpson. * Copyright (c) 1988 Stephen Deering. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)igmp.c 8.1 (Berkeley) 7/19/93 */ /* * Internet Group Management Protocol (IGMP) routines. * [RFC1112, RFC2236, RFC3376] * * Written by Steve Deering, Stanford, May 1988. * Modified by Rosen Sharma, Stanford, Aug 1994. * Modified by Bill Fenner, Xerox PARC, Feb 1995. * Modified to fully comply to IGMPv2 by Bill Fenner, Oct 1995. * Significantly rewritten for IGMPv3, VIMAGE, and SMP by Bruce Simpson. * * MULTICAST Revision: 3.5.1.4 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_IGMPV3 #define KTR_IGMPV3 KTR_INET #endif static struct igmp_ifinfo * igi_alloc_locked(struct ifnet *); static void igi_delete_locked(const struct ifnet *); static void igmp_dispatch_queue(struct ifqueue *, int, const int); static void igmp_fasttimo_vnet(void); static void igmp_final_leave(struct in_multi *, struct igmp_ifinfo *); static int igmp_handle_state_change(struct in_multi *, struct igmp_ifinfo *); static int igmp_initial_join(struct in_multi *, struct igmp_ifinfo *); static int igmp_input_v1_query(struct ifnet *, const struct ip *, const struct igmp *); static int igmp_input_v2_query(struct ifnet *, const struct ip *, const struct igmp *); static int igmp_input_v3_query(struct ifnet *, const struct ip *, /*const*/ struct igmpv3 *); static int igmp_input_v3_group_query(struct in_multi *, struct igmp_ifinfo *, int, /*const*/ struct igmpv3 *); static int igmp_input_v1_report(struct ifnet *, /*const*/ struct ip *, /*const*/ struct igmp *); static int igmp_input_v2_report(struct ifnet *, /*const*/ struct ip *, /*const*/ struct igmp *); static void igmp_intr(struct mbuf *); static int igmp_isgroupreported(const struct in_addr); static struct mbuf * igmp_ra_alloc(void); #ifdef KTR static char * igmp_rec_type_to_str(const int); #endif static void igmp_set_version(struct igmp_ifinfo *, const int); static void igmp_slowtimo_vnet(void); static int igmp_v1v2_queue_report(struct in_multi *, const int); static void igmp_v1v2_process_group_timer(struct in_multi *, const int); static void igmp_v1v2_process_querier_timers(struct igmp_ifinfo *); static void igmp_v2_update_group(struct in_multi *, const int); static void igmp_v3_cancel_link_timers(struct igmp_ifinfo *); static void igmp_v3_dispatch_general_query(struct igmp_ifinfo *); static struct mbuf * igmp_v3_encap_report(struct ifnet *, struct mbuf *); static int igmp_v3_enqueue_group_record(struct ifqueue *, struct in_multi *, const int, const int, const int); static int igmp_v3_enqueue_filter_change(struct ifqueue *, struct in_multi *); static void igmp_v3_process_group_timers(struct igmp_ifinfo *, struct ifqueue *, struct ifqueue *, struct in_multi *, const int); static int igmp_v3_merge_state_changes(struct in_multi *, struct ifqueue *); static void igmp_v3_suppress_group_record(struct in_multi *); static int sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS); static int sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS); static int sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS); static const struct netisr_handler igmp_nh = { .nh_name = "igmp", .nh_handler = igmp_intr, .nh_proto = NETISR_IGMP, .nh_policy = NETISR_POLICY_SOURCE, }; /* * System-wide globals. * * Unlocked access to these is OK, except for the global IGMP output * queue. The IGMP subsystem lock ends up being system-wide for the moment, * because all VIMAGEs have to share a global output queue, as netisrs * themselves are not virtualized. * * Locking: * * The permitted lock order is: IN_MULTI_LOCK, IGMP_LOCK, IF_ADDR_LOCK. * Any may be taken independently; if any are held at the same * time, the above lock order must be followed. * * All output is delegated to the netisr. * Now that Giant has been eliminated, the netisr may be inlined. * * IN_MULTI_LOCK covers in_multi. * * IGMP_LOCK covers igmp_ifinfo and any global variables in this file, * including the output queue. * * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of * per-link state iterators. * * igmp_ifinfo is valid as long as PF_INET is attached to the interface, * therefore it is not refcounted. * We allow unlocked reads of igmp_ifinfo when accessed via in_multi. * * Reference counting * * IGMP acquires its own reference every time an in_multi is passed to * it and the group is being joined for the first time. * * IGMP releases its reference(s) on in_multi in a deferred way, * because the operations which process the release run as part of * a loop whose control variables are directly affected by the release * (that, and not recursing on the IF_ADDR_LOCK). * * VIMAGE: Each in_multi corresponds to an ifp, and each ifp corresponds * to a vnet in ifp->if_vnet. * * SMPng: XXX We may potentially race operations on ifma_protospec. * The problem is that we currently lack a clean way of taking the * IF_ADDR_LOCK() between the ifnet and in layers w/o recursing, * as anything which modifies ifma needs to be covered by that lock. * So check for ifma_protospec being NULL before proceeding. */ struct mtx igmp_mtx; struct mbuf *m_raopt; /* Router Alert option */ static MALLOC_DEFINE(M_IGMP, "igmp", "igmp state"); /* * VIMAGE-wide globals. * * The IGMPv3 timers themselves need to run per-image, however, * protosw timers run globally (see tcp). * An ifnet can only be in one vimage at a time, and the loopback * ifnet, loif, is itself virtualized. * It would otherwise be possible to seriously hose IGMP state, * and create inconsistencies in upstream multicast routing, if you have * multiple VIMAGEs running on the same link joining different multicast * groups, UNLESS the "primary IP address" is different. This is because * IGMP for IPv4 does not force link-local addresses to be used for each * node, unlike MLD for IPv6. * Obviously the IGMPv3 per-interface state has per-vimage granularity * also as a result. * * FUTURE: Stop using IFP_TO_IA/INADDR_ANY, and use source address selection * policy to control the address used by IGMP on the link. */ static VNET_DEFINE(int, interface_timers_running); /* IGMPv3 general * query response */ static VNET_DEFINE(int, state_change_timers_running); /* IGMPv3 state-change * retransmit */ static VNET_DEFINE(int, current_state_timers_running); /* IGMPv1/v2 host * report; IGMPv3 g/sg * query response */ #define V_interface_timers_running VNET(interface_timers_running) #define V_state_change_timers_running VNET(state_change_timers_running) #define V_current_state_timers_running VNET(current_state_timers_running) static VNET_DEFINE(LIST_HEAD(, igmp_ifinfo), igi_head); static VNET_DEFINE(struct igmpstat, igmpstat) = { .igps_version = IGPS_VERSION_3, .igps_len = sizeof(struct igmpstat), }; static VNET_DEFINE(struct timeval, igmp_gsrdelay) = {10, 0}; #define V_igi_head VNET(igi_head) #define V_igmpstat VNET(igmpstat) #define V_igmp_gsrdelay VNET(igmp_gsrdelay) static VNET_DEFINE(int, igmp_recvifkludge) = 1; static VNET_DEFINE(int, igmp_sendra) = 1; static VNET_DEFINE(int, igmp_sendlocal) = 1; static VNET_DEFINE(int, igmp_v1enable) = 1; static VNET_DEFINE(int, igmp_v2enable) = 1; static VNET_DEFINE(int, igmp_legacysupp); static VNET_DEFINE(int, igmp_default_version) = IGMP_VERSION_3; #define V_igmp_recvifkludge VNET(igmp_recvifkludge) #define V_igmp_sendra VNET(igmp_sendra) #define V_igmp_sendlocal VNET(igmp_sendlocal) #define V_igmp_v1enable VNET(igmp_v1enable) #define V_igmp_v2enable VNET(igmp_v2enable) #define V_igmp_legacysupp VNET(igmp_legacysupp) #define V_igmp_default_version VNET(igmp_default_version) /* * Virtualized sysctls. */ -SYSCTL_VNET_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_RW, +SYSCTL_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmpstat), igmpstat, ""); -SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, recvifkludge, CTLFLAG_RW, +SYSCTL_INT(_net_inet_igmp, OID_AUTO, recvifkludge, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_recvifkludge), 0, "Rewrite IGMPv1/v2 reports from 0.0.0.0 to contain subnet address"); -SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, sendra, CTLFLAG_RW, +SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendra, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_sendra), 0, "Send IP Router Alert option in IGMPv2/v3 messages"); -SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, sendlocal, CTLFLAG_RW, +SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendlocal, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_sendlocal), 0, "Send IGMP membership reports for 224.0.0.0/24 groups"); -SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, v1enable, CTLFLAG_RW, +SYSCTL_INT(_net_inet_igmp, OID_AUTO, v1enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_v1enable), 0, "Enable backwards compatibility with IGMPv1"); -SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, v2enable, CTLFLAG_RW, +SYSCTL_INT(_net_inet_igmp, OID_AUTO, v2enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_v2enable), 0, "Enable backwards compatibility with IGMPv2"); -SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, legacysupp, CTLFLAG_RW, +SYSCTL_INT(_net_inet_igmp, OID_AUTO, legacysupp, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_legacysupp), 0, "Allow v1/v2 reports to suppress v3 group responses"); -SYSCTL_VNET_PROC(_net_inet_igmp, OID_AUTO, default_version, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, +SYSCTL_PROC(_net_inet_igmp, OID_AUTO, default_version, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(igmp_default_version), 0, sysctl_igmp_default_version, "I", "Default version of IGMP to run on each interface"); -SYSCTL_VNET_PROC(_net_inet_igmp, OID_AUTO, gsrdelay, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, +SYSCTL_PROC(_net_inet_igmp, OID_AUTO, gsrdelay, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(igmp_gsrdelay.tv_sec), 0, sysctl_igmp_gsr, "I", "Rate limit for IGMPv3 Group-and-Source queries in seconds"); /* * Non-virtualized sysctls. */ static SYSCTL_NODE(_net_inet_igmp, OID_AUTO, ifinfo, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_igmp_ifinfo, "Per-interface IGMPv3 state"); static __inline void igmp_save_context(struct mbuf *m, struct ifnet *ifp) { #ifdef VIMAGE m->m_pkthdr.PH_loc.ptr = ifp->if_vnet; #endif /* VIMAGE */ m->m_pkthdr.flowid = ifp->if_index; } static __inline void igmp_scrub_context(struct mbuf *m) { m->m_pkthdr.PH_loc.ptr = NULL; m->m_pkthdr.flowid = 0; } #ifdef KTR static __inline char * inet_ntoa_haddr(in_addr_t haddr) { struct in_addr ia; ia.s_addr = htonl(haddr); return (inet_ntoa(ia)); } #endif /* * Restore context from a queued IGMP output chain. * Return saved ifindex. * * VIMAGE: The assertion is there to make sure that we * actually called CURVNET_SET() with what's in the mbuf chain. */ static __inline uint32_t igmp_restore_context(struct mbuf *m) { #ifdef notyet #if defined(VIMAGE) && defined(INVARIANTS) KASSERT(curvnet == (m->m_pkthdr.PH_loc.ptr), ("%s: called when curvnet was not restored", __func__)); #endif #endif return (m->m_pkthdr.flowid); } /* * Retrieve or set default IGMP version. * * VIMAGE: Assume curvnet set by caller. * SMPng: NOTE: Serialized by IGMP lock. */ static int sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS) { int error; int new; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error) return (error); IGMP_LOCK(); new = V_igmp_default_version; error = sysctl_handle_int(oidp, &new, 0, req); if (error || !req->newptr) goto out_locked; if (new < IGMP_VERSION_1 || new > IGMP_VERSION_3) { error = EINVAL; goto out_locked; } CTR2(KTR_IGMPV3, "change igmp_default_version from %d to %d", V_igmp_default_version, new); V_igmp_default_version = new; out_locked: IGMP_UNLOCK(); return (error); } /* * Retrieve or set threshold between group-source queries in seconds. * * VIMAGE: Assume curvnet set by caller. * SMPng: NOTE: Serialized by IGMP lock. */ static int sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS) { int error; int i; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error) return (error); IGMP_LOCK(); i = V_igmp_gsrdelay.tv_sec; error = sysctl_handle_int(oidp, &i, 0, req); if (error || !req->newptr) goto out_locked; if (i < -1 || i >= 60) { error = EINVAL; goto out_locked; } CTR2(KTR_IGMPV3, "change igmp_gsrdelay from %d to %d", V_igmp_gsrdelay.tv_sec, i); V_igmp_gsrdelay.tv_sec = i; out_locked: IGMP_UNLOCK(); return (error); } /* * Expose struct igmp_ifinfo to userland, keyed by ifindex. * For use by ifmcstat(8). * * SMPng: NOTE: Does an unlocked ifindex space read. * VIMAGE: Assume curvnet set by caller. The node handler itself * is not directly virtualized. */ static int sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS) { int *name; int error; u_int namelen; struct ifnet *ifp; struct igmp_ifinfo *igi; name = (int *)arg1; namelen = arg2; if (req->newptr != NULL) return (EPERM); if (namelen != 1) return (EINVAL); error = sysctl_wire_old_buffer(req, sizeof(struct igmp_ifinfo)); if (error) return (error); IN_MULTI_LOCK(); IGMP_LOCK(); if (name[0] <= 0 || name[0] > V_if_index) { error = ENOENT; goto out_locked; } error = ENOENT; ifp = ifnet_byindex(name[0]); if (ifp == NULL) goto out_locked; LIST_FOREACH(igi, &V_igi_head, igi_link) { if (ifp == igi->igi_ifp) { error = SYSCTL_OUT(req, igi, sizeof(struct igmp_ifinfo)); break; } } out_locked: IGMP_UNLOCK(); IN_MULTI_UNLOCK(); return (error); } /* * Dispatch an entire queue of pending packet chains * using the netisr. * VIMAGE: Assumes the vnet pointer has been set. */ static void igmp_dispatch_queue(struct ifqueue *ifq, int limit, const int loop) { struct mbuf *m; for (;;) { _IF_DEQUEUE(ifq, m); if (m == NULL) break; CTR3(KTR_IGMPV3, "%s: dispatch %p from %p", __func__, ifq, m); if (loop) m->m_flags |= M_IGMP_LOOP; netisr_dispatch(NETISR_IGMP, m); if (--limit == 0) break; } } /* * Filter outgoing IGMP report state by group. * * Reports are ALWAYS suppressed for ALL-HOSTS (224.0.0.1). * If the net.inet.igmp.sendlocal sysctl is 0, then IGMP reports are * disabled for all groups in the 224.0.0.0/24 link-local scope. However, * this may break certain IGMP snooping switches which rely on the old * report behaviour. * * Return zero if the given group is one for which IGMP reports * should be suppressed, or non-zero if reports should be issued. */ static __inline int igmp_isgroupreported(const struct in_addr addr) { if (in_allhosts(addr) || ((!V_igmp_sendlocal && IN_LOCAL_GROUP(ntohl(addr.s_addr))))) return (0); return (1); } /* * Construct a Router Alert option to use in outgoing packets. */ static struct mbuf * igmp_ra_alloc(void) { struct mbuf *m; struct ipoption *p; m = m_get(M_WAITOK, MT_DATA); p = mtod(m, struct ipoption *); p->ipopt_dst.s_addr = INADDR_ANY; p->ipopt_list[0] = IPOPT_RA; /* Router Alert Option */ p->ipopt_list[1] = 0x04; /* 4 bytes long */ p->ipopt_list[2] = IPOPT_EOL; /* End of IP option list */ p->ipopt_list[3] = 0x00; /* pad byte */ m->m_len = sizeof(p->ipopt_dst) + p->ipopt_list[1]; return (m); } /* * Attach IGMP when PF_INET is attached to an interface. */ struct igmp_ifinfo * igmp_domifattach(struct ifnet *ifp) { struct igmp_ifinfo *igi; CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK(); igi = igi_alloc_locked(ifp); if (!(ifp->if_flags & IFF_MULTICAST)) igi->igi_flags |= IGIF_SILENT; IGMP_UNLOCK(); return (igi); } /* * VIMAGE: assume curvnet set by caller. */ static struct igmp_ifinfo * igi_alloc_locked(/*const*/ struct ifnet *ifp) { struct igmp_ifinfo *igi; IGMP_LOCK_ASSERT(); igi = malloc(sizeof(struct igmp_ifinfo), M_IGMP, M_NOWAIT|M_ZERO); if (igi == NULL) goto out; igi->igi_ifp = ifp; igi->igi_version = V_igmp_default_version; igi->igi_flags = 0; igi->igi_rv = IGMP_RV_INIT; igi->igi_qi = IGMP_QI_INIT; igi->igi_qri = IGMP_QRI_INIT; igi->igi_uri = IGMP_URI_INIT; SLIST_INIT(&igi->igi_relinmhead); /* * Responses to general queries are subject to bounds. */ IFQ_SET_MAXLEN(&igi->igi_gq, IGMP_MAX_RESPONSE_PACKETS); LIST_INSERT_HEAD(&V_igi_head, igi, igi_link); CTR2(KTR_IGMPV3, "allocate igmp_ifinfo for ifp %p(%s)", ifp, ifp->if_xname); out: return (igi); } /* * Hook for ifdetach. * * NOTE: Some finalization tasks need to run before the protocol domain * is detached, but also before the link layer does its cleanup. * * SMPNG: igmp_ifdetach() needs to take IF_ADDR_LOCK(). * XXX This is also bitten by unlocked ifma_protospec access. */ void igmp_ifdetach(struct ifnet *ifp) { struct igmp_ifinfo *igi; struct ifmultiaddr *ifma; struct in_multi *inm, *tinm; CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; if (igi->igi_version == IGMP_VERSION_3) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; #if 0 KASSERT(ifma->ifma_protospec != NULL, ("%s: ifma_protospec is NULL", __func__)); #endif inm = (struct in_multi *)ifma->ifma_protospec; if (inm->inm_state == IGMP_LEAVING_MEMBER) { SLIST_INSERT_HEAD(&igi->igi_relinmhead, inm, inm_nrele); } inm_clear_recorded(inm); } IF_ADDR_RUNLOCK(ifp); /* * Free the in_multi reference(s) for this IGMP lifecycle. */ SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele, tinm) { SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele); inm_release_locked(inm); } } IGMP_UNLOCK(); } /* * Hook for domifdetach. */ void igmp_domifdetach(struct ifnet *ifp) { struct igmp_ifinfo *igi; CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; igi_delete_locked(ifp); IGMP_UNLOCK(); } static void igi_delete_locked(const struct ifnet *ifp) { struct igmp_ifinfo *igi, *tigi; CTR3(KTR_IGMPV3, "%s: freeing igmp_ifinfo for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK_ASSERT(); LIST_FOREACH_SAFE(igi, &V_igi_head, igi_link, tigi) { if (igi->igi_ifp == ifp) { /* * Free deferred General Query responses. */ _IF_DRAIN(&igi->igi_gq); LIST_REMOVE(igi, igi_link); KASSERT(SLIST_EMPTY(&igi->igi_relinmhead), ("%s: there are dangling in_multi references", __func__)); free(igi, M_IGMP); return; } } #ifdef INVARIANTS panic("%s: igmp_ifinfo not found for ifp %p\n", __func__, ifp); #endif } /* * Process a received IGMPv1 query. * Return non-zero if the message should be dropped. * * VIMAGE: The curvnet pointer is derived from the input ifp. */ static int igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip, const struct igmp *igmp) { struct ifmultiaddr *ifma; struct igmp_ifinfo *igi; struct in_multi *inm; /* * IGMPv1 Host Mmembership Queries SHOULD always be addressed to * 224.0.0.1. They are always treated as General Queries. * igmp_group is always ignored. Do not drop it as a userland * daemon may wish to see it. * XXX SMPng: unlocked increments in igmpstat assumed atomic. */ if (!in_allhosts(ip->ip_dst) || !in_nullhost(igmp->igmp_group)) { IGMPSTAT_INC(igps_rcv_badqueries); return (0); } IGMPSTAT_INC(igps_rcv_gen_queries); IN_MULTI_LOCK(); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp)); if (igi->igi_flags & IGIF_LOOPBACK) { CTR2(KTR_IGMPV3, "ignore v1 query on IGIF_LOOPBACK ifp %p(%s)", ifp, ifp->if_xname); goto out_locked; } /* * Switch to IGMPv1 host compatibility mode. */ igmp_set_version(igi, IGMP_VERSION_1); CTR2(KTR_IGMPV3, "process v1 query on ifp %p(%s)", ifp, ifp->if_xname); /* * Start the timers in all of our group records * for the interface on which the query arrived, * except those which are already running. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; if (inm->inm_timer != 0) continue; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: inm->inm_state = IGMP_REPORTING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY( IGMP_V1V2_MAX_RI * PR_FASTHZ); V_current_state_timers_running = 1; break; case IGMP_LEAVING_MEMBER: break; } } IF_ADDR_RUNLOCK(ifp); out_locked: IGMP_UNLOCK(); IN_MULTI_UNLOCK(); return (0); } /* * Process a received IGMPv2 general or group-specific query. */ static int igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip, const struct igmp *igmp) { struct ifmultiaddr *ifma; struct igmp_ifinfo *igi; struct in_multi *inm; int is_general_query; uint16_t timer; is_general_query = 0; /* * Validate address fields upfront. * XXX SMPng: unlocked increments in igmpstat assumed atomic. */ if (in_nullhost(igmp->igmp_group)) { /* * IGMPv2 General Query. * If this was not sent to the all-hosts group, ignore it. */ if (!in_allhosts(ip->ip_dst)) return (0); IGMPSTAT_INC(igps_rcv_gen_queries); is_general_query = 1; } else { /* IGMPv2 Group-Specific Query. */ IGMPSTAT_INC(igps_rcv_group_queries); } IN_MULTI_LOCK(); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp)); if (igi->igi_flags & IGIF_LOOPBACK) { CTR2(KTR_IGMPV3, "ignore v2 query on IGIF_LOOPBACK ifp %p(%s)", ifp, ifp->if_xname); goto out_locked; } /* * Ignore v2 query if in v1 Compatibility Mode. */ if (igi->igi_version == IGMP_VERSION_1) goto out_locked; igmp_set_version(igi, IGMP_VERSION_2); timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE; if (timer == 0) timer = 1; if (is_general_query) { /* * For each reporting group joined on this * interface, kick the report timer. */ CTR2(KTR_IGMPV3, "process v2 general query on ifp %p(%s)", ifp, ifp->if_xname); IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; igmp_v2_update_group(inm, timer); } IF_ADDR_RUNLOCK(ifp); } else { /* * Group-specific IGMPv2 query, we need only * look up the single group to process it. */ inm = inm_lookup(ifp, igmp->igmp_group); if (inm != NULL) { CTR3(KTR_IGMPV3, "process v2 query %s on ifp %p(%s)", inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); igmp_v2_update_group(inm, timer); } } out_locked: IGMP_UNLOCK(); IN_MULTI_UNLOCK(); return (0); } /* * Update the report timer on a group in response to an IGMPv2 query. * * If we are becoming the reporting member for this group, start the timer. * If we already are the reporting member for this group, and timer is * below the threshold, reset it. * * We may be updating the group for the first time since we switched * to IGMPv3. If we are, then we must clear any recorded source lists, * and transition to REPORTING state; the group timer is overloaded * for group and group-source query responses. * * Unlike IGMPv3, the delay per group should be jittered * to avoid bursts of IGMPv2 reports. */ static void igmp_v2_update_group(struct in_multi *inm, const int timer) { CTR4(KTR_IGMPV3, "%s: %s/%s timer=%d", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname, timer); IN_MULTI_LOCK_ASSERT(); switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_REPORTING_MEMBER: if (inm->inm_timer != 0 && inm->inm_timer <= timer) { CTR1(KTR_IGMPV3, "%s: REPORTING and timer running, " "skipping.", __func__); break; } /* FALLTHROUGH */ case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: CTR1(KTR_IGMPV3, "%s: ->REPORTING", __func__); inm->inm_state = IGMP_REPORTING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; break; case IGMP_SLEEPING_MEMBER: CTR1(KTR_IGMPV3, "%s: ->AWAKENING", __func__); inm->inm_state = IGMP_AWAKENING_MEMBER; break; case IGMP_LEAVING_MEMBER: break; } } /* * Process a received IGMPv3 general, group-specific or * group-and-source-specific query. * Assumes m has already been pulled up to the full IGMP message length. * Return 0 if successful, otherwise an appropriate error code is returned. */ static int igmp_input_v3_query(struct ifnet *ifp, const struct ip *ip, /*const*/ struct igmpv3 *igmpv3) { struct igmp_ifinfo *igi; struct in_multi *inm; int is_general_query; uint32_t maxresp, nsrc, qqi; uint16_t timer; uint8_t qrv; is_general_query = 0; CTR2(KTR_IGMPV3, "process v3 query on ifp %p(%s)", ifp, ifp->if_xname); maxresp = igmpv3->igmp_code; /* in 1/10ths of a second */ if (maxresp >= 128) { maxresp = IGMP_MANT(igmpv3->igmp_code) << (IGMP_EXP(igmpv3->igmp_code) + 3); } /* * Robustness must never be less than 2 for on-wire IGMPv3. * FUTURE: Check if ifp has IGIF_LOOPBACK set, as we will make * an exception for interfaces whose IGMPv3 state changes * are redirected to loopback (e.g. MANET). */ qrv = IGMP_QRV(igmpv3->igmp_misc); if (qrv < 2) { CTR3(KTR_IGMPV3, "%s: clamping qrv %d to %d", __func__, qrv, IGMP_RV_INIT); qrv = IGMP_RV_INIT; } qqi = igmpv3->igmp_qqi; if (qqi >= 128) { qqi = IGMP_MANT(igmpv3->igmp_qqi) << (IGMP_EXP(igmpv3->igmp_qqi) + 3); } timer = maxresp * PR_FASTHZ / IGMP_TIMER_SCALE; if (timer == 0) timer = 1; nsrc = ntohs(igmpv3->igmp_numsrc); /* * Validate address fields and versions upfront before * accepting v3 query. * XXX SMPng: Unlocked access to igmpstat counters here. */ if (in_nullhost(igmpv3->igmp_group)) { /* * IGMPv3 General Query. * * General Queries SHOULD be directed to 224.0.0.1. * A general query with a source list has undefined * behaviour; discard it. */ IGMPSTAT_INC(igps_rcv_gen_queries); if (!in_allhosts(ip->ip_dst) || nsrc > 0) { IGMPSTAT_INC(igps_rcv_badqueries); return (0); } is_general_query = 1; } else { /* Group or group-source specific query. */ if (nsrc == 0) IGMPSTAT_INC(igps_rcv_group_queries); else IGMPSTAT_INC(igps_rcv_gsr_queries); } IN_MULTI_LOCK(); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp)); if (igi->igi_flags & IGIF_LOOPBACK) { CTR2(KTR_IGMPV3, "ignore v3 query on IGIF_LOOPBACK ifp %p(%s)", ifp, ifp->if_xname); goto out_locked; } /* * Discard the v3 query if we're in Compatibility Mode. * The RFC is not obviously worded that hosts need to stay in * compatibility mode until the Old Version Querier Present * timer expires. */ if (igi->igi_version != IGMP_VERSION_3) { CTR3(KTR_IGMPV3, "ignore v3 query in v%d mode on ifp %p(%s)", igi->igi_version, ifp, ifp->if_xname); goto out_locked; } igmp_set_version(igi, IGMP_VERSION_3); igi->igi_rv = qrv; igi->igi_qi = qqi; igi->igi_qri = maxresp; CTR4(KTR_IGMPV3, "%s: qrv %d qi %d qri %d", __func__, qrv, qqi, maxresp); if (is_general_query) { /* * Schedule a current-state report on this ifp for * all groups, possibly containing source lists. * If there is a pending General Query response * scheduled earlier than the selected delay, do * not schedule any other reports. * Otherwise, reset the interface timer. */ CTR2(KTR_IGMPV3, "process v3 general query on ifp %p(%s)", ifp, ifp->if_xname); if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) { igi->igi_v3_timer = IGMP_RANDOM_DELAY(timer); V_interface_timers_running = 1; } } else { /* * Group-source-specific queries are throttled on * a per-group basis to defeat denial-of-service attempts. * Queries for groups we are not a member of on this * link are simply ignored. */ inm = inm_lookup(ifp, igmpv3->igmp_group); if (inm == NULL) goto out_locked; if (nsrc > 0) { if (!ratecheck(&inm->inm_lastgsrtv, &V_igmp_gsrdelay)) { CTR1(KTR_IGMPV3, "%s: GS query throttled.", __func__); IGMPSTAT_INC(igps_drop_gsr_queries); goto out_locked; } } CTR3(KTR_IGMPV3, "process v3 %s query on ifp %p(%s)", inet_ntoa(igmpv3->igmp_group), ifp, ifp->if_xname); /* * If there is a pending General Query response * scheduled sooner than the selected delay, no * further report need be scheduled. * Otherwise, prepare to respond to the * group-specific or group-and-source query. */ if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) igmp_input_v3_group_query(inm, igi, timer, igmpv3); } out_locked: IGMP_UNLOCK(); IN_MULTI_UNLOCK(); return (0); } /* * Process a recieved IGMPv3 group-specific or group-and-source-specific * query. * Return <0 if any error occured. Currently this is ignored. */ static int igmp_input_v3_group_query(struct in_multi *inm, struct igmp_ifinfo *igi, int timer, /*const*/ struct igmpv3 *igmpv3) { int retval; uint16_t nsrc; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); retval = 0; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LEAVING_MEMBER: return (retval); break; case IGMP_REPORTING_MEMBER: case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: break; } nsrc = ntohs(igmpv3->igmp_numsrc); /* * Deal with group-specific queries upfront. * If any group query is already pending, purge any recorded * source-list state if it exists, and schedule a query response * for this group-specific query. */ if (nsrc == 0) { if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER || inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) { inm_clear_recorded(inm); timer = min(inm->inm_timer, timer); } inm->inm_state = IGMP_G_QUERY_PENDING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; return (retval); } /* * Deal with the case where a group-and-source-specific query has * been received but a group-specific query is already pending. */ if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER) { timer = min(inm->inm_timer, timer); inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; return (retval); } /* * Finally, deal with the case where a group-and-source-specific * query has been received, where a response to a previous g-s-r * query exists, or none exists. * In this case, we need to parse the source-list which the Querier * has provided us with and check if we have any source list filter * entries at T1 for these sources. If we do not, there is no need * schedule a report and the query may be dropped. * If we do, we must record them and schedule a current-state * report for those sources. * FIXME: Handling source lists larger than 1 mbuf requires that * we pass the mbuf chain pointer down to this function, and use * m_getptr() to walk the chain. */ if (inm->inm_nsrc > 0) { const struct in_addr *ap; int i, nrecorded; ap = (const struct in_addr *)(igmpv3 + 1); nrecorded = 0; for (i = 0; i < nsrc; i++, ap++) { retval = inm_record_source(inm, ap->s_addr); if (retval < 0) break; nrecorded += retval; } if (nrecorded > 0) { CTR1(KTR_IGMPV3, "%s: schedule response to SG query", __func__); inm->inm_state = IGMP_SG_QUERY_PENDING_MEMBER; inm->inm_timer = IGMP_RANDOM_DELAY(timer); V_current_state_timers_running = 1; } } return (retval); } /* * Process a received IGMPv1 host membership report. * * NOTE: 0.0.0.0 workaround breaks const correctness. */ static int igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip, /*const*/ struct igmp *igmp) { struct in_ifaddr *ia; struct in_multi *inm; IGMPSTAT_INC(igps_rcv_reports); if (ifp->if_flags & IFF_LOOPBACK) return (0); if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) || !in_hosteq(igmp->igmp_group, ip->ip_dst)) { IGMPSTAT_INC(igps_rcv_badreports); return (EINVAL); } /* * RFC 3376, Section 4.2.13, 9.2, 9.3: * Booting clients may use the source address 0.0.0.0. Some * IGMP daemons may not know how to use IP_RECVIF to determine * the interface upon which this message was received. * Replace 0.0.0.0 with the subnet address if told to do so. */ if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) { IFP_TO_IA(ifp, ia); if (ia != NULL) { ip->ip_src.s_addr = htonl(ia->ia_subnet); ifa_free(&ia->ia_ifa); } } CTR3(KTR_IGMPV3, "process v1 report %s on ifp %p(%s)", inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); /* * IGMPv1 report suppression. * If we are a member of this group, and our membership should be * reported, stop our group timer and transition to the 'lazy' state. */ IN_MULTI_LOCK(); inm = inm_lookup(ifp, igmp->igmp_group); if (inm != NULL) { struct igmp_ifinfo *igi; igi = inm->inm_igi; if (igi == NULL) { KASSERT(igi != NULL, ("%s: no igi for ifp %p", __func__, ifp)); goto out_locked; } IGMPSTAT_INC(igps_rcv_ourreports); /* * If we are in IGMPv3 host mode, do not allow the * other host's IGMPv1 report to suppress our reports * unless explicitly configured to do so. */ if (igi->igi_version == IGMP_VERSION_3) { if (V_igmp_legacysupp) igmp_v3_suppress_group_record(inm); goto out_locked; } inm->inm_timer = 0; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: CTR3(KTR_IGMPV3, "report suppressed for %s on ifp %p(%s)", inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); case IGMP_SLEEPING_MEMBER: inm->inm_state = IGMP_SLEEPING_MEMBER; break; case IGMP_REPORTING_MEMBER: CTR3(KTR_IGMPV3, "report suppressed for %s on ifp %p(%s)", inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); if (igi->igi_version == IGMP_VERSION_1) inm->inm_state = IGMP_LAZY_MEMBER; else if (igi->igi_version == IGMP_VERSION_2) inm->inm_state = IGMP_SLEEPING_MEMBER; break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } out_locked: IN_MULTI_UNLOCK(); return (0); } /* * Process a received IGMPv2 host membership report. * * NOTE: 0.0.0.0 workaround breaks const correctness. */ static int igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip, /*const*/ struct igmp *igmp) { struct in_ifaddr *ia; struct in_multi *inm; /* * Make sure we don't hear our own membership report. Fast * leave requires knowing that we are the only member of a * group. */ IFP_TO_IA(ifp, ia); if (ia != NULL && in_hosteq(ip->ip_src, IA_SIN(ia)->sin_addr)) { ifa_free(&ia->ia_ifa); return (0); } IGMPSTAT_INC(igps_rcv_reports); if (ifp->if_flags & IFF_LOOPBACK) { if (ia != NULL) ifa_free(&ia->ia_ifa); return (0); } if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) || !in_hosteq(igmp->igmp_group, ip->ip_dst)) { if (ia != NULL) ifa_free(&ia->ia_ifa); IGMPSTAT_INC(igps_rcv_badreports); return (EINVAL); } /* * RFC 3376, Section 4.2.13, 9.2, 9.3: * Booting clients may use the source address 0.0.0.0. Some * IGMP daemons may not know how to use IP_RECVIF to determine * the interface upon which this message was received. * Replace 0.0.0.0 with the subnet address if told to do so. */ if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) { if (ia != NULL) ip->ip_src.s_addr = htonl(ia->ia_subnet); } if (ia != NULL) ifa_free(&ia->ia_ifa); CTR3(KTR_IGMPV3, "process v2 report %s on ifp %p(%s)", inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); /* * IGMPv2 report suppression. * If we are a member of this group, and our membership should be * reported, and our group timer is pending or about to be reset, * stop our group timer by transitioning to the 'lazy' state. */ IN_MULTI_LOCK(); inm = inm_lookup(ifp, igmp->igmp_group); if (inm != NULL) { struct igmp_ifinfo *igi; igi = inm->inm_igi; KASSERT(igi != NULL, ("%s: no igi for ifp %p", __func__, ifp)); IGMPSTAT_INC(igps_rcv_ourreports); /* * If we are in IGMPv3 host mode, do not allow the * other host's IGMPv1 report to suppress our reports * unless explicitly configured to do so. */ if (igi->igi_version == IGMP_VERSION_3) { if (V_igmp_legacysupp) igmp_v3_suppress_group_record(inm); goto out_locked; } inm->inm_timer = 0; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_SLEEPING_MEMBER: break; case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_AWAKENING_MEMBER: CTR3(KTR_IGMPV3, "report suppressed for %s on ifp %p(%s)", inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); case IGMP_LAZY_MEMBER: inm->inm_state = IGMP_LAZY_MEMBER; break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } out_locked: IN_MULTI_UNLOCK(); return (0); } int igmp_input(struct mbuf **mp, int *offp, int proto) { int iphlen; struct ifnet *ifp; struct igmp *igmp; struct ip *ip; struct mbuf *m; int igmplen; int minlen; int queryver; CTR3(KTR_IGMPV3, "%s: called w/mbuf (%p,%d)", __func__, *mp, *offp); m = *mp; ifp = m->m_pkthdr.rcvif; *mp = NULL; IGMPSTAT_INC(igps_rcv_total); ip = mtod(m, struct ip *); iphlen = *offp; igmplen = ntohs(ip->ip_len) - iphlen; /* * Validate lengths. */ if (igmplen < IGMP_MINLEN) { IGMPSTAT_INC(igps_rcv_tooshort); m_freem(m); return (IPPROTO_DONE); } /* * Always pullup to the minimum size for v1/v2 or v3 * to amortize calls to m_pullup(). */ minlen = iphlen; if (igmplen >= IGMP_V3_QUERY_MINLEN) minlen += IGMP_V3_QUERY_MINLEN; else minlen += IGMP_MINLEN; if ((!M_WRITABLE(m) || m->m_len < minlen) && (m = m_pullup(m, minlen)) == 0) { IGMPSTAT_INC(igps_rcv_tooshort); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); /* * Validate checksum. */ m->m_data += iphlen; m->m_len -= iphlen; igmp = mtod(m, struct igmp *); if (in_cksum(m, igmplen)) { IGMPSTAT_INC(igps_rcv_badsum); m_freem(m); return (IPPROTO_DONE); } m->m_data -= iphlen; m->m_len += iphlen; /* * IGMP control traffic is link-scope, and must have a TTL of 1. * DVMRP traffic (e.g. mrinfo, mtrace) is an exception; * probe packets may come from beyond the LAN. */ if (igmp->igmp_type != IGMP_DVMRP && ip->ip_ttl != 1) { IGMPSTAT_INC(igps_rcv_badttl); m_freem(m); return (IPPROTO_DONE); } switch (igmp->igmp_type) { case IGMP_HOST_MEMBERSHIP_QUERY: if (igmplen == IGMP_MINLEN) { if (igmp->igmp_code == 0) queryver = IGMP_VERSION_1; else queryver = IGMP_VERSION_2; } else if (igmplen >= IGMP_V3_QUERY_MINLEN) { queryver = IGMP_VERSION_3; } else { IGMPSTAT_INC(igps_rcv_tooshort); m_freem(m); return (IPPROTO_DONE); } switch (queryver) { case IGMP_VERSION_1: IGMPSTAT_INC(igps_rcv_v1v2_queries); if (!V_igmp_v1enable) break; if (igmp_input_v1_query(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_VERSION_2: IGMPSTAT_INC(igps_rcv_v1v2_queries); if (!V_igmp_v2enable) break; if (igmp_input_v2_query(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_VERSION_3: { struct igmpv3 *igmpv3; uint16_t igmpv3len; uint16_t srclen; int nsrc; IGMPSTAT_INC(igps_rcv_v3_queries); igmpv3 = (struct igmpv3 *)igmp; /* * Validate length based on source count. */ nsrc = ntohs(igmpv3->igmp_numsrc); srclen = sizeof(struct in_addr) * nsrc; if (nsrc * sizeof(in_addr_t) > srclen) { IGMPSTAT_INC(igps_rcv_tooshort); return (IPPROTO_DONE); } /* * m_pullup() may modify m, so pullup in * this scope. */ igmpv3len = iphlen + IGMP_V3_QUERY_MINLEN + srclen; if ((!M_WRITABLE(m) || m->m_len < igmpv3len) && (m = m_pullup(m, igmpv3len)) == NULL) { IGMPSTAT_INC(igps_rcv_tooshort); return (IPPROTO_DONE); } igmpv3 = (struct igmpv3 *)(mtod(m, uint8_t *) + iphlen); if (igmp_input_v3_query(ifp, ip, igmpv3) != 0) { m_freem(m); return (IPPROTO_DONE); } } break; } break; case IGMP_v1_HOST_MEMBERSHIP_REPORT: if (!V_igmp_v1enable) break; if (igmp_input_v1_report(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_v2_HOST_MEMBERSHIP_REPORT: if (!V_igmp_v2enable) break; if (!ip_checkrouteralert(m)) IGMPSTAT_INC(igps_rcv_nora); if (igmp_input_v2_report(ifp, ip, igmp) != 0) { m_freem(m); return (IPPROTO_DONE); } break; case IGMP_v3_HOST_MEMBERSHIP_REPORT: /* * Hosts do not need to process IGMPv3 membership reports, * as report suppression is no longer required. */ if (!ip_checkrouteralert(m)) IGMPSTAT_INC(igps_rcv_nora); break; default: break; } /* * Pass all valid IGMP packets up to any process(es) listening on a * raw IGMP socket. */ *mp = m; return (rip_input(mp, offp, proto)); } /* * Fast timeout handler (global). * VIMAGE: Timeout handlers are expected to service all vimages. */ void igmp_fasttimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); igmp_fasttimo_vnet(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Fast timeout handler (per-vnet). * Sends are shuffled off to a netisr to deal with Giant. * * VIMAGE: Assume caller has set up our curvnet. */ static void igmp_fasttimo_vnet(void) { struct ifqueue scq; /* State-change packets */ struct ifqueue qrq; /* Query response packets */ struct ifnet *ifp; struct igmp_ifinfo *igi; struct ifmultiaddr *ifma; struct in_multi *inm; int loop, uri_fasthz; loop = 0; uri_fasthz = 0; /* * Quick check to see if any work needs to be done, in order to * minimize the overhead of fasttimo processing. * SMPng: XXX Unlocked reads. */ if (!V_current_state_timers_running && !V_interface_timers_running && !V_state_change_timers_running) return; IN_MULTI_LOCK(); IGMP_LOCK(); /* * IGMPv3 General Query response timer processing. */ if (V_interface_timers_running) { CTR1(KTR_IGMPV3, "%s: interface timers running", __func__); V_interface_timers_running = 0; LIST_FOREACH(igi, &V_igi_head, igi_link) { if (igi->igi_v3_timer == 0) { /* Do nothing. */ } else if (--igi->igi_v3_timer == 0) { igmp_v3_dispatch_general_query(igi); } else { V_interface_timers_running = 1; } } } if (!V_current_state_timers_running && !V_state_change_timers_running) goto out_locked; V_current_state_timers_running = 0; V_state_change_timers_running = 0; CTR1(KTR_IGMPV3, "%s: state change timers running", __func__); /* * IGMPv1/v2/v3 host report and state-change timer processing. * Note: Processing a v3 group timer may remove a node. */ LIST_FOREACH(igi, &V_igi_head, igi_link) { ifp = igi->igi_ifp; if (igi->igi_version == IGMP_VERSION_3) { loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0; uri_fasthz = IGMP_RANDOM_DELAY(igi->igi_uri * PR_FASTHZ); memset(&qrq, 0, sizeof(struct ifqueue)); IFQ_SET_MAXLEN(&qrq, IGMP_MAX_G_GS_PACKETS); memset(&scq, 0, sizeof(struct ifqueue)); IFQ_SET_MAXLEN(&scq, IGMP_MAX_STATE_CHANGE_PACKETS); } IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; switch (igi->igi_version) { case IGMP_VERSION_1: case IGMP_VERSION_2: igmp_v1v2_process_group_timer(inm, igi->igi_version); break; case IGMP_VERSION_3: igmp_v3_process_group_timers(igi, &qrq, &scq, inm, uri_fasthz); break; } } IF_ADDR_RUNLOCK(ifp); if (igi->igi_version == IGMP_VERSION_3) { struct in_multi *tinm; igmp_dispatch_queue(&qrq, 0, loop); igmp_dispatch_queue(&scq, 0, loop); /* * Free the in_multi reference(s) for this * IGMP lifecycle. */ SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele, tinm) { SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele); inm_release_locked(inm); } } } out_locked: IGMP_UNLOCK(); IN_MULTI_UNLOCK(); } /* * Update host report group timer for IGMPv1/v2. * Will update the global pending timer flags. */ static void igmp_v1v2_process_group_timer(struct in_multi *inm, const int version) { int report_timer_expired; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); if (inm->inm_timer == 0) { report_timer_expired = 0; } else if (--inm->inm_timer == 0) { report_timer_expired = 1; } else { V_current_state_timers_running = 1; return; } switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: break; case IGMP_REPORTING_MEMBER: if (report_timer_expired) { inm->inm_state = IGMP_IDLE_MEMBER; (void)igmp_v1v2_queue_report(inm, (version == IGMP_VERSION_2) ? IGMP_v2_HOST_MEMBERSHIP_REPORT : IGMP_v1_HOST_MEMBERSHIP_REPORT); } break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } /* * Update a group's timers for IGMPv3. * Will update the global pending timer flags. * Note: Unlocked read from igi. */ static void igmp_v3_process_group_timers(struct igmp_ifinfo *igi, struct ifqueue *qrq, struct ifqueue *scq, struct in_multi *inm, const int uri_fasthz) { int query_response_timer_expired; int state_change_retransmit_timer_expired; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); query_response_timer_expired = 0; state_change_retransmit_timer_expired = 0; /* * During a transition from v1/v2 compatibility mode back to v3, * a group record in REPORTING state may still have its group * timer active. This is a no-op in this function; it is easier * to deal with it here than to complicate the slow-timeout path. */ if (inm->inm_timer == 0) { query_response_timer_expired = 0; } else if (--inm->inm_timer == 0) { query_response_timer_expired = 1; } else { V_current_state_timers_running = 1; } if (inm->inm_sctimer == 0) { state_change_retransmit_timer_expired = 0; } else if (--inm->inm_sctimer == 0) { state_change_retransmit_timer_expired = 1; } else { V_state_change_timers_running = 1; } /* We are in fasttimo, so be quick about it. */ if (!state_change_retransmit_timer_expired && !query_response_timer_expired) return; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_AWAKENING_MEMBER: case IGMP_IDLE_MEMBER: break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: /* * Respond to a previously pending Group-Specific * or Group-and-Source-Specific query by enqueueing * the appropriate Current-State report for * immediate transmission. */ if (query_response_timer_expired) { int retval; retval = igmp_v3_enqueue_group_record(qrq, inm, 0, 1, (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER)); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); inm->inm_state = IGMP_REPORTING_MEMBER; /* XXX Clear recorded sources for next time. */ inm_clear_recorded(inm); } /* FALLTHROUGH */ case IGMP_REPORTING_MEMBER: case IGMP_LEAVING_MEMBER: if (state_change_retransmit_timer_expired) { /* * State-change retransmission timer fired. * If there are any further pending retransmissions, * set the global pending state-change flag, and * reset the timer. */ if (--inm->inm_scrv > 0) { inm->inm_sctimer = uri_fasthz; V_state_change_timers_running = 1; } /* * Retransmit the previously computed state-change * report. If there are no further pending * retransmissions, the mbuf queue will be consumed. * Update T0 state to T1 as we have now sent * a state-change. */ (void)igmp_v3_merge_state_changes(inm, scq); inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); /* * If we are leaving the group for good, make sure * we release IGMP's reference to it. * This release must be deferred using a SLIST, * as we are called from a loop which traverses * the in_ifmultiaddr TAILQ. */ if (inm->inm_state == IGMP_LEAVING_MEMBER && inm->inm_scrv == 0) { inm->inm_state = IGMP_NOT_MEMBER; SLIST_INSERT_HEAD(&igi->igi_relinmhead, inm, inm_nrele); } } break; } } /* * Suppress a group's pending response to a group or source/group query. * * Do NOT suppress state changes. This leads to IGMPv3 inconsistency. * Do NOT update ST1/ST0 as this operation merely suppresses * the currently pending group record. * Do NOT suppress the response to a general query. It is possible but * it would require adding another state or flag. */ static void igmp_v3_suppress_group_record(struct in_multi *inm) { IN_MULTI_LOCK_ASSERT(); KASSERT(inm->inm_igi->igi_version == IGMP_VERSION_3, ("%s: not IGMPv3 mode on link", __func__)); if (inm->inm_state != IGMP_G_QUERY_PENDING_MEMBER || inm->inm_state != IGMP_SG_QUERY_PENDING_MEMBER) return; if (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) inm_clear_recorded(inm); inm->inm_timer = 0; inm->inm_state = IGMP_REPORTING_MEMBER; } /* * Switch to a different IGMP version on the given interface, * as per Section 7.2.1. */ static void igmp_set_version(struct igmp_ifinfo *igi, const int version) { int old_version_timer; IGMP_LOCK_ASSERT(); CTR4(KTR_IGMPV3, "%s: switching to v%d on ifp %p(%s)", __func__, version, igi->igi_ifp, igi->igi_ifp->if_xname); if (version == IGMP_VERSION_1 || version == IGMP_VERSION_2) { /* * Compute the "Older Version Querier Present" timer as per * Section 8.12. */ old_version_timer = igi->igi_rv * igi->igi_qi + igi->igi_qri; old_version_timer *= PR_SLOWHZ; if (version == IGMP_VERSION_1) { igi->igi_v1_timer = old_version_timer; igi->igi_v2_timer = 0; } else if (version == IGMP_VERSION_2) { igi->igi_v1_timer = 0; igi->igi_v2_timer = old_version_timer; } } if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) { if (igi->igi_version != IGMP_VERSION_2) { igi->igi_version = IGMP_VERSION_2; igmp_v3_cancel_link_timers(igi); } } else if (igi->igi_v1_timer > 0) { if (igi->igi_version != IGMP_VERSION_1) { igi->igi_version = IGMP_VERSION_1; igmp_v3_cancel_link_timers(igi); } } } /* * Cancel pending IGMPv3 timers for the given link and all groups * joined on it; state-change, general-query, and group-query timers. * * Only ever called on a transition from v3 to Compatibility mode. Kill * the timers stone dead (this may be expensive for large N groups), they * will be restarted if Compatibility Mode deems that they must be due to * query processing. */ static void igmp_v3_cancel_link_timers(struct igmp_ifinfo *igi) { struct ifmultiaddr *ifma; struct ifnet *ifp; struct in_multi *inm, *tinm; CTR3(KTR_IGMPV3, "%s: cancel v3 timers on ifp %p(%s)", __func__, igi->igi_ifp, igi->igi_ifp->if_xname); IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); /* * Stop the v3 General Query Response on this link stone dead. * If fasttimo is woken up due to V_interface_timers_running, * the flag will be cleared if there are no pending link timers. */ igi->igi_v3_timer = 0; /* * Now clear the current-state and state-change report timers * for all memberships scoped to this link. */ ifp = igi->igi_ifp; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: /* * These states are either not relevant in v3 mode, * or are unreported. Do nothing. */ break; case IGMP_LEAVING_MEMBER: /* * If we are leaving the group and switching to * compatibility mode, we need to release the final * reference held for issuing the INCLUDE {}, and * transition to REPORTING to ensure the host leave * message is sent upstream to the old querier -- * transition to NOT would lose the leave and race. */ SLIST_INSERT_HEAD(&igi->igi_relinmhead, inm, inm_nrele); /* FALLTHROUGH */ case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: inm_clear_recorded(inm); /* FALLTHROUGH */ case IGMP_REPORTING_MEMBER: inm->inm_state = IGMP_REPORTING_MEMBER; break; } /* * Always clear state-change and group report timers. * Free any pending IGMPv3 state-change records. */ inm->inm_sctimer = 0; inm->inm_timer = 0; _IF_DRAIN(&inm->inm_scq); } IF_ADDR_RUNLOCK(ifp); SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele, tinm) { SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele); inm_release_locked(inm); } } /* * Update the Older Version Querier Present timers for a link. * See Section 7.2.1 of RFC 3376. */ static void igmp_v1v2_process_querier_timers(struct igmp_ifinfo *igi) { IGMP_LOCK_ASSERT(); if (igi->igi_v1_timer == 0 && igi->igi_v2_timer == 0) { /* * IGMPv1 and IGMPv2 Querier Present timers expired. * * Revert to IGMPv3. */ if (igi->igi_version != IGMP_VERSION_3) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_3, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_version = IGMP_VERSION_3; } } else if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) { /* * IGMPv1 Querier Present timer expired, * IGMPv2 Querier Present timer running. * If IGMPv2 was disabled since last timeout, * revert to IGMPv3. * If IGMPv2 is enabled, revert to IGMPv2. */ if (!V_igmp_v2enable) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_3, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_v2_timer = 0; igi->igi_version = IGMP_VERSION_3; } else { --igi->igi_v2_timer; if (igi->igi_version != IGMP_VERSION_2) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_2, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_version = IGMP_VERSION_2; igmp_v3_cancel_link_timers(igi); } } } else if (igi->igi_v1_timer > 0) { /* * IGMPv1 Querier Present timer running. * Stop IGMPv2 timer if running. * * If IGMPv1 was disabled since last timeout, * revert to IGMPv3. * If IGMPv1 is enabled, reset IGMPv2 timer if running. */ if (!V_igmp_v1enable) { CTR5(KTR_IGMPV3, "%s: transition from v%d -> v%d on %p(%s)", __func__, igi->igi_version, IGMP_VERSION_3, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_v1_timer = 0; igi->igi_version = IGMP_VERSION_3; } else { --igi->igi_v1_timer; } if (igi->igi_v2_timer > 0) { CTR3(KTR_IGMPV3, "%s: cancel v2 timer on %p(%s)", __func__, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_v2_timer = 0; } } } /* * Global slowtimo handler. * VIMAGE: Timeout handlers are expected to service all vimages. */ void igmp_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); igmp_slowtimo_vnet(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Per-vnet slowtimo handler. */ static void igmp_slowtimo_vnet(void) { struct igmp_ifinfo *igi; IGMP_LOCK(); LIST_FOREACH(igi, &V_igi_head, igi_link) { igmp_v1v2_process_querier_timers(igi); } IGMP_UNLOCK(); } /* * Dispatch an IGMPv1/v2 host report or leave message. * These are always small enough to fit inside a single mbuf. */ static int igmp_v1v2_queue_report(struct in_multi *inm, const int type) { struct ifnet *ifp; struct igmp *igmp; struct ip *ip; struct mbuf *m; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); ifp = inm->inm_ifp; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (ENOMEM); MH_ALIGN(m, sizeof(struct ip) + sizeof(struct igmp)); m->m_pkthdr.len = sizeof(struct ip) + sizeof(struct igmp); m->m_data += sizeof(struct ip); m->m_len = sizeof(struct igmp); igmp = mtod(m, struct igmp *); igmp->igmp_type = type; igmp->igmp_code = 0; igmp->igmp_group = inm->inm_addr; igmp->igmp_cksum = 0; igmp->igmp_cksum = in_cksum(m, sizeof(struct igmp)); m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); ip = mtod(m, struct ip *); ip->ip_tos = 0; ip->ip_len = htons(sizeof(struct ip) + sizeof(struct igmp)); ip->ip_off = 0; ip->ip_p = IPPROTO_IGMP; ip->ip_src.s_addr = INADDR_ANY; if (type == IGMP_HOST_LEAVE_MESSAGE) ip->ip_dst.s_addr = htonl(INADDR_ALLRTRS_GROUP); else ip->ip_dst = inm->inm_addr; igmp_save_context(m, ifp); m->m_flags |= M_IGMPV2; if (inm->inm_igi->igi_flags & IGIF_LOOPBACK) m->m_flags |= M_IGMP_LOOP; CTR2(KTR_IGMPV3, "%s: netisr_dispatch(NETISR_IGMP, %p)", __func__, m); netisr_dispatch(NETISR_IGMP, m); return (0); } /* * Process a state change from the upper layer for the given IPv4 group. * * Each socket holds a reference on the in_multi in its own ip_moptions. * The socket layer will have made the necessary updates to.the group * state, it is now up to IGMP to issue a state change report if there * has been any change between T0 (when the last state-change was issued) * and T1 (now). * * We use the IGMPv3 state machine at group level. The IGMP module * however makes the decision as to which IGMP protocol version to speak. * A state change *from* INCLUDE {} always means an initial join. * A state change *to* INCLUDE {} always means a final leave. * * FUTURE: If IGIF_V3LITE is enabled for this interface, then we can * save ourselves a bunch of work; any exclusive mode groups need not * compute source filter lists. * * VIMAGE: curvnet should have been set by caller, as this routine * is called from the socket option handlers. */ int igmp_change_state(struct in_multi *inm) { struct igmp_ifinfo *igi; struct ifnet *ifp; int error; IN_MULTI_LOCK_ASSERT(); error = 0; /* * Try to detect if the upper layer just asked us to change state * for an interface which has now gone away. */ KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__)); ifp = inm->inm_ifma->ifma_ifp; /* * Sanity check that netinet's notion of ifp is the * same as net's. */ KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__)); IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp)); /* * If we detect a state transition to or from MCAST_UNDEFINED * for this group, then we are starting or finishing an IGMP * life cycle for this group. */ if (inm->inm_st[1].iss_fmode != inm->inm_st[0].iss_fmode) { CTR3(KTR_IGMPV3, "%s: inm transition %d -> %d", __func__, inm->inm_st[0].iss_fmode, inm->inm_st[1].iss_fmode); if (inm->inm_st[0].iss_fmode == MCAST_UNDEFINED) { CTR1(KTR_IGMPV3, "%s: initial join", __func__); error = igmp_initial_join(inm, igi); goto out_locked; } else if (inm->inm_st[1].iss_fmode == MCAST_UNDEFINED) { CTR1(KTR_IGMPV3, "%s: final leave", __func__); igmp_final_leave(inm, igi); goto out_locked; } } else { CTR1(KTR_IGMPV3, "%s: filter set change", __func__); } error = igmp_handle_state_change(inm, igi); out_locked: IGMP_UNLOCK(); return (error); } /* * Perform the initial join for an IGMP group. * * When joining a group: * If the group should have its IGMP traffic suppressed, do nothing. * IGMPv1 starts sending IGMPv1 host membership reports. * IGMPv2 starts sending IGMPv2 host membership reports. * IGMPv3 will schedule an IGMPv3 state-change report containing the * initial state of the membership. */ static int igmp_initial_join(struct in_multi *inm, struct igmp_ifinfo *igi) { struct ifnet *ifp; struct ifqueue *ifq; int error, retval, syncstates; CTR4(KTR_IGMPV3, "%s: initial join %s on ifp %p(%s)", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp, inm->inm_ifp->if_xname); error = 0; syncstates = 1; ifp = inm->inm_ifp; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__)); /* * Groups joined on loopback or marked as 'not reported', * e.g. 224.0.0.1, enter the IGMP_SILENT_MEMBER state and * are never reported in any IGMP protocol exchanges. * All other groups enter the appropriate IGMP state machine * for the version in use on this link. * A link marked as IGIF_SILENT causes IGMP to be completely * disabled for the link. */ if ((ifp->if_flags & IFF_LOOPBACK) || (igi->igi_flags & IGIF_SILENT) || !igmp_isgroupreported(inm->inm_addr)) { CTR1(KTR_IGMPV3, "%s: not kicking state machine for silent group", __func__); inm->inm_state = IGMP_SILENT_MEMBER; inm->inm_timer = 0; } else { /* * Deal with overlapping in_multi lifecycle. * If this group was LEAVING, then make sure * we drop the reference we picked up to keep the * group around for the final INCLUDE {} enqueue. */ if (igi->igi_version == IGMP_VERSION_3 && inm->inm_state == IGMP_LEAVING_MEMBER) inm_release_locked(inm); inm->inm_state = IGMP_REPORTING_MEMBER; switch (igi->igi_version) { case IGMP_VERSION_1: case IGMP_VERSION_2: inm->inm_state = IGMP_IDLE_MEMBER; error = igmp_v1v2_queue_report(inm, (igi->igi_version == IGMP_VERSION_2) ? IGMP_v2_HOST_MEMBERSHIP_REPORT : IGMP_v1_HOST_MEMBERSHIP_REPORT); if (error == 0) { inm->inm_timer = IGMP_RANDOM_DELAY( IGMP_V1V2_MAX_RI * PR_FASTHZ); V_current_state_timers_running = 1; } break; case IGMP_VERSION_3: /* * Defer update of T0 to T1, until the first copy * of the state change has been transmitted. */ syncstates = 0; /* * Immediately enqueue a State-Change Report for * this interface, freeing any previous reports. * Don't kick the timers if there is nothing to do, * or if an error occurred. */ ifq = &inm->inm_scq; _IF_DRAIN(ifq); retval = igmp_v3_enqueue_group_record(ifq, inm, 1, 0, 0); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); if (retval <= 0) { error = retval * -1; break; } /* * Schedule transmission of pending state-change * report up to RV times for this link. The timer * will fire at the next igmp_fasttimo (~200ms), * giving us an opportunity to merge the reports. */ if (igi->igi_flags & IGIF_LOOPBACK) { inm->inm_scrv = 1; } else { KASSERT(igi->igi_rv > 1, ("%s: invalid robustness %d", __func__, igi->igi_rv)); inm->inm_scrv = igi->igi_rv; } inm->inm_sctimer = 1; V_state_change_timers_running = 1; error = 0; break; } } /* * Only update the T0 state if state change is atomic, * i.e. we don't need to wait for a timer to fire before we * can consider the state change to have been communicated. */ if (syncstates) { inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); } return (error); } /* * Issue an intermediate state change during the IGMP life-cycle. */ static int igmp_handle_state_change(struct in_multi *inm, struct igmp_ifinfo *igi) { struct ifnet *ifp; int retval; CTR4(KTR_IGMPV3, "%s: state change for %s on ifp %p(%s)", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp, inm->inm_ifp->if_xname); ifp = inm->inm_ifp; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__)); if ((ifp->if_flags & IFF_LOOPBACK) || (igi->igi_flags & IGIF_SILENT) || !igmp_isgroupreported(inm->inm_addr) || (igi->igi_version != IGMP_VERSION_3)) { if (!igmp_isgroupreported(inm->inm_addr)) { CTR1(KTR_IGMPV3, "%s: not kicking state machine for silent group", __func__); } CTR1(KTR_IGMPV3, "%s: nothing to do", __func__); inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); return (0); } _IF_DRAIN(&inm->inm_scq); retval = igmp_v3_enqueue_group_record(&inm->inm_scq, inm, 1, 0, 0); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); if (retval <= 0) return (-retval); /* * If record(s) were enqueued, start the state-change * report timer for this group. */ inm->inm_scrv = ((igi->igi_flags & IGIF_LOOPBACK) ? 1 : igi->igi_rv); inm->inm_sctimer = 1; V_state_change_timers_running = 1; return (0); } /* * Perform the final leave for an IGMP group. * * When leaving a group: * IGMPv1 does nothing. * IGMPv2 sends a host leave message, if and only if we are the reporter. * IGMPv3 enqueues a state-change report containing a transition * to INCLUDE {} for immediate transmission. */ static void igmp_final_leave(struct in_multi *inm, struct igmp_ifinfo *igi) { int syncstates; syncstates = 1; CTR4(KTR_IGMPV3, "%s: final leave %s on ifp %p(%s)", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp, inm->inm_ifp->if_xname); IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: case IGMP_LEAVING_MEMBER: /* Already leaving or left; do nothing. */ CTR1(KTR_IGMPV3, "%s: not kicking state machine for silent group", __func__); break; case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: if (igi->igi_version == IGMP_VERSION_2) { #ifdef INVARIANTS if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER || inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) panic("%s: IGMPv3 state reached, not IGMPv3 mode", __func__); #endif igmp_v1v2_queue_report(inm, IGMP_HOST_LEAVE_MESSAGE); inm->inm_state = IGMP_NOT_MEMBER; } else if (igi->igi_version == IGMP_VERSION_3) { /* * Stop group timer and all pending reports. * Immediately enqueue a state-change report * TO_IN {} to be sent on the next fast timeout, * giving us an opportunity to merge reports. */ _IF_DRAIN(&inm->inm_scq); inm->inm_timer = 0; if (igi->igi_flags & IGIF_LOOPBACK) { inm->inm_scrv = 1; } else { inm->inm_scrv = igi->igi_rv; } CTR4(KTR_IGMPV3, "%s: Leaving %s/%s with %d " "pending retransmissions.", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname, inm->inm_scrv); if (inm->inm_scrv == 0) { inm->inm_state = IGMP_NOT_MEMBER; inm->inm_sctimer = 0; } else { int retval; inm_acquire_locked(inm); retval = igmp_v3_enqueue_group_record( &inm->inm_scq, inm, 1, 0, 0); KASSERT(retval != 0, ("%s: enqueue record = %d", __func__, retval)); inm->inm_state = IGMP_LEAVING_MEMBER; inm->inm_sctimer = 1; V_state_change_timers_running = 1; syncstates = 0; } break; } break; case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: /* Our reports are suppressed; do nothing. */ break; } if (syncstates) { inm_commit(inm); CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; CTR3(KTR_IGMPV3, "%s: T1 now MCAST_UNDEFINED for %s/%s", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); } } /* * Enqueue an IGMPv3 group record to the given output queue. * * XXX This function could do with having the allocation code * split out, and the multiple-tree-walks coalesced into a single * routine as has been done in igmp_v3_enqueue_filter_change(). * * If is_state_change is zero, a current-state record is appended. * If is_state_change is non-zero, a state-change report is appended. * * If is_group_query is non-zero, an mbuf packet chain is allocated. * If is_group_query is zero, and if there is a packet with free space * at the tail of the queue, it will be appended to providing there * is enough free space. * Otherwise a new mbuf packet chain is allocated. * * If is_source_query is non-zero, each source is checked to see if * it was recorded for a Group-Source query, and will be omitted if * it is not both in-mode and recorded. * * The function will attempt to allocate leading space in the packet * for the IP/IGMP header to be prepended without fragmenting the chain. * * If successful the size of all data appended to the queue is returned, * otherwise an error code less than zero is returned, or zero if * no record(s) were appended. */ static int igmp_v3_enqueue_group_record(struct ifqueue *ifq, struct in_multi *inm, const int is_state_change, const int is_group_query, const int is_source_query) { struct igmp_grouprec ig; struct igmp_grouprec *pig; struct ifnet *ifp; struct ip_msource *ims, *nims; struct mbuf *m0, *m, *md; int error, is_filter_list_change; int minrec0len, m0srcs, msrcs, nbytes, off; int record_has_sources; int now; int type; in_addr_t naddr; uint8_t mode; IN_MULTI_LOCK_ASSERT(); error = 0; ifp = inm->inm_ifp; is_filter_list_change = 0; m = NULL; m0 = NULL; m0srcs = 0; msrcs = 0; nbytes = 0; nims = NULL; record_has_sources = 1; pig = NULL; type = IGMP_DO_NOTHING; mode = inm->inm_st[1].iss_fmode; /* * If we did not transition out of ASM mode during t0->t1, * and there are no source nodes to process, we can skip * the generation of source records. */ if (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0 && inm->inm_nsrc == 0) record_has_sources = 0; if (is_state_change) { /* * Queue a state change record. * If the mode did not change, and there are non-ASM * listeners or source filters present, * we potentially need to issue two records for the group. * If we are transitioning to MCAST_UNDEFINED, we need * not send any sources. * If there are ASM listeners, and there was no filter * mode transition of any kind, do nothing. */ if (mode != inm->inm_st[0].iss_fmode) { if (mode == MCAST_EXCLUDE) { CTR1(KTR_IGMPV3, "%s: change to EXCLUDE", __func__); type = IGMP_CHANGE_TO_EXCLUDE_MODE; } else { CTR1(KTR_IGMPV3, "%s: change to INCLUDE", __func__); type = IGMP_CHANGE_TO_INCLUDE_MODE; if (mode == MCAST_UNDEFINED) record_has_sources = 0; } } else { if (record_has_sources) { is_filter_list_change = 1; } else { type = IGMP_DO_NOTHING; } } } else { /* * Queue a current state record. */ if (mode == MCAST_EXCLUDE) { type = IGMP_MODE_IS_EXCLUDE; } else if (mode == MCAST_INCLUDE) { type = IGMP_MODE_IS_INCLUDE; KASSERT(inm->inm_st[1].iss_asm == 0, ("%s: inm %p is INCLUDE but ASM count is %d", __func__, inm, inm->inm_st[1].iss_asm)); } } /* * Generate the filter list changes using a separate function. */ if (is_filter_list_change) return (igmp_v3_enqueue_filter_change(ifq, inm)); if (type == IGMP_DO_NOTHING) { CTR3(KTR_IGMPV3, "%s: nothing to do for %s/%s", __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); return (0); } /* * If any sources are present, we must be able to fit at least * one in the trailing space of the tail packet's mbuf, * ideally more. */ minrec0len = sizeof(struct igmp_grouprec); if (record_has_sources) minrec0len += sizeof(in_addr_t); CTR4(KTR_IGMPV3, "%s: queueing %s for %s/%s", __func__, igmp_rec_type_to_str(type), inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); /* * Check if we have a packet in the tail of the queue for this * group into which the first group record for this group will fit. * Otherwise allocate a new packet. * Always allocate leading space for IP+RA_OPT+IGMP+REPORT. * Note: Group records for G/GSR query responses MUST be sent * in their own packet. */ m0 = ifq->ifq_tail; if (!is_group_query && m0 != NULL && (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) && (m0->m_pkthdr.len + minrec0len) < (ifp->if_mtu - IGMP_LEADINGSPACE)) { m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); m = m0; CTR1(KTR_IGMPV3, "%s: use existing packet", __func__); } else { if (_IF_QFULL(ifq)) { CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__); return (-ENOMEM); } m = NULL; m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); if (!is_state_change && !is_group_query) { m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) m->m_data += IGMP_LEADINGSPACE; } if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m) MH_ALIGN(m, IGMP_LEADINGSPACE); } if (m == NULL) return (-ENOMEM); igmp_save_context(m, ifp); CTR1(KTR_IGMPV3, "%s: allocated first packet", __func__); } /* * Append group record. * If we have sources, we don't know how many yet. */ ig.ig_type = type; ig.ig_datalen = 0; ig.ig_numsrc = 0; ig.ig_group = inm->inm_addr; if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } nbytes += sizeof(struct igmp_grouprec); /* * Append as many sources as will fit in the first packet. * If we are appending to a new packet, the chain allocation * may potentially use clusters; use m_getptr() in this case. * If we are appending to an existing packet, we need to obtain * a pointer to the group record after m_append(), in case a new * mbuf was allocated. * Only append sources which are in-mode at t1. If we are * transitioning to MCAST_UNDEFINED state on the group, do not * include source entries. * Only report recorded sources in our filter set when responding * to a group-source query. */ if (record_has_sources) { if (m == m0) { md = m_last(m); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + md->m_len - nbytes); } else { md = m_getptr(m, 0, &off); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); } msrcs = 0; RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, nims) { CTR2(KTR_IGMPV3, "%s: visit node %s", __func__, inet_ntoa_haddr(ims->ims_haddr)); now = ims_get_mode(inm, ims, 1); CTR2(KTR_IGMPV3, "%s: node is %d", __func__, now); if ((now != mode) || (now == mode && mode == MCAST_UNDEFINED)) { CTR1(KTR_IGMPV3, "%s: skip node", __func__); continue; } if (is_source_query && ims->ims_stp == 0) { CTR1(KTR_IGMPV3, "%s: skip unrecorded node", __func__); continue; } CTR1(KTR_IGMPV3, "%s: append node", __func__); naddr = htonl(ims->ims_haddr); if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } nbytes += sizeof(in_addr_t); ++msrcs; if (msrcs == m0srcs) break; } CTR2(KTR_IGMPV3, "%s: msrcs is %d this packet", __func__, msrcs); pig->ig_numsrc = htons(msrcs); nbytes += (msrcs * sizeof(in_addr_t)); } if (is_source_query && msrcs == 0) { CTR1(KTR_IGMPV3, "%s: no recorded sources to report", __func__); if (m != m0) m_freem(m); return (0); } /* * We are good to go with first packet. */ if (m != m0) { CTR1(KTR_IGMPV3, "%s: enqueueing first packet", __func__); m->m_pkthdr.PH_vt.vt_nrecs = 1; _IF_ENQUEUE(ifq, m); } else m->m_pkthdr.PH_vt.vt_nrecs++; /* * No further work needed if no source list in packet(s). */ if (!record_has_sources) return (nbytes); /* * Whilst sources remain to be announced, we need to allocate * a new packet and fill out as many sources as will fit. * Always try for a cluster first. */ while (nims != NULL) { if (_IF_QFULL(ifq)) { CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__); return (-ENOMEM); } m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) m->m_data += IGMP_LEADINGSPACE; if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m) MH_ALIGN(m, IGMP_LEADINGSPACE); } if (m == NULL) return (-ENOMEM); igmp_save_context(m, ifp); md = m_getptr(m, 0, &off); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); CTR1(KTR_IGMPV3, "%s: allocated next packet", __func__); if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } m->m_pkthdr.PH_vt.vt_nrecs = 1; nbytes += sizeof(struct igmp_grouprec); m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); msrcs = 0; RB_FOREACH_FROM(ims, ip_msource_tree, nims) { CTR2(KTR_IGMPV3, "%s: visit node %s", __func__, inet_ntoa_haddr(ims->ims_haddr)); now = ims_get_mode(inm, ims, 1); if ((now != mode) || (now == mode && mode == MCAST_UNDEFINED)) { CTR1(KTR_IGMPV3, "%s: skip node", __func__); continue; } if (is_source_query && ims->ims_stp == 0) { CTR1(KTR_IGMPV3, "%s: skip unrecorded node", __func__); continue; } CTR1(KTR_IGMPV3, "%s: append node", __func__); naddr = htonl(ims->ims_haddr); if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); return (-ENOMEM); } ++msrcs; if (msrcs == m0srcs) break; } pig->ig_numsrc = htons(msrcs); nbytes += (msrcs * sizeof(in_addr_t)); CTR1(KTR_IGMPV3, "%s: enqueueing next packet", __func__); _IF_ENQUEUE(ifq, m); } return (nbytes); } /* * Type used to mark record pass completion. * We exploit the fact we can cast to this easily from the * current filter modes on each ip_msource node. */ typedef enum { REC_NONE = 0x00, /* MCAST_UNDEFINED */ REC_ALLOW = 0x01, /* MCAST_INCLUDE */ REC_BLOCK = 0x02, /* MCAST_EXCLUDE */ REC_FULL = REC_ALLOW | REC_BLOCK } rectype_t; /* * Enqueue an IGMPv3 filter list change to the given output queue. * * Source list filter state is held in an RB-tree. When the filter list * for a group is changed without changing its mode, we need to compute * the deltas between T0 and T1 for each source in the filter set, * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records. * * As we may potentially queue two record types, and the entire R-B tree * needs to be walked at once, we break this out into its own function * so we can generate a tightly packed queue of packets. * * XXX This could be written to only use one tree walk, although that makes * serializing into the mbuf chains a bit harder. For now we do two walks * which makes things easier on us, and it may or may not be harder on * the L2 cache. * * If successful the size of all data appended to the queue is returned, * otherwise an error code less than zero is returned, or zero if * no record(s) were appended. */ static int igmp_v3_enqueue_filter_change(struct ifqueue *ifq, struct in_multi *inm) { static const int MINRECLEN = sizeof(struct igmp_grouprec) + sizeof(in_addr_t); struct ifnet *ifp; struct igmp_grouprec ig; struct igmp_grouprec *pig; struct ip_msource *ims, *nims; struct mbuf *m, *m0, *md; in_addr_t naddr; int m0srcs, nbytes, npbytes, off, rsrcs, schanged; int nallow, nblock; uint8_t mode, now, then; rectype_t crt, drt, nrt; IN_MULTI_LOCK_ASSERT(); if (inm->inm_nsrc == 0 || (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0)) return (0); ifp = inm->inm_ifp; /* interface */ mode = inm->inm_st[1].iss_fmode; /* filter mode at t1 */ crt = REC_NONE; /* current group record type */ drt = REC_NONE; /* mask of completed group record types */ nrt = REC_NONE; /* record type for current node */ m0srcs = 0; /* # source which will fit in current mbuf chain */ nbytes = 0; /* # of bytes appended to group's state-change queue */ npbytes = 0; /* # of bytes appended this packet */ rsrcs = 0; /* # sources encoded in current record */ schanged = 0; /* # nodes encoded in overall filter change */ nallow = 0; /* # of source entries in ALLOW_NEW */ nblock = 0; /* # of source entries in BLOCK_OLD */ nims = NULL; /* next tree node pointer */ /* * For each possible filter record mode. * The first kind of source we encounter tells us which * is the first kind of record we start appending. * If a node transitioned to UNDEFINED at t1, its mode is treated * as the inverse of the group's filter mode. */ while (drt != REC_FULL) { do { m0 = ifq->ifq_tail; if (m0 != NULL && (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) && (m0->m_pkthdr.len + MINRECLEN) < (ifp->if_mtu - IGMP_LEADINGSPACE)) { m = m0; m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); CTR1(KTR_IGMPV3, "%s: use previous packet", __func__); } else { m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) m->m_data += IGMP_LEADINGSPACE; if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m) MH_ALIGN(m, IGMP_LEADINGSPACE); } if (m == NULL) { CTR1(KTR_IGMPV3, "%s: m_get*() failed", __func__); return (-ENOMEM); } m->m_pkthdr.PH_vt.vt_nrecs = 0; igmp_save_context(m, ifp); m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); npbytes = 0; CTR1(KTR_IGMPV3, "%s: allocated new packet", __func__); } /* * Append the IGMP group record header to the * current packet's data area. * Recalculate pointer to free space for next * group record, in case m_append() allocated * a new mbuf or cluster. */ memset(&ig, 0, sizeof(ig)); ig.ig_group = inm->inm_addr; if (!m_append(m, sizeof(ig), (void *)&ig)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed", __func__); return (-ENOMEM); } npbytes += sizeof(struct igmp_grouprec); if (m != m0) { /* new packet; offset in c hain */ md = m_getptr(m, npbytes - sizeof(struct igmp_grouprec), &off); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); } else { /* current packet; offset from last append */ md = m_last(m); pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + md->m_len - sizeof(struct igmp_grouprec)); } /* * Begin walking the tree for this record type * pass, or continue from where we left off * previously if we had to allocate a new packet. * Only report deltas in-mode at t1. * We need not report included sources as allowed * if we are in inclusive mode on the group, * however the converse is not true. */ rsrcs = 0; if (nims == NULL) nims = RB_MIN(ip_msource_tree, &inm->inm_srcs); RB_FOREACH_FROM(ims, ip_msource_tree, nims) { CTR2(KTR_IGMPV3, "%s: visit node %s", __func__, inet_ntoa_haddr(ims->ims_haddr)); now = ims_get_mode(inm, ims, 1); then = ims_get_mode(inm, ims, 0); CTR3(KTR_IGMPV3, "%s: mode: t0 %d, t1 %d", __func__, then, now); if (now == then) { CTR1(KTR_IGMPV3, "%s: skip unchanged", __func__); continue; } if (mode == MCAST_EXCLUDE && now == MCAST_INCLUDE) { CTR1(KTR_IGMPV3, "%s: skip IN src on EX group", __func__); continue; } nrt = (rectype_t)now; if (nrt == REC_NONE) nrt = (rectype_t)(~mode & REC_FULL); if (schanged++ == 0) { crt = nrt; } else if (crt != nrt) continue; naddr = htonl(ims->ims_haddr); if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { if (m != m0) m_freem(m); CTR1(KTR_IGMPV3, "%s: m_append() failed", __func__); return (-ENOMEM); } nallow += !!(crt == REC_ALLOW); nblock += !!(crt == REC_BLOCK); if (++rsrcs == m0srcs) break; } /* * If we did not append any tree nodes on this * pass, back out of allocations. */ if (rsrcs == 0) { npbytes -= sizeof(struct igmp_grouprec); if (m != m0) { CTR1(KTR_IGMPV3, "%s: m_free(m)", __func__); m_freem(m); } else { CTR1(KTR_IGMPV3, "%s: m_adj(m, -ig)", __func__); m_adj(m, -((int)sizeof( struct igmp_grouprec))); } continue; } npbytes += (rsrcs * sizeof(in_addr_t)); if (crt == REC_ALLOW) pig->ig_type = IGMP_ALLOW_NEW_SOURCES; else if (crt == REC_BLOCK) pig->ig_type = IGMP_BLOCK_OLD_SOURCES; pig->ig_numsrc = htons(rsrcs); /* * Count the new group record, and enqueue this * packet if it wasn't already queued. */ m->m_pkthdr.PH_vt.vt_nrecs++; if (m != m0) _IF_ENQUEUE(ifq, m); nbytes += npbytes; } while (nims != NULL); drt |= crt; crt = (~crt & REC_FULL); } CTR3(KTR_IGMPV3, "%s: queued %d ALLOW_NEW, %d BLOCK_OLD", __func__, nallow, nblock); return (nbytes); } static int igmp_v3_merge_state_changes(struct in_multi *inm, struct ifqueue *ifscq) { struct ifqueue *gq; struct mbuf *m; /* pending state-change */ struct mbuf *m0; /* copy of pending state-change */ struct mbuf *mt; /* last state-change in packet */ int docopy, domerge; u_int recslen; docopy = 0; domerge = 0; recslen = 0; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); /* * If there are further pending retransmissions, make a writable * copy of each queued state-change message before merging. */ if (inm->inm_scrv > 0) docopy = 1; gq = &inm->inm_scq; #ifdef KTR if (gq->ifq_head == NULL) { CTR2(KTR_IGMPV3, "%s: WARNING: queue for inm %p is empty", __func__, inm); } #endif m = gq->ifq_head; while (m != NULL) { /* * Only merge the report into the current packet if * there is sufficient space to do so; an IGMPv3 report * packet may only contain 65,535 group records. * Always use a simple mbuf chain concatentation to do this, * as large state changes for single groups may have * allocated clusters. */ domerge = 0; mt = ifscq->ifq_tail; if (mt != NULL) { recslen = m_length(m, NULL); if ((mt->m_pkthdr.PH_vt.vt_nrecs + m->m_pkthdr.PH_vt.vt_nrecs <= IGMP_V3_REPORT_MAXRECS) && (mt->m_pkthdr.len + recslen <= (inm->inm_ifp->if_mtu - IGMP_LEADINGSPACE))) domerge = 1; } if (!domerge && _IF_QFULL(gq)) { CTR2(KTR_IGMPV3, "%s: outbound queue full, skipping whole packet %p", __func__, m); mt = m->m_nextpkt; if (!docopy) m_freem(m); m = mt; continue; } if (!docopy) { CTR2(KTR_IGMPV3, "%s: dequeueing %p", __func__, m); _IF_DEQUEUE(gq, m0); m = m0->m_nextpkt; } else { CTR2(KTR_IGMPV3, "%s: copying %p", __func__, m); m0 = m_dup(m, M_NOWAIT); if (m0 == NULL) return (ENOMEM); m0->m_nextpkt = NULL; m = m->m_nextpkt; } if (!domerge) { CTR3(KTR_IGMPV3, "%s: queueing %p to ifscq %p)", __func__, m0, ifscq); _IF_ENQUEUE(ifscq, m0); } else { struct mbuf *mtl; /* last mbuf of packet mt */ CTR3(KTR_IGMPV3, "%s: merging %p with ifscq tail %p)", __func__, m0, mt); mtl = m_last(mt); m0->m_flags &= ~M_PKTHDR; mt->m_pkthdr.len += recslen; mt->m_pkthdr.PH_vt.vt_nrecs += m0->m_pkthdr.PH_vt.vt_nrecs; mtl->m_next = m0; } } return (0); } /* * Respond to a pending IGMPv3 General Query. */ static void igmp_v3_dispatch_general_query(struct igmp_ifinfo *igi) { struct ifmultiaddr *ifma; struct ifnet *ifp; struct in_multi *inm; int retval, loop; IN_MULTI_LOCK_ASSERT(); IGMP_LOCK_ASSERT(); KASSERT(igi->igi_version == IGMP_VERSION_3, ("%s: called when version %d", __func__, igi->igi_version)); ifp = igi->igi_ifp; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; inm = (struct in_multi *)ifma->ifma_protospec; KASSERT(ifp == inm->inm_ifp, ("%s: inconsistent ifp", __func__)); switch (inm->inm_state) { case IGMP_NOT_MEMBER: case IGMP_SILENT_MEMBER: break; case IGMP_REPORTING_MEMBER: case IGMP_IDLE_MEMBER: case IGMP_LAZY_MEMBER: case IGMP_SLEEPING_MEMBER: case IGMP_AWAKENING_MEMBER: inm->inm_state = IGMP_REPORTING_MEMBER; retval = igmp_v3_enqueue_group_record(&igi->igi_gq, inm, 0, 0, 0); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); break; case IGMP_G_QUERY_PENDING_MEMBER: case IGMP_SG_QUERY_PENDING_MEMBER: case IGMP_LEAVING_MEMBER: break; } } IF_ADDR_RUNLOCK(ifp); loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0; igmp_dispatch_queue(&igi->igi_gq, IGMP_MAX_RESPONSE_BURST, loop); /* * Slew transmission of bursts over 500ms intervals. */ if (igi->igi_gq.ifq_head != NULL) { igi->igi_v3_timer = 1 + IGMP_RANDOM_DELAY( IGMP_RESPONSE_BURST_INTERVAL); V_interface_timers_running = 1; } } /* * Transmit the next pending IGMP message in the output queue. * * We get called from netisr_processqueue(). A mutex private to igmpoq * will be acquired and released around this routine. * * VIMAGE: Needs to store/restore vnet pointer on a per-mbuf-chain basis. * MRT: Nothing needs to be done, as IGMP traffic is always local to * a link and uses a link-scope multicast address. */ static void igmp_intr(struct mbuf *m) { struct ip_moptions imo; struct ifnet *ifp; struct mbuf *ipopts, *m0; int error; uint32_t ifindex; CTR2(KTR_IGMPV3, "%s: transmit %p", __func__, m); /* * Set VNET image pointer from enqueued mbuf chain * before doing anything else. Whilst we use interface * indexes to guard against interface detach, they are * unique to each VIMAGE and must be retrieved. */ CURVNET_SET((struct vnet *)(m->m_pkthdr.PH_loc.ptr)); ifindex = igmp_restore_context(m); /* * Check if the ifnet still exists. This limits the scope of * any race in the absence of a global ifp lock for low cost * (an array lookup). */ ifp = ifnet_byindex(ifindex); if (ifp == NULL) { CTR3(KTR_IGMPV3, "%s: dropped %p as ifindex %u went away.", __func__, m, ifindex); m_freem(m); IPSTAT_INC(ips_noroute); goto out; } ipopts = V_igmp_sendra ? m_raopt : NULL; imo.imo_multicast_ttl = 1; imo.imo_multicast_vif = -1; imo.imo_multicast_loop = (V_ip_mrouter != NULL); /* * If the user requested that IGMP traffic be explicitly * redirected to the loopback interface (e.g. they are running a * MANET interface and the routing protocol needs to see the * updates), handle this now. */ if (m->m_flags & M_IGMP_LOOP) imo.imo_multicast_ifp = V_loif; else imo.imo_multicast_ifp = ifp; if (m->m_flags & M_IGMPV2) { m0 = m; } else { m0 = igmp_v3_encap_report(ifp, m); if (m0 == NULL) { CTR2(KTR_IGMPV3, "%s: dropped %p", __func__, m); m_freem(m); IPSTAT_INC(ips_odropped); goto out; } } igmp_scrub_context(m0); m_clrprotoflags(m); m0->m_pkthdr.rcvif = V_loif; #ifdef MAC mac_netinet_igmp_send(ifp, m0); #endif error = ip_output(m0, ipopts, NULL, 0, &imo, NULL); if (error) { CTR3(KTR_IGMPV3, "%s: ip_output(%p) = %d", __func__, m0, error); goto out; } IGMPSTAT_INC(igps_snd_reports); out: /* * We must restore the existing vnet pointer before * continuing as we are run from netisr context. */ CURVNET_RESTORE(); } /* * Encapsulate an IGMPv3 report. * * The internal mbuf flag M_IGMPV3_HDR is used to indicate that the mbuf * chain has already had its IP/IGMPv3 header prepended. In this case * the function will not attempt to prepend; the lengths and checksums * will however be re-computed. * * Returns a pointer to the new mbuf chain head, or NULL if the * allocation failed. */ static struct mbuf * igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m) { struct igmp_report *igmp; struct ip *ip; int hdrlen, igmpreclen; KASSERT((m->m_flags & M_PKTHDR), ("%s: mbuf chain %p is !M_PKTHDR", __func__, m)); igmpreclen = m_length(m, NULL); hdrlen = sizeof(struct ip) + sizeof(struct igmp_report); if (m->m_flags & M_IGMPV3_HDR) { igmpreclen -= hdrlen; } else { M_PREPEND(m, hdrlen, M_NOWAIT); if (m == NULL) return (NULL); m->m_flags |= M_IGMPV3_HDR; } CTR2(KTR_IGMPV3, "%s: igmpreclen is %d", __func__, igmpreclen); m->m_data += sizeof(struct ip); m->m_len -= sizeof(struct ip); igmp = mtod(m, struct igmp_report *); igmp->ir_type = IGMP_v3_HOST_MEMBERSHIP_REPORT; igmp->ir_rsv1 = 0; igmp->ir_rsv2 = 0; igmp->ir_numgrps = htons(m->m_pkthdr.PH_vt.vt_nrecs); igmp->ir_cksum = 0; igmp->ir_cksum = in_cksum(m, sizeof(struct igmp_report) + igmpreclen); m->m_pkthdr.PH_vt.vt_nrecs = 0; m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); ip = mtod(m, struct ip *); ip->ip_tos = IPTOS_PREC_INTERNETCONTROL; ip->ip_len = htons(hdrlen + igmpreclen); ip->ip_off = htons(IP_DF); ip->ip_p = IPPROTO_IGMP; ip->ip_sum = 0; ip->ip_src.s_addr = INADDR_ANY; if (m->m_flags & M_IGMP_LOOP) { struct in_ifaddr *ia; IFP_TO_IA(ifp, ia); if (ia != NULL) { ip->ip_src = ia->ia_addr.sin_addr; ifa_free(&ia->ia_ifa); } } ip->ip_dst.s_addr = htonl(INADDR_ALLRPTS_GROUP); return (m); } #ifdef KTR static char * igmp_rec_type_to_str(const int type) { switch (type) { case IGMP_CHANGE_TO_EXCLUDE_MODE: return "TO_EX"; break; case IGMP_CHANGE_TO_INCLUDE_MODE: return "TO_IN"; break; case IGMP_MODE_IS_EXCLUDE: return "MODE_EX"; break; case IGMP_MODE_IS_INCLUDE: return "MODE_IN"; break; case IGMP_ALLOW_NEW_SOURCES: return "ALLOW_NEW"; break; case IGMP_BLOCK_OLD_SOURCES: return "BLOCK_OLD"; break; default: break; } return "unknown"; } #endif static void igmp_init(void *unused __unused) { CTR1(KTR_IGMPV3, "%s: initializing", __func__); IGMP_LOCK_INIT(); m_raopt = igmp_ra_alloc(); netisr_register(&igmp_nh); } SYSINIT(igmp_init, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, igmp_init, NULL); static void igmp_uninit(void *unused __unused) { CTR1(KTR_IGMPV3, "%s: tearing down", __func__); netisr_unregister(&igmp_nh); m_free(m_raopt); m_raopt = NULL; IGMP_LOCK_DESTROY(); } SYSUNINIT(igmp_uninit, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, igmp_uninit, NULL); static void vnet_igmp_init(const void *unused __unused) { CTR1(KTR_IGMPV3, "%s: initializing", __func__); LIST_INIT(&V_igi_head); } VNET_SYSINIT(vnet_igmp_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_igmp_init, NULL); static void vnet_igmp_uninit(const void *unused __unused) { CTR1(KTR_IGMPV3, "%s: tearing down", __func__); KASSERT(LIST_EMPTY(&V_igi_head), ("%s: igi list not empty; ifnets not detached?", __func__)); } VNET_SYSUNINIT(vnet_igmp_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_igmp_uninit, NULL); static int igmp_modevent(module_t mod, int type, void *unused __unused) { switch (type) { case MOD_LOAD: case MOD_UNLOAD: break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t igmp_mod = { "igmp", igmp_modevent, 0 }; DECLARE_MODULE(igmp, igmp_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); Index: user/dchagin/lemul/sys/netinet/in.c =================================================================== --- user/dchagin/lemul/sys/netinet/in.c (revision 274240) +++ user/dchagin/lemul/sys/netinet/in.c (revision 274241) @@ -1,1269 +1,1269 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (C) 2001 WIDE Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in.c 8.4 (Berkeley) 1/9/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_mpath.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int in_aifaddr_ioctl(u_long, caddr_t, struct ifnet *, struct thread *); static int in_difaddr_ioctl(caddr_t, struct ifnet *, struct thread *); static void in_socktrim(struct sockaddr_in *); static void in_purgemaddrs(struct ifnet *); static VNET_DEFINE(int, nosameprefix); #define V_nosameprefix VNET(nosameprefix) -SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, no_same_prefix, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, no_same_prefix, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nosameprefix), 0, "Refuse to create same prefixes on different interfaces"); VNET_DECLARE(struct inpcbinfo, ripcbinfo); #define V_ripcbinfo VNET(ripcbinfo) static struct sx in_control_sx; SX_SYSINIT(in_control_sx, &in_control_sx, "in_control"); /* * Return 1 if an internet address is for a ``local'' host * (one to which we have a connection). */ int in_localaddr(struct in_addr in) { register u_long i = ntohl(in.s_addr); register struct in_ifaddr *ia; IN_IFADDR_RLOCK(); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if ((i & ia->ia_subnetmask) == ia->ia_subnet) { IN_IFADDR_RUNLOCK(); return (1); } } IN_IFADDR_RUNLOCK(); return (0); } /* * Return 1 if an internet address is for the local host and configured * on one of its interfaces. */ int in_localip(struct in_addr in) { struct in_ifaddr *ia; IN_IFADDR_RLOCK(); LIST_FOREACH(ia, INADDR_HASH(in.s_addr), ia_hash) { if (IA_SIN(ia)->sin_addr.s_addr == in.s_addr) { IN_IFADDR_RUNLOCK(); return (1); } } IN_IFADDR_RUNLOCK(); return (0); } /* * Return a reference to the interface address which is different to * the supplied one but with same IP address value. */ static struct in_ifaddr * in_localip_more(struct in_ifaddr *ia) { in_addr_t in = IA_SIN(ia)->sin_addr.s_addr; struct in_ifaddr *it; IN_IFADDR_RLOCK(); LIST_FOREACH(it, INADDR_HASH(in), ia_hash) { if (it != ia && IA_SIN(it)->sin_addr.s_addr == in) { ifa_ref(&it->ia_ifa); IN_IFADDR_RUNLOCK(); return (it); } } IN_IFADDR_RUNLOCK(); return (NULL); } /* * Determine whether an IP address is in a reserved set of addresses * that may not be forwarded, or whether datagrams to that destination * may be forwarded. */ int in_canforward(struct in_addr in) { register u_long i = ntohl(in.s_addr); register u_long net; if (IN_EXPERIMENTAL(i) || IN_MULTICAST(i) || IN_LINKLOCAL(i)) return (0); if (IN_CLASSA(i)) { net = i & IN_CLASSA_NET; if (net == 0 || net == (IN_LOOPBACKNET << IN_CLASSA_NSHIFT)) return (0); } return (1); } /* * Trim a mask in a sockaddr */ static void in_socktrim(struct sockaddr_in *ap) { register char *cplim = (char *) &ap->sin_addr; register char *cp = (char *) (&ap->sin_addr + 1); ap->sin_len = 0; while (--cp >= cplim) if (*cp) { (ap)->sin_len = cp - (char *) (ap) + 1; break; } } /* * Generic internet control operations (ioctl's). */ int in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { struct ifreq *ifr = (struct ifreq *)data; struct sockaddr_in *addr = (struct sockaddr_in *)&ifr->ifr_addr; struct ifaddr *ifa; struct in_ifaddr *ia; int error; if (ifp == NULL) return (EADDRNOTAVAIL); /* * Filter out 4 ioctls we implement directly. Forward the rest * to specific functions and ifp->if_ioctl(). */ switch (cmd) { case SIOCGIFADDR: case SIOCGIFBRDADDR: case SIOCGIFDSTADDR: case SIOCGIFNETMASK: break; case SIOCDIFADDR: sx_xlock(&in_control_sx); error = in_difaddr_ioctl(data, ifp, td); sx_xunlock(&in_control_sx); return (error); case OSIOCAIFADDR: /* 9.x compat */ case SIOCAIFADDR: sx_xlock(&in_control_sx); error = in_aifaddr_ioctl(cmd, data, ifp, td); sx_xunlock(&in_control_sx); return (error); case SIOCSIFADDR: case SIOCSIFBRDADDR: case SIOCSIFDSTADDR: case SIOCSIFNETMASK: /* We no longer support that old commands. */ return (EINVAL); default: if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); return ((*ifp->if_ioctl)(ifp, cmd, data)); } if (addr->sin_addr.s_addr != INADDR_ANY && prison_check_ip4(td->td_ucred, &addr->sin_addr) != 0) return (EADDRNOTAVAIL); /* * Find address for this interface, if it exists. If an * address was specified, find that one instead of the * first one on the interface, if possible. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = (struct in_ifaddr *)ifa; if (ia->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr) break; } if (ifa == NULL) TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET) { ia = (struct in_ifaddr *)ifa; if (prison_check_ip4(td->td_ucred, &ia->ia_addr.sin_addr) == 0) break; } if (ifa == NULL) { IF_ADDR_RUNLOCK(ifp); return (EADDRNOTAVAIL); } error = 0; switch (cmd) { case SIOCGIFADDR: *addr = ia->ia_addr; break; case SIOCGIFBRDADDR: if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EINVAL; break; } *addr = ia->ia_broadaddr; break; case SIOCGIFDSTADDR: if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { error = EINVAL; break; } *addr = ia->ia_dstaddr; break; case SIOCGIFNETMASK: *addr = ia->ia_sockmask; break; } IF_ADDR_RUNLOCK(ifp); return (error); } static int in_aifaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { const struct in_aliasreq *ifra = (struct in_aliasreq *)data; const struct sockaddr_in *addr = &ifra->ifra_addr; const struct sockaddr_in *broadaddr = &ifra->ifra_broadaddr; const struct sockaddr_in *mask = &ifra->ifra_mask; const struct sockaddr_in *dstaddr = &ifra->ifra_dstaddr; const int vhid = (cmd == SIOCAIFADDR) ? ifra->ifra_vhid : 0; struct ifaddr *ifa; struct in_ifaddr *ia; bool iaIsFirst; int error = 0; error = priv_check(td, PRIV_NET_ADDIFADDR); if (error) return (error); /* * ifra_addr must be present and be of INET family. * ifra_broadaddr/ifra_dstaddr and ifra_mask are optional. */ if (addr->sin_len != sizeof(struct sockaddr_in) || addr->sin_family != AF_INET) return (EINVAL); if (broadaddr->sin_len != 0 && (broadaddr->sin_len != sizeof(struct sockaddr_in) || broadaddr->sin_family != AF_INET)) return (EINVAL); if (mask->sin_len != 0 && (mask->sin_len != sizeof(struct sockaddr_in) || mask->sin_family != AF_INET)) return (EINVAL); if ((ifp->if_flags & IFF_POINTOPOINT) && (dstaddr->sin_len != sizeof(struct sockaddr_in) || dstaddr->sin_addr.s_addr == INADDR_ANY)) return (EDESTADDRREQ); if (vhid > 0 && carp_attach_p == NULL) return (EPROTONOSUPPORT); /* * See whether address already exist. */ iaIsFirst = true; ia = NULL; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in_ifaddr *it; if (ifa->ifa_addr->sa_family != AF_INET) continue; it = (struct in_ifaddr *)ifa; iaIsFirst = false; if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr && prison_check_ip4(td->td_ucred, &addr->sin_addr) == 0) ia = it; } IF_ADDR_RUNLOCK(ifp); if (ia != NULL) (void )in_difaddr_ioctl(data, ifp, td); ifa = ifa_alloc(sizeof(struct in_ifaddr), M_WAITOK); ia = (struct in_ifaddr *)ifa; ifa->ifa_addr = (struct sockaddr *)&ia->ia_addr; ifa->ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; ifa->ifa_netmask = (struct sockaddr *)&ia->ia_sockmask; ia->ia_ifp = ifp; ia->ia_addr = *addr; if (mask->sin_len != 0) { ia->ia_sockmask = *mask; ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr); } else { in_addr_t i = ntohl(addr->sin_addr.s_addr); /* * Be compatible with network classes, if netmask isn't * supplied, guess it based on classes. */ if (IN_CLASSA(i)) ia->ia_subnetmask = IN_CLASSA_NET; else if (IN_CLASSB(i)) ia->ia_subnetmask = IN_CLASSB_NET; else ia->ia_subnetmask = IN_CLASSC_NET; ia->ia_sockmask.sin_addr.s_addr = htonl(ia->ia_subnetmask); } ia->ia_subnet = ntohl(addr->sin_addr.s_addr) & ia->ia_subnetmask; in_socktrim(&ia->ia_sockmask); if (ifp->if_flags & IFF_BROADCAST) { if (broadaddr->sin_len != 0) { ia->ia_broadaddr = *broadaddr; } else if (ia->ia_subnetmask == IN_RFC3021_MASK) { ia->ia_broadaddr.sin_addr.s_addr = INADDR_BROADCAST; ia->ia_broadaddr.sin_len = sizeof(struct sockaddr_in); ia->ia_broadaddr.sin_family = AF_INET; } else { ia->ia_broadaddr.sin_addr.s_addr = htonl(ia->ia_subnet | ~ia->ia_subnetmask); ia->ia_broadaddr.sin_len = sizeof(struct sockaddr_in); ia->ia_broadaddr.sin_family = AF_INET; } } if (ifp->if_flags & IFF_POINTOPOINT) ia->ia_dstaddr = *dstaddr; /* XXXGL: rtinit() needs this strange assignment. */ if (ifp->if_flags & IFF_LOOPBACK) ia->ia_dstaddr = ia->ia_addr; if (vhid != 0) { error = (*carp_attach_p)(&ia->ia_ifa, vhid); if (error) return (error); } /* if_addrhead is already referenced by ifa_alloc() */ IF_ADDR_WLOCK(ifp); TAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_ref(ifa); /* in_ifaddrhead */ IN_IFADDR_WLOCK(); TAILQ_INSERT_TAIL(&V_in_ifaddrhead, ia, ia_link); LIST_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), ia, ia_hash); IN_IFADDR_WUNLOCK(); /* * Give the interface a chance to initialize * if this is its first address, * and to validate the address if necessary. */ if (ifp->if_ioctl != NULL) { error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia); if (error) goto fail1; } /* * Add route for the network. */ if (vhid == 0) { int flags = RTF_UP; if (ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT)) flags |= RTF_HOST; error = in_addprefix(ia, flags); if (error) goto fail1; } /* * Add a loopback route to self. */ if (vhid == 0 && (ifp->if_flags & IFF_LOOPBACK) == 0 && ia->ia_addr.sin_addr.s_addr != INADDR_ANY && !((ifp->if_flags & IFF_POINTOPOINT) && ia->ia_dstaddr.sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr)) { struct in_ifaddr *eia; eia = in_localip_more(ia); if (eia == NULL) { error = ifa_add_loopback_route((struct ifaddr *)ia, (struct sockaddr *)&ia->ia_addr); if (error) goto fail2; } else ifa_free(&eia->ia_ifa); } if (iaIsFirst && (ifp->if_flags & IFF_MULTICAST)) { struct in_addr allhosts_addr; struct in_ifinfo *ii; ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); allhosts_addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP); error = in_joingroup(ifp, &allhosts_addr, NULL, &ii->ii_allhosts); } EVENTHANDLER_INVOKE(ifaddr_event, ifp); return (error); fail2: if (vhid == 0) (void )in_scrubprefix(ia, LLE_STATIC); fail1: if (ia->ia_ifa.ifa_carp) (*carp_detach_p)(&ia->ia_ifa); IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ IN_IFADDR_WLOCK(); TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link); LIST_REMOVE(ia, ia_hash); IN_IFADDR_WUNLOCK(); ifa_free(&ia->ia_ifa); /* in_ifaddrhead */ return (error); } static int in_difaddr_ioctl(caddr_t data, struct ifnet *ifp, struct thread *td) { const struct ifreq *ifr = (struct ifreq *)data; const struct sockaddr_in *addr = (const struct sockaddr_in *) &ifr->ifr_addr; struct ifaddr *ifa; struct in_ifaddr *ia; bool deleteAny, iaIsLast; int error; if (td != NULL) { error = priv_check(td, PRIV_NET_DELIFADDR); if (error) return (error); } if (addr->sin_len != sizeof(struct sockaddr_in) || addr->sin_family != AF_INET) deleteAny = true; else deleteAny = false; iaIsLast = true; ia = NULL; IF_ADDR_WLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in_ifaddr *it; if (ifa->ifa_addr->sa_family != AF_INET) continue; it = (struct in_ifaddr *)ifa; if (deleteAny && ia == NULL && (td == NULL || prison_check_ip4(td->td_ucred, &it->ia_addr.sin_addr) == 0)) ia = it; if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr && (td == NULL || prison_check_ip4(td->td_ucred, &addr->sin_addr) == 0)) ia = it; if (it != ia) iaIsLast = false; } if (ia == NULL) { IF_ADDR_WUNLOCK(ifp); return (EADDRNOTAVAIL); } TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ IN_IFADDR_WLOCK(); TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link); LIST_REMOVE(ia, ia_hash); IN_IFADDR_WUNLOCK(); /* * in_scrubprefix() kills the interface route. */ in_scrubprefix(ia, LLE_STATIC); /* * in_ifadown gets rid of all the rest of * the routes. This is not quite the right * thing to do, but at least if we are running * a routing process they will come back. */ in_ifadown(&ia->ia_ifa, 1); if (ia->ia_ifa.ifa_carp) (*carp_detach_p)(&ia->ia_ifa); /* * If this is the last IPv4 address configured on this * interface, leave the all-hosts group. * No state-change report need be transmitted. */ if (iaIsLast && (ifp->if_flags & IFF_MULTICAST)) { struct in_ifinfo *ii; ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); IN_MULTI_LOCK(); if (ii->ii_allhosts) { (void)in_leavegroup_locked(ii->ii_allhosts, NULL); ii->ii_allhosts = NULL; } IN_MULTI_UNLOCK(); } EVENTHANDLER_INVOKE(ifaddr_event, ifp); ifa_free(&ia->ia_ifa); /* in_ifaddrhead */ return (0); } #define rtinitflags(x) \ ((((x)->ia_ifp->if_flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) != 0) \ ? RTF_HOST : 0) /* * Check if we have a route for the given prefix already or add one accordingly. */ int in_addprefix(struct in_ifaddr *target, int flags) { struct in_ifaddr *ia; struct in_addr prefix, mask, p, m; int error; if ((flags & RTF_HOST) != 0) { prefix = target->ia_dstaddr.sin_addr; mask.s_addr = 0; } else { prefix = target->ia_addr.sin_addr; mask = target->ia_sockmask.sin_addr; prefix.s_addr &= mask.s_addr; } IN_IFADDR_RLOCK(); /* Look for an existing address with the same prefix, mask, and fib */ TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (rtinitflags(ia)) { p = ia->ia_dstaddr.sin_addr; if (prefix.s_addr != p.s_addr) continue; } else { p = ia->ia_addr.sin_addr; m = ia->ia_sockmask.sin_addr; p.s_addr &= m.s_addr; if (prefix.s_addr != p.s_addr || mask.s_addr != m.s_addr) continue; } if (target->ia_ifp->if_fib != ia->ia_ifp->if_fib) continue; /* * If we got a matching prefix route inserted by other * interface address, we are done here. */ if (ia->ia_flags & IFA_ROUTE) { #ifdef RADIX_MPATH if (ia->ia_addr.sin_addr.s_addr == target->ia_addr.sin_addr.s_addr) { IN_IFADDR_RUNLOCK(); return (EEXIST); } else break; #endif if (V_nosameprefix) { IN_IFADDR_RUNLOCK(); return (EEXIST); } else { int fibnum; fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS : target->ia_ifp->if_fib; rt_addrmsg(RTM_ADD, &target->ia_ifa, fibnum); IN_IFADDR_RUNLOCK(); return (0); } } } IN_IFADDR_RUNLOCK(); /* * No-one seem to have this prefix route, so we try to insert it. */ error = rtinit(&target->ia_ifa, (int)RTM_ADD, flags); if (!error) target->ia_flags |= IFA_ROUTE; return (error); } /* * If there is no other address in the system that can serve a route to the * same prefix, remove the route. Hand over the route to the new address * otherwise. */ int in_scrubprefix(struct in_ifaddr *target, u_int flags) { struct in_ifaddr *ia; struct in_addr prefix, mask, p, m; int error = 0; struct sockaddr_in prefix0, mask0; /* * Remove the loopback route to the interface address. */ if ((target->ia_addr.sin_addr.s_addr != INADDR_ANY) && !(target->ia_ifp->if_flags & IFF_LOOPBACK) && (flags & LLE_STATIC)) { struct in_ifaddr *eia; eia = in_localip_more(target); if (eia != NULL) { int fibnum = target->ia_ifp->if_fib; error = ifa_switch_loopback_route((struct ifaddr *)eia, (struct sockaddr *)&target->ia_addr, fibnum); ifa_free(&eia->ia_ifa); } else { error = ifa_del_loopback_route((struct ifaddr *)target, (struct sockaddr *)&target->ia_addr); } if (!(target->ia_ifp->if_flags & IFF_NOARP)) /* remove arp cache */ arp_ifscrub(target->ia_ifp, IA_SIN(target)->sin_addr.s_addr); } if (rtinitflags(target)) { prefix = target->ia_dstaddr.sin_addr; mask.s_addr = 0; } else { prefix = target->ia_addr.sin_addr; mask = target->ia_sockmask.sin_addr; prefix.s_addr &= mask.s_addr; } if ((target->ia_flags & IFA_ROUTE) == 0) { int fibnum; fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS : target->ia_ifp->if_fib; rt_addrmsg(RTM_DELETE, &target->ia_ifa, fibnum); return (0); } IN_IFADDR_RLOCK(); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (rtinitflags(ia)) { p = ia->ia_dstaddr.sin_addr; if (prefix.s_addr != p.s_addr) continue; } else { p = ia->ia_addr.sin_addr; m = ia->ia_sockmask.sin_addr; p.s_addr &= m.s_addr; if (prefix.s_addr != p.s_addr || mask.s_addr != m.s_addr) continue; } if ((ia->ia_ifp->if_flags & IFF_UP) == 0) continue; /* * If we got a matching prefix address, move IFA_ROUTE and * the route itself to it. Make sure that routing daemons * get a heads-up. */ if ((ia->ia_flags & IFA_ROUTE) == 0) { ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(); error = rtinit(&(target->ia_ifa), (int)RTM_DELETE, rtinitflags(target)); if (error == 0) target->ia_flags &= ~IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, old prefix delete failed\n", error); error = rtinit(&ia->ia_ifa, (int)RTM_ADD, rtinitflags(ia) | RTF_UP); if (error == 0) ia->ia_flags |= IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, new prefix add failed\n", error); ifa_free(&ia->ia_ifa); return (error); } } IN_IFADDR_RUNLOCK(); /* * remove all L2 entries on the given prefix */ bzero(&prefix0, sizeof(prefix0)); prefix0.sin_len = sizeof(prefix0); prefix0.sin_family = AF_INET; prefix0.sin_addr.s_addr = target->ia_subnet; bzero(&mask0, sizeof(mask0)); mask0.sin_len = sizeof(mask0); mask0.sin_family = AF_INET; mask0.sin_addr.s_addr = target->ia_subnetmask; lltable_prefix_free(AF_INET, (struct sockaddr *)&prefix0, (struct sockaddr *)&mask0, flags); /* * As no-one seem to have this prefix, we can remove the route. */ error = rtinit(&(target->ia_ifa), (int)RTM_DELETE, rtinitflags(target)); if (error == 0) target->ia_flags &= ~IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, prefix delete failed\n", error); return (error); } #undef rtinitflags /* * Return 1 if the address might be a local broadcast address. */ int in_broadcast(struct in_addr in, struct ifnet *ifp) { register struct ifaddr *ifa; u_long t; if (in.s_addr == INADDR_BROADCAST || in.s_addr == INADDR_ANY) return (1); if ((ifp->if_flags & IFF_BROADCAST) == 0) return (0); t = ntohl(in.s_addr); /* * Look through the list of addresses for a match * with a broadcast address. */ #define ia ((struct in_ifaddr *)ifa) TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET && (in.s_addr == ia->ia_broadaddr.sin_addr.s_addr || /* * Check for old-style (host 0) broadcast, but * taking into account that RFC 3021 obsoletes it. */ (ia->ia_subnetmask != IN_RFC3021_MASK && t == ia->ia_subnet)) && /* * Check for an all one subnetmask. These * only exist when an interface gets a secondary * address. */ ia->ia_subnetmask != (u_long)0xffffffff) return (1); return (0); #undef ia } /* * On interface removal, clean up IPv4 data structures hung off of the ifnet. */ void in_ifdetach(struct ifnet *ifp) { in_pcbpurgeif0(&V_ripcbinfo, ifp); in_pcbpurgeif0(&V_udbinfo, ifp); in_pcbpurgeif0(&V_ulitecbinfo, ifp); in_purgemaddrs(ifp); } /* * Delete all IPv4 multicast address records, and associated link-layer * multicast address records, associated with ifp. * XXX It looks like domifdetach runs AFTER the link layer cleanup. * XXX This should not race with ifma_protospec being set during * a new allocation, if it does, we have bigger problems. */ static void in_purgemaddrs(struct ifnet *ifp) { LIST_HEAD(,in_multi) purgeinms; struct in_multi *inm, *tinm; struct ifmultiaddr *ifma; LIST_INIT(&purgeinms); IN_MULTI_LOCK(); /* * Extract list of in_multi associated with the detaching ifp * which the PF_INET layer is about to release. * We need to do this as IF_ADDR_LOCK() may be re-acquired * by code further down. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET || ifma->ifma_protospec == NULL) continue; #if 0 KASSERT(ifma->ifma_protospec != NULL, ("%s: ifma_protospec is NULL", __func__)); #endif inm = (struct in_multi *)ifma->ifma_protospec; LIST_INSERT_HEAD(&purgeinms, inm, inm_link); } IF_ADDR_RUNLOCK(ifp); LIST_FOREACH_SAFE(inm, &purgeinms, inm_link, tinm) { LIST_REMOVE(inm, inm_link); inm_release_locked(inm); } igmp_ifdetach(ifp); IN_MULTI_UNLOCK(); } struct in_llentry { struct llentry base; struct sockaddr_in l3_addr4; }; /* * Deletes an address from the address table. * This function is called by the timer functions * such as arptimer() and nd6_llinfo_timer(), and * the caller does the locking. */ static void in_lltable_free(struct lltable *llt, struct llentry *lle) { LLE_WUNLOCK(lle); LLE_LOCK_DESTROY(lle); free(lle, M_LLTABLE); } static struct llentry * in_lltable_new(const struct sockaddr *l3addr, u_int flags) { struct in_llentry *lle; lle = malloc(sizeof(struct in_llentry), M_LLTABLE, M_NOWAIT | M_ZERO); if (lle == NULL) /* NB: caller generates msg */ return NULL; /* * For IPv4 this will trigger "arpresolve" to generate * an ARP request. */ lle->base.la_expire = time_uptime; /* mark expired */ lle->l3_addr4 = *(const struct sockaddr_in *)l3addr; lle->base.lle_refcnt = 1; lle->base.lle_free = in_lltable_free; LLE_LOCK_INIT(&lle->base); callout_init_rw(&lle->base.la_timer, &lle->base.lle_lock, CALLOUT_RETURNUNLOCKED); return (&lle->base); } #define IN_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \ (((ntohl((d)->sin_addr.s_addr) ^ (a)->sin_addr.s_addr) & (m)->sin_addr.s_addr)) == 0 ) static void in_lltable_prefix_free(struct lltable *llt, const struct sockaddr *prefix, const struct sockaddr *mask, u_int flags) { const struct sockaddr_in *pfx = (const struct sockaddr_in *)prefix; const struct sockaddr_in *msk = (const struct sockaddr_in *)mask; struct llentry *lle, *next; int i; size_t pkts_dropped; IF_AFDATA_WLOCK(llt->llt_ifp); for (i = 0; i < LLTBL_HASHTBL_SIZE; i++) { LIST_FOREACH_SAFE(lle, &llt->lle_head[i], lle_next, next) { /* * (flags & LLE_STATIC) means deleting all entries * including static ARP entries. */ if (IN_ARE_MASKED_ADDR_EQUAL(satosin(L3_ADDR(lle)), pfx, msk) && ((flags & LLE_STATIC) || !(lle->la_flags & LLE_STATIC))) { LLE_WLOCK(lle); if (callout_stop(&lle->la_timer)) LLE_REMREF(lle); pkts_dropped = llentry_free(lle); ARPSTAT_ADD(dropped, pkts_dropped); } } } IF_AFDATA_WUNLOCK(llt->llt_ifp); } static int in_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr) { struct rtentry *rt; KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); /* XXX rtalloc1_fib should take a const param */ rt = rtalloc1_fib(__DECONST(struct sockaddr *, l3addr), 0, 0, ifp->if_fib); if (rt == NULL) return (EINVAL); /* * If the gateway for an existing host route matches the target L3 * address, which is a special route inserted by some implementation * such as MANET, and the interface is of the correct type, then * allow for ARP to proceed. */ if (rt->rt_flags & RTF_GATEWAY) { if (!(rt->rt_flags & RTF_HOST) || !rt->rt_ifp || rt->rt_ifp->if_type != IFT_ETHER || (rt->rt_ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) != 0 || memcmp(rt->rt_gateway->sa_data, l3addr->sa_data, sizeof(in_addr_t)) != 0) { RTFREE_LOCKED(rt); return (EINVAL); } } /* * Make sure that at least the destination address is covered * by the route. This is for handling the case where 2 or more * interfaces have the same prefix. An incoming packet arrives * on one interface and the corresponding outgoing packet leaves * another interface. */ if (!(rt->rt_flags & RTF_HOST) && rt->rt_ifp != ifp) { const char *sa, *mask, *addr, *lim; int len; mask = (const char *)rt_mask(rt); /* * Just being extra cautious to avoid some custom * code getting into trouble. */ if (mask == NULL) { RTFREE_LOCKED(rt); return (EINVAL); } sa = (const char *)rt_key(rt); addr = (const char *)l3addr; len = ((const struct sockaddr_in *)l3addr)->sin_len; lim = addr + len; for ( ; addr < lim; sa++, mask++, addr++) { if ((*sa ^ *addr) & *mask) { #ifdef DIAGNOSTIC log(LOG_INFO, "IPv4 address: \"%s\" is not on the network\n", inet_ntoa(((const struct sockaddr_in *)l3addr)->sin_addr)); #endif RTFREE_LOCKED(rt); return (EINVAL); } } } RTFREE_LOCKED(rt); return (0); } /* * Return NULL if not found or marked for deletion. * If found return lle read locked. */ static struct llentry * in_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr; struct ifnet *ifp = llt->llt_ifp; struct llentry *lle; struct llentries *lleh; u_int hashkey; IF_AFDATA_LOCK_ASSERT(ifp); KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); hashkey = sin->sin_addr.s_addr; lleh = &llt->lle_head[LLATBL_HASH(hashkey, LLTBL_HASHMASK)]; LIST_FOREACH(lle, lleh, lle_next) { struct sockaddr_in *sa2 = satosin(L3_ADDR(lle)); if (lle->la_flags & LLE_DELETED) continue; if (sa2->sin_addr.s_addr == sin->sin_addr.s_addr) break; } if (lle == NULL) { #ifdef DIAGNOSTIC if (flags & LLE_DELETE) log(LOG_INFO, "interface address is missing from cache = %p in delete\n", lle); #endif if (!(flags & LLE_CREATE)) return (NULL); IF_AFDATA_WLOCK_ASSERT(ifp); /* * A route that covers the given address must have * been installed 1st because we are doing a resolution, * verify this. */ if (!(flags & LLE_IFADDR) && in_lltable_rtcheck(ifp, flags, l3addr) != 0) goto done; lle = in_lltable_new(l3addr, flags); if (lle == NULL) { log(LOG_INFO, "lla_lookup: new lle malloc failed\n"); goto done; } lle->la_flags = flags & ~LLE_CREATE; if ((flags & (LLE_CREATE | LLE_IFADDR)) == (LLE_CREATE | LLE_IFADDR)) { bcopy(IF_LLADDR(ifp), &lle->ll_addr, ifp->if_addrlen); lle->la_flags |= (LLE_VALID | LLE_STATIC); } lle->lle_tbl = llt; lle->lle_head = lleh; lle->la_flags |= LLE_LINKED; LIST_INSERT_HEAD(lleh, lle, lle_next); } else if (flags & LLE_DELETE) { if (!(lle->la_flags & LLE_IFADDR) || (flags & LLE_IFADDR)) { LLE_WLOCK(lle); lle->la_flags |= LLE_DELETED; EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED); #ifdef DIAGNOSTIC log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle); #endif if ((lle->la_flags & (LLE_STATIC | LLE_IFADDR)) == LLE_STATIC) llentry_free(lle); else LLE_WUNLOCK(lle); } lle = (void *)-1; } if (LLE_IS_VALID(lle)) { if (flags & LLE_EXCLUSIVE) LLE_WLOCK(lle); else LLE_RLOCK(lle); } done: return (lle); } static int in_lltable_dump(struct lltable *llt, struct sysctl_req *wr) { #define SIN(lle) ((struct sockaddr_in *) L3_ADDR(lle)) struct ifnet *ifp = llt->llt_ifp; struct llentry *lle; /* XXX stack use */ struct { struct rt_msghdr rtm; struct sockaddr_in sin; struct sockaddr_dl sdl; } arpc; int error, i; LLTABLE_LOCK_ASSERT(); error = 0; for (i = 0; i < LLTBL_HASHTBL_SIZE; i++) { LIST_FOREACH(lle, &llt->lle_head[i], lle_next) { struct sockaddr_dl *sdl; /* skip deleted entries */ if ((lle->la_flags & LLE_DELETED) == LLE_DELETED) continue; /* Skip if jailed and not a valid IP of the prison. */ if (prison_if(wr->td->td_ucred, L3_ADDR(lle)) != 0) continue; /* * produce a msg made of: * struct rt_msghdr; * struct sockaddr_in; (IPv4) * struct sockaddr_dl; */ bzero(&arpc, sizeof(arpc)); arpc.rtm.rtm_msglen = sizeof(arpc); arpc.rtm.rtm_version = RTM_VERSION; arpc.rtm.rtm_type = RTM_GET; arpc.rtm.rtm_flags = RTF_UP; arpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY; arpc.sin.sin_family = AF_INET; arpc.sin.sin_len = sizeof(arpc.sin); arpc.sin.sin_addr.s_addr = SIN(lle)->sin_addr.s_addr; /* publish */ if (lle->la_flags & LLE_PUB) arpc.rtm.rtm_flags |= RTF_ANNOUNCE; sdl = &arpc.sdl; sdl->sdl_family = AF_LINK; sdl->sdl_len = sizeof(*sdl); sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; if ((lle->la_flags & LLE_VALID) == LLE_VALID) { sdl->sdl_alen = ifp->if_addrlen; bcopy(&lle->ll_addr, LLADDR(sdl), ifp->if_addrlen); } else { sdl->sdl_alen = 0; bzero(LLADDR(sdl), ifp->if_addrlen); } arpc.rtm.rtm_rmx.rmx_expire = lle->la_flags & LLE_STATIC ? 0 : lle->la_expire; arpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA); if (lle->la_flags & LLE_STATIC) arpc.rtm.rtm_flags |= RTF_STATIC; arpc.rtm.rtm_index = ifp->if_index; error = SYSCTL_OUT(wr, &arpc, sizeof(arpc)); if (error) break; } } return error; #undef SIN } void * in_domifattach(struct ifnet *ifp) { struct in_ifinfo *ii; struct lltable *llt; ii = malloc(sizeof(struct in_ifinfo), M_IFADDR, M_WAITOK|M_ZERO); llt = lltable_init(ifp, AF_INET); if (llt != NULL) { llt->llt_prefix_free = in_lltable_prefix_free; llt->llt_lookup = in_lltable_lookup; llt->llt_dump = in_lltable_dump; } ii->ii_llt = llt; ii->ii_igmp = igmp_domifattach(ifp); return ii; } void in_domifdetach(struct ifnet *ifp, void *aux) { struct in_ifinfo *ii = (struct in_ifinfo *)aux; igmp_domifdetach(ifp); lltable_free(ii->ii_llt); free(ii, M_IFADDR); } Index: user/dchagin/lemul/sys/netinet/in_gif.c =================================================================== --- user/dchagin/lemul/sys/netinet/in_gif.c (revision 274240) +++ user/dchagin/lemul/sys/netinet/in_gif.c (revision 274241) @@ -1,240 +1,240 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in_gif.c,v 1.54 2001/05/14 14:02:16 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include static int gif_validate4(const struct ip *, struct gif_softc *, struct ifnet *); extern struct domain inetdomain; struct protosw in_gif_protosw = { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = 0/* IPPROTO_IPV[46] */, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = in_gif_input, .pr_output = rip_output, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }; VNET_DEFINE(int, ip_gif_ttl) = GIF_TTL; #define V_ip_gif_ttl VNET(ip_gif_ttl) -SYSCTL_VNET_INT(_net_inet_ip, IPCTL_GIF_TTL, gifttl, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_GIF_TTL, gifttl, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_gif_ttl), 0, ""); int in_gif_output(struct ifnet *ifp, struct mbuf *m, int proto, uint8_t ecn) { GIF_RLOCK_TRACKER; struct gif_softc *sc = ifp->if_softc; struct ip *ip; int len; /* prepend new IP header */ len = sizeof(struct ip); #ifndef __NO_STRICT_ALIGNMENT if (proto == IPPROTO_ETHERIP) len += ETHERIP_ALIGN; #endif M_PREPEND(m, len, M_NOWAIT); if (m == NULL) return (ENOBUFS); #ifndef __NO_STRICT_ALIGNMENT if (proto == IPPROTO_ETHERIP) { len = mtod(m, vm_offset_t) & 3; KASSERT(len == 0 || len == ETHERIP_ALIGN, ("in_gif_output: unexpected misalignment")); m->m_data += len; m->m_len -= ETHERIP_ALIGN; } #endif ip = mtod(m, struct ip *); GIF_RLOCK(sc); if (sc->gif_family != AF_INET) { m_freem(m); GIF_RUNLOCK(sc); return (ENETDOWN); } bcopy(sc->gif_iphdr, ip, sizeof(struct ip)); GIF_RUNLOCK(sc); ip->ip_p = proto; /* version will be set in ip_output() */ ip->ip_ttl = V_ip_gif_ttl; ip->ip_len = htons(m->m_pkthdr.len); ip->ip_tos = ecn; return (ip_output(m, NULL, NULL, 0, NULL, NULL)); } int in_gif_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct gif_softc *sc; struct ifnet *gifp; struct ip *ip; uint8_t ecn; sc = encap_getarg(m); if (sc == NULL) { m_freem(m); KMOD_IPSTAT_INC(ips_nogif); return (IPPROTO_DONE); } gifp = GIF2IFP(sc); if ((gifp->if_flags & IFF_UP) != 0) { ip = mtod(m, struct ip *); ecn = ip->ip_tos; m_adj(m, *offp); gif_input(m, gifp, proto, ecn); } else { m_freem(m); KMOD_IPSTAT_INC(ips_nogif); } return (IPPROTO_DONE); } /* * validate outer address. */ static int gif_validate4(const struct ip *ip, struct gif_softc *sc, struct ifnet *ifp) { GIF_RLOCK_ASSERT(sc); /* check for address match */ if (sc->gif_iphdr->ip_src.s_addr != ip->ip_dst.s_addr || sc->gif_iphdr->ip_dst.s_addr != ip->ip_src.s_addr) return (0); /* martian filters on outer source - NOT done in ip_input! */ if (IN_MULTICAST(ntohl(ip->ip_src.s_addr))) return (0); switch ((ntohl(ip->ip_src.s_addr) & 0xff000000) >> 24) { case 0: case 127: case 255: return (0); } /* ingress filters on outer source */ if ((GIF2IFP(sc)->if_flags & IFF_LINK2) == 0 && ifp) { struct sockaddr_in sin; struct rtentry *rt; bzero(&sin, sizeof(sin)); sin.sin_family = AF_INET; sin.sin_len = sizeof(struct sockaddr_in); sin.sin_addr = ip->ip_src; /* XXX MRT check for the interface we would use on output */ rt = in_rtalloc1((struct sockaddr *)&sin, 0, 0UL, sc->gif_fibnum); if (!rt || rt->rt_ifp != ifp) { if (rt) RTFREE_LOCKED(rt); return (0); } RTFREE_LOCKED(rt); } return (32 * 2); } /* * we know that we are in IFF_UP, outer address available, and outer family * matched the physical addr family. see gif_encapcheck(). */ int in_gif_encapcheck(const struct mbuf *m, int off, int proto, void *arg) { struct ip ip; struct gif_softc *sc; struct ifnet *ifp; /* sanity check done in caller */ sc = (struct gif_softc *)arg; GIF_RLOCK_ASSERT(sc); m_copydata(m, 0, sizeof(ip), (caddr_t)&ip); ifp = ((m->m_flags & M_PKTHDR) != 0) ? m->m_pkthdr.rcvif : NULL; return (gif_validate4(&ip, sc, ifp)); } int in_gif_attach(struct gif_softc *sc) { KASSERT(sc->gif_ecookie == NULL, ("gif_ecookie isn't NULL")); sc->gif_ecookie = encap_attach_func(AF_INET, -1, gif_encapcheck, &in_gif_protosw, sc); if (sc->gif_ecookie == NULL) return (EEXIST); return (0); } Index: user/dchagin/lemul/sys/netinet/in_pcb.c =================================================================== --- user/dchagin/lemul/sys/netinet/in_pcb.c (revision 274240) +++ user/dchagin/lemul/sys/netinet/in_pcb.c (revision 274241) @@ -1,2633 +1,2637 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993, 1995 * The Regents of the University of California. * Copyright (c) 2007-2009 Robert N. M. Watson * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ipsec.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_pcbgroup.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #include #include #endif #ifdef INET #include #endif #ifdef INET6 #include #include #include #include #endif /* INET6 */ #ifdef IPSEC #include #include #endif /* IPSEC */ #include static struct callout ipport_tick_callout; /* * These configure the range of local port addresses assigned to * "unspecified" outgoing connections/packets/whatever. */ VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */ VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */ VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */ VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */ VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */ VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */ /* * Reserved ports accessible only to root. There are significant * security considerations that must be accounted for when changing these, * but the security benefits can be great. Please be careful. */ VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */ VNET_DEFINE(int, ipport_reservedlow); /* Variables dealing with random ephemeral port allocation. */ VNET_DEFINE(int, ipport_randomized) = 1; /* user controlled via sysctl */ VNET_DEFINE(int, ipport_randomcps) = 10; /* user controlled via sysctl */ VNET_DEFINE(int, ipport_randomtime) = 45; /* user controlled via sysctl */ VNET_DEFINE(int, ipport_stoprandom); /* toggled by ipport_tick */ VNET_DEFINE(int, ipport_tcpallocs); static VNET_DEFINE(int, ipport_tcplastcount); #define V_ipport_tcplastcount VNET(ipport_tcplastcount) static void in_pcbremlists(struct inpcb *inp); #ifdef INET static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp); #define RANGECHK(var, min, max) \ if ((var) < (min)) { (var) = (min); } \ else if ((var) > (max)) { (var) = (max); } static int sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, arg1, arg2, req); if (error == 0) { RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); } return (error); } #undef RANGECHK static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports"); -SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, - CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lowfirstauto), 0, - &sysctl_net_ipport_check, "I", ""); -SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, - CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lowlastauto), 0, - &sysctl_net_ipport_check, "I", ""); -SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, first, - CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_firstauto), 0, - &sysctl_net_ipport_check, "I", ""); -SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, last, - CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lastauto), 0, - &sysctl_net_ipport_check, "I", ""); -SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, - CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_hifirstauto), 0, - &sysctl_net_ipport_check, "I", ""); -SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, - CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_hilastauto), 0, - &sysctl_net_ipport_check, "I", ""); -SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh, - CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedhigh), 0, ""); -SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow, +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, + &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", ""); +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, + &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", ""); +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, + &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", ""); +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, + &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", ""); +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, + &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", ""); +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, + &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", ""); +SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh, + CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, + &VNET_NAME(ipport_reservedhigh), 0, ""); +SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow, CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, ""); -SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomized, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipport_randomized), 0, "Enable random port allocation"); -SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomcps, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipport_randomcps), 0, "Maximum number of random port " "allocations before switching to a sequental one"); -SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipport_randomtime), 0, "Minimum time to keep sequental port " "allocation before switching to a random one"); #endif /* INET */ /* * in_pcb.c: manage the Protocol Control Blocks. * * NOTE: It is assumed that most of these functions will be called with * the pcbinfo lock held, and often, the inpcb lock held, as these utility * functions often modify hash chains or addresses in pcbs. */ /* * Initialize an inpcbinfo -- we should be able to reduce the number of * arguments in time. */ void in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name, struct inpcbhead *listhead, int hash_nelements, int porthash_nelements, char *inpcbzone_name, uma_init inpcbzone_init, uma_fini inpcbzone_fini, uint32_t inpcbzone_flags, u_int hashfields) { INP_INFO_LOCK_INIT(pcbinfo, name); INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash"); /* XXXRW: argument? */ #ifdef VIMAGE pcbinfo->ipi_vnet = curvnet; #endif pcbinfo->ipi_listhead = listhead; LIST_INIT(pcbinfo->ipi_listhead); pcbinfo->ipi_count = 0; pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB, &pcbinfo->ipi_hashmask); pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_porthashmask); #ifdef PCBGROUP in_pcbgroup_init(pcbinfo, hashfields, hash_nelements); #endif pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb), NULL, NULL, inpcbzone_init, inpcbzone_fini, UMA_ALIGN_PTR, inpcbzone_flags); uma_zone_set_max(pcbinfo->ipi_zone, maxsockets); uma_zone_set_warning(pcbinfo->ipi_zone, "kern.ipc.maxsockets limit reached"); } /* * Destroy an inpcbinfo. */ void in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) { KASSERT(pcbinfo->ipi_count == 0, ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count)); hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask); hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, pcbinfo->ipi_porthashmask); #ifdef PCBGROUP in_pcbgroup_destroy(pcbinfo); #endif uma_zdestroy(pcbinfo->ipi_zone); INP_HASH_LOCK_DESTROY(pcbinfo); INP_INFO_LOCK_DESTROY(pcbinfo); } /* * Allocate a PCB and associate it with the socket. * On success return with the PCB locked. */ int in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) { struct inpcb *inp; int error; INP_INFO_WLOCK_ASSERT(pcbinfo); error = 0; inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT); if (inp == NULL) return (ENOBUFS); bzero(inp, inp_zero_size); inp->inp_pcbinfo = pcbinfo; inp->inp_socket = so; inp->inp_cred = crhold(so->so_cred); inp->inp_inc.inc_fibnum = so->so_fibnum; #ifdef MAC error = mac_inpcb_init(inp, M_NOWAIT); if (error != 0) goto out; mac_inpcb_create(so, inp); #endif #ifdef IPSEC error = ipsec_init_policy(so, &inp->inp_sp); if (error != 0) { #ifdef MAC mac_inpcb_destroy(inp); #endif goto out; } #endif /*IPSEC*/ #ifdef INET6 if (INP_SOCKAF(so) == AF_INET6) { inp->inp_vflag |= INP_IPV6PROTO; if (V_ip6_v6only) inp->inp_flags |= IN6P_IPV6_V6ONLY; } #endif LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list); pcbinfo->ipi_count++; so->so_pcb = (caddr_t)inp; #ifdef INET6 if (V_ip6_auto_flowlabel) inp->inp_flags |= IN6P_AUTOFLOWLABEL; #endif INP_WLOCK(inp); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; refcount_init(&inp->inp_refcount, 1); /* Reference from inpcbinfo */ #if defined(IPSEC) || defined(MAC) out: if (error != 0) { crfree(inp->inp_cred); uma_zfree(pcbinfo->ipi_zone, inp); } #endif return (error); } #ifdef INET int in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { int anonport, error; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) return (EINVAL); anonport = nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0; error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr, &inp->inp_lport, cred); if (error) return (error); if (in_pcbinshash(inp) != 0) { inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; return (EAGAIN); } if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); } #endif /* * Select a local port (number) to use. */ #if defined(INET) || defined(INET6) int in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, struct ucred *cred, int lookupflags) { struct inpcbinfo *pcbinfo; struct inpcb *tmpinp; unsigned short *lastport; int count, dorandom, error; u_short aux, first, last, lport; #ifdef INET struct in_addr laddr; #endif pcbinfo = inp->inp_pcbinfo; /* * Because no actual state changes occur here, a global write lock on * the pcbinfo isn't required. */ INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(pcbinfo); if (inp->inp_flags & INP_HIGHPORT) { first = V_ipport_hifirstauto; /* sysctl */ last = V_ipport_hilastauto; lastport = &pcbinfo->ipi_lasthi; } else if (inp->inp_flags & INP_LOWPORT) { error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0); if (error) return (error); first = V_ipport_lowfirstauto; /* 1023 */ last = V_ipport_lowlastauto; /* 600 */ lastport = &pcbinfo->ipi_lastlow; } else { first = V_ipport_firstauto; /* sysctl */ last = V_ipport_lastauto; lastport = &pcbinfo->ipi_lastport; } /* * For UDP(-Lite), use random port allocation as long as the user * allows it. For TCP (and as of yet unknown) connections, * use random port allocation only if the user allows it AND * ipport_tick() allows it. */ if (V_ipport_randomized && (!V_ipport_stoprandom || pcbinfo == &V_udbinfo || pcbinfo == &V_ulitecbinfo)) dorandom = 1; else dorandom = 0; /* * It makes no sense to do random port allocation if * we have the only port available. */ if (first == last) dorandom = 0; /* Make sure to not include UDP(-Lite) packets in the count. */ if (pcbinfo != &V_udbinfo || pcbinfo != &V_ulitecbinfo) V_ipport_tcpallocs++; /* * Instead of having two loops further down counting up or down * make sure that first is always <= last and go with only one * code path implementing all logic. */ if (first > last) { aux = first; first = last; last = aux; } #ifdef INET /* Make the compiler happy. */ laddr.s_addr = 0; if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) { KASSERT(laddrp != NULL, ("%s: laddrp NULL for v4 inp %p", __func__, inp)); laddr = *laddrp; } #endif tmpinp = NULL; /* Make compiler happy. */ lport = *lportp; if (dorandom) *lastport = first + (arc4random() % (last - first)); count = last - first; do { if (count-- < 0) /* completely used? */ return (EADDRNOTAVAIL); ++*lastport; if (*lastport < first || *lastport > last) *lastport = first; lport = htons(*lastport); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) tmpinp = in6_pcblookup_local(pcbinfo, &inp->in6p_laddr, lport, lookupflags, cred); #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET tmpinp = in_pcblookup_local(pcbinfo, laddr, lport, lookupflags, cred); #endif } while (tmpinp != NULL); #ifdef INET if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) laddrp->s_addr = laddr.s_addr; #endif *lportp = lport; return (0); } /* * Return cached socket options. */ short inp_so_options(const struct inpcb *inp) { short so_options; so_options = 0; if ((inp->inp_flags2 & INP_REUSEPORT) != 0) so_options |= SO_REUSEPORT; if ((inp->inp_flags2 & INP_REUSEADDR) != 0) so_options |= SO_REUSEADDR; return (so_options); } #endif /* INET || INET6 */ /* * Check if a new BINDMULTI socket is allowed to be created. * * ni points to the new inp. * oi points to the exisitng inp. * * This checks whether the existing inp also has BINDMULTI and * whether the credentials match. */ int in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi) { /* Check permissions match */ if ((ni->inp_flags2 & INP_BINDMULTI) && (ni->inp_cred->cr_uid != oi->inp_cred->cr_uid)) return (0); /* Check the existing inp has BINDMULTI set */ if ((ni->inp_flags2 & INP_BINDMULTI) && ((oi->inp_flags2 & INP_BINDMULTI) == 0)) return (0); /* * We're okay - either INP_BINDMULTI isn't set on ni, or * it is and it matches the checks. */ return (1); } #ifdef INET /* * Set up a bind operation on a PCB, performing port allocation * as required, but do not actually modify the PCB. Callers can * either complete the bind by setting inp_laddr/inp_lport and * calling in_pcbinshash(), or they can just use the resulting * port and address to authorise the sending of a once-off packet. * * On error, the values of *laddrp and *lportp are not changed. */ int in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, u_short *lportp, struct ucred *cred) { struct socket *so = inp->inp_socket; struct sockaddr_in *sin; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct in_addr laddr; u_short lport = 0; int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT); int error; /* * No state changes, so read locks are sufficient here. */ INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(pcbinfo); if (TAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */ return (EADDRNOTAVAIL); laddr.s_addr = *laddrp; if (nam != NULL && laddr.s_addr != INADDR_ANY) return (EINVAL); if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) lookupflags = INPLOOKUP_WILDCARD; if (nam == NULL) { if ((error = prison_local_ip4(cred, &laddr)) != 0) return (error); } else { sin = (struct sockaddr_in *)nam; if (nam->sa_len != sizeof (*sin)) return (EINVAL); #ifdef notdef /* * We should check the family, but old programs * incorrectly fail to initialize it. */ if (sin->sin_family != AF_INET) return (EAFNOSUPPORT); #endif error = prison_local_ip4(cred, &sin->sin_addr); if (error) return (error); if (sin->sin_port != *lportp) { /* Don't allow the port to change. */ if (*lportp != 0) return (EINVAL); lport = sin->sin_port; } /* NB: lport is left as 0 if the port isn't being changed. */ if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { /* * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; * allow complete duplication of binding if * SO_REUSEPORT is set, or if SO_REUSEADDR is set * and a multicast address is bound on both * new and duplicated sockets. */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0) reuseport = SO_REUSEADDR|SO_REUSEPORT; } else if (sin->sin_addr.s_addr != INADDR_ANY) { sin->sin_port = 0; /* yech... */ bzero(&sin->sin_zero, sizeof(sin->sin_zero)); /* * Is the address a local IP address? * If INP_BINDANY is set, then the socket may be bound * to any endpoint address, local or not. */ if ((inp->inp_flags & INP_BINDANY) == 0 && ifa_ifwithaddr_check((struct sockaddr *)sin) == 0) return (EADDRNOTAVAIL); } laddr = sin->sin_addr; if (lport) { struct inpcb *t; struct tcptw *tw; /* GROSS */ if (ntohs(lport) <= V_ipport_reservedhigh && ntohs(lport) >= V_ipport_reservedlow && priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0)) return (EACCES); if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) && priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT, 0) != 0) { t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport, INPLOOKUP_WILDCARD, cred); /* * XXX * This entire block sorely needs a rewrite. */ if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) && ((t->inp_flags & INP_TIMEWAIT) == 0) && (so->so_type != SOCK_STREAM || ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || (t->inp_flags2 & INP_REUSEPORT) == 0) && (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) return (EADDRINUSE); /* * If the socket is a BINDMULTI socket, then * the credentials need to match and the * original socket also has to have been bound * with BINDMULTI. */ if (t && (! in_pcbbind_check_bindmulti(inp, t))) return (EADDRINUSE); } t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport, lookupflags, cred); if (t && (t->inp_flags & INP_TIMEWAIT)) { /* * XXXRW: If an incpb has had its timewait * state recycled, we treat the address as * being in use (for now). This is better * than a panic, but not desirable. */ tw = intotw(t); if (tw == NULL || (reuseport & tw->tw_so_options) == 0) return (EADDRINUSE); } else if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) && (reuseport & inp_so_options(t)) == 0) { #ifdef INET6 if (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || (inp->inp_vflag & INP_IPV6PROTO) == 0 || (t->inp_vflag & INP_IPV6PROTO) == 0) #endif return (EADDRINUSE); if (t && (! in_pcbbind_check_bindmulti(inp, t))) return (EADDRINUSE); } } } if (*lportp != 0) lport = *lportp; if (lport == 0) { error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags); if (error != 0) return (error); } *laddrp = laddr.s_addr; *lportp = lport; return (0); } /* * Connect from a socket to a specified address. * Both address and port must be specified in argument sin. * If don't have a local address for this socket yet, * then pick one. */ int in_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred, struct mbuf *m) { u_short lport, fport; in_addr_t laddr, faddr; int anonport, error; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); lport = inp->inp_lport; laddr = inp->inp_laddr.s_addr; anonport = (lport == 0); error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport, NULL, cred); if (error) return (error); /* Do the initial binding of the local address if required. */ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { inp->inp_lport = lport; inp->inp_laddr.s_addr = laddr; if (in_pcbinshash(inp) != 0) { inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; return (EAGAIN); } } /* Commit the remaining changes. */ inp->inp_lport = lport; inp->inp_laddr.s_addr = laddr; inp->inp_faddr.s_addr = faddr; inp->inp_fport = fport; in_pcbrehash_mbuf(inp, m); if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); } int in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { return (in_pcbconnect_mbuf(inp, nam, cred, NULL)); } /* * Do proper source address selection on an unbound socket in case * of connect. Take jails into account as well. */ int in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, struct ucred *cred) { struct ifaddr *ifa; struct sockaddr *sa; struct sockaddr_in *sin; struct route sro; int error; KASSERT(laddr != NULL, ("%s: laddr NULL", __func__)); /* * Bypass source address selection and use the primary jail IP * if requested. */ if (cred != NULL && !prison_saddrsel_ip4(cred, laddr)) return (0); error = 0; bzero(&sro, sizeof(sro)); sin = (struct sockaddr_in *)&sro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(struct sockaddr_in); sin->sin_addr.s_addr = faddr->s_addr; /* * If route is known our src addr is taken from the i/f, * else punt. * * Find out route to destination. */ if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) in_rtalloc_ign(&sro, 0, inp->inp_inc.inc_fibnum); /* * If we found a route, use the address corresponding to * the outgoing interface. * * Otherwise assume faddr is reachable on a directly connected * network and try to find a corresponding interface to take * the source address from. */ if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) { struct in_ifaddr *ia; struct ifnet *ifp; ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin, inp->inp_socket->so_fibnum)); if (ia == NULL) ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0, inp->inp_socket->so_fibnum)); if (ia == NULL) { error = ENETUNREACH; goto done; } if (cred == NULL || !prison_flag(cred, PR_IP4)) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; ifa_free(&ia->ia_ifa); goto done; } ifp = ia->ia_ifp; ifa_free(&ia->ia_ifa); ia = NULL; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; IF_ADDR_RUNLOCK(ifp); goto done; } IF_ADDR_RUNLOCK(ifp); /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } /* * If the outgoing interface on the route found is not * a loopback interface, use the address from that interface. * In case of jails do those three steps: * 1. check if the interface address belongs to the jail. If so use it. * 2. check if we have any address on the outgoing interface * belonging to this jail. If so use it. * 3. as a last resort return the 'default' jail address. */ if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) { struct in_ifaddr *ia; struct ifnet *ifp; /* If not jailed, use the default returned. */ if (cred == NULL || !prison_flag(cred, PR_IP4)) { ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa; laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* Jailed. */ /* 1. Check if the iface address belongs to the jail. */ sin = (struct sockaddr_in *)sro.ro_rt->rt_ifa->ifa_addr; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa; laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* * 2. Check if we have any address on the outgoing interface * belonging to this jail. */ ia = NULL; ifp = sro.ro_rt->rt_ifp; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; IF_ADDR_RUNLOCK(ifp); goto done; } IF_ADDR_RUNLOCK(ifp); /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } /* * The outgoing interface is marked with 'loopback net', so a route * to ourselves is here. * Try to find the interface of the destination address and then * take the address from there. That interface is not necessarily * a loopback interface. * In case of jails, check that it is an address of the jail * and if we cannot find, fall back to the 'default' jail address. */ if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) { struct sockaddr_in sain; struct in_ifaddr *ia; bzero(&sain, sizeof(struct sockaddr_in)); sain.sin_family = AF_INET; sain.sin_len = sizeof(struct sockaddr_in); sain.sin_addr.s_addr = faddr->s_addr; ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain), inp->inp_socket->so_fibnum)); if (ia == NULL) ia = ifatoia(ifa_ifwithnet(sintosa(&sain), 0, inp->inp_socket->so_fibnum)); if (ia == NULL) ia = ifatoia(ifa_ifwithaddr(sintosa(&sain))); if (cred == NULL || !prison_flag(cred, PR_IP4)) { if (ia == NULL) { error = ENETUNREACH; goto done; } laddr->s_addr = ia->ia_addr.sin_addr.s_addr; ifa_free(&ia->ia_ifa); goto done; } /* Jailed. */ if (ia != NULL) { struct ifnet *ifp; ifp = ia->ia_ifp; ifa_free(&ia->ia_ifa); ia = NULL; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; IF_ADDR_RUNLOCK(ifp); goto done; } IF_ADDR_RUNLOCK(ifp); } /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } done: if (sro.ro_rt != NULL) RTFREE(sro.ro_rt); return (error); } /* * Set up for a connect from a socket to the specified address. * On entry, *laddrp and *lportp should contain the current local * address and port for the PCB; these are updated to the values * that should be placed in inp_laddr and inp_lport to complete * the connect. * * On success, *faddrp and *fportp will be set to the remote address * and port. These are not updated in the error case. * * If the operation fails because the connection already exists, * *oinpp will be set to the PCB of that connection so that the * caller can decide to override it. In all other cases, *oinpp * is set to NULL. */ int in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp, struct inpcb **oinpp, struct ucred *cred) { struct sockaddr_in *sin = (struct sockaddr_in *)nam; struct in_ifaddr *ia; struct inpcb *oinp; struct in_addr laddr, faddr; u_short lport, fport; int error; /* * Because a global state change doesn't actually occur here, a read * lock is sufficient. */ INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo); if (oinpp != NULL) *oinpp = NULL; if (nam->sa_len != sizeof (*sin)) return (EINVAL); if (sin->sin_family != AF_INET) return (EAFNOSUPPORT); if (sin->sin_port == 0) return (EADDRNOTAVAIL); laddr.s_addr = *laddrp; lport = *lportp; faddr = sin->sin_addr; fport = sin->sin_port; if (!TAILQ_EMPTY(&V_in_ifaddrhead)) { /* * If the destination address is INADDR_ANY, * use the primary local address. * If the supplied address is INADDR_BROADCAST, * and the primary interface supports broadcast, * choose the broadcast address for that interface. */ if (faddr.s_addr == INADDR_ANY) { IN_IFADDR_RLOCK(); faddr = IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->sin_addr; IN_IFADDR_RUNLOCK(); if (cred != NULL && (error = prison_get_ip4(cred, &faddr)) != 0) return (error); } else if (faddr.s_addr == (u_long)INADDR_BROADCAST) { IN_IFADDR_RLOCK(); if (TAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags & IFF_BROADCAST) faddr = satosin(&TAILQ_FIRST( &V_in_ifaddrhead)->ia_broadaddr)->sin_addr; IN_IFADDR_RUNLOCK(); } } if (laddr.s_addr == INADDR_ANY) { error = in_pcbladdr(inp, &faddr, &laddr, cred); /* * If the destination address is multicast and an outgoing * interface has been set as a multicast option, prefer the * address of that interface as our source address. */ if (IN_MULTICAST(ntohl(faddr.s_addr)) && inp->inp_moptions != NULL) { struct ip_moptions *imo; struct ifnet *ifp; imo = inp->inp_moptions; if (imo->imo_multicast_ifp != NULL) { ifp = imo->imo_multicast_ifp; IN_IFADDR_RLOCK(); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if ((ia->ia_ifp == ifp) && (cred == NULL || prison_check_ip4(cred, &ia->ia_addr.sin_addr) == 0)) break; } if (ia == NULL) error = EADDRNOTAVAIL; else { laddr = ia->ia_addr.sin_addr; error = 0; } IN_IFADDR_RUNLOCK(); } } if (error) return (error); } oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, fport, laddr, lport, 0, NULL); if (oinp != NULL) { if (oinpp != NULL) *oinpp = oinp; return (EADDRINUSE); } if (lport == 0) { error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport, cred); if (error) return (error); } *laddrp = laddr.s_addr; *lportp = lport; *faddrp = faddr.s_addr; *fportp = fport; return (0); } void in_pcbdisconnect(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); inp->inp_faddr.s_addr = INADDR_ANY; inp->inp_fport = 0; in_pcbrehash(inp); } #endif /* INET */ /* * in_pcbdetach() is responsibe for disassociating a socket from an inpcb. * For most protocols, this will be invoked immediately prior to calling * in_pcbfree(). However, with TCP the inpcb may significantly outlive the * socket, in which case in_pcbfree() is deferred. */ void in_pcbdetach(struct inpcb *inp) { KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__)); inp->inp_socket->so_pcb = NULL; inp->inp_socket = NULL; } /* * in_pcbref() bumps the reference count on an inpcb in order to maintain * stability of an inpcb pointer despite the inpcb lock being released. This * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded, * but where the inpcb lock may already held, or when acquiring a reference * via a pcbgroup. * * in_pcbref() should be used only to provide brief memory stability, and * must always be followed by a call to INP_WLOCK() and in_pcbrele() to * garbage collect the inpcb if it has been in_pcbfree()'d from another * context. Until in_pcbrele() has returned that the inpcb is still valid, * lock and rele are the *only* safe operations that may be performed on the * inpcb. * * While the inpcb will not be freed, releasing the inpcb lock means that the * connection's state may change, so the caller should be careful to * revalidate any cached state on reacquiring the lock. Drop the reference * using in_pcbrele(). */ void in_pcbref(struct inpcb *inp) { KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); refcount_acquire(&inp->inp_refcount); } /* * Drop a refcount on an inpcb elevated using in_pcbref(); because a call to * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we * return a flag indicating whether or not the inpcb remains valid. If it is * valid, we return with the inpcb lock held. * * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a * reference on an inpcb. Historically more work was done here (actually, in * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the * need for the pcbinfo lock in in_pcbrele(). Deferring the free is entirely * about memory stability (and continued use of the write lock). */ int in_pcbrele_rlocked(struct inpcb *inp) { struct inpcbinfo *pcbinfo; KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); INP_RLOCK_ASSERT(inp); if (refcount_release(&inp->inp_refcount) == 0) { /* * If the inpcb has been freed, let the caller know, even if * this isn't the last reference. */ if (inp->inp_flags2 & INP_FREED) { INP_RUNLOCK(inp); return (1); } return (0); } KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); INP_RUNLOCK(inp); pcbinfo = inp->inp_pcbinfo; uma_zfree(pcbinfo->ipi_zone, inp); return (1); } int in_pcbrele_wlocked(struct inpcb *inp) { struct inpcbinfo *pcbinfo; KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); INP_WLOCK_ASSERT(inp); if (refcount_release(&inp->inp_refcount) == 0) return (0); KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); INP_WUNLOCK(inp); pcbinfo = inp->inp_pcbinfo; uma_zfree(pcbinfo->ipi_zone, inp); return (1); } /* * Temporary wrapper. */ int in_pcbrele(struct inpcb *inp) { return (in_pcbrele_wlocked(inp)); } /* * Unconditionally schedule an inpcb to be freed by decrementing its * reference count, which should occur only after the inpcb has been detached * from its socket. If another thread holds a temporary reference (acquired * using in_pcbref()) then the free is deferred until that reference is * released using in_pcbrele(), but the inpcb is still unlocked. Almost all * work, including removal from global lists, is done in this context, where * the pcbinfo lock is held. */ void in_pcbfree(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); INP_INFO_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); /* XXXRW: Do as much as possible here. */ #ifdef IPSEC if (inp->inp_sp != NULL) ipsec_delete_pcbpolicy(inp); #endif inp->inp_gencnt = ++pcbinfo->ipi_gencnt; in_pcbremlists(inp); #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) { ip6_freepcbopts(inp->in6p_outputopts); if (inp->in6p_moptions != NULL) ip6_freemoptions(inp->in6p_moptions); } #endif if (inp->inp_options) (void)m_free(inp->inp_options); #ifdef INET if (inp->inp_moptions != NULL) inp_freemoptions(inp->inp_moptions); #endif inp->inp_vflag = 0; inp->inp_flags2 |= INP_FREED; crfree(inp->inp_cred); #ifdef MAC mac_inpcb_destroy(inp); #endif if (!in_pcbrele_wlocked(inp)) INP_WUNLOCK(inp); } /* * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and * port reservation, and preventing it from being returned by inpcb lookups. * * It is used by TCP to mark an inpcb as unused and avoid future packet * delivery or event notification when a socket remains open but TCP has * closed. This might occur as a result of a shutdown()-initiated TCP close * or a RST on the wire, and allows the port binding to be reused while still * maintaining the invariant that so_pcb always points to a valid inpcb until * in_pcbdetach(). * * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by * in_pcbnotifyall() and in_pcbpurgeif0()? */ void in_pcbdrop(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); /* * XXXRW: Possibly we should protect the setting of INP_DROPPED with * the hash lock...? */ inp->inp_flags |= INP_DROPPED; if (inp->inp_flags & INP_INHASHLIST) { struct inpcbport *phd = inp->inp_phd; INP_HASH_WLOCK(inp->inp_pcbinfo); LIST_REMOVE(inp, inp_hash); LIST_REMOVE(inp, inp_portlist); if (LIST_FIRST(&phd->phd_pcblist) == NULL) { LIST_REMOVE(phd, phd_hash); free(phd, M_PCB); } INP_HASH_WUNLOCK(inp->inp_pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; #ifdef PCBGROUP in_pcbgroup_remove(inp); #endif } } #ifdef INET /* * Common routines to return the socket addresses associated with inpcbs. */ struct sockaddr * in_sockaddr(in_port_t port, struct in_addr *addr_p) { struct sockaddr_in *sin; sin = malloc(sizeof *sin, M_SONAME, M_WAITOK | M_ZERO); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = *addr_p; sin->sin_port = port; return (struct sockaddr *)sin; } int in_getsockaddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_lport; addr = inp->inp_laddr; INP_RUNLOCK(inp); *nam = in_sockaddr(port, &addr); return 0; } int in_getpeeraddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_fport; addr = inp->inp_faddr; INP_RUNLOCK(inp); *nam = in_sockaddr(port, &addr); return 0; } void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno, struct inpcb *(*notify)(struct inpcb *, int)) { struct inpcb *inp, *inp_temp; INP_INFO_WLOCK(pcbinfo); LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) { INP_WLOCK(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) { INP_WUNLOCK(inp); continue; } #endif if (inp->inp_faddr.s_addr != faddr.s_addr || inp->inp_socket == NULL) { INP_WUNLOCK(inp); continue; } if ((*notify)(inp, errno)) INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(pcbinfo); } void in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) { struct inpcb *inp; struct ip_moptions *imo; int i, gap; INP_INFO_RLOCK(pcbinfo); LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { INP_WLOCK(inp); imo = inp->inp_moptions; if ((inp->inp_vflag & INP_IPV4) && imo != NULL) { /* * Unselect the outgoing interface if it is being * detached. */ if (imo->imo_multicast_ifp == ifp) imo->imo_multicast_ifp = NULL; /* * Drop multicast group membership if we joined * through the interface being detached. */ for (i = 0, gap = 0; i < imo->imo_num_memberships; i++) { if (imo->imo_membership[i]->inm_ifp == ifp) { in_delmulti(imo->imo_membership[i]); gap++; } else if (gap != 0) imo->imo_membership[i - gap] = imo->imo_membership[i]; } imo->imo_num_memberships -= gap; } INP_WUNLOCK(inp); } INP_INFO_RUNLOCK(pcbinfo); } /* * Lookup a PCB based on the local address and port. Caller must hold the * hash lock. No inpcb locks or references are acquired. */ #define INP_LOOKUP_MAPPED_PCB_COST 3 struct inpcb * in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, u_short lport, int lookupflags, struct ucred *cred) { struct inpcb *inp; #ifdef INET6 int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST; #else int matchwild = 3; #endif int wildcard; KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); INP_HASH_LOCK_ASSERT(pcbinfo); if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { struct inpcbhead *head; /* * Look for an unconnected (wildcard foreign addr) PCB that * matches the local address and port we're looking for. */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == INADDR_ANY && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_lport == lport) { /* * Found? */ if (cred == NULL || prison_equal_ip4(cred->cr_prison, inp->inp_cred->cr_prison)) return (inp); } } /* * Not found. */ return (NULL); } else { struct inpcbporthead *porthash; struct inpcbport *phd; struct inpcb *match = NULL; /* * Best fit PCB lookup. * * First see if this local port is in use by looking on the * port hash list. */ porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, pcbinfo->ipi_porthashmask)]; LIST_FOREACH(phd, porthash, phd_hash) { if (phd->phd_port == lport) break; } if (phd != NULL) { /* * Port is in use by one or more PCBs. Look for best * fit. */ LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { wildcard = 0; if (cred != NULL && !prison_equal_ip4(inp->inp_cred->cr_prison, cred->cr_prison)) continue; #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; /* * We never select the PCB that has * INP_IPV6 flag and is bound to :: if * we have another PCB which is bound * to 0.0.0.0. If a PCB has the * INP_IPV6 flag, then we set its cost * higher than IPv4 only PCBs. * * Note that the case only happens * when a socket is bound to ::, under * the condition that the use of the * mapped address is allowed. */ if ((inp->inp_vflag & INP_IPV6) != 0) wildcard += INP_LOOKUP_MAPPED_PCB_COST; #endif if (inp->inp_faddr.s_addr != INADDR_ANY) wildcard++; if (inp->inp_laddr.s_addr != INADDR_ANY) { if (laddr.s_addr == INADDR_ANY) wildcard++; else if (inp->inp_laddr.s_addr != laddr.s_addr) continue; } else { if (laddr.s_addr != INADDR_ANY) wildcard++; } if (wildcard < matchwild) { match = inp; matchwild = wildcard; if (matchwild == 0) break; } } } return (match); } } #undef INP_LOOKUP_MAPPED_PCB_COST #ifdef PCBGROUP /* * Lookup PCB in hash list, using pcbgroup tables. */ static struct inpcb * in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; /* * First look for an exact match. */ tmpinp = NULL; INP_GROUP_LOCK(pcbgroup); head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbgroup->ipg_hashmask)]; LIST_FOREACH(inp, head, inp_pcbgrouphash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == faddr.s_addr && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_fport == fport && inp->inp_lport == lport) { /* * XXX We should be able to directly return * the inp here, without any checks. * Well unless both bound with SO_REUSEPORT? */ if (prison_flag(inp->inp_cred, PR_IP4)) goto found; if (tmpinp == NULL) tmpinp = inp; } } if (tmpinp != NULL) { inp = tmpinp; goto found; } #ifdef RSS /* * For incoming connections, we may wish to do a wildcard * match for an RSS-local socket. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; #endif struct inpcb *jail_wild = NULL; struct inpcbhead *head; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbgroup->ipg_hashmask)]; LIST_FOREACH(inp, head, inp_pcbgrouphash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) continue; /* XXX inp locking */ if (ifp && ifp->if_type == IFT_FAITH && (inp->inp_flags & INP_FAITH) == 0) continue; injail = prison_flag(inp->inp_cred, PR_IP4); if (injail) { if (prison_check_ip4(inp->inp_cred, &laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (inp->inp_laddr.s_addr == laddr.s_addr) { if (injail) goto found; else local_exact = inp; } else if (inp->inp_laddr.s_addr == INADDR_ANY) { #ifdef INET6 /* XXX inp locking, NULL check */ if (inp->inp_vflag & INP_IPV6PROTO) local_wild_mapped = inp; else #endif if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ inp = jail_wild; if (inp == NULL) inp = local_exact; if (inp == NULL) inp = local_wild; #ifdef INET6 if (inp == NULL) inp = local_wild_mapped; #endif if (inp != NULL) goto found; } #endif /* * Then look for a wildcard match, if requested. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; #endif struct inpcb *jail_wild = NULL; struct inpcbhead *head; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_wildmask)]; LIST_FOREACH(inp, head, inp_pcbgroup_wild) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) continue; /* XXX inp locking */ if (ifp && ifp->if_type == IFT_FAITH && (inp->inp_flags & INP_FAITH) == 0) continue; injail = prison_flag(inp->inp_cred, PR_IP4); if (injail) { if (prison_check_ip4(inp->inp_cred, &laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (inp->inp_laddr.s_addr == laddr.s_addr) { if (injail) goto found; else local_exact = inp; } else if (inp->inp_laddr.s_addr == INADDR_ANY) { #ifdef INET6 /* XXX inp locking, NULL check */ if (inp->inp_vflag & INP_IPV6PROTO) local_wild_mapped = inp; else #endif if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ inp = jail_wild; if (inp == NULL) inp = local_exact; if (inp == NULL) inp = local_wild; #ifdef INET6 if (inp == NULL) inp = local_wild_mapped; #endif if (inp != NULL) goto found; } /* if (lookupflags & INPLOOKUP_WILDCARD) */ INP_GROUP_UNLOCK(pcbgroup); return (NULL); found: in_pcbref(inp); INP_GROUP_UNLOCK(pcbgroup); if (lookupflags & INPLOOKUP_WLOCKPCB) { INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) return (NULL); } else if (lookupflags & INPLOOKUP_RLOCKPCB) { INP_RLOCK(inp); if (in_pcbrele_rlocked(inp)) return (NULL); } else panic("%s: locking bug", __func__); return (inp); } #endif /* PCBGROUP */ /* * Lookup PCB in hash list, using pcbinfo tables. This variation assumes * that the caller has locked the hash list, and will not perform any further * locking or reference operations on either the hash list or the connection. */ static struct inpcb * in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); INP_HASH_LOCK_ASSERT(pcbinfo); /* * First look for an exact match. */ tmpinp = NULL; head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == faddr.s_addr && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_fport == fport && inp->inp_lport == lport) { /* * XXX We should be able to directly return * the inp here, without any checks. * Well unless both bound with SO_REUSEPORT? */ if (prison_flag(inp->inp_cred, PR_IP4)) return (inp); if (tmpinp == NULL) tmpinp = inp; } } if (tmpinp != NULL) return (tmpinp); /* * Then look for a wildcard match, if requested. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; #endif struct inpcb *jail_wild = NULL; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_hashmask)]; LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) continue; /* XXX inp locking */ if (ifp && ifp->if_type == IFT_FAITH && (inp->inp_flags & INP_FAITH) == 0) continue; injail = prison_flag(inp->inp_cred, PR_IP4); if (injail) { if (prison_check_ip4(inp->inp_cred, &laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (inp->inp_laddr.s_addr == laddr.s_addr) { if (injail) return (inp); else local_exact = inp; } else if (inp->inp_laddr.s_addr == INADDR_ANY) { #ifdef INET6 /* XXX inp locking, NULL check */ if (inp->inp_vflag & INP_IPV6PROTO) local_wild_mapped = inp; else #endif if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ if (jail_wild != NULL) return (jail_wild); if (local_exact != NULL) return (local_exact); if (local_wild != NULL) return (local_wild); #ifdef INET6 if (local_wild_mapped != NULL) return (local_wild_mapped); #endif } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */ return (NULL); } /* * Lookup PCB in hash list, using pcbinfo tables. This variation locks the * hash list lock, and will return the inpcb locked (i.e., requires * INPLOOKUP_LOCKPCB). */ static struct inpcb * in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) { struct inpcb *inp; INP_HASH_RLOCK(pcbinfo); inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp); if (inp != NULL) { in_pcbref(inp); INP_HASH_RUNLOCK(pcbinfo); if (lookupflags & INPLOOKUP_WLOCKPCB) { INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) return (NULL); } else if (lookupflags & INPLOOKUP_RLOCKPCB) { INP_RLOCK(inp); if (in_pcbrele_rlocked(inp)) return (NULL); } else panic("%s: locking bug", __func__); } else INP_HASH_RUNLOCK(pcbinfo); return (inp); } /* * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf * from which a pre-calculated hash value may be extracted. * * Possibly more of this logic should be in in_pcbgroup.c. */ struct inpcb * in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) { #if defined(PCBGROUP) && !defined(RSS) struct inpcbgroup *pcbgroup; #endif KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); /* * When not using RSS, use connection groups in preference to the * reservation table when looking up 4-tuples. When using RSS, just * use the reservation table, due to the cost of the Toeplitz hash * in software. * * XXXRW: This policy belongs in the pcbgroup code, as in principle * we could be doing RSS with a non-Toeplitz hash that is affordable * in software. */ #if defined(PCBGROUP) && !defined(RSS) if (in_pcbgroup_enabled(pcbinfo)) { pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); } #endif return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp)); } struct inpcb * in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp, struct mbuf *m) { #ifdef PCBGROUP struct inpcbgroup *pcbgroup; #endif KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); #ifdef PCBGROUP /* * If we can use a hardware-generated hash to look up the connection * group, use that connection group to find the inpcb. Otherwise * fall back on a software hash -- or the reservation table if we're * using RSS. * * XXXRW: As above, that policy belongs in the pcbgroup code. */ if (in_pcbgroup_enabled(pcbinfo) && !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) { pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), m->m_pkthdr.flowid); if (pcbgroup != NULL) return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); #ifndef RSS pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); #endif } #endif return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp)); } #endif /* INET */ /* * Insert PCB onto various hash lists. */ static int in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update) { struct inpcbhead *pcbhash; struct inpcbporthead *pcbporthash; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbport *phd; u_int32_t hashkey_faddr; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); KASSERT((inp->inp_flags & INP_INHASHLIST) == 0, ("in_pcbinshash: INP_INHASHLIST")); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr); else #endif hashkey_faddr = inp->inp_faddr.s_addr; pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; pcbporthash = &pcbinfo->ipi_porthashbase[ INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; /* * Go through port list and look for a head for this lport. */ LIST_FOREACH(phd, pcbporthash, phd_hash) { if (phd->phd_port == inp->inp_lport) break; } /* * If none exists, malloc one and tack it on. */ if (phd == NULL) { phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT); if (phd == NULL) { return (ENOBUFS); /* XXX */ } phd->phd_port = inp->inp_lport; LIST_INIT(&phd->phd_pcblist); LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); } inp->inp_phd = phd; LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); LIST_INSERT_HEAD(pcbhash, inp, inp_hash); inp->inp_flags |= INP_INHASHLIST; #ifdef PCBGROUP if (do_pcbgroup_update) in_pcbgroup_update(inp); #endif return (0); } /* * For now, there are two public interfaces to insert an inpcb into the hash * lists -- one that does update pcbgroups, and one that doesn't. The latter * is used only in the TCP syncache, where in_pcbinshash is called before the * full 4-tuple is set for the inpcb, and we don't want to install in the * pcbgroup until later. * * XXXRW: This seems like a misfeature. in_pcbinshash should always update * connection groups, and partially initialised inpcbs should not be exposed * to either reservation hash tables or pcbgroups. */ int in_pcbinshash(struct inpcb *inp) { return (in_pcbinshash_internal(inp, 1)); } int in_pcbinshash_nopcbgroup(struct inpcb *inp) { return (in_pcbinshash_internal(inp, 0)); } /* * Move PCB to the proper hash bucket when { faddr, fport } have been * changed. NOTE: This does not handle the case of the lport changing (the * hashed port list would have to be updated as well), so the lport must * not change after in_pcbinshash() has been called. */ void in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbhead *head; u_int32_t hashkey_faddr; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); KASSERT(inp->inp_flags & INP_INHASHLIST, ("in_pcbrehash: !INP_INHASHLIST")); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr); else #endif hashkey_faddr = inp->inp_faddr.s_addr; head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; LIST_REMOVE(inp, inp_hash); LIST_INSERT_HEAD(head, inp, inp_hash); #ifdef PCBGROUP if (m != NULL) in_pcbgroup_update_mbuf(inp, m); else in_pcbgroup_update(inp); #endif } void in_pcbrehash(struct inpcb *inp) { in_pcbrehash_mbuf(inp, NULL); } /* * Remove PCB from various lists. */ static void in_pcbremlists(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; INP_INFO_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; if (inp->inp_flags & INP_INHASHLIST) { struct inpcbport *phd = inp->inp_phd; INP_HASH_WLOCK(pcbinfo); LIST_REMOVE(inp, inp_hash); LIST_REMOVE(inp, inp_portlist); if (LIST_FIRST(&phd->phd_pcblist) == NULL) { LIST_REMOVE(phd, phd_hash); free(phd, M_PCB); } INP_HASH_WUNLOCK(pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; } LIST_REMOVE(inp, inp_list); pcbinfo->ipi_count--; #ifdef PCBGROUP in_pcbgroup_remove(inp); #endif } /* * A set label operation has occurred at the socket layer, propagate the * label change into the in_pcb for the socket. */ void in_pcbsosetlabel(struct socket *so) { #ifdef MAC struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL")); INP_WLOCK(inp); SOCK_LOCK(so); mac_inpcb_sosetlabel(so, inp); SOCK_UNLOCK(so); INP_WUNLOCK(inp); #endif } /* * ipport_tick runs once per second, determining if random port allocation * should be continued. If more than ipport_randomcps ports have been * allocated in the last second, then we return to sequential port * allocation. We return to random allocation only once we drop below * ipport_randomcps for at least ipport_randomtime seconds. */ static void ipport_tick(void *xtp) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS here */ if (V_ipport_tcpallocs <= V_ipport_tcplastcount + V_ipport_randomcps) { if (V_ipport_stoprandom > 0) V_ipport_stoprandom--; } else V_ipport_stoprandom = V_ipport_randomtime; V_ipport_tcplastcount = V_ipport_tcpallocs; CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL); } static void ip_fini(void *xtp) { callout_stop(&ipport_tick_callout); } /* * The ipport_callout should start running at about the time we attach the * inet or inet6 domains. */ static void ipport_tick_init(const void *unused __unused) { /* Start ipport_tick. */ callout_init(&ipport_tick_callout, CALLOUT_MPSAFE); callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL); EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL, SHUTDOWN_PRI_DEFAULT); } SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ipport_tick_init, NULL); void inp_wlock(struct inpcb *inp) { INP_WLOCK(inp); } void inp_wunlock(struct inpcb *inp) { INP_WUNLOCK(inp); } void inp_rlock(struct inpcb *inp) { INP_RLOCK(inp); } void inp_runlock(struct inpcb *inp) { INP_RUNLOCK(inp); } #ifdef INVARIANTS void inp_lock_assert(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); } void inp_unlock_assert(struct inpcb *inp) { INP_UNLOCK_ASSERT(inp); } #endif void inp_apply_all(void (*func)(struct inpcb *, void *), void *arg) { struct inpcb *inp; INP_INFO_RLOCK(&V_tcbinfo); LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) { INP_WLOCK(inp); func(inp, arg); INP_WUNLOCK(inp); } INP_INFO_RUNLOCK(&V_tcbinfo); } struct socket * inp_inpcbtosocket(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); return (inp->inp_socket); } struct tcpcb * inp_inpcbtotcpcb(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); return ((struct tcpcb *)inp->inp_ppcb); } int inp_ip_tos_get(const struct inpcb *inp) { return (inp->inp_ip_tos); } void inp_ip_tos_set(struct inpcb *inp, int val) { inp->inp_ip_tos = val; } void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, uint32_t *faddr, uint16_t *fp) { INP_LOCK_ASSERT(inp); *laddr = inp->inp_laddr.s_addr; *faddr = inp->inp_faddr.s_addr; *lp = inp->inp_lport; *fp = inp->inp_fport; } struct inpcb * so_sotoinpcb(struct socket *so) { return (sotoinpcb(so)); } struct tcpcb * so_sototcpcb(struct socket *so) { return (sototcpcb(so)); } #ifdef DDB static void db_print_indent(int indent) { int i; for (i = 0; i < indent; i++) db_printf(" "); } static void db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent) { char faddr_str[48], laddr_str[48]; db_print_indent(indent); db_printf("%s at %p\n", name, inc); indent += 2; #ifdef INET6 if (inc->inc_flags & INC_ISIPV6) { /* IPv6. */ ip6_sprintf(laddr_str, &inc->inc6_laddr); ip6_sprintf(faddr_str, &inc->inc6_faddr); } else #endif { /* IPv4. */ inet_ntoa_r(inc->inc_laddr, laddr_str); inet_ntoa_r(inc->inc_faddr, faddr_str); } db_print_indent(indent); db_printf("inc_laddr %s inc_lport %u\n", laddr_str, ntohs(inc->inc_lport)); db_print_indent(indent); db_printf("inc_faddr %s inc_fport %u\n", faddr_str, ntohs(inc->inc_fport)); } static void db_print_inpflags(int inp_flags) { int comma; comma = 0; if (inp_flags & INP_RECVOPTS) { db_printf("%sINP_RECVOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVRETOPTS) { db_printf("%sINP_RECVRETOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVDSTADDR) { db_printf("%sINP_RECVDSTADDR", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_HDRINCL) { db_printf("%sINP_HDRINCL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_HIGHPORT) { db_printf("%sINP_HIGHPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_LOWPORT) { db_printf("%sINP_LOWPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ANONPORT) { db_printf("%sINP_ANONPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVIF) { db_printf("%sINP_RECVIF", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_MTUDISC) { db_printf("%sINP_MTUDISC", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_FAITH) { db_printf("%sINP_FAITH", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVTTL) { db_printf("%sINP_RECVTTL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_DONTFRAG) { db_printf("%sINP_DONTFRAG", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVTOS) { db_printf("%sINP_RECVTOS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_IPV6_V6ONLY) { db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_PKTINFO) { db_printf("%sIN6P_PKTINFO", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_HOPLIMIT) { db_printf("%sIN6P_HOPLIMIT", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_HOPOPTS) { db_printf("%sIN6P_HOPOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_DSTOPTS) { db_printf("%sIN6P_DSTOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RTHDR) { db_printf("%sIN6P_RTHDR", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RTHDRDSTOPTS) { db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_TCLASS) { db_printf("%sIN6P_TCLASS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_AUTOFLOWLABEL) { db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_TIMEWAIT) { db_printf("%sINP_TIMEWAIT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ONESBCAST) { db_printf("%sINP_ONESBCAST", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_DROPPED) { db_printf("%sINP_DROPPED", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_SOCKREF) { db_printf("%sINP_SOCKREF", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RFC2292) { db_printf("%sIN6P_RFC2292", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_MTU) { db_printf("IN6P_MTU%s", comma ? ", " : ""); comma = 1; } } static void db_print_inpvflag(u_char inp_vflag) { int comma; comma = 0; if (inp_vflag & INP_IPV4) { db_printf("%sINP_IPV4", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_IPV6) { db_printf("%sINP_IPV6", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_IPV6PROTO) { db_printf("%sINP_IPV6PROTO", comma ? ", " : ""); comma = 1; } } static void db_print_inpcb(struct inpcb *inp, const char *name, int indent) { db_print_indent(indent); db_printf("%s at %p\n", name, inp); indent += 2; db_print_indent(indent); db_printf("inp_flow: 0x%x\n", inp->inp_flow); db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent); db_print_indent(indent); db_printf("inp_ppcb: %p inp_pcbinfo: %p inp_socket: %p\n", inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket); db_print_indent(indent); db_printf("inp_label: %p inp_flags: 0x%x (", inp->inp_label, inp->inp_flags); db_print_inpflags(inp->inp_flags); db_printf(")\n"); db_print_indent(indent); db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp, inp->inp_vflag); db_print_inpvflag(inp->inp_vflag); db_printf(")\n"); db_print_indent(indent); db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n", inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl); db_print_indent(indent); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { db_printf("in6p_options: %p in6p_outputopts: %p " "in6p_moptions: %p\n", inp->in6p_options, inp->in6p_outputopts, inp->in6p_moptions); db_printf("in6p_icmp6filt: %p in6p_cksum %d " "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum, inp->in6p_hops); } else #endif { db_printf("inp_ip_tos: %d inp_ip_options: %p " "inp_ip_moptions: %p\n", inp->inp_ip_tos, inp->inp_options, inp->inp_moptions); } db_print_indent(indent); db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd, (uintmax_t)inp->inp_gencnt); } DB_SHOW_COMMAND(inpcb, db_show_inpcb) { struct inpcb *inp; if (!have_addr) { db_printf("usage: show inpcb \n"); return; } inp = (struct inpcb *)addr; db_print_inpcb(inp, "inpcb", 0); } #endif /* DDB */ Index: user/dchagin/lemul/sys/netinet/in_rmx.c =================================================================== --- user/dchagin/lemul/sys/netinet/in_rmx.c (revision 274240) +++ user/dchagin/lemul/sys/netinet/in_rmx.c (revision 274241) @@ -1,508 +1,508 @@ /*- * Copyright 1994, 1995 Massachusetts Institute of Technology * * Permission to use, copy, modify, and distribute this software and * its documentation for any purpose and without fee is hereby * granted, provided that both the above copyright notice and this * permission notice appear in all copies, that both the above * copyright notice and this permission notice appear in all * supporting documentation, and that the name of M.I.T. not be used * in advertising or publicity pertaining to distribution of the * software without specific, written prior permission. M.I.T. makes * no representations about the suitability of this software for any * purpose. It is provided "as is" without express or implied * warranty. * * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern int in_inithead(void **head, int off); #ifdef VIMAGE extern int in_detachhead(void **head, int off); #endif #define RTPRF_OURS RTF_PROTO3 /* set on routes we manage */ /* * Do what we need to do when inserting a route. */ static struct radix_node * in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, struct radix_node *treenodes) { struct rtentry *rt = (struct rtentry *)treenodes; struct sockaddr_in *sin = (struct sockaddr_in *)rt_key(rt); RADIX_NODE_HEAD_WLOCK_ASSERT(head); /* * A little bit of help for both IP output and input: * For host routes, we make sure that RTF_BROADCAST * is set for anything that looks like a broadcast address. * This way, we can avoid an expensive call to in_broadcast() * in ip_output() most of the time (because the route passed * to ip_output() is almost always a host route). * * We also do the same for local addresses, with the thought * that this might one day be used to speed up ip_input(). * * We also mark routes to multicast addresses as such, because * it's easy to do and might be useful (but this is much more * dubious since it's so easy to inspect the address). */ if (rt->rt_flags & RTF_HOST) { if (in_broadcast(sin->sin_addr, rt->rt_ifp)) { rt->rt_flags |= RTF_BROADCAST; } else if (satosin(rt->rt_ifa->ifa_addr)->sin_addr.s_addr == sin->sin_addr.s_addr) { rt->rt_flags |= RTF_LOCAL; } } if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) rt->rt_flags |= RTF_MULTICAST; if (rt->rt_ifp != NULL) { /* * Check route MTU: * inherit interface MTU if not set or * check if MTU is too large. */ if (rt->rt_mtu == 0) { rt->rt_mtu = rt->rt_ifp->if_mtu; } else if (rt->rt_mtu > rt->rt_ifp->if_mtu) rt->rt_mtu = rt->rt_ifp->if_mtu; } return (rn_addroute(v_arg, n_arg, head, treenodes)); } /* * This code is the inverse of in_clsroute: on first reference, if we * were managing the route, stop doing so and set the expiration timer * back off again. */ static struct radix_node * in_matroute(void *v_arg, struct radix_node_head *head) { struct radix_node *rn = rn_match(v_arg, head); struct rtentry *rt = (struct rtentry *)rn; if (rt) { RT_LOCK(rt); if (rt->rt_flags & RTPRF_OURS) { rt->rt_flags &= ~RTPRF_OURS; rt->rt_expire = 0; } RT_UNLOCK(rt); } return rn; } static VNET_DEFINE(int, rtq_reallyold) = 60*60; /* one hour is "really old" */ #define V_rtq_reallyold VNET(rtq_reallyold) -SYSCTL_VNET_INT(_net_inet_ip, IPCTL_RTEXPIRE, rtexpire, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_RTEXPIRE, rtexpire, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(rtq_reallyold), 0, "Default expiration time on dynamically learned routes"); /* never automatically crank down to less */ static VNET_DEFINE(int, rtq_minreallyold) = 10; #define V_rtq_minreallyold VNET(rtq_minreallyold) -SYSCTL_VNET_INT(_net_inet_ip, IPCTL_RTMINEXPIRE, rtminexpire, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_RTMINEXPIRE, rtminexpire, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(rtq_minreallyold), 0, "Minimum time to attempt to hold onto dynamically learned routes"); /* 128 cached routes is "too many" */ static VNET_DEFINE(int, rtq_toomany) = 128; #define V_rtq_toomany VNET(rtq_toomany) -SYSCTL_VNET_INT(_net_inet_ip, IPCTL_RTMAXCACHE, rtmaxcache, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_RTMAXCACHE, rtmaxcache, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(rtq_toomany), 0, "Upper limit on dynamically learned routes"); /* * On last reference drop, mark the route as belong to us so that it can be * timed out. */ static void in_clsroute(struct radix_node *rn, struct radix_node_head *head) { struct rtentry *rt = (struct rtentry *)rn; RT_LOCK_ASSERT(rt); if (!(rt->rt_flags & RTF_UP)) return; /* prophylactic measures */ if (rt->rt_flags & RTPRF_OURS) return; if (!(rt->rt_flags & RTF_DYNAMIC)) return; /* * If rtq_reallyold is 0, just delete the route without * waiting for a timeout cycle to kill it. */ if (V_rtq_reallyold != 0) { rt->rt_flags |= RTPRF_OURS; rt->rt_expire = time_uptime + V_rtq_reallyold; } else rt_expunge(head, rt); } struct rtqk_arg { struct radix_node_head *rnh; int draining; int killed; int found; int updating; time_t nextstop; }; /* * Get rid of old routes. When draining, this deletes everything, even when * the timeout is not expired yet. When updating, this makes sure that * nothing has a timeout longer than the current value of rtq_reallyold. */ static int in_rtqkill(struct radix_node *rn, void *rock) { struct rtqk_arg *ap = rock; struct rtentry *rt = (struct rtentry *)rn; int err; RADIX_NODE_HEAD_WLOCK_ASSERT(ap->rnh); if (rt->rt_flags & RTPRF_OURS) { ap->found++; if (ap->draining || rt->rt_expire <= time_uptime) { if (rt->rt_refcnt > 0) panic("rtqkill route really not free"); err = in_rtrequest(RTM_DELETE, (struct sockaddr *)rt_key(rt), rt->rt_gateway, rt_mask(rt), rt->rt_flags | RTF_RNH_LOCKED, 0, rt->rt_fibnum); if (err) { log(LOG_WARNING, "in_rtqkill: error %d\n", err); } else { ap->killed++; } } else { if (ap->updating && (rt->rt_expire - time_uptime > V_rtq_reallyold)) rt->rt_expire = time_uptime + V_rtq_reallyold; ap->nextstop = lmin(ap->nextstop, rt->rt_expire); } } return 0; } #define RTQ_TIMEOUT 60*10 /* run no less than once every ten minutes */ static VNET_DEFINE(int, rtq_timeout) = RTQ_TIMEOUT; static VNET_DEFINE(struct callout, rtq_timer); #define V_rtq_timeout VNET(rtq_timeout) #define V_rtq_timer VNET(rtq_timer) static void in_rtqtimo_one(void *rock); static void in_rtqtimo(void *rock) { CURVNET_SET((struct vnet *) rock); int fibnum; void *newrock; struct timeval atv; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { newrock = rt_tables_get_rnh(fibnum, AF_INET); if (newrock != NULL) in_rtqtimo_one(newrock); } atv.tv_usec = 0; atv.tv_sec = V_rtq_timeout; callout_reset(&V_rtq_timer, tvtohz(&atv), in_rtqtimo, rock); CURVNET_RESTORE(); } static void in_rtqtimo_one(void *rock) { struct radix_node_head *rnh = rock; struct rtqk_arg arg; static time_t last_adjusted_timeout = 0; arg.found = arg.killed = 0; arg.rnh = rnh; arg.nextstop = time_uptime + V_rtq_timeout; arg.draining = arg.updating = 0; RADIX_NODE_HEAD_LOCK(rnh); rnh->rnh_walktree(rnh, in_rtqkill, &arg); RADIX_NODE_HEAD_UNLOCK(rnh); /* * Attempt to be somewhat dynamic about this: * If there are ``too many'' routes sitting around taking up space, * then crank down the timeout, and see if we can't make some more * go away. However, we make sure that we will never adjust more * than once in rtq_timeout seconds, to keep from cranking down too * hard. */ if ((arg.found - arg.killed > V_rtq_toomany) && (time_uptime - last_adjusted_timeout >= V_rtq_timeout) && V_rtq_reallyold > V_rtq_minreallyold) { V_rtq_reallyold = 2 * V_rtq_reallyold / 3; if (V_rtq_reallyold < V_rtq_minreallyold) { V_rtq_reallyold = V_rtq_minreallyold; } last_adjusted_timeout = time_uptime; #ifdef DIAGNOSTIC log(LOG_DEBUG, "in_rtqtimo: adjusted rtq_reallyold to %d\n", V_rtq_reallyold); #endif arg.found = arg.killed = 0; arg.updating = 1; RADIX_NODE_HEAD_LOCK(rnh); rnh->rnh_walktree(rnh, in_rtqkill, &arg); RADIX_NODE_HEAD_UNLOCK(rnh); } } void in_rtqdrain(void) { VNET_ITERATOR_DECL(vnet_iter); struct radix_node_head *rnh; struct rtqk_arg arg; int fibnum; VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); for ( fibnum = 0; fibnum < rt_numfibs; fibnum++) { rnh = rt_tables_get_rnh(fibnum, AF_INET); arg.found = arg.killed = 0; arg.rnh = rnh; arg.nextstop = 0; arg.draining = 1; arg.updating = 0; RADIX_NODE_HEAD_LOCK(rnh); rnh->rnh_walktree(rnh, in_rtqkill, &arg); RADIX_NODE_HEAD_UNLOCK(rnh); } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } void in_setmatchfunc(struct radix_node_head *rnh, int val) { rnh->rnh_matchaddr = (val != 0) ? rn_match : in_matroute; } static int _in_rt_was_here; /* * Initialize our routing tree. */ int in_inithead(void **head, int off) { struct radix_node_head *rnh; if (!rn_inithead(head, 32)) return 0; rnh = *head; RADIX_NODE_HEAD_LOCK_INIT(rnh); rnh->rnh_addaddr = in_addroute; in_setmatchfunc(rnh, V_drop_redirect); rnh->rnh_close = in_clsroute; if (_in_rt_was_here == 0 ) { callout_init(&V_rtq_timer, CALLOUT_MPSAFE); callout_reset(&V_rtq_timer, 1, in_rtqtimo, curvnet); _in_rt_was_here = 1; } return 1; } #ifdef VIMAGE int in_detachhead(void **head, int off) { callout_drain(&V_rtq_timer); return (1); } #endif /* * This zaps old routes when the interface goes down or interface * address is deleted. In the latter case, it deletes static routes * that point to this address. If we don't do this, we may end up * using the old address in the future. The ones we always want to * get rid of are things like ARP entries, since the user might down * the interface, walk over to a completely different network, and * plug back in. */ struct in_ifadown_arg { struct radix_node_head *rnh; struct ifaddr *ifa; int del; }; static int in_ifadownkill(struct radix_node *rn, void *xap) { struct in_ifadown_arg *ap = xap; struct rtentry *rt = (struct rtentry *)rn; RT_LOCK(rt); if (rt->rt_ifa == ap->ifa && (ap->del || !(rt->rt_flags & RTF_STATIC))) { /* * Aquire a reference so that it can later be freed * as the refcount would be 0 here in case of at least * ap->del. */ RT_ADDREF(rt); /* * Disconnect it from the tree and permit protocols * to cleanup. */ rt_expunge(ap->rnh, rt); /* * At this point it is an rttrash node, and in case * the above is the only reference we must free it. * If we do not noone will have a pointer and the * rtentry will be leaked forever. * In case someone else holds a reference, we are * fine as we only decrement the refcount. In that * case if the other entity calls RT_REMREF, we * will still be leaking but at least we tried. */ RTFREE_LOCKED(rt); return (0); } RT_UNLOCK(rt); return 0; } void in_ifadown(struct ifaddr *ifa, int delete) { struct in_ifadown_arg arg; struct radix_node_head *rnh; int fibnum; KASSERT(ifa->ifa_addr->sa_family == AF_INET, ("%s: wrong family", __func__)); for ( fibnum = 0; fibnum < rt_numfibs; fibnum++) { rnh = rt_tables_get_rnh(fibnum, AF_INET); arg.rnh = rnh; arg.ifa = ifa; arg.del = delete; RADIX_NODE_HEAD_LOCK(rnh); rnh->rnh_walktree(rnh, in_ifadownkill, &arg); RADIX_NODE_HEAD_UNLOCK(rnh); ifa->ifa_flags &= ~IFA_ROUTE; /* XXXlocking? */ } } /* * inet versions of rt functions. These have fib extensions and * for now will just reference the _fib variants. * eventually this order will be reversed, */ void in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum) { rtalloc_ign_fib(ro, ignflags, fibnum); } int in_rtrequest( int req, struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct rtentry **ret_nrt, u_int fibnum) { return (rtrequest_fib(req, dst, gateway, netmask, flags, ret_nrt, fibnum)); } struct rtentry * in_rtalloc1(struct sockaddr *dst, int report, u_long ignflags, u_int fibnum) { return (rtalloc1_fib(dst, report, ignflags, fibnum)); } void in_rtredirect(struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *netmask, int flags, struct sockaddr *src, u_int fibnum) { rtredirect_fib(dst, gateway, netmask, flags, src, fibnum); } void in_rtalloc(struct route *ro, u_int fibnum) { rtalloc_ign_fib(ro, 0UL, fibnum); } #if 0 int in_rt_getifa(struct rt_addrinfo *, u_int fibnum); int in_rtioctl(u_long, caddr_t, u_int); int in_rtrequest1(int, struct rt_addrinfo *, struct rtentry **, u_int); #endif Index: user/dchagin/lemul/sys/netinet/ip_carp.c =================================================================== --- user/dchagin/lemul/sys/netinet/ip_carp.c (revision 274240) +++ user/dchagin/lemul/sys/netinet/ip_carp.c (revision 274241) @@ -1,2195 +1,2198 @@ /*- * Copyright (c) 2002 Michael Shalayeff. * Copyright (c) 2003 Ryan McBride. * Copyright (c) 2011 Gleb Smirnoff * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_bpf.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #endif #ifdef INET #include #include #endif #ifdef INET6 #include #include #include #include #include #include #endif #include static MALLOC_DEFINE(M_CARP, "CARP", "CARP addresses"); struct carp_softc { struct ifnet *sc_carpdev; /* Pointer to parent ifnet. */ struct ifaddr **sc_ifas; /* Our ifaddrs. */ struct sockaddr_dl sc_addr; /* Our link level address. */ struct callout sc_ad_tmo; /* Advertising timeout. */ #ifdef INET struct callout sc_md_tmo; /* Master down timeout. */ #endif #ifdef INET6 struct callout sc_md6_tmo; /* XXX: Master down timeout. */ #endif struct mtx sc_mtx; int sc_vhid; int sc_advskew; int sc_advbase; int sc_naddrs; int sc_naddrs6; int sc_ifasiz; enum { INIT = 0, BACKUP, MASTER } sc_state; int sc_suppress; int sc_sendad_errors; #define CARP_SENDAD_MAX_ERRORS 3 int sc_sendad_success; #define CARP_SENDAD_MIN_SUCCESS 3 int sc_init_counter; uint64_t sc_counter; /* authentication */ #define CARP_HMAC_PAD 64 unsigned char sc_key[CARP_KEY_LEN]; unsigned char sc_pad[CARP_HMAC_PAD]; SHA1_CTX sc_sha1; TAILQ_ENTRY(carp_softc) sc_list; /* On the carp_if list. */ LIST_ENTRY(carp_softc) sc_next; /* On the global list. */ }; struct carp_if { #ifdef INET int cif_naddrs; #endif #ifdef INET6 int cif_naddrs6; #endif TAILQ_HEAD(, carp_softc) cif_vrs; #ifdef INET struct ip_moptions cif_imo; #endif #ifdef INET6 struct ip6_moptions cif_im6o; #endif struct ifnet *cif_ifp; struct mtx cif_mtx; uint32_t cif_flags; #define CIF_PROMISC 0x00000001 }; #define CARP_INET 0 #define CARP_INET6 1 static int proto_reg[] = {-1, -1}; /* * Brief design of carp(4). * * Any carp-capable ifnet may have a list of carp softcs hanging off * its ifp->if_carp pointer. Each softc represents one unique virtual * host id, or vhid. The softc has a back pointer to the ifnet. All * softcs are joined in a global list, which has quite limited use. * * Any interface address that takes part in CARP negotiation has a * pointer to the softc of its vhid, ifa->ifa_carp. That could be either * AF_INET or AF_INET6 address. * * Although, one can get the softc's backpointer to ifnet and traverse * through its ifp->if_addrhead queue to find all interface addresses * involved in CARP, we keep a growable array of ifaddr pointers. This * allows us to avoid grabbing the IF_ADDR_LOCK() in many traversals that * do calls into the network stack, thus avoiding LORs. * * Locking: * * Each softc has a lock sc_mtx. It is used to synchronise carp_input_c(), * callout-driven events and ioctl()s. * * To traverse the list of softcs on an ifnet we use CIF_LOCK(), to * traverse the global list we use the mutex carp_mtx. * * Known issues with locking: * * - There is no protection for races between two ioctl() requests, * neither SIOCSVH, nor SIOCAIFADDR & SIOCAIFADDR_IN6. I think that all * interface ioctl()s should be serialized right in net/if.c. * - Sending ad, we put the pointer to the softc in an mtag, and no reference * counting is done on the softc. * - On module unload we may race (?) with packet processing thread * dereferencing our function pointers. */ /* Accept incoming CARP packets. */ static VNET_DEFINE(int, carp_allow) = 1; #define V_carp_allow VNET(carp_allow) /* Preempt slower nodes. */ static VNET_DEFINE(int, carp_preempt) = 0; #define V_carp_preempt VNET(carp_preempt) /* Log level. */ static VNET_DEFINE(int, carp_log) = 1; #define V_carp_log VNET(carp_log) /* Global advskew demotion. */ static VNET_DEFINE(int, carp_demotion) = 0; #define V_carp_demotion VNET(carp_demotion) /* Send error demotion factor. */ static VNET_DEFINE(int, carp_senderr_adj) = CARP_MAXSKEW; #define V_carp_senderr_adj VNET(carp_senderr_adj) /* Iface down demotion factor. */ static VNET_DEFINE(int, carp_ifdown_adj) = CARP_MAXSKEW; #define V_carp_ifdown_adj VNET(carp_ifdown_adj) static int carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS); SYSCTL_NODE(_net_inet, IPPROTO_CARP, carp, CTLFLAG_RW, 0, "CARP"); -SYSCTL_VNET_INT(_net_inet_carp, OID_AUTO, allow, CTLFLAG_RW, +SYSCTL_INT(_net_inet_carp, OID_AUTO, allow, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(carp_allow), 0, "Accept incoming CARP packets"); -SYSCTL_VNET_INT(_net_inet_carp, OID_AUTO, preempt, CTLFLAG_RW, +SYSCTL_INT(_net_inet_carp, OID_AUTO, preempt, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(carp_preempt), 0, "High-priority backup preemption mode"); -SYSCTL_VNET_INT(_net_inet_carp, OID_AUTO, log, CTLFLAG_RW, +SYSCTL_INT(_net_inet_carp, OID_AUTO, log, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(carp_log), 0, "CARP log level"); -SYSCTL_VNET_PROC(_net_inet_carp, OID_AUTO, demotion, CTLTYPE_INT|CTLFLAG_RW, +SYSCTL_PROC(_net_inet_carp, OID_AUTO, demotion, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, 0, 0, carp_demote_adj_sysctl, "I", "Adjust demotion factor (skew of advskew)"); -SYSCTL_VNET_INT(_net_inet_carp, OID_AUTO, senderr_demotion_factor, CTLFLAG_RW, +SYSCTL_INT(_net_inet_carp, OID_AUTO, senderr_demotion_factor, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(carp_senderr_adj), 0, "Send error demotion factor adjustment"); -SYSCTL_VNET_INT(_net_inet_carp, OID_AUTO, ifdown_demotion_factor, CTLFLAG_RW, +SYSCTL_INT(_net_inet_carp, OID_AUTO, ifdown_demotion_factor, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(carp_ifdown_adj), 0, "Interface down demotion factor adjustment"); VNET_PCPUSTAT_DEFINE(struct carpstats, carpstats); VNET_PCPUSTAT_SYSINIT(carpstats); VNET_PCPUSTAT_SYSUNINIT(carpstats); #define CARPSTATS_ADD(name, val) \ counter_u64_add(VNET(carpstats)[offsetof(struct carpstats, name) / \ sizeof(uint64_t)], (val)) #define CARPSTATS_INC(name) CARPSTATS_ADD(name, 1) SYSCTL_VNET_PCPUSTAT(_net_inet_carp, OID_AUTO, stats, struct carpstats, carpstats, "CARP statistics (struct carpstats, netinet/ip_carp.h)"); #define CARP_LOCK_INIT(sc) mtx_init(&(sc)->sc_mtx, "carp_softc", \ NULL, MTX_DEF) #define CARP_LOCK_DESTROY(sc) mtx_destroy(&(sc)->sc_mtx) #define CARP_LOCK_ASSERT(sc) mtx_assert(&(sc)->sc_mtx, MA_OWNED) #define CARP_LOCK(sc) mtx_lock(&(sc)->sc_mtx) #define CARP_UNLOCK(sc) mtx_unlock(&(sc)->sc_mtx) #define CIF_LOCK_INIT(cif) mtx_init(&(cif)->cif_mtx, "carp_if", \ NULL, MTX_DEF) #define CIF_LOCK_DESTROY(cif) mtx_destroy(&(cif)->cif_mtx) #define CIF_LOCK_ASSERT(cif) mtx_assert(&(cif)->cif_mtx, MA_OWNED) #define CIF_LOCK(cif) mtx_lock(&(cif)->cif_mtx) #define CIF_UNLOCK(cif) mtx_unlock(&(cif)->cif_mtx) #define CIF_FREE(cif) do { \ CIF_LOCK_ASSERT(cif); \ if (TAILQ_EMPTY(&(cif)->cif_vrs)) \ carp_free_if(cif); \ else \ CIF_UNLOCK(cif); \ } while (0) #define CARP_LOG(...) do { \ if (V_carp_log > 0) \ log(LOG_INFO, "carp: " __VA_ARGS__); \ } while (0) #define CARP_DEBUG(...) do { \ if (V_carp_log > 1) \ log(LOG_DEBUG, __VA_ARGS__); \ } while (0) #define IFNET_FOREACH_IFA(ifp, ifa) \ IF_ADDR_LOCK_ASSERT(ifp); \ TAILQ_FOREACH((ifa), &(ifp)->if_addrhead, ifa_link) \ if ((ifa)->ifa_carp != NULL) #define CARP_FOREACH_IFA(sc, ifa) \ CARP_LOCK_ASSERT(sc); \ for (int _i = 0; \ _i < (sc)->sc_naddrs + (sc)->sc_naddrs6 && \ ((ifa) = sc->sc_ifas[_i]) != NULL; \ ++_i) #define IFNET_FOREACH_CARP(ifp, sc) \ CIF_LOCK_ASSERT(ifp->if_carp); \ TAILQ_FOREACH((sc), &(ifp)->if_carp->cif_vrs, sc_list) #define DEMOTE_ADVSKEW(sc) \ (((sc)->sc_advskew + V_carp_demotion > CARP_MAXSKEW) ? \ CARP_MAXSKEW : ((sc)->sc_advskew + V_carp_demotion)) static void carp_input_c(struct mbuf *, struct carp_header *, sa_family_t); static struct carp_softc *carp_alloc(struct ifnet *); static void carp_detach_locked(struct ifaddr *); static void carp_destroy(struct carp_softc *); static struct carp_if *carp_alloc_if(struct ifnet *); static void carp_free_if(struct carp_if *); static void carp_set_state(struct carp_softc *, int); static void carp_sc_state(struct carp_softc *); static void carp_setrun(struct carp_softc *, sa_family_t); static void carp_master_down(void *); static void carp_master_down_locked(struct carp_softc *); static void carp_send_ad(void *); static void carp_send_ad_locked(struct carp_softc *); static void carp_addroute(struct carp_softc *); static void carp_ifa_addroute(struct ifaddr *); static void carp_delroute(struct carp_softc *); static void carp_ifa_delroute(struct ifaddr *); static void carp_send_ad_all(void *, int); static void carp_demote_adj(int, char *); static LIST_HEAD(, carp_softc) carp_list; static struct mtx carp_mtx; static struct task carp_sendall_task = TASK_INITIALIZER(0, carp_send_ad_all, NULL); static void carp_hmac_prepare(struct carp_softc *sc) { uint8_t version = CARP_VERSION, type = CARP_ADVERTISEMENT; uint8_t vhid = sc->sc_vhid & 0xff; struct ifaddr *ifa; int i, found; #ifdef INET struct in_addr last, cur, in; #endif #ifdef INET6 struct in6_addr last6, cur6, in6; #endif CARP_LOCK_ASSERT(sc); /* Compute ipad from key. */ bzero(sc->sc_pad, sizeof(sc->sc_pad)); bcopy(sc->sc_key, sc->sc_pad, sizeof(sc->sc_key)); for (i = 0; i < sizeof(sc->sc_pad); i++) sc->sc_pad[i] ^= 0x36; /* Precompute first part of inner hash. */ SHA1Init(&sc->sc_sha1); SHA1Update(&sc->sc_sha1, sc->sc_pad, sizeof(sc->sc_pad)); SHA1Update(&sc->sc_sha1, (void *)&version, sizeof(version)); SHA1Update(&sc->sc_sha1, (void *)&type, sizeof(type)); SHA1Update(&sc->sc_sha1, (void *)&vhid, sizeof(vhid)); #ifdef INET cur.s_addr = 0; do { found = 0; last = cur; cur.s_addr = 0xffffffff; CARP_FOREACH_IFA(sc, ifa) { in.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr; if (ifa->ifa_addr->sa_family == AF_INET && ntohl(in.s_addr) > ntohl(last.s_addr) && ntohl(in.s_addr) < ntohl(cur.s_addr)) { cur.s_addr = in.s_addr; found++; } } if (found) SHA1Update(&sc->sc_sha1, (void *)&cur, sizeof(cur)); } while (found); #endif /* INET */ #ifdef INET6 memset(&cur6, 0, sizeof(cur6)); do { found = 0; last6 = cur6; memset(&cur6, 0xff, sizeof(cur6)); CARP_FOREACH_IFA(sc, ifa) { in6 = ifatoia6(ifa)->ia_addr.sin6_addr; if (IN6_IS_SCOPE_EMBED(&in6)) in6.s6_addr16[1] = 0; if (ifa->ifa_addr->sa_family == AF_INET6 && memcmp(&in6, &last6, sizeof(in6)) > 0 && memcmp(&in6, &cur6, sizeof(in6)) < 0) { cur6 = in6; found++; } } if (found) SHA1Update(&sc->sc_sha1, (void *)&cur6, sizeof(cur6)); } while (found); #endif /* INET6 */ /* convert ipad to opad */ for (i = 0; i < sizeof(sc->sc_pad); i++) sc->sc_pad[i] ^= 0x36 ^ 0x5c; } static void carp_hmac_generate(struct carp_softc *sc, uint32_t counter[2], unsigned char md[20]) { SHA1_CTX sha1ctx; CARP_LOCK_ASSERT(sc); /* fetch first half of inner hash */ bcopy(&sc->sc_sha1, &sha1ctx, sizeof(sha1ctx)); SHA1Update(&sha1ctx, (void *)counter, sizeof(sc->sc_counter)); SHA1Final(md, &sha1ctx); /* outer hash */ SHA1Init(&sha1ctx); SHA1Update(&sha1ctx, sc->sc_pad, sizeof(sc->sc_pad)); SHA1Update(&sha1ctx, md, 20); SHA1Final(md, &sha1ctx); } static int carp_hmac_verify(struct carp_softc *sc, uint32_t counter[2], unsigned char md[20]) { unsigned char md2[20]; CARP_LOCK_ASSERT(sc); carp_hmac_generate(sc, counter, md2); return (bcmp(md, md2, sizeof(md2))); } /* * process input packet. * we have rearranged checks order compared to the rfc, * but it seems more efficient this way or not possible otherwise. */ #ifdef INET int carp_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct ip *ip = mtod(m, struct ip *); struct carp_header *ch; int iplen, len; iplen = *offp; *mp = NULL; CARPSTATS_INC(carps_ipackets); if (!V_carp_allow) { m_freem(m); return (IPPROTO_DONE); } /* verify that the IP TTL is 255. */ if (ip->ip_ttl != CARP_DFLTTL) { CARPSTATS_INC(carps_badttl); CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__, ip->ip_ttl, m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } iplen = ip->ip_hl << 2; if (m->m_pkthdr.len < iplen + sizeof(*ch)) { CARPSTATS_INC(carps_badlen); CARP_DEBUG("%s: received len %zd < sizeof(struct carp_header) " "on %s\n", __func__, m->m_len - sizeof(struct ip), m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } if (iplen + sizeof(*ch) < m->m_len) { if ((m = m_pullup(m, iplen + sizeof(*ch))) == NULL) { CARPSTATS_INC(carps_hdrops); CARP_DEBUG("%s: pullup failed\n", __func__); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); } ch = (struct carp_header *)((char *)ip + iplen); /* * verify that the received packet length is * equal to the CARP header */ len = iplen + sizeof(*ch); if (len > m->m_pkthdr.len) { CARPSTATS_INC(carps_badlen); CARP_DEBUG("%s: packet too short %d on %s\n", __func__, m->m_pkthdr.len, m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } if ((m = m_pullup(m, len)) == NULL) { CARPSTATS_INC(carps_hdrops); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); ch = (struct carp_header *)((char *)ip + iplen); /* verify the CARP checksum */ m->m_data += iplen; if (in_cksum(m, len - iplen)) { CARPSTATS_INC(carps_badsum); CARP_DEBUG("%s: checksum failed on %s\n", __func__, m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } m->m_data -= iplen; carp_input_c(m, ch, AF_INET); return (IPPROTO_DONE); } #endif #ifdef INET6 int carp6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct carp_header *ch; u_int len; CARPSTATS_INC(carps_ipackets6); if (!V_carp_allow) { m_freem(m); return (IPPROTO_DONE); } /* check if received on a valid carp interface */ if (m->m_pkthdr.rcvif->if_carp == NULL) { CARPSTATS_INC(carps_badif); CARP_DEBUG("%s: packet received on non-carp interface: %s\n", __func__, m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } /* verify that the IP TTL is 255 */ if (ip6->ip6_hlim != CARP_DFLTTL) { CARPSTATS_INC(carps_badttl); CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__, ip6->ip6_hlim, m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } /* verify that we have a complete carp packet */ len = m->m_len; IP6_EXTHDR_GET(ch, struct carp_header *, m, *offp, sizeof(*ch)); if (ch == NULL) { CARPSTATS_INC(carps_badlen); CARP_DEBUG("%s: packet size %u too small\n", __func__, len); return (IPPROTO_DONE); } /* verify the CARP checksum */ m->m_data += *offp; if (in_cksum(m, sizeof(*ch))) { CARPSTATS_INC(carps_badsum); CARP_DEBUG("%s: checksum failed, on %s\n", __func__, m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } m->m_data -= *offp; carp_input_c(m, ch, AF_INET6); return (IPPROTO_DONE); } #endif /* INET6 */ static void carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af) { struct ifnet *ifp = m->m_pkthdr.rcvif; struct ifaddr *ifa; struct carp_softc *sc; uint64_t tmp_counter; struct timeval sc_tv, ch_tv; /* verify that the VHID is valid on the receiving interface */ IF_ADDR_RLOCK(ifp); IFNET_FOREACH_IFA(ifp, ifa) if (ifa->ifa_addr->sa_family == af && ifa->ifa_carp->sc_vhid == ch->carp_vhid) { ifa_ref(ifa); break; } IF_ADDR_RUNLOCK(ifp); if (ifa == NULL) { CARPSTATS_INC(carps_badvhid); m_freem(m); return; } /* verify the CARP version. */ if (ch->carp_version != CARP_VERSION) { CARPSTATS_INC(carps_badver); CARP_DEBUG("%s: invalid version %d\n", ifp->if_xname, ch->carp_version); ifa_free(ifa); m_freem(m); return; } sc = ifa->ifa_carp; CARP_LOCK(sc); ifa_free(ifa); if (carp_hmac_verify(sc, ch->carp_counter, ch->carp_md)) { CARPSTATS_INC(carps_badauth); CARP_DEBUG("%s: incorrect hash for VHID %u@%s\n", __func__, sc->sc_vhid, ifp->if_xname); goto out; } tmp_counter = ntohl(ch->carp_counter[0]); tmp_counter = tmp_counter<<32; tmp_counter += ntohl(ch->carp_counter[1]); /* XXX Replay protection goes here */ sc->sc_init_counter = 0; sc->sc_counter = tmp_counter; sc_tv.tv_sec = sc->sc_advbase; sc_tv.tv_usec = DEMOTE_ADVSKEW(sc) * 1000000 / 256; ch_tv.tv_sec = ch->carp_advbase; ch_tv.tv_usec = ch->carp_advskew * 1000000 / 256; switch (sc->sc_state) { case INIT: break; case MASTER: /* * If we receive an advertisement from a master who's going to * be more frequent than us, go into BACKUP state. */ if (timevalcmp(&sc_tv, &ch_tv, >) || timevalcmp(&sc_tv, &ch_tv, ==)) { callout_stop(&sc->sc_ad_tmo); CARP_LOG("VHID %u@%s: MASTER -> BACKUP " "(more frequent advertisement received)\n", sc->sc_vhid, sc->sc_carpdev->if_xname); carp_set_state(sc, BACKUP); carp_setrun(sc, 0); carp_delroute(sc); } break; case BACKUP: /* * If we're pre-empting masters who advertise slower than us, * and this one claims to be slower, treat him as down. */ if (V_carp_preempt && timevalcmp(&sc_tv, &ch_tv, <)) { CARP_LOG("VHID %u@%s: BACKUP -> MASTER " "(preempting a slower master)\n", sc->sc_vhid, sc->sc_carpdev->if_xname); carp_master_down_locked(sc); break; } /* * If the master is going to advertise at such a low frequency * that he's guaranteed to time out, we'd might as well just * treat him as timed out now. */ sc_tv.tv_sec = sc->sc_advbase * 3; if (timevalcmp(&sc_tv, &ch_tv, <)) { CARP_LOG("VHID %u@%s: BACKUP -> MASTER " "(master timed out)\n", sc->sc_vhid, sc->sc_carpdev->if_xname); carp_master_down_locked(sc); break; } /* * Otherwise, we reset the counter and wait for the next * advertisement. */ carp_setrun(sc, af); break; } out: CARP_UNLOCK(sc); m_freem(m); } static int carp_prepare_ad(struct mbuf *m, struct carp_softc *sc, struct carp_header *ch) { struct m_tag *mtag; if (sc->sc_init_counter) { /* this could also be seconds since unix epoch */ sc->sc_counter = arc4random(); sc->sc_counter = sc->sc_counter << 32; sc->sc_counter += arc4random(); } else sc->sc_counter++; ch->carp_counter[0] = htonl((sc->sc_counter>>32)&0xffffffff); ch->carp_counter[1] = htonl(sc->sc_counter&0xffffffff); carp_hmac_generate(sc, ch->carp_counter, ch->carp_md); /* Tag packet for carp_output */ if ((mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct carp_softc *), M_NOWAIT)) == NULL) { m_freem(m); CARPSTATS_INC(carps_onomem); return (ENOMEM); } bcopy(&sc, mtag + 1, sizeof(sc)); m_tag_prepend(m, mtag); return (0); } /* * To avoid LORs and possible recursions this function shouldn't * be called directly, but scheduled via taskqueue. */ static void carp_send_ad_all(void *ctx __unused, int pending __unused) { struct carp_softc *sc; mtx_lock(&carp_mtx); LIST_FOREACH(sc, &carp_list, sc_next) if (sc->sc_state == MASTER) { CARP_LOCK(sc); CURVNET_SET(sc->sc_carpdev->if_vnet); carp_send_ad_locked(sc); CURVNET_RESTORE(); CARP_UNLOCK(sc); } mtx_unlock(&carp_mtx); } /* Send a periodic advertisement, executed in callout context. */ static void carp_send_ad(void *v) { struct carp_softc *sc = v; CARP_LOCK_ASSERT(sc); CURVNET_SET(sc->sc_carpdev->if_vnet); carp_send_ad_locked(sc); CURVNET_RESTORE(); CARP_UNLOCK(sc); } static void carp_send_ad_error(struct carp_softc *sc, int error) { if (error) { if (sc->sc_sendad_errors < INT_MAX) sc->sc_sendad_errors++; if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) { static const char fmt[] = "send error %d on %s"; char msg[sizeof(fmt) + IFNAMSIZ]; sprintf(msg, fmt, error, sc->sc_carpdev->if_xname); carp_demote_adj(V_carp_senderr_adj, msg); } sc->sc_sendad_success = 0; } else { if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS && ++sc->sc_sendad_success >= CARP_SENDAD_MIN_SUCCESS) { static const char fmt[] = "send ok on %s"; char msg[sizeof(fmt) + IFNAMSIZ]; sprintf(msg, fmt, sc->sc_carpdev->if_xname); carp_demote_adj(-V_carp_senderr_adj, msg); sc->sc_sendad_errors = 0; } else sc->sc_sendad_errors = 0; } } static void carp_send_ad_locked(struct carp_softc *sc) { struct carp_header ch; struct timeval tv; struct sockaddr sa; struct ifaddr *ifa; struct carp_header *ch_ptr; struct mbuf *m; int len, advskew; CARP_LOCK_ASSERT(sc); advskew = DEMOTE_ADVSKEW(sc); tv.tv_sec = sc->sc_advbase; tv.tv_usec = advskew * 1000000 / 256; ch.carp_version = CARP_VERSION; ch.carp_type = CARP_ADVERTISEMENT; ch.carp_vhid = sc->sc_vhid; ch.carp_advbase = sc->sc_advbase; ch.carp_advskew = advskew; ch.carp_authlen = 7; /* XXX DEFINE */ ch.carp_pad1 = 0; /* must be zero */ ch.carp_cksum = 0; /* XXXGL: OpenBSD picks first ifaddr with needed family. */ #ifdef INET if (sc->sc_naddrs) { struct ip *ip; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { CARPSTATS_INC(carps_onomem); goto resched; } len = sizeof(*ip) + sizeof(ch); m->m_pkthdr.len = len; m->m_pkthdr.rcvif = NULL; m->m_len = len; MH_ALIGN(m, m->m_len); m->m_flags |= M_MCAST; ip = mtod(m, struct ip *); ip->ip_v = IPVERSION; ip->ip_hl = sizeof(*ip) >> 2; ip->ip_tos = IPTOS_LOWDELAY; ip->ip_len = htons(len); ip->ip_id = ip_newid(); ip->ip_off = htons(IP_DF); ip->ip_ttl = CARP_DFLTTL; ip->ip_p = IPPROTO_CARP; ip->ip_sum = 0; bzero(&sa, sizeof(sa)); sa.sa_family = AF_INET; ifa = ifaof_ifpforaddr(&sa, sc->sc_carpdev); if (ifa != NULL) { ip->ip_src.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr; ifa_free(ifa); } else ip->ip_src.s_addr = 0; ip->ip_dst.s_addr = htonl(INADDR_CARP_GROUP); ch_ptr = (struct carp_header *)(&ip[1]); bcopy(&ch, ch_ptr, sizeof(ch)); if (carp_prepare_ad(m, sc, ch_ptr)) goto resched; m->m_data += sizeof(*ip); ch_ptr->carp_cksum = in_cksum(m, len - sizeof(*ip)); m->m_data -= sizeof(*ip); CARPSTATS_INC(carps_opackets); carp_send_ad_error(sc, ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_carpdev->if_carp->cif_imo, NULL)); } #endif /* INET */ #ifdef INET6 if (sc->sc_naddrs6) { struct ip6_hdr *ip6; m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { CARPSTATS_INC(carps_onomem); goto resched; } len = sizeof(*ip6) + sizeof(ch); m->m_pkthdr.len = len; m->m_pkthdr.rcvif = NULL; m->m_len = len; MH_ALIGN(m, m->m_len); m->m_flags |= M_MCAST; ip6 = mtod(m, struct ip6_hdr *); bzero(ip6, sizeof(*ip6)); ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_hlim = CARP_DFLTTL; ip6->ip6_nxt = IPPROTO_CARP; bzero(&sa, sizeof(sa)); /* set the source address */ sa.sa_family = AF_INET6; ifa = ifaof_ifpforaddr(&sa, sc->sc_carpdev); if (ifa != NULL) { bcopy(IFA_IN6(ifa), &ip6->ip6_src, sizeof(struct in6_addr)); ifa_free(ifa); } else /* This should never happen with IPv6. */ bzero(&ip6->ip6_src, sizeof(struct in6_addr)); /* Set the multicast destination. */ ip6->ip6_dst.s6_addr16[0] = htons(0xff02); ip6->ip6_dst.s6_addr8[15] = 0x12; if (in6_setscope(&ip6->ip6_dst, sc->sc_carpdev, NULL) != 0) { m_freem(m); CARP_DEBUG("%s: in6_setscope failed\n", __func__); goto resched; } ch_ptr = (struct carp_header *)(&ip6[1]); bcopy(&ch, ch_ptr, sizeof(ch)); if (carp_prepare_ad(m, sc, ch_ptr)) goto resched; m->m_data += sizeof(*ip6); ch_ptr->carp_cksum = in_cksum(m, len - sizeof(*ip6)); m->m_data -= sizeof(*ip6); CARPSTATS_INC(carps_opackets6); carp_send_ad_error(sc, ip6_output(m, NULL, NULL, 0, &sc->sc_carpdev->if_carp->cif_im6o, NULL, NULL)); } #endif /* INET6 */ resched: callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), carp_send_ad, sc); } static void carp_addroute(struct carp_softc *sc) { struct ifaddr *ifa; CARP_FOREACH_IFA(sc, ifa) carp_ifa_addroute(ifa); } static void carp_ifa_addroute(struct ifaddr *ifa) { switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: in_addprefix(ifatoia(ifa), RTF_UP); ifa_add_loopback_route(ifa, (struct sockaddr *)&ifatoia(ifa)->ia_addr); break; #endif #ifdef INET6 case AF_INET6: ifa_add_loopback_route(ifa, (struct sockaddr *)&ifatoia6(ifa)->ia_addr); nd6_add_ifa_lle(ifatoia6(ifa)); break; #endif } } static void carp_delroute(struct carp_softc *sc) { struct ifaddr *ifa; CARP_FOREACH_IFA(sc, ifa) carp_ifa_delroute(ifa); } static void carp_ifa_delroute(struct ifaddr *ifa) { switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: ifa_del_loopback_route(ifa, (struct sockaddr *)&ifatoia(ifa)->ia_addr); in_scrubprefix(ifatoia(ifa), LLE_STATIC); break; #endif #ifdef INET6 case AF_INET6: ifa_del_loopback_route(ifa, (struct sockaddr *)&ifatoia6(ifa)->ia_addr); nd6_rem_ifa_lle(ifatoia6(ifa)); break; #endif } } int carp_master(struct ifaddr *ifa) { struct carp_softc *sc = ifa->ifa_carp; return (sc->sc_state == MASTER); } #ifdef INET /* * Broadcast a gratuitous ARP request containing * the virtual router MAC address for each IP address * associated with the virtual router. */ static void carp_send_arp(struct carp_softc *sc) { struct ifaddr *ifa; CARP_FOREACH_IFA(sc, ifa) if (ifa->ifa_addr->sa_family == AF_INET) arp_ifinit2(sc->sc_carpdev, ifa, LLADDR(&sc->sc_addr)); } int carp_iamatch(struct ifaddr *ifa, uint8_t **enaddr) { struct carp_softc *sc = ifa->ifa_carp; if (sc->sc_state == MASTER) { *enaddr = LLADDR(&sc->sc_addr); return (1); } return (0); } #endif #ifdef INET6 static void carp_send_na(struct carp_softc *sc) { static struct in6_addr mcast = IN6ADDR_LINKLOCAL_ALLNODES_INIT; struct ifaddr *ifa; struct in6_addr *in6; CARP_FOREACH_IFA(sc, ifa) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; in6 = IFA_IN6(ifa); nd6_na_output(sc->sc_carpdev, &mcast, in6, ND_NA_FLAG_OVERRIDE, 1, NULL); DELAY(1000); /* XXX */ } } /* * Returns ifa in case it's a carp address and it is MASTER, or if the address * matches and is not a carp address. Returns NULL otherwise. */ struct ifaddr * carp_iamatch6(struct ifnet *ifp, struct in6_addr *taddr) { struct ifaddr *ifa; ifa = NULL; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (!IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa))) continue; if (ifa->ifa_carp && ifa->ifa_carp->sc_state != MASTER) ifa = NULL; else ifa_ref(ifa); break; } IF_ADDR_RUNLOCK(ifp); return (ifa); } caddr_t carp_macmatch6(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr) { struct ifaddr *ifa; IF_ADDR_RLOCK(ifp); IFNET_FOREACH_IFA(ifp, ifa) if (ifa->ifa_addr->sa_family == AF_INET6 && IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa))) { struct carp_softc *sc = ifa->ifa_carp; struct m_tag *mtag; IF_ADDR_RUNLOCK(ifp); mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct carp_softc *), M_NOWAIT); if (mtag == NULL) /* Better a bit than nothing. */ return (LLADDR(&sc->sc_addr)); bcopy(&sc, mtag + 1, sizeof(sc)); m_tag_prepend(m, mtag); return (LLADDR(&sc->sc_addr)); } IF_ADDR_RUNLOCK(ifp); return (NULL); } #endif /* INET6 */ int carp_forus(struct ifnet *ifp, u_char *dhost) { struct carp_softc *sc; uint8_t *ena = dhost; if (ena[0] || ena[1] || ena[2] != 0x5e || ena[3] || ena[4] != 1) return (0); CIF_LOCK(ifp->if_carp); IFNET_FOREACH_CARP(ifp, sc) { CARP_LOCK(sc); if (sc->sc_state == MASTER && !bcmp(dhost, LLADDR(&sc->sc_addr), ETHER_ADDR_LEN)) { CARP_UNLOCK(sc); CIF_UNLOCK(ifp->if_carp); return (1); } CARP_UNLOCK(sc); } CIF_UNLOCK(ifp->if_carp); return (0); } /* Master down timeout event, executed in callout context. */ static void carp_master_down(void *v) { struct carp_softc *sc = v; CARP_LOCK_ASSERT(sc); CURVNET_SET(sc->sc_carpdev->if_vnet); if (sc->sc_state == BACKUP) { CARP_LOG("VHID %u@%s: BACKUP -> MASTER (master down)\n", sc->sc_vhid, sc->sc_carpdev->if_xname); carp_master_down_locked(sc); } CURVNET_RESTORE(); CARP_UNLOCK(sc); } static void carp_master_down_locked(struct carp_softc *sc) { CARP_LOCK_ASSERT(sc); switch (sc->sc_state) { case BACKUP: carp_set_state(sc, MASTER); carp_send_ad_locked(sc); #ifdef INET carp_send_arp(sc); #endif #ifdef INET6 carp_send_na(sc); #endif carp_setrun(sc, 0); carp_addroute(sc); break; case INIT: case MASTER: #ifdef INVARIANTS panic("carp: VHID %u@%s: master_down event in %s state\n", sc->sc_vhid, sc->sc_carpdev->if_xname, sc->sc_state ? "MASTER" : "INIT"); #endif break; } } /* * When in backup state, af indicates whether to reset the master down timer * for v4 or v6. If it's set to zero, reset the ones which are already pending. */ static void carp_setrun(struct carp_softc *sc, sa_family_t af) { struct timeval tv; CARP_LOCK_ASSERT(sc); if ((sc->sc_carpdev->if_flags & IFF_UP) == 0 || sc->sc_carpdev->if_link_state != LINK_STATE_UP || (sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0)) return; switch (sc->sc_state) { case INIT: CARP_LOG("VHID %u@%s: INIT -> BACKUP\n", sc->sc_vhid, sc->sc_carpdev->if_xname); carp_set_state(sc, BACKUP); carp_setrun(sc, 0); break; case BACKUP: callout_stop(&sc->sc_ad_tmo); tv.tv_sec = 3 * sc->sc_advbase; tv.tv_usec = sc->sc_advskew * 1000000 / 256; switch (af) { #ifdef INET case AF_INET: callout_reset(&sc->sc_md_tmo, tvtohz(&tv), carp_master_down, sc); break; #endif #ifdef INET6 case AF_INET6: callout_reset(&sc->sc_md6_tmo, tvtohz(&tv), carp_master_down, sc); break; #endif default: #ifdef INET if (sc->sc_naddrs) callout_reset(&sc->sc_md_tmo, tvtohz(&tv), carp_master_down, sc); #endif #ifdef INET6 if (sc->sc_naddrs6) callout_reset(&sc->sc_md6_tmo, tvtohz(&tv), carp_master_down, sc); #endif break; } break; case MASTER: tv.tv_sec = sc->sc_advbase; tv.tv_usec = sc->sc_advskew * 1000000 / 256; callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), carp_send_ad, sc); break; } } /* * Setup multicast structures. */ static int carp_multicast_setup(struct carp_if *cif, sa_family_t sa) { struct ifnet *ifp = cif->cif_ifp; int error = 0; CIF_LOCK_ASSERT(cif); switch (sa) { #ifdef INET case AF_INET: { struct ip_moptions *imo = &cif->cif_imo; struct in_addr addr; if (imo->imo_membership) return (0); imo->imo_membership = (struct in_multi **)malloc( (sizeof(struct in_multi *) * IP_MIN_MEMBERSHIPS), M_CARP, M_NOWAIT); if (imo->imo_membership == NULL) return (ENOMEM); imo->imo_mfilters = NULL; imo->imo_max_memberships = IP_MIN_MEMBERSHIPS; imo->imo_multicast_vif = -1; addr.s_addr = htonl(INADDR_CARP_GROUP); if ((error = in_joingroup(ifp, &addr, NULL, &imo->imo_membership[0])) != 0) { free(imo->imo_membership, M_CARP); break; } imo->imo_num_memberships++; imo->imo_multicast_ifp = ifp; imo->imo_multicast_ttl = CARP_DFLTTL; imo->imo_multicast_loop = 0; break; } #endif #ifdef INET6 case AF_INET6: { struct ip6_moptions *im6o = &cif->cif_im6o; struct in6_addr in6; struct in6_multi *in6m; if (im6o->im6o_membership) return (0); im6o->im6o_membership = (struct in6_multi **)malloc( (sizeof(struct in6_multi *) * IPV6_MIN_MEMBERSHIPS), M_CARP, M_ZERO | M_NOWAIT); if (im6o->im6o_membership == NULL) return (ENOMEM); im6o->im6o_mfilters = NULL; im6o->im6o_max_memberships = IPV6_MIN_MEMBERSHIPS; im6o->im6o_multicast_hlim = CARP_DFLTTL; im6o->im6o_multicast_ifp = ifp; /* Join IPv6 CARP multicast group. */ bzero(&in6, sizeof(in6)); in6.s6_addr16[0] = htons(0xff02); in6.s6_addr8[15] = 0x12; if ((error = in6_setscope(&in6, ifp, NULL)) != 0) { free(im6o->im6o_membership, M_CARP); break; } in6m = NULL; if ((error = in6_mc_join(ifp, &in6, NULL, &in6m, 0)) != 0) { free(im6o->im6o_membership, M_CARP); break; } im6o->im6o_membership[0] = in6m; im6o->im6o_num_memberships++; /* Join solicited multicast address. */ bzero(&in6, sizeof(in6)); in6.s6_addr16[0] = htons(0xff02); in6.s6_addr32[1] = 0; in6.s6_addr32[2] = htonl(1); in6.s6_addr32[3] = 0; in6.s6_addr8[12] = 0xff; if ((error = in6_setscope(&in6, ifp, NULL)) != 0) { in6_mc_leave(im6o->im6o_membership[0], NULL); free(im6o->im6o_membership, M_CARP); break; } in6m = NULL; if ((error = in6_mc_join(ifp, &in6, NULL, &in6m, 0)) != 0) { in6_mc_leave(im6o->im6o_membership[0], NULL); free(im6o->im6o_membership, M_CARP); break; } im6o->im6o_membership[1] = in6m; im6o->im6o_num_memberships++; break; } #endif } return (error); } /* * Free multicast structures. */ static void carp_multicast_cleanup(struct carp_if *cif, sa_family_t sa) { CIF_LOCK_ASSERT(cif); switch (sa) { #ifdef INET case AF_INET: if (cif->cif_naddrs == 0) { struct ip_moptions *imo = &cif->cif_imo; in_leavegroup(imo->imo_membership[0], NULL); KASSERT(imo->imo_mfilters == NULL, ("%s: imo_mfilters != NULL", __func__)); free(imo->imo_membership, M_CARP); imo->imo_membership = NULL; } break; #endif #ifdef INET6 case AF_INET6: if (cif->cif_naddrs6 == 0) { struct ip6_moptions *im6o = &cif->cif_im6o; in6_mc_leave(im6o->im6o_membership[0], NULL); in6_mc_leave(im6o->im6o_membership[1], NULL); KASSERT(im6o->im6o_mfilters == NULL, ("%s: im6o_mfilters != NULL", __func__)); free(im6o->im6o_membership, M_CARP); im6o->im6o_membership = NULL; } break; #endif } } int carp_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa) { struct m_tag *mtag; struct carp_softc *sc; if (!sa) return (0); switch (sa->sa_family) { #ifdef INET case AF_INET: break; #endif #ifdef INET6 case AF_INET6: break; #endif default: return (0); } mtag = m_tag_find(m, PACKET_TAG_CARP, NULL); if (mtag == NULL) return (0); bcopy(mtag + 1, &sc, sizeof(sc)); /* Set the source MAC address to the Virtual Router MAC Address. */ switch (ifp->if_type) { case IFT_ETHER: case IFT_BRIDGE: case IFT_L2VLAN: { struct ether_header *eh; eh = mtod(m, struct ether_header *); eh->ether_shost[0] = 0; eh->ether_shost[1] = 0; eh->ether_shost[2] = 0x5e; eh->ether_shost[3] = 0; eh->ether_shost[4] = 1; eh->ether_shost[5] = sc->sc_vhid; } break; case IFT_FDDI: { struct fddi_header *fh; fh = mtod(m, struct fddi_header *); fh->fddi_shost[0] = 0; fh->fddi_shost[1] = 0; fh->fddi_shost[2] = 0x5e; fh->fddi_shost[3] = 0; fh->fddi_shost[4] = 1; fh->fddi_shost[5] = sc->sc_vhid; } break; case IFT_ISO88025: { struct iso88025_header *th; th = mtod(m, struct iso88025_header *); th->iso88025_shost[0] = 3; th->iso88025_shost[1] = 0; th->iso88025_shost[2] = 0x40 >> (sc->sc_vhid - 1); th->iso88025_shost[3] = 0x40000 >> (sc->sc_vhid - 1); th->iso88025_shost[4] = 0; th->iso88025_shost[5] = 0; } break; default: printf("%s: carp is not supported for the %d interface type\n", ifp->if_xname, ifp->if_type); return (EOPNOTSUPP); } return (0); } static struct carp_softc* carp_alloc(struct ifnet *ifp) { struct carp_softc *sc; struct carp_if *cif; if ((cif = ifp->if_carp) == NULL) cif = carp_alloc_if(ifp); sc = malloc(sizeof(*sc), M_CARP, M_WAITOK|M_ZERO); sc->sc_advbase = CARP_DFLTINTV; sc->sc_vhid = -1; /* required setting */ sc->sc_init_counter = 1; sc->sc_state = INIT; sc->sc_ifasiz = sizeof(struct ifaddr *); sc->sc_ifas = malloc(sc->sc_ifasiz, M_CARP, M_WAITOK|M_ZERO); sc->sc_carpdev = ifp; CARP_LOCK_INIT(sc); #ifdef INET callout_init_mtx(&sc->sc_md_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED); #endif #ifdef INET6 callout_init_mtx(&sc->sc_md6_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED); #endif callout_init_mtx(&sc->sc_ad_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED); CIF_LOCK(cif); TAILQ_INSERT_TAIL(&cif->cif_vrs, sc, sc_list); CIF_UNLOCK(cif); mtx_lock(&carp_mtx); LIST_INSERT_HEAD(&carp_list, sc, sc_next); mtx_unlock(&carp_mtx); return (sc); } static int carp_grow_ifas(struct carp_softc *sc) { struct ifaddr **new; CARP_LOCK_ASSERT(sc); new = malloc(sc->sc_ifasiz * 2, M_CARP, M_NOWAIT|M_ZERO); if (new == NULL) return (ENOMEM); bcopy(sc->sc_ifas, new, sc->sc_ifasiz); free(sc->sc_ifas, M_CARP); sc->sc_ifas = new; sc->sc_ifasiz *= 2; return (0); } static void carp_destroy(struct carp_softc *sc) { struct ifnet *ifp = sc->sc_carpdev; struct carp_if *cif = ifp->if_carp; CIF_LOCK_ASSERT(cif); TAILQ_REMOVE(&cif->cif_vrs, sc, sc_list); mtx_lock(&carp_mtx); LIST_REMOVE(sc, sc_next); mtx_unlock(&carp_mtx); CARP_LOCK(sc); if (sc->sc_suppress) carp_demote_adj(-V_carp_ifdown_adj, "vhid removed"); callout_drain(&sc->sc_ad_tmo); #ifdef INET callout_drain(&sc->sc_md_tmo); #endif #ifdef INET6 callout_drain(&sc->sc_md6_tmo); #endif CARP_LOCK_DESTROY(sc); free(sc->sc_ifas, M_CARP); free(sc, M_CARP); } static struct carp_if* carp_alloc_if(struct ifnet *ifp) { struct carp_if *cif; int error; cif = malloc(sizeof(*cif), M_CARP, M_WAITOK|M_ZERO); if ((error = ifpromisc(ifp, 1)) != 0) printf("%s: ifpromisc(%s) failed: %d\n", __func__, ifp->if_xname, error); else cif->cif_flags |= CIF_PROMISC; CIF_LOCK_INIT(cif); cif->cif_ifp = ifp; TAILQ_INIT(&cif->cif_vrs); IF_ADDR_WLOCK(ifp); ifp->if_carp = cif; if_ref(ifp); IF_ADDR_WUNLOCK(ifp); return (cif); } static void carp_free_if(struct carp_if *cif) { struct ifnet *ifp = cif->cif_ifp; CIF_LOCK_ASSERT(cif); KASSERT(TAILQ_EMPTY(&cif->cif_vrs), ("%s: softc list not empty", __func__)); IF_ADDR_WLOCK(ifp); ifp->if_carp = NULL; IF_ADDR_WUNLOCK(ifp); CIF_LOCK_DESTROY(cif); if (cif->cif_flags & CIF_PROMISC) ifpromisc(ifp, 0); if_rele(ifp); free(cif, M_CARP); } static void carp_carprcp(struct carpreq *carpr, struct carp_softc *sc, int priv) { CARP_LOCK(sc); carpr->carpr_state = sc->sc_state; carpr->carpr_vhid = sc->sc_vhid; carpr->carpr_advbase = sc->sc_advbase; carpr->carpr_advskew = sc->sc_advskew; if (priv) bcopy(sc->sc_key, carpr->carpr_key, sizeof(carpr->carpr_key)); else bzero(carpr->carpr_key, sizeof(carpr->carpr_key)); CARP_UNLOCK(sc); } int carp_ioctl(struct ifreq *ifr, u_long cmd, struct thread *td) { struct carpreq carpr; struct ifnet *ifp; struct carp_softc *sc = NULL; int error = 0, locked = 0; if ((error = copyin(ifr->ifr_data, &carpr, sizeof carpr))) return (error); ifp = ifunit_ref(ifr->ifr_name); if (ifp == NULL) return (ENXIO); switch (ifp->if_type) { case IFT_ETHER: case IFT_L2VLAN: case IFT_BRIDGE: case IFT_FDDI: case IFT_ISO88025: break; default: error = EOPNOTSUPP; goto out; } if ((ifp->if_flags & IFF_MULTICAST) == 0) { error = EADDRNOTAVAIL; goto out; } switch (cmd) { case SIOCSVH: if ((error = priv_check(td, PRIV_NETINET_CARP))) break; if (carpr.carpr_vhid <= 0 || carpr.carpr_vhid > CARP_MAXVHID || carpr.carpr_advbase < 0 || carpr.carpr_advskew < 0) { error = EINVAL; break; } if (ifp->if_carp) { CIF_LOCK(ifp->if_carp); IFNET_FOREACH_CARP(ifp, sc) if (sc->sc_vhid == carpr.carpr_vhid) break; CIF_UNLOCK(ifp->if_carp); } if (sc == NULL) { sc = carp_alloc(ifp); CARP_LOCK(sc); sc->sc_vhid = carpr.carpr_vhid; LLADDR(&sc->sc_addr)[0] = 0; LLADDR(&sc->sc_addr)[1] = 0; LLADDR(&sc->sc_addr)[2] = 0x5e; LLADDR(&sc->sc_addr)[3] = 0; LLADDR(&sc->sc_addr)[4] = 1; LLADDR(&sc->sc_addr)[5] = sc->sc_vhid; } else CARP_LOCK(sc); locked = 1; if (carpr.carpr_advbase > 0) { if (carpr.carpr_advbase > 255 || carpr.carpr_advbase < CARP_DFLTINTV) { error = EINVAL; break; } sc->sc_advbase = carpr.carpr_advbase; } if (carpr.carpr_advskew > 0) { if (carpr.carpr_advskew >= 255) { error = EINVAL; break; } sc->sc_advskew = carpr.carpr_advskew; } if (carpr.carpr_key[0] != '\0') { bcopy(carpr.carpr_key, sc->sc_key, sizeof(sc->sc_key)); carp_hmac_prepare(sc); } if (sc->sc_state != INIT && carpr.carpr_state != sc->sc_state) { switch (carpr.carpr_state) { case BACKUP: callout_stop(&sc->sc_ad_tmo); carp_set_state(sc, BACKUP); carp_setrun(sc, 0); carp_delroute(sc); break; case MASTER: carp_master_down_locked(sc); break; default: break; } } break; case SIOCGVH: { int priveleged; if (carpr.carpr_vhid < 0 || carpr.carpr_vhid > CARP_MAXVHID) { error = EINVAL; break; } if (carpr.carpr_count < 1) { error = EMSGSIZE; break; } if (ifp->if_carp == NULL) { error = ENOENT; break; } priveleged = (priv_check(td, PRIV_NETINET_CARP) == 0); if (carpr.carpr_vhid != 0) { CIF_LOCK(ifp->if_carp); IFNET_FOREACH_CARP(ifp, sc) if (sc->sc_vhid == carpr.carpr_vhid) break; CIF_UNLOCK(ifp->if_carp); if (sc == NULL) { error = ENOENT; break; } carp_carprcp(&carpr, sc, priveleged); error = copyout(&carpr, ifr->ifr_data, sizeof(carpr)); } else { int i, count; count = 0; CIF_LOCK(ifp->if_carp); IFNET_FOREACH_CARP(ifp, sc) count++; if (count > carpr.carpr_count) { CIF_UNLOCK(ifp->if_carp); error = EMSGSIZE; break; } i = 0; IFNET_FOREACH_CARP(ifp, sc) { carp_carprcp(&carpr, sc, priveleged); carpr.carpr_count = count; error = copyout(&carpr, ifr->ifr_data + (i * sizeof(carpr)), sizeof(carpr)); if (error) { CIF_UNLOCK(ifp->if_carp); break; } i++; } CIF_UNLOCK(ifp->if_carp); } break; } default: error = EINVAL; } out: if (locked) CARP_UNLOCK(sc); if_rele(ifp); return (error); } static int carp_get_vhid(struct ifaddr *ifa) { if (ifa == NULL || ifa->ifa_carp == NULL) return (0); return (ifa->ifa_carp->sc_vhid); } int carp_attach(struct ifaddr *ifa, int vhid) { struct ifnet *ifp = ifa->ifa_ifp; struct carp_if *cif = ifp->if_carp; struct carp_softc *sc; int index, error; if (ifp->if_carp == NULL) return (ENOPROTOOPT); switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: #endif #ifdef INET6 case AF_INET6: #endif break; default: return (EPROTOTYPE); } CIF_LOCK(cif); IFNET_FOREACH_CARP(ifp, sc) if (sc->sc_vhid == vhid) break; if (sc == NULL) { CIF_UNLOCK(cif); return (ENOENT); } if (ifa->ifa_carp) { if (ifa->ifa_carp->sc_vhid != vhid) carp_detach_locked(ifa); else { CIF_UNLOCK(cif); return (0); } } error = carp_multicast_setup(cif, ifa->ifa_addr->sa_family); if (error) { CIF_FREE(cif); return (error); } CARP_LOCK(sc); index = sc->sc_naddrs + sc->sc_naddrs6 + 1; if (index > sc->sc_ifasiz / sizeof(struct ifaddr *)) if ((error = carp_grow_ifas(sc)) != 0) { carp_multicast_cleanup(cif, ifa->ifa_addr->sa_family); CARP_UNLOCK(sc); CIF_FREE(cif); return (error); } switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: cif->cif_naddrs++; sc->sc_naddrs++; break; #endif #ifdef INET6 case AF_INET6: cif->cif_naddrs6++; sc->sc_naddrs6++; break; #endif } ifa_ref(ifa); sc->sc_ifas[index - 1] = ifa; ifa->ifa_carp = sc; carp_hmac_prepare(sc); carp_sc_state(sc); CARP_UNLOCK(sc); CIF_UNLOCK(cif); return (0); } void carp_detach(struct ifaddr *ifa) { struct ifnet *ifp = ifa->ifa_ifp; struct carp_if *cif = ifp->if_carp; CIF_LOCK(cif); carp_detach_locked(ifa); CIF_FREE(cif); } static void carp_detach_locked(struct ifaddr *ifa) { struct ifnet *ifp = ifa->ifa_ifp; struct carp_if *cif = ifp->if_carp; struct carp_softc *sc = ifa->ifa_carp; int i, index; KASSERT(sc != NULL, ("%s: %p not attached", __func__, ifa)); CIF_LOCK_ASSERT(cif); CARP_LOCK(sc); /* Shift array. */ index = sc->sc_naddrs + sc->sc_naddrs6; for (i = 0; i < index; i++) if (sc->sc_ifas[i] == ifa) break; KASSERT(i < index, ("%s: %p no backref", __func__, ifa)); for (; i < index - 1; i++) sc->sc_ifas[i] = sc->sc_ifas[i+1]; sc->sc_ifas[index - 1] = NULL; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: cif->cif_naddrs--; sc->sc_naddrs--; break; #endif #ifdef INET6 case AF_INET6: cif->cif_naddrs6--; sc->sc_naddrs6--; break; #endif } carp_ifa_delroute(ifa); carp_multicast_cleanup(cif, ifa->ifa_addr->sa_family); ifa->ifa_carp = NULL; ifa_free(ifa); carp_hmac_prepare(sc); carp_sc_state(sc); if (sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0) { CARP_UNLOCK(sc); carp_destroy(sc); } else CARP_UNLOCK(sc); } static void carp_set_state(struct carp_softc *sc, int state) { CARP_LOCK_ASSERT(sc); if (sc->sc_state != state) { const char *carp_states[] = { CARP_STATES }; char subsys[IFNAMSIZ+5]; sc->sc_state = state; snprintf(subsys, IFNAMSIZ+5, "%u@%s", sc->sc_vhid, sc->sc_carpdev->if_xname); devctl_notify("CARP", subsys, carp_states[state], NULL); } } static void carp_linkstate(struct ifnet *ifp) { struct carp_softc *sc; CIF_LOCK(ifp->if_carp); IFNET_FOREACH_CARP(ifp, sc) { CARP_LOCK(sc); carp_sc_state(sc); CARP_UNLOCK(sc); } CIF_UNLOCK(ifp->if_carp); } static void carp_sc_state(struct carp_softc *sc) { CARP_LOCK_ASSERT(sc); if (sc->sc_carpdev->if_link_state != LINK_STATE_UP || !(sc->sc_carpdev->if_flags & IFF_UP)) { callout_stop(&sc->sc_ad_tmo); #ifdef INET callout_stop(&sc->sc_md_tmo); #endif #ifdef INET6 callout_stop(&sc->sc_md6_tmo); #endif carp_set_state(sc, INIT); carp_setrun(sc, 0); if (!sc->sc_suppress) carp_demote_adj(V_carp_ifdown_adj, "interface down"); sc->sc_suppress = 1; } else { carp_set_state(sc, INIT); carp_setrun(sc, 0); if (sc->sc_suppress) carp_demote_adj(-V_carp_ifdown_adj, "interface up"); sc->sc_suppress = 0; } } static void carp_demote_adj(int adj, char *reason) { atomic_add_int(&V_carp_demotion, adj); CARP_LOG("demoted by %d to %d (%s)\n", adj, V_carp_demotion, reason); taskqueue_enqueue(taskqueue_swi, &carp_sendall_task); } static int carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS) { int new, error; new = V_carp_demotion; error = sysctl_handle_int(oidp, &new, 0, req); if (error || !req->newptr) return (error); carp_demote_adj(new, "sysctl"); return (0); } #ifdef INET extern struct domain inetdomain; static struct protosw in_carp_protosw = { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_CARP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = carp_input, .pr_output = rip_output, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }; #endif #ifdef INET6 extern struct domain inet6domain; static struct protosw in6_carp_protosw = { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_CARP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = carp6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }; #endif static void carp_mod_cleanup(void) { #ifdef INET if (proto_reg[CARP_INET] == 0) { (void)ipproto_unregister(IPPROTO_CARP); pf_proto_unregister(PF_INET, IPPROTO_CARP, SOCK_RAW); proto_reg[CARP_INET] = -1; } carp_iamatch_p = NULL; #endif #ifdef INET6 if (proto_reg[CARP_INET6] == 0) { (void)ip6proto_unregister(IPPROTO_CARP); pf_proto_unregister(PF_INET6, IPPROTO_CARP, SOCK_RAW); proto_reg[CARP_INET6] = -1; } carp_iamatch6_p = NULL; carp_macmatch6_p = NULL; #endif carp_ioctl_p = NULL; carp_attach_p = NULL; carp_detach_p = NULL; carp_get_vhid_p = NULL; carp_linkstate_p = NULL; carp_forus_p = NULL; carp_output_p = NULL; carp_demote_adj_p = NULL; carp_master_p = NULL; mtx_unlock(&carp_mtx); taskqueue_drain(taskqueue_swi, &carp_sendall_task); mtx_destroy(&carp_mtx); } static int carp_mod_load(void) { int err; mtx_init(&carp_mtx, "carp_mtx", NULL, MTX_DEF); LIST_INIT(&carp_list); carp_get_vhid_p = carp_get_vhid; carp_forus_p = carp_forus; carp_output_p = carp_output; carp_linkstate_p = carp_linkstate; carp_ioctl_p = carp_ioctl; carp_attach_p = carp_attach; carp_detach_p = carp_detach; carp_demote_adj_p = carp_demote_adj; carp_master_p = carp_master; #ifdef INET6 carp_iamatch6_p = carp_iamatch6; carp_macmatch6_p = carp_macmatch6; proto_reg[CARP_INET6] = pf_proto_register(PF_INET6, (struct protosw *)&in6_carp_protosw); if (proto_reg[CARP_INET6]) { printf("carp: error %d attaching to PF_INET6\n", proto_reg[CARP_INET6]); carp_mod_cleanup(); return (proto_reg[CARP_INET6]); } err = ip6proto_register(IPPROTO_CARP); if (err) { printf("carp: error %d registering with INET6\n", err); carp_mod_cleanup(); return (err); } #endif #ifdef INET carp_iamatch_p = carp_iamatch; proto_reg[CARP_INET] = pf_proto_register(PF_INET, &in_carp_protosw); if (proto_reg[CARP_INET]) { printf("carp: error %d attaching to PF_INET\n", proto_reg[CARP_INET]); carp_mod_cleanup(); return (proto_reg[CARP_INET]); } err = ipproto_register(IPPROTO_CARP); if (err) { printf("carp: error %d registering with INET\n", err); carp_mod_cleanup(); return (err); } #endif return (0); } static int carp_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: return carp_mod_load(); /* NOTREACHED */ case MOD_UNLOAD: mtx_lock(&carp_mtx); if (LIST_EMPTY(&carp_list)) carp_mod_cleanup(); else { mtx_unlock(&carp_mtx); return (EBUSY); } break; default: return (EINVAL); } return (0); } static moduledata_t carp_mod = { "carp", carp_modevent, 0 }; DECLARE_MODULE(carp, carp_mod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); Index: user/dchagin/lemul/sys/netinet/ip_fastfwd.c =================================================================== --- user/dchagin/lemul/sys/netinet/ip_fastfwd.c (revision 274240) +++ user/dchagin/lemul/sys/netinet/ip_fastfwd.c (revision 274241) @@ -1,599 +1,599 @@ /*- * Copyright (c) 2003 Andre Oppermann, Internet Business Solutions AG * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * ip_fastforward gets its speed from processing the forwarded packet to * completion (if_output on the other side) without any queues or netisr's. * The receiving interface DMAs the packet into memory, the upper half of * driver calls ip_fastforward, we do our routing table lookup and directly * send it off to the outgoing interface, which DMAs the packet to the * network card. The only part of the packet we touch with the CPU is the * IP header (unless there are complex firewall rules touching other parts * of the packet, but that is up to you). We are essentially limited by bus * bandwidth and how fast the network card/driver can set up receives and * transmits. * * We handle basic errors, IP header errors, checksum errors, * destination unreachable, fragmentation and fragmentation needed and * report them via ICMP to the sender. * * Else if something is not pure IPv4 unicast forwarding we fall back to * the normal ip_input processing path. We should only be called from * interfaces connected to the outside world. * * Firewalling is fully supported including divert, ipfw fwd and ipfilter * ipnat and address rewrite. * * IPSEC is not supported if this host is a tunnel broker. IPSEC is * supported for connections to/from local host. * * We try to do the least expensive (in CPU ops) checks and operations * first to catch junk with as little overhead as possible. * * We take full advantage of hardware support for IP checksum and * fragmentation offloading. * * We don't do ICMP redirect in the fast forwarding path. I have had my own * cases where two core routers with Zebra routing suite would send millions * ICMP redirects to connected hosts if the destination router was not the * default gateway. In one case it was filling the routing table of a host * with approximately 300.000 cloned redirect entries until it ran out of * kernel memory. However the networking code proved very robust and it didn't * crash or fail in other ways. */ /* * Many thanks to Matt Thomas of NetBSD for basic structure of ip_flow.c which * is being followed here. */ #include __FBSDID("$FreeBSD$"); #include "opt_ipfw.h" #include "opt_ipstealth.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static VNET_DEFINE(int, ipfastforward_active); #define V_ipfastforward_active VNET(ipfastforward_active) -SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, fastforwarding, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, fastforwarding, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipfastforward_active), 0, "Enable fast IP forwarding"); static struct sockaddr_in * ip_findroute(struct route *ro, struct in_addr dest, struct mbuf *m) { struct sockaddr_in *dst; struct rtentry *rt; /* * Find route to destination. */ bzero(ro, sizeof(*ro)); dst = (struct sockaddr_in *)&ro->ro_dst; dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr.s_addr = dest.s_addr; in_rtalloc_ign(ro, 0, M_GETFIB(m)); /* * Route there and interface still up? */ rt = ro->ro_rt; if (rt && (rt->rt_flags & RTF_UP) && (rt->rt_ifp->if_flags & IFF_UP) && (rt->rt_ifp->if_drv_flags & IFF_DRV_RUNNING)) { if (rt->rt_flags & RTF_GATEWAY) dst = (struct sockaddr_in *)rt->rt_gateway; } else { IPSTAT_INC(ips_noroute); IPSTAT_INC(ips_cantforward); if (rt) RTFREE(rt); icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0); return NULL; } return dst; } /* * Try to forward a packet based on the destination address. * This is a fast path optimized for the plain forwarding case. * If the packet is handled (and consumed) here then we return NULL; * otherwise mbuf is returned and the packet should be delivered * to ip_input for full processing. */ struct mbuf * ip_fastforward(struct mbuf *m) { struct ip *ip; struct mbuf *m0 = NULL; struct route ro; struct sockaddr_in *dst = NULL; struct ifnet *ifp; struct in_addr odest, dest; uint16_t sum, ip_len, ip_off; int error = 0; int hlen, mtu; struct m_tag *fwd_tag = NULL; /* * Are we active and forwarding packets? */ if (!V_ipfastforward_active || !V_ipforwarding) return m; M_ASSERTVALID(m); M_ASSERTPKTHDR(m); bzero(&ro, sizeof(ro)); /* * Step 1: check for packet drop conditions (and sanity checks) */ /* * Is entire packet big enough? */ if (m->m_pkthdr.len < sizeof(struct ip)) { IPSTAT_INC(ips_tooshort); goto drop; } /* * Is first mbuf large enough for ip header and is header present? */ if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == NULL) { IPSTAT_INC(ips_toosmall); return NULL; /* mbuf already free'd */ } ip = mtod(m, struct ip *); /* * Is it IPv4? */ if (ip->ip_v != IPVERSION) { IPSTAT_INC(ips_badvers); goto drop; } /* * Is IP header length correct and is it in first mbuf? */ hlen = ip->ip_hl << 2; if (hlen < sizeof(struct ip)) { /* minimum header length */ IPSTAT_INC(ips_badhlen); goto drop; } if (hlen > m->m_len) { if ((m = m_pullup(m, hlen)) == NULL) { IPSTAT_INC(ips_badhlen); return NULL; /* mbuf already free'd */ } ip = mtod(m, struct ip *); } /* * Checksum correct? */ if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); else { if (hlen == sizeof(struct ip)) sum = in_cksum_hdr(ip); else sum = in_cksum(m, hlen); } if (sum) { IPSTAT_INC(ips_badsum); goto drop; } /* * Remember that we have checked the IP header and found it valid. */ m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID); ip_len = ntohs(ip->ip_len); /* * Is IP length longer than packet we have got? */ if (m->m_pkthdr.len < ip_len) { IPSTAT_INC(ips_tooshort); goto drop; } /* * Is packet longer than IP header tells us? If yes, truncate packet. */ if (m->m_pkthdr.len > ip_len) { if (m->m_len == m->m_pkthdr.len) { m->m_len = ip_len; m->m_pkthdr.len = ip_len; } else m_adj(m, ip_len - m->m_pkthdr.len); } /* * Is packet from or to 127/8? */ if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { IPSTAT_INC(ips_badaddr); goto drop; } #ifdef ALTQ /* * Is packet dropped by traffic conditioner? */ if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0) goto drop; #endif /* * Step 2: fallback conditions to normal ip_input path processing */ /* * Only IP packets without options */ if (ip->ip_hl != (sizeof(struct ip) >> 2)) { if (V_ip_doopts == 1) return m; else if (V_ip_doopts == 2) { icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_FILTER_PROHIB, 0, 0); return NULL; /* mbuf already free'd */ } /* else ignore IP options and continue */ } /* * Only unicast IP, not from loopback, no L2 or IP broadcast, * no multicast, no INADDR_ANY * * XXX: Probably some of these checks could be direct drop * conditions. However it is not clear whether there are some * hacks or obscure behaviours which make it neccessary to * let ip_input handle it. We play safe here and let ip_input * deal with it until it is proven that we can directly drop it. */ if ((m->m_flags & (M_BCAST|M_MCAST)) || (m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) || ntohl(ip->ip_src.s_addr) == (u_long)INADDR_BROADCAST || ntohl(ip->ip_dst.s_addr) == (u_long)INADDR_BROADCAST || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)) || IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) || ip->ip_src.s_addr == INADDR_ANY || ip->ip_dst.s_addr == INADDR_ANY ) return m; /* * Is it for a local address on this host? */ if (in_localip(ip->ip_dst)) return m; IPSTAT_INC(ips_total); /* * Step 3: incoming packet firewall processing */ odest.s_addr = dest.s_addr = ip->ip_dst.s_addr; /* * Run through list of ipfilter hooks for input packets */ if (!PFIL_HOOKED(&V_inet_pfil_hook)) goto passin; if (pfil_run_hooks( &V_inet_pfil_hook, &m, m->m_pkthdr.rcvif, PFIL_IN, NULL) || m == NULL) goto drop; M_ASSERTVALID(m); M_ASSERTPKTHDR(m); ip = mtod(m, struct ip *); /* m may have changed by pfil hook */ dest.s_addr = ip->ip_dst.s_addr; /* * Destination address changed? */ if (odest.s_addr != dest.s_addr) { /* * Is it now for a local address on this host? */ if (in_localip(dest)) goto forwardlocal; /* * Go on with new destination address */ } if (m->m_flags & M_FASTFWD_OURS) { /* * ipfw changed it for a local address on this host. */ goto forwardlocal; } passin: /* * Step 4: decrement TTL and look up route */ /* * Check TTL */ #ifdef IPSTEALTH if (!V_ipstealth) { #endif if (ip->ip_ttl <= IPTTLDEC) { icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0, 0); return NULL; /* mbuf already free'd */ } /* * Decrement the TTL and incrementally change the IP header checksum. * Don't bother doing this with hw checksum offloading, it's faster * doing it right here. */ ip->ip_ttl -= IPTTLDEC; if (ip->ip_sum >= (u_int16_t) ~htons(IPTTLDEC << 8)) ip->ip_sum -= ~htons(IPTTLDEC << 8); else ip->ip_sum += htons(IPTTLDEC << 8); #ifdef IPSTEALTH } #endif /* * Find route to destination. */ if ((dst = ip_findroute(&ro, dest, m)) == NULL) return NULL; /* icmp unreach already sent */ ifp = ro.ro_rt->rt_ifp; /* * Immediately drop blackholed traffic, and directed broadcasts * for either the all-ones or all-zero subnet addresses on * locally attached networks. */ if ((ro.ro_rt->rt_flags & (RTF_BLACKHOLE|RTF_BROADCAST)) != 0) goto drop; /* * Step 5: outgoing firewall packet processing */ /* * Run through list of hooks for output packets. */ if (!PFIL_HOOKED(&V_inet_pfil_hook)) goto passout; if (pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_OUT, NULL) || m == NULL) { goto drop; } M_ASSERTVALID(m); M_ASSERTPKTHDR(m); ip = mtod(m, struct ip *); dest.s_addr = ip->ip_dst.s_addr; /* * Destination address changed? */ if (m->m_flags & M_IP_NEXTHOP) fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); if (odest.s_addr != dest.s_addr || fwd_tag != NULL) { /* * Is it now for a local address on this host? */ if (m->m_flags & M_FASTFWD_OURS || in_localip(dest)) { forwardlocal: /* * Return packet for processing by ip_input(). */ m->m_flags |= M_FASTFWD_OURS; if (ro.ro_rt) RTFREE(ro.ro_rt); return m; } /* * Redo route lookup with new destination address */ if (fwd_tag) { dest.s_addr = ((struct sockaddr_in *) (fwd_tag + 1))->sin_addr.s_addr; m_tag_delete(m, fwd_tag); m->m_flags &= ~M_IP_NEXTHOP; } RTFREE(ro.ro_rt); if ((dst = ip_findroute(&ro, dest, m)) == NULL) return NULL; /* icmp unreach already sent */ ifp = ro.ro_rt->rt_ifp; } passout: /* * Step 6: send off the packet */ ip_len = ntohs(ip->ip_len); ip_off = ntohs(ip->ip_off); /* * Check if route is dampned (when ARP is unable to resolve) */ if ((ro.ro_rt->rt_flags & RTF_REJECT) && (ro.ro_rt->rt_expire == 0 || time_uptime < ro.ro_rt->rt_expire)) { icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0); goto consumed; } #ifndef ALTQ /* * Check if there is enough space in the interface queue */ if ((ifp->if_snd.ifq_len + ip_len / ifp->if_mtu + 1) >= ifp->if_snd.ifq_maxlen) { IPSTAT_INC(ips_odropped); /* would send source quench here but that is depreciated */ goto drop; } #endif /* * Check if media link state of interface is not down */ if (ifp->if_link_state == LINK_STATE_DOWN) { icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0); goto consumed; } /* * Check if packet fits MTU or if hardware will fragment for us */ if (ro.ro_rt->rt_mtu) mtu = min(ro.ro_rt->rt_mtu, ifp->if_mtu); else mtu = ifp->if_mtu; if (ip_len <= mtu) { /* * Avoid confusing lower layers. */ m_clrprotoflags(m); /* * Send off the packet via outgoing interface */ IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL); error = (*ifp->if_output)(ifp, m, (struct sockaddr *)dst, &ro); } else { /* * Handle EMSGSIZE with icmp reply needfrag for TCP MTU discovery */ if (ip_off & IP_DF) { IPSTAT_INC(ips_cantfrag); icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0, mtu); goto consumed; } else { /* * We have to fragment the packet */ m->m_pkthdr.csum_flags |= CSUM_IP; if (ip_fragment(ip, &m, mtu, ifp->if_hwassist)) goto drop; KASSERT(m != NULL, ("null mbuf and no error")); /* * Send off the fragments via outgoing interface */ error = 0; do { m0 = m->m_nextpkt; m->m_nextpkt = NULL; /* * Avoid confusing lower layers. */ m_clrprotoflags(m); IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL); error = (*ifp->if_output)(ifp, m, (struct sockaddr *)dst, &ro); if (error) break; } while ((m = m0) != NULL); if (error) { /* Reclaim remaining fragments */ for (m = m0; m; m = m0) { m0 = m->m_nextpkt; m_freem(m); } } else IPSTAT_INC(ips_fragmented); } } if (error != 0) IPSTAT_INC(ips_odropped); else { counter_u64_add(ro.ro_rt->rt_pksent, 1); IPSTAT_INC(ips_forward); IPSTAT_INC(ips_fastforward); } consumed: RTFREE(ro.ro_rt); return NULL; drop: if (m) m_freem(m); if (ro.ro_rt) RTFREE(ro.ro_rt); return NULL; } Index: user/dchagin/lemul/sys/netinet/ip_icmp.c =================================================================== --- user/dchagin/lemul/sys/netinet/ip_icmp.c (revision 274240) +++ user/dchagin/lemul/sys/netinet/ip_icmp.c (revision 274241) @@ -1,1022 +1,1022 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_icmp.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #ifdef IPSEC #include #include #endif #include #include #endif /* INET */ /* * ICMP routines: error generation, receive packet processing, and * routines to turnaround packets back to the originator, and * host table maintenance routines. */ static VNET_DEFINE(int, icmplim) = 200; #define V_icmplim VNET(icmplim) -SYSCTL_VNET_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RW, +SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmplim), 0, "Maximum number of ICMP responses per second"); static VNET_DEFINE(int, icmplim_output) = 1; #define V_icmplim_output VNET(icmplim_output) -SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, icmplim_output, CTLFLAG_RW, +SYSCTL_INT(_net_inet_icmp, OID_AUTO, icmplim_output, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmplim_output), 0, "Enable logging of ICMP response rate limiting"); #ifdef INET VNET_PCPUSTAT_DEFINE(struct icmpstat, icmpstat); VNET_PCPUSTAT_SYSINIT(icmpstat); SYSCTL_VNET_PCPUSTAT(_net_inet_icmp, ICMPCTL_STATS, stats, struct icmpstat, icmpstat, "ICMP statistics (struct icmpstat, netinet/icmp_var.h)"); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(icmpstat); #endif /* VIMAGE */ static VNET_DEFINE(int, icmpmaskrepl) = 0; #define V_icmpmaskrepl VNET(icmpmaskrepl) -SYSCTL_VNET_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_RW, +SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmpmaskrepl), 0, "Reply to ICMP Address Mask Request packets."); static VNET_DEFINE(u_int, icmpmaskfake) = 0; #define V_icmpmaskfake VNET(icmpmaskfake) -SYSCTL_VNET_UINT(_net_inet_icmp, OID_AUTO, maskfake, CTLFLAG_RW, +SYSCTL_UINT(_net_inet_icmp, OID_AUTO, maskfake, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmpmaskfake), 0, "Fake reply to ICMP Address Mask Request packets."); VNET_DEFINE(int, drop_redirect) = 0; static VNET_DEFINE(int, log_redirect) = 0; #define V_log_redirect VNET(log_redirect) -SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_RW, +SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(log_redirect), 0, "Log ICMP redirects to the console"); static VNET_DEFINE(char, reply_src[IFNAMSIZ]); #define V_reply_src VNET(reply_src) -SYSCTL_VNET_STRING(_net_inet_icmp, OID_AUTO, reply_src, CTLFLAG_RW, +SYSCTL_STRING(_net_inet_icmp, OID_AUTO, reply_src, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(reply_src), IFNAMSIZ, "icmp reply source for non-local packets."); static VNET_DEFINE(int, icmp_rfi) = 0; #define V_icmp_rfi VNET(icmp_rfi) -SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, reply_from_interface, CTLFLAG_RW, +SYSCTL_INT(_net_inet_icmp, OID_AUTO, reply_from_interface, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp_rfi), 0, "ICMP reply from incoming interface for non-local packets"); static VNET_DEFINE(int, icmp_quotelen) = 8; #define V_icmp_quotelen VNET(icmp_quotelen) -SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, quotelen, CTLFLAG_RW, +SYSCTL_INT(_net_inet_icmp, OID_AUTO, quotelen, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp_quotelen), 0, "Number of bytes from original packet to quote in ICMP reply"); /* * ICMP broadcast echo sysctl */ static VNET_DEFINE(int, icmpbmcastecho) = 0; #define V_icmpbmcastecho VNET(icmpbmcastecho) -SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_RW, +SYSCTL_INT(_net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmpbmcastecho), 0, ""); static VNET_DEFINE(int, icmptstamprepl) = 1; #define V_icmptstamprepl VNET(icmptstamprepl) SYSCTL_INT(_net_inet_icmp, OID_AUTO, tstamprepl, CTLFLAG_RW, &VNET_NAME(icmptstamprepl), 0, "Respond to ICMP Timestamp packets"); #ifdef ICMPPRINTFS int icmpprintfs = 0; #endif static void icmp_reflect(struct mbuf *); static void icmp_send(struct mbuf *, struct mbuf *); extern struct protosw inetsw[]; static int sysctl_net_icmp_drop_redir(SYSCTL_HANDLER_ARGS) { int error, new; int i; struct radix_node_head *rnh; new = V_drop_redirect; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { new = (new != 0) ? 1 : 0; if (new == V_drop_redirect) return (0); for (i = 0; i < rt_numfibs; i++) { if ((rnh = rt_tables_get_rnh(i, AF_INET)) == NULL) continue; RADIX_NODE_HEAD_LOCK(rnh); in_setmatchfunc(rnh, new); RADIX_NODE_HEAD_UNLOCK(rnh); } V_drop_redirect = new; } return (error); } -SYSCTL_VNET_PROC(_net_inet_icmp, OID_AUTO, drop_redirect, - CTLTYPE_INT|CTLFLAG_RW, 0, 0, +SYSCTL_PROC(_net_inet_icmp, OID_AUTO, drop_redirect, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, 0, 0, sysctl_net_icmp_drop_redir, "I", "Ignore ICMP redirects"); /* * Kernel module interface for updating icmpstat. The argument is an index * into icmpstat treated as an array of u_long. While this encodes the * general layout of icmpstat into the caller, it doesn't encode its * location, so that future changes to add, for example, per-CPU stats * support won't cause binary compatibility problems for kernel modules. */ void kmod_icmpstat_inc(int statnum) { counter_u64_add(VNET(icmpstat)[statnum], 1); } /* * Generate an error packet of type error * in response to bad packet ip. */ void icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu) { register struct ip *oip = mtod(n, struct ip *), *nip; register unsigned oiphlen = oip->ip_hl << 2; register struct icmp *icp; register struct mbuf *m; unsigned icmplen, icmpelen, nlen; KASSERT((u_int)type <= ICMP_MAXTYPE, ("%s: illegal ICMP type", __func__)); #ifdef ICMPPRINTFS if (icmpprintfs) printf("icmp_error(%p, %x, %d)\n", oip, type, code); #endif if (type != ICMP_REDIRECT) ICMPSTAT_INC(icps_error); /* * Don't send error: * if the original packet was encrypted. * if not the first fragment of message. * in response to a multicast or broadcast packet. * if the old packet protocol was an ICMP error message. */ if (n->m_flags & M_DECRYPTED) goto freeit; if (oip->ip_off & htons(~(IP_MF|IP_DF))) goto freeit; if (n->m_flags & (M_BCAST|M_MCAST)) goto freeit; if (oip->ip_p == IPPROTO_ICMP && type != ICMP_REDIRECT && n->m_len >= oiphlen + ICMP_MINLEN && !ICMP_INFOTYPE(((struct icmp *)((caddr_t)oip + oiphlen))->icmp_type)) { ICMPSTAT_INC(icps_oldicmp); goto freeit; } /* Drop if IP header plus 8 bytes is not contignous in first mbuf. */ if (oiphlen + 8 > n->m_len) goto freeit; /* * Calculate length to quote from original packet and * prevent the ICMP mbuf from overflowing. * Unfortunatly this is non-trivial since ip_forward() * sends us truncated packets. */ nlen = m_length(n, NULL); if (oip->ip_p == IPPROTO_TCP) { struct tcphdr *th; int tcphlen; if (oiphlen + sizeof(struct tcphdr) > n->m_len && n->m_next == NULL) goto stdreply; if (n->m_len < oiphlen + sizeof(struct tcphdr) && ((n = m_pullup(n, oiphlen + sizeof(struct tcphdr))) == NULL)) goto freeit; th = (struct tcphdr *)((caddr_t)oip + oiphlen); tcphlen = th->th_off << 2; if (tcphlen < sizeof(struct tcphdr)) goto freeit; if (ntohs(oip->ip_len) < oiphlen + tcphlen) goto freeit; if (oiphlen + tcphlen > n->m_len && n->m_next == NULL) goto stdreply; if (n->m_len < oiphlen + tcphlen && ((n = m_pullup(n, oiphlen + tcphlen)) == NULL)) goto freeit; icmpelen = max(tcphlen, min(V_icmp_quotelen, ntohs(oip->ip_len) - oiphlen)); } else stdreply: icmpelen = max(8, min(V_icmp_quotelen, ntohs(oip->ip_len) - oiphlen)); icmplen = min(oiphlen + icmpelen, nlen); if (icmplen < sizeof(struct ip)) goto freeit; if (MHLEN > sizeof(struct ip) + ICMP_MINLEN + icmplen) m = m_gethdr(M_NOWAIT, MT_DATA); else m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) goto freeit; #ifdef MAC mac_netinet_icmp_reply(n, m); #endif icmplen = min(icmplen, M_TRAILINGSPACE(m) - sizeof(struct ip) - ICMP_MINLEN); m_align(m, ICMP_MINLEN + icmplen); m->m_len = ICMP_MINLEN + icmplen; /* XXX MRT make the outgoing packet use the same FIB * that was associated with the incoming packet */ M_SETFIB(m, M_GETFIB(n)); icp = mtod(m, struct icmp *); ICMPSTAT_INC(icps_outhist[type]); icp->icmp_type = type; if (type == ICMP_REDIRECT) icp->icmp_gwaddr.s_addr = dest; else { icp->icmp_void = 0; /* * The following assignments assume an overlay with the * just zeroed icmp_void field. */ if (type == ICMP_PARAMPROB) { icp->icmp_pptr = code; code = 0; } else if (type == ICMP_UNREACH && code == ICMP_UNREACH_NEEDFRAG && mtu) { icp->icmp_nextmtu = htons(mtu); } } icp->icmp_code = code; /* * Copy the quotation into ICMP message and * convert quoted IP header back to network representation. */ m_copydata(n, 0, icmplen, (caddr_t)&icp->icmp_ip); nip = &icp->icmp_ip; /* * Set up ICMP message mbuf and copy old IP header (without options * in front of ICMP message. * If the original mbuf was meant to bypass the firewall, the error * reply should bypass as well. */ m->m_flags |= n->m_flags & M_SKIP_FIREWALL; m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); m->m_pkthdr.len = m->m_len; m->m_pkthdr.rcvif = n->m_pkthdr.rcvif; nip = mtod(m, struct ip *); bcopy((caddr_t)oip, (caddr_t)nip, sizeof(struct ip)); nip->ip_len = htons(m->m_len); nip->ip_v = IPVERSION; nip->ip_hl = 5; nip->ip_p = IPPROTO_ICMP; nip->ip_tos = 0; nip->ip_off = 0; icmp_reflect(m); freeit: m_freem(n); } /* * Process a received ICMP message. */ int icmp_input(struct mbuf **mp, int *offp, int proto) { struct icmp *icp; struct in_ifaddr *ia; struct mbuf *m = *mp; struct ip *ip = mtod(m, struct ip *); struct sockaddr_in icmpsrc, icmpdst, icmpgw; int hlen = *offp; int icmplen = ntohs(ip->ip_len) - *offp; int i, code; void (*ctlfunc)(int, struct sockaddr *, void *); int fibnum; *mp = NULL; /* * Locate icmp structure in mbuf, and check * that not corrupted and of at least minimum length. */ #ifdef ICMPPRINTFS if (icmpprintfs) { char buf[4 * sizeof "123"]; strcpy(buf, inet_ntoa(ip->ip_src)); printf("icmp_input from %s to %s, len %d\n", buf, inet_ntoa(ip->ip_dst), icmplen); } #endif if (icmplen < ICMP_MINLEN) { ICMPSTAT_INC(icps_tooshort); goto freeit; } i = hlen + min(icmplen, ICMP_ADVLENMIN); if (m->m_len < i && (m = m_pullup(m, i)) == NULL) { ICMPSTAT_INC(icps_tooshort); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); m->m_len -= hlen; m->m_data += hlen; icp = mtod(m, struct icmp *); if (in_cksum(m, icmplen)) { ICMPSTAT_INC(icps_checksum); goto freeit; } m->m_len += hlen; m->m_data -= hlen; if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type == IFT_FAITH) { /* * Deliver very specific ICMP type only. */ switch (icp->icmp_type) { case ICMP_UNREACH: case ICMP_TIMXCEED: break; default: goto freeit; } } #ifdef ICMPPRINTFS if (icmpprintfs) printf("icmp_input, type %d code %d\n", icp->icmp_type, icp->icmp_code); #endif /* * Message type specific processing. */ if (icp->icmp_type > ICMP_MAXTYPE) goto raw; /* Initialize */ bzero(&icmpsrc, sizeof(icmpsrc)); icmpsrc.sin_len = sizeof(struct sockaddr_in); icmpsrc.sin_family = AF_INET; bzero(&icmpdst, sizeof(icmpdst)); icmpdst.sin_len = sizeof(struct sockaddr_in); icmpdst.sin_family = AF_INET; bzero(&icmpgw, sizeof(icmpgw)); icmpgw.sin_len = sizeof(struct sockaddr_in); icmpgw.sin_family = AF_INET; ICMPSTAT_INC(icps_inhist[icp->icmp_type]); code = icp->icmp_code; switch (icp->icmp_type) { case ICMP_UNREACH: switch (code) { case ICMP_UNREACH_NET: case ICMP_UNREACH_HOST: case ICMP_UNREACH_SRCFAIL: case ICMP_UNREACH_NET_UNKNOWN: case ICMP_UNREACH_HOST_UNKNOWN: case ICMP_UNREACH_ISOLATED: case ICMP_UNREACH_TOSNET: case ICMP_UNREACH_TOSHOST: case ICMP_UNREACH_HOST_PRECEDENCE: case ICMP_UNREACH_PRECEDENCE_CUTOFF: code = PRC_UNREACH_NET; break; case ICMP_UNREACH_NEEDFRAG: code = PRC_MSGSIZE; break; /* * RFC 1122, Sections 3.2.2.1 and 4.2.3.9. * Treat subcodes 2,3 as immediate RST */ case ICMP_UNREACH_PROTOCOL: case ICMP_UNREACH_PORT: code = PRC_UNREACH_PORT; break; case ICMP_UNREACH_NET_PROHIB: case ICMP_UNREACH_HOST_PROHIB: case ICMP_UNREACH_FILTER_PROHIB: code = PRC_UNREACH_ADMIN_PROHIB; break; default: goto badcode; } goto deliver; case ICMP_TIMXCEED: if (code > 1) goto badcode; code += PRC_TIMXCEED_INTRANS; goto deliver; case ICMP_PARAMPROB: if (code > 1) goto badcode; code = PRC_PARAMPROB; goto deliver; case ICMP_SOURCEQUENCH: if (code) goto badcode; code = PRC_QUENCH; deliver: /* * Problem with datagram; advise higher level routines. */ if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) { ICMPSTAT_INC(icps_badlen); goto freeit; } /* Discard ICMP's in response to multicast packets */ if (IN_MULTICAST(ntohl(icp->icmp_ip.ip_dst.s_addr))) goto badcode; #ifdef ICMPPRINTFS if (icmpprintfs) printf("deliver to protocol %d\n", icp->icmp_ip.ip_p); #endif icmpsrc.sin_addr = icp->icmp_ip.ip_dst; /* * XXX if the packet contains [IPv4 AH TCP], we can't make a * notification to TCP layer. */ ctlfunc = inetsw[ip_protox[icp->icmp_ip.ip_p]].pr_ctlinput; if (ctlfunc) (*ctlfunc)(code, (struct sockaddr *)&icmpsrc, (void *)&icp->icmp_ip); break; badcode: ICMPSTAT_INC(icps_badcode); break; case ICMP_ECHO: if (!V_icmpbmcastecho && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { ICMPSTAT_INC(icps_bmcastecho); break; } icp->icmp_type = ICMP_ECHOREPLY; if (badport_bandlim(BANDLIM_ICMP_ECHO) < 0) goto freeit; else goto reflect; case ICMP_TSTAMP: if (V_icmptstamprepl == 0) break; if (!V_icmpbmcastecho && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { ICMPSTAT_INC(icps_bmcasttstamp); break; } if (icmplen < ICMP_TSLEN) { ICMPSTAT_INC(icps_badlen); break; } icp->icmp_type = ICMP_TSTAMPREPLY; icp->icmp_rtime = iptime(); icp->icmp_ttime = icp->icmp_rtime; /* bogus, do later! */ if (badport_bandlim(BANDLIM_ICMP_TSTAMP) < 0) goto freeit; else goto reflect; case ICMP_MASKREQ: if (V_icmpmaskrepl == 0) break; /* * We are not able to respond with all ones broadcast * unless we receive it over a point-to-point interface. */ if (icmplen < ICMP_MASKLEN) break; switch (ip->ip_dst.s_addr) { case INADDR_BROADCAST: case INADDR_ANY: icmpdst.sin_addr = ip->ip_src; break; default: icmpdst.sin_addr = ip->ip_dst; } ia = (struct in_ifaddr *)ifaof_ifpforaddr( (struct sockaddr *)&icmpdst, m->m_pkthdr.rcvif); if (ia == NULL) break; if (ia->ia_ifp == NULL) { ifa_free(&ia->ia_ifa); break; } icp->icmp_type = ICMP_MASKREPLY; if (V_icmpmaskfake == 0) icp->icmp_mask = ia->ia_sockmask.sin_addr.s_addr; else icp->icmp_mask = V_icmpmaskfake; if (ip->ip_src.s_addr == 0) { if (ia->ia_ifp->if_flags & IFF_BROADCAST) ip->ip_src = satosin(&ia->ia_broadaddr)->sin_addr; else if (ia->ia_ifp->if_flags & IFF_POINTOPOINT) ip->ip_src = satosin(&ia->ia_dstaddr)->sin_addr; } ifa_free(&ia->ia_ifa); reflect: ICMPSTAT_INC(icps_reflect); ICMPSTAT_INC(icps_outhist[icp->icmp_type]); icmp_reflect(m); return (IPPROTO_DONE); case ICMP_REDIRECT: if (V_log_redirect) { u_long src, dst, gw; src = ntohl(ip->ip_src.s_addr); dst = ntohl(icp->icmp_ip.ip_dst.s_addr); gw = ntohl(icp->icmp_gwaddr.s_addr); printf("icmp redirect from %d.%d.%d.%d: " "%d.%d.%d.%d => %d.%d.%d.%d\n", (int)(src >> 24), (int)((src >> 16) & 0xff), (int)((src >> 8) & 0xff), (int)(src & 0xff), (int)(dst >> 24), (int)((dst >> 16) & 0xff), (int)((dst >> 8) & 0xff), (int)(dst & 0xff), (int)(gw >> 24), (int)((gw >> 16) & 0xff), (int)((gw >> 8) & 0xff), (int)(gw & 0xff)); } /* * RFC1812 says we must ignore ICMP redirects if we * are acting as router. */ if (V_drop_redirect || V_ipforwarding) break; if (code > 3) goto badcode; if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) { ICMPSTAT_INC(icps_badlen); break; } /* * Short circuit routing redirects to force * immediate change in the kernel's routing * tables. The message is also handed to anyone * listening on a raw socket (e.g. the routing * daemon for use in updating its tables). */ icmpgw.sin_addr = ip->ip_src; icmpdst.sin_addr = icp->icmp_gwaddr; #ifdef ICMPPRINTFS if (icmpprintfs) { char buf[4 * sizeof "123"]; strcpy(buf, inet_ntoa(icp->icmp_ip.ip_dst)); printf("redirect dst %s to %s\n", buf, inet_ntoa(icp->icmp_gwaddr)); } #endif icmpsrc.sin_addr = icp->icmp_ip.ip_dst; for ( fibnum = 0; fibnum < rt_numfibs; fibnum++) { in_rtredirect((struct sockaddr *)&icmpsrc, (struct sockaddr *)&icmpdst, (struct sockaddr *)0, RTF_GATEWAY | RTF_HOST, (struct sockaddr *)&icmpgw, fibnum); } pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&icmpsrc); #ifdef IPSEC key_sa_routechange((struct sockaddr *)&icmpsrc); #endif break; /* * No kernel processing for the following; * just fall through to send to raw listener. */ case ICMP_ECHOREPLY: case ICMP_ROUTERADVERT: case ICMP_ROUTERSOLICIT: case ICMP_TSTAMPREPLY: case ICMP_IREQREPLY: case ICMP_MASKREPLY: default: break; } raw: *mp = m; rip_input(mp, offp, proto); return (IPPROTO_DONE); freeit: m_freem(m); return (IPPROTO_DONE); } /* * Reflect the ip packet back to the source */ static void icmp_reflect(struct mbuf *m) { struct ip *ip = mtod(m, struct ip *); struct ifaddr *ifa; struct ifnet *ifp; struct in_ifaddr *ia; struct in_addr t; struct mbuf *opts = 0; int optlen = (ip->ip_hl << 2) - sizeof(struct ip); if (IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || IN_EXPERIMENTAL(ntohl(ip->ip_src.s_addr)) || IN_ZERONET(ntohl(ip->ip_src.s_addr)) ) { m_freem(m); /* Bad return address */ ICMPSTAT_INC(icps_badaddr); goto done; /* Ip_output() will check for broadcast */ } t = ip->ip_dst; ip->ip_dst = ip->ip_src; /* * Source selection for ICMP replies: * * If the incoming packet was addressed directly to one of our * own addresses, use dst as the src for the reply. */ IN_IFADDR_RLOCK(); LIST_FOREACH(ia, INADDR_HASH(t.s_addr), ia_hash) { if (t.s_addr == IA_SIN(ia)->sin_addr.s_addr) { t = IA_SIN(ia)->sin_addr; IN_IFADDR_RUNLOCK(); goto match; } } IN_IFADDR_RUNLOCK(); /* * If the incoming packet was addressed to one of our broadcast * addresses, use the first non-broadcast address which corresponds * to the incoming interface. */ ifp = m->m_pkthdr.rcvif; if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == t.s_addr) { t = IA_SIN(ia)->sin_addr; IF_ADDR_RUNLOCK(ifp); goto match; } } IF_ADDR_RUNLOCK(ifp); } /* * If the packet was transiting through us, use the address of * the interface the packet came through in. If that interface * doesn't have a suitable IP address, the normal selection * criteria apply. */ if (V_icmp_rfi && ifp != NULL) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); t = IA_SIN(ia)->sin_addr; IF_ADDR_RUNLOCK(ifp); goto match; } IF_ADDR_RUNLOCK(ifp); } /* * If the incoming packet was not addressed directly to us, use * designated interface for icmp replies specified by sysctl * net.inet.icmp.reply_src (default not set). Otherwise continue * with normal source selection. */ if (V_reply_src[0] != '\0' && (ifp = ifunit(V_reply_src))) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); t = IA_SIN(ia)->sin_addr; IF_ADDR_RUNLOCK(ifp); goto match; } IF_ADDR_RUNLOCK(ifp); } /* * If the packet was transiting through us, use the address of * the interface that is the closest to the packet source. * When we don't have a route back to the packet source, stop here * and drop the packet. */ ia = ip_rtaddr(ip->ip_dst, M_GETFIB(m)); if (ia == NULL) { m_freem(m); ICMPSTAT_INC(icps_noroute); goto done; } t = IA_SIN(ia)->sin_addr; ifa_free(&ia->ia_ifa); match: #ifdef MAC mac_netinet_icmp_replyinplace(m); #endif ip->ip_src = t; ip->ip_ttl = V_ip_defttl; if (optlen > 0) { register u_char *cp; int opt, cnt; u_int len; /* * Retrieve any source routing from the incoming packet; * add on any record-route or timestamp options. */ cp = (u_char *) (ip + 1); if ((opts = ip_srcroute(m)) == 0 && (opts = m_gethdr(M_NOWAIT, MT_DATA))) { opts->m_len = sizeof(struct in_addr); mtod(opts, struct in_addr *)->s_addr = 0; } if (opts) { #ifdef ICMPPRINTFS if (icmpprintfs) printf("icmp_reflect optlen %d rt %d => ", optlen, opts->m_len); #endif for (cnt = optlen; cnt > 0; cnt -= len, cp += len) { opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) len = 1; else { if (cnt < IPOPT_OLEN + sizeof(*cp)) break; len = cp[IPOPT_OLEN]; if (len < IPOPT_OLEN + sizeof(*cp) || len > cnt) break; } /* * Should check for overflow, but it "can't happen" */ if (opt == IPOPT_RR || opt == IPOPT_TS || opt == IPOPT_SECURITY) { bcopy((caddr_t)cp, mtod(opts, caddr_t) + opts->m_len, len); opts->m_len += len; } } /* Terminate & pad, if necessary */ cnt = opts->m_len % 4; if (cnt) { for (; cnt < 4; cnt++) { *(mtod(opts, caddr_t) + opts->m_len) = IPOPT_EOL; opts->m_len++; } } #ifdef ICMPPRINTFS if (icmpprintfs) printf("%d\n", opts->m_len); #endif } ip_stripoptions(m); } m_tag_delete_nonpersistent(m); m->m_flags &= ~(M_BCAST|M_MCAST); icmp_send(m, opts); done: if (opts) (void)m_free(opts); } /* * Send an icmp packet back to the ip level, * after supplying a checksum. */ static void icmp_send(struct mbuf *m, struct mbuf *opts) { register struct ip *ip = mtod(m, struct ip *); register int hlen; register struct icmp *icp; hlen = ip->ip_hl << 2; m->m_data += hlen; m->m_len -= hlen; icp = mtod(m, struct icmp *); icp->icmp_cksum = 0; icp->icmp_cksum = in_cksum(m, ntohs(ip->ip_len) - hlen); m->m_data -= hlen; m->m_len += hlen; m->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef ICMPPRINTFS if (icmpprintfs) { char buf[4 * sizeof "123"]; strcpy(buf, inet_ntoa(ip->ip_dst)); printf("icmp_send dst %s src %s\n", buf, inet_ntoa(ip->ip_src)); } #endif (void) ip_output(m, opts, NULL, 0, NULL, NULL); } /* * Return milliseconds since 00:00 GMT in network format. */ uint32_t iptime(void) { struct timeval atv; u_long t; getmicrotime(&atv); t = (atv.tv_sec % (24*60*60)) * 1000 + atv.tv_usec / 1000; return (htonl(t)); } /* * Return the next larger or smaller MTU plateau (table from RFC 1191) * given current value MTU. If DIR is less than zero, a larger plateau * is returned; otherwise, a smaller value is returned. */ int ip_next_mtu(int mtu, int dir) { static int mtutab[] = { 65535, 32000, 17914, 8166, 4352, 2002, 1492, 1280, 1006, 508, 296, 68, 0 }; int i, size; size = (sizeof mtutab) / (sizeof mtutab[0]); if (dir >= 0) { for (i = 0; i < size; i++) if (mtu > mtutab[i]) return mtutab[i]; } else { for (i = size - 1; i >= 0; i--) if (mtu < mtutab[i]) return mtutab[i]; if (mtu == mtutab[0]) return mtutab[0]; } return 0; } #endif /* INET */ /* * badport_bandlim() - check for ICMP bandwidth limit * * Return 0 if it is ok to send an ICMP error response, -1 if we have * hit our bandwidth limit and it is not ok. * * If icmplim is <= 0, the feature is disabled and 0 is returned. * * For now we separate the TCP and UDP subsystems w/ different 'which' * values. We may eventually remove this separation (and simplify the * code further). * * Note that the printing of the error message is delayed so we can * properly print the icmp error rate that the system was trying to do * (i.e. 22000/100 pps, etc...). This can cause long delays in printing * the 'final' error, but it doesn't make sense to solve the printing * delay with more complex code. */ int badport_bandlim(int which) { #define N(a) (sizeof (a) / sizeof (a[0])) static struct rate { const char *type; struct timeval lasttime; int curpps; } rates[BANDLIM_MAX+1] = { { "icmp unreach response" }, { "icmp ping response" }, { "icmp tstamp response" }, { "closed port RST response" }, { "open port RST response" }, { "icmp6 unreach response" }, { "sctp ootb response" } }; /* * Return ok status if feature disabled or argument out of range. */ if (V_icmplim > 0 && (u_int) which < N(rates)) { struct rate *r = &rates[which]; int opps = r->curpps; if (!ppsratecheck(&r->lasttime, &r->curpps, V_icmplim)) return -1; /* discard packet */ /* * If we've dropped below the threshold after having * rate-limited traffic print the message. This preserves * the previous behaviour at the expense of added complexity. */ if (V_icmplim_output && opps > V_icmplim) log(LOG_NOTICE, "Limiting %s from %d to %d packets/sec\n", r->type, opps, V_icmplim); } return 0; /* okay to send packet */ #undef N } Index: user/dchagin/lemul/sys/netinet/ip_input.c =================================================================== --- user/dchagin/lemul/sys/netinet/ip_input.c (revision 274240) +++ user/dchagin/lemul/sys/netinet/ip_input.c (revision 274241) @@ -1,1916 +1,1916 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_bootp.h" #include "opt_ipfw.h" #include "opt_ipstealth.h" #include "opt_ipsec.h" #include "opt_route.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #endif /* IPSEC */ #include #include #include #ifdef CTASSERT CTASSERT(sizeof(struct ip) == 20); #endif struct rwlock in_ifaddr_lock; RW_SYSINIT(in_ifaddr_lock, &in_ifaddr_lock, "in_ifaddr_lock"); VNET_DEFINE(int, rsvp_on); VNET_DEFINE(int, ipforwarding); -SYSCTL_VNET_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipforwarding), 0, "Enable IP forwarding between interfaces"); static VNET_DEFINE(int, ipsendredirects) = 1; /* XXX */ #define V_ipsendredirects VNET(ipsendredirects) -SYSCTL_VNET_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsendredirects), 0, "Enable sending IP redirects"); static VNET_DEFINE(int, ip_keepfaith); #define V_ip_keepfaith VNET(ip_keepfaith) -SYSCTL_VNET_INT(_net_inet_ip, IPCTL_KEEPFAITH, keepfaith, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_KEEPFAITH, keepfaith, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_keepfaith), 0, "Enable packet capture for FAITH IPv4->IPv6 translater daemon"); static VNET_DEFINE(int, ip_sendsourcequench); #define V_ip_sendsourcequench VNET(ip_sendsourcequench) -SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, sendsourcequench, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, sendsourcequench, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_sendsourcequench), 0, "Enable the transmission of source quench packets"); VNET_DEFINE(int, ip_do_randomid); -SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, random_id, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_do_randomid), 0, "Assign random ip_id values"); /* * XXX - Setting ip_checkinterface mostly implements the receive side of * the Strong ES model described in RFC 1122, but since the routing table * and transmit implementation do not implement the Strong ES model, * setting this to 1 results in an odd hybrid. * * XXX - ip_checkinterface currently must be disabled if you use ipnat * to translate the destination address to another local interface. * * XXX - ip_checkinterface must be disabled if you add IP aliases * to the loopback interface instead of the interface where the * packets for those addresses are received. */ static VNET_DEFINE(int, ip_checkinterface); #define V_ip_checkinterface VNET(ip_checkinterface) -SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_checkinterface), 0, "Verify packet arrives on correct interface"); VNET_DEFINE(struct pfil_head, inet_pfil_hook); /* Packet filter hooks */ static struct netisr_handler ip_nh = { .nh_name = "ip", .nh_handler = ip_input, .nh_proto = NETISR_IP, #ifdef RSS .nh_m2cpuid = rss_soft_m2cpuid, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, #else .nh_policy = NETISR_POLICY_FLOW, #endif }; #ifdef RSS /* * Directly dispatched frames are currently assumed * to have a flowid already calculated. * * It should likely have something that assert it * actually has valid flow details. */ static struct netisr_handler ip_direct_nh = { .nh_name = "ip_direct", .nh_handler = ip_direct_input, .nh_proto = NETISR_IP_DIRECT, .nh_m2cpuid = rss_m2cpuid, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, }; #endif extern struct domain inetdomain; extern struct protosw inetsw[]; u_char ip_protox[IPPROTO_MAX]; VNET_DEFINE(struct in_ifaddrhead, in_ifaddrhead); /* first inet address */ VNET_DEFINE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); /* inet addr hash table */ VNET_DEFINE(u_long, in_ifaddrhmask); /* mask for hash table */ static VNET_DEFINE(uma_zone_t, ipq_zone); static VNET_DEFINE(TAILQ_HEAD(ipqhead, ipq), ipq[IPREASS_NHASH]); static struct mtx ipqlock; #define V_ipq_zone VNET(ipq_zone) #define V_ipq VNET(ipq) #define IPQ_LOCK() mtx_lock(&ipqlock) #define IPQ_UNLOCK() mtx_unlock(&ipqlock) #define IPQ_LOCK_INIT() mtx_init(&ipqlock, "ipqlock", NULL, MTX_DEF) #define IPQ_LOCK_ASSERT() mtx_assert(&ipqlock, MA_OWNED) static void maxnipq_update(void); static void ipq_zone_change(void *); static void ip_drain_locked(void); static VNET_DEFINE(int, maxnipq); /* Administrative limit on # reass queues. */ static VNET_DEFINE(int, nipq); /* Total # of reass queues */ #define V_maxnipq VNET(maxnipq) #define V_nipq VNET(nipq) -SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_RD, +SYSCTL_INT(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(nipq), 0, "Current number of IPv4 fragment reassembly queue entries"); static VNET_DEFINE(int, maxfragsperpacket); #define V_maxfragsperpacket VNET(maxfragsperpacket) -SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(maxfragsperpacket), 0, "Maximum number of IPv4 fragments allowed per packet"); #ifdef IPCTL_DEFMTU SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW, &ip_mtu, 0, "Default MTU"); #endif #ifdef IPSTEALTH VNET_DEFINE(int, ipstealth); -SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipstealth), 0, "IP stealth mode, no TTL decrementation on forwarding"); #endif static void ip_freef(struct ipqhead *, struct ipq *); /* * IP statistics are stored in the "array" of counter(9)s. */ VNET_PCPUSTAT_DEFINE(struct ipstat, ipstat); VNET_PCPUSTAT_SYSINIT(ipstat); SYSCTL_VNET_PCPUSTAT(_net_inet_ip, IPCTL_STATS, stats, struct ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)"); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(ipstat); #endif /* VIMAGE */ /* * Kernel module interface for updating ipstat. The argument is an index * into ipstat treated as an array. */ void kmod_ipstat_inc(int statnum) { counter_u64_add(VNET(ipstat)[statnum], 1); } void kmod_ipstat_dec(int statnum) { counter_u64_add(VNET(ipstat)[statnum], -1); } static int sysctl_netinet_intr_queue_maxlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&ip_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&ip_nh, qlimit)); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_queue_maxlen, "I", "Maximum size of the IP input queue"); static int sysctl_netinet_intr_queue_drops(SYSCTL_HANDLER_ARGS) { u_int64_t qdrops_long; int error, qdrops; netisr_getqdrops(&ip_nh, &qdrops_long); qdrops = qdrops_long; error = sysctl_handle_int(oidp, &qdrops, 0, req); if (error || !req->newptr) return (error); if (qdrops != 0) return (EINVAL); netisr_clearqdrops(&ip_nh); return (0); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_queue_drops, "I", "Number of packets dropped from the IP input queue"); #ifdef RSS static int sysctl_netinet_intr_direct_queue_maxlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&ip_direct_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&ip_direct_nh, qlimit)); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_direct_queue_maxlen, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_direct_queue_maxlen, "I", "Maximum size of the IP direct input queue"); static int sysctl_netinet_intr_direct_queue_drops(SYSCTL_HANDLER_ARGS) { u_int64_t qdrops_long; int error, qdrops; netisr_getqdrops(&ip_direct_nh, &qdrops_long); qdrops = qdrops_long; error = sysctl_handle_int(oidp, &qdrops, 0, req); if (error || !req->newptr) return (error); if (qdrops != 0) return (EINVAL); netisr_clearqdrops(&ip_direct_nh); return (0); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_direct_queue_drops, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_direct_queue_drops, "I", "Number of packets dropped from the IP direct input queue"); #endif /* RSS */ /* * IP initialization: fill in IP protocol switch table. * All protocols not implemented in kernel go to raw IP protocol handler. */ void ip_init(void) { struct protosw *pr; int i; V_ip_id = time_second & 0xffff; TAILQ_INIT(&V_in_ifaddrhead); V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask); /* Initialize IP reassembly queue. */ for (i = 0; i < IPREASS_NHASH; i++) TAILQ_INIT(&V_ipq[i]); V_maxnipq = nmbclusters / 32; V_maxfragsperpacket = 16; V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); maxnipq_update(); /* Initialize packet filter hooks. */ V_inet_pfil_hook.ph_type = PFIL_TYPE_AF; V_inet_pfil_hook.ph_af = AF_INET; if ((i = pfil_head_register(&V_inet_pfil_hook)) != 0) printf("%s: WARNING: unable to register pfil hook, " "error %d\n", __func__, i); /* Skip initialization of globals for non-default instances. */ if (!IS_DEFAULT_VNET(curvnet)) return; pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) panic("ip_init: PF_INET not found"); /* Initialize the entire ip_protox[] array to IPPROTO_RAW. */ for (i = 0; i < IPPROTO_MAX; i++) ip_protox[i] = pr - inetsw; /* * Cycle through IP protocols and put them into the appropriate place * in ip_protox[]. */ for (pr = inetdomain.dom_protosw; pr < inetdomain.dom_protoswNPROTOSW; pr++) if (pr->pr_domain->dom_family == PF_INET && pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) { /* Be careful to only index valid IP protocols. */ if (pr->pr_protocol < IPPROTO_MAX) ip_protox[pr->pr_protocol] = pr - inetsw; } EVENTHANDLER_REGISTER(nmbclusters_change, ipq_zone_change, NULL, EVENTHANDLER_PRI_ANY); /* Initialize various other remaining things. */ IPQ_LOCK_INIT(); netisr_register(&ip_nh); #ifdef RSS netisr_register(&ip_direct_nh); #endif } #ifdef VIMAGE void ip_destroy(void) { int i; if ((i = pfil_head_unregister(&V_inet_pfil_hook)) != 0) printf("%s: WARNING: unable to unregister pfil hook, " "error %d\n", __func__, i); /* Cleanup in_ifaddr hash table; should be empty. */ hashdestroy(V_in_ifaddrhashtbl, M_IFADDR, V_in_ifaddrhmask); IPQ_LOCK(); ip_drain_locked(); IPQ_UNLOCK(); uma_zdestroy(V_ipq_zone); } #endif #ifdef RSS /* * IP direct input routine. * * This is called when reinjecting completed fragments where * all of the previous checking and book-keeping has been done. */ void ip_direct_input(struct mbuf *m) { struct ip *ip; int hlen; ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; IPSTAT_INC(ips_delivered); (*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p); return; } #endif /* * Ip input routine. Checksum and byte swap header. If fragmented * try to reassemble. Process options. Pass to next level. */ void ip_input(struct mbuf *m) { struct ip *ip = NULL; struct in_ifaddr *ia = NULL; struct ifaddr *ifa; struct ifnet *ifp; int checkif, hlen = 0; uint16_t sum, ip_len; int dchg = 0; /* dest changed after fw */ struct in_addr odst; /* original dst address */ M_ASSERTPKTHDR(m); if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; /* Set up some basics that will be used later. */ ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; ip_len = ntohs(ip->ip_len); goto ours; } IPSTAT_INC(ips_total); if (m->m_pkthdr.len < sizeof(struct ip)) goto tooshort; if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == NULL) { IPSTAT_INC(ips_toosmall); return; } ip = mtod(m, struct ip *); if (ip->ip_v != IPVERSION) { IPSTAT_INC(ips_badvers); goto bad; } hlen = ip->ip_hl << 2; if (hlen < sizeof(struct ip)) { /* minimum header length */ IPSTAT_INC(ips_badhlen); goto bad; } if (hlen > m->m_len) { if ((m = m_pullup(m, hlen)) == NULL) { IPSTAT_INC(ips_badhlen); return; } ip = mtod(m, struct ip *); } IP_PROBE(receive, NULL, NULL, ip, m->m_pkthdr.rcvif, ip, NULL); /* 127/8 must not appear on wire - RFC1122 */ ifp = m->m_pkthdr.rcvif; if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { if ((ifp->if_flags & IFF_LOOPBACK) == 0) { IPSTAT_INC(ips_badaddr); goto bad; } } if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); } else { if (hlen == sizeof(struct ip)) { sum = in_cksum_hdr(ip); } else { sum = in_cksum(m, hlen); } } if (sum) { IPSTAT_INC(ips_badsum); goto bad; } #ifdef ALTQ if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0) /* packet is dropped by traffic conditioner */ return; #endif ip_len = ntohs(ip->ip_len); if (ip_len < hlen) { IPSTAT_INC(ips_badlen); goto bad; } /* * Check that the amount of data in the buffers * is as at least much as the IP header would have us expect. * Trim mbufs if longer than we expect. * Drop packet if shorter than we expect. */ if (m->m_pkthdr.len < ip_len) { tooshort: IPSTAT_INC(ips_tooshort); goto bad; } if (m->m_pkthdr.len > ip_len) { if (m->m_len == m->m_pkthdr.len) { m->m_len = ip_len; m->m_pkthdr.len = ip_len; } else m_adj(m, ip_len - m->m_pkthdr.len); } #ifdef IPSEC /* * Bypass packet filtering for packets previously handled by IPsec. */ if (ip_ipsec_filtertunnel(m)) goto passin; #endif /* IPSEC */ /* * Run through list of hooks for input packets. * * NB: Beware of the destination address changing (e.g. * by NAT rewriting). When this happens, tell * ip_forward to do the right thing. */ /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED(&V_inet_pfil_hook)) goto passin; odst = ip->ip_dst; if (pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_IN, NULL) != 0) return; if (m == NULL) /* consumed by filter */ return; ip = mtod(m, struct ip *); dchg = (odst.s_addr != ip->ip_dst.s_addr); ifp = m->m_pkthdr.rcvif; if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; goto ours; } if (m->m_flags & M_IP_NEXTHOP) { dchg = (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL); if (dchg != 0) { /* * Directly ship the packet on. This allows * forwarding packets originally destined to us * to some other directly connected host. */ ip_forward(m, 1); return; } } passin: /* * Process options and, if not destined for us, * ship it on. ip_dooptions returns 1 when an * error was detected (causing an icmp message * to be sent and the original packet to be freed). */ if (hlen > sizeof (struct ip) && ip_dooptions(m, 0)) return; /* greedy RSVP, snatches any PATH packet of the RSVP protocol and no * matter if it is destined to another node, or whether it is * a multicast one, RSVP wants it! and prevents it from being forwarded * anywhere else. Also checks if the rsvp daemon is running before * grabbing the packet. */ if (V_rsvp_on && ip->ip_p==IPPROTO_RSVP) goto ours; /* * Check our list of addresses, to see if the packet is for us. * If we don't have any addresses, assume any unicast packet * we receive might be for us (and let the upper layers deal * with it). */ if (TAILQ_EMPTY(&V_in_ifaddrhead) && (m->m_flags & (M_MCAST|M_BCAST)) == 0) goto ours; /* * Enable a consistency check between the destination address * and the arrival interface for a unicast packet (the RFC 1122 * strong ES model) if IP forwarding is disabled and the packet * is not locally generated and the packet is not subject to * 'ipfw fwd'. * * XXX - Checking also should be disabled if the destination * address is ipnat'ed to a different interface. * * XXX - Checking is incompatible with IP aliases added * to the loopback interface instead of the interface where * the packets are received. * * XXX - This is the case for carp vhost IPs as well so we * insert a workaround. If the packet got here, we already * checked with carp_iamatch() and carp_forus(). */ checkif = V_ip_checkinterface && (V_ipforwarding == 0) && ifp != NULL && ((ifp->if_flags & IFF_LOOPBACK) == 0) && ifp->if_carp == NULL && (dchg == 0); /* * Check for exact addresses in the hash bucket. */ /* IN_IFADDR_RLOCK(); */ LIST_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) { /* * If the address matches, verify that the packet * arrived via the correct interface if checking is * enabled. */ if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr && (!checkif || ia->ia_ifp == ifp)) { counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); /* IN_IFADDR_RUNLOCK(); */ goto ours; } } /* IN_IFADDR_RUNLOCK(); */ /* * Check for broadcast addresses. * * Only accept broadcast packets that arrive via the matching * interface. Reception of forwarded directed broadcasts would * be handled via ip_forward() and ether_output() with the loopback * into the stack for SIMPLEX interfaces handled by ether_output(). */ if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == ip->ip_dst.s_addr) { counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); IF_ADDR_RUNLOCK(ifp); goto ours; } #ifdef BOOTP_COMPAT if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) { counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); IF_ADDR_RUNLOCK(ifp); goto ours; } #endif } IF_ADDR_RUNLOCK(ifp); ia = NULL; } /* RFC 3927 2.7: Do not forward datagrams for 169.254.0.0/16. */ if (IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) { IPSTAT_INC(ips_cantforward); m_freem(m); return; } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { if (V_ip_mrouter) { /* * If we are acting as a multicast router, all * incoming multicast packets are passed to the * kernel-level multicast forwarding function. * The packet is returned (relatively) intact; if * ip_mforward() returns a non-zero value, the packet * must be discarded, else it may be accepted below. */ if (ip_mforward && ip_mforward(ip, ifp, m, 0) != 0) { IPSTAT_INC(ips_cantforward); m_freem(m); return; } /* * The process-level routing daemon needs to receive * all multicast IGMP packets, whether or not this * host belongs to their destination groups. */ if (ip->ip_p == IPPROTO_IGMP) goto ours; IPSTAT_INC(ips_forward); } /* * Assume the packet is for us, to avoid prematurely taking * a lock on the in_multi hash. Protocols must perform * their own filtering and update statistics accordingly. */ goto ours; } if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST) goto ours; if (ip->ip_dst.s_addr == INADDR_ANY) goto ours; /* * FAITH(Firewall Aided Internet Translator) */ if (ifp && ifp->if_type == IFT_FAITH) { if (V_ip_keepfaith) { if (ip->ip_p == IPPROTO_TCP || ip->ip_p == IPPROTO_ICMP) goto ours; } m_freem(m); return; } /* * Not for us; forward if possible and desirable. */ if (V_ipforwarding == 0) { IPSTAT_INC(ips_cantforward); m_freem(m); } else { #ifdef IPSEC if (ip_ipsec_fwd(m)) goto bad; #endif /* IPSEC */ ip_forward(m, dchg); } return; ours: #ifdef IPSTEALTH /* * IPSTEALTH: Process non-routing options only * if the packet is destined for us. */ if (V_ipstealth && hlen > sizeof (struct ip) && ip_dooptions(m, 1)) return; #endif /* IPSTEALTH */ /* * Attempt reassembly; if it succeeds, proceed. * ip_reass() will return a different mbuf. */ if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) { /* XXXGL: shouldn't we save & set m_flags? */ m = ip_reass(m); if (m == NULL) return; ip = mtod(m, struct ip *); /* Get the header length of the reassembled packet */ hlen = ip->ip_hl << 2; } #ifdef IPSEC /* * enforce IPsec policy checking if we are seeing last header. * note that we do not visit this with protocols with pcb layer * code - like udp/tcp/raw ip. */ if (ip_ipsec_input(m)) goto bad; #endif /* IPSEC */ /* * Switch out to protocol's input routine. */ IPSTAT_INC(ips_delivered); (*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p); return; bad: m_freem(m); } /* * After maxnipq has been updated, propagate the change to UMA. The UMA zone * max has slightly different semantics than the sysctl, for historical * reasons. */ static void maxnipq_update(void) { /* * -1 for unlimited allocation. */ if (V_maxnipq < 0) uma_zone_set_max(V_ipq_zone, 0); /* * Positive number for specific bound. */ if (V_maxnipq > 0) uma_zone_set_max(V_ipq_zone, V_maxnipq); /* * Zero specifies no further fragment queue allocation -- set the * bound very low, but rely on implementation elsewhere to actually * prevent allocation and reclaim current queues. */ if (V_maxnipq == 0) uma_zone_set_max(V_ipq_zone, 1); } static void ipq_zone_change(void *tag) { if (V_maxnipq > 0 && V_maxnipq < (nmbclusters / 32)) { V_maxnipq = nmbclusters / 32; maxnipq_update(); } } static int sysctl_maxnipq(SYSCTL_HANDLER_ARGS) { int error, i; i = V_maxnipq; error = sysctl_handle_int(oidp, &i, 0, req); if (error || !req->newptr) return (error); /* * XXXRW: Might be a good idea to sanity check the argument and place * an extreme upper bound. */ if (i < -1) return (EINVAL); V_maxnipq = i; maxnipq_update(); return (0); } SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets, CTLTYPE_INT|CTLFLAG_RW, NULL, 0, sysctl_maxnipq, "I", "Maximum number of IPv4 fragment reassembly queue entries"); #define M_IP_FRAG M_PROTO9 /* * Take incoming datagram fragment and try to reassemble it into * whole datagram. If the argument is the first fragment or one * in between the function will return NULL and store the mbuf * in the fragment chain. If the argument is the last fragment * the packet will be reassembled and the pointer to the new * mbuf returned for further processing. Only m_tags attached * to the first packet/fragment are preserved. * The IP header is *NOT* adjusted out of iplen. */ struct mbuf * ip_reass(struct mbuf *m) { struct ip *ip; struct mbuf *p, *q, *nq, *t; struct ipq *fp = NULL; struct ipqhead *head; int i, hlen, next; u_int8_t ecn, ecn0; u_short hash; #ifdef RSS uint32_t rss_hash, rss_type; #endif /* If maxnipq or maxfragsperpacket are 0, never accept fragments. */ if (V_maxnipq == 0 || V_maxfragsperpacket == 0) { IPSTAT_INC(ips_fragments); IPSTAT_INC(ips_fragdropped); m_freem(m); return (NULL); } ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id); head = &V_ipq[hash]; IPQ_LOCK(); /* * Look for queue of fragments * of this datagram. */ TAILQ_FOREACH(fp, head, ipq_list) if (ip->ip_id == fp->ipq_id && ip->ip_src.s_addr == fp->ipq_src.s_addr && ip->ip_dst.s_addr == fp->ipq_dst.s_addr && #ifdef MAC mac_ipq_match(m, fp) && #endif ip->ip_p == fp->ipq_p) goto found; fp = NULL; /* * Attempt to trim the number of allocated fragment queues if it * exceeds the administrative limit. */ if ((V_nipq > V_maxnipq) && (V_maxnipq > 0)) { /* * drop something from the tail of the current queue * before proceeding further */ struct ipq *q = TAILQ_LAST(head, ipqhead); if (q == NULL) { /* gak */ for (i = 0; i < IPREASS_NHASH; i++) { struct ipq *r = TAILQ_LAST(&V_ipq[i], ipqhead); if (r) { IPSTAT_ADD(ips_fragtimeout, r->ipq_nfrags); ip_freef(&V_ipq[i], r); break; } } } else { IPSTAT_ADD(ips_fragtimeout, q->ipq_nfrags); ip_freef(head, q); } } found: /* * Adjust ip_len to not reflect header, * convert offset of this to bytes. */ ip->ip_len = htons(ntohs(ip->ip_len) - hlen); if (ip->ip_off & htons(IP_MF)) { /* * Make sure that fragments have a data length * that's a non-zero multiple of 8 bytes. */ if (ip->ip_len == htons(0) || (ntohs(ip->ip_len) & 0x7) != 0) { IPSTAT_INC(ips_toosmall); /* XXX */ goto dropfrag; } m->m_flags |= M_IP_FRAG; } else m->m_flags &= ~M_IP_FRAG; ip->ip_off = htons(ntohs(ip->ip_off) << 3); /* * Attempt reassembly; if it succeeds, proceed. * ip_reass() will return a different mbuf. */ IPSTAT_INC(ips_fragments); m->m_pkthdr.PH_loc.ptr = ip; /* Previous ip_reass() started here. */ /* * Presence of header sizes in mbufs * would confuse code below. */ m->m_data += hlen; m->m_len -= hlen; /* * If first fragment to arrive, create a reassembly queue. */ if (fp == NULL) { fp = uma_zalloc(V_ipq_zone, M_NOWAIT); if (fp == NULL) goto dropfrag; #ifdef MAC if (mac_ipq_init(fp, M_NOWAIT) != 0) { uma_zfree(V_ipq_zone, fp); fp = NULL; goto dropfrag; } mac_ipq_create(m, fp); #endif TAILQ_INSERT_HEAD(head, fp, ipq_list); V_nipq++; fp->ipq_nfrags = 1; fp->ipq_ttl = IPFRAGTTL; fp->ipq_p = ip->ip_p; fp->ipq_id = ip->ip_id; fp->ipq_src = ip->ip_src; fp->ipq_dst = ip->ip_dst; fp->ipq_frags = m; m->m_nextpkt = NULL; goto done; } else { fp->ipq_nfrags++; #ifdef MAC mac_ipq_update(m, fp); #endif } #define GETIP(m) ((struct ip*)((m)->m_pkthdr.PH_loc.ptr)) /* * Handle ECN by comparing this segment with the first one; * if CE is set, do not lose CE. * drop if CE and not-ECT are mixed for the same packet. */ ecn = ip->ip_tos & IPTOS_ECN_MASK; ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK; if (ecn == IPTOS_ECN_CE) { if (ecn0 == IPTOS_ECN_NOTECT) goto dropfrag; if (ecn0 != IPTOS_ECN_CE) GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE; } if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT) goto dropfrag; /* * Find a segment which begins after this one does. */ for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) if (ntohs(GETIP(q)->ip_off) > ntohs(ip->ip_off)) break; /* * If there is a preceding segment, it may provide some of * our data already. If so, drop the data from the incoming * segment. If it provides all of our data, drop us, otherwise * stick new segment in the proper place. * * If some of the data is dropped from the preceding * segment, then it's checksum is invalidated. */ if (p) { i = ntohs(GETIP(p)->ip_off) + ntohs(GETIP(p)->ip_len) - ntohs(ip->ip_off); if (i > 0) { if (i >= ntohs(ip->ip_len)) goto dropfrag; m_adj(m, i); m->m_pkthdr.csum_flags = 0; ip->ip_off = htons(ntohs(ip->ip_off) + i); ip->ip_len = htons(ntohs(ip->ip_len) - i); } m->m_nextpkt = p->m_nextpkt; p->m_nextpkt = m; } else { m->m_nextpkt = fp->ipq_frags; fp->ipq_frags = m; } /* * While we overlap succeeding segments trim them or, * if they are completely covered, dequeue them. */ for (; q != NULL && ntohs(ip->ip_off) + ntohs(ip->ip_len) > ntohs(GETIP(q)->ip_off); q = nq) { i = (ntohs(ip->ip_off) + ntohs(ip->ip_len)) - ntohs(GETIP(q)->ip_off); if (i < ntohs(GETIP(q)->ip_len)) { GETIP(q)->ip_len = htons(ntohs(GETIP(q)->ip_len) - i); GETIP(q)->ip_off = htons(ntohs(GETIP(q)->ip_off) + i); m_adj(q, i); q->m_pkthdr.csum_flags = 0; break; } nq = q->m_nextpkt; m->m_nextpkt = nq; IPSTAT_INC(ips_fragdropped); fp->ipq_nfrags--; m_freem(q); } /* * Check for complete reassembly and perform frag per packet * limiting. * * Frag limiting is performed here so that the nth frag has * a chance to complete the packet before we drop the packet. * As a result, n+1 frags are actually allowed per packet, but * only n will ever be stored. (n = maxfragsperpacket.) * */ next = 0; for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) { if (ntohs(GETIP(q)->ip_off) != next) { if (fp->ipq_nfrags > V_maxfragsperpacket) { IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags); ip_freef(head, fp); } goto done; } next += ntohs(GETIP(q)->ip_len); } /* Make sure the last packet didn't have the IP_MF flag */ if (p->m_flags & M_IP_FRAG) { if (fp->ipq_nfrags > V_maxfragsperpacket) { IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags); ip_freef(head, fp); } goto done; } /* * Reassembly is complete. Make sure the packet is a sane size. */ q = fp->ipq_frags; ip = GETIP(q); if (next + (ip->ip_hl << 2) > IP_MAXPACKET) { IPSTAT_INC(ips_toolong); IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags); ip_freef(head, fp); goto done; } /* * Concatenate fragments. */ m = q; t = m->m_next; m->m_next = NULL; m_cat(m, t); nq = q->m_nextpkt; q->m_nextpkt = NULL; for (q = nq; q != NULL; q = nq) { nq = q->m_nextpkt; q->m_nextpkt = NULL; m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags; m->m_pkthdr.csum_data += q->m_pkthdr.csum_data; m_cat(m, q); } /* * In order to do checksumming faster we do 'end-around carry' here * (and not in for{} loop), though it implies we are not going to * reassemble more than 64k fragments. */ while (m->m_pkthdr.csum_data & 0xffff0000) m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) + (m->m_pkthdr.csum_data >> 16); #ifdef MAC mac_ipq_reassemble(fp, m); mac_ipq_destroy(fp); #endif /* * Create header for new ip packet by modifying header of first * packet; dequeue and discard fragment reassembly header. * Make header visible. */ ip->ip_len = htons((ip->ip_hl << 2) + next); ip->ip_src = fp->ipq_src; ip->ip_dst = fp->ipq_dst; TAILQ_REMOVE(head, fp, ipq_list); V_nipq--; uma_zfree(V_ipq_zone, fp); m->m_len += (ip->ip_hl << 2); m->m_data -= (ip->ip_hl << 2); /* some debugging cruft by sklower, below, will go away soon */ if (m->m_flags & M_PKTHDR) /* XXX this should be done elsewhere */ m_fixhdr(m); IPSTAT_INC(ips_reassembled); IPQ_UNLOCK(); #ifdef RSS /* * Query the RSS layer for the flowid / flowtype for the * mbuf payload. * * For now, just assume we have to calculate a new one. * Later on we should check to see if the assigned flowid matches * what RSS wants for the given IP protocol and if so, just keep it. * * We then queue into the relevant netisr so it can be dispatched * to the correct CPU. * * Note - this may return 1, which means the flowid in the mbuf * is correct for the configured RSS hash types and can be used. */ if (rss_mbuf_software_hash_v4(m, 0, &rss_hash, &rss_type) == 0) { m->m_pkthdr.flowid = rss_hash; M_HASHTYPE_SET(m, rss_type); m->m_flags |= M_FLOWID; } /* * Queue/dispatch for reprocessing. * * Note: this is much slower than just handling the frame in the * current receive context. It's likely worth investigating * why this is. */ netisr_dispatch(NETISR_IP_DIRECT, m); return (NULL); #endif /* Handle in-line */ return (m); dropfrag: IPSTAT_INC(ips_fragdropped); if (fp != NULL) fp->ipq_nfrags--; m_freem(m); done: IPQ_UNLOCK(); return (NULL); #undef GETIP } /* * Free a fragment reassembly header and all * associated datagrams. */ static void ip_freef(struct ipqhead *fhp, struct ipq *fp) { struct mbuf *q; IPQ_LOCK_ASSERT(); while (fp->ipq_frags) { q = fp->ipq_frags; fp->ipq_frags = q->m_nextpkt; m_freem(q); } TAILQ_REMOVE(fhp, fp, ipq_list); uma_zfree(V_ipq_zone, fp); V_nipq--; } /* * IP timer processing; * if a timer expires on a reassembly * queue, discard it. */ void ip_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); struct ipq *fp; int i; VNET_LIST_RLOCK_NOSLEEP(); IPQ_LOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); for (i = 0; i < IPREASS_NHASH; i++) { for(fp = TAILQ_FIRST(&V_ipq[i]); fp;) { struct ipq *fpp; fpp = fp; fp = TAILQ_NEXT(fp, ipq_list); if(--fpp->ipq_ttl == 0) { IPSTAT_ADD(ips_fragtimeout, fpp->ipq_nfrags); ip_freef(&V_ipq[i], fpp); } } } /* * If we are over the maximum number of fragments * (due to the limit being lowered), drain off * enough to get down to the new limit. */ if (V_maxnipq >= 0 && V_nipq > V_maxnipq) { for (i = 0; i < IPREASS_NHASH; i++) { while (V_nipq > V_maxnipq && !TAILQ_EMPTY(&V_ipq[i])) { IPSTAT_ADD(ips_fragdropped, TAILQ_FIRST(&V_ipq[i])->ipq_nfrags); ip_freef(&V_ipq[i], TAILQ_FIRST(&V_ipq[i])); } } } CURVNET_RESTORE(); } IPQ_UNLOCK(); VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Drain off all datagram fragments. */ static void ip_drain_locked(void) { int i; IPQ_LOCK_ASSERT(); for (i = 0; i < IPREASS_NHASH; i++) { while(!TAILQ_EMPTY(&V_ipq[i])) { IPSTAT_ADD(ips_fragdropped, TAILQ_FIRST(&V_ipq[i])->ipq_nfrags); ip_freef(&V_ipq[i], TAILQ_FIRST(&V_ipq[i])); } } } void ip_drain(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); IPQ_LOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); ip_drain_locked(); CURVNET_RESTORE(); } IPQ_UNLOCK(); VNET_LIST_RUNLOCK_NOSLEEP(); in_rtqdrain(); } /* * The protocol to be inserted into ip_protox[] must be already registered * in inetsw[], either statically or through pf_proto_register(). */ int ipproto_register(short ipproto) { struct protosw *pr; /* Sanity checks. */ if (ipproto <= 0 || ipproto >= IPPROTO_MAX) return (EPROTONOSUPPORT); /* * The protocol slot must not be occupied by another protocol * already. An index pointing to IPPROTO_RAW is unused. */ pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip_protox[ipproto] != pr - inetsw) /* IPPROTO_RAW */ return (EEXIST); /* Find the protocol position in inetsw[] and set the index. */ for (pr = inetdomain.dom_protosw; pr < inetdomain.dom_protoswNPROTOSW; pr++) { if (pr->pr_domain->dom_family == PF_INET && pr->pr_protocol && pr->pr_protocol == ipproto) { ip_protox[pr->pr_protocol] = pr - inetsw; return (0); } } return (EPROTONOSUPPORT); } int ipproto_unregister(short ipproto) { struct protosw *pr; /* Sanity checks. */ if (ipproto <= 0 || ipproto >= IPPROTO_MAX) return (EPROTONOSUPPORT); /* Check if the protocol was indeed registered. */ pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip_protox[ipproto] == pr - inetsw) /* IPPROTO_RAW */ return (ENOENT); /* Reset the protocol slot to IPPROTO_RAW. */ ip_protox[ipproto] = pr - inetsw; return (0); } /* * Given address of next destination (final or next hop), return (referenced) * internet address info of interface to be used to get there. */ struct in_ifaddr * ip_rtaddr(struct in_addr dst, u_int fibnum) { struct route sro; struct sockaddr_in *sin; struct in_ifaddr *ia; bzero(&sro, sizeof(sro)); sin = (struct sockaddr_in *)&sro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = dst; in_rtalloc_ign(&sro, 0, fibnum); if (sro.ro_rt == NULL) return (NULL); ia = ifatoia(sro.ro_rt->rt_ifa); ifa_ref(&ia->ia_ifa); RTFREE(sro.ro_rt); return (ia); } u_char inetctlerrmap[PRC_NCMDS] = { 0, 0, 0, 0, 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, EMSGSIZE, EHOSTUNREACH, 0, 0, 0, 0, EHOSTUNREACH, 0, ENOPROTOOPT, ECONNREFUSED }; /* * Forward a packet. If some error occurs return the sender * an icmp packet. Note we can't always generate a meaningful * icmp message because icmp doesn't have a large enough repertoire * of codes and types. * * If not forwarding, just drop the packet. This could be confusing * if ipforwarding was zero but some routing protocol was advancing * us as a gateway to somewhere. However, we must let the routing * protocol deal with that. * * The srcrt parameter indicates whether the packet is being forwarded * via a source route. */ void ip_forward(struct mbuf *m, int srcrt) { struct ip *ip = mtod(m, struct ip *); struct in_ifaddr *ia; struct mbuf *mcopy; struct in_addr dest; struct route ro; int error, type = 0, code = 0, mtu = 0; if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) { IPSTAT_INC(ips_cantforward); m_freem(m); return; } #ifdef IPSTEALTH if (!V_ipstealth) { #endif if (ip->ip_ttl <= IPTTLDEC) { icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0, 0); return; } #ifdef IPSTEALTH } #endif ia = ip_rtaddr(ip->ip_dst, M_GETFIB(m)); #ifndef IPSEC /* * 'ia' may be NULL if there is no route for this destination. * In case of IPsec, Don't discard it just yet, but pass it to * ip_output in case of outgoing IPsec policy. */ if (!srcrt && ia == NULL) { icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0); return; } #endif /* * Save the IP header and at most 8 bytes of the payload, * in case we need to generate an ICMP message to the src. * * XXX this can be optimized a lot by saving the data in a local * buffer on the stack (72 bytes at most), and only allocating the * mbuf if really necessary. The vast majority of the packets * are forwarded without having to send an ICMP back (either * because unnecessary, or because rate limited), so we are * really we are wasting a lot of work here. * * We don't use m_copy() because it might return a reference * to a shared cluster. Both this function and ip_output() * assume exclusive access to the IP header in `m', so any * data in a cluster may change before we reach icmp_error(). */ mcopy = m_gethdr(M_NOWAIT, m->m_type); if (mcopy != NULL && !m_dup_pkthdr(mcopy, m, M_NOWAIT)) { /* * It's probably ok if the pkthdr dup fails (because * the deep copy of the tag chain failed), but for now * be conservative and just discard the copy since * code below may some day want the tags. */ m_free(mcopy); mcopy = NULL; } if (mcopy != NULL) { mcopy->m_len = min(ntohs(ip->ip_len), M_TRAILINGSPACE(mcopy)); mcopy->m_pkthdr.len = mcopy->m_len; m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t)); } #ifdef IPSTEALTH if (!V_ipstealth) { #endif ip->ip_ttl -= IPTTLDEC; #ifdef IPSTEALTH } #endif /* * If forwarding packet using same interface that it came in on, * perhaps should send a redirect to sender to shortcut a hop. * Only send redirect if source is sending directly to us, * and if packet was not source routed (or has any options). * Also, don't send redirect if forwarding using a default route * or a route modified by a redirect. */ dest.s_addr = 0; if (!srcrt && V_ipsendredirects && ia != NULL && ia->ia_ifp == m->m_pkthdr.rcvif) { struct sockaddr_in *sin; struct rtentry *rt; bzero(&ro, sizeof(ro)); sin = (struct sockaddr_in *)&ro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = ip->ip_dst; in_rtalloc_ign(&ro, 0, M_GETFIB(m)); rt = ro.ro_rt; if (rt && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 && satosin(rt_key(rt))->sin_addr.s_addr != 0) { #define RTA(rt) ((struct in_ifaddr *)(rt->rt_ifa)) u_long src = ntohl(ip->ip_src.s_addr); if (RTA(rt) && (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) { if (rt->rt_flags & RTF_GATEWAY) dest.s_addr = satosin(rt->rt_gateway)->sin_addr.s_addr; else dest.s_addr = ip->ip_dst.s_addr; /* Router requirements says to only send host redirects */ type = ICMP_REDIRECT; code = ICMP_REDIRECT_HOST; } } if (rt) RTFREE(rt); } /* * Try to cache the route MTU from ip_output so we can consider it for * the ICMP_UNREACH_NEEDFRAG "Next-Hop MTU" field described in RFC1191. */ bzero(&ro, sizeof(ro)); error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL); if (error == EMSGSIZE && ro.ro_rt) mtu = ro.ro_rt->rt_mtu; RO_RTFREE(&ro); if (error) IPSTAT_INC(ips_cantforward); else { IPSTAT_INC(ips_forward); if (type) IPSTAT_INC(ips_redirectsent); else { if (mcopy) m_freem(mcopy); if (ia != NULL) ifa_free(&ia->ia_ifa); return; } } if (mcopy == NULL) { if (ia != NULL) ifa_free(&ia->ia_ifa); return; } switch (error) { case 0: /* forwarded, but need redirect */ /* type, code set above */ break; case ENETUNREACH: case EHOSTUNREACH: case ENETDOWN: case EHOSTDOWN: default: type = ICMP_UNREACH; code = ICMP_UNREACH_HOST; break; case EMSGSIZE: type = ICMP_UNREACH; code = ICMP_UNREACH_NEEDFRAG; #ifdef IPSEC /* * If IPsec is configured for this path, * override any possibly mtu value set by ip_output. */ mtu = ip_ipsec_mtu(mcopy, mtu); #endif /* IPSEC */ /* * If the MTU was set before make sure we are below the * interface MTU. * If the MTU wasn't set before use the interface mtu or * fall back to the next smaller mtu step compared to the * current packet size. */ if (mtu != 0) { if (ia != NULL) mtu = min(mtu, ia->ia_ifp->if_mtu); } else { if (ia != NULL) mtu = ia->ia_ifp->if_mtu; else mtu = ip_next_mtu(ntohs(ip->ip_len), 0); } IPSTAT_INC(ips_cantfrag); break; case ENOBUFS: /* * A router should not generate ICMP_SOURCEQUENCH as * required in RFC1812 Requirements for IP Version 4 Routers. * Source quench could be a big problem under DoS attacks, * or if the underlying interface is rate-limited. * Those who need source quench packets may re-enable them * via the net.inet.ip.sendsourcequench sysctl. */ if (V_ip_sendsourcequench == 0) { m_freem(mcopy); if (ia != NULL) ifa_free(&ia->ia_ifa); return; } else { type = ICMP_SOURCEQUENCH; code = 0; } break; case EACCES: /* ipfw denied packet */ m_freem(mcopy); if (ia != NULL) ifa_free(&ia->ia_ifa); return; } if (ia != NULL) ifa_free(&ia->ia_ifa); icmp_error(mcopy, type, code, dest.s_addr, mtu); } void ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip, struct mbuf *m) { if (inp->inp_socket->so_options & (SO_BINTIME | SO_TIMESTAMP)) { struct bintime bt; bintime(&bt); if (inp->inp_socket->so_options & SO_BINTIME) { *mp = sbcreatecontrol((caddr_t)&bt, sizeof(bt), SCM_BINTIME, SOL_SOCKET); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_socket->so_options & SO_TIMESTAMP) { struct timeval tv; bintime2timeval(&bt, &tv); *mp = sbcreatecontrol((caddr_t)&tv, sizeof(tv), SCM_TIMESTAMP, SOL_SOCKET); if (*mp) mp = &(*mp)->m_next; } } if (inp->inp_flags & INP_RECVDSTADDR) { *mp = sbcreatecontrol((caddr_t)&ip->ip_dst, sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags & INP_RECVTTL) { *mp = sbcreatecontrol((caddr_t)&ip->ip_ttl, sizeof(u_char), IP_RECVTTL, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #ifdef notyet /* XXX * Moving these out of udp_input() made them even more broken * than they already were. */ /* options were tossed already */ if (inp->inp_flags & INP_RECVOPTS) { *mp = sbcreatecontrol((caddr_t)opts_deleted_above, sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } /* ip_srcroute doesn't do what we want here, need to fix */ if (inp->inp_flags & INP_RECVRETOPTS) { *mp = sbcreatecontrol((caddr_t)ip_srcroute(m), sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #endif if (inp->inp_flags & INP_RECVIF) { struct ifnet *ifp; struct sdlbuf { struct sockaddr_dl sdl; u_char pad[32]; } sdlbuf; struct sockaddr_dl *sdp; struct sockaddr_dl *sdl2 = &sdlbuf.sdl; if ((ifp = m->m_pkthdr.rcvif) && ifp->if_index && ifp->if_index <= V_if_index) { sdp = (struct sockaddr_dl *)ifp->if_addr->ifa_addr; /* * Change our mind and don't try copy. */ if (sdp->sdl_family != AF_LINK || sdp->sdl_len > sizeof(sdlbuf)) { goto makedummy; } bcopy(sdp, sdl2, sdp->sdl_len); } else { makedummy: sdl2->sdl_len = offsetof(struct sockaddr_dl, sdl_data[0]); sdl2->sdl_family = AF_LINK; sdl2->sdl_index = 0; sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0; } *mp = sbcreatecontrol((caddr_t)sdl2, sdl2->sdl_len, IP_RECVIF, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags & INP_RECVTOS) { *mp = sbcreatecontrol((caddr_t)&ip->ip_tos, sizeof(u_char), IP_RECVTOS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags2 & INP_RECVFLOWID) { uint32_t flowid, flow_type; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); /* * XXX should handle the failure of one or the * other - don't populate both? */ *mp = sbcreatecontrol((caddr_t) &flowid, sizeof(uint32_t), IP_FLOWID, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; *mp = sbcreatecontrol((caddr_t) &flow_type, sizeof(uint32_t), IP_FLOWTYPE, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #ifdef RSS if (inp->inp_flags2 & INP_RECVRSSBUCKETID) { uint32_t flowid, flow_type; uint32_t rss_bucketid; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); if (rss_hash2bucket(flowid, flow_type, &rss_bucketid) == 0) { *mp = sbcreatecontrol((caddr_t) &rss_bucketid, sizeof(uint32_t), IP_RSSBUCKETID, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } } #endif } /* * XXXRW: Multicast routing code in ip_mroute.c is generally MPSAFE, but the * ip_rsvp and ip_rsvp_on variables need to be interlocked with rsvp_on * locking. This code remains in ip_input.c as ip_mroute.c is optionally * compiled. */ static VNET_DEFINE(int, ip_rsvp_on); VNET_DEFINE(struct socket *, ip_rsvpd); #define V_ip_rsvp_on VNET(ip_rsvp_on) int ip_rsvp_init(struct socket *so) { if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) return EOPNOTSUPP; if (V_ip_rsvpd != NULL) return EADDRINUSE; V_ip_rsvpd = so; /* * This may seem silly, but we need to be sure we don't over-increment * the RSVP counter, in case something slips up. */ if (!V_ip_rsvp_on) { V_ip_rsvp_on = 1; V_rsvp_on++; } return 0; } int ip_rsvp_done(void) { V_ip_rsvpd = NULL; /* * This may seem silly, but we need to be sure we don't over-decrement * the RSVP counter, in case something slips up. */ if (V_ip_rsvp_on) { V_ip_rsvp_on = 0; V_rsvp_on--; } return 0; } int rsvp_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m; m = *mp; *mp = NULL; if (rsvp_input_p) { /* call the real one if loaded */ *mp = m; rsvp_input_p(mp, offp, proto); return (IPPROTO_DONE); } /* Can still get packets with rsvp_on = 0 if there is a local member * of the group to which the RSVP packet is addressed. But in this * case we want to throw the packet away. */ if (!V_rsvp_on) { m_freem(m); return (IPPROTO_DONE); } if (V_ip_rsvpd != NULL) { *mp = m; rip_input(mp, offp, proto); return (IPPROTO_DONE); } /* Drop the packet */ m_freem(m); return (IPPROTO_DONE); } Index: user/dchagin/lemul/sys/netinet/ip_ipsec.c =================================================================== --- user/dchagin/lemul/sys/netinet/ip_ipsec.c (revision 274240) +++ user/dchagin/lemul/sys/netinet/ip_ipsec.c (revision 274241) @@ -1,380 +1,380 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ipsec.h" #include "opt_sctp.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SCTP #include #endif #include #include #include #include extern struct protosw inetsw[]; #ifdef IPSEC_FILTERTUNNEL static VNET_DEFINE(int, ip4_ipsec_filtertunnel) = 1; #else static VNET_DEFINE(int, ip4_ipsec_filtertunnel) = 0; #endif #define V_ip4_ipsec_filtertunnel VNET(ip4_ipsec_filtertunnel) SYSCTL_DECL(_net_inet_ipsec); -SYSCTL_VNET_INT(_net_inet_ipsec, OID_AUTO, filtertunnel, - CTLFLAG_RW, &VNET_NAME(ip4_ipsec_filtertunnel), 0, +SYSCTL_INT(_net_inet_ipsec, OID_AUTO, filtertunnel, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip4_ipsec_filtertunnel), 0, "If set filter packets from an IPsec tunnel."); /* * Check if we have to jump over firewall processing for this packet. * Called from ip_input(). * 1 = jump over firewall, 0 = packet goes through firewall. */ int ip_ipsec_filtertunnel(struct mbuf *m) { /* * Bypass packet filtering for packets previously handled by IPsec. */ if (!V_ip4_ipsec_filtertunnel && m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL) return 1; return 0; } /* * Check if this packet has an active SA and needs to be dropped instead * of forwarded. * Called from ip_input(). * 1 = drop packet, 0 = forward packet. */ int ip_ipsec_fwd(struct mbuf *m) { struct m_tag *mtag; struct tdb_ident *tdbi; struct secpolicy *sp; int error; mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL); if (mtag != NULL) { tdbi = (struct tdb_ident *)(mtag + 1); sp = ipsec_getpolicy(tdbi, IPSEC_DIR_INBOUND); } else { sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND, IP_FORWARDING, &error); } if (sp == NULL) { /* NB: can happen if error */ /*XXX error stat???*/ DPRINTF(("ip_input: no SP for forwarding\n")); /*XXX*/ return 1; } /* * Check security policy against packet attributes. */ error = ipsec_in_reject(sp, m); KEY_FREESP(&sp); if (error) { IPSTAT_INC(ips_cantforward); return 1; } return 0; } /* * Check if protocol type doesn't have a further header and do IPSEC * decryption or reject right now. Protocols with further headers get * their IPSEC treatment within the protocol specific processing. * Called from ip_input(). * 1 = drop packet, 0 = continue processing packet. */ int ip_ipsec_input(struct mbuf *m) { struct ip *ip = mtod(m, struct ip *); struct m_tag *mtag; struct tdb_ident *tdbi; struct secpolicy *sp; int error; /* * enforce IPsec policy checking if we are seeing last header. * note that we do not visit this with protocols with pcb layer * code - like udp/tcp/raw ip. */ if ((inetsw[ip_protox[ip->ip_p]].pr_flags & PR_LASTHDR) != 0) { /* * Check if the packet has already had IPsec processing * done. If so, then just pass it along. This tag gets * set during AH, ESP, etc. input handling, before the * packet is returned to the ip input queue for delivery. */ mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL); if (mtag != NULL) { tdbi = (struct tdb_ident *)(mtag + 1); sp = ipsec_getpolicy(tdbi, IPSEC_DIR_INBOUND); } else { sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND, IP_FORWARDING, &error); } if (sp != NULL) { /* * Check security policy against packet attributes. */ error = ipsec_in_reject(sp, m); KEY_FREESP(&sp); } else { /* XXX error stat??? */ error = EINVAL; DPRINTF(("ip_input: no SP, packet discarded\n"));/*XXX*/ return 1; } if (error) return 1; } return 0; } /* * Compute the MTU for a forwarded packet that gets IPSEC encapsulated. * Called from ip_forward(). * Returns MTU suggestion for ICMP needfrag reply. */ int ip_ipsec_mtu(struct mbuf *m, int mtu) { /* * If the packet is routed over IPsec tunnel, tell the * originator the tunnel MTU. * tunnel MTU = if MTU - sizeof(IP) - ESP/AH hdrsiz * XXX quickhack!!! */ struct secpolicy *sp = NULL; int ipsecerror; int ipsechdr; struct route *ro; sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, IP_FORWARDING, &ipsecerror); if (sp != NULL) { /* count IPsec header size */ ipsechdr = ipsec_hdrsiz(m, IPSEC_DIR_OUTBOUND, NULL); /* * find the correct route for outer IPv4 * header, compute tunnel MTU. */ if (sp->req != NULL && sp->req->sav != NULL && sp->req->sav->sah != NULL) { ro = &sp->req->sav->sah->route_cache.sa_route; if (ro->ro_rt && ro->ro_rt->rt_ifp) { mtu = ro->ro_rt->rt_mtu ? ro->ro_rt->rt_mtu : ro->ro_rt->rt_ifp->if_mtu; mtu -= ipsechdr; } } KEY_FREESP(&sp); } return mtu; } /* * * Called from ip_output(). * 1 = drop packet, 0 = continue processing packet, * -1 = packet was reinjected and stop processing packet */ int ip_ipsec_output(struct mbuf **m, struct inpcb *inp, int *flags, int *error) { struct secpolicy *sp = NULL; struct tdb_ident *tdbi; struct m_tag *mtag; /* * Check the security policy (SP) for the packet and, if * required, do IPsec-related processing. There are two * cases here; the first time a packet is sent through * it will be untagged and handled by ipsec4_checkpolicy. * If the packet is resubmitted to ip_output (e.g. after * AH, ESP, etc. processing), there will be a tag to bypass * the lookup and related policy checking. */ mtag = m_tag_find(*m, PACKET_TAG_IPSEC_PENDING_TDB, NULL); if (mtag != NULL) { tdbi = (struct tdb_ident *)(mtag + 1); sp = ipsec_getpolicy(tdbi, IPSEC_DIR_OUTBOUND); if (sp == NULL) *error = -EINVAL; /* force silent drop */ m_tag_delete(*m, mtag); } else { sp = ipsec4_checkpolicy(*m, IPSEC_DIR_OUTBOUND, *flags, error, inp); } /* * There are four return cases: * sp != NULL apply IPsec policy * sp == NULL, error == 0 no IPsec handling needed * sp == NULL, error == -EINVAL discard packet w/o error * sp == NULL, error != 0 discard packet, report error */ if (sp != NULL) { /* Loop detection, check if ipsec processing already done */ KASSERT(sp->req != NULL, ("ip_output: no ipsec request")); for (mtag = m_tag_first(*m); mtag != NULL; mtag = m_tag_next(*m, mtag)) { if (mtag->m_tag_cookie != MTAG_ABI_COMPAT) continue; if (mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_DONE && mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED) continue; /* * Check if policy has an SA associated with it. * This can happen when an SP has yet to acquire * an SA; e.g. on first reference. If it occurs, * then we let ipsec4_process_packet do its thing. */ if (sp->req->sav == NULL) break; tdbi = (struct tdb_ident *)(mtag + 1); if (tdbi->spi == sp->req->sav->spi && tdbi->proto == sp->req->sav->sah->saidx.proto && bcmp(&tdbi->dst, &sp->req->sav->sah->saidx.dst, sizeof (union sockaddr_union)) == 0) { /* * No IPsec processing is needed, free * reference to SP. * * NB: null pointer to avoid free at * done: below. */ KEY_FREESP(&sp), sp = NULL; goto done; } } /* * Do delayed checksums now because we send before * this is done in the normal processing path. */ if ((*m)->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(*m); (*m)->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } #ifdef SCTP if ((*m)->m_pkthdr.csum_flags & CSUM_SCTP) { struct ip *ip = mtod(*m, struct ip *); sctp_delayed_cksum(*m, (uint32_t)(ip->ip_hl << 2)); (*m)->m_pkthdr.csum_flags &= ~CSUM_SCTP; } #endif /* NB: callee frees mbuf */ *error = ipsec4_process_packet(*m, sp->req, *flags, 0); if (*error == EJUSTRETURN) { /* * We had a SP with a level of 'use' and no SA. We * will just continue to process the packet without * IPsec processing and return without error. */ *error = 0; goto done; } /* * Preserve KAME behaviour: ENOENT can be returned * when an SA acquire is in progress. Don't propagate * this to user-level; it confuses applications. * * XXX this will go away when the SADB is redone. */ if (*error == ENOENT) *error = 0; goto reinjected; } else { /* sp == NULL */ if (*error != 0) { /* * Hack: -EINVAL is used to signal that a packet * should be silently discarded. This is typically * because we asked key management for an SA and * it was delayed (e.g. kicked up to IKE). */ if (*error == -EINVAL) *error = 0; goto bad; } else { /* No IPsec processing for this packet. */ } } done: if (sp != NULL) KEY_FREESP(&sp); return 0; reinjected: if (sp != NULL) KEY_FREESP(&sp); return -1; bad: if (sp != NULL) KEY_FREESP(&sp); return 1; } Index: user/dchagin/lemul/sys/netinet/ip_mroute.c =================================================================== --- user/dchagin/lemul/sys/netinet/ip_mroute.c (revision 274240) +++ user/dchagin/lemul/sys/netinet/ip_mroute.c (revision 274241) @@ -1,2948 +1,2948 @@ /*- * Copyright (c) 1989 Stephen Deering * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93 */ /* * IP multicast forwarding procedures * * Written by David Waitzman, BBN Labs, August 1988. * Modified by Steve Deering, Stanford, February 1989. * Modified by Mark J. Steiglitz, Stanford, May, 1991 * Modified by Van Jacobson, LBL, January 1993 * Modified by Ajit Thyagarajan, PARC, August 1993 * Modified by Bill Fenner, PARC, April 1995 * Modified by Ahmed Helmy, SGI, June 1996 * Modified by George Edmond Eddy (Rusty), ISI, February 1998 * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000 * Modified by Hitoshi Asaeda, WIDE, August 2000 * Modified by Pavlin Radoslavov, ICSI, October 2002 * * MROUTING Revision: 3.5 * and PIM-SMv2 and PIM-DM support, advanced API support, * bandwidth metering and signaling */ /* * TODO: Prefix functions with ipmf_. * TODO: Maintain a refcount on if_allmulti() in ifnet or in the protocol * domain attachment (if_afdata) so we can track consumers of that service. * TODO: Deprecate routing socket path for SIOCGETSGCNT and SIOCGETVIFCNT, * move it to socket options. * TODO: Cleanup LSRR removal further. * TODO: Push RSVP stubs into raw_ip.c. * TODO: Use bitstring.h for vif set. * TODO: Fix mrt6_ioctl dangling ref when dynamically loaded. * TODO: Sync ip6_mroute.c with this file. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_mrouting.h" #define _PIM_VT 1 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_IPMF #define KTR_IPMF KTR_INET #endif #define VIFI_INVALID ((vifi_t) -1) static VNET_DEFINE(uint32_t, last_tv_sec); /* last time we processed this */ #define V_last_tv_sec VNET(last_tv_sec) static MALLOC_DEFINE(M_MRTABLE, "mroutetbl", "multicast forwarding cache"); /* * Locking. We use two locks: one for the virtual interface table and * one for the forwarding table. These locks may be nested in which case * the VIF lock must always be taken first. Note that each lock is used * to cover not only the specific data structure but also related data * structures. */ static struct mtx mrouter_mtx; #define MROUTER_LOCK() mtx_lock(&mrouter_mtx) #define MROUTER_UNLOCK() mtx_unlock(&mrouter_mtx) #define MROUTER_LOCK_ASSERT() mtx_assert(&mrouter_mtx, MA_OWNED) #define MROUTER_LOCK_INIT() \ mtx_init(&mrouter_mtx, "IPv4 multicast forwarding", NULL, MTX_DEF) #define MROUTER_LOCK_DESTROY() mtx_destroy(&mrouter_mtx) static int ip_mrouter_cnt; /* # of vnets with active mrouters */ static int ip_mrouter_unloading; /* Allow no more V_ip_mrouter sockets */ static VNET_PCPUSTAT_DEFINE(struct mrtstat, mrtstat); VNET_PCPUSTAT_SYSINIT(mrtstat); VNET_PCPUSTAT_SYSUNINIT(mrtstat); SYSCTL_VNET_PCPUSTAT(_net_inet_ip, OID_AUTO, mrtstat, struct mrtstat, mrtstat, "IPv4 Multicast Forwarding Statistics (struct mrtstat, " "netinet/ip_mroute.h)"); static VNET_DEFINE(u_long, mfchash); #define V_mfchash VNET(mfchash) #define MFCHASH(a, g) \ ((((a).s_addr >> 20) ^ ((a).s_addr >> 10) ^ (a).s_addr ^ \ ((g).s_addr >> 20) ^ ((g).s_addr >> 10) ^ (g).s_addr) & V_mfchash) #define MFCHASHSIZE 256 static u_long mfchashsize; /* Hash size */ static VNET_DEFINE(u_char *, nexpire); /* 0..mfchashsize-1 */ #define V_nexpire VNET(nexpire) static VNET_DEFINE(LIST_HEAD(mfchashhdr, mfc)*, mfchashtbl); #define V_mfchashtbl VNET(mfchashtbl) static struct mtx mfc_mtx; #define MFC_LOCK() mtx_lock(&mfc_mtx) #define MFC_UNLOCK() mtx_unlock(&mfc_mtx) #define MFC_LOCK_ASSERT() mtx_assert(&mfc_mtx, MA_OWNED) #define MFC_LOCK_INIT() \ mtx_init(&mfc_mtx, "IPv4 multicast forwarding cache", NULL, MTX_DEF) #define MFC_LOCK_DESTROY() mtx_destroy(&mfc_mtx) static VNET_DEFINE(vifi_t, numvifs); #define V_numvifs VNET(numvifs) static VNET_DEFINE(struct vif, viftable[MAXVIFS]); #define V_viftable VNET(viftable) -SYSCTL_VNET_OPAQUE(_net_inet_ip, OID_AUTO, viftable, CTLFLAG_RD, +SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, viftable, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(viftable), sizeof(V_viftable), "S,vif[MAXVIFS]", "IPv4 Multicast Interfaces (struct vif[MAXVIFS], netinet/ip_mroute.h)"); static struct mtx vif_mtx; #define VIF_LOCK() mtx_lock(&vif_mtx) #define VIF_UNLOCK() mtx_unlock(&vif_mtx) #define VIF_LOCK_ASSERT() mtx_assert(&vif_mtx, MA_OWNED) #define VIF_LOCK_INIT() \ mtx_init(&vif_mtx, "IPv4 multicast interfaces", NULL, MTX_DEF) #define VIF_LOCK_DESTROY() mtx_destroy(&vif_mtx) static eventhandler_tag if_detach_event_tag = NULL; static VNET_DEFINE(struct callout, expire_upcalls_ch); #define V_expire_upcalls_ch VNET(expire_upcalls_ch) #define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ #define UPCALL_EXPIRE 6 /* number of timeouts */ /* * Bandwidth meter variables and constants */ static MALLOC_DEFINE(M_BWMETER, "bwmeter", "multicast upcall bw meters"); /* * Pending timeouts are stored in a hash table, the key being the * expiration time. Periodically, the entries are analysed and processed. */ #define BW_METER_BUCKETS 1024 static VNET_DEFINE(struct bw_meter*, bw_meter_timers[BW_METER_BUCKETS]); #define V_bw_meter_timers VNET(bw_meter_timers) static VNET_DEFINE(struct callout, bw_meter_ch); #define V_bw_meter_ch VNET(bw_meter_ch) #define BW_METER_PERIOD (hz) /* periodical handling of bw meters */ /* * Pending upcalls are stored in a vector which is flushed when * full, or periodically */ static VNET_DEFINE(struct bw_upcall, bw_upcalls[BW_UPCALLS_MAX]); #define V_bw_upcalls VNET(bw_upcalls) static VNET_DEFINE(u_int, bw_upcalls_n); /* # of pending upcalls */ #define V_bw_upcalls_n VNET(bw_upcalls_n) static VNET_DEFINE(struct callout, bw_upcalls_ch); #define V_bw_upcalls_ch VNET(bw_upcalls_ch) #define BW_UPCALLS_PERIOD (hz) /* periodical flush of bw upcalls */ static VNET_PCPUSTAT_DEFINE(struct pimstat, pimstat); VNET_PCPUSTAT_SYSINIT(pimstat); VNET_PCPUSTAT_SYSUNINIT(pimstat); SYSCTL_NODE(_net_inet, IPPROTO_PIM, pim, CTLFLAG_RW, 0, "PIM"); SYSCTL_VNET_PCPUSTAT(_net_inet_pim, PIMCTL_STATS, stats, struct pimstat, pimstat, "PIM Statistics (struct pimstat, netinet/pim_var.h)"); static u_long pim_squelch_wholepkt = 0; SYSCTL_ULONG(_net_inet_pim, OID_AUTO, squelch_wholepkt, CTLFLAG_RW, &pim_squelch_wholepkt, 0, "Disable IGMP_WHOLEPKT notifications if rendezvous point is unspecified"); extern struct domain inetdomain; static const struct protosw in_pim_protosw = { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_PIM, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = pim_input, .pr_output = rip_output, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }; static const struct encaptab *pim_encap_cookie; static int pim_encapcheck(const struct mbuf *, int, int, void *); /* * Note: the PIM Register encapsulation adds the following in front of a * data packet: * * struct pim_encap_hdr { * struct ip ip; * struct pim_encap_pimhdr pim; * } * */ struct pim_encap_pimhdr { struct pim pim; uint32_t flags; }; #define PIM_ENCAP_TTL 64 static struct ip pim_encap_iphdr = { #if BYTE_ORDER == LITTLE_ENDIAN sizeof(struct ip) >> 2, IPVERSION, #else IPVERSION, sizeof(struct ip) >> 2, #endif 0, /* tos */ sizeof(struct ip), /* total length */ 0, /* id */ 0, /* frag offset */ PIM_ENCAP_TTL, IPPROTO_PIM, 0, /* checksum */ }; static struct pim_encap_pimhdr pim_encap_pimhdr = { { PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */ 0, /* reserved */ 0, /* checksum */ }, 0 /* flags */ }; static VNET_DEFINE(vifi_t, reg_vif_num) = VIFI_INVALID; #define V_reg_vif_num VNET(reg_vif_num) static VNET_DEFINE(struct ifnet, multicast_register_if); #define V_multicast_register_if VNET(multicast_register_if) /* * Private variables. */ static u_long X_ip_mcast_src(int); static int X_ip_mforward(struct ip *, struct ifnet *, struct mbuf *, struct ip_moptions *); static int X_ip_mrouter_done(void); static int X_ip_mrouter_get(struct socket *, struct sockopt *); static int X_ip_mrouter_set(struct socket *, struct sockopt *); static int X_legal_vif_num(int); static int X_mrt_ioctl(u_long, caddr_t, int); static int add_bw_upcall(struct bw_upcall *); static int add_mfc(struct mfcctl2 *); static int add_vif(struct vifctl *); static void bw_meter_prepare_upcall(struct bw_meter *, struct timeval *); static void bw_meter_process(void); static void bw_meter_receive_packet(struct bw_meter *, int, struct timeval *); static void bw_upcalls_send(void); static int del_bw_upcall(struct bw_upcall *); static int del_mfc(struct mfcctl2 *); static int del_vif(vifi_t); static int del_vif_locked(vifi_t); static void expire_bw_meter_process(void *); static void expire_bw_upcalls_send(void *); static void expire_mfc(struct mfc *); static void expire_upcalls(void *); static void free_bw_list(struct bw_meter *); static int get_sg_cnt(struct sioc_sg_req *); static int get_vif_cnt(struct sioc_vif_req *); static void if_detached_event(void *, struct ifnet *); static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, vifi_t); static int ip_mrouter_init(struct socket *, int); static __inline struct mfc * mfc_find(struct in_addr *, struct in_addr *); static void phyint_send(struct ip *, struct vif *, struct mbuf *); static struct mbuf * pim_register_prepare(struct ip *, struct mbuf *); static int pim_register_send(struct ip *, struct vif *, struct mbuf *, struct mfc *); static int pim_register_send_rp(struct ip *, struct vif *, struct mbuf *, struct mfc *); static int pim_register_send_upcall(struct ip *, struct vif *, struct mbuf *, struct mfc *); static void schedule_bw_meter(struct bw_meter *, struct timeval *); static void send_packet(struct vif *, struct mbuf *); static int set_api_config(uint32_t *); static int set_assert(int); static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *); static void unschedule_bw_meter(struct bw_meter *); /* * Kernel multicast forwarding API capabilities and setup. * If more API capabilities are added to the kernel, they should be * recorded in `mrt_api_support'. */ #define MRT_API_VERSION 0x0305 static const int mrt_api_version = MRT_API_VERSION; static const uint32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF | MRT_MFC_FLAGS_BORDER_VIF | MRT_MFC_RP | MRT_MFC_BW_UPCALL); static VNET_DEFINE(uint32_t, mrt_api_config); #define V_mrt_api_config VNET(mrt_api_config) static VNET_DEFINE(int, pim_assert_enabled); #define V_pim_assert_enabled VNET(pim_assert_enabled) static struct timeval pim_assert_interval = { 3, 0 }; /* Rate limit */ /* * Find a route for a given origin IP address and multicast group address. * Statistics must be updated by the caller. */ static __inline struct mfc * mfc_find(struct in_addr *o, struct in_addr *g) { struct mfc *rt; MFC_LOCK_ASSERT(); LIST_FOREACH(rt, &V_mfchashtbl[MFCHASH(*o, *g)], mfc_hash) { if (in_hosteq(rt->mfc_origin, *o) && in_hosteq(rt->mfc_mcastgrp, *g) && TAILQ_EMPTY(&rt->mfc_stall)) break; } return (rt); } /* * Handle MRT setsockopt commands to modify the multicast forwarding tables. */ static int X_ip_mrouter_set(struct socket *so, struct sockopt *sopt) { int error, optval; vifi_t vifi; struct vifctl vifc; struct mfcctl2 mfc; struct bw_upcall bw_upcall; uint32_t i; if (so != V_ip_mrouter && sopt->sopt_name != MRT_INIT) return EPERM; error = 0; switch (sopt->sopt_name) { case MRT_INIT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; error = ip_mrouter_init(so, optval); break; case MRT_DONE: error = ip_mrouter_done(); break; case MRT_ADD_VIF: error = sooptcopyin(sopt, &vifc, sizeof vifc, sizeof vifc); if (error) break; error = add_vif(&vifc); break; case MRT_DEL_VIF: error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi); if (error) break; error = del_vif(vifi); break; case MRT_ADD_MFC: case MRT_DEL_MFC: /* * select data size depending on API version. */ if (sopt->sopt_name == MRT_ADD_MFC && V_mrt_api_config & MRT_API_FLAGS_ALL) { error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl2), sizeof(struct mfcctl2)); } else { error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl), sizeof(struct mfcctl)); bzero((caddr_t)&mfc + sizeof(struct mfcctl), sizeof(mfc) - sizeof(struct mfcctl)); } if (error) break; if (sopt->sopt_name == MRT_ADD_MFC) error = add_mfc(&mfc); else error = del_mfc(&mfc); break; case MRT_ASSERT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; set_assert(optval); break; case MRT_API_CONFIG: error = sooptcopyin(sopt, &i, sizeof i, sizeof i); if (!error) error = set_api_config(&i); if (!error) error = sooptcopyout(sopt, &i, sizeof i); break; case MRT_ADD_BW_UPCALL: case MRT_DEL_BW_UPCALL: error = sooptcopyin(sopt, &bw_upcall, sizeof bw_upcall, sizeof bw_upcall); if (error) break; if (sopt->sopt_name == MRT_ADD_BW_UPCALL) error = add_bw_upcall(&bw_upcall); else error = del_bw_upcall(&bw_upcall); break; default: error = EOPNOTSUPP; break; } return error; } /* * Handle MRT getsockopt commands */ static int X_ip_mrouter_get(struct socket *so, struct sockopt *sopt) { int error; switch (sopt->sopt_name) { case MRT_VERSION: error = sooptcopyout(sopt, &mrt_api_version, sizeof mrt_api_version); break; case MRT_ASSERT: error = sooptcopyout(sopt, &V_pim_assert_enabled, sizeof V_pim_assert_enabled); break; case MRT_API_SUPPORT: error = sooptcopyout(sopt, &mrt_api_support, sizeof mrt_api_support); break; case MRT_API_CONFIG: error = sooptcopyout(sopt, &V_mrt_api_config, sizeof V_mrt_api_config); break; default: error = EOPNOTSUPP; break; } return error; } /* * Handle ioctl commands to obtain information from the cache */ static int X_mrt_ioctl(u_long cmd, caddr_t data, int fibnum __unused) { int error = 0; /* * Currently the only function calling this ioctl routine is rtioctl(). * Typically, only root can create the raw socket in order to execute * this ioctl method, however the request might be coming from a prison */ error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error) return (error); switch (cmd) { case (SIOCGETVIFCNT): error = get_vif_cnt((struct sioc_vif_req *)data); break; case (SIOCGETSGCNT): error = get_sg_cnt((struct sioc_sg_req *)data); break; default: error = EINVAL; break; } return error; } /* * returns the packet, byte, rpf-failure count for the source group provided */ static int get_sg_cnt(struct sioc_sg_req *req) { struct mfc *rt; MFC_LOCK(); rt = mfc_find(&req->src, &req->grp); if (rt == NULL) { MFC_UNLOCK(); req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff; return EADDRNOTAVAIL; } req->pktcnt = rt->mfc_pkt_cnt; req->bytecnt = rt->mfc_byte_cnt; req->wrong_if = rt->mfc_wrong_if; MFC_UNLOCK(); return 0; } /* * returns the input and output packet and byte counts on the vif provided */ static int get_vif_cnt(struct sioc_vif_req *req) { vifi_t vifi = req->vifi; VIF_LOCK(); if (vifi >= V_numvifs) { VIF_UNLOCK(); return EINVAL; } req->icount = V_viftable[vifi].v_pkt_in; req->ocount = V_viftable[vifi].v_pkt_out; req->ibytes = V_viftable[vifi].v_bytes_in; req->obytes = V_viftable[vifi].v_bytes_out; VIF_UNLOCK(); return 0; } static void if_detached_event(void *arg __unused, struct ifnet *ifp) { vifi_t vifi; u_long i; MROUTER_LOCK(); if (V_ip_mrouter == NULL) { MROUTER_UNLOCK(); return; } VIF_LOCK(); MFC_LOCK(); /* * Tear down multicast forwarder state associated with this ifnet. * 1. Walk the vif list, matching vifs against this ifnet. * 2. Walk the multicast forwarding cache (mfc) looking for * inner matches with this vif's index. * 3. Expire any matching multicast forwarding cache entries. * 4. Free vif state. This should disable ALLMULTI on the interface. */ for (vifi = 0; vifi < V_numvifs; vifi++) { if (V_viftable[vifi].v_ifp != ifp) continue; for (i = 0; i < mfchashsize; i++) { struct mfc *rt, *nrt; LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) { if (rt->mfc_parent == vifi) { expire_mfc(rt); } } } del_vif_locked(vifi); } MFC_UNLOCK(); VIF_UNLOCK(); MROUTER_UNLOCK(); } /* * Enable multicast forwarding. */ static int ip_mrouter_init(struct socket *so, int version) { CTR3(KTR_IPMF, "%s: so_type %d, pr_protocol %d", __func__, so->so_type, so->so_proto->pr_protocol); if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_IGMP) return EOPNOTSUPP; if (version != 1) return ENOPROTOOPT; MROUTER_LOCK(); if (ip_mrouter_unloading) { MROUTER_UNLOCK(); return ENOPROTOOPT; } if (V_ip_mrouter != NULL) { MROUTER_UNLOCK(); return EADDRINUSE; } V_mfchashtbl = hashinit_flags(mfchashsize, M_MRTABLE, &V_mfchash, HASH_NOWAIT); callout_reset(&V_expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, curvnet); callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send, curvnet); callout_reset(&V_bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, curvnet); V_ip_mrouter = so; ip_mrouter_cnt++; MROUTER_UNLOCK(); CTR1(KTR_IPMF, "%s: done", __func__); return 0; } /* * Disable multicast forwarding. */ static int X_ip_mrouter_done(void) { struct ifnet *ifp; u_long i; vifi_t vifi; MROUTER_LOCK(); if (V_ip_mrouter == NULL) { MROUTER_UNLOCK(); return EINVAL; } /* * Detach/disable hooks to the reset of the system. */ V_ip_mrouter = NULL; ip_mrouter_cnt--; V_mrt_api_config = 0; VIF_LOCK(); /* * For each phyint in use, disable promiscuous reception of all IP * multicasts. */ for (vifi = 0; vifi < V_numvifs; vifi++) { if (!in_nullhost(V_viftable[vifi].v_lcl_addr) && !(V_viftable[vifi].v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { ifp = V_viftable[vifi].v_ifp; if_allmulti(ifp, 0); } } bzero((caddr_t)V_viftable, sizeof(V_viftable)); V_numvifs = 0; V_pim_assert_enabled = 0; VIF_UNLOCK(); callout_stop(&V_expire_upcalls_ch); callout_stop(&V_bw_upcalls_ch); callout_stop(&V_bw_meter_ch); MFC_LOCK(); /* * Free all multicast forwarding cache entries. * Do not use hashdestroy(), as we must perform other cleanup. */ for (i = 0; i < mfchashsize; i++) { struct mfc *rt, *nrt; LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) { expire_mfc(rt); } } free(V_mfchashtbl, M_MRTABLE); V_mfchashtbl = NULL; bzero(V_nexpire, sizeof(V_nexpire[0]) * mfchashsize); V_bw_upcalls_n = 0; bzero(V_bw_meter_timers, sizeof(V_bw_meter_timers)); MFC_UNLOCK(); V_reg_vif_num = VIFI_INVALID; MROUTER_UNLOCK(); CTR1(KTR_IPMF, "%s: done", __func__); return 0; } /* * Set PIM assert processing global */ static int set_assert(int i) { if ((i != 1) && (i != 0)) return EINVAL; V_pim_assert_enabled = i; return 0; } /* * Configure API capabilities */ int set_api_config(uint32_t *apival) { u_long i; /* * We can set the API capabilities only if it is the first operation * after MRT_INIT. I.e.: * - there are no vifs installed * - pim_assert is not enabled * - the MFC table is empty */ if (V_numvifs > 0) { *apival = 0; return EPERM; } if (V_pim_assert_enabled) { *apival = 0; return EPERM; } MFC_LOCK(); for (i = 0; i < mfchashsize; i++) { if (LIST_FIRST(&V_mfchashtbl[i]) != NULL) { MFC_UNLOCK(); *apival = 0; return EPERM; } } MFC_UNLOCK(); V_mrt_api_config = *apival & mrt_api_support; *apival = V_mrt_api_config; return 0; } /* * Add a vif to the vif table */ static int add_vif(struct vifctl *vifcp) { struct vif *vifp = V_viftable + vifcp->vifc_vifi; struct sockaddr_in sin = {sizeof sin, AF_INET}; struct ifaddr *ifa; struct ifnet *ifp; int error; VIF_LOCK(); if (vifcp->vifc_vifi >= MAXVIFS) { VIF_UNLOCK(); return EINVAL; } /* rate limiting is no longer supported by this code */ if (vifcp->vifc_rate_limit != 0) { log(LOG_ERR, "rate limiting is no longer supported\n"); VIF_UNLOCK(); return EINVAL; } if (!in_nullhost(vifp->v_lcl_addr)) { VIF_UNLOCK(); return EADDRINUSE; } if (in_nullhost(vifcp->vifc_lcl_addr)) { VIF_UNLOCK(); return EADDRNOTAVAIL; } /* Find the interface with an address in AF_INET family */ if (vifcp->vifc_flags & VIFF_REGISTER) { /* * XXX: Because VIFF_REGISTER does not really need a valid * local interface (e.g. it could be 127.0.0.2), we don't * check its address. */ ifp = NULL; } else { sin.sin_addr = vifcp->vifc_lcl_addr; ifa = ifa_ifwithaddr((struct sockaddr *)&sin); if (ifa == NULL) { VIF_UNLOCK(); return EADDRNOTAVAIL; } ifp = ifa->ifa_ifp; ifa_free(ifa); } if ((vifcp->vifc_flags & VIFF_TUNNEL) != 0) { CTR1(KTR_IPMF, "%s: tunnels are no longer supported", __func__); VIF_UNLOCK(); return EOPNOTSUPP; } else if (vifcp->vifc_flags & VIFF_REGISTER) { ifp = &V_multicast_register_if; CTR2(KTR_IPMF, "%s: add register vif for ifp %p", __func__, ifp); if (V_reg_vif_num == VIFI_INVALID) { if_initname(&V_multicast_register_if, "register_vif", 0); V_multicast_register_if.if_flags = IFF_LOOPBACK; V_reg_vif_num = vifcp->vifc_vifi; } } else { /* Make sure the interface supports multicast */ if ((ifp->if_flags & IFF_MULTICAST) == 0) { VIF_UNLOCK(); return EOPNOTSUPP; } /* Enable promiscuous reception of all IP multicasts from the if */ error = if_allmulti(ifp, 1); if (error) { VIF_UNLOCK(); return error; } } vifp->v_flags = vifcp->vifc_flags; vifp->v_threshold = vifcp->vifc_threshold; vifp->v_lcl_addr = vifcp->vifc_lcl_addr; vifp->v_rmt_addr = vifcp->vifc_rmt_addr; vifp->v_ifp = ifp; /* initialize per vif pkt counters */ vifp->v_pkt_in = 0; vifp->v_pkt_out = 0; vifp->v_bytes_in = 0; vifp->v_bytes_out = 0; /* Adjust numvifs up if the vifi is higher than numvifs */ if (V_numvifs <= vifcp->vifc_vifi) V_numvifs = vifcp->vifc_vifi + 1; VIF_UNLOCK(); CTR4(KTR_IPMF, "%s: add vif %d laddr %s thresh %x", __func__, (int)vifcp->vifc_vifi, inet_ntoa(vifcp->vifc_lcl_addr), (int)vifcp->vifc_threshold); return 0; } /* * Delete a vif from the vif table */ static int del_vif_locked(vifi_t vifi) { struct vif *vifp; VIF_LOCK_ASSERT(); if (vifi >= V_numvifs) { return EINVAL; } vifp = &V_viftable[vifi]; if (in_nullhost(vifp->v_lcl_addr)) { return EADDRNOTAVAIL; } if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) if_allmulti(vifp->v_ifp, 0); if (vifp->v_flags & VIFF_REGISTER) V_reg_vif_num = VIFI_INVALID; bzero((caddr_t)vifp, sizeof (*vifp)); CTR2(KTR_IPMF, "%s: delete vif %d", __func__, (int)vifi); /* Adjust numvifs down */ for (vifi = V_numvifs; vifi > 0; vifi--) if (!in_nullhost(V_viftable[vifi-1].v_lcl_addr)) break; V_numvifs = vifi; return 0; } static int del_vif(vifi_t vifi) { int cc; VIF_LOCK(); cc = del_vif_locked(vifi); VIF_UNLOCK(); return cc; } /* * update an mfc entry without resetting counters and S,G addresses. */ static void update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp) { int i; rt->mfc_parent = mfccp->mfcc_parent; for (i = 0; i < V_numvifs; i++) { rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; rt->mfc_flags[i] = mfccp->mfcc_flags[i] & V_mrt_api_config & MRT_MFC_FLAGS_ALL; } /* set the RP address */ if (V_mrt_api_config & MRT_MFC_RP) rt->mfc_rp = mfccp->mfcc_rp; else rt->mfc_rp.s_addr = INADDR_ANY; } /* * fully initialize an mfc entry from the parameter. */ static void init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp) { rt->mfc_origin = mfccp->mfcc_origin; rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; update_mfc_params(rt, mfccp); /* initialize pkt counters per src-grp */ rt->mfc_pkt_cnt = 0; rt->mfc_byte_cnt = 0; rt->mfc_wrong_if = 0; timevalclear(&rt->mfc_last_assert); } static void expire_mfc(struct mfc *rt) { struct rtdetq *rte, *nrte; MFC_LOCK_ASSERT(); free_bw_list(rt->mfc_bw_meter); TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) { m_freem(rte->m); TAILQ_REMOVE(&rt->mfc_stall, rte, rte_link); free(rte, M_MRTABLE); } LIST_REMOVE(rt, mfc_hash); free(rt, M_MRTABLE); } /* * Add an mfc entry */ static int add_mfc(struct mfcctl2 *mfccp) { struct mfc *rt; struct rtdetq *rte, *nrte; u_long hash = 0; u_short nstl; VIF_LOCK(); MFC_LOCK(); rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp); /* If an entry already exists, just update the fields */ if (rt) { CTR4(KTR_IPMF, "%s: update mfc orig %s group %lx parent %x", __func__, inet_ntoa(mfccp->mfcc_origin), (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), mfccp->mfcc_parent); update_mfc_params(rt, mfccp); MFC_UNLOCK(); VIF_UNLOCK(); return (0); } /* * Find the entry for which the upcall was made and update */ nstl = 0; hash = MFCHASH(mfccp->mfcc_origin, mfccp->mfcc_mcastgrp); LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) { if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) && in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp) && !TAILQ_EMPTY(&rt->mfc_stall)) { CTR5(KTR_IPMF, "%s: add mfc orig %s group %lx parent %x qh %p", __func__, inet_ntoa(mfccp->mfcc_origin), (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), mfccp->mfcc_parent, TAILQ_FIRST(&rt->mfc_stall)); if (nstl++) CTR1(KTR_IPMF, "%s: multiple matches", __func__); init_mfc_params(rt, mfccp); rt->mfc_expire = 0; /* Don't clean this guy up */ V_nexpire[hash]--; /* Free queued packets, but attempt to forward them first. */ TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) { if (rte->ifp != NULL) ip_mdq(rte->m, rte->ifp, rt, -1); m_freem(rte->m); TAILQ_REMOVE(&rt->mfc_stall, rte, rte_link); rt->mfc_nstall--; free(rte, M_MRTABLE); } } } /* * It is possible that an entry is being inserted without an upcall */ if (nstl == 0) { CTR1(KTR_IPMF, "%s: adding mfc w/o upcall", __func__); LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) { if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) && in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp)) { init_mfc_params(rt, mfccp); if (rt->mfc_expire) V_nexpire[hash]--; rt->mfc_expire = 0; break; /* XXX */ } } if (rt == NULL) { /* no upcall, so make a new entry */ rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); if (rt == NULL) { MFC_UNLOCK(); VIF_UNLOCK(); return (ENOBUFS); } init_mfc_params(rt, mfccp); TAILQ_INIT(&rt->mfc_stall); rt->mfc_nstall = 0; rt->mfc_expire = 0; rt->mfc_bw_meter = NULL; /* insert new entry at head of hash chain */ LIST_INSERT_HEAD(&V_mfchashtbl[hash], rt, mfc_hash); } } MFC_UNLOCK(); VIF_UNLOCK(); return (0); } /* * Delete an mfc entry */ static int del_mfc(struct mfcctl2 *mfccp) { struct in_addr origin; struct in_addr mcastgrp; struct mfc *rt; origin = mfccp->mfcc_origin; mcastgrp = mfccp->mfcc_mcastgrp; CTR3(KTR_IPMF, "%s: delete mfc orig %s group %lx", __func__, inet_ntoa(origin), (u_long)ntohl(mcastgrp.s_addr)); MFC_LOCK(); rt = mfc_find(&origin, &mcastgrp); if (rt == NULL) { MFC_UNLOCK(); return EADDRNOTAVAIL; } /* * free the bw_meter entries */ free_bw_list(rt->mfc_bw_meter); rt->mfc_bw_meter = NULL; LIST_REMOVE(rt, mfc_hash); free(rt, M_MRTABLE); MFC_UNLOCK(); return (0); } /* * Send a message to the routing daemon on the multicast routing socket. */ static int socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src) { if (s) { SOCKBUF_LOCK(&s->so_rcv); if (sbappendaddr_locked(&s->so_rcv, (struct sockaddr *)src, mm, NULL) != 0) { sorwakeup_locked(s); return 0; } SOCKBUF_UNLOCK(&s->so_rcv); } m_freem(mm); return -1; } /* * IP multicast forwarding function. This function assumes that the packet * pointed to by "ip" has arrived on (or is about to be sent to) the interface * pointed to by "ifp", and the packet is to be relayed to other networks * that have members of the packet's destination IP multicast group. * * The packet is returned unscathed to the caller, unless it is * erroneous, in which case a non-zero return value tells the caller to * discard it. */ #define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */ static int X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, struct ip_moptions *imo) { struct mfc *rt; int error; vifi_t vifi; CTR3(KTR_IPMF, "ip_mforward: delete mfc orig %s group %lx ifp %p", inet_ntoa(ip->ip_src), (u_long)ntohl(ip->ip_dst.s_addr), ifp); if (ip->ip_hl < (sizeof(struct ip) + TUNNEL_LEN) >> 2 || ((u_char *)(ip + 1))[1] != IPOPT_LSRR ) { /* * Packet arrived via a physical interface or * an encapsulated tunnel or a register_vif. */ } else { /* * Packet arrived through a source-route tunnel. * Source-route tunnels are no longer supported. */ return (1); } VIF_LOCK(); MFC_LOCK(); if (imo && ((vifi = imo->imo_multicast_vif) < V_numvifs)) { if (ip->ip_ttl < MAXTTL) ip->ip_ttl++; /* compensate for -1 in *_send routines */ error = ip_mdq(m, ifp, NULL, vifi); MFC_UNLOCK(); VIF_UNLOCK(); return error; } /* * Don't forward a packet with time-to-live of zero or one, * or a packet destined to a local-only group. */ if (ip->ip_ttl <= 1 || IN_LOCAL_GROUP(ntohl(ip->ip_dst.s_addr))) { MFC_UNLOCK(); VIF_UNLOCK(); return 0; } /* * Determine forwarding vifs from the forwarding cache table */ MRTSTAT_INC(mrts_mfc_lookups); rt = mfc_find(&ip->ip_src, &ip->ip_dst); /* Entry exists, so forward if necessary */ if (rt != NULL) { error = ip_mdq(m, ifp, rt, -1); MFC_UNLOCK(); VIF_UNLOCK(); return error; } else { /* * If we don't have a route for packet's origin, * Make a copy of the packet & send message to routing daemon */ struct mbuf *mb0; struct rtdetq *rte; u_long hash; int hlen = ip->ip_hl << 2; MRTSTAT_INC(mrts_mfc_misses); MRTSTAT_INC(mrts_no_route); CTR2(KTR_IPMF, "ip_mforward: no mfc for (%s,%lx)", inet_ntoa(ip->ip_src), (u_long)ntohl(ip->ip_dst.s_addr)); /* * Allocate mbufs early so that we don't do extra work if we are * just going to fail anyway. Make sure to pullup the header so * that other people can't step on it. */ rte = (struct rtdetq *)malloc((sizeof *rte), M_MRTABLE, M_NOWAIT|M_ZERO); if (rte == NULL) { MFC_UNLOCK(); VIF_UNLOCK(); return ENOBUFS; } mb0 = m_copypacket(m, M_NOWAIT); if (mb0 && (!M_WRITABLE(mb0) || mb0->m_len < hlen)) mb0 = m_pullup(mb0, hlen); if (mb0 == NULL) { free(rte, M_MRTABLE); MFC_UNLOCK(); VIF_UNLOCK(); return ENOBUFS; } /* is there an upcall waiting for this flow ? */ hash = MFCHASH(ip->ip_src, ip->ip_dst); LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) { if (in_hosteq(ip->ip_src, rt->mfc_origin) && in_hosteq(ip->ip_dst, rt->mfc_mcastgrp) && !TAILQ_EMPTY(&rt->mfc_stall)) break; } if (rt == NULL) { int i; struct igmpmsg *im; struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; struct mbuf *mm; /* * Locate the vifi for the incoming interface for this packet. * If none found, drop packet. */ for (vifi = 0; vifi < V_numvifs && V_viftable[vifi].v_ifp != ifp; vifi++) ; if (vifi >= V_numvifs) /* vif not found, drop packet */ goto non_fatal; /* no upcall, so make a new entry */ rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); if (rt == NULL) goto fail; /* Make a copy of the header to send to the user level process */ mm = m_copy(mb0, 0, hlen); if (mm == NULL) goto fail1; /* * Send message to routing daemon to install * a route into the kernel table */ im = mtod(mm, struct igmpmsg *); im->im_msgtype = IGMPMSG_NOCACHE; im->im_mbz = 0; im->im_vif = vifi; MRTSTAT_INC(mrts_upcalls); k_igmpsrc.sin_addr = ip->ip_src; if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) { CTR0(KTR_IPMF, "ip_mforward: socket queue full"); MRTSTAT_INC(mrts_upq_sockfull); fail1: free(rt, M_MRTABLE); fail: free(rte, M_MRTABLE); m_freem(mb0); MFC_UNLOCK(); VIF_UNLOCK(); return ENOBUFS; } /* insert new entry at head of hash chain */ rt->mfc_origin.s_addr = ip->ip_src.s_addr; rt->mfc_mcastgrp.s_addr = ip->ip_dst.s_addr; rt->mfc_expire = UPCALL_EXPIRE; V_nexpire[hash]++; for (i = 0; i < V_numvifs; i++) { rt->mfc_ttls[i] = 0; rt->mfc_flags[i] = 0; } rt->mfc_parent = -1; /* clear the RP address */ rt->mfc_rp.s_addr = INADDR_ANY; rt->mfc_bw_meter = NULL; /* initialize pkt counters per src-grp */ rt->mfc_pkt_cnt = 0; rt->mfc_byte_cnt = 0; rt->mfc_wrong_if = 0; timevalclear(&rt->mfc_last_assert); TAILQ_INIT(&rt->mfc_stall); rt->mfc_nstall = 0; /* link into table */ LIST_INSERT_HEAD(&V_mfchashtbl[hash], rt, mfc_hash); TAILQ_INSERT_HEAD(&rt->mfc_stall, rte, rte_link); rt->mfc_nstall++; } else { /* determine if queue has overflowed */ if (rt->mfc_nstall > MAX_UPQ) { MRTSTAT_INC(mrts_upq_ovflw); non_fatal: free(rte, M_MRTABLE); m_freem(mb0); MFC_UNLOCK(); VIF_UNLOCK(); return (0); } TAILQ_INSERT_TAIL(&rt->mfc_stall, rte, rte_link); rt->mfc_nstall++; } rte->m = mb0; rte->ifp = ifp; MFC_UNLOCK(); VIF_UNLOCK(); return 0; } } /* * Clean up the cache entry if upcall is not serviced */ static void expire_upcalls(void *arg) { u_long i; CURVNET_SET((struct vnet *) arg); MFC_LOCK(); for (i = 0; i < mfchashsize; i++) { struct mfc *rt, *nrt; if (V_nexpire[i] == 0) continue; LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) { if (TAILQ_EMPTY(&rt->mfc_stall)) continue; if (rt->mfc_expire == 0 || --rt->mfc_expire > 0) continue; /* * free the bw_meter entries */ while (rt->mfc_bw_meter != NULL) { struct bw_meter *x = rt->mfc_bw_meter; rt->mfc_bw_meter = x->bm_mfc_next; free(x, M_BWMETER); } MRTSTAT_INC(mrts_cache_cleanups); CTR3(KTR_IPMF, "%s: expire (%lx, %lx)", __func__, (u_long)ntohl(rt->mfc_origin.s_addr), (u_long)ntohl(rt->mfc_mcastgrp.s_addr)); expire_mfc(rt); } } MFC_UNLOCK(); callout_reset(&V_expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, curvnet); CURVNET_RESTORE(); } /* * Packet forwarding routine once entry in the cache is made */ static int ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif) { struct ip *ip = mtod(m, struct ip *); vifi_t vifi; int plen = ntohs(ip->ip_len); VIF_LOCK_ASSERT(); /* * If xmt_vif is not -1, send on only the requested vif. * * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs.) */ if (xmt_vif < V_numvifs) { if (V_viftable[xmt_vif].v_flags & VIFF_REGISTER) pim_register_send(ip, V_viftable + xmt_vif, m, rt); else phyint_send(ip, V_viftable + xmt_vif, m); return 1; } /* * Don't forward if it didn't arrive from the parent vif for its origin. */ vifi = rt->mfc_parent; if ((vifi >= V_numvifs) || (V_viftable[vifi].v_ifp != ifp)) { CTR4(KTR_IPMF, "%s: rx on wrong ifp %p (vifi %d, v_ifp %p)", __func__, ifp, (int)vifi, V_viftable[vifi].v_ifp); MRTSTAT_INC(mrts_wrong_if); ++rt->mfc_wrong_if; /* * If we are doing PIM assert processing, send a message * to the routing daemon. * * XXX: A PIM-SM router needs the WRONGVIF detection so it * can complete the SPT switch, regardless of the type * of the iif (broadcast media, GRE tunnel, etc). */ if (V_pim_assert_enabled && (vifi < V_numvifs) && V_viftable[vifi].v_ifp) { if (ifp == &V_multicast_register_if) PIMSTAT_INC(pims_rcv_registers_wrongiif); /* Get vifi for the incoming packet */ for (vifi = 0; vifi < V_numvifs && V_viftable[vifi].v_ifp != ifp; vifi++) ; if (vifi >= V_numvifs) return 0; /* The iif is not found: ignore the packet. */ if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_DISABLE_WRONGVIF) return 0; /* WRONGVIF disabled: ignore the packet */ if (ratecheck(&rt->mfc_last_assert, &pim_assert_interval)) { struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; struct igmpmsg *im; int hlen = ip->ip_hl << 2; struct mbuf *mm = m_copy(m, 0, hlen); if (mm && (!M_WRITABLE(mm) || mm->m_len < hlen)) mm = m_pullup(mm, hlen); if (mm == NULL) return ENOBUFS; im = mtod(mm, struct igmpmsg *); im->im_msgtype = IGMPMSG_WRONGVIF; im->im_mbz = 0; im->im_vif = vifi; MRTSTAT_INC(mrts_upcalls); k_igmpsrc.sin_addr = im->im_src; if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) { CTR1(KTR_IPMF, "%s: socket queue full", __func__); MRTSTAT_INC(mrts_upq_sockfull); return ENOBUFS; } } } return 0; } /* If I sourced this packet, it counts as output, else it was input. */ if (in_hosteq(ip->ip_src, V_viftable[vifi].v_lcl_addr)) { V_viftable[vifi].v_pkt_out++; V_viftable[vifi].v_bytes_out += plen; } else { V_viftable[vifi].v_pkt_in++; V_viftable[vifi].v_bytes_in += plen; } rt->mfc_pkt_cnt++; rt->mfc_byte_cnt += plen; /* * For each vif, decide if a copy of the packet should be forwarded. * Forward if: * - the ttl exceeds the vif's threshold * - there are group members downstream on interface */ for (vifi = 0; vifi < V_numvifs; vifi++) if ((rt->mfc_ttls[vifi] > 0) && (ip->ip_ttl > rt->mfc_ttls[vifi])) { V_viftable[vifi].v_pkt_out++; V_viftable[vifi].v_bytes_out += plen; if (V_viftable[vifi].v_flags & VIFF_REGISTER) pim_register_send(ip, V_viftable + vifi, m, rt); else phyint_send(ip, V_viftable + vifi, m); } /* * Perform upcall-related bw measuring. */ if (rt->mfc_bw_meter != NULL) { struct bw_meter *x; struct timeval now; microtime(&now); MFC_LOCK_ASSERT(); for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) bw_meter_receive_packet(x, plen, &now); } return 0; } /* * Check if a vif number is legal/ok. This is used by in_mcast.c. */ static int X_legal_vif_num(int vif) { int ret; ret = 0; if (vif < 0) return (ret); VIF_LOCK(); if (vif < V_numvifs) ret = 1; VIF_UNLOCK(); return (ret); } /* * Return the local address used by this vif */ static u_long X_ip_mcast_src(int vifi) { in_addr_t addr; addr = INADDR_ANY; if (vifi < 0) return (addr); VIF_LOCK(); if (vifi < V_numvifs) addr = V_viftable[vifi].v_lcl_addr.s_addr; VIF_UNLOCK(); return (addr); } static void phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m) { struct mbuf *mb_copy; int hlen = ip->ip_hl << 2; VIF_LOCK_ASSERT(); /* * Make a new reference to the packet; make sure that * the IP header is actually copied, not just referenced, * so that ip_output() only scribbles on the copy. */ mb_copy = m_copypacket(m, M_NOWAIT); if (mb_copy && (!M_WRITABLE(mb_copy) || mb_copy->m_len < hlen)) mb_copy = m_pullup(mb_copy, hlen); if (mb_copy == NULL) return; send_packet(vifp, mb_copy); } static void send_packet(struct vif *vifp, struct mbuf *m) { struct ip_moptions imo; struct in_multi *imm[2]; int error; VIF_LOCK_ASSERT(); imo.imo_multicast_ifp = vifp->v_ifp; imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1; imo.imo_multicast_loop = 1; imo.imo_multicast_vif = -1; imo.imo_num_memberships = 0; imo.imo_max_memberships = 2; imo.imo_membership = &imm[0]; /* * Re-entrancy should not be a problem here, because * the packets that we send out and are looped back at us * should get rejected because they appear to come from * the loopback interface, thus preventing looping. */ error = ip_output(m, NULL, NULL, IP_FORWARDING, &imo, NULL); CTR3(KTR_IPMF, "%s: vif %td err %d", __func__, (ptrdiff_t)(vifp - V_viftable), error); } /* * Stubs for old RSVP socket shim implementation. */ static int X_ip_rsvp_vif(struct socket *so __unused, struct sockopt *sopt __unused) { return (EOPNOTSUPP); } static void X_ip_rsvp_force_done(struct socket *so __unused) { } static int X_rsvp_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m; m = *mp; *mp = NULL; if (!V_rsvp_on) m_freem(m); return (IPPROTO_DONE); } /* * Code for bandwidth monitors */ /* * Define common interface for timeval-related methods */ #define BW_TIMEVALCMP(tvp, uvp, cmp) timevalcmp((tvp), (uvp), cmp) #define BW_TIMEVALDECR(vvp, uvp) timevalsub((vvp), (uvp)) #define BW_TIMEVALADD(vvp, uvp) timevaladd((vvp), (uvp)) static uint32_t compute_bw_meter_flags(struct bw_upcall *req) { uint32_t flags = 0; if (req->bu_flags & BW_UPCALL_UNIT_PACKETS) flags |= BW_METER_UNIT_PACKETS; if (req->bu_flags & BW_UPCALL_UNIT_BYTES) flags |= BW_METER_UNIT_BYTES; if (req->bu_flags & BW_UPCALL_GEQ) flags |= BW_METER_GEQ; if (req->bu_flags & BW_UPCALL_LEQ) flags |= BW_METER_LEQ; return flags; } /* * Add a bw_meter entry */ static int add_bw_upcall(struct bw_upcall *req) { struct mfc *mfc; struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC, BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC }; struct timeval now; struct bw_meter *x; uint32_t flags; if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL)) return EOPNOTSUPP; /* Test if the flags are valid */ if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES))) return EINVAL; if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))) return EINVAL; if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) == (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) return EINVAL; /* Test if the threshold time interval is valid */ if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <)) return EINVAL; flags = compute_bw_meter_flags(req); /* * Find if we have already same bw_meter entry */ MFC_LOCK(); mfc = mfc_find(&req->bu_src, &req->bu_dst); if (mfc == NULL) { MFC_UNLOCK(); return EADDRNOTAVAIL; } for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) { if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, &req->bu_threshold.b_time, ==)) && (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && (x->bm_flags & BW_METER_USER_FLAGS) == flags) { MFC_UNLOCK(); return 0; /* XXX Already installed */ } } /* Allocate the new bw_meter entry */ x = (struct bw_meter *)malloc(sizeof(*x), M_BWMETER, M_NOWAIT); if (x == NULL) { MFC_UNLOCK(); return ENOBUFS; } /* Set the new bw_meter entry */ x->bm_threshold.b_time = req->bu_threshold.b_time; microtime(&now); x->bm_start_time = now; x->bm_threshold.b_packets = req->bu_threshold.b_packets; x->bm_threshold.b_bytes = req->bu_threshold.b_bytes; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags = flags; x->bm_time_next = NULL; x->bm_time_hash = BW_METER_BUCKETS; /* Add the new bw_meter entry to the front of entries for this MFC */ x->bm_mfc = mfc; x->bm_mfc_next = mfc->mfc_bw_meter; mfc->mfc_bw_meter = x; schedule_bw_meter(x, &now); MFC_UNLOCK(); return 0; } static void free_bw_list(struct bw_meter *list) { while (list != NULL) { struct bw_meter *x = list; list = list->bm_mfc_next; unschedule_bw_meter(x); free(x, M_BWMETER); } } /* * Delete one or multiple bw_meter entries */ static int del_bw_upcall(struct bw_upcall *req) { struct mfc *mfc; struct bw_meter *x; if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL)) return EOPNOTSUPP; MFC_LOCK(); /* Find the corresponding MFC entry */ mfc = mfc_find(&req->bu_src, &req->bu_dst); if (mfc == NULL) { MFC_UNLOCK(); return EADDRNOTAVAIL; } else if (req->bu_flags & BW_UPCALL_DELETE_ALL) { /* * Delete all bw_meter entries for this mfc */ struct bw_meter *list; list = mfc->mfc_bw_meter; mfc->mfc_bw_meter = NULL; free_bw_list(list); MFC_UNLOCK(); return 0; } else { /* Delete a single bw_meter entry */ struct bw_meter *prev; uint32_t flags = 0; flags = compute_bw_meter_flags(req); /* Find the bw_meter entry to delete */ for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL; prev = x, x = x->bm_mfc_next) { if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, &req->bu_threshold.b_time, ==)) && (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && (x->bm_flags & BW_METER_USER_FLAGS) == flags) break; } if (x != NULL) { /* Delete entry from the list for this MFC */ if (prev != NULL) prev->bm_mfc_next = x->bm_mfc_next; /* remove from middle*/ else x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;/* new head of list */ unschedule_bw_meter(x); MFC_UNLOCK(); /* Free the bw_meter entry */ free(x, M_BWMETER); return 0; } else { MFC_UNLOCK(); return EINVAL; } } /* NOTREACHED */ } /* * Perform bandwidth measurement processing that may result in an upcall */ static void bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp) { struct timeval delta; MFC_LOCK_ASSERT(); delta = *nowp; BW_TIMEVALDECR(&delta, &x->bm_start_time); if (x->bm_flags & BW_METER_GEQ) { /* * Processing for ">=" type of bw_meter entry */ if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { /* Reset the bw_meter entry */ x->bm_start_time = *nowp; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; } /* Record that a packet is received */ x->bm_measured.b_packets++; x->bm_measured.b_bytes += plen; /* * Test if we should deliver an upcall */ if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) { if (((x->bm_flags & BW_METER_UNIT_PACKETS) && (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) || ((x->bm_flags & BW_METER_UNIT_BYTES) && (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) { /* Prepare an upcall for delivery */ bw_meter_prepare_upcall(x, nowp); x->bm_flags |= BW_METER_UPCALL_DELIVERED; } } } else if (x->bm_flags & BW_METER_LEQ) { /* * Processing for "<=" type of bw_meter entry */ if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { /* * We are behind time with the multicast forwarding table * scanning for "<=" type of bw_meter entries, so test now * if we should deliver an upcall. */ if (((x->bm_flags & BW_METER_UNIT_PACKETS) && (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || ((x->bm_flags & BW_METER_UNIT_BYTES) && (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { /* Prepare an upcall for delivery */ bw_meter_prepare_upcall(x, nowp); } /* Reschedule the bw_meter entry */ unschedule_bw_meter(x); schedule_bw_meter(x, nowp); } /* Record that a packet is received */ x->bm_measured.b_packets++; x->bm_measured.b_bytes += plen; /* * Test if we should restart the measuring interval */ if ((x->bm_flags & BW_METER_UNIT_PACKETS && x->bm_measured.b_packets <= x->bm_threshold.b_packets) || (x->bm_flags & BW_METER_UNIT_BYTES && x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) { /* Don't restart the measuring interval */ } else { /* Do restart the measuring interval */ /* * XXX: note that we don't unschedule and schedule, because this * might be too much overhead per packet. Instead, when we process * all entries for a given timer hash bin, we check whether it is * really a timeout. If not, we reschedule at that time. */ x->bm_start_time = *nowp; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; } } } /* * Prepare a bandwidth-related upcall */ static void bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp) { struct timeval delta; struct bw_upcall *u; MFC_LOCK_ASSERT(); /* * Compute the measured time interval */ delta = *nowp; BW_TIMEVALDECR(&delta, &x->bm_start_time); /* * If there are too many pending upcalls, deliver them now */ if (V_bw_upcalls_n >= BW_UPCALLS_MAX) bw_upcalls_send(); /* * Set the bw_upcall entry */ u = &V_bw_upcalls[V_bw_upcalls_n++]; u->bu_src = x->bm_mfc->mfc_origin; u->bu_dst = x->bm_mfc->mfc_mcastgrp; u->bu_threshold.b_time = x->bm_threshold.b_time; u->bu_threshold.b_packets = x->bm_threshold.b_packets; u->bu_threshold.b_bytes = x->bm_threshold.b_bytes; u->bu_measured.b_time = delta; u->bu_measured.b_packets = x->bm_measured.b_packets; u->bu_measured.b_bytes = x->bm_measured.b_bytes; u->bu_flags = 0; if (x->bm_flags & BW_METER_UNIT_PACKETS) u->bu_flags |= BW_UPCALL_UNIT_PACKETS; if (x->bm_flags & BW_METER_UNIT_BYTES) u->bu_flags |= BW_UPCALL_UNIT_BYTES; if (x->bm_flags & BW_METER_GEQ) u->bu_flags |= BW_UPCALL_GEQ; if (x->bm_flags & BW_METER_LEQ) u->bu_flags |= BW_UPCALL_LEQ; } /* * Send the pending bandwidth-related upcalls */ static void bw_upcalls_send(void) { struct mbuf *m; int len = V_bw_upcalls_n * sizeof(V_bw_upcalls[0]); struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; static struct igmpmsg igmpmsg = { 0, /* unused1 */ 0, /* unused2 */ IGMPMSG_BW_UPCALL,/* im_msgtype */ 0, /* im_mbz */ 0, /* im_vif */ 0, /* unused3 */ { 0 }, /* im_src */ { 0 } }; /* im_dst */ MFC_LOCK_ASSERT(); if (V_bw_upcalls_n == 0) return; /* No pending upcalls */ V_bw_upcalls_n = 0; /* * Allocate a new mbuf, initialize it with the header and * the payload for the pending calls. */ m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n"); return; } m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg); m_copyback(m, sizeof(struct igmpmsg), len, (caddr_t)&V_bw_upcalls[0]); /* * Send the upcalls * XXX do we need to set the address in k_igmpsrc ? */ MRTSTAT_INC(mrts_upcalls); if (socket_send(V_ip_mrouter, m, &k_igmpsrc) < 0) { log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n"); MRTSTAT_INC(mrts_upq_sockfull); } } /* * Compute the timeout hash value for the bw_meter entries */ #define BW_METER_TIMEHASH(bw_meter, hash) \ do { \ struct timeval next_timeval = (bw_meter)->bm_start_time; \ \ BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \ (hash) = next_timeval.tv_sec; \ if (next_timeval.tv_usec) \ (hash)++; /* XXX: make sure we don't timeout early */ \ (hash) %= BW_METER_BUCKETS; \ } while (0) /* * Schedule a timer to process periodically bw_meter entry of type "<=" * by linking the entry in the proper hash bucket. */ static void schedule_bw_meter(struct bw_meter *x, struct timeval *nowp) { int time_hash; MFC_LOCK_ASSERT(); if (!(x->bm_flags & BW_METER_LEQ)) return; /* XXX: we schedule timers only for "<=" entries */ /* * Reset the bw_meter entry */ x->bm_start_time = *nowp; x->bm_measured.b_packets = 0; x->bm_measured.b_bytes = 0; x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; /* * Compute the timeout hash value and insert the entry */ BW_METER_TIMEHASH(x, time_hash); x->bm_time_next = V_bw_meter_timers[time_hash]; V_bw_meter_timers[time_hash] = x; x->bm_time_hash = time_hash; } /* * Unschedule the periodic timer that processes bw_meter entry of type "<=" * by removing the entry from the proper hash bucket. */ static void unschedule_bw_meter(struct bw_meter *x) { int time_hash; struct bw_meter *prev, *tmp; MFC_LOCK_ASSERT(); if (!(x->bm_flags & BW_METER_LEQ)) return; /* XXX: we schedule timers only for "<=" entries */ /* * Compute the timeout hash value and delete the entry */ time_hash = x->bm_time_hash; if (time_hash >= BW_METER_BUCKETS) return; /* Entry was not scheduled */ for (prev = NULL, tmp = V_bw_meter_timers[time_hash]; tmp != NULL; prev = tmp, tmp = tmp->bm_time_next) if (tmp == x) break; if (tmp == NULL) panic("unschedule_bw_meter: bw_meter entry not found"); if (prev != NULL) prev->bm_time_next = x->bm_time_next; else V_bw_meter_timers[time_hash] = x->bm_time_next; x->bm_time_next = NULL; x->bm_time_hash = BW_METER_BUCKETS; } /* * Process all "<=" type of bw_meter that should be processed now, * and for each entry prepare an upcall if necessary. Each processed * entry is rescheduled again for the (periodic) processing. * * This is run periodically (once per second normally). On each round, * all the potentially matching entries are in the hash slot that we are * looking at. */ static void bw_meter_process() { uint32_t loops; int i; struct timeval now, process_endtime; microtime(&now); if (V_last_tv_sec == now.tv_sec) return; /* nothing to do */ loops = now.tv_sec - V_last_tv_sec; V_last_tv_sec = now.tv_sec; if (loops > BW_METER_BUCKETS) loops = BW_METER_BUCKETS; MFC_LOCK(); /* * Process all bins of bw_meter entries from the one after the last * processed to the current one. On entry, i points to the last bucket * visited, so we need to increment i at the beginning of the loop. */ for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) { struct bw_meter *x, *tmp_list; if (++i >= BW_METER_BUCKETS) i = 0; /* Disconnect the list of bw_meter entries from the bin */ tmp_list = V_bw_meter_timers[i]; V_bw_meter_timers[i] = NULL; /* Process the list of bw_meter entries */ while (tmp_list != NULL) { x = tmp_list; tmp_list = tmp_list->bm_time_next; /* Test if the time interval is over */ process_endtime = x->bm_start_time; BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time); if (BW_TIMEVALCMP(&process_endtime, &now, >)) { /* Not yet: reschedule, but don't reset */ int time_hash; BW_METER_TIMEHASH(x, time_hash); if (time_hash == i && process_endtime.tv_sec == now.tv_sec) { /* * XXX: somehow the bin processing is a bit ahead of time. * Put the entry in the next bin. */ if (++time_hash >= BW_METER_BUCKETS) time_hash = 0; } x->bm_time_next = V_bw_meter_timers[time_hash]; V_bw_meter_timers[time_hash] = x; x->bm_time_hash = time_hash; continue; } /* * Test if we should deliver an upcall */ if (((x->bm_flags & BW_METER_UNIT_PACKETS) && (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || ((x->bm_flags & BW_METER_UNIT_BYTES) && (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { /* Prepare an upcall for delivery */ bw_meter_prepare_upcall(x, &now); } /* * Reschedule for next processing */ schedule_bw_meter(x, &now); } } /* Send all upcalls that are pending delivery */ bw_upcalls_send(); MFC_UNLOCK(); } /* * A periodic function for sending all upcalls that are pending delivery */ static void expire_bw_upcalls_send(void *arg) { CURVNET_SET((struct vnet *) arg); MFC_LOCK(); bw_upcalls_send(); MFC_UNLOCK(); callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send, curvnet); CURVNET_RESTORE(); } /* * A periodic function for periodic scanning of the multicast forwarding * table for processing all "<=" bw_meter entries. */ static void expire_bw_meter_process(void *arg) { CURVNET_SET((struct vnet *) arg); if (V_mrt_api_config & MRT_MFC_BW_UPCALL) bw_meter_process(); callout_reset(&V_bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, curvnet); CURVNET_RESTORE(); } /* * End of bandwidth monitoring code */ /* * Send the packet up to the user daemon, or eventually do kernel encapsulation * */ static int pim_register_send(struct ip *ip, struct vif *vifp, struct mbuf *m, struct mfc *rt) { struct mbuf *mb_copy, *mm; /* * Do not send IGMP_WHOLEPKT notifications to userland, if the * rendezvous point was unspecified, and we were told not to. */ if (pim_squelch_wholepkt != 0 && (V_mrt_api_config & MRT_MFC_RP) && in_nullhost(rt->mfc_rp)) return 0; mb_copy = pim_register_prepare(ip, m); if (mb_copy == NULL) return ENOBUFS; /* * Send all the fragments. Note that the mbuf for each fragment * is freed by the sending machinery. */ for (mm = mb_copy; mm; mm = mb_copy) { mb_copy = mm->m_nextpkt; mm->m_nextpkt = 0; mm = m_pullup(mm, sizeof(struct ip)); if (mm != NULL) { ip = mtod(mm, struct ip *); if ((V_mrt_api_config & MRT_MFC_RP) && !in_nullhost(rt->mfc_rp)) { pim_register_send_rp(ip, vifp, mm, rt); } else { pim_register_send_upcall(ip, vifp, mm, rt); } } } return 0; } /* * Return a copy of the data packet that is ready for PIM Register * encapsulation. * XXX: Note that in the returned copy the IP header is a valid one. */ static struct mbuf * pim_register_prepare(struct ip *ip, struct mbuf *m) { struct mbuf *mb_copy = NULL; int mtu; /* Take care of delayed checksums */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } /* * Copy the old packet & pullup its IP header into the * new mbuf so we can modify it. */ mb_copy = m_copypacket(m, M_NOWAIT); if (mb_copy == NULL) return NULL; mb_copy = m_pullup(mb_copy, ip->ip_hl << 2); if (mb_copy == NULL) return NULL; /* take care of the TTL */ ip = mtod(mb_copy, struct ip *); --ip->ip_ttl; /* Compute the MTU after the PIM Register encapsulation */ mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr); if (ntohs(ip->ip_len) <= mtu) { /* Turn the IP header into a valid one */ ip->ip_sum = 0; ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2); } else { /* Fragment the packet */ mb_copy->m_pkthdr.csum_flags |= CSUM_IP; if (ip_fragment(ip, &mb_copy, mtu, 0) != 0) { m_freem(mb_copy); return NULL; } } return mb_copy; } /* * Send an upcall with the data packet to the user-level process. */ static int pim_register_send_upcall(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy, struct mfc *rt) { struct mbuf *mb_first; int len = ntohs(ip->ip_len); struct igmpmsg *im; struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; VIF_LOCK_ASSERT(); /* * Add a new mbuf with an upcall header */ mb_first = m_gethdr(M_NOWAIT, MT_DATA); if (mb_first == NULL) { m_freem(mb_copy); return ENOBUFS; } mb_first->m_data += max_linkhdr; mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg); mb_first->m_len = sizeof(struct igmpmsg); mb_first->m_next = mb_copy; /* Send message to routing daemon */ im = mtod(mb_first, struct igmpmsg *); im->im_msgtype = IGMPMSG_WHOLEPKT; im->im_mbz = 0; im->im_vif = vifp - V_viftable; im->im_src = ip->ip_src; im->im_dst = ip->ip_dst; k_igmpsrc.sin_addr = ip->ip_src; MRTSTAT_INC(mrts_upcalls); if (socket_send(V_ip_mrouter, mb_first, &k_igmpsrc) < 0) { CTR1(KTR_IPMF, "%s: socket queue full", __func__); MRTSTAT_INC(mrts_upq_sockfull); return ENOBUFS; } /* Keep statistics */ PIMSTAT_INC(pims_snd_registers_msgs); PIMSTAT_ADD(pims_snd_registers_bytes, len); return 0; } /* * Encapsulate the data packet in PIM Register message and send it to the RP. */ static int pim_register_send_rp(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy, struct mfc *rt) { struct mbuf *mb_first; struct ip *ip_outer; struct pim_encap_pimhdr *pimhdr; int len = ntohs(ip->ip_len); vifi_t vifi = rt->mfc_parent; VIF_LOCK_ASSERT(); if ((vifi >= V_numvifs) || in_nullhost(V_viftable[vifi].v_lcl_addr)) { m_freem(mb_copy); return EADDRNOTAVAIL; /* The iif vif is invalid */ } /* * Add a new mbuf with the encapsulating header */ mb_first = m_gethdr(M_NOWAIT, MT_DATA); if (mb_first == NULL) { m_freem(mb_copy); return ENOBUFS; } mb_first->m_data += max_linkhdr; mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr); mb_first->m_next = mb_copy; mb_first->m_pkthdr.len = len + mb_first->m_len; /* * Fill in the encapsulating IP and PIM header */ ip_outer = mtod(mb_first, struct ip *); *ip_outer = pim_encap_iphdr; ip_outer->ip_id = ip_newid(); ip_outer->ip_len = htons(len + sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr)); ip_outer->ip_src = V_viftable[vifi].v_lcl_addr; ip_outer->ip_dst = rt->mfc_rp; /* * Copy the inner header TOS to the outer header, and take care of the * IP_DF bit. */ ip_outer->ip_tos = ip->ip_tos; if (ip->ip_off & htons(IP_DF)) ip_outer->ip_off |= htons(IP_DF); pimhdr = (struct pim_encap_pimhdr *)((caddr_t)ip_outer + sizeof(pim_encap_iphdr)); *pimhdr = pim_encap_pimhdr; /* If the iif crosses a border, set the Border-bit */ if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & V_mrt_api_config) pimhdr->flags |= htonl(PIM_BORDER_REGISTER); mb_first->m_data += sizeof(pim_encap_iphdr); pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr)); mb_first->m_data -= sizeof(pim_encap_iphdr); send_packet(vifp, mb_first); /* Keep statistics */ PIMSTAT_INC(pims_snd_registers_msgs); PIMSTAT_ADD(pims_snd_registers_bytes, len); return 0; } /* * pim_encapcheck() is called by the encap4_input() path at runtime to * determine if a packet is for PIM; allowing PIM to be dynamically loaded * into the kernel. */ static int pim_encapcheck(const struct mbuf *m, int off, int proto, void *arg) { #ifdef DIAGNOSTIC KASSERT(proto == IPPROTO_PIM, ("not for IPPROTO_PIM")); #endif if (proto != IPPROTO_PIM) return 0; /* not for us; reject the datagram. */ return 64; /* claim the datagram. */ } /* * PIM-SMv2 and PIM-DM messages processing. * Receives and verifies the PIM control messages, and passes them * up to the listening socket, using rip_input(). * The only message with special processing is the PIM_REGISTER message * (used by PIM-SM): the PIM header is stripped off, and the inner packet * is passed to if_simloop(). */ int pim_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct ip *ip = mtod(m, struct ip *); struct pim *pim; int iphlen = *offp; int minlen; int datalen = ntohs(ip->ip_len) - iphlen; int ip_tos; *mp = NULL; /* Keep statistics */ PIMSTAT_INC(pims_rcv_total_msgs); PIMSTAT_ADD(pims_rcv_total_bytes, datalen); /* * Validate lengths */ if (datalen < PIM_MINLEN) { PIMSTAT_INC(pims_rcv_tooshort); CTR3(KTR_IPMF, "%s: short packet (%d) from %s", __func__, datalen, inet_ntoa(ip->ip_src)); m_freem(m); return (IPPROTO_DONE); } /* * If the packet is at least as big as a REGISTER, go agead * and grab the PIM REGISTER header size, to avoid another * possible m_pullup() later. * * PIM_MINLEN == pimhdr + u_int32_t == 4 + 4 = 8 * PIM_REG_MINLEN == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28 */ minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN); /* * Get the IP and PIM headers in contiguous memory, and * possibly the PIM REGISTER header. */ if (m->m_len < minlen && (m = m_pullup(m, minlen)) == 0) { CTR1(KTR_IPMF, "%s: m_pullup() failed", __func__); return (IPPROTO_DONE); } /* m_pullup() may have given us a new mbuf so reset ip. */ ip = mtod(m, struct ip *); ip_tos = ip->ip_tos; /* adjust mbuf to point to the PIM header */ m->m_data += iphlen; m->m_len -= iphlen; pim = mtod(m, struct pim *); /* * Validate checksum. If PIM REGISTER, exclude the data packet. * * XXX: some older PIMv2 implementations don't make this distinction, * so for compatibility reason perform the checksum over part of the * message, and if error, then over the whole message. */ if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && in_cksum(m, PIM_MINLEN) == 0) { /* do nothing, checksum okay */ } else if (in_cksum(m, datalen)) { PIMSTAT_INC(pims_rcv_badsum); CTR1(KTR_IPMF, "%s: invalid checksum", __func__); m_freem(m); return (IPPROTO_DONE); } /* PIM version check */ if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) { PIMSTAT_INC(pims_rcv_badversion); CTR3(KTR_IPMF, "%s: bad version %d expect %d", __func__, (int)PIM_VT_V(pim->pim_vt), PIM_VERSION); m_freem(m); return (IPPROTO_DONE); } /* restore mbuf back to the outer IP */ m->m_data -= iphlen; m->m_len += iphlen; if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) { /* * Since this is a REGISTER, we'll make a copy of the register * headers ip + pim + u_int32 + encap_ip, to be passed up to the * routing daemon. */ struct sockaddr_in dst = { sizeof(dst), AF_INET }; struct mbuf *mcp; struct ip *encap_ip; u_int32_t *reghdr; struct ifnet *vifp; VIF_LOCK(); if ((V_reg_vif_num >= V_numvifs) || (V_reg_vif_num == VIFI_INVALID)) { VIF_UNLOCK(); CTR2(KTR_IPMF, "%s: register vif not set: %d", __func__, (int)V_reg_vif_num); m_freem(m); return (IPPROTO_DONE); } /* XXX need refcnt? */ vifp = V_viftable[V_reg_vif_num].v_ifp; VIF_UNLOCK(); /* * Validate length */ if (datalen < PIM_REG_MINLEN) { PIMSTAT_INC(pims_rcv_tooshort); PIMSTAT_INC(pims_rcv_badregisters); CTR1(KTR_IPMF, "%s: register packet size too small", __func__); m_freem(m); return (IPPROTO_DONE); } reghdr = (u_int32_t *)(pim + 1); encap_ip = (struct ip *)(reghdr + 1); CTR3(KTR_IPMF, "%s: register: encap ip src %s len %d", __func__, inet_ntoa(encap_ip->ip_src), ntohs(encap_ip->ip_len)); /* verify the version number of the inner packet */ if (encap_ip->ip_v != IPVERSION) { PIMSTAT_INC(pims_rcv_badregisters); CTR1(KTR_IPMF, "%s: bad encap ip version", __func__); m_freem(m); return (IPPROTO_DONE); } /* verify the inner packet is destined to a mcast group */ if (!IN_MULTICAST(ntohl(encap_ip->ip_dst.s_addr))) { PIMSTAT_INC(pims_rcv_badregisters); CTR2(KTR_IPMF, "%s: bad encap ip dest %s", __func__, inet_ntoa(encap_ip->ip_dst)); m_freem(m); return (IPPROTO_DONE); } /* If a NULL_REGISTER, pass it to the daemon */ if ((ntohl(*reghdr) & PIM_NULL_REGISTER)) goto pim_input_to_daemon; /* * Copy the TOS from the outer IP header to the inner IP header. */ if (encap_ip->ip_tos != ip_tos) { /* Outer TOS -> inner TOS */ encap_ip->ip_tos = ip_tos; /* Recompute the inner header checksum. Sigh... */ /* adjust mbuf to point to the inner IP header */ m->m_data += (iphlen + PIM_MINLEN); m->m_len -= (iphlen + PIM_MINLEN); encap_ip->ip_sum = 0; encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2); /* restore mbuf to point back to the outer IP header */ m->m_data -= (iphlen + PIM_MINLEN); m->m_len += (iphlen + PIM_MINLEN); } /* * Decapsulate the inner IP packet and loopback to forward it * as a normal multicast packet. Also, make a copy of the * outer_iphdr + pimhdr + reghdr + encap_iphdr * to pass to the daemon later, so it can take the appropriate * actions (e.g., send back PIM_REGISTER_STOP). * XXX: here m->m_data points to the outer IP header. */ mcp = m_copy(m, 0, iphlen + PIM_REG_MINLEN); if (mcp == NULL) { CTR1(KTR_IPMF, "%s: m_copy() failed", __func__); m_freem(m); return (IPPROTO_DONE); } /* Keep statistics */ /* XXX: registers_bytes include only the encap. mcast pkt */ PIMSTAT_INC(pims_rcv_registers_msgs); PIMSTAT_ADD(pims_rcv_registers_bytes, ntohs(encap_ip->ip_len)); /* * forward the inner ip packet; point m_data at the inner ip. */ m_adj(m, iphlen + PIM_MINLEN); CTR4(KTR_IPMF, "%s: forward decap'd REGISTER: src %lx dst %lx vif %d", __func__, (u_long)ntohl(encap_ip->ip_src.s_addr), (u_long)ntohl(encap_ip->ip_dst.s_addr), (int)V_reg_vif_num); /* NB: vifp was collected above; can it change on us? */ if_simloop(vifp, m, dst.sin_family, 0); /* prepare the register head to send to the mrouting daemon */ m = mcp; } pim_input_to_daemon: /* * Pass the PIM message up to the daemon; if it is a Register message, * pass the 'head' only up to the daemon. This includes the * outer IP header, PIM header, PIM-Register header and the * inner IP header. * XXX: the outer IP header pkt size of a Register is not adjust to * reflect the fact that the inner multicast data is truncated. */ *mp = m; rip_input(mp, offp, proto); return (IPPROTO_DONE); } static int sysctl_mfctable(SYSCTL_HANDLER_ARGS) { struct mfc *rt; int error, i; if (req->newptr) return (EPERM); if (V_mfchashtbl == NULL) /* XXX unlocked */ return (0); error = sysctl_wire_old_buffer(req, 0); if (error) return (error); MFC_LOCK(); for (i = 0; i < mfchashsize; i++) { LIST_FOREACH(rt, &V_mfchashtbl[i], mfc_hash) { error = SYSCTL_OUT(req, rt, sizeof(struct mfc)); if (error) goto out_locked; } } out_locked: MFC_UNLOCK(); return (error); } static SYSCTL_NODE(_net_inet_ip, OID_AUTO, mfctable, CTLFLAG_RD, sysctl_mfctable, "IPv4 Multicast Forwarding Table " "(struct *mfc[mfchashsize], netinet/ip_mroute.h)"); static void vnet_mroute_init(const void *unused __unused) { MALLOC(V_nexpire, u_char *, mfchashsize, M_MRTABLE, M_WAITOK|M_ZERO); bzero(V_bw_meter_timers, sizeof(V_bw_meter_timers)); callout_init(&V_expire_upcalls_ch, CALLOUT_MPSAFE); callout_init(&V_bw_upcalls_ch, CALLOUT_MPSAFE); callout_init(&V_bw_meter_ch, CALLOUT_MPSAFE); } VNET_SYSINIT(vnet_mroute_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_mroute_init, NULL); static void vnet_mroute_uninit(const void *unused __unused) { FREE(V_nexpire, M_MRTABLE); V_nexpire = NULL; } VNET_SYSUNINIT(vnet_mroute_uninit, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, vnet_mroute_uninit, NULL); static int ip_mroute_modevent(module_t mod, int type, void *unused) { switch (type) { case MOD_LOAD: MROUTER_LOCK_INIT(); if_detach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, if_detached_event, NULL, EVENTHANDLER_PRI_ANY); if (if_detach_event_tag == NULL) { printf("ip_mroute: unable to register " "ifnet_departure_event handler\n"); MROUTER_LOCK_DESTROY(); return (EINVAL); } MFC_LOCK_INIT(); VIF_LOCK_INIT(); mfchashsize = MFCHASHSIZE; if (TUNABLE_ULONG_FETCH("net.inet.ip.mfchashsize", &mfchashsize) && !powerof2(mfchashsize)) { printf("WARNING: %s not a power of 2; using default\n", "net.inet.ip.mfchashsize"); mfchashsize = MFCHASHSIZE; } pim_squelch_wholepkt = 0; TUNABLE_ULONG_FETCH("net.inet.pim.squelch_wholepkt", &pim_squelch_wholepkt); pim_encap_cookie = encap_attach_func(AF_INET, IPPROTO_PIM, pim_encapcheck, &in_pim_protosw, NULL); if (pim_encap_cookie == NULL) { printf("ip_mroute: unable to attach pim encap\n"); VIF_LOCK_DESTROY(); MFC_LOCK_DESTROY(); MROUTER_LOCK_DESTROY(); return (EINVAL); } ip_mcast_src = X_ip_mcast_src; ip_mforward = X_ip_mforward; ip_mrouter_done = X_ip_mrouter_done; ip_mrouter_get = X_ip_mrouter_get; ip_mrouter_set = X_ip_mrouter_set; ip_rsvp_force_done = X_ip_rsvp_force_done; ip_rsvp_vif = X_ip_rsvp_vif; legal_vif_num = X_legal_vif_num; mrt_ioctl = X_mrt_ioctl; rsvp_input_p = X_rsvp_input; break; case MOD_UNLOAD: /* * Typically module unload happens after the user-level * process has shutdown the kernel services (the check * below insures someone can't just yank the module out * from under a running process). But if the module is * just loaded and then unloaded w/o starting up a user * process we still need to cleanup. */ MROUTER_LOCK(); if (ip_mrouter_cnt != 0) { MROUTER_UNLOCK(); return (EINVAL); } ip_mrouter_unloading = 1; MROUTER_UNLOCK(); EVENTHANDLER_DEREGISTER(ifnet_departure_event, if_detach_event_tag); if (pim_encap_cookie) { encap_detach(pim_encap_cookie); pim_encap_cookie = NULL; } ip_mcast_src = NULL; ip_mforward = NULL; ip_mrouter_done = NULL; ip_mrouter_get = NULL; ip_mrouter_set = NULL; ip_rsvp_force_done = NULL; ip_rsvp_vif = NULL; legal_vif_num = NULL; mrt_ioctl = NULL; rsvp_input_p = NULL; VIF_LOCK_DESTROY(); MFC_LOCK_DESTROY(); MROUTER_LOCK_DESTROY(); break; default: return EOPNOTSUPP; } return 0; } static moduledata_t ip_mroutemod = { "ip_mroute", ip_mroute_modevent, 0 }; DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PSEUDO, SI_ORDER_MIDDLE); Index: user/dchagin/lemul/sys/netinet/raw_ip.c =================================================================== --- user/dchagin/lemul/sys/netinet/raw_ip.c (revision 274240) +++ user/dchagin/lemul/sys/netinet/raw_ip.c (revision 274241) @@ -1,1125 +1,1125 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)raw_ip.c 8.7 (Berkeley) 5/15/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #endif /*IPSEC*/ #include #include VNET_DEFINE(int, ip_defttl) = IPDEFTTL; -SYSCTL_VNET_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_defttl), 0, "Maximum TTL on IP packets"); VNET_DEFINE(struct inpcbhead, ripcb); VNET_DEFINE(struct inpcbinfo, ripcbinfo); #define V_ripcb VNET(ripcb) #define V_ripcbinfo VNET(ripcbinfo) /* * Control and data hooks for ipfw, dummynet, divert and so on. * The data hooks are not used here but it is convenient * to keep them all in one place. */ VNET_DEFINE(ip_fw_chk_ptr_t, ip_fw_chk_ptr) = NULL; VNET_DEFINE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr) = NULL; int (*ip_dn_ctl_ptr)(struct sockopt *); int (*ip_dn_io_ptr)(struct mbuf **, int, struct ip_fw_args *); void (*ip_divert_ptr)(struct mbuf *, int); int (*ng_ipfw_input_p)(struct mbuf **, int, struct ip_fw_args *, int); #ifdef INET /* * Hooks for multicast routing. They all default to NULL, so leave them not * initialized and rely on BSS being set to 0. */ /* * The socket used to communicate with the multicast routing daemon. */ VNET_DEFINE(struct socket *, ip_mrouter); /* * The various mrouter and rsvp functions. */ int (*ip_mrouter_set)(struct socket *, struct sockopt *); int (*ip_mrouter_get)(struct socket *, struct sockopt *); int (*ip_mrouter_done)(void); int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, struct ip_moptions *); int (*mrt_ioctl)(u_long, caddr_t, int); int (*legal_vif_num)(int); u_long (*ip_mcast_src)(int); int (*rsvp_input_p)(struct mbuf **, int *, int); int (*ip_rsvp_vif)(struct socket *, struct sockopt *); void (*ip_rsvp_force_done)(struct socket *); #endif /* INET */ u_long rip_sendspace = 9216; SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW, &rip_sendspace, 0, "Maximum outgoing raw IP datagram size"); u_long rip_recvspace = 9216; SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW, &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams"); /* * Hash functions */ #define INP_PCBHASH_RAW_SIZE 256 #define INP_PCBHASH_RAW(proto, laddr, faddr, mask) \ (((proto) + (laddr) + (faddr)) % (mask) + 1) #ifdef INET static void rip_inshash(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbhead *pcbhash; int hash; INP_INFO_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); if (inp->inp_ip_p != 0 && inp->inp_laddr.s_addr != INADDR_ANY && inp->inp_faddr.s_addr != INADDR_ANY) { hash = INP_PCBHASH_RAW(inp->inp_ip_p, inp->inp_laddr.s_addr, inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask); } else hash = 0; pcbhash = &pcbinfo->ipi_hashbase[hash]; LIST_INSERT_HEAD(pcbhash, inp, inp_hash); } static void rip_delhash(struct inpcb *inp) { INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); LIST_REMOVE(inp, inp_hash); } #endif /* INET */ /* * Raw interface to IP protocol. */ /* * Initialize raw connection block q. */ static void rip_zone_change(void *tag) { uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets); } static int rip_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp = mem; INP_LOCK_INIT(inp, "inp", "rawinp"); return (0); } void rip_init(void) { in_pcbinfo_init(&V_ripcbinfo, "rip", &V_ripcb, INP_PCBHASH_RAW_SIZE, 1, "ripcb", rip_inpcb_init, NULL, UMA_ZONE_NOFREE, IPI_HASHFIELDS_NONE); EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL, EVENTHANDLER_PRI_ANY); } #ifdef VIMAGE void rip_destroy(void) { in_pcbinfo_destroy(&V_ripcbinfo); } #endif #ifdef INET static int rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n, struct sockaddr_in *ripsrc) { int policyfail = 0; INP_LOCK_ASSERT(last); #ifdef IPSEC /* check AH/ESP integrity. */ if (ipsec4_in_reject(n, last)) { policyfail = 1; } #endif /* IPSEC */ #ifdef MAC if (!policyfail && mac_inpcb_check_deliver(last, n) != 0) policyfail = 1; #endif /* Check the minimum TTL for socket. */ if (last->inp_ip_minttl && last->inp_ip_minttl > ip->ip_ttl) policyfail = 1; if (!policyfail) { struct mbuf *opts = NULL; struct socket *so; so = last->inp_socket; if ((last->inp_flags & INP_CONTROLOPTS) || (so->so_options & (SO_TIMESTAMP | SO_BINTIME))) ip_savecontrol(last, &opts, ip, n); SOCKBUF_LOCK(&so->so_rcv); if (sbappendaddr_locked(&so->so_rcv, (struct sockaddr *)ripsrc, n, opts) == 0) { /* should notify about lost packet */ m_freem(n); if (opts) m_freem(opts); SOCKBUF_UNLOCK(&so->so_rcv); } else sorwakeup_locked(so); } else m_freem(n); return (policyfail); } /* * Setup generic address and protocol structures for raw_input routine, then * pass them along with mbuf chain. */ int rip_input(struct mbuf **mp, int *offp, int proto) { struct ifnet *ifp; struct mbuf *m = *mp; struct ip *ip = mtod(m, struct ip *); struct inpcb *inp, *last; struct sockaddr_in ripsrc; int hash; *mp = NULL; bzero(&ripsrc, sizeof(ripsrc)); ripsrc.sin_len = sizeof(ripsrc); ripsrc.sin_family = AF_INET; ripsrc.sin_addr = ip->ip_src; last = NULL; ifp = m->m_pkthdr.rcvif; hash = INP_PCBHASH_RAW(proto, ip->ip_src.s_addr, ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask); INP_INFO_RLOCK(&V_ripcbinfo); LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[hash], inp_hash) { if (inp->inp_ip_p != proto) continue; #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr) continue; if (inp->inp_faddr.s_addr != ip->ip_src.s_addr) continue; if (jailed_without_vnet(inp->inp_cred)) { /* * XXX: If faddr was bound to multicast group, * jailed raw socket will drop datagram. */ if (prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0) continue; } if (last != NULL) { struct mbuf *n; n = m_copy(m, 0, (int)M_COPYALL); if (n != NULL) (void) rip_append(last, ip, n, &ripsrc); /* XXX count dropped packet */ INP_RUNLOCK(last); } INP_RLOCK(inp); last = inp; } LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[0], inp_hash) { if (inp->inp_ip_p && inp->inp_ip_p != proto) continue; #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (!in_nullhost(inp->inp_laddr) && !in_hosteq(inp->inp_laddr, ip->ip_dst)) continue; if (!in_nullhost(inp->inp_faddr) && !in_hosteq(inp->inp_faddr, ip->ip_src)) continue; if (jailed_without_vnet(inp->inp_cred)) { /* * Allow raw socket in jail to receive multicast; * assume process had PRIV_NETINET_RAW at attach, * and fall through into normal filter path if so. */ if (!IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0) continue; } /* * If this raw socket has multicast state, and we * have received a multicast, check if this socket * should receive it, as multicast filtering is now * the responsibility of the transport layer. */ if (inp->inp_moptions != NULL && IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { /* * If the incoming datagram is for IGMP, allow it * through unconditionally to the raw socket. * * In the case of IGMPv2, we may not have explicitly * joined the group, and may have set IFF_ALLMULTI * on the interface. imo_multi_filter() may discard * control traffic we actually need to see. * * Userland multicast routing daemons should continue * filter the control traffic appropriately. */ int blocked; blocked = MCAST_PASS; if (proto != IPPROTO_IGMP) { struct sockaddr_in group; bzero(&group, sizeof(struct sockaddr_in)); group.sin_len = sizeof(struct sockaddr_in); group.sin_family = AF_INET; group.sin_addr = ip->ip_dst; blocked = imo_multi_filter(inp->inp_moptions, ifp, (struct sockaddr *)&group, (struct sockaddr *)&ripsrc); } if (blocked != MCAST_PASS) { IPSTAT_INC(ips_notmember); continue; } } if (last != NULL) { struct mbuf *n; n = m_copy(m, 0, (int)M_COPYALL); if (n != NULL) (void) rip_append(last, ip, n, &ripsrc); /* XXX count dropped packet */ INP_RUNLOCK(last); } INP_RLOCK(inp); last = inp; } INP_INFO_RUNLOCK(&V_ripcbinfo); if (last != NULL) { if (rip_append(last, ip, m, &ripsrc) != 0) IPSTAT_INC(ips_delivered); INP_RUNLOCK(last); } else { m_freem(m); IPSTAT_INC(ips_noproto); IPSTAT_DEC(ips_delivered); } return (IPPROTO_DONE); } /* * Generate IP header and pass packet to ip_output. Tack on options user may * have setup with control call. */ int rip_output(struct mbuf *m, struct socket *so, ...) { struct ip *ip; int error; struct inpcb *inp = sotoinpcb(so); va_list ap; u_long dst; int flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) | IP_ALLOWBROADCAST; va_start(ap, so); dst = va_arg(ap, u_long); va_end(ap); /* * If the user handed us a complete IP packet, use it. Otherwise, * allocate an mbuf for a header and fill it in. */ if ((inp->inp_flags & INP_HDRINCL) == 0) { if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) { m_freem(m); return(EMSGSIZE); } M_PREPEND(m, sizeof(struct ip), M_NOWAIT); if (m == NULL) return(ENOBUFS); INP_RLOCK(inp); ip = mtod(m, struct ip *); ip->ip_tos = inp->inp_ip_tos; if (inp->inp_flags & INP_DONTFRAG) ip->ip_off = htons(IP_DF); else ip->ip_off = htons(0); ip->ip_p = inp->inp_ip_p; ip->ip_len = htons(m->m_pkthdr.len); ip->ip_src = inp->inp_laddr; ip->ip_dst.s_addr = dst; if (jailed(inp->inp_cred)) { /* * prison_local_ip4() would be good enough but would * let a source of INADDR_ANY pass, which we do not * want to see from jails. */ if (ip->ip_src.s_addr == INADDR_ANY) { error = in_pcbladdr(inp, &ip->ip_dst, &ip->ip_src, inp->inp_cred); } else { error = prison_local_ip4(inp->inp_cred, &ip->ip_src); } if (error != 0) { INP_RUNLOCK(inp); m_freem(m); return (error); } } ip->ip_ttl = inp->inp_ip_ttl; } else { if (m->m_pkthdr.len > IP_MAXPACKET) { m_freem(m); return(EMSGSIZE); } INP_RLOCK(inp); ip = mtod(m, struct ip *); error = prison_check_ip4(inp->inp_cred, &ip->ip_src); if (error != 0) { INP_RUNLOCK(inp); m_freem(m); return (error); } /* * Don't allow both user specified and setsockopt options, * and don't allow packet length sizes that will crash. */ if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) || (ntohs(ip->ip_len) > m->m_pkthdr.len) || (ntohs(ip->ip_len) < (ip->ip_hl << 2))) { INP_RUNLOCK(inp); m_freem(m); return (EINVAL); } if (ip->ip_id == 0) ip->ip_id = ip_newid(); /* * XXX prevent ip_output from overwriting header fields. */ flags |= IP_RAWOUTPUT; IPSTAT_INC(ips_rawout); } if (inp->inp_flags & INP_ONESBCAST) flags |= IP_SENDONES; #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif error = ip_output(m, inp->inp_options, NULL, flags, inp->inp_moptions, inp); INP_RUNLOCK(inp); return (error); } /* * Raw IP socket option processing. * * IMPORTANT NOTE regarding access control: Traditionally, raw sockets could * only be created by a privileged process, and as such, socket option * operations to manage system properties on any raw socket were allowed to * take place without explicit additional access control checks. However, * raw sockets can now also be created in jail(), and therefore explicit * checks are now required. Likewise, raw sockets can be used by a process * after it gives up privilege, so some caution is required. For options * passed down to the IP layer via ip_ctloutput(), checks are assumed to be * performed in ip_ctloutput() and therefore no check occurs here. * Unilaterally checking priv_check() here breaks normal IP socket option * operations on raw sockets. * * When adding new socket options here, make sure to add access control * checks here as necessary. * * XXX-BZ inp locking? */ int rip_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp = sotoinpcb(so); int error, optval; if (sopt->sopt_level != IPPROTO_IP) { if ((sopt->sopt_level == SOL_SOCKET) && (sopt->sopt_name == SO_SETFIB)) { inp->inp_inc.inc_fibnum = so->so_fibnum; return (0); } return (EINVAL); } error = 0; switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case IP_HDRINCL: optval = inp->inp_flags & INP_HDRINCL; error = sooptcopyout(sopt, &optval, sizeof optval); break; case IP_FW3: /* generic ipfw v.3 functions */ case IP_FW_ADD: /* ADD actually returns the body... */ case IP_FW_GET: case IP_FW_TABLE_GETSIZE: case IP_FW_TABLE_LIST: case IP_FW_NAT_GET_CONFIG: case IP_FW_NAT_GET_LOG: if (V_ip_fw_ctl_ptr != NULL) error = V_ip_fw_ctl_ptr(sopt); else error = ENOPROTOOPT; break; case IP_DUMMYNET3: /* generic dummynet v.3 functions */ case IP_DUMMYNET_GET: if (ip_dn_ctl_ptr != NULL) error = ip_dn_ctl_ptr(sopt); else error = ENOPROTOOPT; break ; case MRT_INIT: case MRT_DONE: case MRT_ADD_VIF: case MRT_DEL_VIF: case MRT_ADD_MFC: case MRT_DEL_MFC: case MRT_VERSION: case MRT_ASSERT: case MRT_API_SUPPORT: case MRT_API_CONFIG: case MRT_ADD_BW_UPCALL: case MRT_DEL_BW_UPCALL: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_mrouter_get ? ip_mrouter_get(so, sopt) : EOPNOTSUPP; break; default: error = ip_ctloutput(so, sopt); break; } break; case SOPT_SET: switch (sopt->sopt_name) { case IP_HDRINCL: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; if (optval) inp->inp_flags |= INP_HDRINCL; else inp->inp_flags &= ~INP_HDRINCL; break; case IP_FW3: /* generic ipfw v.3 functions */ case IP_FW_ADD: case IP_FW_DEL: case IP_FW_FLUSH: case IP_FW_ZERO: case IP_FW_RESETLOG: case IP_FW_TABLE_ADD: case IP_FW_TABLE_DEL: case IP_FW_TABLE_FLUSH: case IP_FW_NAT_CFG: case IP_FW_NAT_DEL: if (V_ip_fw_ctl_ptr != NULL) error = V_ip_fw_ctl_ptr(sopt); else error = ENOPROTOOPT; break; case IP_DUMMYNET3: /* generic dummynet v.3 functions */ case IP_DUMMYNET_CONFIGURE: case IP_DUMMYNET_DEL: case IP_DUMMYNET_FLUSH: if (ip_dn_ctl_ptr != NULL) error = ip_dn_ctl_ptr(sopt); else error = ENOPROTOOPT ; break ; case IP_RSVP_ON: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_rsvp_init(so); break; case IP_RSVP_OFF: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_rsvp_done(); break; case IP_RSVP_VIF_ON: case IP_RSVP_VIF_OFF: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_rsvp_vif ? ip_rsvp_vif(so, sopt) : EINVAL; break; case MRT_INIT: case MRT_DONE: case MRT_ADD_VIF: case MRT_DEL_VIF: case MRT_ADD_MFC: case MRT_DEL_MFC: case MRT_VERSION: case MRT_ASSERT: case MRT_API_SUPPORT: case MRT_API_CONFIG: case MRT_ADD_BW_UPCALL: case MRT_DEL_BW_UPCALL: error = priv_check(curthread, PRIV_NETINET_MROUTE); if (error != 0) return (error); error = ip_mrouter_set ? ip_mrouter_set(so, sopt) : EOPNOTSUPP; break; default: error = ip_ctloutput(so, sopt); break; } break; } return (error); } /* * This function exists solely to receive the PRC_IFDOWN messages which are * sent by if_down(). It looks for an ifaddr whose ifa_addr is sa, and calls * in_ifadown() to remove all routes corresponding to that address. It also * receives the PRC_IFUP messages from if_up() and reinstalls the interface * routes. */ void rip_ctlinput(int cmd, struct sockaddr *sa, void *vip) { struct in_ifaddr *ia; struct ifnet *ifp; int err; int flags; switch (cmd) { case PRC_IFDOWN: IN_IFADDR_RLOCK(); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (ia->ia_ifa.ifa_addr == sa && (ia->ia_flags & IFA_ROUTE)) { ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(); /* * in_scrubprefix() kills the interface route. */ in_scrubprefix(ia, 0); /* * in_ifadown gets rid of all the rest of the * routes. This is not quite the right thing * to do, but at least if we are running a * routing process they will come back. */ in_ifadown(&ia->ia_ifa, 0); ifa_free(&ia->ia_ifa); break; } } if (ia == NULL) /* If ia matched, already unlocked. */ IN_IFADDR_RUNLOCK(); break; case PRC_IFUP: IN_IFADDR_RLOCK(); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (ia->ia_ifa.ifa_addr == sa) break; } if (ia == NULL || (ia->ia_flags & IFA_ROUTE)) { IN_IFADDR_RUNLOCK(); return; } ifa_ref(&ia->ia_ifa); IN_IFADDR_RUNLOCK(); flags = RTF_UP; ifp = ia->ia_ifa.ifa_ifp; if ((ifp->if_flags & IFF_LOOPBACK) || (ifp->if_flags & IFF_POINTOPOINT)) flags |= RTF_HOST; err = ifa_del_loopback_route((struct ifaddr *)ia, sa); err = rtinit(&ia->ia_ifa, RTM_ADD, flags); if (err == 0) ia->ia_flags |= IFA_ROUTE; err = ifa_add_loopback_route((struct ifaddr *)ia, sa); ifa_free(&ia->ia_ifa); break; } } static int rip_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; int error; inp = sotoinpcb(so); KASSERT(inp == NULL, ("rip_attach: inp != NULL")); error = priv_check(td, PRIV_NETINET_RAW); if (error) return (error); if (proto >= IPPROTO_MAX || proto < 0) return EPROTONOSUPPORT; error = soreserve(so, rip_sendspace, rip_recvspace); if (error) return (error); INP_INFO_WLOCK(&V_ripcbinfo); error = in_pcballoc(so, &V_ripcbinfo); if (error) { INP_INFO_WUNLOCK(&V_ripcbinfo); return (error); } inp = (struct inpcb *)so->so_pcb; inp->inp_vflag |= INP_IPV4; inp->inp_ip_p = proto; inp->inp_ip_ttl = V_ip_defttl; rip_inshash(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); INP_WUNLOCK(inp); return (0); } static void rip_detach(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_detach: inp == NULL")); KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, ("rip_detach: not closed")); INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); rip_delhash(inp); if (so == V_ip_mrouter && ip_mrouter_done) ip_mrouter_done(); if (ip_rsvp_force_done) ip_rsvp_force_done(so); if (so == V_ip_rsvpd) ip_rsvp_done(); in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); } static void rip_dodisconnect(struct socket *so, struct inpcb *inp) { struct inpcbinfo *pcbinfo; pcbinfo = inp->inp_pcbinfo; INP_INFO_WLOCK(pcbinfo); INP_WLOCK(inp); rip_delhash(inp); inp->inp_faddr.s_addr = INADDR_ANY; rip_inshash(inp); SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; SOCK_UNLOCK(so); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(pcbinfo); } static void rip_abort(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_abort: inp == NULL")); rip_dodisconnect(so, inp); } static void rip_close(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_close: inp == NULL")); rip_dodisconnect(so, inp); } static int rip_disconnect(struct socket *so) { struct inpcb *inp; if ((so->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_disconnect: inp == NULL")); rip_dodisconnect(so, inp); return (0); } static int rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct sockaddr_in *addr = (struct sockaddr_in *)nam; struct inpcb *inp; int error; if (nam->sa_len != sizeof(*addr)) return (EINVAL); error = prison_check_ip4(td->td_ucred, &addr->sin_addr); if (error != 0) return (error); inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_bind: inp == NULL")); if (TAILQ_EMPTY(&V_ifnet) || (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) || (addr->sin_addr.s_addr && (inp->inp_flags & INP_BINDANY) == 0 && ifa_ifwithaddr_check((struct sockaddr *)addr) == 0)) return (EADDRNOTAVAIL); INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); rip_delhash(inp); inp->inp_laddr = addr->sin_addr; rip_inshash(inp); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } static int rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct sockaddr_in *addr = (struct sockaddr_in *)nam; struct inpcb *inp; if (nam->sa_len != sizeof(*addr)) return (EINVAL); if (TAILQ_EMPTY(&V_ifnet)) return (EADDRNOTAVAIL); if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) return (EAFNOSUPPORT); inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_connect: inp == NULL")); INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); rip_delhash(inp); inp->inp_faddr = addr->sin_addr; rip_inshash(inp); soisconnected(so); INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } static int rip_shutdown(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_shutdown: inp == NULL")); INP_WLOCK(inp); socantsendmore(so); INP_WUNLOCK(inp); return (0); } static int rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { struct inpcb *inp; u_long dst; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_send: inp == NULL")); /* * Note: 'dst' reads below are unlocked. */ if (so->so_state & SS_ISCONNECTED) { if (nam) { m_freem(m); return (EISCONN); } dst = inp->inp_faddr.s_addr; /* Unlocked read. */ } else { if (nam == NULL) { m_freem(m); return (ENOTCONN); } dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr; } return (rip_output(m, so, dst)); } #endif /* INET */ static int rip_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = V_ripcbinfo.ipi_count; n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb); return (0); } if (req->newptr != 0) return (EPERM); /* * OK, now we're committed to doing something. */ INP_INFO_RLOCK(&V_ripcbinfo); gencnt = V_ripcbinfo.ipi_gencnt; n = V_ripcbinfo.ipi_count; INP_INFO_RUNLOCK(&V_ripcbinfo); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == 0) return (ENOMEM); INP_INFO_RLOCK(&V_ripcbinfo); for (inp = LIST_FIRST(V_ripcbinfo.ipi_listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_WLOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseeinpcb(req->td->td_ucred, inp) == 0) { in_pcbref(inp); inp_list[i++] = inp; } INP_WUNLOCK(inp); } INP_INFO_RUNLOCK(&V_ripcbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; bzero(&xi, sizeof(xi)); xi.xi_len = sizeof xi; /* XXX should avoid extra copy */ bcopy(inp, &xi.xi_inp, sizeof *inp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xi, sizeof xi); } else INP_RUNLOCK(inp); } INP_INFO_WLOCK(&V_ripcbinfo); for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (!in_pcbrele_rlocked(inp)) INP_RUNLOCK(inp); } INP_INFO_WUNLOCK(&V_ripcbinfo); if (!error) { /* * Give the user an updated idea of our state. If the * generation differs from what we told her before, she knows * that something happened while we were processing this * request, and it might be necessary to retry. */ INP_INFO_RLOCK(&V_ripcbinfo); xig.xig_gen = V_ripcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_ripcbinfo.ipi_count; INP_INFO_RUNLOCK(&V_ripcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, rip_pcblist, "S,xinpcb", "List of active raw IP sockets"); #ifdef INET struct pr_usrreqs rip_usrreqs = { .pru_abort = rip_abort, .pru_attach = rip_attach, .pru_bind = rip_bind, .pru_connect = rip_connect, .pru_control = in_control, .pru_detach = rip_detach, .pru_disconnect = rip_disconnect, .pru_peeraddr = in_getpeeraddr, .pru_send = rip_send, .pru_shutdown = rip_shutdown, .pru_sockaddr = in_getsockaddr, .pru_sosetlabel = in_pcbsosetlabel, .pru_close = rip_close, }; #endif /* INET */ Index: user/dchagin/lemul/sys/netinet/tcp_hostcache.c =================================================================== --- user/dchagin/lemul/sys/netinet/tcp_hostcache.c (revision 274240) +++ user/dchagin/lemul/sys/netinet/tcp_hostcache.c (revision 274241) @@ -1,699 +1,699 @@ /*- * Copyright (c) 2002 Andre Oppermann, Internet Business Solutions AG * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * The tcp_hostcache moves the tcp-specific cached metrics from the routing * table to a dedicated structure indexed by the remote IP address. It keeps * information on the measured TCP parameters of past TCP sessions to allow * better initial start values to be used with later connections to/from the * same source. Depending on the network parameters (delay, bandwidth, max * MTU, congestion window) between local and remote sites, this can lead to * significant speed-ups for new TCP connections after the first one. * * Due to the tcp_hostcache, all TCP-specific metrics information in the * routing table have been removed. The inpcb no longer keeps a pointer to * the routing entry, and protocol-initiated route cloning has been removed * as well. With these changes, the routing table has gone back to being * more lightwight and only carries information related to packet forwarding. * * tcp_hostcache is designed for multiple concurrent access in SMP * environments and high contention. All bucket rows have their own lock and * thus multiple lookups and modifies can be done at the same time as long as * they are in different bucket rows. If a request for insertion of a new * record can't be satisfied, it simply returns an empty structure. Nobody * and nothing outside of tcp_hostcache.c will ever point directly to any * entry in the tcp_hostcache. All communication is done in an * object-oriented way and only functions of tcp_hostcache will manipulate * hostcache entries. Otherwise, we are unable to achieve good behaviour in * concurrent access situations. Since tcp_hostcache is only caching * information, there are no fatal consequences if we either can't satisfy * any particular request or have to drop/overwrite an existing entry because * of bucket limit memory constrains. */ /* * Many thanks to jlemon for basic structure of tcp_syncache which is being * followed here. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #endif #include #include #include #ifdef INET6 #include #endif #include /* Arbitrary values */ #define TCP_HOSTCACHE_HASHSIZE 512 #define TCP_HOSTCACHE_BUCKETLIMIT 30 #define TCP_HOSTCACHE_EXPIRE 60*60 /* one hour */ #define TCP_HOSTCACHE_PRUNE 5*60 /* every 5 minutes */ static VNET_DEFINE(struct tcp_hostcache, tcp_hostcache); #define V_tcp_hostcache VNET(tcp_hostcache) static VNET_DEFINE(struct callout, tcp_hc_callout); #define V_tcp_hc_callout VNET(tcp_hc_callout) static struct hc_metrics *tcp_hc_lookup(struct in_conninfo *); static struct hc_metrics *tcp_hc_insert(struct in_conninfo *); static int sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS); static void tcp_hc_purge_internal(int); static void tcp_hc_purge(void *); static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hostcache, CTLFLAG_RW, 0, "TCP Host cache"); -SYSCTL_VNET_UINT(_net_inet_tcp_hostcache, OID_AUTO, cachelimit, CTLFLAG_RDTUN, +SYSCTL_UINT(_net_inet_tcp_hostcache, OID_AUTO, cachelimit, CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_hostcache.cache_limit), 0, "Overall entry limit for hostcache"); -SYSCTL_VNET_UINT(_net_inet_tcp_hostcache, OID_AUTO, hashsize, CTLFLAG_RDTUN, +SYSCTL_UINT(_net_inet_tcp_hostcache, OID_AUTO, hashsize, CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_hostcache.hashsize), 0, "Size of TCP hostcache hashtable"); -SYSCTL_VNET_UINT(_net_inet_tcp_hostcache, OID_AUTO, bucketlimit, - CTLFLAG_RDTUN, &VNET_NAME(tcp_hostcache.bucket_limit), 0, +SYSCTL_UINT(_net_inet_tcp_hostcache, OID_AUTO, bucketlimit, + CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_hostcache.bucket_limit), 0, "Per-bucket hash limit for hostcache"); -SYSCTL_VNET_UINT(_net_inet_tcp_hostcache, OID_AUTO, count, CTLFLAG_RD, +SYSCTL_UINT(_net_inet_tcp_hostcache, OID_AUTO, count, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(tcp_hostcache.cache_count), 0, "Current number of entries in hostcache"); -SYSCTL_VNET_INT(_net_inet_tcp_hostcache, OID_AUTO, expire, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, expire, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_hostcache.expire), 0, "Expire time of TCP hostcache entries"); -SYSCTL_VNET_INT(_net_inet_tcp_hostcache, OID_AUTO, prune, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, prune, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_hostcache.prune), 0, "Time between purge runs"); -SYSCTL_VNET_INT(_net_inet_tcp_hostcache, OID_AUTO, purge, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, purge, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_hostcache.purgeall), 0, "Expire all entires on next purge run"); SYSCTL_PROC(_net_inet_tcp_hostcache, OID_AUTO, list, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_SKIP, 0, 0, sysctl_tcp_hc_list, "A", "List of all hostcache entries"); static MALLOC_DEFINE(M_HOSTCACHE, "hostcache", "TCP hostcache"); #define HOSTCACHE_HASH(ip) \ (((ip)->s_addr ^ ((ip)->s_addr >> 7) ^ ((ip)->s_addr >> 17)) & \ V_tcp_hostcache.hashmask) /* XXX: What is the recommended hash to get good entropy for IPv6 addresses? */ #define HOSTCACHE_HASH6(ip6) \ (((ip6)->s6_addr32[0] ^ \ (ip6)->s6_addr32[1] ^ \ (ip6)->s6_addr32[2] ^ \ (ip6)->s6_addr32[3]) & \ V_tcp_hostcache.hashmask) #define THC_LOCK(lp) mtx_lock(lp) #define THC_UNLOCK(lp) mtx_unlock(lp) void tcp_hc_init(void) { u_int cache_limit; int i; /* * Initialize hostcache structures. */ V_tcp_hostcache.cache_count = 0; V_tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE; V_tcp_hostcache.bucket_limit = TCP_HOSTCACHE_BUCKETLIMIT; V_tcp_hostcache.expire = TCP_HOSTCACHE_EXPIRE; V_tcp_hostcache.prune = TCP_HOSTCACHE_PRUNE; TUNABLE_INT_FETCH("net.inet.tcp.hostcache.hashsize", &V_tcp_hostcache.hashsize); if (!powerof2(V_tcp_hostcache.hashsize)) { printf("WARNING: hostcache hash size is not a power of 2.\n"); V_tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE; /* default */ } V_tcp_hostcache.hashmask = V_tcp_hostcache.hashsize - 1; TUNABLE_INT_FETCH("net.inet.tcp.hostcache.bucketlimit", &V_tcp_hostcache.bucket_limit); cache_limit = V_tcp_hostcache.hashsize * V_tcp_hostcache.bucket_limit; V_tcp_hostcache.cache_limit = cache_limit; TUNABLE_INT_FETCH("net.inet.tcp.hostcache.cachelimit", &V_tcp_hostcache.cache_limit); if (V_tcp_hostcache.cache_limit > cache_limit) V_tcp_hostcache.cache_limit = cache_limit; /* * Allocate the hash table. */ V_tcp_hostcache.hashbase = (struct hc_head *) malloc(V_tcp_hostcache.hashsize * sizeof(struct hc_head), M_HOSTCACHE, M_WAITOK | M_ZERO); /* * Initialize the hash buckets. */ for (i = 0; i < V_tcp_hostcache.hashsize; i++) { TAILQ_INIT(&V_tcp_hostcache.hashbase[i].hch_bucket); V_tcp_hostcache.hashbase[i].hch_length = 0; mtx_init(&V_tcp_hostcache.hashbase[i].hch_mtx, "tcp_hc_entry", NULL, MTX_DEF); } /* * Allocate the hostcache entries. */ V_tcp_hostcache.zone = uma_zcreate("hostcache", sizeof(struct hc_metrics), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_zone_set_max(V_tcp_hostcache.zone, V_tcp_hostcache.cache_limit); /* * Set up periodic cache cleanup. */ callout_init(&V_tcp_hc_callout, CALLOUT_MPSAFE); callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz, tcp_hc_purge, curvnet); } #ifdef VIMAGE void tcp_hc_destroy(void) { int i; callout_drain(&V_tcp_hc_callout); /* Purge all hc entries. */ tcp_hc_purge_internal(1); /* Free the uma zone and the allocated hash table. */ uma_zdestroy(V_tcp_hostcache.zone); for (i = 0; i < V_tcp_hostcache.hashsize; i++) mtx_destroy(&V_tcp_hostcache.hashbase[i].hch_mtx); free(V_tcp_hostcache.hashbase, M_HOSTCACHE); } #endif /* * Internal function: look up an entry in the hostcache or return NULL. * * If an entry has been returned, the caller becomes responsible for * unlocking the bucket row after he is done reading/modifying the entry. */ static struct hc_metrics * tcp_hc_lookup(struct in_conninfo *inc) { int hash; struct hc_head *hc_head; struct hc_metrics *hc_entry; KASSERT(inc != NULL, ("tcp_hc_lookup with NULL in_conninfo pointer")); /* * Hash the foreign ip address. */ if (inc->inc_flags & INC_ISIPV6) hash = HOSTCACHE_HASH6(&inc->inc6_faddr); else hash = HOSTCACHE_HASH(&inc->inc_faddr); hc_head = &V_tcp_hostcache.hashbase[hash]; /* * Acquire lock for this bucket row; we release the lock if we don't * find an entry, otherwise the caller has to unlock after he is * done. */ THC_LOCK(&hc_head->hch_mtx); /* * Iterate through entries in bucket row looking for a match. */ TAILQ_FOREACH(hc_entry, &hc_head->hch_bucket, rmx_q) { if (inc->inc_flags & INC_ISIPV6) { /* XXX: check ip6_zoneid */ if (memcmp(&inc->inc6_faddr, &hc_entry->ip6, sizeof(inc->inc6_faddr)) == 0) return hc_entry; } else { if (memcmp(&inc->inc_faddr, &hc_entry->ip4, sizeof(inc->inc_faddr)) == 0) return hc_entry; } } /* * We were unsuccessful and didn't find anything. */ THC_UNLOCK(&hc_head->hch_mtx); return NULL; } /* * Internal function: insert an entry into the hostcache or return NULL if * unable to allocate a new one. * * If an entry has been returned, the caller becomes responsible for * unlocking the bucket row after he is done reading/modifying the entry. */ static struct hc_metrics * tcp_hc_insert(struct in_conninfo *inc) { int hash; struct hc_head *hc_head; struct hc_metrics *hc_entry; KASSERT(inc != NULL, ("tcp_hc_insert with NULL in_conninfo pointer")); /* * Hash the foreign ip address. */ if (inc->inc_flags & INC_ISIPV6) hash = HOSTCACHE_HASH6(&inc->inc6_faddr); else hash = HOSTCACHE_HASH(&inc->inc_faddr); hc_head = &V_tcp_hostcache.hashbase[hash]; /* * Acquire lock for this bucket row; we release the lock if we don't * find an entry, otherwise the caller has to unlock after he is * done. */ THC_LOCK(&hc_head->hch_mtx); /* * If the bucket limit is reached, reuse the least-used element. */ if (hc_head->hch_length >= V_tcp_hostcache.bucket_limit || V_tcp_hostcache.cache_count >= V_tcp_hostcache.cache_limit) { hc_entry = TAILQ_LAST(&hc_head->hch_bucket, hc_qhead); /* * At first we were dropping the last element, just to * reacquire it in the next two lines again, which isn't very * efficient. Instead just reuse the least used element. * We may drop something that is still "in-use" but we can be * "lossy". * Just give up if this bucket row is empty and we don't have * anything to replace. */ if (hc_entry == NULL) { THC_UNLOCK(&hc_head->hch_mtx); return NULL; } TAILQ_REMOVE(&hc_head->hch_bucket, hc_entry, rmx_q); V_tcp_hostcache.hashbase[hash].hch_length--; V_tcp_hostcache.cache_count--; TCPSTAT_INC(tcps_hc_bucketoverflow); #if 0 uma_zfree(V_tcp_hostcache.zone, hc_entry); #endif } else { /* * Allocate a new entry, or balk if not possible. */ hc_entry = uma_zalloc(V_tcp_hostcache.zone, M_NOWAIT); if (hc_entry == NULL) { THC_UNLOCK(&hc_head->hch_mtx); return NULL; } } /* * Initialize basic information of hostcache entry. */ bzero(hc_entry, sizeof(*hc_entry)); if (inc->inc_flags & INC_ISIPV6) { hc_entry->ip6 = inc->inc6_faddr; hc_entry->ip6_zoneid = inc->inc6_zoneid; } else hc_entry->ip4 = inc->inc_faddr; hc_entry->rmx_head = hc_head; hc_entry->rmx_expire = V_tcp_hostcache.expire; /* * Put it upfront. */ TAILQ_INSERT_HEAD(&hc_head->hch_bucket, hc_entry, rmx_q); V_tcp_hostcache.hashbase[hash].hch_length++; V_tcp_hostcache.cache_count++; TCPSTAT_INC(tcps_hc_added); return hc_entry; } /* * External function: look up an entry in the hostcache and fill out the * supplied TCP metrics structure. Fills in NULL when no entry was found or * a value is not set. */ void tcp_hc_get(struct in_conninfo *inc, struct hc_metrics_lite *hc_metrics_lite) { struct hc_metrics *hc_entry; /* * Find the right bucket. */ hc_entry = tcp_hc_lookup(inc); /* * If we don't have an existing object. */ if (hc_entry == NULL) { bzero(hc_metrics_lite, sizeof(*hc_metrics_lite)); return; } hc_entry->rmx_hits++; hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */ hc_metrics_lite->rmx_mtu = hc_entry->rmx_mtu; hc_metrics_lite->rmx_ssthresh = hc_entry->rmx_ssthresh; hc_metrics_lite->rmx_rtt = hc_entry->rmx_rtt; hc_metrics_lite->rmx_rttvar = hc_entry->rmx_rttvar; hc_metrics_lite->rmx_bandwidth = hc_entry->rmx_bandwidth; hc_metrics_lite->rmx_cwnd = hc_entry->rmx_cwnd; hc_metrics_lite->rmx_sendpipe = hc_entry->rmx_sendpipe; hc_metrics_lite->rmx_recvpipe = hc_entry->rmx_recvpipe; /* * Unlock bucket row. */ THC_UNLOCK(&hc_entry->rmx_head->hch_mtx); } /* * External function: look up an entry in the hostcache and return the * discovered path MTU. Returns NULL if no entry is found or value is not * set. */ u_long tcp_hc_getmtu(struct in_conninfo *inc) { struct hc_metrics *hc_entry; u_long mtu; hc_entry = tcp_hc_lookup(inc); if (hc_entry == NULL) { return 0; } hc_entry->rmx_hits++; hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */ mtu = hc_entry->rmx_mtu; THC_UNLOCK(&hc_entry->rmx_head->hch_mtx); return mtu; } /* * External function: update the MTU value of an entry in the hostcache. * Creates a new entry if none was found. */ void tcp_hc_updatemtu(struct in_conninfo *inc, u_long mtu) { struct hc_metrics *hc_entry; /* * Find the right bucket. */ hc_entry = tcp_hc_lookup(inc); /* * If we don't have an existing object, try to insert a new one. */ if (hc_entry == NULL) { hc_entry = tcp_hc_insert(inc); if (hc_entry == NULL) return; } hc_entry->rmx_updates++; hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */ hc_entry->rmx_mtu = mtu; /* * Put it upfront so we find it faster next time. */ TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q); TAILQ_INSERT_HEAD(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q); /* * Unlock bucket row. */ THC_UNLOCK(&hc_entry->rmx_head->hch_mtx); } /* * External function: update the TCP metrics of an entry in the hostcache. * Creates a new entry if none was found. */ void tcp_hc_update(struct in_conninfo *inc, struct hc_metrics_lite *hcml) { struct hc_metrics *hc_entry; hc_entry = tcp_hc_lookup(inc); if (hc_entry == NULL) { hc_entry = tcp_hc_insert(inc); if (hc_entry == NULL) return; } hc_entry->rmx_updates++; hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */ if (hcml->rmx_rtt != 0) { if (hc_entry->rmx_rtt == 0) hc_entry->rmx_rtt = hcml->rmx_rtt; else hc_entry->rmx_rtt = (hc_entry->rmx_rtt + hcml->rmx_rtt) / 2; TCPSTAT_INC(tcps_cachedrtt); } if (hcml->rmx_rttvar != 0) { if (hc_entry->rmx_rttvar == 0) hc_entry->rmx_rttvar = hcml->rmx_rttvar; else hc_entry->rmx_rttvar = (hc_entry->rmx_rttvar + hcml->rmx_rttvar) / 2; TCPSTAT_INC(tcps_cachedrttvar); } if (hcml->rmx_ssthresh != 0) { if (hc_entry->rmx_ssthresh == 0) hc_entry->rmx_ssthresh = hcml->rmx_ssthresh; else hc_entry->rmx_ssthresh = (hc_entry->rmx_ssthresh + hcml->rmx_ssthresh) / 2; TCPSTAT_INC(tcps_cachedssthresh); } if (hcml->rmx_bandwidth != 0) { if (hc_entry->rmx_bandwidth == 0) hc_entry->rmx_bandwidth = hcml->rmx_bandwidth; else hc_entry->rmx_bandwidth = (hc_entry->rmx_bandwidth + hcml->rmx_bandwidth) / 2; /* TCPSTAT_INC(tcps_cachedbandwidth); */ } if (hcml->rmx_cwnd != 0) { if (hc_entry->rmx_cwnd == 0) hc_entry->rmx_cwnd = hcml->rmx_cwnd; else hc_entry->rmx_cwnd = (hc_entry->rmx_cwnd + hcml->rmx_cwnd) / 2; /* TCPSTAT_INC(tcps_cachedcwnd); */ } if (hcml->rmx_sendpipe != 0) { if (hc_entry->rmx_sendpipe == 0) hc_entry->rmx_sendpipe = hcml->rmx_sendpipe; else hc_entry->rmx_sendpipe = (hc_entry->rmx_sendpipe + hcml->rmx_sendpipe) /2; /* TCPSTAT_INC(tcps_cachedsendpipe); */ } if (hcml->rmx_recvpipe != 0) { if (hc_entry->rmx_recvpipe == 0) hc_entry->rmx_recvpipe = hcml->rmx_recvpipe; else hc_entry->rmx_recvpipe = (hc_entry->rmx_recvpipe + hcml->rmx_recvpipe) /2; /* TCPSTAT_INC(tcps_cachedrecvpipe); */ } TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q); TAILQ_INSERT_HEAD(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q); THC_UNLOCK(&hc_entry->rmx_head->hch_mtx); } /* * Sysctl function: prints the list and values of all hostcache entries in * unsorted order. */ static int sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS) { int bufsize; int linesize = 128; char *p, *buf; int len, i, error; struct hc_metrics *hc_entry; #ifdef INET6 char ip6buf[INET6_ADDRSTRLEN]; #endif bufsize = linesize * (V_tcp_hostcache.cache_count + 1); p = buf = (char *)malloc(bufsize, M_TEMP, M_WAITOK|M_ZERO); len = snprintf(p, linesize, "\nIP address MTU SSTRESH RTT RTTVAR BANDWIDTH " " CWND SENDPIPE RECVPIPE HITS UPD EXP\n"); p += len; #define msec(u) (((u) + 500) / 1000) for (i = 0; i < V_tcp_hostcache.hashsize; i++) { THC_LOCK(&V_tcp_hostcache.hashbase[i].hch_mtx); TAILQ_FOREACH(hc_entry, &V_tcp_hostcache.hashbase[i].hch_bucket, rmx_q) { len = snprintf(p, linesize, "%-15s %5lu %8lu %6lums %6lums %9lu %8lu %8lu %8lu " "%4lu %4lu %4i\n", hc_entry->ip4.s_addr ? inet_ntoa(hc_entry->ip4) : #ifdef INET6 ip6_sprintf(ip6buf, &hc_entry->ip6), #else "IPv6?", #endif hc_entry->rmx_mtu, hc_entry->rmx_ssthresh, msec(hc_entry->rmx_rtt * (RTM_RTTUNIT / (hz * TCP_RTT_SCALE))), msec(hc_entry->rmx_rttvar * (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE))), hc_entry->rmx_bandwidth * 8, hc_entry->rmx_cwnd, hc_entry->rmx_sendpipe, hc_entry->rmx_recvpipe, hc_entry->rmx_hits, hc_entry->rmx_updates, hc_entry->rmx_expire); p += len; } THC_UNLOCK(&V_tcp_hostcache.hashbase[i].hch_mtx); } #undef msec error = SYSCTL_OUT(req, buf, p - buf); free(buf, M_TEMP); return(error); } /* * Caller has to make sure the curvnet is set properly. */ static void tcp_hc_purge_internal(int all) { struct hc_metrics *hc_entry, *hc_next; int i; for (i = 0; i < V_tcp_hostcache.hashsize; i++) { THC_LOCK(&V_tcp_hostcache.hashbase[i].hch_mtx); TAILQ_FOREACH_SAFE(hc_entry, &V_tcp_hostcache.hashbase[i].hch_bucket, rmx_q, hc_next) { if (all || hc_entry->rmx_expire <= 0) { TAILQ_REMOVE(&V_tcp_hostcache.hashbase[i].hch_bucket, hc_entry, rmx_q); uma_zfree(V_tcp_hostcache.zone, hc_entry); V_tcp_hostcache.hashbase[i].hch_length--; V_tcp_hostcache.cache_count--; } else hc_entry->rmx_expire -= V_tcp_hostcache.prune; } THC_UNLOCK(&V_tcp_hostcache.hashbase[i].hch_mtx); } } /* * Expire and purge (old|all) entries in the tcp_hostcache. Runs * periodically from the callout. */ static void tcp_hc_purge(void *arg) { CURVNET_SET((struct vnet *) arg); int all = 0; if (V_tcp_hostcache.purgeall) { all = 1; V_tcp_hostcache.purgeall = 0; } tcp_hc_purge_internal(all); callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz, tcp_hc_purge, arg); CURVNET_RESTORE(); } Index: user/dchagin/lemul/sys/netinet/tcp_input.c =================================================================== --- user/dchagin/lemul/sys/netinet/tcp_input.c (revision 274240) +++ user/dchagin/lemul/sys/netinet/tcp_input.c (revision 274241) @@ -1,3678 +1,3678 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * Copyright (c) 2007-2008,2010 * Swinburne University of Technology, Melbourne, Australia. * Copyright (c) 2009-2010 Lawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, by Lawrence Stewart, * James Healy and David Hayes, made possible in part by a grant from the Cisco * University Research Program Fund at Community Foundation Silicon Valley. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by David Hayes under sponsorship from the FreeBSD Foundation. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ipfw.h" /* for ipfw_fwd */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include #include /* before tcp_seq.h, for tcp_random18() */ #include #include #include #include #include #define TCPSTATES /* for logging */ #include #include #include #include #include #include #include #include /* required for icmp_var.h */ #include /* for ICMP_BANDLIM */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ #ifdef TCP_OFFLOAD #include #endif #ifdef IPSEC #include #include #endif /*IPSEC*/ #include #include const int tcprexmtthresh = 3; int tcp_log_in_vain = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, &tcp_log_in_vain, 0, "Log all incoming TCP segments to closed ports"); VNET_DEFINE(int, blackhole) = 0; #define V_blackhole VNET(blackhole) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(blackhole), 0, "Do not send RST on segments to closed ports"); VNET_DEFINE(int, tcp_delack_enabled) = 1; -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_delack_enabled), 0, "Delay ACK to try and piggyback it onto a data packet"); VNET_DEFINE(int, drop_synfin) = 0; #define V_drop_synfin VNET(drop_synfin) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(drop_synfin), 0, "Drop TCP packets with SYN+FIN set"); VNET_DEFINE(int, tcp_do_rfc3042) = 1; #define V_tcp_do_rfc3042 VNET(tcp_do_rfc3042) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3042), 0, "Enable RFC 3042 (Limited Transmit)"); VNET_DEFINE(int, tcp_do_rfc3390) = 1; -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3390), 0, "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)"); SYSCTL_NODE(_net_inet_tcp, OID_AUTO, experimental, CTLFLAG_RW, 0, "Experimental TCP extensions"); VNET_DEFINE(int, tcp_do_initcwnd10) = 1; -SYSCTL_VNET_INT(_net_inet_tcp_experimental, OID_AUTO, initcwnd10, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp_experimental, OID_AUTO, initcwnd10, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_initcwnd10), 0, "Enable RFC 6928 (Increasing initial CWND to 10)"); VNET_DEFINE(int, tcp_do_rfc3465) = 1; -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3465), 0, "Enable RFC 3465 (Appropriate Byte Counting)"); VNET_DEFINE(int, tcp_abc_l_var) = 2; -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_abc_l_var), 2, "Cap the max cwnd increment during slow-start to this number of segments"); static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN"); VNET_DEFINE(int, tcp_do_ecn) = 0; -SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_ecn), 0, "TCP ECN support"); VNET_DEFINE(int, tcp_ecn_maxretries) = 1; -SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ecn_maxretries), 0, "Max retries before giving up on ECN"); VNET_DEFINE(int, tcp_insecure_syn) = 0; #define V_tcp_insecure_syn VNET(tcp_insecure_syn) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, insecure_syn, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_syn, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_insecure_syn), 0, "Follow RFC793 instead of RFC5961 criteria for accepting SYN packets"); VNET_DEFINE(int, tcp_insecure_rst) = 0; #define V_tcp_insecure_rst VNET(tcp_insecure_rst) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_insecure_rst), 0, "Follow RFC793 instead of RFC5961 criteria for accepting RST packets"); VNET_DEFINE(int, tcp_recvspace) = 1024*64; #define V_tcp_recvspace VNET(tcp_recvspace) -SYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_recvspace), 0, "Initial receive socket buffer size"); VNET_DEFINE(int, tcp_do_autorcvbuf) = 1; #define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_autorcvbuf), 0, "Enable automatic receive buffer sizing"); VNET_DEFINE(int, tcp_autorcvbuf_inc) = 16*1024; #define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autorcvbuf_inc), 0, "Incrementor step size of automatic receive buffer"); VNET_DEFINE(int, tcp_autorcvbuf_max) = 2*1024*1024; #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autorcvbuf_max), 0, "Max size of automatic receive buffer"); VNET_DEFINE(struct inpcbhead, tcb); #define tcb6 tcb /* for KAME src sync over BSD*'s */ VNET_DEFINE(struct inpcbinfo, tcbinfo); static void tcp_dooptions(struct tcpopt *, u_char *, int, int); static void tcp_do_segment(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t, int); static void tcp_dropwithreset(struct mbuf *, struct tcphdr *, struct tcpcb *, int, int); static void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); static void tcp_xmit_timer(struct tcpcb *, int); static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); static void inline cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type); static void inline cc_conn_init(struct tcpcb *tp); static void inline cc_post_recovery(struct tcpcb *tp, struct tcphdr *th); static void inline hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to); /* * TCP statistics are stored in an "array" of counter(9)s. */ VNET_PCPUSTAT_DEFINE(struct tcpstat, tcpstat); VNET_PCPUSTAT_SYSINIT(tcpstat); SYSCTL_VNET_PCPUSTAT(_net_inet_tcp, TCPCTL_STATS, stats, struct tcpstat, tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(tcpstat); #endif /* VIMAGE */ /* * Kernel module interface for updating tcpstat. The argument is an index * into tcpstat treated as an array. */ void kmod_tcpstat_inc(int statnum) { counter_u64_add(VNET(tcpstat)[statnum], 1); } /* * Wrapper for the TCP established input helper hook. */ static void inline hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to) { struct tcp_hhook_data hhook_data; if (V_tcp_hhh[HHOOK_TCP_EST_IN]->hhh_nhooks > 0) { hhook_data.tp = tp; hhook_data.th = th; hhook_data.to = to; hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_IN], &hhook_data, tp->osd); } } /* * CC wrapper hook functions */ static void inline cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type) { INP_WLOCK_ASSERT(tp->t_inpcb); tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); if (tp->snd_cwnd <= tp->snd_wnd) tp->ccv->flags |= CCF_CWND_LIMITED; else tp->ccv->flags &= ~CCF_CWND_LIMITED; if (type == CC_ACK) { if (tp->snd_cwnd > tp->snd_ssthresh) { tp->t_bytes_acked += min(tp->ccv->bytes_this_ack, V_tcp_abc_l_var * tp->t_maxseg); if (tp->t_bytes_acked >= tp->snd_cwnd) { tp->t_bytes_acked -= tp->snd_cwnd; tp->ccv->flags |= CCF_ABC_SENTAWND; } } else { tp->ccv->flags &= ~CCF_ABC_SENTAWND; tp->t_bytes_acked = 0; } } if (CC_ALGO(tp)->ack_received != NULL) { /* XXXLAS: Find a way to live without this */ tp->ccv->curack = th->th_ack; CC_ALGO(tp)->ack_received(tp->ccv, type); } } static void inline cc_conn_init(struct tcpcb *tp) { struct hc_metrics_lite metrics; struct inpcb *inp = tp->t_inpcb; int rtt; INP_WLOCK_ASSERT(tp->t_inpcb); tcp_hc_get(&inp->inp_inc, &metrics); if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { tp->t_srtt = rtt; tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; TCPSTAT_INC(tcps_usedrtt); if (metrics.rmx_rttvar) { tp->t_rttvar = metrics.rmx_rttvar; TCPSTAT_INC(tcps_usedrttvar); } else { /* default variation is +- 1 rtt */ tp->t_rttvar = tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; } TCPT_RANGESET(tp->t_rxtcur, ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, tp->t_rttmin, TCPTV_REXMTMAX); } if (metrics.rmx_ssthresh) { /* * There's some sort of gateway or interface * buffer limit on the path. Use this to set * the slow start threshhold, but set the * threshold to no less than 2*mss. */ tp->snd_ssthresh = max(2 * tp->t_maxseg, metrics.rmx_ssthresh); TCPSTAT_INC(tcps_usedssthresh); } /* * Set the initial slow-start flight size. * * RFC5681 Section 3.1 specifies the default conservative values. * RFC3390 specifies slightly more aggressive values. * RFC6928 increases it to ten segments. * * If a SYN or SYN/ACK was lost and retransmitted, we have to * reduce the initial CWND to one segment as congestion is likely * requiring us to be cautious. */ if (tp->snd_cwnd == 1) tp->snd_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */ else if (V_tcp_do_initcwnd10) tp->snd_cwnd = min(10 * tp->t_maxseg, max(2 * tp->t_maxseg, 14600)); else if (V_tcp_do_rfc3390) tp->snd_cwnd = min(4 * tp->t_maxseg, max(2 * tp->t_maxseg, 4380)); else { /* Per RFC5681 Section 3.1 */ if (tp->t_maxseg > 2190) tp->snd_cwnd = 2 * tp->t_maxseg; else if (tp->t_maxseg > 1095) tp->snd_cwnd = 3 * tp->t_maxseg; else tp->snd_cwnd = 4 * tp->t_maxseg; } if (CC_ALGO(tp)->conn_init != NULL) CC_ALGO(tp)->conn_init(tp->ccv); } void inline cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) { INP_WLOCK_ASSERT(tp->t_inpcb); switch(type) { case CC_NDUPACK: if (!IN_FASTRECOVERY(tp->t_flags)) { tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_ECN_PERMIT) tp->t_flags |= TF_ECN_SND_CWR; } break; case CC_ECN: if (!IN_CONGRECOVERY(tp->t_flags)) { TCPSTAT_INC(tcps_ecn_rcwnd); tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_ECN_PERMIT) tp->t_flags |= TF_ECN_SND_CWR; } break; case CC_RTO: tp->t_dupacks = 0; tp->t_bytes_acked = 0; EXIT_RECOVERY(tp->t_flags); tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg) * tp->t_maxseg; tp->snd_cwnd = tp->t_maxseg; break; case CC_RTO_ERR: TCPSTAT_INC(tcps_sndrexmitbad); /* RTO was unnecessary, so reset everything. */ tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_recover = tp->snd_recover_prev; if (tp->t_flags & TF_WASFRECOVERY) ENTER_FASTRECOVERY(tp->t_flags); if (tp->t_flags & TF_WASCRECOVERY) ENTER_CONGRECOVERY(tp->t_flags); tp->snd_nxt = tp->snd_max; tp->t_flags &= ~TF_PREVVALID; tp->t_badrxtwin = 0; break; } if (CC_ALGO(tp)->cong_signal != NULL) { if (th != NULL) tp->ccv->curack = th->th_ack; CC_ALGO(tp)->cong_signal(tp->ccv, type); } } static void inline cc_post_recovery(struct tcpcb *tp, struct tcphdr *th) { INP_WLOCK_ASSERT(tp->t_inpcb); /* XXXLAS: KASSERT that we're in recovery? */ if (CC_ALGO(tp)->post_recovery != NULL) { tp->ccv->curack = th->th_ack; CC_ALGO(tp)->post_recovery(tp->ccv); } /* XXXLAS: EXIT_RECOVERY ? */ tp->t_bytes_acked = 0; } #ifdef TCP_SIGNATURE static inline int tcp_signature_verify_input(struct mbuf *m, int off0, int tlen, int optlen, struct tcpopt *to, struct tcphdr *th, u_int tcpbflag) { int ret; tcp_fields_to_net(th); ret = tcp_signature_verify(m, off0, tlen, optlen, to, th, tcpbflag); tcp_fields_to_host(th); return (ret); } #endif /* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ #ifdef INET6 #define ND6_HINT(tp) \ do { \ if ((tp) && (tp)->t_inpcb && \ ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \ nd6_nud_hint(NULL, NULL, 0); \ } while (0) #else #define ND6_HINT(tp) #endif /* * Indicate whether this ack should be delayed. We can delay the ack if * following conditions are met: * - There is no delayed ack timer in progress. * - Our last ack wasn't a 0-sized window. We never want to delay * the ack that opens up a 0-sized window. * - LRO wasn't used for this segment. We make sure by checking that the * segment size is not larger than the MSS. * - Delayed acks are enabled or this is a half-synchronized T/TCP * connection. */ #define DELAY_ACK(tp, tlen) \ ((!tcp_timer_active(tp, TT_DELACK) && \ (tp->t_flags & TF_RXWIN0SENT) == 0) && \ (tlen <= tp->t_maxopd) && \ (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) /* * TCP input handling is split into multiple parts: * tcp6_input is a thin wrapper around tcp_input for the extended * ip6_protox[] call format in ip6_input * tcp_input handles primary segment validation, inpcb lookup and * SYN processing on listen sockets * tcp_do_segment processes the ACK and text of the segment for * establishing, established and closing connections */ #ifdef INET6 int tcp6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct in6_ifaddr *ia6; IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE); /* * draft-itojun-ipv6-tcp-to-anycast * better place to put this in? */ ia6 = ip6_getdstifaddr(m); if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { struct ip6_hdr *ip6; ifa_free(&ia6->ia_ifa); ip6 = mtod(m, struct ip6_hdr *); icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); return (IPPROTO_DONE); } if (ia6) ifa_free(&ia6->ia_ifa); return (tcp_input(mp, offp, proto)); } #endif /* INET6 */ int tcp_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct tcphdr *th = NULL; struct ip *ip = NULL; struct inpcb *inp = NULL; struct tcpcb *tp = NULL; struct socket *so = NULL; u_char *optp = NULL; int off0; int optlen = 0; #ifdef INET int len; #endif int tlen = 0, off; int drop_hdrlen; int thflags; int rstreason = 0; /* For badport_bandlim accounting purposes */ #ifdef TCP_SIGNATURE uint8_t sig_checked = 0; #endif uint8_t iptos = 0; struct m_tag *fwd_tag = NULL; #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6; #else const void *ip6 = NULL; #endif /* INET6 */ struct tcpopt to; /* options in this segment */ char *s = NULL; /* address and port logging */ int ti_locked; #define TI_UNLOCKED 1 #define TI_WLOCKED 2 #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif #ifdef INET6 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; #endif off0 = *offp; m = *mp; *mp = NULL; to.to_flags = 0; TCPSTAT_INC(tcps_rcvtotal); #ifdef INET6 if (isipv6) { /* IP6_EXTHDR_CHECK() is already done at tcp6_input(). */ if (m->m_len < (sizeof(*ip6) + sizeof(*th))) { m = m_pullup(m, sizeof(*ip6) + sizeof(*th)); if (m == NULL) { TCPSTAT_INC(tcps_rcvshort); return (IPPROTO_DONE); } } ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)((caddr_t)ip6 + off0); tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; else th->th_sum = in6_cksum_pseudo(ip6, tlen, IPPROTO_TCP, m->m_pkthdr.csum_data); th->th_sum ^= 0xffff; } else th->th_sum = in6_cksum(m, IPPROTO_TCP, off0, tlen); if (th->th_sum) { TCPSTAT_INC(tcps_rcvbadsum); goto drop; } /* * Be proactive about unspecified IPv6 address in source. * As we use all-zero to indicate unbounded/unconnected pcb, * unspecified IPv6 address can be used to confuse us. * * Note that packets with unspecified IPv6 destination is * already dropped in ip6_input. */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { /* XXX stat */ goto drop; } } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { /* * Get IP and TCP header together in first mbuf. * Note: IP leaves IP header in first mbuf. */ if (off0 > sizeof (struct ip)) { ip_stripoptions(m); off0 = sizeof(struct ip); } if (m->m_len < sizeof (struct tcpiphdr)) { if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == NULL) { TCPSTAT_INC(tcps_rcvshort); return (IPPROTO_DONE); } } ip = mtod(m, struct ip *); th = (struct tcphdr *)((caddr_t)ip + off0); tlen = ntohs(ip->ip_len) - off0; if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; else th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data + tlen + IPPROTO_TCP)); th->th_sum ^= 0xffff; } else { struct ipovly *ipov = (struct ipovly *)ip; /* * Checksum extended TCP header and data. */ len = off0 + tlen; bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); ipov->ih_len = htons(tlen); th->th_sum = in_cksum(m, len); /* Reset length for SDT probes. */ ip->ip_len = htons(tlen + off0); } if (th->th_sum) { TCPSTAT_INC(tcps_rcvbadsum); goto drop; } /* Re-initialization for later version check */ ip->ip_v = IPVERSION; } #endif /* INET */ #ifdef INET6 if (isipv6) iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET iptos = ip->ip_tos; #endif /* * Check that TCP offset makes sense, * pull out TCP options and adjust length. XXX */ off = th->th_off << 2; if (off < sizeof (struct tcphdr) || off > tlen) { TCPSTAT_INC(tcps_rcvbadoff); goto drop; } tlen -= off; /* tlen is used instead of ti->ti_len */ if (off > sizeof (struct tcphdr)) { #ifdef INET6 if (isipv6) { IP6_EXTHDR_CHECK(m, off0, off, IPPROTO_DONE); ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)((caddr_t)ip6 + off0); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { if (m->m_len < sizeof(struct ip) + off) { if ((m = m_pullup(m, sizeof (struct ip) + off)) == NULL) { TCPSTAT_INC(tcps_rcvshort); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); th = (struct tcphdr *)((caddr_t)ip + off0); } } #endif optlen = off - sizeof (struct tcphdr); optp = (u_char *)(th + 1); } thflags = th->th_flags; /* * Convert TCP protocol specific fields to host format. */ tcp_fields_to_host(th); /* * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options. */ drop_hdrlen = off0 + off; /* * Locate pcb for segment; if we're likely to add or remove a * connection then first acquire pcbinfo lock. There are three cases * where we might discover later we need a write lock despite the * flags: ACKs moving a connection out of the syncache, ACKs for a * connection in TIMEWAIT and SYNs not targeting a listening socket. */ if ((thflags & (TH_FIN | TH_RST)) != 0) { INP_INFO_WLOCK(&V_tcbinfo); ti_locked = TI_WLOCKED; } else ti_locked = TI_UNLOCKED; /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ if ( #ifdef INET6 (isipv6 && (m->m_flags & M_IP6_NEXTHOP)) #ifdef INET || (!isipv6 && (m->m_flags & M_IP_NEXTHOP)) #endif #endif #if defined(INET) && !defined(INET6) (m->m_flags & M_IP_NEXTHOP) #endif ) fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); findpcb: #ifdef INVARIANTS if (ti_locked == TI_WLOCKED) { INP_INFO_WLOCK_ASSERT(&V_tcbinfo); } else { INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); } #endif #ifdef INET6 if (isipv6 && fwd_tag != NULL) { struct sockaddr_in6 *next_hop6; next_hop6 = (struct sockaddr_in6 *)(fwd_tag + 1); /* * Transparently forwarded. Pretend to be the destination. * Already got one like this? */ inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif, m); if (!inp) { /* * It's new. Try to find the ambushing socket. * Because we've rewritten the destination address, * any hardware-generated hash is ignored. */ inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_src, th->th_sport, &next_hop6->sin6_addr, next_hop6->sin6_port ? ntohs(next_hop6->sin6_port) : th->th_dport, INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif); } } else if (isipv6) { inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif, m); } #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET if (fwd_tag != NULL) { struct sockaddr_in *next_hop; next_hop = (struct sockaddr_in *)(fwd_tag+1); /* * Transparently forwarded. Pretend to be the destination. * already got one like this? */ inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif, m); if (!inp) { /* * It's new. Try to find the ambushing socket. * Because we've rewritten the destination address, * any hardware-generated hash is ignored. */ inp = in_pcblookup(&V_tcbinfo, ip->ip_src, th->th_sport, next_hop->sin_addr, next_hop->sin_port ? ntohs(next_hop->sin_port) : th->th_dport, INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif); } } else inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif, m); #endif /* INET */ /* * If the INPCB does not exist then all data in the incoming * segment is discarded and an appropriate RST is sent back. * XXX MRT Send RST using which routing table? */ if (inp == NULL) { /* * Log communication attempts to ports that are not * in use. */ if ((tcp_log_in_vain == 1 && (thflags & TH_SYN)) || tcp_log_in_vain == 2) { if ((s = tcp_log_vain(NULL, th, (void *)ip, ip6))) log(LOG_INFO, "%s; %s: Connection attempt " "to closed port\n", s, __func__); } /* * When blackholing do not respond with a RST but * completely ignore the segment and drop it. */ if ((V_blackhole == 1 && (thflags & TH_SYN)) || V_blackhole == 2) goto dropunlock; rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } INP_WLOCK_ASSERT(inp); if (!(inp->inp_flags & INP_HW_FLOWID) && (m->m_flags & M_FLOWID) && ((inp->inp_socket == NULL) || !(inp->inp_socket->so_options & SO_ACCEPTCONN))) { inp->inp_flags |= INP_HW_FLOWID; inp->inp_flags &= ~INP_SW_FLOWID; inp->inp_flowid = m->m_pkthdr.flowid; inp->inp_flowtype = M_HASHTYPE_GET(m); } #ifdef IPSEC #ifdef INET6 if (isipv6 && ipsec6_in_reject(m, inp)) { IPSEC6STAT_INC(ips_in_polvio); goto dropunlock; } else #endif /* INET6 */ if (ipsec4_in_reject(m, inp) != 0) { IPSECSTAT_INC(ips_in_polvio); goto dropunlock; } #endif /* IPSEC */ /* * Check the minimum TTL for socket. */ if (inp->inp_ip_minttl != 0) { #ifdef INET6 if (isipv6 && inp->inp_ip_minttl > ip6->ip6_hlim) goto dropunlock; else #endif if (inp->inp_ip_minttl > ip->ip_ttl) goto dropunlock; } /* * A previous connection in TIMEWAIT state is supposed to catch stray * or duplicate segments arriving late. If this segment was a * legitimate new connection attempt, the old INPCB gets removed and * we can try again to find a listening socket. * * At this point, due to earlier optimism, we may hold only an inpcb * lock, and not the inpcbinfo write lock. If so, we need to try to * acquire it, or if that fails, acquire a reference on the inpcb, * drop all locks, acquire a global write lock, and then re-acquire * the inpcb lock. We may at that point discover that another thread * has tried to free the inpcb, in which case we need to loop back * and try to find a new inpcb to deliver to. * * XXXRW: It may be time to rethink timewait locking. */ relocked: if (inp->inp_flags & INP_TIMEWAIT) { if (ti_locked == TI_UNLOCKED) { if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) { in_pcbref(inp); INP_WUNLOCK(inp); INP_INFO_WLOCK(&V_tcbinfo); ti_locked = TI_WLOCKED; INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) { inp = NULL; goto findpcb; } } else ti_locked = TI_WLOCKED; } INP_INFO_WLOCK_ASSERT(&V_tcbinfo); if (thflags & TH_SYN) tcp_dooptions(&to, optp, optlen, TO_SYN); /* * NB: tcp_twcheck unlocks the INP and frees the mbuf. */ if (tcp_twcheck(inp, &to, th, m, tlen)) goto findpcb; INP_INFO_WUNLOCK(&V_tcbinfo); return (IPPROTO_DONE); } /* * The TCPCB may no longer exist if the connection is winding * down or it is in the CLOSED state. Either way we drop the * segment and send an appropriate response. */ tp = intotcpcb(inp); if (tp == NULL || tp->t_state == TCPS_CLOSED) { rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) { tcp_offload_input(tp, m); m = NULL; /* consumed by the TOE driver */ goto dropunlock; } #endif /* * We've identified a valid inpcb, but it could be that we need an * inpcbinfo write lock but don't hold it. In this case, attempt to * acquire using the same strategy as the TIMEWAIT case above. If we * relock, we have to jump back to 'relocked' as the connection might * now be in TIMEWAIT. */ #ifdef INVARIANTS if ((thflags & (TH_FIN | TH_RST)) != 0) INP_INFO_WLOCK_ASSERT(&V_tcbinfo); #endif if (!((tp->t_state == TCPS_ESTABLISHED && (thflags & TH_SYN) == 0) || (tp->t_state == TCPS_LISTEN && (thflags & TH_SYN)))) { if (ti_locked == TI_UNLOCKED) { if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) { in_pcbref(inp); INP_WUNLOCK(inp); INP_INFO_WLOCK(&V_tcbinfo); ti_locked = TI_WLOCKED; INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) { inp = NULL; goto findpcb; } goto relocked; } else ti_locked = TI_WLOCKED; } INP_INFO_WLOCK_ASSERT(&V_tcbinfo); } #ifdef MAC INP_WLOCK_ASSERT(inp); if (mac_inpcb_check_deliver(inp, m)) goto dropunlock; #endif so = inp->inp_socket; KASSERT(so != NULL, ("%s: so == NULL", __func__)); #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) { ostate = tp->t_state; #ifdef INET6 if (isipv6) { bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6)); } else #endif bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); tcp_savetcp = *th; } #endif /* TCPDEBUG */ /* * When the socket is accepting connections (the INPCB is in LISTEN * state) we look into the SYN cache if this is a new connection * attempt or the completion of a previous one. */ if (so->so_options & SO_ACCEPTCONN) { struct in_conninfo inc; KASSERT(tp->t_state == TCPS_LISTEN, ("%s: so accepting but " "tp not listening", __func__)); bzero(&inc, sizeof(inc)); #ifdef INET6 if (isipv6) { inc.inc_flags |= INC_ISIPV6; inc.inc6_faddr = ip6->ip6_src; inc.inc6_laddr = ip6->ip6_dst; } else #endif { inc.inc_faddr = ip->ip_src; inc.inc_laddr = ip->ip_dst; } inc.inc_fport = th->th_sport; inc.inc_lport = th->th_dport; inc.inc_fibnum = so->so_fibnum; /* * Check for an existing connection attempt in syncache if * the flag is only ACK. A successful lookup creates a new * socket appended to the listen queue in SYN_RECEIVED state. */ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* * Parse the TCP options here because * syncookies need access to the reflected * timestamp. */ tcp_dooptions(&to, optp, optlen, 0); /* * NB: syncache_expand() doesn't unlock * inp and tcpinfo locks. */ if (!syncache_expand(&inc, &to, th, &so, m)) { /* * No syncache entry or ACK was not * for our SYN/ACK. Send a RST. * NB: syncache did its own logging * of the failure cause. */ rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } if (so == NULL) { /* * We completed the 3-way handshake * but could not allocate a socket * either due to memory shortage, * listen queue length limits or * global socket limits. Send RST * or wait and have the remote end * retransmit the ACK for another * try. */ if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Socket allocation failed due to " "limits or memory shortage, %s\n", s, __func__, V_tcp_sc_rst_sock_fail ? "sending RST" : "try again"); if (V_tcp_sc_rst_sock_fail) { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } else goto dropunlock; } /* * Socket is created in state SYN_RECEIVED. * Unlock the listen socket, lock the newly * created socket and update the tp variable. */ INP_WUNLOCK(inp); /* listen socket */ inp = sotoinpcb(so); INP_WLOCK(inp); /* new connection */ tp = intotcpcb(inp); KASSERT(tp->t_state == TCPS_SYN_RECEIVED, ("%s: ", __func__)); #ifdef TCP_SIGNATURE if (sig_checked == 0) { tcp_dooptions(&to, optp, optlen, (thflags & TH_SYN) ? TO_SYN : 0); if (!tcp_signature_verify_input(m, off0, tlen, optlen, &to, th, tp->t_flags)) { /* * In SYN_SENT state if it receives an * RST, it is allowed for further * processing. */ if ((thflags & TH_RST) == 0 || (tp->t_state == TCPS_SYN_SENT) == 0) goto dropunlock; } sig_checked = 1; } #endif /* * Process the segment and the data it * contains. tcp_do_segment() consumes * the mbuf chain and unlocks the inpcb. */ tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); return (IPPROTO_DONE); } /* * Segment flag validation for new connection attempts: * * Our (SYN|ACK) response was rejected. * Check with syncache and remove entry to prevent * retransmits. * * NB: syncache_chkrst does its own logging of failure * causes. */ if (thflags & TH_RST) { syncache_chkrst(&inc, th); goto dropunlock; } /* * We can't do anything without SYN. */ if ((thflags & TH_SYN) == 0) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN is missing, segment ignored\n", s, __func__); TCPSTAT_INC(tcps_badsyn); goto dropunlock; } /* * (SYN|ACK) is bogus on a listen socket. */ if (thflags & TH_ACK) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN|ACK invalid, segment rejected\n", s, __func__); syncache_badack(&inc); /* XXX: Not needed! */ TCPSTAT_INC(tcps_badsyn); rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } /* * If the drop_synfin option is enabled, drop all * segments with both the SYN and FIN bits set. * This prevents e.g. nmap from identifying the * TCP/IP stack. * XXX: Poor reasoning. nmap has other methods * and is constantly refining its stack detection * strategies. * XXX: This is a violation of the TCP specification * and was used by RFC1644. */ if ((thflags & TH_FIN) && V_drop_synfin) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN|FIN segment ignored (based on " "sysctl setting)\n", s, __func__); TCPSTAT_INC(tcps_badsyn); goto dropunlock; } /* * Segment's flags are (SYN) or (SYN|FIN). * * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored * as they do not affect the state of the TCP FSM. * The data pointed to by TH_URG and th_urp is ignored. */ KASSERT((thflags & (TH_RST|TH_ACK)) == 0, ("%s: Listen socket: TH_RST or TH_ACK set", __func__)); KASSERT(thflags & (TH_SYN), ("%s: Listen socket: TH_SYN not set", __func__)); #ifdef INET6 /* * If deprecated address is forbidden, * we do not accept SYN to deprecated interface * address to prevent any new inbound connection from * getting established. * When we do not accept SYN, we send a TCP RST, * with deprecated source address (instead of dropping * it). We compromise it as it is much better for peer * to send a RST, and RST will be the final packet * for the exchange. * * If we do not forbid deprecated addresses, we accept * the SYN packet. RFC2462 does not suggest dropping * SYN in this case. * If we decipher RFC2462 5.5.4, it says like this: * 1. use of deprecated addr with existing * communication is okay - "SHOULD continue to be * used" * 2. use of it with new communication: * (2a) "SHOULD NOT be used if alternate address * with sufficient scope is available" * (2b) nothing mentioned otherwise. * Here we fall into (2b) case as we have no choice in * our source address selection - we must obey the peer. * * The wording in RFC2462 is confusing, and there are * multiple description text for deprecated address * handling - worse, they are not exactly the same. * I believe 5.5.4 is the best one, so we follow 5.5.4. */ if (isipv6 && !V_ip6_use_deprecated) { struct in6_ifaddr *ia6; ia6 = ip6_getdstifaddr(m); if (ia6 != NULL && (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { ifa_free(&ia6->ia_ifa); if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt to deprecated " "IPv6 address rejected\n", s, __func__); rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } if (ia6) ifa_free(&ia6->ia_ifa); } #endif /* INET6 */ /* * Basic sanity checks on incoming SYN requests: * Don't respond if the destination is a link layer * broadcast according to RFC1122 4.2.3.10, p. 104. * If it is from this socket it must be forged. * Don't respond if the source or destination is a * global or subnet broad- or multicast address. * Note that it is quite possible to receive unicast * link-layer packets with a broadcast IP address. Use * in_broadcast() to find them. */ if (m->m_flags & (M_BCAST|M_MCAST)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from broad- or multicast " "link layer address ignored\n", s, __func__); goto dropunlock; } #ifdef INET6 if (isipv6) { if (th->th_dport == th->th_sport && IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt to/from self " "ignored\n", s, __func__); goto dropunlock; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from/to multicast " "address ignored\n", s, __func__); goto dropunlock; } } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { if (th->th_dport == th->th_sport && ip->ip_dst.s_addr == ip->ip_src.s_addr) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from/to self " "ignored\n", s, __func__); goto dropunlock; } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from/to broad- " "or multicast address ignored\n", s, __func__); goto dropunlock; } } #endif /* * SYN appears to be valid. Create compressed TCP state * for syncache. */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif tcp_dooptions(&to, optp, optlen, TO_SYN); syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL); /* * Entry added to syncache and mbuf consumed. * Only the listen socket is unlocked by syncache_add(). */ if (ti_locked == TI_WLOCKED) { INP_INFO_WUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; } INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); return (IPPROTO_DONE); } else if (tp->t_state == TCPS_LISTEN) { /* * When a listen socket is torn down the SO_ACCEPTCONN * flag is removed first while connections are drained * from the accept queue in a unlock/lock cycle of the * ACCEPT_LOCK, opening a race condition allowing a SYN * attempt go through unhandled. */ goto dropunlock; } #ifdef TCP_SIGNATURE if (sig_checked == 0) { tcp_dooptions(&to, optp, optlen, (thflags & TH_SYN) ? TO_SYN : 0); if (!tcp_signature_verify_input(m, off0, tlen, optlen, &to, th, tp->t_flags)) { /* * In SYN_SENT state if it receives an RST, it is * allowed for further processing. */ if ((thflags & TH_RST) == 0 || (tp->t_state == TCPS_SYN_SENT) == 0) goto dropunlock; } sig_checked = 1; } #endif TCP_PROBE5(receive, NULL, tp, mtod(m, const char *), tp, th); /* * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later * state. tcp_do_segment() always consumes the mbuf chain, unlocks * the inpcb, and unlocks pcbinfo. */ tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); return (IPPROTO_DONE); dropwithreset: TCP_PROBE5(receive, NULL, tp, mtod(m, const char *), tp, th); if (ti_locked == TI_WLOCKED) { INP_INFO_WUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; } #ifdef INVARIANTS else { KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropwithreset " "ti_locked: %d", __func__, ti_locked)); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); } #endif if (inp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); INP_WUNLOCK(inp); } else tcp_dropwithreset(m, th, NULL, tlen, rstreason); m = NULL; /* mbuf chain got consumed. */ goto drop; dropunlock: if (m != NULL) TCP_PROBE5(receive, NULL, tp, mtod(m, const char *), tp, th); if (ti_locked == TI_WLOCKED) { INP_INFO_WUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; } #ifdef INVARIANTS else { KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropunlock " "ti_locked: %d", __func__, ti_locked)); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); } #endif if (inp != NULL) INP_WUNLOCK(inp); drop: INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); if (s != NULL) free(s, M_TCPLOG); if (m != NULL) m_freem(m); return (IPPROTO_DONE); } static void tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos, int ti_locked) { int thflags, acked, ourfinisacked, needoutput = 0; int rstreason, todrop, win; u_long tiwin; char *s; struct in_conninfo *inc; struct mbuf *mfree; struct tcpopt to; #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif thflags = th->th_flags; inc = &tp->t_inpcb->inp_inc; tp->sackhint.last_sack_ack = 0; /* * If this is either a state-changing packet or current state isn't * established, we require a write lock on tcbinfo. Otherwise, we * allow the tcbinfo to be in either alocked or unlocked, as the * caller may have unnecessarily acquired a write lock due to a race. */ if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || tp->t_state != TCPS_ESTABLISHED) { KASSERT(ti_locked == TI_WLOCKED, ("%s ti_locked %d for " "SYN/FIN/RST/!EST", __func__, ti_locked)); INP_INFO_WLOCK_ASSERT(&V_tcbinfo); } else { #ifdef INVARIANTS if (ti_locked == TI_WLOCKED) INP_INFO_WLOCK_ASSERT(&V_tcbinfo); else { KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST " "ti_locked: %d", __func__, ti_locked)); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); } #endif } INP_WLOCK_ASSERT(tp->t_inpcb); KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", __func__)); KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", __func__)); /* * Segment received on connection. * Reset idle time and keep-alive timer. * XXX: This should be done after segment * validation to ignore broken/spoofed segs. */ tp->t_rcvtime = ticks; if (TCPS_HAVEESTABLISHED(tp->t_state)) tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); /* * Unscale the window into a 32-bit value. * For the SYN_SENT state the scale is zero. */ tiwin = th->th_win << tp->snd_scale; /* * TCP ECN processing. */ if (tp->t_flags & TF_ECN_PERMIT) { if (thflags & TH_CWR) tp->t_flags &= ~TF_ECN_SND_ECE; switch (iptos & IPTOS_ECN_MASK) { case IPTOS_ECN_CE: tp->t_flags |= TF_ECN_SND_ECE; TCPSTAT_INC(tcps_ecn_ce); break; case IPTOS_ECN_ECT0: TCPSTAT_INC(tcps_ecn_ect0); break; case IPTOS_ECN_ECT1: TCPSTAT_INC(tcps_ecn_ect1); break; } /* Congestion experienced. */ if (thflags & TH_ECE) { cc_cong_signal(tp, th, CC_ECN); } } /* * Parse options on any incoming segment. */ tcp_dooptions(&to, (u_char *)(th + 1), (th->th_off << 2) - sizeof(struct tcphdr), (thflags & TH_SYN) ? TO_SYN : 0); /* * If echoed timestamp is later than the current time, * fall back to non RFC1323 RTT calculation. Normalize * timestamp if syncookies were used when this connection * was established. */ if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { to.to_tsecr -= tp->ts_offset; if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks())) to.to_tsecr = 0; } /* * If timestamps were negotiated during SYN/ACK they should * appear on every segment during this session and vice versa. */ if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp missing, " "no action\n", s, __func__); free(s, M_TCPLOG); } } if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp not expected, " "no action\n", s, __func__); free(s, M_TCPLOG); } } /* * Process options only when we get SYN/ACK back. The SYN case * for incoming connections is handled in tcp_syncache. * According to RFC1323 the window field in a SYN (i.e., a * or ) segment itself is never scaled. * XXX this is traditional behavior, may need to be cleaned up. */ if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { if ((to.to_flags & TOF_SCALE) && (tp->t_flags & TF_REQ_SCALE)) { tp->t_flags |= TF_RCVD_SCALE; tp->snd_scale = to.to_wscale; } /* * Initial send window. It will be updated with * the next incoming segment to the scaled value. */ tp->snd_wnd = th->th_win; if (to.to_flags & TOF_TS) { tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = to.to_tsval; tp->ts_recent_age = tcp_ts_getticks(); } if (to.to_flags & TOF_MSS) tcp_mss(tp, to.to_mss); if ((tp->t_flags & TF_SACK_PERMIT) && (to.to_flags & TOF_SACKPERM) == 0) tp->t_flags &= ~TF_SACK_PERMIT; } /* * Header prediction: check for the two common cases * of a uni-directional data xfer. If the packet has * no control flags, is in-sequence, the window didn't * change and we're not retransmitting, it's a * candidate. If the length is zero and the ack moved * forward, we're the sender side of the xfer. Just * free the data acked & wake any higher level process * that was blocked waiting for space. If the length * is non-zero and the ack didn't move, we're the * receiver side. If we're getting packets in-order * (the reassembly queue is empty), add the data to * the socket buffer and note that we need a delayed ack. * Make sure that the hidden state-flags are also off. * Since we check for TCPS_ESTABLISHED first, it can only * be TH_NEEDSYN. */ if (tp->t_state == TCPS_ESTABLISHED && th->th_seq == tp->rcv_nxt && (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && tp->snd_nxt == tp->snd_max && tiwin && tiwin == tp->snd_wnd && ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && tp->t_segq == NULL && ((to.to_flags & TOF_TS) == 0 || TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) { /* * If last ACK falls within this segment's sequence numbers, * record the timestamp. * NOTE that the test is modified according to the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to.to_tsval; } if (tlen == 0) { if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && !IN_RECOVERY(tp->t_flags) && (to.to_flags & TOF_SACK) == 0 && TAILQ_EMPTY(&tp->snd_holes)) { /* * This is a pure ack for outstanding data. */ if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; TCPSTAT_INC(tcps_predack); /* * "bad retransmit" recovery. */ if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && (int)(ticks - tp->t_badrxtwin) < 0) { cc_cong_signal(tp, th, CC_RTO_ERR); } /* * Recalculate the transmit timer / rtt. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { u_int t; t = tcp_ts_getticks() - to.to_tsecr; if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } acked = BYTES_THIS_ACK(tp, th); /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ hhook_run_tcp_est_in(tp, th, &to); TCPSTAT_INC(tcps_rcvackpack); TCPSTAT_ADD(tcps_rcvackbyte, acked); sbdrop(&so->so_snd, acked); if (SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; /* * Let the congestion control algorithm update * congestion control related information. This * typically means increasing the congestion * window. */ cc_ack_received(tp, th, CC_ACK); tp->snd_una = th->th_ack; /* * Pull snd_wl2 up to prevent seq wrap relative * to th_ack. */ tp->snd_wl2 = th->th_ack; tp->t_dupacks = 0; m_freem(m); ND6_HINT(tp); /* Some progress has been made. */ /* * If all outstanding data are acked, stop * retransmit timer, otherwise restart timer * using current (possibly backed-off) value. * If process is waiting for space, * wakeup/selwakeup/signal. If data * are ready to send, let tcp_output * decide between more output or persist. */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (tp->snd_una == tp->snd_max) tcp_timer_activate(tp, TT_REXMT, 0); else if (!tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); sowwakeup(so); if (so->so_snd.sb_cc) (void) tcp_output(tp); goto check_delack; } } else if (th->th_ack == tp->snd_una && tlen <= sbspace(&so->so_rcv)) { int newsize = 0; /* automatic sockbuf scaling */ /* * This is a pure, in-sequence data packet with * nothing on the reassembly queue and we have enough * buffer space to take it. */ if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; /* Clean receiver SACK report if present */ if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks) tcp_clean_sackreport(tp); TCPSTAT_INC(tcps_preddat); tp->rcv_nxt += tlen; /* * Pull snd_wl1 up to prevent seq wrap relative to * th_seq. */ tp->snd_wl1 = th->th_seq; /* * Pull rcv_up up to prevent seq wrap relative to * rcv_nxt. */ tp->rcv_up = tp->rcv_nxt; TCPSTAT_INC(tcps_rcvpack); TCPSTAT_ADD(tcps_rcvbyte, tlen); ND6_HINT(tp); /* Some progress has been made */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif /* * Automatic sizing of receive socket buffer. Often the send * buffer size is not optimally adjusted to the actual network * conditions at hand (delay bandwidth product). Setting the * buffer size too small limits throughput on links with high * bandwidth and high delay (eg. trans-continental/oceanic links). * * On the receive side the socket buffer memory is only rarely * used to any significant extent. This allows us to be much * more aggressive in scaling the receive socket buffer. For * the case that the buffer space is actually used to a large * extent and we run out of kernel memory we can simply drop * the new segments; TCP on the sender will just retransmit it * later. Setting the buffer size too big may only consume too * much kernel memory if the application doesn't read() from * the socket or packet loss or reordering makes use of the * reassembly queue. * * The criteria to step up the receive buffer one notch are: * 1. Application has not set receive buffer size with * SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE. * 2. the number of bytes received during the time it takes * one timestamp to be reflected back to us (the RTT); * 3. received bytes per RTT is within seven eighth of the * current socket buffer size; * 4. receive buffer size has not hit maximal automatic size; * * This algorithm does one step per RTT at most and only if * we receive a bulk stream w/o packet losses or reorderings. * Shrinking the buffer during idle times is not necessary as * it doesn't consume any memory when idle. * * TODO: Only step up if the application is actually serving * the buffer to better manage the socket buffer resources. */ if (V_tcp_do_autorcvbuf && to.to_tsecr && (so->so_rcv.sb_flags & SB_AUTOSIZE)) { if (TSTMP_GT(to.to_tsecr, tp->rfbuf_ts) && to.to_tsecr - tp->rfbuf_ts < hz) { if (tp->rfbuf_cnt > (so->so_rcv.sb_hiwat / 8 * 7) && so->so_rcv.sb_hiwat < V_tcp_autorcvbuf_max) { newsize = min(so->so_rcv.sb_hiwat + V_tcp_autorcvbuf_inc, V_tcp_autorcvbuf_max); } /* Start over with next RTT. */ tp->rfbuf_ts = 0; tp->rfbuf_cnt = 0; } else tp->rfbuf_cnt += tlen; /* add up */ } /* Add data to socket buffer. */ SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { m_freem(m); } else { /* * Set new socket buffer size. * Give up when limit is reached. */ if (newsize) if (!sbreserve_locked(&so->so_rcv, newsize, so, NULL)) so->so_rcv.sb_flags &= ~SB_AUTOSIZE; m_adj(m, drop_hdrlen); /* delayed header drop */ sbappendstream_locked(&so->so_rcv, m); } /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); if (DELAY_ACK(tp, tlen)) { tp->t_flags |= TF_DELACK; } else { tp->t_flags |= TF_ACKNOW; tcp_output(tp); } goto check_delack; } } /* * Calculate amount of space in receive window, * and then do TCP input processing. * Receive window is amount of space in rcv queue, * but not less than advertised window. */ win = sbspace(&so->so_rcv); if (win < 0) win = 0; tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); /* Reset receive buffer auto scaling when not in bulk receive mode. */ tp->rfbuf_ts = 0; tp->rfbuf_cnt = 0; switch (tp->t_state) { /* * If the state is SYN_RECEIVED: * if seg contains an ACK, but not for our SYN/ACK, send a RST. */ case TCPS_SYN_RECEIVED: if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } break; /* * If the state is SYN_SENT: * if seg contains an ACK, but not for our SYN, drop the input. * if seg contains a RST, then drop the connection. * if seg does not contain SYN, then drop it. * Otherwise this is an acceptable SYN segment * initialize tp->rcv_nxt and tp->irs * if seg contains ack then advance tp->snd_una * if seg contains an ECE and ECN support is enabled, the stream * is ECN capable. * if SYN has been acked change to ESTABLISHED else SYN_RCVD state * arrange for segment to be acked (eventually) * continue processing rest of data/controls, beginning with URG */ case TCPS_SYN_SENT: if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) { TCP_PROBE5(connect__refused, NULL, tp, mtod(m, const char *), tp, th); tp = tcp_drop(tp, ECONNREFUSED); } if (thflags & TH_RST) goto drop; if (!(thflags & TH_SYN)) goto drop; tp->irs = th->th_seq; tcp_rcvseqinit(tp); if (thflags & TH_ACK) { TCPSTAT_INC(tcps_connects); soisconnected(so); #ifdef MAC mac_socketpeer_set_from_mbuf(m, so); #endif /* Do window scaling on this connection? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; } tp->rcv_adv += imin(tp->rcv_wnd, TCP_MAXWIN << tp->rcv_scale); tp->snd_una++; /* SYN is acked */ /* * If there's data, delay ACK; if there's also a FIN * ACKNOW will be turned on later. */ if (DELAY_ACK(tp, tlen) && tlen != 0) tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); else tp->t_flags |= TF_ACKNOW; if ((thflags & TH_ECE) && V_tcp_do_ecn) { tp->t_flags |= TF_ECN_PERMIT; TCPSTAT_INC(tcps_ecn_shs); } /* * Received in SYN_SENT[*] state. * Transitions: * SYN_SENT --> ESTABLISHED * SYN_SENT* --> FIN_WAIT_1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; } else { tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(connect__established, NULL, tp, mtod(m, const char *), tp, th); cc_conn_init(tp); tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); } } else { /* * Received initial SYN in SYN-SENT[*] state => * simultaneous open. * If it succeeds, connection is * half-synchronized. * Otherwise, do 3-way handshake: * SYN-SENT -> SYN-RECEIVED * SYN-SENT* -> SYN-RECEIVED* */ tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); tcp_timer_activate(tp, TT_REXMT, 0); tcp_state_change(tp, TCPS_SYN_RECEIVED); } KASSERT(ti_locked == TI_WLOCKED, ("%s: trimthenstep6: " "ti_locked %d", __func__, ti_locked)); INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); /* * Advance th->th_seq to correspond to first data byte. * If data, trim to stay within window, * dropping FIN if necessary. */ th->th_seq++; if (tlen > tp->rcv_wnd) { todrop = tlen - tp->rcv_wnd; m_adj(m, -todrop); tlen = tp->rcv_wnd; thflags &= ~TH_FIN; TCPSTAT_INC(tcps_rcvpackafterwin); TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); } tp->snd_wl1 = th->th_seq - 1; tp->rcv_up = th->th_seq; /* * Client side of transaction: already sent SYN and data. * If the remote host used T/TCP to validate the SYN, * our data will be ACK'd; if so, enter normal data segment * processing in the middle of step 5, ack processing. * Otherwise, goto step 6. */ if (thflags & TH_ACK) goto process_ACK; goto step6; /* * If the state is LAST_ACK or CLOSING or TIME_WAIT: * do normal processing. * * NB: Leftover from RFC1644 T/TCP. Cases to be reused later. */ case TCPS_LAST_ACK: case TCPS_CLOSING: break; /* continue normal processing */ } /* * States other than LISTEN or SYN_SENT. * First check the RST flag and sequence number since reset segments * are exempt from the timestamp and connection count tests. This * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix * below which allowed reset segments in half the sequence space * to fall though and be processed (which gives forged reset * segments with a random sequence number a 50 percent chance of * killing a connection). * Then check timestamp, if present. * Then check the connection count, if present. * Then check that at least some bytes of segment are within * receive window. If segment begins before rcv_nxt, * drop leading data (and SYN); if nothing left, just ack. */ if (thflags & TH_RST) { /* * RFC5961 Section 3.2 * * - RST drops connection only if SEG.SEQ == RCV.NXT. * - If RST is in window, we send challenge ACK. * * Note: to take into account delayed ACKs, we should * test against last_ack_sent instead of rcv_nxt. * Note 2: we handle special case of closed window, not * covered by the RFC. */ if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { INP_INFO_WLOCK_ASSERT(&V_tcbinfo); KASSERT(ti_locked == TI_WLOCKED, ("%s: TH_RST ti_locked %d, th %p tp %p", __func__, ti_locked, th, tp)); KASSERT(tp->t_state != TCPS_SYN_SENT, ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", __func__, th, tp)); if (V_tcp_insecure_rst || tp->last_ack_sent == th->th_seq) { TCPSTAT_INC(tcps_drops); /* Drop the connection. */ switch (tp->t_state) { case TCPS_SYN_RECEIVED: so->so_error = ECONNREFUSED; goto close; case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: so->so_error = ECONNRESET; close: tcp_state_change(tp, TCPS_CLOSED); /* FALLTHROUGH */ default: tp = tcp_close(tp); } } else { TCPSTAT_INC(tcps_badrst); /* Send challenge ACK. */ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, tp->snd_nxt, TH_ACK); tp->last_ack_sent = tp->rcv_nxt; m = NULL; } } goto drop; } /* * RFC5961 Section 4.2 * Send challenge ACK for any SYN in synchronized state. */ if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT) { KASSERT(ti_locked == TI_WLOCKED, ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked)); INP_INFO_WLOCK_ASSERT(&V_tcbinfo); TCPSTAT_INC(tcps_badsyn); if (V_tcp_insecure_syn && SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { tp = tcp_drop(tp, ECONNRESET); rstreason = BANDLIM_UNLIMITED; } else { /* Send challenge ACK. */ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, tp->snd_nxt, TH_ACK); tp->last_ack_sent = tp->rcv_nxt; m = NULL; } goto drop; } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. */ if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to.to_tsval, tp->ts_recent)) { /* Check to see if ts_recent is over 24 days old. */ if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { /* * Invalidate ts_recent. If this segment updates * ts_recent, the age will be reset later and ts_recent * will get a valid value. If it does not, setting * ts_recent to zero will at least satisfy the * requirement that zero be placed in the timestamp * echo reply when ts_recent isn't valid. The * age isn't reset until we get a valid ts_recent * because we don't want out-of-order segments to be * dropped when ts_recent is old. */ tp->ts_recent = 0; } else { TCPSTAT_INC(tcps_rcvduppack); TCPSTAT_ADD(tcps_rcvdupbyte, tlen); TCPSTAT_INC(tcps_pawsdrop); if (tlen) goto dropafterack; goto drop; } } /* * In the SYN-RECEIVED state, validate that the packet belongs to * this connection before trimming the data to fit the receive * window. Check the sequence number versus IRS since we know * the sequence numbers haven't wrapped. This is a partial fix * for the "LAND" DoS attack. */ if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } todrop = tp->rcv_nxt - th->th_seq; if (todrop > 0) { if (thflags & TH_SYN) { thflags &= ~TH_SYN; th->th_seq++; if (th->th_urp > 1) th->th_urp--; else thflags &= ~TH_URG; todrop--; } /* * Following if statement from Stevens, vol. 2, p. 960. */ if (todrop > tlen || (todrop == tlen && (thflags & TH_FIN) == 0)) { /* * Any valid FIN must be to the left of the window. * At this point the FIN must be a duplicate or out * of sequence; drop it. */ thflags &= ~TH_FIN; /* * Send an ACK to resynchronize and drop any data. * But keep on processing for RST or ACK. */ tp->t_flags |= TF_ACKNOW; todrop = tlen; TCPSTAT_INC(tcps_rcvduppack); TCPSTAT_ADD(tcps_rcvdupbyte, todrop); } else { TCPSTAT_INC(tcps_rcvpartduppack); TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); } drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; tlen -= todrop; if (th->th_urp > todrop) th->th_urp -= todrop; else { thflags &= ~TH_URG; th->th_urp = 0; } } /* * If new data are received on a connection after the * user processes are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT && tlen) { KASSERT(ti_locked == TI_WLOCKED, ("%s: SS_NOFDEREF && " "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked)); INP_INFO_WLOCK_ASSERT(&V_tcbinfo); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data " "after socket was closed, " "sending RST and removing tcpcb\n", s, __func__, tcpstates[tp->t_state], tlen); free(s, M_TCPLOG); } tp = tcp_close(tp); TCPSTAT_INC(tcps_rcvafterclose); rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } /* * If segment ends after window, drop trailing data * (and PUSH and FIN); if nothing left, just ACK. */ todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); if (todrop > 0) { TCPSTAT_INC(tcps_rcvpackafterwin); if (todrop >= tlen) { TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); /* * If window is closed can only take segments at * window edge, and have to drop data and PUSH from * incoming segments. Continue processing, but * remember to ack. Otherwise, drop segment * and ack. */ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; TCPSTAT_INC(tcps_rcvwinprobe); } else goto dropafterack; } else TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); m_adj(m, -todrop); tlen -= todrop; thflags &= ~(TH_PUSH|TH_FIN); } /* * If last ACK falls within this segment's sequence numbers, * record its timestamp. * NOTE: * 1) That the test incorporates suggestions from the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). * 2) That updating only on newer timestamps interferes with * our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. * 3) That we modify the segment boundary check to be * Last.ACK.Sent <= SEG.SEQ + SEG.Len * instead of RFC1323's * Last.ACK.Sent < SEG.SEQ + SEG.Len, * This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated * Vol. 2 p.869. In such cases, we can still calculate the * RTT correctly when RCV.NXT == Last.ACK.Sent. */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN|TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to.to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN * flag is on (half-synchronized state), then queue data for * later processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_state == TCPS_SYN_RECEIVED || (tp->t_flags & TF_NEEDSYN)) goto step6; else if (tp->t_flags & TF_ACKNOW) goto dropafterack; else goto drop; } /* * Ack processing. */ switch (tp->t_state) { /* * In SYN_RECEIVED state, the ack ACKs our SYN, so enter * ESTABLISHED state and continue processing. * The ACK was checked above. */ case TCPS_SYN_RECEIVED: TCPSTAT_INC(tcps_connects); soisconnected(so); /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; tp->snd_wnd = tiwin; } /* * Make transitions: * SYN-RECEIVED -> ESTABLISHED * SYN-RECEIVED* -> FIN-WAIT-1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; } else { tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(accept__established, NULL, tp, mtod(m, const char *), tp, th); cc_conn_init(tp); tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); } /* * If segment contains data or ACK, will call tcp_reass() * later; if not, do so now to pass queued data to user. */ if (tlen == 0 && (thflags & TH_FIN) == 0) (void) tcp_reass(tp, (struct tcphdr *)0, 0, (struct mbuf *)0); tp->snd_wl1 = th->th_seq - 1; /* FALLTHROUGH */ /* * In ESTABLISHED state: drop duplicate ACKs; ACK out of range * ACKs. If the ack is in the range * tp->snd_una < th->th_ack <= tp->snd_max * then advance tp->snd_una to th->th_ack and drop * data from the retransmission queue. If this ACK reflects * more up to date window information we update our window information. */ case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: if (SEQ_GT(th->th_ack, tp->snd_max)) { TCPSTAT_INC(tcps_rcvacktoomuch); goto dropafterack; } if ((tp->t_flags & TF_SACK_PERMIT) && ((to.to_flags & TOF_SACK) || !TAILQ_EMPTY(&tp->snd_holes))) tcp_sack_doack(tp, &to, th->th_ack); /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ hhook_run_tcp_est_in(tp, th, &to); if (SEQ_LEQ(th->th_ack, tp->snd_una)) { if (tlen == 0 && tiwin == tp->snd_wnd) { /* * If this is the first time we've seen a * FIN from the remote, this is not a * duplicate and it needs to be processed * normally. This happens during a * simultaneous close. */ if ((thflags & TH_FIN) && (TCPS_HAVERCVDFIN(tp->t_state) == 0)) { tp->t_dupacks = 0; break; } TCPSTAT_INC(tcps_rcvdupack); /* * If we have outstanding data (other than * a window probe), this is a completely * duplicate ack (ie, window info didn't * change and FIN isn't set), * the ack is the biggest we've * seen and we've seen exactly our rexmt * threshhold of them, assume a packet * has been dropped and retransmit it. * Kludge snd_nxt & the congestion * window so we send only this one * packet. * * We know we're losing at the current * window size so do congestion avoidance * (set ssthresh to half the current window * and pull our congestion window back to * the new ssthresh). * * Dup acks mean that packets have left the * network (they're now cached at the receiver) * so bump cwnd by the amount in the receiver * to keep a constant cwnd packets in the * network. * * When using TCP ECN, notify the peer that * we reduced the cwnd. */ if (!tcp_timer_active(tp, TT_REXMT) || th->th_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks > tcprexmtthresh || IN_FASTRECOVERY(tp->t_flags)) { cc_ack_received(tp, th, CC_DUPACK); if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags)) { int awnd; /* * Compute the amount of data in flight first. * We can inject new data into the pipe iff * we have less than 1/2 the original window's * worth of data in flight. */ awnd = (tp->snd_nxt - tp->snd_fack) + tp->sackhint.sack_bytes_rexmit; if (awnd < tp->snd_ssthresh) { tp->snd_cwnd += tp->t_maxseg; if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; } } else tp->snd_cwnd += tp->t_maxseg; (void) tcp_output(tp); goto drop; } else if (tp->t_dupacks == tcprexmtthresh) { tcp_seq onxt = tp->snd_nxt; /* * If we're doing sack, check to * see if we're already in sack * recovery. If we're not doing sack, * check to see if we're in newreno * recovery. */ if (tp->t_flags & TF_SACK_PERMIT) { if (IN_FASTRECOVERY(tp->t_flags)) { tp->t_dupacks = 0; break; } } else { if (SEQ_LEQ(th->th_ack, tp->snd_recover)) { tp->t_dupacks = 0; break; } } /* Congestion signal before ack. */ cc_cong_signal(tp, th, CC_NDUPACK); cc_ack_received(tp, th, CC_DUPACK); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; if (tp->t_flags & TF_SACK_PERMIT) { TCPSTAT_INC( tcps_sack_recovery_episode); tp->sack_newdata = tp->snd_nxt; tp->snd_cwnd = tp->t_maxseg; (void) tcp_output(tp); goto drop; } tp->snd_nxt = th->th_ack; tp->snd_cwnd = tp->t_maxseg; (void) tcp_output(tp); KASSERT(tp->snd_limited <= 2, ("%s: tp->snd_limited too big", __func__)); tp->snd_cwnd = tp->snd_ssthresh + tp->t_maxseg * (tp->t_dupacks - tp->snd_limited); if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; } else if (V_tcp_do_rfc3042) { cc_ack_received(tp, th, CC_DUPACK); u_long oldcwnd = tp->snd_cwnd; tcp_seq oldsndmax = tp->snd_max; u_int sent; int avail; KASSERT(tp->t_dupacks == 1 || tp->t_dupacks == 2, ("%s: dupacks not 1 or 2", __func__)); if (tp->t_dupacks == 1) tp->snd_limited = 0; tp->snd_cwnd = (tp->snd_nxt - tp->snd_una) + (tp->t_dupacks - tp->snd_limited) * tp->t_maxseg; /* * Only call tcp_output when there * is new data available to be sent. * Otherwise we would send pure ACKs. */ SOCKBUF_LOCK(&so->so_snd); avail = so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una); SOCKBUF_UNLOCK(&so->so_snd); if (avail > 0) (void) tcp_output(tp); sent = tp->snd_max - oldsndmax; if (sent > tp->t_maxseg) { KASSERT((tp->t_dupacks == 2 && tp->snd_limited == 0) || (sent == tp->t_maxseg + 1 && tp->t_flags & TF_SENTFIN), ("%s: sent too much", __func__)); tp->snd_limited = 2; } else if (sent > 0) ++tp->snd_limited; tp->snd_cwnd = oldcwnd; goto drop; } } else tp->t_dupacks = 0; break; } KASSERT(SEQ_GT(th->th_ack, tp->snd_una), ("%s: th_ack <= snd_una", __func__)); /* * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ if (IN_FASTRECOVERY(tp->t_flags)) { if (SEQ_LT(th->th_ack, tp->snd_recover)) { if (tp->t_flags & TF_SACK_PERMIT) tcp_sack_partialack(tp, th); else tcp_newreno_partial_ack(tp, th); } else cc_post_recovery(tp, th); } tp->t_dupacks = 0; /* * If we reach this point, ACK is not a duplicate, * i.e., it ACKs something we sent. */ if (tp->t_flags & TF_NEEDSYN) { /* * T/TCP: Connection was half-synchronized, and our * SYN has been ACK'd (so connection is now fully * synchronized). Go to non-starred state, * increment snd_una for ACK of SYN, and check if * we can do window scaling. */ tp->t_flags &= ~TF_NEEDSYN; tp->snd_una++; /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; /* Send window already scaled. */ } } process_ACK: INP_WLOCK_ASSERT(tp->t_inpcb); acked = BYTES_THIS_ACK(tp, th); TCPSTAT_INC(tcps_rcvackpack); TCPSTAT_ADD(tcps_rcvackbyte, acked); /* * If we just performed our first retransmit, and the ACK * arrives within our recovery window, then it was a mistake * to do the retransmit in the first place. Recover our * original cwnd and ssthresh, and proceed to transmit where * we left off. */ if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && (int)(ticks - tp->t_badrxtwin) < 0) cc_cong_signal(tp, th, CC_RTO_ERR); /* * If we have a timestamp reply, update smoothed * round trip time. If no timestamp is present but * transmit timer is running and timed sequence * number was acked, update smoothed round trip time. * Since we now have an rtt measurement, cancel the * timer backoff (cf., Phil Karn's retransmit alg.). * Recompute the initial retransmit timer. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { u_int t; t = tcp_ts_getticks() - to.to_tsecr; if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } /* * If all outstanding data is acked, stop retransmit * timer and remember to restart (more output or persist). * If there is more data to be acked, restart retransmit * timer, using current (possibly backed-off) value. */ if (th->th_ack == tp->snd_max) { tcp_timer_activate(tp, TT_REXMT, 0); needoutput = 1; } else if (!tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); /* * If no data (only SYN) was ACK'd, * skip rest of ACK processing. */ if (acked == 0) goto step6; /* * Let the congestion control algorithm update congestion * control related information. This typically means increasing * the congestion window. */ cc_ack_received(tp, th, CC_ACK); SOCKBUF_LOCK(&so->so_snd); if (acked > so->so_snd.sb_cc) { tp->snd_wnd -= so->so_snd.sb_cc; mfree = sbcut_locked(&so->so_snd, (int)so->so_snd.sb_cc); ourfinisacked = 1; } else { mfree = sbcut_locked(&so->so_snd, acked); tp->snd_wnd -= acked; ourfinisacked = 0; } /* NB: sowwakeup_locked() does an implicit unlock. */ sowwakeup_locked(so); m_freem(mfree); /* Detect una wraparound. */ if (!IN_RECOVERY(tp->t_flags) && SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; /* XXXLAS: Can this be moved up into cc_post_recovery? */ if (IN_RECOVERY(tp->t_flags) && SEQ_GEQ(th->th_ack, tp->snd_recover)) { EXIT_RECOVERY(tp->t_flags); } tp->snd_una = th->th_ack; if (tp->t_flags & TF_SACK_PERMIT) { if (SEQ_GT(tp->snd_una, tp->snd_recover)) tp->snd_recover = tp->snd_una; } if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; switch (tp->t_state) { /* * In FIN_WAIT_1 STATE in addition to the processing * for the ESTABLISHED state if our FIN is now acknowledged * then enter FIN_WAIT_2. */ case TCPS_FIN_WAIT_1: if (ourfinisacked) { /* * If we can't receive any more * data, then closing user can proceed. * Starting the timer is contrary to the * specification, but if we don't get a FIN * we'll hang forever. * * XXXjl: * we should release the tp also, and use a * compressed state. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { soisdisconnected(so); tcp_timer_activate(tp, TT_2MSL, (tcp_fast_finwait2_recycle ? tcp_finwait2_timeout : TP_MAXIDLE(tp))); } tcp_state_change(tp, TCPS_FIN_WAIT_2); } break; /* * In CLOSING STATE in addition to the processing for * the ESTABLISHED state if the ACK acknowledges our FIN * then enter the TIME-WAIT state, otherwise ignore * the segment. */ case TCPS_CLOSING: if (ourfinisacked) { INP_INFO_WLOCK_ASSERT(&V_tcbinfo); tcp_twstart(tp); INP_INFO_WUNLOCK(&V_tcbinfo); m_freem(m); return; } break; /* * In LAST_ACK, we may still be waiting for data to drain * and/or to be acked, as well as for the ack of our FIN. * If our FIN is now acknowledged, delete the TCB, * enter the closed state and return. */ case TCPS_LAST_ACK: if (ourfinisacked) { INP_INFO_WLOCK_ASSERT(&V_tcbinfo); tp = tcp_close(tp); goto drop; } break; } } step6: INP_WLOCK_ASSERT(tp->t_inpcb); /* * Update window information. * Don't look at window if no ACK: TAC's send garbage on first SYN. */ if ((thflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ if (tlen == 0 && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) TCPSTAT_INC(tcps_rcvwinupd); tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; needoutput = 1; } /* * Process segments with URG. */ if ((thflags & TH_URG) && th->th_urp && TCPS_HAVERCVDFIN(tp->t_state) == 0) { /* * This is a kludge, but if we receive and accept * random urgent pointers, we'll crash in * soreceive. It's hard to imagine someone * actually wanting to send this much urgent data. */ SOCKBUF_LOCK(&so->so_rcv); if (th->th_urp + so->so_rcv.sb_cc > sb_max) { th->th_urp = 0; /* XXX */ thflags &= ~TH_URG; /* XXX */ SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ goto dodata; /* XXX */ } /* * If this segment advances the known urgent pointer, * then mark the data stream. This should not happen * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since * a FIN has been received from the remote side. * In these states we ignore the URG. * * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. We continue, however, * to consider it to indicate the first octet * of data past the urgent section as the original * spec states (in one of two places). */ if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { tp->rcv_up = th->th_seq + th->th_urp; so->so_oobmark = so->so_rcv.sb_cc + (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) so->so_rcv.sb_state |= SBS_RCVATMARK; sohasoutofband(so); tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); } SOCKBUF_UNLOCK(&so->so_rcv); /* * Remove out of band data so doesn't get presented to user. * This can happen independent of advancing the URG pointer, * but if two URG's are pending at once, some out-of-band * data may creep in... ick. */ if (th->th_urp <= (u_long)tlen && !(so->so_options & SO_OOBINLINE)) { /* hdr drop is delayed */ tcp_pulloutofband(so, th, m, drop_hdrlen); } } else { /* * If no out of band data is expected, * pull receive urgent pointer along * with the receive window. */ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) tp->rcv_up = tp->rcv_nxt; } dodata: /* XXX */ INP_WLOCK_ASSERT(tp->t_inpcb); /* * Process the segment text, merging it into the TCP sequencing queue, * and arranging for acknowledgment of receipt if necessary. * This process logically involves adjusting tp->rcv_wnd as data * is presented to the user (this happens in tcp_usrreq.c, * case PRU_RCVD). If a FIN has already been received on this * connection then we just ignore the text. */ if ((tlen || (thflags & TH_FIN)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { tcp_seq save_start = th->th_seq; m_adj(m, drop_hdrlen); /* delayed header drop */ /* * Insert segment which includes th into TCP reassembly queue * with control block tp. Set thflags to whether reassembly now * includes a segment with FIN. This handles the common case * inline (segment is the next to be received on an established * connection, and the queue is empty), avoiding linkage into * and removal from the queue and repetition of various * conversions. * Set DELACK for segments received in order, but ack * immediately when segments are out of order (so * fast retransmit can work). */ if (th->th_seq == tp->rcv_nxt && tp->t_segq == NULL && TCPS_HAVEESTABLISHED(tp->t_state)) { if (DELAY_ACK(tp, tlen)) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt += tlen; thflags = th->th_flags & TH_FIN; TCPSTAT_INC(tcps_rcvpack); TCPSTAT_ADD(tcps_rcvbyte, tlen); ND6_HINT(tp); SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(m); else sbappendstream_locked(&so->so_rcv, m); /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); } else { /* * XXX: Due to the header drop above "th" is * theoretically invalid by now. Fortunately * m_adj() doesn't actually frees any mbufs * when trimming from the head. */ thflags = tcp_reass(tp, th, &tlen, m); tp->t_flags |= TF_ACKNOW; } if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT)) tcp_update_sack_list(tp, save_start, save_start + tlen); #if 0 /* * Note the amount of data that peer has sent into * our window, in order to estimate the sender's * buffer size. * XXX: Unused. */ if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); else len = so->so_rcv.sb_hiwat; #endif } else { m_freem(m); thflags &= ~TH_FIN; } /* * If FIN is received ACK the FIN and let the user know * that the connection is closing. */ if (thflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { socantrcvmore(so); /* * If connection is half-synchronized * (ie NEEDSYN flag on) then delay ACK, * so it may be piggybacked when SYN is sent. * Otherwise, since we received a FIN then no * more input can be expected, send ACK now. */ if (tp->t_flags & TF_NEEDSYN) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt++; } switch (tp->t_state) { /* * In SYN_RECEIVED and ESTABLISHED STATES * enter the CLOSE_WAIT state. */ case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; /* FALLTHROUGH */ case TCPS_ESTABLISHED: tcp_state_change(tp, TCPS_CLOSE_WAIT); break; /* * If still in FIN_WAIT_1 STATE FIN has not been acked so * enter the CLOSING state. */ case TCPS_FIN_WAIT_1: tcp_state_change(tp, TCPS_CLOSING); break; /* * In FIN_WAIT_2 state enter the TIME_WAIT state, * starting the time-wait timer, turning off the other * standard timers. */ case TCPS_FIN_WAIT_2: INP_INFO_WLOCK_ASSERT(&V_tcbinfo); KASSERT(ti_locked == TI_WLOCKED, ("%s: dodata " "TCP_FIN_WAIT_2 ti_locked: %d", __func__, ti_locked)); tcp_twstart(tp); INP_INFO_WUNLOCK(&V_tcbinfo); return; } } if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif /* * Return any desired output. */ if (needoutput || (tp->t_flags & TF_ACKNOW)) (void) tcp_output(tp); check_delack: KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", __func__, ti_locked)); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); } INP_WUNLOCK(tp->t_inpcb); return; dropafterack: /* * Generate an ACK dropping incoming segment if it occupies * sequence space, where the ACK reflects our state. * * We can now skip the test for the RST flag since all * paths to this code happen after packets containing * RST have been dropped. * * In the SYN-RECEIVED state, don't send an ACK unless the * segment we received passes the SYN-RECEIVED ACK test. * If it fails send a RST. This breaks the loop in the * "LAND" DoS attack, and also prevents an ACK storm * between two listening ports that have been sent forged * SYN segments, each with the source address of the other. */ if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max)) ) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); INP_WUNLOCK(tp->t_inpcb); m_freem(m); return; dropwithreset: if (ti_locked == TI_WLOCKED) INP_INFO_WUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; if (tp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); INP_WUNLOCK(tp->t_inpcb); } else tcp_dropwithreset(m, th, NULL, tlen, rstreason); return; drop: if (ti_locked == TI_WLOCKED) { INP_INFO_WUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; } #ifdef INVARIANTS else INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); #endif /* * Drop space held by incoming segment and return. */ #ifdef TCPDEBUG if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (tp != NULL) INP_WUNLOCK(tp->t_inpcb); m_freem(m); } /* * Issue RST and make ACK acceptable to originator of segment. * The mbuf must still include the original packet header. * tp may be NULL. */ static void tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int tlen, int rstreason) { #ifdef INET struct ip *ip; #endif #ifdef INET6 struct ip6_hdr *ip6; #endif if (tp != NULL) { INP_WLOCK_ASSERT(tp->t_inpcb); } /* Don't bother if destination was broadcast/multicast. */ if ((th->th_flags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) goto drop; #ifdef INET6 if (mtod(m, struct ip *)->ip_v == 6) { ip6 = mtod(m, struct ip6_hdr *); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) goto drop; /* IPv6 anycast check is done at tcp6_input() */ } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { ip = mtod(m, struct ip *); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) goto drop; } #endif /* Perform bandwidth limiting. */ if (badport_bandlim(rstreason) < 0) goto drop; /* tcp_respond consumes the mbuf chain. */ if (th->th_flags & TH_ACK) { tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack, TH_RST); } else { if (th->th_flags & TH_SYN) tlen++; tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, (tcp_seq)0, TH_RST|TH_ACK); } return; drop: m_freem(m); } /* * Parse TCP options and place in tcpopt. */ static void tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags) { int opt, optlen; to->to_flags = 0; for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { if (cnt < 2) break; optlen = cp[1]; if (optlen < 2 || optlen > cnt) break; } switch (opt) { case TCPOPT_MAXSEG: if (optlen != TCPOLEN_MAXSEG) continue; if (!(flags & TO_SYN)) continue; to->to_flags |= TOF_MSS; bcopy((char *)cp + 2, (char *)&to->to_mss, sizeof(to->to_mss)); to->to_mss = ntohs(to->to_mss); break; case TCPOPT_WINDOW: if (optlen != TCPOLEN_WINDOW) continue; if (!(flags & TO_SYN)) continue; to->to_flags |= TOF_SCALE; to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT); break; case TCPOPT_TIMESTAMP: if (optlen != TCPOLEN_TIMESTAMP) continue; to->to_flags |= TOF_TS; bcopy((char *)cp + 2, (char *)&to->to_tsval, sizeof(to->to_tsval)); to->to_tsval = ntohl(to->to_tsval); bcopy((char *)cp + 6, (char *)&to->to_tsecr, sizeof(to->to_tsecr)); to->to_tsecr = ntohl(to->to_tsecr); break; #ifdef TCP_SIGNATURE /* * XXX In order to reply to a host which has set the * TCP_SIGNATURE option in its initial SYN, we have to * record the fact that the option was observed here * for the syncache code to perform the correct response. */ case TCPOPT_SIGNATURE: if (optlen != TCPOLEN_SIGNATURE) continue; to->to_flags |= TOF_SIGNATURE; to->to_signature = cp + 2; break; #endif case TCPOPT_SACK_PERMITTED: if (optlen != TCPOLEN_SACK_PERMITTED) continue; if (!(flags & TO_SYN)) continue; if (!V_tcp_do_sack) continue; to->to_flags |= TOF_SACKPERM; break; case TCPOPT_SACK: if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) continue; if (flags & TO_SYN) continue; to->to_flags |= TOF_SACK; to->to_nsacks = (optlen - 2) / TCPOLEN_SACK; to->to_sacks = cp + 2; TCPSTAT_INC(tcps_sack_rcv_blocks); break; default: continue; } } } /* * Pull out of band byte out of a segment so * it doesn't appear in the user's data queue. * It is still reflected in the segment length for * sequencing purposes. */ static void tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m, int off) { int cnt = off + th->th_urp - 1; while (cnt >= 0) { if (m->m_len > cnt) { char *cp = mtod(m, caddr_t) + cnt; struct tcpcb *tp = sototcpcb(so); INP_WLOCK_ASSERT(tp->t_inpcb); tp->t_iobc = *cp; tp->t_oobflags |= TCPOOB_HAVEDATA; bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); m->m_len--; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len--; return; } cnt -= m->m_len; m = m->m_next; if (m == NULL) break; } panic("tcp_pulloutofband"); } /* * Collect new round-trip time estimate * and update averages and current timeout. */ static void tcp_xmit_timer(struct tcpcb *tp, int rtt) { int delta; INP_WLOCK_ASSERT(tp->t_inpcb); TCPSTAT_INC(tcps_rttupdated); tp->t_rttupdated++; if (tp->t_srtt != 0) { /* * srtt is stored as fixed point with 5 bits after the * binary point (i.e., scaled by 8). The following magic * is equivalent to the smoothing algorithm in rfc793 with * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed * point). Adjust rtt to origin 0. */ delta = ((rtt - 1) << TCP_DELTA_SHIFT) - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); if ((tp->t_srtt += delta) <= 0) tp->t_srtt = 1; /* * We accumulate a smoothed rtt variance (actually, a * smoothed mean difference), then set the retransmit * timer to smoothed rtt + 4 times the smoothed variance. * rttvar is stored as fixed point with 4 bits after the * binary point (scaled by 16). The following is * equivalent to rfc793 smoothing with an alpha of .75 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces * rfc793's wired-in beta. */ if (delta < 0) delta = -delta; delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); if ((tp->t_rttvar += delta) <= 0) tp->t_rttvar = 1; if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } else { /* * No rtt measurement yet - use the unsmoothed rtt. * Set the variance to half the rtt (so our first * retransmit happens at 3*rtt). */ tp->t_srtt = rtt << TCP_RTT_SHIFT; tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } tp->t_rtttime = 0; tp->t_rxtshift = 0; /* * the retransmit should happen at rtt + 4 * rttvar. * Because of the way we do the smoothing, srtt and rttvar * will each average +1/2 tick of bias. When we compute * the retransmit timer, we want 1/2 tick of rounding and * 1 extra tick because of +-1/2 tick uncertainty in the * firing of the timer. The bias will give us exactly the * 1.5 tick we need. But, because the bias is * statistical, we have to test that we don't drop below * the minimum feasible timer (which is 2 ticks). */ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); /* * We received an ack for a packet that wasn't retransmitted; * it is probably safe to discard any error indications we've * received recently. This isn't quite right, but close enough * for now (a route might have failed after we sent a segment, * and the return path might not be symmetrical). */ tp->t_softerror = 0; } /* * Determine a reasonable value for maxseg size. * If the route is known, check route for mtu. * If none, use an mss that can be handled on the outgoing interface * without forcing IP to fragment. If no route is found, route has no mtu, * or the destination isn't local, use a default, hopefully conservative * size (usually 512 or the default IP max size, but no more than the mtu * of the interface), as we can't discover anything about intervening * gateways or networks. We also initialize the congestion/slow start * window to be a single segment if the destination isn't local. * While looking at the routing entry, we also initialize other path-dependent * parameters from pre-set or cached values in the routing entry. * * Also take into account the space needed for options that we * send regularly. Make maxseg shorter by that amount to assure * that we can send maxseg amount of data even when the options * are present. Store the upper limit of the length of options plus * data in maxopd. * * NOTE that this routine is only called when we process an incoming * segment, or an ICMP need fragmentation datagram. Outgoing SYN/ACK MSS * settings are handled in tcp_mssopt(). */ void tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer, struct hc_metrics_lite *metricptr, struct tcp_ifcap *cap) { int mss = 0; u_long maxmtu = 0; struct inpcb *inp = tp->t_inpcb; struct hc_metrics_lite metrics; int origoffer; #ifdef INET6 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; size_t min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : sizeof (struct tcpiphdr); #else const size_t min_protoh = sizeof(struct tcpiphdr); #endif INP_WLOCK_ASSERT(tp->t_inpcb); if (mtuoffer != -1) { KASSERT(offer == -1, ("%s: conflict", __func__)); offer = mtuoffer - min_protoh; } origoffer = offer; /* Initialize. */ #ifdef INET6 if (isipv6) { maxmtu = tcp_maxmtu6(&inp->inp_inc, cap); tp->t_maxopd = tp->t_maxseg = V_tcp_v6mssdflt; } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { maxmtu = tcp_maxmtu(&inp->inp_inc, cap); tp->t_maxopd = tp->t_maxseg = V_tcp_mssdflt; } #endif /* * No route to sender, stay with default mss and return. */ if (maxmtu == 0) { /* * In case we return early we need to initialize metrics * to a defined state as tcp_hc_get() would do for us * if there was no cache hit. */ if (metricptr != NULL) bzero(metricptr, sizeof(struct hc_metrics_lite)); return; } /* What have we got? */ switch (offer) { case 0: /* * Offer == 0 means that there was no MSS on the SYN * segment, in this case we use tcp_mssdflt as * already assigned to t_maxopd above. */ offer = tp->t_maxopd; break; case -1: /* * Offer == -1 means that we didn't receive SYN yet. */ /* FALLTHROUGH */ default: /* * Prevent DoS attack with too small MSS. Round up * to at least minmss. */ offer = max(offer, V_tcp_minmss); } /* * rmx information is now retrieved from tcp_hostcache. */ tcp_hc_get(&inp->inp_inc, &metrics); if (metricptr != NULL) bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite)); /* * If there's a discovered mtu in tcp hostcache, use it. * Else, use the link mtu. */ if (metrics.rmx_mtu) mss = min(metrics.rmx_mtu, maxmtu) - min_protoh; else { #ifdef INET6 if (isipv6) { mss = maxmtu - min_protoh; if (!V_path_mtu_discovery && !in6_localaddr(&inp->in6p_faddr)) mss = min(mss, V_tcp_v6mssdflt); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { mss = maxmtu - min_protoh; if (!V_path_mtu_discovery && !in_localaddr(inp->inp_faddr)) mss = min(mss, V_tcp_mssdflt); } #endif /* * XXX - The above conditional (mss = maxmtu - min_protoh) * probably violates the TCP spec. * The problem is that, since we don't know the * other end's MSS, we are supposed to use a conservative * default. But, if we do that, then MTU discovery will * never actually take place, because the conservative * default is much less than the MTUs typically seen * on the Internet today. For the moment, we'll sweep * this under the carpet. * * The conservative default might not actually be a problem * if the only case this occurs is when sending an initial * SYN with options and data to a host we've never talked * to before. Then, they will reply with an MSS value which * will get recorded and the new parameters should get * recomputed. For Further Study. */ } mss = min(mss, offer); /* * Sanity check: make sure that maxopd will be large * enough to allow some data on segments even if the * all the option space is used (40bytes). Otherwise * funny things may happen in tcp_output. */ mss = max(mss, 64); /* * maxopd stores the maximum length of data AND options * in a segment; maxseg is the amount of data in a normal * segment. We need to store this value (maxopd) apart * from maxseg, because now every segment carries options * and thus we normally have somewhat less data in segments. */ tp->t_maxopd = mss; /* * origoffer==-1 indicates that no segments were received yet. * In this case we just guess. */ if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && (origoffer == -1 || (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) mss -= TCPOLEN_TSTAMP_APPA; tp->t_maxseg = mss; } void tcp_mss(struct tcpcb *tp, int offer) { int mss; u_long bufsize; struct inpcb *inp; struct socket *so; struct hc_metrics_lite metrics; struct tcp_ifcap cap; KASSERT(tp != NULL, ("%s: tp == NULL", __func__)); bzero(&cap, sizeof(cap)); tcp_mss_update(tp, offer, -1, &metrics, &cap); mss = tp->t_maxseg; inp = tp->t_inpcb; /* * If there's a pipesize, change the socket buffer to that size, * don't change if sb_hiwat is different than default (then it * has been changed on purpose with setsockopt). * Make the socket buffers an integral number of mss units; * if the mss is larger than the socket buffer, decrease the mss. */ so = inp->inp_socket; SOCKBUF_LOCK(&so->so_snd); if ((so->so_snd.sb_hiwat == V_tcp_sendspace) && metrics.rmx_sendpipe) bufsize = metrics.rmx_sendpipe; else bufsize = so->so_snd.sb_hiwat; if (bufsize < mss) mss = bufsize; else { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_snd.sb_hiwat) (void)sbreserve_locked(&so->so_snd, bufsize, so, NULL); } SOCKBUF_UNLOCK(&so->so_snd); tp->t_maxseg = mss; SOCKBUF_LOCK(&so->so_rcv); if ((so->so_rcv.sb_hiwat == V_tcp_recvspace) && metrics.rmx_recvpipe) bufsize = metrics.rmx_recvpipe; else bufsize = so->so_rcv.sb_hiwat; if (bufsize > mss) { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_rcv.sb_hiwat) (void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL); } SOCKBUF_UNLOCK(&so->so_rcv); /* Check the interface for TSO capabilities. */ if (cap.ifcap & CSUM_TSO) { tp->t_flags |= TF_TSO; tp->t_tsomax = cap.tsomax; tp->t_tsomaxsegcount = cap.tsomaxsegcount; tp->t_tsomaxsegsize = cap.tsomaxsegsize; } } /* * Determine the MSS option to send on an outgoing SYN. */ int tcp_mssopt(struct in_conninfo *inc) { int mss = 0; u_long maxmtu = 0; u_long thcmtu = 0; size_t min_protoh; KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer")); #ifdef INET6 if (inc->inc_flags & INC_ISIPV6) { mss = V_tcp_v6mssdflt; maxmtu = tcp_maxmtu6(inc, NULL); min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { mss = V_tcp_mssdflt; maxmtu = tcp_maxmtu(inc, NULL); min_protoh = sizeof(struct tcpiphdr); } #endif #if defined(INET6) || defined(INET) thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ #endif if (maxmtu && thcmtu) mss = min(maxmtu, thcmtu) - min_protoh; else if (maxmtu || thcmtu) mss = max(maxmtu, thcmtu) - min_protoh; return (mss); } /* * On a partial ack arrives, force the retransmission of the * next unacknowledged segment. Do not clear tp->t_dupacks. * By setting snd_nxt to ti_ack, this forces retransmission timer to * be started again. */ static void tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) { tcp_seq onxt = tp->snd_nxt; u_long ocwnd = tp->snd_cwnd; INP_WLOCK_ASSERT(tp->t_inpcb); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; tp->snd_nxt = th->th_ack; /* * Set snd_cwnd to one segment beyond acknowledged offset. * (tp->snd_una has not yet been updated when this function is called.) */ tp->snd_cwnd = tp->t_maxseg + BYTES_THIS_ACK(tp, th); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); tp->snd_cwnd = ocwnd; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; /* * Partial window deflation. Relies on fact that tp->snd_una * not updated yet. */ if (tp->snd_cwnd > BYTES_THIS_ACK(tp, th)) tp->snd_cwnd -= BYTES_THIS_ACK(tp, th); else tp->snd_cwnd = 0; tp->snd_cwnd += tp->t_maxseg; } Index: user/dchagin/lemul/sys/netinet/tcp_output.c =================================================================== --- user/dchagin/lemul/sys/netinet/tcp_output.c (revision 274240) +++ user/dchagin/lemul/sys/netinet/tcp_output.c (revision 274241) @@ -1,1679 +1,1679 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #endif #define TCPOUTFLAGS #include #include #include #include #include #ifdef TCPDEBUG #include #endif #ifdef TCP_OFFLOAD #include #endif #ifdef IPSEC #include #endif /*IPSEC*/ #include #include VNET_DEFINE(int, path_mtu_discovery) = 1; -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(path_mtu_discovery), 1, "Enable Path MTU Discovery"); VNET_DEFINE(int, tcp_do_tso) = 1; #define V_tcp_do_tso VNET(tcp_do_tso) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_tso), 0, "Enable TCP Segmentation Offload"); VNET_DEFINE(int, tcp_sendspace) = 1024*32; #define V_tcp_sendspace VNET(tcp_sendspace) -SYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sendspace), 0, "Initial send socket buffer size"); VNET_DEFINE(int, tcp_do_autosndbuf) = 1; #define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_autosndbuf), 0, "Enable automatic send buffer sizing"); VNET_DEFINE(int, tcp_autosndbuf_inc) = 8*1024; #define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autosndbuf_inc), 0, "Incrementor step size of automatic send buffer"); VNET_DEFINE(int, tcp_autosndbuf_max) = 2*1024*1024; #define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autosndbuf_max), 0, "Max size of automatic send buffer"); static void inline hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, long len, int tso); static void inline cc_after_idle(struct tcpcb *tp); /* * Wrapper for the TCP established output helper hook. */ static void inline hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, long len, int tso) { struct tcp_hhook_data hhook_data; if (V_tcp_hhh[HHOOK_TCP_EST_OUT]->hhh_nhooks > 0) { hhook_data.tp = tp; hhook_data.th = th; hhook_data.to = to; hhook_data.len = len; hhook_data.tso = tso; hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_OUT], &hhook_data, tp->osd); } } /* * CC wrapper hook functions */ static void inline cc_after_idle(struct tcpcb *tp) { INP_WLOCK_ASSERT(tp->t_inpcb); if (CC_ALGO(tp)->after_idle != NULL) CC_ALGO(tp)->after_idle(tp->ccv); } /* * Tcp output routine: figure out what should be sent and send it. */ int tcp_output(struct tcpcb *tp) { struct socket *so = tp->t_inpcb->inp_socket; long len, recwin, sendwin; int off, flags, error = 0; /* Keep compiler happy */ struct mbuf *m; struct ip *ip = NULL; struct ipovly *ipov = NULL; struct tcphdr *th; u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen; #ifdef IPSEC unsigned ipsec_optlen = 0; #endif int idle, sendalot; int sack_rxmit, sack_bytes_rxmt; struct sackhole *p; int tso, mtu; struct tcpopt to; #if 0 int maxburst = TCP_MAXBURST; #endif #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6; isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; #endif INP_WLOCK_ASSERT(tp->t_inpcb); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) return (tcp_offload_output(tp)); #endif /* * Determine length of data that should be transmitted, * and flags that will be used. * If there is some data or critical controls (SYN, RST) * to send, then transmit; otherwise, investigate further. */ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); if (idle && ticks - tp->t_rcvtime >= tp->t_rxtcur) cc_after_idle(tp); tp->t_flags &= ~TF_LASTIDLE; if (idle) { if (tp->t_flags & TF_MORETOCOME) { tp->t_flags |= TF_LASTIDLE; idle = 0; } } again: /* * If we've recently taken a timeout, snd_max will be greater than * snd_nxt. There may be SACK information that allows us to avoid * resending already delivered data. Adjust snd_nxt accordingly. */ if ((tp->t_flags & TF_SACK_PERMIT) && SEQ_LT(tp->snd_nxt, tp->snd_max)) tcp_sack_adjust(tp); sendalot = 0; tso = 0; mtu = 0; off = tp->snd_nxt - tp->snd_una; sendwin = min(tp->snd_wnd, tp->snd_cwnd); flags = tcp_outflags[tp->t_state]; /* * Send any SACK-generated retransmissions. If we're explicitly trying * to send out new data (when sendalot is 1), bypass this function. * If we retransmit in fast recovery mode, decrement snd_cwnd, since * we're replacing a (future) new transmission with a retransmission * now, and we previously incremented snd_cwnd in tcp_input(). */ /* * Still in sack recovery , reset rxmit flag to zero. */ sack_rxmit = 0; sack_bytes_rxmt = 0; len = 0; p = NULL; if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags) && (p = tcp_sack_output(tp, &sack_bytes_rxmt))) { long cwin; cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt; if (cwin < 0) cwin = 0; /* Do not retransmit SACK segments beyond snd_recover */ if (SEQ_GT(p->end, tp->snd_recover)) { /* * (At least) part of sack hole extends beyond * snd_recover. Check to see if we can rexmit data * for this hole. */ if (SEQ_GEQ(p->rxmit, tp->snd_recover)) { /* * Can't rexmit any more data for this hole. * That data will be rexmitted in the next * sack recovery episode, when snd_recover * moves past p->rxmit. */ p = NULL; goto after_sack_rexmit; } else /* Can rexmit part of the current hole */ len = ((long)ulmin(cwin, tp->snd_recover - p->rxmit)); } else len = ((long)ulmin(cwin, p->end - p->rxmit)); off = p->rxmit - tp->snd_una; KASSERT(off >= 0,("%s: sack block to the left of una : %d", __func__, off)); if (len > 0) { sack_rxmit = 1; sendalot = 1; TCPSTAT_INC(tcps_sack_rexmits); TCPSTAT_ADD(tcps_sack_rexmit_bytes, min(len, tp->t_maxseg)); } } after_sack_rexmit: /* * Get standard flags, and add SYN or FIN if requested by 'hidden' * state flags. */ if (tp->t_flags & TF_NEEDFIN) flags |= TH_FIN; if (tp->t_flags & TF_NEEDSYN) flags |= TH_SYN; SOCKBUF_LOCK(&so->so_snd); /* * If in persist timeout with window of 0, send 1 byte. * Otherwise, if window is small but nonzero * and timer expired, we will send what we can * and go to transmit state. */ if (tp->t_flags & TF_FORCEDATA) { if (sendwin == 0) { /* * If we still have some data to send, then * clear the FIN bit. Usually this would * happen below when it realizes that we * aren't sending all the data. However, * if we have exactly 1 byte of unsent data, * then it won't clear the FIN bit below, * and if we are in persist state, we wind * up sending the packet without recording * that we sent the FIN bit. * * We can't just blindly clear the FIN bit, * because if we don't have any more data * to send then the probe will be the FIN * itself. */ if (off < so->so_snd.sb_cc) flags &= ~TH_FIN; sendwin = 1; } else { tcp_timer_activate(tp, TT_PERSIST, 0); tp->t_rxtshift = 0; } } /* * If snd_nxt == snd_max and we have transmitted a FIN, the * offset will be > 0 even if so_snd.sb_cc is 0, resulting in * a negative length. This can also occur when TCP opens up * its congestion window while receiving additional duplicate * acks after fast-retransmit because TCP will reset snd_nxt * to snd_max after the fast-retransmit. * * In the normal retransmit-FIN-only case, however, snd_nxt will * be set to snd_una, the offset will be 0, and the length may * wind up 0. * * If sack_rxmit is true we are retransmitting from the scoreboard * in which case len is already set. */ if (sack_rxmit == 0) { if (sack_bytes_rxmt == 0) len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off); else { long cwin; /* * We are inside of a SACK recovery episode and are * sending new data, having retransmitted all the * data possible in the scoreboard. */ len = ((long)ulmin(so->so_snd.sb_cc, tp->snd_wnd) - off); /* * Don't remove this (len > 0) check ! * We explicitly check for len > 0 here (although it * isn't really necessary), to work around a gcc * optimization issue - to force gcc to compute * len above. Without this check, the computation * of len is bungled by the optimizer. */ if (len > 0) { cwin = tp->snd_cwnd - (tp->snd_nxt - tp->sack_newdata) - sack_bytes_rxmt; if (cwin < 0) cwin = 0; len = lmin(len, cwin); } } } /* * Lop off SYN bit if it has already been sent. However, if this * is SYN-SENT state and if segment contains data and if we don't * know that foreign host supports TAO, suppress sending segment. */ if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { if (tp->t_state != TCPS_SYN_RECEIVED) flags &= ~TH_SYN; off--, len++; } /* * Be careful not to send data and/or FIN on SYN segments. * This measure is needed to prevent interoperability problems * with not fully conformant TCP implementations. */ if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { len = 0; flags &= ~TH_FIN; } if (len < 0) { /* * If FIN has been sent but not acked, * but we haven't been called to retransmit, * len will be < 0. Otherwise, window shrank * after we sent into it. If window shrank to 0, * cancel pending retransmit, pull snd_nxt back * to (closed) window, and set the persist timer * if it isn't already going. If the window didn't * close completely, just wait for an ACK. */ len = 0; if (sendwin == 0) { tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rxtshift = 0; tp->snd_nxt = tp->snd_una; if (!tcp_timer_active(tp, TT_PERSIST)) tcp_setpersist(tp); } } /* len will be >= 0 after this point. */ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); /* * Automatic sizing of send socket buffer. Often the send buffer * size is not optimally adjusted to the actual network conditions * at hand (delay bandwidth product). Setting the buffer size too * small limits throughput on links with high bandwidth and high * delay (eg. trans-continental/oceanic links). Setting the * buffer size too big consumes too much real kernel memory, * especially with many connections on busy servers. * * The criteria to step up the send buffer one notch are: * 1. receive window of remote host is larger than send buffer * (with a fudge factor of 5/4th); * 2. send buffer is filled to 7/8th with data (so we actually * have data to make use of it); * 3. send buffer fill has not hit maximal automatic size; * 4. our send window (slow start and cogestion controlled) is * larger than sent but unacknowledged data in send buffer. * * The remote host receive window scaling factor may limit the * growing of the send buffer before it reaches its allowed * maximum. * * It scales directly with slow start or congestion window * and does at most one step per received ACK. This fast * scaling has the drawback of growing the send buffer beyond * what is strictly necessary to make full use of a given * delay*bandwith product. However testing has shown this not * to be much of an problem. At worst we are trading wasting * of available bandwith (the non-use of it) for wasting some * socket buffer memory. * * TODO: Shrink send buffer during idle periods together * with congestion window. Requires another timer. Has to * wait for upcoming tcp timer rewrite. */ if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) { if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat && so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) && so->so_snd.sb_cc < V_tcp_autosndbuf_max && sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) { if (!sbreserve_locked(&so->so_snd, min(so->so_snd.sb_hiwat + V_tcp_autosndbuf_inc, V_tcp_autosndbuf_max), so, curthread)) so->so_snd.sb_flags &= ~SB_AUTOSIZE; } } /* * Decide if we can use TCP Segmentation Offloading (if supported by * hardware). * * TSO may only be used if we are in a pure bulk sending state. The * presence of TCP-MD5, SACK retransmits, SACK advertizements and * IP options prevent using TSO. With TSO the TCP header is the same * (except for the sequence number) for all generated packets. This * makes it impossible to transmit any options which vary per generated * segment or packet. */ #ifdef IPSEC /* * Pre-calculate here as we save another lookup into the darknesses * of IPsec that way and can actually decide if TSO is ok. */ ipsec_optlen = ipsec_hdrsiz_tcp(tp); #endif if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg && ((tp->t_flags & TF_SIGNATURE) == 0) && tp->rcv_numsacks == 0 && sack_rxmit == 0 && #ifdef IPSEC ipsec_optlen == 0 && #endif tp->t_inpcb->inp_options == NULL && tp->t_inpcb->in6p_options == NULL) tso = 1; if (sack_rxmit) { if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc)) flags &= ~TH_FIN; } else { if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) flags &= ~TH_FIN; } recwin = sbspace(&so->so_rcv); /* * Sender silly window avoidance. We transmit under the following * conditions when len is non-zero: * * - We have a full segment (or more with TSO) * - This is the last buffer in a write()/send() and we are * either idle or running NODELAY * - we've timed out (e.g. persist timer) * - we have more then 1/2 the maximum send window's worth of * data (receiver may be limited the window size) * - we need to retransmit */ if (len) { if (len >= tp->t_maxseg) goto send; /* * NOTE! on localhost connections an 'ack' from the remote * end may occur synchronously with the output and cause * us to flush a buffer queued with moretocome. XXX * * note: the len + off check is almost certainly unnecessary. */ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ (idle || (tp->t_flags & TF_NODELAY)) && len + off >= so->so_snd.sb_cc && (tp->t_flags & TF_NOPUSH) == 0) { goto send; } if (tp->t_flags & TF_FORCEDATA) /* typ. timeout case */ goto send; if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) goto send; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */ goto send; if (sack_rxmit) goto send; } /* * Sending of standalone window updates. * * Window updates are important when we close our window due to a * full socket buffer and are opening it again after the application * reads data from it. Once the window has opened again and the * remote end starts to send again the ACK clock takes over and * provides the most current window information. * * We must avoid the silly window syndrome whereas every read * from the receive buffer, no matter how small, causes a window * update to be sent. We also should avoid sending a flurry of * window updates when the socket buffer had queued a lot of data * and the application is doing small reads. * * Prevent a flurry of pointless window updates by only sending * an update when we can increase the advertized window by more * than 1/4th of the socket buffer capacity. When the buffer is * getting full or is very small be more aggressive and send an * update whenever we can increase by two mss sized segments. * In all other situations the ACK's to new incoming data will * carry further window increases. * * Don't send an independent window update if a delayed * ACK is pending (it will get piggy-backed on it) or the * remote side already has done a half-close and won't send * more data. Skip this if the connection is in T/TCP * half-open state. */ if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && !(tp->t_flags & TF_DELACK) && !TCPS_HAVERCVDFIN(tp->t_state)) { /* * "adv" is the amount we could increase the window, * taking into account that we are limited by * TCP_MAXWIN << tp->rcv_scale. */ long adv; int oldwin; adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale); if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { oldwin = (tp->rcv_adv - tp->rcv_nxt); adv -= oldwin; } else oldwin = 0; /* * If the new window size ends up being the same as the old * size when it is scaled, then don't force a window update. */ if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale) goto dontupdate; if (adv >= (long)(2 * tp->t_maxseg) && (adv >= (long)(so->so_rcv.sb_hiwat / 4) || recwin <= (long)(so->so_rcv.sb_hiwat / 8) || so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg)) goto send; } dontupdate: /* * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW * is also a catch-all for the retransmit timer timeout case. */ if (tp->t_flags & TF_ACKNOW) goto send; if ((flags & TH_RST) || ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) goto send; if (SEQ_GT(tp->snd_up, tp->snd_una)) goto send; /* * If our state indicates that FIN should be sent * and we have not yet done so, then we need to send. */ if (flags & TH_FIN && ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) goto send; /* * In SACK, it is possible for tcp_output to fail to send a segment * after the retransmission timer has been turned off. Make sure * that the retransmission timer is set. */ if ((tp->t_flags & TF_SACK_PERMIT) && SEQ_GT(tp->snd_max, tp->snd_una) && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); goto just_return; } /* * TCP window updates are not reliable, rather a polling protocol * using ``persist'' packets is used to insure receipt of window * updates. The three ``states'' for the output side are: * idle not doing retransmits or persists * persisting to move a small or zero window * (re)transmitting and thereby not persisting * * tcp_timer_active(tp, TT_PERSIST) * is true when we are in persist state. * (tp->t_flags & TF_FORCEDATA) * is set when we are called to send a persist packet. * tcp_timer_active(tp, TT_REXMT) * is set when we are retransmitting * The output side is idle when both timers are zero. * * If send window is too small, there is data to transmit, and no * retransmit or persist is pending, then go to persist state. * If nothing happens soon, send when timer expires: * if window is nonzero, transmit what we can, * otherwise force out a byte. */ if (so->so_snd.sb_cc && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { tp->t_rxtshift = 0; tcp_setpersist(tp); } /* * No reason to send a segment, just return. */ just_return: SOCKBUF_UNLOCK(&so->so_snd); return (0); send: SOCKBUF_LOCK_ASSERT(&so->so_snd); if (len > 0) { if (len >= tp->t_maxseg) tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; else tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; } /* * Before ESTABLISHED, force sending of initial options * unless TCP set not to do any options. * NOTE: we assume that the IP/TCP header plus TCP options * always fit in a single mbuf, leaving room for a maximum * link header, i.e. * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES */ optlen = 0; #ifdef INET6 if (isipv6) hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); else #endif hdrlen = sizeof (struct tcpiphdr); /* * Compute options for segment. * We only have to care about SYN and established connection * segments. Options for SYN-ACK segments are handled in TCP * syncache. */ if ((tp->t_flags & TF_NOOPT) == 0) { to.to_flags = 0; /* Maximum segment size. */ if (flags & TH_SYN) { tp->snd_nxt = tp->iss; to.to_mss = tcp_mssopt(&tp->t_inpcb->inp_inc); to.to_flags |= TOF_MSS; } /* Window scaling. */ if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { to.to_wscale = tp->request_r_scale; to.to_flags |= TOF_SCALE; } /* Timestamps. */ if ((tp->t_flags & TF_RCVD_TSTMP) || ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { to.to_tsval = tcp_ts_getticks() + tp->ts_offset; to.to_tsecr = tp->ts_recent; to.to_flags |= TOF_TS; /* Set receive buffer autosizing timestamp. */ if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE)) tp->rfbuf_ts = tcp_ts_getticks(); } /* Selective ACK's. */ if (tp->t_flags & TF_SACK_PERMIT) { if (flags & TH_SYN) to.to_flags |= TOF_SACKPERM; else if (TCPS_HAVEESTABLISHED(tp->t_state) && (tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) { to.to_flags |= TOF_SACK; to.to_nsacks = tp->rcv_numsacks; to.to_sacks = (u_char *)tp->sackblks; } } #ifdef TCP_SIGNATURE /* TCP-MD5 (RFC2385). */ if (tp->t_flags & TF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; #endif /* TCP_SIGNATURE */ /* Processing the options. */ hdrlen += optlen = tcp_addoptions(&to, opt); } #ifdef INET6 if (isipv6) ipoptlen = ip6_optlen(tp->t_inpcb); else #endif if (tp->t_inpcb->inp_options) ipoptlen = tp->t_inpcb->inp_options->m_len - offsetof(struct ipoption, ipopt_list); else ipoptlen = 0; #ifdef IPSEC ipoptlen += ipsec_optlen; #endif /* * Adjust data length if insertion of options will * bump the packet length beyond the t_maxopd length. * Clear the FIN bit because we cut off the tail of * the segment. */ if (len + optlen + ipoptlen > tp->t_maxopd) { flags &= ~TH_FIN; if (tso) { u_int if_hw_tsomax; u_int if_hw_tsomaxsegcount; u_int if_hw_tsomaxsegsize; struct mbuf *mb; u_int moff; int max_len; /* extract TSO information */ if_hw_tsomax = tp->t_tsomax; if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; /* * Limit a TSO burst to prevent it from * overflowing or exceeding the maximum length * allowed by the network interface: */ KASSERT(ipoptlen == 0, ("%s: TSO can't do IP options", __func__)); /* * Check if we should limit by maximum payload * length: */ if (if_hw_tsomax != 0) { /* compute maximum TSO length */ max_len = (if_hw_tsomax - hdrlen); if (max_len <= 0) { len = 0; } else if (len > (u_int)max_len) { sendalot = 1; len = (u_int)max_len; } } /* * Check if we should limit by maximum segment * size and count: */ if (if_hw_tsomaxsegcount != 0 && if_hw_tsomaxsegsize != 0) { max_len = 0; mb = sbsndmbuf(&so->so_snd, off, &moff); while (mb != NULL && (u_int)max_len < len) { u_int mlen; u_int frags; /* * Get length of mbuf fragment * and how many hardware frags, * rounded up, it would use: */ mlen = (mb->m_len - moff); frags = howmany(mlen, if_hw_tsomaxsegsize); /* Handle special case: Zero Length Mbuf */ if (frags == 0) frags = 1; /* * Check if the fragment limit * will be reached or exceeded: */ if (frags >= if_hw_tsomaxsegcount) { max_len += min(mlen, if_hw_tsomaxsegcount * if_hw_tsomaxsegsize); break; } max_len += mlen; if_hw_tsomaxsegcount -= frags; moff = 0; mb = mb->m_next; } if (max_len <= 0) { len = 0; } else if (len > (u_int)max_len) { sendalot = 1; len = (u_int)max_len; } } /* * Prevent the last segment from being * fractional unless the send sockbuf can be * emptied: */ max_len = (tp->t_maxopd - optlen); if ((off + len) < so->so_snd.sb_cc) { moff = len % (u_int)max_len; if (moff != 0) { len -= moff; sendalot = 1; } } /* * In case there are too many small fragments * don't use TSO: */ if (len <= (u_int)max_len) { len = (u_int)max_len; sendalot = 1; tso = 0; } /* * Send the FIN in a separate segment * after the bulk sending is done. * We don't trust the TSO implementations * to clear the FIN flag on all but the * last segment. */ if (tp->t_flags & TF_NEEDFIN) sendalot = 1; } else { len = tp->t_maxopd - optlen - ipoptlen; sendalot = 1; } } else tso = 0; KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, ("%s: len > IP_MAXPACKET", __func__)); /*#ifdef DIAGNOSTIC*/ #ifdef INET6 if (max_linkhdr + hdrlen > MCLBYTES) #else if (max_linkhdr + hdrlen > MHLEN) #endif panic("tcphdr too big"); /*#endif*/ /* * This KASSERT is here to catch edge cases at a well defined place. * Before, those had triggered (random) panic conditions further down. */ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); /* * Grab a header mbuf, attaching a copy of data to * be transmitted, and initialize the header from * the template for sends on this connection. */ if (len) { struct mbuf *mb; u_int moff; if ((tp->t_flags & TF_FORCEDATA) && len == 1) TCPSTAT_INC(tcps_sndprobe); else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { tp->t_sndrexmitpack++; TCPSTAT_INC(tcps_sndrexmitpack); TCPSTAT_ADD(tcps_sndrexmitbyte, len); } else { TCPSTAT_INC(tcps_sndpack); TCPSTAT_ADD(tcps_sndbyte, len); } #ifdef INET6 if (MHLEN < hdrlen + max_linkhdr) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else #endif m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOBUFS; sack_rxmit = 0; goto out; } m->m_data += max_linkhdr; m->m_len = hdrlen; /* * Start the m_copy functions from the closest mbuf * to the offset in the socket buffer chain. */ mb = sbsndptr(&so->so_snd, off, len, &moff); if (len <= MHLEN - hdrlen - max_linkhdr) { m_copydata(mb, moff, (int)len, mtod(m, caddr_t) + hdrlen); m->m_len += len; } else { m->m_next = m_copy(mb, moff, (int)len); if (m->m_next == NULL) { SOCKBUF_UNLOCK(&so->so_snd); (void) m_free(m); error = ENOBUFS; sack_rxmit = 0; goto out; } } /* * If we're sending everything we've got, set PUSH. * (This will keep happy those implementations which only * give data to the user when a buffer fills or * a PUSH comes in.) */ if (off + len == so->so_snd.sb_cc) flags |= TH_PUSH; SOCKBUF_UNLOCK(&so->so_snd); } else { SOCKBUF_UNLOCK(&so->so_snd); if (tp->t_flags & TF_ACKNOW) TCPSTAT_INC(tcps_sndacks); else if (flags & (TH_SYN|TH_FIN|TH_RST)) TCPSTAT_INC(tcps_sndctrl); else if (SEQ_GT(tp->snd_up, tp->snd_una)) TCPSTAT_INC(tcps_sndurg); else TCPSTAT_INC(tcps_sndwinup); m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; sack_rxmit = 0; goto out; } #ifdef INET6 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && MHLEN >= hdrlen) { MH_ALIGN(m, hdrlen); } else #endif m->m_data += max_linkhdr; m->m_len = hdrlen; } SOCKBUF_UNLOCK_ASSERT(&so->so_snd); m->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef MAC mac_inpcb_create_mbuf(tp->t_inpcb, m); #endif #ifdef INET6 if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); tcpip_fillheaders(tp->t_inpcb, ip6, th); } else #endif /* INET6 */ { ip = mtod(m, struct ip *); ipov = (struct ipovly *)ip; th = (struct tcphdr *)(ip + 1); tcpip_fillheaders(tp->t_inpcb, ip, th); } /* * Fill in fields, remembering maximum advertised * window for use in delaying messages about window sizes. * If resending a FIN, be sure not to use a new sequence number. */ if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && tp->snd_nxt == tp->snd_max) tp->snd_nxt--; /* * If we are starting a connection, send ECN setup * SYN packet. If we are on a retransmit, we may * resend those bits a number of times as per * RFC 3168. */ if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) { if (tp->t_rxtshift >= 1) { if (tp->t_rxtshift <= V_tcp_ecn_maxretries) flags |= TH_ECE|TH_CWR; } else flags |= TH_ECE|TH_CWR; } if (tp->t_state == TCPS_ESTABLISHED && (tp->t_flags & TF_ECN_PERMIT)) { /* * If the peer has ECN, mark data packets with * ECN capable transmission (ECT). * Ignore pure ack packets, retransmissions and window probes. */ if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && !((tp->t_flags & TF_FORCEDATA) && len == 1)) { #ifdef INET6 if (isipv6) ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); else #endif ip->ip_tos |= IPTOS_ECN_ECT0; TCPSTAT_INC(tcps_ecn_ect0); } /* * Reply with proper ECN notifications. */ if (tp->t_flags & TF_ECN_SND_CWR) { flags |= TH_CWR; tp->t_flags &= ~TF_ECN_SND_CWR; } if (tp->t_flags & TF_ECN_SND_ECE) flags |= TH_ECE; } /* * If we are doing retransmissions, then snd_nxt will * not reflect the first unsent octet. For ACK only * packets, we do not want the sequence number of the * retransmitted packet, we want the sequence number * of the next unsent octet. So, if there is no data * (and no SYN or FIN), use snd_max instead of snd_nxt * when filling in ti_seq. But if we are in persist * state, snd_max might reflect one byte beyond the * right edge of the window, so use snd_nxt in that * case, since we know we aren't doing a retransmission. * (retransmit and persist are mutually exclusive...) */ if (sack_rxmit == 0) { if (len || (flags & (TH_SYN|TH_FIN)) || tcp_timer_active(tp, TT_PERSIST)) th->th_seq = htonl(tp->snd_nxt); else th->th_seq = htonl(tp->snd_max); } else { th->th_seq = htonl(p->rxmit); p->rxmit += len; tp->sackhint.sack_bytes_rexmit += len; } th->th_ack = htonl(tp->rcv_nxt); if (optlen) { bcopy(opt, th + 1, optlen); th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; } th->th_flags = flags; /* * Calculate receive window. Don't shrink window, * but avoid silly window syndrome. */ if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && recwin < (long)tp->t_maxseg) recwin = 0; if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) recwin = (long)(tp->rcv_adv - tp->rcv_nxt); if (recwin > (long)TCP_MAXWIN << tp->rcv_scale) recwin = (long)TCP_MAXWIN << tp->rcv_scale; /* * According to RFC1323 the window field in a SYN (i.e., a * or ) segment itself is never scaled. The * case is handled in syncache. */ if (flags & TH_SYN) th->th_win = htons((u_short) (min(sbspace(&so->so_rcv), TCP_MAXWIN))); else th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); /* * Adjust the RXWIN0SENT flag - indicate that we have advertised * a 0 window. This may cause the remote transmitter to stall. This * flag tells soreceive() to disable delayed acknowledgements when * draining the buffer. This can occur if the receiver is attempting * to read more data than can be buffered prior to transmitting on * the connection. */ if (th->th_win == 0) { tp->t_sndzerowin++; tp->t_flags |= TF_RXWIN0SENT; } else tp->t_flags &= ~TF_RXWIN0SENT; if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); th->th_flags |= TH_URG; } else /* * If no urgent pointer to send, then we pull * the urgent pointer to the left edge of the send window * so that it doesn't drift into the send window on sequence * number wraparound. */ tp->snd_up = tp->snd_una; /* drag it along */ #ifdef TCP_SIGNATURE if (tp->t_flags & TF_SIGNATURE) { int sigoff = to.to_signature - opt; tcp_signature_compute(m, 0, len, optlen, (u_char *)(th + 1) + sigoff, IPSEC_DIR_OUTBOUND); } #endif /* * Put TCP length in extended header, and then * checksum extended header and data. */ m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); #ifdef INET6 if (isipv6) { /* * ip6_plen is not need to be filled now, and will be filled * in ip6_output. */ m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 0); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET { m->m_pkthdr.csum_flags = CSUM_TCP; th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen)); /* IP version must be set here for ipv4/ipv6 checking later */ KASSERT(ip->ip_v == IPVERSION, ("%s: IP version incorrect: %d", __func__, ip->ip_v)); } #endif /* * Enable TSO and specify the size of the segments. * The TCP pseudo header checksum is always provided. * XXX: Fixme: This is currently not the case for IPv6. */ if (tso) { KASSERT(len > tp->t_maxopd - optlen, ("%s: len <= tso_segsz", __func__)); m->m_pkthdr.csum_flags |= CSUM_TSO; m->m_pkthdr.tso_segsz = tp->t_maxopd - optlen; } #ifdef IPSEC KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL), ("%s: mbuf chain shorter than expected: %ld + %u + %u - %u != %u", __func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL))); #else KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL), ("%s: mbuf chain shorter than expected: %ld + %u + %u != %u", __func__, len, hdrlen, ipoptlen, m_length(m, NULL))); #endif /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ hhook_run_tcp_est_out(tp, th, &to, len, tso); #ifdef TCPDEBUG /* * Trace. */ if (so->so_options & SO_DEBUG) { u_short save = 0; #ifdef INET6 if (!isipv6) #endif { save = ipov->ih_len; ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + (th->th_off << 2) */); } tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); #ifdef INET6 if (!isipv6) #endif ipov->ih_len = save; } #endif /* TCPDEBUG */ /* * Fill in IP length and desired time to live and * send to IP level. There should be a better way * to handle ttl and tos; we could keep them in * the template, but need a way to checksum without them. */ /* * m->m_pkthdr.len should have been set before checksum calculation, * because in6_cksum() need it. */ #ifdef INET6 if (isipv6) { struct route_in6 ro; bzero(&ro, sizeof(ro)); /* * we separately set hoplimit for every segment, since the * user might want to change the value via setsockopt. * Also, desired default hop limit might be changed via * Neighbor Discovery. */ ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL); /* * Set the packet size here for the benefit of DTrace probes. * ip6_output() will set it properly; it's supposed to include * the option header lengths as well. */ ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); if (V_path_mtu_discovery && tp->t_maxopd > V_tcp_minmss) tp->t_flags2 |= TF2_PLPMTU_PMTUD; else tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; if (tp->t_state == TCPS_SYN_SENT) TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); TCP_PROBE5(send, NULL, tp, ip6, tp, th); /* TODO: IPv6 IP6TOS_ECT bit on */ error = ip6_output(m, tp->t_inpcb->in6p_outputopts, &ro, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), NULL, NULL, tp->t_inpcb); if (error == EMSGSIZE && ro.ro_rt != NULL) mtu = ro.ro_rt->rt_mtu; RO_RTFREE(&ro); } #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET { struct route ro; bzero(&ro, sizeof(ro)); ip->ip_len = htons(m->m_pkthdr.len); #ifdef INET6 if (tp->t_inpcb->inp_vflag & INP_IPV6PROTO) ip->ip_ttl = in6_selecthlim(tp->t_inpcb, NULL); #endif /* INET6 */ /* * If we do path MTU discovery, then we set DF on every packet. * This might not be the best thing to do according to RFC3390 * Section 2. However the tcp hostcache migitates the problem * so it affects only the first tcp connection with a host. * * NB: Don't set DF on small MTU/MSS to have a safe fallback. */ if (V_path_mtu_discovery && tp->t_maxopd > V_tcp_minmss) { ip->ip_off |= htons(IP_DF); tp->t_flags2 |= TF2_PLPMTU_PMTUD; } else { tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; } if (tp->t_state == TCPS_SYN_SENT) TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); TCP_PROBE5(send, NULL, tp, ip, tp, th); error = ip_output(m, tp->t_inpcb->inp_options, &ro, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0, tp->t_inpcb); if (error == EMSGSIZE && ro.ro_rt != NULL) mtu = ro.ro_rt->rt_mtu; RO_RTFREE(&ro); } #endif /* INET */ out: /* * In transmit state, time the transmission and arrange for * the retransmit. In persist state, just set snd_max. */ if ((tp->t_flags & TF_FORCEDATA) == 0 || !tcp_timer_active(tp, TT_PERSIST)) { tcp_seq startseq = tp->snd_nxt; /* * Advance snd_nxt over sequence space of this segment. */ if (flags & (TH_SYN|TH_FIN)) { if (flags & TH_SYN) tp->snd_nxt++; if (flags & TH_FIN) { tp->snd_nxt++; tp->t_flags |= TF_SENTFIN; } } if (sack_rxmit) goto timer; tp->snd_nxt += len; if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { tp->snd_max = tp->snd_nxt; /* * Time this transmission if not a retransmission and * not currently timing anything. */ if (tp->t_rtttime == 0) { tp->t_rtttime = ticks; tp->t_rtseq = startseq; TCPSTAT_INC(tcps_segstimed); } } /* * Set retransmit timer if not currently set, * and not doing a pure ack or a keep-alive probe. * Initial value for retransmit timer is smoothed * round-trip time + 2 * round-trip time variance. * Initialize shift counter which is used for backoff * of retransmit time. */ timer: if (!tcp_timer_active(tp, TT_REXMT) && ((sack_rxmit && tp->snd_nxt != tp->snd_max) || (tp->snd_nxt != tp->snd_una))) { if (tcp_timer_active(tp, TT_PERSIST)) { tcp_timer_activate(tp, TT_PERSIST, 0); tp->t_rxtshift = 0; } tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); } } else { /* * Persist case, update snd_max but since we are in * persist mode (no window) we do not update snd_nxt. */ int xlen = len; if (flags & TH_SYN) ++xlen; if (flags & TH_FIN) { ++xlen; tp->t_flags |= TF_SENTFIN; } if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) tp->snd_max = tp->snd_nxt + len; } if (error) { /* * We know that the packet was lost, so back out the * sequence number advance, if any. * * If the error is EPERM the packet got blocked by the * local firewall. Normally we should terminate the * connection but the blocking may have been spurious * due to a firewall reconfiguration cycle. So we treat * it like a packet loss and let the retransmit timer and * timeouts do their work over time. * XXX: It is a POLA question whether calling tcp_drop right * away would be the really correct behavior instead. */ if (((tp->t_flags & TF_FORCEDATA) == 0 || !tcp_timer_active(tp, TT_PERSIST)) && ((flags & TH_SYN) == 0) && (error != EPERM)) { if (sack_rxmit) { p->rxmit -= len; tp->sackhint.sack_bytes_rexmit -= len; KASSERT(tp->sackhint.sack_bytes_rexmit >= 0, ("sackhint bytes rtx >= 0")); } else tp->snd_nxt -= len; } SOCKBUF_UNLOCK_ASSERT(&so->so_snd); /* Check gotos. */ switch (error) { case EPERM: tp->t_softerror = error; return (error); case ENOBUFS: if (!tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); tp->snd_cwnd = tp->t_maxseg; return (0); case EMSGSIZE: /* * For some reason the interface we used initially * to send segments changed to another or lowered * its MTU. * If TSO was active we either got an interface * without TSO capabilits or TSO was turned off. * If we obtained mtu from ip_output() then update * it and try again. */ if (tso) tp->t_flags &= ~TF_TSO; if (mtu != 0) { tcp_mss_update(tp, -1, mtu, NULL, NULL); goto again; } return (error); case EHOSTDOWN: case EHOSTUNREACH: case ENETDOWN: case ENETUNREACH: if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_softerror = error; return (0); } /* FALLTHROUGH */ default: return (error); } } TCPSTAT_INC(tcps_sndtotal); /* * Data sent (as far as we can tell). * If this advertises a larger window than any other segment, * then remember the size of the advertised window. * Any pending ACK has now been sent. */ if (recwin >= 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) tp->rcv_adv = tp->rcv_nxt + recwin; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); if (tcp_timer_active(tp, TT_DELACK)) tcp_timer_activate(tp, TT_DELACK, 0); #if 0 /* * This completely breaks TCP if newreno is turned on. What happens * is that if delayed-acks are turned on on the receiver, this code * on the transmitter effectively destroys the TCP window, forcing * it to four packets (1.5Kx4 = 6K window). */ if (sendalot && --maxburst) goto again; #endif if (sendalot) goto again; return (0); } void tcp_setpersist(struct tcpcb *tp) { int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; int tt; tp->t_flags &= ~TF_PREVVALID; if (tcp_timer_active(tp, TT_REXMT)) panic("tcp_setpersist: retransmit pending"); /* * Start/restart persistance timer. */ TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], TCPTV_PERSMIN, TCPTV_PERSMAX); tcp_timer_activate(tp, TT_PERSIST, tt); if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; } /* * Insert TCP options according to the supplied parameters to the place * optp in a consistent way. Can handle unaligned destinations. * * The order of the option processing is crucial for optimal packing and * alignment for the scarce option space. * * The optimal order for a SYN/SYN-ACK segment is: * MSS (4) + NOP (1) + Window scale (3) + SACK permitted (2) + * Timestamp (10) + Signature (18) = 38 bytes out of a maximum of 40. * * The SACK options should be last. SACK blocks consume 8*n+2 bytes. * So a full size SACK blocks option is 34 bytes (with 4 SACK blocks). * At minimum we need 10 bytes (to generate 1 SACK block). If both * TCP Timestamps (12 bytes) and TCP Signatures (18 bytes) are present, * we only have 10 bytes for SACK options (40 - (12 + 18)). */ int tcp_addoptions(struct tcpopt *to, u_char *optp) { u_int mask, optlen = 0; for (mask = 1; mask < TOF_MAXOPT; mask <<= 1) { if ((to->to_flags & mask) != mask) continue; if (optlen == TCP_MAXOLEN) break; switch (to->to_flags & mask) { case TOF_MSS: while (optlen % 4) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_MAXSEG) continue; optlen += TCPOLEN_MAXSEG; *optp++ = TCPOPT_MAXSEG; *optp++ = TCPOLEN_MAXSEG; to->to_mss = htons(to->to_mss); bcopy((u_char *)&to->to_mss, optp, sizeof(to->to_mss)); optp += sizeof(to->to_mss); break; case TOF_SCALE: while (!optlen || optlen % 2 != 1) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_WINDOW) continue; optlen += TCPOLEN_WINDOW; *optp++ = TCPOPT_WINDOW; *optp++ = TCPOLEN_WINDOW; *optp++ = to->to_wscale; break; case TOF_SACKPERM: while (optlen % 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_SACK_PERMITTED) continue; optlen += TCPOLEN_SACK_PERMITTED; *optp++ = TCPOPT_SACK_PERMITTED; *optp++ = TCPOLEN_SACK_PERMITTED; break; case TOF_TS: while (!optlen || optlen % 4 != 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_TIMESTAMP) continue; optlen += TCPOLEN_TIMESTAMP; *optp++ = TCPOPT_TIMESTAMP; *optp++ = TCPOLEN_TIMESTAMP; to->to_tsval = htonl(to->to_tsval); to->to_tsecr = htonl(to->to_tsecr); bcopy((u_char *)&to->to_tsval, optp, sizeof(to->to_tsval)); optp += sizeof(to->to_tsval); bcopy((u_char *)&to->to_tsecr, optp, sizeof(to->to_tsecr)); optp += sizeof(to->to_tsecr); break; case TOF_SIGNATURE: { int siglen = TCPOLEN_SIGNATURE - 2; while (!optlen || optlen % 4 != 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_SIGNATURE) continue; optlen += TCPOLEN_SIGNATURE; *optp++ = TCPOPT_SIGNATURE; *optp++ = TCPOLEN_SIGNATURE; to->to_signature = optp; while (siglen--) *optp++ = 0; break; } case TOF_SACK: { int sackblks = 0; struct sackblk *sack = (struct sackblk *)to->to_sacks; tcp_seq sack_seq; while (!optlen || optlen % 4 != 2) { optlen += TCPOLEN_NOP; *optp++ = TCPOPT_NOP; } if (TCP_MAXOLEN - optlen < TCPOLEN_SACKHDR + TCPOLEN_SACK) continue; optlen += TCPOLEN_SACKHDR; *optp++ = TCPOPT_SACK; sackblks = min(to->to_nsacks, (TCP_MAXOLEN - optlen) / TCPOLEN_SACK); *optp++ = TCPOLEN_SACKHDR + sackblks * TCPOLEN_SACK; while (sackblks--) { sack_seq = htonl(sack->start); bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq)); optp += sizeof(sack_seq); sack_seq = htonl(sack->end); bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq)); optp += sizeof(sack_seq); optlen += TCPOLEN_SACK; sack++; } TCPSTAT_INC(tcps_sack_send_blocks); break; } default: panic("%s: unknown TCP option type", __func__); break; } } /* Terminate and pad TCP options to a 4 byte boundary. */ if (optlen % 4) { optlen += TCPOLEN_EOL; *optp++ = TCPOPT_EOL; } /* * According to RFC 793 (STD0007): * "The content of the header beyond the End-of-Option option * must be header padding (i.e., zero)." * and later: "The padding is composed of zeros." */ while (optlen % 4) { optlen += TCPOLEN_PAD; *optp++ = TCPOPT_PAD; } KASSERT(optlen <= TCP_MAXOLEN, ("%s: TCP options too long", __func__)); return (optlen); } Index: user/dchagin/lemul/sys/netinet/tcp_sack.c =================================================================== --- user/dchagin/lemul/sys/netinet/tcp_sack.c (revision 274240) +++ user/dchagin/lemul/sys/netinet/tcp_sack.c (revision 274241) @@ -1,687 +1,687 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_sack.c 8.12 (Berkeley) 5/24/95 */ /*- * @@(#)COPYRIGHT 1.1 (NRL) 17 January 1995 * * NRL grants permission for redistribution and use in source and binary * forms, with or without modification, of the software and documentation * created at NRL provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgements: * This product includes software developed by the University of * California, Berkeley and its contributors. * This product includes software developed at the Information * Technology Division, US Naval Research Laboratory. * 4. Neither the name of the NRL nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * The views and conclusions contained in the software and documentation * are those of the authors and should not be interpreted as representing * official policies, either expressed or implied, of the US Naval * Research Laboratory (NRL). */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include /* before tcp_seq.h, for tcp_random18() */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ #include VNET_DECLARE(struct uma_zone *, sack_hole_zone); #define V_sack_hole_zone VNET(sack_hole_zone) SYSCTL_NODE(_net_inet_tcp, OID_AUTO, sack, CTLFLAG_RW, 0, "TCP SACK"); VNET_DEFINE(int, tcp_do_sack) = 1; #define V_tcp_do_sack VNET(tcp_do_sack) -SYSCTL_VNET_INT(_net_inet_tcp_sack, OID_AUTO, enable, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_sack), 0, "Enable/Disable TCP SACK support"); VNET_DEFINE(int, tcp_sack_maxholes) = 128; #define V_tcp_sack_maxholes VNET(tcp_sack_maxholes) -SYSCTL_VNET_INT(_net_inet_tcp_sack, OID_AUTO, maxholes, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, maxholes, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sack_maxholes), 0, "Maximum number of TCP SACK holes allowed per connection"); VNET_DEFINE(int, tcp_sack_globalmaxholes) = 65536; #define V_tcp_sack_globalmaxholes VNET(tcp_sack_globalmaxholes) -SYSCTL_VNET_INT(_net_inet_tcp_sack, OID_AUTO, globalmaxholes, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, globalmaxholes, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sack_globalmaxholes), 0, "Global maximum number of TCP SACK holes"); VNET_DEFINE(int, tcp_sack_globalholes) = 0; #define V_tcp_sack_globalholes VNET(tcp_sack_globalholes) -SYSCTL_VNET_INT(_net_inet_tcp_sack, OID_AUTO, globalholes, CTLFLAG_RD, +SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, globalholes, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(tcp_sack_globalholes), 0, "Global number of TCP SACK holes currently allocated"); /* * This function is called upon receipt of new valid data (while not in * header prediction mode), and it updates the ordered list of sacks. */ void tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end) { /* * First reported block MUST be the most recent one. Subsequent * blocks SHOULD be in the order in which they arrived at the * receiver. These two conditions make the implementation fully * compliant with RFC 2018. */ struct sackblk head_blk, saved_blks[MAX_SACK_BLKS]; int num_head, num_saved, i; INP_WLOCK_ASSERT(tp->t_inpcb); /* Check arguments. */ KASSERT(SEQ_LT(rcv_start, rcv_end), ("rcv_start < rcv_end")); /* SACK block for the received segment. */ head_blk.start = rcv_start; head_blk.end = rcv_end; /* * Merge updated SACK blocks into head_blk, and save unchanged SACK * blocks into saved_blks[]. num_saved will have the number of the * saved SACK blocks. */ num_saved = 0; for (i = 0; i < tp->rcv_numsacks; i++) { tcp_seq start = tp->sackblks[i].start; tcp_seq end = tp->sackblks[i].end; if (SEQ_GEQ(start, end) || SEQ_LEQ(start, tp->rcv_nxt)) { /* * Discard this SACK block. */ } else if (SEQ_LEQ(head_blk.start, end) && SEQ_GEQ(head_blk.end, start)) { /* * Merge this SACK block into head_blk. This SACK * block itself will be discarded. */ if (SEQ_GT(head_blk.start, start)) head_blk.start = start; if (SEQ_LT(head_blk.end, end)) head_blk.end = end; } else { /* * Save this SACK block. */ saved_blks[num_saved].start = start; saved_blks[num_saved].end = end; num_saved++; } } /* * Update SACK list in tp->sackblks[]. */ num_head = 0; if (SEQ_GT(head_blk.start, tp->rcv_nxt)) { /* * The received data segment is an out-of-order segment. Put * head_blk at the top of SACK list. */ tp->sackblks[0] = head_blk; num_head = 1; /* * If the number of saved SACK blocks exceeds its limit, * discard the last SACK block. */ if (num_saved >= MAX_SACK_BLKS) num_saved--; } if (num_saved > 0) { /* * Copy the saved SACK blocks back. */ bcopy(saved_blks, &tp->sackblks[num_head], sizeof(struct sackblk) * num_saved); } /* Save the number of SACK blocks. */ tp->rcv_numsacks = num_head + num_saved; } /* * Delete all receiver-side SACK information. */ void tcp_clean_sackreport(struct tcpcb *tp) { int i; INP_WLOCK_ASSERT(tp->t_inpcb); tp->rcv_numsacks = 0; for (i = 0; i < MAX_SACK_BLKS; i++) tp->sackblks[i].start = tp->sackblks[i].end=0; } /* * Allocate struct sackhole. */ static struct sackhole * tcp_sackhole_alloc(struct tcpcb *tp, tcp_seq start, tcp_seq end) { struct sackhole *hole; if (tp->snd_numholes >= V_tcp_sack_maxholes || V_tcp_sack_globalholes >= V_tcp_sack_globalmaxholes) { TCPSTAT_INC(tcps_sack_sboverflow); return NULL; } hole = (struct sackhole *)uma_zalloc(V_sack_hole_zone, M_NOWAIT); if (hole == NULL) return NULL; hole->start = start; hole->end = end; hole->rxmit = start; tp->snd_numholes++; atomic_add_int(&V_tcp_sack_globalholes, 1); return hole; } /* * Free struct sackhole. */ static void tcp_sackhole_free(struct tcpcb *tp, struct sackhole *hole) { uma_zfree(V_sack_hole_zone, hole); tp->snd_numholes--; atomic_subtract_int(&V_tcp_sack_globalholes, 1); KASSERT(tp->snd_numholes >= 0, ("tp->snd_numholes >= 0")); KASSERT(V_tcp_sack_globalholes >= 0, ("tcp_sack_globalholes >= 0")); } /* * Insert new SACK hole into scoreboard. */ static struct sackhole * tcp_sackhole_insert(struct tcpcb *tp, tcp_seq start, tcp_seq end, struct sackhole *after) { struct sackhole *hole; /* Allocate a new SACK hole. */ hole = tcp_sackhole_alloc(tp, start, end); if (hole == NULL) return NULL; /* Insert the new SACK hole into scoreboard. */ if (after != NULL) TAILQ_INSERT_AFTER(&tp->snd_holes, after, hole, scblink); else TAILQ_INSERT_TAIL(&tp->snd_holes, hole, scblink); /* Update SACK hint. */ if (tp->sackhint.nexthole == NULL) tp->sackhint.nexthole = hole; return hole; } /* * Remove SACK hole from scoreboard. */ static void tcp_sackhole_remove(struct tcpcb *tp, struct sackhole *hole) { /* Update SACK hint. */ if (tp->sackhint.nexthole == hole) tp->sackhint.nexthole = TAILQ_NEXT(hole, scblink); /* Remove this SACK hole. */ TAILQ_REMOVE(&tp->snd_holes, hole, scblink); /* Free this SACK hole. */ tcp_sackhole_free(tp, hole); } /* * Process cumulative ACK and the TCP SACK option to update the scoreboard. * tp->snd_holes is an ordered list of holes (oldest to newest, in terms of * the sequence space). */ void tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack) { struct sackhole *cur, *temp; struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1], *sblkp; int i, j, num_sack_blks; INP_WLOCK_ASSERT(tp->t_inpcb); num_sack_blks = 0; /* * If SND.UNA will be advanced by SEG.ACK, and if SACK holes exist, * treat [SND.UNA, SEG.ACK) as if it is a SACK block. */ if (SEQ_LT(tp->snd_una, th_ack) && !TAILQ_EMPTY(&tp->snd_holes)) { sack_blocks[num_sack_blks].start = tp->snd_una; sack_blocks[num_sack_blks++].end = th_ack; } /* * Append received valid SACK blocks to sack_blocks[], but only if we * received new blocks from the other side. */ if (to->to_flags & TOF_SACK) { for (i = 0; i < to->to_nsacks; i++) { bcopy((to->to_sacks + i * TCPOLEN_SACK), &sack, sizeof(sack)); sack.start = ntohl(sack.start); sack.end = ntohl(sack.end); if (SEQ_GT(sack.end, sack.start) && SEQ_GT(sack.start, tp->snd_una) && SEQ_GT(sack.start, th_ack) && SEQ_LT(sack.start, tp->snd_max) && SEQ_GT(sack.end, tp->snd_una) && SEQ_LEQ(sack.end, tp->snd_max)) sack_blocks[num_sack_blks++] = sack; } } /* * Return if SND.UNA is not advanced and no valid SACK block is * received. */ if (num_sack_blks == 0) return; /* * Sort the SACK blocks so we can update the scoreboard with just one * pass. The overhead of sorting upto 4+1 elements is less than * making upto 4+1 passes over the scoreboard. */ for (i = 0; i < num_sack_blks; i++) { for (j = i + 1; j < num_sack_blks; j++) { if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { sack = sack_blocks[i]; sack_blocks[i] = sack_blocks[j]; sack_blocks[j] = sack; } } } if (TAILQ_EMPTY(&tp->snd_holes)) /* * Empty scoreboard. Need to initialize snd_fack (it may be * uninitialized or have a bogus value). Scoreboard holes * (from the sack blocks received) are created later below * (in the logic that adds holes to the tail of the * scoreboard). */ tp->snd_fack = SEQ_MAX(tp->snd_una, th_ack); /* * In the while-loop below, incoming SACK blocks (sack_blocks[]) and * SACK holes (snd_holes) are traversed from their tails with just * one pass in order to reduce the number of compares especially when * the bandwidth-delay product is large. * * Note: Typically, in the first RTT of SACK recovery, the highest * three or four SACK blocks with the same ack number are received. * In the second RTT, if retransmitted data segments are not lost, * the highest three or four SACK blocks with ack number advancing * are received. */ sblkp = &sack_blocks[num_sack_blks - 1]; /* Last SACK block */ tp->sackhint.last_sack_ack = sblkp->end; if (SEQ_LT(tp->snd_fack, sblkp->start)) { /* * The highest SACK block is beyond fack. Append new SACK * hole at the tail. If the second or later highest SACK * blocks are also beyond the current fack, they will be * inserted by way of hole splitting in the while-loop below. */ temp = tcp_sackhole_insert(tp, tp->snd_fack,sblkp->start,NULL); if (temp != NULL) { tp->snd_fack = sblkp->end; /* Go to the previous sack block. */ sblkp--; } else { /* * We failed to add a new hole based on the current * sack block. Skip over all the sack blocks that * fall completely to the right of snd_fack and * proceed to trim the scoreboard based on the * remaining sack blocks. This also trims the * scoreboard for th_ack (which is sack_blocks[0]). */ while (sblkp >= sack_blocks && SEQ_LT(tp->snd_fack, sblkp->start)) sblkp--; if (sblkp >= sack_blocks && SEQ_LT(tp->snd_fack, sblkp->end)) tp->snd_fack = sblkp->end; } } else if (SEQ_LT(tp->snd_fack, sblkp->end)) /* fack is advanced. */ tp->snd_fack = sblkp->end; /* We must have at least one SACK hole in scoreboard. */ KASSERT(!TAILQ_EMPTY(&tp->snd_holes), ("SACK scoreboard must not be empty")); cur = TAILQ_LAST(&tp->snd_holes, sackhole_head); /* Last SACK hole. */ /* * Since the incoming sack blocks are sorted, we can process them * making one sweep of the scoreboard. */ while (sblkp >= sack_blocks && cur != NULL) { if (SEQ_GEQ(sblkp->start, cur->end)) { /* * SACKs data beyond the current hole. Go to the * previous sack block. */ sblkp--; continue; } if (SEQ_LEQ(sblkp->end, cur->start)) { /* * SACKs data before the current hole. Go to the * previous hole. */ cur = TAILQ_PREV(cur, sackhole_head, scblink); continue; } tp->sackhint.sack_bytes_rexmit -= (cur->rxmit - cur->start); KASSERT(tp->sackhint.sack_bytes_rexmit >= 0, ("sackhint bytes rtx >= 0")); if (SEQ_LEQ(sblkp->start, cur->start)) { /* Data acks at least the beginning of hole. */ if (SEQ_GEQ(sblkp->end, cur->end)) { /* Acks entire hole, so delete hole. */ temp = cur; cur = TAILQ_PREV(cur, sackhole_head, scblink); tcp_sackhole_remove(tp, temp); /* * The sack block may ack all or part of the * next hole too, so continue onto the next * hole. */ continue; } else { /* Move start of hole forward. */ cur->start = sblkp->end; cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); } } else { /* Data acks at least the end of hole. */ if (SEQ_GEQ(sblkp->end, cur->end)) { /* Move end of hole backward. */ cur->end = sblkp->start; cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); } else { /* * ACKs some data in middle of a hole; need * to split current hole */ temp = tcp_sackhole_insert(tp, sblkp->end, cur->end, cur); if (temp != NULL) { if (SEQ_GT(cur->rxmit, temp->rxmit)) { temp->rxmit = cur->rxmit; tp->sackhint.sack_bytes_rexmit += (temp->rxmit - temp->start); } cur->end = sblkp->start; cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); } } } tp->sackhint.sack_bytes_rexmit += (cur->rxmit - cur->start); /* * Testing sblkp->start against cur->start tells us whether * we're done with the sack block or the sack hole. * Accordingly, we advance one or the other. */ if (SEQ_LEQ(sblkp->start, cur->start)) cur = TAILQ_PREV(cur, sackhole_head, scblink); else sblkp--; } } /* * Free all SACK holes to clear the scoreboard. */ void tcp_free_sackholes(struct tcpcb *tp) { struct sackhole *q; INP_WLOCK_ASSERT(tp->t_inpcb); while ((q = TAILQ_FIRST(&tp->snd_holes)) != NULL) tcp_sackhole_remove(tp, q); tp->sackhint.sack_bytes_rexmit = 0; KASSERT(tp->snd_numholes == 0, ("tp->snd_numholes == 0")); KASSERT(tp->sackhint.nexthole == NULL, ("tp->sackhint.nexthole == NULL")); } /* * Partial ack handling within a sack recovery episode. Keeping this very * simple for now. When a partial ack is received, force snd_cwnd to a value * that will allow the sender to transmit no more than 2 segments. If * necessary, a better scheme can be adopted at a later point, but for now, * the goal is to prevent the sender from bursting a large amount of data in * the midst of sack recovery. */ void tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th) { int num_segs = 1; INP_WLOCK_ASSERT(tp->t_inpcb); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; /* Send one or 2 segments based on how much new data was acked. */ if ((BYTES_THIS_ACK(tp, th) / tp->t_maxseg) >= 2) num_segs = 2; tp->snd_cwnd = (tp->sackhint.sack_bytes_rexmit + (tp->snd_nxt - tp->sack_newdata) + num_segs * tp->t_maxseg); if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); } #if 0 /* * Debug version of tcp_sack_output() that walks the scoreboard. Used for * now to sanity check the hint. */ static struct sackhole * tcp_sack_output_debug(struct tcpcb *tp, int *sack_bytes_rexmt) { struct sackhole *p; INP_WLOCK_ASSERT(tp->t_inpcb); *sack_bytes_rexmt = 0; TAILQ_FOREACH(p, &tp->snd_holes, scblink) { if (SEQ_LT(p->rxmit, p->end)) { if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */ continue; } *sack_bytes_rexmt += (p->rxmit - p->start); break; } *sack_bytes_rexmt += (p->rxmit - p->start); } return (p); } #endif /* * Returns the next hole to retransmit and the number of retransmitted bytes * from the scoreboard. We store both the next hole and the number of * retransmitted bytes as hints (and recompute these on the fly upon SACK/ACK * reception). This avoids scoreboard traversals completely. * * The loop here will traverse *at most* one link. Here's the argument. For * the loop to traverse more than 1 link before finding the next hole to * retransmit, we would need to have at least 1 node following the current * hint with (rxmit == end). But, for all holes following the current hint, * (start == rxmit), since we have not yet retransmitted from them. * Therefore, in order to traverse more 1 link in the loop below, we need to * have at least one node following the current hint with (start == rxmit == * end). But that can't happen, (start == end) means that all the data in * that hole has been sacked, in which case, the hole would have been removed * from the scoreboard. */ struct sackhole * tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt) { struct sackhole *hole = NULL; INP_WLOCK_ASSERT(tp->t_inpcb); *sack_bytes_rexmt = tp->sackhint.sack_bytes_rexmit; hole = tp->sackhint.nexthole; if (hole == NULL || SEQ_LT(hole->rxmit, hole->end)) goto out; while ((hole = TAILQ_NEXT(hole, scblink)) != NULL) { if (SEQ_LT(hole->rxmit, hole->end)) { tp->sackhint.nexthole = hole; break; } } out: return (hole); } /* * After a timeout, the SACK list may be rebuilt. This SACK information * should be used to avoid retransmitting SACKed data. This function * traverses the SACK list to see if snd_nxt should be moved forward. */ void tcp_sack_adjust(struct tcpcb *tp) { struct sackhole *p, *cur = TAILQ_FIRST(&tp->snd_holes); INP_WLOCK_ASSERT(tp->t_inpcb); if (cur == NULL) return; /* No holes */ if (SEQ_GEQ(tp->snd_nxt, tp->snd_fack)) return; /* We're already beyond any SACKed blocks */ /*- * Two cases for which we want to advance snd_nxt: * i) snd_nxt lies between end of one hole and beginning of another * ii) snd_nxt lies between end of last hole and snd_fack */ while ((p = TAILQ_NEXT(cur, scblink)) != NULL) { if (SEQ_LT(tp->snd_nxt, cur->end)) return; if (SEQ_GEQ(tp->snd_nxt, p->start)) cur = p; else { tp->snd_nxt = p->start; return; } } if (SEQ_LT(tp->snd_nxt, cur->end)) return; tp->snd_nxt = tp->snd_fack; } Index: user/dchagin/lemul/sys/netinet/tcp_subr.c =================================================================== --- user/dchagin/lemul/sys/netinet/tcp_subr.c (revision 274240) +++ user/dchagin/lemul/sys/netinet/tcp_subr.c (revision 274241) @@ -1,2453 +1,2453 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #include #endif #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef TCPDEBUG #include #endif #ifdef INET6 #include #endif #ifdef TCP_OFFLOAD #include #endif #ifdef IPSEC #include #include #ifdef INET6 #include #endif #include #include #endif /*IPSEC*/ #include #include #include VNET_DEFINE(int, tcp_mssdflt) = TCP_MSS; #ifdef INET6 VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS; #endif static int sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS) { int error, new; new = V_tcp_mssdflt; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if (new < TCP_MINMSS) error = EINVAL; else V_tcp_mssdflt = new; } return (error); } -SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, - CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_mssdflt), 0, +SYSCTL_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(tcp_mssdflt), 0, &sysctl_net_inet_tcp_mss_check, "I", "Default TCP Maximum Segment Size"); #ifdef INET6 static int sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS) { int error, new; new = V_tcp_v6mssdflt; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if (new < TCP_MINMSS) error = EINVAL; else V_tcp_v6mssdflt = new; } return (error); } -SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, - CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_v6mssdflt), 0, +SYSCTL_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(tcp_v6mssdflt), 0, &sysctl_net_inet_tcp_mss_v6_check, "I", "Default TCP Maximum Segment Size for IPv6"); #endif /* INET6 */ /* * Minimum MSS we accept and use. This prevents DoS attacks where * we are forced to a ridiculous low MSS like 20 and send hundreds * of packets instead of one. The effect scales with the available * bandwidth and quickly saturates the CPU and network interface * with packet generation and sending. Set to zero to disable MINMSS * checking. This setting prevents us from sending too small packets. */ VNET_DEFINE(int, tcp_minmss) = TCP_MINMSS; -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_minmss), 0, "Minimum TCP Maximum Segment Size"); VNET_DEFINE(int, tcp_do_rfc1323) = 1; -SYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc1323), 0, "Enable rfc1323 (high performance TCP) extensions"); static int tcp_log_debug = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW, &tcp_log_debug, 0, "Log errors caused by incoming TCP segments"); static int tcp_tcbhashsize; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); static int do_tcpdrain = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, "Enable tcp_drain routine for extra help when low on mbufs"); -SYSCTL_VNET_UINT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD, +SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(tcbinfo.ipi_count), 0, "Number of active PCBs"); static VNET_DEFINE(int, icmp_may_rst) = 1; #define V_icmp_may_rst VNET(icmp_may_rst) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp_may_rst), 0, "Certain ICMP unreachable messages may abort connections in SYN_SENT"); static VNET_DEFINE(int, tcp_isn_reseed_interval) = 0; #define V_tcp_isn_reseed_interval VNET(tcp_isn_reseed_interval) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_isn_reseed_interval), 0, "Seconds between reseeding of ISN secret"); static int tcp_soreceive_stream; SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN, &tcp_soreceive_stream, 0, "Using soreceive_stream for TCP sockets"); #ifdef TCP_SIGNATURE static int tcp_sig_checksigs = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, signature_verify_input, CTLFLAG_RW, &tcp_sig_checksigs, 0, "Verify RFC2385 digests on inbound traffic"); #endif VNET_DEFINE(uma_zone_t, sack_hole_zone); #define V_sack_hole_zone VNET(sack_hole_zone) VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]); static struct inpcb *tcp_notify(struct inpcb *, int); static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int); static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr); /* * Target size of TCP PCB hash tables. Must be a power of two. * * Note that this can be overridden by the kernel environment * variable net.inet.tcp.tcbhashsize */ #ifndef TCBHASHSIZE #define TCBHASHSIZE 0 #endif /* * XXX * Callouts should be moved into struct tcp directly. They are currently * separate because the tcpcb structure is exported to userland for sysctl * parsing purposes, which do not know about callouts. */ struct tcpcb_mem { struct tcpcb tcb; struct tcp_timer tt; struct cc_var ccv; struct osd osd; }; static VNET_DEFINE(uma_zone_t, tcpcb_zone); #define V_tcpcb_zone VNET(tcpcb_zone) MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers"); static struct mtx isn_mtx; #define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF) #define ISN_LOCK() mtx_lock(&isn_mtx) #define ISN_UNLOCK() mtx_unlock(&isn_mtx) /* * TCP initialization. */ static void tcp_zone_change(void *tag) { uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets); uma_zone_set_max(V_tcpcb_zone, maxsockets); tcp_tw_zone_change(); } static int tcp_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp = mem; INP_LOCK_INIT(inp, "inp", "tcpinp"); return (0); } /* * Take a value and get the next power of 2 that doesn't overflow. * Used to size the tcp_inpcb hash buckets. */ static int maketcp_hashsize(int size) { int hashsize; /* * auto tune. * get the next power of 2 higher than maxsockets. */ hashsize = 1 << fls(size); /* catch overflow, and just go one power of 2 smaller */ if (hashsize < size) { hashsize = 1 << (fls(size) - 1); } return (hashsize); } void tcp_init(void) { const char *tcbhash_tuneable; int hashsize; tcbhash_tuneable = "net.inet.tcp.tcbhashsize"; if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, &V_tcp_hhh[HHOOK_TCP_EST_IN], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register helper hook\n", __func__); if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, &V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register helper hook\n", __func__); hashsize = TCBHASHSIZE; TUNABLE_INT_FETCH(tcbhash_tuneable, &hashsize); if (hashsize == 0) { /* * Auto tune the hash size based on maxsockets. * A perfect hash would have a 1:1 mapping * (hashsize = maxsockets) however it's been * suggested that O(2) average is better. */ hashsize = maketcp_hashsize(maxsockets / 4); /* * Our historical default is 512, * do not autotune lower than this. */ if (hashsize < 512) hashsize = 512; if (bootverbose) printf("%s: %s auto tuned to %d\n", __func__, tcbhash_tuneable, hashsize); } /* * We require a hashsize to be a power of two. * Previously if it was not a power of two we would just reset it * back to 512, which could be a nasty surprise if you did not notice * the error message. * Instead what we do is clip it to the closest power of two lower * than the specified hash value. */ if (!powerof2(hashsize)) { int oldhashsize = hashsize; hashsize = maketcp_hashsize(hashsize); /* prevent absurdly low value */ if (hashsize < 16) hashsize = 16; printf("%s: WARNING: TCB hash size not a power of 2, " "clipped from %d to %d.\n", __func__, oldhashsize, hashsize); } in_pcbinfo_init(&V_tcbinfo, "tcp", &V_tcb, hashsize, hashsize, "tcp_inpcb", tcp_inpcb_init, NULL, UMA_ZONE_NOFREE, IPI_HASHFIELDS_4TUPLE); /* * These have to be type stable for the benefit of the timers. */ V_tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(V_tcpcb_zone, maxsockets); uma_zone_set_warning(V_tcpcb_zone, "kern.ipc.maxsockets limit reached"); tcp_tw_init(); syncache_init(); tcp_hc_init(); TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack); V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); /* Skip initialization of globals for non-default instances. */ if (!IS_DEFAULT_VNET(curvnet)) return; /* XXX virtualize those bellow? */ tcp_delacktime = TCPTV_DELACK; tcp_keepinit = TCPTV_KEEP_INIT; tcp_keepidle = TCPTV_KEEP_IDLE; tcp_keepintvl = TCPTV_KEEPINTVL; tcp_maxpersistidle = TCPTV_KEEP_IDLE; tcp_msl = TCPTV_MSL; tcp_rexmit_min = TCPTV_MIN; if (tcp_rexmit_min < 1) tcp_rexmit_min = 1; tcp_rexmit_slop = TCPTV_CPU_VAR; tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT; tcp_tcbhashsize = hashsize; if (tcp_soreceive_stream) { #ifdef INET tcp_usrreqs.pru_soreceive = soreceive_stream; #endif #ifdef INET6 tcp6_usrreqs.pru_soreceive = soreceive_stream; #endif /* INET6 */ } #ifdef INET6 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) #else /* INET6 */ #define TCP_MINPROTOHDR (sizeof(struct tcpiphdr)) #endif /* INET6 */ if (max_protohdr < TCP_MINPROTOHDR) max_protohdr = TCP_MINPROTOHDR; if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) panic("tcp_init"); #undef TCP_MINPROTOHDR ISN_LOCK_INIT(); EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL, SHUTDOWN_PRI_DEFAULT); EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL, EVENTHANDLER_PRI_ANY); } #ifdef VIMAGE void tcp_destroy(void) { int error; tcp_hc_destroy(); syncache_destroy(); tcp_tw_destroy(); in_pcbinfo_destroy(&V_tcbinfo); uma_zdestroy(V_sack_hole_zone); uma_zdestroy(V_tcpcb_zone); error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_IN]); if (error != 0) { printf("%s: WARNING: unable to deregister helper hook " "type=%d, id=%d: error %d returned\n", __func__, HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, error); } error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_OUT]); if (error != 0) { printf("%s: WARNING: unable to deregister helper hook " "type=%d, id=%d: error %d returned\n", __func__, HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, error); } } #endif void tcp_fini(void *xtp) { } /* * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb. * tcp_template used to store this data in mbufs, but we now recopy it out * of the tcpcb each time to conserve mbufs. */ void tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr) { struct tcphdr *th = (struct tcphdr *)tcp_ptr; INP_WLOCK_ASSERT(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { struct ip6_hdr *ip6; ip6 = (struct ip6_hdr *)ip_ptr; ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | (inp->inp_flow & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = htons(sizeof(struct tcphdr)); ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = inp->in6p_faddr; } #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET { struct ip *ip; ip = (struct ip *)ip_ptr; ip->ip_v = IPVERSION; ip->ip_hl = 5; ip->ip_tos = inp->inp_ip_tos; ip->ip_len = 0; ip->ip_id = 0; ip->ip_off = 0; ip->ip_ttl = inp->inp_ip_ttl; ip->ip_sum = 0; ip->ip_p = IPPROTO_TCP; ip->ip_src = inp->inp_laddr; ip->ip_dst = inp->inp_faddr; } #endif /* INET */ th->th_sport = inp->inp_lport; th->th_dport = inp->inp_fport; th->th_seq = 0; th->th_ack = 0; th->th_x2 = 0; th->th_off = 5; th->th_flags = 0; th->th_win = 0; th->th_urp = 0; th->th_sum = 0; /* in_pseudo() is called later for ipv4 */ } /* * Create template to be used to send tcp packets on a connection. * Allocates an mbuf and fills in a skeletal tcp/ip header. The only * use for this function is in keepalives, which use tcp_respond. */ struct tcptemp * tcpip_maketemplate(struct inpcb *inp) { struct tcptemp *t; t = malloc(sizeof(*t), M_TEMP, M_NOWAIT); if (t == NULL) return (NULL); tcpip_fillheaders(inp, (void *)&t->tt_ipgen, (void *)&t->tt_t); return (t); } /* * Send a single message to the TCP at address specified by * the given TCP/IP header. If m == NULL, then we make a copy * of the tcpiphdr at th and send directly to the addressed host. * This is used to force keep alive messages out using the TCP * template for a connection. If flags are given then we send * a message back to the TCP which originated the segment th, * and discard the mbuf containing it and any other attached mbufs. * * In any case the ack and sequence number of the transmitted * segment are as specified by the parameters. * * NOTE: If m != NULL, then th must point to *inside* the mbuf. */ void tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, tcp_seq ack, tcp_seq seq, int flags) { int tlen; int win = 0; struct ip *ip; struct tcphdr *nth; #ifdef INET6 struct ip6_hdr *ip6; int isipv6; #endif /* INET6 */ int ipflags = 0; struct inpcb *inp; KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL")); #ifdef INET6 isipv6 = ((struct ip *)ipgen)->ip_v == (IPV6_VERSION >> 4); ip6 = ipgen; #endif /* INET6 */ ip = ipgen; if (tp != NULL) { inp = tp->t_inpcb; KASSERT(inp != NULL, ("tcp control block w/o inpcb")); INP_WLOCK_ASSERT(inp); } else inp = NULL; if (tp != NULL) { if (!(flags & TH_RST)) { win = sbspace(&inp->inp_socket->so_rcv); if (win > (long)TCP_MAXWIN << tp->rcv_scale) win = (long)TCP_MAXWIN << tp->rcv_scale; } } if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return; tlen = 0; m->m_data += max_linkhdr; #ifdef INET6 if (isipv6) { bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(struct ip6_hdr)); ip6 = mtod(m, struct ip6_hdr *); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); ip = mtod(m, struct ip *); nth = (struct tcphdr *)(ip + 1); } bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); flags = TH_ACK; } else { /* * reuse the mbuf. * XXX MRT We inherrit the FIB, which is lucky. */ m_freem(m->m_next); m->m_next = NULL; m->m_data = (caddr_t)ipgen; /* m_len is set later */ tlen = 0; #define xchg(a,b,type) { type t; t=a; a=b; b=t; } #ifdef INET6 if (isipv6) { xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t); nth = (struct tcphdr *)(ip + 1); } if (th != nth) { /* * this is usually a case when an extension header * exists between the IPv6 header and the * TCP header. */ nth->th_sport = th->th_sport; nth->th_dport = th->th_dport; } xchg(nth->th_dport, nth->th_sport, uint16_t); #undef xchg } #ifdef INET6 if (isipv6) { ip6->ip6_flow = 0; ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_nxt = IPPROTO_TCP; tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr); ip6->ip6_plen = htons(tlen - sizeof(*ip6)); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { tlen += sizeof (struct tcpiphdr); ip->ip_len = htons(tlen); ip->ip_ttl = V_ip_defttl; if (V_path_mtu_discovery) ip->ip_off |= htons(IP_DF); } #endif m->m_len = tlen; m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = NULL; #ifdef MAC if (inp != NULL) { /* * Packet is associated with a socket, so allow the * label of the response to reflect the socket label. */ INP_WLOCK_ASSERT(inp); mac_inpcb_create_mbuf(inp, m); } else { /* * Packet is not associated with a socket, so possibly * update the label in place. */ mac_netinet_tcp_reply(m); } #endif nth->th_seq = htonl(seq); nth->th_ack = htonl(ack); nth->th_x2 = 0; nth->th_off = sizeof (struct tcphdr) >> 2; nth->th_flags = flags; if (tp != NULL) nth->th_win = htons((u_short) (win >> tp->rcv_scale)); else nth->th_win = htons((u_short)win); nth->th_urp = 0; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); #ifdef INET6 if (isipv6) { m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; nth->th_sum = in6_cksum_pseudo(ip6, tlen - sizeof(struct ip6_hdr), IPPROTO_TCP, 0); ip6->ip6_hlim = in6_selecthlim(tp != NULL ? tp->t_inpcb : NULL, NULL); } #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET { m->m_pkthdr.csum_flags = CSUM_TCP; nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); } #endif /* INET */ #ifdef TCPDEBUG if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); #endif if (flags & TH_RST) TCP_PROBE5(accept__refused, NULL, NULL, mtod(m, const char *), tp, nth); TCP_PROBE5(send, NULL, tp, mtod(m, const char *), tp, nth); #ifdef INET6 if (isipv6) (void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp); #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET (void) ip_output(m, NULL, NULL, ipflags, NULL, inp); #endif } /* * Create a new TCP control block, making an * empty reassembly queue and hooking it to the argument * protocol control block. The `inp' parameter must have * come from the zone allocator set up in tcp_init(). */ struct tcpcb * tcp_newtcpcb(struct inpcb *inp) { struct tcpcb_mem *tm; struct tcpcb *tp; #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ tm = uma_zalloc(V_tcpcb_zone, M_NOWAIT | M_ZERO); if (tm == NULL) return (NULL); tp = &tm->tcb; /* Initialise cc_var struct for this tcpcb. */ tp->ccv = &tm->ccv; tp->ccv->type = IPPROTO_TCP; tp->ccv->ccvc.tcp = tp; /* * Use the current system default CC algorithm. */ CC_LIST_RLOCK(); KASSERT(!STAILQ_EMPTY(&cc_list), ("cc_list is empty!")); CC_ALGO(tp) = CC_DEFAULT(); CC_LIST_RUNLOCK(); if (CC_ALGO(tp)->cb_init != NULL) if (CC_ALGO(tp)->cb_init(tp->ccv) > 0) { uma_zfree(V_tcpcb_zone, tm); return (NULL); } tp->osd = &tm->osd; if (khelp_init_osd(HELPER_CLASS_TCP, tp->osd)) { uma_zfree(V_tcpcb_zone, tm); return (NULL); } #ifdef VIMAGE tp->t_vnet = inp->inp_vnet; #endif tp->t_timers = &tm->tt; /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */ tp->t_maxseg = tp->t_maxopd = #ifdef INET6 isipv6 ? V_tcp_v6mssdflt : #endif /* INET6 */ V_tcp_mssdflt; /* Set up our timeouts. */ callout_init(&tp->t_timers->tt_rexmt, CALLOUT_MPSAFE); callout_init(&tp->t_timers->tt_persist, CALLOUT_MPSAFE); callout_init(&tp->t_timers->tt_keep, CALLOUT_MPSAFE); callout_init(&tp->t_timers->tt_2msl, CALLOUT_MPSAFE); callout_init(&tp->t_timers->tt_delack, CALLOUT_MPSAFE); if (V_tcp_do_rfc1323) tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); if (V_tcp_do_sack) tp->t_flags |= TF_SACK_PERMIT; TAILQ_INIT(&tp->snd_holes); tp->t_inpcb = inp; /* XXX */ /* * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives * reasonable initial retransmit time. */ tp->t_srtt = TCPTV_SRTTBASE; tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; tp->t_rttmin = tcp_rexmit_min; tp->t_rxtcur = TCPTV_RTOBASE; tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->t_rcvtime = ticks; /* * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, * which may match an IPv4-mapped IPv6 address. */ inp->inp_ip_ttl = V_ip_defttl; inp->inp_ppcb = tp; return (tp); /* XXX */ } /* * Switch the congestion control algorithm back to NewReno for any active * control blocks using an algorithm which is about to go away. * This ensures the CC framework can allow the unload to proceed without leaving * any dangling pointers which would trigger a panic. * Returning non-zero would inform the CC framework that something went wrong * and it would be unsafe to allow the unload to proceed. However, there is no * way for this to occur with this implementation so we always return zero. */ int tcp_ccalgounload(struct cc_algo *unload_algo) { struct cc_algo *tmpalgo; struct inpcb *inp; struct tcpcb *tp; VNET_ITERATOR_DECL(vnet_iter); /* * Check all active control blocks across all network stacks and change * any that are using "unload_algo" back to NewReno. If "unload_algo" * requires cleanup code to be run, call it. */ VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); INP_INFO_RLOCK(&V_tcbinfo); /* * New connections already part way through being initialised * with the CC algo we're removing will not race with this code * because the INP_INFO_WLOCK is held during initialisation. We * therefore don't enter the loop below until the connection * list has stabilised. */ LIST_FOREACH(inp, &V_tcb, inp_list) { INP_WLOCK(inp); /* Important to skip tcptw structs. */ if (!(inp->inp_flags & INP_TIMEWAIT) && (tp = intotcpcb(inp)) != NULL) { /* * By holding INP_WLOCK here, we are assured * that the connection is not currently * executing inside the CC module's functions * i.e. it is safe to make the switch back to * NewReno. */ if (CC_ALGO(tp) == unload_algo) { tmpalgo = CC_ALGO(tp); /* NewReno does not require any init. */ CC_ALGO(tp) = &newreno_cc_algo; if (tmpalgo->cb_destroy != NULL) tmpalgo->cb_destroy(tp->ccv); } } INP_WUNLOCK(inp); } INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); return (0); } /* * Drop a TCP connection, reporting * the specified error. If connection is synchronized, * then send a RST to peer. */ struct tcpcb * tcp_drop(struct tcpcb *tp, int errno) { struct socket *so = tp->t_inpcb->inp_socket; INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); if (TCPS_HAVERCVDSYN(tp->t_state)) { tcp_state_change(tp, TCPS_CLOSED); (void) tcp_output(tp); TCPSTAT_INC(tcps_drops); } else TCPSTAT_INC(tcps_conndrops); if (errno == ETIMEDOUT && tp->t_softerror) errno = tp->t_softerror; so->so_error = errno; return (tcp_close(tp)); } void tcp_discardcb(struct tcpcb *tp) { struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ INP_WLOCK_ASSERT(inp); /* * Make sure that all of our timers are stopped before we delete the * PCB. * * XXXRW: Really, we would like to use callout_drain() here in order * to avoid races experienced in tcp_timer.c where a timer is already * executing at this point. However, we can't, both because we're * running in a context where we can't sleep, and also because we * hold locks required by the timers. What we instead need to do is * test to see if callout_drain() is required, and if so, defer some * portion of the remainder of tcp_discardcb() to an asynchronous * context that can callout_drain() and then continue. Some care * will be required to ensure that no further processing takes place * on the tcpcb, even though it hasn't been freed (a flag?). */ callout_stop(&tp->t_timers->tt_rexmt); callout_stop(&tp->t_timers->tt_persist); callout_stop(&tp->t_timers->tt_keep); callout_stop(&tp->t_timers->tt_2msl); callout_stop(&tp->t_timers->tt_delack); /* * If we got enough samples through the srtt filter, * save the rtt and rttvar in the routing entry. * 'Enough' is arbitrarily defined as 4 rtt samples. * 4 samples is enough for the srtt filter to converge * to within enough % of the correct value; fewer samples * and we could save a bogus rtt. The danger is not high * as tcp quickly recovers from everything. * XXX: Works very well but needs some more statistics! */ if (tp->t_rttupdated >= 4) { struct hc_metrics_lite metrics; u_long ssthresh; bzero(&metrics, sizeof(metrics)); /* * Update the ssthresh always when the conditions below * are satisfied. This gives us better new start value * for the congestion avoidance for new connections. * ssthresh is only set if packet loss occured on a session. * * XXXRW: 'so' may be NULL here, and/or socket buffer may be * being torn down. Ideally this code would not use 'so'. */ ssthresh = tp->snd_ssthresh; if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) { /* * convert the limit from user data bytes to * packets then to packet data bytes. */ ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg; if (ssthresh < 2) ssthresh = 2; ssthresh *= (u_long)(tp->t_maxseg + #ifdef INET6 (isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : #endif sizeof (struct tcpiphdr) #ifdef INET6 ) #endif ); } else ssthresh = 0; metrics.rmx_ssthresh = ssthresh; metrics.rmx_rtt = tp->t_srtt; metrics.rmx_rttvar = tp->t_rttvar; metrics.rmx_cwnd = tp->snd_cwnd; metrics.rmx_sendpipe = 0; metrics.rmx_recvpipe = 0; tcp_hc_update(&inp->inp_inc, &metrics); } /* free the reassembly queue, if any */ tcp_reass_flush(tp); #ifdef TCP_OFFLOAD /* Disconnect offload device, if any. */ if (tp->t_flags & TF_TOE) tcp_offload_detach(tp); #endif tcp_free_sackholes(tp); /* Allow the CC algorithm to clean up after itself. */ if (CC_ALGO(tp)->cb_destroy != NULL) CC_ALGO(tp)->cb_destroy(tp->ccv); khelp_destroy_osd(tp->osd); CC_ALGO(tp) = NULL; inp->inp_ppcb = NULL; tp->t_inpcb = NULL; uma_zfree(V_tcpcb_zone, tp); } /* * Attempt to close a TCP control block, marking it as dropped, and freeing * the socket if we hold the only reference. */ struct tcpcb * tcp_close(struct tcpcb *tp) { struct inpcb *inp = tp->t_inpcb; struct socket *so; INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); #ifdef TCP_OFFLOAD if (tp->t_state == TCPS_LISTEN) tcp_offload_listen_stop(tp); #endif in_pcbdrop(inp); TCPSTAT_INC(tcps_closed); KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); so = inp->inp_socket; soisdisconnected(so); if (inp->inp_flags & INP_SOCKREF) { KASSERT(so->so_state & SS_PROTOREF, ("tcp_close: !SS_PROTOREF")); inp->inp_flags &= ~INP_SOCKREF; INP_WUNLOCK(inp); ACCEPT_LOCK(); SOCK_LOCK(so); so->so_state &= ~SS_PROTOREF; sofree(so); return (NULL); } return (tp); } void tcp_drain(void) { VNET_ITERATOR_DECL(vnet_iter); if (!do_tcpdrain) return; VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); struct inpcb *inpb; struct tcpcb *tcpb; /* * Walk the tcpbs, if existing, and flush the reassembly queue, * if there is one... * XXX: The "Net/3" implementation doesn't imply that the TCP * reassembly queue should be flushed, but in a situation * where we're really low on mbufs, this is potentially * useful. */ INP_INFO_RLOCK(&V_tcbinfo); LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) { if (inpb->inp_flags & INP_TIMEWAIT) continue; INP_WLOCK(inpb); if ((tcpb = intotcpcb(inpb)) != NULL) { tcp_reass_flush(tcpb); tcp_clean_sackreport(tcpb); } INP_WUNLOCK(inpb); } INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Notify a tcp user of an asynchronous error; * store error as soft error, but wake up user * (for now, won't do anything until can select for soft error). * * Do not wake up user since there currently is no mechanism for * reporting soft errors (yet - a kqueue filter may be added). */ static struct inpcb * tcp_notify(struct inpcb *inp, int error) { struct tcpcb *tp; INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || (inp->inp_flags & INP_DROPPED)) return (inp); tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_notify: tp == NULL")); /* * Ignore some errors if we are hooked up. * If connection hasn't completed, has retransmitted several times, * and receives a second error, give up now. This is better * than waiting a long time to establish a connection that * can never complete. */ if (tp->t_state == TCPS_ESTABLISHED && (error == EHOSTUNREACH || error == ENETUNREACH || error == EHOSTDOWN)) { return (inp); } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && tp->t_softerror) { tp = tcp_drop(tp, error); if (tp != NULL) return (inp); else return (NULL); } else { tp->t_softerror = error; return (inp); } #if 0 wakeup( &so->so_timeo); sorwakeup(so); sowwakeup(so); #endif } static int tcp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, m, n, pcb_count; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == NULL) { n = V_tcbinfo.ipi_count + syncache_pcbcount(); n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb); return (0); } if (req->newptr != NULL) return (EPERM); /* * OK, now we're committed to doing something. */ INP_INFO_RLOCK(&V_tcbinfo); gencnt = V_tcbinfo.ipi_gencnt; n = V_tcbinfo.ipi_count; INP_INFO_RUNLOCK(&V_tcbinfo); m = syncache_pcbcount(); error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) + (n + m) * sizeof(struct xtcpcb)); if (error != 0) return (error); xig.xig_len = sizeof xig; xig.xig_count = n + m; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); error = syncache_pcblist(req, m, &pcb_count); if (error) return (error); inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == NULL) return (ENOMEM); INP_INFO_RLOCK(&V_tcbinfo); for (inp = LIST_FIRST(V_tcbinfo.ipi_listhead), i = 0; inp != NULL && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_WLOCK(inp); if (inp->inp_gencnt <= gencnt) { /* * XXX: This use of cr_cansee(), introduced with * TCP state changes, is not quite right, but for * now, better than nothing. */ if (inp->inp_flags & INP_TIMEWAIT) { if (intotw(inp) != NULL) error = cr_cansee(req->td->td_ucred, intotw(inp)->tw_cred); else error = EINVAL; /* Skip this inp. */ } else error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) { in_pcbref(inp); inp_list[i++] = inp; } } INP_WUNLOCK(inp); } INP_INFO_RUNLOCK(&V_tcbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xtcpcb xt; void *inp_ppcb; bzero(&xt, sizeof(xt)); xt.xt_len = sizeof xt; /* XXX should avoid extra copy */ bcopy(inp, &xt.xt_inp, sizeof *inp); inp_ppcb = inp->inp_ppcb; if (inp_ppcb == NULL) bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); else if (inp->inp_flags & INP_TIMEWAIT) { bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); xt.xt_tp.t_state = TCPS_TIME_WAIT; } else { bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); if (xt.xt_tp.t_timers) tcp_timer_to_xtimer(&xt.xt_tp, xt.xt_tp.t_timers, &xt.xt_timer); } if (inp->inp_socket != NULL) sotoxsocket(inp->inp_socket, &xt.xt_socket); else { bzero(&xt.xt_socket, sizeof xt.xt_socket); xt.xt_socket.xso_protocol = IPPROTO_TCP; } xt.xt_inp.inp_gencnt = inp->inp_gencnt; INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xt, sizeof xt); } else INP_RUNLOCK(inp); } INP_INFO_WLOCK(&V_tcbinfo); for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (!in_pcbrele_rlocked(inp)) INP_RUNLOCK(inp); } INP_INFO_WUNLOCK(&V_tcbinfo); if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ INP_INFO_RLOCK(&V_tcbinfo); xig.xig_gen = V_tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_tcbinfo.ipi_count + pcb_count; INP_INFO_RUNLOCK(&V_tcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); #ifdef INET static int tcp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct inpcb *inp; int error; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); inp = in_pcblookup(&V_tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp_getcred, "S,xucred", "Get the xucred of a TCP connection"); #endif /* INET */ #ifdef INET6 static int tcp6_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in6 addrs[2]; struct inpcb *inp; int error; #ifdef INET int mapped = 0; #endif error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 || (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) { return (error); } if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { #ifdef INET if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) mapped = 1; else #endif return (EINVAL); } #ifdef INET if (mapped == 1) inp = in_pcblookup(&V_tcbinfo, *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], addrs[1].sin6_port, *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL); else #endif inp = in6_pcblookup(&V_tcbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection"); #endif /* INET6 */ #ifdef INET void tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { struct ip *ip = vip; struct tcphdr *th; struct in_addr faddr; struct inpcb *inp; struct tcpcb *tp; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct icmp *icp; struct in_conninfo inc; tcp_seq icmp_tcp_seq; int mtu; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc_notify; else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip) notify = tcp_drop_syn_sent; /* * Redirects don't need to be handled up here. */ else if (PRC_IS_REDIRECT(cmd)) return; /* * Source quench is depreciated. */ else if (cmd == PRC_QUENCH) return; /* * Hostdead is ugly because it goes linearly through all PCBs. * XXX: We never get this from ICMP, otherwise it makes an * excellent DoS attack on machines with many connections. */ else if (cmd == PRC_HOSTDEAD) ip = NULL; else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip != NULL) { icp = (struct icmp *)((caddr_t)ip - offsetof(struct icmp, icmp_ip)); th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); INP_INFO_WLOCK(&V_tcbinfo); inp = in_pcblookup(&V_tcbinfo, faddr, th->th_dport, ip->ip_src, th->th_sport, INPLOOKUP_WLOCKPCB, NULL); if (inp != NULL) { if (!(inp->inp_flags & INP_TIMEWAIT) && !(inp->inp_flags & INP_DROPPED) && !(inp->inp_socket == NULL)) { icmp_tcp_seq = htonl(th->th_seq); tp = intotcpcb(inp); if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) && SEQ_LT(icmp_tcp_seq, tp->snd_max)) { if (cmd == PRC_MSGSIZE) { /* * MTU discovery: * If we got a needfrag set the MTU * in the route to the suggested new * value (if given) and then notify. */ bzero(&inc, sizeof(inc)); inc.inc_faddr = faddr; inc.inc_fibnum = inp->inp_inc.inc_fibnum; mtu = ntohs(icp->icmp_nextmtu); /* * If no alternative MTU was * proposed, try the next smaller * one. */ if (!mtu) mtu = ip_next_mtu( ntohs(ip->ip_len), 1); if (mtu < V_tcp_minmss + sizeof(struct tcpiphdr)) mtu = V_tcp_minmss + sizeof(struct tcpiphdr); /* * Only cache the MTU if it * is smaller than the interface * or route MTU. tcp_mtudisc() * will do right thing by itself. */ if (mtu <= tcp_maxmtu(&inc, NULL)) tcp_hc_updatemtu(&inc, mtu); tcp_mtudisc(inp, mtu); } else inp = (*notify)(inp, inetctlerrmap[cmd]); } } if (inp != NULL) INP_WUNLOCK(inp); } else { bzero(&inc, sizeof(inc)); inc.inc_fport = th->th_dport; inc.inc_lport = th->th_sport; inc.inc_faddr = faddr; inc.inc_laddr = ip->ip_src; syncache_unreach(&inc, th); } INP_INFO_WUNLOCK(&V_tcbinfo); } else in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify); } #endif /* INET */ #ifdef INET6 void tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d) { struct tcphdr th; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct ip6_hdr *ip6; struct mbuf *m; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; int off; struct tcp_portonly { u_int16_t th_sport; u_int16_t th_dport; } *thp; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return; if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc_notify; else if (!PRC_IS_REDIRECT(cmd) && ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0)) return; /* Source quench is depreciated. */ else if (cmd == PRC_QUENCH) return; /* if the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; sa6_src = ip6cp->ip6c_src; } else { m = NULL; ip6 = NULL; off = 0; /* fool gcc */ sa6_src = &sa6_any; } if (ip6 != NULL) { struct in_conninfo inc; /* * XXX: We assume that when IPV6 is non NULL, * M and OFF are valid. */ /* check if we can safely examine src and dst ports */ if (m->m_pkthdr.len < off + sizeof(*thp)) return; bzero(&th, sizeof(th)); m_copydata(m, off, sizeof(*thp), (caddr_t)&th); in6_pcbnotify(&V_tcbinfo, sa, th.th_dport, (struct sockaddr *)ip6cp->ip6c_src, th.th_sport, cmd, NULL, notify); bzero(&inc, sizeof(inc)); inc.inc_fport = th.th_dport; inc.inc_lport = th.th_sport; inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr; inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr; inc.inc_flags |= INC_ISIPV6; INP_INFO_WLOCK(&V_tcbinfo); syncache_unreach(&inc, &th); INP_INFO_WUNLOCK(&V_tcbinfo); } else in6_pcbnotify(&V_tcbinfo, sa, 0, (const struct sockaddr *)sa6_src, 0, cmd, NULL, notify); } #endif /* INET6 */ /* * Following is where TCP initial sequence number generation occurs. * * There are two places where we must use initial sequence numbers: * 1. In SYN-ACK packets. * 2. In SYN packets. * * All ISNs for SYN-ACK packets are generated by the syncache. See * tcp_syncache.c for details. * * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling * depends on this property. In addition, these ISNs should be * unguessable so as to prevent connection hijacking. To satisfy * the requirements of this situation, the algorithm outlined in * RFC 1948 is used, with only small modifications. * * Implementation details: * * Time is based off the system timer, and is corrected so that it * increases by one megabyte per second. This allows for proper * recycling on high speed LANs while still leaving over an hour * before rollover. * * As reading the *exact* system time is too expensive to be done * whenever setting up a TCP connection, we increment the time * offset in two ways. First, a small random positive increment * is added to isn_offset for each connection that is set up. * Second, the function tcp_isn_tick fires once per clock tick * and increments isn_offset as necessary so that sequence numbers * are incremented at approximately ISN_BYTES_PER_SECOND. The * random positive increments serve only to ensure that the same * exact sequence number is never sent out twice (as could otherwise * happen when a port is recycled in less than the system tick * interval.) * * net.inet.tcp.isn_reseed_interval controls the number of seconds * between seeding of isn_secret. This is normally set to zero, * as reseeding should not be necessary. * * Locking of the global variables isn_secret, isn_last_reseed, isn_offset, * isn_offset_old, and isn_ctx is performed using the TCP pcbinfo lock. In * general, this means holding an exclusive (write) lock. */ #define ISN_BYTES_PER_SECOND 1048576 #define ISN_STATIC_INCREMENT 4096 #define ISN_RANDOM_INCREMENT (4096 - 1) static VNET_DEFINE(u_char, isn_secret[32]); static VNET_DEFINE(int, isn_last); static VNET_DEFINE(int, isn_last_reseed); static VNET_DEFINE(u_int32_t, isn_offset); static VNET_DEFINE(u_int32_t, isn_offset_old); #define V_isn_secret VNET(isn_secret) #define V_isn_last VNET(isn_last) #define V_isn_last_reseed VNET(isn_last_reseed) #define V_isn_offset VNET(isn_offset) #define V_isn_offset_old VNET(isn_offset_old) tcp_seq tcp_new_isn(struct tcpcb *tp) { MD5_CTX isn_ctx; u_int32_t md5_buffer[4]; tcp_seq new_isn; u_int32_t projected_offset; INP_WLOCK_ASSERT(tp->t_inpcb); ISN_LOCK(); /* Seed if this is the first use, reseed if requested. */ if ((V_isn_last_reseed == 0) || ((V_tcp_isn_reseed_interval > 0) && (((u_int)V_isn_last_reseed + (u_int)V_tcp_isn_reseed_interval*hz) < (u_int)ticks))) { read_random(&V_isn_secret, sizeof(V_isn_secret)); V_isn_last_reseed = ticks; } /* Compute the md5 hash and return the ISN. */ MD5Init(&isn_ctx); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short)); #ifdef INET6 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) { MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr, sizeof(struct in6_addr)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr, sizeof(struct in6_addr)); } else #endif { MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr, sizeof(struct in_addr)); MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr, sizeof(struct in_addr)); } MD5Update(&isn_ctx, (u_char *) &V_isn_secret, sizeof(V_isn_secret)); MD5Final((u_char *) &md5_buffer, &isn_ctx); new_isn = (tcp_seq) md5_buffer[0]; V_isn_offset += ISN_STATIC_INCREMENT + (arc4random() & ISN_RANDOM_INCREMENT); if (ticks != V_isn_last) { projected_offset = V_isn_offset_old + ISN_BYTES_PER_SECOND / hz * (ticks - V_isn_last); if (SEQ_GT(projected_offset, V_isn_offset)) V_isn_offset = projected_offset; V_isn_offset_old = V_isn_offset; V_isn_last = ticks; } new_isn += V_isn_offset; ISN_UNLOCK(); return (new_isn); } /* * When a specific ICMP unreachable message is received and the * connection state is SYN-SENT, drop the connection. This behavior * is controlled by the icmp_may_rst sysctl. */ struct inpcb * tcp_drop_syn_sent(struct inpcb *inp, int errno) { struct tcpcb *tp; INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || (inp->inp_flags & INP_DROPPED)) return (inp); tp = intotcpcb(inp); if (tp->t_state != TCPS_SYN_SENT) return (inp); tp = tcp_drop(tp, errno); if (tp != NULL) return (inp); else return (NULL); } /* * When `need fragmentation' ICMP is received, update our idea of the MSS * based on the new value. Also nudge TCP to send something, since we * know the packet we just sent was dropped. * This duplicates some code in the tcp_mss() function in tcp_input.c. */ static struct inpcb * tcp_mtudisc_notify(struct inpcb *inp, int error) { return (tcp_mtudisc(inp, -1)); } struct inpcb * tcp_mtudisc(struct inpcb *inp, int mtuoffer) { struct tcpcb *tp; struct socket *so; INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || (inp->inp_flags & INP_DROPPED)) return (inp); tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL")); tcp_mss_update(tp, -1, mtuoffer, NULL, NULL); so = inp->inp_socket; SOCKBUF_LOCK(&so->so_snd); /* If the mss is larger than the socket buffer, decrease the mss. */ if (so->so_snd.sb_hiwat < tp->t_maxseg) tp->t_maxseg = so->so_snd.sb_hiwat; SOCKBUF_UNLOCK(&so->so_snd); TCPSTAT_INC(tcps_mturesent); tp->t_rtttime = 0; tp->snd_nxt = tp->snd_una; tcp_free_sackholes(tp); tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_SACK_PERMIT) EXIT_FASTRECOVERY(tp->t_flags); tcp_output(tp); return (inp); } #ifdef INET /* * Look-up the routing entry to the peer of this inpcb. If no route * is found and it cannot be allocated, then return 0. This routine * is called by TCP routines that access the rmx structure and by * tcp_mss_update to get the peer/interface MTU. */ u_long tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap) { struct route sro; struct sockaddr_in *dst; struct ifnet *ifp; u_long maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer")); bzero(&sro, sizeof(sro)); if (inc->inc_faddr.s_addr != INADDR_ANY) { dst = (struct sockaddr_in *)&sro.ro_dst; dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = inc->inc_faddr; in_rtalloc_ign(&sro, 0, inc->inc_fibnum); } if (sro.ro_rt != NULL) { ifp = sro.ro_rt->rt_ifp; if (sro.ro_rt->rt_mtu == 0) maxmtu = ifp->if_mtu; else maxmtu = min(sro.ro_rt->rt_mtu, ifp->if_mtu); /* Report additional interface capabilities. */ if (cap != NULL) { if (ifp->if_capenable & IFCAP_TSO4 && ifp->if_hwassist & CSUM_TSO) { cap->ifcap |= CSUM_TSO; cap->tsomax = ifp->if_hw_tsomax; cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } RTFREE(sro.ro_rt); } return (maxmtu); } #endif /* INET */ #ifdef INET6 u_long tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap) { struct route_in6 sro6; struct ifnet *ifp; u_long maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer")); bzero(&sro6, sizeof(sro6)); if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { sro6.ro_dst.sin6_family = AF_INET6; sro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6); sro6.ro_dst.sin6_addr = inc->inc6_faddr; in6_rtalloc_ign(&sro6, 0, inc->inc_fibnum); } if (sro6.ro_rt != NULL) { ifp = sro6.ro_rt->rt_ifp; if (sro6.ro_rt->rt_mtu == 0) maxmtu = IN6_LINKMTU(sro6.ro_rt->rt_ifp); else maxmtu = min(sro6.ro_rt->rt_mtu, IN6_LINKMTU(sro6.ro_rt->rt_ifp)); /* Report additional interface capabilities. */ if (cap != NULL) { if (ifp->if_capenable & IFCAP_TSO6 && ifp->if_hwassist & CSUM_TSO) { cap->ifcap |= CSUM_TSO; cap->tsomax = ifp->if_hw_tsomax; cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } RTFREE(sro6.ro_rt); } return (maxmtu); } #endif /* INET6 */ #ifdef IPSEC /* compute ESP/AH header size for TCP, including outer IP header. */ size_t ipsec_hdrsiz_tcp(struct tcpcb *tp) { struct inpcb *inp; struct mbuf *m; size_t hdrsiz; struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; #endif struct tcphdr *th; if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL)) return (0); m = m_gethdr(M_NOWAIT, MT_DATA); if (!m) return (0); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); m->m_pkthdr.len = m->m_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); tcpip_fillheaders(inp, ip6, th); hdrsiz = ipsec_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); } else #endif /* INET6 */ { ip = mtod(m, struct ip *); th = (struct tcphdr *)(ip + 1); m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr); tcpip_fillheaders(inp, ip, th); hdrsiz = ipsec_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); } m_free(m); return (hdrsiz); } #endif /* IPSEC */ #ifdef TCP_SIGNATURE /* * Callback function invoked by m_apply() to digest TCP segment data * contained within an mbuf chain. */ static int tcp_signature_apply(void *fstate, void *data, u_int len) { MD5Update(fstate, (u_char *)data, len); return (0); } /* * XXX The key is retrieved from the system's PF_KEY SADB, by keying a * search with the destination IP address, and a 'magic SPI' to be * determined by the application. This is hardcoded elsewhere to 1179 */ struct secasvar * tcp_get_sav(struct mbuf *m, u_int direction) { union sockaddr_union dst; struct secasvar *sav; struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; char ip6buf[INET6_ADDRSTRLEN]; #endif /* Extract the destination from the IP header in the mbuf. */ bzero(&dst, sizeof(union sockaddr_union)); ip = mtod(m, struct ip *); #ifdef INET6 ip6 = NULL; /* Make the compiler happy. */ #endif switch (ip->ip_v) { #ifdef INET case IPVERSION: dst.sa.sa_len = sizeof(struct sockaddr_in); dst.sa.sa_family = AF_INET; dst.sin.sin_addr = (direction == IPSEC_DIR_INBOUND) ? ip->ip_src : ip->ip_dst; break; #endif #ifdef INET6 case (IPV6_VERSION >> 4): ip6 = mtod(m, struct ip6_hdr *); dst.sa.sa_len = sizeof(struct sockaddr_in6); dst.sa.sa_family = AF_INET6; dst.sin6.sin6_addr = (direction == IPSEC_DIR_INBOUND) ? ip6->ip6_src : ip6->ip6_dst; break; #endif default: return (NULL); /* NOTREACHED */ break; } /* Look up an SADB entry which matches the address of the peer. */ sav = KEY_ALLOCSA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI)); if (sav == NULL) { ipseclog((LOG_ERR, "%s: SADB lookup failed for %s\n", __func__, (ip->ip_v == IPVERSION) ? inet_ntoa(dst.sin.sin_addr) : #ifdef INET6 (ip->ip_v == (IPV6_VERSION >> 4)) ? ip6_sprintf(ip6buf, &dst.sin6.sin6_addr) : #endif "(unsupported)")); } return (sav); } /* * Compute TCP-MD5 hash of a TCP segment. (RFC2385) * * Parameters: * m pointer to head of mbuf chain * len length of TCP segment data, excluding options * optlen length of TCP segment options * buf pointer to storage for computed MD5 digest * sav pointer to security assosiation * * We do this over ip, tcphdr, segment data, and the key in the SADB. * When called from tcp_input(), we can be sure that th_sum has been * zeroed out and verified already. * * Releases reference to SADB key before return. * * Return 0 if successful, otherwise return -1. * */ int tcp_signature_do_compute(struct mbuf *m, int len, int optlen, u_char *buf, struct secasvar *sav) { #ifdef INET struct ippseudo ippseudo; #endif MD5_CTX ctx; int doff; struct ip *ip; #ifdef INET struct ipovly *ipovly; #endif struct tcphdr *th; #ifdef INET6 struct ip6_hdr *ip6; struct in6_addr in6; uint32_t plen; uint16_t nhdr; #endif u_short savecsum; KASSERT(m != NULL, ("NULL mbuf chain")); KASSERT(buf != NULL, ("NULL signature pointer")); /* Extract the destination from the IP header in the mbuf. */ ip = mtod(m, struct ip *); #ifdef INET6 ip6 = NULL; /* Make the compiler happy. */ #endif MD5Init(&ctx); /* * Step 1: Update MD5 hash with IP(v6) pseudo-header. * * XXX The ippseudo header MUST be digested in network byte order, * or else we'll fail the regression test. Assume all fields we've * been doing arithmetic on have been in host byte order. * XXX One cannot depend on ipovly->ih_len here. When called from * tcp_output(), the underlying ip_len member has not yet been set. */ switch (ip->ip_v) { #ifdef INET case IPVERSION: ipovly = (struct ipovly *)ip; ippseudo.ippseudo_src = ipovly->ih_src; ippseudo.ippseudo_dst = ipovly->ih_dst; ippseudo.ippseudo_pad = 0; ippseudo.ippseudo_p = IPPROTO_TCP; ippseudo.ippseudo_len = htons(len + sizeof(struct tcphdr) + optlen); MD5Update(&ctx, (char *)&ippseudo, sizeof(struct ippseudo)); th = (struct tcphdr *)((u_char *)ip + sizeof(struct ip)); doff = sizeof(struct ip) + sizeof(struct tcphdr) + optlen; break; #endif #ifdef INET6 /* * RFC 2385, 2.0 Proposal * For IPv6, the pseudo-header is as described in RFC 2460, namely the * 128-bit source IPv6 address, 128-bit destination IPv6 address, zero- * extended next header value (to form 32 bits), and 32-bit segment * length. * Note: Upper-Layer Packet Length comes before Next Header. */ case (IPV6_VERSION >> 4): in6 = ip6->ip6_src; in6_clearscope(&in6); MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr)); in6 = ip6->ip6_dst; in6_clearscope(&in6); MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr)); plen = htonl(len + sizeof(struct tcphdr) + optlen); MD5Update(&ctx, (char *)&plen, sizeof(uint32_t)); nhdr = 0; MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); nhdr = IPPROTO_TCP; MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); th = (struct tcphdr *)((u_char *)ip6 + sizeof(struct ip6_hdr)); doff = sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + optlen; break; #endif default: return (-1); /* NOTREACHED */ break; } /* * Step 2: Update MD5 hash with TCP header, excluding options. * The TCP checksum must be set to zero. */ savecsum = th->th_sum; th->th_sum = 0; MD5Update(&ctx, (char *)th, sizeof(struct tcphdr)); th->th_sum = savecsum; /* * Step 3: Update MD5 hash with TCP segment data. * Use m_apply() to avoid an early m_pullup(). */ if (len > 0) m_apply(m, doff, len, tcp_signature_apply, &ctx); /* * Step 4: Update MD5 hash with shared secret. */ MD5Update(&ctx, sav->key_auth->key_data, _KEYLEN(sav->key_auth)); MD5Final(buf, &ctx); key_sa_recordxfer(sav, m); KEY_FREESAV(&sav); return (0); } /* * Compute TCP-MD5 hash of a TCP segment. (RFC2385) * * Return 0 if successful, otherwise return -1. */ int tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen, u_char *buf, u_int direction) { struct secasvar *sav; if ((sav = tcp_get_sav(m, direction)) == NULL) return (-1); return (tcp_signature_do_compute(m, len, optlen, buf, sav)); } /* * Verify the TCP-MD5 hash of a TCP segment. (RFC2385) * * Parameters: * m pointer to head of mbuf chain * len length of TCP segment data, excluding options * optlen length of TCP segment options * buf pointer to storage for computed MD5 digest * direction direction of flow (IPSEC_DIR_INBOUND or OUTBOUND) * * Return 1 if successful, otherwise return 0. */ int tcp_signature_verify(struct mbuf *m, int off0, int tlen, int optlen, struct tcpopt *to, struct tcphdr *th, u_int tcpbflag) { char tmpdigest[TCP_SIGLEN]; if (tcp_sig_checksigs == 0) return (1); if ((tcpbflag & TF_SIGNATURE) == 0) { if ((to->to_flags & TOF_SIGNATURE) != 0) { /* * If this socket is not expecting signature but * the segment contains signature just fail. */ TCPSTAT_INC(tcps_sig_err_sigopt); TCPSTAT_INC(tcps_sig_rcvbadsig); return (0); } /* Signature is not expected, and not present in segment. */ return (1); } /* * If this socket is expecting signature but the segment does not * contain any just fail. */ if ((to->to_flags & TOF_SIGNATURE) == 0) { TCPSTAT_INC(tcps_sig_err_nosigopt); TCPSTAT_INC(tcps_sig_rcvbadsig); return (0); } if (tcp_signature_compute(m, off0, tlen, optlen, &tmpdigest[0], IPSEC_DIR_INBOUND) == -1) { TCPSTAT_INC(tcps_sig_err_buildsig); TCPSTAT_INC(tcps_sig_rcvbadsig); return (0); } if (bcmp(to->to_signature, &tmpdigest[0], TCP_SIGLEN) != 0) { TCPSTAT_INC(tcps_sig_rcvbadsig); return (0); } TCPSTAT_INC(tcps_sig_rcvgoodsig); return (1); } #endif /* TCP_SIGNATURE */ static int sysctl_drop(SYSCTL_HANDLER_ARGS) { /* addrs[0] is a foreign socket, addrs[1] is a local one. */ struct sockaddr_storage addrs[2]; struct inpcb *inp; struct tcpcb *tp; struct tcptw *tw; struct sockaddr_in *fin, *lin; #ifdef INET6 struct sockaddr_in6 *fin6, *lin6; #endif int error; inp = NULL; fin = lin = NULL; #ifdef INET6 fin6 = lin6 = NULL; #endif error = 0; if (req->oldptr != NULL || req->oldlen != 0) return (EINVAL); if (req->newptr == NULL) return (EPERM); if (req->newlen < sizeof(addrs)) return (ENOMEM); error = SYSCTL_IN(req, &addrs, sizeof(addrs)); if (error) return (error); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: fin6 = (struct sockaddr_in6 *)&addrs[0]; lin6 = (struct sockaddr_in6 *)&addrs[1]; if (fin6->sin6_len != sizeof(struct sockaddr_in6) || lin6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) { if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr)) return (EINVAL); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]); fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; break; } error = sa6_embedscope(fin6, V_ip6_use_defzone); if (error) return (error); error = sa6_embedscope(lin6, V_ip6_use_defzone); if (error) return (error); break; #endif #ifdef INET case AF_INET: fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; if (fin->sin_len != sizeof(struct sockaddr_in) || lin->sin_len != sizeof(struct sockaddr_in)) return (EINVAL); break; #endif default: return (EINVAL); } INP_INFO_WLOCK(&V_tcbinfo); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr, fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif #ifdef INET case AF_INET: inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port, lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif } if (inp != NULL) { if (inp->inp_flags & INP_TIMEWAIT) { /* * XXXRW: There currently exists a state where an * inpcb is present, but its timewait state has been * discarded. For now, don't allow dropping of this * type of inpcb. */ tw = intotw(inp); if (tw != NULL) tcp_twclose(tw, 0); else INP_WUNLOCK(inp); } else if (!(inp->inp_flags & INP_DROPPED) && !(inp->inp_socket->so_options & SO_ACCEPTCONN)) { tp = intotcpcb(inp); tp = tcp_drop(tp, ECONNABORTED); if (tp != NULL) INP_WUNLOCK(inp); } else INP_WUNLOCK(inp); } else error = ESRCH; INP_INFO_WUNLOCK(&V_tcbinfo); return (error); } -SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_DROP, drop, - CTLTYPE_STRUCT|CTLFLAG_WR|CTLFLAG_SKIP, NULL, +SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop, + CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP, NULL, 0, sysctl_drop, "", "Drop TCP connection"); /* * Generate a standardized TCP log line for use throughout the * tcp subsystem. Memory allocation is done with M_NOWAIT to * allow use in the interrupt context. * * NB: The caller MUST free(s, M_TCPLOG) the returned string. * NB: The function may return NULL if memory allocation failed. * * Due to header inclusion and ordering limitations the struct ip * and ip6_hdr pointers have to be passed as void pointers. */ char * tcp_log_vain(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr) { /* Is logging enabled? */ if (tcp_log_in_vain == 0) return (NULL); return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); } char * tcp_log_addrs(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr) { /* Is logging enabled? */ if (tcp_log_debug == 0) return (NULL); return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); } static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr) { char *s, *sp; size_t size; struct ip *ip; #ifdef INET6 const struct ip6_hdr *ip6; ip6 = (const struct ip6_hdr *)ip6hdr; #endif /* INET6 */ ip = (struct ip *)ip4hdr; /* * The log line looks like this: * "TCP: [1.2.3.4]:50332 to [1.2.3.4]:80 tcpflags 0x2" */ size = sizeof("TCP: []:12345 to []:12345 tcpflags 0x2<>") + sizeof(PRINT_TH_FLAGS) + 1 + #ifdef INET6 2 * INET6_ADDRSTRLEN; #else 2 * INET_ADDRSTRLEN; #endif /* INET6 */ s = malloc(size, M_TCPLOG, M_ZERO|M_NOWAIT); if (s == NULL) return (NULL); strcat(s, "TCP: ["); sp = s + strlen(s); if (inc && ((inc->inc_flags & INC_ISIPV6) == 0)) { inet_ntoa_r(inc->inc_faddr, sp); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); sp = s + strlen(s); inet_ntoa_r(inc->inc_laddr, sp); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(inc->inc_lport)); #ifdef INET6 } else if (inc) { ip6_sprintf(sp, &inc->inc6_faddr); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); sp = s + strlen(s); ip6_sprintf(sp, &inc->inc6_laddr); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(inc->inc_lport)); } else if (ip6 && th) { ip6_sprintf(sp, &ip6->ip6_src); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(th->th_sport)); sp = s + strlen(s); ip6_sprintf(sp, &ip6->ip6_dst); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); #endif /* INET6 */ #ifdef INET } else if (ip && th) { inet_ntoa_r(ip->ip_src, sp); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(th->th_sport)); sp = s + strlen(s); inet_ntoa_r(ip->ip_dst, sp); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); #endif /* INET */ } else { free(s, M_TCPLOG); return (NULL); } sp = s + strlen(s); if (th) sprintf(sp, " tcpflags 0x%b", th->th_flags, PRINT_TH_FLAGS); if (*(s + size - 1) != '\0') panic("%s: string too long", __func__); return (s); } /* * A subroutine which makes it easy to track TCP state changes with DTrace. * This function shouldn't be called for t_state initializations that don't * correspond to actual TCP state transitions. */ void tcp_state_change(struct tcpcb *tp, int newstate) { #if defined(KDTRACE_HOOKS) int pstate = tp->t_state; #endif tp->t_state = newstate; TCP_PROBE6(state__change, NULL, tp, NULL, tp, NULL, pstate); } Index: user/dchagin/lemul/sys/netinet/tcp_syncache.c =================================================================== --- user/dchagin/lemul/sys/netinet/tcp_syncache.c (revision 274240) +++ user/dchagin/lemul/sys/netinet/tcp_syncache.c (revision 274241) @@ -1,2046 +1,2046 @@ /*- * Copyright (c) 2001 McAfee, Inc. * Copyright (c) 2006,2013 Andre Oppermann, Internet Business Solutions AG * All rights reserved. * * This software was developed for the FreeBSD Project by Jonathan Lemon * and McAfee Research, the Security Research Division of McAfee, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. [2001 McAfee, Inc.] * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_pcbgroup.h" #include #include #include #include #include #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #include #endif #include #include #include #include #include #include #ifdef INET6 #include #endif #ifdef TCP_OFFLOAD #include #endif #ifdef IPSEC #include #ifdef INET6 #include #endif #include #endif /*IPSEC*/ #include #include static VNET_DEFINE(int, tcp_syncookies) = 1; #define V_tcp_syncookies VNET(tcp_syncookies) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_syncookies), 0, "Use TCP SYN cookies if the syncache overflows"); static VNET_DEFINE(int, tcp_syncookiesonly) = 0; #define V_tcp_syncookiesonly VNET(tcp_syncookiesonly) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, syncookies_only, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies_only, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_syncookiesonly), 0, "Use only TCP SYN cookies"); #ifdef TCP_OFFLOAD #define ADDED_BY_TOE(sc) ((sc)->sc_tod != NULL) #endif static void syncache_drop(struct syncache *, struct syncache_head *); static void syncache_free(struct syncache *); static void syncache_insert(struct syncache *, struct syncache_head *); static int syncache_respond(struct syncache *, struct syncache_head *, int); static struct socket *syncache_socket(struct syncache *, struct socket *, struct mbuf *m); static void syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout); static void syncache_timer(void *); static uint32_t syncookie_mac(struct in_conninfo *, tcp_seq, uint8_t, uint8_t *, uintptr_t); static tcp_seq syncookie_generate(struct syncache_head *, struct syncache *); static struct syncache *syncookie_lookup(struct in_conninfo *, struct syncache_head *, struct syncache *, struct tcphdr *, struct tcpopt *, struct socket *); static void syncookie_reseed(void *); #ifdef INVARIANTS static int syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch, struct syncache *sc, struct tcphdr *th, struct tcpopt *to, struct socket *lso); #endif /* * Transmit the SYN,ACK fewer times than TCP_MAXRXTSHIFT specifies. * 3 retransmits corresponds to a timeout of 3 * (1 + 2 + 4 + 8) == 45 seconds, * the odds are that the user has given up attempting to connect by then. */ #define SYNCACHE_MAXREXMTS 3 /* Arbitrary values */ #define TCP_SYNCACHE_HASHSIZE 512 #define TCP_SYNCACHE_BUCKETLIMIT 30 static VNET_DEFINE(struct tcp_syncache, tcp_syncache); #define V_tcp_syncache VNET(tcp_syncache) static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0, "TCP SYN cache"); -SYSCTL_VNET_UINT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_RDTUN, +SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_syncache.bucket_limit), 0, "Per-bucket hash limit for syncache"); -SYSCTL_VNET_UINT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_RDTUN, +SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_syncache.cache_limit), 0, "Overall entry limit for syncache"); SYSCTL_UMA_CUR(_net_inet_tcp_syncache, OID_AUTO, count, CTLFLAG_VNET, &VNET_NAME(tcp_syncache.zone), "Current number of entries in syncache"); -SYSCTL_VNET_UINT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_RDTUN, +SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_syncache.hashsize), 0, "Size of TCP syncache hashtable"); -SYSCTL_VNET_UINT(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_RW, +SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_syncache.rexmt_limit), 0, "Limit on SYN/ACK retransmissions"); VNET_DEFINE(int, tcp_sc_rst_sock_fail) = 1; -SYSCTL_VNET_INT(_net_inet_tcp_syncache, OID_AUTO, rst_on_sock_fail, - CTLFLAG_RW, &VNET_NAME(tcp_sc_rst_sock_fail), 0, +SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, rst_on_sock_fail, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sc_rst_sock_fail), 0, "Send reset on socket allocation failure"); static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache"); #define SYNCACHE_HASH(inc, mask) \ ((V_tcp_syncache.hash_secret ^ \ (inc)->inc_faddr.s_addr ^ \ ((inc)->inc_faddr.s_addr >> 16) ^ \ (inc)->inc_fport ^ (inc)->inc_lport) & mask) #define SYNCACHE_HASH6(inc, mask) \ ((V_tcp_syncache.hash_secret ^ \ (inc)->inc6_faddr.s6_addr32[0] ^ \ (inc)->inc6_faddr.s6_addr32[3] ^ \ (inc)->inc_fport ^ (inc)->inc_lport) & mask) #define ENDPTS_EQ(a, b) ( \ (a)->ie_fport == (b)->ie_fport && \ (a)->ie_lport == (b)->ie_lport && \ (a)->ie_faddr.s_addr == (b)->ie_faddr.s_addr && \ (a)->ie_laddr.s_addr == (b)->ie_laddr.s_addr \ ) #define ENDPTS6_EQ(a, b) (memcmp(a, b, sizeof(*a)) == 0) #define SCH_LOCK(sch) mtx_lock(&(sch)->sch_mtx) #define SCH_UNLOCK(sch) mtx_unlock(&(sch)->sch_mtx) #define SCH_LOCK_ASSERT(sch) mtx_assert(&(sch)->sch_mtx, MA_OWNED) /* * Requires the syncache entry to be already removed from the bucket list. */ static void syncache_free(struct syncache *sc) { if (sc->sc_ipopts) (void) m_free(sc->sc_ipopts); if (sc->sc_cred) crfree(sc->sc_cred); #ifdef MAC mac_syncache_destroy(&sc->sc_label); #endif uma_zfree(V_tcp_syncache.zone, sc); } void syncache_init(void) { int i; V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE; V_tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT; V_tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS; V_tcp_syncache.hash_secret = arc4random(); TUNABLE_INT_FETCH("net.inet.tcp.syncache.hashsize", &V_tcp_syncache.hashsize); TUNABLE_INT_FETCH("net.inet.tcp.syncache.bucketlimit", &V_tcp_syncache.bucket_limit); if (!powerof2(V_tcp_syncache.hashsize) || V_tcp_syncache.hashsize == 0) { printf("WARNING: syncache hash size is not a power of 2.\n"); V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE; } V_tcp_syncache.hashmask = V_tcp_syncache.hashsize - 1; /* Set limits. */ V_tcp_syncache.cache_limit = V_tcp_syncache.hashsize * V_tcp_syncache.bucket_limit; TUNABLE_INT_FETCH("net.inet.tcp.syncache.cachelimit", &V_tcp_syncache.cache_limit); /* Allocate the hash table. */ V_tcp_syncache.hashbase = malloc(V_tcp_syncache.hashsize * sizeof(struct syncache_head), M_SYNCACHE, M_WAITOK | M_ZERO); #ifdef VIMAGE V_tcp_syncache.vnet = curvnet; #endif /* Initialize the hash buckets. */ for (i = 0; i < V_tcp_syncache.hashsize; i++) { TAILQ_INIT(&V_tcp_syncache.hashbase[i].sch_bucket); mtx_init(&V_tcp_syncache.hashbase[i].sch_mtx, "tcp_sc_head", NULL, MTX_DEF); callout_init_mtx(&V_tcp_syncache.hashbase[i].sch_timer, &V_tcp_syncache.hashbase[i].sch_mtx, 0); V_tcp_syncache.hashbase[i].sch_length = 0; V_tcp_syncache.hashbase[i].sch_sc = &V_tcp_syncache; } /* Create the syncache entry zone. */ V_tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); V_tcp_syncache.cache_limit = uma_zone_set_max(V_tcp_syncache.zone, V_tcp_syncache.cache_limit); /* Start the SYN cookie reseeder callout. */ callout_init(&V_tcp_syncache.secret.reseed, 1); arc4rand(V_tcp_syncache.secret.key[0], SYNCOOKIE_SECRET_SIZE, 0); arc4rand(V_tcp_syncache.secret.key[1], SYNCOOKIE_SECRET_SIZE, 0); callout_reset(&V_tcp_syncache.secret.reseed, SYNCOOKIE_LIFETIME * hz, syncookie_reseed, &V_tcp_syncache); } #ifdef VIMAGE void syncache_destroy(void) { struct syncache_head *sch; struct syncache *sc, *nsc; int i; /* Cleanup hash buckets: stop timers, free entries, destroy locks. */ for (i = 0; i < V_tcp_syncache.hashsize; i++) { sch = &V_tcp_syncache.hashbase[i]; callout_drain(&sch->sch_timer); SCH_LOCK(sch); TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc) syncache_drop(sc, sch); SCH_UNLOCK(sch); KASSERT(TAILQ_EMPTY(&sch->sch_bucket), ("%s: sch->sch_bucket not empty", __func__)); KASSERT(sch->sch_length == 0, ("%s: sch->sch_length %d not 0", __func__, sch->sch_length)); mtx_destroy(&sch->sch_mtx); } KASSERT(uma_zone_get_cur(V_tcp_syncache.zone) == 0, ("%s: cache_count not 0", __func__)); /* Free the allocated global resources. */ uma_zdestroy(V_tcp_syncache.zone); free(V_tcp_syncache.hashbase, M_SYNCACHE); callout_drain(&V_tcp_syncache.secret.reseed); } #endif /* * Inserts a syncache entry into the specified bucket row. * Locks and unlocks the syncache_head autonomously. */ static void syncache_insert(struct syncache *sc, struct syncache_head *sch) { struct syncache *sc2; SCH_LOCK(sch); /* * Make sure that we don't overflow the per-bucket limit. * If the bucket is full, toss the oldest element. */ if (sch->sch_length >= V_tcp_syncache.bucket_limit) { KASSERT(!TAILQ_EMPTY(&sch->sch_bucket), ("sch->sch_length incorrect")); sc2 = TAILQ_LAST(&sch->sch_bucket, sch_head); syncache_drop(sc2, sch); TCPSTAT_INC(tcps_sc_bucketoverflow); } /* Put it into the bucket. */ TAILQ_INSERT_HEAD(&sch->sch_bucket, sc, sc_hash); sch->sch_length++; #ifdef TCP_OFFLOAD if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; tod->tod_syncache_added(tod, sc->sc_todctx); } #endif /* Reinitialize the bucket row's timer. */ if (sch->sch_length == 1) sch->sch_nextc = ticks + INT_MAX; syncache_timeout(sc, sch, 1); SCH_UNLOCK(sch); TCPSTAT_INC(tcps_sc_added); } /* * Remove and free entry from syncache bucket row. * Expects locked syncache head. */ static void syncache_drop(struct syncache *sc, struct syncache_head *sch) { SCH_LOCK_ASSERT(sch); TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); sch->sch_length--; #ifdef TCP_OFFLOAD if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; tod->tod_syncache_removed(tod, sc->sc_todctx); } #endif syncache_free(sc); } /* * Engage/reengage time on bucket row. */ static void syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout) { sc->sc_rxttime = ticks + TCPTV_RTOBASE * (tcp_syn_backoff[sc->sc_rxmits]); sc->sc_rxmits++; if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)) { sch->sch_nextc = sc->sc_rxttime; if (docallout) callout_reset(&sch->sch_timer, sch->sch_nextc - ticks, syncache_timer, (void *)sch); } } /* * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. * If we have retransmitted an entry the maximum number of times, expire it. * One separate timer for each bucket row. */ static void syncache_timer(void *xsch) { struct syncache_head *sch = (struct syncache_head *)xsch; struct syncache *sc, *nsc; int tick = ticks; char *s; CURVNET_SET(sch->sch_sc->vnet); /* NB: syncache_head has already been locked by the callout. */ SCH_LOCK_ASSERT(sch); /* * In the following cycle we may remove some entries and/or * advance some timeouts, so re-initialize the bucket timer. */ sch->sch_nextc = tick + INT_MAX; TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc) { /* * We do not check if the listen socket still exists * and accept the case where the listen socket may be * gone by the time we resend the SYN/ACK. We do * not expect this to happens often. If it does, * then the RST will be sent by the time the remote * host does the SYN/ACK->ACK. */ if (TSTMP_GT(sc->sc_rxttime, tick)) { if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)) sch->sch_nextc = sc->sc_rxttime; continue; } if (sc->sc_rxmits > V_tcp_syncache.rexmt_limit) { if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Retransmits exhausted, " "giving up and removing syncache entry\n", s, __func__); free(s, M_TCPLOG); } syncache_drop(sc, sch); TCPSTAT_INC(tcps_sc_stale); continue; } if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Response timeout, " "retransmitting (%u) SYN|ACK\n", s, __func__, sc->sc_rxmits); free(s, M_TCPLOG); } syncache_respond(sc, sch, 1); TCPSTAT_INC(tcps_sc_retransmitted); syncache_timeout(sc, sch, 0); } if (!TAILQ_EMPTY(&(sch)->sch_bucket)) callout_reset(&(sch)->sch_timer, (sch)->sch_nextc - tick, syncache_timer, (void *)(sch)); CURVNET_RESTORE(); } /* * Find an entry in the syncache. * Returns always with locked syncache_head plus a matching entry or NULL. */ static struct syncache * syncache_lookup(struct in_conninfo *inc, struct syncache_head **schp) { struct syncache *sc; struct syncache_head *sch; #ifdef INET6 if (inc->inc_flags & INC_ISIPV6) { sch = &V_tcp_syncache.hashbase[ SYNCACHE_HASH6(inc, V_tcp_syncache.hashmask)]; *schp = sch; SCH_LOCK(sch); /* Circle through bucket row to find matching entry. */ TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { if (ENDPTS6_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie)) return (sc); } } else #endif { sch = &V_tcp_syncache.hashbase[ SYNCACHE_HASH(inc, V_tcp_syncache.hashmask)]; *schp = sch; SCH_LOCK(sch); /* Circle through bucket row to find matching entry. */ TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) continue; #endif if (ENDPTS_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie)) return (sc); } } SCH_LOCK_ASSERT(*schp); return (NULL); /* always returns with locked sch */ } /* * This function is called when we get a RST for a * non-existent connection, so that we can see if the * connection is in the syn cache. If it is, zap it. */ void syncache_chkrst(struct in_conninfo *inc, struct tcphdr *th) { struct syncache *sc; struct syncache_head *sch; char *s = NULL; sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); /* * Any RST to our SYN|ACK must not carry ACK, SYN or FIN flags. * See RFC 793 page 65, section SEGMENT ARRIVES. */ if (th->th_flags & (TH_ACK|TH_SYN|TH_FIN)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious RST with ACK, SYN or " "FIN flag set, segment ignored\n", s, __func__); TCPSTAT_INC(tcps_badrst); goto done; } /* * No corresponding connection was found in syncache. * If syncookies are enabled and possibly exclusively * used, or we are under memory pressure, a valid RST * may not find a syncache entry. In that case we're * done and no SYN|ACK retransmissions will happen. * Otherwise the RST was misdirected or spoofed. */ if (sc == NULL) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious RST without matching " "syncache entry (possibly syncookie only), " "segment ignored\n", s, __func__); TCPSTAT_INC(tcps_badrst); goto done; } /* * If the RST bit is set, check the sequence number to see * if this is a valid reset segment. * RFC 793 page 37: * In all states except SYN-SENT, all reset (RST) segments * are validated by checking their SEQ-fields. A reset is * valid if its sequence number is in the window. * * The sequence number in the reset segment is normally an * echo of our outgoing acknowlegement numbers, but some hosts * send a reset with the sequence number at the rightmost edge * of our receive window, and we have to handle this case. */ if (SEQ_GEQ(th->th_seq, sc->sc_irs) && SEQ_LEQ(th->th_seq, sc->sc_irs + sc->sc_wnd)) { syncache_drop(sc, sch); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Our SYN|ACK was rejected, " "connection attempt aborted by remote endpoint\n", s, __func__); TCPSTAT_INC(tcps_sc_reset); } else { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: RST with invalid SEQ %u != " "IRS %u (+WND %u), segment ignored\n", s, __func__, th->th_seq, sc->sc_irs, sc->sc_wnd); TCPSTAT_INC(tcps_badrst); } done: if (s != NULL) free(s, M_TCPLOG); SCH_UNLOCK(sch); } void syncache_badack(struct in_conninfo *inc) { struct syncache *sc; struct syncache_head *sch; sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); if (sc != NULL) { syncache_drop(sc, sch); TCPSTAT_INC(tcps_sc_badack); } SCH_UNLOCK(sch); } void syncache_unreach(struct in_conninfo *inc, struct tcphdr *th) { struct syncache *sc; struct syncache_head *sch; sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); if (sc == NULL) goto done; /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ if (ntohl(th->th_seq) != sc->sc_iss) goto done; /* * If we've rertransmitted 3 times and this is our second error, * we remove the entry. Otherwise, we allow it to continue on. * This prevents us from incorrectly nuking an entry during a * spurious network outage. * * See tcp_notify(). */ if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxmits < 3 + 1) { sc->sc_flags |= SCF_UNREACH; goto done; } syncache_drop(sc, sch); TCPSTAT_INC(tcps_sc_unreach); done: SCH_UNLOCK(sch); } /* * Build a new TCP socket structure from a syncache entry. */ static struct socket * syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) { struct inpcb *inp = NULL; struct socket *so; struct tcpcb *tp; int error; char *s; INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* * Ok, create the full blown connection, and set things up * as they would have been set up if we had created the * connection when the SYN arrived. If we can't create * the connection, abort it. */ so = sonewconn(lso, 0); if (so == NULL) { /* * Drop the connection; we will either send a RST or * have the peer retransmit its SYN again after its * RTO and try again. */ TCPSTAT_INC(tcps_listendrop); if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Socket create failed " "due to limits or memory shortage\n", s, __func__); free(s, M_TCPLOG); } goto abort2; } #ifdef MAC mac_socketpeer_set_from_mbuf(m, so); #endif inp = sotoinpcb(so); inp->inp_inc.inc_fibnum = so->so_fibnum; INP_WLOCK(inp); INP_HASH_WLOCK(&V_tcbinfo); /* Insert new socket into PCB hash list. */ inp->inp_inc.inc_flags = sc->sc_inc.inc_flags; #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) { inp->in6p_laddr = sc->sc_inc.inc6_laddr; } else { inp->inp_vflag &= ~INP_IPV6; inp->inp_vflag |= INP_IPV4; #endif inp->inp_laddr = sc->sc_inc.inc_laddr; #ifdef INET6 } #endif /* * If there's an mbuf and it has a flowid, then let's initialise the * inp with that particular flowid. */ if (m != NULL && m->m_flags & M_FLOWID) { inp->inp_flags |= INP_HW_FLOWID; inp->inp_flags &= ~INP_SW_FLOWID; inp->inp_flowid = m->m_pkthdr.flowid; inp->inp_flowtype = M_HASHTYPE_GET(m); } /* * Install in the reservation hash table for now, but don't yet * install a connection group since the full 4-tuple isn't yet * configured. */ inp->inp_lport = sc->sc_inc.inc_lport; if ((error = in_pcbinshash_nopcbgroup(inp)) != 0) { /* * Undo the assignments above if we failed to * put the PCB on the hash lists. */ #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) inp->in6p_laddr = in6addr_any; else #endif inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: in_pcbinshash failed " "with error %i\n", s, __func__, error); free(s, M_TCPLOG); } INP_HASH_WUNLOCK(&V_tcbinfo); goto abort; } #ifdef IPSEC /* Copy old policy into new socket's. */ if (ipsec_copy_policy(sotoinpcb(lso)->inp_sp, inp->inp_sp)) printf("syncache_socket: could not copy policy\n"); #endif #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) { struct inpcb *oinp = sotoinpcb(lso); struct in6_addr laddr6; struct sockaddr_in6 sin6; /* * Inherit socket options from the listening socket. * Note that in6p_inputopts are not (and should not be) * copied, since it stores previously received options and is * used to detect if each new option is different than the * previous one and hence should be passed to a user. * If we copied in6p_inputopts, a user would not be able to * receive options just after calling the accept system call. */ inp->inp_flags |= oinp->inp_flags & INP_CONTROLOPTS; if (oinp->in6p_outputopts) inp->in6p_outputopts = ip6_copypktopts(oinp->in6p_outputopts, M_NOWAIT); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(sin6); sin6.sin6_addr = sc->sc_inc.inc6_faddr; sin6.sin6_port = sc->sc_inc.inc_fport; sin6.sin6_flowinfo = sin6.sin6_scope_id = 0; laddr6 = inp->in6p_laddr; if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) inp->in6p_laddr = sc->sc_inc.inc6_laddr; if ((error = in6_pcbconnect_mbuf(inp, (struct sockaddr *)&sin6, thread0.td_ucred, m)) != 0) { inp->in6p_laddr = laddr6; if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: in6_pcbconnect failed " "with error %i\n", s, __func__, error); free(s, M_TCPLOG); } INP_HASH_WUNLOCK(&V_tcbinfo); goto abort; } /* Override flowlabel from in6_pcbconnect. */ inp->inp_flow &= ~IPV6_FLOWLABEL_MASK; inp->inp_flow |= sc->sc_flowlabel; } #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET { struct in_addr laddr; struct sockaddr_in sin; inp->inp_options = (m) ? ip_srcroute(m) : NULL; if (inp->inp_options == NULL) { inp->inp_options = sc->sc_ipopts; sc->sc_ipopts = NULL; } sin.sin_family = AF_INET; sin.sin_len = sizeof(sin); sin.sin_addr = sc->sc_inc.inc_faddr; sin.sin_port = sc->sc_inc.inc_fport; bzero((caddr_t)sin.sin_zero, sizeof(sin.sin_zero)); laddr = inp->inp_laddr; if (inp->inp_laddr.s_addr == INADDR_ANY) inp->inp_laddr = sc->sc_inc.inc_laddr; if ((error = in_pcbconnect_mbuf(inp, (struct sockaddr *)&sin, thread0.td_ucred, m)) != 0) { inp->inp_laddr = laddr; if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: in_pcbconnect failed " "with error %i\n", s, __func__, error); free(s, M_TCPLOG); } INP_HASH_WUNLOCK(&V_tcbinfo); goto abort; } } #endif /* INET */ INP_HASH_WUNLOCK(&V_tcbinfo); tp = intotcpcb(inp); tcp_state_change(tp, TCPS_SYN_RECEIVED); tp->iss = sc->sc_iss; tp->irs = sc->sc_irs; tcp_rcvseqinit(tp); tcp_sendseqinit(tp); tp->snd_wl1 = sc->sc_irs; tp->snd_max = tp->iss + 1; tp->snd_nxt = tp->iss + 1; tp->rcv_up = sc->sc_irs + 1; tp->rcv_wnd = sc->sc_wnd; tp->rcv_adv += tp->rcv_wnd; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags = sototcpcb(lso)->t_flags & (TF_NOPUSH|TF_NODELAY); if (sc->sc_flags & SCF_NOOPT) tp->t_flags |= TF_NOOPT; else { if (sc->sc_flags & SCF_WINSCALE) { tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; tp->snd_scale = sc->sc_requested_s_scale; tp->request_r_scale = sc->sc_requested_r_scale; } if (sc->sc_flags & SCF_TIMESTAMP) { tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; tp->ts_recent = sc->sc_tsreflect; tp->ts_recent_age = tcp_ts_getticks(); tp->ts_offset = sc->sc_tsoff; } #ifdef TCP_SIGNATURE if (sc->sc_flags & SCF_SIGNATURE) tp->t_flags |= TF_SIGNATURE; #endif if (sc->sc_flags & SCF_SACK) tp->t_flags |= TF_SACK_PERMIT; } if (sc->sc_flags & SCF_ECN) tp->t_flags |= TF_ECN_PERMIT; /* * Set up MSS and get cached values from tcp_hostcache. * This might overwrite some of the defaults we just set. */ tcp_mss(tp, sc->sc_peer_mss); /* * If the SYN,ACK was retransmitted, indicate that CWND to be * limited to one segment in cc_conn_init(). * NB: sc_rxmits counts all SYN,ACK transmits, not just retransmits. */ if (sc->sc_rxmits > 1) tp->snd_cwnd = 1; #ifdef TCP_OFFLOAD /* * Allow a TOE driver to install its hooks. Note that we hold the * pcbinfo lock too and that prevents tcp_usr_accept from accepting a * new connection before the TOE driver has done its thing. */ if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; tod->tod_offload_socket(tod, sc->sc_todctx, so); } #endif /* * Copy and activate timers. */ tp->t_keepinit = sototcpcb(lso)->t_keepinit; tp->t_keepidle = sototcpcb(lso)->t_keepidle; tp->t_keepintvl = sototcpcb(lso)->t_keepintvl; tp->t_keepcnt = sototcpcb(lso)->t_keepcnt; tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); INP_WUNLOCK(inp); soisconnected(so); TCPSTAT_INC(tcps_accepts); return (so); abort: INP_WUNLOCK(inp); abort2: if (so != NULL) soabort(so); return (NULL); } /* * This function gets called when we receive an ACK for a * socket in the LISTEN state. We look up the connection * in the syncache, and if its there, we pull it out of * the cache and turn it into a full-blown connection in * the SYN-RECEIVED state. */ int syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct socket **lsop, struct mbuf *m) { struct syncache *sc; struct syncache_head *sch; struct syncache scs; char *s; /* * Global TCP locks are held because we manipulate the PCB lists * and create a new socket. */ INP_INFO_WLOCK_ASSERT(&V_tcbinfo); KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK, ("%s: can handle only ACK", __func__)); sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); #ifdef INVARIANTS /* * Test code for syncookies comparing the syncache stored * values with the reconstructed values from the cookie. */ if (sc != NULL) syncookie_cmp(inc, sch, sc, th, to, *lsop); #endif if (sc == NULL) { /* * There is no syncache entry, so see if this ACK is * a returning syncookie. To do this, first: * A. See if this socket has had a syncache entry dropped in * the past. We don't want to accept a bogus syncookie * if we've never received a SYN. * B. check that the syncookie is valid. If it is, then * cobble up a fake syncache entry, and return. */ if (!V_tcp_syncookies) { SCH_UNLOCK(sch); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious ACK, " "segment rejected (syncookies disabled)\n", s, __func__); goto failed; } bzero(&scs, sizeof(scs)); sc = syncookie_lookup(inc, sch, &scs, th, to, *lsop); SCH_UNLOCK(sch); if (sc == NULL) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Segment failed " "SYNCOOKIE authentication, segment rejected " "(probably spoofed)\n", s, __func__); goto failed; } } else { /* Pull out the entry to unlock the bucket row. */ TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); sch->sch_length--; #ifdef TCP_OFFLOAD if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; tod->tod_syncache_removed(tod, sc->sc_todctx); } #endif SCH_UNLOCK(sch); } /* * Segment validation: * ACK must match our initial sequence number + 1 (the SYN|ACK). */ if (th->th_ack != sc->sc_iss + 1) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: ACK %u != ISS+1 %u, segment " "rejected\n", s, __func__, th->th_ack, sc->sc_iss); goto failed; } /* * The SEQ must fall in the window starting at the received * initial receive sequence number + 1 (the SYN). */ if (SEQ_LEQ(th->th_seq, sc->sc_irs) || SEQ_GT(th->th_seq, sc->sc_irs + sc->sc_wnd)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: SEQ %u != IRS+1 %u, segment " "rejected\n", s, __func__, th->th_seq, sc->sc_irs); goto failed; } /* * If timestamps were not negotiated during SYN/ACK they * must not appear on any segment during this session. */ if (!(sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Timestamp not expected, " "segment rejected\n", s, __func__); goto failed; } /* * If timestamps were negotiated during SYN/ACK they should * appear on every segment during this session. * XXXAO: This is only informal as there have been unverified * reports of non-compliants stacks. */ if ((sc->sc_flags & SCF_TIMESTAMP) && !(to->to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp missing, " "no action\n", s, __func__); free(s, M_TCPLOG); s = NULL; } } /* * If timestamps were negotiated the reflected timestamp * must be equal to what we actually sent in the SYN|ACK. */ if ((to->to_flags & TOF_TS) && to->to_tsecr != sc->sc_ts) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: TSECR %u != TS %u, " "segment rejected\n", s, __func__, to->to_tsecr, sc->sc_ts); goto failed; } *lsop = syncache_socket(sc, *lsop, m); if (*lsop == NULL) TCPSTAT_INC(tcps_sc_aborted); else TCPSTAT_INC(tcps_sc_completed); /* how do we find the inp for the new socket? */ if (sc != &scs) syncache_free(sc); return (1); failed: if (sc != NULL && sc != &scs) syncache_free(sc); if (s != NULL) free(s, M_TCPLOG); *lsop = NULL; return (0); } /* * Given a LISTEN socket and an inbound SYN request, add * this to the syn cache, and send back a segment: * * to the source. * * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. * Doing so would require that we hold onto the data and deliver it * to the application. However, if we are the target of a SYN-flood * DoS attack, an attacker could send data which would eventually * consume all available buffer space if it were ACKed. By not ACKing * the data, we avoid this DoS scenario. */ void syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct inpcb *inp, struct socket **lsop, struct mbuf *m, void *tod, void *todctx) { struct tcpcb *tp; struct socket *so; struct syncache *sc = NULL; struct syncache_head *sch; struct mbuf *ipopts = NULL; u_int ltflags; int win, sb_hiwat, ip_ttl, ip_tos; char *s; #ifdef INET6 int autoflowlabel = 0; #endif #ifdef MAC struct label *maclabel; #endif struct syncache scs; struct ucred *cred; INP_WLOCK_ASSERT(inp); /* listen socket */ KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN, ("%s: unexpected tcp flags", __func__)); /* * Combine all so/tp operations very early to drop the INP lock as * soon as possible. */ so = *lsop; tp = sototcpcb(so); cred = crhold(so->so_cred); #ifdef INET6 if ((inc->inc_flags & INC_ISIPV6) && (inp->inp_flags & IN6P_AUTOFLOWLABEL)) autoflowlabel = 1; #endif ip_ttl = inp->inp_ip_ttl; ip_tos = inp->inp_ip_tos; win = sbspace(&so->so_rcv); sb_hiwat = so->so_rcv.sb_hiwat; ltflags = (tp->t_flags & (TF_NOOPT | TF_SIGNATURE)); /* By the time we drop the lock these should no longer be used. */ so = NULL; tp = NULL; #ifdef MAC if (mac_syncache_init(&maclabel) != 0) { INP_WUNLOCK(inp); goto done; } else mac_syncache_create(maclabel, inp); #endif INP_WUNLOCK(inp); /* * Remember the IP options, if any. */ #ifdef INET6 if (!(inc->inc_flags & INC_ISIPV6)) #endif #ifdef INET ipopts = (m) ? ip_srcroute(m) : NULL; #else ipopts = NULL; #endif /* * See if we already have an entry for this connection. * If we do, resend the SYN,ACK, and reset the retransmit timer. * * XXX: should the syncache be re-initialized with the contents * of the new SYN here (which may have different options?) * * XXX: We do not check the sequence number to see if this is a * real retransmit or a new connection attempt. The question is * how to handle such a case; either ignore it as spoofed, or * drop the current entry and create a new one? */ sc = syncache_lookup(inc, &sch); /* returns locked entry */ SCH_LOCK_ASSERT(sch); if (sc != NULL) { TCPSTAT_INC(tcps_sc_dupsyn); if (ipopts) { /* * If we were remembering a previous source route, * forget it and use the new one we've been given. */ if (sc->sc_ipopts) (void) m_free(sc->sc_ipopts); sc->sc_ipopts = ipopts; } /* * Update timestamp if present. */ if ((sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS)) sc->sc_tsreflect = to->to_tsval; else sc->sc_flags &= ~SCF_TIMESTAMP; #ifdef MAC /* * Since we have already unconditionally allocated label * storage, free it up. The syncache entry will already * have an initialized label we can use. */ mac_syncache_destroy(&maclabel); #endif /* Retransmit SYN|ACK and reset retransmit count. */ if ((s = tcp_log_addrs(&sc->sc_inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Received duplicate SYN, " "resetting timer and retransmitting SYN|ACK\n", s, __func__); free(s, M_TCPLOG); } if (syncache_respond(sc, sch, 1) == 0) { sc->sc_rxmits = 0; syncache_timeout(sc, sch, 1); TCPSTAT_INC(tcps_sndacks); TCPSTAT_INC(tcps_sndtotal); } SCH_UNLOCK(sch); goto done; } sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO); if (sc == NULL) { /* * The zone allocator couldn't provide more entries. * Treat this as if the cache was full; drop the oldest * entry and insert the new one. */ TCPSTAT_INC(tcps_sc_zonefail); if ((sc = TAILQ_LAST(&sch->sch_bucket, sch_head)) != NULL) syncache_drop(sc, sch); sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO); if (sc == NULL) { if (V_tcp_syncookies) { bzero(&scs, sizeof(scs)); sc = &scs; } else { SCH_UNLOCK(sch); if (ipopts) (void) m_free(ipopts); goto done; } } } /* * Fill in the syncache values. */ #ifdef MAC sc->sc_label = maclabel; #endif sc->sc_cred = cred; cred = NULL; sc->sc_ipopts = ipopts; bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo)); #ifdef INET6 if (!(inc->inc_flags & INC_ISIPV6)) #endif { sc->sc_ip_tos = ip_tos; sc->sc_ip_ttl = ip_ttl; } #ifdef TCP_OFFLOAD sc->sc_tod = tod; sc->sc_todctx = todctx; #endif sc->sc_irs = th->th_seq; sc->sc_iss = arc4random(); sc->sc_flags = 0; sc->sc_flowlabel = 0; /* * Initial receive window: clip sbspace to [0 .. TCP_MAXWIN]. * win was derived from socket earlier in the function. */ win = imax(win, 0); win = imin(win, TCP_MAXWIN); sc->sc_wnd = win; if (V_tcp_do_rfc1323) { /* * A timestamp received in a SYN makes * it ok to send timestamp requests and replies. */ if (to->to_flags & TOF_TS) { sc->sc_tsreflect = to->to_tsval; sc->sc_ts = tcp_ts_getticks(); sc->sc_flags |= SCF_TIMESTAMP; } if (to->to_flags & TOF_SCALE) { int wscale = 0; /* * Pick the smallest possible scaling factor that * will still allow us to scale up to sb_max, aka * kern.ipc.maxsockbuf. * * We do this because there are broken firewalls that * will corrupt the window scale option, leading to * the other endpoint believing that our advertised * window is unscaled. At scale factors larger than * 5 the unscaled window will drop below 1500 bytes, * leading to serious problems when traversing these * broken firewalls. * * With the default maxsockbuf of 256K, a scale factor * of 3 will be chosen by this algorithm. Those who * choose a larger maxsockbuf should watch out * for the compatiblity problems mentioned above. * * RFC1323: The Window field in a SYN (i.e., a * or ) segment itself is never scaled. */ while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < sb_max) wscale++; sc->sc_requested_r_scale = wscale; sc->sc_requested_s_scale = to->to_wscale; sc->sc_flags |= SCF_WINSCALE; } } #ifdef TCP_SIGNATURE /* * If listening socket requested TCP digests, OR received SYN * contains the option, flag this in the syncache so that * syncache_respond() will do the right thing with the SYN+ACK. */ if (to->to_flags & TOF_SIGNATURE || ltflags & TF_SIGNATURE) sc->sc_flags |= SCF_SIGNATURE; #endif if (to->to_flags & TOF_SACKPERM) sc->sc_flags |= SCF_SACK; if (to->to_flags & TOF_MSS) sc->sc_peer_mss = to->to_mss; /* peer mss may be zero */ if (ltflags & TF_NOOPT) sc->sc_flags |= SCF_NOOPT; if ((th->th_flags & (TH_ECE|TH_CWR)) && V_tcp_do_ecn) sc->sc_flags |= SCF_ECN; if (V_tcp_syncookies) sc->sc_iss = syncookie_generate(sch, sc); #ifdef INET6 if (autoflowlabel) { if (V_tcp_syncookies) sc->sc_flowlabel = sc->sc_iss; else sc->sc_flowlabel = ip6_randomflowlabel(); sc->sc_flowlabel = htonl(sc->sc_flowlabel) & IPV6_FLOWLABEL_MASK; } #endif SCH_UNLOCK(sch); /* * Do a standard 3-way handshake. */ if (syncache_respond(sc, sch, 0) == 0) { if (V_tcp_syncookies && V_tcp_syncookiesonly && sc != &scs) syncache_free(sc); else if (sc != &scs) syncache_insert(sc, sch); /* locks and unlocks sch */ TCPSTAT_INC(tcps_sndacks); TCPSTAT_INC(tcps_sndtotal); } else { if (sc != &scs) syncache_free(sc); TCPSTAT_INC(tcps_sc_dropped); } done: if (cred != NULL) crfree(cred); #ifdef MAC if (sc == &scs) mac_syncache_destroy(&maclabel); #endif if (m) { *lsop = NULL; m_freem(m); } } static int syncache_respond(struct syncache *sc, struct syncache_head *sch, int locked) { struct ip *ip = NULL; struct mbuf *m; struct tcphdr *th = NULL; int optlen, error = 0; /* Make compiler happy */ u_int16_t hlen, tlen, mssopt; struct tcpopt to; #ifdef INET6 struct ip6_hdr *ip6 = NULL; #endif #ifdef TCP_SIGNATURE struct secasvar *sav; #endif hlen = #ifdef INET6 (sc->sc_inc.inc_flags & INC_ISIPV6) ? sizeof(struct ip6_hdr) : #endif sizeof(struct ip); tlen = hlen + sizeof(struct tcphdr); /* Determine MSS we advertize to other end of connection. */ mssopt = tcp_mssopt(&sc->sc_inc); if (sc->sc_peer_mss) mssopt = max( min(sc->sc_peer_mss, mssopt), V_tcp_minmss); /* XXX: Assume that the entire packet will fit in a header mbuf. */ KASSERT(max_linkhdr + tlen + TCP_MAXOLEN <= MHLEN, ("syncache: mbuf too small")); /* Create the IP+TCP header from scratch. */ m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); #ifdef MAC mac_syncache_create_mbuf(sc->sc_label, m); #endif m->m_data += max_linkhdr; m->m_len = tlen; m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = NULL; #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) { ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_src = sc->sc_inc.inc6_laddr; ip6->ip6_dst = sc->sc_inc.inc6_faddr; ip6->ip6_plen = htons(tlen - hlen); /* ip6_hlim is set after checksum */ ip6->ip6_flow &= ~IPV6_FLOWLABEL_MASK; ip6->ip6_flow |= sc->sc_flowlabel; th = (struct tcphdr *)(ip6 + 1); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET { ip = mtod(m, struct ip *); ip->ip_v = IPVERSION; ip->ip_hl = sizeof(struct ip) >> 2; ip->ip_len = htons(tlen); ip->ip_id = 0; ip->ip_off = 0; ip->ip_sum = 0; ip->ip_p = IPPROTO_TCP; ip->ip_src = sc->sc_inc.inc_laddr; ip->ip_dst = sc->sc_inc.inc_faddr; ip->ip_ttl = sc->sc_ip_ttl; ip->ip_tos = sc->sc_ip_tos; /* * See if we should do MTU discovery. Route lookups are * expensive, so we will only unset the DF bit if: * * 1) path_mtu_discovery is disabled * 2) the SCF_UNREACH flag has been set */ if (V_path_mtu_discovery && ((sc->sc_flags & SCF_UNREACH) == 0)) ip->ip_off |= htons(IP_DF); th = (struct tcphdr *)(ip + 1); } #endif /* INET */ th->th_sport = sc->sc_inc.inc_lport; th->th_dport = sc->sc_inc.inc_fport; th->th_seq = htonl(sc->sc_iss); th->th_ack = htonl(sc->sc_irs + 1); th->th_off = sizeof(struct tcphdr) >> 2; th->th_x2 = 0; th->th_flags = TH_SYN|TH_ACK; th->th_win = htons(sc->sc_wnd); th->th_urp = 0; if (sc->sc_flags & SCF_ECN) { th->th_flags |= TH_ECE; TCPSTAT_INC(tcps_ecn_shs); } /* Tack on the TCP options. */ if ((sc->sc_flags & SCF_NOOPT) == 0) { to.to_flags = 0; to.to_mss = mssopt; to.to_flags = TOF_MSS; if (sc->sc_flags & SCF_WINSCALE) { to.to_wscale = sc->sc_requested_r_scale; to.to_flags |= TOF_SCALE; } if (sc->sc_flags & SCF_TIMESTAMP) { /* Virgin timestamp or TCP cookie enhanced one. */ to.to_tsval = sc->sc_ts; to.to_tsecr = sc->sc_tsreflect; to.to_flags |= TOF_TS; } if (sc->sc_flags & SCF_SACK) to.to_flags |= TOF_SACKPERM; #ifdef TCP_SIGNATURE sav = NULL; if (sc->sc_flags & SCF_SIGNATURE) { sav = tcp_get_sav(m, IPSEC_DIR_OUTBOUND); if (sav != NULL) to.to_flags |= TOF_SIGNATURE; else { /* * We've got SCF_SIGNATURE flag * inherited from listening socket, * but no SADB key for given source * address. Assume signature is not * required and remove signature flag * instead of silently dropping * connection. */ if (locked == 0) SCH_LOCK(sch); sc->sc_flags &= ~SCF_SIGNATURE; if (locked == 0) SCH_UNLOCK(sch); } } #endif optlen = tcp_addoptions(&to, (u_char *)(th + 1)); /* Adjust headers by option size. */ th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; m->m_len += optlen; m->m_pkthdr.len += optlen; #ifdef TCP_SIGNATURE if (sc->sc_flags & SCF_SIGNATURE) tcp_signature_do_compute(m, 0, optlen, to.to_signature, sav); #endif #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) + optlen); else #endif ip->ip_len = htons(ntohs(ip->ip_len) + optlen); } else optlen = 0; M_SETFIB(m, sc->sc_inc.inc_fibnum); m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) { m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; th->th_sum = in6_cksum_pseudo(ip6, tlen + optlen - hlen, IPPROTO_TCP, 0); ip6->ip6_hlim = in6_selecthlim(NULL, NULL); #ifdef TCP_OFFLOAD if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; error = tod->tod_syncache_respond(tod, sc->sc_todctx, m); return (error); } #endif error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET { m->m_pkthdr.csum_flags = CSUM_TCP; th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(tlen + optlen - hlen + IPPROTO_TCP)); #ifdef TCP_OFFLOAD if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; error = tod->tod_syncache_respond(tod, sc->sc_todctx, m); return (error); } #endif error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, NULL); } #endif return (error); } /* * The purpose of syncookies is to handle spoofed SYN flooding DoS attacks * that exceed the capacity of the syncache by avoiding the storage of any * of the SYNs we receive. Syncookies defend against blind SYN flooding * attacks where the attacker does not have access to our responses. * * Syncookies encode and include all necessary information about the * connection setup within the SYN|ACK that we send back. That way we * can avoid keeping any local state until the ACK to our SYN|ACK returns * (if ever). Normally the syncache and syncookies are running in parallel * with the latter taking over when the former is exhausted. When matching * syncache entry is found the syncookie is ignored. * * The only reliable information persisting the 3WHS is our inital sequence * number ISS of 32 bits. Syncookies embed a cryptographically sufficient * strong hash (MAC) value and a few bits of TCP SYN options in the ISS * of our SYN|ACK. The MAC can be recomputed when the ACK to our SYN|ACK * returns and signifies a legitimate connection if it matches the ACK. * * The available space of 32 bits to store the hash and to encode the SYN * option information is very tight and we should have at least 24 bits for * the MAC to keep the number of guesses by blind spoofing reasonably high. * * SYN option information we have to encode to fully restore a connection: * MSS: is imporant to chose an optimal segment size to avoid IP level * fragmentation along the path. The common MSS values can be encoded * in a 3-bit table. Uncommon values are captured by the next lower value * in the table leading to a slight increase in packetization overhead. * WSCALE: is necessary to allow large windows to be used for high delay- * bandwidth product links. Not scaling the window when it was initially * negotiated is bad for performance as lack of scaling further decreases * the apparent available send window. We only need to encode the WSCALE * we received from the remote end. Our end can be recalculated at any * time. The common WSCALE values can be encoded in a 3-bit table. * Uncommon values are captured by the next lower value in the table * making us under-estimate the available window size halving our * theoretically possible maximum throughput for that connection. * SACK: Greatly assists in packet loss recovery and requires 1 bit. * TIMESTAMP and SIGNATURE is not encoded because they are permanent options * that are included in all segments on a connection. We enable them when * the ACK has them. * * Security of syncookies and attack vectors: * * The MAC is computed over (faddr||laddr||fport||lport||irs||flags||secmod) * together with the gloabl secret to make it unique per connection attempt. * Thus any change of any of those parameters results in a different MAC output * in an unpredictable way unless a collision is encountered. 24 bits of the * MAC are embedded into the ISS. * * To prevent replay attacks two rotating global secrets are updated with a * new random value every 15 seconds. The life-time of a syncookie is thus * 15-30 seconds. * * Vector 1: Attacking the secret. This requires finding a weakness in the * MAC itself or the way it is used here. The attacker can do a chosen plain * text attack by varying and testing the all parameters under his control. * The strength depends on the size and randomness of the secret, and the * cryptographic security of the MAC function. Due to the constant updating * of the secret the attacker has at most 29.999 seconds to find the secret * and launch spoofed connections. After that he has to start all over again. * * Vector 2: Collision attack on the MAC of a single ACK. With a 24 bit MAC * size an average of 4,823 attempts are required for a 50% chance of success * to spoof a single syncookie (birthday collision paradox). However the * attacker is blind and doesn't know if one of his attempts succeeded unless * he has a side channel to interfere success from. A single connection setup * success average of 90% requires 8,790 packets, 99.99% requires 17,578 packets. * This many attempts are required for each one blind spoofed connection. For * every additional spoofed connection he has to launch another N attempts. * Thus for a sustained rate 100 spoofed connections per second approximately * 1,800,000 packets per second would have to be sent. * * NB: The MAC function should be fast so that it doesn't become a CPU * exhaustion attack vector itself. * * References: * RFC4987 TCP SYN Flooding Attacks and Common Mitigations * SYN cookies were first proposed by cryptographer Dan J. Bernstein in 1996 * http://cr.yp.to/syncookies.html (overview) * http://cr.yp.to/syncookies/archive (details) * * * Schematic construction of a syncookie enabled Initial Sequence Number: * 0 1 2 3 * 12345678901234567890123456789012 * |xxxxxxxxxxxxxxxxxxxxxxxxWWWMMMSP| * * x 24 MAC (truncated) * W 3 Send Window Scale index * M 3 MSS index * S 1 SACK permitted * P 1 Odd/even secret */ /* * Distribution and probability of certain MSS values. Those in between are * rounded down to the next lower one. * [An Analysis of TCP Maximum Segment Sizes, S. Alcock and R. Nelson, 2011] * .2% .3% 5% 7% 7% 20% 15% 45% */ static int tcp_sc_msstab[] = { 216, 536, 1200, 1360, 1400, 1440, 1452, 1460 }; /* * Distribution and probability of certain WSCALE values. We have to map the * (send) window scale (shift) option with a range of 0-14 from 4 bits into 3 * bits based on prevalence of certain values. Where we don't have an exact * match for are rounded down to the next lower one letting us under-estimate * the true available window. At the moment this would happen only for the * very uncommon values 3, 5 and those above 8 (more than 16MB socket buffer * and window size). The absence of the WSCALE option (no scaling in either * direction) is encoded with index zero. * [WSCALE values histograms, Allman, 2012] * X 10 10 35 5 6 14 10% by host * X 11 4 5 5 18 49 3% by connections */ static int tcp_sc_wstab[] = { 0, 0, 1, 2, 4, 6, 7, 8 }; /* * Compute the MAC for the SYN cookie. SIPHASH-2-4 is chosen for its speed * and good cryptographic properties. */ static uint32_t syncookie_mac(struct in_conninfo *inc, tcp_seq irs, uint8_t flags, uint8_t *secbits, uintptr_t secmod) { SIPHASH_CTX ctx; uint32_t siphash[2]; SipHash24_Init(&ctx); SipHash_SetKey(&ctx, secbits); switch (inc->inc_flags & INC_ISIPV6) { #ifdef INET case 0: SipHash_Update(&ctx, &inc->inc_faddr, sizeof(inc->inc_faddr)); SipHash_Update(&ctx, &inc->inc_laddr, sizeof(inc->inc_laddr)); break; #endif #ifdef INET6 case INC_ISIPV6: SipHash_Update(&ctx, &inc->inc6_faddr, sizeof(inc->inc6_faddr)); SipHash_Update(&ctx, &inc->inc6_laddr, sizeof(inc->inc6_laddr)); break; #endif } SipHash_Update(&ctx, &inc->inc_fport, sizeof(inc->inc_fport)); SipHash_Update(&ctx, &inc->inc_lport, sizeof(inc->inc_lport)); SipHash_Update(&ctx, &flags, sizeof(flags)); SipHash_Update(&ctx, &secmod, sizeof(secmod)); SipHash_Final((u_int8_t *)&siphash, &ctx); return (siphash[0] ^ siphash[1]); } static tcp_seq syncookie_generate(struct syncache_head *sch, struct syncache *sc) { u_int i, mss, secbit, wscale; uint32_t iss, hash; uint8_t *secbits; union syncookie cookie; SCH_LOCK_ASSERT(sch); cookie.cookie = 0; /* Map our computed MSS into the 3-bit index. */ mss = min(tcp_mssopt(&sc->sc_inc), max(sc->sc_peer_mss, V_tcp_minmss)); for (i = sizeof(tcp_sc_msstab) / sizeof(*tcp_sc_msstab) - 1; tcp_sc_msstab[i] > mss && i > 0; i--) ; cookie.flags.mss_idx = i; /* * Map the send window scale into the 3-bit index but only if * the wscale option was received. */ if (sc->sc_flags & SCF_WINSCALE) { wscale = sc->sc_requested_s_scale; for (i = sizeof(tcp_sc_wstab) / sizeof(*tcp_sc_wstab) - 1; tcp_sc_wstab[i] > wscale && i > 0; i--) ; cookie.flags.wscale_idx = i; } /* Can we do SACK? */ if (sc->sc_flags & SCF_SACK) cookie.flags.sack_ok = 1; /* Which of the two secrets to use. */ secbit = sch->sch_sc->secret.oddeven & 0x1; cookie.flags.odd_even = secbit; secbits = sch->sch_sc->secret.key[secbit]; hash = syncookie_mac(&sc->sc_inc, sc->sc_irs, cookie.cookie, secbits, (uintptr_t)sch); /* * Put the flags into the hash and XOR them to get better ISS number * variance. This doesn't enhance the cryptographic strength and is * done to prevent the 8 cookie bits from showing up directly on the * wire. */ iss = hash & ~0xff; iss |= cookie.cookie ^ (hash >> 24); /* Randomize the timestamp. */ if (sc->sc_flags & SCF_TIMESTAMP) { sc->sc_ts = arc4random(); sc->sc_tsoff = sc->sc_ts - tcp_ts_getticks(); } TCPSTAT_INC(tcps_sc_sendcookie); return (iss); } static struct syncache * syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch, struct syncache *sc, struct tcphdr *th, struct tcpopt *to, struct socket *lso) { uint32_t hash; uint8_t *secbits; tcp_seq ack, seq; int wnd, wscale = 0; union syncookie cookie; SCH_LOCK_ASSERT(sch); /* * Pull information out of SYN-ACK/ACK and revert sequence number * advances. */ ack = th->th_ack - 1; seq = th->th_seq - 1; /* * Unpack the flags containing enough information to restore the * connection. */ cookie.cookie = (ack & 0xff) ^ (ack >> 24); /* Which of the two secrets to use. */ secbits = sch->sch_sc->secret.key[cookie.flags.odd_even]; hash = syncookie_mac(inc, seq, cookie.cookie, secbits, (uintptr_t)sch); /* The recomputed hash matches the ACK if this was a genuine cookie. */ if ((ack & ~0xff) != (hash & ~0xff)) return (NULL); /* Fill in the syncache values. */ sc->sc_flags = 0; bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo)); sc->sc_ipopts = NULL; sc->sc_irs = seq; sc->sc_iss = ack; switch (inc->inc_flags & INC_ISIPV6) { #ifdef INET case 0: sc->sc_ip_ttl = sotoinpcb(lso)->inp_ip_ttl; sc->sc_ip_tos = sotoinpcb(lso)->inp_ip_tos; break; #endif #ifdef INET6 case INC_ISIPV6: if (sotoinpcb(lso)->inp_flags & IN6P_AUTOFLOWLABEL) sc->sc_flowlabel = sc->sc_iss & IPV6_FLOWLABEL_MASK; break; #endif } sc->sc_peer_mss = tcp_sc_msstab[cookie.flags.mss_idx]; /* We can simply recompute receive window scale we sent earlier. */ while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < sb_max) wscale++; /* Only use wscale if it was enabled in the orignal SYN. */ if (cookie.flags.wscale_idx > 0) { sc->sc_requested_r_scale = wscale; sc->sc_requested_s_scale = tcp_sc_wstab[cookie.flags.wscale_idx]; sc->sc_flags |= SCF_WINSCALE; } wnd = sbspace(&lso->so_rcv); wnd = imax(wnd, 0); wnd = imin(wnd, TCP_MAXWIN); sc->sc_wnd = wnd; if (cookie.flags.sack_ok) sc->sc_flags |= SCF_SACK; if (to->to_flags & TOF_TS) { sc->sc_flags |= SCF_TIMESTAMP; sc->sc_tsreflect = to->to_tsval; sc->sc_ts = to->to_tsecr; sc->sc_tsoff = to->to_tsecr - tcp_ts_getticks(); } if (to->to_flags & TOF_SIGNATURE) sc->sc_flags |= SCF_SIGNATURE; sc->sc_rxmits = 0; TCPSTAT_INC(tcps_sc_recvcookie); return (sc); } #ifdef INVARIANTS static int syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch, struct syncache *sc, struct tcphdr *th, struct tcpopt *to, struct socket *lso) { struct syncache scs, *scx; char *s; bzero(&scs, sizeof(scs)); scx = syncookie_lookup(inc, sch, &scs, th, to, lso); if ((s = tcp_log_addrs(inc, th, NULL, NULL)) == NULL) return (0); if (scx != NULL) { if (sc->sc_peer_mss != scx->sc_peer_mss) log(LOG_DEBUG, "%s; %s: mss different %i vs %i\n", s, __func__, sc->sc_peer_mss, scx->sc_peer_mss); if (sc->sc_requested_r_scale != scx->sc_requested_r_scale) log(LOG_DEBUG, "%s; %s: rwscale different %i vs %i\n", s, __func__, sc->sc_requested_r_scale, scx->sc_requested_r_scale); if (sc->sc_requested_s_scale != scx->sc_requested_s_scale) log(LOG_DEBUG, "%s; %s: swscale different %i vs %i\n", s, __func__, sc->sc_requested_s_scale, scx->sc_requested_s_scale); if ((sc->sc_flags & SCF_SACK) != (scx->sc_flags & SCF_SACK)) log(LOG_DEBUG, "%s; %s: SACK different\n", s, __func__); } if (s != NULL) free(s, M_TCPLOG); return (0); } #endif /* INVARIANTS */ static void syncookie_reseed(void *arg) { struct tcp_syncache *sc = arg; uint8_t *secbits; int secbit; /* * Reseeding the secret doesn't have to be protected by a lock. * It only must be ensured that the new random values are visible * to all CPUs in a SMP environment. The atomic with release * semantics ensures that. */ secbit = (sc->secret.oddeven & 0x1) ? 0 : 1; secbits = sc->secret.key[secbit]; arc4rand(secbits, SYNCOOKIE_SECRET_SIZE, 0); atomic_add_rel_int(&sc->secret.oddeven, 1); /* Reschedule ourself. */ callout_schedule(&sc->secret.reseed, SYNCOOKIE_LIFETIME * hz); } /* * Returns the current number of syncache entries. This number * will probably change before you get around to calling * syncache_pcblist. */ int syncache_pcbcount(void) { struct syncache_head *sch; int count, i; for (count = 0, i = 0; i < V_tcp_syncache.hashsize; i++) { /* No need to lock for a read. */ sch = &V_tcp_syncache.hashbase[i]; count += sch->sch_length; } return count; } /* * Exports the syncache entries to userland so that netstat can display * them alongside the other sockets. This function is intended to be * called only from tcp_pcblist. * * Due to concurrency on an active system, the number of pcbs exported * may have no relation to max_pcbs. max_pcbs merely indicates the * amount of space the caller allocated for this function to use. */ int syncache_pcblist(struct sysctl_req *req, int max_pcbs, int *pcbs_exported) { struct xtcpcb xt; struct syncache *sc; struct syncache_head *sch; int count, error, i; for (count = 0, error = 0, i = 0; i < V_tcp_syncache.hashsize; i++) { sch = &V_tcp_syncache.hashbase[i]; SCH_LOCK(sch); TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { if (count >= max_pcbs) { SCH_UNLOCK(sch); goto exit; } if (cr_cansee(req->td->td_ucred, sc->sc_cred) != 0) continue; bzero(&xt, sizeof(xt)); xt.xt_len = sizeof(xt); if (sc->sc_inc.inc_flags & INC_ISIPV6) xt.xt_inp.inp_vflag = INP_IPV6; else xt.xt_inp.inp_vflag = INP_IPV4; bcopy(&sc->sc_inc, &xt.xt_inp.inp_inc, sizeof (struct in_conninfo)); xt.xt_tp.t_inpcb = &xt.xt_inp; xt.xt_tp.t_state = TCPS_SYN_RECEIVED; xt.xt_socket.xso_protocol = IPPROTO_TCP; xt.xt_socket.xso_len = sizeof (struct xsocket); xt.xt_socket.so_type = SOCK_STREAM; xt.xt_socket.so_state = SS_ISCONNECTING; error = SYSCTL_OUT(req, &xt, sizeof xt); if (error) { SCH_UNLOCK(sch); goto exit; } count++; } SCH_UNLOCK(sch); } exit: *pcbs_exported = count; return error; } Index: user/dchagin/lemul/sys/netinet/tcp_timewait.c =================================================================== --- user/dchagin/lemul/sys/netinet/tcp_timewait.c (revision 274240) +++ user/dchagin/lemul/sys/netinet/tcp_timewait.c (revision 274241) @@ -1,723 +1,723 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #include #endif #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef TCPDEBUG #include #endif #ifdef INET6 #include #endif #include #include static VNET_DEFINE(uma_zone_t, tcptw_zone); #define V_tcptw_zone VNET(tcptw_zone) static int maxtcptw; /* * The timed wait queue contains references to each of the TCP sessions * currently in the TIME_WAIT state. The queue pointers, including the * queue pointers in each tcptw structure, are protected using the global * timewait lock, which must be held over queue iteration and modification. * * Rules on tcptw usage: * - a inpcb is always freed _after_ its tcptw * - a tcptw relies on its inpcb reference counting for memory stability * - a tcptw is dereferenceable only while its inpcb is locked */ static VNET_DEFINE(TAILQ_HEAD(, tcptw), twq_2msl); #define V_twq_2msl VNET(twq_2msl) /* Global timewait lock */ static VNET_DEFINE(struct rwlock, tw_lock); #define V_tw_lock VNET(tw_lock) #define TW_LOCK_INIT(tw, d) rw_init_flags(&(tw), (d), 0) #define TW_LOCK_DESTROY(tw) rw_destroy(&(tw)) #define TW_RLOCK(tw) rw_rlock(&(tw)) #define TW_WLOCK(tw) rw_wlock(&(tw)) #define TW_RUNLOCK(tw) rw_runlock(&(tw)) #define TW_WUNLOCK(tw) rw_wunlock(&(tw)) #define TW_LOCK_ASSERT(tw) rw_assert(&(tw), RA_LOCKED) #define TW_RLOCK_ASSERT(tw) rw_assert(&(tw), RA_RLOCKED) #define TW_WLOCK_ASSERT(tw) rw_assert(&(tw), RA_WLOCKED) #define TW_UNLOCK_ASSERT(tw) rw_assert(&(tw), RA_UNLOCKED) static void tcp_tw_2msl_reset(struct tcptw *, int); static void tcp_tw_2msl_stop(struct tcptw *, int); static int tcp_twrespond(struct tcptw *, int); static int tcptw_auto_size(void) { int halfrange; /* * Max out at half the ephemeral port range so that TIME_WAIT * sockets don't tie up too many ephemeral ports. */ if (V_ipport_lastauto > V_ipport_firstauto) halfrange = (V_ipport_lastauto - V_ipport_firstauto) / 2; else halfrange = (V_ipport_firstauto - V_ipport_lastauto) / 2; /* Protect against goofy port ranges smaller than 32. */ return (imin(imax(halfrange, 32), maxsockets / 5)); } static int sysctl_maxtcptw(SYSCTL_HANDLER_ARGS) { int error, new; if (maxtcptw == 0) new = tcptw_auto_size(); else new = maxtcptw; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) if (new >= 32) { maxtcptw = new; uma_zone_set_max(V_tcptw_zone, maxtcptw); } return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxtcptw, CTLTYPE_INT|CTLFLAG_RW, &maxtcptw, 0, sysctl_maxtcptw, "IU", "Maximum number of compressed TCP TIME_WAIT entries"); VNET_DEFINE(int, nolocaltimewait) = 0; #define V_nolocaltimewait VNET(nolocaltimewait) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, nolocaltimewait, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, nolocaltimewait, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nolocaltimewait), 0, "Do not create compressed TCP TIME_WAIT entries for local connections"); void tcp_tw_zone_change(void) { if (maxtcptw == 0) uma_zone_set_max(V_tcptw_zone, tcptw_auto_size()); } void tcp_tw_init(void) { V_tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); TUNABLE_INT_FETCH("net.inet.tcp.maxtcptw", &maxtcptw); if (maxtcptw == 0) uma_zone_set_max(V_tcptw_zone, tcptw_auto_size()); else uma_zone_set_max(V_tcptw_zone, maxtcptw); TAILQ_INIT(&V_twq_2msl); TW_LOCK_INIT(V_tw_lock, "tcptw"); } #ifdef VIMAGE void tcp_tw_destroy(void) { struct tcptw *tw; INP_INFO_WLOCK(&V_tcbinfo); while ((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL) tcp_twclose(tw, 0); INP_INFO_WUNLOCK(&V_tcbinfo); TW_LOCK_DESTROY(V_tw_lock); uma_zdestroy(V_tcptw_zone); } #endif /* * Move a TCP connection into TIME_WAIT state. * tcbinfo is locked. * inp is locked, and is unlocked before returning. */ void tcp_twstart(struct tcpcb *tp) { struct tcptw *tw; struct inpcb *inp = tp->t_inpcb; int acknow; struct socket *so; #ifdef INET6 int isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6; #endif INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); if (V_nolocaltimewait) { int error = 0; #ifdef INET6 if (isipv6) error = in6_localaddr(&inp->in6p_faddr); #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET error = in_localip(inp->inp_faddr); #endif if (error) { tp = tcp_close(tp); if (tp != NULL) INP_WUNLOCK(inp); return; } } tw = uma_zalloc(V_tcptw_zone, M_NOWAIT); if (tw == NULL) { /* * Reached limit on total number of TIMEWAIT connections * allowed. Remove a connection from TIMEWAIT queue in LRU * fashion to make room for this connection. * * pcbinfo lock is needed here to prevent deadlock as * two inpcb locks can be acquired simultaneously. */ tw = tcp_tw_2msl_scan(1); if (tw == NULL) { tp = tcp_close(tp); if (tp != NULL) INP_WUNLOCK(inp); return; } } /* * The tcptw will hold a reference on its inpcb until tcp_twclose * is called */ tw->tw_inpcb = inp; in_pcbref(inp); /* Reference from tw */ /* * Recover last window size sent. */ if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) tw->last_win = (tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale; else tw->last_win = 0; /* * Set t_recent if timestamps are used on the connection. */ if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) == (TF_REQ_TSTMP|TF_RCVD_TSTMP)) { tw->t_recent = tp->ts_recent; tw->ts_offset = tp->ts_offset; } else { tw->t_recent = 0; tw->ts_offset = 0; } tw->snd_nxt = tp->snd_nxt; tw->rcv_nxt = tp->rcv_nxt; tw->iss = tp->iss; tw->irs = tp->irs; tw->t_starttime = tp->t_starttime; tw->tw_time = 0; /* XXX * If this code will * be used for fin-wait-2 state also, then we may need * a ts_recent from the last segment. */ acknow = tp->t_flags & TF_ACKNOW; /* * First, discard tcpcb state, which includes stopping its timers and * freeing it. tcp_discardcb() used to also release the inpcb, but * that work is now done in the caller. * * Note: soisdisconnected() call used to be made in tcp_discardcb(), * and might not be needed here any longer. */ tcp_discardcb(tp); so = inp->inp_socket; soisdisconnected(so); tw->tw_cred = crhold(so->so_cred); SOCK_LOCK(so); tw->tw_so_options = so->so_options; SOCK_UNLOCK(so); if (acknow) tcp_twrespond(tw, TH_ACK); inp->inp_ppcb = tw; inp->inp_flags |= INP_TIMEWAIT; tcp_tw_2msl_reset(tw, 0); /* * If the inpcb owns the sole reference to the socket, then we can * detach and free the socket as it is not needed in time wait. */ if (inp->inp_flags & INP_SOCKREF) { KASSERT(so->so_state & SS_PROTOREF, ("tcp_twstart: !SS_PROTOREF")); inp->inp_flags &= ~INP_SOCKREF; INP_WUNLOCK(inp); ACCEPT_LOCK(); SOCK_LOCK(so); so->so_state &= ~SS_PROTOREF; sofree(so); } else INP_WUNLOCK(inp); } /* * Returns 1 if the TIME_WAIT state was killed and we should start over, * looking for a pcb in the listen state. Returns 0 otherwise. */ int tcp_twcheck(struct inpcb *inp, struct tcpopt *to __unused, struct tcphdr *th, struct mbuf *m, int tlen) { struct tcptw *tw; int thflags; tcp_seq seq; INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); /* * XXXRW: Time wait state for inpcb has been recycled, but inpcb is * still present. This is undesirable, but temporarily necessary * until we work out how to handle inpcb's who's timewait state has * been removed. */ tw = intotw(inp); if (tw == NULL) goto drop; thflags = th->th_flags; /* * NOTE: for FIN_WAIT_2 (to be added later), * must validate sequence number before accepting RST */ /* * If the segment contains RST: * Drop the segment - see Stevens, vol. 2, p. 964 and * RFC 1337. */ if (thflags & TH_RST) goto drop; #if 0 /* PAWS not needed at the moment */ /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. */ if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to.to_tsval, tp->ts_recent)) { if ((thflags & TH_ACK) == 0) goto drop; goto ack; } /* * ts_recent is never updated because we never accept new segments. */ #endif /* * If a new connection request is received * while in TIME_WAIT, drop the old connection * and start over if the sequence numbers * are above the previous ones. */ if ((thflags & TH_SYN) && SEQ_GT(th->th_seq, tw->rcv_nxt)) { tcp_twclose(tw, 0); return (1); } /* * Drop the segment if it does not contain an ACK. */ if ((thflags & TH_ACK) == 0) goto drop; /* * Reset the 2MSL timer if this is a duplicate FIN. */ if (thflags & TH_FIN) { seq = th->th_seq + tlen + (thflags & TH_SYN ? 1 : 0); if (seq + 1 == tw->rcv_nxt) tcp_tw_2msl_reset(tw, 1); } /* * Acknowledge the segment if it has data or is not a duplicate ACK. */ if (thflags != TH_ACK || tlen != 0 || th->th_seq != tw->rcv_nxt || th->th_ack != tw->snd_nxt) tcp_twrespond(tw, TH_ACK); drop: INP_WUNLOCK(inp); m_freem(m); return (0); } void tcp_twclose(struct tcptw *tw, int reuse) { struct socket *so; struct inpcb *inp; /* * At this point, we are in one of two situations: * * (1) We have no socket, just an inpcb<->twtcp pair. We can free * all state. * * (2) We have a socket -- if we own a reference, release it and * notify the socket layer. */ inp = tw->tw_inpcb; KASSERT((inp->inp_flags & INP_TIMEWAIT), ("tcp_twclose: !timewait")); KASSERT(intotw(inp) == tw, ("tcp_twclose: inp_ppcb != tw")); INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* in_pcbfree() */ INP_WLOCK_ASSERT(inp); tcp_tw_2msl_stop(tw, reuse); inp->inp_ppcb = NULL; in_pcbdrop(inp); so = inp->inp_socket; if (so != NULL) { /* * If there's a socket, handle two cases: first, we own a * strong reference, which we will now release, or we don't * in which case another reference exists (XXXRW: think * about this more), and we don't need to take action. */ if (inp->inp_flags & INP_SOCKREF) { inp->inp_flags &= ~INP_SOCKREF; INP_WUNLOCK(inp); ACCEPT_LOCK(); SOCK_LOCK(so); KASSERT(so->so_state & SS_PROTOREF, ("tcp_twclose: INP_SOCKREF && !SS_PROTOREF")); so->so_state &= ~SS_PROTOREF; sofree(so); } else { /* * If we don't own the only reference, the socket and * inpcb need to be left around to be handled by * tcp_usr_detach() later. */ INP_WUNLOCK(inp); } } else { /* * The socket has been already cleaned-up for us, only free the * inpcb. */ in_pcbfree(inp); } TCPSTAT_INC(tcps_closed); } static int tcp_twrespond(struct tcptw *tw, int flags) { struct inpcb *inp = tw->tw_inpcb; #if defined(INET6) || defined(INET) struct tcphdr *th = NULL; #endif struct mbuf *m; #ifdef INET struct ip *ip = NULL; #endif u_int hdrlen, optlen; int error = 0; /* Keep compiler happy */ struct tcpopt to; #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6; #endif hdrlen = 0; /* Keep compiler happy */ INP_WLOCK_ASSERT(inp); m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); m->m_data += max_linkhdr; #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif #ifdef INET6 if (isipv6) { hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); tcpip_fillheaders(inp, ip6, th); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET { hdrlen = sizeof(struct tcpiphdr); ip = mtod(m, struct ip *); th = (struct tcphdr *)(ip + 1); tcpip_fillheaders(inp, ip, th); } #endif to.to_flags = 0; /* * Send a timestamp and echo-reply if both our side and our peer * have sent timestamps in our SYN's and this is not a RST. */ if (tw->t_recent && flags == TH_ACK) { to.to_flags |= TOF_TS; to.to_tsval = tcp_ts_getticks() + tw->ts_offset; to.to_tsecr = tw->t_recent; } optlen = tcp_addoptions(&to, (u_char *)(th + 1)); m->m_len = hdrlen + optlen; m->m_pkthdr.len = m->m_len; KASSERT(max_linkhdr + m->m_len <= MHLEN, ("tcptw: mbuf too small")); th->th_seq = htonl(tw->snd_nxt); th->th_ack = htonl(tw->rcv_nxt); th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; th->th_flags = flags; th->th_win = htons(tw->last_win); m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); #ifdef INET6 if (isipv6) { m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) + optlen, IPPROTO_TCP, 0); ip6->ip6_hlim = in6_selecthlim(inp, NULL); error = ip6_output(m, inp->in6p_outputopts, NULL, (tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET { m->m_pkthdr.csum_flags = CSUM_TCP; th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP)); ip->ip_len = htons(m->m_pkthdr.len); if (V_path_mtu_discovery) ip->ip_off |= htons(IP_DF); error = ip_output(m, inp->inp_options, NULL, ((tw->tw_so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), NULL, inp); } #endif if (flags & TH_ACK) TCPSTAT_INC(tcps_sndacks); else TCPSTAT_INC(tcps_sndctrl); TCPSTAT_INC(tcps_sndtotal); return (error); } static void tcp_tw_2msl_reset(struct tcptw *tw, int rearm) { INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tw->tw_inpcb); TW_WLOCK(V_tw_lock); if (rearm) TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl); tw->tw_time = ticks + 2 * tcp_msl; TAILQ_INSERT_TAIL(&V_twq_2msl, tw, tw_2msl); TW_WUNLOCK(V_tw_lock); } static void tcp_tw_2msl_stop(struct tcptw *tw, int reuse) { struct ucred *cred; struct inpcb *inp; int released; INP_INFO_WLOCK_ASSERT(&V_tcbinfo); TW_WLOCK(V_tw_lock); inp = tw->tw_inpcb; tw->tw_inpcb = NULL; TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl); cred = tw->tw_cred; tw->tw_cred = NULL; TW_WUNLOCK(V_tw_lock); if (cred != NULL) crfree(cred); released = in_pcbrele_wlocked(inp); KASSERT(!released, ("%s: inp should not be released here", __func__)); if (!reuse) uma_zfree(V_tcptw_zone, tw); } struct tcptw * tcp_tw_2msl_scan(int reuse) { struct tcptw *tw; struct inpcb *inp; #ifdef INVARIANTS if (reuse) { /* * pcbinfo lock is needed in reuse case to prevent deadlock * as two inpcb locks can be acquired simultaneously: * - the inpcb transitioning to TIME_WAIT state in * tcp_tw_start(), * - the inpcb closed by tcp_twclose(). */ INP_INFO_WLOCK_ASSERT(&V_tcbinfo); } #endif for (;;) { TW_RLOCK(V_tw_lock); tw = TAILQ_FIRST(&V_twq_2msl); if (tw == NULL || (!reuse && (tw->tw_time - ticks) > 0)) { TW_RUNLOCK(V_tw_lock); break; } KASSERT(tw->tw_inpcb != NULL, ("%s: tw->tw_inpcb == NULL", __func__)); inp = tw->tw_inpcb; in_pcbref(inp); TW_RUNLOCK(V_tw_lock); if (INP_INFO_TRY_WLOCK(&V_tcbinfo)) { INP_WLOCK(inp); tw = intotw(inp); if (in_pcbrele_wlocked(inp)) { KASSERT(tw == NULL, ("%s: held last inp " "reference but tw not NULL", __func__)); INP_INFO_WUNLOCK(&V_tcbinfo); continue; } if (tw == NULL) { /* tcp_twclose() has already been called */ INP_WUNLOCK(inp); INP_INFO_WUNLOCK(&V_tcbinfo); continue; } tcp_twclose(tw, reuse); INP_INFO_WUNLOCK(&V_tcbinfo); if (reuse) return tw; } else { /* INP_INFO lock is busy, continue later. */ INP_WLOCK(inp); if (!in_pcbrele_wlocked(inp)) INP_WUNLOCK(inp); break; } } return NULL; } Index: user/dchagin/lemul/sys/netinet/udp_usrreq.c =================================================================== --- user/dchagin/lemul/sys/netinet/udp_usrreq.c (revision 274240) +++ user/dchagin/lemul/sys/netinet/udp_usrreq.c (revision 274241) @@ -1,1903 +1,1903 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. * Copyright (c) 2008 Robert N. M. Watson * Copyright (c) 2010-2011 Juniper Networks, Inc. * Copyright (c) 2014 Kevin Lo * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ipfw.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #ifdef IPSEC #include #include #endif #include #include /* * UDP and UDP-Lite protocols implementation. * Per RFC 768, August, 1980. * Per RFC 3828, July, 2004. */ /* * BSD 4.2 defaulted the udp checksum to be off. Turning off udp checksums * removes the only data integrity mechanism for packets and malformed * packets that would otherwise be discarded due to bad checksums, and may * cause problems (especially for NFS data blocks). */ VNET_DEFINE(int, udp_cksum) = 1; -SYSCTL_VNET_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_RW, +SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(udp_cksum), 0, "compute udp checksum"); int udp_log_in_vain = 0; SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW, &udp_log_in_vain, 0, "Log all incoming UDP packets"); VNET_DEFINE(int, udp_blackhole) = 0; -SYSCTL_VNET_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_RW, +SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(udp_blackhole), 0, "Do not send port unreachables for refused connects"); u_long udp_sendspace = 9216; /* really max datagram size */ /* 40 1K datagrams */ SYSCTL_ULONG(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW, &udp_sendspace, 0, "Maximum outgoing UDP datagram size"); u_long udp_recvspace = 40 * (1024 + #ifdef INET6 sizeof(struct sockaddr_in6) #else sizeof(struct sockaddr_in) #endif ); SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW, &udp_recvspace, 0, "Maximum space for incoming UDP datagrams"); VNET_DEFINE(struct inpcbhead, udb); /* from udp_var.h */ VNET_DEFINE(struct inpcbinfo, udbinfo); VNET_DEFINE(struct inpcbhead, ulitecb); VNET_DEFINE(struct inpcbinfo, ulitecbinfo); static VNET_DEFINE(uma_zone_t, udpcb_zone); #define V_udpcb_zone VNET(udpcb_zone) #ifndef UDBHASHSIZE #define UDBHASHSIZE 128 #endif VNET_PCPUSTAT_DEFINE(struct udpstat, udpstat); /* from udp_var.h */ VNET_PCPUSTAT_SYSINIT(udpstat); SYSCTL_VNET_PCPUSTAT(_net_inet_udp, UDPCTL_STATS, stats, struct udpstat, udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)"); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(udpstat); #endif /* VIMAGE */ #ifdef INET static void udp_detach(struct socket *so); static int udp_output(struct inpcb *, struct mbuf *, struct sockaddr *, struct mbuf *, struct thread *); #endif #ifdef IPSEC #ifdef IPSEC_NAT_T #define UF_ESPINUDP_ALL (UF_ESPINUDP_NON_IKE|UF_ESPINUDP) #ifdef INET static struct mbuf *udp4_espdecap(struct inpcb *, struct mbuf *, int); #endif #endif /* IPSEC_NAT_T */ #endif /* IPSEC */ static void udp_zone_change(void *tag) { uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets); uma_zone_set_max(V_udpcb_zone, maxsockets); } static int udp_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp; inp = mem; INP_LOCK_INIT(inp, "inp", "udpinp"); return (0); } static int udplite_inpcb_init(void *mem, int size, int flags) { struct inpcb *inp; inp = mem; INP_LOCK_INIT(inp, "inp", "udpliteinp"); return (0); } void udp_init(void) { /* * For now default to 2-tuple UDP hashing - until the fragment * reassembly code can also update the flowid. * * Once we can calculate the flowid that way and re-establish * a 4-tuple, flip this to 4-tuple. */ in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE, "udp_inpcb", udp_inpcb_init, NULL, UMA_ZONE_NOFREE, IPI_HASHFIELDS_2TUPLE); V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(V_udpcb_zone, maxsockets); uma_zone_set_warning(V_udpcb_zone, "kern.ipc.maxsockets limit reached"); EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL, EVENTHANDLER_PRI_ANY); } void udplite_init(void) { in_pcbinfo_init(&V_ulitecbinfo, "udplite", &V_ulitecb, UDBHASHSIZE, UDBHASHSIZE, "udplite_inpcb", udplite_inpcb_init, NULL, UMA_ZONE_NOFREE, IPI_HASHFIELDS_2TUPLE); } /* * Kernel module interface for updating udpstat. The argument is an index * into udpstat treated as an array of u_long. While this encodes the * general layout of udpstat into the caller, it doesn't encode its location, * so that future changes to add, for example, per-CPU stats support won't * cause binary compatibility problems for kernel modules. */ void kmod_udpstat_inc(int statnum) { counter_u64_add(VNET(udpstat)[statnum], 1); } int udp_newudpcb(struct inpcb *inp) { struct udpcb *up; up = uma_zalloc(V_udpcb_zone, M_NOWAIT | M_ZERO); if (up == NULL) return (ENOBUFS); inp->inp_ppcb = up; return (0); } void udp_discardcb(struct udpcb *up) { uma_zfree(V_udpcb_zone, up); } #ifdef VIMAGE void udp_destroy(void) { in_pcbinfo_destroy(&V_udbinfo); uma_zdestroy(V_udpcb_zone); } void udplite_destroy(void) { in_pcbinfo_destroy(&V_ulitecbinfo); } #endif #ifdef INET /* * Subroutine of udp_input(), which appends the provided mbuf chain to the * passed pcb/socket. The caller must provide a sockaddr_in via udp_in that * contains the source address. If the socket ends up being an IPv6 socket, * udp_append() will convert to a sockaddr_in6 before passing the address * into the socket code. */ static void udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off, struct sockaddr_in *udp_in) { struct sockaddr *append_sa; struct socket *so; struct mbuf *opts = 0; #ifdef INET6 struct sockaddr_in6 udp_in6; #endif struct udpcb *up; INP_LOCK_ASSERT(inp); /* * Engage the tunneling protocol. */ up = intoudpcb(inp); if (up->u_tun_func != NULL) { (*up->u_tun_func)(n, off, inp, (struct sockaddr *)udp_in, up->u_tun_ctx); return; } off += sizeof(struct udphdr); #ifdef IPSEC /* Check AH/ESP integrity. */ if (ipsec4_in_reject(n, inp)) { m_freem(n); IPSECSTAT_INC(ips_in_polvio); return; } #ifdef IPSEC_NAT_T up = intoudpcb(inp); KASSERT(up != NULL, ("%s: udpcb NULL", __func__)); if (up->u_flags & UF_ESPINUDP_ALL) { /* IPSec UDP encaps. */ n = udp4_espdecap(inp, n, off); if (n == NULL) /* Consumed. */ return; } #endif /* IPSEC_NAT_T */ #endif /* IPSEC */ #ifdef MAC if (mac_inpcb_check_deliver(inp, n) != 0) { m_freem(n); return; } #endif /* MAC */ if (inp->inp_flags & INP_CONTROLOPTS || inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) { #ifdef INET6 if (inp->inp_vflag & INP_IPV6) (void)ip6_savecontrol_v4(inp, n, &opts, NULL); else #endif /* INET6 */ ip_savecontrol(inp, &opts, ip, n); } #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { bzero(&udp_in6, sizeof(udp_in6)); udp_in6.sin6_len = sizeof(udp_in6); udp_in6.sin6_family = AF_INET6; in6_sin_2_v4mapsin6(udp_in, &udp_in6); append_sa = (struct sockaddr *)&udp_in6; } else #endif /* INET6 */ append_sa = (struct sockaddr *)udp_in; m_adj(n, off); so = inp->inp_socket; SOCKBUF_LOCK(&so->so_rcv); if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) { SOCKBUF_UNLOCK(&so->so_rcv); m_freem(n); if (opts) m_freem(opts); UDPSTAT_INC(udps_fullsock); } else sorwakeup_locked(so); } int udp_input(struct mbuf **mp, int *offp, int proto) { struct ip *ip; struct udphdr *uh; struct ifnet *ifp; struct inpcb *inp; uint16_t len, ip_len; struct inpcbinfo *pcbinfo; struct ip save_ip; struct sockaddr_in udp_in; struct mbuf *m; struct m_tag *fwd_tag; int cscov_partial, iphlen; m = *mp; iphlen = *offp; ifp = m->m_pkthdr.rcvif; *mp = NULL; UDPSTAT_INC(udps_ipackets); /* * Strip IP options, if any; should skip this, make available to * user, and use on returned packets, but we don't yet have a way to * check the checksum with options still present. */ if (iphlen > sizeof (struct ip)) { ip_stripoptions(m); iphlen = sizeof(struct ip); } /* * Get IP and UDP header together in first mbuf. */ ip = mtod(m, struct ip *); if (m->m_len < iphlen + sizeof(struct udphdr)) { if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == NULL) { UDPSTAT_INC(udps_hdrops); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); } uh = (struct udphdr *)((caddr_t)ip + iphlen); cscov_partial = (proto == IPPROTO_UDPLITE) ? 1 : 0; /* * Destination port of 0 is illegal, based on RFC768. */ if (uh->uh_dport == 0) goto badunlocked; /* * Construct sockaddr format source address. Stuff source address * and datagram in user buffer. */ bzero(&udp_in, sizeof(udp_in)); udp_in.sin_len = sizeof(udp_in); udp_in.sin_family = AF_INET; udp_in.sin_port = uh->uh_sport; udp_in.sin_addr = ip->ip_src; /* * Make mbuf data length reflect UDP length. If not enough data to * reflect UDP length, drop. */ len = ntohs((u_short)uh->uh_ulen); ip_len = ntohs(ip->ip_len) - iphlen; if (proto == IPPROTO_UDPLITE && (len == 0 || len == ip_len)) { /* Zero means checksum over the complete packet. */ if (len == 0) len = ip_len; cscov_partial = 0; } if (ip_len != len) { if (len > ip_len || len < sizeof(struct udphdr)) { UDPSTAT_INC(udps_badlen); goto badunlocked; } if (proto == IPPROTO_UDP) m_adj(m, len - ip_len); } /* * Save a copy of the IP header in case we want restore it for * sending an ICMP error message in response. */ if (!V_udp_blackhole) save_ip = *ip; else memset(&save_ip, 0, sizeof(save_ip)); /* * Checksum extended UDP header and data. */ if (uh->uh_sum) { u_short uh_sum; if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID) && !cscov_partial) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) uh_sum = m->m_pkthdr.csum_data; else uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl((u_short)len + m->m_pkthdr.csum_data + proto)); uh_sum ^= 0xffff; } else { char b[9]; bcopy(((struct ipovly *)ip)->ih_x1, b, 9); bzero(((struct ipovly *)ip)->ih_x1, 9); ((struct ipovly *)ip)->ih_len = (proto == IPPROTO_UDP) ? uh->uh_ulen : htons(ip_len); uh_sum = in_cksum(m, len + sizeof (struct ip)); bcopy(b, ((struct ipovly *)ip)->ih_x1, 9); } if (uh_sum) { UDPSTAT_INC(udps_badsum); m_freem(m); return (IPPROTO_DONE); } } else { if (proto == IPPROTO_UDP) { UDPSTAT_INC(udps_nosum); } else { /* UDPLite requires a checksum */ /* XXX: What is the right UDPLite MIB counter here? */ m_freem(m); return (IPPROTO_DONE); } } pcbinfo = get_inpcbinfo(proto); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || in_broadcast(ip->ip_dst, ifp)) { struct inpcb *last; struct inpcbhead *pcblist; struct ip_moptions *imo; INP_INFO_RLOCK(pcbinfo); pcblist = get_pcblist(proto); last = NULL; LIST_FOREACH(inp, pcblist, inp_list) { if (inp->inp_lport != uh->uh_dport) continue; #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_laddr.s_addr != INADDR_ANY && inp->inp_laddr.s_addr != ip->ip_dst.s_addr) continue; if (inp->inp_faddr.s_addr != INADDR_ANY && inp->inp_faddr.s_addr != ip->ip_src.s_addr) continue; if (inp->inp_fport != 0 && inp->inp_fport != uh->uh_sport) continue; INP_RLOCK(inp); /* * XXXRW: Because we weren't holding either the inpcb * or the hash lock when we checked for a match * before, we should probably recheck now that the * inpcb lock is held. */ /* * Handle socket delivery policy for any-source * and source-specific multicast. [RFC3678] */ imo = inp->inp_moptions; if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { struct sockaddr_in group; int blocked; if (imo == NULL) { INP_RUNLOCK(inp); continue; } bzero(&group, sizeof(struct sockaddr_in)); group.sin_len = sizeof(struct sockaddr_in); group.sin_family = AF_INET; group.sin_addr = ip->ip_dst; blocked = imo_multi_filter(imo, ifp, (struct sockaddr *)&group, (struct sockaddr *)&udp_in); if (blocked != MCAST_PASS) { if (blocked == MCAST_NOTGMEMBER) IPSTAT_INC(ips_notmember); if (blocked == MCAST_NOTSMEMBER || blocked == MCAST_MUTED) UDPSTAT_INC(udps_filtermcast); INP_RUNLOCK(inp); continue; } } if (last != NULL) { struct mbuf *n; if ((n = m_copy(m, 0, M_COPYALL)) != NULL) { UDP_PROBE(receive, NULL, last, ip, last, uh); udp_append(last, ip, n, iphlen, &udp_in); } INP_RUNLOCK(last); } last = inp; /* * Don't look for additional matches if this one does * not have either the SO_REUSEPORT or SO_REUSEADDR * socket options set. This heuristic avoids * searching through all pcbs in the common case of a * non-shared port. It assumes that an application * will never clear these options after setting them. */ if ((last->inp_socket->so_options & (SO_REUSEPORT|SO_REUSEADDR)) == 0) break; } if (last == NULL) { /* * No matching pcb found; discard datagram. (No need * to send an ICMP Port Unreachable for a broadcast * or multicast datgram.) */ UDPSTAT_INC(udps_noportbcast); if (inp) INP_RUNLOCK(inp); INP_INFO_RUNLOCK(pcbinfo); goto badunlocked; } UDP_PROBE(receive, NULL, last, ip, last, uh); udp_append(last, ip, m, iphlen, &udp_in); INP_RUNLOCK(last); INP_INFO_RUNLOCK(pcbinfo); return (IPPROTO_DONE); } /* * Locate pcb for datagram. */ /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ if ((m->m_flags & M_IP_NEXTHOP) && (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) { struct sockaddr_in *next_hop; next_hop = (struct sockaddr_in *)(fwd_tag + 1); /* * Transparently forwarded. Pretend to be the destination. * Already got one like this? */ inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport, ip->ip_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, ifp, m); if (!inp) { /* * It's new. Try to find the ambushing socket. * Because we've rewritten the destination address, * any hardware-generated hash is ignored. */ inp = in_pcblookup(pcbinfo, ip->ip_src, uh->uh_sport, next_hop->sin_addr, next_hop->sin_port ? htons(next_hop->sin_port) : uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, ifp); } /* Remove the tag from the packet. We don't need it anymore. */ m_tag_delete(m, fwd_tag); m->m_flags &= ~M_IP_NEXTHOP; } else inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport, ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, ifp, m); if (inp == NULL) { if (udp_log_in_vain) { char buf[4*sizeof "123"]; strcpy(buf, inet_ntoa(ip->ip_dst)); log(LOG_INFO, "Connection attempt to UDP %s:%d from %s:%d\n", buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src), ntohs(uh->uh_sport)); } UDPSTAT_INC(udps_noport); if (m->m_flags & (M_BCAST | M_MCAST)) { UDPSTAT_INC(udps_noportbcast); goto badunlocked; } if (V_udp_blackhole) goto badunlocked; if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0) goto badunlocked; *ip = save_ip; icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0); return (IPPROTO_DONE); } /* * Check the minimum TTL for socket. */ INP_RLOCK_ASSERT(inp); if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) { INP_RUNLOCK(inp); m_freem(m); return (IPPROTO_DONE); } if (cscov_partial) { struct udpcb *up; up = intoudpcb(inp); if (up->u_rxcslen == 0 || up->u_rxcslen > len) { INP_RUNLOCK(inp); m_freem(m); return (IPPROTO_DONE); } } UDP_PROBE(receive, NULL, inp, ip, inp, uh); udp_append(inp, ip, m, iphlen, &udp_in); INP_RUNLOCK(inp); return (IPPROTO_DONE); badunlocked: m_freem(m); return (IPPROTO_DONE); } #endif /* INET */ /* * Notify a udp user of an asynchronous error; just wake up so that they can * collect error status. */ struct inpcb * udp_notify(struct inpcb *inp, int errno) { /* * While udp_ctlinput() always calls udp_notify() with a read lock * when invoking it directly, in_pcbnotifyall() currently uses write * locks due to sharing code with TCP. For now, accept either a read * or a write lock, but a read lock is sufficient. */ INP_LOCK_ASSERT(inp); inp->inp_socket->so_error = errno; sorwakeup(inp->inp_socket); sowwakeup(inp->inp_socket); return (inp); } #ifdef INET static void udp_common_ctlinput(int cmd, struct sockaddr *sa, void *vip, struct inpcbinfo *pcbinfo) { struct ip *ip = vip; struct udphdr *uh; struct in_addr faddr; struct inpcb *inp; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; /* * Redirects don't need to be handled up here. */ if (PRC_IS_REDIRECT(cmd)) return; /* * Hostdead is ugly because it goes linearly through all PCBs. * * XXX: We never get this from ICMP, otherwise it makes an excellent * DoS attack on machines with many connections. */ if (cmd == PRC_HOSTDEAD) ip = NULL; else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) return; if (ip != NULL) { uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport, ip->ip_src, uh->uh_sport, INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { INP_RLOCK_ASSERT(inp); if (inp->inp_socket != NULL) { udp_notify(inp, inetctlerrmap[cmd]); } INP_RUNLOCK(inp); } } else in_pcbnotifyall(pcbinfo, faddr, inetctlerrmap[cmd], udp_notify); } void udp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { return (udp_common_ctlinput(cmd, sa, vip, &V_udbinfo)); } void udplite_ctlinput(int cmd, struct sockaddr *sa, void *vip) { return (udp_common_ctlinput(cmd, sa, vip, &V_ulitecbinfo)); } #endif /* INET */ static int udp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; /* * The process of preparing the PCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = V_udbinfo.ipi_count; n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb); return (0); } if (req->newptr != 0) return (EPERM); /* * OK, now we're committed to doing something. */ INP_INFO_RLOCK(&V_udbinfo); gencnt = V_udbinfo.ipi_gencnt; n = V_udbinfo.ipi_count; INP_INFO_RUNLOCK(&V_udbinfo); error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) + n * sizeof(struct xinpcb)); if (error != 0) return (error); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); if (inp_list == 0) return (ENOMEM); INP_INFO_RLOCK(&V_udbinfo); for (inp = LIST_FIRST(V_udbinfo.ipi_listhead), i = 0; inp && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_WLOCK(inp); if (inp->inp_gencnt <= gencnt && cr_canseeinpcb(req->td->td_ucred, inp) == 0) { in_pcbref(inp); inp_list[i++] = inp; } INP_WUNLOCK(inp); } INP_INFO_RUNLOCK(&V_udbinfo); n = i; error = 0; for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (inp->inp_gencnt <= gencnt) { struct xinpcb xi; bzero(&xi, sizeof(xi)); xi.xi_len = sizeof xi; /* XXX should avoid extra copy */ bcopy(inp, &xi.xi_inp, sizeof *inp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi.xi_socket); xi.xi_inp.inp_gencnt = inp->inp_gencnt; INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xi, sizeof xi); } else INP_RUNLOCK(inp); } INP_INFO_WLOCK(&V_udbinfo); for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (!in_pcbrele_rlocked(inp)) INP_RUNLOCK(inp); } INP_INFO_WUNLOCK(&V_udbinfo); if (!error) { /* * Give the user an updated idea of our state. If the * generation differs from what we told her before, she knows * that something happened while we were processing this * request, and it might be necessary to retry. */ INP_INFO_RLOCK(&V_udbinfo); xig.xig_gen = V_udbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_udbinfo.ipi_count; INP_INFO_RUNLOCK(&V_udbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, udp_pcblist, "S,xinpcb", "List of active UDP sockets"); #ifdef INET static int udp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct inpcb *inp; int error; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); inp = in_pcblookup(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { INP_RLOCK_ASSERT(inp); if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, udp_getcred, "S,xucred", "Get the xucred of a UDP connection"); #endif /* INET */ int udp_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp; struct udpcb *up; int isudplite, error, optval; error = 0; isudplite = (so->so_proto->pr_protocol == IPPROTO_UDPLITE) ? 1 : 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); INP_WLOCK(inp); if (sopt->sopt_level != so->so_proto->pr_protocol) { #ifdef INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) { INP_WUNLOCK(inp); error = ip6_ctloutput(so, sopt); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { INP_WUNLOCK(inp); error = ip_ctloutput(so, sopt); } #endif return (error); } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { case UDP_ENCAP: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; inp = sotoinpcb(so); KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); INP_WLOCK(inp); #ifdef IPSEC_NAT_T up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); #endif switch (optval) { case 0: /* Clear all UDP encap. */ #ifdef IPSEC_NAT_T up->u_flags &= ~UF_ESPINUDP_ALL; #endif break; #ifdef IPSEC_NAT_T case UDP_ENCAP_ESPINUDP: case UDP_ENCAP_ESPINUDP_NON_IKE: up->u_flags &= ~UF_ESPINUDP_ALL; if (optval == UDP_ENCAP_ESPINUDP) up->u_flags |= UF_ESPINUDP; else if (optval == UDP_ENCAP_ESPINUDP_NON_IKE) up->u_flags |= UF_ESPINUDP_NON_IKE; break; #endif default: error = EINVAL; break; } INP_WUNLOCK(inp); break; case UDPLITE_SEND_CSCOV: case UDPLITE_RECV_CSCOV: if (!isudplite) { INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) break; inp = sotoinpcb(so); KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); INP_WLOCK(inp); up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); if ((optval != 0 && optval < 8) || (optval > 65535)) { INP_WUNLOCK(inp); error = EINVAL; break; } if (sopt->sopt_name == UDPLITE_SEND_CSCOV) up->u_txcslen = optval; else up->u_rxcslen = optval; INP_WUNLOCK(inp); break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (sopt->sopt_name) { #ifdef IPSEC_NAT_T case UDP_ENCAP: up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); optval = up->u_flags & UF_ESPINUDP_ALL; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; #endif case UDPLITE_SEND_CSCOV: case UDPLITE_RECV_CSCOV: if (!isudplite) { INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); if (sopt->sopt_name == UDPLITE_SEND_CSCOV) optval = up->u_txcslen; else optval = up->u_rxcslen; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(optval)); break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } break; } return (error); } #ifdef INET #define UH_WLOCKED 2 #define UH_RLOCKED 1 #define UH_UNLOCKED 0 static int udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct udpiphdr *ui; int len = m->m_pkthdr.len; struct in_addr faddr, laddr; struct cmsghdr *cm; struct inpcbinfo *pcbinfo; struct sockaddr_in *sin, src; int cscov_partial = 0; int error = 0; int ipflags; u_short fport, lport; int unlock_udbinfo; u_char tos; uint8_t pr; uint16_t cscov = 0; uint32_t flowid = 0; int flowid_type = 0; int use_flowid = 0; /* * udp_output() may need to temporarily bind or connect the current * inpcb. As such, we don't know up front whether we will need the * pcbinfo lock or not. Do any work to decide what is needed up * front before acquiring any locks. */ if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) { if (control) m_freem(control); m_freem(m); return (EMSGSIZE); } src.sin_family = 0; INP_RLOCK(inp); tos = inp->inp_ip_tos; if (control != NULL) { /* * XXX: Currently, we assume all the optional information is * stored in a single mbuf. */ if (control->m_next) { INP_RUNLOCK(inp); m_freem(control); m_freem(m); return (EINVAL); } for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len), control->m_len -= CMSG_ALIGN(cm->cmsg_len)) { cm = mtod(control, struct cmsghdr *); if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0 || cm->cmsg_len > control->m_len) { error = EINVAL; break; } if (cm->cmsg_level != IPPROTO_IP) continue; switch (cm->cmsg_type) { case IP_SENDSRCADDR: if (cm->cmsg_len != CMSG_LEN(sizeof(struct in_addr))) { error = EINVAL; break; } bzero(&src, sizeof(src)); src.sin_family = AF_INET; src.sin_len = sizeof(src); src.sin_port = inp->inp_lport; src.sin_addr = *(struct in_addr *)CMSG_DATA(cm); break; case IP_TOS: if (cm->cmsg_len != CMSG_LEN(sizeof(u_char))) { error = EINVAL; break; } tos = *(u_char *)CMSG_DATA(cm); break; case IP_FLOWID: if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { error = EINVAL; break; } flowid = *(uint32_t *) CMSG_DATA(cm); break; case IP_FLOWTYPE: if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { error = EINVAL; break; } flowid_type = *(uint32_t *) CMSG_DATA(cm); use_flowid = 1; break; #ifdef RSS case IP_RSSBUCKETID: if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { error = EINVAL; break; } /* This is just a placeholder for now */ break; #endif /* RSS */ default: error = ENOPROTOOPT; break; } if (error) break; } m_freem(control); } if (error) { INP_RUNLOCK(inp); m_freem(m); return (error); } /* * Depending on whether or not the application has bound or connected * the socket, we may have to do varying levels of work. The optimal * case is for a connected UDP socket, as a global lock isn't * required at all. * * In order to decide which we need, we require stability of the * inpcb binding, which we ensure by acquiring a read lock on the * inpcb. This doesn't strictly follow the lock order, so we play * the trylock and retry game; note that we may end up with more * conservative locks than required the second time around, so later * assertions have to accept that. Further analysis of the number of * misses under contention is required. * * XXXRW: Check that hash locking update here is correct. */ pr = inp->inp_socket->so_proto->pr_protocol; pcbinfo = get_inpcbinfo(pr); sin = (struct sockaddr_in *)addr; if (sin != NULL && (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) { INP_RUNLOCK(inp); INP_WLOCK(inp); INP_HASH_WLOCK(pcbinfo); unlock_udbinfo = UH_WLOCKED; } else if ((sin != NULL && ( (sin->sin_addr.s_addr == INADDR_ANY) || (sin->sin_addr.s_addr == INADDR_BROADCAST) || (inp->inp_laddr.s_addr == INADDR_ANY) || (inp->inp_lport == 0))) || (src.sin_family == AF_INET)) { INP_HASH_RLOCK(pcbinfo); unlock_udbinfo = UH_RLOCKED; } else unlock_udbinfo = UH_UNLOCKED; /* * If the IP_SENDSRCADDR control message was specified, override the * source address for this datagram. Its use is invalidated if the * address thus specified is incomplete or clobbers other inpcbs. */ laddr = inp->inp_laddr; lport = inp->inp_lport; if (src.sin_family == AF_INET) { INP_HASH_LOCK_ASSERT(pcbinfo); if ((lport == 0) || (laddr.s_addr == INADDR_ANY && src.sin_addr.s_addr == INADDR_ANY)) { error = EINVAL; goto release; } error = in_pcbbind_setup(inp, (struct sockaddr *)&src, &laddr.s_addr, &lport, td->td_ucred); if (error) goto release; } /* * If a UDP socket has been connected, then a local address/port will * have been selected and bound. * * If a UDP socket has not been connected to, then an explicit * destination address must be used, in which case a local * address/port may not have been selected and bound. */ if (sin != NULL) { INP_LOCK_ASSERT(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { error = EISCONN; goto release; } /* * Jail may rewrite the destination address, so let it do * that before we use it. */ error = prison_remote_ip4(td->td_ucred, &sin->sin_addr); if (error) goto release; /* * If a local address or port hasn't yet been selected, or if * the destination address needs to be rewritten due to using * a special INADDR_ constant, invoke in_pcbconnect_setup() * to do the heavy lifting. Once a port is selected, we * commit the binding back to the socket; we also commit the * binding of the address if in jail. * * If we already have a valid binding and we're not * requesting a destination address rewrite, use a fast path. */ if (inp->inp_laddr.s_addr == INADDR_ANY || inp->inp_lport == 0 || sin->sin_addr.s_addr == INADDR_ANY || sin->sin_addr.s_addr == INADDR_BROADCAST) { INP_HASH_LOCK_ASSERT(pcbinfo); error = in_pcbconnect_setup(inp, addr, &laddr.s_addr, &lport, &faddr.s_addr, &fport, NULL, td->td_ucred); if (error) goto release; /* * XXXRW: Why not commit the port if the address is * !INADDR_ANY? */ /* Commit the local port if newly assigned. */ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); /* * Remember addr if jailed, to prevent * rebinding. */ if (prison_flag(td->td_ucred, PR_IP4)) inp->inp_laddr = laddr; inp->inp_lport = lport; if (in_pcbinshash(inp) != 0) { inp->inp_lport = 0; error = EAGAIN; goto release; } inp->inp_flags |= INP_ANONPORT; } } else { faddr = sin->sin_addr; fport = sin->sin_port; } } else { INP_LOCK_ASSERT(inp); faddr = inp->inp_faddr; fport = inp->inp_fport; if (faddr.s_addr == INADDR_ANY) { error = ENOTCONN; goto release; } } /* * Calculate data length and get a mbuf for UDP, IP, and possible * link-layer headers. Immediate slide the data pointer back forward * since we won't use that space at this layer. */ M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto release; } m->m_data += max_linkhdr; m->m_len -= max_linkhdr; m->m_pkthdr.len -= max_linkhdr; /* * Fill in mbuf with extended UDP header and addresses and length put * into network format. */ ui = mtod(m, struct udpiphdr *); bzero(ui->ui_x1, sizeof(ui->ui_x1)); /* XXX still needed? */ ui->ui_pr = pr; ui->ui_src = laddr; ui->ui_dst = faddr; ui->ui_sport = lport; ui->ui_dport = fport; ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr)); if (pr == IPPROTO_UDPLITE) { struct udpcb *up; uint16_t plen; up = intoudpcb(inp); cscov = up->u_txcslen; plen = (u_short)len + sizeof(struct udphdr); if (cscov >= plen) cscov = 0; ui->ui_len = htons(plen); ui->ui_ulen = htons(cscov); /* * For UDP-Lite, checksum coverage length of zero means * the entire UDPLite packet is covered by the checksum. */ cscov_partial = (cscov == 0) ? 0 : 1; } else ui->ui_v = IPVERSION << 4; /* * Set the Don't Fragment bit in the IP header. */ if (inp->inp_flags & INP_DONTFRAG) { struct ip *ip; ip = (struct ip *)&ui->ui_i; ip->ip_off |= htons(IP_DF); } ipflags = 0; if (inp->inp_socket->so_options & SO_DONTROUTE) ipflags |= IP_ROUTETOIF; if (inp->inp_socket->so_options & SO_BROADCAST) ipflags |= IP_ALLOWBROADCAST; if (inp->inp_flags & INP_ONESBCAST) ipflags |= IP_SENDONES; #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif /* * Set up checksum and output datagram. */ ui->ui_sum = 0; if (pr == IPPROTO_UDPLITE) { if (inp->inp_flags & INP_ONESBCAST) faddr.s_addr = INADDR_BROADCAST; if (cscov_partial) { if ((ui->ui_sum = in_cksum(m, sizeof(struct ip) + cscov)) == 0) ui->ui_sum = 0xffff; } else { if ((ui->ui_sum = in_cksum(m, sizeof(struct udpiphdr) + len)) == 0) ui->ui_sum = 0xffff; } } else if (V_udp_cksum) { if (inp->inp_flags & INP_ONESBCAST) faddr.s_addr = INADDR_BROADCAST; ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr, htons((u_short)len + sizeof(struct udphdr) + pr)); m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); } ((struct ip *)ui)->ip_len = htons(sizeof(struct udpiphdr) + len); ((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl; /* XXX */ ((struct ip *)ui)->ip_tos = tos; /* XXX */ UDPSTAT_INC(udps_opackets); /* * Setup flowid / RSS information for outbound socket. * * Once the UDP code decides to set a flowid some other way, * this allows the flowid to be overridden by userland. */ if (use_flowid) { m->m_flags |= M_FLOWID; m->m_pkthdr.flowid = flowid; M_HASHTYPE_SET(m, flowid_type); #ifdef RSS } else { uint32_t hash_val, hash_type; /* * Calculate an appropriate RSS hash for UDP and * UDP Lite. * * The called function will take care of figuring out * whether a 2-tuple or 4-tuple hash is required based * on the currently configured scheme. * * Later later on connected socket values should be * cached in the inpcb and reused, rather than constantly * re-calculating it. * * UDP Lite is a different protocol number and will * likely end up being hashed as a 2-tuple until * RSS / NICs grow UDP Lite protocol awareness. */ if (rss_proto_software_hash_v4(faddr, laddr, fport, lport, pr, &hash_val, &hash_type) == 0) { m->m_pkthdr.flowid = hash_val; m->m_flags |= M_FLOWID; M_HASHTYPE_SET(m, hash_type); } #endif } #ifdef RSS /* * Don't override with the inp cached flowid value. * * Depending upon the kind of send being done, the inp * flowid/flowtype values may actually not be appropriate * for this particular socket send. * * We should either leave the flowid at zero (which is what is * currently done) or set it to some software generated * hash value based on the packet contents. */ ipflags |= IP_NODEFAULTFLOWID; #endif /* RSS */ if (unlock_udbinfo == UH_WLOCKED) INP_HASH_WUNLOCK(pcbinfo); else if (unlock_udbinfo == UH_RLOCKED) INP_HASH_RUNLOCK(pcbinfo); UDP_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u); error = ip_output(m, inp->inp_options, NULL, ipflags, inp->inp_moptions, inp); if (unlock_udbinfo == UH_WLOCKED) INP_WUNLOCK(inp); else INP_RUNLOCK(inp); return (error); release: if (unlock_udbinfo == UH_WLOCKED) { INP_HASH_WUNLOCK(pcbinfo); INP_WUNLOCK(inp); } else if (unlock_udbinfo == UH_RLOCKED) { INP_HASH_RUNLOCK(pcbinfo); INP_RUNLOCK(inp); } else INP_RUNLOCK(inp); m_freem(m); return (error); } #if defined(IPSEC) && defined(IPSEC_NAT_T) /* * Potentially decap ESP in UDP frame. Check for an ESP header * and optional marker; if present, strip the UDP header and * push the result through IPSec. * * Returns mbuf to be processed (potentially re-allocated) or * NULL if consumed and/or processed. */ static struct mbuf * udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off) { size_t minlen, payload, skip, iphlen; caddr_t data; struct udpcb *up; struct m_tag *tag; struct udphdr *udphdr; struct ip *ip; INP_RLOCK_ASSERT(inp); /* * Pull up data so the longest case is contiguous: * IP/UDP hdr + non ESP marker + ESP hdr. */ minlen = off + sizeof(uint64_t) + sizeof(struct esp); if (minlen > m->m_pkthdr.len) minlen = m->m_pkthdr.len; if ((m = m_pullup(m, minlen)) == NULL) { IPSECSTAT_INC(ips_in_inval); return (NULL); /* Bypass caller processing. */ } data = mtod(m, caddr_t); /* Points to ip header. */ payload = m->m_len - off; /* Size of payload. */ if (payload == 1 && data[off] == '\xff') return (m); /* NB: keepalive packet, no decap. */ up = intoudpcb(inp); KASSERT(up != NULL, ("%s: udpcb NULL", __func__)); KASSERT((up->u_flags & UF_ESPINUDP_ALL) != 0, ("u_flags 0x%x", up->u_flags)); /* * Check that the payload is large enough to hold an * ESP header and compute the amount of data to remove. * * NB: the caller has already done a pullup for us. * XXX can we assume alignment and eliminate bcopys? */ if (up->u_flags & UF_ESPINUDP_NON_IKE) { /* * draft-ietf-ipsec-nat-t-ike-0[01].txt and * draft-ietf-ipsec-udp-encaps-(00/)01.txt, ignoring * possible AH mode non-IKE marker+non-ESP marker * from draft-ietf-ipsec-udp-encaps-00.txt. */ uint64_t marker; if (payload <= sizeof(uint64_t) + sizeof(struct esp)) return (m); /* NB: no decap. */ bcopy(data + off, &marker, sizeof(uint64_t)); if (marker != 0) /* Non-IKE marker. */ return (m); /* NB: no decap. */ skip = sizeof(uint64_t) + sizeof(struct udphdr); } else { uint32_t spi; if (payload <= sizeof(struct esp)) { IPSECSTAT_INC(ips_in_inval); m_freem(m); return (NULL); /* Discard. */ } bcopy(data + off, &spi, sizeof(uint32_t)); if (spi == 0) /* Non-ESP marker. */ return (m); /* NB: no decap. */ skip = sizeof(struct udphdr); } /* * Setup a PACKET_TAG_IPSEC_NAT_T_PORT tag to remember * the UDP ports. This is required if we want to select * the right SPD for multiple hosts behind same NAT. * * NB: ports are maintained in network byte order everywhere * in the NAT-T code. */ tag = m_tag_get(PACKET_TAG_IPSEC_NAT_T_PORTS, 2 * sizeof(uint16_t), M_NOWAIT); if (tag == NULL) { IPSECSTAT_INC(ips_in_nomem); m_freem(m); return (NULL); /* Discard. */ } iphlen = off - sizeof(struct udphdr); udphdr = (struct udphdr *)(data + iphlen); ((uint16_t *)(tag + 1))[0] = udphdr->uh_sport; ((uint16_t *)(tag + 1))[1] = udphdr->uh_dport; m_tag_prepend(m, tag); /* * Remove the UDP header (and possibly the non ESP marker) * IP header length is iphlen * Before: * <--- off ---> * +----+------+-----+ * | IP | UDP | ESP | * +----+------+-----+ * <-skip-> * After: * +----+-----+ * | IP | ESP | * +----+-----+ * <-skip-> */ ovbcopy(data, data + skip, iphlen); m_adj(m, skip); ip = mtod(m, struct ip *); ip->ip_len = htons(ntohs(ip->ip_len) - skip); ip->ip_p = IPPROTO_ESP; /* * We cannot yet update the cksums so clear any * h/w cksum flags as they are no longer valid. */ if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) m->m_pkthdr.csum_flags &= ~(CSUM_DATA_VALID|CSUM_PSEUDO_HDR); (void) ipsec4_common_input(m, iphlen, ip->ip_p); return (NULL); /* NB: consumed, bypass processing. */ } #endif /* defined(IPSEC) && defined(IPSEC_NAT_T) */ static void udp_abort(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_abort: inp == NULL")); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { INP_HASH_WLOCK(pcbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; INP_HASH_WUNLOCK(pcbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); } static int udp_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; int error; pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp == NULL, ("udp_attach: inp != NULL")); error = soreserve(so, udp_sendspace, udp_recvspace); if (error) return (error); INP_INFO_WLOCK(pcbinfo); error = in_pcballoc(so, pcbinfo); if (error) { INP_INFO_WUNLOCK(pcbinfo); return (error); } inp = sotoinpcb(so); inp->inp_vflag |= INP_IPV4; inp->inp_ip_ttl = V_ip_defttl; error = udp_newudpcb(inp); if (error) { in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(pcbinfo); return (error); } INP_WUNLOCK(inp); INP_INFO_WUNLOCK(pcbinfo); return (0); } #endif /* INET */ int udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f, void *ctx) { struct inpcb *inp; struct udpcb *up; KASSERT(so->so_type == SOCK_DGRAM, ("udp_set_kernel_tunneling: !dgram")); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_set_kernel_tunneling: inp == NULL")); INP_WLOCK(inp); up = intoudpcb(inp); if (up->u_tun_func != NULL) { INP_WUNLOCK(inp); return (EBUSY); } up->u_tun_func = f; up->u_tun_ctx = ctx; INP_WUNLOCK(inp); return (0); } #ifdef INET static int udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; int error; pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_bind: inp == NULL")); INP_WLOCK(inp); INP_HASH_WLOCK(pcbinfo); error = in_pcbbind(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); INP_WUNLOCK(inp); return (error); } static void udp_close(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_close: inp == NULL")); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { INP_HASH_WLOCK(pcbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; INP_HASH_WUNLOCK(pcbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); } static int udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; struct sockaddr_in *sin; int error; pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_connect: inp == NULL")); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { INP_WUNLOCK(inp); return (EISCONN); } sin = (struct sockaddr_in *)nam; error = prison_remote_ip4(td->td_ucred, &sin->sin_addr); if (error != 0) { INP_WUNLOCK(inp); return (error); } INP_HASH_WLOCK(pcbinfo); error = in_pcbconnect(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); if (error == 0) soisconnected(so); INP_WUNLOCK(inp); return (error); } static void udp_detach(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; struct udpcb *up; pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_detach: inp == NULL")); KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, ("udp_detach: not disconnected")); INP_INFO_WLOCK(pcbinfo); INP_WLOCK(inp); up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); inp->inp_ppcb = NULL; in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_WUNLOCK(pcbinfo); udp_discardcb(up); } static int udp_disconnect(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_disconnect: inp == NULL")); INP_WLOCK(inp); if (inp->inp_faddr.s_addr == INADDR_ANY) { INP_WUNLOCK(inp); return (ENOTCONN); } INP_HASH_WLOCK(pcbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; INP_HASH_WUNLOCK(pcbinfo); SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; /* XXX */ SOCK_UNLOCK(so); INP_WUNLOCK(inp); return (0); } static int udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_send: inp == NULL")); return (udp_output(inp, m, addr, control, td)); } #endif /* INET */ int udp_shutdown(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_shutdown: inp == NULL")); INP_WLOCK(inp); socantsendmore(so); INP_WUNLOCK(inp); return (0); } #ifdef INET struct pr_usrreqs udp_usrreqs = { .pru_abort = udp_abort, .pru_attach = udp_attach, .pru_bind = udp_bind, .pru_connect = udp_connect, .pru_control = in_control, .pru_detach = udp_detach, .pru_disconnect = udp_disconnect, .pru_peeraddr = in_getpeeraddr, .pru_send = udp_send, .pru_soreceive = soreceive_dgram, .pru_sosend = sosend_dgram, .pru_shutdown = udp_shutdown, .pru_sockaddr = in_getsockaddr, .pru_sosetlabel = in_pcbsosetlabel, .pru_close = udp_close, }; #endif /* INET */ Index: user/dchagin/lemul/sys/netinet6/in6_gif.c =================================================================== --- user/dchagin/lemul/sys/netinet6/in6_gif.c (revision 274240) +++ user/dchagin/lemul/sys/netinet6/in6_gif.c (revision 274241) @@ -1,249 +1,249 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_gif.c,v 1.49 2001/05/14 14:02:17 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #endif #include #ifdef INET6 #include #include #include #include #endif #include #ifdef INET6 #include #endif #include VNET_DEFINE(int, ip6_gif_hlim) = GIF_HLIM; #define V_ip6_gif_hlim VNET(ip6_gif_hlim) SYSCTL_DECL(_net_inet6_ip6); -SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_GIF_HLIM, gifhlim, CTLFLAG_RW, +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_GIF_HLIM, gifhlim, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_gif_hlim), 0, ""); static int gif_validate6(const struct ip6_hdr *, struct gif_softc *, struct ifnet *); extern struct domain inet6domain; struct protosw in6_gif_protosw = { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = 0, /* IPPROTO_IPV[46] */ .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = in6_gif_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }; int in6_gif_output(struct ifnet *ifp, struct mbuf *m, int proto, uint8_t ecn) { GIF_RLOCK_TRACKER; struct gif_softc *sc = ifp->if_softc; struct ip6_hdr *ip6; int len; /* prepend new IP header */ len = sizeof(struct ip6_hdr); #ifndef __NO_STRICT_ALIGNMENT if (proto == IPPROTO_ETHERIP) len += ETHERIP_ALIGN; #endif M_PREPEND(m, len, M_NOWAIT); if (m == NULL) return (ENOBUFS); #ifndef __NO_STRICT_ALIGNMENT if (proto == IPPROTO_ETHERIP) { len = mtod(m, vm_offset_t) & 3; KASSERT(len == 0 || len == ETHERIP_ALIGN, ("in6_gif_output: unexpected misalignment")); m->m_data += len; m->m_len -= ETHERIP_ALIGN; } #endif ip6 = mtod(m, struct ip6_hdr *); GIF_RLOCK(sc); if (sc->gif_family != AF_INET6) { m_freem(m); GIF_RUNLOCK(sc); return (ENETDOWN); } bcopy(sc->gif_ip6hdr, ip6, sizeof(struct ip6_hdr)); GIF_RUNLOCK(sc); ip6->ip6_flow |= htonl((uint32_t)ecn << 20); ip6->ip6_nxt = proto; ip6->ip6_hlim = V_ip6_gif_hlim; /* * force fragmentation to minimum MTU, to avoid path MTU discovery. * it is too painful to ask for resend of inner packet, to achieve * path MTU discovery for encapsulated packets. */ return (ip6_output(m, 0, NULL, IPV6_MINMTU, 0, NULL, NULL)); } int in6_gif_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct ifnet *gifp; struct gif_softc *sc; struct ip6_hdr *ip6; uint8_t ecn; sc = encap_getarg(m); if (sc == NULL) { m_freem(m); IP6STAT_INC(ip6s_nogif); return (IPPROTO_DONE); } gifp = GIF2IFP(sc); if ((gifp->if_flags & IFF_UP) != 0) { ip6 = mtod(m, struct ip6_hdr *); ecn = (ntohl(ip6->ip6_flow) >> 20) & 0xff; m_adj(m, *offp); gif_input(m, gifp, proto, ecn); } else { m_freem(m); IP6STAT_INC(ip6s_nogif); } return (IPPROTO_DONE); } /* * validate outer address. */ static int gif_validate6(const struct ip6_hdr *ip6, struct gif_softc *sc, struct ifnet *ifp) { GIF_RLOCK_ASSERT(sc); /* * Check for address match. Note that the check is for an incoming * packet. We should compare the *source* address in our configuration * and the *destination* address of the packet, and vice versa. */ if (!IN6_ARE_ADDR_EQUAL(&sc->gif_ip6hdr->ip6_src, &ip6->ip6_dst) || !IN6_ARE_ADDR_EQUAL(&sc->gif_ip6hdr->ip6_dst, &ip6->ip6_src)) return (0); /* martian filters on outer source - done in ip6_input */ /* ingress filters on outer source */ if ((GIF2IFP(sc)->if_flags & IFF_LINK2) == 0 && ifp) { struct sockaddr_in6 sin6; struct rtentry *rt; bzero(&sin6, sizeof(sin6)); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_addr = ip6->ip6_src; sin6.sin6_scope_id = 0; /* XXX */ rt = in6_rtalloc1((struct sockaddr *)&sin6, 0, 0UL, sc->gif_fibnum); if (!rt || rt->rt_ifp != ifp) { if (rt) RTFREE_LOCKED(rt); return (0); } RTFREE_LOCKED(rt); } return (128 * 2); } /* * we know that we are in IFF_UP, outer address available, and outer family * matched the physical addr family. see gif_encapcheck(). */ int in6_gif_encapcheck(const struct mbuf *m, int off, int proto, void *arg) { struct ip6_hdr ip6; struct gif_softc *sc; struct ifnet *ifp; /* sanity check done in caller */ sc = (struct gif_softc *)arg; GIF_RLOCK_ASSERT(sc); m_copydata(m, 0, sizeof(ip6), (caddr_t)&ip6); ifp = ((m->m_flags & M_PKTHDR) != 0) ? m->m_pkthdr.rcvif : NULL; return (gif_validate6(&ip6, sc, ifp)); } int in6_gif_attach(struct gif_softc *sc) { KASSERT(sc->gif_ecookie == NULL, ("gif_ecookie isn't NULL")); sc->gif_ecookie = encap_attach_func(AF_INET6, -1, gif_encapcheck, (void *)&in6_gif_protosw, sc); if (sc->gif_ecookie == NULL) return (EEXIST); return (0); } Index: user/dchagin/lemul/sys/netinet6/in6_proto.c =================================================================== --- user/dchagin/lemul/sys/netinet6/in6_proto.c (revision 274240) +++ user/dchagin/lemul/sys/netinet6/in6_proto.c (revision 274241) @@ -1,620 +1,619 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_proto.c,v 1.91 2001/05/27 13:28:35 itojun Exp $ */ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_proto.c 8.1 (Berkeley) 6/10/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_ipstealth.h" #include "opt_sctp.h" #include "opt_mpath.h" #include "opt_route.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SCTP #include #include #include #include #include #endif /* SCTP */ #ifdef IPSEC #include #include #endif /* IPSEC */ #include /* * TCP/IP protocol family: IP6, ICMP6, UDP, TCP. */ FEATURE(inet6, "Internet Protocol version 6"); extern struct domain inet6domain; static struct pr_usrreqs nousrreqs; #define PR_LISTEN 0 #define PR_ABRTACPTDIS 0 /* Spacer for loadable protocols. */ #define IP6PROTOSPACER \ { \ .pr_domain = &inet6domain, \ .pr_protocol = PROTO_SPACER, \ .pr_usrreqs = &nousrreqs \ } struct protosw inet6sw[] = { { .pr_type = 0, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPV6, .pr_init = ip6_init, #ifdef VIMAGE .pr_destroy = ip6_destroy, #endif .pr_slowtimo = frag6_slowtimo, .pr_drain = frag6_drain, .pr_usrreqs = &nousrreqs, }, { .pr_type = SOCK_DGRAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_UDP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = udp6_input, .pr_ctlinput = udp6_ctlinput, .pr_ctloutput = ip6_ctloutput, #ifndef INET /* Do not call initialization twice. */ .pr_init = udp_init, #endif .pr_usrreqs = &udp6_usrreqs, }, { .pr_type = SOCK_STREAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_TCP, .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_LISTEN, .pr_input = tcp6_input, .pr_ctlinput = tcp6_ctlinput, .pr_ctloutput = tcp_ctloutput, #ifndef INET /* don't call initialization and timeout routines twice */ .pr_init = tcp_init, .pr_slowtimo = tcp_slowtimo, #endif .pr_drain = tcp_drain, .pr_usrreqs = &tcp6_usrreqs, }, #ifdef SCTP { .pr_type = SOCK_SEQPACKET, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_WANTRCVD, .pr_input = sctp6_input, .pr_ctlinput = sctp6_ctlinput, .pr_ctloutput = sctp_ctloutput, .pr_drain = sctp_drain, #ifndef INET /* Do not call initialization twice. */ .pr_init = sctp_init, #endif .pr_usrreqs = &sctp6_usrreqs }, { .pr_type = SOCK_STREAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_WANTRCVD, .pr_input = sctp6_input, .pr_ctlinput = sctp6_ctlinput, .pr_ctloutput = sctp_ctloutput, .pr_drain = sctp_drain, .pr_usrreqs = &sctp6_usrreqs }, #endif /* SCTP */ { .pr_type = SOCK_DGRAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_UDPLITE, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = udp6_input, .pr_ctlinput = udplite6_ctlinput, .pr_ctloutput = udp_ctloutput, #ifndef INET /* Do not call initialization twice. */ .pr_init = udplite_init, #endif .pr_usrreqs = &udp6_usrreqs, }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_RAW, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = rip6_input, .pr_output = rip6_output, .pr_ctlinput = rip6_ctlinput, .pr_ctloutput = rip6_ctloutput, #ifndef INET /* Do not call initialization twice. */ .pr_init = rip_init, #endif .pr_usrreqs = &rip6_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_ICMPV6, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = icmp6_input, .pr_output = rip6_output, .pr_ctlinput = rip6_ctlinput, .pr_ctloutput = rip6_ctloutput, .pr_fasttimo = icmp6_fasttimo, .pr_slowtimo = icmp6_slowtimo, .pr_usrreqs = &rip6_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_DSTOPTS, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = dest6_input, .pr_usrreqs = &nousrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_ROUTING, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = route6_input, .pr_usrreqs = &nousrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_FRAGMENT, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = frag6_input, .pr_usrreqs = &nousrreqs }, #ifdef IPSEC { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_AH, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = ipsec6_common_input, .pr_usrreqs = &nousrreqs, }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_ESP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = ipsec6_common_input, .pr_ctlinput = esp6_ctlinput, .pr_usrreqs = &nousrreqs, }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPCOMP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = ipsec6_common_input, .pr_usrreqs = &nousrreqs, }, #endif /* IPSEC */ #ifdef INET { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPV4, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_init = encap_init, .pr_usrreqs = &rip6_usrreqs }, #endif /* INET */ { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPV6, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_init = encap_init, .pr_usrreqs = &rip6_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_PIM, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }, /* Spacer n-times for loadable protocols. */ IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, /* raw wildcard */ { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = rip6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }, }; extern int in6_inithead(void **, int); #ifdef VIMAGE extern int in6_detachhead(void **, int); #endif struct domain inet6domain = { .dom_family = AF_INET6, .dom_name = "internet6", .dom_protosw = (struct protosw *)inet6sw, .dom_protoswNPROTOSW = (struct protosw *) &inet6sw[sizeof(inet6sw)/sizeof(inet6sw[0])], #ifdef RADIX_MPATH .dom_rtattach = rn6_mpath_inithead, #else .dom_rtattach = in6_inithead, #endif #ifdef VIMAGE .dom_rtdetach = in6_detachhead, #endif .dom_ifattach = in6_domifattach, .dom_ifdetach = in6_domifdetach, .dom_ifmtu = in6_domifmtu }; VNET_DOMAIN_SET(inet6); /* * Internet configuration info */ #ifndef IPV6FORWARDING #ifdef GATEWAY6 #define IPV6FORWARDING 1 /* forward IP6 packets not for us */ #else #define IPV6FORWARDING 0 /* don't forward IP6 packets not for us */ #endif /* GATEWAY6 */ #endif /* !IPV6FORWARDING */ #ifndef IPV6_SENDREDIRECTS #define IPV6_SENDREDIRECTS 1 #endif VNET_DEFINE(int, ip6_forwarding) = IPV6FORWARDING; /* act as router? */ VNET_DEFINE(int, ip6_sendredirects) = IPV6_SENDREDIRECTS; VNET_DEFINE(int, ip6_defhlim) = IPV6_DEFHLIM; VNET_DEFINE(int, ip6_defmcasthlim) = IPV6_DEFAULT_MULTICAST_HOPS; VNET_DEFINE(int, ip6_accept_rtadv) = 0; VNET_DEFINE(int, ip6_no_radr) = 0; VNET_DEFINE(int, ip6_norbit_raif) = 0; VNET_DEFINE(int, ip6_rfc6204w3) = 0; VNET_DEFINE(int, ip6_maxfragpackets); /* initialized in frag6.c:frag6_init() */ VNET_DEFINE(int, ip6_maxfrags); /* initialized in frag6.c:frag6_init() */ VNET_DEFINE(int, ip6_log_interval) = 5; VNET_DEFINE(int, ip6_hdrnestlimit) = 15;/* How many header options will we * process? */ VNET_DEFINE(int, ip6_dad_count) = 1; /* DupAddrDetectionTransmits */ VNET_DEFINE(int, ip6_auto_flowlabel) = 1; VNET_DEFINE(int, ip6_use_deprecated) = 1;/* allow deprecated addr * (RFC2462 5.5.4) */ VNET_DEFINE(int, ip6_rr_prune) = 5; /* router renumbering prefix * walk list every 5 sec. */ VNET_DEFINE(int, ip6_mcast_pmtu) = 0; /* enable pMTU discovery for multicast? */ VNET_DEFINE(int, ip6_v6only) = 1; VNET_DEFINE(int, ip6_keepfaith) = 0; VNET_DEFINE(time_t, ip6_log_time) = (time_t)0L; #ifdef IPSTEALTH VNET_DEFINE(int, ip6stealth) = 0; #endif VNET_DEFINE(int, nd6_onlink_ns_rfc4861) = 0;/* allow 'on-link' nd6 NS * (RFC 4861) */ /* icmp6 */ /* * BSDI4 defines these variables in in_proto.c... * XXX: what if we don't define INET? Should we define pmtu6_expire * or so? (jinmei@kame.net 19990310) */ VNET_DEFINE(int, pmtu_expire) = 60*10; VNET_DEFINE(int, pmtu_probe) = 60*2; /* ICMPV6 parameters */ VNET_DEFINE(int, icmp6_rediraccept) = 1;/* accept and process redirects */ VNET_DEFINE(int, icmp6_redirtimeout) = 10 * 60; /* 10 minutes */ VNET_DEFINE(int, icmp6errppslim) = 100; /* 100pps */ /* control how to respond to NI queries */ VNET_DEFINE(int, icmp6_nodeinfo) = (ICMP6_NODEINFO_FQDNOK|ICMP6_NODEINFO_NODEADDROK); VNET_DEFINE(int, icmp6_nodeinfo_oldmcprefix) = 1; /* * sysctl related items. */ SYSCTL_NODE(_net, PF_INET6, inet6, CTLFLAG_RW, 0, "Internet6 Family"); /* net.inet6 */ SYSCTL_NODE(_net_inet6, IPPROTO_IPV6, ip6, CTLFLAG_RW, 0, "IP6"); SYSCTL_NODE(_net_inet6, IPPROTO_ICMPV6, icmp6, CTLFLAG_RW, 0, "ICMP6"); SYSCTL_NODE(_net_inet6, IPPROTO_UDP, udp6, CTLFLAG_RW, 0, "UDP6"); SYSCTL_NODE(_net_inet6, IPPROTO_TCP, tcp6, CTLFLAG_RW, 0, "TCP6"); #ifdef SCTP SYSCTL_NODE(_net_inet6, IPPROTO_SCTP, sctp6, CTLFLAG_RW, 0, "SCTP6"); #endif #ifdef IPSEC SYSCTL_NODE(_net_inet6, IPPROTO_ESP, ipsec6, CTLFLAG_RW, 0, "IPSEC6"); #endif /* IPSEC */ /* net.inet6.ip6 */ static int sysctl_ip6_temppltime(SYSCTL_HANDLER_ARGS) { int error = 0; int old; - VNET_SYSCTL_ARG(req, arg1); - error = SYSCTL_OUT(req, arg1, sizeof(int)); if (error || !req->newptr) return (error); old = V_ip6_temp_preferred_lifetime; error = SYSCTL_IN(req, arg1, sizeof(int)); if (V_ip6_temp_preferred_lifetime < V_ip6_desync_factor + V_ip6_temp_regen_advance) { V_ip6_temp_preferred_lifetime = old; return (EINVAL); } return (error); } static int sysctl_ip6_tempvltime(SYSCTL_HANDLER_ARGS) { int error = 0; int old; - VNET_SYSCTL_ARG(req, arg1); - error = SYSCTL_OUT(req, arg1, sizeof(int)); if (error || !req->newptr) return (error); old = V_ip6_temp_valid_lifetime; error = SYSCTL_IN(req, arg1, sizeof(int)); if (V_ip6_temp_valid_lifetime < V_ip6_temp_preferred_lifetime) { V_ip6_temp_preferred_lifetime = old; return (EINVAL); } return (error); } -SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_FORWARDING, forwarding, CTLFLAG_RW, - &VNET_NAME(ip6_forwarding), 0, ""); -SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_SENDREDIRECTS, redirect, CTLFLAG_RW, - &VNET_NAME(ip6_sendredirects), 0, ""); -SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_DEFHLIM, hlim, CTLFLAG_RW, +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_FORWARDING, forwarding, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_forwarding), 0, ""); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_SENDREDIRECTS, redirect, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_sendredirects), 0, ""); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DEFHLIM, hlim, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_defhlim), 0, ""); SYSCTL_VNET_PCPUSTAT(_net_inet6_ip6, IPV6CTL_STATS, stats, struct ip6stat, - ip6stat, "IP6 statistics (struct ip6stat, netinet6/ip6_var.h)"); -SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGPACKETS, maxfragpackets, - CTLFLAG_RW, &VNET_NAME(ip6_maxfragpackets), 0, ""); -SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_ACCEPT_RTADV, accept_rtadv, - CTLFLAG_RW, &VNET_NAME(ip6_accept_rtadv), 0, + ip6stat, "IP6 statistics (struct ip6stat, netinet6/ip6_var.h)"); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGPACKETS, maxfragpackets, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfragpackets), 0, ""); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_ACCEPT_RTADV, accept_rtadv, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_accept_rtadv), 0, "Default value of per-interface flag for accepting ICMPv6 Router" "Advertisement messages"); -SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_NO_RADR, no_radr, - CTLFLAG_RW, &VNET_NAME(ip6_no_radr), 0, +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_NO_RADR, no_radr, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_no_radr), 0, "Default value of per-interface flag to control whether routers " "sending ICMPv6 RA messages on that interface are added into the " "default router list."); -SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_NORBIT_RAIF, norbit_raif, CTLFLAG_RW, - &VNET_NAME(ip6_norbit_raif), 0, +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_NORBIT_RAIF, norbit_raif, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_norbit_raif), 0, "Always set 0 to R flag in ICMPv6 NA messages when accepting RA" " on the interface."); -SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_RFC6204W3, rfc6204w3, - CTLFLAG_RW, &VNET_NAME(ip6_rfc6204w3), 0, +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RFC6204W3, rfc6204w3, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_rfc6204w3), 0, "Accept the default router list from ICMPv6 RA messages even " "when packet forwarding enabled."); -SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_KEEPFAITH, keepfaith, CTLFLAG_RW, - &VNET_NAME(ip6_keepfaith), 0, ""); -SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_LOG_INTERVAL, log_interval, - CTLFLAG_RW, &VNET_NAME(ip6_log_interval), 0, ""); -SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_HDRNESTLIMIT, hdrnestlimit, - CTLFLAG_RW, &VNET_NAME(ip6_hdrnestlimit), 0, ""); -SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_DAD_COUNT, dad_count, CTLFLAG_RW, - &VNET_NAME(ip6_dad_count), 0, ""); -SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_AUTO_FLOWLABEL, auto_flowlabel, - CTLFLAG_RW, &VNET_NAME(ip6_auto_flowlabel), 0, ""); -SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_DEFMCASTHLIM, defmcasthlim, - CTLFLAG_RW, &VNET_NAME(ip6_defmcasthlim), 0, ""); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_KEEPFAITH, keepfaith, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_keepfaith), 0, ""); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_LOG_INTERVAL, log_interval, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_log_interval), 0, ""); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_HDRNESTLIMIT, hdrnestlimit, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_hdrnestlimit), 0, ""); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DAD_COUNT, dad_count, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_dad_count), 0, ""); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_AUTO_FLOWLABEL, auto_flowlabel, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_auto_flowlabel), 0, ""); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DEFMCASTHLIM, defmcasthlim, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_defmcasthlim), 0, ""); SYSCTL_STRING(_net_inet6_ip6, IPV6CTL_KAME_VERSION, kame_version, CTLFLAG_RD, __KAME_VERSION, 0, ""); -SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_USE_DEPRECATED, use_deprecated, - CTLFLAG_RW, &VNET_NAME(ip6_use_deprecated), 0, ""); -SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_RR_PRUNE, rr_prune, CTLFLAG_RW, - &VNET_NAME(ip6_rr_prune), 0, ""); -SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_USETEMPADDR, use_tempaddr, - CTLFLAG_RW, &VNET_NAME(ip6_use_tempaddr), 0, ""); -SYSCTL_VNET_PROC(_net_inet6_ip6, IPV6CTL_TEMPPLTIME, temppltime, - CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ip6_temp_preferred_lifetime), 0, +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USE_DEPRECATED, use_deprecated, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_use_deprecated), 0, ""); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RR_PRUNE, rr_prune, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_rr_prune), 0, ""); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USETEMPADDR, use_tempaddr, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_use_tempaddr), 0, ""); +SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_TEMPPLTIME, temppltime, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, + &VNET_NAME(ip6_temp_preferred_lifetime), 0, sysctl_ip6_temppltime, "I", ""); -SYSCTL_VNET_PROC(_net_inet6_ip6, IPV6CTL_TEMPVLTIME, tempvltime, - CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ip6_temp_valid_lifetime), 0, +SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_TEMPVLTIME, tempvltime, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, + &VNET_NAME(ip6_temp_valid_lifetime), 0, sysctl_ip6_tempvltime, "I", ""); -SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_V6ONLY, v6only, CTLFLAG_RW, - &VNET_NAME(ip6_v6only), 0, ""); -SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_AUTO_LINKLOCAL, auto_linklocal, - CTLFLAG_RW, &VNET_NAME(ip6_auto_linklocal), 0, +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_V6ONLY, v6only, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_v6only), 0, ""); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_AUTO_LINKLOCAL, auto_linklocal, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_auto_linklocal), 0, "Default value of per-interface flag for automatically adding an IPv6" " link-local address to interfaces when attached"); SYSCTL_VNET_PCPUSTAT(_net_inet6_ip6, IPV6CTL_RIP6STATS, rip6stats, - struct rip6stat, rip6stat, - "Raw IP6 statistics (struct rip6stat, netinet6/raw_ip6.h)"); -SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_PREFER_TEMPADDR, prefer_tempaddr, - CTLFLAG_RW, &VNET_NAME(ip6_prefer_tempaddr), 0, ""); -SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_USE_DEFAULTZONE, use_defaultzone, - CTLFLAG_RW, &VNET_NAME(ip6_use_defzone), 0,""); -SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGS, maxfrags, CTLFLAG_RW, - &VNET_NAME(ip6_maxfrags), 0, ""); -SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_MCAST_PMTU, mcast_pmtu, CTLFLAG_RW, - &VNET_NAME(ip6_mcast_pmtu), 0, ""); + struct rip6stat, rip6stat, + "Raw IP6 statistics (struct rip6stat, netinet6/raw_ip6.h)"); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_PREFER_TEMPADDR, prefer_tempaddr, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_prefer_tempaddr), 0, ""); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USE_DEFAULTZONE, use_defaultzone, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_use_defzone), 0,""); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGS, maxfrags, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfrags), 0, ""); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MCAST_PMTU, mcast_pmtu, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_mcast_pmtu), 0, ""); #ifdef IPSTEALTH -SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_STEALTH, stealth, CTLFLAG_RW, +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_STEALTH, stealth, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6stealth), 0, ""); #endif /* net.inet6.icmp6 */ -SYSCTL_VNET_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRACCEPT, rediraccept, - CTLFLAG_RW, &VNET_NAME(icmp6_rediraccept), 0, ""); -SYSCTL_VNET_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRTIMEOUT, redirtimeout, - CTLFLAG_RW, &VNET_NAME(icmp6_redirtimeout), 0, ""); +SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRACCEPT, rediraccept, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_rediraccept), 0, ""); +SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRTIMEOUT, redirtimeout, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_redirtimeout), 0, ""); SYSCTL_VNET_PCPUSTAT(_net_inet6_icmp6, ICMPV6CTL_STATS, stats, - struct icmp6stat, icmp6stat, - "ICMPv6 statistics (struct icmp6stat, netinet/icmp6.h)"); -SYSCTL_VNET_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_PRUNE, nd6_prune, CTLFLAG_RW, - &VNET_NAME(nd6_prune), 0, ""); -SYSCTL_VNET_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_DELAY, nd6_delay, CTLFLAG_RW, - &VNET_NAME(nd6_delay), 0, ""); -SYSCTL_VNET_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_UMAXTRIES, nd6_umaxtries, - CTLFLAG_RW, &VNET_NAME(nd6_umaxtries), 0, ""); -SYSCTL_VNET_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MMAXTRIES, nd6_mmaxtries, - CTLFLAG_RW, &VNET_NAME(nd6_mmaxtries), 0, ""); -SYSCTL_VNET_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_USELOOPBACK, nd6_useloopback, - CTLFLAG_RW, &VNET_NAME(nd6_useloopback), 0, ""); -SYSCTL_VNET_INT(_net_inet6_icmp6, ICMPV6CTL_NODEINFO, nodeinfo, CTLFLAG_RW, - &VNET_NAME(icmp6_nodeinfo), 0, ""); -SYSCTL_VNET_INT(_net_inet6_icmp6, ICMPV6CTL_NODEINFO_OLDMCPREFIX, - nodeinfo_oldmcprefix, CTLFLAG_RW, + struct icmp6stat, icmp6stat, + "ICMPv6 statistics (struct icmp6stat, netinet/icmp6.h)"); +SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_PRUNE, nd6_prune, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_prune), 0, ""); +SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_DELAY, nd6_delay, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_delay), 0, ""); +SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_UMAXTRIES, nd6_umaxtries, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_umaxtries), 0, ""); +SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MMAXTRIES, nd6_mmaxtries, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_mmaxtries), 0, ""); +SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_USELOOPBACK, nd6_useloopback, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_useloopback), 0, ""); +SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_NODEINFO, nodeinfo, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_nodeinfo), 0, ""); +SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_NODEINFO_OLDMCPREFIX, + nodeinfo_oldmcprefix, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_nodeinfo_oldmcprefix), 0, "Join old IPv6 NI group address in draft-ietf-ipngwg-icmp-name-lookup" " for compatibility with KAME implememtation."); -SYSCTL_VNET_INT(_net_inet6_icmp6, ICMPV6CTL_ERRPPSLIMIT, errppslimit, - CTLFLAG_RW, &VNET_NAME(icmp6errppslim), 0, ""); -SYSCTL_VNET_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MAXNUDHINT, nd6_maxnudhint, - CTLFLAG_RW, &VNET_NAME(nd6_maxnudhint), 0, ""); -SYSCTL_VNET_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_DEBUG, nd6_debug, CTLFLAG_RW, - &VNET_NAME(nd6_debug), 0, ""); -SYSCTL_VNET_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_ONLINKNSRFC4861, - nd6_onlink_ns_rfc4861, CTLFLAG_RW, &VNET_NAME(nd6_onlink_ns_rfc4861), - 0, "Accept 'on-link' nd6 NS in compliance with RFC 4861."); +SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ERRPPSLIMIT, errppslimit, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6errppslim), 0, ""); +SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MAXNUDHINT, nd6_maxnudhint, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_maxnudhint), 0, ""); +SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_DEBUG, nd6_debug, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_debug), 0, ""); +SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_ONLINKNSRFC4861, + nd6_onlink_ns_rfc4861, CTLFLAG_VNET | CTLFLAG_RW, + &VNET_NAME(nd6_onlink_ns_rfc4861), 0, + "Accept 'on-link' nd6 NS in compliance with RFC 4861."); Index: user/dchagin/lemul/sys/netinet6/in6_rmx.c =================================================================== --- user/dchagin/lemul/sys/netinet6/in6_rmx.c (revision 274240) +++ user/dchagin/lemul/sys/netinet6/in6_rmx.c (revision 274241) @@ -1,341 +1,341 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_rmx.c,v 1.11 2001/07/26 06:53:16 jinmei Exp $ */ /*- * Copyright 1994, 1995 Massachusetts Institute of Technology * * Permission to use, copy, modify, and distribute this software and * its documentation for any purpose and without fee is hereby * granted, provided that both the above copyright notice and this * permission notice appear in all copies, that both the above * copyright notice and this permission notice appear in all * supporting documentation, and that the name of M.I.T. not be used * in advertising or publicity pertaining to distribution of the * software without specific, written prior permission. M.I.T. makes * no representations about the suitability of this software for any * purpose. It is provided "as is" without express or implied * warranty. * * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern int in6_inithead(void **head, int off); #ifdef VIMAGE extern int in6_detachhead(void **head, int off); #endif /* * Do what we need to do when inserting a route. */ static struct radix_node * in6_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, struct radix_node *treenodes) { struct rtentry *rt = (struct rtentry *)treenodes; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)rt_key(rt); struct radix_node *ret; RADIX_NODE_HEAD_WLOCK_ASSERT(head); if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) rt->rt_flags |= RTF_MULTICAST; /* * A little bit of help for both IPv6 output and input: * For local addresses, we make sure that RTF_LOCAL is set, * with the thought that this might one day be used to speed up * ip_input(). * * We also mark routes to multicast addresses as such, because * it's easy to do and might be useful (but this is much more * dubious since it's so easy to inspect the address). (This * is done above.) * * XXX * should elaborate the code. */ if (rt->rt_flags & RTF_HOST) { if (IN6_ARE_ADDR_EQUAL(&satosin6(rt->rt_ifa->ifa_addr) ->sin6_addr, &sin6->sin6_addr)) { rt->rt_flags |= RTF_LOCAL; } } if (rt->rt_ifp != NULL) { /* * Check route MTU: * inherit interface MTU if not set or * check if MTU is too large. */ if (rt->rt_mtu == 0) { rt->rt_mtu = IN6_LINKMTU(rt->rt_ifp); } else if (rt->rt_mtu > IN6_LINKMTU(rt->rt_ifp)) rt->rt_mtu = IN6_LINKMTU(rt->rt_ifp); } ret = rn_addroute(v_arg, n_arg, head, treenodes); if (ret == NULL) { struct rtentry *rt2; /* * We are trying to add a net route, but can't. * The following case should be allowed, so we'll make a * special check for this: * Two IPv6 addresses with the same prefix is assigned * to a single interrface. * # ifconfig if0 inet6 3ffe:0501::1 prefix 64 alias (*1) * # ifconfig if0 inet6 3ffe:0501::2 prefix 64 alias (*2) * In this case, (*1) and (*2) want to add the same * net route entry, 3ffe:0501:: -> if0. * This case should not raise an error. */ rt2 = in6_rtalloc1((struct sockaddr *)sin6, 0, RTF_RNH_LOCKED, rt->rt_fibnum); if (rt2) { if (((rt2->rt_flags & (RTF_HOST|RTF_GATEWAY)) == 0) && rt2->rt_gateway && rt2->rt_gateway->sa_family == AF_LINK && rt2->rt_ifp == rt->rt_ifp) { ret = rt2->rt_nodes; } RTFREE_LOCKED(rt2); } } return (ret); } SYSCTL_DECL(_net_inet6_ip6); static VNET_DEFINE(int, rtq_toomany6) = 128; /* 128 cached routes is ``too many'' */ #define V_rtq_toomany6 VNET(rtq_toomany6) -SYSCTL_VNET_INT(_net_inet6_ip6, IPV6CTL_RTMAXCACHE, rtmaxcache, CTLFLAG_RW, +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RTMAXCACHE, rtmaxcache, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(rtq_toomany6) , 0, ""); struct rtqk_arg { struct radix_node_head *rnh; int mode; int updating; int draining; int killed; int found; time_t nextstop; }; /* * Age old PMTUs. */ struct mtuex_arg { struct radix_node_head *rnh; time_t nextstop; }; static VNET_DEFINE(struct callout, rtq_mtutimer); #define V_rtq_mtutimer VNET(rtq_mtutimer) static int in6_mtuexpire(struct radix_node *rn, void *rock) { struct rtentry *rt = (struct rtentry *)rn; struct mtuex_arg *ap = rock; /* sanity */ if (!rt) panic("rt == NULL in in6_mtuexpire"); if (rt->rt_expire && !(rt->rt_flags & RTF_PROBEMTU)) { if (rt->rt_expire <= time_uptime) { rt->rt_flags |= RTF_PROBEMTU; } else { ap->nextstop = lmin(ap->nextstop, rt->rt_expire); } } return 0; } #define MTUTIMO_DEFAULT (60*1) static void in6_mtutimo_one(struct radix_node_head *rnh) { struct mtuex_arg arg; arg.rnh = rnh; arg.nextstop = time_uptime + MTUTIMO_DEFAULT; RADIX_NODE_HEAD_LOCK(rnh); rnh->rnh_walktree(rnh, in6_mtuexpire, &arg); RADIX_NODE_HEAD_UNLOCK(rnh); } static void in6_mtutimo(void *rock) { CURVNET_SET_QUIET((struct vnet *) rock); struct radix_node_head *rnh; struct timeval atv; u_int fibnum; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { rnh = rt_tables_get_rnh(fibnum, AF_INET6); if (rnh != NULL) in6_mtutimo_one(rnh); } atv.tv_sec = MTUTIMO_DEFAULT; atv.tv_usec = 0; callout_reset(&V_rtq_mtutimer, tvtohz(&atv), in6_mtutimo, rock); CURVNET_RESTORE(); } /* * Initialize our routing tree. */ static VNET_DEFINE(int, _in6_rt_was_here); #define V__in6_rt_was_here VNET(_in6_rt_was_here) int in6_inithead(void **head, int off) { struct radix_node_head *rnh; if (!rn_inithead(head, offsetof(struct sockaddr_in6, sin6_addr) << 3)) return (0); rnh = *head; RADIX_NODE_HEAD_LOCK_INIT(rnh); rnh->rnh_addaddr = in6_addroute; if (V__in6_rt_was_here == 0) { callout_init(&V_rtq_mtutimer, CALLOUT_MPSAFE); in6_mtutimo(curvnet); /* kick off timeout first time */ V__in6_rt_was_here = 1; } return (1); } #ifdef VIMAGE int in6_detachhead(void **head, int off) { callout_drain(&V_rtq_mtutimer); return (1); } #endif /* * Extended API for IPv6 FIB support. */ void in6_rtredirect(struct sockaddr *dst, struct sockaddr *gw, struct sockaddr *nm, int flags, struct sockaddr *src, u_int fibnum) { rtredirect_fib(dst, gw, nm, flags, src, fibnum); } int in6_rtrequest(int req, struct sockaddr *dst, struct sockaddr *gw, struct sockaddr *mask, int flags, struct rtentry **ret_nrt, u_int fibnum) { return (rtrequest_fib(req, dst, gw, mask, flags, ret_nrt, fibnum)); } void in6_rtalloc(struct route_in6 *ro, u_int fibnum) { rtalloc_ign_fib((struct route *)ro, 0ul, fibnum); } void in6_rtalloc_ign(struct route_in6 *ro, u_long ignflags, u_int fibnum) { rtalloc_ign_fib((struct route *)ro, ignflags, fibnum); } struct rtentry * in6_rtalloc1(struct sockaddr *dst, int report, u_long ignflags, u_int fibnum) { return (rtalloc1_fib(dst, report, ignflags, fibnum)); } Index: user/dchagin/lemul/sys/netinet6/ip6_ipsec.c =================================================================== --- user/dchagin/lemul/sys/netinet6/ip6_ipsec.c (revision 274240) +++ user/dchagin/lemul/sys/netinet6/ip6_ipsec.c (revision 274241) @@ -1,410 +1,410 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #include #include #include #ifdef IPSEC_DEBUG #include #else #define KEYDEBUG(lev,arg) #endif #endif /*IPSEC*/ #include #include extern struct protosw inet6sw[]; #ifdef INET6 #ifdef IPSEC #ifdef IPSEC_FILTERTUNNEL static VNET_DEFINE(int, ip6_ipsec6_filtertunnel) = 1; #else static VNET_DEFINE(int, ip6_ipsec6_filtertunnel) = 0; #endif #define V_ip6_ipsec6_filtertunnel VNET(ip6_ipsec6_filtertunnel) SYSCTL_DECL(_net_inet6_ipsec6); -SYSCTL_VNET_INT(_net_inet6_ipsec6, OID_AUTO, - filtertunnel, CTLFLAG_RW, &VNET_NAME(ip6_ipsec6_filtertunnel), 0, +SYSCTL_INT(_net_inet6_ipsec6, OID_AUTO, filtertunnel, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_ipsec6_filtertunnel), 0, "If set filter packets from an IPsec tunnel."); #endif /* IPSEC */ #endif /* INET6 */ /* * Check if we have to jump over firewall processing for this packet. * Called from ip6_input(). * 1 = jump over firewall, 0 = packet goes through firewall. */ int ip6_ipsec_filtertunnel(struct mbuf *m) { #ifdef IPSEC /* * Bypass packet filtering for packets previously handled by IPsec. */ if (!V_ip6_ipsec6_filtertunnel && m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL) return 1; #endif return 0; } /* * Check if this packet has an active SA and needs to be dropped instead * of forwarded. * Called from ip6_input(). * 1 = drop packet, 0 = forward packet. */ int ip6_ipsec_fwd(struct mbuf *m) { #ifdef IPSEC struct m_tag *mtag; struct tdb_ident *tdbi; struct secpolicy *sp; int error; mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL); if (mtag != NULL) { tdbi = (struct tdb_ident *)(mtag + 1); sp = ipsec_getpolicy(tdbi, IPSEC_DIR_INBOUND); } else { sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND, IP_FORWARDING, &error); } if (sp == NULL) { /* NB: can happen if error */ /*XXX error stat???*/ DPRINTF(("%s: no SP for forwarding\n", __func__)); /*XXX*/ return 1; } /* * Check security policy against packet attributes. */ error = ipsec_in_reject(sp, m); KEY_FREESP(&sp); if (error) { IP6STAT_INC(ip6s_cantforward); return 1; } #endif /* IPSEC */ return 0; } /* * Check if protocol type doesn't have a further header and do IPSEC * decryption or reject right now. Protocols with further headers get * their IPSEC treatment within the protocol specific processing. * Called from ip6_input(). * 1 = drop packet, 0 = continue processing packet. */ int ip6_ipsec_input(struct mbuf *m, int nxt) { #ifdef IPSEC struct m_tag *mtag; struct tdb_ident *tdbi; struct secpolicy *sp; int error; /* * enforce IPsec policy checking if we are seeing last header. * note that we do not visit this with protocols with pcb layer * code - like udp/tcp/raw ip. */ if ((inet6sw[ip6_protox[nxt]].pr_flags & PR_LASTHDR) != 0 && ipsec6_in_reject(m, NULL)) { /* * Check if the packet has already had IPsec processing * done. If so, then just pass it along. This tag gets * set during AH, ESP, etc. input handling, before the * packet is returned to the ip input queue for delivery. */ mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL); if (mtag != NULL) { tdbi = (struct tdb_ident *)(mtag + 1); sp = ipsec_getpolicy(tdbi, IPSEC_DIR_INBOUND); } else { sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND, IP_FORWARDING, &error); } if (sp != NULL) { /* * Check security policy against packet attributes. */ error = ipsec_in_reject(sp, m); KEY_FREESP(&sp); } else { /* XXX error stat??? */ error = EINVAL; DPRINTF(("%s: no SP, packet discarded\n", __func__));/*XXX*/ return 1; } if (error) return 1; } #endif /* IPSEC */ return 0; } /* * Called from ip6_output(). * 1 = drop packet, 0 = continue processing packet, * -1 = packet was reinjected and stop processing packet */ int ip6_ipsec_output(struct mbuf **m, struct inpcb *inp, int *flags, int *error, struct ifnet **ifp) { #ifdef IPSEC struct secpolicy *sp = NULL; struct tdb_ident *tdbi; struct m_tag *mtag; /* XXX int s; */ mtag = m_tag_find(*m, PACKET_TAG_IPSEC_PENDING_TDB, NULL); if (mtag != NULL) { tdbi = (struct tdb_ident *)(mtag + 1); sp = ipsec_getpolicy(tdbi, IPSEC_DIR_OUTBOUND); if (sp == NULL) *error = -EINVAL; /* force silent drop */ m_tag_delete(*m, mtag); } else { sp = ipsec4_checkpolicy(*m, IPSEC_DIR_OUTBOUND, *flags, error, inp); } /* * There are four return cases: * sp != NULL apply IPsec policy * sp == NULL, error == 0 no IPsec handling needed * sp == NULL, error == -EINVAL discard packet w/o error * sp == NULL, error != 0 discard packet, report error */ if (sp != NULL) { /* Loop detection, check if ipsec processing already done */ KASSERT(sp->req != NULL, ("ip_output: no ipsec request")); for (mtag = m_tag_first(*m); mtag != NULL; mtag = m_tag_next(*m, mtag)) { if (mtag->m_tag_cookie != MTAG_ABI_COMPAT) continue; if (mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_DONE && mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED) continue; /* * Check if policy has no SA associated with it. * This can happen when an SP has yet to acquire * an SA; e.g. on first reference. If it occurs, * then we let ipsec4_process_packet do its thing. */ if (sp->req->sav == NULL) break; tdbi = (struct tdb_ident *)(mtag + 1); if (tdbi->spi == sp->req->sav->spi && tdbi->proto == sp->req->sav->sah->saidx.proto && bcmp(&tdbi->dst, &sp->req->sav->sah->saidx.dst, sizeof (union sockaddr_union)) == 0) { /* * No IPsec processing is needed, free * reference to SP. * * NB: null pointer to avoid free at * done: below. */ KEY_FREESP(&sp), sp = NULL; goto done; } } /* * Do delayed checksums now because we send before * this is done in the normal processing path. */ #ifdef INET if ((*m)->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(*m); (*m)->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } #endif if ((*m)->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { in6_delayed_cksum(*m, (*m)->m_pkthdr.len - sizeof(struct ip6_hdr), sizeof(struct ip6_hdr)); (*m)->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6; } #ifdef SCTP if ((*m)->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) { sctp_delayed_cksum(*m, sizeof(struct ip6_hdr)); (*m)->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6; } #endif /* NB: callee frees mbuf */ *error = ipsec6_process_packet(*m, sp->req); if (*error == EJUSTRETURN) { /* * We had a SP with a level of 'use' and no SA. We * will just continue to process the packet without * IPsec processing. */ *error = 0; goto done; } /* * Preserve KAME behaviour: ENOENT can be returned * when an SA acquire is in progress. Don't propagate * this to user-level; it confuses applications. * * XXX this will go away when the SADB is redone. */ if (*error == ENOENT) *error = 0; goto reinjected; } else { /* sp == NULL */ if (*error != 0) { /* * Hack: -EINVAL is used to signal that a packet * should be silently discarded. This is typically * because we asked key management for an SA and * it was delayed (e.g. kicked up to IKE). */ if (*error == -EINVAL) *error = 0; goto bad; } else { /* No IPsec processing for this packet. */ } } done: if (sp != NULL) KEY_FREESP(&sp); return 0; reinjected: if (sp != NULL) KEY_FREESP(&sp); return -1; bad: if (sp != NULL) KEY_FREESP(&sp); return 1; #endif /* IPSEC */ return 0; } #if 0 /* * Compute the MTU for a forwarded packet that gets IPSEC encapsulated. * Called from ip_forward(). * Returns MTU suggestion for ICMP needfrag reply. */ int ip6_ipsec_mtu(struct mbuf *m) { int mtu = 0; /* * If the packet is routed over IPsec tunnel, tell the * originator the tunnel MTU. * tunnel MTU = if MTU - sizeof(IP) - ESP/AH hdrsiz * XXX quickhack!!! */ #ifdef IPSEC struct secpolicy *sp = NULL; int ipsecerror; int ipsechdr; struct route *ro; sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, IP_FORWARDING, &ipsecerror); if (sp != NULL) { /* count IPsec header size */ ipsechdr = ipsec_hdrsiz(m, IPSEC_DIR_OUTBOUND, NULL); /* * find the correct route for outer IPv4 * header, compute tunnel MTU. */ if (sp->req != NULL && sp->req->sav != NULL && sp->req->sav->sah != NULL) { ro = &sp->req->sav->sah->route_cache.sa_route; if (ro->ro_rt && ro->ro_rt->rt_ifp) { mtu = ro->ro_rt->rt_mtu ? ro->ro_rt->rt_mtu : ro->ro_rt->rt_ifp->if_mtu; mtu -= ipsechdr; } } KEY_FREESP(&sp); } #endif /* IPSEC */ /* XXX else case missing. */ return mtu; } #endif Index: user/dchagin/lemul/sys/netinet6/mld6.c =================================================================== --- user/dchagin/lemul/sys/netinet6/mld6.c (revision 274240) +++ user/dchagin/lemul/sys/netinet6/mld6.c (revision 274241) @@ -1,3313 +1,3313 @@ /*- * Copyright (c) 2009 Bruce Simpson. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: mld6.c,v 1.27 2001/04/04 05:17:30 itojun Exp $ */ /*- * Copyright (c) 1988 Stephen Deering. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)igmp.c 8.1 (Berkeley) 7/19/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_MLD #define KTR_MLD KTR_INET6 #endif static struct mld_ifinfo * mli_alloc_locked(struct ifnet *); static void mli_delete_locked(const struct ifnet *); static void mld_dispatch_packet(struct mbuf *); static void mld_dispatch_queue(struct ifqueue *, int); static void mld_final_leave(struct in6_multi *, struct mld_ifinfo *); static void mld_fasttimo_vnet(void); static int mld_handle_state_change(struct in6_multi *, struct mld_ifinfo *); static int mld_initial_join(struct in6_multi *, struct mld_ifinfo *, const int); #ifdef KTR static char * mld_rec_type_to_str(const int); #endif static void mld_set_version(struct mld_ifinfo *, const int); static void mld_slowtimo_vnet(void); static int mld_v1_input_query(struct ifnet *, const struct ip6_hdr *, /*const*/ struct mld_hdr *); static int mld_v1_input_report(struct ifnet *, const struct ip6_hdr *, /*const*/ struct mld_hdr *); static void mld_v1_process_group_timer(struct mld_ifinfo *, struct in6_multi *); static void mld_v1_process_querier_timers(struct mld_ifinfo *); static int mld_v1_transmit_report(struct in6_multi *, const int); static void mld_v1_update_group(struct in6_multi *, const int); static void mld_v2_cancel_link_timers(struct mld_ifinfo *); static void mld_v2_dispatch_general_query(struct mld_ifinfo *); static struct mbuf * mld_v2_encap_report(struct ifnet *, struct mbuf *); static int mld_v2_enqueue_filter_change(struct ifqueue *, struct in6_multi *); static int mld_v2_enqueue_group_record(struct ifqueue *, struct in6_multi *, const int, const int, const int, const int); static int mld_v2_input_query(struct ifnet *, const struct ip6_hdr *, struct mbuf *, const int, const int); static int mld_v2_merge_state_changes(struct in6_multi *, struct ifqueue *); static void mld_v2_process_group_timers(struct mld_ifinfo *, struct ifqueue *, struct ifqueue *, struct in6_multi *, const int); static int mld_v2_process_group_query(struct in6_multi *, struct mld_ifinfo *mli, int, struct mbuf *, const int); static int sysctl_mld_gsr(SYSCTL_HANDLER_ARGS); static int sysctl_mld_ifinfo(SYSCTL_HANDLER_ARGS); /* * Normative references: RFC 2710, RFC 3590, RFC 3810. * * Locking: * * The MLD subsystem lock ends up being system-wide for the moment, * but could be per-VIMAGE later on. * * The permitted lock order is: IN6_MULTI_LOCK, MLD_LOCK, IF_ADDR_LOCK. * Any may be taken independently; if any are held at the same * time, the above lock order must be followed. * * IN6_MULTI_LOCK covers in_multi. * * MLD_LOCK covers per-link state and any global variables in this file. * * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of * per-link state iterators. * * XXX LOR PREVENTION * A special case for IPv6 is the in6_setscope() routine. ip6_output() * will not accept an ifp; it wants an embedded scope ID, unlike * ip_output(), which happily takes the ifp given to it. The embedded * scope ID is only used by MLD to select the outgoing interface. * * During interface attach and detach, MLD will take MLD_LOCK *after* * the IF_AFDATA_LOCK. * As in6_setscope() takes IF_AFDATA_LOCK then SCOPE_LOCK, we can't call * it with MLD_LOCK held without triggering an LOR. A netisr with indirect * dispatch could work around this, but we'd rather not do that, as it * can introduce other races. * * As such, we exploit the fact that the scope ID is just the interface * index, and embed it in the IPv6 destination address accordingly. * This is potentially NOT VALID for MLDv1 reports, as they * are always sent to the multicast group itself; as MLDv2 * reports are always sent to ff02::16, this is not an issue * when MLDv2 is in use. * * This does not however eliminate the LOR when ip6_output() itself * calls in6_setscope() internally whilst MLD_LOCK is held. This will * trigger a LOR warning in WITNESS when the ifnet is detached. * * The right answer is probably to make IF_AFDATA_LOCK an rwlock, given * how it's used across the network stack. Here we're simply exploiting * the fact that MLD runs at a similar layer in the stack to scope6.c. * * VIMAGE: * * Each in6_multi corresponds to an ifp, and each ifp corresponds * to a vnet in ifp->if_vnet. */ static struct mtx mld_mtx; static MALLOC_DEFINE(M_MLD, "mld", "mld state"); #define MLD_EMBEDSCOPE(pin6, zoneid) \ if (IN6_IS_SCOPE_LINKLOCAL(pin6) || \ IN6_IS_ADDR_MC_INTFACELOCAL(pin6)) \ (pin6)->s6_addr16[1] = htons((zoneid) & 0xFFFF) \ /* * VIMAGE-wide globals. */ static VNET_DEFINE(struct timeval, mld_gsrdelay) = {10, 0}; static VNET_DEFINE(LIST_HEAD(, mld_ifinfo), mli_head); static VNET_DEFINE(int, interface_timers_running6); static VNET_DEFINE(int, state_change_timers_running6); static VNET_DEFINE(int, current_state_timers_running6); #define V_mld_gsrdelay VNET(mld_gsrdelay) #define V_mli_head VNET(mli_head) #define V_interface_timers_running6 VNET(interface_timers_running6) #define V_state_change_timers_running6 VNET(state_change_timers_running6) #define V_current_state_timers_running6 VNET(current_state_timers_running6) SYSCTL_DECL(_net_inet6); /* Note: Not in any common header. */ SYSCTL_NODE(_net_inet6, OID_AUTO, mld, CTLFLAG_RW, 0, "IPv6 Multicast Listener Discovery"); /* * Virtualized sysctls. */ -SYSCTL_VNET_PROC(_net_inet6_mld, OID_AUTO, gsrdelay, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, +SYSCTL_PROC(_net_inet6_mld, OID_AUTO, gsrdelay, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(mld_gsrdelay.tv_sec), 0, sysctl_mld_gsr, "I", "Rate limit for MLDv2 Group-and-Source queries in seconds"); /* * Non-virtualized sysctls. */ static SYSCTL_NODE(_net_inet6_mld, OID_AUTO, ifinfo, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_mld_ifinfo, "Per-interface MLDv2 state"); static int mld_v1enable = 1; SYSCTL_INT(_net_inet6_mld, OID_AUTO, v1enable, CTLFLAG_RWTUN, &mld_v1enable, 0, "Enable fallback to MLDv1"); static int mld_use_allow = 1; SYSCTL_INT(_net_inet6_mld, OID_AUTO, use_allow, CTLFLAG_RWTUN, &mld_use_allow, 0, "Use ALLOW/BLOCK for RFC 4604 SSM joins/leaves"); /* * Packed Router Alert option structure declaration. */ struct mld_raopt { struct ip6_hbh hbh; struct ip6_opt pad; struct ip6_opt_router ra; } __packed; /* * Router Alert hop-by-hop option header. */ static struct mld_raopt mld_ra = { .hbh = { 0, 0 }, .pad = { .ip6o_type = IP6OPT_PADN, 0 }, .ra = { .ip6or_type = IP6OPT_ROUTER_ALERT, .ip6or_len = IP6OPT_RTALERT_LEN - 2, .ip6or_value[0] = ((IP6OPT_RTALERT_MLD >> 8) & 0xFF), .ip6or_value[1] = (IP6OPT_RTALERT_MLD & 0xFF) } }; static struct ip6_pktopts mld_po; static __inline void mld_save_context(struct mbuf *m, struct ifnet *ifp) { #ifdef VIMAGE m->m_pkthdr.PH_loc.ptr = ifp->if_vnet; #endif /* VIMAGE */ m->m_pkthdr.flowid = ifp->if_index; } static __inline void mld_scrub_context(struct mbuf *m) { m->m_pkthdr.PH_loc.ptr = NULL; m->m_pkthdr.flowid = 0; } /* * Restore context from a queued output chain. * Return saved ifindex. * * VIMAGE: The assertion is there to make sure that we * actually called CURVNET_SET() with what's in the mbuf chain. */ static __inline uint32_t mld_restore_context(struct mbuf *m) { #if defined(VIMAGE) && defined(INVARIANTS) KASSERT(curvnet == m->m_pkthdr.PH_loc.ptr, ("%s: called when curvnet was not restored", __func__)); #endif return (m->m_pkthdr.flowid); } /* * Retrieve or set threshold between group-source queries in seconds. * * VIMAGE: Assume curvnet set by caller. * SMPng: NOTE: Serialized by MLD lock. */ static int sysctl_mld_gsr(SYSCTL_HANDLER_ARGS) { int error; int i; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error) return (error); MLD_LOCK(); i = V_mld_gsrdelay.tv_sec; error = sysctl_handle_int(oidp, &i, 0, req); if (error || !req->newptr) goto out_locked; if (i < -1 || i >= 60) { error = EINVAL; goto out_locked; } CTR2(KTR_MLD, "change mld_gsrdelay from %d to %d", V_mld_gsrdelay.tv_sec, i); V_mld_gsrdelay.tv_sec = i; out_locked: MLD_UNLOCK(); return (error); } /* * Expose struct mld_ifinfo to userland, keyed by ifindex. * For use by ifmcstat(8). * * SMPng: NOTE: Does an unlocked ifindex space read. * VIMAGE: Assume curvnet set by caller. The node handler itself * is not directly virtualized. */ static int sysctl_mld_ifinfo(SYSCTL_HANDLER_ARGS) { int *name; int error; u_int namelen; struct ifnet *ifp; struct mld_ifinfo *mli; name = (int *)arg1; namelen = arg2; if (req->newptr != NULL) return (EPERM); if (namelen != 1) return (EINVAL); error = sysctl_wire_old_buffer(req, sizeof(struct mld_ifinfo)); if (error) return (error); IN6_MULTI_LOCK(); MLD_LOCK(); if (name[0] <= 0 || name[0] > V_if_index) { error = ENOENT; goto out_locked; } error = ENOENT; ifp = ifnet_byindex(name[0]); if (ifp == NULL) goto out_locked; LIST_FOREACH(mli, &V_mli_head, mli_link) { if (ifp == mli->mli_ifp) { error = SYSCTL_OUT(req, mli, sizeof(struct mld_ifinfo)); break; } } out_locked: MLD_UNLOCK(); IN6_MULTI_UNLOCK(); return (error); } /* * Dispatch an entire queue of pending packet chains. * VIMAGE: Assumes the vnet pointer has been set. */ static void mld_dispatch_queue(struct ifqueue *ifq, int limit) { struct mbuf *m; for (;;) { _IF_DEQUEUE(ifq, m); if (m == NULL) break; CTR3(KTR_MLD, "%s: dispatch %p from %p", __func__, ifq, m); mld_dispatch_packet(m); if (--limit == 0) break; } } /* * Filter outgoing MLD report state by group. * * Reports are ALWAYS suppressed for ALL-HOSTS (ff02::1) * and node-local addresses. However, kernel and socket consumers * always embed the KAME scope ID in the address provided, so strip it * when performing comparison. * Note: This is not the same as the *multicast* scope. * * Return zero if the given group is one for which MLD reports * should be suppressed, or non-zero if reports should be issued. */ static __inline int mld_is_addr_reported(const struct in6_addr *addr) { KASSERT(IN6_IS_ADDR_MULTICAST(addr), ("%s: not multicast", __func__)); if (IPV6_ADDR_MC_SCOPE(addr) == IPV6_ADDR_SCOPE_NODELOCAL) return (0); if (IPV6_ADDR_MC_SCOPE(addr) == IPV6_ADDR_SCOPE_LINKLOCAL) { struct in6_addr tmp = *addr; in6_clearscope(&tmp); if (IN6_ARE_ADDR_EQUAL(&tmp, &in6addr_linklocal_allnodes)) return (0); } return (1); } /* * Attach MLD when PF_INET6 is attached to an interface. * * SMPng: Normally called with IF_AFDATA_LOCK held. */ struct mld_ifinfo * mld_domifattach(struct ifnet *ifp) { struct mld_ifinfo *mli; CTR3(KTR_MLD, "%s: called for ifp %p(%s)", __func__, ifp, if_name(ifp)); MLD_LOCK(); mli = mli_alloc_locked(ifp); if (!(ifp->if_flags & IFF_MULTICAST)) mli->mli_flags |= MLIF_SILENT; if (mld_use_allow) mli->mli_flags |= MLIF_USEALLOW; MLD_UNLOCK(); return (mli); } /* * VIMAGE: assume curvnet set by caller. */ static struct mld_ifinfo * mli_alloc_locked(/*const*/ struct ifnet *ifp) { struct mld_ifinfo *mli; MLD_LOCK_ASSERT(); mli = malloc(sizeof(struct mld_ifinfo), M_MLD, M_NOWAIT|M_ZERO); if (mli == NULL) goto out; mli->mli_ifp = ifp; mli->mli_version = MLD_VERSION_2; mli->mli_flags = 0; mli->mli_rv = MLD_RV_INIT; mli->mli_qi = MLD_QI_INIT; mli->mli_qri = MLD_QRI_INIT; mli->mli_uri = MLD_URI_INIT; SLIST_INIT(&mli->mli_relinmhead); /* * Responses to general queries are subject to bounds. */ IFQ_SET_MAXLEN(&mli->mli_gq, MLD_MAX_RESPONSE_PACKETS); LIST_INSERT_HEAD(&V_mli_head, mli, mli_link); CTR2(KTR_MLD, "allocate mld_ifinfo for ifp %p(%s)", ifp, if_name(ifp)); out: return (mli); } /* * Hook for ifdetach. * * NOTE: Some finalization tasks need to run before the protocol domain * is detached, but also before the link layer does its cleanup. * Run before link-layer cleanup; cleanup groups, but do not free MLD state. * * SMPng: Caller must hold IN6_MULTI_LOCK(). * Must take IF_ADDR_LOCK() to cover if_multiaddrs iterator. * XXX This routine is also bitten by unlocked ifma_protospec access. */ void mld_ifdetach(struct ifnet *ifp) { struct mld_ifinfo *mli; struct ifmultiaddr *ifma; struct in6_multi *inm, *tinm; CTR3(KTR_MLD, "%s: called for ifp %p(%s)", __func__, ifp, if_name(ifp)); IN6_MULTI_LOCK_ASSERT(); MLD_LOCK(); mli = MLD_IFINFO(ifp); if (mli->mli_version == MLD_VERSION_2) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET6 || ifma->ifma_protospec == NULL) continue; inm = (struct in6_multi *)ifma->ifma_protospec; if (inm->in6m_state == MLD_LEAVING_MEMBER) { SLIST_INSERT_HEAD(&mli->mli_relinmhead, inm, in6m_nrele); } in6m_clear_recorded(inm); } IF_ADDR_RUNLOCK(ifp); SLIST_FOREACH_SAFE(inm, &mli->mli_relinmhead, in6m_nrele, tinm) { SLIST_REMOVE_HEAD(&mli->mli_relinmhead, in6m_nrele); in6m_release_locked(inm); } } MLD_UNLOCK(); } /* * Hook for domifdetach. * Runs after link-layer cleanup; free MLD state. * * SMPng: Normally called with IF_AFDATA_LOCK held. */ void mld_domifdetach(struct ifnet *ifp) { CTR3(KTR_MLD, "%s: called for ifp %p(%s)", __func__, ifp, if_name(ifp)); MLD_LOCK(); mli_delete_locked(ifp); MLD_UNLOCK(); } static void mli_delete_locked(const struct ifnet *ifp) { struct mld_ifinfo *mli, *tmli; CTR3(KTR_MLD, "%s: freeing mld_ifinfo for ifp %p(%s)", __func__, ifp, if_name(ifp)); MLD_LOCK_ASSERT(); LIST_FOREACH_SAFE(mli, &V_mli_head, mli_link, tmli) { if (mli->mli_ifp == ifp) { /* * Free deferred General Query responses. */ _IF_DRAIN(&mli->mli_gq); LIST_REMOVE(mli, mli_link); KASSERT(SLIST_EMPTY(&mli->mli_relinmhead), ("%s: there are dangling in_multi references", __func__)); free(mli, M_MLD); return; } } #ifdef INVARIANTS panic("%s: mld_ifinfo not found for ifp %p\n", __func__, ifp); #endif } /* * Process a received MLDv1 general or address-specific query. * Assumes that the query header has been pulled up to sizeof(mld_hdr). * * NOTE: Can't be fully const correct as we temporarily embed scope ID in * mld_addr. This is OK as we own the mbuf chain. */ static int mld_v1_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, /*const*/ struct mld_hdr *mld) { struct ifmultiaddr *ifma; struct mld_ifinfo *mli; struct in6_multi *inm; int is_general_query; uint16_t timer; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif is_general_query = 0; if (!mld_v1enable) { CTR3(KTR_MLD, "ignore v1 query %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); return (0); } /* * RFC3810 Section 6.2: MLD queries must originate from * a router's link-local address. */ if (!IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) { CTR3(KTR_MLD, "ignore v1 query src %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &ip6->ip6_src), ifp, if_name(ifp)); return (0); } /* * Do address field validation upfront before we accept * the query. */ if (IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) { /* * MLDv1 General Query. * If this was not sent to the all-nodes group, ignore it. */ struct in6_addr dst; dst = ip6->ip6_dst; in6_clearscope(&dst); if (!IN6_ARE_ADDR_EQUAL(&dst, &in6addr_linklocal_allnodes)) return (EINVAL); is_general_query = 1; } else { /* * Embed scope ID of receiving interface in MLD query for * lookup whilst we don't hold other locks. */ in6_setscope(&mld->mld_addr, ifp, NULL); } IN6_MULTI_LOCK(); MLD_LOCK(); /* * Switch to MLDv1 host compatibility mode. */ mli = MLD_IFINFO(ifp); KASSERT(mli != NULL, ("%s: no mld_ifinfo for ifp %p", __func__, ifp)); mld_set_version(mli, MLD_VERSION_1); timer = (ntohs(mld->mld_maxdelay) * PR_FASTHZ) / MLD_TIMER_SCALE; if (timer == 0) timer = 1; IF_ADDR_RLOCK(ifp); if (is_general_query) { /* * For each reporting group joined on this * interface, kick the report timer. */ CTR2(KTR_MLD, "process v1 general query on ifp %p(%s)", ifp, if_name(ifp)); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET6 || ifma->ifma_protospec == NULL) continue; inm = (struct in6_multi *)ifma->ifma_protospec; mld_v1_update_group(inm, timer); } } else { /* * MLDv1 Group-Specific Query. * If this is a group-specific MLDv1 query, we need only * look up the single group to process it. */ inm = in6m_lookup_locked(ifp, &mld->mld_addr); if (inm != NULL) { CTR3(KTR_MLD, "process v1 query %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); mld_v1_update_group(inm, timer); } /* XXX Clear embedded scope ID as userland won't expect it. */ in6_clearscope(&mld->mld_addr); } IF_ADDR_RUNLOCK(ifp); MLD_UNLOCK(); IN6_MULTI_UNLOCK(); return (0); } /* * Update the report timer on a group in response to an MLDv1 query. * * If we are becoming the reporting member for this group, start the timer. * If we already are the reporting member for this group, and timer is * below the threshold, reset it. * * We may be updating the group for the first time since we switched * to MLDv2. If we are, then we must clear any recorded source lists, * and transition to REPORTING state; the group timer is overloaded * for group and group-source query responses. * * Unlike MLDv2, the delay per group should be jittered * to avoid bursts of MLDv1 reports. */ static void mld_v1_update_group(struct in6_multi *inm, const int timer) { #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif CTR4(KTR_MLD, "%s: %s/%s timer=%d", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp), timer); IN6_MULTI_LOCK_ASSERT(); switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: break; case MLD_REPORTING_MEMBER: if (inm->in6m_timer != 0 && inm->in6m_timer <= timer) { CTR1(KTR_MLD, "%s: REPORTING and timer running, " "skipping.", __func__); break; } /* FALLTHROUGH */ case MLD_SG_QUERY_PENDING_MEMBER: case MLD_G_QUERY_PENDING_MEMBER: case MLD_IDLE_MEMBER: case MLD_LAZY_MEMBER: case MLD_AWAKENING_MEMBER: CTR1(KTR_MLD, "%s: ->REPORTING", __func__); inm->in6m_state = MLD_REPORTING_MEMBER; inm->in6m_timer = MLD_RANDOM_DELAY(timer); V_current_state_timers_running6 = 1; break; case MLD_SLEEPING_MEMBER: CTR1(KTR_MLD, "%s: ->AWAKENING", __func__); inm->in6m_state = MLD_AWAKENING_MEMBER; break; case MLD_LEAVING_MEMBER: break; } } /* * Process a received MLDv2 general, group-specific or * group-and-source-specific query. * * Assumes that the query header has been pulled up to sizeof(mldv2_query). * * Return 0 if successful, otherwise an appropriate error code is returned. */ static int mld_v2_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, struct mbuf *m, const int off, const int icmp6len) { struct mld_ifinfo *mli; struct mldv2_query *mld; struct in6_multi *inm; uint32_t maxdelay, nsrc, qqi; int is_general_query; uint16_t timer; uint8_t qrv; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif is_general_query = 0; /* * RFC3810 Section 6.2: MLD queries must originate from * a router's link-local address. */ if (!IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) { CTR3(KTR_MLD, "ignore v1 query src %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &ip6->ip6_src), ifp, if_name(ifp)); return (0); } CTR2(KTR_MLD, "input v2 query on ifp %p(%s)", ifp, if_name(ifp)); mld = (struct mldv2_query *)(mtod(m, uint8_t *) + off); maxdelay = ntohs(mld->mld_maxdelay); /* in 1/10ths of a second */ if (maxdelay >= 32768) { maxdelay = (MLD_MRC_MANT(maxdelay) | 0x1000) << (MLD_MRC_EXP(maxdelay) + 3); } timer = (maxdelay * PR_FASTHZ) / MLD_TIMER_SCALE; if (timer == 0) timer = 1; qrv = MLD_QRV(mld->mld_misc); if (qrv < 2) { CTR3(KTR_MLD, "%s: clamping qrv %d to %d", __func__, qrv, MLD_RV_INIT); qrv = MLD_RV_INIT; } qqi = mld->mld_qqi; if (qqi >= 128) { qqi = MLD_QQIC_MANT(mld->mld_qqi) << (MLD_QQIC_EXP(mld->mld_qqi) + 3); } nsrc = ntohs(mld->mld_numsrc); if (nsrc > MLD_MAX_GS_SOURCES) return (EMSGSIZE); if (icmp6len < sizeof(struct mldv2_query) + (nsrc * sizeof(struct in6_addr))) return (EMSGSIZE); /* * Do further input validation upfront to avoid resetting timers * should we need to discard this query. */ if (IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) { /* * A general query with a source list has undefined * behaviour; discard it. */ if (nsrc > 0) return (EINVAL); is_general_query = 1; } else { /* * Embed scope ID of receiving interface in MLD query for * lookup whilst we don't hold other locks (due to KAME * locking lameness). We own this mbuf chain just now. */ in6_setscope(&mld->mld_addr, ifp, NULL); } IN6_MULTI_LOCK(); MLD_LOCK(); mli = MLD_IFINFO(ifp); KASSERT(mli != NULL, ("%s: no mld_ifinfo for ifp %p", __func__, ifp)); /* * Discard the v2 query if we're in Compatibility Mode. * The RFC is pretty clear that hosts need to stay in MLDv1 mode * until the Old Version Querier Present timer expires. */ if (mli->mli_version != MLD_VERSION_2) goto out_locked; mld_set_version(mli, MLD_VERSION_2); mli->mli_rv = qrv; mli->mli_qi = qqi; mli->mli_qri = maxdelay; CTR4(KTR_MLD, "%s: qrv %d qi %d maxdelay %d", __func__, qrv, qqi, maxdelay); if (is_general_query) { /* * MLDv2 General Query. * * Schedule a current-state report on this ifp for * all groups, possibly containing source lists. * * If there is a pending General Query response * scheduled earlier than the selected delay, do * not schedule any other reports. * Otherwise, reset the interface timer. */ CTR2(KTR_MLD, "process v2 general query on ifp %p(%s)", ifp, if_name(ifp)); if (mli->mli_v2_timer == 0 || mli->mli_v2_timer >= timer) { mli->mli_v2_timer = MLD_RANDOM_DELAY(timer); V_interface_timers_running6 = 1; } } else { /* * MLDv2 Group-specific or Group-and-source-specific Query. * * Group-source-specific queries are throttled on * a per-group basis to defeat denial-of-service attempts. * Queries for groups we are not a member of on this * link are simply ignored. */ IF_ADDR_RLOCK(ifp); inm = in6m_lookup_locked(ifp, &mld->mld_addr); if (inm == NULL) { IF_ADDR_RUNLOCK(ifp); goto out_locked; } if (nsrc > 0) { if (!ratecheck(&inm->in6m_lastgsrtv, &V_mld_gsrdelay)) { CTR1(KTR_MLD, "%s: GS query throttled.", __func__); IF_ADDR_RUNLOCK(ifp); goto out_locked; } } CTR2(KTR_MLD, "process v2 group query on ifp %p(%s)", ifp, if_name(ifp)); /* * If there is a pending General Query response * scheduled sooner than the selected delay, no * further report need be scheduled. * Otherwise, prepare to respond to the * group-specific or group-and-source query. */ if (mli->mli_v2_timer == 0 || mli->mli_v2_timer >= timer) mld_v2_process_group_query(inm, mli, timer, m, off); /* XXX Clear embedded scope ID as userland won't expect it. */ in6_clearscope(&mld->mld_addr); IF_ADDR_RUNLOCK(ifp); } out_locked: MLD_UNLOCK(); IN6_MULTI_UNLOCK(); return (0); } /* * Process a recieved MLDv2 group-specific or group-and-source-specific * query. * Return <0 if any error occured. Currently this is ignored. */ static int mld_v2_process_group_query(struct in6_multi *inm, struct mld_ifinfo *mli, int timer, struct mbuf *m0, const int off) { struct mldv2_query *mld; int retval; uint16_t nsrc; IN6_MULTI_LOCK_ASSERT(); MLD_LOCK_ASSERT(); retval = 0; mld = (struct mldv2_query *)(mtod(m0, uint8_t *) + off); switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_LAZY_MEMBER: case MLD_AWAKENING_MEMBER: case MLD_IDLE_MEMBER: case MLD_LEAVING_MEMBER: return (retval); break; case MLD_REPORTING_MEMBER: case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: break; } nsrc = ntohs(mld->mld_numsrc); /* * Deal with group-specific queries upfront. * If any group query is already pending, purge any recorded * source-list state if it exists, and schedule a query response * for this group-specific query. */ if (nsrc == 0) { if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER || inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER) { in6m_clear_recorded(inm); timer = min(inm->in6m_timer, timer); } inm->in6m_state = MLD_G_QUERY_PENDING_MEMBER; inm->in6m_timer = MLD_RANDOM_DELAY(timer); V_current_state_timers_running6 = 1; return (retval); } /* * Deal with the case where a group-and-source-specific query has * been received but a group-specific query is already pending. */ if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER) { timer = min(inm->in6m_timer, timer); inm->in6m_timer = MLD_RANDOM_DELAY(timer); V_current_state_timers_running6 = 1; return (retval); } /* * Finally, deal with the case where a group-and-source-specific * query has been received, where a response to a previous g-s-r * query exists, or none exists. * In this case, we need to parse the source-list which the Querier * has provided us with and check if we have any source list filter * entries at T1 for these sources. If we do not, there is no need * schedule a report and the query may be dropped. * If we do, we must record them and schedule a current-state * report for those sources. */ if (inm->in6m_nsrc > 0) { struct mbuf *m; uint8_t *sp; int i, nrecorded; int soff; m = m0; soff = off + sizeof(struct mldv2_query); nrecorded = 0; for (i = 0; i < nsrc; i++) { sp = mtod(m, uint8_t *) + soff; retval = in6m_record_source(inm, (const struct in6_addr *)sp); if (retval < 0) break; nrecorded += retval; soff += sizeof(struct in6_addr); if (soff >= m->m_len) { soff = soff - m->m_len; m = m->m_next; if (m == NULL) break; } } if (nrecorded > 0) { CTR1(KTR_MLD, "%s: schedule response to SG query", __func__); inm->in6m_state = MLD_SG_QUERY_PENDING_MEMBER; inm->in6m_timer = MLD_RANDOM_DELAY(timer); V_current_state_timers_running6 = 1; } } return (retval); } /* * Process a received MLDv1 host membership report. * Assumes mld points to mld_hdr in pulled up mbuf chain. * * NOTE: Can't be fully const correct as we temporarily embed scope ID in * mld_addr. This is OK as we own the mbuf chain. */ static int mld_v1_input_report(struct ifnet *ifp, const struct ip6_hdr *ip6, /*const*/ struct mld_hdr *mld) { struct in6_addr src, dst; struct in6_ifaddr *ia; struct in6_multi *inm; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif if (!mld_v1enable) { CTR3(KTR_MLD, "ignore v1 report %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); return (0); } if (ifp->if_flags & IFF_LOOPBACK) return (0); /* * MLDv1 reports must originate from a host's link-local address, * or the unspecified address (when booting). */ src = ip6->ip6_src; in6_clearscope(&src); if (!IN6_IS_SCOPE_LINKLOCAL(&src) && !IN6_IS_ADDR_UNSPECIFIED(&src)) { CTR3(KTR_MLD, "ignore v1 query src %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &ip6->ip6_src), ifp, if_name(ifp)); return (EINVAL); } /* * RFC2710 Section 4: MLDv1 reports must pertain to a multicast * group, and must be directed to the group itself. */ dst = ip6->ip6_dst; in6_clearscope(&dst); if (!IN6_IS_ADDR_MULTICAST(&mld->mld_addr) || !IN6_ARE_ADDR_EQUAL(&mld->mld_addr, &dst)) { CTR3(KTR_MLD, "ignore v1 query dst %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &ip6->ip6_dst), ifp, if_name(ifp)); return (EINVAL); } /* * Make sure we don't hear our own membership report, as fast * leave requires knowing that we are the only member of a * group. Assume we used the link-local address if available, * otherwise look for ::. * * XXX Note that scope ID comparison is needed for the address * returned by in6ifa_ifpforlinklocal(), but SHOULD NOT be * performed for the on-wire address. */ ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST); if ((ia && IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, IA6_IN6(ia))) || (ia == NULL && IN6_IS_ADDR_UNSPECIFIED(&src))) { if (ia != NULL) ifa_free(&ia->ia_ifa); return (0); } if (ia != NULL) ifa_free(&ia->ia_ifa); CTR3(KTR_MLD, "process v1 report %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); /* * Embed scope ID of receiving interface in MLD query for lookup * whilst we don't hold other locks (due to KAME locking lameness). */ if (!IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) in6_setscope(&mld->mld_addr, ifp, NULL); IN6_MULTI_LOCK(); MLD_LOCK(); IF_ADDR_RLOCK(ifp); /* * MLDv1 report suppression. * If we are a member of this group, and our membership should be * reported, and our group timer is pending or about to be reset, * stop our group timer by transitioning to the 'lazy' state. */ inm = in6m_lookup_locked(ifp, &mld->mld_addr); if (inm != NULL) { struct mld_ifinfo *mli; mli = inm->in6m_mli; KASSERT(mli != NULL, ("%s: no mli for ifp %p", __func__, ifp)); /* * If we are in MLDv2 host mode, do not allow the * other host's MLDv1 report to suppress our reports. */ if (mli->mli_version == MLD_VERSION_2) goto out_locked; inm->in6m_timer = 0; switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_SLEEPING_MEMBER: break; case MLD_REPORTING_MEMBER: case MLD_IDLE_MEMBER: case MLD_AWAKENING_MEMBER: CTR3(KTR_MLD, "report suppressed for %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); case MLD_LAZY_MEMBER: inm->in6m_state = MLD_LAZY_MEMBER; break; case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: case MLD_LEAVING_MEMBER: break; } } out_locked: IF_ADDR_RUNLOCK(ifp); MLD_UNLOCK(); IN6_MULTI_UNLOCK(); /* XXX Clear embedded scope ID as userland won't expect it. */ in6_clearscope(&mld->mld_addr); return (0); } /* * MLD input path. * * Assume query messages which fit in a single ICMPv6 message header * have been pulled up. * Assume that userland will want to see the message, even if it * otherwise fails kernel input validation; do not free it. * Pullup may however free the mbuf chain m if it fails. * * Return IPPROTO_DONE if we freed m. Otherwise, return 0. */ int mld_input(struct mbuf *m, int off, int icmp6len) { struct ifnet *ifp; struct ip6_hdr *ip6; struct mld_hdr *mld; int mldlen; CTR3(KTR_MLD, "%s: called w/mbuf (%p,%d)", __func__, m, off); ifp = m->m_pkthdr.rcvif; ip6 = mtod(m, struct ip6_hdr *); /* Pullup to appropriate size. */ mld = (struct mld_hdr *)(mtod(m, uint8_t *) + off); if (mld->mld_type == MLD_LISTENER_QUERY && icmp6len >= sizeof(struct mldv2_query)) { mldlen = sizeof(struct mldv2_query); } else { mldlen = sizeof(struct mld_hdr); } IP6_EXTHDR_GET(mld, struct mld_hdr *, m, off, mldlen); if (mld == NULL) { ICMP6STAT_INC(icp6s_badlen); return (IPPROTO_DONE); } /* * Userland needs to see all of this traffic for implementing * the endpoint discovery portion of multicast routing. */ switch (mld->mld_type) { case MLD_LISTENER_QUERY: icmp6_ifstat_inc(ifp, ifs6_in_mldquery); if (icmp6len == sizeof(struct mld_hdr)) { if (mld_v1_input_query(ifp, ip6, mld) != 0) return (0); } else if (icmp6len >= sizeof(struct mldv2_query)) { if (mld_v2_input_query(ifp, ip6, m, off, icmp6len) != 0) return (0); } break; case MLD_LISTENER_REPORT: icmp6_ifstat_inc(ifp, ifs6_in_mldreport); if (mld_v1_input_report(ifp, ip6, mld) != 0) return (0); break; case MLDV2_LISTENER_REPORT: icmp6_ifstat_inc(ifp, ifs6_in_mldreport); break; case MLD_LISTENER_DONE: icmp6_ifstat_inc(ifp, ifs6_in_mlddone); break; default: break; } return (0); } /* * Fast timeout handler (global). * VIMAGE: Timeout handlers are expected to service all vimages. */ void mld_fasttimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); mld_fasttimo_vnet(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Fast timeout handler (per-vnet). * * VIMAGE: Assume caller has set up our curvnet. */ static void mld_fasttimo_vnet(void) { struct ifqueue scq; /* State-change packets */ struct ifqueue qrq; /* Query response packets */ struct ifnet *ifp; struct mld_ifinfo *mli; struct ifmultiaddr *ifma; struct in6_multi *inm, *tinm; int uri_fasthz; uri_fasthz = 0; /* * Quick check to see if any work needs to be done, in order to * minimize the overhead of fasttimo processing. * SMPng: XXX Unlocked reads. */ if (!V_current_state_timers_running6 && !V_interface_timers_running6 && !V_state_change_timers_running6) return; IN6_MULTI_LOCK(); MLD_LOCK(); /* * MLDv2 General Query response timer processing. */ if (V_interface_timers_running6) { CTR1(KTR_MLD, "%s: interface timers running", __func__); V_interface_timers_running6 = 0; LIST_FOREACH(mli, &V_mli_head, mli_link) { if (mli->mli_v2_timer == 0) { /* Do nothing. */ } else if (--mli->mli_v2_timer == 0) { mld_v2_dispatch_general_query(mli); } else { V_interface_timers_running6 = 1; } } } if (!V_current_state_timers_running6 && !V_state_change_timers_running6) goto out_locked; V_current_state_timers_running6 = 0; V_state_change_timers_running6 = 0; CTR1(KTR_MLD, "%s: state change timers running", __func__); /* * MLD host report and state-change timer processing. * Note: Processing a v2 group timer may remove a node. */ LIST_FOREACH(mli, &V_mli_head, mli_link) { ifp = mli->mli_ifp; if (mli->mli_version == MLD_VERSION_2) { uri_fasthz = MLD_RANDOM_DELAY(mli->mli_uri * PR_FASTHZ); memset(&qrq, 0, sizeof(struct ifqueue)); IFQ_SET_MAXLEN(&qrq, MLD_MAX_G_GS_PACKETS); memset(&scq, 0, sizeof(struct ifqueue)); IFQ_SET_MAXLEN(&scq, MLD_MAX_STATE_CHANGE_PACKETS); } IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET6 || ifma->ifma_protospec == NULL) continue; inm = (struct in6_multi *)ifma->ifma_protospec; switch (mli->mli_version) { case MLD_VERSION_1: mld_v1_process_group_timer(mli, inm); break; case MLD_VERSION_2: mld_v2_process_group_timers(mli, &qrq, &scq, inm, uri_fasthz); break; } } IF_ADDR_RUNLOCK(ifp); switch (mli->mli_version) { case MLD_VERSION_1: /* * Transmit reports for this lifecycle. This * is done while not holding IF_ADDR_LOCK * since this can call * in6ifa_ifpforlinklocal() which locks * IF_ADDR_LOCK internally as well as * ip6_output() to transmit a packet. */ SLIST_FOREACH_SAFE(inm, &mli->mli_relinmhead, in6m_nrele, tinm) { SLIST_REMOVE_HEAD(&mli->mli_relinmhead, in6m_nrele); (void)mld_v1_transmit_report(inm, MLD_LISTENER_REPORT); } break; case MLD_VERSION_2: mld_dispatch_queue(&qrq, 0); mld_dispatch_queue(&scq, 0); /* * Free the in_multi reference(s) for * this lifecycle. */ SLIST_FOREACH_SAFE(inm, &mli->mli_relinmhead, in6m_nrele, tinm) { SLIST_REMOVE_HEAD(&mli->mli_relinmhead, in6m_nrele); in6m_release_locked(inm); } break; } } out_locked: MLD_UNLOCK(); IN6_MULTI_UNLOCK(); } /* * Update host report group timer. * Will update the global pending timer flags. */ static void mld_v1_process_group_timer(struct mld_ifinfo *mli, struct in6_multi *inm) { int report_timer_expired; IN6_MULTI_LOCK_ASSERT(); MLD_LOCK_ASSERT(); if (inm->in6m_timer == 0) { report_timer_expired = 0; } else if (--inm->in6m_timer == 0) { report_timer_expired = 1; } else { V_current_state_timers_running6 = 1; return; } switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_IDLE_MEMBER: case MLD_LAZY_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_AWAKENING_MEMBER: break; case MLD_REPORTING_MEMBER: if (report_timer_expired) { inm->in6m_state = MLD_IDLE_MEMBER; SLIST_INSERT_HEAD(&mli->mli_relinmhead, inm, in6m_nrele); } break; case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: case MLD_LEAVING_MEMBER: break; } } /* * Update a group's timers for MLDv2. * Will update the global pending timer flags. * Note: Unlocked read from mli. */ static void mld_v2_process_group_timers(struct mld_ifinfo *mli, struct ifqueue *qrq, struct ifqueue *scq, struct in6_multi *inm, const int uri_fasthz) { int query_response_timer_expired; int state_change_retransmit_timer_expired; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif IN6_MULTI_LOCK_ASSERT(); MLD_LOCK_ASSERT(); query_response_timer_expired = 0; state_change_retransmit_timer_expired = 0; /* * During a transition from compatibility mode back to MLDv2, * a group record in REPORTING state may still have its group * timer active. This is a no-op in this function; it is easier * to deal with it here than to complicate the slow-timeout path. */ if (inm->in6m_timer == 0) { query_response_timer_expired = 0; } else if (--inm->in6m_timer == 0) { query_response_timer_expired = 1; } else { V_current_state_timers_running6 = 1; } if (inm->in6m_sctimer == 0) { state_change_retransmit_timer_expired = 0; } else if (--inm->in6m_sctimer == 0) { state_change_retransmit_timer_expired = 1; } else { V_state_change_timers_running6 = 1; } /* We are in fasttimo, so be quick about it. */ if (!state_change_retransmit_timer_expired && !query_response_timer_expired) return; switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_LAZY_MEMBER: case MLD_AWAKENING_MEMBER: case MLD_IDLE_MEMBER: break; case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: /* * Respond to a previously pending Group-Specific * or Group-and-Source-Specific query by enqueueing * the appropriate Current-State report for * immediate transmission. */ if (query_response_timer_expired) { int retval; retval = mld_v2_enqueue_group_record(qrq, inm, 0, 1, (inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER), 0); CTR2(KTR_MLD, "%s: enqueue record = %d", __func__, retval); inm->in6m_state = MLD_REPORTING_MEMBER; in6m_clear_recorded(inm); } /* FALLTHROUGH */ case MLD_REPORTING_MEMBER: case MLD_LEAVING_MEMBER: if (state_change_retransmit_timer_expired) { /* * State-change retransmission timer fired. * If there are any further pending retransmissions, * set the global pending state-change flag, and * reset the timer. */ if (--inm->in6m_scrv > 0) { inm->in6m_sctimer = uri_fasthz; V_state_change_timers_running6 = 1; } /* * Retransmit the previously computed state-change * report. If there are no further pending * retransmissions, the mbuf queue will be consumed. * Update T0 state to T1 as we have now sent * a state-change. */ (void)mld_v2_merge_state_changes(inm, scq); in6m_commit(inm); CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); /* * If we are leaving the group for good, make sure * we release MLD's reference to it. * This release must be deferred using a SLIST, * as we are called from a loop which traverses * the in_ifmultiaddr TAILQ. */ if (inm->in6m_state == MLD_LEAVING_MEMBER && inm->in6m_scrv == 0) { inm->in6m_state = MLD_NOT_MEMBER; SLIST_INSERT_HEAD(&mli->mli_relinmhead, inm, in6m_nrele); } } break; } } /* * Switch to a different version on the given interface, * as per Section 9.12. */ static void mld_set_version(struct mld_ifinfo *mli, const int version) { int old_version_timer; MLD_LOCK_ASSERT(); CTR4(KTR_MLD, "%s: switching to v%d on ifp %p(%s)", __func__, version, mli->mli_ifp, if_name(mli->mli_ifp)); if (version == MLD_VERSION_1) { /* * Compute the "Older Version Querier Present" timer as per * Section 9.12. */ old_version_timer = (mli->mli_rv * mli->mli_qi) + mli->mli_qri; old_version_timer *= PR_SLOWHZ; mli->mli_v1_timer = old_version_timer; } if (mli->mli_v1_timer > 0 && mli->mli_version != MLD_VERSION_1) { mli->mli_version = MLD_VERSION_1; mld_v2_cancel_link_timers(mli); } } /* * Cancel pending MLDv2 timers for the given link and all groups * joined on it; state-change, general-query, and group-query timers. */ static void mld_v2_cancel_link_timers(struct mld_ifinfo *mli) { struct ifmultiaddr *ifma; struct ifnet *ifp; struct in6_multi *inm, *tinm; CTR3(KTR_MLD, "%s: cancel v2 timers on ifp %p(%s)", __func__, mli->mli_ifp, if_name(mli->mli_ifp)); IN6_MULTI_LOCK_ASSERT(); MLD_LOCK_ASSERT(); /* * Fast-track this potentially expensive operation * by checking all the global 'timer pending' flags. */ if (!V_interface_timers_running6 && !V_state_change_timers_running6 && !V_current_state_timers_running6) return; mli->mli_v2_timer = 0; ifp = mli->mli_ifp; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET6) continue; inm = (struct in6_multi *)ifma->ifma_protospec; switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_IDLE_MEMBER: case MLD_LAZY_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_AWAKENING_MEMBER: break; case MLD_LEAVING_MEMBER: /* * If we are leaving the group and switching * version, we need to release the final * reference held for issuing the INCLUDE {}. */ SLIST_INSERT_HEAD(&mli->mli_relinmhead, inm, in6m_nrele); /* FALLTHROUGH */ case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: in6m_clear_recorded(inm); /* FALLTHROUGH */ case MLD_REPORTING_MEMBER: inm->in6m_sctimer = 0; inm->in6m_timer = 0; inm->in6m_state = MLD_REPORTING_MEMBER; /* * Free any pending MLDv2 state-change records. */ _IF_DRAIN(&inm->in6m_scq); break; } } IF_ADDR_RUNLOCK(ifp); SLIST_FOREACH_SAFE(inm, &mli->mli_relinmhead, in6m_nrele, tinm) { SLIST_REMOVE_HEAD(&mli->mli_relinmhead, in6m_nrele); in6m_release_locked(inm); } } /* * Global slowtimo handler. * VIMAGE: Timeout handlers are expected to service all vimages. */ void mld_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); mld_slowtimo_vnet(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * Per-vnet slowtimo handler. */ static void mld_slowtimo_vnet(void) { struct mld_ifinfo *mli; MLD_LOCK(); LIST_FOREACH(mli, &V_mli_head, mli_link) { mld_v1_process_querier_timers(mli); } MLD_UNLOCK(); } /* * Update the Older Version Querier Present timers for a link. * See Section 9.12 of RFC 3810. */ static void mld_v1_process_querier_timers(struct mld_ifinfo *mli) { MLD_LOCK_ASSERT(); if (mli->mli_version != MLD_VERSION_2 && --mli->mli_v1_timer == 0) { /* * MLDv1 Querier Present timer expired; revert to MLDv2. */ CTR5(KTR_MLD, "%s: transition from v%d -> v%d on %p(%s)", __func__, mli->mli_version, MLD_VERSION_2, mli->mli_ifp, if_name(mli->mli_ifp)); mli->mli_version = MLD_VERSION_2; } } /* * Transmit an MLDv1 report immediately. */ static int mld_v1_transmit_report(struct in6_multi *in6m, const int type) { struct ifnet *ifp; struct in6_ifaddr *ia; struct ip6_hdr *ip6; struct mbuf *mh, *md; struct mld_hdr *mld; IN6_MULTI_LOCK_ASSERT(); MLD_LOCK_ASSERT(); ifp = in6m->in6m_ifp; ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST); /* ia may be NULL if link-local address is tentative. */ mh = m_gethdr(M_NOWAIT, MT_DATA); if (mh == NULL) { if (ia != NULL) ifa_free(&ia->ia_ifa); return (ENOMEM); } md = m_get(M_NOWAIT, MT_DATA); if (md == NULL) { m_free(mh); if (ia != NULL) ifa_free(&ia->ia_ifa); return (ENOMEM); } mh->m_next = md; /* * FUTURE: Consider increasing alignment by ETHER_HDR_LEN, so * that ether_output() does not need to allocate another mbuf * for the header in the most common case. */ MH_ALIGN(mh, sizeof(struct ip6_hdr)); mh->m_pkthdr.len = sizeof(struct ip6_hdr) + sizeof(struct mld_hdr); mh->m_len = sizeof(struct ip6_hdr); ip6 = mtod(mh, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_src = ia ? ia->ia_addr.sin6_addr : in6addr_any; ip6->ip6_dst = in6m->in6m_addr; md->m_len = sizeof(struct mld_hdr); mld = mtod(md, struct mld_hdr *); mld->mld_type = type; mld->mld_code = 0; mld->mld_cksum = 0; mld->mld_maxdelay = 0; mld->mld_reserved = 0; mld->mld_addr = in6m->in6m_addr; in6_clearscope(&mld->mld_addr); mld->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), sizeof(struct mld_hdr)); mld_save_context(mh, ifp); mh->m_flags |= M_MLDV1; mld_dispatch_packet(mh); if (ia != NULL) ifa_free(&ia->ia_ifa); return (0); } /* * Process a state change from the upper layer for the given IPv6 group. * * Each socket holds a reference on the in_multi in its own ip_moptions. * The socket layer will have made the necessary updates to.the group * state, it is now up to MLD to issue a state change report if there * has been any change between T0 (when the last state-change was issued) * and T1 (now). * * We use the MLDv2 state machine at group level. The MLd module * however makes the decision as to which MLD protocol version to speak. * A state change *from* INCLUDE {} always means an initial join. * A state change *to* INCLUDE {} always means a final leave. * * If delay is non-zero, and the state change is an initial multicast * join, the state change report will be delayed by 'delay' ticks * in units of PR_FASTHZ if MLDv1 is active on the link; otherwise * the initial MLDv2 state change report will be delayed by whichever * is sooner, a pending state-change timer or delay itself. * * VIMAGE: curvnet should have been set by caller, as this routine * is called from the socket option handlers. */ int mld_change_state(struct in6_multi *inm, const int delay) { struct mld_ifinfo *mli; struct ifnet *ifp; int error; IN6_MULTI_LOCK_ASSERT(); error = 0; /* * Try to detect if the upper layer just asked us to change state * for an interface which has now gone away. */ KASSERT(inm->in6m_ifma != NULL, ("%s: no ifma", __func__)); ifp = inm->in6m_ifma->ifma_ifp; if (ifp != NULL) { /* * Sanity check that netinet6's notion of ifp is the * same as net's. */ KASSERT(inm->in6m_ifp == ifp, ("%s: bad ifp", __func__)); } MLD_LOCK(); mli = MLD_IFINFO(ifp); KASSERT(mli != NULL, ("%s: no mld_ifinfo for ifp %p", __func__, ifp)); /* * If we detect a state transition to or from MCAST_UNDEFINED * for this group, then we are starting or finishing an MLD * life cycle for this group. */ if (inm->in6m_st[1].iss_fmode != inm->in6m_st[0].iss_fmode) { CTR3(KTR_MLD, "%s: inm transition %d -> %d", __func__, inm->in6m_st[0].iss_fmode, inm->in6m_st[1].iss_fmode); if (inm->in6m_st[0].iss_fmode == MCAST_UNDEFINED) { CTR1(KTR_MLD, "%s: initial join", __func__); error = mld_initial_join(inm, mli, delay); goto out_locked; } else if (inm->in6m_st[1].iss_fmode == MCAST_UNDEFINED) { CTR1(KTR_MLD, "%s: final leave", __func__); mld_final_leave(inm, mli); goto out_locked; } } else { CTR1(KTR_MLD, "%s: filter set change", __func__); } error = mld_handle_state_change(inm, mli); out_locked: MLD_UNLOCK(); return (error); } /* * Perform the initial join for an MLD group. * * When joining a group: * If the group should have its MLD traffic suppressed, do nothing. * MLDv1 starts sending MLDv1 host membership reports. * MLDv2 will schedule an MLDv2 state-change report containing the * initial state of the membership. * * If the delay argument is non-zero, then we must delay sending the * initial state change for delay ticks (in units of PR_FASTHZ). */ static int mld_initial_join(struct in6_multi *inm, struct mld_ifinfo *mli, const int delay) { struct ifnet *ifp; struct ifqueue *ifq; int error, retval, syncstates; int odelay; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif CTR4(KTR_MLD, "%s: initial join %s on ifp %p(%s)", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), inm->in6m_ifp, if_name(inm->in6m_ifp)); error = 0; syncstates = 1; ifp = inm->in6m_ifp; IN6_MULTI_LOCK_ASSERT(); MLD_LOCK_ASSERT(); KASSERT(mli && mli->mli_ifp == ifp, ("%s: inconsistent ifp", __func__)); /* * Groups joined on loopback or marked as 'not reported', * enter the MLD_SILENT_MEMBER state and * are never reported in any protocol exchanges. * All other groups enter the appropriate state machine * for the version in use on this link. * A link marked as MLIF_SILENT causes MLD to be completely * disabled for the link. */ if ((ifp->if_flags & IFF_LOOPBACK) || (mli->mli_flags & MLIF_SILENT) || !mld_is_addr_reported(&inm->in6m_addr)) { CTR1(KTR_MLD, "%s: not kicking state machine for silent group", __func__); inm->in6m_state = MLD_SILENT_MEMBER; inm->in6m_timer = 0; } else { /* * Deal with overlapping in_multi lifecycle. * If this group was LEAVING, then make sure * we drop the reference we picked up to keep the * group around for the final INCLUDE {} enqueue. */ if (mli->mli_version == MLD_VERSION_2 && inm->in6m_state == MLD_LEAVING_MEMBER) in6m_release_locked(inm); inm->in6m_state = MLD_REPORTING_MEMBER; switch (mli->mli_version) { case MLD_VERSION_1: /* * If a delay was provided, only use it if * it is greater than the delay normally * used for an MLDv1 state change report, * and delay sending the initial MLDv1 report * by not transitioning to the IDLE state. */ odelay = MLD_RANDOM_DELAY(MLD_V1_MAX_RI * PR_FASTHZ); if (delay) { inm->in6m_timer = max(delay, odelay); V_current_state_timers_running6 = 1; } else { inm->in6m_state = MLD_IDLE_MEMBER; error = mld_v1_transmit_report(inm, MLD_LISTENER_REPORT); if (error == 0) { inm->in6m_timer = odelay; V_current_state_timers_running6 = 1; } } break; case MLD_VERSION_2: /* * Defer update of T0 to T1, until the first copy * of the state change has been transmitted. */ syncstates = 0; /* * Immediately enqueue a State-Change Report for * this interface, freeing any previous reports. * Don't kick the timers if there is nothing to do, * or if an error occurred. */ ifq = &inm->in6m_scq; _IF_DRAIN(ifq); retval = mld_v2_enqueue_group_record(ifq, inm, 1, 0, 0, (mli->mli_flags & MLIF_USEALLOW)); CTR2(KTR_MLD, "%s: enqueue record = %d", __func__, retval); if (retval <= 0) { error = retval * -1; break; } /* * Schedule transmission of pending state-change * report up to RV times for this link. The timer * will fire at the next mld_fasttimo (~200ms), * giving us an opportunity to merge the reports. * * If a delay was provided to this function, only * use this delay if sooner than the existing one. */ KASSERT(mli->mli_rv > 1, ("%s: invalid robustness %d", __func__, mli->mli_rv)); inm->in6m_scrv = mli->mli_rv; if (delay) { if (inm->in6m_sctimer > 1) { inm->in6m_sctimer = min(inm->in6m_sctimer, delay); } else inm->in6m_sctimer = delay; } else inm->in6m_sctimer = 1; V_state_change_timers_running6 = 1; error = 0; break; } } /* * Only update the T0 state if state change is atomic, * i.e. we don't need to wait for a timer to fire before we * can consider the state change to have been communicated. */ if (syncstates) { in6m_commit(inm); CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); } return (error); } /* * Issue an intermediate state change during the life-cycle. */ static int mld_handle_state_change(struct in6_multi *inm, struct mld_ifinfo *mli) { struct ifnet *ifp; int retval; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif CTR4(KTR_MLD, "%s: state change for %s on ifp %p(%s)", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), inm->in6m_ifp, if_name(inm->in6m_ifp)); ifp = inm->in6m_ifp; IN6_MULTI_LOCK_ASSERT(); MLD_LOCK_ASSERT(); KASSERT(mli && mli->mli_ifp == ifp, ("%s: inconsistent ifp", __func__)); if ((ifp->if_flags & IFF_LOOPBACK) || (mli->mli_flags & MLIF_SILENT) || !mld_is_addr_reported(&inm->in6m_addr) || (mli->mli_version != MLD_VERSION_2)) { if (!mld_is_addr_reported(&inm->in6m_addr)) { CTR1(KTR_MLD, "%s: not kicking state machine for silent group", __func__); } CTR1(KTR_MLD, "%s: nothing to do", __func__); in6m_commit(inm); CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); return (0); } _IF_DRAIN(&inm->in6m_scq); retval = mld_v2_enqueue_group_record(&inm->in6m_scq, inm, 1, 0, 0, (mli->mli_flags & MLIF_USEALLOW)); CTR2(KTR_MLD, "%s: enqueue record = %d", __func__, retval); if (retval <= 0) return (-retval); /* * If record(s) were enqueued, start the state-change * report timer for this group. */ inm->in6m_scrv = mli->mli_rv; inm->in6m_sctimer = 1; V_state_change_timers_running6 = 1; return (0); } /* * Perform the final leave for a multicast address. * * When leaving a group: * MLDv1 sends a DONE message, if and only if we are the reporter. * MLDv2 enqueues a state-change report containing a transition * to INCLUDE {} for immediate transmission. */ static void mld_final_leave(struct in6_multi *inm, struct mld_ifinfo *mli) { int syncstates; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif syncstates = 1; CTR4(KTR_MLD, "%s: final leave %s on ifp %p(%s)", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), inm->in6m_ifp, if_name(inm->in6m_ifp)); IN6_MULTI_LOCK_ASSERT(); MLD_LOCK_ASSERT(); switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_LEAVING_MEMBER: /* Already leaving or left; do nothing. */ CTR1(KTR_MLD, "%s: not kicking state machine for silent group", __func__); break; case MLD_REPORTING_MEMBER: case MLD_IDLE_MEMBER: case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: if (mli->mli_version == MLD_VERSION_1) { #ifdef INVARIANTS if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER || inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER) panic("%s: MLDv2 state reached, not MLDv2 mode", __func__); #endif mld_v1_transmit_report(inm, MLD_LISTENER_DONE); inm->in6m_state = MLD_NOT_MEMBER; V_current_state_timers_running6 = 1; } else if (mli->mli_version == MLD_VERSION_2) { /* * Stop group timer and all pending reports. * Immediately enqueue a state-change report * TO_IN {} to be sent on the next fast timeout, * giving us an opportunity to merge reports. */ _IF_DRAIN(&inm->in6m_scq); inm->in6m_timer = 0; inm->in6m_scrv = mli->mli_rv; CTR4(KTR_MLD, "%s: Leaving %s/%s with %d " "pending retransmissions.", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp), inm->in6m_scrv); if (inm->in6m_scrv == 0) { inm->in6m_state = MLD_NOT_MEMBER; inm->in6m_sctimer = 0; } else { int retval; in6m_acquire_locked(inm); retval = mld_v2_enqueue_group_record( &inm->in6m_scq, inm, 1, 0, 0, (mli->mli_flags & MLIF_USEALLOW)); KASSERT(retval != 0, ("%s: enqueue record = %d", __func__, retval)); inm->in6m_state = MLD_LEAVING_MEMBER; inm->in6m_sctimer = 1; V_state_change_timers_running6 = 1; syncstates = 0; } break; } break; case MLD_LAZY_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_AWAKENING_MEMBER: /* Our reports are suppressed; do nothing. */ break; } if (syncstates) { in6m_commit(inm); CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED; CTR3(KTR_MLD, "%s: T1 now MCAST_UNDEFINED for %p/%s", __func__, &inm->in6m_addr, if_name(inm->in6m_ifp)); } } /* * Enqueue an MLDv2 group record to the given output queue. * * If is_state_change is zero, a current-state record is appended. * If is_state_change is non-zero, a state-change report is appended. * * If is_group_query is non-zero, an mbuf packet chain is allocated. * If is_group_query is zero, and if there is a packet with free space * at the tail of the queue, it will be appended to providing there * is enough free space. * Otherwise a new mbuf packet chain is allocated. * * If is_source_query is non-zero, each source is checked to see if * it was recorded for a Group-Source query, and will be omitted if * it is not both in-mode and recorded. * * If use_block_allow is non-zero, state change reports for initial join * and final leave, on an inclusive mode group with a source list, will be * rewritten to use the ALLOW_NEW and BLOCK_OLD record types, respectively. * * The function will attempt to allocate leading space in the packet * for the IPv6+ICMP headers to be prepended without fragmenting the chain. * * If successful the size of all data appended to the queue is returned, * otherwise an error code less than zero is returned, or zero if * no record(s) were appended. */ static int mld_v2_enqueue_group_record(struct ifqueue *ifq, struct in6_multi *inm, const int is_state_change, const int is_group_query, const int is_source_query, const int use_block_allow) { struct mldv2_record mr; struct mldv2_record *pmr; struct ifnet *ifp; struct ip6_msource *ims, *nims; struct mbuf *m0, *m, *md; int error, is_filter_list_change; int minrec0len, m0srcs, msrcs, nbytes, off; int record_has_sources; int now; int type; uint8_t mode; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif IN6_MULTI_LOCK_ASSERT(); error = 0; ifp = inm->in6m_ifp; is_filter_list_change = 0; m = NULL; m0 = NULL; m0srcs = 0; msrcs = 0; nbytes = 0; nims = NULL; record_has_sources = 1; pmr = NULL; type = MLD_DO_NOTHING; mode = inm->in6m_st[1].iss_fmode; /* * If we did not transition out of ASM mode during t0->t1, * and there are no source nodes to process, we can skip * the generation of source records. */ if (inm->in6m_st[0].iss_asm > 0 && inm->in6m_st[1].iss_asm > 0 && inm->in6m_nsrc == 0) record_has_sources = 0; if (is_state_change) { /* * Queue a state change record. * If the mode did not change, and there are non-ASM * listeners or source filters present, * we potentially need to issue two records for the group. * If there are ASM listeners, and there was no filter * mode transition of any kind, do nothing. * * If we are transitioning to MCAST_UNDEFINED, we need * not send any sources. A transition to/from this state is * considered inclusive with some special treatment. * * If we are rewriting initial joins/leaves to use * ALLOW/BLOCK, and the group's membership is inclusive, * we need to send sources in all cases. */ if (mode != inm->in6m_st[0].iss_fmode) { if (mode == MCAST_EXCLUDE) { CTR1(KTR_MLD, "%s: change to EXCLUDE", __func__); type = MLD_CHANGE_TO_EXCLUDE_MODE; } else { CTR1(KTR_MLD, "%s: change to INCLUDE", __func__); if (use_block_allow) { /* * XXX * Here we're interested in state * edges either direction between * MCAST_UNDEFINED and MCAST_INCLUDE. * Perhaps we should just check * the group state, rather than * the filter mode. */ if (mode == MCAST_UNDEFINED) { type = MLD_BLOCK_OLD_SOURCES; } else { type = MLD_ALLOW_NEW_SOURCES; } } else { type = MLD_CHANGE_TO_INCLUDE_MODE; if (mode == MCAST_UNDEFINED) record_has_sources = 0; } } } else { if (record_has_sources) { is_filter_list_change = 1; } else { type = MLD_DO_NOTHING; } } } else { /* * Queue a current state record. */ if (mode == MCAST_EXCLUDE) { type = MLD_MODE_IS_EXCLUDE; } else if (mode == MCAST_INCLUDE) { type = MLD_MODE_IS_INCLUDE; KASSERT(inm->in6m_st[1].iss_asm == 0, ("%s: inm %p is INCLUDE but ASM count is %d", __func__, inm, inm->in6m_st[1].iss_asm)); } } /* * Generate the filter list changes using a separate function. */ if (is_filter_list_change) return (mld_v2_enqueue_filter_change(ifq, inm)); if (type == MLD_DO_NOTHING) { CTR3(KTR_MLD, "%s: nothing to do for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); return (0); } /* * If any sources are present, we must be able to fit at least * one in the trailing space of the tail packet's mbuf, * ideally more. */ minrec0len = sizeof(struct mldv2_record); if (record_has_sources) minrec0len += sizeof(struct in6_addr); CTR4(KTR_MLD, "%s: queueing %s for %s/%s", __func__, mld_rec_type_to_str(type), ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); /* * Check if we have a packet in the tail of the queue for this * group into which the first group record for this group will fit. * Otherwise allocate a new packet. * Always allocate leading space for IP6+RA+ICMPV6+REPORT. * Note: Group records for G/GSR query responses MUST be sent * in their own packet. */ m0 = ifq->ifq_tail; if (!is_group_query && m0 != NULL && (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= MLD_V2_REPORT_MAXRECS) && (m0->m_pkthdr.len + minrec0len) < (ifp->if_mtu - MLD_MTUSPACE)) { m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); m = m0; CTR1(KTR_MLD, "%s: use existing packet", __func__); } else { if (_IF_QFULL(ifq)) { CTR1(KTR_MLD, "%s: outbound queue full", __func__); return (-ENOMEM); } m = NULL; m0srcs = (ifp->if_mtu - MLD_MTUSPACE - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); if (!is_state_change && !is_group_query) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (-ENOMEM); mld_save_context(m, ifp); CTR1(KTR_MLD, "%s: allocated first packet", __func__); } /* * Append group record. * If we have sources, we don't know how many yet. */ mr.mr_type = type; mr.mr_datalen = 0; mr.mr_numsrc = 0; mr.mr_addr = inm->in6m_addr; in6_clearscope(&mr.mr_addr); if (!m_append(m, sizeof(struct mldv2_record), (void *)&mr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed.", __func__); return (-ENOMEM); } nbytes += sizeof(struct mldv2_record); /* * Append as many sources as will fit in the first packet. * If we are appending to a new packet, the chain allocation * may potentially use clusters; use m_getptr() in this case. * If we are appending to an existing packet, we need to obtain * a pointer to the group record after m_append(), in case a new * mbuf was allocated. * * Only append sources which are in-mode at t1. If we are * transitioning to MCAST_UNDEFINED state on the group, and * use_block_allow is zero, do not include source entries. * Otherwise, we need to include this source in the report. * * Only report recorded sources in our filter set when responding * to a group-source query. */ if (record_has_sources) { if (m == m0) { md = m_last(m); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + md->m_len - nbytes); } else { md = m_getptr(m, 0, &off); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + off); } msrcs = 0; RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, nims) { CTR2(KTR_MLD, "%s: visit node %s", __func__, ip6_sprintf(ip6tbuf, &ims->im6s_addr)); now = im6s_get_mode(inm, ims, 1); CTR2(KTR_MLD, "%s: node is %d", __func__, now); if ((now != mode) || (now == mode && (!use_block_allow && mode == MCAST_UNDEFINED))) { CTR1(KTR_MLD, "%s: skip node", __func__); continue; } if (is_source_query && ims->im6s_stp == 0) { CTR1(KTR_MLD, "%s: skip unrecorded node", __func__); continue; } CTR1(KTR_MLD, "%s: append node", __func__); if (!m_append(m, sizeof(struct in6_addr), (void *)&ims->im6s_addr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed.", __func__); return (-ENOMEM); } nbytes += sizeof(struct in6_addr); ++msrcs; if (msrcs == m0srcs) break; } CTR2(KTR_MLD, "%s: msrcs is %d this packet", __func__, msrcs); pmr->mr_numsrc = htons(msrcs); nbytes += (msrcs * sizeof(struct in6_addr)); } if (is_source_query && msrcs == 0) { CTR1(KTR_MLD, "%s: no recorded sources to report", __func__); if (m != m0) m_freem(m); return (0); } /* * We are good to go with first packet. */ if (m != m0) { CTR1(KTR_MLD, "%s: enqueueing first packet", __func__); m->m_pkthdr.PH_vt.vt_nrecs = 1; _IF_ENQUEUE(ifq, m); } else m->m_pkthdr.PH_vt.vt_nrecs++; /* * No further work needed if no source list in packet(s). */ if (!record_has_sources) return (nbytes); /* * Whilst sources remain to be announced, we need to allocate * a new packet and fill out as many sources as will fit. * Always try for a cluster first. */ while (nims != NULL) { if (_IF_QFULL(ifq)) { CTR1(KTR_MLD, "%s: outbound queue full", __func__); return (-ENOMEM); } m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (-ENOMEM); mld_save_context(m, ifp); md = m_getptr(m, 0, &off); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + off); CTR1(KTR_MLD, "%s: allocated next packet", __func__); if (!m_append(m, sizeof(struct mldv2_record), (void *)&mr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed.", __func__); return (-ENOMEM); } m->m_pkthdr.PH_vt.vt_nrecs = 1; nbytes += sizeof(struct mldv2_record); m0srcs = (ifp->if_mtu - MLD_MTUSPACE - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); msrcs = 0; RB_FOREACH_FROM(ims, ip6_msource_tree, nims) { CTR2(KTR_MLD, "%s: visit node %s", __func__, ip6_sprintf(ip6tbuf, &ims->im6s_addr)); now = im6s_get_mode(inm, ims, 1); if ((now != mode) || (now == mode && (!use_block_allow && mode == MCAST_UNDEFINED))) { CTR1(KTR_MLD, "%s: skip node", __func__); continue; } if (is_source_query && ims->im6s_stp == 0) { CTR1(KTR_MLD, "%s: skip unrecorded node", __func__); continue; } CTR1(KTR_MLD, "%s: append node", __func__); if (!m_append(m, sizeof(struct in6_addr), (void *)&ims->im6s_addr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed.", __func__); return (-ENOMEM); } ++msrcs; if (msrcs == m0srcs) break; } pmr->mr_numsrc = htons(msrcs); nbytes += (msrcs * sizeof(struct in6_addr)); CTR1(KTR_MLD, "%s: enqueueing next packet", __func__); _IF_ENQUEUE(ifq, m); } return (nbytes); } /* * Type used to mark record pass completion. * We exploit the fact we can cast to this easily from the * current filter modes on each ip_msource node. */ typedef enum { REC_NONE = 0x00, /* MCAST_UNDEFINED */ REC_ALLOW = 0x01, /* MCAST_INCLUDE */ REC_BLOCK = 0x02, /* MCAST_EXCLUDE */ REC_FULL = REC_ALLOW | REC_BLOCK } rectype_t; /* * Enqueue an MLDv2 filter list change to the given output queue. * * Source list filter state is held in an RB-tree. When the filter list * for a group is changed without changing its mode, we need to compute * the deltas between T0 and T1 for each source in the filter set, * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records. * * As we may potentially queue two record types, and the entire R-B tree * needs to be walked at once, we break this out into its own function * so we can generate a tightly packed queue of packets. * * XXX This could be written to only use one tree walk, although that makes * serializing into the mbuf chains a bit harder. For now we do two walks * which makes things easier on us, and it may or may not be harder on * the L2 cache. * * If successful the size of all data appended to the queue is returned, * otherwise an error code less than zero is returned, or zero if * no record(s) were appended. */ static int mld_v2_enqueue_filter_change(struct ifqueue *ifq, struct in6_multi *inm) { static const int MINRECLEN = sizeof(struct mldv2_record) + sizeof(struct in6_addr); struct ifnet *ifp; struct mldv2_record mr; struct mldv2_record *pmr; struct ip6_msource *ims, *nims; struct mbuf *m, *m0, *md; int m0srcs, nbytes, npbytes, off, rsrcs, schanged; int nallow, nblock; uint8_t mode, now, then; rectype_t crt, drt, nrt; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif IN6_MULTI_LOCK_ASSERT(); if (inm->in6m_nsrc == 0 || (inm->in6m_st[0].iss_asm > 0 && inm->in6m_st[1].iss_asm > 0)) return (0); ifp = inm->in6m_ifp; /* interface */ mode = inm->in6m_st[1].iss_fmode; /* filter mode at t1 */ crt = REC_NONE; /* current group record type */ drt = REC_NONE; /* mask of completed group record types */ nrt = REC_NONE; /* record type for current node */ m0srcs = 0; /* # source which will fit in current mbuf chain */ npbytes = 0; /* # of bytes appended this packet */ nbytes = 0; /* # of bytes appended to group's state-change queue */ rsrcs = 0; /* # sources encoded in current record */ schanged = 0; /* # nodes encoded in overall filter change */ nallow = 0; /* # of source entries in ALLOW_NEW */ nblock = 0; /* # of source entries in BLOCK_OLD */ nims = NULL; /* next tree node pointer */ /* * For each possible filter record mode. * The first kind of source we encounter tells us which * is the first kind of record we start appending. * If a node transitioned to UNDEFINED at t1, its mode is treated * as the inverse of the group's filter mode. */ while (drt != REC_FULL) { do { m0 = ifq->ifq_tail; if (m0 != NULL && (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= MLD_V2_REPORT_MAXRECS) && (m0->m_pkthdr.len + MINRECLEN) < (ifp->if_mtu - MLD_MTUSPACE)) { m = m0; m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); CTR1(KTR_MLD, "%s: use previous packet", __func__); } else { m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { CTR1(KTR_MLD, "%s: m_get*() failed", __func__); return (-ENOMEM); } m->m_pkthdr.PH_vt.vt_nrecs = 0; mld_save_context(m, ifp); m0srcs = (ifp->if_mtu - MLD_MTUSPACE - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); npbytes = 0; CTR1(KTR_MLD, "%s: allocated new packet", __func__); } /* * Append the MLD group record header to the * current packet's data area. * Recalculate pointer to free space for next * group record, in case m_append() allocated * a new mbuf or cluster. */ memset(&mr, 0, sizeof(mr)); mr.mr_addr = inm->in6m_addr; in6_clearscope(&mr.mr_addr); if (!m_append(m, sizeof(mr), (void *)&mr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed", __func__); return (-ENOMEM); } npbytes += sizeof(struct mldv2_record); if (m != m0) { /* new packet; offset in chain */ md = m_getptr(m, npbytes - sizeof(struct mldv2_record), &off); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + off); } else { /* current packet; offset from last append */ md = m_last(m); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + md->m_len - sizeof(struct mldv2_record)); } /* * Begin walking the tree for this record type * pass, or continue from where we left off * previously if we had to allocate a new packet. * Only report deltas in-mode at t1. * We need not report included sources as allowed * if we are in inclusive mode on the group, * however the converse is not true. */ rsrcs = 0; if (nims == NULL) { nims = RB_MIN(ip6_msource_tree, &inm->in6m_srcs); } RB_FOREACH_FROM(ims, ip6_msource_tree, nims) { CTR2(KTR_MLD, "%s: visit node %s", __func__, ip6_sprintf(ip6tbuf, &ims->im6s_addr)); now = im6s_get_mode(inm, ims, 1); then = im6s_get_mode(inm, ims, 0); CTR3(KTR_MLD, "%s: mode: t0 %d, t1 %d", __func__, then, now); if (now == then) { CTR1(KTR_MLD, "%s: skip unchanged", __func__); continue; } if (mode == MCAST_EXCLUDE && now == MCAST_INCLUDE) { CTR1(KTR_MLD, "%s: skip IN src on EX group", __func__); continue; } nrt = (rectype_t)now; if (nrt == REC_NONE) nrt = (rectype_t)(~mode & REC_FULL); if (schanged++ == 0) { crt = nrt; } else if (crt != nrt) continue; if (!m_append(m, sizeof(struct in6_addr), (void *)&ims->im6s_addr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed", __func__); return (-ENOMEM); } nallow += !!(crt == REC_ALLOW); nblock += !!(crt == REC_BLOCK); if (++rsrcs == m0srcs) break; } /* * If we did not append any tree nodes on this * pass, back out of allocations. */ if (rsrcs == 0) { npbytes -= sizeof(struct mldv2_record); if (m != m0) { CTR1(KTR_MLD, "%s: m_free(m)", __func__); m_freem(m); } else { CTR1(KTR_MLD, "%s: m_adj(m, -mr)", __func__); m_adj(m, -((int)sizeof( struct mldv2_record))); } continue; } npbytes += (rsrcs * sizeof(struct in6_addr)); if (crt == REC_ALLOW) pmr->mr_type = MLD_ALLOW_NEW_SOURCES; else if (crt == REC_BLOCK) pmr->mr_type = MLD_BLOCK_OLD_SOURCES; pmr->mr_numsrc = htons(rsrcs); /* * Count the new group record, and enqueue this * packet if it wasn't already queued. */ m->m_pkthdr.PH_vt.vt_nrecs++; if (m != m0) _IF_ENQUEUE(ifq, m); nbytes += npbytes; } while (nims != NULL); drt |= crt; crt = (~crt & REC_FULL); } CTR3(KTR_MLD, "%s: queued %d ALLOW_NEW, %d BLOCK_OLD", __func__, nallow, nblock); return (nbytes); } static int mld_v2_merge_state_changes(struct in6_multi *inm, struct ifqueue *ifscq) { struct ifqueue *gq; struct mbuf *m; /* pending state-change */ struct mbuf *m0; /* copy of pending state-change */ struct mbuf *mt; /* last state-change in packet */ int docopy, domerge; u_int recslen; docopy = 0; domerge = 0; recslen = 0; IN6_MULTI_LOCK_ASSERT(); MLD_LOCK_ASSERT(); /* * If there are further pending retransmissions, make a writable * copy of each queued state-change message before merging. */ if (inm->in6m_scrv > 0) docopy = 1; gq = &inm->in6m_scq; #ifdef KTR if (gq->ifq_head == NULL) { CTR2(KTR_MLD, "%s: WARNING: queue for inm %p is empty", __func__, inm); } #endif m = gq->ifq_head; while (m != NULL) { /* * Only merge the report into the current packet if * there is sufficient space to do so; an MLDv2 report * packet may only contain 65,535 group records. * Always use a simple mbuf chain concatentation to do this, * as large state changes for single groups may have * allocated clusters. */ domerge = 0; mt = ifscq->ifq_tail; if (mt != NULL) { recslen = m_length(m, NULL); if ((mt->m_pkthdr.PH_vt.vt_nrecs + m->m_pkthdr.PH_vt.vt_nrecs <= MLD_V2_REPORT_MAXRECS) && (mt->m_pkthdr.len + recslen <= (inm->in6m_ifp->if_mtu - MLD_MTUSPACE))) domerge = 1; } if (!domerge && _IF_QFULL(gq)) { CTR2(KTR_MLD, "%s: outbound queue full, skipping whole packet %p", __func__, m); mt = m->m_nextpkt; if (!docopy) m_freem(m); m = mt; continue; } if (!docopy) { CTR2(KTR_MLD, "%s: dequeueing %p", __func__, m); _IF_DEQUEUE(gq, m0); m = m0->m_nextpkt; } else { CTR2(KTR_MLD, "%s: copying %p", __func__, m); m0 = m_dup(m, M_NOWAIT); if (m0 == NULL) return (ENOMEM); m0->m_nextpkt = NULL; m = m->m_nextpkt; } if (!domerge) { CTR3(KTR_MLD, "%s: queueing %p to ifscq %p)", __func__, m0, ifscq); _IF_ENQUEUE(ifscq, m0); } else { struct mbuf *mtl; /* last mbuf of packet mt */ CTR3(KTR_MLD, "%s: merging %p with ifscq tail %p)", __func__, m0, mt); mtl = m_last(mt); m0->m_flags &= ~M_PKTHDR; mt->m_pkthdr.len += recslen; mt->m_pkthdr.PH_vt.vt_nrecs += m0->m_pkthdr.PH_vt.vt_nrecs; mtl->m_next = m0; } } return (0); } /* * Respond to a pending MLDv2 General Query. */ static void mld_v2_dispatch_general_query(struct mld_ifinfo *mli) { struct ifmultiaddr *ifma; struct ifnet *ifp; struct in6_multi *inm; int retval; IN6_MULTI_LOCK_ASSERT(); MLD_LOCK_ASSERT(); KASSERT(mli->mli_version == MLD_VERSION_2, ("%s: called when version %d", __func__, mli->mli_version)); ifp = mli->mli_ifp; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_INET6 || ifma->ifma_protospec == NULL) continue; inm = (struct in6_multi *)ifma->ifma_protospec; KASSERT(ifp == inm->in6m_ifp, ("%s: inconsistent ifp", __func__)); switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: break; case MLD_REPORTING_MEMBER: case MLD_IDLE_MEMBER: case MLD_LAZY_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_AWAKENING_MEMBER: inm->in6m_state = MLD_REPORTING_MEMBER; retval = mld_v2_enqueue_group_record(&mli->mli_gq, inm, 0, 0, 0, 0); CTR2(KTR_MLD, "%s: enqueue record = %d", __func__, retval); break; case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: case MLD_LEAVING_MEMBER: break; } } IF_ADDR_RUNLOCK(ifp); mld_dispatch_queue(&mli->mli_gq, MLD_MAX_RESPONSE_BURST); /* * Slew transmission of bursts over 500ms intervals. */ if (mli->mli_gq.ifq_head != NULL) { mli->mli_v2_timer = 1 + MLD_RANDOM_DELAY( MLD_RESPONSE_BURST_INTERVAL); V_interface_timers_running6 = 1; } } /* * Transmit the next pending message in the output queue. * * VIMAGE: Needs to store/restore vnet pointer on a per-mbuf-chain basis. * MRT: Nothing needs to be done, as MLD traffic is always local to * a link and uses a link-scope multicast address. */ static void mld_dispatch_packet(struct mbuf *m) { struct ip6_moptions im6o; struct ifnet *ifp; struct ifnet *oifp; struct mbuf *m0; struct mbuf *md; struct ip6_hdr *ip6; struct mld_hdr *mld; int error; int off; int type; uint32_t ifindex; CTR2(KTR_MLD, "%s: transmit %p", __func__, m); /* * Set VNET image pointer from enqueued mbuf chain * before doing anything else. Whilst we use interface * indexes to guard against interface detach, they are * unique to each VIMAGE and must be retrieved. */ ifindex = mld_restore_context(m); /* * Check if the ifnet still exists. This limits the scope of * any race in the absence of a global ifp lock for low cost * (an array lookup). */ ifp = ifnet_byindex(ifindex); if (ifp == NULL) { CTR3(KTR_MLD, "%s: dropped %p as ifindex %u went away.", __func__, m, ifindex); m_freem(m); IP6STAT_INC(ip6s_noroute); goto out; } im6o.im6o_multicast_hlim = 1; im6o.im6o_multicast_loop = (V_ip6_mrouter != NULL); im6o.im6o_multicast_ifp = ifp; if (m->m_flags & M_MLDV1) { m0 = m; } else { m0 = mld_v2_encap_report(ifp, m); if (m0 == NULL) { CTR2(KTR_MLD, "%s: dropped %p", __func__, m); IP6STAT_INC(ip6s_odropped); goto out; } } mld_scrub_context(m0); m_clrprotoflags(m); m0->m_pkthdr.rcvif = V_loif; ip6 = mtod(m0, struct ip6_hdr *); #if 0 (void)in6_setscope(&ip6->ip6_dst, ifp, NULL); /* XXX LOR */ #else /* * XXX XXX Break some KPI rules to prevent an LOR which would * occur if we called in6_setscope() at transmission. * See comments at top of file. */ MLD_EMBEDSCOPE(&ip6->ip6_dst, ifp->if_index); #endif /* * Retrieve the ICMPv6 type before handoff to ip6_output(), * so we can bump the stats. */ md = m_getptr(m0, sizeof(struct ip6_hdr), &off); mld = (struct mld_hdr *)(mtod(md, uint8_t *) + off); type = mld->mld_type; error = ip6_output(m0, &mld_po, NULL, IPV6_UNSPECSRC, &im6o, &oifp, NULL); if (error) { CTR3(KTR_MLD, "%s: ip6_output(%p) = %d", __func__, m0, error); goto out; } ICMP6STAT_INC(icp6s_outhist[type]); if (oifp != NULL) { icmp6_ifstat_inc(oifp, ifs6_out_msg); switch (type) { case MLD_LISTENER_REPORT: case MLDV2_LISTENER_REPORT: icmp6_ifstat_inc(oifp, ifs6_out_mldreport); break; case MLD_LISTENER_DONE: icmp6_ifstat_inc(oifp, ifs6_out_mlddone); break; } } out: return; } /* * Encapsulate an MLDv2 report. * * KAME IPv6 requires that hop-by-hop options be passed separately, * and that the IPv6 header be prepended in a separate mbuf. * * Returns a pointer to the new mbuf chain head, or NULL if the * allocation failed. */ static struct mbuf * mld_v2_encap_report(struct ifnet *ifp, struct mbuf *m) { struct mbuf *mh; struct mldv2_report *mld; struct ip6_hdr *ip6; struct in6_ifaddr *ia; int mldreclen; KASSERT(ifp != NULL, ("%s: null ifp", __func__)); KASSERT((m->m_flags & M_PKTHDR), ("%s: mbuf chain %p is !M_PKTHDR", __func__, m)); /* * RFC3590: OK to send as :: or tentative during DAD. */ ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST); if (ia == NULL) CTR1(KTR_MLD, "%s: warning: ia is NULL", __func__); mh = m_gethdr(M_NOWAIT, MT_DATA); if (mh == NULL) { if (ia != NULL) ifa_free(&ia->ia_ifa); m_freem(m); return (NULL); } MH_ALIGN(mh, sizeof(struct ip6_hdr) + sizeof(struct mldv2_report)); mldreclen = m_length(m, NULL); CTR2(KTR_MLD, "%s: mldreclen is %d", __func__, mldreclen); mh->m_len = sizeof(struct ip6_hdr) + sizeof(struct mldv2_report); mh->m_pkthdr.len = sizeof(struct ip6_hdr) + sizeof(struct mldv2_report) + mldreclen; ip6 = mtod(mh, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_src = ia ? ia->ia_addr.sin6_addr : in6addr_any; if (ia != NULL) ifa_free(&ia->ia_ifa); ip6->ip6_dst = in6addr_linklocal_allv2routers; /* scope ID will be set in netisr */ mld = (struct mldv2_report *)(ip6 + 1); mld->mld_type = MLDV2_LISTENER_REPORT; mld->mld_code = 0; mld->mld_cksum = 0; mld->mld_v2_reserved = 0; mld->mld_v2_numrecs = htons(m->m_pkthdr.PH_vt.vt_nrecs); m->m_pkthdr.PH_vt.vt_nrecs = 0; mh->m_next = m; mld->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), sizeof(struct mldv2_report) + mldreclen); return (mh); } #ifdef KTR static char * mld_rec_type_to_str(const int type) { switch (type) { case MLD_CHANGE_TO_EXCLUDE_MODE: return "TO_EX"; break; case MLD_CHANGE_TO_INCLUDE_MODE: return "TO_IN"; break; case MLD_MODE_IS_EXCLUDE: return "MODE_EX"; break; case MLD_MODE_IS_INCLUDE: return "MODE_IN"; break; case MLD_ALLOW_NEW_SOURCES: return "ALLOW_NEW"; break; case MLD_BLOCK_OLD_SOURCES: return "BLOCK_OLD"; break; default: break; } return "unknown"; } #endif static void mld_init(void *unused __unused) { CTR1(KTR_MLD, "%s: initializing", __func__); MLD_LOCK_INIT(); ip6_initpktopts(&mld_po); mld_po.ip6po_hlim = 1; mld_po.ip6po_hbh = &mld_ra.hbh; mld_po.ip6po_prefer_tempaddr = IP6PO_TEMPADDR_NOTPREFER; mld_po.ip6po_flags = IP6PO_DONTFRAG; } SYSINIT(mld_init, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, mld_init, NULL); static void mld_uninit(void *unused __unused) { CTR1(KTR_MLD, "%s: tearing down", __func__); MLD_LOCK_DESTROY(); } SYSUNINIT(mld_uninit, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, mld_uninit, NULL); static void vnet_mld_init(const void *unused __unused) { CTR1(KTR_MLD, "%s: initializing", __func__); LIST_INIT(&V_mli_head); } VNET_SYSINIT(vnet_mld_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_mld_init, NULL); static void vnet_mld_uninit(const void *unused __unused) { CTR1(KTR_MLD, "%s: tearing down", __func__); KASSERT(LIST_EMPTY(&V_mli_head), ("%s: mli list not empty; ifnets not detached?", __func__)); } VNET_SYSUNINIT(vnet_mld_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_mld_uninit, NULL); static int mld_modevent(module_t mod, int type, void *unused __unused) { switch (type) { case MOD_LOAD: case MOD_UNLOAD: break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t mld_mod = { "mld", mld_modevent, 0 }; DECLARE_MODULE(mld, mld_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); Index: user/dchagin/lemul/sys/netinet6/nd6.c =================================================================== --- user/dchagin/lemul/sys/netinet6/nd6.c (revision 274240) +++ user/dchagin/lemul/sys/netinet6/nd6.c (revision 274241) @@ -1,2475 +1,2475 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: nd6.c,v 1.144 2001/05/24 07:44:00 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define L3_ADDR_SIN6(le) ((struct sockaddr_in6 *) L3_ADDR(le)) #include #include #include #include #include #include #include #include #include #include #include #define ND6_SLOWTIMER_INTERVAL (60 * 60) /* 1 hour */ #define ND6_RECALC_REACHTM_INTERVAL (60 * 120) /* 2 hours */ #define SIN6(s) ((const struct sockaddr_in6 *)(s)) /* timer values */ VNET_DEFINE(int, nd6_prune) = 1; /* walk list every 1 seconds */ VNET_DEFINE(int, nd6_delay) = 5; /* delay first probe time 5 second */ VNET_DEFINE(int, nd6_umaxtries) = 3; /* maximum unicast query */ VNET_DEFINE(int, nd6_mmaxtries) = 3; /* maximum multicast query */ VNET_DEFINE(int, nd6_useloopback) = 1; /* use loopback interface for * local traffic */ VNET_DEFINE(int, nd6_gctimer) = (60 * 60 * 24); /* 1 day: garbage * collection timer */ /* preventing too many loops in ND option parsing */ static VNET_DEFINE(int, nd6_maxndopt) = 10; /* max # of ND options allowed */ VNET_DEFINE(int, nd6_maxnudhint) = 0; /* max # of subsequent upper * layer hints */ static VNET_DEFINE(int, nd6_maxqueuelen) = 1; /* max pkts cached in unresolved * ND entries */ #define V_nd6_maxndopt VNET(nd6_maxndopt) #define V_nd6_maxqueuelen VNET(nd6_maxqueuelen) #ifdef ND6_DEBUG VNET_DEFINE(int, nd6_debug) = 1; #else VNET_DEFINE(int, nd6_debug) = 0; #endif /* for debugging? */ #if 0 static int nd6_inuse, nd6_allocated; #endif VNET_DEFINE(struct nd_drhead, nd_defrouter); VNET_DEFINE(struct nd_prhead, nd_prefix); VNET_DEFINE(int, nd6_recalc_reachtm_interval) = ND6_RECALC_REACHTM_INTERVAL; #define V_nd6_recalc_reachtm_interval VNET(nd6_recalc_reachtm_interval) int (*send_sendso_input_hook)(struct mbuf *, struct ifnet *, int, int); static int nd6_is_new_addr_neighbor(struct sockaddr_in6 *, struct ifnet *); static void nd6_setmtu0(struct ifnet *, struct nd_ifinfo *); static void nd6_slowtimo(void *); static int regen_tmpaddr(struct in6_ifaddr *); static struct llentry *nd6_free(struct llentry *, int); static void nd6_llinfo_timer(void *); static void clear_llinfo_pqueue(struct llentry *); static void nd6_rtrequest(int, struct rtentry *, struct rt_addrinfo *); static VNET_DEFINE(struct callout, nd6_slowtimo_ch); #define V_nd6_slowtimo_ch VNET(nd6_slowtimo_ch) VNET_DEFINE(struct callout, nd6_timer_ch); void nd6_init(void) { LIST_INIT(&V_nd_prefix); /* initialization of the default router list */ TAILQ_INIT(&V_nd_defrouter); /* start timer */ callout_init(&V_nd6_slowtimo_ch, 0); callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz, nd6_slowtimo, curvnet); } #ifdef VIMAGE void nd6_destroy() { callout_drain(&V_nd6_slowtimo_ch); callout_drain(&V_nd6_timer_ch); } #endif struct nd_ifinfo * nd6_ifattach(struct ifnet *ifp) { struct nd_ifinfo *nd; nd = (struct nd_ifinfo *)malloc(sizeof(*nd), M_IP6NDP, M_WAITOK|M_ZERO); nd->initialized = 1; nd->chlim = IPV6_DEFHLIM; nd->basereachable = REACHABLE_TIME; nd->reachable = ND_COMPUTE_RTIME(nd->basereachable); nd->retrans = RETRANS_TIMER; nd->flags = ND6_IFF_PERFORMNUD; /* A loopback interface always has ND6_IFF_AUTO_LINKLOCAL. * XXXHRS: Clear ND6_IFF_AUTO_LINKLOCAL on an IFT_BRIDGE interface by * default regardless of the V_ip6_auto_linklocal configuration to * give a reasonable default behavior. */ if ((V_ip6_auto_linklocal && ifp->if_type != IFT_BRIDGE) || (ifp->if_flags & IFF_LOOPBACK)) nd->flags |= ND6_IFF_AUTO_LINKLOCAL; /* * A loopback interface does not need to accept RTADV. * XXXHRS: Clear ND6_IFF_ACCEPT_RTADV on an IFT_BRIDGE interface by * default regardless of the V_ip6_accept_rtadv configuration to * prevent the interface from accepting RA messages arrived * on one of the member interfaces with ND6_IFF_ACCEPT_RTADV. */ if (V_ip6_accept_rtadv && !(ifp->if_flags & IFF_LOOPBACK) && (ifp->if_type != IFT_BRIDGE)) nd->flags |= ND6_IFF_ACCEPT_RTADV; if (V_ip6_no_radr && !(ifp->if_flags & IFF_LOOPBACK)) nd->flags |= ND6_IFF_NO_RADR; /* XXX: we cannot call nd6_setmtu since ifp is not fully initialized */ nd6_setmtu0(ifp, nd); return nd; } void nd6_ifdetach(struct nd_ifinfo *nd) { free(nd, M_IP6NDP); } /* * Reset ND level link MTU. This function is called when the physical MTU * changes, which means we might have to adjust the ND level MTU. */ void nd6_setmtu(struct ifnet *ifp) { nd6_setmtu0(ifp, ND_IFINFO(ifp)); } /* XXX todo: do not maintain copy of ifp->if_mtu in ndi->maxmtu */ void nd6_setmtu0(struct ifnet *ifp, struct nd_ifinfo *ndi) { u_int32_t omaxmtu; omaxmtu = ndi->maxmtu; switch (ifp->if_type) { case IFT_ARCNET: ndi->maxmtu = MIN(ARC_PHDS_MAXMTU, ifp->if_mtu); /* RFC2497 */ break; case IFT_FDDI: ndi->maxmtu = MIN(FDDIIPMTU, ifp->if_mtu); /* RFC2467 */ break; case IFT_ISO88025: ndi->maxmtu = MIN(ISO88025_MAX_MTU, ifp->if_mtu); break; default: ndi->maxmtu = ifp->if_mtu; break; } /* * Decreasing the interface MTU under IPV6 minimum MTU may cause * undesirable situation. We thus notify the operator of the change * explicitly. The check for omaxmtu is necessary to restrict the * log to the case of changing the MTU, not initializing it. */ if (omaxmtu >= IPV6_MMTU && ndi->maxmtu < IPV6_MMTU) { log(LOG_NOTICE, "nd6_setmtu0: " "new link MTU on %s (%lu) is too small for IPv6\n", if_name(ifp), (unsigned long)ndi->maxmtu); } if (ndi->maxmtu > V_in6_maxmtu) in6_setmaxmtu(); /* check all interfaces just in case */ } void nd6_option_init(void *opt, int icmp6len, union nd_opts *ndopts) { bzero(ndopts, sizeof(*ndopts)); ndopts->nd_opts_search = (struct nd_opt_hdr *)opt; ndopts->nd_opts_last = (struct nd_opt_hdr *)(((u_char *)opt) + icmp6len); if (icmp6len == 0) { ndopts->nd_opts_done = 1; ndopts->nd_opts_search = NULL; } } /* * Take one ND option. */ struct nd_opt_hdr * nd6_option(union nd_opts *ndopts) { struct nd_opt_hdr *nd_opt; int olen; KASSERT(ndopts != NULL, ("%s: ndopts == NULL", __func__)); KASSERT(ndopts->nd_opts_last != NULL, ("%s: uninitialized ndopts", __func__)); if (ndopts->nd_opts_search == NULL) return NULL; if (ndopts->nd_opts_done) return NULL; nd_opt = ndopts->nd_opts_search; /* make sure nd_opt_len is inside the buffer */ if ((caddr_t)&nd_opt->nd_opt_len >= (caddr_t)ndopts->nd_opts_last) { bzero(ndopts, sizeof(*ndopts)); return NULL; } olen = nd_opt->nd_opt_len << 3; if (olen == 0) { /* * Message validation requires that all included * options have a length that is greater than zero. */ bzero(ndopts, sizeof(*ndopts)); return NULL; } ndopts->nd_opts_search = (struct nd_opt_hdr *)((caddr_t)nd_opt + olen); if (ndopts->nd_opts_search > ndopts->nd_opts_last) { /* option overruns the end of buffer, invalid */ bzero(ndopts, sizeof(*ndopts)); return NULL; } else if (ndopts->nd_opts_search == ndopts->nd_opts_last) { /* reached the end of options chain */ ndopts->nd_opts_done = 1; ndopts->nd_opts_search = NULL; } return nd_opt; } /* * Parse multiple ND options. * This function is much easier to use, for ND routines that do not need * multiple options of the same type. */ int nd6_options(union nd_opts *ndopts) { struct nd_opt_hdr *nd_opt; int i = 0; KASSERT(ndopts != NULL, ("%s: ndopts == NULL", __func__)); KASSERT(ndopts->nd_opts_last != NULL, ("%s: uninitialized ndopts", __func__)); if (ndopts->nd_opts_search == NULL) return 0; while (1) { nd_opt = nd6_option(ndopts); if (nd_opt == NULL && ndopts->nd_opts_last == NULL) { /* * Message validation requires that all included * options have a length that is greater than zero. */ ICMP6STAT_INC(icp6s_nd_badopt); bzero(ndopts, sizeof(*ndopts)); return -1; } if (nd_opt == NULL) goto skip1; switch (nd_opt->nd_opt_type) { case ND_OPT_SOURCE_LINKADDR: case ND_OPT_TARGET_LINKADDR: case ND_OPT_MTU: case ND_OPT_REDIRECTED_HEADER: if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) { nd6log((LOG_INFO, "duplicated ND6 option found (type=%d)\n", nd_opt->nd_opt_type)); /* XXX bark? */ } else { ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } break; case ND_OPT_PREFIX_INFORMATION: if (ndopts->nd_opt_array[nd_opt->nd_opt_type] == 0) { ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } ndopts->nd_opts_pi_end = (struct nd_opt_prefix_info *)nd_opt; break; /* What about ND_OPT_ROUTE_INFO? RFC 4191 */ case ND_OPT_RDNSS: /* RFC 6106 */ case ND_OPT_DNSSL: /* RFC 6106 */ /* * Silently ignore options we know and do not care about * in the kernel. */ break; default: /* * Unknown options must be silently ignored, * to accomodate future extension to the protocol. */ nd6log((LOG_DEBUG, "nd6_options: unsupported option %d - " "option ignored\n", nd_opt->nd_opt_type)); } skip1: i++; if (i > V_nd6_maxndopt) { ICMP6STAT_INC(icp6s_nd_toomanyopt); nd6log((LOG_INFO, "too many loop in nd opt\n")); break; } if (ndopts->nd_opts_done) break; } return 0; } /* * ND6 timer routine to handle ND6 entries */ void nd6_llinfo_settimer_locked(struct llentry *ln, long tick) { int canceled; LLE_WLOCK_ASSERT(ln); if (tick < 0) { ln->la_expire = 0; ln->ln_ntick = 0; canceled = callout_stop(&ln->ln_timer_ch); } else { ln->la_expire = time_uptime + tick / hz; LLE_ADDREF(ln); if (tick > INT_MAX) { ln->ln_ntick = tick - INT_MAX; canceled = callout_reset(&ln->ln_timer_ch, INT_MAX, nd6_llinfo_timer, ln); } else { ln->ln_ntick = 0; canceled = callout_reset(&ln->ln_timer_ch, tick, nd6_llinfo_timer, ln); } } if (canceled) LLE_REMREF(ln); } void nd6_llinfo_settimer(struct llentry *ln, long tick) { LLE_WLOCK(ln); nd6_llinfo_settimer_locked(ln, tick); LLE_WUNLOCK(ln); } static void nd6_llinfo_timer(void *arg) { struct llentry *ln; struct in6_addr *dst; struct ifnet *ifp; struct nd_ifinfo *ndi = NULL; KASSERT(arg != NULL, ("%s: arg NULL", __func__)); ln = (struct llentry *)arg; LLE_WLOCK_ASSERT(ln); ifp = ln->lle_tbl->llt_ifp; CURVNET_SET(ifp->if_vnet); if (ln->ln_ntick > 0) { if (ln->ln_ntick > INT_MAX) { ln->ln_ntick -= INT_MAX; nd6_llinfo_settimer_locked(ln, INT_MAX); } else { ln->ln_ntick = 0; nd6_llinfo_settimer_locked(ln, ln->ln_ntick); } goto done; } ndi = ND_IFINFO(ifp); dst = &L3_ADDR_SIN6(ln)->sin6_addr; if (ln->la_flags & LLE_STATIC) { goto done; } if (ln->la_flags & LLE_DELETED) { (void)nd6_free(ln, 0); ln = NULL; goto done; } switch (ln->ln_state) { case ND6_LLINFO_INCOMPLETE: if (ln->la_asked < V_nd6_mmaxtries) { ln->la_asked++; nd6_llinfo_settimer_locked(ln, (long)ndi->retrans * hz / 1000); LLE_WUNLOCK(ln); nd6_ns_output(ifp, NULL, dst, ln, 0); LLE_WLOCK(ln); } else { struct mbuf *m = ln->la_hold; if (m) { struct mbuf *m0; /* * assuming every packet in la_hold has the * same IP header. Send error after unlock. */ m0 = m->m_nextpkt; m->m_nextpkt = NULL; ln->la_hold = m0; clear_llinfo_pqueue(ln); } EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_TIMEDOUT); (void)nd6_free(ln, 0); ln = NULL; if (m != NULL) icmp6_error2(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 0, ifp); } break; case ND6_LLINFO_REACHABLE: if (!ND6_LLINFO_PERMANENT(ln)) { ln->ln_state = ND6_LLINFO_STALE; nd6_llinfo_settimer_locked(ln, (long)V_nd6_gctimer * hz); } break; case ND6_LLINFO_STALE: /* Garbage Collection(RFC 2461 5.3) */ if (!ND6_LLINFO_PERMANENT(ln)) { EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_EXPIRED); (void)nd6_free(ln, 1); ln = NULL; } break; case ND6_LLINFO_DELAY: if (ndi && (ndi->flags & ND6_IFF_PERFORMNUD) != 0) { /* We need NUD */ ln->la_asked = 1; ln->ln_state = ND6_LLINFO_PROBE; nd6_llinfo_settimer_locked(ln, (long)ndi->retrans * hz / 1000); LLE_WUNLOCK(ln); nd6_ns_output(ifp, dst, dst, ln, 0); LLE_WLOCK(ln); } else { ln->ln_state = ND6_LLINFO_STALE; /* XXX */ nd6_llinfo_settimer_locked(ln, (long)V_nd6_gctimer * hz); } break; case ND6_LLINFO_PROBE: if (ln->la_asked < V_nd6_umaxtries) { ln->la_asked++; nd6_llinfo_settimer_locked(ln, (long)ndi->retrans * hz / 1000); LLE_WUNLOCK(ln); nd6_ns_output(ifp, dst, dst, ln, 0); LLE_WLOCK(ln); } else { EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_EXPIRED); (void)nd6_free(ln, 0); ln = NULL; } break; default: panic("%s: paths in a dark night can be confusing: %d", __func__, ln->ln_state); } done: if (ln != NULL) LLE_FREE_LOCKED(ln); CURVNET_RESTORE(); } /* * ND6 timer routine to expire default route list and prefix list */ void nd6_timer(void *arg) { CURVNET_SET((struct vnet *) arg); struct nd_defrouter *dr, *ndr; struct nd_prefix *pr, *npr; struct in6_ifaddr *ia6, *nia6; callout_reset(&V_nd6_timer_ch, V_nd6_prune * hz, nd6_timer, curvnet); /* expire default router list */ TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, ndr) { if (dr->expire && dr->expire < time_uptime) defrtrlist_del(dr); } /* * expire interface addresses. * in the past the loop was inside prefix expiry processing. * However, from a stricter speci-confrmance standpoint, we should * rather separate address lifetimes and prefix lifetimes. * * XXXRW: in6_ifaddrhead locking. */ addrloop: TAILQ_FOREACH_SAFE(ia6, &V_in6_ifaddrhead, ia_link, nia6) { /* check address lifetime */ if (IFA6_IS_INVALID(ia6)) { int regen = 0; /* * If the expiring address is temporary, try * regenerating a new one. This would be useful when * we suspended a laptop PC, then turned it on after a * period that could invalidate all temporary * addresses. Although we may have to restart the * loop (see below), it must be after purging the * address. Otherwise, we'd see an infinite loop of * regeneration. */ if (V_ip6_use_tempaddr && (ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0) { if (regen_tmpaddr(ia6) == 0) regen = 1; } in6_purgeaddr(&ia6->ia_ifa); if (regen) goto addrloop; /* XXX: see below */ } else if (IFA6_IS_DEPRECATED(ia6)) { int oldflags = ia6->ia6_flags; ia6->ia6_flags |= IN6_IFF_DEPRECATED; /* * If a temporary address has just become deprecated, * regenerate a new one if possible. */ if (V_ip6_use_tempaddr && (ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (oldflags & IN6_IFF_DEPRECATED) == 0) { if (regen_tmpaddr(ia6) == 0) { /* * A new temporary address is * generated. * XXX: this means the address chain * has changed while we are still in * the loop. Although the change * would not cause disaster (because * it's not a deletion, but an * addition,) we'd rather restart the * loop just for safety. Or does this * significantly reduce performance?? */ goto addrloop; } } } else { /* * A new RA might have made a deprecated address * preferred. */ ia6->ia6_flags &= ~IN6_IFF_DEPRECATED; } } /* expire prefix list */ LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, npr) { /* * check prefix lifetime. * since pltime is just for autoconf, pltime processing for * prefix is not necessary. */ if (pr->ndpr_vltime != ND6_INFINITE_LIFETIME && time_uptime - pr->ndpr_lastupdate > pr->ndpr_vltime) { /* * address expiration and prefix expiration are * separate. NEVER perform in6_purgeaddr here. */ prelist_remove(pr); } } CURVNET_RESTORE(); } /* * ia6 - deprecated/invalidated temporary address */ static int regen_tmpaddr(struct in6_ifaddr *ia6) { struct ifaddr *ifa; struct ifnet *ifp; struct in6_ifaddr *public_ifa6 = NULL; ifp = ia6->ia_ifa.ifa_ifp; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in6_ifaddr *it6; if (ifa->ifa_addr->sa_family != AF_INET6) continue; it6 = (struct in6_ifaddr *)ifa; /* ignore no autoconf addresses. */ if ((it6->ia6_flags & IN6_IFF_AUTOCONF) == 0) continue; /* ignore autoconf addresses with different prefixes. */ if (it6->ia6_ndpr == NULL || it6->ia6_ndpr != ia6->ia6_ndpr) continue; /* * Now we are looking at an autoconf address with the same * prefix as ours. If the address is temporary and is still * preferred, do not create another one. It would be rare, but * could happen, for example, when we resume a laptop PC after * a long period. */ if ((it6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && !IFA6_IS_DEPRECATED(it6)) { public_ifa6 = NULL; break; } /* * This is a public autoconf address that has the same prefix * as ours. If it is preferred, keep it. We can't break the * loop here, because there may be a still-preferred temporary * address with the prefix. */ if (!IFA6_IS_DEPRECATED(it6)) public_ifa6 = it6; if (public_ifa6 != NULL) ifa_ref(&public_ifa6->ia_ifa); } IF_ADDR_RUNLOCK(ifp); if (public_ifa6 != NULL) { int e; if ((e = in6_tmpifadd(public_ifa6, 0, 0)) != 0) { ifa_free(&public_ifa6->ia_ifa); log(LOG_NOTICE, "regen_tmpaddr: failed to create a new" " tmp addr,errno=%d\n", e); return (-1); } ifa_free(&public_ifa6->ia_ifa); return (0); } return (-1); } /* * Nuke neighbor cache/prefix/default router management table, right before * ifp goes away. */ void nd6_purge(struct ifnet *ifp) { struct nd_defrouter *dr, *ndr; struct nd_prefix *pr, *npr; /* * Nuke default router list entries toward ifp. * We defer removal of default router list entries that is installed * in the routing table, in order to keep additional side effects as * small as possible. */ TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, ndr) { if (dr->installed) continue; if (dr->ifp == ifp) defrtrlist_del(dr); } TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, ndr) { if (!dr->installed) continue; if (dr->ifp == ifp) defrtrlist_del(dr); } /* Nuke prefix list entries toward ifp */ LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, npr) { if (pr->ndpr_ifp == ifp) { /* * Because if_detach() does *not* release prefixes * while purging addresses the reference count will * still be above zero. We therefore reset it to * make sure that the prefix really gets purged. */ pr->ndpr_refcnt = 0; /* * Previously, pr->ndpr_addr is removed as well, * but I strongly believe we don't have to do it. * nd6_purge() is only called from in6_ifdetach(), * which removes all the associated interface addresses * by itself. * (jinmei@kame.net 20010129) */ prelist_remove(pr); } } /* cancel default outgoing interface setting */ if (V_nd6_defifindex == ifp->if_index) nd6_setdefaultiface(0); if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) { /* Refresh default router list. */ defrouter_select(); } /* XXXXX * We do not nuke the neighbor cache entries here any more * because the neighbor cache is kept in if_afdata[AF_INET6]. * nd6_purge() is invoked by in6_ifdetach() which is called * from if_detach() where everything gets purged. So let * in6_domifdetach() do the actual L2 table purging work. */ } /* * the caller acquires and releases the lock on the lltbls * Returns the llentry locked */ struct llentry * nd6_lookup(struct in6_addr *addr6, int flags, struct ifnet *ifp) { struct sockaddr_in6 sin6; struct llentry *ln; int llflags; bzero(&sin6, sizeof(sin6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_family = AF_INET6; sin6.sin6_addr = *addr6; IF_AFDATA_LOCK_ASSERT(ifp); llflags = 0; if (flags & ND6_CREATE) llflags |= LLE_CREATE; if (flags & ND6_EXCLUSIVE) llflags |= LLE_EXCLUSIVE; ln = lla_lookup(LLTABLE6(ifp), llflags, (struct sockaddr *)&sin6); if ((ln != NULL) && (llflags & LLE_CREATE)) ln->ln_state = ND6_LLINFO_NOSTATE; return (ln); } /* * Test whether a given IPv6 address is a neighbor or not, ignoring * the actual neighbor cache. The neighbor cache is ignored in order * to not reenter the routing code from within itself. */ static int nd6_is_new_addr_neighbor(struct sockaddr_in6 *addr, struct ifnet *ifp) { struct nd_prefix *pr; struct ifaddr *dstaddr; /* * A link-local address is always a neighbor. * XXX: a link does not necessarily specify a single interface. */ if (IN6_IS_ADDR_LINKLOCAL(&addr->sin6_addr)) { struct sockaddr_in6 sin6_copy; u_int32_t zone; /* * We need sin6_copy since sa6_recoverscope() may modify the * content (XXX). */ sin6_copy = *addr; if (sa6_recoverscope(&sin6_copy)) return (0); /* XXX: should be impossible */ if (in6_setscope(&sin6_copy.sin6_addr, ifp, &zone)) return (0); if (sin6_copy.sin6_scope_id == zone) return (1); else return (0); } /* * If the address matches one of our addresses, * it should be a neighbor. * If the address matches one of our on-link prefixes, it should be a * neighbor. */ LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { if (pr->ndpr_ifp != ifp) continue; if (!(pr->ndpr_stateflags & NDPRF_ONLINK)) { struct rtentry *rt; /* Always use the default FIB here. */ rt = in6_rtalloc1((struct sockaddr *)&pr->ndpr_prefix, 0, 0, RT_DEFAULT_FIB); if (rt == NULL) continue; /* * This is the case where multiple interfaces * have the same prefix, but only one is installed * into the routing table and that prefix entry * is not the one being examined here. In the case * where RADIX_MPATH is enabled, multiple route * entries (of the same rt_key value) will be * installed because the interface addresses all * differ. */ if (!IN6_ARE_ADDR_EQUAL(&pr->ndpr_prefix.sin6_addr, &((struct sockaddr_in6 *)rt_key(rt))->sin6_addr)) { RTFREE_LOCKED(rt); continue; } RTFREE_LOCKED(rt); } if (IN6_ARE_MASKED_ADDR_EQUAL(&pr->ndpr_prefix.sin6_addr, &addr->sin6_addr, &pr->ndpr_mask)) return (1); } /* * If the address is assigned on the node of the other side of * a p2p interface, the address should be a neighbor. */ dstaddr = ifa_ifwithdstaddr((struct sockaddr *)addr, RT_ALL_FIBS); if (dstaddr != NULL) { if (dstaddr->ifa_ifp == ifp) { ifa_free(dstaddr); return (1); } ifa_free(dstaddr); } /* * If the default router list is empty, all addresses are regarded * as on-link, and thus, as a neighbor. */ if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV && TAILQ_EMPTY(&V_nd_defrouter) && V_nd6_defifindex == ifp->if_index) { return (1); } return (0); } /* * Detect if a given IPv6 address identifies a neighbor on a given link. * XXX: should take care of the destination of a p2p link? */ int nd6_is_addr_neighbor(struct sockaddr_in6 *addr, struct ifnet *ifp) { struct llentry *lle; int rc = 0; IF_AFDATA_UNLOCK_ASSERT(ifp); if (nd6_is_new_addr_neighbor(addr, ifp)) return (1); /* * Even if the address matches none of our addresses, it might be * in the neighbor cache. */ IF_AFDATA_RLOCK(ifp); if ((lle = nd6_lookup(&addr->sin6_addr, 0, ifp)) != NULL) { LLE_RUNLOCK(lle); rc = 1; } IF_AFDATA_RUNLOCK(ifp); return (rc); } /* * Free an nd6 llinfo entry. * Since the function would cause significant changes in the kernel, DO NOT * make it global, unless you have a strong reason for the change, and are sure * that the change is safe. */ static struct llentry * nd6_free(struct llentry *ln, int gc) { struct llentry *next; struct nd_defrouter *dr; struct ifnet *ifp; LLE_WLOCK_ASSERT(ln); /* * we used to have pfctlinput(PRC_HOSTDEAD) here. * even though it is not harmful, it was not really necessary. */ /* cancel timer */ nd6_llinfo_settimer_locked(ln, -1); ifp = ln->lle_tbl->llt_ifp; if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) { dr = defrouter_lookup(&L3_ADDR_SIN6(ln)->sin6_addr, ifp); if (dr != NULL && dr->expire && ln->ln_state == ND6_LLINFO_STALE && gc) { /* * If the reason for the deletion is just garbage * collection, and the neighbor is an active default * router, do not delete it. Instead, reset the GC * timer using the router's lifetime. * Simply deleting the entry would affect default * router selection, which is not necessarily a good * thing, especially when we're using router preference * values. * XXX: the check for ln_state would be redundant, * but we intentionally keep it just in case. */ if (dr->expire > time_uptime) nd6_llinfo_settimer_locked(ln, (dr->expire - time_uptime) * hz); else nd6_llinfo_settimer_locked(ln, (long)V_nd6_gctimer * hz); next = LIST_NEXT(ln, lle_next); LLE_REMREF(ln); LLE_WUNLOCK(ln); return (next); } if (dr) { /* * Unreachablity of a router might affect the default * router selection and on-link detection of advertised * prefixes. */ /* * Temporarily fake the state to choose a new default * router and to perform on-link determination of * prefixes correctly. * Below the state will be set correctly, * or the entry itself will be deleted. */ ln->ln_state = ND6_LLINFO_INCOMPLETE; } if (ln->ln_router || dr) { /* * We need to unlock to avoid a LOR with rt6_flush() with the * rnh and for the calls to pfxlist_onlink_check() and * defrouter_select() in the block further down for calls * into nd6_lookup(). We still hold a ref. */ LLE_WUNLOCK(ln); /* * rt6_flush must be called whether or not the neighbor * is in the Default Router List. * See a corresponding comment in nd6_na_input(). */ rt6_flush(&L3_ADDR_SIN6(ln)->sin6_addr, ifp); } if (dr) { /* * Since defrouter_select() does not affect the * on-link determination and MIP6 needs the check * before the default router selection, we perform * the check now. */ pfxlist_onlink_check(); /* * Refresh default router list. */ defrouter_select(); } if (ln->ln_router || dr) LLE_WLOCK(ln); } /* * Before deleting the entry, remember the next entry as the * return value. We need this because pfxlist_onlink_check() above * might have freed other entries (particularly the old next entry) as * a side effect (XXX). */ next = LIST_NEXT(ln, lle_next); /* * Save to unlock. We still hold an extra reference and will not * free(9) in llentry_free() if someone else holds one as well. */ LLE_WUNLOCK(ln); IF_AFDATA_LOCK(ifp); LLE_WLOCK(ln); /* Guard against race with other llentry_free(). */ if (ln->la_flags & LLE_LINKED) { LLE_REMREF(ln); llentry_free(ln); } else LLE_FREE_LOCKED(ln); IF_AFDATA_UNLOCK(ifp); return (next); } /* * Upper-layer reachability hint for Neighbor Unreachability Detection. * * XXX cost-effective methods? */ void nd6_nud_hint(struct rtentry *rt, struct in6_addr *dst6, int force) { struct llentry *ln; struct ifnet *ifp; if ((dst6 == NULL) || (rt == NULL)) return; ifp = rt->rt_ifp; IF_AFDATA_RLOCK(ifp); ln = nd6_lookup(dst6, ND6_EXCLUSIVE, NULL); IF_AFDATA_RUNLOCK(ifp); if (ln == NULL) return; if (ln->ln_state < ND6_LLINFO_REACHABLE) goto done; /* * if we get upper-layer reachability confirmation many times, * it is possible we have false information. */ if (!force) { ln->ln_byhint++; if (ln->ln_byhint > V_nd6_maxnudhint) { goto done; } } ln->ln_state = ND6_LLINFO_REACHABLE; if (!ND6_LLINFO_PERMANENT(ln)) { nd6_llinfo_settimer_locked(ln, (long)ND_IFINFO(rt->rt_ifp)->reachable * hz); } done: LLE_WUNLOCK(ln); } /* * Rejuvenate this function for routing operations related * processing. */ void nd6_rtrequest(int req, struct rtentry *rt, struct rt_addrinfo *info) { struct sockaddr_in6 *gateway; struct nd_defrouter *dr; struct ifnet *ifp; gateway = (struct sockaddr_in6 *)rt->rt_gateway; ifp = rt->rt_ifp; switch (req) { case RTM_ADD: break; case RTM_DELETE: if (!ifp) return; /* * Only indirect routes are interesting. */ if ((rt->rt_flags & RTF_GATEWAY) == 0) return; /* * check for default route */ if (IN6_ARE_ADDR_EQUAL(&in6addr_any, &SIN6(rt_key(rt))->sin6_addr)) { dr = defrouter_lookup(&gateway->sin6_addr, ifp); if (dr != NULL) dr->installed = 0; } break; } } int nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) { struct in6_drlist *drl = (struct in6_drlist *)data; struct in6_oprlist *oprl = (struct in6_oprlist *)data; struct in6_ndireq *ndi = (struct in6_ndireq *)data; struct in6_nbrinfo *nbi = (struct in6_nbrinfo *)data; struct in6_ndifreq *ndif = (struct in6_ndifreq *)data; struct nd_defrouter *dr; struct nd_prefix *pr; int i = 0, error = 0; if (ifp->if_afdata[AF_INET6] == NULL) return (EPFNOSUPPORT); switch (cmd) { case SIOCGDRLST_IN6: /* * obsolete API, use sysctl under net.inet6.icmp6 */ bzero(drl, sizeof(*drl)); TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) { if (i >= DRLSTSIZ) break; drl->defrouter[i].rtaddr = dr->rtaddr; in6_clearscope(&drl->defrouter[i].rtaddr); drl->defrouter[i].flags = dr->flags; drl->defrouter[i].rtlifetime = dr->rtlifetime; drl->defrouter[i].expire = dr->expire + (time_second - time_uptime); drl->defrouter[i].if_index = dr->ifp->if_index; i++; } break; case SIOCGPRLST_IN6: /* * obsolete API, use sysctl under net.inet6.icmp6 * * XXX the structure in6_prlist was changed in backward- * incompatible manner. in6_oprlist is used for SIOCGPRLST_IN6, * in6_prlist is used for nd6_sysctl() - fill_prlist(). */ /* * XXX meaning of fields, especialy "raflags", is very * differnet between RA prefix list and RR/static prefix list. * how about separating ioctls into two? */ bzero(oprl, sizeof(*oprl)); LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { struct nd_pfxrouter *pfr; int j; if (i >= PRLSTSIZ) break; oprl->prefix[i].prefix = pr->ndpr_prefix.sin6_addr; oprl->prefix[i].raflags = pr->ndpr_raf; oprl->prefix[i].prefixlen = pr->ndpr_plen; oprl->prefix[i].vltime = pr->ndpr_vltime; oprl->prefix[i].pltime = pr->ndpr_pltime; oprl->prefix[i].if_index = pr->ndpr_ifp->if_index; if (pr->ndpr_vltime == ND6_INFINITE_LIFETIME) oprl->prefix[i].expire = 0; else { time_t maxexpire; /* XXX: we assume time_t is signed. */ maxexpire = (-1) & ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1)); if (pr->ndpr_vltime < maxexpire - pr->ndpr_lastupdate) { oprl->prefix[i].expire = pr->ndpr_lastupdate + pr->ndpr_vltime + (time_second - time_uptime); } else oprl->prefix[i].expire = maxexpire; } j = 0; LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry) { if (j < DRLSTSIZ) { #define RTRADDR oprl->prefix[i].advrtr[j] RTRADDR = pfr->router->rtaddr; in6_clearscope(&RTRADDR); #undef RTRADDR } j++; } oprl->prefix[i].advrtrs = j; oprl->prefix[i].origin = PR_ORIG_RA; i++; } break; case OSIOCGIFINFO_IN6: #define ND ndi->ndi /* XXX: old ndp(8) assumes a positive value for linkmtu. */ bzero(&ND, sizeof(ND)); ND.linkmtu = IN6_LINKMTU(ifp); ND.maxmtu = ND_IFINFO(ifp)->maxmtu; ND.basereachable = ND_IFINFO(ifp)->basereachable; ND.reachable = ND_IFINFO(ifp)->reachable; ND.retrans = ND_IFINFO(ifp)->retrans; ND.flags = ND_IFINFO(ifp)->flags; ND.recalctm = ND_IFINFO(ifp)->recalctm; ND.chlim = ND_IFINFO(ifp)->chlim; break; case SIOCGIFINFO_IN6: ND = *ND_IFINFO(ifp); break; case SIOCSIFINFO_IN6: /* * used to change host variables from userland. * intented for a use on router to reflect RA configurations. */ /* 0 means 'unspecified' */ if (ND.linkmtu != 0) { if (ND.linkmtu < IPV6_MMTU || ND.linkmtu > IN6_LINKMTU(ifp)) { error = EINVAL; break; } ND_IFINFO(ifp)->linkmtu = ND.linkmtu; } if (ND.basereachable != 0) { int obasereachable = ND_IFINFO(ifp)->basereachable; ND_IFINFO(ifp)->basereachable = ND.basereachable; if (ND.basereachable != obasereachable) ND_IFINFO(ifp)->reachable = ND_COMPUTE_RTIME(ND.basereachable); } if (ND.retrans != 0) ND_IFINFO(ifp)->retrans = ND.retrans; if (ND.chlim != 0) ND_IFINFO(ifp)->chlim = ND.chlim; /* FALLTHROUGH */ case SIOCSIFINFO_FLAGS: { struct ifaddr *ifa; struct in6_ifaddr *ia; if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) && !(ND.flags & ND6_IFF_IFDISABLED)) { /* ifdisabled 1->0 transision */ /* * If the interface is marked as ND6_IFF_IFDISABLED and * has an link-local address with IN6_IFF_DUPLICATED, * do not clear ND6_IFF_IFDISABLED. * See RFC 4862, Section 5.4.5. */ int duplicated_linklocal = 0; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if ((ia->ia6_flags & IN6_IFF_DUPLICATED) && IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) { duplicated_linklocal = 1; break; } } IF_ADDR_RUNLOCK(ifp); if (duplicated_linklocal) { ND.flags |= ND6_IFF_IFDISABLED; log(LOG_ERR, "Cannot enable an interface" " with a link-local address marked" " duplicate.\n"); } else { ND_IFINFO(ifp)->flags &= ~ND6_IFF_IFDISABLED; if (ifp->if_flags & IFF_UP) in6_if_up(ifp); } } else if (!(ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) && (ND.flags & ND6_IFF_IFDISABLED)) { /* ifdisabled 0->1 transision */ /* Mark all IPv6 address as tentative. */ ND_IFINFO(ifp)->flags |= ND6_IFF_IFDISABLED; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; ia->ia6_flags |= IN6_IFF_TENTATIVE; } IF_ADDR_RUNLOCK(ifp); } if (ND.flags & ND6_IFF_AUTO_LINKLOCAL) { if (!(ND_IFINFO(ifp)->flags & ND6_IFF_AUTO_LINKLOCAL)) { /* auto_linklocal 0->1 transision */ /* If no link-local address on ifp, configure */ ND_IFINFO(ifp)->flags |= ND6_IFF_AUTO_LINKLOCAL; in6_ifattach(ifp, NULL); } else if (!(ND.flags & ND6_IFF_IFDISABLED) && ifp->if_flags & IFF_UP) { /* * When the IF already has * ND6_IFF_AUTO_LINKLOCAL, no link-local * address is assigned, and IFF_UP, try to * assign one. */ int haslinklocal = 0; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if (IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) { haslinklocal = 1; break; } } IF_ADDR_RUNLOCK(ifp); if (!haslinklocal) in6_ifattach(ifp, NULL); } } } ND_IFINFO(ifp)->flags = ND.flags; break; #undef ND case SIOCSNDFLUSH_IN6: /* XXX: the ioctl name is confusing... */ /* sync kernel routing table with the default router list */ defrouter_reset(); defrouter_select(); break; case SIOCSPFXFLUSH_IN6: { /* flush all the prefix advertised by routers */ struct nd_prefix *pr, *next; LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, next) { struct in6_ifaddr *ia, *ia_next; if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) continue; /* XXX */ /* do we really have to remove addresses as well? */ /* XXXRW: in6_ifaddrhead locking. */ TAILQ_FOREACH_SAFE(ia, &V_in6_ifaddrhead, ia_link, ia_next) { if ((ia->ia6_flags & IN6_IFF_AUTOCONF) == 0) continue; if (ia->ia6_ndpr == pr) in6_purgeaddr(&ia->ia_ifa); } prelist_remove(pr); } break; } case SIOCSRTRFLUSH_IN6: { /* flush all the default routers */ struct nd_defrouter *dr, *next; defrouter_reset(); TAILQ_FOREACH_SAFE(dr, &V_nd_defrouter, dr_entry, next) { defrtrlist_del(dr); } defrouter_select(); break; } case SIOCGNBRINFO_IN6: { struct llentry *ln; struct in6_addr nb_addr = nbi->addr; /* make local for safety */ if ((error = in6_setscope(&nb_addr, ifp, NULL)) != 0) return (error); IF_AFDATA_RLOCK(ifp); ln = nd6_lookup(&nb_addr, 0, ifp); IF_AFDATA_RUNLOCK(ifp); if (ln == NULL) { error = EINVAL; break; } nbi->state = ln->ln_state; nbi->asked = ln->la_asked; nbi->isrouter = ln->ln_router; if (ln->la_expire == 0) nbi->expire = 0; else nbi->expire = ln->la_expire + (time_second - time_uptime); LLE_RUNLOCK(ln); break; } case SIOCGDEFIFACE_IN6: /* XXX: should be implemented as a sysctl? */ ndif->ifindex = V_nd6_defifindex; break; case SIOCSDEFIFACE_IN6: /* XXX: should be implemented as a sysctl? */ return (nd6_setdefaultiface(ndif->ifindex)); } return (error); } /* * Create neighbor cache entry and cache link-layer address, * on reception of inbound ND6 packets. (RS/RA/NS/redirect) * * type - ICMP6 type * code - type dependent information * * XXXXX * The caller of this function already acquired the ndp * cache table lock because the cache entry is returned. */ struct llentry * nd6_cache_lladdr(struct ifnet *ifp, struct in6_addr *from, char *lladdr, int lladdrlen, int type, int code) { struct llentry *ln = NULL; int is_newentry; int do_update; int olladdr; int llchange; int flags; int newstate = 0; uint16_t router = 0; struct sockaddr_in6 sin6; struct mbuf *chain = NULL; int static_route = 0; IF_AFDATA_UNLOCK_ASSERT(ifp); KASSERT(ifp != NULL, ("%s: ifp == NULL", __func__)); KASSERT(from != NULL, ("%s: from == NULL", __func__)); /* nothing must be updated for unspecified address */ if (IN6_IS_ADDR_UNSPECIFIED(from)) return NULL; /* * Validation about ifp->if_addrlen and lladdrlen must be done in * the caller. * * XXX If the link does not have link-layer adderss, what should * we do? (ifp->if_addrlen == 0) * Spec says nothing in sections for RA, RS and NA. There's small * description on it in NS section (RFC 2461 7.2.3). */ flags = lladdr ? ND6_EXCLUSIVE : 0; IF_AFDATA_RLOCK(ifp); ln = nd6_lookup(from, flags, ifp); IF_AFDATA_RUNLOCK(ifp); if (ln == NULL) { flags |= ND6_EXCLUSIVE; IF_AFDATA_LOCK(ifp); ln = nd6_lookup(from, flags | ND6_CREATE, ifp); IF_AFDATA_UNLOCK(ifp); is_newentry = 1; } else { /* do nothing if static ndp is set */ if (ln->la_flags & LLE_STATIC) { static_route = 1; goto done; } is_newentry = 0; } if (ln == NULL) return (NULL); olladdr = (ln->la_flags & LLE_VALID) ? 1 : 0; if (olladdr && lladdr) { llchange = bcmp(lladdr, &ln->ll_addr, ifp->if_addrlen); } else llchange = 0; /* * newentry olladdr lladdr llchange (*=record) * 0 n n -- (1) * 0 y n -- (2) * 0 n y -- (3) * STALE * 0 y y n (4) * * 0 y y y (5) * STALE * 1 -- n -- (6) NOSTATE(= PASSIVE) * 1 -- y -- (7) * STALE */ if (lladdr) { /* (3-5) and (7) */ /* * Record source link-layer address * XXX is it dependent to ifp->if_type? */ bcopy(lladdr, &ln->ll_addr, ifp->if_addrlen); ln->la_flags |= LLE_VALID; EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED); } if (!is_newentry) { if ((!olladdr && lladdr != NULL) || /* (3) */ (olladdr && lladdr != NULL && llchange)) { /* (5) */ do_update = 1; newstate = ND6_LLINFO_STALE; } else /* (1-2,4) */ do_update = 0; } else { do_update = 1; if (lladdr == NULL) /* (6) */ newstate = ND6_LLINFO_NOSTATE; else /* (7) */ newstate = ND6_LLINFO_STALE; } if (do_update) { /* * Update the state of the neighbor cache. */ ln->ln_state = newstate; if (ln->ln_state == ND6_LLINFO_STALE) { /* * XXX: since nd6_output() below will cause * state tansition to DELAY and reset the timer, * we must set the timer now, although it is actually * meaningless. */ nd6_llinfo_settimer_locked(ln, (long)V_nd6_gctimer * hz); if (ln->la_hold) { struct mbuf *m_hold, *m_hold_next; /* * reset the la_hold in advance, to explicitly * prevent a la_hold lookup in nd6_output() * (wouldn't happen, though...) */ for (m_hold = ln->la_hold, ln->la_hold = NULL; m_hold; m_hold = m_hold_next) { m_hold_next = m_hold->m_nextpkt; m_hold->m_nextpkt = NULL; /* * we assume ifp is not a p2p here, so * just set the 2nd argument as the * 1st one. */ nd6_output_lle(ifp, ifp, m_hold, L3_ADDR_SIN6(ln), NULL, ln, &chain); } /* * If we have mbufs in the chain we need to do * deferred transmit. Copy the address from the * llentry before dropping the lock down below. */ if (chain != NULL) memcpy(&sin6, L3_ADDR_SIN6(ln), sizeof(sin6)); } } else if (ln->ln_state == ND6_LLINFO_INCOMPLETE) { /* probe right away */ nd6_llinfo_settimer_locked((void *)ln, 0); } } /* * ICMP6 type dependent behavior. * * NS: clear IsRouter if new entry * RS: clear IsRouter * RA: set IsRouter if there's lladdr * redir: clear IsRouter if new entry * * RA case, (1): * The spec says that we must set IsRouter in the following cases: * - If lladdr exist, set IsRouter. This means (1-5). * - If it is old entry (!newentry), set IsRouter. This means (7). * So, based on the spec, in (1-5) and (7) cases we must set IsRouter. * A quetion arises for (1) case. (1) case has no lladdr in the * neighbor cache, this is similar to (6). * This case is rare but we figured that we MUST NOT set IsRouter. * * newentry olladdr lladdr llchange NS RS RA redir * D R * 0 n n -- (1) c ? s * 0 y n -- (2) c s s * 0 n y -- (3) c s s * 0 y y n (4) c s s * 0 y y y (5) c s s * 1 -- n -- (6) c c c s * 1 -- y -- (7) c c s c s * * (c=clear s=set) */ switch (type & 0xff) { case ND_NEIGHBOR_SOLICIT: /* * New entry must have is_router flag cleared. */ if (is_newentry) /* (6-7) */ ln->ln_router = 0; break; case ND_REDIRECT: /* * If the icmp is a redirect to a better router, always set the * is_router flag. Otherwise, if the entry is newly created, * clear the flag. [RFC 2461, sec 8.3] */ if (code == ND_REDIRECT_ROUTER) ln->ln_router = 1; else if (is_newentry) /* (6-7) */ ln->ln_router = 0; break; case ND_ROUTER_SOLICIT: /* * is_router flag must always be cleared. */ ln->ln_router = 0; break; case ND_ROUTER_ADVERT: /* * Mark an entry with lladdr as a router. */ if ((!is_newentry && (olladdr || lladdr)) || /* (2-5) */ (is_newentry && lladdr)) { /* (7) */ ln->ln_router = 1; } break; } if (ln != NULL) { static_route = (ln->la_flags & LLE_STATIC); router = ln->ln_router; if (flags & ND6_EXCLUSIVE) LLE_WUNLOCK(ln); else LLE_RUNLOCK(ln); if (static_route) ln = NULL; } if (chain) nd6_output_flush(ifp, ifp, chain, &sin6, NULL); /* * When the link-layer address of a router changes, select the * best router again. In particular, when the neighbor entry is newly * created, it might affect the selection policy. * Question: can we restrict the first condition to the "is_newentry" * case? * XXX: when we hear an RA from a new router with the link-layer * address option, defrouter_select() is called twice, since * defrtrlist_update called the function as well. However, I believe * we can compromise the overhead, since it only happens the first * time. * XXX: although defrouter_select() should not have a bad effect * for those are not autoconfigured hosts, we explicitly avoid such * cases for safety. */ if (do_update && router && ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) { /* * guaranteed recursion */ defrouter_select(); } return (ln); done: if (ln != NULL) { if (flags & ND6_EXCLUSIVE) LLE_WUNLOCK(ln); else LLE_RUNLOCK(ln); if (static_route) ln = NULL; } return (ln); } static void nd6_slowtimo(void *arg) { CURVNET_SET((struct vnet *) arg); struct nd_ifinfo *nd6if; struct ifnet *ifp; callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz, nd6_slowtimo, curvnet); IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_list) { if (ifp->if_afdata[AF_INET6] == NULL) continue; nd6if = ND_IFINFO(ifp); if (nd6if->basereachable && /* already initialized */ (nd6if->recalctm -= ND6_SLOWTIMER_INTERVAL) <= 0) { /* * Since reachable time rarely changes by router * advertisements, we SHOULD insure that a new random * value gets recomputed at least once every few hours. * (RFC 2461, 6.3.4) */ nd6if->recalctm = V_nd6_recalc_reachtm_interval; nd6if->reachable = ND_COMPUTE_RTIME(nd6if->basereachable); } } IFNET_RUNLOCK_NOSLEEP(); CURVNET_RESTORE(); } /* * IPv6 packet output - light version. * Checks if destination LLE exists and is in proper state * (e.g no modification required). If not true, fall back to * "heavy" version. */ int nd6_output(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m, struct sockaddr_in6 *dst, struct rtentry *rt0) { struct llentry *ln = NULL; int error = 0; /* discard the packet if IPv6 operation is disabled on the interface */ if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED)) { m_freem(m); return (ENETDOWN); /* better error? */ } if (IN6_IS_ADDR_MULTICAST(&dst->sin6_addr)) goto sendpkt; if (nd6_need_cache(ifp) == 0) goto sendpkt; IF_AFDATA_RLOCK(ifp); ln = nd6_lookup(&dst->sin6_addr, 0, ifp); IF_AFDATA_RUNLOCK(ifp); /* * Perform fast path for the following cases: * 1) lle state is REACHABLE * 2) lle state is DELAY (NS message sentNS message sent) * * Every other case involves lle modification, so we handle * them separately. */ if (ln == NULL || (ln->ln_state != ND6_LLINFO_REACHABLE && ln->ln_state != ND6_LLINFO_DELAY)) { /* Fall back to slow processing path */ if (ln != NULL) LLE_RUNLOCK(ln); return (nd6_output_lle(ifp, origifp, m, dst, rt0, NULL, NULL)); } sendpkt: if (ln != NULL) LLE_RUNLOCK(ln); #ifdef MAC mac_netinet6_nd6_send(ifp, m); #endif /* * If called from nd6_ns_output() (NS), nd6_na_output() (NA), * icmp6_redirect_output() (REDIRECT) or from rip6_output() (RS, RA * as handled by rtsol and rtadvd), mbufs will be tagged for SeND * to be diverted to user space. When re-injected into the kernel, * send_output() will directly dispatch them to the outgoing interface. */ if (send_sendso_input_hook != NULL) { struct m_tag *mtag; struct ip6_hdr *ip6; int ip6len; mtag = m_tag_find(m, PACKET_TAG_ND_OUTGOING, NULL); if (mtag != NULL) { ip6 = mtod(m, struct ip6_hdr *); ip6len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen); /* Use the SEND socket */ error = send_sendso_input_hook(m, ifp, SND_OUT, ip6len); /* -1 == no app on SEND socket */ if (error == 0 || error != -1) return (error); } } m_clrprotoflags(m); /* Avoid confusing lower layers. */ IP_PROBE(send, NULL, NULL, mtod(m, struct ip6_hdr *), ifp, NULL, mtod(m, struct ip6_hdr *)); if ((ifp->if_flags & IFF_LOOPBACK) == 0) origifp = ifp; error = (*ifp->if_output)(origifp, m, (struct sockaddr *)dst, NULL); return (error); } /* * Output IPv6 packet - heavy version. * Function assume that either * 1) destination LLE does not exist, is invalid or stale, so * ND6_EXCLUSIVE lock needs to be acquired * 2) destination lle is provided (with ND6_EXCLUSIVE lock), * in that case packets are queued in &chain. * */ int nd6_output_lle(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m, struct sockaddr_in6 *dst, struct rtentry *rt0, struct llentry *lle, struct mbuf **chain) { struct m_tag *mtag; struct ip6_hdr *ip6; int error = 0; int flags = 0; int has_lle = 0; int ip6len; #ifdef INVARIANTS if (lle != NULL) { LLE_WLOCK_ASSERT(lle); KASSERT(chain != NULL, (" lle locked but no mbuf chain pointer passed")); } #endif KASSERT(m != NULL, ("NULL mbuf, nothing to send")); /* discard the packet if IPv6 operation is disabled on the interface */ if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED)) { m_freem(m); return (ENETDOWN); /* better error? */ } if (lle != NULL) has_lle = 1; if (IN6_IS_ADDR_MULTICAST(&dst->sin6_addr)) goto sendpkt; if (nd6_need_cache(ifp) == 0) goto sendpkt; /* * Address resolution or Neighbor Unreachability Detection * for the next hop. * At this point, the destination of the packet must be a unicast * or an anycast address(i.e. not a multicast). */ if (lle == NULL) { IF_AFDATA_RLOCK(ifp); lle = nd6_lookup(&dst->sin6_addr, ND6_EXCLUSIVE, ifp); IF_AFDATA_RUNLOCK(ifp); if ((lle == NULL) && nd6_is_addr_neighbor(dst, ifp)) { /* * Since nd6_is_addr_neighbor() internally calls nd6_lookup(), * the condition below is not very efficient. But we believe * it is tolerable, because this should be a rare case. */ flags = ND6_CREATE | ND6_EXCLUSIVE; IF_AFDATA_LOCK(ifp); lle = nd6_lookup(&dst->sin6_addr, flags, ifp); IF_AFDATA_UNLOCK(ifp); } } if (lle == NULL) { if ((ifp->if_flags & IFF_POINTOPOINT) == 0 && !(ND_IFINFO(ifp)->flags & ND6_IFF_PERFORMNUD)) { char ip6buf[INET6_ADDRSTRLEN]; log(LOG_DEBUG, "nd6_output: can't allocate llinfo for %s " "(ln=%p)\n", ip6_sprintf(ip6buf, &dst->sin6_addr), lle); m_freem(m); return (ENOBUFS); } goto sendpkt; /* send anyway */ } LLE_WLOCK_ASSERT(lle); /* We don't have to do link-layer address resolution on a p2p link. */ if ((ifp->if_flags & IFF_POINTOPOINT) != 0 && lle->ln_state < ND6_LLINFO_REACHABLE) { lle->ln_state = ND6_LLINFO_STALE; nd6_llinfo_settimer_locked(lle, (long)V_nd6_gctimer * hz); } /* * The first time we send a packet to a neighbor whose entry is * STALE, we have to change the state to DELAY and a sets a timer to * expire in DELAY_FIRST_PROBE_TIME seconds to ensure do * neighbor unreachability detection on expiration. * (RFC 2461 7.3.3) */ if (lle->ln_state == ND6_LLINFO_STALE) { lle->la_asked = 0; lle->ln_state = ND6_LLINFO_DELAY; nd6_llinfo_settimer_locked(lle, (long)V_nd6_delay * hz); } /* * If the neighbor cache entry has a state other than INCOMPLETE * (i.e. its link-layer address is already resolved), just * send the packet. */ if (lle->ln_state > ND6_LLINFO_INCOMPLETE) goto sendpkt; /* * There is a neighbor cache entry, but no ethernet address * response yet. Append this latest packet to the end of the * packet queue in the mbuf, unless the number of the packet * does not exceed nd6_maxqueuelen. When it exceeds nd6_maxqueuelen, * the oldest packet in the queue will be removed. */ if (lle->ln_state == ND6_LLINFO_NOSTATE) lle->ln_state = ND6_LLINFO_INCOMPLETE; if (lle->la_hold != NULL) { struct mbuf *m_hold; int i; i = 0; for (m_hold = lle->la_hold; m_hold; m_hold = m_hold->m_nextpkt){ i++; if (m_hold->m_nextpkt == NULL) { m_hold->m_nextpkt = m; break; } } while (i >= V_nd6_maxqueuelen) { m_hold = lle->la_hold; lle->la_hold = lle->la_hold->m_nextpkt; m_freem(m_hold); i--; } } else { lle->la_hold = m; } /* * If there has been no NS for the neighbor after entering the * INCOMPLETE state, send the first solicitation. */ if (!ND6_LLINFO_PERMANENT(lle) && lle->la_asked == 0) { lle->la_asked++; nd6_llinfo_settimer_locked(lle, (long)ND_IFINFO(ifp)->retrans * hz / 1000); LLE_WUNLOCK(lle); nd6_ns_output(ifp, NULL, &dst->sin6_addr, lle, 0); if (has_lle != 0) LLE_WLOCK(lle); } else if (has_lle == 0) { /* * We did the lookup (no lle arg) so we * need to do the unlock here. */ LLE_WUNLOCK(lle); } return (0); sendpkt: /* * ln is valid and the caller did not pass in * an llentry */ if (lle != NULL && has_lle == 0) LLE_WUNLOCK(lle); #ifdef MAC mac_netinet6_nd6_send(ifp, m); #endif /* * If called from nd6_ns_output() (NS), nd6_na_output() (NA), * icmp6_redirect_output() (REDIRECT) or from rip6_output() (RS, RA * as handled by rtsol and rtadvd), mbufs will be tagged for SeND * to be diverted to user space. When re-injected into the kernel, * send_output() will directly dispatch them to the outgoing interface. */ if (send_sendso_input_hook != NULL) { mtag = m_tag_find(m, PACKET_TAG_ND_OUTGOING, NULL); if (mtag != NULL) { ip6 = mtod(m, struct ip6_hdr *); ip6len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen); /* Use the SEND socket */ error = send_sendso_input_hook(m, ifp, SND_OUT, ip6len); /* -1 == no app on SEND socket */ if (error == 0 || error != -1) return (error); } } /* * We were passed in a pointer to an lle with the lock held * this means that we can't call if_output as we will * recurse on the lle lock - so what we do is we create * a list of mbufs to send and transmit them in the caller * after the lock is dropped */ if (has_lle != 0) { if (*chain == NULL) *chain = m; else { struct mbuf *mb; /* * append mbuf to end of deferred chain */ mb = *chain; while (mb->m_nextpkt != NULL) mb = mb->m_nextpkt; mb->m_nextpkt = m; } return (error); } m_clrprotoflags(m); /* Avoid confusing lower layers. */ IP_PROBE(send, NULL, NULL, mtod(m, struct ip6_hdr *), ifp, NULL, mtod(m, struct ip6_hdr *)); if ((ifp->if_flags & IFF_LOOPBACK) == 0) origifp = ifp; error = (*ifp->if_output)(origifp, m, (struct sockaddr *)dst, NULL); return (error); } int nd6_output_flush(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *chain, struct sockaddr_in6 *dst, struct route *ro) { struct mbuf *m, *m_head; struct ifnet *outifp; int error = 0; m_head = chain; if ((ifp->if_flags & IFF_LOOPBACK) != 0) outifp = origifp; else outifp = ifp; while (m_head) { m = m_head; m_head = m_head->m_nextpkt; error = (*ifp->if_output)(ifp, m, (struct sockaddr *)dst, ro); } /* * XXX * note that intermediate errors are blindly ignored - but this is * the same convention as used with nd6_output when called by * nd6_cache_lladdr */ return (error); } int nd6_need_cache(struct ifnet *ifp) { /* * XXX: we currently do not make neighbor cache on any interface * other than ARCnet, Ethernet, FDDI and GIF. * * RFC2893 says: * - unidirectional tunnels needs no ND */ switch (ifp->if_type) { case IFT_ARCNET: case IFT_ETHER: case IFT_FDDI: case IFT_IEEE1394: #ifdef IFT_L2VLAN case IFT_L2VLAN: #endif #ifdef IFT_IEEE80211 case IFT_IEEE80211: #endif case IFT_INFINIBAND: case IFT_GIF: /* XXX need more cases? */ case IFT_PPP: case IFT_TUNNEL: case IFT_BRIDGE: case IFT_PROPVIRTUAL: return (1); default: return (0); } } /* * Add pernament ND6 link-layer record for given * interface address. * * Very similar to IPv4 arp_ifinit(), but: * 1) IPv6 DAD is performed in different place * 2) It is called by IPv6 protocol stack in contrast to * arp_ifinit() which is typically called in SIOCSIFADDR * driver ioctl handler. * */ int nd6_add_ifa_lle(struct in6_ifaddr *ia) { struct ifnet *ifp; struct llentry *ln; ifp = ia->ia_ifa.ifa_ifp; IF_AFDATA_LOCK(ifp); ia->ia_ifa.ifa_rtrequest = nd6_rtrequest; ln = lla_lookup(LLTABLE6(ifp), (LLE_CREATE | LLE_IFADDR | LLE_EXCLUSIVE), (struct sockaddr *)&ia->ia_addr); IF_AFDATA_UNLOCK(ifp); if (ln != NULL) { ln->la_expire = 0; /* for IPv6 this means permanent */ ln->ln_state = ND6_LLINFO_REACHABLE; LLE_WUNLOCK(ln); in6_newaddrmsg(ia, RTM_ADD); return (0); } return (ENOBUFS); } /* * Removes ALL lle records for interface address prefix. * XXXME: That's probably not we really want to do, we need * to remove address record only and keep other records * until we determine if given prefix is really going * to be removed. */ void nd6_rem_ifa_lle(struct in6_ifaddr *ia) { struct sockaddr_in6 mask, addr; struct ifnet *ifp; in6_newaddrmsg(ia, RTM_DELETE); ifp = ia->ia_ifa.ifa_ifp; memcpy(&addr, &ia->ia_addr, sizeof(ia->ia_addr)); memcpy(&mask, &ia->ia_prefixmask, sizeof(ia->ia_prefixmask)); lltable_prefix_free(AF_INET6, (struct sockaddr *)&addr, (struct sockaddr *)&mask, LLE_STATIC); } /* * the callers of this function need to be re-worked to drop * the lle lock, drop here for now */ int nd6_storelladdr(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, u_char *desten, struct llentry **lle) { struct llentry *ln; *lle = NULL; IF_AFDATA_UNLOCK_ASSERT(ifp); if (m != NULL && m->m_flags & M_MCAST) { int i; switch (ifp->if_type) { case IFT_ETHER: case IFT_FDDI: #ifdef IFT_L2VLAN case IFT_L2VLAN: #endif #ifdef IFT_IEEE80211 case IFT_IEEE80211: #endif case IFT_BRIDGE: case IFT_ISO88025: ETHER_MAP_IPV6_MULTICAST(&SIN6(dst)->sin6_addr, desten); return (0); case IFT_IEEE1394: /* * netbsd can use if_broadcastaddr, but we don't do so * to reduce # of ifdef. */ for (i = 0; i < ifp->if_addrlen; i++) desten[i] = ~0; return (0); case IFT_ARCNET: *desten = 0; return (0); default: m_freem(m); return (EAFNOSUPPORT); } } /* * the entry should have been created in nd6_store_lladdr */ IF_AFDATA_RLOCK(ifp); ln = lla_lookup(LLTABLE6(ifp), 0, dst); IF_AFDATA_RUNLOCK(ifp); if ((ln == NULL) || !(ln->la_flags & LLE_VALID)) { if (ln != NULL) LLE_RUNLOCK(ln); /* this could happen, if we could not allocate memory */ m_freem(m); return (1); } bcopy(&ln->ll_addr, desten, ifp->if_addrlen); *lle = ln; LLE_RUNLOCK(ln); /* * A *small* use after free race exists here */ return (0); } static void clear_llinfo_pqueue(struct llentry *ln) { struct mbuf *m_hold, *m_hold_next; for (m_hold = ln->la_hold; m_hold; m_hold = m_hold_next) { m_hold_next = m_hold->m_nextpkt; m_freem(m_hold); } ln->la_hold = NULL; return; } static int nd6_sysctl_drlist(SYSCTL_HANDLER_ARGS); static int nd6_sysctl_prlist(SYSCTL_HANDLER_ARGS); #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet6_icmp6); #endif SYSCTL_NODE(_net_inet6_icmp6, ICMPV6CTL_ND6_DRLIST, nd6_drlist, CTLFLAG_RD, nd6_sysctl_drlist, ""); SYSCTL_NODE(_net_inet6_icmp6, ICMPV6CTL_ND6_PRLIST, nd6_prlist, CTLFLAG_RD, nd6_sysctl_prlist, ""); -SYSCTL_VNET_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MAXQLEN, nd6_maxqueuelen, - CTLFLAG_RW, &VNET_NAME(nd6_maxqueuelen), 1, ""); -SYSCTL_VNET_INT(_net_inet6_icmp6, OID_AUTO, nd6_gctimer, - CTLFLAG_RW, &VNET_NAME(nd6_gctimer), (60 * 60 * 24), ""); +SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MAXQLEN, nd6_maxqueuelen, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_maxqueuelen), 1, ""); +SYSCTL_INT(_net_inet6_icmp6, OID_AUTO, nd6_gctimer, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_gctimer), (60 * 60 * 24), ""); static int nd6_sysctl_drlist(SYSCTL_HANDLER_ARGS) { struct in6_defrouter d; struct nd_defrouter *dr; int error; if (req->newptr) return (EPERM); bzero(&d, sizeof(d)); d.rtaddr.sin6_family = AF_INET6; d.rtaddr.sin6_len = sizeof(d.rtaddr); /* * XXX locking */ TAILQ_FOREACH(dr, &V_nd_defrouter, dr_entry) { d.rtaddr.sin6_addr = dr->rtaddr; error = sa6_recoverscope(&d.rtaddr); if (error != 0) return (error); d.flags = dr->flags; d.rtlifetime = dr->rtlifetime; d.expire = dr->expire + (time_second - time_uptime); d.if_index = dr->ifp->if_index; error = SYSCTL_OUT(req, &d, sizeof(d)); if (error != 0) return (error); } return (0); } static int nd6_sysctl_prlist(SYSCTL_HANDLER_ARGS) { struct in6_prefix p; struct sockaddr_in6 s6; struct nd_prefix *pr; struct nd_pfxrouter *pfr; time_t maxexpire; int error; char ip6buf[INET6_ADDRSTRLEN]; if (req->newptr) return (EPERM); bzero(&p, sizeof(p)); p.origin = PR_ORIG_RA; bzero(&s6, sizeof(s6)); s6.sin6_family = AF_INET6; s6.sin6_len = sizeof(s6); /* * XXX locking */ LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { p.prefix = pr->ndpr_prefix; if (sa6_recoverscope(&p.prefix)) { log(LOG_ERR, "scope error in prefix list (%s)\n", ip6_sprintf(ip6buf, &p.prefix.sin6_addr)); /* XXX: press on... */ } p.raflags = pr->ndpr_raf; p.prefixlen = pr->ndpr_plen; p.vltime = pr->ndpr_vltime; p.pltime = pr->ndpr_pltime; p.if_index = pr->ndpr_ifp->if_index; if (pr->ndpr_vltime == ND6_INFINITE_LIFETIME) p.expire = 0; else { /* XXX: we assume time_t is signed. */ maxexpire = (-1) & ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1)); if (pr->ndpr_vltime < maxexpire - pr->ndpr_lastupdate) p.expire = pr->ndpr_lastupdate + pr->ndpr_vltime + (time_second - time_uptime); else p.expire = maxexpire; } p.refcnt = pr->ndpr_refcnt; p.flags = pr->ndpr_stateflags; p.advrtrs = 0; LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry) p.advrtrs++; error = SYSCTL_OUT(req, &p, sizeof(p)); if (error != 0) return (error); LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry) { s6.sin6_addr = pfr->router->rtaddr; if (sa6_recoverscope(&s6)) log(LOG_ERR, "scope error in prefix list (%s)\n", ip6_sprintf(ip6buf, &pfr->router->rtaddr)); error = SYSCTL_OUT(req, &s6, sizeof(s6)); if (error != 0) return (error); } } return (0); } Index: user/dchagin/lemul/sys/netinet6/scope6.c =================================================================== --- user/dchagin/lemul/sys/netinet6/scope6.c (revision 274240) +++ user/dchagin/lemul/sys/netinet6/scope6.c (revision 274241) @@ -1,535 +1,535 @@ /*- * Copyright (C) 2000 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: scope6.c,v 1.10 2000/07/24 13:29:31 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef ENABLE_DEFAULT_SCOPE VNET_DEFINE(int, ip6_use_defzone) = 1; #else VNET_DEFINE(int, ip6_use_defzone) = 0; #endif VNET_DEFINE(int, deembed_scopeid) = 1; SYSCTL_DECL(_net_inet6_ip6); -SYSCTL_VNET_INT(_net_inet6_ip6, OID_AUTO, deembed_scopeid, CTLFLAG_RW, +SYSCTL_INT(_net_inet6_ip6, OID_AUTO, deembed_scopeid, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(deembed_scopeid), 0, "Extract embedded zone ID and set it to sin6_scope_id in sockaddr_in6."); /* * The scope6_lock protects the global sid default stored in * sid_default below. */ static struct mtx scope6_lock; #define SCOPE6_LOCK_INIT() mtx_init(&scope6_lock, "scope6_lock", NULL, MTX_DEF) #define SCOPE6_LOCK() mtx_lock(&scope6_lock) #define SCOPE6_UNLOCK() mtx_unlock(&scope6_lock) #define SCOPE6_LOCK_ASSERT() mtx_assert(&scope6_lock, MA_OWNED) static VNET_DEFINE(struct scope6_id, sid_default); #define V_sid_default VNET(sid_default) #define SID(ifp) \ (((struct in6_ifextra *)(ifp)->if_afdata[AF_INET6])->scope6_id) static int scope6_get(struct ifnet *, struct scope6_id *); static int scope6_set(struct ifnet *, struct scope6_id *); void scope6_init(void) { bzero(&V_sid_default, sizeof(V_sid_default)); if (!IS_DEFAULT_VNET(curvnet)) return; SCOPE6_LOCK_INIT(); } struct scope6_id * scope6_ifattach(struct ifnet *ifp) { struct scope6_id *sid; sid = malloc(sizeof(*sid), M_IFADDR, M_WAITOK | M_ZERO); /* * XXX: IPV6_ADDR_SCOPE_xxx macros are not standard. * Should we rather hardcode here? */ sid->s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL] = ifp->if_index; sid->s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL] = ifp->if_index; return (sid); } void scope6_ifdetach(struct scope6_id *sid) { free(sid, M_IFADDR); } int scope6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) { struct in6_ifreq *ifr; if (ifp->if_afdata[AF_INET6] == NULL) return (EPFNOSUPPORT); ifr = (struct in6_ifreq *)data; switch (cmd) { case SIOCSSCOPE6: return (scope6_set(ifp, (struct scope6_id *)ifr->ifr_ifru.ifru_scope_id)); case SIOCGSCOPE6: return (scope6_get(ifp, (struct scope6_id *)ifr->ifr_ifru.ifru_scope_id)); case SIOCGSCOPE6DEF: return (scope6_get_default( (struct scope6_id *)ifr->ifr_ifru.ifru_scope_id)); default: return (EOPNOTSUPP); } } static int scope6_set(struct ifnet *ifp, struct scope6_id *idlist) { int i; int error = 0; struct scope6_id *sid = NULL; IF_AFDATA_WLOCK(ifp); sid = SID(ifp); if (!sid) { /* paranoid? */ IF_AFDATA_WUNLOCK(ifp); return (EINVAL); } /* * XXX: We need more consistency checks of the relationship among * scopes (e.g. an organization should be larger than a site). */ /* * TODO(XXX): after setting, we should reflect the changes to * interface addresses, routing table entries, PCB entries... */ for (i = 0; i < 16; i++) { if (idlist->s6id_list[i] && idlist->s6id_list[i] != sid->s6id_list[i]) { /* * An interface zone ID must be the corresponding * interface index by definition. */ if (i == IPV6_ADDR_SCOPE_INTFACELOCAL && idlist->s6id_list[i] != ifp->if_index) { IF_AFDATA_WUNLOCK(ifp); return (EINVAL); } if (i == IPV6_ADDR_SCOPE_LINKLOCAL && idlist->s6id_list[i] > V_if_index) { /* * XXX: theoretically, there should be no * relationship between link IDs and interface * IDs, but we check the consistency for * safety in later use. */ IF_AFDATA_WUNLOCK(ifp); return (EINVAL); } /* * XXX: we must need lots of work in this case, * but we simply set the new value in this initial * implementation. */ sid->s6id_list[i] = idlist->s6id_list[i]; } } IF_AFDATA_WUNLOCK(ifp); return (error); } static int scope6_get(struct ifnet *ifp, struct scope6_id *idlist) { struct scope6_id *sid; /* We only need to lock the interface's afdata for SID() to work. */ IF_AFDATA_RLOCK(ifp); sid = SID(ifp); if (sid == NULL) { /* paranoid? */ IF_AFDATA_RUNLOCK(ifp); return (EINVAL); } *idlist = *sid; IF_AFDATA_RUNLOCK(ifp); return (0); } /* * Get a scope of the address. Node-local, link-local, site-local or global. */ int in6_addrscope(const struct in6_addr *addr) { if (IN6_IS_ADDR_MULTICAST(addr)) { /* * Addresses with reserved value F must be treated as * global multicast addresses. */ if (IPV6_ADDR_MC_SCOPE(addr) == 0x0f) return (IPV6_ADDR_SCOPE_GLOBAL); return (IPV6_ADDR_MC_SCOPE(addr)); } if (IN6_IS_ADDR_LINKLOCAL(addr) || IN6_IS_ADDR_LOOPBACK(addr)) return (IPV6_ADDR_SCOPE_LINKLOCAL); if (IN6_IS_ADDR_SITELOCAL(addr)) return (IPV6_ADDR_SCOPE_SITELOCAL); return (IPV6_ADDR_SCOPE_GLOBAL); } /* * ifp - note that this might be NULL */ void scope6_setdefault(struct ifnet *ifp) { /* * Currently, this function just sets the default "interfaces" * and "links" according to the given interface. * We might eventually have to separate the notion of "link" from * "interface" and provide a user interface to set the default. */ SCOPE6_LOCK(); if (ifp) { V_sid_default.s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL] = ifp->if_index; V_sid_default.s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL] = ifp->if_index; } else { V_sid_default.s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL] = 0; V_sid_default.s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL] = 0; } SCOPE6_UNLOCK(); } int scope6_get_default(struct scope6_id *idlist) { SCOPE6_LOCK(); *idlist = V_sid_default; SCOPE6_UNLOCK(); return (0); } u_int32_t scope6_addr2default(struct in6_addr *addr) { u_int32_t id; /* * special case: The loopback address should be considered as * link-local, but there's no ambiguity in the syntax. */ if (IN6_IS_ADDR_LOOPBACK(addr)) return (0); /* * XXX: 32-bit read is atomic on all our platforms, is it OK * not to lock here? */ SCOPE6_LOCK(); id = V_sid_default.s6id_list[in6_addrscope(addr)]; SCOPE6_UNLOCK(); return (id); } /* * Validate the specified scope zone ID in the sin6_scope_id field. If the ID * is unspecified (=0), needs to be specified, and the default zone ID can be * used, the default value will be used. * This routine then generates the kernel-internal form: if the address scope * of is interface-local or link-local, embed the interface index in the * address. */ int sa6_embedscope(struct sockaddr_in6 *sin6, int defaultok) { u_int32_t zoneid; if ((zoneid = sin6->sin6_scope_id) == 0 && defaultok) zoneid = scope6_addr2default(&sin6->sin6_addr); if (zoneid != 0 && (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) || IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr))) { /* * At this moment, we only check interface-local and * link-local scope IDs, and use interface indices as the * zone IDs assuming a one-to-one mapping between interfaces * and links. */ if (V_if_index < zoneid || ifnet_byindex(zoneid) == NULL) return (ENXIO); /* XXX assignment to 16bit from 32bit variable */ sin6->sin6_addr.s6_addr16[1] = htons(zoneid & 0xffff); sin6->sin6_scope_id = 0; } return 0; } /* * generate standard sockaddr_in6 from embedded form. */ int sa6_recoverscope(struct sockaddr_in6 *sin6) { char ip6buf[INET6_ADDRSTRLEN]; u_int32_t zoneid; if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) || IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr)) { /* * KAME assumption: link id == interface id */ zoneid = ntohs(sin6->sin6_addr.s6_addr16[1]); if (zoneid) { /* sanity check */ if (V_if_index < zoneid) return (ENXIO); #if 0 /* XXX: Disabled due to possible deadlock. */ if (!ifnet_byindex(zoneid)) return (ENXIO); #endif if (sin6->sin6_scope_id != 0 && zoneid != sin6->sin6_scope_id) { log(LOG_NOTICE, "%s: embedded scope mismatch: %s%%%d. " "sin6_scope_id was overridden.", __func__, ip6_sprintf(ip6buf, &sin6->sin6_addr), sin6->sin6_scope_id); } sin6->sin6_addr.s6_addr16[1] = 0; sin6->sin6_scope_id = zoneid; } } return 0; } /* * Determine the appropriate scope zone ID for in6 and ifp. If ret_id is * non NULL, it is set to the zone ID. If the zone ID needs to be embedded * in the in6_addr structure, in6 will be modified. * * ret_id - unnecessary? */ int in6_setscope(struct in6_addr *in6, struct ifnet *ifp, u_int32_t *ret_id) { int scope; u_int32_t zoneid = 0; struct scope6_id *sid; /* * special case: the loopback address can only belong to a loopback * interface. */ if (IN6_IS_ADDR_LOOPBACK(in6)) { if (!(ifp->if_flags & IFF_LOOPBACK)) return (EINVAL); } else { scope = in6_addrscope(in6); if (scope == IPV6_ADDR_SCOPE_INTFACELOCAL || scope == IPV6_ADDR_SCOPE_LINKLOCAL) { /* * Currently we use interface indeces as the * zone IDs for interface-local and link-local * scopes. */ zoneid = ifp->if_index; in6->s6_addr16[1] = htons(zoneid & 0xffff); /* XXX */ } else if (scope != IPV6_ADDR_SCOPE_GLOBAL) { IF_AFDATA_RLOCK(ifp); sid = SID(ifp); zoneid = sid->s6id_list[scope]; IF_AFDATA_RUNLOCK(ifp); } } if (ret_id != NULL) *ret_id = zoneid; return (0); } /* * Just clear the embedded scope identifier. Return 0 if the original address * is intact; return non 0 if the address is modified. */ int in6_clearscope(struct in6_addr *in6) { int modified = 0; if (IN6_IS_SCOPE_LINKLOCAL(in6) || IN6_IS_ADDR_MC_INTFACELOCAL(in6)) { if (in6->s6_addr16[1] != 0) modified = 1; in6->s6_addr16[1] = 0; } return (modified); } /* * Return the scope identifier or zero. */ uint16_t in6_getscope(struct in6_addr *in6) { if (IN6_IS_SCOPE_LINKLOCAL(in6) || IN6_IS_ADDR_MC_INTFACELOCAL(in6)) return (in6->s6_addr16[1]); return (0); } /* * Return pointer to ifnet structure, corresponding to the zone id of * link-local scope. */ struct ifnet* in6_getlinkifnet(uint32_t zoneid) { return (ifnet_byindex((u_short)zoneid)); } /* * Return zone id for the specified scope. */ uint32_t in6_getscopezone(const struct ifnet *ifp, int scope) { if (scope == IPV6_ADDR_SCOPE_INTFACELOCAL || scope == IPV6_ADDR_SCOPE_LINKLOCAL) return (ifp->if_index); if (scope >= 0 && scope < IPV6_ADDR_SCOPES_COUNT) return (SID(ifp)->s6id_list[scope]); return (0); } /* * This function is for checking sockaddr_in6 structure passed * from the application level (usually). * * sin6_scope_id should be set for link-local unicast, link-local and * interface-local multicast addresses. * * If it is zero, then look into default zone ids. If default zone id is * not set or disabled, then return error. */ int sa6_checkzone(struct sockaddr_in6 *sa6) { int scope; scope = in6_addrscope(&sa6->sin6_addr); if (scope == IPV6_ADDR_SCOPE_GLOBAL) return (sa6->sin6_scope_id ? EINVAL: 0); if (IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr) && scope != IPV6_ADDR_SCOPE_LINKLOCAL && scope != IPV6_ADDR_SCOPE_INTFACELOCAL) { if (sa6->sin6_scope_id == 0 && V_ip6_use_defzone != 0) sa6->sin6_scope_id = V_sid_default.s6id_list[scope]; return (0); } /* * Since ::1 address always configured on the lo0, we can * automatically set its zone id, when it is not specified. * Return error, when specified zone id doesn't match with * actual value. */ if (IN6_IS_ADDR_LOOPBACK(&sa6->sin6_addr)) { if (sa6->sin6_scope_id == 0) sa6->sin6_scope_id = in6_getscopezone(V_loif, scope); else if (sa6->sin6_scope_id != in6_getscopezone(V_loif, scope)) return (EADDRNOTAVAIL); } /* XXX: we can validate sin6_scope_id here */ if (sa6->sin6_scope_id != 0) return (0); if (V_ip6_use_defzone != 0) sa6->sin6_scope_id = V_sid_default.s6id_list[scope]; /* Return error if we can't determine zone id */ return (sa6->sin6_scope_id ? 0: EADDRNOTAVAIL); } Index: user/dchagin/lemul/sys/netipsec/ipsec.c =================================================================== --- user/dchagin/lemul/sys/netipsec/ipsec.c (revision 274240) +++ user/dchagin/lemul/sys/netipsec/ipsec.c (revision 274241) @@ -1,1743 +1,1743 @@ /* $FreeBSD$ */ /* $KAME: ipsec.c,v 1.103 2001/05/24 07:14:18 sakane Exp $ */ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * IPsec controller part. */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef INET6 #include #endif #include #include #ifdef INET6 #include #endif #include #include #include /*XXX*/ #include #include #include #include #include #include #include #ifdef IPSEC_DEBUG VNET_DEFINE(int, ipsec_debug) = 1; #else VNET_DEFINE(int, ipsec_debug) = 0; #endif /* NB: name changed so netstat doesn't use it. */ VNET_PCPUSTAT_DEFINE(struct ipsecstat, ipsec4stat); VNET_PCPUSTAT_SYSINIT(ipsec4stat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(ipsec4stat); #endif /* VIMAGE */ VNET_DEFINE(int, ip4_ah_offsetmask) = 0; /* maybe IP_DF? */ /* DF bit on encap. 0: clear 1: set 2: copy */ VNET_DEFINE(int, ip4_ipsec_dfbit) = 0; VNET_DEFINE(int, ip4_esp_trans_deflev) = IPSEC_LEVEL_USE; VNET_DEFINE(int, ip4_esp_net_deflev) = IPSEC_LEVEL_USE; VNET_DEFINE(int, ip4_ah_trans_deflev) = IPSEC_LEVEL_USE; VNET_DEFINE(int, ip4_ah_net_deflev) = IPSEC_LEVEL_USE; VNET_DEFINE(struct secpolicy, ip4_def_policy); /* ECN ignore(-1)/forbidden(0)/allowed(1) */ VNET_DEFINE(int, ip4_ipsec_ecn) = 0; VNET_DEFINE(int, ip4_esp_randpad) = -1; /* * Crypto support requirements: * * 1 require hardware support * -1 require software support * 0 take anything */ VNET_DEFINE(int, crypto_support) = CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE; FEATURE(ipsec, "Internet Protocol Security (IPsec)"); #ifdef IPSEC_NAT_T FEATURE(ipsec_natt, "UDP Encapsulation of IPsec ESP Packets ('NAT-T')"); #endif SYSCTL_DECL(_net_inet_ipsec); /* net.inet.ipsec */ -SYSCTL_VNET_INT(_net_inet_ipsec, IPSECCTL_DEF_POLICY, def_policy, - CTLFLAG_RW, &VNET_NAME(ip4_def_policy).policy, 0, +SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_POLICY, def_policy, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip4_def_policy).policy, 0, "IPsec default policy."); -SYSCTL_VNET_INT(_net_inet_ipsec, IPSECCTL_DEF_ESP_TRANSLEV, esp_trans_deflev, - CTLFLAG_RW, &VNET_NAME(ip4_esp_trans_deflev), 0, +SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_ESP_TRANSLEV, esp_trans_deflev, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip4_esp_trans_deflev), 0, "Default ESP transport mode level"); -SYSCTL_VNET_INT(_net_inet_ipsec, IPSECCTL_DEF_ESP_NETLEV, esp_net_deflev, - CTLFLAG_RW, &VNET_NAME(ip4_esp_net_deflev), 0, +SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_ESP_NETLEV, esp_net_deflev, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip4_esp_net_deflev), 0, "Default ESP tunnel mode level."); -SYSCTL_VNET_INT(_net_inet_ipsec, IPSECCTL_DEF_AH_TRANSLEV, ah_trans_deflev, - CTLFLAG_RW, &VNET_NAME(ip4_ah_trans_deflev), 0, +SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_AH_TRANSLEV, ah_trans_deflev, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip4_ah_trans_deflev), 0, "AH transfer mode default level."); -SYSCTL_VNET_INT(_net_inet_ipsec, IPSECCTL_DEF_AH_NETLEV, ah_net_deflev, - CTLFLAG_RW, &VNET_NAME(ip4_ah_net_deflev), 0, +SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_AH_NETLEV, ah_net_deflev, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip4_ah_net_deflev), 0, "AH tunnel mode default level."); -SYSCTL_VNET_INT(_net_inet_ipsec, IPSECCTL_AH_CLEARTOS, ah_cleartos, - CTLFLAG_RW, &VNET_NAME(ah_cleartos), 0, +SYSCTL_INT(_net_inet_ipsec, IPSECCTL_AH_CLEARTOS, ah_cleartos, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ah_cleartos), 0, "If set clear type-of-service field when doing AH computation."); -SYSCTL_VNET_INT(_net_inet_ipsec, IPSECCTL_AH_OFFSETMASK, ah_offsetmask, - CTLFLAG_RW, &VNET_NAME(ip4_ah_offsetmask), 0, +SYSCTL_INT(_net_inet_ipsec, IPSECCTL_AH_OFFSETMASK, ah_offsetmask, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip4_ah_offsetmask), 0, "If not set clear offset field mask when doing AH computation."); -SYSCTL_VNET_INT(_net_inet_ipsec, IPSECCTL_DFBIT, dfbit, - CTLFLAG_RW, &VNET_NAME(ip4_ipsec_dfbit), 0, +SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DFBIT, dfbit, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip4_ipsec_dfbit), 0, "Do not fragment bit on encap."); -SYSCTL_VNET_INT(_net_inet_ipsec, IPSECCTL_ECN, ecn, - CTLFLAG_RW, &VNET_NAME(ip4_ipsec_ecn), 0, +SYSCTL_INT(_net_inet_ipsec, IPSECCTL_ECN, ecn, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip4_ipsec_ecn), 0, "Explicit Congestion Notification handling."); -SYSCTL_VNET_INT(_net_inet_ipsec, IPSECCTL_DEBUG, debug, - CTLFLAG_RW, &VNET_NAME(ipsec_debug), 0, +SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEBUG, debug, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsec_debug), 0, "Enable IPsec debugging output when set."); -SYSCTL_VNET_INT(_net_inet_ipsec, OID_AUTO, crypto_support, - CTLFLAG_RW, &VNET_NAME(crypto_support), 0, +SYSCTL_INT(_net_inet_ipsec, OID_AUTO, crypto_support, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(crypto_support), 0, "Crypto driver selection."); SYSCTL_VNET_PCPUSTAT(_net_inet_ipsec, OID_AUTO, ipsecstats, struct ipsecstat, ipsec4stat, "IPsec IPv4 statistics."); #ifdef REGRESSION /* * When set to 1, IPsec will send packets with the same sequence number. * This allows to verify if the other side has proper replay attacks detection. */ VNET_DEFINE(int, ipsec_replay) = 0; -SYSCTL_VNET_INT(_net_inet_ipsec, OID_AUTO, test_replay, - CTLFLAG_RW, &VNET_NAME(ipsec_replay), 0, +SYSCTL_INT(_net_inet_ipsec, OID_AUTO, test_replay, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsec_replay), 0, "Emulate replay attack"); /* * When set 1, IPsec will send packets with corrupted HMAC. * This allows to verify if the other side properly detects modified packets. */ VNET_DEFINE(int, ipsec_integrity) = 0; -SYSCTL_VNET_INT(_net_inet_ipsec, OID_AUTO, test_integrity, - CTLFLAG_RW, &VNET_NAME(ipsec_integrity), 0, +SYSCTL_INT(_net_inet_ipsec, OID_AUTO, test_integrity, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsec_integrity), 0, "Emulate man-in-the-middle attack"); #endif #ifdef INET6 VNET_PCPUSTAT_DEFINE(struct ipsecstat, ipsec6stat); VNET_PCPUSTAT_SYSINIT(ipsec6stat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(ipsec6stat); #endif /* VIMAGE */ VNET_DEFINE(int, ip6_esp_trans_deflev) = IPSEC_LEVEL_USE; VNET_DEFINE(int, ip6_esp_net_deflev) = IPSEC_LEVEL_USE; VNET_DEFINE(int, ip6_ah_trans_deflev) = IPSEC_LEVEL_USE; VNET_DEFINE(int, ip6_ah_net_deflev) = IPSEC_LEVEL_USE; VNET_DEFINE(int, ip6_ipsec_ecn) = 0; /* ECN ignore(-1)/forbidden(0)/allowed(1) */ SYSCTL_DECL(_net_inet6_ipsec6); /* net.inet6.ipsec6 */ -SYSCTL_VNET_INT(_net_inet6_ipsec6, IPSECCTL_DEF_POLICY, def_policy, CTLFLAG_RW, - &VNET_NAME(ip4_def_policy).policy, 0, +SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_POLICY, def_policy, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip4_def_policy).policy, 0, "IPsec default policy."); -SYSCTL_VNET_INT(_net_inet6_ipsec6, IPSECCTL_DEF_ESP_TRANSLEV, - esp_trans_deflev, CTLFLAG_RW, &VNET_NAME(ip6_esp_trans_deflev), 0, +SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_ESP_TRANSLEV, esp_trans_deflev, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_esp_trans_deflev), 0, "Default ESP transport mode level."); -SYSCTL_VNET_INT(_net_inet6_ipsec6, IPSECCTL_DEF_ESP_NETLEV, - esp_net_deflev, CTLFLAG_RW, &VNET_NAME(ip6_esp_net_deflev), 0, +SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_ESP_NETLEV, esp_net_deflev, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_esp_net_deflev), 0, "Default ESP tunnel mode level."); -SYSCTL_VNET_INT(_net_inet6_ipsec6, IPSECCTL_DEF_AH_TRANSLEV, - ah_trans_deflev, CTLFLAG_RW, &VNET_NAME(ip6_ah_trans_deflev), 0, +SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_AH_TRANSLEV, ah_trans_deflev, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_ah_trans_deflev), 0, "AH transfer mode default level."); -SYSCTL_VNET_INT(_net_inet6_ipsec6, IPSECCTL_DEF_AH_NETLEV, - ah_net_deflev, CTLFLAG_RW, &VNET_NAME(ip6_ah_net_deflev), 0, +SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_AH_NETLEV, ah_net_deflev, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_ah_net_deflev), 0, "AH tunnel mode default level."); -SYSCTL_VNET_INT(_net_inet6_ipsec6, IPSECCTL_ECN, - ecn, CTLFLAG_RW, &VNET_NAME(ip6_ipsec_ecn), 0, +SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_ECN, ecn, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_ipsec_ecn), 0, "Explicit Congestion Notification handling."); -SYSCTL_VNET_INT(_net_inet6_ipsec6, IPSECCTL_DEBUG, debug, CTLFLAG_RW, - &VNET_NAME(ipsec_debug), 0, +SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEBUG, debug, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsec_debug), 0, "Enable IPsec debugging output when set."); SYSCTL_VNET_PCPUSTAT(_net_inet6_ipsec6, IPSECCTL_STATS, ipsecstats, struct ipsecstat, ipsec6stat, "IPsec IPv6 statistics."); #endif /* INET6 */ static int ipsec_setspidx_inpcb __P((struct mbuf *, struct inpcb *)); static int ipsec_setspidx __P((struct mbuf *, struct secpolicyindex *, int)); static void ipsec4_get_ulp __P((struct mbuf *m, struct secpolicyindex *, int)); static int ipsec4_setspidx_ipaddr __P((struct mbuf *, struct secpolicyindex *)); #ifdef INET6 static void ipsec6_get_ulp __P((struct mbuf *m, struct secpolicyindex *, int)); static int ipsec6_setspidx_ipaddr __P((struct mbuf *, struct secpolicyindex *)); #endif static void ipsec_delpcbpolicy __P((struct inpcbpolicy *)); static struct secpolicy *ipsec_deepcopy_policy __P((struct secpolicy *src)); static void vshiftl __P((unsigned char *, int, int)); MALLOC_DEFINE(M_IPSEC_INPCB, "inpcbpolicy", "inpcb-resident ipsec policy"); /* * Return a held reference to the default SP. */ static struct secpolicy * key_allocsp_default(const char* where, int tag) { struct secpolicy *sp; KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP key_allocsp_default from %s:%u\n", where, tag)); sp = &V_ip4_def_policy; if (sp->policy != IPSEC_POLICY_DISCARD && sp->policy != IPSEC_POLICY_NONE) { ipseclog((LOG_INFO, "fixed system default policy: %d->%d\n", sp->policy, IPSEC_POLICY_NONE)); sp->policy = IPSEC_POLICY_NONE; } key_addref(sp); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP key_allocsp_default returns SP:%p (%u)\n", sp, sp->refcnt)); return (sp); } #define KEY_ALLOCSP_DEFAULT() \ key_allocsp_default(__FILE__, __LINE__) /* * For OUTBOUND packet having a socket. Searching SPD for packet, * and return a pointer to SP. * OUT: NULL: no apropreate SP found, the following value is set to error. * 0 : bypass * EACCES : discard packet. * ENOENT : ipsec_acquire() in progress, maybe. * others : error occured. * others: a pointer to SP * * NOTE: IPv6 mapped adddress concern is implemented here. */ struct secpolicy * ipsec_getpolicy(struct tdb_ident *tdbi, u_int dir) { struct secpolicy *sp; IPSEC_ASSERT(tdbi != NULL, ("null tdbi")); IPSEC_ASSERT(dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND, ("invalid direction %u", dir)); sp = KEY_ALLOCSP2(tdbi->spi, &tdbi->dst, tdbi->proto, dir); if (sp == NULL) /*XXX????*/ sp = KEY_ALLOCSP_DEFAULT(); IPSEC_ASSERT(sp != NULL, ("null SP")); return (sp); } /* * For OUTBOUND packet having a socket. Searching SPD for packet, * and return a pointer to SP. * OUT: NULL: no apropreate SP found, the following value is set to error. * 0 : bypass * EACCES : discard packet. * ENOENT : ipsec_acquire() in progress, maybe. * others : error occured. * others: a pointer to SP * * NOTE: IPv6 mapped adddress concern is implemented here. */ static struct secpolicy * ipsec_getpolicybysock(struct mbuf *m, u_int dir, struct inpcb *inp, int *error) { struct inpcbpolicy *pcbsp; struct secpolicy *currsp = NULL; /* Policy on socket. */ struct secpolicy *sp; IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(inp != NULL, ("null inpcb")); IPSEC_ASSERT(error != NULL, ("null error")); IPSEC_ASSERT(dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND, ("invalid direction %u", dir)); /* Set spidx in pcb. */ *error = ipsec_setspidx_inpcb(m, inp); if (*error) return (NULL); pcbsp = inp->inp_sp; IPSEC_ASSERT(pcbsp != NULL, ("null pcbsp")); switch (dir) { case IPSEC_DIR_INBOUND: currsp = pcbsp->sp_in; break; case IPSEC_DIR_OUTBOUND: currsp = pcbsp->sp_out; break; } IPSEC_ASSERT(currsp != NULL, ("null currsp")); if (pcbsp->priv) { /* When privilieged socket. */ switch (currsp->policy) { case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_IPSEC: key_addref(currsp); sp = currsp; break; case IPSEC_POLICY_ENTRUST: /* Look for a policy in SPD. */ sp = KEY_ALLOCSP(&currsp->spidx, dir); if (sp == NULL) /* No SP found. */ sp = KEY_ALLOCSP_DEFAULT(); break; default: ipseclog((LOG_ERR, "%s: Invalid policy for PCB %d\n", __func__, currsp->policy)); *error = EINVAL; return (NULL); } } else { /* Unpriv, SPD has policy. */ sp = KEY_ALLOCSP(&currsp->spidx, dir); if (sp == NULL) { /* No SP found. */ switch (currsp->policy) { case IPSEC_POLICY_BYPASS: ipseclog((LOG_ERR, "%s: Illegal policy for " "non-priviliged defined %d\n", __func__, currsp->policy)); *error = EINVAL; return (NULL); case IPSEC_POLICY_ENTRUST: sp = KEY_ALLOCSP_DEFAULT(); break; case IPSEC_POLICY_IPSEC: key_addref(currsp); sp = currsp; break; default: ipseclog((LOG_ERR, "%s: Invalid policy for " "PCB %d\n", __func__, currsp->policy)); *error = EINVAL; return (NULL); } } } IPSEC_ASSERT(sp != NULL, ("null SP (priv %u policy %u", pcbsp->priv, currsp->policy)); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s (priv %u policy %u) allocate SP:%p (refcnt %u)\n", __func__, pcbsp->priv, currsp->policy, sp, sp->refcnt)); return (sp); } /* * For FORWADING packet or OUTBOUND without a socket. Searching SPD for packet, * and return a pointer to SP. * OUT: positive: a pointer to the entry for security policy leaf matched. * NULL: no apropreate SP found, the following value is set to error. * 0 : bypass * EACCES : discard packet. * ENOENT : ipsec_acquire() in progress, maybe. * others : error occured. */ struct secpolicy * ipsec_getpolicybyaddr(struct mbuf *m, u_int dir, int flag, int *error) { struct secpolicyindex spidx; struct secpolicy *sp; IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(error != NULL, ("null error")); IPSEC_ASSERT(dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND, ("invalid direction %u", dir)); sp = NULL; if (key_havesp(dir)) { /* Make an index to look for a policy. */ *error = ipsec_setspidx(m, &spidx, (flag & IP_FORWARDING) ? 0 : 1); if (*error != 0) { DPRINTF(("%s: setpidx failed, dir %u flag %u\n", __func__, dir, flag)); return (NULL); } spidx.dir = dir; sp = KEY_ALLOCSP(&spidx, dir); } if (sp == NULL) /* No SP found, use system default. */ sp = KEY_ALLOCSP_DEFAULT(); IPSEC_ASSERT(sp != NULL, ("null SP")); return (sp); } struct secpolicy * ipsec4_checkpolicy(struct mbuf *m, u_int dir, u_int flag, int *error, struct inpcb *inp) { struct secpolicy *sp; *error = 0; if (inp == NULL) sp = ipsec_getpolicybyaddr(m, dir, flag, error); else sp = ipsec_getpolicybysock(m, dir, inp, error); if (sp == NULL) { IPSEC_ASSERT(*error != 0, ("getpolicy failed w/o error")); IPSECSTAT_INC(ips_out_inval); return (NULL); } IPSEC_ASSERT(*error == 0, ("sp w/ error set to %u", *error)); switch (sp->policy) { case IPSEC_POLICY_ENTRUST: default: printf("%s: invalid policy %u\n", __func__, sp->policy); /* FALLTHROUGH */ case IPSEC_POLICY_DISCARD: IPSECSTAT_INC(ips_out_polvio); *error = -EINVAL; /* Packet is discarded by caller. */ break; case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: KEY_FREESP(&sp); sp = NULL; /* NB: force NULL result. */ break; case IPSEC_POLICY_IPSEC: if (sp->req == NULL) /* Acquire a SA. */ *error = key_spdacquire(sp); break; } if (*error != 0) { KEY_FREESP(&sp); sp = NULL; } return (sp); } static int ipsec_setspidx_inpcb(struct mbuf *m, struct inpcb *inp) { int error; IPSEC_ASSERT(inp != NULL, ("null inp")); IPSEC_ASSERT(inp->inp_sp != NULL, ("null inp_sp")); IPSEC_ASSERT(inp->inp_sp->sp_out != NULL && inp->inp_sp->sp_in != NULL, ("null sp_in || sp_out")); error = ipsec_setspidx(m, &inp->inp_sp->sp_in->spidx, 1); if (error == 0) { inp->inp_sp->sp_in->spidx.dir = IPSEC_DIR_INBOUND; inp->inp_sp->sp_out->spidx = inp->inp_sp->sp_in->spidx; inp->inp_sp->sp_out->spidx.dir = IPSEC_DIR_OUTBOUND; } else { bzero(&inp->inp_sp->sp_in->spidx, sizeof (inp->inp_sp->sp_in->spidx)); bzero(&inp->inp_sp->sp_out->spidx, sizeof (inp->inp_sp->sp_in->spidx)); } return (error); } /* * Configure security policy index (src/dst/proto/sport/dport) * by looking at the content of mbuf. * The caller is responsible for error recovery (like clearing up spidx). */ static int ipsec_setspidx(struct mbuf *m, struct secpolicyindex *spidx, int needport) { struct ip *ip = NULL; struct ip ipbuf; u_int v; struct mbuf *n; int len; int error; IPSEC_ASSERT(m != NULL, ("null mbuf")); /* * Validate m->m_pkthdr.len. We see incorrect length if we * mistakenly call this function with inconsistent mbuf chain * (like 4.4BSD tcp/udp processing). XXX Should we panic here? */ len = 0; for (n = m; n; n = n->m_next) len += n->m_len; if (m->m_pkthdr.len != len) { KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: pkthdr len(%d) mismatch (%d), ignored.\n", __func__, len, m->m_pkthdr.len)); return (EINVAL); } if (m->m_pkthdr.len < sizeof(struct ip)) { KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: pkthdr len(%d) too small (v4), ignored.\n", __func__, m->m_pkthdr.len)); return (EINVAL); } if (m->m_len >= sizeof(*ip)) ip = mtod(m, struct ip *); else { m_copydata(m, 0, sizeof(ipbuf), (caddr_t)&ipbuf); ip = &ipbuf; } v = ip->ip_v; switch (v) { case 4: error = ipsec4_setspidx_ipaddr(m, spidx); if (error) return (error); ipsec4_get_ulp(m, spidx, needport); return (0); #ifdef INET6 case 6: if (m->m_pkthdr.len < sizeof(struct ip6_hdr)) { KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: pkthdr len(%d) too small (v6), " "ignored\n", __func__, m->m_pkthdr.len)); return (EINVAL); } error = ipsec6_setspidx_ipaddr(m, spidx); if (error) return (error); ipsec6_get_ulp(m, spidx, needport); return (0); #endif default: KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: " "unknown IP version %u, ignored.\n", __func__, v)); return (EINVAL); } } static void ipsec4_get_ulp(struct mbuf *m, struct secpolicyindex *spidx, int needport) { u_int8_t nxt; int off; /* Sanity check. */ IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(m->m_pkthdr.len >= sizeof(struct ip),("packet too short")); if (m->m_len >= sizeof (struct ip)) { struct ip *ip = mtod(m, struct ip *); if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) goto done; off = ip->ip_hl << 2; nxt = ip->ip_p; } else { struct ip ih; m_copydata(m, 0, sizeof (struct ip), (caddr_t) &ih); if (ih.ip_off & htons(IP_MF | IP_OFFMASK)) goto done; off = ih.ip_hl << 2; nxt = ih.ip_p; } while (off < m->m_pkthdr.len) { struct ip6_ext ip6e; struct tcphdr th; struct udphdr uh; switch (nxt) { case IPPROTO_TCP: spidx->ul_proto = nxt; if (!needport) goto done_proto; if (off + sizeof(struct tcphdr) > m->m_pkthdr.len) goto done; m_copydata(m, off, sizeof (th), (caddr_t) &th); spidx->src.sin.sin_port = th.th_sport; spidx->dst.sin.sin_port = th.th_dport; return; case IPPROTO_UDP: spidx->ul_proto = nxt; if (!needport) goto done_proto; if (off + sizeof(struct udphdr) > m->m_pkthdr.len) goto done; m_copydata(m, off, sizeof (uh), (caddr_t) &uh); spidx->src.sin.sin_port = uh.uh_sport; spidx->dst.sin.sin_port = uh.uh_dport; return; case IPPROTO_AH: if (off + sizeof(ip6e) > m->m_pkthdr.len) goto done; /* XXX Sigh, this works but is totally bogus. */ m_copydata(m, off, sizeof(ip6e), (caddr_t) &ip6e); off += (ip6e.ip6e_len + 2) << 2; nxt = ip6e.ip6e_nxt; break; case IPPROTO_ICMP: default: /* XXX Intermediate headers??? */ spidx->ul_proto = nxt; goto done_proto; } } done: spidx->ul_proto = IPSEC_ULPROTO_ANY; done_proto: spidx->src.sin.sin_port = IPSEC_PORT_ANY; spidx->dst.sin.sin_port = IPSEC_PORT_ANY; } /* Assumes that m is sane. */ static int ipsec4_setspidx_ipaddr(struct mbuf *m, struct secpolicyindex *spidx) { static const struct sockaddr_in template = { sizeof (struct sockaddr_in), AF_INET, 0, { 0 }, { 0, 0, 0, 0, 0, 0, 0, 0 } }; spidx->src.sin = template; spidx->dst.sin = template; if (m->m_len < sizeof (struct ip)) { m_copydata(m, offsetof(struct ip, ip_src), sizeof (struct in_addr), (caddr_t) &spidx->src.sin.sin_addr); m_copydata(m, offsetof(struct ip, ip_dst), sizeof (struct in_addr), (caddr_t) &spidx->dst.sin.sin_addr); } else { struct ip *ip = mtod(m, struct ip *); spidx->src.sin.sin_addr = ip->ip_src; spidx->dst.sin.sin_addr = ip->ip_dst; } spidx->prefs = sizeof(struct in_addr) << 3; spidx->prefd = sizeof(struct in_addr) << 3; return (0); } #ifdef INET6 static void ipsec6_get_ulp(struct mbuf *m, struct secpolicyindex *spidx, int needport) { int off, nxt; struct tcphdr th; struct udphdr uh; struct icmp6_hdr ih; /* Sanity check. */ if (m == NULL) panic("%s: NULL pointer was passed.\n", __func__); KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s:\n", __func__); kdebug_mbuf(m)); /* Set default. */ spidx->ul_proto = IPSEC_ULPROTO_ANY; ((struct sockaddr_in6 *)&spidx->src)->sin6_port = IPSEC_PORT_ANY; ((struct sockaddr_in6 *)&spidx->dst)->sin6_port = IPSEC_PORT_ANY; nxt = -1; off = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt); if (off < 0 || m->m_pkthdr.len < off) return; switch (nxt) { case IPPROTO_TCP: spidx->ul_proto = nxt; if (!needport) break; if (off + sizeof(struct tcphdr) > m->m_pkthdr.len) break; m_copydata(m, off, sizeof(th), (caddr_t)&th); ((struct sockaddr_in6 *)&spidx->src)->sin6_port = th.th_sport; ((struct sockaddr_in6 *)&spidx->dst)->sin6_port = th.th_dport; break; case IPPROTO_UDP: spidx->ul_proto = nxt; if (!needport) break; if (off + sizeof(struct udphdr) > m->m_pkthdr.len) break; m_copydata(m, off, sizeof(uh), (caddr_t)&uh); ((struct sockaddr_in6 *)&spidx->src)->sin6_port = uh.uh_sport; ((struct sockaddr_in6 *)&spidx->dst)->sin6_port = uh.uh_dport; break; case IPPROTO_ICMPV6: spidx->ul_proto = nxt; if (off + sizeof(struct icmp6_hdr) > m->m_pkthdr.len) break; m_copydata(m, off, sizeof(ih), (caddr_t)&ih); ((struct sockaddr_in6 *)&spidx->src)->sin6_port = htons((uint16_t)ih.icmp6_type); ((struct sockaddr_in6 *)&spidx->dst)->sin6_port = htons((uint16_t)ih.icmp6_code); break; default: /* XXX Intermediate headers??? */ spidx->ul_proto = nxt; break; } } /* Assumes that m is sane. */ static int ipsec6_setspidx_ipaddr(struct mbuf *m, struct secpolicyindex *spidx) { struct ip6_hdr *ip6 = NULL; struct ip6_hdr ip6buf; struct sockaddr_in6 *sin6; if (m->m_len >= sizeof(*ip6)) ip6 = mtod(m, struct ip6_hdr *); else { m_copydata(m, 0, sizeof(ip6buf), (caddr_t)&ip6buf); ip6 = &ip6buf; } sin6 = (struct sockaddr_in6 *)&spidx->src; bzero(sin6, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(struct sockaddr_in6); bcopy(&ip6->ip6_src, &sin6->sin6_addr, sizeof(ip6->ip6_src)); if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) { sin6->sin6_addr.s6_addr16[1] = 0; sin6->sin6_scope_id = ntohs(ip6->ip6_src.s6_addr16[1]); } spidx->prefs = sizeof(struct in6_addr) << 3; sin6 = (struct sockaddr_in6 *)&spidx->dst; bzero(sin6, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(struct sockaddr_in6); bcopy(&ip6->ip6_dst, &sin6->sin6_addr, sizeof(ip6->ip6_dst)); if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst)) { sin6->sin6_addr.s6_addr16[1] = 0; sin6->sin6_scope_id = ntohs(ip6->ip6_dst.s6_addr16[1]); } spidx->prefd = sizeof(struct in6_addr) << 3; return (0); } #endif static void ipsec_delpcbpolicy(struct inpcbpolicy *p) { free(p, M_IPSEC_INPCB); } /* Initialize policy in PCB. */ int ipsec_init_policy(struct socket *so, struct inpcbpolicy **pcb_sp) { struct inpcbpolicy *new; /* Sanity check. */ if (so == NULL || pcb_sp == NULL) panic("%s: NULL pointer was passed.\n", __func__); new = (struct inpcbpolicy *) malloc(sizeof(struct inpcbpolicy), M_IPSEC_INPCB, M_NOWAIT|M_ZERO); if (new == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); return (ENOBUFS); } new->priv = IPSEC_IS_PRIVILEGED_SO(so); if ((new->sp_in = KEY_NEWSP()) == NULL) { ipsec_delpcbpolicy(new); return (ENOBUFS); } new->sp_in->state = IPSEC_SPSTATE_ALIVE; new->sp_in->policy = IPSEC_POLICY_ENTRUST; if ((new->sp_out = KEY_NEWSP()) == NULL) { KEY_FREESP(&new->sp_in); ipsec_delpcbpolicy(new); return (ENOBUFS); } new->sp_out->state = IPSEC_SPSTATE_ALIVE; new->sp_out->policy = IPSEC_POLICY_ENTRUST; *pcb_sp = new; return (0); } /* Copy old IPsec policy into new. */ int ipsec_copy_policy(struct inpcbpolicy *old, struct inpcbpolicy *new) { struct secpolicy *sp; sp = ipsec_deepcopy_policy(old->sp_in); if (sp) { KEY_FREESP(&new->sp_in); new->sp_in = sp; } else return (ENOBUFS); sp = ipsec_deepcopy_policy(old->sp_out); if (sp) { KEY_FREESP(&new->sp_out); new->sp_out = sp; } else return (ENOBUFS); new->priv = old->priv; return (0); } struct ipsecrequest * ipsec_newisr(void) { struct ipsecrequest *p; p = malloc(sizeof(struct ipsecrequest), M_IPSEC_SR, M_NOWAIT|M_ZERO); if (p != NULL) IPSECREQUEST_LOCK_INIT(p); return (p); } void ipsec_delisr(struct ipsecrequest *p) { IPSECREQUEST_LOCK_DESTROY(p); free(p, M_IPSEC_SR); } /* Deep-copy a policy in PCB. */ static struct secpolicy * ipsec_deepcopy_policy(struct secpolicy *src) { struct ipsecrequest *newchain = NULL; struct ipsecrequest *p; struct ipsecrequest **q; struct ipsecrequest *r; struct secpolicy *dst; if (src == NULL) return (NULL); dst = KEY_NEWSP(); if (dst == NULL) return (NULL); /* * Deep-copy IPsec request chain. This is required since struct * ipsecrequest is not reference counted. */ q = &newchain; for (p = src->req; p; p = p->next) { *q = ipsec_newisr(); if (*q == NULL) goto fail; (*q)->saidx.proto = p->saidx.proto; (*q)->saidx.mode = p->saidx.mode; (*q)->level = p->level; (*q)->saidx.reqid = p->saidx.reqid; bcopy(&p->saidx.src, &(*q)->saidx.src, sizeof((*q)->saidx.src)); bcopy(&p->saidx.dst, &(*q)->saidx.dst, sizeof((*q)->saidx.dst)); (*q)->sp = dst; q = &((*q)->next); } dst->req = newchain; dst->state = src->state; dst->policy = src->policy; /* Do not touch the refcnt fields. */ return (dst); fail: for (p = newchain; p; p = r) { r = p->next; ipsec_delisr(p); p = NULL; } return (NULL); } /* Set policy and IPsec request if present. */ static int ipsec_set_policy_internal(struct secpolicy **pcb_sp, int optname, caddr_t request, size_t len, struct ucred *cred) { struct sadb_x_policy *xpl; struct secpolicy *newsp = NULL; int error; /* Sanity check. */ if (pcb_sp == NULL || *pcb_sp == NULL || request == NULL) return (EINVAL); if (len < sizeof(*xpl)) return (EINVAL); xpl = (struct sadb_x_policy *)request; KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: passed policy\n", __func__); kdebug_sadb_x_policy((struct sadb_ext *)xpl)); /* Check policy type. */ /* ipsec_set_policy_internal() accepts IPSEC, ENTRUST and BYPASS. */ if (xpl->sadb_x_policy_type == IPSEC_POLICY_DISCARD || xpl->sadb_x_policy_type == IPSEC_POLICY_NONE) return (EINVAL); /* Check privileged socket. */ if (cred != NULL && xpl->sadb_x_policy_type == IPSEC_POLICY_BYPASS) { error = priv_check_cred(cred, PRIV_NETINET_IPSEC, 0); if (error) return (EACCES); } /* Allocating new SP entry. */ if ((newsp = key_msg2sp(xpl, len, &error)) == NULL) return (error); newsp->state = IPSEC_SPSTATE_ALIVE; /* Clear old SP and set new SP. */ KEY_FREESP(pcb_sp); *pcb_sp = newsp; KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: new policy\n", __func__); kdebug_secpolicy(newsp)); return (0); } int ipsec_set_policy(struct inpcb *inp, int optname, caddr_t request, size_t len, struct ucred *cred) { struct sadb_x_policy *xpl; struct secpolicy **pcb_sp; /* Sanity check. */ if (inp == NULL || request == NULL) return (EINVAL); if (len < sizeof(*xpl)) return (EINVAL); xpl = (struct sadb_x_policy *)request; /* Select direction. */ switch (xpl->sadb_x_policy_dir) { case IPSEC_DIR_INBOUND: pcb_sp = &inp->inp_sp->sp_in; break; case IPSEC_DIR_OUTBOUND: pcb_sp = &inp->inp_sp->sp_out; break; default: ipseclog((LOG_ERR, "%s: invalid direction=%u\n", __func__, xpl->sadb_x_policy_dir)); return (EINVAL); } return (ipsec_set_policy_internal(pcb_sp, optname, request, len, cred)); } int ipsec_get_policy(struct inpcb *inp, caddr_t request, size_t len, struct mbuf **mp) { struct sadb_x_policy *xpl; struct secpolicy *pcb_sp; /* Sanity check. */ if (inp == NULL || request == NULL || mp == NULL) return (EINVAL); IPSEC_ASSERT(inp->inp_sp != NULL, ("null inp_sp")); if (len < sizeof(*xpl)) return (EINVAL); xpl = (struct sadb_x_policy *)request; /* Select direction. */ switch (xpl->sadb_x_policy_dir) { case IPSEC_DIR_INBOUND: pcb_sp = inp->inp_sp->sp_in; break; case IPSEC_DIR_OUTBOUND: pcb_sp = inp->inp_sp->sp_out; break; default: ipseclog((LOG_ERR, "%s: invalid direction=%u\n", __func__, xpl->sadb_x_policy_dir)); return (EINVAL); } /* Sanity check. Should be an IPSEC_ASSERT. */ if (pcb_sp == NULL) return (EINVAL); *mp = key_sp2msg(pcb_sp); if (!*mp) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); return (ENOBUFS); } (*mp)->m_type = MT_DATA; KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s:\n", __func__); kdebug_mbuf(*mp)); return (0); } /* Delete policy in PCB. */ int ipsec_delete_pcbpolicy(struct inpcb *inp) { IPSEC_ASSERT(inp != NULL, ("null inp")); if (inp->inp_sp == NULL) return (0); if (inp->inp_sp->sp_in != NULL) KEY_FREESP(&inp->inp_sp->sp_in); if (inp->inp_sp->sp_out != NULL) KEY_FREESP(&inp->inp_sp->sp_out); ipsec_delpcbpolicy(inp->inp_sp); inp->inp_sp = NULL; return (0); } /* * Return current level. * Either IPSEC_LEVEL_USE or IPSEC_LEVEL_REQUIRE are always returned. */ u_int ipsec_get_reqlevel(struct ipsecrequest *isr) { u_int level = 0; u_int esp_trans_deflev, esp_net_deflev; u_int ah_trans_deflev, ah_net_deflev; IPSEC_ASSERT(isr != NULL && isr->sp != NULL, ("null argument")); IPSEC_ASSERT(isr->sp->spidx.src.sa.sa_family == isr->sp->spidx.dst.sa.sa_family, ("af family mismatch, src %u, dst %u", isr->sp->spidx.src.sa.sa_family, isr->sp->spidx.dst.sa.sa_family)); /* XXX Note that we have ipseclog() expanded here - code sync issue. */ #define IPSEC_CHECK_DEFAULT(lev) \ (((lev) != IPSEC_LEVEL_USE && (lev) != IPSEC_LEVEL_REQUIRE \ && (lev) != IPSEC_LEVEL_UNIQUE) \ ? (V_ipsec_debug \ ? log(LOG_INFO, "fixed system default level " #lev ":%d->%d\n",\ (lev), IPSEC_LEVEL_REQUIRE) \ : 0), \ (lev) = IPSEC_LEVEL_REQUIRE, \ (lev) \ : (lev)) /* Set default level. */ switch (((struct sockaddr *)&isr->sp->spidx.src)->sa_family) { #ifdef INET case AF_INET: esp_trans_deflev = IPSEC_CHECK_DEFAULT(V_ip4_esp_trans_deflev); esp_net_deflev = IPSEC_CHECK_DEFAULT(V_ip4_esp_net_deflev); ah_trans_deflev = IPSEC_CHECK_DEFAULT(V_ip4_ah_trans_deflev); ah_net_deflev = IPSEC_CHECK_DEFAULT(V_ip4_ah_net_deflev); break; #endif #ifdef INET6 case AF_INET6: esp_trans_deflev = IPSEC_CHECK_DEFAULT(V_ip6_esp_trans_deflev); esp_net_deflev = IPSEC_CHECK_DEFAULT(V_ip6_esp_net_deflev); ah_trans_deflev = IPSEC_CHECK_DEFAULT(V_ip6_ah_trans_deflev); ah_net_deflev = IPSEC_CHECK_DEFAULT(V_ip6_ah_net_deflev); break; #endif /* INET6 */ default: panic("%s: unknown af %u", __func__, isr->sp->spidx.src.sa.sa_family); } #undef IPSEC_CHECK_DEFAULT /* Set level. */ switch (isr->level) { case IPSEC_LEVEL_DEFAULT: switch (isr->saidx.proto) { case IPPROTO_ESP: if (isr->saidx.mode == IPSEC_MODE_TUNNEL) level = esp_net_deflev; else level = esp_trans_deflev; break; case IPPROTO_AH: if (isr->saidx.mode == IPSEC_MODE_TUNNEL) level = ah_net_deflev; else level = ah_trans_deflev; break; case IPPROTO_IPCOMP: /* * We don't really care, as IPcomp document says that * we shouldn't compress small packets. */ level = IPSEC_LEVEL_USE; break; default: panic("%s: Illegal protocol defined %u\n", __func__, isr->saidx.proto); } break; case IPSEC_LEVEL_USE: case IPSEC_LEVEL_REQUIRE: level = isr->level; break; case IPSEC_LEVEL_UNIQUE: level = IPSEC_LEVEL_REQUIRE; break; default: panic("%s: Illegal IPsec level %u\n", __func__, isr->level); } return (level); } /* * Check security policy requirements against the actual * packet contents. Return one if the packet should be * reject as "invalid"; otherwiser return zero to have the * packet treated as "valid". * * OUT: * 0: valid * 1: invalid */ int ipsec_in_reject(struct secpolicy *sp, struct mbuf *m) { struct ipsecrequest *isr; int need_auth; KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("%s: using SP\n", __func__); kdebug_secpolicy(sp)); /* Check policy. */ switch (sp->policy) { case IPSEC_POLICY_DISCARD: return (1); case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: return (0); } IPSEC_ASSERT(sp->policy == IPSEC_POLICY_IPSEC, ("invalid policy %u", sp->policy)); /* XXX Should compare policy against IPsec header history. */ need_auth = 0; for (isr = sp->req; isr != NULL; isr = isr->next) { if (ipsec_get_reqlevel(isr) != IPSEC_LEVEL_REQUIRE) continue; switch (isr->saidx.proto) { case IPPROTO_ESP: if ((m->m_flags & M_DECRYPTED) == 0) { KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: ESP m_flags:%x\n", __func__, m->m_flags)); return (1); } if (!need_auth && isr->sav != NULL && isr->sav->tdb_authalgxform != NULL && (m->m_flags & M_AUTHIPDGM) == 0) { KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: ESP/AH m_flags:%x\n", __func__, m->m_flags)); return (1); } break; case IPPROTO_AH: need_auth = 1; if ((m->m_flags & M_AUTHIPHDR) == 0) { KEYDEBUG(KEYDEBUG_IPSEC_DUMP, printf("%s: AH m_flags:%x\n", __func__, m->m_flags)); return (1); } break; case IPPROTO_IPCOMP: /* * We don't really care, as IPcomp document * says that we shouldn't compress small * packets. IPComp policy should always be * treated as being in "use" level. */ break; } } return (0); /* Valid. */ } static int ipsec46_in_reject(struct mbuf *m, struct inpcb *inp) { struct secpolicy *sp; int error; int result; IPSEC_ASSERT(m != NULL, ("null mbuf")); /* * Get SP for this packet. * When we are called from ip_forward(), we call * ipsec_getpolicybyaddr() with IP_FORWARDING flag. */ if (inp == NULL) sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND, IP_FORWARDING, &error); else sp = ipsec_getpolicybysock(m, IPSEC_DIR_INBOUND, inp, &error); if (sp != NULL) { result = ipsec_in_reject(sp, m); KEY_FREESP(&sp); } else { result = 0; /* XXX Should be panic? * -> No, there may be error. */ } return (result); } /* * Check AH/ESP integrity. * This function is called from tcp_input(), udp_input(), * and {ah,esp}4_input for tunnel mode. */ int ipsec4_in_reject(struct mbuf *m, struct inpcb *inp) { int result; result = ipsec46_in_reject(m, inp); if (result) IPSECSTAT_INC(ips_in_polvio); return (result); } #ifdef INET6 /* * Check AH/ESP integrity. * This function is called from tcp6_input(), udp6_input(), * and {ah,esp}6_input for tunnel mode. */ int ipsec6_in_reject(struct mbuf *m, struct inpcb *inp) { int result; result = ipsec46_in_reject(m, inp); if (result) IPSEC6STAT_INC(ips_in_polvio); return (result); } #endif /* * Compute the byte size to be occupied by IPsec header. * In case it is tunnelled, it includes the size of outer IP header. * NOTE: SP passed is freed in this function. */ static size_t ipsec_hdrsiz_internal(struct secpolicy *sp) { struct ipsecrequest *isr; size_t size; KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("%s: using SP\n", __func__); kdebug_secpolicy(sp)); switch (sp->policy) { case IPSEC_POLICY_DISCARD: case IPSEC_POLICY_BYPASS: case IPSEC_POLICY_NONE: return (0); } IPSEC_ASSERT(sp->policy == IPSEC_POLICY_IPSEC, ("invalid policy %u", sp->policy)); size = 0; for (isr = sp->req; isr != NULL; isr = isr->next) { size_t clen = 0; switch (isr->saidx.proto) { case IPPROTO_ESP: clen = esp_hdrsiz(isr->sav); break; case IPPROTO_AH: clen = ah_hdrsiz(isr->sav); break; case IPPROTO_IPCOMP: clen = sizeof(struct ipcomp); break; } if (isr->saidx.mode == IPSEC_MODE_TUNNEL) { switch (isr->saidx.dst.sa.sa_family) { case AF_INET: clen += sizeof(struct ip); break; #ifdef INET6 case AF_INET6: clen += sizeof(struct ip6_hdr); break; #endif default: ipseclog((LOG_ERR, "%s: unknown AF %d in " "IPsec tunnel SA\n", __func__, ((struct sockaddr *)&isr->saidx.dst)->sa_family)); break; } } size += clen; } return (size); } /* * This function is called from ipsec_hdrsiz_tcp(), ip_ipsec_mtu(), * disabled ip6_ipsec_mtu() and ip6_forward(). */ size_t ipsec_hdrsiz(struct mbuf *m, u_int dir, struct inpcb *inp) { struct secpolicy *sp; int error; size_t size; IPSEC_ASSERT(m != NULL, ("null mbuf")); /* Get SP for this packet. * When we are called from ip_forward(), we call * ipsec_getpolicybyaddr() with IP_FORWARDING flag. */ if (inp == NULL) sp = ipsec_getpolicybyaddr(m, dir, IP_FORWARDING, &error); else sp = ipsec_getpolicybysock(m, dir, inp, &error); if (sp != NULL) { size = ipsec_hdrsiz_internal(sp); KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("%s: size:%lu.\n", __func__, (unsigned long)size)); KEY_FREESP(&sp); } else { size = 0; /* XXX Should be panic? * -> No, we are called w/o knowing if * IPsec processing is needed. */ } return (size); } /* * Check the variable replay window. * ipsec_chkreplay() performs replay check before ICV verification. * ipsec_updatereplay() updates replay bitmap. This must be called after * ICV verification (it also performs replay check, which is usually done * beforehand). * 0 (zero) is returned if packet disallowed, 1 if packet permitted. * * Based on RFC 2401. */ int ipsec_chkreplay(u_int32_t seq, struct secasvar *sav) { const struct secreplay *replay; u_int32_t diff; int fr; u_int32_t wsizeb; /* Constant: bits of window size. */ int frlast; /* Constant: last frame. */ IPSEC_ASSERT(sav != NULL, ("Null SA")); IPSEC_ASSERT(sav->replay != NULL, ("Null replay state")); replay = sav->replay; if (replay->wsize == 0) return (1); /* No need to check replay. */ /* Constant. */ frlast = replay->wsize - 1; wsizeb = replay->wsize << 3; /* Sequence number of 0 is invalid. */ if (seq == 0) return (0); /* First time is always okay. */ if (replay->count == 0) return (1); if (seq > replay->lastseq) { /* Larger sequences are okay. */ return (1); } else { /* seq is equal or less than lastseq. */ diff = replay->lastseq - seq; /* Over range to check, i.e. too old or wrapped. */ if (diff >= wsizeb) return (0); fr = frlast - diff / 8; /* This packet already seen? */ if ((replay->bitmap)[fr] & (1 << (diff % 8))) return (0); /* Out of order but good. */ return (1); } } /* * Check replay counter whether to update or not. * OUT: 0: OK * 1: NG */ int ipsec_updatereplay(u_int32_t seq, struct secasvar *sav) { struct secreplay *replay; u_int32_t diff; int fr; u_int32_t wsizeb; /* Constant: bits of window size. */ int frlast; /* Constant: last frame. */ IPSEC_ASSERT(sav != NULL, ("Null SA")); IPSEC_ASSERT(sav->replay != NULL, ("Null replay state")); replay = sav->replay; if (replay->wsize == 0) goto ok; /* No need to check replay. */ /* Constant. */ frlast = replay->wsize - 1; wsizeb = replay->wsize << 3; /* Sequence number of 0 is invalid. */ if (seq == 0) return (1); /* First time. */ if (replay->count == 0) { replay->lastseq = seq; bzero(replay->bitmap, replay->wsize); (replay->bitmap)[frlast] = 1; goto ok; } if (seq > replay->lastseq) { /* seq is larger than lastseq. */ diff = seq - replay->lastseq; /* New larger sequence number. */ if (diff < wsizeb) { /* In window. */ /* Set bit for this packet. */ vshiftl(replay->bitmap, diff, replay->wsize); (replay->bitmap)[frlast] |= 1; } else { /* This packet has a "way larger". */ bzero(replay->bitmap, replay->wsize); (replay->bitmap)[frlast] = 1; } replay->lastseq = seq; /* Larger is good. */ } else { /* seq is equal or less than lastseq. */ diff = replay->lastseq - seq; /* Over range to check, i.e. too old or wrapped. */ if (diff >= wsizeb) return (1); fr = frlast - diff / 8; /* This packet already seen? */ if ((replay->bitmap)[fr] & (1 << (diff % 8))) return (1); /* Mark as seen. */ (replay->bitmap)[fr] |= (1 << (diff % 8)); /* Out of order but good. */ } ok: if (replay->count == ~0) { /* Set overflow flag. */ replay->overflow++; /* Don't increment, no more packets accepted. */ if ((sav->flags & SADB_X_EXT_CYCSEQ) == 0) return (1); ipseclog((LOG_WARNING, "%s: replay counter made %d cycle. %s\n", __func__, replay->overflow, ipsec_logsastr(sav))); } replay->count++; return (0); } /* * Shift variable length buffer to left. * IN: bitmap: pointer to the buffer * nbit: the number of to shift. * wsize: buffer size (bytes). */ static void vshiftl(unsigned char *bitmap, int nbit, int wsize) { int s, j, i; unsigned char over; for (j = 0; j < nbit; j += 8) { s = (nbit - j < 8) ? (nbit - j): 8; bitmap[0] <<= s; for (i = 1; i < wsize; i++) { over = (bitmap[i] >> (8 - s)); bitmap[i] <<= s; bitmap[i-1] |= over; } } } #ifdef INET /* Return a printable string for the IPv4 address. */ static char * inet_ntoa4(struct in_addr ina) { static char buf[4][4 * sizeof "123" + 4]; unsigned char *ucp = (unsigned char *) &ina; static int i = 3; /* XXX-BZ Returns static buffer. */ i = (i + 1) % 4; sprintf(buf[i], "%d.%d.%d.%d", ucp[0] & 0xff, ucp[1] & 0xff, ucp[2] & 0xff, ucp[3] & 0xff); return (buf[i]); } #endif /* Return a printable string for the address. */ char * ipsec_address(union sockaddr_union* sa) { #ifdef INET6 char ip6buf[INET6_ADDRSTRLEN]; #endif switch (sa->sa.sa_family) { #ifdef INET case AF_INET: return (inet_ntoa4(sa->sin.sin_addr)); #endif /* INET */ #ifdef INET6 case AF_INET6: return (ip6_sprintf(ip6buf, &sa->sin6.sin6_addr)); #endif /* INET6 */ default: return ("(unknown address family)"); } } const char * ipsec_logsastr(struct secasvar *sav) { static char buf[256]; char *p; struct secasindex *saidx = &sav->sah->saidx; IPSEC_ASSERT(saidx->src.sa.sa_family == saidx->dst.sa.sa_family, ("address family mismatch")); p = buf; snprintf(buf, sizeof(buf), "SA(SPI=%u ", (u_int32_t)ntohl(sav->spi)); while (p && *p) p++; /* NB: only use ipsec_address on one address at a time. */ snprintf(p, sizeof (buf) - (p - buf), "src=%s ", ipsec_address(&saidx->src)); while (p && *p) p++; snprintf(p, sizeof (buf) - (p - buf), "dst=%s)", ipsec_address(&saidx->dst)); return (buf); } void ipsec_dumpmbuf(struct mbuf *m) { int totlen; int i; u_char *p; totlen = 0; printf("---\n"); while (m) { p = mtod(m, u_char *); for (i = 0; i < m->m_len; i++) { printf("%02x ", p[i]); totlen++; if (totlen % 16 == 0) printf("\n"); } m = m->m_next; } if (totlen % 16 != 0) printf("\n"); printf("---\n"); } static void ipsec_init(const void *unused __unused) { SECPOLICY_LOCK_INIT(&V_ip4_def_policy); V_ip4_def_policy.refcnt = 1; /* NB: disallow free. */ } VNET_SYSINIT(ipsec_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, ipsec_init, NULL); /* XXX This stuff doesn't belong here... */ static struct xformsw* xforms = NULL; /* * Register a transform; typically at system startup. */ void xform_register(struct xformsw* xsp) { xsp->xf_next = xforms; xforms = xsp; } /* * Initialize transform support in an sav. */ int xform_init(struct secasvar *sav, int xftype) { struct xformsw *xsp; if (sav->tdb_xform != NULL) /* Previously initialized. */ return (0); for (xsp = xforms; xsp; xsp = xsp->xf_next) if (xsp->xf_type == xftype) return ((*xsp->xf_init)(sav, xsp)); return (EINVAL); } Index: user/dchagin/lemul/sys/netipsec/ipsec_input.c =================================================================== --- user/dchagin/lemul/sys/netipsec/ipsec_input.c (revision 274240) +++ user/dchagin/lemul/sys/netipsec/ipsec_input.c (revision 274241) @@ -1,955 +1,944 @@ /* $FreeBSD$ */ /* $OpenBSD: ipsec_input.c,v 1.63 2003/02/20 18:35:43 deraadt Exp $ */ /*- * The authors of this code are John Ioannidis (ji@tla.org), * Angelos D. Keromytis (kermit@csd.uch.gr) and * Niels Provos (provos@physnet.uni-hamburg.de). * * This code was written by John Ioannidis for BSD/OS in Athens, Greece, * in November 1995. * * Ported to OpenBSD and NetBSD, with additional transforms, in December 1996, * by Angelos D. Keromytis. * * Additional transforms and features in 1997 and 1998 by Angelos D. Keromytis * and Niels Provos. * * Additional features in 1999 by Angelos D. Keromytis. * * Copyright (C) 1995, 1996, 1997, 1998, 1999 by John Ioannidis, * Angelos D. Keromytis and Niels Provos. * Copyright (c) 2001, Angelos D. Keromytis. * * Permission to use, copy, and modify this software with or without fee * is hereby granted, provided that this entire notice is included in * all copies of any software which is or includes a copy or * modification of this software. * You may use this code under the GNU public license if you so wish. Please * contribute changes back to the authors under this freer than GPL license * so that we may further the use of strong encryption without limitations to * all. * * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR * IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE * MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR * PURPOSE. */ /* * IPsec input processing. */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_enc.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #ifdef INET6 #include #endif #include #ifdef INET6 #include #endif #include #include #include #include #include #include #include #include #include #include #ifdef DEV_ENC #include #endif #define IPSEC_ISTAT(proto, name) do { \ if ((proto) == IPPROTO_ESP) \ ESPSTAT_INC(esps_##name); \ else if ((proto) == IPPROTO_AH) \ AHSTAT_INC(ahs_##name); \ else \ IPCOMPSTAT_INC(ipcomps_##name); \ } while (0) #ifdef INET static void ipsec4_common_ctlinput(int, struct sockaddr *, void *, int); #endif /* * ipsec_common_input gets called when an IPsec-protected packet * is received by IPv4 or IPv6. Its job is to find the right SA * and call the appropriate transform. The transform callback * takes care of further processing (like ingress filtering). */ static int ipsec_common_input(struct mbuf *m, int skip, int protoff, int af, int sproto) { union sockaddr_union dst_address; struct secasvar *sav; u_int32_t spi; int error; #ifdef INET #ifdef IPSEC_NAT_T struct m_tag *tag; #endif #endif IPSEC_ISTAT(sproto, input); IPSEC_ASSERT(m != NULL, ("null packet")); IPSEC_ASSERT(sproto == IPPROTO_ESP || sproto == IPPROTO_AH || sproto == IPPROTO_IPCOMP, ("unexpected security protocol %u", sproto)); if ((sproto == IPPROTO_ESP && !V_esp_enable) || (sproto == IPPROTO_AH && !V_ah_enable) || (sproto == IPPROTO_IPCOMP && !V_ipcomp_enable)) { m_freem(m); IPSEC_ISTAT(sproto, pdrops); return EOPNOTSUPP; } if (m->m_pkthdr.len - skip < 2 * sizeof (u_int32_t)) { m_freem(m); IPSEC_ISTAT(sproto, hdrops); DPRINTF(("%s: packet too small\n", __func__)); return EINVAL; } /* Retrieve the SPI from the relevant IPsec header */ if (sproto == IPPROTO_ESP) m_copydata(m, skip, sizeof(u_int32_t), (caddr_t) &spi); else if (sproto == IPPROTO_AH) m_copydata(m, skip + sizeof(u_int32_t), sizeof(u_int32_t), (caddr_t) &spi); else if (sproto == IPPROTO_IPCOMP) { u_int16_t cpi; m_copydata(m, skip + sizeof(u_int16_t), sizeof(u_int16_t), (caddr_t) &cpi); spi = ntohl(htons(cpi)); } /* * Find the SA and (indirectly) call the appropriate * kernel crypto routine. The resulting mbuf chain is a valid * IP packet ready to go through input processing. */ bzero(&dst_address, sizeof (dst_address)); dst_address.sa.sa_family = af; switch (af) { #ifdef INET case AF_INET: dst_address.sin.sin_len = sizeof(struct sockaddr_in); m_copydata(m, offsetof(struct ip, ip_dst), sizeof(struct in_addr), (caddr_t) &dst_address.sin.sin_addr); #ifdef IPSEC_NAT_T /* Find the source port for NAT-T; see udp*_espdecap. */ tag = m_tag_find(m, PACKET_TAG_IPSEC_NAT_T_PORTS, NULL); if (tag != NULL) dst_address.sin.sin_port = ((u_int16_t *)(tag + 1))[1]; #endif /* IPSEC_NAT_T */ break; #endif /* INET */ #ifdef INET6 case AF_INET6: dst_address.sin6.sin6_len = sizeof(struct sockaddr_in6); m_copydata(m, offsetof(struct ip6_hdr, ip6_dst), sizeof(struct in6_addr), (caddr_t) &dst_address.sin6.sin6_addr); break; #endif /* INET6 */ default: DPRINTF(("%s: unsupported protocol family %u\n", __func__, af)); m_freem(m); IPSEC_ISTAT(sproto, nopf); return EPFNOSUPPORT; } /* NB: only pass dst since key_allocsa follows RFC2401 */ sav = KEY_ALLOCSA(&dst_address, sproto, spi); if (sav == NULL) { DPRINTF(("%s: no key association found for SA %s/%08lx/%u\n", __func__, ipsec_address(&dst_address), (u_long) ntohl(spi), sproto)); IPSEC_ISTAT(sproto, notdb); m_freem(m); return ENOENT; } if (sav->tdb_xform == NULL) { DPRINTF(("%s: attempted to use uninitialized SA %s/%08lx/%u\n", __func__, ipsec_address(&dst_address), (u_long) ntohl(spi), sproto)); IPSEC_ISTAT(sproto, noxform); KEY_FREESAV(&sav); m_freem(m); return ENXIO; } /* * Call appropriate transform and return -- callback takes care of * everything else. */ error = (*sav->tdb_xform->xf_input)(m, sav, skip, protoff); KEY_FREESAV(&sav); return error; } #ifdef INET /* * Common input handler for IPv4 AH, ESP, and IPCOMP. */ int ipsec4_common_input(struct mbuf *m, ...) { va_list ap; int off, nxt; va_start(ap, m); off = va_arg(ap, int); nxt = va_arg(ap, int); va_end(ap); return ipsec_common_input(m, off, offsetof(struct ip, ip_p), AF_INET, nxt); } int ah4_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m; int off; m = *mp; off = *offp; *mp = NULL; ipsec4_common_input(m, off, IPPROTO_AH); return (IPPROTO_DONE); } void ah4_ctlinput(int cmd, struct sockaddr *sa, void *v) { if (sa->sa_family == AF_INET && sa->sa_len == sizeof(struct sockaddr_in)) ipsec4_common_ctlinput(cmd, sa, v, IPPROTO_AH); } int esp4_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m; int off; m = *mp; off = *offp; mp = NULL; ipsec4_common_input(m, off, IPPROTO_ESP); return (IPPROTO_DONE); } void esp4_ctlinput(int cmd, struct sockaddr *sa, void *v) { if (sa->sa_family == AF_INET && sa->sa_len == sizeof(struct sockaddr_in)) ipsec4_common_ctlinput(cmd, sa, v, IPPROTO_ESP); } int ipcomp4_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m; int off; m = *mp; off = *offp; mp = NULL; ipsec4_common_input(m, off, IPPROTO_IPCOMP); return (IPPROTO_DONE); } /* * IPsec input callback for INET protocols. * This routine is called as the transform callback. * Takes care of filtering and other sanity checks on * the processed packet. */ int ipsec4_common_input_cb(struct mbuf *m, struct secasvar *sav, int skip, int protoff, struct m_tag *mt) { int prot, af, sproto, isr_prot; struct ip *ip; struct m_tag *mtag; struct tdb_ident *tdbi; struct secasindex *saidx; int error; #ifdef INET6 #ifdef notyet char ip6buf[INET6_ADDRSTRLEN]; #endif #endif IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(sav != NULL, ("null SA")); IPSEC_ASSERT(sav->sah != NULL, ("null SAH")); saidx = &sav->sah->saidx; af = saidx->dst.sa.sa_family; IPSEC_ASSERT(af == AF_INET, ("unexpected af %u", af)); sproto = saidx->proto; IPSEC_ASSERT(sproto == IPPROTO_ESP || sproto == IPPROTO_AH || sproto == IPPROTO_IPCOMP, ("unexpected security protocol %u", sproto)); /* Sanity check */ if (m == NULL) { DPRINTF(("%s: null mbuf", __func__)); IPSEC_ISTAT(sproto, badkcr); KEY_FREESAV(&sav); return EINVAL; } if (skip != 0) { /* * Fix IPv4 header * XXXGL: do we need this entire block? */ if (m->m_len < skip && (m = m_pullup(m, skip)) == NULL) { DPRINTF(("%s: processing failed for SA %s/%08lx\n", __func__, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); IPSEC_ISTAT(sproto, hdrops); error = ENOBUFS; goto bad; } ip = mtod(m, struct ip *); ip->ip_len = htons(m->m_pkthdr.len); ip->ip_sum = 0; ip->ip_sum = in_cksum(m, ip->ip_hl << 2); } else { ip = mtod(m, struct ip *); } prot = ip->ip_p; #ifdef DEV_ENC if_inc_counter(encif, IFCOUNTER_IPACKETS, 1); if_inc_counter(encif, IFCOUNTER_IBYTES, m->m_pkthdr.len); - /* - * Pass the mbuf to enc0 for bpf and pfil. We will filter the IPIP - * packet later after it has been decapsulated. - */ + /* Pass the mbuf to enc0 for bpf and pfil. */ ipsec_bpf(m, sav, AF_INET, ENC_IN|ENC_BEFORE); - - if (prot != IPPROTO_IPIP) - if ((error = ipsec_filter(&m, PFIL_IN, ENC_IN|ENC_BEFORE)) != 0) - return (error); + if ((error = ipsec_filter(&m, PFIL_IN, ENC_IN|ENC_BEFORE)) != 0) + return (error); #endif /* DEV_ENC */ /* IP-in-IP encapsulation */ if (prot == IPPROTO_IPIP && saidx->mode != IPSEC_MODE_TRANSPORT) { if (m->m_pkthdr.len - skip < sizeof(struct ip)) { IPSEC_ISTAT(sproto, hdrops); error = EINVAL; goto bad; } /* enc0: strip outer IPv4 header */ m_striphdr(m, 0, ip->ip_hl << 2); #ifdef notyet /* XXX PROXY address isn't recorded in SAH */ /* * Check that the inner source address is the same as * the proxy address, if available. */ if ((saidx->proxy.sa.sa_family == AF_INET && saidx->proxy.sin.sin_addr.s_addr != INADDR_ANY && ipn.ip_src.s_addr != saidx->proxy.sin.sin_addr.s_addr) || (saidx->proxy.sa.sa_family != AF_INET && saidx->proxy.sa.sa_family != 0)) { DPRINTF(("%s: inner source address %s doesn't " "correspond to expected proxy source %s, " "SA %s/%08lx\n", __func__, inet_ntoa4(ipn.ip_src), ipsp_address(saidx->proxy), ipsp_address(saidx->dst), (u_long) ntohl(sav->spi))); IPSEC_ISTAT(sproto, pdrops); error = EACCES; goto bad; } #endif /* notyet */ } #ifdef INET6 /* IPv6-in-IP encapsulation. */ else if (prot == IPPROTO_IPV6 && saidx->mode != IPSEC_MODE_TRANSPORT) { if (m->m_pkthdr.len - skip < sizeof(struct ip6_hdr)) { IPSEC_ISTAT(sproto, hdrops); error = EINVAL; goto bad; } /* enc0: strip IPv4 header, keep IPv6 header only */ m_striphdr(m, 0, ip->ip_hl << 2); #ifdef notyet /* * Check that the inner source address is the same as * the proxy address, if available. */ if ((saidx->proxy.sa.sa_family == AF_INET6 && !IN6_IS_ADDR_UNSPECIFIED(&saidx->proxy.sin6.sin6_addr) && !IN6_ARE_ADDR_EQUAL(&ip6n.ip6_src, &saidx->proxy.sin6.sin6_addr)) || (saidx->proxy.sa.sa_family != AF_INET6 && saidx->proxy.sa.sa_family != 0)) { DPRINTF(("%s: inner source address %s doesn't " "correspond to expected proxy source %s, " "SA %s/%08lx\n", __func__, ip6_sprintf(ip6buf, &ip6n.ip6_src), ipsec_address(&saidx->proxy), ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi))); IPSEC_ISTAT(sproto, pdrops); error = EACCES; goto bad; } #endif /* notyet */ } #endif /* INET6 */ else if (prot != IPPROTO_IPV6 && saidx->mode == IPSEC_MODE_ANY) { /* * When mode is wildcard, inner protocol is IPv6 and * we have no INET6 support - drop this packet a bit later. * In other cases we assume transport mode and outer * header was already stripped in xform_xxx_cb. */ prot = IPPROTO_IPIP; } /* * Record what we've done to the packet (under what SA it was * processed). If we've been passed an mtag, it means the packet * was already processed by an ethernet/crypto combo card and * thus has a tag attached with all the right information, but * with a PACKET_TAG_IPSEC_IN_CRYPTO_DONE as opposed to * PACKET_TAG_IPSEC_IN_DONE type; in that case, just change the type. */ if (mt == NULL && sproto != IPPROTO_IPCOMP) { mtag = m_tag_get(PACKET_TAG_IPSEC_IN_DONE, sizeof(struct tdb_ident), M_NOWAIT); if (mtag == NULL) { DPRINTF(("%s: failed to get tag\n", __func__)); IPSEC_ISTAT(sproto, hdrops); error = ENOMEM; goto bad; } tdbi = (struct tdb_ident *)(mtag + 1); bcopy(&saidx->dst, &tdbi->dst, saidx->dst.sa.sa_len); tdbi->proto = sproto; tdbi->spi = sav->spi; /* Cache those two for enc(4) in xform_ipip. */ tdbi->alg_auth = sav->alg_auth; tdbi->alg_enc = sav->alg_enc; m_tag_prepend(m, mtag); } else if (mt != NULL) { mt->m_tag_id = PACKET_TAG_IPSEC_IN_DONE; /* XXX do we need to mark m_flags??? */ } key_sa_recordxfer(sav, m); /* record data transfer */ /* * In transport mode requeue decrypted mbuf back to IPv4 protocol * handler. This is necessary to correctly expose rcvif. */ if (saidx->mode == IPSEC_MODE_TRANSPORT) prot = IPPROTO_IPIP; #ifdef DEV_ENC /* * Pass the mbuf to enc0 for bpf and pfil. */ if (prot == IPPROTO_IPIP) ipsec_bpf(m, sav, AF_INET, ENC_IN|ENC_AFTER); #ifdef INET6 if (prot == IPPROTO_IPV6) ipsec_bpf(m, sav, AF_INET6, ENC_IN|ENC_AFTER); #endif if ((error = ipsec_filter(&m, PFIL_IN, ENC_IN|ENC_AFTER)) != 0) return (error); #endif /* DEV_ENC */ /* * Re-dispatch via software interrupt. */ switch (prot) { case IPPROTO_IPIP: isr_prot = NETISR_IP; break; #ifdef INET6 case IPPROTO_IPV6: isr_prot = NETISR_IPV6; break; #endif default: DPRINTF(("%s: cannot handle inner ip proto %d\n", __func__, prot)); IPSEC_ISTAT(sproto, nopf); error = EPFNOSUPPORT; goto bad; } error = netisr_queue_src(isr_prot, (uintptr_t)sav->spi, m); if (error) { IPSEC_ISTAT(sproto, qfull); DPRINTF(("%s: queue full; proto %u packet dropped\n", __func__, sproto)); return error; } return 0; bad: m_freem(m); return error; } void ipsec4_common_ctlinput(int cmd, struct sockaddr *sa, void *v, int proto) { /* XXX nothing just yet */ } #endif /* INET */ #ifdef INET6 /* IPv6 AH wrapper. */ int ipsec6_common_input(struct mbuf **mp, int *offp, int proto) { int l = 0; int protoff; struct ip6_ext ip6e; if (*offp < sizeof(struct ip6_hdr)) { DPRINTF(("%s: bad offset %u\n", __func__, *offp)); return IPPROTO_DONE; } else if (*offp == sizeof(struct ip6_hdr)) { protoff = offsetof(struct ip6_hdr, ip6_nxt); } else { /* Chase down the header chain... */ protoff = sizeof(struct ip6_hdr); do { protoff += l; m_copydata(*mp, protoff, sizeof(ip6e), (caddr_t) &ip6e); if (ip6e.ip6e_nxt == IPPROTO_AH) l = (ip6e.ip6e_len + 2) << 2; else l = (ip6e.ip6e_len + 1) << 3; IPSEC_ASSERT(l > 0, ("l went zero or negative")); } while (protoff + l < *offp); /* Malformed packet check */ if (protoff + l != *offp) { DPRINTF(("%s: bad packet header chain, protoff %u, " "l %u, off %u\n", __func__, protoff, l, *offp)); IPSEC_ISTAT(proto, hdrops); m_freem(*mp); *mp = NULL; return IPPROTO_DONE; } protoff += offsetof(struct ip6_ext, ip6e_nxt); } (void) ipsec_common_input(*mp, *offp, protoff, AF_INET6, proto); return IPPROTO_DONE; } /* * IPsec input callback, called by the transform callback. Takes care of * filtering and other sanity checks on the processed packet. */ int ipsec6_common_input_cb(struct mbuf *m, struct secasvar *sav, int skip, int protoff, struct m_tag *mt) { int prot, af, sproto; struct ip6_hdr *ip6; struct m_tag *mtag; struct tdb_ident *tdbi; struct secasindex *saidx; int nxt; u_int8_t nxt8; int error, nest; #ifdef notyet char ip6buf[INET6_ADDRSTRLEN]; #endif IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(sav != NULL, ("null SA")); IPSEC_ASSERT(sav->sah != NULL, ("null SAH")); saidx = &sav->sah->saidx; af = saidx->dst.sa.sa_family; IPSEC_ASSERT(af == AF_INET6, ("unexpected af %u", af)); sproto = saidx->proto; IPSEC_ASSERT(sproto == IPPROTO_ESP || sproto == IPPROTO_AH || sproto == IPPROTO_IPCOMP, ("unexpected security protocol %u", sproto)); /* Sanity check */ if (m == NULL) { DPRINTF(("%s: null mbuf", __func__)); IPSEC_ISTAT(sproto, badkcr); error = EINVAL; goto bad; } /* Fix IPv6 header */ if (m->m_len < sizeof(struct ip6_hdr) && (m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) { DPRINTF(("%s: processing failed for SA %s/%08lx\n", __func__, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); IPSEC_ISTAT(sproto, hdrops); error = EACCES; goto bad; } ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(struct ip6_hdr)); /* Save protocol */ prot = 0; m_copydata(m, protoff, 1, (unsigned char *) &prot); #ifdef DEV_ENC if_inc_counter(encif, IFCOUNTER_IPACKETS, 1); if_inc_counter(encif, IFCOUNTER_IBYTES, m->m_pkthdr.len); - /* - * Pass the mbuf to enc0 for bpf and pfil. We will filter the IPIP - * packet later after it has been decapsulated. - */ + /* Pass the mbuf to enc0 for bpf and pfil. */ ipsec_bpf(m, sav, AF_INET6, ENC_IN|ENC_BEFORE); - - /* XXX-BZ does not make sense. */ - if (prot != IPPROTO_IPIP) - if ((error = ipsec_filter(&m, PFIL_IN, ENC_IN|ENC_BEFORE)) != 0) - return (error); + if ((error = ipsec_filter(&m, PFIL_IN, ENC_IN|ENC_BEFORE)) != 0) + return (error); #endif /* DEV_ENC */ #ifdef INET /* IP-in-IP encapsulation */ if (prot == IPPROTO_IPIP) { if (m->m_pkthdr.len - skip < sizeof(struct ip)) { IPSEC_ISTAT(sproto, hdrops); error = EINVAL; goto bad; } /* ipn will now contain the inner IPv4 header */ m_striphdr(m, 0, skip); skip = 0; #ifdef notyet /* * Check that the inner source address is the same as * the proxy address, if available. */ if ((saidx->proxy.sa.sa_family == AF_INET && saidx->proxy.sin.sin_addr.s_addr != INADDR_ANY && ipn.ip_src.s_addr != saidx->proxy.sin.sin_addr.s_addr) || (saidx->proxy.sa.sa_family != AF_INET && saidx->proxy.sa.sa_family != 0)) { DPRINTF(("%s: inner source address %s doesn't " "correspond to expected proxy source %s, " "SA %s/%08lx\n", __func__, inet_ntoa4(ipn.ip_src), ipsec_address(&saidx->proxy), ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi))); IPSEC_ISTAT(sproto, pdrops); error = EACCES; goto bad; } #endif /* notyet */ } #endif /* INET */ /* IPv6-in-IP encapsulation */ if (prot == IPPROTO_IPV6) { if (m->m_pkthdr.len - skip < sizeof(struct ip6_hdr)) { IPSEC_ISTAT(sproto, hdrops); error = EINVAL; goto bad; } /* ip6n will now contain the inner IPv6 header. */ m_striphdr(m, 0, skip); skip = 0; #ifdef notyet /* * Check that the inner source address is the same as * the proxy address, if available. */ if ((saidx->proxy.sa.sa_family == AF_INET6 && !IN6_IS_ADDR_UNSPECIFIED(&saidx->proxy.sin6.sin6_addr) && !IN6_ARE_ADDR_EQUAL(&ip6n.ip6_src, &saidx->proxy.sin6.sin6_addr)) || (saidx->proxy.sa.sa_family != AF_INET6 && saidx->proxy.sa.sa_family != 0)) { DPRINTF(("%s: inner source address %s doesn't " "correspond to expected proxy source %s, " "SA %s/%08lx\n", __func__, ip6_sprintf(ip6buf, &ip6n.ip6_src), ipsec_address(&saidx->proxy), ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi))); IPSEC_ISTAT(sproto, pdrops); error = EACCES; goto bad; } #endif /* notyet */ } /* * Record what we've done to the packet (under what SA it was * processed). If we've been passed an mtag, it means the packet * was already processed by an ethernet/crypto combo card and * thus has a tag attached with all the right information, but * with a PACKET_TAG_IPSEC_IN_CRYPTO_DONE as opposed to * PACKET_TAG_IPSEC_IN_DONE type; in that case, just change the type. */ if (mt == NULL && sproto != IPPROTO_IPCOMP) { mtag = m_tag_get(PACKET_TAG_IPSEC_IN_DONE, sizeof(struct tdb_ident), M_NOWAIT); if (mtag == NULL) { DPRINTF(("%s: failed to get tag\n", __func__)); IPSEC_ISTAT(sproto, hdrops); error = ENOMEM; goto bad; } tdbi = (struct tdb_ident *)(mtag + 1); bcopy(&saidx->dst, &tdbi->dst, sizeof(union sockaddr_union)); tdbi->proto = sproto; tdbi->spi = sav->spi; /* Cache those two for enc(4) in xform_ipip. */ tdbi->alg_auth = sav->alg_auth; tdbi->alg_enc = sav->alg_enc; m_tag_prepend(m, mtag); } else { if (mt != NULL) mt->m_tag_id = PACKET_TAG_IPSEC_IN_DONE; /* XXX do we need to mark m_flags??? */ } key_sa_recordxfer(sav, m); #ifdef DEV_ENC /* * Pass the mbuf to enc0 for bpf and pfil. */ #ifdef INET if (prot == IPPROTO_IPIP) ipsec_bpf(m, sav, AF_INET, ENC_IN|ENC_AFTER); #endif if (prot == IPPROTO_IPV6) ipsec_bpf(m, sav, AF_INET6, ENC_IN|ENC_AFTER); if ((error = ipsec_filter(&m, PFIL_IN, ENC_IN|ENC_AFTER)) != 0) return (error); #endif /* DEV_ENC */ /* Retrieve new protocol */ /* We have stripped the IP6 header from the mbuf, we have to use the backuped proto value instead */ nxt8 = prot; /* * See the end of ip6_input for this logic. * IPPROTO_IPV[46] case will be processed just like other ones */ nest = 0; nxt = nxt8; while (nxt != IPPROTO_DONE) { if (V_ip6_hdrnestlimit && (++nest > V_ip6_hdrnestlimit)) { IP6STAT_INC(ip6s_toomanyhdr); error = EINVAL; goto bad; } /* * Protection against faulty packet - there should be * more sanity checks in header chain processing. */ if (m->m_pkthdr.len < skip) { IP6STAT_INC(ip6s_tooshort); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); error = EINVAL; goto bad; } /* * Enforce IPsec policy checking if we are seeing last header. * note that we do not visit this with protocols with pcb layer * code - like udp/tcp/raw ip. */ if ((inet6sw[ip6_protox[nxt]].pr_flags & PR_LASTHDR) != 0 && ipsec6_in_reject(m, NULL)) { error = EINVAL; goto bad; } nxt = (*inet6sw[ip6_protox[nxt]].pr_input)(&m, &skip, nxt); } return 0; bad: if (m) m_freem(m); return error; } void esp6_ctlinput(int cmd, struct sockaddr *sa, void *d) { struct ip6ctlparam *ip6cp = NULL; struct mbuf *m = NULL; struct ip6_hdr *ip6; int off; if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return; if ((unsigned)cmd >= PRC_NCMDS) return; /* if the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; } else { m = NULL; ip6 = NULL; off = 0; /* calm gcc */ } if (ip6 != NULL) { struct ip6ctlparam ip6cp1; /* * Notify the error to all possible sockets via pfctlinput2. * Since the upper layer information (such as protocol type, * source and destination ports) is embedded in the encrypted * data and might have been cut, we can't directly call * an upper layer ctlinput function. However, the pcbnotify * function will consider source and destination addresses * as well as the flow info value, and may be able to find * some PCB that should be notified. * Although pfctlinput2 will call esp6_ctlinput(), there is * no possibility of an infinite loop of function calls, * because we don't pass the inner IPv6 header. */ bzero(&ip6cp1, sizeof(ip6cp1)); ip6cp1.ip6c_src = ip6cp->ip6c_src; pfctlinput2(cmd, sa, (void *)&ip6cp1); /* * Then go to special cases that need ESP header information. * XXX: We assume that when ip6 is non NULL, * M and OFF are valid. */ if (cmd == PRC_MSGSIZE) { struct secasvar *sav; u_int32_t spi; int valid; /* check header length before using m_copydata */ if (m->m_pkthdr.len < off + sizeof (struct esp)) return; m_copydata(m, off + offsetof(struct esp, esp_spi), sizeof(u_int32_t), (caddr_t) &spi); /* * Check to see if we have a valid SA corresponding to * the address in the ICMP message payload. */ sav = KEY_ALLOCSA((union sockaddr_union *)sa, IPPROTO_ESP, spi); valid = (sav != NULL); if (sav) KEY_FREESAV(&sav); /* XXX Further validation? */ /* * Depending on whether the SA is "valid" and * routing table size (mtudisc_{hi,lo}wat), we will: * - recalcurate the new MTU and create the * corresponding routing entry, or * - ignore the MTU change notification. */ icmp6_mtudisc_update(ip6cp, valid); } } else { /* we normally notify any pcb here */ } } #endif /* INET6 */ Index: user/dchagin/lemul/sys/netipsec/key.c =================================================================== --- user/dchagin/lemul/sys/netipsec/key.c (revision 274240) +++ user/dchagin/lemul/sys/netipsec/key.c (revision 274241) @@ -1,8024 +1,8024 @@ /* $FreeBSD$ */ /* $KAME: key.c,v 1.191 2001/06/27 10:46:49 sakane Exp $ */ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This code is referd to RFC 2367 */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #endif /* INET6 */ #if defined(INET) || defined(INET6) #include #endif #ifdef INET6 #include #endif /* INET6 */ #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include /* randomness */ #include #define FULLMASK 0xff #define _BITS(bytes) ((bytes) << 3) /* * Note on SA reference counting: * - SAs that are not in DEAD state will have (total external reference + 1) * following value in reference count field. they cannot be freed and are * referenced from SA header. * - SAs that are in DEAD state will have (total external reference) * in reference count field. they are ready to be freed. reference from * SA header will be removed in key_delsav(), when the reference count * field hits 0 (= no external reference other than from SA header. */ VNET_DEFINE(u_int32_t, key_debug_level) = 0; static VNET_DEFINE(u_int, key_spi_trycnt) = 1000; static VNET_DEFINE(u_int32_t, key_spi_minval) = 0x100; static VNET_DEFINE(u_int32_t, key_spi_maxval) = 0x0fffffff; /* XXX */ static VNET_DEFINE(u_int32_t, policy_id) = 0; /*interval to initialize randseed,1(m)*/ static VNET_DEFINE(u_int, key_int_random) = 60; /* interval to expire acquiring, 30(s)*/ static VNET_DEFINE(u_int, key_larval_lifetime) = 30; /* counter for blocking SADB_ACQUIRE.*/ static VNET_DEFINE(int, key_blockacq_count) = 10; /* lifetime for blocking SADB_ACQUIRE.*/ static VNET_DEFINE(int, key_blockacq_lifetime) = 20; /* preferred old sa rather than new sa.*/ static VNET_DEFINE(int, key_preferred_oldsa) = 1; #define V_key_spi_trycnt VNET(key_spi_trycnt) #define V_key_spi_minval VNET(key_spi_minval) #define V_key_spi_maxval VNET(key_spi_maxval) #define V_policy_id VNET(policy_id) #define V_key_int_random VNET(key_int_random) #define V_key_larval_lifetime VNET(key_larval_lifetime) #define V_key_blockacq_count VNET(key_blockacq_count) #define V_key_blockacq_lifetime VNET(key_blockacq_lifetime) #define V_key_preferred_oldsa VNET(key_preferred_oldsa) static VNET_DEFINE(u_int32_t, acq_seq) = 0; #define V_acq_seq VNET(acq_seq) /* SPD */ static VNET_DEFINE(LIST_HEAD(_sptree, secpolicy), sptree[IPSEC_DIR_MAX]); #define V_sptree VNET(sptree) static struct mtx sptree_lock; #define SPTREE_LOCK_INIT() \ mtx_init(&sptree_lock, "sptree", \ "fast ipsec security policy database", MTX_DEF) #define SPTREE_LOCK_DESTROY() mtx_destroy(&sptree_lock) #define SPTREE_LOCK() mtx_lock(&sptree_lock) #define SPTREE_UNLOCK() mtx_unlock(&sptree_lock) #define SPTREE_LOCK_ASSERT() mtx_assert(&sptree_lock, MA_OWNED) static VNET_DEFINE(LIST_HEAD(_sahtree, secashead), sahtree); /* SAD */ #define V_sahtree VNET(sahtree) static struct mtx sahtree_lock; #define SAHTREE_LOCK_INIT() \ mtx_init(&sahtree_lock, "sahtree", \ "fast ipsec security association database", MTX_DEF) #define SAHTREE_LOCK_DESTROY() mtx_destroy(&sahtree_lock) #define SAHTREE_LOCK() mtx_lock(&sahtree_lock) #define SAHTREE_UNLOCK() mtx_unlock(&sahtree_lock) #define SAHTREE_LOCK_ASSERT() mtx_assert(&sahtree_lock, MA_OWNED) /* registed list */ static VNET_DEFINE(LIST_HEAD(_regtree, secreg), regtree[SADB_SATYPE_MAX + 1]); #define V_regtree VNET(regtree) static struct mtx regtree_lock; #define REGTREE_LOCK_INIT() \ mtx_init(®tree_lock, "regtree", "fast ipsec regtree", MTX_DEF) #define REGTREE_LOCK_DESTROY() mtx_destroy(®tree_lock) #define REGTREE_LOCK() mtx_lock(®tree_lock) #define REGTREE_UNLOCK() mtx_unlock(®tree_lock) #define REGTREE_LOCK_ASSERT() mtx_assert(®tree_lock, MA_OWNED) static VNET_DEFINE(LIST_HEAD(_acqtree, secacq), acqtree); /* acquiring list */ #define V_acqtree VNET(acqtree) static struct mtx acq_lock; #define ACQ_LOCK_INIT() \ mtx_init(&acq_lock, "acqtree", "fast ipsec acquire list", MTX_DEF) #define ACQ_LOCK_DESTROY() mtx_destroy(&acq_lock) #define ACQ_LOCK() mtx_lock(&acq_lock) #define ACQ_UNLOCK() mtx_unlock(&acq_lock) #define ACQ_LOCK_ASSERT() mtx_assert(&acq_lock, MA_OWNED) /* SP acquiring list */ static VNET_DEFINE(LIST_HEAD(_spacqtree, secspacq), spacqtree); #define V_spacqtree VNET(spacqtree) static struct mtx spacq_lock; #define SPACQ_LOCK_INIT() \ mtx_init(&spacq_lock, "spacqtree", \ "fast ipsec security policy acquire list", MTX_DEF) #define SPACQ_LOCK_DESTROY() mtx_destroy(&spacq_lock) #define SPACQ_LOCK() mtx_lock(&spacq_lock) #define SPACQ_UNLOCK() mtx_unlock(&spacq_lock) #define SPACQ_LOCK_ASSERT() mtx_assert(&spacq_lock, MA_OWNED) /* search order for SAs */ static const u_int saorder_state_valid_prefer_old[] = { SADB_SASTATE_DYING, SADB_SASTATE_MATURE, }; static const u_int saorder_state_valid_prefer_new[] = { SADB_SASTATE_MATURE, SADB_SASTATE_DYING, }; static const u_int saorder_state_alive[] = { /* except DEAD */ SADB_SASTATE_MATURE, SADB_SASTATE_DYING, SADB_SASTATE_LARVAL }; static const u_int saorder_state_any[] = { SADB_SASTATE_MATURE, SADB_SASTATE_DYING, SADB_SASTATE_LARVAL, SADB_SASTATE_DEAD }; static const int minsize[] = { sizeof(struct sadb_msg), /* SADB_EXT_RESERVED */ sizeof(struct sadb_sa), /* SADB_EXT_SA */ sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_CURRENT */ sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_HARD */ sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_SOFT */ sizeof(struct sadb_address), /* SADB_EXT_ADDRESS_SRC */ sizeof(struct sadb_address), /* SADB_EXT_ADDRESS_DST */ sizeof(struct sadb_address), /* SADB_EXT_ADDRESS_PROXY */ sizeof(struct sadb_key), /* SADB_EXT_KEY_AUTH */ sizeof(struct sadb_key), /* SADB_EXT_KEY_ENCRYPT */ sizeof(struct sadb_ident), /* SADB_EXT_IDENTITY_SRC */ sizeof(struct sadb_ident), /* SADB_EXT_IDENTITY_DST */ sizeof(struct sadb_sens), /* SADB_EXT_SENSITIVITY */ sizeof(struct sadb_prop), /* SADB_EXT_PROPOSAL */ sizeof(struct sadb_supported), /* SADB_EXT_SUPPORTED_AUTH */ sizeof(struct sadb_supported), /* SADB_EXT_SUPPORTED_ENCRYPT */ sizeof(struct sadb_spirange), /* SADB_EXT_SPIRANGE */ 0, /* SADB_X_EXT_KMPRIVATE */ sizeof(struct sadb_x_policy), /* SADB_X_EXT_POLICY */ sizeof(struct sadb_x_sa2), /* SADB_X_SA2 */ sizeof(struct sadb_x_nat_t_type),/* SADB_X_EXT_NAT_T_TYPE */ sizeof(struct sadb_x_nat_t_port),/* SADB_X_EXT_NAT_T_SPORT */ sizeof(struct sadb_x_nat_t_port),/* SADB_X_EXT_NAT_T_DPORT */ sizeof(struct sadb_address), /* SADB_X_EXT_NAT_T_OAI */ sizeof(struct sadb_address), /* SADB_X_EXT_NAT_T_OAR */ sizeof(struct sadb_x_nat_t_frag),/* SADB_X_EXT_NAT_T_FRAG */ }; static const int maxsize[] = { sizeof(struct sadb_msg), /* SADB_EXT_RESERVED */ sizeof(struct sadb_sa), /* SADB_EXT_SA */ sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_CURRENT */ sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_HARD */ sizeof(struct sadb_lifetime), /* SADB_EXT_LIFETIME_SOFT */ 0, /* SADB_EXT_ADDRESS_SRC */ 0, /* SADB_EXT_ADDRESS_DST */ 0, /* SADB_EXT_ADDRESS_PROXY */ 0, /* SADB_EXT_KEY_AUTH */ 0, /* SADB_EXT_KEY_ENCRYPT */ 0, /* SADB_EXT_IDENTITY_SRC */ 0, /* SADB_EXT_IDENTITY_DST */ 0, /* SADB_EXT_SENSITIVITY */ 0, /* SADB_EXT_PROPOSAL */ 0, /* SADB_EXT_SUPPORTED_AUTH */ 0, /* SADB_EXT_SUPPORTED_ENCRYPT */ sizeof(struct sadb_spirange), /* SADB_EXT_SPIRANGE */ 0, /* SADB_X_EXT_KMPRIVATE */ 0, /* SADB_X_EXT_POLICY */ sizeof(struct sadb_x_sa2), /* SADB_X_SA2 */ sizeof(struct sadb_x_nat_t_type),/* SADB_X_EXT_NAT_T_TYPE */ sizeof(struct sadb_x_nat_t_port),/* SADB_X_EXT_NAT_T_SPORT */ sizeof(struct sadb_x_nat_t_port),/* SADB_X_EXT_NAT_T_DPORT */ 0, /* SADB_X_EXT_NAT_T_OAI */ 0, /* SADB_X_EXT_NAT_T_OAR */ sizeof(struct sadb_x_nat_t_frag),/* SADB_X_EXT_NAT_T_FRAG */ }; static VNET_DEFINE(int, ipsec_esp_keymin) = 256; static VNET_DEFINE(int, ipsec_esp_auth) = 0; static VNET_DEFINE(int, ipsec_ah_keymin) = 128; #define V_ipsec_esp_keymin VNET(ipsec_esp_keymin) #define V_ipsec_esp_auth VNET(ipsec_esp_auth) #define V_ipsec_ah_keymin VNET(ipsec_ah_keymin) #ifdef SYSCTL_DECL SYSCTL_DECL(_net_key); #endif -SYSCTL_VNET_INT(_net_key, KEYCTL_DEBUG_LEVEL, debug, - CTLFLAG_RW, &VNET_NAME(key_debug_level), 0, ""); +SYSCTL_INT(_net_key, KEYCTL_DEBUG_LEVEL, debug, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_debug_level), 0, ""); /* max count of trial for the decision of spi value */ -SYSCTL_VNET_INT(_net_key, KEYCTL_SPI_TRY, spi_trycnt, - CTLFLAG_RW, &VNET_NAME(key_spi_trycnt), 0, ""); +SYSCTL_INT(_net_key, KEYCTL_SPI_TRY, spi_trycnt, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_spi_trycnt), 0, ""); /* minimum spi value to allocate automatically. */ -SYSCTL_VNET_INT(_net_key, KEYCTL_SPI_MIN_VALUE, - spi_minval, CTLFLAG_RW, &VNET_NAME(key_spi_minval), 0, ""); +SYSCTL_INT(_net_key, KEYCTL_SPI_MIN_VALUE, spi_minval, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_spi_minval), 0, ""); /* maximun spi value to allocate automatically. */ -SYSCTL_VNET_INT(_net_key, KEYCTL_SPI_MAX_VALUE, - spi_maxval, CTLFLAG_RW, &VNET_NAME(key_spi_maxval), 0, ""); +SYSCTL_INT(_net_key, KEYCTL_SPI_MAX_VALUE, spi_maxval, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_spi_maxval), 0, ""); /* interval to initialize randseed */ -SYSCTL_VNET_INT(_net_key, KEYCTL_RANDOM_INT, - int_random, CTLFLAG_RW, &VNET_NAME(key_int_random), 0, ""); +SYSCTL_INT(_net_key, KEYCTL_RANDOM_INT, int_random, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_int_random), 0, ""); /* lifetime for larval SA */ -SYSCTL_VNET_INT(_net_key, KEYCTL_LARVAL_LIFETIME, - larval_lifetime, CTLFLAG_RW, &VNET_NAME(key_larval_lifetime), 0, ""); +SYSCTL_INT(_net_key, KEYCTL_LARVAL_LIFETIME, larval_lifetime, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_larval_lifetime), 0, ""); /* counter for blocking to send SADB_ACQUIRE to IKEd */ -SYSCTL_VNET_INT(_net_key, KEYCTL_BLOCKACQ_COUNT, - blockacq_count, CTLFLAG_RW, &VNET_NAME(key_blockacq_count), 0, ""); +SYSCTL_INT(_net_key, KEYCTL_BLOCKACQ_COUNT, blockacq_count, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_blockacq_count), 0, ""); /* lifetime for blocking to send SADB_ACQUIRE to IKEd */ -SYSCTL_VNET_INT(_net_key, KEYCTL_BLOCKACQ_LIFETIME, - blockacq_lifetime, CTLFLAG_RW, &VNET_NAME(key_blockacq_lifetime), 0, ""); +SYSCTL_INT(_net_key, KEYCTL_BLOCKACQ_LIFETIME, blockacq_lifetime, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_blockacq_lifetime), 0, ""); /* ESP auth */ -SYSCTL_VNET_INT(_net_key, KEYCTL_ESP_AUTH, esp_auth, - CTLFLAG_RW, &VNET_NAME(ipsec_esp_auth), 0, ""); +SYSCTL_INT(_net_key, KEYCTL_ESP_AUTH, esp_auth, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsec_esp_auth), 0, ""); /* minimum ESP key length */ -SYSCTL_VNET_INT(_net_key, KEYCTL_ESP_KEYMIN, - esp_keymin, CTLFLAG_RW, &VNET_NAME(ipsec_esp_keymin), 0, ""); +SYSCTL_INT(_net_key, KEYCTL_ESP_KEYMIN, esp_keymin, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsec_esp_keymin), 0, ""); /* minimum AH key length */ -SYSCTL_VNET_INT(_net_key, KEYCTL_AH_KEYMIN, ah_keymin, - CTLFLAG_RW, &VNET_NAME(ipsec_ah_keymin), 0, ""); +SYSCTL_INT(_net_key, KEYCTL_AH_KEYMIN, ah_keymin, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsec_ah_keymin), 0, ""); /* perfered old SA rather than new SA */ -SYSCTL_VNET_INT(_net_key, KEYCTL_PREFERED_OLDSA, - preferred_oldsa, CTLFLAG_RW, &VNET_NAME(key_preferred_oldsa), 0, ""); +SYSCTL_INT(_net_key, KEYCTL_PREFERED_OLDSA, preferred_oldsa, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(key_preferred_oldsa), 0, ""); #define __LIST_CHAINED(elm) \ (!((elm)->chain.le_next == NULL && (elm)->chain.le_prev == NULL)) #define LIST_INSERT_TAIL(head, elm, type, field) \ do {\ struct type *curelm = LIST_FIRST(head); \ if (curelm == NULL) {\ LIST_INSERT_HEAD(head, elm, field); \ } else { \ while (LIST_NEXT(curelm, field)) \ curelm = LIST_NEXT(curelm, field);\ LIST_INSERT_AFTER(curelm, elm, field);\ }\ } while (0) #define KEY_CHKSASTATE(head, sav, name) \ do { \ if ((head) != (sav)) { \ ipseclog((LOG_DEBUG, "%s: state mismatched (TREE=%d SA=%d)\n", \ (name), (head), (sav))); \ continue; \ } \ } while (0) #define KEY_CHKSPDIR(head, sp, name) \ do { \ if ((head) != (sp)) { \ ipseclog((LOG_DEBUG, "%s: direction mismatched (TREE=%d SP=%d), " \ "anyway continue.\n", \ (name), (head), (sp))); \ } \ } while (0) MALLOC_DEFINE(M_IPSEC_SA, "secasvar", "ipsec security association"); MALLOC_DEFINE(M_IPSEC_SAH, "sahead", "ipsec sa head"); MALLOC_DEFINE(M_IPSEC_SP, "ipsecpolicy", "ipsec security policy"); MALLOC_DEFINE(M_IPSEC_SR, "ipsecrequest", "ipsec security request"); MALLOC_DEFINE(M_IPSEC_MISC, "ipsec-misc", "ipsec miscellaneous"); MALLOC_DEFINE(M_IPSEC_SAQ, "ipsec-saq", "ipsec sa acquire"); MALLOC_DEFINE(M_IPSEC_SAR, "ipsec-reg", "ipsec sa acquire"); /* * set parameters into secpolicyindex buffer. * Must allocate secpolicyindex buffer passed to this function. */ #define KEY_SETSECSPIDX(_dir, s, d, ps, pd, ulp, idx) \ do { \ bzero((idx), sizeof(struct secpolicyindex)); \ (idx)->dir = (_dir); \ (idx)->prefs = (ps); \ (idx)->prefd = (pd); \ (idx)->ul_proto = (ulp); \ bcopy((s), &(idx)->src, ((const struct sockaddr *)(s))->sa_len); \ bcopy((d), &(idx)->dst, ((const struct sockaddr *)(d))->sa_len); \ } while (0) /* * set parameters into secasindex buffer. * Must allocate secasindex buffer before calling this function. */ #define KEY_SETSECASIDX(p, m, r, s, d, idx) \ do { \ bzero((idx), sizeof(struct secasindex)); \ (idx)->proto = (p); \ (idx)->mode = (m); \ (idx)->reqid = (r); \ bcopy((s), &(idx)->src, ((const struct sockaddr *)(s))->sa_len); \ bcopy((d), &(idx)->dst, ((const struct sockaddr *)(d))->sa_len); \ } while (0) /* key statistics */ struct _keystat { u_long getspi_count; /* the avarage of count to try to get new SPI */ } keystat; struct sadb_msghdr { struct sadb_msg *msg; struct sadb_ext *ext[SADB_EXT_MAX + 1]; int extoff[SADB_EXT_MAX + 1]; int extlen[SADB_EXT_MAX + 1]; }; #ifndef IPSEC_DEBUG2 static struct callout key_timer; #endif static struct secasvar *key_allocsa_policy __P((const struct secasindex *)); static void key_freesp_so __P((struct secpolicy **)); static struct secasvar *key_do_allocsa_policy __P((struct secashead *, u_int)); static void key_delsp __P((struct secpolicy *)); static struct secpolicy *key_getsp __P((struct secpolicyindex *)); static void _key_delsp(struct secpolicy *sp); static struct secpolicy *key_getspbyid __P((u_int32_t)); static u_int32_t key_newreqid __P((void)); static struct mbuf *key_gather_mbuf __P((struct mbuf *, const struct sadb_msghdr *, int, int, ...)); static int key_spdadd __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static u_int32_t key_getnewspid __P((void)); static int key_spddelete __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static int key_spddelete2 __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static int key_spdget __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static int key_spdflush __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static int key_spddump __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static struct mbuf *key_setdumpsp __P((struct secpolicy *, u_int8_t, u_int32_t, u_int32_t)); static u_int key_getspreqmsglen __P((struct secpolicy *)); static int key_spdexpire __P((struct secpolicy *)); static struct secashead *key_newsah __P((struct secasindex *)); static void key_delsah __P((struct secashead *)); static struct secasvar *key_newsav __P((struct mbuf *, const struct sadb_msghdr *, struct secashead *, int *, const char*, int)); #define KEY_NEWSAV(m, sadb, sah, e) \ key_newsav(m, sadb, sah, e, __FILE__, __LINE__) static void key_delsav __P((struct secasvar *)); static struct secashead *key_getsah __P((struct secasindex *)); static struct secasvar *key_checkspidup __P((struct secasindex *, u_int32_t)); static struct secasvar *key_getsavbyspi __P((struct secashead *, u_int32_t)); static int key_setsaval __P((struct secasvar *, struct mbuf *, const struct sadb_msghdr *)); static int key_mature __P((struct secasvar *)); static struct mbuf *key_setdumpsa __P((struct secasvar *, u_int8_t, u_int8_t, u_int32_t, u_int32_t)); static struct mbuf *key_setsadbmsg __P((u_int8_t, u_int16_t, u_int8_t, u_int32_t, pid_t, u_int16_t)); static struct mbuf *key_setsadbsa __P((struct secasvar *)); static struct mbuf *key_setsadbaddr __P((u_int16_t, const struct sockaddr *, u_int8_t, u_int16_t)); #ifdef IPSEC_NAT_T static struct mbuf *key_setsadbxport(u_int16_t, u_int16_t); static struct mbuf *key_setsadbxtype(u_int16_t); #endif static void key_porttosaddr(struct sockaddr *, u_int16_t); #define KEY_PORTTOSADDR(saddr, port) \ key_porttosaddr((struct sockaddr *)(saddr), (port)) static struct mbuf *key_setsadbxsa2 __P((u_int8_t, u_int32_t, u_int32_t)); static struct mbuf *key_setsadbxpolicy __P((u_int16_t, u_int8_t, u_int32_t)); static struct seckey *key_dup_keymsg(const struct sadb_key *, u_int, struct malloc_type *); static struct seclifetime *key_dup_lifemsg(const struct sadb_lifetime *src, struct malloc_type *type); #ifdef INET6 static int key_ismyaddr6 __P((struct sockaddr_in6 *)); #endif /* flags for key_cmpsaidx() */ #define CMP_HEAD 1 /* protocol, addresses. */ #define CMP_MODE_REQID 2 /* additionally HEAD, reqid, mode. */ #define CMP_REQID 3 /* additionally HEAD, reaid. */ #define CMP_EXACTLY 4 /* all elements. */ static int key_cmpsaidx __P((const struct secasindex *, const struct secasindex *, int)); static int key_cmpspidx_exactly __P((struct secpolicyindex *, struct secpolicyindex *)); static int key_cmpspidx_withmask __P((struct secpolicyindex *, struct secpolicyindex *)); static int key_sockaddrcmp __P((const struct sockaddr *, const struct sockaddr *, int)); static int key_bbcmp __P((const void *, const void *, u_int)); static u_int16_t key_satype2proto __P((u_int8_t)); static u_int8_t key_proto2satype __P((u_int16_t)); static int key_getspi __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static u_int32_t key_do_getnewspi __P((struct sadb_spirange *, struct secasindex *)); static int key_update __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); #ifdef IPSEC_DOSEQCHECK static struct secasvar *key_getsavbyseq __P((struct secashead *, u_int32_t)); #endif static int key_add __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static int key_setident __P((struct secashead *, struct mbuf *, const struct sadb_msghdr *)); static struct mbuf *key_getmsgbuf_x1 __P((struct mbuf *, const struct sadb_msghdr *)); static int key_delete __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static int key_get __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static void key_getcomb_setlifetime __P((struct sadb_comb *)); static struct mbuf *key_getcomb_esp __P((void)); static struct mbuf *key_getcomb_ah __P((void)); static struct mbuf *key_getcomb_ipcomp __P((void)); static struct mbuf *key_getprop __P((const struct secasindex *)); static int key_acquire __P((const struct secasindex *, struct secpolicy *)); static struct secacq *key_newacq __P((const struct secasindex *)); static struct secacq *key_getacq __P((const struct secasindex *)); static struct secacq *key_getacqbyseq __P((u_int32_t)); static struct secspacq *key_newspacq __P((struct secpolicyindex *)); static struct secspacq *key_getspacq __P((struct secpolicyindex *)); static int key_acquire2 __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static int key_register __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static int key_expire __P((struct secasvar *)); static int key_flush __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static int key_dump __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static int key_promisc __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)); static int key_senderror __P((struct socket *, struct mbuf *, int)); static int key_validate_ext __P((const struct sadb_ext *, int)); static int key_align __P((struct mbuf *, struct sadb_msghdr *)); static struct mbuf *key_setlifetime(struct seclifetime *src, u_int16_t exttype); static struct mbuf *key_setkey(struct seckey *src, u_int16_t exttype); #if 0 static const char *key_getfqdn __P((void)); static const char *key_getuserfqdn __P((void)); #endif static void key_sa_chgstate __P((struct secasvar *, u_int8_t)); static __inline void sa_initref(struct secasvar *sav) { refcount_init(&sav->refcnt, 1); } static __inline void sa_addref(struct secasvar *sav) { refcount_acquire(&sav->refcnt); IPSEC_ASSERT(sav->refcnt != 0, ("SA refcnt overflow")); } static __inline int sa_delref(struct secasvar *sav) { IPSEC_ASSERT(sav->refcnt > 0, ("SA refcnt underflow")); return (refcount_release(&sav->refcnt)); } #define SP_ADDREF(p) do { \ (p)->refcnt++; \ IPSEC_ASSERT((p)->refcnt != 0, ("SP refcnt overflow")); \ } while (0) #define SP_DELREF(p) do { \ IPSEC_ASSERT((p)->refcnt > 0, ("SP refcnt underflow")); \ (p)->refcnt--; \ } while (0) /* * Update the refcnt while holding the SPTREE lock. */ void key_addref(struct secpolicy *sp) { SPTREE_LOCK(); SP_ADDREF(sp); SPTREE_UNLOCK(); } /* * Return 0 when there are known to be no SP's for the specified * direction. Otherwise return 1. This is used by IPsec code * to optimize performance. */ int key_havesp(u_int dir) { return (dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND ? LIST_FIRST(&V_sptree[dir]) != NULL : 1); } /* %%% IPsec policy management */ /* * allocating a SP for OUTBOUND or INBOUND packet. * Must call key_freesp() later. * OUT: NULL: not found * others: found and return the pointer. */ struct secpolicy * key_allocsp(struct secpolicyindex *spidx, u_int dir, const char* where, int tag) { struct secpolicy *sp; IPSEC_ASSERT(spidx != NULL, ("null spidx")); IPSEC_ASSERT(dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND, ("invalid direction %u", dir)); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s from %s:%u\n", __func__, where, tag)); /* get a SP entry */ KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("*** objects\n"); kdebug_secpolicyindex(spidx)); SPTREE_LOCK(); LIST_FOREACH(sp, &V_sptree[dir], chain) { KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("*** in SPD\n"); kdebug_secpolicyindex(&sp->spidx)); if (sp->state == IPSEC_SPSTATE_DEAD) continue; if (key_cmpspidx_withmask(&sp->spidx, spidx)) goto found; } sp = NULL; found: if (sp) { /* sanity check */ KEY_CHKSPDIR(sp->spidx.dir, dir, __func__); /* found a SPD entry */ sp->lastused = time_second; SP_ADDREF(sp); } SPTREE_UNLOCK(); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s return SP:%p (ID=%u) refcnt %u\n", __func__, sp, sp ? sp->id : 0, sp ? sp->refcnt : 0)); return sp; } /* * allocating a SP for OUTBOUND or INBOUND packet. * Must call key_freesp() later. * OUT: NULL: not found * others: found and return the pointer. */ struct secpolicy * key_allocsp2(u_int32_t spi, union sockaddr_union *dst, u_int8_t proto, u_int dir, const char* where, int tag) { struct secpolicy *sp; IPSEC_ASSERT(dst != NULL, ("null dst")); IPSEC_ASSERT(dir == IPSEC_DIR_INBOUND || dir == IPSEC_DIR_OUTBOUND, ("invalid direction %u", dir)); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s from %s:%u\n", __func__, where, tag)); /* get a SP entry */ KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("*** objects\n"); printf("spi %u proto %u dir %u\n", spi, proto, dir); kdebug_sockaddr(&dst->sa)); SPTREE_LOCK(); LIST_FOREACH(sp, &V_sptree[dir], chain) { KEYDEBUG(KEYDEBUG_IPSEC_DATA, printf("*** in SPD\n"); kdebug_secpolicyindex(&sp->spidx)); if (sp->state == IPSEC_SPSTATE_DEAD) continue; /* compare simple values, then dst address */ if (sp->spidx.ul_proto != proto) continue; /* NB: spi's must exist and match */ if (!sp->req || !sp->req->sav || sp->req->sav->spi != spi) continue; if (key_sockaddrcmp(&sp->spidx.dst.sa, &dst->sa, 1) == 0) goto found; } sp = NULL; found: if (sp) { /* sanity check */ KEY_CHKSPDIR(sp->spidx.dir, dir, __func__); /* found a SPD entry */ sp->lastused = time_second; SP_ADDREF(sp); } SPTREE_UNLOCK(); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s return SP:%p (ID=%u) refcnt %u\n", __func__, sp, sp ? sp->id : 0, sp ? sp->refcnt : 0)); return sp; } #if 0 /* * return a policy that matches this particular inbound packet. * XXX slow */ struct secpolicy * key_gettunnel(const struct sockaddr *osrc, const struct sockaddr *odst, const struct sockaddr *isrc, const struct sockaddr *idst, const char* where, int tag) { struct secpolicy *sp; const int dir = IPSEC_DIR_INBOUND; struct ipsecrequest *r1, *r2, *p; struct secpolicyindex spidx; KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s from %s:%u\n", __func__, where, tag)); if (isrc->sa_family != idst->sa_family) { ipseclog((LOG_ERR, "%s: protocol family mismatched %d != %d\n.", __func__, isrc->sa_family, idst->sa_family)); sp = NULL; goto done; } SPTREE_LOCK(); LIST_FOREACH(sp, &V_sptree[dir], chain) { if (sp->state == IPSEC_SPSTATE_DEAD) continue; r1 = r2 = NULL; for (p = sp->req; p; p = p->next) { if (p->saidx.mode != IPSEC_MODE_TUNNEL) continue; r1 = r2; r2 = p; if (!r1) { /* here we look at address matches only */ spidx = sp->spidx; if (isrc->sa_len > sizeof(spidx.src) || idst->sa_len > sizeof(spidx.dst)) continue; bcopy(isrc, &spidx.src, isrc->sa_len); bcopy(idst, &spidx.dst, idst->sa_len); if (!key_cmpspidx_withmask(&sp->spidx, &spidx)) continue; } else { if (key_sockaddrcmp(&r1->saidx.src.sa, isrc, 0) || key_sockaddrcmp(&r1->saidx.dst.sa, idst, 0)) continue; } if (key_sockaddrcmp(&r2->saidx.src.sa, osrc, 0) || key_sockaddrcmp(&r2->saidx.dst.sa, odst, 0)) continue; goto found; } } sp = NULL; found: if (sp) { sp->lastused = time_second; SP_ADDREF(sp); } SPTREE_UNLOCK(); done: KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s return SP:%p (ID=%u) refcnt %u\n", __func__, sp, sp ? sp->id : 0, sp ? sp->refcnt : 0)); return sp; } #endif /* * allocating an SA entry for an *OUTBOUND* packet. * checking each request entries in SP, and acquire an SA if need. * OUT: 0: there are valid requests. * ENOENT: policy may be valid, but SA with REQUIRE is on acquiring. */ int key_checkrequest(struct ipsecrequest *isr, const struct secasindex *saidx) { u_int level; int error; struct secasvar *sav; IPSEC_ASSERT(isr != NULL, ("null isr")); IPSEC_ASSERT(saidx != NULL, ("null saidx")); IPSEC_ASSERT(saidx->mode == IPSEC_MODE_TRANSPORT || saidx->mode == IPSEC_MODE_TUNNEL, ("unexpected policy %u", saidx->mode)); /* * XXX guard against protocol callbacks from the crypto * thread as they reference ipsecrequest.sav which we * temporarily null out below. Need to rethink how we * handle bundled SA's in the callback thread. */ IPSECREQUEST_LOCK_ASSERT(isr); /* get current level */ level = ipsec_get_reqlevel(isr); /* * We check new SA in the IPsec request because a different * SA may be involved each time this request is checked, either * because new SAs are being configured, or this request is * associated with an unconnected datagram socket, or this request * is associated with a system default policy. * * key_allocsa_policy should allocate the oldest SA available. * See key_do_allocsa_policy(), and draft-jenkins-ipsec-rekeying-03.txt. */ sav = key_allocsa_policy(saidx); if (sav != isr->sav) { /* SA need to be updated. */ if (!IPSECREQUEST_UPGRADE(isr)) { /* Kick everyone off. */ IPSECREQUEST_UNLOCK(isr); IPSECREQUEST_WLOCK(isr); } if (isr->sav != NULL) KEY_FREESAV(&isr->sav); isr->sav = sav; IPSECREQUEST_DOWNGRADE(isr); } else if (sav != NULL) KEY_FREESAV(&sav); /* When there is SA. */ if (isr->sav != NULL) { if (isr->sav->state != SADB_SASTATE_MATURE && isr->sav->state != SADB_SASTATE_DYING) return EINVAL; return 0; } /* there is no SA */ error = key_acquire(saidx, isr->sp); if (error != 0) { /* XXX What should I do ? */ ipseclog((LOG_DEBUG, "%s: error %d returned from key_acquire\n", __func__, error)); return error; } if (level != IPSEC_LEVEL_REQUIRE) { /* XXX sigh, the interface to this routine is botched */ IPSEC_ASSERT(isr->sav == NULL, ("unexpected SA")); return 0; } else { return ENOENT; } } /* * allocating a SA for policy entry from SAD. * NOTE: searching SAD of aliving state. * OUT: NULL: not found. * others: found and return the pointer. */ static struct secasvar * key_allocsa_policy(const struct secasindex *saidx) { #define N(a) _ARRAYLEN(a) struct secashead *sah; struct secasvar *sav; u_int stateidx, arraysize; const u_int *state_valid; state_valid = NULL; /* silence gcc */ arraysize = 0; /* silence gcc */ SAHTREE_LOCK(); LIST_FOREACH(sah, &V_sahtree, chain) { if (sah->state == SADB_SASTATE_DEAD) continue; if (key_cmpsaidx(&sah->saidx, saidx, CMP_MODE_REQID)) { if (V_key_preferred_oldsa) { state_valid = saorder_state_valid_prefer_old; arraysize = N(saorder_state_valid_prefer_old); } else { state_valid = saorder_state_valid_prefer_new; arraysize = N(saorder_state_valid_prefer_new); } break; } } SAHTREE_UNLOCK(); if (sah == NULL) return NULL; /* search valid state */ for (stateidx = 0; stateidx < arraysize; stateidx++) { sav = key_do_allocsa_policy(sah, state_valid[stateidx]); if (sav != NULL) return sav; } return NULL; #undef N } /* * searching SAD with direction, protocol, mode and state. * called by key_allocsa_policy(). * OUT: * NULL : not found * others : found, pointer to a SA. */ static struct secasvar * key_do_allocsa_policy(struct secashead *sah, u_int state) { struct secasvar *sav, *nextsav, *candidate, *d; /* initilize */ candidate = NULL; SAHTREE_LOCK(); for (sav = LIST_FIRST(&sah->savtree[state]); sav != NULL; sav = nextsav) { nextsav = LIST_NEXT(sav, chain); /* sanity check */ KEY_CHKSASTATE(sav->state, state, __func__); /* initialize */ if (candidate == NULL) { candidate = sav; continue; } /* Which SA is the better ? */ IPSEC_ASSERT(candidate->lft_c != NULL, ("null candidate lifetime")); IPSEC_ASSERT(sav->lft_c != NULL, ("null sav lifetime")); /* What the best method is to compare ? */ if (V_key_preferred_oldsa) { if (candidate->lft_c->addtime > sav->lft_c->addtime) { candidate = sav; } continue; /*NOTREACHED*/ } /* preferred new sa rather than old sa */ if (candidate->lft_c->addtime < sav->lft_c->addtime) { d = candidate; candidate = sav; } else d = sav; /* * prepared to delete the SA when there is more * suitable candidate and the lifetime of the SA is not * permanent. */ if (d->lft_h->addtime != 0) { struct mbuf *m, *result; u_int8_t satype; key_sa_chgstate(d, SADB_SASTATE_DEAD); IPSEC_ASSERT(d->refcnt > 0, ("bogus ref count")); satype = key_proto2satype(d->sah->saidx.proto); if (satype == 0) goto msgfail; m = key_setsadbmsg(SADB_DELETE, 0, satype, 0, 0, d->refcnt - 1); if (!m) goto msgfail; result = m; /* set sadb_address for saidx's. */ m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC, &d->sah->saidx.src.sa, d->sah->saidx.src.sa.sa_len << 3, IPSEC_ULPROTO_ANY); if (!m) goto msgfail; m_cat(result, m); /* set sadb_address for saidx's. */ m = key_setsadbaddr(SADB_EXT_ADDRESS_DST, &d->sah->saidx.dst.sa, d->sah->saidx.dst.sa.sa_len << 3, IPSEC_ULPROTO_ANY); if (!m) goto msgfail; m_cat(result, m); /* create SA extension */ m = key_setsadbsa(d); if (!m) goto msgfail; m_cat(result, m); if (result->m_len < sizeof(struct sadb_msg)) { result = m_pullup(result, sizeof(struct sadb_msg)); if (result == NULL) goto msgfail; } result->m_pkthdr.len = 0; for (m = result; m; m = m->m_next) result->m_pkthdr.len += m->m_len; mtod(result, struct sadb_msg *)->sadb_msg_len = PFKEY_UNIT64(result->m_pkthdr.len); if (key_sendup_mbuf(NULL, result, KEY_SENDUP_REGISTERED)) goto msgfail; msgfail: KEY_FREESAV(&d); } } if (candidate) { sa_addref(candidate); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s cause refcnt++:%d SA:%p\n", __func__, candidate->refcnt, candidate)); } SAHTREE_UNLOCK(); return candidate; } /* * allocating a usable SA entry for a *INBOUND* packet. * Must call key_freesav() later. * OUT: positive: pointer to a usable sav (i.e. MATURE or DYING state). * NULL: not found, or error occured. * * In the comparison, no source address is used--for RFC2401 conformance. * To quote, from section 4.1: * A security association is uniquely identified by a triple consisting * of a Security Parameter Index (SPI), an IP Destination Address, and a * security protocol (AH or ESP) identifier. * Note that, however, we do need to keep source address in IPsec SA. * IKE specification and PF_KEY specification do assume that we * keep source address in IPsec SA. We see a tricky situation here. */ struct secasvar * key_allocsa( union sockaddr_union *dst, u_int proto, u_int32_t spi, const char* where, int tag) { struct secashead *sah; struct secasvar *sav; u_int stateidx, arraysize, state; const u_int *saorder_state_valid; #ifdef IPSEC_NAT_T int natt_chkport; #endif IPSEC_ASSERT(dst != NULL, ("null dst address")); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s from %s:%u\n", __func__, where, tag)); #ifdef IPSEC_NAT_T natt_chkport = (dst->sa.sa_family == AF_INET && dst->sa.sa_len == sizeof(struct sockaddr_in) && dst->sin.sin_port != 0); #endif /* * searching SAD. * XXX: to be checked internal IP header somewhere. Also when * IPsec tunnel packet is received. But ESP tunnel mode is * encrypted so we can't check internal IP header. */ SAHTREE_LOCK(); if (V_key_preferred_oldsa) { saorder_state_valid = saorder_state_valid_prefer_old; arraysize = _ARRAYLEN(saorder_state_valid_prefer_old); } else { saorder_state_valid = saorder_state_valid_prefer_new; arraysize = _ARRAYLEN(saorder_state_valid_prefer_new); } LIST_FOREACH(sah, &V_sahtree, chain) { int checkport; /* search valid state */ for (stateidx = 0; stateidx < arraysize; stateidx++) { state = saorder_state_valid[stateidx]; LIST_FOREACH(sav, &sah->savtree[state], chain) { /* sanity check */ KEY_CHKSASTATE(sav->state, state, __func__); /* do not return entries w/ unusable state */ if (sav->state != SADB_SASTATE_MATURE && sav->state != SADB_SASTATE_DYING) continue; if (proto != sav->sah->saidx.proto) continue; if (spi != sav->spi) continue; checkport = 0; #ifdef IPSEC_NAT_T /* * Really only check ports when this is a NAT-T * SA. Otherwise other lookups providing ports * might suffer. */ if (sav->natt_type && natt_chkport) checkport = 1; #endif #if 0 /* don't check src */ /* check src address */ if (key_sockaddrcmp(&src->sa, &sav->sah->saidx.src.sa, checkport) != 0) continue; #endif /* check dst address */ if (key_sockaddrcmp(&dst->sa, &sav->sah->saidx.dst.sa, checkport) != 0) continue; sa_addref(sav); goto done; } } } sav = NULL; done: SAHTREE_UNLOCK(); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s return SA:%p; refcnt %u\n", __func__, sav, sav ? sav->refcnt : 0)); return sav; } /* * Must be called after calling key_allocsp(). * For both the packet without socket and key_freeso(). */ void _key_freesp(struct secpolicy **spp, const char* where, int tag) { struct secpolicy *sp = *spp; IPSEC_ASSERT(sp != NULL, ("null sp")); SPTREE_LOCK(); SP_DELREF(sp); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s SP:%p (ID=%u) from %s:%u; refcnt now %u\n", __func__, sp, sp->id, where, tag, sp->refcnt)); if (sp->refcnt == 0) { *spp = NULL; key_delsp(sp); } SPTREE_UNLOCK(); } /* * Must be called after calling key_allocsp(). * For the packet with socket. */ void key_freeso(struct socket *so) { IPSEC_ASSERT(so != NULL, ("null so")); switch (so->so_proto->pr_domain->dom_family) { #if defined(INET) || defined(INET6) #ifdef INET case PF_INET: #endif #ifdef INET6 case PF_INET6: #endif { struct inpcb *pcb = sotoinpcb(so); /* Does it have a PCB ? */ if (pcb == NULL) return; key_freesp_so(&pcb->inp_sp->sp_in); key_freesp_so(&pcb->inp_sp->sp_out); } break; #endif /* INET || INET6 */ default: ipseclog((LOG_DEBUG, "%s: unknown address family=%d.\n", __func__, so->so_proto->pr_domain->dom_family)); return; } } static void key_freesp_so(struct secpolicy **sp) { IPSEC_ASSERT(sp != NULL && *sp != NULL, ("null sp")); if ((*sp)->policy == IPSEC_POLICY_ENTRUST || (*sp)->policy == IPSEC_POLICY_BYPASS) return; IPSEC_ASSERT((*sp)->policy == IPSEC_POLICY_IPSEC, ("invalid policy %u", (*sp)->policy)); KEY_FREESP(sp); } void key_addrefsa(struct secasvar *sav, const char* where, int tag) { IPSEC_ASSERT(sav != NULL, ("null sav")); IPSEC_ASSERT(sav->refcnt > 0, ("refcount must exist")); sa_addref(sav); } /* * Must be called after calling key_allocsa(). * This function is called by key_freesp() to free some SA allocated * for a policy. */ void key_freesav(struct secasvar **psav, const char* where, int tag) { struct secasvar *sav = *psav; IPSEC_ASSERT(sav != NULL, ("null sav")); if (sa_delref(sav)) { KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s SA:%p (SPI %u) from %s:%u; refcnt now %u\n", __func__, sav, ntohl(sav->spi), where, tag, sav->refcnt)); *psav = NULL; key_delsav(sav); } else { KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s SA:%p (SPI %u) from %s:%u; refcnt now %u\n", __func__, sav, ntohl(sav->spi), where, tag, sav->refcnt)); } } /* %%% SPD management */ /* * free security policy entry. */ static void key_delsp(struct secpolicy *sp) { struct ipsecrequest *isr, *nextisr; IPSEC_ASSERT(sp != NULL, ("null sp")); SPTREE_LOCK_ASSERT(); sp->state = IPSEC_SPSTATE_DEAD; IPSEC_ASSERT(sp->refcnt == 0, ("SP with references deleted (refcnt %u)", sp->refcnt)); /* remove from SP index */ if (__LIST_CHAINED(sp)) LIST_REMOVE(sp, chain); for (isr = sp->req; isr != NULL; isr = nextisr) { if (isr->sav != NULL) { KEY_FREESAV(&isr->sav); isr->sav = NULL; } nextisr = isr->next; ipsec_delisr(isr); } _key_delsp(sp); } /* * search SPD * OUT: NULL : not found * others : found, pointer to a SP. */ static struct secpolicy * key_getsp(struct secpolicyindex *spidx) { struct secpolicy *sp; IPSEC_ASSERT(spidx != NULL, ("null spidx")); SPTREE_LOCK(); LIST_FOREACH(sp, &V_sptree[spidx->dir], chain) { if (sp->state == IPSEC_SPSTATE_DEAD) continue; if (key_cmpspidx_exactly(spidx, &sp->spidx)) { SP_ADDREF(sp); break; } } SPTREE_UNLOCK(); return sp; } /* * get SP by index. * OUT: NULL : not found * others : found, pointer to a SP. */ static struct secpolicy * key_getspbyid(u_int32_t id) { struct secpolicy *sp; SPTREE_LOCK(); LIST_FOREACH(sp, &V_sptree[IPSEC_DIR_INBOUND], chain) { if (sp->state == IPSEC_SPSTATE_DEAD) continue; if (sp->id == id) { SP_ADDREF(sp); goto done; } } LIST_FOREACH(sp, &V_sptree[IPSEC_DIR_OUTBOUND], chain) { if (sp->state == IPSEC_SPSTATE_DEAD) continue; if (sp->id == id) { SP_ADDREF(sp); goto done; } } done: SPTREE_UNLOCK(); return sp; } struct secpolicy * key_newsp(const char* where, int tag) { struct secpolicy *newsp = NULL; newsp = (struct secpolicy *) malloc(sizeof(struct secpolicy), M_IPSEC_SP, M_NOWAIT|M_ZERO); if (newsp) { SECPOLICY_LOCK_INIT(newsp); newsp->refcnt = 1; newsp->req = NULL; } KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s from %s:%u return SP:%p\n", __func__, where, tag, newsp)); return newsp; } static void _key_delsp(struct secpolicy *sp) { SECPOLICY_LOCK_DESTROY(sp); free(sp, M_IPSEC_SP); } /* * create secpolicy structure from sadb_x_policy structure. * NOTE: `state', `secpolicyindex' in secpolicy structure are not set, * so must be set properly later. */ struct secpolicy * key_msg2sp(xpl0, len, error) struct sadb_x_policy *xpl0; size_t len; int *error; { struct secpolicy *newsp; IPSEC_ASSERT(xpl0 != NULL, ("null xpl0")); IPSEC_ASSERT(len >= sizeof(*xpl0), ("policy too short: %zu", len)); if (len != PFKEY_EXTLEN(xpl0)) { ipseclog((LOG_DEBUG, "%s: Invalid msg length.\n", __func__)); *error = EINVAL; return NULL; } if ((newsp = KEY_NEWSP()) == NULL) { *error = ENOBUFS; return NULL; } newsp->spidx.dir = xpl0->sadb_x_policy_dir; newsp->policy = xpl0->sadb_x_policy_type; /* check policy */ switch (xpl0->sadb_x_policy_type) { case IPSEC_POLICY_DISCARD: case IPSEC_POLICY_NONE: case IPSEC_POLICY_ENTRUST: case IPSEC_POLICY_BYPASS: newsp->req = NULL; break; case IPSEC_POLICY_IPSEC: { int tlen; struct sadb_x_ipsecrequest *xisr; struct ipsecrequest **p_isr = &newsp->req; /* validity check */ if (PFKEY_EXTLEN(xpl0) < sizeof(*xpl0)) { ipseclog((LOG_DEBUG, "%s: Invalid msg length.\n", __func__)); KEY_FREESP(&newsp); *error = EINVAL; return NULL; } tlen = PFKEY_EXTLEN(xpl0) - sizeof(*xpl0); xisr = (struct sadb_x_ipsecrequest *)(xpl0 + 1); while (tlen > 0) { /* length check */ if (xisr->sadb_x_ipsecrequest_len < sizeof(*xisr)) { ipseclog((LOG_DEBUG, "%s: invalid ipsecrequest " "length.\n", __func__)); KEY_FREESP(&newsp); *error = EINVAL; return NULL; } /* allocate request buffer */ /* NB: data structure is zero'd */ *p_isr = ipsec_newisr(); if ((*p_isr) == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); KEY_FREESP(&newsp); *error = ENOBUFS; return NULL; } /* set values */ switch (xisr->sadb_x_ipsecrequest_proto) { case IPPROTO_ESP: case IPPROTO_AH: case IPPROTO_IPCOMP: break; default: ipseclog((LOG_DEBUG, "%s: invalid proto type=%u\n", __func__, xisr->sadb_x_ipsecrequest_proto)); KEY_FREESP(&newsp); *error = EPROTONOSUPPORT; return NULL; } (*p_isr)->saidx.proto = xisr->sadb_x_ipsecrequest_proto; switch (xisr->sadb_x_ipsecrequest_mode) { case IPSEC_MODE_TRANSPORT: case IPSEC_MODE_TUNNEL: break; case IPSEC_MODE_ANY: default: ipseclog((LOG_DEBUG, "%s: invalid mode=%u\n", __func__, xisr->sadb_x_ipsecrequest_mode)); KEY_FREESP(&newsp); *error = EINVAL; return NULL; } (*p_isr)->saidx.mode = xisr->sadb_x_ipsecrequest_mode; switch (xisr->sadb_x_ipsecrequest_level) { case IPSEC_LEVEL_DEFAULT: case IPSEC_LEVEL_USE: case IPSEC_LEVEL_REQUIRE: break; case IPSEC_LEVEL_UNIQUE: /* validity check */ /* * If range violation of reqid, kernel will * update it, don't refuse it. */ if (xisr->sadb_x_ipsecrequest_reqid > IPSEC_MANUAL_REQID_MAX) { ipseclog((LOG_DEBUG, "%s: reqid=%d range " "violation, updated by kernel.\n", __func__, xisr->sadb_x_ipsecrequest_reqid)); xisr->sadb_x_ipsecrequest_reqid = 0; } /* allocate new reqid id if reqid is zero. */ if (xisr->sadb_x_ipsecrequest_reqid == 0) { u_int32_t reqid; if ((reqid = key_newreqid()) == 0) { KEY_FREESP(&newsp); *error = ENOBUFS; return NULL; } (*p_isr)->saidx.reqid = reqid; xisr->sadb_x_ipsecrequest_reqid = reqid; } else { /* set it for manual keying. */ (*p_isr)->saidx.reqid = xisr->sadb_x_ipsecrequest_reqid; } break; default: ipseclog((LOG_DEBUG, "%s: invalid level=%u\n", __func__, xisr->sadb_x_ipsecrequest_level)); KEY_FREESP(&newsp); *error = EINVAL; return NULL; } (*p_isr)->level = xisr->sadb_x_ipsecrequest_level; /* set IP addresses if there */ if (xisr->sadb_x_ipsecrequest_len > sizeof(*xisr)) { struct sockaddr *paddr; paddr = (struct sockaddr *)(xisr + 1); /* validity check */ if (paddr->sa_len > sizeof((*p_isr)->saidx.src)) { ipseclog((LOG_DEBUG, "%s: invalid " "request address length.\n", __func__)); KEY_FREESP(&newsp); *error = EINVAL; return NULL; } bcopy(paddr, &(*p_isr)->saidx.src, paddr->sa_len); paddr = (struct sockaddr *)((caddr_t)paddr + paddr->sa_len); /* validity check */ if (paddr->sa_len > sizeof((*p_isr)->saidx.dst)) { ipseclog((LOG_DEBUG, "%s: invalid " "request address length.\n", __func__)); KEY_FREESP(&newsp); *error = EINVAL; return NULL; } bcopy(paddr, &(*p_isr)->saidx.dst, paddr->sa_len); } (*p_isr)->sp = newsp; /* initialization for the next. */ p_isr = &(*p_isr)->next; tlen -= xisr->sadb_x_ipsecrequest_len; /* validity check */ if (tlen < 0) { ipseclog((LOG_DEBUG, "%s: becoming tlen < 0.\n", __func__)); KEY_FREESP(&newsp); *error = EINVAL; return NULL; } xisr = (struct sadb_x_ipsecrequest *)((caddr_t)xisr + xisr->sadb_x_ipsecrequest_len); } } break; default: ipseclog((LOG_DEBUG, "%s: invalid policy type.\n", __func__)); KEY_FREESP(&newsp); *error = EINVAL; return NULL; } *error = 0; return newsp; } static u_int32_t key_newreqid() { static u_int32_t auto_reqid = IPSEC_MANUAL_REQID_MAX + 1; auto_reqid = (auto_reqid == ~0 ? IPSEC_MANUAL_REQID_MAX + 1 : auto_reqid + 1); /* XXX should be unique check */ return auto_reqid; } /* * copy secpolicy struct to sadb_x_policy structure indicated. */ struct mbuf * key_sp2msg(sp) struct secpolicy *sp; { struct sadb_x_policy *xpl; int tlen; caddr_t p; struct mbuf *m; IPSEC_ASSERT(sp != NULL, ("null policy")); tlen = key_getspreqmsglen(sp); m = m_get2(tlen, M_NOWAIT, MT_DATA, 0); if (m == NULL) return (NULL); m_align(m, tlen); m->m_len = tlen; xpl = mtod(m, struct sadb_x_policy *); bzero(xpl, tlen); xpl->sadb_x_policy_len = PFKEY_UNIT64(tlen); xpl->sadb_x_policy_exttype = SADB_X_EXT_POLICY; xpl->sadb_x_policy_type = sp->policy; xpl->sadb_x_policy_dir = sp->spidx.dir; xpl->sadb_x_policy_id = sp->id; p = (caddr_t)xpl + sizeof(*xpl); /* if is the policy for ipsec ? */ if (sp->policy == IPSEC_POLICY_IPSEC) { struct sadb_x_ipsecrequest *xisr; struct ipsecrequest *isr; for (isr = sp->req; isr != NULL; isr = isr->next) { xisr = (struct sadb_x_ipsecrequest *)p; xisr->sadb_x_ipsecrequest_proto = isr->saidx.proto; xisr->sadb_x_ipsecrequest_mode = isr->saidx.mode; xisr->sadb_x_ipsecrequest_level = isr->level; xisr->sadb_x_ipsecrequest_reqid = isr->saidx.reqid; p += sizeof(*xisr); bcopy(&isr->saidx.src, p, isr->saidx.src.sa.sa_len); p += isr->saidx.src.sa.sa_len; bcopy(&isr->saidx.dst, p, isr->saidx.dst.sa.sa_len); p += isr->saidx.src.sa.sa_len; xisr->sadb_x_ipsecrequest_len = PFKEY_ALIGN8(sizeof(*xisr) + isr->saidx.src.sa.sa_len + isr->saidx.dst.sa.sa_len); } } return m; } /* m will not be freed nor modified */ static struct mbuf * #ifdef __STDC__ key_gather_mbuf(struct mbuf *m, const struct sadb_msghdr *mhp, int ndeep, int nitem, ...) #else key_gather_mbuf(m, mhp, ndeep, nitem, va_alist) struct mbuf *m; const struct sadb_msghdr *mhp; int ndeep; int nitem; va_dcl #endif { va_list ap; int idx; int i; struct mbuf *result = NULL, *n; int len; IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); va_start(ap, nitem); for (i = 0; i < nitem; i++) { idx = va_arg(ap, int); if (idx < 0 || idx > SADB_EXT_MAX) goto fail; /* don't attempt to pull empty extension */ if (idx == SADB_EXT_RESERVED && mhp->msg == NULL) continue; if (idx != SADB_EXT_RESERVED && (mhp->ext[idx] == NULL || mhp->extlen[idx] == 0)) continue; if (idx == SADB_EXT_RESERVED) { len = PFKEY_ALIGN8(sizeof(struct sadb_msg)); IPSEC_ASSERT(len <= MHLEN, ("header too big %u", len)); MGETHDR(n, M_NOWAIT, MT_DATA); if (!n) goto fail; n->m_len = len; n->m_next = NULL; m_copydata(m, 0, sizeof(struct sadb_msg), mtod(n, caddr_t)); } else if (i < ndeep) { len = mhp->extlen[idx]; n = m_get2(len, M_NOWAIT, MT_DATA, 0); if (n == NULL) goto fail; m_align(n, len); n->m_len = len; m_copydata(m, mhp->extoff[idx], mhp->extlen[idx], mtod(n, caddr_t)); } else { n = m_copym(m, mhp->extoff[idx], mhp->extlen[idx], M_NOWAIT); } if (n == NULL) goto fail; if (result) m_cat(result, n); else result = n; } va_end(ap); if ((result->m_flags & M_PKTHDR) != 0) { result->m_pkthdr.len = 0; for (n = result; n; n = n->m_next) result->m_pkthdr.len += n->m_len; } return result; fail: m_freem(result); va_end(ap); return NULL; } /* * SADB_X_SPDADD, SADB_X_SPDSETIDX or SADB_X_SPDUPDATE processing * add an entry to SP database, when received * * from the user(?). * Adding to SP database, * and send * * to the socket which was send. * * SPDADD set a unique policy entry. * SPDSETIDX like SPDADD without a part of policy requests. * SPDUPDATE replace a unique policy entry. * * m will always be freed. */ static int key_spdadd(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { struct sadb_address *src0, *dst0; struct sadb_x_policy *xpl0, *xpl; struct sadb_lifetime *lft = NULL; struct secpolicyindex spidx; struct secpolicy *newsp; int error; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); if (mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL || mhp->ext[SADB_EXT_ADDRESS_DST] == NULL || mhp->ext[SADB_X_EXT_POLICY] == NULL) { ipseclog((LOG_DEBUG, "key_spdadd: invalid message is passed.\n")); return key_senderror(so, m, EINVAL); } if (mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address) || mhp->extlen[SADB_X_EXT_POLICY] < sizeof(struct sadb_x_policy)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_EXT_LIFETIME_HARD] != NULL) { if (mhp->extlen[SADB_EXT_LIFETIME_HARD] < sizeof(struct sadb_lifetime)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } lft = (struct sadb_lifetime *)mhp->ext[SADB_EXT_LIFETIME_HARD]; } src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC]; dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST]; xpl0 = (struct sadb_x_policy *)mhp->ext[SADB_X_EXT_POLICY]; /* * Note: do not parse SADB_X_EXT_NAT_T_* here: * we are processing traffic endpoints. */ /* make secindex */ /* XXX boundary check against sa_len */ KEY_SETSECSPIDX(xpl0->sadb_x_policy_dir, src0 + 1, dst0 + 1, src0->sadb_address_prefixlen, dst0->sadb_address_prefixlen, src0->sadb_address_proto, &spidx); /* checking the direciton. */ switch (xpl0->sadb_x_policy_dir) { case IPSEC_DIR_INBOUND: case IPSEC_DIR_OUTBOUND: break; default: ipseclog((LOG_DEBUG, "%s: Invalid SP direction.\n", __func__)); mhp->msg->sadb_msg_errno = EINVAL; return 0; } /* check policy */ /* key_spdadd() accepts DISCARD, NONE and IPSEC. */ if (xpl0->sadb_x_policy_type == IPSEC_POLICY_ENTRUST || xpl0->sadb_x_policy_type == IPSEC_POLICY_BYPASS) { ipseclog((LOG_DEBUG, "%s: Invalid policy type.\n", __func__)); return key_senderror(so, m, EINVAL); } /* policy requests are mandatory when action is ipsec. */ if (mhp->msg->sadb_msg_type != SADB_X_SPDSETIDX && xpl0->sadb_x_policy_type == IPSEC_POLICY_IPSEC && mhp->extlen[SADB_X_EXT_POLICY] <= sizeof(*xpl0)) { ipseclog((LOG_DEBUG, "%s: some policy requests part required\n", __func__)); return key_senderror(so, m, EINVAL); } /* * checking there is SP already or not. * SPDUPDATE doesn't depend on whether there is a SP or not. * If the type is either SPDADD or SPDSETIDX AND a SP is found, * then error. */ newsp = key_getsp(&spidx); if (mhp->msg->sadb_msg_type == SADB_X_SPDUPDATE) { if (newsp) { SPTREE_LOCK(); newsp->state = IPSEC_SPSTATE_DEAD; SPTREE_UNLOCK(); KEY_FREESP(&newsp); } } else { if (newsp != NULL) { KEY_FREESP(&newsp); ipseclog((LOG_DEBUG, "%s: a SP entry exists already.\n", __func__)); return key_senderror(so, m, EEXIST); } } /* allocation new SP entry */ if ((newsp = key_msg2sp(xpl0, PFKEY_EXTLEN(xpl0), &error)) == NULL) { return key_senderror(so, m, error); } if ((newsp->id = key_getnewspid()) == 0) { _key_delsp(newsp); return key_senderror(so, m, ENOBUFS); } /* XXX boundary check against sa_len */ KEY_SETSECSPIDX(xpl0->sadb_x_policy_dir, src0 + 1, dst0 + 1, src0->sadb_address_prefixlen, dst0->sadb_address_prefixlen, src0->sadb_address_proto, &newsp->spidx); /* sanity check on addr pair */ if (((struct sockaddr *)(src0 + 1))->sa_family != ((struct sockaddr *)(dst0+ 1))->sa_family) { _key_delsp(newsp); return key_senderror(so, m, EINVAL); } if (((struct sockaddr *)(src0 + 1))->sa_len != ((struct sockaddr *)(dst0+ 1))->sa_len) { _key_delsp(newsp); return key_senderror(so, m, EINVAL); } #if 1 if (newsp->req && newsp->req->saidx.src.sa.sa_family && newsp->req->saidx.dst.sa.sa_family) { if (newsp->req->saidx.src.sa.sa_family != newsp->req->saidx.dst.sa.sa_family) { _key_delsp(newsp); return key_senderror(so, m, EINVAL); } } #endif newsp->created = time_second; newsp->lastused = newsp->created; newsp->lifetime = lft ? lft->sadb_lifetime_addtime : 0; newsp->validtime = lft ? lft->sadb_lifetime_usetime : 0; newsp->refcnt = 1; /* do not reclaim until I say I do */ newsp->state = IPSEC_SPSTATE_ALIVE; LIST_INSERT_TAIL(&V_sptree[newsp->spidx.dir], newsp, secpolicy, chain); /* delete the entry in spacqtree */ if (mhp->msg->sadb_msg_type == SADB_X_SPDUPDATE) { struct secspacq *spacq = key_getspacq(&spidx); if (spacq != NULL) { /* reset counter in order to deletion by timehandler. */ spacq->created = time_second; spacq->count = 0; SPACQ_UNLOCK(); } } { struct mbuf *n, *mpolicy; struct sadb_msg *newmsg; int off; /* * Note: do not send SADB_X_EXT_NAT_T_* here: * we are sending traffic endpoints. */ /* create new sadb_msg to reply. */ if (lft) { n = key_gather_mbuf(m, mhp, 2, 5, SADB_EXT_RESERVED, SADB_X_EXT_POLICY, SADB_EXT_LIFETIME_HARD, SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST); } else { n = key_gather_mbuf(m, mhp, 2, 4, SADB_EXT_RESERVED, SADB_X_EXT_POLICY, SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST); } if (!n) return key_senderror(so, m, ENOBUFS); if (n->m_len < sizeof(*newmsg)) { n = m_pullup(n, sizeof(*newmsg)); if (!n) return key_senderror(so, m, ENOBUFS); } newmsg = mtod(n, struct sadb_msg *); newmsg->sadb_msg_errno = 0; newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len); off = 0; mpolicy = m_pulldown(n, PFKEY_ALIGN8(sizeof(struct sadb_msg)), sizeof(*xpl), &off); if (mpolicy == NULL) { /* n is already freed */ return key_senderror(so, m, ENOBUFS); } xpl = (struct sadb_x_policy *)(mtod(mpolicy, caddr_t) + off); if (xpl->sadb_x_policy_exttype != SADB_X_EXT_POLICY) { m_freem(n); return key_senderror(so, m, EINVAL); } xpl->sadb_x_policy_id = newsp->id; m_freem(m); return key_sendup_mbuf(so, n, KEY_SENDUP_ALL); } } /* * get new policy id. * OUT: * 0: failure. * others: success. */ static u_int32_t key_getnewspid() { u_int32_t newid = 0; int count = V_key_spi_trycnt; /* XXX */ struct secpolicy *sp; /* when requesting to allocate spi ranged */ while (count--) { newid = (V_policy_id = (V_policy_id == ~0 ? 1 : V_policy_id + 1)); if ((sp = key_getspbyid(newid)) == NULL) break; KEY_FREESP(&sp); } if (count == 0 || newid == 0) { ipseclog((LOG_DEBUG, "%s: to allocate policy id is failed.\n", __func__)); return 0; } return newid; } /* * SADB_SPDDELETE processing * receive * * from the user(?), and set SADB_SASTATE_DEAD, * and send, * * to the ikmpd. * policy(*) including direction of policy. * * m will always be freed. */ static int key_spddelete(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { struct sadb_address *src0, *dst0; struct sadb_x_policy *xpl0; struct secpolicyindex spidx; struct secpolicy *sp; IPSEC_ASSERT(so != NULL, ("null so")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); if (mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL || mhp->ext[SADB_EXT_ADDRESS_DST] == NULL || mhp->ext[SADB_X_EXT_POLICY] == NULL) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address) || mhp->extlen[SADB_X_EXT_POLICY] < sizeof(struct sadb_x_policy)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC]; dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST]; xpl0 = (struct sadb_x_policy *)mhp->ext[SADB_X_EXT_POLICY]; /* * Note: do not parse SADB_X_EXT_NAT_T_* here: * we are processing traffic endpoints. */ /* make secindex */ /* XXX boundary check against sa_len */ KEY_SETSECSPIDX(xpl0->sadb_x_policy_dir, src0 + 1, dst0 + 1, src0->sadb_address_prefixlen, dst0->sadb_address_prefixlen, src0->sadb_address_proto, &spidx); /* checking the direciton. */ switch (xpl0->sadb_x_policy_dir) { case IPSEC_DIR_INBOUND: case IPSEC_DIR_OUTBOUND: break; default: ipseclog((LOG_DEBUG, "%s: Invalid SP direction.\n", __func__)); return key_senderror(so, m, EINVAL); } /* Is there SP in SPD ? */ if ((sp = key_getsp(&spidx)) == NULL) { ipseclog((LOG_DEBUG, "%s: no SP found.\n", __func__)); return key_senderror(so, m, EINVAL); } /* save policy id to buffer to be returned. */ xpl0->sadb_x_policy_id = sp->id; SPTREE_LOCK(); sp->state = IPSEC_SPSTATE_DEAD; SPTREE_UNLOCK(); KEY_FREESP(&sp); { struct mbuf *n; struct sadb_msg *newmsg; /* * Note: do not send SADB_X_EXT_NAT_T_* here: * we are sending traffic endpoints. */ /* create new sadb_msg to reply. */ n = key_gather_mbuf(m, mhp, 1, 4, SADB_EXT_RESERVED, SADB_X_EXT_POLICY, SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST); if (!n) return key_senderror(so, m, ENOBUFS); newmsg = mtod(n, struct sadb_msg *); newmsg->sadb_msg_errno = 0; newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len); m_freem(m); return key_sendup_mbuf(so, n, KEY_SENDUP_ALL); } } /* * SADB_SPDDELETE2 processing * receive * * from the user(?), and set SADB_SASTATE_DEAD, * and send, * * to the ikmpd. * policy(*) including direction of policy. * * m will always be freed. */ static int key_spddelete2(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { u_int32_t id; struct secpolicy *sp; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); if (mhp->ext[SADB_X_EXT_POLICY] == NULL || mhp->extlen[SADB_X_EXT_POLICY] < sizeof(struct sadb_x_policy)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } id = ((struct sadb_x_policy *)mhp->ext[SADB_X_EXT_POLICY])->sadb_x_policy_id; /* Is there SP in SPD ? */ if ((sp = key_getspbyid(id)) == NULL) { ipseclog((LOG_DEBUG, "%s: no SP found id:%u.\n", __func__, id)); return key_senderror(so, m, EINVAL); } SPTREE_LOCK(); sp->state = IPSEC_SPSTATE_DEAD; SPTREE_UNLOCK(); KEY_FREESP(&sp); { struct mbuf *n, *nn; struct sadb_msg *newmsg; int off, len; /* create new sadb_msg to reply. */ len = PFKEY_ALIGN8(sizeof(struct sadb_msg)); MGETHDR(n, M_NOWAIT, MT_DATA); if (n && len > MHLEN) { MCLGET(n, M_NOWAIT); if ((n->m_flags & M_EXT) == 0) { m_freem(n); n = NULL; } } if (!n) return key_senderror(so, m, ENOBUFS); n->m_len = len; n->m_next = NULL; off = 0; m_copydata(m, 0, sizeof(struct sadb_msg), mtod(n, caddr_t) + off); off += PFKEY_ALIGN8(sizeof(struct sadb_msg)); IPSEC_ASSERT(off == len, ("length inconsistency (off %u len %u)", off, len)); n->m_next = m_copym(m, mhp->extoff[SADB_X_EXT_POLICY], mhp->extlen[SADB_X_EXT_POLICY], M_NOWAIT); if (!n->m_next) { m_freem(n); return key_senderror(so, m, ENOBUFS); } n->m_pkthdr.len = 0; for (nn = n; nn; nn = nn->m_next) n->m_pkthdr.len += nn->m_len; newmsg = mtod(n, struct sadb_msg *); newmsg->sadb_msg_errno = 0; newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len); m_freem(m); return key_sendup_mbuf(so, n, KEY_SENDUP_ALL); } } /* * SADB_X_GET processing * receive * * from the user(?), * and send, * * to the ikmpd. * policy(*) including direction of policy. * * m will always be freed. */ static int key_spdget(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { u_int32_t id; struct secpolicy *sp; struct mbuf *n; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); if (mhp->ext[SADB_X_EXT_POLICY] == NULL || mhp->extlen[SADB_X_EXT_POLICY] < sizeof(struct sadb_x_policy)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } id = ((struct sadb_x_policy *)mhp->ext[SADB_X_EXT_POLICY])->sadb_x_policy_id; /* Is there SP in SPD ? */ if ((sp = key_getspbyid(id)) == NULL) { ipseclog((LOG_DEBUG, "%s: no SP found id:%u.\n", __func__, id)); return key_senderror(so, m, ENOENT); } n = key_setdumpsp(sp, SADB_X_SPDGET, 0, mhp->msg->sadb_msg_pid); KEY_FREESP(&sp); if (n != NULL) { m_freem(m); return key_sendup_mbuf(so, n, KEY_SENDUP_ONE); } else return key_senderror(so, m, ENOBUFS); } /* * SADB_X_SPDACQUIRE processing. * Acquire policy and SA(s) for a *OUTBOUND* packet. * send * * to KMD, and expect to receive * with SADB_X_SPDACQUIRE if error occured, * or * * with SADB_X_SPDUPDATE from KMD by PF_KEY. * policy(*) is without policy requests. * * 0 : succeed * others: error number */ int key_spdacquire(sp) struct secpolicy *sp; { struct mbuf *result = NULL, *m; struct secspacq *newspacq; IPSEC_ASSERT(sp != NULL, ("null secpolicy")); IPSEC_ASSERT(sp->req == NULL, ("policy exists")); IPSEC_ASSERT(sp->policy == IPSEC_POLICY_IPSEC, ("policy not IPSEC %u", sp->policy)); /* Get an entry to check whether sent message or not. */ newspacq = key_getspacq(&sp->spidx); if (newspacq != NULL) { if (V_key_blockacq_count < newspacq->count) { /* reset counter and do send message. */ newspacq->count = 0; } else { /* increment counter and do nothing. */ newspacq->count++; return 0; } SPACQ_UNLOCK(); } else { /* make new entry for blocking to send SADB_ACQUIRE. */ newspacq = key_newspacq(&sp->spidx); if (newspacq == NULL) return ENOBUFS; } /* create new sadb_msg to reply. */ m = key_setsadbmsg(SADB_X_SPDACQUIRE, 0, 0, 0, 0, 0); if (!m) return ENOBUFS; result = m; result->m_pkthdr.len = 0; for (m = result; m; m = m->m_next) result->m_pkthdr.len += m->m_len; mtod(result, struct sadb_msg *)->sadb_msg_len = PFKEY_UNIT64(result->m_pkthdr.len); return key_sendup_mbuf(NULL, m, KEY_SENDUP_REGISTERED); } /* * SADB_SPDFLUSH processing * receive * * from the user, and free all entries in secpctree. * and send, * * to the user. * NOTE: what to do is only marking SADB_SASTATE_DEAD. * * m will always be freed. */ static int key_spdflush(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { struct sadb_msg *newmsg; struct secpolicy *sp; u_int dir; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); if (m->m_len != PFKEY_ALIGN8(sizeof(struct sadb_msg))) return key_senderror(so, m, EINVAL); for (dir = 0; dir < IPSEC_DIR_MAX; dir++) { SPTREE_LOCK(); LIST_FOREACH(sp, &V_sptree[dir], chain) sp->state = IPSEC_SPSTATE_DEAD; SPTREE_UNLOCK(); } if (sizeof(struct sadb_msg) > m->m_len + M_TRAILINGSPACE(m)) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); return key_senderror(so, m, ENOBUFS); } if (m->m_next) m_freem(m->m_next); m->m_next = NULL; m->m_pkthdr.len = m->m_len = PFKEY_ALIGN8(sizeof(struct sadb_msg)); newmsg = mtod(m, struct sadb_msg *); newmsg->sadb_msg_errno = 0; newmsg->sadb_msg_len = PFKEY_UNIT64(m->m_pkthdr.len); return key_sendup_mbuf(so, m, KEY_SENDUP_ALL); } /* * SADB_SPDDUMP processing * receive * * from the user, and dump all SP leaves * and send, * ..... * to the ikmpd. * * m will always be freed. */ static int key_spddump(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { struct secpolicy *sp; int cnt; u_int dir; struct mbuf *n; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* search SPD entry and get buffer size. */ cnt = 0; SPTREE_LOCK(); for (dir = 0; dir < IPSEC_DIR_MAX; dir++) { LIST_FOREACH(sp, &V_sptree[dir], chain) { cnt++; } } if (cnt == 0) { SPTREE_UNLOCK(); return key_senderror(so, m, ENOENT); } for (dir = 0; dir < IPSEC_DIR_MAX; dir++) { LIST_FOREACH(sp, &V_sptree[dir], chain) { --cnt; n = key_setdumpsp(sp, SADB_X_SPDDUMP, cnt, mhp->msg->sadb_msg_pid); if (n) key_sendup_mbuf(so, n, KEY_SENDUP_ONE); } } SPTREE_UNLOCK(); m_freem(m); return 0; } static struct mbuf * key_setdumpsp(struct secpolicy *sp, u_int8_t type, u_int32_t seq, u_int32_t pid) { struct mbuf *result = NULL, *m; struct seclifetime lt; m = key_setsadbmsg(type, 0, SADB_SATYPE_UNSPEC, seq, pid, sp->refcnt); if (!m) goto fail; result = m; /* * Note: do not send SADB_X_EXT_NAT_T_* here: * we are sending traffic endpoints. */ m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC, &sp->spidx.src.sa, sp->spidx.prefs, sp->spidx.ul_proto); if (!m) goto fail; m_cat(result, m); m = key_setsadbaddr(SADB_EXT_ADDRESS_DST, &sp->spidx.dst.sa, sp->spidx.prefd, sp->spidx.ul_proto); if (!m) goto fail; m_cat(result, m); m = key_sp2msg(sp); if (!m) goto fail; m_cat(result, m); if(sp->lifetime){ lt.addtime=sp->created; lt.usetime= sp->lastused; m = key_setlifetime(<, SADB_EXT_LIFETIME_CURRENT); if (!m) goto fail; m_cat(result, m); lt.addtime=sp->lifetime; lt.usetime= sp->validtime; m = key_setlifetime(<, SADB_EXT_LIFETIME_HARD); if (!m) goto fail; m_cat(result, m); } if ((result->m_flags & M_PKTHDR) == 0) goto fail; if (result->m_len < sizeof(struct sadb_msg)) { result = m_pullup(result, sizeof(struct sadb_msg)); if (result == NULL) goto fail; } result->m_pkthdr.len = 0; for (m = result; m; m = m->m_next) result->m_pkthdr.len += m->m_len; mtod(result, struct sadb_msg *)->sadb_msg_len = PFKEY_UNIT64(result->m_pkthdr.len); return result; fail: m_freem(result); return NULL; } /* * get PFKEY message length for security policy and request. */ static u_int key_getspreqmsglen(sp) struct secpolicy *sp; { u_int tlen; tlen = sizeof(struct sadb_x_policy); /* if is the policy for ipsec ? */ if (sp->policy != IPSEC_POLICY_IPSEC) return tlen; /* get length of ipsec requests */ { struct ipsecrequest *isr; int len; for (isr = sp->req; isr != NULL; isr = isr->next) { len = sizeof(struct sadb_x_ipsecrequest) + isr->saidx.src.sa.sa_len + isr->saidx.dst.sa.sa_len; tlen += PFKEY_ALIGN8(len); } } return tlen; } /* * SADB_SPDEXPIRE processing * send * * to KMD by PF_KEY. * * OUT: 0 : succeed * others : error number */ static int key_spdexpire(sp) struct secpolicy *sp; { struct mbuf *result = NULL, *m; int len; int error = -1; struct sadb_lifetime *lt; /* XXX: Why do we lock ? */ IPSEC_ASSERT(sp != NULL, ("null secpolicy")); /* set msg header */ m = key_setsadbmsg(SADB_X_SPDEXPIRE, 0, 0, 0, 0, 0); if (!m) { error = ENOBUFS; goto fail; } result = m; /* create lifetime extension (current and hard) */ len = PFKEY_ALIGN8(sizeof(*lt)) * 2; m = m_get2(len, M_NOWAIT, MT_DATA, 0); if (m == NULL) { error = ENOBUFS; goto fail; } m_align(m, len); m->m_len = len; bzero(mtod(m, caddr_t), len); lt = mtod(m, struct sadb_lifetime *); lt->sadb_lifetime_len = PFKEY_UNIT64(sizeof(struct sadb_lifetime)); lt->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT; lt->sadb_lifetime_allocations = 0; lt->sadb_lifetime_bytes = 0; lt->sadb_lifetime_addtime = sp->created; lt->sadb_lifetime_usetime = sp->lastused; lt = (struct sadb_lifetime *)(mtod(m, caddr_t) + len / 2); lt->sadb_lifetime_len = PFKEY_UNIT64(sizeof(struct sadb_lifetime)); lt->sadb_lifetime_exttype = SADB_EXT_LIFETIME_HARD; lt->sadb_lifetime_allocations = 0; lt->sadb_lifetime_bytes = 0; lt->sadb_lifetime_addtime = sp->lifetime; lt->sadb_lifetime_usetime = sp->validtime; m_cat(result, m); /* * Note: do not send SADB_X_EXT_NAT_T_* here: * we are sending traffic endpoints. */ /* set sadb_address for source */ m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC, &sp->spidx.src.sa, sp->spidx.prefs, sp->spidx.ul_proto); if (!m) { error = ENOBUFS; goto fail; } m_cat(result, m); /* set sadb_address for destination */ m = key_setsadbaddr(SADB_EXT_ADDRESS_DST, &sp->spidx.dst.sa, sp->spidx.prefd, sp->spidx.ul_proto); if (!m) { error = ENOBUFS; goto fail; } m_cat(result, m); /* set secpolicy */ m = key_sp2msg(sp); if (!m) { error = ENOBUFS; goto fail; } m_cat(result, m); if ((result->m_flags & M_PKTHDR) == 0) { error = EINVAL; goto fail; } if (result->m_len < sizeof(struct sadb_msg)) { result = m_pullup(result, sizeof(struct sadb_msg)); if (result == NULL) { error = ENOBUFS; goto fail; } } result->m_pkthdr.len = 0; for (m = result; m; m = m->m_next) result->m_pkthdr.len += m->m_len; mtod(result, struct sadb_msg *)->sadb_msg_len = PFKEY_UNIT64(result->m_pkthdr.len); return key_sendup_mbuf(NULL, result, KEY_SENDUP_REGISTERED); fail: if (result) m_freem(result); return error; } /* %%% SAD management */ /* * allocating a memory for new SA head, and copy from the values of mhp. * OUT: NULL : failure due to the lack of memory. * others : pointer to new SA head. */ static struct secashead * key_newsah(saidx) struct secasindex *saidx; { struct secashead *newsah; IPSEC_ASSERT(saidx != NULL, ("null saidx")); newsah = malloc(sizeof(struct secashead), M_IPSEC_SAH, M_NOWAIT|M_ZERO); if (newsah != NULL) { int i; for (i = 0; i < sizeof(newsah->savtree)/sizeof(newsah->savtree[0]); i++) LIST_INIT(&newsah->savtree[i]); newsah->saidx = *saidx; /* add to saidxtree */ newsah->state = SADB_SASTATE_MATURE; SAHTREE_LOCK(); LIST_INSERT_HEAD(&V_sahtree, newsah, chain); SAHTREE_UNLOCK(); } return(newsah); } /* * delete SA index and all SA registerd. */ static void key_delsah(sah) struct secashead *sah; { struct secasvar *sav, *nextsav; u_int stateidx; int zombie = 0; IPSEC_ASSERT(sah != NULL, ("NULL sah")); SAHTREE_LOCK_ASSERT(); /* searching all SA registerd in the secindex. */ for (stateidx = 0; stateidx < _ARRAYLEN(saorder_state_any); stateidx++) { u_int state = saorder_state_any[stateidx]; LIST_FOREACH_SAFE(sav, &sah->savtree[state], chain, nextsav) { if (sav->refcnt == 0) { /* sanity check */ KEY_CHKSASTATE(state, sav->state, __func__); /* * do NOT call KEY_FREESAV here: * it will only delete the sav if refcnt == 1, * where we already know that refcnt == 0 */ key_delsav(sav); } else { /* give up to delete this sa */ zombie++; } } } if (!zombie) { /* delete only if there are savs */ /* remove from tree of SA index */ if (__LIST_CHAINED(sah)) LIST_REMOVE(sah, chain); if (sah->route_cache.sa_route.ro_rt) { RTFREE(sah->route_cache.sa_route.ro_rt); sah->route_cache.sa_route.ro_rt = (struct rtentry *)NULL; } free(sah, M_IPSEC_SAH); } } /* * allocating a new SA with LARVAL state. key_add() and key_getspi() call, * and copy the values of mhp into new buffer. * When SAD message type is GETSPI: * to set sequence number from acq_seq++, * to set zero to SPI. * not to call key_setsava(). * OUT: NULL : fail * others : pointer to new secasvar. * * does not modify mbuf. does not free mbuf on error. */ static struct secasvar * key_newsav(m, mhp, sah, errp, where, tag) struct mbuf *m; const struct sadb_msghdr *mhp; struct secashead *sah; int *errp; const char* where; int tag; { struct secasvar *newsav; const struct sadb_sa *xsa; IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); IPSEC_ASSERT(sah != NULL, ("null secashead")); newsav = malloc(sizeof(struct secasvar), M_IPSEC_SA, M_NOWAIT|M_ZERO); if (newsav == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); *errp = ENOBUFS; goto done; } switch (mhp->msg->sadb_msg_type) { case SADB_GETSPI: newsav->spi = 0; #ifdef IPSEC_DOSEQCHECK /* sync sequence number */ if (mhp->msg->sadb_msg_seq == 0) newsav->seq = (V_acq_seq = (V_acq_seq == ~0 ? 1 : ++V_acq_seq)); else #endif newsav->seq = mhp->msg->sadb_msg_seq; break; case SADB_ADD: /* sanity check */ if (mhp->ext[SADB_EXT_SA] == NULL) { free(newsav, M_IPSEC_SA); newsav = NULL; ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); *errp = EINVAL; goto done; } xsa = (const struct sadb_sa *)mhp->ext[SADB_EXT_SA]; newsav->spi = xsa->sadb_sa_spi; newsav->seq = mhp->msg->sadb_msg_seq; break; default: free(newsav, M_IPSEC_SA); newsav = NULL; *errp = EINVAL; goto done; } /* copy sav values */ if (mhp->msg->sadb_msg_type != SADB_GETSPI) { *errp = key_setsaval(newsav, m, mhp); if (*errp) { free(newsav, M_IPSEC_SA); newsav = NULL; goto done; } } SECASVAR_LOCK_INIT(newsav); /* reset created */ newsav->created = time_second; newsav->pid = mhp->msg->sadb_msg_pid; /* add to satree */ newsav->sah = sah; sa_initref(newsav); newsav->state = SADB_SASTATE_LARVAL; SAHTREE_LOCK(); LIST_INSERT_TAIL(&sah->savtree[SADB_SASTATE_LARVAL], newsav, secasvar, chain); SAHTREE_UNLOCK(); done: KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s from %s:%u return SP:%p\n", __func__, where, tag, newsav)); return newsav; } /* * free() SA variable entry. */ static void key_cleansav(struct secasvar *sav) { /* * Cleanup xform state. Note that zeroize'ing causes the * keys to be cleared; otherwise we must do it ourself. */ if (sav->tdb_xform != NULL) { sav->tdb_xform->xf_zeroize(sav); sav->tdb_xform = NULL; } else { KASSERT(sav->iv == NULL, ("iv but no xform")); if (sav->key_auth != NULL) bzero(sav->key_auth->key_data, _KEYLEN(sav->key_auth)); if (sav->key_enc != NULL) bzero(sav->key_enc->key_data, _KEYLEN(sav->key_enc)); } if (sav->key_auth != NULL) { if (sav->key_auth->key_data != NULL) free(sav->key_auth->key_data, M_IPSEC_MISC); free(sav->key_auth, M_IPSEC_MISC); sav->key_auth = NULL; } if (sav->key_enc != NULL) { if (sav->key_enc->key_data != NULL) free(sav->key_enc->key_data, M_IPSEC_MISC); free(sav->key_enc, M_IPSEC_MISC); sav->key_enc = NULL; } if (sav->sched) { bzero(sav->sched, sav->schedlen); free(sav->sched, M_IPSEC_MISC); sav->sched = NULL; } if (sav->replay != NULL) { free(sav->replay, M_IPSEC_MISC); sav->replay = NULL; } if (sav->lft_c != NULL) { free(sav->lft_c, M_IPSEC_MISC); sav->lft_c = NULL; } if (sav->lft_h != NULL) { free(sav->lft_h, M_IPSEC_MISC); sav->lft_h = NULL; } if (sav->lft_s != NULL) { free(sav->lft_s, M_IPSEC_MISC); sav->lft_s = NULL; } } /* * free() SA variable entry. */ static void key_delsav(sav) struct secasvar *sav; { IPSEC_ASSERT(sav != NULL, ("null sav")); IPSEC_ASSERT(sav->refcnt == 0, ("reference count %u > 0", sav->refcnt)); /* remove from SA header */ if (__LIST_CHAINED(sav)) LIST_REMOVE(sav, chain); key_cleansav(sav); SECASVAR_LOCK_DESTROY(sav); free(sav, M_IPSEC_SA); } /* * search SAD. * OUT: * NULL : not found * others : found, pointer to a SA. */ static struct secashead * key_getsah(saidx) struct secasindex *saidx; { struct secashead *sah; SAHTREE_LOCK(); LIST_FOREACH(sah, &V_sahtree, chain) { if (sah->state == SADB_SASTATE_DEAD) continue; if (key_cmpsaidx(&sah->saidx, saidx, CMP_REQID)) break; } SAHTREE_UNLOCK(); return sah; } /* * check not to be duplicated SPI. * NOTE: this function is too slow due to searching all SAD. * OUT: * NULL : not found * others : found, pointer to a SA. */ static struct secasvar * key_checkspidup(saidx, spi) struct secasindex *saidx; u_int32_t spi; { struct secashead *sah; struct secasvar *sav; /* check address family */ if (saidx->src.sa.sa_family != saidx->dst.sa.sa_family) { ipseclog((LOG_DEBUG, "%s: address family mismatched.\n", __func__)); return NULL; } sav = NULL; /* check all SAD */ SAHTREE_LOCK(); LIST_FOREACH(sah, &V_sahtree, chain) { if (!key_ismyaddr((struct sockaddr *)&sah->saidx.dst)) continue; sav = key_getsavbyspi(sah, spi); if (sav != NULL) break; } SAHTREE_UNLOCK(); return sav; } /* * search SAD litmited alive SA, protocol, SPI. * OUT: * NULL : not found * others : found, pointer to a SA. */ static struct secasvar * key_getsavbyspi(sah, spi) struct secashead *sah; u_int32_t spi; { struct secasvar *sav; u_int stateidx, state; sav = NULL; SAHTREE_LOCK_ASSERT(); /* search all status */ for (stateidx = 0; stateidx < _ARRAYLEN(saorder_state_alive); stateidx++) { state = saorder_state_alive[stateidx]; LIST_FOREACH(sav, &sah->savtree[state], chain) { /* sanity check */ if (sav->state != state) { ipseclog((LOG_DEBUG, "%s: " "invalid sav->state (queue: %d SA: %d)\n", __func__, state, sav->state)); continue; } if (sav->spi == spi) return sav; } } return NULL; } /* * copy SA values from PF_KEY message except *SPI, SEQ, PID, STATE and TYPE*. * You must update these if need. * OUT: 0: success. * !0: failure. * * does not modify mbuf. does not free mbuf on error. */ static int key_setsaval(sav, m, mhp) struct secasvar *sav; struct mbuf *m; const struct sadb_msghdr *mhp; { int error = 0; IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* initialization */ sav->replay = NULL; sav->key_auth = NULL; sav->key_enc = NULL; sav->sched = NULL; sav->schedlen = 0; sav->iv = NULL; sav->lft_c = NULL; sav->lft_h = NULL; sav->lft_s = NULL; sav->tdb_xform = NULL; /* transform */ sav->tdb_encalgxform = NULL; /* encoding algorithm */ sav->tdb_authalgxform = NULL; /* authentication algorithm */ sav->tdb_compalgxform = NULL; /* compression algorithm */ /* Initialize even if NAT-T not compiled in: */ sav->natt_type = 0; sav->natt_esp_frag_len = 0; /* SA */ if (mhp->ext[SADB_EXT_SA] != NULL) { const struct sadb_sa *sa0; sa0 = (const struct sadb_sa *)mhp->ext[SADB_EXT_SA]; if (mhp->extlen[SADB_EXT_SA] < sizeof(*sa0)) { error = EINVAL; goto fail; } sav->alg_auth = sa0->sadb_sa_auth; sav->alg_enc = sa0->sadb_sa_encrypt; sav->flags = sa0->sadb_sa_flags; /* replay window */ if ((sa0->sadb_sa_flags & SADB_X_EXT_OLD) == 0) { sav->replay = (struct secreplay *) malloc(sizeof(struct secreplay)+sa0->sadb_sa_replay, M_IPSEC_MISC, M_NOWAIT|M_ZERO); if (sav->replay == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); error = ENOBUFS; goto fail; } if (sa0->sadb_sa_replay != 0) sav->replay->bitmap = (caddr_t)(sav->replay+1); sav->replay->wsize = sa0->sadb_sa_replay; } } /* Authentication keys */ if (mhp->ext[SADB_EXT_KEY_AUTH] != NULL) { const struct sadb_key *key0; int len; key0 = (const struct sadb_key *)mhp->ext[SADB_EXT_KEY_AUTH]; len = mhp->extlen[SADB_EXT_KEY_AUTH]; error = 0; if (len < sizeof(*key0)) { error = EINVAL; goto fail; } switch (mhp->msg->sadb_msg_satype) { case SADB_SATYPE_AH: case SADB_SATYPE_ESP: case SADB_X_SATYPE_TCPSIGNATURE: if (len == PFKEY_ALIGN8(sizeof(struct sadb_key)) && sav->alg_auth != SADB_X_AALG_NULL) error = EINVAL; break; case SADB_X_SATYPE_IPCOMP: default: error = EINVAL; break; } if (error) { ipseclog((LOG_DEBUG, "%s: invalid key_auth values.\n", __func__)); goto fail; } sav->key_auth = (struct seckey *)key_dup_keymsg(key0, len, M_IPSEC_MISC); if (sav->key_auth == NULL ) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); error = ENOBUFS; goto fail; } } /* Encryption key */ if (mhp->ext[SADB_EXT_KEY_ENCRYPT] != NULL) { const struct sadb_key *key0; int len; key0 = (const struct sadb_key *)mhp->ext[SADB_EXT_KEY_ENCRYPT]; len = mhp->extlen[SADB_EXT_KEY_ENCRYPT]; error = 0; if (len < sizeof(*key0)) { error = EINVAL; goto fail; } switch (mhp->msg->sadb_msg_satype) { case SADB_SATYPE_ESP: if (len == PFKEY_ALIGN8(sizeof(struct sadb_key)) && sav->alg_enc != SADB_EALG_NULL) { error = EINVAL; break; } sav->key_enc = (struct seckey *)key_dup_keymsg(key0, len, M_IPSEC_MISC); if (sav->key_enc == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); error = ENOBUFS; goto fail; } break; case SADB_X_SATYPE_IPCOMP: if (len != PFKEY_ALIGN8(sizeof(struct sadb_key))) error = EINVAL; sav->key_enc = NULL; /*just in case*/ break; case SADB_SATYPE_AH: case SADB_X_SATYPE_TCPSIGNATURE: default: error = EINVAL; break; } if (error) { ipseclog((LOG_DEBUG, "%s: invalid key_enc value.\n", __func__)); goto fail; } } /* set iv */ sav->ivlen = 0; switch (mhp->msg->sadb_msg_satype) { case SADB_SATYPE_AH: error = xform_init(sav, XF_AH); break; case SADB_SATYPE_ESP: error = xform_init(sav, XF_ESP); break; case SADB_X_SATYPE_IPCOMP: error = xform_init(sav, XF_IPCOMP); break; case SADB_X_SATYPE_TCPSIGNATURE: error = xform_init(sav, XF_TCPSIGNATURE); break; } if (error) { ipseclog((LOG_DEBUG, "%s: unable to initialize SA type %u.\n", __func__, mhp->msg->sadb_msg_satype)); goto fail; } /* reset created */ sav->created = time_second; /* make lifetime for CURRENT */ sav->lft_c = malloc(sizeof(struct seclifetime), M_IPSEC_MISC, M_NOWAIT); if (sav->lft_c == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); error = ENOBUFS; goto fail; } sav->lft_c->allocations = 0; sav->lft_c->bytes = 0; sav->lft_c->addtime = time_second; sav->lft_c->usetime = 0; /* lifetimes for HARD and SOFT */ { const struct sadb_lifetime *lft0; lft0 = (struct sadb_lifetime *)mhp->ext[SADB_EXT_LIFETIME_HARD]; if (lft0 != NULL) { if (mhp->extlen[SADB_EXT_LIFETIME_HARD] < sizeof(*lft0)) { error = EINVAL; goto fail; } sav->lft_h = key_dup_lifemsg(lft0, M_IPSEC_MISC); if (sav->lft_h == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n",__func__)); error = ENOBUFS; goto fail; } /* to be initialize ? */ } lft0 = (struct sadb_lifetime *)mhp->ext[SADB_EXT_LIFETIME_SOFT]; if (lft0 != NULL) { if (mhp->extlen[SADB_EXT_LIFETIME_SOFT] < sizeof(*lft0)) { error = EINVAL; goto fail; } sav->lft_s = key_dup_lifemsg(lft0, M_IPSEC_MISC); if (sav->lft_s == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n",__func__)); error = ENOBUFS; goto fail; } /* to be initialize ? */ } } return 0; fail: /* initialization */ key_cleansav(sav); return error; } /* * validation with a secasvar entry, and set SADB_SATYPE_MATURE. * OUT: 0: valid * other: errno */ static int key_mature(struct secasvar *sav) { int error; /* check SPI value */ switch (sav->sah->saidx.proto) { case IPPROTO_ESP: case IPPROTO_AH: /* * RFC 4302, 2.4. Security Parameters Index (SPI), SPI values * 1-255 reserved by IANA for future use, * 0 for implementation specific, local use. */ if (ntohl(sav->spi) <= 255) { ipseclog((LOG_DEBUG, "%s: illegal range of SPI %u.\n", __func__, (u_int32_t)ntohl(sav->spi))); return EINVAL; } break; } /* check satype */ switch (sav->sah->saidx.proto) { case IPPROTO_ESP: /* check flags */ if ((sav->flags & (SADB_X_EXT_OLD|SADB_X_EXT_DERIV)) == (SADB_X_EXT_OLD|SADB_X_EXT_DERIV)) { ipseclog((LOG_DEBUG, "%s: invalid flag (derived) " "given to old-esp.\n", __func__)); return EINVAL; } error = xform_init(sav, XF_ESP); break; case IPPROTO_AH: /* check flags */ if (sav->flags & SADB_X_EXT_DERIV) { ipseclog((LOG_DEBUG, "%s: invalid flag (derived) " "given to AH SA.\n", __func__)); return EINVAL; } if (sav->alg_enc != SADB_EALG_NONE) { ipseclog((LOG_DEBUG, "%s: protocol and algorithm " "mismated.\n", __func__)); return(EINVAL); } error = xform_init(sav, XF_AH); break; case IPPROTO_IPCOMP: if (sav->alg_auth != SADB_AALG_NONE) { ipseclog((LOG_DEBUG, "%s: protocol and algorithm " "mismated.\n", __func__)); return(EINVAL); } if ((sav->flags & SADB_X_EXT_RAWCPI) == 0 && ntohl(sav->spi) >= 0x10000) { ipseclog((LOG_DEBUG, "%s: invalid cpi for IPComp.\n", __func__)); return(EINVAL); } error = xform_init(sav, XF_IPCOMP); break; case IPPROTO_TCP: if (sav->alg_enc != SADB_EALG_NONE) { ipseclog((LOG_DEBUG, "%s: protocol and algorithm " "mismated.\n", __func__)); return(EINVAL); } error = xform_init(sav, XF_TCPSIGNATURE); break; default: ipseclog((LOG_DEBUG, "%s: Invalid satype.\n", __func__)); error = EPROTONOSUPPORT; break; } if (error == 0) { SAHTREE_LOCK(); key_sa_chgstate(sav, SADB_SASTATE_MATURE); SAHTREE_UNLOCK(); } return (error); } /* * subroutine for SADB_GET and SADB_DUMP. */ static struct mbuf * key_setdumpsa(struct secasvar *sav, u_int8_t type, u_int8_t satype, u_int32_t seq, u_int32_t pid) { struct mbuf *result = NULL, *tres = NULL, *m; int i; int dumporder[] = { SADB_EXT_SA, SADB_X_EXT_SA2, SADB_EXT_LIFETIME_HARD, SADB_EXT_LIFETIME_SOFT, SADB_EXT_LIFETIME_CURRENT, SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST, SADB_EXT_ADDRESS_PROXY, SADB_EXT_KEY_AUTH, SADB_EXT_KEY_ENCRYPT, SADB_EXT_IDENTITY_SRC, SADB_EXT_IDENTITY_DST, SADB_EXT_SENSITIVITY, #ifdef IPSEC_NAT_T SADB_X_EXT_NAT_T_TYPE, SADB_X_EXT_NAT_T_SPORT, SADB_X_EXT_NAT_T_DPORT, SADB_X_EXT_NAT_T_OAI, SADB_X_EXT_NAT_T_OAR, SADB_X_EXT_NAT_T_FRAG, #endif }; m = key_setsadbmsg(type, 0, satype, seq, pid, sav->refcnt); if (m == NULL) goto fail; result = m; for (i = sizeof(dumporder)/sizeof(dumporder[0]) - 1; i >= 0; i--) { m = NULL; switch (dumporder[i]) { case SADB_EXT_SA: m = key_setsadbsa(sav); if (!m) goto fail; break; case SADB_X_EXT_SA2: m = key_setsadbxsa2(sav->sah->saidx.mode, sav->replay ? sav->replay->count : 0, sav->sah->saidx.reqid); if (!m) goto fail; break; case SADB_EXT_ADDRESS_SRC: m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC, &sav->sah->saidx.src.sa, FULLMASK, IPSEC_ULPROTO_ANY); if (!m) goto fail; break; case SADB_EXT_ADDRESS_DST: m = key_setsadbaddr(SADB_EXT_ADDRESS_DST, &sav->sah->saidx.dst.sa, FULLMASK, IPSEC_ULPROTO_ANY); if (!m) goto fail; break; case SADB_EXT_KEY_AUTH: if (!sav->key_auth) continue; m = key_setkey(sav->key_auth, SADB_EXT_KEY_AUTH); if (!m) goto fail; break; case SADB_EXT_KEY_ENCRYPT: if (!sav->key_enc) continue; m = key_setkey(sav->key_enc, SADB_EXT_KEY_ENCRYPT); if (!m) goto fail; break; case SADB_EXT_LIFETIME_CURRENT: if (!sav->lft_c) continue; m = key_setlifetime(sav->lft_c, SADB_EXT_LIFETIME_CURRENT); if (!m) goto fail; break; case SADB_EXT_LIFETIME_HARD: if (!sav->lft_h) continue; m = key_setlifetime(sav->lft_h, SADB_EXT_LIFETIME_HARD); if (!m) goto fail; break; case SADB_EXT_LIFETIME_SOFT: if (!sav->lft_s) continue; m = key_setlifetime(sav->lft_s, SADB_EXT_LIFETIME_SOFT); if (!m) goto fail; break; #ifdef IPSEC_NAT_T case SADB_X_EXT_NAT_T_TYPE: m = key_setsadbxtype(sav->natt_type); if (!m) goto fail; break; case SADB_X_EXT_NAT_T_DPORT: m = key_setsadbxport( KEY_PORTFROMSADDR(&sav->sah->saidx.dst), SADB_X_EXT_NAT_T_DPORT); if (!m) goto fail; break; case SADB_X_EXT_NAT_T_SPORT: m = key_setsadbxport( KEY_PORTFROMSADDR(&sav->sah->saidx.src), SADB_X_EXT_NAT_T_SPORT); if (!m) goto fail; break; case SADB_X_EXT_NAT_T_OAI: case SADB_X_EXT_NAT_T_OAR: case SADB_X_EXT_NAT_T_FRAG: /* We do not (yet) support those. */ continue; #endif case SADB_EXT_ADDRESS_PROXY: case SADB_EXT_IDENTITY_SRC: case SADB_EXT_IDENTITY_DST: /* XXX: should we brought from SPD ? */ case SADB_EXT_SENSITIVITY: default: continue; } if (!m) goto fail; if (tres) m_cat(m, tres); tres = m; } m_cat(result, tres); if (result->m_len < sizeof(struct sadb_msg)) { result = m_pullup(result, sizeof(struct sadb_msg)); if (result == NULL) goto fail; } result->m_pkthdr.len = 0; for (m = result; m; m = m->m_next) result->m_pkthdr.len += m->m_len; mtod(result, struct sadb_msg *)->sadb_msg_len = PFKEY_UNIT64(result->m_pkthdr.len); return result; fail: m_freem(result); m_freem(tres); return NULL; } /* * set data into sadb_msg. */ static struct mbuf * key_setsadbmsg(u_int8_t type, u_int16_t tlen, u_int8_t satype, u_int32_t seq, pid_t pid, u_int16_t reserved) { struct mbuf *m; struct sadb_msg *p; int len; len = PFKEY_ALIGN8(sizeof(struct sadb_msg)); if (len > MCLBYTES) return NULL; MGETHDR(m, M_NOWAIT, MT_DATA); if (m && len > MHLEN) { MCLGET(m, M_NOWAIT); if ((m->m_flags & M_EXT) == 0) { m_freem(m); m = NULL; } } if (!m) return NULL; m->m_pkthdr.len = m->m_len = len; m->m_next = NULL; p = mtod(m, struct sadb_msg *); bzero(p, len); p->sadb_msg_version = PF_KEY_V2; p->sadb_msg_type = type; p->sadb_msg_errno = 0; p->sadb_msg_satype = satype; p->sadb_msg_len = PFKEY_UNIT64(tlen); p->sadb_msg_reserved = reserved; p->sadb_msg_seq = seq; p->sadb_msg_pid = (u_int32_t)pid; return m; } /* * copy secasvar data into sadb_address. */ static struct mbuf * key_setsadbsa(sav) struct secasvar *sav; { struct mbuf *m; struct sadb_sa *p; int len; len = PFKEY_ALIGN8(sizeof(struct sadb_sa)); m = m_get2(len, M_NOWAIT, MT_DATA, 0); if (m == NULL) return (NULL); m_align(m, len); m->m_len = len; p = mtod(m, struct sadb_sa *); bzero(p, len); p->sadb_sa_len = PFKEY_UNIT64(len); p->sadb_sa_exttype = SADB_EXT_SA; p->sadb_sa_spi = sav->spi; p->sadb_sa_replay = (sav->replay != NULL ? sav->replay->wsize : 0); p->sadb_sa_state = sav->state; p->sadb_sa_auth = sav->alg_auth; p->sadb_sa_encrypt = sav->alg_enc; p->sadb_sa_flags = sav->flags; return m; } /* * set data into sadb_address. */ static struct mbuf * key_setsadbaddr(u_int16_t exttype, const struct sockaddr *saddr, u_int8_t prefixlen, u_int16_t ul_proto) { struct mbuf *m; struct sadb_address *p; size_t len; len = PFKEY_ALIGN8(sizeof(struct sadb_address)) + PFKEY_ALIGN8(saddr->sa_len); m = m_get2(len, M_NOWAIT, MT_DATA, 0); if (m == NULL) return (NULL); m_align(m, len); m->m_len = len; p = mtod(m, struct sadb_address *); bzero(p, len); p->sadb_address_len = PFKEY_UNIT64(len); p->sadb_address_exttype = exttype; p->sadb_address_proto = ul_proto; if (prefixlen == FULLMASK) { switch (saddr->sa_family) { case AF_INET: prefixlen = sizeof(struct in_addr) << 3; break; case AF_INET6: prefixlen = sizeof(struct in6_addr) << 3; break; default: ; /*XXX*/ } } p->sadb_address_prefixlen = prefixlen; p->sadb_address_reserved = 0; bcopy(saddr, mtod(m, caddr_t) + PFKEY_ALIGN8(sizeof(struct sadb_address)), saddr->sa_len); return m; } /* * set data into sadb_x_sa2. */ static struct mbuf * key_setsadbxsa2(u_int8_t mode, u_int32_t seq, u_int32_t reqid) { struct mbuf *m; struct sadb_x_sa2 *p; size_t len; len = PFKEY_ALIGN8(sizeof(struct sadb_x_sa2)); m = m_get2(len, M_NOWAIT, MT_DATA, 0); if (m == NULL) return (NULL); m_align(m, len); m->m_len = len; p = mtod(m, struct sadb_x_sa2 *); bzero(p, len); p->sadb_x_sa2_len = PFKEY_UNIT64(len); p->sadb_x_sa2_exttype = SADB_X_EXT_SA2; p->sadb_x_sa2_mode = mode; p->sadb_x_sa2_reserved1 = 0; p->sadb_x_sa2_reserved2 = 0; p->sadb_x_sa2_sequence = seq; p->sadb_x_sa2_reqid = reqid; return m; } #ifdef IPSEC_NAT_T /* * Set a type in sadb_x_nat_t_type. */ static struct mbuf * key_setsadbxtype(u_int16_t type) { struct mbuf *m; size_t len; struct sadb_x_nat_t_type *p; len = PFKEY_ALIGN8(sizeof(struct sadb_x_nat_t_type)); m = m_get2(len, M_NOWAIT, MT_DATA, 0); if (m == NULL) return (NULL); m_align(m, len); m->m_len = len; p = mtod(m, struct sadb_x_nat_t_type *); bzero(p, len); p->sadb_x_nat_t_type_len = PFKEY_UNIT64(len); p->sadb_x_nat_t_type_exttype = SADB_X_EXT_NAT_T_TYPE; p->sadb_x_nat_t_type_type = type; return (m); } /* * Set a port in sadb_x_nat_t_port. * In contrast to default RFC 2367 behaviour, port is in network byte order. */ static struct mbuf * key_setsadbxport(u_int16_t port, u_int16_t type) { struct mbuf *m; size_t len; struct sadb_x_nat_t_port *p; len = PFKEY_ALIGN8(sizeof(struct sadb_x_nat_t_port)); m = m_get2(len, M_NOWAIT, MT_DATA, 0); if (m == NULL) return (NULL); m_align(m, len); m->m_len = len; p = mtod(m, struct sadb_x_nat_t_port *); bzero(p, len); p->sadb_x_nat_t_port_len = PFKEY_UNIT64(len); p->sadb_x_nat_t_port_exttype = type; p->sadb_x_nat_t_port_port = port; return (m); } /* * Get port from sockaddr. Port is in network byte order. */ u_int16_t key_portfromsaddr(struct sockaddr *sa) { switch (sa->sa_family) { #ifdef INET case AF_INET: return ((struct sockaddr_in *)sa)->sin_port; #endif #ifdef INET6 case AF_INET6: return ((struct sockaddr_in6 *)sa)->sin6_port; #endif } KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s unexpected address family %d\n", __func__, sa->sa_family)); return (0); } #endif /* IPSEC_NAT_T */ /* * Set port in struct sockaddr. Port is in network byte order. */ static void key_porttosaddr(struct sockaddr *sa, u_int16_t port) { switch (sa->sa_family) { #ifdef INET case AF_INET: ((struct sockaddr_in *)sa)->sin_port = port; break; #endif #ifdef INET6 case AF_INET6: ((struct sockaddr_in6 *)sa)->sin6_port = port; break; #endif default: ipseclog((LOG_DEBUG, "%s: unexpected address family %d.\n", __func__, sa->sa_family)); break; } } /* * set data into sadb_x_policy */ static struct mbuf * key_setsadbxpolicy(u_int16_t type, u_int8_t dir, u_int32_t id) { struct mbuf *m; struct sadb_x_policy *p; size_t len; len = PFKEY_ALIGN8(sizeof(struct sadb_x_policy)); m = m_get2(len, M_NOWAIT, MT_DATA, 0); if (m == NULL) return (NULL); m_align(m, len); m->m_len = len; p = mtod(m, struct sadb_x_policy *); bzero(p, len); p->sadb_x_policy_len = PFKEY_UNIT64(len); p->sadb_x_policy_exttype = SADB_X_EXT_POLICY; p->sadb_x_policy_type = type; p->sadb_x_policy_dir = dir; p->sadb_x_policy_id = id; return m; } /* %%% utilities */ /* Take a key message (sadb_key) from the socket and turn it into one * of the kernel's key structures (seckey). * * IN: pointer to the src * OUT: NULL no more memory */ struct seckey * key_dup_keymsg(const struct sadb_key *src, u_int len, struct malloc_type *type) { struct seckey *dst; dst = (struct seckey *)malloc(sizeof(struct seckey), type, M_NOWAIT); if (dst != NULL) { dst->bits = src->sadb_key_bits; dst->key_data = (char *)malloc(len, type, M_NOWAIT); if (dst->key_data != NULL) { bcopy((const char *)src + sizeof(struct sadb_key), dst->key_data, len); } else { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); free(dst, type); dst = NULL; } } else { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); } return dst; } /* Take a lifetime message (sadb_lifetime) passed in on a socket and * turn it into one of the kernel's lifetime structures (seclifetime). * * IN: pointer to the destination, source and malloc type * OUT: NULL, no more memory */ static struct seclifetime * key_dup_lifemsg(const struct sadb_lifetime *src, struct malloc_type *type) { struct seclifetime *dst = NULL; dst = (struct seclifetime *)malloc(sizeof(struct seclifetime), type, M_NOWAIT); if (dst == NULL) { /* XXX counter */ ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); } else { dst->allocations = src->sadb_lifetime_allocations; dst->bytes = src->sadb_lifetime_bytes; dst->addtime = src->sadb_lifetime_addtime; dst->usetime = src->sadb_lifetime_usetime; } return dst; } /* compare my own address * OUT: 1: true, i.e. my address. * 0: false */ int key_ismyaddr(struct sockaddr *sa) { IPSEC_ASSERT(sa != NULL, ("null sockaddr")); switch (sa->sa_family) { #ifdef INET case AF_INET: return (in_localip(satosin(sa)->sin_addr)); #endif #ifdef INET6 case AF_INET6: return key_ismyaddr6((struct sockaddr_in6 *)sa); #endif } return 0; } #ifdef INET6 /* * compare my own address for IPv6. * 1: ours * 0: other * NOTE: derived ip6_input() in KAME. This is necessary to modify more. */ #include static int key_ismyaddr6(sin6) struct sockaddr_in6 *sin6; { struct in6_ifaddr *ia; #if 0 struct in6_multi *in6m; #endif IN6_IFADDR_RLOCK(); TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { if (key_sockaddrcmp((struct sockaddr *)&sin6, (struct sockaddr *)&ia->ia_addr, 0) == 0) { IN6_IFADDR_RUNLOCK(); return 1; } #if 0 /* * XXX Multicast * XXX why do we care about multlicast here while we don't care * about IPv4 multicast?? * XXX scope */ in6m = NULL; IN6_LOOKUP_MULTI(sin6->sin6_addr, ia->ia_ifp, in6m); if (in6m) { IN6_IFADDR_RUNLOCK(); return 1; } #endif } IN6_IFADDR_RUNLOCK(); /* loopback, just for safety */ if (IN6_IS_ADDR_LOOPBACK(&sin6->sin6_addr)) return 1; return 0; } #endif /*INET6*/ /* * compare two secasindex structure. * flag can specify to compare 2 saidxes. * compare two secasindex structure without both mode and reqid. * don't compare port. * IN: * saidx0: source, it can be in SAD. * saidx1: object. * OUT: * 1 : equal * 0 : not equal */ static int key_cmpsaidx( const struct secasindex *saidx0, const struct secasindex *saidx1, int flag) { int chkport = 0; /* sanity */ if (saidx0 == NULL && saidx1 == NULL) return 1; if (saidx0 == NULL || saidx1 == NULL) return 0; if (saidx0->proto != saidx1->proto) return 0; if (flag == CMP_EXACTLY) { if (saidx0->mode != saidx1->mode) return 0; if (saidx0->reqid != saidx1->reqid) return 0; if (bcmp(&saidx0->src, &saidx1->src, saidx0->src.sa.sa_len) != 0 || bcmp(&saidx0->dst, &saidx1->dst, saidx0->dst.sa.sa_len) != 0) return 0; } else { /* CMP_MODE_REQID, CMP_REQID, CMP_HEAD */ if (flag == CMP_MODE_REQID ||flag == CMP_REQID) { /* * If reqid of SPD is non-zero, unique SA is required. * The result must be of same reqid in this case. */ if (saidx1->reqid != 0 && saidx0->reqid != saidx1->reqid) return 0; } if (flag == CMP_MODE_REQID) { if (saidx0->mode != IPSEC_MODE_ANY && saidx0->mode != saidx1->mode) return 0; } #ifdef IPSEC_NAT_T /* * If NAT-T is enabled, check ports for tunnel mode. * Do not check ports if they are set to zero in the SPD. * Also do not do it for native transport mode, as there * is no port information available in the SP. */ if ((saidx1->mode == IPSEC_MODE_TUNNEL || (saidx1->mode == IPSEC_MODE_TRANSPORT && saidx1->proto == IPPROTO_ESP)) && saidx1->src.sa.sa_family == AF_INET && saidx1->dst.sa.sa_family == AF_INET && ((const struct sockaddr_in *)(&saidx1->src))->sin_port && ((const struct sockaddr_in *)(&saidx1->dst))->sin_port) chkport = 1; #endif /* IPSEC_NAT_T */ if (key_sockaddrcmp(&saidx0->src.sa, &saidx1->src.sa, chkport) != 0) { return 0; } if (key_sockaddrcmp(&saidx0->dst.sa, &saidx1->dst.sa, chkport) != 0) { return 0; } } return 1; } /* * compare two secindex structure exactly. * IN: * spidx0: source, it is often in SPD. * spidx1: object, it is often from PFKEY message. * OUT: * 1 : equal * 0 : not equal */ static int key_cmpspidx_exactly( struct secpolicyindex *spidx0, struct secpolicyindex *spidx1) { /* sanity */ if (spidx0 == NULL && spidx1 == NULL) return 1; if (spidx0 == NULL || spidx1 == NULL) return 0; if (spidx0->prefs != spidx1->prefs || spidx0->prefd != spidx1->prefd || spidx0->ul_proto != spidx1->ul_proto) return 0; return key_sockaddrcmp(&spidx0->src.sa, &spidx1->src.sa, 1) == 0 && key_sockaddrcmp(&spidx0->dst.sa, &spidx1->dst.sa, 1) == 0; } /* * compare two secindex structure with mask. * IN: * spidx0: source, it is often in SPD. * spidx1: object, it is often from IP header. * OUT: * 1 : equal * 0 : not equal */ static int key_cmpspidx_withmask( struct secpolicyindex *spidx0, struct secpolicyindex *spidx1) { /* sanity */ if (spidx0 == NULL && spidx1 == NULL) return 1; if (spidx0 == NULL || spidx1 == NULL) return 0; if (spidx0->src.sa.sa_family != spidx1->src.sa.sa_family || spidx0->dst.sa.sa_family != spidx1->dst.sa.sa_family || spidx0->src.sa.sa_len != spidx1->src.sa.sa_len || spidx0->dst.sa.sa_len != spidx1->dst.sa.sa_len) return 0; /* if spidx.ul_proto == IPSEC_ULPROTO_ANY, ignore. */ if (spidx0->ul_proto != (u_int16_t)IPSEC_ULPROTO_ANY && spidx0->ul_proto != spidx1->ul_proto) return 0; switch (spidx0->src.sa.sa_family) { case AF_INET: if (spidx0->src.sin.sin_port != IPSEC_PORT_ANY && spidx0->src.sin.sin_port != spidx1->src.sin.sin_port) return 0; if (!key_bbcmp(&spidx0->src.sin.sin_addr, &spidx1->src.sin.sin_addr, spidx0->prefs)) return 0; break; case AF_INET6: if (spidx0->src.sin6.sin6_port != IPSEC_PORT_ANY && spidx0->src.sin6.sin6_port != spidx1->src.sin6.sin6_port) return 0; /* * scope_id check. if sin6_scope_id is 0, we regard it * as a wildcard scope, which matches any scope zone ID. */ if (spidx0->src.sin6.sin6_scope_id && spidx1->src.sin6.sin6_scope_id && spidx0->src.sin6.sin6_scope_id != spidx1->src.sin6.sin6_scope_id) return 0; if (!key_bbcmp(&spidx0->src.sin6.sin6_addr, &spidx1->src.sin6.sin6_addr, spidx0->prefs)) return 0; break; default: /* XXX */ if (bcmp(&spidx0->src, &spidx1->src, spidx0->src.sa.sa_len) != 0) return 0; break; } switch (spidx0->dst.sa.sa_family) { case AF_INET: if (spidx0->dst.sin.sin_port != IPSEC_PORT_ANY && spidx0->dst.sin.sin_port != spidx1->dst.sin.sin_port) return 0; if (!key_bbcmp(&spidx0->dst.sin.sin_addr, &spidx1->dst.sin.sin_addr, spidx0->prefd)) return 0; break; case AF_INET6: if (spidx0->dst.sin6.sin6_port != IPSEC_PORT_ANY && spidx0->dst.sin6.sin6_port != spidx1->dst.sin6.sin6_port) return 0; /* * scope_id check. if sin6_scope_id is 0, we regard it * as a wildcard scope, which matches any scope zone ID. */ if (spidx0->dst.sin6.sin6_scope_id && spidx1->dst.sin6.sin6_scope_id && spidx0->dst.sin6.sin6_scope_id != spidx1->dst.sin6.sin6_scope_id) return 0; if (!key_bbcmp(&spidx0->dst.sin6.sin6_addr, &spidx1->dst.sin6.sin6_addr, spidx0->prefd)) return 0; break; default: /* XXX */ if (bcmp(&spidx0->dst, &spidx1->dst, spidx0->dst.sa.sa_len) != 0) return 0; break; } /* XXX Do we check other field ? e.g. flowinfo */ return 1; } /* returns 0 on match */ static int key_sockaddrcmp( const struct sockaddr *sa1, const struct sockaddr *sa2, int port) { #ifdef satosin #undef satosin #endif #define satosin(s) ((const struct sockaddr_in *)s) #ifdef satosin6 #undef satosin6 #endif #define satosin6(s) ((const struct sockaddr_in6 *)s) if (sa1->sa_family != sa2->sa_family || sa1->sa_len != sa2->sa_len) return 1; switch (sa1->sa_family) { case AF_INET: if (sa1->sa_len != sizeof(struct sockaddr_in)) return 1; if (satosin(sa1)->sin_addr.s_addr != satosin(sa2)->sin_addr.s_addr) { return 1; } if (port && satosin(sa1)->sin_port != satosin(sa2)->sin_port) return 1; break; case AF_INET6: if (sa1->sa_len != sizeof(struct sockaddr_in6)) return 1; /*EINVAL*/ if (satosin6(sa1)->sin6_scope_id != satosin6(sa2)->sin6_scope_id) { return 1; } if (!IN6_ARE_ADDR_EQUAL(&satosin6(sa1)->sin6_addr, &satosin6(sa2)->sin6_addr)) { return 1; } if (port && satosin6(sa1)->sin6_port != satosin6(sa2)->sin6_port) { return 1; } break; default: if (bcmp(sa1, sa2, sa1->sa_len) != 0) return 1; break; } return 0; #undef satosin #undef satosin6 } /* * compare two buffers with mask. * IN: * addr1: source * addr2: object * bits: Number of bits to compare * OUT: * 1 : equal * 0 : not equal */ static int key_bbcmp(const void *a1, const void *a2, u_int bits) { const unsigned char *p1 = a1; const unsigned char *p2 = a2; /* XXX: This could be considerably faster if we compare a word * at a time, but it is complicated on LSB Endian machines */ /* Handle null pointers */ if (p1 == NULL || p2 == NULL) return (p1 == p2); while (bits >= 8) { if (*p1++ != *p2++) return 0; bits -= 8; } if (bits > 0) { u_int8_t mask = ~((1<<(8-bits))-1); if ((*p1 & mask) != (*p2 & mask)) return 0; } return 1; /* Match! */ } static void key_flush_spd(time_t now) { static u_int16_t sptree_scangen = 0; u_int16_t gen = sptree_scangen++; struct secpolicy *sp; u_int dir; /* SPD */ for (dir = 0; dir < IPSEC_DIR_MAX; dir++) { restart: SPTREE_LOCK(); LIST_FOREACH(sp, &V_sptree[dir], chain) { if (sp->scangen == gen) /* previously handled */ continue; sp->scangen = gen; if (sp->state == IPSEC_SPSTATE_DEAD && sp->refcnt == 1) { /* * Ensure that we only decrease refcnt once, * when we're the last consumer. * Directly call SP_DELREF/key_delsp instead * of KEY_FREESP to avoid unlocking/relocking * SPTREE_LOCK before key_delsp: may refcnt * be increased again during that time ? * NB: also clean entries created by * key_spdflush */ SP_DELREF(sp); key_delsp(sp); SPTREE_UNLOCK(); goto restart; } if (sp->lifetime == 0 && sp->validtime == 0) continue; if ((sp->lifetime && now - sp->created > sp->lifetime) || (sp->validtime && now - sp->lastused > sp->validtime)) { sp->state = IPSEC_SPSTATE_DEAD; SPTREE_UNLOCK(); key_spdexpire(sp); goto restart; } } SPTREE_UNLOCK(); } } static void key_flush_sad(time_t now) { struct secashead *sah, *nextsah; struct secasvar *sav, *nextsav; /* SAD */ SAHTREE_LOCK(); LIST_FOREACH_SAFE(sah, &V_sahtree, chain, nextsah) { /* if sah has been dead, then delete it and process next sah. */ if (sah->state == SADB_SASTATE_DEAD) { key_delsah(sah); continue; } /* if LARVAL entry doesn't become MATURE, delete it. */ LIST_FOREACH_SAFE(sav, &sah->savtree[SADB_SASTATE_LARVAL], chain, nextsav) { /* Need to also check refcnt for a larval SA ??? */ if (now - sav->created > V_key_larval_lifetime) KEY_FREESAV(&sav); } /* * check MATURE entry to start to send expire message * whether or not. */ LIST_FOREACH_SAFE(sav, &sah->savtree[SADB_SASTATE_MATURE], chain, nextsav) { /* we don't need to check. */ if (sav->lft_s == NULL) continue; /* sanity check */ if (sav->lft_c == NULL) { ipseclog((LOG_DEBUG,"%s: there is no CURRENT " "time, why?\n", __func__)); continue; } /* check SOFT lifetime */ if (sav->lft_s->addtime != 0 && now - sav->created > sav->lft_s->addtime) { key_sa_chgstate(sav, SADB_SASTATE_DYING); /* * Actually, only send expire message if * SA has been used, as it was done before, * but should we always send such message, * and let IKE daemon decide if it should be * renegotiated or not ? * XXX expire message will actually NOT be * sent if SA is only used after soft * lifetime has been reached, see below * (DYING state) */ if (sav->lft_c->usetime != 0) key_expire(sav); } /* check SOFT lifetime by bytes */ /* * XXX I don't know the way to delete this SA * when new SA is installed. Caution when it's * installed too big lifetime by time. */ else if (sav->lft_s->bytes != 0 && sav->lft_s->bytes < sav->lft_c->bytes) { key_sa_chgstate(sav, SADB_SASTATE_DYING); /* * XXX If we keep to send expire * message in the status of * DYING. Do remove below code. */ key_expire(sav); } } /* check DYING entry to change status to DEAD. */ LIST_FOREACH_SAFE(sav, &sah->savtree[SADB_SASTATE_DYING], chain, nextsav) { /* we don't need to check. */ if (sav->lft_h == NULL) continue; /* sanity check */ if (sav->lft_c == NULL) { ipseclog((LOG_DEBUG, "%s: there is no CURRENT " "time, why?\n", __func__)); continue; } if (sav->lft_h->addtime != 0 && now - sav->created > sav->lft_h->addtime) { key_sa_chgstate(sav, SADB_SASTATE_DEAD); KEY_FREESAV(&sav); } #if 0 /* XXX Should we keep to send expire message until HARD lifetime ? */ else if (sav->lft_s != NULL && sav->lft_s->addtime != 0 && now - sav->created > sav->lft_s->addtime) { /* * XXX: should be checked to be * installed the valid SA. */ /* * If there is no SA then sending * expire message. */ key_expire(sav); } #endif /* check HARD lifetime by bytes */ else if (sav->lft_h->bytes != 0 && sav->lft_h->bytes < sav->lft_c->bytes) { key_sa_chgstate(sav, SADB_SASTATE_DEAD); KEY_FREESAV(&sav); } } /* delete entry in DEAD */ LIST_FOREACH_SAFE(sav, &sah->savtree[SADB_SASTATE_DEAD], chain, nextsav) { /* sanity check */ if (sav->state != SADB_SASTATE_DEAD) { ipseclog((LOG_DEBUG, "%s: invalid sav->state " "(queue: %d SA: %d): kill it anyway\n", __func__, SADB_SASTATE_DEAD, sav->state)); } /* * do not call key_freesav() here. * sav should already be freed, and sav->refcnt * shows other references to sav * (such as from SPD). */ } } SAHTREE_UNLOCK(); } static void key_flush_acq(time_t now) { struct secacq *acq, *nextacq; /* ACQ tree */ ACQ_LOCK(); for (acq = LIST_FIRST(&V_acqtree); acq != NULL; acq = nextacq) { nextacq = LIST_NEXT(acq, chain); if (now - acq->created > V_key_blockacq_lifetime && __LIST_CHAINED(acq)) { LIST_REMOVE(acq, chain); free(acq, M_IPSEC_SAQ); } } ACQ_UNLOCK(); } static void key_flush_spacq(time_t now) { struct secspacq *acq, *nextacq; /* SP ACQ tree */ SPACQ_LOCK(); for (acq = LIST_FIRST(&V_spacqtree); acq != NULL; acq = nextacq) { nextacq = LIST_NEXT(acq, chain); if (now - acq->created > V_key_blockacq_lifetime && __LIST_CHAINED(acq)) { LIST_REMOVE(acq, chain); free(acq, M_IPSEC_SAQ); } } SPACQ_UNLOCK(); } /* * time handler. * scanning SPD and SAD to check status for each entries, * and do to remove or to expire. * XXX: year 2038 problem may remain. */ static void key_timehandler(void *arg) { VNET_ITERATOR_DECL(vnet_iter); time_t now = time_second; VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); key_flush_spd(now); key_flush_sad(now); key_flush_acq(now); key_flush_spacq(now); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); #ifndef IPSEC_DEBUG2 /* do exchange to tick time !! */ callout_schedule(&key_timer, hz); #endif /* IPSEC_DEBUG2 */ } u_long key_random() { u_long value; key_randomfill(&value, sizeof(value)); return value; } void key_randomfill(p, l) void *p; size_t l; { size_t n; u_long v; static int warn = 1; n = 0; n = (size_t)read_random(p, (u_int)l); /* last resort */ while (n < l) { v = random(); bcopy(&v, (u_int8_t *)p + n, l - n < sizeof(v) ? l - n : sizeof(v)); n += sizeof(v); if (warn) { printf("WARNING: pseudo-random number generator " "used for IPsec processing\n"); warn = 0; } } } /* * map SADB_SATYPE_* to IPPROTO_*. * if satype == SADB_SATYPE then satype is mapped to ~0. * OUT: * 0: invalid satype. */ static u_int16_t key_satype2proto(u_int8_t satype) { switch (satype) { case SADB_SATYPE_UNSPEC: return IPSEC_PROTO_ANY; case SADB_SATYPE_AH: return IPPROTO_AH; case SADB_SATYPE_ESP: return IPPROTO_ESP; case SADB_X_SATYPE_IPCOMP: return IPPROTO_IPCOMP; case SADB_X_SATYPE_TCPSIGNATURE: return IPPROTO_TCP; default: return 0; } /* NOTREACHED */ } /* * map IPPROTO_* to SADB_SATYPE_* * OUT: * 0: invalid protocol type. */ static u_int8_t key_proto2satype(u_int16_t proto) { switch (proto) { case IPPROTO_AH: return SADB_SATYPE_AH; case IPPROTO_ESP: return SADB_SATYPE_ESP; case IPPROTO_IPCOMP: return SADB_X_SATYPE_IPCOMP; case IPPROTO_TCP: return SADB_X_SATYPE_TCPSIGNATURE; default: return 0; } /* NOTREACHED */ } /* %%% PF_KEY */ /* * SADB_GETSPI processing is to receive * * from the IKMPd, to assign a unique spi value, to hang on the INBOUND * tree with the status of LARVAL, and send * * to the IKMPd. * * IN: mhp: pointer to the pointer to each header. * OUT: NULL if fail. * other if success, return pointer to the message to send. */ static int key_getspi(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { struct sadb_address *src0, *dst0; struct secasindex saidx; struct secashead *newsah; struct secasvar *newsav; u_int8_t proto; u_int32_t spi; u_int8_t mode; u_int32_t reqid; int error; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); if (mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL || mhp->ext[SADB_EXT_ADDRESS_DST] == NULL) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_X_EXT_SA2] != NULL) { mode = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_mode; reqid = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_reqid; } else { mode = IPSEC_MODE_ANY; reqid = 0; } src0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_SRC]); dst0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_DST]); /* map satype to proto */ if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) { ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } /* * Make sure the port numbers are zero. * In case of NAT-T we will update them later if needed. */ switch (((struct sockaddr *)(src0 + 1))->sa_family) { case AF_INET: if (((struct sockaddr *)(src0 + 1))->sa_len != sizeof(struct sockaddr_in)) return key_senderror(so, m, EINVAL); ((struct sockaddr_in *)(src0 + 1))->sin_port = 0; break; case AF_INET6: if (((struct sockaddr *)(src0 + 1))->sa_len != sizeof(struct sockaddr_in6)) return key_senderror(so, m, EINVAL); ((struct sockaddr_in6 *)(src0 + 1))->sin6_port = 0; break; default: ; /*???*/ } switch (((struct sockaddr *)(dst0 + 1))->sa_family) { case AF_INET: if (((struct sockaddr *)(dst0 + 1))->sa_len != sizeof(struct sockaddr_in)) return key_senderror(so, m, EINVAL); ((struct sockaddr_in *)(dst0 + 1))->sin_port = 0; break; case AF_INET6: if (((struct sockaddr *)(dst0 + 1))->sa_len != sizeof(struct sockaddr_in6)) return key_senderror(so, m, EINVAL); ((struct sockaddr_in6 *)(dst0 + 1))->sin6_port = 0; break; default: ; /*???*/ } /* XXX boundary check against sa_len */ KEY_SETSECASIDX(proto, mode, reqid, src0 + 1, dst0 + 1, &saidx); #ifdef IPSEC_NAT_T /* * Handle NAT-T info if present. * We made sure the port numbers are zero above, so we do * not have to worry in case we do not update them. */ if (mhp->ext[SADB_X_EXT_NAT_T_OAI] != NULL) ipseclog((LOG_DEBUG, "%s: NAT-T OAi present\n", __func__)); if (mhp->ext[SADB_X_EXT_NAT_T_OAR] != NULL) ipseclog((LOG_DEBUG, "%s: NAT-T OAr present\n", __func__)); if (mhp->ext[SADB_X_EXT_NAT_T_TYPE] != NULL && mhp->ext[SADB_X_EXT_NAT_T_SPORT] != NULL && mhp->ext[SADB_X_EXT_NAT_T_DPORT] != NULL) { struct sadb_x_nat_t_type *type; struct sadb_x_nat_t_port *sport, *dport; if (mhp->extlen[SADB_X_EXT_NAT_T_TYPE] < sizeof(*type) || mhp->extlen[SADB_X_EXT_NAT_T_SPORT] < sizeof(*sport) || mhp->extlen[SADB_X_EXT_NAT_T_DPORT] < sizeof(*dport)) { ipseclog((LOG_DEBUG, "%s: invalid nat-t message " "passed.\n", __func__)); return key_senderror(so, m, EINVAL); } sport = (struct sadb_x_nat_t_port *) mhp->ext[SADB_X_EXT_NAT_T_SPORT]; dport = (struct sadb_x_nat_t_port *) mhp->ext[SADB_X_EXT_NAT_T_DPORT]; if (sport) KEY_PORTTOSADDR(&saidx.src, sport->sadb_x_nat_t_port_port); if (dport) KEY_PORTTOSADDR(&saidx.dst, dport->sadb_x_nat_t_port_port); } #endif /* SPI allocation */ spi = key_do_getnewspi((struct sadb_spirange *)mhp->ext[SADB_EXT_SPIRANGE], &saidx); if (spi == 0) return key_senderror(so, m, EINVAL); /* get a SA index */ if ((newsah = key_getsah(&saidx)) == NULL) { /* create a new SA index */ if ((newsah = key_newsah(&saidx)) == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n",__func__)); return key_senderror(so, m, ENOBUFS); } } /* get a new SA */ /* XXX rewrite */ newsav = KEY_NEWSAV(m, mhp, newsah, &error); if (newsav == NULL) { /* XXX don't free new SA index allocated in above. */ return key_senderror(so, m, error); } /* set spi */ newsav->spi = htonl(spi); /* delete the entry in acqtree */ if (mhp->msg->sadb_msg_seq != 0) { struct secacq *acq; if ((acq = key_getacqbyseq(mhp->msg->sadb_msg_seq)) != NULL) { /* reset counter in order to deletion by timehandler. */ acq->created = time_second; acq->count = 0; } } { struct mbuf *n, *nn; struct sadb_sa *m_sa; struct sadb_msg *newmsg; int off, len; /* create new sadb_msg to reply. */ len = PFKEY_ALIGN8(sizeof(struct sadb_msg)) + PFKEY_ALIGN8(sizeof(struct sadb_sa)); MGETHDR(n, M_NOWAIT, MT_DATA); if (len > MHLEN) { MCLGET(n, M_NOWAIT); if ((n->m_flags & M_EXT) == 0) { m_freem(n); n = NULL; } } if (!n) return key_senderror(so, m, ENOBUFS); n->m_len = len; n->m_next = NULL; off = 0; m_copydata(m, 0, sizeof(struct sadb_msg), mtod(n, caddr_t) + off); off += PFKEY_ALIGN8(sizeof(struct sadb_msg)); m_sa = (struct sadb_sa *)(mtod(n, caddr_t) + off); m_sa->sadb_sa_len = PFKEY_UNIT64(sizeof(struct sadb_sa)); m_sa->sadb_sa_exttype = SADB_EXT_SA; m_sa->sadb_sa_spi = htonl(spi); off += PFKEY_ALIGN8(sizeof(struct sadb_sa)); IPSEC_ASSERT(off == len, ("length inconsistency (off %u len %u)", off, len)); n->m_next = key_gather_mbuf(m, mhp, 0, 2, SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST); if (!n->m_next) { m_freem(n); return key_senderror(so, m, ENOBUFS); } if (n->m_len < sizeof(struct sadb_msg)) { n = m_pullup(n, sizeof(struct sadb_msg)); if (n == NULL) return key_sendup_mbuf(so, m, KEY_SENDUP_ONE); } n->m_pkthdr.len = 0; for (nn = n; nn; nn = nn->m_next) n->m_pkthdr.len += nn->m_len; newmsg = mtod(n, struct sadb_msg *); newmsg->sadb_msg_seq = newsav->seq; newmsg->sadb_msg_errno = 0; newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len); m_freem(m); return key_sendup_mbuf(so, n, KEY_SENDUP_ONE); } } /* * allocating new SPI * called by key_getspi(). * OUT: * 0: failure. * others: success. */ static u_int32_t key_do_getnewspi(spirange, saidx) struct sadb_spirange *spirange; struct secasindex *saidx; { u_int32_t newspi; u_int32_t min, max; int count = V_key_spi_trycnt; /* set spi range to allocate */ if (spirange != NULL) { min = spirange->sadb_spirange_min; max = spirange->sadb_spirange_max; } else { min = V_key_spi_minval; max = V_key_spi_maxval; } /* IPCOMP needs 2-byte SPI */ if (saidx->proto == IPPROTO_IPCOMP) { u_int32_t t; if (min >= 0x10000) min = 0xffff; if (max >= 0x10000) max = 0xffff; if (min > max) { t = min; min = max; max = t; } } if (min == max) { if (key_checkspidup(saidx, min) != NULL) { ipseclog((LOG_DEBUG, "%s: SPI %u exists already.\n", __func__, min)); return 0; } count--; /* taking one cost. */ newspi = min; } else { /* init SPI */ newspi = 0; /* when requesting to allocate spi ranged */ while (count--) { /* generate pseudo-random SPI value ranged. */ newspi = min + (key_random() % (max - min + 1)); if (key_checkspidup(saidx, newspi) == NULL) break; } if (count == 0 || newspi == 0) { ipseclog((LOG_DEBUG, "%s: to allocate spi is failed.\n", __func__)); return 0; } } /* statistics */ keystat.getspi_count = (keystat.getspi_count + V_key_spi_trycnt - count) / 2; return newspi; } /* * SADB_UPDATE processing * receive * * from the ikmpd, and update a secasvar entry whose status is SADB_SASTATE_LARVAL. * and send * * to the ikmpd. * * m will always be freed. */ static int key_update(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { struct sadb_sa *sa0; struct sadb_address *src0, *dst0; #ifdef IPSEC_NAT_T struct sadb_x_nat_t_type *type; struct sadb_x_nat_t_port *sport, *dport; struct sadb_address *iaddr, *raddr; struct sadb_x_nat_t_frag *frag; #endif struct secasindex saidx; struct secashead *sah; struct secasvar *sav; u_int16_t proto; u_int8_t mode; u_int32_t reqid; int error; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* map satype to proto */ if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) { ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_EXT_SA] == NULL || mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL || mhp->ext[SADB_EXT_ADDRESS_DST] == NULL || (mhp->msg->sadb_msg_satype == SADB_SATYPE_ESP && mhp->ext[SADB_EXT_KEY_ENCRYPT] == NULL) || (mhp->msg->sadb_msg_satype == SADB_SATYPE_AH && mhp->ext[SADB_EXT_KEY_AUTH] == NULL) || (mhp->ext[SADB_EXT_LIFETIME_HARD] != NULL && mhp->ext[SADB_EXT_LIFETIME_SOFT] == NULL) || (mhp->ext[SADB_EXT_LIFETIME_HARD] == NULL && mhp->ext[SADB_EXT_LIFETIME_SOFT] != NULL)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->extlen[SADB_EXT_SA] < sizeof(struct sadb_sa) || mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_X_EXT_SA2] != NULL) { mode = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_mode; reqid = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_reqid; } else { mode = IPSEC_MODE_ANY; reqid = 0; } /* XXX boundary checking for other extensions */ sa0 = (struct sadb_sa *)mhp->ext[SADB_EXT_SA]; src0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_SRC]); dst0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_DST]); /* XXX boundary check against sa_len */ KEY_SETSECASIDX(proto, mode, reqid, src0 + 1, dst0 + 1, &saidx); /* * Make sure the port numbers are zero. * In case of NAT-T we will update them later if needed. */ KEY_PORTTOSADDR(&saidx.src, 0); KEY_PORTTOSADDR(&saidx.dst, 0); #ifdef IPSEC_NAT_T /* * Handle NAT-T info if present. */ if (mhp->ext[SADB_X_EXT_NAT_T_TYPE] != NULL && mhp->ext[SADB_X_EXT_NAT_T_SPORT] != NULL && mhp->ext[SADB_X_EXT_NAT_T_DPORT] != NULL) { if (mhp->extlen[SADB_X_EXT_NAT_T_TYPE] < sizeof(*type) || mhp->extlen[SADB_X_EXT_NAT_T_SPORT] < sizeof(*sport) || mhp->extlen[SADB_X_EXT_NAT_T_DPORT] < sizeof(*dport)) { ipseclog((LOG_DEBUG, "%s: invalid message.\n", __func__)); return key_senderror(so, m, EINVAL); } type = (struct sadb_x_nat_t_type *) mhp->ext[SADB_X_EXT_NAT_T_TYPE]; sport = (struct sadb_x_nat_t_port *) mhp->ext[SADB_X_EXT_NAT_T_SPORT]; dport = (struct sadb_x_nat_t_port *) mhp->ext[SADB_X_EXT_NAT_T_DPORT]; } else { type = 0; sport = dport = 0; } if (mhp->ext[SADB_X_EXT_NAT_T_OAI] != NULL && mhp->ext[SADB_X_EXT_NAT_T_OAR] != NULL) { if (mhp->extlen[SADB_X_EXT_NAT_T_OAI] < sizeof(*iaddr) || mhp->extlen[SADB_X_EXT_NAT_T_OAR] < sizeof(*raddr)) { ipseclog((LOG_DEBUG, "%s: invalid message\n", __func__)); return key_senderror(so, m, EINVAL); } iaddr = (struct sadb_address *)mhp->ext[SADB_X_EXT_NAT_T_OAI]; raddr = (struct sadb_address *)mhp->ext[SADB_X_EXT_NAT_T_OAR]; ipseclog((LOG_DEBUG, "%s: NAT-T OAi/r present\n", __func__)); } else { iaddr = raddr = NULL; } if (mhp->ext[SADB_X_EXT_NAT_T_FRAG] != NULL) { if (mhp->extlen[SADB_X_EXT_NAT_T_FRAG] < sizeof(*frag)) { ipseclog((LOG_DEBUG, "%s: invalid message\n", __func__)); return key_senderror(so, m, EINVAL); } frag = (struct sadb_x_nat_t_frag *) mhp->ext[SADB_X_EXT_NAT_T_FRAG]; } else { frag = 0; } #endif /* get a SA header */ if ((sah = key_getsah(&saidx)) == NULL) { ipseclog((LOG_DEBUG, "%s: no SA index found.\n", __func__)); return key_senderror(so, m, ENOENT); } /* set spidx if there */ /* XXX rewrite */ error = key_setident(sah, m, mhp); if (error) return key_senderror(so, m, error); /* find a SA with sequence number. */ #ifdef IPSEC_DOSEQCHECK if (mhp->msg->sadb_msg_seq != 0 && (sav = key_getsavbyseq(sah, mhp->msg->sadb_msg_seq)) == NULL) { ipseclog((LOG_DEBUG, "%s: no larval SA with sequence %u " "exists.\n", __func__, mhp->msg->sadb_msg_seq)); return key_senderror(so, m, ENOENT); } #else SAHTREE_LOCK(); sav = key_getsavbyspi(sah, sa0->sadb_sa_spi); SAHTREE_UNLOCK(); if (sav == NULL) { ipseclog((LOG_DEBUG, "%s: no such a SA found (spi:%u)\n", __func__, (u_int32_t)ntohl(sa0->sadb_sa_spi))); return key_senderror(so, m, EINVAL); } #endif /* validity check */ if (sav->sah->saidx.proto != proto) { ipseclog((LOG_DEBUG, "%s: protocol mismatched " "(DB=%u param=%u)\n", __func__, sav->sah->saidx.proto, proto)); return key_senderror(so, m, EINVAL); } #ifdef IPSEC_DOSEQCHECK if (sav->spi != sa0->sadb_sa_spi) { ipseclog((LOG_DEBUG, "%s: SPI mismatched (DB:%u param:%u)\n", __func__, (u_int32_t)ntohl(sav->spi), (u_int32_t)ntohl(sa0->sadb_sa_spi))); return key_senderror(so, m, EINVAL); } #endif if (sav->pid != mhp->msg->sadb_msg_pid) { ipseclog((LOG_DEBUG, "%s: pid mismatched (DB:%u param:%u)\n", __func__, sav->pid, mhp->msg->sadb_msg_pid)); return key_senderror(so, m, EINVAL); } /* copy sav values */ error = key_setsaval(sav, m, mhp); if (error) { KEY_FREESAV(&sav); return key_senderror(so, m, error); } #ifdef IPSEC_NAT_T /* * Handle more NAT-T info if present, * now that we have a sav to fill. */ if (type) sav->natt_type = type->sadb_x_nat_t_type_type; if (sport) KEY_PORTTOSADDR(&sav->sah->saidx.src, sport->sadb_x_nat_t_port_port); if (dport) KEY_PORTTOSADDR(&sav->sah->saidx.dst, dport->sadb_x_nat_t_port_port); #if 0 /* * In case SADB_X_EXT_NAT_T_FRAG was not given, leave it at 0. * We should actually check for a minimum MTU here, if we * want to support it in ip_output. */ if (frag) sav->natt_esp_frag_len = frag->sadb_x_nat_t_frag_fraglen; #endif #endif /* check SA values to be mature. */ if ((mhp->msg->sadb_msg_errno = key_mature(sav)) != 0) { KEY_FREESAV(&sav); return key_senderror(so, m, 0); } { struct mbuf *n; /* set msg buf from mhp */ n = key_getmsgbuf_x1(m, mhp); if (n == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); return key_senderror(so, m, ENOBUFS); } m_freem(m); return key_sendup_mbuf(so, n, KEY_SENDUP_ALL); } } /* * search SAD with sequence for a SA which state is SADB_SASTATE_LARVAL. * only called by key_update(). * OUT: * NULL : not found * others : found, pointer to a SA. */ #ifdef IPSEC_DOSEQCHECK static struct secasvar * key_getsavbyseq(sah, seq) struct secashead *sah; u_int32_t seq; { struct secasvar *sav; u_int state; state = SADB_SASTATE_LARVAL; /* search SAD with sequence number ? */ LIST_FOREACH(sav, &sah->savtree[state], chain) { KEY_CHKSASTATE(state, sav->state, __func__); if (sav->seq == seq) { sa_addref(sav); KEYDEBUG(KEYDEBUG_IPSEC_STAMP, printf("DP %s cause refcnt++:%d SA:%p\n", __func__, sav->refcnt, sav)); return sav; } } return NULL; } #endif /* * SADB_ADD processing * add an entry to SA database, when received * * from the ikmpd, * and send * * to the ikmpd. * * IGNORE identity and sensitivity messages. * * m will always be freed. */ static int key_add(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { struct sadb_sa *sa0; struct sadb_address *src0, *dst0; #ifdef IPSEC_NAT_T struct sadb_x_nat_t_type *type; struct sadb_address *iaddr, *raddr; struct sadb_x_nat_t_frag *frag; #endif struct secasindex saidx; struct secashead *newsah; struct secasvar *newsav; u_int16_t proto; u_int8_t mode; u_int32_t reqid; int error; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* map satype to proto */ if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) { ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_EXT_SA] == NULL || mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL || mhp->ext[SADB_EXT_ADDRESS_DST] == NULL || (mhp->msg->sadb_msg_satype == SADB_SATYPE_ESP && mhp->ext[SADB_EXT_KEY_ENCRYPT] == NULL) || (mhp->msg->sadb_msg_satype == SADB_SATYPE_AH && mhp->ext[SADB_EXT_KEY_AUTH] == NULL) || (mhp->ext[SADB_EXT_LIFETIME_HARD] != NULL && mhp->ext[SADB_EXT_LIFETIME_SOFT] == NULL) || (mhp->ext[SADB_EXT_LIFETIME_HARD] == NULL && mhp->ext[SADB_EXT_LIFETIME_SOFT] != NULL)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->extlen[SADB_EXT_SA] < sizeof(struct sadb_sa) || mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address)) { /* XXX need more */ ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_X_EXT_SA2] != NULL) { mode = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_mode; reqid = ((struct sadb_x_sa2 *)mhp->ext[SADB_X_EXT_SA2])->sadb_x_sa2_reqid; } else { mode = IPSEC_MODE_ANY; reqid = 0; } sa0 = (struct sadb_sa *)mhp->ext[SADB_EXT_SA]; src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC]; dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST]; /* XXX boundary check against sa_len */ KEY_SETSECASIDX(proto, mode, reqid, src0 + 1, dst0 + 1, &saidx); /* * Make sure the port numbers are zero. * In case of NAT-T we will update them later if needed. */ KEY_PORTTOSADDR(&saidx.src, 0); KEY_PORTTOSADDR(&saidx.dst, 0); #ifdef IPSEC_NAT_T /* * Handle NAT-T info if present. */ if (mhp->ext[SADB_X_EXT_NAT_T_TYPE] != NULL && mhp->ext[SADB_X_EXT_NAT_T_SPORT] != NULL && mhp->ext[SADB_X_EXT_NAT_T_DPORT] != NULL) { struct sadb_x_nat_t_port *sport, *dport; if (mhp->extlen[SADB_X_EXT_NAT_T_TYPE] < sizeof(*type) || mhp->extlen[SADB_X_EXT_NAT_T_SPORT] < sizeof(*sport) || mhp->extlen[SADB_X_EXT_NAT_T_DPORT] < sizeof(*dport)) { ipseclog((LOG_DEBUG, "%s: invalid message.\n", __func__)); return key_senderror(so, m, EINVAL); } type = (struct sadb_x_nat_t_type *) mhp->ext[SADB_X_EXT_NAT_T_TYPE]; sport = (struct sadb_x_nat_t_port *) mhp->ext[SADB_X_EXT_NAT_T_SPORT]; dport = (struct sadb_x_nat_t_port *) mhp->ext[SADB_X_EXT_NAT_T_DPORT]; if (sport) KEY_PORTTOSADDR(&saidx.src, sport->sadb_x_nat_t_port_port); if (dport) KEY_PORTTOSADDR(&saidx.dst, dport->sadb_x_nat_t_port_port); } else { type = 0; } if (mhp->ext[SADB_X_EXT_NAT_T_OAI] != NULL && mhp->ext[SADB_X_EXT_NAT_T_OAR] != NULL) { if (mhp->extlen[SADB_X_EXT_NAT_T_OAI] < sizeof(*iaddr) || mhp->extlen[SADB_X_EXT_NAT_T_OAR] < sizeof(*raddr)) { ipseclog((LOG_DEBUG, "%s: invalid message\n", __func__)); return key_senderror(so, m, EINVAL); } iaddr = (struct sadb_address *)mhp->ext[SADB_X_EXT_NAT_T_OAI]; raddr = (struct sadb_address *)mhp->ext[SADB_X_EXT_NAT_T_OAR]; ipseclog((LOG_DEBUG, "%s: NAT-T OAi/r present\n", __func__)); } else { iaddr = raddr = NULL; } if (mhp->ext[SADB_X_EXT_NAT_T_FRAG] != NULL) { if (mhp->extlen[SADB_X_EXT_NAT_T_FRAG] < sizeof(*frag)) { ipseclog((LOG_DEBUG, "%s: invalid message\n", __func__)); return key_senderror(so, m, EINVAL); } frag = (struct sadb_x_nat_t_frag *) mhp->ext[SADB_X_EXT_NAT_T_FRAG]; } else { frag = 0; } #endif /* get a SA header */ if ((newsah = key_getsah(&saidx)) == NULL) { /* create a new SA header */ if ((newsah = key_newsah(&saidx)) == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n",__func__)); return key_senderror(so, m, ENOBUFS); } } /* set spidx if there */ /* XXX rewrite */ error = key_setident(newsah, m, mhp); if (error) { return key_senderror(so, m, error); } /* create new SA entry. */ /* We can create new SA only if SPI is differenct. */ SAHTREE_LOCK(); newsav = key_getsavbyspi(newsah, sa0->sadb_sa_spi); SAHTREE_UNLOCK(); if (newsav != NULL) { ipseclog((LOG_DEBUG, "%s: SA already exists.\n", __func__)); return key_senderror(so, m, EEXIST); } newsav = KEY_NEWSAV(m, mhp, newsah, &error); if (newsav == NULL) { return key_senderror(so, m, error); } #ifdef IPSEC_NAT_T /* * Handle more NAT-T info if present, * now that we have a sav to fill. */ if (type) newsav->natt_type = type->sadb_x_nat_t_type_type; #if 0 /* * In case SADB_X_EXT_NAT_T_FRAG was not given, leave it at 0. * We should actually check for a minimum MTU here, if we * want to support it in ip_output. */ if (frag) newsav->natt_esp_frag_len = frag->sadb_x_nat_t_frag_fraglen; #endif #endif /* check SA values to be mature. */ if ((error = key_mature(newsav)) != 0) { KEY_FREESAV(&newsav); return key_senderror(so, m, error); } /* * don't call key_freesav() here, as we would like to keep the SA * in the database on success. */ { struct mbuf *n; /* set msg buf from mhp */ n = key_getmsgbuf_x1(m, mhp); if (n == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); return key_senderror(so, m, ENOBUFS); } m_freem(m); return key_sendup_mbuf(so, n, KEY_SENDUP_ALL); } } /* m is retained */ static int key_setident(sah, m, mhp) struct secashead *sah; struct mbuf *m; const struct sadb_msghdr *mhp; { const struct sadb_ident *idsrc, *iddst; int idsrclen, iddstlen; IPSEC_ASSERT(sah != NULL, ("null secashead")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* don't make buffer if not there */ if (mhp->ext[SADB_EXT_IDENTITY_SRC] == NULL && mhp->ext[SADB_EXT_IDENTITY_DST] == NULL) { sah->idents = NULL; sah->identd = NULL; return 0; } if (mhp->ext[SADB_EXT_IDENTITY_SRC] == NULL || mhp->ext[SADB_EXT_IDENTITY_DST] == NULL) { ipseclog((LOG_DEBUG, "%s: invalid identity.\n", __func__)); return EINVAL; } idsrc = (const struct sadb_ident *)mhp->ext[SADB_EXT_IDENTITY_SRC]; iddst = (const struct sadb_ident *)mhp->ext[SADB_EXT_IDENTITY_DST]; idsrclen = mhp->extlen[SADB_EXT_IDENTITY_SRC]; iddstlen = mhp->extlen[SADB_EXT_IDENTITY_DST]; /* validity check */ if (idsrc->sadb_ident_type != iddst->sadb_ident_type) { ipseclog((LOG_DEBUG, "%s: ident type mismatch.\n", __func__)); return EINVAL; } switch (idsrc->sadb_ident_type) { case SADB_IDENTTYPE_PREFIX: case SADB_IDENTTYPE_FQDN: case SADB_IDENTTYPE_USERFQDN: default: /* XXX do nothing */ sah->idents = NULL; sah->identd = NULL; return 0; } /* make structure */ sah->idents = malloc(sizeof(struct secident), M_IPSEC_MISC, M_NOWAIT); if (sah->idents == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); return ENOBUFS; } sah->identd = malloc(sizeof(struct secident), M_IPSEC_MISC, M_NOWAIT); if (sah->identd == NULL) { free(sah->idents, M_IPSEC_MISC); sah->idents = NULL; ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); return ENOBUFS; } sah->idents->type = idsrc->sadb_ident_type; sah->idents->id = idsrc->sadb_ident_id; sah->identd->type = iddst->sadb_ident_type; sah->identd->id = iddst->sadb_ident_id; return 0; } /* * m will not be freed on return. * it is caller's responsibility to free the result. */ static struct mbuf * key_getmsgbuf_x1(m, mhp) struct mbuf *m; const struct sadb_msghdr *mhp; { struct mbuf *n; IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* create new sadb_msg to reply. */ n = key_gather_mbuf(m, mhp, 1, 9, SADB_EXT_RESERVED, SADB_EXT_SA, SADB_X_EXT_SA2, SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST, SADB_EXT_LIFETIME_HARD, SADB_EXT_LIFETIME_SOFT, SADB_EXT_IDENTITY_SRC, SADB_EXT_IDENTITY_DST); if (!n) return NULL; if (n->m_len < sizeof(struct sadb_msg)) { n = m_pullup(n, sizeof(struct sadb_msg)); if (n == NULL) return NULL; } mtod(n, struct sadb_msg *)->sadb_msg_errno = 0; mtod(n, struct sadb_msg *)->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len); return n; } static int key_delete_all __P((struct socket *, struct mbuf *, const struct sadb_msghdr *, u_int16_t)); /* * SADB_DELETE processing * receive * * from the ikmpd, and set SADB_SASTATE_DEAD, * and send, * * to the ikmpd. * * m will always be freed. */ static int key_delete(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { struct sadb_sa *sa0; struct sadb_address *src0, *dst0; struct secasindex saidx; struct secashead *sah; struct secasvar *sav = NULL; u_int16_t proto; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* map satype to proto */ if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) { ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL || mhp->ext[SADB_EXT_ADDRESS_DST] == NULL) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_EXT_SA] == NULL) { /* * Caller wants us to delete all non-LARVAL SAs * that match the src/dst. This is used during * IKE INITIAL-CONTACT. */ ipseclog((LOG_DEBUG, "%s: doing delete all.\n", __func__)); return key_delete_all(so, m, mhp, proto); } else if (mhp->extlen[SADB_EXT_SA] < sizeof(struct sadb_sa)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } sa0 = (struct sadb_sa *)mhp->ext[SADB_EXT_SA]; src0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_SRC]); dst0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_DST]); /* XXX boundary check against sa_len */ KEY_SETSECASIDX(proto, IPSEC_MODE_ANY, 0, src0 + 1, dst0 + 1, &saidx); /* * Make sure the port numbers are zero. * In case of NAT-T we will update them later if needed. */ KEY_PORTTOSADDR(&saidx.src, 0); KEY_PORTTOSADDR(&saidx.dst, 0); #ifdef IPSEC_NAT_T /* * Handle NAT-T info if present. */ if (mhp->ext[SADB_X_EXT_NAT_T_SPORT] != NULL && mhp->ext[SADB_X_EXT_NAT_T_DPORT] != NULL) { struct sadb_x_nat_t_port *sport, *dport; if (mhp->extlen[SADB_X_EXT_NAT_T_SPORT] < sizeof(*sport) || mhp->extlen[SADB_X_EXT_NAT_T_DPORT] < sizeof(*dport)) { ipseclog((LOG_DEBUG, "%s: invalid message.\n", __func__)); return key_senderror(so, m, EINVAL); } sport = (struct sadb_x_nat_t_port *) mhp->ext[SADB_X_EXT_NAT_T_SPORT]; dport = (struct sadb_x_nat_t_port *) mhp->ext[SADB_X_EXT_NAT_T_DPORT]; if (sport) KEY_PORTTOSADDR(&saidx.src, sport->sadb_x_nat_t_port_port); if (dport) KEY_PORTTOSADDR(&saidx.dst, dport->sadb_x_nat_t_port_port); } #endif /* get a SA header */ SAHTREE_LOCK(); LIST_FOREACH(sah, &V_sahtree, chain) { if (sah->state == SADB_SASTATE_DEAD) continue; if (key_cmpsaidx(&sah->saidx, &saidx, CMP_HEAD) == 0) continue; /* get a SA with SPI. */ sav = key_getsavbyspi(sah, sa0->sadb_sa_spi); if (sav) break; } if (sah == NULL) { SAHTREE_UNLOCK(); ipseclog((LOG_DEBUG, "%s: no SA found.\n", __func__)); return key_senderror(so, m, ENOENT); } key_sa_chgstate(sav, SADB_SASTATE_DEAD); KEY_FREESAV(&sav); SAHTREE_UNLOCK(); { struct mbuf *n; struct sadb_msg *newmsg; /* create new sadb_msg to reply. */ /* XXX-BZ NAT-T extensions? */ n = key_gather_mbuf(m, mhp, 1, 4, SADB_EXT_RESERVED, SADB_EXT_SA, SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST); if (!n) return key_senderror(so, m, ENOBUFS); if (n->m_len < sizeof(struct sadb_msg)) { n = m_pullup(n, sizeof(struct sadb_msg)); if (n == NULL) return key_senderror(so, m, ENOBUFS); } newmsg = mtod(n, struct sadb_msg *); newmsg->sadb_msg_errno = 0; newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len); m_freem(m); return key_sendup_mbuf(so, n, KEY_SENDUP_ALL); } } /* * delete all SAs for src/dst. Called from key_delete(). */ static int key_delete_all(struct socket *so, struct mbuf *m, const struct sadb_msghdr *mhp, u_int16_t proto) { struct sadb_address *src0, *dst0; struct secasindex saidx; struct secashead *sah; struct secasvar *sav, *nextsav; u_int stateidx, state; src0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_SRC]); dst0 = (struct sadb_address *)(mhp->ext[SADB_EXT_ADDRESS_DST]); /* XXX boundary check against sa_len */ KEY_SETSECASIDX(proto, IPSEC_MODE_ANY, 0, src0 + 1, dst0 + 1, &saidx); /* * Make sure the port numbers are zero. * In case of NAT-T we will update them later if needed. */ KEY_PORTTOSADDR(&saidx.src, 0); KEY_PORTTOSADDR(&saidx.dst, 0); #ifdef IPSEC_NAT_T /* * Handle NAT-T info if present. */ if (mhp->ext[SADB_X_EXT_NAT_T_SPORT] != NULL && mhp->ext[SADB_X_EXT_NAT_T_DPORT] != NULL) { struct sadb_x_nat_t_port *sport, *dport; if (mhp->extlen[SADB_X_EXT_NAT_T_SPORT] < sizeof(*sport) || mhp->extlen[SADB_X_EXT_NAT_T_DPORT] < sizeof(*dport)) { ipseclog((LOG_DEBUG, "%s: invalid message.\n", __func__)); return key_senderror(so, m, EINVAL); } sport = (struct sadb_x_nat_t_port *) mhp->ext[SADB_X_EXT_NAT_T_SPORT]; dport = (struct sadb_x_nat_t_port *) mhp->ext[SADB_X_EXT_NAT_T_DPORT]; if (sport) KEY_PORTTOSADDR(&saidx.src, sport->sadb_x_nat_t_port_port); if (dport) KEY_PORTTOSADDR(&saidx.dst, dport->sadb_x_nat_t_port_port); } #endif SAHTREE_LOCK(); LIST_FOREACH(sah, &V_sahtree, chain) { if (sah->state == SADB_SASTATE_DEAD) continue; if (key_cmpsaidx(&sah->saidx, &saidx, CMP_HEAD) == 0) continue; /* Delete all non-LARVAL SAs. */ for (stateidx = 0; stateidx < _ARRAYLEN(saorder_state_alive); stateidx++) { state = saorder_state_alive[stateidx]; if (state == SADB_SASTATE_LARVAL) continue; for (sav = LIST_FIRST(&sah->savtree[state]); sav != NULL; sav = nextsav) { nextsav = LIST_NEXT(sav, chain); /* sanity check */ if (sav->state != state) { ipseclog((LOG_DEBUG, "%s: invalid " "sav->state (queue %d SA %d)\n", __func__, state, sav->state)); continue; } key_sa_chgstate(sav, SADB_SASTATE_DEAD); KEY_FREESAV(&sav); } } } SAHTREE_UNLOCK(); { struct mbuf *n; struct sadb_msg *newmsg; /* create new sadb_msg to reply. */ /* XXX-BZ NAT-T extensions? */ n = key_gather_mbuf(m, mhp, 1, 3, SADB_EXT_RESERVED, SADB_EXT_ADDRESS_SRC, SADB_EXT_ADDRESS_DST); if (!n) return key_senderror(so, m, ENOBUFS); if (n->m_len < sizeof(struct sadb_msg)) { n = m_pullup(n, sizeof(struct sadb_msg)); if (n == NULL) return key_senderror(so, m, ENOBUFS); } newmsg = mtod(n, struct sadb_msg *); newmsg->sadb_msg_errno = 0; newmsg->sadb_msg_len = PFKEY_UNIT64(n->m_pkthdr.len); m_freem(m); return key_sendup_mbuf(so, n, KEY_SENDUP_ALL); } } /* * SADB_GET processing * receive * * from the ikmpd, and get a SP and a SA to respond, * and send, * * to the ikmpd. * * m will always be freed. */ static int key_get(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { struct sadb_sa *sa0; struct sadb_address *src0, *dst0; struct secasindex saidx; struct secashead *sah; struct secasvar *sav = NULL; u_int16_t proto; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* map satype to proto */ if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) { ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_EXT_SA] == NULL || mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL || mhp->ext[SADB_EXT_ADDRESS_DST] == NULL) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->extlen[SADB_EXT_SA] < sizeof(struct sadb_sa) || mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address)) { ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } sa0 = (struct sadb_sa *)mhp->ext[SADB_EXT_SA]; src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC]; dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST]; /* XXX boundary check against sa_len */ KEY_SETSECASIDX(proto, IPSEC_MODE_ANY, 0, src0 + 1, dst0 + 1, &saidx); /* * Make sure the port numbers are zero. * In case of NAT-T we will update them later if needed. */ KEY_PORTTOSADDR(&saidx.src, 0); KEY_PORTTOSADDR(&saidx.dst, 0); #ifdef IPSEC_NAT_T /* * Handle NAT-T info if present. */ if (mhp->ext[SADB_X_EXT_NAT_T_SPORT] != NULL && mhp->ext[SADB_X_EXT_NAT_T_DPORT] != NULL) { struct sadb_x_nat_t_port *sport, *dport; if (mhp->extlen[SADB_X_EXT_NAT_T_SPORT] < sizeof(*sport) || mhp->extlen[SADB_X_EXT_NAT_T_DPORT] < sizeof(*dport)) { ipseclog((LOG_DEBUG, "%s: invalid message.\n", __func__)); return key_senderror(so, m, EINVAL); } sport = (struct sadb_x_nat_t_port *) mhp->ext[SADB_X_EXT_NAT_T_SPORT]; dport = (struct sadb_x_nat_t_port *) mhp->ext[SADB_X_EXT_NAT_T_DPORT]; if (sport) KEY_PORTTOSADDR(&saidx.src, sport->sadb_x_nat_t_port_port); if (dport) KEY_PORTTOSADDR(&saidx.dst, dport->sadb_x_nat_t_port_port); } #endif /* get a SA header */ SAHTREE_LOCK(); LIST_FOREACH(sah, &V_sahtree, chain) { if (sah->state == SADB_SASTATE_DEAD) continue; if (key_cmpsaidx(&sah->saidx, &saidx, CMP_HEAD) == 0) continue; /* get a SA with SPI. */ sav = key_getsavbyspi(sah, sa0->sadb_sa_spi); if (sav) break; } SAHTREE_UNLOCK(); if (sah == NULL) { ipseclog((LOG_DEBUG, "%s: no SA found.\n", __func__)); return key_senderror(so, m, ENOENT); } { struct mbuf *n; u_int8_t satype; /* map proto to satype */ if ((satype = key_proto2satype(sah->saidx.proto)) == 0) { ipseclog((LOG_DEBUG, "%s: there was invalid proto in SAD.\n", __func__)); return key_senderror(so, m, EINVAL); } /* create new sadb_msg to reply. */ n = key_setdumpsa(sav, SADB_GET, satype, mhp->msg->sadb_msg_seq, mhp->msg->sadb_msg_pid); if (!n) return key_senderror(so, m, ENOBUFS); m_freem(m); return key_sendup_mbuf(so, n, KEY_SENDUP_ONE); } } /* XXX make it sysctl-configurable? */ static void key_getcomb_setlifetime(comb) struct sadb_comb *comb; { comb->sadb_comb_soft_allocations = 1; comb->sadb_comb_hard_allocations = 1; comb->sadb_comb_soft_bytes = 0; comb->sadb_comb_hard_bytes = 0; comb->sadb_comb_hard_addtime = 86400; /* 1 day */ comb->sadb_comb_soft_addtime = comb->sadb_comb_soft_addtime * 80 / 100; comb->sadb_comb_soft_usetime = 28800; /* 8 hours */ comb->sadb_comb_hard_usetime = comb->sadb_comb_hard_usetime * 80 / 100; } /* * XXX reorder combinations by preference * XXX no idea if the user wants ESP authentication or not */ static struct mbuf * key_getcomb_esp() { struct sadb_comb *comb; struct enc_xform *algo; struct mbuf *result = NULL, *m, *n; int encmin; int i, off, o; int totlen; const int l = PFKEY_ALIGN8(sizeof(struct sadb_comb)); m = NULL; for (i = 1; i <= SADB_EALG_MAX; i++) { algo = esp_algorithm_lookup(i); if (algo == NULL) continue; /* discard algorithms with key size smaller than system min */ if (_BITS(algo->maxkey) < V_ipsec_esp_keymin) continue; if (_BITS(algo->minkey) < V_ipsec_esp_keymin) encmin = V_ipsec_esp_keymin; else encmin = _BITS(algo->minkey); if (V_ipsec_esp_auth) m = key_getcomb_ah(); else { IPSEC_ASSERT(l <= MLEN, ("l=%u > MLEN=%lu", l, (u_long) MLEN)); MGET(m, M_NOWAIT, MT_DATA); if (m) { M_ALIGN(m, l); m->m_len = l; m->m_next = NULL; bzero(mtod(m, caddr_t), m->m_len); } } if (!m) goto fail; totlen = 0; for (n = m; n; n = n->m_next) totlen += n->m_len; IPSEC_ASSERT((totlen % l) == 0, ("totlen=%u, l=%u", totlen, l)); for (off = 0; off < totlen; off += l) { n = m_pulldown(m, off, l, &o); if (!n) { /* m is already freed */ goto fail; } comb = (struct sadb_comb *)(mtod(n, caddr_t) + o); bzero(comb, sizeof(*comb)); key_getcomb_setlifetime(comb); comb->sadb_comb_encrypt = i; comb->sadb_comb_encrypt_minbits = encmin; comb->sadb_comb_encrypt_maxbits = _BITS(algo->maxkey); } if (!result) result = m; else m_cat(result, m); } return result; fail: if (result) m_freem(result); return NULL; } static void key_getsizes_ah( const struct auth_hash *ah, int alg, u_int16_t* min, u_int16_t* max) { *min = *max = ah->keysize; if (ah->keysize == 0) { /* * Transform takes arbitrary key size but algorithm * key size is restricted. Enforce this here. */ switch (alg) { case SADB_X_AALG_MD5: *min = *max = 16; break; case SADB_X_AALG_SHA: *min = *max = 20; break; case SADB_X_AALG_NULL: *min = 1; *max = 256; break; case SADB_X_AALG_SHA2_256: *min = *max = 32; break; case SADB_X_AALG_SHA2_384: *min = *max = 48; break; case SADB_X_AALG_SHA2_512: *min = *max = 64; break; default: DPRINTF(("%s: unknown AH algorithm %u\n", __func__, alg)); break; } } } /* * XXX reorder combinations by preference */ static struct mbuf * key_getcomb_ah() { struct sadb_comb *comb; struct auth_hash *algo; struct mbuf *m; u_int16_t minkeysize, maxkeysize; int i; const int l = PFKEY_ALIGN8(sizeof(struct sadb_comb)); m = NULL; for (i = 1; i <= SADB_AALG_MAX; i++) { #if 1 /* we prefer HMAC algorithms, not old algorithms */ if (i != SADB_AALG_SHA1HMAC && i != SADB_AALG_MD5HMAC && i != SADB_X_AALG_SHA2_256 && i != SADB_X_AALG_SHA2_384 && i != SADB_X_AALG_SHA2_512) continue; #endif algo = ah_algorithm_lookup(i); if (!algo) continue; key_getsizes_ah(algo, i, &minkeysize, &maxkeysize); /* discard algorithms with key size smaller than system min */ if (_BITS(minkeysize) < V_ipsec_ah_keymin) continue; if (!m) { IPSEC_ASSERT(l <= MLEN, ("l=%u > MLEN=%lu", l, (u_long) MLEN)); MGET(m, M_NOWAIT, MT_DATA); if (m) { M_ALIGN(m, l); m->m_len = l; m->m_next = NULL; } } else M_PREPEND(m, l, M_NOWAIT); if (!m) return NULL; comb = mtod(m, struct sadb_comb *); bzero(comb, sizeof(*comb)); key_getcomb_setlifetime(comb); comb->sadb_comb_auth = i; comb->sadb_comb_auth_minbits = _BITS(minkeysize); comb->sadb_comb_auth_maxbits = _BITS(maxkeysize); } return m; } /* * not really an official behavior. discussed in pf_key@inner.net in Sep2000. * XXX reorder combinations by preference */ static struct mbuf * key_getcomb_ipcomp() { struct sadb_comb *comb; struct comp_algo *algo; struct mbuf *m; int i; const int l = PFKEY_ALIGN8(sizeof(struct sadb_comb)); m = NULL; for (i = 1; i <= SADB_X_CALG_MAX; i++) { algo = ipcomp_algorithm_lookup(i); if (!algo) continue; if (!m) { IPSEC_ASSERT(l <= MLEN, ("l=%u > MLEN=%lu", l, (u_long) MLEN)); MGET(m, M_NOWAIT, MT_DATA); if (m) { M_ALIGN(m, l); m->m_len = l; m->m_next = NULL; } } else M_PREPEND(m, l, M_NOWAIT); if (!m) return NULL; comb = mtod(m, struct sadb_comb *); bzero(comb, sizeof(*comb)); key_getcomb_setlifetime(comb); comb->sadb_comb_encrypt = i; /* what should we set into sadb_comb_*_{min,max}bits? */ } return m; } /* * XXX no way to pass mode (transport/tunnel) to userland * XXX replay checking? * XXX sysctl interface to ipsec_{ah,esp}_keymin */ static struct mbuf * key_getprop(saidx) const struct secasindex *saidx; { struct sadb_prop *prop; struct mbuf *m, *n; const int l = PFKEY_ALIGN8(sizeof(struct sadb_prop)); int totlen; switch (saidx->proto) { case IPPROTO_ESP: m = key_getcomb_esp(); break; case IPPROTO_AH: m = key_getcomb_ah(); break; case IPPROTO_IPCOMP: m = key_getcomb_ipcomp(); break; default: return NULL; } if (!m) return NULL; M_PREPEND(m, l, M_NOWAIT); if (!m) return NULL; totlen = 0; for (n = m; n; n = n->m_next) totlen += n->m_len; prop = mtod(m, struct sadb_prop *); bzero(prop, sizeof(*prop)); prop->sadb_prop_len = PFKEY_UNIT64(totlen); prop->sadb_prop_exttype = SADB_EXT_PROPOSAL; prop->sadb_prop_replay = 32; /* XXX */ return m; } /* * SADB_ACQUIRE processing called by key_checkrequest() and key_acquire2(). * send * * to KMD, and expect to receive * with SADB_ACQUIRE if error occured, * or * with SADB_GETSPI * from KMD by PF_KEY. * * XXX x_policy is outside of RFC2367 (KAME extension). * XXX sensitivity is not supported. * XXX for ipcomp, RFC2367 does not define how to fill in proposal. * see comment for key_getcomb_ipcomp(). * * OUT: * 0 : succeed * others: error number */ static int key_acquire(const struct secasindex *saidx, struct secpolicy *sp) { struct mbuf *result = NULL, *m; struct secacq *newacq; u_int8_t satype; int error = -1; u_int32_t seq; IPSEC_ASSERT(saidx != NULL, ("null saidx")); satype = key_proto2satype(saidx->proto); IPSEC_ASSERT(satype != 0, ("null satype, protocol %u", saidx->proto)); /* * We never do anything about acquirng SA. There is anather * solution that kernel blocks to send SADB_ACQUIRE message until * getting something message from IKEd. In later case, to be * managed with ACQUIRING list. */ /* Get an entry to check whether sending message or not. */ if ((newacq = key_getacq(saidx)) != NULL) { if (V_key_blockacq_count < newacq->count) { /* reset counter and do send message. */ newacq->count = 0; } else { /* increment counter and do nothing. */ newacq->count++; return 0; } } else { /* make new entry for blocking to send SADB_ACQUIRE. */ if ((newacq = key_newacq(saidx)) == NULL) return ENOBUFS; } seq = newacq->seq; m = key_setsadbmsg(SADB_ACQUIRE, 0, satype, seq, 0, 0); if (!m) { error = ENOBUFS; goto fail; } result = m; /* * No SADB_X_EXT_NAT_T_* here: we do not know * anything related to NAT-T at this time. */ /* set sadb_address for saidx's. */ m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC, &saidx->src.sa, FULLMASK, IPSEC_ULPROTO_ANY); if (!m) { error = ENOBUFS; goto fail; } m_cat(result, m); m = key_setsadbaddr(SADB_EXT_ADDRESS_DST, &saidx->dst.sa, FULLMASK, IPSEC_ULPROTO_ANY); if (!m) { error = ENOBUFS; goto fail; } m_cat(result, m); /* XXX proxy address (optional) */ /* set sadb_x_policy */ if (sp) { m = key_setsadbxpolicy(sp->policy, sp->spidx.dir, sp->id); if (!m) { error = ENOBUFS; goto fail; } m_cat(result, m); } /* XXX identity (optional) */ #if 0 if (idexttype && fqdn) { /* create identity extension (FQDN) */ struct sadb_ident *id; int fqdnlen; fqdnlen = strlen(fqdn) + 1; /* +1 for terminating-NUL */ id = (struct sadb_ident *)p; bzero(id, sizeof(*id) + PFKEY_ALIGN8(fqdnlen)); id->sadb_ident_len = PFKEY_UNIT64(sizeof(*id) + PFKEY_ALIGN8(fqdnlen)); id->sadb_ident_exttype = idexttype; id->sadb_ident_type = SADB_IDENTTYPE_FQDN; bcopy(fqdn, id + 1, fqdnlen); p += sizeof(struct sadb_ident) + PFKEY_ALIGN8(fqdnlen); } if (idexttype) { /* create identity extension (USERFQDN) */ struct sadb_ident *id; int userfqdnlen; if (userfqdn) { /* +1 for terminating-NUL */ userfqdnlen = strlen(userfqdn) + 1; } else userfqdnlen = 0; id = (struct sadb_ident *)p; bzero(id, sizeof(*id) + PFKEY_ALIGN8(userfqdnlen)); id->sadb_ident_len = PFKEY_UNIT64(sizeof(*id) + PFKEY_ALIGN8(userfqdnlen)); id->sadb_ident_exttype = idexttype; id->sadb_ident_type = SADB_IDENTTYPE_USERFQDN; /* XXX is it correct? */ if (curproc && curproc->p_cred) id->sadb_ident_id = curproc->p_cred->p_ruid; if (userfqdn && userfqdnlen) bcopy(userfqdn, id + 1, userfqdnlen); p += sizeof(struct sadb_ident) + PFKEY_ALIGN8(userfqdnlen); } #endif /* XXX sensitivity (optional) */ /* create proposal/combination extension */ m = key_getprop(saidx); #if 0 /* * spec conformant: always attach proposal/combination extension, * the problem is that we have no way to attach it for ipcomp, * due to the way sadb_comb is declared in RFC2367. */ if (!m) { error = ENOBUFS; goto fail; } m_cat(result, m); #else /* * outside of spec; make proposal/combination extension optional. */ if (m) m_cat(result, m); #endif if ((result->m_flags & M_PKTHDR) == 0) { error = EINVAL; goto fail; } if (result->m_len < sizeof(struct sadb_msg)) { result = m_pullup(result, sizeof(struct sadb_msg)); if (result == NULL) { error = ENOBUFS; goto fail; } } result->m_pkthdr.len = 0; for (m = result; m; m = m->m_next) result->m_pkthdr.len += m->m_len; mtod(result, struct sadb_msg *)->sadb_msg_len = PFKEY_UNIT64(result->m_pkthdr.len); return key_sendup_mbuf(NULL, result, KEY_SENDUP_REGISTERED); fail: if (result) m_freem(result); return error; } static struct secacq * key_newacq(const struct secasindex *saidx) { struct secacq *newacq; /* get new entry */ newacq = malloc(sizeof(struct secacq), M_IPSEC_SAQ, M_NOWAIT|M_ZERO); if (newacq == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); return NULL; } /* copy secindex */ bcopy(saidx, &newacq->saidx, sizeof(newacq->saidx)); newacq->seq = (V_acq_seq == ~0 ? 1 : ++V_acq_seq); newacq->created = time_second; newacq->count = 0; /* add to acqtree */ ACQ_LOCK(); LIST_INSERT_HEAD(&V_acqtree, newacq, chain); ACQ_UNLOCK(); return newacq; } static struct secacq * key_getacq(const struct secasindex *saidx) { struct secacq *acq; ACQ_LOCK(); LIST_FOREACH(acq, &V_acqtree, chain) { if (key_cmpsaidx(saidx, &acq->saidx, CMP_EXACTLY)) break; } ACQ_UNLOCK(); return acq; } static struct secacq * key_getacqbyseq(seq) u_int32_t seq; { struct secacq *acq; ACQ_LOCK(); LIST_FOREACH(acq, &V_acqtree, chain) { if (acq->seq == seq) break; } ACQ_UNLOCK(); return acq; } static struct secspacq * key_newspacq(spidx) struct secpolicyindex *spidx; { struct secspacq *acq; /* get new entry */ acq = malloc(sizeof(struct secspacq), M_IPSEC_SAQ, M_NOWAIT|M_ZERO); if (acq == NULL) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); return NULL; } /* copy secindex */ bcopy(spidx, &acq->spidx, sizeof(acq->spidx)); acq->created = time_second; acq->count = 0; /* add to spacqtree */ SPACQ_LOCK(); LIST_INSERT_HEAD(&V_spacqtree, acq, chain); SPACQ_UNLOCK(); return acq; } static struct secspacq * key_getspacq(spidx) struct secpolicyindex *spidx; { struct secspacq *acq; SPACQ_LOCK(); LIST_FOREACH(acq, &V_spacqtree, chain) { if (key_cmpspidx_exactly(spidx, &acq->spidx)) { /* NB: return holding spacq_lock */ return acq; } } SPACQ_UNLOCK(); return NULL; } /* * SADB_ACQUIRE processing, * in first situation, is receiving * * from the ikmpd, and clear sequence of its secasvar entry. * * In second situation, is receiving * * from a user land process, and return * * to the socket. * * m will always be freed. */ static int key_acquire2(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { const struct sadb_address *src0, *dst0; struct secasindex saidx; struct secashead *sah; u_int16_t proto; int error; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* * Error message from KMd. * We assume that if error was occured in IKEd, the length of PFKEY * message is equal to the size of sadb_msg structure. * We do not raise error even if error occured in this function. */ if (mhp->msg->sadb_msg_len == PFKEY_UNIT64(sizeof(struct sadb_msg))) { struct secacq *acq; /* check sequence number */ if (mhp->msg->sadb_msg_seq == 0) { ipseclog((LOG_DEBUG, "%s: must specify sequence " "number.\n", __func__)); m_freem(m); return 0; } if ((acq = key_getacqbyseq(mhp->msg->sadb_msg_seq)) == NULL) { /* * the specified larval SA is already gone, or we got * a bogus sequence number. we can silently ignore it. */ m_freem(m); return 0; } /* reset acq counter in order to deletion by timehander. */ acq->created = time_second; acq->count = 0; m_freem(m); return 0; } /* * This message is from user land. */ /* map satype to proto */ if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) { ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->ext[SADB_EXT_ADDRESS_SRC] == NULL || mhp->ext[SADB_EXT_ADDRESS_DST] == NULL || mhp->ext[SADB_EXT_PROPOSAL] == NULL) { /* error */ ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } if (mhp->extlen[SADB_EXT_ADDRESS_SRC] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_ADDRESS_DST] < sizeof(struct sadb_address) || mhp->extlen[SADB_EXT_PROPOSAL] < sizeof(struct sadb_prop)) { /* error */ ipseclog((LOG_DEBUG, "%s: invalid message is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } src0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_SRC]; dst0 = (struct sadb_address *)mhp->ext[SADB_EXT_ADDRESS_DST]; /* XXX boundary check against sa_len */ KEY_SETSECASIDX(proto, IPSEC_MODE_ANY, 0, src0 + 1, dst0 + 1, &saidx); /* * Make sure the port numbers are zero. * In case of NAT-T we will update them later if needed. */ KEY_PORTTOSADDR(&saidx.src, 0); KEY_PORTTOSADDR(&saidx.dst, 0); #ifndef IPSEC_NAT_T /* * Handle NAT-T info if present. */ if (mhp->ext[SADB_X_EXT_NAT_T_SPORT] != NULL && mhp->ext[SADB_X_EXT_NAT_T_DPORT] != NULL) { struct sadb_x_nat_t_port *sport, *dport; if (mhp->extlen[SADB_X_EXT_NAT_T_SPORT] < sizeof(*sport) || mhp->extlen[SADB_X_EXT_NAT_T_DPORT] < sizeof(*dport)) { ipseclog((LOG_DEBUG, "%s: invalid message.\n", __func__)); return key_senderror(so, m, EINVAL); } sport = (struct sadb_x_nat_t_port *) mhp->ext[SADB_X_EXT_NAT_T_SPORT]; dport = (struct sadb_x_nat_t_port *) mhp->ext[SADB_X_EXT_NAT_T_DPORT]; if (sport) KEY_PORTTOSADDR(&saidx.src, sport->sadb_x_nat_t_port_port); if (dport) KEY_PORTTOSADDR(&saidx.dst, dport->sadb_x_nat_t_port_port); } #endif /* get a SA index */ SAHTREE_LOCK(); LIST_FOREACH(sah, &V_sahtree, chain) { if (sah->state == SADB_SASTATE_DEAD) continue; if (key_cmpsaidx(&sah->saidx, &saidx, CMP_MODE_REQID)) break; } SAHTREE_UNLOCK(); if (sah != NULL) { ipseclog((LOG_DEBUG, "%s: a SA exists already.\n", __func__)); return key_senderror(so, m, EEXIST); } error = key_acquire(&saidx, NULL); if (error != 0) { ipseclog((LOG_DEBUG, "%s: error %d returned from key_acquire\n", __func__, mhp->msg->sadb_msg_errno)); return key_senderror(so, m, error); } return key_sendup_mbuf(so, m, KEY_SENDUP_REGISTERED); } /* * SADB_REGISTER processing. * If SATYPE_UNSPEC has been passed as satype, only return sabd_supported. * receive * * from the ikmpd, and register a socket to send PF_KEY messages, * and send * * to KMD by PF_KEY. * If socket is detached, must free from regnode. * * m will always be freed. */ static int key_register(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { struct secreg *reg, *newreg = 0; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* check for invalid register message */ if (mhp->msg->sadb_msg_satype >= sizeof(V_regtree)/sizeof(V_regtree[0])) return key_senderror(so, m, EINVAL); /* When SATYPE_UNSPEC is specified, only return sabd_supported. */ if (mhp->msg->sadb_msg_satype == SADB_SATYPE_UNSPEC) goto setmsg; /* check whether existing or not */ REGTREE_LOCK(); LIST_FOREACH(reg, &V_regtree[mhp->msg->sadb_msg_satype], chain) { if (reg->so == so) { REGTREE_UNLOCK(); ipseclog((LOG_DEBUG, "%s: socket exists already.\n", __func__)); return key_senderror(so, m, EEXIST); } } /* create regnode */ newreg = malloc(sizeof(struct secreg), M_IPSEC_SAR, M_NOWAIT|M_ZERO); if (newreg == NULL) { REGTREE_UNLOCK(); ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); return key_senderror(so, m, ENOBUFS); } newreg->so = so; ((struct keycb *)sotorawcb(so))->kp_registered++; /* add regnode to regtree. */ LIST_INSERT_HEAD(&V_regtree[mhp->msg->sadb_msg_satype], newreg, chain); REGTREE_UNLOCK(); setmsg: { struct mbuf *n; struct sadb_msg *newmsg; struct sadb_supported *sup; u_int len, alen, elen; int off; int i; struct sadb_alg *alg; /* create new sadb_msg to reply. */ alen = 0; for (i = 1; i <= SADB_AALG_MAX; i++) { if (ah_algorithm_lookup(i)) alen += sizeof(struct sadb_alg); } if (alen) alen += sizeof(struct sadb_supported); elen = 0; for (i = 1; i <= SADB_EALG_MAX; i++) { if (esp_algorithm_lookup(i)) elen += sizeof(struct sadb_alg); } if (elen) elen += sizeof(struct sadb_supported); len = sizeof(struct sadb_msg) + alen + elen; if (len > MCLBYTES) return key_senderror(so, m, ENOBUFS); MGETHDR(n, M_NOWAIT, MT_DATA); if (len > MHLEN) { MCLGET(n, M_NOWAIT); if ((n->m_flags & M_EXT) == 0) { m_freem(n); n = NULL; } } if (!n) return key_senderror(so, m, ENOBUFS); n->m_pkthdr.len = n->m_len = len; n->m_next = NULL; off = 0; m_copydata(m, 0, sizeof(struct sadb_msg), mtod(n, caddr_t) + off); newmsg = mtod(n, struct sadb_msg *); newmsg->sadb_msg_errno = 0; newmsg->sadb_msg_len = PFKEY_UNIT64(len); off += PFKEY_ALIGN8(sizeof(struct sadb_msg)); /* for authentication algorithm */ if (alen) { sup = (struct sadb_supported *)(mtod(n, caddr_t) + off); sup->sadb_supported_len = PFKEY_UNIT64(alen); sup->sadb_supported_exttype = SADB_EXT_SUPPORTED_AUTH; off += PFKEY_ALIGN8(sizeof(*sup)); for (i = 1; i <= SADB_AALG_MAX; i++) { struct auth_hash *aalgo; u_int16_t minkeysize, maxkeysize; aalgo = ah_algorithm_lookup(i); if (!aalgo) continue; alg = (struct sadb_alg *)(mtod(n, caddr_t) + off); alg->sadb_alg_id = i; alg->sadb_alg_ivlen = 0; key_getsizes_ah(aalgo, i, &minkeysize, &maxkeysize); alg->sadb_alg_minbits = _BITS(minkeysize); alg->sadb_alg_maxbits = _BITS(maxkeysize); off += PFKEY_ALIGN8(sizeof(*alg)); } } /* for encryption algorithm */ if (elen) { sup = (struct sadb_supported *)(mtod(n, caddr_t) + off); sup->sadb_supported_len = PFKEY_UNIT64(elen); sup->sadb_supported_exttype = SADB_EXT_SUPPORTED_ENCRYPT; off += PFKEY_ALIGN8(sizeof(*sup)); for (i = 1; i <= SADB_EALG_MAX; i++) { struct enc_xform *ealgo; ealgo = esp_algorithm_lookup(i); if (!ealgo) continue; alg = (struct sadb_alg *)(mtod(n, caddr_t) + off); alg->sadb_alg_id = i; alg->sadb_alg_ivlen = ealgo->blocksize; alg->sadb_alg_minbits = _BITS(ealgo->minkey); alg->sadb_alg_maxbits = _BITS(ealgo->maxkey); off += PFKEY_ALIGN8(sizeof(struct sadb_alg)); } } IPSEC_ASSERT(off == len, ("length assumption failed (off %u len %u)", off, len)); m_freem(m); return key_sendup_mbuf(so, n, KEY_SENDUP_REGISTERED); } } /* * free secreg entry registered. * XXX: I want to do free a socket marked done SADB_RESIGER to socket. */ void key_freereg(struct socket *so) { struct secreg *reg; int i; IPSEC_ASSERT(so != NULL, ("NULL so")); /* * check whether existing or not. * check all type of SA, because there is a potential that * one socket is registered to multiple type of SA. */ REGTREE_LOCK(); for (i = 0; i <= SADB_SATYPE_MAX; i++) { LIST_FOREACH(reg, &V_regtree[i], chain) { if (reg->so == so && __LIST_CHAINED(reg)) { LIST_REMOVE(reg, chain); free(reg, M_IPSEC_SAR); break; } } } REGTREE_UNLOCK(); } /* * SADB_EXPIRE processing * send * * to KMD by PF_KEY. * NOTE: We send only soft lifetime extension. * * OUT: 0 : succeed * others : error number */ static int key_expire(struct secasvar *sav) { int satype; struct mbuf *result = NULL, *m; int len; int error = -1; struct sadb_lifetime *lt; IPSEC_ASSERT (sav != NULL, ("null sav")); IPSEC_ASSERT (sav->sah != NULL, ("null sa header")); /* set msg header */ satype = key_proto2satype(sav->sah->saidx.proto); IPSEC_ASSERT(satype != 0, ("invalid proto, satype %u", satype)); m = key_setsadbmsg(SADB_EXPIRE, 0, satype, sav->seq, 0, sav->refcnt); if (!m) { error = ENOBUFS; goto fail; } result = m; /* create SA extension */ m = key_setsadbsa(sav); if (!m) { error = ENOBUFS; goto fail; } m_cat(result, m); /* create SA extension */ m = key_setsadbxsa2(sav->sah->saidx.mode, sav->replay ? sav->replay->count : 0, sav->sah->saidx.reqid); if (!m) { error = ENOBUFS; goto fail; } m_cat(result, m); /* create lifetime extension (current and soft) */ len = PFKEY_ALIGN8(sizeof(*lt)) * 2; m = m_get2(len, M_NOWAIT, MT_DATA, 0); if (m == NULL) { error = ENOBUFS; goto fail; } m_align(m, len); m->m_len = len; bzero(mtod(m, caddr_t), len); lt = mtod(m, struct sadb_lifetime *); lt->sadb_lifetime_len = PFKEY_UNIT64(sizeof(struct sadb_lifetime)); lt->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT; lt->sadb_lifetime_allocations = sav->lft_c->allocations; lt->sadb_lifetime_bytes = sav->lft_c->bytes; lt->sadb_lifetime_addtime = sav->lft_c->addtime; lt->sadb_lifetime_usetime = sav->lft_c->usetime; lt = (struct sadb_lifetime *)(mtod(m, caddr_t) + len / 2); lt->sadb_lifetime_len = PFKEY_UNIT64(sizeof(struct sadb_lifetime)); lt->sadb_lifetime_exttype = SADB_EXT_LIFETIME_SOFT; lt->sadb_lifetime_allocations = sav->lft_s->allocations; lt->sadb_lifetime_bytes = sav->lft_s->bytes; lt->sadb_lifetime_addtime = sav->lft_s->addtime; lt->sadb_lifetime_usetime = sav->lft_s->usetime; m_cat(result, m); /* set sadb_address for source */ m = key_setsadbaddr(SADB_EXT_ADDRESS_SRC, &sav->sah->saidx.src.sa, FULLMASK, IPSEC_ULPROTO_ANY); if (!m) { error = ENOBUFS; goto fail; } m_cat(result, m); /* set sadb_address for destination */ m = key_setsadbaddr(SADB_EXT_ADDRESS_DST, &sav->sah->saidx.dst.sa, FULLMASK, IPSEC_ULPROTO_ANY); if (!m) { error = ENOBUFS; goto fail; } m_cat(result, m); /* * XXX-BZ Handle NAT-T extensions here. */ if ((result->m_flags & M_PKTHDR) == 0) { error = EINVAL; goto fail; } if (result->m_len < sizeof(struct sadb_msg)) { result = m_pullup(result, sizeof(struct sadb_msg)); if (result == NULL) { error = ENOBUFS; goto fail; } } result->m_pkthdr.len = 0; for (m = result; m; m = m->m_next) result->m_pkthdr.len += m->m_len; mtod(result, struct sadb_msg *)->sadb_msg_len = PFKEY_UNIT64(result->m_pkthdr.len); return key_sendup_mbuf(NULL, result, KEY_SENDUP_REGISTERED); fail: if (result) m_freem(result); return error; } /* * SADB_FLUSH processing * receive * * from the ikmpd, and free all entries in secastree. * and send, * * to the ikmpd. * NOTE: to do is only marking SADB_SASTATE_DEAD. * * m will always be freed. */ static int key_flush(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { struct sadb_msg *newmsg; struct secashead *sah, *nextsah; struct secasvar *sav, *nextsav; u_int16_t proto; u_int8_t state; u_int stateidx; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* map satype to proto */ if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) { ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } /* no SATYPE specified, i.e. flushing all SA. */ SAHTREE_LOCK(); for (sah = LIST_FIRST(&V_sahtree); sah != NULL; sah = nextsah) { nextsah = LIST_NEXT(sah, chain); if (mhp->msg->sadb_msg_satype != SADB_SATYPE_UNSPEC && proto != sah->saidx.proto) continue; for (stateidx = 0; stateidx < _ARRAYLEN(saorder_state_alive); stateidx++) { state = saorder_state_any[stateidx]; for (sav = LIST_FIRST(&sah->savtree[state]); sav != NULL; sav = nextsav) { nextsav = LIST_NEXT(sav, chain); key_sa_chgstate(sav, SADB_SASTATE_DEAD); KEY_FREESAV(&sav); } } sah->state = SADB_SASTATE_DEAD; } SAHTREE_UNLOCK(); if (m->m_len < sizeof(struct sadb_msg) || sizeof(struct sadb_msg) > m->m_len + M_TRAILINGSPACE(m)) { ipseclog((LOG_DEBUG, "%s: No more memory.\n", __func__)); return key_senderror(so, m, ENOBUFS); } if (m->m_next) m_freem(m->m_next); m->m_next = NULL; m->m_pkthdr.len = m->m_len = sizeof(struct sadb_msg); newmsg = mtod(m, struct sadb_msg *); newmsg->sadb_msg_errno = 0; newmsg->sadb_msg_len = PFKEY_UNIT64(m->m_pkthdr.len); return key_sendup_mbuf(so, m, KEY_SENDUP_ALL); } /* * SADB_DUMP processing * dump all entries including status of DEAD in SAD. * receive * * from the ikmpd, and dump all secasvar leaves * and send, * ..... * to the ikmpd. * * m will always be freed. */ static int key_dump(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { struct secashead *sah; struct secasvar *sav; u_int16_t proto; u_int stateidx; u_int8_t satype; u_int8_t state; int cnt; struct sadb_msg *newmsg; struct mbuf *n; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); /* map satype to proto */ if ((proto = key_satype2proto(mhp->msg->sadb_msg_satype)) == 0) { ipseclog((LOG_DEBUG, "%s: invalid satype is passed.\n", __func__)); return key_senderror(so, m, EINVAL); } /* count sav entries to be sent to the userland. */ cnt = 0; SAHTREE_LOCK(); LIST_FOREACH(sah, &V_sahtree, chain) { if (mhp->msg->sadb_msg_satype != SADB_SATYPE_UNSPEC && proto != sah->saidx.proto) continue; for (stateidx = 0; stateidx < _ARRAYLEN(saorder_state_any); stateidx++) { state = saorder_state_any[stateidx]; LIST_FOREACH(sav, &sah->savtree[state], chain) { cnt++; } } } if (cnt == 0) { SAHTREE_UNLOCK(); return key_senderror(so, m, ENOENT); } /* send this to the userland, one at a time. */ newmsg = NULL; LIST_FOREACH(sah, &V_sahtree, chain) { if (mhp->msg->sadb_msg_satype != SADB_SATYPE_UNSPEC && proto != sah->saidx.proto) continue; /* map proto to satype */ if ((satype = key_proto2satype(sah->saidx.proto)) == 0) { SAHTREE_UNLOCK(); ipseclog((LOG_DEBUG, "%s: there was invalid proto in " "SAD.\n", __func__)); return key_senderror(so, m, EINVAL); } for (stateidx = 0; stateidx < _ARRAYLEN(saorder_state_any); stateidx++) { state = saorder_state_any[stateidx]; LIST_FOREACH(sav, &sah->savtree[state], chain) { n = key_setdumpsa(sav, SADB_DUMP, satype, --cnt, mhp->msg->sadb_msg_pid); if (!n) { SAHTREE_UNLOCK(); return key_senderror(so, m, ENOBUFS); } key_sendup_mbuf(so, n, KEY_SENDUP_ONE); } } } SAHTREE_UNLOCK(); m_freem(m); return 0; } /* * SADB_X_PROMISC processing * * m will always be freed. */ static int key_promisc(so, m, mhp) struct socket *so; struct mbuf *m; const struct sadb_msghdr *mhp; { int olen; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(mhp->msg != NULL, ("null msg")); olen = PFKEY_UNUNIT64(mhp->msg->sadb_msg_len); if (olen < sizeof(struct sadb_msg)) { #if 1 return key_senderror(so, m, EINVAL); #else m_freem(m); return 0; #endif } else if (olen == sizeof(struct sadb_msg)) { /* enable/disable promisc mode */ struct keycb *kp; if ((kp = (struct keycb *)sotorawcb(so)) == NULL) return key_senderror(so, m, EINVAL); mhp->msg->sadb_msg_errno = 0; switch (mhp->msg->sadb_msg_satype) { case 0: case 1: kp->kp_promisc = mhp->msg->sadb_msg_satype; break; default: return key_senderror(so, m, EINVAL); } /* send the original message back to everyone */ mhp->msg->sadb_msg_errno = 0; return key_sendup_mbuf(so, m, KEY_SENDUP_ALL); } else { /* send packet as is */ m_adj(m, PFKEY_ALIGN8(sizeof(struct sadb_msg))); /* TODO: if sadb_msg_seq is specified, send to specific pid */ return key_sendup_mbuf(so, m, KEY_SENDUP_ALL); } } static int (*key_typesw[]) __P((struct socket *, struct mbuf *, const struct sadb_msghdr *)) = { NULL, /* SADB_RESERVED */ key_getspi, /* SADB_GETSPI */ key_update, /* SADB_UPDATE */ key_add, /* SADB_ADD */ key_delete, /* SADB_DELETE */ key_get, /* SADB_GET */ key_acquire2, /* SADB_ACQUIRE */ key_register, /* SADB_REGISTER */ NULL, /* SADB_EXPIRE */ key_flush, /* SADB_FLUSH */ key_dump, /* SADB_DUMP */ key_promisc, /* SADB_X_PROMISC */ NULL, /* SADB_X_PCHANGE */ key_spdadd, /* SADB_X_SPDUPDATE */ key_spdadd, /* SADB_X_SPDADD */ key_spddelete, /* SADB_X_SPDDELETE */ key_spdget, /* SADB_X_SPDGET */ NULL, /* SADB_X_SPDACQUIRE */ key_spddump, /* SADB_X_SPDDUMP */ key_spdflush, /* SADB_X_SPDFLUSH */ key_spdadd, /* SADB_X_SPDSETIDX */ NULL, /* SADB_X_SPDEXPIRE */ key_spddelete2, /* SADB_X_SPDDELETE2 */ }; /* * parse sadb_msg buffer to process PFKEYv2, * and create a data to response if needed. * I think to be dealed with mbuf directly. * IN: * msgp : pointer to pointer to a received buffer pulluped. * This is rewrited to response. * so : pointer to socket. * OUT: * length for buffer to send to user process. */ int key_parse(m, so) struct mbuf *m; struct socket *so; { struct sadb_msg *msg; struct sadb_msghdr mh; u_int orglen; int error; int target; IPSEC_ASSERT(so != NULL, ("null socket")); IPSEC_ASSERT(m != NULL, ("null mbuf")); #if 0 /*kdebug_sadb assumes msg in linear buffer*/ KEYDEBUG(KEYDEBUG_KEY_DUMP, ipseclog((LOG_DEBUG, "%s: passed sadb_msg\n", __func__)); kdebug_sadb(msg)); #endif if (m->m_len < sizeof(struct sadb_msg)) { m = m_pullup(m, sizeof(struct sadb_msg)); if (!m) return ENOBUFS; } msg = mtod(m, struct sadb_msg *); orglen = PFKEY_UNUNIT64(msg->sadb_msg_len); target = KEY_SENDUP_ONE; if ((m->m_flags & M_PKTHDR) == 0 || m->m_pkthdr.len != m->m_pkthdr.len) { ipseclog((LOG_DEBUG, "%s: invalid message length.\n",__func__)); PFKEYSTAT_INC(out_invlen); error = EINVAL; goto senderror; } if (msg->sadb_msg_version != PF_KEY_V2) { ipseclog((LOG_DEBUG, "%s: PF_KEY version %u is mismatched.\n", __func__, msg->sadb_msg_version)); PFKEYSTAT_INC(out_invver); error = EINVAL; goto senderror; } if (msg->sadb_msg_type > SADB_MAX) { ipseclog((LOG_DEBUG, "%s: invalid type %u is passed.\n", __func__, msg->sadb_msg_type)); PFKEYSTAT_INC(out_invmsgtype); error = EINVAL; goto senderror; } /* for old-fashioned code - should be nuked */ if (m->m_pkthdr.len > MCLBYTES) { m_freem(m); return ENOBUFS; } if (m->m_next) { struct mbuf *n; MGETHDR(n, M_NOWAIT, MT_DATA); if (n && m->m_pkthdr.len > MHLEN) { MCLGET(n, M_NOWAIT); if ((n->m_flags & M_EXT) == 0) { m_free(n); n = NULL; } } if (!n) { m_freem(m); return ENOBUFS; } m_copydata(m, 0, m->m_pkthdr.len, mtod(n, caddr_t)); n->m_pkthdr.len = n->m_len = m->m_pkthdr.len; n->m_next = NULL; m_freem(m); m = n; } /* align the mbuf chain so that extensions are in contiguous region. */ error = key_align(m, &mh); if (error) return error; msg = mh.msg; /* check SA type */ switch (msg->sadb_msg_satype) { case SADB_SATYPE_UNSPEC: switch (msg->sadb_msg_type) { case SADB_GETSPI: case SADB_UPDATE: case SADB_ADD: case SADB_DELETE: case SADB_GET: case SADB_ACQUIRE: case SADB_EXPIRE: ipseclog((LOG_DEBUG, "%s: must specify satype " "when msg type=%u.\n", __func__, msg->sadb_msg_type)); PFKEYSTAT_INC(out_invsatype); error = EINVAL; goto senderror; } break; case SADB_SATYPE_AH: case SADB_SATYPE_ESP: case SADB_X_SATYPE_IPCOMP: case SADB_X_SATYPE_TCPSIGNATURE: switch (msg->sadb_msg_type) { case SADB_X_SPDADD: case SADB_X_SPDDELETE: case SADB_X_SPDGET: case SADB_X_SPDDUMP: case SADB_X_SPDFLUSH: case SADB_X_SPDSETIDX: case SADB_X_SPDUPDATE: case SADB_X_SPDDELETE2: ipseclog((LOG_DEBUG, "%s: illegal satype=%u\n", __func__, msg->sadb_msg_type)); PFKEYSTAT_INC(out_invsatype); error = EINVAL; goto senderror; } break; case SADB_SATYPE_RSVP: case SADB_SATYPE_OSPFV2: case SADB_SATYPE_RIPV2: case SADB_SATYPE_MIP: ipseclog((LOG_DEBUG, "%s: type %u isn't supported.\n", __func__, msg->sadb_msg_satype)); PFKEYSTAT_INC(out_invsatype); error = EOPNOTSUPP; goto senderror; case 1: /* XXX: What does it do? */ if (msg->sadb_msg_type == SADB_X_PROMISC) break; /*FALLTHROUGH*/ default: ipseclog((LOG_DEBUG, "%s: invalid type %u is passed.\n", __func__, msg->sadb_msg_satype)); PFKEYSTAT_INC(out_invsatype); error = EINVAL; goto senderror; } /* check field of upper layer protocol and address family */ if (mh.ext[SADB_EXT_ADDRESS_SRC] != NULL && mh.ext[SADB_EXT_ADDRESS_DST] != NULL) { struct sadb_address *src0, *dst0; u_int plen; src0 = (struct sadb_address *)(mh.ext[SADB_EXT_ADDRESS_SRC]); dst0 = (struct sadb_address *)(mh.ext[SADB_EXT_ADDRESS_DST]); /* check upper layer protocol */ if (src0->sadb_address_proto != dst0->sadb_address_proto) { ipseclog((LOG_DEBUG, "%s: upper layer protocol " "mismatched.\n", __func__)); PFKEYSTAT_INC(out_invaddr); error = EINVAL; goto senderror; } /* check family */ if (PFKEY_ADDR_SADDR(src0)->sa_family != PFKEY_ADDR_SADDR(dst0)->sa_family) { ipseclog((LOG_DEBUG, "%s: address family mismatched.\n", __func__)); PFKEYSTAT_INC(out_invaddr); error = EINVAL; goto senderror; } if (PFKEY_ADDR_SADDR(src0)->sa_len != PFKEY_ADDR_SADDR(dst0)->sa_len) { ipseclog((LOG_DEBUG, "%s: address struct size " "mismatched.\n", __func__)); PFKEYSTAT_INC(out_invaddr); error = EINVAL; goto senderror; } switch (PFKEY_ADDR_SADDR(src0)->sa_family) { case AF_INET: if (PFKEY_ADDR_SADDR(src0)->sa_len != sizeof(struct sockaddr_in)) { PFKEYSTAT_INC(out_invaddr); error = EINVAL; goto senderror; } break; case AF_INET6: if (PFKEY_ADDR_SADDR(src0)->sa_len != sizeof(struct sockaddr_in6)) { PFKEYSTAT_INC(out_invaddr); error = EINVAL; goto senderror; } break; default: ipseclog((LOG_DEBUG, "%s: unsupported address family\n", __func__)); PFKEYSTAT_INC(out_invaddr); error = EAFNOSUPPORT; goto senderror; } switch (PFKEY_ADDR_SADDR(src0)->sa_family) { case AF_INET: plen = sizeof(struct in_addr) << 3; break; case AF_INET6: plen = sizeof(struct in6_addr) << 3; break; default: plen = 0; /*fool gcc*/ break; } /* check max prefix length */ if (src0->sadb_address_prefixlen > plen || dst0->sadb_address_prefixlen > plen) { ipseclog((LOG_DEBUG, "%s: illegal prefixlen.\n", __func__)); PFKEYSTAT_INC(out_invaddr); error = EINVAL; goto senderror; } /* * prefixlen == 0 is valid because there can be a case when * all addresses are matched. */ } if (msg->sadb_msg_type >= sizeof(key_typesw)/sizeof(key_typesw[0]) || key_typesw[msg->sadb_msg_type] == NULL) { PFKEYSTAT_INC(out_invmsgtype); error = EINVAL; goto senderror; } return (*key_typesw[msg->sadb_msg_type])(so, m, &mh); senderror: msg->sadb_msg_errno = error; return key_sendup_mbuf(so, m, target); } static int key_senderror(so, m, code) struct socket *so; struct mbuf *m; int code; { struct sadb_msg *msg; IPSEC_ASSERT(m->m_len >= sizeof(struct sadb_msg), ("mbuf too small, len %u", m->m_len)); msg = mtod(m, struct sadb_msg *); msg->sadb_msg_errno = code; return key_sendup_mbuf(so, m, KEY_SENDUP_ONE); } /* * set the pointer to each header into message buffer. * m will be freed on error. * XXX larger-than-MCLBYTES extension? */ static int key_align(m, mhp) struct mbuf *m; struct sadb_msghdr *mhp; { struct mbuf *n; struct sadb_ext *ext; size_t off, end; int extlen; int toff; IPSEC_ASSERT(m != NULL, ("null mbuf")); IPSEC_ASSERT(mhp != NULL, ("null msghdr")); IPSEC_ASSERT(m->m_len >= sizeof(struct sadb_msg), ("mbuf too small, len %u", m->m_len)); /* initialize */ bzero(mhp, sizeof(*mhp)); mhp->msg = mtod(m, struct sadb_msg *); mhp->ext[0] = (struct sadb_ext *)mhp->msg; /*XXX backward compat */ end = PFKEY_UNUNIT64(mhp->msg->sadb_msg_len); extlen = end; /*just in case extlen is not updated*/ for (off = sizeof(struct sadb_msg); off < end; off += extlen) { n = m_pulldown(m, off, sizeof(struct sadb_ext), &toff); if (!n) { /* m is already freed */ return ENOBUFS; } ext = (struct sadb_ext *)(mtod(n, caddr_t) + toff); /* set pointer */ switch (ext->sadb_ext_type) { case SADB_EXT_SA: case SADB_EXT_ADDRESS_SRC: case SADB_EXT_ADDRESS_DST: case SADB_EXT_ADDRESS_PROXY: case SADB_EXT_LIFETIME_CURRENT: case SADB_EXT_LIFETIME_HARD: case SADB_EXT_LIFETIME_SOFT: case SADB_EXT_KEY_AUTH: case SADB_EXT_KEY_ENCRYPT: case SADB_EXT_IDENTITY_SRC: case SADB_EXT_IDENTITY_DST: case SADB_EXT_SENSITIVITY: case SADB_EXT_PROPOSAL: case SADB_EXT_SUPPORTED_AUTH: case SADB_EXT_SUPPORTED_ENCRYPT: case SADB_EXT_SPIRANGE: case SADB_X_EXT_POLICY: case SADB_X_EXT_SA2: #ifdef IPSEC_NAT_T case SADB_X_EXT_NAT_T_TYPE: case SADB_X_EXT_NAT_T_SPORT: case SADB_X_EXT_NAT_T_DPORT: case SADB_X_EXT_NAT_T_OAI: case SADB_X_EXT_NAT_T_OAR: case SADB_X_EXT_NAT_T_FRAG: #endif /* duplicate check */ /* * XXX Are there duplication payloads of either * KEY_AUTH or KEY_ENCRYPT ? */ if (mhp->ext[ext->sadb_ext_type] != NULL) { ipseclog((LOG_DEBUG, "%s: duplicate ext_type " "%u\n", __func__, ext->sadb_ext_type)); m_freem(m); PFKEYSTAT_INC(out_dupext); return EINVAL; } break; default: ipseclog((LOG_DEBUG, "%s: invalid ext_type %u\n", __func__, ext->sadb_ext_type)); m_freem(m); PFKEYSTAT_INC(out_invexttype); return EINVAL; } extlen = PFKEY_UNUNIT64(ext->sadb_ext_len); if (key_validate_ext(ext, extlen)) { m_freem(m); PFKEYSTAT_INC(out_invlen); return EINVAL; } n = m_pulldown(m, off, extlen, &toff); if (!n) { /* m is already freed */ return ENOBUFS; } ext = (struct sadb_ext *)(mtod(n, caddr_t) + toff); mhp->ext[ext->sadb_ext_type] = ext; mhp->extoff[ext->sadb_ext_type] = off; mhp->extlen[ext->sadb_ext_type] = extlen; } if (off != end) { m_freem(m); PFKEYSTAT_INC(out_invlen); return EINVAL; } return 0; } static int key_validate_ext(ext, len) const struct sadb_ext *ext; int len; { const struct sockaddr *sa; enum { NONE, ADDR } checktype = NONE; int baselen = 0; const int sal = offsetof(struct sockaddr, sa_len) + sizeof(sa->sa_len); if (len != PFKEY_UNUNIT64(ext->sadb_ext_len)) return EINVAL; /* if it does not match minimum/maximum length, bail */ if (ext->sadb_ext_type >= sizeof(minsize) / sizeof(minsize[0]) || ext->sadb_ext_type >= sizeof(maxsize) / sizeof(maxsize[0])) return EINVAL; if (!minsize[ext->sadb_ext_type] || len < minsize[ext->sadb_ext_type]) return EINVAL; if (maxsize[ext->sadb_ext_type] && len > maxsize[ext->sadb_ext_type]) return EINVAL; /* more checks based on sadb_ext_type XXX need more */ switch (ext->sadb_ext_type) { case SADB_EXT_ADDRESS_SRC: case SADB_EXT_ADDRESS_DST: case SADB_EXT_ADDRESS_PROXY: baselen = PFKEY_ALIGN8(sizeof(struct sadb_address)); checktype = ADDR; break; case SADB_EXT_IDENTITY_SRC: case SADB_EXT_IDENTITY_DST: if (((const struct sadb_ident *)ext)->sadb_ident_type == SADB_X_IDENTTYPE_ADDR) { baselen = PFKEY_ALIGN8(sizeof(struct sadb_ident)); checktype = ADDR; } else checktype = NONE; break; default: checktype = NONE; break; } switch (checktype) { case NONE: break; case ADDR: sa = (const struct sockaddr *)(((const u_int8_t*)ext)+baselen); if (len < baselen + sal) return EINVAL; if (baselen + PFKEY_ALIGN8(sa->sa_len) != len) return EINVAL; break; } return 0; } void key_init(void) { int i; for (i = 0; i < IPSEC_DIR_MAX; i++) LIST_INIT(&V_sptree[i]); LIST_INIT(&V_sahtree); for (i = 0; i <= SADB_SATYPE_MAX; i++) LIST_INIT(&V_regtree[i]); LIST_INIT(&V_acqtree); LIST_INIT(&V_spacqtree); /* system default */ V_ip4_def_policy.policy = IPSEC_POLICY_NONE; V_ip4_def_policy.refcnt++; /*never reclaim this*/ if (!IS_DEFAULT_VNET(curvnet)) return; SPTREE_LOCK_INIT(); REGTREE_LOCK_INIT(); SAHTREE_LOCK_INIT(); ACQ_LOCK_INIT(); SPACQ_LOCK_INIT(); #ifndef IPSEC_DEBUG2 callout_init(&key_timer, CALLOUT_MPSAFE); callout_reset(&key_timer, hz, key_timehandler, NULL); #endif /*IPSEC_DEBUG2*/ /* initialize key statistics */ keystat.getspi_count = 1; printf("IPsec: Initialized Security Association Processing.\n"); } #ifdef VIMAGE void key_destroy(void) { struct secpolicy *sp, *nextsp; struct secacq *acq, *nextacq; struct secspacq *spacq, *nextspacq; struct secashead *sah, *nextsah; struct secreg *reg; int i; SPTREE_LOCK(); for (i = 0; i < IPSEC_DIR_MAX; i++) { for (sp = LIST_FIRST(&V_sptree[i]); sp != NULL; sp = nextsp) { nextsp = LIST_NEXT(sp, chain); if (__LIST_CHAINED(sp)) { LIST_REMOVE(sp, chain); free(sp, M_IPSEC_SP); } } } SPTREE_UNLOCK(); SAHTREE_LOCK(); for (sah = LIST_FIRST(&V_sahtree); sah != NULL; sah = nextsah) { nextsah = LIST_NEXT(sah, chain); if (__LIST_CHAINED(sah)) { LIST_REMOVE(sah, chain); free(sah, M_IPSEC_SAH); } } SAHTREE_UNLOCK(); REGTREE_LOCK(); for (i = 0; i <= SADB_SATYPE_MAX; i++) { LIST_FOREACH(reg, &V_regtree[i], chain) { if (__LIST_CHAINED(reg)) { LIST_REMOVE(reg, chain); free(reg, M_IPSEC_SAR); break; } } } REGTREE_UNLOCK(); ACQ_LOCK(); for (acq = LIST_FIRST(&V_acqtree); acq != NULL; acq = nextacq) { nextacq = LIST_NEXT(acq, chain); if (__LIST_CHAINED(acq)) { LIST_REMOVE(acq, chain); free(acq, M_IPSEC_SAQ); } } ACQ_UNLOCK(); SPACQ_LOCK(); for (spacq = LIST_FIRST(&V_spacqtree); spacq != NULL; spacq = nextspacq) { nextspacq = LIST_NEXT(spacq, chain); if (__LIST_CHAINED(spacq)) { LIST_REMOVE(spacq, chain); free(spacq, M_IPSEC_SAQ); } } SPACQ_UNLOCK(); } #endif /* * XXX: maybe This function is called after INBOUND IPsec processing. * * Special check for tunnel-mode packets. * We must make some checks for consistency between inner and outer IP header. * * xxx more checks to be provided */ int key_checktunnelsanity(sav, family, src, dst) struct secasvar *sav; u_int family; caddr_t src; caddr_t dst; { IPSEC_ASSERT(sav->sah != NULL, ("null SA header")); /* XXX: check inner IP header */ return 1; } /* record data transfer on SA, and update timestamps */ void key_sa_recordxfer(sav, m) struct secasvar *sav; struct mbuf *m; { IPSEC_ASSERT(sav != NULL, ("Null secasvar")); IPSEC_ASSERT(m != NULL, ("Null mbuf")); if (!sav->lft_c) return; /* * XXX Currently, there is a difference of bytes size * between inbound and outbound processing. */ sav->lft_c->bytes += m->m_pkthdr.len; /* to check bytes lifetime is done in key_timehandler(). */ /* * We use the number of packets as the unit of * allocations. We increment the variable * whenever {esp,ah}_{in,out}put is called. */ sav->lft_c->allocations++; /* XXX check for expires? */ /* * NOTE: We record CURRENT usetime by using wall clock, * in seconds. HARD and SOFT lifetime are measured by the time * difference (again in seconds) from usetime. * * usetime * v expire expire * -----+-----+--------+---> t * <--------------> HARD * <-----> SOFT */ sav->lft_c->usetime = time_second; /* XXX check for expires? */ return; } /* dumb version */ void key_sa_routechange(dst) struct sockaddr *dst; { struct secashead *sah; struct route *ro; SAHTREE_LOCK(); LIST_FOREACH(sah, &V_sahtree, chain) { ro = &sah->route_cache.sa_route; if (ro->ro_rt && dst->sa_len == ro->ro_dst.sa_len && bcmp(dst, &ro->ro_dst, dst->sa_len) == 0) { RTFREE(ro->ro_rt); ro->ro_rt = (struct rtentry *)NULL; } } SAHTREE_UNLOCK(); } static void key_sa_chgstate(struct secasvar *sav, u_int8_t state) { IPSEC_ASSERT(sav != NULL, ("NULL sav")); SAHTREE_LOCK_ASSERT(); if (sav->state != state) { if (__LIST_CHAINED(sav)) LIST_REMOVE(sav, chain); sav->state = state; LIST_INSERT_HEAD(&sav->sah->savtree[state], sav, chain); } } void key_sa_stir_iv(sav) struct secasvar *sav; { IPSEC_ASSERT(sav->iv != NULL, ("null IV")); key_randomfill(sav->iv, sav->ivlen); } /* * Take one of the kernel's security keys and convert it into a PF_KEY * structure within an mbuf, suitable for sending up to a waiting * application in user land. * * IN: * src: A pointer to a kernel security key. * exttype: Which type of key this is. Refer to the PF_KEY data structures. * OUT: * a valid mbuf or NULL indicating an error * */ static struct mbuf * key_setkey(struct seckey *src, u_int16_t exttype) { struct mbuf *m; struct sadb_key *p; int len; if (src == NULL) return NULL; len = PFKEY_ALIGN8(sizeof(struct sadb_key) + _KEYLEN(src)); m = m_get2(len, M_NOWAIT, MT_DATA, 0); if (m == NULL) return NULL; m_align(m, len); m->m_len = len; p = mtod(m, struct sadb_key *); bzero(p, len); p->sadb_key_len = PFKEY_UNIT64(len); p->sadb_key_exttype = exttype; p->sadb_key_bits = src->bits; bcopy(src->key_data, _KEYBUF(p), _KEYLEN(src)); return m; } /* * Take one of the kernel's lifetime data structures and convert it * into a PF_KEY structure within an mbuf, suitable for sending up to * a waiting application in user land. * * IN: * src: A pointer to a kernel lifetime structure. * exttype: Which type of lifetime this is. Refer to the PF_KEY * data structures for more information. * OUT: * a valid mbuf or NULL indicating an error * */ static struct mbuf * key_setlifetime(struct seclifetime *src, u_int16_t exttype) { struct mbuf *m = NULL; struct sadb_lifetime *p; int len = PFKEY_ALIGN8(sizeof(struct sadb_lifetime)); if (src == NULL) return NULL; m = m_get2(len, M_NOWAIT, MT_DATA, 0); if (m == NULL) return m; m_align(m, len); m->m_len = len; p = mtod(m, struct sadb_lifetime *); bzero(p, len); p->sadb_lifetime_len = PFKEY_UNIT64(len); p->sadb_lifetime_exttype = exttype; p->sadb_lifetime_allocations = src->allocations; p->sadb_lifetime_bytes = src->bytes; p->sadb_lifetime_addtime = src->addtime; p->sadb_lifetime_usetime = src->usetime; return m; } Index: user/dchagin/lemul/sys/netipsec/xform_ah.c =================================================================== --- user/dchagin/lemul/sys/netipsec/xform_ah.c (revision 274240) +++ user/dchagin/lemul/sys/netipsec/xform_ah.c (revision 274241) @@ -1,1228 +1,1228 @@ /* $FreeBSD$ */ /* $OpenBSD: ip_ah.c,v 1.63 2001/06/26 06:18:58 angelos Exp $ */ /*- * The authors of this code are John Ioannidis (ji@tla.org), * Angelos D. Keromytis (kermit@csd.uch.gr) and * Niels Provos (provos@physnet.uni-hamburg.de). * * The original version of this code was written by John Ioannidis * for BSD/OS in Athens, Greece, in November 1995. * * Ported to OpenBSD and NetBSD, with additional transforms, in December 1996, * by Angelos D. Keromytis. * * Additional transforms and features in 1997 and 1998 by Angelos D. Keromytis * and Niels Provos. * * Additional features in 1999 by Angelos D. Keromytis and Niklas Hallqvist. * * Copyright (c) 1995, 1996, 1997, 1998, 1999 by John Ioannidis, * Angelos D. Keromytis and Niels Provos. * Copyright (c) 1999 Niklas Hallqvist. * Copyright (c) 2001 Angelos D. Keromytis. * * Permission to use, copy, and modify this software with or without fee * is hereby granted, provided that this entire notice is included in * all copies of any software which is or includes a copy or * modification of this software. * You may use this code under the GNU public license if you so wish. Please * contribute changes back to the authors under this freer than GPL license * so that we may further the use of strong encryption without limitations to * all. * * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR * IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE * MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR * PURPOSE. */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #endif #include #include #include /* * Return header size in bytes. The old protocol did not support * the replay counter; the new protocol always includes the counter. */ #define HDRSIZE(sav) \ (((sav)->flags & SADB_X_EXT_OLD) ? \ sizeof (struct ah) : sizeof (struct ah) + sizeof (u_int32_t)) /* * Return authenticator size in bytes. The old protocol is known * to use a fixed 16-byte authenticator. The new algorithm use 12-byte * authenticator. */ #define AUTHSIZE(sav) ah_authsize(sav) VNET_DEFINE(int, ah_enable) = 1; /* control flow of packets with AH */ VNET_DEFINE(int, ah_cleartos) = 1; /* clear ip_tos when doing AH calc */ VNET_PCPUSTAT_DEFINE(struct ahstat, ahstat); VNET_PCPUSTAT_SYSINIT(ahstat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(ahstat); #endif /* VIMAGE */ #ifdef INET SYSCTL_DECL(_net_inet_ah); -SYSCTL_VNET_INT(_net_inet_ah, OID_AUTO, - ah_enable, CTLFLAG_RW, &VNET_NAME(ah_enable), 0, ""); -SYSCTL_VNET_INT(_net_inet_ah, OID_AUTO, - ah_cleartos, CTLFLAG_RW, &VNET_NAME(ah_cleartos), 0, ""); +SYSCTL_INT(_net_inet_ah, OID_AUTO, ah_enable, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ah_enable), 0, ""); +SYSCTL_INT(_net_inet_ah, OID_AUTO, ah_cleartos, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ah_cleartos), 0, ""); SYSCTL_VNET_PCPUSTAT(_net_inet_ah, IPSECCTL_STATS, stats, struct ahstat, ahstat, "AH statistics (struct ahstat, netipsec/ah_var.h)"); #endif static unsigned char ipseczeroes[256]; /* larger than an ip6 extension hdr */ static int ah_input_cb(struct cryptop*); static int ah_output_cb(struct cryptop*); static int ah_authsize(struct secasvar *sav) { IPSEC_ASSERT(sav != NULL, ("%s: sav == NULL", __func__)); if (sav->flags & SADB_X_EXT_OLD) return 16; switch (sav->alg_auth) { case SADB_X_AALG_SHA2_256: return 16; case SADB_X_AALG_SHA2_384: return 24; case SADB_X_AALG_SHA2_512: return 32; default: return AH_HMAC_HASHLEN; } /* NOTREACHED */ } /* * NB: this is public for use by the PF_KEY support. */ struct auth_hash * ah_algorithm_lookup(int alg) { if (alg > SADB_AALG_MAX) return NULL; switch (alg) { case SADB_X_AALG_NULL: return &auth_hash_null; case SADB_AALG_MD5HMAC: return &auth_hash_hmac_md5; case SADB_AALG_SHA1HMAC: return &auth_hash_hmac_sha1; case SADB_X_AALG_RIPEMD160HMAC: return &auth_hash_hmac_ripemd_160; case SADB_X_AALG_MD5: return &auth_hash_key_md5; case SADB_X_AALG_SHA: return &auth_hash_key_sha1; case SADB_X_AALG_SHA2_256: return &auth_hash_hmac_sha2_256; case SADB_X_AALG_SHA2_384: return &auth_hash_hmac_sha2_384; case SADB_X_AALG_SHA2_512: return &auth_hash_hmac_sha2_512; } return NULL; } size_t ah_hdrsiz(struct secasvar *sav) { size_t size; if (sav != NULL) { int authsize; IPSEC_ASSERT(sav->tdb_authalgxform != NULL, ("null xform")); /*XXX not right for null algorithm--does it matter??*/ authsize = AUTHSIZE(sav); size = roundup(authsize, sizeof (u_int32_t)) + HDRSIZE(sav); } else { /* default guess */ size = sizeof (struct ah) + sizeof (u_int32_t) + 16; } return size; } /* * NB: public for use by esp_init. */ int ah_init0(struct secasvar *sav, struct xformsw *xsp, struct cryptoini *cria) { struct auth_hash *thash; int keylen; thash = ah_algorithm_lookup(sav->alg_auth); if (thash == NULL) { DPRINTF(("%s: unsupported authentication algorithm %u\n", __func__, sav->alg_auth)); return EINVAL; } /* * Verify the replay state block allocation is consistent with * the protocol type. We check here so we can make assumptions * later during protocol processing. */ /* NB: replay state is setup elsewhere (sigh) */ if (((sav->flags&SADB_X_EXT_OLD) == 0) ^ (sav->replay != NULL)) { DPRINTF(("%s: replay state block inconsistency, " "%s algorithm %s replay state\n", __func__, (sav->flags & SADB_X_EXT_OLD) ? "old" : "new", sav->replay == NULL ? "without" : "with")); return EINVAL; } if (sav->key_auth == NULL) { DPRINTF(("%s: no authentication key for %s algorithm\n", __func__, thash->name)); return EINVAL; } keylen = _KEYLEN(sav->key_auth); if (keylen != thash->keysize && thash->keysize != 0) { DPRINTF(("%s: invalid keylength %d, algorithm %s requires " "keysize %d\n", __func__, keylen, thash->name, thash->keysize)); return EINVAL; } sav->tdb_xform = xsp; sav->tdb_authalgxform = thash; /* Initialize crypto session. */ bzero(cria, sizeof (*cria)); cria->cri_alg = sav->tdb_authalgxform->type; cria->cri_klen = _KEYBITS(sav->key_auth); cria->cri_key = sav->key_auth->key_data; cria->cri_mlen = AUTHSIZE(sav); return 0; } /* * ah_init() is called when an SPI is being set up. */ static int ah_init(struct secasvar *sav, struct xformsw *xsp) { struct cryptoini cria; int error; error = ah_init0(sav, xsp, &cria); return error ? error : crypto_newsession(&sav->tdb_cryptoid, &cria, V_crypto_support); } /* * Paranoia. * * NB: public for use by esp_zeroize (XXX). */ int ah_zeroize(struct secasvar *sav) { int err; if (sav->key_auth) bzero(sav->key_auth->key_data, _KEYLEN(sav->key_auth)); err = crypto_freesession(sav->tdb_cryptoid); sav->tdb_cryptoid = 0; sav->tdb_authalgxform = NULL; sav->tdb_xform = NULL; return err; } /* * Massage IPv4/IPv6 headers for AH processing. */ static int ah_massage_headers(struct mbuf **m0, int proto, int skip, int alg, int out) { struct mbuf *m = *m0; unsigned char *ptr; int off, count; #ifdef INET struct ip *ip; #endif /* INET */ #ifdef INET6 struct ip6_ext *ip6e; struct ip6_hdr ip6; int alloc, len, ad; #endif /* INET6 */ switch (proto) { #ifdef INET case AF_INET: /* * This is the least painful way of dealing with IPv4 header * and option processing -- just make sure they're in * contiguous memory. */ *m0 = m = m_pullup(m, skip); if (m == NULL) { DPRINTF(("%s: m_pullup failed\n", __func__)); return ENOBUFS; } /* Fix the IP header */ ip = mtod(m, struct ip *); if (V_ah_cleartos) ip->ip_tos = 0; ip->ip_ttl = 0; ip->ip_sum = 0; if (alg == CRYPTO_MD5_KPDK || alg == CRYPTO_SHA1_KPDK) ip->ip_off &= htons(IP_DF); else ip->ip_off = htons(0); ptr = mtod(m, unsigned char *) + sizeof(struct ip); /* IPv4 option processing */ for (off = sizeof(struct ip); off < skip;) { if (ptr[off] == IPOPT_EOL || ptr[off] == IPOPT_NOP || off + 1 < skip) ; else { DPRINTF(("%s: illegal IPv4 option length for " "option %d\n", __func__, ptr[off])); m_freem(m); return EINVAL; } switch (ptr[off]) { case IPOPT_EOL: off = skip; /* End the loop. */ break; case IPOPT_NOP: off++; break; case IPOPT_SECURITY: /* 0x82 */ case 0x85: /* Extended security. */ case 0x86: /* Commercial security. */ case 0x94: /* Router alert */ case 0x95: /* RFC1770 */ /* Sanity check for option length. */ if (ptr[off + 1] < 2) { DPRINTF(("%s: illegal IPv4 option " "length for option %d\n", __func__, ptr[off])); m_freem(m); return EINVAL; } off += ptr[off + 1]; break; case IPOPT_LSRR: case IPOPT_SSRR: /* Sanity check for option length. */ if (ptr[off + 1] < 2) { DPRINTF(("%s: illegal IPv4 option " "length for option %d\n", __func__, ptr[off])); m_freem(m); return EINVAL; } /* * On output, if we have either of the * source routing options, we should * swap the destination address of the * IP header with the last address * specified in the option, as that is * what the destination's IP header * will look like. */ if (out) bcopy(ptr + off + ptr[off + 1] - sizeof(struct in_addr), &(ip->ip_dst), sizeof(struct in_addr)); /* Fall through */ default: /* Sanity check for option length. */ if (ptr[off + 1] < 2) { DPRINTF(("%s: illegal IPv4 option " "length for option %d\n", __func__, ptr[off])); m_freem(m); return EINVAL; } /* Zeroize all other options. */ count = ptr[off + 1]; bcopy(ipseczeroes, ptr, count); off += count; break; } /* Sanity check. */ if (off > skip) { DPRINTF(("%s: malformed IPv4 options header\n", __func__)); m_freem(m); return EINVAL; } } break; #endif /* INET */ #ifdef INET6 case AF_INET6: /* Ugly... */ /* Copy and "cook" the IPv6 header. */ m_copydata(m, 0, sizeof(ip6), (caddr_t) &ip6); /* We don't do IPv6 Jumbograms. */ if (ip6.ip6_plen == 0) { DPRINTF(("%s: unsupported IPv6 jumbogram\n", __func__)); m_freem(m); return EMSGSIZE; } ip6.ip6_flow = 0; ip6.ip6_hlim = 0; ip6.ip6_vfc &= ~IPV6_VERSION_MASK; ip6.ip6_vfc |= IPV6_VERSION; /* Scoped address handling. */ if (IN6_IS_SCOPE_LINKLOCAL(&ip6.ip6_src)) ip6.ip6_src.s6_addr16[1] = 0; if (IN6_IS_SCOPE_LINKLOCAL(&ip6.ip6_dst)) ip6.ip6_dst.s6_addr16[1] = 0; /* Done with IPv6 header. */ m_copyback(m, 0, sizeof(struct ip6_hdr), (caddr_t) &ip6); /* Let's deal with the remaining headers (if any). */ if (skip - sizeof(struct ip6_hdr) > 0) { if (m->m_len <= skip) { ptr = (unsigned char *) malloc( skip - sizeof(struct ip6_hdr), M_XDATA, M_NOWAIT); if (ptr == NULL) { DPRINTF(("%s: failed to allocate memory" "for IPv6 headers\n",__func__)); m_freem(m); return ENOBUFS; } /* * Copy all the protocol headers after * the IPv6 header. */ m_copydata(m, sizeof(struct ip6_hdr), skip - sizeof(struct ip6_hdr), ptr); alloc = 1; } else { /* No need to allocate memory. */ ptr = mtod(m, unsigned char *) + sizeof(struct ip6_hdr); alloc = 0; } } else break; off = ip6.ip6_nxt & 0xff; /* Next header type. */ for (len = 0; len < skip - sizeof(struct ip6_hdr);) switch (off) { case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: ip6e = (struct ip6_ext *) (ptr + len); /* * Process the mutable/immutable * options -- borrows heavily from the * KAME code. */ for (count = len + sizeof(struct ip6_ext); count < len + ((ip6e->ip6e_len + 1) << 3);) { if (ptr[count] == IP6OPT_PAD1) { count++; continue; /* Skip padding. */ } /* Sanity check. */ if (count > len + ((ip6e->ip6e_len + 1) << 3)) { m_freem(m); /* Free, if we allocated. */ if (alloc) free(ptr, M_XDATA); return EINVAL; } ad = ptr[count + 1]; /* If mutable option, zeroize. */ if (ptr[count] & IP6OPT_MUTABLE) bcopy(ipseczeroes, ptr + count, ptr[count + 1]); count += ad; /* Sanity check. */ if (count > skip - sizeof(struct ip6_hdr)) { m_freem(m); /* Free, if we allocated. */ if (alloc) free(ptr, M_XDATA); return EINVAL; } } /* Advance. */ len += ((ip6e->ip6e_len + 1) << 3); off = ip6e->ip6e_nxt; break; case IPPROTO_ROUTING: /* * Always include routing headers in * computation. */ ip6e = (struct ip6_ext *) (ptr + len); len += ((ip6e->ip6e_len + 1) << 3); off = ip6e->ip6e_nxt; break; default: DPRINTF(("%s: unexpected IPv6 header type %d", __func__, off)); if (alloc) free(ptr, M_XDATA); m_freem(m); return EINVAL; } /* Copyback and free, if we allocated. */ if (alloc) { m_copyback(m, sizeof(struct ip6_hdr), skip - sizeof(struct ip6_hdr), ptr); free(ptr, M_XDATA); } break; #endif /* INET6 */ } return 0; } /* * ah_input() gets called to verify that an input packet * passes authentication. */ static int ah_input(struct mbuf *m, struct secasvar *sav, int skip, int protoff) { struct auth_hash *ahx; struct tdb_ident *tdbi; struct tdb_crypto *tc; struct m_tag *mtag; struct newah *ah; int hl, rplen, authsize; struct cryptodesc *crda; struct cryptop *crp; IPSEC_ASSERT(sav != NULL, ("null SA")); IPSEC_ASSERT(sav->key_auth != NULL, ("null authentication key")); IPSEC_ASSERT(sav->tdb_authalgxform != NULL, ("null authentication xform")); /* Figure out header size. */ rplen = HDRSIZE(sav); /* XXX don't pullup, just copy header */ IP6_EXTHDR_GET(ah, struct newah *, m, skip, rplen); if (ah == NULL) { DPRINTF(("ah_input: cannot pullup header\n")); AHSTAT_INC(ahs_hdrops); /*XXX*/ m_freem(m); return ENOBUFS; } /* Check replay window, if applicable. */ if (sav->replay && !ipsec_chkreplay(ntohl(ah->ah_seq), sav)) { AHSTAT_INC(ahs_replay); DPRINTF(("%s: packet replay failure: %s\n", __func__, ipsec_logsastr(sav))); m_freem(m); return ENOBUFS; } /* Verify AH header length. */ hl = ah->ah_len * sizeof (u_int32_t); ahx = sav->tdb_authalgxform; authsize = AUTHSIZE(sav); if (hl != authsize + rplen - sizeof (struct ah)) { DPRINTF(("%s: bad authenticator length %u (expecting %lu)" " for packet in SA %s/%08lx\n", __func__, hl, (u_long) (authsize + rplen - sizeof (struct ah)), ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); AHSTAT_INC(ahs_badauthl); m_freem(m); return EACCES; } AHSTAT_ADD(ahs_ibytes, m->m_pkthdr.len - skip - hl); /* Get crypto descriptors. */ crp = crypto_getreq(1); if (crp == NULL) { DPRINTF(("%s: failed to acquire crypto descriptor\n",__func__)); AHSTAT_INC(ahs_crypto); m_freem(m); return ENOBUFS; } crda = crp->crp_desc; IPSEC_ASSERT(crda != NULL, ("null crypto descriptor")); crda->crd_skip = 0; crda->crd_len = m->m_pkthdr.len; crda->crd_inject = skip + rplen; /* Authentication operation. */ crda->crd_alg = ahx->type; crda->crd_klen = _KEYBITS(sav->key_auth); crda->crd_key = sav->key_auth->key_data; /* Find out if we've already done crypto. */ for (mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_CRYPTO_DONE, NULL); mtag != NULL; mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_CRYPTO_DONE, mtag)) { tdbi = (struct tdb_ident *) (mtag + 1); if (tdbi->proto == sav->sah->saidx.proto && tdbi->spi == sav->spi && !bcmp(&tdbi->dst, &sav->sah->saidx.dst, sizeof (union sockaddr_union))) break; } /* Allocate IPsec-specific opaque crypto info. */ if (mtag == NULL) { tc = (struct tdb_crypto *) malloc(sizeof (struct tdb_crypto) + skip + rplen + authsize, M_XDATA, M_NOWAIT|M_ZERO); } else { /* Hash verification has already been done successfully. */ tc = (struct tdb_crypto *) malloc(sizeof (struct tdb_crypto), M_XDATA, M_NOWAIT|M_ZERO); } if (tc == NULL) { DPRINTF(("%s: failed to allocate tdb_crypto\n", __func__)); AHSTAT_INC(ahs_crypto); crypto_freereq(crp); m_freem(m); return ENOBUFS; } /* Only save information if crypto processing is needed. */ if (mtag == NULL) { int error; /* * Save the authenticator, the skipped portion of the packet, * and the AH header. */ m_copydata(m, 0, skip + rplen + authsize, (caddr_t)(tc+1)); /* Zeroize the authenticator on the packet. */ m_copyback(m, skip + rplen, authsize, ipseczeroes); /* "Massage" the packet headers for crypto processing. */ error = ah_massage_headers(&m, sav->sah->saidx.dst.sa.sa_family, skip, ahx->type, 0); if (error != 0) { /* NB: mbuf is free'd by ah_massage_headers */ AHSTAT_INC(ahs_hdrops); free(tc, M_XDATA); crypto_freereq(crp); return error; } } /* Crypto operation descriptor. */ crp->crp_ilen = m->m_pkthdr.len; /* Total input length. */ crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC; crp->crp_buf = (caddr_t) m; crp->crp_callback = ah_input_cb; crp->crp_sid = sav->tdb_cryptoid; crp->crp_opaque = (caddr_t) tc; /* These are passed as-is to the callback. */ tc->tc_spi = sav->spi; tc->tc_dst = sav->sah->saidx.dst; tc->tc_proto = sav->sah->saidx.proto; tc->tc_nxt = ah->ah_nxt; tc->tc_protoff = protoff; tc->tc_skip = skip; tc->tc_ptr = (caddr_t) mtag; /* Save the mtag we've identified. */ KEY_ADDREFSA(sav); tc->tc_sav = sav; if (mtag == NULL) return crypto_dispatch(crp); else return ah_input_cb(crp); } /* * AH input callback from the crypto driver. */ static int ah_input_cb(struct cryptop *crp) { int rplen, error, skip, protoff; unsigned char calc[AH_ALEN_MAX]; struct mbuf *m; struct cryptodesc *crd; struct auth_hash *ahx; struct tdb_crypto *tc; struct m_tag *mtag; struct secasvar *sav; struct secasindex *saidx; u_int8_t nxt; caddr_t ptr; int authsize; crd = crp->crp_desc; tc = (struct tdb_crypto *) crp->crp_opaque; IPSEC_ASSERT(tc != NULL, ("null opaque crypto data area!")); skip = tc->tc_skip; nxt = tc->tc_nxt; protoff = tc->tc_protoff; mtag = (struct m_tag *) tc->tc_ptr; m = (struct mbuf *) crp->crp_buf; sav = tc->tc_sav; IPSEC_ASSERT(sav != NULL, ("null SA!")); saidx = &sav->sah->saidx; IPSEC_ASSERT(saidx->dst.sa.sa_family == AF_INET || saidx->dst.sa.sa_family == AF_INET6, ("unexpected protocol family %u", saidx->dst.sa.sa_family)); ahx = (struct auth_hash *) sav->tdb_authalgxform; /* Check for crypto errors. */ if (crp->crp_etype) { if (sav->tdb_cryptoid != 0) sav->tdb_cryptoid = crp->crp_sid; if (crp->crp_etype == EAGAIN) return (crypto_dispatch(crp)); AHSTAT_INC(ahs_noxform); DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype)); error = crp->crp_etype; goto bad; } else { AHSTAT_INC(ahs_hist[sav->alg_auth]); crypto_freereq(crp); /* No longer needed. */ crp = NULL; } /* Shouldn't happen... */ if (m == NULL) { AHSTAT_INC(ahs_crypto); DPRINTF(("%s: bogus returned buffer from crypto\n", __func__)); error = EINVAL; goto bad; } /* Figure out header size. */ rplen = HDRSIZE(sav); authsize = AUTHSIZE(sav); /* Copy authenticator off the packet. */ m_copydata(m, skip + rplen, authsize, calc); /* * If we have an mtag, we don't need to verify the authenticator -- * it has been verified by an IPsec-aware NIC. */ if (mtag == NULL) { ptr = (caddr_t) (tc + 1); /* Verify authenticator. */ if (bcmp(ptr + skip + rplen, calc, authsize)) { DPRINTF(("%s: authentication hash mismatch for packet " "in SA %s/%08lx\n", __func__, ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi))); AHSTAT_INC(ahs_badauth); error = EACCES; goto bad; } /* Fix the Next Protocol field. */ ((u_int8_t *) ptr)[protoff] = nxt; /* Copyback the saved (uncooked) network headers. */ m_copyback(m, 0, skip, ptr); } else { /* Fix the Next Protocol field. */ m_copyback(m, protoff, sizeof(u_int8_t), &nxt); } free(tc, M_XDATA), tc = NULL; /* No longer needed */ /* * Header is now authenticated. */ m->m_flags |= M_AUTHIPHDR|M_AUTHIPDGM; /* * Update replay sequence number, if appropriate. */ if (sav->replay) { u_int32_t seq; m_copydata(m, skip + offsetof(struct newah, ah_seq), sizeof (seq), (caddr_t) &seq); if (ipsec_updatereplay(ntohl(seq), sav)) { AHSTAT_INC(ahs_replay); error = ENOBUFS; /*XXX as above*/ goto bad; } } /* * Remove the AH header and authenticator from the mbuf. */ error = m_striphdr(m, skip, rplen + authsize); if (error) { DPRINTF(("%s: mangled mbuf chain for SA %s/%08lx\n", __func__, ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi))); AHSTAT_INC(ahs_hdrops); goto bad; } switch (saidx->dst.sa.sa_family) { #ifdef INET6 case AF_INET6: error = ipsec6_common_input_cb(m, sav, skip, protoff, mtag); break; #endif #ifdef INET case AF_INET: error = ipsec4_common_input_cb(m, sav, skip, protoff, mtag); break; #endif default: panic("%s: Unexpected address family: %d saidx=%p", __func__, saidx->dst.sa.sa_family, saidx); } KEY_FREESAV(&sav); return error; bad: if (sav) KEY_FREESAV(&sav); if (m != NULL) m_freem(m); if (tc != NULL) free(tc, M_XDATA); if (crp != NULL) crypto_freereq(crp); return error; } /* * AH output routine, called by ipsec[46]_process_packet(). */ static int ah_output( struct mbuf *m, struct ipsecrequest *isr, struct mbuf **mp, int skip, int protoff) { struct secasvar *sav; struct auth_hash *ahx; struct cryptodesc *crda; struct tdb_crypto *tc; struct mbuf *mi; struct cryptop *crp; u_int16_t iplen; int error, rplen, authsize, maxpacketsize, roff; u_int8_t prot; struct newah *ah; sav = isr->sav; IPSEC_ASSERT(sav != NULL, ("null SA")); ahx = sav->tdb_authalgxform; IPSEC_ASSERT(ahx != NULL, ("null authentication xform")); AHSTAT_INC(ahs_output); /* Figure out header size. */ rplen = HDRSIZE(sav); /* Check for maximum packet size violations. */ switch (sav->sah->saidx.dst.sa.sa_family) { #ifdef INET case AF_INET: maxpacketsize = IP_MAXPACKET; break; #endif /* INET */ #ifdef INET6 case AF_INET6: maxpacketsize = IPV6_MAXPACKET; break; #endif /* INET6 */ default: DPRINTF(("%s: unknown/unsupported protocol family %u, " "SA %s/%08lx\n", __func__, sav->sah->saidx.dst.sa.sa_family, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); AHSTAT_INC(ahs_nopf); error = EPFNOSUPPORT; goto bad; } authsize = AUTHSIZE(sav); if (rplen + authsize + m->m_pkthdr.len > maxpacketsize) { DPRINTF(("%s: packet in SA %s/%08lx got too big " "(len %u, max len %u)\n", __func__, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi), rplen + authsize + m->m_pkthdr.len, maxpacketsize)); AHSTAT_INC(ahs_toobig); error = EMSGSIZE; goto bad; } /* Update the counters. */ AHSTAT_ADD(ahs_obytes, m->m_pkthdr.len - skip); m = m_unshare(m, M_NOWAIT); if (m == NULL) { DPRINTF(("%s: cannot clone mbuf chain, SA %s/%08lx\n", __func__, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); AHSTAT_INC(ahs_hdrops); error = ENOBUFS; goto bad; } /* Inject AH header. */ mi = m_makespace(m, skip, rplen + authsize, &roff); if (mi == NULL) { DPRINTF(("%s: failed to inject %u byte AH header for SA " "%s/%08lx\n", __func__, rplen + authsize, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); AHSTAT_INC(ahs_hdrops); /*XXX differs from openbsd */ error = ENOBUFS; goto bad; } /* * The AH header is guaranteed by m_makespace() to be in * contiguous memory, at roff bytes offset into the returned mbuf. */ ah = (struct newah *)(mtod(mi, caddr_t) + roff); /* Initialize the AH header. */ m_copydata(m, protoff, sizeof(u_int8_t), (caddr_t) &ah->ah_nxt); ah->ah_len = (rplen + authsize - sizeof(struct ah)) / sizeof(u_int32_t); ah->ah_reserve = 0; ah->ah_spi = sav->spi; /* Zeroize authenticator. */ m_copyback(m, skip + rplen, authsize, ipseczeroes); /* Insert packet replay counter, as requested. */ if (sav->replay) { if (sav->replay->count == ~0 && (sav->flags & SADB_X_EXT_CYCSEQ) == 0) { DPRINTF(("%s: replay counter wrapped for SA %s/%08lx\n", __func__, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); AHSTAT_INC(ahs_wrap); error = EINVAL; goto bad; } #ifdef REGRESSION /* Emulate replay attack when ipsec_replay is TRUE. */ if (!V_ipsec_replay) #endif sav->replay->count++; ah->ah_seq = htonl(sav->replay->count); } /* Get crypto descriptors. */ crp = crypto_getreq(1); if (crp == NULL) { DPRINTF(("%s: failed to acquire crypto descriptors\n", __func__)); AHSTAT_INC(ahs_crypto); error = ENOBUFS; goto bad; } crda = crp->crp_desc; crda->crd_skip = 0; crda->crd_inject = skip + rplen; crda->crd_len = m->m_pkthdr.len; /* Authentication operation. */ crda->crd_alg = ahx->type; crda->crd_key = sav->key_auth->key_data; crda->crd_klen = _KEYBITS(sav->key_auth); /* Allocate IPsec-specific opaque crypto info. */ tc = (struct tdb_crypto *) malloc( sizeof(struct tdb_crypto) + skip, M_XDATA, M_NOWAIT|M_ZERO); if (tc == NULL) { crypto_freereq(crp); DPRINTF(("%s: failed to allocate tdb_crypto\n", __func__)); AHSTAT_INC(ahs_crypto); error = ENOBUFS; goto bad; } /* Save the skipped portion of the packet. */ m_copydata(m, 0, skip, (caddr_t) (tc + 1)); /* * Fix IP header length on the header used for * authentication. We don't need to fix the original * header length as it will be fixed by our caller. */ switch (sav->sah->saidx.dst.sa.sa_family) { #ifdef INET case AF_INET: bcopy(((caddr_t)(tc + 1)) + offsetof(struct ip, ip_len), (caddr_t) &iplen, sizeof(u_int16_t)); iplen = htons(ntohs(iplen) + rplen + authsize); m_copyback(m, offsetof(struct ip, ip_len), sizeof(u_int16_t), (caddr_t) &iplen); break; #endif /* INET */ #ifdef INET6 case AF_INET6: bcopy(((caddr_t)(tc + 1)) + offsetof(struct ip6_hdr, ip6_plen), (caddr_t) &iplen, sizeof(u_int16_t)); iplen = htons(ntohs(iplen) + rplen + authsize); m_copyback(m, offsetof(struct ip6_hdr, ip6_plen), sizeof(u_int16_t), (caddr_t) &iplen); break; #endif /* INET6 */ } /* Fix the Next Header field in saved header. */ ((u_int8_t *) (tc + 1))[protoff] = IPPROTO_AH; /* Update the Next Protocol field in the IP header. */ prot = IPPROTO_AH; m_copyback(m, protoff, sizeof(u_int8_t), (caddr_t) &prot); /* "Massage" the packet headers for crypto processing. */ error = ah_massage_headers(&m, sav->sah->saidx.dst.sa.sa_family, skip, ahx->type, 1); if (error != 0) { m = NULL; /* mbuf was free'd by ah_massage_headers. */ free(tc, M_XDATA); crypto_freereq(crp); goto bad; } /* Crypto operation descriptor. */ crp->crp_ilen = m->m_pkthdr.len; /* Total input length. */ crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC; crp->crp_buf = (caddr_t) m; crp->crp_callback = ah_output_cb; crp->crp_sid = sav->tdb_cryptoid; crp->crp_opaque = (caddr_t) tc; /* These are passed as-is to the callback. */ tc->tc_isr = isr; KEY_ADDREFSA(sav); tc->tc_sav = sav; tc->tc_spi = sav->spi; tc->tc_dst = sav->sah->saidx.dst; tc->tc_proto = sav->sah->saidx.proto; tc->tc_skip = skip; tc->tc_protoff = protoff; return crypto_dispatch(crp); bad: if (m) m_freem(m); return (error); } /* * AH output callback from the crypto driver. */ static int ah_output_cb(struct cryptop *crp) { int skip, protoff, error; struct tdb_crypto *tc; struct ipsecrequest *isr; struct secasvar *sav; struct mbuf *m; caddr_t ptr; tc = (struct tdb_crypto *) crp->crp_opaque; IPSEC_ASSERT(tc != NULL, ("null opaque data area!")); skip = tc->tc_skip; protoff = tc->tc_protoff; ptr = (caddr_t) (tc + 1); m = (struct mbuf *) crp->crp_buf; isr = tc->tc_isr; IPSECREQUEST_LOCK(isr); sav = tc->tc_sav; /* With the isr lock released SA pointer can be updated. */ if (sav != isr->sav) { AHSTAT_INC(ahs_notdb); DPRINTF(("%s: SA expired while in crypto\n", __func__)); error = ENOBUFS; /*XXX*/ goto bad; } /* Check for crypto errors. */ if (crp->crp_etype) { if (sav->tdb_cryptoid != 0) sav->tdb_cryptoid = crp->crp_sid; if (crp->crp_etype == EAGAIN) { IPSECREQUEST_UNLOCK(isr); return (crypto_dispatch(crp)); } AHSTAT_INC(ahs_noxform); DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype)); error = crp->crp_etype; goto bad; } /* Shouldn't happen... */ if (m == NULL) { AHSTAT_INC(ahs_crypto); DPRINTF(("%s: bogus returned buffer from crypto\n", __func__)); error = EINVAL; goto bad; } AHSTAT_INC(ahs_hist[sav->alg_auth]); /* * Copy original headers (with the new protocol number) back * in place. */ m_copyback(m, 0, skip, ptr); /* No longer needed. */ free(tc, M_XDATA); crypto_freereq(crp); #ifdef REGRESSION /* Emulate man-in-the-middle attack when ipsec_integrity is TRUE. */ if (V_ipsec_integrity) { int alen; /* * Corrupt HMAC if we want to test integrity verification of * the other side. */ alen = AUTHSIZE(sav); m_copyback(m, m->m_pkthdr.len - alen, alen, ipseczeroes); } #endif /* NB: m is reclaimed by ipsec_process_done. */ error = ipsec_process_done(m, isr); KEY_FREESAV(&sav); IPSECREQUEST_UNLOCK(isr); return error; bad: if (sav) KEY_FREESAV(&sav); IPSECREQUEST_UNLOCK(isr); if (m) m_freem(m); free(tc, M_XDATA); crypto_freereq(crp); return error; } static struct xformsw ah_xformsw = { XF_AH, XFT_AUTH, "IPsec AH", ah_init, ah_zeroize, ah_input, ah_output, }; static void ah_attach(void) { xform_register(&ah_xformsw); } SYSINIT(ah_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ah_attach, NULL); Index: user/dchagin/lemul/sys/netipsec/xform_esp.c =================================================================== --- user/dchagin/lemul/sys/netipsec/xform_esp.c (revision 274240) +++ user/dchagin/lemul/sys/netipsec/xform_esp.c (revision 274241) @@ -1,1031 +1,1031 @@ /* $FreeBSD$ */ /* $OpenBSD: ip_esp.c,v 1.69 2001/06/26 06:18:59 angelos Exp $ */ /*- * The authors of this code are John Ioannidis (ji@tla.org), * Angelos D. Keromytis (kermit@csd.uch.gr) and * Niels Provos (provos@physnet.uni-hamburg.de). * * The original version of this code was written by John Ioannidis * for BSD/OS in Athens, Greece, in November 1995. * * Ported to OpenBSD and NetBSD, with additional transforms, in December 1996, * by Angelos D. Keromytis. * * Additional transforms and features in 1997 and 1998 by Angelos D. Keromytis * and Niels Provos. * * Additional features in 1999 by Angelos D. Keromytis. * * Copyright (C) 1995, 1996, 1997, 1998, 1999 by John Ioannidis, * Angelos D. Keromytis and Niels Provos. * Copyright (c) 2001 Angelos D. Keromytis. * * Permission to use, copy, and modify this software with or without fee * is hereby granted, provided that this entire notice is included in * all copies of any software which is or includes a copy or * modification of this software. * You may use this code under the GNU public license if you so wish. Please * contribute changes back to the authors under this freer than GPL license * so that we may further the use of strong encryption without limitations to * all. * * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR * IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE * MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR * PURPOSE. */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #endif #include #include #include #include VNET_DEFINE(int, esp_enable) = 1; VNET_PCPUSTAT_DEFINE(struct espstat, espstat); VNET_PCPUSTAT_SYSINIT(espstat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(espstat); #endif /* VIMAGE */ SYSCTL_DECL(_net_inet_esp); -SYSCTL_VNET_INT(_net_inet_esp, OID_AUTO, - esp_enable, CTLFLAG_RW, &VNET_NAME(esp_enable), 0, ""); +SYSCTL_INT(_net_inet_esp, OID_AUTO, esp_enable, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(esp_enable), 0, ""); SYSCTL_VNET_PCPUSTAT(_net_inet_esp, IPSECCTL_STATS, stats, struct espstat, espstat, "ESP statistics (struct espstat, netipsec/esp_var.h"); static int esp_input_cb(struct cryptop *op); static int esp_output_cb(struct cryptop *crp); /* * NB: this is public for use by the PF_KEY support. * NB: if you add support here; be sure to add code to esp_attach below! */ struct enc_xform * esp_algorithm_lookup(int alg) { if (alg >= ESP_ALG_MAX) return NULL; switch (alg) { case SADB_EALG_DESCBC: return &enc_xform_des; case SADB_EALG_3DESCBC: return &enc_xform_3des; case SADB_X_EALG_AES: return &enc_xform_rijndael128; case SADB_X_EALG_BLOWFISHCBC: return &enc_xform_blf; case SADB_X_EALG_CAST128CBC: return &enc_xform_cast5; case SADB_X_EALG_SKIPJACK: return &enc_xform_skipjack; case SADB_EALG_NULL: return &enc_xform_null; case SADB_X_EALG_CAMELLIACBC: return &enc_xform_camellia; } return NULL; } size_t esp_hdrsiz(struct secasvar *sav) { size_t size; if (sav != NULL) { /*XXX not right for null algorithm--does it matter??*/ IPSEC_ASSERT(sav->tdb_encalgxform != NULL, ("SA with null xform")); if (sav->flags & SADB_X_EXT_OLD) size = sizeof (struct esp); else size = sizeof (struct newesp); size += sav->tdb_encalgxform->blocksize + 9; /*XXX need alg check???*/ if (sav->tdb_authalgxform != NULL && sav->replay) size += ah_hdrsiz(sav); } else { /* * base header size * + max iv length for CBC mode * + max pad length * + sizeof (pad length field) * + sizeof (next header field) * + max icv supported. */ size = sizeof (struct newesp) + EALG_MAX_BLOCK_LEN + 9 + 16; } return size; } /* * esp_init() is called when an SPI is being set up. */ static int esp_init(struct secasvar *sav, struct xformsw *xsp) { struct enc_xform *txform; struct cryptoini cria, crie; int keylen; int error; txform = esp_algorithm_lookup(sav->alg_enc); if (txform == NULL) { DPRINTF(("%s: unsupported encryption algorithm %d\n", __func__, sav->alg_enc)); return EINVAL; } if (sav->key_enc == NULL) { DPRINTF(("%s: no encoding key for %s algorithm\n", __func__, txform->name)); return EINVAL; } if ((sav->flags&(SADB_X_EXT_OLD|SADB_X_EXT_IV4B)) == SADB_X_EXT_IV4B) { DPRINTF(("%s: 4-byte IV not supported with protocol\n", __func__)); return EINVAL; } keylen = _KEYLEN(sav->key_enc); if (txform->minkey > keylen || keylen > txform->maxkey) { DPRINTF(("%s: invalid key length %u, must be in the range " "[%u..%u] for algorithm %s\n", __func__, keylen, txform->minkey, txform->maxkey, txform->name)); return EINVAL; } /* * NB: The null xform needs a non-zero blocksize to keep the * crypto code happy but if we use it to set ivlen then * the ESP header will be processed incorrectly. The * compromise is to force it to zero here. */ sav->ivlen = (txform == &enc_xform_null ? 0 : txform->blocksize); sav->iv = (caddr_t) malloc(sav->ivlen, M_XDATA, M_WAITOK); key_randomfill(sav->iv, sav->ivlen); /*XXX*/ /* * Setup AH-related state. */ if (sav->alg_auth != 0) { error = ah_init0(sav, xsp, &cria); if (error) return error; } /* NB: override anything set in ah_init0 */ sav->tdb_xform = xsp; sav->tdb_encalgxform = txform; /* Initialize crypto session. */ bzero(&crie, sizeof (crie)); crie.cri_alg = sav->tdb_encalgxform->type; crie.cri_klen = _KEYBITS(sav->key_enc); crie.cri_key = sav->key_enc->key_data; /* XXX Rounds ? */ if (sav->tdb_authalgxform && sav->tdb_encalgxform) { /* init both auth & enc */ crie.cri_next = &cria; error = crypto_newsession(&sav->tdb_cryptoid, &crie, V_crypto_support); } else if (sav->tdb_encalgxform) { error = crypto_newsession(&sav->tdb_cryptoid, &crie, V_crypto_support); } else if (sav->tdb_authalgxform) { error = crypto_newsession(&sav->tdb_cryptoid, &cria, V_crypto_support); } else { /* XXX cannot happen? */ DPRINTF(("%s: no encoding OR authentication xform!\n", __func__)); error = EINVAL; } return error; } /* * Paranoia. */ static int esp_zeroize(struct secasvar *sav) { /* NB: ah_zerorize free's the crypto session state */ int error = ah_zeroize(sav); if (sav->key_enc) bzero(sav->key_enc->key_data, _KEYLEN(sav->key_enc)); if (sav->iv) { free(sav->iv, M_XDATA); sav->iv = NULL; } sav->tdb_encalgxform = NULL; sav->tdb_xform = NULL; return error; } /* * ESP input processing, called (eventually) through the protocol switch. */ static int esp_input(struct mbuf *m, struct secasvar *sav, int skip, int protoff) { struct auth_hash *esph; struct enc_xform *espx; struct tdb_ident *tdbi; struct tdb_crypto *tc; int plen, alen, hlen; struct m_tag *mtag; struct newesp *esp; struct cryptodesc *crde; struct cryptop *crp; IPSEC_ASSERT(sav != NULL, ("null SA")); IPSEC_ASSERT(sav->tdb_encalgxform != NULL, ("null encoding xform")); /* Valid IP Packet length ? */ if ( (skip&3) || (m->m_pkthdr.len&3) ){ DPRINTF(("%s: misaligned packet, skip %u pkt len %u", __func__, skip, m->m_pkthdr.len)); ESPSTAT_INC(esps_badilen); m_freem(m); return EINVAL; } /* XXX don't pullup, just copy header */ IP6_EXTHDR_GET(esp, struct newesp *, m, skip, sizeof (struct newesp)); esph = sav->tdb_authalgxform; espx = sav->tdb_encalgxform; /* Determine the ESP header length */ if (sav->flags & SADB_X_EXT_OLD) hlen = sizeof (struct esp) + sav->ivlen; else hlen = sizeof (struct newesp) + sav->ivlen; /* Authenticator hash size */ if (esph != NULL) { switch (esph->type) { case CRYPTO_SHA2_256_HMAC: case CRYPTO_SHA2_384_HMAC: case CRYPTO_SHA2_512_HMAC: alen = esph->hashsize/2; break; default: alen = AH_HMAC_HASHLEN; break; } }else alen = 0; /* * Verify payload length is multiple of encryption algorithm * block size. * * NB: This works for the null algorithm because the blocksize * is 4 and all packets must be 4-byte aligned regardless * of the algorithm. */ plen = m->m_pkthdr.len - (skip + hlen + alen); if ((plen & (espx->blocksize - 1)) || (plen <= 0)) { DPRINTF(("%s: payload of %d octets not a multiple of %d octets," " SA %s/%08lx\n", __func__, plen, espx->blocksize, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); ESPSTAT_INC(esps_badilen); m_freem(m); return EINVAL; } /* * Check sequence number. */ if (esph && sav->replay && !ipsec_chkreplay(ntohl(esp->esp_seq), sav)) { DPRINTF(("%s: packet replay check for %s\n", __func__, ipsec_logsastr(sav))); /*XXX*/ ESPSTAT_INC(esps_replay); m_freem(m); return ENOBUFS; /*XXX*/ } /* Update the counters */ ESPSTAT_ADD(esps_ibytes, m->m_pkthdr.len - (skip + hlen + alen)); /* Find out if we've already done crypto */ for (mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_CRYPTO_DONE, NULL); mtag != NULL; mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_CRYPTO_DONE, mtag)) { tdbi = (struct tdb_ident *) (mtag + 1); if (tdbi->proto == sav->sah->saidx.proto && tdbi->spi == sav->spi && !bcmp(&tdbi->dst, &sav->sah->saidx.dst, sizeof(union sockaddr_union))) break; } /* Get crypto descriptors */ crp = crypto_getreq(esph && espx ? 2 : 1); if (crp == NULL) { DPRINTF(("%s: failed to acquire crypto descriptors\n", __func__)); ESPSTAT_INC(esps_crypto); m_freem(m); return ENOBUFS; } /* Get IPsec-specific opaque pointer */ if (esph == NULL || mtag != NULL) tc = (struct tdb_crypto *) malloc(sizeof(struct tdb_crypto), M_XDATA, M_NOWAIT|M_ZERO); else tc = (struct tdb_crypto *) malloc(sizeof(struct tdb_crypto) + alen, M_XDATA, M_NOWAIT|M_ZERO); if (tc == NULL) { crypto_freereq(crp); DPRINTF(("%s: failed to allocate tdb_crypto\n", __func__)); ESPSTAT_INC(esps_crypto); m_freem(m); return ENOBUFS; } tc->tc_ptr = (caddr_t) mtag; if (esph) { struct cryptodesc *crda = crp->crp_desc; IPSEC_ASSERT(crda != NULL, ("null ah crypto descriptor")); /* Authentication descriptor */ crda->crd_skip = skip; crda->crd_len = m->m_pkthdr.len - (skip + alen); crda->crd_inject = m->m_pkthdr.len - alen; crda->crd_alg = esph->type; crda->crd_key = sav->key_auth->key_data; crda->crd_klen = _KEYBITS(sav->key_auth); /* Copy the authenticator */ if (mtag == NULL) m_copydata(m, m->m_pkthdr.len - alen, alen, (caddr_t) (tc + 1)); /* Chain authentication request */ crde = crda->crd_next; } else { crde = crp->crp_desc; } /* Crypto operation descriptor */ crp->crp_ilen = m->m_pkthdr.len; /* Total input length */ crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC; crp->crp_buf = (caddr_t) m; crp->crp_callback = esp_input_cb; crp->crp_sid = sav->tdb_cryptoid; crp->crp_opaque = (caddr_t) tc; /* These are passed as-is to the callback */ tc->tc_spi = sav->spi; tc->tc_dst = sav->sah->saidx.dst; tc->tc_proto = sav->sah->saidx.proto; tc->tc_protoff = protoff; tc->tc_skip = skip; KEY_ADDREFSA(sav); tc->tc_sav = sav; /* Decryption descriptor */ if (espx) { IPSEC_ASSERT(crde != NULL, ("null esp crypto descriptor")); crde->crd_skip = skip + hlen; crde->crd_len = m->m_pkthdr.len - (skip + hlen + alen); crde->crd_inject = skip + hlen - sav->ivlen; crde->crd_alg = espx->type; crde->crd_key = sav->key_enc->key_data; crde->crd_klen = _KEYBITS(sav->key_enc); /* XXX Rounds ? */ } if (mtag == NULL) return crypto_dispatch(crp); else return esp_input_cb(crp); } /* * ESP input callback from the crypto driver. */ static int esp_input_cb(struct cryptop *crp) { u_int8_t lastthree[3], aalg[AH_HMAC_MAXHASHLEN]; int hlen, skip, protoff, error, alen; struct mbuf *m; struct cryptodesc *crd; struct auth_hash *esph; struct enc_xform *espx; struct tdb_crypto *tc; struct m_tag *mtag; struct secasvar *sav; struct secasindex *saidx; caddr_t ptr; crd = crp->crp_desc; IPSEC_ASSERT(crd != NULL, ("null crypto descriptor!")); tc = (struct tdb_crypto *) crp->crp_opaque; IPSEC_ASSERT(tc != NULL, ("null opaque crypto data area!")); skip = tc->tc_skip; protoff = tc->tc_protoff; mtag = (struct m_tag *) tc->tc_ptr; m = (struct mbuf *) crp->crp_buf; sav = tc->tc_sav; IPSEC_ASSERT(sav != NULL, ("null SA!")); saidx = &sav->sah->saidx; IPSEC_ASSERT(saidx->dst.sa.sa_family == AF_INET || saidx->dst.sa.sa_family == AF_INET6, ("unexpected protocol family %u", saidx->dst.sa.sa_family)); esph = sav->tdb_authalgxform; espx = sav->tdb_encalgxform; /* Check for crypto errors */ if (crp->crp_etype) { /* Reset the session ID */ if (sav->tdb_cryptoid != 0) sav->tdb_cryptoid = crp->crp_sid; if (crp->crp_etype == EAGAIN) return (crypto_dispatch(crp)); ESPSTAT_INC(esps_noxform); DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype)); error = crp->crp_etype; goto bad; } /* Shouldn't happen... */ if (m == NULL) { ESPSTAT_INC(esps_crypto); DPRINTF(("%s: bogus returned buffer from crypto\n", __func__)); error = EINVAL; goto bad; } ESPSTAT_INC(esps_hist[sav->alg_enc]); /* If authentication was performed, check now. */ if (esph != NULL) { switch (esph->type) { case CRYPTO_SHA2_256_HMAC: case CRYPTO_SHA2_384_HMAC: case CRYPTO_SHA2_512_HMAC: alen = esph->hashsize/2; break; default: alen = AH_HMAC_HASHLEN; break; } /* * If we have a tag, it means an IPsec-aware NIC did * the verification for us. Otherwise we need to * check the authentication calculation. */ AHSTAT_INC(ahs_hist[sav->alg_auth]); if (mtag == NULL) { /* Copy the authenticator from the packet */ m_copydata(m, m->m_pkthdr.len - alen, alen, aalg); ptr = (caddr_t) (tc + 1); /* Verify authenticator */ if (bcmp(ptr, aalg, alen) != 0) { DPRINTF(("%s: " "authentication hash mismatch for packet in SA %s/%08lx\n", __func__, ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi))); ESPSTAT_INC(esps_badauth); error = EACCES; goto bad; } } /* Remove trailing authenticator */ m_adj(m, -alen); } /* Release the crypto descriptors */ free(tc, M_XDATA), tc = NULL; crypto_freereq(crp), crp = NULL; /* * Packet is now decrypted. */ m->m_flags |= M_DECRYPTED; /* * Update replay sequence number, if appropriate. */ if (sav->replay) { u_int32_t seq; m_copydata(m, skip + offsetof(struct newesp, esp_seq), sizeof (seq), (caddr_t) &seq); if (ipsec_updatereplay(ntohl(seq), sav)) { DPRINTF(("%s: packet replay check for %s\n", __func__, ipsec_logsastr(sav))); ESPSTAT_INC(esps_replay); error = ENOBUFS; goto bad; } } /* Determine the ESP header length */ if (sav->flags & SADB_X_EXT_OLD) hlen = sizeof (struct esp) + sav->ivlen; else hlen = sizeof (struct newesp) + sav->ivlen; /* Remove the ESP header and IV from the mbuf. */ error = m_striphdr(m, skip, hlen); if (error) { ESPSTAT_INC(esps_hdrops); DPRINTF(("%s: bad mbuf chain, SA %s/%08lx\n", __func__, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); goto bad; } /* Save the last three bytes of decrypted data */ m_copydata(m, m->m_pkthdr.len - 3, 3, lastthree); /* Verify pad length */ if (lastthree[1] + 2 > m->m_pkthdr.len - skip) { ESPSTAT_INC(esps_badilen); DPRINTF(("%s: invalid padding length %d for %u byte packet " "in SA %s/%08lx\n", __func__, lastthree[1], m->m_pkthdr.len - skip, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); error = EINVAL; goto bad; } /* Verify correct decryption by checking the last padding bytes */ if ((sav->flags & SADB_X_EXT_PMASK) != SADB_X_EXT_PRAND) { if (lastthree[1] != lastthree[0] && lastthree[1] != 0) { ESPSTAT_INC(esps_badenc); DPRINTF(("%s: decryption failed for packet in " "SA %s/%08lx\n", __func__, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); error = EINVAL; goto bad; } } /* Trim the mbuf chain to remove trailing authenticator and padding */ m_adj(m, -(lastthree[1] + 2)); /* Restore the Next Protocol field */ m_copyback(m, protoff, sizeof (u_int8_t), lastthree + 2); switch (saidx->dst.sa.sa_family) { #ifdef INET6 case AF_INET6: error = ipsec6_common_input_cb(m, sav, skip, protoff, mtag); break; #endif #ifdef INET case AF_INET: error = ipsec4_common_input_cb(m, sav, skip, protoff, mtag); break; #endif default: panic("%s: Unexpected address family: %d saidx=%p", __func__, saidx->dst.sa.sa_family, saidx); } KEY_FREESAV(&sav); return error; bad: if (sav) KEY_FREESAV(&sav); if (m != NULL) m_freem(m); if (tc != NULL) free(tc, M_XDATA); if (crp != NULL) crypto_freereq(crp); return error; } /* * ESP output routine, called by ipsec[46]_process_packet(). */ static int esp_output( struct mbuf *m, struct ipsecrequest *isr, struct mbuf **mp, int skip, int protoff ) { struct enc_xform *espx; struct auth_hash *esph; int hlen, rlen, padding, blks, alen, i, roff; struct mbuf *mo = (struct mbuf *) NULL; struct tdb_crypto *tc; struct secasvar *sav; struct secasindex *saidx; unsigned char *pad; u_int8_t prot; int error, maxpacketsize; struct cryptodesc *crde = NULL, *crda = NULL; struct cryptop *crp; sav = isr->sav; IPSEC_ASSERT(sav != NULL, ("null SA")); esph = sav->tdb_authalgxform; espx = sav->tdb_encalgxform; IPSEC_ASSERT(espx != NULL, ("null encoding xform")); if (sav->flags & SADB_X_EXT_OLD) hlen = sizeof (struct esp) + sav->ivlen; else hlen = sizeof (struct newesp) + sav->ivlen; rlen = m->m_pkthdr.len - skip; /* Raw payload length. */ /* * NB: The null encoding transform has a blocksize of 4 * so that headers are properly aligned. */ blks = espx->blocksize; /* IV blocksize */ /* XXX clamp padding length a la KAME??? */ padding = ((blks - ((rlen + 2) % blks)) % blks) + 2; if (esph) switch (esph->type) { case CRYPTO_SHA2_256_HMAC: case CRYPTO_SHA2_384_HMAC: case CRYPTO_SHA2_512_HMAC: alen = esph->hashsize/2; break; default: alen = AH_HMAC_HASHLEN; break; } else alen = 0; ESPSTAT_INC(esps_output); saidx = &sav->sah->saidx; /* Check for maximum packet size violations. */ switch (saidx->dst.sa.sa_family) { #ifdef INET case AF_INET: maxpacketsize = IP_MAXPACKET; break; #endif /* INET */ #ifdef INET6 case AF_INET6: maxpacketsize = IPV6_MAXPACKET; break; #endif /* INET6 */ default: DPRINTF(("%s: unknown/unsupported protocol " "family %d, SA %s/%08lx\n", __func__, saidx->dst.sa.sa_family, ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi))); ESPSTAT_INC(esps_nopf); error = EPFNOSUPPORT; goto bad; } if (skip + hlen + rlen + padding + alen > maxpacketsize) { DPRINTF(("%s: packet in SA %s/%08lx got too big " "(len %u, max len %u)\n", __func__, ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi), skip + hlen + rlen + padding + alen, maxpacketsize)); ESPSTAT_INC(esps_toobig); error = EMSGSIZE; goto bad; } /* Update the counters. */ ESPSTAT_ADD(esps_obytes, m->m_pkthdr.len - skip); m = m_unshare(m, M_NOWAIT); if (m == NULL) { DPRINTF(("%s: cannot clone mbuf chain, SA %s/%08lx\n", __func__, ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi))); ESPSTAT_INC(esps_hdrops); error = ENOBUFS; goto bad; } /* Inject ESP header. */ mo = m_makespace(m, skip, hlen, &roff); if (mo == NULL) { DPRINTF(("%s: %u byte ESP hdr inject failed for SA %s/%08lx\n", __func__, hlen, ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi))); ESPSTAT_INC(esps_hdrops); /* XXX diffs from openbsd */ error = ENOBUFS; goto bad; } /* Initialize ESP header. */ bcopy((caddr_t) &sav->spi, mtod(mo, caddr_t) + roff, sizeof(u_int32_t)); if (sav->replay) { u_int32_t replay; #ifdef REGRESSION /* Emulate replay attack when ipsec_replay is TRUE. */ if (!V_ipsec_replay) #endif sav->replay->count++; replay = htonl(sav->replay->count); bcopy((caddr_t) &replay, mtod(mo, caddr_t) + roff + sizeof(u_int32_t), sizeof(u_int32_t)); } /* * Add padding -- better to do it ourselves than use the crypto engine, * although if/when we support compression, we'd have to do that. */ pad = (u_char *) m_pad(m, padding + alen); if (pad == NULL) { DPRINTF(("%s: m_pad failed for SA %s/%08lx\n", __func__, ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi))); m = NULL; /* NB: free'd by m_pad */ error = ENOBUFS; goto bad; } /* * Add padding: random, zero, or self-describing. * XXX catch unexpected setting */ switch (sav->flags & SADB_X_EXT_PMASK) { case SADB_X_EXT_PRAND: (void) read_random(pad, padding - 2); break; case SADB_X_EXT_PZERO: bzero(pad, padding - 2); break; case SADB_X_EXT_PSEQ: for (i = 0; i < padding - 2; i++) pad[i] = i+1; break; } /* Fix padding length and Next Protocol in padding itself. */ pad[padding - 2] = padding - 2; m_copydata(m, protoff, sizeof(u_int8_t), pad + padding - 1); /* Fix Next Protocol in IPv4/IPv6 header. */ prot = IPPROTO_ESP; m_copyback(m, protoff, sizeof(u_int8_t), (u_char *) &prot); /* Get crypto descriptors. */ crp = crypto_getreq(esph && espx ? 2 : 1); if (crp == NULL) { DPRINTF(("%s: failed to acquire crypto descriptors\n", __func__)); ESPSTAT_INC(esps_crypto); error = ENOBUFS; goto bad; } if (espx) { crde = crp->crp_desc; crda = crde->crd_next; /* Encryption descriptor. */ crde->crd_skip = skip + hlen; crde->crd_len = m->m_pkthdr.len - (skip + hlen + alen); crde->crd_flags = CRD_F_ENCRYPT; crde->crd_inject = skip + hlen - sav->ivlen; /* Encryption operation. */ crde->crd_alg = espx->type; crde->crd_key = sav->key_enc->key_data; crde->crd_klen = _KEYBITS(sav->key_enc); /* XXX Rounds ? */ } else crda = crp->crp_desc; /* IPsec-specific opaque crypto info. */ tc = (struct tdb_crypto *) malloc(sizeof(struct tdb_crypto), M_XDATA, M_NOWAIT|M_ZERO); if (tc == NULL) { crypto_freereq(crp); DPRINTF(("%s: failed to allocate tdb_crypto\n", __func__)); ESPSTAT_INC(esps_crypto); error = ENOBUFS; goto bad; } /* Callback parameters */ tc->tc_isr = isr; KEY_ADDREFSA(sav); tc->tc_sav = sav; tc->tc_spi = sav->spi; tc->tc_dst = saidx->dst; tc->tc_proto = saidx->proto; /* Crypto operation descriptor. */ crp->crp_ilen = m->m_pkthdr.len; /* Total input length. */ crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC; crp->crp_buf = (caddr_t) m; crp->crp_callback = esp_output_cb; crp->crp_opaque = (caddr_t) tc; crp->crp_sid = sav->tdb_cryptoid; if (esph) { /* Authentication descriptor. */ crda->crd_skip = skip; crda->crd_len = m->m_pkthdr.len - (skip + alen); crda->crd_inject = m->m_pkthdr.len - alen; /* Authentication operation. */ crda->crd_alg = esph->type; crda->crd_key = sav->key_auth->key_data; crda->crd_klen = _KEYBITS(sav->key_auth); } return crypto_dispatch(crp); bad: if (m) m_freem(m); return (error); } /* * ESP output callback from the crypto driver. */ static int esp_output_cb(struct cryptop *crp) { struct tdb_crypto *tc; struct ipsecrequest *isr; struct secasvar *sav; struct mbuf *m; int error; tc = (struct tdb_crypto *) crp->crp_opaque; IPSEC_ASSERT(tc != NULL, ("null opaque data area!")); m = (struct mbuf *) crp->crp_buf; isr = tc->tc_isr; IPSECREQUEST_LOCK(isr); sav = tc->tc_sav; /* With the isr lock released SA pointer can be updated. */ if (sav != isr->sav) { ESPSTAT_INC(esps_notdb); DPRINTF(("%s: SA gone during crypto (SA %s/%08lx proto %u)\n", __func__, ipsec_address(&tc->tc_dst), (u_long) ntohl(tc->tc_spi), tc->tc_proto)); error = ENOBUFS; /*XXX*/ goto bad; } /* Check for crypto errors. */ if (crp->crp_etype) { /* Reset session ID. */ if (sav->tdb_cryptoid != 0) sav->tdb_cryptoid = crp->crp_sid; if (crp->crp_etype == EAGAIN) { IPSECREQUEST_UNLOCK(isr); return (crypto_dispatch(crp)); } ESPSTAT_INC(esps_noxform); DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype)); error = crp->crp_etype; goto bad; } /* Shouldn't happen... */ if (m == NULL) { ESPSTAT_INC(esps_crypto); DPRINTF(("%s: bogus returned buffer from crypto\n", __func__)); error = EINVAL; goto bad; } ESPSTAT_INC(esps_hist[sav->alg_enc]); if (sav->tdb_authalgxform != NULL) AHSTAT_INC(ahs_hist[sav->alg_auth]); /* Release crypto descriptors. */ free(tc, M_XDATA); crypto_freereq(crp); #ifdef REGRESSION /* Emulate man-in-the-middle attack when ipsec_integrity is TRUE. */ if (V_ipsec_integrity) { static unsigned char ipseczeroes[AH_HMAC_MAXHASHLEN]; struct auth_hash *esph; /* * Corrupt HMAC if we want to test integrity verification of * the other side. */ esph = sav->tdb_authalgxform; if (esph != NULL) { int alen; switch (esph->type) { case CRYPTO_SHA2_256_HMAC: case CRYPTO_SHA2_384_HMAC: case CRYPTO_SHA2_512_HMAC: alen = esph->hashsize/2; break; default: alen = AH_HMAC_HASHLEN; break; } m_copyback(m, m->m_pkthdr.len - alen, alen, ipseczeroes); } } #endif /* NB: m is reclaimed by ipsec_process_done. */ error = ipsec_process_done(m, isr); KEY_FREESAV(&sav); IPSECREQUEST_UNLOCK(isr); return error; bad: if (sav) KEY_FREESAV(&sav); IPSECREQUEST_UNLOCK(isr); if (m) m_freem(m); free(tc, M_XDATA); crypto_freereq(crp); return error; } static struct xformsw esp_xformsw = { XF_ESP, XFT_CONF|XFT_AUTH, "IPsec ESP", esp_init, esp_zeroize, esp_input, esp_output }; static void esp_attach(void) { xform_register(&esp_xformsw); } SYSINIT(esp_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, esp_attach, NULL); Index: user/dchagin/lemul/sys/netipsec/xform_ipcomp.c =================================================================== --- user/dchagin/lemul/sys/netipsec/xform_ipcomp.c (revision 274240) +++ user/dchagin/lemul/sys/netipsec/xform_ipcomp.c (revision 274241) @@ -1,639 +1,639 @@ /* $FreeBSD$ */ /* $OpenBSD: ip_ipcomp.c,v 1.1 2001/07/05 12:08:52 jjbg Exp $ */ /*- * Copyright (c) 2001 Jean-Jacques Bernard-Gundol (jj@wabbitt.org) * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* IP payload compression protocol (IPComp), see RFC 2393 */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #endif #include #include #include #include #include #include #include VNET_DEFINE(int, ipcomp_enable) = 1; VNET_PCPUSTAT_DEFINE(struct ipcompstat, ipcompstat); VNET_PCPUSTAT_SYSINIT(ipcompstat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(ipcompstat); #endif /* VIMAGE */ SYSCTL_DECL(_net_inet_ipcomp); -SYSCTL_VNET_INT(_net_inet_ipcomp, OID_AUTO, - ipcomp_enable, CTLFLAG_RW, &VNET_NAME(ipcomp_enable), 0, ""); +SYSCTL_INT(_net_inet_ipcomp, OID_AUTO, ipcomp_enable, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipcomp_enable), 0, ""); SYSCTL_VNET_PCPUSTAT(_net_inet_ipcomp, IPSECCTL_STATS, stats, struct ipcompstat, ipcompstat, "IPCOMP statistics (struct ipcompstat, netipsec/ipcomp_var.h"); static int ipcomp_input_cb(struct cryptop *crp); static int ipcomp_output_cb(struct cryptop *crp); struct comp_algo * ipcomp_algorithm_lookup(int alg) { if (alg >= IPCOMP_ALG_MAX) return NULL; switch (alg) { case SADB_X_CALG_DEFLATE: return &comp_algo_deflate; } return NULL; } /* * ipcomp_init() is called when an CPI is being set up. */ static int ipcomp_init(struct secasvar *sav, struct xformsw *xsp) { struct comp_algo *tcomp; struct cryptoini cric; /* NB: algorithm really comes in alg_enc and not alg_comp! */ tcomp = ipcomp_algorithm_lookup(sav->alg_enc); if (tcomp == NULL) { DPRINTF(("%s: unsupported compression algorithm %d\n", __func__, sav->alg_comp)); return EINVAL; } sav->alg_comp = sav->alg_enc; /* set for doing histogram */ sav->tdb_xform = xsp; sav->tdb_compalgxform = tcomp; /* Initialize crypto session */ bzero(&cric, sizeof (cric)); cric.cri_alg = sav->tdb_compalgxform->type; return crypto_newsession(&sav->tdb_cryptoid, &cric, V_crypto_support); } /* * ipcomp_zeroize() used when IPCA is deleted */ static int ipcomp_zeroize(struct secasvar *sav) { int err; err = crypto_freesession(sav->tdb_cryptoid); sav->tdb_cryptoid = 0; return err; } /* * ipcomp_input() gets called to uncompress an input packet */ static int ipcomp_input(struct mbuf *m, struct secasvar *sav, int skip, int protoff) { struct tdb_crypto *tc; struct cryptodesc *crdc; struct cryptop *crp; struct ipcomp *ipcomp; caddr_t addr; int hlen = IPCOMP_HLENGTH; /* * Check that the next header of the IPComp is not IPComp again, before * doing any real work. Given it is not possible to do double * compression it means someone is playing tricks on us. */ if (m->m_len < skip + hlen && (m = m_pullup(m, skip + hlen)) == NULL) { IPCOMPSTAT_INC(ipcomps_hdrops); /*XXX*/ DPRINTF(("%s: m_pullup failed\n", __func__)); return (ENOBUFS); } addr = (caddr_t) mtod(m, struct ip *) + skip; ipcomp = (struct ipcomp *)addr; if (ipcomp->comp_nxt == IPPROTO_IPCOMP) { m_freem(m); IPCOMPSTAT_INC(ipcomps_pdrops); /* XXX have our own stats? */ DPRINTF(("%s: recursive compression detected\n", __func__)); return (EINVAL); } /* Get crypto descriptors */ crp = crypto_getreq(1); if (crp == NULL) { m_freem(m); DPRINTF(("%s: no crypto descriptors\n", __func__)); IPCOMPSTAT_INC(ipcomps_crypto); return ENOBUFS; } /* Get IPsec-specific opaque pointer */ tc = (struct tdb_crypto *) malloc(sizeof (*tc), M_XDATA, M_NOWAIT|M_ZERO); if (tc == NULL) { m_freem(m); crypto_freereq(crp); DPRINTF(("%s: cannot allocate tdb_crypto\n", __func__)); IPCOMPSTAT_INC(ipcomps_crypto); return ENOBUFS; } crdc = crp->crp_desc; crdc->crd_skip = skip + hlen; crdc->crd_len = m->m_pkthdr.len - (skip + hlen); crdc->crd_inject = skip; tc->tc_ptr = 0; /* Decompression operation */ crdc->crd_alg = sav->tdb_compalgxform->type; /* Crypto operation descriptor */ crp->crp_ilen = m->m_pkthdr.len - (skip + hlen); crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC; crp->crp_buf = (caddr_t) m; crp->crp_callback = ipcomp_input_cb; crp->crp_sid = sav->tdb_cryptoid; crp->crp_opaque = (caddr_t) tc; /* These are passed as-is to the callback */ tc->tc_spi = sav->spi; tc->tc_dst = sav->sah->saidx.dst; tc->tc_proto = sav->sah->saidx.proto; tc->tc_protoff = protoff; tc->tc_skip = skip; KEY_ADDREFSA(sav); tc->tc_sav = sav; return crypto_dispatch(crp); } /* * IPComp input callback from the crypto driver. */ static int ipcomp_input_cb(struct cryptop *crp) { struct cryptodesc *crd; struct tdb_crypto *tc; int skip, protoff; struct mtag *mtag; struct mbuf *m; struct secasvar *sav; struct secasindex *saidx; int hlen = IPCOMP_HLENGTH, error, clen; u_int8_t nproto; caddr_t addr; crd = crp->crp_desc; tc = (struct tdb_crypto *) crp->crp_opaque; IPSEC_ASSERT(tc != NULL, ("null opaque crypto data area!")); skip = tc->tc_skip; protoff = tc->tc_protoff; mtag = (struct mtag *) tc->tc_ptr; m = (struct mbuf *) crp->crp_buf; sav = tc->tc_sav; IPSEC_ASSERT(sav != NULL, ("null SA!")); saidx = &sav->sah->saidx; IPSEC_ASSERT(saidx->dst.sa.sa_family == AF_INET || saidx->dst.sa.sa_family == AF_INET6, ("unexpected protocol family %u", saidx->dst.sa.sa_family)); /* Check for crypto errors */ if (crp->crp_etype) { /* Reset the session ID */ if (sav->tdb_cryptoid != 0) sav->tdb_cryptoid = crp->crp_sid; if (crp->crp_etype == EAGAIN) { return crypto_dispatch(crp); } IPCOMPSTAT_INC(ipcomps_noxform); DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype)); error = crp->crp_etype; goto bad; } /* Shouldn't happen... */ if (m == NULL) { IPCOMPSTAT_INC(ipcomps_crypto); DPRINTF(("%s: null mbuf returned from crypto\n", __func__)); error = EINVAL; goto bad; } IPCOMPSTAT_INC(ipcomps_hist[sav->alg_comp]); clen = crp->crp_olen; /* Length of data after processing */ /* Release the crypto descriptors */ free(tc, M_XDATA), tc = NULL; crypto_freereq(crp), crp = NULL; /* In case it's not done already, adjust the size of the mbuf chain */ m->m_pkthdr.len = clen + hlen + skip; if (m->m_len < skip + hlen && (m = m_pullup(m, skip + hlen)) == 0) { IPCOMPSTAT_INC(ipcomps_hdrops); /*XXX*/ DPRINTF(("%s: m_pullup failed\n", __func__)); error = EINVAL; /*XXX*/ goto bad; } /* Keep the next protocol field */ addr = (caddr_t) mtod(m, struct ip *) + skip; nproto = ((struct ipcomp *) addr)->comp_nxt; /* Remove the IPCOMP header */ error = m_striphdr(m, skip, hlen); if (error) { IPCOMPSTAT_INC(ipcomps_hdrops); DPRINTF(("%s: bad mbuf chain, IPCA %s/%08lx\n", __func__, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); goto bad; } /* Restore the Next Protocol field */ m_copyback(m, protoff, sizeof (u_int8_t), (u_int8_t *) &nproto); switch (saidx->dst.sa.sa_family) { #ifdef INET6 case AF_INET6: error = ipsec6_common_input_cb(m, sav, skip, protoff, NULL); break; #endif #ifdef INET case AF_INET: error = ipsec4_common_input_cb(m, sav, skip, protoff, NULL); break; #endif default: panic("%s: Unexpected address family: %d saidx=%p", __func__, saidx->dst.sa.sa_family, saidx); } KEY_FREESAV(&sav); return error; bad: if (sav) KEY_FREESAV(&sav); if (m) m_freem(m); if (tc != NULL) free(tc, M_XDATA); if (crp) crypto_freereq(crp); return error; } /* * IPComp output routine, called by ipsec[46]_process_packet() */ static int ipcomp_output( struct mbuf *m, struct ipsecrequest *isr, struct mbuf **mp, int skip, int protoff ) { struct secasvar *sav; struct comp_algo *ipcompx; int error, ralen, maxpacketsize; struct cryptodesc *crdc; struct cryptop *crp; struct tdb_crypto *tc; sav = isr->sav; IPSEC_ASSERT(sav != NULL, ("null SA")); ipcompx = sav->tdb_compalgxform; IPSEC_ASSERT(ipcompx != NULL, ("null compression xform")); /* * Do not touch the packet in case our payload to compress * is lower than the minimal threshold of the compression * alogrithm. We will just send out the data uncompressed. * See RFC 3173, 2.2. Non-Expansion Policy. */ if (m->m_pkthdr.len <= ipcompx->minlen) { IPCOMPSTAT_INC(ipcomps_threshold); return ipsec_process_done(m, isr); } ralen = m->m_pkthdr.len - skip; /* Raw payload length before comp. */ IPCOMPSTAT_INC(ipcomps_output); /* Check for maximum packet size violations. */ switch (sav->sah->saidx.dst.sa.sa_family) { #ifdef INET case AF_INET: maxpacketsize = IP_MAXPACKET; break; #endif /* INET */ #ifdef INET6 case AF_INET6: maxpacketsize = IPV6_MAXPACKET; break; #endif /* INET6 */ default: IPCOMPSTAT_INC(ipcomps_nopf); DPRINTF(("%s: unknown/unsupported protocol family %d, " "IPCA %s/%08lx\n", __func__, sav->sah->saidx.dst.sa.sa_family, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); error = EPFNOSUPPORT; goto bad; } if (ralen + skip + IPCOMP_HLENGTH > maxpacketsize) { IPCOMPSTAT_INC(ipcomps_toobig); DPRINTF(("%s: packet in IPCA %s/%08lx got too big " "(len %u, max len %u)\n", __func__, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi), ralen + skip + IPCOMP_HLENGTH, maxpacketsize)); error = EMSGSIZE; goto bad; } /* Update the counters */ IPCOMPSTAT_ADD(ipcomps_obytes, m->m_pkthdr.len - skip); m = m_unshare(m, M_NOWAIT); if (m == NULL) { IPCOMPSTAT_INC(ipcomps_hdrops); DPRINTF(("%s: cannot clone mbuf chain, IPCA %s/%08lx\n", __func__, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); error = ENOBUFS; goto bad; } /* Ok now, we can pass to the crypto processing. */ /* Get crypto descriptors */ crp = crypto_getreq(1); if (crp == NULL) { IPCOMPSTAT_INC(ipcomps_crypto); DPRINTF(("%s: failed to acquire crypto descriptor\n",__func__)); error = ENOBUFS; goto bad; } crdc = crp->crp_desc; /* Compression descriptor */ crdc->crd_skip = skip; crdc->crd_len = ralen; crdc->crd_flags = CRD_F_COMP; crdc->crd_inject = skip; /* Compression operation */ crdc->crd_alg = ipcompx->type; /* IPsec-specific opaque crypto info */ tc = (struct tdb_crypto *) malloc(sizeof(struct tdb_crypto), M_XDATA, M_NOWAIT|M_ZERO); if (tc == NULL) { IPCOMPSTAT_INC(ipcomps_crypto); DPRINTF(("%s: failed to allocate tdb_crypto\n", __func__)); crypto_freereq(crp); error = ENOBUFS; goto bad; } tc->tc_isr = isr; KEY_ADDREFSA(sav); tc->tc_sav = sav; tc->tc_spi = sav->spi; tc->tc_dst = sav->sah->saidx.dst; tc->tc_proto = sav->sah->saidx.proto; tc->tc_protoff = protoff; tc->tc_skip = skip; /* Crypto operation descriptor */ crp->crp_ilen = m->m_pkthdr.len; /* Total input length */ crp->crp_flags = CRYPTO_F_IMBUF | CRYPTO_F_CBIFSYNC; crp->crp_buf = (caddr_t) m; crp->crp_callback = ipcomp_output_cb; crp->crp_opaque = (caddr_t) tc; crp->crp_sid = sav->tdb_cryptoid; return crypto_dispatch(crp); bad: if (m) m_freem(m); return (error); } /* * IPComp output callback from the crypto driver. */ static int ipcomp_output_cb(struct cryptop *crp) { struct tdb_crypto *tc; struct ipsecrequest *isr; struct secasvar *sav; struct mbuf *m; int error, skip; tc = (struct tdb_crypto *) crp->crp_opaque; IPSEC_ASSERT(tc != NULL, ("null opaque data area!")); m = (struct mbuf *) crp->crp_buf; skip = tc->tc_skip; isr = tc->tc_isr; IPSECREQUEST_LOCK(isr); sav = tc->tc_sav; /* With the isr lock released SA pointer can be updated. */ if (sav != isr->sav) { IPCOMPSTAT_INC(ipcomps_notdb); DPRINTF(("%s: SA expired while in crypto\n", __func__)); error = ENOBUFS; /*XXX*/ goto bad; } /* Check for crypto errors */ if (crp->crp_etype) { /* Reset the session ID */ if (sav->tdb_cryptoid != 0) sav->tdb_cryptoid = crp->crp_sid; if (crp->crp_etype == EAGAIN) { IPSECREQUEST_UNLOCK(isr); return crypto_dispatch(crp); } IPCOMPSTAT_INC(ipcomps_noxform); DPRINTF(("%s: crypto error %d\n", __func__, crp->crp_etype)); error = crp->crp_etype; goto bad; } /* Shouldn't happen... */ if (m == NULL) { IPCOMPSTAT_INC(ipcomps_crypto); DPRINTF(("%s: bogus return buffer from crypto\n", __func__)); error = EINVAL; goto bad; } IPCOMPSTAT_INC(ipcomps_hist[sav->alg_comp]); if (crp->crp_ilen - skip > crp->crp_olen) { struct mbuf *mo; struct ipcomp *ipcomp; int roff; uint8_t prot; /* Compression helped, inject IPCOMP header. */ mo = m_makespace(m, skip, IPCOMP_HLENGTH, &roff); if (mo == NULL) { IPCOMPSTAT_INC(ipcomps_wrap); DPRINTF(("%s: IPCOMP header inject failed for IPCA %s/%08lx\n", __func__, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); error = ENOBUFS; goto bad; } ipcomp = (struct ipcomp *)(mtod(mo, caddr_t) + roff); /* Initialize the IPCOMP header */ /* XXX alignment always correct? */ switch (sav->sah->saidx.dst.sa.sa_family) { #ifdef INET case AF_INET: ipcomp->comp_nxt = mtod(m, struct ip *)->ip_p; break; #endif /* INET */ #ifdef INET6 case AF_INET6: ipcomp->comp_nxt = mtod(m, struct ip6_hdr *)->ip6_nxt; break; #endif } ipcomp->comp_flags = 0; ipcomp->comp_cpi = htons((u_int16_t) ntohl(sav->spi)); /* Fix Next Protocol in IPv4/IPv6 header */ prot = IPPROTO_IPCOMP; m_copyback(m, tc->tc_protoff, sizeof(u_int8_t), (u_char *)&prot); /* Adjust the length in the IP header */ switch (sav->sah->saidx.dst.sa.sa_family) { #ifdef INET case AF_INET: mtod(m, struct ip *)->ip_len = htons(m->m_pkthdr.len); break; #endif /* INET */ #ifdef INET6 case AF_INET6: mtod(m, struct ip6_hdr *)->ip6_plen = htons(m->m_pkthdr.len) - sizeof(struct ip6_hdr); break; #endif /* INET6 */ default: IPCOMPSTAT_INC(ipcomps_nopf); DPRINTF(("%s: unknown/unsupported protocol " "family %d, IPCA %s/%08lx\n", __func__, sav->sah->saidx.dst.sa.sa_family, ipsec_address(&sav->sah->saidx.dst), (u_long) ntohl(sav->spi))); error = EPFNOSUPPORT; goto bad; } } else { /* Compression was useless, we have lost time. */ IPCOMPSTAT_INC(ipcomps_uncompr); DPRINTF(("%s: compressions was useless %d - %d <= %d\n", __func__, crp->crp_ilen, skip, crp->crp_olen)); /* XXX remember state to not compress the next couple * of packets, RFC 3173, 2.2. Non-Expansion Policy */ } /* Release the crypto descriptor */ free(tc, M_XDATA); crypto_freereq(crp); /* NB: m is reclaimed by ipsec_process_done. */ error = ipsec_process_done(m, isr); KEY_FREESAV(&sav); IPSECREQUEST_UNLOCK(isr); return error; bad: if (sav) KEY_FREESAV(&sav); IPSECREQUEST_UNLOCK(isr); if (m) m_freem(m); free(tc, M_XDATA); crypto_freereq(crp); return error; } static struct xformsw ipcomp_xformsw = { XF_IPCOMP, XFT_COMP, "IPcomp", ipcomp_init, ipcomp_zeroize, ipcomp_input, ipcomp_output }; static void ipcomp_attach(void) { xform_register(&ipcomp_xformsw); } SYSINIT(ipcomp_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ipcomp_attach, NULL); Index: user/dchagin/lemul/sys/netipsec/xform_ipip.c =================================================================== --- user/dchagin/lemul/sys/netipsec/xform_ipip.c (revision 274240) +++ user/dchagin/lemul/sys/netipsec/xform_ipip.c (revision 274241) @@ -1,668 +1,668 @@ /* $FreeBSD$ */ /* $OpenBSD: ip_ipip.c,v 1.25 2002/06/10 18:04:55 itojun Exp $ */ /*- * The authors of this code are John Ioannidis (ji@tla.org), * Angelos D. Keromytis (kermit@csd.uch.gr) and * Niels Provos (provos@physnet.uni-hamburg.de). * * The original version of this code was written by John Ioannidis * for BSD/OS in Athens, Greece, in November 1995. * * Ported to OpenBSD and NetBSD, with additional transforms, in December 1996, * by Angelos D. Keromytis. * * Additional transforms and features in 1997 and 1998 by Angelos D. Keromytis * and Niels Provos. * * Additional features in 1999 by Angelos D. Keromytis. * * Copyright (C) 1995, 1996, 1997, 1998, 1999 by John Ioannidis, * Angelos D. Keromytis and Niels Provos. * Copyright (c) 2001, Angelos D. Keromytis. * * Permission to use, copy, and modify this software with or without fee * is hereby granted, provided that this entire notice is included in * all copies of any software which is or includes a copy or * modification of this software. * You may use this code under the GNU public license if you so wish. Please * contribute changes back to the authors under this freer than GPL license * so that we may further the use of strong encryption without limitations to * all. * * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR * IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE * MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR * PURPOSE. */ /* * IP-inside-IP processing */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_enc.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #include #endif #include #include #include /* * We can control the acceptance of IP4 packets by altering the sysctl * net.inet.ipip.allow value. Zero means drop them, all else is acceptance. */ VNET_DEFINE(int, ipip_allow) = 0; VNET_PCPUSTAT_DEFINE(struct ipipstat, ipipstat); VNET_PCPUSTAT_SYSINIT(ipipstat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(ipipstat); #endif /* VIMAGE */ SYSCTL_DECL(_net_inet_ipip); -SYSCTL_VNET_INT(_net_inet_ipip, OID_AUTO, - ipip_allow, CTLFLAG_RW, &VNET_NAME(ipip_allow), 0, ""); +SYSCTL_INT(_net_inet_ipip, OID_AUTO, ipip_allow, CTLFLAG_VNET | CTLFLAG_RW, + &VNET_NAME(ipip_allow), 0, ""); SYSCTL_VNET_PCPUSTAT(_net_inet_ipip, IPSECCTL_STATS, stats, struct ipipstat, ipipstat, "IPIP statistics (struct ipipstat, netipsec/ipip_var.h)"); /* XXX IPCOMP */ #define M_IPSEC (M_AUTHIPHDR|M_AUTHIPDGM|M_DECRYPTED) static void _ipip_input(struct mbuf *m, int iphlen, struct ifnet *gifp); #ifdef INET6 /* * Really only a wrapper for ipip_input(), for use with IPv6. */ int ip4_input6(struct mbuf **m, int *offp, int proto) { #if 0 /* If we do not accept IP-in-IP explicitly, drop. */ if (!V_ipip_allow && ((*m)->m_flags & M_IPSEC) == 0) { DPRINTF(("%s: dropped due to policy\n", __func__)); IPIPSTAT_INC(ipips_pdrops); m_freem(*m); return IPPROTO_DONE; } #endif _ipip_input(*m, *offp, NULL); return IPPROTO_DONE; } #endif /* INET6 */ #ifdef INET /* * Really only a wrapper for ipip_input(), for use with IPv4. */ int ip4_input(struct mbuf **mp, int *offp, int proto) { #if 0 /* If we do not accept IP-in-IP explicitly, drop. */ if (!V_ipip_allow && (m->m_flags & M_IPSEC) == 0) { DPRINTF(("%s: dropped due to policy\n", __func__)); IPIPSTAT_INC(ipips_pdrops); m_freem(m); return; } #endif _ipip_input(*mp, *offp, NULL); return (IPPROTO_DONE); } #endif /* INET */ /* * ipip_input gets called when we receive an IP{46} encapsulated packet, * either because we got it at a real interface, or because AH or ESP * were being used in tunnel mode (in which case the rcvif element will * contain the address of the encX interface associated with the tunnel. */ static void _ipip_input(struct mbuf *m, int iphlen, struct ifnet *gifp) { struct ip *ipo; #ifdef INET6 struct ip6_hdr *ip6 = NULL; u_int8_t itos; #endif int isr; u_int8_t otos; u_int8_t v; int hlen; IPIPSTAT_INC(ipips_ipackets); m_copydata(m, 0, 1, &v); switch (v >> 4) { #ifdef INET case 4: hlen = sizeof(struct ip); break; #endif /* INET */ #ifdef INET6 case 6: hlen = sizeof(struct ip6_hdr); break; #endif default: IPIPSTAT_INC(ipips_family); m_freem(m); return /* EAFNOSUPPORT */; } /* Bring the IP header in the first mbuf, if not there already */ if (m->m_len < hlen) { if ((m = m_pullup(m, hlen)) == NULL) { DPRINTF(("%s: m_pullup (1) failed\n", __func__)); IPIPSTAT_INC(ipips_hdrops); return; } } ipo = mtod(m, struct ip *); /* Keep outer ecn field. */ switch (v >> 4) { #ifdef INET case 4: otos = ipo->ip_tos; break; #endif /* INET */ #ifdef INET6 case 6: otos = (ntohl(mtod(m, struct ip6_hdr *)->ip6_flow) >> 20) & 0xff; break; #endif default: panic("ipip_input: unknown ip version %u (outer)", v>>4); } /* Remove outer IP header */ m_adj(m, iphlen); /* Sanity check */ if (m->m_pkthdr.len < sizeof(struct ip)) { IPIPSTAT_INC(ipips_hdrops); m_freem(m); return; } m_copydata(m, 0, 1, &v); switch (v >> 4) { #ifdef INET case 4: hlen = sizeof(struct ip); break; #endif /* INET */ #ifdef INET6 case 6: hlen = sizeof(struct ip6_hdr); break; #endif default: IPIPSTAT_INC(ipips_family); m_freem(m); return; /* EAFNOSUPPORT */ } /* * Bring the inner IP header in the first mbuf, if not there already. */ if (m->m_len < hlen) { if ((m = m_pullup(m, hlen)) == NULL) { DPRINTF(("%s: m_pullup (2) failed\n", __func__)); IPIPSTAT_INC(ipips_hdrops); return; } } /* * RFC 1853 specifies that the inner TTL should not be touched on * decapsulation. There's no reason this comment should be here, but * this is as good as any a position. */ /* Some sanity checks in the inner IP header */ switch (v >> 4) { #ifdef INET case 4: ipo = mtod(m, struct ip *); ip_ecn_egress(V_ip4_ipsec_ecn, &otos, &ipo->ip_tos); break; #endif /* INET */ #ifdef INET6 case 6: ip6 = (struct ip6_hdr *) ipo; itos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; ip_ecn_egress(V_ip6_ipsec_ecn, &otos, &itos); ip6->ip6_flow &= ~htonl(0xff << 20); ip6->ip6_flow |= htonl((u_int32_t) itos << 20); break; #endif default: panic("ipip_input: unknown ip version %u (inner)", v>>4); } /* Check for local address spoofing. */ if ((m->m_pkthdr.rcvif == NULL || !(m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK)) && V_ipip_allow != 2) { #ifdef INET if ((v >> 4) == IPVERSION && in_localip(ipo->ip_src) != 0) { IPIPSTAT_INC(ipips_spoof); m_freem(m); return; } #endif #ifdef INET6 if ((v & IPV6_VERSION_MASK) == IPV6_VERSION && in6_localip(&ip6->ip6_src) != 0) { IPIPSTAT_INC(ipips_spoof); m_freem(m); return; } #endif } /* Statistics */ IPIPSTAT_ADD(ipips_ibytes, m->m_pkthdr.len - iphlen); /* * Interface pointer stays the same; if no IPsec processing has * been done (or will be done), this will point to a normal * interface. Otherwise, it'll point to an enc interface, which * will allow a packet filter to distinguish between secure and * untrusted packets. */ switch (v >> 4) { #ifdef INET case 4: isr = NETISR_IP; break; #endif #ifdef INET6 case 6: isr = NETISR_IPV6; break; #endif default: panic("%s: bogus ip version %u", __func__, v>>4); } if (netisr_queue(isr, m)) { /* (0) on success. */ IPIPSTAT_INC(ipips_qfull); DPRINTF(("%s: packet dropped because of full queue\n", __func__)); } } int ipip_output( struct mbuf *m, struct ipsecrequest *isr, struct mbuf **mp, int skip, int protoff ) { struct secasvar *sav; u_int8_t tp, otos; struct secasindex *saidx; int error; #if defined(INET) || defined(INET6) u_int8_t itos; #endif #ifdef INET struct ip *ipo; #endif /* INET */ #ifdef INET6 struct ip6_hdr *ip6, *ip6o; #endif /* INET6 */ sav = isr->sav; IPSEC_ASSERT(sav != NULL, ("null SA")); IPSEC_ASSERT(sav->sah != NULL, ("null SAH")); /* XXX Deal with empty TDB source/destination addresses. */ m_copydata(m, 0, 1, &tp); tp = (tp >> 4) & 0xff; /* Get the IP version number. */ saidx = &sav->sah->saidx; switch (saidx->dst.sa.sa_family) { #ifdef INET case AF_INET: if (saidx->src.sa.sa_family != AF_INET || saidx->src.sin.sin_addr.s_addr == INADDR_ANY || saidx->dst.sin.sin_addr.s_addr == INADDR_ANY) { DPRINTF(("%s: unspecified tunnel endpoint " "address in SA %s/%08lx\n", __func__, ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi))); IPIPSTAT_INC(ipips_unspec); error = EINVAL; goto bad; } M_PREPEND(m, sizeof(struct ip), M_NOWAIT); if (m == 0) { DPRINTF(("%s: M_PREPEND failed\n", __func__)); IPIPSTAT_INC(ipips_hdrops); error = ENOBUFS; goto bad; } ipo = mtod(m, struct ip *); ipo->ip_v = IPVERSION; ipo->ip_hl = 5; ipo->ip_len = htons(m->m_pkthdr.len); ipo->ip_ttl = V_ip_defttl; ipo->ip_sum = 0; ipo->ip_src = saidx->src.sin.sin_addr; ipo->ip_dst = saidx->dst.sin.sin_addr; ipo->ip_id = ip_newid(); /* If the inner protocol is IP... */ switch (tp) { case IPVERSION: /* Save ECN notification */ m_copydata(m, sizeof(struct ip) + offsetof(struct ip, ip_tos), sizeof(u_int8_t), (caddr_t) &itos); ipo->ip_p = IPPROTO_IPIP; /* * We should be keeping tunnel soft-state and * send back ICMPs if needed. */ m_copydata(m, sizeof(struct ip) + offsetof(struct ip, ip_off), sizeof(u_int16_t), (caddr_t) &ipo->ip_off); ipo->ip_off = ntohs(ipo->ip_off); ipo->ip_off &= ~(IP_DF | IP_MF | IP_OFFMASK); ipo->ip_off = htons(ipo->ip_off); break; #ifdef INET6 case (IPV6_VERSION >> 4): { u_int32_t itos32; /* Save ECN notification. */ m_copydata(m, sizeof(struct ip) + offsetof(struct ip6_hdr, ip6_flow), sizeof(u_int32_t), (caddr_t) &itos32); itos = ntohl(itos32) >> 20; ipo->ip_p = IPPROTO_IPV6; ipo->ip_off = 0; break; } #endif /* INET6 */ default: goto nofamily; } otos = 0; ip_ecn_ingress(ECN_ALLOWED, &otos, &itos); ipo->ip_tos = otos; break; #endif /* INET */ #ifdef INET6 case AF_INET6: if (IN6_IS_ADDR_UNSPECIFIED(&saidx->dst.sin6.sin6_addr) || saidx->src.sa.sa_family != AF_INET6 || IN6_IS_ADDR_UNSPECIFIED(&saidx->src.sin6.sin6_addr)) { DPRINTF(("%s: unspecified tunnel endpoint " "address in SA %s/%08lx\n", __func__, ipsec_address(&saidx->dst), (u_long) ntohl(sav->spi))); IPIPSTAT_INC(ipips_unspec); error = ENOBUFS; goto bad; } /* scoped address handling */ ip6 = mtod(m, struct ip6_hdr *); if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) ip6->ip6_src.s6_addr16[1] = 0; if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst)) ip6->ip6_dst.s6_addr16[1] = 0; M_PREPEND(m, sizeof(struct ip6_hdr), M_NOWAIT); if (m == 0) { DPRINTF(("%s: M_PREPEND failed\n", __func__)); IPIPSTAT_INC(ipips_hdrops); error = ENOBUFS; goto bad; } /* Initialize IPv6 header */ ip6o = mtod(m, struct ip6_hdr *); ip6o->ip6_flow = 0; ip6o->ip6_vfc &= ~IPV6_VERSION_MASK; ip6o->ip6_vfc |= IPV6_VERSION; ip6o->ip6_plen = htons(m->m_pkthdr.len); ip6o->ip6_hlim = IPV6_DEFHLIM; ip6o->ip6_dst = saidx->dst.sin6.sin6_addr; ip6o->ip6_src = saidx->src.sin6.sin6_addr; /* Fix payload length */ ip6o->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); switch (tp) { #ifdef INET case IPVERSION: /* Save ECN notification */ m_copydata(m, sizeof(struct ip6_hdr) + offsetof(struct ip, ip_tos), sizeof(u_int8_t), (caddr_t) &itos); /* This is really IPVERSION. */ ip6o->ip6_nxt = IPPROTO_IPIP; break; #endif /* INET */ case (IPV6_VERSION >> 4): { u_int32_t itos32; /* Save ECN notification. */ m_copydata(m, sizeof(struct ip6_hdr) + offsetof(struct ip6_hdr, ip6_flow), sizeof(u_int32_t), (caddr_t) &itos32); itos = ntohl(itos32) >> 20; ip6o->ip6_nxt = IPPROTO_IPV6; break; } default: goto nofamily; } otos = 0; ip_ecn_ingress(V_ip6_ipsec_ecn, &otos, &itos); ip6o->ip6_flow |= htonl((u_int32_t) otos << 20); break; #endif /* INET6 */ default: nofamily: DPRINTF(("%s: unsupported protocol family %u\n", __func__, saidx->dst.sa.sa_family)); IPIPSTAT_INC(ipips_family); error = EAFNOSUPPORT; /* XXX diffs from openbsd */ goto bad; } IPIPSTAT_INC(ipips_opackets); *mp = m; #ifdef INET if (saidx->dst.sa.sa_family == AF_INET) { #if 0 if (sav->tdb_xform->xf_type == XF_IP4) tdb->tdb_cur_bytes += m->m_pkthdr.len - sizeof(struct ip); #endif IPIPSTAT_ADD(ipips_obytes, m->m_pkthdr.len - sizeof(struct ip)); } #endif /* INET */ #ifdef INET6 if (saidx->dst.sa.sa_family == AF_INET6) { #if 0 if (sav->tdb_xform->xf_type == XF_IP4) tdb->tdb_cur_bytes += m->m_pkthdr.len - sizeof(struct ip6_hdr); #endif IPIPSTAT_ADD(ipips_obytes, m->m_pkthdr.len - sizeof(struct ip6_hdr)); } #endif /* INET6 */ return 0; bad: if (m) m_freem(m); *mp = NULL; return (error); } #ifdef IPSEC #if defined(INET) || defined(INET6) static int ipe4_init(struct secasvar *sav, struct xformsw *xsp) { sav->tdb_xform = xsp; return 0; } static int ipe4_zeroize(struct secasvar *sav) { sav->tdb_xform = NULL; return 0; } static int ipe4_input(struct mbuf *m, struct secasvar *sav, int skip, int protoff) { /* This is a rather serious mistake, so no conditional printing. */ printf("%s: should never be called\n", __func__); if (m) m_freem(m); return EOPNOTSUPP; } static struct xformsw ipe4_xformsw = { XF_IP4, 0, "IPv4 Simple Encapsulation", ipe4_init, ipe4_zeroize, ipe4_input, ipip_output, }; extern struct domain inetdomain; #endif /* INET || INET6 */ #ifdef INET static struct protosw ipe4_protosw = { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_IPV4, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = ip4_input, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }; #endif /* INET */ #if defined(INET6) && defined(INET) static struct protosw ipe6_protosw = { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_IPV6, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = ip4_input6, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }; #endif /* INET6 && INET */ #ifdef INET /* * Check the encapsulated packet to see if we want it */ static int ipe4_encapcheck(const struct mbuf *m, int off, int proto, void *arg) { /* * Only take packets coming from IPSEC tunnels; the rest * must be handled by the gif tunnel code. Note that we * also return a minimum priority when we want the packet * so any explicit gif tunnels take precedence. */ return ((m->m_flags & M_IPSEC) != 0 ? 1 : 0); } #endif /* INET */ static void ipe4_attach(void) { xform_register(&ipe4_xformsw); /* attach to encapsulation framework */ /* XXX save return cookie for detach on module remove */ #ifdef INET (void) encap_attach_func(AF_INET, -1, ipe4_encapcheck, &ipe4_protosw, NULL); #endif #if defined(INET6) && defined(INET) (void) encap_attach_func(AF_INET6, -1, ipe4_encapcheck, (struct protosw *)&ipe6_protosw, NULL); #endif } SYSINIT(ipe4_xform_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ipe4_attach, NULL); #endif /* IPSEC */ Index: user/dchagin/lemul/sys/netpfil/ipfw/ip_fw2.c =================================================================== --- user/dchagin/lemul/sys/netpfil/ipfw/ip_fw2.c (revision 274240) +++ user/dchagin/lemul/sys/netpfil/ipfw/ip_fw2.c (revision 274241) @@ -1,2902 +1,2905 @@ /*- * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * The FreeBSD IP packet firewall, main file */ #include "opt_ipfw.h" #include "opt_ipdivert.h" #include "opt_inet.h" #ifndef INET #error "IPFIREWALL requires INET" #endif /* INET */ #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for ETHERTYPE_IP */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #endif #include #include /* XXX for in_cksum */ #ifdef MAC #include #endif /* * static variables followed by global ones. * All ipfw global variables are here. */ static VNET_DEFINE(int, fw_deny_unknown_exthdrs); #define V_fw_deny_unknown_exthdrs VNET(fw_deny_unknown_exthdrs) static VNET_DEFINE(int, fw_permit_single_frag6) = 1; #define V_fw_permit_single_frag6 VNET(fw_permit_single_frag6) #ifdef IPFIREWALL_DEFAULT_TO_ACCEPT static int default_to_accept = 1; #else static int default_to_accept; #endif VNET_DEFINE(int, autoinc_step); VNET_DEFINE(int, fw_one_pass) = 1; VNET_DEFINE(unsigned int, fw_tables_max); VNET_DEFINE(unsigned int, fw_tables_sets) = 0; /* Don't use set-aware tables */ /* Use 128 tables by default */ static unsigned int default_fw_tables = IPFW_TABLES_DEFAULT; #ifndef LINEAR_SKIPTO static int jump_fast(struct ip_fw_chain *chain, struct ip_fw *f, int num, int tablearg, int jump_backwards); #define JUMP(ch, f, num, targ, back) jump_fast(ch, f, num, targ, back) #else static int jump_linear(struct ip_fw_chain *chain, struct ip_fw *f, int num, int tablearg, int jump_backwards); #define JUMP(ch, f, num, targ, back) jump_linear(ch, f, num, targ, back) #endif /* * Each rule belongs to one of 32 different sets (0..31). * The variable set_disable contains one bit per set. * If the bit is set, all rules in the corresponding set * are disabled. Set RESVD_SET(31) is reserved for the default rule * and rules that are not deleted by the flush command, * and CANNOT be disabled. * Rules in set RESVD_SET can only be deleted individually. */ VNET_DEFINE(u_int32_t, set_disable); #define V_set_disable VNET(set_disable) VNET_DEFINE(int, fw_verbose); /* counter for ipfw_log(NULL...) */ VNET_DEFINE(u_int64_t, norule_counter); VNET_DEFINE(int, verbose_limit); /* layer3_chain contains the list of rules for layer 3 */ VNET_DEFINE(struct ip_fw_chain, layer3_chain); /* ipfw_vnet_ready controls when we are open for business */ VNET_DEFINE(int, ipfw_vnet_ready) = 0; VNET_DEFINE(int, ipfw_nat_ready) = 0; ipfw_nat_t *ipfw_nat_ptr = NULL; struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int); ipfw_nat_cfg_t *ipfw_nat_cfg_ptr; ipfw_nat_cfg_t *ipfw_nat_del_ptr; ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; #ifdef SYSCTL_NODE uint32_t dummy_def = IPFW_DEFAULT_RULE; static int sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS); static int sysctl_ipfw_tables_sets(SYSCTL_HANDLER_ARGS); SYSBEGIN(f3) SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); -SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, one_pass, - CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass, + CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0, "Only do a single pass through ipfw when using dummynet(4)"); -SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, - CTLFLAG_RW, &VNET_NAME(autoinc_step), 0, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(autoinc_step), 0, "Rule number auto-increment step"); -SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose, - CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, + CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0, "Log matches to ipfw rules"); -SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, - CTLFLAG_RW, &VNET_NAME(verbose_limit), 0, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(verbose_limit), 0, "Set upper limit of matches of ipfw rules logged"); SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD, &dummy_def, 0, "The default/max possible rule number."); -SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, tables_max, - CTLTYPE_UINT|CTLFLAG_RW, 0, 0, sysctl_ipfw_table_num, "IU", +SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, tables_max, + CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, 0, 0, sysctl_ipfw_table_num, "IU", "Maximum number of concurrently used tables"); -SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, tables_sets, - CTLTYPE_UINT|CTLFLAG_RW, 0, 0, sysctl_ipfw_tables_sets, "IU", +SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, tables_sets, + CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, + 0, 0, sysctl_ipfw_tables_sets, "IU", "Use per-set namespace for tables"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN, &default_to_accept, 0, "Make the default rule accept all packets."); TUNABLE_INT("net.inet.ip.fw.tables_max", (int *)&default_fw_tables); -SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, static_count, - CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, + CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0, "Number of static rules"); #ifdef INET6 SYSCTL_DECL(_net_inet6_ip6); SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); -SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs, - CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_deny_unknown_exthdrs), 0, +SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs, + CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, + &VNET_NAME(fw_deny_unknown_exthdrs), 0, "Deny packets with unknown IPv6 Extension Headers"); -SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, permit_single_frag6, - CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_permit_single_frag6), 0, +SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, permit_single_frag6, + CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, + &VNET_NAME(fw_permit_single_frag6), 0, "Permit single packet IPv6 fragments"); #endif /* INET6 */ SYSEND #endif /* SYSCTL_NODE */ /* * Some macros used in the various matching options. * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T * Other macros just cast void * into the appropriate type */ #define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl)) #define TCP(p) ((struct tcphdr *)(p)) #define SCTP(p) ((struct sctphdr *)(p)) #define UDP(p) ((struct udphdr *)(p)) #define ICMP(p) ((struct icmphdr *)(p)) #define ICMP6(p) ((struct icmp6_hdr *)(p)) static __inline int icmptype_match(struct icmphdr *icmp, ipfw_insn_u32 *cmd) { int type = icmp->icmp_type; return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1<icmp_type; return (type <= ICMP_MAXTYPE && (TT & (1<arg1 or cmd->d[0]. * * We scan options and store the bits we find set. We succeed if * * (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear * * The code is sometimes optimized not to store additional variables. */ static int flags_match(ipfw_insn *cmd, u_int8_t bits) { u_char want_clear; bits = ~bits; if ( ((cmd->arg1 & 0xff) & bits) != 0) return 0; /* some bits we want set were clear */ want_clear = (cmd->arg1 >> 8) & 0xff; if ( (want_clear & bits) != want_clear) return 0; /* some bits we want clear were set */ return 1; } static int ipopts_match(struct ip *ip, ipfw_insn *cmd) { int optlen, bits = 0; u_char *cp = (u_char *)(ip + 1); int x = (ip->ip_hl << 2) - sizeof (struct ip); for (; x > 0; x -= optlen, cp += optlen) { int opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { optlen = cp[IPOPT_OLEN]; if (optlen <= 0 || optlen > x) return 0; /* invalid or truncated */ } switch (opt) { default: break; case IPOPT_LSRR: bits |= IP_FW_IPOPT_LSRR; break; case IPOPT_SSRR: bits |= IP_FW_IPOPT_SSRR; break; case IPOPT_RR: bits |= IP_FW_IPOPT_RR; break; case IPOPT_TS: bits |= IP_FW_IPOPT_TS; break; } } return (flags_match(cmd, bits)); } static int tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd) { int optlen, bits = 0; u_char *cp = (u_char *)(tcp + 1); int x = (tcp->th_off << 2) - sizeof(struct tcphdr); for (; x > 0; x -= optlen, cp += optlen) { int opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { optlen = cp[1]; if (optlen <= 0) break; } switch (opt) { default: break; case TCPOPT_MAXSEG: bits |= IP_FW_TCPOPT_MSS; break; case TCPOPT_WINDOW: bits |= IP_FW_TCPOPT_WINDOW; break; case TCPOPT_SACK_PERMITTED: case TCPOPT_SACK: bits |= IP_FW_TCPOPT_SACK; break; case TCPOPT_TIMESTAMP: bits |= IP_FW_TCPOPT_TS; break; } } return (flags_match(cmd, bits)); } static int iface_match(struct ifnet *ifp, ipfw_insn_if *cmd, struct ip_fw_chain *chain, uint32_t *tablearg) { if (ifp == NULL) /* no iface with this packet, match fails */ return (0); /* Check by name or by IP address */ if (cmd->name[0] != '\0') { /* match by name */ if (cmd->name[0] == '\1') /* use tablearg to match */ return ipfw_lookup_table_extended(chain, cmd->p.kidx, 0, &ifp->if_index, tablearg); /* Check name */ if (cmd->p.glob) { if (fnmatch(cmd->name, ifp->if_xname, 0) == 0) return(1); } else { if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0) return(1); } } else { #if !defined(USERSPACE) && defined(__FreeBSD__) /* and OSX too ? */ struct ifaddr *ia; if_addr_rlock(ifp); TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { if (ia->ifa_addr->sa_family != AF_INET) continue; if (cmd->p.ip.s_addr == ((struct sockaddr_in *) (ia->ifa_addr))->sin_addr.s_addr) { if_addr_runlock(ifp); return(1); /* match */ } } if_addr_runlock(ifp); #endif /* __FreeBSD__ */ } return(0); /* no match, fail ... */ } /* * The verify_path function checks if a route to the src exists and * if it is reachable via ifp (when provided). * * The 'verrevpath' option checks that the interface that an IP packet * arrives on is the same interface that traffic destined for the * packet's source address would be routed out of. * The 'versrcreach' option just checks that the source address is * reachable via any route (except default) in the routing table. * These two are a measure to block forged packets. This is also * commonly known as "anti-spoofing" or Unicast Reverse Path * Forwarding (Unicast RFP) in Cisco-ese. The name of the knobs * is purposely reminiscent of the Cisco IOS command, * * ip verify unicast reverse-path * ip verify unicast source reachable-via any * * which implements the same functionality. But note that the syntax * is misleading, and the check may be performed on all IP packets * whether unicast, multicast, or broadcast. */ static int verify_path(struct in_addr src, struct ifnet *ifp, u_int fib) { #if defined(USERSPACE) || !defined(__FreeBSD__) return 0; #else struct route ro; struct sockaddr_in *dst; bzero(&ro, sizeof(ro)); dst = (struct sockaddr_in *)&(ro.ro_dst); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = src; in_rtalloc_ign(&ro, 0, fib); if (ro.ro_rt == NULL) return 0; /* * If ifp is provided, check for equality with rtentry. * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp, * in order to pass packets injected back by if_simloop(): * routing entry (via lo0) for our own address * may exist, so we need to handle routing assymetry. */ if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) { RTFREE(ro.ro_rt); return 0; } /* if no ifp provided, check if rtentry is not default route */ if (ifp == NULL && satosin(rt_key(ro.ro_rt))->sin_addr.s_addr == INADDR_ANY) { RTFREE(ro.ro_rt); return 0; } /* or if this is a blackhole/reject route */ if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { RTFREE(ro.ro_rt); return 0; } /* found valid route */ RTFREE(ro.ro_rt); return 1; #endif /* __FreeBSD__ */ } #ifdef INET6 /* * ipv6 specific rules here... */ static __inline int icmp6type_match (int type, ipfw_insn_u32 *cmd) { return (type <= ICMP6_MAXTYPE && (cmd->d[type/32] & (1<<(type%32)) ) ); } static int flow6id_match( int curr_flow, ipfw_insn_u32 *cmd ) { int i; for (i=0; i <= cmd->o.arg1; ++i ) if (curr_flow == cmd->d[i] ) return 1; return 0; } /* support for IP6_*_ME opcodes */ static int search_ip6_addr_net (struct in6_addr * ip6_addr) { struct ifnet *mdc; struct ifaddr *mdc2; struct in6_ifaddr *fdm; struct in6_addr copia; TAILQ_FOREACH(mdc, &V_ifnet, if_link) { if_addr_rlock(mdc); TAILQ_FOREACH(mdc2, &mdc->if_addrhead, ifa_link) { if (mdc2->ifa_addr->sa_family == AF_INET6) { fdm = (struct in6_ifaddr *)mdc2; copia = fdm->ia_addr.sin6_addr; /* need for leaving scope_id in the sock_addr */ in6_clearscope(&copia); if (IN6_ARE_ADDR_EQUAL(ip6_addr, &copia)) { if_addr_runlock(mdc); return 1; } } } if_addr_runlock(mdc); } return 0; } static int verify_path6(struct in6_addr *src, struct ifnet *ifp, u_int fib) { struct route_in6 ro; struct sockaddr_in6 *dst; bzero(&ro, sizeof(ro)); dst = (struct sockaddr_in6 * )&(ro.ro_dst); dst->sin6_family = AF_INET6; dst->sin6_len = sizeof(*dst); dst->sin6_addr = *src; in6_rtalloc_ign(&ro, 0, fib); if (ro.ro_rt == NULL) return 0; /* * if ifp is provided, check for equality with rtentry * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp, * to support the case of sending packets to an address of our own. * (where the former interface is the first argument of if_simloop() * (=ifp), the latter is lo0) */ if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) { RTFREE(ro.ro_rt); return 0; } /* if no ifp provided, check if rtentry is not default route */ if (ifp == NULL && IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(ro.ro_rt))->sin6_addr)) { RTFREE(ro.ro_rt); return 0; } /* or if this is a blackhole/reject route */ if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { RTFREE(ro.ro_rt); return 0; } /* found valid route */ RTFREE(ro.ro_rt); return 1; } static int is_icmp6_query(int icmp6_type) { if ((icmp6_type <= ICMP6_MAXTYPE) && (icmp6_type == ICMP6_ECHO_REQUEST || icmp6_type == ICMP6_MEMBERSHIP_QUERY || icmp6_type == ICMP6_WRUREQUEST || icmp6_type == ICMP6_FQDN_QUERY || icmp6_type == ICMP6_NI_QUERY)) return (1); return (0); } static void send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6) { struct mbuf *m; m = args->m; if (code == ICMP6_UNREACH_RST && args->f_id.proto == IPPROTO_TCP) { struct tcphdr *tcp; tcp = (struct tcphdr *)((char *)ip6 + hlen); if ((tcp->th_flags & TH_RST) == 0) { struct mbuf *m0; m0 = ipfw_send_pkt(args->m, &(args->f_id), ntohl(tcp->th_seq), ntohl(tcp->th_ack), tcp->th_flags | TH_RST); if (m0 != NULL) ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL); } FREE_PKT(m); } else if (code != ICMP6_UNREACH_RST) { /* Send an ICMPv6 unreach. */ #if 0 /* * Unlike above, the mbufs need to line up with the ip6 hdr, * as the contents are read. We need to m_adj() the * needed amount. * The mbuf will however be thrown away so we can adjust it. * Remember we did an m_pullup on it already so we * can make some assumptions about contiguousness. */ if (args->L3offset) m_adj(m, args->L3offset); #endif icmp6_error(m, ICMP6_DST_UNREACH, code, 0); } else FREE_PKT(m); args->m = NULL; } #endif /* INET6 */ /* * sends a reject message, consuming the mbuf passed as an argument. */ static void send_reject(struct ip_fw_args *args, int code, int iplen, struct ip *ip) { #if 0 /* XXX When ip is not guaranteed to be at mtod() we will * need to account for this */ * The mbuf will however be thrown away so we can adjust it. * Remember we did an m_pullup on it already so we * can make some assumptions about contiguousness. */ if (args->L3offset) m_adj(m, args->L3offset); #endif if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */ icmp_error(args->m, ICMP_UNREACH, code, 0L, 0); } else if (args->f_id.proto == IPPROTO_TCP) { struct tcphdr *const tcp = L3HDR(struct tcphdr, mtod(args->m, struct ip *)); if ( (tcp->th_flags & TH_RST) == 0) { struct mbuf *m; m = ipfw_send_pkt(args->m, &(args->f_id), ntohl(tcp->th_seq), ntohl(tcp->th_ack), tcp->th_flags | TH_RST); if (m != NULL) ip_output(m, NULL, NULL, 0, NULL, NULL); } FREE_PKT(args->m); } else FREE_PKT(args->m); args->m = NULL; } /* * Support for uid/gid/jail lookup. These tests are expensive * (because we may need to look into the list of active sockets) * so we cache the results. ugid_lookupp is 0 if we have not * yet done a lookup, 1 if we succeeded, and -1 if we tried * and failed. The function always returns the match value. * We could actually spare the variable and use *uc, setting * it to '(void *)check_uidgid if we have no info, NULL if * we tried and failed, or any other value if successful. */ static int check_uidgid(ipfw_insn_u32 *insn, struct ip_fw_args *args, int *ugid_lookupp, struct ucred **uc) { #if defined(USERSPACE) return 0; // not supported in userspace #else #ifndef __FreeBSD__ /* XXX */ return cred_check(insn, proto, oif, dst_ip, dst_port, src_ip, src_port, (struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb); #else /* FreeBSD */ struct in_addr src_ip, dst_ip; struct inpcbinfo *pi; struct ipfw_flow_id *id; struct inpcb *pcb, *inp; struct ifnet *oif; int lookupflags; int match; id = &args->f_id; inp = args->inp; oif = args->oif; /* * Check to see if the UDP or TCP stack supplied us with * the PCB. If so, rather then holding a lock and looking * up the PCB, we can use the one that was supplied. */ if (inp && *ugid_lookupp == 0) { INP_LOCK_ASSERT(inp); if (inp->inp_socket != NULL) { *uc = crhold(inp->inp_cred); *ugid_lookupp = 1; } else *ugid_lookupp = -1; } /* * If we have already been here and the packet has no * PCB entry associated with it, then we can safely * assume that this is a no match. */ if (*ugid_lookupp == -1) return (0); if (id->proto == IPPROTO_TCP) { lookupflags = 0; pi = &V_tcbinfo; } else if (id->proto == IPPROTO_UDP) { lookupflags = INPLOOKUP_WILDCARD; pi = &V_udbinfo; } else return 0; lookupflags |= INPLOOKUP_RLOCKPCB; match = 0; if (*ugid_lookupp == 0) { if (id->addr_type == 6) { #ifdef INET6 if (oif == NULL) pcb = in6_pcblookup_mbuf(pi, &id->src_ip6, htons(id->src_port), &id->dst_ip6, htons(id->dst_port), lookupflags, oif, args->m); else pcb = in6_pcblookup_mbuf(pi, &id->dst_ip6, htons(id->dst_port), &id->src_ip6, htons(id->src_port), lookupflags, oif, args->m); #else *ugid_lookupp = -1; return (0); #endif } else { src_ip.s_addr = htonl(id->src_ip); dst_ip.s_addr = htonl(id->dst_ip); if (oif == NULL) pcb = in_pcblookup_mbuf(pi, src_ip, htons(id->src_port), dst_ip, htons(id->dst_port), lookupflags, oif, args->m); else pcb = in_pcblookup_mbuf(pi, dst_ip, htons(id->dst_port), src_ip, htons(id->src_port), lookupflags, oif, args->m); } if (pcb != NULL) { INP_RLOCK_ASSERT(pcb); *uc = crhold(pcb->inp_cred); *ugid_lookupp = 1; INP_RUNLOCK(pcb); } if (*ugid_lookupp == 0) { /* * We tried and failed, set the variable to -1 * so we will not try again on this packet. */ *ugid_lookupp = -1; return (0); } } if (insn->o.opcode == O_UID) match = ((*uc)->cr_uid == (uid_t)insn->d[0]); else if (insn->o.opcode == O_GID) match = groupmember((gid_t)insn->d[0], *uc); else if (insn->o.opcode == O_JAIL) match = ((*uc)->cr_prison->pr_id == (int)insn->d[0]); return (match); #endif /* __FreeBSD__ */ #endif /* not supported in userspace */ } /* * Helper function to set args with info on the rule after the matching * one. slot is precise, whereas we guess rule_id as they are * assigned sequentially. */ static inline void set_match(struct ip_fw_args *args, int slot, struct ip_fw_chain *chain) { args->rule.chain_id = chain->id; args->rule.slot = slot + 1; /* we use 0 as a marker */ args->rule.rule_id = 1 + chain->map[slot]->id; args->rule.rulenum = chain->map[slot]->rulenum; } #ifndef LINEAR_SKIPTO /* * Helper function to enable cached rule lookups using * cached_id and cached_pos fields in ipfw rule. */ static int jump_fast(struct ip_fw_chain *chain, struct ip_fw *f, int num, int tablearg, int jump_backwards) { int f_pos; /* If possible use cached f_pos (in f->cached_pos), * whose version is written in f->cached_id * (horrible hacks to avoid changing the ABI). */ if (num != IP_FW_TARG && f->cached_id == chain->id) f_pos = f->cached_pos; else { int i = IP_FW_ARG_TABLEARG(chain, num, skipto); /* make sure we do not jump backward */ if (jump_backwards == 0 && i <= f->rulenum) i = f->rulenum + 1; if (chain->idxmap != NULL) f_pos = chain->idxmap[i]; else f_pos = ipfw_find_rule(chain, i, 0); /* update the cache */ if (num != IP_FW_TARG) { f->cached_id = chain->id; f->cached_pos = f_pos; } } return (f_pos); } #else /* * Helper function to enable real fast rule lookups. */ static int jump_linear(struct ip_fw_chain *chain, struct ip_fw *f, int num, int tablearg, int jump_backwards) { int f_pos; num = IP_FW_ARG_TABLEARG(chain, num, skipto); /* make sure we do not jump backward */ if (jump_backwards == 0 && num <= f->rulenum) num = f->rulenum + 1; f_pos = chain->idxmap[num]; return (f_pos); } #endif #define TARG(k, f) IP_FW_ARG_TABLEARG(chain, k, f) /* * The main check routine for the firewall. * * All arguments are in args so we can modify them and return them * back to the caller. * * Parameters: * * args->m (in/out) The packet; we set to NULL when/if we nuke it. * Starts with the IP header. * args->eh (in) Mac header if present, NULL for layer3 packet. * args->L3offset Number of bytes bypassed if we came from L2. * e.g. often sizeof(eh) ** NOTYET ** * args->oif Outgoing interface, NULL if packet is incoming. * The incoming interface is in the mbuf. (in) * args->divert_rule (in/out) * Skip up to the first rule past this rule number; * upon return, non-zero port number for divert or tee. * * args->rule Pointer to the last matching rule (in/out) * args->next_hop Socket we are forwarding to (out). * args->next_hop6 IPv6 next hop we are forwarding to (out). * args->f_id Addresses grabbed from the packet (out) * args->rule.info a cookie depending on rule action * * Return value: * * IP_FW_PASS the packet must be accepted * IP_FW_DENY the packet must be dropped * IP_FW_DIVERT divert packet, port in m_tag * IP_FW_TEE tee packet, port in m_tag * IP_FW_DUMMYNET to dummynet, pipe in args->cookie * IP_FW_NETGRAPH into netgraph, cookie args->cookie * args->rule contains the matching rule, * args->rule.info has additional information. * */ int ipfw_chk(struct ip_fw_args *args) { /* * Local variables holding state while processing a packet: * * IMPORTANT NOTE: to speed up the processing of rules, there * are some assumption on the values of the variables, which * are documented here. Should you change them, please check * the implementation of the various instructions to make sure * that they still work. * * args->eh The MAC header. It is non-null for a layer2 * packet, it is NULL for a layer-3 packet. * **notyet** * args->L3offset Offset in the packet to the L3 (IP or equiv.) header. * * m | args->m Pointer to the mbuf, as received from the caller. * It may change if ipfw_chk() does an m_pullup, or if it * consumes the packet because it calls send_reject(). * XXX This has to change, so that ipfw_chk() never modifies * or consumes the buffer. * ip is the beginning of the ip(4 or 6) header. * Calculated by adding the L3offset to the start of data. * (Until we start using L3offset, the packet is * supposed to start with the ip header). */ struct mbuf *m = args->m; struct ip *ip = mtod(m, struct ip *); /* * For rules which contain uid/gid or jail constraints, cache * a copy of the users credentials after the pcb lookup has been * executed. This will speed up the processing of rules with * these types of constraints, as well as decrease contention * on pcb related locks. */ #ifndef __FreeBSD__ struct bsd_ucred ucred_cache; #else struct ucred *ucred_cache = NULL; #endif int ucred_lookup = 0; /* * oif | args->oif If NULL, ipfw_chk has been called on the * inbound path (ether_input, ip_input). * If non-NULL, ipfw_chk has been called on the outbound path * (ether_output, ip_output). */ struct ifnet *oif = args->oif; int f_pos = 0; /* index of current rule in the array */ int retval = 0; /* * hlen The length of the IP header. */ u_int hlen = 0; /* hlen >0 means we have an IP pkt */ /* * offset The offset of a fragment. offset != 0 means that * we have a fragment at this offset of an IPv4 packet. * offset == 0 means that (if this is an IPv4 packet) * this is the first or only fragment. * For IPv6 offset|ip6f_mf == 0 means there is no Fragment Header * or there is a single packet fragement (fragement header added * without needed). We will treat a single packet fragment as if * there was no fragment header (or log/block depending on the * V_fw_permit_single_frag6 sysctl setting). */ u_short offset = 0; u_short ip6f_mf = 0; /* * Local copies of addresses. They are only valid if we have * an IP packet. * * proto The protocol. Set to 0 for non-ip packets, * or to the protocol read from the packet otherwise. * proto != 0 means that we have an IPv4 packet. * * src_port, dst_port port numbers, in HOST format. Only * valid for TCP and UDP packets. * * src_ip, dst_ip ip addresses, in NETWORK format. * Only valid for IPv4 packets. */ uint8_t proto; uint16_t src_port = 0, dst_port = 0; /* NOTE: host format */ struct in_addr src_ip, dst_ip; /* NOTE: network format */ uint16_t iplen=0; int pktlen; uint16_t etype = 0; /* Host order stored ether type */ /* * dyn_dir = MATCH_UNKNOWN when rules unchecked, * MATCH_NONE when checked and not matched (q = NULL), * MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL) */ int dyn_dir = MATCH_UNKNOWN; ipfw_dyn_rule *q = NULL; struct ip_fw_chain *chain = &V_layer3_chain; /* * We store in ulp a pointer to the upper layer protocol header. * In the ipv4 case this is easy to determine from the header, * but for ipv6 we might have some additional headers in the middle. * ulp is NULL if not found. */ void *ulp = NULL; /* upper layer protocol pointer. */ /* XXX ipv6 variables */ int is_ipv6 = 0; uint8_t icmp6_type = 0; uint16_t ext_hd = 0; /* bits vector for extension header filtering */ /* end of ipv6 variables */ int is_ipv4 = 0; int done = 0; /* flag to exit the outer loop */ IPFW_RLOCK_TRACKER; if (m->m_flags & M_SKIP_FIREWALL || (! V_ipfw_vnet_ready)) return (IP_FW_PASS); /* accept */ dst_ip.s_addr = 0; /* make sure it is initialized */ src_ip.s_addr = 0; /* make sure it is initialized */ pktlen = m->m_pkthdr.len; args->f_id.fib = M_GETFIB(m); /* note mbuf not altered) */ proto = args->f_id.proto = 0; /* mark f_id invalid */ /* XXX 0 is a valid proto: IP/IPv6 Hop-by-Hop Option */ /* * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous, * then it sets p to point at the offset "len" in the mbuf. WARNING: the * pointer might become stale after other pullups (but we never use it * this way). */ #define PULLUP_TO(_len, p, T) PULLUP_LEN(_len, p, sizeof(T)) #define PULLUP_LEN(_len, p, T) \ do { \ int x = (_len) + T; \ if ((m)->m_len < x) { \ args->m = m = m_pullup(m, x); \ if (m == NULL) \ goto pullup_failed; \ } \ p = (mtod(m, char *) + (_len)); \ } while (0) /* * if we have an ether header, */ if (args->eh) etype = ntohs(args->eh->ether_type); /* Identify IP packets and fill up variables. */ if (pktlen >= sizeof(struct ip6_hdr) && (args->eh == NULL || etype == ETHERTYPE_IPV6) && ip->ip_v == 6) { struct ip6_hdr *ip6 = (struct ip6_hdr *)ip; is_ipv6 = 1; args->f_id.addr_type = 6; hlen = sizeof(struct ip6_hdr); proto = ip6->ip6_nxt; /* Search extension headers to find upper layer protocols */ while (ulp == NULL && offset == 0) { switch (proto) { case IPPROTO_ICMPV6: PULLUP_TO(hlen, ulp, struct icmp6_hdr); icmp6_type = ICMP6(ulp)->icmp6_type; break; case IPPROTO_TCP: PULLUP_TO(hlen, ulp, struct tcphdr); dst_port = TCP(ulp)->th_dport; src_port = TCP(ulp)->th_sport; /* save flags for dynamic rules */ args->f_id._flags = TCP(ulp)->th_flags; break; case IPPROTO_SCTP: PULLUP_TO(hlen, ulp, struct sctphdr); src_port = SCTP(ulp)->src_port; dst_port = SCTP(ulp)->dest_port; break; case IPPROTO_UDP: PULLUP_TO(hlen, ulp, struct udphdr); dst_port = UDP(ulp)->uh_dport; src_port = UDP(ulp)->uh_sport; break; case IPPROTO_HOPOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); ext_hd |= EXT_HOPOPTS; hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_ROUTING: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_rthdr); switch (((struct ip6_rthdr *)ulp)->ip6r_type) { case 0: ext_hd |= EXT_RTHDR0; break; case 2: ext_hd |= EXT_RTHDR2; break; default: if (V_fw_verbose) printf("IPFW2: IPV6 - Unknown " "Routing Header type(%d)\n", ((struct ip6_rthdr *) ulp)->ip6r_type); if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); break; } ext_hd |= EXT_ROUTING; hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3; proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt; ulp = NULL; break; case IPPROTO_FRAGMENT: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_frag); ext_hd |= EXT_FRAGMENT; hlen += sizeof (struct ip6_frag); proto = ((struct ip6_frag *)ulp)->ip6f_nxt; offset = ((struct ip6_frag *)ulp)->ip6f_offlg & IP6F_OFF_MASK; ip6f_mf = ((struct ip6_frag *)ulp)->ip6f_offlg & IP6F_MORE_FRAG; if (V_fw_permit_single_frag6 == 0 && offset == 0 && ip6f_mf == 0) { if (V_fw_verbose) printf("IPFW2: IPV6 - Invalid " "Fragment Header\n"); if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); break; } args->f_id.extra = ntohl(((struct ip6_frag *)ulp)->ip6f_ident); ulp = NULL; break; case IPPROTO_DSTOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); ext_hd |= EXT_DSTOPTS; hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_AH: /* RFC 2402 */ PULLUP_TO(hlen, ulp, struct ip6_ext); ext_hd |= EXT_AH; hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2; proto = ((struct ip6_ext *)ulp)->ip6e_nxt; ulp = NULL; break; case IPPROTO_ESP: /* RFC 2406 */ PULLUP_TO(hlen, ulp, uint32_t); /* SPI, Seq# */ /* Anything past Seq# is variable length and * data past this ext. header is encrypted. */ ext_hd |= EXT_ESP; break; case IPPROTO_NONE: /* RFC 2460 */ /* * Packet ends here, and IPv6 header has * already been pulled up. If ip6e_len!=0 * then octets must be ignored. */ ulp = ip; /* non-NULL to get out of loop. */ break; case IPPROTO_OSPFIGP: /* XXX OSPF header check? */ PULLUP_TO(hlen, ulp, struct ip6_ext); break; case IPPROTO_PIM: /* XXX PIM header check? */ PULLUP_TO(hlen, ulp, struct pim); break; case IPPROTO_CARP: PULLUP_TO(hlen, ulp, struct carp_header); if (((struct carp_header *)ulp)->carp_version != CARP_VERSION) return (IP_FW_DENY); if (((struct carp_header *)ulp)->carp_type != CARP_ADVERTISEMENT) return (IP_FW_DENY); break; case IPPROTO_IPV6: /* RFC 2893 */ PULLUP_TO(hlen, ulp, struct ip6_hdr); break; case IPPROTO_IPV4: /* RFC 2893 */ PULLUP_TO(hlen, ulp, struct ip); break; default: if (V_fw_verbose) printf("IPFW2: IPV6 - Unknown " "Extension Header(%d), ext_hd=%x\n", proto, ext_hd); if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); PULLUP_TO(hlen, ulp, struct ip6_ext); break; } /*switch */ } ip = mtod(m, struct ip *); ip6 = (struct ip6_hdr *)ip; args->f_id.src_ip6 = ip6->ip6_src; args->f_id.dst_ip6 = ip6->ip6_dst; args->f_id.src_ip = 0; args->f_id.dst_ip = 0; args->f_id.flow_id6 = ntohl(ip6->ip6_flow); } else if (pktlen >= sizeof(struct ip) && (args->eh == NULL || etype == ETHERTYPE_IP) && ip->ip_v == 4) { is_ipv4 = 1; hlen = ip->ip_hl << 2; args->f_id.addr_type = 4; /* * Collect parameters into local variables for faster matching. */ proto = ip->ip_p; src_ip = ip->ip_src; dst_ip = ip->ip_dst; offset = ntohs(ip->ip_off) & IP_OFFMASK; iplen = ntohs(ip->ip_len); pktlen = iplen < pktlen ? iplen : pktlen; if (offset == 0) { switch (proto) { case IPPROTO_TCP: PULLUP_TO(hlen, ulp, struct tcphdr); dst_port = TCP(ulp)->th_dport; src_port = TCP(ulp)->th_sport; /* save flags for dynamic rules */ args->f_id._flags = TCP(ulp)->th_flags; break; case IPPROTO_SCTP: PULLUP_TO(hlen, ulp, struct sctphdr); src_port = SCTP(ulp)->src_port; dst_port = SCTP(ulp)->dest_port; break; case IPPROTO_UDP: PULLUP_TO(hlen, ulp, struct udphdr); dst_port = UDP(ulp)->uh_dport; src_port = UDP(ulp)->uh_sport; break; case IPPROTO_ICMP: PULLUP_TO(hlen, ulp, struct icmphdr); //args->f_id.flags = ICMP(ulp)->icmp_type; break; default: break; } } ip = mtod(m, struct ip *); args->f_id.src_ip = ntohl(src_ip.s_addr); args->f_id.dst_ip = ntohl(dst_ip.s_addr); } #undef PULLUP_TO if (proto) { /* we may have port numbers, store them */ args->f_id.proto = proto; args->f_id.src_port = src_port = ntohs(src_port); args->f_id.dst_port = dst_port = ntohs(dst_port); } IPFW_PF_RLOCK(chain); if (! V_ipfw_vnet_ready) { /* shutting down, leave NOW. */ IPFW_PF_RUNLOCK(chain); return (IP_FW_PASS); /* accept */ } if (args->rule.slot) { /* * Packet has already been tagged as a result of a previous * match on rule args->rule aka args->rule_id (PIPE, QUEUE, * REASS, NETGRAPH, DIVERT/TEE...) * Validate the slot and continue from the next one * if still present, otherwise do a lookup. */ f_pos = (args->rule.chain_id == chain->id) ? args->rule.slot : ipfw_find_rule(chain, args->rule.rulenum, args->rule.rule_id); } else { f_pos = 0; } /* * Now scan the rules, and parse microinstructions for each rule. * We have two nested loops and an inner switch. Sometimes we * need to break out of one or both loops, or re-enter one of * the loops with updated variables. Loop variables are: * * f_pos (outer loop) points to the current rule. * On output it points to the matching rule. * done (outer loop) is used as a flag to break the loop. * l (inner loop) residual length of current rule. * cmd points to the current microinstruction. * * We break the inner loop by setting l=0 and possibly * cmdlen=0 if we don't want to advance cmd. * We break the outer loop by setting done=1 * We can restart the inner loop by setting l>0 and f_pos, f, cmd * as needed. */ for (; f_pos < chain->n_rules; f_pos++) { ipfw_insn *cmd; uint32_t tablearg = 0; int l, cmdlen, skip_or; /* skip rest of OR block */ struct ip_fw *f; f = chain->map[f_pos]; if (V_set_disable & (1 << f->set) ) continue; skip_or = 0; for (l = f->cmd_len, cmd = f->cmd ; l > 0 ; l -= cmdlen, cmd += cmdlen) { int match; /* * check_body is a jump target used when we find a * CHECK_STATE, and need to jump to the body of * the target rule. */ /* check_body: */ cmdlen = F_LEN(cmd); /* * An OR block (insn_1 || .. || insn_n) has the * F_OR bit set in all but the last instruction. * The first match will set "skip_or", and cause * the following instructions to be skipped until * past the one with the F_OR bit clear. */ if (skip_or) { /* skip this instruction */ if ((cmd->len & F_OR) == 0) skip_or = 0; /* next one is good */ continue; } match = 0; /* set to 1 if we succeed */ switch (cmd->opcode) { /* * The first set of opcodes compares the packet's * fields with some pattern, setting 'match' if a * match is found. At the end of the loop there is * logic to deal with F_NOT and F_OR flags associated * with the opcode. */ case O_NOP: match = 1; break; case O_FORWARD_MAC: printf("ipfw: opcode %d unimplemented\n", cmd->opcode); break; case O_GID: case O_UID: case O_JAIL: /* * We only check offset == 0 && proto != 0, * as this ensures that we have a * packet with the ports info. */ if (offset != 0) break; if (proto == IPPROTO_TCP || proto == IPPROTO_UDP) match = check_uidgid( (ipfw_insn_u32 *)cmd, args, &ucred_lookup, #ifdef __FreeBSD__ &ucred_cache); #else (void *)&ucred_cache); #endif break; case O_RECV: match = iface_match(m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd, chain, &tablearg); break; case O_XMIT: match = iface_match(oif, (ipfw_insn_if *)cmd, chain, &tablearg); break; case O_VIA: match = iface_match(oif ? oif : m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd, chain, &tablearg); break; case O_MACADDR2: if (args->eh != NULL) { /* have MAC header */ u_int32_t *want = (u_int32_t *) ((ipfw_insn_mac *)cmd)->addr; u_int32_t *mask = (u_int32_t *) ((ipfw_insn_mac *)cmd)->mask; u_int32_t *hdr = (u_int32_t *)args->eh; match = ( want[0] == (hdr[0] & mask[0]) && want[1] == (hdr[1] & mask[1]) && want[2] == (hdr[2] & mask[2]) ); } break; case O_MAC_TYPE: if (args->eh != NULL) { u_int16_t *p = ((ipfw_insn_u16 *)cmd)->ports; int i; for (i = cmdlen - 1; !match && i>0; i--, p += 2) match = (etype >= p[0] && etype <= p[1]); } break; case O_FRAG: match = (offset != 0); break; case O_IN: /* "out" is "not in" */ match = (oif == NULL); break; case O_LAYER2: match = (args->eh != NULL); break; case O_DIVERTED: { /* For diverted packets, args->rule.info * contains the divert port (in host format) * reason and direction. */ uint32_t i = args->rule.info; match = (i&IPFW_IS_MASK) == IPFW_IS_DIVERT && cmd->arg1 & ((i & IPFW_INFO_IN) ? 1 : 2); } break; case O_PROTO: /* * We do not allow an arg of 0 so the * check of "proto" only suffices. */ match = (proto == cmd->arg1); break; case O_IP_SRC: match = is_ipv4 && (((ipfw_insn_ip *)cmd)->addr.s_addr == src_ip.s_addr); break; case O_IP_SRC_LOOKUP: case O_IP_DST_LOOKUP: if (is_ipv4) { uint32_t key = (cmd->opcode == O_IP_DST_LOOKUP) ? dst_ip.s_addr : src_ip.s_addr; uint32_t v = 0; if (cmdlen > F_INSN_SIZE(ipfw_insn_u32)) { /* generic lookup. The key must be * in 32bit big-endian format. */ v = ((ipfw_insn_u32 *)cmd)->d[1]; if (v == 0) key = dst_ip.s_addr; else if (v == 1) key = src_ip.s_addr; else if (v == 6) /* dscp */ key = (ip->ip_tos >> 2) & 0x3f; else if (offset != 0) break; else if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) break; else if (v == 2) key = dst_port; else if (v == 3) key = src_port; #ifndef USERSPACE else if (v == 4 || v == 5) { check_uidgid( (ipfw_insn_u32 *)cmd, args, &ucred_lookup, #ifdef __FreeBSD__ &ucred_cache); if (v == 4 /* O_UID */) key = ucred_cache->cr_uid; else if (v == 5 /* O_JAIL */) key = ucred_cache->cr_prison->pr_id; #else /* !__FreeBSD__ */ (void *)&ucred_cache); if (v ==4 /* O_UID */) key = ucred_cache.uid; else if (v == 5 /* O_JAIL */) key = ucred_cache.xid; #endif /* !__FreeBSD__ */ } else #endif /* !USERSPACE */ break; } match = ipfw_lookup_table(chain, cmd->arg1, key, &v); if (!match) break; if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) match = ((ipfw_insn_u32 *)cmd)->d[0] == v; else tablearg = v; } else if (is_ipv6) { uint32_t v = 0; void *pkey = (cmd->opcode == O_IP_DST_LOOKUP) ? &args->f_id.dst_ip6: &args->f_id.src_ip6; match = ipfw_lookup_table_extended(chain, cmd->arg1, sizeof(struct in6_addr), pkey, &v); if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) match = ((ipfw_insn_u32 *)cmd)->d[0] == v; if (match) tablearg = v; } break; case O_IP_FLOW_LOOKUP: { uint32_t v = 0; match = ipfw_lookup_table_extended(chain, cmd->arg1, 0, &args->f_id, &v); if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) match = ((ipfw_insn_u32 *)cmd)->d[0] == v; if (match) tablearg = v; } break; case O_IP_SRC_MASK: case O_IP_DST_MASK: if (is_ipv4) { uint32_t a = (cmd->opcode == O_IP_DST_MASK) ? dst_ip.s_addr : src_ip.s_addr; uint32_t *p = ((ipfw_insn_u32 *)cmd)->d; int i = cmdlen-1; for (; !match && i>0; i-= 2, p+= 2) match = (p[0] == (a & p[1])); } break; case O_IP_SRC_ME: if (is_ipv4) { struct ifnet *tif; INADDR_TO_IFP(src_ip, tif); match = (tif != NULL); break; } #ifdef INET6 /* FALLTHROUGH */ case O_IP6_SRC_ME: match= is_ipv6 && search_ip6_addr_net(&args->f_id.src_ip6); #endif break; case O_IP_DST_SET: case O_IP_SRC_SET: if (is_ipv4) { u_int32_t *d = (u_int32_t *)(cmd+1); u_int32_t addr = cmd->opcode == O_IP_DST_SET ? args->f_id.dst_ip : args->f_id.src_ip; if (addr < d[0]) break; addr -= d[0]; /* subtract base */ match = (addr < cmd->arg1) && ( d[ 1 + (addr>>5)] & (1<<(addr & 0x1f)) ); } break; case O_IP_DST: match = is_ipv4 && (((ipfw_insn_ip *)cmd)->addr.s_addr == dst_ip.s_addr); break; case O_IP_DST_ME: if (is_ipv4) { struct ifnet *tif; INADDR_TO_IFP(dst_ip, tif); match = (tif != NULL); break; } #ifdef INET6 /* FALLTHROUGH */ case O_IP6_DST_ME: match= is_ipv6 && search_ip6_addr_net(&args->f_id.dst_ip6); #endif break; case O_IP_SRCPORT: case O_IP_DSTPORT: /* * offset == 0 && proto != 0 is enough * to guarantee that we have a * packet with port info. */ if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP) && offset == 0) { u_int16_t x = (cmd->opcode == O_IP_SRCPORT) ? src_port : dst_port ; u_int16_t *p = ((ipfw_insn_u16 *)cmd)->ports; int i; for (i = cmdlen - 1; !match && i>0; i--, p += 2) match = (x>=p[0] && x<=p[1]); } break; case O_ICMPTYPE: match = (offset == 0 && proto==IPPROTO_ICMP && icmptype_match(ICMP(ulp), (ipfw_insn_u32 *)cmd) ); break; #ifdef INET6 case O_ICMP6TYPE: match = is_ipv6 && offset == 0 && proto==IPPROTO_ICMPV6 && icmp6type_match( ICMP6(ulp)->icmp6_type, (ipfw_insn_u32 *)cmd); break; #endif /* INET6 */ case O_IPOPT: match = (is_ipv4 && ipopts_match(ip, cmd) ); break; case O_IPVER: match = (is_ipv4 && cmd->arg1 == ip->ip_v); break; case O_IPID: case O_IPLEN: case O_IPTTL: if (is_ipv4) { /* only for IP packets */ uint16_t x; uint16_t *p; int i; if (cmd->opcode == O_IPLEN) x = iplen; else if (cmd->opcode == O_IPTTL) x = ip->ip_ttl; else /* must be IPID */ x = ntohs(ip->ip_id); if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* otherwise we have ranges */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i>0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_IPPRECEDENCE: match = (is_ipv4 && (cmd->arg1 == (ip->ip_tos & 0xe0)) ); break; case O_IPTOS: match = (is_ipv4 && flags_match(cmd, ip->ip_tos)); break; case O_DSCP: { uint32_t *p; uint16_t x; p = ((ipfw_insn_u32 *)cmd)->d; if (is_ipv4) x = ip->ip_tos >> 2; else if (is_ipv6) { uint8_t *v; v = &((struct ip6_hdr *)ip)->ip6_vfc; x = (*v & 0x0F) << 2; v++; x |= *v >> 6; } else break; /* DSCP bitmask is stored as low_u32 high_u32 */ if (x > 32) match = *(p + 1) & (1 << (x - 32)); else match = *p & (1 << x); } break; case O_TCPDATALEN: if (proto == IPPROTO_TCP && offset == 0) { struct tcphdr *tcp; uint16_t x; uint16_t *p; int i; tcp = TCP(ulp); x = iplen - ((ip->ip_hl + tcp->th_off) << 2); if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* otherwise we have ranges */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i>0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_TCPFLAGS: match = (proto == IPPROTO_TCP && offset == 0 && flags_match(cmd, TCP(ulp)->th_flags)); break; case O_TCPOPTS: if (proto == IPPROTO_TCP && offset == 0 && ulp){ PULLUP_LEN(hlen, ulp, (TCP(ulp)->th_off << 2)); match = tcpopts_match(TCP(ulp), cmd); } break; case O_TCPSEQ: match = (proto == IPPROTO_TCP && offset == 0 && ((ipfw_insn_u32 *)cmd)->d[0] == TCP(ulp)->th_seq); break; case O_TCPACK: match = (proto == IPPROTO_TCP && offset == 0 && ((ipfw_insn_u32 *)cmd)->d[0] == TCP(ulp)->th_ack); break; case O_TCPWIN: if (proto == IPPROTO_TCP && offset == 0) { uint16_t x; uint16_t *p; int i; x = ntohs(TCP(ulp)->th_win); if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* Otherwise we have ranges. */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i > 0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_ESTAB: /* reject packets which have SYN only */ /* XXX should i also check for TH_ACK ? */ match = (proto == IPPROTO_TCP && offset == 0 && (TCP(ulp)->th_flags & (TH_RST | TH_ACK | TH_SYN)) != TH_SYN); break; case O_ALTQ: { struct pf_mtag *at; struct m_tag *mtag; ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; /* * ALTQ uses mbuf tags from another * packet filtering system - pf(4). * We allocate a tag in its format * and fill it in, pretending to be pf(4). */ match = 1; at = pf_find_mtag(m); if (at != NULL && at->qid != 0) break; mtag = m_tag_get(PACKET_TAG_PF, sizeof(struct pf_mtag), M_NOWAIT | M_ZERO); if (mtag == NULL) { /* * Let the packet fall back to the * default ALTQ. */ break; } m_tag_prepend(m, mtag); at = (struct pf_mtag *)(mtag + 1); at->qid = altq->qid; at->hdr = ip; break; } case O_LOG: ipfw_log(chain, f, hlen, args, m, oif, offset | ip6f_mf, tablearg, ip); match = 1; break; case O_PROB: match = (random()<((ipfw_insn_u32 *)cmd)->d[0]); break; case O_VERREVPATH: /* Outgoing packets automatically pass/match */ match = ((oif != NULL) || (m->m_pkthdr.rcvif == NULL) || ( #ifdef INET6 is_ipv6 ? verify_path6(&(args->f_id.src_ip6), m->m_pkthdr.rcvif, args->f_id.fib) : #endif verify_path(src_ip, m->m_pkthdr.rcvif, args->f_id.fib))); break; case O_VERSRCREACH: /* Outgoing packets automatically pass/match */ match = (hlen > 0 && ((oif != NULL) || #ifdef INET6 is_ipv6 ? verify_path6(&(args->f_id.src_ip6), NULL, args->f_id.fib) : #endif verify_path(src_ip, NULL, args->f_id.fib))); break; case O_ANTISPOOF: /* Outgoing packets automatically pass/match */ if (oif == NULL && hlen > 0 && ( (is_ipv4 && in_localaddr(src_ip)) #ifdef INET6 || (is_ipv6 && in6_localaddr(&(args->f_id.src_ip6))) #endif )) match = #ifdef INET6 is_ipv6 ? verify_path6( &(args->f_id.src_ip6), m->m_pkthdr.rcvif, args->f_id.fib) : #endif verify_path(src_ip, m->m_pkthdr.rcvif, args->f_id.fib); else match = 1; break; case O_IPSEC: #ifdef IPSEC match = (m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL); #endif /* otherwise no match */ break; #ifdef INET6 case O_IP6_SRC: match = is_ipv6 && IN6_ARE_ADDR_EQUAL(&args->f_id.src_ip6, &((ipfw_insn_ip6 *)cmd)->addr6); break; case O_IP6_DST: match = is_ipv6 && IN6_ARE_ADDR_EQUAL(&args->f_id.dst_ip6, &((ipfw_insn_ip6 *)cmd)->addr6); break; case O_IP6_SRC_MASK: case O_IP6_DST_MASK: if (is_ipv6) { int i = cmdlen - 1; struct in6_addr p; struct in6_addr *d = &((ipfw_insn_ip6 *)cmd)->addr6; for (; !match && i > 0; d += 2, i -= F_INSN_SIZE(struct in6_addr) * 2) { p = (cmd->opcode == O_IP6_SRC_MASK) ? args->f_id.src_ip6: args->f_id.dst_ip6; APPLY_MASK(&p, &d[1]); match = IN6_ARE_ADDR_EQUAL(&d[0], &p); } } break; case O_FLOW6ID: match = is_ipv6 && flow6id_match(args->f_id.flow_id6, (ipfw_insn_u32 *) cmd); break; case O_EXT_HDR: match = is_ipv6 && (ext_hd & ((ipfw_insn *) cmd)->arg1); break; case O_IP6: match = is_ipv6; break; #endif case O_IP4: match = is_ipv4; break; case O_TAG: { struct m_tag *mtag; uint32_t tag = TARG(cmd->arg1, tag); /* Packet is already tagged with this tag? */ mtag = m_tag_locate(m, MTAG_IPFW, tag, NULL); /* We have `untag' action when F_NOT flag is * present. And we must remove this mtag from * mbuf and reset `match' to zero (`match' will * be inversed later). * Otherwise we should allocate new mtag and * push it into mbuf. */ if (cmd->len & F_NOT) { /* `untag' action */ if (mtag != NULL) m_tag_delete(m, mtag); match = 0; } else { if (mtag == NULL) { mtag = m_tag_alloc( MTAG_IPFW, tag, 0, M_NOWAIT); if (mtag != NULL) m_tag_prepend(m, mtag); } match = 1; } break; } case O_FIB: /* try match the specified fib */ if (args->f_id.fib == cmd->arg1) match = 1; break; case O_SOCKARG: { #ifndef USERSPACE /* not supported in userspace */ struct inpcb *inp = args->inp; struct inpcbinfo *pi; if (is_ipv6) /* XXX can we remove this ? */ break; if (proto == IPPROTO_TCP) pi = &V_tcbinfo; else if (proto == IPPROTO_UDP) pi = &V_udbinfo; else break; /* * XXXRW: so_user_cookie should almost * certainly be inp_user_cookie? */ /* For incomming packet, lookup up the inpcb using the src/dest ip/port tuple */ if (inp == NULL) { inp = in_pcblookup(pi, src_ip, htons(src_port), dst_ip, htons(dst_port), INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { tablearg = inp->inp_socket->so_user_cookie; if (tablearg) match = 1; INP_RUNLOCK(inp); } } else { if (inp->inp_socket) { tablearg = inp->inp_socket->so_user_cookie; if (tablearg) match = 1; } } #endif /* !USERSPACE */ break; } case O_TAGGED: { struct m_tag *mtag; uint32_t tag = TARG(cmd->arg1, tag); if (cmdlen == 1) { match = m_tag_locate(m, MTAG_IPFW, tag, NULL) != NULL; break; } /* we have ranges */ for (mtag = m_tag_first(m); mtag != NULL && !match; mtag = m_tag_next(m, mtag)) { uint16_t *p; int i; if (mtag->m_tag_cookie != MTAG_IPFW) continue; p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for(; !match && i > 0; i--, p += 2) match = mtag->m_tag_id >= p[0] && mtag->m_tag_id <= p[1]; } break; } /* * The second set of opcodes represents 'actions', * i.e. the terminal part of a rule once the packet * matches all previous patterns. * Typically there is only one action for each rule, * and the opcode is stored at the end of the rule * (but there are exceptions -- see below). * * In general, here we set retval and terminate the * outer loop (would be a 'break 3' in some language, * but we need to set l=0, done=1) * * Exceptions: * O_COUNT and O_SKIPTO actions: * instead of terminating, we jump to the next rule * (setting l=0), or to the SKIPTO target (setting * f/f_len, cmd and l as needed), respectively. * * O_TAG, O_LOG and O_ALTQ action parameters: * perform some action and set match = 1; * * O_LIMIT and O_KEEP_STATE: these opcodes are * not real 'actions', and are stored right * before the 'action' part of the rule. * These opcodes try to install an entry in the * state tables; if successful, we continue with * the next opcode (match=1; break;), otherwise * the packet must be dropped (set retval, * break loops with l=0, done=1) * * O_PROBE_STATE and O_CHECK_STATE: these opcodes * cause a lookup of the state table, and a jump * to the 'action' part of the parent rule * if an entry is found, or * (CHECK_STATE only) a jump to the next rule if * the entry is not found. * The result of the lookup is cached so that * further instances of these opcodes become NOPs. * The jump to the next rule is done by setting * l=0, cmdlen=0. */ case O_LIMIT: case O_KEEP_STATE: if (ipfw_install_state(chain, f, (ipfw_insn_limit *)cmd, args, tablearg)) { /* error or limit violation */ retval = IP_FW_DENY; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ } match = 1; break; case O_PROBE_STATE: case O_CHECK_STATE: /* * dynamic rules are checked at the first * keep-state or check-state occurrence, * with the result being stored in dyn_dir. * The compiler introduces a PROBE_STATE * instruction for us when we have a * KEEP_STATE (because PROBE_STATE needs * to be run first). */ if (dyn_dir == MATCH_UNKNOWN && (q = ipfw_lookup_dyn_rule(&args->f_id, &dyn_dir, proto == IPPROTO_TCP ? TCP(ulp) : NULL)) != NULL) { /* * Found dynamic entry, update stats * and jump to the 'action' part of * the parent rule by setting * f, cmd, l and clearing cmdlen. */ IPFW_INC_DYN_COUNTER(q, pktlen); /* XXX we would like to have f_pos * readily accessible in the dynamic * rule, instead of having to * lookup q->rule. */ f = q->rule; f_pos = ipfw_find_rule(chain, f->rulenum, f->id); cmd = ACTION_PTR(f); l = f->cmd_len - f->act_ofs; ipfw_dyn_unlock(q); cmdlen = 0; match = 1; break; } /* * Dynamic entry not found. If CHECK_STATE, * skip to next rule, if PROBE_STATE just * ignore and continue with next opcode. */ if (cmd->opcode == O_CHECK_STATE) l = 0; /* exit inner loop */ match = 1; break; case O_ACCEPT: retval = 0; /* accept */ l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_PIPE: case O_QUEUE: set_match(args, f_pos, chain); args->rule.info = TARG(cmd->arg1, pipe); if (cmd->opcode == O_PIPE) args->rule.info |= IPFW_IS_PIPE; if (V_fw_one_pass) args->rule.info |= IPFW_ONEPASS; retval = IP_FW_DUMMYNET; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_DIVERT: case O_TEE: if (args->eh) /* not on layer 2 */ break; /* otherwise this is terminal */ l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ retval = (cmd->opcode == O_DIVERT) ? IP_FW_DIVERT : IP_FW_TEE; set_match(args, f_pos, chain); args->rule.info = TARG(cmd->arg1, divert); break; case O_COUNT: IPFW_INC_RULE_COUNTER(f, pktlen); l = 0; /* exit inner loop */ break; case O_SKIPTO: IPFW_INC_RULE_COUNTER(f, pktlen); f_pos = JUMP(chain, f, cmd->arg1, tablearg, 0); /* * Skip disabled rules, and re-enter * the inner loop with the correct * f_pos, f, l and cmd. * Also clear cmdlen and skip_or */ for (; f_pos < chain->n_rules - 1 && (V_set_disable & (1 << chain->map[f_pos]->set)); f_pos++) ; /* Re-enter the inner loop at the skipto rule. */ f = chain->map[f_pos]; l = f->cmd_len; cmd = f->cmd; match = 1; cmdlen = 0; skip_or = 0; continue; break; /* not reached */ case O_CALLRETURN: { /* * Implementation of `subroutine' call/return, * in the stack carried in an mbuf tag. This * is different from `skipto' in that any call * address is possible (`skipto' must prevent * backward jumps to avoid endless loops). * We have `return' action when F_NOT flag is * present. The `m_tag_id' field is used as * stack pointer. */ struct m_tag *mtag; uint16_t jmpto, *stack; #define IS_CALL ((cmd->len & F_NOT) == 0) #define IS_RETURN ((cmd->len & F_NOT) != 0) /* * Hand-rolled version of m_tag_locate() with * wildcard `type'. * If not already tagged, allocate new tag. */ mtag = m_tag_first(m); while (mtag != NULL) { if (mtag->m_tag_cookie == MTAG_IPFW_CALL) break; mtag = m_tag_next(m, mtag); } if (mtag == NULL && IS_CALL) { mtag = m_tag_alloc(MTAG_IPFW_CALL, 0, IPFW_CALLSTACK_SIZE * sizeof(uint16_t), M_NOWAIT); if (mtag != NULL) m_tag_prepend(m, mtag); } /* * On error both `call' and `return' just * continue with next rule. */ if (IS_RETURN && (mtag == NULL || mtag->m_tag_id == 0)) { l = 0; /* exit inner loop */ break; } if (IS_CALL && (mtag == NULL || mtag->m_tag_id >= IPFW_CALLSTACK_SIZE)) { printf("ipfw: call stack error, " "go to next rule\n"); l = 0; /* exit inner loop */ break; } IPFW_INC_RULE_COUNTER(f, pktlen); stack = (uint16_t *)(mtag + 1); /* * The `call' action may use cached f_pos * (in f->next_rule), whose version is written * in f->next_rule. * The `return' action, however, doesn't have * fixed jump address in cmd->arg1 and can't use * cache. */ if (IS_CALL) { stack[mtag->m_tag_id] = f->rulenum; mtag->m_tag_id++; f_pos = JUMP(chain, f, cmd->arg1, tablearg, 1); } else { /* `return' action */ mtag->m_tag_id--; jmpto = stack[mtag->m_tag_id] + 1; f_pos = ipfw_find_rule(chain, jmpto, 0); } /* * Skip disabled rules, and re-enter * the inner loop with the correct * f_pos, f, l and cmd. * Also clear cmdlen and skip_or */ for (; f_pos < chain->n_rules - 1 && (V_set_disable & (1 << chain->map[f_pos]->set)); f_pos++) ; /* Re-enter the inner loop at the dest rule. */ f = chain->map[f_pos]; l = f->cmd_len; cmd = f->cmd; cmdlen = 0; skip_or = 0; continue; break; /* NOTREACHED */ } #undef IS_CALL #undef IS_RETURN case O_REJECT: /* * Drop the packet and send a reject notice * if the packet is not ICMP (or is an ICMP * query), and it is not multicast/broadcast. */ if (hlen > 0 && is_ipv4 && offset == 0 && (proto != IPPROTO_ICMP || is_icmp_query(ICMP(ulp))) && !(m->m_flags & (M_BCAST|M_MCAST)) && !IN_MULTICAST(ntohl(dst_ip.s_addr))) { send_reject(args, cmd->arg1, iplen, ip); m = args->m; } /* FALLTHROUGH */ #ifdef INET6 case O_UNREACH6: if (hlen > 0 && is_ipv6 && ((offset & IP6F_OFF_MASK) == 0) && (proto != IPPROTO_ICMPV6 || (is_icmp6_query(icmp6_type) == 1)) && !(m->m_flags & (M_BCAST|M_MCAST)) && !IN6_IS_ADDR_MULTICAST(&args->f_id.dst_ip6)) { send_reject6( args, cmd->arg1, hlen, (struct ip6_hdr *)ip); m = args->m; } /* FALLTHROUGH */ #endif case O_DENY: retval = IP_FW_DENY; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_FORWARD_IP: if (args->eh) /* not valid on layer2 pkts */ break; if (q == NULL || q->rule != f || dyn_dir == MATCH_FORWARD) { struct sockaddr_in *sa; sa = &(((ipfw_insn_sa *)cmd)->sa); if (sa->sin_addr.s_addr == INADDR_ANY) { bcopy(sa, &args->hopstore, sizeof(*sa)); args->hopstore.sin_addr.s_addr = htonl(tablearg); args->next_hop = &args->hopstore; } else { args->next_hop = sa; } } retval = IP_FW_PASS; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; #ifdef INET6 case O_FORWARD_IP6: if (args->eh) /* not valid on layer2 pkts */ break; if (q == NULL || q->rule != f || dyn_dir == MATCH_FORWARD) { struct sockaddr_in6 *sin6; sin6 = &(((ipfw_insn_sa6 *)cmd)->sa); args->next_hop6 = sin6; } retval = IP_FW_PASS; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; #endif case O_NETGRAPH: case O_NGTEE: set_match(args, f_pos, chain); args->rule.info = TARG(cmd->arg1, netgraph); if (V_fw_one_pass) args->rule.info |= IPFW_ONEPASS; retval = (cmd->opcode == O_NETGRAPH) ? IP_FW_NETGRAPH : IP_FW_NGTEE; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_SETFIB: { uint32_t fib; IPFW_INC_RULE_COUNTER(f, pktlen); fib = TARG(cmd->arg1, fib) & 0x7FFFF; if (fib >= rt_numfibs) fib = 0; M_SETFIB(m, fib); args->f_id.fib = fib; l = 0; /* exit inner loop */ break; } case O_SETDSCP: { uint16_t code; code = TARG(cmd->arg1, dscp) & 0x3F; l = 0; /* exit inner loop */ if (is_ipv4) { uint16_t a; a = ip->ip_tos; ip->ip_tos = (code << 2) | (ip->ip_tos & 0x03); a += ntohs(ip->ip_sum) - ip->ip_tos; ip->ip_sum = htons(a); } else if (is_ipv6) { uint8_t *v; v = &((struct ip6_hdr *)ip)->ip6_vfc; *v = (*v & 0xF0) | (code >> 2); v++; *v = (*v & 0x3F) | ((code & 0x03) << 6); } else break; IPFW_INC_RULE_COUNTER(f, pktlen); break; } case O_NAT: l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ if (!IPFW_NAT_LOADED) { retval = IP_FW_DENY; break; } struct cfg_nat *t; int nat_id; set_match(args, f_pos, chain); /* Check if this is 'global' nat rule */ if (cmd->arg1 == 0) { retval = ipfw_nat_ptr(args, NULL, m); break; } t = ((ipfw_insn_nat *)cmd)->nat; if (t == NULL) { nat_id = TARG(cmd->arg1, nat); t = (*lookup_nat_ptr)(&chain->nat, nat_id); if (t == NULL) { retval = IP_FW_DENY; break; } if (cmd->arg1 != IP_FW_TARG) ((ipfw_insn_nat *)cmd)->nat = t; } retval = ipfw_nat_ptr(args, t, m); break; case O_REASS: { int ip_off; IPFW_INC_RULE_COUNTER(f, pktlen); l = 0; /* in any case exit inner loop */ ip_off = ntohs(ip->ip_off); /* if not fragmented, go to next rule */ if ((ip_off & (IP_MF | IP_OFFMASK)) == 0) break; args->m = m = ip_reass(m); /* * do IP header checksum fixup. */ if (m == NULL) { /* fragment got swallowed */ retval = IP_FW_DENY; } else { /* good, packet complete */ int hlen; ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; ip->ip_sum = 0; if (hlen == sizeof(struct ip)) ip->ip_sum = in_cksum_hdr(ip); else ip->ip_sum = in_cksum(m, hlen); retval = IP_FW_REASS; set_match(args, f_pos, chain); } done = 1; /* exit outer loop */ break; } default: panic("-- unknown opcode %d\n", cmd->opcode); } /* end of switch() on opcodes */ /* * if we get here with l=0, then match is irrelevant. */ if (cmd->len & F_NOT) match = !match; if (match) { if (cmd->len & F_OR) skip_or = 1; } else { if (!(cmd->len & F_OR)) /* not an OR block, */ break; /* try next rule */ } } /* end of inner loop, scan opcodes */ #undef PULLUP_LEN if (done) break; /* next_rule:; */ /* try next rule */ } /* end of outer for, scan rules */ if (done) { struct ip_fw *rule = chain->map[f_pos]; /* Update statistics */ IPFW_INC_RULE_COUNTER(rule, pktlen); } else { retval = IP_FW_DENY; printf("ipfw: ouch!, skip past end of rules, denying packet\n"); } IPFW_PF_RUNLOCK(chain); #ifdef __FreeBSD__ if (ucred_cache != NULL) crfree(ucred_cache); #endif return (retval); pullup_failed: if (V_fw_verbose) printf("ipfw: pullup failed\n"); return (IP_FW_DENY); } /* * Set maximum number of tables that can be used in given VNET ipfw instance. */ #ifdef SYSCTL_NODE static int sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS) { int error; unsigned int ntables; ntables = V_fw_tables_max; error = sysctl_handle_int(oidp, &ntables, 0, req); /* Read operation or some error */ if ((error != 0) || (req->newptr == NULL)) return (error); return (ipfw_resize_tables(&V_layer3_chain, ntables)); } /* * Switches table namespace between global and per-set. */ static int sysctl_ipfw_tables_sets(SYSCTL_HANDLER_ARGS) { int error; unsigned int sets; sets = V_fw_tables_sets; error = sysctl_handle_int(oidp, &sets, 0, req); /* Read operation or some error */ if ((error != 0) || (req->newptr == NULL)) return (error); return (ipfw_switch_tables_namespace(&V_layer3_chain, sets)); } #endif /* * Module and VNET glue */ /* * Stuff that must be initialised only on boot or module load */ static int ipfw_init(void) { int error = 0; /* * Only print out this stuff the first time around, * when called from the sysinit code. */ printf("ipfw2 " #ifdef INET6 "(+ipv6) " #endif "initialized, divert %s, nat %s, " "default to %s, logging ", #ifdef IPDIVERT "enabled", #else "loadable", #endif #ifdef IPFIREWALL_NAT "enabled", #else "loadable", #endif default_to_accept ? "accept" : "deny"); /* * Note: V_xxx variables can be accessed here but the vnet specific * initializer may not have been called yet for the VIMAGE case. * Tuneables will have been processed. We will print out values for * the default vnet. * XXX This should all be rationalized AFTER 8.0 */ if (V_fw_verbose == 0) printf("disabled\n"); else if (V_verbose_limit == 0) printf("unlimited\n"); else printf("limited to %d packets/entry by default\n", V_verbose_limit); /* Check user-supplied table count for validness */ if (default_fw_tables > IPFW_TABLES_MAX) default_fw_tables = IPFW_TABLES_MAX; ipfw_init_sopt_handler(); ipfw_log_bpf(1); /* init */ ipfw_iface_init(); return (error); } /* * Called for the removal of the last instance only on module unload. */ static void ipfw_destroy(void) { ipfw_iface_destroy(); ipfw_log_bpf(0); /* uninit */ ipfw_destroy_sopt_handler(); printf("IP firewall unloaded\n"); } /* * Stuff that must be initialized for every instance * (including the first of course). */ static int vnet_ipfw_init(const void *unused) { int error, first; struct ip_fw *rule = NULL; struct ip_fw_chain *chain; chain = &V_layer3_chain; first = IS_DEFAULT_VNET(curvnet) ? 1 : 0; /* First set up some values that are compile time options */ V_autoinc_step = 100; /* bounded to 1..1000 in add_rule() */ V_fw_deny_unknown_exthdrs = 1; #ifdef IPFIREWALL_VERBOSE V_fw_verbose = 1; #endif #ifdef IPFIREWALL_VERBOSE_LIMIT V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT; #endif #ifdef IPFIREWALL_NAT LIST_INIT(&chain->nat); #endif ipfw_init_counters(); /* insert the default rule and create the initial map */ chain->n_rules = 1; chain->map = malloc(sizeof(struct ip_fw *), M_IPFW, M_WAITOK | M_ZERO); rule = ipfw_alloc_rule(chain, sizeof(struct ip_fw)); /* Set initial number of tables */ V_fw_tables_max = default_fw_tables; error = ipfw_init_tables(chain, first); if (error) { printf("ipfw2: setting up tables failed\n"); free(chain->map, M_IPFW); free(rule, M_IPFW); return (ENOSPC); } /* fill and insert the default rule */ rule->act_ofs = 0; rule->rulenum = IPFW_DEFAULT_RULE; rule->cmd_len = 1; rule->set = RESVD_SET; rule->cmd[0].len = 1; rule->cmd[0].opcode = default_to_accept ? O_ACCEPT : O_DENY; chain->default_rule = chain->map[0] = rule; chain->id = rule->id = 1; /* Pre-calculate rules length for legacy dump format */ chain->static_len = sizeof(struct ip_fw_rule0); IPFW_LOCK_INIT(chain); ipfw_dyn_init(chain); #ifdef LINEAR_SKIPTO ipfw_init_skipto_cache(chain); #endif /* First set up some values that are compile time options */ V_ipfw_vnet_ready = 1; /* Open for business */ /* * Hook the sockopt handler and pfil hooks for ipv4 and ipv6. * Even if the latter two fail we still keep the module alive * because the sockopt and layer2 paths are still useful. * ipfw[6]_hook return 0 on success, ENOENT on failure, * so we can ignore the exact return value and just set a flag. * * Note that V_fw[6]_enable are manipulated by a SYSCTL_PROC so * changes in the underlying (per-vnet) variables trigger * immediate hook()/unhook() calls. * In layer2 we have the same behaviour, except that V_ether_ipfw * is checked on each packet because there are no pfil hooks. */ V_ip_fw_ctl_ptr = ipfw_ctl3; error = ipfw_attach_hooks(1); return (error); } /* * Called for the removal of each instance. */ static int vnet_ipfw_uninit(const void *unused) { struct ip_fw *reap; struct ip_fw_chain *chain = &V_layer3_chain; int i, last; V_ipfw_vnet_ready = 0; /* tell new callers to go away */ /* * disconnect from ipv4, ipv6, layer2 and sockopt. * Then grab, release and grab again the WLOCK so we make * sure the update is propagated and nobody will be in. */ (void)ipfw_attach_hooks(0 /* detach */); V_ip_fw_ctl_ptr = NULL; last = IS_DEFAULT_VNET(curvnet) ? 1 : 0; IPFW_UH_WLOCK(chain); IPFW_UH_WUNLOCK(chain); IPFW_UH_WLOCK(chain); IPFW_WLOCK(chain); ipfw_dyn_uninit(0); /* run the callout_drain */ IPFW_WUNLOCK(chain); reap = NULL; IPFW_WLOCK(chain); for (i = 0; i < chain->n_rules; i++) ipfw_reap_add(chain, &reap, chain->map[i]); free(chain->map, M_IPFW); #ifdef LINEAR_SKIPTO ipfw_destroy_skipto_cache(chain); #endif IPFW_WUNLOCK(chain); IPFW_UH_WUNLOCK(chain); ipfw_destroy_tables(chain, last); if (reap != NULL) ipfw_reap_rules(reap); vnet_ipfw_iface_destroy(chain); IPFW_LOCK_DESTROY(chain); ipfw_dyn_uninit(1); /* free the remaining parts */ ipfw_destroy_counters(); return (0); } /* * Module event handler. * In general we have the choice of handling most of these events by the * event handler or by the (VNET_)SYS(UN)INIT handlers. I have chosen to * use the SYSINIT handlers as they are more capable of expressing the * flow of control during module and vnet operations, so this is just * a skeleton. Note there is no SYSINIT equivalent of the module * SHUTDOWN handler, but we don't have anything to do in that case anyhow. */ static int ipfw_modevent(module_t mod, int type, void *unused) { int err = 0; switch (type) { case MOD_LOAD: /* Called once at module load or * system boot if compiled in. */ break; case MOD_QUIESCE: /* Called before unload. May veto unloading. */ break; case MOD_UNLOAD: /* Called during unload. */ break; case MOD_SHUTDOWN: /* Called during system shutdown. */ break; default: err = EOPNOTSUPP; break; } return err; } static moduledata_t ipfwmod = { "ipfw", ipfw_modevent, 0 }; /* Define startup order. */ #define IPFW_SI_SUB_FIREWALL SI_SUB_PROTO_IFATTACHDOMAIN #define IPFW_MODEVENT_ORDER (SI_ORDER_ANY - 255) /* On boot slot in here. */ #define IPFW_MODULE_ORDER (IPFW_MODEVENT_ORDER + 1) /* A little later. */ #define IPFW_VNET_ORDER (IPFW_MODEVENT_ORDER + 2) /* Later still. */ DECLARE_MODULE(ipfw, ipfwmod, IPFW_SI_SUB_FIREWALL, IPFW_MODEVENT_ORDER); FEATURE(ipfw_ctl3, "ipfw new sockopt calls"); MODULE_VERSION(ipfw, 3); /* should declare some dependencies here */ /* * Starting up. Done in order after ipfwmod() has been called. * VNET_SYSINIT is also called for each existing vnet and each new vnet. */ SYSINIT(ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER, ipfw_init, NULL); VNET_SYSINIT(vnet_ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER, vnet_ipfw_init, NULL); /* * Closing up shop. These are done in REVERSE ORDER, but still * after ipfwmod() has been called. Not called on reboot. * VNET_SYSUNINIT is also called for each exiting vnet as it exits. * or when the module is unloaded. */ SYSUNINIT(ipfw_destroy, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER, ipfw_destroy, NULL); VNET_SYSUNINIT(vnet_ipfw_uninit, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER, vnet_ipfw_uninit, NULL); /* end of file */ Index: user/dchagin/lemul/sys/netpfil/ipfw/ip_fw_dynamic.c =================================================================== --- user/dchagin/lemul/sys/netpfil/ipfw/ip_fw_dynamic.c (revision 274240) +++ user/dchagin/lemul/sys/netpfil/ipfw/ip_fw_dynamic.c (revision 274241) @@ -1,1605 +1,1605 @@ /*- * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #define DEB(x) #define DDB(x) x /* * Dynamic rule support for ipfw */ #include "opt_ipfw.h" #include "opt_inet.h" #ifndef INET #error IPFIREWALL requires INET. #endif /* INET */ #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include /* for ETHERTYPE_IP */ #include #include #include #include #include #include /* ip_defttl */ #include #include #include #include /* IN6_ARE_ADDR_EQUAL */ #ifdef INET6 #include #include #endif #include #include /* XXX for in_cksum */ #ifdef MAC #include #endif /* * Description of dynamic rules. * * Dynamic rules are stored in lists accessed through a hash table * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can * be modified through the sysctl variable dyn_buckets which is * updated when the table becomes empty. * * XXX currently there is only one list, ipfw_dyn. * * When a packet is received, its address fields are first masked * with the mask defined for the rule, then hashed, then matched * against the entries in the corresponding list. * Dynamic rules can be used for different purposes: * + stateful rules; * + enforcing limits on the number of sessions; * + in-kernel NAT (not implemented yet) * * The lifetime of dynamic rules is regulated by dyn_*_lifetime, * measured in seconds and depending on the flags. * * The total number of dynamic rules is equal to UMA zone items count. * The max number of dynamic rules is dyn_max. When we reach * the maximum number of rules we do not create anymore. This is * done to avoid consuming too much memory, but also too much * time when searching on each packet (ideally, we should try instead * to put a limit on the length of the list on each bucket...). * * Each dynamic rule holds a pointer to the parent ipfw rule so * we know what action to perform. Dynamic rules are removed when * the parent rule is deleted. This can be changed by dyn_keep_states * sysctl. * * There are some limitations with dynamic rules -- we do not * obey the 'randomized match', and we do not do multiple * passes through the firewall. XXX check the latter!!! */ struct ipfw_dyn_bucket { struct mtx mtx; /* Bucket protecting lock */ ipfw_dyn_rule *head; /* Pointer to first rule */ }; /* * Static variables followed by global ones */ static VNET_DEFINE(struct ipfw_dyn_bucket *, ipfw_dyn_v); static VNET_DEFINE(u_int32_t, dyn_buckets_max); static VNET_DEFINE(u_int32_t, curr_dyn_buckets); static VNET_DEFINE(struct callout, ipfw_timeout); #define V_ipfw_dyn_v VNET(ipfw_dyn_v) #define V_dyn_buckets_max VNET(dyn_buckets_max) #define V_curr_dyn_buckets VNET(curr_dyn_buckets) #define V_ipfw_timeout VNET(ipfw_timeout) static VNET_DEFINE(uma_zone_t, ipfw_dyn_rule_zone); #define V_ipfw_dyn_rule_zone VNET(ipfw_dyn_rule_zone) #define IPFW_BUCK_LOCK_INIT(b) \ mtx_init(&(b)->mtx, "IPFW dynamic bucket", NULL, MTX_DEF) #define IPFW_BUCK_LOCK_DESTROY(b) \ mtx_destroy(&(b)->mtx) #define IPFW_BUCK_LOCK(i) mtx_lock(&V_ipfw_dyn_v[(i)].mtx) #define IPFW_BUCK_UNLOCK(i) mtx_unlock(&V_ipfw_dyn_v[(i)].mtx) #define IPFW_BUCK_ASSERT(i) mtx_assert(&V_ipfw_dyn_v[(i)].mtx, MA_OWNED) static VNET_DEFINE(int, dyn_keep_states); #define V_dyn_keep_states VNET(dyn_keep_states) /* * Timeouts for various events in handing dynamic rules. */ static VNET_DEFINE(u_int32_t, dyn_ack_lifetime); static VNET_DEFINE(u_int32_t, dyn_syn_lifetime); static VNET_DEFINE(u_int32_t, dyn_fin_lifetime); static VNET_DEFINE(u_int32_t, dyn_rst_lifetime); static VNET_DEFINE(u_int32_t, dyn_udp_lifetime); static VNET_DEFINE(u_int32_t, dyn_short_lifetime); #define V_dyn_ack_lifetime VNET(dyn_ack_lifetime) #define V_dyn_syn_lifetime VNET(dyn_syn_lifetime) #define V_dyn_fin_lifetime VNET(dyn_fin_lifetime) #define V_dyn_rst_lifetime VNET(dyn_rst_lifetime) #define V_dyn_udp_lifetime VNET(dyn_udp_lifetime) #define V_dyn_short_lifetime VNET(dyn_short_lifetime) /* * Keepalives are sent if dyn_keepalive is set. They are sent every * dyn_keepalive_period seconds, in the last dyn_keepalive_interval * seconds of lifetime of a rule. * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower * than dyn_keepalive_period. */ static VNET_DEFINE(u_int32_t, dyn_keepalive_interval); static VNET_DEFINE(u_int32_t, dyn_keepalive_period); static VNET_DEFINE(u_int32_t, dyn_keepalive); static VNET_DEFINE(time_t, dyn_keepalive_last); #define V_dyn_keepalive_interval VNET(dyn_keepalive_interval) #define V_dyn_keepalive_period VNET(dyn_keepalive_period) #define V_dyn_keepalive VNET(dyn_keepalive) #define V_dyn_keepalive_last VNET(dyn_keepalive_last) static VNET_DEFINE(u_int32_t, dyn_max); /* max # of dynamic rules */ #define DYN_COUNT uma_zone_get_cur(V_ipfw_dyn_rule_zone) #define V_dyn_max VNET(dyn_max) /* for userspace, we emulate the uma_zone_counter with ipfw_dyn_count */ static int ipfw_dyn_count; /* number of objects */ #ifdef USERSPACE /* emulation of UMA object counters for userspace */ #define uma_zone_get_cur(x) ipfw_dyn_count #endif /* USERSPACE */ static int last_log; /* Log ratelimiting */ static void ipfw_dyn_tick(void *vnetx); static void check_dyn_rules(struct ip_fw_chain *, ipfw_range_tlv *, int, int); #ifdef SYSCTL_NODE static int sysctl_ipfw_dyn_count(SYSCTL_HANDLER_ARGS); static int sysctl_ipfw_dyn_max(SYSCTL_HANDLER_ARGS); SYSBEGIN(f2) SYSCTL_DECL(_net_inet_ip_fw); -SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_buckets, - CTLFLAG_RW, &VNET_NAME(dyn_buckets_max), 0, +SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_buckets, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_buckets_max), 0, "Max number of dyn. buckets"); -SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, - CTLFLAG_RD, &VNET_NAME(curr_dyn_buckets), 0, +SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, + CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(curr_dyn_buckets), 0, "Current Number of dyn. buckets"); -SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, dyn_count, - CTLTYPE_UINT|CTLFLAG_RD, 0, 0, sysctl_ipfw_dyn_count, "IU", +SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_count, + CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RD, 0, 0, sysctl_ipfw_dyn_count, "IU", "Number of dyn. rules"); -SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, dyn_max, - CTLTYPE_UINT|CTLFLAG_RW, 0, 0, sysctl_ipfw_dyn_max, "IU", +SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, dyn_max, + CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, 0, 0, sysctl_ipfw_dyn_max, "IU", "Max number of dyn. rules"); -SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, - CTLFLAG_RW, &VNET_NAME(dyn_ack_lifetime), 0, +SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_ack_lifetime), 0, "Lifetime of dyn. rules for acks"); -SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, - CTLFLAG_RW, &VNET_NAME(dyn_syn_lifetime), 0, +SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_syn_lifetime), 0, "Lifetime of dyn. rules for syn"); -SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, - CTLFLAG_RW, &VNET_NAME(dyn_fin_lifetime), 0, +SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_fin_lifetime), 0, "Lifetime of dyn. rules for fin"); -SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, - CTLFLAG_RW, &VNET_NAME(dyn_rst_lifetime), 0, +SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_rst_lifetime), 0, "Lifetime of dyn. rules for rst"); -SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, - CTLFLAG_RW, &VNET_NAME(dyn_udp_lifetime), 0, +SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_udp_lifetime), 0, "Lifetime of dyn. rules for UDP"); -SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, - CTLFLAG_RW, &VNET_NAME(dyn_short_lifetime), 0, +SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_short_lifetime), 0, "Lifetime of dyn. rules for other situations"); -SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, - CTLFLAG_RW, &VNET_NAME(dyn_keepalive), 0, +SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_keepalive), 0, "Enable keepalives for dyn. rules"); -SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_keep_states, - CTLFLAG_RW, &VNET_NAME(dyn_keep_states), 0, +SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, dyn_keep_states, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dyn_keep_states), 0, "Do not flush dynamic states on rule deletion"); SYSEND #endif /* SYSCTL_NODE */ #ifdef INET6 static __inline int hash_packet6(struct ipfw_flow_id *id) { u_int32_t i; i = (id->dst_ip6.__u6_addr.__u6_addr32[2]) ^ (id->dst_ip6.__u6_addr.__u6_addr32[3]) ^ (id->src_ip6.__u6_addr.__u6_addr32[2]) ^ (id->src_ip6.__u6_addr.__u6_addr32[3]) ^ (id->dst_port) ^ (id->src_port); return i; } #endif /* * IMPORTANT: the hash function for dynamic rules must be commutative * in source and destination (ip,port), because rules are bidirectional * and we want to find both in the same bucket. */ static __inline int hash_packet(struct ipfw_flow_id *id, int buckets) { u_int32_t i; #ifdef INET6 if (IS_IP6_FLOW_ID(id)) i = hash_packet6(id); else #endif /* INET6 */ i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port); i &= (buckets - 1); return i; } /** * Print customizable flow id description via log(9) facility. */ static void print_dyn_rule_flags(struct ipfw_flow_id *id, int dyn_type, int log_flags, char *prefix, char *postfix) { struct in_addr da; #ifdef INET6 char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN]; #else char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN]; #endif #ifdef INET6 if (IS_IP6_FLOW_ID(id)) { ip6_sprintf(src, &id->src_ip6); ip6_sprintf(dst, &id->dst_ip6); } else #endif { da.s_addr = htonl(id->src_ip); inet_ntop(AF_INET, &da, src, sizeof(src)); da.s_addr = htonl(id->dst_ip); inet_ntop(AF_INET, &da, dst, sizeof(dst)); } log(log_flags, "ipfw: %s type %d %s %d -> %s %d, %d %s\n", prefix, dyn_type, src, id->src_port, dst, id->dst_port, DYN_COUNT, postfix); } #define print_dyn_rule(id, dtype, prefix, postfix) \ print_dyn_rule_flags(id, dtype, LOG_DEBUG, prefix, postfix) #define TIME_LEQ(a,b) ((int)((a)-(b)) <= 0) #define TIME_LE(a,b) ((int)((a)-(b)) < 0) /* * Lookup a dynamic rule, locked version. */ static ipfw_dyn_rule * lookup_dyn_rule_locked(struct ipfw_flow_id *pkt, int i, int *match_direction, struct tcphdr *tcp) { /* * Stateful ipfw extensions. * Lookup into dynamic session queue. */ #define MATCH_REVERSE 0 #define MATCH_FORWARD 1 #define MATCH_NONE 2 #define MATCH_UNKNOWN 3 int dir = MATCH_NONE; ipfw_dyn_rule *prev, *q = NULL; IPFW_BUCK_ASSERT(i); for (prev = NULL, q = V_ipfw_dyn_v[i].head; q; prev = q, q = q->next) { if (q->dyn_type == O_LIMIT_PARENT && q->count) continue; if (pkt->proto != q->id.proto || q->dyn_type == O_LIMIT_PARENT) continue; if (IS_IP6_FLOW_ID(pkt)) { if (IN6_ARE_ADDR_EQUAL(&pkt->src_ip6, &q->id.src_ip6) && IN6_ARE_ADDR_EQUAL(&pkt->dst_ip6, &q->id.dst_ip6) && pkt->src_port == q->id.src_port && pkt->dst_port == q->id.dst_port) { dir = MATCH_FORWARD; break; } if (IN6_ARE_ADDR_EQUAL(&pkt->src_ip6, &q->id.dst_ip6) && IN6_ARE_ADDR_EQUAL(&pkt->dst_ip6, &q->id.src_ip6) && pkt->src_port == q->id.dst_port && pkt->dst_port == q->id.src_port) { dir = MATCH_REVERSE; break; } } else { if (pkt->src_ip == q->id.src_ip && pkt->dst_ip == q->id.dst_ip && pkt->src_port == q->id.src_port && pkt->dst_port == q->id.dst_port) { dir = MATCH_FORWARD; break; } if (pkt->src_ip == q->id.dst_ip && pkt->dst_ip == q->id.src_ip && pkt->src_port == q->id.dst_port && pkt->dst_port == q->id.src_port) { dir = MATCH_REVERSE; break; } } } if (q == NULL) goto done; /* q = NULL, not found */ if (prev != NULL) { /* found and not in front */ prev->next = q->next; q->next = V_ipfw_dyn_v[i].head; V_ipfw_dyn_v[i].head = q; } if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */ uint32_t ack; u_char flags = pkt->_flags & (TH_FIN | TH_SYN | TH_RST); #define BOTH_SYN (TH_SYN | (TH_SYN << 8)) #define BOTH_FIN (TH_FIN | (TH_FIN << 8)) #define TCP_FLAGS (TH_FLAGS | (TH_FLAGS << 8)) #define ACK_FWD 0x10000 /* fwd ack seen */ #define ACK_REV 0x20000 /* rev ack seen */ q->state |= (dir == MATCH_FORWARD) ? flags : (flags << 8); switch (q->state & TCP_FLAGS) { case TH_SYN: /* opening */ q->expire = time_uptime + V_dyn_syn_lifetime; break; case BOTH_SYN: /* move to established */ case BOTH_SYN | TH_FIN: /* one side tries to close */ case BOTH_SYN | (TH_FIN << 8): #define _SEQ_GE(a,b) ((int)(a) - (int)(b) >= 0) if (tcp == NULL) break; ack = ntohl(tcp->th_ack); if (dir == MATCH_FORWARD) { if (q->ack_fwd == 0 || _SEQ_GE(ack, q->ack_fwd)) { q->ack_fwd = ack; q->state |= ACK_FWD; } } else { if (q->ack_rev == 0 || _SEQ_GE(ack, q->ack_rev)) { q->ack_rev = ack; q->state |= ACK_REV; } } if ((q->state & (ACK_FWD | ACK_REV)) == (ACK_FWD | ACK_REV)) { q->expire = time_uptime + V_dyn_ack_lifetime; q->state &= ~(ACK_FWD | ACK_REV); } break; case BOTH_SYN | BOTH_FIN: /* both sides closed */ if (V_dyn_fin_lifetime >= V_dyn_keepalive_period) V_dyn_fin_lifetime = V_dyn_keepalive_period - 1; q->expire = time_uptime + V_dyn_fin_lifetime; break; default: #if 0 /* * reset or some invalid combination, but can also * occur if we use keep-state the wrong way. */ if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0) printf("invalid state: 0x%x\n", q->state); #endif if (V_dyn_rst_lifetime >= V_dyn_keepalive_period) V_dyn_rst_lifetime = V_dyn_keepalive_period - 1; q->expire = time_uptime + V_dyn_rst_lifetime; break; } } else if (pkt->proto == IPPROTO_UDP) { q->expire = time_uptime + V_dyn_udp_lifetime; } else { /* other protocols */ q->expire = time_uptime + V_dyn_short_lifetime; } done: if (match_direction != NULL) *match_direction = dir; return (q); } ipfw_dyn_rule * ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction, struct tcphdr *tcp) { ipfw_dyn_rule *q; int i; i = hash_packet(pkt, V_curr_dyn_buckets); IPFW_BUCK_LOCK(i); q = lookup_dyn_rule_locked(pkt, i, match_direction, tcp); if (q == NULL) IPFW_BUCK_UNLOCK(i); /* NB: return table locked when q is not NULL */ return q; } /* * Unlock bucket mtx * @p - pointer to dynamic rule */ void ipfw_dyn_unlock(ipfw_dyn_rule *q) { IPFW_BUCK_UNLOCK(q->bucket); } static int resize_dynamic_table(struct ip_fw_chain *chain, int nbuckets) { int i, k, nbuckets_old; ipfw_dyn_rule *q; struct ipfw_dyn_bucket *dyn_v, *dyn_v_old; /* Check if given number is power of 2 and less than 64k */ if ((nbuckets > 65536) || (!powerof2(nbuckets))) return 1; CTR3(KTR_NET, "%s: resize dynamic hash: %d -> %d", __func__, V_curr_dyn_buckets, nbuckets); /* Allocate and initialize new hash */ dyn_v = malloc(nbuckets * sizeof(ipfw_dyn_rule), M_IPFW, M_WAITOK | M_ZERO); for (i = 0 ; i < nbuckets; i++) IPFW_BUCK_LOCK_INIT(&dyn_v[i]); /* * Call upper half lock, as get_map() do to ease * read-only access to dynamic rules hash from sysctl */ IPFW_UH_WLOCK(chain); /* * Acquire chain write lock to permit hash access * for main traffic path without additional locks */ IPFW_WLOCK(chain); /* Save old values */ nbuckets_old = V_curr_dyn_buckets; dyn_v_old = V_ipfw_dyn_v; /* Skip relinking if array is not set up */ if (V_ipfw_dyn_v == NULL) V_curr_dyn_buckets = 0; /* Re-link all dynamic states */ for (i = 0 ; i < V_curr_dyn_buckets ; i++) { while (V_ipfw_dyn_v[i].head != NULL) { /* Remove from current chain */ q = V_ipfw_dyn_v[i].head; V_ipfw_dyn_v[i].head = q->next; /* Get new hash value */ k = hash_packet(&q->id, nbuckets); q->bucket = k; /* Add to the new head */ q->next = dyn_v[k].head; dyn_v[k].head = q; } } /* Update current pointers/buckets values */ V_curr_dyn_buckets = nbuckets; V_ipfw_dyn_v = dyn_v; IPFW_WUNLOCK(chain); IPFW_UH_WUNLOCK(chain); /* Start periodic callout on initial creation */ if (dyn_v_old == NULL) { callout_reset_on(&V_ipfw_timeout, hz, ipfw_dyn_tick, curvnet, 0); return (0); } /* Destroy all mutexes */ for (i = 0 ; i < nbuckets_old ; i++) IPFW_BUCK_LOCK_DESTROY(&dyn_v_old[i]); /* Free old hash */ free(dyn_v_old, M_IPFW); return 0; } /** * Install state of type 'type' for a dynamic session. * The hash table contains two type of rules: * - regular rules (O_KEEP_STATE) * - rules for sessions with limited number of sess per user * (O_LIMIT). When they are created, the parent is * increased by 1, and decreased on delete. In this case, * the third parameter is the parent rule and not the chain. * - "parent" rules for the above (O_LIMIT_PARENT). */ static ipfw_dyn_rule * add_dyn_rule(struct ipfw_flow_id *id, int i, u_int8_t dyn_type, struct ip_fw *rule) { ipfw_dyn_rule *r; IPFW_BUCK_ASSERT(i); r = uma_zalloc(V_ipfw_dyn_rule_zone, M_NOWAIT | M_ZERO); if (r == NULL) { if (last_log != time_uptime) { last_log = time_uptime; log(LOG_DEBUG, "ipfw: Cannot allocate dynamic state, " "consider increasing net.inet.ip.fw.dyn_max\n"); } return NULL; } ipfw_dyn_count++; /* * refcount on parent is already incremented, so * it is safe to use parent unlocked. */ if (dyn_type == O_LIMIT) { ipfw_dyn_rule *parent = (ipfw_dyn_rule *)rule; if ( parent->dyn_type != O_LIMIT_PARENT) panic("invalid parent"); r->parent = parent; rule = parent->rule; } r->id = *id; r->expire = time_uptime + V_dyn_syn_lifetime; r->rule = rule; r->dyn_type = dyn_type; IPFW_ZERO_DYN_COUNTER(r); r->count = 0; r->bucket = i; r->next = V_ipfw_dyn_v[i].head; V_ipfw_dyn_v[i].head = r; DEB(print_dyn_rule(id, dyn_type, "add dyn entry", "total");) return r; } /** * lookup dynamic parent rule using pkt and rule as search keys. * If the lookup fails, then install one. */ static ipfw_dyn_rule * lookup_dyn_parent(struct ipfw_flow_id *pkt, int *pindex, struct ip_fw *rule) { ipfw_dyn_rule *q; int i, is_v6; is_v6 = IS_IP6_FLOW_ID(pkt); i = hash_packet( pkt, V_curr_dyn_buckets ); *pindex = i; IPFW_BUCK_LOCK(i); for (q = V_ipfw_dyn_v[i].head ; q != NULL ; q=q->next) if (q->dyn_type == O_LIMIT_PARENT && rule== q->rule && pkt->proto == q->id.proto && pkt->src_port == q->id.src_port && pkt->dst_port == q->id.dst_port && ( (is_v6 && IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6), &(q->id.src_ip6)) && IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6), &(q->id.dst_ip6))) || (!is_v6 && pkt->src_ip == q->id.src_ip && pkt->dst_ip == q->id.dst_ip) ) ) { q->expire = time_uptime + V_dyn_short_lifetime; DEB(print_dyn_rule(pkt, q->dyn_type, "lookup_dyn_parent found", "");) return q; } /* Add virtual limiting rule */ return add_dyn_rule(pkt, i, O_LIMIT_PARENT, rule); } /** * Install dynamic state for rule type cmd->o.opcode * * Returns 1 (failure) if state is not installed because of errors or because * session limitations are enforced. */ int ipfw_install_state(struct ip_fw_chain *chain, struct ip_fw *rule, ipfw_insn_limit *cmd, struct ip_fw_args *args, uint32_t tablearg) { ipfw_dyn_rule *q; int i; DEB(print_dyn_rule(&args->f_id, cmd->o.opcode, "install_state", "");) i = hash_packet(&args->f_id, V_curr_dyn_buckets); IPFW_BUCK_LOCK(i); q = lookup_dyn_rule_locked(&args->f_id, i, NULL, NULL); if (q != NULL) { /* should never occur */ DEB( if (last_log != time_uptime) { last_log = time_uptime; printf("ipfw: %s: entry already present, done\n", __func__); }) IPFW_BUCK_UNLOCK(i); return (0); } /* * State limiting is done via uma(9) zone limiting. * Save pointer to newly-installed rule and reject * packet if add_dyn_rule() returned NULL. * Note q is currently set to NULL. */ switch (cmd->o.opcode) { case O_KEEP_STATE: /* bidir rule */ q = add_dyn_rule(&args->f_id, i, O_KEEP_STATE, rule); break; case O_LIMIT: { /* limit number of sessions */ struct ipfw_flow_id id; ipfw_dyn_rule *parent; uint32_t conn_limit; uint16_t limit_mask = cmd->limit_mask; int pindex; conn_limit = IP_FW_ARG_TABLEARG(chain, cmd->conn_limit, limit); DEB( if (cmd->conn_limit == IP_FW_TARG) printf("ipfw: %s: O_LIMIT rule, conn_limit: %u " "(tablearg)\n", __func__, conn_limit); else printf("ipfw: %s: O_LIMIT rule, conn_limit: %u\n", __func__, conn_limit); ) id.dst_ip = id.src_ip = id.dst_port = id.src_port = 0; id.proto = args->f_id.proto; id.addr_type = args->f_id.addr_type; id.fib = M_GETFIB(args->m); if (IS_IP6_FLOW_ID (&(args->f_id))) { if (limit_mask & DYN_SRC_ADDR) id.src_ip6 = args->f_id.src_ip6; if (limit_mask & DYN_DST_ADDR) id.dst_ip6 = args->f_id.dst_ip6; } else { if (limit_mask & DYN_SRC_ADDR) id.src_ip = args->f_id.src_ip; if (limit_mask & DYN_DST_ADDR) id.dst_ip = args->f_id.dst_ip; } if (limit_mask & DYN_SRC_PORT) id.src_port = args->f_id.src_port; if (limit_mask & DYN_DST_PORT) id.dst_port = args->f_id.dst_port; /* * We have to release lock for previous bucket to * avoid possible deadlock */ IPFW_BUCK_UNLOCK(i); if ((parent = lookup_dyn_parent(&id, &pindex, rule)) == NULL) { printf("ipfw: %s: add parent failed\n", __func__); IPFW_BUCK_UNLOCK(pindex); return (1); } if (parent->count >= conn_limit) { if (V_fw_verbose && last_log != time_uptime) { last_log = time_uptime; char sbuf[24]; last_log = time_uptime; snprintf(sbuf, sizeof(sbuf), "%d drop session", parent->rule->rulenum); print_dyn_rule_flags(&args->f_id, cmd->o.opcode, LOG_SECURITY | LOG_DEBUG, sbuf, "too many entries"); } IPFW_BUCK_UNLOCK(pindex); return (1); } /* Increment counter on parent */ parent->count++; IPFW_BUCK_UNLOCK(pindex); IPFW_BUCK_LOCK(i); q = add_dyn_rule(&args->f_id, i, O_LIMIT, (struct ip_fw *)parent); if (q == NULL) { /* Decrement index and notify caller */ IPFW_BUCK_UNLOCK(i); IPFW_BUCK_LOCK(pindex); parent->count--; IPFW_BUCK_UNLOCK(pindex); return (1); } break; } default: printf("ipfw: %s: unknown dynamic rule type %u\n", __func__, cmd->o.opcode); } if (q == NULL) { IPFW_BUCK_UNLOCK(i); return (1); /* Notify caller about failure */ } /* XXX just set lifetime */ lookup_dyn_rule_locked(&args->f_id, i, NULL, NULL); IPFW_BUCK_UNLOCK(i); return (0); } /* * Generate a TCP packet, containing either a RST or a keepalive. * When flags & TH_RST, we are sending a RST packet, because of a * "reset" action matched the packet. * Otherwise we are sending a keepalive, and flags & TH_ * The 'replyto' mbuf is the mbuf being replied to, if any, and is required * so that MAC can label the reply appropriately. */ struct mbuf * ipfw_send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq, u_int32_t ack, int flags) { struct mbuf *m = NULL; /* stupid compiler */ int len, dir; struct ip *h = NULL; /* stupid compiler */ #ifdef INET6 struct ip6_hdr *h6 = NULL; #endif struct tcphdr *th = NULL; MGETHDR(m, M_NOWAIT, MT_DATA); if (m == NULL) return (NULL); M_SETFIB(m, id->fib); #ifdef MAC if (replyto != NULL) mac_netinet_firewall_reply(replyto, m); else mac_netinet_firewall_send(m); #else (void)replyto; /* don't warn about unused arg */ #endif switch (id->addr_type) { case 4: len = sizeof(struct ip) + sizeof(struct tcphdr); break; #ifdef INET6 case 6: len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); break; #endif default: /* XXX: log me?!? */ FREE_PKT(m); return (NULL); } dir = ((flags & (TH_SYN | TH_RST)) == TH_SYN); m->m_data += max_linkhdr; m->m_flags |= M_SKIP_FIREWALL; m->m_pkthdr.len = m->m_len = len; m->m_pkthdr.rcvif = NULL; bzero(m->m_data, len); switch (id->addr_type) { case 4: h = mtod(m, struct ip *); /* prepare for checksum */ h->ip_p = IPPROTO_TCP; h->ip_len = htons(sizeof(struct tcphdr)); if (dir) { h->ip_src.s_addr = htonl(id->src_ip); h->ip_dst.s_addr = htonl(id->dst_ip); } else { h->ip_src.s_addr = htonl(id->dst_ip); h->ip_dst.s_addr = htonl(id->src_ip); } th = (struct tcphdr *)(h + 1); break; #ifdef INET6 case 6: h6 = mtod(m, struct ip6_hdr *); /* prepare for checksum */ h6->ip6_nxt = IPPROTO_TCP; h6->ip6_plen = htons(sizeof(struct tcphdr)); if (dir) { h6->ip6_src = id->src_ip6; h6->ip6_dst = id->dst_ip6; } else { h6->ip6_src = id->dst_ip6; h6->ip6_dst = id->src_ip6; } th = (struct tcphdr *)(h6 + 1); break; #endif } if (dir) { th->th_sport = htons(id->src_port); th->th_dport = htons(id->dst_port); } else { th->th_sport = htons(id->dst_port); th->th_dport = htons(id->src_port); } th->th_off = sizeof(struct tcphdr) >> 2; if (flags & TH_RST) { if (flags & TH_ACK) { th->th_seq = htonl(ack); th->th_flags = TH_RST; } else { if (flags & TH_SYN) seq++; th->th_ack = htonl(seq); th->th_flags = TH_RST | TH_ACK; } } else { /* * Keepalive - use caller provided sequence numbers */ th->th_seq = htonl(seq); th->th_ack = htonl(ack); th->th_flags = TH_ACK; } switch (id->addr_type) { case 4: th->th_sum = in_cksum(m, len); /* finish the ip header */ h->ip_v = 4; h->ip_hl = sizeof(*h) >> 2; h->ip_tos = IPTOS_LOWDELAY; h->ip_off = htons(0); h->ip_len = htons(len); h->ip_ttl = V_ip_defttl; h->ip_sum = 0; break; #ifdef INET6 case 6: th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(*h6), sizeof(struct tcphdr)); /* finish the ip6 header */ h6->ip6_vfc |= IPV6_VERSION; h6->ip6_hlim = IPV6_DEFHLIM; break; #endif } return (m); } /* * Queue keepalive packets for given dynamic rule */ static struct mbuf ** ipfw_dyn_send_ka(struct mbuf **mtailp, ipfw_dyn_rule *q) { struct mbuf *m_rev, *m_fwd; m_rev = (q->state & ACK_REV) ? NULL : ipfw_send_pkt(NULL, &(q->id), q->ack_rev - 1, q->ack_fwd, TH_SYN); m_fwd = (q->state & ACK_FWD) ? NULL : ipfw_send_pkt(NULL, &(q->id), q->ack_fwd - 1, q->ack_rev, 0); if (m_rev != NULL) { *mtailp = m_rev; mtailp = &(*mtailp)->m_nextpkt; } if (m_fwd != NULL) { *mtailp = m_fwd; mtailp = &(*mtailp)->m_nextpkt; } return (mtailp); } /* * This procedure is used to perform various maintance * on dynamic hash list. Currently it is called every second. */ static void ipfw_dyn_tick(void * vnetx) { struct ip_fw_chain *chain; int check_ka = 0; #ifdef VIMAGE struct vnet *vp = vnetx; #endif CURVNET_SET(vp); chain = &V_layer3_chain; /* Run keepalive checks every keepalive_period iff ka is enabled */ if ((V_dyn_keepalive_last + V_dyn_keepalive_period <= time_uptime) && (V_dyn_keepalive != 0)) { V_dyn_keepalive_last = time_uptime; check_ka = 1; } check_dyn_rules(chain, NULL, check_ka, 1); callout_reset_on(&V_ipfw_timeout, hz, ipfw_dyn_tick, vnetx, 0); CURVNET_RESTORE(); } /* * Walk thru all dynamic states doing generic maintance: * 1) free expired states * 2) free all states based on deleted rule / set * 3) send keepalives for states if needed * * @chain - pointer to current ipfw rules chain * @rule - delete all states originated by given rule if != NULL * @set - delete all states originated by any rule in set @set if != RESVD_SET * @check_ka - perform checking/sending keepalives * @timer - indicate call from timer routine. * * Timer routine must call this function unlocked to permit * sending keepalives/resizing table. * * Others has to call function with IPFW_UH_WLOCK held. * Additionally, function assume that dynamic rule/set is * ALREADY deleted so no new states can be generated by * 'deleted' rules. * * Write lock is needed to ensure that unused parent rules * are not freed by other instance (see stage 2, 3) */ static void check_dyn_rules(struct ip_fw_chain *chain, ipfw_range_tlv *rt, int check_ka, int timer) { struct mbuf *m0, *m, *mnext, **mtailp; struct ip *h; int i, dyn_count, new_buckets = 0, max_buckets; int expired = 0, expired_limits = 0, parents = 0, total = 0; ipfw_dyn_rule *q, *q_prev, *q_next; ipfw_dyn_rule *exp_head, **exptailp; ipfw_dyn_rule *exp_lhead, **expltailp; KASSERT(V_ipfw_dyn_v != NULL, ("%s: dynamic table not allocated", __func__)); /* Avoid possible LOR */ KASSERT(!check_ka || timer, ("%s: keepalive check with lock held", __func__)); /* * Do not perform any checks if we currently have no dynamic states */ if (DYN_COUNT == 0) return; /* Expired states */ exp_head = NULL; exptailp = &exp_head; /* Expired limit states */ exp_lhead = NULL; expltailp = &exp_lhead; /* * We make a chain of packets to go out here -- not deferring * until after we drop the IPFW dynamic rule lock would result * in a lock order reversal with the normal packet input -> ipfw * call stack. */ m0 = NULL; mtailp = &m0; /* Protect from hash resizing */ if (timer != 0) IPFW_UH_WLOCK(chain); else IPFW_UH_WLOCK_ASSERT(chain); #define NEXT_RULE() { q_prev = q; q = q->next ; continue; } /* Stage 1: perform requested deletion */ for (i = 0 ; i < V_curr_dyn_buckets ; i++) { IPFW_BUCK_LOCK(i); for (q = V_ipfw_dyn_v[i].head, q_prev = q; q ; ) { /* account every rule */ total++; /* Skip parent rules at all */ if (q->dyn_type == O_LIMIT_PARENT) { parents++; NEXT_RULE(); } /* * Remove rules which are: * 1) expired * 2) matches deletion range */ if ((TIME_LEQ(q->expire, time_uptime)) || (rt != NULL && ipfw_match_range(q->rule, rt))) { if (TIME_LE(time_uptime, q->expire) && q->dyn_type == O_KEEP_STATE && V_dyn_keep_states != 0) { /* * Do not delete state if * it is not expired and * dyn_keep_states is ON. * However we need to re-link it * to any other stable rule */ q->rule = chain->default_rule; NEXT_RULE(); } /* Unlink q from current list */ q_next = q->next; if (q == V_ipfw_dyn_v[i].head) V_ipfw_dyn_v[i].head = q_next; else q_prev->next = q_next; q->next = NULL; /* queue q to expire list */ if (q->dyn_type != O_LIMIT) { *exptailp = q; exptailp = &(*exptailp)->next; DEB(print_dyn_rule(&q->id, q->dyn_type, "unlink entry", "left"); ) } else { /* Separate list for limit rules */ *expltailp = q; expltailp = &(*expltailp)->next; expired_limits++; DEB(print_dyn_rule(&q->id, q->dyn_type, "unlink limit entry", "left"); ) } q = q_next; expired++; continue; } /* * Check if we need to send keepalive: * we need to ensure if is time to do KA, * this is established TCP session, and * expire time is within keepalive interval */ if ((check_ka != 0) && (q->id.proto == IPPROTO_TCP) && ((q->state & BOTH_SYN) == BOTH_SYN) && (TIME_LEQ(q->expire, time_uptime + V_dyn_keepalive_interval))) mtailp = ipfw_dyn_send_ka(mtailp, q); NEXT_RULE(); } IPFW_BUCK_UNLOCK(i); } /* Stage 2: decrement counters from O_LIMIT parents */ if (expired_limits != 0) { /* * XXX: Note that deleting set with more than one * heavily-used LIMIT rules can result in overwhelming * locking due to lack of per-hash value sorting * * We should probably think about: * 1) pre-allocating hash of size, say, * MAX(16, V_curr_dyn_buckets / 1024) * 2) checking if expired_limits is large enough * 3) If yes, init hash (or its part), re-link * current list and start decrementing procedure in * each bucket separately */ /* * Small optimization: do not unlock bucket until * we see the next item resides in different bucket */ if (exp_lhead != NULL) { i = exp_lhead->parent->bucket; IPFW_BUCK_LOCK(i); } for (q = exp_lhead; q != NULL; q = q->next) { if (i != q->parent->bucket) { IPFW_BUCK_UNLOCK(i); i = q->parent->bucket; IPFW_BUCK_LOCK(i); } /* Decrease parent refcount */ q->parent->count--; } if (exp_lhead != NULL) IPFW_BUCK_UNLOCK(i); } /* * We protectet ourselves from unused parent deletion * (from the timer function) by holding UH write lock. */ /* Stage 3: remove unused parent rules */ if ((parents != 0) && (expired != 0)) { for (i = 0 ; i < V_curr_dyn_buckets ; i++) { IPFW_BUCK_LOCK(i); for (q = V_ipfw_dyn_v[i].head, q_prev = q ; q ; ) { if (q->dyn_type != O_LIMIT_PARENT) NEXT_RULE(); if (q->count != 0) NEXT_RULE(); /* Parent rule without consumers */ /* Unlink q from current list */ q_next = q->next; if (q == V_ipfw_dyn_v[i].head) V_ipfw_dyn_v[i].head = q_next; else q_prev->next = q_next; q->next = NULL; /* Add to expired list */ *exptailp = q; exptailp = &(*exptailp)->next; DEB(print_dyn_rule(&q->id, q->dyn_type, "unlink parent entry", "left"); ) expired++; q = q_next; } IPFW_BUCK_UNLOCK(i); } } #undef NEXT_RULE if (timer != 0) { /* * Check if we need to resize hash: * if current number of states exceeds number of buckes in hash, * grow hash size to the minimum power of 2 which is bigger than * current states count. Limit hash size by 64k. */ max_buckets = (V_dyn_buckets_max > 65536) ? 65536 : V_dyn_buckets_max; dyn_count = DYN_COUNT; if ((dyn_count > V_curr_dyn_buckets * 2) && (dyn_count < max_buckets)) { new_buckets = V_curr_dyn_buckets; while (new_buckets < dyn_count) { new_buckets *= 2; if (new_buckets >= max_buckets) break; } } IPFW_UH_WUNLOCK(chain); } /* Finally delete old states ad limits if any */ for (q = exp_head; q != NULL; q = q_next) { q_next = q->next; uma_zfree(V_ipfw_dyn_rule_zone, q); ipfw_dyn_count--; } for (q = exp_lhead; q != NULL; q = q_next) { q_next = q->next; uma_zfree(V_ipfw_dyn_rule_zone, q); ipfw_dyn_count--; } /* * The rest code MUST be called from timer routine only * without holding any locks */ if (timer == 0) return; /* Send keepalive packets if any */ for (m = m0; m != NULL; m = mnext) { mnext = m->m_nextpkt; m->m_nextpkt = NULL; h = mtod(m, struct ip *); if (h->ip_v == 4) ip_output(m, NULL, NULL, 0, NULL, NULL); #ifdef INET6 else ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); #endif } /* Run table resize without holding any locks */ if (new_buckets != 0) resize_dynamic_table(chain, new_buckets); } /* * Deletes all dynamic rules originated by given rule or all rules in * given set. Specify RESVD_SET to indicate set should not be used. * @chain - pointer to current ipfw rules chain * @rr - delete all states originated by rules in matched range. * * Function has to be called with IPFW_UH_WLOCK held. * Additionally, function assume that dynamic rule/set is * ALREADY deleted so no new states can be generated by * 'deleted' rules. */ void ipfw_expire_dyn_rules(struct ip_fw_chain *chain, ipfw_range_tlv *rt) { check_dyn_rules(chain, rt, 0, 0); } /* * Check if rule contains at least one dynamic opcode. * * Returns 1 if such opcode is found, 0 otherwise. */ int ipfw_is_dyn_rule(struct ip_fw *rule) { int cmdlen, l; ipfw_insn *cmd; l = rule->cmd_len; cmd = rule->cmd; cmdlen = 0; for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); switch (cmd->opcode) { case O_LIMIT: case O_KEEP_STATE: case O_PROBE_STATE: case O_CHECK_STATE: return (1); } } return (0); } void ipfw_dyn_init(struct ip_fw_chain *chain) { V_ipfw_dyn_v = NULL; V_dyn_buckets_max = 256; /* must be power of 2 */ V_curr_dyn_buckets = 256; /* must be power of 2 */ V_dyn_ack_lifetime = 300; V_dyn_syn_lifetime = 20; V_dyn_fin_lifetime = 1; V_dyn_rst_lifetime = 1; V_dyn_udp_lifetime = 10; V_dyn_short_lifetime = 5; V_dyn_keepalive_interval = 20; V_dyn_keepalive_period = 5; V_dyn_keepalive = 1; /* do send keepalives */ V_dyn_keepalive_last = time_uptime; V_dyn_max = 16384; /* max # of dynamic rules */ V_ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule", sizeof(ipfw_dyn_rule), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); /* Enforce limit on dynamic rules */ uma_zone_set_max(V_ipfw_dyn_rule_zone, V_dyn_max); callout_init(&V_ipfw_timeout, CALLOUT_MPSAFE); /* * This can potentially be done on first dynamic rule * being added to chain. */ resize_dynamic_table(chain, V_curr_dyn_buckets); } void ipfw_dyn_uninit(int pass) { int i; if (pass == 0) { callout_drain(&V_ipfw_timeout); return; } if (V_ipfw_dyn_v != NULL) { /* * Skip deleting all dynamic states - * uma_zdestroy() does this more efficiently; */ /* Destroy all mutexes */ for (i = 0 ; i < V_curr_dyn_buckets ; i++) IPFW_BUCK_LOCK_DESTROY(&V_ipfw_dyn_v[i]); free(V_ipfw_dyn_v, M_IPFW); V_ipfw_dyn_v = NULL; } uma_zdestroy(V_ipfw_dyn_rule_zone); } #ifdef SYSCTL_NODE /* * Get/set maximum number of dynamic states in given VNET instance. */ static int sysctl_ipfw_dyn_max(SYSCTL_HANDLER_ARGS) { int error; unsigned int nstates; nstates = V_dyn_max; error = sysctl_handle_int(oidp, &nstates, 0, req); /* Read operation or some error */ if ((error != 0) || (req->newptr == NULL)) return (error); V_dyn_max = nstates; uma_zone_set_max(V_ipfw_dyn_rule_zone, V_dyn_max); return (0); } /* * Get current number of dynamic states in given VNET instance. */ static int sysctl_ipfw_dyn_count(SYSCTL_HANDLER_ARGS) { int error; unsigned int nstates; nstates = DYN_COUNT; error = sysctl_handle_int(oidp, &nstates, 0, req); return (error); } #endif /* * Returns size of dynamic states in legacy format */ int ipfw_dyn_len(void) { return (V_ipfw_dyn_v == NULL) ? 0 : (DYN_COUNT * sizeof(ipfw_dyn_rule)); } /* * Returns number of dynamic states. * Used by dump format v1 (current). */ int ipfw_dyn_get_count(void) { return (V_ipfw_dyn_v == NULL) ? 0 : DYN_COUNT; } static void export_dyn_rule(ipfw_dyn_rule *src, ipfw_dyn_rule *dst) { memcpy(dst, src, sizeof(*src)); memcpy(&(dst->rule), &(src->rule->rulenum), sizeof(src->rule->rulenum)); /* * store set number into high word of * dst->rule pointer. */ memcpy((char *)&dst->rule + sizeof(src->rule->rulenum), &(src->rule->set), sizeof(src->rule->set)); /* * store a non-null value in "next". * The userland code will interpret a * NULL here as a marker * for the last dynamic rule. */ memcpy(&dst->next, &dst, sizeof(dst)); dst->expire = TIME_LEQ(dst->expire, time_uptime) ? 0 : dst->expire - time_uptime; } /* * Fills int buffer given by @sd with dynamic states. * Used by dump format v1 (current). * * Returns 0 on success. */ int ipfw_dump_states(struct ip_fw_chain *chain, struct sockopt_data *sd) { ipfw_dyn_rule *p; ipfw_obj_dyntlv *dst, *last; ipfw_obj_ctlv *ctlv; int i; size_t sz; if (V_ipfw_dyn_v == NULL) return (0); IPFW_UH_RLOCK_ASSERT(chain); ctlv = (ipfw_obj_ctlv *)ipfw_get_sopt_space(sd, sizeof(*ctlv)); if (ctlv == NULL) return (ENOMEM); sz = sizeof(ipfw_obj_dyntlv); ctlv->head.type = IPFW_TLV_DYNSTATE_LIST; ctlv->objsize = sz; last = NULL; for (i = 0 ; i < V_curr_dyn_buckets; i++) { IPFW_BUCK_LOCK(i); for (p = V_ipfw_dyn_v[i].head ; p != NULL; p = p->next) { dst = (ipfw_obj_dyntlv *)ipfw_get_sopt_space(sd, sz); if (dst == NULL) { IPFW_BUCK_UNLOCK(i); return (ENOMEM); } export_dyn_rule(p, &dst->state); dst->head.length = sz; dst->head.type = IPFW_TLV_DYN_ENT; last = dst; } IPFW_BUCK_UNLOCK(i); } if (last != NULL) /* mark last dynamic rule */ last->head.flags = IPFW_DF_LAST; return (0); } /* * Fill given buffer with dynamic states (legacy format). * IPFW_UH_RLOCK has to be held while calling. */ void ipfw_get_dynamic(struct ip_fw_chain *chain, char **pbp, const char *ep) { ipfw_dyn_rule *p, *last = NULL; char *bp; int i; if (V_ipfw_dyn_v == NULL) return; bp = *pbp; IPFW_UH_RLOCK_ASSERT(chain); for (i = 0 ; i < V_curr_dyn_buckets; i++) { IPFW_BUCK_LOCK(i); for (p = V_ipfw_dyn_v[i].head ; p != NULL; p = p->next) { if (bp + sizeof *p <= ep) { ipfw_dyn_rule *dst = (ipfw_dyn_rule *)bp; export_dyn_rule(p, dst); last = dst; bp += sizeof(ipfw_dyn_rule); } } IPFW_BUCK_UNLOCK(i); } if (last != NULL) /* mark last dynamic rule */ bzero(&last->next, sizeof(last)); *pbp = bp; } /* end of file */ Index: user/dchagin/lemul/sys/netpfil/ipfw/ip_fw_pfil.c =================================================================== --- user/dchagin/lemul/sys/netpfil/ipfw/ip_fw_pfil.c (revision 274240) +++ user/dchagin/lemul/sys/netpfil/ipfw/ip_fw_pfil.c (revision 274241) @@ -1,574 +1,575 @@ /*- * Copyright (c) 2004 Andre Oppermann, Internet Business Solutions AG * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ipfw.h" #include "opt_inet.h" #include "opt_inet6.h" #ifndef INET #error IPFIREWALL requires INET. #endif /* INET */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #endif #include #include #include static VNET_DEFINE(int, fw_enable) = 1; #define V_fw_enable VNET(fw_enable) #ifdef INET6 static VNET_DEFINE(int, fw6_enable) = 1; #define V_fw6_enable VNET(fw6_enable) #endif static VNET_DEFINE(int, fwlink_enable) = 0; #define V_fwlink_enable VNET(fwlink_enable) int ipfw_chg_hook(SYSCTL_HANDLER_ARGS); /* Forward declarations. */ static int ipfw_divert(struct mbuf **, int, struct ipfw_rule_ref *, int); int ipfw_check_packet(void *, struct mbuf **, struct ifnet *, int, struct inpcb *); int ipfw_check_frame(void *, struct mbuf **, struct ifnet *, int, struct inpcb *); #ifdef SYSCTL_NODE SYSBEGIN(f1) SYSCTL_DECL(_net_inet_ip_fw); -SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, enable, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_enable), 0, - ipfw_chg_hook, "I", "Enable ipfw"); +SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, enable, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, + &VNET_NAME(fw_enable), 0, ipfw_chg_hook, "I", "Enable ipfw"); #ifdef INET6 SYSCTL_DECL(_net_inet6_ip6_fw); -SYSCTL_VNET_PROC(_net_inet6_ip6_fw, OID_AUTO, enable, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw6_enable), 0, - ipfw_chg_hook, "I", "Enable ipfw+6"); +SYSCTL_PROC(_net_inet6_ip6_fw, OID_AUTO, enable, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, + &VNET_NAME(fw6_enable), 0, ipfw_chg_hook, "I", "Enable ipfw+6"); #endif /* INET6 */ SYSCTL_DECL(_net_link_ether); -SYSCTL_VNET_PROC(_net_link_ether, OID_AUTO, ipfw, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fwlink_enable), 0, - ipfw_chg_hook, "I", "Pass ether pkts through firewall"); +SYSCTL_PROC(_net_link_ether, OID_AUTO, ipfw, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, + &VNET_NAME(fwlink_enable), 0, ipfw_chg_hook, "I", + "Pass ether pkts through firewall"); SYSEND #endif /* SYSCTL_NODE */ /* * The pfilter hook to pass packets to ipfw_chk and then to * dummynet, divert, netgraph or other modules. * The packet may be consumed. */ int ipfw_check_packet(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir, struct inpcb *inp) { struct ip_fw_args args; struct m_tag *tag; int ipfw; int ret; /* convert dir to IPFW values */ dir = (dir == PFIL_IN) ? DIR_IN : DIR_OUT; bzero(&args, sizeof(args)); again: /* * extract and remove the tag if present. If we are left * with onepass, optimize the outgoing path. */ tag = m_tag_locate(*m0, MTAG_IPFW_RULE, 0, NULL); if (tag != NULL) { args.rule = *((struct ipfw_rule_ref *)(tag+1)); m_tag_delete(*m0, tag); if (args.rule.info & IPFW_ONEPASS) return (0); } args.m = *m0; args.oif = dir == DIR_OUT ? ifp : NULL; args.inp = inp; ipfw = ipfw_chk(&args); *m0 = args.m; KASSERT(*m0 != NULL || ipfw == IP_FW_DENY, ("%s: m0 is NULL", __func__)); /* breaking out of the switch means drop */ ret = 0; /* default return value for pass */ switch (ipfw) { case IP_FW_PASS: /* next_hop may be set by ipfw_chk */ if (args.next_hop == NULL && args.next_hop6 == NULL) break; /* pass */ #if (!defined(INET6) && !defined(INET)) ret = EACCES; #else { struct m_tag *fwd_tag; size_t len; KASSERT(args.next_hop == NULL || args.next_hop6 == NULL, ("%s: both next_hop=%p and next_hop6=%p not NULL", __func__, args.next_hop, args.next_hop6)); #ifdef INET6 if (args.next_hop6 != NULL) len = sizeof(struct sockaddr_in6); #endif #ifdef INET if (args.next_hop != NULL) len = sizeof(struct sockaddr_in); #endif /* Incoming packets should not be tagged so we do not * m_tag_find. Outgoing packets may be tagged, so we * reuse the tag if present. */ fwd_tag = (dir == DIR_IN) ? NULL : m_tag_find(*m0, PACKET_TAG_IPFORWARD, NULL); if (fwd_tag != NULL) { m_tag_unlink(*m0, fwd_tag); } else { fwd_tag = m_tag_get(PACKET_TAG_IPFORWARD, len, M_NOWAIT); if (fwd_tag == NULL) { ret = EACCES; break; /* i.e. drop */ } } #ifdef INET6 if (args.next_hop6 != NULL) { bcopy(args.next_hop6, (fwd_tag+1), len); if (in6_localip(&args.next_hop6->sin6_addr)) (*m0)->m_flags |= M_FASTFWD_OURS; (*m0)->m_flags |= M_IP6_NEXTHOP; } #endif #ifdef INET if (args.next_hop != NULL) { bcopy(args.next_hop, (fwd_tag+1), len); if (in_localip(args.next_hop->sin_addr)) (*m0)->m_flags |= M_FASTFWD_OURS; (*m0)->m_flags |= M_IP_NEXTHOP; } #endif m_tag_prepend(*m0, fwd_tag); } #endif /* INET || INET6 */ break; case IP_FW_DENY: ret = EACCES; break; /* i.e. drop */ case IP_FW_DUMMYNET: ret = EACCES; if (ip_dn_io_ptr == NULL) break; /* i.e. drop */ if (mtod(*m0, struct ip *)->ip_v == 4) ret = ip_dn_io_ptr(m0, dir, &args); else if (mtod(*m0, struct ip *)->ip_v == 6) ret = ip_dn_io_ptr(m0, dir | PROTO_IPV6, &args); else break; /* drop it */ /* * XXX should read the return value. * dummynet normally eats the packet and sets *m0=NULL * unless the packet can be sent immediately. In this * case args is updated and we should re-run the * check without clearing args. */ if (*m0 != NULL) goto again; break; case IP_FW_TEE: case IP_FW_DIVERT: if (ip_divert_ptr == NULL) { ret = EACCES; break; /* i.e. drop */ } ret = ipfw_divert(m0, dir, &args.rule, (ipfw == IP_FW_TEE) ? 1 : 0); /* continue processing for the original packet (tee). */ if (*m0) goto again; break; case IP_FW_NGTEE: case IP_FW_NETGRAPH: if (ng_ipfw_input_p == NULL) { ret = EACCES; break; /* i.e. drop */ } ret = ng_ipfw_input_p(m0, dir, &args, (ipfw == IP_FW_NGTEE) ? 1 : 0); if (ipfw == IP_FW_NGTEE) /* ignore errors for NGTEE */ goto again; /* continue with packet */ break; case IP_FW_NAT: /* honor one-pass in case of successful nat */ if (V_fw_one_pass) break; /* ret is already 0 */ goto again; case IP_FW_REASS: goto again; /* continue with packet */ default: KASSERT(0, ("%s: unknown retval", __func__)); } if (ret != 0) { if (*m0) FREE_PKT(*m0); *m0 = NULL; } return ret; } /* * ipfw processing for ethernet packets (in and out). * Inteface is NULL from ether_demux, and ifp from * ether_output_frame. */ int ipfw_check_frame(void *arg, struct mbuf **m0, struct ifnet *dst, int dir, struct inpcb *inp) { struct ether_header *eh; struct ether_header save_eh; struct mbuf *m; int i, ret; struct ip_fw_args args; struct m_tag *mtag; /* fetch start point from rule, if any */ mtag = m_tag_locate(*m0, MTAG_IPFW_RULE, 0, NULL); if (mtag == NULL) { args.rule.slot = 0; } else { /* dummynet packet, already partially processed */ struct ipfw_rule_ref *r; /* XXX can we free it after use ? */ mtag->m_tag_id = PACKET_TAG_NONE; r = (struct ipfw_rule_ref *)(mtag + 1); if (r->info & IPFW_ONEPASS) return (0); args.rule = *r; } /* I need some amt of data to be contiguous */ m = *m0; i = min(m->m_pkthdr.len, max_protohdr); if (m->m_len < i) { m = m_pullup(m, i); if (m == NULL) { *m0 = m; return (0); } } eh = mtod(m, struct ether_header *); save_eh = *eh; /* save copy for restore below */ m_adj(m, ETHER_HDR_LEN); /* strip ethernet header */ args.m = m; /* the packet we are looking at */ args.oif = dir == PFIL_OUT ? dst: NULL; /* destination, if any */ args.next_hop = NULL; /* we do not support forward yet */ args.next_hop6 = NULL; /* we do not support forward yet */ args.eh = &save_eh; /* MAC header for bridged/MAC packets */ args.inp = NULL; /* used by ipfw uid/gid/jail rules */ i = ipfw_chk(&args); m = args.m; if (m != NULL) { /* * Restore Ethernet header, as needed, in case the * mbuf chain was replaced by ipfw. */ M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT); if (m == NULL) { *m0 = NULL; return (0); } if (eh != mtod(m, struct ether_header *)) bcopy(&save_eh, mtod(m, struct ether_header *), ETHER_HDR_LEN); } *m0 = m; ret = 0; /* Check result of ipfw_chk() */ switch (i) { case IP_FW_PASS: break; case IP_FW_DENY: ret = EACCES; break; /* i.e. drop */ case IP_FW_DUMMYNET: ret = EACCES; int dir; if (ip_dn_io_ptr == NULL) break; /* i.e. drop */ *m0 = NULL; dir = PROTO_LAYER2 | (dst ? DIR_OUT : DIR_IN); ip_dn_io_ptr(&m, dir, &args); return 0; default: KASSERT(0, ("%s: unknown retval", __func__)); } if (ret != 0) { if (*m0) FREE_PKT(*m0); *m0 = NULL; } return ret; } /* do the divert, return 1 on error 0 on success */ static int ipfw_divert(struct mbuf **m0, int incoming, struct ipfw_rule_ref *rule, int tee) { /* * ipfw_chk() has already tagged the packet with the divert tag. * If tee is set, copy packet and return original. * If not tee, consume packet and send it to divert socket. */ struct mbuf *clone; struct ip *ip = mtod(*m0, struct ip *); struct m_tag *tag; /* Cloning needed for tee? */ if (tee == 0) { clone = *m0; /* use the original mbuf */ *m0 = NULL; } else { clone = m_dup(*m0, M_NOWAIT); /* If we cannot duplicate the mbuf, we sacrifice the divert * chain and continue with the tee-ed packet. */ if (clone == NULL) return 1; } /* * Divert listeners can normally handle non-fragmented packets, * but we can only reass in the non-tee case. * This means that listeners on a tee rule may get fragments, * and have to live with that. * Note that we now have the 'reass' ipfw option so if we care * we can do it before a 'tee'. */ if (!tee) switch (ip->ip_v) { case IPVERSION: if (ntohs(ip->ip_off) & (IP_MF | IP_OFFMASK)) { int hlen; struct mbuf *reass; reass = ip_reass(clone); /* Reassemble packet. */ if (reass == NULL) return 0; /* not an error */ /* if reass = NULL then it was consumed by ip_reass */ /* * IP header checksum fixup after reassembly and leave header * in network byte order. */ ip = mtod(reass, struct ip *); hlen = ip->ip_hl << 2; ip->ip_sum = 0; if (hlen == sizeof(struct ip)) ip->ip_sum = in_cksum_hdr(ip); else ip->ip_sum = in_cksum(reass, hlen); clone = reass; } break; #ifdef INET6 case IPV6_VERSION >> 4: { struct ip6_hdr *const ip6 = mtod(clone, struct ip6_hdr *); if (ip6->ip6_nxt == IPPROTO_FRAGMENT) { int nxt, off; off = sizeof(struct ip6_hdr); nxt = frag6_input(&clone, &off, 0); if (nxt == IPPROTO_DONE) return (0); } break; } #endif } /* attach a tag to the packet with the reinject info */ tag = m_tag_alloc(MTAG_IPFW_RULE, 0, sizeof(struct ipfw_rule_ref), M_NOWAIT); if (tag == NULL) { FREE_PKT(clone); return 1; } *((struct ipfw_rule_ref *)(tag+1)) = *rule; m_tag_prepend(clone, tag); /* Do the dirty job... */ ip_divert_ptr(clone, incoming); return 0; } /* * attach or detach hooks for a given protocol family */ static int ipfw_hook(int onoff, int pf) { struct pfil_head *pfh; void *hook_func; pfh = pfil_head_get(PFIL_TYPE_AF, pf); if (pfh == NULL) return ENOENT; hook_func = (pf == AF_LINK) ? ipfw_check_frame : ipfw_check_packet; (void) (onoff ? pfil_add_hook : pfil_remove_hook) (hook_func, NULL, PFIL_IN | PFIL_OUT | PFIL_WAITOK, pfh); return 0; } int ipfw_attach_hooks(int arg) { int error = 0; if (arg == 0) /* detach */ ipfw_hook(0, AF_INET); else if (V_fw_enable && ipfw_hook(1, AF_INET) != 0) { error = ENOENT; /* see ip_fw_pfil.c::ipfw_hook() */ printf("ipfw_hook() error\n"); } #ifdef INET6 if (arg == 0) /* detach */ ipfw_hook(0, AF_INET6); else if (V_fw6_enable && ipfw_hook(1, AF_INET6) != 0) { error = ENOENT; printf("ipfw6_hook() error\n"); } #endif if (arg == 0) /* detach */ ipfw_hook(0, AF_LINK); else if (V_fwlink_enable && ipfw_hook(1, AF_LINK) != 0) { error = ENOENT; printf("ipfw_link_hook() error\n"); } return error; } int ipfw_chg_hook(SYSCTL_HANDLER_ARGS) { int newval; int error; int af; if (arg1 == &V_fw_enable) af = AF_INET; #ifdef INET6 else if (arg1 == &V_fw6_enable) af = AF_INET6; #endif else if (arg1 == &V_fwlink_enable) af = AF_LINK; else return (EINVAL); newval = *(int *)arg1; /* Handle sysctl change */ error = sysctl_handle_int(oidp, &newval, 0, req); if (error) return (error); /* Formalize new value */ newval = (newval) ? 1 : 0; if (*(int *)arg1 == newval) return (0); error = ipfw_hook(newval, af); if (error) return (error); *(int *)arg1 = newval; return (0); } /* end of file */ Index: user/dchagin/lemul/sys/netpfil/pf/if_pfsync.c =================================================================== --- user/dchagin/lemul/sys/netpfil/pf/if_pfsync.c (revision 274240) +++ user/dchagin/lemul/sys/netpfil/pf/if_pfsync.c (revision 274241) @@ -1,2421 +1,2421 @@ /*- * Copyright (c) 2002 Michael Shalayeff * Copyright (c) 2012 Gleb Smirnoff * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 2009 David Gwynne * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ /* * $OpenBSD: if_pfsync.c,v 1.110 2009/02/24 05:39:19 dlg Exp $ * * Revisions picked from OpenBSD after revision 1.110 import: * 1.119 - don't m_copydata() beyond the len of mbuf in pfsync_input() * 1.118, 1.124, 1.148, 1.149, 1.151, 1.171 - fixes to bulk updates * 1.120, 1.175 - use monotonic time_uptime * 1.122 - reduce number of updates for non-TCP sessions * 1.125, 1.127 - rewrite merge or stale processing * 1.128 - cleanups * 1.146 - bzero() mbuf before sparsely filling it with data * 1.170 - SIOCSIFMTU checks * 1.126, 1.142 - deferred packets processing * 1.173 - correct expire time processing */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_pf.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define PFSYNC_MINPKT ( \ sizeof(struct ip) + \ sizeof(struct pfsync_header) + \ sizeof(struct pfsync_subheader) ) struct pfsync_pkt { struct ip *ip; struct in_addr src; u_int8_t flags; }; static int pfsync_upd_tcp(struct pf_state *, struct pfsync_state_peer *, struct pfsync_state_peer *); static int pfsync_in_clr(struct pfsync_pkt *, struct mbuf *, int, int); static int pfsync_in_ins(struct pfsync_pkt *, struct mbuf *, int, int); static int pfsync_in_iack(struct pfsync_pkt *, struct mbuf *, int, int); static int pfsync_in_upd(struct pfsync_pkt *, struct mbuf *, int, int); static int pfsync_in_upd_c(struct pfsync_pkt *, struct mbuf *, int, int); static int pfsync_in_ureq(struct pfsync_pkt *, struct mbuf *, int, int); static int pfsync_in_del(struct pfsync_pkt *, struct mbuf *, int, int); static int pfsync_in_del_c(struct pfsync_pkt *, struct mbuf *, int, int); static int pfsync_in_bus(struct pfsync_pkt *, struct mbuf *, int, int); static int pfsync_in_tdb(struct pfsync_pkt *, struct mbuf *, int, int); static int pfsync_in_eof(struct pfsync_pkt *, struct mbuf *, int, int); static int pfsync_in_error(struct pfsync_pkt *, struct mbuf *, int, int); static int (*pfsync_acts[])(struct pfsync_pkt *, struct mbuf *, int, int) = { pfsync_in_clr, /* PFSYNC_ACT_CLR */ pfsync_in_ins, /* PFSYNC_ACT_INS */ pfsync_in_iack, /* PFSYNC_ACT_INS_ACK */ pfsync_in_upd, /* PFSYNC_ACT_UPD */ pfsync_in_upd_c, /* PFSYNC_ACT_UPD_C */ pfsync_in_ureq, /* PFSYNC_ACT_UPD_REQ */ pfsync_in_del, /* PFSYNC_ACT_DEL */ pfsync_in_del_c, /* PFSYNC_ACT_DEL_C */ pfsync_in_error, /* PFSYNC_ACT_INS_F */ pfsync_in_error, /* PFSYNC_ACT_DEL_F */ pfsync_in_bus, /* PFSYNC_ACT_BUS */ pfsync_in_tdb, /* PFSYNC_ACT_TDB */ pfsync_in_eof /* PFSYNC_ACT_EOF */ }; struct pfsync_q { void (*write)(struct pf_state *, void *); size_t len; u_int8_t action; }; /* we have one of these for every PFSYNC_S_ */ static void pfsync_out_state(struct pf_state *, void *); static void pfsync_out_iack(struct pf_state *, void *); static void pfsync_out_upd_c(struct pf_state *, void *); static void pfsync_out_del(struct pf_state *, void *); static struct pfsync_q pfsync_qs[] = { { pfsync_out_state, sizeof(struct pfsync_state), PFSYNC_ACT_INS }, { pfsync_out_iack, sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK }, { pfsync_out_state, sizeof(struct pfsync_state), PFSYNC_ACT_UPD }, { pfsync_out_upd_c, sizeof(struct pfsync_upd_c), PFSYNC_ACT_UPD_C }, { pfsync_out_del, sizeof(struct pfsync_del_c), PFSYNC_ACT_DEL_C } }; static void pfsync_q_ins(struct pf_state *, int); static void pfsync_q_del(struct pf_state *); static void pfsync_update_state(struct pf_state *); struct pfsync_upd_req_item { TAILQ_ENTRY(pfsync_upd_req_item) ur_entry; struct pfsync_upd_req ur_msg; }; struct pfsync_deferral { struct pfsync_softc *pd_sc; TAILQ_ENTRY(pfsync_deferral) pd_entry; u_int pd_refs; struct callout pd_tmo; struct pf_state *pd_st; struct mbuf *pd_m; }; struct pfsync_softc { /* Configuration */ struct ifnet *sc_ifp; struct ifnet *sc_sync_if; struct ip_moptions sc_imo; struct in_addr sc_sync_peer; uint32_t sc_flags; #define PFSYNCF_OK 0x00000001 #define PFSYNCF_DEFER 0x00000002 #define PFSYNCF_PUSH 0x00000004 uint8_t sc_maxupdates; struct ip sc_template; struct callout sc_tmo; struct mtx sc_mtx; /* Queued data */ size_t sc_len; TAILQ_HEAD(, pf_state) sc_qs[PFSYNC_S_COUNT]; TAILQ_HEAD(, pfsync_upd_req_item) sc_upd_req_list; TAILQ_HEAD(, pfsync_deferral) sc_deferrals; u_int sc_deferred; void *sc_plus; size_t sc_pluslen; /* Bulk update info */ struct mtx sc_bulk_mtx; uint32_t sc_ureq_sent; int sc_bulk_tries; uint32_t sc_ureq_received; int sc_bulk_hashid; uint64_t sc_bulk_stateid; uint32_t sc_bulk_creatorid; struct callout sc_bulk_tmo; struct callout sc_bulkfail_tmo; }; #define PFSYNC_LOCK(sc) mtx_lock(&(sc)->sc_mtx) #define PFSYNC_UNLOCK(sc) mtx_unlock(&(sc)->sc_mtx) #define PFSYNC_LOCK_ASSERT(sc) mtx_assert(&(sc)->sc_mtx, MA_OWNED) #define PFSYNC_BLOCK(sc) mtx_lock(&(sc)->sc_bulk_mtx) #define PFSYNC_BUNLOCK(sc) mtx_unlock(&(sc)->sc_bulk_mtx) #define PFSYNC_BLOCK_ASSERT(sc) mtx_assert(&(sc)->sc_bulk_mtx, MA_OWNED) static const char pfsyncname[] = "pfsync"; static MALLOC_DEFINE(M_PFSYNC, pfsyncname, "pfsync(4) data"); static VNET_DEFINE(struct pfsync_softc *, pfsyncif) = NULL; #define V_pfsyncif VNET(pfsyncif) static VNET_DEFINE(void *, pfsync_swi_cookie) = NULL; #define V_pfsync_swi_cookie VNET(pfsync_swi_cookie) static VNET_DEFINE(struct pfsyncstats, pfsyncstats); #define V_pfsyncstats VNET(pfsyncstats) static VNET_DEFINE(int, pfsync_carp_adj) = CARP_MAXSKEW; #define V_pfsync_carp_adj VNET(pfsync_carp_adj) static void pfsync_timeout(void *); static void pfsync_push(struct pfsync_softc *); static void pfsyncintr(void *); static int pfsync_multicast_setup(struct pfsync_softc *, struct ifnet *, void *); static void pfsync_multicast_cleanup(struct pfsync_softc *); static void pfsync_pointers_init(void); static void pfsync_pointers_uninit(void); static int pfsync_init(void); static void pfsync_uninit(void); SYSCTL_NODE(_net, OID_AUTO, pfsync, CTLFLAG_RW, 0, "PFSYNC"); -SYSCTL_VNET_STRUCT(_net_pfsync, OID_AUTO, stats, CTLFLAG_RW, +SYSCTL_STRUCT(_net_pfsync, OID_AUTO, stats, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(pfsyncstats), pfsyncstats, "PFSYNC statistics (struct pfsyncstats, net/if_pfsync.h)"); SYSCTL_INT(_net_pfsync, OID_AUTO, carp_demotion_factor, CTLFLAG_RW, &VNET_NAME(pfsync_carp_adj), 0, "pfsync's CARP demotion factor adjustment"); static int pfsync_clone_create(struct if_clone *, int, caddr_t); static void pfsync_clone_destroy(struct ifnet *); static int pfsync_alloc_scrub_memory(struct pfsync_state_peer *, struct pf_state_peer *); static int pfsyncoutput(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static int pfsyncioctl(struct ifnet *, u_long, caddr_t); static int pfsync_defer(struct pf_state *, struct mbuf *); static void pfsync_undefer(struct pfsync_deferral *, int); static void pfsync_undefer_state(struct pf_state *, int); static void pfsync_defer_tmo(void *); static void pfsync_request_update(u_int32_t, u_int64_t); static void pfsync_update_state_req(struct pf_state *); static void pfsync_drop(struct pfsync_softc *); static void pfsync_sendout(int); static void pfsync_send_plus(void *, size_t); static void pfsync_bulk_start(void); static void pfsync_bulk_status(u_int8_t); static void pfsync_bulk_update(void *); static void pfsync_bulk_fail(void *); #ifdef IPSEC static void pfsync_update_net_tdb(struct pfsync_tdb *); #endif #define PFSYNC_MAX_BULKTRIES 12 VNET_DEFINE(struct if_clone *, pfsync_cloner); #define V_pfsync_cloner VNET(pfsync_cloner) static int pfsync_clone_create(struct if_clone *ifc, int unit, caddr_t param) { struct pfsync_softc *sc; struct ifnet *ifp; int q; if (unit != 0) return (EINVAL); sc = malloc(sizeof(struct pfsync_softc), M_PFSYNC, M_WAITOK | M_ZERO); sc->sc_flags |= PFSYNCF_OK; for (q = 0; q < PFSYNC_S_COUNT; q++) TAILQ_INIT(&sc->sc_qs[q]); TAILQ_INIT(&sc->sc_upd_req_list); TAILQ_INIT(&sc->sc_deferrals); sc->sc_len = PFSYNC_MINPKT; sc->sc_maxupdates = 128; ifp = sc->sc_ifp = if_alloc(IFT_PFSYNC); if (ifp == NULL) { free(sc, M_PFSYNC); return (ENOSPC); } if_initname(ifp, pfsyncname, unit); ifp->if_softc = sc; ifp->if_ioctl = pfsyncioctl; ifp->if_output = pfsyncoutput; ifp->if_type = IFT_PFSYNC; ifp->if_snd.ifq_maxlen = ifqmaxlen; ifp->if_hdrlen = sizeof(struct pfsync_header); ifp->if_mtu = ETHERMTU; mtx_init(&sc->sc_mtx, pfsyncname, NULL, MTX_DEF); mtx_init(&sc->sc_bulk_mtx, "pfsync bulk", NULL, MTX_DEF); callout_init(&sc->sc_tmo, CALLOUT_MPSAFE); callout_init_mtx(&sc->sc_bulk_tmo, &sc->sc_bulk_mtx, 0); callout_init_mtx(&sc->sc_bulkfail_tmo, &sc->sc_bulk_mtx, 0); if_attach(ifp); bpfattach(ifp, DLT_PFSYNC, PFSYNC_HDRLEN); V_pfsyncif = sc; return (0); } static void pfsync_clone_destroy(struct ifnet *ifp) { struct pfsync_softc *sc = ifp->if_softc; /* * At this stage, everything should have already been * cleared by pfsync_uninit(), and we have only to * drain callouts. */ while (sc->sc_deferred > 0) { struct pfsync_deferral *pd = TAILQ_FIRST(&sc->sc_deferrals); TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry); sc->sc_deferred--; if (callout_stop(&pd->pd_tmo)) { pf_release_state(pd->pd_st); m_freem(pd->pd_m); free(pd, M_PFSYNC); } else { pd->pd_refs++; callout_drain(&pd->pd_tmo); free(pd, M_PFSYNC); } } callout_drain(&sc->sc_tmo); callout_drain(&sc->sc_bulkfail_tmo); callout_drain(&sc->sc_bulk_tmo); if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p) (*carp_demote_adj_p)(-V_pfsync_carp_adj, "pfsync destroy"); bpfdetach(ifp); if_detach(ifp); pfsync_drop(sc); if_free(ifp); if (sc->sc_imo.imo_membership) pfsync_multicast_cleanup(sc); mtx_destroy(&sc->sc_mtx); mtx_destroy(&sc->sc_bulk_mtx); free(sc, M_PFSYNC); V_pfsyncif = NULL; } static int pfsync_alloc_scrub_memory(struct pfsync_state_peer *s, struct pf_state_peer *d) { if (s->scrub.scrub_flag && d->scrub == NULL) { d->scrub = uma_zalloc(V_pf_state_scrub_z, M_NOWAIT | M_ZERO); if (d->scrub == NULL) return (ENOMEM); } return (0); } static int pfsync_state_import(struct pfsync_state *sp, u_int8_t flags) { struct pfsync_softc *sc = V_pfsyncif; #ifndef __NO_STRICT_ALIGNMENT struct pfsync_state_key key[2]; #endif struct pfsync_state_key *kw, *ks; struct pf_state *st = NULL; struct pf_state_key *skw = NULL, *sks = NULL; struct pf_rule *r = NULL; struct pfi_kif *kif; int error; PF_RULES_RASSERT(); if (sp->creatorid == 0) { if (V_pf_status.debug >= PF_DEBUG_MISC) printf("%s: invalid creator id: %08x\n", __func__, ntohl(sp->creatorid)); return (EINVAL); } if ((kif = pfi_kif_find(sp->ifname)) == NULL) { if (V_pf_status.debug >= PF_DEBUG_MISC) printf("%s: unknown interface: %s\n", __func__, sp->ifname); if (flags & PFSYNC_SI_IOCTL) return (EINVAL); return (0); /* skip this state */ } /* * If the ruleset checksums match or the state is coming from the ioctl, * it's safe to associate the state with the rule of that number. */ if (sp->rule != htonl(-1) && sp->anchor == htonl(-1) && (flags & (PFSYNC_SI_IOCTL | PFSYNC_SI_CKSUM)) && ntohl(sp->rule) < pf_main_ruleset.rules[PF_RULESET_FILTER].active.rcount) r = pf_main_ruleset.rules[ PF_RULESET_FILTER].active.ptr_array[ntohl(sp->rule)]; else r = &V_pf_default_rule; if ((r->max_states && counter_u64_fetch(r->states_cur) >= r->max_states)) goto cleanup; /* * XXXGL: consider M_WAITOK in ioctl path after. */ if ((st = uma_zalloc(V_pf_state_z, M_NOWAIT | M_ZERO)) == NULL) goto cleanup; if ((skw = uma_zalloc(V_pf_state_key_z, M_NOWAIT)) == NULL) goto cleanup; #ifndef __NO_STRICT_ALIGNMENT bcopy(&sp->key, key, sizeof(struct pfsync_state_key) * 2); kw = &key[PF_SK_WIRE]; ks = &key[PF_SK_STACK]; #else kw = &sp->key[PF_SK_WIRE]; ks = &sp->key[PF_SK_STACK]; #endif if (PF_ANEQ(&kw->addr[0], &ks->addr[0], sp->af) || PF_ANEQ(&kw->addr[1], &ks->addr[1], sp->af) || kw->port[0] != ks->port[0] || kw->port[1] != ks->port[1]) { sks = uma_zalloc(V_pf_state_key_z, M_NOWAIT); if (sks == NULL) goto cleanup; } else sks = skw; /* allocate memory for scrub info */ if (pfsync_alloc_scrub_memory(&sp->src, &st->src) || pfsync_alloc_scrub_memory(&sp->dst, &st->dst)) goto cleanup; /* Copy to state key(s). */ skw->addr[0] = kw->addr[0]; skw->addr[1] = kw->addr[1]; skw->port[0] = kw->port[0]; skw->port[1] = kw->port[1]; skw->proto = sp->proto; skw->af = sp->af; if (sks != skw) { sks->addr[0] = ks->addr[0]; sks->addr[1] = ks->addr[1]; sks->port[0] = ks->port[0]; sks->port[1] = ks->port[1]; sks->proto = sp->proto; sks->af = sp->af; } /* copy to state */ bcopy(&sp->rt_addr, &st->rt_addr, sizeof(st->rt_addr)); st->creation = time_uptime - ntohl(sp->creation); st->expire = time_uptime; if (sp->expire) { uint32_t timeout; timeout = r->timeout[sp->timeout]; if (!timeout) timeout = V_pf_default_rule.timeout[sp->timeout]; /* sp->expire may have been adaptively scaled by export. */ st->expire -= timeout - ntohl(sp->expire); } st->direction = sp->direction; st->log = sp->log; st->timeout = sp->timeout; st->state_flags = sp->state_flags; st->id = sp->id; st->creatorid = sp->creatorid; pf_state_peer_ntoh(&sp->src, &st->src); pf_state_peer_ntoh(&sp->dst, &st->dst); st->rule.ptr = r; st->nat_rule.ptr = NULL; st->anchor.ptr = NULL; st->rt_kif = NULL; st->pfsync_time = time_uptime; st->sync_state = PFSYNC_S_NONE; if (!(flags & PFSYNC_SI_IOCTL)) st->state_flags |= PFSTATE_NOSYNC; if ((error = pf_state_insert(kif, skw, sks, st)) != 0) goto cleanup_state; /* XXX when we have nat_rule/anchors, use STATE_INC_COUNTERS */ counter_u64_add(r->states_cur, 1); counter_u64_add(r->states_tot, 1); if (!(flags & PFSYNC_SI_IOCTL)) { st->state_flags &= ~PFSTATE_NOSYNC; if (st->state_flags & PFSTATE_ACK) { pfsync_q_ins(st, PFSYNC_S_IACK); pfsync_push(sc); } } st->state_flags &= ~PFSTATE_ACK; PF_STATE_UNLOCK(st); return (0); cleanup: error = ENOMEM; if (skw == sks) sks = NULL; if (skw != NULL) uma_zfree(V_pf_state_key_z, skw); if (sks != NULL) uma_zfree(V_pf_state_key_z, sks); cleanup_state: /* pf_state_insert() frees the state keys. */ if (st) { if (st->dst.scrub) uma_zfree(V_pf_state_scrub_z, st->dst.scrub); if (st->src.scrub) uma_zfree(V_pf_state_scrub_z, st->src.scrub); uma_zfree(V_pf_state_z, st); } return (error); } static int pfsync_input(struct mbuf **mp, int *offp __unused, int proto __unused) { struct pfsync_softc *sc = V_pfsyncif; struct pfsync_pkt pkt; struct mbuf *m = *mp; struct ip *ip = mtod(m, struct ip *); struct pfsync_header *ph; struct pfsync_subheader subh; int offset, len; int rv; uint16_t count; *mp = NULL; V_pfsyncstats.pfsyncs_ipackets++; /* Verify that we have a sync interface configured. */ if (!sc || !sc->sc_sync_if || !V_pf_status.running || (sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) goto done; /* verify that the packet came in on the right interface */ if (sc->sc_sync_if != m->m_pkthdr.rcvif) { V_pfsyncstats.pfsyncs_badif++; goto done; } if_inc_counter(sc->sc_ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(sc->sc_ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); /* verify that the IP TTL is 255. */ if (ip->ip_ttl != PFSYNC_DFLTTL) { V_pfsyncstats.pfsyncs_badttl++; goto done; } offset = ip->ip_hl << 2; if (m->m_pkthdr.len < offset + sizeof(*ph)) { V_pfsyncstats.pfsyncs_hdrops++; goto done; } if (offset + sizeof(*ph) > m->m_len) { if (m_pullup(m, offset + sizeof(*ph)) == NULL) { V_pfsyncstats.pfsyncs_hdrops++; return (IPPROTO_DONE); } ip = mtod(m, struct ip *); } ph = (struct pfsync_header *)((char *)ip + offset); /* verify the version */ if (ph->version != PFSYNC_VERSION) { V_pfsyncstats.pfsyncs_badver++; goto done; } len = ntohs(ph->len) + offset; if (m->m_pkthdr.len < len) { V_pfsyncstats.pfsyncs_badlen++; goto done; } /* Cheaper to grab this now than having to mess with mbufs later */ pkt.ip = ip; pkt.src = ip->ip_src; pkt.flags = 0; /* * Trusting pf_chksum during packet processing, as well as seeking * in interface name tree, require holding PF_RULES_RLOCK(). */ PF_RULES_RLOCK(); if (!bcmp(&ph->pfcksum, &V_pf_status.pf_chksum, PF_MD5_DIGEST_LENGTH)) pkt.flags |= PFSYNC_SI_CKSUM; offset += sizeof(*ph); while (offset <= len - sizeof(subh)) { m_copydata(m, offset, sizeof(subh), (caddr_t)&subh); offset += sizeof(subh); if (subh.action >= PFSYNC_ACT_MAX) { V_pfsyncstats.pfsyncs_badact++; PF_RULES_RUNLOCK(); goto done; } count = ntohs(subh.count); V_pfsyncstats.pfsyncs_iacts[subh.action] += count; rv = (*pfsync_acts[subh.action])(&pkt, m, offset, count); if (rv == -1) { PF_RULES_RUNLOCK(); return (IPPROTO_DONE); } offset += rv; } PF_RULES_RUNLOCK(); done: m_freem(m); return (IPPROTO_DONE); } static int pfsync_in_clr(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { struct pfsync_clr *clr; struct mbuf *mp; int len = sizeof(*clr) * count; int i, offp; u_int32_t creatorid; mp = m_pulldown(m, offset, len, &offp); if (mp == NULL) { V_pfsyncstats.pfsyncs_badlen++; return (-1); } clr = (struct pfsync_clr *)(mp->m_data + offp); for (i = 0; i < count; i++) { creatorid = clr[i].creatorid; if (clr[i].ifname[0] != '\0' && pfi_kif_find(clr[i].ifname) == NULL) continue; for (int i = 0; i <= pf_hashmask; i++) { struct pf_idhash *ih = &V_pf_idhash[i]; struct pf_state *s; relock: PF_HASHROW_LOCK(ih); LIST_FOREACH(s, &ih->states, entry) { if (s->creatorid == creatorid) { s->state_flags |= PFSTATE_NOSYNC; pf_unlink_state(s, PF_ENTER_LOCKED); goto relock; } } PF_HASHROW_UNLOCK(ih); } } return (len); } static int pfsync_in_ins(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { struct mbuf *mp; struct pfsync_state *sa, *sp; int len = sizeof(*sp) * count; int i, offp; mp = m_pulldown(m, offset, len, &offp); if (mp == NULL) { V_pfsyncstats.pfsyncs_badlen++; return (-1); } sa = (struct pfsync_state *)(mp->m_data + offp); for (i = 0; i < count; i++) { sp = &sa[i]; /* Check for invalid values. */ if (sp->timeout >= PFTM_MAX || sp->src.state > PF_TCPS_PROXY_DST || sp->dst.state > PF_TCPS_PROXY_DST || sp->direction > PF_OUT || (sp->af != AF_INET && sp->af != AF_INET6)) { if (V_pf_status.debug >= PF_DEBUG_MISC) printf("%s: invalid value\n", __func__); V_pfsyncstats.pfsyncs_badval++; continue; } if (pfsync_state_import(sp, pkt->flags) == ENOMEM) /* Drop out, but process the rest of the actions. */ break; } return (len); } static int pfsync_in_iack(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { struct pfsync_ins_ack *ia, *iaa; struct pf_state *st; struct mbuf *mp; int len = count * sizeof(*ia); int offp, i; mp = m_pulldown(m, offset, len, &offp); if (mp == NULL) { V_pfsyncstats.pfsyncs_badlen++; return (-1); } iaa = (struct pfsync_ins_ack *)(mp->m_data + offp); for (i = 0; i < count; i++) { ia = &iaa[i]; st = pf_find_state_byid(ia->id, ia->creatorid); if (st == NULL) continue; if (st->state_flags & PFSTATE_ACK) { PFSYNC_LOCK(V_pfsyncif); pfsync_undefer_state(st, 0); PFSYNC_UNLOCK(V_pfsyncif); } PF_STATE_UNLOCK(st); } /* * XXX this is not yet implemented, but we know the size of the * message so we can skip it. */ return (count * sizeof(struct pfsync_ins_ack)); } static int pfsync_upd_tcp(struct pf_state *st, struct pfsync_state_peer *src, struct pfsync_state_peer *dst) { int sync = 0; PF_STATE_LOCK_ASSERT(st); /* * The state should never go backwards except * for syn-proxy states. Neither should the * sequence window slide backwards. */ if ((st->src.state > src->state && (st->src.state < PF_TCPS_PROXY_SRC || src->state >= PF_TCPS_PROXY_SRC)) || (st->src.state == src->state && SEQ_GT(st->src.seqlo, ntohl(src->seqlo)))) sync++; else pf_state_peer_ntoh(src, &st->src); if ((st->dst.state > dst->state) || (st->dst.state >= TCPS_SYN_SENT && SEQ_GT(st->dst.seqlo, ntohl(dst->seqlo)))) sync++; else pf_state_peer_ntoh(dst, &st->dst); return (sync); } static int pfsync_in_upd(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { struct pfsync_softc *sc = V_pfsyncif; struct pfsync_state *sa, *sp; struct pf_state *st; int sync; struct mbuf *mp; int len = count * sizeof(*sp); int offp, i; mp = m_pulldown(m, offset, len, &offp); if (mp == NULL) { V_pfsyncstats.pfsyncs_badlen++; return (-1); } sa = (struct pfsync_state *)(mp->m_data + offp); for (i = 0; i < count; i++) { sp = &sa[i]; /* check for invalid values */ if (sp->timeout >= PFTM_MAX || sp->src.state > PF_TCPS_PROXY_DST || sp->dst.state > PF_TCPS_PROXY_DST) { if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pfsync_input: PFSYNC_ACT_UPD: " "invalid value\n"); } V_pfsyncstats.pfsyncs_badval++; continue; } st = pf_find_state_byid(sp->id, sp->creatorid); if (st == NULL) { /* insert the update */ if (pfsync_state_import(sp, 0)) V_pfsyncstats.pfsyncs_badstate++; continue; } if (st->state_flags & PFSTATE_ACK) { PFSYNC_LOCK(sc); pfsync_undefer_state(st, 1); PFSYNC_UNLOCK(sc); } if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) sync = pfsync_upd_tcp(st, &sp->src, &sp->dst); else { sync = 0; /* * Non-TCP protocol state machine always go * forwards */ if (st->src.state > sp->src.state) sync++; else pf_state_peer_ntoh(&sp->src, &st->src); if (st->dst.state > sp->dst.state) sync++; else pf_state_peer_ntoh(&sp->dst, &st->dst); } if (sync < 2) { pfsync_alloc_scrub_memory(&sp->dst, &st->dst); pf_state_peer_ntoh(&sp->dst, &st->dst); st->expire = time_uptime; st->timeout = sp->timeout; } st->pfsync_time = time_uptime; if (sync) { V_pfsyncstats.pfsyncs_stale++; pfsync_update_state(st); PF_STATE_UNLOCK(st); PFSYNC_LOCK(sc); pfsync_push(sc); PFSYNC_UNLOCK(sc); continue; } PF_STATE_UNLOCK(st); } return (len); } static int pfsync_in_upd_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { struct pfsync_softc *sc = V_pfsyncif; struct pfsync_upd_c *ua, *up; struct pf_state *st; int len = count * sizeof(*up); int sync; struct mbuf *mp; int offp, i; mp = m_pulldown(m, offset, len, &offp); if (mp == NULL) { V_pfsyncstats.pfsyncs_badlen++; return (-1); } ua = (struct pfsync_upd_c *)(mp->m_data + offp); for (i = 0; i < count; i++) { up = &ua[i]; /* check for invalid values */ if (up->timeout >= PFTM_MAX || up->src.state > PF_TCPS_PROXY_DST || up->dst.state > PF_TCPS_PROXY_DST) { if (V_pf_status.debug >= PF_DEBUG_MISC) { printf("pfsync_input: " "PFSYNC_ACT_UPD_C: " "invalid value\n"); } V_pfsyncstats.pfsyncs_badval++; continue; } st = pf_find_state_byid(up->id, up->creatorid); if (st == NULL) { /* We don't have this state. Ask for it. */ PFSYNC_LOCK(sc); pfsync_request_update(up->creatorid, up->id); PFSYNC_UNLOCK(sc); continue; } if (st->state_flags & PFSTATE_ACK) { PFSYNC_LOCK(sc); pfsync_undefer_state(st, 1); PFSYNC_UNLOCK(sc); } if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) sync = pfsync_upd_tcp(st, &up->src, &up->dst); else { sync = 0; /* * Non-TCP protocol state machine always go * forwards */ if (st->src.state > up->src.state) sync++; else pf_state_peer_ntoh(&up->src, &st->src); if (st->dst.state > up->dst.state) sync++; else pf_state_peer_ntoh(&up->dst, &st->dst); } if (sync < 2) { pfsync_alloc_scrub_memory(&up->dst, &st->dst); pf_state_peer_ntoh(&up->dst, &st->dst); st->expire = time_uptime; st->timeout = up->timeout; } st->pfsync_time = time_uptime; if (sync) { V_pfsyncstats.pfsyncs_stale++; pfsync_update_state(st); PF_STATE_UNLOCK(st); PFSYNC_LOCK(sc); pfsync_push(sc); PFSYNC_UNLOCK(sc); continue; } PF_STATE_UNLOCK(st); } return (len); } static int pfsync_in_ureq(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { struct pfsync_upd_req *ur, *ura; struct mbuf *mp; int len = count * sizeof(*ur); int i, offp; struct pf_state *st; mp = m_pulldown(m, offset, len, &offp); if (mp == NULL) { V_pfsyncstats.pfsyncs_badlen++; return (-1); } ura = (struct pfsync_upd_req *)(mp->m_data + offp); for (i = 0; i < count; i++) { ur = &ura[i]; if (ur->id == 0 && ur->creatorid == 0) pfsync_bulk_start(); else { st = pf_find_state_byid(ur->id, ur->creatorid); if (st == NULL) { V_pfsyncstats.pfsyncs_badstate++; continue; } if (st->state_flags & PFSTATE_NOSYNC) { PF_STATE_UNLOCK(st); continue; } pfsync_update_state_req(st); PF_STATE_UNLOCK(st); } } return (len); } static int pfsync_in_del(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { struct mbuf *mp; struct pfsync_state *sa, *sp; struct pf_state *st; int len = count * sizeof(*sp); int offp, i; mp = m_pulldown(m, offset, len, &offp); if (mp == NULL) { V_pfsyncstats.pfsyncs_badlen++; return (-1); } sa = (struct pfsync_state *)(mp->m_data + offp); for (i = 0; i < count; i++) { sp = &sa[i]; st = pf_find_state_byid(sp->id, sp->creatorid); if (st == NULL) { V_pfsyncstats.pfsyncs_badstate++; continue; } st->state_flags |= PFSTATE_NOSYNC; pf_unlink_state(st, PF_ENTER_LOCKED); } return (len); } static int pfsync_in_del_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { struct mbuf *mp; struct pfsync_del_c *sa, *sp; struct pf_state *st; int len = count * sizeof(*sp); int offp, i; mp = m_pulldown(m, offset, len, &offp); if (mp == NULL) { V_pfsyncstats.pfsyncs_badlen++; return (-1); } sa = (struct pfsync_del_c *)(mp->m_data + offp); for (i = 0; i < count; i++) { sp = &sa[i]; st = pf_find_state_byid(sp->id, sp->creatorid); if (st == NULL) { V_pfsyncstats.pfsyncs_badstate++; continue; } st->state_flags |= PFSTATE_NOSYNC; pf_unlink_state(st, PF_ENTER_LOCKED); } return (len); } static int pfsync_in_bus(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { struct pfsync_softc *sc = V_pfsyncif; struct pfsync_bus *bus; struct mbuf *mp; int len = count * sizeof(*bus); int offp; PFSYNC_BLOCK(sc); /* If we're not waiting for a bulk update, who cares. */ if (sc->sc_ureq_sent == 0) { PFSYNC_BUNLOCK(sc); return (len); } mp = m_pulldown(m, offset, len, &offp); if (mp == NULL) { PFSYNC_BUNLOCK(sc); V_pfsyncstats.pfsyncs_badlen++; return (-1); } bus = (struct pfsync_bus *)(mp->m_data + offp); switch (bus->status) { case PFSYNC_BUS_START: callout_reset(&sc->sc_bulkfail_tmo, 4 * hz + V_pf_limits[PF_LIMIT_STATES].limit / ((sc->sc_ifp->if_mtu - PFSYNC_MINPKT) / sizeof(struct pfsync_state)), pfsync_bulk_fail, sc); if (V_pf_status.debug >= PF_DEBUG_MISC) printf("pfsync: received bulk update start\n"); break; case PFSYNC_BUS_END: if (time_uptime - ntohl(bus->endtime) >= sc->sc_ureq_sent) { /* that's it, we're happy */ sc->sc_ureq_sent = 0; sc->sc_bulk_tries = 0; callout_stop(&sc->sc_bulkfail_tmo); if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p) (*carp_demote_adj_p)(-V_pfsync_carp_adj, "pfsync bulk done"); sc->sc_flags |= PFSYNCF_OK; if (V_pf_status.debug >= PF_DEBUG_MISC) printf("pfsync: received valid " "bulk update end\n"); } else { if (V_pf_status.debug >= PF_DEBUG_MISC) printf("pfsync: received invalid " "bulk update end: bad timestamp\n"); } break; } PFSYNC_BUNLOCK(sc); return (len); } static int pfsync_in_tdb(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { int len = count * sizeof(struct pfsync_tdb); #if defined(IPSEC) struct pfsync_tdb *tp; struct mbuf *mp; int offp; int i; int s; mp = m_pulldown(m, offset, len, &offp); if (mp == NULL) { V_pfsyncstats.pfsyncs_badlen++; return (-1); } tp = (struct pfsync_tdb *)(mp->m_data + offp); for (i = 0; i < count; i++) pfsync_update_net_tdb(&tp[i]); #endif return (len); } #if defined(IPSEC) /* Update an in-kernel tdb. Silently fail if no tdb is found. */ static void pfsync_update_net_tdb(struct pfsync_tdb *pt) { struct tdb *tdb; int s; /* check for invalid values */ if (ntohl(pt->spi) <= SPI_RESERVED_MAX || (pt->dst.sa.sa_family != AF_INET && pt->dst.sa.sa_family != AF_INET6)) goto bad; tdb = gettdb(pt->spi, &pt->dst, pt->sproto); if (tdb) { pt->rpl = ntohl(pt->rpl); pt->cur_bytes = (unsigned long long)be64toh(pt->cur_bytes); /* Neither replay nor byte counter should ever decrease. */ if (pt->rpl < tdb->tdb_rpl || pt->cur_bytes < tdb->tdb_cur_bytes) { goto bad; } tdb->tdb_rpl = pt->rpl; tdb->tdb_cur_bytes = pt->cur_bytes; } return; bad: if (V_pf_status.debug >= PF_DEBUG_MISC) printf("pfsync_insert: PFSYNC_ACT_TDB_UPD: " "invalid value\n"); V_pfsyncstats.pfsyncs_badstate++; return; } #endif static int pfsync_in_eof(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { /* check if we are at the right place in the packet */ if (offset != m->m_pkthdr.len) V_pfsyncstats.pfsyncs_badlen++; /* we're done. free and let the caller return */ m_freem(m); return (-1); } static int pfsync_in_error(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count) { V_pfsyncstats.pfsyncs_badact++; m_freem(m); return (-1); } static int pfsyncoutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *rt) { m_freem(m); return (0); } /* ARGSUSED */ static int pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct pfsync_softc *sc = ifp->if_softc; struct ifreq *ifr = (struct ifreq *)data; struct pfsyncreq pfsyncr; int error; switch (cmd) { case SIOCSIFFLAGS: PFSYNC_LOCK(sc); if (ifp->if_flags & IFF_UP) { ifp->if_drv_flags |= IFF_DRV_RUNNING; PFSYNC_UNLOCK(sc); pfsync_pointers_init(); } else { ifp->if_drv_flags &= ~IFF_DRV_RUNNING; PFSYNC_UNLOCK(sc); pfsync_pointers_uninit(); } break; case SIOCSIFMTU: if (!sc->sc_sync_if || ifr->ifr_mtu <= PFSYNC_MINPKT || ifr->ifr_mtu > sc->sc_sync_if->if_mtu) return (EINVAL); if (ifr->ifr_mtu < ifp->if_mtu) { PFSYNC_LOCK(sc); if (sc->sc_len > PFSYNC_MINPKT) pfsync_sendout(1); PFSYNC_UNLOCK(sc); } ifp->if_mtu = ifr->ifr_mtu; break; case SIOCGETPFSYNC: bzero(&pfsyncr, sizeof(pfsyncr)); PFSYNC_LOCK(sc); if (sc->sc_sync_if) { strlcpy(pfsyncr.pfsyncr_syncdev, sc->sc_sync_if->if_xname, IFNAMSIZ); } pfsyncr.pfsyncr_syncpeer = sc->sc_sync_peer; pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates; pfsyncr.pfsyncr_defer = (PFSYNCF_DEFER == (sc->sc_flags & PFSYNCF_DEFER)); PFSYNC_UNLOCK(sc); return (copyout(&pfsyncr, ifr->ifr_data, sizeof(pfsyncr))); case SIOCSETPFSYNC: { struct ip_moptions *imo = &sc->sc_imo; struct ifnet *sifp; struct ip *ip; void *mship = NULL; if ((error = priv_check(curthread, PRIV_NETINET_PF)) != 0) return (error); if ((error = copyin(ifr->ifr_data, &pfsyncr, sizeof(pfsyncr)))) return (error); if (pfsyncr.pfsyncr_maxupdates > 255) return (EINVAL); if (pfsyncr.pfsyncr_syncdev[0] == 0) sifp = NULL; else if ((sifp = ifunit_ref(pfsyncr.pfsyncr_syncdev)) == NULL) return (EINVAL); if (sifp != NULL && ( pfsyncr.pfsyncr_syncpeer.s_addr == 0 || pfsyncr.pfsyncr_syncpeer.s_addr == htonl(INADDR_PFSYNC_GROUP))) mship = malloc((sizeof(struct in_multi *) * IP_MIN_MEMBERSHIPS), M_PFSYNC, M_WAITOK | M_ZERO); PFSYNC_LOCK(sc); if (pfsyncr.pfsyncr_syncpeer.s_addr == 0) sc->sc_sync_peer.s_addr = htonl(INADDR_PFSYNC_GROUP); else sc->sc_sync_peer.s_addr = pfsyncr.pfsyncr_syncpeer.s_addr; sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates; if (pfsyncr.pfsyncr_defer) { sc->sc_flags |= PFSYNCF_DEFER; pfsync_defer_ptr = pfsync_defer; } else { sc->sc_flags &= ~PFSYNCF_DEFER; pfsync_defer_ptr = NULL; } if (sifp == NULL) { if (sc->sc_sync_if) if_rele(sc->sc_sync_if); sc->sc_sync_if = NULL; if (imo->imo_membership) pfsync_multicast_cleanup(sc); PFSYNC_UNLOCK(sc); break; } if (sc->sc_len > PFSYNC_MINPKT && (sifp->if_mtu < sc->sc_ifp->if_mtu || (sc->sc_sync_if != NULL && sifp->if_mtu < sc->sc_sync_if->if_mtu) || sifp->if_mtu < MCLBYTES - sizeof(struct ip))) pfsync_sendout(1); if (imo->imo_membership) pfsync_multicast_cleanup(sc); if (sc->sc_sync_peer.s_addr == htonl(INADDR_PFSYNC_GROUP)) { error = pfsync_multicast_setup(sc, sifp, mship); if (error) { if_rele(sifp); free(mship, M_PFSYNC); return (error); } } if (sc->sc_sync_if) if_rele(sc->sc_sync_if); sc->sc_sync_if = sifp; ip = &sc->sc_template; bzero(ip, sizeof(*ip)); ip->ip_v = IPVERSION; ip->ip_hl = sizeof(sc->sc_template) >> 2; ip->ip_tos = IPTOS_LOWDELAY; /* len and id are set later. */ ip->ip_off = htons(IP_DF); ip->ip_ttl = PFSYNC_DFLTTL; ip->ip_p = IPPROTO_PFSYNC; ip->ip_src.s_addr = INADDR_ANY; ip->ip_dst.s_addr = sc->sc_sync_peer.s_addr; /* Request a full state table update. */ if ((sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p) (*carp_demote_adj_p)(V_pfsync_carp_adj, "pfsync bulk start"); sc->sc_flags &= ~PFSYNCF_OK; if (V_pf_status.debug >= PF_DEBUG_MISC) printf("pfsync: requesting bulk update\n"); pfsync_request_update(0, 0); PFSYNC_UNLOCK(sc); PFSYNC_BLOCK(sc); sc->sc_ureq_sent = time_uptime; callout_reset(&sc->sc_bulkfail_tmo, 5 * hz, pfsync_bulk_fail, sc); PFSYNC_BUNLOCK(sc); break; } default: return (ENOTTY); } return (0); } static void pfsync_out_state(struct pf_state *st, void *buf) { struct pfsync_state *sp = buf; pfsync_state_export(sp, st); } static void pfsync_out_iack(struct pf_state *st, void *buf) { struct pfsync_ins_ack *iack = buf; iack->id = st->id; iack->creatorid = st->creatorid; } static void pfsync_out_upd_c(struct pf_state *st, void *buf) { struct pfsync_upd_c *up = buf; bzero(up, sizeof(*up)); up->id = st->id; pf_state_peer_hton(&st->src, &up->src); pf_state_peer_hton(&st->dst, &up->dst); up->creatorid = st->creatorid; up->timeout = st->timeout; } static void pfsync_out_del(struct pf_state *st, void *buf) { struct pfsync_del_c *dp = buf; dp->id = st->id; dp->creatorid = st->creatorid; st->state_flags |= PFSTATE_NOSYNC; } static void pfsync_drop(struct pfsync_softc *sc) { struct pf_state *st, *next; struct pfsync_upd_req_item *ur; int q; for (q = 0; q < PFSYNC_S_COUNT; q++) { if (TAILQ_EMPTY(&sc->sc_qs[q])) continue; TAILQ_FOREACH_SAFE(st, &sc->sc_qs[q], sync_list, next) { KASSERT(st->sync_state == q, ("%s: st->sync_state == q", __func__)); st->sync_state = PFSYNC_S_NONE; pf_release_state(st); } TAILQ_INIT(&sc->sc_qs[q]); } while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) { TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry); free(ur, M_PFSYNC); } sc->sc_plus = NULL; sc->sc_len = PFSYNC_MINPKT; } static void pfsync_sendout(int schedswi) { struct pfsync_softc *sc = V_pfsyncif; struct ifnet *ifp = sc->sc_ifp; struct mbuf *m; struct ip *ip; struct pfsync_header *ph; struct pfsync_subheader *subh; struct pf_state *st; struct pfsync_upd_req_item *ur; int offset; int q, count = 0; KASSERT(sc != NULL, ("%s: null sc", __func__)); KASSERT(sc->sc_len > PFSYNC_MINPKT, ("%s: sc_len %zu", __func__, sc->sc_len)); PFSYNC_LOCK_ASSERT(sc); if (ifp->if_bpf == NULL && sc->sc_sync_if == NULL) { pfsync_drop(sc); return; } m = m_get2(max_linkhdr + sc->sc_len, M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) { if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); V_pfsyncstats.pfsyncs_onomem++; return; } m->m_data += max_linkhdr; m->m_len = m->m_pkthdr.len = sc->sc_len; /* build the ip header */ ip = (struct ip *)m->m_data; bcopy(&sc->sc_template, ip, sizeof(*ip)); offset = sizeof(*ip); ip->ip_len = htons(m->m_pkthdr.len); ip->ip_id = htons(ip_randomid()); /* build the pfsync header */ ph = (struct pfsync_header *)(m->m_data + offset); bzero(ph, sizeof(*ph)); offset += sizeof(*ph); ph->version = PFSYNC_VERSION; ph->len = htons(sc->sc_len - sizeof(*ip)); bcopy(V_pf_status.pf_chksum, ph->pfcksum, PF_MD5_DIGEST_LENGTH); /* walk the queues */ for (q = 0; q < PFSYNC_S_COUNT; q++) { if (TAILQ_EMPTY(&sc->sc_qs[q])) continue; subh = (struct pfsync_subheader *)(m->m_data + offset); offset += sizeof(*subh); count = 0; TAILQ_FOREACH(st, &sc->sc_qs[q], sync_list) { KASSERT(st->sync_state == q, ("%s: st->sync_state == q", __func__)); /* * XXXGL: some of write methods do unlocked reads * of state data :( */ pfsync_qs[q].write(st, m->m_data + offset); offset += pfsync_qs[q].len; st->sync_state = PFSYNC_S_NONE; pf_release_state(st); count++; } TAILQ_INIT(&sc->sc_qs[q]); bzero(subh, sizeof(*subh)); subh->action = pfsync_qs[q].action; subh->count = htons(count); V_pfsyncstats.pfsyncs_oacts[pfsync_qs[q].action] += count; } if (!TAILQ_EMPTY(&sc->sc_upd_req_list)) { subh = (struct pfsync_subheader *)(m->m_data + offset); offset += sizeof(*subh); count = 0; while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) { TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry); bcopy(&ur->ur_msg, m->m_data + offset, sizeof(ur->ur_msg)); offset += sizeof(ur->ur_msg); free(ur, M_PFSYNC); count++; } bzero(subh, sizeof(*subh)); subh->action = PFSYNC_ACT_UPD_REQ; subh->count = htons(count); V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_UPD_REQ] += count; } /* has someone built a custom region for us to add? */ if (sc->sc_plus != NULL) { bcopy(sc->sc_plus, m->m_data + offset, sc->sc_pluslen); offset += sc->sc_pluslen; sc->sc_plus = NULL; } subh = (struct pfsync_subheader *)(m->m_data + offset); offset += sizeof(*subh); bzero(subh, sizeof(*subh)); subh->action = PFSYNC_ACT_EOF; subh->count = htons(1); V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_EOF]++; /* we're done, let's put it on the wire */ if (ifp->if_bpf) { m->m_data += sizeof(*ip); m->m_len = m->m_pkthdr.len = sc->sc_len - sizeof(*ip); BPF_MTAP(ifp, m); m->m_data -= sizeof(*ip); m->m_len = m->m_pkthdr.len = sc->sc_len; } if (sc->sc_sync_if == NULL) { sc->sc_len = PFSYNC_MINPKT; m_freem(m); return; } if_inc_counter(sc->sc_ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(sc->sc_ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len); sc->sc_len = PFSYNC_MINPKT; if (!_IF_QFULL(&sc->sc_ifp->if_snd)) _IF_ENQUEUE(&sc->sc_ifp->if_snd, m); else { m_freem(m); if_inc_counter(sc->sc_ifp, IFCOUNTER_OQDROPS, 1); } if (schedswi) swi_sched(V_pfsync_swi_cookie, 0); } static void pfsync_insert_state(struct pf_state *st) { struct pfsync_softc *sc = V_pfsyncif; if (st->state_flags & PFSTATE_NOSYNC) return; if ((st->rule.ptr->rule_flag & PFRULE_NOSYNC) || st->key[PF_SK_WIRE]->proto == IPPROTO_PFSYNC) { st->state_flags |= PFSTATE_NOSYNC; return; } KASSERT(st->sync_state == PFSYNC_S_NONE, ("%s: st->sync_state %u", __func__, st->sync_state)); PFSYNC_LOCK(sc); if (sc->sc_len == PFSYNC_MINPKT) callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif); pfsync_q_ins(st, PFSYNC_S_INS); PFSYNC_UNLOCK(sc); st->sync_updates = 0; } static int pfsync_defer(struct pf_state *st, struct mbuf *m) { struct pfsync_softc *sc = V_pfsyncif; struct pfsync_deferral *pd; if (m->m_flags & (M_BCAST|M_MCAST)) return (0); PFSYNC_LOCK(sc); if (sc == NULL || !(sc->sc_ifp->if_flags & IFF_DRV_RUNNING) || !(sc->sc_flags & PFSYNCF_DEFER)) { PFSYNC_UNLOCK(sc); return (0); } if (sc->sc_deferred >= 128) pfsync_undefer(TAILQ_FIRST(&sc->sc_deferrals), 0); pd = malloc(sizeof(*pd), M_PFSYNC, M_NOWAIT); if (pd == NULL) return (0); sc->sc_deferred++; m->m_flags |= M_SKIP_FIREWALL; st->state_flags |= PFSTATE_ACK; pd->pd_sc = sc; pd->pd_refs = 0; pd->pd_st = st; pf_ref_state(st); pd->pd_m = m; TAILQ_INSERT_TAIL(&sc->sc_deferrals, pd, pd_entry); callout_init_mtx(&pd->pd_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED); callout_reset(&pd->pd_tmo, 10, pfsync_defer_tmo, pd); pfsync_push(sc); return (1); } static void pfsync_undefer(struct pfsync_deferral *pd, int drop) { struct pfsync_softc *sc = pd->pd_sc; struct mbuf *m = pd->pd_m; struct pf_state *st = pd->pd_st; PFSYNC_LOCK_ASSERT(sc); TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry); sc->sc_deferred--; pd->pd_st->state_flags &= ~PFSTATE_ACK; /* XXX: locking! */ free(pd, M_PFSYNC); pf_release_state(st); if (drop) m_freem(m); else { _IF_ENQUEUE(&sc->sc_ifp->if_snd, m); pfsync_push(sc); } } static void pfsync_defer_tmo(void *arg) { struct pfsync_deferral *pd = arg; struct pfsync_softc *sc = pd->pd_sc; struct mbuf *m = pd->pd_m; struct pf_state *st = pd->pd_st; PFSYNC_LOCK_ASSERT(sc); CURVNET_SET(m->m_pkthdr.rcvif->if_vnet); TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry); sc->sc_deferred--; pd->pd_st->state_flags &= ~PFSTATE_ACK; /* XXX: locking! */ if (pd->pd_refs == 0) free(pd, M_PFSYNC); PFSYNC_UNLOCK(sc); ip_output(m, NULL, NULL, 0, NULL, NULL); pf_release_state(st); CURVNET_RESTORE(); } static void pfsync_undefer_state(struct pf_state *st, int drop) { struct pfsync_softc *sc = V_pfsyncif; struct pfsync_deferral *pd; PFSYNC_LOCK_ASSERT(sc); TAILQ_FOREACH(pd, &sc->sc_deferrals, pd_entry) { if (pd->pd_st == st) { if (callout_stop(&pd->pd_tmo)) pfsync_undefer(pd, drop); return; } } panic("%s: unable to find deferred state", __func__); } static void pfsync_update_state(struct pf_state *st) { struct pfsync_softc *sc = V_pfsyncif; int sync = 0; PF_STATE_LOCK_ASSERT(st); PFSYNC_LOCK(sc); if (st->state_flags & PFSTATE_ACK) pfsync_undefer_state(st, 0); if (st->state_flags & PFSTATE_NOSYNC) { if (st->sync_state != PFSYNC_S_NONE) pfsync_q_del(st); PFSYNC_UNLOCK(sc); return; } if (sc->sc_len == PFSYNC_MINPKT) callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif); switch (st->sync_state) { case PFSYNC_S_UPD_C: case PFSYNC_S_UPD: case PFSYNC_S_INS: /* we're already handling it */ if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) { st->sync_updates++; if (st->sync_updates >= sc->sc_maxupdates) sync = 1; } break; case PFSYNC_S_IACK: pfsync_q_del(st); case PFSYNC_S_NONE: pfsync_q_ins(st, PFSYNC_S_UPD_C); st->sync_updates = 0; break; default: panic("%s: unexpected sync state %d", __func__, st->sync_state); } if (sync || (time_uptime - st->pfsync_time) < 2) pfsync_push(sc); PFSYNC_UNLOCK(sc); } static void pfsync_request_update(u_int32_t creatorid, u_int64_t id) { struct pfsync_softc *sc = V_pfsyncif; struct pfsync_upd_req_item *item; size_t nlen = sizeof(struct pfsync_upd_req); PFSYNC_LOCK_ASSERT(sc); /* * This code does a bit to prevent multiple update requests for the * same state being generated. It searches current subheader queue, * but it doesn't lookup into queue of already packed datagrams. */ TAILQ_FOREACH(item, &sc->sc_upd_req_list, ur_entry) if (item->ur_msg.id == id && item->ur_msg.creatorid == creatorid) return; item = malloc(sizeof(*item), M_PFSYNC, M_NOWAIT); if (item == NULL) return; /* XXX stats */ item->ur_msg.id = id; item->ur_msg.creatorid = creatorid; if (TAILQ_EMPTY(&sc->sc_upd_req_list)) nlen += sizeof(struct pfsync_subheader); if (sc->sc_len + nlen > sc->sc_ifp->if_mtu) { pfsync_sendout(1); nlen = sizeof(struct pfsync_subheader) + sizeof(struct pfsync_upd_req); } TAILQ_INSERT_TAIL(&sc->sc_upd_req_list, item, ur_entry); sc->sc_len += nlen; } static void pfsync_update_state_req(struct pf_state *st) { struct pfsync_softc *sc = V_pfsyncif; PF_STATE_LOCK_ASSERT(st); PFSYNC_LOCK(sc); if (st->state_flags & PFSTATE_NOSYNC) { if (st->sync_state != PFSYNC_S_NONE) pfsync_q_del(st); PFSYNC_UNLOCK(sc); return; } switch (st->sync_state) { case PFSYNC_S_UPD_C: case PFSYNC_S_IACK: pfsync_q_del(st); case PFSYNC_S_NONE: pfsync_q_ins(st, PFSYNC_S_UPD); pfsync_push(sc); break; case PFSYNC_S_INS: case PFSYNC_S_UPD: case PFSYNC_S_DEL: /* we're already handling it */ break; default: panic("%s: unexpected sync state %d", __func__, st->sync_state); } PFSYNC_UNLOCK(sc); } static void pfsync_delete_state(struct pf_state *st) { struct pfsync_softc *sc = V_pfsyncif; PFSYNC_LOCK(sc); if (st->state_flags & PFSTATE_ACK) pfsync_undefer_state(st, 1); if (st->state_flags & PFSTATE_NOSYNC) { if (st->sync_state != PFSYNC_S_NONE) pfsync_q_del(st); PFSYNC_UNLOCK(sc); return; } if (sc->sc_len == PFSYNC_MINPKT) callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif); switch (st->sync_state) { case PFSYNC_S_INS: /* We never got to tell the world so just forget about it. */ pfsync_q_del(st); break; case PFSYNC_S_UPD_C: case PFSYNC_S_UPD: case PFSYNC_S_IACK: pfsync_q_del(st); /* FALLTHROUGH to putting it on the del list */ case PFSYNC_S_NONE: pfsync_q_ins(st, PFSYNC_S_DEL); break; default: panic("%s: unexpected sync state %d", __func__, st->sync_state); } PFSYNC_UNLOCK(sc); } static void pfsync_clear_states(u_int32_t creatorid, const char *ifname) { struct pfsync_softc *sc = V_pfsyncif; struct { struct pfsync_subheader subh; struct pfsync_clr clr; } __packed r; bzero(&r, sizeof(r)); r.subh.action = PFSYNC_ACT_CLR; r.subh.count = htons(1); V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_CLR]++; strlcpy(r.clr.ifname, ifname, sizeof(r.clr.ifname)); r.clr.creatorid = creatorid; PFSYNC_LOCK(sc); pfsync_send_plus(&r, sizeof(r)); PFSYNC_UNLOCK(sc); } static void pfsync_q_ins(struct pf_state *st, int q) { struct pfsync_softc *sc = V_pfsyncif; size_t nlen = pfsync_qs[q].len; PFSYNC_LOCK_ASSERT(sc); KASSERT(st->sync_state == PFSYNC_S_NONE, ("%s: st->sync_state %u", __func__, st->sync_state)); KASSERT(sc->sc_len >= PFSYNC_MINPKT, ("pfsync pkt len is too low %zu", sc->sc_len)); if (TAILQ_EMPTY(&sc->sc_qs[q])) nlen += sizeof(struct pfsync_subheader); if (sc->sc_len + nlen > sc->sc_ifp->if_mtu) { pfsync_sendout(1); nlen = sizeof(struct pfsync_subheader) + pfsync_qs[q].len; } sc->sc_len += nlen; TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list); st->sync_state = q; pf_ref_state(st); } static void pfsync_q_del(struct pf_state *st) { struct pfsync_softc *sc = V_pfsyncif; int q = st->sync_state; PFSYNC_LOCK_ASSERT(sc); KASSERT(st->sync_state != PFSYNC_S_NONE, ("%s: st->sync_state != PFSYNC_S_NONE", __func__)); sc->sc_len -= pfsync_qs[q].len; TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list); st->sync_state = PFSYNC_S_NONE; pf_release_state(st); if (TAILQ_EMPTY(&sc->sc_qs[q])) sc->sc_len -= sizeof(struct pfsync_subheader); } static void pfsync_bulk_start(void) { struct pfsync_softc *sc = V_pfsyncif; if (V_pf_status.debug >= PF_DEBUG_MISC) printf("pfsync: received bulk update request\n"); PFSYNC_BLOCK(sc); sc->sc_ureq_received = time_uptime; sc->sc_bulk_hashid = 0; sc->sc_bulk_stateid = 0; pfsync_bulk_status(PFSYNC_BUS_START); callout_reset(&sc->sc_bulk_tmo, 1, pfsync_bulk_update, sc); PFSYNC_BUNLOCK(sc); } static void pfsync_bulk_update(void *arg) { struct pfsync_softc *sc = arg; struct pf_state *s; int i, sent = 0; PFSYNC_BLOCK_ASSERT(sc); CURVNET_SET(sc->sc_ifp->if_vnet); /* * Start with last state from previous invocation. * It may had gone, in this case start from the * hash slot. */ s = pf_find_state_byid(sc->sc_bulk_stateid, sc->sc_bulk_creatorid); if (s != NULL) i = PF_IDHASH(s); else i = sc->sc_bulk_hashid; for (; i <= pf_hashmask; i++) { struct pf_idhash *ih = &V_pf_idhash[i]; if (s != NULL) PF_HASHROW_ASSERT(ih); else { PF_HASHROW_LOCK(ih); s = LIST_FIRST(&ih->states); } for (; s; s = LIST_NEXT(s, entry)) { if (sent > 1 && (sc->sc_ifp->if_mtu - sc->sc_len) < sizeof(struct pfsync_state)) { /* We've filled a packet. */ sc->sc_bulk_hashid = i; sc->sc_bulk_stateid = s->id; sc->sc_bulk_creatorid = s->creatorid; PF_HASHROW_UNLOCK(ih); callout_reset(&sc->sc_bulk_tmo, 1, pfsync_bulk_update, sc); goto full; } if (s->sync_state == PFSYNC_S_NONE && s->timeout < PFTM_MAX && s->pfsync_time <= sc->sc_ureq_received) { pfsync_update_state_req(s); sent++; } } PF_HASHROW_UNLOCK(ih); } /* We're done. */ pfsync_bulk_status(PFSYNC_BUS_END); full: CURVNET_RESTORE(); } static void pfsync_bulk_status(u_int8_t status) { struct { struct pfsync_subheader subh; struct pfsync_bus bus; } __packed r; struct pfsync_softc *sc = V_pfsyncif; bzero(&r, sizeof(r)); r.subh.action = PFSYNC_ACT_BUS; r.subh.count = htons(1); V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_BUS]++; r.bus.creatorid = V_pf_status.hostid; r.bus.endtime = htonl(time_uptime - sc->sc_ureq_received); r.bus.status = status; PFSYNC_LOCK(sc); pfsync_send_plus(&r, sizeof(r)); PFSYNC_UNLOCK(sc); } static void pfsync_bulk_fail(void *arg) { struct pfsync_softc *sc = arg; CURVNET_SET(sc->sc_ifp->if_vnet); PFSYNC_BLOCK_ASSERT(sc); if (sc->sc_bulk_tries++ < PFSYNC_MAX_BULKTRIES) { /* Try again */ callout_reset(&sc->sc_bulkfail_tmo, 5 * hz, pfsync_bulk_fail, V_pfsyncif); PFSYNC_LOCK(sc); pfsync_request_update(0, 0); PFSYNC_UNLOCK(sc); } else { /* Pretend like the transfer was ok. */ sc->sc_ureq_sent = 0; sc->sc_bulk_tries = 0; PFSYNC_LOCK(sc); if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p) (*carp_demote_adj_p)(-V_pfsync_carp_adj, "pfsync bulk fail"); sc->sc_flags |= PFSYNCF_OK; PFSYNC_UNLOCK(sc); if (V_pf_status.debug >= PF_DEBUG_MISC) printf("pfsync: failed to receive bulk update\n"); } CURVNET_RESTORE(); } static void pfsync_send_plus(void *plus, size_t pluslen) { struct pfsync_softc *sc = V_pfsyncif; PFSYNC_LOCK_ASSERT(sc); if (sc->sc_len + pluslen > sc->sc_ifp->if_mtu) pfsync_sendout(1); sc->sc_plus = plus; sc->sc_len += (sc->sc_pluslen = pluslen); pfsync_sendout(1); } static void pfsync_timeout(void *arg) { struct pfsync_softc *sc = arg; CURVNET_SET(sc->sc_ifp->if_vnet); PFSYNC_LOCK(sc); pfsync_push(sc); PFSYNC_UNLOCK(sc); CURVNET_RESTORE(); } static void pfsync_push(struct pfsync_softc *sc) { PFSYNC_LOCK_ASSERT(sc); sc->sc_flags |= PFSYNCF_PUSH; swi_sched(V_pfsync_swi_cookie, 0); } static void pfsyncintr(void *arg) { struct pfsync_softc *sc = arg; struct mbuf *m, *n; CURVNET_SET(sc->sc_ifp->if_vnet); PFSYNC_LOCK(sc); if ((sc->sc_flags & PFSYNCF_PUSH) && sc->sc_len > PFSYNC_MINPKT) { pfsync_sendout(0); sc->sc_flags &= ~PFSYNCF_PUSH; } _IF_DEQUEUE_ALL(&sc->sc_ifp->if_snd, m); PFSYNC_UNLOCK(sc); for (; m != NULL; m = n) { n = m->m_nextpkt; m->m_nextpkt = NULL; /* * We distinguish between a deferral packet and our * own pfsync packet based on M_SKIP_FIREWALL * flag. This is XXX. */ if (m->m_flags & M_SKIP_FIREWALL) ip_output(m, NULL, NULL, 0, NULL, NULL); else if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo, NULL) == 0) V_pfsyncstats.pfsyncs_opackets++; else V_pfsyncstats.pfsyncs_oerrors++; } CURVNET_RESTORE(); } static int pfsync_multicast_setup(struct pfsync_softc *sc, struct ifnet *ifp, void *mship) { struct ip_moptions *imo = &sc->sc_imo; int error; if (!(ifp->if_flags & IFF_MULTICAST)) return (EADDRNOTAVAIL); imo->imo_membership = (struct in_multi **)mship; imo->imo_max_memberships = IP_MIN_MEMBERSHIPS; imo->imo_multicast_vif = -1; if ((error = in_joingroup(ifp, &sc->sc_sync_peer, NULL, &imo->imo_membership[0])) != 0) { imo->imo_membership = NULL; return (error); } imo->imo_num_memberships++; imo->imo_multicast_ifp = ifp; imo->imo_multicast_ttl = PFSYNC_DFLTTL; imo->imo_multicast_loop = 0; return (0); } static void pfsync_multicast_cleanup(struct pfsync_softc *sc) { struct ip_moptions *imo = &sc->sc_imo; in_leavegroup(imo->imo_membership[0], NULL); free(imo->imo_membership, M_PFSYNC); imo->imo_membership = NULL; imo->imo_multicast_ifp = NULL; } #ifdef INET extern struct domain inetdomain; static struct protosw in_pfsync_protosw = { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_PFSYNC, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = pfsync_input, .pr_output = rip_output, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }; #endif static void pfsync_pointers_init() { PF_RULES_WLOCK(); pfsync_state_import_ptr = pfsync_state_import; pfsync_insert_state_ptr = pfsync_insert_state; pfsync_update_state_ptr = pfsync_update_state; pfsync_delete_state_ptr = pfsync_delete_state; pfsync_clear_states_ptr = pfsync_clear_states; pfsync_defer_ptr = pfsync_defer; PF_RULES_WUNLOCK(); } static void pfsync_pointers_uninit() { PF_RULES_WLOCK(); pfsync_state_import_ptr = NULL; pfsync_insert_state_ptr = NULL; pfsync_update_state_ptr = NULL; pfsync_delete_state_ptr = NULL; pfsync_clear_states_ptr = NULL; pfsync_defer_ptr = NULL; PF_RULES_WUNLOCK(); } static int pfsync_init() { VNET_ITERATOR_DECL(vnet_iter); int error = 0; VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); V_pfsync_cloner = if_clone_simple(pfsyncname, pfsync_clone_create, pfsync_clone_destroy, 1); error = swi_add(NULL, pfsyncname, pfsyncintr, V_pfsyncif, SWI_NET, INTR_MPSAFE, &V_pfsync_swi_cookie); CURVNET_RESTORE(); if (error) goto fail_locked; } VNET_LIST_RUNLOCK(); #ifdef INET error = pf_proto_register(PF_INET, &in_pfsync_protosw); if (error) goto fail; error = ipproto_register(IPPROTO_PFSYNC); if (error) { pf_proto_unregister(PF_INET, IPPROTO_PFSYNC, SOCK_RAW); goto fail; } #endif pfsync_pointers_init(); return (0); fail: VNET_LIST_RLOCK(); fail_locked: VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); if (V_pfsync_swi_cookie) { swi_remove(V_pfsync_swi_cookie); if_clone_detach(V_pfsync_cloner); } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); return (error); } static void pfsync_uninit() { VNET_ITERATOR_DECL(vnet_iter); pfsync_pointers_uninit(); ipproto_unregister(IPPROTO_PFSYNC); pf_proto_unregister(PF_INET, IPPROTO_PFSYNC, SOCK_RAW); VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); if_clone_detach(V_pfsync_cloner); swi_remove(V_pfsync_swi_cookie); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); } static int pfsync_modevent(module_t mod, int type, void *data) { int error = 0; switch (type) { case MOD_LOAD: error = pfsync_init(); break; case MOD_QUIESCE: /* * Module should not be unloaded due to race conditions. */ error = EBUSY; break; case MOD_UNLOAD: pfsync_uninit(); break; default: error = EINVAL; break; } return (error); } static moduledata_t pfsync_mod = { pfsyncname, pfsync_modevent, 0 }; #define PFSYNC_MODVER 1 DECLARE_MODULE(pfsync, pfsync_mod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); MODULE_VERSION(pfsync, PFSYNC_MODVER); MODULE_DEPEND(pfsync, pf, PF_MODVER, PF_MODVER, PF_MODVER); Index: user/dchagin/lemul/sys =================================================================== --- user/dchagin/lemul/sys (revision 274240) +++ user/dchagin/lemul/sys (revision 274241) Property changes on: user/dchagin/lemul/sys ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sys:r274221-274240 Index: user/dchagin/lemul =================================================================== --- user/dchagin/lemul (revision 274240) +++ user/dchagin/lemul (revision 274241) Property changes on: user/dchagin/lemul ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r274221-274240