diff --git a/sys/conf/options b/sys/conf/options index 276c6c5067f7..ba1efff080b7 100644 --- a/sys/conf/options +++ b/sys/conf/options @@ -1,1026 +1,1027 @@ # # On the handling of kernel options # # All kernel options should be listed in NOTES, with suitable # descriptions. Negative options (options that make some code not # compile) should be commented out; LINT (generated from NOTES) should # compile as much code as possible. Try to structure option-using # code so that a single option only switches code on, or only switches # code off, to make it possible to have a full compile-test. If # necessary, you can check for COMPILING_LINT to get maximum code # coverage. # # All new options shall also be listed in either "conf/options" or # "conf/options.<machine>". Options that affect a single source-file # <xxx>.[c|s] should be directed into "opt_<xxx>.h", while options # that affect multiple files should either go in "opt_global.h" if # this is a kernel-wide option (used just about everywhere), or in # "opt_<keyword>.h" if it affects only some files. # Note that the effect of listing only an option without a # header-file-name in conf/options (and cousins) is that the last # convention is followed. # # This handling scheme is not yet fully implemented. # # # Format of this file: # Option name filename # # If filename is missing, the default is # opt_<name of option in lower case>.h AAC_DEBUG opt_aac.h AACRAID_DEBUG opt_aacraid.h AHC_ALLOW_MEMIO opt_aic7xxx.h AHC_TMODE_ENABLE opt_aic7xxx.h AHC_DUMP_EEPROM opt_aic7xxx.h AHC_DEBUG opt_aic7xxx.h AHC_DEBUG_OPTS opt_aic7xxx.h AHC_REG_PRETTY_PRINT opt_aic7xxx.h AHD_DEBUG opt_aic79xx.h AHD_DEBUG_OPTS opt_aic79xx.h AHD_TMODE_ENABLE opt_aic79xx.h AHD_REG_PRETTY_PRINT opt_aic79xx.h # Debugging options. ALT_BREAK_TO_DEBUGGER opt_kdb.h BREAK_TO_DEBUGGER opt_kdb.h BUF_TRACKING opt_global.h DDB DDB_BUFR_SIZE opt_ddb.h DDB_CAPTURE_DEFAULTBUFSIZE opt_ddb.h DDB_CAPTURE_MAXBUFSIZE opt_ddb.h DDB_CTF opt_ddb.h DDB_NUMSYM opt_ddb.h EARLY_PRINTF opt_global.h FULL_BUF_TRACKING opt_global.h GDB KDB opt_global.h KDB_TRACE opt_kdb.h KDB_UNATTENDED opt_kdb.h KLD_DEBUG opt_kld.h NUM_CORE_FILES opt_global.h QUEUE_MACRO_DEBUG_TRACE opt_global.h QUEUE_MACRO_DEBUG_TRASH opt_global.h SYSCTL_DEBUG opt_sysctl.h TEXTDUMP_PREFERRED opt_ddb.h TEXTDUMP_VERBOSE opt_ddb.h TSLOG opt_global.h TSLOG_PAGEZERO opt_global.h TSLOGSIZE opt_global.h # Miscellaneous options.
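For readers new to this file, here is a minimal sketch of how an entry in this table is consumed; the option FOO_DEBUG, its header opt_foo.h, and the FOO_DPRINTF wrapper are hypothetical, chosen only to illustrate the mechanism. Given "options FOO_DEBUG" in a kernel config, config(8) emits an opt_foo.h defining the macro ("#define FOO_DEBUG 1", or e.g. "#define HZ 1000" for a valued option), and an empty header when the option is absent, so only the files that include opt_foo.h need rebuilding when the option changes.

/* Hypothetical consumer of a "FOO_DEBUG opt_foo.h" entry. */
#include "opt_foo.h"	/* generated by config(8); empty without the option */

#include <sys/param.h>
#include <sys/systm.h>	/* printf(9) */

#ifdef FOO_DEBUG
#define	FOO_DPRINTF(fmt, ...)	printf("foo: " fmt, ##__VA_ARGS__)
#else
#define	FOO_DPRINTF(fmt, ...)	do { } while (0)
#endif

This is also why kernel-wide options land in opt_global.h: nearly every file depends on them anyway. The miscellaneous options below follow the same scheme.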
ALQ ALTERA_SDCARD_FAST_SIM opt_altera_sdcard.h ATSE_CFI_HACK opt_cfi.h AUDIT opt_global.h BOOTHOWTO opt_global.h BOOTVERBOSE opt_global.h CALLOUT_PROFILING CAPABILITIES opt_capsicum.h CAPABILITY_MODE opt_capsicum.h CC_CDG opt_global.h CC_CHD opt_global.h CC_CUBIC opt_global.h CC_DEFAULT opt_cc.h CC_DCTCP opt_global.h CC_HD opt_global.h CC_HTCP opt_global.h CC_NEWRENO opt_global.h CC_VEGAS opt_global.h COMPAT_43 opt_global.h COMPAT_43TTY opt_global.h COMPAT_FREEBSD4 opt_global.h COMPAT_FREEBSD5 opt_global.h COMPAT_FREEBSD6 opt_global.h COMPAT_FREEBSD7 opt_global.h COMPAT_FREEBSD9 opt_global.h COMPAT_FREEBSD10 opt_global.h COMPAT_FREEBSD11 opt_global.h COMPAT_FREEBSD12 opt_global.h COMPAT_FREEBSD13 opt_global.h COMPAT_FREEBSD14 opt_global.h COMPAT_LINUXKPI opt_dontuse.h COMPILING_LINT opt_global.h CY_PCI_FASTINTR DEADLKRES opt_watchdog.h EXPERIMENTAL opt_global.h DIRECTIO FILEMON opt_dontuse.h FFCLOCK FULL_PREEMPTION opt_sched.h GZIO opt_gzio.h IMGACT_BINMISC opt_dontuse.h IPI_PREEMPTION opt_sched.h GEOM_BDE opt_geom.h GEOM_CACHE opt_geom.h GEOM_CONCAT opt_geom.h GEOM_ELI opt_geom.h GEOM_GATE opt_geom.h GEOM_JOURNAL opt_geom.h GEOM_LABEL opt_geom.h GEOM_LABEL_GPT opt_geom.h GEOM_LINUX_LVM opt_geom.h GEOM_MAP opt_geom.h GEOM_MIRROR opt_geom.h GEOM_MOUNTVER opt_geom.h GEOM_MULTIPATH opt_geom.h GEOM_NOP opt_geom.h GEOM_PART_APM opt_geom.h GEOM_PART_BSD opt_geom.h GEOM_PART_BSD64 opt_geom.h GEOM_PART_EBR opt_geom.h GEOM_PART_GPT opt_geom.h GEOM_PART_LDM opt_geom.h GEOM_PART_MBR opt_geom.h GEOM_RAID opt_geom.h GEOM_RAID3 opt_geom.h GEOM_SHSEC opt_geom.h GEOM_STRIPE opt_geom.h GEOM_UZIP opt_geom.h GEOM_UZIP_DEBUG opt_geom.h GEOM_VINUM opt_geom.h GEOM_VIRSTOR opt_geom.h GEOM_ZERO opt_geom.h IFLIB opt_iflib.h KDTRACE_HOOKS opt_global.h KDTRACE_FRAME opt_kdtrace.h +KDTRACE_NO_MIB_SDT opt_global.h KN_HASHSIZE opt_kqueue.h KSTACK_MAX_PAGES KSTACK_PAGES KSTACK_USAGE_PROF KTRACE KTRACE_REQUEST_POOL opt_ktrace.h LIBICONV MAC opt_global.h MAC_BIBA opt_dontuse.h MAC_BSDEXTENDED opt_dontuse.h MAC_DDB opt_dontuse.h MAC_DEBUG opt_mac.h MAC_IFOFF opt_dontuse.h MAC_IPACL opt_dontuse.h MAC_LOMAC opt_dontuse.h MAC_MLS opt_dontuse.h MAC_NONE opt_dontuse.h MAC_NTPD opt_dontuse.h MAC_PARTITION opt_dontuse.h MAC_PORTACL opt_dontuse.h MAC_PRIORITY opt_dontuse.h MAC_SEEOTHERUIDS opt_dontuse.h MAC_STATIC opt_mac.h MAC_STUB opt_dontuse.h MAC_TEST opt_dontuse.h MAC_GRANTBYLABEL opt_dontuse.h MAC_VERIEXEC opt_dontuse.h MAC_VERIEXEC_DEBUG opt_mac.h MAC_VERIEXEC_SHA1 opt_dontuse.h MAC_VERIEXEC_SHA256 opt_dontuse.h MAC_VERIEXEC_SHA384 opt_dontuse.h MAC_VERIEXEC_SHA512 opt_dontuse.h MD_ROOT opt_md.h MD_ROOT_FSTYPE opt_md.h MD_ROOT_READONLY opt_md.h MD_ROOT_SIZE opt_md.h MD_ROOT_MEM opt_md.h MFI_DEBUG opt_mfi.h MFI_DECODE_LOG opt_mfi.h MPROF_BUFFERS opt_mprof.h MPROF_HASH_SIZE opt_mprof.h NEW_PCIB opt_global.h NO_ADAPTIVE_MUTEXES opt_adaptive_mutexes.h NO_ADAPTIVE_RWLOCKS NO_ADAPTIVE_SX NO_OBSOLETE_CODE opt_global.h NO_SYSCTL_DESCR opt_global.h NSWBUF_MIN opt_param.h MBUF_PACKET_ZONE_DISABLE opt_global.h PANIC_REBOOT_WAIT_TIME opt_panic.h PCI_HP opt_pci.h PCI_IOV opt_global.h PPC_DEBUG opt_ppc.h PPC_PROBE_CHIPSET opt_ppc.h PPS_SYNC opt_ntp.h PREEMPTION opt_sched.h QUOTA SCHED_4BSD opt_sched.h SCHED_STATS opt_sched.h SCHED_ULE opt_sched.h SLEEPQUEUE_PROFILING SLHCI_DEBUG opt_slhci.h STACK opt_stack.h SUIDDIR MSGMNB opt_sysvipc.h MSGMNI opt_sysvipc.h MSGSEG opt_sysvipc.h MSGSSZ opt_sysvipc.h MSGTQL opt_sysvipc.h SEMMNI opt_sysvipc.h SEMMNS opt_sysvipc.h SEMMNU opt_sysvipc.h SEMMSL opt_sysvipc.h SEMOPM opt_sysvipc.h SEMUME 
opt_sysvipc.h SHMALL opt_sysvipc.h SHMMAX opt_sysvipc.h SHMMAXPGS opt_sysvipc.h SHMMIN opt_sysvipc.h SHMMNI opt_sysvipc.h SHMSEG opt_sysvipc.h SYSVMSG opt_sysvipc.h SYSVSEM opt_sysvipc.h SYSVSHM opt_sysvipc.h SW_WATCHDOG opt_watchdog.h TCPHPTS TCP_REQUEST_TRK opt_global.h TCP_ACCOUNTING opt_global.h TCP_BBR opt_inet.h TCP_RACK opt_inet.h # # TCP SaD Detection is an experimental Sack attack Detection (SaD) # algorithm that uses "normal" behaviour with SACKs to detect # a possible attack. It is strictly experimental at this point. # TCP_SAD_DETECTION opt_inet.h TURNSTILE_PROFILING UMTX_PROFILING UMTX_CHAINS opt_global.h VERBOSE_SYSINIT ZSTDIO opt_zstdio.h # Sanitizers COVERAGE opt_global.h KASAN opt_global.h KCOV KCSAN opt_global.h KMSAN opt_global.h KUBSAN opt_global.h # POSIX kernel options P1003_1B_MQUEUE opt_posix.h P1003_1B_SEMAPHORES opt_posix.h _KPOSIX_PRIORITY_SCHEDULING opt_posix.h # Do we want the config file compiled into the kernel? INCLUDE_CONFIG_FILE opt_config.h # Options for static filesystems. These should only be used at config # time, since the corresponding lkms cannot work if there are any static # dependencies. Unusability is enforced by hiding the defines for the # options in a never-included header. AUTOFS opt_dontuse.h CD9660 opt_dontuse.h EXT2FS opt_dontuse.h FDESCFS opt_dontuse.h FFS opt_dontuse.h FUSEFS opt_dontuse.h MSDOSFS opt_dontuse.h NULLFS opt_dontuse.h PROCFS opt_dontuse.h PSEUDOFS opt_dontuse.h SMBFS opt_dontuse.h TARFS opt_dontuse.h TMPFS opt_dontuse.h UDF opt_dontuse.h UNIONFS opt_dontuse.h ZFS opt_dontuse.h # Pseudofs debugging PSEUDOFS_TRACE opt_pseudofs.h # Tarfs debugging TARFS_DEBUG opt_tarfs.h # In-kernel GSS-API KGSSAPI opt_kgssapi.h KGSSAPI_DEBUG opt_kgssapi.h # These static filesystems have one slightly bogus static dependency in # sys/i386/i386/autoconf.c. If any of these filesystems are # statically compiled into the kernel, code for mounting them as root # filesystems will be enabled - but look below. # NFSCL - client # NFSD - server NFSCL opt_nfs.h NFSD opt_nfs.h # filesystems and libiconv bridge CD9660_ICONV opt_dontuse.h MSDOSFS_ICONV opt_dontuse.h UDF_ICONV opt_dontuse.h # If you are following the conditions in the copyright, # you can enable soft-updates, which will speed up a lot of things # and make the system safer from crashes at the same time. # Otherwise a STUB module will be compiled in. SOFTUPDATES opt_ffs.h # On small, embedded systems, it can be useful to turn off support for # snapshots. It saves about 30-40k for a feature that would be lightly # used, if it is used at all. NO_FFS_SNAPSHOT opt_ffs.h # Enabling this option turns on support for Access Control Lists in UFS, # which can be used to support high security configurations. Depends on # UFS_EXTATTR. UFS_ACL opt_ufs.h # Enabling this option turns on support for extended attributes in UFS-based # filesystems, which can be used to support high security configurations # as well as new filesystem features. UFS_EXTATTR opt_ufs.h UFS_EXTATTR_AUTOSTART opt_ufs.h # Enable fast hash lookups for large directories on UFS-based filesystems. UFS_DIRHASH opt_ufs.h # Enable gjournal-based UFS journal. UFS_GJOURNAL opt_ufs.h # We plan to remove the static dependencies above, with a # <filesystem>_ROOT option to control if it is usable as root. This list # allows these options to be present in config files already (though # they won't make any difference yet).
NFS_ROOT opt_nfsroot.h # SMB/CIFS requester NETSMB opt_netsmb.h # Enable debugnet(4) networking support. DEBUGNET opt_global.h # Enable netdump(4) client support. NETDUMP opt_global.h # Enable netgdb(4) support. NETGDB opt_global.h # Options used only in subr_param.c. HZ opt_param.h MAXFILES opt_param.h NBUF opt_param.h NSFBUFS opt_param.h VM_BCACHE_SIZE_MAX opt_param.h VM_SWZONE_SIZE_MAX opt_param.h MAXUSERS DFLDSIZ opt_param.h MAXDSIZ opt_param.h MAXSSIZ opt_param.h # Generic SCSI options. CAM_MAX_HIGHPOWER opt_cam.h CAMDEBUG opt_cam.h CAM_DEBUG_COMPILE opt_cam.h CAM_DEBUG_DELAY opt_cam.h CAM_DEBUG_BUS opt_cam.h CAM_DEBUG_TARGET opt_cam.h CAM_DEBUG_LUN opt_cam.h CAM_DEBUG_FLAGS opt_cam.h CAM_BOOT_DELAY opt_cam.h CAM_IOSCHED_DYNAMIC opt_cam.h CAM_IO_STATS opt_cam.h CAM_TEST_FAILURE opt_cam.h SCSI_DELAY opt_scsi.h SCSI_NO_SENSE_STRINGS opt_scsi.h SCSI_NO_OP_STRINGS opt_scsi.h # Options used only in cam/ata/ata_da.c ATA_STATIC_ID opt_ada.h # Options used only in cam/scsi/scsi_cd.c CHANGER_MIN_BUSY_SECONDS opt_cd.h CHANGER_MAX_BUSY_SECONDS opt_cd.h # Options used only in cam/scsi/scsi_da.c DA_TRACK_REFS opt_da.h # Options used only in cam/scsi/scsi_sa.c. SA_IO_TIMEOUT opt_sa.h SA_SPACE_TIMEOUT opt_sa.h SA_REWIND_TIMEOUT opt_sa.h SA_ERASE_TIMEOUT opt_sa.h SA_1FM_AT_EOD opt_sa.h # Options used only in cam/scsi/scsi_pt.c SCSI_PT_DEFAULT_TIMEOUT opt_pt.h # Options used only in cam/scsi/scsi_ses.c SES_ENABLE_PASSTHROUGH opt_ses.h # Options used in dev/sym/ (Symbios SCSI driver). SYM_SETUP_SCSI_DIFF opt_sym.h #-HVD support for 825a, 875, 885 # disabled:0 (default), enabled:1 SYM_SETUP_PCI_PARITY opt_sym.h #-PCI parity checking # disabled:0, enabled:1 (default) SYM_SETUP_MAX_LUN opt_sym.h #-Number of LUNs supported # default:8, range:[1..64] # Options used only in dev/isp/* ISP_TARGET_MODE opt_isp.h ISP_FW_CRASH_DUMP opt_isp.h ISP_DEFAULT_ROLES opt_isp.h ISP_INTERNAL_TARGET opt_isp.h ISP_FCTAPE_OFF opt_isp.h # Options used only in dev/iscsi ISCSI_INITIATOR_DEBUG opt_iscsi_initiator.h # Net stuff. 
ACCEPT_FILTER_DATA ACCEPT_FILTER_DNS ACCEPT_FILTER_HTTP ALTQ opt_global.h ALTQ_CBQ opt_altq.h ALTQ_CDNR opt_altq.h ALTQ_CODEL opt_altq.h ALTQ_DEBUG opt_altq.h ALTQ_HFSC opt_altq.h ALTQ_FAIRQ opt_altq.h ALTQ_NOPCC opt_altq.h ALTQ_PRIQ opt_altq.h ALTQ_RED opt_altq.h ALTQ_RIO opt_altq.h BOOTP opt_bootp.h BOOTP_BLOCKSIZE opt_bootp.h BOOTP_COMPAT opt_bootp.h BOOTP_NFSROOT opt_bootp.h BOOTP_NFSV3 opt_bootp.h BOOTP_WIRED_TO opt_bootp.h DEVICE_POLLING DUMMYNET opt_ipdn.h RATELIMIT opt_ratelimit.h RATELIMIT_DEBUG opt_ratelimit.h INET opt_inet.h INET6 opt_inet6.h STATS opt_global.h IPDIVERT IPFILTER opt_ipfilter.h IPFILTER_DEFAULT_BLOCK opt_ipfilter.h IPFILTER_LOG opt_ipfilter.h IPFILTER_LOOKUP opt_ipfilter.h IPFIREWALL opt_ipfw.h IPFIREWALL_DEFAULT_TO_ACCEPT opt_ipfw.h IPFIREWALL_NAT opt_ipfw.h IPFIREWALL_NAT64 opt_ipfw.h IPFIREWALL_NPTV6 opt_ipfw.h IPFIREWALL_VERBOSE opt_ipfw.h IPFIREWALL_VERBOSE_LIMIT opt_ipfw.h IPFIREWALL_PMOD opt_ipfw.h IPSEC opt_ipsec.h IPSEC_DEBUG opt_ipsec.h IPSEC_SUPPORT opt_ipsec.h IPSTEALTH KERN_TLS KRPC LIBALIAS LIBMCHAIN MBUF_PROFILING MBUF_STRESS_TEST MROUTING opt_mrouting.h NFSLOCKD NETLINK opt_global.h PF_DEFAULT_TO_DROP opt_pf.h ROUTE_MPATH opt_route.h ROUTETABLES opt_route.h FIB_ALGO opt_route.h RSS opt_rss.h SLIP_IFF_OPTS opt_slip.h TCPPCAP opt_global.h SIFTR TCP_BLACKBOX opt_global.h TCP_HHOOK opt_global.h TCP_OFFLOAD opt_inet.h # Enable code to dispatch TCP offloading TCP_RFC7413 opt_inet.h TCP_RFC7413_MAX_KEYS opt_inet.h TCP_RFC7413_MAX_PSKS opt_inet.h TCP_SIGNATURE opt_ipsec.h VLAN_ARRAY opt_vlan.h XDR XBONEHACK # # SCTP # SCTP opt_sctp.h SCTP_SUPPORT opt_sctp.h SCTP_DEBUG opt_sctp.h # Enable debug printfs SCTP_LOCK_LOGGING opt_sctp.h # Log to KTR lock activity SCTP_MBUF_LOGGING opt_sctp.h # Log to KTR general mbuf alloc/free SCTP_MBCNT_LOGGING opt_sctp.h # Log to KTR mbcnt activity SCTP_PACKET_LOGGING opt_sctp.h # Log to a packet buffer last N packets SCTP_LTRACE_CHUNKS opt_sctp.h # Log to KTR chunks processed SCTP_LTRACE_ERRORS opt_sctp.h # Log to KTR error returns. SCTP_USE_PERCPU_STAT opt_sctp.h # Use per cpu stats. SCTP_MCORE_INPUT opt_sctp.h # Have multiple input threads for input mbufs SCTP_LOCAL_TRACE_BUF opt_sctp.h # Use tracebuffer exported via sysctl SCTP_DETAILED_STR_STATS opt_sctp.h # Use per PR-SCTP policy stream stats # # # # Netgraph(4). Use option NETGRAPH to enable the base netgraph code. # Each netgraph node type can either be compiled into the kernel # or loaded dynamically. To get the former, include the corresponding # option below. Each type has its own man page, e.g. ng_async(4).
NETGRAPH NETGRAPH_DEBUG opt_netgraph.h NETGRAPH_ASYNC opt_netgraph.h NETGRAPH_BLUETOOTH opt_netgraph.h NETGRAPH_BLUETOOTH_BT3C opt_netgraph.h NETGRAPH_BLUETOOTH_H4 opt_netgraph.h NETGRAPH_BLUETOOTH_HCI opt_netgraph.h NETGRAPH_BLUETOOTH_L2CAP opt_netgraph.h NETGRAPH_BLUETOOTH_SOCKET opt_netgraph.h NETGRAPH_BLUETOOTH_UBT opt_netgraph.h NETGRAPH_BLUETOOTH_UBTBCMFW opt_netgraph.h NETGRAPH_BPF opt_netgraph.h NETGRAPH_BRIDGE opt_netgraph.h NETGRAPH_CAR opt_netgraph.h NETGRAPH_CHECKSUM opt_netgraph.h NETGRAPH_CISCO opt_netgraph.h NETGRAPH_DEFLATE opt_netgraph.h NETGRAPH_DEVICE opt_netgraph.h NETGRAPH_ECHO opt_netgraph.h NETGRAPH_EIFACE opt_netgraph.h NETGRAPH_ETHER opt_netgraph.h NETGRAPH_ETHER_ECHO opt_netgraph.h NETGRAPH_FEC opt_netgraph.h NETGRAPH_FRAME_RELAY opt_netgraph.h NETGRAPH_GIF opt_netgraph.h NETGRAPH_GIF_DEMUX opt_netgraph.h NETGRAPH_HOLE opt_netgraph.h NETGRAPH_IFACE opt_netgraph.h NETGRAPH_IP_INPUT opt_netgraph.h NETGRAPH_IPFW opt_netgraph.h NETGRAPH_KSOCKET opt_netgraph.h NETGRAPH_L2TP opt_netgraph.h NETGRAPH_LMI opt_netgraph.h NETGRAPH_MPPC_COMPRESSION opt_netgraph.h NETGRAPH_MPPC_ENCRYPTION opt_netgraph.h NETGRAPH_NAT opt_netgraph.h NETGRAPH_NETFLOW opt_netgraph.h NETGRAPH_ONE2MANY opt_netgraph.h NETGRAPH_PATCH opt_netgraph.h NETGRAPH_PIPE opt_netgraph.h NETGRAPH_PPP opt_netgraph.h NETGRAPH_PPPOE opt_netgraph.h NETGRAPH_PPTPGRE opt_netgraph.h NETGRAPH_PRED1 opt_netgraph.h NETGRAPH_RFC1490 opt_netgraph.h NETGRAPH_SOCKET opt_netgraph.h NETGRAPH_SPLIT opt_netgraph.h NETGRAPH_SPPP opt_netgraph.h NETGRAPH_TAG opt_netgraph.h NETGRAPH_TCPMSS opt_netgraph.h NETGRAPH_TEE opt_netgraph.h NETGRAPH_TTY opt_netgraph.h NETGRAPH_UI opt_netgraph.h NETGRAPH_VJC opt_netgraph.h NETGRAPH_VLAN opt_netgraph.h # DRM options DRM_DEBUG opt_drm.h TI_SF_BUF_JUMBO opt_ti.h TI_JUMBO_HDRSPLIT opt_ti.h # Misc debug flags. Most of these should probably be replaced with # 'DEBUG', and then let people recompile just the interesting modules # with 'make CC="cc -DDEBUG"'. 
DEBUG_1284 opt_ppb_1284.h LPT_DEBUG opt_lpt.h PLIP_DEBUG opt_plip.h LOCKF_DEBUG opt_debug_lockf.h SI_DEBUG opt_debug_si.h IFMEDIA_DEBUG opt_ifmedia.h # Fb options FB_DEBUG opt_fb.h # ppbus related options PERIPH_1284 opt_ppb_1284.h DONTPROBE_1284 opt_ppb_1284.h # smbus related options ENABLE_ALART opt_intpm.h # These cause changes all over the kernel BLKDEV_IOSIZE opt_global.h BURN_BRIDGES opt_global.h DEBUG opt_global.h DEBUG_LOCKS opt_global.h DEBUG_VFS_LOCKS opt_global.h DFLTPHYS opt_global.h DIAGNOSTIC opt_global.h INVARIANT_SUPPORT opt_global.h INVARIANTS opt_global.h KASSERT_PANIC_OPTIONAL opt_global.h MAXCPU opt_global.h MAXMEMDOM opt_global.h MAXPHYS opt_maxphys.h MCLSHIFT opt_global.h MUTEX_NOINLINE opt_global.h LOCK_PROFILING opt_global.h MSIZE opt_global.h REGRESSION opt_global.h RWLOCK_NOINLINE opt_global.h SX_NOINLINE opt_global.h VFS_BIO_DEBUG opt_global.h # These are VM related options VM_KMEM_SIZE opt_vm.h VM_KMEM_SIZE_SCALE opt_vm.h VM_KMEM_SIZE_MAX opt_vm.h VM_NRESERVLEVEL opt_vm.h VM_LEVEL_0_ORDER opt_vm.h NO_SWAPPING opt_vm.h MALLOC_MAKE_FAILURES opt_vm.h MALLOC_PROFILE opt_vm.h MALLOC_DEBUG_MAXZONES opt_vm.h # The MemGuard replacement allocator used for tamper-after-free detection DEBUG_MEMGUARD opt_vm.h # The RedZone malloc(9) protection DEBUG_REDZONE opt_vm.h # Standard SMP options EARLY_AP_STARTUP opt_global.h SMP opt_global.h NUMA opt_global.h # Size of the kernel message buffer MSGBUF_SIZE opt_msgbuf.h # NFS options NFS_MINATTRTIMO opt_nfs.h NFS_MAXATTRTIMO opt_nfs.h NFS_MINDIRATTRTIMO opt_nfs.h NFS_MAXDIRATTRTIMO opt_nfs.h NFS_DEBUG opt_nfs.h # TMPFS options TMPFS_PAGES_MINRESERVED opt_tmpfs.h # Options for uart(4) UART_PPS_ON_CTS opt_uart.h UART_POLL_FREQ opt_uart.h UART_DEV_TOLERANCE_PCT opt_uart.h # options for bus/device framework BUS_DEBUG opt_bus.h # options for USB support USB_DEBUG opt_usb.h USB_HOST_ALIGN opt_usb.h USB_REQ_DEBUG opt_usb.h USB_TEMPLATE opt_usb.h USB_VERBOSE opt_usb.h USB_DMA_SINGLE_ALLOC opt_usb.h USB_EHCI_BIG_ENDIAN_DESC opt_usb.h U3G_DEBUG opt_u3g.h UKBD_DFLT_KEYMAP opt_ukbd.h UPLCOM_INTR_INTERVAL opt_uplcom.h UVSCOM_DEFAULT_OPKTSIZE opt_uvscom.h UVSCOM_INTR_INTERVAL opt_uvscom.h # options for the Realtek rtwn driver RTWN_DEBUG opt_rtwn.h RTWN_WITHOUT_UCODE opt_rtwn.h # Embedded system options INIT_PATH ROOTDEVNAME FDC_DEBUG opt_fdc.h PCFCLOCK_VERBOSE opt_pcfclock.h PCFCLOCK_MAX_RETRIES opt_pcfclock.h KTR opt_global.h KTR_ALQ opt_ktr.h KTR_MASK opt_ktr.h KTR_CPUMASK opt_ktr.h KTR_COMPILE opt_global.h KTR_BOOT_ENTRIES opt_global.h KTR_ENTRIES opt_global.h KTR_VERBOSE opt_ktr.h WITNESS opt_global.h WITNESS_KDB opt_witness.h WITNESS_NO_VNODE opt_witness.h WITNESS_SKIPSPIN opt_witness.h WITNESS_COUNT opt_witness.h OPENSOLARIS_WITNESS opt_global.h EPOCH_TRACE opt_global.h # options for ACPI support ACPI_DEBUG opt_acpi.h ACPI_MAX_TASKS opt_acpi.h ACPI_MAX_THREADS opt_acpi.h DEV_ACPI opt_acpi.h ACPI_EARLY_EPYC_WAR opt_acpi.h # options for IOMMU support IOMMU opt_iommu.h # ISA support DEV_ISA opt_isa.h ISAPNP opt_dontuse.h # various 'device presence' options. 
DEV_BPF opt_bpf.h DEV_CARP opt_carp.h DEV_NETMAP opt_global.h DEV_PCI opt_pci.h DEV_PF opt_pf.h DEV_PFLOG opt_pf.h DEV_PFSYNC opt_pf.h DEV_SPLASH opt_splash.h DEV_VLAN opt_vlan.h # bce driver BCE_DEBUG opt_bce.h BCE_NVRAM_WRITE_SUPPORT opt_bce.h SOCKBUF_DEBUG opt_global.h # options for hifn driver HIFN_DEBUG opt_hifn.h HIFN_RNDTEST opt_hifn.h # options for safenet driver SAFE_DEBUG opt_safe.h SAFE_NO_RNG opt_safe.h SAFE_RNDTEST opt_safe.h # syscons/vt options MAXCONS opt_syscons.h SC_ALT_MOUSE_IMAGE opt_syscons.h SC_CUT_SPACES2TABS opt_syscons.h SC_CUT_SEPCHARS opt_syscons.h SC_DEBUG_LEVEL opt_syscons.h SC_DFLT_FONT opt_syscons.h SC_DFLT_TERM opt_syscons.h SC_DISABLE_KDBKEY opt_syscons.h SC_DISABLE_REBOOT opt_syscons.h SC_HISTORY_SIZE opt_syscons.h SC_KERNEL_CONS_ATTR opt_syscons.h SC_KERNEL_CONS_ATTRS opt_syscons.h SC_KERNEL_CONS_REV_ATTR opt_syscons.h SC_MOUSE_CHAR opt_syscons.h SC_NO_CUTPASTE opt_syscons.h SC_NO_FONT_LOADING opt_syscons.h SC_NO_HISTORY opt_syscons.h SC_NO_MODE_CHANGE opt_syscons.h SC_NO_SUSPEND_VTYSWITCH opt_syscons.h SC_NO_SYSMOUSE opt_syscons.h SC_NO_TERM_DUMB opt_syscons.h SC_NO_TERM_SC opt_syscons.h SC_NO_TERM_TEKEN opt_syscons.h SC_NORM_ATTR opt_syscons.h SC_NORM_REV_ATTR opt_syscons.h SC_PIXEL_MODE opt_syscons.h SC_RENDER_DEBUG opt_syscons.h SC_TWOBUTTON_MOUSE opt_syscons.h VT_ALT_TO_ESC_HACK opt_syscons.h VT_FB_MAX_WIDTH opt_syscons.h VT_FB_MAX_HEIGHT opt_syscons.h VT_MAXWINDOWS opt_syscons.h VT_TWOBUTTON_MOUSE opt_syscons.h DEV_SC opt_syscons.h DEV_VT opt_syscons.h # teken terminal emulator options TEKEN_CONS25 opt_teken.h TEKEN_UTF8 opt_teken.h TERMINAL_KERN_ATTR opt_teken.h TERMINAL_NORM_ATTR opt_teken.h # options for printf PRINTF_BUFR_SIZE opt_printf.h BOOT_TAG opt_printf.h BOOT_TAG_SZ opt_printf.h # kbd options KBD_DISABLE_KEYMAP_LOAD opt_kbd.h KBD_INSTALL_CDEV opt_kbd.h KBD_MAXRETRY opt_kbd.h KBD_MAXWAIT opt_kbd.h KBD_RESETDELAY opt_kbd.h KBD_DELAY1 opt_kbd.h KBD_DELAY2 opt_kbd.h KBDIO_DEBUG opt_kbd.h KBDMUX_DFLT_KEYMAP opt_kbdmux.h # options for the Atheros driver ATH_DEBUG opt_ath.h ATH_TXBUF opt_ath.h ATH_RXBUF opt_ath.h ATH_DIAGAPI opt_ath.h ATH_TX99_DIAG opt_ath.h ATH_ENABLE_DFS opt_ath.h ATH_EEPROM_FIRMWARE opt_ath.h ATH_ENABLE_RADIOTAP_VENDOR_EXT opt_ath.h ATH_DEBUG_ALQ opt_ath.h ATH_KTR_INTR_DEBUG opt_ath.h AH_DEBUG opt_ah.h AH_ASSERT opt_ah.h AH_DEBUG_ALQ opt_ah.h AH_REGOPS_FUNC opt_ah.h AH_WRITE_REGDOMAIN opt_ah.h AH_DEBUG_COUNTRY opt_ah.h AH_WRITE_EEPROM opt_ah.h AH_PRIVATE_DIAG opt_ah.h AH_NEED_DESC_SWAP opt_ah.h AH_USE_INIPDGAIN opt_ah.h AH_MAXCHAN opt_ah.h AH_RXCFG_SDMAMW_4BYTES opt_ah.h AH_INTERRUPT_DEBUGGING opt_ah.h # AR5416 and later interrupt mitigation # XXX do not use this for AR9130 AH_AR5416_INTERRUPT_MITIGATION opt_ah.h # options for the Altera mSGDMA driver (altera_msgdma) ALTERA_MSGDMA_DESC_STD opt_altera_msgdma.h ALTERA_MSGDMA_DESC_EXT opt_altera_msgdma.h ALTERA_MSGDMA_DESC_PF_STD opt_altera_msgdma.h ALTERA_MSGDMA_DESC_PF_EXT opt_altera_msgdma.h # options for the Broadcom BCM43xx driver (bwi) BWI_DEBUG opt_bwi.h BWI_DEBUG_VERBOSE opt_bwi.h # options for the Broadcom BCM43xx driver (bwn) BWN_DEBUG opt_bwn.h BWN_GPL_PHY opt_bwn.h BWN_USE_SIBA opt_bwn.h # Options for the SIBA driver SIBA_DEBUG opt_siba.h # options for the Marvell 8335 wireless driver MALO_DEBUG opt_malo.h MALO_TXBUF opt_malo.h MALO_RXBUF opt_malo.h # options for the Marvell wireless driver MWL_DEBUG opt_mwl.h MWL_TXBUF opt_mwl.h MWL_RXBUF opt_mwl.h MWL_DIAGAPI opt_mwl.h MWL_AGGR_SIZE opt_mwl.h MWL_TX_NODROP opt_mwl.h # Options for the Marvell NETA driver
MVNETA_MULTIQUEUE opt_mvneta.h MVNETA_KTR opt_mvneta.h # Options for the Intel 802.11ac wireless driver IWM_DEBUG opt_iwm.h # Options for the Intel 802.11n wireless driver IWN_DEBUG opt_iwn.h # Options for the Intel 3945ABG wireless driver WPI_DEBUG opt_wpi.h # dcons options DCONS_BUF_SIZE opt_dcons.h DCONS_POLL_HZ opt_dcons.h DCONS_FORCE_CONSOLE opt_dcons.h DCONS_FORCE_GDB opt_dcons.h # HWPMC options HWPMC_DEBUG opt_global.h HWPMC_HOOKS # 802.11 support layer IEEE80211_DEBUG opt_wlan.h IEEE80211_DEBUG_REFCNT opt_wlan.h IEEE80211_SUPPORT_MESH opt_wlan.h IEEE80211_SUPPORT_SUPERG opt_wlan.h IEEE80211_SUPPORT_TDMA opt_wlan.h IEEE80211_ALQ opt_wlan.h IEEE80211_DFS_DEBUG opt_wlan.h # 802.11 TDMA support TDMA_SLOTLEN_DEFAULT opt_tdma.h TDMA_SLOTCNT_DEFAULT opt_tdma.h TDMA_BINTVAL_DEFAULT opt_tdma.h TDMA_TXRATE_11B_DEFAULT opt_tdma.h TDMA_TXRATE_11G_DEFAULT opt_tdma.h TDMA_TXRATE_11A_DEFAULT opt_tdma.h TDMA_TXRATE_TURBO_DEFAULT opt_tdma.h TDMA_TXRATE_HALF_DEFAULT opt_tdma.h TDMA_TXRATE_QUARTER_DEFAULT opt_tdma.h TDMA_TXRATE_11NA_DEFAULT opt_tdma.h TDMA_TXRATE_11NG_DEFAULT opt_tdma.h # VideoMode PICKMODE_DEBUG opt_videomode.h # Network stack virtualization options VIMAGE opt_global.h VNET_DEBUG opt_global.h # Common Flash Interface (CFI) options CFI_SUPPORT_STRATAFLASH opt_cfi.h CFI_ARMEDANDDANGEROUS opt_cfi.h CFI_HARDWAREBYTESWAP opt_cfi.h # Sound options SND_DEBUG opt_snd.h SND_DIAGNOSTIC opt_snd.h SND_FEEDER_MULTIFORMAT opt_snd.h SND_FEEDER_FULL_MULTIFORMAT opt_snd.h SND_FEEDER_RATE_HP opt_snd.h SND_PCM_64 opt_snd.h SND_OLDSTEREO opt_snd.h X86BIOS # Flattened device tree options FDT opt_platform.h FDT_DTB_STATIC opt_platform.h # OFED Infiniband stack OFED opt_ofed.h OFED_DEBUG_INIT opt_ofed.h SDP opt_ofed.h SDP_DEBUG opt_ofed.h IPOIB opt_ofed.h IPOIB_DEBUG opt_ofed.h IPOIB_CM opt_ofed.h # Resource Accounting RACCT opt_global.h RACCT_DEFAULT_TO_DISABLED opt_global.h # Resource Limits RCTL opt_global.h # Random number generator(s) # Alternative RNG algorithm. RANDOM_FENESTRASX opt_global.h # With this, no entropy processor is loaded, but the entropy # harvesting infrastructure is present. This means an entropy # processor may be loaded as a module. RANDOM_LOADABLE opt_global.h # This turns on high-rate and potentially expensive harvesting in # the uma slab allocator. RANDOM_ENABLE_UMA opt_global.h RANDOM_ENABLE_ETHER opt_global.h # This option turns the TPM into an entropy source. TPM_HARVEST opt_tpm.h # BHND(4) driver BHND_LOGLEVEL opt_global.h # GPIO and child devices GPIO_SPI_DEBUG opt_gpio.h # SPI devices SPIGEN_LEGACY_CDEVNAME opt_spi.h # etherswitch(4) driver RTL8366_SOFT_RESET opt_etherswitch.h # evdev protocol support EVDEV_SUPPORT opt_evdev.h EVDEV_DEBUG opt_evdev.h UINPUT_DEBUG opt_evdev.h # Hyper-V network driver HN_DEBUG opt_hn.h # CAM-based MMC stack MMCCAM # Encrypted kernel crash dumps EKCD opt_ekcd.h # NVME options NVME_USE_NVD opt_nvme.h # amdsbwd options AMDSBWD_DEBUG opt_amdsbwd.h # gcov support GCOV opt_global.h LINDEBUGFS # options for HID support HID_DEBUG opt_hid.h IICHID_DEBUG opt_hid.h IICHID_SAMPLING opt_hid.h HKBD_DFLT_KEYMAP opt_hkbd.h HIDRAW_MAKE_UHID_ALIAS opt_hid.h # kenv options # The early kernel environment (loader environment, config(8)-provided static) # is typically cleared after the dynamic environment comes up to ensure that # we're not inadvertently holding on to 'secret' values in these stale envs. # This option is insecure except in controlled environments where the static # environment's contents are known to be safe.
PRESERVE_EARLY_KENV opt_global.h diff --git a/sys/netinet/icmp6.h b/sys/netinet/icmp6.h index e344c662783a..4368fd2a0fcf 100644 --- a/sys/netinet/icmp6.h +++ b/sys/netinet/icmp6.h @@ -1,785 +1,776 @@ /* $KAME: icmp6.h,v 1.46 2001/04/27 15:09:48 itojun Exp $ */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #ifndef _NETINET_ICMP6_H_ #define _NETINET_ICMP6_H_ #define ICMPV6_PLD_MAXLEN 1232 /* IPV6_MMTU - sizeof(struct ip6_hdr) - sizeof(struct icmp6_hdr) */ struct icmp6_hdr { u_int8_t icmp6_type; /* type field */ u_int8_t icmp6_code; /* code field */ u_int16_t icmp6_cksum; /* checksum field */ union { u_int32_t icmp6_un_data32[1]; /* type-specific field */ u_int16_t icmp6_un_data16[2]; /* type-specific field */ u_int8_t icmp6_un_data8[4]; /* type-specific field */ } icmp6_dataun; } __packed; #define icmp6_data32 icmp6_dataun.icmp6_un_data32 #define icmp6_data16 icmp6_dataun.icmp6_un_data16 #define icmp6_data8 icmp6_dataun.icmp6_un_data8 #define icmp6_pptr icmp6_data32[0] /* parameter prob */ #define icmp6_mtu icmp6_data32[0] /* packet too big */ #define icmp6_id icmp6_data16[0] /* echo request/reply */ #define icmp6_seq icmp6_data16[1] /* echo request/reply */ #define icmp6_maxdelay icmp6_data16[0] /* mcast group membership */ #define ICMP6_DST_UNREACH 1 /* dest unreachable, codes: */ #define ICMP6_PACKET_TOO_BIG 2 /* packet too big */ #define ICMP6_TIME_EXCEEDED 3 /* time exceeded, code: */ #define ICMP6_PARAM_PROB 4 /* ip6 header bad */ #define ICMP6_ECHO_REQUEST 128 /* echo service */ #define ICMP6_ECHO_REPLY 129 /* echo reply */ #define MLD_LISTENER_QUERY 130 /* multicast listener query */ #define MLD_LISTENER_REPORT 131 /* multicast listener report */ #define MLD_LISTENER_DONE 132 /* multicast listener done */ #define MLD_LISTENER_REDUCTION MLD_LISTENER_DONE /* RFC3542 definition */ /* RFC2292 decls */ #define ICMP6_MEMBERSHIP_QUERY 130 /* group membership query */ #define ICMP6_MEMBERSHIP_REPORT 131 /* group membership report */ #define ICMP6_MEMBERSHIP_REDUCTION 132 /* group membership termination */ #ifndef _KERNEL /* the followings are for backward compatibility to old KAME apps. */ #define MLD6_LISTENER_QUERY MLD_LISTENER_QUERY #define MLD6_LISTENER_REPORT MLD_LISTENER_REPORT #define MLD6_LISTENER_DONE MLD_LISTENER_DONE #endif #define ND_ROUTER_SOLICIT 133 /* router solicitation */ #define ND_ROUTER_ADVERT 134 /* router advertisement */ #define ND_NEIGHBOR_SOLICIT 135 /* neighbor solicitation */ #define ND_NEIGHBOR_ADVERT 136 /* neighbor advertisement */ #define ND_REDIRECT 137 /* redirect */ #define ICMP6_ROUTER_RENUMBERING 138 /* router renumbering */ #define ICMP6_WRUREQUEST 139 /* who are you request */ #define ICMP6_WRUREPLY 140 /* who are you reply */ #define ICMP6_FQDN_QUERY 139 /* FQDN query */ #define ICMP6_FQDN_REPLY 140 /* FQDN reply */ #define ICMP6_NI_QUERY 139 /* node information request */ #define ICMP6_NI_REPLY 140 /* node information reply */ #define MLDV2_LISTENER_REPORT 143 /* RFC3810 listener report */ /* The definitions below are experimental. 
TBA */ #define MLD_MTRACE_RESP 200 /* mtrace resp (to sender) */ #define MLD_MTRACE 201 /* mtrace messages */ #ifndef _KERNEL #define MLD6_MTRACE_RESP MLD_MTRACE_RESP #define MLD6_MTRACE MLD_MTRACE #endif #define ICMP6_MAXTYPE 201 #define ICMP6_DST_UNREACH_NOROUTE 0 /* no route to destination */ #define ICMP6_DST_UNREACH_ADMIN 1 /* administratively prohibited */ #define ICMP6_DST_UNREACH_NOTNEIGHBOR 2 /* not a neighbor(obsolete) */ #define ICMP6_DST_UNREACH_BEYONDSCOPE 2 /* beyond scope of source address */ #define ICMP6_DST_UNREACH_ADDR 3 /* address unreachable */ #define ICMP6_DST_UNREACH_NOPORT 4 /* port unreachable */ #define ICMP6_DST_UNREACH_POLICY 5 /* failed ingress/egress policy */ #define ICMP6_DST_UNREACH_REJECT 6 /* Reject route to destination */ #define ICMP6_DST_UNREACH_SRCROUTE 7 /* Error in source routing header */ #define ICMP6_TIME_EXCEED_TRANSIT 0 /* ttl==0 in transit */ #define ICMP6_TIME_EXCEED_REASSEMBLY 1 /* ttl==0 in reass */ #define ICMP6_PARAMPROB_HEADER 0 /* erroneous header field */ #define ICMP6_PARAMPROB_NEXTHEADER 1 /* unrecognized next header */ #define ICMP6_PARAMPROB_OPTION 2 /* unrecognized option */ #define ICMP6_INFOMSG_MASK 0x80 /* all informational messages */ #define ICMP6_NI_SUBJ_IPV6 0 /* Query Subject is an IPv6 address */ #define ICMP6_NI_SUBJ_FQDN 1 /* Query Subject is a Domain name */ #define ICMP6_NI_SUBJ_IPV4 2 /* Query Subject is an IPv4 address */ #define ICMP6_NI_SUCCESS 0 /* node information successful reply */ #define ICMP6_NI_REFUSED 1 /* node information request is refused */ #define ICMP6_NI_UNKNOWN 2 /* unknown Qtype */ #define ICMP6_ROUTER_RENUMBERING_COMMAND 0 /* rr command */ #define ICMP6_ROUTER_RENUMBERING_RESULT 1 /* rr result */ #define ICMP6_ROUTER_RENUMBERING_SEQNUM_RESET 255 /* rr seq num reset */ /* Used in kernel only */ #define ND_REDIRECT_ONLINK 0 /* redirect to an on-link node */ #define ND_REDIRECT_ROUTER 1 /* redirect to a better router */ /* * Multicast Listener Discovery */ struct mld_hdr { struct icmp6_hdr mld_icmp6_hdr; struct in6_addr mld_addr; /* multicast address */ } __packed; /* definitions to provide backward compatibility to old KAME applications */ #ifndef _KERNEL #define mld6_hdr mld_hdr #define mld6_type mld_type #define mld6_code mld_code #define mld6_cksum mld_cksum #define mld6_maxdelay mld_maxdelay #define mld6_reserved mld_reserved #define mld6_addr mld_addr #endif /* shortcut macro definitions */ #define mld_type mld_icmp6_hdr.icmp6_type #define mld_code mld_icmp6_hdr.icmp6_code #define mld_cksum mld_icmp6_hdr.icmp6_cksum #define mld_maxdelay mld_icmp6_hdr.icmp6_data16[0] #define mld_reserved mld_icmp6_hdr.icmp6_data16[1] #define mld_v2_reserved mld_icmp6_hdr.icmp6_data16[0] #define mld_v2_numrecs mld_icmp6_hdr.icmp6_data16[1] /* * Neighbor Discovery */ struct nd_router_solicit { /* router solicitation */ struct icmp6_hdr nd_rs_hdr; /* could be followed by options */ } __packed; #define nd_rs_type nd_rs_hdr.icmp6_type #define nd_rs_code nd_rs_hdr.icmp6_code #define nd_rs_cksum nd_rs_hdr.icmp6_cksum #define nd_rs_reserved nd_rs_hdr.icmp6_data32[0] struct nd_router_advert { /* router advertisement */ struct icmp6_hdr nd_ra_hdr; u_int32_t nd_ra_reachable; /* reachable time */ u_int32_t nd_ra_retransmit; /* retransmit timer */ /* could be followed by options */ } __packed; #define nd_ra_type nd_ra_hdr.icmp6_type #define nd_ra_code nd_ra_hdr.icmp6_code #define nd_ra_cksum nd_ra_hdr.icmp6_cksum #define nd_ra_curhoplimit nd_ra_hdr.icmp6_data8[0] #define nd_ra_flags_reserved nd_ra_hdr.icmp6_data8[1] 
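As a quick, hedged illustration of how these accessor macros are used in practice (the helper below is hypothetical and not part of this change; the ND_RA_FLAG_* values it tests are defined immediately below):

#include <sys/types.h>
#include <netinet/in.h>
#include <netinet/icmp6.h>

/*
 * Return nonzero when a received, already-validated router advertisement
 * sets the Managed or Other flag, i.e. asks hosts to use DHCPv6.
 * nd_ra_flags_reserved expands to nd_ra_hdr.icmp6_data8[1] per the
 * shortcut macros above.
 */
static int
ra_requests_dhcpv6(const struct nd_router_advert *ra)
{
	return ((ra->nd_ra_flags_reserved &
	    (ND_RA_FLAG_MANAGED | ND_RA_FLAG_OTHER)) != 0);
}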
#define ND_RA_FLAG_MANAGED 0x80 #define ND_RA_FLAG_OTHER 0x40 #define ND_RA_FLAG_HA 0x20 /* * Router preference values based on draft-draves-ipngwg-router-selection-01. * These are non-standard definitions. */ #define ND_RA_FLAG_RTPREF_MASK 0x18 /* 00011000 */ #define ND_RA_FLAG_RTPREF_HIGH 0x08 /* 00001000 */ #define ND_RA_FLAG_RTPREF_MEDIUM 0x00 /* 00000000 */ #define ND_RA_FLAG_RTPREF_LOW 0x18 /* 00011000 */ #define ND_RA_FLAG_RTPREF_RSV 0x10 /* 00010000 */ #ifdef EXPERIMENTAL #define ND_RA_FLAG_IPV6_ONLY 0x02 /* draft-ietf-6man-ipv6only-flag */ #endif #define nd_ra_router_lifetime nd_ra_hdr.icmp6_data16[1] struct nd_neighbor_solicit { /* neighbor solicitation */ struct icmp6_hdr nd_ns_hdr; struct in6_addr nd_ns_target; /*target address */ /* could be followed by options */ } __packed; #define nd_ns_type nd_ns_hdr.icmp6_type #define nd_ns_code nd_ns_hdr.icmp6_code #define nd_ns_cksum nd_ns_hdr.icmp6_cksum #define nd_ns_reserved nd_ns_hdr.icmp6_data32[0] struct nd_neighbor_advert { /* neighbor advertisement */ struct icmp6_hdr nd_na_hdr; struct in6_addr nd_na_target; /* target address */ /* could be followed by options */ } __packed; #define nd_na_type nd_na_hdr.icmp6_type #define nd_na_code nd_na_hdr.icmp6_code #define nd_na_cksum nd_na_hdr.icmp6_cksum #define nd_na_flags_reserved nd_na_hdr.icmp6_data32[0] #if BYTE_ORDER == BIG_ENDIAN #define ND_NA_FLAG_ROUTER 0x80000000 #define ND_NA_FLAG_SOLICITED 0x40000000 #define ND_NA_FLAG_OVERRIDE 0x20000000 #else #if BYTE_ORDER == LITTLE_ENDIAN #define ND_NA_FLAG_ROUTER 0x80 #define ND_NA_FLAG_SOLICITED 0x40 #define ND_NA_FLAG_OVERRIDE 0x20 #endif #endif struct nd_redirect { /* redirect */ struct icmp6_hdr nd_rd_hdr; struct in6_addr nd_rd_target; /* target address */ struct in6_addr nd_rd_dst; /* destination address */ /* could be followed by options */ } __packed; #define nd_rd_type nd_rd_hdr.icmp6_type #define nd_rd_code nd_rd_hdr.icmp6_code #define nd_rd_cksum nd_rd_hdr.icmp6_cksum #define nd_rd_reserved nd_rd_hdr.icmp6_data32[0] struct nd_opt_hdr { /* Neighbor discovery option header */ u_int8_t nd_opt_type; u_int8_t nd_opt_len; /* followed by option specific data*/ } __packed; #define ND_OPT_SOURCE_LINKADDR 1 #define ND_OPT_TARGET_LINKADDR 2 #define ND_OPT_PREFIX_INFORMATION 3 #define ND_OPT_REDIRECTED_HEADER 4 #define ND_OPT_MTU 5 #define ND_OPT_NONCE 14 /* RFC 3971 */ #define ND_OPT_ROUTE_INFO 24 /* RFC 4191 */ #define ND_OPT_RDNSS 25 /* RFC 6106 */ #define ND_OPT_DNSSL 31 /* RFC 6106 */ #define ND_OPT_MAX 31 struct nd_opt_prefix_info { /* prefix information */ u_int8_t nd_opt_pi_type; u_int8_t nd_opt_pi_len; u_int8_t nd_opt_pi_prefix_len; u_int8_t nd_opt_pi_flags_reserved; u_int32_t nd_opt_pi_valid_time; u_int32_t nd_opt_pi_preferred_time; u_int32_t nd_opt_pi_reserved2; struct in6_addr nd_opt_pi_prefix; } __packed; #define ND_OPT_PI_FLAG_ONLINK 0x80 #define ND_OPT_PI_FLAG_AUTO 0x40 struct nd_opt_rd_hdr { /* redirected header */ u_int8_t nd_opt_rh_type; u_int8_t nd_opt_rh_len; u_int16_t nd_opt_rh_reserved1; u_int32_t nd_opt_rh_reserved2; /* followed by IP header and data */ } __packed; struct nd_opt_mtu { /* MTU option */ u_int8_t nd_opt_mtu_type; u_int8_t nd_opt_mtu_len; u_int16_t nd_opt_mtu_reserved; u_int32_t nd_opt_mtu_mtu; } __packed; #define ND_OPT_NONCE_LEN ((1 * 8) - 2) #if ((ND_OPT_NONCE_LEN + 2) % 8) != 0 #error "(ND_OPT_NONCE_LEN + 2) must be a multiple of 8." 
#endif struct nd_opt_nonce { /* nonce option */ u_int8_t nd_opt_nonce_type; u_int8_t nd_opt_nonce_len; u_int8_t nd_opt_nonce[ND_OPT_NONCE_LEN]; } __packed; struct nd_opt_route_info { /* route info */ u_int8_t nd_opt_rti_type; u_int8_t nd_opt_rti_len; u_int8_t nd_opt_rti_prefixlen; u_int8_t nd_opt_rti_flags; u_int32_t nd_opt_rti_lifetime; /* prefix follows */ } __packed; struct nd_opt_rdnss { /* RDNSS option (RFC 6106) */ u_int8_t nd_opt_rdnss_type; u_int8_t nd_opt_rdnss_len; u_int16_t nd_opt_rdnss_reserved; u_int32_t nd_opt_rdnss_lifetime; /* followed by list of recursive DNS servers */ } __packed; struct nd_opt_dnssl { /* DNSSL option (RFC 6106) */ u_int8_t nd_opt_dnssl_type; u_int8_t nd_opt_dnssl_len; u_int16_t nd_opt_dnssl_reserved; u_int32_t nd_opt_dnssl_lifetime; /* followed by list of DNS search domains */ } __packed; /* * icmp6 namelookup */ struct icmp6_namelookup { struct icmp6_hdr icmp6_nl_hdr; u_int8_t icmp6_nl_nonce[8]; int32_t icmp6_nl_ttl; #if 0 u_int8_t icmp6_nl_len; u_int8_t icmp6_nl_name[3]; #endif /* could be followed by options */ } __packed; /* * icmp6 node information */ struct icmp6_nodeinfo { struct icmp6_hdr icmp6_ni_hdr; u_int8_t icmp6_ni_nonce[8]; /* could be followed by reply data */ } __packed; #define ni_type icmp6_ni_hdr.icmp6_type #define ni_code icmp6_ni_hdr.icmp6_code #define ni_cksum icmp6_ni_hdr.icmp6_cksum #define ni_qtype icmp6_ni_hdr.icmp6_data16[0] #define ni_flags icmp6_ni_hdr.icmp6_data16[1] #define NI_QTYPE_NOOP 0 /* NOOP */ #define NI_QTYPE_SUPTYPES 1 /* Supported Qtypes */ #define NI_QTYPE_FQDN 2 /* FQDN (draft 04) */ #define NI_QTYPE_DNSNAME 2 /* DNS Name */ #define NI_QTYPE_NODEADDR 3 /* Node Addresses */ #define NI_QTYPE_IPV4ADDR 4 /* IPv4 Addresses */ #if BYTE_ORDER == BIG_ENDIAN #define NI_SUPTYPE_FLAG_COMPRESS 0x1 #define NI_FQDN_FLAG_VALIDTTL 0x1 #elif BYTE_ORDER == LITTLE_ENDIAN #define NI_SUPTYPE_FLAG_COMPRESS 0x0100 #define NI_FQDN_FLAG_VALIDTTL 0x0100 #endif #ifdef NAME_LOOKUPS_04 #if BYTE_ORDER == BIG_ENDIAN #define NI_NODEADDR_FLAG_LINKLOCAL 0x1 #define NI_NODEADDR_FLAG_SITELOCAL 0x2 #define NI_NODEADDR_FLAG_GLOBAL 0x4 #define NI_NODEADDR_FLAG_ALL 0x8 #define NI_NODEADDR_FLAG_TRUNCATE 0x10 #define NI_NODEADDR_FLAG_ANYCAST 0x20 /* just experimental. not in spec */ #elif BYTE_ORDER == LITTLE_ENDIAN #define NI_NODEADDR_FLAG_LINKLOCAL 0x0100 #define NI_NODEADDR_FLAG_SITELOCAL 0x0200 #define NI_NODEADDR_FLAG_GLOBAL 0x0400 #define NI_NODEADDR_FLAG_ALL 0x0800 #define NI_NODEADDR_FLAG_TRUNCATE 0x1000 #define NI_NODEADDR_FLAG_ANYCAST 0x2000 /* just experimental. not in spec */ #endif #else /* draft-ietf-ipngwg-icmp-name-lookups-05 (and later?) */ #if BYTE_ORDER == BIG_ENDIAN #define NI_NODEADDR_FLAG_TRUNCATE 0x1 #define NI_NODEADDR_FLAG_ALL 0x2 #define NI_NODEADDR_FLAG_COMPAT 0x4 #define NI_NODEADDR_FLAG_LINKLOCAL 0x8 #define NI_NODEADDR_FLAG_SITELOCAL 0x10 #define NI_NODEADDR_FLAG_GLOBAL 0x20 #define NI_NODEADDR_FLAG_ANYCAST 0x40 /* just experimental. not in spec */ #elif BYTE_ORDER == LITTLE_ENDIAN #define NI_NODEADDR_FLAG_TRUNCATE 0x0100 #define NI_NODEADDR_FLAG_ALL 0x0200 #define NI_NODEADDR_FLAG_COMPAT 0x0400 #define NI_NODEADDR_FLAG_LINKLOCAL 0x0800 #define NI_NODEADDR_FLAG_SITELOCAL 0x1000 #define NI_NODEADDR_FLAG_GLOBAL 0x2000 #define NI_NODEADDR_FLAG_ANYCAST 0x4000 /* just experimental. not in spec */ #endif #endif struct ni_reply_fqdn { u_int32_t ni_fqdn_ttl; /* TTL */ u_int8_t ni_fqdn_namelen; /* length in octets of the FQDN */ u_int8_t ni_fqdn_name[3]; /* XXX: alignment */ } __packed; /* * Router Renumbering. 
as router-renum-08.txt */ struct icmp6_router_renum { /* router renumbering header */ struct icmp6_hdr rr_hdr; u_int8_t rr_segnum; u_int8_t rr_flags; u_int16_t rr_maxdelay; u_int32_t rr_reserved; } __packed; #define ICMP6_RR_FLAGS_TEST 0x80 #define ICMP6_RR_FLAGS_REQRESULT 0x40 #define ICMP6_RR_FLAGS_FORCEAPPLY 0x20 #define ICMP6_RR_FLAGS_SPECSITE 0x10 #define ICMP6_RR_FLAGS_PREVDONE 0x08 #define rr_type rr_hdr.icmp6_type #define rr_code rr_hdr.icmp6_code #define rr_cksum rr_hdr.icmp6_cksum #define rr_seqnum rr_hdr.icmp6_data32[0] struct rr_pco_match { /* match prefix part */ u_int8_t rpm_code; u_int8_t rpm_len; u_int8_t rpm_ordinal; u_int8_t rpm_matchlen; u_int8_t rpm_minlen; u_int8_t rpm_maxlen; u_int16_t rpm_reserved; struct in6_addr rpm_prefix; } __packed; #define RPM_PCO_ADD 1 #define RPM_PCO_CHANGE 2 #define RPM_PCO_SETGLOBAL 3 #define RPM_PCO_MAX 4 struct rr_pco_use { /* use prefix part */ u_int8_t rpu_uselen; u_int8_t rpu_keeplen; u_int8_t rpu_ramask; u_int8_t rpu_raflags; u_int32_t rpu_vltime; u_int32_t rpu_pltime; u_int32_t rpu_flags; struct in6_addr rpu_prefix; } __packed; #define ICMP6_RR_PCOUSE_RAFLAGS_ONLINK 0x80 #define ICMP6_RR_PCOUSE_RAFLAGS_AUTO 0x40 #if BYTE_ORDER == BIG_ENDIAN #define ICMP6_RR_PCOUSE_FLAGS_DECRVLTIME 0x80000000 #define ICMP6_RR_PCOUSE_FLAGS_DECRPLTIME 0x40000000 #elif BYTE_ORDER == LITTLE_ENDIAN #define ICMP6_RR_PCOUSE_FLAGS_DECRVLTIME 0x80 #define ICMP6_RR_PCOUSE_FLAGS_DECRPLTIME 0x40 #endif struct rr_result { /* router renumbering result message */ u_int16_t rrr_flags; u_int8_t rrr_ordinal; u_int8_t rrr_matchedlen; u_int32_t rrr_ifid; struct in6_addr rrr_prefix; } __packed; #if BYTE_ORDER == BIG_ENDIAN #define ICMP6_RR_RESULT_FLAGS_OOB 0x0002 #define ICMP6_RR_RESULT_FLAGS_FORBIDDEN 0x0001 #elif BYTE_ORDER == LITTLE_ENDIAN #define ICMP6_RR_RESULT_FLAGS_OOB 0x0200 #define ICMP6_RR_RESULT_FLAGS_FORBIDDEN 0x0100 #endif /* * icmp6 filter structures. */ struct icmp6_filter { u_int32_t icmp6_filt[8]; }; #ifdef _KERNEL #define ICMP6_FILTER_SETPASSALL(filterp) \ do { \ int i; u_char *p; \ p = (u_char *)filterp; \ for (i = 0; i < sizeof(struct icmp6_filter); i++) \ p[i] = 0xff; \ } while (/*CONSTCOND*/ 0) #define ICMP6_FILTER_SETBLOCKALL(filterp) \ bzero(filterp, sizeof(struct icmp6_filter)) #else /* _KERNEL */ #define ICMP6_FILTER_SETPASSALL(filterp) \ memset(filterp, 0xff, sizeof(struct icmp6_filter)) #define ICMP6_FILTER_SETBLOCKALL(filterp) \ memset(filterp, 0x00, sizeof(struct icmp6_filter)) #endif /* _KERNEL */ #define ICMP6_FILTER_SETPASS(type, filterp) \ (((filterp)->icmp6_filt[(type) >> 5]) |= (1 << ((type) & 31))) #define ICMP6_FILTER_SETBLOCK(type, filterp) \ (((filterp)->icmp6_filt[(type) >> 5]) &= ~(1 << ((type) & 31))) #define ICMP6_FILTER_WILLPASS(type, filterp) \ ((((filterp)->icmp6_filt[(type) >> 5]) & (1 << ((type) & 31))) != 0) #define ICMP6_FILTER_WILLBLOCK(type, filterp) \ ((((filterp)->icmp6_filt[(type) >> 5]) & (1 << ((type) & 31))) == 0) /* * Variables related to this implementation * of the internet control message protocol version 6. 
*/ -struct icmp6errstat { - uint64_t icp6errs_dst_unreach_noroute; - uint64_t icp6errs_dst_unreach_admin; - uint64_t icp6errs_dst_unreach_beyondscope; - uint64_t icp6errs_dst_unreach_addr; - uint64_t icp6errs_dst_unreach_noport; - uint64_t icp6errs_packet_too_big; - uint64_t icp6errs_time_exceed_transit; - uint64_t icp6errs_time_exceed_reassembly; - uint64_t icp6errs_paramprob_header; - uint64_t icp6errs_paramprob_nextheader; - uint64_t icp6errs_paramprob_option; - uint64_t icp6errs_redirect; /* we regard redirect as an error here */ - uint64_t icp6errs_unknown; -}; - struct icmp6stat { /* statistics related to icmp6 packets generated */ uint64_t icp6s_error; /* # of calls to icmp6_error */ uint64_t icp6s_canterror; /* no error 'cuz old was icmp */ uint64_t icp6s_toofreq; /* no error 'cuz rate limitation */ uint64_t icp6s_outhist[256]; /* statistics related to input message processed */ uint64_t icp6s_badcode; /* icmp6_code out of range */ uint64_t icp6s_tooshort; /* packet < sizeof(struct icmp6_hdr) */ uint64_t icp6s_checksum; /* bad checksum */ uint64_t icp6s_badlen; /* calculated bound mismatch */ uint64_t icp6s_dropped; /* # of packets dropped waiting for a resolution */ /* * number of responses: this member is inherited from netinet code, but * for netinet6 code, it is already available in icp6s_outhist[]. */ uint64_t icp6s_reflect; uint64_t icp6s_inhist[256]; uint64_t icp6s_nd_toomanyopt; /* too many ND options */ - struct icmp6errstat icp6s_outerrhist; -#define icp6s_odst_unreach_noroute \ - icp6s_outerrhist.icp6errs_dst_unreach_noroute -#define icp6s_odst_unreach_admin icp6s_outerrhist.icp6errs_dst_unreach_admin -#define icp6s_odst_unreach_beyondscope \ - icp6s_outerrhist.icp6errs_dst_unreach_beyondscope -#define icp6s_odst_unreach_addr icp6s_outerrhist.icp6errs_dst_unreach_addr -#define icp6s_odst_unreach_noport icp6s_outerrhist.icp6errs_dst_unreach_noport -#define icp6s_opacket_too_big icp6s_outerrhist.icp6errs_packet_too_big -#define icp6s_otime_exceed_transit \ - icp6s_outerrhist.icp6errs_time_exceed_transit -#define icp6s_otime_exceed_reassembly \ - icp6s_outerrhist.icp6errs_time_exceed_reassembly -#define icp6s_oparamprob_header icp6s_outerrhist.icp6errs_paramprob_header -#define icp6s_oparamprob_nextheader \ - icp6s_outerrhist.icp6errs_paramprob_nextheader -#define icp6s_oparamprob_option icp6s_outerrhist.icp6errs_paramprob_option -#define icp6s_oredirect icp6s_outerrhist.icp6errs_redirect -#define icp6s_ounknown icp6s_outerrhist.icp6errs_unknown + uint64_t icp6s_odst_unreach_noroute; + uint64_t icp6s_odst_unreach_admin; + uint64_t icp6s_odst_unreach_beyondscope; + uint64_t icp6s_odst_unreach_addr; + uint64_t icp6s_odst_unreach_noport; + uint64_t icp6s_opacket_too_big; + uint64_t icp6s_otime_exceed_transit; + uint64_t icp6s_otime_exceed_reassembly; + uint64_t icp6s_oparamprob_header; + uint64_t icp6s_oparamprob_nextheader; + uint64_t icp6s_oparamprob_option; + uint64_t icp6s_oredirect; + uint64_t icp6s_ounknown; uint64_t icp6s_pmtuchg; /* path MTU changes */ uint64_t icp6s_nd_badopt; /* bad ND options */ uint64_t icp6s_badns; /* bad neighbor solicitation */ uint64_t icp6s_badna; /* bad neighbor advertisement */ uint64_t icp6s_badrs; /* bad router solicitation */ uint64_t icp6s_badra; /* bad router advertisement */ uint64_t icp6s_badredirect; /* bad redirect message */ uint64_t icp6s_overflowdefrtr; /* Too many default routers. */ uint64_t icp6s_overflowprfx; /* Too many prefixes. */ uint64_t icp6s_overflownndp; /* Too many neighbour entries. 
*/ uint64_t icp6s_overflowredirect;/* Too many redirects. */ uint64_t icp6s_invlhlim; /* Invalid hop limit. */ uint64_t icp6s_spare[32]; }; #ifdef _KERNEL #include <net/vnet.h> +#include <netinet/in_kdtrace.h> #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet6_icmp6); #endif VNET_PCPUSTAT_DECLARE(struct icmp6stat, icmp6stat); /* * In-kernel consumers can use these accessor macros directly to update * stats. */ -#define ICMP6STAT_ADD(name, val) \ - VNET_PCPUSTAT_ADD(struct icmp6stat, icmp6stat, name, (val)) -#define ICMP6STAT_INC(name) ICMP6STAT_ADD(name, 1) +#define ICMP6STAT_ADD(name, val) \ + do { \ + MIB_SDT_PROBE1(icmp6, count, name, (val)); \ + VNET_PCPUSTAT_ADD(struct icmp6stat, icmp6stat, name, (val)); \ + } while (0) +#define ICMP6STAT_INC(name) ICMP6STAT_ADD(name, 1) +#define ICMP6STAT_INC2(name, type) \ + do { \ + MIB_SDT_PROBE2(icmp6, count, name, 1, type); \ + VNET_PCPUSTAT_ADD(struct icmp6stat, icmp6stat, name[type], 1); \ + } while (0) /* * Kernel module consumers must use this accessor macro. */ void kmod_icmp6stat_inc(int statnum); -#define KMOD_ICMP6STAT_INC(name) \ - kmod_icmp6stat_inc(offsetof(struct icmp6stat, name) / sizeof(uint64_t)) +#define KMOD_ICMP6STAT_INC(name) \ + do { \ + MIB_SDT_PROBE1(icmp6, count, name, 1); \ + kmod_icmp6stat_inc( \ + offsetof(struct icmp6stat, name) / sizeof(uint64_t)); \ + } while (0) #endif /* * Names for ICMP sysctl objects */ #define ICMPV6CTL_STATS 1 #define ICMPV6CTL_REDIRACCEPT 2 /* accept/process redirects */ #define ICMPV6CTL_REDIRTIMEOUT 3 /* redirect cache time */ #if 0 /*obsoleted*/ #define ICMPV6CTL_ERRRATELIMIT 5 /* ICMPv6 error rate limitation */ #endif #define ICMPV6CTL_ND6_PRUNE 6 #define ICMPV6CTL_ND6_DELAY 8 #define ICMPV6CTL_ND6_UMAXTRIES 9 #define ICMPV6CTL_ND6_MMAXTRIES 10 #define ICMPV6CTL_ND6_USELOOPBACK 11 /*#define ICMPV6CTL_ND6_PROXYALL 12 obsoleted, do not reuse here */ #define ICMPV6CTL_NODEINFO 13 #define ICMPV6CTL_ERRPPSLIMIT 14 /* ICMPv6 error pps limitation */ #define ICMPV6CTL_ND6_MAXNUDHINT 15 #define ICMPV6CTL_MTUDISC_HIWAT 16 #define ICMPV6CTL_MTUDISC_LOWAT 17 #define ICMPV6CTL_ND6_DEBUG 18 #define ICMPV6CTL_ND6_DRLIST 19 #define ICMPV6CTL_ND6_PRLIST 20 #define ICMPV6CTL_MLD_MAXSRCFILTER 21 #define ICMPV6CTL_MLD_SOMAXSRC 22 #define ICMPV6CTL_MLD_VERSION 23 #define ICMPV6CTL_ND6_MAXQLEN 24 #define ICMPV6CTL_NODEINFO_OLDMCPREFIX 25 #define ICMPV6CTL_MAXID 26 #ifdef _KERNEL # ifdef __STDC__ struct nhop_object; struct rttimer; struct in6_multi; # endif void icmp6_paramerror(struct mbuf *, int); int icmp6_errmap(const struct icmp6_hdr *); void icmp6_error(struct mbuf *, int, int, int); void icmp6_error2(struct mbuf *, int, int, int, struct ifnet *); int icmp6_input(struct mbuf **, int *, int); void icmp6_prepare(struct mbuf *); void icmp6_redirect_input(struct mbuf *, int); void icmp6_redirect_output(struct mbuf *, struct nhop_object *); int icmp6_ratelimit(const struct in6_addr *, const int, const int); struct ip6ctlparam; void icmp6_mtudisc_update(struct ip6ctlparam *, int); /* XXX: is this the right place for these macros?
*/ #define icmp6_ifstat_inc(ifp, tag) \ do { \ if (ifp) \ counter_u64_add(((struct in6_ifextra *) \ ((ifp)->if_afdata[AF_INET6]))->icmp6_ifstat[\ offsetof(struct icmp6_ifstat, tag) / sizeof(uint64_t)], 1);\ } while (/*CONSTCOND*/ 0) #define icmp6_ifoutstat_inc(ifp, type, code) \ do { \ icmp6_ifstat_inc(ifp, ifs6_out_msg); \ if (type < ICMP6_INFOMSG_MASK) \ icmp6_ifstat_inc(ifp, ifs6_out_error); \ switch (type) { \ case ICMP6_DST_UNREACH: \ icmp6_ifstat_inc(ifp, ifs6_out_dstunreach); \ if (code == ICMP6_DST_UNREACH_ADMIN) \ icmp6_ifstat_inc(ifp, ifs6_out_adminprohib); \ break; \ case ICMP6_PACKET_TOO_BIG: \ icmp6_ifstat_inc(ifp, ifs6_out_pkttoobig); \ break; \ case ICMP6_TIME_EXCEEDED: \ icmp6_ifstat_inc(ifp, ifs6_out_timeexceed); \ break; \ case ICMP6_PARAM_PROB: \ icmp6_ifstat_inc(ifp, ifs6_out_paramprob); \ break; \ case ICMP6_ECHO_REQUEST: \ icmp6_ifstat_inc(ifp, ifs6_out_echo); \ break; \ case ICMP6_ECHO_REPLY: \ icmp6_ifstat_inc(ifp, ifs6_out_echoreply); \ break; \ case MLD_LISTENER_QUERY: \ icmp6_ifstat_inc(ifp, ifs6_out_mldquery); \ break; \ case MLD_LISTENER_REPORT: \ icmp6_ifstat_inc(ifp, ifs6_out_mldreport); \ break; \ case MLD_LISTENER_DONE: \ icmp6_ifstat_inc(ifp, ifs6_out_mlddone); \ break; \ case ND_ROUTER_SOLICIT: \ icmp6_ifstat_inc(ifp, ifs6_out_routersolicit); \ break; \ case ND_ROUTER_ADVERT: \ icmp6_ifstat_inc(ifp, ifs6_out_routeradvert); \ break; \ case ND_NEIGHBOR_SOLICIT: \ icmp6_ifstat_inc(ifp, ifs6_out_neighborsolicit); \ break; \ case ND_NEIGHBOR_ADVERT: \ icmp6_ifstat_inc(ifp, ifs6_out_neighboradvert); \ break; \ case ND_REDIRECT: \ icmp6_ifstat_inc(ifp, ifs6_out_redirect); \ break; \ } \ } while (/*CONSTCOND*/ 0) #define ICMP6_NODEINFO_FQDNOK 0x1 #define ICMP6_NODEINFO_NODEADDROK 0x2 #define ICMP6_NODEINFO_TMPADDROK 0x4 #define ICMP6_NODEINFO_GLOBALOK 0x8 #endif /* _KERNEL */ #endif /* not _NETINET_ICMP6_H_ */ diff --git a/sys/netinet/icmp_var.h b/sys/netinet/icmp_var.h index a14b5607f39d..b1f2b0ebf911 100644 --- a/sys/netinet/icmp_var.h +++ b/sys/netinet/icmp_var.h @@ -1,100 +1,114 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _NETINET_ICMP_VAR_H_ #define _NETINET_ICMP_VAR_H_ /* * Variables related to this implementation * of the internet control message protocol. */ struct icmpstat { /* statistics related to icmp packets generated */ u_long icps_error; /* # of calls to icmp_error */ u_long icps_oldshort; /* no error 'cuz old ip too short */ u_long icps_oldicmp; /* no error 'cuz old was icmp */ u_long icps_outhist[ICMP_MAXTYPE + 1]; /* statistics related to input messages processed */ u_long icps_badcode; /* icmp_code out of range */ u_long icps_tooshort; /* packet < ICMP_MINLEN */ u_long icps_checksum; /* bad checksum */ u_long icps_badlen; /* calculated bound mismatch */ u_long icps_reflect; /* number of responses */ u_long icps_inhist[ICMP_MAXTYPE + 1]; u_long icps_bmcastecho; /* b/mcast echo requests dropped */ u_long icps_bmcasttstamp; /* b/mcast tstamp requests dropped */ u_long icps_badaddr; /* bad return address */ u_long icps_noroute; /* no route back */ }; #ifdef _KERNEL #include <net/vnet.h> +#include <netinet/in_kdtrace.h> VNET_PCPUSTAT_DECLARE(struct icmpstat, icmpstat); /* * In-kernel consumers can use these accessor macros directly to update * stats. */ -#define ICMPSTAT_ADD(name, val) \ - VNET_PCPUSTAT_ADD(struct icmpstat, icmpstat, name, (val)) +#define ICMPSTAT_ADD(name, val) \ + do { \ + MIB_SDT_PROBE1(icmp, count, name, (val)); \ + VNET_PCPUSTAT_ADD(struct icmpstat, icmpstat, name, (val)); \ + } while (0) + #define ICMPSTAT_INC(name) ICMPSTAT_ADD(name, 1) +#define ICMPSTAT_INC2(name, type) \ + do { \ + MIB_SDT_PROBE2(icmp, count, name, 1, type); \ + VNET_PCPUSTAT_ADD(struct icmpstat, icmpstat, name[type], 1); \ + } while (0) /* * Kernel module consumers must use this accessor macro.
*/ void kmod_icmpstat_inc(int statnum); -#define KMOD_ICMPSTAT_INC(name) \ - kmod_icmpstat_inc(offsetof(struct icmpstat, name) / sizeof(uint64_t)) +#define KMOD_ICMPSTAT_INC(name) \ + do { \ + MIB_SDT_PROBE1(icmp, count, name, 1); \ + kmod_icmpstat_inc( \ + offsetof(struct icmpstat, name) / sizeof(uint64_t)); \ + } while (0) #endif /* * Identifiers for ICMP sysctl nodes */ #define ICMPCTL_MASKREPL 1 /* allow replies to netmask requests */ #define ICMPCTL_STATS 2 /* statistics (read-only) */ #define ICMPCTL_ICMPLIM 3 #ifdef _KERNEL SYSCTL_DECL(_net_inet_icmp); extern int badport_bandlim(int); #define BANDLIM_UNLIMITED -1 #define BANDLIM_ICMP_UNREACH 0 #define BANDLIM_ICMP_ECHO 1 #define BANDLIM_ICMP_TSTAMP 2 #define BANDLIM_RST_CLOSEDPORT 3 /* No connection, and no listeners */ #define BANDLIM_RST_OPENPORT 4 /* No connection, listener */ #define BANDLIM_ICMP6_UNREACH 5 #define BANDLIM_SCTP_OOTB 6 #define BANDLIM_MAX 7 #endif #endif diff --git a/sys/netinet/in_kdtrace.c b/sys/netinet/in_kdtrace.c index 68a9c91ecba6..2a53b11c3be2 100644 --- a/sys/netinet/in_kdtrace.c +++ b/sys/netinet/in_kdtrace.c @@ -1,167 +1,472 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2013 Mark Johnston * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include #include +SDT_PROVIDER_DEFINE(mib); SDT_PROVIDER_DEFINE(ip); SDT_PROVIDER_DEFINE(tcp); SDT_PROVIDER_DEFINE(udp); SDT_PROVIDER_DEFINE(udplite); +#ifndef KDTRACE_NO_MIB_SDT +#define MIB_PROBE_IP(name) \ + SDT_PROBE_DEFINE1(mib, ip, count, name, \ + "int") + +MIB_PROBE_IP(ips_total); +MIB_PROBE_IP(ips_badsum); +MIB_PROBE_IP(ips_tooshort); +MIB_PROBE_IP(ips_toosmall); +MIB_PROBE_IP(ips_badhlen); +MIB_PROBE_IP(ips_badlen); +MIB_PROBE_IP(ips_fragments); +MIB_PROBE_IP(ips_fragdropped); +MIB_PROBE_IP(ips_fragtimeout); +MIB_PROBE_IP(ips_forward); +MIB_PROBE_IP(ips_fastforward); +MIB_PROBE_IP(ips_cantforward); +MIB_PROBE_IP(ips_redirectsent); +MIB_PROBE_IP(ips_noproto); +MIB_PROBE_IP(ips_delivered); +MIB_PROBE_IP(ips_localout); +MIB_PROBE_IP(ips_odropped); +MIB_PROBE_IP(ips_reassembled); +MIB_PROBE_IP(ips_fragmented); +MIB_PROBE_IP(ips_ofragments); +MIB_PROBE_IP(ips_cantfrag); +MIB_PROBE_IP(ips_badoptions); +MIB_PROBE_IP(ips_noroute); +MIB_PROBE_IP(ips_badvers); +MIB_PROBE_IP(ips_rawout); +MIB_PROBE_IP(ips_toolong); +MIB_PROBE_IP(ips_notmember); +MIB_PROBE_IP(ips_nogif); +MIB_PROBE_IP(ips_badaddr); + +#define MIB_PROBE_IP6(name) \ + SDT_PROBE_DEFINE1(mib, ip6, count, name, \ + "int") +#define MIB_PROBE2_IP6(name) \ + SDT_PROBE_DEFINE2(mib, ip6, count, name, \ + "int", "int") + +MIB_PROBE_IP6(ip6s_total); +MIB_PROBE_IP6(ip6s_tooshort); +MIB_PROBE_IP6(ip6s_toosmall); +MIB_PROBE_IP6(ip6s_fragments); +MIB_PROBE_IP6(ip6s_fragdropped); +MIB_PROBE_IP6(ip6s_fragtimeout); +MIB_PROBE_IP6(ip6s_fragoverflow); +MIB_PROBE_IP6(ip6s_forward); +MIB_PROBE_IP6(ip6s_cantforward); +MIB_PROBE_IP6(ip6s_redirectsent); +MIB_PROBE_IP6(ip6s_delivered); +MIB_PROBE_IP6(ip6s_localout); +MIB_PROBE_IP6(ip6s_odropped); +MIB_PROBE_IP6(ip6s_reassembled); +MIB_PROBE_IP6(ip6s_atomicfrags); +MIB_PROBE_IP6(ip6s_fragmented); +MIB_PROBE_IP6(ip6s_ofragments); +MIB_PROBE_IP6(ip6s_cantfrag); +MIB_PROBE_IP6(ip6s_badoptions); +MIB_PROBE_IP6(ip6s_noroute); +MIB_PROBE_IP6(ip6s_badvers); +MIB_PROBE_IP6(ip6s_rawout); +MIB_PROBE_IP6(ip6s_badscope); +MIB_PROBE_IP6(ip6s_notmember); +MIB_PROBE2_IP6(ip6s_nxthist); +MIB_PROBE_IP6(ip6s_m1); +MIB_PROBE2_IP6(ip6s_m2m); +MIB_PROBE_IP6(ip6s_mext1); +MIB_PROBE_IP6(ip6s_mext2m); +MIB_PROBE_IP6(ip6s_exthdrtoolong); +MIB_PROBE_IP6(ip6s_nogif); +MIB_PROBE_IP6(ip6s_toomanyhdr); +MIB_PROBE_IP6(ip6s_sources_none); +MIB_PROBE2_IP6(ip6s_sources_sameif); +MIB_PROBE2_IP6(ip6s_sources_otherif); +MIB_PROBE2_IP6(ip6s_sources_samescope); +MIB_PROBE2_IP6(ip6s_sources_otherscope); +MIB_PROBE2_IP6(ip6s_sources_deprecated); +MIB_PROBE2_IP6(ip6s_sources_rule); + +#define MIB_PROBE_ICMP(name) \ + SDT_PROBE_DEFINE1(mib, icmp, count, name, \ + "int") +#define MIB_PROBE2_ICMP(name) \ + SDT_PROBE_DEFINE2(mib, icmp, count, name, \ + "int", "int") + +MIB_PROBE_ICMP(icps_error); +MIB_PROBE_ICMP(icps_oldshort); +MIB_PROBE_ICMP(icps_oldicmp); +MIB_PROBE2_ICMP(icps_outhist); +MIB_PROBE_ICMP(icps_badcode); +MIB_PROBE_ICMP(icps_tooshort); +MIB_PROBE_ICMP(icps_checksum); +MIB_PROBE_ICMP(icps_badlen); +MIB_PROBE_ICMP(icps_reflect); +MIB_PROBE2_ICMP(icps_inhist); +MIB_PROBE_ICMP(icps_bmcastecho); +MIB_PROBE_ICMP(icps_bmcasttstamp); +MIB_PROBE_ICMP(icps_badaddr); +MIB_PROBE_ICMP(icps_noroute); + +#define MIB_PROBE_ICMP6(name) \ + SDT_PROBE_DEFINE1(mib, icmp6, count, name, \ + "int") +#define MIB_PROBE2_ICMP6(name) \ + SDT_PROBE_DEFINE2(mib, icmp6, count, name, \ + "int", "int") + +MIB_PROBE_ICMP6(icp6s_error); +MIB_PROBE_ICMP6(icp6s_canterror); +MIB_PROBE_ICMP6(icp6s_toofreq); +MIB_PROBE2_ICMP6(icp6s_outhist); 
+MIB_PROBE_ICMP6(icp6s_badcode); +MIB_PROBE_ICMP6(icp6s_tooshort); +MIB_PROBE_ICMP6(icp6s_checksum); +MIB_PROBE_ICMP6(icp6s_badlen); +MIB_PROBE_ICMP6(icp6s_dropped); +MIB_PROBE_ICMP6(icp6s_reflect); +MIB_PROBE2_ICMP6(icp6s_inhist); +MIB_PROBE_ICMP6(icp6s_nd_toomanyopt); +MIB_PROBE_ICMP6(icp6s_odst_unreach_noroute); +MIB_PROBE_ICMP6(icp6s_odst_unreach_admin); +MIB_PROBE_ICMP6(icp6s_odst_unreach_beyondscope); +MIB_PROBE_ICMP6(icp6s_odst_unreach_addr); +MIB_PROBE_ICMP6(icp6s_odst_unreach_noport); +MIB_PROBE_ICMP6(icp6s_opacket_too_big); +MIB_PROBE_ICMP6(icp6s_otime_exceed_transit); +MIB_PROBE_ICMP6(icp6s_otime_exceed_reassembly); +MIB_PROBE_ICMP6(icp6s_oparamprob_header); +MIB_PROBE_ICMP6(icp6s_oparamprob_nextheader); +MIB_PROBE_ICMP6(icp6s_oparamprob_option); +MIB_PROBE_ICMP6(icp6s_oredirect); +MIB_PROBE_ICMP6(icp6s_ounknown); +MIB_PROBE_ICMP6(icp6s_pmtuchg); +MIB_PROBE_ICMP6(icp6s_nd_badopt); +MIB_PROBE_ICMP6(icp6s_badns); +MIB_PROBE_ICMP6(icp6s_badna); +MIB_PROBE_ICMP6(icp6s_badrs); +MIB_PROBE_ICMP6(icp6s_badra); +MIB_PROBE_ICMP6(icp6s_badredirect); +MIB_PROBE_ICMP6(icp6s_overflowdefrtr); +MIB_PROBE_ICMP6(icp6s_overflowprfx); +MIB_PROBE_ICMP6(icp6s_overflownndp); +MIB_PROBE_ICMP6(icp6s_overflowredirect); +MIB_PROBE_ICMP6(icp6s_invlhlim); + +#define MIB_PROBE_UDP(name) SDT_PROBE_DEFINE1(mib, udp, count, name, "int") +MIB_PROBE_UDP(udps_ipackets); +MIB_PROBE_UDP(udps_hdrops); +MIB_PROBE_UDP(udps_badsum); +MIB_PROBE_UDP(udps_nosum); +MIB_PROBE_UDP(udps_badlen); +MIB_PROBE_UDP(udps_noport); +MIB_PROBE_UDP(udps_noportbcast); +MIB_PROBE_UDP(udps_fullsock); +MIB_PROBE_UDP(udps_pcbcachemiss); +MIB_PROBE_UDP(udps_pcbhashmiss); +MIB_PROBE_UDP(udps_opackets); +MIB_PROBE_UDP(udps_fastout); +MIB_PROBE_UDP(udps_noportmcast); +MIB_PROBE_UDP(udps_filtermcast); + +#define MIB_PROBE_TCP(name) SDT_PROBE_DEFINE1(mib, tcp, count, name, "int") + +MIB_PROBE_TCP(tcps_connattempt); +MIB_PROBE_TCP(tcps_accepts); +MIB_PROBE_TCP(tcps_connects); +MIB_PROBE_TCP(tcps_drops); +MIB_PROBE_TCP(tcps_conndrops); +MIB_PROBE_TCP(tcps_minmssdrops); +MIB_PROBE_TCP(tcps_closed); +MIB_PROBE_TCP(tcps_segstimed); +MIB_PROBE_TCP(tcps_rttupdated); +MIB_PROBE_TCP(tcps_delack); +MIB_PROBE_TCP(tcps_timeoutdrop); +MIB_PROBE_TCP(tcps_rexmttimeo); +MIB_PROBE_TCP(tcps_persisttimeo); +MIB_PROBE_TCP(tcps_keeptimeo); +MIB_PROBE_TCP(tcps_keepprobe); +MIB_PROBE_TCP(tcps_keepdrops); +MIB_PROBE_TCP(tcps_progdrops); + +MIB_PROBE_TCP(tcps_sndtotal); +MIB_PROBE_TCP(tcps_sndpack); +MIB_PROBE_TCP(tcps_sndbyte); +MIB_PROBE_TCP(tcps_sndrexmitpack); +MIB_PROBE_TCP(tcps_sndrexmitbyte); +MIB_PROBE_TCP(tcps_sndrexmitbad); +MIB_PROBE_TCP(tcps_sndacks); +MIB_PROBE_TCP(tcps_sndprobe); +MIB_PROBE_TCP(tcps_sndurg); +MIB_PROBE_TCP(tcps_sndwinup); +MIB_PROBE_TCP(tcps_sndctrl); + +MIB_PROBE_TCP(tcps_rcvtotal); +MIB_PROBE_TCP(tcps_rcvpack); +MIB_PROBE_TCP(tcps_rcvbyte); +MIB_PROBE_TCP(tcps_rcvbadsum); +MIB_PROBE_TCP(tcps_rcvbadoff); +MIB_PROBE_TCP(tcps_rcvreassfull); +MIB_PROBE_TCP(tcps_rcvshort); +MIB_PROBE_TCP(tcps_rcvduppack); +MIB_PROBE_TCP(tcps_rcvdupbyte); +MIB_PROBE_TCP(tcps_rcvpartduppack); +MIB_PROBE_TCP(tcps_rcvpartdupbyte); +MIB_PROBE_TCP(tcps_rcvoopack); +MIB_PROBE_TCP(tcps_rcvoobyte); +MIB_PROBE_TCP(tcps_rcvpackafterwin); +MIB_PROBE_TCP(tcps_rcvbyteafterwin); +MIB_PROBE_TCP(tcps_rcvafterclose); +MIB_PROBE_TCP(tcps_rcvwinprobe); +MIB_PROBE_TCP(tcps_rcvdupack); +MIB_PROBE_TCP(tcps_rcvacktoomuch); +MIB_PROBE_TCP(tcps_rcvackpack); +MIB_PROBE_TCP(tcps_rcvackbyte); +MIB_PROBE_TCP(tcps_rcvwinupd); +MIB_PROBE_TCP(tcps_pawsdrop); +MIB_PROBE_TCP(tcps_predack);
+MIB_PROBE_TCP(tcps_preddat); +MIB_PROBE_TCP(tcps_pcbcachemiss); +MIB_PROBE_TCP(tcps_cachedrtt); +MIB_PROBE_TCP(tcps_cachedrttvar); +MIB_PROBE_TCP(tcps_cachedssthresh); +MIB_PROBE_TCP(tcps_usedrtt); +MIB_PROBE_TCP(tcps_usedrttvar); +MIB_PROBE_TCP(tcps_usedssthresh); +MIB_PROBE_TCP(tcps_persistdrop); +MIB_PROBE_TCP(tcps_badsyn); +MIB_PROBE_TCP(tcps_mturesent); +MIB_PROBE_TCP(tcps_listendrop); +MIB_PROBE_TCP(tcps_badrst); + +MIB_PROBE_TCP(tcps_sc_added); +MIB_PROBE_TCP(tcps_sc_retransmitted); +MIB_PROBE_TCP(tcps_sc_dupsyn); +MIB_PROBE_TCP(tcps_sc_dropped); +MIB_PROBE_TCP(tcps_sc_completed); +MIB_PROBE_TCP(tcps_sc_bucketoverflow); +MIB_PROBE_TCP(tcps_sc_cacheoverflow); +MIB_PROBE_TCP(tcps_sc_reset); +MIB_PROBE_TCP(tcps_sc_stale); +MIB_PROBE_TCP(tcps_sc_aborted); +MIB_PROBE_TCP(tcps_sc_badack); +MIB_PROBE_TCP(tcps_sc_unreach); +MIB_PROBE_TCP(tcps_sc_zonefail); +MIB_PROBE_TCP(tcps_sc_sendcookie); +MIB_PROBE_TCP(tcps_sc_recvcookie); + +MIB_PROBE_TCP(tcps_hc_added); +MIB_PROBE_TCP(tcps_hc_bucketoverflow); + +MIB_PROBE_TCP(tcps_finwait2_drops); + +MIB_PROBE_TCP(tcps_sack_recovery_episode); +MIB_PROBE_TCP(tcps_sack_rexmits); +MIB_PROBE_TCP(tcps_sack_rexmit_bytes); +MIB_PROBE_TCP(tcps_sack_rcv_blocks); +MIB_PROBE_TCP(tcps_sack_send_blocks); +MIB_PROBE_TCP(tcps_sack_lostrexmt); +MIB_PROBE_TCP(tcps_sack_sboverflow); + +MIB_PROBE_TCP(tcps_ecn_rcvce); +MIB_PROBE_TCP(tcps_ecn_rcvect0); +MIB_PROBE_TCP(tcps_ecn_rcvect1); +MIB_PROBE_TCP(tcps_ecn_shs); +MIB_PROBE_TCP(tcps_ecn_rcwnd); + +MIB_PROBE_TCP(tcps_sig_rcvgoodsig); +MIB_PROBE_TCP(tcps_sig_rcvbadsig); +MIB_PROBE_TCP(tcps_sig_err_buildsig); +MIB_PROBE_TCP(tcps_sig_err_sigopt); +MIB_PROBE_TCP(tcps_sig_err_nosigopt); + +MIB_PROBE_TCP(tcps_pmtud_blackhole_activated); +MIB_PROBE_TCP(tcps_pmtud_blackhole_activated_min_mss); +MIB_PROBE_TCP(tcps_pmtud_blackhole_failed); + +MIB_PROBE_TCP(tcps_tunneled_pkts); +MIB_PROBE_TCP(tcps_tunneled_errs); + +MIB_PROBE_TCP(tcps_dsack_count); +MIB_PROBE_TCP(tcps_dsack_bytes); +MIB_PROBE_TCP(tcps_dsack_tlp_bytes); + +MIB_PROBE_TCP(tcps_tw_recycles); +MIB_PROBE_TCP(tcps_tw_resets); +MIB_PROBE_TCP(tcps_tw_responds); + +MIB_PROBE_TCP(tcps_ace_nect); +MIB_PROBE_TCP(tcps_ace_ect1); +MIB_PROBE_TCP(tcps_ace_ect0); +MIB_PROBE_TCP(tcps_ace_ce); + +MIB_PROBE_TCP(tcps_ecn_sndect0); +MIB_PROBE_TCP(tcps_ecn_sndect1); + +MIB_PROBE_TCP(tcps_tlpresends); +MIB_PROBE_TCP(tcps_tlpresend_bytes); + +#endif + SDT_PROBE_DEFINE6_XLATE(ip, , , receive, "void *", "pktinfo_t *", "void *", "csinfo_t *", "uint8_t *", "ipinfo_t *", "struct ifnet *", "ifinfo_t *", "struct ip *", "ipv4info_t *", "struct ip6_hdr *", "ipv6info_t *"); SDT_PROBE_DEFINE6_XLATE(ip, , , send, "void *", "pktinfo_t *", "void *", "csinfo_t *", "uint8_t *", "ipinfo_t *", "struct ifnet *", "ifinfo_t *", "struct ip *", "ipv4info_t *", "struct ip6_hdr *", "ipv6info_t *"); SDT_PROBE_DEFINE5_XLATE(tcp, , , accept__established, "void *", "pktinfo_t *", "struct tcpcb *", "csinfo_t *", "struct mbuf *", "ipinfo_t *", "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfoh_t *"); SDT_PROBE_DEFINE5_XLATE(tcp, , , accept__refused, "void *", "pktinfo_t *", "struct tcpcb *", "csinfo_t *", "struct mbuf *", "ipinfo_t *", "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfo_t *"); SDT_PROBE_DEFINE5_XLATE(tcp, , , connect__established, "void *", "pktinfo_t *", "struct tcpcb *", "csinfo_t *", "struct mbuf *", "ipinfo_t *", "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfoh_t *"); SDT_PROBE_DEFINE5_XLATE(tcp, , , connect__refused, "void *", "pktinfo_t *", "struct tcpcb *",
"csinfo_t *", "struct mbuf *", "ipinfo_t *", "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfoh_t *"); SDT_PROBE_DEFINE5_XLATE(tcp, , , connect__request, "void *", "pktinfo_t *", "struct tcpcb *", "csinfo_t *", "uint8_t *", "ipinfo_t *", "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfo_t *"); SDT_PROBE_DEFINE5_XLATE(tcp, , , receive, "void *", "pktinfo_t *", "struct tcpcb *", "csinfo_t *", "struct mbuf *", "ipinfo_t *", "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfoh_t *"); SDT_PROBE_DEFINE5_XLATE(tcp, , , send, "void *", "pktinfo_t *", "struct tcpcb *", "csinfo_t *", "uint8_t *", "ipinfo_t *", "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfo_t *"); SDT_PROBE_DEFINE1_XLATE(tcp, , , siftr, "struct pkt_node *", "siftrinfo_t *"); SDT_PROBE_DEFINE3_XLATE(tcp, , , debug__input, "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfoh_t *", "struct mbuf *", "ipinfo_t *"); SDT_PROBE_DEFINE3_XLATE(tcp, , , debug__output, "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfo_t *", "struct mbuf *", "ipinfo_t *"); SDT_PROBE_DEFINE2_XLATE(tcp, , , debug__user, "struct tcpcb *", "tcpsinfo_t *" , "int", "int"); SDT_PROBE_DEFINE3_XLATE(tcp, , , debug__drop, "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfoh_t *", "struct mbuf *", "ipinfo_t *"); SDT_PROBE_DEFINE6_XLATE(tcp, , , state__change, "void *", "void *", "struct tcpcb *", "csinfo_t *", "void *", "void *", "struct tcpcb *", "tcpsinfo_t *", "void *", "void *", "int", "tcplsinfo_t *"); SDT_PROBE_DEFINE6_XLATE(tcp, , , receive__autoresize, "void *", "void *", "struct tcpcb *", "csinfo_t *", "struct mbuf *", "ipinfo_t *", "struct tcpcb *", "tcpsinfo_t *" , "struct tcphdr *", "tcpinfoh_t *", "int", "int"); SDT_PROBE_DEFINE5_XLATE(udp, , , receive, "void *", "pktinfo_t *", "struct inpcb *", "csinfo_t *", "uint8_t *", "ipinfo_t *", "struct inpcb *", "udpsinfo_t *", "struct udphdr *", "udpinfo_t *"); SDT_PROBE_DEFINE5_XLATE(udp, , , send, "void *", "pktinfo_t *", "struct inpcb *", "csinfo_t *", "uint8_t *", "ipinfo_t *", "struct inpcb *", "udpsinfo_t *", "struct udphdr *", "udpinfo_t *"); SDT_PROBE_DEFINE5_XLATE(udplite, , , receive, "void *", "pktinfo_t *", "struct inpcb *", "csinfo_t *", "uint8_t *", "ipinfo_t *", "struct inpcb *", "udplitesinfo_t *", "struct udphdr *", "udpliteinfo_t *"); SDT_PROBE_DEFINE5_XLATE(udplite, , , send, "void *", "pktinfo_t *", "struct inpcb *", "csinfo_t *", "uint8_t *", "ipinfo_t *", "struct inpcb *", "udplitesinfo_t *", "struct udphdr *", "udpliteinfo_t *"); diff --git a/sys/netinet/in_kdtrace.h b/sys/netinet/in_kdtrace.h index ca279cdfc3be..780839299993 100644 --- a/sys/netinet/in_kdtrace.h +++ b/sys/netinet/in_kdtrace.h @@ -1,130 +1,409 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2013 Mark Johnston * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the * distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _SYS_IN_KDTRACE_H_ #define _SYS_IN_KDTRACE_H_ +#include <sys/sdt.h> + #define IP_PROBE(probe, arg0, arg1, arg2, arg3, arg4, arg5) \ SDT_PROBE6(ip, , , probe, arg0, arg1, arg2, arg3, arg4, arg5) #define UDP_PROBE(probe, arg0, arg1, arg2, arg3, arg4) \ SDT_PROBE5(udp, , , probe, arg0, arg1, arg2, arg3, arg4) #define UDPLITE_PROBE(probe, arg0, arg1, arg2, arg3, arg4) \ SDT_PROBE5(udplite, , , probe, arg0, arg1, arg2, arg3, arg4) #define TCP_PROBE1(probe, arg0) \ SDT_PROBE1(tcp, , , probe, arg0) #define TCP_PROBE2(probe, arg0, arg1) \ SDT_PROBE2(tcp, , , probe, arg0, arg1) #define TCP_PROBE3(probe, arg0, arg1, arg2) \ SDT_PROBE3(tcp, , , probe, arg0, arg1, arg2) #define TCP_PROBE4(probe, arg0, arg1, arg2, arg3) \ SDT_PROBE4(tcp, , , probe, arg0, arg1, arg2, arg3) #define TCP_PROBE5(probe, arg0, arg1, arg2, arg3, arg4) \ SDT_PROBE5(tcp, , , probe, arg0, arg1, arg2, arg3, arg4) #define TCP_PROBE6(probe, arg0, arg1, arg2, arg3, arg4, arg5) \ SDT_PROBE6(tcp, , , probe, arg0, arg1, arg2, arg3, arg4, arg5) SDT_PROVIDER_DECLARE(ip); SDT_PROVIDER_DECLARE(tcp); SDT_PROVIDER_DECLARE(udp); SDT_PROVIDER_DECLARE(udplite); +#ifndef KDTRACE_NO_MIB_SDT +SDT_PROVIDER_DECLARE(mib); + +SDT_PROBE_DECLARE(mib, ip, count, ips_total); +SDT_PROBE_DECLARE(mib, ip, count, ips_badsum); +SDT_PROBE_DECLARE(mib, ip, count, ips_tooshort); +SDT_PROBE_DECLARE(mib, ip, count, ips_toosmall); +SDT_PROBE_DECLARE(mib, ip, count, ips_badhlen); +SDT_PROBE_DECLARE(mib, ip, count, ips_badlen); +SDT_PROBE_DECLARE(mib, ip, count, ips_fragments); +SDT_PROBE_DECLARE(mib, ip, count, ips_fragdropped); +SDT_PROBE_DECLARE(mib, ip, count, ips_fragtimeout); +SDT_PROBE_DECLARE(mib, ip, count, ips_forward); +SDT_PROBE_DECLARE(mib, ip, count, ips_fastforward); +SDT_PROBE_DECLARE(mib, ip, count, ips_cantforward); +SDT_PROBE_DECLARE(mib, ip, count, ips_redirectsent); +SDT_PROBE_DECLARE(mib, ip, count, ips_noproto); +SDT_PROBE_DECLARE(mib, ip, count, ips_delivered); +SDT_PROBE_DECLARE(mib, ip, count, ips_localout); +SDT_PROBE_DECLARE(mib, ip, count, ips_odropped); +SDT_PROBE_DECLARE(mib, ip, count, ips_reassembled); +SDT_PROBE_DECLARE(mib, ip, count, ips_fragmented); +SDT_PROBE_DECLARE(mib, ip, count, ips_ofragments); +SDT_PROBE_DECLARE(mib, ip, count, ips_cantfrag); +SDT_PROBE_DECLARE(mib, ip, count, ips_badoptions); +SDT_PROBE_DECLARE(mib, ip, count, ips_noroute); +SDT_PROBE_DECLARE(mib, ip, count, ips_badvers); +SDT_PROBE_DECLARE(mib, ip, count, ips_rawout); +SDT_PROBE_DECLARE(mib, ip, count, ips_toolong); +SDT_PROBE_DECLARE(mib, ip, count, ips_notmember); +SDT_PROBE_DECLARE(mib, ip, count, ips_nogif); +SDT_PROBE_DECLARE(mib, ip, count, ips_badaddr); + +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_total); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_tooshort); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_toosmall);
+SDT_PROBE_DECLARE(mib, ip6, count, ip6s_fragments); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_fragdropped); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_fragtimeout); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_fragoverflow); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_forward); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_cantforward); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_redirectsent); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_delivered); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_localout); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_odropped); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_reassembled); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_atomicfrags); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_fragmented); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_ofragments); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_cantfrag); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_badoptions); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_noroute); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_badvers); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_rawout); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_badscope); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_notmember); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_nxthist); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_m1); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_m2m); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_mext1); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_mext2m); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_exthdrtoolong); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_nogif); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_toomanyhdr); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_sources_none); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_sources_sameif); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_sources_otherif); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_sources_samescope); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_sources_otherscope); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_sources_deprecated); +SDT_PROBE_DECLARE(mib, ip6, count, ip6s_sources_rule); + +SDT_PROBE_DECLARE(mib, icmp, count, icps_error); +SDT_PROBE_DECLARE(mib, icmp, count, icps_oldshort); +SDT_PROBE_DECLARE(mib, icmp, count, icps_oldicmp); +SDT_PROBE_DECLARE(mib, icmp, count, icps_outhist); +SDT_PROBE_DECLARE(mib, icmp, count, icps_badcode); +SDT_PROBE_DECLARE(mib, icmp, count, icps_tooshort); +SDT_PROBE_DECLARE(mib, icmp, count, icps_checksum); +SDT_PROBE_DECLARE(mib, icmp, count, icps_badlen); +SDT_PROBE_DECLARE(mib, icmp, count, icps_reflect); +SDT_PROBE_DECLARE(mib, icmp, count, icps_inhist); +SDT_PROBE_DECLARE(mib, icmp, count, icps_bmcastecho); +SDT_PROBE_DECLARE(mib, icmp, count, icps_bmcasttstamp); +SDT_PROBE_DECLARE(mib, icmp, count, icps_badaddr); +SDT_PROBE_DECLARE(mib, icmp, count, icps_noroute); + +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_error); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_canterror); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_toofreq); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_outhist); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_badcode); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_tooshort); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_checksum); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_badlen); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_dropped); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_reflect); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_inhist); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_nd_toomanyopt); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_odst_unreach_noroute); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_odst_unreach_admin); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_odst_unreach_beyondscope); 
+SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_odst_unreach_addr); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_odst_unreach_noport); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_opacket_too_big); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_otime_exceed_transit); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_otime_exceed_reassembly); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_oparamprob_header); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_oparamprob_nextheader); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_oparamprob_option); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_oredirect); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_ounknown); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_pmtuchg); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_nd_badopt); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_badns); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_badna); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_badrs); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_badra); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_badredirect); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_overflowdefrtr); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_overflowprfx); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_overflownndp); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_overflowredirect); +SDT_PROBE_DECLARE(mib, icmp6, count, icp6s_invlhlim); + +SDT_PROBE_DECLARE(mib, udp, count, udps_ipackets); +SDT_PROBE_DECLARE(mib, udp, count, udps_hdrops); +SDT_PROBE_DECLARE(mib, udp, count, udps_badsum); +SDT_PROBE_DECLARE(mib, udp, count, udps_nosum); +SDT_PROBE_DECLARE(mib, udp, count, udps_badlen); +SDT_PROBE_DECLARE(mib, udp, count, udps_noport); +SDT_PROBE_DECLARE(mib, udp, count, udps_noportbcast); +SDT_PROBE_DECLARE(mib, udp, count, udps_fullsock); +SDT_PROBE_DECLARE(mib, udp, count, udps_pcbcachemiss); +SDT_PROBE_DECLARE(mib, udp, count, udps_pcbhashmiss); +SDT_PROBE_DECLARE(mib, udp, count, udps_opackets); +SDT_PROBE_DECLARE(mib, udp, count, udps_fastout); +SDT_PROBE_DECLARE(mib, udp, count, udps_noportmcast); +SDT_PROBE_DECLARE(mib, udp, count, udps_filtermcast); + +SDT_PROBE_DECLARE(mib, tcp, count, tcps_connattempt); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_accepts); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_connects); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_drops); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_conndrops); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_minmssdrops); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_closed); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_segstimed); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_rttupdated); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_delack); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_timeoutdrop); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_rexmttimeo); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_persisttimeo); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_keeptimeo); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_keepprobe); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_keepdrops); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_progdrops); + +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sndtotal); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sndpack); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sndbyte); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sndrexmitpack); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sndrexmitbyte); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sndrexmitbad); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sndacks); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sndprobe); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sndurg); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sndwinup); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sndctrl); + +SDT_PROBE_DECLARE(mib, 
tcp, count, tcps_rcvtotal); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvpack); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvbyte); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvbadsum); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvbadoff); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvreassfull); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvshort); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvduppack); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvdupbyte); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvpartduppack); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvpartdupbyte); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvoopack); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvoobyte); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvpackafterwin); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvbyteafterwin); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvafterclose); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvwinprobe); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvdupack); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvacktoomuch); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvackpack); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvackbyte); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_rcvwinupd); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_pawsdrop); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_predack); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_preddat); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_pcbcachemiss); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_cachedrtt); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_cachedrttvar); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_cachedssthresh); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_usedrtt); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_usedrttvar); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_usedssthresh); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_persistdrop); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_badsyn); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_mturesent); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_listendrop); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_badrst); + +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_added); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_retransmitted); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_dupsyn); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_dropped); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_completed); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_bucketoverflow); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_cacheoverflow); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_reset); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_stale); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_aborted); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_badack); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_unreach); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_zonefail); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_sendcookie); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sc_recvcookie); + +SDT_PROBE_DECLARE(mib, tcp, count, tcps_hc_added); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_hc_bucketoverflow); + +SDT_PROBE_DECLARE(mib, tcp, count, tcps_finwait2_drops); + +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sack_recovery_episode); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sack_rexmits); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sack_rexmit_bytes); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sack_rcv_blocks); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sack_send_blocks); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sack_lostrexmt); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sack_sboverflow); + +SDT_PROBE_DECLARE(mib, tcp, count, tcps_ecn_rcvce); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_ecn_rcvect0); +SDT_PROBE_DECLARE(mib, tcp, count, 
tcps_ecn_rcvect1); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_ecn_shs); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_ecn_rcwnd); + +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sig_rcvgoodsig); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sig_rcvbadsig); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sig_err_buildsig); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sig_err_sigopt); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_sig_err_nosigopt); + +SDT_PROBE_DECLARE(mib, tcp, count, tcps_pmtud_blackhole_activated); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_pmtud_blackhole_activated_min_mss); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_pmtud_blackhole_failed); + +SDT_PROBE_DECLARE(mib, tcp, count, tcps_tunneled_pkts); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_tunneled_errs); + +SDT_PROBE_DECLARE(mib, tcp, count, tcps_dsack_count); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_dsack_bytes); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_dsack_tlp_bytes); + +SDT_PROBE_DECLARE(mib, tcp, count, tcps_tw_recycles); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_tw_resets); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_tw_responds); + +SDT_PROBE_DECLARE(mib, tcp, count, tcps_ace_nect); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_ace_ect1); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_ace_ect0); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_ace_ce); + +SDT_PROBE_DECLARE(mib, tcp, count, tcps_ecn_sndect0); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_ecn_sndect1); + +SDT_PROBE_DECLARE(mib, tcp, count, tcps_tlpresends); +SDT_PROBE_DECLARE(mib, tcp, count, tcps_tlpresend_bytes); +#endif + SDT_PROBE_DECLARE(ip, , , receive); SDT_PROBE_DECLARE(ip, , , send); SDT_PROBE_DECLARE(tcp, , , accept__established); SDT_PROBE_DECLARE(tcp, , , accept__refused); SDT_PROBE_DECLARE(tcp, , , connect__established); SDT_PROBE_DECLARE(tcp, , , connect__refused); SDT_PROBE_DECLARE(tcp, , , connect__request); SDT_PROBE_DECLARE(tcp, , , receive); SDT_PROBE_DECLARE(tcp, , , send); SDT_PROBE_DECLARE(tcp, , , siftr); SDT_PROBE_DECLARE(tcp, , , state__change); SDT_PROBE_DECLARE(tcp, , , debug__input); SDT_PROBE_DECLARE(tcp, , , debug__output); SDT_PROBE_DECLARE(tcp, , , debug__user); SDT_PROBE_DECLARE(tcp, , , debug__drop); SDT_PROBE_DECLARE(tcp, , , receive__autoresize); SDT_PROBE_DECLARE(udp, , , receive); SDT_PROBE_DECLARE(udp, , , send); SDT_PROBE_DECLARE(udplite, , , receive); SDT_PROBE_DECLARE(udplite, , , send); /* * These constants originate from the 4.4BSD sys/protosw.h. They lost * their initial purpose in 2c37256e5a59, when single pr_usrreq method * was split into multiple methods. However, they were used by TCPDEBUG, * a feature barely used, but it kept them in the tree for many years. * In 5d06879adb95 DTrace probes started to use them. Note that they * are not documented in dtrace_tcp(4), so they are likely to be * eventually renamed to something better and extended/trimmed. 
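 * As a usage sketch: a consumer that defines PRUREQUESTS before including
 * this header also gets the prurequests[] name table below, so e.g.
 * prurequests[PRU_SHUTDOWN] yields "SHUTDOWN" when pretty-printing the
 * int request code carried by the tcp:::debug-user probe.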
*/ #define PRU_ATTACH 0 /* attach protocol to up */ #define PRU_DETACH 1 /* detach protocol from up */ #define PRU_BIND 2 /* bind socket to address */ #define PRU_LISTEN 3 /* listen for connection */ #define PRU_CONNECT 4 /* establish connection to peer */ #define PRU_ACCEPT 5 /* accept connection from peer */ #define PRU_DISCONNECT 6 /* disconnect from peer */ #define PRU_SHUTDOWN 7 /* won't send any more data */ #define PRU_RCVD 8 /* have taken data; more room now */ #define PRU_SEND 9 /* send this data */ #define PRU_ABORT 10 /* abort (fast DISCONNECT, DETACH) */ #define PRU_CONTROL 11 /* control operations on protocol */ #define PRU_SENSE 12 /* return status into m */ #define PRU_RCVOOB 13 /* retrieve out of band data */ #define PRU_SENDOOB 14 /* send out of band data */ #define PRU_SOCKADDR 15 /* fetch socket's address */ #define PRU_PEERADDR 16 /* fetch peer's address */ #define PRU_CONNECT2 17 /* connect two sockets */ /* begin for protocols internal use */ #define PRU_FASTTIMO 18 /* 200ms timeout */ #define PRU_SLOWTIMO 19 /* 500ms timeout */ #define PRU_PROTORCV 20 /* receive from below */ #define PRU_PROTOSEND 21 /* send to below */ /* end for protocol's internal use */ #define PRU_SEND_EOF 22 /* send and close */ #define PRU_SOSETLABEL 23 /* MAC label change */ #define PRU_CLOSE 24 /* socket close */ #define PRU_FLUSH 25 /* flush the socket */ #define PRU_NREQ 25 #ifdef PRUREQUESTS const char *prurequests[] = { "ATTACH", "DETACH", "BIND", "LISTEN", "CONNECT", "ACCEPT", "DISCONNECT", "SHUTDOWN", "RCVD", "SEND", "ABORT", "CONTROL", "SENSE", "RCVOOB", "SENDOOB", "SOCKADDR", "PEERADDR", "CONNECT2", "FASTTIMO", "SLOWTIMO", "PROTORCV", "PROTOSEND", "SEND_EOF", "SOSETLABEL", "CLOSE", "FLUSH", }; #endif #endif diff --git a/sys/netinet/ip_icmp.c b/sys/netinet/ip_icmp.c index 4dfbd0c525ff..a8ce2b4c1d6c 100644 --- a/sys/netinet/ip_icmp.c +++ b/sys/netinet/ip_icmp.c @@ -1,1205 +1,1205 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
*/ #include #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif /* INET */ extern ipproto_ctlinput_t *ip_ctlprotox[]; /* * ICMP routines: error generation, receive packet processing, and * routines to turnaround packets back to the originator, and * host table maintenance routines. */ static int sysctl_icmplim_and_jitter(SYSCTL_HANDLER_ARGS); VNET_DEFINE_STATIC(u_int, icmplim) = 200; #define V_icmplim VNET(icmplim) SYSCTL_PROC(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLTYPE_UINT | CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmplim), 0, &sysctl_icmplim_and_jitter, "IU", "Maximum number of ICMP responses per second"); VNET_DEFINE_STATIC(int, icmplim_curr_jitter) = 0; #define V_icmplim_curr_jitter VNET(icmplim_curr_jitter) VNET_DEFINE_STATIC(u_int, icmplim_jitter) = 16; #define V_icmplim_jitter VNET(icmplim_jitter) SYSCTL_PROC(_net_inet_icmp, OID_AUTO, icmplim_jitter, CTLTYPE_UINT | CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmplim_jitter), 0, &sysctl_icmplim_and_jitter, "IU", "Random icmplim jitter adjustment limit"); VNET_DEFINE_STATIC(int, icmplim_output) = 1; #define V_icmplim_output VNET(icmplim_output) SYSCTL_INT(_net_inet_icmp, OID_AUTO, icmplim_output, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmplim_output), 0, "Enable logging of ICMP response rate limiting"); #ifdef INET VNET_PCPUSTAT_DEFINE(struct icmpstat, icmpstat); VNET_PCPUSTAT_SYSINIT(icmpstat); SYSCTL_VNET_PCPUSTAT(_net_inet_icmp, ICMPCTL_STATS, stats, struct icmpstat, icmpstat, "ICMP statistics (struct icmpstat, netinet/icmp_var.h)"); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(icmpstat); #endif /* VIMAGE */ VNET_DEFINE_STATIC(int, icmpmaskrepl) = 0; #define V_icmpmaskrepl VNET(icmpmaskrepl) SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmpmaskrepl), 0, "Reply to ICMP Address Mask Request packets"); VNET_DEFINE_STATIC(u_int, icmpmaskfake) = 0; #define V_icmpmaskfake VNET(icmpmaskfake) SYSCTL_UINT(_net_inet_icmp, OID_AUTO, maskfake, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmpmaskfake), 0, "Fake reply to ICMP Address Mask Request packets"); VNET_DEFINE(int, drop_redirect) = 0; #define V_drop_redirect VNET(drop_redirect) SYSCTL_INT(_net_inet_icmp, OID_AUTO, drop_redirect, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(drop_redirect), 0, "Ignore ICMP redirects"); VNET_DEFINE_STATIC(int, log_redirect) = 0; #define V_log_redirect VNET(log_redirect) SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(log_redirect), 0, "Log ICMP redirects to the console"); VNET_DEFINE_STATIC(int, redirtimeout) = 60 * 10; /* 10 minutes */ #define V_redirtimeout VNET(redirtimeout) SYSCTL_INT(_net_inet_icmp, OID_AUTO, redirtimeout, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(redirtimeout), 0, "Delay in seconds before expiring redirect route"); VNET_DEFINE_STATIC(char, reply_src[IFNAMSIZ]); #define V_reply_src VNET(reply_src) SYSCTL_STRING(_net_inet_icmp, OID_AUTO, reply_src, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(reply_src), IFNAMSIZ, "ICMP reply source for non-local packets"); VNET_DEFINE_STATIC(int, icmp_rfi) = 0; #define V_icmp_rfi VNET(icmp_rfi) SYSCTL_INT(_net_inet_icmp, OID_AUTO, reply_from_interface, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp_rfi), 0, "ICMP reply from incoming interface for non-local packets"); /* 
Router requirements RFC 1812 section 4.3.2.3 requires 576 - 28. */ VNET_DEFINE_STATIC(int, icmp_quotelen) = 548; #define V_icmp_quotelen VNET(icmp_quotelen) SYSCTL_INT(_net_inet_icmp, OID_AUTO, quotelen, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp_quotelen), 0, "Number of bytes from original packet to quote in ICMP reply"); VNET_DEFINE_STATIC(int, icmpbmcastecho) = 0; #define V_icmpbmcastecho VNET(icmpbmcastecho) SYSCTL_INT(_net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmpbmcastecho), 0, "Reply to multicast ICMP Echo Request and Timestamp packets"); VNET_DEFINE_STATIC(int, icmptstamprepl) = 1; #define V_icmptstamprepl VNET(icmptstamprepl) SYSCTL_INT(_net_inet_icmp, OID_AUTO, tstamprepl, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmptstamprepl), 0, "Respond to ICMP Timestamp packets"); VNET_DEFINE_STATIC(int, error_keeptags) = 0; #define V_error_keeptags VNET(error_keeptags) SYSCTL_INT(_net_inet_icmp, OID_AUTO, error_keeptags, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(error_keeptags), 0, "ICMP error response keeps copy of mbuf_tags of original packet"); #ifdef ICMPPRINTFS int icmpprintfs = 0; #endif static void icmp_reflect(struct mbuf *); static void icmp_send(struct mbuf *, struct mbuf *); static int icmp_verify_redirect_gateway(struct sockaddr_in *, struct sockaddr_in *, struct sockaddr_in *, u_int); /* * Kernel module interface for updating icmpstat. The argument is an index * into icmpstat treated as an array of u_long. While this encodes the * general layout of icmpstat into the caller, it doesn't encode its * location, so that future changes to add, for example, per-CPU stats * support won't cause binary compatibility problems for kernel modules. */ void kmod_icmpstat_inc(int statnum) { counter_u64_add(VNET(icmpstat)[statnum], 1); } /* * Generate an error packet of type error * in response to bad packet ip. */ void icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu) { struct ip *oip, *nip; struct icmp *icp; struct mbuf *m; unsigned icmplen, icmpelen, nlen, oiphlen; KASSERT((u_int)type <= ICMP_MAXTYPE, ("%s: illegal ICMP type", __func__)); if (type != ICMP_REDIRECT) ICMPSTAT_INC(icps_error); /* * Don't send error: * if the original packet was encrypted. * if not the first fragment of message. * in response to a multicast or broadcast packet. * if the old packet protocol was an ICMP error message. */ if (n->m_flags & M_DECRYPTED) goto freeit; if (n->m_flags & (M_BCAST|M_MCAST)) goto freeit; /* Drop if IP header plus 8 bytes is not contiguous in first mbuf. */ if (n->m_len < sizeof(struct ip) + ICMP_MINLEN) goto freeit; oip = mtod(n, struct ip *); oiphlen = oip->ip_hl << 2; if (n->m_len < oiphlen + ICMP_MINLEN) goto freeit; #ifdef ICMPPRINTFS if (icmpprintfs) printf("icmp_error(%p, %x, %d)\n", oip, type, code); #endif if (oip->ip_off & htons(~(IP_MF|IP_DF))) goto freeit; if (oip->ip_p == IPPROTO_ICMP && type != ICMP_REDIRECT && !ICMP_INFOTYPE(((struct icmp *)((caddr_t)oip + oiphlen))->icmp_type)) { ICMPSTAT_INC(icps_oldicmp); goto freeit; } /* * Calculate length to quote from original packet and * prevent the ICMP mbuf from overflowing. * Unfortunately this is non-trivial since ip_forward() * sends us truncated packets. 
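 * As a worked example (illustrative numbers only): with the default
 * net.inet.icmp.quotelen of 548, a quoted TCP packet ends up with
 *
 *	icmpelen = max(tcphlen, min(548, ntohs(oip->ip_len) - oiphlen));
 *
 * i.e. at most 548 bytes beyond the inner IP header are quoted, but
 * never less than the full TCP header.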
*/ nlen = m_length(n, NULL); if (oip->ip_p == IPPROTO_TCP) { struct tcphdr *th; int tcphlen; if (oiphlen + sizeof(struct tcphdr) > n->m_len && n->m_next == NULL) goto stdreply; if (n->m_len < oiphlen + sizeof(struct tcphdr) && (n = m_pullup(n, oiphlen + sizeof(struct tcphdr))) == NULL) goto freeit; oip = mtod(n, struct ip *); th = mtodo(n, oiphlen); tcphlen = th->th_off << 2; if (tcphlen < sizeof(struct tcphdr)) goto freeit; if (ntohs(oip->ip_len) < oiphlen + tcphlen) goto freeit; if (oiphlen + tcphlen > n->m_len && n->m_next == NULL) goto stdreply; if (n->m_len < oiphlen + tcphlen && (n = m_pullup(n, oiphlen + tcphlen)) == NULL) goto freeit; oip = mtod(n, struct ip *); icmpelen = max(tcphlen, min(V_icmp_quotelen, ntohs(oip->ip_len) - oiphlen)); } else if (oip->ip_p == IPPROTO_SCTP) { struct sctphdr *sh; struct sctp_chunkhdr *ch; if (ntohs(oip->ip_len) < oiphlen + sizeof(struct sctphdr)) goto stdreply; if (oiphlen + sizeof(struct sctphdr) > n->m_len && n->m_next == NULL) goto stdreply; if (n->m_len < oiphlen + sizeof(struct sctphdr) && (n = m_pullup(n, oiphlen + sizeof(struct sctphdr))) == NULL) goto freeit; oip = mtod(n, struct ip *); icmpelen = max(sizeof(struct sctphdr), min(V_icmp_quotelen, ntohs(oip->ip_len) - oiphlen)); sh = mtodo(n, oiphlen); if (ntohl(sh->v_tag) == 0 && ntohs(oip->ip_len) >= oiphlen + sizeof(struct sctphdr) + 8 && (n->m_len >= oiphlen + sizeof(struct sctphdr) + 8 || n->m_next != NULL)) { if (n->m_len < oiphlen + sizeof(struct sctphdr) + 8 && (n = m_pullup(n, oiphlen + sizeof(struct sctphdr) + 8)) == NULL) goto freeit; oip = mtod(n, struct ip *); sh = mtodo(n, oiphlen); ch = (struct sctp_chunkhdr *)(sh + 1); if (ch->chunk_type == SCTP_INITIATION) { icmpelen = max(sizeof(struct sctphdr) + 8, min(V_icmp_quotelen, ntohs(oip->ip_len) - oiphlen)); } } } else stdreply: icmpelen = max(8, min(V_icmp_quotelen, ntohs(oip->ip_len) - oiphlen)); icmplen = min(oiphlen + icmpelen, nlen); if (icmplen < sizeof(struct ip)) goto freeit; if (MHLEN > sizeof(struct ip) + ICMP_MINLEN + icmplen) m = m_gethdr(M_NOWAIT, MT_DATA); else m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) goto freeit; #ifdef MAC mac_netinet_icmp_reply(n, m); #endif icmplen = min(icmplen, M_TRAILINGSPACE(m) - sizeof(struct ip) - ICMP_MINLEN); m_align(m, sizeof(struct ip) + ICMP_MINLEN + icmplen); m->m_data += sizeof(struct ip); m->m_len = ICMP_MINLEN + icmplen; /* XXX MRT make the outgoing packet use the same FIB * that was associated with the incoming packet */ M_SETFIB(m, M_GETFIB(n)); icp = mtod(m, struct icmp *); - ICMPSTAT_INC(icps_outhist[type]); + ICMPSTAT_INC2(icps_outhist, type); icp->icmp_type = type; if (type == ICMP_REDIRECT) icp->icmp_gwaddr.s_addr = dest; else { icp->icmp_void = 0; /* * The following assignments assume an overlay with the * just zeroed icmp_void field. */ if (type == ICMP_PARAMPROB) { icp->icmp_pptr = code; code = 0; } else if (type == ICMP_UNREACH && code == ICMP_UNREACH_NEEDFRAG && mtu) { icp->icmp_nextmtu = htons(mtu); } } icp->icmp_code = code; /* * Copy the quotation into ICMP message and * convert quoted IP header back to network representation. */ m_copydata(n, 0, icmplen, (caddr_t)&icp->icmp_ip); nip = &icp->icmp_ip; /* * Set up ICMP message mbuf and copy old IP header (without options) * in front of ICMP message. * If the original mbuf was meant to bypass the firewall, the error * reply should bypass as well.
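 * (That is, M_SKIP_FIREWALL is copied from the offending packet's mbuf
 * to the reply below, so an error generated for a packet the firewall
 * already passed is not filtered a second time.)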
*/ m->m_flags |= n->m_flags & M_SKIP_FIREWALL; KASSERT(M_LEADINGSPACE(m) >= sizeof(struct ip), ("insufficient space for ip header")); m->m_data -= sizeof(struct ip); m->m_len += sizeof(struct ip); m->m_pkthdr.len = m->m_len; m->m_pkthdr.rcvif = n->m_pkthdr.rcvif; nip = mtod(m, struct ip *); bcopy((caddr_t)oip, (caddr_t)nip, sizeof(struct ip)); nip->ip_len = htons(m->m_len); nip->ip_v = IPVERSION; nip->ip_hl = 5; nip->ip_p = IPPROTO_ICMP; nip->ip_tos = 0; nip->ip_off = 0; if (V_error_keeptags) m_tag_copy_chain(m, n, M_NOWAIT); icmp_reflect(m); freeit: m_freem(n); } int icmp_errmap(const struct icmp *icp) { switch (icp->icmp_type) { case ICMP_UNREACH: switch (icp->icmp_code) { case ICMP_UNREACH_NET: case ICMP_UNREACH_HOST: case ICMP_UNREACH_SRCFAIL: case ICMP_UNREACH_NET_UNKNOWN: case ICMP_UNREACH_HOST_UNKNOWN: case ICMP_UNREACH_ISOLATED: case ICMP_UNREACH_TOSNET: case ICMP_UNREACH_TOSHOST: case ICMP_UNREACH_HOST_PRECEDENCE: case ICMP_UNREACH_PRECEDENCE_CUTOFF: return (EHOSTUNREACH); case ICMP_UNREACH_NEEDFRAG: return (EMSGSIZE); case ICMP_UNREACH_PROTOCOL: case ICMP_UNREACH_PORT: case ICMP_UNREACH_NET_PROHIB: case ICMP_UNREACH_HOST_PROHIB: case ICMP_UNREACH_FILTER_PROHIB: return (ECONNREFUSED); default: return (0); } case ICMP_TIMXCEED: switch (icp->icmp_code) { case ICMP_TIMXCEED_INTRANS: return (EHOSTUNREACH); default: return (0); } case ICMP_PARAMPROB: switch (icp->icmp_code) { case ICMP_PARAMPROB_ERRATPTR: case ICMP_PARAMPROB_OPTABSENT: return (ENOPROTOOPT); default: return (0); } default: return (0); } } /* * Process a received ICMP message. */ int icmp_input(struct mbuf **mp, int *offp, int proto) { struct icmp *icp; struct in_ifaddr *ia; struct mbuf *m = *mp; struct ip *ip = mtod(m, struct ip *); struct sockaddr_in icmpsrc, icmpdst, icmpgw; int hlen = *offp; int icmplen = ntohs(ip->ip_len) - *offp; int i, code; int fibnum; NET_EPOCH_ASSERT(); *mp = NULL; /* * Locate icmp structure in mbuf, and check * that not corrupted and of at least minimum length. */ #ifdef ICMPPRINTFS if (icmpprintfs) { char srcbuf[INET_ADDRSTRLEN]; char dstbuf[INET_ADDRSTRLEN]; printf("icmp_input from %s to %s, len %d\n", inet_ntoa_r(ip->ip_src, srcbuf), inet_ntoa_r(ip->ip_dst, dstbuf), icmplen); } #endif if (icmplen < ICMP_MINLEN) { ICMPSTAT_INC(icps_tooshort); goto freeit; } i = hlen + min(icmplen, ICMP_ADVLENMIN); if (m->m_len < i && (m = m_pullup(m, i)) == NULL) { ICMPSTAT_INC(icps_tooshort); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); m->m_len -= hlen; m->m_data += hlen; icp = mtod(m, struct icmp *); if (in_cksum(m, icmplen)) { ICMPSTAT_INC(icps_checksum); goto freeit; } m->m_len += hlen; m->m_data -= hlen; #ifdef ICMPPRINTFS if (icmpprintfs) printf("icmp_input, type %d code %d\n", icp->icmp_type, icp->icmp_code); #endif /* * Message type specific processing. 
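 * For example, when an ICMP_UNREACH with code ICMP_UNREACH_PORT is
 * dispatched below via ip_ctlprotox[], the inner protocol's ctlinput
 * handler can use icmp_errmap() above to translate it to ECONNREFUSED.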
*/ if (icp->icmp_type > ICMP_MAXTYPE) goto raw; /* Initialize */ bzero(&icmpsrc, sizeof(icmpsrc)); icmpsrc.sin_len = sizeof(struct sockaddr_in); icmpsrc.sin_family = AF_INET; bzero(&icmpdst, sizeof(icmpdst)); icmpdst.sin_len = sizeof(struct sockaddr_in); icmpdst.sin_family = AF_INET; bzero(&icmpgw, sizeof(icmpgw)); icmpgw.sin_len = sizeof(struct sockaddr_in); icmpgw.sin_family = AF_INET; - ICMPSTAT_INC(icps_inhist[icp->icmp_type]); + ICMPSTAT_INC2(icps_inhist, icp->icmp_type); code = icp->icmp_code; switch (icp->icmp_type) { case ICMP_UNREACH: if (code > ICMP_UNREACH_PRECEDENCE_CUTOFF) goto badcode; else goto deliver; case ICMP_TIMXCEED: if (code > ICMP_TIMXCEED_REASS) goto badcode; else goto deliver; case ICMP_PARAMPROB: if (code > ICMP_PARAMPROB_LENGTH) goto badcode; deliver: /* * Problem with datagram; advise higher level routines. */ if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) { ICMPSTAT_INC(icps_badlen); goto freeit; } /* Discard ICMP's in response to multicast packets */ if (IN_MULTICAST(ntohl(icp->icmp_ip.ip_dst.s_addr))) goto badcode; /* Filter out responses to INADDR_ANY, protocols ignore it. */ if (icp->icmp_ip.ip_dst.s_addr == INADDR_ANY || icp->icmp_ip.ip_src.s_addr == INADDR_ANY) goto freeit; #ifdef ICMPPRINTFS if (icmpprintfs) printf("deliver to protocol %d\n", icp->icmp_ip.ip_p); #endif /* * XXX if the packet contains [IPv4 AH TCP], we can't make a * notification to TCP layer. */ i = sizeof(struct ip) + min(icmplen, ICMP_ADVLENPREF(icp)); ip_stripoptions(m); if (m->m_len < i && (m = m_pullup(m, i)) == NULL) { /* This should actually not happen */ ICMPSTAT_INC(icps_tooshort); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); icp = (struct icmp *)(ip + 1); /* * The upper layer handler can rely on: * - The outer IP header has no options. * - The outer IP header, the ICMP header, the inner IP header, * and the first n bytes of the inner payload are contiguous. * n is at least 8, but might be larger based on * ICMP_ADVLENPREF. See its definition in ip_icmp.h. */ if (ip_ctlprotox[icp->icmp_ip.ip_p] != NULL) ip_ctlprotox[icp->icmp_ip.ip_p](icp); break; badcode: ICMPSTAT_INC(icps_badcode); break; case ICMP_ECHO: if (!V_icmpbmcastecho && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { ICMPSTAT_INC(icps_bmcastecho); break; } if (badport_bandlim(BANDLIM_ICMP_ECHO) < 0) goto freeit; icp->icmp_type = ICMP_ECHOREPLY; goto reflect; case ICMP_TSTAMP: if (V_icmptstamprepl == 0) break; if (!V_icmpbmcastecho && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { ICMPSTAT_INC(icps_bmcasttstamp); break; } if (icmplen < ICMP_TSLEN) { ICMPSTAT_INC(icps_badlen); break; } if (badport_bandlim(BANDLIM_ICMP_TSTAMP) < 0) goto freeit; icp->icmp_type = ICMP_TSTAMPREPLY; icp->icmp_rtime = iptime(); icp->icmp_ttime = icp->icmp_rtime; /* bogus, do later! */ goto reflect; case ICMP_MASKREQ: if (V_icmpmaskrepl == 0) break; /* * We are not able to respond with all ones broadcast * unless we receive it over a point-to-point interface. 
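 * (Hence the switch below: a request whose destination is
 * INADDR_BROADCAST or INADDR_ANY is keyed off ip_src instead, so that
 * the reply mask is looked up on the interface the request arrived on.)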
*/ if (icmplen < ICMP_MASKLEN) break; switch (ip->ip_dst.s_addr) { case INADDR_BROADCAST: case INADDR_ANY: icmpdst.sin_addr = ip->ip_src; break; default: icmpdst.sin_addr = ip->ip_dst; } ia = (struct in_ifaddr *)ifaof_ifpforaddr( (struct sockaddr *)&icmpdst, m->m_pkthdr.rcvif); if (ia == NULL) break; if (ia->ia_ifp == NULL) break; icp->icmp_type = ICMP_MASKREPLY; if (V_icmpmaskfake == 0) icp->icmp_mask = ia->ia_sockmask.sin_addr.s_addr; else icp->icmp_mask = V_icmpmaskfake; if (ip->ip_src.s_addr == 0) { if (ia->ia_ifp->if_flags & IFF_BROADCAST) ip->ip_src = satosin(&ia->ia_broadaddr)->sin_addr; else if (ia->ia_ifp->if_flags & IFF_POINTOPOINT) ip->ip_src = satosin(&ia->ia_dstaddr)->sin_addr; } reflect: ICMPSTAT_INC(icps_reflect); - ICMPSTAT_INC(icps_outhist[icp->icmp_type]); + ICMPSTAT_INC2(icps_outhist, icp->icmp_type); icmp_reflect(m); return (IPPROTO_DONE); case ICMP_REDIRECT: if (V_log_redirect) { u_long src, dst, gw; src = ntohl(ip->ip_src.s_addr); dst = ntohl(icp->icmp_ip.ip_dst.s_addr); gw = ntohl(icp->icmp_gwaddr.s_addr); printf("icmp redirect from %d.%d.%d.%d: " "%d.%d.%d.%d => %d.%d.%d.%d\n", (int)(src >> 24), (int)((src >> 16) & 0xff), (int)((src >> 8) & 0xff), (int)(src & 0xff), (int)(dst >> 24), (int)((dst >> 16) & 0xff), (int)((dst >> 8) & 0xff), (int)(dst & 0xff), (int)(gw >> 24), (int)((gw >> 16) & 0xff), (int)((gw >> 8) & 0xff), (int)(gw & 0xff)); } /* * RFC1812 says we must ignore ICMP redirects if we * are acting as router. */ if (V_drop_redirect || V_ipforwarding) break; if (code > 3) goto badcode; if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) { ICMPSTAT_INC(icps_badlen); break; } /* * Short circuit routing redirects to force * immediate change in the kernel's routing * tables. The message is also handed to anyone * listening on a raw socket (e.g. the routing * daemon for use in updating its tables). */ icmpgw.sin_addr = ip->ip_src; icmpdst.sin_addr = icp->icmp_gwaddr; #ifdef ICMPPRINTFS if (icmpprintfs) { char dstbuf[INET_ADDRSTRLEN]; char gwbuf[INET_ADDRSTRLEN]; printf("redirect dst %s to %s\n", inet_ntoa_r(icp->icmp_ip.ip_dst, dstbuf), inet_ntoa_r(icp->icmp_gwaddr, gwbuf)); } #endif icmpsrc.sin_addr = icp->icmp_ip.ip_dst; /* * RFC 1122 says network (code 0,2) redirects SHOULD * be treated identically to the host redirects. * Given that, ignore network masks. */ /* * Variable values: * icmpsrc: route destination * icmpdst: route gateway * icmpgw: message source */ if (icmp_verify_redirect_gateway(&icmpgw, &icmpsrc, &icmpdst, M_GETFIB(m)) != 0) { /* TODO: increment bad redirects here */ break; } for ( fibnum = 0; fibnum < rt_numfibs; fibnum++) { rib_add_redirect(fibnum, (struct sockaddr *)&icmpsrc, (struct sockaddr *)&icmpdst, (struct sockaddr *)&icmpgw, m->m_pkthdr.rcvif, RTF_GATEWAY, V_redirtimeout); } break; /* * No kernel processing for the following; * just fall through to send to raw listener. 
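 * (So e.g. an ICMP_ECHOREPLY is still delivered to any raw ICMP socket,
 * such as the one ping(8) listens on, via rip_input() at the "raw"
 * label below.)
 */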
*/ case ICMP_ECHOREPLY: case ICMP_ROUTERADVERT: case ICMP_ROUTERSOLICIT: case ICMP_TSTAMPREPLY: case ICMP_IREQREPLY: case ICMP_MASKREPLY: case ICMP_SOURCEQUENCH: default: break; } raw: *mp = m; rip_input(mp, offp, proto); return (IPPROTO_DONE); freeit: m_freem(m); return (IPPROTO_DONE); } /* * Reflect the ip packet back to the source */ static void icmp_reflect(struct mbuf *m) { struct ip *ip = mtod(m, struct ip *); struct ifaddr *ifa; struct ifnet *ifp; struct in_ifaddr *ia; struct in_addr t; struct nhop_object *nh; struct mbuf *opts = NULL; int optlen = (ip->ip_hl << 2) - sizeof(struct ip); NET_EPOCH_ASSERT(); if (IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || (IN_EXPERIMENTAL(ntohl(ip->ip_src.s_addr)) && !V_ip_allow_net240) || (IN_ZERONET(ntohl(ip->ip_src.s_addr)) && !V_ip_allow_net0) ) { m_freem(m); /* Bad return address */ ICMPSTAT_INC(icps_badaddr); goto done; /* Ip_output() will check for broadcast */ } t = ip->ip_dst; ip->ip_dst = ip->ip_src; /* * Source selection for ICMP replies: * * If the incoming packet was addressed directly to one of our * own addresses, use dst as the src for the reply. */ CK_LIST_FOREACH(ia, INADDR_HASH(t.s_addr), ia_hash) { if (t.s_addr == IA_SIN(ia)->sin_addr.s_addr) { t = IA_SIN(ia)->sin_addr; goto match; } } /* * If the incoming packet was addressed to one of our broadcast * addresses, use the first non-broadcast address which corresponds * to the incoming interface. */ ifp = m->m_pkthdr.rcvif; if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) { CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == t.s_addr) { t = IA_SIN(ia)->sin_addr; goto match; } } } /* * If the packet was transiting through us, use the address of * the interface the packet came through in. If that interface * doesn't have a suitable IP address, the normal selection * criteria apply. */ if (V_icmp_rfi && ifp != NULL) { CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); t = IA_SIN(ia)->sin_addr; goto match; } } /* * If the incoming packet was not addressed directly to us, use * designated interface for icmp replies specified by sysctl * net.inet.icmp.reply_src (default not set). Otherwise continue * with normal source selection. */ if (V_reply_src[0] != '\0' && (ifp = ifunit(V_reply_src))) { CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); t = IA_SIN(ia)->sin_addr; goto match; } } /* * If the packet was transiting through us, use the address of * the interface that is the closest to the packet source. * When we don't have a route back to the packet source, stop here * and drop the packet. */ nh = fib4_lookup(M_GETFIB(m), ip->ip_dst, 0, NHR_NONE, 0); if (nh == NULL) { m_freem(m); ICMPSTAT_INC(icps_noroute); goto done; } t = IA_SIN(ifatoia(nh->nh_ifa))->sin_addr; match: #ifdef MAC mac_netinet_icmp_replyinplace(m); #endif ip->ip_src = t; ip->ip_ttl = V_ip_defttl; if (optlen > 0) { u_char *cp; int opt, cnt; u_int len; /* * Retrieve any source routing from the incoming packet; * add on any record-route or timestamp options. 
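 * (Only IPOPT_RR, IPOPT_TS and IPOPT_SECURITY options from the original
 * packet are copied into the reply below, appended after any reversed
 * source route returned by ip_srcroute().)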
*/ cp = (u_char *) (ip + 1); if ((opts = ip_srcroute(m)) == NULL && (opts = m_gethdr(M_NOWAIT, MT_DATA))) { opts->m_len = sizeof(struct in_addr); mtod(opts, struct in_addr *)->s_addr = 0; } if (opts) { #ifdef ICMPPRINTFS if (icmpprintfs) printf("icmp_reflect optlen %d rt %d => ", optlen, opts->m_len); #endif for (cnt = optlen; cnt > 0; cnt -= len, cp += len) { opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) len = 1; else { if (cnt < IPOPT_OLEN + sizeof(*cp)) break; len = cp[IPOPT_OLEN]; if (len < IPOPT_OLEN + sizeof(*cp) || len > cnt) break; } /* * Should check for overflow, but it "can't happen" */ if (opt == IPOPT_RR || opt == IPOPT_TS || opt == IPOPT_SECURITY) { bcopy((caddr_t)cp, mtod(opts, caddr_t) + opts->m_len, len); opts->m_len += len; } } /* Terminate & pad, if necessary */ cnt = opts->m_len % 4; if (cnt) { for (; cnt < 4; cnt++) { *(mtod(opts, caddr_t) + opts->m_len) = IPOPT_EOL; opts->m_len++; } } #ifdef ICMPPRINTFS if (icmpprintfs) printf("%d\n", opts->m_len); #endif } ip_stripoptions(m); } m_tag_delete_nonpersistent(m); m->m_flags &= ~(M_BCAST|M_MCAST); icmp_send(m, opts); done: if (opts) (void)m_free(opts); } /* * Verifies if redirect message is valid, according to RFC 1122 * * @src: sockaddr with address of redirect originator * @dst: sockaddr with destination in question * @gateway: new proposed gateway * * Returns 0 on success. */ static int icmp_verify_redirect_gateway(struct sockaddr_in *src, struct sockaddr_in *dst, struct sockaddr_in *gateway, u_int fibnum) { struct nhop_object *nh; struct ifaddr *ifa; NET_EPOCH_ASSERT(); /* Verify the gateway is directly reachable. */ if ((ifa = ifa_ifwithnet((struct sockaddr *)gateway, 0, fibnum))==NULL) return (ENETUNREACH); /* TODO: fib-aware. */ if (ifa_ifwithaddr_check((struct sockaddr *)gateway)) return (EHOSTUNREACH); nh = fib4_lookup(fibnum, dst->sin_addr, 0, NHR_NONE, 0); if (nh == NULL) return (EINVAL); /* * If the redirect isn't from our current router for this dst, * it's either old or wrong. If it redirects us to ourselves, * we have a routing loop, perhaps as a result of an interface * going down recently. */ if (!sa_equal((struct sockaddr *)src, &nh->gw_sa)) return (EINVAL); if (nh->nh_ifa != ifa && ifa->ifa_addr->sa_family != AF_LINK) return (EINVAL); /* If host route already exists, ignore redirect. */ if (nh->nh_flags & NHF_HOST) return (EEXIST); /* If the prefix is directly reachable, ignore redirect. */ if (!(nh->nh_flags & NHF_GATEWAY)) return (EEXIST); return (0); } /* * Send an icmp packet back to the ip level, * after supplying a checksum. */ static void icmp_send(struct mbuf *m, struct mbuf *opts) { struct ip *ip = mtod(m, struct ip *); int hlen; struct icmp *icp; hlen = ip->ip_hl << 2; m->m_data += hlen; m->m_len -= hlen; icp = mtod(m, struct icmp *); icp->icmp_cksum = 0; icp->icmp_cksum = in_cksum(m, ntohs(ip->ip_len) - hlen); m->m_data -= hlen; m->m_len += hlen; m->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef ICMPPRINTFS if (icmpprintfs) { char dstbuf[INET_ADDRSTRLEN]; char srcbuf[INET_ADDRSTRLEN]; printf("icmp_send dst %s src %s\n", inet_ntoa_r(ip->ip_dst, dstbuf), inet_ntoa_r(ip->ip_src, srcbuf)); } #endif (void) ip_output(m, opts, NULL, 0, NULL, NULL); } /* * Return milliseconds since 00:00 UTC in network format. */ uint32_t iptime(void) { struct timeval atv; u_long t; getmicrotime(&atv); t = (atv.tv_sec % (24*60*60)) * 1000 + atv.tv_usec / 1000; return (htonl(t)); } /* * Return the next larger or smaller MTU plateau (table from RFC 1191) * given current value MTU. 
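The option-copy loop above is the standard IP options walk. The same traversal rules restated as a small self-contained userland parser (illustrative only; the kernel additionally copies RR/TS/SECURITY options into the reply and pads the result to a 4-byte boundary):

#include <stdio.h>

#define IPOPT_EOL	0	/* end of option list */
#define IPOPT_NOP	1	/* no-operation, one byte */
#define IPOPT_OPTVAL	0	/* offset of option type octet */
#define IPOPT_OLEN	1	/* offset of option length octet */

static int
walk_options(const unsigned char *cp, int cnt)
{
	unsigned int len;

	for (; cnt > 0; cnt -= len, cp += len) {
		int opt = cp[IPOPT_OPTVAL];

		if (opt == IPOPT_EOL)
			break;
		if (opt == IPOPT_NOP)
			len = 1;
		else {
			if (cnt < IPOPT_OLEN + 1)
				return (-1);	/* no room for length octet */
			len = cp[IPOPT_OLEN];
			if (len < 2 || (int)len > cnt)
				return (-1);	/* malformed length */
		}
		printf("option %d, length %u\n", opt, len);
	}
	return (0);
}

int
main(void)
{
	/* NOP, then a minimal 3-byte record-route (type 7), then EOL. */
	unsigned char opts[] = { IPOPT_NOP, 7, 3, 4, IPOPT_EOL };

	return (walk_options(opts, (int)sizeof(opts)) != 0);
}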
If DIR is less than zero, a larger plateau * is returned; otherwise, a smaller value is returned. */ int ip_next_mtu(int mtu, int dir) { static int mtutab[] = { 65535, 32000, 17914, 8166, 4352, 2002, 1492, 1280, 1006, 508, 296, 68, 0 }; int i, size; size = (sizeof mtutab) / (sizeof mtutab[0]); if (dir >= 0) { for (i = 0; i < size; i++) if (mtu > mtutab[i]) return mtutab[i]; } else { for (i = size - 1; i >= 0; i--) if (mtu < mtutab[i]) return mtutab[i]; if (mtu == mtutab[0]) return mtutab[0]; } return 0; } #endif /* INET */ /* * badport_bandlim() - check for ICMP bandwidth limit * * Return 0 if it is ok to send an ICMP error response, -1 if we have * hit our bandwidth limit and it is not ok. * * If icmplim is <= 0, the feature is disabled and 0 is returned. * * For now we separate the TCP and UDP subsystems w/ different 'which' * values. We may eventually remove this separation (and simplify the * code further). * * Note that the printing of the error message is delayed so we can * properly print the icmp error rate that the system was trying to do * (i.e. 22000/100 pps, etc...). This can cause long delays in printing * the 'final' error, but it doesn't make sense to solve the printing * delay with more complex code. */ VNET_DEFINE_STATIC(struct counter_rate, icmp_rates[BANDLIM_MAX]); #define V_icmp_rates VNET(icmp_rates) static const char *icmp_rate_descrs[BANDLIM_MAX] = { [BANDLIM_ICMP_UNREACH] = "icmp unreach", [BANDLIM_ICMP_ECHO] = "icmp ping", [BANDLIM_ICMP_TSTAMP] = "icmp tstamp", [BANDLIM_RST_CLOSEDPORT] = "closed port RST", [BANDLIM_RST_OPENPORT] = "open port RST", [BANDLIM_ICMP6_UNREACH] = "icmp6 unreach", [BANDLIM_SCTP_OOTB] = "sctp ootb", }; static void icmplim_new_jitter(void) { /* * Adjust limit +/- to jitter the measurement to deny a side-channel * port scan as in https://dl.acm.org/doi/10.1145/3372297.3417280 */ if (V_icmplim_jitter > 0) V_icmplim_curr_jitter = arc4random_uniform(V_icmplim_jitter * 2 + 1) - V_icmplim_jitter; } static int sysctl_icmplim_and_jitter(SYSCTL_HANDLER_ARGS) { uint32_t new; int error; bool lim; MPASS(oidp->oid_arg1 == &VNET_NAME(icmplim) || oidp->oid_arg1 == &VNET_NAME(icmplim_jitter)); lim = (oidp->oid_arg1 == &VNET_NAME(icmplim)); new = lim ? 
V_icmplim : V_icmplim_jitter; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if (lim) { if (new != 0 && new <= V_icmplim_jitter) error = EINVAL; else V_icmplim = new; } else { if (new >= V_icmplim) error = EINVAL; else { V_icmplim_jitter = new; icmplim_new_jitter(); } } } MPASS(V_icmplim + V_icmplim_curr_jitter > 0); return (error); } static void icmp_bandlimit_init(void) { for (int i = 0; i < BANDLIM_MAX; i++) { V_icmp_rates[i].cr_rate = counter_u64_alloc(M_WAITOK); V_icmp_rates[i].cr_ticks = ticks; } icmplim_new_jitter(); } VNET_SYSINIT(icmp_bandlimit, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, icmp_bandlimit_init, NULL); #ifdef VIMAGE static void icmp_bandlimit_uninit(void) { for (int i = 0; i < BANDLIM_MAX; i++) counter_u64_free(V_icmp_rates[i].cr_rate); } VNET_SYSUNINIT(icmp_bandlimit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, icmp_bandlimit_uninit, NULL); #endif int badport_bandlim(int which) { int64_t pps; if (V_icmplim == 0 || which == BANDLIM_UNLIMITED) return (0); KASSERT(which >= 0 && which < BANDLIM_MAX, ("%s: which %d", __func__, which)); pps = counter_ratecheck(&V_icmp_rates[which], V_icmplim + V_icmplim_curr_jitter); if (pps > 0) { if (V_icmplim_output) log(LOG_NOTICE, "Limiting %s response from %jd to %d packets/sec\n", icmp_rate_descrs[which], (intmax_t )pps, V_icmplim + V_icmplim_curr_jitter); icmplim_new_jitter(); } if (pps == -1) return (-1); return (0); } diff --git a/sys/netinet/ip_var.h b/sys/netinet/ip_var.h index 826954c6fb6a..0f2ed8c43e64 100644 --- a/sys/netinet/ip_var.h +++ b/sys/netinet/ip_var.h @@ -1,344 +1,356 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _NETINET_IP_VAR_H_ #define _NETINET_IP_VAR_H_ #include #include #include #include /* * Overlay for ip header used by other protocols (tcp, udp). 
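The jitter machinery above is small but easy to misread: icmplim_new_jitter() draws uniformly from [-V_icmplim_jitter, +V_icmplim_jitter], and the sysctl handler rejects any icmplim <= jitter precisely so the effective budget handed to counter_ratecheck() stays positive. A standalone sketch of the arithmetic (arc4random_uniform(3) is assumed available, as it is on FreeBSD):

#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	int icmplim = 200, jitter = 16;		/* sysctl-style values */

	/* 0..2j, shifted down by j => uniform in [-j, +j]. */
	for (int i = 0; i < 5; i++) {
		int j = (int)arc4random_uniform(2 * jitter + 1) - jitter;

		printf("effective limit: %d pps\n", icmplim + j);
	}
	return (0);
}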
*/ struct ipovly { u_char ih_x1[9]; /* (unused) */ u_char ih_pr; /* protocol */ u_short ih_len; /* protocol length */ struct in_addr ih_src; /* source internet address */ struct in_addr ih_dst; /* destination internet address */ }; #ifdef _KERNEL /* * Ip reassembly queue structure. Each fragment * being reassembled is attached to one of these structures. * They are timed out after net.inet.ip.fragttl seconds, and may also be * reclaimed if memory becomes tight. */ struct ipq { TAILQ_ENTRY(ipq) ipq_list; /* to other reass headers */ time_t ipq_expire; /* time_uptime when ipq expires */ u_char ipq_nfrags; /* # frags in this packet */ u_char ipq_p; /* protocol of this fragment */ u_short ipq_id; /* sequence id for reassembly */ int ipq_maxoff; /* total length of packet */ struct mbuf *ipq_frags; /* to ip headers of fragments */ struct in_addr ipq_src,ipq_dst; struct label *ipq_label; /* MAC label */ }; #endif /* _KERNEL */ /* * Structure stored in mbuf in inpcb.ip_options * and passed to ip_output when ip options are in use. * The actual length of the options (including ipopt_dst) * is in m_len. */ #define MAX_IPOPTLEN 40 struct ipoption { struct in_addr ipopt_dst; /* first-hop dst if source routed */ char ipopt_list[MAX_IPOPTLEN]; /* options proper */ }; #if defined(_NETINET_IN_VAR_H_) && defined(_KERNEL) /* * Structure attached to inpcb.ip_moptions and * passed to ip_output when IP multicast options are in use. * This structure is lazy-allocated. */ struct ip_moptions { struct ifnet *imo_multicast_ifp; /* ifp for outgoing multicasts */ struct in_addr imo_multicast_addr; /* ifindex/addr on MULTICAST_IF */ u_long imo_multicast_vif; /* vif num outgoing multicasts */ u_char imo_multicast_ttl; /* TTL for outgoing multicasts */ u_char imo_multicast_loop; /* 1 => hear sends if a member */ struct ip_mfilter_head imo_head; /* group membership list */ }; #else struct ip_moptions; #endif struct ipstat { uint64_t ips_total; /* total packets received */ uint64_t ips_badsum; /* checksum bad */ uint64_t ips_tooshort; /* packet too short */ uint64_t ips_toosmall; /* not enough data */ uint64_t ips_badhlen; /* ip header length < data size */ uint64_t ips_badlen; /* ip length < ip header length */ uint64_t ips_fragments; /* fragments received */ uint64_t ips_fragdropped; /* frags dropped (dups, out of space) */ uint64_t ips_fragtimeout; /* fragments timed out */ uint64_t ips_forward; /* packets forwarded */ uint64_t ips_fastforward; /* packets fast forwarded */ uint64_t ips_cantforward; /* packets rcvd for unreachable dest */ uint64_t ips_redirectsent; /* packets forwarded on same net */ uint64_t ips_noproto; /* unknown or unsupported protocol */ uint64_t ips_delivered; /* datagrams delivered to upper level*/ uint64_t ips_localout; /* total ip packets generated here */ uint64_t ips_odropped; /* lost packets due to nobufs, etc. */ uint64_t ips_reassembled; /* total packets reassembled ok */ uint64_t ips_fragmented; /* datagrams successfully fragmented */ uint64_t ips_ofragments; /* output fragments created */ uint64_t ips_cantfrag; /* don't fragment flag was set, etc. 
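struct ipovly above exists so TCP and UDP can overlay the IP header and checksum a pseudo-header (protocol, length, source, destination) together with their own headers. A userland sketch of the 16-bit one's-complement sum such a checksum is built on (this is not the kernel's optimized in_cksum()):

#include <stddef.h>
#include <stdint.h>

static uint16_t
cksum16(const void *buf, size_t len)
{
	const uint8_t *p = buf;
	uint32_t sum = 0;

	while (len > 1) {		/* sum 16-bit big-endian words */
		sum += (uint32_t)p[0] << 8 | p[1];
		p += 2;
		len -= 2;
	}
	if (len == 1)			/* odd trailing byte, zero-padded */
		sum += (uint32_t)p[0] << 8;
	while (sum >> 16)		/* fold carries back into low 16 */
		sum = (sum & 0xffff) + (sum >> 16);
	return ((uint16_t)~sum);	/* one's complement of the sum */
}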
*/ uint64_t ips_badoptions; /* error in option processing */ uint64_t ips_noroute; /* packets discarded due to no route */ uint64_t ips_badvers; /* ip version != 4 */ uint64_t ips_rawout; /* total raw ip packets generated */ uint64_t ips_toolong; /* ip length > max ip packet size */ uint64_t ips_notmember; /* multicasts for unregistered grps */ uint64_t ips_nogif; /* no match gif found */ uint64_t ips_badaddr; /* invalid address on header */ }; #ifdef _KERNEL #include #include +#include VNET_PCPUSTAT_DECLARE(struct ipstat, ipstat); /* * In-kernel consumers can use these accessor macros directly to update * stats. */ -#define IPSTAT_ADD(name, val) \ - VNET_PCPUSTAT_ADD(struct ipstat, ipstat, name, (val)) -#define IPSTAT_SUB(name, val) IPSTAT_ADD(name, -(val)) +#define IPSTAT_ADD(name, val) \ + do { \ + MIB_SDT_PROBE1(ip, count, name, (val)); \ + VNET_PCPUSTAT_ADD(struct ipstat, ipstat, name, (val)); \ + } while (0) +#define IPSTAT_SUB(name, val) IPSTAT_ADD(name, -(val)) #define IPSTAT_INC(name) IPSTAT_ADD(name, 1) #define IPSTAT_DEC(name) IPSTAT_SUB(name, 1) /* * Kernel module consumers must use this accessor macro. */ void kmod_ipstat_inc(int statnum); -#define KMOD_IPSTAT_INC(name) \ - kmod_ipstat_inc(offsetof(struct ipstat, name) / sizeof(uint64_t)) -void kmod_ipstat_dec(int statnum); -#define KMOD_IPSTAT_DEC(name) \ - kmod_ipstat_dec(offsetof(struct ipstat, name) / sizeof(uint64_t)) +#define KMOD_IPSTAT_INC(name) \ + do { \ + MIB_SDT_PROBE1(ip, count, name, 1); \ + kmod_ipstat_inc( \ + offsetof(struct ipstat, name) / sizeof(uint64_t)); \ + } while (0) +void kmod_ipstat_dec(int statnum); +#define KMOD_IPSTAT_DEC(name) \ + do { \ + MIB_SDT_PROBE1(ip, count, name, -1); \ + kmod_ipstat_dec( \ + offsetof(struct ipstat, name) / sizeof(uint64_t)); \ + } while (0) /* flags passed to ip_output as last parameter */ #define IP_FORWARDING 0x1 /* most of ip header exists */ #define IP_RAWOUTPUT 0x2 /* raw ip header exists */ #define IP_SENDONES 0x4 /* send all-ones broadcast */ #define IP_SENDTOIF 0x8 /* send on specific ifnet */ #define IP_ROUTETOIF SO_DONTROUTE /* 0x10 bypass routing tables */ #define IP_ALLOWBROADCAST SO_BROADCAST /* 0x20 can send broadcast packets */ #define IP_NODEFAULTFLOWID 0x40 /* Don't set the flowid from inp */ #define IP_NO_SND_TAG_RL 0x80 /* Don't send down the ratelimit tag */ #ifdef __NO_STRICT_ALIGNMENT #define IP_HDR_ALIGNED_P(ip) 1 #else #define IP_HDR_ALIGNED_P(ip) ((((intptr_t) (ip)) & 3) == 0) #endif struct ip; struct inpcb; struct route; struct sockopt; struct inpcbinfo; VNET_DECLARE(int, ip_defttl); /* default IP ttl */ VNET_DECLARE(int, ipforwarding); /* ip forwarding */ VNET_DECLARE(int, ipsendredirects); #ifdef IPSTEALTH VNET_DECLARE(int, ipstealth); /* stealth forwarding */ #endif VNET_DECLARE(struct socket *, ip_rsvpd); /* reservation protocol daemon*/ VNET_DECLARE(struct socket *, ip_mrouter); /* multicast routing daemon */ extern int (*legal_vif_num)(int); extern u_long (*ip_mcast_src)(int); VNET_DECLARE(int, rsvp_on); VNET_DECLARE(int, drop_redirect); #define V_ip_id VNET(ip_id) #define V_ip_defttl VNET(ip_defttl) #define V_ipforwarding VNET(ipforwarding) #define V_ipsendredirects VNET(ipsendredirects) #ifdef IPSTEALTH #define V_ipstealth VNET(ipstealth) #endif #define V_ip_rsvpd VNET(ip_rsvpd) #define V_ip_mrouter VNET(ip_mrouter) #define V_rsvp_on VNET(rsvp_on) #define V_drop_redirect VNET(drop_redirect) void inp_freemoptions(struct ip_moptions *); int inp_getmoptions(struct inpcb *, struct sockopt *); int inp_setmoptions(struct inpcb *, struct 
sockopt *); int ip_ctloutput(struct socket *, struct sockopt *sopt); int ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, u_long if_hwassist_flags); void ip_forward(struct mbuf *m, int srcrt); extern int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, struct ip_moptions *); int ip_output(struct mbuf *, struct mbuf *, struct route *, int, struct ip_moptions *, struct inpcb *); struct mbuf * ip_reass(struct mbuf *); void ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *, struct mbuf *); void ip_fillid(struct ip *); int rip_ctloutput(struct socket *, struct sockopt *); int ipip_input(struct mbuf **, int *, int); int rsvp_input(struct mbuf **, int *, int); int ip_rsvp_init(struct socket *); int ip_rsvp_done(void); extern int (*ip_rsvp_vif)(struct socket *, struct sockopt *); extern void (*ip_rsvp_force_done)(struct socket *); extern int (*rsvp_input_p)(struct mbuf **, int *, int); typedef int ipproto_input_t(struct mbuf **, int *, int); struct icmp; typedef void ipproto_ctlinput_t(struct icmp *); int ipproto_register(uint8_t, ipproto_input_t, ipproto_ctlinput_t); int ipproto_unregister(uint8_t); #define IPPROTO_REGISTER(prot, input, ctl) do { \ int error __diagused; \ error = ipproto_register(prot, input, ctl); \ MPASS(error == 0); \ } while (0) ipproto_input_t rip_input; ipproto_ctlinput_t rip_ctlinput; VNET_DECLARE(struct pfil_head *, inet_pfil_head); #define V_inet_pfil_head VNET(inet_pfil_head) #define PFIL_INET_NAME "inet" VNET_DECLARE(struct pfil_head *, inet_local_pfil_head); #define V_inet_local_pfil_head VNET(inet_local_pfil_head) #define PFIL_INET_LOCAL_NAME "inet-local" void in_delayed_cksum(struct mbuf *m); /* Hooks for ipfw, dummynet, divert etc. Most are declared in raw_ip.c */ /* * Reference to an ipfw or packet filter rule that can be carried * outside critical sections. * A rule is identified by rulenum:rule_id which is ordered. * In version chain_id the rule can be found in slot 'slot', so * we don't need a lookup if chain_id == chain->id. * * On exit from the firewall this structure refers to the rule after * the matching one (slot points to the new rule; rulenum:rule_id-1 * is the matching rule), and additional info (e.g. info often contains * the insn argument or tablearg in the low 16 bits, in host format). * On entry, the structure is valid if slot>0, and refers to the starting * rules. 'info' contains the reason for reinject, e.g. divert port, * divert direction, and so on. * * Packet Mark is an analogue to ipfw tags with O(1) lookup from mbuf while * regular tags require a single-linked list traversal. Mark is a 32-bit * number that can be looked up in a table [with 'number' table-type], matched * or compared with a number with optional mask applied before comparison. * Having generic nature, Mark can be used in a variety of needs. * For example, it could be used as a security group: mark will hold a * security group id and represent a group of packet flows that shares same * access control policy. * O_MASK opcode can match mark value bitwise so one can build a hierarchical * model designating different meanings for a bit range(s). 
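The patched IPSTAT_ADD/KMOD_IPSTAT_INC bodies above are wrapped in do { ... } while (0) because they now contain two statements: the MIB_SDT_PROBE1 firing and the counter update. A standalone reminder of why that wrapper matters for statement-like macros (names here are illustrative only):

#include <stdio.h>

/*
 * Without the do/while(0), "if (x) COUNT(1); else ..." would not
 * parse: the semicolon after probe() would end the if, orphaning
 * the else.
 */
#define COUNT(n)	do { probe(n); counter += (n); } while (0)

static int counter;

static void
probe(int n)
{
	printf("probe fired: %d\n", n);
}

int
main(void)
{
	int drop = 0;

	if (drop)
		COUNT(1);
	else
		printf("not dropped, counter untouched\n");
	printf("counter = %d\n", counter);	/* prints 0 */
	return (0);
}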
*/ struct ipfw_rule_ref { /* struct m_tag spans 24 bytes above this point, see mbuf_tags(9) */ /* spare space just to be save in case struct m_tag grows */ /* -- 32 bytes -- */ uint32_t slot; /* slot for matching rule */ uint32_t rulenum; /* matching rule number */ uint32_t rule_id; /* matching rule id */ uint32_t chain_id; /* ruleset id */ uint32_t info; /* see below */ uint32_t pkt_mark; /* packet mark */ uint32_t spare[2]; /* -- 64 bytes -- */ }; enum { IPFW_INFO_MASK = 0x0000ffff, IPFW_INFO_OUT = 0x00000000, /* outgoing, just for convenience */ IPFW_INFO_IN = 0x80000000, /* incoming, overloads dir */ IPFW_ONEPASS = 0x40000000, /* One-pass, do not reinject */ IPFW_IS_MASK = 0x30000000, /* which source ? */ IPFW_IS_DIVERT = 0x20000000, IPFW_IS_DUMMYNET =0x10000000, IPFW_IS_PIPE = 0x08000000, /* pipe=1, queue = 0 */ }; #define MTAG_IPFW 1148380143 /* IPFW-tagged cookie */ #define MTAG_IPFW_RULE 1262273568 /* rule reference */ #define MTAG_IPFW_CALL 1308397630 /* call stack */ struct ip_fw_args; typedef int (*ip_fw_ctl_ptr_t)(struct sockopt *); VNET_DECLARE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr); #define V_ip_fw_ctl_ptr VNET(ip_fw_ctl_ptr) /* Divert hooks. */ extern void (*ip_divert_ptr)(struct mbuf *m, bool incoming); /* ng_ipfw hooks -- XXX make it the same as divert and dummynet */ extern int (*ng_ipfw_input_p)(struct mbuf **, struct ip_fw_args *, bool); extern int (*ip_dn_ctl_ptr)(struct sockopt *); extern int (*ip_dn_io_ptr)(struct mbuf **, struct ip_fw_args *); /* pf specific mtag for divert(4) support */ __enum_uint8_decl(pf_mtag_dir) { PF_DIVERT_MTAG_DIR_IN = 1, PF_DIVERT_MTAG_DIR_OUT = 2 }; struct pf_divert_mtag { __enum_uint8(pf_mtag_dir) idir; /* initial pkt direction */ union { __enum_uint8(pf_mtag_dir) ndir; /* new dir after re-enter */ uint16_t port; /* initial divert(4) port */ }; }; #define MTAG_PF_DIVERT 1262273569 #endif /* _KERNEL */ #endif /* !_NETINET_IP_VAR_H_ */ diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index 7542d680daa0..b16410dad4db 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -1,1576 +1,1585 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
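The chain_id/slot scheme in struct ipfw_rule_ref above is a generation-counter cache: the saved slot is trusted only while the ruleset generation still matches, so reinjected packets avoid a rule lookup. A userland model of that revalidation pattern (all names here are hypothetical, not the ipfw API):

#include <stddef.h>
#include <stdint.h>

struct chain {
	uint32_t	id;		/* bumped on every ruleset change */
	uint32_t	n_rules;
	void		**map;		/* slot -> rule */
};

struct rule_ref {
	uint32_t	slot;
	uint32_t	chain_id;	/* generation the slot was saved in */
};

static void *
slow_lookup(struct chain *ch, struct rule_ref *ref)
{
	(void)ch; (void)ref;
	return (NULL);			/* placeholder for a real search */
}

static void *
ref_to_rule(struct chain *ch, struct rule_ref *ref)
{
	if (ref->chain_id == ch->id && ref->slot < ch->n_rules)
		return (ch->map[ref->slot]);	/* O(1): still valid */
	return (slow_lookup(ch, ref));		/* ruleset changed */
}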
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _NETINET_TCP_VAR_H_ #define _NETINET_TCP_VAR_H_ #include #include #ifdef _KERNEL #include #include #include #endif #define TCP_END_BYTE_INFO 8 /* Bytes that makeup the "end information array" */ /* Types of ending byte info */ #define TCP_EI_EMPTY_SLOT 0 #define TCP_EI_STATUS_CLIENT_FIN 0x1 #define TCP_EI_STATUS_CLIENT_RST 0x2 #define TCP_EI_STATUS_SERVER_FIN 0x3 #define TCP_EI_STATUS_SERVER_RST 0x4 #define TCP_EI_STATUS_RETRAN 0x5 #define TCP_EI_STATUS_PROGRESS 0x6 #define TCP_EI_STATUS_PERSIST_MAX 0x7 #define TCP_EI_STATUS_KEEP_MAX 0x8 #define TCP_EI_STATUS_DATA_A_CLOSE 0x9 #define TCP_EI_STATUS_RST_IN_FRONT 0xa #define TCP_EI_STATUS_2MSL 0xb #define TCP_EI_STATUS_MAX_VALUE 0xb #define TCP_TRK_REQ_LOG_NEW 0x01 #define TCP_TRK_REQ_LOG_COMPLETE 0x02 #define TCP_TRK_REQ_LOG_FREED 0x03 #define TCP_TRK_REQ_LOG_ALLOCFAIL 0x04 #define TCP_TRK_REQ_LOG_MOREYET 0x05 #define TCP_TRK_REQ_LOG_FORCEFREE 0x06 #define TCP_TRK_REQ_LOG_STALE 0x07 #define TCP_TRK_REQ_LOG_SEARCH 0x08 /************************************************/ /* Status bits we track to assure no duplicates, * the bits here are not used by the code but * for human representation. To check a bit we * take and shift over by 1 minus the value (1-8). */ /************************************************/ #define TCP_EI_BITS_CLIENT_FIN 0x001 #define TCP_EI_BITS_CLIENT_RST 0x002 #define TCP_EI_BITS_SERVER_FIN 0x004 #define TCP_EI_BITS_SERVER_RST 0x008 #define TCP_EI_BITS_RETRAN 0x010 #define TCP_EI_BITS_PROGRESS 0x020 #define TCP_EI_BITS_PRESIST_MAX 0x040 #define TCP_EI_BITS_KEEP_MAX 0x080 #define TCP_EI_BITS_DATA_A_CLO 0x100 #define TCP_EI_BITS_RST_IN_FR 0x200 /* a front state reset */ #define TCP_EI_BITS_2MS_TIMER 0x400 /* 2 MSL timer expired */ #if defined(_KERNEL) || defined(_WANT_TCPCB) #include /* TCP segment queue entry */ struct tseg_qent { TAILQ_ENTRY(tseg_qent) tqe_q; struct mbuf *tqe_m; /* mbuf contains packet */ struct mbuf *tqe_last; /* last mbuf in chain */ tcp_seq tqe_start; /* TCP Sequence number start */ int tqe_len; /* TCP segment data length */ uint32_t tqe_flags; /* The flags from tcp_get_flags() */ uint32_t tqe_mbuf_cnt; /* Count of mbuf overhead */ }; TAILQ_HEAD(tsegqe_head, tseg_qent); struct sackblk { tcp_seq start; /* start seq no. of sack block */ tcp_seq end; /* end seq no. */ }; struct sackhole { tcp_seq start; /* start seq no. of hole */ tcp_seq end; /* end seq no. */ tcp_seq rxmit; /* next seq. 
no in hole to be retransmitted */ TAILQ_ENTRY(sackhole) scblink; /* scoreboard linkage */ }; struct sackhint { struct sackhole *nexthole; int32_t sack_bytes_rexmit; tcp_seq last_sack_ack; /* Most recent/largest sacked ack */ int32_t delivered_data; /* Newly acked data from last SACK */ int32_t sacked_bytes; /* Total sacked bytes reported by the * receiver via sack option */ uint32_t recover_fs; /* Flight Size at the start of Loss recovery */ uint32_t prr_delivered; /* Total bytes delivered using PRR */ uint32_t prr_out; /* Bytes sent during IN_RECOVERY */ int32_t hole_bytes; /* current number of bytes in scoreboard holes */ int32_t lost_bytes; /* number of rfc6675 IsLost() bytes */ }; #define SEGQ_EMPTY(tp) TAILQ_EMPTY(&(tp)->t_segq) STAILQ_HEAD(tcp_log_stailq, tcp_log_mem); #define TCP_TRK_TRACK_FLG_EMPTY 0x00 /* Available */ #define TCP_TRK_TRACK_FLG_USED 0x01 /* In use */ #define TCP_TRK_TRACK_FLG_OPEN 0x02 /* End is not valid (open range request) */ #define TCP_TRK_TRACK_FLG_SEQV 0x04 /* We had a sendfile that touched it */ #define TCP_TRK_TRACK_FLG_COMP 0x08 /* Sendfile as placed the last bits (range req only) */ #define TCP_TRK_TRACK_FLG_FSND 0x10 /* First send has been done into the seq space */ #define TCP_TRK_TRACK_FLG_LSND 0x20 /* We were able to set the Last Sent */ #define MAX_TCP_TRK_REQ 5 /* Max we will have at once */ struct tcp_sendfile_track { uint64_t timestamp; /* User sent timestamp */ uint64_t start; /* Start of sendfile offset */ uint64_t end; /* End if not open-range req */ uint64_t localtime; /* Time we actually got the req */ uint64_t deadline; /* If in CU mode, deadline to delivery */ uint64_t first_send; /* Time of first send in the range */ uint64_t cspr; /* Client suggested pace rate */ uint64_t sent_at_fs; /* What was t_sndbytes as we begun sending */ uint64_t rxt_at_fs; /* What was t_snd_rxt_bytes as we begun sending */ uint64_t sent_at_ls; /* Sent value at the last send */ uint64_t rxt_at_ls; /* Retransmit value at the last send */ tcp_seq start_seq; /* First TCP Seq assigned */ tcp_seq end_seq; /* If range req last seq */ uint32_t flags; /* Type of request open etc */ uint32_t sbcc_at_s; /* When we allocate what is the sb_cc */ uint32_t hint_maxseg; /* Client hinted maxseg */ uint32_t playout_ms; /* Client playout ms */ uint32_t hybrid_flags; /* Hybrid flags on this request */ }; /* * Change Query responses for a stack switch we create a structure * that allows query response from the new stack to the old, if * supported. * * There are three queries currently defined. * - sendmap * - timers * - rack_times * * For the sendmap query the caller fills in the * req and the req_param as the first seq (usually * snd_una). When the response comes back indicating * that there was data (return value 1), then the caller * can build a sendmap entry based on the range and the * times. The next query would then be done at the * newly created sendmap_end. Repeated until sendmap_end == snd_max. * * Flags in sendmap_flags are defined below as well. * * For timers the standard PACE_TMR_XXXX flags are returned indicating * a pacing timer (possibly) and one other timer. If pacing timer then * the expiration timeout time in microseconds is in timer_pacing_to. * And the value used with whatever timer (if a flag is set) is in * timer_rxt. If no timers are running a 0 is returned and of * course no flags are set in timer_hpts_flags. * * The rack_times are a misc collection of information that * the old stack might possibly fill in. 
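The snd_holes list declared further down in struct tcpcb is a sorted scoreboard of these sackhole entries. A toy sketch of how a hole is created when SACK information reveals a gap (userland, using sys/queue.h; the kernel's tcp_sack.c additionally merges, splits, and byte-accounts holes):

#include <stdint.h>
#include <stdlib.h>
#include <sys/queue.h>

typedef uint32_t tcp_seq;

struct hole {
	tcp_seq	start, end;	/* missing range [start, end) */
	tcp_seq	rxmit;		/* next seq in hole to retransmit */
	TAILQ_ENTRY(hole) link;
};
TAILQ_HEAD(holehead, hole);

static struct hole *
hole_append(struct holehead *hh, tcp_seq start, tcp_seq end)
{
	struct hole *h = malloc(sizeof(*h));

	if (h == NULL)
		return (NULL);
	h->start = start;
	h->end = end;
	h->rxmit = start;	/* retransmission resumes at hole start */
	TAILQ_INSERT_TAIL(hh, h, link);
	return (h);
}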
Of course it's possible * that an old stack may not have a piece of information. If so * then setting that value to zero is advised. Setting any * timestamp passed should only place a zero in it when it * is unfilled. This may mean that a time is off by a microsecond * but this is ok in the grand scheme of things. * * When switching stacks it is desirable to get as much information * from the old stack to the new stack as possible. Though not always * will the stack be compatible in the types of information. The * init() function needs to take care when it begins changing * things such as inp_flags2 and the timer units to position these * changes at a point where it is unlikely they will fail after * making such changes. A stack optionally can have an "undo" * function. * * To transfer information to the old stack from the new in * respect to LRO and the inp_flags2, the new stack should set * the inp_flags2 to what it supports. The old stack in its * fini() function should call the tcp_handle_orphaned_packets() * to clean up any packets. Note that a new stack should attempt */ /* Query types */ #define TCP_QUERY_SENDMAP 1 #define TCP_QUERY_TIMERS_UP 2 #define TCP_QUERY_RACK_TIMES 3 /* Flags returned in sendmap_flags */ #define SNDMAP_ACKED 0x000001/* The remote endpoint acked this */ #define SNDMAP_OVERMAX 0x000008/* We have more retransmits than we can fit */ #define SNDMAP_SACK_PASSED 0x000010/* A sack was done above this block */ #define SNDMAP_HAS_FIN 0x000040/* segment is sent with fin */ #define SNDMAP_TLP 0x000080/* segment sent as tail-loss-probe */ #define SNDMAP_HAS_SYN 0x000800/* SYN is on this segment */ #define SNDMAP_HAD_PUSH 0x008000/* Push was sent on original send */ #define SNDMAP_MASK (SNDMAP_ACKED|SNDMAP_OVERMAX|SNDMAP_SACK_PASSED|SNDMAP_HAS_FIN\ |SNDMAP_TLP|SNDMAP_HAS_SYN|SNDMAP_HAD_PUSH) #define SNDMAP_NRTX 3 struct tcp_query_resp { int req; uint32_t req_param; union { struct { tcp_seq sendmap_start; tcp_seq sendmap_end; int sendmap_send_cnt; uint64_t sendmap_time[SNDMAP_NRTX]; uint64_t sendmap_ack_arrival; int sendmap_flags; uint32_t sendmap_r_rtr_bytes; /* FAS, if available; 0 if not */ uint32_t sendmap_fas; uint8_t sendmap_dupacks; }; struct { uint32_t timer_hpts_flags; uint32_t timer_pacing_to; uint32_t timer_timer_exp; }; struct { /* Timestamps and rtt's */ uint32_t rack_reorder_ts; /* Last usec ts that reordering was seen */ uint32_t rack_num_dsacks; /* Num of dsacks seen */ uint32_t rack_rxt_last_time; /* Last time a RXT/TLP or rack tmr went off */ uint32_t rack_min_rtt; /* never 0 smallest rtt seen */ uint32_t rack_rtt; /* Last rtt used by rack */ uint32_t rack_tmit_time; /* The time the rtt seg was transmitted */ uint32_t rack_time_went_idle; /* If in persist the time we went idle */ /* Prr data */ uint32_t rack_sacked; uint32_t rack_holes_rxt; uint32_t rack_prr_delivered; uint32_t rack_prr_recovery_fs; uint32_t rack_prr_out; uint32_t rack_prr_sndcnt; /* TLP data */ uint16_t rack_tlp_cnt_out; /* How many tlp's have been sent */ /* Various bits */ uint8_t rack_tlp_out; /* Is a TLP outstanding */ uint8_t rack_srtt_measured; /* The previous stack has measured srtt */ uint8_t rack_in_persist; /* Is the old stack in persist?
*/ uint8_t rack_wanted_output; /* Did the prevous stack have a want output set */ }; }; }; #define TCP_TMR_GRANULARITY_TICKS 1 /* TCP timers are in ticks (msec if hz=1000) */ #define TCP_TMR_GRANULARITY_USEC 2 /* TCP timers are in microseconds */ typedef enum { TT_REXMT = 0, TT_PERSIST, TT_KEEP, TT_2MSL, TT_DELACK, TT_N, } tt_which; typedef enum { TT_PROCESSING = 0, TT_PROCESSED, TT_STARTING, TT_STOPPING, } tt_what; /* * Tcp control block, one per tcp connection. */ struct tcpcb { struct inpcb t_inpcb; /* embedded protocol independent cb */ #define t_start_zero t_fb #define t_zero_size (sizeof(struct tcpcb) - \ offsetof(struct tcpcb, t_start_zero)) struct tcp_function_block *t_fb;/* TCP function call block */ void *t_fb_ptr; /* Pointer to t_fb specific data */ struct callout t_callout; sbintime_t t_timers[TT_N]; sbintime_t t_precisions[TT_N]; /* HPTS. Used by BBR and Rack stacks. See tcp_hpts.c for more info. */ TAILQ_ENTRY(tcpcb) t_hpts; /* linkage to HPTS ring */ STAILQ_HEAD(, mbuf) t_inqueue; /* HPTS input packets queue */ uint32_t t_hpts_request; /* Current hpts request, zero if * fits in the pacing window. */ uint32_t t_hpts_slot; /* HPTS wheel slot this tcb is. */ uint32_t t_hpts_drop_reas; /* Reason we are dropping the pcb. */ uint32_t t_hpts_gencnt; uint16_t t_hpts_cpu; /* CPU chosen by hpts_cpuid(). */ uint16_t t_lro_cpu; /* CPU derived from LRO. */ #define HPTS_CPU_NONE ((uint16_t)-1) enum { IHPTS_NONE = 0, IHPTS_ONQUEUE, IHPTS_MOVING, } t_in_hpts; /* Is it linked into HPTS? */ uint32_t t_maxseg:24, /* maximum segment size */ _t_logstate:8; /* State of "black box" logging */ uint32_t t_port:16, /* Tunneling (over udp) port */ t_state:4, /* state of this connection */ t_idle_reduce : 1, t_delayed_ack: 7, /* Delayed ack variable */ t_fin_is_rst: 1, /* Are fin's treated as resets */ t_log_state_set: 1, bits_spare : 2; u_int t_flags; tcp_seq snd_una; /* sent but unacknowledged */ tcp_seq snd_max; /* highest sequence number sent; * used to recognize retransmits */ tcp_seq snd_nxt; /* send next */ tcp_seq snd_up; /* send urgent pointer */ uint32_t snd_wnd; /* send window */ uint32_t snd_cwnd; /* congestion-controlled window */ uint32_t ts_offset; /* our timestamp offset */ uint32_t rfbuf_ts; /* recv buffer autoscaling timestamp */ int rcv_numsacks; /* # distinct sack blks present */ u_int t_tsomax; /* TSO total burst length limit */ u_int t_tsomaxsegcount; /* TSO maximum segment count */ u_int t_tsomaxsegsize; /* TSO maximum segment size in bytes */ tcp_seq rcv_nxt; /* receive next */ tcp_seq rcv_adv; /* advertised window */ uint32_t rcv_wnd; /* receive window */ u_int t_flags2; /* More tcpcb flags storage */ int t_srtt; /* smoothed round-trip time */ int t_rttvar; /* variance in round-trip time */ uint32_t ts_recent; /* timestamp echo data */ u_char snd_scale; /* window scaling for send window */ u_char rcv_scale; /* window scaling for recv window */ u_char snd_limited; /* segments limited transmitted */ u_char request_r_scale; /* pending window scaling */ tcp_seq last_ack_sent; u_int t_rcvtime; /* inactivity time */ tcp_seq rcv_up; /* receive urgent pointer */ int t_segqlen; /* segment reassembly queue length */ uint32_t t_segqmbuflen; /* total reassembly queue byte length */ struct tsegqe_head t_segq; /* segment reassembly queue */ uint32_t snd_ssthresh; /* snd_cwnd size threshold for * for slow start exponential to * linear switch */ tcp_seq snd_wl1; /* window update seg seq number */ tcp_seq snd_wl2; /* window update seg ack number */ tcp_seq irs; /* initial receive sequence 
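Putting the sendmap description above into code: a hypothetical helper the new stack might run from its init path, iterating tfb_chg_query() from snd_una until the responses have covered everything up to snd_max (return value 1 meaning data was present, per the comment above). A sketch only, not kernel source:

static void
import_sendmap(struct tcpcb *tp, struct tcp_function_block *old_fb)
{
	struct tcp_query_resp qr;
	tcp_seq at = tp->snd_una;

	while (at != tp->snd_max) {
		qr.req = TCP_QUERY_SENDMAP;
		qr.req_param = at;		/* first seq of interest */
		if (old_fb->tfb_chg_query(tp, &qr) != 1)
			break;			/* old stack has no data */
		/*
		 * Build a sendmap entry for
		 * [qr.sendmap_start, qr.sendmap_end) using
		 * qr.sendmap_time[] and qr.sendmap_flags here.
		 */
		at = qr.sendmap_end;		/* next query starts here */
	}
}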
number */ tcp_seq iss; /* initial send sequence number */ u_int t_acktime; /* RACK and BBR incoming new data was acked */ u_int t_sndtime; /* time last data was sent */ u_int ts_recent_age; /* when last updated */ tcp_seq snd_recover; /* for use in NewReno Fast Recovery */ char t_oobflags; /* have some */ char t_iobc; /* input character */ uint8_t t_nic_ktls_xmit:1, /* active nic ktls xmit sessions */ t_nic_ktls_xmit_dis:1, /* disabled nic xmit ktls? */ t_nic_ktls_spare:6; /* spare nic ktls */ int t_rxtcur; /* current retransmit value (ticks) */ int t_rxtshift; /* log(2) of rexmt exp. backoff */ u_int t_rtttime; /* RTT measurement start time */ tcp_seq t_rtseq; /* sequence number being timed */ u_int t_starttime; /* time connection was established */ u_int t_fbyte_in; /* ticks time first byte queued in */ u_int t_fbyte_out; /* ticks time first byte queued out */ u_int t_pmtud_saved_maxseg; /* pre-blackhole MSS */ int t_blackhole_enter; /* when to enter blackhole detection */ int t_blackhole_exit; /* when to exit blackhole detection */ u_int t_rttmin; /* minimum rtt allowed */ int t_softerror; /* possible error not yet reported */ uint32_t max_sndwnd; /* largest window peer has offered */ uint32_t snd_cwnd_prev; /* cwnd prior to retransmit */ uint32_t snd_ssthresh_prev; /* ssthresh prior to retransmit */ tcp_seq snd_recover_prev; /* snd_recover prior to retransmit */ int t_sndzerowin; /* zero-window updates sent */ int snd_numholes; /* number of holes seen by sender */ u_int t_badrxtwin; /* window for retransmit recovery */ TAILQ_HEAD(sackhole_head, sackhole) snd_holes; /* SACK scoreboard (sorted) */ tcp_seq snd_fack; /* last seq number(+1) sack'd by rcv'r*/ struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */ struct sackhint sackhint; /* SACK scoreboard hint */ int t_rttlow; /* smallest observerved RTT */ int rfbuf_cnt; /* recv buffer autoscaling byte count */ struct toedev *tod; /* toedev handling this connection */ int t_sndrexmitpack; /* retransmit packets sent */ int t_rcvoopack; /* out-of-order packets received */ void *t_toe; /* TOE pcb pointer */ struct cc_algo *t_cc; /* congestion control algorithm */ struct cc_var t_ccv; /* congestion control specific vars */ int t_bytes_acked; /* # bytes acked during current RTT */ u_int t_maxunacktime; u_int t_keepinit; /* time to establish connection */ u_int t_keepidle; /* time before keepalive probes begin */ u_int t_keepintvl; /* interval between keepalives */ u_int t_keepcnt; /* number of keepalives before close */ int t_dupacks; /* consecutive dup acks recd */ int t_lognum; /* Number of log entries */ int t_loglimit; /* Maximum number of log entries */ uint32_t t_rcep; /* Number of received CE marked pkts */ uint32_t t_scep; /* Synced number of delivered CE pkts */ int64_t t_pacing_rate; /* bytes / sec, -1 => unlimited */ struct tcp_log_stailq t_logs; /* Log buffer */ struct tcp_log_id_node *t_lin; struct tcp_log_id_bucket *t_lib; const char *t_output_caller; /* Function that called tcp_output */ struct statsblob *t_stats; /* Per-connection stats */ /* Should these be a pointer to the arrays or an array? 
*/ uint32_t t_logsn; /* Log "serial number" */ uint32_t gput_ts; /* Time goodput measurement started */ tcp_seq gput_seq; /* Outbound measurement seq */ tcp_seq gput_ack; /* Inbound measurement ack */ int32_t t_stats_gput_prev; /* XXXLAS: Prev gput measurement */ uint32_t t_maxpeakrate; /* max peak rate set by user, bytes/s */ uint32_t t_sndtlppack; /* tail loss probe packets sent */ uint64_t t_sndtlpbyte; /* total tail loss probe bytes sent */ uint64_t t_sndbytes; /* total bytes sent */ uint64_t t_snd_rxt_bytes; /* total bytes retransmitted */ uint32_t t_dsack_bytes; /* dsack bytes received */ uint32_t t_dsack_tlp_bytes; /* dsack bytes received for TLPs sent */ uint32_t t_dsack_pack; /* dsack packets we have eceived */ uint8_t t_tmr_granularity; /* Granularity of all timers srtt etc */ uint8_t t_rttupdated; /* number of times rtt sampled */ /* TCP Fast Open */ uint8_t t_tfo_client_cookie_len; /* TFO client cookie length */ uint32_t t_end_info_status; /* Status flag of end info */ unsigned int *t_tfo_pending; /* TFO server pending counter */ union { uint8_t client[TCP_FASTOPEN_MAX_COOKIE_LEN]; uint64_t server; } t_tfo_cookie; /* TCP Fast Open cookie to send */ union { uint8_t t_end_info_bytes[TCP_END_BYTE_INFO]; uint64_t t_end_info; }; struct osd t_osd; /* storage for Khelp module data */ uint8_t _t_logpoint; /* Used when a BB log points is enabled */ /* * Keep all #ifdef'ed components at the end of the structure! * This is important to minimize problems when compiling modules * using this structure from within the modules' directory. */ #ifdef TCP_REQUEST_TRK /* Response tracking addons. */ uint8_t t_tcpreq_req; /* Request count */ uint8_t t_tcpreq_open; /* Number of open range requests */ uint8_t t_tcpreq_closed; /* Number of closed range requests */ uint32_t tcp_hybrid_start; /* Num of times we started hybrid pacing */ uint32_t tcp_hybrid_stop; /* Num of times we stopped hybrid pacing */ uint32_t tcp_hybrid_error; /* Num of times we failed to start hybrid pacing */ struct tcp_sendfile_track t_tcpreq_info[MAX_TCP_TRK_REQ]; #endif #ifdef TCP_ACCOUNTING uint64_t tcp_cnt_counters[TCP_NUM_CNT_COUNTERS]; uint64_t tcp_proc_time[TCP_NUM_CNT_COUNTERS]; #endif #ifdef TCPPCAP struct mbufq t_inpkts; /* List of saved input packets. */ struct mbufq t_outpkts; /* List of saved output packets. */ #endif }; #endif /* _KERNEL || _WANT_TCPCB */ #ifdef _KERNEL struct tcptemp { u_char tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */ struct tcphdr tt_t; }; /* SACK scoreboard update status */ typedef enum { SACK_NOCHANGE = 0, SACK_CHANGE, SACK_NEWLOSS } sackstatus_t; /* Enable TCP/UDP tunneling port */ #define TCP_TUNNELING_PORT_MIN 0 #define TCP_TUNNELING_PORT_MAX 65535 #define TCP_TUNNELING_PORT_DEFAULT 0 /* Enable TCP/UDP tunneling port */ #define TCP_TUNNELING_OVERHEAD_MIN sizeof(struct udphdr) #define TCP_TUNNELING_OVERHEAD_MAX 1024 #define TCP_TUNNELING_OVERHEAD_DEFAULT TCP_TUNNELING_OVERHEAD_MIN /* Minimum map entries limit value, if set */ #define TCP_MIN_MAP_ENTRIES_LIMIT 128 /* * TODO: We yet need to brave plowing in * to tcp_input() and the pru_usrreq() block. * Right now these go to the old standards which * are somewhat ok, but in the long term may * need to be changed. If we do tackle tcp_input() * then we need to get rid of the tcp_do_segment() * function below. 
*/ /* Flags for tcp functions */ #define TCP_FUNC_BEING_REMOVED 0x01 /* Can no longer be referenced */ #define TCP_FUNC_OUTPUT_CANDROP 0x02 /* tfb_tcp_output may ask tcp_drop */ /** * Adding a tfb_tcp_handoff_ok function allows the socket * option to change stacks to query you even if the * connection is in a later stage. You return 0 to * say you can take over and run your stack, you return * non-zero (an error number) to say no you can't. * If the function is undefined you can only change * in the early states (before connect or listen). * * tfb_tcp_fb_init is used to allow the new stack to * set up its control block. Among the things it must * do are: * a) Make sure that the inp_flags2 is set up correctly * for LRO. There are two flags that the previous * stack may have set, INP_MBUF_ACKCMP and * INP_SUPPORTS_MBUFQ. If the new stack does not * support these it *should* clear the flags. * b) Make sure that the timers are in the proper * granularity that the stack wants. The stack * should check the t_tmr_granularity field. Currently * there are two values that it may hold, * TCP_TMR_GRANULARITY_TICKS and TCP_TMR_GRANULARITY_USEC. * Use the function tcp_timer_convert(tp, granularity) * to move the timers to the correct format for your stack. * * The new stack may also optionally query the tfb_chg_query * function if the old stack has one. The new stack may ask * for one of three entries and can also state to the old * stack its support for the INP_MBUF_ACKCMP and * INP_SUPPORTS_MBUFQ. This is important since if there are * queued acks without that statement the old stack will * be forced to discard the queued acks. The requests that * can be made for information by the new stacks are: * * Note also that tfb_tcp_fb_init(), when called, can * determine if a query is needed by looking at the * value passed in the ptr. The ptr is designed to be * set in with any allocated memory, but the address * of the condition (ptr == &tp->t_fb_ptr) will be * true if this is not a stack switch but the initial * setup of a tcb (which means no query would be needed). * If, however, the value is not t_fb_ptr, then the caller * is in the middle of a stack switch and is the new stack. * A query would be appropriate (if the new stack supports * the query mechanism). * * TCP_QUERY_SENDMAP - Query of outstanding data. * TCP_QUERY_TIMERS_UP - Query about running timers. * TCP_SUPPORTED_LRO - Declaration in req_param of * the inp_flags2 supported by * the new stack. * TCP_QUERY_RACK_TIMES - Enquire about various timestamps * and states the old stack may be in. * * tfb_tcp_fb_fini is changed to add a flag to tell * the old stack if the tcb is being destroyed or * not. A one in the flag means the TCB is being * destroyed, a zero indicates it is transitioning to * another stack (via socket option). The * tfb_tcp_fb_fini() function itself should not change timers * or inp_flags2 (the tfb_tcp_fb_init() must do that). However, * if the old stack supports the LRO mbuf queuing, and the new * stack does not communicate via chg messages that it too does, * it must assume it does not and free any queued mbufs.
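A skeletal tfb_tcp_fb_init honoring the rules above, for a hypothetical stack "foo" (foo_alloc() and foo_import() are illustrative placeholders; only the ptr == &tp->t_fb_ptr test comes from the contract described here):

static int
foo_fb_init(struct tcpcb *tp, void **ptr)
{
	void *fs = foo_alloc();	/* hypothetical per-connection state */

	if (fs == NULL)
		return (ENOMEM);
	*ptr = fs;
	if (ptr == &tp->t_fb_ptr)
		return (0);	/* initial setup of a tcb: no query needed */
	/*
	 * Mid stack-switch: we are the new stack. Declare our LRO
	 * support and query the old stack (sendmap, timers, rack
	 * times) before it discards anything, per the comment above.
	 */
	foo_import(tp);		/* hypothetical; see import_sendmap() */
	return (0);
}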
* */ struct tcp_function_block { char tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX]; int (*tfb_tcp_output)(struct tcpcb *); void (*tfb_tcp_do_segment)(struct tcpcb *, struct mbuf *, struct tcphdr *, int, int, uint8_t); int (*tfb_do_segment_nounlock)(struct tcpcb *, struct mbuf *, struct tcphdr *, int, int, uint8_t, int, struct timeval *); int (*tfb_do_queued_segments)(struct tcpcb *, int); int (*tfb_tcp_ctloutput)(struct tcpcb *, struct sockopt *); /* Optional memory allocation/free routine */ int (*tfb_tcp_fb_init)(struct tcpcb *, void **); void (*tfb_tcp_fb_fini)(struct tcpcb *, int); /* Optional timers, must define all if you define one */ int (*tfb_tcp_timer_stop_all)(struct tcpcb *); void (*tfb_tcp_rexmit_tmr)(struct tcpcb *); int (*tfb_tcp_handoff_ok)(struct tcpcb *); void (*tfb_tcp_mtu_chg)(struct tcpcb *tp); int (*tfb_pru_options)(struct tcpcb *, int); void (*tfb_hwtls_change)(struct tcpcb *, int); int (*tfb_chg_query)(struct tcpcb *, struct tcp_query_resp *); void (*tfb_switch_failed)(struct tcpcb *); bool (*tfb_early_wake_check)(struct tcpcb *); int (*tfb_compute_pipe)(struct tcpcb *tp); int (*tfb_stack_info)(struct tcpcb *tp, struct stack_specific_info *); void (*tfb_inherit)(struct tcpcb *tp, struct inpcb *h_inp); volatile uint32_t tfb_refcnt; uint32_t tfb_flags; uint8_t tfb_id; }; struct tcp_function { TAILQ_ENTRY(tcp_function) tf_next; char tf_name[TCP_FUNCTION_NAME_LEN_MAX]; struct tcp_function_block *tf_fb; }; TAILQ_HEAD(tcp_funchead, tcp_function); struct tcpcb * tcp_drop(struct tcpcb *, int); #ifdef _NETINET_IN_PCB_H_ #define intotcpcb(inp) __containerof((inp), struct tcpcb, t_inpcb) #define sototcpcb(so) intotcpcb(sotoinpcb(so)) #define tptoinpcb(tp) (&(tp)->t_inpcb) #define tptosocket(tp) (tp)->t_inpcb.inp_socket /* * tcp_output() * Handles tcp_drop request from advanced stacks and reports that inpcb is * gone with negative return code. * Drop in replacement for the default stack. */ static inline int tcp_output(struct tcpcb *tp) { struct inpcb *inp = tptoinpcb(tp); int rv; INP_WLOCK_ASSERT(inp); rv = tp->t_fb->tfb_tcp_output(tp); if (rv < 0) { KASSERT(tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP, ("TCP stack %s requested tcp_drop(%p)", tp->t_fb->tfb_tcp_block_name, tp)); tp = tcp_drop(tp, -rv); if (tp) INP_WUNLOCK(inp); } return (rv); } /* * tcp_output_unlock() * Always returns unlocked, handles drop request from advanced stacks. * Always returns positive error code. */ static inline int tcp_output_unlock(struct tcpcb *tp) { struct inpcb *inp = tptoinpcb(tp); int rv; INP_WLOCK_ASSERT(inp); rv = tp->t_fb->tfb_tcp_output(tp); if (rv < 0) { KASSERT(tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP, ("TCP stack %s requested tcp_drop(%p)", tp->t_fb->tfb_tcp_block_name, tp)); rv = -rv; tp = tcp_drop(tp, rv); if (tp) INP_WUNLOCK(inp); } else INP_WUNLOCK(inp); return (rv); } /* * tcp_output_nodrop() * Always returns locked. It is caller's responsibility to run tcp_drop()! * Useful in syscall implementations, when we want to perform some logging * and/or tracing with tcpcb before calling tcp_drop(). To be used with * tcp_unlock_or_drop() later. * * XXXGL: maybe don't allow stacks to return a drop request at certain * TCP states? Why would it do in connect(2)? In recv(2)? 
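Usage sketch of the drop convention the inlines above implement: a caller that wants to log while the tcpcb is still valid uses tcp_output_nodrop() and then hands the return value to tcp_unlock_or_drop(). trace_output() here is a hypothetical logging hook, not a real function:

static int
send_and_trace(struct tcpcb *tp)
{
	int rv;

	INP_WLOCK_ASSERT(tptoinpcb(tp));
	rv = tcp_output_nodrop(tp);	/* never drops; inp stays locked */
	trace_output(tp, rv);		/* hypothetical: tp still valid */
	return (tcp_unlock_or_drop(tp, rv)); /* unlocks; drops if rv < 0 */
}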
*/ static inline int tcp_output_nodrop(struct tcpcb *tp) { int rv; INP_WLOCK_ASSERT(tptoinpcb(tp)); rv = tp->t_fb->tfb_tcp_output(tp); KASSERT(rv >= 0 || tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP, ("TCP stack %s requested tcp_drop(%p)", tp->t_fb->tfb_tcp_block_name, tp)); return (rv); } /* * tcp_unlock_or_drop() * Handle return code from tfb_tcp_output() after we have logged/traced, * to be used with tcp_output_nodrop(). */ static inline int tcp_unlock_or_drop(struct tcpcb *tp, int tcp_output_retval) { struct inpcb *inp = tptoinpcb(tp); INP_WLOCK_ASSERT(inp); if (tcp_output_retval < 0) { tcp_output_retval = -tcp_output_retval; if (tcp_drop(tp, tcp_output_retval) != NULL) INP_WUNLOCK(inp); } else INP_WUNLOCK(inp); return (tcp_output_retval); } #endif /* _NETINET_IN_PCB_H_ */ static int inline tcp_packets_this_ack(struct tcpcb *tp, tcp_seq ack) { return ((ack - tp->snd_una) / tp->t_maxseg + ((((ack - tp->snd_una) % tp->t_maxseg) != 0) ? 1 : 0)); } #endif /* _KERNEL */ /* * Flags and utility macros for the t_flags field. */ #define TF_ACKNOW 0x00000001 /* ack peer immediately */ #define TF_DELACK 0x00000002 /* ack, but try to delay it */ #define TF_NODELAY 0x00000004 /* don't delay packets to coalesce */ #define TF_NOOPT 0x00000008 /* don't use tcp options */ #define TF_SENTFIN 0x00000010 /* have sent FIN */ #define TF_REQ_SCALE 0x00000020 /* have/will request window scaling */ #define TF_RCVD_SCALE 0x00000040 /* other side has requested scaling */ #define TF_REQ_TSTMP 0x00000080 /* have/will request timestamps */ #define TF_RCVD_TSTMP 0x00000100 /* a timestamp was received in SYN */ #define TF_SACK_PERMIT 0x00000200 /* other side said I could SACK */ #define TF_NEEDSYN 0x00000400 /* send SYN (implicit state) */ #define TF_NEEDFIN 0x00000800 /* send FIN (implicit state) */ #define TF_NOPUSH 0x00001000 /* don't push */ #define TF_PREVVALID 0x00002000 /* saved values for bad rxmit valid * Note: accessing and restoring from * these may only be done in the 1st * RTO recovery round (t_rxtshift == 1) */ #define TF_WAKESOR 0x00004000 /* wake up receive socket */ #define TF_GPUTINPROG 0x00008000 /* Goodput measurement in progress */ #define TF_MORETOCOME 0x00010000 /* More data to be appended to sock */ #define TF_SONOTCONN 0x00020000 /* needs soisconnected() on ESTAB */ #define TF_LASTIDLE 0x00040000 /* connection was previously idle */ #define TF_RXWIN0SENT 0x00080000 /* sent a receiver win 0 in response */ #define TF_FASTRECOVERY 0x00100000 /* in NewReno Fast Recovery */ #define TF_WASFRECOVERY 0x00200000 /* was in NewReno Fast Recovery */ #define TF_SIGNATURE 0x00400000 /* require MD5 digests (RFC2385) */ #define TF_FORCEDATA 0x00800000 /* force out a byte */ #define TF_TSO 0x01000000 /* TSO enabled on this connection */ #define TF_TOE 0x02000000 /* this connection is offloaded */ #define TF_CLOSED 0x04000000 /* close(2) called on socket */ #define TF_SENTSYN 0x08000000 /* At least one syn has been sent */ #define TF_LRD 0x10000000 /* Lost Retransmission Detection */ #define TF_CONGRECOVERY 0x20000000 /* congestion recovery mode */ #define TF_WASCRECOVERY 0x40000000 /* was in congestion recovery */ #define TF_FASTOPEN 0x80000000 /* TCP Fast Open indication */ #define IN_FASTRECOVERY(t_flags) (t_flags & TF_FASTRECOVERY) #define ENTER_FASTRECOVERY(t_flags) t_flags |= TF_FASTRECOVERY #define EXIT_FASTRECOVERY(t_flags) t_flags &= ~TF_FASTRECOVERY #define IN_CONGRECOVERY(t_flags) (t_flags & TF_CONGRECOVERY) #define ENTER_CONGRECOVERY(t_flags) t_flags |= TF_CONGRECOVERY #define 
EXIT_CONGRECOVERY(t_flags) t_flags &= ~TF_CONGRECOVERY #define IN_RECOVERY(t_flags) (t_flags & (TF_CONGRECOVERY | TF_FASTRECOVERY)) #define ENTER_RECOVERY(t_flags) t_flags |= (TF_CONGRECOVERY | TF_FASTRECOVERY) #define EXIT_RECOVERY(t_flags) t_flags &= ~(TF_CONGRECOVERY | TF_FASTRECOVERY) #define BYTES_THIS_ACK(tp, th) (th->th_ack - tp->snd_una) /* * Flags for the t_oobflags field. */ #define TCPOOB_HAVEDATA 0x01 #define TCPOOB_HADDATA 0x02 /* * Flags for the extended TCP flags field, t_flags2 */ #define TF2_PLPMTU_BLACKHOLE 0x00000001 /* Possible PLPMTUD Black Hole. */ #define TF2_PLPMTU_PMTUD 0x00000002 /* Allowed to attempt PLPMTUD. */ #define TF2_PLPMTU_MAXSEGSNT 0x00000004 /* Last seg sent was full seg. */ #define TF2_LOG_AUTO 0x00000008 /* Session is auto-logging. */ #define TF2_DROP_AF_DATA 0x00000010 /* Drop after all data ack'd */ #define TF2_ECN_PERMIT 0x00000020 /* connection ECN-ready */ #define TF2_ECN_SND_CWR 0x00000040 /* ECN CWR in queue */ #define TF2_ECN_SND_ECE 0x00000080 /* ECN ECE in queue */ #define TF2_ACE_PERMIT 0x00000100 /* Accurate ECN mode */ #define TF2_HPTS_CPU_SET 0x00000200 /* t_hpts_cpu is not random */ #define TF2_FBYTES_COMPLETE 0x00000400 /* We have first bytes in and out */ #define TF2_ECN_USE_ECT1 0x00000800 /* Use ECT(1) marking on session */ #define TF2_TCP_ACCOUNTING 0x00001000 /* Do TCP accounting */ #define TF2_HPTS_CALLS 0x00002000 /* tcp_output() called via HPTS */ #define TF2_MBUF_L_ACKS 0x00004000 /* large mbufs for ack compression */ #define TF2_MBUF_ACKCMP 0x00008000 /* mbuf ack compression ok */ #define TF2_SUPPORTS_MBUFQ 0x00010000 /* Supports the mbuf queue method */ #define TF2_MBUF_QUEUE_READY 0x00020000 /* Inputs can be queued */ #define TF2_DONT_SACK_QUEUE 0x00040000 /* Don't wake on sack */ #define TF2_CANNOT_DO_ECN 0x00080000 /* The stack does not do ECN */ /* * Structure to hold TCP options that are only used during segment * processing (in tcp_input), but not held in the tcpcb. * It's basically used to reduce the number of parameters * to tcp_dooptions and tcp_addoptions. * The binary order of the to_flags is relevant for packing of the * options in tcp_addoptions. */ struct tcpopt { u_int32_t to_flags; /* which options are present */ #define TOF_MSS 0x0001 /* maximum segment size */ #define TOF_SCALE 0x0002 /* window scaling */ #define TOF_SACKPERM 0x0004 /* SACK permitted */ #define TOF_TS 0x0010 /* timestamp */ #define TOF_SIGNATURE 0x0040 /* TCP-MD5 signature option (RFC2385) */ #define TOF_SACK 0x0080 /* Peer sent SACK option */ #define TOF_FASTOPEN 0x0100 /* TCP Fast Open (TFO) cookie */ #define TOF_MAXOPT 0x0200 u_int32_t to_tsval; /* new timestamp */ u_int32_t to_tsecr; /* reflected timestamp */ u_char *to_sacks; /* pointer to the first SACK blocks */ u_char *to_signature; /* pointer to the TCP-MD5 signature */ u_int8_t *to_tfo_cookie; /* pointer to the TFO cookie */ u_int16_t to_mss; /* maximum segment size */ u_int8_t to_wscale; /* window scaling */ u_int8_t to_nsacks; /* number of SACK blocks */ u_int8_t to_tfo_len; /* TFO cookie length */ u_int32_t to_spare; /* UTO */ }; /* * Flags for tcp_dooptions. 
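For the tcp_packets_this_ack() inline a few lines up: the divide plus remainder test is simply a ceiling division of the acked byte count by t_maxseg. A standalone check of that identity against the usual (a + b - 1) / b form:

#include <assert.h>
#include <stdio.h>

static unsigned
pkts_this_ack(unsigned acked, unsigned maxseg)
{
	return (acked / maxseg + ((acked % maxseg) != 0 ? 1 : 0));
}

int
main(void)
{
	assert(pkts_this_ack(1, 1448) == 1);
	assert(pkts_this_ack(1448, 1448) == 1);
	assert(pkts_this_ack(1449, 1448) == 2);
	for (unsigned a = 0; a < 100000; a++)
		assert(pkts_this_ack(a, 1448) == (a + 1448 - 1) / 1448);
	printf("ok\n");
	return (0);
}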
*/ #define TO_SYN 0x01 /* parse SYN-only options */ struct hc_metrics_lite { /* must stay in sync with hc_metrics */ uint32_t rmx_mtu; /* MTU for this path */ uint32_t rmx_ssthresh; /* outbound gateway buffer limit */ uint32_t rmx_rtt; /* estimated round trip time */ uint32_t rmx_rttvar; /* estimated rtt variance */ uint32_t rmx_cwnd; /* congestion window */ uint32_t rmx_sendpipe; /* outbound delay-bandwidth product */ uint32_t rmx_recvpipe; /* inbound delay-bandwidth product */ }; #ifndef _NETINET_IN_PCB_H_ struct in_conninfo; #endif /* _NETINET_IN_PCB_H_ */ /* * The smoothed round-trip time and estimated variance * are stored as fixed point numbers scaled by the values below. * For convenience, these scales are also used in smoothing the average * (smoothed = (1/scale)sample + ((scale-1)/scale)smoothed). * With these scales, srtt has 3 bits to the right of the binary point, * and thus an "ALPHA" of 0.875. rttvar has 2 bits to the right of the * binary point, and is smoothed with an ALPHA of 0.75. */ #define TCP_RTT_SCALE 32 /* multiplier for srtt; 5 bits frac. */ #define TCP_RTT_SHIFT 5 /* shift for srtt; 5 bits frac. */ #define TCP_RTTVAR_SCALE 16 /* multiplier for rttvar; 4 bits */ #define TCP_RTTVAR_SHIFT 4 /* shift for rttvar; 4 bits */ #define TCP_DELTA_SHIFT 2 /* see tcp_input.c */ /* * The initial retransmission should happen at rtt + 4 * rttvar. * Because of the way we do the smoothing, srtt and rttvar * will each average +1/2 tick of bias. When we compute * the retransmit timer, we want 1/2 tick of rounding and * 1 extra tick because of +-1/2 tick uncertainty in the * firing of the timer. The bias will give us exactly the * 1.5 tick we need. But, because the bias is * statistical, we have to test that we don't drop below * the minimum feasible timer (which is 2 ticks). * This version of the macro adapted from a paper by Lawrence * Brakmo and Larry Peterson which outlines a problem caused * by insufficient precision in the original implementation, * which results in inappropriately large RTO values for very * fast networks. */ #define TCP_REXMTVAL(tp) \ max((tp)->t_rttmin, (((tp)->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)) \ + (tp)->t_rttvar) >> TCP_DELTA_SHIFT) /* * TCP statistics. * Many of these should be kept per connection, * but that's inconvenient at the moment. */ struct tcpstat { uint64_t tcps_connattempt; /* connections initiated */ uint64_t tcps_accepts; /* connections accepted */ uint64_t tcps_connects; /* connections established */ uint64_t tcps_drops; /* connections dropped */ uint64_t tcps_conndrops; /* embryonic connections dropped */ uint64_t tcps_minmssdrops; /* average minmss too low drops */ uint64_t tcps_closed; /* conn. closed (includes drops) */ uint64_t tcps_segstimed; /* segs where we tried to get rtt */ uint64_t tcps_rttupdated; /* times we succeeded */ uint64_t tcps_delack; /* delayed acks sent */ uint64_t tcps_timeoutdrop; /* conn. 
dropped in rxmt timeout */ uint64_t tcps_rexmttimeo; /* retransmit timeouts */ uint64_t tcps_persisttimeo; /* persist timeouts */ uint64_t tcps_keeptimeo; /* keepalive timeouts */ uint64_t tcps_keepprobe; /* keepalive probes sent */ uint64_t tcps_keepdrops; /* connections dropped in keepalive */ uint64_t tcps_progdrops; /* drops due to no progress */ uint64_t tcps_sndtotal; /* total packets sent */ uint64_t tcps_sndpack; /* data packets sent */ uint64_t tcps_sndbyte; /* data bytes sent */ uint64_t tcps_sndrexmitpack; /* data packets retransmitted */ uint64_t tcps_sndrexmitbyte; /* data bytes retransmitted */ uint64_t tcps_sndrexmitbad; /* unnecessary packet retransmissions */ uint64_t tcps_sndacks; /* ack-only packets sent */ uint64_t tcps_sndprobe; /* window probes sent */ uint64_t tcps_sndurg; /* packets sent with URG only */ uint64_t tcps_sndwinup; /* window update-only packets sent */ uint64_t tcps_sndctrl; /* control (SYN|FIN|RST) packets sent */ uint64_t tcps_rcvtotal; /* total packets received */ uint64_t tcps_rcvpack; /* packets received in sequence */ uint64_t tcps_rcvbyte; /* bytes received in sequence */ uint64_t tcps_rcvbadsum; /* packets received with cksum errs */ uint64_t tcps_rcvbadoff; /* packets received with bad offset */ uint64_t tcps_rcvreassfull; /* packets dropped for no reass space */ uint64_t tcps_rcvshort; /* packets received too short */ uint64_t tcps_rcvduppack; /* duplicate-only packets received */ uint64_t tcps_rcvdupbyte; /* duplicate-only bytes received */ uint64_t tcps_rcvpartduppack; /* packets with some duplicate data */ uint64_t tcps_rcvpartdupbyte; /* dup. bytes in part-dup. packets */ uint64_t tcps_rcvoopack; /* out-of-order packets received */ uint64_t tcps_rcvoobyte; /* out-of-order bytes received */ uint64_t tcps_rcvpackafterwin; /* packets with data after window */ uint64_t tcps_rcvbyteafterwin; /* bytes rcvd after window */ uint64_t tcps_rcvafterclose; /* packets rcvd after "close" */ uint64_t tcps_rcvwinprobe; /* rcvd window probe packets */ uint64_t tcps_rcvdupack; /* rcvd duplicate acks */ uint64_t tcps_rcvacktoomuch; /* rcvd acks for unsent data */ uint64_t tcps_rcvackpack; /* rcvd ack packets */ uint64_t tcps_rcvackbyte; /* bytes acked by rcvd acks */ uint64_t tcps_rcvwinupd; /* rcvd window update packets */ uint64_t tcps_pawsdrop; /* segments dropped due to PAWS */ uint64_t tcps_predack; /* times hdr predict ok for acks */ uint64_t tcps_preddat; /* times hdr predict ok for data pkts */ uint64_t tcps_pcbcachemiss; uint64_t tcps_cachedrtt; /* times cached RTT in route updated */ uint64_t tcps_cachedrttvar; /* times cached rttvar updated */ uint64_t tcps_cachedssthresh; /* times cached ssthresh updated */ uint64_t tcps_usedrtt; /* times RTT initialized from route */ uint64_t tcps_usedrttvar; /* times RTTVAR initialized from rt */ uint64_t tcps_usedssthresh; /* times ssthresh initialized from rt*/ uint64_t tcps_persistdrop; /* timeout in persist state */ uint64_t tcps_badsyn; /* bogus SYN, e.g.
premature ACK */ uint64_t tcps_mturesent; /* resends due to MTU discovery */ uint64_t tcps_listendrop; /* listen queue overflows */ uint64_t tcps_badrst; /* ignored RSTs in the window */ uint64_t tcps_sc_added; /* entry added to syncache */ uint64_t tcps_sc_retransmitted; /* syncache entry was retransmitted */ uint64_t tcps_sc_dupsyn; /* duplicate SYN packet */ uint64_t tcps_sc_dropped; /* could not reply to packet */ uint64_t tcps_sc_completed; /* successful extraction of entry */ uint64_t tcps_sc_bucketoverflow;/* syncache per-bucket limit hit */ uint64_t tcps_sc_cacheoverflow; /* syncache cache limit hit */ uint64_t tcps_sc_reset; /* RST removed entry from syncache */ uint64_t tcps_sc_stale; /* timed out or listen socket gone */ uint64_t tcps_sc_aborted; /* syncache entry aborted */ uint64_t tcps_sc_badack; /* removed due to bad ACK */ uint64_t tcps_sc_unreach; /* ICMP unreachable received */ uint64_t tcps_sc_zonefail; /* zalloc() failed */ uint64_t tcps_sc_sendcookie; /* SYN cookie sent */ uint64_t tcps_sc_recvcookie; /* SYN cookie received */ uint64_t tcps_hc_added; /* entry added to hostcache */ uint64_t tcps_hc_bucketoverflow;/* hostcache per bucket limit hit */ uint64_t tcps_finwait2_drops; /* Drop FIN_WAIT_2 connection after time limit */ /* SACK related stats */ uint64_t tcps_sack_recovery_episode; /* SACK recovery episodes */ uint64_t tcps_sack_rexmits; /* SACK rexmit segments */ uint64_t tcps_sack_rexmit_bytes; /* SACK rexmit bytes */ uint64_t tcps_sack_rcv_blocks; /* SACK blocks (options) received */ uint64_t tcps_sack_send_blocks; /* SACK blocks (options) sent */ uint64_t tcps_sack_lostrexmt; /* SACK lost retransmission recovered */ uint64_t tcps_sack_sboverflow; /* times scoreboard overflowed */ /* ECN related stats */ uint64_t tcps_ecn_rcvce; /* ECN Congestion Experienced */ uint64_t tcps_ecn_rcvect0; /* ECN Capable Transport */ uint64_t tcps_ecn_rcvect1; /* ECN Capable Transport */ uint64_t tcps_ecn_shs; /* ECN successful handshakes */ uint64_t tcps_ecn_rcwnd; /* # times ECN reduced the cwnd */ /* TCP_SIGNATURE related stats */ uint64_t tcps_sig_rcvgoodsig; /* Total matching signature received */ uint64_t tcps_sig_rcvbadsig; /* Total bad signature received */ uint64_t tcps_sig_err_buildsig; /* Failed to make signature */ uint64_t tcps_sig_err_sigopt; /* No signature expected by socket */ uint64_t tcps_sig_err_nosigopt; /* No signature provided by segment */ /* Path MTU Discovery Black Hole Detection related stats */ uint64_t tcps_pmtud_blackhole_activated; /* Black Hole Count */ uint64_t tcps_pmtud_blackhole_activated_min_mss; /* BH at min MSS Count */ uint64_t tcps_pmtud_blackhole_failed; /* Black Hole Failure Count */ uint64_t tcps_tunneled_pkts; /* Packets received encapsulated in UDP */ uint64_t tcps_tunneled_errs; /* UDP-encapsulated packets that had errors */ /* DSACK related stats */ uint64_t tcps_dsack_count; /* Number of ACKs arriving with DSACKs */ uint64_t tcps_dsack_bytes; /* Number of bytes DSACK'ed no TLP */ uint64_t tcps_dsack_tlp_bytes; /* Number of bytes DSACK'ed due to TLPs */ /* TCPS_TIME_WAIT usage stats */ uint64_t tcps_tw_recycles; /* Times time-wait was recycled. */ uint64_t tcps_tw_resets; /* Times time-wait sent a reset. */ uint64_t tcps_tw_responds; /* Times time-wait sent a valid ack.
*/ /* Accurate ECN Handshake stats */ uint64_t tcps_ace_nect; /* ACE SYN packet with Non-ECT */ uint64_t tcps_ace_ect1; /* ACE SYN packet with ECT1 */ uint64_t tcps_ace_ect0; /* ACE SYN packet with ECT0 */ uint64_t tcps_ace_ce; /* ACE SYN packet with CE */ /* ECN related stats */ uint64_t tcps_ecn_sndect0; /* ECN Capable Transport */ uint64_t tcps_ecn_sndect1; /* ECN Capable Transport */ /* * BBR and RACK implement TLPs; these values count TLP bytes in * two categories: bytes that were retransmitted and bytes that * were newly transmitted. Both types can serve as TLPs, but they * are accounted differently. */ uint64_t tcps_tlpresends; /* number of tlp resends */ uint64_t tcps_tlpresend_bytes; /* number of bytes resent by tlp */ uint64_t _pad[4]; /* 4 TBD placeholder for STABLE */ }; #define tcps_rcvmemdrop tcps_rcvreassfull /* compat */ #ifdef _KERNEL #define TI_UNLOCKED 1 #define TI_RLOCKED 2 #include +#include <netinet/in_kdtrace.h> VNET_PCPUSTAT_DECLARE(struct tcpstat, tcpstat); /* tcp statistics */ /* * In-kernel consumers can use these accessor macros directly to update * stats. */ -#define TCPSTAT_ADD(name, val) \ - VNET_PCPUSTAT_ADD(struct tcpstat, tcpstat, name, (val)) +#define TCPSTAT_ADD(name, val) \ + do { \ + MIB_SDT_PROBE1(tcp, count, name, (val)); \ + VNET_PCPUSTAT_ADD(struct tcpstat, tcpstat, name, (val)); \ + } while (0) #define TCPSTAT_INC(name) TCPSTAT_ADD(name, 1) /* * Kernel module consumers must use this accessor macro. */ void kmod_tcpstat_add(int statnum, int val); -#define KMOD_TCPSTAT_ADD(name, val) \ - kmod_tcpstat_add(offsetof(struct tcpstat, name) / sizeof(uint64_t), val) +#define KMOD_TCPSTAT_ADD(name, val) \ + do { \ + MIB_SDT_PROBE1(tcp, count, name, (val)); \ + kmod_tcpstat_add(offsetof(struct tcpstat, name) / \ + sizeof(uint64_t), \ + val); \ + } while (0) #define KMOD_TCPSTAT_INC(name) KMOD_TCPSTAT_ADD(name, 1) /* * Running TCP connection count by state. */ VNET_DECLARE(counter_u64_t, tcps_states[TCP_NSTATES]); #define V_tcps_states VNET(tcps_states) #define TCPSTATES_INC(state) counter_u64_add(V_tcps_states[state], 1) #define TCPSTATES_DEC(state) counter_u64_add(V_tcps_states[state], -1) /* * TCP specific helper hook point identifiers. */ #define HHOOK_TCP_EST_IN 0 #define HHOOK_TCP_EST_OUT 1 #define HHOOK_TCP_LAST HHOOK_TCP_EST_OUT struct tcp_hhook_data { struct tcpcb *tp; struct tcphdr *th; struct tcpopt *to; uint32_t len; int tso; tcp_seq curack; }; #ifdef TCP_HHOOK void hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, uint32_t len, int tso); #endif #endif /* * TCB structure exported to user-land via sysctl(3). * * Fields prefixed with "xt_" are unique to the export structure, and fields * with "t_" or other prefixes match corresponding fields of 'struct tcpcb'. * * Legend: * (s) - used by userland utilities in src * (p) - used by utilities in ports * (3) - is known to be used by third party software not in ports * (n) - no known usage * * Evil hack: declare only if in_pcb.h and sys/socketvar.h have been * included. Not all of our clients do.
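 * A consumer that wants struct xtcpcb must therefore include
 * netinet/in_pcb.h and sys/socketvar.h before this header; otherwise
 * the declaration below is silently omitted.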
*/ #if defined(_NETINET_IN_PCB_H_) && defined(_SYS_SOCKETVAR_H_) struct xtcpcb { ksize_t xt_len; /* length of this structure */ struct xinpcb xt_inp; char xt_stack[TCP_FUNCTION_NAME_LEN_MAX]; /* (s) */ char xt_logid[TCP_LOG_ID_LEN]; /* (s) */ char xt_cc[TCP_CA_NAME_MAX]; /* (s) */ int64_t spare64[6]; int32_t t_state; /* (s,p) */ uint32_t t_flags; /* (s,p) */ int32_t t_sndzerowin; /* (s) */ int32_t t_sndrexmitpack; /* (s) */ int32_t t_rcvoopack; /* (s) */ int32_t t_rcvtime; /* (s) */ int32_t tt_rexmt; /* (s) */ int32_t tt_persist; /* (s) */ int32_t tt_keep; /* (s) */ int32_t tt_2msl; /* (s) */ int32_t tt_delack; /* (s) */ int32_t t_logstate; /* (3) */ uint32_t t_snd_cwnd; /* (s) */ uint32_t t_snd_ssthresh; /* (s) */ uint32_t t_maxseg; /* (s) */ uint32_t t_rcv_wnd; /* (s) */ uint32_t t_snd_wnd; /* (s) */ uint32_t xt_ecn; /* (s) */ uint32_t t_dsack_bytes; /* (n) */ uint32_t t_dsack_tlp_bytes; /* (n) */ uint32_t t_dsack_pack; /* (n) */ uint16_t xt_encaps_port; /* (s) */ int16_t spare16; int32_t spare32[22]; } __aligned(8); #ifdef _KERNEL void tcp_inptoxtp(const struct inpcb *, struct xtcpcb *); #endif #endif /* * TCP function information (name-to-id mapping, aliases, and refcnt) * exported to user-land via sysctl(3). */ struct tcp_function_info { uint32_t tfi_refcnt; uint8_t tfi_id; char tfi_name[TCP_FUNCTION_NAME_LEN_MAX]; char tfi_alias[TCP_FUNCTION_NAME_LEN_MAX]; }; /* * Identifiers for TCP sysctl nodes */ #define TCPCTL_DO_RFC1323 1 /* use RFC-1323 extensions */ #define TCPCTL_MSSDFLT 3 /* MSS default */ #define TCPCTL_STATS 4 /* statistics */ #define TCPCTL_RTTDFLT 5 /* default RTT estimate */ #define TCPCTL_KEEPIDLE 6 /* keepalive idle timer */ #define TCPCTL_KEEPINTVL 7 /* interval to send keepalives */ #define TCPCTL_SENDSPACE 8 /* send buffer space */ #define TCPCTL_RECVSPACE 9 /* receive buffer space */ #define TCPCTL_KEEPINIT 10 /* timeout for establishing syn */ #define TCPCTL_PCBLIST 11 /* list of all outstanding PCBs */ #define TCPCTL_DELACKTIME 12 /* time before sending delayed ACK */ #define TCPCTL_V6MSSDFLT 13 /* MSS default for IPv6 */ #define TCPCTL_SACK 14 /* Selective Acknowledgement, RFC 2018 */ #define TCPCTL_DROP 15 /* drop tcp connection */ #define TCPCTL_STATES 16 /* connection counts by TCP state */ #ifdef _KERNEL #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet_tcp); SYSCTL_DECL(_net_inet_tcp_sack); MALLOC_DECLARE(M_TCPLOG); #endif VNET_DECLARE(int, tcp_log_in_vain); #define V_tcp_log_in_vain VNET(tcp_log_in_vain) /* * Global TCP tunables shared between different stacks. * Please keep the list sorted.
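 *
 * Each tunable below is declared with VNET_DECLARE() and read through a
 * V_-prefixed accessor wrapping VNET(), so every network stack instance
 * gets its own copy, e.g. V_tcp_mssdflt for tcp_mssdflt.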
*/ VNET_DECLARE(int, drop_synfin); VNET_DECLARE(int, path_mtu_discovery); VNET_DECLARE(int, tcp_abc_l_var); VNET_DECLARE(int, tcp_autorcvbuf_max); VNET_DECLARE(int, tcp_autosndbuf_inc); VNET_DECLARE(int, tcp_autosndbuf_max); VNET_DECLARE(int, tcp_delack_enabled); VNET_DECLARE(int, tcp_do_autorcvbuf); VNET_DECLARE(int, tcp_do_autosndbuf); VNET_DECLARE(int, tcp_do_ecn); VNET_DECLARE(int, tcp_do_lrd); VNET_DECLARE(int, tcp_do_prr); VNET_DECLARE(int, tcp_do_prr_conservative); VNET_DECLARE(int, tcp_do_newcwv); VNET_DECLARE(int, tcp_do_rfc1323); VNET_DECLARE(int, tcp_tolerate_missing_ts); VNET_DECLARE(int, tcp_do_rfc3042); VNET_DECLARE(int, tcp_do_rfc3390); VNET_DECLARE(int, tcp_do_rfc3465); VNET_DECLARE(int, tcp_do_newsack); VNET_DECLARE(int, tcp_do_sack); VNET_DECLARE(int, tcp_do_tso); VNET_DECLARE(int, tcp_ecn_maxretries); VNET_DECLARE(int, tcp_initcwnd_segments); VNET_DECLARE(int, tcp_insecure_rst); VNET_DECLARE(int, tcp_insecure_syn); VNET_DECLARE(uint32_t, tcp_map_entries_limit); VNET_DECLARE(uint32_t, tcp_map_split_limit); VNET_DECLARE(int, tcp_minmss); VNET_DECLARE(int, tcp_mssdflt); #ifdef STATS VNET_DECLARE(int, tcp_perconn_stats_dflt_tpl); VNET_DECLARE(int, tcp_perconn_stats_enable); #endif /* STATS */ VNET_DECLARE(int, tcp_recvspace); VNET_DECLARE(int, tcp_retries); VNET_DECLARE(int, tcp_sack_globalholes); VNET_DECLARE(int, tcp_sack_globalmaxholes); VNET_DECLARE(int, tcp_sack_maxholes); VNET_DECLARE(int, tcp_sc_rst_sock_fail); VNET_DECLARE(int, tcp_sendspace); VNET_DECLARE(int, tcp_udp_tunneling_overhead); VNET_DECLARE(int, tcp_udp_tunneling_port); VNET_DECLARE(struct inpcbinfo, tcbinfo); #define V_tcp_do_lrd VNET(tcp_do_lrd) #define V_tcp_do_prr VNET(tcp_do_prr) #define V_tcp_do_newcwv VNET(tcp_do_newcwv) #define V_drop_synfin VNET(drop_synfin) #define V_path_mtu_discovery VNET(path_mtu_discovery) #define V_tcbinfo VNET(tcbinfo) #define V_tcp_abc_l_var VNET(tcp_abc_l_var) #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) #define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc) #define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max) #define V_tcp_delack_enabled VNET(tcp_delack_enabled) #define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) #define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf) #define V_tcp_do_ecn VNET(tcp_do_ecn) #define V_tcp_do_rfc1323 VNET(tcp_do_rfc1323) #define V_tcp_tolerate_missing_ts VNET(tcp_tolerate_missing_ts) #define V_tcp_ts_offset_per_conn VNET(tcp_ts_offset_per_conn) #define V_tcp_do_rfc3042 VNET(tcp_do_rfc3042) #define V_tcp_do_rfc3390 VNET(tcp_do_rfc3390) #define V_tcp_do_rfc3465 VNET(tcp_do_rfc3465) #define V_tcp_do_newsack VNET(tcp_do_newsack) #define V_tcp_do_sack VNET(tcp_do_sack) #define V_tcp_do_tso VNET(tcp_do_tso) #define V_tcp_ecn_maxretries VNET(tcp_ecn_maxretries) #define V_tcp_initcwnd_segments VNET(tcp_initcwnd_segments) #define V_tcp_insecure_rst VNET(tcp_insecure_rst) #define V_tcp_insecure_syn VNET(tcp_insecure_syn) #define V_tcp_map_entries_limit VNET(tcp_map_entries_limit) #define V_tcp_map_split_limit VNET(tcp_map_split_limit) #define V_tcp_minmss VNET(tcp_minmss) #define V_tcp_mssdflt VNET(tcp_mssdflt) #ifdef STATS #define V_tcp_perconn_stats_dflt_tpl VNET(tcp_perconn_stats_dflt_tpl) #define V_tcp_perconn_stats_enable VNET(tcp_perconn_stats_enable) #endif /* STATS */ #define V_tcp_recvspace VNET(tcp_recvspace) #define V_tcp_retries VNET(tcp_retries) #define V_tcp_sack_globalholes VNET(tcp_sack_globalholes) #define V_tcp_sack_globalmaxholes VNET(tcp_sack_globalmaxholes) #define V_tcp_sack_maxholes VNET(tcp_sack_maxholes) #define 
V_tcp_sc_rst_sock_fail VNET(tcp_sc_rst_sock_fail) #define V_tcp_sendspace VNET(tcp_sendspace) #define V_tcp_udp_tunneling_overhead VNET(tcp_udp_tunneling_overhead) #define V_tcp_udp_tunneling_port VNET(tcp_udp_tunneling_port) #ifdef TCP_HHOOK VNET_DECLARE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST + 1]); #define V_tcp_hhh VNET(tcp_hhh) #endif void tcp_account_for_send(struct tcpcb *, uint32_t, uint8_t, uint8_t, bool); int tcp_addoptions(struct tcpopt *, u_char *); struct tcpcb * tcp_close(struct tcpcb *); void tcp_discardcb(struct tcpcb *); void tcp_twstart(struct tcpcb *); int tcp_ctloutput(struct socket *, struct sockopt *); void tcp_fini(void *); char *tcp_log_addrs(struct in_conninfo *, struct tcphdr *, const void *, const void *); char *tcp_log_vain(struct in_conninfo *, struct tcphdr *, const void *, const void *); int tcp_reass(struct tcpcb *, struct tcphdr *, tcp_seq *, int *, struct mbuf *); void tcp_reass_global_init(void); void tcp_reass_flush(struct tcpcb *); void tcp_dooptions(struct tcpopt *, u_char *, int, int); void tcp_dropwithreset(struct mbuf *, struct tcphdr *, struct tcpcb *, int, int); void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); void tcp_xmit_timer(struct tcpcb *, int); void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); void cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t nsegs, uint16_t type); void cc_conn_init(struct tcpcb *tp); void cc_post_recovery(struct tcpcb *tp, struct tcphdr *th); void cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos); void cc_ecnpkt_handler_flags(struct tcpcb *tp, uint16_t flags, uint8_t iptos); void cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type); #ifdef TCP_HHOOK void hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to); #endif int tcp_input(struct mbuf **, int *, int); int tcp_autorcvbuf(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int); int tcp_input_with_port(struct mbuf **, int *, int, uint16_t); void tcp_do_segment(struct tcpcb *, struct mbuf *, struct tcphdr *, int, int, uint8_t); int register_tcp_functions(struct tcp_function_block *blk, int wait); int register_tcp_functions_as_names(struct tcp_function_block *blk, int wait, const char *names[], int *num_names); int register_tcp_functions_as_name(struct tcp_function_block *blk, const char *name, int wait); int deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce, bool force); struct tcp_function_block *find_and_ref_tcp_functions(struct tcp_function_set *fs); int find_tcp_function_alias(struct tcp_function_block *blk, struct tcp_function_set *fs); uint32_t tcp_get_srtt(struct tcpcb *tp, int granularity); void tcp_switch_back_to_default(struct tcpcb *tp); struct tcp_function_block * find_and_ref_tcp_fb(struct tcp_function_block *fs); int tcp_default_ctloutput(struct tcpcb *tp, struct sockopt *sopt); int tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt); void tcp_log_socket_option(struct tcpcb *tp, uint32_t option_num, uint32_t option_val, int err); extern counter_u64_t tcp_inp_lro_direct_queue; extern counter_u64_t tcp_inp_lro_wokeup_queue; extern counter_u64_t tcp_inp_lro_compressed; extern counter_u64_t tcp_inp_lro_locks_taken; extern counter_u64_t tcp_extra_mbuf; extern counter_u64_t tcp_would_have_but; extern counter_u64_t tcp_comp_total; extern counter_u64_t tcp_uncomp_total; extern counter_u64_t tcp_bad_csums; #ifdef TCP_SAD_DETECTION /* Various SACK attack thresholds */ extern int32_t tcp_force_detection; 
extern int32_t tcp_sad_limit; extern int32_t tcp_sack_to_ack_thresh; extern int32_t tcp_sack_to_move_thresh; extern int32_t tcp_restoral_thresh; extern int32_t tcp_sad_decay_val; extern int32_t tcp_sad_pacing_interval; extern int32_t tcp_sad_low_pps; extern int32_t tcp_map_minimum; extern int32_t tcp_attack_on_turns_on_logging; #endif extern uint32_t tcp_ack_war_time_window; extern uint32_t tcp_ack_war_cnt; /* * Used by tcp_maxmtu() to communicate interface specific features * and limits at the time of connection setup. */ struct tcp_ifcap { int ifcap; u_int tsomax; u_int tsomaxsegcount; u_int tsomaxsegsize; }; uint32_t tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *); uint32_t tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *); void tcp6_use_min_mtu(struct tcpcb *); u_int tcp_maxseg(const struct tcpcb *); u_int tcp_fixed_maxseg(const struct tcpcb *); void tcp_mss_update(struct tcpcb *, int, int, struct hc_metrics_lite *, struct tcp_ifcap *); void tcp_mss(struct tcpcb *, int); int tcp_mssopt(struct in_conninfo *); struct tcpcb * tcp_newtcpcb(struct inpcb *); int tcp_default_output(struct tcpcb *); void tcp_state_change(struct tcpcb *, int); void tcp_respond(struct tcpcb *, void *, struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, uint16_t); bool tcp_twcheck(struct inpcb *, struct tcpopt *, struct tcphdr *, struct mbuf *, int); void tcp_setpersist(struct tcpcb *); void tcp_record_dsack(struct tcpcb *tp, tcp_seq start, tcp_seq end, int tlp); struct tcptemp * tcpip_maketemplate(struct inpcb *); void tcpip_fillheaders(struct inpcb *, uint16_t, void *, void *); void tcp_timer_activate(struct tcpcb *, tt_which, u_int); bool tcp_timer_active(struct tcpcb *, tt_which); void tcp_timer_stop(struct tcpcb *); int inp_to_cpuid(struct inpcb *inp); /* * All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo) */ void tcp_hc_init(void); #ifdef VIMAGE void tcp_hc_destroy(void); #endif void tcp_hc_get(struct in_conninfo *, struct hc_metrics_lite *); uint32_t tcp_hc_getmtu(struct in_conninfo *); void tcp_hc_updatemtu(struct in_conninfo *, uint32_t); void tcp_hc_update(struct in_conninfo *, struct hc_metrics_lite *); void cc_after_idle(struct tcpcb *tp); extern struct protosw tcp_protosw; /* shared for TOE */ extern struct protosw tcp6_protosw; /* shared for TOE */ uint32_t tcp_new_ts_offset(struct in_conninfo *); tcp_seq tcp_new_isn(struct in_conninfo *); sackstatus_t tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq); int tcp_dsack_block_exists(struct tcpcb *); void tcp_update_dsack_list(struct tcpcb *, tcp_seq, tcp_seq); void tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, tcp_seq rcv_lastend); void tcp_clean_dsack_blocks(struct tcpcb *tp); void tcp_clean_sackreport(struct tcpcb *tp); void tcp_sack_adjust(struct tcpcb *tp); struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt); void tcp_do_prr_ack(struct tcpcb *, struct tcphdr *, struct tcpopt *, sackstatus_t, u_int *); void tcp_lost_retransmission(struct tcpcb *, struct tcphdr *); void tcp_sack_partialack(struct tcpcb *, struct tcphdr *, u_int *); void tcp_resend_sackholes(struct tcpcb *tp); void tcp_free_sackholes(struct tcpcb *tp); void tcp_sack_lost_retransmission(struct tcpcb *, struct tcphdr *); int tcp_newreno(struct tcpcb *, struct tcphdr *); int tcp_compute_pipe(struct tcpcb *); uint32_t tcp_compute_initwnd(uint32_t); void tcp_sndbuf_autoscale(struct tcpcb *, struct socket *, uint32_t); int tcp_stats_sample_rollthedice(struct tcpcb *tp, void *seed_bytes, size_t seed_len); int 
tcp_can_enable_pacing(void); int tcp_incr_dgp_pacing_cnt(void); void tcp_dec_dgp_pacing_cnt(void); void tcp_decrement_paced_conn(void); void tcp_change_time_units(struct tcpcb *, int); void tcp_handle_orphaned_packets(struct tcpcb *); struct mbuf * tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen, int32_t seglimit, int32_t segsize, struct sockbuf *sb, bool hw_tls); int tcp_stats_init(void); void tcp_log_end_status(struct tcpcb *tp, uint8_t status); #ifdef TCP_REQUEST_TRK void tcp_req_free_a_slot(struct tcpcb *tp, struct tcp_sendfile_track *ent); struct tcp_sendfile_track * tcp_req_find_a_req_that_is_completed_by(struct tcpcb *tp, tcp_seq th_ack, int *ip); int tcp_req_check_for_comp(struct tcpcb *tp, tcp_seq ack_point); int tcp_req_is_entry_comp(struct tcpcb *tp, struct tcp_sendfile_track *ent, tcp_seq ack_point); struct tcp_sendfile_track * tcp_req_find_req_for_seq(struct tcpcb *tp, tcp_seq seq); void tcp_req_log_req_info(struct tcpcb *tp, struct tcp_sendfile_track *req, uint16_t slot, uint8_t val, uint64_t offset, uint64_t nbytes); uint32_t tcp_estimate_tls_overhead(struct socket *so, uint64_t tls_usr_bytes); void tcp_req_alloc_req(struct tcpcb *tp, union tcp_log_userdata *user, uint64_t ts); struct tcp_sendfile_track * tcp_req_alloc_req_full(struct tcpcb *tp, struct tcp_snd_req *req, uint64_t ts, int rec_dups); #endif #ifdef TCP_ACCOUNTING int tcp_do_ack_accounting(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, uint32_t tiwin, int mss); #endif static inline void tcp_lro_features_off(struct tcpcb *tp) { tp->t_flags2 &= ~(TF2_SUPPORTS_MBUFQ| TF2_MBUF_QUEUE_READY| TF2_DONT_SACK_QUEUE| TF2_MBUF_ACKCMP| TF2_MBUF_L_ACKS); } static inline void tcp_fields_to_host(struct tcphdr *th) { th->th_seq = ntohl(th->th_seq); th->th_ack = ntohl(th->th_ack); th->th_win = ntohs(th->th_win); th->th_urp = ntohs(th->th_urp); } static inline void tcp_fields_to_net(struct tcphdr *th) { th->th_seq = htonl(th->th_seq); th->th_ack = htonl(th->th_ack); th->th_win = htons(th->th_win); th->th_urp = htons(th->th_urp); } #endif /* _KERNEL */ #endif /* _NETINET_TCP_VAR_H_ */ diff --git a/sys/netinet/udp_var.h b/sys/netinet/udp_var.h index a66d76845eb6..0d70bad91df4 100644 --- a/sys/netinet/udp_var.h +++ b/sys/netinet/udp_var.h @@ -1,184 +1,192 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _NETINET_UDP_VAR_H_ #define _NETINET_UDP_VAR_H_ #include #include #include /* * UDP kernel structures and variables. */ struct udpiphdr { struct ipovly ui_i; /* overlaid ip structure */ struct udphdr ui_u; /* udp header */ }; #define ui_x1 ui_i.ih_x1 #define ui_v ui_i.ih_x1[0] #define ui_pr ui_i.ih_pr #define ui_len ui_i.ih_len #define ui_src ui_i.ih_src #define ui_dst ui_i.ih_dst #define ui_sport ui_u.uh_sport #define ui_dport ui_u.uh_dport #define ui_ulen ui_u.uh_ulen #define ui_sum ui_u.uh_sum /* * Identifiers for UDP sysctl nodes. */ #define UDPCTL_CHECKSUM 1 /* checksum UDP packets */ #define UDPCTL_STATS 2 /* statistics (read-only) */ #define UDPCTL_MAXDGRAM 3 /* max datagram size */ #define UDPCTL_RECVSPACE 4 /* default receive buffer space */ #define UDPCTL_PCBLIST 5 /* list of PCBs for UDP sockets */ /* IPsec: ESP in UDP tunneling: */ #define UF_ESPINUDP_NON_IKE 0x00000001 /* w/ non-IKE marker .. */ /* .. per draft-ietf-ipsec-nat-t-ike-0[01], * and draft-ietf-ipsec-udp-encaps-(00/)01.txt */ #define UF_ESPINUDP 0x00000002 /* w/ non-ESP marker. */ struct udpstat { /* input statistics: */ uint64_t udps_ipackets; /* total input packets */ uint64_t udps_hdrops; /* packet shorter than header */ uint64_t udps_badsum; /* checksum error */ uint64_t udps_nosum; /* no checksum */ uint64_t udps_badlen; /* data length larger than packet */ uint64_t udps_noport; /* no socket on port */ uint64_t udps_noportbcast; /* of above, arrived as broadcast */ uint64_t udps_fullsock; /* not delivered, input socket full */ uint64_t udpps_pcbcachemiss; /* input packets missing pcb cache */ uint64_t udpps_pcbhashmiss; /* input packets not for hashed pcb */ /* output statistics: */ uint64_t udps_opackets; /* total output packets */ uint64_t udps_fastout; /* output packets on fast path */ /* of no socket on port, arrived as multicast */ uint64_t udps_noportmcast; uint64_t udps_filtermcast; /* blocked by multicast filter */ }; #ifdef _KERNEL #include #include +#include <netinet/in_kdtrace.h> struct mbuf; typedef bool udp_tun_func_t(struct mbuf *, int, struct inpcb *, const struct sockaddr *, void *); typedef union { struct icmp *icmp; struct ip6ctlparam *ip6cp; } udp_tun_icmp_param_t __attribute__((__transparent_union__)); typedef void udp_tun_icmp_t(udp_tun_icmp_param_t); /* * UDP control block; one per udp. */ struct udpcb { struct inpcb u_inpcb; #define u_start_zero u_tun_func #define u_zero_size (sizeof(struct udpcb) - \ offsetof(struct udpcb, u_start_zero)) udp_tun_func_t *u_tun_func; /* UDP kernel tunneling callback. */ udp_tun_icmp_t *u_icmp_func; /* UDP kernel tunneling icmp callback */ u_int u_flags; /* Generic UDP flags. */ uint16_t u_rxcslen; /* Coverage for incoming datagrams. */ uint16_t u_txcslen; /* Coverage for outgoing datagrams. */ void *u_tun_ctx; /* Tunneling callback context.
*/ }; #define intoudpcb(ip) __containerof((ip), struct udpcb, u_inpcb) #define sotoudpcb(so) (intoudpcb(sotoinpcb(so))) VNET_PCPUSTAT_DECLARE(struct udpstat, udpstat); /* * In-kernel consumers can use these accessor macros directly to update * stats. */ -#define UDPSTAT_ADD(name, val) \ - VNET_PCPUSTAT_ADD(struct udpstat, udpstat, name, (val)) -#define UDPSTAT_INC(name) UDPSTAT_ADD(name, 1) +#define UDPSTAT_ADD(name, val) \ + do { \ + MIB_SDT_PROBE1(udp, count, name, (val)); \ + VNET_PCPUSTAT_ADD(struct udpstat, udpstat, name, (val)); \ + } while (0) +#define UDPSTAT_INC(name) UDPSTAT_ADD(name, 1) /* * Kernel module consumers must use this accessor macro. */ void kmod_udpstat_inc(int statnum); -#define KMOD_UDPSTAT_INC(name) \ - kmod_udpstat_inc(offsetof(struct udpstat, name) / sizeof(uint64_t)) +#define KMOD_UDPSTAT_INC(name) \ + do { \ + MIB_SDT_PROBE1(udp, count, name, 1); \ + kmod_udpstat_inc( \ + offsetof(struct udpstat, name) / sizeof(uint64_t)); \ + } while (0) SYSCTL_DECL(_net_inet_udp); VNET_DECLARE(struct inpcbinfo, udbinfo); VNET_DECLARE(struct inpcbinfo, ulitecbinfo); #define V_udbinfo VNET(udbinfo) #define V_ulitecbinfo VNET(ulitecbinfo) extern u_long udp_sendspace; extern u_long udp_recvspace; VNET_DECLARE(int, udp_cksum); VNET_DECLARE(int, udp_blackhole); VNET_DECLARE(bool, udp_blackhole_local); VNET_DECLARE(int, udp_log_in_vain); #define V_udp_cksum VNET(udp_cksum) #define V_udp_blackhole VNET(udp_blackhole) #define V_udp_blackhole_local VNET(udp_blackhole_local) #define V_udp_log_in_vain VNET(udp_log_in_vain) VNET_DECLARE(int, zero_checksum_port); #define V_zero_checksum_port VNET(zero_checksum_port) static __inline struct inpcbinfo * udp_get_inpcbinfo(int protocol) { return (protocol == IPPROTO_UDP) ? &V_udbinfo : &V_ulitecbinfo; } int udp_ctloutput(struct socket *, struct sockopt *); void udplite_input(struct mbuf *, int); struct inpcb *udp_notify(struct inpcb *inp, int errno); int udp_shutdown(struct socket *, enum shutdown_how); int udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f, udp_tun_icmp_t i, void *ctx); #ifdef _SYS_PROTOSW_H_ pr_abort_t udp_abort; pr_disconnect_t udp_disconnect; pr_send_t udp_send; #endif #endif /* _KERNEL */ #endif /* _NETINET_UDP_VAR_H_ */ diff --git a/sys/netinet6/icmp6.c b/sys/netinet6/icmp6.c index e56ec3bc7afd..b4473a40be6a 100644 --- a/sys/netinet6/icmp6.c +++ b/sys/netinet6/icmp6.c @@ -1,2914 +1,2914 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: icmp6.c,v 1.211 2001/04/04 05:56:20 itojun Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #define MBUF_PRIVATE /* XXXRW: Optimisation tries to avoid M_EXT mbufs */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern ip6proto_ctlinput_t *ip6_ctlprotox[]; VNET_PCPUSTAT_DEFINE(struct icmp6stat, icmp6stat); VNET_PCPUSTAT_SYSINIT(icmp6stat); SYSCTL_VNET_PCPUSTAT(_net_inet6_icmp6, ICMPV6CTL_STATS, stats, struct icmp6stat, icmp6stat, "ICMPv6 statistics (struct icmp6stat, netinet/icmp6.h)"); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(icmp6stat); #endif /* VIMAGE */ VNET_DEFINE_STATIC(int, icmp6_rediraccept) = 1; #define V_icmp6_rediraccept VNET(icmp6_rediraccept) SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRACCEPT, rediraccept, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_rediraccept), 0, "Accept ICMPv6 redirect messages"); VNET_DEFINE_STATIC(int, icmp6_redirtimeout) = 10 * 60; /* 10 minutes */ #define V_icmp6_redirtimeout VNET(icmp6_redirtimeout) SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRTIMEOUT, redirtimeout, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_redirtimeout), 0, "Delay in seconds before expiring redirect route"); VNET_DEFINE_STATIC(int, icmp6_nodeinfo) = 0; #define V_icmp6_nodeinfo VNET(icmp6_nodeinfo) SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_NODEINFO, nodeinfo, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_nodeinfo), 0, "Mask of enabled RFC4620 node information query types"); VNET_DECLARE(struct inpcbinfo, ripcbinfo); #define V_ripcbinfo VNET(ripcbinfo) static void icmp6_errcount(int, int); static int icmp6_rip6_input(struct mbuf **, int); static void icmp6_reflect(struct mbuf *, size_t); static const char *icmp6_redirect_diag(struct in6_addr *, struct in6_addr *, struct in6_addr *); static struct mbuf *ni6_input(struct mbuf *, int, struct prison *); static struct mbuf *ni6_nametodns(const char *, int, int); static int ni6_dnsmatch(const char *, int, const char *, int); static int ni6_addrs(struct icmp6_nodeinfo *, struct mbuf *, struct ifnet **, struct in6_addr *); static int ni6_store_addrs(struct icmp6_nodeinfo *, struct icmp6_nodeinfo *, struct ifnet *, int); static int icmp6_notify_error(struct mbuf **, int, int); /* * Kernel module interface for updating icmp6stat. The argument is an index * into icmp6stat treated as an array of u_quad_t. While this encodes the * general layout of icmp6stat into the caller, it doesn't encode its * location, so that future changes to add, for example, per-CPU stats * support won't cause binary compatibility problems for kernel modules. 
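 *
 * For example, a module incrementing icp6s_badlen would pass
 * offsetof(struct icmp6stat, icp6s_badlen) / sizeof(u_quad_t) as
 * statnum (a sketch of what the KMOD-side accessor macro expands to).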
*/ void kmod_icmp6stat_inc(int statnum) { counter_u64_add(VNET(icmp6stat)[statnum], 1); } static void icmp6_errcount(int type, int code) { switch (type) { case ICMP6_DST_UNREACH: switch (code) { case ICMP6_DST_UNREACH_NOROUTE: ICMP6STAT_INC(icp6s_odst_unreach_noroute); return; case ICMP6_DST_UNREACH_ADMIN: ICMP6STAT_INC(icp6s_odst_unreach_admin); return; case ICMP6_DST_UNREACH_BEYONDSCOPE: ICMP6STAT_INC(icp6s_odst_unreach_beyondscope); return; case ICMP6_DST_UNREACH_ADDR: ICMP6STAT_INC(icp6s_odst_unreach_addr); return; case ICMP6_DST_UNREACH_NOPORT: ICMP6STAT_INC(icp6s_odst_unreach_noport); return; } break; case ICMP6_PACKET_TOO_BIG: ICMP6STAT_INC(icp6s_opacket_too_big); return; case ICMP6_TIME_EXCEEDED: switch (code) { case ICMP6_TIME_EXCEED_TRANSIT: ICMP6STAT_INC(icp6s_otime_exceed_transit); return; case ICMP6_TIME_EXCEED_REASSEMBLY: ICMP6STAT_INC(icp6s_otime_exceed_reassembly); return; } break; case ICMP6_PARAM_PROB: switch (code) { case ICMP6_PARAMPROB_HEADER: ICMP6STAT_INC(icp6s_oparamprob_header); return; case ICMP6_PARAMPROB_NEXTHEADER: ICMP6STAT_INC(icp6s_oparamprob_nextheader); return; case ICMP6_PARAMPROB_OPTION: ICMP6STAT_INC(icp6s_oparamprob_option); return; } break; case ND_REDIRECT: ICMP6STAT_INC(icp6s_oredirect); return; } ICMP6STAT_INC(icp6s_ounknown); } /* * A wrapper function for icmp6_error() necessary when the erroneous packet * may not contain enough scope zone information. */ void icmp6_error2(struct mbuf *m, int type, int code, int param, struct ifnet *ifp) { struct ip6_hdr *ip6; if (ifp == NULL) return; if (m->m_len < sizeof(struct ip6_hdr)) { m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); return; } } ip6 = mtod(m, struct ip6_hdr *); if (in6_setscope(&ip6->ip6_src, ifp, NULL) != 0) return; if (in6_setscope(&ip6->ip6_dst, ifp, NULL) != 0) return; icmp6_error(m, type, code, param); } /* * Generate an error packet of type error in response to bad IP6 packet. */ void icmp6_error(struct mbuf *m, int type, int code, int param) { struct ip6_hdr *oip6, *nip6; struct icmp6_hdr *icmp6; struct epoch_tracker et; u_int preplen; int off; int nxt; ICMP6STAT_INC(icp6s_error); /* count per-type-code statistics */ icmp6_errcount(type, code); #ifdef M_DECRYPTED /*not openbsd*/ if (m->m_flags & M_DECRYPTED) { ICMP6STAT_INC(icp6s_canterror); goto freeit; } #endif if (m->m_len < sizeof(struct ip6_hdr)) { m = m_pullup(m, sizeof(struct ip6_hdr)); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); return; } } oip6 = mtod(m, struct ip6_hdr *); /* * If the destination address of the erroneous packet is a multicast * address, or the packet was sent using link-layer multicast, * we should basically suppress sending an error (RFC 2463, Section * 2.4). * We have two exceptions (the item e.2 in that section): * - the Packet Too Big message can be sent for path MTU discovery. * - the Parameter Problem Message that can be allowed an icmp6 error * in the option type field. This check has been done in * ip6_unknown_opt(), so we can just check the type and code. */ if ((m->m_flags & (M_BCAST|M_MCAST) || IN6_IS_ADDR_MULTICAST(&oip6->ip6_dst)) && (type != ICMP6_PACKET_TOO_BIG && (type != ICMP6_PARAM_PROB || code != ICMP6_PARAMPROB_OPTION))) goto freeit; /* * RFC 2463, 2.4 (e.5): source address check. * XXX: the case of anycast source? */ if (IN6_IS_ADDR_UNSPECIFIED(&oip6->ip6_src) || IN6_IS_ADDR_MULTICAST(&oip6->ip6_src)) goto freeit; /* * If we are about to send ICMPv6 against ICMPv6 error/redirect, * don't do it. 
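 * (Per RFC 2463, Section 2.4, rule (e.1), no ICMPv6 error may be
 * generated in response to an ICMPv6 error message; redirects, though
 * informational, are given the same treatment by the type check below.)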
*/ nxt = -1; off = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt); if (off >= 0 && nxt == IPPROTO_ICMPV6) { struct icmp6_hdr *icp; if (m->m_len < off + sizeof(struct icmp6_hdr)) { m = m_pullup(m, off + sizeof(struct icmp6_hdr)); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); return; } } oip6 = mtod(m, struct ip6_hdr *); icp = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); if (icp->icmp6_type < ICMP6_ECHO_REQUEST || icp->icmp6_type == ND_REDIRECT) { /* * ICMPv6 error * Special case: for redirect (which is * informational) we must not send icmp6 error. */ ICMP6STAT_INC(icp6s_canterror); goto freeit; } else { /* ICMPv6 informational - send the error */ } } else { /* non-ICMPv6 - send the error */ } /* Finally, do rate limitation check. */ if (icmp6_ratelimit(&oip6->ip6_src, type, code)) goto freeit; /* * OK, ICMP6 can be generated. */ if (m->m_pkthdr.len >= ICMPV6_PLD_MAXLEN) m_adj(m, ICMPV6_PLD_MAXLEN - m->m_pkthdr.len); preplen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); M_PREPEND(m, preplen, M_NOWAIT); /* FIB is also copied over. */ if (m == NULL) { nd6log((LOG_DEBUG, "ENOBUFS in icmp6_error %d\n", __LINE__)); return; } nip6 = mtod(m, struct ip6_hdr *); nip6->ip6_src = oip6->ip6_src; nip6->ip6_dst = oip6->ip6_dst; in6_clearscope(&oip6->ip6_src); in6_clearscope(&oip6->ip6_dst); icmp6 = (struct icmp6_hdr *)(nip6 + 1); icmp6->icmp6_type = type; icmp6->icmp6_code = code; icmp6->icmp6_pptr = htonl((u_int32_t)param); - ICMP6STAT_INC(icp6s_outhist[type]); + ICMP6STAT_INC2(icp6s_outhist, type); NET_EPOCH_ENTER(et); icmp6_reflect(m, sizeof(struct ip6_hdr)); /* header order: IPv6 - ICMPv6 */ NET_EPOCH_EXIT(et); return; freeit: /* * If we can't tell whether or not we can generate ICMP6, free it. */ m_freem(m); } int icmp6_errmap(const struct icmp6_hdr *icmp6) { switch (icmp6->icmp6_type) { case ICMP6_DST_UNREACH: switch (icmp6->icmp6_code) { case ICMP6_DST_UNREACH_NOROUTE: case ICMP6_DST_UNREACH_ADDR: return (EHOSTUNREACH); case ICMP6_DST_UNREACH_NOPORT: case ICMP6_DST_UNREACH_ADMIN: return (ECONNREFUSED); case ICMP6_DST_UNREACH_BEYONDSCOPE: return (ENOPROTOOPT); default: return (0); /* Shouldn't happen. */ } case ICMP6_PACKET_TOO_BIG: return (EMSGSIZE); case ICMP6_TIME_EXCEEDED: switch (icmp6->icmp6_code) { case ICMP6_TIME_EXCEED_TRANSIT: return (EHOSTUNREACH); case ICMP6_TIME_EXCEED_REASSEMBLY: return (0); default: return (0); /* Shouldn't happen. */ } case ICMP6_PARAM_PROB: switch (icmp6->icmp6_code) { case ICMP6_PARAMPROB_NEXTHEADER: return (ECONNREFUSED); case ICMP6_PARAMPROB_HEADER: case ICMP6_PARAMPROB_OPTION: return (ENOPROTOOPT); default: return (0); /* Shouldn't happen. */ } default: return (0); } } /* * Process a received ICMP6 message. */ int icmp6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m, *n; struct ifnet *ifp; struct ip6_hdr *ip6, *nip6; struct icmp6_hdr *icmp6, *nicmp6; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; int code, error, icmp6len, ip6len, noff, off, sum; NET_EPOCH_ASSERT(); m = *mp; off = *offp; if (m->m_len < off + sizeof(struct icmp6_hdr)) { m = m_pullup(m, off + sizeof(struct icmp6_hdr)); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); *mp = m; return (IPPROTO_DONE); } } /* * Locate icmp6 structure in mbuf, and check * that not corrupted and of at least minimum length */ icmp6len = m->m_pkthdr.len - off; if (icmp6len < sizeof(struct icmp6_hdr)) { ICMP6STAT_INC(icp6s_tooshort); goto freeit; } ip6 = mtod(m, struct ip6_hdr *); ifp = m->m_pkthdr.rcvif; /* * Check multicast group membership. 
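 * A packet addressed to a multicast group that the receiving interface
 * has not joined (in6m_lookup() returns NULL) is discarded below.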
* Note: SSM filters are not applied for ICMPv6 traffic. */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { struct in6_multi *inm; inm = in6m_lookup(ifp, &ip6->ip6_dst); if (inm == NULL) { IP6STAT_INC(ip6s_notmember); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); goto freeit; } } /* Calculate the checksum. */ icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off); code = icmp6->icmp6_code; if ((sum = in6_cksum(m, IPPROTO_ICMPV6, off, icmp6len)) != 0) { nd6log((LOG_ERR, "ICMP6 checksum error(%d|%x) %s\n", icmp6->icmp6_type, sum, ip6_sprintf(ip6bufs, &ip6->ip6_src))); ICMP6STAT_INC(icp6s_checksum); goto freeit; } - ICMP6STAT_INC(icp6s_inhist[icmp6->icmp6_type]); + ICMP6STAT_INC2(icp6s_inhist, icmp6->icmp6_type); icmp6_ifstat_inc(ifp, ifs6_in_msg); if (icmp6->icmp6_type < ICMP6_INFOMSG_MASK) icmp6_ifstat_inc(ifp, ifs6_in_error); ip6len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen); switch (icmp6->icmp6_type) { case ICMP6_DST_UNREACH: icmp6_ifstat_inc(ifp, ifs6_in_dstunreach); switch (code) { case ICMP6_DST_UNREACH_ADMIN: icmp6_ifstat_inc(ifp, ifs6_in_adminprohib); case ICMP6_DST_UNREACH_NOROUTE: case ICMP6_DST_UNREACH_ADDR: case ICMP6_DST_UNREACH_BEYONDSCOPE: case ICMP6_DST_UNREACH_NOPORT: goto deliver; default: goto badcode; } case ICMP6_PACKET_TOO_BIG: icmp6_ifstat_inc(ifp, ifs6_in_pkttoobig); /* * Validation is made in icmp6_mtudisc_update. * Updating the path MTU will be done after examining * intermediate extension headers. */ goto deliver; case ICMP6_TIME_EXCEEDED: icmp6_ifstat_inc(ifp, ifs6_in_timeexceed); switch (code) { case ICMP6_TIME_EXCEED_TRANSIT: case ICMP6_TIME_EXCEED_REASSEMBLY: goto deliver; default: goto badcode; } case ICMP6_PARAM_PROB: icmp6_ifstat_inc(ifp, ifs6_in_paramprob); switch (code) { case ICMP6_PARAMPROB_NEXTHEADER: case ICMP6_PARAMPROB_HEADER: case ICMP6_PARAMPROB_OPTION: goto deliver; default: goto badcode; } case ICMP6_ECHO_REQUEST: icmp6_ifstat_inc(ifp, ifs6_in_echo); if (code != 0) goto badcode; if (icmp6_ratelimit(&ip6->ip6_src, ICMP6_ECHO_REPLY, 0)) break; if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL) { /* Give up remote */ break; } if (!M_WRITABLE(n) || n->m_len < off + sizeof(struct icmp6_hdr)) { struct mbuf *n0 = n; int n0len; CTASSERT(sizeof(*nip6) + sizeof(*nicmp6) <= MHLEN); n = m_gethdr(M_NOWAIT, n0->m_type); if (n == NULL) { /* Give up remote */ m_freem(n0); break; } m_move_pkthdr(n, n0); /* FIB copied. */ n0len = n0->m_pkthdr.len; /* save for use below */ /* * Copy IPv6 and ICMPv6 only. */ nip6 = mtod(n, struct ip6_hdr *); bcopy(ip6, nip6, sizeof(struct ip6_hdr)); nicmp6 = (struct icmp6_hdr *)(nip6 + 1); bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr)); noff = sizeof(struct ip6_hdr); /* new mbuf contains only ipv6+icmpv6 headers */ n->m_len = noff + sizeof(struct icmp6_hdr); /* * Adjust mbuf. ip6_plen will be adjusted in * ip6_output(). 
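 * For instance, when the echo request carries no extension headers,
 * off == noff == sizeof(struct ip6_hdr), so the recalculated
 * n->m_pkthdr.len below is simply the original length n0len.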
*/ m_adj(n0, off + sizeof(struct icmp6_hdr)); /* recalculate complete packet size */ n->m_pkthdr.len = n0len + (noff - off); n->m_next = n0; } else { if (n->m_len < off + sizeof(*nicmp6)) { n = m_pullup(n, off + sizeof(*nicmp6)); if (n == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); break; } } nicmp6 = (struct icmp6_hdr *)(mtod(n, caddr_t) + off); noff = off; } if (n) { nicmp6->icmp6_type = ICMP6_ECHO_REPLY; nicmp6->icmp6_code = 0; ICMP6STAT_INC(icp6s_reflect); - ICMP6STAT_INC(icp6s_outhist[ICMP6_ECHO_REPLY]); + ICMP6STAT_INC2(icp6s_outhist, ICMP6_ECHO_REPLY); icmp6_reflect(n, noff); } break; case ICMP6_ECHO_REPLY: icmp6_ifstat_inc(ifp, ifs6_in_echoreply); if (code != 0) goto badcode; break; case MLD_LISTENER_QUERY: case MLD_LISTENER_REPORT: case MLD_LISTENER_DONE: case MLDV2_LISTENER_REPORT: /* * Drop MLD traffic which is not link-local, has a hop limit * of greater than 1 hop, or which does not have the * IPv6 HBH Router Alert option. * As IPv6 HBH options are stripped in ip6_input() we must * check an mbuf header flag. * XXX Should we also sanity check that these messages * were directed to a link-local multicast prefix? */ if ((ip6->ip6_hlim != 1) || (m->m_flags & M_RTALERT_MLD) == 0) goto freeit; if (mld_input(&m, off, icmp6len) != 0) { *mp = NULL; return (IPPROTO_DONE); } /* m stays. */ break; case ICMP6_WRUREQUEST: /* ICMP6_FQDN_QUERY */ { enum { WRU, FQDN } mode; struct prison *pr; if (!V_icmp6_nodeinfo) break; if (icmp6len == sizeof(struct icmp6_hdr) + 4) mode = WRU; else if (icmp6len >= sizeof(struct icmp6_nodeinfo)) mode = FQDN; else goto badlen; pr = NULL; sx_slock(&allprison_lock); TAILQ_FOREACH(pr, &allprison, pr_list) if (pr->pr_vnet == ifp->if_vnet) break; sx_sunlock(&allprison_lock); if (pr == NULL) pr = curthread->td_ucred->cr_prison; if (mode == FQDN) { if (m->m_len < off + sizeof(struct icmp6_nodeinfo)) { m = m_pullup(m, off + sizeof(struct icmp6_nodeinfo)); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); *mp = m; return (IPPROTO_DONE); } } n = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (n) n = ni6_input(n, off, pr); /* XXX meaningless if n == NULL */ noff = sizeof(struct ip6_hdr); } else { u_char *p; int maxhlen, hlen; /* * XXX: this combination of flags is pointless, * but should we keep this for compatibility? */ if ((V_icmp6_nodeinfo & (ICMP6_NODEINFO_FQDNOK | ICMP6_NODEINFO_TMPADDROK)) != (ICMP6_NODEINFO_FQDNOK | ICMP6_NODEINFO_TMPADDROK)) break; if (code != 0) goto badcode; CTASSERT(sizeof(*nip6) + sizeof(*nicmp6) + 4 <= MHLEN); n = m_gethdr(M_NOWAIT, m->m_type); if (n == NULL) { /* Give up remote */ break; } if (!m_dup_pkthdr(n, m, M_NOWAIT)) { /* * Previous code did a blind M_COPY_PKTHDR * and said "just for rcvif". If true, then * we could tolerate the dup failing (due to * the deep copy of the tag chain). For now * be conservative and just fail. */ m_free(n); n = NULL; break; } /* * Copy IPv6 and ICMPv6 only. 
*/ nip6 = mtod(n, struct ip6_hdr *); bcopy(ip6, nip6, sizeof(struct ip6_hdr)); nicmp6 = (struct icmp6_hdr *)(nip6 + 1); bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr)); p = (u_char *)(nicmp6 + 1); bzero(p, 4); maxhlen = M_TRAILINGSPACE(n) - (sizeof(*nip6) + sizeof(*nicmp6) + 4); mtx_lock(&pr->pr_mtx); hlen = strlen(pr->pr_hostname); if (maxhlen > hlen) maxhlen = hlen; /* meaningless TTL */ bcopy(pr->pr_hostname, p + 4, maxhlen); mtx_unlock(&pr->pr_mtx); noff = sizeof(struct ip6_hdr); n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) + 4 + maxhlen; nicmp6->icmp6_type = ICMP6_WRUREPLY; nicmp6->icmp6_code = 0; } if (n) { ICMP6STAT_INC(icp6s_reflect); - ICMP6STAT_INC(icp6s_outhist[ICMP6_WRUREPLY]); + ICMP6STAT_INC2(icp6s_outhist, ICMP6_WRUREPLY); icmp6_reflect(n, noff); } break; } case ICMP6_WRUREPLY: if (code != 0) goto badcode; break; case ND_ROUTER_SOLICIT: icmp6_ifstat_inc(ifp, ifs6_in_routersolicit); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_router_solicit)) goto badlen; if (send_sendso_input_hook != NULL) { if (m->m_len < off + icmp6len) { m = m_pullup(m, off + icmp6len); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); *mp = NULL; return (IPPROTO_DONE); } } error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) { m = NULL; goto freeit; } } n = m_copym(m, 0, M_COPYALL, M_NOWAIT); nd6_rs_input(m, off, icmp6len); m = n; if (m == NULL) goto freeit; break; case ND_ROUTER_ADVERT: icmp6_ifstat_inc(ifp, ifs6_in_routeradvert); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_router_advert)) goto badlen; if (send_sendso_input_hook != NULL) { error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) { m = NULL; goto freeit; } } n = m_copym(m, 0, M_COPYALL, M_NOWAIT); nd6_ra_input(m, off, icmp6len); m = n; if (m == NULL) goto freeit; break; case ND_NEIGHBOR_SOLICIT: icmp6_ifstat_inc(ifp, ifs6_in_neighborsolicit); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_neighbor_solicit)) goto badlen; if (send_sendso_input_hook != NULL) { error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) { m = NULL; goto freeit; } } n = m_copym(m, 0, M_COPYALL, M_NOWAIT); nd6_ns_input(m, off, icmp6len); m = n; if (m == NULL) goto freeit; break; case ND_NEIGHBOR_ADVERT: icmp6_ifstat_inc(ifp, ifs6_in_neighboradvert); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_neighbor_advert)) goto badlen; if (send_sendso_input_hook != NULL) { error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) { m = NULL; goto freeit; } } n = m_copym(m, 0, M_COPYALL, M_NOWAIT); nd6_na_input(m, off, icmp6len); m = n; if (m == NULL) goto freeit; break; case ND_REDIRECT: icmp6_ifstat_inc(ifp, ifs6_in_redirect); if (code != 0) goto badcode; if (icmp6len < sizeof(struct nd_redirect)) goto badlen; if (send_sendso_input_hook != NULL) { error = send_sendso_input_hook(m, ifp, SND_IN, ip6len); if (error == 0) { m = NULL; goto freeit; } } n = m_copym(m, 0, M_COPYALL, M_NOWAIT); icmp6_redirect_input(m, off); m = n; if (m == NULL) goto freeit; break; case ICMP6_ROUTER_RENUMBERING: if (code != ICMP6_ROUTER_RENUMBERING_COMMAND && code != ICMP6_ROUTER_RENUMBERING_RESULT) goto badcode; if (icmp6len < sizeof(struct icmp6_router_renum)) goto badlen; break; default: nd6log((LOG_DEBUG, "icmp6_input: unknown type %d(src=%s, dst=%s, ifid=%d)\n", icmp6->icmp6_type, ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), ifp ? 
ifp->if_index : 0)); if (icmp6->icmp6_type < ICMP6_ECHO_REQUEST) { /* ICMPv6 error: MUST deliver it by spec... */ goto deliver; } else { /* ICMPv6 informational: MUST not deliver */ break; } deliver: if (icmp6_notify_error(&m, off, icmp6len) != 0) { /* In this case, m should've been freed. */ *mp = NULL; return (IPPROTO_DONE); } break; badcode: ICMP6STAT_INC(icp6s_badcode); break; badlen: ICMP6STAT_INC(icp6s_badlen); break; } /* deliver the packet to appropriate sockets */ icmp6_rip6_input(&m, *offp); *mp = m; return (IPPROTO_DONE); freeit: m_freem(m); *mp = NULL; return (IPPROTO_DONE); } static int icmp6_notify_error(struct mbuf **mp, int off, int icmp6len) { struct mbuf *m; struct icmp6_hdr *icmp6; struct ip6_hdr *eip6; u_int32_t notifymtu; struct sockaddr_in6 icmp6src, icmp6dst; m = *mp; if (icmp6len < sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr)) { ICMP6STAT_INC(icp6s_tooshort); goto freeit; } if (m->m_len < off + sizeof(*icmp6) + sizeof(struct ip6_hdr)) { m = m_pullup(m, off + sizeof(*icmp6) + sizeof(struct ip6_hdr)); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); *mp = m; return (-1); } } icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); eip6 = (struct ip6_hdr *)(icmp6 + 1); bzero(&icmp6dst, sizeof(icmp6dst)); /* Detect the upper level protocol */ { u_int8_t nxt = eip6->ip6_nxt; int eoff = off + sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr); struct ip6ctlparam ip6cp; int icmp6type = icmp6->icmp6_type; struct ip6_frag *fh; struct ip6_rthdr *rth; struct ip6_rthdr0 *rth0; int rthlen; while (1) { /* XXX: should avoid infinite loop explicitly? */ struct ip6_ext *eh; switch (nxt) { case IPPROTO_HOPOPTS: case IPPROTO_DSTOPTS: case IPPROTO_AH: if (m->m_len < eoff + sizeof(struct ip6_ext)) { m = m_pullup(m, eoff + sizeof(struct ip6_ext)); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); *mp = m; return (-1); } } eh = (struct ip6_ext *) (mtod(m, caddr_t) + eoff); if (nxt == IPPROTO_AH) eoff += (eh->ip6e_len + 2) << 2; else eoff += (eh->ip6e_len + 1) << 3; nxt = eh->ip6e_nxt; break; case IPPROTO_ROUTING: /* * When the erroneous packet contains a * routing header, we should examine the * header to determine the final destination. * Otherwise, we can't properly update * information that depends on the final * destination (e.g. path MTU). */ if (m->m_len < eoff + sizeof(*rth)) { m = m_pullup(m, eoff + sizeof(*rth)); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); *mp = m; return (-1); } } rth = (struct ip6_rthdr *) (mtod(m, caddr_t) + eoff); rthlen = (rth->ip6r_len + 1) << 3; /* * XXX: currently there is no * officially defined type other * than type-0. * Note that if the segment left field * is 0, all intermediate hops must * have been passed. 
*/ if (rth->ip6r_segleft && rth->ip6r_type == IPV6_RTHDR_TYPE_0) { int hops; if (m->m_len < eoff + rthlen) { m = m_pullup(m, eoff + rthlen); if (m == NULL) { IP6STAT_INC( ip6s_exthdrtoolong); *mp = m; return (-1); } } rth0 = (struct ip6_rthdr0 *) (mtod(m, caddr_t) + eoff); /* just ignore a bogus header */ if ((rth0->ip6r0_len % 2) == 0 && (hops = rth0->ip6r0_len/2)) icmp6dst.sin6_addr = *((struct in6_addr *)(rth0 + 1) + (hops - 1)); } eoff += rthlen; nxt = rth->ip6r_nxt; break; case IPPROTO_FRAGMENT: if (m->m_len < eoff + sizeof(struct ip6_frag)) { m = m_pullup(m, eoff + sizeof(struct ip6_frag)); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); *mp = m; return (-1); } } fh = (struct ip6_frag *)(mtod(m, caddr_t) + eoff); /* * Data after a fragment header is meaningless * unless it is the first fragment, but * we'll go to the notify label for path MTU * discovery. */ if (fh->ip6f_offlg & IP6F_OFF_MASK) goto notify; eoff += sizeof(struct ip6_frag); nxt = fh->ip6f_nxt; break; default: /* * This case includes ESP and the No Next * Header. In such cases going to the notify * label does not have any meaning * (i.e. ctlfunc will be NULL), but we go * anyway since we might have to update * path MTU information. */ goto notify; } } notify: icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); /* * retrieve parameters from the inner IPv6 header, and convert * them into sockaddr structures. * XXX: there is no guarantee that the source or destination * addresses of the inner packet are in the same scope as * the addresses of the icmp packet. But there is no other * way to determine the zone. */ eip6 = (struct ip6_hdr *)(icmp6 + 1); /* * Protocol layers can't do anything useful with unspecified * addresses. */ if (IN6_IS_ADDR_UNSPECIFIED(&eip6->ip6_src) || IN6_IS_ADDR_UNSPECIFIED(&eip6->ip6_dst)) goto freeit; icmp6dst.sin6_len = sizeof(struct sockaddr_in6); icmp6dst.sin6_family = AF_INET6; if (IN6_IS_ADDR_UNSPECIFIED(&icmp6dst.sin6_addr)) icmp6dst.sin6_addr = eip6->ip6_dst; if (in6_setscope(&icmp6dst.sin6_addr, m->m_pkthdr.rcvif, NULL)) goto freeit; bzero(&icmp6src, sizeof(icmp6src)); icmp6src.sin6_len = sizeof(struct sockaddr_in6); icmp6src.sin6_family = AF_INET6; icmp6src.sin6_addr = eip6->ip6_src; if (in6_setscope(&icmp6src.sin6_addr, m->m_pkthdr.rcvif, NULL)) goto freeit; icmp6src.sin6_flowinfo = (eip6->ip6_flow & IPV6_FLOWLABEL_MASK); ip6cp.ip6c_m = m; ip6cp.ip6c_icmp6 = icmp6; ip6cp.ip6c_ip6 = (struct ip6_hdr *)(icmp6 + 1); ip6cp.ip6c_off = eoff; ip6cp.ip6c_finaldst = &icmp6dst; ip6cp.ip6c_src = &icmp6src; ip6cp.ip6c_nxt = nxt; if (icmp6type == ICMP6_PACKET_TOO_BIG) { notifymtu = ntohl(icmp6->icmp6_mtu); ip6cp.ip6c_cmdarg = (void *)&notifymtu; icmp6_mtudisc_update(&ip6cp, 1); /*XXX*/ } if (ip6_ctlprotox[nxt] != NULL) ip6_ctlprotox[nxt](&ip6cp); } *mp = m; return (0); freeit: m_freem(m); *mp = NULL; return (-1); } void icmp6_mtudisc_update(struct ip6ctlparam *ip6cp, int validated) { struct in6_addr *dst = &ip6cp->ip6c_finaldst->sin6_addr; struct icmp6_hdr *icmp6 = ip6cp->ip6c_icmp6; struct mbuf *m = ip6cp->ip6c_m; /* will be necessary for scope issue */ u_int mtu = ntohl(icmp6->icmp6_mtu); struct in_conninfo inc; uint32_t max_mtu; #if 0 /* * RFC2460 section 5, last paragraph. * even though minimum link MTU for IPv6 is IPV6_MMTU, * we may see ICMPv6 too big with mtu < IPV6_MMTU * due to packet translator in the middle. * see ip6_output() and ip6_getpmtu() "alwaysfrag" case for * special handling. */ if (mtu < IPV6_MMTU) return; #endif /* * we reject ICMPv6 too big with abnormally small value.
* XXX what is the good definition of "abnormally small"? */ if (mtu < sizeof(struct ip6_hdr) + sizeof(struct ip6_frag) + 8) return; if (!validated) return; /* * In case the suggested mtu is less than IPV6_MMTU, we * only need to remember that it was for above mentioned * "alwaysfrag" case. * Try to be as close to the spec as possible. */ if (mtu < IPV6_MMTU) mtu = IPV6_MMTU - 8; bzero(&inc, sizeof(inc)); inc.inc_fibnum = M_GETFIB(m); inc.inc_flags |= INC_ISIPV6; inc.inc6_faddr = *dst; if (in6_setscope(&inc.inc6_faddr, m->m_pkthdr.rcvif, NULL)) return; max_mtu = tcp_hc_getmtu(&inc); if (max_mtu == 0) max_mtu = tcp_maxmtu6(&inc, NULL); if (mtu < max_mtu) { tcp_hc_updatemtu(&inc, mtu); ICMP6STAT_INC(icp6s_pmtuchg); } } /* * Process a Node Information Query packet, based on * draft-ietf-ipngwg-icmp-name-lookups-07. * * Spec incompatibilities: * - IPv6 Subject address handling * - IPv4 Subject address handling support missing * - Proxy reply (answer even if it's not for me) * - joins NI group address at in6_ifattach() time only, does not cope * with hostname changes by sethostname(3) */ static struct mbuf * ni6_input(struct mbuf *m, int off, struct prison *pr) { struct icmp6_nodeinfo *ni6, *nni6; struct mbuf *n = NULL; u_int16_t qtype; int subjlen; int replylen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo); struct ni_reply_fqdn *fqdn; int addrs; /* for NI_QTYPE_NODEADDR */ struct ifnet *ifp = NULL; /* for NI_QTYPE_NODEADDR */ struct in6_addr in6_subj; /* subject address */ struct ip6_hdr *ip6; int oldfqdn = 0; /* if 1, return pascal string (03 draft) */ char *subj = NULL; struct in6_ifaddr *ia6 = NULL; ip6 = mtod(m, struct ip6_hdr *); ni6 = (struct icmp6_nodeinfo *)(mtod(m, caddr_t) + off); /* * Validate IPv6 source address. * The default configuration MUST be to refuse answering queries from * global-scope addresses according to RFC4602. * Notes: * - it's not very clear what "refuse" means; this implementation * simply drops it. * - it's not very easy to identify global-scope (unicast) addresses * since there are many prefixes for them. It should be safer * and in practice sufficient to check "all" but loopback and * link-local (note that site-local unicast was deprecated and * ULA is defined as global scope-wise) */ if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_GLOBALOK) == 0 && !IN6_IS_ADDR_LOOPBACK(&ip6->ip6_src) && !IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src)) goto bad; /* * Validate IPv6 destination address. * * The Responder must discard the Query without further processing * unless it is one of the Responder's unicast or anycast addresses, or * a link-local scope multicast address which the Responder has joined. * [RFC4602, Section 5.] */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { if (!IN6_IS_ADDR_MC_LINKLOCAL(&ip6->ip6_dst)) goto bad; /* else it's a link-local multicast, fine */ } else { /* unicast or anycast */ ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */, false); if (ia6 == NULL) goto bad; /* XXX impossible */ if ((ia6->ia6_flags & IN6_IFF_TEMPORARY) && !(V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK)) { nd6log((LOG_DEBUG, "ni6_input: ignore node info to " "a temporary address in %s:%d", __FILE__, __LINE__)); goto bad; } } /* validate query Subject field. 
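* The Subject is whatever follows the icmp6_nodeinfo header; subjlen * computed below is its length, e.g. an ICMP6_NI_SUBJ_IPV6 subject must * be exactly sizeof(struct in6_addr) (16 bytes).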
*/ qtype = ntohs(ni6->ni_qtype); subjlen = m->m_pkthdr.len - off - sizeof(struct icmp6_nodeinfo); switch (qtype) { case NI_QTYPE_NOOP: case NI_QTYPE_SUPTYPES: /* 07 draft */ if (ni6->ni_code == ICMP6_NI_SUBJ_FQDN && subjlen == 0) break; /* FALLTHROUGH */ case NI_QTYPE_FQDN: case NI_QTYPE_NODEADDR: case NI_QTYPE_IPV4ADDR: switch (ni6->ni_code) { case ICMP6_NI_SUBJ_IPV6: #if ICMP6_NI_SUBJ_IPV6 != 0 case 0: #endif /* * backward compatibility - try to accept 03 draft * format, where no Subject is present. */ if (qtype == NI_QTYPE_FQDN && ni6->ni_code == 0 && subjlen == 0) { oldfqdn++; break; } #if ICMP6_NI_SUBJ_IPV6 != 0 if (ni6->ni_code != ICMP6_NI_SUBJ_IPV6) goto bad; #endif if (subjlen != sizeof(struct in6_addr)) goto bad; /* * Validate Subject address. * * Not sure what exactly "address belongs to the node" * means in the spec, is it just unicast, or what? * * At this moment we consider Subject address as * "belong to the node" if the Subject address equals * to the IPv6 destination address; validation for * IPv6 destination address should have done enough * check for us. * * We do not do proxy at this moment. */ m_copydata(m, off + sizeof(struct icmp6_nodeinfo), subjlen, (caddr_t)&in6_subj); if (in6_setscope(&in6_subj, m->m_pkthdr.rcvif, NULL)) goto bad; subj = (char *)&in6_subj; if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &in6_subj)) break; /* * XXX if we are to allow other cases, we should really * be careful about scope here. * basically, we should disallow queries toward IPv6 * destination X with subject Y, * if scope(X) > scope(Y). * if we allow scope(X) > scope(Y), it will result in * information leakage across scope boundary. */ goto bad; case ICMP6_NI_SUBJ_FQDN: /* * Validate Subject name with gethostname(3). * * The behavior may need some debate, since: * - we are not sure if the node has FQDN as * hostname (returned by gethostname(3)). * - the code does wildcard match for truncated names. * however, we are not sure if we want to perform * wildcard match, if gethostname(3) side has * truncated hostname. */ mtx_lock(&pr->pr_mtx); n = ni6_nametodns(pr->pr_hostname, strlen(pr->pr_hostname), 0); mtx_unlock(&pr->pr_mtx); if (!n || n->m_next || n->m_len == 0) goto bad; if (m->m_len < off + sizeof(struct icmp6_nodeinfo) + subjlen) { m = m_pullup(m, off + sizeof(struct icmp6_nodeinfo) + subjlen); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); goto bad; } } /* ip6 possibly invalid but not used after. */ ni6 = (struct icmp6_nodeinfo *)(mtod(m, caddr_t) + off); subj = (char *)(mtod(m, caddr_t) + off + sizeof(struct icmp6_nodeinfo)); if (!ni6_dnsmatch(subj, subjlen, mtod(n, const char *), n->m_len)) { goto bad; } m_freem(n); n = NULL; break; case ICMP6_NI_SUBJ_IPV4: /* XXX: to be implemented? */ default: goto bad; } break; } /* refuse based on configuration. XXX ICMP6_NI_REFUSED? 
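* V_icmp6_nodeinfo is a bitmask: the ICMP6_NODEINFO_FQDNOK and * ICMP6_NODEINFO_NODEADDROK bits tested below gate the two reply * classes (normally exposed as the net.inet6.icmp6.nodeinfo sysctl).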
*/ switch (qtype) { case NI_QTYPE_FQDN: if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_FQDNOK) == 0) goto bad; break; case NI_QTYPE_NODEADDR: case NI_QTYPE_IPV4ADDR: if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_NODEADDROK) == 0) goto bad; break; } /* guess reply length */ switch (qtype) { case NI_QTYPE_NOOP: break; /* no reply data */ case NI_QTYPE_SUPTYPES: replylen += sizeof(u_int32_t); break; case NI_QTYPE_FQDN: /* XXX will append an mbuf */ replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen); break; case NI_QTYPE_NODEADDR: addrs = ni6_addrs(ni6, m, &ifp, (struct in6_addr *)subj); if ((replylen += addrs * (sizeof(struct in6_addr) + sizeof(u_int32_t))) > MCLBYTES) replylen = MCLBYTES; /* XXX: will truncate pkt later */ break; case NI_QTYPE_IPV4ADDR: /* unsupported - should respond with unknown Qtype? */ break; default: /* * XXX: We must return a reply with the ICMP6 code * `unknown Qtype' in this case. However we regard the case * as an FQDN query for backward compatibility. * Older versions set a random value to this field, * so it rarely varies in the defined qtypes. * But the mechanism is not reliable... * maybe we should obsolete older versions. */ qtype = NI_QTYPE_FQDN; /* XXX will append an mbuf */ replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen); oldfqdn++; break; } /* Allocate an mbuf to reply. */ if (replylen > MCLBYTES) { /* * XXX: should we try to allocate more? But MCLBYTES * is probably much larger than IPV6_MMTU... */ goto bad; } if (replylen > MHLEN) n = m_getcl(M_NOWAIT, m->m_type, M_PKTHDR); else n = m_gethdr(M_NOWAIT, m->m_type); if (n == NULL) { m_freem(m); return (NULL); } m_move_pkthdr(n, m); /* just for recvif and FIB */ n->m_pkthdr.len = n->m_len = replylen; /* copy mbuf header and IPv6 + Node Information base headers */ bcopy(mtod(m, caddr_t), mtod(n, caddr_t), sizeof(struct ip6_hdr)); nni6 = (struct icmp6_nodeinfo *)(mtod(n, struct ip6_hdr *) + 1); bcopy((caddr_t)ni6, (caddr_t)nni6, sizeof(struct icmp6_nodeinfo)); /* qtype dependent procedure */ switch (qtype) { case NI_QTYPE_NOOP: nni6->ni_code = ICMP6_NI_SUCCESS; nni6->ni_flags = 0; break; case NI_QTYPE_SUPTYPES: { u_int32_t v; nni6->ni_code = ICMP6_NI_SUCCESS; nni6->ni_flags = htons(0x0000); /* raw bitmap */ /* supports NOOP, SUPTYPES, FQDN, and NODEADDR */ v = (u_int32_t)htonl(0x0000000f); bcopy(&v, nni6 + 1, sizeof(u_int32_t)); break; } case NI_QTYPE_FQDN: nni6->ni_code = ICMP6_NI_SUCCESS; fqdn = (struct ni_reply_fqdn *)(mtod(n, caddr_t) + sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo)); nni6->ni_flags = 0; /* XXX: meaningless TTL */ fqdn->ni_fqdn_ttl = 0; /* ditto. */ /* * XXX do we really have FQDN in hostname? */ mtx_lock(&pr->pr_mtx); n->m_next = ni6_nametodns(pr->pr_hostname, strlen(pr->pr_hostname), oldfqdn); mtx_unlock(&pr->pr_mtx); if (n->m_next == NULL) goto bad; /* XXX we assume that n->m_next is not a chain */ if (n->m_next->m_next != NULL) goto bad; n->m_pkthdr.len += n->m_next->m_len; break; case NI_QTYPE_NODEADDR: { int lenlim, copied; nni6->ni_code = ICMP6_NI_SUCCESS; n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo); lenlim = M_TRAILINGSPACE(n); copied = ni6_store_addrs(ni6, nni6, ifp, lenlim); /* XXX: reset mbuf length */ n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo) + copied; break; } default: break; /* XXX impossible! */ } nni6->ni_type = ICMP6_NI_REPLY; m_freem(m); return (n); bad: m_freem(m); if (n) m_freem(n); return (NULL); } /* * make a mbuf with DNS-encoded string. no compression support. 
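* For example, "host.example.net" is encoded as \4host\7example\3net\0, * one length byte per label plus a terminating zero (RFC 1035); a name * judged to be a shortened hostname gets a second trailing \0, per the * heuristic below.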
* * XXX names with less than 2 dots (like "foo" or "foo.section") will be * treated as truncated name (two \0 at the end). this is a wild guess. * * old - return pascal string if non-zero */ static struct mbuf * ni6_nametodns(const char *name, int namelen, int old) { struct mbuf *m; char *cp, *ep; const char *p, *q; int i, len, nterm; if (old) len = namelen + 1; else len = MCLBYTES; /* Because MAXHOSTNAMELEN is usually 256, we use cluster mbuf. */ if (len > MLEN) m = m_getcl(M_NOWAIT, MT_DATA, 0); else m = m_get(M_NOWAIT, MT_DATA); if (m == NULL) goto fail; if (old) { m->m_len = len; *mtod(m, char *) = namelen; bcopy(name, mtod(m, char *) + 1, namelen); return m; } else { m->m_len = 0; cp = mtod(m, char *); ep = mtod(m, char *) + M_TRAILINGSPACE(m); /* if not certain about my name, return empty buffer */ if (namelen == 0) return m; /* * guess if it looks like shortened hostname, or FQDN. * shortened hostname needs two trailing "\0". */ i = 0; for (p = name; p < name + namelen; p++) { if (*p && *p == '.') i++; } if (i < 2) nterm = 2; else nterm = 1; p = name; while (cp < ep && p < name + namelen) { i = 0; for (q = p; q < name + namelen && *q && *q != '.'; q++) i++; /* result does not fit into mbuf */ if (cp + i + 1 >= ep) goto fail; /* * DNS label length restriction, RFC1035 page 8. * "i == 0" case is included here to avoid returning * 0-length label on "foo..bar". */ if (i <= 0 || i >= 64) goto fail; *cp++ = i; bcopy(p, cp, i); cp += i; p = q; if (p < name + namelen && *p == '.') p++; } /* termination */ if (cp + nterm >= ep) goto fail; while (nterm-- > 0) *cp++ = '\0'; m->m_len = cp - mtod(m, char *); return m; } panic("should not reach here"); /* NOTREACHED */ fail: if (m) m_freem(m); return NULL; } /* * check if two DNS-encoded string matches. takes care of truncated * form (with \0\0 at the end). no compression support. * XXX upper/lowercase match (see RFC2065) */ static int ni6_dnsmatch(const char *a, int alen, const char *b, int blen) { const char *a0, *b0; int l; /* simplest case - need validation? */ if (alen == blen && bcmp(a, b, alen) == 0) return 1; a0 = a; b0 = b; /* termination is mandatory */ if (alen < 2 || blen < 2) return 0; if (a0[alen - 1] != '\0' || b0[blen - 1] != '\0') return 0; alen--; blen--; while (a - a0 < alen && b - b0 < blen) { if (a - a0 + 1 > alen || b - b0 + 1 > blen) return 0; if ((signed char)a[0] < 0 || (signed char)b[0] < 0) return 0; /* we don't support compression yet */ if (a[0] >= 64 || b[0] >= 64) return 0; /* truncated case */ if (a[0] == 0 && a - a0 == alen - 1) return 1; if (b[0] == 0 && b - b0 == blen - 1) return 1; if (a[0] == 0 || b[0] == 0) return 0; if (a[0] != b[0]) return 0; l = a[0]; if (a - a0 + 1 + l > alen || b - b0 + 1 + l > blen) return 0; if (bcmp(a + 1, b + 1, l) != 0) return 0; a += 1 + l; b += 1 + l; } if (a - a0 == alen && b - b0 == blen) return 1; else return 0; } /* * calculate the number of addresses to be returned in the node info reply. */ static int ni6_addrs(struct icmp6_nodeinfo *ni6, struct mbuf *m, struct ifnet **ifpp, struct in6_addr *subj) { struct ifnet *ifp; struct in6_ifaddr *ifa6; struct ifaddr *ifa; int addrs = 0, addrsofif, iffound = 0; int niflags = ni6->ni_flags; NET_EPOCH_ASSERT(); if ((niflags & NI_NODEADDR_FLAG_ALL) == 0) { switch (ni6->ni_code) { case ICMP6_NI_SUBJ_IPV6: if (subj == NULL) /* must be impossible... */ return (0); break; default: /* * XXX: we only support IPv6 subject address for * this Qtype. 
*/ return (0); } } CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { addrsofif = 0; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa6 = (struct in6_ifaddr *)ifa; if ((niflags & NI_NODEADDR_FLAG_ALL) == 0 && IN6_ARE_ADDR_EQUAL(subj, &ifa6->ia_addr.sin6_addr)) iffound = 1; /* * IPv4-mapped addresses can only be returned by a * Node Information proxy, since they represent * addresses of IPv4-only nodes, which perforce do * not implement this protocol. * [icmp-name-lookups-07, Section 5.4] * So we don't support NI_NODEADDR_FLAG_COMPAT in * this function at this moment. */ /* What do we have to do about ::1? */ switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) { case IPV6_ADDR_SCOPE_LINKLOCAL: if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_SITELOCAL: if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_GLOBAL: if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0) continue; break; default: continue; } /* * check if anycast is okay. * XXX: just experimental. not in the spec. */ if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 && (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) continue; /* we need only unicast addresses */ if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0) { continue; } addrsofif++; /* count the address */ } if (iffound) { *ifpp = ifp; return (addrsofif); } addrs += addrsofif; } return (addrs); } static int ni6_store_addrs(struct icmp6_nodeinfo *ni6, struct icmp6_nodeinfo *nni6, struct ifnet *ifp0, int resid) { struct ifnet *ifp; struct in6_ifaddr *ifa6; struct ifaddr *ifa; struct ifnet *ifp_dep = NULL; int copied = 0, allow_deprecated = 0; u_char *cp = (u_char *)(nni6 + 1); int niflags = ni6->ni_flags; u_int32_t ltime; NET_EPOCH_ASSERT(); if (ifp0 == NULL && !(niflags & NI_NODEADDR_FLAG_ALL)) return (0); /* needless to copy */ ifp = ifp0 ? ifp0 : CK_STAILQ_FIRST(&V_ifnet); again: for (; ifp; ifp = CK_STAILQ_NEXT(ifp, if_link)) { CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifa6 = (struct in6_ifaddr *)ifa; if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) != 0 && allow_deprecated == 0) { /* * preferred addresses should be put before * deprecated addresses. */ /* record the interface for later search */ if (ifp_dep == NULL) ifp_dep = ifp; continue; } else if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) == 0 && allow_deprecated != 0) continue; /* we now collect deprecated addrs */ /* What do we have to do about ::1? */ switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) { case IPV6_ADDR_SCOPE_LINKLOCAL: if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_SITELOCAL: if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0) continue; break; case IPV6_ADDR_SCOPE_GLOBAL: if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0) continue; break; default: continue; } /* * check if anycast is okay. * XXX: just experimental. not in the spec. */ if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 && (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) continue; if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0) { continue; } /* now we can copy the address */ if (resid < sizeof(struct in6_addr) + sizeof(u_int32_t)) { /* * No room to copy any more addresses. * Set the truncate flag and return. */ nni6->ni_flags |= NI_NODEADDR_FLAG_TRUNCATE; return (copied); } /* * Set the TTL of the address.
* The TTL value should be one of the following * according to the specification: * * 1. The remaining lifetime of a DHCP lease on the * address, or * 2. The remaining Valid Lifetime of a prefix from * which the address was derived through Stateless * Autoconfiguration. * * Note that we currently do not support stateful * address configuration by DHCPv6, so the former * case can't happen. */ if (ifa6->ia6_lifetime.ia6t_expire == 0) ltime = ND6_INFINITE_LIFETIME; else { if (ifa6->ia6_lifetime.ia6t_expire > time_uptime) ltime = htonl(ifa6->ia6_lifetime.ia6t_expire - time_uptime); else ltime = 0; } bcopy(<ime, cp, sizeof(u_int32_t)); cp += sizeof(u_int32_t); /* copy the address itself */ bcopy(&ifa6->ia_addr.sin6_addr, cp, sizeof(struct in6_addr)); in6_clearscope((struct in6_addr *)cp); /* XXX */ cp += sizeof(struct in6_addr); resid -= (sizeof(struct in6_addr) + sizeof(u_int32_t)); copied += (sizeof(struct in6_addr) + sizeof(u_int32_t)); } if (ifp0) /* we need search only on the specified IF */ break; } if (allow_deprecated == 0 && ifp_dep != NULL) { ifp = ifp_dep; allow_deprecated = 1; goto again; } return (copied); } static bool icmp6_rip6_match(const struct inpcb *inp, void *v) { struct ip6_hdr *ip6 = v; if ((inp->inp_vflag & INP_IPV6) == 0) return (false); if (inp->inp_ip_p != IPPROTO_ICMPV6) return (false); if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst)) return (false); if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src)) return (false); return (true); } /* * XXX almost dup'ed code with rip6_input. */ static int icmp6_rip6_input(struct mbuf **mp, int off) { struct mbuf *n, *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct inpcb_iterator inpi = INP_ITERATOR(&V_ripcbinfo, INPLOOKUP_RLOCKPCB, icmp6_rip6_match, ip6); struct inpcb *inp; struct sockaddr_in6 fromsa; struct icmp6_hdr *icmp6; struct mbuf *opts = NULL; int delivered = 0; /* This is assumed to be safe; icmp6_input() does a pullup. */ icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off); /* * XXX: the address may have embedded scope zone ID, which should be * hidden from applications. */ bzero(&fromsa, sizeof(fromsa)); fromsa.sin6_family = AF_INET6; fromsa.sin6_len = sizeof(struct sockaddr_in6); fromsa.sin6_addr = ip6->ip6_src; if (sa6_recoverscope(&fromsa)) { m_freem(m); *mp = NULL; return (IPPROTO_DONE); } while ((inp = inp_next(&inpi)) != NULL) { if (ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type, inp->in6p_icmp6filt)) continue; /* * Recent network drivers tend to allocate a single * mbuf cluster, rather than to make a couple of * mbufs without clusters. Also, since the IPv6 code * path tries to avoid m_pullup(), it is highly * probable that we still have an mbuf cluster here * even though the necessary length can be stored in an * mbuf's internal buffer. * Meanwhile, the default size of the receive socket * buffer for raw sockets is not so large. This means * the possibility of packet loss is relatively higher * than before. To avoid this scenario, we copy the * received data to a separate mbuf that does not use * a cluster, if possible. * XXX: it is better to copy the data after stripping * intermediate headers. 
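* (For example, a small echo reply that arrived in a 2k cluster is * copied into a plain mbuf when it fits in MHLEN, so it is charged far * less against the receiving socket buffer's space accounting.)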
*/ if ((m->m_flags & M_EXT) && m->m_next == NULL && m->m_len <= MHLEN) { n = m_get(M_NOWAIT, m->m_type); if (n != NULL) { if (m_dup_pkthdr(n, m, M_NOWAIT)) { bcopy(m->m_data, n->m_data, m->m_len); n->m_len = m->m_len; } else { m_free(n); n = NULL; } } } else n = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (n == NULL) continue; if (inp->inp_flags & INP_CONTROLOPTS) ip6_savecontrol(inp, n, &opts); /* strip intermediate headers */ m_adj(n, off); SOCKBUF_LOCK(&inp->inp_socket->so_rcv); if (sbappendaddr_locked(&inp->inp_socket->so_rcv, (struct sockaddr *)&fromsa, n, opts) == 0) { soroverflow_locked(inp->inp_socket); m_freem(n); if (opts) m_freem(opts); } else { sorwakeup_locked(inp->inp_socket); delivered++; } opts = NULL; } m_freem(m); *mp = NULL; if (delivered == 0) IP6STAT_DEC(ip6s_delivered); return (IPPROTO_DONE); } /* * Reflect the ip6 packet back to the source. * OFF points to the icmp6 header, counted from the top of the mbuf. */ static void icmp6_reflect(struct mbuf *m, size_t off) { struct in6_addr src6, *srcp; struct ip6_hdr *ip6; struct icmp6_hdr *icmp6; struct in6_ifaddr *ia = NULL; struct ifnet *outif = NULL; int plen; int type, code, hlim; /* too short to reflect */ if (off < sizeof(struct ip6_hdr)) { nd6log((LOG_DEBUG, "sanity fail: off=%lx, sizeof(ip6)=%lx in %s:%d\n", (u_long)off, (u_long)sizeof(struct ip6_hdr), __FILE__, __LINE__)); goto bad; } /* * If there are extra headers between IPv6 and ICMPv6, strip * off that header first. */ #ifdef DIAGNOSTIC if (sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) > MHLEN) panic("assumption failed in icmp6_reflect"); #endif if (off > sizeof(struct ip6_hdr)) { size_t l; struct ip6_hdr nip6; l = off - sizeof(struct ip6_hdr); m_copydata(m, 0, sizeof(nip6), (caddr_t)&nip6); m_adj(m, l); l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); if (m->m_len < l) { if ((m = m_pullup(m, l)) == NULL) return; } bcopy((caddr_t)&nip6, mtod(m, caddr_t), sizeof(nip6)); } else /* off == sizeof(struct ip6_hdr) */ { size_t l; l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr); if (m->m_len < l) { if ((m = m_pullup(m, l)) == NULL) return; } } plen = m->m_pkthdr.len - sizeof(struct ip6_hdr); ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_nxt = IPPROTO_ICMPV6; icmp6 = (struct icmp6_hdr *)(ip6 + 1); type = icmp6->icmp6_type; /* keep type for statistics */ code = icmp6->icmp6_code; /* ditto. */ hlim = 0; srcp = NULL; /* * If the incoming packet was addressed directly to us (i.e. unicast), * use dst as the src for the reply. * The IN6_IFF_NOTREADY case should be VERY rare, but is possible * (for example) when we encounter an error while forwarding procedure * destined to a duplicated address of ours. */ if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { ia = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */, false); if (ia != NULL && !(ia->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY))) { src6 = ia->ia_addr.sin6_addr; srcp = &src6; if (m->m_pkthdr.rcvif != NULL) { /* XXX: This may not be the outgoing interface */ hlim = ND_IFINFO(m->m_pkthdr.rcvif)->chlim; } else hlim = V_ip6_defhlim; } } if (srcp == NULL) { int error; struct in6_addr dst6; uint32_t scopeid; /* * This case matches to multicasts, our anycast, or unicasts * that we do not own. Select a source address based on the * source address of the erroneous packet. 
*/ in6_splitscope(&ip6->ip6_src, &dst6, &scopeid); error = in6_selectsrc_addr(M_GETFIB(m), &dst6, scopeid, NULL, &src6, &hlim); if (error) { char ip6buf[INET6_ADDRSTRLEN]; nd6log((LOG_DEBUG, "icmp6_reflect: source can't be determined: " "dst=%s, error=%d\n", ip6_sprintf(ip6buf, &ip6->ip6_dst), error)); goto bad; } srcp = &src6; } /* * ip6_input() drops a packet if its src is multicast. * So, the src is never multicast. */ ip6->ip6_dst = ip6->ip6_src; ip6->ip6_src = *srcp; ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = hlim; icmp6->icmp6_cksum = 0; icmp6->icmp6_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), plen); /* * XXX option handling */ m->m_flags &= ~(M_BCAST|M_MCAST); m->m_pkthdr.rcvif = NULL; ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL); if (outif) icmp6_ifoutstat_inc(outif, type, code); return; bad: m_freem(m); return; } static const char * icmp6_redirect_diag(struct in6_addr *src6, struct in6_addr *dst6, struct in6_addr *tgt6) { static char buf[1024]; char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufd[INET6_ADDRSTRLEN]; char ip6buft[INET6_ADDRSTRLEN]; snprintf(buf, sizeof(buf), "(src=%s dst=%s tgt=%s)", ip6_sprintf(ip6bufs, src6), ip6_sprintf(ip6bufd, dst6), ip6_sprintf(ip6buft, tgt6)); return buf; } void icmp6_redirect_input(struct mbuf *m, int off) { struct ifnet *ifp; struct ip6_hdr *ip6; struct nd_redirect *nd_rd; struct in6_addr src6, redtgt6, reddst6; union nd_opts ndopts; char ip6buf[INET6_ADDRSTRLEN]; char *lladdr; int icmp6len, is_onlink, is_router, lladdrlen; M_ASSERTPKTHDR(m); KASSERT(m->m_pkthdr.rcvif != NULL, ("%s: no rcvif", __func__)); /* XXX if we are router, we don't update route by icmp6 redirect */ if (V_ip6_forwarding) goto freeit; if (!V_icmp6_rediraccept) goto freeit; /* RFC 6980: Nodes MUST silently ignore fragments */ if(m->m_flags & M_FRAGMENTED) goto freeit; ip6 = mtod(m, struct ip6_hdr *); icmp6len = ntohs(ip6->ip6_plen); if (m->m_len < off + icmp6len) { m = m_pullup(m, off + icmp6len); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); return; } } ip6 = mtod(m, struct ip6_hdr *); nd_rd = (struct nd_redirect *)((caddr_t)ip6 + off); ifp = m->m_pkthdr.rcvif; redtgt6 = nd_rd->nd_rd_target; reddst6 = nd_rd->nd_rd_dst; if (in6_setscope(&redtgt6, ifp, NULL) || in6_setscope(&reddst6, ifp, NULL)) { goto freeit; } /* validation */ src6 = ip6->ip6_src; if (!IN6_IS_ADDR_LINKLOCAL(&src6)) { nd6log((LOG_ERR, "ICMP6 redirect sent from %s rejected; " "must be from linklocal\n", ip6_sprintf(ip6buf, &src6))); goto bad; } if (__predict_false(ip6->ip6_hlim != 255)) { ICMP6STAT_INC(icp6s_invlhlim); nd6log((LOG_ERR, "ICMP6 redirect sent from %s rejected; " "hlim=%d (must be 255)\n", ip6_sprintf(ip6buf, &src6), ip6->ip6_hlim)); goto bad; } { /* ip6->ip6_src must be equal to gw for icmp6->icmp6_reddst */ struct nhop_object *nh; struct in6_addr kdst; uint32_t scopeid; in6_splitscope(&reddst6, &kdst, &scopeid); NET_EPOCH_ASSERT(); nh = fib6_lookup(ifp->if_fib, &kdst, scopeid, 0, 0); if (nh != NULL) { struct in6_addr nh_addr; nh_addr = ifatoia6(nh->nh_ifa)->ia_addr.sin6_addr; if ((nh->nh_flags & NHF_GATEWAY) == 0) { nd6log((LOG_ERR, "ICMP6 redirect rejected; no route " "with inet6 gateway found for redirect dst: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } /* * Embed scope zone id into next hop address. 
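* (KAME-style embedding: the zone id of a link-local next hop, e.g. * if_index 2, lives in address bytes 2 and 3, so fe80::1 on that link * is stored and compared as fe80:2::1 here.)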
*/ nh_addr = nh->gw6_sa.sin6_addr; if (IN6_ARE_ADDR_EQUAL(&src6, &nh_addr) == 0) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "not equal to gw-for-src=%s (must be same): " "%s\n", ip6_sprintf(ip6buf, &nh_addr), icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } } else { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "no route found for redirect dst: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } } if (IN6_IS_ADDR_MULTICAST(&reddst6)) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "redirect dst must be unicast: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } is_router = is_onlink = 0; if (IN6_IS_ADDR_LINKLOCAL(&redtgt6)) is_router = 1; /* router case */ if (bcmp(&redtgt6, &reddst6, sizeof(redtgt6)) == 0) is_onlink = 1; /* on-link destination case */ if (!is_router && !is_onlink) { nd6log((LOG_ERR, "ICMP6 redirect rejected; " "neither router case nor onlink case: %s\n", icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } icmp6len -= sizeof(*nd_rd); nd6_option_init(nd_rd + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "%s: invalid ND option, rejected: %s\n", __func__, icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); /* nd6_options have incremented stats */ goto freeit; } lladdr = NULL; lladdrlen = 0; if (ndopts.nd_opts_tgt_lladdr) { lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1); lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3; } if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "%s: lladdrlen mismatch for %s " "(if %d, icmp6 packet %d): %s\n", __func__, ip6_sprintf(ip6buf, &redtgt6), ifp->if_addrlen, lladdrlen - 2, icmp6_redirect_diag(&src6, &reddst6, &redtgt6))); goto bad; } /* Validation passed. */ /* RFC 2461 8.3 */ nd6_cache_lladdr(ifp, &redtgt6, lladdr, lladdrlen, ND_REDIRECT, is_onlink ? ND_REDIRECT_ONLINK : ND_REDIRECT_ROUTER); /* * Install a gateway route in the better-router case or an interface * route in the on-link-destination case. 
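* Either way the learned entry is pushed to every FIB through * rib_add_redirect() below, with the V_icmp6_redirtimeout expiry * passed along.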
*/ { struct sockaddr_in6 sdst; struct sockaddr_in6 sgw; struct sockaddr_in6 ssrc; struct sockaddr *gw; int rt_flags; u_int fibnum; bzero(&sdst, sizeof(sdst)); bzero(&ssrc, sizeof(ssrc)); sdst.sin6_family = ssrc.sin6_family = AF_INET6; sdst.sin6_len = ssrc.sin6_len = sizeof(struct sockaddr_in6); bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr)); bcopy(&src6, &ssrc.sin6_addr, sizeof(struct in6_addr)); rt_flags = 0; if (is_router) { bzero(&sgw, sizeof(sgw)); sgw.sin6_family = AF_INET6; sgw.sin6_len = sizeof(struct sockaddr_in6); bcopy(&redtgt6, &sgw.sin6_addr, sizeof(struct in6_addr)); gw = (struct sockaddr *)&sgw; rt_flags |= RTF_GATEWAY; } else gw = ifp->if_addr->ifa_addr; for (fibnum = 0; fibnum < rt_numfibs; fibnum++) rib_add_redirect(fibnum, (struct sockaddr *)&sdst, gw, (struct sockaddr *)&ssrc, ifp, rt_flags, V_icmp6_redirtimeout); } freeit: m_freem(m); return; bad: ICMP6STAT_INC(icp6s_badredirect); m_freem(m); } void icmp6_redirect_output(struct mbuf *m0, struct nhop_object *nh) { struct ifnet *ifp; /* my outgoing interface */ struct in6_addr *ifp_ll6; struct in6_addr *router_ll6; struct ip6_hdr *sip6; /* m0 as struct ip6_hdr */ struct mbuf *m = NULL; /* newly allocated one */ struct m_tag *mtag; struct ip6_hdr *ip6; /* m as struct ip6_hdr */ struct nd_redirect *nd_rd; struct llentry *ln = NULL; size_t maxlen; u_char *p; struct ifnet *outif = NULL; struct sockaddr_in6 src_sa; icmp6_errcount(ND_REDIRECT, 0); /* if we are not router, we don't send icmp6 redirect */ if (!V_ip6_forwarding) goto fail; /* sanity check */ if (!m0 || !nh || !(NH_IS_VALID(nh)) || !(ifp = nh->nh_ifp)) goto fail; /* * Address check: * the source address must identify a neighbor, and * the destination address must not be a multicast address * [RFC 2461, sec 8.2] */ sip6 = mtod(m0, struct ip6_hdr *); bzero(&src_sa, sizeof(src_sa)); src_sa.sin6_family = AF_INET6; src_sa.sin6_len = sizeof(src_sa); src_sa.sin6_addr = sip6->ip6_src; if (nd6_is_addr_neighbor(&src_sa, ifp) == 0) goto fail; if (IN6_IS_ADDR_MULTICAST(&sip6->ip6_dst)) goto fail; /* what should we do here? */ /* rate limit */ if (icmp6_ratelimit(&sip6->ip6_src, ND_REDIRECT, 0)) goto fail; /* * Since we are going to append up to 1280 bytes (= IPV6_MMTU), * we almost always ask for an mbuf cluster for simplicity. * (MHLEN < IPV6_MMTU is almost always true) */ #if IPV6_MMTU >= MCLBYTES # error assumption failed about IPV6_MMTU and MCLBYTES #endif m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) goto fail; M_SETFIB(m, M_GETFIB(m0)); maxlen = M_TRAILINGSPACE(m); maxlen = min(IPV6_MMTU, maxlen); /* just for safety */ if (maxlen < sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) + ((sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7)) { goto fail; } { /* get ip6 linklocal address for ifp(my outgoing interface). */ struct in6_ifaddr *ia; if ((ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY| IN6_IFF_ANYCAST)) == NULL) goto fail; ifp_ll6 = &ia->ia_addr.sin6_addr; /* XXXRW: reference released prematurely. */ ifa_free(&ia->ia_ifa); } /* get ip6 linklocal address for the router. 
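* (The target of a better-router redirect must be a link-local address * per the ND specification, which is why a non-link-local gateway * address is discarded below and no such redirect is sent.)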
*/ if (nh->nh_flags & NHF_GATEWAY) { struct sockaddr_in6 *sin6; sin6 = &nh->gw6_sa; router_ll6 = &sin6->sin6_addr; if (!IN6_IS_ADDR_LINKLOCAL(router_ll6)) router_ll6 = (struct in6_addr *)NULL; } else router_ll6 = (struct in6_addr *)NULL; /* ip6 */ ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; /* ip6->ip6_plen will be set later */ ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 255; /* ip6->ip6_src must be linklocal addr for my outgoing if. */ bcopy(ifp_ll6, &ip6->ip6_src, sizeof(struct in6_addr)); bcopy(&sip6->ip6_src, &ip6->ip6_dst, sizeof(struct in6_addr)); /* ND Redirect */ nd_rd = (struct nd_redirect *)(ip6 + 1); nd_rd->nd_rd_type = ND_REDIRECT; nd_rd->nd_rd_code = 0; nd_rd->nd_rd_reserved = 0; if (nh->nh_flags & NHF_GATEWAY) { /* * nd_rd->nd_rd_target must be a link-local address in * better router cases. */ if (!router_ll6) goto fail; bcopy(router_ll6, &nd_rd->nd_rd_target, sizeof(nd_rd->nd_rd_target)); bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst, sizeof(nd_rd->nd_rd_dst)); } else { /* make sure redtgt == reddst */ bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_target, sizeof(nd_rd->nd_rd_target)); bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst, sizeof(nd_rd->nd_rd_dst)); } p = (u_char *)(nd_rd + 1); if (!router_ll6) goto nolladdropt; { /* target lladdr option */ int len; struct nd_opt_hdr *nd_opt; char *lladdr; ln = nd6_lookup(router_ll6, LLE_SF(AF_INET6, 0), ifp); if (ln == NULL) goto nolladdropt; len = sizeof(*nd_opt) + ifp->if_addrlen; len = (len + 7) & ~7; /* round by 8 */ /* safety check */ if (len + (p - (u_char *)ip6) > maxlen) goto nolladdropt; if (ln->la_flags & LLE_VALID) { nd_opt = (struct nd_opt_hdr *)p; nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; nd_opt->nd_opt_len = len >> 3; lladdr = (char *)(nd_opt + 1); bcopy(ln->ll_addr, lladdr, ifp->if_addrlen); p += len; } } nolladdropt: if (ln != NULL) LLE_RUNLOCK(ln); m->m_pkthdr.len = m->m_len = p - (u_char *)ip6; /* just to be safe */ #ifdef M_DECRYPTED /*not openbsd*/ if (m0->m_flags & M_DECRYPTED) goto noredhdropt; #endif if (p - (u_char *)ip6 > maxlen) goto noredhdropt; { /* redirected header option */ int len; struct nd_opt_rd_hdr *nd_opt_rh; /* * compute the maximum size for icmp6 redirect header option. * XXX room for auth header? */ len = maxlen - (p - (u_char *)ip6); len &= ~7; /* This is just for simplicity. */ if (m0->m_pkthdr.len != m0->m_len) { if (m0->m_next) { m_freem(m0->m_next); m0->m_next = NULL; } m0->m_pkthdr.len = m0->m_len; } /* * Redirected header option spec (RFC2461 4.6.3) talks nothing * about padding/truncate rule for the original IP packet. * From the discussion on IPv6imp in Feb 1999, * the consensus was: * - "attach as much as possible" is the goal * - pad if not aligned (original size can be guessed by * original ip6 header) * Following code adds the padding if it is simple enough, * and truncates if not. 
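* (Worked example: an original packet of 1237 bytes has * extra = 1237 % 8 = 5; if at least 3 bytes of trailing space are * available it is padded to 1240, otherwise it is truncated to 1232.)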
*/ if (m0->m_next || m0->m_pkthdr.len != m0->m_len) panic("assumption failed in %s:%d", __FILE__, __LINE__); if (len - sizeof(*nd_opt_rh) < m0->m_pkthdr.len) { /* not enough room, truncate */ m0->m_pkthdr.len = m0->m_len = len - sizeof(*nd_opt_rh); } else { /* enough room, pad or truncate */ size_t extra; extra = m0->m_pkthdr.len % 8; if (extra) { /* pad if easy enough, truncate if not */ if (8 - extra <= M_TRAILINGSPACE(m0)) { /* pad */ m0->m_len += (8 - extra); m0->m_pkthdr.len += (8 - extra); } else { /* truncate */ m0->m_pkthdr.len -= extra; m0->m_len -= extra; } } len = m0->m_pkthdr.len + sizeof(*nd_opt_rh); m0->m_pkthdr.len = m0->m_len = len - sizeof(*nd_opt_rh); } nd_opt_rh = (struct nd_opt_rd_hdr *)p; bzero(nd_opt_rh, sizeof(*nd_opt_rh)); nd_opt_rh->nd_opt_rh_type = ND_OPT_REDIRECTED_HEADER; nd_opt_rh->nd_opt_rh_len = len >> 3; p += sizeof(*nd_opt_rh); m->m_pkthdr.len = m->m_len = p - (u_char *)ip6; /* connect m0 to m */ m_tag_delete_chain(m0, NULL); m0->m_flags &= ~M_PKTHDR; m->m_next = m0; m->m_pkthdr.len = m->m_len + m0->m_len; m0 = NULL; } noredhdropt:; if (m0) { m_freem(m0); m0 = NULL; } /* XXX: clear embedded link IDs in the inner header */ in6_clearscope(&sip6->ip6_src); in6_clearscope(&sip6->ip6_dst); in6_clearscope(&nd_rd->nd_rd_target); in6_clearscope(&nd_rd->nd_rd_dst); ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(struct ip6_hdr)); nd_rd->nd_rd_cksum = 0; nd_rd->nd_rd_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(*ip6), ntohs(ip6->ip6_plen)); if (send_sendso_input_hook != NULL) { mtag = m_tag_get(PACKET_TAG_ND_OUTGOING, sizeof(unsigned short), M_NOWAIT); if (mtag == NULL) goto fail; *(unsigned short *)(mtag + 1) = nd_rd->nd_rd_type; m_tag_prepend(m, mtag); } /* send the packet to outside... */ ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); icmp6_ifstat_inc(outif, ifs6_out_redirect); } - ICMP6STAT_INC(icp6s_outhist[ND_REDIRECT]); + ICMP6STAT_INC2(icp6s_outhist, ND_REDIRECT); return; fail: if (m) m_freem(m); if (m0) m_freem(m0); } /* * ICMPv6 socket option processing. 
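* * A userland consumer would typically install a filter with the * RFC 3542 macros before reading from a raw ICMPv6 socket, e.g.: * * struct icmp6_filter filt; * * ICMP6_FILTER_SETBLOCKALL(&filt); * ICMP6_FILTER_SETPASS(ICMP6_ECHO_REPLY, &filt); * (void)setsockopt(s, IPPROTO_ICMPV6, ICMP6_FILTER, * &filt, sizeof(filt)); * * PRCO_SETOPT below copies such a filter into in6p_icmp6filt and * icmp6_rip6_input() consults it via ICMP6_FILTER_WILLBLOCK().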
*/ int icmp6_ctloutput(struct socket *so, struct sockopt *sopt) { int error = 0; int optlen; struct inpcb *inp = sotoinpcb(so); int level, op, optname; if (sopt) { level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; } else level = op = optname = optlen = 0; if (level != IPPROTO_ICMPV6) { return EINVAL; } switch (op) { case PRCO_SETOPT: switch (optname) { case ICMP6_FILTER: { struct icmp6_filter ic6f; if (optlen != sizeof(ic6f)) { error = EMSGSIZE; break; } error = sooptcopyin(sopt, &ic6f, optlen, optlen); if (error == 0) { INP_WLOCK(inp); *inp->in6p_icmp6filt = ic6f; INP_WUNLOCK(inp); } break; } default: error = ENOPROTOOPT; break; } break; case PRCO_GETOPT: switch (optname) { case ICMP6_FILTER: { struct icmp6_filter ic6f; INP_RLOCK(inp); ic6f = *inp->in6p_icmp6filt; INP_RUNLOCK(inp); error = sooptcopyout(sopt, &ic6f, sizeof(ic6f)); break; } default: error = ENOPROTOOPT; break; } break; } return (error); } static int sysctl_icmp6lim_and_jitter(SYSCTL_HANDLER_ARGS); VNET_DEFINE_STATIC(u_int, icmp6errppslim) = 100; #define V_icmp6errppslim VNET(icmp6errppslim) SYSCTL_PROC(_net_inet6_icmp6, ICMPV6CTL_ERRPPSLIMIT, errppslimit, CTLTYPE_UINT | CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6errppslim), 0, &sysctl_icmp6lim_and_jitter, "IU", "Maximum number of ICMPv6 error/reply messages per second"); VNET_DEFINE_STATIC(int, icmp6lim_curr_jitter) = 0; #define V_icmp6lim_curr_jitter VNET(icmp6lim_curr_jitter) VNET_DEFINE_STATIC(u_int, icmp6lim_jitter) = 8; #define V_icmp6lim_jitter VNET(icmp6lim_jitter) SYSCTL_PROC(_net_inet6_icmp6, OID_AUTO, icmp6lim_jitter, CTLTYPE_UINT | CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6lim_jitter), 0, &sysctl_icmp6lim_and_jitter, "IU", "Random errppslimit jitter adjustment limit"); VNET_DEFINE_STATIC(int, icmp6lim_output) = 1; #define V_icmp6lim_output VNET(icmp6lim_output) SYSCTL_INT(_net_inet6_icmp6, OID_AUTO, icmp6lim_output, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6lim_output), 0, "Enable logging of ICMPv6 response rate limiting"); typedef enum { RATELIM_PARAM_PROB = 0, RATELIM_TOO_BIG, RATELIM_UNREACH, RATELIM_TEXCEED, RATELIM_REDIR, RATELIM_REPLY, RATELIM_OTHER, RATELIM_MAX } ratelim_which; static const char *icmp6_rate_descrs[RATELIM_MAX] = { [RATELIM_PARAM_PROB] = "bad IPv6 header", [RATELIM_TOO_BIG] = "packet too big", [RATELIM_UNREACH] = "destination unreachable", [RATELIM_TEXCEED] = "time exceeded", [RATELIM_REPLY] = "echo reply", [RATELIM_REDIR] = "neighbor discovery redirect", [RATELIM_OTHER] = "(other)", }; static void icmp6lim_new_jitter(void) { /* * Adjust limit +/- to jitter the measurement to deny a side-channel * port scan as in https://dl.acm.org/doi/10.1145/3372297.3417280 */ if (V_icmp6lim_jitter > 0) V_icmp6lim_curr_jitter = arc4random_uniform(V_icmp6lim_jitter * 2 + 1) - V_icmp6lim_jitter; } static int sysctl_icmp6lim_and_jitter(SYSCTL_HANDLER_ARGS) { uint32_t new; int error; bool lim; MPASS(oidp->oid_arg1 == &VNET_NAME(icmp6errppslim) || oidp->oid_arg1 == &VNET_NAME(icmp6lim_jitter)); lim = (oidp->oid_arg1 == &VNET_NAME(icmp6errppslim)); new = lim ? 
V_icmp6errppslim : V_icmp6lim_jitter; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if (lim) { if (new != 0 && new <= V_icmp6lim_jitter) error = EINVAL; else V_icmp6errppslim = new; } else { if (new >= V_icmp6errppslim) error = EINVAL; else { V_icmp6lim_jitter = new; icmp6lim_new_jitter(); } } } MPASS(V_icmp6errppslim + V_icmp6lim_curr_jitter > 0); return (error); } VNET_DEFINE_STATIC(struct counter_rate, icmp6_rates[RATELIM_MAX]); #define V_icmp6_rates VNET(icmp6_rates) static void icmp6_ratelimit_init(void) { for (int i = 0; i < RATELIM_MAX; i++) { V_icmp6_rates[i].cr_rate = counter_u64_alloc(M_WAITOK); V_icmp6_rates[i].cr_ticks = ticks; } icmp6lim_new_jitter(); } VNET_SYSINIT(icmp6_ratelimit, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, icmp6_ratelimit_init, NULL); #ifdef VIMAGE static void icmp6_ratelimit_uninit(void) { for (int i = 0; i < RATELIM_MAX; i++) counter_u64_free(V_icmp6_rates[i].cr_rate); } VNET_SYSUNINIT(icmp6_ratelimit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, icmp6_ratelimit_uninit, NULL); #endif /* * Perform rate limit check. * Returns 0 if it is okay to send the icmp6 packet. * Returns 1 if the router SHOULD NOT send this icmp6 packet due to rate * limitation. * * XXX per-destination/type check necessary? * * dst - not used at this moment * code - not used at this moment */ int icmp6_ratelimit(const struct in6_addr *dst, const int type, const int code) { ratelim_which which; int64_t pps; if (V_icmp6errppslim == 0) return (0); switch (type) { case ICMP6_PARAM_PROB: which = RATELIM_PARAM_PROB; break; case ICMP6_PACKET_TOO_BIG: which = RATELIM_TOO_BIG; break; case ICMP6_DST_UNREACH: which = RATELIM_UNREACH; break; case ICMP6_TIME_EXCEEDED: which = RATELIM_TEXCEED; break; case ND_REDIRECT: which = RATELIM_REDIR; break; case ICMP6_ECHO_REPLY: which = RATELIM_REPLY; break; default: which = RATELIM_OTHER; break; }; pps = counter_ratecheck(&V_icmp6_rates[which], V_icmp6errppslim + V_icmp6lim_curr_jitter); if (pps > 0) { if (V_icmp6lim_output) log(LOG_NOTICE, "Limiting ICMPv6 %s output from %jd " "to %d packets/sec\n", icmp6_rate_descrs[which], (intmax_t )pps, V_icmp6errppslim + V_icmp6lim_curr_jitter); icmp6lim_new_jitter(); } if (pps == -1) { ICMP6STAT_INC(icp6s_toofreq); return (-1); } return (0); } diff --git a/sys/netinet6/in6_src.c b/sys/netinet6/in6_src.c index efc7d14d8b74..dd6864482b3c 100644 --- a/sys/netinet6/in6_src.c +++ b/sys/netinet6/in6_src.c @@ -1,1115 +1,1115 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_src.c,v 1.132 2003/08/26 04:42:27 keiichi Exp $ */ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct mtx addrsel_lock; #define ADDRSEL_LOCK_INIT() mtx_init(&addrsel_lock, "addrsel_lock", NULL, MTX_DEF) #define ADDRSEL_LOCK() mtx_lock(&addrsel_lock) #define ADDRSEL_UNLOCK() mtx_unlock(&addrsel_lock) #define ADDRSEL_LOCK_ASSERT() mtx_assert(&addrsel_lock, MA_OWNED) static struct sx addrsel_sxlock; #define ADDRSEL_SXLOCK_INIT() sx_init(&addrsel_sxlock, "addrsel_sxlock") #define ADDRSEL_SLOCK() sx_slock(&addrsel_sxlock) #define ADDRSEL_SUNLOCK() sx_sunlock(&addrsel_sxlock) #define ADDRSEL_XLOCK() sx_xlock(&addrsel_sxlock) #define ADDRSEL_XUNLOCK() sx_xunlock(&addrsel_sxlock) #define ADDR_LABEL_NOTAPP (-1) VNET_DEFINE_STATIC(struct in6_addrpolicy, defaultaddrpolicy); #define V_defaultaddrpolicy VNET(defaultaddrpolicy) VNET_DEFINE(int, ip6_prefer_tempaddr) = 0; static int selectroute(struct sockaddr_in6 *, struct ip6_pktopts *, struct ip6_moptions *, struct route_in6 *, struct ifnet **, struct nhop_object **, int, u_int, uint32_t); static int in6_selectif(struct sockaddr_in6 *, struct ip6_pktopts *, struct ip6_moptions *, struct ifnet **, struct ifnet *, u_int); static int in6_selectsrc(uint32_t, struct sockaddr_in6 *, struct ip6_pktopts *, struct inpcb *, struct ucred *, struct ifnet **, struct in6_addr *); static struct in6_addrpolicy *lookup_addrsel_policy(struct sockaddr_in6 *); static void init_policy_queue(void); static int add_addrsel_policyent(struct in6_addrpolicy *); static int delete_addrsel_policyent(struct in6_addrpolicy *); static int walk_addrsel_policy(int (*)(struct in6_addrpolicy *, void *), void *); static int dump_addrsel_policyent(struct in6_addrpolicy *, void *); static struct in6_addrpolicy *match_addrsel_policy(struct sockaddr_in6 *); /* * Return an IPv6 address, which is the most appropriate for a given * destination and user specified options. * If necessary, this function lookups the routing table and returns * an entry to the caller for later use. */ #define REPLACE(r) do {\ - IP6STAT_INC(ip6s_sources_rule[(r)]); \ + IP6STAT_INC2(ip6s_sources_rule, (r)); \ /* { \ char ip6buf[INET6_ADDRSTRLEN], ip6b[INET6_ADDRSTRLEN]; \ printf("in6_selectsrc: replace %s with %s by %d\n", ia_best ? ip6_sprintf(ip6buf, &ia_best->ia_addr.sin6_addr) : "none", ip6_sprintf(ip6b, &ia->ia_addr.sin6_addr), (r)); \ } */ \ goto replace; \ } while(0) #define NEXT(r) do {\ /* { \ char ip6buf[INET6_ADDRSTRLEN], ip6b[INET6_ADDRSTRLEN]; \ printf("in6_selectsrc: keep %s against %s by %d\n", ia_best ? 
ip6_sprintf(ip6buf, &ia_best->ia_addr.sin6_addr) : "none", ip6_sprintf(ip6b, &ia->ia_addr.sin6_addr), (r)); \ } */ \ goto next; /* XXX: we can't use 'continue' here */ \ } while(0) #define BREAK(r) do { \ - IP6STAT_INC(ip6s_sources_rule[(r)]); \ + IP6STAT_INC2(ip6s_sources_rule, (r)); \ goto out; /* XXX: we can't use 'break' here */ \ } while(0) static int in6_selectsrc(uint32_t fibnum, struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct inpcb *inp, struct ucred *cred, struct ifnet **ifpp, struct in6_addr *srcp) { struct rm_priotracker in6_ifa_tracker; struct in6_addr dst, tmp; struct ifnet *ifp = NULL, *oifp = NULL; struct in6_ifaddr *ia = NULL, *ia_best = NULL; struct in6_pktinfo *pi = NULL; int dst_scope = -1, best_scope = -1, best_matchlen = -1; struct in6_addrpolicy *dst_policy = NULL, *best_policy = NULL; u_int32_t odstzone; int prefer_tempaddr; int error; struct ip6_moptions *mopts; NET_EPOCH_ASSERT(); KASSERT(srcp != NULL, ("%s: srcp is NULL", __func__)); dst = dstsock->sin6_addr; /* make a copy for local operation */ if (ifpp) { /* * Save a possibly passed in ifp for in6_selectsrc. Only * neighbor discovery code should use this feature, where * we may know the interface but not the FIB number holding * the connected subnet in case someone deleted it from the * default FIB and we need to check the interface. */ if (*ifpp != NULL) oifp = *ifpp; *ifpp = NULL; } if (inp != NULL) { INP_LOCK_ASSERT(inp); mopts = inp->in6p_moptions; } else { mopts = NULL; } /* * If the source address is explicitly specified by the caller, * check if the requested source address is indeed a unicast address * assigned to the node, and can be used as the packet's source * address. If everything is okay, use the address as source. */ if (opts && (pi = opts->ip6po_pktinfo) && !IN6_IS_ADDR_UNSPECIFIED(&pi->ipi6_addr)) { /* get the outgoing interface */ if ((error = in6_selectif(dstsock, opts, mopts, &ifp, oifp, fibnum)) != 0) return (error); /* * determine the appropriate zone id of the source based on * the zone of the destination and the outgoing interface. * If the specified address is ambiguous wrt the scope zone, * the interface must be specified; otherwise, ifa_ifwithaddr() * will fail matching the address. */ tmp = pi->ipi6_addr; if (ifp) { error = in6_setscope(&tmp, ifp, &odstzone); if (error) return (error); } if (cred != NULL && (error = prison_local_ip6(cred, &tmp, (inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)) != 0) return (error); /* * If IPV6_BINDANY socket option is set, we allow to specify * non local addresses as source address in IPV6_PKTINFO * ancillary data. */ if ((inp->inp_flags & INP_BINDANY) == 0) { ia = in6ifa_ifwithaddr(&tmp, 0 /* XXX */, false); if (ia == NULL || (ia->ia6_flags & (IN6_IFF_ANYCAST | IN6_IFF_NOTREADY))) return (EADDRNOTAVAIL); bcopy(&ia->ia_addr.sin6_addr, srcp, sizeof(*srcp)); } else bcopy(&tmp, srcp, sizeof(*srcp)); pi->ipi6_addr = tmp; /* XXX: this overrides pi */ if (ifpp) *ifpp = ifp; return (0); } /* * Otherwise, if the socket has already bound the source, just use it. */ if (inp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { if (cred != NULL && (error = prison_local_ip6(cred, &inp->in6p_laddr, ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0))) != 0) return (error); bcopy(&inp->in6p_laddr, srcp, sizeof(*srcp)); return (0); } /* * Bypass source address selection and use the primary jail IP * if requested. 
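* (This is the per-jail source-address-selection knob; it is normally * controlled with the ip6.saddrsel jail parameter.)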
*/ if (cred != NULL && !prison_saddrsel_ip6(cred, srcp)) return (0); /* * If the address is not specified, choose the best one based on * the outgoing interface and the destination address. */ /* get the outgoing interface */ if ((error = in6_selectif(dstsock, opts, mopts, &ifp, oifp, (inp != NULL) ? inp->inp_inc.inc_fibnum : fibnum)) != 0) return (error); #ifdef DIAGNOSTIC if (ifp == NULL) /* this should not happen */ panic("in6_selectsrc: NULL ifp"); #endif error = in6_setscope(&dst, ifp, &odstzone); if (error) return (error); IN6_IFADDR_RLOCK(&in6_ifa_tracker); CK_STAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { int new_scope = -1, new_matchlen = -1; struct in6_addrpolicy *new_policy = NULL; u_int32_t srczone, osrczone, dstzone; struct in6_addr src; struct ifnet *ifp1 = ia->ia_ifp; /* * We'll never take an address that breaks the scope zone * of the destination. We also skip an address if its zone * does not contain the outgoing interface. * XXX: we should probably use sin6_scope_id here. */ if (in6_setscope(&dst, ifp1, &dstzone) || odstzone != dstzone) { continue; } src = ia->ia_addr.sin6_addr; if (in6_setscope(&src, ifp, &osrczone) || in6_setscope(&src, ifp1, &srczone) || osrczone != srczone) { continue; } /* avoid unusable addresses */ if ((ia->ia6_flags & (IN6_IFF_NOTREADY | IN6_IFF_ANYCAST | IN6_IFF_DETACHED))) { continue; } if (!V_ip6_use_deprecated && IFA6_IS_DEPRECATED(ia)) continue; /* If jailed only take addresses of the jail into account. */ if (cred != NULL && prison_check_ip6(cred, &ia->ia_addr.sin6_addr) != 0) continue; /* Rule 1: Prefer same address */ if (IN6_ARE_ADDR_EQUAL(&dst, &ia->ia_addr.sin6_addr)) { ia_best = ia; BREAK(1); /* there should be no better candidate */ } if (ia_best == NULL) REPLACE(0); /* Rule 2: Prefer appropriate scope */ if (dst_scope < 0) dst_scope = in6_addrscope(&dst); new_scope = in6_addrscope(&ia->ia_addr.sin6_addr); if (IN6_ARE_SCOPE_CMP(best_scope, new_scope) < 0) { if (IN6_ARE_SCOPE_CMP(best_scope, dst_scope) < 0) REPLACE(2); NEXT(2); } else if (IN6_ARE_SCOPE_CMP(new_scope, best_scope) < 0) { if (IN6_ARE_SCOPE_CMP(new_scope, dst_scope) < 0) NEXT(2); REPLACE(2); } /* * Rule 3: Avoid deprecated addresses. Note that the case of * !ip6_use_deprecated is already rejected above. */ if (!IFA6_IS_DEPRECATED(ia_best) && IFA6_IS_DEPRECATED(ia)) NEXT(3); if (IFA6_IS_DEPRECATED(ia_best) && !IFA6_IS_DEPRECATED(ia)) REPLACE(3); /* Rule 4: Prefer home addresses */ /* * XXX: This is a TODO. We should probably merge the MIP6 * case above. */ /* Rule 5: Prefer outgoing interface */ if (!(ND_IFINFO(ifp)->flags & ND6_IFF_NO_PREFER_IFACE)) { if (ia_best->ia_ifp == ifp && ia->ia_ifp != ifp) NEXT(5); if (ia_best->ia_ifp != ifp && ia->ia_ifp == ifp) REPLACE(5); } /* * Rule 6: Prefer matching label * Note that best_policy should be non-NULL here. */ if (dst_policy == NULL) dst_policy = lookup_addrsel_policy(dstsock); if (dst_policy->label != ADDR_LABEL_NOTAPP) { new_policy = lookup_addrsel_policy(&ia->ia_addr); if (dst_policy->label == best_policy->label && dst_policy->label != new_policy->label) NEXT(6); if (dst_policy->label != best_policy->label && dst_policy->label == new_policy->label) REPLACE(6); } /* * Rule 7: Prefer public addresses. * We allow users to reverse the logic by configuring * a sysctl variable, so that privacy conscious users can * always prefer temporary addresses. 
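* The system-wide default is V_ip6_prefer_tempaddr (the * net.inet6.ip6.prefer_tempaddr sysctl); a per-socket * IPV6_PREFER_TEMPADDR option, carried in ip6po_prefer_tempaddr, * overrides it in either direction, as the code below shows.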
*/ if (opts == NULL || opts->ip6po_prefer_tempaddr == IP6PO_TEMPADDR_SYSTEM) { prefer_tempaddr = V_ip6_prefer_tempaddr; } else if (opts->ip6po_prefer_tempaddr == IP6PO_TEMPADDR_NOTPREFER) { prefer_tempaddr = 0; } else prefer_tempaddr = 1; if (!(ia_best->ia6_flags & IN6_IFF_TEMPORARY) && (ia->ia6_flags & IN6_IFF_TEMPORARY)) { if (prefer_tempaddr) REPLACE(7); else NEXT(7); } if ((ia_best->ia6_flags & IN6_IFF_TEMPORARY) && !(ia->ia6_flags & IN6_IFF_TEMPORARY)) { if (prefer_tempaddr) NEXT(7); else REPLACE(7); } /* * Rule 8: prefer addresses on alive interfaces. * This is a KAME specific rule. */ if ((ia_best->ia_ifp->if_flags & IFF_UP) && !(ia->ia_ifp->if_flags & IFF_UP)) NEXT(8); if (!(ia_best->ia_ifp->if_flags & IFF_UP) && (ia->ia_ifp->if_flags & IFF_UP)) REPLACE(8); /* * Rule 9: prefer address with better virtual status. */ if (ifa_preferred(&ia_best->ia_ifa, &ia->ia_ifa)) REPLACE(9); if (ifa_preferred(&ia->ia_ifa, &ia_best->ia_ifa)) NEXT(9); /* * Rule 10: prefer address with `prefer_source' flag. */ if ((ia_best->ia6_flags & IN6_IFF_PREFER_SOURCE) == 0 && (ia->ia6_flags & IN6_IFF_PREFER_SOURCE) != 0) REPLACE(10); if ((ia_best->ia6_flags & IN6_IFF_PREFER_SOURCE) != 0 && (ia->ia6_flags & IN6_IFF_PREFER_SOURCE) == 0) NEXT(10); /* * Rule 14: Use longest matching prefix. * Note: in the address selection draft, this rule is * documented as "Rule 8". However, since it is also * documented that this rule can be overridden, we assign * a large number so that it is easy to assign smaller numbers * to more preferred rules. */ new_matchlen = in6_matchlen(&ia->ia_addr.sin6_addr, &dst); if (best_matchlen < new_matchlen) REPLACE(14); if (new_matchlen < best_matchlen) NEXT(14); /* Rule 15 is reserved. */ /* * Last resort: just keep the current candidate. * Or, do we need more rules? */ continue; replace: ia_best = ia; best_scope = (new_scope >= 0 ? new_scope : in6_addrscope(&ia_best->ia_addr.sin6_addr)); best_policy = (new_policy ? new_policy : lookup_addrsel_policy(&ia_best->ia_addr)); best_matchlen = (new_matchlen >= 0 ? new_matchlen : in6_matchlen(&ia_best->ia_addr.sin6_addr, &dst)); next: continue; out: break; } if ((ia = ia_best) == NULL) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); IP6STAT_INC(ip6s_sources_none); return (EADDRNOTAVAIL); } /* * At this point at least one of the addresses belonged to the jail * but it could still be that we want to further restrict it, e.g. * theoretically IN6_IS_ADDR_LOOPBACK. * It must not be IN6_IS_ADDR_UNSPECIFIED anymore. * prison_local_ip6() will fix an IN6_IS_ADDR_LOOPBACK but should * let all others previously selected pass. * Use tmp to not change ::1 on lo0 to the primary jail address.
*/ tmp = ia->ia_addr.sin6_addr; if (cred != NULL && prison_local_ip6(cred, &tmp, (inp != NULL && (inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)) != 0) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); IP6STAT_INC(ip6s_sources_none); return (EADDRNOTAVAIL); } if (ifpp) *ifpp = ifp; bcopy(&tmp, srcp, sizeof(*srcp)); if (ia->ia_ifp == ifp) - IP6STAT_INC(ip6s_sources_sameif[best_scope]); + IP6STAT_INC2(ip6s_sources_sameif, best_scope); else - IP6STAT_INC(ip6s_sources_otherif[best_scope]); + IP6STAT_INC2(ip6s_sources_otherif, best_scope); if (dst_scope == best_scope) - IP6STAT_INC(ip6s_sources_samescope[best_scope]); + IP6STAT_INC2(ip6s_sources_samescope, best_scope); else - IP6STAT_INC(ip6s_sources_otherscope[best_scope]); + IP6STAT_INC2(ip6s_sources_otherscope, best_scope); if (IFA6_IS_DEPRECATED(ia)) - IP6STAT_INC(ip6s_sources_deprecated[best_scope]); + IP6STAT_INC2(ip6s_sources_deprecated, best_scope); IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); } /* * Select source address based on @inp, @dstsock and @opts. * Stores selected address to @srcp. If @scope_ambiguous is set, * embed scope from selected outgoing interface. If @hlim pointer * is provided, stores calculated hop limit there. * Returns 0 on success. */ int in6_selectsrc_socket(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct inpcb *inp, struct ucred *cred, int scope_ambiguous, struct in6_addr *srcp, int *hlim) { struct ifnet *retifp; uint32_t fibnum; int error; fibnum = inp->inp_inc.inc_fibnum; retifp = NULL; error = in6_selectsrc(fibnum, dstsock, opts, inp, cred, &retifp, srcp); if (error != 0) return (error); if (hlim != NULL) *hlim = in6_selecthlim(inp, retifp); if (retifp == NULL || scope_ambiguous == 0) return (0); /* * Application should provide a proper zone ID or the use of * default zone IDs should be enabled. Unfortunately, some * applications do not behave as they should, so we need a * workaround. Even if an appropriate ID is not determined * (when it's required), if we can determine the outgoing * interface, determine the zone ID based on the interface. */ error = in6_setscope(&dstsock->sin6_addr, retifp, NULL); return (error); } /* * Select source address based on @fibnum, @dst and @scopeid. * Stores selected address to @srcp. * Returns 0 on success. * * Used by non-socket based consumers (ND code mostly) */ int in6_selectsrc_addr(uint32_t fibnum, const struct in6_addr *dst, uint32_t scopeid, struct ifnet *ifp, struct in6_addr *srcp, int *hlim) { struct ifnet *retifp; struct sockaddr_in6 dst_sa; int error; retifp = ifp; bzero(&dst_sa, sizeof(dst_sa)); dst_sa.sin6_family = AF_INET6; dst_sa.sin6_len = sizeof(dst_sa); dst_sa.sin6_addr = *dst; dst_sa.sin6_scope_id = scopeid; sa6_embedscope(&dst_sa, 0); error = in6_selectsrc(fibnum, &dst_sa, NULL, NULL, NULL, &retifp, srcp); if (hlim != NULL) *hlim = in6_selecthlim(NULL, retifp); return (error); } static struct nhop_object * cache_route(uint32_t fibnum, const struct sockaddr_in6 *dst, struct route_in6 *ro, uint32_t flowid) { /* * Use a cached route if it exists and is valid, else try to allocate * a new one. Note that we should check the address family of the * cached destination, in case of sharing the cache with IPv4. * Assumes that 'struct route_in6' is exclusively locked.
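* Without the family check, a cache shared with IPv4 could leave stale sockaddr_in bytes in ro_dst and the IN6_ARE_ADDR_EQUAL() comparison below would read them as an IPv6 destination.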
*/ if (ro->ro_nh != NULL && ( !NH_IS_VALID(ro->ro_nh) || ro->ro_dst.sin6_family != AF_INET6 || !IN6_ARE_ADDR_EQUAL(&ro->ro_dst.sin6_addr, &dst->sin6_addr))) RO_NHFREE(ro); if (ro->ro_nh == NULL) { ro->ro_dst = *dst; const struct in6_addr *paddr; struct in6_addr unscoped_addr; uint32_t scopeid = 0; if (IN6_IS_SCOPE_LINKLOCAL(&dst->sin6_addr)) { in6_splitscope(&dst->sin6_addr, &unscoped_addr, &scopeid); paddr = &unscoped_addr; } else paddr = &dst->sin6_addr; ro->ro_nh = fib6_lookup(fibnum, paddr, scopeid, NHR_REF, flowid); } return (ro->ro_nh); } static struct nhop_object * lookup_route(uint32_t fibnum, struct sockaddr_in6 *dst, struct route_in6 *ro, struct ip6_pktopts *opts, uint32_t flowid) { struct nhop_object *nh = NULL; /* * If the next hop address for the packet is specified by the caller, * use it as the gateway. */ if (opts && opts->ip6po_nexthop) { struct route_in6 *ron = &opts->ip6po_nextroute; struct sockaddr_in6 *sin6_next = satosin6(opts->ip6po_nexthop); nh = cache_route(fibnum, sin6_next, ron, flowid); /* * The node identified by that address must be a * neighbor of the sending host. */ if (nh != NULL && (nh->nh_flags & NHF_GATEWAY) != 0) nh = NULL; } else if (ro != NULL) { nh = cache_route(fibnum, dst, ro, flowid); if (nh == NULL) return (NULL); /* * Check if the outgoing interface conflicts with * the interface specified by ipi6_ifindex (if specified). */ struct in6_pktinfo *pi; if (opts && (pi = opts->ip6po_pktinfo) != NULL && pi->ipi6_ifindex) { if (nh->nh_aifp->if_index != pi->ipi6_ifindex) nh = NULL; } } return (nh); } /* * Finds outgoing nexthop or the outgoing interface for the * @dstsock. * Returns 0 on success and stores the lookup result in @retnh and @retifp */ static int selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct ip6_moptions *mopts, struct route_in6 *ro, struct ifnet **retifp, struct nhop_object **retnh, int norouteok, u_int fibnum, uint32_t flowid) { int error = 0; struct ifnet *ifp = NULL; struct in6_pktinfo *pi = NULL; struct in6_addr *dst = &dstsock->sin6_addr; /* If the caller specifies the outgoing interface explicitly, use it. */ if (opts && (pi = opts->ip6po_pktinfo) != NULL && pi->ipi6_ifindex) { /* XXX boundary check is assumed to be already done. */ ifp = ifnet_byindex(pi->ipi6_ifindex); if (ifp != NULL && (norouteok || IN6_IS_ADDR_MULTICAST(dst))) { /* * we do not have to check or get the route for * multicast. */ goto done; } else goto getroute; } /* * If the destination address is a multicast address and the outgoing * interface for the address is specified by the caller, use it. */ if (IN6_IS_ADDR_MULTICAST(dst) && mopts != NULL && (ifp = mopts->im6o_multicast_ifp) != NULL) { goto done; /* we do not need a route for multicast. */ } /* * If destination address is LLA or link- or node-local multicast, * use its embedded scope zone id to determine outgoing interface. */ if (IN6_IS_ADDR_MC_LINKLOCAL(dst) || IN6_IS_ADDR_MC_NODELOCAL(dst)) { uint32_t zoneid = ntohs(in6_getscope(dst)); if (zoneid > 0) { ifp = in6_getlinkifnet(zoneid); goto done; } } getroute:; struct nhop_object *nh = lookup_route(fibnum, dstsock, ro, opts, flowid); if (nh != NULL) { *retifp = nh->nh_aifp; error = 0; } else { *retifp = NULL; IP6STAT_INC(ip6s_noroute); error = EHOSTUNREACH; } *retnh = nh; return (error); done: if (ifp == NULL) { /* * This can happen if the caller did not pass a cached route * nor any other hints. We treat this case as an error.
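* (One way to get here: a link-local multicast destination whose embedded zone id no longer maps to an interface, so in6_getlinkifnet() above returned NULL.)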
*/ error = EHOSTUNREACH; } if (error == EHOSTUNREACH) IP6STAT_INC(ip6s_noroute); *retifp = ifp; *retnh = NULL; return (error); } static int in6_selectif(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct ip6_moptions *mopts, struct ifnet **retifp, struct ifnet *oifp, u_int fibnum) { int error; struct route_in6 sro; struct nhop_object *nh = NULL; uint16_t nh_flags; KASSERT(retifp != NULL, ("%s: retifp is NULL", __func__)); bzero(&sro, sizeof(sro)); nh_flags = 0; error = selectroute(dstsock, opts, mopts, &sro, retifp, &nh, 1, fibnum, 0); if (nh != NULL) nh_flags = nh->nh_flags; if (nh != NULL && nh == sro.ro_nh) NH_FREE(nh); if (error != 0) { /* Help ND. See oifp comment in in6_selectsrc(). */ if (oifp != NULL && fibnum == RT_DEFAULT_FIB) { *retifp = oifp; error = 0; } return (error); } /* * do not use a rejected or black hole route. * XXX: this check should be done in the L2 output routine. * However, if we skipped this check here, we'd see the following * scenario: * - install a rejected route for a scoped address prefix * (like fe80::/10) * - send a packet to a destination that matches the scoped prefix, * with ambiguity about the scope zone. * - pick the outgoing interface from the route, and disambiguate the * scope zone with the interface. * - ip6_output() would try to get another route with the "new" * destination, which may be valid. * - we'd see no error on output. * Although this may not be very harmful, it should still be confusing. * We thus reject the case here. */ if (nh_flags & (NHF_REJECT | NHF_BLACKHOLE)) { error = (nh_flags & NHF_HOST ? EHOSTUNREACH : ENETUNREACH); return (error); } return (0); } /* Public wrapper function to selectroute(). */ int in6_selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, struct ip6_moptions *mopts, struct route_in6 *ro, struct ifnet **retifp, struct nhop_object **retnh, u_int fibnum, uint32_t flowid) { MPASS(retifp != NULL); MPASS(retnh != NULL); return (selectroute(dstsock, opts, mopts, ro, retifp, retnh, 0, fibnum, flowid)); } /* * Default hop limit selection. The precedence is as follows: * 1. Hoplimit value specified via ioctl. * 2. (If the outgoing interface is detected) the current * hop limit of the interface specified by router advertisement. * 3. The system default hoplimit. */ int in6_selecthlim(struct inpcb *inp, struct ifnet *ifp) { if (inp && inp->in6p_hops >= 0) return (inp->in6p_hops); else if (ifp) return (ND_IFINFO(ifp)->chlim); else if (inp && !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { struct nhop_object *nh; struct in6_addr dst; uint32_t fibnum, scopeid; int hlim; fibnum = inp->inp_inc.inc_fibnum; in6_splitscope(&inp->in6p_faddr, &dst, &scopeid); nh = fib6_lookup(fibnum, &dst, scopeid, 0, 0); if (nh != NULL) { hlim = ND_IFINFO(nh->nh_ifp)->chlim; return (hlim); } } return (V_ip6_defhlim); } void addrsel_policy_init(void) { init_policy_queue(); /* initialize the "last resort" policy */ bzero(&V_defaultaddrpolicy, sizeof(V_defaultaddrpolicy)); V_defaultaddrpolicy.label = ADDR_LABEL_NOTAPP; if (!IS_DEFAULT_VNET(curvnet)) return; ADDRSEL_LOCK_INIT(); ADDRSEL_SXLOCK_INIT(); } static struct in6_addrpolicy * lookup_addrsel_policy(struct sockaddr_in6 *key) { struct in6_addrpolicy *match = NULL; ADDRSEL_LOCK(); match = match_addrsel_policy(key); if (match == NULL) match = &V_defaultaddrpolicy; else match->use++; ADDRSEL_UNLOCK(); return (match); } /* * Subroutines to manage the address selection policy table via sysctl. 
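* The table is exported read-only through the IPV6CTL_ADDRCTLPOLICY sysctl node defined below and is modified through the SIOCAADDRCTL_POLICY / SIOCDADDRCTL_POLICY ioctls handled in in6_src_ioctl(); ip6addrctl(8) is the usual consumer of both interfaces.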
*/ struct walkarg { struct sysctl_req *w_req; }; static int in6_src_sysctl(SYSCTL_HANDLER_ARGS); SYSCTL_DECL(_net_inet6_ip6); static SYSCTL_NODE(_net_inet6_ip6, IPV6CTL_ADDRCTLPOLICY, addrctlpolicy, CTLFLAG_RD | CTLFLAG_MPSAFE, in6_src_sysctl, ""); static int in6_src_sysctl(SYSCTL_HANDLER_ARGS) { struct walkarg w; if (req->newptr) return EPERM; bzero(&w, sizeof(w)); w.w_req = req; return (walk_addrsel_policy(dump_addrsel_policyent, &w)); } int in6_src_ioctl(u_long cmd, caddr_t data) { struct in6_addrpolicy ent0; if (cmd != SIOCAADDRCTL_POLICY && cmd != SIOCDADDRCTL_POLICY) return (EOPNOTSUPP); /* check for safety */ ent0 = *(struct in6_addrpolicy *)data; if (ent0.label == ADDR_LABEL_NOTAPP) return (EINVAL); /* check if the prefix mask is consecutive. */ if (in6_mask2len(&ent0.addrmask.sin6_addr, NULL) < 0) return (EINVAL); /* clear trailing garbage (if any) of the prefix address. */ IN6_MASK_ADDR(&ent0.addr.sin6_addr, &ent0.addrmask.sin6_addr); ent0.use = 0; switch (cmd) { case SIOCAADDRCTL_POLICY: return (add_addrsel_policyent(&ent0)); case SIOCDADDRCTL_POLICY: return (delete_addrsel_policyent(&ent0)); } return (0); /* XXX: compromise compilers */ } /* * The following is an implementation of the policy table using a * simple tail queue. * XXX such details should be hidden. * XXX an implementation using a binary tree should be more efficient. */ struct addrsel_policyent { TAILQ_ENTRY(addrsel_policyent) ape_entry; struct in6_addrpolicy ape_policy; }; TAILQ_HEAD(addrsel_policyhead, addrsel_policyent); VNET_DEFINE_STATIC(struct addrsel_policyhead, addrsel_policytab); #define V_addrsel_policytab VNET(addrsel_policytab) static void init_policy_queue(void) { TAILQ_INIT(&V_addrsel_policytab); } static int add_addrsel_policyent(struct in6_addrpolicy *newpolicy) { struct addrsel_policyent *new, *pol; new = malloc(sizeof(*new), M_IFADDR, M_WAITOK); ADDRSEL_XLOCK(); ADDRSEL_LOCK(); /* duplication check */ TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) { if (IN6_ARE_ADDR_EQUAL(&newpolicy->addr.sin6_addr, &pol->ape_policy.addr.sin6_addr) && IN6_ARE_ADDR_EQUAL(&newpolicy->addrmask.sin6_addr, &pol->ape_policy.addrmask.sin6_addr)) { ADDRSEL_UNLOCK(); ADDRSEL_XUNLOCK(); free(new, M_IFADDR); return (EEXIST); /* or override it?
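(for now a caller has to remove the old entry with SIOCDADDRCTL_POLICY before installing a replacement.)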
*/ } } bzero(new, sizeof(*new)); /* XXX: should validate entry */ new->ape_policy = *newpolicy; TAILQ_INSERT_TAIL(&V_addrsel_policytab, new, ape_entry); ADDRSEL_UNLOCK(); ADDRSEL_XUNLOCK(); return (0); } static int delete_addrsel_policyent(struct in6_addrpolicy *key) { struct addrsel_policyent *pol; ADDRSEL_XLOCK(); ADDRSEL_LOCK(); /* search for the entry in the table */ TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) { if (IN6_ARE_ADDR_EQUAL(&key->addr.sin6_addr, &pol->ape_policy.addr.sin6_addr) && IN6_ARE_ADDR_EQUAL(&key->addrmask.sin6_addr, &pol->ape_policy.addrmask.sin6_addr)) { break; } } if (pol == NULL) { ADDRSEL_UNLOCK(); ADDRSEL_XUNLOCK(); return (ESRCH); } TAILQ_REMOVE(&V_addrsel_policytab, pol, ape_entry); ADDRSEL_UNLOCK(); ADDRSEL_XUNLOCK(); free(pol, M_IFADDR); return (0); } static int walk_addrsel_policy(int (*callback)(struct in6_addrpolicy *, void *), void *w) { struct addrsel_policyent *pol; int error = 0; ADDRSEL_SLOCK(); TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) { if ((error = (*callback)(&pol->ape_policy, w)) != 0) { ADDRSEL_SUNLOCK(); return (error); } } ADDRSEL_SUNLOCK(); return (error); } static int dump_addrsel_policyent(struct in6_addrpolicy *pol, void *arg) { int error = 0; struct walkarg *w = arg; error = SYSCTL_OUT(w->w_req, pol, sizeof(*pol)); return (error); } static struct in6_addrpolicy * match_addrsel_policy(struct sockaddr_in6 *key) { struct addrsel_policyent *pent; struct in6_addrpolicy *bestpol = NULL, *pol; int matchlen, bestmatchlen = -1; u_char *mp, *ep, *k, *p, m; TAILQ_FOREACH(pent, &V_addrsel_policytab, ape_entry) { matchlen = 0; pol = &pent->ape_policy; mp = (u_char *)&pol->addrmask.sin6_addr; ep = mp + 16; /* XXX: scope field? */ k = (u_char *)&key->sin6_addr; p = (u_char *)&pol->addr.sin6_addr; for (; mp < ep && *mp; mp++, k++, p++) { m = *mp; if ((*k & m) != *p) goto next; /* not match */ if (m == 0xff) /* short cut for a typical case */ matchlen += 8; else { while (m >= 0x80) { matchlen++; m <<= 1; } } } /* matched. check if this is better than the current best. */ if (bestpol == NULL || matchlen > bestmatchlen) { bestpol = pol; bestmatchlen = matchlen; } next: continue; } return (bestpol); } diff --git a/sys/netinet6/ip6_input.c b/sys/netinet6/ip6_input.c index 11b92c152a1a..ec819a12628d 100644 --- a/sys/netinet6/ip6_input.c +++ b/sys/netinet6/ip6_input.c @@ -1,1723 +1,1723 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: ip6_input.c,v 1.259 2002/01/21 04:58:09 jinmei Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_route.h" #include "opt_rss.h" #include "opt_sctp.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif /* INET */ #include #include #include #include #include #include #include #include #include #include #include #ifdef SCTP #include #include #endif #include ip6proto_input_t *ip6_protox[IPPROTO_MAX] = { [0 ... IPPROTO_MAX - 1] = rip6_input }; ip6proto_ctlinput_t *ip6_ctlprotox[IPPROTO_MAX] = { [0 ... 
IPPROTO_MAX - 1] = rip6_ctlinput }; VNET_DEFINE(struct in6_ifaddrhead, in6_ifaddrhead); VNET_DEFINE(struct in6_ifaddrlisthead *, in6_ifaddrhashtbl); VNET_DEFINE(u_long, in6_ifaddrhmask); static struct netisr_handler ip6_nh = { .nh_name = "ip6", .nh_handler = ip6_input, .nh_proto = NETISR_IPV6, #ifdef RSS .nh_m2cpuid = rss_soft_m2cpuid_v6, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, #else .nh_policy = NETISR_POLICY_FLOW, #endif }; static int sysctl_netinet6_intr_queue_maxlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&ip6_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&ip6_nh, qlimit)); } SYSCTL_DECL(_net_inet6_ip6); SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_INTRQMAXLEN, intr_queue_maxlen, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_netinet6_intr_queue_maxlen, "I", "Maximum size of the IPv6 input queue"); VNET_DEFINE_STATIC(bool, ip6_sav) = true; #define V_ip6_sav VNET(ip6_sav) SYSCTL_BOOL(_net_inet6_ip6, OID_AUTO, source_address_validation, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_sav), true, "Drop incoming packets with source address that is a local address"); #ifdef RSS static struct netisr_handler ip6_direct_nh = { .nh_name = "ip6_direct", .nh_handler = ip6_direct_input, .nh_proto = NETISR_IPV6_DIRECT, .nh_m2cpuid = rss_soft_m2cpuid_v6, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, }; static int sysctl_netinet6_intr_direct_queue_maxlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&ip6_direct_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&ip6_direct_nh, qlimit)); } SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_INTRDQMAXLEN, intr_direct_queue_maxlen, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_netinet6_intr_direct_queue_maxlen, "I", "Maximum size of the IPv6 direct input queue"); #endif VNET_DEFINE(pfil_head_t, inet6_pfil_head); VNET_DEFINE(pfil_head_t, inet6_local_pfil_head); VNET_PCPUSTAT_DEFINE(struct ip6stat, ip6stat); VNET_PCPUSTAT_SYSINIT(ip6stat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(ip6stat); #endif /* VIMAGE */ struct rmlock in6_ifaddr_lock; RM_SYSINIT(in6_ifaddr_lock, &in6_ifaddr_lock, "in6_ifaddr_lock"); static int ip6_hopopts_input(u_int32_t *, u_int32_t *, struct mbuf **, int *); /* * IP6 initialization: fill in IP6 protocol switch table. * All protocols not implemented in kernel go to raw IP6 protocol handler. */ static void ip6_vnet_init(void *arg __unused) { struct pfil_head_args args; TUNABLE_INT_FETCH("net.inet6.ip6.auto_linklocal", &V_ip6_auto_linklocal); TUNABLE_INT_FETCH("net.inet6.ip6.accept_rtadv", &V_ip6_accept_rtadv); TUNABLE_INT_FETCH("net.inet6.ip6.no_radr", &V_ip6_no_radr); CK_STAILQ_INIT(&V_in6_ifaddrhead); V_in6_ifaddrhashtbl = hashinit(IN6ADDR_NHASH, M_IFADDR, &V_in6_ifaddrhmask); /* Initialize packet filter hooks. 
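* Two heads are registered below: PFIL_INET6_NAME for the regular input/output path and PFIL_INET6_LOCAL_NAME for packets about to be delivered locally; packet filters such as pf(4) and ipfw(4) attach to them by these names.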
*/ args.pa_version = PFIL_VERSION; args.pa_flags = PFIL_IN | PFIL_OUT; args.pa_type = PFIL_TYPE_IP6; args.pa_headname = PFIL_INET6_NAME; V_inet6_pfil_head = pfil_head_register(&args); args.pa_flags = PFIL_OUT; args.pa_headname = PFIL_INET6_LOCAL_NAME; V_inet6_local_pfil_head = pfil_head_register(&args); if (hhook_head_register(HHOOK_TYPE_IPSEC_IN, AF_INET6, &V_ipsec_hhh_in[HHOOK_IPSEC_INET6], HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register input helper hook\n", __func__); if (hhook_head_register(HHOOK_TYPE_IPSEC_OUT, AF_INET6, &V_ipsec_hhh_out[HHOOK_IPSEC_INET6], HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register output helper hook\n", __func__); scope6_init(); addrsel_policy_init(); nd6_init(); frag6_init(); V_ip6_desync_factor = arc4random() % MAX_TEMP_DESYNC_FACTOR; /* Skip global initialization stuff for non-default instances. */ #ifdef VIMAGE netisr_register_vnet(&ip6_nh); #ifdef RSS netisr_register_vnet(&ip6_direct_nh); #endif #endif } VNET_SYSINIT(ip6_vnet_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, ip6_vnet_init, NULL); static void ip6_init(void *arg __unused) { /* * Register statically those protocols that are unlikely to ever go * dynamic. */ IP6PROTO_REGISTER(IPPROTO_ICMPV6, icmp6_input, rip6_ctlinput); IP6PROTO_REGISTER(IPPROTO_DSTOPTS, dest6_input, NULL); IP6PROTO_REGISTER(IPPROTO_ROUTING, route6_input, NULL); IP6PROTO_REGISTER(IPPROTO_FRAGMENT, frag6_input, NULL); IP6PROTO_REGISTER(IPPROTO_IPV4, encap6_input, NULL); IP6PROTO_REGISTER(IPPROTO_IPV6, encap6_input, NULL); IP6PROTO_REGISTER(IPPROTO_ETHERIP, encap6_input, NULL); IP6PROTO_REGISTER(IPPROTO_GRE, encap6_input, NULL); IP6PROTO_REGISTER(IPPROTO_PIM, encap6_input, NULL); #ifdef SCTP /* XXX: has a loadable & static version */ IP6PROTO_REGISTER(IPPROTO_SCTP, sctp6_input, sctp6_ctlinput); #endif EVENTHANDLER_REGISTER(vm_lowmem, frag6_drain, NULL, LOWMEM_PRI_DEFAULT); EVENTHANDLER_REGISTER(mbuf_lowmem, frag6_drain, NULL, LOWMEM_PRI_DEFAULT); netisr_register(&ip6_nh); #ifdef RSS netisr_register(&ip6_direct_nh); #endif } SYSINIT(ip6_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ip6_init, NULL); int ip6proto_register(uint8_t proto, ip6proto_input_t input, ip6proto_ctlinput_t ctl) { MPASS(proto > 0); if (ip6_protox[proto] == rip6_input) { ip6_protox[proto] = input; ip6_ctlprotox[proto] = ctl; return (0); } else return (EEXIST); } int ip6proto_unregister(uint8_t proto) { MPASS(proto > 0); if (ip6_protox[proto] != rip6_input) { ip6_protox[proto] = rip6_input; ip6_ctlprotox[proto] = rip6_ctlinput; return (0); } else return (ENOENT); } #ifdef VIMAGE static void ip6_destroy(void *unused __unused) { struct ifaddr *ifa, *nifa; struct ifnet *ifp; int error; #ifdef RSS netisr_unregister_vnet(&ip6_direct_nh); #endif netisr_unregister_vnet(&ip6_nh); pfil_head_unregister(V_inet6_pfil_head); error = hhook_head_deregister(V_ipsec_hhh_in[HHOOK_IPSEC_INET6]); if (error != 0) { printf("%s: WARNING: unable to deregister input helper hook " "type HHOOK_TYPE_IPSEC_IN, id HHOOK_IPSEC_INET6: " "error %d returned\n", __func__, error); } error = hhook_head_deregister(V_ipsec_hhh_out[HHOOK_IPSEC_INET6]); if (error != 0) { printf("%s: WARNING: unable to deregister output helper hook " "type HHOOK_TYPE_IPSEC_OUT, id HHOOK_IPSEC_INET6: " "error %d returned\n", __func__, error); } /* Cleanup addresses. */ IFNET_RLOCK(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* Cannot lock here - lock recursion. 
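* (Presumably in6_purgeaddr() acquires the interface address lock itself on some paths, so holding IF_ADDR_LOCK across this loop would recurse.)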
*/ /* IF_ADDR_LOCK(ifp); */ CK_STAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, nifa) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; in6_purgeaddr(ifa); } /* IF_ADDR_UNLOCK(ifp); */ in6_ifdetach_destroy(ifp); mld_domifdetach(ifp); } IFNET_RUNLOCK(); /* Make sure any routes are gone as well. */ rib_flush_routes_family(AF_INET6); frag6_destroy(); nd6_destroy(); in6_ifattach_destroy(); hashdestroy(V_in6_ifaddrhashtbl, M_IFADDR, V_in6_ifaddrhmask); } VNET_SYSUNINIT(inet6, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ip6_destroy, NULL); #endif static int ip6_input_hbh(struct mbuf **mp, uint32_t *plen, uint32_t *rtalert, int *off, int *nxt, int *ours) { struct mbuf *m; struct ip6_hdr *ip6; struct ip6_hbh *hbh; if (ip6_hopopts_input(plen, rtalert, mp, off)) { #if 0 /*touches NULL pointer*/ in6_ifstat_inc((*mp)->m_pkthdr.rcvif, ifs6_in_discard); #endif goto out; /* m has already been freed */ } /* adjust pointer */ m = *mp; ip6 = mtod(m, struct ip6_hdr *); /* * if the payload length field is 0 and the next header field * indicates Hop-by-Hop Options header, then a Jumbo Payload * option MUST be included. */ if (ip6->ip6_plen == 0 && *plen == 0) { /* * Note that if a valid jumbo payload option is * contained, ip6_hopopts_input() must set a valid * (non-zero) payload length in the variable plen. */ IP6STAT_INC(ip6s_badoptions); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, (caddr_t)&ip6->ip6_plen - (caddr_t)ip6); goto out; } /* ip6_hopopts_input() ensures that mbuf is contiguous */ hbh = (struct ip6_hbh *)(ip6 + 1); *nxt = hbh->ip6h_nxt; /* * If we are acting as a router and the packet contains a * router alert option, see if we know the option value. * Currently, we only support the option value for MLD, in which * case we should pass the packet to the multicast routing * daemon. */ if (*rtalert != ~0) { switch (*rtalert) { case IP6OPT_RTALERT_MLD: if (V_ip6_forwarding) *ours = 1; break; default: /* * RFC2711 requires that unrecognized values be * silently ignored. */ break; } } return (0); out: return (1); } #ifdef RSS /* * IPv6 direct input routine. * * This is called when reinjecting completed fragments where * all of the previous checking and book-keeping has been done. */ void ip6_direct_input(struct mbuf *m) { int off, nxt; int nest; struct m_tag *mtag; struct ip6_direct_ctx *ip6dc; mtag = m_tag_locate(m, MTAG_ABI_IPV6, IPV6_TAG_DIRECT, NULL); KASSERT(mtag != NULL, ("Reinjected packet w/o direct ctx tag!")); ip6dc = (struct ip6_direct_ctx *)(mtag + 1); nxt = ip6dc->ip6dc_nxt; off = ip6dc->ip6dc_off; nest = 0; m_tag_delete(m, mtag); while (nxt != IPPROTO_DONE) { if (V_ip6_hdrnestlimit && (++nest > V_ip6_hdrnestlimit)) { IP6STAT_INC(ip6s_toomanyhdr); goto bad; } /* * protection against faulty packet - there should be * more sanity checks in header chain processing.
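* (e.g. an extension header length that advanced 'off' beyond the end of the datagram; the m_pkthdr.len check below catches exactly that.)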
*/ if (m->m_pkthdr.len < off) { IP6STAT_INC(ip6s_tooshort); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); goto bad; } #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (IPSEC_ENABLED(ipv6)) { if (IPSEC_INPUT(ipv6, m, off, nxt) != 0) return; } #endif /* IPSEC */ nxt = ip6_protox[nxt](&m, &off, nxt); } return; bad: m_freem(m); } #endif void ip6_input(struct mbuf *m) { struct in6_addr odst; struct ip6_hdr *ip6; struct in6_ifaddr *ia; struct ifnet *rcvif; u_int32_t plen; u_int32_t rtalert = ~0; int off = sizeof(struct ip6_hdr), nest; int nxt, ours = 0; int srcrt = 0; /* * Drop the packet if IPv6 operation is disabled on the interface. */ rcvif = m->m_pkthdr.rcvif; if ((ND_IFINFO(rcvif)->flags & ND6_IFF_IFDISABLED)) goto bad; #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * should the inner packet be considered authentic? * see comment in ah4_input(). * NB: m cannot be NULL when passed to the input routine */ m->m_flags &= ~M_AUTHIPHDR; m->m_flags &= ~M_AUTHIPDGM; #endif /* IPSEC */ if (m->m_flags & M_FASTFWD_OURS) { /* * Firewall changed destination to local. */ ip6 = mtod(m, struct ip6_hdr *); goto passin; } /* * mbuf statistics */ if (m->m_flags & M_EXT) { if (m->m_next) IP6STAT_INC(ip6s_mext2m); else IP6STAT_INC(ip6s_mext1); } else { if (m->m_next) { struct ifnet *ifp = (m->m_flags & M_LOOP) ? V_loif : rcvif; int ifindex = ifp->if_index; if (ifindex >= IP6S_M2MMAX) ifindex = 0; - IP6STAT_INC(ip6s_m2m[ifindex]); + IP6STAT_INC2(ip6s_m2m, ifindex); } else IP6STAT_INC(ip6s_m1); } in6_ifstat_inc(rcvif, ifs6_in_receive); IP6STAT_INC(ip6s_total); /* * L2 bridge code and some other code can return mbuf chain * that does not conform to KAME requirement. too bad. * XXX: fails to join if interface MTU > MCLBYTES. jumbogram? */ if (m && m->m_next != NULL && m->m_pkthdr.len < MCLBYTES) { struct mbuf *n; if (m->m_pkthdr.len > MHLEN) n = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else n = m_gethdr(M_NOWAIT, MT_DATA); if (n == NULL) goto bad; m_move_pkthdr(n, m); m_copydata(m, 0, n->m_pkthdr.len, mtod(n, caddr_t)); n->m_len = n->m_pkthdr.len; m_freem(m); m = n; } if (m->m_len < sizeof(struct ip6_hdr)) { if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) { IP6STAT_INC(ip6s_toosmall); in6_ifstat_inc(rcvif, ifs6_in_hdrerr); goto bad; } } ip6 = mtod(m, struct ip6_hdr *); if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { IP6STAT_INC(ip6s_badvers); in6_ifstat_inc(rcvif, ifs6_in_hdrerr); goto bad; } - IP6STAT_INC(ip6s_nxthist[ip6->ip6_nxt]); + IP6STAT_INC2(ip6s_nxthist, ip6->ip6_nxt); IP_PROBE(receive, NULL, NULL, ip6, rcvif, NULL, ip6); /* * Check against address spoofing/corruption. */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src) || IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst)) { /* * XXX: "badscope" is not very suitable for a multicast source. */ IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(rcvif, ifs6_in_addrerr); goto bad; } if (IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst) && !(m->m_flags & M_LOOP)) { /* * In this case, the packet should come from the loopback * interface. However, we cannot just check the if_flags, * because ip6_mloopback() passes the "actual" interface * as the outgoing/incoming interface. */ IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(rcvif, ifs6_in_addrerr); goto bad; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && IPV6_ADDR_MC_SCOPE(&ip6->ip6_dst) == 0) { /* * RFC4291 2.7: * Nodes must not originate a packet to a multicast address * whose scop field contains the reserved value 0; if such * a packet is received, it must be silently dropped. 
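* For example, ff00::1 (scop 0) is dropped here, while ff01::1 and ff02::1 remain the legitimate interface- and link-local all-nodes groups.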
*/ IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(rcvif, ifs6_in_addrerr); goto bad; } /* * The following check is not documented in specs. A malicious * party may be able to use IPv4 mapped addr to confuse tcp/udp stack * and bypass security checks (act as if it was from 127.0.0.1 by using * IPv6 src ::ffff:127.0.0.1). Be cautious. * * We have supported IPv6-only kernels for a few years and this issue * has not come up. The world seems to move mostly towards not using * v4mapped on the wire, so it makes sense for us to keep rejecting * any such packets. */ if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(rcvif, ifs6_in_addrerr); goto bad; } #if 0 /* * Reject packets with IPv4 compatible addresses (auto tunnel). * * The code forbids auto tunnel relay case in RFC1933 (the check is * stronger than RFC1933). We may want to re-enable it if mech-xx * is revised to forbid relaying case. */ if (IN6_IS_ADDR_V4COMPAT(&ip6->ip6_src) || IN6_IS_ADDR_V4COMPAT(&ip6->ip6_dst)) { IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } #endif /* * Try to forward the packet, but if we fail continue. * ip6_tryforward() does not generate redirects, so fall * through to normal processing if redirects are required. * ip6_tryforward() does inbound and outbound packet firewall * processing. If firewall has decided that destination becomes * our local address, it sets M_FASTFWD_OURS flag. In this * case skip another inbound firewall processing and update * ip6 pointer. */ if (V_ip6_forwarding != 0 && V_ip6_sendredirects == 0 #if defined(IPSEC) || defined(IPSEC_SUPPORT) && (!IPSEC_ENABLED(ipv6) || IPSEC_CAPS(ipv6, m, IPSEC_CAP_OPERABLE) == 0) #endif ) { if ((m = ip6_tryforward(m)) == NULL) return; if (m->m_flags & M_FASTFWD_OURS) { ip6 = mtod(m, struct ip6_hdr *); goto passin; } } #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * Bypass packet filtering for packets previously handled by IPsec. */ if (IPSEC_ENABLED(ipv6) && IPSEC_CAPS(ipv6, m, IPSEC_CAP_BYPASS_FILTER) != 0) goto passin; #endif /* * Run through list of hooks for input packets. * * NB: Beware of the destination address changing * (e.g. by NAT rewriting). When this happens, * tell ip6_forward to do the right thing. */ /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED_IN(V_inet6_pfil_head)) goto passin; odst = ip6->ip6_dst; if (pfil_mbuf_in(V_inet6_pfil_head, &m, m->m_pkthdr.rcvif, NULL) != PFIL_PASS) return; ip6 = mtod(m, struct ip6_hdr *); srcrt = !IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst); if ((m->m_flags & (M_IP6_NEXTHOP | M_FASTFWD_OURS)) == M_IP6_NEXTHOP && m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL) { /* * Directly ship the packet on. This allows forwarding * packets originally destined to us to some other directly * connected host. */ ip6_forward(m, 1); return; } passin: /* * Disambiguate address scope zones (if there is ambiguity). * We first make sure that the original source or destination address * is not in our internal form for scoped addresses. Such addresses * are not necessarily invalid spec-wise, but we cannot accept them due * to the usage conflict. * in6_setscope() then also checks and rejects the cases where src or * dst are the loopback address and the receiving interface * is not loopback. 
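* The internal form embeds the zone id in the second 16-bit word of a link-local address, e.g. fe80:4::1 is the kernel's representation of fe80::1%4; such addresses must never appear on the wire, hence the in6_clearscope() rejection below.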
*/ if (in6_clearscope(&ip6->ip6_src) || in6_clearscope(&ip6->ip6_dst)) { IP6STAT_INC(ip6s_badscope); /* XXX */ goto bad; } if (in6_setscope(&ip6->ip6_src, rcvif, NULL) || in6_setscope(&ip6->ip6_dst, rcvif, NULL)) { IP6STAT_INC(ip6s_badscope); goto bad; } if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; ours = 1; goto hbhcheck; } /* * Multicast check. Assume packet is for us to avoid * prematurely taking locks. */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { ours = 1; in6_ifstat_inc(rcvif, ifs6_in_mcast); goto hbhcheck; } /* * Unicast check * XXX: For now we keep link-local IPv6 addresses with embedded * scope zone id, therefore we use zero zoneid here. */ ia = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */, false); if (ia != NULL) { if (ia->ia6_flags & IN6_IFF_NOTREADY) { char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufd[INET6_ADDRSTRLEN]; /* address is not ready, so discard the packet. */ nd6log((LOG_INFO, "ip6_input: packet to an unready address %s->%s\n", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst))); goto bad; } if (V_ip6_sav && !(m->m_flags & M_LOOP) && __predict_false(in6_localip_fib(&ip6->ip6_src, rcvif->if_fib))) { IP6STAT_INC(ip6s_badscope); /* XXX */ goto bad; } /* Count the packet in the ip address stats */ counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); ours = 1; goto hbhcheck; } /* * Now there is no reason to process the packet if it's not our own * and we're not a router. */ if (!V_ip6_forwarding) { IP6STAT_INC(ip6s_cantforward); goto bad; } hbhcheck: /* * Process Hop-by-Hop options header if it's contained. * m may be modified in ip6_hopopts_input(). * If a JumboPayload option is included, plen will also be modified. */ plen = (u_int32_t)ntohs(ip6->ip6_plen); if (ip6->ip6_nxt == IPPROTO_HOPOPTS) { if (ip6_input_hbh(&m, &plen, &rtalert, &off, &nxt, &ours) != 0) return; } else nxt = ip6->ip6_nxt; /* * Use mbuf flags to propagate Router Alert option to * ICMPv6 layer, as hop-by-hop options have been stripped. */ if (rtalert != ~0) m->m_flags |= M_RTALERT_MLD; /* * Check that the amount of data in the buffers * is at least as much as the IPv6 header would have us expect. * Trim mbufs if longer than we expect. * Drop packet if shorter than we expect. */ if (m->m_pkthdr.len - sizeof(struct ip6_hdr) < plen) { IP6STAT_INC(ip6s_tooshort); in6_ifstat_inc(rcvif, ifs6_in_truncated); goto bad; } if (m->m_pkthdr.len > sizeof(struct ip6_hdr) + plen) { if (m->m_len == m->m_pkthdr.len) { m->m_len = sizeof(struct ip6_hdr) + plen; m->m_pkthdr.len = sizeof(struct ip6_hdr) + plen; } else m_adj(m, sizeof(struct ip6_hdr) + plen - m->m_pkthdr.len); } /* * Forward if desirable. */ if (V_ip6_mrouter && IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { /* * If we are acting as a multicast router, all * incoming multicast packets are passed to the * kernel-level multicast forwarding function. * The packet is returned (relatively) intact; if * ip6_mforward() returns a non-zero value, the packet * must be discarded, else it may be accepted below. * * XXX TODO: Check hlim and multicast scope here to avoid * unnecessarily calling into ip6_mforward(). */ if (ip6_mforward && ip6_mforward(ip6, rcvif, m)) { IP6STAT_INC(ip6s_cantforward); goto bad; } } else if (!ours) { ip6_forward(m, srcrt); return; } /* * We are going to ship the packet to the local protocol stack. Call the * filter again for this 'output' action, allowing redirect-like rules * to adjust the source address.
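* This second pass runs on the head registered as PFIL_INET6_LOCAL_NAME in ip6_vnet_init(), so rules can treat locally delivered traffic differently from forwarded traffic.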
*/ if (PFIL_HOOKED_OUT(V_inet6_local_pfil_head)) { if (pfil_mbuf_out(V_inet6_local_pfil_head, &m, V_loif, NULL) != PFIL_PASS) return; ip6 = mtod(m, struct ip6_hdr *); } /* * Tell launch routine the next header */ IP6STAT_INC(ip6s_delivered); in6_ifstat_inc(rcvif, ifs6_in_deliver); nest = 0; while (nxt != IPPROTO_DONE) { if (V_ip6_hdrnestlimit && (++nest > V_ip6_hdrnestlimit)) { IP6STAT_INC(ip6s_toomanyhdr); goto bad; } /* * protection against faulty packet - there should be * more sanity checks in header chain processing. */ if (m->m_pkthdr.len < off) { IP6STAT_INC(ip6s_tooshort); in6_ifstat_inc(rcvif, ifs6_in_truncated); goto bad; } #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (IPSEC_ENABLED(ipv6)) { if (IPSEC_INPUT(ipv6, m, off, nxt) != 0) return; } #endif /* IPSEC */ nxt = ip6_protox[nxt](&m, &off, nxt); } return; bad: in6_ifstat_inc(rcvif, ifs6_in_discard); if (m != NULL) m_freem(m); } /* * Hop-by-Hop options header processing. If a valid jumbo payload option is * included, the real payload length will be stored in plenp. * * rtalertp - XXX: should be stored in a smarter way */ static int ip6_hopopts_input(u_int32_t *plenp, u_int32_t *rtalertp, struct mbuf **mp, int *offp) { struct mbuf *m = *mp; int off = *offp, hbhlen; struct ip6_hbh *hbh; /* validation of the length of the header */ if (m->m_len < off + sizeof(*hbh)) { m = m_pullup(m, off + sizeof(*hbh)); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); *mp = NULL; return (-1); } } hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off); hbhlen = (hbh->ip6h_len + 1) << 3; if (m->m_len < off + hbhlen) { m = m_pullup(m, off + hbhlen); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); *mp = NULL; return (-1); } } hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off); off += hbhlen; hbhlen -= sizeof(struct ip6_hbh); if (ip6_process_hopopts(m, (u_int8_t *)hbh + sizeof(struct ip6_hbh), hbhlen, rtalertp, plenp) < 0) { *mp = NULL; return (-1); } *offp = off; *mp = m; return (0); } /* * Search header for all Hop-by-hop options and process each option. * This function is separate from ip6_hopopts_input() in order to * handle a case where the sending node itself processes its hop-by-hop * options header. In such a case, the function is called from ip6_output(). * * The function assumes that the hbh header is located right after the IPv6 header * (RFC2460 p7), that opthead is a pointer into the data content in m, and that opthead to * opthead + hbhlen is located in a contiguous memory region.
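* * Options are TLV-encoded: a type octet, a length octet counting only the value, then the value itself; PAD1 is the lone exception, a single 0x00 octet. A router alert requesting MLD handling, for instance, is the four octets 0x05 0x02 0x00 0x00 (RFC 2711).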
*/ int ip6_process_hopopts(struct mbuf *m, u_int8_t *opthead, int hbhlen, u_int32_t *rtalertp, u_int32_t *plenp) { struct ip6_hdr *ip6; int optlen = 0; u_int8_t *opt = opthead; u_int16_t rtalert_val; u_int32_t jumboplen; const int erroff = sizeof(struct ip6_hdr) + sizeof(struct ip6_hbh); for (; hbhlen > 0; hbhlen -= optlen, opt += optlen) { switch (*opt) { case IP6OPT_PAD1: optlen = 1; break; case IP6OPT_PADN: if (hbhlen < IP6OPT_MINLEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } optlen = *(opt + 1) + 2; break; case IP6OPT_ROUTER_ALERT: /* XXX may need check for alignment */ if (hbhlen < IP6OPT_RTALERT_LEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } if (*(opt + 1) != IP6OPT_RTALERT_LEN - 2) { /* XXX stat */ icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 1 - opthead); return (-1); } optlen = IP6OPT_RTALERT_LEN; bcopy((caddr_t)(opt + 2), (caddr_t)&rtalert_val, 2); *rtalertp = ntohs(rtalert_val); break; case IP6OPT_JUMBO: /* XXX may need check for alignment */ if (hbhlen < IP6OPT_JUMBO_LEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } if (*(opt + 1) != IP6OPT_JUMBO_LEN - 2) { /* XXX stat */ icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 1 - opthead); return (-1); } optlen = IP6OPT_JUMBO_LEN; /* * IPv6 packets that have non 0 payload length * must not contain a jumbo payload option. */ ip6 = mtod(m, struct ip6_hdr *); if (ip6->ip6_plen) { IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt - opthead); return (-1); } /* * We may see jumbolen in unaligned location, so * we'd need to perform bcopy(). */ bcopy(opt + 2, &jumboplen, sizeof(jumboplen)); jumboplen = (u_int32_t)htonl(jumboplen); #if 1 /* * if there are multiple jumbo payload options, * *plenp will be non-zero and the packet will be * rejected. * the behavior may need some debate in ipngwg - * multiple options do not make sense, however, * there's no explicit mention in specification. */ if (*plenp != 0) { IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 2 - opthead); return (-1); } #endif /* * jumbo payload length must be larger than 65535. */ if (jumboplen <= IPV6_MAXPACKET) { IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 2 - opthead); return (-1); } *plenp = jumboplen; break; default: /* unknown option */ if (hbhlen < IP6OPT_MINLEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } optlen = ip6_unknown_opt(opt, m, erroff + opt - opthead); if (optlen == -1) return (-1); optlen += 2; break; } } return (0); bad: m_freem(m); return (-1); } /* * Unknown option processing. * The third argument `off' is the offset from the IPv6 header to the option, * which is necessary to return an ICMPv6 error when the IPv6 header and the * option header are not contiguous.
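* The required action is encoded in the two high-order bits of the option type (RFC 8200, section 4.2): 00 skip, 01 discard silently, 10 discard and send an ICMPv6 Parameter Problem, 11 the same but only if the destination is not multicast; IP6OPT_TYPE() extracts those bits in the switch below.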
*/ int ip6_unknown_opt(u_int8_t *optp, struct mbuf *m, int off) { struct ip6_hdr *ip6; switch (IP6OPT_TYPE(*optp)) { case IP6OPT_TYPE_SKIP: /* ignore the option */ return ((int)*(optp + 1)); case IP6OPT_TYPE_DISCARD: /* silently discard */ m_freem(m); return (-1); case IP6OPT_TYPE_FORCEICMP: /* send ICMP even if multicasted */ IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off); return (-1); case IP6OPT_TYPE_ICMP: /* send ICMP if not multicasted */ IP6STAT_INC(ip6s_badoptions); ip6 = mtod(m, struct ip6_hdr *); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || (m->m_flags & (M_BCAST|M_MCAST))) m_freem(m); else icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off); return (-1); } m_freem(m); /* XXX: NOTREACHED */ return (-1); } /* * Create the "control" list for this pcb. * These functions will not modify the mbuf chain at all. * * The routine will be called from upper layer handlers like tcp6_input(). * Thus the routine assumes that the caller (tcp6_input) has already * called m_pullup() and all the extension headers are located in the * very first mbuf on the mbuf chain. * * ip6_savecontrol_v4 will handle those options that can be * set on a v4-mapped socket. * ip6_savecontrol will directly call ip6_savecontrol_v4 to handle those * options and handle the v6-only ones itself. */ struct mbuf ** ip6_savecontrol_v4(struct inpcb *inp, struct mbuf *m, struct mbuf **mp, int *v4only) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); #ifdef SO_TIMESTAMP if ((inp->inp_socket->so_options & SO_TIMESTAMP) != 0) { union { struct timeval tv; struct bintime bt; struct timespec ts; } t; struct bintime boottimebin, bt1; struct timespec ts1; bool stamped; stamped = false; switch (inp->inp_socket->so_ts_clock) { case SO_TS_REALTIME_MICRO: if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { mbuf_tstmp2timespec(m, &ts1); timespec2bintime(&ts1, &bt1); getboottimebin(&boottimebin); bintime_add(&bt1, &boottimebin); bintime2timeval(&bt1, &t.tv); } else { microtime(&t.tv); } *mp = sbcreatecontrol(&t.tv, sizeof(t.tv), SCM_TIMESTAMP, SOL_SOCKET, M_NOWAIT); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } break; case SO_TS_BINTIME: if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { mbuf_tstmp2timespec(m, &ts1); timespec2bintime(&ts1, &t.bt); getboottimebin(&boottimebin); bintime_add(&t.bt, &boottimebin); } else { bintime(&t.bt); } *mp = sbcreatecontrol(&t.bt, sizeof(t.bt), SCM_BINTIME, SOL_SOCKET, M_NOWAIT); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } break; case SO_TS_REALTIME: if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { mbuf_tstmp2timespec(m, &t.ts); getboottimebin(&boottimebin); bintime2timespec(&boottimebin, &ts1); timespecadd(&t.ts, &ts1, &t.ts); } else { nanotime(&t.ts); } *mp = sbcreatecontrol(&t.ts, sizeof(t.ts), SCM_REALTIME, SOL_SOCKET, M_NOWAIT); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } break; case SO_TS_MONOTONIC: if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) mbuf_tstmp2timespec(m, &t.ts); else nanouptime(&t.ts); *mp = sbcreatecontrol(&t.ts, sizeof(t.ts), SCM_MONOTONIC, SOL_SOCKET, M_NOWAIT); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } break; default: panic("unknown (corrupted) so_ts_clock"); } if (stamped && (m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { struct sock_timestamp_info sti; bzero(&sti, sizeof(sti)); sti.st_info_flags = ST_INFO_HW; if ((m->m_flags & M_TSTMP_HPREC) != 0) sti.st_info_flags |=
ST_INFO_HW_HPREC; *mp = sbcreatecontrol(&sti, sizeof(sti), SCM_TIME_INFO, SOL_SOCKET, M_NOWAIT); if (*mp != NULL) mp = &(*mp)->m_next; } } #endif #define IS2292(inp, x, y) (((inp)->inp_flags & IN6P_RFC2292) ? (x) : (y)) /* RFC 2292 sec. 5 */ if ((inp->inp_flags & IN6P_PKTINFO) != 0) { struct in6_pktinfo pi6; if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { #ifdef INET struct ip *ip; ip = mtod(m, struct ip *); pi6.ipi6_addr.s6_addr32[0] = 0; pi6.ipi6_addr.s6_addr32[1] = 0; pi6.ipi6_addr.s6_addr32[2] = IPV6_ADDR_INT32_SMP; pi6.ipi6_addr.s6_addr32[3] = ip->ip_dst.s_addr; #else /* We won't hit this code */ bzero(&pi6.ipi6_addr, sizeof(struct in6_addr)); #endif } else { bcopy(&ip6->ip6_dst, &pi6.ipi6_addr, sizeof(struct in6_addr)); in6_clearscope(&pi6.ipi6_addr); /* XXX */ } pi6.ipi6_ifindex = (m && m->m_pkthdr.rcvif) ? m->m_pkthdr.rcvif->if_index : 0; *mp = sbcreatecontrol(&pi6, sizeof(struct in6_pktinfo), IS2292(inp, IPV6_2292PKTINFO, IPV6_PKTINFO), IPPROTO_IPV6, M_NOWAIT); if (*mp) mp = &(*mp)->m_next; } if ((inp->inp_flags & IN6P_HOPLIMIT) != 0) { int hlim; if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { #ifdef INET struct ip *ip; ip = mtod(m, struct ip *); hlim = ip->ip_ttl; #else /* We won't hit this code */ hlim = 0; #endif } else { hlim = ip6->ip6_hlim & 0xff; } *mp = sbcreatecontrol(&hlim, sizeof(int), IS2292(inp, IPV6_2292HOPLIMIT, IPV6_HOPLIMIT), IPPROTO_IPV6, M_NOWAIT); if (*mp) mp = &(*mp)->m_next; } if ((inp->inp_flags & IN6P_TCLASS) != 0) { int tclass; if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { #ifdef INET struct ip *ip; ip = mtod(m, struct ip *); tclass = ip->ip_tos; #else /* We won't hit this code */ tclass = 0; #endif } else { u_int32_t flowinfo; flowinfo = (u_int32_t)ntohl(ip6->ip6_flow & IPV6_FLOWINFO_MASK); flowinfo >>= 20; tclass = flowinfo & 0xff; } *mp = sbcreatecontrol(&tclass, sizeof(int), IPV6_TCLASS, IPPROTO_IPV6, M_NOWAIT); if (*mp) mp = &(*mp)->m_next; } if (v4only != NULL) { if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { *v4only = 1; } else { *v4only = 0; } } return (mp); } void ip6_savecontrol(struct inpcb *inp, struct mbuf *m, struct mbuf **mp) { struct ip6_hdr *ip6; int v4only = 0; mp = ip6_savecontrol_v4(inp, m, mp, &v4only); if (v4only) return; ip6 = mtod(m, struct ip6_hdr *); /* * IPV6_HOPOPTS socket option. Recall that we required super-user * privilege for the option (see ip6_ctloutput), but it might be too * strict, since there might be some hop-by-hop options which can be * returned to a normal user. * See also RFC 2292 section 6 (or RFC 3542 section 8). */ if ((inp->inp_flags & IN6P_HOPOPTS) != 0) { /* * Check if a hop-by-hop options header is contained in the * received packet, and if so, store the options as ancillary * data. Note that a hop-by-hop options header must be * just after the IPv6 header, which is assured through the * IPv6 input processing. */ if (ip6->ip6_nxt == IPPROTO_HOPOPTS) { struct ip6_hbh *hbh; u_int hbhlen; hbh = (struct ip6_hbh *)(ip6 + 1); hbhlen = (hbh->ip6h_len + 1) << 3; /* * XXX: We copy the whole header even if a * jumbo payload option is included, the option which * is to be removed before returning according to * RFC2292.
* Note: this constraint is removed in RFC3542 */ *mp = sbcreatecontrol(hbh, hbhlen, IS2292(inp, IPV6_2292HOPOPTS, IPV6_HOPOPTS), IPPROTO_IPV6, M_NOWAIT); if (*mp) mp = &(*mp)->m_next; } } if ((inp->inp_flags & (IN6P_RTHDR | IN6P_DSTOPTS)) != 0) { int nxt = ip6->ip6_nxt, off = sizeof(struct ip6_hdr); /* * Search for destination options headers or routing * header(s) through the header chain, and store each * header as ancillary data. * Note that the order of the headers remains in * the chain of ancillary data. */ while (1) { /* is explicit loop prevention necessary? */ struct ip6_ext *ip6e = NULL; u_int elen; /* * if it is not an extension header, don't try to * pull it from the chain. */ switch (nxt) { case IPPROTO_DSTOPTS: case IPPROTO_ROUTING: case IPPROTO_HOPOPTS: case IPPROTO_AH: /* is it possible? */ break; default: goto loopend; } if (off + sizeof(*ip6e) > m->m_len) goto loopend; ip6e = (struct ip6_ext *)(mtod(m, caddr_t) + off); if (nxt == IPPROTO_AH) elen = (ip6e->ip6e_len + 2) << 2; else elen = (ip6e->ip6e_len + 1) << 3; if (off + elen > m->m_len) goto loopend; switch (nxt) { case IPPROTO_DSTOPTS: if (!(inp->inp_flags & IN6P_DSTOPTS)) break; *mp = sbcreatecontrol(ip6e, elen, IS2292(inp, IPV6_2292DSTOPTS, IPV6_DSTOPTS), IPPROTO_IPV6, M_NOWAIT); if (*mp) mp = &(*mp)->m_next; break; case IPPROTO_ROUTING: if (!(inp->inp_flags & IN6P_RTHDR)) break; *mp = sbcreatecontrol(ip6e, elen, IS2292(inp, IPV6_2292RTHDR, IPV6_RTHDR), IPPROTO_IPV6, M_NOWAIT); if (*mp) mp = &(*mp)->m_next; break; case IPPROTO_HOPOPTS: case IPPROTO_AH: /* is it possible? */ break; default: /* * other cases have been filtered in the above. * none will visit this case. here we supply * the code just in case (nxt overwritten or * other cases). */ goto loopend; } /* proceed with the next header. */ off += elen; nxt = ip6e->ip6e_nxt; ip6e = NULL; } loopend: ; } if (inp->inp_flags2 & INP_RECVFLOWID) { uint32_t flowid, flow_type; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); /* * XXX should handle the failure of one or the * other - don't populate both? */ *mp = sbcreatecontrol(&flowid, sizeof(uint32_t), IPV6_FLOWID, IPPROTO_IPV6, M_NOWAIT); if (*mp) mp = &(*mp)->m_next; *mp = sbcreatecontrol(&flow_type, sizeof(uint32_t), IPV6_FLOWTYPE, IPPROTO_IPV6, M_NOWAIT); if (*mp) mp = &(*mp)->m_next; } #ifdef RSS if (inp->inp_flags2 & INP_RECVRSSBUCKETID) { uint32_t flowid, flow_type; uint32_t rss_bucketid; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); if (rss_hash2bucket(flowid, flow_type, &rss_bucketid) == 0) { *mp = sbcreatecontrol(&rss_bucketid, sizeof(uint32_t), IPV6_RSSBUCKETID, IPPROTO_IPV6, M_NOWAIT); if (*mp) mp = &(*mp)->m_next; } } #endif } #undef IS2292 void ip6_notify_pmtu(struct inpcb *inp, struct sockaddr_in6 *dst, u_int32_t mtu) { struct socket *so; struct mbuf *m_mtu; struct ip6_mtuinfo mtuctl; KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); /* * Notify the error by sending IPV6_PATHMTU ancillary data if the * application wanted to know the MTU value. * NOTE: we notify disconnected sockets, because some udp * applications keep sending on disconnected sockets. * NOTE: our implementation doesn't notify connected sockets that have * a foreign address different from the given destination address * (this is permitted by RFC 3542).
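* * A hedged sketch of the receiving side (error handling omitted): the application opts in with "int on = 1; (void)setsockopt(s, IPPROTO_IPV6, IPV6_RECVPATHMTU, &on, sizeof(on));" and then finds a struct ip6_mtuinfo delivered as IPV6_PATHMTU ancillary data by the next recvmsg(2).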
*/ if ((inp->inp_flags & IN6P_MTU) == 0 || ( !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &dst->sin6_addr))) return; mtuctl.ip6m_mtu = mtu; mtuctl.ip6m_addr = *dst; if (sa6_recoverscope(&mtuctl.ip6m_addr)) return; if ((m_mtu = sbcreatecontrol(&mtuctl, sizeof(mtuctl), IPV6_PATHMTU, IPPROTO_IPV6, M_NOWAIT)) == NULL) return; so = inp->inp_socket; if (sbappendaddr(&so->so_rcv, (struct sockaddr *)dst, NULL, m_mtu) == 0) { soroverflow(so); m_freem(m_mtu); /* XXX: should count statistics */ } else sorwakeup(so); } /* * Get pointer to the previous header followed by the header * currently processed. */ int ip6_get_prevhdr(const struct mbuf *m, int off) { struct ip6_ext ip6e; struct ip6_hdr *ip6; int len, nlen, nxt; if (off == sizeof(struct ip6_hdr)) return (offsetof(struct ip6_hdr, ip6_nxt)); if (off < sizeof(struct ip6_hdr)) panic("%s: off < sizeof(struct ip6_hdr)", __func__); ip6 = mtod(m, struct ip6_hdr *); nxt = ip6->ip6_nxt; len = sizeof(struct ip6_hdr); nlen = 0; while (len < off) { m_copydata(m, len, sizeof(ip6e), (caddr_t)&ip6e); switch (nxt) { case IPPROTO_FRAGMENT: nlen = sizeof(struct ip6_frag); break; case IPPROTO_AH: nlen = (ip6e.ip6e_len + 2) << 2; break; default: nlen = (ip6e.ip6e_len + 1) << 3; } len += nlen; nxt = ip6e.ip6e_nxt; } return (len - nlen); } /* * get next header offset. m will be retained. */ int ip6_nexthdr(const struct mbuf *m, int off, int proto, int *nxtp) { struct ip6_hdr ip6; struct ip6_ext ip6e; struct ip6_frag fh; /* just in case */ if (m == NULL) panic("ip6_nexthdr: m == NULL"); if ((m->m_flags & M_PKTHDR) == 0 || m->m_pkthdr.len < off) return -1; switch (proto) { case IPPROTO_IPV6: if (m->m_pkthdr.len < off + sizeof(ip6)) return -1; m_copydata(m, off, sizeof(ip6), (caddr_t)&ip6); if (nxtp) *nxtp = ip6.ip6_nxt; off += sizeof(ip6); return off; case IPPROTO_FRAGMENT: /* * terminate parsing if it is not the first fragment, * it does not make sense to parse through it. */ if (m->m_pkthdr.len < off + sizeof(fh)) return -1; m_copydata(m, off, sizeof(fh), (caddr_t)&fh); /* IP6F_OFF_MASK = 0xfff8(BigEndian), 0xf8ff(LittleEndian) */ if (fh.ip6f_offlg & IP6F_OFF_MASK) return -1; if (nxtp) *nxtp = fh.ip6f_nxt; off += sizeof(struct ip6_frag); return off; case IPPROTO_AH: if (m->m_pkthdr.len < off + sizeof(ip6e)) return -1; m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); if (nxtp) *nxtp = ip6e.ip6e_nxt; off += (ip6e.ip6e_len + 2) << 2; return off; case IPPROTO_HOPOPTS: case IPPROTO_ROUTING: case IPPROTO_DSTOPTS: if (m->m_pkthdr.len < off + sizeof(ip6e)) return -1; m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); if (nxtp) *nxtp = ip6e.ip6e_nxt; off += (ip6e.ip6e_len + 1) << 3; return off; case IPPROTO_NONE: case IPPROTO_ESP: case IPPROTO_IPCOMP: /* give up */ return -1; default: return -1; } /* NOTREACHED */ } /* * get offset for the last header in the chain. m will be kept untainted. */ int ip6_lasthdr(const struct mbuf *m, int off, int proto, int *nxtp) { int newoff; int nxt; if (!nxtp) { nxt = -1; nxtp = &nxt; } while (1) { newoff = ip6_nexthdr(m, off, proto, nxtp); if (newoff < 0) return off; else if (newoff < off) return -1; /* invalid */ else if (newoff == off) return newoff; off = newoff; proto = *nxtp; } } diff --git a/sys/netinet6/ip6_var.h b/sys/netinet6/ip6_var.h index 014710f9bceb..1aa170f6ed2b 100644 --- a/sys/netinet6/ip6_var.h +++ b/sys/netinet6/ip6_var.h @@ -1,475 +1,484 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: ip6_var.h,v 1.62 2001/05/03 14:51:48 itojun Exp $ */ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _NETINET6_IP6_VAR_H_ #define _NETINET6_IP6_VAR_H_ #include #ifdef _KERNEL struct ip6asfrag; /* frag6.c */ TAILQ_HEAD(ip6fraghead, ip6asfrag); /* * IP6 reassembly queue structure. Each fragment * being reassembled is attached to one of these structures. 
*/ struct ip6q { struct ip6fraghead ip6q_frags; u_int32_t ip6q_ident; u_int8_t ip6q_nxt; u_int8_t ip6q_ecn; u_int16_t ip6q_ttl; struct in6_addr ip6q_src, ip6q_dst; TAILQ_ENTRY(ip6q) ip6q_tq; int ip6q_unfrglen; /* len of unfragmentable part */ int ip6q_nfrag; /* # of fragments */ struct label *ip6q_label; }; #endif /* _KERNEL */ /* * IP6 reinjecting structure. */ struct ip6_direct_ctx { uint32_t ip6dc_nxt; /* next header to process */ uint32_t ip6dc_off; /* offset to next header */ }; #if defined(_NETINET6_IN6_VAR_H_) && defined(_KERNEL) /* * Structure attached to inpcb.in6p_moptions and * passed to ip6_output when IPv6 multicast options are in use. * This structure is lazy-allocated. */ struct ip6_moptions { struct ifnet *im6o_multicast_ifp; /* ifp for outgoing multicasts */ u_char im6o_multicast_hlim; /* hoplimit for outgoing multicasts */ u_char im6o_multicast_loop; /* 1 => hear sends if a member */ struct ip6_mfilter_head im6o_head; /* group membership list */ }; #else struct ip6_moptions; #endif /* * Control options for outgoing packets */ /* Routing header related info */ struct ip6po_rhinfo { struct ip6_rthdr *ip6po_rhi_rthdr; /* Routing header */ struct route_in6 ip6po_rhi_route; /* Route to the 1st hop */ }; #define ip6po_rthdr ip6po_rhinfo.ip6po_rhi_rthdr #define ip6po_route ip6po_rhinfo.ip6po_rhi_route /* Nexthop related info */ struct ip6po_nhinfo { struct sockaddr *ip6po_nhi_nexthop; struct route_in6 ip6po_nhi_route; /* Route to the nexthop */ }; #define ip6po_nexthop ip6po_nhinfo.ip6po_nhi_nexthop #define ip6po_nextroute ip6po_nhinfo.ip6po_nhi_route /* * Note that fields with valid data must be flagged in ip6po_valid. * This is done to reduce cache misses in ip6_output(). Before * ip6po_valid was introduced, ip6_output() had to check each of the * individual fields of ip6_pktopts itself, and they are spread * across 4 cachelines. ip6_output() is currently the only consumer of * these flags, as it is in the critical path of every packet sent. */ struct ip6_pktopts { uint32_t ip6po_valid; #define IP6PO_VALID_HLIM 0x0001 #define IP6PO_VALID_PKTINFO 0x0002 #define IP6PO_VALID_NHINFO 0x0004 #define IP6PO_VALID_HBH 0x0008 #define IP6PO_VALID_DEST1 0x0010 #define IP6PO_VALID_RHINFO 0x0020 #define IP6PO_VALID_DEST2 0x0040 #define IP6PO_VALID_TC 0x0080 int ip6po_hlim; /* Hoplimit for outgoing packets */ int ip6po_tclass; /* traffic class */ int ip6po_minmtu; /* fragment vs PMTU discovery policy */ #define IP6PO_MINMTU_MCASTONLY -1 /* default; send at min MTU for multicast */ #define IP6PO_MINMTU_DISABLE 0 /* always perform pmtu disc */ #define IP6PO_MINMTU_ALL 1 /* always send at min MTU */ int ip6po_prefer_tempaddr; /* whether temporary addresses are preferred as source address */ #define IP6PO_TEMPADDR_SYSTEM -1 /* follow the system default */ #define IP6PO_TEMPADDR_NOTPREFER 0 /* do not prefer temporary addresses */ #define IP6PO_TEMPADDR_PREFER 1 /* prefer temporary addresses */ int ip6po_flags; #if 0 /* parameters in this block are obsolete. do not reuse the values. */ #define IP6PO_REACHCONF 0x01 /* upper-layer reachability confirmation.
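 * (Aside, an illustrative sketch that is not part of this header: the
 * ip6po_valid word above is meant to be tested before any option field
 * is dereferenced, e.g. on the output path:
 *
 *	if (opt != NULL && (opt->ip6po_valid & IP6PO_VALID_HLIM) != 0)
 *		hlim = opt->ip6po_hlim;
 *
 * so the common no-options case costs one word in one cacheline rather
 * than a walk across all four cachelines of struct ip6_pktopts.)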
*/ #define IP6PO_MINMTU 0x02 /* use minimum MTU (IPV6_USE_MIN_MTU) */ #endif #define IP6PO_DONTFRAG 0x04 /* disable fragmentation (IPV6_DONTFRAG) */ #define IP6PO_USECOA 0x08 /* use care of address */ struct mbuf *ip6po_m; /* Pointer to mbuf storing the data */ /* Outgoing IF/address information */ struct in6_pktinfo *ip6po_pktinfo; /* Next-hop address information */ struct ip6po_nhinfo ip6po_nhinfo; struct ip6_hbh *ip6po_hbh; /* Hop-by-Hop options header */ /* Destination options header (before a routing header) */ struct ip6_dest *ip6po_dest1; /* Routing header related info. */ struct ip6po_rhinfo ip6po_rhinfo; /* Destination options header (after a routing header) */ struct ip6_dest *ip6po_dest2; }; /* * Control options for incoming packets */ struct ip6stat { uint64_t ip6s_total; /* total packets received */ uint64_t ip6s_tooshort; /* packet too short */ uint64_t ip6s_toosmall; /* not enough data */ uint64_t ip6s_fragments; /* fragments received */ uint64_t ip6s_fragdropped; /* frags dropped(dups, out of space) */ uint64_t ip6s_fragtimeout; /* fragments timed out */ uint64_t ip6s_fragoverflow; /* fragments that exceeded limit */ uint64_t ip6s_forward; /* packets forwarded */ uint64_t ip6s_cantforward; /* packets rcvd for unreachable dest */ uint64_t ip6s_redirectsent; /* packets forwarded on same net */ uint64_t ip6s_delivered; /* datagrams delivered to upper level*/ uint64_t ip6s_localout; /* total ip packets generated here */ uint64_t ip6s_odropped; /* lost packets due to nobufs, etc. */ uint64_t ip6s_reassembled; /* total packets reassembled ok */ uint64_t ip6s_atomicfrags; /* atomic fragments */ uint64_t ip6s_fragmented; /* datagrams successfully fragmented */ uint64_t ip6s_ofragments; /* output fragments created */ uint64_t ip6s_cantfrag; /* don't fragment flag was set, etc. */ uint64_t ip6s_badoptions; /* error in option processing */ uint64_t ip6s_noroute; /* packets discarded due to no route */ uint64_t ip6s_badvers; /* ip6 version != 6 */ uint64_t ip6s_rawout; /* total raw ip packets generated */ uint64_t ip6s_badscope; /* scope error */ uint64_t ip6s_notmember; /* don't join this multicast group */ #define IP6S_HDRCNT 256 /* headers count */ uint64_t ip6s_nxthist[IP6S_HDRCNT]; /* next header history */ uint64_t ip6s_m1; /* one mbuf */ #define IP6S_M2MMAX 32 uint64_t ip6s_m2m[IP6S_M2MMAX]; /* two or more mbuf */ uint64_t ip6s_mext1; /* one ext mbuf */ uint64_t ip6s_mext2m; /* two or more ext mbuf */ uint64_t ip6s_exthdrtoolong; /* ext hdr are not contiguous */ uint64_t ip6s_nogif; /* no match gif found */ uint64_t ip6s_toomanyhdr; /* discarded due to too many headers */ /* * statistics for improvement of the source address selection * algorithm: * XXX: hardcoded 16 = # of ip6 multicast scope types + 1 */ #define IP6S_RULESMAX 16 #define IP6S_SCOPECNT 16 /* number of times that address selection fails */ uint64_t ip6s_sources_none; /* number of times that an address on the outgoing I/F is chosen */ uint64_t ip6s_sources_sameif[IP6S_SCOPECNT]; /* number of times that an address on a non-outgoing I/F is chosen */ uint64_t ip6s_sources_otherif[IP6S_SCOPECNT]; /* * number of times that an address that has the same scope * from the destination is chosen. */ uint64_t ip6s_sources_samescope[IP6S_SCOPECNT]; /* * number of times that an address that has a different scope * from the destination is chosen. 
*/ uint64_t ip6s_sources_otherscope[IP6S_SCOPECNT]; /* number of times that a deprecated address is chosen */ uint64_t ip6s_sources_deprecated[IP6S_SCOPECNT]; /* number of times that each rule of source selection is applied. */ uint64_t ip6s_sources_rule[IP6S_RULESMAX]; }; #ifdef _KERNEL #include +#include VNET_PCPUSTAT_DECLARE(struct ip6stat, ip6stat); -#define IP6STAT_ADD(name, val) \ - VNET_PCPUSTAT_ADD(struct ip6stat, ip6stat, name, (val)) -#define IP6STAT_SUB(name, val) IP6STAT_ADD(name, -(val)) +#define IP6STAT_ADD(name, val) \ + do { \ + MIB_SDT_PROBE1(ip6, count, name, (val)); \ + VNET_PCPUSTAT_ADD(struct ip6stat, ip6stat, name, (val)); \ + } while (0) +#define IP6STAT_SUB(name, val) IP6STAT_ADD(name, -(val)) #define IP6STAT_INC(name) IP6STAT_ADD(name, 1) -#define IP6STAT_DEC(name) IP6STAT_SUB(name, 1) +#define IP6STAT_INC2(name, type) \ + do { \ + MIB_SDT_PROBE2(ip6, count, name, 1, type); \ + VNET_PCPUSTAT_ADD(struct ip6stat, ip6stat, name, 1); \ + } while (0) +#define IP6STAT_DEC(name) IP6STAT_SUB(name, 1) #endif #ifdef _KERNEL /* flags passed to ip6_output as last parameter */ #define IPV6_UNSPECSRC 0x01 /* allow :: as the source address */ #define IPV6_FORWARDING 0x02 /* most of IPv6 header exists */ #define IPV6_MINMTU 0x04 /* use minimum MTU (IPV6_USE_MIN_MTU) */ #ifdef __NO_STRICT_ALIGNMENT #define IP6_HDR_ALIGNED_P(ip) 1 #else #define IP6_HDR_ALIGNED_P(ip) ((((intptr_t) (ip)) & 3) == 0) #endif VNET_DECLARE(int, ip6_defhlim); /* default hop limit */ VNET_DECLARE(int, ip6_defmcasthlim); /* default multicast hop limit */ VNET_DECLARE(int, ip6_forwarding); /* act as router? */ VNET_DECLARE(int, ip6_use_deprecated); /* allow deprecated addr as source */ VNET_DECLARE(int, ip6_rr_prune); /* router renumbering prefix * walk list every 5 sec. */ VNET_DECLARE(int, ip6_mcast_pmtu); /* enable pMTU discovery for multicast? */ VNET_DECLARE(int, ip6_v6only); #define V_ip6_defhlim VNET(ip6_defhlim) #define V_ip6_defmcasthlim VNET(ip6_defmcasthlim) #define V_ip6_forwarding VNET(ip6_forwarding) #define V_ip6_use_deprecated VNET(ip6_use_deprecated) #define V_ip6_rr_prune VNET(ip6_rr_prune) #define V_ip6_mcast_pmtu VNET(ip6_mcast_pmtu) #define V_ip6_v6only VNET(ip6_v6only) VNET_DECLARE(struct socket *, ip6_mrouter); /* multicast routing daemon */ VNET_DECLARE(int, ip6_sendredirects); /* send IP redirects when forwarding? */ VNET_DECLARE(int, ip6_accept_rtadv); /* Acts as a host not a router */ VNET_DECLARE(int, ip6_no_radr); /* No defroute from RA */ VNET_DECLARE(int, ip6_norbit_raif); /* Disable R-bit in NA on RA * receiving IF. 
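 * (Aside on the IP6STAT_ADD()/IP6STAT_INC2() macros introduced above,
 * illustrative only: the do { ... } while (0) wrapper keeps the SDT
 * probe plus the counter update behaving as a single statement, so
 * code such as
 *
 *	if (error)
 *		IP6STAT_INC(ip6s_odropped);
 *	else
 *		IP6STAT_INC(ip6s_localout);
 *
 * still parses as intended; a bare block would break the if/else
 * association once the macro's trailing semicolon is added.)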
*/ VNET_DECLARE(int, ip6_rfc6204w3); /* Accept defroute from RA even when forwarding enabled */ VNET_DECLARE(int, ip6_hdrnestlimit); /* upper limit of # of extension * headers */ VNET_DECLARE(int, ip6_dad_count); /* DupAddrDetectionTransmits */ #define V_ip6_mrouter VNET(ip6_mrouter) #define V_ip6_sendredirects VNET(ip6_sendredirects) #define V_ip6_accept_rtadv VNET(ip6_accept_rtadv) #define V_ip6_no_radr VNET(ip6_no_radr) #define V_ip6_norbit_raif VNET(ip6_norbit_raif) #define V_ip6_rfc6204w3 VNET(ip6_rfc6204w3) #define V_ip6_hdrnestlimit VNET(ip6_hdrnestlimit) #define V_ip6_dad_count VNET(ip6_dad_count) VNET_DECLARE(int, ip6_auto_flowlabel); VNET_DECLARE(int, ip6_auto_linklocal); #define V_ip6_auto_flowlabel VNET(ip6_auto_flowlabel) #define V_ip6_auto_linklocal VNET(ip6_auto_linklocal) VNET_DECLARE(int, ip6_use_tempaddr); /* Whether to use temporary addresses */ VNET_DECLARE(int, ip6_prefer_tempaddr); /* Whether to prefer temporary * addresses in the source address * selection */ #define V_ip6_use_tempaddr VNET(ip6_use_tempaddr) #define V_ip6_prefer_tempaddr VNET(ip6_prefer_tempaddr) VNET_DECLARE(int, ip6_use_defzone); /* Whether to use the default scope * zone when unspecified */ #define V_ip6_use_defzone VNET(ip6_use_defzone) VNET_DECLARE(struct pfil_head *, inet6_pfil_head); #define V_inet6_pfil_head VNET(inet6_pfil_head) #define PFIL_INET6_NAME "inet6" VNET_DECLARE(struct pfil_head *, inet6_local_pfil_head); #define V_inet6_local_pfil_head VNET(inet6_local_pfil_head) #define PFIL_INET6_LOCAL_NAME "inet6-local" #ifdef IPSTEALTH VNET_DECLARE(int, ip6stealth); #define V_ip6stealth VNET(ip6stealth) #endif VNET_DECLARE(bool, ip6_log_cannot_forward); #define V_ip6_log_cannot_forward VNET(ip6_log_cannot_forward) extern struct pr_usrreqs rip6_usrreqs; struct sockopt; struct inpcb; struct ucred; int icmp6_ctloutput(struct socket *, struct sockopt *sopt); void ip6_input(struct mbuf *); void ip6_direct_input(struct mbuf *); void ip6_freepcbopts(struct ip6_pktopts *); int ip6_unknown_opt(u_int8_t *, struct mbuf *, int); int ip6_get_prevhdr(const struct mbuf *, int); int ip6_nexthdr(const struct mbuf *, int, int, int *); int ip6_lasthdr(const struct mbuf *, int, int, int *); extern int (*ip6_mforward)(struct ip6_hdr *, struct ifnet *, struct mbuf *); int ip6_process_hopopts(struct mbuf *, u_int8_t *, int, u_int32_t *, u_int32_t *); struct mbuf **ip6_savecontrol_v4(struct inpcb *, struct mbuf *, struct mbuf **, int *); void ip6_savecontrol(struct inpcb *, struct mbuf *, struct mbuf **); void ip6_notify_pmtu(struct inpcb *, struct sockaddr_in6 *, u_int32_t); int ip6_sysctl(int *, u_int, void *, size_t *, void *, size_t); void ip6_forward(struct mbuf *, int); void ip6_mloopback(struct ifnet *, struct mbuf *); int ip6_output(struct mbuf *, struct ip6_pktopts *, struct route_in6 *, int, struct ip6_moptions *, struct ifnet **, struct inpcb *); int ip6_ctloutput(struct socket *, struct sockopt *); int ip6_raw_ctloutput(struct socket *, struct sockopt *); void ip6_initpktopts(struct ip6_pktopts *); int ip6_setpktopts(struct mbuf *, struct ip6_pktopts *, struct ip6_pktopts *, struct ucred *, int); void ip6_clearpktopts(struct ip6_pktopts *, int); struct ip6_pktopts *ip6_copypktopts(struct ip6_pktopts *, int); int ip6_optlen(struct inpcb *); int ip6_deletefraghdr(struct mbuf *, int, int); int ip6_fragment(struct ifnet *, struct mbuf *, int, u_char, int, uint32_t); int route6_input(struct mbuf **, int *, int); void frag6_init(void); void frag6_destroy(void); int frag6_input(struct mbuf **, int *, int); void 
frag6_drain(void); void rip6_init(void); int rip6_ctloutput(struct socket *, struct sockopt *); int rip6_usrreq(struct socket *, int, struct mbuf *, struct mbuf *, struct mbuf *, struct thread *); int dest6_input(struct mbuf **, int *, int); int none_input(struct mbuf **, int *, int); int in6_selectsrc_socket(struct sockaddr_in6 *, struct ip6_pktopts *, struct inpcb *, struct ucred *, int, struct in6_addr *, int *); int in6_selectsrc_addr(uint32_t, const struct in6_addr *, uint32_t, struct ifnet *, struct in6_addr *, int *); int in6_selectroute(struct sockaddr_in6 *, struct ip6_pktopts *, struct ip6_moptions *, struct route_in6 *, struct ifnet **, struct nhop_object **, u_int, uint32_t); u_int32_t ip6_randomid(void); u_int32_t ip6_randomflowlabel(void); void in6_delayed_cksum(struct mbuf *m, uint32_t plen, u_short offset); int ip6_log_ratelimit(void); /* * Argument type for the last arg of ip6proto_ctlinput_t(). * * IPv6 ICMP IPv6 [exthdrs] finalhdr payload * ^ ^ ^ ^ * | | ip6c_ip6 ip6c_off * | ip6c_icmp6 * ip6c_m * * ip6c_finaldst's sin6_addr usually points to ip6c_ip6->ip6_dst. If the * original * (internal) packet carries a routing header, it may point the * final * destination address in the routing header. * * ip6c_src: ip6c_ip6->ip6_src + scope info + flowlabel in ip6c_ip6 * (beware of flowlabel, if you try to compare it against others) * ip6c_dst: ip6c_finaldst + scope info */ struct ip6ctlparam { struct mbuf *ip6c_m; /* start of mbuf chain */ struct icmp6_hdr *ip6c_icmp6; /* icmp6 header of target packet */ struct ip6_hdr *ip6c_ip6; /* ip6 header of target packet */ int ip6c_off; /* offset of the target proto header */ struct sockaddr_in6 *ip6c_src; /* srcaddr w/ additional info */ struct sockaddr_in6 *ip6c_dst; /* (final) dstaddr w/ additional info */ struct sockaddr_in6 *ip6c_finaldst; /* final destination address */ void *ip6c_cmdarg; /* control command dependent data */ u_int8_t ip6c_nxt; /* final next header field */ }; typedef int ip6proto_input_t(struct mbuf **, int *, int); typedef void ip6proto_ctlinput_t(struct ip6ctlparam *); int ip6proto_register(uint8_t, ip6proto_input_t, ip6proto_ctlinput_t); int ip6proto_unregister(uint8_t); #define IP6PROTO_REGISTER(prot, input, ctl) do { \ int error __diagused; \ error = ip6proto_register(prot, input, ctl); \ MPASS(error == 0); \ } while (0) ip6proto_input_t rip6_input; ip6proto_ctlinput_t rip6_ctlinput; #endif /* _KERNEL */ #endif /* !_NETINET6_IP6_VAR_H_ */ diff --git a/sys/netinet6/mld6.c b/sys/netinet6/mld6.c index 5dd192a69fa9..06fe9e8820c9 100644 --- a/sys/netinet6/mld6.c +++ b/sys/netinet6/mld6.c @@ -1,3326 +1,3326 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2009 Bruce Simpson. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: mld6.c,v 1.27 2001/04/04 05:17:30 itojun Exp $ */ /*- * Copyright (c) 1988 Stephen Deering. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Stephen Deering of Stanford University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef KTR_MLD #define KTR_MLD KTR_INET6 #endif static void mli_delete_locked(struct ifnet *); static void mld_dispatch_packet(struct mbuf *); static void mld_dispatch_queue(struct mbufq *, int); static void mld_final_leave(struct in6_multi *, struct mld_ifsoftc *); static void mld_fasttimo_vnet(struct in6_multi_head *inmh); static int mld_handle_state_change(struct in6_multi *, struct mld_ifsoftc *); static int mld_initial_join(struct in6_multi *, struct mld_ifsoftc *, const int); #ifdef KTR static char * mld_rec_type_to_str(const int); #endif static void mld_set_version(struct mld_ifsoftc *, const int); static void mld_slowtimo_vnet(void); static int mld_v1_input_query(struct ifnet *, const struct ip6_hdr *, /*const*/ struct mld_hdr *); static int mld_v1_input_report(struct ifnet *, const struct ip6_hdr *, /*const*/ struct mld_hdr *); static void mld_v1_process_group_timer(struct in6_multi_head *, struct in6_multi *); static void mld_v1_process_querier_timers(struct mld_ifsoftc *); static int mld_v1_transmit_report(struct in6_multi *, const int); static void mld_v1_update_group(struct in6_multi *, const int); static void mld_v2_cancel_link_timers(struct mld_ifsoftc *); static void mld_v2_dispatch_general_query(struct mld_ifsoftc *); static struct mbuf * mld_v2_encap_report(struct ifnet *, struct mbuf *); static int mld_v2_enqueue_filter_change(struct mbufq *, struct in6_multi *); static int mld_v2_enqueue_group_record(struct mbufq *, struct in6_multi *, const int, const int, const int, const int); static int mld_v2_input_query(struct ifnet *, const struct ip6_hdr *, struct mbuf *, struct mldv2_query *, const int, const int); static int mld_v2_merge_state_changes(struct in6_multi *, struct mbufq *); static void mld_v2_process_group_timers(struct in6_multi_head *, struct mbufq *, struct mbufq *, struct in6_multi *, const int); static int mld_v2_process_group_query(struct in6_multi *, struct mld_ifsoftc *mli, int, struct mbuf *, struct mldv2_query *, const int); static int sysctl_mld_gsr(SYSCTL_HANDLER_ARGS); static int sysctl_mld_ifinfo(SYSCTL_HANDLER_ARGS); /* * Normative references: RFC 2710, RFC 3590, RFC 3810. * * Locking: * * The MLD subsystem lock ends up being system-wide for the moment, * but could be per-VIMAGE later on. * * The permitted lock order is: IN6_MULTI_LOCK, MLD_LOCK, IF_ADDR_LOCK. * Any may be taken independently; if any are held at the same * time, the above lock order must be followed. * * IN6_MULTI_LOCK covers in_multi. * * MLD_LOCK covers per-link state and any global variables in this file. * * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of * per-link state iterators. * * XXX LOR PREVENTION * A special case for IPv6 is the in6_setscope() routine. ip6_output() * will not accept an ifp; it wants an embedded scope ID, unlike * ip_output(), which happily takes the ifp given to it. The embedded * scope ID is only used by MLD to select the outgoing interface. * * During interface attach and detach, MLD will take MLD_LOCK *after* * the IF_AFDATA_LOCK. * As in6_setscope() takes IF_AFDATA_LOCK then SCOPE_LOCK, we can't call * it with MLD_LOCK held without triggering an LOR. 
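 *
 * (A minimal sketch of the permitted order, illustrative only, for a
 * path that needs all three locks:
 *
 *	IN6_MULTI_LOCK();
 *	MLD_LOCK();
 *	IF_ADDR_WLOCK(ifp);
 *	... manipulate per-link and per-group MLD state ...
 *	IF_ADDR_WUNLOCK(ifp);
 *	MLD_UNLOCK();
 *	IN6_MULTI_UNLOCK();
 *
 * Taking any subset is fine, provided the relative order is kept.)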
A netisr with indirect * dispatch could work around this, but we'd rather not do that, as it * can introduce other races. * * As such, we exploit the fact that the scope ID is just the interface * index, and embed it in the IPv6 destination address accordingly. * This is potentially NOT VALID for MLDv1 reports, as they * are always sent to the multicast group itself; as MLDv2 * reports are always sent to ff02::16, this is not an issue * when MLDv2 is in use. * * This does not however eliminate the LOR when ip6_output() itself * calls in6_setscope() internally whilst MLD_LOCK is held. This will * trigger a LOR warning in WITNESS when the ifnet is detached. * * The right answer is probably to make IF_AFDATA_LOCK an rwlock, given * how it's used across the network stack. Here we're simply exploiting * the fact that MLD runs at a similar layer in the stack to scope6.c. * * VIMAGE: * * Each in6_multi corresponds to an ifp, and each ifp corresponds * to a vnet in ifp->if_vnet. */ static struct mtx mld_mtx; static MALLOC_DEFINE(M_MLD, "mld", "mld state"); #define MLD_EMBEDSCOPE(pin6, zoneid) \ if (IN6_IS_SCOPE_LINKLOCAL(pin6) || \ IN6_IS_ADDR_MC_INTFACELOCAL(pin6)) \ (pin6)->s6_addr16[1] = htons((zoneid) & 0xFFFF) \ /* * VIMAGE-wide globals. */ VNET_DEFINE_STATIC(struct timeval, mld_gsrdelay) = {10, 0}; VNET_DEFINE_STATIC(LIST_HEAD(, mld_ifsoftc), mli_head); VNET_DEFINE_STATIC(int, interface_timers_running6); VNET_DEFINE_STATIC(int, state_change_timers_running6); VNET_DEFINE_STATIC(int, current_state_timers_running6); #define V_mld_gsrdelay VNET(mld_gsrdelay) #define V_mli_head VNET(mli_head) #define V_interface_timers_running6 VNET(interface_timers_running6) #define V_state_change_timers_running6 VNET(state_change_timers_running6) #define V_current_state_timers_running6 VNET(current_state_timers_running6) SYSCTL_DECL(_net_inet6); /* Note: Not in any common header. */ SYSCTL_NODE(_net_inet6, OID_AUTO, mld, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IPv6 Multicast Listener Discovery"); /* * Virtualized sysctls. */ SYSCTL_PROC(_net_inet6_mld, OID_AUTO, gsrdelay, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(mld_gsrdelay.tv_sec), 0, sysctl_mld_gsr, "I", "Rate limit for MLDv2 Group-and-Source queries in seconds"); /* * Non-virtualized sysctls. */ static SYSCTL_NODE(_net_inet6_mld, OID_AUTO, ifinfo, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_mld_ifinfo, "Per-interface MLDv2 state"); static int mld_v1enable = 1; SYSCTL_INT(_net_inet6_mld, OID_AUTO, v1enable, CTLFLAG_RWTUN, &mld_v1enable, 0, "Enable fallback to MLDv1"); static int mld_v2enable = 1; SYSCTL_INT(_net_inet6_mld, OID_AUTO, v2enable, CTLFLAG_RWTUN, &mld_v2enable, 0, "Enable MLDv2"); static int mld_use_allow = 1; SYSCTL_INT(_net_inet6_mld, OID_AUTO, use_allow, CTLFLAG_RWTUN, &mld_use_allow, 0, "Use ALLOW/BLOCK for RFC 4604 SSM joins/leaves"); /* * Packed Router Alert option structure declaration. */ struct mld_raopt { struct ip6_hbh hbh; struct ip6_opt pad; struct ip6_opt_router ra; } __packed; /* * Router Alert hop-by-hop option header. 
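 *
 * For reference, mld_ra below encodes to these eight bytes on the wire
 * (the next-header byte is filled in at output time):
 *
 *	0x00 0x00	hbh: next header, hdr ext len 0 (8 bytes total)
 *	0x01 0x00	PadN option covering two bytes
 *	0x05 0x02	Router Alert option, two bytes of value
 *	0x00 0x00	IP6OPT_RTALERT_MLD: datagram contains MLD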
*/ static struct mld_raopt mld_ra = { .hbh = { 0, 0 }, .pad = { .ip6o_type = IP6OPT_PADN, 0 }, .ra = { .ip6or_type = IP6OPT_ROUTER_ALERT, .ip6or_len = IP6OPT_RTALERT_LEN - 2, .ip6or_value[0] = ((IP6OPT_RTALERT_MLD >> 8) & 0xFF), .ip6or_value[1] = (IP6OPT_RTALERT_MLD & 0xFF) } }; static struct ip6_pktopts mld_po; static __inline void mld_save_context(struct mbuf *m, struct ifnet *ifp) { #ifdef VIMAGE m->m_pkthdr.PH_loc.ptr = ifp->if_vnet; #endif /* VIMAGE */ m->m_pkthdr.rcvif = ifp; m->m_pkthdr.flowid = ifp->if_index; } static __inline void mld_scrub_context(struct mbuf *m) { m->m_pkthdr.PH_loc.ptr = NULL; m->m_pkthdr.flowid = 0; } /* * Restore context from a queued output chain. * Return saved ifindex. * * VIMAGE: The assertion is there to make sure that we * actually called CURVNET_SET() with what's in the mbuf chain. */ static __inline uint32_t mld_restore_context(struct mbuf *m) { #if defined(VIMAGE) && defined(INVARIANTS) KASSERT(curvnet == m->m_pkthdr.PH_loc.ptr, ("%s: called when curvnet was not restored: cuvnet %p m ptr %p", __func__, curvnet, m->m_pkthdr.PH_loc.ptr)); #endif return (m->m_pkthdr.flowid); } /* * Retrieve or set threshold between group-source queries in seconds. * * VIMAGE: Assume curvnet set by caller. * SMPng: NOTE: Serialized by MLD lock. */ static int sysctl_mld_gsr(SYSCTL_HANDLER_ARGS) { int error; int i; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error) return (error); MLD_LOCK(); i = V_mld_gsrdelay.tv_sec; error = sysctl_handle_int(oidp, &i, 0, req); if (error || !req->newptr) goto out_locked; if (i < -1 || i >= 60) { error = EINVAL; goto out_locked; } CTR2(KTR_MLD, "change mld_gsrdelay from %d to %d", V_mld_gsrdelay.tv_sec, i); V_mld_gsrdelay.tv_sec = i; out_locked: MLD_UNLOCK(); return (error); } /* * Expose struct mld_ifsoftc to userland, keyed by ifindex. * For use by ifmcstat(8). * * VIMAGE: Assume curvnet set by caller. The node handler itself * is not directly virtualized. */ static int sysctl_mld_ifinfo(SYSCTL_HANDLER_ARGS) { struct epoch_tracker et; int *name; int error; u_int namelen; struct ifnet *ifp; struct mld_ifsoftc *mli; name = (int *)arg1; namelen = arg2; if (req->newptr != NULL) return (EPERM); if (namelen != 1) return (EINVAL); error = sysctl_wire_old_buffer(req, sizeof(struct mld_ifinfo)); if (error) return (error); IN6_MULTI_LOCK(); IN6_MULTI_LIST_LOCK(); MLD_LOCK(); NET_EPOCH_ENTER(et); error = ENOENT; ifp = ifnet_byindex(name[0]); if (ifp == NULL) goto out_locked; LIST_FOREACH(mli, &V_mli_head, mli_link) { if (ifp == mli->mli_ifp) { struct mld_ifinfo info; info.mli_version = mli->mli_version; info.mli_v1_timer = mli->mli_v1_timer; info.mli_v2_timer = mli->mli_v2_timer; info.mli_flags = mli->mli_flags; info.mli_rv = mli->mli_rv; info.mli_qi = mli->mli_qi; info.mli_qri = mli->mli_qri; info.mli_uri = mli->mli_uri; error = SYSCTL_OUT(req, &info, sizeof(info)); break; } } out_locked: NET_EPOCH_EXIT(et); MLD_UNLOCK(); IN6_MULTI_LIST_UNLOCK(); IN6_MULTI_UNLOCK(); return (error); } /* * Dispatch an entire queue of pending packet chains. * VIMAGE: Assumes the vnet pointer has been set. */ static void mld_dispatch_queue(struct mbufq *mq, int limit) { struct mbuf *m; while ((m = mbufq_dequeue(mq)) != NULL) { CTR3(KTR_MLD, "%s: dispatch %p from %p", __func__, mq, m); mld_dispatch_packet(m); if (--limit == 0) break; } } /* * Filter outgoing MLD report state by group. * * Reports are ALWAYS suppressed for ALL-HOSTS (ff02::1) * and node-local addresses. 
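 *
 * For example (illustrative):
 *
 *	ff01::1 (node-local all-nodes)		-> suppressed
 *	ff02::1 (link-local all-nodes)		-> suppressed
 *	ff02::1:ff28:9c5a (solicited-node)	-> reported
 *	ff05::1:3 (site-local DHCPv6 servers)	-> reported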
However, kernel and socket consumers * always embed the KAME scope ID in the address provided, so strip it * when performing comparison. * Note: This is not the same as the *multicast* scope. * * Return zero if the given group is one for which MLD reports * should be suppressed, or non-zero if reports should be issued. */ static __inline int mld_is_addr_reported(const struct in6_addr *addr) { KASSERT(IN6_IS_ADDR_MULTICAST(addr), ("%s: not multicast", __func__)); if (IPV6_ADDR_MC_SCOPE(addr) == IPV6_ADDR_SCOPE_NODELOCAL) return (0); if (IPV6_ADDR_MC_SCOPE(addr) == IPV6_ADDR_SCOPE_LINKLOCAL) { struct in6_addr tmp = *addr; in6_clearscope(&tmp); if (IN6_ARE_ADDR_EQUAL(&tmp, &in6addr_linklocal_allnodes)) return (0); } return (1); } /* * Attach MLD when PF_INET6 is attached to an interface. Assumes that the * current VNET is set by the caller. */ struct mld_ifsoftc * mld_domifattach(struct ifnet *ifp) { struct mld_ifsoftc *mli; CTR3(KTR_MLD, "%s: called for ifp %p(%s)", __func__, ifp, if_name(ifp)); mli = malloc(sizeof(struct mld_ifsoftc), M_MLD, M_WAITOK | M_ZERO); mli->mli_ifp = ifp; mli->mli_version = MLD_VERSION_2; mli->mli_flags = 0; mli->mli_rv = MLD_RV_INIT; mli->mli_qi = MLD_QI_INIT; mli->mli_qri = MLD_QRI_INIT; mli->mli_uri = MLD_URI_INIT; mbufq_init(&mli->mli_gq, MLD_MAX_RESPONSE_PACKETS); if ((ifp->if_flags & IFF_MULTICAST) == 0) mli->mli_flags |= MLIF_SILENT; if (mld_use_allow) mli->mli_flags |= MLIF_USEALLOW; MLD_LOCK(); LIST_INSERT_HEAD(&V_mli_head, mli, mli_link); MLD_UNLOCK(); return (mli); } /* * Hook for ifdetach. * * NOTE: Some finalization tasks need to run before the protocol domain * is detached, but also before the link layer does its cleanup. * Run before link-layer cleanup; cleanup groups, but do not free MLD state. * * SMPng: Caller must hold IN6_MULTI_LOCK(). * Must take IF_ADDR_LOCK() to cover if_multiaddrs iterator. * XXX This routine is also bitten by unlocked ifma_protospec access. */ void mld_ifdetach(struct ifnet *ifp, struct in6_multi_head *inmh) { struct epoch_tracker et; struct mld_ifsoftc *mli; struct ifmultiaddr *ifma; struct in6_multi *inm; CTR3(KTR_MLD, "%s: called for ifp %p(%s)", __func__, ifp, if_name(ifp)); IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK(); mli = MLD_IFINFO(ifp); IF_ADDR_WLOCK(ifp); /* * Extract list of in6_multi associated with the detaching ifp * which the PF_INET6 layer is about to release. */ NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { inm = in6m_ifmultiaddr_get_inm(ifma); if (inm == NULL) continue; in6m_disconnect_locked(inmh, inm); if (mli->mli_version == MLD_VERSION_2) { in6m_clear_recorded(inm); /* * We need to release the final reference held * for issuing the INCLUDE {}. */ if (inm->in6m_state == MLD_LEAVING_MEMBER) { inm->in6m_state = MLD_NOT_MEMBER; in6m_rele_locked(inmh, inm); } } } NET_EPOCH_EXIT(et); IF_ADDR_WUNLOCK(ifp); MLD_UNLOCK(); } /* * Hook for domifdetach. * Runs after link-layer cleanup; free MLD state. * * SMPng: Normally called with IF_AFDATA_LOCK held. */ void mld_domifdetach(struct ifnet *ifp) { CTR3(KTR_MLD, "%s: called for ifp %p(%s)", __func__, ifp, if_name(ifp)); MLD_LOCK(); mli_delete_locked(ifp); MLD_UNLOCK(); } static void mli_delete_locked(struct ifnet *ifp) { struct mld_ifsoftc *mli, *tmli; CTR3(KTR_MLD, "%s: freeing mld_ifsoftc for ifp %p(%s)", __func__, ifp, if_name(ifp)); MLD_LOCK_ASSERT(); LIST_FOREACH_SAFE(mli, &V_mli_head, mli_link, tmli) { if (mli->mli_ifp == ifp) { /* * Free deferred General Query responses. 
*/ mbufq_drain(&mli->mli_gq); LIST_REMOVE(mli, mli_link); free(mli, M_MLD); return; } } } /* * Process a received MLDv1 general or address-specific query. * Assumes that the query header has been pulled up to sizeof(mld_hdr). * * NOTE: Can't be fully const correct as we temporarily embed scope ID in * mld_addr. This is OK as we own the mbuf chain. */ static int mld_v1_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, /*const*/ struct mld_hdr *mld) { struct ifmultiaddr *ifma; struct mld_ifsoftc *mli; struct in6_multi *inm; int is_general_query; uint16_t timer; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif NET_EPOCH_ASSERT(); is_general_query = 0; if (!mld_v1enable) { CTR3(KTR_MLD, "ignore v1 query %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); return (0); } /* * RFC3810 Section 6.2: MLD queries must originate from * a router's link-local address. */ if (!IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) { CTR3(KTR_MLD, "ignore v1 query src %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &ip6->ip6_src), ifp, if_name(ifp)); return (0); } /* * Do address field validation upfront before we accept * the query. */ if (IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) { /* * MLDv1 General Query. * If this was not sent to the all-nodes group, ignore it. */ struct in6_addr dst; dst = ip6->ip6_dst; in6_clearscope(&dst); if (!IN6_ARE_ADDR_EQUAL(&dst, &in6addr_linklocal_allnodes)) return (EINVAL); is_general_query = 1; } else { /* * Embed scope ID of receiving interface in MLD query for * lookup whilst we don't hold other locks. */ in6_setscope(&mld->mld_addr, ifp, NULL); } IN6_MULTI_LIST_LOCK(); MLD_LOCK(); /* * Switch to MLDv1 host compatibility mode. */ mli = MLD_IFINFO(ifp); KASSERT(mli != NULL, ("%s: no mld_ifsoftc for ifp %p", __func__, ifp)); mld_set_version(mli, MLD_VERSION_1); timer = (ntohs(mld->mld_maxdelay) * MLD_FASTHZ) / MLD_TIMER_SCALE; if (timer == 0) timer = 1; if (is_general_query) { /* * For each reporting group joined on this * interface, kick the report timer. */ CTR2(KTR_MLD, "process v1 general query on ifp %p(%s)", ifp, if_name(ifp)); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { inm = in6m_ifmultiaddr_get_inm(ifma); if (inm == NULL) continue; mld_v1_update_group(inm, timer); } } else { /* * MLDv1 Group-Specific Query. * If this is a group-specific MLDv1 query, we need only * look up the single group to process it. */ inm = in6m_lookup_locked(ifp, &mld->mld_addr); if (inm != NULL) { CTR3(KTR_MLD, "process v1 query %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); mld_v1_update_group(inm, timer); } /* XXX Clear embedded scope ID as userland won't expect it. */ in6_clearscope(&mld->mld_addr); } MLD_UNLOCK(); IN6_MULTI_LIST_UNLOCK(); return (0); } /* * Update the report timer on a group in response to an MLDv1 query. * * If we are becoming the reporting member for this group, start the timer. * If we already are the reporting member for this group, and timer is * below the threshold, reset it. * * We may be updating the group for the first time since we switched * to MLDv2. If we are, then we must clear any recorded source lists, * and transition to REPORTING state; the group timer is overloaded * for group and group-source query responses. * * Unlike MLDv2, the delay per group should be jittered * to avoid bursts of MLDv1 reports. 
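 *
 * Illustrative example: for a query whose maximum delay works out to
 * timer fast-timeout ticks, each affected group arms
 * in6m_timer = MLD_RANDOM_DELAY(timer), a uniformly chosen value in
 * [1, timer], so the reports for the groups joined on a link spread
 * out over the response window instead of firing on the same tick.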
*/ static void mld_v1_update_group(struct in6_multi *inm, const int timer) { #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif CTR4(KTR_MLD, "%s: %s/%s timer=%d", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp), timer); IN6_MULTI_LIST_LOCK_ASSERT(); switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: break; case MLD_REPORTING_MEMBER: if (inm->in6m_timer != 0 && inm->in6m_timer <= timer) { CTR1(KTR_MLD, "%s: REPORTING and timer running, " "skipping.", __func__); break; } /* FALLTHROUGH */ case MLD_SG_QUERY_PENDING_MEMBER: case MLD_G_QUERY_PENDING_MEMBER: case MLD_IDLE_MEMBER: case MLD_LAZY_MEMBER: case MLD_AWAKENING_MEMBER: CTR1(KTR_MLD, "%s: ->REPORTING", __func__); inm->in6m_state = MLD_REPORTING_MEMBER; inm->in6m_timer = MLD_RANDOM_DELAY(timer); V_current_state_timers_running6 = 1; break; case MLD_SLEEPING_MEMBER: CTR1(KTR_MLD, "%s: ->AWAKENING", __func__); inm->in6m_state = MLD_AWAKENING_MEMBER; break; case MLD_LEAVING_MEMBER: break; } } /* * Process a received MLDv2 general, group-specific or * group-and-source-specific query. * * Assumes that mld points to a struct mldv2_query which is stored in * contiguous memory. * * Return 0 if successful, otherwise an appropriate error code is returned. */ static int mld_v2_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, struct mbuf *m, struct mldv2_query *mld, const int off, const int icmp6len) { struct mld_ifsoftc *mli; struct in6_multi *inm; uint32_t maxdelay, nsrc, qqi; int is_general_query; uint16_t timer; uint8_t qrv; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif NET_EPOCH_ASSERT(); if (!mld_v2enable) { CTR3(KTR_MLD, "ignore v2 query src %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &ip6->ip6_src), ifp, if_name(ifp)); return (0); } /* * RFC3810 Section 6.2: MLD queries must originate from * a router's link-local address. */ if (!IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) { CTR3(KTR_MLD, "ignore v1 query src %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &ip6->ip6_src), ifp, if_name(ifp)); return (0); } is_general_query = 0; CTR2(KTR_MLD, "input v2 query on ifp %p(%s)", ifp, if_name(ifp)); maxdelay = ntohs(mld->mld_maxdelay); /* in 1/10ths of a second */ if (maxdelay >= 32768) { maxdelay = (MLD_MRC_MANT(maxdelay) | 0x1000) << (MLD_MRC_EXP(maxdelay) + 3); } timer = (maxdelay * MLD_FASTHZ) / MLD_TIMER_SCALE; if (timer == 0) timer = 1; qrv = MLD_QRV(mld->mld_misc); if (qrv < 2) { CTR3(KTR_MLD, "%s: clamping qrv %d to %d", __func__, qrv, MLD_RV_INIT); qrv = MLD_RV_INIT; } qqi = mld->mld_qqi; if (qqi >= 128) { qqi = MLD_QQIC_MANT(mld->mld_qqi) << (MLD_QQIC_EXP(mld->mld_qqi) + 3); } nsrc = ntohs(mld->mld_numsrc); if (nsrc > MLD_MAX_GS_SOURCES) return (EMSGSIZE); if (icmp6len < sizeof(struct mldv2_query) + (nsrc * sizeof(struct in6_addr))) return (EMSGSIZE); /* * Do further input validation upfront to avoid resetting timers * should we need to discard this query. */ if (IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) { /* * A general query with a source list has undefined * behaviour; discard it. */ if (nsrc > 0) return (EINVAL); is_general_query = 1; } else { /* * Embed scope ID of receiving interface in MLD query for * lookup whilst we don't hold other locks (due to KAME * locking lameness). We own this mbuf chain just now. */ in6_setscope(&mld->mld_addr, ifp, NULL); } IN6_MULTI_LIST_LOCK(); MLD_LOCK(); mli = MLD_IFINFO(ifp); KASSERT(mli != NULL, ("%s: no mld_ifsoftc for ifp %p", __func__, ifp)); /* * Discard the v2 query if we're in Compatibility Mode. 
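 * (Worked example of the Maximum Response Code decoding above,
 * illustrative only: a received mld_maxdelay of 0x8abc has the high
 * bit set, so it is in the floating-point form; exp = (0x8abc >> 12)
 * & 7 = 0 and mant = 0xabc, giving (0xabc | 0x1000) << (0 + 3) =
 * 0xd5e0 = 54752 time units, which is then converted to fast-timeout
 * ticks.)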
* The RFC is pretty clear that hosts need to stay in MLDv1 mode * until the Old Version Querier Present timer expires. */ if (mli->mli_version != MLD_VERSION_2) goto out_locked; mld_set_version(mli, MLD_VERSION_2); mli->mli_rv = qrv; mli->mli_qi = qqi; mli->mli_qri = maxdelay; CTR4(KTR_MLD, "%s: qrv %d qi %d maxdelay %d", __func__, qrv, qqi, maxdelay); if (is_general_query) { /* * MLDv2 General Query. * * Schedule a current-state report on this ifp for * all groups, possibly containing source lists. * * If there is a pending General Query response * scheduled earlier than the selected delay, do * not schedule any other reports. * Otherwise, reset the interface timer. */ CTR2(KTR_MLD, "process v2 general query on ifp %p(%s)", ifp, if_name(ifp)); if (mli->mli_v2_timer == 0 || mli->mli_v2_timer >= timer) { mli->mli_v2_timer = MLD_RANDOM_DELAY(timer); V_interface_timers_running6 = 1; } } else { /* * MLDv2 Group-specific or Group-and-source-specific Query. * * Group-source-specific queries are throttled on * a per-group basis to defeat denial-of-service attempts. * Queries for groups we are not a member of on this * link are simply ignored. */ inm = in6m_lookup_locked(ifp, &mld->mld_addr); if (inm == NULL) goto out_locked; if (nsrc > 0) { if (!ratecheck(&inm->in6m_lastgsrtv, &V_mld_gsrdelay)) { CTR1(KTR_MLD, "%s: GS query throttled.", __func__); goto out_locked; } } CTR2(KTR_MLD, "process v2 group query on ifp %p(%s)", ifp, if_name(ifp)); /* * If there is a pending General Query response * scheduled sooner than the selected delay, no * further report need be scheduled. * Otherwise, prepare to respond to the * group-specific or group-and-source query. */ if (mli->mli_v2_timer == 0 || mli->mli_v2_timer >= timer) mld_v2_process_group_query(inm, mli, timer, m, mld, off); /* XXX Clear embedded scope ID as userland won't expect it. */ in6_clearscope(&mld->mld_addr); } out_locked: MLD_UNLOCK(); IN6_MULTI_LIST_UNLOCK(); return (0); } /* * Process a received MLDv2 group-specific or group-and-source-specific * query. * Return <0 if any error occurred. Currently this is ignored. */ static int mld_v2_process_group_query(struct in6_multi *inm, struct mld_ifsoftc *mli, int timer, struct mbuf *m0, struct mldv2_query *mld, const int off) { int retval; uint16_t nsrc; IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); retval = 0; switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_LAZY_MEMBER: case MLD_AWAKENING_MEMBER: case MLD_IDLE_MEMBER: case MLD_LEAVING_MEMBER: return (retval); break; case MLD_REPORTING_MEMBER: case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: break; } nsrc = ntohs(mld->mld_numsrc); /* Length should be checked by calling function. */ KASSERT((m0->m_flags & M_PKTHDR) == 0 || m0->m_pkthdr.len >= off + sizeof(struct mldv2_query) + nsrc * sizeof(struct in6_addr), ("mldv2 packet is too short: (%d bytes < %zd bytes, m=%p)", m0->m_pkthdr.len, off + sizeof(struct mldv2_query) + nsrc * sizeof(struct in6_addr), m0)); /* * Deal with group-specific queries upfront. * If any group query is already pending, purge any recorded * source-list state if it exists, and schedule a query response * for this group-specific query. 
*/ if (nsrc == 0) { if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER || inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER) { in6m_clear_recorded(inm); timer = min(inm->in6m_timer, timer); } inm->in6m_state = MLD_G_QUERY_PENDING_MEMBER; inm->in6m_timer = MLD_RANDOM_DELAY(timer); V_current_state_timers_running6 = 1; return (retval); } /* * Deal with the case where a group-and-source-specific query has * been received but a group-specific query is already pending. */ if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER) { timer = min(inm->in6m_timer, timer); inm->in6m_timer = MLD_RANDOM_DELAY(timer); V_current_state_timers_running6 = 1; return (retval); } /* * Finally, deal with the case where a group-and-source-specific * query has been received, where a response to a previous g-s-r * query exists, or none exists. * In this case, we need to parse the source-list which the Querier * has provided us with and check if we have any source list filter * entries at T1 for these sources. If we do not, there is no need * schedule a report and the query may be dropped. * If we do, we must record them and schedule a current-state * report for those sources. */ if (inm->in6m_nsrc > 0) { struct in6_addr srcaddr; int i, nrecorded; int soff; soff = off + sizeof(struct mldv2_query); nrecorded = 0; for (i = 0; i < nsrc; i++) { m_copydata(m0, soff, sizeof(struct in6_addr), (caddr_t)&srcaddr); retval = in6m_record_source(inm, &srcaddr); if (retval < 0) break; nrecorded += retval; soff += sizeof(struct in6_addr); } if (nrecorded > 0) { CTR1(KTR_MLD, "%s: schedule response to SG query", __func__); inm->in6m_state = MLD_SG_QUERY_PENDING_MEMBER; inm->in6m_timer = MLD_RANDOM_DELAY(timer); V_current_state_timers_running6 = 1; } } return (retval); } /* * Process a received MLDv1 host membership report. * Assumes mld points to mld_hdr in pulled up mbuf chain. * * NOTE: Can't be fully const correct as we temporarily embed scope ID in * mld_addr. This is OK as we own the mbuf chain. */ static int mld_v1_input_report(struct ifnet *ifp, const struct ip6_hdr *ip6, /*const*/ struct mld_hdr *mld) { struct in6_addr src, dst; struct in6_ifaddr *ia; struct in6_multi *inm; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif NET_EPOCH_ASSERT(); if (!mld_v1enable) { CTR3(KTR_MLD, "ignore v1 report %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); return (0); } if (ifp->if_flags & IFF_LOOPBACK) return (0); /* * MLDv1 reports must originate from a host's link-local address, * or the unspecified address (when booting). */ src = ip6->ip6_src; in6_clearscope(&src); if (!IN6_IS_SCOPE_LINKLOCAL(&src) && !IN6_IS_ADDR_UNSPECIFIED(&src)) { CTR3(KTR_MLD, "ignore v1 query src %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &ip6->ip6_src), ifp, if_name(ifp)); return (EINVAL); } /* * RFC2710 Section 4: MLDv1 reports must pertain to a multicast * group, and must be directed to the group itself. */ dst = ip6->ip6_dst; in6_clearscope(&dst); if (!IN6_IS_ADDR_MULTICAST(&mld->mld_addr) || !IN6_ARE_ADDR_EQUAL(&mld->mld_addr, &dst)) { CTR3(KTR_MLD, "ignore v1 query dst %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &ip6->ip6_dst), ifp, if_name(ifp)); return (EINVAL); } /* * Make sure we don't hear our own membership report, as fast * leave requires knowing that we are the only member of a * group. Assume we used the link-local address if available, * otherwise look for ::. * * XXX Note that scope ID comparison is needed for the address * returned by in6ifa_ifpforlinklocal(), but SHOULD NOT be * performed for the on-wire address. 
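 *
 * Illustrative example of the embedding: with the KAME convention used
 * here, the link-local source fe80::1 received on the interface with
 * index 4 is held in the kernel as fe80:4::1 (the interface index is
 * stored in s6_addr16[1]), while the on-wire form always carries zero
 * in that word; comparing the two without first clearing the embedded
 * ID could never match.
 */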
*/ ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST); if ((ia && IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, IA6_IN6(ia))) || (ia == NULL && IN6_IS_ADDR_UNSPECIFIED(&src))) { if (ia != NULL) ifa_free(&ia->ia_ifa); return (0); } if (ia != NULL) ifa_free(&ia->ia_ifa); CTR3(KTR_MLD, "process v1 report %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); /* * Embed scope ID of receiving interface in MLD query for lookup * whilst we don't hold other locks (due to KAME locking lameness). */ if (!IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) in6_setscope(&mld->mld_addr, ifp, NULL); IN6_MULTI_LIST_LOCK(); MLD_LOCK(); /* * MLDv1 report suppression. * If we are a member of this group, and our membership should be * reported, and our group timer is pending or about to be reset, * stop our group timer by transitioning to the 'lazy' state. */ inm = in6m_lookup_locked(ifp, &mld->mld_addr); if (inm != NULL) { struct mld_ifsoftc *mli; mli = inm->in6m_mli; KASSERT(mli != NULL, ("%s: no mli for ifp %p", __func__, ifp)); /* * If we are in MLDv2 host mode, do not allow the * other host's MLDv1 report to suppress our reports. */ if (mli->mli_version == MLD_VERSION_2) goto out_locked; inm->in6m_timer = 0; switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_SLEEPING_MEMBER: break; case MLD_REPORTING_MEMBER: case MLD_IDLE_MEMBER: case MLD_AWAKENING_MEMBER: CTR3(KTR_MLD, "report suppressed for %s on ifp %p(%s)", ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp)); case MLD_LAZY_MEMBER: inm->in6m_state = MLD_LAZY_MEMBER; break; case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: case MLD_LEAVING_MEMBER: break; } } out_locked: MLD_UNLOCK(); IN6_MULTI_LIST_UNLOCK(); /* XXX Clear embedded scope ID as userland won't expect it. */ in6_clearscope(&mld->mld_addr); return (0); } /* * MLD input path. * * Assume query messages which fit in a single ICMPv6 message header * have been pulled up. * Assume that userland will want to see the message, even if it * otherwise fails kernel input validation; do not free it. * Pullup may however free the mbuf chain m if it fails. * * Return IPPROTO_DONE if we freed m. Otherwise, return 0. */ int mld_input(struct mbuf **mp, int off, int icmp6len) { struct ifnet *ifp; struct ip6_hdr *ip6; struct mbuf *m; struct mld_hdr *mld; int mldlen; m = *mp; CTR3(KTR_MLD, "%s: called w/mbuf (%p,%d)", __func__, m, off); ifp = m->m_pkthdr.rcvif; /* Pullup to appropriate size. */ if (m->m_len < off + sizeof(*mld)) { m = m_pullup(m, off + sizeof(*mld)); if (m == NULL) { ICMP6STAT_INC(icp6s_badlen); return (IPPROTO_DONE); } } mld = (struct mld_hdr *)(mtod(m, uint8_t *) + off); if (mld->mld_type == MLD_LISTENER_QUERY && icmp6len >= sizeof(struct mldv2_query)) { mldlen = sizeof(struct mldv2_query); } else { mldlen = sizeof(struct mld_hdr); } if (m->m_len < off + mldlen) { m = m_pullup(m, off + mldlen); if (m == NULL) { ICMP6STAT_INC(icp6s_badlen); return (IPPROTO_DONE); } } *mp = m; ip6 = mtod(m, struct ip6_hdr *); mld = (struct mld_hdr *)(mtod(m, uint8_t *) + off); /* * Userland needs to see all of this traffic for implementing * the endpoint discovery portion of multicast routing. 
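 *
 * A daemon typically sees these messages via a raw ICMPv6 socket with
 * an ICMP6_FILTER that passes only the MLD types (a minimal sketch,
 * not part of this change):
 *
 *	int s = socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6);
 *	struct icmp6_filter filt;
 *
 *	ICMP6_FILTER_SETBLOCKALL(&filt);
 *	ICMP6_FILTER_SETPASS(MLD_LISTENER_QUERY, &filt);
 *	ICMP6_FILTER_SETPASS(MLD_LISTENER_REPORT, &filt);
 *	ICMP6_FILTER_SETPASS(MLDV2_LISTENER_REPORT, &filt);
 *	ICMP6_FILTER_SETPASS(MLD_LISTENER_DONE, &filt);
 *	setsockopt(s, IPPROTO_ICMPV6, ICMP6_FILTER, &filt, sizeof(filt));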
*/ switch (mld->mld_type) { case MLD_LISTENER_QUERY: icmp6_ifstat_inc(ifp, ifs6_in_mldquery); if (icmp6len == sizeof(struct mld_hdr)) { if (mld_v1_input_query(ifp, ip6, mld) != 0) return (0); } else if (icmp6len >= sizeof(struct mldv2_query)) { if (mld_v2_input_query(ifp, ip6, m, (struct mldv2_query *)mld, off, icmp6len) != 0) return (0); } break; case MLD_LISTENER_REPORT: icmp6_ifstat_inc(ifp, ifs6_in_mldreport); if (mld_v1_input_report(ifp, ip6, mld) != 0) return (0); break; case MLDV2_LISTENER_REPORT: icmp6_ifstat_inc(ifp, ifs6_in_mldreport); break; case MLD_LISTENER_DONE: icmp6_ifstat_inc(ifp, ifs6_in_mlddone); break; default: break; } return (0); } /* * Fast timeout handler (global). * VIMAGE: Timeout handlers are expected to service all vimages. */ static struct callout mldfast_callout; static void mld_fasttimo(void *arg __unused) { struct epoch_tracker et; struct in6_multi_head inmh; VNET_ITERATOR_DECL(vnet_iter); SLIST_INIT(&inmh); NET_EPOCH_ENTER(et); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); mld_fasttimo_vnet(&inmh); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); NET_EPOCH_EXIT(et); in6m_release_list_deferred(&inmh); callout_reset(&mldfast_callout, hz / MLD_FASTHZ, mld_fasttimo, NULL); } /* * Fast timeout handler (per-vnet). * * VIMAGE: Assume caller has set up our curvnet. */ static void mld_fasttimo_vnet(struct in6_multi_head *inmh) { struct mbufq scq; /* State-change packets */ struct mbufq qrq; /* Query response packets */ struct ifnet *ifp; struct mld_ifsoftc *mli; struct ifmultiaddr *ifma; struct in6_multi *inm; int uri_fasthz; uri_fasthz = 0; /* * Quick check to see if any work needs to be done, in order to * minimize the overhead of fasttimo processing. * SMPng: XXX Unlocked reads. */ if (!V_current_state_timers_running6 && !V_interface_timers_running6 && !V_state_change_timers_running6) return; IN6_MULTI_LIST_LOCK(); MLD_LOCK(); /* * MLDv2 General Query response timer processing. */ if (V_interface_timers_running6) { CTR1(KTR_MLD, "%s: interface timers running", __func__); V_interface_timers_running6 = 0; LIST_FOREACH(mli, &V_mli_head, mli_link) { if (mli->mli_v2_timer == 0) { /* Do nothing. */ } else if (--mli->mli_v2_timer == 0) { mld_v2_dispatch_general_query(mli); } else { V_interface_timers_running6 = 1; } } } if (!V_current_state_timers_running6 && !V_state_change_timers_running6) goto out_locked; V_current_state_timers_running6 = 0; V_state_change_timers_running6 = 0; CTR1(KTR_MLD, "%s: state change timers running", __func__); /* * MLD host report and state-change timer processing. * Note: Processing a v2 group timer may remove a node. */ LIST_FOREACH(mli, &V_mli_head, mli_link) { ifp = mli->mli_ifp; if (mli->mli_version == MLD_VERSION_2) { uri_fasthz = MLD_RANDOM_DELAY(mli->mli_uri * MLD_FASTHZ); mbufq_init(&qrq, MLD_MAX_G_GS_PACKETS); mbufq_init(&scq, MLD_MAX_STATE_CHANGE_PACKETS); } IF_ADDR_WLOCK(ifp); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { inm = in6m_ifmultiaddr_get_inm(ifma); if (inm == NULL) continue; switch (mli->mli_version) { case MLD_VERSION_1: mld_v1_process_group_timer(inmh, inm); break; case MLD_VERSION_2: mld_v2_process_group_timers(inmh, &qrq, &scq, inm, uri_fasthz); break; } } IF_ADDR_WUNLOCK(ifp); switch (mli->mli_version) { case MLD_VERSION_1: /* * Transmit reports for this lifecycle. This * is done while not holding IF_ADDR_LOCK * since this can call * in6ifa_ifpforlinklocal() which locks * IF_ADDR_LOCK internally as well as * ip6_output() to transmit a packet. 
*/ while ((inm = SLIST_FIRST(inmh)) != NULL) { SLIST_REMOVE_HEAD(inmh, in6m_defer); (void)mld_v1_transmit_report(inm, MLD_LISTENER_REPORT); } break; case MLD_VERSION_2: mld_dispatch_queue(&qrq, 0); mld_dispatch_queue(&scq, 0); break; } } out_locked: MLD_UNLOCK(); IN6_MULTI_LIST_UNLOCK(); } /* * Update host report group timer. * Will update the global pending timer flags. */ static void mld_v1_process_group_timer(struct in6_multi_head *inmh, struct in6_multi *inm) { int report_timer_expired; IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); if (inm->in6m_timer == 0) { report_timer_expired = 0; } else if (--inm->in6m_timer == 0) { report_timer_expired = 1; } else { V_current_state_timers_running6 = 1; return; } switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_IDLE_MEMBER: case MLD_LAZY_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_AWAKENING_MEMBER: break; case MLD_REPORTING_MEMBER: if (report_timer_expired) { inm->in6m_state = MLD_IDLE_MEMBER; SLIST_INSERT_HEAD(inmh, inm, in6m_defer); } break; case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: case MLD_LEAVING_MEMBER: break; } } /* * Update a group's timers for MLDv2. * Will update the global pending timer flags. * Note: Unlocked read from mli. */ static void mld_v2_process_group_timers(struct in6_multi_head *inmh, struct mbufq *qrq, struct mbufq *scq, struct in6_multi *inm, const int uri_fasthz) { int query_response_timer_expired; int state_change_retransmit_timer_expired; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); query_response_timer_expired = 0; state_change_retransmit_timer_expired = 0; /* * During a transition from compatibility mode back to MLDv2, * a group record in REPORTING state may still have its group * timer active. This is a no-op in this function; it is easier * to deal with it here than to complicate the slow-timeout path. */ if (inm->in6m_timer == 0) { query_response_timer_expired = 0; } else if (--inm->in6m_timer == 0) { query_response_timer_expired = 1; } else { V_current_state_timers_running6 = 1; } if (inm->in6m_sctimer == 0) { state_change_retransmit_timer_expired = 0; } else if (--inm->in6m_sctimer == 0) { state_change_retransmit_timer_expired = 1; } else { V_state_change_timers_running6 = 1; } /* We are in fasttimo, so be quick about it. */ if (!state_change_retransmit_timer_expired && !query_response_timer_expired) return; switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_LAZY_MEMBER: case MLD_AWAKENING_MEMBER: case MLD_IDLE_MEMBER: break; case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: /* * Respond to a previously pending Group-Specific * or Group-and-Source-Specific query by enqueueing * the appropriate Current-State report for * immediate transmission. */ if (query_response_timer_expired) { int retval __unused; retval = mld_v2_enqueue_group_record(qrq, inm, 0, 1, (inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER), 0); CTR2(KTR_MLD, "%s: enqueue record = %d", __func__, retval); inm->in6m_state = MLD_REPORTING_MEMBER; in6m_clear_recorded(inm); } /* FALLTHROUGH */ case MLD_REPORTING_MEMBER: case MLD_LEAVING_MEMBER: if (state_change_retransmit_timer_expired) { /* * State-change retransmission timer fired. * If there are any further pending retransmissions, * set the global pending state-change flag, and * reset the timer. 
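 *
 * Illustrative example: with in6m_scrv = 2 retransmissions pending
 * and an Unsolicited Report Interval of mli_uri = 3 seconds,
 * uri_fasthz = MLD_RANDOM_DELAY(3 * MLD_FASTHZ) is a jittered delay
 * of up to 3 seconds' worth of fast ticks, after which this function
 * runs again, sends the second copy and drops the counter to zero.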
*/ if (--inm->in6m_scrv > 0) { inm->in6m_sctimer = uri_fasthz; V_state_change_timers_running6 = 1; } /* * Retransmit the previously computed state-change * report. If there are no further pending * retransmissions, the mbuf queue will be consumed. * Update T0 state to T1 as we have now sent * a state-change. */ (void)mld_v2_merge_state_changes(inm, scq); in6m_commit(inm); CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); /* * If we are leaving the group for good, make sure * we release MLD's reference to it. * This release must be deferred using a SLIST, * as we are called from a loop which traverses * the in_ifmultiaddr TAILQ. */ if (inm->in6m_state == MLD_LEAVING_MEMBER && inm->in6m_scrv == 0) { inm->in6m_state = MLD_NOT_MEMBER; in6m_disconnect_locked(inmh, inm); in6m_rele_locked(inmh, inm); } } break; } } /* * Switch to a different version on the given interface, * as per Section 9.12. */ static void mld_set_version(struct mld_ifsoftc *mli, const int version) { int old_version_timer; MLD_LOCK_ASSERT(); CTR4(KTR_MLD, "%s: switching to v%d on ifp %p(%s)", __func__, version, mli->mli_ifp, if_name(mli->mli_ifp)); if (version == MLD_VERSION_1) { /* * Compute the "Older Version Querier Present" timer as per * Section 9.12. */ old_version_timer = (mli->mli_rv * mli->mli_qi) + mli->mli_qri; old_version_timer *= MLD_SLOWHZ; mli->mli_v1_timer = old_version_timer; } if (mli->mli_v1_timer > 0 && mli->mli_version != MLD_VERSION_1) { mli->mli_version = MLD_VERSION_1; mld_v2_cancel_link_timers(mli); } } /* * Cancel pending MLDv2 timers for the given link and all groups * joined on it; state-change, general-query, and group-query timers. */ static void mld_v2_cancel_link_timers(struct mld_ifsoftc *mli) { struct epoch_tracker et; struct in6_multi_head inmh; struct ifmultiaddr *ifma; struct ifnet *ifp; struct in6_multi *inm; CTR3(KTR_MLD, "%s: cancel v2 timers on ifp %p(%s)", __func__, mli->mli_ifp, if_name(mli->mli_ifp)); SLIST_INIT(&inmh); IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); /* * Fast-track this potentially expensive operation * by checking all the global 'timer pending' flags. */ if (!V_interface_timers_running6 && !V_state_change_timers_running6 && !V_current_state_timers_running6) return; mli->mli_v2_timer = 0; ifp = mli->mli_ifp; IF_ADDR_WLOCK(ifp); NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { inm = in6m_ifmultiaddr_get_inm(ifma); if (inm == NULL) continue; switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_IDLE_MEMBER: case MLD_LAZY_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_AWAKENING_MEMBER: break; case MLD_LEAVING_MEMBER: /* * If we are leaving the group and switching * version, we need to release the final * reference held for issuing the INCLUDE {}. */ if (inm->in6m_refcount == 1) in6m_disconnect_locked(&inmh, inm); in6m_rele_locked(&inmh, inm); /* FALLTHROUGH */ case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: in6m_clear_recorded(inm); /* FALLTHROUGH */ case MLD_REPORTING_MEMBER: inm->in6m_sctimer = 0; inm->in6m_timer = 0; inm->in6m_state = MLD_REPORTING_MEMBER; /* * Free any pending MLDv2 state-change records. */ mbufq_drain(&inm->in6m_scq); break; } } NET_EPOCH_EXIT(et); IF_ADDR_WUNLOCK(ifp); in6m_release_list_deferred(&inmh); } /* * Global slowtimo handler. * VIMAGE: Timeout handlers are expected to service all vimages. 
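* * Worked example (illustrative, using the RFC 3810 defaults RV=2, * QI=125s, QRI=10s): mld_set_version() arms the Older Version Querier * Present timer at (2 * 125 + 10) * MLD_SLOWHZ ticks, so this handler * counts down 260 seconds before mld_v1_process_querier_timers() reverts * the link to MLDv2.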
*/ static struct callout mldslow_callout; static void mld_slowtimo(void *arg __unused) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); mld_slowtimo_vnet(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); callout_reset(&mldslow_callout, hz / MLD_SLOWHZ, mld_slowtimo, NULL); } /* * Per-vnet slowtimo handler. */ static void mld_slowtimo_vnet(void) { struct mld_ifsoftc *mli; MLD_LOCK(); LIST_FOREACH(mli, &V_mli_head, mli_link) { mld_v1_process_querier_timers(mli); } MLD_UNLOCK(); } /* * Update the Older Version Querier Present timers for a link. * See Section 9.12 of RFC 3810. */ static void mld_v1_process_querier_timers(struct mld_ifsoftc *mli) { MLD_LOCK_ASSERT(); if (mli->mli_version != MLD_VERSION_2 && --mli->mli_v1_timer == 0) { /* * MLDv1 Querier Present timer expired; revert to MLDv2. */ CTR5(KTR_MLD, "%s: transition from v%d -> v%d on %p(%s)", __func__, mli->mli_version, MLD_VERSION_2, mli->mli_ifp, if_name(mli->mli_ifp)); mli->mli_version = MLD_VERSION_2; } } /* * Transmit an MLDv1 report immediately. */ static int mld_v1_transmit_report(struct in6_multi *in6m, const int type) { struct ifnet *ifp; struct in6_ifaddr *ia; struct ip6_hdr *ip6; struct mbuf *mh, *md; struct mld_hdr *mld; NET_EPOCH_ASSERT(); IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); ifp = in6m->in6m_ifp; /* in process of being freed */ if (ifp == NULL) return (0); ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST); /* ia may be NULL if link-local address is tentative. */ mh = m_gethdr(M_NOWAIT, MT_DATA); if (mh == NULL) { if (ia != NULL) ifa_free(&ia->ia_ifa); return (ENOMEM); } md = m_get(M_NOWAIT, MT_DATA); if (md == NULL) { m_free(mh); if (ia != NULL) ifa_free(&ia->ia_ifa); return (ENOMEM); } mh->m_next = md; /* * FUTURE: Consider increasing alignment by ETHER_HDR_LEN, so * that ether_output() does not need to allocate another mbuf * for the header in the most common case. */ M_ALIGN(mh, sizeof(struct ip6_hdr)); mh->m_pkthdr.len = sizeof(struct ip6_hdr) + sizeof(struct mld_hdr); mh->m_len = sizeof(struct ip6_hdr); ip6 = mtod(mh, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_src = ia ? ia->ia_addr.sin6_addr : in6addr_any; ip6->ip6_dst = in6m->in6m_addr; md->m_len = sizeof(struct mld_hdr); mld = mtod(md, struct mld_hdr *); mld->mld_type = type; mld->mld_code = 0; mld->mld_cksum = 0; mld->mld_maxdelay = 0; mld->mld_reserved = 0; mld->mld_addr = in6m->in6m_addr; in6_clearscope(&mld->mld_addr); mld->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), sizeof(struct mld_hdr)); mld_save_context(mh, ifp); mh->m_flags |= M_MLDV1; mld_dispatch_packet(mh); if (ia != NULL) ifa_free(&ia->ia_ifa); return (0); } /* * Process a state change from the upper layer for the given IPv6 group. * * Each socket holds a reference on the in6_multi in its own ip6_moptions. * The socket layer will have made the necessary updates to the group * state; it is now up to MLD to issue a state change report if there * has been any change between T0 (when the last state-change was issued) * and T1 (now). * * We use the MLDv2 state machine at group level. The MLD module, * however, makes the decision as to which MLD protocol version to speak. * A state change *from* INCLUDE {} always means an initial join. * A state change *to* INCLUDE {} always means a final leave.
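* * In sketch form (illustrative only; the real callers are the socket * option handlers in in6_mcast.c, which also manage filter commit and * rollback), a caller drives this state machine as: * * IN6_MULTI_LIST_LOCK(); * error = mld_change_state(inm, 0); * IN6_MULTI_LIST_UNLOCK();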
* * If delay is non-zero, and the state change is an initial multicast * join, the state change report will be delayed by 'delay' ticks * in units of MLD_FASTHZ if MLDv1 is active on the link; otherwise * the initial MLDv2 state change report will be delayed by whichever * is sooner, a pending state-change timer or delay itself. * * VIMAGE: curvnet should have been set by caller, as this routine * is called from the socket option handlers. */ int mld_change_state(struct in6_multi *inm, const int delay) { struct mld_ifsoftc *mli; struct ifnet *ifp; int error; IN6_MULTI_LIST_LOCK_ASSERT(); error = 0; /* * Check if the in6_multi has already been disconnected. */ if (inm->in6m_ifp == NULL) { CTR1(KTR_MLD, "%s: inm is disconnected", __func__); return (0); } /* * Try to detect if the upper layer just asked us to change state * for an interface which has now gone away. */ KASSERT(inm->in6m_ifma != NULL, ("%s: no ifma", __func__)); ifp = inm->in6m_ifma->ifma_ifp; if (ifp == NULL) return (0); /* * Sanity check that netinet6's notion of ifp is the * same as net's. */ KASSERT(inm->in6m_ifp == ifp, ("%s: bad ifp", __func__)); MLD_LOCK(); mli = MLD_IFINFO(ifp); KASSERT(mli != NULL, ("%s: no mld_ifsoftc for ifp %p", __func__, ifp)); /* * If we detect a state transition to or from MCAST_UNDEFINED * for this group, then we are starting or finishing an MLD * life cycle for this group. */ if (inm->in6m_st[1].iss_fmode != inm->in6m_st[0].iss_fmode) { CTR3(KTR_MLD, "%s: inm transition %d -> %d", __func__, inm->in6m_st[0].iss_fmode, inm->in6m_st[1].iss_fmode); if (inm->in6m_st[0].iss_fmode == MCAST_UNDEFINED) { CTR1(KTR_MLD, "%s: initial join", __func__); error = mld_initial_join(inm, mli, delay); goto out_locked; } else if (inm->in6m_st[1].iss_fmode == MCAST_UNDEFINED) { CTR1(KTR_MLD, "%s: final leave", __func__); mld_final_leave(inm, mli); goto out_locked; } } else { CTR1(KTR_MLD, "%s: filter set change", __func__); } error = mld_handle_state_change(inm, mli); out_locked: MLD_UNLOCK(); return (error); } /* * Perform the initial join for an MLD group. * * When joining a group: * If the group should have its MLD traffic suppressed, do nothing. * MLDv1 starts sending MLDv1 host membership reports. * MLDv2 will schedule an MLDv2 state-change report containing the * initial state of the membership. * * If the delay argument is non-zero, then we must delay sending the * initial state change for delay ticks (in units of MLD_FASTHZ). */ static int mld_initial_join(struct in6_multi *inm, struct mld_ifsoftc *mli, const int delay) { struct epoch_tracker et; struct ifnet *ifp; struct mbufq *mq; int error, retval, syncstates; int odelay; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif CTR4(KTR_MLD, "%s: initial join %s on ifp %p(%s)", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), inm->in6m_ifp, if_name(inm->in6m_ifp)); error = 0; syncstates = 1; ifp = inm->in6m_ifp; IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); KASSERT(mli && mli->mli_ifp == ifp, ("%s: inconsistent ifp", __func__)); /* * Groups joined on loopback or marked as 'not reported', * enter the MLD_SILENT_MEMBER state and * are never reported in any protocol exchanges. * All other groups enter the appropriate state machine * for the version in use on this link. * A link marked as MLIF_SILENT causes MLD to be completely * disabled for the link. 
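* (For example, interface-local groups and the link-scope all-nodes * group ff02::1 are never reported; cf. mld_is_addr_reported().)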
*/ if ((ifp->if_flags & IFF_LOOPBACK) || (mli->mli_flags & MLIF_SILENT) || !mld_is_addr_reported(&inm->in6m_addr)) { CTR1(KTR_MLD, "%s: not kicking state machine for silent group", __func__); inm->in6m_state = MLD_SILENT_MEMBER; inm->in6m_timer = 0; } else { /* * Deal with overlapping in_multi lifecycle. * If this group was LEAVING, then make sure * we drop the reference we picked up to keep the * group around for the final INCLUDE {} enqueue. */ if (mli->mli_version == MLD_VERSION_2 && inm->in6m_state == MLD_LEAVING_MEMBER) { inm->in6m_refcount--; MPASS(inm->in6m_refcount > 0); } inm->in6m_state = MLD_REPORTING_MEMBER; switch (mli->mli_version) { case MLD_VERSION_1: /* * If a delay was provided, only use it if * it is greater than the delay normally * used for an MLDv1 state change report, * and delay sending the initial MLDv1 report * by not transitioning to the IDLE state. */ odelay = MLD_RANDOM_DELAY(MLD_V1_MAX_RI * MLD_FASTHZ); if (delay) { inm->in6m_timer = max(delay, odelay); V_current_state_timers_running6 = 1; } else { inm->in6m_state = MLD_IDLE_MEMBER; NET_EPOCH_ENTER(et); error = mld_v1_transmit_report(inm, MLD_LISTENER_REPORT); NET_EPOCH_EXIT(et); if (error == 0) { inm->in6m_timer = odelay; V_current_state_timers_running6 = 1; } } break; case MLD_VERSION_2: /* * Defer update of T0 to T1, until the first copy * of the state change has been transmitted. */ syncstates = 0; /* * Immediately enqueue a State-Change Report for * this interface, freeing any previous reports. * Don't kick the timers if there is nothing to do, * or if an error occurred. */ mq = &inm->in6m_scq; mbufq_drain(mq); retval = mld_v2_enqueue_group_record(mq, inm, 1, 0, 0, (mli->mli_flags & MLIF_USEALLOW)); CTR2(KTR_MLD, "%s: enqueue record = %d", __func__, retval); if (retval <= 0) { error = retval * -1; break; } /* * Schedule transmission of pending state-change * report up to RV times for this link. The timer * will fire at the next mld_fasttimo (~200ms), * giving us an opportunity to merge the reports. * * If a delay was provided to this function, only * use this delay if sooner than the existing one. */ KASSERT(mli->mli_rv > 1, ("%s: invalid robustness %d", __func__, mli->mli_rv)); inm->in6m_scrv = mli->mli_rv; if (delay) { if (inm->in6m_sctimer > 1) { inm->in6m_sctimer = min(inm->in6m_sctimer, delay); } else inm->in6m_sctimer = delay; } else inm->in6m_sctimer = 1; V_state_change_timers_running6 = 1; error = 0; break; } } /* * Only update the T0 state if state change is atomic, * i.e. we don't need to wait for a timer to fire before we * can consider the state change to have been communicated. */ if (syncstates) { in6m_commit(inm); CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); } return (error); } /* * Issue an intermediate state change during the life-cycle. 
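* For example (illustrative): a socket that stays joined to a group but * narrows its source filter, e.g. via setsourcefilter(3), changes the * filter set without touching the MCAST_UNDEFINED endpoints, so * mld_change_state() routes it here rather than to the initial-join or * final-leave paths.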
*/ static int mld_handle_state_change(struct in6_multi *inm, struct mld_ifsoftc *mli) { struct ifnet *ifp; int retval; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif CTR4(KTR_MLD, "%s: state change for %s on ifp %p(%s)", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), inm->in6m_ifp, if_name(inm->in6m_ifp)); ifp = inm->in6m_ifp; IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); KASSERT(mli && mli->mli_ifp == ifp, ("%s: inconsistent ifp", __func__)); if ((ifp->if_flags & IFF_LOOPBACK) || (mli->mli_flags & MLIF_SILENT) || !mld_is_addr_reported(&inm->in6m_addr) || (mli->mli_version != MLD_VERSION_2)) { if (!mld_is_addr_reported(&inm->in6m_addr)) { CTR1(KTR_MLD, "%s: not kicking state machine for silent group", __func__); } CTR1(KTR_MLD, "%s: nothing to do", __func__); in6m_commit(inm); CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); return (0); } mbufq_drain(&inm->in6m_scq); retval = mld_v2_enqueue_group_record(&inm->in6m_scq, inm, 1, 0, 0, (mli->mli_flags & MLIF_USEALLOW)); CTR2(KTR_MLD, "%s: enqueue record = %d", __func__, retval); if (retval <= 0) return (-retval); /* * If record(s) were enqueued, start the state-change * report timer for this group. */ inm->in6m_scrv = mli->mli_rv; inm->in6m_sctimer = 1; V_state_change_timers_running6 = 1; return (0); } /* * Perform the final leave for a multicast address. * * When leaving a group: * MLDv1 sends a DONE message, if and only if we are the reporter. * MLDv2 enqueues a state-change report containing a transition * to INCLUDE {} for immediate transmission. */ static void mld_final_leave(struct in6_multi *inm, struct mld_ifsoftc *mli) { struct epoch_tracker et; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif CTR4(KTR_MLD, "%s: final leave %s on ifp %p(%s)", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), inm->in6m_ifp, if_name(inm->in6m_ifp)); IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: case MLD_LEAVING_MEMBER: /* Already leaving or left; do nothing. */ CTR1(KTR_MLD, "%s: not kicking state machine for silent group", __func__); break; case MLD_REPORTING_MEMBER: case MLD_IDLE_MEMBER: case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: if (mli->mli_version == MLD_VERSION_1) { #ifdef INVARIANTS if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER || inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER) panic("%s: MLDv2 state reached, not MLDv2 mode", __func__); #endif NET_EPOCH_ENTER(et); mld_v1_transmit_report(inm, MLD_LISTENER_DONE); NET_EPOCH_EXIT(et); inm->in6m_state = MLD_NOT_MEMBER; V_current_state_timers_running6 = 1; } else if (mli->mli_version == MLD_VERSION_2) { /* * Stop group timer and all pending reports. * Immediately enqueue a state-change report * TO_IN {} to be sent on the next fast timeout, * giving us an opportunity to merge reports. 
*/ mbufq_drain(&inm->in6m_scq); inm->in6m_timer = 0; inm->in6m_scrv = mli->mli_rv; CTR4(KTR_MLD, "%s: Leaving %s/%s with %d " "pending retransmissions.", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp), inm->in6m_scrv); if (inm->in6m_scrv == 0) { inm->in6m_state = MLD_NOT_MEMBER; inm->in6m_sctimer = 0; } else { int retval __diagused; in6m_acquire_locked(inm); retval = mld_v2_enqueue_group_record( &inm->in6m_scq, inm, 1, 0, 0, (mli->mli_flags & MLIF_USEALLOW)); KASSERT(retval != 0, ("%s: enqueue record = %d", __func__, retval)); inm->in6m_state = MLD_LEAVING_MEMBER; inm->in6m_sctimer = 1; V_state_change_timers_running6 = 1; } break; } break; case MLD_LAZY_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_AWAKENING_MEMBER: /* Our reports are suppressed; do nothing. */ break; } in6m_commit(inm); CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED; CTR3(KTR_MLD, "%s: T1 now MCAST_UNDEFINED for %p/%s", __func__, &inm->in6m_addr, if_name(inm->in6m_ifp)); } /* * Enqueue an MLDv2 group record to the given output queue. * * If is_state_change is zero, a current-state record is appended. * If is_state_change is non-zero, a state-change report is appended. * * If is_group_query is non-zero, an mbuf packet chain is allocated. * If is_group_query is zero, and if there is a packet with free space * at the tail of the queue, it will be appended to providing there * is enough free space. * Otherwise a new mbuf packet chain is allocated. * * If is_source_query is non-zero, each source is checked to see if * it was recorded for a Group-Source query, and will be omitted if * it is not both in-mode and recorded. * * If use_block_allow is non-zero, state change reports for initial join * and final leave, on an inclusive mode group with a source list, will be * rewritten to use the ALLOW_NEW and BLOCK_OLD record types, respectively. * * The function will attempt to allocate leading space in the packet * for the IPv6+ICMP headers to be prepended without fragmenting the chain. * * If successful the size of all data appended to the queue is returned, * otherwise an error code less than zero is returned, or zero if * no record(s) were appended. */ static int mld_v2_enqueue_group_record(struct mbufq *mq, struct in6_multi *inm, const int is_state_change, const int is_group_query, const int is_source_query, const int use_block_allow) { struct mldv2_record mr; struct mldv2_record *pmr; struct ifnet *ifp; struct ip6_msource *ims, *nims; struct mbuf *m0, *m, *md; int is_filter_list_change; int minrec0len, m0srcs, msrcs, nbytes, off; int record_has_sources; int now; int type; uint8_t mode; #ifdef KTR char ip6tbuf[INET6_ADDRSTRLEN]; #endif IN6_MULTI_LIST_LOCK_ASSERT(); ifp = inm->in6m_ifp; is_filter_list_change = 0; m = NULL; m0 = NULL; m0srcs = 0; msrcs = 0; nbytes = 0; nims = NULL; record_has_sources = 1; pmr = NULL; type = MLD_DO_NOTHING; mode = inm->in6m_st[1].iss_fmode; /* * If we did not transition out of ASM mode during t0->t1, * and there are no source nodes to process, we can skip * the generation of source records. */ if (inm->in6m_st[0].iss_asm > 0 && inm->in6m_st[1].iss_asm > 0 && inm->in6m_nsrc == 0) record_has_sources = 0; if (is_state_change) { /* * Queue a state change record. * If the mode did not change, and there are non-ASM * listeners or source filters present, * we potentially need to issue two records for the group. 
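* (Worked example, illustrative: an INCLUDE-mode group whose source set * goes from {S1, S2} at T0 to {S2, S3} at T1 yields both * ALLOW_NEW_SOURCES {S3} and BLOCK_OLD_SOURCES {S1}; that split is the * job of mld_v2_enqueue_filter_change() below.)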
* If there are ASM listeners, and there was no filter * mode transition of any kind, do nothing. * * If we are transitioning to MCAST_UNDEFINED, we need * not send any sources. A transition to/from this state is * considered inclusive with some special treatment. * * If we are rewriting initial joins/leaves to use * ALLOW/BLOCK, and the group's membership is inclusive, * we need to send sources in all cases. */ if (mode != inm->in6m_st[0].iss_fmode) { if (mode == MCAST_EXCLUDE) { CTR1(KTR_MLD, "%s: change to EXCLUDE", __func__); type = MLD_CHANGE_TO_EXCLUDE_MODE; } else { CTR1(KTR_MLD, "%s: change to INCLUDE", __func__); if (use_block_allow) { /* * XXX * Here we're interested in state * edges either direction between * MCAST_UNDEFINED and MCAST_INCLUDE. * Perhaps we should just check * the group state, rather than * the filter mode. */ if (mode == MCAST_UNDEFINED) { type = MLD_BLOCK_OLD_SOURCES; } else { type = MLD_ALLOW_NEW_SOURCES; } } else { type = MLD_CHANGE_TO_INCLUDE_MODE; if (mode == MCAST_UNDEFINED) record_has_sources = 0; } } } else { if (record_has_sources) { is_filter_list_change = 1; } else { type = MLD_DO_NOTHING; } } } else { /* * Queue a current state record. */ if (mode == MCAST_EXCLUDE) { type = MLD_MODE_IS_EXCLUDE; } else if (mode == MCAST_INCLUDE) { type = MLD_MODE_IS_INCLUDE; KASSERT(inm->in6m_st[1].iss_asm == 0, ("%s: inm %p is INCLUDE but ASM count is %d", __func__, inm, inm->in6m_st[1].iss_asm)); } } /* * Generate the filter list changes using a separate function. */ if (is_filter_list_change) return (mld_v2_enqueue_filter_change(mq, inm)); if (type == MLD_DO_NOTHING) { CTR3(KTR_MLD, "%s: nothing to do for %s/%s", __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); return (0); } /* * If any sources are present, we must be able to fit at least * one in the trailing space of the tail packet's mbuf, * ideally more. */ minrec0len = sizeof(struct mldv2_record); if (record_has_sources) minrec0len += sizeof(struct in6_addr); CTR4(KTR_MLD, "%s: queueing %s for %s/%s", __func__, mld_rec_type_to_str(type), ip6_sprintf(ip6tbuf, &inm->in6m_addr), if_name(inm->in6m_ifp)); /* * Check if we have a packet in the tail of the queue for this * group into which the first group record for this group will fit. * Otherwise allocate a new packet. * Always allocate leading space for IP6+RA+ICMPV6+REPORT. * Note: Group records for G/GSR query responses MUST be sent * in their own packet. */ m0 = mbufq_last(mq); if (!is_group_query && m0 != NULL && (m0->m_pkthdr.vt_nrecs + 1 <= MLD_V2_REPORT_MAXRECS) && (m0->m_pkthdr.len + minrec0len) < (ifp->if_mtu - MLD_MTUSPACE)) { m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); m = m0; CTR1(KTR_MLD, "%s: use existing packet", __func__); } else { if (mbufq_full(mq)) { CTR1(KTR_MLD, "%s: outbound queue full", __func__); return (-ENOMEM); } m = NULL; m0srcs = (ifp->if_mtu - MLD_MTUSPACE - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); if (!is_state_change && !is_group_query) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (-ENOMEM); mld_save_context(m, ifp); CTR1(KTR_MLD, "%s: allocated first packet", __func__); } /* * Append group record. * If we have sources, we don't know how many yet. 
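* For reference (RFC 3810, Section 5.2), the fixed 20-byte part laid * down here is mr_type (1 byte), mr_datalen (1 byte, auxiliary data in * 32-bit words, always 0 here), mr_numsrc (2 bytes, network order, * back-patched through pmr once the count is known) and mr_addr (16 * bytes); sources then follow as 16-byte in6_addr entries.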
*/ mr.mr_type = type; mr.mr_datalen = 0; mr.mr_numsrc = 0; mr.mr_addr = inm->in6m_addr; in6_clearscope(&mr.mr_addr); if (!m_append(m, sizeof(struct mldv2_record), (void *)&mr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed.", __func__); return (-ENOMEM); } nbytes += sizeof(struct mldv2_record); /* * Append as many sources as will fit in the first packet. * If we are appending to a new packet, the chain allocation * may potentially use clusters; use m_getptr() in this case. * If we are appending to an existing packet, we need to obtain * a pointer to the group record after m_append(), in case a new * mbuf was allocated. * * Only append sources which are in-mode at t1. If we are * transitioning to MCAST_UNDEFINED state on the group, and * use_block_allow is zero, do not include source entries. * Otherwise, we need to include this source in the report. * * Only report recorded sources in our filter set when responding * to a group-source query. */ if (record_has_sources) { if (m == m0) { md = m_last(m); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + md->m_len - nbytes); } else { md = m_getptr(m, 0, &off); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + off); } msrcs = 0; RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, nims) { CTR2(KTR_MLD, "%s: visit node %s", __func__, ip6_sprintf(ip6tbuf, &ims->im6s_addr)); now = im6s_get_mode(inm, ims, 1); CTR2(KTR_MLD, "%s: node is %d", __func__, now); if ((now != mode) || (now == mode && (!use_block_allow && mode == MCAST_UNDEFINED))) { CTR1(KTR_MLD, "%s: skip node", __func__); continue; } if (is_source_query && ims->im6s_stp == 0) { CTR1(KTR_MLD, "%s: skip unrecorded node", __func__); continue; } CTR1(KTR_MLD, "%s: append node", __func__); if (!m_append(m, sizeof(struct in6_addr), (void *)&ims->im6s_addr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed.", __func__); return (-ENOMEM); } nbytes += sizeof(struct in6_addr); ++msrcs; if (msrcs == m0srcs) break; } CTR2(KTR_MLD, "%s: msrcs is %d this packet", __func__, msrcs); pmr->mr_numsrc = htons(msrcs); nbytes += (msrcs * sizeof(struct in6_addr)); } if (is_source_query && msrcs == 0) { CTR1(KTR_MLD, "%s: no recorded sources to report", __func__); if (m != m0) m_freem(m); return (0); } /* * We are good to go with first packet. */ if (m != m0) { CTR1(KTR_MLD, "%s: enqueueing first packet", __func__); m->m_pkthdr.vt_nrecs = 1; mbufq_enqueue(mq, m); } else m->m_pkthdr.vt_nrecs++; /* * No further work needed if no source list in packet(s). */ if (!record_has_sources) return (nbytes); /* * Whilst sources remain to be announced, we need to allocate * a new packet and fill out as many sources as will fit. * Always try for a cluster first. 
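* (Worked arithmetic, a sketch assuming a 1500-byte MTU and an * MLD_MTUSPACE of 56 bytes, i.e. a 40-byte IPv6 header, an 8-byte * hop-by-hop router-alert option and an 8-byte v2 report header: each * follow-on packet below holds (1500 - 56 - 20) / 16 = 89 sources.)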
*/ while (nims != NULL) { if (mbufq_full(mq)) { CTR1(KTR_MLD, "%s: outbound queue full", __func__); return (-ENOMEM); } m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (-ENOMEM); mld_save_context(m, ifp); md = m_getptr(m, 0, &off); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + off); CTR1(KTR_MLD, "%s: allocated next packet", __func__); if (!m_append(m, sizeof(struct mldv2_record), (void *)&mr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed.", __func__); return (-ENOMEM); } m->m_pkthdr.vt_nrecs = 1; nbytes += sizeof(struct mldv2_record); m0srcs = (ifp->if_mtu - MLD_MTUSPACE - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); msrcs = 0; RB_FOREACH_FROM(ims, ip6_msource_tree, nims) { CTR2(KTR_MLD, "%s: visit node %s", __func__, ip6_sprintf(ip6tbuf, &ims->im6s_addr)); now = im6s_get_mode(inm, ims, 1); if ((now != mode) || (now == mode && (!use_block_allow && mode == MCAST_UNDEFINED))) { CTR1(KTR_MLD, "%s: skip node", __func__); continue; } if (is_source_query && ims->im6s_stp == 0) { CTR1(KTR_MLD, "%s: skip unrecorded node", __func__); continue; } CTR1(KTR_MLD, "%s: append node", __func__); if (!m_append(m, sizeof(struct in6_addr), (void *)&ims->im6s_addr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed.", __func__); return (-ENOMEM); } ++msrcs; if (msrcs == m0srcs) break; } pmr->mr_numsrc = htons(msrcs); nbytes += (msrcs * sizeof(struct in6_addr)); CTR1(KTR_MLD, "%s: enqueueing next packet", __func__); mbufq_enqueue(mq, m); } return (nbytes); } /* * Type used to mark record pass completion. * We exploit the fact we can cast to this easily from the * current filter modes on each ip_msource node. */ typedef enum { REC_NONE = 0x00, /* MCAST_UNDEFINED */ REC_ALLOW = 0x01, /* MCAST_INCLUDE */ REC_BLOCK = 0x02, /* MCAST_EXCLUDE */ REC_FULL = REC_ALLOW | REC_BLOCK } rectype_t; /* * Enqueue an MLDv2 filter list change to the given output queue. * * Source list filter state is held in an RB-tree. When the filter list * for a group is changed without changing its mode, we need to compute * the deltas between T0 and T1 for each source in the filter set, * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records. * * As we may potentially queue two record types, and the entire R-B tree * needs to be walked at once, we break this out into its own function * so we can generate a tightly packed queue of packets. * * XXX This could be written to only use one tree walk, although that makes * serializing into the mbuf chains a bit harder. For now we do two walks * which makes things easier on us, and it may or may not be harder on * the L2 cache. * * If successful the size of all data appended to the queue is returned, * otherwise an error code less than zero is returned, or zero if * no record(s) were appended. 
*/ static int mld_v2_enqueue_filter_change(struct mbufq *mq, struct in6_multi *inm) { static const int MINRECLEN = sizeof(struct mldv2_record) + sizeof(struct in6_addr); struct ifnet *ifp; struct mldv2_record mr; struct mldv2_record *pmr; struct ip6_msource *ims, *nims; struct mbuf *m, *m0, *md; int m0srcs, nbytes, npbytes, off, rsrcs, schanged; uint8_t mode, now, then; rectype_t crt, drt, nrt; #ifdef KTR int nallow, nblock; char ip6tbuf[INET6_ADDRSTRLEN]; #endif IN6_MULTI_LIST_LOCK_ASSERT(); if (inm->in6m_nsrc == 0 || (inm->in6m_st[0].iss_asm > 0 && inm->in6m_st[1].iss_asm > 0)) return (0); ifp = inm->in6m_ifp; /* interface */ mode = inm->in6m_st[1].iss_fmode; /* filter mode at t1 */ crt = REC_NONE; /* current group record type */ drt = REC_NONE; /* mask of completed group record types */ nrt = REC_NONE; /* record type for current node */ m0srcs = 0; /* # source which will fit in current mbuf chain */ npbytes = 0; /* # of bytes appended this packet */ nbytes = 0; /* # of bytes appended to group's state-change queue */ rsrcs = 0; /* # sources encoded in current record */ schanged = 0; /* # nodes encoded in overall filter change */ #ifdef KTR nallow = 0; /* # of source entries in ALLOW_NEW */ nblock = 0; /* # of source entries in BLOCK_OLD */ #endif nims = NULL; /* next tree node pointer */ /* * For each possible filter record mode. * The first kind of source we encounter tells us which * is the first kind of record we start appending. * If a node transitioned to UNDEFINED at t1, its mode is treated * as the inverse of the group's filter mode. */ while (drt != REC_FULL) { do { m0 = mbufq_last(mq); if (m0 != NULL && (m0->m_pkthdr.vt_nrecs + 1 <= MLD_V2_REPORT_MAXRECS) && (m0->m_pkthdr.len + MINRECLEN) < (ifp->if_mtu - MLD_MTUSPACE)) { m = m0; m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); CTR1(KTR_MLD, "%s: use previous packet", __func__); } else { m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { CTR1(KTR_MLD, "%s: m_get*() failed", __func__); return (-ENOMEM); } m->m_pkthdr.vt_nrecs = 0; mld_save_context(m, ifp); m0srcs = (ifp->if_mtu - MLD_MTUSPACE - sizeof(struct mldv2_record)) / sizeof(struct in6_addr); npbytes = 0; CTR1(KTR_MLD, "%s: allocated new packet", __func__); } /* * Append the MLD group record header to the * current packet's data area. * Recalculate pointer to free space for next * group record, in case m_append() allocated * a new mbuf or cluster. */ memset(&mr, 0, sizeof(mr)); mr.mr_addr = inm->in6m_addr; in6_clearscope(&mr.mr_addr); if (!m_append(m, sizeof(mr), (void *)&mr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed", __func__); return (-ENOMEM); } npbytes += sizeof(struct mldv2_record); if (m != m0) { /* new packet; offset in chain */ md = m_getptr(m, npbytes - sizeof(struct mldv2_record), &off); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + off); } else { /* current packet; offset from last append */ md = m_last(m); pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + md->m_len - sizeof(struct mldv2_record)); } /* * Begin walking the tree for this record type * pass, or continue from where we left off * previously if we had to allocate a new packet. * Only report deltas in-mode at t1. * We need not report included sources as allowed * if we are in inclusive mode on the group, * however the converse is not true. 
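* For example (illustrative): given deltas {allow S3, block S1}, the * first pass latches crt from the first in-mode node visited, say * REC_ALLOW, and emits ALLOW_NEW_SOURCES {S3}; the outer loop then flips * crt to the complementary type and a second walk emits * BLOCK_OLD_SOURCES {S1}, drt accumulating completed types until it * reaches REC_FULL.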
*/ rsrcs = 0; if (nims == NULL) { nims = RB_MIN(ip6_msource_tree, &inm->in6m_srcs); } RB_FOREACH_FROM(ims, ip6_msource_tree, nims) { CTR2(KTR_MLD, "%s: visit node %s", __func__, ip6_sprintf(ip6tbuf, &ims->im6s_addr)); now = im6s_get_mode(inm, ims, 1); then = im6s_get_mode(inm, ims, 0); CTR3(KTR_MLD, "%s: mode: t0 %d, t1 %d", __func__, then, now); if (now == then) { CTR1(KTR_MLD, "%s: skip unchanged", __func__); continue; } if (mode == MCAST_EXCLUDE && now == MCAST_INCLUDE) { CTR1(KTR_MLD, "%s: skip IN src on EX group", __func__); continue; } nrt = (rectype_t)now; if (nrt == REC_NONE) nrt = (rectype_t)(~mode & REC_FULL); if (schanged++ == 0) { crt = nrt; } else if (crt != nrt) continue; if (!m_append(m, sizeof(struct in6_addr), (void *)&ims->im6s_addr)) { if (m != m0) m_freem(m); CTR1(KTR_MLD, "%s: m_append() failed", __func__); return (-ENOMEM); } #ifdef KTR nallow += !!(crt == REC_ALLOW); nblock += !!(crt == REC_BLOCK); #endif if (++rsrcs == m0srcs) break; } /* * If we did not append any tree nodes on this * pass, back out of allocations. */ if (rsrcs == 0) { npbytes -= sizeof(struct mldv2_record); if (m != m0) { CTR1(KTR_MLD, "%s: m_free(m)", __func__); m_freem(m); } else { CTR1(KTR_MLD, "%s: m_adj(m, -mr)", __func__); m_adj(m, -((int)sizeof( struct mldv2_record))); } continue; } npbytes += (rsrcs * sizeof(struct in6_addr)); if (crt == REC_ALLOW) pmr->mr_type = MLD_ALLOW_NEW_SOURCES; else if (crt == REC_BLOCK) pmr->mr_type = MLD_BLOCK_OLD_SOURCES; pmr->mr_numsrc = htons(rsrcs); /* * Count the new group record, and enqueue this * packet if it wasn't already queued. */ m->m_pkthdr.vt_nrecs++; if (m != m0) mbufq_enqueue(mq, m); nbytes += npbytes; } while (nims != NULL); drt |= crt; crt = (~crt & REC_FULL); } CTR3(KTR_MLD, "%s: queued %d ALLOW_NEW, %d BLOCK_OLD", __func__, nallow, nblock); return (nbytes); } static int mld_v2_merge_state_changes(struct in6_multi *inm, struct mbufq *scq) { struct mbufq *gq; struct mbuf *m; /* pending state-change */ struct mbuf *m0; /* copy of pending state-change */ struct mbuf *mt; /* last state-change in packet */ int docopy, domerge; u_int recslen; docopy = 0; domerge = 0; recslen = 0; IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); /* * If there are further pending retransmissions, make a writable * copy of each queued state-change message before merging. */ if (inm->in6m_scrv > 0) docopy = 1; gq = &inm->in6m_scq; #ifdef KTR if (mbufq_first(gq) == NULL) { CTR2(KTR_MLD, "%s: WARNING: queue for inm %p is empty", __func__, inm); } #endif m = mbufq_first(gq); while (m != NULL) { /* * Only merge the report into the current packet if * there is sufficient space to do so; an MLDv2 report * packet may only contain 65,535 group records. * Always use a simple mbuf chain concatenation to do this, * as large state changes for single groups may have * allocated clusters.
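* * Concretely (illustrative): a pending report is merged into the tail * packet of scq only if the combined record count stays within the * 16-bit MLD_V2_REPORT_MAXRECS limit and the combined length still fits * in if_mtu - MLD_MTUSPACE; otherwise it is moved (or copied, when * retransmissions remain) to scq as a packet of its own.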
*/ domerge = 0; mt = mbufq_last(scq); if (mt != NULL) { recslen = m_length(m, NULL); if ((mt->m_pkthdr.vt_nrecs + m->m_pkthdr.vt_nrecs <= MLD_V2_REPORT_MAXRECS) && (mt->m_pkthdr.len + recslen <= (inm->in6m_ifp->if_mtu - MLD_MTUSPACE))) domerge = 1; } if (!domerge && mbufq_full(gq)) { CTR2(KTR_MLD, "%s: outbound queue full, skipping whole packet %p", __func__, m); mt = m->m_nextpkt; if (!docopy) m_freem(m); m = mt; continue; } if (!docopy) { CTR2(KTR_MLD, "%s: dequeueing %p", __func__, m); m0 = mbufq_dequeue(gq); m = m0->m_nextpkt; } else { CTR2(KTR_MLD, "%s: copying %p", __func__, m); m0 = m_dup(m, M_NOWAIT); if (m0 == NULL) return (ENOMEM); m0->m_nextpkt = NULL; m = m->m_nextpkt; } if (!domerge) { CTR3(KTR_MLD, "%s: queueing %p to scq %p", __func__, m0, scq); mbufq_enqueue(scq, m0); } else { struct mbuf *mtl; /* last mbuf of packet mt */ CTR3(KTR_MLD, "%s: merging %p with ifscq tail %p", __func__, m0, mt); mtl = m_last(mt); m0->m_flags &= ~M_PKTHDR; mt->m_pkthdr.len += recslen; mt->m_pkthdr.vt_nrecs += m0->m_pkthdr.vt_nrecs; mtl->m_next = m0; } } return (0); } /* * Respond to a pending MLDv2 General Query. */ static void mld_v2_dispatch_general_query(struct mld_ifsoftc *mli) { struct ifmultiaddr *ifma; struct ifnet *ifp; struct in6_multi *inm; int retval __unused; NET_EPOCH_ASSERT(); IN6_MULTI_LIST_LOCK_ASSERT(); MLD_LOCK_ASSERT(); KASSERT(mli->mli_version == MLD_VERSION_2, ("%s: called when version %d", __func__, mli->mli_version)); /* * Check that there are some packets queued. If so, send them first. * For a large number of groups the reply to a general query can take * many packets; we should finish sending them before starting to * queue the new reply. */ if (!mbufq_empty(&mli->mli_gq)) goto send; ifp = mli->mli_ifp; CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { inm = in6m_ifmultiaddr_get_inm(ifma); if (inm == NULL) continue; KASSERT(ifp == inm->in6m_ifp, ("%s: inconsistent ifp", __func__)); switch (inm->in6m_state) { case MLD_NOT_MEMBER: case MLD_SILENT_MEMBER: break; case MLD_REPORTING_MEMBER: case MLD_IDLE_MEMBER: case MLD_LAZY_MEMBER: case MLD_SLEEPING_MEMBER: case MLD_AWAKENING_MEMBER: inm->in6m_state = MLD_REPORTING_MEMBER; retval = mld_v2_enqueue_group_record(&mli->mli_gq, inm, 0, 0, 0, 0); CTR2(KTR_MLD, "%s: enqueue record = %d", __func__, retval); break; case MLD_G_QUERY_PENDING_MEMBER: case MLD_SG_QUERY_PENDING_MEMBER: case MLD_LEAVING_MEMBER: break; } } send: mld_dispatch_queue(&mli->mli_gq, MLD_MAX_RESPONSE_BURST); /* * Slew transmission of bursts over 500ms intervals. */ if (mbufq_first(&mli->mli_gq) != NULL) { mli->mli_v2_timer = 1 + MLD_RANDOM_DELAY( MLD_RESPONSE_BURST_INTERVAL); V_interface_timers_running6 = 1; } } /* * Transmit the next pending message in the output queue. * * VIMAGE: Needs to store/restore vnet pointer on a per-mbuf-chain basis. * MRT: Nothing needs to be done, as MLD traffic is always local to * a link and uses a link-scope multicast address. */ static void mld_dispatch_packet(struct mbuf *m) { struct ip6_moptions im6o; struct ifnet *ifp; struct ifnet *oifp; struct mbuf *m0; struct mbuf *md; struct ip6_hdr *ip6; struct mld_hdr *mld; int error; int off; int type; uint32_t ifindex; CTR2(KTR_MLD, "%s: transmit %p", __func__, m); NET_EPOCH_ASSERT(); /* * Set VNET image pointer from enqueued mbuf chain * before doing anything else. Whilst we use interface * indexes to guard against interface detach, they are * unique to each VIMAGE and must be retrieved. */ ifindex = mld_restore_context(m); /* * Check if the ifnet still exists.
This limits the scope of * any race in the absence of a global ifp lock for low cost * (an array lookup). */ ifp = ifnet_byindex(ifindex); if (ifp == NULL) { CTR3(KTR_MLD, "%s: dropped %p as ifindex %u went away.", __func__, m, ifindex); m_freem(m); IP6STAT_INC(ip6s_noroute); goto out; } im6o.im6o_multicast_hlim = 1; im6o.im6o_multicast_loop = (V_ip6_mrouter != NULL); im6o.im6o_multicast_ifp = ifp; if (m->m_flags & M_MLDV1) { m0 = m; } else { m0 = mld_v2_encap_report(ifp, m); if (m0 == NULL) { CTR2(KTR_MLD, "%s: dropped %p", __func__, m); IP6STAT_INC(ip6s_odropped); goto out; } } mld_scrub_context(m0); m_clrprotoflags(m); m0->m_pkthdr.rcvif = V_loif; ip6 = mtod(m0, struct ip6_hdr *); #if 0 (void)in6_setscope(&ip6->ip6_dst, ifp, NULL); /* XXX LOR */ #else /* * XXX XXX Break some KPI rules to prevent an LOR which would * occur if we called in6_setscope() at transmission. * See comments at top of file. */ MLD_EMBEDSCOPE(&ip6->ip6_dst, ifp->if_index); #endif /* * Retrieve the ICMPv6 type before handoff to ip6_output(), * so we can bump the stats. */ md = m_getptr(m0, sizeof(struct ip6_hdr), &off); mld = (struct mld_hdr *)(mtod(md, uint8_t *) + off); type = mld->mld_type; oifp = NULL; error = ip6_output(m0, &mld_po, NULL, IPV6_UNSPECSRC, &im6o, &oifp, NULL); if (error) { CTR3(KTR_MLD, "%s: ip6_output(%p) = %d", __func__, m0, error); goto out; } - ICMP6STAT_INC(icp6s_outhist[type]); + ICMP6STAT_INC2(icp6s_outhist, type); if (oifp != NULL) { icmp6_ifstat_inc(oifp, ifs6_out_msg); switch (type) { case MLD_LISTENER_REPORT: case MLDV2_LISTENER_REPORT: icmp6_ifstat_inc(oifp, ifs6_out_mldreport); break; case MLD_LISTENER_DONE: icmp6_ifstat_inc(oifp, ifs6_out_mlddone); break; } } out: return; } /* * Encapsulate an MLDv2 report. * * KAME IPv6 requires that hop-by-hop options be passed separately, * and that the IPv6 header be prepended in a separate mbuf. * * Returns a pointer to the new mbuf chain head, or NULL if the * allocation failed. */ static struct mbuf * mld_v2_encap_report(struct ifnet *ifp, struct mbuf *m) { struct mbuf *mh; struct mldv2_report *mld; struct ip6_hdr *ip6; struct in6_ifaddr *ia; int mldreclen; KASSERT(ifp != NULL, ("%s: null ifp", __func__)); KASSERT((m->m_flags & M_PKTHDR), ("%s: mbuf chain %p is !M_PKTHDR", __func__, m)); /* * RFC3590: OK to send as :: or tentative during DAD. */ NET_EPOCH_ASSERT(); ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST); if (ia == NULL) CTR1(KTR_MLD, "%s: warning: ia is NULL", __func__); mh = m_gethdr(M_NOWAIT, MT_DATA); if (mh == NULL) { if (ia != NULL) ifa_free(&ia->ia_ifa); m_freem(m); return (NULL); } M_ALIGN(mh, sizeof(struct ip6_hdr) + sizeof(struct mldv2_report)); mldreclen = m_length(m, NULL); CTR2(KTR_MLD, "%s: mldreclen is %d", __func__, mldreclen); mh->m_len = sizeof(struct ip6_hdr) + sizeof(struct mldv2_report); mh->m_pkthdr.len = sizeof(struct ip6_hdr) + sizeof(struct mldv2_report) + mldreclen; ip6 = mtod(mh, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_src = ia ? 
ia->ia_addr.sin6_addr : in6addr_any; if (ia != NULL) ifa_free(&ia->ia_ifa); ip6->ip6_dst = in6addr_linklocal_allv2routers; /* scope ID will be set in netisr */ mld = (struct mldv2_report *)(ip6 + 1); mld->mld_type = MLDV2_LISTENER_REPORT; mld->mld_code = 0; mld->mld_cksum = 0; mld->mld_v2_reserved = 0; mld->mld_v2_numrecs = htons(m->m_pkthdr.vt_nrecs); m->m_pkthdr.vt_nrecs = 0; mh->m_next = m; mld->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), sizeof(struct mldv2_report) + mldreclen); return (mh); } #ifdef KTR static char * mld_rec_type_to_str(const int type) { switch (type) { case MLD_CHANGE_TO_EXCLUDE_MODE: return "TO_EX"; break; case MLD_CHANGE_TO_INCLUDE_MODE: return "TO_IN"; break; case MLD_MODE_IS_EXCLUDE: return "MODE_EX"; break; case MLD_MODE_IS_INCLUDE: return "MODE_IN"; break; case MLD_ALLOW_NEW_SOURCES: return "ALLOW_NEW"; break; case MLD_BLOCK_OLD_SOURCES: return "BLOCK_OLD"; break; default: break; } return "unknown"; } #endif static void mld_init(void *unused __unused) { CTR1(KTR_MLD, "%s: initializing", __func__); MLD_LOCK_INIT(); ip6_initpktopts(&mld_po); mld_po.ip6po_hlim = 1; mld_po.ip6po_hbh = &mld_ra.hbh; mld_po.ip6po_prefer_tempaddr = IP6PO_TEMPADDR_NOTPREFER; mld_po.ip6po_flags = IP6PO_DONTFRAG; callout_init(&mldslow_callout, 1); callout_reset(&mldslow_callout, hz / MLD_SLOWHZ, mld_slowtimo, NULL); callout_init(&mldfast_callout, 1); callout_reset(&mldfast_callout, hz / MLD_FASTHZ, mld_fasttimo, NULL); } SYSINIT(mld_init, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE, mld_init, NULL); static void mld_uninit(void *unused __unused) { CTR1(KTR_MLD, "%s: tearing down", __func__); callout_drain(&mldslow_callout); callout_drain(&mldfast_callout); MLD_LOCK_DESTROY(); } SYSUNINIT(mld_uninit, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE, mld_uninit, NULL); static void vnet_mld_init(const void *unused __unused) { CTR1(KTR_MLD, "%s: initializing", __func__); LIST_INIT(&V_mli_head); } VNET_SYSINIT(vnet_mld_init, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mld_init, NULL); static void vnet_mld_uninit(const void *unused __unused) { /* This can happen if we shutdown the network stack. */ CTR1(KTR_MLD, "%s: tearing down", __func__); } VNET_SYSUNINIT(vnet_mld_uninit, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mld_uninit, NULL); static int mld_modevent(module_t mod, int type, void *unused __unused) { switch (type) { case MOD_LOAD: case MOD_UNLOAD: break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t mld_mod = { "mld", mld_modevent, 0 }; DECLARE_MODULE(mld, mld_mod, SI_SUB_PROTO_MC, SI_ORDER_ANY); diff --git a/sys/netinet6/nd6_nbr.c b/sys/netinet6/nd6_nbr.c index 353db66c2323..640348a1d198 100644 --- a/sys/netinet6/nd6_nbr.c +++ b/sys/netinet6/nd6_nbr.c @@ -1,1619 +1,1619 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: nd6_nbr.c,v 1.86 2002/01/21 02:33:04 jinmei Exp $ */ #include #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define SDL(s) ((struct sockaddr_dl *)s) struct dadq; static struct dadq *nd6_dad_find(struct ifaddr *, struct nd_opt_nonce *); static void nd6_dad_add(struct dadq *dp); static void nd6_dad_del(struct dadq *dp); static void nd6_dad_rele(struct dadq *); static void nd6_dad_starttimer(struct dadq *, int); static void nd6_dad_stoptimer(struct dadq *); static void nd6_dad_timer(void *); static void nd6_dad_duplicated(struct ifaddr *, struct dadq *); static void nd6_dad_ns_output(struct dadq *); static void nd6_dad_ns_input(struct ifaddr *, struct nd_opt_nonce *); static void nd6_dad_na_input(struct ifaddr *); static void nd6_na_output_fib(struct ifnet *, const struct in6_addr *, const struct in6_addr *, u_long, int, struct sockaddr *, u_int); static void nd6_ns_output_fib(struct ifnet *, const struct in6_addr *, const struct in6_addr *, const struct in6_addr *, uint8_t *, u_int); static struct ifaddr *nd6_proxy_fill_sdl(struct ifnet *, const struct in6_addr *, struct sockaddr_dl *); VNET_DEFINE_STATIC(int, dad_enhanced) = 1; #define V_dad_enhanced VNET(dad_enhanced) SYSCTL_DECL(_net_inet6_ip6); SYSCTL_INT(_net_inet6_ip6, OID_AUTO, dad_enhanced, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(dad_enhanced), 0, "Enable Enhanced DAD, which adds a random nonce to NS messages for DAD."); VNET_DEFINE_STATIC(int, dad_maxtry) = 15; /* max # of *tries* to transmit DAD packet */ #define V_dad_maxtry VNET(dad_maxtry) VNET_DEFINE_STATIC(int, nd6_onlink_ns_rfc4861) = 0; #define V_nd6_onlink_ns_rfc4861 VNET(nd6_onlink_ns_rfc4861) SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_ONLINKNSRFC4861, nd6_onlink_ns_rfc4861, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_onlink_ns_rfc4861), 0, "Accept 'on-link' ICMPv6 NS messages in compliance with RFC 4861"); /* * Input a Neighbor Solicitation Message. 
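* * (E.g., illustrative: a DAD probe for target 2001:db8::1234:5678 must * arrive addressed to that target's solicited-node group * ff02::1:ff34:5678, the low-order 24 bits of the target appended to * ff02::1:ff00:0/104, which is what the daddr6 checks below enforce for * an unspecified source.)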
* * Based on RFC 2461 * Based on RFC 2462 (duplicate address detection) */ void nd6_ns_input(struct mbuf *m, int off, int icmp6len) { struct ifnet *ifp; struct ip6_hdr *ip6; struct nd_neighbor_solicit *nd_ns; struct in6_addr daddr6, myaddr6, saddr6, taddr6; struct ifaddr *ifa; struct sockaddr_dl proxydl; union nd_opts ndopts; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; char *lladdr; int anycast, lladdrlen, proxy, rflag, tentative, tlladdr; ifa = NULL; /* RFC 6980: Nodes MUST silently ignore fragments */ if(m->m_flags & M_FRAGMENTED) goto freeit; ifp = m->m_pkthdr.rcvif; ip6 = mtod(m, struct ip6_hdr *); if (__predict_false(ip6->ip6_hlim != 255)) { ICMP6STAT_INC(icp6s_invlhlim); nd6log((LOG_ERR, "nd6_ns_input: invalid hlim (%d) from %s to %s on %s\n", ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp))); goto bads; } if (m->m_len < off + icmp6len) { m = m_pullup(m, off + icmp6len); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); return; } } ip6 = mtod(m, struct ip6_hdr *); nd_ns = (struct nd_neighbor_solicit *)((caddr_t)ip6 + off); saddr6 = ip6->ip6_src; daddr6 = ip6->ip6_dst; taddr6 = nd_ns->nd_ns_target; if (in6_setscope(&taddr6, ifp, NULL) != 0) goto bad; rflag = (V_ip6_forwarding) ? ND_NA_FLAG_ROUTER : 0; if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV && V_ip6_norbit_raif) rflag = 0; if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) { /* dst has to be a solicited node multicast address. */ if (daddr6.s6_addr16[0] == IPV6_ADDR_INT16_MLL && /* don't check ifindex portion */ daddr6.s6_addr32[1] == 0 && daddr6.s6_addr32[2] == IPV6_ADDR_INT32_ONE && daddr6.s6_addr8[12] == 0xff) { ; /* good */ } else { nd6log((LOG_INFO, "nd6_ns_input: bad DAD packet " "(wrong ip6 dst)\n")); goto bad; } } else if (!V_nd6_onlink_ns_rfc4861) { struct sockaddr_in6 src_sa6; /* * According to recent IETF discussions, it is not a good idea * to accept a NS from an address which would not be deemed * to be a neighbor otherwise. This point is expected to be * clarified in future revisions of the specification. */ bzero(&src_sa6, sizeof(src_sa6)); src_sa6.sin6_family = AF_INET6; src_sa6.sin6_len = sizeof(src_sa6); src_sa6.sin6_addr = saddr6; if (nd6_is_addr_neighbor(&src_sa6, ifp) == 0) { nd6log((LOG_INFO, "nd6_ns_input: " "NS packet from non-neighbor\n")); goto bad; } } if (IN6_IS_ADDR_MULTICAST(&taddr6)) { nd6log((LOG_INFO, "nd6_ns_input: bad NS target (multicast)\n")); goto bad; } icmp6len -= sizeof(*nd_ns); nd6_option_init(nd_ns + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "nd6_ns_input: invalid ND option, ignored\n")); /* nd6_options have incremented stats */ goto freeit; } lladdr = NULL; lladdrlen = 0; if (ndopts.nd_opts_src_lladdr) { lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1); lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3; } if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) && lladdr) { nd6log((LOG_INFO, "nd6_ns_input: bad DAD packet " "(link-layer address option)\n")); goto bad; } /* * Attaching target link-layer address to the NA? * (RFC 2461 7.2.4) * * NS IP dst is unicast/anycast MUST NOT add * NS IP dst is solicited-node multicast MUST add * * In implementation, we add target link-layer address by default. * We do not add one in MUST NOT cases. 
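* * (Worked size check, illustrative: on Ethernet if_addrlen is 6, so the * source link-layer option validated below must be * ((6 + 2 + 7) & ~7) = 8 bytes, exactly one 8-byte ND option unit, and * lladdrlen arrives as nd_opt_len << 3 = 8.)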
*/ if (!IN6_IS_ADDR_MULTICAST(&daddr6)) tlladdr = 0; else tlladdr = 1; /* * Target address (taddr6) must be either: * (1) Valid unicast/anycast address for my receiving interface, * (2) Unicast address for which I'm offering proxy service, or * (3) "tentative" address on which DAD is being performed. */ /* (1) and (3) check. */ if (ifp->if_carp) ifa = (*carp_iamatch6_p)(ifp, &taddr6); else ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6); /* (2) check. */ proxy = 0; if (ifa == NULL) { if ((ifa = nd6_proxy_fill_sdl(ifp, &taddr6, &proxydl)) != NULL) proxy = 1; } if (ifa == NULL) { /* * We've got an NS packet, and we don't have that address * assigned for us. We MUST silently ignore it. * See RFC2461 7.2.3. */ goto freeit; } myaddr6 = *IFA_IN6(ifa); anycast = ((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST; tentative = ((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_TENTATIVE; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DUPLICATED) goto freeit; if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "nd6_ns_input: lladdrlen mismatch for %s " "(if %d, NS packet %d)\n", ip6_sprintf(ip6bufs, &taddr6), ifp->if_addrlen, lladdrlen - 2)); goto bad; } if (IN6_ARE_ADDR_EQUAL(&myaddr6, &saddr6)) { nd6log((LOG_INFO, "nd6_ns_input: duplicate IP6 address %s\n", ip6_sprintf(ip6bufs, &saddr6))); goto freeit; } /* * We have a neighbor solicitation packet, with a target address equal * to one of my tentative addresses. * * src addr how to process? * --- --- * multicast of course, invalid (rejected in ip6_input) * unicast somebody is doing address resolution -> ignore * unspec dup address detection * * The processing is defined in RFC 2462. */ if (tentative) { /* * If the source address is the unspecified address, it is for * duplicate address detection. * * If not, the packet is for address resolution; * silently ignore it. */ if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) nd6_dad_ns_input(ifa, ndopts.nd_opts_nonce); goto freeit; } /* * If the source address is the unspecified address, entries must not * be created or updated. * It looks like the sender is performing DAD. Output an NA toward * the all-nodes multicast address, to tell the sender that I'm using * the address. * The S bit ("solicited") must be zero. */ if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) { struct in6_addr in6_all; in6_all = in6addr_linklocal_allnodes; if (in6_setscope(&in6_all, ifp, NULL) != 0) goto bad; nd6_na_output_fib(ifp, &in6_all, &taddr6, ((anycast || proxy || !tlladdr) ? 0 : ND_NA_FLAG_OVERRIDE) | rflag, tlladdr, proxy ? (struct sockaddr *)&proxydl : NULL, M_GETFIB(m)); goto freeit; } nd6_cache_lladdr(ifp, &saddr6, lladdr, lladdrlen, ND_NEIGHBOR_SOLICIT, 0); nd6_na_output_fib(ifp, &saddr6, &taddr6, ((anycast || proxy || !tlladdr) ? 0 : ND_NA_FLAG_OVERRIDE) | rflag | ND_NA_FLAG_SOLICITED, tlladdr, proxy ?
(struct sockaddr *)&proxydl : NULL, M_GETFIB(m)); freeit: if (ifa != NULL) ifa_free(ifa); m_freem(m); return; bad: nd6log((LOG_ERR, "nd6_ns_input: src=%s\n", ip6_sprintf(ip6bufs, &saddr6))); nd6log((LOG_ERR, "nd6_ns_input: dst=%s\n", ip6_sprintf(ip6bufs, &daddr6))); nd6log((LOG_ERR, "nd6_ns_input: tgt=%s\n", ip6_sprintf(ip6bufs, &taddr6))); bads: ICMP6STAT_INC(icp6s_badns); if (ifa != NULL) ifa_free(ifa); m_freem(m); } static struct ifaddr * nd6_proxy_fill_sdl(struct ifnet *ifp, const struct in6_addr *taddr6, struct sockaddr_dl *sdl) { struct ifaddr *ifa; struct llentry *ln; ifa = NULL; ln = nd6_lookup(taddr6, LLE_SF(AF_INET6, 0), ifp); if (ln == NULL) return (ifa); if ((ln->la_flags & (LLE_PUB | LLE_VALID)) == (LLE_PUB | LLE_VALID)) { link_init_sdl(ifp, (struct sockaddr *)sdl, ifp->if_type); sdl->sdl_alen = ifp->if_addrlen; bcopy(ln->ll_addr, &sdl->sdl_data, ifp->if_addrlen); LLE_RUNLOCK(ln); ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST); } else LLE_RUNLOCK(ln); return (ifa); } /* * Output a Neighbor Solicitation Message. Caller specifies: * - ICMP6 header source IP6 address * - ND6 header target IP6 address * - ND6 header source datalink address * * Based on RFC 2461 * Based on RFC 2462 (duplicate address detection) * * ln - for source address determination * nonce - If non-NULL, NS is used for duplicate address detection and * the value (length is ND_OPT_NONCE_LEN) is used as a random nonce. */ static void nd6_ns_output_fib(struct ifnet *ifp, const struct in6_addr *saddr6, const struct in6_addr *daddr6, const struct in6_addr *taddr6, uint8_t *nonce, u_int fibnum) { struct mbuf *m; struct m_tag *mtag; struct ip6_hdr *ip6; struct nd_neighbor_solicit *nd_ns; struct ip6_moptions im6o; int icmp6len; int maxlen; NET_EPOCH_ASSERT(); if (IN6_IS_ADDR_MULTICAST(taddr6)) return; /* estimate the size of message */ maxlen = sizeof(*ip6) + sizeof(*nd_ns); maxlen += (sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7; KASSERT(max_linkhdr + maxlen <= MCLBYTES, ( "%s: max_linkhdr + maxlen > MCLBYTES (%d + %d > %d)", __func__, max_linkhdr, maxlen, MCLBYTES)); if (max_linkhdr + maxlen > MHLEN) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return; M_SETFIB(m, fibnum); if (daddr6 == NULL || IN6_IS_ADDR_MULTICAST(daddr6)) { m->m_flags |= M_MCAST; im6o.im6o_multicast_ifp = ifp; im6o.im6o_multicast_hlim = 255; im6o.im6o_multicast_loop = 0; } icmp6len = sizeof(*nd_ns); m->m_pkthdr.len = m->m_len = sizeof(*ip6) + icmp6len; m->m_data += max_linkhdr; /* or M_ALIGN() equivalent? */ /* fill neighbor solicitation packet */ ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; /* ip6->ip6_plen will be set later */ ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 255; if (daddr6) ip6->ip6_dst = *daddr6; else { ip6->ip6_dst.s6_addr16[0] = IPV6_ADDR_INT16_MLL; ip6->ip6_dst.s6_addr16[1] = 0; ip6->ip6_dst.s6_addr32[1] = 0; ip6->ip6_dst.s6_addr32[2] = IPV6_ADDR_INT32_ONE; ip6->ip6_dst.s6_addr32[3] = taddr6->s6_addr32[3]; ip6->ip6_dst.s6_addr8[12] = 0xff; if (in6_setscope(&ip6->ip6_dst, ifp, NULL) != 0) goto bad; } if (nonce == NULL) { char ip6buf[INET6_ADDRSTRLEN]; struct ifaddr *ifa = NULL; /* * RFC2461 7.2.2: * "If the source address of the packet prompting the * solicitation is the same as one of the addresses assigned * to the outgoing interface, that address SHOULD be placed * in the IP Source Address of the outgoing solicitation. 
* Otherwise, any one of the addresses assigned to the * interface should be used." * * We use the source address for the prompting packet * (saddr6), if saddr6 belongs to the outgoing interface. * Otherwise, we perform the source address selection as usual. */ if (saddr6 != NULL) ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, saddr6); if (ifa == NULL) { int error; struct in6_addr dst6, src6; uint32_t scopeid; in6_splitscope(&ip6->ip6_dst, &dst6, &scopeid); error = in6_selectsrc_addr(fibnum, &dst6, scopeid, ifp, &src6, NULL); if (error) { nd6log((LOG_DEBUG, "%s: source can't be " "determined: dst=%s, error=%d\n", __func__, ip6_sprintf(ip6buf, &dst6), error)); goto bad; } ip6->ip6_src = src6; } else ip6->ip6_src = *saddr6; if (ifp->if_carp != NULL) { /* * Check that selected source address belongs to * CARP addresses. */ if (ifa == NULL) ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &ip6->ip6_src); /* * Do not send NS for CARP address if we are not * the CARP master. */ if (ifa != NULL && ifa->ifa_carp != NULL && !(*carp_master_p)(ifa)) { nd6log((LOG_DEBUG, "nd6_ns_output: NS from BACKUP CARP address %s\n", ip6_sprintf(ip6buf, &ip6->ip6_src))); ifa_free(ifa); goto bad; } } if (ifa != NULL) ifa_free(ifa); } else { /* * Source address for DAD packet must always be IPv6 * unspecified address. (0::0) * We actually don't have to 0-clear the address (we did it * above), but we do so here explicitly to make the intention * clearer. */ bzero(&ip6->ip6_src, sizeof(ip6->ip6_src)); } nd_ns = (struct nd_neighbor_solicit *)(ip6 + 1); nd_ns->nd_ns_type = ND_NEIGHBOR_SOLICIT; nd_ns->nd_ns_code = 0; nd_ns->nd_ns_reserved = 0; nd_ns->nd_ns_target = *taddr6; in6_clearscope(&nd_ns->nd_ns_target); /* XXX */ /* * Add source link-layer address option. * * spec implementation * --- --- * DAD packet MUST NOT do not add the option * there's no link layer address: * impossible do not add the option * there's link layer address: * Multicast NS MUST add one add the option * Unicast NS SHOULD add one add the option */ if (nonce == NULL) { struct nd_opt_hdr *nd_opt; char *mac; int optlen; mac = NULL; if (ifp->if_carp) mac = (*carp_macmatch6_p)(ifp, m, &ip6->ip6_src); if (mac == NULL) mac = nd6_ifptomac(ifp); if (mac != NULL) { nd_opt = (struct nd_opt_hdr *)(nd_ns + 1); optlen = sizeof(struct nd_opt_hdr) + ifp->if_addrlen; /* 8 byte alignments... */ optlen = (optlen + 7) & ~7; m->m_pkthdr.len += optlen; m->m_len += optlen; icmp6len += optlen; bzero(nd_opt, optlen); nd_opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR; nd_opt->nd_opt_len = optlen >> 3; bcopy(mac, nd_opt + 1, ifp->if_addrlen); } } /* * Add a Nonce option (RFC 3971) to detect looped back NS messages. * This behavior is documented as Enhanced Duplicate Address * Detection in RFC 7527. * net.inet6.ip6.dad_enhanced=0 disables this. */ if (V_dad_enhanced != 0 && nonce != NULL) { int optlen = sizeof(struct nd_opt_hdr) + ND_OPT_NONCE_LEN; struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd_ns + 1); /* 8-byte alignment is required. 
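 */

/*
 * Illustrative note (an assumption, not part of this change): with
 * ND_OPT_NONCE_LEN bytes of nonce (6 in the stock tree), the option is
 * sizeof(struct nd_opt_hdr) + 6 = 8 bytes, already a whole 8-byte
 * unit, so nd_opt_len below comes out as 8 >> 3 = 1, matching the
 * RFC 3971 wire format | type | len | nonce |.
 */
#if 0
_Static_assert(sizeof(struct nd_opt_hdr) + ND_OPT_NONCE_LEN == 8,
    "nonce option expected to occupy exactly one 8-byte unit");
#endif

/*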
*/ optlen = (optlen + 7) & ~7; m->m_pkthdr.len += optlen; m->m_len += optlen; icmp6len += optlen; bzero((caddr_t)nd_opt, optlen); nd_opt->nd_opt_type = ND_OPT_NONCE; nd_opt->nd_opt_len = optlen >> 3; bcopy(nonce, (caddr_t)(nd_opt + 1), ND_OPT_NONCE_LEN); } ip6->ip6_plen = htons((u_short)icmp6len); nd_ns->nd_ns_cksum = 0; nd_ns->nd_ns_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(*ip6), icmp6len); if (send_sendso_input_hook != NULL) { mtag = m_tag_get(PACKET_TAG_ND_OUTGOING, sizeof(unsigned short), M_NOWAIT); if (mtag == NULL) goto bad; *(unsigned short *)(mtag + 1) = nd_ns->nd_ns_type; m_tag_prepend(m, mtag); } ip6_output(m, NULL, NULL, (nonce != NULL) ? IPV6_UNSPECSRC : 0, &im6o, NULL, NULL); icmp6_ifstat_inc(ifp, ifs6_out_msg); icmp6_ifstat_inc(ifp, ifs6_out_neighborsolicit); - ICMP6STAT_INC(icp6s_outhist[ND_NEIGHBOR_SOLICIT]); + ICMP6STAT_INC2(icp6s_outhist, ND_NEIGHBOR_SOLICIT); return; bad: m_freem(m); } #ifndef BURN_BRIDGES void nd6_ns_output(struct ifnet *ifp, const struct in6_addr *saddr6, const struct in6_addr *daddr6, const struct in6_addr *taddr6,uint8_t *nonce) { nd6_ns_output_fib(ifp, saddr6, daddr6, taddr6, nonce, RT_DEFAULT_FIB); } #endif /* * Neighbor advertisement input handling. * * Based on RFC 2461 * Based on RFC 2462 (duplicate address detection) * * the following items are not implemented yet: * - proxy advertisement delay rule (RFC2461 7.2.8, last paragraph, SHOULD) * - anycast advertisement delay rule (RFC2461 7.2.7, SHOULD) */ void nd6_na_input(struct mbuf *m, int off, int icmp6len) { struct ifnet *ifp; struct ip6_hdr *ip6; struct ifaddr *ifa; struct llentry *ln; struct mbuf *chain; struct nd_neighbor_advert *nd_na; struct in6_addr daddr6, taddr6; union nd_opts ndopts; u_char linkhdr[LLE_MAX_LINKHDR]; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; char *lladdr; size_t linkhdrsize; int flags, is_override, is_router, is_solicited; int lladdr_off, lladdrlen, checklink; bool flush_holdchain = false; NET_EPOCH_ASSERT(); chain = NULL; ln = NULL; checklink = 0; /* RFC 6980: Nodes MUST silently ignore fragments */ if(m->m_flags & M_FRAGMENTED) goto freeit; ifp = m->m_pkthdr.rcvif; ip6 = mtod(m, struct ip6_hdr *); if (__predict_false(ip6->ip6_hlim != 255)) { ICMP6STAT_INC(icp6s_invlhlim); nd6log((LOG_ERR, "nd6_na_input: invalid hlim (%d) from %s to %s on %s\n", ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp))); goto bad; } if (m->m_len < off + icmp6len) { m = m_pullup(m, off + icmp6len); if (m == NULL) { IP6STAT_INC(ip6s_exthdrtoolong); return; } } ip6 = mtod(m, struct ip6_hdr *); nd_na = (struct nd_neighbor_advert *)((caddr_t)ip6 + off); flags = nd_na->nd_na_flags_reserved; is_router = ((flags & ND_NA_FLAG_ROUTER) != 0); is_solicited = ((flags & ND_NA_FLAG_SOLICITED) != 0); is_override = ((flags & ND_NA_FLAG_OVERRIDE) != 0); taddr6 = nd_na->nd_na_target; if (in6_setscope(&taddr6, ifp, NULL)) goto bad; /* XXX: impossible */ if (IN6_IS_ADDR_MULTICAST(&taddr6)) { nd6log((LOG_ERR, "nd6_na_input: invalid target address %s\n", ip6_sprintf(ip6bufs, &taddr6))); goto bad; } daddr6 = ip6->ip6_dst; if (IN6_IS_ADDR_MULTICAST(&daddr6)) if (is_solicited) { nd6log((LOG_ERR, "nd6_na_input: a solicited adv is multicasted\n")); goto bad; } icmp6len -= sizeof(*nd_na); nd6_option_init(nd_na + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { nd6log((LOG_INFO, "nd6_na_input: invalid ND option, ignored\n")); /* nd6_options have incremented stats */ goto freeit; } lladdr = NULL; lladdrlen = 0; if (ndopts.nd_opts_tgt_lladdr) { 
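
/*
 * Illustrative sketch, not the authoritative definition: the
 * statistics change above replaces ICMP6STAT_INC(icp6s_outhist[type])
 * with ICMP6STAT_INC2(icp6s_outhist, type), passing the histogram
 * index as a separate macro argument.  One plausible shape for such a
 * macro, assuming it mirrors ICMP6STAT_INC's per-CPU increment and
 * hooks the MIB SDT probes added to sys/sys/sdt.h later in this patch:
 */
#if 0
#define	ICMP6STAT_INC2(name, type) do {					\
	MIB_SDT_PROBE2(icmp6, count, name, 1, type);			\
	VNET_PCPUSTAT_ADD(struct icmp6stat, icmp6stat, name[type], 1);	\
} while (0)
#endif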
lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1); lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3; } ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6); if (ifa != NULL && ifa->ifa_carp != NULL) { /* * Silently ignore NAs for CARP addresses if we are not * the CARP master. */ if (!(*carp_master_p)(ifa)) { nd6log((LOG_DEBUG, "nd6_na_input: NA for BACKUP CARP address %s\n", ip6_sprintf(ip6bufs, &taddr6))); ifa_free(ifa); goto freeit; } } /* * Target address matches one of my interface address. * * If my address is tentative, this means that there's somebody * already using the same address as mine. This indicates DAD failure. * This is defined in RFC 2462. * * Otherwise, process as defined in RFC 2461. */ if (ifa && (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_TENTATIVE)) { nd6_dad_na_input(ifa); ifa_free(ifa); goto freeit; } /* Just for safety, maybe unnecessary. */ if (ifa) { ifa_free(ifa); log(LOG_ERR, "nd6_na_input: duplicate IP6 address %s\n", ip6_sprintf(ip6bufs, &taddr6)); goto freeit; } if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, "nd6_na_input: lladdrlen mismatch for %s " "(if %d, NA packet %d)\n", ip6_sprintf(ip6bufs, &taddr6), ifp->if_addrlen, lladdrlen - 2)); goto bad; } /* * If no neighbor cache entry is found, NA SHOULD silently be * discarded. */ ln = nd6_lookup(&taddr6, LLE_SF(AF_INET6, LLE_EXCLUSIVE), ifp); if (ln == NULL) { goto freeit; } /* * Do not try to override static entry. */ if (ln->la_flags & LLE_STATIC) goto freeit; if (ln->ln_state == ND6_LLINFO_INCOMPLETE) { /* * If the link-layer has address, and no lladdr option came, * discard the packet. */ if (ifp->if_addrlen && lladdr == NULL) { goto freeit; } /* * Record link-layer address, and update the state. */ if (!nd6_try_set_entry_addr(ifp, ln, lladdr)) goto freeit; flush_holdchain = true; if (is_solicited) nd6_llinfo_setstate(ln, ND6_LLINFO_REACHABLE); else nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED); if ((ln->ln_router = is_router) != 0) { /* * This means a router's state has changed from * non-reachable to probably reachable, and might * affect the status of associated prefixes.. */ checklink = 1; } } else { int llchange; /* * Check if the link-layer address has changed or not. */ if (lladdr == NULL) llchange = 0; else { if (ln->la_flags & LLE_VALID) { if (bcmp(lladdr, ln->ll_addr, ifp->if_addrlen)) llchange = 1; else llchange = 0; } else llchange = 1; } /* * This is VERY complex. Look at it with care. * * override solicit lladdr llchange action * (L: record lladdr) * * 0 0 n -- (2c) * 0 0 y n (2b) L * 0 0 y y (1) REACHABLE->STALE * 0 1 n -- (2c) *->REACHABLE * 0 1 y n (2b) L *->REACHABLE * 0 1 y y (1) REACHABLE->STALE * 1 0 n -- (2a) * 1 0 y n (2a) L * 1 0 y y (2a) L *->STALE * 1 1 n -- (2a) *->REACHABLE * 1 1 y n (2a) L *->REACHABLE * 1 1 y y (2a) L *->REACHABLE */ if (!is_override && (lladdr != NULL && llchange)) { /* (1) */ /* * If state is REACHABLE, make it STALE. * no other updates should be done. */ if (ln->ln_state == ND6_LLINFO_REACHABLE) nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); goto freeit; } else if (is_override /* (2a) */ || (!is_override && (lladdr != NULL && !llchange)) /* (2b) */ || lladdr == NULL) { /* (2c) */ /* * Update link-local address, if any. 
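 */

/*
 * Illustrative sketch (not part of this change): the decision table
 * above reduced to a predicate.  It returns false only for case (1),
 * where a changed link-layer address without the Override flag merely
 * demotes a REACHABLE entry to STALE; in every other case (2a/2b/2c)
 * the advertisement may update the cache.
 */
#if 0
static bool
na_may_update_lladdr(bool is_override, bool have_lladdr, bool llchange)
{
	if (!is_override && have_lladdr && llchange)
		return (false);			/* case (1) */
	return (true);				/* cases (2a), (2b), (2c) */
}
#endif

/*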
*/ if (lladdr != NULL) { linkhdrsize = sizeof(linkhdr); if (lltable_calc_llheader(ifp, AF_INET6, lladdr, linkhdr, &linkhdrsize, &lladdr_off) != 0) goto freeit; if (lltable_try_set_entry_addr(ifp, ln, linkhdr, linkhdrsize, lladdr_off) == 0) goto freeit; EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED); } /* * If solicited, make the state REACHABLE. * If not solicited and the link-layer address was * changed, make it STALE. */ if (is_solicited) nd6_llinfo_setstate(ln, ND6_LLINFO_REACHABLE); else { if (lladdr != NULL && llchange) nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); } } if (ln->ln_router && !is_router) { /* * The peer dropped the router flag. * Remove the sender from the Default Router List and * update the Destination Cache entries. */ struct ifnet *nd6_ifp; nd6_ifp = lltable_get_ifp(ln->lle_tbl); if (!defrouter_remove(&ln->r_l3addr.addr6, nd6_ifp) && (ND_IFINFO(nd6_ifp)->flags & ND6_IFF_ACCEPT_RTADV) != 0) /* * Even if the neighbor is not in the default * router list, the neighbor may be used as a * next hop for some destinations (e.g. redirect * case). So we must call rt6_flush explicitly. */ rt6_flush(&ip6->ip6_src, ifp); } ln->ln_router = is_router; } /* XXX - QL * Does this matter? * rt->rt_flags &= ~RTF_REJECT; */ ln->la_asked = 0; if (ln->la_hold != NULL) chain = nd6_grab_holdchain(ln); freeit: if (ln != NULL) LLE_WUNLOCK(ln); if (chain != NULL) nd6_flush_holdchain(ifp, ln, chain); if (flush_holdchain) nd6_flush_children_holdchain(ifp, ln); if (checklink) pfxlist_onlink_check(); m_freem(m); return; bad: if (ln != NULL) LLE_WUNLOCK(ln); ICMP6STAT_INC(icp6s_badna); m_freem(m); } /* * Neighbor advertisement output handling. * * Based on RFC 2461 * * the following items are not implemented yet: * - proxy advertisement delay rule (RFC2461 7.2.8, last paragraph, SHOULD) * - anycast advertisement delay rule (RFC2461 7.2.7, SHOULD) * * tlladdr - 1 if include target link-layer address * sdl0 - sockaddr_dl (= proxy NA) or NULL */ static void nd6_na_output_fib(struct ifnet *ifp, const struct in6_addr *daddr6_0, const struct in6_addr *taddr6, u_long flags, int tlladdr, struct sockaddr *sdl0, u_int fibnum) { struct mbuf *m; struct m_tag *mtag; struct ip6_hdr *ip6; struct nd_neighbor_advert *nd_na; struct ip6_moptions im6o; struct in6_addr daddr6, dst6, src6; uint32_t scopeid; NET_EPOCH_ASSERT(); int icmp6len, maxlen, error; caddr_t mac = NULL; daddr6 = *daddr6_0; /* make a local copy for modification */ /* estimate the size of message */ maxlen = sizeof(*ip6) + sizeof(*nd_na); maxlen += (sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7; KASSERT(max_linkhdr + maxlen <= MCLBYTES, ( "%s: max_linkhdr + maxlen > MCLBYTES (%d + %d > %d)", __func__, max_linkhdr, maxlen, MCLBYTES)); if (max_linkhdr + maxlen > MHLEN) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return; M_SETFIB(m, fibnum); if (IN6_IS_ADDR_MULTICAST(&daddr6)) { m->m_flags |= M_MCAST; im6o.im6o_multicast_ifp = ifp; im6o.im6o_multicast_hlim = 255; im6o.im6o_multicast_loop = 0; } icmp6len = sizeof(*nd_na); m->m_pkthdr.len = m->m_len = sizeof(struct ip6_hdr) + icmp6len; m->m_data += max_linkhdr; /* or M_ALIGN() equivalent? 
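 */

/*
 * Illustrative arithmetic for the estimate above, assuming a typical
 * Ethernet interface (if_addrlen = 6): 40 (IPv6 header) + 24 (NA
 * header) + 8 (padded link-layer option) = 72 bytes, so a plain
 * packet-header mbuf suffices unless max_linkhdr pushes the total
 * past MHLEN.
 */
#if 0
	int example_maxlen = sizeof(struct ip6_hdr) +		/* 40 */
	    sizeof(struct nd_neighbor_advert) +			/* 24 */
	    ((sizeof(struct nd_opt_hdr) + 6 + 7) & ~7);		/*  8 */
#endif

/*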
*/ /* fill neighbor advertisement packet */ ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 255; if (IN6_IS_ADDR_UNSPECIFIED(&daddr6)) { /* reply to DAD */ daddr6.s6_addr16[0] = IPV6_ADDR_INT16_MLL; daddr6.s6_addr16[1] = 0; daddr6.s6_addr32[1] = 0; daddr6.s6_addr32[2] = 0; daddr6.s6_addr32[3] = IPV6_ADDR_INT32_ONE; if (in6_setscope(&daddr6, ifp, NULL)) goto bad; flags &= ~ND_NA_FLAG_SOLICITED; } ip6->ip6_dst = daddr6; /* * Select a source whose scope is the same as that of the dest. */ in6_splitscope(&daddr6, &dst6, &scopeid); error = in6_selectsrc_addr(fibnum, &dst6, scopeid, ifp, &src6, NULL); if (error) { char ip6buf[INET6_ADDRSTRLEN]; nd6log((LOG_DEBUG, "nd6_na_output: source can't be " "determined: dst=%s, error=%d\n", ip6_sprintf(ip6buf, &daddr6), error)); goto bad; } ip6->ip6_src = src6; nd_na = (struct nd_neighbor_advert *)(ip6 + 1); nd_na->nd_na_type = ND_NEIGHBOR_ADVERT; nd_na->nd_na_code = 0; nd_na->nd_na_target = *taddr6; in6_clearscope(&nd_na->nd_na_target); /* XXX */ /* * "tlladdr" indicates NS's condition for adding tlladdr or not. * see nd6_ns_input() for details. * Basically, if NS packet is sent to unicast/anycast addr, * target lladdr option SHOULD NOT be included. */ if (tlladdr) { /* * sdl0 != NULL indicates proxy NA. If we do proxy, use * lladdr in sdl0. If we are not proxying (sending NA for * my address) use lladdr configured for the interface. */ if (sdl0 == NULL) { if (ifp->if_carp) mac = (*carp_macmatch6_p)(ifp, m, taddr6); if (mac == NULL) mac = nd6_ifptomac(ifp); } else if (sdl0->sa_family == AF_LINK) { struct sockaddr_dl *sdl; sdl = (struct sockaddr_dl *)sdl0; if (sdl->sdl_alen == ifp->if_addrlen) mac = LLADDR(sdl); } } if (tlladdr && mac) { int optlen = sizeof(struct nd_opt_hdr) + ifp->if_addrlen; struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd_na + 1); /* roundup to 8 bytes alignment! 
*/ optlen = (optlen + 7) & ~7; m->m_pkthdr.len += optlen; m->m_len += optlen; icmp6len += optlen; bzero((caddr_t)nd_opt, optlen); nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; nd_opt->nd_opt_len = optlen >> 3; bcopy(mac, (caddr_t)(nd_opt + 1), ifp->if_addrlen); } else flags &= ~ND_NA_FLAG_OVERRIDE; ip6->ip6_plen = htons((u_short)icmp6len); nd_na->nd_na_flags_reserved = flags; nd_na->nd_na_cksum = 0; nd_na->nd_na_cksum = in6_cksum(m, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), icmp6len); if (send_sendso_input_hook != NULL) { mtag = m_tag_get(PACKET_TAG_ND_OUTGOING, sizeof(unsigned short), M_NOWAIT); if (mtag == NULL) goto bad; *(unsigned short *)(mtag + 1) = nd_na->nd_na_type; m_tag_prepend(m, mtag); } ip6_output(m, NULL, NULL, 0, &im6o, NULL, NULL); icmp6_ifstat_inc(ifp, ifs6_out_msg); icmp6_ifstat_inc(ifp, ifs6_out_neighboradvert); - ICMP6STAT_INC(icp6s_outhist[ND_NEIGHBOR_ADVERT]); + ICMP6STAT_INC2(icp6s_outhist, ND_NEIGHBOR_ADVERT); return; bad: m_freem(m); } #ifndef BURN_BRIDGES void nd6_na_output(struct ifnet *ifp, const struct in6_addr *daddr6_0, const struct in6_addr *taddr6, u_long flags, int tlladdr, struct sockaddr *sdl0) { nd6_na_output_fib(ifp, daddr6_0, taddr6, flags, tlladdr, sdl0, RT_DEFAULT_FIB); } #endif caddr_t nd6_ifptomac(struct ifnet *ifp) { switch (ifp->if_type) { case IFT_ETHER: case IFT_IEEE1394: case IFT_L2VLAN: case IFT_INFINIBAND: case IFT_BRIDGE: return IF_LLADDR(ifp); default: return NULL; } } struct dadq { TAILQ_ENTRY(dadq) dad_list; struct ifaddr *dad_ifa; int dad_count; /* max NS to send */ int dad_ns_tcount; /* # of trials to send NS */ int dad_ns_ocount; /* NS sent so far */ int dad_ns_icount; int dad_na_icount; int dad_ns_lcount; /* looped back NS */ int dad_loopbackprobe; /* probing state for loopback detection */ struct callout dad_timer_ch; struct vnet *dad_vnet; u_int dad_refcnt; #define ND_OPT_NONCE_LEN32 \ ((ND_OPT_NONCE_LEN + sizeof(uint32_t) - 1)/sizeof(uint32_t)) uint32_t dad_nonce[ND_OPT_NONCE_LEN32]; bool dad_ondadq; /* on dadq? Protected by DADQ_WLOCK. */ }; VNET_DEFINE_STATIC(TAILQ_HEAD(, dadq), dadq); VNET_DEFINE_STATIC(struct rwlock, dad_rwlock); #define V_dadq VNET(dadq) #define V_dad_rwlock VNET(dad_rwlock) #define DADQ_LOCKPTR() (&V_dad_rwlock) #define DADQ_LOCK_INIT() rw_init(DADQ_LOCKPTR(), "nd6 DAD queue") #define DADQ_RLOCK() rw_rlock(DADQ_LOCKPTR()) #define DADQ_RUNLOCK() rw_runlock(DADQ_LOCKPTR()) #define DADQ_WLOCK() rw_wlock(DADQ_LOCKPTR()) #define DADQ_WUNLOCK() rw_wunlock(DADQ_LOCKPTR()) #define DADQ_LOCK_ASSERT() rw_assert(DADQ_LOCKPTR(), RA_LOCKED); #define DADQ_RLOCK_ASSERT() rw_assert(DADQ_LOCKPTR(), RA_RLOCKED); #define DADQ_WLOCK_ASSERT() rw_assert(DADQ_LOCKPTR(), RA_WLOCKED); static void nd6_dad_add(struct dadq *dp) { DADQ_WLOCK_ASSERT(); TAILQ_INSERT_TAIL(&V_dadq, dp, dad_list); dp->dad_ondadq = true; } static void nd6_dad_del(struct dadq *dp) { DADQ_WLOCK_ASSERT(); if (dp->dad_ondadq) { /* * Remove dp from the dadq and release the dadq's * reference. */ TAILQ_REMOVE(&V_dadq, dp, dad_list); dp->dad_ondadq = false; nd6_dad_rele(dp); } } static struct dadq * nd6_dad_find(struct ifaddr *ifa, struct nd_opt_nonce *n) { struct dadq *dp; DADQ_LOCK_ASSERT(); TAILQ_FOREACH(dp, &V_dadq, dad_list) { if (dp->dad_ifa != ifa) continue; /* * Skip if the nonce matches the received one. * +2 in the length is required because of type and * length fields are included in a header. 
*/ if (n != NULL && n->nd_opt_nonce_len == (ND_OPT_NONCE_LEN + 2) / 8 && memcmp(&n->nd_opt_nonce[0], &dp->dad_nonce[0], ND_OPT_NONCE_LEN) == 0) { dp->dad_ns_lcount++; continue; } break; } return (dp); } static void nd6_dad_starttimer(struct dadq *dp, int ticks) { DADQ_WLOCK_ASSERT(); callout_reset(&dp->dad_timer_ch, ticks, nd6_dad_timer, dp); } static void nd6_dad_stoptimer(struct dadq *dp) { callout_drain(&dp->dad_timer_ch); } static void nd6_dad_rele(struct dadq *dp) { if (refcount_release(&dp->dad_refcnt)) { KASSERT(!dp->dad_ondadq, ("dp %p still on DAD queue", dp)); ifa_free(dp->dad_ifa); free(dp, M_IP6NDP); } } void nd6_dad_init(void) { DADQ_LOCK_INIT(); TAILQ_INIT(&V_dadq); } /* * Start Duplicate Address Detection (DAD) for specified interface address. */ void nd6_dad_start(struct ifaddr *ifa, int delay) { struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct dadq *dp; char ip6buf[INET6_ADDRSTRLEN]; KASSERT((ia->ia6_flags & IN6_IFF_TENTATIVE) != 0, ("starting DAD on non-tentative address %p", ifa)); /* * If we don't need DAD, don't do it. * There are several cases: * - DAD is disabled globally or on the interface * - the interface address is anycast */ if ((ia->ia6_flags & IN6_IFF_ANYCAST) != 0 || V_ip6_dad_count == 0 || (ND_IFINFO(ifa->ifa_ifp)->flags & ND6_IFF_NO_DAD) != 0) { ia->ia6_flags &= ~IN6_IFF_TENTATIVE; return; } if ((ifa->ifa_ifp->if_flags & IFF_UP) == 0 || (ifa->ifa_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || (ND_IFINFO(ifa->ifa_ifp)->flags & ND6_IFF_IFDISABLED) != 0) return; DADQ_WLOCK(); if ((dp = nd6_dad_find(ifa, NULL)) != NULL) { /* * DAD is already in progress. Let the existing entry * finish it. */ DADQ_WUNLOCK(); return; } dp = malloc(sizeof(*dp), M_IP6NDP, M_NOWAIT | M_ZERO); if (dp == NULL) { log(LOG_ERR, "nd6_dad_start: memory allocation failed for " "%s(%s)\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr), ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"); return; } callout_init_rw(&dp->dad_timer_ch, DADQ_LOCKPTR(), CALLOUT_RETURNUNLOCKED); #ifdef VIMAGE dp->dad_vnet = curvnet; #endif nd6log((LOG_DEBUG, "%s: starting DAD for %s\n", if_name(ifa->ifa_ifp), ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr))); /* * Send NS packet for DAD, ip6_dad_count times. * Note that we must delay the first transmission, if this is the * first packet to be sent from the interface after interface * (re)initialization. */ dp->dad_ifa = ifa; ifa_ref(dp->dad_ifa); dp->dad_count = V_ip6_dad_count; dp->dad_ns_icount = dp->dad_na_icount = 0; dp->dad_ns_ocount = dp->dad_ns_tcount = 0; dp->dad_ns_lcount = dp->dad_loopbackprobe = 0; /* Add this to the dadq and add a reference for the dadq. */ refcount_init(&dp->dad_refcnt, 1); nd6_dad_add(dp); nd6_dad_starttimer(dp, delay); DADQ_WUNLOCK(); } /* * terminate DAD unconditionally. used for address removals. */ void nd6_dad_stop(struct ifaddr *ifa) { struct dadq *dp; DADQ_WLOCK(); dp = nd6_dad_find(ifa, NULL); if (dp == NULL) { DADQ_WUNLOCK(); /* DAD wasn't started yet */ return; } /* * Acquire a temporary reference so that we can safely stop the callout. 
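 */

/*
 * Illustrative note: the stop path below is an instance of a common
 * callout teardown pattern: pin the object with a temporary reference,
 * unlink it under the lock, then callout_drain() with no locks held
 * (drain may sleep), and finally drop the temporary reference.  A
 * generic sketch with hypothetical helpers:
 */
#if 0
	ref_acquire(obj);		/* pin the object */
	LOCK();
	unlink(obj);			/* drops the queue's reference */
	UNLOCK();
	callout_drain(&obj->timer);	/* may sleep; no locks held */
	ref_release(obj);		/* may free obj */
#endif

/*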
*/
	(void)refcount_acquire(&dp->dad_refcnt);
	nd6_dad_del(dp);
	DADQ_WUNLOCK();
	nd6_dad_stoptimer(dp);
	nd6_dad_rele(dp);
}

static void
nd6_dad_timer(void *arg)
{
	struct dadq *dp = arg;
	struct ifaddr *ifa = dp->dad_ifa;
	struct ifnet *ifp = dp->dad_ifa->ifa_ifp;
	struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa;
	char ip6buf[INET6_ADDRSTRLEN];
	struct epoch_tracker et;

	CURVNET_SET(dp->dad_vnet);
	KASSERT(ia != NULL, ("DAD entry %p with no address", dp));

	NET_EPOCH_ENTER(et);
	if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) {
		/* Do not need DAD for ifdisabled interface. */
		log(LOG_ERR, "nd6_dad_timer: cancel DAD on %s because of "
		    "ND6_IFF_IFDISABLED.\n", ifp->if_xname);
		goto err;
	}
	if (ia->ia6_flags & IN6_IFF_DUPLICATED) {
		log(LOG_ERR, "nd6_dad_timer: called with duplicated address "
		    "%s(%s)\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr),
		    ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???");
		goto err;
	}
	if ((ia->ia6_flags & IN6_IFF_TENTATIVE) == 0) {
		log(LOG_ERR, "nd6_dad_timer: called with non-tentative address "
		    "%s(%s)\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr),
		    ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???");
		goto err;
	}

	/* Stop DAD if the interface is down even after dad_maxtry attempts. */
	if ((dp->dad_ns_tcount > V_dad_maxtry) &&
	    (((ifp->if_flags & IFF_UP) == 0) ||
	     ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0))) {
		nd6log((LOG_INFO, "%s: could not run DAD "
		    "because the interface was down or not running.\n",
		    if_name(ifa->ifa_ifp)));
		goto err;
	}

	/* Need more checks? */
	if (dp->dad_ns_ocount < dp->dad_count) {
		/*
		 * We have more NS to go.  Send NS packet for DAD.
		 */
		nd6_dad_starttimer(dp,
		    (long)ND_IFINFO(ifa->ifa_ifp)->retrans * hz / 1000);
		nd6_dad_ns_output(dp);
		goto done;
	} else {
		/*
		 * We have transmitted a sufficient number of DAD packets.
		 * See what we've got.
		 */
		if (dp->dad_ns_icount > 0 || dp->dad_na_icount > 0) {
			/* We've seen an NS or NA; DAD has failed. */
			nd6_dad_duplicated(ifa, dp);
		} else if (V_dad_enhanced != 0 &&
		    dp->dad_ns_lcount > 0 &&
		    dp->dad_ns_lcount > dp->dad_loopbackprobe) {
			/*
			 * Sec. 4.1 in RFC 7527 requires transmission of
			 * additional probes until the loopback condition
			 * becomes clear when a looped back probe is detected.
			 */
			log(LOG_ERR, "%s: a looped back NS message is "
			    "detected during DAD for %s.  "
			    "Additional DAD probes are being sent.\n",
			    if_name(ifa->ifa_ifp),
			    ip6_sprintf(ip6buf, IFA_IN6(ifa)));
			dp->dad_loopbackprobe = dp->dad_ns_lcount;
			/*
			 * Send an NS immediately and increase dad_count by
			 * V_nd6_mmaxtries - 1.
			 */
			dp->dad_count = dp->dad_ns_ocount + V_nd6_mmaxtries - 1;
			nd6_dad_starttimer(dp,
			    (long)ND_IFINFO(ifa->ifa_ifp)->retrans * hz / 1000);
			nd6_dad_ns_output(dp);
			goto done;
		} else {
			/*
			 * We are done with DAD.  No NA came, no NS came.
			 * No duplicate address found.  Check the IFDISABLED
			 * flag again in case it changed between the
			 * beginning of this function and here.
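 */

/*
 * Illustrative arithmetic for the extension above: with dad_ns_ocount
 * probes already sent, dad_count = dad_ns_ocount + V_nd6_mmaxtries - 1
 * yields V_nd6_mmaxtries - 1 additional probes, the first transmitted
 * immediately and the rest from this timer (2 extra with a default of
 * 3, an assumption based on the usual MAX_MULTICAST_SOLICIT value).
 */
#if 0
	int ocount = 1, mmaxtries = 3;
	int new_dad_count = ocount + mmaxtries - 1;	/* == 3 */
#endif

/*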
*/ if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) == 0) ia->ia6_flags &= ~IN6_IFF_TENTATIVE; nd6log((LOG_DEBUG, "%s: DAD complete for %s - no duplicates found\n", if_name(ifa->ifa_ifp), ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr))); if (dp->dad_ns_lcount > 0) log(LOG_ERR, "%s: DAD completed while " "a looped back NS message is detected " "during DAD for %s.\n", if_name(ifa->ifa_ifp), ip6_sprintf(ip6buf, IFA_IN6(ifa))); } } err: nd6_dad_del(dp); DADQ_WUNLOCK(); done: NET_EPOCH_EXIT(et); CURVNET_RESTORE(); } static void nd6_dad_duplicated(struct ifaddr *ifa, struct dadq *dp) { struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct ifnet *ifp; char ip6buf[INET6_ADDRSTRLEN]; log(LOG_ERR, "%s: DAD detected duplicate IPv6 address %s: " "NS in/out/loopback=%d/%d/%d, NA in=%d\n", if_name(ifa->ifa_ifp), ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr), dp->dad_ns_icount, dp->dad_ns_ocount, dp->dad_ns_lcount, dp->dad_na_icount); ia->ia6_flags &= ~IN6_IFF_TENTATIVE; ia->ia6_flags |= IN6_IFF_DUPLICATED; ifp = ifa->ifa_ifp; log(LOG_ERR, "%s: DAD complete for %s - duplicate found\n", if_name(ifp), ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr)); log(LOG_ERR, "%s: manual intervention required\n", if_name(ifp)); /* * If the address is a link-local address formed from an interface * identifier based on the hardware address which is supposed to be * uniquely assigned (e.g., EUI-64 for an Ethernet interface), IP * operation on the interface SHOULD be disabled. * [RFC 4862, Section 5.4.5] */ if (IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr)) { struct in6_addr in6; /* * To avoid over-reaction, we only apply this logic when we are * very sure that hardware addresses are supposed to be unique. */ switch (ifp->if_type) { case IFT_ETHER: case IFT_ATM: case IFT_IEEE1394: case IFT_INFINIBAND: in6 = ia->ia_addr.sin6_addr; if (in6_get_hw_ifid(ifp, &in6) == 0 && IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr, &in6)) { ND_IFINFO(ifp)->flags |= ND6_IFF_IFDISABLED; log(LOG_ERR, "%s: possible hardware address " "duplication detected, disable IPv6\n", if_name(ifp)); } break; } } } /* * Transmit a neighbour solicitation for the purpose of DAD. Returns with the * DAD queue unlocked. */ static void nd6_dad_ns_output(struct dadq *dp) { struct in6_ifaddr *ia = (struct in6_ifaddr *)dp->dad_ifa; struct ifnet *ifp = dp->dad_ifa->ifa_ifp; int i; DADQ_WLOCK_ASSERT(); dp->dad_ns_tcount++; if ((ifp->if_flags & IFF_UP) == 0) { DADQ_WUNLOCK(); return; } if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { DADQ_WUNLOCK(); return; } dp->dad_ns_ocount++; if (V_dad_enhanced != 0) { for (i = 0; i < ND_OPT_NONCE_LEN32; i++) dp->dad_nonce[i] = arc4random(); /* * XXXHRS: Note that in the case that * DupAddrDetectTransmits > 1, multiple NS messages with * different nonces can be looped back in an unexpected * order. The current implementation recognizes only * the latest nonce on the sender side. Practically it * should work well in almost all cases. */ } DADQ_WUNLOCK(); nd6_ns_output(ifp, NULL, NULL, &ia->ia_addr.sin6_addr, (uint8_t *)&dp->dad_nonce[0]); } static void nd6_dad_ns_input(struct ifaddr *ifa, struct nd_opt_nonce *ndopt_nonce) { struct dadq *dp; if (ifa == NULL) panic("ifa == NULL in nd6_dad_ns_input"); /* Ignore Nonce option when Enhanced DAD is disabled. 
*/ if (V_dad_enhanced == 0) ndopt_nonce = NULL; DADQ_RLOCK(); dp = nd6_dad_find(ifa, ndopt_nonce); if (dp != NULL) dp->dad_ns_icount++; DADQ_RUNLOCK(); } static void nd6_dad_na_input(struct ifaddr *ifa) { struct dadq *dp; if (ifa == NULL) panic("ifa == NULL in nd6_dad_na_input"); DADQ_RLOCK(); dp = nd6_dad_find(ifa, NULL); if (dp != NULL) dp->dad_na_icount++; DADQ_RUNLOCK(); } diff --git a/sys/netinet6/raw_ip6.c b/sys/netinet6/raw_ip6.c index 3264de331817..b4421509ec62 100644 --- a/sys/netinet6/raw_ip6.c +++ b/sys/netinet6/raw_ip6.c @@ -1,870 +1,870 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_ipsec.h" #include "opt_inet6.h" #include "opt_route.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define satosin6(sa) ((struct sockaddr_in6 *)(sa)) #define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa)) /* * Raw interface to IP6 protocol. */ VNET_DECLARE(struct inpcbinfo, ripcbinfo); #define V_ripcbinfo VNET(ripcbinfo) extern u_long rip_sendspace; extern u_long rip_recvspace; VNET_PCPUSTAT_DEFINE(struct rip6stat, rip6stat); VNET_PCPUSTAT_SYSINIT(rip6stat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(rip6stat); #endif /* VIMAGE */ /* * Hooks for multicast routing. They all default to NULL, so leave them not * initialized and rely on BSS being set to 0. */ /* * The socket used to communicate with the multicast routing daemon. */ VNET_DEFINE(struct socket *, ip6_mrouter); /* * The various mrouter functions. */ int (*ip6_mrouter_set)(struct socket *, struct sockopt *); int (*ip6_mrouter_get)(struct socket *, struct sockopt *); int (*ip6_mrouter_done)(void); int (*ip6_mforward)(struct ip6_hdr *, struct ifnet *, struct mbuf *); int (*mrt6_ioctl)(u_long, caddr_t); struct rip6_inp_match_ctx { struct ip6_hdr *ip6; int proto; }; static bool rip6_inp_match(const struct inpcb *inp, void *v) { struct rip6_inp_match_ctx *c = v; struct ip6_hdr *ip6 = c->ip6; int proto = c->proto; /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) return (false); if (inp->inp_ip_p && inp->inp_ip_p != proto) return (false); if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst)) return (false); if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src)) return (false); return (true); } /* * Setup generic address and protocol structures for raw_input routine, then * pass them along with mbuf chain. */ int rip6_input(struct mbuf **mp, int *offp, int proto) { struct ifnet *ifp; struct mbuf *n, *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct inpcb *inp; struct mbuf *opts = NULL; struct sockaddr_in6 fromsa; struct rip6_inp_match_ctx ctx = { .ip6 = ip6, .proto = proto }; struct inpcb_iterator inpi = INP_ITERATOR(&V_ripcbinfo, INPLOOKUP_RLOCKPCB, rip6_inp_match, &ctx); int delivered = 0; NET_EPOCH_ASSERT(); RIP6STAT_INC(rip6s_ipackets); init_sin6(&fromsa, m, 0); /* general init */ ifp = m->m_pkthdr.rcvif; while ((inp = inp_next(&inpi)) != NULL) { INP_RLOCK_ASSERT(inp); #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * Check AH/ESP integrity. */ if (IPSEC_ENABLED(ipv6) && IPSEC_CHECK_POLICY(ipv6, m, inp) != 0) { /* Do not inject data into pcb. 
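 */

/*
 * Userland sketch (illustrative, not kernel code): a raw IPv6 socket
 * that rip6_input() would deliver to, using the experimentation
 * protocol number 253 (RFC 3692).  Delivery follows rip6_inp_match()
 * above: protocol, then bound local and connected foreign address.
 */
#if 0
#include <sys/socket.h>
#include <netinet/in.h>

int
open_raw_socket(void)
{
	/* Requires privilege (PRIV_NETINET_RAW at attach time). */
	return (socket(AF_INET6, SOCK_RAW, 253));
}
#endif

/*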
*/ continue; } #endif /* IPSEC */ if (jailed_without_vnet(inp->inp_cred) && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && prison_check_ip6(inp->inp_cred, &ip6->ip6_dst) != 0) /* * Allow raw socket in jail to receive multicast; * assume process had PRIV_NETINET_RAW at attach, * and fall through into normal filter path if so. */ continue; if (inp->in6p_cksum != -1) { RIP6STAT_INC(rip6s_isum); if (m->m_pkthdr.len - (*offp + inp->in6p_cksum) < 2 || in6_cksum(m, proto, *offp, m->m_pkthdr.len - *offp)) { RIP6STAT_INC(rip6s_badsum); /* * Drop the received message, don't send an * ICMP6 message. Set proto to IPPROTO_NONE * to achieve that. */ INP_RUNLOCK(inp); proto = IPPROTO_NONE; break; } } /* * If this raw socket has multicast state, and we * have received a multicast, check if this socket * should receive it, as multicast filtering is now * the responsibility of the transport layer. */ if (inp->in6p_moptions && IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { /* * If the incoming datagram is for MLD, allow it * through unconditionally to the raw socket. * * Use the M_RTALERT_MLD flag to check for MLD * traffic without having to inspect the mbuf chain * more deeply, as all MLDv1/v2 host messages MUST * contain the Router Alert option. * * In the case of MLDv1, we may not have explicitly * joined the group, and may have set IFF_ALLMULTI * on the interface. im6o_mc_filter() may discard * control traffic we actually need to see. * * Userland multicast routing daemons should continue * filter the control traffic appropriately. */ int blocked; blocked = MCAST_PASS; if ((m->m_flags & M_RTALERT_MLD) == 0) { struct sockaddr_in6 mcaddr; bzero(&mcaddr, sizeof(struct sockaddr_in6)); mcaddr.sin6_len = sizeof(struct sockaddr_in6); mcaddr.sin6_family = AF_INET6; mcaddr.sin6_addr = ip6->ip6_dst; blocked = im6o_mc_filter(inp->in6p_moptions, ifp, (struct sockaddr *)&mcaddr, (struct sockaddr *)&fromsa); } if (blocked != MCAST_PASS) { IP6STAT_INC(ip6s_notmember); continue; } } if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL) continue; if (inp->inp_flags & INP_CONTROLOPTS || inp->inp_socket->so_options & SO_TIMESTAMP) ip6_savecontrol(inp, n, &opts); /* strip intermediate headers */ m_adj(n, *offp); if (sbappendaddr(&inp->inp_socket->so_rcv, (struct sockaddr *)&fromsa, n, opts) == 0) { soroverflow(inp->inp_socket); m_freem(n); if (opts) m_freem(opts); RIP6STAT_INC(rip6s_fullsock); } else { sorwakeup(inp->inp_socket); delivered++; } opts = NULL; } if (delivered == 0) { RIP6STAT_INC(rip6s_nosock); if (m->m_flags & M_MCAST) RIP6STAT_INC(rip6s_nosockmcast); if (proto == IPPROTO_NONE) m_freem(m); else icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_NEXTHEADER, ip6_get_prevhdr(m, *offp)); IP6STAT_DEC(ip6s_delivered); } else m_freem(m); return (IPPROTO_DONE); } void rip6_ctlinput(struct ip6ctlparam *ip6cp) { int errno; if ((errno = icmp6_errmap(ip6cp->ip6c_icmp6)) != 0) in6_pcbnotify(&V_ripcbinfo, ip6cp->ip6c_finaldst, 0, ip6cp->ip6c_src, 0, errno, ip6cp->ip6c_cmdarg, in6_rtchange); } /* * Generate IPv6 header and pass packet to ip6_output. Tack on options user * may have setup with control call. 
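 */

/*
 * Userland sketch (illustrative, not kernel code): an ICMPv6 Echo
 * Request pushed through this send path.  For ICMPv6 sockets the
 * kernel computes the checksum itself (at offset 2, as the code below
 * shows), so the field can be left zero.
 */
#if 0
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/icmp6.h>

void
send_echo(int s, const struct sockaddr_in6 *dst)
{
	struct icmp6_hdr echo;

	memset(&echo, 0, sizeof(echo));
	echo.icmp6_type = ICMP6_ECHO_REQUEST;
	(void)sendto(s, &echo, sizeof(echo), 0,
	    (const struct sockaddr *)dst, sizeof(*dst));
}
#endif

/*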
*/ static int rip6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { struct epoch_tracker et; struct inpcb *inp; struct sockaddr_in6 tmp, *dstsock; struct m_tag *mtag; struct ip6_hdr *ip6; u_int plen = m->m_pkthdr.len; struct ip6_pktopts opt, *optp; struct ifnet *oifp = NULL; int error; int type = 0, code = 0; /* for ICMPv6 output statistics only */ int scope_ambiguous = 0; int use_defzone = 0; int hlim = 0; struct in6_addr in6a; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_send: inp == NULL")); /* Always copy sockaddr to avoid overwrites. */ /* Unlocked read. */ if (so->so_state & SS_ISCONNECTED) { if (nam) { error = EISCONN; goto release; } tmp = (struct sockaddr_in6 ){ .sin6_family = AF_INET6, .sin6_len = sizeof(struct sockaddr_in6), }; INP_RLOCK(inp); bcopy(&inp->in6p_faddr, &tmp.sin6_addr, sizeof(struct in6_addr)); INP_RUNLOCK(inp); dstsock = &tmp; } else { if (nam == NULL) error = ENOTCONN; else if (nam->sa_family != AF_INET6) error = EAFNOSUPPORT; else if (nam->sa_len != sizeof(struct sockaddr_in6)) error = EINVAL; else error = 0; if (error != 0) goto release; dstsock = (struct sockaddr_in6 *)nam; if (dstsock->sin6_family != AF_INET6) { error = EAFNOSUPPORT; goto release; } } INP_WLOCK(inp); if (control != NULL) { NET_EPOCH_ENTER(et); error = ip6_setpktopts(control, &opt, inp->in6p_outputopts, so->so_cred, inp->inp_ip_p); NET_EPOCH_EXIT(et); if (error != 0) { goto bad; } optp = &opt; } else optp = inp->in6p_outputopts; /* * Check and convert scope zone ID into internal form. * * XXX: we may still need to determine the zone later. */ if (!(so->so_state & SS_ISCONNECTED)) { if (!optp || !optp->ip6po_pktinfo || !optp->ip6po_pktinfo->ipi6_ifindex) use_defzone = V_ip6_use_defzone; if (dstsock->sin6_scope_id == 0 && !use_defzone) scope_ambiguous = 1; if ((error = sa6_embedscope(dstsock, use_defzone)) != 0) goto bad; } /* * For an ICMPv6 packet, we should know its type and code to update * statistics. */ if (inp->inp_ip_p == IPPROTO_ICMPV6) { struct icmp6_hdr *icmp6; if (m->m_len < sizeof(struct icmp6_hdr) && (m = m_pullup(m, sizeof(struct icmp6_hdr))) == NULL) { error = ENOBUFS; goto bad; } icmp6 = mtod(m, struct icmp6_hdr *); type = icmp6->icmp6_type; code = icmp6->icmp6_code; } M_PREPEND(m, sizeof(*ip6), M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto bad; } ip6 = mtod(m, struct ip6_hdr *); #ifdef ROUTE_MPATH if (CALC_FLOWID_OUTBOUND) { uint32_t hash_type, hash_val; hash_val = fib6_calc_software_hash(&inp->in6p_laddr, &dstsock->sin6_addr, 0, 0, inp->inp_ip_p, &hash_type); inp->inp_flowid = hash_val; inp->inp_flowtype = hash_type; } #endif /* * Source address selection. */ NET_EPOCH_ENTER(et); error = in6_selectsrc_socket(dstsock, optp, inp, so->so_cred, scope_ambiguous, &in6a, &hlim); NET_EPOCH_EXIT(et); if (error) goto bad; error = prison_check_ip6(inp->inp_cred, &in6a); if (error != 0) goto bad; ip6->ip6_src = in6a; ip6->ip6_dst = dstsock->sin6_addr; /* * Fill in the rest of the IPv6 header fields. */ ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | (inp->inp_flow & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); /* * ip6_plen will be filled in ip6_output, so not fill it here. */ ip6->ip6_nxt = inp->inp_ip_p; ip6->ip6_hlim = hlim; if (inp->inp_ip_p == IPPROTO_ICMPV6 || inp->in6p_cksum != -1) { struct mbuf *n; int off; u_int16_t *p; /* Compute checksum. 
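 */

/*
 * Userland sketch (illustrative, not kernel code): for non-ICMPv6 raw
 * sockets, the RFC 3542 IPV6_CHECKSUM option selects the offset the
 * kernel uses below; -1 (the in6p_cksum default) disables it.
 */
#if 0
	int off = 2;	/* checksum offset within the protocol header */

	(void)setsockopt(s, IPPROTO_IPV6, IPV6_CHECKSUM, &off, sizeof(off));
#endif

/*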
*/ if (inp->inp_ip_p == IPPROTO_ICMPV6) off = offsetof(struct icmp6_hdr, icmp6_cksum); else off = inp->in6p_cksum; if (plen < off + 2) { error = EINVAL; goto bad; } off += sizeof(struct ip6_hdr); n = m; while (n && n->m_len <= off) { off -= n->m_len; n = n->m_next; } if (!n) goto bad; p = (u_int16_t *)(mtod(n, caddr_t) + off); *p = 0; *p = in6_cksum(m, ip6->ip6_nxt, sizeof(*ip6), plen); } /* * Send RA/RS messages to user land for protection, before sending * them to rtadvd/rtsol. */ if ((send_sendso_input_hook != NULL) && inp->inp_ip_p == IPPROTO_ICMPV6) { switch (type) { case ND_ROUTER_ADVERT: case ND_ROUTER_SOLICIT: mtag = m_tag_get(PACKET_TAG_ND_OUTGOING, sizeof(unsigned short), M_NOWAIT); if (mtag == NULL) goto bad; m_tag_prepend(m, mtag); } } NET_EPOCH_ENTER(et); error = ip6_output(m, optp, NULL, 0, inp->in6p_moptions, &oifp, inp); NET_EPOCH_EXIT(et); if (inp->inp_ip_p == IPPROTO_ICMPV6) { if (oifp) icmp6_ifoutstat_inc(oifp, type, code); - ICMP6STAT_INC(icp6s_outhist[type]); + ICMP6STAT_INC2(icp6s_outhist, type); } else RIP6STAT_INC(rip6s_opackets); goto freectl; bad: if (m) m_freem(m); freectl: if (control != NULL) { ip6_clearpktopts(&opt, -1); m_freem(control); } INP_WUNLOCK(inp); return (error); release: if (control != NULL) m_freem(control); m_freem(m); return (error); } /* * Raw IPv6 socket option processing. */ int rip6_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp = sotoinpcb(so); int error; if (sopt->sopt_level == IPPROTO_ICMPV6) /* * XXX: is it better to call icmp6_ctloutput() directly * from protosw? */ return (icmp6_ctloutput(so, sopt)); else if (sopt->sopt_level != IPPROTO_IPV6) { if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_SETFIB) { INP_WLOCK(inp); inp->inp_inc.inc_fibnum = so->so_fibnum; INP_WUNLOCK(inp); return (0); } return (EINVAL); } error = 0; switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case MRT6_INIT: case MRT6_DONE: case MRT6_ADD_MIF: case MRT6_DEL_MIF: case MRT6_ADD_MFC: case MRT6_DEL_MFC: case MRT6_PIM: if (inp->inp_ip_p != IPPROTO_ICMPV6) return (EOPNOTSUPP); error = ip6_mrouter_get ? ip6_mrouter_get(so, sopt) : EOPNOTSUPP; break; case IPV6_CHECKSUM: error = ip6_raw_ctloutput(so, sopt); break; default: error = ip6_ctloutput(so, sopt); break; } break; case SOPT_SET: switch (sopt->sopt_name) { case MRT6_INIT: case MRT6_DONE: case MRT6_ADD_MIF: case MRT6_DEL_MIF: case MRT6_ADD_MFC: case MRT6_DEL_MFC: case MRT6_PIM: if (inp->inp_ip_p != IPPROTO_ICMPV6) return (EOPNOTSUPP); error = ip6_mrouter_set ? 
ip6_mrouter_set(so, sopt) : EOPNOTSUPP; break; case IPV6_CHECKSUM: error = ip6_raw_ctloutput(so, sopt); break; default: error = ip6_ctloutput(so, sopt); break; } break; } return (error); } static int rip6_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; struct icmp6_filter *filter; int error; inp = sotoinpcb(so); KASSERT(inp == NULL, ("rip6_attach: inp != NULL")); error = priv_check(td, PRIV_NETINET_RAW); if (error) return (error); if (proto >= IPPROTO_MAX || proto < 0) return (EPROTONOSUPPORT); error = soreserve(so, rip_sendspace, rip_recvspace); if (error) return (error); filter = malloc(sizeof(struct icmp6_filter), M_PCB, M_NOWAIT); if (filter == NULL) return (ENOMEM); error = in_pcballoc(so, &V_ripcbinfo); if (error) { free(filter, M_PCB); return (error); } inp = (struct inpcb *)so->so_pcb; inp->inp_ip_p = proto; inp->in6p_cksum = -1; inp->in6p_icmp6filt = filter; ICMP6_FILTER_SETPASSALL(inp->in6p_icmp6filt); INP_WUNLOCK(inp); return (0); } static void rip6_detach(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_detach: inp == NULL")); if (so == V_ip6_mrouter && ip6_mrouter_done) ip6_mrouter_done(); /* xxx: RSVP */ INP_WLOCK(inp); free(inp->in6p_icmp6filt, M_PCB); in_pcbfree(inp); } /* XXXRW: This can't ever be called. */ static void rip6_abort(struct socket *so) { struct inpcb *inp __diagused; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_abort: inp == NULL")); soisdisconnected(so); } static void rip6_close(struct socket *so) { struct inpcb *inp __diagused; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_close: inp == NULL")); soisdisconnected(so); } static int rip6_disconnect(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_disconnect: inp == NULL")); if ((so->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); inp->in6p_faddr = in6addr_any; rip6_abort(so); return (0); } static int rip6_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct epoch_tracker et; struct inpcb *inp; struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam; struct ifaddr *ifa = NULL; int error = 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_bind: inp == NULL")); if (nam->sa_family != AF_INET6) return (EAFNOSUPPORT); if (nam->sa_len != sizeof(*addr)) return (EINVAL); if ((error = prison_check_ip6(td->td_ucred, &addr->sin6_addr)) != 0) return (error); if (CK_STAILQ_EMPTY(&V_ifnet) || addr->sin6_family != AF_INET6) return (EADDRNOTAVAIL); if ((error = sa6_embedscope(addr, V_ip6_use_defzone)) != 0) return (error); NET_EPOCH_ENTER(et); if (!IN6_IS_ADDR_UNSPECIFIED(&addr->sin6_addr) && (ifa = ifa_ifwithaddr((struct sockaddr *)addr)) == NULL) { NET_EPOCH_EXIT(et); return (EADDRNOTAVAIL); } if (ifa != NULL && ((struct in6_ifaddr *)ifa)->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY| IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) { NET_EPOCH_EXIT(et); return (EADDRNOTAVAIL); } NET_EPOCH_EXIT(et); INP_WLOCK(inp); INP_INFO_WLOCK(&V_ripcbinfo); inp->in6p_laddr = addr->sin6_addr; INP_INFO_WUNLOCK(&V_ripcbinfo); INP_WUNLOCK(inp); return (0); } static int rip6_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam; struct in6_addr in6a; struct epoch_tracker et; int error = 0, scope_ambiguous = 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip6_connect: inp == NULL")); if (nam->sa_len != sizeof(*addr)) return (EINVAL); if (CK_STAILQ_EMPTY(&V_ifnet)) return (EADDRNOTAVAIL); if (addr->sin6_family != AF_INET6) 
return (EAFNOSUPPORT); /* * Application should provide a proper zone ID or the use of default * zone IDs should be enabled. Unfortunately, some applications do * not behave as it should, so we need a workaround. Even if an * appropriate ID is not determined, we'll see if we can determine * the outgoing interface. If we can, determine the zone ID based on * the interface below. */ if (addr->sin6_scope_id == 0 && !V_ip6_use_defzone) scope_ambiguous = 1; if ((error = sa6_embedscope(addr, V_ip6_use_defzone)) != 0) return (error); INP_WLOCK(inp); INP_INFO_WLOCK(&V_ripcbinfo); /* Source address selection. XXX: need pcblookup? */ NET_EPOCH_ENTER(et); error = in6_selectsrc_socket(addr, inp->in6p_outputopts, inp, so->so_cred, scope_ambiguous, &in6a, NULL); NET_EPOCH_EXIT(et); if (error) { INP_INFO_WUNLOCK(&V_ripcbinfo); INP_WUNLOCK(inp); return (error); } inp->in6p_faddr = addr->sin6_addr; inp->in6p_laddr = in6a; soisconnected(so); INP_INFO_WUNLOCK(&V_ripcbinfo); INP_WUNLOCK(inp); return (0); } static int rip6_shutdown(struct socket *so, enum shutdown_how how) { SOCK_LOCK(so); if (!(so->so_state & SS_ISCONNECTED)) { SOCK_UNLOCK(so); return (ENOTCONN); } SOCK_UNLOCK(so); switch (how) { case SHUT_RD: sorflush(so); break; case SHUT_RDWR: sorflush(so); /* FALLTHROUGH */ case SHUT_WR: socantsendmore(so); } return (0); } struct protosw rip6_protosw = { .pr_type = SOCK_RAW, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_ctloutput = rip6_ctloutput, .pr_abort = rip6_abort, .pr_attach = rip6_attach, .pr_bind = rip6_bind, .pr_connect = rip6_connect, .pr_control = in6_control, .pr_detach = rip6_detach, .pr_disconnect = rip6_disconnect, .pr_peeraddr = in6_getpeeraddr, .pr_send = rip6_send, .pr_shutdown = rip6_shutdown, .pr_sockaddr = in6_getsockaddr, .pr_close = rip6_close }; diff --git a/sys/sys/sdt.h b/sys/sys/sdt.h index ba3dcfa15762..cbe5bd99c242 100644 --- a/sys/sys/sdt.h +++ b/sys/sys/sdt.h @@ -1,446 +1,457 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright 2006-2008 John Birrell * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Statically Defined Tracing (SDT) definitions. 
* */ #ifndef _SYS_SDT_H #define _SYS_SDT_H #ifndef _KERNEL #define _DTRACE_VERSION 1 #define DTRACE_PROBE(prov, name) { \ extern void __dtrace_##prov##___##name(void); \ __dtrace_##prov##___##name(); \ } #define DTRACE_PROBE1(prov, name, arg1) { \ extern void __dtrace_##prov##___##name(unsigned long); \ __dtrace_##prov##___##name((unsigned long)arg1); \ } #define DTRACE_PROBE2(prov, name, arg1, arg2) { \ extern void __dtrace_##prov##___##name(unsigned long, \ unsigned long); \ __dtrace_##prov##___##name((unsigned long)arg1, \ (unsigned long)arg2); \ } #define DTRACE_PROBE3(prov, name, arg1, arg2, arg3) { \ extern void __dtrace_##prov##___##name(unsigned long, \ unsigned long, unsigned long); \ __dtrace_##prov##___##name((unsigned long)arg1, \ (unsigned long)arg2, (unsigned long)arg3); \ } #define DTRACE_PROBE4(prov, name, arg1, arg2, arg3, arg4) { \ extern void __dtrace_##prov##___##name(unsigned long, \ unsigned long, unsigned long, unsigned long); \ __dtrace_##prov##___##name((unsigned long)arg1, \ (unsigned long)arg2, (unsigned long)arg3, \ (unsigned long)arg4); \ } #define DTRACE_PROBE5(prov, name, arg1, arg2, arg3, arg4, arg5) { \ extern void __dtrace_##prov##___##name(unsigned long, \ unsigned long, unsigned long, unsigned long, unsigned long);\ __dtrace_##prov##___##name((unsigned long)arg1, \ (unsigned long)arg2, (unsigned long)arg3, \ (unsigned long)arg4, (unsigned long)arg5); \ } #else /* _KERNEL */ #include #include extern volatile bool sdt_probes_enabled; #ifndef KDTRACE_HOOKS #define __sdt_used __unused #define SDT_PROVIDER_DEFINE(prov) #define SDT_PROVIDER_DECLARE(prov) #define SDT_PROBE_DEFINE(prov, mod, func, name) #define SDT_PROBE_DECLARE(prov, mod, func, name) #define SDT_PROBES_ENABLED() 0 #define SDT_PROBE(prov, mod, func, name, arg0, arg1, arg2, arg3, arg4) #define SDT_PROBE_ARGTYPE(prov, mod, func, name, num, type, xtype) #define SDT_PROBE_DEFINE0(prov, mod, func, name) #define SDT_PROBE_DEFINE1(prov, mod, func, name, arg0) #define SDT_PROBE_DEFINE2(prov, mod, func, name, arg0, arg1) #define SDT_PROBE_DEFINE3(prov, mod, func, name, arg0, arg1, arg2) #define SDT_PROBE_DEFINE4(prov, mod, func, name, arg0, arg1, arg2, arg3) #define SDT_PROBE_DEFINE5(prov, mod, func, name, arg0, arg1, arg2, arg3, arg4) #define SDT_PROBE_DEFINE6(prov, mod, func, name, arg0, arg1, arg2, \ arg3, arg4, arg5) #define SDT_PROBE_DEFINE7(prov, mod, func, name, arg0, arg1, arg2, \ arg3, arg4, arg5, arg6) #define SDT_PROBE0(prov, mod, func, name) #define SDT_PROBE1(prov, mod, func, name, arg0) #define SDT_PROBE2(prov, mod, func, name, arg0, arg1) #define SDT_PROBE3(prov, mod, func, name, arg0, arg1, arg2) #define SDT_PROBE4(prov, mod, func, name, arg0, arg1, arg2, arg3) #define SDT_PROBE5(prov, mod, func, name, arg0, arg1, arg2, arg3, arg4) #define SDT_PROBE6(prov, mod, func, name, arg0, arg1, arg2, arg3, arg4, arg5) #define SDT_PROBE7(prov, mod, func, name, arg0, arg1, arg2, arg3, arg4, arg5, \ arg6) +#define MIB_SDT_PROBE1(...) +#define MIB_SDT_PROBE2(...) 
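
/*
 * Illustrative note: with KDTRACE_HOOKS unset, the stubs above,
 * including the new MIB_SDT_PROBE1/2, expand to nothing, so call sites
 * need no conditional compilation.  A hypothetical call site:
 */
#if 0
static void
count_input(struct mystat *st, int type)
{
	MIB_SDT_PROBE2(myproto, count, hist, 1, type);	/* no-op here */
	st->hist[type]++;
}
#endif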
+
 #define	SDT_PROBE_DEFINE0_XLATE(prov, mod, func, name)
 #define	SDT_PROBE_DEFINE1_XLATE(prov, mod, func, name, arg0, xarg0)
 #define	SDT_PROBE_DEFINE2_XLATE(prov, mod, func, name, arg0, xarg0, \
     arg1, xarg1)
 #define	SDT_PROBE_DEFINE3_XLATE(prov, mod, func, name, arg0, xarg0, \
     arg1, xarg1, arg2, xarg2)
 #define	SDT_PROBE_DEFINE4_XLATE(prov, mod, func, name, arg0, xarg0, \
     arg1, xarg1, arg2, xarg2, arg3, xarg3)
 #define	SDT_PROBE_DEFINE5_XLATE(prov, mod, func, name, arg0, xarg0, \
     arg1, xarg1, arg2, xarg2, arg3, xarg3, arg4, xarg4)
 #define	SDT_PROBE_DEFINE6_XLATE(prov, mod, func, name, arg0, xarg0, \
     arg1, xarg1, arg2, xarg2, arg3, xarg3, arg4, xarg4, arg5, xarg5)
 #define	SDT_PROBE_DEFINE7_XLATE(prov, mod, func, name, arg0, xarg0, \
     arg1, xarg1, arg2, xarg2, arg3, xarg3, arg4, xarg4, arg5, xarg5, arg6, \
     xarg6)
 
 #define	DTRACE_PROBE(name)
 #define	DTRACE_PROBE1(name, type0, arg0)
 #define	DTRACE_PROBE2(name, type0, arg0, type1, arg1)
 #define	DTRACE_PROBE3(name, type0, arg0, type1, arg1, type2, arg2)
 #define	DTRACE_PROBE4(name, type0, arg0, type1, arg1, type2, arg2, type3, arg3)
 #define	DTRACE_PROBE5(name, type0, arg0, type1, arg1, type2, arg2, type3, arg3,\
     type4, arg4)
 
 #else
 
 #define	__sdt_used
 
 SET_DECLARE(sdt_providers_set, struct sdt_provider);
 SET_DECLARE(sdt_probes_set, struct sdt_probe);
 SET_DECLARE(sdt_argtypes_set, struct sdt_argtype);
 
 #define	SDT_PROVIDER_DEFINE(_prov) \
 	struct sdt_provider sdt_provider_##_prov[1] = { \
 		[0] = { .name = #_prov }, \
 	}; \
 	DATA_SET(sdt_providers_set, sdt_provider_##_prov);
 
 #define	SDT_PROVIDER_DECLARE(prov) \
 	extern struct sdt_provider sdt_provider_##prov[1]
 
 #define	SDT_PROBE_DEFINE(_prov, _mod, _func, _name) \
 	struct sdt_probe sdt_##_prov##_##_mod##_##_func##_##_name[1] = {\
 		[0] = { \
 			.version = sizeof(struct sdt_probe), \
 			.prov = sdt_provider_##_prov, \
 			.mod = #_mod, \
 			.func = #_func, \
 			.name = #_name, \
 		}, \
 	}; \
 	DATA_SET(sdt_probes_set, sdt_##_prov##_##_mod##_##_func##_##_name)
 
 #define	SDT_PROBE_DECLARE(prov, mod, func, name) \
 	extern struct sdt_probe sdt_##prov##_##mod##_##func##_##name[1]
 
 #define	SDT_PROBES_ENABLED()	__predict_false(sdt_probes_enabled)
 
 #define	SDT_PROBE(prov, mod, func, name, arg0, arg1, arg2, arg3, arg4)	do { \
 	if (SDT_PROBES_ENABLED()) { \
 		if (__predict_false(sdt_##prov##_##mod##_##func##_##name->id)) \
 			(*sdt_probe_func)(sdt_##prov##_##mod##_##func##_##name->id, \
 			    (uintptr_t) arg0, (uintptr_t) arg1, (uintptr_t) arg2, \
 			    (uintptr_t) arg3, (uintptr_t) arg4); \
 	} \
 } while (0)
 
 #define	SDT_PROBE_ARGTYPE(_prov, _mod, _func, _name, _num, _type, _xtype) \
 	static struct sdt_argtype \
 	    sdta_##_prov##_##_mod##_##_func##_##_name##_num[1] = { \
 		[0] = { \
 			.ndx = _num, \
 			.type = _type, \
 			.xtype = _xtype, \
 			.probe = sdt_##_prov##_##_mod##_##_func##_##_name, \
 		}, \
 	}; \
 	DATA_SET(sdt_argtypes_set, \
 	    sdta_##_prov##_##_mod##_##_func##_##_name##_num);
 
 #define	SDT_PROBE_DEFINE0(prov, mod, func, name) \
 	SDT_PROBE_DEFINE(prov, mod, func, name)
 
 #define	SDT_PROBE_DEFINE1(prov, mod, func, name, arg0) \
 	SDT_PROBE_DEFINE(prov, mod, func, name); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 0, arg0, NULL)
 
 #define	SDT_PROBE_DEFINE2(prov, mod, func, name, arg0, arg1) \
 	SDT_PROBE_DEFINE(prov, mod, func, name); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 0, arg0, NULL); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 1, arg1, NULL)
 
 #define	SDT_PROBE_DEFINE3(prov, mod, func, name, arg0, arg1, arg2)\
 	SDT_PROBE_DEFINE(prov, mod, func, name); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 0, arg0, NULL); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 1, arg1, NULL); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 2, arg2, NULL)
 
 #define	SDT_PROBE_DEFINE4(prov, mod, func, name, arg0, arg1, arg2, arg3) \
 	SDT_PROBE_DEFINE(prov, mod, func, name); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 0, arg0, NULL); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 1, arg1, NULL); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 2, arg2, NULL); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 3, arg3, NULL)
 
 #define	SDT_PROBE_DEFINE5(prov, mod, func, name, arg0, arg1, arg2, arg3, arg4) \
 	SDT_PROBE_DEFINE(prov, mod, func, name); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 0, arg0, NULL); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 1, arg1, NULL); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 2, arg2, NULL); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 3, arg3, NULL); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 4, arg4, NULL)
 
 #define	SDT_PROBE_DEFINE6(prov, mod, func, name, arg0, arg1, arg2, arg3,\
     arg4, arg5) \
 	SDT_PROBE_DEFINE(prov, mod, func, name); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 0, arg0, NULL); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 1, arg1, NULL); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 2, arg2, NULL); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 3, arg3, NULL); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 4, arg4, NULL); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 5, arg5, NULL)
 
 #define	SDT_PROBE_DEFINE7(prov, mod, func, name, arg0, arg1, arg2, arg3,\
     arg4, arg5, arg6) \
 	SDT_PROBE_DEFINE(prov, mod, func, name); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 0, arg0, NULL); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 1, arg1, NULL); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 2, arg2, NULL); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 3, arg3, NULL); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 4, arg4, NULL); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 5, arg5, NULL); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 6, arg6, NULL)
 
 #define	SDT_PROBE_DEFINE0_XLATE(prov, mod, func, name) \
 	SDT_PROBE_DEFINE(prov, mod, func, name)
 
 #define	SDT_PROBE_DEFINE1_XLATE(prov, mod, func, name, arg0, xarg0) \
 	SDT_PROBE_DEFINE(prov, mod, func, name); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 0, arg0, xarg0)
 
 #define	SDT_PROBE_DEFINE2_XLATE(prov, mod, func, name, arg0, xarg0, \
     arg1, xarg1) \
 	SDT_PROBE_DEFINE(prov, mod, func, name); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 0, arg0, xarg0); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 1, arg1, xarg1)
 
 #define	SDT_PROBE_DEFINE3_XLATE(prov, mod, func, name, arg0, xarg0, \
     arg1, xarg1, arg2, xarg2) \
 	SDT_PROBE_DEFINE(prov, mod, func, name); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 0, arg0, xarg0); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 1, arg1, xarg1); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 2, arg2, xarg2)
 
 #define	SDT_PROBE_DEFINE4_XLATE(prov, mod, func, name, arg0, xarg0, \
     arg1, xarg1, arg2, xarg2, arg3, xarg3) \
 	SDT_PROBE_DEFINE(prov, mod, func, name); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 0, arg0, xarg0); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 1, arg1, xarg1); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 2, arg2, xarg2); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 3, arg3, xarg3)
 
 #define	SDT_PROBE_DEFINE5_XLATE(prov, mod, func, name, arg0, xarg0, \
     arg1, xarg1, arg2, xarg2, arg3, xarg3, arg4, xarg4) \
 	SDT_PROBE_DEFINE(prov, mod, func, name); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 0, arg0, xarg0); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 1, arg1, xarg1); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 2, arg2, xarg2); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 3, arg3, xarg3); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 4, arg4, xarg4)
 
 #define	SDT_PROBE_DEFINE6_XLATE(prov, mod, func, name, arg0, xarg0, \
     arg1, xarg1, arg2, xarg2, arg3, xarg3, arg4, xarg4, arg5, xarg5) \
 	SDT_PROBE_DEFINE(prov, mod, func, name); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 0, arg0, xarg0); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 1, arg1, xarg1); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 2, arg2, xarg2); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 3, arg3, xarg3); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 4, arg4, xarg4); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 5, arg5, xarg5)
 
 #define	SDT_PROBE_DEFINE7_XLATE(prov, mod, func, name, arg0, xarg0, \
     arg1, xarg1, arg2, xarg2, arg3, xarg3, arg4, xarg4, arg5, xarg5, arg6, \
     xarg6) \
 	SDT_PROBE_DEFINE(prov, mod, func, name); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 0, arg0, xarg0); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 1, arg1, xarg1); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 2, arg2, xarg2); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 3, arg3, xarg3); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 4, arg4, xarg4); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 5, arg5, xarg5); \
 	SDT_PROBE_ARGTYPE(prov, mod, func, name, 6, arg6, xarg6)
 
 #define	SDT_PROBE0(prov, mod, func, name) \
 	SDT_PROBE(prov, mod, func, name, 0, 0, 0, 0, 0)
 #define	SDT_PROBE1(prov, mod, func, name, arg0) \
 	SDT_PROBE(prov, mod, func, name, arg0, 0, 0, 0, 0)
 #define	SDT_PROBE2(prov, mod, func, name, arg0, arg1) \
 	SDT_PROBE(prov, mod, func, name, arg0, arg1, 0, 0, 0)
 #define	SDT_PROBE3(prov, mod, func, name, arg0, arg1, arg2) \
 	SDT_PROBE(prov, mod, func, name, arg0, arg1, arg2, 0, 0)
 #define	SDT_PROBE4(prov, mod, func, name, arg0, arg1, arg2, arg3) \
 	SDT_PROBE(prov, mod, func, name, arg0, arg1, arg2, arg3, 0)
 #define	SDT_PROBE5(prov, mod, func, name, arg0, arg1, arg2, arg3, arg4) \
 	SDT_PROBE(prov, mod, func, name, arg0, arg1, arg2, arg3, arg4)
 #define	SDT_PROBE6(prov, mod, func, name, arg0, arg1, arg2, arg3, arg4, arg5) \
 	do { \
 		if (sdt_##prov##_##mod##_##func##_##name->id) \
 			(*(void (*)(uint32_t, uintptr_t, uintptr_t, uintptr_t, \
 			    uintptr_t, uintptr_t, uintptr_t))sdt_probe_func)( \
 			    sdt_##prov##_##mod##_##func##_##name->id, \
 			    (uintptr_t)arg0, (uintptr_t)arg1, (uintptr_t)arg2, \
 			    (uintptr_t)arg3, (uintptr_t)arg4, (uintptr_t)arg5);\
 	} while (0)
 #define	SDT_PROBE7(prov, mod, func, name, arg0, arg1, arg2, arg3, arg4, arg5, \
     arg6) \
 	do { \
 		if (sdt_##prov##_##mod##_##func##_##name->id) \
 			(*(void (*)(uint32_t, uintptr_t, uintptr_t, uintptr_t, \
 			    uintptr_t, uintptr_t, uintptr_t, uintptr_t)) \
 			    sdt_probe_func)( \
 			    sdt_##prov##_##mod##_##func##_##name->id, \
 			    (uintptr_t)arg0, (uintptr_t)arg1, (uintptr_t)arg2, \
 			    (uintptr_t)arg3, (uintptr_t)arg4, (uintptr_t)arg5, \
 			    (uintptr_t)arg6); \
 	} while (0)
 
+#ifndef KDTRACE_NO_MIB_SDT
+#define	MIB_SDT_PROBE1(...)	SDT_PROBE1(mib, __VA_ARGS__)
+#define	MIB_SDT_PROBE2(...)	SDT_PROBE2(mib, __VA_ARGS__)
+#else
+#define	MIB_SDT_PROBE1(...)
+#define	MIB_SDT_PROBE2(...)
+#endif
+
 #define	DTRACE_PROBE_IMPL_START(name, arg0, arg1, arg2, arg3, arg4)	do { \
 	static SDT_PROBE_DEFINE(sdt, , , name); \
 	SDT_PROBE(sdt, , , name, arg0, arg1, arg2, arg3, arg4);
 #define	DTRACE_PROBE_IMPL_END	} while (0)
 
 #define	DTRACE_PROBE(name) \
 	DTRACE_PROBE_IMPL_START(name, 0, 0, 0, 0, 0) \
 	DTRACE_PROBE_IMPL_END
 
 #define	DTRACE_PROBE1(name, type0, arg0) \
 	DTRACE_PROBE_IMPL_START(name, arg0, 0, 0, 0, 0) \
 	SDT_PROBE_ARGTYPE(sdt, , , name, 0, #type0, NULL); \
 	DTRACE_PROBE_IMPL_END
 
 #define	DTRACE_PROBE2(name, type0, arg0, type1, arg1) \
 	DTRACE_PROBE_IMPL_START(name, arg0, arg1, 0, 0, 0) \
 	SDT_PROBE_ARGTYPE(sdt, , , name, 0, #type0, NULL); \
 	SDT_PROBE_ARGTYPE(sdt, , , name, 1, #type1, NULL); \
 	DTRACE_PROBE_IMPL_END
 
 #define	DTRACE_PROBE3(name, type0, arg0, type1, arg1, type2, arg2) \
 	DTRACE_PROBE_IMPL_START(name, arg0, arg1, arg2, 0, 0) \
 	SDT_PROBE_ARGTYPE(sdt, , , name, 0, #type0, NULL); \
 	SDT_PROBE_ARGTYPE(sdt, , , name, 1, #type1, NULL); \
 	SDT_PROBE_ARGTYPE(sdt, , , name, 2, #type2, NULL); \
 	DTRACE_PROBE_IMPL_END
 
 #define	DTRACE_PROBE4(name, type0, arg0, type1, arg1, type2, arg2, type3, arg3) \
 	DTRACE_PROBE_IMPL_START(name, arg0, arg1, arg2, arg3, 0) \
 	SDT_PROBE_ARGTYPE(sdt, , , name, 0, #type0, NULL); \
 	SDT_PROBE_ARGTYPE(sdt, , , name, 1, #type1, NULL); \
 	SDT_PROBE_ARGTYPE(sdt, , , name, 2, #type2, NULL); \
 	SDT_PROBE_ARGTYPE(sdt, , , name, 3, #type3, NULL); \
 	DTRACE_PROBE_IMPL_END
 
 #define	DTRACE_PROBE5(name, type0, arg0, type1, arg1, type2, arg2, type3, arg3, \
     type4, arg4) \
 	DTRACE_PROBE_IMPL_START(name, arg0, arg1, arg2, arg3, arg4) \
 	SDT_PROBE_ARGTYPE(sdt, , , name, 0, #type0, NULL); \
 	SDT_PROBE_ARGTYPE(sdt, , , name, 1, #type1, NULL); \
 	SDT_PROBE_ARGTYPE(sdt, , , name, 2, #type2, NULL); \
 	SDT_PROBE_ARGTYPE(sdt, , , name, 3, #type3, NULL); \
 	SDT_PROBE_ARGTYPE(sdt, , , name, 4, #type4, NULL); \
 	DTRACE_PROBE_IMPL_END
 
 #endif /* KDTRACE_HOOKS */
 
 /*
  * This type definition must match that of dtrace_probe. It is defined this
  * way to avoid having to rely on CDDL code.
  */
 typedef void (*sdt_probe_func_t)(uint32_t, uintptr_t arg0, uintptr_t arg1,
     uintptr_t arg2, uintptr_t arg3, uintptr_t arg4);
 
 /*
  * The 'sdt' provider will set it to dtrace_probe when it loads.
  */
 extern sdt_probe_func_t	sdt_probe_func;
 
 struct sdt_probe;
 struct sdt_provider;
 struct linker_file;
 
 struct sdt_argtype {
 	int	ndx;			/* Argument index. */
 	const char *type;		/* Argument type string. */
 	const char *xtype;		/* Translated argument type. */
 	TAILQ_ENTRY(sdt_argtype) argtype_entry;	/* Argument type list entry. */
 	struct sdt_probe *probe;	/* Ptr to the probe structure. */
 };
 
 struct sdt_probe {
 	int		version;	/* Set to sizeof(struct sdt_probe). */
 	struct sdt_provider *prov;	/* Ptr to the provider structure. */
 	TAILQ_ENTRY(sdt_probe) probe_entry;	/* SDT probe list entry. */
 	TAILQ_HEAD(, sdt_argtype) argtype_list;
 	const char	*mod;
 	const char	*func;
 	const char	*name;
 	id_t		id;		/* DTrace probe ID. */
 	int		n_args;		/* Number of arguments. */
 	struct linker_file *sdtp_lf;	/* Module in which we're defined. */
 };
 
 struct sdt_provider {
 	char *name;			/* Provider name. */
 	TAILQ_ENTRY(sdt_provider) prov_entry;	/* SDT provider list entry. */
 	uintptr_t	id;		/* DTrace provider ID. */
 	int		sdt_refs;	/* Number of module references. */
 };
 
 void sdt_probe_stub(uint32_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t,
     uintptr_t);
 
 SDT_PROVIDER_DECLARE(sdt);
 
 #endif /* _KERNEL */
 
 #endif /* _SYS_SDT_H */
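For reference, a minimal usage sketch of the new MIB_SDT_PROBE* wrappers (not part of the diff; the "ip"/"count"/"in" probe names and the example_count() helper are hypothetical, and the mib provider is assumed to be defined elsewhere with SDT_PROVIDER_DEFINE(mib)):

	#include <sys/sdt.h>

	SDT_PROVIDER_DECLARE(mib);

	/* Hypothetical two-argument probe under the mib provider. */
	SDT_PROBE_DEFINE2(mib, ip, count, in, "int", "int");

	static void
	example_count(int index, int delta)
	{
		/* Expands to SDT_PROBE2(mib, ip, count, in, index, delta). */
		MIB_SDT_PROBE2(ip, count, in, index, delta);
	}

With "options KDTRACE_NO_MIB_SDT" in the kernel configuration (see the sys/conf/options hunk above), both wrappers expand to nothing, so MIB probe call sites compile away entirely.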