diff --git a/etc/mtree/BSD.include.dist b/etc/mtree/BSD.include.dist index bb5453252d86..192508bbf6f1 100644 --- a/etc/mtree/BSD.include.dist +++ b/etc/mtree/BSD.include.dist @@ -1,352 +1,356 @@ # $FreeBSD$ # # Please see the file src/etc/mtree/README before making changes to this file. # /set type=dir uname=root gname=wheel mode=0755 . arpa .. atf-c .. atf-c++ .. bsm .. bsnmp .. c++ v1 __algorithm .. __bit .. __charconv .. __chrono .. __compare .. __concepts .. __coroutine .. __filesystem .. __format .. __functional .. __iterator .. __memory .. __numeric .. __random .. __ranges .. __thread .. __utility .. __variant .. experimental .. ext .. .. .. cam ata .. mmc .. nvme .. scsi .. .. casper .. crypto .. dev acpica .. agp .. an .. ciss .. evdev .. filemon .. firewire .. hid .. hwpmc .. hyperv .. ic .. iicbus .. io .. mfi .. mlx5 .. mmc .. mpt mpilib .. .. nvme .. ofw .. pbio .. pci .. powermac_nvram .. ppbus .. pwm .. smbus .. speaker .. tcp_log .. usb .. veriexec .. vkbd .. wi .. .. devdctl .. edit readline .. .. fs cuse .. devfs .. fdescfs .. msdosfs .. nfs .. nullfs .. procfs .. smbfs .. udf .. unionfs .. .. gcc 4.2 .. .. geom cache .. concat .. eli .. gate .. journal .. label .. mirror .. mountver .. multipath .. nop .. raid .. raid3 .. shsec .. stripe .. union .. virstor .. .. gssapi .. infiniband complib .. iba .. opensm .. vendor .. .. isofs cd9660 .. .. kadm5 .. krb5 .. lib80211 .. lib9p .. libipt .. libmilter .. libxo .. lzma .. machine pc .. .. net altq .. route .. .. net80211 .. netgraph atm .. bluetooth include .. .. netflow .. .. netinet cc .. netdump .. tcp_stacks .. .. netinet6 .. + netlink + route + .. + .. netipsec .. netnatm api .. msg .. saal .. sig .. .. netpfil pf .. .. netsmb .. nfs .. nfsclient .. nfsserver .. opencsd c_api .. etmv3 .. etmv4 .. ptm .. stm .. .. openssl .. pcap .. protocols .. rdma .. rpc .. rpcsvc .. security audit .. mac_biba .. mac_bsdextended .. mac_lomac .. mac_mls .. mac_partition .. mac_veriexec .. .. sys disk .. .. teken .. ufs ffs .. ufs .. .. vm .. xlocale .. .. diff --git a/sys/modules/Makefile b/sys/modules/Makefile index 68b3dfcac776..a6aee9bbab36 100644 --- a/sys/modules/Makefile +++ b/sys/modules/Makefile @@ -1,873 +1,874 @@ # $FreeBSD$ SYSDIR?=${SRCTOP}/sys .include "${SYSDIR}/conf/kern.opts.mk" SUBDIR_PARALLEL= # Modules that include binary-only blobs of microcode should be selectable by # MK_SOURCELESS_UCODE option (see below). 
.include "${SYSDIR}/conf/config.mk" .if defined(MODULES_OVERRIDE) && !defined(ALL_MODULES) SUBDIR=${MODULES_OVERRIDE} .else SUBDIR= \ ${_3dfx} \ ${_3dfx_linux} \ ${_aac} \ ${_aacraid} \ accf_data \ accf_dns \ accf_http \ acl_nfs4 \ acl_posix1e \ ${_acpi} \ ae \ ${_aesni} \ age \ ${_agp} \ ahci \ aic7xxx \ alc \ ale \ alq \ ${_amd_ecc_inject} \ ${_amdgpio} \ ${_amdsbwd} \ ${_amdsmn} \ ${_amdtemp} \ ${_aout} \ ${_arcmsr} \ ${_allwinner} \ ${_armv8crypto} \ ${_asmc} \ ata \ ath \ ath_dfs \ ath_hal \ ath_hal_ar5210 \ ath_hal_ar5211 \ ath_hal_ar5212 \ ath_hal_ar5416 \ ath_hal_ar9300 \ ath_main \ ath_rate \ ath_pci \ ${_autofs} \ axgbe \ backlight \ ${_bce} \ ${_bcm283x_clkman} \ ${_bcm283x_pwm} \ bfe \ bge \ bhnd \ ${_bxe} \ ${_bios} \ ${_blake2} \ bnxt \ bridgestp \ bwi \ bwn \ ${_bytgpio} \ ${_chvgpio} \ cam \ ${_cardbus} \ ${_carp} \ cas \ ${_cbb} \ cc \ ${_ccp} \ cd9660 \ cd9660_iconv \ ${_ce} \ ${_cfi} \ ${_chromebook_platform} \ ${_ciss} \ ${_coretemp} \ ${_cp} \ ${_cpsw} \ ${_cpuctl} \ ${_cpufreq} \ ${_crypto} \ ${_cryptodev} \ ctl \ ${_cxgb} \ ${_cxgbe} \ dc \ dcons \ dcons_crom \ ${_dpdk_lpm4} \ ${_dpdk_lpm6} \ ${_dpms} \ dummynet \ ${_dwwdt} \ ${_e6000sw} \ ${_efirt} \ ${_em} \ ${_ena} \ ${_enetc} \ ${_et} \ evdev \ ${_exca} \ ext2fs \ fdc \ fdescfs \ ${_felix} \ ${_ffec} \ ${_fib_dxr} \ filemon \ firewire \ firmware \ flash \ ${_ftwd} \ fusefs \ ${_fxp} \ gem \ geom \ ${_glxiic} \ ${_glxsb} \ gpio \ hid \ hifn \ ${_hpt27xx} \ ${_hptiop} \ ${_hptmv} \ ${_hptnr} \ ${_hptrr} \ hwpmc \ ${_hyperv} \ i2c \ ${_iavf} \ ${_ibcore} \ ${_ichwd} \ ${_ice} \ ${_ice_ddp} \ ${_irdma} \ ${_ida} \ if_bridge \ ${_if_cgem} \ if_disc \ if_edsc \ ${_if_enc} \ if_epair \ ${_if_gif} \ ${_if_gre} \ ${_if_me} \ if_infiniband \ if_lagg \ if_ovpn \ ${_if_stf} \ if_tuntap \ if_vlan \ if_vxlan \ iflib \ ${_igc} \ imgact_binmisc \ ${_intelspi} \ ${_io} \ ${_ioat} \ ${_ipoib} \ ipdivert \ ${_ipfilter} \ ${_ipfw} \ ipfw_nat \ ${_ipfw_nat64} \ ${_ipfw_nptv6} \ ${_ipfw_pmod} \ ${_ipmi} \ ip6_mroute_mod \ ip_mroute_mod \ ${_ips} \ ${_ipsec} \ ${_ipw} \ ${_ipwfw} \ ${_isci} \ ${_iser} \ isp \ ${_ispfw} \ ${_itwd} \ ${_iwi} \ ${_iwifw} \ ${_iwlwifi} \ ${_iwlwififw} \ ${_iwm} \ ${_iwmfw} \ ${_iwn} \ ${_iwnfw} \ ${_ix} \ ${_ixv} \ ${_ixl} \ jme \ kbdmux \ kgssapi \ kgssapi_krb5 \ khelp \ krpc \ ksyms \ le \ lge \ libalias \ libiconv \ libmchain \ lindebugfs \ linuxkpi \ linuxkpi_wlan \ ${_lio} \ lpt \ ${_mac_biba} \ ${_mac_bsdextended} \ ${_mac_ddb} \ ${_mac_ifoff} \ ${_mac_lomac} \ ${_mac_mls} \ ${_mac_none} \ ${_mac_ntpd} \ ${_mac_partition} \ ${_mac_pimd} \ ${_mac_portacl} \ ${_mac_priority} \ ${_mac_seeotheruids} \ ${_mac_stub} \ ${_mac_test} \ ${_malo} \ ${_mana} \ md \ mdio \ ${_mgb} \ mem \ mfi \ mii \ mlx \ mlxfw \ ${_mlx4} \ ${_mlx4ib} \ ${_mlx4en} \ ${_mlx5} \ ${_mlx5en} \ ${_mlx5ib} \ mmc \ mmcsd \ ${_mpr} \ ${_mps} \ mpt \ mqueue \ mrsas \ msdosfs \ msdosfs_iconv \ msk \ ${_mthca} \ mvs \ mwl \ ${_mwlfw} \ mxge \ my \ ${_nctgpio} \ ${_neta} \ + netlink \ ${_netgraph} \ ${_nfe} \ nfscl \ nfscommon \ nfsd \ nfslockd \ nfssvc \ nge \ nmdm \ nullfs \ ${_ntb} \ ${_nvd} \ ${_nvdimm} \ ${_nvme} \ ${_nvram} \ oce \ ${_ocs_fc} \ ${_ossl} \ otus \ ${_otusfw} \ ow \ ${_padlock} \ ${_padlock_rng} \ ${_pchtherm} \ ${_pcfclock} \ ${_pf} \ ${_pflog} \ ${_pfsync} \ plip \ ${_pms} \ ppbus \ ppc \ ppi \ pps \ procfs \ proto \ pseudofs \ ${_pst} \ pty \ puc \ pwm \ ${_qat} \ ${_qatfw} \ ${_qat_c2xxx} \ ${_qat_c2xxxfw} \ ${_qlxge} \ ${_qlxgb} \ ${_qlxgbe} \ ${_qlnx} \ ral \ ${_ralfw} \ ${_random_fortuna} \ ${_random_other} \ rc4 
\ ${_rdma} \ ${_rdrand_rng} \ re \ rl \ ${_rockchip} \ rtsx \ ${_rtw88} \ ${_rtw88fw} \ rtwn \ rtwn_pci \ rtwn_usb \ ${_rtwnfw} \ ${_s3} \ ${_safe} \ safexcel \ ${_sbni} \ scc \ ${_sctp} \ sdhci \ ${_sdhci_acpi} \ ${_sdhci_fdt} \ sdhci_pci \ sdio \ sem \ send \ ${_sfxge} \ sge \ ${_sgx} \ ${_sgx_linux} \ siftr \ siis \ sis \ sk \ ${_smartpqi} \ smbfs \ snp \ sound \ ${_speaker} \ spi \ ${_splash} \ ste \ stge \ ${_sume} \ ${_superio} \ ${_p2sb} \ sym \ ${_syscons} \ sysvipc \ tcp \ ${_ti} \ tmpfs \ ${_toecore} \ ${_tpm} \ twe \ tws \ uart \ udf \ udf_iconv \ ufs \ uinput \ unionfs \ usb \ ${_vesa} \ virtio \ vge \ ${_viawd} \ videomode \ vkbd \ ${_vmd} \ ${_vmm} \ ${_vmware} \ vr \ vte \ ${_wbwd} \ wlan \ wlan_acl \ wlan_amrr \ wlan_ccmp \ wlan_rssadapt \ wlan_tkip \ wlan_wep \ wlan_xauth \ ${_wpi} \ ${_wpifw} \ ${_x86bios} \ xdr \ xl \ xz \ zlib .if ${MK_AUTOFS} != "no" || defined(ALL_MODULES) _autofs= autofs .endif .if ${MK_DTRACE} != "no" || defined(ALL_MODULES) .if ${MACHINE_CPUARCH} != "arm" || ${MACHINE_ARCH:Marmv[67]*} != "" .if ${KERN_OPTS:MKDTRACE_HOOKS} SUBDIR+= dtrace .endif .endif SUBDIR+= opensolaris .endif .if ${MK_CRYPT} != "no" || defined(ALL_MODULES) .if exists(${SRCTOP}/sys/opencrypto) _crypto= crypto _cryptodev= cryptodev _random_fortuna=random_fortuna _random_other= random_other .endif .endif .if ${MK_CUSE} != "no" || defined(ALL_MODULES) SUBDIR+= cuse .endif .if ${MK_EFI} != "no" .if ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "amd64" _efirt= efirt .endif .endif .if (${MK_INET_SUPPORT} != "no" || ${MK_INET6_SUPPORT} != "no") || \ defined(ALL_MODULES) _carp= carp _toecore= toecore _if_enc= if_enc _if_gif= if_gif _if_gre= if_gre _ipfw_pmod= ipfw_pmod .if ${KERN_OPTS:MIPSEC_SUPPORT} && !${KERN_OPTS:MIPSEC} _ipsec= ipsec .endif .if ${KERN_OPTS:MSCTP_SUPPORT} || ${KERN_OPTS:MSCTP} _sctp= sctp .endif .endif .if (${MK_INET_SUPPORT} != "no" && ${MK_INET6_SUPPORT} != "no") || \ defined(ALL_MODULES) _if_stf= if_stf .endif .if ${MK_INET_SUPPORT} != "no" || defined(ALL_MODULES) _if_me= if_me _ipfw= ipfw .if ${MK_INET6_SUPPORT} != "no" || defined(ALL_MODULES) _ipfw_nat64= ipfw_nat64 .endif .endif .if ${MK_INET6_SUPPORT} != "no" || defined(ALL_MODULES) _ipfw_nptv6= ipfw_nptv6 .endif .if ${MK_IPFILTER} != "no" || defined(ALL_MODULES) _ipfilter= ipfilter .endif .if ${MK_INET_SUPPORT} != "no" && ${KERN_OPTS:MFIB_ALGO} _dpdk_lpm4= dpdk_lpm4 _fib_dxr= fib_dxr .endif .if ${MK_INET6_SUPPORT} != "no" && ${KERN_OPTS:MFIB_ALGO} _dpdk_lpm6= dpdk_lpm6 .endif .if ${MK_ISCSI} != "no" || defined(ALL_MODULES) SUBDIR+= cfiscsi SUBDIR+= iscsi .endif .if !empty(OPT_FDT) SUBDIR+= fdt .endif # Linuxulator .if ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "amd64" || \ ${MACHINE_CPUARCH} == "i386" SUBDIR+= linprocfs SUBDIR+= linsysfs .endif .if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386" SUBDIR+= linux .endif .if ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "amd64" SUBDIR+= linux64 SUBDIR+= linux_common .endif .if ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "amd64" || \ ${MACHINE_CPUARCH} == "i386" _ena= ena .if ${MK_OFED} != "no" || defined(ALL_MODULES) _ibcore= ibcore _ipoib= ipoib _iser= iser .endif _ipmi= ipmi _iwlwifi= iwlwifi .if ${MK_SOURCELESS_UCODE} != "no" _iwlwififw= iwlwififw .endif _mlx4= mlx4 _mlx5= mlx5 .if (${MK_INET_SUPPORT} != "no" && ${MK_INET6_SUPPORT} != "no") || \ defined(ALL_MODULES) _mlx4en= mlx4en _mlx5en= mlx5en .endif .if ${MK_OFED} != "no" || defined(ALL_MODULES) _mthca= mthca _mlx4ib= mlx4ib _mlx5ib= mlx5ib 
.endif _ossl= ossl _rtw88= rtw88 .if ${MK_SOURCELESS_UCODE} != "no" _rtw88fw= rtw88fw .endif _vmware= vmware .endif # MAC framework .if ${KERN_OPTS:MMAC} || defined(ALL_MODULES) _mac_biba= mac_biba _mac_bsdextended= mac_bsdextended .if ${KERN_OPTS:MDDB} || defined(ALL_MODULES) _mac_ddb= mac_ddb .endif _mac_ifoff= mac_ifoff _mac_lomac= mac_lomac _mac_mls= mac_mls _mac_none= mac_none _mac_ntpd= mac_ntpd _mac_partition= mac_partition _mac_pimd= mac_pimd _mac_portacl= mac_portacl _mac_priority= mac_priority _mac_seeotheruids= mac_seeotheruids _mac_stub= mac_stub _mac_test= mac_test .endif .if ${MK_NETGRAPH} != "no" || defined(ALL_MODULES) _netgraph= netgraph .endif .if (${MK_PF} != "no" && (${MK_INET_SUPPORT} != "no" || \ ${MK_INET6_SUPPORT} != "no")) || defined(ALL_MODULES) _pf= pf _pflog= pflog .if ${MK_INET_SUPPORT} != "no" _pfsync= pfsync .endif .endif .if ${MK_SOURCELESS_UCODE} != "no" _bce= bce _fxp= fxp _ispfw= ispfw _ti= ti _mwlfw= mwlfw _otusfw= otusfw _ralfw= ralfw _rtwnfw= rtwnfw .endif .if ${MK_SOURCELESS_UCODE} != "no" && ${MACHINE_CPUARCH} != "arm" && \ ${MACHINE_ARCH} != "powerpc" && ${MACHINE_ARCH} != "powerpcspe" && \ ${MACHINE_CPUARCH} != "riscv" _cxgbe= cxgbe .endif .if ${MACHINE_ARCH} == "amd64" || ${MACHINE_ARCH} == "arm64" || ${MACHINE_ARCH:Mpowerpc64*} _ice= ice .if ${MK_SOURCELESS_UCODE} != "no" _ice_ddp= ice_ddp .endif .if ${MK_OFED} != "no" || defined(ALL_MODULES) .if ${MK_INET_SUPPORT} != "no" && ${MK_INET6_SUPPORT} != "no" _irdma= irdma .endif .endif .endif .if ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "arm" || \ ${MACHINE_CPUARCH} == "riscv" _if_cgem= if_cgem .endif # These rely on 64bit atomics .if ${MACHINE_ARCH} != "powerpc" && ${MACHINE_ARCH} != "powerpcspe" _mps= mps _mpr= mpr .endif .if ${MK_TESTS} != "no" || defined(ALL_MODULES) SUBDIR+= tests .endif .if ${MK_ZFS} != "no" || (defined(ALL_MODULES) && ${MACHINE_CPUARCH} != "powerpc") SUBDIR+= zfs .endif .if ${MACHINE_CPUARCH} != "aarch64" && ${MACHINE_CPUARCH} != "arm" && \ ${MACHINE_CPUARCH} != "powerpc" && ${MACHINE_CPUARCH} != "riscv" _syscons= syscons .endif .if ${MK_SOURCELESS_UCODE} != "no" _cxgb= cxgb .endif .if ${MACHINE_CPUARCH} == "aarch64" _allwinner= allwinner _armv8crypto= armv8crypto _dwwdt= dwwdt _em= em _enetc= enetc _felix= felix _rockchip= rockchip .endif .if ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "arm" _sdhci_fdt= sdhci_fdt _e6000sw= e6000sw _neta= neta .endif .if ${MACHINE_CPUARCH} == "i386" || ${MACHINE_CPUARCH} == "amd64" _agp= agp .if ${MACHINE_CPUARCH} == "i386" || !empty(COMPAT_FREEBSD32_ENABLED) _aout= aout .endif _bios= bios .if ${MK_SOURCELESS_UCODE} != "no" _bxe= bxe .endif _cardbus= cardbus _cbb= cbb _cpuctl= cpuctl _cpufreq= cpufreq _dpms= dpms _em= em _et= et _ftwd= ftwd _exca= exca _igc= igc _io= io _itwd= itwd _ix= ix _ixv= ixv .if ${MK_SOURCELESS_UCODE} != "no" _lio= lio .endif _mana= mana _mgb= mgb _nctgpio= nctgpio _ntb= ntb _ocs_fc= ocs_fc _qat= qat _qatfw= qatfw _qat_c2xxx= qat_c2xxx _qat_c2xxxfw= qat_c2xxxfw .if ${MK_OFED} != "no" || defined(ALL_MODULES) _rdma= rdma .endif _safe= safe _speaker= speaker _splash= splash _p2sb= p2sb _wbwd= wbwd _aac= aac _aacraid= aacraid _acpi= acpi .if ${MK_CRYPT} != "no" || defined(ALL_MODULES) _aesni= aesni .endif _amd_ecc_inject=amd_ecc_inject _amdsbwd= amdsbwd _amdsmn= amdsmn _amdtemp= amdtemp _arcmsr= arcmsr _asmc= asmc .if ${MK_CRYPT} != "no" || defined(ALL_MODULES) _blake2= blake2 .endif _bytgpio= bytgpio _chvgpio= chvgpio _ciss= ciss _chromebook_platform= chromebook_platform _coretemp= coretemp .if 
${MK_SOURCELESS_HOST} != "no" && empty(KCSAN_ENABLED) _hpt27xx= hpt27xx .endif _hptiop= hptiop .if ${MK_SOURCELESS_HOST} != "no" && empty(KCSAN_ENABLED) _hptmv= hptmv _hptnr= hptnr _hptrr= hptrr .endif _hyperv= hyperv _ichwd= ichwd _ida= ida _intelspi= intelspi _ips= ips _isci= isci _ipw= ipw _iwi= iwi _iwm= iwm _iwn= iwn .if ${MK_SOURCELESS_UCODE} != "no" _ipwfw= ipwfw _iwifw= iwifw _iwmfw= iwmfw _iwnfw= iwnfw .endif _nfe= nfe _nvd= nvd _nvme= nvme _nvram= nvram .if ${MK_CRYPT} != "no" || defined(ALL_MODULES) _padlock= padlock _padlock_rng= padlock_rng _rdrand_rng= rdrand_rng .endif _pchtherm = pchtherm _s3= s3 _sdhci_acpi= sdhci_acpi _superio= superio _tpm= tpm _vesa= vesa _viawd= viawd _vmd= vmd _wpi= wpi .if ${MK_SOURCELESS_UCODE} != "no" _wpifw= wpifw .endif _x86bios= x86bios .endif .if ${MACHINE_CPUARCH} == "amd64" _amdgpio= amdgpio _ccp= ccp _iavf= iavf _ioat= ioat _ixl= ixl _nvdimm= nvdimm _pms= pms _qlxge= qlxge _qlxgb= qlxgb _sume= sume .if ${MK_SOURCELESS_UCODE} != "no" _qlxgbe= qlxgbe _qlnx= qlnx .endif _sfxge= sfxge _sgx= sgx _sgx_linux= sgx_linux _smartpqi= smartpqi _p2sb= p2sb .if ${MK_BHYVE} != "no" || defined(ALL_MODULES) .if ${KERN_OPTS:MSMP} _vmm= vmm .endif .endif .endif .if ${MACHINE_CPUARCH} == "i386" # XXX some of these can move to the general case when de-i386'ed # XXX some of these can move now, but are untested on other architectures. _3dfx= 3dfx _3dfx_linux= 3dfx_linux .if ${MK_SOURCELESS_HOST} != "no" _ce= ce .endif .if ${MK_SOURCELESS_HOST} != "no" _cp= cp .endif _glxiic= glxiic _glxsb= glxsb _pcfclock= pcfclock _pst= pst _sbni= sbni .endif .if ${MACHINE_ARCH} == "armv7" _cfi= cfi _cpsw= cpsw .endif .if ${MACHINE_CPUARCH} == "powerpc" _aacraid= aacraid _agp= agp _an= an _cardbus= cardbus _cbb= cbb _cfi= cfi _cpufreq= cpufreq _exca= exca _ffec= ffec _nvd= nvd _nvme= nvme .endif .if ${MACHINE_ARCH:Mpowerpc64*} != "" _ipmi= ipmi _ixl= ixl _nvram= opal_nvram .endif .if ${MACHINE_CPUARCH} == "powerpc" && ${MACHINE_ARCH} != "powerpcspe" # Don't build powermac_nvram for powerpcspe, it's never supported. _nvram+= powermac_nvram .endif .endif .if ${MACHINE_ARCH:Marmv[67]*} != "" || ${MACHINE_CPUARCH} == "aarch64" _bcm283x_clkman= bcm283x_clkman _bcm283x_pwm= bcm283x_pwm .endif .if !(${COMPILER_TYPE} == "clang" && ${COMPILER_VERSION} < 110000) # LLVM 10 crashes when building if_malo_pci.c, fixed in LLVM11: # https://bugs.llvm.org/show_bug.cgi?id=44351 _malo= malo .endif SUBDIR+=${MODULES_EXTRA} .for reject in ${WITHOUT_MODULES} SUBDIR:= ${SUBDIR:N${reject}} .endfor # Calling kldxref(8) for each module is expensive. 
.if !defined(NO_XREF)
.MAKEFLAGS+=	-DNO_XREF
afterinstall: .PHONY
	@if type kldxref >/dev/null 2>&1; then \
		${ECHO} ${KLDXREF_CMD} ${DESTDIR}${KMODDIR}; \
		${KLDXREF_CMD} ${DESTDIR}${KMODDIR}; \
	fi
.endif

SUBDIR:= ${SUBDIR:u:O}

.include <bsd.subdir.mk>
diff --git a/sys/modules/netlink/Makefile b/sys/modules/netlink/Makefile
new file mode 100644
index 000000000000..046ecf5a2961
--- /dev/null
+++ b/sys/modules/netlink/Makefile
@@ -0,0 +1,17 @@
+.PATH: ${SRCTOP}/sys/netlink
+KMOD=	netlink
+
+SRCS =	netlink_module.c netlink_domain.c netlink_io.c \
+	netlink_message_parser.c netlink_message_writer.c netlink_generic.c \
+	netlink_route.c route/iface.c route/iface_drivers.c route/neigh.c \
+	route/nexthop.c route/route.c
+
+EXPORT_SYMS=
+EXPORT_SYMS+=	nlmsg_get_chain_writer
+EXPORT_SYMS+=	nlmsg_refill_buffer
+EXPORT_SYMS+=	nlmsg_end
+EXPORT_SYMS+=	nlmsg_flush
+
+EXPORT_SYMS=	YES
+
+.include <bsd.kmod.mk>
diff --git a/sys/net/route.c b/sys/net/route.c
index 7d46ba2588ed..9773f899f5af 100644
--- a/sys/net/route.c
+++ b/sys/net/route.c
@@ -1,696 +1,707 @@
/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1980, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)route.c	8.3.1.1 (Berkeley) 2/23/95
 * $FreeBSD$
 */
/************************************************************************
 * Note: In this file a 'fib' is a "forwarding information base"	*
 * Which is the new name for an in kernel routing (next hop) table.
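The comment above introduces the 'fib' (forwarding information base), the kernel's next-hop table. As a point of reference, here is a minimal sketch, not part of this commit, of how kernel code resolves a next hop through the route_ctl KPI this change builds on; rib_lookup(), NHR_NONE, struct nhop_object and the epoch requirement all appear in the functions below, while example_lookup_txif() and the exact header set are illustrative assumptions only.

/* Assumed headers for the sketch; the real file's include list is elided above. */
#include <sys/param.h>
#include <sys/socket.h>
#include <net/route.h>
#include <net/route/nhop.h>

/*
 * Sketch only: resolve the next hop for @dst in fib @fibnum and return
 * its transmit interface.  Callers are expected to be inside a network
 * epoch section, as the NET_EPOCH_ASSERT() users later in this file are.
 */
static struct ifnet *
example_lookup_txif(uint32_t fibnum, const struct sockaddr *dst)
{
	struct nhop_object *nh;

	nh = rib_lookup(fibnum, dst, NHR_NONE, 0);
	if (nh == NULL)
		return (NULL);		/* no route to destination */
	return (nh->nh_ifp);		/* transmit interface of the next hop */
}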
* ***********************************************************************/ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_mrouting.h" #include "opt_route.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include VNET_PCPUSTAT_DEFINE(struct rtstat, rtstat); VNET_PCPUSTAT_SYSINIT(rtstat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(rtstat); #endif EVENTHANDLER_LIST_DEFINE(rt_addrmsg); static int rt_ifdelroute(const struct rtentry *rt, const struct nhop_object *, void *arg); /* * route initialization must occur before ip6_init2(), which happenas at * SI_ORDER_MIDDLE. */ static void route_init(void) { nhops_init(); } SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, NULL); struct rib_head * rt_table_init(int offset, int family, u_int fibnum) { struct rib_head *rh; rh = malloc(sizeof(struct rib_head), M_RTABLE, M_WAITOK | M_ZERO); /* TODO: These details should be hidded inside radix.c */ /* Init masks tree */ rn_inithead_internal(&rh->head, rh->rnh_nodes, offset); rn_inithead_internal(&rh->rmhead.head, rh->rmhead.mask_nodes, 0); rh->head.rnh_masks = &rh->rmhead; /* Save metadata associated with this routing table. */ rh->rib_family = family; rh->rib_fibnum = fibnum; #ifdef VIMAGE rh->rib_vnet = curvnet; #endif tmproutes_init(rh); /* Init locks */ RIB_LOCK_INIT(rh); nhops_init_rib(rh); /* Init subscription system */ rib_init_subscriptions(rh); /* Finally, set base callbacks */ rh->rnh_addaddr = rn_addroute; rh->rnh_deladdr = rn_delete; rh->rnh_matchaddr = rn_match; rh->rnh_lookup = rn_lookup; rh->rnh_walktree = rn_walktree; rh->rnh_walktree_from = rn_walktree_from; return (rh); } static int rt_freeentry(struct radix_node *rn, void *arg) { struct radix_head * const rnh = arg; struct radix_node *x; x = (struct radix_node *)rn_delete(rn + 2, NULL, rnh); if (x != NULL) R_Free(x); return (0); } void rt_table_destroy(struct rib_head *rh) { RIB_WLOCK(rh); rh->rib_dying = true; RIB_WUNLOCK(rh); #ifdef FIB_ALGO fib_destroy_rib(rh); #endif tmproutes_destroy(rh); rn_walktree(&rh->rmhead.head, rt_freeentry, &rh->rmhead.head); nhops_destroy_rib(rh); rib_destroy_subscriptions(rh); /* Assume table is already empty */ RIB_LOCK_DESTROY(rh); free(rh, M_RTABLE); } /* * Adds a temporal redirect entry to the routing table. * @fibnum: fib number * @dst: destination to install redirect to * @gateway: gateway to go via * @author: sockaddr of originating router, can be NULL * @ifp: interface to use for the redirected route * @flags: set of flags to add. Allowed: RTF_GATEWAY * @lifetime_sec: time in seconds to expire this redirect. * * Retuns 0 on success, errno otherwise. */ int rib_add_redirect(u_int fibnum, struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *author, struct ifnet *ifp, int flags, int lifetime_sec) { struct route_nhop_data rnd = { .rnd_weight = RT_DEFAULT_WEIGHT }; struct rib_cmd_info rc; struct ifaddr *ifa; int error; NET_EPOCH_ASSERT(); if (rt_tables_get_rnh(fibnum, dst->sa_family) == NULL) return (EAFNOSUPPORT); /* Verify the allowed flag mask. */ KASSERT(((flags & ~(RTF_GATEWAY)) == 0), ("invalid redirect flags: %x", flags)); flags |= RTF_HOST | RTF_DYNAMIC; /* Get the best ifa for the given interface and gateway. 
*/ if ((ifa = ifaof_ifpforaddr(gateway, ifp)) == NULL) return (ENETUNREACH); struct nhop_object *nh = nhop_alloc(fibnum, dst->sa_family); if (nh == NULL) return (ENOMEM); nhop_set_gw(nh, gateway, flags & RTF_GATEWAY); nhop_set_transmit_ifp(nh, ifp); nhop_set_src(nh, ifa); nhop_set_pxtype_flag(nh, NHF_HOST); nhop_set_expire(nh, lifetime_sec + time_uptime); nhop_set_redirect(nh, true); nhop_set_origin(nh, NH_ORIGIN_REDIRECT); rnd.rnd_nhop = nhop_get_nhop(nh, &error); if (error == 0) { error = rib_add_route_px(fibnum, dst, -1, &rnd, RTM_F_CREATE, &rc); } if (error != 0) { /* TODO: add per-fib redirect stats. */ return (error); } RTSTAT_INC(rts_dynamic); /* Send notification of a route addition to userland. */ struct rt_addrinfo info = { .rti_info[RTAX_DST] = dst, .rti_info[RTAX_GATEWAY] = gateway, .rti_info[RTAX_AUTHOR] = author, }; rt_missmsg_fib(RTM_REDIRECT, &info, flags | RTF_UP, error, fibnum); return (0); } /* * Routing table ioctl interface. */ int rtioctl_fib(u_long req, caddr_t data, u_int fibnum) { /* * If more ioctl commands are added here, make sure the proper * super-user checks are being performed because it is possible for * prison-root to make it this far if raw sockets have been enabled * in jails. */ #ifdef INET /* Multicast goop, grrr... */ return mrt_ioctl ? mrt_ioctl(req, data, fibnum) : EOPNOTSUPP; #else /* INET */ return ENXIO; #endif /* INET */ } struct ifaddr * ifa_ifwithroute(int flags, const struct sockaddr *dst, const struct sockaddr *gateway, u_int fibnum) { struct ifaddr *ifa; NET_EPOCH_ASSERT(); if ((flags & RTF_GATEWAY) == 0) { /* * If we are adding a route to an interface, * and the interface is a pt to pt link * we should search for the destination * as our clue to the interface. Otherwise * we can use the local address. */ ifa = NULL; if (flags & RTF_HOST) ifa = ifa_ifwithdstaddr(dst, fibnum); if (ifa == NULL) ifa = ifa_ifwithaddr(gateway); } else { /* * If we are adding a route to a remote net * or host, the gateway may still be on the * other end of a pt to pt link. */ ifa = ifa_ifwithdstaddr(gateway, fibnum); } if (ifa == NULL) ifa = ifa_ifwithnet(gateway, 0, fibnum); if (ifa == NULL) { struct nhop_object *nh; nh = rib_lookup(fibnum, gateway, NHR_NONE, 0); /* * dismiss a gateway that is reachable only * through the default router */ if ((nh == NULL) || (nh->nh_flags & NHF_DEFAULT)) return (NULL); ifa = nh->nh_ifa; } if (ifa->ifa_addr->sa_family != dst->sa_family) { struct ifaddr *oifa = ifa; ifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp); if (ifa == NULL) ifa = oifa; } return (ifa); } /* * Delete Routes for a Network Interface * * Called for each routing entry via the rnh->rnh_walktree() call above * to delete all route entries referencing a detaching network interface. * * Arguments: * rt pointer to rtentry * nh pointer to nhop * arg argument passed to rnh->rnh_walktree() - detaching interface * * Returns: * 0 successful * errno failed - reason indicated */ static int rt_ifdelroute(const struct rtentry *rt, const struct nhop_object *nh, void *arg) { struct ifnet *ifp = arg; if (nh->nh_ifp != ifp) return (0); /* * Protect (sorta) against walktree recursion problems * with cloned routes */ if ((rt->rte_flags & RTF_UP) == 0) return (0); return (1); } void rt_flushifroutes(struct ifnet *ifp) { rib_foreach_table_walk_del(AF_UNSPEC, rt_ifdelroute, ifp); } /* * Tries to extract interface from RTAX_IFP passed in rt_addrinfo. * Interface can be specified ether as interface index (sdl_index) or * the interface name (sdl_data). 
* * Returns found ifp or NULL */ static struct ifnet * info_get_ifp(struct rt_addrinfo *info) { const struct sockaddr_dl *sdl; sdl = (const struct sockaddr_dl *)info->rti_info[RTAX_IFP]; if (sdl->sdl_family != AF_LINK) return (NULL); if (sdl->sdl_index != 0) return (ifnet_byindex(sdl->sdl_index)); if (sdl->sdl_nlen > 0) { char if_name[IF_NAMESIZE]; if (sdl->sdl_nlen + offsetof(struct sockaddr_dl, sdl_data) > sdl->sdl_len) return (NULL); if (sdl->sdl_nlen >= IF_NAMESIZE) return (NULL); bzero(if_name, sizeof(if_name)); memcpy(if_name, sdl->sdl_data, sdl->sdl_nlen); return (ifunit(if_name)); } return (NULL); } /* * Calculates proper ifa/ifp for the cases when gateway AF is different * from dst AF. * * Returns 0 on success. */ __noinline static int rt_getifa_family(struct rt_addrinfo *info, uint32_t fibnum) { if (info->rti_ifp == NULL) { struct ifaddr *ifa = NULL; /* * No transmit interface specified. Guess it by checking gw sa. */ const struct sockaddr *gw = info->rti_info[RTAX_GATEWAY]; ifa = ifa_ifwithroute(RTF_GATEWAY, gw, gw, fibnum); if (ifa == NULL) return (ENETUNREACH); info->rti_ifp = ifa->ifa_ifp; } /* Prefer address from outgoing interface */ info->rti_ifa = ifaof_ifpforaddr(info->rti_info[RTAX_DST], info->rti_ifp); #ifdef INET if (info->rti_ifa == NULL) { /* Use first found IPv4 address */ bool loopback_ok = info->rti_ifp->if_flags & IFF_LOOPBACK; info->rti_ifa = (struct ifaddr *)in_findlocal(fibnum, loopback_ok); } #endif if (info->rti_ifa == NULL) return (ENETUNREACH); return (0); } /* * Fills in rti_ifp and rti_ifa for the provided fib. * * Assume basic consistency checks are executed by callers: * RTAX_DST exists, if RTF_GATEWAY is set, RTAX_GATEWAY exists as well. */ int rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum) { const struct sockaddr *dst, *gateway, *ifaaddr; int error, flags; dst = info->rti_info[RTAX_DST]; gateway = info->rti_info[RTAX_GATEWAY]; ifaaddr = info->rti_info[RTAX_IFA]; flags = info->rti_flags; /* * ifp may be specified by sockaddr_dl * when protocol address is ambiguous. */ error = 0; /* If we have interface specified by RTAX_IFP address, try to use it */ if ((info->rti_ifp == NULL) && (info->rti_info[RTAX_IFP] != NULL)) info->rti_ifp = info_get_ifp(info); /* * If we have source address specified, try to find it * TODO: avoid enumerating all ifas on all interfaces. */ if (info->rti_ifa == NULL && ifaaddr != NULL) info->rti_ifa = ifa_ifwithaddr(ifaaddr); if ((info->rti_ifa == NULL) && ((info->rti_flags & RTF_GATEWAY) != 0) && (gateway->sa_family != dst->sa_family)) return (rt_getifa_family(info, fibnum)); if (info->rti_ifa == NULL) { const struct sockaddr *sa; /* * Most common use case for the userland-supplied routes. * * Choose sockaddr to select ifa. * -- if ifp is set -- * Order of preference: * 1) IFA address * 2) gateway address * Note: for interface routes link-level gateway address * is specified to indicate the interface index without * specifying RTF_GATEWAY. In this case, ignore gateway * Note: gateway AF may be different from dst AF. In this case, * ignore gateway * 3) final destination. * 4) if all of these fails, try to get at least link-level ifa. 
* -- else -- * try to lookup gateway or dst in the routing table to get ifa */ if (info->rti_info[RTAX_IFA] != NULL) sa = info->rti_info[RTAX_IFA]; else if ((info->rti_flags & RTF_GATEWAY) != 0 && gateway->sa_family == dst->sa_family) sa = gateway; else sa = dst; if (info->rti_ifp != NULL) { info->rti_ifa = ifaof_ifpforaddr(sa, info->rti_ifp); /* Case 4 */ if (info->rti_ifa == NULL && gateway != NULL) info->rti_ifa = ifaof_ifpforaddr(gateway, info->rti_ifp); } else if (dst != NULL && gateway != NULL) info->rti_ifa = ifa_ifwithroute(flags, dst, gateway, fibnum); else if (sa != NULL) info->rti_ifa = ifa_ifwithroute(flags, sa, sa, fibnum); } if (info->rti_ifa != NULL) { if (info->rti_ifp == NULL) info->rti_ifp = info->rti_ifa->ifa_ifp; } else error = ENETUNREACH; return (error); } void rt_updatemtu(struct ifnet *ifp) { struct rib_head *rnh; int mtu; int i, j; /* * Try to update rt_mtu for all routes using this interface * Unfortunately the only way to do this is to traverse all * routing tables in all fibs/domains. */ for (i = 1; i <= AF_MAX; i++) { mtu = if_getmtu_family(ifp, i); for (j = 0; j < rt_numfibs; j++) { rnh = rt_tables_get_rnh(j, i); if (rnh == NULL) continue; nhops_update_ifmtu(rnh, ifp, mtu); } } } #if 0 int p_sockaddr(char *buf, int buflen, struct sockaddr *s); int rt_print(char *buf, int buflen, struct rtentry *rt); int p_sockaddr(char *buf, int buflen, struct sockaddr *s) { void *paddr = NULL; switch (s->sa_family) { case AF_INET: paddr = &((struct sockaddr_in *)s)->sin_addr; break; case AF_INET6: paddr = &((struct sockaddr_in6 *)s)->sin6_addr; break; } if (paddr == NULL) return (0); if (inet_ntop(s->sa_family, paddr, buf, buflen) == NULL) return (0); return (strlen(buf)); } int rt_print(char *buf, int buflen, struct rtentry *rt) { struct sockaddr *addr, *mask; int i = 0; addr = rt_key(rt); mask = rt_mask(rt); i = p_sockaddr(buf, buflen, addr); if (!(rt->rt_flags & RTF_HOST)) { buf[i++] = '/'; i += p_sockaddr(buf + i, buflen - i, mask); } if (rt->rt_flags & RTF_GATEWAY) { buf[i++] = '>'; i += p_sockaddr(buf + i, buflen - i, &rt->rt_nhop->gw_sa); } return (i); } #endif void rt_maskedcopy(const struct sockaddr *src, struct sockaddr *dst, const struct sockaddr *netmask) { const u_char *cp1 = (const u_char *)src; u_char *cp2 = (u_char *)dst; const u_char *cp3 = (const u_char *)netmask; u_char *cplim = cp2 + *cp3; u_char *cplim2 = cp2 + *cp1; *cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */ cp3 += 2; if (cplim > cplim2) cplim = cplim2; while (cp2 < cplim) *cp2++ = *cp1++ & *cp3++; if (cp2 < cplim2) bzero((caddr_t)cp2, (unsigned)(cplim2 - cp2)); } /* * Announce interface address arrival/withdraw * Returns 0 on success. */ int rt_addrmsg(int cmd, struct ifaddr *ifa, int fibnum) { #if defined(INET) || defined(INET6) struct sockaddr *sa = ifa->ifa_addr; struct ifnet *ifp = ifa->ifa_ifp; #endif KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE, ("unexpected cmd %d", cmd)); KASSERT((fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); EVENTHANDLER_DIRECT_INVOKE(rt_addrmsg, ifa, cmd); #ifdef INET if (sa->sa_family == AF_INET) { char addrstr[INET_ADDRSTRLEN]; char strbuf[INET_ADDRSTRLEN + 12]; inet_ntoa_r(((struct sockaddr_in *)sa)->sin_addr, addrstr); snprintf(strbuf, sizeof(strbuf), "address=%s", addrstr); devctl_notify("IFNET", ifp->if_xname, (cmd == RTM_ADD) ? 
"ADDR_ADD" : "ADDR_DEL", strbuf); } #endif #ifdef INET6 if (sa->sa_family == AF_INET6) { char addrstr[INET6_ADDRSTRLEN]; char strbuf[INET6_ADDRSTRLEN + 12]; ip6_sprintf(addrstr, IFA_IN6(ifa)); snprintf(strbuf, sizeof(strbuf), "address=%s", addrstr); devctl_notify("IFNET", ifp->if_xname, (cmd == RTM_ADD) ? "ADDR_ADD" : "ADDR_DEL", strbuf); } #endif if (V_rt_add_addr_allfibs) fibnum = RT_ALL_FIBS; return (rtsock_addrmsg(cmd, ifa, fibnum)); } /* * Announce kernel-originated route addition/removal to rtsock based on @rt data. * cmd: RTM_ cmd * @rt: valid rtentry * @nh: nhop object to announce * @fibnum: fib id or RT_ALL_FIBS * * Returns 0 on success. */ int rt_routemsg(int cmd, struct rtentry *rt, struct nhop_object *nh, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE, ("unexpected cmd %d", cmd)); KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); KASSERT(rt_key(rt) != NULL, (":%s: rt_key must be supplied", __func__)); return (rtsock_routemsg(cmd, rt, nh, fibnum)); } /* * Announce kernel-originated route addition/removal to rtsock based on @rt data. * cmd: RTM_ cmd * @info: addrinfo structure with valid data. * @fibnum: fib id or RT_ALL_FIBS * * Returns 0 on success. */ int rt_routemsg_info(int cmd, struct rt_addrinfo *info, int fibnum) { KASSERT(cmd == RTM_ADD || cmd == RTM_DELETE || cmd == RTM_CHANGE, ("unexpected cmd %d", cmd)); KASSERT(fibnum == RT_ALL_FIBS || (fibnum >= 0 && fibnum < rt_numfibs), ("%s: fib out of range 0 <=%d<%d", __func__, fibnum, rt_numfibs)); KASSERT(info->rti_info[RTAX_DST] != NULL, (":%s: RTAX_DST must be supplied", __func__)); return (rtsock_routemsg_info(cmd, info, fibnum)); } + +/* Netlink-related callbacks needed to glue rtsock, netlink and linuxolator */ +static void +ignore_route_event(uint32_t fibnum, const struct rib_cmd_info *rc) +{ +} +static struct rtbridge ignore_cb = { .route_f = ignore_route_event }; + +void *linux_netlink_p = NULL; /* Callback pointer for Linux translator functions */ +struct rtbridge *rtsock_callback_p = &ignore_cb; +struct rtbridge *netlink_callback_p = &ignore_cb; diff --git a/sys/net/route/route_ctl.h b/sys/net/route/route_ctl.h index 0b331e5f7d2c..d150da6264d4 100644 --- a/sys/net/route/route_ctl.h +++ b/sys/net/route/route_ctl.h @@ -1,192 +1,199 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2020 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * This header file contains public functions and structures used for * routing table manipulations. */ #ifndef _NET_ROUTE_ROUTE_CTL_H_ #define _NET_ROUTE_ROUTE_CTL_H_ struct rib_cmd_info { uint8_t rc_cmd; /* RTM_ADD|RTM_DEL|RTM_CHANGE */ uint8_t spare[3]; uint32_t rc_nh_weight; /* new nhop weight */ struct rtentry *rc_rt; /* Target entry */ struct nhop_object *rc_nh_old; /* Target nhop OR mpath */ struct nhop_object *rc_nh_new; /* Target nhop OR mpath */ }; struct route_nhop_data { union { struct nhop_object *rnd_nhop; struct nhgrp_object *rnd_nhgrp; }; uint32_t rnd_weight; }; int rib_add_route_px(uint32_t fibnum, struct sockaddr *dst, int plen, struct route_nhop_data *rnd, int op_flags, struct rib_cmd_info *rc); int rib_del_route_px(uint32_t fibnum, struct sockaddr *dst, int plen, rib_filter_f_t *filter_func, void *filter_arg, int op_flags, struct rib_cmd_info *rc); int rib_del_route_px_gw(uint32_t fibnum, struct sockaddr *dst, int plen, const struct sockaddr *gw, int op_flags, struct rib_cmd_info *rc); /* operation flags */ #define RTM_F_CREATE 0x01 #define RTM_F_EXCL 0x02 #define RTM_F_REPLACE 0x04 #define RTM_F_APPEND 0x08 #define RTM_F_FORCE 0x10 int rib_add_route(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc); int rib_del_route(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc); int rib_change_route(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc); int rib_action(uint32_t fibnum, int action, struct rt_addrinfo *info, struct rib_cmd_info *rc); int rib_match_gw(const struct rtentry *rt, const struct nhop_object *nh, void *_data); int rib_handle_ifaddr_info(uint32_t fibnum, int cmd, struct rt_addrinfo *info); int rib_add_default_route(uint32_t fibnum, int family, struct ifnet *ifp, struct sockaddr *gw, struct rib_cmd_info *rc); typedef void route_notification_t(const struct rib_cmd_info *rc, void *); void rib_decompose_notification(const struct rib_cmd_info *rc, route_notification_t *cb, void *cbdata); int rib_add_redirect(u_int fibnum, struct sockaddr *dst, struct sockaddr *gateway, struct sockaddr *author, struct ifnet *ifp, int flags, int expire_sec); /* common flags for the functions below */ #define RIB_FLAG_WLOCK 0x01 /* Need exclusive rnh lock */ #define RIB_FLAG_LOCKED 0x02 /* Do not explicitly acquire rnh lock */ enum rib_walk_hook { RIB_WALK_HOOK_PRE, /* Hook is called before iteration */ RIB_WALK_HOOK_POST, /* Hook is called after iteration */ }; typedef int rib_walktree_f_t(struct rtentry *, void *); typedef void rib_walk_hook_f_t(struct rib_head *rnh, enum rib_walk_hook stage, void *arg); void rib_walk(uint32_t fibnum, int af, bool wlock, rib_walktree_f_t *wa_f, void *arg); void rib_walk_ext(uint32_t fibnum, int af, bool wlock, rib_walktree_f_t *wa_f, rib_walk_hook_f_t *hook_f, void *arg); void rib_walk_ext_internal(struct rib_head *rnh, bool wlock, rib_walktree_f_t *wa_f, rib_walk_hook_f_t *hook_f, void *arg); void rib_walk_ext_locked(struct rib_head *rnh, rib_walktree_f_t *wa_f, 
    rib_walk_hook_f_t *hook_f, void *arg);
void rib_walk_from(uint32_t fibnum, int family, uint32_t flags,
    struct sockaddr *prefix, struct sockaddr *mask, rib_walktree_f_t *wa_f,
    void *arg);
void rib_walk_del(u_int fibnum, int family, rib_filter_f_t *filter_f,
    void *filter_arg, bool report);

void rib_foreach_table_walk(int family, bool wlock, rib_walktree_f_t *wa_f,
    rib_walk_hook_f_t *hook_f, void *arg);
void rib_foreach_table_walk_del(int family, rib_filter_f_t *filter_f, void *arg);

struct nhop_object;
struct nhgrp_object;

const struct rtentry *rib_lookup_prefix(uint32_t fibnum, int family,
    const struct sockaddr *dst, const struct sockaddr *netmask,
    struct route_nhop_data *rnd);
const struct rtentry *rib_lookup_lpm(uint32_t fibnum, int family,
    const struct sockaddr *dst, struct route_nhop_data *rnd);

/* rtentry accessors */
bool rt_is_host(const struct rtentry *rt);
sa_family_t rt_get_family(const struct rtentry *);
struct nhop_object *rt_get_raw_nhop(const struct rtentry *rt);
void rt_get_rnd(const struct rtentry *rt, struct route_nhop_data *rnd);
#ifdef INET
struct in_addr;
void rt_get_inet_prefix_plen(const struct rtentry *rt, struct in_addr *paddr,
    int *plen, uint32_t *pscopeid);
void rt_get_inet_prefix_pmask(const struct rtentry *rt, struct in_addr *paddr,
    struct in_addr *pmask, uint32_t *pscopeid);
struct rtentry *rt_get_inet_parent(uint32_t fibnum, struct in_addr addr,
    int plen);
#endif
#ifdef INET6
struct in6_addr;
void rt_get_inet6_prefix_plen(const struct rtentry *rt, struct in6_addr *paddr,
    int *plen, uint32_t *pscopeid);
void rt_get_inet6_prefix_pmask(const struct rtentry *rt, struct in6_addr *paddr,
    struct in6_addr *pmask, uint32_t *pscopeid);
struct rtentry *rt_get_inet6_parent(uint32_t fibnum, const struct in6_addr *paddr,
    int plen);

struct in6_addr;
void ip6_writemask(struct in6_addr *addr6, uint8_t mask);
#endif

/* Nexthops */
uint32_t nhops_get_count(struct rib_head *rh);

/* Multipath */
struct weightened_nhop;

const struct weightened_nhop *nhgrp_get_nhops(const struct nhgrp_object *nhg,
    uint32_t *pnum_nhops);
uint32_t nhgrp_get_count(struct rib_head *rh);
int nhgrp_get_group(struct rib_head *rh, struct weightened_nhop *wn,
    int num_nhops, uint32_t uidx, struct nhgrp_object **pnhg);

/* Route subscriptions */
enum rib_subscription_type {
	RIB_NOTIFY_IMMEDIATE,
	RIB_NOTIFY_DELAYED
};

struct rib_subscription;
typedef void rib_subscription_cb_t(struct rib_head *rnh, struct rib_cmd_info *rc,
    void *arg);
struct rib_subscription *rib_subscribe(uint32_t fibnum, int family,
    rib_subscription_cb_t *f, void *arg, enum rib_subscription_type type,
    bool waitok);
struct rib_subscription *rib_subscribe_internal(struct rib_head *rnh,
    rib_subscription_cb_t *f, void *arg, enum rib_subscription_type type,
    bool waitok);
struct rib_subscription *rib_subscribe_locked(struct rib_head *rnh,
    rib_subscription_cb_t *f, void *arg, enum rib_subscription_type type);
void rib_unsubscribe(struct rib_subscription *rs);
void rib_unsubscribe_locked(struct rib_subscription *rs);
void rib_notify(struct rib_head *rnh, enum rib_subscription_type type,
    struct rib_cmd_info *rc);

+/* Event bridge */
+typedef void route_event_f(uint32_t fibnum, const struct rib_cmd_info *rc);
+struct rtbridge {
+	route_event_f	*route_f;
+};
+extern struct rtbridge *rtsock_callback_p;
+extern struct rtbridge *netlink_callback_p;
#endif
diff --git a/sys/net/rtsock.c b/sys/net/rtsock.c
index 91ad8c79a5eb..99d962c972cb 100644
--- a/sys/net/rtsock.c
+++ b/sys/net/rtsock.c
@@ -1,2669 +1,2711 @@
/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 *
Copyright (c) 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)rtsock.c 8.7 (Berkeley) 10/12/95 * $FreeBSD$ */ #include "opt_ddb.h" #include "opt_route.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #endif #include #define DEBUG_MOD_NAME rtsock #define DEBUG_MAX_LEVEL LOG_DEBUG #include _DECLARE_DEBUG(LOG_INFO); #ifdef COMPAT_FREEBSD32 #include #include struct if_msghdr32 { uint16_t ifm_msglen; uint8_t ifm_version; uint8_t ifm_type; int32_t ifm_addrs; int32_t ifm_flags; uint16_t ifm_index; uint16_t _ifm_spare1; struct if_data ifm_data; }; struct if_msghdrl32 { uint16_t ifm_msglen; uint8_t ifm_version; uint8_t ifm_type; int32_t ifm_addrs; int32_t ifm_flags; uint16_t ifm_index; uint16_t _ifm_spare1; uint16_t ifm_len; uint16_t ifm_data_off; uint32_t _ifm_spare2; struct if_data ifm_data; }; struct ifa_msghdrl32 { uint16_t ifam_msglen; uint8_t ifam_version; uint8_t ifam_type; int32_t ifam_addrs; int32_t ifam_flags; uint16_t ifam_index; uint16_t _ifam_spare1; uint16_t ifam_len; uint16_t ifam_data_off; int32_t ifam_metric; struct if_data ifam_data; }; #define SA_SIZE32(sa) \ ( (((struct sockaddr *)(sa))->sa_len == 0) ? \ sizeof(int) : \ 1 + ( (((struct sockaddr *)(sa))->sa_len - 1) | (sizeof(int) - 1) ) ) #endif /* COMPAT_FREEBSD32 */ struct linear_buffer { char *base; /* Base allocated memory pointer */ uint32_t offset; /* Currently used offset */ uint32_t size; /* Total buffer size */ }; #define SCRATCH_BUFFER_SIZE 1024 #define RTS_PID_LOG(_l, _fmt, ...) RT_LOG_##_l(_l, "PID %d: " _fmt, curproc ? 
curproc->p_pid : 0, ## __VA_ARGS__) MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables"); /* NB: these are not modified */ static struct sockaddr route_src = { 2, PF_ROUTE, }; static struct sockaddr sa_zero = { sizeof(sa_zero), AF_INET, }; /* These are external hooks for CARP. */ int (*carp_get_vhid_p)(struct ifaddr *); /* * Used by rtsock callback code to decide whether to filter the update * notification to a socket bound to a particular FIB. */ #define RTS_FILTER_FIB M_PROTO8 /* * Used to store address family of the notification. */ #define m_rtsock_family m_pkthdr.PH_loc.eight[0] struct rcb { LIST_ENTRY(rcb) list; struct socket *rcb_socket; sa_family_t rcb_family; }; typedef struct { LIST_HEAD(, rcb) cblist; int ip_count; /* attached w/ AF_INET */ int ip6_count; /* attached w/ AF_INET6 */ int any_count; /* total attached */ } route_cb_t; VNET_DEFINE_STATIC(route_cb_t, route_cb); #define V_route_cb VNET(route_cb) struct mtx rtsock_mtx; MTX_SYSINIT(rtsock, &rtsock_mtx, "rtsock route_cb lock", MTX_DEF); #define RTSOCK_LOCK() mtx_lock(&rtsock_mtx) #define RTSOCK_UNLOCK() mtx_unlock(&rtsock_mtx) #define RTSOCK_LOCK_ASSERT() mtx_assert(&rtsock_mtx, MA_OWNED) SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); struct walkarg { int family; int w_tmemsize; int w_op, w_arg; caddr_t w_tmem; struct sysctl_req *w_req; struct sockaddr *dst; struct sockaddr *mask; }; static void rts_input(struct mbuf *m); static struct mbuf *rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo); static int rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo, struct walkarg *w, int *plen); static int rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo); static int cleanup_xaddrs(struct rt_addrinfo *info, struct linear_buffer *lb); static int sysctl_dumpentry(struct rtentry *rt, void *vw); static int sysctl_dumpnhop(struct rtentry *rt, struct nhop_object *nh, uint32_t weight, struct walkarg *w); static int sysctl_iflist(int af, struct walkarg *w); static int sysctl_ifmalist(int af, struct walkarg *w); static void rt_getmetrics(const struct rtentry *rt, const struct nhop_object *nh, struct rt_metrics *out); static void rt_dispatch(struct mbuf *, sa_family_t); static void rt_ifannouncemsg(struct ifnet *ifp, int what); static int handle_rtm_get(struct rt_addrinfo *info, u_int fibnum, struct rt_msghdr *rtm, struct rib_cmd_info *rc); static int update_rtm_from_rc(struct rt_addrinfo *info, struct rt_msghdr **prtm, int alloc_len, struct rib_cmd_info *rc, struct nhop_object *nh); static void send_rtm_reply(struct socket *so, struct rt_msghdr *rtm, struct mbuf *m, sa_family_t saf, u_int fibnum, int rtm_errno); static bool can_export_rte(struct ucred *td_ucred, bool rt_is_host, const struct sockaddr *rt_dst); +static void rtsock_notify_event(uint32_t fibnum, const struct rib_cmd_info *rc); static struct netisr_handler rtsock_nh = { .nh_name = "rtsock", .nh_handler = rts_input, .nh_proto = NETISR_ROUTE, .nh_policy = NETISR_POLICY_SOURCE, }; static int sysctl_route_netisr_maxqlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&rtsock_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&rtsock_nh, qlimit)); } SYSCTL_PROC(_net_route, OID_AUTO, netisr_maxqlen, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_route_netisr_maxqlen, "I", "maximum routing socket dispatch queue length"); static void vnet_rts_init(void) { int tmp; if (IS_DEFAULT_VNET(curvnet)) { if 
(TUNABLE_INT_FETCH("net.route.netisr_maxqlen", &tmp)) rtsock_nh.nh_qlimit = tmp; netisr_register(&rtsock_nh); } #ifdef VIMAGE else netisr_register_vnet(&rtsock_nh); #endif } VNET_SYSINIT(vnet_rtsock, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, vnet_rts_init, 0); #ifdef VIMAGE static void vnet_rts_uninit(void) { netisr_unregister_vnet(&rtsock_nh); } VNET_SYSUNINIT(vnet_rts_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, vnet_rts_uninit, 0); #endif +static void +report_route_event(const struct rib_cmd_info *rc, void *_cbdata) +{ + uint32_t fibnum = (uint32_t)(uintptr_t)_cbdata; + struct nhop_object *nh; + + nh = rc->rc_cmd == RTM_DELETE ? rc->rc_nh_old : rc->rc_nh_new; + rt_routemsg(rc->rc_cmd, rc->rc_rt, nh, fibnum); +} + +static void +rts_handle_route_event(uint32_t fibnum, const struct rib_cmd_info *rc) +{ +#ifdef ROUTE_MPATH + if ((rc->rc_nh_new && NH_IS_NHGRP(rc->rc_nh_new)) || + (rc->rc_nh_old && NH_IS_NHGRP(rc->rc_nh_old))) { + rib_decompose_notification(rc, report_route_event, + (void *)(uintptr_t)fibnum); + } else +#endif + report_route_event(rc, (void *)(uintptr_t)fibnum); +} +static struct rtbridge rtsbridge = { .route_f = rts_handle_route_event }; +static struct rtbridge *rtsbridge_orig_p; + +static void +rtsock_notify_event(uint32_t fibnum, const struct rib_cmd_info *rc) +{ + netlink_callback_p->route_f(fibnum, rc); +} + +static void +rtsock_init(void) +{ + rtsbridge_orig_p = rtsock_callback_p; + rtsock_callback_p = &rtsbridge; +} +SYSINIT(rtsock_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rtsock_init, NULL); + static void rts_handle_ifnet_arrival(void *arg __unused, struct ifnet *ifp) { rt_ifannouncemsg(ifp, IFAN_ARRIVAL); } EVENTHANDLER_DEFINE(ifnet_arrival_event, rts_handle_ifnet_arrival, NULL, 0); static void rts_handle_ifnet_departure(void *arg __unused, struct ifnet *ifp) { rt_ifannouncemsg(ifp, IFAN_DEPARTURE); } EVENTHANDLER_DEFINE(ifnet_departure_event, rts_handle_ifnet_departure, NULL, 0); static void rts_append_data(struct socket *so, struct mbuf *m) { if (sbappendaddr(&so->so_rcv, &route_src, m, NULL) == 0) { soroverflow(so); m_freem(m); } else sorwakeup(so); } static void rts_input(struct mbuf *m) { struct rcb *rcb; struct socket *last; last = NULL; RTSOCK_LOCK(); LIST_FOREACH(rcb, &V_route_cb.cblist, list) { if (rcb->rcb_family != AF_UNSPEC && rcb->rcb_family != m->m_rtsock_family) continue; if ((m->m_flags & RTS_FILTER_FIB) && M_GETFIB(m) != rcb->rcb_socket->so_fibnum) continue; if (last != NULL) { struct mbuf *n; n = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (n != NULL) rts_append_data(last, n); } last = rcb->rcb_socket; } if (last != NULL) rts_append_data(last, m); else m_freem(m); RTSOCK_UNLOCK(); } static void rts_close(struct socket *so) { soisdisconnected(so); } static SYSCTL_NODE(_net, OID_AUTO, rtsock, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Routing socket infrastructure"); static u_long rts_sendspace = 8192; SYSCTL_ULONG(_net_rtsock, OID_AUTO, sendspace, CTLFLAG_RW, &rts_sendspace, 0, "Default routing socket send space"); static u_long rts_recvspace = 8192; SYSCTL_ULONG(_net_rtsock, OID_AUTO, recvspace, CTLFLAG_RW, &rts_recvspace, 0, "Default routing socket receive space"); static int rts_attach(struct socket *so, int proto, struct thread *td) { struct rcb *rcb; int error; error = soreserve(so, rts_sendspace, rts_recvspace); if (error) return (error); rcb = malloc(sizeof(*rcb), M_PCB, M_WAITOK); rcb->rcb_socket = so; rcb->rcb_family = proto; so->so_pcb = rcb; so->so_fibnum = td->td_proc->p_fibnum; so->so_options |= SO_USELOOPBACK; RTSOCK_LOCK(); 
LIST_INSERT_HEAD(&V_route_cb.cblist, rcb, list); switch (proto) { case AF_INET: V_route_cb.ip_count++; break; case AF_INET6: V_route_cb.ip6_count++; break; } V_route_cb.any_count++; RTSOCK_UNLOCK(); soisconnected(so); return (0); } static void rts_detach(struct socket *so) { struct rcb *rcb = so->so_pcb; RTSOCK_LOCK(); LIST_REMOVE(rcb, list); switch(rcb->rcb_family) { case AF_INET: V_route_cb.ip_count--; break; case AF_INET6: V_route_cb.ip6_count--; break; } V_route_cb.any_count--; RTSOCK_UNLOCK(); free(rcb, M_PCB); so->so_pcb = NULL; } static int rts_shutdown(struct socket *so) { socantsendmore(so); return (0); } #ifndef _SOCKADDR_UNION_DEFINED #define _SOCKADDR_UNION_DEFINED /* * The union of all possible address formats we handle. */ union sockaddr_union { struct sockaddr sa; struct sockaddr_in sin; struct sockaddr_in6 sin6; }; #endif /* _SOCKADDR_UNION_DEFINED */ static int rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp, struct nhop_object *nh, union sockaddr_union *saun, struct ucred *cred) { #if defined(INET) || defined(INET6) struct epoch_tracker et; #endif /* First, see if the returned address is part of the jail. */ if (prison_if(cred, nh->nh_ifa->ifa_addr) == 0) { info->rti_info[RTAX_IFA] = nh->nh_ifa->ifa_addr; return (0); } switch (info->rti_info[RTAX_DST]->sa_family) { #ifdef INET case AF_INET: { struct in_addr ia; struct ifaddr *ifa; int found; found = 0; /* * Try to find an address on the given outgoing interface * that belongs to the jail. */ NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa; sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; ia = ((struct sockaddr_in *)sa)->sin_addr; if (prison_check_ip4(cred, &ia) == 0) { found = 1; break; } } NET_EPOCH_EXIT(et); if (!found) { /* * As a last resort return the 'default' jail address. */ ia = ((struct sockaddr_in *)nh->nh_ifa->ifa_addr)-> sin_addr; if (prison_get_ip4(cred, &ia) != 0) return (ESRCH); } bzero(&saun->sin, sizeof(struct sockaddr_in)); saun->sin.sin_len = sizeof(struct sockaddr_in); saun->sin.sin_family = AF_INET; saun->sin.sin_addr.s_addr = ia.s_addr; info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin; break; } #endif #ifdef INET6 case AF_INET6: { struct in6_addr ia6; struct ifaddr *ifa; int found; found = 0; /* * Try to find an address on the given outgoing interface * that belongs to the jail. */ NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa; sa = ifa->ifa_addr; if (sa->sa_family != AF_INET6) continue; bcopy(&((struct sockaddr_in6 *)sa)->sin6_addr, &ia6, sizeof(struct in6_addr)); if (prison_check_ip6(cred, &ia6) == 0) { found = 1; break; } } NET_EPOCH_EXIT(et); if (!found) { /* * As a last resort return the 'default' jail address. 
*/ ia6 = ((struct sockaddr_in6 *)nh->nh_ifa->ifa_addr)-> sin6_addr; if (prison_get_ip6(cred, &ia6) != 0) return (ESRCH); } bzero(&saun->sin6, sizeof(struct sockaddr_in6)); saun->sin6.sin6_len = sizeof(struct sockaddr_in6); saun->sin6.sin6_family = AF_INET6; bcopy(&ia6, &saun->sin6.sin6_addr, sizeof(struct in6_addr)); if (sa6_recoverscope(&saun->sin6) != 0) return (ESRCH); info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin6; break; } #endif default: return (ESRCH); } return (0); } static int fill_blackholeinfo(struct rt_addrinfo *info, union sockaddr_union *saun) { struct ifaddr *ifa; sa_family_t saf; if (V_loif == NULL) { RTS_PID_LOG(LOG_INFO, "Unable to add blackhole/reject nhop without loopback"); return (ENOTSUP); } info->rti_ifp = V_loif; saf = info->rti_info[RTAX_DST]->sa_family; CK_STAILQ_FOREACH(ifa, &info->rti_ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == saf) { info->rti_ifa = ifa; break; } } if (info->rti_ifa == NULL) { RTS_PID_LOG(LOG_INFO, "Unable to find ifa for blackhole/reject nhop"); return (ENOTSUP); } bzero(saun, sizeof(union sockaddr_union)); switch (saf) { #ifdef INET case AF_INET: saun->sin.sin_family = AF_INET; saun->sin.sin_len = sizeof(struct sockaddr_in); saun->sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK); break; #endif #ifdef INET6 case AF_INET6: saun->sin6.sin6_family = AF_INET6; saun->sin6.sin6_len = sizeof(struct sockaddr_in6); saun->sin6.sin6_addr = in6addr_loopback; break; #endif default: RTS_PID_LOG(LOG_INFO, "unsupported family: %d", saf); return (ENOTSUP); } info->rti_info[RTAX_GATEWAY] = &saun->sa; info->rti_flags |= RTF_GATEWAY; return (0); } /* * Fills in @info based on userland-provided @rtm message. * * Returns 0 on success. */ static int fill_addrinfo(struct rt_msghdr *rtm, int len, struct linear_buffer *lb, u_int fibnum, struct rt_addrinfo *info) { int error; rtm->rtm_pid = curproc->p_pid; info->rti_addrs = rtm->rtm_addrs; info->rti_mflags = rtm->rtm_inits; info->rti_rmx = &rtm->rtm_rmx; /* * rt_xaddrs() performs s6_addr[2] := sin6_scope_id for AF_INET6 * link-local address because rtrequest requires addresses with * embedded scope id. */ if (rt_xaddrs((caddr_t)(rtm + 1), len + (caddr_t)rtm, info)) return (EINVAL); info->rti_flags = rtm->rtm_flags; error = cleanup_xaddrs(info, lb); if (error != 0) return (error); /* * Verify that the caller has the appropriate privilege; RTM_GET * is the only operation the non-superuser is allowed. */ if (rtm->rtm_type != RTM_GET) { error = priv_check(curthread, PRIV_NET_ROUTE); if (error != 0) return (error); } /* * The given gateway address may be an interface address. * For example, issuing a "route change" command on a route * entry that was created from a tunnel, and the gateway * address given is the local end point. In this case the * RTF_GATEWAY flag must be cleared or the destination will * not be reachable even though there is no error message. */ if (info->rti_info[RTAX_GATEWAY] != NULL && info->rti_info[RTAX_GATEWAY]->sa_family != AF_LINK) { struct nhop_object *nh; /* * A host route through the loopback interface is * installed for each interface adddress. In pre 8.0 * releases the interface address of a PPP link type * is not reachable locally. This behavior is fixed as * part of the new L2/L3 redesign and rewrite work. The * signature of this interface address route is the * AF_LINK sa_family type of the gateway, and the * rt_ifp has the IFF_LOOPBACK flag set. 
*/ nh = rib_lookup(fibnum, info->rti_info[RTAX_GATEWAY], NHR_NONE, 0); if (nh != NULL && nh->gw_sa.sa_family == AF_LINK && nh->nh_ifp->if_flags & IFF_LOOPBACK) { info->rti_flags &= ~RTF_GATEWAY; info->rti_flags |= RTF_GWFLAG_COMPAT; } } return (0); } static struct nhop_object * select_nhop(struct nhop_object *nh, const struct sockaddr *gw) { if (!NH_IS_NHGRP(nh)) return (nh); #ifdef ROUTE_MPATH const struct weightened_nhop *wn; uint32_t num_nhops; wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops); if (gw == NULL) return (wn[0].nh); for (int i = 0; i < num_nhops; i++) { if (match_nhop_gw(wn[i].nh, gw)) return (wn[i].nh); } #endif return (NULL); } /* * Handles RTM_GET message from routing socket, returning matching rt. * * Returns: * 0 on success, with locked and referenced matching rt in @rt_nrt * errno of failure */ static int handle_rtm_get(struct rt_addrinfo *info, u_int fibnum, struct rt_msghdr *rtm, struct rib_cmd_info *rc) { RIB_RLOCK_TRACKER; struct rib_head *rnh; struct nhop_object *nh; sa_family_t saf; saf = info->rti_info[RTAX_DST]->sa_family; rnh = rt_tables_get_rnh(fibnum, saf); if (rnh == NULL) return (EAFNOSUPPORT); RIB_RLOCK(rnh); /* * By (implicit) convention host route (one without netmask) * means longest-prefix-match request and the route with netmask * means exact-match lookup. * As cleanup_xaddrs() cleans up info flags&addrs for the /32,/128 * prefixes, use original data to check for the netmask presence. */ if ((rtm->rtm_addrs & RTA_NETMASK) == 0) { /* * Provide longest prefix match for * address lookup (no mask). * 'route -n get addr' */ rc->rc_rt = (struct rtentry *) rnh->rnh_matchaddr( info->rti_info[RTAX_DST], &rnh->head); } else rc->rc_rt = (struct rtentry *) rnh->rnh_lookup( info->rti_info[RTAX_DST], info->rti_info[RTAX_NETMASK], &rnh->head); if (rc->rc_rt == NULL) { RIB_RUNLOCK(rnh); return (ESRCH); } nh = select_nhop(rt_get_raw_nhop(rc->rc_rt), info->rti_info[RTAX_GATEWAY]); if (nh == NULL) { RIB_RUNLOCK(rnh); return (ESRCH); } /* * If performing proxied L2 entry insertion, and * the actual PPP host entry is found, perform * another search to retrieve the prefix route of * the local end point of the PPP link. * TODO: move this logic to userland. 
*/ if (rtm->rtm_flags & RTF_ANNOUNCE) { struct sockaddr_storage laddr; if (nh->nh_ifp != NULL && nh->nh_ifp->if_type == IFT_PROPVIRTUAL) { struct ifaddr *ifa; ifa = ifa_ifwithnet(info->rti_info[RTAX_DST], 1, RT_ALL_FIBS); if (ifa != NULL) rt_maskedcopy(ifa->ifa_addr, (struct sockaddr *)&laddr, ifa->ifa_netmask); } else rt_maskedcopy(nh->nh_ifa->ifa_addr, (struct sockaddr *)&laddr, nh->nh_ifa->ifa_netmask); /* * refactor rt and no lock operation necessary */ rc->rc_rt = (struct rtentry *)rnh->rnh_matchaddr( (struct sockaddr *)&laddr, &rnh->head); if (rc->rc_rt == NULL) { RIB_RUNLOCK(rnh); return (ESRCH); } nh = select_nhop(rt_get_raw_nhop(rc->rc_rt), info->rti_info[RTAX_GATEWAY]); if (nh == NULL) { RIB_RUNLOCK(rnh); return (ESRCH); } } rc->rc_nh_new = nh; rc->rc_nh_weight = rc->rc_rt->rt_weight; RIB_RUNLOCK(rnh); return (0); } static void init_sockaddrs_family(int family, struct sockaddr *dst, struct sockaddr *mask) { #ifdef INET if (family == AF_INET) { struct sockaddr_in *dst4 = (struct sockaddr_in *)dst; struct sockaddr_in *mask4 = (struct sockaddr_in *)mask; bzero(dst4, sizeof(struct sockaddr_in)); bzero(mask4, sizeof(struct sockaddr_in)); dst4->sin_family = AF_INET; dst4->sin_len = sizeof(struct sockaddr_in); mask4->sin_family = AF_INET; mask4->sin_len = sizeof(struct sockaddr_in); } #endif #ifdef INET6 if (family == AF_INET6) { struct sockaddr_in6 *dst6 = (struct sockaddr_in6 *)dst; struct sockaddr_in6 *mask6 = (struct sockaddr_in6 *)mask; bzero(dst6, sizeof(struct sockaddr_in6)); bzero(mask6, sizeof(struct sockaddr_in6)); dst6->sin6_family = AF_INET6; dst6->sin6_len = sizeof(struct sockaddr_in6); mask6->sin6_family = AF_INET6; mask6->sin6_len = sizeof(struct sockaddr_in6); } #endif } static void export_rtaddrs(const struct rtentry *rt, struct sockaddr *dst, struct sockaddr *mask) { #ifdef INET if (dst->sa_family == AF_INET) { struct sockaddr_in *dst4 = (struct sockaddr_in *)dst; struct sockaddr_in *mask4 = (struct sockaddr_in *)mask; uint32_t scopeid = 0; rt_get_inet_prefix_pmask(rt, &dst4->sin_addr, &mask4->sin_addr, &scopeid); return; } #endif #ifdef INET6 if (dst->sa_family == AF_INET6) { struct sockaddr_in6 *dst6 = (struct sockaddr_in6 *)dst; struct sockaddr_in6 *mask6 = (struct sockaddr_in6 *)mask; uint32_t scopeid = 0; rt_get_inet6_prefix_pmask(rt, &dst6->sin6_addr, &mask6->sin6_addr, &scopeid); dst6->sin6_scope_id = scopeid; return; } #endif } static int update_rtm_from_info(struct rt_addrinfo *info, struct rt_msghdr **prtm, int alloc_len) { struct rt_msghdr *rtm, *orig_rtm = NULL; struct walkarg w; int len; rtm = *prtm; /* Check if we need to realloc storage */ rtsock_msg_buffer(rtm->rtm_type, info, NULL, &len); if (len > alloc_len) { struct rt_msghdr *tmp_rtm; tmp_rtm = malloc(len, M_TEMP, M_NOWAIT); if (tmp_rtm == NULL) return (ENOBUFS); bcopy(rtm, tmp_rtm, rtm->rtm_msglen); orig_rtm = rtm; rtm = tmp_rtm; alloc_len = len; /* * Delay freeing original rtm as info contains * data referencing it. */ } w.w_tmem = (caddr_t)rtm; w.w_tmemsize = alloc_len; rtsock_msg_buffer(rtm->rtm_type, info, &w, &len); rtm->rtm_addrs = info->rti_addrs; if (orig_rtm != NULL) free(orig_rtm, M_TEMP); *prtm = rtm; return (0); } /* * Update sockaddrs, flags, etc in @prtm based on @rc data. * rtm can be reallocated. * * Returns 0 on success, along with pointer to (potentially reallocated) * rtm. 
* */ static int update_rtm_from_rc(struct rt_addrinfo *info, struct rt_msghdr **prtm, int alloc_len, struct rib_cmd_info *rc, struct nhop_object *nh) { union sockaddr_union saun; struct rt_msghdr *rtm; struct ifnet *ifp; int error; rtm = *prtm; union sockaddr_union sa_dst, sa_mask; int family = info->rti_info[RTAX_DST]->sa_family; init_sockaddrs_family(family, &sa_dst.sa, &sa_mask.sa); export_rtaddrs(rc->rc_rt, &sa_dst.sa, &sa_mask.sa); info->rti_info[RTAX_DST] = &sa_dst.sa; info->rti_info[RTAX_NETMASK] = rt_is_host(rc->rc_rt) ? NULL : &sa_mask.sa; info->rti_info[RTAX_GATEWAY] = &nh->gw_sa; info->rti_info[RTAX_GENMASK] = 0; ifp = nh->nh_ifp; if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) { if (ifp) { info->rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr; error = rtm_get_jailed(info, ifp, nh, &saun, curthread->td_ucred); if (error != 0) return (error); if (ifp->if_flags & IFF_POINTOPOINT) info->rti_info[RTAX_BRD] = nh->nh_ifa->ifa_dstaddr; rtm->rtm_index = ifp->if_index; } else { info->rti_info[RTAX_IFP] = NULL; info->rti_info[RTAX_IFA] = NULL; } } else if (ifp != NULL) rtm->rtm_index = ifp->if_index; if ((error = update_rtm_from_info(info, prtm, alloc_len)) != 0) return (error); rtm = *prtm; rtm->rtm_flags = rc->rc_rt->rte_flags | nhop_get_rtflags(nh); if (rtm->rtm_flags & RTF_GWFLAG_COMPAT) rtm->rtm_flags = RTF_GATEWAY | (rtm->rtm_flags & ~RTF_GWFLAG_COMPAT); rt_getmetrics(rc->rc_rt, nh, &rtm->rtm_rmx); rtm->rtm_rmx.rmx_weight = rc->rc_nh_weight; return (0); } #ifdef ROUTE_MPATH static void save_del_notification(const struct rib_cmd_info *rc, void *_cbdata) { struct rib_cmd_info *rc_new = (struct rib_cmd_info *)_cbdata; if (rc->rc_cmd == RTM_DELETE) *rc_new = *rc; } static void save_add_notification(const struct rib_cmd_info *rc, void *_cbdata) { struct rib_cmd_info *rc_new = (struct rib_cmd_info *)_cbdata; if (rc->rc_cmd == RTM_ADD) *rc_new = *rc; } #endif #if defined(INET6) || defined(INET) static struct sockaddr * alloc_sockaddr_aligned(struct linear_buffer *lb, int len) { len = roundup2(len, sizeof(uint64_t)); if (lb->offset + len > lb->size) return (NULL); struct sockaddr *sa = (struct sockaddr *)(lb->base + lb->offset); lb->offset += len; return (sa); } #endif static int rts_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { struct rt_msghdr *rtm = NULL; struct rt_addrinfo info; struct epoch_tracker et; #ifdef INET6 struct sockaddr_storage ss; struct sockaddr_in6 *sin6; int i, rti_need_deembed = 0; #endif int alloc_len = 0, len, error = 0, fibnum; sa_family_t saf = AF_UNSPEC; struct rib_cmd_info rc; struct nhop_object *nh; if ((flags & PRUS_OOB) || control != NULL) { m_freem(m); if (control != NULL) m_freem(control); return (EOPNOTSUPP); } fibnum = so->so_fibnum; #define senderr(e) { error = e; goto flush;} if (m == NULL || ((m->m_len < sizeof(long)) && (m = m_pullup(m, sizeof(long))) == NULL)) return (ENOBUFS); if ((m->m_flags & M_PKTHDR) == 0) panic("route_output"); NET_EPOCH_ENTER(et); len = m->m_pkthdr.len; if (len < sizeof(*rtm) || len != mtod(m, struct rt_msghdr *)->rtm_msglen) senderr(EINVAL); /* * Most of current messages are in range 200-240 bytes, * minimize possible re-allocation on reply using larger size * buffer aligned on 1k boundaty. 
*/ alloc_len = roundup2(len, 1024); int total_len = alloc_len + SCRATCH_BUFFER_SIZE; if ((rtm = malloc(total_len, M_TEMP, M_NOWAIT)) == NULL) senderr(ENOBUFS); m_copydata(m, 0, len, (caddr_t)rtm); bzero(&info, sizeof(info)); nh = NULL; struct linear_buffer lb = { .base = (char *)rtm + alloc_len, .size = SCRATCH_BUFFER_SIZE, }; if (rtm->rtm_version != RTM_VERSION) { /* Do not touch message since format is unknown */ free(rtm, M_TEMP); rtm = NULL; senderr(EPROTONOSUPPORT); } /* * Starting from here, it is possible * to alter original message and insert * caller PID and error value. */ if ((error = fill_addrinfo(rtm, len, &lb, fibnum, &info)) != 0) { senderr(error); } /* fill_addringo() embeds scope into IPv6 addresses */ #ifdef INET6 rti_need_deembed = 1; #endif saf = info.rti_info[RTAX_DST]->sa_family; /* support for new ARP code */ if (rtm->rtm_flags & RTF_LLDATA) { error = lla_rt_output(rtm, &info); goto flush; } union sockaddr_union gw_saun; int blackhole_flags = rtm->rtm_flags & (RTF_BLACKHOLE|RTF_REJECT); if (blackhole_flags != 0) { if (blackhole_flags != (RTF_BLACKHOLE | RTF_REJECT)) error = fill_blackholeinfo(&info, &gw_saun); else { RTS_PID_LOG(LOG_DEBUG, "both BLACKHOLE and REJECT flags specifiied"); error = EINVAL; } if (error != 0) senderr(error); } switch (rtm->rtm_type) { case RTM_ADD: case RTM_CHANGE: if (rtm->rtm_type == RTM_ADD) { if (info.rti_info[RTAX_GATEWAY] == NULL) { RTS_PID_LOG(LOG_DEBUG, "RTM_ADD w/o gateway"); senderr(EINVAL); } } error = rib_action(fibnum, rtm->rtm_type, &info, &rc); if (error == 0) { + rtsock_notify_event(fibnum, &rc); #ifdef ROUTE_MPATH if (NH_IS_NHGRP(rc.rc_nh_new) || (rc.rc_nh_old && NH_IS_NHGRP(rc.rc_nh_old))) { struct rib_cmd_info rc_simple = {}; rib_decompose_notification(&rc, save_add_notification, (void *)&rc_simple); rc = rc_simple; } #endif /* nh MAY be empty if RTM_CHANGE request is no-op */ nh = rc.rc_nh_new; if (nh != NULL) { rtm->rtm_index = nh->nh_ifp->if_index; rtm->rtm_flags = rc.rc_rt->rte_flags | nhop_get_rtflags(nh); } } break; case RTM_DELETE: error = rib_action(fibnum, RTM_DELETE, &info, &rc); if (error == 0) { + rtsock_notify_event(fibnum, &rc); #ifdef ROUTE_MPATH if (NH_IS_NHGRP(rc.rc_nh_old) || (rc.rc_nh_new && NH_IS_NHGRP(rc.rc_nh_new))) { struct rib_cmd_info rc_simple = {}; rib_decompose_notification(&rc, save_del_notification, (void *)&rc_simple); rc = rc_simple; } #endif nh = rc.rc_nh_old; } break; case RTM_GET: error = handle_rtm_get(&info, fibnum, rtm, &rc); if (error != 0) senderr(error); nh = rc.rc_nh_new; if (!can_export_rte(curthread->td_ucred, info.rti_info[RTAX_NETMASK] == NULL, info.rti_info[RTAX_DST])) { senderr(ESRCH); } break; default: senderr(EOPNOTSUPP); } if (error == 0 && nh != NULL) { error = update_rtm_from_rc(&info, &rtm, alloc_len, &rc, nh); /* * Note that some sockaddr pointers may have changed to * point to memory outsize @rtm. Some may be pointing * to the on-stack variables. * Given that, any pointer in @info CANNOT BE USED. */ /* * scopeid deembedding has been performed while * writing updated rtm in rtsock_msg_buffer(). * With that in mind, skip deembedding procedure below. */ #ifdef INET6 rti_need_deembed = 0; #endif } flush: NET_EPOCH_EXIT(et); #ifdef INET6 if (rtm != NULL) { if (rti_need_deembed) { /* sin6_scope_id is recovered before sending rtm. 
*/ sin6 = (struct sockaddr_in6 *)&ss; for (i = 0; i < RTAX_MAX; i++) { if (info.rti_info[i] == NULL) continue; if (info.rti_info[i]->sa_family != AF_INET6) continue; bcopy(info.rti_info[i], sin6, sizeof(*sin6)); if (sa6_recoverscope(sin6) == 0) bcopy(sin6, info.rti_info[i], sizeof(*sin6)); } if (update_rtm_from_info(&info, &rtm, alloc_len) != 0) { if (error != 0) error = ENOBUFS; } } } #endif send_rtm_reply(so, rtm, m, saf, fibnum, error); return (error); } /* * Sends the prepared reply message in @rtm to all rtsock clients. * Frees @m and @rtm. * */ static void send_rtm_reply(struct socket *so, struct rt_msghdr *rtm, struct mbuf *m, sa_family_t saf, u_int fibnum, int rtm_errno) { struct rcb *rcb = NULL; /* * Check to see if we don't want our own messages. */ if ((so->so_options & SO_USELOOPBACK) == 0) { if (V_route_cb.any_count <= 1) { if (rtm != NULL) free(rtm, M_TEMP); m_freem(m); return; } /* There is another listener, so construct message */ rcb = so->so_pcb; } if (rtm != NULL) { if (rtm_errno!= 0) rtm->rtm_errno = rtm_errno; else rtm->rtm_flags |= RTF_DONE; m_copyback(m, 0, rtm->rtm_msglen, (caddr_t)rtm); if (m->m_pkthdr.len < rtm->rtm_msglen) { m_freem(m); m = NULL; } else if (m->m_pkthdr.len > rtm->rtm_msglen) m_adj(m, rtm->rtm_msglen - m->m_pkthdr.len); free(rtm, M_TEMP); } if (m != NULL) { M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; if (rcb) { /* * XXX insure we don't get a copy by * invalidating our protocol */ sa_family_t family = rcb->rcb_family; rcb->rcb_family = AF_UNSPEC; rt_dispatch(m, saf); rcb->rcb_family = family; } else rt_dispatch(m, saf); } } static void rt_getmetrics(const struct rtentry *rt, const struct nhop_object *nh, struct rt_metrics *out) { bzero(out, sizeof(*out)); out->rmx_mtu = nh->nh_mtu; out->rmx_weight = rt->rt_weight; out->rmx_nhidx = nhop_get_idx(nh); /* Kernel -> userland timebase conversion. */ out->rmx_expire = nhop_get_expire(nh) ? nhop_get_expire(nh) - time_uptime + time_second : 0; } /* * Extract the addresses of the passed sockaddrs. * Do a little sanity checking so as to avoid bad memory references. * This data is derived straight from userland. */ static int rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo) { struct sockaddr *sa; int i; for (i = 0; i < RTAX_MAX && cp < cplim; i++) { if ((rtinfo->rti_addrs & (1 << i)) == 0) continue; sa = (struct sockaddr *)cp; /* * It won't fit. */ if (cp + sa->sa_len > cplim) { RTS_PID_LOG(LOG_DEBUG, "sa_len too big for sa type %d", i); return (EINVAL); } /* * there are no more.. quit now * If there are more bits, they are in error. * I've seen this. route(1) can evidently generate these. * This causes kernel to core dump. * for compatibility, If we see this, point to a safe address. 
*/ if (sa->sa_len == 0) { rtinfo->rti_info[i] = &sa_zero; return (0); /* should be EINVAL but for compat */ } /* accept it */ #ifdef INET6 if (sa->sa_family == AF_INET6) sa6_embedscope((struct sockaddr_in6 *)sa, V_ip6_use_defzone); #endif rtinfo->rti_info[i] = sa; cp += SA_SIZE(sa); } return (0); } #ifdef INET static inline void fill_sockaddr_inet(struct sockaddr_in *sin, struct in_addr addr) { const struct sockaddr_in nsin = { .sin_family = AF_INET, .sin_len = sizeof(struct sockaddr_in), .sin_addr = addr, }; *sin = nsin; } #endif #ifdef INET6 static inline void fill_sockaddr_inet6(struct sockaddr_in6 *sin6, const struct in6_addr *addr6, uint32_t scopeid) { const struct sockaddr_in6 nsin6 = { .sin6_family = AF_INET6, .sin6_len = sizeof(struct sockaddr_in6), .sin6_addr = *addr6, .sin6_scope_id = scopeid, }; *sin6 = nsin6; } #endif #if defined(INET6) || defined(INET) /* * Checks if gateway is suitable for lltable operations. * Lltable code requires AF_LINK gateway with ifindex * and mac address specified. * Returns 0 on success. */ static int cleanup_xaddrs_lladdr(struct rt_addrinfo *info) { struct sockaddr_dl *sdl = (struct sockaddr_dl *)info->rti_info[RTAX_GATEWAY]; if (sdl->sdl_family != AF_LINK) return (EINVAL); if (sdl->sdl_index == 0) { RTS_PID_LOG(LOG_DEBUG, "AF_LINK gateway w/o ifindex"); return (EINVAL); } if (offsetof(struct sockaddr_dl, sdl_data) + sdl->sdl_nlen + sdl->sdl_alen > sdl->sdl_len) { RTS_PID_LOG(LOG_DEBUG, "AF_LINK gw: sdl_nlen/sdl_alen too large"); return (EINVAL); } return (0); } static int cleanup_xaddrs_gateway(struct rt_addrinfo *info, struct linear_buffer *lb) { struct sockaddr *gw = info->rti_info[RTAX_GATEWAY]; struct sockaddr *sa; if (info->rti_flags & RTF_LLDATA) return (cleanup_xaddrs_lladdr(info)); switch (gw->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *gw_sin = (struct sockaddr_in *)gw; /* Ensure reads do not go beyoud SA boundary */ if (SA_SIZE(gw) < offsetof(struct sockaddr_in, sin_zero)) { RTS_PID_LOG(LOG_DEBUG, "gateway sin_len too small: %d", gw->sa_len); return (EINVAL); } sa = alloc_sockaddr_aligned(lb, sizeof(struct sockaddr_in)); if (sa == NULL) return (ENOBUFS); fill_sockaddr_inet((struct sockaddr_in *)sa, gw_sin->sin_addr); info->rti_info[RTAX_GATEWAY] = sa; } break; #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *gw_sin6 = (struct sockaddr_in6 *)gw; if (gw_sin6->sin6_len < sizeof(struct sockaddr_in6)) { RTS_PID_LOG(LOG_DEBUG, "gateway sin6_len too small: %d", gw->sa_len); return (EINVAL); } fill_sockaddr_inet6(gw_sin6, &gw_sin6->sin6_addr, 0); break; } #endif case AF_LINK: { struct sockaddr_dl *gw_sdl; size_t sdl_min_len = offsetof(struct sockaddr_dl, sdl_data); gw_sdl = (struct sockaddr_dl *)gw; if (gw_sdl->sdl_len < sdl_min_len) { RTS_PID_LOG(LOG_DEBUG, "gateway sdl_len too small: %d", gw_sdl->sdl_len); return (EINVAL); } sa = alloc_sockaddr_aligned(lb, sizeof(struct sockaddr_dl_short)); if (sa == NULL) return (ENOBUFS); const struct sockaddr_dl_short sdl = { .sdl_family = AF_LINK, .sdl_len = sizeof(struct sockaddr_dl_short), .sdl_index = gw_sdl->sdl_index, }; *((struct sockaddr_dl_short *)sa) = sdl; info->rti_info[RTAX_GATEWAY] = sa; break; } } return (0); } #endif static void remove_netmask(struct rt_addrinfo *info) { info->rti_info[RTAX_NETMASK] = NULL; info->rti_flags |= RTF_HOST; info->rti_addrs &= ~RTA_NETMASK; } #ifdef INET static int cleanup_xaddrs_inet(struct rt_addrinfo *info, struct linear_buffer *lb) { struct sockaddr_in *dst_sa, *mask_sa; const int sa_len = sizeof(struct sockaddr_in); struct in_addr 
dst, mask; /* Check & fixup dst/netmask combination first */ dst_sa = (struct sockaddr_in *)info->rti_info[RTAX_DST]; mask_sa = (struct sockaddr_in *)info->rti_info[RTAX_NETMASK]; /* Ensure reads do not go beyound the buffer size */ if (SA_SIZE(dst_sa) < offsetof(struct sockaddr_in, sin_zero)) { RTS_PID_LOG(LOG_DEBUG, "prefix dst sin_len too small: %d", dst_sa->sin_len); return (EINVAL); } if ((mask_sa != NULL) && mask_sa->sin_len < sizeof(struct sockaddr_in)) { /* * Some older routing software encode mask length into the * sin_len, thus resulting in "truncated" sockaddr. */ int len = mask_sa->sin_len - offsetof(struct sockaddr_in, sin_addr); if (len >= 0) { mask.s_addr = 0; if (len > sizeof(struct in_addr)) len = sizeof(struct in_addr); memcpy(&mask, &mask_sa->sin_addr, len); } else { RTS_PID_LOG(LOG_DEBUG, "prefix mask sin_len too small: %d", mask_sa->sin_len); return (EINVAL); } } else mask.s_addr = mask_sa ? mask_sa->sin_addr.s_addr : INADDR_BROADCAST; dst.s_addr = htonl(ntohl(dst_sa->sin_addr.s_addr) & ntohl(mask.s_addr)); /* Construct new "clean" dst/mask sockaddresses */ if ((dst_sa = (struct sockaddr_in *)alloc_sockaddr_aligned(lb, sa_len)) == NULL) return (ENOBUFS); fill_sockaddr_inet(dst_sa, dst); info->rti_info[RTAX_DST] = (struct sockaddr *)dst_sa; if (mask.s_addr != INADDR_BROADCAST) { if ((mask_sa = (struct sockaddr_in *)alloc_sockaddr_aligned(lb, sa_len)) == NULL) return (ENOBUFS); fill_sockaddr_inet(mask_sa, mask); info->rti_info[RTAX_NETMASK] = (struct sockaddr *)mask_sa; info->rti_flags &= ~RTF_HOST; } else remove_netmask(info); /* Check gateway */ if (info->rti_info[RTAX_GATEWAY] != NULL) return (cleanup_xaddrs_gateway(info, lb)); return (0); } #endif #ifdef INET6 static int cleanup_xaddrs_inet6(struct rt_addrinfo *info, struct linear_buffer *lb) { struct sockaddr *sa; struct sockaddr_in6 *dst_sa, *mask_sa; struct in6_addr mask, *dst; const int sa_len = sizeof(struct sockaddr_in6); /* Check & fixup dst/netmask combination first */ dst_sa = (struct sockaddr_in6 *)info->rti_info[RTAX_DST]; mask_sa = (struct sockaddr_in6 *)info->rti_info[RTAX_NETMASK]; if (dst_sa->sin6_len < sizeof(struct sockaddr_in6)) { RTS_PID_LOG(LOG_DEBUG, "prefix dst sin6_len too small: %d", dst_sa->sin6_len); return (EINVAL); } if (mask_sa && mask_sa->sin6_len < sizeof(struct sockaddr_in6)) { /* * Some older routing software encode mask length into the * sin6_len, thus resulting in "truncated" sockaddr. */ int len = mask_sa->sin6_len - offsetof(struct sockaddr_in6, sin6_addr); if (len >= 0) { bzero(&mask, sizeof(mask)); if (len > sizeof(struct in6_addr)) len = sizeof(struct in6_addr); memcpy(&mask, &mask_sa->sin6_addr, len); } else { RTS_PID_LOG(LOG_DEBUG, "rtsock: prefix mask sin6_len too small: %d", mask_sa->sin6_len); return (EINVAL); } } else mask = mask_sa ? 
mask_sa->sin6_addr : in6mask128; dst = &dst_sa->sin6_addr; IN6_MASK_ADDR(dst, &mask); if ((sa = alloc_sockaddr_aligned(lb, sa_len)) == NULL) return (ENOBUFS); fill_sockaddr_inet6((struct sockaddr_in6 *)sa, dst, 0); info->rti_info[RTAX_DST] = sa; if (!IN6_ARE_ADDR_EQUAL(&mask, &in6mask128)) { if ((sa = alloc_sockaddr_aligned(lb, sa_len)) == NULL) return (ENOBUFS); fill_sockaddr_inet6((struct sockaddr_in6 *)sa, &mask, 0); info->rti_info[RTAX_NETMASK] = sa; info->rti_flags &= ~RTF_HOST; } else remove_netmask(info); /* Check gateway */ if (info->rti_info[RTAX_GATEWAY] != NULL) return (cleanup_xaddrs_gateway(info, lb)); return (0); } #endif static int cleanup_xaddrs(struct rt_addrinfo *info, struct linear_buffer *lb) { int error = EAFNOSUPPORT; if (info->rti_info[RTAX_DST] == NULL) { RTS_PID_LOG(LOG_DEBUG, "prefix dst is not set"); return (EINVAL); } if (info->rti_flags & RTF_LLDATA) { /* * arp(8)/ndp(8) sends RTA_NETMASK for the associated * prefix along with the actual address in RTA_DST. * Remove netmask to avoid unnecessary address masking. */ remove_netmask(info); } switch (info->rti_info[RTAX_DST]->sa_family) { #ifdef INET case AF_INET: error = cleanup_xaddrs_inet(info, lb); break; #endif #ifdef INET6 case AF_INET6: error = cleanup_xaddrs_inet6(info, lb); break; #endif } return (error); } /* * Fill in @dmask with valid netmask leaving original @smask * intact. Mostly used with radix netmasks. */ struct sockaddr * rtsock_fix_netmask(const struct sockaddr *dst, const struct sockaddr *smask, struct sockaddr_storage *dmask) { if (dst == NULL || smask == NULL) return (NULL); memset(dmask, 0, dst->sa_len); memcpy(dmask, smask, smask->sa_len); dmask->ss_len = dst->sa_len; dmask->ss_family = dst->sa_family; return ((struct sockaddr *)dmask); } /* * Writes information related to @rtinfo object to newly-allocated mbuf. * Assumes MCLBYTES is enough to construct any message. * Used for OS notifications of vaious events (if/ifa announces,etc) * * Returns allocated mbuf or NULL on failure. */ static struct mbuf * rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo) { struct sockaddr_storage ss; struct rt_msghdr *rtm; struct mbuf *m; int i; struct sockaddr *sa; #ifdef INET6 struct sockaddr_in6 *sin6; #endif int len, dlen; switch (type) { case RTM_DELADDR: case RTM_NEWADDR: len = sizeof(struct ifa_msghdr); break; case RTM_DELMADDR: case RTM_NEWMADDR: len = sizeof(struct ifma_msghdr); break; case RTM_IFINFO: len = sizeof(struct if_msghdr); break; case RTM_IFANNOUNCE: case RTM_IEEE80211: len = sizeof(struct if_announcemsghdr); break; default: len = sizeof(struct rt_msghdr); } /* XXXGL: can we use MJUMPAGESIZE cluster here? 
*/ KASSERT(len <= MCLBYTES, ("%s: message too big", __func__)); if (len > MHLEN) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (m); m->m_pkthdr.len = m->m_len = len; rtm = mtod(m, struct rt_msghdr *); bzero((caddr_t)rtm, len); for (i = 0; i < RTAX_MAX; i++) { if ((sa = rtinfo->rti_info[i]) == NULL) continue; rtinfo->rti_addrs |= (1 << i); dlen = SA_SIZE(sa); KASSERT(dlen <= sizeof(ss), ("%s: sockaddr size overflow", __func__)); bzero(&ss, sizeof(ss)); bcopy(sa, &ss, sa->sa_len); sa = (struct sockaddr *)&ss; #ifdef INET6 if (sa->sa_family == AF_INET6) { sin6 = (struct sockaddr_in6 *)sa; (void)sa6_recoverscope(sin6); } #endif m_copyback(m, len, dlen, (caddr_t)sa); len += dlen; } if (m->m_pkthdr.len != len) { m_freem(m); return (NULL); } rtm->rtm_msglen = len; rtm->rtm_version = RTM_VERSION; rtm->rtm_type = type; return (m); } /* * Writes information related to @rtinfo object to preallocated buffer. * Stores needed size in @plen. If @w is NULL, calculates size without * writing. * Used for sysctl dumps and rtsock answers (RTM_DEL/RTM_GET) generation. * * Returns 0 on success. * */ static int rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo, struct walkarg *w, int *plen) { struct sockaddr_storage ss; int len, buflen = 0, dlen, i; caddr_t cp = NULL; struct rt_msghdr *rtm = NULL; #ifdef INET6 struct sockaddr_in6 *sin6; #endif #ifdef COMPAT_FREEBSD32 bool compat32 = false; #endif switch (type) { case RTM_DELADDR: case RTM_NEWADDR: if (w != NULL && w->w_op == NET_RT_IFLISTL) { #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) { len = sizeof(struct ifa_msghdrl32); compat32 = true; } else #endif len = sizeof(struct ifa_msghdrl); } else len = sizeof(struct ifa_msghdr); break; case RTM_IFINFO: #ifdef COMPAT_FREEBSD32 if (w != NULL && w->w_req->flags & SCTL_MASK32) { if (w->w_op == NET_RT_IFLISTL) len = sizeof(struct if_msghdrl32); else len = sizeof(struct if_msghdr32); compat32 = true; break; } #endif if (w != NULL && w->w_op == NET_RT_IFLISTL) len = sizeof(struct if_msghdrl); else len = sizeof(struct if_msghdr); break; case RTM_NEWMADDR: len = sizeof(struct ifma_msghdr); break; default: len = sizeof(struct rt_msghdr); } if (w != NULL) { rtm = (struct rt_msghdr *)w->w_tmem; buflen = w->w_tmemsize - len; cp = (caddr_t)w->w_tmem + len; } rtinfo->rti_addrs = 0; for (i = 0; i < RTAX_MAX; i++) { struct sockaddr *sa; if ((sa = rtinfo->rti_info[i]) == NULL) continue; rtinfo->rti_addrs |= (1 << i); #ifdef COMPAT_FREEBSD32 if (compat32) dlen = SA_SIZE32(sa); else #endif dlen = SA_SIZE(sa); if (cp != NULL && buflen >= dlen) { KASSERT(dlen <= sizeof(ss), ("%s: sockaddr size overflow", __func__)); bzero(&ss, sizeof(ss)); bcopy(sa, &ss, sa->sa_len); sa = (struct sockaddr *)&ss; #ifdef INET6 if (sa->sa_family == AF_INET6) { sin6 = (struct sockaddr_in6 *)sa; (void)sa6_recoverscope(sin6); } #endif bcopy((caddr_t)sa, cp, (unsigned)dlen); cp += dlen; buflen -= dlen; } else if (cp != NULL) { /* * Buffer too small. Count needed size * and return with error. 
*/ cp = NULL; } len += dlen; } if (cp != NULL) { dlen = ALIGN(len) - len; if (buflen < dlen) cp = NULL; else { bzero(cp, dlen); cp += dlen; buflen -= dlen; } } len = ALIGN(len); if (cp != NULL) { /* fill header iff buffer is large enough */ rtm->rtm_version = RTM_VERSION; rtm->rtm_type = type; rtm->rtm_msglen = len; } *plen = len; if (w != NULL && cp == NULL) return (ENOBUFS); return (0); } /* * This routine is called to generate a message from the routing * socket indicating that a redirect has occurred, a routing lookup * has failed, or that a protocol has detected timeouts to a particular * destination. */ void rt_missmsg_fib(int type, struct rt_addrinfo *rtinfo, int flags, int error, int fibnum) { struct rt_msghdr *rtm; struct mbuf *m; struct sockaddr *sa = rtinfo->rti_info[RTAX_DST]; if (V_route_cb.any_count == 0) return; m = rtsock_msg_mbuf(type, rtinfo); if (m == NULL) return; if (fibnum != RT_ALL_FIBS) { KASSERT(fibnum >= 0 && fibnum < rt_numfibs, ("%s: fibnum out " "of range 0 <= %d < %d", __func__, fibnum, rt_numfibs)); M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; } rtm = mtod(m, struct rt_msghdr *); rtm->rtm_flags = RTF_DONE | flags; rtm->rtm_errno = error; rtm->rtm_addrs = rtinfo->rti_addrs; rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC); } void rt_missmsg(int type, struct rt_addrinfo *rtinfo, int flags, int error) { rt_missmsg_fib(type, rtinfo, flags, error, RT_ALL_FIBS); } /* * This routine is called to generate a message from the routing * socket indicating that the status of a network interface has changed. */ void rt_ifmsg(struct ifnet *ifp) { struct if_msghdr *ifm; struct mbuf *m; struct rt_addrinfo info; if (V_route_cb.any_count == 0) return; bzero((caddr_t)&info, sizeof(info)); m = rtsock_msg_mbuf(RTM_IFINFO, &info); if (m == NULL) return; ifm = mtod(m, struct if_msghdr *); ifm->ifm_index = ifp->if_index; ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; if_data_copy(ifp, &ifm->ifm_data); ifm->ifm_addrs = 0; rt_dispatch(m, AF_UNSPEC); } /* * Announce interface address arrival/withdraw. * Please do not call directly, use rt_addrmsg(). * Assume input data to be valid. * Returns 0 on success. */ int rtsock_addrmsg(int cmd, struct ifaddr *ifa, int fibnum) { struct rt_addrinfo info; struct sockaddr *sa; int ncmd; struct mbuf *m; struct ifa_msghdr *ifam; struct ifnet *ifp = ifa->ifa_ifp; struct sockaddr_storage ss; if (V_route_cb.any_count == 0) return (0); ncmd = cmd == RTM_ADD ? RTM_NEWADDR : RTM_DELADDR; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_IFA] = sa = ifa->ifa_addr; info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr; info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask( info.rti_info[RTAX_IFA], ifa->ifa_netmask, &ss); info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; if ((m = rtsock_msg_mbuf(ncmd, &info)) == NULL) return (ENOBUFS); ifam = mtod(m, struct ifa_msghdr *); ifam->ifam_index = ifp->if_index; ifam->ifam_metric = ifa->ifa_ifp->if_metric; ifam->ifam_flags = ifa->ifa_flags; ifam->ifam_addrs = info.rti_addrs; if (fibnum != RT_ALL_FIBS) { M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; } rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC); return (0); } /* * Announce route addition/removal to rtsock based on @rt data. * Callers are advives to use rt_routemsg() instead of using this * function directly. * Assume @rt data is consistent. * * Returns 0 on success. 
*/ int rtsock_routemsg(int cmd, struct rtentry *rt, struct nhop_object *nh, int fibnum) { union sockaddr_union dst, mask; struct rt_addrinfo info; if (V_route_cb.any_count == 0) return (0); int family = rt_get_family(rt); init_sockaddrs_family(family, &dst.sa, &mask.sa); export_rtaddrs(rt, &dst.sa, &mask.sa); bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = &dst.sa; info.rti_info[RTAX_NETMASK] = &mask.sa; info.rti_info[RTAX_GATEWAY] = &nh->gw_sa; info.rti_flags = rt->rte_flags | nhop_get_rtflags(nh); info.rti_ifp = nh->nh_ifp; return (rtsock_routemsg_info(cmd, &info, fibnum)); } int rtsock_routemsg_info(int cmd, struct rt_addrinfo *info, int fibnum) { struct rt_msghdr *rtm; struct sockaddr *sa; struct mbuf *m; if (V_route_cb.any_count == 0) return (0); if (info->rti_flags & RTF_HOST) info->rti_info[RTAX_NETMASK] = NULL; m = rtsock_msg_mbuf(cmd, info); if (m == NULL) return (ENOBUFS); if (fibnum != RT_ALL_FIBS) { KASSERT(fibnum >= 0 && fibnum < rt_numfibs, ("%s: fibnum out " "of range 0 <= %d < %d", __func__, fibnum, rt_numfibs)); M_SETFIB(m, fibnum); m->m_flags |= RTS_FILTER_FIB; } rtm = mtod(m, struct rt_msghdr *); rtm->rtm_addrs = info->rti_addrs; if (info->rti_ifp != NULL) rtm->rtm_index = info->rti_ifp->if_index; /* Add RTF_DONE to indicate command 'completion' required by API */ info->rti_flags |= RTF_DONE; /* Reported routes has to be up */ if (cmd == RTM_ADD || cmd == RTM_CHANGE) info->rti_flags |= RTF_UP; rtm->rtm_flags = info->rti_flags; sa = info->rti_info[RTAX_DST]; rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC); return (0); } /* * This is the analogue to the rt_newaddrmsg which performs the same * function but for multicast group memberhips. This is easier since * there is no route state to worry about. */ void rt_newmaddrmsg(int cmd, struct ifmultiaddr *ifma) { struct rt_addrinfo info; struct mbuf *m = NULL; struct ifnet *ifp = ifma->ifma_ifp; struct ifma_msghdr *ifmam; if (V_route_cb.any_count == 0) return; bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_IFA] = ifma->ifma_addr; if (ifp && ifp->if_addr) info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr; else info.rti_info[RTAX_IFP] = NULL; /* * If a link-layer address is present, present it as a ``gateway'' * (similarly to how ARP entries, e.g., are presented). */ info.rti_info[RTAX_GATEWAY] = ifma->ifma_lladdr; m = rtsock_msg_mbuf(cmd, &info); if (m == NULL) return; ifmam = mtod(m, struct ifma_msghdr *); KASSERT(ifp != NULL, ("%s: link-layer multicast address w/o ifp\n", __func__)); ifmam->ifmam_index = ifp->if_index; ifmam->ifmam_addrs = info.rti_addrs; rt_dispatch(m, ifma->ifma_addr ? ifma->ifma_addr->sa_family : AF_UNSPEC); } static struct mbuf * rt_makeifannouncemsg(struct ifnet *ifp, int type, int what, struct rt_addrinfo *info) { struct if_announcemsghdr *ifan; struct mbuf *m; if (V_route_cb.any_count == 0) return NULL; bzero((caddr_t)info, sizeof(*info)); m = rtsock_msg_mbuf(type, info); if (m != NULL) { ifan = mtod(m, struct if_announcemsghdr *); ifan->ifan_index = ifp->if_index; strlcpy(ifan->ifan_name, ifp->if_xname, sizeof(ifan->ifan_name)); ifan->ifan_what = what; } return m; } /* * This is called to generate routing socket messages indicating * IEEE80211 wireless events. * XXX we piggyback on the RTM_IFANNOUNCE msg format in a clumsy way. */ void rt_ieee80211msg(struct ifnet *ifp, int what, void *data, size_t data_len) { struct mbuf *m; struct rt_addrinfo info; m = rt_makeifannouncemsg(ifp, RTM_IEEE80211, what, &info); if (m != NULL) { /* * Append the ieee80211 data. 
Try to stick it in the * mbuf containing the ifannounce msg; otherwise allocate * a new mbuf and append. * * NB: we assume m is a single mbuf. */ if (data_len > M_TRAILINGSPACE(m)) { struct mbuf *n = m_get(M_NOWAIT, MT_DATA); if (n == NULL) { m_freem(m); return; } bcopy(data, mtod(n, void *), data_len); n->m_len = data_len; m->m_next = n; } else if (data_len > 0) { bcopy(data, mtod(m, u_int8_t *) + m->m_len, data_len); m->m_len += data_len; } if (m->m_flags & M_PKTHDR) m->m_pkthdr.len += data_len; mtod(m, struct if_announcemsghdr *)->ifan_msglen += data_len; rt_dispatch(m, AF_UNSPEC); } } /* * This is called to generate routing socket messages indicating * network interface arrival and departure. */ static void rt_ifannouncemsg(struct ifnet *ifp, int what) { struct mbuf *m; struct rt_addrinfo info; m = rt_makeifannouncemsg(ifp, RTM_IFANNOUNCE, what, &info); if (m != NULL) rt_dispatch(m, AF_UNSPEC); } static void rt_dispatch(struct mbuf *m, sa_family_t saf) { M_ASSERTPKTHDR(m); m->m_rtsock_family = saf; if (V_loif) m->m_pkthdr.rcvif = V_loif; else { m_freem(m); return; } netisr_queue(NETISR_ROUTE, m); /* mbuf is free'd on failure. */ } /* * Checks if rte can be exported w.r.t jails/vnets. * * Returns true if it can, false otherwise. */ static bool can_export_rte(struct ucred *td_ucred, bool rt_is_host, const struct sockaddr *rt_dst) { if ((!rt_is_host) ? jailed_without_vnet(td_ucred) : prison_if(td_ucred, rt_dst) != 0) return (false); return (true); } /* * This is used in dumping the kernel table via sysctl(). */ static int sysctl_dumpentry(struct rtentry *rt, void *vw) { struct walkarg *w = vw; struct nhop_object *nh; NET_EPOCH_ASSERT(); export_rtaddrs(rt, w->dst, w->mask); if (!can_export_rte(w->w_req->td->td_ucred, rt_is_host(rt), w->dst)) return (0); nh = rt_get_raw_nhop(rt); #ifdef ROUTE_MPATH if (NH_IS_NHGRP(nh)) { const struct weightened_nhop *wn; uint32_t num_nhops; int error; wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops); for (int i = 0; i < num_nhops; i++) { error = sysctl_dumpnhop(rt, wn[i].nh, wn[i].weight, w); if (error != 0) return (error); } } else #endif sysctl_dumpnhop(rt, nh, rt->rt_weight, w); return (0); } static int sysctl_dumpnhop(struct rtentry *rt, struct nhop_object *nh, uint32_t weight, struct walkarg *w) { struct rt_addrinfo info; int error = 0, size; uint32_t rtflags; rtflags = nhop_get_rtflags(nh); if (w->w_op == NET_RT_FLAGS && !(rtflags & w->w_arg)) return (0); bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = w->dst; info.rti_info[RTAX_GATEWAY] = &nh->gw_sa; info.rti_info[RTAX_NETMASK] = (rtflags & RTF_HOST) ? NULL : w->mask; info.rti_info[RTAX_GENMASK] = 0; if (nh->nh_ifp && !(nh->nh_ifp->if_flags & IFF_DYING)) { info.rti_info[RTAX_IFP] = nh->nh_ifp->if_addr->ifa_addr; info.rti_info[RTAX_IFA] = nh->nh_ifa->ifa_addr; if (nh->nh_ifp->if_flags & IFF_POINTOPOINT) info.rti_info[RTAX_BRD] = nh->nh_ifa->ifa_dstaddr; } if ((error = rtsock_msg_buffer(RTM_GET, &info, w, &size)) != 0) return (error); if (w->w_req && w->w_tmem) { struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem; bzero(&rtm->rtm_index, sizeof(*rtm) - offsetof(struct rt_msghdr, rtm_index)); /* * rte flags may consist of RTF_HOST (duplicated in nhop rtflags) * and RTF_UP (if entry is linked, which is always true here). * Given that, use nhop rtflags & add RTF_UP. 
*/ rtm->rtm_flags = rtflags | RTF_UP; if (rtm->rtm_flags & RTF_GWFLAG_COMPAT) rtm->rtm_flags = RTF_GATEWAY | (rtm->rtm_flags & ~RTF_GWFLAG_COMPAT); rt_getmetrics(rt, nh, &rtm->rtm_rmx); rtm->rtm_rmx.rmx_weight = weight; rtm->rtm_index = nh->nh_ifp->if_index; rtm->rtm_addrs = info.rti_addrs; error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size); return (error); } return (error); } static int sysctl_iflist_ifml(struct ifnet *ifp, const struct if_data *src_ifd, struct rt_addrinfo *info, struct walkarg *w, int len) { struct if_msghdrl *ifm; struct if_data *ifd; ifm = (struct if_msghdrl *)w->w_tmem; #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) { struct if_msghdrl32 *ifm32; ifm32 = (struct if_msghdrl32 *)ifm; ifm32->ifm_addrs = info->rti_addrs; ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm32->ifm_index = ifp->if_index; ifm32->_ifm_spare1 = 0; ifm32->ifm_len = sizeof(*ifm32); ifm32->ifm_data_off = offsetof(struct if_msghdrl32, ifm_data); ifm32->_ifm_spare2 = 0; ifd = &ifm32->ifm_data; } else #endif { ifm->ifm_addrs = info->rti_addrs; ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm->ifm_index = ifp->if_index; ifm->_ifm_spare1 = 0; ifm->ifm_len = sizeof(*ifm); ifm->ifm_data_off = offsetof(struct if_msghdrl, ifm_data); ifm->_ifm_spare2 = 0; ifd = &ifm->ifm_data; } memcpy(ifd, src_ifd, sizeof(*ifd)); return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len)); } static int sysctl_iflist_ifm(struct ifnet *ifp, const struct if_data *src_ifd, struct rt_addrinfo *info, struct walkarg *w, int len) { struct if_msghdr *ifm; struct if_data *ifd; ifm = (struct if_msghdr *)w->w_tmem; #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) { struct if_msghdr32 *ifm32; ifm32 = (struct if_msghdr32 *)ifm; ifm32->ifm_addrs = info->rti_addrs; ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm32->ifm_index = ifp->if_index; ifm32->_ifm_spare1 = 0; ifd = &ifm32->ifm_data; } else #endif { ifm->ifm_addrs = info->rti_addrs; ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags; ifm->ifm_index = ifp->if_index; ifm->_ifm_spare1 = 0; ifd = &ifm->ifm_data; } memcpy(ifd, src_ifd, sizeof(*ifd)); return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len)); } static int sysctl_iflist_ifaml(struct ifaddr *ifa, struct rt_addrinfo *info, struct walkarg *w, int len) { struct ifa_msghdrl *ifam; struct if_data *ifd; ifam = (struct ifa_msghdrl *)w->w_tmem; #ifdef COMPAT_FREEBSD32 if (w->w_req->flags & SCTL_MASK32) { struct ifa_msghdrl32 *ifam32; ifam32 = (struct ifa_msghdrl32 *)ifam; ifam32->ifam_addrs = info->rti_addrs; ifam32->ifam_flags = ifa->ifa_flags; ifam32->ifam_index = ifa->ifa_ifp->if_index; ifam32->_ifam_spare1 = 0; ifam32->ifam_len = sizeof(*ifam32); ifam32->ifam_data_off = offsetof(struct ifa_msghdrl32, ifam_data); ifam32->ifam_metric = ifa->ifa_ifp->if_metric; ifd = &ifam32->ifam_data; } else #endif { ifam->ifam_addrs = info->rti_addrs; ifam->ifam_flags = ifa->ifa_flags; ifam->ifam_index = ifa->ifa_ifp->if_index; ifam->_ifam_spare1 = 0; ifam->ifam_len = sizeof(*ifam); ifam->ifam_data_off = offsetof(struct ifa_msghdrl, ifam_data); ifam->ifam_metric = ifa->ifa_ifp->if_metric; ifd = &ifam->ifam_data; } bzero(ifd, sizeof(*ifd)); ifd->ifi_datalen = sizeof(struct if_data); ifd->ifi_ipackets = counter_u64_fetch(ifa->ifa_ipackets); ifd->ifi_opackets = counter_u64_fetch(ifa->ifa_opackets); ifd->ifi_ibytes = counter_u64_fetch(ifa->ifa_ibytes); ifd->ifi_obytes = counter_u64_fetch(ifa->ifa_obytes); /* Fixup if_data carp(4) vhid. 
*/ if (carp_get_vhid_p != NULL) ifd->ifi_vhid = (*carp_get_vhid_p)(ifa); return (SYSCTL_OUT(w->w_req, w->w_tmem, len)); } static int sysctl_iflist_ifam(struct ifaddr *ifa, struct rt_addrinfo *info, struct walkarg *w, int len) { struct ifa_msghdr *ifam; ifam = (struct ifa_msghdr *)w->w_tmem; ifam->ifam_addrs = info->rti_addrs; ifam->ifam_flags = ifa->ifa_flags; ifam->ifam_index = ifa->ifa_ifp->if_index; ifam->_ifam_spare1 = 0; ifam->ifam_metric = ifa->ifa_ifp->if_metric; return (SYSCTL_OUT(w->w_req, w->w_tmem, len)); } static int sysctl_iflist(int af, struct walkarg *w) { struct ifnet *ifp; struct ifaddr *ifa; struct if_data ifd; struct rt_addrinfo info; int len, error = 0; struct sockaddr_storage ss; bzero((caddr_t)&info, sizeof(info)); bzero(&ifd, sizeof(ifd)); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (w->w_arg && w->w_arg != ifp->if_index) continue; if_data_copy(ifp, &ifd); ifa = ifp->if_addr; info.rti_info[RTAX_IFP] = ifa->ifa_addr; error = rtsock_msg_buffer(RTM_IFINFO, &info, w, &len); if (error != 0) goto done; info.rti_info[RTAX_IFP] = NULL; if (w->w_req && w->w_tmem) { if (w->w_op == NET_RT_IFLISTL) error = sysctl_iflist_ifml(ifp, &ifd, &info, w, len); else error = sysctl_iflist_ifm(ifp, &ifd, &info, w, len); if (error) goto done; } while ((ifa = CK_STAILQ_NEXT(ifa, ifa_link)) != NULL) { if (af && af != ifa->ifa_addr->sa_family) continue; if (prison_if(w->w_req->td->td_ucred, ifa->ifa_addr) != 0) continue; info.rti_info[RTAX_IFA] = ifa->ifa_addr; info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask( ifa->ifa_addr, ifa->ifa_netmask, &ss); info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; error = rtsock_msg_buffer(RTM_NEWADDR, &info, w, &len); if (error != 0) goto done; if (w->w_req && w->w_tmem) { if (w->w_op == NET_RT_IFLISTL) error = sysctl_iflist_ifaml(ifa, &info, w, len); else error = sysctl_iflist_ifam(ifa, &info, w, len); if (error) goto done; } } info.rti_info[RTAX_IFA] = NULL; info.rti_info[RTAX_NETMASK] = NULL; info.rti_info[RTAX_BRD] = NULL; } done: return (error); } static int sysctl_ifmalist(int af, struct walkarg *w) { struct rt_addrinfo info; struct ifaddr *ifa; struct ifmultiaddr *ifma; struct ifnet *ifp; int error, len; NET_EPOCH_ASSERT(); error = 0; bzero((caddr_t)&info, sizeof(info)); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (w->w_arg && w->w_arg != ifp->if_index) continue; ifa = ifp->if_addr; info.rti_info[RTAX_IFP] = ifa ? ifa->ifa_addr : NULL; CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (af && af != ifma->ifma_addr->sa_family) continue; if (prison_if(w->w_req->td->td_ucred, ifma->ifma_addr) != 0) continue; info.rti_info[RTAX_IFA] = ifma->ifma_addr; info.rti_info[RTAX_GATEWAY] = (ifma->ifma_addr->sa_family != AF_LINK) ? 
ifma->ifma_lladdr : NULL; error = rtsock_msg_buffer(RTM_NEWMADDR, &info, w, &len); if (error != 0) break; if (w->w_req && w->w_tmem) { struct ifma_msghdr *ifmam; ifmam = (struct ifma_msghdr *)w->w_tmem; ifmam->ifmam_index = ifma->ifma_ifp->if_index; ifmam->ifmam_flags = 0; ifmam->ifmam_addrs = info.rti_addrs; ifmam->_ifmam_spare1 = 0; error = SYSCTL_OUT(w->w_req, w->w_tmem, len); if (error != 0) break; } } if (error != 0) break; } return (error); } static void rtable_sysctl_dump(uint32_t fibnum, int family, struct walkarg *w) { union sockaddr_union sa_dst, sa_mask; w->family = family; w->dst = (struct sockaddr *)&sa_dst; w->mask = (struct sockaddr *)&sa_mask; init_sockaddrs_family(family, w->dst, w->mask); rib_walk(fibnum, family, false, sysctl_dumpentry, w); } static int sysctl_rtsock(SYSCTL_HANDLER_ARGS) { struct epoch_tracker et; int *name = (int *)arg1; u_int namelen = arg2; struct rib_head *rnh = NULL; /* silence compiler. */ int i, lim, error = EINVAL; int fib = 0; u_char af; struct walkarg w; if (namelen < 3) return (EINVAL); name++; namelen--; if (req->newptr) return (EPERM); if (name[1] == NET_RT_DUMP || name[1] == NET_RT_NHOP || name[1] == NET_RT_NHGRP) { if (namelen == 3) fib = req->td->td_proc->p_fibnum; else if (namelen == 4) fib = (name[3] == RT_ALL_FIBS) ? req->td->td_proc->p_fibnum : name[3]; else return ((namelen < 3) ? EISDIR : ENOTDIR); if (fib < 0 || fib >= rt_numfibs) return (EINVAL); } else if (namelen != 3) return ((namelen < 3) ? EISDIR : ENOTDIR); af = name[0]; if (af > AF_MAX) return (EINVAL); bzero(&w, sizeof(w)); w.w_op = name[1]; w.w_arg = name[2]; w.w_req = req; error = sysctl_wire_old_buffer(req, 0); if (error) return (error); /* * Allocate reply buffer in advance. * All rtsock messages has maximum length of u_short. */ w.w_tmemsize = 65536; w.w_tmem = malloc(w.w_tmemsize, M_TEMP, M_WAITOK); NET_EPOCH_ENTER(et); switch (w.w_op) { case NET_RT_DUMP: case NET_RT_FLAGS: if (af == 0) { /* dump all tables */ i = 1; lim = AF_MAX; } else /* dump only one table */ i = lim = af; /* * take care of llinfo entries, the caller must * specify an AF */ if (w.w_op == NET_RT_FLAGS && (w.w_arg == 0 || w.w_arg & RTF_LLINFO)) { if (af != 0) error = lltable_sysctl_dumparp(af, w.w_req); else error = EINVAL; break; } /* * take care of routing entries */ for (error = 0; error == 0 && i <= lim; i++) { rnh = rt_tables_get_rnh(fib, i); if (rnh != NULL) { rtable_sysctl_dump(fib, i, &w); } else if (af != 0) error = EAFNOSUPPORT; } break; case NET_RT_NHOP: case NET_RT_NHGRP: /* Allow dumping one specific af/fib at a time */ if (namelen < 4) { error = EINVAL; break; } fib = name[3]; if (fib < 0 || fib > rt_numfibs) { error = EINVAL; break; } rnh = rt_tables_get_rnh(fib, af); if (rnh == NULL) { error = EAFNOSUPPORT; break; } if (w.w_op == NET_RT_NHOP) error = nhops_dump_sysctl(rnh, w.w_req); else #ifdef ROUTE_MPATH error = nhgrp_dump_sysctl(rnh, w.w_req); #else error = ENOTSUP; #endif break; case NET_RT_IFLIST: case NET_RT_IFLISTL: error = sysctl_iflist(af, &w); break; case NET_RT_IFMALIST: error = sysctl_ifmalist(af, &w); break; } NET_EPOCH_EXIT(et); free(w.w_tmem, M_TEMP); return (error); } static SYSCTL_NODE(_net, PF_ROUTE, routetable, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_rtsock, "Return route tables and interface/address lists"); /* * Definitions of protocols supported in the ROUTE domain. 
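* Editor's note (illustrative, not part of this change): userland attaches to this domain with a raw routing socket, for example socket(PF_ROUTE, SOCK_RAW, AF_UNSPEC), and exchanges the rt_msghdr-based messages produced by rts_send() and rt_dispatch() above.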
*/ static struct domain routedomain; /* or at least forward */ static struct protosw routesw = { .pr_type = SOCK_RAW, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_abort = rts_close, .pr_attach = rts_attach, .pr_detach = rts_detach, .pr_send = rts_send, .pr_shutdown = rts_shutdown, .pr_close = rts_close, }; static struct domain routedomain = { .dom_family = PF_ROUTE, .dom_name = "route", .dom_nprotosw = 1, .dom_protosw = { &routesw }, }; DOMAIN_SET(route); diff --git a/sys/netlink/netlink.h b/sys/netlink/netlink.h new file mode 100644 index 000000000000..6a68dcec1382 --- /dev/null +++ b/sys/netlink/netlink.h @@ -0,0 +1,257 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Ng Peng Nam Sean + * Copyright (c) 2022 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Copyright (C) The Internet Society (2003). All Rights Reserved. + * + * This document and translations of it may be copied and furnished to + * others, and derivative works that comment on or otherwise explain it + * or assist in its implementation may be prepared, copied, published + * and distributed, in whole or in part, without restriction of any + * kind, provided that the above copyright notice and this paragraph are + * included on all such copies and derivative works. However, this + * document itself may not be modified in any way, such as by removing + * the copyright notice or references to the Internet Society or other + * Internet organizations, except as needed for the purpose of + * developing Internet standards in which case the procedures for + * copyrights defined in the Internet Standards process must be + * followed, or as required to translate it into languages other than + * English. + * + * The limited permissions granted above are perpetual and will not be + * revoked by the Internet Society or its successors or assignees. + * + * This document and the information contained herein is provided on an + * "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING + * TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION + * HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. 
+ + */ + +/* + * This file contains structures and constants for RFC 3549 (Netlink) + * protocol. Some values have been taken from Linux implementation. + */ + +#ifndef _NETLINK_NETLINK_H_ +#define _NETLINK_NETLINK_H_ + +#include <sys/types.h> +#include <sys/socket.h> + +struct sockaddr_nl { + uint8_t nl_len; /* sizeof(sockaddr_nl) */ + sa_family_t nl_family; /* netlink family */ + uint16_t nl_pad; /* reserved, set to 0 */ + uint32_t nl_pid; /* desired port ID, 0 for auto-select */ + uint32_t nl_groups; /* multicast groups mask to bind to */ +}; + +#define SOL_NETLINK 270 + +/* Netlink socket options */ +#define NETLINK_ADD_MEMBERSHIP 1 /* Subscribe for the specified group notifications */ +#define NETLINK_DROP_MEMBERSHIP 2 /* Unsubscribe from the specified group */ +#define NETLINK_PKTINFO 3 /* XXX: not supported */ +#define NETLINK_BROADCAST_ERROR 4 /* XXX: not supported */ +#define NETLINK_NO_ENOBUFS 5 /* XXX: not supported */ +#define NETLINK_RX_RING 6 /* XXX: not supported */ +#define NETLINK_TX_RING 7 /* XXX: not supported */ +#define NETLINK_LISTEN_ALL_NSID 8 /* XXX: not supported */ + +#define NETLINK_LIST_MEMBERSHIPS 9 +#define NETLINK_CAP_ACK 10 /* Send only original message header in the reply */ +#define NETLINK_EXT_ACK 11 /* Ack support for receiving additional TLVs in ack */ +#define NETLINK_GET_STRICT_CHK 12 /* Strict header checking */ + + +/* + * RFC 3549, 2.3.2 Netlink Message Header + */ +struct nlmsghdr { + uint32_t nlmsg_len; /* Length of message including header */ + uint16_t nlmsg_type; /* Message type identifier */ + uint16_t nlmsg_flags; /* Flags (NLM_F_) */ + uint32_t nlmsg_seq; /* Sequence number */ + uint32_t nlmsg_pid; /* Sending process port ID */ +}; + +/* + * RFC 3549, 2.3.2 standard flag bits (nlmsg_flags) + */ +#define NLM_F_REQUEST 0x01 /* Indicates a request to the kernel */ +#define NLM_F_MULTI 0x02 /* Message is part of a group terminated by NLMSG_DONE msg */ +#define NLM_F_ACK 0x04 /* Reply with ack message containing resulting error code */ +#define NLM_F_ECHO 0x08 /* (not supported) Echo this request back */ +#define NLM_F_DUMP_INTR 0x10 /* Dump was inconsistent due to sequence change */ +#define NLM_F_DUMP_FILTERED 0x20 /* Dump was filtered as requested */ + +/* + * RFC 3549, 2.3.2 Additional flag bits for GET requests + */ +#define NLM_F_ROOT 0x100 /* Return the complete table */ +#define NLM_F_MATCH 0x200 /* Return all entries matching criteria */ +#define NLM_F_ATOMIC 0x400 /* Return an atomic snapshot (ignored) */ +#define NLM_F_DUMP (NLM_F_ROOT | NLM_F_MATCH) + +/* + * RFC 3549, 2.3.2 Additional flag bits for NEW requests + */ +#define NLM_F_REPLACE 0x100 /* Replace existing matching config object */ +#define NLM_F_EXCL 0x200 /* Don't replace the object if it exists */ +#define NLM_F_CREATE 0x400 /* Create if it does not exist */ +#define NLM_F_APPEND 0x800 /* Add to end of list */ + +/* Modifiers to DELETE requests */ +#define NLM_F_NONREC 0x100 /* Do not delete recursively */ + +/* Flags for ACK message */ +#define NLM_F_CAPPED 0x100 /* request was capped */ +#define NLM_F_ACK_TLVS 0x200 /* extended ACK TLVs were included */ + +/* + * RFC 3549, 2.3.2 standard message types (nlmsg_type). + */ +#define NLMSG_NOOP 0x1 /* Message is ignored. */ +#define NLMSG_ERROR 0x2 /* reply error code reporting */ +#define NLMSG_DONE 0x3 /* Message terminates a multipart message. */ +#define NLMSG_OVERRUN 0x4 /* overrun detected, data is lost */ + +#define NLMSG_MIN_TYPE 0x10 /* < 0x10: reserved control messages */ + +/* + * Definition of numbers assigned to the netlink subsystems.
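+ * Editor's note (illustrative, not part of the original header): a userland consumer selects one of these subsystems when creating its socket, for example socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE), and then exchanges nlmsghdr-framed requests and replies over that descriptor.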
+ */ +#define NETLINK_ROUTE 0 /* Routing/device hook */ +#define NETLINK_UNUSED 1 /* not supported */ +#define NETLINK_USERSOCK 2 /* not supported */ +#define NETLINK_FIREWALL 3 /* not supported */ +#define NETLINK_SOCK_DIAG 4 /* not supported */ +#define NETLINK_NFLOG 5 /* not supported */ +#define NETLINK_XFRM 6 /* (not supported) PF_SETKEY */ +#define NETLINK_SELINUX 7 /* not supported */ +#define NETLINK_ISCSI 8 /* not supported */ +#define NETLINK_AUDIT 9 /* not supported */ +#define NETLINK_FIB_LOOKUP 10 /* not supported */ +#define NETLINK_CONNECTOR 11 /* not supported */ +#define NETLINK_NETFILTER 12 /* not supported */ +#define NETLINK_IP6_FW 13 /* not supported */ +#define NETLINK_DNRTMSG 14 /* not supported */ +#define NETLINK_KOBJECT_UEVENT 15 /* not supported */ +#define NETLINK_GENERIC 16 /* Generic netlink (dynamic families) */ + +/* + * RFC 3549, 2.3.2.2 The ACK Netlink Message + */ +struct nlmsgerr { + int error; + struct nlmsghdr msg; +}; + +enum nlmsgerr_attrs { + NLMSGERR_ATTR_UNUSED, + NLMSGERR_ATTR_MSG = 1, /* string, error message */ + NLMSGERR_ATTR_OFFS = 2, /* u32, offset of the invalid attr from nl header */ + NLMSGERR_ATTR_COOKIE = 3, /* binary, data to pass to userland */ + NLMSGERR_ATTR_POLICY = 4, /* not supported */ + __NLMSGERR_ATTR_MAX, + NLMSGERR_ATTR_MAX = __NLMSGERR_ATTR_MAX - 1 +}; + + +#ifndef roundup2 +#define roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) /* if y is a power of two */ +#endif +#define NL_ITEM_ALIGN_SIZE sizeof(uint32_t) +#define NL_ITEM_ALIGN(_len) roundup2(_len, NL_ITEM_ALIGN_SIZE) +#define NL_ITEM_DATA(_ptr, _off) ((void *)((char *)(_ptr) + _off)) +#define NL_ITEM_DATA_CONST(_ptr, _off) ((const void *)((const char *)(_ptr) + _off)) + +#define NL_ITEM_OK(_ptr, _len, _hlen, _LEN_M) \ + ((_len) >= _hlen && _LEN_M(_ptr) >= _hlen && _LEN_M(_ptr) <= (_len)) +#define NL_ITEM_NEXT(_ptr, _LEN_M) ((typeof(_ptr))((char *)(_ptr) + _LEN_M(_ptr))) +#define NL_ITEM_ITER(_ptr, _len, _LEN_MACRO) \ + ((_len) -= _LEN_MACRO(_ptr), NL_ITEM_NEXT(_ptr, _LEN_MACRO)) + + +#ifndef _KERNEL +/* part of netlink(3) API */ +#define NLMSG_ALIGNTO NL_ITEM_ALIGN_SIZE +#define NLMSG_ALIGN(_len) NL_ITEM_ALIGN(_len) +#define NLMSG_HDRLEN ((int)sizeof(struct nlmsghdr)) +#define NLMSG_LENGTH(_len) ((_len) + NLMSG_HDRLEN) +#define NLMSG_SPACE(_len) NLMSG_ALIGN(NLMSG_LENGTH(_len)) +#define NLMSG_DATA(_hdr) NL_ITEM_DATA(_hdr, NLMSG_HDRLEN) +#define _NLMSG_LEN(_hdr) ((int)(_hdr)->nlmsg_len) +#define _NLMSG_ALIGNED_LEN(_hdr) NLMSG_ALIGN(_NLMSG_LEN(_hdr)) +#define NLMSG_OK(_hdr, _len) NL_ITEM_OK(_hdr, _len, NLMSG_HDRLEN, _NLMSG_LEN) +#define NLMSG_PAYLOAD(_hdr,_len) (_NLMSG_LEN(_hdr) - NLMSG_SPACE((_len))) +#define NLMSG_NEXT(_hdr, _len) NL_ITEM_ITER(_hdr, _len, _NLMSG_ALIGNED_LEN) + +#else +#define NLMSG_ALIGNTO 4U +#define NLMSG_ALIGN(len) (((len) + NLMSG_ALIGNTO - 1) & ~(NLMSG_ALIGNTO - 1)) +#define NLMSG_HDRLEN ((int)NLMSG_ALIGN(sizeof(struct nlmsghdr))) +#endif + +/* + * Base netlink attribute TLV header. + */ +struct nlattr { + uint16_t nla_len; /* Total attribute length */ + uint16_t nla_type; /* Attribute type */ +}; + +/* + * + * nla_type field encoding: + * + * 0 1 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |N|O| Attribute type | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * N - attribute contains other attributes (mostly unused) + * O - encoded in network byte order (mostly unused) + * Note: N & O are mutually exclusive + * + * Note: attribute type value scope normally is either parent attribute + * or the message/message group.
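+ * Editor's note (illustrative, not part of the original header): consumers are expected to recover the plain attribute type as (nla->nla_type & NLA_TYPE_MASK) and to test the N/O bits separately, e.g. (nla->nla_type & NLA_F_NESTED) != 0, using the masks defined immediately below.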
+ */ + +#define NLA_F_NESTED (1 << 15) +#define NLA_F_NET_BYTEORDER (1 << 14) +#define NLA_TYPE_MASK ~(NLA_F_NESTED | NLA_F_NET_BYTEORDER) + +#ifndef _KERNEL +#define NLA_ALIGNTO NL_ITEM_ALIGN_SIZE +#define NLA_ALIGN(_len) NL_ITEM_ALIGN(_len) +#define NLA_HDRLEN ((int)sizeof(struct nlattr)) +#endif + +#endif diff --git a/sys/netlink/netlink_ctl.h b/sys/netlink/netlink_ctl.h new file mode 100644 index 000000000000..fb5a8b30e0aa --- /dev/null +++ b/sys/netlink/netlink_ctl.h @@ -0,0 +1,102 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
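Illustrative sketch (not part of this change): walking a buffer of replies in userland with the NLMSG_ and NLA_ helpers from netlink.h above. The size of the family-specific header that follows each nlmsghdr (hdrlen) is caller-supplied, and whether a reply carries attributes at all depends on the family; both are assumptions here.

#include <stdio.h>
/* Assumes the netlink.h definitions above are in scope. */

static void
nl_dump_replies(char *buf, int len, int hdrlen)
{
	struct nlmsghdr *hdr = (struct nlmsghdr *)buf;

	for (; NLMSG_OK(hdr, len); hdr = NLMSG_NEXT(hdr, len)) {
		if (hdr->nlmsg_type == NLMSG_DONE)
			break;			/* end of an NLM_F_MULTI dump */
		if (hdr->nlmsg_type == NLMSG_ERROR) {
			struct nlmsgerr *err = NLMSG_DATA(hdr);
			printf("kernel reported error %d\n", err->error);
			break;
		}
		/* Walk the TLV attributes that follow the family header. */
		char *pos = (char *)hdr + NLMSG_HDRLEN + NLMSG_ALIGN(hdrlen);
		char *end = (char *)hdr + hdr->nlmsg_len;
		while (pos + NLA_HDRLEN <= end) {
			struct nlattr *nla = (struct nlattr *)pos;
			if (nla->nla_len < NLA_HDRLEN)
				break;		/* malformed attribute, stop */
			printf(" attr type %d len %d\n",
			    nla->nla_type & NLA_TYPE_MASK, (int)nla->nla_len);
			pos += NLA_ALIGN(nla->nla_len);
		}
	}
}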
+ */ + +#ifndef _NETLINK_NETLINK_CTL_H_ +#define _NETLINK_NETLINK_CTL_H_ + +/* + * This file provides headers for the public KPI of the netlink + * subsystem + */ + +MALLOC_DECLARE(M_NETLINK); + +/* + * Macro for handling attribute TLVs + */ +#define _roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) + +#define NETLINK_ALIGN_SIZE sizeof(uint32_t) +#define NETLINK_ALIGN(_len) _roundup2(_len, NETLINK_ALIGN_SIZE) + +#define NLA_ALIGN_SIZE sizeof(uint32_t) +#define NLA_ALIGN(_len) _roundup2(_len, NLA_ALIGN_SIZE) +#define NLA_HDRLEN ((int)sizeof(struct nlattr)) +#define NLA_DATA_LEN(_nla) ((int)((_nla)->nla_len - NLA_HDRLEN)) +#define NLA_DATA(_nla) NL_ITEM_DATA(_nla, NLA_HDRLEN) +#define NLA_DATA_CONST(_nla) NL_ITEM_DATA_CONST(_nla, NLA_HDRLEN) +#define NLA_TYPE(_nla) ((_nla)->nla_type & 0x3FFF) + +#ifndef typeof +#define typeof __typeof +#endif + +#define NLA_NEXT(_attr) (struct nlattr *)((char *)_attr + NLA_ALIGN(_attr->nla_len)) +#define _NLA_END(_start, _len) ((char *)(_start) + (_len)) +#define NLA_FOREACH(_attr, _start, _len) \ + for (typeof(_attr) _end = (typeof(_attr))_NLA_END(_start, _len), _attr = (_start); \ + ((char *)_attr < (char *)_end) && \ + ((char *)NLA_NEXT(_attr) <= (char *)_end); \ + _attr = (_len -= NLA_ALIGN(_attr->nla_len), NLA_NEXT(_attr))) + +#define NL_ARRAY_LEN(_a) (sizeof(_a) / sizeof((_a)[0])) + +#include +#include + + +/* Protocol handlers */ +struct nl_pstate; +typedef int (*nl_handler_f)(struct nlmsghdr *hdr, struct nl_pstate *npt); + +bool netlink_register_proto(int proto, const char *proto_name, nl_handler_f handler); +bool netlink_unregister_proto(int proto); + +/* Common helpers */ +bool nl_has_listeners(int netlink_family, uint32_t groups_mask); +bool nlp_has_priv(struct nlpcb *nlp, int priv); + +/* netlink_generic.c */ +struct genl_cmd { + const char *cmd_name; + nl_handler_f cmd_cb; + uint32_t cmd_flags; + uint32_t cmd_priv; + uint32_t cmd_num; +}; + +uint32_t genl_register_family(const char *family_name, size_t hdrsize, + int family_version, int max_attr_idx); +bool genl_unregister_family(const char *family_name); +bool genl_register_cmds(const char *family_name, const struct genl_cmd *cmds, + int count); +uint32_t genl_register_group(const char *family_name, const char *group_name); + +/* Debug */ +uint32_t nlp_get_pid(const struct nlpcb *nlp); + +#endif diff --git a/sys/netlink/netlink_debug.h b/sys/netlink/netlink_debug.h new file mode 100644 index 000000000000..6ff6811c6a5a --- /dev/null +++ b/sys/netlink/netlink_debug.h @@ -0,0 +1,82 @@ +/*- + * Copyright (c) 2022 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
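Illustrative sketch (not part of this change): how a kernel consumer could use the generic netlink KPI declared in netlink_ctl.h above to register a family, one command and a notification group. The family name, command number and handler are hypothetical placeholders, the GENL_CMD_CAP_* flags come from netlink_generic.h further down, and the usual netlink kernel headers are assumed to be included.

static int
example_cmd_cb(struct nlmsghdr *hdr, struct nl_pstate *npt)
{
	/*
	 * A real handler would parse attributes (nl_parse_nlmsg()) and
	 * write a reply through npt->nw; returning 0 yields a plain ack.
	 */
	return (0);
}

static const struct genl_cmd example_cmds[] = {
	{
		.cmd_num	= 1,		/* hypothetical command id */
		.cmd_name	= "EXAMPLE_GET",
		.cmd_cb		= example_cmd_cb,
		.cmd_flags	= GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP,
	},
};

static void
example_genl_register(void)
{
	uint32_t family_id;

	family_id = genl_register_family("example", 0, 1, 4);
	if (family_id == 0)
		return;		/* name already registered or table full */
	genl_register_cmds("example", example_cmds, NL_ARRAY_LEN(example_cmds));
	genl_register_group("example", "notify");
}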
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NETLINK_NETLINK_DEBUG_H_ +#define _NETLINK_NETLINK_DEBUG_H_ + +#define _DEBUG_SYSCTL_OID _net_netlink_debug +#include + +SYSCTL_DECL(_net_netlink_debug); + +/* + * Generic debug + * [nl_domain] func_name: debug text + */ +#define NL_LOG RT_LOG + +/* + * Logging for events specific for particular process + * Example: [nl_domain] PID 4834 fdump_sa: unsupported family: 45 + */ +#define NL_RAW_PID_LOG(_l, _pid, _fmt, ...) NL_RAW_PID_LOG_##_l(_l, _pid, _fmt, ## __VA_ARGS__) +#define _NL_RAW_PID_LOG(_l, _pid, _fmt, ...) if (_DEBUG_PASS_MSG(_l)) { \ + _output("[" DEBUG_PREFIX_NAME "] PID %u %s: " _fmt "\n", _pid, __func__, ##__VA_ARGS__); \ +} + +#define NLP_LOG(_l, _nlp, _fmt, ...) NL_RAW_PID_LOG_##_l(_l, nlp_get_pid(_nlp), _fmt, ## __VA_ARGS__) + +#if DEBUG_MAX_LEVEL>=LOG_DEBUG3 +#define NL_RAW_PID_LOG_LOG_DEBUG3 _NL_RAW_PID_LOG +#else +#define NL_RAW_PID_LOG_LOG_DEBUG3(_l, _pid, _fmt, ...) +#endif +#if DEBUG_MAX_LEVEL>=LOG_DEBUG2 +#define NL_RAW_PID_LOG_LOG_DEBUG2 _NL_RAW_PID_LOG +#else +#define NL_RAW_PID_LOG_LOG_DEBUG2(_l, _pid, _fmt, ...) +#endif +#if DEBUG_MAX_LEVEL>=LOG_DEBUG +#define NL_RAW_PID_LOG_LOG_DEBUG _NL_RAW_PID_LOG +#else +#define NL_RAW_PID_LOG_LOG_DEBUG(_l, _pid, _fmt, ...) +#endif +#if DEBUG_MAX_LEVEL>=LOG_INFO +#define NL_RAW_PID_LOG_LOG_INFO _NL_RAW_PID_LOG +#else +#define NL_RAW_PID_LOG_LOG_INFO(_l, _pid, _fmt, ...) +#endif +#define NL_RAW_PID_LOG_LOG_NOTICE _NL_RAW_PID_LOG +#define NL_RAW_PID_LOG_LOG_ERR _NL_RAW_PID_LOG +#define NL_RAW_PID_LOG_LOG_WARNING _NL_RAW_PID_LOG + + + +#endif diff --git a/sys/netlink/netlink_domain.c b/sys/netlink/netlink_domain.c new file mode 100644 index 000000000000..159dfd03724d --- /dev/null +++ b/sys/netlink/netlink_domain.c @@ -0,0 +1,689 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Ng Peng Nam Sean + * Copyright (c) 2022 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
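Illustrative sketch (not part of this change): the consumer pattern for this debug header, mirroring what the netlink .c files below do. A file picks its module name and compile-time verbosity before the include and declares the run-time default with _DECLARE_DEBUG(); NLP_LOG() calls above DEBUG_MAX_LEVEL then expand to nothing. The include path and module name are assumptions.

#define DEBUG_MOD_NAME	nl_example		/* hypothetical module name */
#define DEBUG_MAX_LEVEL	LOG_DEBUG2
#include <netlink/netlink_debug.h>		/* assumed install path */
_DECLARE_DEBUG(LOG_INFO);			/* run-time default level */

static void
example_log(struct nlpcb *nlp)
{
	NL_LOG(LOG_DEBUG, "printed when the run-time level allows it");
	NLP_LOG(LOG_DEBUG2, nlp, "compiled in, still subject to the run-time level");
	NLP_LOG(LOG_DEBUG3, nlp, "compiled out: above DEBUG_MAX_LEVEL");
}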
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * This file contains socket and protocol bindings for netlink. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* priv_check */ + +#include +#include +#include + +#define DEBUG_MOD_NAME nl_domain +#define DEBUG_MAX_LEVEL LOG_DEBUG3 +#include +_DECLARE_DEBUG(LOG_DEBUG); + + +#define NLCTL_TRACKER struct rm_priotracker nl_tracker +#define NLCTL_RLOCK(_ctl) rm_rlock(&((_ctl)->ctl_lock), &nl_tracker) +#define NLCTL_RUNLOCK(_ctl) rm_runlock(&((_ctl)->ctl_lock), &nl_tracker) + +#define NLCTL_WLOCK(_ctl) rm_wlock(&((_ctl)->ctl_lock)) +#define NLCTL_WUNLOCK(_ctl) rm_wunlock(&((_ctl)->ctl_lock)) + +static u_long nl_sendspace = NLSNDQ; +SYSCTL_ULONG(_net_netlink, OID_AUTO, sendspace, CTLFLAG_RW, &nl_sendspace, 0, + "Default netlink socket send space"); + +static u_long nl_recvspace = NLSNDQ; +SYSCTL_ULONG(_net_netlink, OID_AUTO, recvspace, CTLFLAG_RW, &nl_recvspace, 0, + "Default netlink socket receive space"); + +extern u_long sb_max_adj; +static u_long nl_maxsockbuf = 512 * 1024 * 1024; /* 512M, XXX: init based on physmem */ + +uint32_t +nlp_get_pid(const struct nlpcb *nlp) +{ + return (nlp->nl_process_id); +} + +/* + * Looks up a nlpcb struct based on the @portid. Need to claim nlsock_mtx. + * Returns nlpcb pointer if present else NULL + */ +static struct nlpcb * +nl_port_lookup(uint32_t port_id) +{ + struct nlpcb *nlp; + + CK_LIST_FOREACH(nlp, &V_nl_ctl->ctl_port_head, nl_port_next) { + if (nlp->nl_port == port_id) + return (nlp); + } + return (NULL); +} + +static void +nl_update_groups_locked(struct nlpcb *nlp, uint64_t nl_groups) +{ + /* Update group mask */ + NL_LOG(LOG_DEBUG2, "socket %p, groups 0x%lX -> 0x%lX", + nlp->nl_socket, nlp->nl_groups, nl_groups); + nlp->nl_groups = nl_groups; +} + +/* + * Broadcasts message @m to the protocol @proto group specified by @group_id + */ +void +nl_send_group(struct mbuf *m, int num_messages, int proto, int group_id) +{ + struct nlpcb *nlp_last = NULL; + struct nlpcb *nlp; + NLCTL_TRACKER; + + IF_DEBUG_LEVEL(LOG_DEBUG2) { + struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *); + NL_LOG(LOG_DEBUG2, "MCAST mbuf len %u msg type %d len %u to group %d/%d", + m->m_len, hdr->nlmsg_type, hdr->nlmsg_len, proto, group_id); + } + + struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); + if (__predict_false(ctl == NULL)) { + /* + * Can be the case when notification is sent within VNET + * which doesn't have any netlink sockets. 
+ */ + m_freem(m); + return; + } + + NLCTL_RLOCK(ctl); + + int io_flags = NL_IOF_UNTRANSLATED; + uint64_t groups_mask = 1 << ((uint64_t)group_id - 1); + + CK_LIST_FOREACH(nlp, &ctl->ctl_pcb_head, nl_next) { + if (nlp->nl_groups & groups_mask && nlp->nl_proto == proto) { + if (nlp_last != NULL) { + struct mbuf *m_copy; + m_copy = m_copym(m, 0, M_COPYALL, M_NOWAIT); + if (m_copy != NULL) + nl_send_one(m_copy, nlp_last, num_messages, io_flags); + else { + NLP_LOCK(nlp_last); + if (nlp_last->nl_socket != NULL) + sorwakeup(nlp_last->nl_socket); + NLP_UNLOCK(nlp_last); + } + } + nlp_last = nlp; + } + } + if (nlp_last != NULL) + nl_send_one(m, nlp_last, num_messages, io_flags); + else + m_freem(m); + + NLCTL_RUNLOCK(ctl); +} + +bool +nl_has_listeners(int netlink_family, uint32_t groups_mask) +{ + return (V_nl_ctl != NULL); +} + +bool +nlp_has_priv(struct nlpcb *nlp, int priv) +{ + return (priv_check_cred(nlp->nl_cred, priv) == 0); +} + +static uint32_t +nl_find_port() { + /* + * app can open multiple netlink sockets. + * Start with current pid, if already taken, + * try random numbers in 65k..256k+65k space, + * avoiding clash with pids. + */ + if (nl_port_lookup(curproc->p_pid) == NULL) + return (curproc->p_pid); + for (int i = 0; i < 16; i++) { + uint32_t nl_port = (arc4random() % 65536) + 65536 * 4; + if (nl_port_lookup(nl_port) == 0) + return (nl_port); + NL_LOG(LOG_DEBUG3, "tried %u\n", nl_port); + } + return (curproc->p_pid); +} + +static int +nl_bind_locked(struct nlpcb *nlp, struct sockaddr_nl *snl) +{ + if (nlp->nl_bound) { + if (nlp->nl_port != snl->nl_pid) { + NL_LOG(LOG_DEBUG, + "bind() failed: program pid %d " + "is different from provided pid %d", + nlp->nl_port, snl->nl_pid); + return (EINVAL); // XXX: better error + } + } else { + if (snl->nl_pid == 0) + snl->nl_pid = nl_find_port(); + if (nl_port_lookup(snl->nl_pid) != NULL) + return (EADDRINUSE); + nlp->nl_port = snl->nl_pid; + nlp->nl_bound = true; + CK_LIST_INSERT_HEAD(&V_nl_ctl->ctl_port_head, nlp, nl_port_next); + } + nl_update_groups_locked(nlp, snl->nl_groups); + + return (0); +} + +static int +nl_pru_attach(struct socket *so, int proto, struct thread *td) +{ + struct nlpcb *nlp; + int error; + + if (__predict_false(netlink_unloading != 0)) + return (EAFNOSUPPORT); + + error = nl_verify_proto(proto); + if (error != 0) + return (error); + + bool is_linux = SV_PROC_ABI(td->td_proc) == SV_ABI_LINUX; + NL_LOG(LOG_DEBUG2, "socket %p, %sPID %d: attaching socket to %s", + so, is_linux ? 
"(linux) " : "", curproc->p_pid, + nl_get_proto_name(proto)); + + /* Create per-VNET state on first socket init */ + struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); + if (ctl == NULL) + ctl = vnet_nl_ctl_init(); + KASSERT(V_nl_ctl != NULL, ("nl_attach: vnet_sock_init() failed")); + + MPASS(sotonlpcb(so) == NULL); + + nlp = malloc(sizeof(struct nlpcb), M_PCB, M_WAITOK | M_ZERO); + error = soreserve(so, nl_sendspace, nl_recvspace); + if (error != 0) { + free(nlp, M_PCB); + return (error); + } + so->so_pcb = nlp; + nlp->nl_socket = so; + /* Copy so_cred to avoid having socket_var.h in every header */ + nlp->nl_cred = so->so_cred; + nlp->nl_proto = proto; + nlp->nl_process_id = curproc->p_pid; + nlp->nl_linux = is_linux; + nlp->nl_active = true; + NLP_LOCK_INIT(nlp); + refcount_init(&nlp->nl_refcount, 1); + nl_init_io(nlp); + + nlp->nl_taskqueue = taskqueue_create("netlink_socket", M_WAITOK, + taskqueue_thread_enqueue, &nlp->nl_taskqueue); + TASK_INIT(&nlp->nl_task, 0, nl_taskqueue_handler, nlp); + taskqueue_start_threads(&nlp->nl_taskqueue, 1, PWAIT, + "netlink_socket (PID %u)", nlp->nl_process_id); + + NLCTL_WLOCK(ctl); + /* XXX: check ctl is still alive */ + CK_LIST_INSERT_HEAD(&ctl->ctl_pcb_head, nlp, nl_next); + NLCTL_WUNLOCK(ctl); + + soisconnected(so); + + return (0); +} + +static void +nl_pru_abort(struct socket *so) +{ + NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); + MPASS(sotonlpcb(so) != NULL); + soisdisconnected(so); +} + +static int +nl_pru_bind(struct socket *so, struct sockaddr *sa, struct thread *td) +{ + struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); + struct nlpcb *nlp = sotonlpcb(so); + struct sockaddr_nl *snl = (struct sockaddr_nl *)sa; + int error; + + NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); + if (snl->nl_len != sizeof(*snl)) { + NL_LOG(LOG_DEBUG, "socket %p, wrong sizeof(), ignoring bind()", so); + return (EINVAL); + } + + + NLCTL_WLOCK(ctl); + NLP_LOCK(nlp); + error = nl_bind_locked(nlp, snl); + NLP_UNLOCK(nlp); + NLCTL_WUNLOCK(ctl); + NL_LOG(LOG_DEBUG2, "socket %p, bind() to %u, groups %u, error %d", so, + snl->nl_pid, snl->nl_groups, error); + + return (error); +} + + +static int +nl_assign_port(struct nlpcb *nlp, uint32_t port_id) +{ + struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); + struct sockaddr_nl snl = { + .nl_pid = port_id, + }; + int error; + + NLCTL_WLOCK(ctl); + NLP_LOCK(nlp); + snl.nl_groups = nlp->nl_groups; + error = nl_bind_locked(nlp, &snl); + NLP_UNLOCK(nlp); + NLCTL_WUNLOCK(ctl); + + NL_LOG(LOG_DEBUG3, "socket %p, port assign: %d, error: %d", nlp->nl_socket, port_id, error); + return (error); +} + +/* + * nl_autobind_port binds a unused portid to @nlp + * @nlp: pcb data for the netlink socket + * @candidate_id: first id to consider + */ +static int +nl_autobind_port(struct nlpcb *nlp, uint32_t candidate_id) +{ + struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); + uint32_t port_id = candidate_id; + NLCTL_TRACKER; + bool exist; + int error; + + for (int i = 0; i < 10; i++) { + NL_LOG(LOG_DEBUG3, "socket %p, trying to assign port %d", nlp->nl_socket, port_id); + NLCTL_RLOCK(ctl); + exist = nl_port_lookup(port_id) != 0; + NLCTL_RUNLOCK(ctl); + if (!exist) { + error = nl_assign_port(nlp, port_id); + if (error != EADDRINUSE) + break; + } + port_id++; + } + NL_LOG(LOG_DEBUG3, "socket %p, autobind to %d, error: %d", nlp->nl_socket, port_id, error); + return (error); +} + +static int +nl_pru_connect(struct socket *so, struct sockaddr *sa, struct thread *td) +{ + struct sockaddr_nl *snl = (struct 
sockaddr_nl *)sa; + struct nlpcb *nlp; + + NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); + if (snl->nl_len != sizeof(*snl)) { + NL_LOG(LOG_DEBUG, "socket %p, wrong sizeof(), ignoring bind()", so); + return (EINVAL); + } + + nlp = sotonlpcb(so); + if (!nlp->nl_bound) { + int error = nl_autobind_port(nlp, td->td_proc->p_pid); + if (error != 0) { + NL_LOG(LOG_DEBUG, "socket %p, nl_autobind() failed: %d", so, error); + return (error); + } + } + /* XXX: Handle socket flags & multicast */ + soisconnected(so); + + NL_LOG(LOG_DEBUG2, "socket %p, connect to %u", so, snl->nl_pid); + + return (0); +} + +static void +destroy_nlpcb(struct nlpcb *nlp) +{ + NLP_LOCK(nlp); + nl_free_io(nlp); + NLP_LOCK_DESTROY(nlp); + free(nlp, M_PCB); +} + +static void +destroy_nlpcb_epoch(epoch_context_t ctx) +{ + struct nlpcb *nlp; + + nlp = __containerof(ctx, struct nlpcb, nl_epoch_ctx); + + destroy_nlpcb(nlp); +} + + +static void +nl_pru_detach(struct socket *so) +{ + struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); + MPASS(sotonlpcb(so) != NULL); + struct nlpcb *nlp; + + NL_LOG(LOG_DEBUG2, "detaching socket %p, PID %d", so, curproc->p_pid); + nlp = sotonlpcb(so); + + /* Mark as inactive so no new work can be enqueued */ + NLP_LOCK(nlp); + bool was_bound = nlp->nl_bound; + nlp->nl_active = false; + NLP_UNLOCK(nlp); + + /* Wait till all scheduled work has been completed */ + taskqueue_drain_all(nlp->nl_taskqueue); + taskqueue_free(nlp->nl_taskqueue); + + NLCTL_WLOCK(ctl); + NLP_LOCK(nlp); + if (was_bound) { + CK_LIST_REMOVE(nlp, nl_port_next); + NL_LOG(LOG_DEBUG3, "socket %p, unlinking bound pid %u", so, nlp->nl_port); + } + CK_LIST_REMOVE(nlp, nl_next); + nlp->nl_socket = NULL; + NLP_UNLOCK(nlp); + NLCTL_WUNLOCK(ctl); + + so->so_pcb = NULL; + + NL_LOG(LOG_DEBUG3, "socket %p, detached", so); + + /* XXX: is delayed free needed? */ + epoch_call(net_epoch_preempt, destroy_nlpcb_epoch, &nlp->nl_epoch_ctx); +} + +static int +nl_pru_disconnect(struct socket *so) +{ + NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); + MPASS(sotonlpcb(so) != NULL); + return (ENOTCONN); +} + +static int +nl_pru_peeraddr(struct socket *so, struct sockaddr **sa) +{ + NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); + MPASS(sotonlpcb(so) != NULL); + return (ENOTCONN); +} + +static int +nl_pru_shutdown(struct socket *so) +{ + NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); + MPASS(sotonlpcb(so) != NULL); + socantsendmore(so); + return (0); +} + +static int +nl_pru_sockaddr(struct socket *so, struct sockaddr **sa) +{ + struct sockaddr_nl *snl; + + snl = malloc(sizeof(struct sockaddr_nl), M_SONAME, M_WAITOK | M_ZERO); + /* TODO: set other fields */ + snl->nl_len = sizeof(struct sockaddr_nl); + snl->nl_family = AF_NETLINK; + snl->nl_pid = sotonlpcb(so)->nl_port; + *sa = (struct sockaddr *)snl; + return (0); +} + +static void +nl_pru_close(struct socket *so) +{ + NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); + MPASS(sotonlpcb(so) != NULL); + soisdisconnected(so); +} + +static int +nl_pru_output(struct mbuf *m, struct socket *so, ...) 
+{ + + if (__predict_false(m == NULL || + ((m->m_len < sizeof(struct nlmsghdr)) && + (m = m_pullup(m, sizeof(struct nlmsghdr))) == NULL))) + return (ENOBUFS); + MPASS((m->m_flags & M_PKTHDR) != 0); + + NL_LOG(LOG_DEBUG3, "sending message to kernel async processing"); + nl_receive_async(m, so); + return (0); +} + + +static int +nl_pru_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *sa, + struct mbuf *control, struct thread *td) +{ + NL_LOG(LOG_DEBUG2, "sending message to kernel"); + + if (__predict_false(control != NULL)) { + if (control->m_len) { + m_freem(control); + return (EINVAL); + } + m_freem(control); + } + + return (nl_pru_output(m, so)); +} + +static int +nl_pru_rcvd(struct socket *so, int flags) +{ + NL_LOG(LOG_DEBUG3, "socket %p, PID %d", so, curproc->p_pid); + MPASS(sotonlpcb(so) != NULL); + + nl_on_transmit(sotonlpcb(so)); + + return (0); +} + +static int +nl_getoptflag(int sopt_name) +{ + switch (sopt_name) { + case NETLINK_CAP_ACK: + return (NLF_CAP_ACK); + case NETLINK_EXT_ACK: + return (NLF_EXT_ACK); + case NETLINK_GET_STRICT_CHK: + return (NLF_STRICT); + } + + return (0); +} + +static int +nl_ctloutput(struct socket *so, struct sockopt *sopt) +{ + struct nl_control *ctl = atomic_load_ptr(&V_nl_ctl); + struct nlpcb *nlp = sotonlpcb(so); + uint32_t flag; + uint64_t groups, group_mask; + int optval, error = 0; + NLCTL_TRACKER; + + NL_LOG(LOG_DEBUG2, "%ssockopt(%p, %d)", (sopt->sopt_dir) ? "set" : "get", + so, sopt->sopt_name); + + switch (sopt->sopt_dir) { + case SOPT_SET: + switch (sopt->sopt_name) { + case NETLINK_ADD_MEMBERSHIP: + case NETLINK_DROP_MEMBERSHIP: + sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); + if (optval <= 0 || optval >= 64) { + error = ERANGE; + break; + } + group_mask = (uint64_t)1 << (optval - 1); + NL_LOG(LOG_DEBUG2, "ADD/DEL group %d mask (%lX)", optval, group_mask); + + NLCTL_WLOCK(ctl); + if (sopt->sopt_name == NETLINK_ADD_MEMBERSHIP) + groups = nlp->nl_groups | group_mask; + else + groups = nlp->nl_groups & ~group_mask; + nl_update_groups_locked(nlp, groups); + NLCTL_WUNLOCK(ctl); + break; + case NETLINK_CAP_ACK: + case NETLINK_EXT_ACK: + case NETLINK_GET_STRICT_CHK: + sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); + + flag = nl_getoptflag(sopt->sopt_name); + + NLCTL_WLOCK(ctl); + if (optval != 0) + nlp->nl_flags |= flag; + else + nlp->nl_flags &= ~flag; + NLCTL_WUNLOCK(ctl); + break; + default: + error = ENOPROTOOPT; + } + break; + case SOPT_GET: + switch (sopt->sopt_name) { + case NETLINK_LIST_MEMBERSHIPS: + NLCTL_RLOCK(ctl); + optval = nlp->nl_groups; + NLCTL_RUNLOCK(ctl); + error = sooptcopyout(sopt, &optval, sizeof(optval)); + break; + case NETLINK_CAP_ACK: + case NETLINK_EXT_ACK: + case NETLINK_GET_STRICT_CHK: + NLCTL_RLOCK(ctl); + optval = (nlp->nl_flags & nl_getoptflag(sopt->sopt_name)) != 0; + NLCTL_RUNLOCK(ctl); + error = sooptcopyout(sopt, &optval, sizeof(optval)); + break; + default: + error = ENOPROTOOPT; + } + break; + default: + error = ENOPROTOOPT; + } + + return (error); +} + +static int +nl_setsbopt(struct socket *so, struct sockopt *sopt) +{ + int error, optval; + bool result; + + if (sopt->sopt_name != SO_RCVBUF) + return (sbsetopt(so, sopt)); + + /* Allow to override max buffer size in certain conditions */ + + error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); + if (error != 0) + return (error); + NL_LOG(LOG_DEBUG2, "socket %p, PID %d, SO_RCVBUF=%d", so, curproc->p_pid, optval); + if (optval > sb_max_adj) { + if (priv_check(curthread, PRIV_NET_ROUTE) != 0) + return 
(EPERM); + } + + SOCK_RECVBUF_LOCK(so); + result = sbreserve_locked_limit(so, SO_RCV, optval, nl_maxsockbuf, curthread); + SOCK_RECVBUF_UNLOCK(so); + + return (result ? 0 : ENOBUFS); +} + +static struct protosw netlinksw = { + .pr_type = SOCK_RAW, + .pr_flags = PR_ATOMIC | PR_ADDR | PR_WANTRCVD, + .pr_ctloutput = nl_ctloutput, + .pr_setsbopt = nl_setsbopt, + .pr_abort = nl_pru_abort, + .pr_attach = nl_pru_attach, + .pr_bind = nl_pru_bind, + .pr_connect = nl_pru_connect, + .pr_detach = nl_pru_detach, + .pr_disconnect = nl_pru_disconnect, + .pr_peeraddr = nl_pru_peeraddr, + .pr_send = nl_pru_send, + .pr_rcvd = nl_pru_rcvd, + .pr_shutdown = nl_pru_shutdown, + .pr_sockaddr = nl_pru_sockaddr, + .pr_close = nl_pru_close +}; + +static struct domain netlinkdomain = { + .dom_family = PF_NETLINK, + .dom_name = "netlink", + .dom_flags = DOMF_UNLOADABLE, + .dom_nprotosw = 1, + .dom_protosw = { &netlinksw }, +}; + +DOMAIN_SET(netlink); diff --git a/sys/netlink/netlink_generic.c b/sys/netlink/netlink_generic.c new file mode 100644 index 000000000000..d422416cd9b4 --- /dev/null +++ b/sys/netlink/netlink_generic.c @@ -0,0 +1,472 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
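Illustrative sketch (not part of this change): the socket options implemented by nl_ctloutput() above, as used from userland: joining a multicast group, enabling capped acks and reading back the membership mask. The group number is a placeholder (the handler accepts 1..63) and the header install path is an assumption.

#include <sys/socket.h>
#include <netlink/netlink.h>	/* assumed install path */

static int
nl_setup_opts(int fd)
{
	int group = 10;		/* placeholder multicast group id (1..63) */
	int on = 1;
	int groups;
	socklen_t len = sizeof(groups);

	if (setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP,
	    &group, sizeof(group)) == -1)
		return (-1);
	if (setsockopt(fd, SOL_NETLINK, NETLINK_CAP_ACK, &on, sizeof(on)) == -1)
		return (-1);
	if (getsockopt(fd, SOL_NETLINK, NETLINK_LIST_MEMBERSHIPS,
	    &groups, &len) == -1)
		return (-1);
	return (groups);	/* bitmask of currently joined groups */
}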
+ */ + +#include +__FBSDID("$FreeBSD$"); +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define DEBUG_MOD_NAME nl_generic +#define DEBUG_MAX_LEVEL LOG_DEBUG3 +#include +_DECLARE_DEBUG(LOG_DEBUG3); + +#define MAX_FAMILIES 20 +#define MAX_GROUPS 20 + +#define MIN_GROUP_NUM 48 + +static struct sx sx_lock; + +#define GENL_LOCK_INIT() sx_init(&sx_lock, "genetlink lock") +#define GENL_LOCK_DESTROY() sx_destroy(&sx_lock) +#define GENL_LOCK() sx_xlock(&sx_lock) +#define GENL_UNLOCK() sx_xunlock(&sx_lock) + +struct genl_family { + const char *family_name; + uint16_t family_hdrsize; + uint16_t family_id; + uint16_t family_version; + uint16_t family_attr_max; + uint16_t family_cmd_size; + uint16_t family_num_groups; + struct genl_cmd *family_cmds; +}; + +static struct genl_family families[MAX_FAMILIES]; + + +struct genl_group { + struct genl_family *group_family; + const char *group_name; +}; +static struct genl_group groups[MAX_GROUPS]; + + +static int dump_family(struct nlmsghdr *hdr, struct genlmsghdr *ghdr, + const struct genl_family *gf, struct nl_writer *nw); +static void nlctrl_notify(const struct genl_family *gf, int action); + +static struct genl_family * +find_family(const char *family_name) +{ + for (int i = 0; i < MAX_FAMILIES; i++) { + struct genl_family *gf = &families[i]; + if (gf->family_name != NULL && !strcmp(gf->family_name, family_name)) + return (gf); + } + + return (NULL); +} + +uint32_t +genl_register_family(const char *family_name, size_t hdrsize, int family_version, + int max_attr_idx) +{ + uint32_t family_id = 0; + + MPASS(family_name != NULL); + if (find_family(family_name) != NULL) + return (0); + + GENL_LOCK(); + for (int i = 0; i < MAX_FAMILIES; i++) { + struct genl_family *gf = &families[i]; + if (gf->family_name == NULL) { + gf->family_name = family_name; + gf->family_version = family_version; + gf->family_hdrsize = hdrsize; + gf->family_attr_max = max_attr_idx; + gf->family_id = i + GENL_MIN_ID; + NL_LOG(LOG_DEBUG2, "Registered family %s id %d", + gf->family_name, gf->family_id); + family_id = gf->family_id; + nlctrl_notify(gf, CTRL_CMD_NEWFAMILY); + break; + } + } + GENL_UNLOCK(); + + return (family_id); +} + +static void +free_family(struct genl_family *gf) +{ + if (gf->family_cmds != NULL) + free(gf->family_cmds, M_NETLINK); +} + +/* + * Can sleep, I guess + */ +bool +genl_unregister_family(const char *family_name) +{ + bool found = false; + + GENL_LOCK(); + struct genl_family *gf = find_family(family_name); + + nlctrl_notify(gf, CTRL_CMD_DELFAMILY); + + if (gf != NULL) { + found = true; + /* TODO: zero pointer first */ + free_family(gf); + bzero(gf, sizeof(*gf)); + } + GENL_UNLOCK(); + + return (found); +} + +bool +genl_register_cmds(const char *family_name, const struct genl_cmd *cmds, int count) +{ + GENL_LOCK(); + struct genl_family *gf = find_family(family_name); + if (gf == NULL) { + GENL_UNLOCK(); + return (false); + } + + int cmd_size = gf->family_cmd_size; + + for (int i = 0; i < count; i++) { + MPASS(cmds[i].cmd_cb != NULL); + if (cmds[i].cmd_num >= cmd_size) + cmd_size = cmds[i].cmd_num + 1; + } + + if (cmd_size > gf->family_cmd_size) { + /* need to realloc */ + size_t sz = cmd_size * sizeof(struct genl_cmd); + void *data = malloc(sz, M_NETLINK, M_WAITOK | M_ZERO); + + memcpy(data, gf->family_cmds, gf->family_cmd_size * sizeof(struct genl_cmd)); + void *old_data = gf->family_cmds; + gf->family_cmds = data; + gf->family_cmd_size = cmd_size; + free(old_data, M_NETLINK); + } + + for (int i = 0; i < count; i++) 
{ + const struct genl_cmd *cmd = &cmds[i]; + MPASS(gf->family_cmds[cmd->cmd_num].cmd_cb == NULL); + gf->family_cmds[cmd->cmd_num] = cmds[i]; + NL_LOG(LOG_DEBUG2, "Adding cmd %s(%d) to family %s", + cmd->cmd_name, cmd->cmd_num, gf->family_name); + } + GENL_UNLOCK(); + return (true); +} + +static struct genl_group * +find_group(const struct genl_family *gf, const char *group_name) +{ + for (int i = 0; i < MAX_GROUPS; i++) { + struct genl_group *gg = &groups[i]; + if (gg->group_family == gf && !strcmp(gg->group_name, group_name)) + return (gg); + } + return (NULL); +} + +uint32_t +genl_register_group(const char *family_name, const char *group_name) +{ + uint32_t group_id = 0; + + MPASS(family_name != NULL); + MPASS(group_name != NULL); + + GENL_LOCK(); + struct genl_family *gf = find_family(family_name); + + if (gf == NULL || find_group(gf, group_name) != NULL) { + GENL_UNLOCK(); + return (0); + } + + for (int i = 0; i < MAX_GROUPS; i++) { + struct genl_group *gg = &groups[i]; + if (gg->group_family == NULL) { + gf->family_num_groups++; + gg->group_family = gf; + gg->group_name = group_name; + group_id = i + MIN_GROUP_NUM; + break; + } + } + GENL_UNLOCK(); + + return (group_id); +} + +/* + * Handler called by netlink subsystem when matching netlink message is received + */ +static int +genl_handle_message(struct nlmsghdr *hdr, struct nl_pstate *npt) +{ + struct nlpcb *nlp = npt->nlp; + int error = 0; + + int family_id = (int)hdr->nlmsg_type - GENL_MIN_ID; + + if (__predict_false(family_id < 0 || family_id > MAX_FAMILIES)) { + NLP_LOG(LOG_DEBUG, nlp, "invalid message type: %d", hdr->nlmsg_type); + return (ENOTSUP); + } + + if (__predict_false(hdr->nlmsg_len < sizeof(hdr) + GENL_HDRLEN)) { + NLP_LOG(LOG_DEBUG, nlp, "invalid message size: %d", hdr->nlmsg_len); + return (EINVAL); + } + + struct genl_family *gf = &families[family_id]; + + struct genlmsghdr *ghdr = (struct genlmsghdr *)(hdr + 1); + + if (ghdr->cmd >= gf->family_cmd_size || gf->family_cmds[ghdr->cmd].cmd_cb == NULL) { + NLP_LOG(LOG_DEBUG, nlp, "family %s: invalid cmd %d", + gf->family_name, ghdr->cmd); + return (ENOTSUP); + } + + struct genl_cmd *cmd = &gf->family_cmds[ghdr->cmd]; + + if (cmd->cmd_priv != 0 && !nlp_has_priv(nlp, cmd->cmd_priv)) { + NLP_LOG(LOG_DEBUG, nlp, "family %s: cmd %d priv_check() failed", + gf->family_name, ghdr->cmd); + return (EPERM); + } + + NLP_LOG(LOG_DEBUG2, nlp, "received family %s cmd %s(%d) len %d", + gf->family_name, cmd->cmd_name, ghdr->cmd, hdr->nlmsg_len); + + error = cmd->cmd_cb(hdr, npt); + + return (error); +} + +static uint32_t +get_cmd_flags(const struct genl_cmd *cmd) +{ + uint32_t flags = cmd->cmd_flags; + if (cmd->cmd_priv != 0) + flags |= GENL_ADMIN_PERM; + return (flags); +} + +static int +dump_family(struct nlmsghdr *hdr, struct genlmsghdr *ghdr, + const struct genl_family *gf, struct nl_writer *nw) +{ + if (!nlmsg_reply(nw, hdr, sizeof(struct genlmsghdr))) + goto enomem; + + struct genlmsghdr *ghdr_new = nlmsg_reserve_object(nw, struct genlmsghdr); + ghdr_new->cmd = ghdr->cmd; + ghdr_new->version = gf->family_version; + ghdr_new->reserved = 0; + + nlattr_add_string(nw, CTRL_ATTR_FAMILY_NAME, gf->family_name); + nlattr_add_u16(nw, CTRL_ATTR_FAMILY_ID, gf->family_id); + nlattr_add_u32(nw, CTRL_ATTR_VERSION, gf->family_version); + nlattr_add_u32(nw, CTRL_ATTR_HDRSIZE, gf->family_hdrsize); + nlattr_add_u32(nw, CTRL_ATTR_MAXATTR, gf->family_attr_max); + + if (gf->family_cmd_size > 0) { + int off = nlattr_add_nested(nw, CTRL_ATTR_OPS); + if (off == 0) + goto enomem; + for (int i = 0, cnt=0; 
i < gf->family_cmd_size; i++) { + struct genl_cmd *cmd = &gf->family_cmds[i]; + if (cmd->cmd_cb == NULL) + continue; + int cmd_off = nlattr_add_nested(nw, ++cnt); + if (cmd_off == 0) + goto enomem; + + nlattr_add_u32(nw, CTRL_ATTR_OP_ID, cmd->cmd_num); + nlattr_add_u32(nw, CTRL_ATTR_OP_FLAGS, get_cmd_flags(cmd)); + nlattr_set_len(nw, cmd_off); + } + nlattr_set_len(nw, off); + } + if (gf->family_num_groups > 0) { + int off = nlattr_add_nested(nw, CTRL_ATTR_MCAST_GROUPS); + if (off == 0) + goto enomem; + for (int i = 0, cnt = 0; i < MAX_GROUPS; i++) { + struct genl_group *gg = &groups[i]; + if (gg->group_family != gf) + continue; + + int cmd_off = nlattr_add_nested(nw, ++cnt); + if (cmd_off == 0) + goto enomem; + nlattr_add_u32(nw, CTRL_ATTR_MCAST_GRP_ID, i + MIN_GROUP_NUM); + nlattr_add_string(nw, CTRL_ATTR_MCAST_GRP_NAME, gg->group_name); + nlattr_set_len(nw, cmd_off); + } + nlattr_set_len(nw, off); + } + if (nlmsg_end(nw)) + return (0); +enomem: + NL_LOG(LOG_DEBUG, "unable to dump family %s state (ENOMEM)", gf->family_name); + nlmsg_abort(nw); + return (ENOMEM); +} + + +/* Declare ourself as a user */ +#define CTRL_FAMILY_NAME "nlctrl" + +static uint32_t ctrl_family_id; +static uint32_t ctrl_group_id; + +struct nl_parsed_family { + uint32_t family_id; + char *family_name; + uint8_t version; +}; + +#define _IN(_field) offsetof(struct genlmsghdr, _field) +#define _OUT(_field) offsetof(struct nl_parsed_family, _field) +static const struct nlfield_parser nlf_p_generic[] = { + { .off_in = _IN(version), .off_out = _OUT(version), .cb = nlf_get_u8 }, +}; + +static struct nlattr_parser nla_p_generic[] = { + { .type = CTRL_ATTR_FAMILY_ID , .off = _OUT(family_id), .cb = nlattr_get_uint32 }, + { .type = CTRL_ATTR_FAMILY_NAME , .off = _OUT(family_id), .cb = nlattr_get_string }, +}; +#undef _IN +#undef _OUT +NL_DECLARE_PARSER(genl_parser, struct genlmsghdr, nlf_p_generic, nla_p_generic); + +static int +nlctrl_handle_getfamily(struct nlmsghdr *hdr, struct nl_pstate *npt) +{ + int error = 0; + + struct nl_parsed_family attrs = {}; + error = nl_parse_nlmsg(hdr, &genl_parser, npt, &attrs); + if (error != 0) + return (error); + + struct genlmsghdr ghdr = { + .cmd = CTRL_CMD_NEWFAMILY, + }; + + for (int i = 0; i < MAX_FAMILIES; i++) { + struct genl_family *gf = &families[i]; + if (gf->family_name == NULL) + continue; + if (attrs.family_id != 0 && attrs.family_id != gf->family_id) + continue; + if (attrs.family_name != NULL && strcmp(attrs.family_name, gf->family_name)) + continue; + error = dump_family(hdr, &ghdr, &families[i], npt->nw); + if (error != 0) + break; + } + + return (error); +} + +static void +nlctrl_notify(const struct genl_family *gf, int cmd) +{ + struct nlmsghdr hdr = {.nlmsg_type = NETLINK_GENERIC }; + struct genlmsghdr ghdr = { .cmd = cmd }; + struct nl_writer nw = {}; + + if (nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_GENERIC, ctrl_group_id)) { + dump_family(&hdr, &ghdr, gf, &nw); + nlmsg_flush(&nw); + return; + } + NL_LOG(LOG_DEBUG, "error allocating group writer"); +} + +static const struct genl_cmd nlctrl_cmds[] = { + { + .cmd_num = CTRL_CMD_GETFAMILY, + .cmd_name = "GETFAMILY", + .cmd_cb = nlctrl_handle_getfamily, + .cmd_flags = GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP, GENL_CMD_CAP_HASPOL, + }, +}; + +static void +genl_nlctrl_init() +{ + ctrl_family_id = genl_register_family(CTRL_FAMILY_NAME, 0, 2, CTRL_ATTR_MAX); + genl_register_cmds(CTRL_FAMILY_NAME, nlctrl_cmds, NL_ARRAY_LEN(nlctrl_cmds)); + ctrl_group_id = genl_register_group(CTRL_FAMILY_NAME, "notify"); +} + +static void 
+genl_nlctrl_destroy() +{ + genl_unregister_family(CTRL_FAMILY_NAME); +} + +static const struct nlhdr_parser *all_parsers[] = { &genl_parser }; + +static void +genl_load(void *u __unused) +{ + GENL_LOCK_INIT(); + NL_VERIFY_PARSERS(all_parsers); + netlink_register_proto(NETLINK_GENERIC, "NETLINK_GENERIC", genl_handle_message); + genl_nlctrl_init(); +} +SYSINIT(genl_load, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, genl_load, NULL); + +static void +genl_unload(void *u __unused) +{ + genl_nlctrl_destroy(); + GENL_LOCK_DESTROY(); + epoch_wait_preempt(net_epoch_preempt); +} +SYSUNINIT(genl_unload, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, genl_unload, NULL); diff --git a/sys/netlink/netlink_generic.h b/sys/netlink/netlink_generic.h new file mode 100644 index 000000000000..9b411a67ab2a --- /dev/null +++ b/sys/netlink/netlink_generic.h @@ -0,0 +1,112 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +/* + * Generic netlink message header and attributes + */ +#ifndef _NETLINK_NETLINK_GENERIC_H_ +#define _NETLINK_NETLINK_GENERIC_H_ + +/* Base header for all of the relevant messages */ +struct genlmsghdr { + uint8_t cmd; /* CTRL_CMD_ */ + uint8_t version; /* ABI version for the cmd */ + uint16_t reserved; /* reserved: set to 0 */ +}; +#define GENL_HDRLEN NL_ITEM_ALIGN(sizeof(struct genlmsghdr)) + +/* Dynamic family number range, inclusive */ +#define GENL_MIN_ID NLMSG_MIN_TYPE +#define GENL_MAX_ID 1023 + +/* Pre-defined family numbers */ +#define GENL_ID_CTRL GENL_MIN_ID + +/* Available commands */ +enum { + CTRL_CMD_UNSPEC = 0, + CTRL_CMD_NEWFAMILY = 1, + CTRL_CMD_DELFAMILY = 2, + CTRL_CMD_GETFAMILY = 3, /* lists all (or matching) genetlink families */ + CTRL_CMD_NEWOPS = 4, + CTRL_CMD_DELOPS = 5, + CTRL_CMD_GETOPS = 6, + CTRL_CMD_NEWMCAST_GRP = 7, + CTRL_CMD_DELMCAST_GRP = 8, + CTRL_CMD_GETMCAST_GRP = 9, + CTRL_CMD_GETPOLICY = 10, + __CTRL_CMD_MAX, +}; +#define CTRL_CMD_MAX (__CTRL_CMD_MAX - 1) + +/* Generic attributes */ +enum { + CTRL_ATTR_UNSPEC, + CTRL_ATTR_FAMILY_ID = 1, /* u16, dynamically-assigned ID */ + CTRL_ATTR_FAMILY_NAME = 2, /* string, family name */ + CTRL_ATTR_VERSION = 3, /* u32, command version */ + CTRL_ATTR_HDRSIZE = 4, /* u32, family header size */ + CTRL_ATTR_MAXATTR = 5, /* u32, maximum family attr # */ + CTRL_ATTR_OPS = 6, /* nested, available operations */ + CTRL_ATTR_MCAST_GROUPS = 7, + CTRL_ATTR_POLICY = 8, + CTRL_ATTR_OP_POLICY = 9, + CTRL_ATTR_OP = 10, + __CTRL_ATTR_MAX, +}; +#define CTRL_ATTR_MAX (__CTRL_ATTR_MAX - 1) + +#define GENL_NAMSIZ 16 /* max family name length including \0 */ + +/* CTRL_ATTR_OPS attributes */ +enum { + CTRL_ATTR_OP_UNSPEC, + CTRL_ATTR_OP_ID = 1, /* u32, operation # */ + CTRL_ATTR_OP_FLAGS = 2, /* u32, flags-based op description */ + __CTRL_ATTR_OP_MAX, +}; +#define CTRL_ATTR_OP_MAX (__CTRL_ATTR_OP_MAX - 1) + +/* CTRL_ATTR_OP_FLAGS values */ +#define GENL_ADMIN_PERM 0x0001 /* Requires elevated permissions */ +#define GENL_CMD_CAP_DO 0x0002 /* Operation is a modification request */ +#define GENL_CMD_CAP_DUMP 0x0004 /* Operation is a get/dump request */ +#define GENL_CMD_CAP_HASPOL 0x0008 /* Operation has a validation policy */ +#define GENL_UNS_ADMIN_PERM 0x0010 + +/* CTRL_ATTR_MCAST_GROUPS attributes */ +enum { + CTRL_ATTR_MCAST_GRP_UNSPEC, + CTRL_ATTR_MCAST_GRP_NAME, /* string, group name */ + CTRL_ATTR_MCAST_GRP_ID, /* u32, dynamically-assigned group id */ + __CTRL_ATTR_MCAST_GRP_MAX, +}; +#define CTRL_ATTR_MCAST_GRP_MAX (CTRL_ATTR_MCAST_GRP_MAX - 1) + + +#endif + diff --git a/sys/netlink/netlink_io.c b/sys/netlink/netlink_io.c new file mode 100644 index 000000000000..ef1c2c73a10e --- /dev/null +++ b/sys/netlink/netlink_io.c @@ -0,0 +1,528 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Ng Peng Nam Sean + * Copyright (c) 2022 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
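Illustrative sketch (not part of this change): building the CTRL_CMD_GETFAMILY request served by nlctrl, using the nlmsghdr + genlmsghdr + attribute TLV layout defined above. The caller-provided buffer is assumed to be suitably aligned, and the netlink.h/netlink_generic.h definitions are assumed to be in scope.

#include <string.h>

static int
genl_build_getfamily(char *buf, size_t bufsz, const char *family)
{
	size_t namelen = strlen(family) + 1;
	size_t total = NLMSG_HDRLEN + GENL_HDRLEN +
	    NLA_ALIGN(NLA_HDRLEN + namelen);
	struct nlmsghdr *hdr = (struct nlmsghdr *)buf;
	struct genlmsghdr *ghdr;
	struct nlattr *nla;

	if (bufsz < total)
		return (-1);
	memset(buf, 0, total);

	hdr->nlmsg_type = GENL_ID_CTRL;
	hdr->nlmsg_flags = NLM_F_REQUEST;
	hdr->nlmsg_len = total;

	ghdr = (struct genlmsghdr *)(buf + NLMSG_HDRLEN);
	ghdr->cmd = CTRL_CMD_GETFAMILY;

	nla = (struct nlattr *)(buf + NLMSG_HDRLEN + GENL_HDRLEN);
	nla->nla_type = CTRL_ATTR_FAMILY_NAME;
	nla->nla_len = NLA_HDRLEN + namelen;
	memcpy((char *)nla + NLA_HDRLEN, family, namelen);

	return ((int)total);	/* number of bytes to send(2) on the socket */
}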
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define DEBUG_MOD_NAME nl_io +#define DEBUG_MAX_LEVEL LOG_DEBUG3 +#include +_DECLARE_DEBUG(LOG_DEBUG); + +/* + * The logic below provide a p2p interface for receiving and + * sending netlink data between the kernel and userland. + */ + +static const struct sockaddr_nl _nl_empty_src = { + .nl_len = sizeof(struct sockaddr_nl), + .nl_family = PF_NETLINK, + .nl_pid = 0 /* comes from the kernel */ +}; +static const struct sockaddr *nl_empty_src = (const struct sockaddr *)&_nl_empty_src; + +static struct mbuf *nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp); + + +static void +queue_push(struct nl_io_queue *q, struct mbuf *mq) +{ + while (mq != NULL) { + struct mbuf *m = mq; + mq = mq->m_nextpkt; + m->m_nextpkt = NULL; + + q->length += m_length(m, NULL); + STAILQ_INSERT_TAIL(&q->head, m, m_stailqpkt); + } +} + +static void +queue_push_head(struct nl_io_queue *q, struct mbuf *m) +{ + MPASS(m->m_nextpkt == NULL); + + q->length += m_length(m, NULL); + STAILQ_INSERT_HEAD(&q->head, m, m_stailqpkt); +} + +static struct mbuf * +queue_pop(struct nl_io_queue *q) +{ + if (!STAILQ_EMPTY(&q->head)) { + struct mbuf *m = STAILQ_FIRST(&q->head); + STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt); + m->m_nextpkt = NULL; + q->length -= m_length(m, NULL); + + return (m); + } + return (NULL); +} + +static struct mbuf * +queue_head(const struct nl_io_queue *q) +{ + return (STAILQ_FIRST(&q->head)); +} + +static inline bool +queue_empty(const struct nl_io_queue *q) +{ + return (q->length == 0); +} + +static void +queue_free(struct nl_io_queue *q) +{ + while (!STAILQ_EMPTY(&q->head)) { + struct mbuf *m = STAILQ_FIRST(&q->head); + STAILQ_REMOVE_HEAD(&q->head, m_stailqpkt); + m->m_nextpkt = NULL; + m_freem(m); + } + q->length = 0; +} + + +static void +nl_schedule_taskqueue(struct nlpcb *nlp) +{ + if (!nlp->nl_task_pending) { + nlp->nl_task_pending = true; + taskqueue_enqueue(nlp->nl_taskqueue, &nlp->nl_task); + NL_LOG(LOG_DEBUG3, "taskqueue scheduled"); + } else { + NL_LOG(LOG_DEBUG3, "taskqueue schedule skipped"); + } +} + +int +nl_receive_async(struct mbuf *m, struct socket *so) +{ + struct nlpcb *nlp = sotonlpcb(so); + int error = 0; + + m->m_nextpkt = NULL; + + NLP_LOCK(nlp); + + if ((__predict_true(nlp->nl_active))) { + sbappend(&so->so_snd, m, 0); + NL_LOG(LOG_DEBUG3, "enqueue %u bytes", m_length(m, NULL)); + nl_schedule_taskqueue(nlp); + } else { + NL_LOG(LOG_DEBUG, "ignoring %u bytes on non-active socket", + m_length(m, NULL)); + m_free(m); + error = EINVAL; + } + + NLP_UNLOCK(nlp); + + return (error); +} + +static bool +tx_check_locked(struct nlpcb *nlp) +{ + if 
(queue_empty(&nlp->tx_queue)) + return (true); + + /* + * Check if something can be moved from the internal TX queue + * to the socket queue. + */ + + bool appended = false; + struct sockbuf *sb = &nlp->nl_socket->so_rcv; + SOCKBUF_LOCK(sb); + + while (true) { + struct mbuf *m = queue_head(&nlp->tx_queue); + if (m && sbappendaddr_locked(sb, nl_empty_src, m, NULL) != 0) { + /* appended successfully */ + queue_pop(&nlp->tx_queue); + appended = true; + } else + break; + } + + SOCKBUF_UNLOCK(sb); + + if (appended) + sorwakeup(nlp->nl_socket); + + return (queue_empty(&nlp->tx_queue)); +} + +static bool +nl_process_received_one(struct nlpcb *nlp) +{ + bool reschedule = false; + + NLP_LOCK(nlp); + nlp->nl_task_pending = false; + + if (!tx_check_locked(nlp)) { + /* TX overflow queue still not empty, ignore RX */ + NLP_UNLOCK(nlp); + return (false); + } + + if (queue_empty(&nlp->rx_queue)) { + /* + * Grab all data we have from the socket TX queue + * and store it the internal queue, so it can be worked on + * w/o holding socket lock. + */ + struct sockbuf *sb = &nlp->nl_socket->so_snd; + + SOCKBUF_LOCK(sb); + unsigned int avail = sbavail(sb); + if (avail > 0) { + NL_LOG(LOG_DEBUG3, "grabbed %u bytes", avail); + queue_push(&nlp->rx_queue, sbcut_locked(sb, avail)); + } + SOCKBUF_UNLOCK(sb); + } else { + /* Schedule another pass to read from the socket queue */ + reschedule = true; + } + + int prev_hiwat = nlp->tx_queue.hiwat; + NLP_UNLOCK(nlp); + + while (!queue_empty(&nlp->rx_queue)) { + struct mbuf *m = queue_pop(&nlp->rx_queue); + + m = nl_process_mbuf(m, nlp); + if (m != NULL) { + queue_push_head(&nlp->rx_queue, m); + reschedule = false; + break; + } + } + if (nlp->tx_queue.hiwat > prev_hiwat) { + NLP_LOG(LOG_DEBUG, nlp, "TX override peaked to %d", nlp->tx_queue.hiwat); + + } + + return (reschedule); +} + +static void +nl_process_received(struct nlpcb *nlp) +{ + NL_LOG(LOG_DEBUG3, "taskqueue called"); + + while (nl_process_received_one(nlp)) + ; +} + +void +nl_init_io(struct nlpcb *nlp) +{ + STAILQ_INIT(&nlp->rx_queue.head); + STAILQ_INIT(&nlp->tx_queue.head); +} + +void +nl_free_io(struct nlpcb *nlp) +{ + queue_free(&nlp->rx_queue); + queue_free(&nlp->tx_queue); +} + +/* + * Called after some data have been read from the socket. + */ +void +nl_on_transmit(struct nlpcb *nlp) +{ + NLP_LOCK(nlp); + + struct socket *so = nlp->nl_socket; + if (__predict_false(nlp->nl_dropped_bytes > 0 && so != NULL)) { + uint64_t dropped_bytes = nlp->nl_dropped_bytes; + uint64_t dropped_messages = nlp->nl_dropped_messages; + nlp->nl_dropped_bytes = 0; + nlp->nl_dropped_messages = 0; + + struct sockbuf *sb = &so->so_rcv; + NLP_LOG(LOG_DEBUG, nlp, + "socket RX overflowed, %lu messages (%lu bytes) dropped. " + "bytes: [%u/%u] mbufs: [%u/%u]", dropped_messages, dropped_bytes, + sb->sb_ccc, sb->sb_hiwat, sb->sb_mbcnt, sb->sb_mbmax); + /* TODO: send netlink message */ + } + + nl_schedule_taskqueue(nlp); + NLP_UNLOCK(nlp); +} + +void +nl_taskqueue_handler(void *_arg, int pending) +{ + struct nlpcb *nlp = (struct nlpcb *)_arg; + + CURVNET_SET(nlp->nl_socket->so_vnet); + nl_process_received(nlp); + CURVNET_RESTORE(); +} + +static __noinline void +queue_push_tx(struct nlpcb *nlp, struct mbuf *m) +{ + queue_push(&nlp->tx_queue, m); + nlp->nl_tx_blocked = true; + + if (nlp->tx_queue.length > nlp->tx_queue.hiwat) + nlp->tx_queue.hiwat = nlp->tx_queue.length; +} + +/* + * Tries to send @m to the socket @nlp. + * + * @m: mbuf(s) to send to. Consumed in any case. 
+ * @nlp: socket to send to + * @cnt: number of messages in @m + * @io_flags: combination of NL_IOF_* flags + * + * Returns true on success. + * If no queue overrunes happened, wakes up socket owner. + */ +bool +nl_send_one(struct mbuf *m, struct nlpcb *nlp, int num_messages, int io_flags) +{ + bool untranslated = io_flags & NL_IOF_UNTRANSLATED; + bool ignore_limits = io_flags & NL_IOF_IGNORE_LIMIT; + bool result = true; + + IF_DEBUG_LEVEL(LOG_DEBUG2) { + struct nlmsghdr *hdr = mtod(m, struct nlmsghdr *); + NLP_LOG(LOG_DEBUG2, nlp, + "TX mbuf len %u msgs %u msg type %d first hdrlen %u io_flags %X", + m_length(m, NULL), num_messages, hdr->nlmsg_type, hdr->nlmsg_len, + io_flags); + } + + if (__predict_false(nlp->nl_linux && linux_netlink_p != NULL && untranslated)) { + m = linux_netlink_p->mbufs_to_linux(nlp->nl_proto, m, nlp); + if (m == NULL) + return (false); + } + + NLP_LOCK(nlp); + + if (__predict_false(nlp->nl_socket == NULL)) { + NLP_UNLOCK(nlp); + m_freem(m); + return (false); + } + + if (!queue_empty(&nlp->tx_queue)) { + if (ignore_limits) { + queue_push_tx(nlp, m); + } else { + m_free(m); + result = false; + } + NLP_UNLOCK(nlp); + return (result); + } + + struct socket *so = nlp->nl_socket; + if (sbappendaddr(&so->so_rcv, nl_empty_src, m, NULL) != 0) { + sorwakeup(so); + NLP_LOG(LOG_DEBUG3, nlp, "appended data & woken up"); + } else { + if (ignore_limits) { + queue_push_tx(nlp, m); + } else { + /* + * Store dropped data so it can be reported + * on the next read + */ + nlp->nl_dropped_bytes += m_length(m, NULL); + nlp->nl_dropped_messages += num_messages; + NLP_LOG(LOG_DEBUG2, nlp, "RX oveflow: %lu m (+%d), %lu b (+%d)", + nlp->nl_dropped_messages, num_messages, + nlp->nl_dropped_bytes, m_length(m, NULL)); + soroverflow(so); + m_freem(m); + result = false; + } + } + NLP_UNLOCK(nlp); + + return (result); +} + +static int +nl_receive_message(struct nlmsghdr *hdr, int remaining_length, + struct nlpcb *nlp, struct nl_pstate *npt) +{ + nl_handler_f handler = nl_handlers[nlp->nl_proto].cb; + int error = 0; + + NL_LOG(LOG_DEBUG2, "msg len: %d type: %d", hdr->nlmsg_len, + hdr->nlmsg_type); + + if (__predict_false(hdr->nlmsg_len > remaining_length)) { + NLP_LOG(LOG_DEBUG, nlp, "message is not entirely present: want %d got %d", + hdr->nlmsg_len, remaining_length); + return (EINVAL); + } else if (__predict_false(hdr->nlmsg_len < sizeof(*hdr))) { + NL_LOG(LOG_DEBUG, "message too short: %d", hdr->nlmsg_len); + return (EINVAL); + } + /* Stamp each message with sender pid */ + hdr->nlmsg_pid = nlp->nl_port; + + npt->hdr = hdr; + + if (hdr->nlmsg_flags & NLM_F_REQUEST && hdr->nlmsg_type >= NLMSG_MIN_TYPE) { + NL_LOG(LOG_DEBUG2, "handling message with msg type: %d", + hdr->nlmsg_type); + + if (nlp->nl_linux && linux_netlink_p != NULL) { + struct nlmsghdr *hdr_orig = hdr; + hdr = linux_netlink_p->msg_from_linux(nlp->nl_proto, hdr, npt); + if (hdr == NULL) { + npt->hdr = hdr_orig; + if (hdr->nlmsg_flags & NLM_F_ACK) + nlmsg_ack(nlp, EAGAIN, hdr, npt); + return (0); + } + } + error = handler(hdr, npt); + NL_LOG(LOG_DEBUG2, "retcode: %d", error); + } + if ((hdr->nlmsg_flags & NLM_F_ACK) || (error != 0 && error != EINTR)) { + NL_LOG(LOG_DEBUG3, "ack"); + nlmsg_ack(nlp, error, hdr, npt); + NL_LOG(LOG_DEBUG3, "done"); + } + + return (0); +} + +static void +npt_clear(struct nl_pstate *npt) +{ + lb_clear(&npt->lb); + npt->error = 0; + npt->err_msg = NULL; + npt->err_off = 0; + npt->hdr = NULL; +} + +/* + * Processes an incoming packet, which can contain multiple netlink messages + */ +static struct mbuf * 
+nl_process_mbuf(struct mbuf *m, struct nlpcb *nlp) +{ + int offset, buffer_length; + struct nlmsghdr *hdr; + char *buffer; + int error; + + NL_LOG(LOG_DEBUG3, "RX netlink mbuf %p on %p", m, nlp->nl_socket); + + struct nl_writer nw = {}; + if (!nlmsg_get_unicast_writer(&nw, NLMSG_SMALL, nlp)) { + m_freem(m); + NL_LOG(LOG_DEBUG, "error allocating socket writer"); + return (NULL); + } + + nlmsg_ignore_limit(&nw); + /* TODO: alloc this buf once for nlp */ + int data_length = m_length(m, NULL); + buffer_length = roundup2(data_length, 8) + SCRATCH_BUFFER_SIZE; + if (nlp->nl_linux) + buffer_length += roundup2(data_length, 8); + buffer = malloc(buffer_length, M_NETLINK, M_NOWAIT | M_ZERO); + if (buffer == NULL) { + m_freem(m); + nlmsg_flush(&nw); + NL_LOG(LOG_DEBUG, "Unable to allocate %d bytes of memory", + buffer_length); + return (NULL); + } + m_copydata(m, 0, data_length, buffer); + + struct nl_pstate npt = { + .nlp = nlp, + .lb.base = &buffer[roundup2(data_length, 8)], + .lb.size = buffer_length - roundup2(data_length, 8), + .nw = &nw, + .strict = nlp->nl_flags & NLF_STRICT, + }; + + for (offset = 0; offset + sizeof(struct nlmsghdr) <= data_length;) { + hdr = (struct nlmsghdr *)&buffer[offset]; + /* Save length prior to calling handler */ + int msglen = NLMSG_ALIGN(hdr->nlmsg_len); + NL_LOG(LOG_DEBUG3, "parsing offset %d/%d", offset, data_length); + npt_clear(&npt); + error = nl_receive_message(hdr, data_length - offset, nlp, &npt); + offset += msglen; + if (__predict_false(error != 0 || nlp->nl_tx_blocked)) + break; + } + NL_LOG(LOG_DEBUG3, "packet parsing done"); + free(buffer, M_NETLINK); + nlmsg_flush(&nw); + + if (nlp->nl_tx_blocked) { + NLP_LOCK(nlp); + nlp->nl_tx_blocked = false; + NLP_UNLOCK(nlp); + m_adj(m, offset); + return (m); + } else { + m_freem(m); + return (NULL); + } +} diff --git a/sys/netlink/netlink_linux.h b/sys/netlink/netlink_linux.h new file mode 100644 index 000000000000..8841624be070 --- /dev/null +++ b/sys/netlink/netlink_linux.h @@ -0,0 +1,54 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _NETLINK_LINUX_VAR_H_ +#define _NETLINK_LINUX_VAR_H_ + +/* + * The file contains headers for the bridge interface between + * linux[_common] module and the netlink module + */ +struct nlpcb; +struct nl_pstate; + +typedef struct mbuf *mbufs_to_linux_cb_t(int netlink_family, struct mbuf *m, + struct nlpcb *nlp); +typedef struct mbuf *msgs_to_linux_cb_t(int netlink_family, char *buf, int data_length, + struct nlpcb *nlp); +typedef struct nlmsghdr *msg_from_linux_cb_t(int netlink_family, struct nlmsghdr *hdr, + struct nl_pstate *npt); + +struct linux_netlink_provider { + mbufs_to_linux_cb_t *mbufs_to_linux; + msgs_to_linux_cb_t *msgs_to_linux; + msg_from_linux_cb_t *msg_from_linux; + +}; + +extern struct linux_netlink_provider *linux_netlink_p; + +#endif diff --git a/sys/netlink/netlink_message_parser.c b/sys/netlink/netlink_message_parser.c new file mode 100644 index 000000000000..d33eddb800e4 --- /dev/null +++ b/sys/netlink/netlink_message_parser.c @@ -0,0 +1,472 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); +#include "opt_inet.h" +#include "opt_inet6.h" +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#define DEBUG_MOD_NAME nl_parser +#define DEBUG_MAX_LEVEL LOG_DEBUG3 +#include +_DECLARE_DEBUG(LOG_DEBUG); + +bool +nlmsg_report_err_msg(struct nl_pstate *npt, const char *fmt, ...) 
+{ + va_list ap; + + if (npt->err_msg != NULL) + return (false); + char *buf = npt_alloc(npt, NL_MAX_ERROR_BUF); + if (buf == NULL) + return (false); + va_start(ap, fmt); + vsnprintf(buf, NL_MAX_ERROR_BUF, fmt, ap); + va_end(ap); + + npt->err_msg = buf; + return (true); +} + +bool +nlmsg_report_err_offset(struct nl_pstate *npt, uint32_t off) +{ + if (npt->err_off != 0) + return (false); + npt->err_off = off; + return (true); +} + +static const struct nlattr_parser * +search_states(const struct nlattr_parser *ps, int pslen, int key) +{ + int left_i = 0, right_i = pslen - 1; + + if (key < ps[0].type || key > ps[pslen - 1].type) + return (NULL); + + while (left_i + 1 < right_i) { + int mid_i = (left_i + right_i) / 2; + if (key < ps[mid_i].type) + right_i = mid_i; + else if (key > ps[mid_i].type) + left_i = mid_i + 1; + else + return (&ps[mid_i]); + } + if (ps[left_i].type == key) + return (&ps[left_i]); + else if (ps[right_i].type == key) + return (&ps[right_i]); + return (NULL); +} + +int +nl_parse_attrs_raw(struct nlattr *nla_head, int len, const struct nlattr_parser *ps, int pslen, + struct nl_pstate *npt, void *target) +{ + struct nlattr *nla = NULL; + int error = 0; + + NL_LOG(LOG_DEBUG3, "parse %p remaining_len %d", nla_head, len); + int orig_len = len; + NLA_FOREACH(nla, nla_head, len) { + NL_LOG(LOG_DEBUG3, ">> parsing %p attr_type %d len %d (rem %d)", nla, nla->nla_type, nla->nla_len, len); + if (nla->nla_len < sizeof(struct nlattr)) { + NLMSG_REPORT_ERR_MSG(npt, "Invalid attr %p type %d len: %d", + nla, nla->nla_type, nla->nla_len); + uint32_t off = (char *)nla - (char *)npt->hdr; + nlmsg_report_err_offset(npt, off); + return (EINVAL); + } + + int nla_type = nla->nla_type & NLA_TYPE_MASK; + const struct nlattr_parser *s = search_states(ps, pslen, nla_type); + if (s != NULL) { + void *ptr = (void *)((char *)target + s->off); + error = s->cb(nla, npt, s->arg, ptr); + if (error != 0) { + uint32_t off = (char *)nla - (char *)npt->hdr; + nlmsg_report_err_offset(npt, off); + NL_LOG(LOG_DEBUG3, "parse failed att offset %u", off); + return (error); + } + } else { + /* Ignore non-specified attributes */ + NL_LOG(LOG_DEBUG3, "ignoring attr %d", nla->nla_type); + } + } + if (len >= sizeof(struct nlattr)) { + nla = (struct nlattr *)((char *)nla_head + (orig_len - len)); + NL_LOG(LOG_DEBUG3, " >>> end %p attr_type %d len %d", nla, + nla->nla_type, nla->nla_len); + } + NL_LOG(LOG_DEBUG3, "end parse: %p remaining_len %d", nla, len); + + return (0); +} + +int +nl_parse_attrs(struct nlmsghdr *hdr, int hdrlen, struct nlattr_parser *ps, int pslen, + struct nl_pstate *npt, void *target) +{ + int off = NLMSG_HDRLEN + NETLINK_ALIGN(hdrlen); + int len = hdr->nlmsg_len - off; + struct nlattr *nla_head = (struct nlattr *)((char *)hdr + off); + + return (nl_parse_attrs_raw(nla_head, len, ps, pslen, npt, target)); +} + +int +nlattr_get_flag(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) +{ + if (__predict_false(NLA_DATA_LEN(nla) != 0)) { + NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not a flag", + nla->nla_type, NLA_DATA_LEN(nla)); + return (EINVAL); + } + + *((uint8_t *)target) = 1; + return (0); +} + +static struct sockaddr * +parse_rta_ip4(void *rta_data, struct nl_pstate *npt, int *perror) +{ + struct sockaddr_in *sin; + + sin = (struct sockaddr_in *)npt_alloc_sockaddr(npt, sizeof(struct sockaddr_in)); + if (__predict_false(sin == NULL)) { + *perror = ENOBUFS; + return (NULL); + } + sin->sin_len = sizeof(struct sockaddr_in); + sin->sin_family = AF_INET; + 
memcpy(&sin->sin_addr, rta_data, sizeof(struct in_addr));
+	return ((struct sockaddr *)sin);
+}
+
+static struct sockaddr *
+parse_rta_ip6(void *rta_data, struct nl_pstate *npt, int *perror)
+{
+	struct sockaddr_in6 *sin6;
+
+	sin6 = (struct sockaddr_in6 *)npt_alloc_sockaddr(npt, sizeof(struct sockaddr_in6));
+	if (__predict_false(sin6 == NULL)) {
+		*perror = ENOBUFS;
+		return (NULL);
+	}
+	sin6->sin6_len = sizeof(struct sockaddr_in6);
+	sin6->sin6_family = AF_INET6;
+	memcpy(&sin6->sin6_addr, rta_data, sizeof(struct in6_addr));
+	return ((struct sockaddr *)sin6);
+}
+
+static struct sockaddr *
+parse_rta_ip(struct rtattr *rta, struct nl_pstate *npt, int *perror)
+{
+	void *rta_data = NL_RTA_DATA(rta);
+	int rta_len = NL_RTA_DATA_LEN(rta);
+
+	if (rta_len == sizeof(struct in_addr)) {
+		return (parse_rta_ip4(rta_data, npt, perror));
+	} else if (rta_len == sizeof(struct in6_addr)) {
+		return (parse_rta_ip6(rta_data, npt, perror));
+	} else {
+		NLMSG_REPORT_ERR_MSG(npt, "unknown IP len: %d for rta type %d",
+		    rta_len, rta->rta_type);
+		*perror = ENOTSUP;
+		return (NULL);
+	}
+	return (NULL);
+}
+
+int
+nlattr_get_ip(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
+{
+	int error = 0;
+
+	struct sockaddr *sa = parse_rta_ip((struct rtattr *)nla, npt, &error);
+
+	*((struct sockaddr **)target) = sa;
+	return (error);
+}
+
+static struct sockaddr *
+parse_rta_via(struct rtattr *rta, struct nl_pstate *npt, int *perror)
+{
+	struct rtvia *via = NL_RTA_DATA(rta);
+	int data_len = NL_RTA_DATA_LEN(rta);
+
+	if (__predict_false(data_len < sizeof(struct rtvia))) {
+		NLMSG_REPORT_ERR_MSG(npt, "undersized RTA_VIA(%d) attr: len %d",
+		    rta->rta_type, data_len);
+		*perror = EINVAL;
+		return (NULL);
+	}
+	data_len -= offsetof(struct rtvia, rtvia_addr);
+
+	switch (via->rtvia_family) {
+	case AF_INET:
+		if (__predict_false(data_len < sizeof(struct in_addr))) {
+			*perror = EINVAL;
+			return (NULL);
+		}
+		return (parse_rta_ip4(via->rtvia_addr, npt, perror));
+	case AF_INET6:
+		if (__predict_false(data_len < sizeof(struct in6_addr))) {
+			*perror = EINVAL;
+			return (NULL);
+		}
+		return (parse_rta_ip6(via->rtvia_addr, npt, perror));
+	default:
+		*perror = ENOTSUP;
+		return (NULL);
+	}
+}
+
+int
+nlattr_get_ipvia(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
+{
+	int error = 0;
+
+	struct sockaddr *sa = parse_rta_via((struct rtattr *)nla, npt, &error);
+
+	*((struct sockaddr **)target) = sa;
+	return (error);
+}
+
+
+int
+nlattr_get_uint16(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
+{
+	if (__predict_false(NLA_DATA_LEN(nla) != sizeof(uint16_t))) {
+		NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not uint16",
+		    nla->nla_type, NLA_DATA_LEN(nla));
+		return (EINVAL);
+	}
+	*((uint16_t *)target) = *((const uint16_t *)NL_RTA_DATA_CONST(nla));
+	return (0);
+}
+
+int
+nlattr_get_uint32(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
+{
+	if (__predict_false(NLA_DATA_LEN(nla) != sizeof(uint32_t))) {
+		NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not uint32",
+		    nla->nla_type, NLA_DATA_LEN(nla));
+		return (EINVAL);
+	}
+	*((uint32_t *)target) = *((const uint32_t *)NL_RTA_DATA_CONST(nla));
+	return (0);
+}
+
+int
+nlattr_get_uint64(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target)
+{
+	if (__predict_false(NLA_DATA_LEN(nla) != sizeof(uint64_t))) {
+		NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not uint64",
+		    nla->nla_type, NLA_DATA_LEN(nla));
+		return (EINVAL);
+	}
+
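+	/*
+	 * Use memcpy() here: netlink attribute payloads are only 4-byte
+	 * aligned, so a direct 64-bit load may fault on strict-alignment
+	 * platforms.
+	 */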
memcpy(target, NL_RTA_DATA_CONST(nla), sizeof(uint64_t)); + return (0); +} + +static int +nlattr_get_ifp_internal(struct nlattr *nla, struct nl_pstate *npt, + void *target, bool zero_ok) +{ + if (__predict_false(NLA_DATA_LEN(nla) != sizeof(uint32_t))) { + NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not uint32", + nla->nla_type, NLA_DATA_LEN(nla)); + return (EINVAL); + } + uint32_t ifindex = *((const uint32_t *)NLA_DATA_CONST(nla)); + + if (ifindex == 0 && zero_ok) { + *((struct ifnet **)target) = NULL; + return (0); + } + + NET_EPOCH_ASSERT(); + + struct ifnet *ifp = ifnet_byindex(ifindex); + if (__predict_false(ifp == NULL)) { + NLMSG_REPORT_ERR_MSG(npt, "nla type %d: ifindex %u invalid", + nla->nla_type, ifindex); + return (ENOENT); + } + *((struct ifnet **)target) = ifp; + NL_LOG(LOG_DEBUG3, "nla type %d: ifindex %u -> %s", nla->nla_type, + ifindex, if_name(ifp)); + + return (0); +} + +int +nlattr_get_ifp(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) +{ + return (nlattr_get_ifp_internal(nla, npt, target, false)); +} + +int +nlattr_get_ifpz(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) +{ + return (nlattr_get_ifp_internal(nla, npt, target, true)); +} + +int +nlattr_get_string(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) +{ + int maxlen = NLA_DATA_LEN(nla); + + if (__predict_false(strnlen((char *)NLA_DATA(nla), maxlen) >= maxlen)) { + NLMSG_REPORT_ERR_MSG(npt, "nla type %d size(%u) is not NULL-terminated", + nla->nla_type, maxlen); + return (EINVAL); + } + + *((char **)target) = (char *)NLA_DATA(nla); + return (0); +} + +int +nlattr_get_stringn(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) +{ + int maxlen = NLA_DATA_LEN(nla); + + char *buf = npt_alloc(npt, maxlen + 1); + if (buf == NULL) + return (ENOMEM); + buf[maxlen] = '\0'; + memcpy(buf, NLA_DATA(nla), maxlen); + + *((char **)target) = buf; + return (0); +} +int +nlattr_get_nla(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) +{ + NL_LOG(LOG_DEBUG3, "STORING %p len %d", nla, nla->nla_len); + *((struct nlattr **)target) = nla; + return (0); +} + +int +nlattr_get_nested(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) +{ + const struct nlhdr_parser *p = (const struct nlhdr_parser *)arg; + int error; + + /* Assumes target points to the beginning of the structure */ + error = nl_parse_header(NLA_DATA(nla), NLA_DATA_LEN(nla), p, npt, target); + return (error); +} + +int +nlf_get_ifp(void *src, struct nl_pstate *npt, void *target) +{ + int ifindex = *((const int *)src); + + NET_EPOCH_ASSERT(); + + struct ifnet *ifp = ifnet_byindex(ifindex); + if (ifp == NULL) { + NL_LOG(LOG_DEBUG, "ifindex %u invalid", ifindex); + return (ENOENT); + } + *((struct ifnet **)target) = ifp; + + return (0); +} + +int +nlf_get_ifpz(void *src, struct nl_pstate *npt, void *target) +{ + int ifindex = *((const int *)src); + + NET_EPOCH_ASSERT(); + + struct ifnet *ifp = ifnet_byindex(ifindex); + if (ifindex != 0 && ifp == NULL) { + NL_LOG(LOG_DEBUG, "ifindex %u invalid", ifindex); + return (ENOENT); + } + *((struct ifnet **)target) = ifp; + + return (0); +} + +int +nlf_get_u8(void *src, struct nl_pstate *npt, void *target) +{ + uint8_t val = *((const uint8_t *)src); + + *((uint8_t *)target) = val; + + return (0); +} + +int +nlf_get_u8_u32(void *src, struct nl_pstate *npt, void *target) +{ + *((uint32_t *)target) = *((const uint8_t *)src); + return (0); +} + +int +nlf_get_u16(void *src, struct 
nl_pstate *npt, void *target) +{ + *((uint16_t *)target) = *((const uint16_t *)src); + return (0); +} + +int +nlf_get_u32(void *src, struct nl_pstate *npt, void *target) +{ + *((uint32_t *)target) = *((const uint32_t *)src); + return (0); +} + diff --git a/sys/netlink/netlink_message_parser.h b/sys/netlink/netlink_message_parser.h new file mode 100644 index 000000000000..06a6788b7de5 --- /dev/null +++ b/sys/netlink/netlink_message_parser.h @@ -0,0 +1,270 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _NETLINK_NETLINK_MESSAGE_PARSER_H_ +#define _NETLINK_NETLINK_MESSAGE_PARSER_H_ + +/* + * It is not meant to be included directly + */ + +/* Parsing state */ +struct linear_buffer { + char *base; /* Base allocated memory pointer */ + uint32_t offset; /* Currently used offset */ + uint32_t size; /* Total buffer size */ +}; + +static inline void * +lb_alloc(struct linear_buffer *lb, int len) +{ + len = roundup2(len, sizeof(uint64_t)); + if (lb->offset + len > lb->size) + return (NULL); + void *data = (void *)(lb->base + lb->offset); + lb->offset += len; + return (data); +} + +static inline void +lb_clear(struct linear_buffer *lb) +{ + memset(lb->base, 0, lb->size); + lb->offset = 0; +} + +#define NL_MAX_ERROR_BUF 128 +#define SCRATCH_BUFFER_SIZE (1024 + NL_MAX_ERROR_BUF) +struct nl_pstate { + struct linear_buffer lb; /* Per-message scratch buffer */ + struct nlpcb *nlp; /* Originator socket */ + struct nl_writer *nw; /* Message writer to use */ + struct nlmsghdr *hdr; /* Current parsed message header */ + uint32_t err_off; /* error offset from hdr start */ + int error; /* last operation error */ + char *err_msg; /* Description of last error */ + bool strict; /* Strict parsing required */ +}; + +static inline void * +npt_alloc(struct nl_pstate *npt, int len) +{ + return (lb_alloc(&npt->lb, len)); +} +#define npt_alloc_sockaddr(_npt, _len) ((struct sockaddr *)(npt_alloc(_npt, _len))) + +typedef int parse_field_f(void *hdr, struct nl_pstate *npt, + void *target); +struct nlfield_parser { + uint16_t off_in; + uint16_t off_out; + parse_field_f *cb; +}; +static const struct nlfield_parser nlf_p_empty[] = {}; + +int nlf_get_ifp(void *src, struct nl_pstate *npt, void *target); +int nlf_get_ifpz(void *src, struct nl_pstate *npt, void *target); +int nlf_get_u8(void *src, struct nl_pstate *npt, void *target); +int nlf_get_u16(void *src, struct nl_pstate *npt, void *target); +int nlf_get_u32(void *src, struct nl_pstate *npt, void *target); +int nlf_get_u8_u32(void *src, struct nl_pstate *npt, void *target); + + +struct nlattr_parser; +typedef int parse_attr_f(struct nlattr *attr, struct nl_pstate *npt, + const void *arg, void *target); +struct nlattr_parser { + uint16_t type; /* Attribute type */ + uint16_t off; /* field offset in the target structure */ + parse_attr_f *cb; /* parser function to call */ + const void *arg; +}; + +typedef bool strict_parser_f(void *hdr, struct nl_pstate *npt); + +struct nlhdr_parser { + int nl_hdr_off; /* aligned netlink header size */ + int out_hdr_off; /* target header size */ + int fp_size; + int np_size; + const struct nlfield_parser *fp; /* array of header field parsers */ + const struct nlattr_parser *np; /* array of attribute parsers */ + strict_parser_f *sp; /* Parser function */ +}; + +#define NL_DECLARE_PARSER(_name, _t, _fp, _np) \ +static const struct nlhdr_parser _name = { \ + .nl_hdr_off = sizeof(_t), \ + .fp = &((_fp)[0]), \ + .np = &((_np)[0]), \ + .fp_size = NL_ARRAY_LEN(_fp), \ + .np_size = NL_ARRAY_LEN(_np), \ +} + +#define NL_DECLARE_STRICT_PARSER(_name, _t, _sp, _fp, _np)\ +static const struct nlhdr_parser _name = { \ + .nl_hdr_off = sizeof(_t), \ + .fp = &((_fp)[0]), \ + .np = &((_np)[0]), \ + .fp_size = NL_ARRAY_LEN(_fp), \ + .np_size = NL_ARRAY_LEN(_np), \ + .sp = _sp, \ +} + +#define NL_DECLARE_ARR_PARSER(_name, _t, _o, _fp, _np) \ +static const struct nlhdr_parser _name = { \ + .nl_hdr_off = sizeof(_t), \ + .out_hdr_off = sizeof(_o), \ + .fp = &((_fp)[0]), \ + .np = &((_np)[0]), \ + .fp_size = NL_ARRAY_LEN(_fp), \ + .np_size = 
NL_ARRAY_LEN(_np), \ +} + +#define NL_DECLARE_ATTR_PARSER(_name, _np) \ +static const struct nlhdr_parser _name = { \ + .np = &((_np)[0]), \ + .np_size = NL_ARRAY_LEN(_np), \ +} + +struct nlarr_hdr { + int num_items; + int max_items; +}; + +int nl_parse_attrs_raw(struct nlattr *nla_head, int len, const struct nlattr_parser *ps, + int pslen, struct nl_pstate *npt, void *target); +int nl_parse_attrs(struct nlmsghdr *hdr, int hdrlen, struct nlattr_parser *ps, + int pslen, struct nl_pstate *npt, void *target); + +int nlattr_get_flag(struct nlattr *nla, struct nl_pstate *npt, + const void *arg, void *target); +int nlattr_get_ip(struct nlattr *nla, struct nl_pstate *npt, + const void *arg, void *target); +int nlattr_get_uint16(struct nlattr *nla, struct nl_pstate *npt, + const void *arg, void *target); +int nlattr_get_uint32(struct nlattr *nla, struct nl_pstate *npt, + const void *arg, void *target); +int nlattr_get_uint64(struct nlattr *nla, struct nl_pstate *npt, + const void *arg, void *target); +int nlattr_get_ifp(struct nlattr *nla, struct nl_pstate *npt, + const void *arg, void *target); +int nlattr_get_ifpz(struct nlattr *nla, struct nl_pstate *npt, + const void *arg, void *target); +int nlattr_get_ipvia(struct nlattr *nla, struct nl_pstate *npt, + const void *arg, void *target); +int nlattr_get_string(struct nlattr *nla, struct nl_pstate *npt, + const void *arg, void *target); +int nlattr_get_stringn(struct nlattr *nla, struct nl_pstate *npt, + const void *arg, void *target); +int nlattr_get_nla(struct nlattr *nla, struct nl_pstate *npt, + const void *arg, void *target); +int nlattr_get_nested(struct nlattr *nla, struct nl_pstate *npt, + const void *arg, void *target); + +bool nlmsg_report_err_msg(struct nl_pstate *npt, const char *fmt, ...); + +#define NLMSG_REPORT_ERR_MSG(_npt, _fmt, ...) { \ + nlmsg_report_err_msg(_npt, _fmt, ## __VA_ARGS__); \ + NLP_LOG(LOG_DEBUG, (_npt)->nlp, _fmt, ## __VA_ARGS__); \ +} + +bool nlmsg_report_err_offset(struct nl_pstate *npt, uint32_t off); + +/* + * Have it inline so compiler can optimize field accesses into + * the list of direct function calls without iteration. + */ +static inline int +nl_parse_header(void *hdr, int len, const struct nlhdr_parser *parser, + struct nl_pstate *npt, void *target) +{ + int error; + + if (__predict_false(len < parser->nl_hdr_off)) { + nlmsg_report_err_msg(npt, "header too short: expected %d, got %d", + parser->nl_hdr_off, len); + return (EINVAL); + } + + if (npt->strict && parser->sp != NULL && !parser->sp(hdr, npt)) + return (EINVAL); + + /* Extract fields first */ + for (int i = 0; i < parser->fp_size; i++) { + const struct nlfield_parser *fp = &parser->fp[i]; + void *src = (char *)hdr + fp->off_in; + void *dst = (char *)target + fp->off_out; + + error = fp->cb(src, npt, dst); + if (error != 0) + return (error); + } + + struct nlattr *nla_head = (struct nlattr *)((char *)hdr + parser->nl_hdr_off); + error = nl_parse_attrs_raw(nla_head, len - parser->nl_hdr_off, parser->np, + parser->np_size, npt, target); + + return (error); +} + +static inline int +nl_parse_nested(struct nlattr *nla, const struct nlhdr_parser *parser, + struct nl_pstate *npt, void *target) +{ + struct nlattr *nla_head = (struct nlattr *)NLA_DATA(nla); + + return (nl_parse_attrs_raw(nla_head, NLA_DATA_LEN(nla), parser->np, + parser->np_size, npt, target)); +} + +/* + * Checks that attributes are sorted by attribute type. 
+ */ +static inline void +nl_verify_parsers(const struct nlhdr_parser **parser, int count) +{ + for (int i = 0; i < count; i++) { + const struct nlhdr_parser *p = parser[i]; + int attr_type = 0; + for (int j = 0; j < p->np_size; j++) { + MPASS(p->np[j].type > attr_type); + attr_type = p->np[j].type; + } + } +} +void nl_verify_parsers(const struct nlhdr_parser **parser, int count); +#define NL_VERIFY_PARSERS(_p) nl_verify_parsers((_p), NL_ARRAY_LEN(_p)) + +static inline int +nl_parse_nlmsg(struct nlmsghdr *hdr, const struct nlhdr_parser *parser, + struct nl_pstate *npt, void *target) +{ + return (nl_parse_header(hdr + 1, hdr->nlmsg_len - sizeof(*hdr), parser, npt, target)); +} + +#endif diff --git a/sys/netlink/netlink_message_writer.c b/sys/netlink/netlink_message_writer.c new file mode 100644 index 000000000000..1856f2859b01 --- /dev/null +++ b/sys/netlink/netlink_message_writer.c @@ -0,0 +1,686 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define DEBUG_MOD_NAME nl_writer +#define DEBUG_MAX_LEVEL LOG_DEBUG3 +#include +_DECLARE_DEBUG(LOG_DEBUG); + +/* + * The goal of this file is to provide convenient message writing KPI on top of + * different storage methods (mbufs, uio, temporary memory chunks). + * + * The main KPI guarantee is the the (last) message always resides in the contiguous + * memory buffer, so one is able to update the header after writing the entire message. + * + * This guarantee comes with a side effect of potentially reallocating underlying + * buffer, so one needs to update the desired pointers after something is added + * to the header. + * + * Messaging layer contains hooks performing transparent Linux translation for the messages. 
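+ *
+ * A typical unicast reply path uses the KPI roughly as follows (an
+ * illustrative sketch only; SOME_ATTR and value are placeholders, not
+ * names defined by this change):
+ *
+ *	struct nl_writer nw = {};
+ *	if (!nlmsg_get_unicast_writer(&nw, NLMSG_SMALL, nlp))
+ *		return (ENOMEM);
+ *	if (nlmsg_reply(&nw, hdr, 0) &&
+ *	    nlattr_add_u32(&nw, SOME_ATTR, value))
+ *		nlmsg_end(&nw);
+ *	else
+ *		nlmsg_abort(&nw);
+ *	nlmsg_flush(&nw);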
+ * + * There are 3 types of supported targets: + * * socket (adds mbufs to the socket buffer, used for message replies) + * * group (sends mbuf/chain to the specified groups, used for the notifications) + * * chain (returns mbuf chain, used in Linux message translation code) + * + * There are 3 types of storage: + * * NS_WRITER_TYPE_MBUF (mbuf-based, most efficient, used when a single message + * fits in MCLBYTES) + * * NS_WRITER_TYPE_BUF (fallback, malloc-based, used when a single message needs + * to be larger than one supported by NS_WRITER_TYPE_MBUF) + * * NS_WRITER_TYPE_LBUF (malloc-based, similar to NS_WRITER_TYPE_BUF, used for + * Linux sockets, calls translation hook prior to sending messages to the socket). + * + * Internally, KPI switches between different types of storage when memory requirements + * change. It happens transparently to the caller. + */ + + +typedef bool nlwriter_op_init(struct nl_writer *nw, int size, bool waitok); +typedef bool nlwriter_op_write(struct nl_writer *nw, void *buf, int buflen, int cnt); + +struct nlwriter_ops { + nlwriter_op_init *init; + nlwriter_op_write *write_socket; + nlwriter_op_write *write_group; + nlwriter_op_write *write_chain; +}; + +/* + * NS_WRITER_TYPE_BUF + * Writes message to a temporary memory buffer, + * flushing to the socket/group when buffer size limit is reached + */ +static bool +nlmsg_get_ns_buf(struct nl_writer *nw, int size, bool waitok) +{ + int mflag = waitok ? M_WAITOK : M_NOWAIT; + nw->_storage = malloc(size, M_NETLINK, mflag | M_ZERO); + if (__predict_false(nw->_storage == NULL)) + return (false); + nw->alloc_len = size; + nw->offset = 0; + nw->hdr = NULL; + nw->data = nw->_storage; + nw->writer_type = NS_WRITER_TYPE_BUF; + nw->malloc_flag = mflag; + nw->num_messages = 0; + nw->enomem = false; + return (true); +} + +static bool +nlmsg_write_socket_buf(struct nl_writer *nw, void *buf, int datalen, int cnt) +{ + NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw); + if (__predict_false(datalen == 0)) { + free(buf, M_NETLINK); + return (true); + } + + struct mbuf *m = m_getm2(NULL, datalen, nw->malloc_flag, MT_DATA, M_PKTHDR); + if (__predict_false(m == NULL)) { + /* XXX: should we set sorcverr? */ + free(buf, M_NETLINK); + return (false); + } + m_append(m, datalen, buf); + free(buf, M_NETLINK); + + int io_flags = (nw->ignore_limit) ? 
NL_IOF_IGNORE_LIMIT : 0; + return (nl_send_one(m, (struct nlpcb *)(nw->arg_ptr), cnt, io_flags)); +} + +static bool +nlmsg_write_group_buf(struct nl_writer *nw, void *buf, int datalen, int cnt) +{ + NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg_ptr); + if (__predict_false(datalen == 0)) { + free(buf, M_NETLINK); + return (true); + } + + struct mbuf *m = m_getm2(NULL, datalen, nw->malloc_flag, MT_DATA, M_PKTHDR); + if (__predict_false(m == NULL)) { + free(buf, M_NETLINK); + return (false); + } + bool success = m_append(m, datalen, buf) != 0; + free(buf, M_NETLINK); + + if (!success) + return (false); + + nl_send_group(m, cnt, nw->arg_uint >> 16, nw->arg_uint & 0xFFFF); + return (true); +} + +static bool +nlmsg_write_chain_buf(struct nl_writer *nw, void *buf, int datalen, int cnt) +{ + struct mbuf **m0 = (struct mbuf **)(nw->arg_ptr); + NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg_ptr); + + if (__predict_false(datalen == 0)) { + free(buf, M_NETLINK); + return (true); + } + + if (*m0 == NULL) { + struct mbuf *m; + + m = m_getm2(NULL, datalen, nw->malloc_flag, MT_DATA, M_PKTHDR); + if (__predict_false(m == NULL)) { + free(buf, M_NETLINK); + return (false); + } + *m0 = m; + } + if (__predict_false(m_append(*m0, datalen, buf) == 0)) { + free(buf, M_NETLINK); + return (false); + } + return (true); +} + + +/* + * NS_WRITER_TYPE_MBUF + * Writes message to the allocated mbuf, + * flushing to socket/group when mbuf size limit is reached. + * This is the most efficient mechanism as it avoids double-copying. + * + * Allocates a single mbuf suitable to store up to @size bytes of data. + * If size < MHLEN (around 160 bytes), allocates mbuf with pkghdr + * If size <= MCLBYTES (2k), allocate a single mbuf cluster + * Otherwise, return NULL. + */ +static bool +nlmsg_get_ns_mbuf(struct nl_writer *nw, int size, bool waitok) +{ + struct mbuf *m; + + int mflag = waitok ? M_WAITOK : M_NOWAIT; + m = m_get2(size, mflag, MT_DATA, M_PKTHDR); + if (__predict_false(m == NULL)) + return (false); + nw->alloc_len = M_TRAILINGSPACE(m); + nw->offset = 0; + nw->hdr = NULL; + nw->_storage = (void *)m; + nw->data = mtod(m, void *); + nw->writer_type = NS_WRITER_TYPE_MBUF; + nw->malloc_flag = mflag; + nw->num_messages = 0; + nw->enomem = false; + NL_LOG(LOG_DEBUG2, "alloc mbuf %p req_len %d alloc_len %d data_ptr %p", + m, size, nw->alloc_len, nw->data); + return (true); +} + +static bool +nlmsg_write_socket_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt) +{ + struct mbuf *m = (struct mbuf *)buf; + NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg_ptr); + + if (__predict_false(datalen == 0)) { + m_freem(m); + return (true); + } + + m->m_pkthdr.len = datalen; + m->m_len = datalen; + int io_flags = (nw->ignore_limit) ? 
NL_IOF_IGNORE_LIMIT : 0; + return (nl_send_one(m, (struct nlpcb *)(nw->arg_ptr), cnt, io_flags)); +} + +static bool +nlmsg_write_group_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt) +{ + struct mbuf *m = (struct mbuf *)buf; + NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg_ptr); + + if (__predict_false(datalen == 0)) { + m_freem(m); + return (true); + } + + m->m_pkthdr.len = datalen; + m->m_len = datalen; + nl_send_group(m, cnt, nw->arg_uint >> 16, nw->arg_uint & 0xFFFF); + return (true); +} + +static bool +nlmsg_write_chain_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt) +{ + struct mbuf *m_new = (struct mbuf *)buf; + struct mbuf **m0 = (struct mbuf **)(nw->arg_ptr); + + NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg_ptr); + + if (__predict_false(datalen == 0)) { + m_freem(m_new); + return (true); + } + + m_new->m_pkthdr.len = datalen; + m_new->m_len = datalen; + + if (*m0 == NULL) { + *m0 = m_new; + } else { + struct mbuf *m_last; + for (m_last = *m0; m_last->m_next != NULL; m_last = m_last->m_next) + ; + m_last->m_next = m_new; + (*m0)->m_pkthdr.len += datalen; + } + + return (true); +} + +/* + * NS_WRITER_TYPE_LBUF + * Writes message to the allocated memory buffer, + * flushing to socket/group when mbuf size limit is reached. + * Calls linux handler to rewrite messages before sending to the socket. + */ +static bool +nlmsg_get_ns_lbuf(struct nl_writer *nw, int size, bool waitok) +{ + int mflag = waitok ? M_WAITOK : M_NOWAIT; + size = roundup2(size, sizeof(void *)); + int add_size = sizeof(struct linear_buffer) + SCRATCH_BUFFER_SIZE; + char *buf = malloc(add_size + size * 2, M_NETLINK, mflag | M_ZERO); + if (__predict_false(buf == NULL)) + return (false); + + /* Fill buffer header first */ + struct linear_buffer *lb = (struct linear_buffer *)buf; + lb->base = &buf[sizeof(struct linear_buffer) + size]; + lb->size = size + SCRATCH_BUFFER_SIZE; + + nw->alloc_len = size; + nw->offset = 0; + nw->hdr = NULL; + nw->_storage = buf; + nw->data = (char *)(lb + 1); + nw->malloc_flag = mflag; + nw->writer_type = NS_WRITER_TYPE_LBUF; + nw->num_messages = 0; + nw->enomem = false; + return (true); +} + + +static bool +nlmsg_write_socket_lbuf(struct nl_writer *nw, void *buf, int datalen, int cnt) +{ + struct linear_buffer *lb = (struct linear_buffer *)buf; + char *data = (char *)(lb + 1); + struct nlpcb *nlp = (struct nlpcb *)(nw->arg_ptr); + + if (__predict_false(datalen == 0)) { + free(buf, M_NETLINK); + return (true); + } + + struct mbuf *m = NULL; + if (linux_netlink_p != NULL) + m = linux_netlink_p->msgs_to_linux(nlp->nl_proto, data, datalen, nlp); + free(buf, M_NETLINK); + + if (__predict_false(m == NULL)) { + /* XXX: should we set sorcverr? */ + return (false); + } + + int io_flags = (nw->ignore_limit) ? 
NL_IOF_IGNORE_LIMIT : 0; + return (nl_send_one(m, nlp, cnt, io_flags)); +} + +/* Shouldn't be called (maybe except Linux code originating message) */ +static bool +nlmsg_write_group_lbuf(struct nl_writer *nw, void *buf, int datalen, int cnt) +{ + struct linear_buffer *lb = (struct linear_buffer *)buf; + char *data = (char *)(lb + 1); + + if (__predict_false(datalen == 0)) { + free(buf, M_NETLINK); + return (true); + } + + struct mbuf *m = m_getm2(NULL, datalen, nw->malloc_flag, MT_DATA, M_PKTHDR); + if (__predict_false(m == NULL)) { + free(buf, M_NETLINK); + return (false); + } + m_append(m, datalen, data); + free(buf, M_NETLINK); + + nl_send_group(m, cnt, nw->arg_uint >> 16, nw->arg_uint & 0xFFFF); + return (true); +} + +struct nlwriter_ops nlmsg_writers[] = { + /* NS_WRITER_TYPE_MBUF */ + { + .init = nlmsg_get_ns_mbuf, + .write_socket = nlmsg_write_socket_mbuf, + .write_group = nlmsg_write_group_mbuf, + .write_chain = nlmsg_write_chain_mbuf, + }, + /* NS_WRITER_TYPE_BUF */ + { + .init = nlmsg_get_ns_buf, + .write_socket = nlmsg_write_socket_buf, + .write_group = nlmsg_write_group_buf, + .write_chain = nlmsg_write_chain_buf, + }, + /* NS_WRITER_TYPE_LBUF */ + { + .init = nlmsg_get_ns_lbuf, + .write_socket = nlmsg_write_socket_lbuf, + .write_group = nlmsg_write_group_lbuf, + }, +}; + +static void +nlmsg_set_callback(struct nl_writer *nw) +{ + struct nlwriter_ops *pops = &nlmsg_writers[nw->writer_type]; + + switch (nw->writer_target) { + case NS_WRITER_TARGET_SOCKET: + nw->cb = pops->write_socket; + break; + case NS_WRITER_TARGET_GROUP: + nw->cb = pops->write_group; + break; + case NS_WRITER_TARGET_CHAIN: + nw->cb = pops->write_chain; + break; + default: + panic("not implemented"); + } +} + +static bool +nlmsg_get_buf_type(struct nl_writer *nw, int size, int type, bool waitok) +{ + MPASS(type + 1 <= sizeof(nlmsg_writers) / sizeof(nlmsg_writers[0])); + NL_LOG(LOG_DEBUG3, "Setting up nw %p size %d type %d", nw, size, type); + return (nlmsg_writers[type].init(nw, size, waitok)); +} + +static bool +nlmsg_get_buf(struct nl_writer *nw, int size, bool waitok, bool is_linux) +{ + int type; + + if (!is_linux) { + if (__predict_true(size <= MCLBYTES)) + type = NS_WRITER_TYPE_MBUF; + else + type = NS_WRITER_TYPE_BUF; + } else + type = NS_WRITER_TYPE_LBUF; + return (nlmsg_get_buf_type(nw, size, type, waitok)); +} + +bool +nlmsg_get_unicast_writer(struct nl_writer *nw, int size, struct nlpcb *nlp) +{ + if (!nlmsg_get_buf(nw, size, false, nlp->nl_linux)) + return (false); + nw->arg_ptr = (void *)nlp; + nw->writer_target = NS_WRITER_TARGET_SOCKET; + nlmsg_set_callback(nw); + return (true); +} + +bool +nlmsg_get_group_writer(struct nl_writer *nw, int size, int protocol, int group_id) +{ + if (!nlmsg_get_buf(nw, size, false, false)) + return (false); + nw->arg_uint = (uint64_t)protocol << 16 | (uint64_t)group_id; + nw->writer_target = NS_WRITER_TARGET_GROUP; + nlmsg_set_callback(nw); + return (true); +} + +bool +nlmsg_get_chain_writer(struct nl_writer *nw, int size, struct mbuf **pm) +{ + if (!nlmsg_get_buf(nw, size, false, false)) + return (false); + *pm = NULL; + nw->arg_ptr = (void *)pm; + nw->writer_target = NS_WRITER_TARGET_CHAIN; + nlmsg_set_callback(nw); + NL_LOG(LOG_DEBUG3, "setup cb %p (need %p)", nw->cb, &nlmsg_write_chain_mbuf); + return (true); +} + +void +nlmsg_ignore_limit(struct nl_writer *nw) +{ + nw->ignore_limit = true; +} + +bool +nlmsg_flush(struct nl_writer *nw) +{ + + if (__predict_false(nw->hdr != NULL)) { + /* Last message has not been completed, skip it. 
*/ + int completed_len = (char *)nw->hdr - nw->data; + /* Send completed messages */ + nw->offset -= nw->offset - completed_len; + nw->hdr = NULL; + } + + NL_LOG(LOG_DEBUG2, "OUT"); + bool result = nw->cb(nw, nw->_storage, nw->offset, nw->num_messages); + nw->_storage = NULL; + + if (!result) { + NL_LOG(LOG_DEBUG, "nw %p offset %d: flush with %p() failed", nw, nw->offset, nw->cb); + } + + return (result); +} + +/* + * Flushes previous data and allocates new underlying storage + * sufficient for holding at least @required_len bytes. + * Return true on success. + */ +bool +nlmsg_refill_buffer(struct nl_writer *nw, int required_len) +{ + struct nl_writer ns_new = {}; + int completed_len, new_len; + + if (nw->enomem) + return (false); + + NL_LOG(LOG_DEBUG3, "no space at offset %d/%d (want %d), trying to reclaim", + nw->offset, nw->alloc_len, required_len); + + /* Calculated new buffer size and allocate it s*/ + completed_len = (nw->hdr != NULL) ? (char *)nw->hdr - nw->data : nw->offset; + if (completed_len > 0 && required_len < MCLBYTES) { + /* We already ran out of space, use the largest effective size */ + new_len = max(nw->alloc_len, MCLBYTES); + } else { + if (nw->alloc_len < MCLBYTES) + new_len = MCLBYTES; + else + new_len = nw->alloc_len * 2; + while (new_len < required_len) + new_len *= 2; + } + bool waitok = (nw->malloc_flag == M_WAITOK); + bool is_linux = (nw->writer_type == NS_WRITER_TYPE_LBUF); + if (!nlmsg_get_buf(&ns_new, new_len, waitok, is_linux)) { + nw->enomem = true; + NL_LOG(LOG_DEBUG, "getting new buf failed, setting ENOMEM"); + return (false); + } + if (nw->ignore_limit) + nlmsg_ignore_limit(&ns_new); + + /* Update callback data */ + ns_new.writer_target = nw->writer_target; + nlmsg_set_callback(&ns_new); + ns_new.arg_uint = nw->arg_uint; + + /* Copy last (unfinished) header to the new storage */ + int last_len = nw->offset - completed_len; + if (last_len > 0) { + memcpy(ns_new.data, nw->hdr, last_len); + ns_new.hdr = (struct nlmsghdr *)ns_new.data; + ns_new.offset = last_len; + } + + NL_LOG(LOG_DEBUG2, "completed: %d bytes, copied: %d bytes", completed_len, last_len); + + /* Flush completed headers & switch to the new nw */ + nlmsg_flush(nw); + memcpy(nw, &ns_new, sizeof(struct nl_writer)); + NL_LOG(LOG_DEBUG2, "switched buffer: used %d/%d bytes", nw->offset, nw->alloc_len); + + return (true); +} + +bool +nlmsg_add(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type, + uint16_t flags, uint32_t len) +{ + struct nlmsghdr *hdr; + + MPASS(nw->hdr == NULL); + + int required_len = NETLINK_ALIGN(len + sizeof(struct nlmsghdr)); + if (__predict_false(nw->offset + required_len > nw->alloc_len)) { + if (!nlmsg_refill_buffer(nw, required_len)) + return (false); + } + + hdr = (struct nlmsghdr *)(&nw->data[nw->offset]); + + hdr->nlmsg_len = len; + hdr->nlmsg_type = type; + hdr->nlmsg_flags = flags; + hdr->nlmsg_seq = seq; + hdr->nlmsg_pid = portid; + + nw->hdr = hdr; + nw->offset += sizeof(struct nlmsghdr); + + return (true); +} + +bool +nlmsg_end(struct nl_writer *nw) +{ + MPASS(nw->hdr != NULL); + + if (nw->enomem) { + NL_LOG(LOG_DEBUG, "ENOMEM when dumping message"); + nlmsg_abort(nw); + return (false); + } + + nw->hdr->nlmsg_len = (uint32_t)(nw->data + nw->offset - (char *)nw->hdr); + nw->hdr = NULL; + nw->num_messages++; + return (true); +} + +void +nlmsg_abort(struct nl_writer *nw) +{ + if (nw->hdr != NULL) { + nw->offset = (uint32_t)((char *)nw->hdr - nw->data); + nw->hdr = NULL; + } +} + +void +nlmsg_ack(struct nlpcb *nlp, int error, struct nlmsghdr *hdr, + 
struct nl_pstate *npt) +{ + struct nlmsgerr *errmsg; + int payload_len; + uint32_t flags = nlp->nl_flags; + struct nl_writer *nw = npt->nw; + bool cap_ack; + + payload_len = sizeof(struct nlmsgerr); + + /* + * The only case when we send the full message in the + * reply is when there is an error and NETLINK_CAP_ACK + * is not set. + */ + cap_ack = (error == 0) || (flags & NLF_CAP_ACK); + if (!cap_ack) + payload_len += hdr->nlmsg_len - sizeof(struct nlmsghdr); + payload_len = NETLINK_ALIGN(payload_len); + + uint16_t nl_flags = cap_ack ? NLM_F_CAPPED : 0; + if ((npt->err_msg || npt->err_off) && nlp->nl_flags & NLF_EXT_ACK) + nl_flags |= NLM_F_ACK_TLVS; + + /* + * TODO: handle cookies + */ + + NL_LOG(LOG_DEBUG3, "acknowledging message type %d seq %d", + hdr->nlmsg_type, hdr->nlmsg_seq); + + if (!nlmsg_add(nw, nlp->nl_port, hdr->nlmsg_seq, NLMSG_ERROR, nl_flags, payload_len)) + goto enomem; + + errmsg = nlmsg_reserve_data(nw, payload_len, struct nlmsgerr); + errmsg->error = error; + /* In case of error copy the whole message, else just the header */ + memcpy(&errmsg->msg, hdr, cap_ack ? sizeof(*hdr) : hdr->nlmsg_len); + + if (npt->err_msg != NULL && nlp->nl_flags & NLF_EXT_ACK) + nlattr_add_string(nw, NLMSGERR_ATTR_MSG, npt->err_msg); + if (npt->err_off != 0 && nlp->nl_flags & NLF_EXT_ACK) + nlattr_add_u32(nw, NLMSGERR_ATTR_OFFS, npt->err_off); + + if (nlmsg_end(nw)) + return; +enomem: + NLP_LOG(LOG_DEBUG, nlp, "error allocating ack data for message %d seq %u", + hdr->nlmsg_type, hdr->nlmsg_seq); + nlmsg_abort(nw); +} + +bool +nlmsg_end_dump(struct nl_writer *nw, int error, struct nlmsghdr *hdr) +{ + if (!nlmsg_add(nw, hdr->nlmsg_pid, hdr->nlmsg_seq, NLMSG_DONE, 0, sizeof(int))) { + NL_LOG(LOG_DEBUG, "Error finalizing table dump"); + return (false); + } + /* Save operation result */ + int *perror = nlmsg_reserve_object(nw, int); + NL_LOG(LOG_DEBUG2, "record error=%d at off %d (%p)", error, + nw->offset, perror); + *perror = error; + nlmsg_end(nw); + + return (true); +} diff --git a/sys/netlink/netlink_message_writer.h b/sys/netlink/netlink_message_writer.h new file mode 100644 index 000000000000..95f6dd8e6da0 --- /dev/null +++ b/sys/netlink/netlink_message_writer.h @@ -0,0 +1,250 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Ng Peng Nam Sean + * Copyright (c) 2022 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _NETLINK_NETLINK_MESSAGE_WRITER_H_ +#define _NETLINK_NETLINK_MESSAGE_WRITER_H_ + +/* + * It is not meant to be included directly + */ + +struct mbuf; +struct nl_writer; +typedef bool nl_writer_cb(struct nl_writer *nw, void *buf, int buflen, int cnt); + +struct nl_writer { + int alloc_len; /* allocated buffer length */ + int offset; /* offset from the start of the buffer */ + struct nlmsghdr *hdr; /* Pointer to the currently-filled msg */ + char *data; /* pointer to the contiguous storage */ + void *_storage; /* Underlying storage pointer */ + nl_writer_cb *cb; /* Callback to flush data */ + union { + void *arg_ptr; /* Callback argument as pointer */ + uint64_t arg_uint; /* Callback argument as int */ + }; + int num_messages; /* Number of messages in the buffer */ + int malloc_flag; /* M_WAITOK or M_NOWAIT */ + uint8_t writer_type; /* NS_WRITER_TYPE_* */ + uint8_t writer_target; /* NS_WRITER_TARGET_* */ + bool ignore_limit; /* If true, ignores RCVBUF limit */ + bool enomem; /* True if ENOMEM occured */ +}; +#define NS_WRITER_TARGET_SOCKET 0 +#define NS_WRITER_TARGET_GROUP 1 +#define NS_WRITER_TARGET_CHAIN 2 + +#define NS_WRITER_TYPE_MBUF 0 +#define NS_WRITER_TYPE_BUF 1 +#define NS_WRITER_TYPE_LBUF 2 +#define NS_WRITER_TYPE_MBUFC 3 + + +#define NLMSG_SMALL 128 +#define NLMSG_LARGE 2048 + +/* Message and attribute writing */ + +struct nlpcb; +bool nlmsg_get_unicast_writer(struct nl_writer *nw, int expected_size, struct nlpcb *nlp); +bool nlmsg_get_group_writer(struct nl_writer *nw, int expected_size, int proto, int group_id); +bool nlmsg_get_chain_writer(struct nl_writer *nw, int expected_size, struct mbuf **pm); +bool nlmsg_flush(struct nl_writer *nw); +void nlmsg_ignore_limit(struct nl_writer *nw); + +bool nlmsg_refill_buffer(struct nl_writer *nw, int required_size); +bool nlmsg_add(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type, + uint16_t flags, uint32_t len); +bool nlmsg_end(struct nl_writer *nw); +void nlmsg_abort(struct nl_writer *nw); + +bool nlmsg_end_dump(struct nl_writer *nw, int error, struct nlmsghdr *hdr); + +static inline bool +nlmsg_reply(struct nl_writer *nw, const struct nlmsghdr *hdr, int payload_len) +{ + return (nlmsg_add(nw, hdr->nlmsg_pid, hdr->nlmsg_seq, hdr->nlmsg_type, + hdr->nlmsg_flags, payload_len)); +} + +#define nlmsg_data(_hdr) ((void *)((_hdr) + 1)) + +/* + * KPI similar to mtodo(): + * current (uncompleted) header is guaranteed to be contiguous, + * but can be reallocated, thus pointers may need to be readjusted. 
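+ *
+ * For example (an illustrative sketch; NESTED_TYPE, CHILD_TYPE and value
+ * are placeholders), a nested attribute whose final length is known only
+ * after its children have been written is tracked by offset:
+ *
+ *	int off = nlattr_add_nested(nw, NESTED_TYPE);
+ *	if (off == 0)
+ *		return (false);
+ *	nlattr_add_u32(nw, CHILD_TYPE, value);
+ *	nlattr_set_len(nw, off);
+ *
+ * An offset is used instead of a pointer because any nlattr_add_*() or
+ * nlmsg_reserve_*() call may reallocate the underlying buffer.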
+ */ +static inline int +nlattr_save_offset(const struct nl_writer *nw) +{ + return (nw->offset - ((char *)nw->hdr - nw->data)); +} + +static inline void * +_nlattr_restore_offset(const struct nl_writer *nw, int off) +{ + return ((void *)((char *)nw->hdr + off)); +} +#define nlattr_restore_offset(_ns, _off, _t) ((_t *)_nlattr_restore_offset(_ns, _off)) + +static inline void +nlattr_set_len(const struct nl_writer *nw, int off) +{ + struct nlattr *nla = nlattr_restore_offset(nw, off, struct nlattr); + nla->nla_len = nlattr_save_offset(nw) - off; +} + +static inline void * +nlmsg_reserve_data_raw(struct nl_writer *nw, size_t sz) +{ + if (__predict_false(nw->offset + NETLINK_ALIGN(sz) > nw->alloc_len)) { + if (!nlmsg_refill_buffer(nw, NETLINK_ALIGN(sz))) + return (NULL); + } + + void *data_ptr = &nw->data[nw->offset]; + nw->offset += NLMSG_ALIGN(sz); + + return (data_ptr); +} +#define nlmsg_reserve_object(_ns, _t) ((_t *)nlmsg_reserve_data_raw(_ns, NLA_ALIGN(sizeof(_t)))) +#define nlmsg_reserve_data(_ns, _sz, _t) ((_t *)nlmsg_reserve_data_raw(_ns, _sz)) + +static inline int +nlattr_add_nested(struct nl_writer *nw, uint16_t nla_type) +{ + int off = nlattr_save_offset(nw); + struct nlattr *nla = nlmsg_reserve_data(nw, sizeof(struct nlattr), struct nlattr); + if (__predict_false(nla == NULL)) + return (0); + nla->nla_type = nla_type; + return (off); +} + +static inline void * +_nlmsg_reserve_attr(struct nl_writer *nw, uint16_t nla_type, uint16_t sz) +{ + sz += sizeof(struct nlattr); + + struct nlattr *nla = nlmsg_reserve_data(nw, sz, struct nlattr); + if (__predict_false(nla == NULL)) + return (NULL); + nla->nla_type = nla_type; + nla->nla_len = sz; + + return ((void *)(nla + 1)); +} +#define nlmsg_reserve_attr(_ns, _at, _t) ((_t *)_nlmsg_reserve_attr(_ns, _at, NLA_ALIGN(sizeof(_t)))) + +static inline bool +nlattr_add(struct nl_writer *nw, int attr_type, int attr_len, const void *data) +{ + int required_len = NLA_ALIGN(attr_len + sizeof(struct nlattr)); + + if (__predict_false(nw->offset + required_len > nw->alloc_len)) { + if (!nlmsg_refill_buffer(nw, required_len)) + return (false); + } + + struct nlattr *nla = (struct nlattr *)(&nw->data[nw->offset]); + + nla->nla_len = attr_len + sizeof(struct nlattr); + nla->nla_type = attr_type; + if (attr_len > 0) { + if ((attr_len % 4) != 0) { + /* clear padding bytes */ + bzero((char *)nla + required_len - 4, 4); + } + memcpy((nla + 1), data, attr_len); + } + nw->offset += required_len; + return (true); +} + +static inline bool +nlattr_add_u8(struct nl_writer *nw, int attrtype, uint8_t value) +{ + return (nlattr_add(nw, attrtype, sizeof(uint8_t), &value)); +} + +static inline bool +nlattr_add_u16(struct nl_writer *nw, int attrtype, uint16_t value) +{ + return (nlattr_add(nw, attrtype, sizeof(uint16_t), &value)); +} + +static inline bool +nlattr_add_u32(struct nl_writer *nw, int attrtype, uint32_t value) +{ + return (nlattr_add(nw, attrtype, sizeof(uint32_t), &value)); +} + +static inline bool +nlattr_add_u64(struct nl_writer *nw, int attrtype, uint64_t value) +{ + return (nlattr_add(nw, attrtype, sizeof(uint64_t), &value)); +} + +static inline bool +nlattr_add_s8(struct nl_writer *nw, int attrtype, int8_t value) +{ + return (nlattr_add(nw, attrtype, sizeof(int8_t), &value)); +} + +static inline bool +nlattr_add_s16(struct nl_writer *nw, int attrtype, int16_t value) +{ + return (nlattr_add(nw, attrtype, sizeof(int16_t), &value)); +} + +static inline bool +nlattr_add_s32(struct nl_writer *nw, int attrtype, int32_t value) +{ + return (nlattr_add(nw, 
attrtype, sizeof(int32_t), &value)); +} + +static inline bool +nlattr_add_s64(struct nl_writer *nw, int attrtype, int64_t value) +{ + return (nlattr_add(nw, attrtype, sizeof(int64_t), &value)); +} + +static inline bool +nlattr_add_flag(struct nl_writer *nw, int attrtype) +{ + return (nlattr_add(nw, attrtype, 0, NULL)); +} + +static inline bool +nlattr_add_string(struct nl_writer *nw, int attrtype, const char *str) +{ + return (nlattr_add(nw, attrtype, strlen(str) + 1, str)); +} + + +#endif diff --git a/sys/netlink/netlink_module.c b/sys/netlink/netlink_module.c new file mode 100644 index 000000000000..a1bcb8a29511 --- /dev/null +++ b/sys/netlink/netlink_module.c @@ -0,0 +1,228 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Ng Peng Nam Sean + * Copyright (c) 2022 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include + +MALLOC_DEFINE(M_NETLINK, "netlink", "Memory used for netlink packets"); + +#define DEBUG_MOD_NAME nl_mod +#define DEBUG_MAX_LEVEL LOG_DEBUG3 +#include +_DECLARE_DEBUG(LOG_DEBUG); + +SYSCTL_NODE(_net, OID_AUTO, netlink, CTLFLAG_RD, 0, ""); +SYSCTL_NODE(_net_netlink, OID_AUTO, debug, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); + +#define NL_MAX_HANDLERS 20 +struct nl_proto_handler _nl_handlers[NL_MAX_HANDLERS]; +struct nl_proto_handler *nl_handlers = _nl_handlers; + +CK_LIST_HEAD(nl_control_head, nl_control); +static struct nl_control_head vnets_head = CK_LIST_HEAD_INITIALIZER(); + +VNET_DEFINE(struct nl_control *, nl_ctl) = NULL; + +struct mtx nl_global_mtx; +MTX_SYSINIT(nl_global_mtx, &nl_global_mtx, "global netlink lock", MTX_DEF); + +#define NL_GLOBAL_LOCK() mtx_lock(&nl_global_mtx) +#define NL_GLOBAL_UNLOCK() mtx_unlock(&nl_global_mtx) + +int netlink_unloading = 0; + +static void +free_nl_ctl(struct nl_control *ctl) +{ + rm_destroy(&ctl->ctl_lock); + free(ctl, M_NETLINK); +} + +struct nl_control * +vnet_nl_ctl_init(void) +{ + struct nl_control *ctl; + + ctl = malloc(sizeof(struct nl_control), M_NETLINK, M_WAITOK | M_ZERO); + rm_init(&ctl->ctl_lock, "netlink lock"); + CK_LIST_INIT(&ctl->ctl_port_head); + CK_LIST_INIT(&ctl->ctl_pcb_head); + + NL_GLOBAL_LOCK(); + + struct nl_control *tmp = atomic_load_ptr(&V_nl_ctl); + + if (tmp == NULL) { + atomic_store_ptr(&V_nl_ctl, ctl); + CK_LIST_INSERT_HEAD(&vnets_head, ctl, ctl_next); + NL_LOG(LOG_DEBUG2, "VNET %p init done, inserted %p into global list", + curvnet, ctl); + } else { + NL_LOG(LOG_DEBUG, "per-VNET init clash, dropping this instance"); + free_nl_ctl(ctl); + ctl = tmp; + } + + NL_GLOBAL_UNLOCK(); + + return (ctl); +} + +static void +vnet_nl_ctl_destroy(const void *unused __unused) +{ + struct nl_control *ctl; + + /* Assume at the time all of the processes / sockets are dead */ + + NL_GLOBAL_LOCK(); + ctl = atomic_load_ptr(&V_nl_ctl); + atomic_store_ptr(&V_nl_ctl, NULL); + if (ctl != NULL) { + NL_LOG(LOG_DEBUG2, "Removing %p from global list", ctl); + CK_LIST_REMOVE(ctl, ctl_next); + } + NL_GLOBAL_UNLOCK(); + + if (ctl != NULL) + free_nl_ctl(ctl); +} +VNET_SYSUNINIT(vnet_nl_ctl_destroy, SI_SUB_PROTO_IF, SI_ORDER_ANY, + vnet_nl_ctl_destroy, NULL); + +int +nl_verify_proto(int proto) +{ + if (proto < 0 || proto >= NL_MAX_HANDLERS) { + return (EINVAL); + } + int handler_defined = nl_handlers[proto].cb != NULL; + return (handler_defined ? 
0 : EPROTONOSUPPORT); +} + +const char * +nl_get_proto_name(int proto) +{ + return (nl_handlers[proto].proto_name); +} + +bool +netlink_register_proto(int proto, const char *proto_name, nl_handler_f handler) +{ + if ((proto < 0) || (proto >= NL_MAX_HANDLERS)) + return (false); + NL_GLOBAL_LOCK(); + KASSERT((nl_handlers[proto].cb == NULL), ("netlink handler %d is already set", proto)); + nl_handlers[proto].cb = handler; + nl_handlers[proto].proto_name = proto_name; + NL_GLOBAL_UNLOCK(); + NL_LOG(LOG_DEBUG, "Registered netlink %s(%d) handler", proto_name, proto); + return (true); +} + +bool +netlink_unregister_proto(int proto) +{ + if ((proto < 0) || (proto >= NL_MAX_HANDLERS)) + return (false); + NL_GLOBAL_LOCK(); + KASSERT((nl_handlers[proto].cb != NULL), ("netlink handler %d is not set", proto)); + nl_handlers[proto].cb = NULL; + nl_handlers[proto].proto_name = NULL; + NL_GLOBAL_UNLOCK(); + NL_LOG(LOG_DEBUG, "Unregistered netlink proto %d handler", proto); + return (true); +} + +static bool +can_unload(void) +{ + struct nl_control *ctl; + bool result = true; + + NL_GLOBAL_LOCK(); + + CK_LIST_FOREACH(ctl, &vnets_head, ctl_next) { + NL_LOG(LOG_DEBUG2, "Iterating VNET head %p", ctl); + if (!CK_LIST_EMPTY(&ctl->ctl_pcb_head)) { + NL_LOG(LOG_NOTICE, "non-empty socket list in ctl %p", ctl); + result = false; + break; + } + } + + NL_GLOBAL_UNLOCK(); + + return (result); +} + +static int +netlink_modevent(module_t mod __unused, int what, void *priv __unused) +{ + int ret = 0; + + switch (what) { + case MOD_LOAD: + NL_LOG(LOG_DEBUG, "Loading"); + NL_LOG(LOG_NOTICE, "netlink support is in BETA stage"); + break; + + case MOD_UNLOAD: + NL_LOG(LOG_DEBUG, "Unload called"); + if (can_unload()) { + NL_LOG(LOG_WARNING, "unloading"); + netlink_unloading = 1; + } else + ret = EBUSY; + break; + + default: + ret = EOPNOTSUPP; + break; + } + + return (ret); +} +static moduledata_t netlink_mod = { "netlink", netlink_modevent, NULL }; + +DECLARE_MODULE(netlink, netlink_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +MODULE_VERSION(netlink, 1); diff --git a/sys/netlink/netlink_route.c b/sys/netlink/netlink_route.c new file mode 100644 index 000000000000..f12bf268e252 --- /dev/null +++ b/sys/netlink/netlink_route.c @@ -0,0 +1,135 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_MOD_NAME nl_route_core +#define DEBUG_MAX_LEVEL LOG_DEBUG3 +#include +_DECLARE_DEBUG(LOG_DEBUG); + +#define HANDLER_MAX_NUM (NL_RTM_MAX + 10) +static const struct rtnl_cmd_handler *rtnl_handler[HANDLER_MAX_NUM] = {}; + +bool +rtnl_register_messages(const struct rtnl_cmd_handler *handlers, int count) +{ + for (int i = 0; i < count; i++) { + if (handlers[i].cmd >= HANDLER_MAX_NUM) + return (false); + MPASS(rtnl_handler[handlers[i].cmd] == NULL); + } + for (int i = 0; i < count; i++) + rtnl_handler[handlers[i].cmd] = &handlers[i]; + return (true); +} + +/* + * Handler called by netlink subsystem when matching netlink message is received + */ +static int +rtnl_handle_message(struct nlmsghdr *hdr, struct nl_pstate *npt) +{ + const struct rtnl_cmd_handler *cmd; + struct epoch_tracker et; + struct nlpcb *nlp = npt->nlp; + int error = 0; + + if (__predict_false(hdr->nlmsg_type >= HANDLER_MAX_NUM)) { + NLMSG_REPORT_ERR_MSG(npt, "unknown message type: %d", hdr->nlmsg_type); + return (ENOTSUP); + } + + cmd = rtnl_handler[hdr->nlmsg_type]; + if (__predict_false(cmd == NULL)) { + NLMSG_REPORT_ERR_MSG(npt, "unknown message type: %d", hdr->nlmsg_type); + return (ENOTSUP); + } + + NLP_LOG(LOG_DEBUG2, nlp, "received msg %s(%d) len %d", cmd->name, + hdr->nlmsg_type, hdr->nlmsg_len); + + if (cmd->priv != 0 && !nlp_has_priv(nlp, cmd->priv)) { + NLP_LOG(LOG_DEBUG2, nlp, "priv %d check failed for msg %s", cmd->priv, cmd->name); + return (EPERM); + } else if (cmd->priv != 0) + NLP_LOG(LOG_DEBUG3, nlp, "priv %d check passed for msg %s", cmd->priv, cmd->name); + + bool need_epoch = !(cmd->flags & RTNL_F_NOEPOCH); + + if (need_epoch) + NET_EPOCH_ENTER(et); + error = cmd->cb(hdr, nlp, npt); + if (need_epoch) + NET_EPOCH_EXIT(et); + + NLP_LOG(LOG_DEBUG3, nlp, "message %s -> error %d", cmd->name, error); + + return (error); +} + +static struct rtbridge nlbridge = { .route_f = rtnl_handle_route_event }; +static struct rtbridge *nlbridge_orig_p; + +static void +rtnl_load(void *u __unused) +{ + NL_LOG(LOG_NOTICE, "rtnl loading"); + nlbridge_orig_p = netlink_callback_p; + netlink_callback_p = &nlbridge; + rtnl_neighs_init(); + rtnl_ifaces_init(); + rtnl_nexthops_init(); + rtnl_routes_init(); + netlink_register_proto(NETLINK_ROUTE, "NETLINK_ROUTE", rtnl_handle_message); +} +SYSINIT(rtnl_load, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rtnl_load, NULL); + +static void +rtnl_unload(void *u __unused) +{ + netlink_callback_p = nlbridge_orig_p; + rtnl_ifaces_destroy(); + rtnl_neighs_destroy(); + + /* Wait till all consumers read nlbridge data */ + epoch_wait_preempt(net_epoch_preempt); +} +SYSUNINIT(rtnl_unload, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rtnl_unload, NULL); diff --git a/sys/netlink/netlink_route.h b/sys/netlink/netlink_route.h new file mode 100644 index 000000000000..93445f2e1699 --- /dev/null +++ b/sys/netlink/netlink_route.h @@ 
-0,0 +1,43 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#ifndef _NETLINK_NETLINK_ROUTE_H_ +#define _NETLINK_NETLINK_ROUTE_H_ + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#endif diff --git a/sys/netlink/netlink_var.h b/sys/netlink/netlink_var.h new file mode 100644 index 000000000000..40d3870fd795 --- /dev/null +++ b/sys/netlink/netlink_var.h @@ -0,0 +1,142 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Ng Peng Nam Sean + * Copyright (c) 2022 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ +#ifndef _NETLINK_NETLINK_VAR_H_ +#define _NETLINK_NETLINK_VAR_H_ + +#include +#include +#include +#include +#include + +#define NLSNDQ 65536 /* Default socket sendspace */ +#define NLRCVQ 65536 /* Default socket recvspace */ + +struct ucred; + +struct nl_io_queue { + STAILQ_HEAD(, mbuf) head; + int length; + int hiwat; +}; + +struct nlpcb { + struct socket *nl_socket; + uint64_t nl_groups; + uint32_t nl_port; + uint32_t nl_flags; + uint32_t nl_process_id; + int nl_proto; + bool nl_active; + bool nl_bound; + bool nl_task_pending; + bool nl_tx_blocked; /* No new requests accepted */ + bool nl_linux; /* true if running under compat */ + struct nl_io_queue rx_queue; + struct nl_io_queue tx_queue; + struct taskqueue *nl_taskqueue; + struct task nl_task; + struct ucred *nl_cred; /* Copy of nl_socket->so_cred */ + uint64_t nl_dropped_bytes; + uint64_t nl_dropped_messages; + CK_LIST_ENTRY(nlpcb) nl_next; + CK_LIST_ENTRY(nlpcb) nl_port_next; + volatile u_int nl_refcount; + struct mtx nl_lock; + struct epoch_context nl_epoch_ctx; +}; +#define sotonlpcb(so) ((struct nlpcb *)(so)->so_pcb) + +#define NLP_LOCK_INIT(_nlp) mtx_init(&((_nlp)->nl_lock), "nlp mtx", NULL, MTX_DEF) +#define NLP_LOCK_DESTROY(_nlp) mtx_destroy(&((_nlp)->nl_lock)) +#define NLP_LOCK(_nlp) mtx_lock(&((_nlp)->nl_lock)) +#define NLP_UNLOCK(_nlp) mtx_unlock(&((_nlp)->nl_lock)) + +#define ALIGNED_NL_SZ(_data) roundup2((((struct nlmsghdr *)(_data))->nlmsg_len), 16) + +/* nl_flags */ +#define NLF_CAP_ACK 0x01 /* Do not send message body with errmsg */ +#define NLF_EXT_ACK 0x02 /* Allow including extended TLVs in ack */ +#define NLF_STRICT 0x04 /* Perform strict header checks */ + +SYSCTL_DECL(_net_netlink); + +struct nl_io { + struct callout callout; + struct mbuf *head; + struct mbuf *last; + int64_t length; +}; + +struct nl_control { + CK_LIST_HEAD(nl_pid_head, nlpcb) ctl_port_head; + CK_LIST_HEAD(nlpcb_head, nlpcb) ctl_pcb_head; + CK_LIST_ENTRY(nl_control) ctl_next; + struct nl_io ctl_io; + struct rmlock ctl_lock; +}; +VNET_DECLARE(struct nl_control *, nl_ctl); +#define V_nl_ctl VNET(nl_ctl) + + +struct sockaddr_nl; +struct sockaddr; +struct nlmsghdr; + +/* netlink_module.c */ +struct nl_control *vnet_nl_ctl_init(void); + +int nl_verify_proto(int proto); +const char *nl_get_proto_name(int proto); + +extern int netlink_unloading; + +struct nl_proto_handler { + nl_handler_f cb; + const char *proto_name; +}; +extern struct nl_proto_handler *nl_handlers; + +/* netlink_domain.c */ +void nl_send_group(struct mbuf *m, int cnt, int proto, int group_id); + +/* netlink_io.c */ +#define NL_IOF_UNTRANSLATED 0x01 +#define NL_IOF_IGNORE_LIMIT 0x02 +bool nl_send_one(struct mbuf *m, struct nlpcb *nlp, int cnt, int io_flags); +void nlmsg_ack(struct nlpcb *nlp, int error, struct nlmsghdr *nlmsg, + struct nl_pstate *npt); +void nl_on_transmit(struct nlpcb *nlp); +void nl_init_io(struct nlpcb *nlp); +void nl_free_io(struct nlpcb *nlp); + +void nl_taskqueue_handler(void *_arg, int pending); +int nl_receive_async(struct mbuf *m, struct socket *so); +void nl_process_receive_locked(struct nlpcb *nlp); + +#endif diff --git a/sys/netlink/route/common.h b/sys/netlink/route/common.h new file mode 100644 index 000000000000..1bfb888b34c0 --- /dev/null +++ b/sys/netlink/route/common.h @@ -0,0 +1,213 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 Alexander V. 
Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Common defines for all parts of the NETLINK_ROUTE family + */ +#ifndef _NETLINK_ROUTE_COMMON_H_ +#define _NETLINK_ROUTE_COMMON_H_ + +/* Defined NETLINK_ROUTE messages */ +enum { + NL_RTM_BASE = 16, + NL_RTM_NEWLINK = 16, /* creates new interface */ + NL_RTM_DELLINK = 17, /* deletes matching interface */ + NL_RTM_GETLINK = 18, /* lists matching interfaces */ + NL_RTM_SETLINK = 19, /* not supported */ + NL_RTM_NEWADDR = 20, /* not supported */ + NL_RTM_DELADDR = 21, /* not supported */ + NL_RTM_GETADDR = 22, /* lists matching ifaddrs */ + NL_RTM_NEWROUTE = 24, /* adds or changes a route */ + NL_RTM_DELROUTE = 25, /* deletes matching route */ + NL_RTM_GETROUTE = 26, /* lists matching routes */ + NL_RTM_NEWNEIGH = 28, /* creates new arp/ndp entry */ + NL_RTM_DELNEIGH = 29, /* deletes matching arp/ndp entry */ + NL_RTM_GETNEIGH = 30, /* lists matching arp/ndp entry */ + NL_RTM_NEWRULE = 32, /* not supported */ + NL_RTM_DELRULE = 33, /* not supported */ + NL_RTM_GETRULE = 34, /* not supported */ + NL_RTM_NEWQDISC = 36, /* not supported */ + NL_RTM_DELQDISC = 37, /* not supported */ + NL_RTM_GETQDISC = 38, /* not supported */ + NL_RTM_NEWTCLASS = 40, /* not supported */ + NL_RTM_DELTCLASS = 41, /* not supported */ + NL_RTM_GETTCLASS = 42, /* not supported */ + NL_RTM_NEWTFILTER = 44, /* not supported */ + NL_RTM_DELTFILTER = 45, /* not supported */ + NL_RTM_GETTFILTER = 46, /* not supported */ + NL_RTM_NEWACTION = 48, /* not supported */ + NL_RTM_DELACTION = 49, /* not supported */ + NL_RTM_GETACTION = 50, /* not supported */ + NL_RTM_NEWPREFIX = 52, /* not supported */ + NL_RTM_GETMULTICAST = 58, /* not supported */ + NL_RTM_GETANYCAST = 62, /* not supported */ + NL_RTM_NEWNEIGHTBL = 64, /* not supported */ + NL_RTM_GETNEIGHTBL = 66, /* not supported */ + NL_RTM_SETNEIGHTBL = 67, /* not supported */ + NL_RTM_NEWNDUSEROPT = 68, /* not supported */ + NL_RTM_NEWADDRLABEL = 72, /* not supported */ + NL_RTM_DELADDRLABEL = 73, /* not supported */ + NL_RTM_GETADDRLABEL = 74, /* not supported */ + NL_RTM_GETDCB = 78, /* not supported */ + NL_RTM_SETDCB = 79, /* not supported */ + NL_RTM_NEWNETCONF = 80, /* not supported */ + NL_RTM_GETNETCONF = 82, /* not supported */ + NL_RTM_NEWMDB = 84, /* not supported 
*/ + NL_RTM_DELMDB = 85, /* not supported */ + NL_RTM_GETMDB = 86, /* not supported */ + NL_RTM_NEWNSID = 88, /* not supported */ + NL_RTM_DELNSID = 89, /* not supported */ + NL_RTM_GETNSID = 90, /* not supported */ + NL_RTM_NEWSTATS = 92, /* not supported */ + NL_RTM_GETSTATS = 94, /* not supported */ + NL_RTM_NEWNEXTHOP = 104, /* creates new user nexhtop */ + NL_RTM_DELNEXTHOP = 105, /* deletes matching nexthop */ + NL_RTM_GETNEXTHOP = 106, /* lists created user nexthops */ + __NL_RTM_MAX, +}; +#define NL_RTM_MAX (((__NL_RTM_MAX + 3) & ~3) - 1) + +#ifndef _KERNEL +/* + * RTM_* namespace clashes with BSD rtsock namespace. + * Use NL_RTM_ prefix in the kernel and map it to RTM_ + * for userland. + */ +#define RTM_BASE NL_RTM_BASE +#define RTM_NEWLINK NL_RTM_NEWLINK +#define RTM_DELLINK NL_RTM_DELLINK +#define RTM_GETLINK NL_RTM_GETLINK +#define RTM_SETLINK NL_RTM_SETLINK +#define RTM_NEWADDR NL_RTM_NEWADDR +#define RTM_DELADDR NL_RTM_DELADDR +#define RTM_GETADDR NL_RTM_GETADDR +#define RTM_NEWROUTE NL_RTM_NEWROUTE +#define RTM_DELROUTE NL_RTM_DELROUTE +#define RTM_GETROUTE NL_RTM_GETROUTE +#define RTM_NEWNEXTHOP NL_RTM_NEWNEXTHOP +#define RTM_DELNEXTHOP NL_RTM_DELNEXTHOP +#define RTM_GETNEXTHOP NL_RTM_GETNEXTHOP +#endif + +#ifndef _KERNEL +/* rtnetlink multicast groups - backwards compatibility for userspace */ +#define RTMGRP_LINK 0x01 +#define RTMGRP_NOTIFY 0x02 +#define RTMGRP_NEIGH 0x04 +#define RTMGRP_TC 0x08 + +#define RTMGRP_IPV4_IFADDR 0x10 +#define RTMGRP_IPV4_MROUTE 0x20 +#define RTMGRP_IPV4_ROUTE 0x40 +#define RTMGRP_IPV4_RULE 0x80 + +#define RTMGRP_IPV6_IFADDR 0x100 +#define RTMGRP_IPV6_MROUTE 0x200 +#define RTMGRP_IPV6_ROUTE 0x400 +#define RTMGRP_IPV6_IFINFO 0x800 + +#define RTMGRP_DECnet_IFADDR 0x1000 +#define RTMGRP_DECnet_ROUTE 0x4000 + +#define RTMGRP_IPV6_PREFIX 0x20000 +#endif + +/* Defined NETLINK_ROUTE multicast groups */ +enum rtnetlink_groups { + RTNLGRP_NONE, +#define RTNLGRP_NONE RTNLGRP_NONE + RTNLGRP_LINK, +#define RTNLGRP_LINK RTNLGRP_LINK + RTNLGRP_NOTIFY, +#define RTNLGRP_NOTIFY RTNLGRP_NOTIFY + RTNLGRP_NEIGH, +#define RTNLGRP_NEIGH RTNLGRP_NEIGH + RTNLGRP_TC, +#define RTNLGRP_TC RTNLGRP_TC + RTNLGRP_IPV4_IFADDR, +#define RTNLGRP_IPV4_IFADDR RTNLGRP_IPV4_IFADDR + RTNLGRP_IPV4_MROUTE, +#define RTNLGRP_IPV4_MROUTE RTNLGRP_IPV4_MROUTE + RTNLGRP_IPV4_ROUTE, +#define RTNLGRP_IPV4_ROUTE RTNLGRP_IPV4_ROUTE + RTNLGRP_IPV4_RULE, +#define RTNLGRP_IPV4_RULE RTNLGRP_IPV4_RULE + RTNLGRP_IPV6_IFADDR, +#define RTNLGRP_IPV6_IFADDR RTNLGRP_IPV6_IFADDR + RTNLGRP_IPV6_MROUTE, +#define RTNLGRP_IPV6_MROUTE RTNLGRP_IPV6_MROUTE + RTNLGRP_IPV6_ROUTE, +#define RTNLGRP_IPV6_ROUTE RTNLGRP_IPV6_ROUTE + RTNLGRP_IPV6_IFINFO, +#define RTNLGRP_IPV6_IFINFO RTNLGRP_IPV6_IFINFO + RTNLGRP_DECnet_IFADDR, +#define RTNLGRP_DECnet_IFADDR RTNLGRP_DECnet_IFADDR + RTNLGRP_NOP2, + RTNLGRP_DECnet_ROUTE, +#define RTNLGRP_DECnet_ROUTE RTNLGRP_DECnet_ROUTE + RTNLGRP_DECnet_RULE, +#define RTNLGRP_DECnet_RULE RTNLGRP_DECnet_RULE + RTNLGRP_NOP4, + RTNLGRP_IPV6_PREFIX, +#define RTNLGRP_IPV6_PREFIX RTNLGRP_IPV6_PREFIX + RTNLGRP_IPV6_RULE, +#define RTNLGRP_IPV6_RULE RTNLGRP_IPV6_RULE + RTNLGRP_ND_USEROPT, +#define RTNLGRP_ND_USEROPT RTNLGRP_ND_USEROPT + RTNLGRP_PHONET_IFADDR, +#define RTNLGRP_PHONET_IFADDR RTNLGRP_PHONET_IFADDR + RTNLGRP_PHONET_ROUTE, +#define RTNLGRP_PHONET_ROUTE RTNLGRP_PHONET_ROUTE + RTNLGRP_DCB, +#define RTNLGRP_DCB RTNLGRP_DCB + RTNLGRP_IPV4_NETCONF, +#define RTNLGRP_IPV4_NETCONF RTNLGRP_IPV4_NETCONF + RTNLGRP_IPV6_NETCONF, +#define RTNLGRP_IPV6_NETCONF RTNLGRP_IPV6_NETCONF + RTNLGRP_MDB, 
+#define RTNLGRP_MDB RTNLGRP_MDB + RTNLGRP_MPLS_ROUTE, +#define RTNLGRP_MPLS_ROUTE RTNLGRP_MPLS_ROUTE + RTNLGRP_NSID, +#define RTNLGRP_NSID RTNLGRP_NSID + RTNLGRP_MPLS_NETCONF, +#define RTNLGRP_MPLS_NETCONF RTNLGRP_MPLS_NETCONF + RTNLGRP_IPV4_MROUTE_R, +#define RTNLGRP_IPV4_MROUTE_R RTNLGRP_IPV4_MROUTE_R + RTNLGRP_IPV6_MROUTE_R, +#define RTNLGRP_IPV6_MROUTE_R RTNLGRP_IPV6_MROUTE_R + RTNLGRP_NEXTHOP, +#define RTNLGRP_NEXTHOP RTNLGRP_NEXTHOP + RTNLGRP_BRVLAN, +#define RTNLGRP_BRVLAN RTNLGRP_BRVLAN + __RTNLGRP_MAX +}; +#define RTNLGRP_MAX (__RTNLGRP_MAX - 1) + + +#endif + diff --git a/sys/netlink/route/iface.c b/sys/netlink/route/iface.c new file mode 100644 index 000000000000..8db24b5507e4 --- /dev/null +++ b/sys/netlink/route/iface.c @@ -0,0 +1,857 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); +#include "opt_inet.h" +#include "opt_inet6.h" +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include /* scope deembedding */ + +#define DEBUG_MOD_NAME nl_iface +#define DEBUG_MAX_LEVEL LOG_DEBUG3 +#include +_DECLARE_DEBUG(LOG_DEBUG); + +struct netlink_walkargs { + struct nl_writer *nw; + struct nlmsghdr hdr; + struct nlpcb *so; + uint32_t fibnum; + int family; + int error; + int count; + int dumped; +}; + +static eventhandler_tag ifdetach_event, ifattach_event, ifaddr_event; + +static SLIST_HEAD(, nl_cloner) nl_cloners = SLIST_HEAD_INITIALIZER(nl_cloners); + +static struct sx rtnl_cloner_lock; +SX_SYSINIT(rtnl_cloner_lock, &rtnl_cloner_lock, "rtnl cloner lock"); + +/* + * RTM_GETLINK request + * sendto(3, {{len=32, type=RTM_GETLINK, flags=NLM_F_REQUEST|NLM_F_DUMP, seq=1641940952, pid=0}, + * {ifi_family=AF_INET, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}}, 32, 0, NULL, 0) = 32 + * + * Reply: + * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_ETHER, ifi_index=if_nametoindex("enp0s31f6"), ifi_flags=IFF_UP|IFF_BROADCAST|IFF_RUNNING|IFF_MULTICAST|IFF_LOWER_UP, ifi_change=0}, +{{nla_len=10, nla_type=IFLA_ADDRESS}, "\xfe\x54\x00\x52\x3e\x90"} + +[ +{{nla_len=14, nla_type=IFLA_IFNAME}, "enp0s31f6"}, +{{nla_len=8, nla_type=IFLA_TXQLEN}, 1000}, +{{nla_len=5, nla_type=IFLA_OPERSTATE}, 6}, +{{nla_len=5, nla_type=IFLA_LINKMODE}, 0}, +{{nla_len=8, nla_type=IFLA_MTU}, 1500}, +{{nla_len=8, nla_type=IFLA_MIN_MTU}, 68}, + {{nla_len=8, nla_type=IFLA_MAX_MTU}, 9000}, +{{nla_len=8, nla_type=IFLA_GROUP}, 0}, +{{nla_len=8, nla_type=IFLA_PROMISCUITY}, 0}, +{{nla_len=8, nla_type=IFLA_NUM_TX_QUEUES}, 1}, +{{nla_len=8, nla_type=IFLA_GSO_MAX_SEGS}, 65535}, +{{nla_len=8, nla_type=IFLA_GSO_MAX_SIZE}, 65536}, +{{nla_len=8, nla_type=IFLA_NUM_RX_QUEUES}, 1}, +{{nla_len=5, nla_type=IFLA_CARRIER}, 1}, +{{nla_len=13, nla_type=IFLA_QDISC}, "fq_codel"}, +{{nla_len=8, nla_type=IFLA_CARRIER_CHANGES}, 2}, +{{nla_len=5, nla_type=IFLA_PROTO_DOWN}, 0}, +{{nla_len=8, nla_type=IFLA_CARRIER_UP_COUNT}, 1}, +{{nla_len=8, nla_type=IFLA_CARRIER_DOWN_COUNT}, 1}, + */ + +struct if_state { + uint8_t ifla_operstate; + uint8_t ifla_carrier; +}; + +static void +get_operstate_ether(struct ifnet *ifp, struct if_state *pstate) +{ + struct ifmediareq ifmr = {}; + int error; + error = (*ifp->if_ioctl)(ifp, SIOCGIFMEDIA, (void *)&ifmr); + + if (error != 0) { + NL_LOG(LOG_DEBUG, "error calling SIOCGIFMEDIA on %s: %d", + if_name(ifp), error); + return; + } + + switch (IFM_TYPE(ifmr.ifm_active)) { + case IFM_ETHER: + if (ifmr.ifm_status & IFM_ACTIVE) { + pstate->ifla_carrier = 1; + if (ifp->if_flags & IFF_MONITOR) + pstate->ifla_operstate = IF_OPER_DORMANT; + else + pstate->ifla_operstate = IF_OPER_UP; + } else + pstate->ifla_operstate = IF_OPER_DOWN; + } +} + +static bool +get_stats(struct nl_writer *nw, struct ifnet *ifp) +{ + struct rtnl_link_stats64 *stats; + + int nla_len = sizeof(struct nlattr) + sizeof(*stats); + struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr); + if (nla == NULL) + return (false); + nla->nla_type = IFLA_STATS64; + nla->nla_len = nla_len; + stats = (struct rtnl_link_stats64 *)(nla + 1); + + stats->rx_packets = ifp->if_get_counter(ifp, IFCOUNTER_IPACKETS); + stats->tx_packets = ifp->if_get_counter(ifp, IFCOUNTER_OPACKETS); + stats->rx_bytes = ifp->if_get_counter(ifp, IFCOUNTER_IBYTES); + stats->tx_bytes = ifp->if_get_counter(ifp, 
IFCOUNTER_OBYTES); + stats->rx_errors = ifp->if_get_counter(ifp, IFCOUNTER_IERRORS); + stats->tx_errors = ifp->if_get_counter(ifp, IFCOUNTER_OERRORS); + stats->rx_dropped = ifp->if_get_counter(ifp, IFCOUNTER_IQDROPS); + stats->tx_dropped = ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS); + stats->multicast = ifp->if_get_counter(ifp, IFCOUNTER_IMCASTS); + stats->rx_nohandler = ifp->if_get_counter(ifp, IFCOUNTER_NOPROTO); + + return (true); +} + +static void +get_operstate(struct ifnet *ifp, struct if_state *pstate) +{ + pstate->ifla_operstate = IF_OPER_UNKNOWN; + pstate->ifla_carrier = 0; /* no carrier */ + + switch (ifp->if_type) { + case IFT_ETHER: + get_operstate_ether(ifp, pstate); + break; + case IFT_LOOP: + if (ifp->if_flags & IFF_UP) { + pstate->ifla_operstate = IF_OPER_UP; + pstate->ifla_carrier = 1; + } else + pstate->ifla_operstate = IF_OPER_DOWN; + break; + } +} + +static unsigned +ifp_flags_to_netlink(const struct ifnet *ifp) +{ + return (ifp->if_flags | ifp->if_drv_flags); +} + +#define LLADDR_CONST(s) ((const void *)((s)->sdl_data + (s)->sdl_nlen)) +static bool +dump_sa(struct nl_writer *nw, int attr, const struct sockaddr *sa) +{ + uint32_t addr_len = 0; + const void *addr_data = NULL; + struct in6_addr addr6; + + if (sa == NULL) + return (true); + + switch (sa->sa_family) { + case AF_INET: + addr_len = sizeof(struct in_addr); + addr_data = &((const struct sockaddr_in *)sa)->sin_addr; + break; + case AF_INET6: + in6_splitscope(&((const struct sockaddr_in6 *)sa)->sin6_addr, &addr6, &addr_len); + addr_len = sizeof(struct in6_addr); + addr_data = &addr6; + break; + case AF_LINK: + addr_len = ((const struct sockaddr_dl *)sa)->sdl_alen; + addr_data = LLADDR_CONST((const struct sockaddr_dl *)sa); + break; + default: + NL_LOG(LOG_DEBUG, "unsupported family: %d, skipping", sa->sa_family); + return (true); + } + + return (nlattr_add(nw, attr, addr_len, addr_data)); +} + +/* + * Dumps interface state, properties and metrics. + * @nw: message writer + * @ifp: target interface + * @hdr: template header + * + * This function is called without epoch and MAY sleep. 
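+ * (It can sleep because get_operstate_ether() above issues a SIOCGIFMEDIA ioctl through ifp->if_ioctl.)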
+ */
+static bool
+dump_iface(struct nl_writer *nw, struct ifnet *ifp, const struct nlmsghdr *hdr)
+{
+	struct ifinfomsg *ifinfo;
+
+	NL_LOG(LOG_DEBUG3, "dumping interface %s data", if_name(ifp));
+
+	if (!nlmsg_reply(nw, hdr, sizeof(struct ifinfomsg)))
+		goto enomem;
+
+	ifinfo = nlmsg_reserve_object(nw, struct ifinfomsg);
+	ifinfo->ifi_family = AF_UNSPEC;
+	ifinfo->__ifi_pad = 0;
+	ifinfo->ifi_type = ifp->if_type;
+	ifinfo->ifi_index = ifp->if_index;
+	ifinfo->ifi_flags = ifp_flags_to_netlink(ifp);
+	ifinfo->ifi_change = 0;
+
+	nlattr_add_string(nw, IFLA_IFNAME, if_name(ifp));
+
+	struct if_state ifs = {};
+	get_operstate(ifp, &ifs);
+
+	nlattr_add_u8(nw, IFLA_OPERSTATE, ifs.ifla_operstate);
+	nlattr_add_u8(nw, IFLA_CARRIER, ifs.ifla_carrier);
+
+/*
+	nlattr_add_u8(nw, IFLA_PROTO_DOWN, val);
+	nlattr_add_u8(nw, IFLA_LINKMODE, val);
+*/
+	if (ifp->if_addr != NULL) {
+		dump_sa(nw, IFLA_ADDRESS, ifp->if_addr->ifa_addr);
+	}
+
+	if (ifp->if_broadcastaddr != NULL) {
+		nlattr_add(nw, IFLA_BROADCAST, ifp->if_addrlen,
+		    ifp->if_broadcastaddr);
+	}
+
+	nlattr_add_u32(nw, IFLA_MTU, ifp->if_mtu);
+/*
+	nlattr_add_u32(nw, IFLA_MIN_MTU, 60);
+	nlattr_add_u32(nw, IFLA_MAX_MTU, 9000);
+	nlattr_add_u32(nw, IFLA_GROUP, 0);
+*/
+	get_stats(nw, ifp);
+
+	uint32_t val = (ifp->if_flags & IFF_PROMISC) != 0;
+	nlattr_add_u32(nw, IFLA_PROMISCUITY, val);
+
+	if (nlmsg_end(nw))
+		return (true);
+
+enomem:
+	NL_LOG(LOG_DEBUG, "unable to dump interface %s state (ENOMEM)", if_name(ifp));
+	nlmsg_abort(nw);
+	return (false);
+}
+
+static bool
+check_ifmsg(void *hdr, struct nl_pstate *npt)
+{
+	struct ifinfomsg *ifm = hdr;
+
+	if (ifm->__ifi_pad != 0 || ifm->ifi_type != 0 ||
+	    ifm->ifi_flags != 0 || ifm->ifi_change != 0) {
+		nlmsg_report_err_msg(npt,
+		    "strict checking: non-zero values in ifinfomsg header");
+		return (false);
+	}
+
+	return (true);
+}
+
+#define	_IN(_field)	offsetof(struct ifinfomsg, _field)
+#define	_OUT(_field)	offsetof(struct nl_parsed_link, _field)
+static const struct nlfield_parser nlf_p_if[] = {
+	{ .off_in = _IN(ifi_type), .off_out = _OUT(ifi_type), .cb = nlf_get_u16 },
+	{ .off_in = _IN(ifi_index), .off_out = _OUT(ifi_index), .cb = nlf_get_u32 },
+};
+
+static const struct nlattr_parser nla_p_linfo[] = {
+	{ .type = IFLA_INFO_KIND, .off = _OUT(ifla_cloner), .cb = nlattr_get_stringn },
+	{ .type = IFLA_INFO_DATA, .off = _OUT(ifla_idata), .cb = nlattr_get_nla },
+};
+NL_DECLARE_ATTR_PARSER(linfo_parser, nla_p_linfo);
+
+static const struct nlattr_parser nla_p_if[] = {
+	{ .type = IFLA_IFNAME, .off = _OUT(ifla_ifname), .cb = nlattr_get_string },
+	{ .type = IFLA_MTU, .off = _OUT(ifla_mtu), .cb = nlattr_get_uint32 },
+	{ .type = IFLA_LINK, .off = _OUT(ifi_index), .cb = nlattr_get_uint32 },
+	{ .type = IFLA_LINKINFO, .arg = &linfo_parser, .cb = nlattr_get_nested },
+	{ .type = IFLA_GROUP, .off = _OUT(ifla_group), .cb = nlattr_get_string },
+	{ .type = IFLA_ALT_IFNAME, .off = _OUT(ifla_ifname), .cb = nlattr_get_string },
+};
+#undef _IN
+#undef _OUT
+NL_DECLARE_STRICT_PARSER(ifmsg_parser, struct ifinfomsg, check_ifmsg, nlf_p_if, nla_p_if);
+
+static bool
+match_iface(struct nl_parsed_link *attrs, struct ifnet *ifp)
+{
+	if (attrs->ifi_index != 0 && attrs->ifi_index != ifp->if_index)
+		return (false);
+	if (attrs->ifi_type != 0 && attrs->ifi_type != ifp->if_type)
+		return (false);
+	if (attrs->ifla_ifname != NULL && strcmp(attrs->ifla_ifname, if_name(ifp)))
+		return (false);
+	/* TODO: add group match */
+
+	return (true);
+}
+
+/*
+ * {nlmsg_len=52, nlmsg_type=RTM_GETLINK, nlmsg_flags=NLM_F_REQUEST, nlmsg_seq=1662842818, nlmsg_pid=0},
+ * {ifi_family=AF_PACKET, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0},
+ * [
+ *  [{nla_len=10, nla_type=IFLA_IFNAME}, "vnet9"],
+ *  [{nla_len=8, nla_type=IFLA_EXT_MASK}, RTEXT_FILTER_VF]
+ * ]
+ */
+static int
+rtnl_handle_getlink(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt)
+{
+	struct epoch_tracker et;
+	struct ifnet *ifp;
+	int error = 0;
+
+	struct nl_parsed_link attrs = {};
+	error = nl_parse_nlmsg(hdr, &ifmsg_parser, npt, &attrs);
+	if (error != 0)
+		return (error);
+
+	struct netlink_walkargs wa = {
+		.so = nlp,
+		.nw = npt->nw,
+		.hdr.nlmsg_pid = hdr->nlmsg_pid,
+		.hdr.nlmsg_seq = hdr->nlmsg_seq,
+		.hdr.nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI,
+		.hdr.nlmsg_type = NL_RTM_NEWLINK,
+	};
+
+	/* Fast track for an interface w/ explicit index match */
+	if (attrs.ifi_index != 0) {
+		NET_EPOCH_ENTER(et);
+		ifp = ifnet_byindex_ref(attrs.ifi_index);
+		NET_EPOCH_EXIT(et);
+		NLP_LOG(LOG_DEBUG3, nlp, "fast track -> searching index %u", attrs.ifi_index);
+		if (ifp != NULL) {
+			if (match_iface(&attrs, ifp)) {
+				if (!dump_iface(wa.nw, ifp, &wa.hdr))
+					error = ENOMEM;
+			} else
+				error = ESRCH;
+			if_rele(ifp);
+		} else
+			error = ESRCH;
+		return (error);
+	}
+
+	/*
+	 * Fetching some link properties requires performing ioctls that may block.
+	 * Address it by saving referenced pointers of the matching links,
+	 * exiting from epoch and going through the list one-by-one.
+	 */
+
+	NL_LOG(LOG_DEBUG2, "Start dump");
+
+	struct ifnet **match_array;
+	int offset = 0, base_count = 16; /* start with 128 bytes */
+	match_array = malloc(base_count * sizeof(void *), M_TEMP, M_NOWAIT);
+	if (match_array == NULL)
+		return (ENOMEM);
+
+	NLP_LOG(LOG_DEBUG3, nlp, "MATCHING: index=%u type=%d name=%s",
+	    attrs.ifi_index, attrs.ifi_type, attrs.ifla_ifname);
+	NET_EPOCH_ENTER(et);
+	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
+		wa.count++;
+		if (match_iface(&attrs, ifp)) {
+			if (offset < base_count) {
+				if (!if_try_ref(ifp))
+					continue;
+				match_array[offset++] = ifp;
+				continue;
+			}
+			/* Too many matches, need to reallocate */
+			struct ifnet **new_array;
+			int sz = base_count * sizeof(void *);
+			base_count *= 2;
+			new_array = malloc(sz * 2, M_TEMP, M_NOWAIT);
+			if (new_array == NULL) {
+				error = ENOMEM;
+				break;
+			}
+			memcpy(new_array, match_array, sz);
+			free(match_array, M_TEMP);
+			match_array = new_array;
+			/* Do not lose the interface that triggered the resize */
+			if (!if_try_ref(ifp))
+				continue;
+			match_array[offset++] = ifp;
+		}
+	}
+	NET_EPOCH_EXIT(et);
+
+	NL_LOG(LOG_DEBUG2, "Matched %d interface(s), dumping", offset);
+	for (int i = 0; error == 0 && i < offset; i++) {
+		if (!dump_iface(wa.nw, match_array[i], &wa.hdr))
+			error = ENOMEM;
+	}
+	for (int i = 0; i < offset; i++)
+		if_rele(match_array[i]);
+	free(match_array, M_TEMP);
+
+	NL_LOG(LOG_DEBUG2, "End dump, iterated %d dumped %d", wa.count, wa.dumped);
+
+	if (!nlmsg_end_dump(wa.nw, error, &wa.hdr)) {
+		NL_LOG(LOG_DEBUG, "Unable to finalize the dump");
+		return (ENOMEM);
+	}
+
+	return (error);
+}
+
+/*
+ * sendmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=[
+ * {nlmsg_len=60, nlmsg_type=RTM_NEWLINK, nlmsg_flags=NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE, nlmsg_seq=1662715618, nlmsg_pid=0},
+ * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0},
+ * {nla_len=11, nla_type=IFLA_IFNAME}, "dummy0"],
+ * [
+ * {nla_len=16, nla_type=IFLA_LINKINFO},
+ * [
+ * {nla_len=9, nla_type=IFLA_INFO_KIND}, "dummy"...
+ * ] + * ] + */ + +static int +rtnl_handle_dellink(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) +{ + struct epoch_tracker et; + struct ifnet *ifp; + int error; + + struct nl_parsed_link attrs = {}; + error = nl_parse_nlmsg(hdr, &ifmsg_parser, npt, &attrs); + if (error != 0) + return (error); + + NET_EPOCH_ENTER(et); + ifp = ifnet_byindex_ref(attrs.ifi_index); + NET_EPOCH_EXIT(et); + if (ifp == NULL) { + NLP_LOG(LOG_DEBUG, nlp, "unable to find interface %u", attrs.ifi_index); + return (ENOENT); + } + NLP_LOG(LOG_DEBUG3, nlp, "mapped ifindex %u to %s", attrs.ifi_index, if_name(ifp)); + + sx_xlock(&ifnet_detach_sxlock); + error = if_clone_destroy(if_name(ifp)); + sx_xunlock(&ifnet_detach_sxlock); + + NLP_LOG(LOG_DEBUG2, nlp, "deleting interface %s returned %d", if_name(ifp), error); + + if_rele(ifp); + return (error); +} + +static int +rtnl_handle_newlink(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) +{ + struct nl_cloner *cloner; + int error; + + struct nl_parsed_link attrs = {}; + error = nl_parse_nlmsg(hdr, &ifmsg_parser, npt, &attrs); + if (error != 0) + return (error); + + if (attrs.ifla_ifname == NULL || strlen(attrs.ifla_ifname) == 0) { + /* Applications like ip(8) verify RTM_NEWLINK existance + * by calling it with empty arguments. Always return "innocent" + * error. + */ + NLMSG_REPORT_ERR_MSG(npt, "empty IFLA_IFNAME attribute"); + return (EPERM); + } + + if (attrs.ifla_cloner == NULL || strlen(attrs.ifla_cloner) == 0) { + NLMSG_REPORT_ERR_MSG(npt, "empty IFLA_INFO_KIND attribute"); + return (EINVAL); + } + + sx_slock(&rtnl_cloner_lock); + SLIST_FOREACH(cloner, &nl_cloners, next) { + if (!strcmp(attrs.ifla_cloner, cloner->name)) { + error = cloner->create_f(&attrs, nlp, npt); + sx_sunlock(&rtnl_cloner_lock); + return (error); + } + } + sx_sunlock(&rtnl_cloner_lock); + + /* TODO: load cloner module if not exists & privilege permits */ + NLMSG_REPORT_ERR_MSG(npt, "interface type %s not supported", attrs.ifla_cloner); + return (ENOTSUP); + + return (error); +} + +/* + +{ifa_family=AF_INET, ifa_prefixlen=8, ifa_flags=IFA_F_PERMANENT, ifa_scope=RT_SCOPE_HOST, ifa_index=if_nametoindex("lo")}, + [ + {{nla_len=8, nla_type=IFA_ADDRESS}, inet_addr("127.0.0.1")}, + {{nla_len=8, nla_type=IFA_LOCAL}, inet_addr("127.0.0.1")}, + {{nla_len=7, nla_type=IFA_LABEL}, "lo"}, + {{nla_len=8, nla_type=IFA_FLAGS}, IFA_F_PERMANENT}, + {{nla_len=20, nla_type=IFA_CACHEINFO}, {ifa_prefered=4294967295, ifa_valid=4294967295, cstamp=3619, tstamp=3619}}]}, +--- + +{{len=72, type=RTM_NEWADDR, flags=NLM_F_MULTI, seq=1642191126, pid=566735}, + {ifa_family=AF_INET6, ifa_prefixlen=96, ifa_flags=IFA_F_PERMANENT, ifa_scope=RT_SCOPE_UNIVERSE, ifa_index=if_nametoindex("virbr0")}, + [ + {{nla_len=20, nla_type=IFA_ADDRESS}, inet_pton(AF_INET6, "2a01:4f8:13a:70c:ffff::1")}, + {{nla_len=20, nla_type=IFA_CACHEINFO}, {ifa_prefered=4294967295, ifa_valid=4294967295, cstamp=4283, tstamp=4283}}, + {{nla_len=8, nla_type=IFA_FLAGS}, IFA_F_PERMANENT}]}, +*/ + +static uint8_t +ifa_get_scope(const struct ifaddr *ifa) +{ + const struct sockaddr *sa; + uint8_t addr_scope = RT_SCOPE_UNIVERSE; + + sa = ifa->ifa_addr; + switch (sa->sa_family) { + case AF_INET: + { + struct in_addr addr; + addr = ((const struct sockaddr_in *)sa)->sin_addr; + if (IN_LOOPBACK(addr.s_addr)) + addr_scope = RT_SCOPE_HOST; + else if (IN_LINKLOCAL(addr.s_addr)) + addr_scope = RT_SCOPE_LINK; + break; + } + case AF_INET6: + { + const struct in6_addr *addr; + addr = &((const struct sockaddr_in6 *)sa)->sin6_addr; + if 
(IN6_IS_ADDR_LOOPBACK(addr)) + addr_scope = RT_SCOPE_HOST; + else if (IN6_IS_ADDR_LINKLOCAL(addr)) + addr_scope = RT_SCOPE_LINK; + break; + } + } + + return (addr_scope); +} + +static uint8_t +inet6_get_plen(const struct in6_addr *addr) +{ + + return (bitcount32(addr->s6_addr32[0]) + bitcount32(addr->s6_addr32[1]) + + bitcount32(addr->s6_addr32[2]) + bitcount32(addr->s6_addr32[3])); +} + +static uint8_t +get_sa_plen(const struct sockaddr *sa) +{ + const struct in6_addr *paddr6; + const struct in_addr *paddr; + + switch (sa->sa_family) { + case AF_INET: + if (sa == NULL) + return (32); + paddr = &(((const struct sockaddr_in *)sa)->sin_addr); + return bitcount32(paddr->s_addr);; + case AF_INET6: + if (sa == NULL) + return (128); + paddr6 = &(((const struct sockaddr_in6 *)sa)->sin6_addr); + return inet6_get_plen(paddr6); + } + + return (0); +} + + +/* + * {'attrs': [('IFA_ADDRESS', '12.0.0.1'), + ('IFA_LOCAL', '12.0.0.1'), + ('IFA_LABEL', 'eth10'), + ('IFA_FLAGS', 128), + ('IFA_CACHEINFO', {'ifa_preferred': 4294967295, 'ifa_valid': 4294967295, 'cstamp': 63745746, 'tstamp': 63745746})], + */ +static bool +dump_iface_addr(struct nl_writer *nw, struct ifnet *ifp, struct ifaddr *ifa, + const struct nlmsghdr *hdr) +{ + struct ifaddrmsg *ifamsg; + struct sockaddr *sa = ifa->ifa_addr; + + NL_LOG(LOG_DEBUG3, "dumping ifa %p type %s(%d) for interface %s", + ifa, rib_print_family(sa->sa_family), sa->sa_family, if_name(ifp)); + + if (!nlmsg_reply(nw, hdr, sizeof(struct ifaddrmsg))) + goto enomem; + + ifamsg = nlmsg_reserve_object(nw, struct ifaddrmsg); + ifamsg->ifa_family = sa->sa_family; + ifamsg->ifa_prefixlen = get_sa_plen(ifa->ifa_netmask); + ifamsg->ifa_flags = 0; // ifa_flags is useless + ifamsg->ifa_scope = ifa_get_scope(ifa); + ifamsg->ifa_index = ifp->if_index; + + struct sockaddr *dst_sa = ifa->ifa_dstaddr; + if ((dst_sa == NULL) || (dst_sa->sa_family != sa->sa_family)) + dst_sa = sa; + dump_sa(nw, IFA_ADDRESS, dst_sa); + dump_sa(nw, IFA_LOCAL, sa); + nlattr_add_string(nw, IFA_LABEL, if_name(ifp)); + + uint32_t val = 0; // ifa->ifa_flags; + nlattr_add_u32(nw, IFA_FLAGS, val); + + if (nlmsg_end(nw)) + return (true); +enomem: + NL_LOG(LOG_DEBUG, "Failed to dump ifa type %s(%d) for interface %s", + rib_print_family(sa->sa_family), sa->sa_family, if_name(ifp)); + nlmsg_abort(nw); + return (false); +} + +static int +rtnl_handle_getaddr(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) +{ + struct ifaddr *ifa; + struct ifnet *ifp; + int error = 0; + + struct netlink_walkargs wa = { + .so = nlp, + .nw = npt->nw, + .hdr.nlmsg_pid = hdr->nlmsg_pid, + .hdr.nlmsg_seq = hdr->nlmsg_seq, + .hdr.nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI, + .hdr.nlmsg_type = NL_RTM_NEWADDR, + }; + + NL_LOG(LOG_DEBUG2, "Start dump"); + + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { + CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (wa.family != 0 && wa.family != ifa->ifa_addr->sa_family) + continue; + if (ifa->ifa_addr->sa_family == AF_LINK) + continue; + wa.count++; + if (!dump_iface_addr(wa.nw, ifp, ifa, &wa.hdr)) { + error = ENOMEM; + break; + } + wa.dumped++; + } + if (error != 0) + break; + } + + NL_LOG(LOG_DEBUG2, "End dump, iterated %d dumped %d", wa.count, wa.dumped); + + if (!nlmsg_end_dump(wa.nw, error, &wa.hdr)) { + NL_LOG(LOG_DEBUG, "Unable to finalize the dump"); + return (ENOMEM); + } + + return (error); +} + +static void +rtnl_handle_ifaddr(void *arg __unused, struct ifaddr *ifa, int cmd) +{ + struct nlmsghdr hdr = {}; + struct nl_writer nw = {}; + uint32_t group = 0; + + switch 
(ifa->ifa_addr->sa_family) { + case AF_INET: + group = RTNLGRP_IPV4_IFADDR; + break; + case AF_INET6: + group = RTNLGRP_IPV6_IFADDR; + break; + default: + NL_LOG(LOG_DEBUG2, "ifa notification for unknown AF: %d", + ifa->ifa_addr->sa_family); + return; + } + + if (!nl_has_listeners(NETLINK_ROUTE, group)) + return; + + if (!nlmsg_get_group_writer(&nw, NLMSG_LARGE, NETLINK_ROUTE, group)) { + NL_LOG(LOG_DEBUG, "error allocating group writer"); + return; + } + + hdr.nlmsg_type = (cmd == RTM_DELETE) ? NL_RTM_DELADDR : NL_RTM_NEWADDR; + + dump_iface_addr(&nw, ifa->ifa_ifp, ifa, &hdr); + nlmsg_flush(&nw); +} + +static void +rtnl_handle_ifattach(void *arg, struct ifnet *ifp) +{ + struct nlmsghdr hdr = { .nlmsg_type = NL_RTM_NEWLINK }; + struct nl_writer nw = {}; + + if (!nl_has_listeners(NETLINK_ROUTE, RTNLGRP_LINK)) + return; + + if (!nlmsg_get_group_writer(&nw, NLMSG_LARGE, NETLINK_ROUTE, RTNLGRP_LINK)) { + NL_LOG(LOG_DEBUG, "error allocating mbuf"); + return; + } + dump_iface(&nw, ifp, &hdr); + nlmsg_flush(&nw); +} + +static void +rtnl_handle_ifdetach(void *arg, struct ifnet *ifp) +{ + struct nlmsghdr hdr = { .nlmsg_type = NL_RTM_DELLINK }; + struct nl_writer nw = {}; + + if (!nl_has_listeners(NETLINK_ROUTE, RTNLGRP_LINK)) + return; + + if (!nlmsg_get_group_writer(&nw, NLMSG_LARGE, NETLINK_ROUTE, RTNLGRP_LINK)) { + NL_LOG(LOG_DEBUG, "error allocating mbuf"); + return; + } + dump_iface(&nw, ifp, &hdr); + nlmsg_flush(&nw); +} + +static const struct rtnl_cmd_handler cmd_handlers[] = { + { + .cmd = NL_RTM_GETLINK, + .name = "RTM_GETLINK", + .cb = &rtnl_handle_getlink, + .flags = RTNL_F_NOEPOCH, + }, + { + .cmd = NL_RTM_DELLINK, + .name = "RTM_DELLINK", + .cb = &rtnl_handle_dellink, + .priv = PRIV_NET_IFDESTROY, + .flags = RTNL_F_NOEPOCH, + }, + { + .cmd = NL_RTM_NEWLINK, + .name = "RTM_NEWLINK", + .cb = &rtnl_handle_newlink, + .priv = PRIV_NET_IFCREATE, + .flags = RTNL_F_NOEPOCH, + }, + { + .cmd = NL_RTM_GETADDR, + .name = "RTM_GETADDR", + .cb = &rtnl_handle_getaddr, + }, + { + .cmd = NL_RTM_NEWADDR, + .name = "RTM_NEWADDR", + .cb = &rtnl_handle_getaddr, + }, + { + .cmd = NL_RTM_DELADDR, + .name = "RTM_DELADDR", + .cb = &rtnl_handle_getaddr, + }, +}; + +static const struct nlhdr_parser *all_parsers[] = { &ifmsg_parser }; + +void +rtnl_iface_add_cloner(struct nl_cloner *cloner) +{ + sx_xlock(&rtnl_cloner_lock); + SLIST_INSERT_HEAD(&nl_cloners, cloner, next); + sx_xunlock(&rtnl_cloner_lock); +} + +void rtnl_iface_del_cloner(struct nl_cloner *cloner) +{ + sx_xlock(&rtnl_cloner_lock); + SLIST_REMOVE(&nl_cloners, cloner, nl_cloner, next); + sx_xunlock(&rtnl_cloner_lock); +} + +void +rtnl_ifaces_init(void) +{ + ifattach_event = EVENTHANDLER_REGISTER( + ifnet_arrival_event, rtnl_handle_ifattach, NULL, + EVENTHANDLER_PRI_ANY); + ifdetach_event = EVENTHANDLER_REGISTER( + ifnet_departure_event, rtnl_handle_ifdetach, NULL, + EVENTHANDLER_PRI_ANY); + ifaddr_event = EVENTHANDLER_REGISTER( + rt_addrmsg, rtnl_handle_ifaddr, NULL, + EVENTHANDLER_PRI_ANY); + NL_VERIFY_PARSERS(all_parsers); + rtnl_iface_drivers_register(); + rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers)); +} + +void +rtnl_ifaces_destroy(void) +{ + EVENTHANDLER_DEREGISTER(ifnet_arrival_event, ifattach_event); + EVENTHANDLER_DEREGISTER(ifnet_departure_event, ifdetach_event); + EVENTHANDLER_DEREGISTER(rt_addrmsg, ifaddr_event); +} diff --git a/sys/netlink/route/iface_drivers.c b/sys/netlink/route/iface_drivers.c new file mode 100644 index 000000000000..ccc8f2184fa3 --- /dev/null +++ b/sys/netlink/route/iface_drivers.c @@ -0,0 +1,165 @@ 
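/*
 * Illustrative sketch (not part of this patch): a minimal userland
 * RTM_GETLINK dump exercising the handler added in iface.c above; it sends
 * the same 32-byte request shown in the strace excerpt there.  The
 * <netlink/netlink.h> / <netlink/netlink_route.h> include paths and the
 * 8 KB receive buffer are assumptions; NLMSG_ERROR replies are not decoded.
 */
#include <sys/socket.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <netlink/netlink.h>
#include <netlink/netlink_route.h>

int
request_link_dump(void)
{
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd == -1)
		return (-1);

	/* RTM_GETLINK dump request: netlink header + zeroed ifinfomsg. */
	struct {
		struct nlmsghdr hdr;
		struct ifinfomsg ifm;
	} req;
	memset(&req, 0, sizeof(req));
	req.hdr.nlmsg_len = sizeof(req);
	req.hdr.nlmsg_type = RTM_GETLINK;	/* NL_RTM_GETLINK in-kernel */
	req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
	req.hdr.nlmsg_seq = 1;

	struct sockaddr_nl snl = { .nl_family = AF_NETLINK };
	if (sendto(fd, &req, sizeof(req), 0, (struct sockaddr *)&snl,
	    sizeof(snl)) == -1) {
		close(fd);
		return (-1);
	}

	/* Replies arrive as an NLM_F_MULTI stream terminated by NLMSG_DONE. */
	char buf[8192];
	bool done = false;
	ssize_t n;
	while (!done && (n = recv(fd, buf, sizeof(buf), 0)) > 0) {
		int len = (int)n;
		struct nlmsghdr *h = (struct nlmsghdr *)buf;
		for (; NLMSG_OK(h, len); h = NLMSG_NEXT(h, len)) {
			if (h->nlmsg_type == NLMSG_DONE) {
				done = true;
				break;
			}
			/* h is one RTM_NEWLINK reply; walk its IFLA_* attributes. */
		}
	}
	close(fd);
	return (0);
}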
+/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); +#include "opt_inet.h" +#include "opt_inet6.h" +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include /* scope deembedding */ + +#define DEBUG_MOD_NAME nl_iface_drivers +#define DEBUG_MAX_LEVEL LOG_DEBUG3 +#include +_DECLARE_DEBUG(LOG_DEBUG); + +/* + * + * {len=76, type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE, seq=1662892737, pid=0}, + * {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}, + * [ + * {{nla_len=8, nla_type=IFLA_LINK}, 2}, + * {{nla_len=12, nla_type=IFLA_IFNAME}, "xvlan22"}, + * {{nla_len=24, nla_type=IFLA_LINKINFO}, + * [ + * {{nla_len=8, nla_type=IFLA_INFO_KIND}, "vlan"...}, + * {{nla_len=12, nla_type=IFLA_INFO_DATA}, "\x06\x00\x01\x00\x16\x00\x00\x00"}]}]}, iov_len=76}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 76 + */ + +struct nl_parsed_vlan { + uint16_t vlan_id; + uint16_t vlan_proto; + struct ifla_vlan_flags vlan_flags; +}; + +#define _OUT(_field) offsetof(struct nl_parsed_vlan, _field) +static const struct nlattr_parser nla_p_vlan[] = { + { .type = IFLA_VLAN_ID, .off = _OUT(vlan_id), .cb = nlattr_get_uint16 }, + { .type = IFLA_VLAN_FLAGS, .off = _OUT(vlan_flags), .cb = nlattr_get_nla }, + { .type = IFLA_VLAN_PROTOCOL, .off = _OUT(vlan_proto), .cb = nlattr_get_uint16 }, +}; +#undef _OUT +NL_DECLARE_ATTR_PARSER(vlan_parser, nla_p_vlan); + +static int +create_vlan(struct nl_parsed_link *lattrs, struct nlpcb *nlp, struct nl_pstate *npt) +{ + struct epoch_tracker et; + struct ifnet *ifp; + int error; + + /* + * lattrs.ifla_ifname is the new interface name + * lattrs.ifi_index contains parent interface index + * lattrs.ifla_idata contains un-parsed vlan data + */ + + struct nl_parsed_vlan attrs = { + .vlan_id = 0xFEFE, + .vlan_proto = ETHERTYPE_VLAN + }; + NLP_LOG(LOG_DEBUG3, nlp, "nested: %p len %d", lattrs->ifla_idata, lattrs->ifla_idata->nla_len); + + if (lattrs->ifla_idata == NULL) { + NLMSG_REPORT_ERR_MSG(npt, "vlan id is 
required, guessing not supported"); + return (ENOTSUP); + } + + error = nl_parse_nested(lattrs->ifla_idata, &vlan_parser, npt, &attrs); + if (error != 0) + return (error); + if (attrs.vlan_id > 4095) { + NLMSG_REPORT_ERR_MSG(npt, "Invalid VID: %d", attrs.vlan_id); + return (EINVAL); + } + if (attrs.vlan_proto != ETHERTYPE_VLAN && attrs.vlan_proto != ETHERTYPE_QINQ) { + NLMSG_REPORT_ERR_MSG(npt, "Unsupported ethertype: 0x%04X", attrs.vlan_proto); + return (ENOTSUP); + } + + NET_EPOCH_ENTER(et); + ifp = ifnet_byindex_ref(lattrs->ifi_index); + NET_EPOCH_EXIT(et); + if (ifp == NULL) { + NLP_LOG(LOG_DEBUG, nlp, "unable to find parent interface %u", + lattrs->ifi_index); + return (ENOENT); + } + + /* Waiting till if_clone changes lands */ +/* + struct vlanreq params = { + .vlr_tag = attrs.vlan_id, + .vlr_proto = attrs.vlan_proto, + }; +*/ + int ifname_len = strlen(lattrs->ifla_ifname) + 1; + error = if_clone_create(lattrs->ifla_ifname, ifname_len, (char *)NULL); + + NLP_LOG(LOG_DEBUG2, nlp, "clone for %s returned %d", lattrs->ifla_ifname, error); + + if_rele(ifp); + return (error); +} + +static struct nl_cloner vlan_cloner = { + .name = "vlan", + .create_f = create_vlan, + +}; + +static const struct nlhdr_parser *all_parsers[] = { &vlan_parser }; + +void +rtnl_iface_drivers_register(void) +{ + rtnl_iface_add_cloner(&vlan_cloner); + NL_VERIFY_PARSERS(all_parsers); +} + + diff --git a/sys/netlink/route/ifaddrs.h b/sys/netlink/route/ifaddrs.h new file mode 100644 index 000000000000..e2013cb266d7 --- /dev/null +++ b/sys/netlink/route/ifaddrs.h @@ -0,0 +1,90 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Interface address-related (RTM_ADDR) message header and attributes. 
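+ * The struct ifaddrmsg layout and IFA_* constants mirror Linux rtnetlink, so existing userland parsers can be reused as-is.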
+ */ + +#ifndef _NETLINK_ROUTE_IFADDRS_H_ +#define _NETLINK_ROUTE_IFADDRS_H_ + +/* Base header for all of the relevant messages */ +struct ifaddrmsg { + uint8_t ifa_family; /* Address family */ + uint8_t ifa_prefixlen; /* Prefix length */ + uint8_t ifa_flags; /* Address-specific flags */ + uint8_t ifa_scope; /* Address scope */ + uint32_t ifa_index; /* Link ifindex */ +}; + +#ifndef _KERNEL +#define _NL_IFA_HDRLEN ((int)sizeof(struct ifaddrmsg)) +#define IFA_RTA(_ifa) ((struct rtattr *)(NL_ITEM_DATA(_ifa, _NL_IFA_HDRLEN))) +#define IFA_PAYLOAD(_hdr) NLMSG_PAYLOAD(_hdr, _NL_IFA_HDRLEN) +#endif + +/* Defined attributes */ +enum { + IFA_UNSPEC, + IFA_ADDRESS = 1, /* binary, prefix address (destination for p2p) */ + IFA_LOCAL = 2, /* binary, interface address */ + IFA_LABEL = 3, /* not supported */ + IFA_BROADCAST = 4, /* binary, broadcast ifa */ + IFA_ANYCAST = 5, /* not supported */ + IFA_CACHEINFO = 6, /* not supported */ + IFA_MULTICAST = 7, /* not supported */ + IFA_FLAGS = 8, /* not supported */ + IFA_RT_PRIORITY = 9, /* not supported */ + IFA_TARGET_NETNSID = 10, /* not supported */ + __IFA_MAX, +}; +#define IFA_MAX (__IFA_MAX - 1) + +/* IFA_FLAGS attribute flags */ +#define IFA_F_SECONDARY 0x0001 +#define IFA_F_TEMPORARY IFA_F_SECONDARY +#define IFA_F_NODAD 0x0002 +#define IFA_F_OPTIMISTIC 0x0004 +#define IFA_F_DADFAILED 0x0008 +#define IFA_F_HOMEADDRESS 0x0010 +#define IFA_F_DEPRECATED 0x0020 +#define IFA_F_TENTATIVE 0x0040 +#define IFA_F_PERMANENT 0x0080 +#define IFA_F_MANAGETEMPADDR 0x0100 +#define IFA_F_NOPREFIXROUTE 0x0200 +#define IFA_F_MCAUTOJOIN 0x0400 +#define IFA_F_STABLE_PRIVACY 0x0800 + +/* IFA_CACHEINFO value */ +struct ifa_cacheinfo { + uint32_t ifa_prefered; + uint32_t ifa_valid; + uint32_t cstamp; + uint32_t tstamp; +}; + +#endif diff --git a/sys/netlink/route/interface.h b/sys/netlink/route/interface.h new file mode 100644 index 000000000000..cae763cc4a58 --- /dev/null +++ b/sys/netlink/route/interface.h @@ -0,0 +1,245 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Interface-related (RTM_LINK) message header and attributes. 
+ */ + +#ifndef _NETLINK_ROUTE_INTERFACE_H_ +#define _NETLINK_ROUTE_INTERFACE_H_ + +/* Base header for all of the relevant messages */ +struct ifinfomsg { + unsigned char ifi_family; /* not used */ + unsigned char __ifi_pad; + unsigned short ifi_type; /* ARPHRD_* */ + int ifi_index; /* Inteface index */ + unsigned ifi_flags; /* IFF_* flags */ + unsigned ifi_change; /* IFF_* change mask */ +}; + +#ifndef _KERNEL +/* Compatilbility helpers */ +#define _IFINFO_HDRLEN ((int)sizeof(struct ifinfomsg)) +#define IFLA_RTA(_ifi) ((struct rtattr *)NL_ITEM_DATA(_ifi, _IFINFO_HDRLEN)) +#define IFLA_PAYLOAD(_ifi) NLMSG_PAYLOAD(_ifi, _IFINFO_HDRLEN) +#endif + +enum { + IFLA_UNSPEC = 0, + IFLA_ADDRESS = 1, /* binary: Link-level address (MAC) */ +#define IFLA_ADDRESS IFLA_ADDRESS + IFLA_BROADCAST = 2, /* binary: link-level broadcast address */ +#define IFLA_BROADCAST IFLA_BROADCAST + IFLA_IFNAME = 3, /* string: Interface name */ +#define IFLA_IFNAME IFLA_IFNAME + IFLA_MTU = 4, /* u32: Current interface L3 mtu */ +#define IFLA_MTU IFLA_MTU + IFLA_LINK = 5, /* u32: interface index */ +#define IFLA_LINK IFLA_LINK + IFLA_QDISC = 6, /* string: Queing policy (not supported) */ +#define IFLA_QDISC IFLA_QDISC + IFLA_STATS = 7, /* Interface counters */ +#define IFLA_STATS IFLA_STATS + IFLA_COST = 8, /* not supported */ +#define IFLA_COST IFLA_COST + IFLA_PRIORITY = 9, /* not supported */ +#define IFLA_PRIORITY IFLA_PRIORITY + IFLA_MASTER = 10, /* u32: parent interface ifindex */ +#define IFLA_MASTER IFLA_MASTER + IFLA_WIRELESS = 11, /* not supported */ +#define IFLA_WIRELESS IFLA_WIRELESS + IFLA_PROTINFO = 12, /* protocol-specific data */ +#define IFLA_PROTINFO IFLA_PROTINFO + IFLA_TXQLEN = 13, /* u32: transmit queue length */ +#define IFLA_TXQLEN IFLA_TXQLEN + IFLA_MAP = 14, /* not supported */ +#define IFLA_MAP IFLA_MAP + IFLA_WEIGHT = 15, /* not supported */ +#define IFLA_WEIGHT IFLA_WEIGHT + IFLA_OPERSTATE = 16, /* u8: ifOperStatus per RFC 2863 */ +#define IFLA_OPERSTATE IFLA_OPERSTATE + IFLA_LINKMODE = 17, /* u8: ifmedia (not supported) */ +#define IFLA_LINKMODE IFLA_LINKMODE + IFLA_LINKINFO = 18, /* nested: IFLA_INFO_ */ +#define IFLA_LINKINFO IFLA_LINKINFO + IFLA_NET_NS_PID = 19, /* u32: vnet id (not supported) */ +#define IFLA_NET_NS_PID IFLA_NET_NS_PID + IFLA_IFALIAS = 20, /* not supported */ +#define IFLA_IFALIAS IFLA_IFALIAS + IFLA_NUM_VF = 21, /* not supported */ +#define IFLA_NUM_VF IFLA_NUM_VF + IFLA_VFINFO_LIST= 22, /* not supported */ +#define IFLA_VFINFO_LIST IFLA_VFINFO_LIST + IFLA_STATS64 = 23, /* rtnl_link_stats64: iface stats */ +#define IFLA_STATS64 IFLA_STATS64 + IFLA_VF_PORTS, + IFLA_PORT_SELF, + IFLA_AF_SPEC, + IFLA_GROUP, /* Group the device belongs to */ + IFLA_NET_NS_FD, + IFLA_EXT_MASK, /* Extended info mask, VFs, etc */ + IFLA_PROMISCUITY, /* Promiscuity count: > 0 means acts PROMISC */ +#define IFLA_PROMISCUITY IFLA_PROMISCUITY + IFLA_NUM_TX_QUEUES, + IFLA_NUM_RX_QUEUES, + IFLA_CARRIER, + IFLA_PHYS_PORT_ID, + IFLA_CARRIER_CHANGES, + IFLA_PHYS_SWITCH_ID, + IFLA_LINK_NETNSID, + IFLA_PHYS_PORT_NAME, + IFLA_PROTO_DOWN, + IFLA_GSO_MAX_SEGS, + IFLA_GSO_MAX_SIZE, + IFLA_PAD, + IFLA_XDP, + IFLA_EVENT, + IFLA_NEW_NETNSID, + IFLA_IF_NETNSID, + IFLA_TARGET_NETNSID = IFLA_IF_NETNSID, /* new alias */ + IFLA_CARRIER_UP_COUNT, + IFLA_CARRIER_DOWN_COUNT, + IFLA_NEW_IFINDEX, + IFLA_MIN_MTU, + IFLA_MAX_MTU, + IFLA_PROP_LIST, + IFLA_ALT_IFNAME, /* Alternative ifname */ + IFLA_PERM_ADDRESS, + IFLA_PROTO_DOWN_REASON, + __IFLA_MAX +}; +#define IFLA_MAX (__IFLA_MAX - 1) + +/* + * Attributes that can be used 
as filters: + * IFLA_IFNAME, IFLA_GROUP, IFLA_ALT_IFNAME + * Headers that can be used as filters: + * ifi_index, ifi_type + */ + +/* + * IFLA_OPERSTATE. + * The values below represent the possible + * states of ifOperStatus defined by RFC 2863 + */ +enum { + IF_OPER_UNKNOWN = 0, /* status can not be determined */ + IF_OPER_NOTPRESENT = 1, /* some (hardware) component not present */ + IF_OPER_DOWN = 2, /* down */ + IF_OPER_LOWERLAYERDOWN = 3, /* some lower-level interface is down */ + IF_OPER_TESTING = 4, /* in some test mode */ + IF_OPER_DORMANT = 5, /* "up" but waiting for some condition (802.1X) */ + IF_OPER_UP = 6, /* ready to pass packets */ +}; + +/* IFLA_STATS */ +struct rtnl_link_stats { + uint32_t rx_packets; /* total RX packets (IFCOUNTER_IPACKETS) */ + uint32_t tx_packets; /* total TX packets (IFCOUNTER_OPACKETS) */ + uint32_t rx_bytes; /* total RX bytes (IFCOUNTER_IBYTES) */ + uint32_t tx_bytes; /* total TX bytes (IFCOUNTER_OBYTES) */ + uint32_t rx_errors; /* RX errors (IFCOUNTER_IERRORS) */ + uint32_t tx_errors; /* RX errors (IFCOUNTER_OERRORS) */ + uint32_t rx_dropped; /* RX drop (no space in ring/no bufs) (IFCOUNTER_IQDROPS) */ + uint32_t tx_dropped; /* TX drop (IFCOUNTER_OQDROPS) */ + uint32_t multicast; /* RX multicast packets (IFCOUNTER_IMCASTS) */ + uint32_t collisions; /* not supported */ + uint32_t rx_length_errors; /* not supported */ + uint32_t rx_over_errors; /* not supported */ + uint32_t rx_crc_errors; /* not supported */ + uint32_t rx_frame_errors; /* not supported */ + uint32_t rx_fifo_errors; /* not supported */ + uint32_t rx_missed_errors; /* not supported */ + uint32_t tx_aborted_errors; /* not supported */ + uint32_t tx_carrier_errors; /* not supported */ + uint32_t tx_fifo_errors; /* not supported */ + uint32_t tx_heartbeat_errors; /* not supported */ + uint32_t tx_window_errors; /* not supported */ + uint32_t rx_compressed; /* not supported */ + uint32_t tx_compressed; /* not supported */ + uint32_t rx_nohandler; /* dropped due to no proto handler (IFCOUNTER_NOPROTO) */ +}; + +/* IFLA_STATS64 */ +struct rtnl_link_stats64 { + uint64_t rx_packets; /* total RX packets (IFCOUNTER_IPACKETS) */ + uint64_t tx_packets; /* total TX packets (IFCOUNTER_OPACKETS) */ + uint64_t rx_bytes; /* total RX bytes (IFCOUNTER_IBYTES) */ + uint64_t tx_bytes; /* total TX bytes (IFCOUNTER_OBYTES) */ + uint64_t rx_errors; /* RX errors (IFCOUNTER_IERRORS) */ + uint64_t tx_errors; /* RX errors (IFCOUNTER_OERRORS) */ + uint64_t rx_dropped; /* RX drop (no space in ring/no bufs) (IFCOUNTER_IQDROPS) */ + uint64_t tx_dropped; /* TX drop (IFCOUNTER_OQDROPS) */ + uint64_t multicast; /* RX multicast packets (IFCOUNTER_IMCASTS) */ + uint64_t collisions; /* not supported */ + uint64_t rx_length_errors; /* not supported */ + uint64_t rx_over_errors; /* not supported */ + uint64_t rx_crc_errors; /* not supported */ + uint64_t rx_frame_errors; /* not supported */ + uint64_t rx_fifo_errors; /* not supported */ + uint64_t rx_missed_errors; /* not supported */ + uint64_t tx_aborted_errors; /* not supported */ + uint64_t tx_carrier_errors; /* not supported */ + uint64_t tx_fifo_errors; /* not supported */ + uint64_t tx_heartbeat_errors; /* not supported */ + uint64_t tx_window_errors; /* not supported */ + uint64_t rx_compressed; /* not supported */ + uint64_t tx_compressed; /* not supported */ + uint64_t rx_nohandler; /* dropped due to no proto handler (IFCOUNTER_NOPROTO) */ +}; + +/* IFLA_LINKINFO child nlattr types */ +enum { + IFLA_INFO_UNSPEC, + IFLA_INFO_KIND = 1, /* string, link type ("vlan") */ 
+ IFLA_INFO_DATA = 2, /* Per-link-type custom data */ + IFLA_INFO_XSTATS = 3, + IFLA_INFO_SLAVE_KIND = 4, + IFLA_INFO_SLAVE_DATA = 5, + __IFLA_INFO_MAX, +}; +#define IFLA_INFO_MAX (__IFLA_INFO_MAX - 1) + +/* IFLA_INFO_DATA vlan attributes */ +enum { + IFLA_VLAN_UNSPEC, + IFLA_VLAN_ID, + IFLA_VLAN_FLAGS, + IFLA_VLAN_EGRESS_QOS, + IFLA_VLAN_INGRESS_QOS, + IFLA_VLAN_PROTOCOL, + __IFLA_VLAN_MAX, +}; + +#define IFLA_VLAN_MAX (__IFLA_VLAN_MAX - 1) +struct ifla_vlan_flags { + uint32_t flags; + uint32_t mask; +}; + +#endif diff --git a/sys/netlink/route/neigh.c b/sys/netlink/route/neigh.c new file mode 100644 index 000000000000..02ad138240a2 --- /dev/null +++ b/sys/netlink/route/neigh.c @@ -0,0 +1,571 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); +#include "opt_inet.h" +#include "opt_inet6.h" +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include /* nd6.h requires this */ +#include /* nd6 state machine */ +#include /* scope deembedding */ + +#define DEBUG_MOD_NAME nl_neigh +#define DEBUG_MAX_LEVEL LOG_DEBUG3 +#include +_DECLARE_DEBUG(LOG_DEBUG); + +static int lle_families[] = { AF_INET, AF_INET6 }; + +static eventhandler_tag lle_event_p; + +struct netlink_walkargs { + struct nl_writer *nw; + struct nlmsghdr hdr; + struct nlpcb *so; + struct ifnet *ifp; + int family; + int error; + int count; + int dumped; +}; + +static int +lle_state_to_nl_state(int family, struct llentry *lle) +{ + int state = lle->ln_state; + + switch (family) { + case AF_INET: + if (lle->la_flags & (LLE_STATIC | LLE_IFADDR)) + state = 1; + switch (state) { + case 0: /* ARP_LLINFO_INCOMPLETE */ + return (NUD_INCOMPLETE); + case 1: /* ARP_LLINFO_REACHABLE */ + return (NUD_REACHABLE); + case 2: /* ARP_LLINFO_VERIFY */ + return (NUD_PROBE); + } + break; + case AF_INET6: + switch (state) { + case ND6_LLINFO_INCOMPLETE: + return (NUD_INCOMPLETE); + case ND6_LLINFO_REACHABLE: + return (NUD_REACHABLE); + case ND6_LLINFO_STALE: + return (NUD_STALE); + case ND6_LLINFO_DELAY: + return (NUD_DELAY); + case ND6_LLINFO_PROBE: + return (NUD_PROBE); + } + break; + } + + return (NUD_NONE); +} + +static uint32_t +lle_flags_to_nl_flags(const struct llentry *lle) +{ + uint32_t nl_flags = 0; + + if (lle->la_flags & LLE_IFADDR) + nl_flags |= NTF_SELF; + if (lle->la_flags & LLE_PUB) + nl_flags |= NTF_PROXY; + if (lle->la_flags & LLE_STATIC) + nl_flags |= NTF_STICKY; + if (lle->ln_router != 0) + nl_flags |= NTF_ROUTER; + + return (nl_flags); +} + +static int +dump_lle_locked(struct llentry *lle, void *arg) +{ + struct netlink_walkargs *wa = (struct netlink_walkargs *)arg; + struct nlmsghdr *hdr = &wa->hdr; + struct nl_writer *nw = wa->nw; + struct ndmsg *ndm; + union { + struct in_addr in; + struct in6_addr in6; + } addr; + + IF_DEBUG_LEVEL(LOG_DEBUG2) { + char llebuf[NHOP_PRINT_BUFSIZE]; + llentry_print_buf_lltable(lle, llebuf, sizeof(llebuf)); + NL_LOG(LOG_DEBUG2, "dumping %s", llebuf); + } + + if (!nlmsg_reply(nw, hdr, sizeof(struct ndmsg))) + goto enomem; + + ndm = nlmsg_reserve_object(nw, struct ndmsg); + ndm->ndm_family = wa->family; + ndm->ndm_ifindex = wa->ifp->if_index; + ndm->ndm_state = lle_state_to_nl_state(wa->family, lle); + ndm->ndm_flags = lle_flags_to_nl_flags(lle); + + switch (wa->family) { +#ifdef INET + case AF_INET: + addr.in = lle->r_l3addr.addr4; + nlattr_add(nw, NDA_DST, 4, &addr); + break; +#endif +#ifdef INET6 + case AF_INET6: + addr.in6 = lle->r_l3addr.addr6; + in6_clearscope(&addr.in6); + nlattr_add(nw, NDA_DST, 16, &addr); + break; +#endif + } + + if (lle->r_flags & RLLE_VALID) { + /* Has L2 */ + int addrlen = wa->ifp->if_addrlen; + nlattr_add(nw, NDA_LLADDR, addrlen, lle->ll_addr); + } + + nlattr_add_u32(nw, NDA_PROBES, lle->la_asked); + + struct nda_cacheinfo *cache; + cache = nlmsg_reserve_attr(nw, NDA_CACHEINFO, struct nda_cacheinfo); + if (cache == NULL) + goto enomem; + /* TODO: provide confirmed/updated */ + cache->ndm_refcnt = lle->lle_refcnt; + + if (nlmsg_end(nw)) + return (0); +enomem: + NL_LOG(LOG_DEBUG, "unable to dump lle state (ENOMEM)"); + nlmsg_abort(nw); + return (ENOMEM); +} + +static int +dump_lle(struct lltable *llt, struct llentry *lle, void *arg) +{ + int error; + + LLE_RLOCK(lle); + error = dump_lle_locked(lle, arg); + LLE_RUNLOCK(lle); 
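/*
 * Worked example of the translation above (illustrative): an IPv4 entry with
 * LLE_STATIC set (e.g. one installed with "arp -s") is reported as
 * ndm_state = NUD_REACHABLE with NTF_STICKY in ndm_flags, while an entry that
 * is still resolving shows up as NUD_INCOMPLETE and, having no L2 data yet,
 * carries no NDA_LLADDR attribute. A userland consumer could classify it
 * roughly as:
 *
 *	if (ndm->ndm_state & NUD_REACHABLE)
 *		kind = (ndm->ndm_flags & NTF_STICKY) ? "static" : "reachable";
 *	else if (ndm->ndm_state & NUD_INCOMPLETE)
 *		kind = "incomplete";
 *	else
 *		kind = "stale/delay/probe";
 */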
+ return (error); +} + +static bool +dump_llt(struct lltable *llt, struct netlink_walkargs *wa) +{ + lltable_foreach_lle(llt, dump_lle, wa); + + return (true); +} + +static int +dump_llts_iface(struct netlink_walkargs *wa, struct ifnet *ifp, int family) +{ + int error = 0; + + wa->ifp = ifp; + for (int i = 0; i < sizeof(lle_families) / sizeof(int); i++) { + int fam = lle_families[i]; + struct lltable *llt = lltable_get(ifp, fam); + if (llt != NULL && (family == 0 || family == fam)) { + wa->count++; + wa->family = fam; + if (!dump_llt(llt, wa)) { + error = ENOMEM; + break; + } + wa->dumped++; + } + } + return (error); +} + +static int +dump_llts(struct netlink_walkargs *wa, struct ifnet *ifp, int family) +{ + NL_LOG(LOG_DEBUG, "Start dump ifp=%s family=%d", ifp ? if_name(ifp) : "NULL", family); + + wa->hdr.nlmsg_flags |= NLM_F_MULTI; + + if (ifp != NULL) { + dump_llts_iface(wa, ifp, family); + } else { + CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { + dump_llts_iface(wa, ifp, family); + } + } + + NL_LOG(LOG_DEBUG, "End dump, iterated %d dumped %d", wa->count, wa->dumped); + + if (!nlmsg_end_dump(wa->nw, wa->error, &wa->hdr)) { + NL_LOG(LOG_DEBUG, "Unable to add new message"); + return (ENOMEM); + } + + return (0); +} + +static int +get_lle(struct netlink_walkargs *wa, struct ifnet *ifp, int family, struct sockaddr *dst) +{ + struct lltable *llt = lltable_get(ifp, family); + if (llt == NULL) + return (ESRCH); + +#ifdef INET6 + if (dst->sa_family == AF_INET6) { + struct sockaddr_in6 *dst6 = (struct sockaddr_in6 *)dst; + + if (IN6_IS_SCOPE_LINKLOCAL(&dst6->sin6_addr)) + in6_set_unicast_scopeid(&dst6->sin6_addr, ifp->if_index); + } +#endif + struct llentry *lle = lla_lookup(llt, LLE_UNLOCKED, dst); + if (lle == NULL) + return (ESRCH); + + wa->ifp = ifp; + wa->family = family; + + return (dump_lle(llt, lle, wa)); +} + +struct nl_parsed_neigh { + struct sockaddr *nda_dst; + struct ifnet *nda_ifp; + struct nlattr *nda_lladdr; + uint32_t ndm_flags; + uint16_t ndm_state; + uint8_t ndm_family; +}; + +#define _IN(_field) offsetof(struct ndmsg, _field) +#define _OUT(_field) offsetof(struct nl_parsed_neigh, _field) +static struct nlfield_parser nlf_p_neigh[] = { + { .off_in = _IN(ndm_family), .off_out = _OUT(ndm_family), .cb = nlf_get_u8 }, + { .off_in = _IN(ndm_flags), .off_out = _OUT(ndm_flags), .cb = nlf_get_u8_u32 }, + { .off_in = _IN(ndm_state), .off_out = _OUT(ndm_state), .cb = nlf_get_u16 }, + { .off_in = _IN(ndm_ifindex), .off_out = _OUT(nda_ifp), .cb = nlf_get_ifpz }, +}; + +static struct nlattr_parser nla_p_neigh[] = { + { .type = NDA_DST, .off = _OUT(nda_dst), .cb = nlattr_get_ip }, + { .type = NDA_LLADDR, .off = _OUT(nda_lladdr), .cb = nlattr_get_nla }, + { .type = NDA_IFINDEX, .off = _OUT(nda_ifp), .cb = nlattr_get_ifp }, + { .type = NDA_FLAGS_EXT, .off = _OUT(ndm_flags), .cb = nlattr_get_uint32 }, +}; +#undef _IN +#undef _OUT +NL_DECLARE_PARSER(ndmsg_parser, struct ndmsg, nlf_p_neigh, nla_p_neigh); + + +/* + * type=RTM_NEWNEIGH, flags=NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE, seq=1661941473, pid=0}, + * {ndm_family=AF_INET6, ndm_ifindex=if_nametoindex("enp0s31f6"), ndm_state=NUD_PERMANENT, ndm_flags=0, ndm_type=RTN_UNSPEC}, + * [ + * {{nla_len=20, nla_type=NDA_DST}, inet_pton(AF_INET6, "2a01:4f8:13a:70c::3")}, + * {{nla_len=10, nla_type=NDA_LLADDR}, 20:4e:71:62:ae:f2}]}, iov_len=60} + */ + +static int +rtnl_handle_newneigh(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) +{ + int error; + + struct nl_parsed_neigh attrs = {}; + error = nl_parse_nlmsg(hdr, &ndmsg_parser, 
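/*
 * Userland sketch (illustrative; assumes the Linux-compatible constant names
 * exported to userland and a raw NETLINK_ROUTE socket): building an
 * RTM_NEWNEIGH request along the lines of the strace fragment quoted above,
 * here for an IPv4 entry. nla_put() is a hypothetical attribute-packing
 * helper returning the aligned attribute length; "em0" is an arbitrary name.
 *
 *	struct {
 *		struct nlmsghdr	hdr;
 *		struct ndmsg	ndm;
 *		char		attrs[64];
 *	} req = {
 *		.hdr.nlmsg_type = RTM_NEWNEIGH,
 *		.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE | NLM_F_EXCL,
 *		.ndm.ndm_family = AF_INET,
 *		.ndm.ndm_ifindex = if_nametoindex("em0"),
 *		.ndm.ndm_state = NUD_PERMANENT,
 *	};
 *	struct in_addr dst = { .s_addr = inet_addr("192.0.2.1") };
 *	uint8_t lladdr[6] = { 0x20, 0x4e, 0x71, 0x62, 0xae, 0xf2 };
 *	int off = 0;
 *	off += nla_put(req.attrs + off, NDA_DST, &dst, sizeof(dst));
 *	off += nla_put(req.attrs + off, NDA_LLADDR, lladdr, sizeof(lladdr));
 *	req.hdr.nlmsg_len = NLMSG_HDRLEN + sizeof(struct ndmsg) + off;
 *	send(fd, &req, req.hdr.nlmsg_len, 0);
 */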
npt, &attrs); + if (error != 0) + return (error); + + if (attrs.nda_ifp == NULL || attrs.nda_dst == NULL || attrs.nda_lladdr == NULL) { + if (attrs.nda_ifp == NULL) + NLMSG_REPORT_ERR_MSG(npt, "NDA_IFINDEX / ndm_ifindex not set"); + if (attrs.nda_dst == NULL) + NLMSG_REPORT_ERR_MSG(npt, "NDA_DST not set"); + if (attrs.nda_lladdr == NULL) + NLMSG_REPORT_ERR_MSG(npt, "NDA_LLADDR not set"); + return (EINVAL); + } + + if (attrs.nda_dst->sa_family != attrs.ndm_family) { + NLMSG_REPORT_ERR_MSG(npt, + "NDA_DST family (%d) is different from ndm_family (%d)", + attrs.nda_dst->sa_family, attrs.ndm_family); + return (EINVAL); + } + + int addrlen = attrs.nda_ifp->if_addrlen; + if (attrs.nda_lladdr->nla_len != sizeof(struct nlattr) + addrlen) { + NLMSG_REPORT_ERR_MSG(npt, + "NDA_LLADDR address length (%ld) is different from expected (%d)", + attrs.nda_lladdr->nla_len - sizeof(struct nlattr), addrlen); + return (EINVAL); + } + + if (attrs.ndm_state != NUD_PERMANENT) { + NLMSG_REPORT_ERR_MSG(npt, "ndm_state %d not supported", attrs.ndm_state); + return (ENOTSUP); + } + + const uint16_t supported_flags = NTF_PROXY | NTF_STICKY; + if ((attrs.ndm_flags & supported_flags) != attrs.ndm_flags) { + NLMSG_REPORT_ERR_MSG(npt, "ndm_flags %X not supported", + attrs.ndm_flags &~ supported_flags); + return (ENOTSUP); + } + + /* Replacement requires new entry creation anyway */ + if ((hdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_REPLACE)) == 0) + return (ENOTSUP); + + struct lltable *llt = lltable_get(attrs.nda_ifp, attrs.ndm_family); + if (llt == NULL) + return (EAFNOSUPPORT); + + + uint8_t linkhdr[LLE_MAX_LINKHDR]; + size_t linkhdrsize = sizeof(linkhdr); + int lladdr_off = 0; + if (lltable_calc_llheader(attrs.nda_ifp, attrs.ndm_family, + (char *)(attrs.nda_lladdr + 1), linkhdr, &linkhdrsize, &lladdr_off) != 0) { + NLMSG_REPORT_ERR_MSG(npt, "unable to calculate lle prepend data"); + return (EINVAL); + } + + int lle_flags = LLE_STATIC | ((attrs.ndm_flags & NTF_PROXY) ? 
LLE_PUB : 0); + struct llentry *lle = lltable_alloc_entry(llt, lle_flags, attrs.nda_dst); + if (lle == NULL) + return (ENOMEM); + lltable_set_entry_addr(attrs.nda_ifp, lle, linkhdr, linkhdrsize, lladdr_off); + + /* llentry created, try to insert or update :*/ + IF_AFDATA_WLOCK(attrs.nda_ifp); + LLE_WLOCK(lle); + struct llentry *lle_tmp = lla_lookup(llt, LLE_EXCLUSIVE, attrs.nda_dst); + if (lle_tmp != NULL) { + if (hdr->nlmsg_flags & NLM_F_EXCL) { + LLE_WUNLOCK(lle_tmp); + lle_tmp = NULL; + error = EEXIST; + } else if (hdr->nlmsg_flags & NLM_F_REPLACE) { + lltable_unlink_entry(llt, lle_tmp); + lltable_link_entry(llt, lle); + } else + error = EEXIST; + } else { + if (hdr->nlmsg_flags & NLM_F_CREATE) + lltable_link_entry(llt, lle); + else + error = ENOENT; + } + IF_AFDATA_WUNLOCK(attrs.nda_ifp); + + if (error != 0) { + if (lle != NULL) + llentry_free(lle); + return (error); + } + + if (lle_tmp != NULL) + llentry_free(lle_tmp); + + /* XXX: We're inside epoch */ + EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_RESOLVED); + LLE_WUNLOCK(lle); + + return (0); +} + +static int +rtnl_handle_delneigh(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) +{ + int error; + + struct nl_parsed_neigh attrs = {}; + error = nl_parse_nlmsg(hdr, &ndmsg_parser, npt, &attrs); + if (error != 0) + return (error); + + if (attrs.nda_dst == NULL) { + NLMSG_REPORT_ERR_MSG(npt, "NDA_DST not set"); + return (EINVAL); + } + + if (attrs.nda_ifp == NULL) { + NLMSG_REPORT_ERR_MSG(npt, "no ifindex provided"); + return (EINVAL); + } + + struct lltable *llt = lltable_get(attrs.nda_ifp, attrs.ndm_family); + if (llt == NULL) + return (EAFNOSUPPORT); + + IF_AFDATA_WLOCK(attrs.nda_ifp); + struct llentry *lle = lla_lookup(llt, LLE_EXCLUSIVE, attrs.nda_dst); + if (lle != NULL) { + if ((lle->la_flags & LLE_IFADDR) != 0) { + LLE_WUNLOCK(lle); + lle = NULL; + error = EPERM; + } else + lltable_unlink_entry(llt, lle); + } else + error = ENOENT; + IF_AFDATA_WUNLOCK(attrs.nda_ifp); + + if (error == 0 && lle != NULL) + EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED); + + if (lle != NULL) + llentry_free(lle); + + return (error); +} + +static int +rtnl_handle_getneigh(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) +{ + int error; + + struct nl_parsed_neigh attrs = {}; + error = nl_parse_nlmsg(hdr, &ndmsg_parser, npt, &attrs); + if (error != 0) + return (error); + + if (attrs.nda_dst != NULL && attrs.nda_ifp == NULL) { + NLMSG_REPORT_ERR_MSG(npt, "has NDA_DST but no ifindex provided"); + return (EINVAL); + } + + struct netlink_walkargs wa = { + .so = nlp, + .nw = npt->nw, + .hdr.nlmsg_pid = hdr->nlmsg_pid, + .hdr.nlmsg_seq = hdr->nlmsg_seq, + .hdr.nlmsg_flags = hdr->nlmsg_flags, + .hdr.nlmsg_type = NL_RTM_NEWNEIGH, + }; + + if (attrs.nda_dst == NULL) + error = dump_llts(&wa, attrs.nda_ifp, attrs.ndm_family); + else + error = get_lle(&wa, attrs.nda_ifp, attrs.ndm_family, attrs.nda_dst); + + return (error); +} + +static const struct rtnl_cmd_handler cmd_handlers[] = { + { + .cmd = NL_RTM_NEWNEIGH, + .name = "RTM_NEWNEIGH", + .cb = &rtnl_handle_newneigh, + }, + { + .cmd = NL_RTM_DELNEIGH, + .name = "RTM_DELNEIGH", + .cb = &rtnl_handle_delneigh, + .priv = PRIV_NET_ROUTE, + }, + { + .cmd = NL_RTM_GETNEIGH, + .name = "RTM_GETNEIGH", + .cb = &rtnl_handle_getneigh, + .priv = PRIV_NET_ROUTE, + } +}; + +static void +rtnl_lle_event(void *arg __unused, struct llentry *lle, int evt) +{ + struct ifnet *ifp; + int family; + + LLE_WLOCK_ASSERT(lle); + + ifp = lltable_get_ifp(lle->lle_tbl); + family = lltable_get_af(lle->lle_tbl); 
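/*
 * Companion dump sketch (illustrative; same socket setup as the RTM_NEWNEIGH
 * example earlier): listing the IPv4 neighbor table of one interface. As
 * implemented above, ndm_family and the interface index act as filters and
 * the reply is a multipart sequence of RTM_NEWNEIGH messages.
 *
 *	struct {
 *		struct nlmsghdr	hdr;
 *		struct ndmsg	ndm;
 *	} req = {
 *		.hdr.nlmsg_len = NLMSG_HDRLEN + sizeof(struct ndmsg),
 *		.hdr.nlmsg_type = RTM_GETNEIGH,
 *		.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
 *		.ndm.ndm_family = AF_INET,
 *		.ndm.ndm_ifindex = if_nametoindex("em0"),
 *	};
 *	send(fd, &req, req.hdr.nlmsg_len, 0);
 */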
+ + if (family != AF_INET && family != AF_INET6) + return; + + int nlmsgs_type = evt == LLENTRY_RESOLVED ? NL_RTM_NEWNEIGH : NL_RTM_DELNEIGH; + + struct nl_writer nw = {}; + if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEIGH)) { + NL_LOG(LOG_DEBUG, "error allocating group writer"); + return; + } + + struct netlink_walkargs wa = { + .hdr.nlmsg_type = nlmsgs_type, + .nw = &nw, + .ifp = ifp, + .family = family, + }; + + dump_lle_locked(lle, &wa); + nlmsg_flush(&nw); +} + +static const struct nlhdr_parser *all_parsers[] = { &ndmsg_parser }; + +void +rtnl_neighs_init() +{ + NL_VERIFY_PARSERS(all_parsers); + rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers)); + lle_event_p = EVENTHANDLER_REGISTER(lle_event, rtnl_lle_event, NULL, + EVENTHANDLER_PRI_ANY); +} + +void +rtnl_neighs_destroy() +{ + EVENTHANDLER_DEREGISTER(lle_event, lle_event_p); +} diff --git a/sys/netlink/route/neigh.h b/sys/netlink/route/neigh.h new file mode 100644 index 000000000000..1ec1b95fdcde --- /dev/null +++ b/sys/netlink/route/neigh.h @@ -0,0 +1,105 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Neighbors-related (RTM_NEIGH) message header and attributes. 
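 *
 * Monitoring sketch (illustrative; assumes the Linux-compatible membership
 * socket option is available to userland): receiving the RTM_NEWNEIGH /
 * RTM_DELNEIGH notifications the kernel generates from lle_event:
 *
 *	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
 *	uint32_t grp = RTNLGRP_NEIGH;
 *	setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &grp, sizeof(grp));
 *	for (;;) {
 *		char buf[4096];
 *		ssize_t n = recv(fd, buf, sizeof(buf), 0);
 *		(walk the struct nlmsghdr chain; each payload is the struct ndmsg
 *		 plus the attributes defined below)
 *	}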
+ */ + +#ifndef _NETLINK_ROUTE_NEIGH_H_ +#define _NETLINK_ROUTE_NEIGH_H_ + +/* Base header for all of the relevant messages */ +struct ndmsg { + uint8_t ndm_family; + uint8_t ndm_pad1; + uint16_t ndm_pad2; + int32_t ndm_ifindex; + uint16_t ndm_state; + uint8_t ndm_flags; + uint8_t ndm_type; +}; + +/* Attributes */ +enum { + NDA_UNSPEC, + NDA_DST, /* binary: neigh l3 address */ + NDA_LLADDR, /* binary: neigh link-level address */ + NDA_CACHEINFO, /* binary, struct nda_cacheinfo */ + NDA_PROBES, /* XXX */ + NDA_VLAN, /* upper 802.1Q tag */ + NDA_PORT, /* not supported */ + NDA_VNI, /* not supported */ + NDA_IFINDEX, /* interface index */ + NDA_MASTER, /* not supported */ + NDA_LINK_NETNSID, /* not supported */ + NDA_SRC_VNI, /* not supported */ + NDA_PROTOCOL, /* XXX */ + NDA_NH_ID, /* not supported */ + NDA_FDB_EXT_ATTRS, /* not supported */ + NDA_FLAGS_EXT, /* u32: ndm_flags */ + NDA_NDM_STATE_MASK, /* XXX */ + NDA_NDM_FLAGS_MASK, /* XXX */ + __NDA_MAX +}; + +#define NDA_MAX (__NDA_MAX - 1) + + +/* ndm_flags / NDA_FLAGS_EXT */ +#define NTF_USE 0x0001 /* XXX */ +#define NTF_SELF 0x0002 /* local station */ +#define NTF_MASTER 0x0004 /* XXX */ +#define NTF_PROXY 0x0008 /* proxy entry */ +#define NTF_EXT_LEARNED 0x0010 /* not used */ +#define NTF_OFFLOADED 0x0020 /* not used */ +#define NTF_STICKY 0x0040 /* permament entry */ +#define NTF_ROUTER 0x0080 /* dst indicated itself as a router */ +/* start of NDA_FLAGS_EXT */ +#define NTF_EXT_MANAGED 0x0100 /* not used */ + +/* ndm_state */ +#define NUD_INCOMPLETE 0x01 /* No lladdr, address resolution in progress */ +#define NUD_REACHABLE 0x02 /* reachable & recently resolved */ +#define NUD_STALE 0x04 /* has lladdr but it's stale */ +#define NUD_DELAY 0x08 /* has lladdr, is stale, probes delayed */ +#define NUD_PROBE 0x10 /* has lladdr, is stale, probes sent */ +#define NUD_FAILED 0x20 /* unused */ + +/* Dummy states */ +#define NUD_NOARP 0x40 /* not used */ +#define NUD_PERMANENT 0x80 /* not flushed */ +#define NUD_NONE 0x00 + +/* NDA_CACHEINFO */ +struct nda_cacheinfo { + uint32_t ndm_confirmed; /* seconds since ARP/ND was received from neigh */ + uint32_t ndm_used; /* seconds since last used (not provided) */ + uint32_t ndm_updated; /* seconds since state was updated last */ + uint32_t ndm_refcnt; /* number of references held */ +}; + +#endif diff --git a/sys/netlink/route/nexthop.c b/sys/netlink/route/nexthop.c new file mode 100644 index 000000000000..92555aa8b123 --- /dev/null +++ b/sys/netlink/route/nexthop.c @@ -0,0 +1,1000 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); +#include "opt_inet.h" +#include "opt_inet6.h" +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_MOD_NAME nl_nhop +#define DEBUG_MAX_LEVEL LOG_DEBUG3 +#include +_DECLARE_DEBUG(LOG_DEBUG3); + +/* + * This file contains the logic to maintain kernel nexthops and + * nexhop groups based om the data provided by the user. + * + * Kernel stores (nearly) all of the routing data in the nexthops, + * including the prefix-specific flags (NHF_HOST and NHF_DEFAULT). + * + * Netlink API provides higher-level abstraction for the user. Each + * user-created nexthop may map to multiple kernel nexthops. + * + * The following variations require separate kernel nexthop to be + * created: + * * prefix flags (NHF_HOST, NHF_DEFAULT) + * * using IPv6 gateway for IPv4 routes + * * different fibnum + * + * These kernel nexthops have the lifetime bound to the lifetime of + * the user_nhop object. They are not collected until user requests + * to delete the created user_nhop. + * + */ +struct user_nhop { + uint32_t un_idx; /* Userland-provided index */ + uint32_t un_fibfam; /* fibnum+af(as highest byte) */ + uint8_t un_protocol; /* protocol that install the record */ + struct nhop_object *un_nhop; /* "production" nexthop */ + struct nhop_object *un_nhop_src; /* nexthop to copy from */ + struct weightened_nhop *un_nhgrp_src; /* nexthops for nhg */ + uint32_t un_nhgrp_count; /* number of nexthops */ + struct user_nhop *un_next; /* next item in hash chain */ + struct user_nhop *un_nextchild; /* master -> children */ + struct epoch_context un_epoch_ctx; /* epoch ctl helper */ +}; + +/* produce hash value for an object */ +#define unhop_hash_obj(_obj) (hash_unhop(_obj)) +/* compare two objects */ +#define unhop_cmp(_one, _two) (cmp_unhop(_one, _two)) +/* next object accessor */ +#define unhop_next(_obj) (_obj)->un_next + +CHT_SLIST_DEFINE(unhop, struct user_nhop); + +struct unhop_ctl { + struct unhop_head un_head; + struct rmlock un_lock; +}; +#define UN_LOCK_INIT(_ctl) rm_init(&(_ctl)->un_lock, "unhop_ctl") +#define UN_TRACKER struct rm_priotracker un_tracker +#define UN_RLOCK(_ctl) rm_rlock(&((_ctl)->un_lock), &un_tracker) +#define UN_RUNLOCK(_ctl) rm_runlock(&((_ctl)->un_lock), &un_tracker) + +#define UN_WLOCK(_ctl) rm_wlock(&(_ctl)->un_lock); +#define UN_WUNLOCK(_ctl) rm_wunlock(&(_ctl)->un_lock); + +VNET_DEFINE_STATIC(struct unhop_ctl *, un_ctl) = NULL; +#define V_un_ctl VNET(un_ctl) + +static void consider_resize(struct unhop_ctl *ctl, uint32_t new_size); +static int cmp_unhop(const struct user_nhop *a, const struct user_nhop *b); +static unsigned int hash_unhop(const struct user_nhop *obj); + +static void destroy_unhop(struct user_nhop *unhop); +static struct nhop_object *clone_unhop(const struct user_nhop *unhop, + uint32_t fibnum, int family, int nh_flags); + +static int +cmp_unhop(const struct user_nhop *a, 
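/*
 * Key packing example (illustrative): for fib 2 and an IPv4 user, un_fibfam is
 * 2 | (AF_INET << 24) = 0x02000002, whereas the template ("master") entry
 * created directly from the user request keeps un_fibfam == 0 and is the only
 * one reported back by RTM_GETNEXTHOP dumps.
 */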
const struct user_nhop *b) +{ + return (a->un_idx == b->un_idx && a->un_fibfam == b->un_fibfam); +} + +/* + * Hash callback: calculate hash of an object + */ +static unsigned int +hash_unhop(const struct user_nhop *obj) +{ + return (obj->un_idx ^ obj->un_fibfam); +} + +#define UNHOP_IS_MASTER(_unhop) ((_unhop)->un_fibfam == 0) + +/* + * Factory interface for creating matching kernel nexthops/nexthop groups + * + * @uidx: userland nexhop index used to create the nexthop + * @fibnum: fibnum nexthop will be used in + * @family: upper family nexthop will be used in + * @nh_flags: desired nexthop prefix flags + * @perror: pointer to store error to + * + * Returns referenced nexthop linked to @fibnum/@family rib on success. + */ +struct nhop_object * +nl_find_nhop(uint32_t fibnum, int family, uint32_t uidx, + int nh_flags, int *perror) +{ + struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); + UN_TRACKER; + + if (__predict_false(ctl == NULL)) + return (NULL); + + struct user_nhop key= { + .un_idx = uidx, + .un_fibfam = fibnum | ((uint32_t)family) << 24, + }; + struct user_nhop *unhop; + + nh_flags = nh_flags & (NHF_HOST | NHF_DEFAULT); + + if (__predict_false(family == 0)) + return (NULL); + + UN_RLOCK(ctl); + CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); + if (unhop != NULL) { + struct nhop_object *nh = unhop->un_nhop; + UN_RLOCK(ctl); + *perror = 0; + nhop_ref_any(nh); + return (nh); + } + + /* + * Exact nexthop not found. Search for template nexthop to clone from. + */ + key.un_fibfam = 0; + CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); + if (unhop == NULL) { + UN_RUNLOCK(ctl); + *perror = ESRCH; + return (NULL); + } + + UN_RUNLOCK(ctl); + + /* Create entry to insert first */ + struct user_nhop *un_new, *un_tmp; + un_new = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO); + if (un_new == NULL) { + *perror = ENOMEM; + return (NULL); + } + un_new->un_idx = uidx; + un_new->un_fibfam = fibnum | ((uint32_t)family) << 24; + + /* Relying on epoch to protect unhop here */ + un_new->un_nhop = clone_unhop(unhop, fibnum, family, nh_flags); + if (un_new->un_nhop == NULL) { + free(un_new, M_NETLINK); + *perror = ENOMEM; + return (NULL); + } + + /* Insert back and report */ + UN_WLOCK(ctl); + + /* First, find template record once again */ + CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); + if (unhop == NULL) { + /* Someone deleted the nexthop during the call */ + UN_WUNLOCK(ctl); + *perror = ESRCH; + destroy_unhop(un_new); + return (NULL); + } + + /* Second, check the direct match */ + CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, un_new, un_tmp); + struct nhop_object *nh; + if (un_tmp != NULL) { + /* Another thread already created the desired nextop, use it */ + nh = un_tmp->un_nhop; + } else { + /* Finally, insert the new nexthop and link it to the primary */ + nh = un_new->un_nhop; + CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, un_new); + un_new->un_nextchild = unhop->un_nextchild; + unhop->un_nextchild = un_new; + un_new = NULL; + NL_LOG(LOG_DEBUG2, "linked cloned nexthop %p", nh); + } + + UN_WUNLOCK(ctl); + + if (un_new != NULL) + destroy_unhop(un_new); + + *perror = 0; + nhop_ref_any(nh); + return (nh); +} + +static struct user_nhop * +nl_find_base_unhop(struct unhop_ctl *ctl, uint32_t uidx) +{ + struct user_nhop key= { .un_idx = uidx }; + struct user_nhop *unhop = NULL; + UN_TRACKER; + + UN_RLOCK(ctl); + CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); + UN_RUNLOCK(ctl); + + return (unhop); +} + +#define MAX_STACK_NHOPS 4 +static struct nhop_object * 
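/*
 * Consumer sketch (illustrative): this is roughly how a route-message handler
 * is expected to turn a userland nexthop index into a kernel nexthop for a
 * particular rib; the returned nexthop is referenced and must be released by
 * the caller.
 *
 *	int perror = 0;
 *	struct nhop_object *nh;
 *
 *	nh = nl_find_nhop(fibnum, AF_INET, uidx, 0, &perror);
 *	if (nh == NULL)
 *		return (perror);	(typically ESRCH when uidx is unknown)
 */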
+clone_unhop(const struct user_nhop *unhop, uint32_t fibnum, int family, int nh_flags) +{ + const struct weightened_nhop *wn; + struct weightened_nhop *wn_new, wn_base[MAX_STACK_NHOPS]; + struct nhop_object *nh = NULL; + uint32_t num_nhops; + int error; + + if (unhop->un_nhop_src != NULL) { + IF_DEBUG_LEVEL(LOG_DEBUG2) { + char nhbuf[NHOP_PRINT_BUFSIZE]; + nhop_print_buf_any(unhop->un_nhop_src, nhbuf, sizeof(nhbuf)); + FIB_NH_LOG(LOG_DEBUG2, unhop->un_nhop_src, + "cloning nhop %s -> %u.%u flags 0x%X", nhbuf, fibnum, + family, nh_flags); + } + struct nhop_object *nh; + nh = nhop_alloc(fibnum, AF_UNSPEC); + if (nh == NULL) + return (NULL); + nhop_copy(nh, unhop->un_nhop_src); + /* Check that nexthop gateway is compatible with the new family */ + if (!nhop_set_upper_family(nh, family)) { + nhop_free(nh); + return (NULL); + } + nhop_set_uidx(nh, unhop->un_idx); + nhop_set_pxtype_flag(nh, nh_flags); + return (nhop_get_nhop(nh, &error)); + } + + wn = unhop->un_nhgrp_src; + num_nhops = unhop->un_nhgrp_count; + + if (num_nhops > MAX_STACK_NHOPS) { + wn_new = malloc(num_nhops * sizeof(struct weightened_nhop), M_TEMP, M_NOWAIT); + if (wn_new == NULL) + return (NULL); + } else + wn_new = wn_base; + + for (int i = 0; i < num_nhops; i++) { + uint32_t uidx = nhop_get_uidx(wn[i].nh); + MPASS(uidx != 0); + wn_new[i].nh = nl_find_nhop(fibnum, family, uidx, nh_flags, &error); + if (error != 0) + break; + wn_new[i].weight = wn[i].weight; + } + + if (error == 0) { + struct rib_head *rh = nhop_get_rh(wn_new[0].nh); + struct nhgrp_object *nhg; + + error = nhgrp_get_group(rh, wn_new, num_nhops, unhop->un_idx, &nhg); + nh = (struct nhop_object *)nhg; + } + + if (wn_new != wn_base) + free(wn_new, M_TEMP); + return (nh); +} + +static void +destroy_unhop(struct user_nhop *unhop) +{ + if (unhop->un_nhop != NULL) + nhop_free_any(unhop->un_nhop); + if (unhop->un_nhop_src != NULL) + nhop_free_any(unhop->un_nhop_src); + free(unhop, M_NETLINK); +} + +static void +destroy_unhop_epoch(epoch_context_t ctx) +{ + struct user_nhop *unhop; + + unhop = __containerof(ctx, struct user_nhop, un_epoch_ctx); + + destroy_unhop(unhop); +} + +static uint32_t +find_spare_uidx(struct unhop_ctl *ctl) +{ + struct user_nhop *unhop, key = {}; + uint32_t uidx = 0; + UN_TRACKER; + + UN_RLOCK(ctl); + /* This should return spare uid with 75% of 65k used in ~99/100 cases */ + for (int i = 0; i < 16; i++) { + key.un_idx = (arc4random() % 65536) + 65536 * 4; + CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); + if (unhop == NULL) { + uidx = key.un_idx; + break; + } + } + UN_RUNLOCK(ctl); + + return (uidx); +} + + +/* + * Actual netlink code + */ +struct netlink_walkargs { + struct nl_writer *nw; + struct nlmsghdr hdr; + struct nlpcb *so; + int family; + int error; + int count; + int dumped; +}; +#define ENOMEM_IF_NULL(_v) if ((_v) == NULL) goto enomem + +static bool +dump_nhgrp(const struct user_nhop *unhop, struct nlmsghdr *hdr, + struct nl_writer *nw) +{ + + if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg))) + goto enomem; + + struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg); + nhm->nh_family = AF_UNSPEC; + nhm->nh_scope = 0; + nhm->nh_protocol = unhop->un_protocol; + nhm->nh_flags = 0; + + nlattr_add_u32(nw, NHA_ID, unhop->un_idx); + nlattr_add_u16(nw, NHA_GROUP_TYPE, NEXTHOP_GRP_TYPE_MPATH); + + struct weightened_nhop *wn = unhop->un_nhgrp_src; + uint32_t num_nhops = unhop->un_nhgrp_count; + /* TODO: a better API? 
*/ + int nla_len = sizeof(struct nlattr); + nla_len += NETLINK_ALIGN(num_nhops * sizeof(struct nexthop_grp)); + struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr); + if (nla == NULL) + goto enomem; + nla->nla_type = NHA_GROUP; + nla->nla_len = nla_len; + for (int i = 0; i < num_nhops; i++) { + struct nexthop_grp *grp = &((struct nexthop_grp *)(nla + 1))[i]; + grp->id = nhop_get_uidx(wn[i].nh); + grp->weight = wn[i].weight; + grp->resvd1 = 0; + grp->resvd2 = 0; + } + + if (nlmsg_end(nw)) + return (true); +enomem: + NL_LOG(LOG_DEBUG, "error: unable to allocate attribute memory"); + nlmsg_abort(nw); + return (false); +} + +static bool +dump_nhop(const struct user_nhop *unhop, struct nlmsghdr *hdr, + struct nl_writer *nw) +{ + struct nhop_object *nh = unhop->un_nhop_src; + + if (!nlmsg_reply(nw, hdr, sizeof(struct nhmsg))) + goto enomem; + + struct nhmsg *nhm = nlmsg_reserve_object(nw, struct nhmsg); + ENOMEM_IF_NULL(nhm); + nhm->nh_family = nhop_get_neigh_family(nh); + nhm->nh_scope = 0; // XXX: what's that? + nhm->nh_protocol = unhop->un_protocol; + nhm->nh_flags = 0; + + nlattr_add_u32(nw, NHA_ID, unhop->un_idx); + if (nh->nh_flags & NHF_BLACKHOLE) { + nlattr_add_flag(nw, NHA_BLACKHOLE); + goto done; + } + nlattr_add_u32(nw, NHA_OIF, nh->nh_ifp->if_index); + + switch (nh->gw_sa.sa_family) { +#ifdef INET + case AF_INET: + nlattr_add(nw, NHA_GATEWAY, 4, &nh->gw4_sa.sin_addr); + break; +#endif +#ifdef INET6 + case AF_INET6: + { + struct in6_addr addr = nh->gw6_sa.sin6_addr; + in6_clearscope(&addr); + nlattr_add(nw, NHA_GATEWAY, 16, &addr); + break; + } +#endif + } + +done: + if (nlmsg_end(nw)) + return (true); +enomem: + nlmsg_abort(nw); + return (false); +} + +static void +dump_unhop(const struct user_nhop *unhop, struct nlmsghdr *hdr, + struct nl_writer *nw) +{ + if (unhop->un_nhop_src != NULL) + dump_nhop(unhop, hdr, nw); + else + dump_nhgrp(unhop, hdr, nw); +} + +static int +delete_unhop(struct unhop_ctl *ctl, struct nlmsghdr *hdr, uint32_t uidx) +{ + struct user_nhop *unhop_ret, *unhop_base, *unhop_chain; + + struct user_nhop key = { .un_idx = uidx }; + + UN_WLOCK(ctl); + + CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop_base); + + if (unhop_base != NULL) { + CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_base, unhop_ret); + IF_DEBUG_LEVEL(LOG_DEBUG2) { + char nhbuf[NHOP_PRINT_BUFSIZE]; + nhop_print_buf_any(unhop_base->un_nhop, nhbuf, sizeof(nhbuf)); + FIB_NH_LOG(LOG_DEBUG3, unhop_base->un_nhop, + "removed base nhop %u: %s", uidx, nhbuf); + } + /* Unlink all child nexhops as well, keeping the chain intact */ + unhop_chain = unhop_base->un_nextchild; + while (unhop_chain != NULL) { + CHT_SLIST_REMOVE(&ctl->un_head, unhop, unhop_chain, + unhop_ret); + MPASS(unhop_chain == unhop_ret); + IF_DEBUG_LEVEL(LOG_DEBUG3) { + char nhbuf[NHOP_PRINT_BUFSIZE]; + nhop_print_buf_any(unhop_chain->un_nhop, + nhbuf, sizeof(nhbuf)); + FIB_NH_LOG(LOG_DEBUG3, unhop_chain->un_nhop, + "removed child nhop %u: %s", uidx, nhbuf); + } + unhop_chain = unhop_chain->un_nextchild; + } + } + + UN_WUNLOCK(ctl); + + if (unhop_base == NULL) { + NL_LOG(LOG_DEBUG, "unable to find unhop %u", uidx); + return (ENOENT); + } + + /* Report nexthop deletion */ + struct netlink_walkargs wa = { + .hdr.nlmsg_pid = hdr->nlmsg_pid, + .hdr.nlmsg_seq = hdr->nlmsg_seq, + .hdr.nlmsg_flags = hdr->nlmsg_flags, + .hdr.nlmsg_type = NL_RTM_DELNEXTHOP, + }; + + struct nl_writer nw = {}; + if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) { + NL_LOG(LOG_DEBUG, "error allocating message writer"); + 
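/*
 * Wire-format reminder (illustrative): NHA_GROUP is a single attribute whose
 * payload is an array of struct nexthop_grp, exactly as serialized above.
 * A two-path group referencing user nexthops 10 and 11 therefore occupies
 *
 *	struct nexthop_grp grp[2] = {
 *		{ .id = 10, .weight = 1 },
 *		{ .id = 11, .weight = 3 },
 *	};
 *
 * sizeof(struct nlattr) + sizeof(grp) = 4 + 16 = 20 bytes on the wire.
 */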
return (ENOMEM); + } + + dump_unhop(unhop_base, &wa.hdr, &nw); + nlmsg_flush(&nw); + + while (unhop_base != NULL) { + unhop_chain = unhop_base->un_nextchild; + epoch_call(net_epoch_preempt, destroy_unhop_epoch, + &unhop_base->un_epoch_ctx); + unhop_base = unhop_chain; + } + + return (0); +} + +static void +consider_resize(struct unhop_ctl *ctl, uint32_t new_size) +{ + void *new_ptr = NULL; + size_t alloc_size; + + if (new_size == 0) + return; + + if (new_size != 0) { + alloc_size = CHT_SLIST_GET_RESIZE_SIZE(new_size); + new_ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO); + if (new_ptr == NULL) + return; + } + + NL_LOG(LOG_DEBUG, "resizing hash: %u -> %u", ctl->un_head.hash_size, new_size); + UN_WLOCK(ctl); + if (new_ptr != NULL) { + CHT_SLIST_RESIZE(&ctl->un_head, unhop, new_ptr, new_size); + } + UN_WUNLOCK(ctl); + + + if (new_ptr != NULL) + free(new_ptr, M_NETLINK); +} + +static bool __noinline +vnet_init_unhops() +{ + uint32_t num_buckets = 16; + size_t alloc_size = CHT_SLIST_GET_RESIZE_SIZE(num_buckets); + + struct unhop_ctl *ctl = malloc(sizeof(struct unhop_ctl), M_NETLINK, + M_NOWAIT | M_ZERO); + if (ctl == NULL) + return (false); + + void *ptr = malloc(alloc_size, M_NETLINK, M_NOWAIT | M_ZERO); + if (ptr == NULL) { + free(ctl, M_NETLINK); + return (false); + } + CHT_SLIST_INIT(&ctl->un_head, ptr, num_buckets); + UN_LOCK_INIT(ctl); + + if (!atomic_cmpset_ptr((uintptr_t *)&V_un_ctl, (uintptr_t)NULL, (uintptr_t)ctl)) { + free(ptr, M_NETLINK); + free(ctl, M_NETLINK); + } + + if (atomic_load_ptr(&V_un_ctl) == NULL) + return (false); + + NL_LOG(LOG_NOTICE, "UNHOPS init done"); + + return (true); +} + +static void +vnet_destroy_unhops(const void *unused __unused) +{ + struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); + struct user_nhop *unhop, *tmp; + + if (ctl == NULL) + return; + V_un_ctl = NULL; + + /* Wait till all unhop users finish their reads */ + epoch_wait_preempt(net_epoch_preempt); + + UN_WLOCK(ctl); + CHT_SLIST_FOREACH_SAFE(&ctl->un_head, unhop, unhop, tmp) { + destroy_unhop(unhop); + } CHT_SLIST_FOREACH_SAFE_END; + UN_WUNLOCK(ctl); + + free(ctl->un_head.ptr, M_NETLINK); + free(ctl, M_NETLINK); +} +VNET_SYSUNINIT(vnet_destroy_unhops, SI_SUB_PROTO_IF, SI_ORDER_ANY, + vnet_destroy_unhops, NULL); + +static int +nlattr_get_nhg(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) +{ + int error = 0; + + /* Verify attribute correctness */ + struct nexthop_grp *grp = NLA_DATA(nla); + int data_len = NLA_DATA_LEN(nla); + + int count = data_len / sizeof(*grp); + if (count == 0 || (count * sizeof(*grp) != data_len)) { + NL_LOG(LOG_DEBUG, "Invalid length for RTA_GROUP: %d", data_len); + return (EINVAL); + } + + *((struct nlattr **)target) = nla; + return (error); +} + +struct nl_parsed_nhop { + uint32_t nha_id; + uint8_t nha_blackhole; + uint8_t nha_groups; + struct ifnet *nha_oif; + struct sockaddr *nha_gw; + struct nlattr *nha_group; + uint8_t nh_family; + uint8_t nh_protocol; +}; + +#define _IN(_field) offsetof(struct nhmsg, _field) +#define _OUT(_field) offsetof(struct nl_parsed_nhop, _field) +static const struct nlfield_parser nlf_p_nh[] = { + { .off_in = _IN(nh_family), .off_out = _OUT(nh_family), .cb = nlf_get_u8 }, + { .off_in = _IN(nh_protocol), .off_out = _OUT(nh_protocol), .cb = nlf_get_u8 }, +}; + +static const struct nlattr_parser nla_p_nh[] = { + { .type = NHA_ID, .off = _OUT(nha_id), .cb = nlattr_get_uint32 }, + { .type = NHA_GROUP, .off = _OUT(nha_group), .cb = nlattr_get_nhg }, + { .type = NHA_BLACKHOLE, .off = _OUT(nha_blackhole), .cb = 
nlattr_get_flag }, + { .type = NHA_OIF, .off = _OUT(nha_oif), .cb = nlattr_get_ifp }, + { .type = NHA_GATEWAY, .off = _OUT(nha_gw), .cb = nlattr_get_ip }, + { .type = NHA_GROUPS, .off = _OUT(nha_groups), .cb = nlattr_get_flag }, +}; +#undef _IN +#undef _OUT +NL_DECLARE_PARSER(nhmsg_parser, struct nhmsg, nlf_p_nh, nla_p_nh); + +static bool +eligible_nhg(const struct nhop_object *nh) +{ + return (nh->nh_flags & NHF_GATEWAY); +} + +static int +newnhg(struct unhop_ctl *ctl, struct nl_parsed_nhop *attrs, struct user_nhop *unhop) +{ + struct nexthop_grp *grp = NLA_DATA(attrs->nha_group); + int count = NLA_DATA_LEN(attrs->nha_group) / sizeof(*grp); + struct weightened_nhop *wn; + + wn = malloc(sizeof(*wn) * count, M_NETLINK, M_NOWAIT | M_ZERO); + if (wn == NULL) + return (ENOMEM); + + for (int i = 0; i < count; i++) { + struct user_nhop *unhop; + unhop = nl_find_base_unhop(ctl, grp[i].id); + if (unhop == NULL) { + NL_LOG(LOG_DEBUG, "unable to find uidx %u", grp[i].id); + free(wn, M_NETLINK); + return (ESRCH); + } else if (unhop->un_nhop_src == NULL) { + NL_LOG(LOG_DEBUG, "uidx %u is a group, nested group unsupported", + grp[i].id); + free(wn, M_NETLINK); + return (ENOTSUP); + } else if (!eligible_nhg(unhop->un_nhop_src)) { + NL_LOG(LOG_DEBUG, "uidx %u nhop is not mpath-eligible", + grp[i].id); + free(wn, M_NETLINK); + return (ENOTSUP); + } + /* + * TODO: consider more rigid eligibility checks: + * restrict nexthops with the same gateway + */ + wn[i].nh = unhop->un_nhop_src; + wn[i].weight = grp[i].weight; + } + unhop->un_nhgrp_src = wn; + unhop->un_nhgrp_count = count; + return (0); +} + +static int +newnhop(struct nl_parsed_nhop *attrs, struct user_nhop *unhop) +{ + struct ifaddr *ifa = NULL; + struct nhop_object *nh; + int error; + + if (!attrs->nha_blackhole) { + if (attrs->nha_gw == NULL) { + NL_LOG(LOG_DEBUG, "missing NHA_GATEWAY"); + return (EINVAL); + } + if (attrs->nha_oif == NULL) { + NL_LOG(LOG_DEBUG, "missing NHA_OIF"); + return (EINVAL); + } + if (ifa == NULL) + ifa = ifaof_ifpforaddr(attrs->nha_gw, attrs->nha_oif); + if (ifa == NULL) { + NL_LOG(LOG_DEBUG, "Unable to determine default source IP"); + return (EINVAL); + } + } + + int family = attrs->nha_gw != NULL ? attrs->nha_gw->sa_family : attrs->nh_family; + + nh = nhop_alloc(RT_DEFAULT_FIB, family); + if (nh == NULL) { + NL_LOG(LOG_DEBUG, "Unable to allocate nexthop"); + return (ENOMEM); + } + nhop_set_uidx(nh, attrs->nha_id); + + if (attrs->nha_blackhole) + nhop_set_blackhole(nh, NHF_BLACKHOLE); + else { + nhop_set_gw(nh, attrs->nha_gw, true); + nhop_set_transmit_ifp(nh, attrs->nha_oif); + nhop_set_src(nh, ifa); + } + + error = nhop_get_unlinked(nh); + if (error != 0) { + NL_LOG(LOG_DEBUG, "unable to finalize nexthop"); + return (error); + } + + IF_DEBUG_LEVEL(LOG_DEBUG2) { + char nhbuf[NHOP_PRINT_BUFSIZE]; + nhop_print_buf(nh, nhbuf, sizeof(nhbuf)); + NL_LOG(LOG_DEBUG2, "Adding unhop %u: %s", attrs->nha_id, nhbuf); + } + + unhop->un_nhop_src = nh; + return (0); +} + +static int +rtnl_handle_newnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, + struct nl_pstate *npt) +{ + struct user_nhop *unhop; + int error; + + if ((__predict_false(V_un_ctl == NULL)) && (!vnet_init_unhops())) + return (ENOMEM); + struct unhop_ctl *ctl = V_un_ctl; + + struct nl_parsed_nhop attrs = {}; + error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs); + if (error != 0) + return (error); + + /* + * Get valid nha_id. Treat nha_id == 0 (auto-assignment) as a second-class + * citizen. 
+ */ + if (attrs.nha_id == 0) { + attrs.nha_id = find_spare_uidx(ctl); + if (attrs.nha_id == 0) { + NL_LOG(LOG_DEBUG, "Unable to get spare uidx"); + return (ENOSPC); + } + } + + NL_LOG(LOG_DEBUG, "IFINDEX %d", attrs.nha_oif ? attrs.nha_oif->if_index : 0); + + unhop = malloc(sizeof(struct user_nhop), M_NETLINK, M_NOWAIT | M_ZERO); + if (unhop == NULL) { + NL_LOG(LOG_DEBUG, "Unable to allocate user_nhop"); + return (ENOMEM); + } + unhop->un_idx = attrs.nha_id; + unhop->un_protocol = attrs.nh_protocol; + + if (attrs.nha_group) + error = newnhg(ctl, &attrs, unhop); + else + error = newnhop(&attrs, unhop); + + if (error != 0) { + free(unhop, M_NETLINK); + return (error); + } + + UN_WLOCK(ctl); + /* Check if uidx already exists */ + struct user_nhop *tmp = NULL; + CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, unhop, tmp); + if (tmp != NULL) { + UN_WUNLOCK(ctl); + NL_LOG(LOG_DEBUG, "nhop idx %u already exists", attrs.nha_id); + destroy_unhop(unhop); + return (EEXIST); + } + CHT_SLIST_INSERT_HEAD(&ctl->un_head, unhop, unhop); + uint32_t num_buckets_new = CHT_SLIST_GET_RESIZE_BUCKETS(&ctl->un_head); + UN_WUNLOCK(ctl); + + /* Report addition of the next nexhop */ + struct netlink_walkargs wa = { + .hdr.nlmsg_pid = hdr->nlmsg_pid, + .hdr.nlmsg_seq = hdr->nlmsg_seq, + .hdr.nlmsg_flags = hdr->nlmsg_flags, + .hdr.nlmsg_type = NL_RTM_NEWNEXTHOP, + }; + + struct nl_writer nw = {}; + if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, RTNLGRP_NEXTHOP)) { + NL_LOG(LOG_DEBUG, "error allocating message writer"); + return (ENOMEM); + } + + dump_unhop(unhop, &wa.hdr, &nw); + nlmsg_flush(&nw); + + consider_resize(ctl, num_buckets_new); + + return (0); +} + +static int +rtnl_handle_delnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, + struct nl_pstate *npt) +{ + struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); + int error; + + if (__predict_false(ctl == NULL)) + return (ESRCH); + + struct nl_parsed_nhop attrs = {}; + error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs); + if (error != 0) + return (error); + + if (attrs.nha_id == 0) { + NL_LOG(LOG_DEBUG, "NHA_ID not set"); + return (EINVAL); + } + + error = delete_unhop(ctl, hdr, attrs.nha_id); + + return (error); +} + +static bool +match_unhop(const struct nl_parsed_nhop *attrs, struct user_nhop *unhop) +{ + if (attrs->nha_id != 0 && unhop->un_idx != attrs->nha_id) + return (false); + if (attrs->nha_groups != 0 && unhop->un_nhgrp_src == NULL) + return (false); + if (attrs->nha_oif != NULL && + (unhop->un_nhop_src == NULL || unhop->un_nhop_src->nh_ifp != attrs->nha_oif)) + return (false); + + return (true); +} + +static int +rtnl_handle_getnhop(struct nlmsghdr *hdr, struct nlpcb *nlp, + struct nl_pstate *npt) +{ + struct unhop_ctl *ctl = atomic_load_ptr(&V_un_ctl); + struct user_nhop *unhop; + UN_TRACKER; + int error; + + if (__predict_false(ctl == NULL)) + return (ESRCH); + + struct nl_parsed_nhop attrs = {}; + error = nl_parse_nlmsg(hdr, &nhmsg_parser, npt, &attrs); + if (error != 0) + return (error); + + struct netlink_walkargs wa = { + .nw = npt->nw, + .hdr.nlmsg_pid = hdr->nlmsg_pid, + .hdr.nlmsg_seq = hdr->nlmsg_seq, + .hdr.nlmsg_flags = hdr->nlmsg_flags, + .hdr.nlmsg_type = NL_RTM_NEWNEXTHOP, + }; + + if (attrs.nha_id != 0) { + NL_LOG(LOG_DEBUG2, "searching for uidx %u", attrs.nha_id); + struct user_nhop key= { .un_idx = attrs.nha_id }; + UN_RLOCK(ctl); + CHT_SLIST_FIND_BYOBJ(&ctl->un_head, unhop, &key, unhop); + UN_RUNLOCK(ctl); + + if (unhop == NULL) + return (ESRCH); + dump_unhop(unhop, &wa.hdr, wa.nw); + return (0); + } + + UN_RLOCK(ctl); + 
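/*
 * Userland sketch (illustrative; constant names assumed to be exported as in
 * the Linux API, nla_put() is the same hypothetical helper as in the neighbor
 * example): creating user nexthop #10 via 192.0.2.1 on em0, which the handler
 * above turns into an unlinked kernel nexthop and announces on RTNLGRP_NEXTHOP.
 *
 *	struct {
 *		struct nlmsghdr	hdr;
 *		struct nhmsg	nhm;
 *		char		attrs[64];
 *	} req = {
 *		.hdr.nlmsg_type = RTM_NEWNEXTHOP,
 *		.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE,
 *		.nhm.nh_family = AF_INET,
 *		.nhm.nh_protocol = RTPROT_STATIC,
 *	};
 *	uint32_t id = 10, oif = if_nametoindex("em0");
 *	struct in_addr gw = { .s_addr = inet_addr("192.0.2.1") };
 *	int off = 0;
 *	off += nla_put(req.attrs + off, NHA_ID, &id, sizeof(id));
 *	off += nla_put(req.attrs + off, NHA_GATEWAY, &gw, sizeof(gw));
 *	off += nla_put(req.attrs + off, NHA_OIF, &oif, sizeof(oif));
 *	req.hdr.nlmsg_len = NLMSG_HDRLEN + sizeof(struct nhmsg) + off;
 *	send(fd, &req, req.hdr.nlmsg_len, 0);
 */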
wa.hdr.nlmsg_flags |= NLM_F_MULTI; + CHT_SLIST_FOREACH(&ctl->un_head, unhop, unhop) { + if (UNHOP_IS_MASTER(unhop) && match_unhop(&attrs, unhop)) + dump_unhop(unhop, &wa.hdr, wa.nw); + } CHT_SLIST_FOREACH_END; + UN_RUNLOCK(ctl); + + if (wa.error == 0) { + if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr)) + return (ENOMEM); + } + return (0); +} + +static const struct rtnl_cmd_handler cmd_handlers[] = { + { + .cmd = NL_RTM_NEWNEXTHOP, + .name = "RTM_NEWNEXTHOP", + .cb = &rtnl_handle_newnhop, + .priv = PRIV_NET_ROUTE, + }, + { + .cmd = NL_RTM_DELNEXTHOP, + .name = "RTM_DELNEXTHOP", + .cb = &rtnl_handle_delnhop, + .priv = PRIV_NET_ROUTE, + }, + { + .cmd = NL_RTM_GETNEXTHOP, + .name = "RTM_GETNEXTHOP", + .cb = &rtnl_handle_getnhop, + } +}; + +static const struct nlhdr_parser *all_parsers[] = { &nhmsg_parser }; + +void +rtnl_nexthops_init() +{ + NL_VERIFY_PARSERS(all_parsers); + rtnl_register_messages(cmd_handlers, NL_ARRAY_LEN(cmd_handlers)); +} diff --git a/sys/netlink/route/nexthop.h b/sys/netlink/route/nexthop.h new file mode 100644 index 000000000000..310c3e08fc4b --- /dev/null +++ b/sys/netlink/route/nexthop.h @@ -0,0 +1,102 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * NEXTHOP-related (RTM_NEXTHOP) message header and attributes. 
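 *
 * Dump sketch (illustrative): an RTM_GETNEXTHOP request with NLM_F_DUMP walks
 * all user-created nexthops; adding the flag attribute NHA_GROUPS limits the
 * answer to nexthop groups and NHA_OIF to nexthops using a given interface
 * (see the filter list below), e.g.
 *
 *	req.hdr.nlmsg_type = RTM_GETNEXTHOP;
 *	req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
 *	req.nhm.nh_family = AF_UNSPEC;
 *	off += nla_put_flag(req.attrs + off, NHA_GROUPS);	(hypothetical helper)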
+ */ + +#ifndef _NETLINK_ROUTE_NEXTHOP_H_ +#define _NETLINK_ROUTE_NEXTHOP_H_ + +/* Base header for all of the relevant messages */ +struct nhmsg { + unsigned char nh_family; /* transport family */ + unsigned char nh_scope; /* ignored on RX, filled by kernel */ + unsigned char nh_protocol; /* Routing protocol that installed nh */ + unsigned char resvd; + unsigned int nh_flags; /* RTNH_F_* flags from route.h */ +}; + +enum { + NHA_UNSPEC, + NHA_ID, /* u32: nexthop userland index, auto-assigned if 0 */ + NHA_GROUP, /* binary: array of struct nexthop_grp */ + NHA_GROUP_TYPE, /* u16: set to NEXTHOP_GRP_TYPE */ + NHA_BLACKHOLE, /* flag: nexthop used to blackhole packets */ + NHA_OIF, /* u32: transmit ifindex */ + NHA_GATEWAY, /* network: IPv4/IPv6 gateway addr */ + NHA_ENCAP_TYPE, /* not supported */ + NHA_ENCAP, /* not supported */ + NHA_GROUPS, /* flag: match nexthop groups */ + NHA_MASTER, /* not supported */ + NHA_FDB, /* not supported */ + NHA_RES_GROUP, /* not supported */ + NHA_RES_BUCKET, /* not supported */ + __NHA_MAX, +}; +#define NHA_MAX (__NHA_MAX - 1) + +/* + * Attributes that can be used as filters: + * NHA_ID (nexhop or group), NHA_OIF, NHA_GROUPS, + */ + +/* + * NHA_GROUP: array of the following structures. + * If attribute is set, the only other valid attributes are + * NHA_ID and NHA_GROUP_TYPE. + * NHA_RES_GROUP and NHA_RES_BUCKET are not supported yet + */ +struct nexthop_grp { + uint32_t id; /* nexhop userland index */ + uint8_t weight; /* weight of this nexthop */ + uint8_t resvd1; + uint16_t resvd2; +}; + +/* NHA_GROUP_TYPE: u16 */ +enum { + NEXTHOP_GRP_TYPE_MPATH, /* default nexthop group */ + NEXTHOP_GRP_TYPE_RES, /* resilient nexthop group */ + __NEXTHOP_GRP_TYPE_MAX, +}; +#define NEXTHOP_GRP_TYPE_MAX (__NEXTHOP_GRP_TYPE_MAX - 1) + + +/* NHA_RES_GROUP */ +enum { + NHA_RES_GROUP_UNSPEC, + NHA_RES_GROUP_PAD = NHA_RES_GROUP_UNSPEC, + NHA_RES_GROUP_BUCKETS, + NHA_RES_GROUP_IDLE_TIMER, + NHA_RES_GROUP_UNBALANCED_TIMER, + NHA_RES_GROUP_UNBALANCED_TIME, + __NHA_RES_GROUP_MAX, +}; +#define NHA_RES_GROUP_MAX (__NHA_RES_GROUP_MAX - 1) + +#endif diff --git a/sys/netlink/route/route.c b/sys/netlink/route/route.c new file mode 100644 index 000000000000..7573b371155e --- /dev/null +++ b/sys/netlink/route/route.c @@ -0,0 +1,972 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 Ng Peng Nam Sean + * Copyright (c) 2022 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); +#include "opt_inet.h" +#include "opt_inet6.h" +#include "opt_route.h" +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_MOD_NAME nl_route +#define DEBUG_MAX_LEVEL LOG_DEBUG3 +#include +_DECLARE_DEBUG(LOG_DEBUG); + +static unsigned char +get_rtm_type(const struct nhop_object *nh) +{ + int nh_flags = nh->nh_flags; + + /* Use the fact that nhg runtime flags are only NHF_MULTIPATH */ + if (nh_flags & NHF_BLACKHOLE) + return (RTN_BLACKHOLE); + else if (nh_flags & NHF_REJECT) + return (RTN_PROHIBIT); + return (RTN_UNICAST); +} + +static uint8_t +nl_get_rtm_protocol(const struct nhop_object *nh) +{ + if (NH_IS_NHGRP(nh)) { + const struct nhgrp_object *nhg = (const struct nhgrp_object *)nh; + uint8_t origin = nhgrp_get_origin(nhg); + if (origin != RTPROT_UNSPEC) + return (origin); + nh = nhg->nhops[0]; + } + uint8_t origin = nhop_get_origin(nh); + if (origin != RTPROT_UNSPEC) + return (origin); + /* TODO: remove guesswork once all kernel users fill in origin */ + int rt_flags = nhop_get_rtflags(nh); + if (rt_flags & RTF_PROTO1) + return (RTPROT_ZEBRA); + if (rt_flags & RTF_STATIC) + return (RTPROT_STATIC); + return (RTPROT_KERNEL); +} + +static int +get_rtmsg_type_from_rtsock(int cmd) +{ + switch (cmd) { + case RTM_ADD: + case RTM_CHANGE: + case RTM_GET: + return NL_RTM_NEWROUTE; + case RTM_DELETE: + return NL_RTM_DELROUTE; + } + + return (0); +} + +/* + * fibnum heuristics + * + * if (dump && rtm_table == 0 && !rta_table) RT_ALL_FIBS + * msg rtm_table RTA_TABLE result + * RTM_GETROUTE/dump 0 - RT_ALL_FIBS + * RTM_GETROUTE/dump 1 - 1 + * RTM_GETROUTE/get 0 - 0 + * + */ + +static struct nhop_object * +rc_get_nhop(const struct rib_cmd_info *rc) +{ + return ((rc->rc_cmd == RTM_DELETE) ? 
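/*
 * The table above, restated as a sketch (illustrative only; it merely spells
 * out the heuristics documented in the comment):
 *
 *	static int
 *	example_select_fibnum(bool dump, uint8_t rtm_table, bool has_rta_table,
 *	    uint32_t rta_table)
 *	{
 *		if (has_rta_table)
 *			return (rta_table);
 *		if (dump && rtm_table == 0)
 *			return (RT_ALL_FIBS);
 *		return (rtm_table);
 *	}
 */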
rc->rc_nh_old : rc->rc_nh_new); +} + +static void +dump_rc_nhop_gw(struct nl_writer *nw, const struct nhop_object *nh) +{ + int upper_family; + + switch (nhop_get_neigh_family(nh)) { + case AF_LINK: + /* onlink prefix, skip */ + break; + case AF_INET: + nlattr_add(nw, NL_RTA_GATEWAY, 4, &nh->gw4_sa.sin_addr); + break; + case AF_INET6: + upper_family = nhop_get_upper_family(nh); + if (upper_family == AF_INET6) { + nlattr_add(nw, NL_RTA_GATEWAY, 16, &nh->gw6_sa.sin6_addr); + } else if (upper_family == AF_INET) { + /* IPv4 over IPv6 */ + char buf[20]; + struct rtvia *via = (struct rtvia *)&buf[0]; + via->rtvia_family = AF_INET6; + memcpy(via->rtvia_addr, &nh->gw6_sa.sin6_addr, 16); + nlattr_add(nw, NL_RTA_VIA, 17, via); + } + break; + } +} + +static void +dump_rc_nhop_mtu(struct nl_writer *nw, const struct nhop_object *nh) +{ + int nla_len = sizeof(struct nlattr) * 2 + sizeof(uint32_t); + struct nlattr *nla = nlmsg_reserve_data(nw, nla_len, struct nlattr); + + if (nla == NULL) + return; + nla->nla_type = NL_RTA_METRICS; + nla->nla_len = nla_len; + nla++; + nla->nla_type = NL_RTAX_MTU; + nla->nla_len = sizeof(struct nlattr) + sizeof(uint32_t); + *((uint32_t *)(nla + 1)) = nh->nh_mtu; +} + +static void +dump_rc_nhg(struct nl_writer *nw, const struct nhgrp_object *nhg, struct rtmsg *rtm) +{ + uint32_t uidx = nhgrp_get_uidx(nhg); + uint32_t num_nhops; + const struct weightened_nhop *wn = nhgrp_get_nhops(nhg, &num_nhops); + uint32_t base_rtflags = nhop_get_rtflags(wn[0].nh); + + if (uidx != 0) + nlattr_add_u32(nw, NL_RTA_NH_ID, uidx); + + nlattr_add_u32(nw, NL_RTA_RTFLAGS, base_rtflags); + int off = nlattr_add_nested(nw, NL_RTA_MULTIPATH); + if (off == 0) + return; + + for (int i = 0; i < num_nhops; i++) { + int nh_off = nlattr_save_offset(nw); + struct rtnexthop *rtnh = nlmsg_reserve_object(nw, struct rtnexthop); + if (rtnh == NULL) + return; + rtnh->rtnh_flags = 0; + rtnh->rtnh_ifindex = wn[i].nh->nh_ifp->if_index; + rtnh->rtnh_hops = wn[i].weight; + dump_rc_nhop_gw(nw, wn[i].nh); + uint32_t rtflags = nhop_get_rtflags(wn[i].nh); + if (rtflags != base_rtflags) + nlattr_add_u32(nw, NL_RTA_RTFLAGS, rtflags); + if (rtflags & RTF_FIXEDMTU) + dump_rc_nhop_mtu(nw, wn[i].nh); + rtnh = nlattr_restore_offset(nw, nh_off, struct rtnexthop); + /* + * nlattr_add() allocates 4-byte aligned storage, no need to aligh + * length here + * */ + rtnh->rtnh_len = nlattr_save_offset(nw) - nh_off; + } + nlattr_set_len(nw, off); +} + +static void +dump_rc_nhop(struct nl_writer *nw, const struct nhop_object *nh, struct rtmsg *rtm) +{ + if (NH_IS_NHGRP(nh)) { + dump_rc_nhg(nw, (const struct nhgrp_object *)nh, rtm); + return; + } + + uint32_t rtflags = nhop_get_rtflags(nh); + + /* + * IPv4 over IPv6 + * ('RTA_VIA', {'family': 10, 'addr': 'fe80::20c:29ff:fe67:2dd'}), ('RTA_OIF', 2), + * IPv4 w/ gw + * ('RTA_GATEWAY', '172.16.107.131'), ('RTA_OIF', 2)], + * Direct route: + * ('RTA_OIF', 2) + */ + if (nh->nh_flags & NHF_GATEWAY) + dump_rc_nhop_gw(nw, nh); + + uint32_t uidx = nhop_get_uidx(nh); + if (uidx != 0) + nlattr_add_u32(nw, NL_RTA_NH_ID, uidx); + nlattr_add_u32(nw, NL_RTA_KNH_ID, nhop_get_idx(nh)); + nlattr_add_u32(nw, NL_RTA_RTFLAGS, rtflags); + + if (rtflags & RTF_FIXEDMTU) + dump_rc_nhop_mtu(nw, nh); + uint32_t nh_expire = nhop_get_expire(nh); + if (nh_expire > 0) + nlattr_add_u32(nw, NL_RTA_EXPIRES, nh_expire - time_uptime); + + /* In any case, fill outgoing interface */ + nlattr_add_u32(nw, NL_RTA_OIF, nh->nh_ifp->if_index); +} + +/* + * Dumps output from a rib command into an rtmsg + */ + +static int 
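/*
 * Resulting layout of dump_rc_nhop_mtu() above (illustrative byte accounting):
 * one nested NL_RTA_METRICS attribute carrying a single NL_RTAX_MTU u32,
 *
 *	struct nlattr { .nla_len = 12, .nla_type = NL_RTA_METRICS }
 *	    struct nlattr { .nla_len = 8, .nla_type = NL_RTAX_MTU }
 *	        uint32_t nh_mtu
 *
 * i.e. 2 * sizeof(struct nlattr) + sizeof(uint32_t) = 12 bytes, matching the
 * nla_len computed there.
 */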
+dump_px(uint32_t fibnum, const struct nlmsghdr *hdr, + const struct rtentry *rt, struct route_nhop_data *rnd, + struct nl_writer *nw) +{ + struct rtmsg *rtm; + int error = 0; + + NET_EPOCH_ASSERT(); + + if (!nlmsg_reply(nw, hdr, sizeof(struct rtmsg))) + goto enomem; + + int family = rt_get_family(rt); + int rtm_off = nlattr_save_offset(nw); + rtm = nlmsg_reserve_object(nw, struct rtmsg); + rtm->rtm_family = family; + rtm->rtm_dst_len = 0; + rtm->rtm_src_len = 0; + rtm->rtm_tos = 0; + if (fibnum < 255) + rtm->rtm_table = (unsigned char)fibnum; + rtm->rtm_scope = RT_SCOPE_UNIVERSE; + if (!NH_IS_NHGRP(rnd->rnd_nhop)) { + rtm->rtm_protocol = nl_get_rtm_protocol(rnd->rnd_nhop); + rtm->rtm_type = get_rtm_type(rnd->rnd_nhop); + } else { + rtm->rtm_protocol = RTPROT_UNSPEC; /* TODO: protocol from nhg? */ + rtm->rtm_type = RTN_UNICAST; + } + + nlattr_add_u32(nw, NL_RTA_TABLE, fibnum); + + int plen = 0; + uint32_t scopeid = 0; + switch (family) { + case AF_INET: + { + struct in_addr addr; + rt_get_inet_prefix_plen(rt, &addr, &plen, &scopeid); + nlattr_add(nw, NL_RTA_DST, 4, &addr); + break; + } + case AF_INET6: + { + struct in6_addr addr; + rt_get_inet6_prefix_plen(rt, &addr, &plen, &scopeid); + nlattr_add(nw, NL_RTA_DST, 16, &addr); + break; + } + default: + FIB_LOG(LOG_NOTICE, fibnum, family, "unsupported rt family: %d", family); + error = EAFNOSUPPORT; + goto flush; + } + + rtm = nlattr_restore_offset(nw, rtm_off, struct rtmsg); + if (plen > 0) + rtm->rtm_dst_len = plen; + dump_rc_nhop(nw, rnd->rnd_nhop, rtm); + + if (nlmsg_end(nw)) + return (0); +enomem: + error = ENOMEM; +flush: + nlmsg_abort(nw); + return (error); +} + +static int +family_to_group(int family) +{ + switch (family) { + case AF_INET: + return (RTNLGRP_IPV4_ROUTE); + case AF_INET6: + return (RTNLGRP_IPV6_ROUTE); + } + return (0); +} + + +static void +report_operation(uint32_t fibnum, struct rib_cmd_info *rc, + struct nlpcb *nlp, struct nlmsghdr *hdr) +{ + struct nl_writer nw; + + uint32_t group_id = family_to_group(rt_get_family(rc->rc_rt)); + if (nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, group_id)) { + struct route_nhop_data rnd = { + .rnd_nhop = rc_get_nhop(rc), + .rnd_weight = rc->rc_nh_weight, + }; + hdr->nlmsg_flags &= ~(NLM_F_REPLACE | NLM_F_CREATE); + hdr->nlmsg_flags &= ~(NLM_F_EXCL | NLM_F_APPEND); + switch (rc->rc_cmd) { + case RTM_ADD: + hdr->nlmsg_type = NL_RTM_NEWROUTE; + hdr->nlmsg_flags |= NLM_F_CREATE | NLM_F_EXCL; + break; + case RTM_CHANGE: + hdr->nlmsg_type = NL_RTM_NEWROUTE; + hdr->nlmsg_flags |= NLM_F_REPLACE; + break; + case RTM_DELETE: + hdr->nlmsg_type = NL_RTM_DELROUTE; + break; + } + dump_px(fibnum, hdr, rc->rc_rt, &rnd, &nw); + nlmsg_flush(&nw); + } + + rtsock_callback_p->route_f(fibnum, rc); +} + +struct rta_mpath_nh { + struct sockaddr *gw; + struct ifnet *ifp; + uint8_t rtnh_flags; + uint8_t rtnh_weight; +}; + +#define _IN(_field) offsetof(struct rtnexthop, _field) +#define _OUT(_field) offsetof(struct rta_mpath_nh, _field) +const static struct nlattr_parser nla_p_rtnh[] = { + { .type = NL_RTA_GATEWAY, .off = _OUT(gw), .cb = nlattr_get_ip }, + { .type = NL_RTA_VIA, .off = _OUT(gw), .cb = nlattr_get_ipvia }, +}; +const static struct nlfield_parser nlf_p_rtnh[] = { + { .off_in = _IN(rtnh_flags), .off_out = _OUT(rtnh_flags), .cb = nlf_get_u8 }, + { .off_in = _IN(rtnh_hops), .off_out = _OUT(rtnh_weight), .cb = nlf_get_u8 }, + { .off_in = _IN(rtnh_ifindex), .off_out = _OUT(ifp), .cb = nlf_get_ifpz }, +}; +#undef _IN +#undef _OUT +NL_DECLARE_PARSER(mpath_parser, struct rtnexthop, nlf_p_rtnh, 
nla_p_rtnh); + +struct rta_mpath { + int num_nhops; + struct rta_mpath_nh nhops[0]; +}; + +static int +nlattr_get_multipath(struct nlattr *nla, struct nl_pstate *npt, const void *arg, void *target) +{ + int data_len = nla->nla_len - sizeof(struct nlattr); + struct rtnexthop *rtnh; + + int max_nhops = data_len / sizeof(struct rtnexthop); + + struct rta_mpath *mp = npt_alloc(npt, (max_nhops + 2) * sizeof(struct rta_mpath_nh)); + mp->num_nhops = 0; + + for (rtnh = (struct rtnexthop *)(nla + 1); data_len > 0; ) { + struct rta_mpath_nh *mpnh = &mp->nhops[mp->num_nhops++]; + + int error = nl_parse_header(rtnh, rtnh->rtnh_len, &mpath_parser, + npt, mpnh); + if (error != 0) { + NLMSG_REPORT_ERR_MSG(npt, "RTA_MULTIPATH: nexthop %d: parse failed", + mp->num_nhops - 1); + return (error); + } + + int len = NL_ITEM_ALIGN(rtnh->rtnh_len); + data_len -= len; + rtnh = (struct rtnexthop *)((char *)rtnh + len); + } + if (data_len != 0 || mp->num_nhops == 0) { + NLMSG_REPORT_ERR_MSG(npt, "invalid RTA_MULTIPATH attr"); + return (EINVAL); + } + + *((struct rta_mpath **)target) = mp; + return (0); +} + + +struct nl_parsed_route { + struct sockaddr *rta_dst; + struct sockaddr *rta_gw; + struct ifnet *rta_oif; + struct rta_mpath *rta_multipath; + uint32_t rta_table; + uint32_t rta_rtflags; + uint32_t rta_nh_id; + uint32_t rtax_mtu; + uint8_t rtm_family; + uint8_t rtm_dst_len; +}; + +#define _IN(_field) offsetof(struct rtmsg, _field) +#define _OUT(_field) offsetof(struct nl_parsed_route, _field) +static struct nlattr_parser nla_p_rtmetrics[] = { + { .type = NL_RTAX_MTU, .off = _OUT(rtax_mtu), .cb = nlattr_get_uint32 }, +}; +NL_DECLARE_ATTR_PARSER(metrics_parser, nla_p_rtmetrics); + +static const struct nlattr_parser nla_p_rtmsg[] = { + { .type = NL_RTA_DST, .off = _OUT(rta_dst), .cb = nlattr_get_ip }, + { .type = NL_RTA_OIF, .off = _OUT(rta_oif), .cb = nlattr_get_ifp }, + { .type = NL_RTA_GATEWAY, .off = _OUT(rta_gw), .cb = nlattr_get_ip }, + { .type = NL_RTA_METRICS, .arg = &metrics_parser, .cb = nlattr_get_nested }, + { .type = NL_RTA_MULTIPATH, .off = _OUT(rta_multipath), .cb = nlattr_get_multipath }, + { .type = NL_RTA_RTFLAGS, .off = _OUT(rta_rtflags), .cb = nlattr_get_uint32 }, + { .type = NL_RTA_TABLE, .off = _OUT(rta_table), .cb = nlattr_get_uint32 }, + { .type = NL_RTA_VIA, .off = _OUT(rta_gw), .cb = nlattr_get_ipvia }, + { .type = NL_RTA_NH_ID, .off = _OUT(rta_nh_id), .cb = nlattr_get_uint32 }, +}; + +static const struct nlfield_parser nlf_p_rtmsg[] = { + {.off_in = _IN(rtm_family), .off_out = _OUT(rtm_family), .cb = nlf_get_u8 }, + {.off_in = _IN(rtm_dst_len), .off_out = _OUT(rtm_dst_len), .cb = nlf_get_u8 }, +}; +#undef _IN +#undef _OUT +NL_DECLARE_PARSER(rtm_parser, struct rtmsg, nlf_p_rtmsg, nla_p_rtmsg); + +struct netlink_walkargs { + struct nl_writer *nw; + struct route_nhop_data rnd; + struct nlmsghdr hdr; + struct nlpcb *nlp; + uint32_t fibnum; + int family; + int error; + int count; + int dumped; + int dumped_tables; +}; + +static int +dump_rtentry(struct rtentry *rt, void *_arg) +{ + struct netlink_walkargs *wa = (struct netlink_walkargs *)_arg; + int error; + + wa->count++; + if (wa->error != 0) + return (0); + wa->dumped++; + + rt_get_rnd(rt, &wa->rnd); + + error = dump_px(wa->fibnum, &wa->hdr, rt, &wa->rnd, wa->nw); + + IF_DEBUG_LEVEL(LOG_DEBUG3) { + char rtbuf[INET6_ADDRSTRLEN + 5]; + FIB_LOG(LOG_DEBUG3, wa->fibnum, wa->family, + "Dump %s, offset %u, error %d", + rt_print_buf(rt, rtbuf, sizeof(rtbuf)), + wa->nw->offset, error); + } + wa->error = error; + + return (0); +} + +static void
+dump_rtable_one(struct netlink_walkargs *wa, uint32_t fibnum, int family) +{ + FIB_LOG(LOG_DEBUG2, fibnum, family, "Start dump"); + wa->count = 0; + wa->dumped = 0; + + rib_walk(fibnum, family, false, dump_rtentry, wa); + + wa->dumped_tables++; + + FIB_LOG(LOG_DEBUG2, fibnum, family, "End dump, iterated %d dumped %d", + wa->count, wa->dumped); + NL_LOG(LOG_DEBUG2, "Current offset: %d", wa->nw->offset); +} + +static int +dump_rtable_fib(struct netlink_walkargs *wa, uint32_t fibnum, int family) +{ + wa->fibnum = fibnum; + + if (family == AF_UNSPEC) { + for (int i = 0; i < AF_MAX; i++) { + if (rt_tables_get_rnh(fibnum, i) != 0) { + wa->family = i; + dump_rtable_one(wa, fibnum, i); + if (wa->error != 0) + break; + } + } + } else { + if (rt_tables_get_rnh(fibnum, family) != 0) { + wa->family = family; + dump_rtable_one(wa, fibnum, family); + } + } + + return (wa->error); +} + +static int +handle_rtm_getroute(struct nlpcb *nlp, struct nl_parsed_route *attrs, + struct nlmsghdr *hdr, struct nl_pstate *npt) +{ + RIB_RLOCK_TRACKER; + struct rib_head *rnh; + struct rtentry *rt; + uint32_t fibnum = attrs->rta_table; + sa_family_t family = attrs->rtm_family; + + if (attrs->rta_dst == NULL) { + NLMSG_REPORT_ERR_MSG(npt, "No RTA_DST supplied"); + return (EINVAL); + } + + FIB_LOG(LOG_DEBUG, fibnum, family, "getroute called"); + + rnh = rt_tables_get_rnh(fibnum, family); + if (rnh == NULL) + return (EAFNOSUPPORT); + + RIB_RLOCK(rnh); + + rt = (struct rtentry *)rnh->rnh_matchaddr(attrs->rta_dst, &rnh->head); + if (rt == NULL) { + RIB_RUNLOCK(rnh); + return (ESRCH); + } + + struct route_nhop_data rnd; + rt_get_rnd(rt, &rnd); + rnd.rnd_nhop = nhop_select_func(rnd.rnd_nhop, 0); + + RIB_RUNLOCK(rnh); + + IF_DEBUG_LEVEL(LOG_DEBUG2) { + char rtbuf[NHOP_PRINT_BUFSIZE] __unused, nhbuf[NHOP_PRINT_BUFSIZE] __unused; + FIB_LOG(LOG_DEBUG2, fibnum, family, "getroute completed: got %s for %s", + nhop_print_buf_any(rnd.rnd_nhop, nhbuf, sizeof(nhbuf)), + rt_print_buf(rt, rtbuf, sizeof(rtbuf))); + } + + hdr->nlmsg_type = NL_RTM_NEWROUTE; + dump_px(fibnum, hdr, rt, &rnd, npt->nw); + + return (0); +} + +static int +handle_rtm_dump(struct nlpcb *nlp, uint32_t fibnum, int family, + struct nlmsghdr *hdr, struct nl_writer *nw) +{ + struct netlink_walkargs wa = { + .nlp = nlp, + .nw = nw, + .hdr.nlmsg_pid = hdr->nlmsg_pid, + .hdr.nlmsg_seq = hdr->nlmsg_seq, + .hdr.nlmsg_type = NL_RTM_NEWROUTE, + .hdr.nlmsg_flags = hdr->nlmsg_flags | NLM_F_MULTI, + }; + + if (fibnum == RT_TABLE_UNSPEC) { + for (int i = 0; i < V_rt_numfibs; i++) { + dump_rtable_fib(&wa, i, family); + if (wa.error != 0) + break; + } + } else + dump_rtable_fib(&wa, fibnum, family); + + if (wa.error == 0 && wa.dumped_tables == 0) { + FIB_LOG(LOG_DEBUG, fibnum, family, "incorrect fibnum/family"); + wa.error = ESRCH; + // How do we propagate it? + } + + if (!nlmsg_end_dump(wa.nw, wa.error, &wa.hdr)) { + NL_LOG(LOG_DEBUG, "Unable to finalize the dump"); + return (ENOMEM); + } + + return (wa.error); +} + +static struct nhop_object * +finalize_nhop(struct nhop_object *nh, int *perror) +{ + /* + * The following MUST be filled: + * nh_ifp, nh_ifa, nh_gw + */ + if (nh->gw_sa.sa_family == 0) { + /* + * Empty gateway. Can be direct route with RTA_OIF set.
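+ * For example, a connected (interface) route added with just RTA_DST and + * RTA_OIF lands here; the gateway is then derived from the interface itself.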
+ */ + if (nh->nh_ifp != NULL) + nhop_set_direct_gw(nh, nh->nh_ifp); + else { + NL_LOG(LOG_DEBUG, "empty gateway and interface, skipping"); + *perror = EINVAL; + return (NULL); + } + /* Both nh_ifp and gateway are set */ + } else { + /* Gateway is set up, we can derive ifp if not set */ + if (nh->nh_ifp == NULL) { + struct ifaddr *ifa = ifa_ifwithnet(&nh->gw_sa, 1, nhop_get_fibnum(nh)); + if (ifa == NULL) { + NL_LOG(LOG_DEBUG, "Unable to determine ifp, skipping"); + *perror = EINVAL; + return (NULL); + } + nhop_set_transmit_ifp(nh, ifa->ifa_ifp); + } + } + /* Both nh_ifp and gateway are set */ + if (nh->nh_ifa == NULL) { + struct ifaddr *ifa = ifaof_ifpforaddr(&nh->gw_sa, nh->nh_ifp); + if (ifa == NULL) { + NL_LOG(LOG_DEBUG, "Unable to determine ifa, skipping"); + *perror = EINVAL; + return (NULL); + } + nhop_set_src(nh, ifa); + } + + return (nhop_get_nhop(nh, perror)); +} + +static int +get_pxflag(const struct nl_parsed_route *attrs) +{ + int pxflag = 0; + switch (attrs->rtm_family) { + case AF_INET: + if (attrs->rtm_dst_len == 32) + pxflag = NHF_HOST; + else if (attrs->rtm_dst_len == 0) + pxflag = NHF_DEFAULT; + break; + case AF_INET6: + if (attrs->rtm_dst_len == 128) + pxflag = NHF_HOST; + else if (attrs->rtm_dst_len == 0) + pxflag = NHF_DEFAULT; + break; + } + + return (pxflag); +} + +static int +get_op_flags(int nlm_flags) +{ + int op_flags = 0; + + op_flags |= (nlm_flags & NLM_F_REPLACE) ? RTM_F_REPLACE : 0; + op_flags |= (nlm_flags & NLM_F_EXCL) ? RTM_F_EXCL : 0; + op_flags |= (nlm_flags & NLM_F_CREATE) ? RTM_F_CREATE : 0; + op_flags |= (nlm_flags & NLM_F_APPEND) ? RTM_F_APPEND : 0; + + return (op_flags); +} + +static int +create_nexthop_one(struct nl_parsed_route *attrs, struct rta_mpath_nh *mpnh, + struct nl_pstate *npt, struct nhop_object **pnh) +{ + int error; + + if (mpnh->gw == NULL) + return (EINVAL); + + struct nhop_object *nh = nhop_alloc(attrs->rta_table, attrs->rtm_family); + if (nh == NULL) + return (ENOMEM); + + nhop_set_gw(nh, mpnh->gw, true); + if (mpnh->ifp != NULL) + nhop_set_transmit_ifp(nh, mpnh->ifp); + nhop_set_rtflags(nh, attrs->rta_rtflags); + + *pnh = finalize_nhop(nh, &error); + + return (error); +} + +static struct nhop_object * +create_nexthop_from_attrs(struct nl_parsed_route *attrs, + struct nl_pstate *npt, int *perror) +{ + struct nhop_object *nh; + int error = 0; + + if (attrs->rta_multipath != NULL) { + /* Multipath w/o explicit nexthops */ + int num_nhops = attrs->rta_multipath->num_nhops; + struct weightened_nhop *wn = npt_alloc(npt, sizeof(*wn) * num_nhops); + nh = NULL; + + for (int i = 0; i < num_nhops; i++) { + struct rta_mpath_nh *mpnh = &attrs->rta_multipath->nhops[i]; + + error = create_nexthop_one(attrs, mpnh, npt, &wn[i].nh); + if (error != 0) { + for (int j = 0; j < i; j++) + nhop_free(wn[j].nh); + break; + } + wn[i].weight = mpnh->rtnh_weight > 0 ?
mpnh->rtnh_weight : 1; + } + if (error == 0) { + struct rib_head *rh = nhop_get_rh(wn[0].nh); + + error = nhgrp_get_group(rh, wn, num_nhops, 0, + (struct nhgrp_object **)&nh); + + for (int i = 0; i < num_nhops; i++) + nhop_free(wn[i].nh); + } + *perror = error; + } else { + nh = nhop_alloc(attrs->rta_table, attrs->rtm_family); + if (nh == NULL) { + *perror = ENOMEM; + return (NULL); + } + if (attrs->rta_gw != NULL) + nhop_set_gw(nh, attrs->rta_gw, true); + if (attrs->rta_oif != NULL) + nhop_set_transmit_ifp(nh, attrs->rta_oif); + if (attrs->rtax_mtu != 0) + nhop_set_mtu(nh, attrs->rtax_mtu, true); + if (attrs->rta_rtflags & RTF_BROADCAST) + nhop_set_broadcast(nh, true); + if (attrs->rta_rtflags & RTF_BLACKHOLE) + nhop_set_blackhole(nh, NHF_BLACKHOLE); + if (attrs->rta_rtflags & RTF_REJECT) + nhop_set_blackhole(nh, NHF_REJECT); + nhop_set_rtflags(nh, attrs->rta_rtflags); + nh = finalize_nhop(nh, perror); + } + + return (nh); +} + +static int +rtnl_handle_newroute(struct nlmsghdr *hdr, struct nlpcb *nlp, + struct nl_pstate *npt) +{ + struct rib_cmd_info rc = {}; + struct nhop_object *nh = NULL; + int error; + + struct nl_parsed_route attrs = {}; + error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs); + if (error != 0) + return (error); + + /* Check if we have enough data */ + if (attrs.rta_dst == NULL) { + NL_LOG(LOG_DEBUG, "missing RTA_DST"); + return (EINVAL); + } + + if (attrs.rta_nh_id != 0) { + /* Referenced uindex */ + int pxflag = get_pxflag(&attrs); + nh = nl_find_nhop(attrs.rta_table, attrs.rtm_family, attrs.rta_nh_id, + pxflag, &error); + if (error != 0) + return (error); + } else { + nh = create_nexthop_from_attrs(&attrs, npt, &error); + if (error != 0) { + NL_LOG(LOG_DEBUG, "Error creating nexthop"); + return (error); + } + } + + int weight = NH_IS_NHGRP(nh) ? 
0 : RT_DEFAULT_WEIGHT; + struct route_nhop_data rnd = { .rnd_nhop = nh, .rnd_weight = weight }; + int op_flags = get_op_flags(hdr->nlmsg_flags); + + error = rib_add_route_px(attrs.rta_table, attrs.rta_dst, attrs.rtm_dst_len, + &rnd, op_flags, &rc); + if (error == 0) + report_operation(attrs.rta_table, &rc, nlp, hdr); + return (error); +} + +static int +path_match_func(const struct rtentry *rt, const struct nhop_object *nh, void *_data) +{ + struct nl_parsed_route *attrs = (struct nl_parsed_route *)_data; + + if ((attrs->rta_gw != NULL) && !rib_match_gw(rt, nh, attrs->rta_gw)) + return (0); + + if ((attrs->rta_oif != NULL) && (attrs->rta_oif != nh->nh_ifp)) + return (0); + + return (1); +} + +static int +rtnl_handle_delroute(struct nlmsghdr *hdr, struct nlpcb *nlp, + struct nl_pstate *npt) +{ + struct rib_cmd_info rc; + int error; + + struct nl_parsed_route attrs = {}; + error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs); + if (error != 0) + return (error); + + if (attrs.rta_dst == NULL) { + NLMSG_REPORT_ERR_MSG(npt, "RTA_DST is not set"); + return (ESRCH); + } + + error = rib_del_route_px(attrs.rta_table, attrs.rta_dst, + attrs.rtm_dst_len, path_match_func, &attrs, 0, &rc); + if (error == 0) + report_operation(attrs.rta_table, &rc, nlp, hdr); + return (error); +} + +static int +rtnl_handle_getroute(struct nlmsghdr *hdr, struct nlpcb *nlp, struct nl_pstate *npt) +{ + int error; + + struct nl_parsed_route attrs = {}; + error = nl_parse_nlmsg(hdr, &rtm_parser, npt, &attrs); + if (error != 0) + return (error); + + if (hdr->nlmsg_flags & NLM_F_DUMP) + error = handle_rtm_dump(nlp, attrs.rta_table, attrs.rtm_family, hdr, npt->nw); + else + error = handle_rtm_getroute(nlp, &attrs, hdr, npt); + + return (error); +} + +void +rtnl_handle_route_event(uint32_t fibnum, const struct rib_cmd_info *rc) +{ + int family, nlm_flags = 0; + + struct nl_writer nw; + + family = rt_get_family(rc->rc_rt); + + /* XXX: check if there are active listeners first */ + + /* TODO: consider passing PID/type/seq */ + switch (rc->rc_cmd) { + case RTM_ADD: + nlm_flags = NLM_F_EXCL | NLM_F_CREATE; + break; + case RTM_CHANGE: + nlm_flags = NLM_F_REPLACE; + break; + case RTM_DELETE: + nlm_flags = 0; + break; + } + IF_DEBUG_LEVEL(LOG_DEBUG2) { + char rtbuf[NHOP_PRINT_BUFSIZE] __unused; + FIB_LOG(LOG_DEBUG2, fibnum, family, + "received event %s for %s / nlm_flags=%X", + rib_print_cmd(rc->rc_cmd), + rt_print_buf(rc->rc_rt, rtbuf, sizeof(rtbuf)), + nlm_flags); + } + + struct nlmsghdr hdr = { + .nlmsg_flags = nlm_flags, + .nlmsg_type = get_rtmsg_type_from_rtsock(rc->rc_cmd), + }; + + struct route_nhop_data rnd = { + .rnd_nhop = rc_get_nhop(rc), + .rnd_weight = rc->rc_nh_weight, + }; + + uint32_t group_id = family_to_group(family); + if (!nlmsg_get_group_writer(&nw, NLMSG_SMALL, NETLINK_ROUTE, group_id)) { + NL_LOG(LOG_DEBUG, "error allocating event buffer"); + return; + } + + dump_px(fibnum, &hdr, rc->rc_rt, &rnd, &nw); + nlmsg_flush(&nw); +} + +static const struct rtnl_cmd_handler cmd_handlers[] = { + { + .cmd = NL_RTM_GETROUTE, + .name = "RTM_GETROUTE", + .cb = &rtnl_handle_getroute, + }, + { + .cmd = NL_RTM_DELROUTE, + .name = "RTM_DELROUTE", + .cb = &rtnl_handle_delroute, + .priv = PRIV_NET_ROUTE, + }, + { + .cmd = NL_RTM_NEWROUTE, + .name = "RTM_NEWROUTE", + .cb = &rtnl_handle_newroute, + .priv = PRIV_NET_ROUTE, + } +}; + +static const struct nlhdr_parser *all_parsers[] = {&mpath_parser, &metrics_parser, &rtm_parser}; + +void +rtnl_routes_init() +{ + NL_VERIFY_PARSERS(all_parsers); + rtnl_register_messages(cmd_handlers, 
NL_ARRAY_LEN(cmd_handlers)); +} diff --git a/sys/netlink/route/route.h b/sys/netlink/route/route.h new file mode 100644 index 000000000000..6e1ef6cbf0c6 --- /dev/null +++ b/sys/netlink/route/route.h @@ -0,0 +1,366 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Route-related (RTM_NEWROUTE/RTM_DELROUTE/RTM_GETROUTE) message header and attributes. + */ + +#ifndef _NETLINK_ROUTE_ROUTE_H_ +#define _NETLINK_ROUTE_ROUTE_H_ + +/* Base header for all of the relevant messages */ +struct rtmsg { + unsigned char rtm_family; /* address family */ + unsigned char rtm_dst_len; /* Prefix length */ + unsigned char rtm_src_len; /* Source prefix length (not used) */ + unsigned char rtm_tos; /* Type of service (not used) */ + unsigned char rtm_table; /* rtable id */ + unsigned char rtm_protocol; /* Routing protocol id (RTPROT_) */ + unsigned char rtm_scope; /* Route distance (RT_SCOPE_) */ + unsigned char rtm_type; /* Route type (RTN_) */ + unsigned rtm_flags; /* Route flags (RTM_F_) */ +}; + +/* + * RFC 3549, 3.1.1, route type (rtm_type field). + */ +enum { + RTN_UNSPEC, + RTN_UNICAST, /* Unicast route */ + RTN_LOCAL, /* Accept locally (not supported) */ + RTN_BROADCAST, /* Accept locally as broadcast, send as broadcast */ + RTN_ANYCAST, /* Accept locally as broadcast, but send as unicast */ + RTN_MULTICAST, /* Multicast route */ + RTN_BLACKHOLE, /* Drop traffic towards destination */ + RTN_UNREACHABLE, /* Destination is unreachable */ + RTN_PROHIBIT, /* Administratively prohibited */ + RTN_THROW, /* Not in this table (not supported) */ + RTN_NAT, /* Translate this address (not supported) */ + RTN_XRESOLVE, /* Use external resolver (not supported) */ + __RTN_MAX, +}; +#define RTN_MAX (__RTN_MAX - 1) + +/* + * RFC 3549, 3.1.1, protocol (Identifies what/who added the route). + * Values larger than RTPROT_STATIC(4) are not interpreted by the + * kernel; they are just for user information.
+ */ +#define RTPROT_UNSPEC 0 +#define RTPROT_REDIRECT 1 /* Route installed by ICMP redirect */ +#define RTPROT_KERNEL 2 /* Route installed by kernel */ +#define RTPROT_BOOT 3 /* Route installed during boot */ +#define RTPROT_STATIC 4 /* Route installed by administrator */ + +#define RTPROT_GATED 8 +#define RTPROT_RA 9 +#define RTPROT_MRT 10 +#define RTPROT_ZEBRA 11 +#define RTPROT_BIRD 12 +#define RTPROT_DNROUTED 13 +#define RTPROT_XORP 14 +#define RTPROT_NTK 15 +#define RTPROT_DHCP 16 +#define RTPROT_MROUTED 17 +#define RTPROT_KEEPALIVED 18 +#define RTPROT_BABEL 42 +#define RTPROT_OPENR 99 +#define RTPROT_BGP 186 +#define RTPROT_ISIS 187 +#define RTPROT_OSPF 188 +#define RTPROT_RIP 189 +#define RTPROT_EIGRP 192 + +/* + * RFC 3549 3.1.1 Route scope (valid distance to destination). + * + * The values between RT_SCOPE_UNIVERSE(0) and RT_SCOPE_SITE(200) + * are available to the user. + */ +enum rt_scope_t { + RT_SCOPE_UNIVERSE = 0, + /* User defined values */ + RT_SCOPE_SITE = 200, + RT_SCOPE_LINK = 253, + RT_SCOPE_HOST = 254, + RT_SCOPE_NOWHERE = 255 +}; + +/* + * RFC 3549 3.1.1 Route flags (rtm_flags). + * It is a composition of RTNH_F flags (0x1..0x40 range), RTM_F flags (below) + * and per-protocol (IPv4/IPv6) flags. + */ +#define RTM_F_NOTIFY 0x00000100 /* not supported */ +#define RTM_F_CLONED 0x00000200 /* not supported */ +#define RTM_F_EQUALIZE 0x00000400 /* not supported */ +#define RTM_F_PREFIX 0x00000800 /* not supported */ +#define RTM_F_LOOKUP_TABLE 0x00001000 /* not supported */ +#define RTM_F_FIB_MATCH 0x00002000 /* not supported */ +#define RTM_F_OFFLOAD 0x00004000 /* not supported */ +#define RTM_F_TRAP 0x00008000 /* not supported */ +#define RTM_F_OFFLOAD_FAILED 0x20000000 /* not supported */ + +/* Compatibility handling helpers */ +#ifndef _KERNEL +#define NL_RTM_HDRLEN ((int)sizeof(struct rtmsg)) +#define RTM_RTA(_rtm) ((struct rtattr *)((char *)(_rtm) + NL_RTM_HDRLEN)) +#define RTM_PAYLOAD(_hdr) NLMSG_PAYLOAD((_hdr), NL_RTM_HDRLEN) +#endif + +/* + * Routing table identifiers. + * FreeBSD route table numbering starts from 0, where 0 is a valid default routing table. + * Indicating "all tables" via netlink can be done by not including the RTA_TABLE attribute + * and keeping rtm_table=0 (compatibility) or by setting the RTA_TABLE value to RT_TABLE_UNSPEC.
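+ * For example, a request carrying RTA_TABLE=2 is scoped to fib 2, while + * RTA_TABLE=RT_TABLE_UNSPEC asks a dump to walk every configured fib.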
+ */ +#define RT_TABLE_MAIN 0 /* RT_DEFAULT_FIB */ +#define RT_TABLE_UNSPEC 0xFFFFFFFF /* RT_ALL_FIBS */ + +enum rtattr_type_t { + NL_RTA_UNSPEC, + NL_RTA_DST = 1, /* binary, IPv4/IPv6 destination */ + NL_RTA_SRC = 2, /* binary, preferred source address */ + NL_RTA_IIF = 3, /* not supported */ + NL_RTA_OIF = 4, /* u32, transmit ifindex */ + NL_RTA_GATEWAY = 5, /* binary: IPv4/IPv6 gateway */ + NL_RTA_PRIORITY = 6, /* not supported */ + NL_RTA_PREFSRC = 7, /* not supported */ + NL_RTA_METRICS = 8, /* nested, list of NL_RTAX* attrs */ + NL_RTA_MULTIPATH = 9, /* binary, array of struct rtnexthop */ + NL_RTA_PROTOINFO = 10, /* not supported / deprecated */ + NL_RTA_KNH_ID = 10, /* u32, FreeBSD specific, kernel nexthop index */ + NL_RTA_FLOW = 11, /* not supported */ + NL_RTA_CACHEINFO = 12, /* not supported */ + NL_RTA_SESSION = 13, /* not supported / deprecated */ + NL_RTA_MP_ALGO = 14, /* not supported / deprecated */ + NL_RTA_RTFLAGS = 14, /* u32, FreeBSD specific, */ + NL_RTA_TABLE = 15, /* u32, fibnum */ + NL_RTA_MARK = 16, /* not supported */ + NL_RTA_MFC_STATS = 17, /* not supported */ + NL_RTA_VIA = 18, /* binary, struct rtvia */ + NL_RTA_NEWDST = 19, /* not supported */ + NL_RTA_PREF = 20, /* not supported */ + NL_RTA_ENCAP_TYPE = 21, /* not supported */ + NL_RTA_ENCAP = 22, /* not supported */ + NL_RTA_EXPIRES = 23, /* u32, seconds till expiration */ + NL_RTA_PAD = 24, /* not supported */ + NL_RTA_UID = 25, /* not supported */ + NL_RTA_TTL_PROPAGATE = 26, /* not supported */ + NL_RTA_IP_PROTO = 27, /* not supported */ + NL_RTA_SPORT = 28, /* not supported */ + NL_RTA_DPORT = 29, /* not supported */ + NL_RTA_NH_ID = 30, /* u32, nexthop/nexthop group index */ + __RTA_MAX +}; +#define NL_RTA_MAX (__RTA_MAX - 1) + +/* + * Attributes that can be used as filters: + * + */ + +#ifndef _KERNEL +/* + * RTA_* space has clashes with rtsock namespace. + * Use NL_RTA_ prefix in the kernel and map to + * RTA_ for userland. 
+ */ +#define RTA_UNSPEC NL_RTA_UNSPEC +#define RTA_DST NL_RTA_DST +#define RTA_SRC NL_RTA_SRC +#define RTA_IIF NL_RTA_IIF +#define RTA_OIF NL_RTA_OIF +#define RTA_GATEWAY NL_RTA_GATEWAY +#define RTA_PRIORITY NL_RTA_PRIORITY +#define RTA_PREFSRC NL_RTA_PREFSRC +#define RTA_METRICS NL_RTA_METRICS +#define RTA_MULTIPATH NL_RTA_MULTIPATH +#define RTA_PROTOINFO NL_RTA_PROTOINFO +#define RTA_KNH_ID NL_RTA_KNH_ID +#define RTA_FLOW NL_RTA_FLOW +#define RTA_CACHEINFO NL_RTA_CACHEINFO +#define RTA_SESSION NL_RTA_SESSION +#define RTA_MP_ALGO NL_RTA_MP_ALGO +#define RTA_TABLE NL_RTA_TABLE +#define RTA_MARK NL_RTA_MARK +#define RTA_MFC_STATS NL_RTA_MFC_STATS +#define RTA_VIA NL_RTA_VIA +#define RTA_NEWDST NL_RTA_NEWDST +#define RTA_PREF NL_RTA_PREF +#define RTA_ENCAP_TYPE NL_RTA_ENCAP_TYPE +#define RTA_ENCAP NL_RTA_ENCAP +#define RTA_EXPIRES NL_RTA_EXPIRES +#define RTA_PAD NL_RTA_PAD +#define RTA_UID NL_RTA_UID +#define RTA_TTL_PROPAGATE NL_RTA_TTL_PROPAGATE +#define RTA_IP_PROTO NL_RTA_IP_PROTO +#define RTA_SPORT NL_RTA_SPORT +#define RTA_DPORT NL_RTA_DPORT +#define RTA_NH_ID NL_RTA_NH_ID +#define RTA_MAX NL_RTA_MAX +#endif + +/* route attribute header */ +struct rtattr { + unsigned short rta_len; + unsigned short rta_type; +}; + +#define NL_RTA_ALIGN_SIZE NL_ITEM_ALIGN_SIZE +#define NL_RTA_ALIGN NL_ITEM_ALIGN +#define NL_RTA_HDRLEN ((int)sizeof(struct rtattr)) +#define NL_RTA_DATA_LEN(_rta) ((int)((_rta)->rta_len - NL_RTA_HDRLEN)) +#define NL_RTA_DATA(_rta) NL_ITEM_DATA(_rta, NL_RTA_HDRLEN) +#define NL_RTA_DATA_CONST(_rta) NL_ITEM_DATA_CONST(_rta, NL_RTA_HDRLEN) + +/* Compatibility attribute handling helpers */ +#ifndef _KERNEL +#define RTA_ALIGNTO NL_RTA_ALIGN_SIZE +#define RTA_ALIGN(_len) NL_RTA_ALIGN(_len) +#define _RTA_LEN(_rta) ((int)(_rta)->rta_len) +#define _RTA_ALIGNED_LEN(_rta) RTA_ALIGN(_RTA_LEN(_rta)) +#define RTA_OK(_rta, _len) NL_ITEM_OK(_rta, _len, NL_RTA_HDRLEN, _RTA_LEN) +#define RTA_NEXT(_rta, _len) NL_ITEM_ITER(_rta, _len, _RTA_ALIGNED_LEN) +#define RTA_LENGTH(_len) (NL_RTA_HDRLEN + (_len)) +#define RTA_SPACE(_len) RTA_ALIGN(RTA_LENGTH(_len)) +#define RTA_DATA(_rta) NL_RTA_DATA(_rta) +#define RTA_PAYLOAD(_rta) ((int)(_RTA_LEN(_rta) - NL_RTA_HDRLEN)) +#endif + +/* RTA attribute headers */ + +/* RTA_VIA */ +struct rtvia { + sa_family_t rtvia_family; + uint8_t rtvia_addr[0]; +}; + +/* + * RTA_METRICS is a nested attribute, consisting of a list of + * TLVs with types defined below. 
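+ * For example, a route with a fixed MTU carries RTA_METRICS { NL_RTAX_MTU (u32) }.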
+ */ +enum { + NL_RTAX_UNSPEC, + NL_RTAX_LOCK = 1, /* not supported */ + NL_RTAX_MTU = 2, /* desired path MTU */ + NL_RTAX_WINDOW = 3, /* not supported */ + NL_RTAX_RTT = 4, /* not supported */ + NL_RTAX_RTTVAR = 5, /* not supported */ + NL_RTAX_SSTHRESH = 6, /* not supported */ + NL_RTAX_CWND = 7, /* not supported */ + NL_RTAX_ADVMSS = 8, /* not supported */ + NL_RTAX_REORDERING = 9, /* not supported */ + NL_RTAX_HOPLIMIT = 10, /* not supported */ + NL_RTAX_INITCWND = 11, /* not supported */ + NL_RTAX_FEATURES = 12, /* not supported */ + NL_RTAX_RTO_MIN = 13, /* not supported */ + NL_RTAX_INITRWND = 14, /* not supported */ + NL_RTAX_QUICKACK = 15, /* not supported */ + NL_RTAX_CC_ALGO = 16, /* not supported */ + NL_RTAX_FASTOPEN_NO_COOKIE = 17, /* not supported */ + __NL_RTAX_MAX +}; +#define NL_RTAX_MAX (__NL_RTAX_MAX - 1) + +#define RTAX_FEATURE_ECN (1 << 0) +#define RTAX_FEATURE_SACK (1 << 1) +#define RTAX_FEATURE_TIMESTAMP (1 << 2) +#define RTAX_FEATURE_ALLFRAG (1 << 3) + +#define RTAX_FEATURE_MASK \ + (RTAX_FEATURE_ECN | RTAX_FEATURE_SACK | RTAX_FEATURE_TIMESTAMP | \ + RTAX_FEATURE_ALLFRAG) + +#ifndef _KERNEL + +/* + * RTAX_* space clashes with rtsock namespace. + * Use NL_RTAX_ prefix in the kernel and map to + * RTAX_ for userland. + */ +#define RTAX_UNSPEC NL_RTAX_UNSPEC +#define RTAX_LOCK NL_RTAX_LOCK +#define RTAX_MTU NL_RTAX_MTU +#define RTAX_WINDOW NL_RTAX_WINDOW +#define RTAX_RTT NL_RTAX_RTT +#define RTAX_RTTVAR NL_RTAX_RTTVAR +#define RTAX_SSTHRESH NL_RTAX_SSTHRESH +#define RTAX_CWND NL_RTAX_CWND +#define RTAX_ADVMSS NL_RTAX_ADVMSS +#define RTAX_REORDERING NL_RTAX_REORDERING +#define RTAX_HOPLIMIT NL_RTAX_HOPLIMIT +#define RTAX_INITCWND NL_RTAX_INITCWND +#define RTAX_FEATURES NL_RTAX_FEATURES +#define RTAX_RTO_MIN NL_RTAX_RTO_MIN +#define RTAX_INITRWND NL_RTAX_INITRWND +#define RTAX_QUICKACK NL_RTAX_QUICKACK +#define RTAX_CC_ALGO NL_RTAX_CC_ALGO +#define RTAX_FASTOPEN_NO_COOKIE NL_RTAX_FASTOPEN_NO_COOKIE +#endif + +/* + * RTA_MULTIPATH consists of an array of rtnexthop structures. + * Each rtnexthop structure may contain an RTA_GATEWAY or RTA_VIA + * attribute following the header.
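+ * Example layout of a two-nexthop group: + * RTA_MULTIPATH { rtnexthop, RTA_GATEWAY, rtnexthop, RTA_GATEWAY }, + * with rtnh_hops carrying the relative weight of each nexthop.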
+ */ +struct rtnexthop { + unsigned short rtnh_len; + unsigned char rtnh_flags; + unsigned char rtnh_hops; /* nexthop weight */ + int rtnh_ifindex; +}; + +/* rtnh_flags */ +#define RTNH_F_DEAD 0x01 /* not supported */ +#define RTNH_F_PERVASIVE 0x02 /* not supported */ +#define RTNH_F_ONLINK 0x04 /* not supported */ +#define RTNH_F_OFFLOAD 0x08 /* not supported */ +#define RTNH_F_LINKDOWN 0x10 /* not supported */ +#define RTNH_F_UNRESOLVED 0x20 /* not supported */ +#define RTNH_F_TRAP 0x40 /* not supported */ + +#define RTNH_COMPARE_MASK (RTNH_F_DEAD | RTNH_F_LINKDOWN | \ + RTNH_F_OFFLOAD | RTNH_F_TRAP) + +/* Macros to handle nexthops */ +#define RTNH_ALIGNTO NL_ITEM_ALIGN_SIZE +#define RTNH_ALIGN(_len) NL_ITEM_ALIGN(_len) +#define RTNH_HDRLEN ((int)sizeof(struct rtnexthop)) +#define _RTNH_LEN(_nh) ((int)(_nh)->rtnh_len) +#define _RTNH_ALIGNED_LEN(_nh) RTNH_ALIGN(_RTNH_LEN(_nh)) +#define RTNH_OK(_nh, _len) NL_ITEM_OK(_nh, _len, RTNH_HDRLEN, _RTNH_LEN) +#define RTNH_NEXT(_nh) ((struct rtnexthop *)((char *)(_nh) + _RTNH_ALIGNED_LEN(_nh))) +#define RTNH_LENGTH(_len) (RTNH_HDRLEN + (_len)) +#define RTNH_SPACE(_len) RTNH_ALIGN(RTNH_LENGTH(_len)) +#define RTNH_DATA(_nh) ((struct rtattr *)NL_ITEM_DATA(_nh, RTNH_HDRLEN)) + +struct rtgenmsg { + unsigned char rtgen_family; +}; + +#endif diff --git a/sys/netlink/route/route_var.h b/sys/netlink/route/route_var.h new file mode 100644 index 000000000000..7a31a8c896a5 --- /dev/null +++ b/sys/netlink/route/route_var.h @@ -0,0 +1,101 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2022 Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE.
+ */ + +/* + * This file contains definitions shared among the NETLINK_ROUTE family + */ +#ifndef _NETLINK_ROUTE_ROUTE_VAR_H_ +#define _NETLINK_ROUTE_ROUTE_VAR_H_ + +#include <sys/priv.h> /* values for priv_check */ + +struct nlmsghdr; +struct nlpcb; +struct nl_pstate; + +typedef int rtnl_msg_cb_f(struct nlmsghdr *hdr, struct nlpcb *nlp, + struct nl_pstate *npt); + +struct rtnl_cmd_handler { + int cmd; + const char *name; + rtnl_msg_cb_f *cb; + int priv; + int flags; +}; + +#define RTNL_F_NOEPOCH 0x01 + +bool rtnl_register_messages(const struct rtnl_cmd_handler *handlers, int count); + +/* route.c */ +struct rib_cmd_info; +void rtnl_handle_route_event(uint32_t fibnum, const struct rib_cmd_info *rc); +void rtnl_routes_init(void); + +/* neigh.c */ +void rtnl_neighs_init(void); +void rtnl_neighs_destroy(void); + +/* iface.c */ +struct nl_parsed_link { + char *ifla_group; + char *ifla_ifname; + char *ifla_cloner; + struct nlattr *ifla_idata; + unsigned short ifi_type; + int ifi_index; + uint32_t ifla_mtu; +}; + +typedef int rtnl_iface_create_f(struct nl_parsed_link *lattrs, struct nlpcb *nlp, + struct nl_pstate *npt); +typedef int rtnl_iface_modify_f(struct nl_parsed_link *lattrs, struct nlpcb *nlp, + struct nl_pstate *npt); + +struct nl_cloner { + const char *name; + rtnl_iface_create_f *create_f; + rtnl_iface_modify_f *modify_f; + SLIST_ENTRY(nl_cloner) next; +}; + +void rtnl_ifaces_init(void); +void rtnl_ifaces_destroy(void); +void rtnl_iface_add_cloner(struct nl_cloner *cloner); +void rtnl_iface_del_cloner(struct nl_cloner *cloner); + +/* iface_drivers.c */ +void rtnl_iface_drivers_register(void); + +/* nexthop.c */ +void rtnl_nexthops_init(void); +struct nhop_object *nl_find_nhop(uint32_t fibnum, int family, + uint32_t uidx, int nh_flags, int *perror); + + +#endif
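For readers of this patch, here is a minimal userland sketch (not part of the change itself) of how the rtmsg/rtattr layout above is meant to be driven: it opens a NETLINK_ROUTE socket and requests an RTM_GETROUTE dump of the IPv4 routes of fib 0. It assumes the AF_NETLINK socket support and the userland <netlink/netlink.h> / <netlink/netlink_route.h> headers (with their RTM_*/NLM_F_* compatibility names) provided by this series; the function name request_route_dump is purely illustrative.

#include <sys/socket.h>

#include <netlink/netlink.h>
#include <netlink/netlink_route.h>

#include <string.h>
#include <unistd.h>

/* Ask the kernel to dump the IPv4 routes of fib 0 (RT_TABLE_MAIN). */
static int
request_route_dump(void)
{
	struct {
		struct nlmsghdr hdr;
		struct rtmsg rtm;
	} req;
	int fd;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd == -1)
		return (-1);

	memset(&req, 0, sizeof(req));
	req.hdr.nlmsg_len = sizeof(req);
	req.hdr.nlmsg_type = RTM_GETROUTE;
	req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
	req.rtm.rtm_family = AF_INET;	/* rtm_table/RTA_TABLE left at 0: fib 0 */

	if (send(fd, &req, sizeof(req), 0) == -1) {
		close(fd);
		return (-1);
	}

	/*
	 * Replies arrive as NLM_F_MULTI RTM_NEWROUTE messages, each a
	 * struct rtmsg followed by RTA_* attributes, terminated by NLMSG_DONE.
	 */
	return (fd);
}

A real client would follow up with recv() on the returned descriptor and walk the reply buffer with the NLMSG_*/RTA_* iteration macros defined in the headers above.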