diff --git a/etc/mtree/BSD.include.dist b/etc/mtree/BSD.include.dist index 0f85798815d5..e7784cbb0a47 100644 --- a/etc/mtree/BSD.include.dist +++ b/etc/mtree/BSD.include.dist @@ -1,318 +1,316 @@ # $FreeBSD$ # # Please see the file src/etc/mtree/README before making changes to this file. # /set type=dir uname=root gname=wheel mode=0755 . arpa .. atf-c .. atf-c++ .. bsm .. bsnmp .. c++ v1 experimental .. ext .. tr1 .. .. .. cam ata .. mmc .. nvme .. scsi .. .. casper .. crypto .. dev acpica .. agp .. an .. ciss .. evdev .. filemon .. firewire .. hid .. hwpmc .. hyperv .. ic .. iicbus .. - if_wg - .. io .. mfi .. mlx5 .. mmc .. mpt mpilib .. .. nvme .. ofw .. pbio .. pci .. powermac_nvram .. ppbus .. pwm .. smbus .. speaker .. tcp_log .. usb .. veriexec .. vkbd .. wi .. .. devdctl .. edit readline .. .. fs cuse .. devfs .. fdescfs .. msdosfs .. nfs .. nullfs .. procfs .. smbfs .. udf .. unionfs .. .. gcc 4.2 .. .. geom cache .. concat .. eli .. gate .. journal .. label .. mirror .. mountver .. multipath .. nop .. raid .. raid3 .. shsec .. stripe .. virstor .. .. gssapi .. infiniband complib .. iba .. opensm .. vendor .. .. isofs cd9660 .. .. kadm5 .. krb5 .. lib80211 .. lib9p .. libipt .. libmilter .. libxo .. lzma .. machine pc .. .. net altq .. route .. .. net80211 .. netgraph atm .. bluetooth include .. .. netflow .. .. netinet cc .. netdump .. tcp_stacks .. .. netinet6 .. netipsec .. netnatm api .. msg .. saal .. sig .. .. netpfil pf .. .. netsmb .. nfs .. nfsclient .. nfsserver .. opencsd c_api .. etmv3 .. etmv4 .. ptm .. stm .. .. openssl .. pcap .. protocols .. rdma .. rpc .. rpcsvc .. security audit .. mac_biba .. mac_bsdextended .. mac_lomac .. mac_mls .. mac_partition .. mac_veriexec .. .. sys disk .. .. teken .. ufs ffs .. ufs .. .. vm .. xlocale .. .. diff --git a/include/Makefile b/include/Makefile index d47879e11c93..eebc7f0e121f 100644 --- a/include/Makefile +++ b/include/Makefile @@ -1,484 +1,479 @@ # @(#)Makefile 8.2 (Berkeley) 1/4/94 # $FreeBSD$ # # Doing a "make install" builds /usr/include. 
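# For example, assuming a checkout of the FreeBSD source tree at /usr/src
# (the path is an assumption, not part of this change), the workflow the
# comment above refers to is:
#
#	cd /usr/src/include && make install
#
# which repopulates /usr/include, now without the dev/if_wg headers pruned
# from the mtree manifest above.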
.include PACKAGE=runtime CLEANFILES= osreldate.h version SUBDIR= arpa protocols rpcsvc rpc xlocale SUBDIR_PARALLEL= INCS= a.out.h ar.h assert.h bitstring.h complex.h cpio.h _ctype.h ctype.h \ db.h \ dirent.h dlfcn.h elf.h elf-hints.h err.h fmtmsg.h fnmatch.h fstab.h \ fts.h ftw.h getopt.h glob.h grp.h \ ieeefp.h ifaddrs.h \ inttypes.h iso646.h kenv.h langinfo.h libgen.h limits.h link.h \ locale.h malloc.h malloc_np.h memory.h monetary.h mpool.h mqueue.h \ ndbm.h netconfig.h \ netdb.h nl_types.h nlist.h nss.h nsswitch.h paths.h \ printf.h proc_service.h pthread.h \ pthread_np.h pwd.h ranlib.h readpassphrase.h regex.h \ res_update.h resolv.h runetype.h search.h semaphore.h setjmp.h \ signal.h spawn.h stab.h stdalign.h stdbool.h stddef.h \ stdnoreturn.h stdio.h stdlib.h string.h stringlist.h \ strings.h sysexits.h tar.h termios.h tgmath.h \ time.h timeconv.h timers.h ttyent.h \ uchar.h ulimit.h unistd.h utime.h utmpx.h uuid.h varargs.h \ wchar.h wctype.h wordexp.h xlocale.h .PATH: ${SRCTOP}/contrib/libc-vis INCS+= vis.h MHDRS= float.h floatingpoint.h stdarg.h PHDRS= sched.h _semaphore.h LHDRS= aio.h errno.h fcntl.h linker_set.h poll.h stdatomic.h stdint.h \ syslog.h ucontext.h LDIRS= geom net net80211 netgraph netinet netinet6 \ netipsec netsmb nfs nfsclient nfsserver sys vm LSUBDIRS= dev/acpica dev/agp dev/an dev/ciss dev/filemon dev/firewire \ dev/hwpmc dev/hyperv \ - dev/ic dev/iicbus dev/if_wg dev/io dev/mfi dev/mmc dev/nvme \ + dev/ic dev/iicbus dev/io dev/mfi dev/mmc dev/nvme \ dev/ofw dev/pbio dev/pci ${_dev_powermac_nvram} dev/ppbus dev/pwm \ dev/smbus dev/speaker dev/tcp_log dev/veriexec dev/vkbd \ fs/devfs fs/fdescfs fs/msdosfs fs/nfs fs/nullfs \ fs/procfs fs/smbfs fs/udf fs/unionfs \ geom/cache geom/concat geom/eli geom/gate geom/journal geom/label \ geom/mirror geom/mountver geom/multipath geom/nop \ geom/raid geom/raid3 geom/shsec geom/stripe geom/virstor \ net/altq \ net/route \ netgraph/atm netgraph/netflow \ netinet/cc \ netinet/netdump \ netinet/tcp_stacks \ security/mac_biba security/mac_bsdextended security/mac_lomac \ security/mac_mls security/mac_partition \ security/mac_veriexec \ sys/disk \ ufs/ffs ufs/ufs LSUBSUBDIRS= dev/mpt/mpilib .PATH: ${SRCTOP}/sys/dev/acpica ACPICA= acpi_hpet.h \ acpiio.h ACPICADIR= ${INCLUDEDIR}/dev/acpica .PATH: ${SRCTOP}/sys/dev/agp AGP= agpreg.h AGPDIR= ${INCLUDEDIR}/dev/agp .PATH: ${SRCTOP}/sys/bsm BSM= audit.h \ audit_errno.h \ audit_internal.h \ audit_record.h \ audit_domain.h \ audit_fcntl.h \ audit_kevents.h \ audit_socket_type.h BSMPACKAGE= libbsm BSMDIR= ${INCLUDEDIR}/bsm .PATH: ${SRCTOP}/sys/security/audit SECAUDIT= audit.h \ audit_ioctl.h \ audit_private.h SECAUDITPACKAGE= libbsm SECAUDITDIR= ${INCLUDEDIR}/security/audit .PATH: ${SRCTOP}/sys/cam CAM= cam.h \ cam_ccb.h \ cam_compat.h \ cam_debug.h \ cam_iosched.h \ cam_periph.h \ cam_queue.h \ cam_sim.h \ cam_xpt.h \ cam_xpt_internal.h \ cam_xpt_periph.h \ cam_xpt_sim.h CAMDIR= ${INCLUDEDIR}/cam .PATH: ${SRCTOP}/sys/cam/ata CAMATA= ata_all.h CAMATADIR= ${INCLUDEDIR}/cam/ata .PATH: ${SRCTOP}/sys/cam/mmc CAMMMC= mmc.h \ mmc_bus.h \ mmc_all.h CAMMMCDIR= ${INCLUDEDIR}/cam/mmc .PATH: ${SRCTOP}/sys/cam/nvme CAMNVME= nvme_all.h CAMNVMEDIR= ${INCLUDEDIR}/cam/nvme .PATH: ${SRCTOP}/sys/cam/scsi CAMSCSI= scsi_all.h \ scsi_cd.h \ scsi_ch.h \ scsi_da.h \ scsi_enc.h \ scsi_enc_internal.h \ scsi_iu.h \ scsi_message.h \ scsi_pass.h \ scsi_pt.h \ scsi_sa.h \ scsi_ses.h \ scsi_sg.h \ scsi_targetio.h \ smp_all.h CAMSCSIDIR= ${INCLUDEDIR}/cam/scsi .PATH: ${SRCTOP}/sys/fs/cd9660 FS9660= cd9660_mount.h \ 
cd9660_node.h \ cd9660_rrip.h \ iso.h \ iso_rrip.h FS9660DIR= ${INCLUDEDIR}/isofs/cd9660 .PATH: ${SRCTOP}/sys/dev/evdev EVDEV= input.h \ input-event-codes.h \ uinput.h EVDEVDIR= ${INCLUDEDIR}/dev/evdev .PATH: ${SRCTOP}/sys/dev/hid HID= hid.h \ hidraw.h HIDDIR= ${INCLUDEDIR}/dev/hid .PATH: ${SRCTOP}/sys/dev/hyperv/include ${SRCTOP}/sys/dev/hyperv/utilities HYPERV= hv_snapshot.h \ hyperv.h HYPERVDIR= ${INCLUDEDIR}/dev/hyperv .PATH: ${SRCTOP}/sys/opencrypto OPENCRYPTO= cryptodev.h OPENCRYPTODIR= ${INCLUDEDIR}/crypto .PATH: ${SRCTOP}/sys/dev/pci PCI= pcireg.h PCIDIR= ${INCLUDEDIR}/dev/pci .PATH: ${SRCTOP}/sys/dev/veriexec VERIEXEC= veriexec_ioctl.h VERIEXECDIR= ${INCLUDEDIR}/dev/veriexec .PATH: ${SRCTOP}/sys/contrib/ipfilter/netinet IPFILTER= ip_auth.h \ ip_compat.h \ ip_dstlist.h \ ip_fil.h \ ip_frag.h \ ip_htable.h \ ip_lookup.h \ ip_nat.h \ ip_pool.h \ ip_proxy.h \ ip_rules.h \ ip_scan.h \ ip_state.h \ ip_sync.h \ ipf_rb.h \ ipl.h \ radix_ipf.h IPFILTERDIR= ${INCLUDEDIR}/netinet .PATH: ${SRCTOP}/sys/netpfil/pf PF= pf.h \ pf_altq.h \ pf_mtag.h PFPACKAGE= pf PFDIR= ${INCLUDEDIR}/netpfil/pf .PATH: ${SRCTOP}/sys/rpc RPC= rpcsec_tls.h \ types.h RPCDIR= ${INCLUDEDIR}/rpc .PATH: ${SRCTOP}/sys/teken TEKEN= teken.h TEKENDIR= ${INCLUDEDIR}/teken .PATH: ${SRCTOP}/sys/contrib/openzfs/include/sys NVPAIR= nvpair.h NVPAIRDIR= ${INCLUDEDIR}/sys .PATH: ${SRCTOP}/sys/dev/mlx5 MLX5= mlx5io.h MLX5DIR= ${INCLUDEDIR}/dev/mlx5 -.PATH: ${SRCTOP}/sys/dev/if_wg -WG= if_wg.h -WGDIR= ${INCLUDEDIR}/dev/if_wg - INCSGROUPS= INCS \ ACPICA \ AGP \ CAM \ CAMATA \ CAMMMC \ CAMNVME \ CAMSCSI \ CRYPTO \ EVDEV \ FS9660 \ HID \ HYPERV \ OPENCRYPTO \ PCI \ RPC \ TEKEN \ - VERIEXEC \ - WG + VERIEXEC .if ${MK_AUDIT} != "no" INCSGROUPS+= BSM INCSGROUPS+= SECAUDIT .endif .if ${MK_IPFILTER} != "no" INCSGROUPS+= IPFILTER .endif .if ${MK_PF} != "no" INCSGROUPS+= PF .endif .if ${MK_CDDL} != "no" INCSGROUPS+= NVPAIR .endif .if ${MK_MLX5TOOL} != "no" INCSGROUPS+= MLX5 .endif .if ${MK_BLUETOOTH} != "no" LSUBSUBDIRS+= netgraph/bluetooth/include .endif .if ${MK_CUSE} != "no" LSUBDIRS+= fs/cuse .endif .if ${MK_GSSAPI} != "no" SUBDIR+= gssapi INCS+= gssapi.h .endif .if ${MK_HESIOD} != "no" INCS+= hesiod.h .endif # Handle the #define aliases for libiconv .if ${MK_ICONV} == "yes" INCS+= iconv.h .endif .if ${MK_USB} != "no" LSUBDIRS+= dev/usb .endif .if ${MACHINE_CPUARCH} == "powerpc" && ${MACHINE_ARCH} != "powerpcspe" _dev_powermac_nvram= dev/powermac_nvram .endif # Define SHARED to indicate whether you want symbolic links to the system # source (``symlinks''), or a separate copy (``copies''). ``symlinks'' is # probably only useful for developers and should be avoided if you do not # wish to tie your /usr/include and /usr/src together. 
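# For example (a sketch: the SHARED knob and the installincludes target are
# defined in this file, while the /usr/src path is an assumption):
#
#	make -C /usr/src/include SHARED=symlinks installincludes
#
# installs /usr/include as symbolic links pointing back into the source
# tree, rather than as standalone copies.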
#SHARED= symlinks SHARED?= copies INCS+= osreldate.h SYSDIR= ${SRCTOP}/sys NEWVERS_SH= ${SYSDIR}/conf/newvers.sh PARAM_H= ${SYSDIR}/sys/param.h MK_OSRELDATE_SH= ${.CURDIR}/mk-osreldate.sh SYMLINKS+= ${LIBDIR:C,[^/]+,..,g:C,^/,,}${INCLUDEDIR} ${LIBDIR}/include osreldate.h: ${NEWVERS_SH} ${PARAM_H} ${MK_OSRELDATE_SH} env NEWVERS_SH=${NEWVERS_SH} PARAMFILE=${PARAM_H} SYSDIR=${SYSDIR} \ sh ${MK_OSRELDATE_SH} .for i in ${LHDRS} INCSLINKS+= sys/$i ${INCLUDEDIR}/$i .endfor .for i in ${MHDRS} INCSLINKS+= machine/$i ${INCLUDEDIR}/$i .endfor .for i in ${PHDRS} INCSLINKS+= sys/$i ${INCLUDEDIR}/$i .endfor .if ${MACHINE} != ${MACHINE_CPUARCH} _MARCHS= ${MACHINE_CPUARCH} .endif .if ${MACHINE_CPUARCH} == "i386" || ${MACHINE_CPUARCH} == "amd64" _MARCHS+= x86 .endif META_TARGETS+= compat stage_includes: ${SHARED} SDESTDIR= ${SYSROOT:U${DESTDIR}} # Take care of stale directory-level symlinks. # Note: The "|| true" after find is needed in case one of the directories does # not exist (yet). compat: cd ${SDESTDIR}${INCLUDEDIR}; find ${LDIRS} ${LSUBDIRS} machine ${_MARCHS} \ crypto -maxdepth 0 -mindepth 0 -type l -print -delete || true mtree -deU ${NO_ROOT:D-W} ${MTREE_FOLLOWS_SYMLINKS} \ -f ${SRCTOP}/etc/mtree/BSD.include.dist \ -p ${SDESTDIR}${INCLUDEDIR} > /dev/null copies: .PHONY .META cd ${SDESTDIR}${INCLUDEDIR}; find ${LDIRS} ${LSUBDIRS} ${LSUBSUBDIRS} crypto \ machine machine/pc ${_MARCHS} -maxdepth 1 -mindepth 1 -type l \ -name "*.h" -print -delete || true .for i in ${LDIRS} ${LSUBDIRS:Ndev/agp:Ndev/acpica:Ndev/evdev:Ndev/hid:Ndev/hyperv:Ndev/pci:Ndev/veriexec} ${LSUBSUBDIRS} cd ${SRCTOP}/sys; \ ${INSTALL} -C ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 444 $i/*.h \ ${SDESTDIR}${INCLUDEDIR}/$i .endfor cd ${SRCTOP}/sys/${MACHINE}/include; \ ${INSTALL} -C ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 444 *.h \ ${SDESTDIR}${INCLUDEDIR}/machine .if exists(${SRCTOP}/sys/${MACHINE}/include/pc) cd ${SRCTOP}/sys/${MACHINE}/include/pc; \ ${INSTALL} -C ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 444 *.h \ ${SDESTDIR}${INCLUDEDIR}/machine/pc .endif .for _MARCH in ${_MARCHS} .if exists(${SRCTOP}/sys/${_MARCH}/include) ${INSTALL} -d ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 755 \ ${SDESTDIR}${INCLUDEDIR}/${_MARCH}; \ cd ${SRCTOP}/sys/${_MARCH}/include; \ ${INSTALL} -C ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 444 *.h \ ${SDESTDIR}${INCLUDEDIR}/${_MARCH} .if exists(${SRCTOP}/sys/${_MARCH}/include/pc) ${INSTALL} -d ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 755 \ ${SDESTDIR}${INCLUDEDIR}/${_MARCH}/pc; \ cd ${SRCTOP}/sys/${_MARCH}/include/pc; \ ${INSTALL} -C ${TAG_ARGS} -o ${BINOWN} -g ${BINGRP} -m 444 *.h \ ${SDESTDIR}${INCLUDEDIR}/${_MARCH}/pc .endif .endif .endfor symlinks: .PHONY .META @${ECHO} "Setting up symlinks to kernel source tree..." 
.for i in ${LDIRS} cd ${SRCTOP}; ${INSTALL_SYMLINK} ${TAG_ARGS:D${TAG_ARGS},dev} \ $$(printf '../../../%s ' sys/$i/*.h) ${SDESTDIR}${INCLUDEDIR}/$i .endfor .for i in ${LSUBDIRS:Ndev/agp:Ndev/acpica:Ndev/evdev:Ndev/hid:Ndev/hyperv:Ndev/pci:Ndev/veriexec} cd ${SRCTOP}; ${INSTALL_SYMLINK} ${TAG_ARGS:D${TAG_ARGS},dev} \ $$(printf '../../../../%s ' sys/$i/*.h) ${SDESTDIR}${INCLUDEDIR}/$i .endfor ${INSTALL_SYMLINK} ${TAG_ARGS:D${TAG_ARGS},dev} \ $$(printf '../../../../sys/dev/acpica/%s ' acpiio.h acpi_hpet.h) \ ${SDESTDIR}${INCLUDEDIR}/dev/acpica; \ ${INSTALL_SYMLINK} ${TAG_ARGS:D${TAG_ARGS},dev} ../../../../sys/dev/agp/agpreg.h \ ${SDESTDIR}${INCLUDEDIR}/dev/agp; \ ${INSTALL_SYMLINK} ${TAG_ARGS:D${TAG_ARGS},dev} \ $$(printf '../../../../sys/dev/evdev/%s ' input.h input-event-codes.h uinput.h) \ ${SDESTDIR}${INCLUDEDIR}/dev/evdev; ${INSTALL_SYMLINK} ${TAG_ARGS:D${TAG_ARGS},dev} \ $$(printf '../../../../sys/dev/hid/%s ' hid.h hidraw.h) \ ${SDESTDIR}${INCLUDEDIR}/dev/hid; \ ${INSTALL_SYMLINK} ${TAG_ARGS:D${TAG_ARGS},dev} ../../../../sys/dev/hyperv/include/hyperv.h \ ${SDESTDIR}${INCLUDEDIR}/dev/hyperv; \ ${INSTALL_SYMLINK} ${TAG_ARGS:D${TAG_ARGS},dev} ../../../../sys/dev/hyperv/utilities/hv_snapshot.h \ ${SDESTDIR}${INCLUDEDIR}/dev/hyperv; \ ${INSTALL_SYMLINK} ${TAG_ARGS:D${TAG_ARGS},dev} ../../../../sys/dev/pci/pcireg.h \ ${SDESTDIR}${INCLUDEDIR}/dev/pci; \ ${INSTALL_SYMLINK} ${TAG_ARGS:D${TAG_ARGS},dev} ../../../../sys/dev/veriexec/veriexec_ioctl.h \ ${SDESTDIR}${INCLUDEDIR}/dev/veriexec; .for i in ${LSUBSUBDIRS} cd ${SRCTOP}; ${INSTALL_SYMLINK} ${TAG_ARGS:D${TAG_ARGS},dev} \ $$(printf '../../../../../%s ' sys/$i/*.h) ${SDESTDIR}${INCLUDEDIR}/$i .endfor .if ${MK_IPFILTER} != "no" cd ${SRCTOP}; ${INSTALL_SYMLINK} ${TAG_ARGS:D${TAG_ARGS},dev} \ $$(printf '../../../%s ' sys/contrib/ipfilter/netinet/*.h) \ ${SDESTDIR}${INCLUDEDIR}/netinet; .endif .if ${MK_PF} != "no" cd ${SRCTOP}; ${INSTALL_SYMLINK} ${TAG_ARGS:D${TAG_ARGS},dev} \ $$(printf '../../../../%s ' sys/netpfil/pf/*.h) \ ${SDESTDIR}${INCLUDEDIR}/netpfil/pf; .endif ${INSTALL_SYMLINK} ${TAG_ARGS:D${TAG_ARGS},dev} ../../../sys/crypto/rijndael/rijndael.h \ ${SDESTDIR}${INCLUDEDIR}/crypto; \ cd ${SRCTOP}; ${INSTALL_SYMLINK} ${TAG_ARGS:D${TAG_ARGS},dev} \ $$(printf '../../../%s ' sys/opencrypto/*.h) \ ${SDESTDIR}${INCLUDEDIR}/crypto; \ cd ${SRCTOP}; ${INSTALL_SYMLINK} ${TAG_ARGS:D${TAG_ARGS},dev} \ $$(printf '../../../%s ' sys/${MACHINE}/include/*.h) \ ${SDESTDIR}${INCLUDEDIR}/machine; .if exists(${SRCTOP}/sys/${MACHINE}/include/pc) cd ${SRCTOP}; ${INSTALL_SYMLINK} ${TAG_ARGS:D${TAG_ARGS},dev} \ $$(printf '../../../../%s ' sys/${MACHINE}/include/pc/*.h) \ ${SDESTDIR}${INCLUDEDIR}/machine/pc; .endif .for _MARCH in ${_MARCHS} .if exists(${SRCTOP}/sys/${_MARCH}/include) ${INSTALL} -d ${TAG_ARGS:D${TAG_ARGS},dev} -o ${BINOWN} -g ${BINGRP} -m 755 \ ${SDESTDIR}${INCLUDEDIR}/${_MARCH}; \ cd ${SRCTOP}; ${INSTALL_SYMLINK} ${TAG_ARGS:D${TAG_ARGS},dev} \ $$(printf '../../../%s ' sys/${_MARCH}/include/*.h) \ ${SDESTDIR}${INCLUDEDIR}/${_MARCH}; .if exists(${SRCTOP}/sys/${_MARCH}/include/pc) ${INSTALL} -d ${TAG_ARGS:D${TAG_ARGS},dev} -o ${BINOWN} -g ${BINGRP} -m 755 \ ${SDESTDIR}${INCLUDEDIR}/${_MARCH}/pc; \ cd ${SRCTOP}; ${INSTALL_SYMLINK} ${TAG_ARGS:D${TAG_ARGS},dev} \ $$(printf '../../../../%s ' sys/${_MARCH}/include/pc/*.h) \ ${SDESTDIR}${INCLUDEDIR}/${_MARCH}/pc; .endif .endif .endfor cd ${SRCTOP}; ${INSTALL_SYMLINK} ${TAG_ARGS:D${TAG_ARGS},dev} \ $$(printf '../../../../%s ' sys/fs/cd9660/*.h) \ ${SDESTDIR}${INCLUDEDIR}/isofs/cd9660; \ cd ${SRCTOP}; 
${INSTALL_SYMLINK} ${TAG_ARGS:D${TAG_ARGS},dev} \ $$(printf '../../../sys/rpc/%s ' rpcsec_tls.h types.h) \ ${SDESTDIR}${INCLUDEDIR}/rpc; cd ${SRCTOP}/sys/rpc; .if ${MK_CDDL} != "no" ${INSTALL_SYMLINK} ${TAG_ARGS:D${TAG_ARGS},dev} \ ../../../sys/contrib/openzfs/include/sys/nvpair.h \ ${SDESTDIR}${INCLUDEDIR}/sys .endif .if ${MK_MLX5TOOL} != "no" ${INSTALL_SYMLINK} ${TAG_ARGS:D${TAG_ARGS},dev} ../../../../sys/dev/mlx5/mlx5io.h \ ${SDESTDIR}${INCLUDEDIR}/dev/mlx5 .endif .include installincludes: ${SHARED} ${SHARED}: compat .if ${MACHINE} == "host" && !defined(_SKIP_BUILD) # we're here because we are building a sysroot... # we need MACHINE et al set correctly HOST_MACHINE!= uname -m HOST_MACHINE_ARCH!= uname -p MACHINE:= ${HOST_MACHINE} MACHINE_ARCH:= ${HOST_MACHINE_ARCH} .endif diff --git a/sbin/ifconfig/Makefile b/sbin/ifconfig/Makefile index 61cb8ab933fd..b178dc0c7e6a 100644 --- a/sbin/ifconfig/Makefile +++ b/sbin/ifconfig/Makefile @@ -1,82 +1,81 @@ # From: @(#)Makefile 8.1 (Berkeley) 6/5/93 # $FreeBSD$ .include PACKAGE=runtime PROG= ifconfig SRCS= ifconfig.c # base support # # NB: The order here defines the order in which the constructors # are called. This in turn defines the default order in which # status is displayed. Probably should add a priority mechanism # to the registration process so we don't depend on this aspect # of the toolchain. # SRCS+= af_link.c # LLC support .if ${MK_INET_SUPPORT} != "no" SRCS+= af_inet.c # IPv4 support .endif .if ${MK_INET6_SUPPORT} != "no" SRCS+= af_inet6.c # IPv6 support .endif .if ${MK_INET6_SUPPORT} != "no" SRCS+= af_nd6.c # ND6 support .endif SRCS+= ifclone.c # clone device support SRCS+= ifmac.c # MAC support SRCS+= ifmedia.c # SIOC[GS]IFMEDIA support SRCS+= iffib.c # non-default FIB support SRCS+= ifvlan.c # SIOC[GS]ETVLAN support SRCS+= ifvxlan.c # VXLAN support SRCS+= ifgre.c # GRE keys etc SRCS+= ifgif.c # GIF reversed header workaround SRCS+= ifipsec.c # IPsec VTI -SRCS+= ifwg.c # Wireguard SRCS+= sfp.c # SFP/SFP+ information LIBADD+= ifconfig m util CFLAGS+= -I${SRCTOP}/lib/libifconfig -I${OBJTOP}/lib/libifconfig .if ${MK_WIRELESS_SUPPORT} != "no" SRCS+= ifieee80211.c # SIOC[GS]IEEE80211 support LIBADD+= 80211 .endif SRCS+= carp.c # SIOC[GS]VH support SRCS+= ifgroup.c # ... .if ${MK_PF} != "no" SRCS+= ifpfsync.c # pfsync(4) support .endif SRCS+= ifbridge.c # bridge support SRCS+= iflagg.c # lagg support .if ${MK_EXPERIMENTAL} != "no" CFLAGS+= -DDRAFT_IETF_6MAN_IPV6ONLY_FLAG CFLAGS+= -DEXPERIMENTAL .endif .if ${MK_INET6_SUPPORT} != "no" CFLAGS+= -DINET6 .endif .if ${MK_INET_SUPPORT} != "no" CFLAGS+= -DINET .endif .if ${MK_JAIL} != "no" && !defined(RESCUE) CFLAGS+= -DJAIL LIBADD+= jail .endif LIBADD+= nv MAN= ifconfig.8 CFLAGS+= -Wall -Wmissing-prototypes -Wcast-qual -Wwrite-strings -Wnested-externs WARNS?= 2 HAS_TESTS= SUBDIR.${MK_TESTS}+= tests .include diff --git a/sbin/ifconfig/ifwg.c b/sbin/ifconfig/ifwg.c deleted file mode 100644 index a102f392cf80..000000000000 --- a/sbin/ifconfig/ifwg.c +++ /dev/null @@ -1,731 +0,0 @@ -/*- - * SPDX-License-Identifier: BSD-2-Clause - * - * Copyright (c) 2020 Rubicon Communications, LLC (Netgate) - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include -__FBSDID("$FreeBSD$"); - -#ifndef RESCUE -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include /* NB: for offsetof */ -#include -#include -#include - -#include "ifconfig.h" - -static void wgfinish(int s, void *arg); - -static bool wgfinish_registered; - -static int allowed_ips_count; -static int allowed_ips_max; -static nvlist_t **allowed_ips, *nvl_peer; - -#define ALLOWEDIPS_START 16 -#define WG_KEY_SIZE_BASE64 ((((WG_KEY_SIZE) + 2) / 3) * 4 + 1) -#define WG_KEY_SIZE_HEX (WG_KEY_SIZE * 2 + 1) -#define WG_MAX_STRLEN 64 - -struct allowedip { - union { - struct in_addr ip4; - struct in6_addr ip6; - }; -}; - -static void -register_wgfinish(void) -{ - - if (wgfinish_registered) - return; - callback_register(wgfinish, NULL); - wgfinish_registered = true; -} - -static nvlist_t * -nvl_device(void) -{ - static nvlist_t *_nvl_device; - - if (_nvl_device == NULL) - _nvl_device = nvlist_create(0); - register_wgfinish(); - return (_nvl_device); -} - -static bool -key_from_base64(uint8_t key[static WG_KEY_SIZE], const char *base64) -{ - - if (strlen(base64) != WG_KEY_SIZE_BASE64 - 1) { - warnx("bad key len - need %d got %zu\n", WG_KEY_SIZE_BASE64 - 1, strlen(base64)); - return false; - } - if (base64[WG_KEY_SIZE_BASE64 - 2] != '=') { - warnx("bad key terminator, expected '=' got '%c'", base64[WG_KEY_SIZE_BASE64 - 2]); - return false; - } - return (b64_pton(base64, key, WG_KEY_SIZE)); -} - -static void -parse_endpoint(const char *endpoint_) -{ - int err; - char *base, *endpoint, *port, *colon, *tmp; - struct addrinfo hints, *res; - - endpoint = base = strdup(endpoint_); - colon = rindex(endpoint, ':'); - if (colon == NULL) - errx(1, "bad endpoint format %s - no port delimiter found", endpoint); - *colon = '\0'; - port = colon + 1; - - /* [::]:<> */ - if (endpoint[0] == '[') { - endpoint++; - tmp = index(endpoint, ']'); - if (tmp == NULL) - errx(1, "bad endpoint format %s - '[' found with no matching ']'", endpoint); - *tmp = '\0'; - } - bzero(&hints, sizeof(hints)); - hints.ai_family = AF_UNSPEC; - err = getaddrinfo(endpoint, port, &hints, &res); - if (err) - errx(1, "%s", gai_strerror(err)); - nvlist_add_binary(nvl_peer, "endpoint", res->ai_addr, res->ai_addrlen); - freeaddrinfo(res); - free(base); -} - -static void -in_len2mask(struct in_addr *mask, u_int len) -{ - u_int i; - 
u_char *p; - - p = (u_char *)mask; - memset(mask, 0, sizeof(*mask)); - for (i = 0; i < len / NBBY; i++) - p[i] = 0xff; - if (len % NBBY) - p[i] = (0xff00 >> (len % NBBY)) & 0xff; -} - -static u_int -in_mask2len(struct in_addr *mask) -{ - u_int x, y; - u_char *p; - - p = (u_char *)mask; - for (x = 0; x < sizeof(*mask); x++) { - if (p[x] != 0xff) - break; - } - y = 0; - if (x < sizeof(*mask)) { - for (y = 0; y < NBBY; y++) { - if ((p[x] & (0x80 >> y)) == 0) - break; - } - } - return x * NBBY + y; -} - -static void -in6_prefixlen2mask(struct in6_addr *maskp, int len) -{ - static const u_char maskarray[NBBY] = {0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; - int bytelen, bitlen, i; - - /* sanity check */ - if (len < 0 || len > 128) { - errx(1, "in6_prefixlen2mask: invalid prefix length(%d)\n", - len); - return; - } - - memset(maskp, 0, sizeof(*maskp)); - bytelen = len / NBBY; - bitlen = len % NBBY; - for (i = 0; i < bytelen; i++) - maskp->s6_addr[i] = 0xff; - if (bitlen) - maskp->s6_addr[bytelen] = maskarray[bitlen - 1]; -} - -static int -in6_mask2len(struct in6_addr *mask, u_char *lim0) -{ - int x = 0, y; - u_char *lim = lim0, *p; - - /* ignore the scope_id part */ - if (lim0 == NULL || lim0 - (u_char *)mask > sizeof(*mask)) - lim = (u_char *)mask + sizeof(*mask); - for (p = (u_char *)mask; p < lim; x++, p++) { - if (*p != 0xff) - break; - } - y = 0; - if (p < lim) { - for (y = 0; y < NBBY; y++) { - if ((*p & (0x80 >> y)) == 0) - break; - } - } - - /* - * when the limit pointer is given, do a stricter check on the - * remaining bits. - */ - if (p < lim) { - if (y != 0 && (*p & (0x00ff >> y)) != 0) - return -1; - for (p = p + 1; p < lim; p++) - if (*p != 0) - return -1; - } - - return x * NBBY + y; -} - -static bool -parse_ip(struct allowedip *aip, uint16_t *family, const char *value) -{ - struct addrinfo hints, *res; - int err; - bool ret; - - ret = true; - bzero(aip, sizeof(*aip)); - bzero(&hints, sizeof(hints)); - hints.ai_family = AF_UNSPEC; - hints.ai_flags = AI_NUMERICHOST; - err = getaddrinfo(value, NULL, &hints, &res); - if (err) - errx(1, "%s", gai_strerror(err)); - - *family = res->ai_family; - if (res->ai_family == AF_INET) { - struct sockaddr_in *sin = (struct sockaddr_in *)res->ai_addr; - - aip->ip4 = sin->sin_addr; - } else if (res->ai_family == AF_INET6) { - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)res->ai_addr; - - aip->ip6 = sin6->sin6_addr; - } else { - ret = false; - } - - freeaddrinfo(res); - return (ret); -} - -static void -sa_ntop(const struct sockaddr *sa, char *buf, int *port) -{ - const struct sockaddr_in *sin; - const struct sockaddr_in6 *sin6; - int err; - - err = getnameinfo(sa, sa->sa_len, buf, INET6_ADDRSTRLEN, NULL, - 0, NI_NUMERICHOST); - - if (sa->sa_family == AF_INET) { - sin = (const struct sockaddr_in *)sa; - if (port) - *port = sin->sin_port; - } else if (sa->sa_family == AF_INET6) { - sin6 = (const struct sockaddr_in6 *)sa; - if (port) - *port = sin6->sin6_port; - } - - if (err) - errx(1, "%s", gai_strerror(err)); -} - -static void -dump_peer(const nvlist_t *nvl_peer_cfg) -{ - const void *key; - const struct sockaddr *endpoint; - char outbuf[WG_MAX_STRLEN]; - char addr_buf[INET6_ADDRSTRLEN]; - size_t aip_count, size; - int port; - uint16_t persistent_keepalive; - const nvlist_t * const *nvl_aips; - - printf("[Peer]\n"); - if (nvlist_exists_binary(nvl_peer_cfg, "public-key")) { - key = nvlist_get_binary(nvl_peer_cfg, "public-key", &size); - b64_ntop((const uint8_t *)key, size, outbuf, WG_MAX_STRLEN); - printf("PublicKey = %s\n", outbuf); - } - 
if (nvlist_exists_binary(nvl_peer_cfg, "preshared-key")) { - key = nvlist_get_binary(nvl_peer_cfg, "preshared-key", &size); - b64_ntop((const uint8_t *)key, size, outbuf, WG_MAX_STRLEN); - printf("PresharedKey = %s\n", outbuf); - } - if (nvlist_exists_binary(nvl_peer_cfg, "endpoint")) { - endpoint = nvlist_get_binary(nvl_peer_cfg, "endpoint", &size); - sa_ntop(endpoint, addr_buf, &port); - printf("Endpoint = %s:%d\n", addr_buf, ntohs(port)); - } - if (nvlist_exists_number(nvl_peer_cfg, - "persistent-keepalive-interval")) { - persistent_keepalive = nvlist_get_number(nvl_peer_cfg, - "persistent-keepalive-interval"); - printf("PersistentKeepalive = %d\n", persistent_keepalive); - } - if (!nvlist_exists_nvlist_array(nvl_peer_cfg, "allowed-ips")) - return; - - nvl_aips = nvlist_get_nvlist_array(nvl_peer_cfg, "allowed-ips", &aip_count); - if (nvl_aips == NULL || aip_count == 0) - return; - - printf("AllowedIPs = "); - for (size_t i = 0; i < aip_count; i++) { - uint8_t cidr; - struct sockaddr_storage ss; - sa_family_t family; - - if (!nvlist_exists_number(nvl_aips[i], "cidr")) - continue; - cidr = nvlist_get_number(nvl_aips[i], "cidr"); - if (nvlist_exists_binary(nvl_aips[i], "ipv4")) { - struct sockaddr_in *sin = (struct sockaddr_in *)&ss; - const struct in_addr *ip4; - - ip4 = nvlist_get_binary(nvl_aips[i], "ipv4", &size); - if (ip4 == NULL || cidr > 32) - continue; - sin->sin_len = sizeof(*sin); - sin->sin_family = AF_INET; - sin->sin_addr = *ip4; - } else if (nvlist_exists_binary(nvl_aips[i], "ipv6")) { - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&ss; - const struct in6_addr *ip6; - - ip6 = nvlist_get_binary(nvl_aips[i], "ipv6", &size); - if (ip6 == NULL || cidr > 128) - continue; - sin6->sin6_len = sizeof(*sin6); - sin6->sin6_family = AF_INET6; - sin6->sin6_addr = *ip6; - } else { - continue; - } - - family = ss.ss_family; - getnameinfo((struct sockaddr *)&ss, ss.ss_len, addr_buf, - INET6_ADDRSTRLEN, NULL, 0, NI_NUMERICHOST); - printf("%s/%d", addr_buf, cidr); - if (i < aip_count - 1) - printf(", "); - } - printf("\n"); -} - -static int -get_nvl_out_size(int sock, u_long op, size_t *size) -{ - struct wg_data_io wgd; - int err; - - memset(&wgd, 0, sizeof(wgd)); - - strlcpy(wgd.wgd_name, name, sizeof(wgd.wgd_name)); - wgd.wgd_size = 0; - wgd.wgd_data = NULL; - - err = ioctl(sock, op, &wgd); - if (err) - return (err); - *size = wgd.wgd_size; - return (0); -} - -static int -do_cmd(int sock, u_long op, void *arg, size_t argsize, int set) -{ - struct wg_data_io wgd; - - memset(&wgd, 0, sizeof(wgd)); - - strlcpy(wgd.wgd_name, name, sizeof(wgd.wgd_name)); - wgd.wgd_size = argsize; - wgd.wgd_data = arg; - - return (ioctl(sock, op, &wgd)); -} - -static -DECL_CMD_FUNC(peerlist, val, d) -{ - size_t size, peercount; - void *packed; - const nvlist_t *nvl; - const nvlist_t *const *nvl_peerlist; - - if (get_nvl_out_size(s, SIOCGWG, &size)) - errx(1, "can't get peer list size"); - if ((packed = malloc(size)) == NULL) - errx(1, "malloc failed for peer list"); - if (do_cmd(s, SIOCGWG, packed, size, 0)) - errx(1, "failed to obtain peer list"); - - nvl = nvlist_unpack(packed, size, 0); - if (!nvlist_exists_nvlist_array(nvl, "peers")) - return; - nvl_peerlist = nvlist_get_nvlist_array(nvl, "peers", &peercount); - - for (int i = 0; i < peercount; i++, nvl_peerlist++) { - dump_peer(*nvl_peerlist); - } -} - -static void -wgfinish(int s, void *arg) -{ - void *packed; - size_t size; - static nvlist_t *nvl_dev; - - nvl_dev = nvl_device(); - if (nvl_peer != NULL) { - if (!nvlist_exists_binary(nvl_peer, 
"public-key")) - errx(1, "must specify a public-key for adding peer"); - if (allowed_ips_count != 0) { - nvlist_add_nvlist_array(nvl_peer, "allowed-ips", - (const nvlist_t * const *)allowed_ips, - allowed_ips_count); - for (size_t i = 0; i < allowed_ips_count; i++) { - nvlist_destroy(allowed_ips[i]); - } - - free(allowed_ips); - } - - nvlist_add_nvlist_array(nvl_dev, "peers", - (const nvlist_t * const *)&nvl_peer, 1); - } - - packed = nvlist_pack(nvl_dev, &size); - - if (do_cmd(s, SIOCSWG, packed, size, true)) - errx(1, "failed to configure"); -} - -static -DECL_CMD_FUNC(peerstart, val, d) -{ - - if (nvl_peer != NULL) - errx(1, "cannot both add and remove a peer"); - register_wgfinish(); - nvl_peer = nvlist_create(0); - allowed_ips = calloc(ALLOWEDIPS_START, sizeof(*allowed_ips)); - allowed_ips_max = ALLOWEDIPS_START; - if (allowed_ips == NULL) - errx(1, "failed to allocate array for allowedips"); -} - -static -DECL_CMD_FUNC(peerdel, val, d) -{ - - if (nvl_peer != NULL) - errx(1, "cannot both add and remove a peer"); - register_wgfinish(); - nvl_peer = nvlist_create(0); - nvlist_add_bool(nvl_peer, "remove", true); -} - -static -DECL_CMD_FUNC(setwglistenport, val, d) -{ - struct addrinfo hints, *res; - const struct sockaddr_in *sin; - const struct sockaddr_in6 *sin6; - - u_long ul; - int err; - - bzero(&hints, sizeof(hints)); - hints.ai_family = AF_UNSPEC; - hints.ai_flags = AI_NUMERICHOST; - err = getaddrinfo(NULL, val, &hints, &res); - if (err) - errx(1, "%s", gai_strerror(err)); - - if (res->ai_family == AF_INET) { - sin = (struct sockaddr_in *)res->ai_addr; - ul = sin->sin_port; - } else if (res->ai_family == AF_INET6) { - sin6 = (struct sockaddr_in6 *)res->ai_addr; - ul = sin6->sin6_port; - } else { - errx(1, "unknown family"); - } - ul = ntohs((u_short)ul); - nvlist_add_number(nvl_device(), "listen-port", ul); -} - -static -DECL_CMD_FUNC(setwgprivkey, val, d) -{ - uint8_t key[WG_KEY_SIZE]; - - if (!key_from_base64(key, val)) - errx(1, "invalid key %s", val); - nvlist_add_binary(nvl_device(), "private-key", key, WG_KEY_SIZE); -} - -static -DECL_CMD_FUNC(setwgpubkey, val, d) -{ - uint8_t key[WG_KEY_SIZE]; - - if (nvl_peer == NULL) - errx(1, "setting public key only valid when adding peer"); - - if (!key_from_base64(key, val)) - errx(1, "invalid key %s", val); - nvlist_add_binary(nvl_peer, "public-key", key, WG_KEY_SIZE); -} - -static -DECL_CMD_FUNC(setwgpresharedkey, val, d) -{ - uint8_t key[WG_KEY_SIZE]; - - if (nvl_peer == NULL) - errx(1, "setting preshared-key only valid when adding peer"); - - if (!key_from_base64(key, val)) - errx(1, "invalid key %s", val); - nvlist_add_binary(nvl_peer, "preshared-key", key, WG_KEY_SIZE); -} - - -static -DECL_CMD_FUNC(setwgpersistentkeepalive, val, d) -{ - unsigned long persistent_keepalive; - char *endp; - - if (nvl_peer == NULL) - errx(1, "setting persistent keepalive only valid when adding peer"); - - errno = 0; - persistent_keepalive = strtoul(val, &endp, 0); - if (errno != 0 || *endp != '\0') - errx(1, "persistent-keepalive must be numeric (seconds)"); - if (persistent_keepalive > USHRT_MAX) - errx(1, "persistent-keepalive '%lu' too large", - persistent_keepalive); - nvlist_add_number(nvl_peer, "persistent-keepalive-interval", - persistent_keepalive); -} - -static -DECL_CMD_FUNC(setallowedips, val, d) -{ - char *base, *allowedip, *mask; - u_long ul; - char *endp; - struct allowedip aip; - nvlist_t *nvl_aip; - uint16_t family; - - if (nvl_peer == NULL) - errx(1, "setting allowed ip only valid when adding peer"); - if (allowed_ips_count == 
allowed_ips_max) { - allowed_ips_max *= 2; - allowed_ips = reallocarray(allowed_ips, allowed_ips_max, - sizeof(*allowed_ips)); - if (allowed_ips == NULL) - errx(1, "failed to grow allowed ip array"); - } - - allowed_ips[allowed_ips_count] = nvl_aip = nvlist_create(0); - if (nvl_aip == NULL) - errx(1, "failed to create new allowedip nvlist"); - - base = allowedip = strdup(val); - mask = index(allowedip, '/'); - if (mask == NULL) - errx(1, "mask separator not found in allowedip %s", val); - *mask = '\0'; - mask++; - - parse_ip(&aip, &family, allowedip); - ul = strtoul(mask, &endp, 0); - if (*endp != '\0') - errx(1, "invalid value for allowedip mask"); - - nvlist_add_number(nvl_aip, "cidr", ul); - if (family == AF_INET) { - nvlist_add_binary(nvl_aip, "ipv4", &aip.ip4, sizeof(aip.ip4)); - } else if (family == AF_INET6) { - nvlist_add_binary(nvl_aip, "ipv6", &aip.ip6, sizeof(aip.ip6)); - } else { - /* Shouldn't happen */ - nvlist_destroy(nvl_aip); - goto out; - } - - allowed_ips_count++; - -out: - free(base); -} - -static -DECL_CMD_FUNC(setendpoint, val, d) -{ - if (nvl_peer == NULL) - errx(1, "setting endpoint only valid when adding peer"); - parse_endpoint(val); -} - -static void -wireguard_status(int s) -{ - size_t size; - void *packed; - nvlist_t *nvl; - char buf[WG_KEY_SIZE_BASE64]; - const void *key; - uint16_t listen_port; - - if (get_nvl_out_size(s, SIOCGWG, &size)) - return; - if ((packed = malloc(size)) == NULL) - return; - if (do_cmd(s, SIOCGWG, packed, size, 0)) - return; - nvl = nvlist_unpack(packed, size, 0); - if (nvlist_exists_number(nvl, "listen-port")) { - listen_port = nvlist_get_number(nvl, "listen-port"); - printf("\tlisten-port: %d\n", listen_port); - } - if (nvlist_exists_binary(nvl, "private-key")) { - key = nvlist_get_binary(nvl, "private-key", &size); - b64_ntop((const uint8_t *)key, size, buf, WG_MAX_STRLEN); - printf("\tprivate-key: %s\n", buf); - } - if (nvlist_exists_binary(nvl, "public-key")) { - key = nvlist_get_binary(nvl, "public-key", &size); - b64_ntop((const uint8_t *)key, size, buf, WG_MAX_STRLEN); - printf("\tpublic-key: %s\n", buf); - } -} - -static struct cmd wireguard_cmds[] = { - DEF_CMD_ARG("listen-port", setwglistenport), - DEF_CMD_ARG("private-key", setwgprivkey), - /* XXX peer-list is deprecated. 
*/ - DEF_CMD("peer-list", 0, peerlist), - DEF_CMD("peers", 0, peerlist), - DEF_CMD("peer", 0, peerstart), - DEF_CMD("-peer", 0, peerdel), - DEF_CMD_ARG("preshared-key", setwgpresharedkey), - DEF_CMD_ARG("public-key", setwgpubkey), - DEF_CMD_ARG("persistent-keepalive", setwgpersistentkeepalive), - DEF_CMD_ARG("allowed-ips", setallowedips), - DEF_CMD_ARG("endpoint", setendpoint), -}; - -static struct afswtch af_wireguard = { - .af_name = "af_wireguard", - .af_af = AF_UNSPEC, - .af_other_status = wireguard_status, -}; - -static void -wg_create(int s, struct ifreq *ifr) -{ - - setproctitle("ifconfig %s create ...\n", name); - - ifr->ifr_data = NULL; - if (ioctl(s, SIOCIFCREATE, ifr) < 0) - err(1, "SIOCIFCREATE"); -} - -static __constructor void -wireguard_ctor(void) -{ - int i; - - for (i = 0; i < nitems(wireguard_cmds); i++) - cmd_register(&wireguard_cmds[i]); - af_register(&af_wireguard); - clone_setdefcallback_prefix("wg", wg_create); -} - -#endif diff --git a/share/man/man4/Makefile b/share/man/man4/Makefile index 54fd89fe7590..c29bb80c7a58 100644 --- a/share/man/man4/Makefile +++ b/share/man/man4/Makefile @@ -1,1044 +1,1042 @@ # @(#)Makefile 8.1 (Berkeley) 6/18/93 # $FreeBSD$ .include MAN= aac.4 \ aacraid.4 \ acpi.4 \ ${_acpi_asus.4} \ ${_acpi_asus_wmi.4} \ ${_acpi_dock.4} \ ${_acpi_fujitsu.4} \ ${_acpi_hp.4} \ ${_acpi_ibm.4} \ ${_acpi_panasonic.4} \ ${_acpi_rapidstart.4} \ ${_acpi_sony.4} \ acpi_thermal.4 \ acpi_battery.4 \ ${_acpi_toshiba.4} \ acpi_video.4 \ ${_acpi_wmi.4} \ ada.4 \ adm6996fc.4 \ ads111x.4 \ ae.4 \ ${_aesni.4} \ age.4 \ agp.4 \ ahc.4 \ ahci.4 \ ahd.4 \ ${_aibs.4} \ aio.4 \ alc.4 \ ale.4 \ alpm.4 \ altera_atse.4 \ altera_avgen.4 \ altera_jtag_uart.4 \ altera_sdcard.4 \ altq.4 \ amdpm.4 \ ${_amdsbwd.4} \ ${_amdsmb.4} \ ${_amdsmn.4} \ ${_amdtemp.4} \ ${_bxe.4} \ amr.4 \ an.4 \ ${_aout.4} \ ${_apic.4} \ arcmsr.4 \ ${_asmc.4} \ at45d.4 \ ata.4 \ ath.4 \ ath_ahb.4 \ ath_hal.4 \ ath_pci.4 \ atkbd.4 \ atkbdc.4 \ atp.4 \ ${_atf_test_case.4} \ ${_atrtc.4} \ ${_attimer.4} \ audit.4 \ auditpipe.4 \ aue.4 \ axe.4 \ axge.4 \ axp.4 \ bce.4 \ bcma.4 \ bfe.4 \ bge.4 \ ${_bhyve.4} \ bhnd.4 \ bhnd_chipc.4 \ bhnd_pmu.4 \ bhndb.4 \ bhndb_pci.4 \ blackhole.4 \ bnxt.4 \ bpf.4 \ bridge.4 \ bt.4 \ bwi.4 \ bwn.4 \ ${_bytgpio.4} \ capsicum.4 \ cardbus.4 \ carp.4 \ cas.4 \ cc_cdg.4 \ cc_chd.4 \ cc_cubic.4 \ cc_dctcp.4 \ cc_hd.4 \ cc_htcp.4 \ cc_newreno.4 \ cc_vegas.4 \ ${_ccd.4} \ ccr.4 \ cd.4 \ cdce.4 \ cdceem.4 \ cfi.4 \ cfumass.4 \ ${_cgem.4} \ ch.4 \ chromebook_platform.4 \ ${_chvgpio.4} \ ciss.4 \ cloudabi.4 \ ${_coretemp.4} \ cp2112.4 \ ${_cpuctl.4} \ cpufreq.4 \ crypto.4 \ ctl.4 \ cue.4 \ cxgb.4 \ cxgbe.4 \ cxgbev.4 \ cyapa.4 \ da.4 \ dc.4 \ dcons.4 \ dcons_crom.4 \ ddb.4 \ devctl.4 \ disc.4 \ disk.4 \ divert.4 \ ${_dpms.4} \ ds1307.4 \ ds3231.4 \ ${_dtrace_provs} \ dummynet.4 \ edsc.4 \ ehci.4 \ em.4 \ ena.4 \ enc.4 \ epair.4 \ esp.4 \ est.4 \ et.4 \ etherswitch.4 \ eventtimers.4 \ exca.4 \ e6060sw.4 \ fd.4 \ fdc.4 \ fdt.4 \ fdt_pinctrl.4 \ fdtbus.4 \ ffclock.4 \ filemon.4 \ firewire.4 \ ${_ftwd.4} \ full.4 \ fwe.4 \ fwip.4 \ fwohci.4 \ fxp.4 \ gbde.4 \ gdb.4 \ gem.4 \ geom.4 \ geom_linux_lvm.4 \ geom_map.4 \ geom_uzip.4 \ gif.4 \ gpio.4 \ gpioiic.4 \ gpiokeys.4 \ gpioled.4 \ gpioths.4 \ gre.4 \ h_ertt.4 \ hconf.4 \ hcons.4 \ hgame.4 \ hidbus.4 \ hidquirk.4 \ hidraw.4 \ hifn.4 \ hkbd.4 \ hms.4 \ hmt.4 \ hpen.4 \ hpet.4 \ ${_hpt27xx.4} \ ${_hptiop.4} \ ${_hptmv.4} \ ${_hptnr.4} \ ${_hptrr.4} \ hsctrl.4 \ htu21.4 \ ${_hv_kvp.4} \ ${_hv_netvsc.4} \ ${_hv_storvsc.4} \ ${_hv_utils.4} \ 
${_hv_vmbus.4} \ ${_hv_vss.4} \ hwpmc.4 \ ${_hwpstate_intel.4} \ iavf.4 \ ichsmb.4 \ ${_ichwd.4} \ icmp.4 \ icmp6.4 \ ida.4 \ if_ipsec.4 \ iflib.4 \ ifmib.4 \ ig4.4 \ igmp.4 \ iic.4 \ iic_gpiomux.4 \ iicbb.4 \ iicbus.4 \ iichid.4 \ iicmux.4 \ iicsmb.4 \ iir.4 \ ${_imcsmb.4} \ inet.4 \ inet6.4 \ intpm.4 \ intro.4 \ ${_io.4} \ ${_ioat.4} \ ip.4 \ ip6.4 \ ipfirewall.4 \ ipheth.4 \ ${_ipmi.4} \ ips.4 \ ipsec.4 \ ipw.4 \ ipwfw.4 \ isci.4 \ isl.4 \ ismt.4 \ isp.4 \ ispfw.4 \ ${_itwd.4} \ iwi.4 \ iwifw.4 \ iwm.4 \ iwmfw.4 \ iwn.4 \ iwnfw.4 \ ixgbe.4 \ ixl.4 \ jedec_dimm.4 \ jme.4 \ kbdmux.4 \ kcov.4 \ keyboard.4 \ kld.4 \ ksyms.4 \ ksz8995ma.4 \ ktls.4 \ ktr.4 \ kue.4 \ lagg.4 \ le.4 \ led.4 \ lge.4 \ ${_linux.4} \ liquidio.4 \ lm75.4 \ lo.4 \ lp.4 \ lpbb.4 \ lpt.4 \ ltc430x.4 \ mac.4 \ mac_biba.4 \ mac_bsdextended.4 \ mac_ifoff.4 \ mac_lomac.4 \ mac_mls.4 \ mac_none.4 \ mac_ntpd.4 \ mac_partition.4 \ mac_portacl.4 \ mac_seeotheruids.4 \ mac_stub.4 \ mac_test.4 \ malo.4 \ md.4 \ mdio.4 \ me.4 \ mem.4 \ meteor.4 \ mfi.4 \ miibus.4 \ mld.4 \ mlx.4 \ mlx4en.4 \ mlx5en.4 \ mly.4 \ mmc.4 \ mmcsd.4 \ mn.4 \ mod_cc.4 \ mos.4 \ mouse.4 \ mpr.4 \ mps.4 \ mpt.4 \ mrsas.4 \ msk.4 \ mtio.4 \ multicast.4 \ muge.4 \ mvs.4 \ mwl.4 \ mwlfw.4 \ mx25l.4 \ mxge.4 \ my.4 \ net80211.4 \ netdump.4 \ netfpga10g_nf10bmac.4 \ netgdb.4 \ netgraph.4 \ netintro.4 \ netmap.4 \ ${_nfe.4} \ ${_nfsmb.4} \ ng_async.4 \ ngatmbase.4 \ ng_atmllc.4 \ ng_bpf.4 \ ng_bridge.4 \ ng_btsocket.4 \ ng_car.4 \ ng_ccatm.4 \ ng_checksum.4 \ ng_cisco.4 \ ng_deflate.4 \ ng_device.4 \ nge.4 \ ng_echo.4 \ ng_eiface.4 \ ng_etf.4 \ ng_ether.4 \ ng_ether_echo.4 \ ng_frame_relay.4 \ ng_gif.4 \ ng_gif_demux.4 \ ng_h4.4 \ ng_hci.4 \ ng_hole.4 \ ng_hub.4 \ ng_iface.4 \ ng_ipfw.4 \ ng_ip_input.4 \ ng_ksocket.4 \ ng_l2cap.4 \ ng_l2tp.4 \ ng_lmi.4 \ ng_macfilter.4 \ ng_mppc.4 \ ng_nat.4 \ ng_netflow.4 \ ng_one2many.4 \ ng_patch.4 \ ng_pipe.4 \ ng_ppp.4 \ ng_pppoe.4 \ ng_pptpgre.4 \ ng_pred1.4 \ ng_rfc1490.4 \ ng_socket.4 \ ng_source.4 \ ng_split.4 \ ng_sppp.4 \ ng_sscfu.4 \ ng_sscop.4 \ ng_tag.4 \ ng_tcpmss.4 \ ng_tee.4 \ ng_tty.4 \ ng_ubt.4 \ ng_UI.4 \ ng_uni.4 \ ng_vjc.4 \ ng_vlan.4 \ ng_vlan_rotate.4 \ nmdm.4 \ ${_ntb.4} \ ${_ntb_hw_amd.4} \ ${_ntb_hw_intel.4} \ ${_ntb_hw_plx.4} \ ${_ntb_transport.4} \ ${_nda.4} \ ${_if_ntb.4} \ null.4 \ numa.4 \ ${_nvd.4} \ ${_nvdimm.4} \ ${_nvme.4} \ ${_nvram.4} \ ${_nvram2env.4} \ oce.4 \ ocs_fc.4\ ohci.4 \ openfirm.4 \ orm.4 \ ${_ossl.4} \ ow.4 \ ow_temp.4 \ owc.4 \ ${_padlock.4} \ pass.4 \ pccard.4 \ pccbb.4 \ pcf.4 \ ${_pchtherm.4} \ pci.4 \ pcib.4 \ pcic.4 \ pcm.4 \ ${_pf.4} \ ${_pflog.4} \ ${_pfsync.4} \ pim.4 \ pms.4 \ polling.4 \ ppbus.4 \ ppc.4 \ ppi.4 \ procdesc.4 \ proto.4 \ ps4dshock.4 \ psm.4 \ pst.4 \ pt.4 \ ptnet.4 \ pts.4 \ pty.4 \ puc.4 \ pwmc.4 \ ${_qat.4} \ ${_qlxge.4} \ ${_qlxgb.4} \ ${_qlxgbe.4} \ ${_qlnxe.4} \ ral.4 \ random.4 \ rctl.4 \ re.4 \ rgephy.4 \ rights.4 \ rl.4 \ rndtest.4 \ route.4 \ rtsx.4 \ rtwn.4 \ rtwnfw.4 \ rtwn_pci.4 \ rue.4 \ sa.4 \ safe.4 \ safexcel.4 \ sbp.4 \ sbp_targ.4 \ scc.4 \ sched_4bsd.4 \ sched_ule.4 \ screen.4 \ scsi.4 \ sctp.4 \ sdhci.4 \ sem.4 \ send.4 \ ses.4 \ ${_sfxge.4} \ sge.4 \ siba.4 \ siftr.4 \ siis.4 \ simplebus.4 \ sis.4 \ sk.4 \ ${_smartpqi.4} \ smb.4 \ smbios.4 \ smbus.4 \ smp.4 \ smsc.4 \ snd_ad1816.4 \ snd_als4000.4 \ snd_atiixp.4 \ snd_cmi.4 \ snd_cs4281.4 \ snd_csa.4 \ snd_ds1.4 \ snd_emu10k1.4 \ snd_emu10kx.4 \ snd_envy24.4 \ snd_envy24ht.4 \ snd_es137x.4 \ snd_ess.4 \ snd_fm801.4 \ snd_gusc.4 \ snd_hda.4 \ snd_hdspe.4 \ snd_ich.4 \ snd_maestro3.4 \ 
snd_maestro.4 \ snd_mss.4 \ snd_neomagic.4 \ snd_sbc.4 \ snd_solo.4 \ snd_spicds.4 \ snd_t4dwave.4 \ snd_uaudio.4 \ snd_via8233.4 \ snd_via82c686.4 \ snd_vibes.4 \ sndstat.4 \ snp.4 \ spigen.4 \ ${_spkr.4} \ splash.4 \ sppp.4 \ ste.4 \ stf.4 \ stge.4 \ ${_sume.4} \ ${_superio.4} \ sym.4 \ syncache.4 \ syncer.4 \ syscons.4 \ sysmouse.4 \ tap.4 \ targ.4 \ tcp.4 \ tcp_bbr.4 \ tdfx.4 \ terasic_mtl.4 \ termios.4 \ textdump.4 \ ti.4 \ timecounters.4 \ ${_tpm.4} \ tty.4 \ tun.4 \ twa.4 \ twe.4 \ tws.4 \ udp.4 \ udplite.4 \ ure.4 \ vale.4 \ vga.4 \ vge.4 \ viapm.4 \ ${_viawd.4} \ virtio.4 \ virtio_balloon.4 \ virtio_blk.4 \ virtio_console.4 \ virtio_random.4 \ virtio_scsi.4 \ ${_vmci.4} \ vkbd.4 \ vlan.4 \ vxlan.4 \ ${_vmd.4} \ ${_vmm.4} \ ${_vmx.4} \ vr.4 \ vt.4 \ vte.4 \ vtnet.4 \ watchdog.4 \ ${_wbwd.4} \ - wg.4 \ witness.4 \ wlan.4 \ wlan_acl.4 \ wlan_amrr.4 \ wlan_ccmp.4 \ wlan_tkip.4 \ wlan_wep.4 \ wlan_xauth.4 \ wmt.4 \ ${_wpi.4} \ wsp.4 \ xb360gp.4 \ ${_xen.4} \ xhci.4 \ xl.4 \ ${_xnb.4} \ xpt.4 \ zero.4 MLINKS= ads111x.4 ads1013.4 \ ads111x.4 ads1014.4 \ ads111x.4 ads1015.4 \ ads111x.4 ads1113.4 \ ads111x.4 ads1114.4 \ ads111x.4 ads1115.4 MLINKS+=ae.4 if_ae.4 MLINKS+=age.4 if_age.4 MLINKS+=agp.4 agpgart.4 MLINKS+=alc.4 if_alc.4 MLINKS+=ale.4 if_ale.4 MLINKS+=altera_atse.4 atse.4 MLINKS+=altera_sdcard.4 altera_sdcardc.4 MLINKS+=altq.4 ALTQ.4 MLINKS+=ath.4 if_ath.4 MLINKS+=ath_pci.4 if_ath_pci.4 MLINKS+=an.4 if_an.4 MLINKS+=aue.4 if_aue.4 MLINKS+=axe.4 if_axe.4 MLINKS+=bce.4 if_bce.4 MLINKS+=bfe.4 if_bfe.4 MLINKS+=bge.4 if_bge.4 MLINKS+=bnxt.4 if_bnxt.4 MLINKS+=bridge.4 if_bridge.4 MLINKS+=bwi.4 if_bwi.4 MLINKS+=bwn.4 if_bwn.4 MLINKS+=${_bxe.4} ${_if_bxe.4} MLINKS+=cas.4 if_cas.4 MLINKS+=cdce.4 if_cdce.4 MLINKS+=cfi.4 cfid.4 MLINKS+=cloudabi.4 cloudabi32.4 \ cloudabi.4 cloudabi64.4 MLINKS+=crypto.4 cryptodev.4 MLINKS+=cue.4 if_cue.4 MLINKS+=cxgb.4 if_cxgb.4 MLINKS+=cxgbe.4 if_cxgbe.4 \ cxgbe.4 vcxgbe.4 \ cxgbe.4 if_vcxgbe.4 \ cxgbe.4 cxl.4 \ cxgbe.4 if_cxl.4 \ cxgbe.4 vcxl.4 \ cxgbe.4 if_vcxl.4 \ cxgbe.4 cc.4 \ cxgbe.4 if_cc.4 \ cxgbe.4 vcc.4 \ cxgbe.4 if_vcc.4 MLINKS+=cxgbev.4 if_cxgbev.4 \ cxgbev.4 cxlv.4 \ cxgbev.4 if_cxlv.4 \ cxgbev.4 ccv.4 \ cxgbev.4 if_ccv.4 MLINKS+=dc.4 if_dc.4 MLINKS+=disc.4 if_disc.4 MLINKS+=edsc.4 if_edsc.4 MLINKS+=em.4 if_em.4 \ em.4 igb.4 \ em.4 if_igb.4 MLINKS+=enc.4 if_enc.4 MLINKS+=epair.4 if_epair.4 MLINKS+=et.4 if_et.4 MLINKS+=fd.4 stderr.4 \ fd.4 stdin.4 \ fd.4 stdout.4 MLINKS+=fdt.4 FDT.4 MLINKS+=firewire.4 ieee1394.4 MLINKS+=fwe.4 if_fwe.4 MLINKS+=fwip.4 if_fwip.4 MLINKS+=fxp.4 if_fxp.4 MLINKS+=gem.4 if_gem.4 MLINKS+=geom.4 GEOM.4 MLINKS+=gif.4 if_gif.4 MLINKS+=gpio.4 gpiobus.4 MLINKS+=gpioths.4 dht11.4 MLINKS+=gpioths.4 dht22.4 MLINKS+=gre.4 if_gre.4 MLINKS+=hpet.4 acpi_hpet.4 MLINKS+=${_hptrr.4} ${_rr232x.4} MLINKS+=${_attimer.4} ${_i8254.4} MLINKS+=ip.4 rawip.4 MLINKS+=ipfirewall.4 ipaccounting.4 \ ipfirewall.4 ipacct.4 \ ipfirewall.4 ipfw.4 MLINKS+=ipheth.4 if_ipheth.4 MLINKS+=ipw.4 if_ipw.4 MLINKS+=iwi.4 if_iwi.4 MLINKS+=iwm.4 if_iwm.4 MLINKS+=iwn.4 if_iwn.4 MLINKS+=ixgbe.4 ix.4 MLINKS+=ixgbe.4 if_ix.4 MLINKS+=ixgbe.4 if_ixgbe.4 MLINKS+=ixl.4 if_ixl.4 MLINKS+=iavf.4 if_iavf.4 MLINKS+=jme.4 if_jme.4 MLINKS+=kue.4 if_kue.4 MLINKS+=lagg.4 trunk.4 MLINKS+=lagg.4 if_lagg.4 MLINKS+=le.4 if_le.4 MLINKS+=lge.4 if_lge.4 MLINKS+=lo.4 loop.4 MLINKS+=lp.4 plip.4 MLINKS+=malo.4 if_malo.4 MLINKS+=md.4 vn.4 MLINKS+=mem.4 kmem.4 MLINKS+=mfi.4 mfi_linux.4 \ mfi.4 mfip.4 MLINKS+=mlx5en.4 mce.4 MLINKS+=mn.4 if_mn.4 MLINKS+=mos.4 if_mos.4 MLINKS+=msk.4 if_msk.4 
MLINKS+=mwl.4 if_mwl.4 MLINKS+=mxge.4 if_mxge.4 MLINKS+=my.4 if_my.4 MLINKS+=netfpga10g_nf10bmac.4 if_nf10bmac.4 MLINKS+=netintro.4 net.4 \ netintro.4 networking.4 MLINKS+=${_nfe.4} ${_if_nfe.4} MLINKS+=nge.4 if_nge.4 MLINKS+=openfirm.4 openfirmware.4 MLINKS+=ow.4 onewire.4 MLINKS+=pccbb.4 cbb.4 MLINKS+=pcm.4 snd.4 \ pcm.4 sound.4 MLINKS+=pms.4 pmspcv.4 MLINKS+=ptnet.4 if_ptnet.4 MLINKS+=ral.4 if_ral.4 MLINKS+=re.4 if_re.4 MLINKS+=rl.4 if_rl.4 MLINKS+=rtwn_pci.4 if_rtwn_pci.4 MLINKS+=rue.4 if_rue.4 MLINKS+=scsi.4 CAM.4 \ scsi.4 cam.4 \ scsi.4 scbus.4 \ scsi.4 SCSI.4 MLINKS+=sge.4 if_sge.4 MLINKS+=sis.4 if_sis.4 MLINKS+=sk.4 if_sk.4 MLINKS+=smp.4 SMP.4 MLINKS+=smsc.4 if_smsc.4 MLINKS+=snd_envy24.4 snd_ak452x.4 MLINKS+=snd_sbc.4 snd_sb16.4 \ snd_sbc.4 snd_sb8.4 MLINKS+=${_spkr.4} ${_speaker.4} MLINKS+=splash.4 screensaver.4 MLINKS+=ste.4 if_ste.4 MLINKS+=stf.4 if_stf.4 MLINKS+=stge.4 if_stge.4 MLINKS+=syncache.4 syncookies.4 MLINKS+=syscons.4 sc.4 MLINKS+=tap.4 if_tap.4 \ tap.4 vmnet.4 \ tap.4 if_vmnet.4 MLINKS+=tdfx.4 tdfx_linux.4 MLINKS+=ti.4 if_ti.4 MLINKS+=tun.4 if_tun.4 MLINKS+=ure.4 if_ure.4 MLINKS+=vge.4 if_vge.4 MLINKS+=vlan.4 if_vlan.4 MLINKS+=vxlan.4 if_vxlan.4 MLINKS+=${_vmx.4} ${_if_vmx.4} MLINKS+=vr.4 if_vr.4 MLINKS+=vte.4 if_vte.4 MLINKS+=vtnet.4 if_vtnet.4 MLINKS+=watchdog.4 SW_WATCHDOG.4 -MLINKS+=wg.4 if_wg.4 MLINKS+=${_wpi.4} ${_if_wpi.4} MLINKS+=xl.4 if_xl.4 .if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386" _acpi_asus.4= acpi_asus.4 _acpi_asus_wmi.4= acpi_asus_wmi.4 _acpi_dock.4= acpi_dock.4 _acpi_fujitsu.4=acpi_fujitsu.4 _acpi_hp.4= acpi_hp.4 _acpi_ibm.4= acpi_ibm.4 _acpi_panasonic.4=acpi_panasonic.4 _acpi_rapidstart.4=acpi_rapidstart.4 _acpi_sony.4= acpi_sony.4 _acpi_toshiba.4=acpi_toshiba.4 _acpi_wmi.4= acpi_wmi.4 _aesni.4= aesni.4 _aout.4= aout.4 _apic.4= apic.4 _atrtc.4= atrtc.4 _attimer.4= attimer.4 _aibs.4= aibs.4 _amdsbwd.4= amdsbwd.4 _amdsmb.4= amdsmb.4 _amdsmn.4= amdsmn.4 _amdtemp.4= amdtemp.4 _asmc.4= asmc.4 _bxe.4= bxe.4 _bytgpio.4= bytgpio.4 _chvgpio.4= chvgpio.4 _coretemp.4= coretemp.4 _cpuctl.4= cpuctl.4 _dpms.4= dpms.4 _ftwd.4= ftwd.4 _hpt27xx.4= hpt27xx.4 _hptiop.4= hptiop.4 _hptmv.4= hptmv.4 _hptnr.4= hptnr.4 _hptrr.4= hptrr.4 _hv_kvp.4= hv_kvp.4 _hv_netvsc.4= hv_netvsc.4 _hv_storvsc.4= hv_storvsc.4 _hv_utils.4= hv_utils.4 _hv_vmbus.4= hv_vmbus.4 _hv_vss.4= hv_vss.4 _hwpstate_intel.4= hwpstate_intel.4 _i8254.4= i8254.4 _ichwd.4= ichwd.4 _if_bxe.4= if_bxe.4 _if_nfe.4= if_nfe.4 _if_urtw.4= if_urtw.4 _if_vmx.4= if_vmx.4 _if_wpi.4= if_wpi.4 _imcsmb.4= imcsmb.4 _ipmi.4= ipmi.4 _io.4= io.4 _itwd.4= itwd.4 _linux.4= linux.4 _nda.4= nda.4 _nfe.4= nfe.4 _nfsmb.4= nfsmb.4 _if_ntb.4= if_ntb.4 _ntb.4= ntb.4 _ntb_hw_amd.4= ntb_hw_amd.4 _ntb_hw_intel.4= ntb_hw_intel.4 _ntb_hw_plx.4= ntb_hw_plx.4 _ntb_transport.4=ntb_transport.4 _nvd.4= nvd.4 _nvme.4= nvme.4 _nvram.4= nvram.4 _ossl.4= ossl.4 _padlock.4= padlock.4 _pchtherm.4= pchtherm.4 _qat.4= qat.4 _rr232x.4= rr232x.4 _speaker.4= speaker.4 _spkr.4= spkr.4 _superio.4= superio.4 _tpm.4= tpm.4 _urtw.4= urtw.4 _viawd.4= viawd.4 _vmci.4= vmci.4 _vmx.4= vmx.4 _wbwd.4= wbwd.4 _wpi.4= wpi.4 _xen.4= xen.4 _xnb.4= xnb.4 .endif .if ${MACHINE_CPUARCH} == "amd64" _ioat.4= ioat.4 _nvdimm.4= nvdimm.4 _qlxge.4= qlxge.4 _qlxgb.4= qlxgb.4 _qlxgbe.4= qlxgbe.4 _qlnxe.4= qlnxe.4 _sfxge.4= sfxge.4 _smartpqi.4= smartpqi.4 _sume.4= sume.4 _vmd.4= vmd.4 MLINKS+=qlxge.4 if_qlxge.4 MLINKS+=qlxgb.4 if_qlxgb.4 MLINKS+=qlxgbe.4 if_qlxgbe.4 MLINKS+=qlnxe.4 if_qlnxe.4 MLINKS+=sfxge.4 if_sfxge.4 MLINKS+=sume.4 if_sume.4 .if ${MK_BHYVE} 
!= "no" _bhyve.4= bhyve.4 _vmm.4= vmm.4 .endif .endif .if ${MACHINE_CPUARCH} == "mips" _nvram2env.4= nvram2env.4 .endif .if ${MACHINE_CPUARCH} == "powerpc" _nvd.4= nvd.4 _nvme.4= nvme.4 .endif .if ${MACHINE_CPUARCH} == "arm" || ${MACHINE_CPUARCH} == "aarch64" || \ ${MACHINE_CPUARCH} == "riscv" _cgem.4= cgem.4 MLINKS+=cgem.4 if_cgem.4 .endif .if empty(MAN_ARCH) __arches= ${MACHINE} ${MACHINE_ARCH} ${MACHINE_CPUARCH} .elif ${MAN_ARCH} == "all" __arches= ${:!/bin/sh -c "/bin/ls -d ${.CURDIR}/man4.*"!:E} .else __arches= ${MAN_ARCH} .endif .for __arch in ${__arches:O:u} .if exists(${.CURDIR}/man4.${__arch}) SUBDIR+= man4.${__arch} .endif .endfor .if ${MK_BLUETOOTH} != "no" MAN+= ng_bluetooth.4 .endif .if ${MK_CCD} != "no" _ccd.4= ccd.4 .endif .if ${MK_CDDL} != "no" _dtrace_provs= dtrace_audit.4 \ dtrace_io.4 \ dtrace_ip.4 \ dtrace_lockstat.4 \ dtrace_proc.4 \ dtrace_sched.4 \ dtrace_sctp.4 \ dtrace_tcp.4 \ dtrace_udp.4 \ dtrace_udplite.4 MLINKS+= dtrace_audit.4 dtaudit.4 .endif .if ${MK_EFI} != "no" MAN+= efidev.4 MLINKS+= efidev.4 efirtc.4 .endif .if ${MK_ISCSI} != "no" MAN+= cfiscsi.4 MAN+= iscsi.4 MAN+= iscsi_initiator.4 MAN+= iser.4 .endif .if ${MK_OFED} != "no" MAN+= mlx4ib.4 MAN+= mlx5ib.4 .endif .if ${MK_MLX5TOOL} != "no" MAN+= mlx5io.4 .endif .if ${MK_TESTS} != "no" ATF= ${SRCTOP}/contrib/atf .PATH: ${ATF}/doc _atf_test_case.4= atf-test-case.4 .endif .if ${MK_PF} != "no" _pf.4= pf.4 _pflog.4= pflog.4 _pfsync.4= pfsync.4 .endif .if ${MK_USB} != "no" MAN+= \ otus.4 \ otusfw.4 \ rsu.4 \ rsufw.4 \ rtwn_usb.4 \ rum.4 \ run.4 \ runfw.4 \ u3g.4 \ uark.4 \ uart.4 \ uath.4 \ ubsa.4 \ ubser.4 \ ubtbcmfw.4 \ uchcom.4 \ ucom.4 \ ucycom.4 \ udav.4 \ udbp.4 \ udl.4 \ uep.4 \ ufoma.4 \ uftdi.4 \ ugen.4 \ ugold.4 \ uhci.4 \ uhid.4 \ uhso.4 \ uipaq.4 \ ukbd.4 \ uled.4 \ ulpt.4 \ umass.4 \ umcs.4 \ umct.4 \ umodem.4 \ umoscom.4 \ ums.4 \ unix.4 \ upgt.4 \ uplcom.4 \ ural.4 \ urio.4 \ urndis.4 \ ${_urtw.4} \ usb.4 \ usb_quirk.4 \ usb_template.4 \ usbhid.4 \ usfs.4 \ uslcom.4 \ uvisor.4 \ uvscom.4 \ zyd.4 MLINKS+=otus.4 if_otus.4 MLINKS+=rsu.4 if_rsu.4 MLINKS+=rtwn_usb.4 if_rtwn_usb.4 MLINKS+=rum.4 if_rum.4 MLINKS+=run.4 if_run.4 MLINKS+=u3g.4 u3gstub.4 MLINKS+=uath.4 if_uath.4 MLINKS+=udav.4 if_udav.4 MLINKS+=upgt.4 if_upgt.4 MLINKS+=ural.4 if_ural.4 MLINKS+=urndis.4 if_urndis.4 MLINKS+=${_urtw.4} ${_if_urtw.4} MLINKS+=zyd.4 if_zyd.4 .endif .include diff --git a/share/man/man4/wg.4 b/share/man/man4/wg.4 deleted file mode 100644 index 29215bd438ff..000000000000 --- a/share/man/man4/wg.4 +++ /dev/null @@ -1,259 +0,0 @@ -.\" Copyright (c) 2020 Gordon Bergling -.\" -.\" Redistribution and use in source and binary forms, with or without -.\" modification, are permitted provided that the following conditions -.\" are met: -.\" 1. Redistributions of source code must retain the above copyright -.\" notice, this list of conditions and the following disclaimer. -.\" 2. Redistributions in binary form must reproduce the above copyright -.\" notice, this list of conditions and the following disclaimer in the -.\" documentation and/or other materials provided with the distribution. -.\" -.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND -.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -.\" ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE -.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -.\" SUCH DAMAGE. -.\" -.\" $FreeBSD$ -.\" -.Dd March 12, 2021 -.Dt WG 4 -.Os -.Sh NAME -.Nm wg -.Nd "WireGuard pseudo-device" -.Sh SYNOPSIS -To load the driver as a module at boot time, place the following line in -.Xr loader.conf 5 : -.Bd -literal -offset indent -if_wg_load="YES" -.Ed -.Sh DESCRIPTION -The -.Nm -driver provides Virtual Private Network (VPN) interfaces for the secure -exchange of layer 3 traffic with other WireGuard peers using the WireGuard -protocol. -.Pp -A -.Nm -interface recognises one or more peers, establishes a secure tunnel with -each on demand, and tracks each peer's UDP endpoint for exchanging encrypted -traffic. -.Pp -The interfaces can be created at runtime using the -.Ic ifconfig Cm wg Ns Ar N Cm create -command. -The interface itself can be configured with -.Xr ifconfig 8 . -.Pp -The following parameters are available: -.Bl -tag -width indent -.It Cm listen-port -The listening port of the -.Nm -interface. -.It Cm public-key -The public key of the -.Nm -interface. -.It Cm private-key -The private key of the -.Nm -interface. -.It Cm preshared-key -Defines a pre-shared key for the -.Nm -interface. -.It Cm allowed-ips -A list of allowed IP addresses. -.It Cm endpoint -The IP address and port of the WireGuard peer to connect to. -.It Cm peers -A list of peering IP addresses to connect to. -.It Cm persistent-keepalive-interval -Interval, in seconds, at which to send persistent keepalive packets. -.El -.Pp -The -.Nm -interfaces support the following -.Xr ioctl 2 Ns s : -.Bl -tag -width Ds -offset indent -.It Dv SIOCSWG Fa "struct wg_data_io *" -Set the device configuration. -.It Dv SIOCGWG Fa "struct wg_data_io *" -Get the device configuration. -.El -.Pp -The following glossary provides a brief overview of WireGuard -terminology: -.Bl -tag -width indent -offset 3n -.It Peer -Peers exchange IPv4 or IPv6 traffic over secure tunnels. -Each -.Nm -interface may be configured to recognise one or more peers. -.It Key -Each peer uses its private key and corresponding public key to -identify itself to others. -A peer configures a -.Nm -interface with its own private key and with the public keys of its peers. -.It Pre-shared key -In addition to the public keys, each peer pair may be configured with a -unique pre-shared symmetric key. -This is used in their handshake to guard against future compromise of the -peers' encrypted tunnel if a quantum-computational attack on their -Diffie-Hellman exchange becomes feasible. -It is optional, but recommended. -.It Allowed IPs -A single -.Nm -interface may maintain concurrent tunnels connecting diverse networks. -The interface therefore implements rudimentary routing and reverse-path -filtering functions for its tunneled traffic. -These functions reference a set of allowed IP ranges configured against -each peer. -.Pp -The interface will route outbound tunneled traffic to the peer configured -with the most specific matching allowed IP address range, or drop it -if no such match exists.
-.Pp -The interface will accept tunneled traffic only from the peer -configured with the most specific matching allowed IP address range -for the incoming traffic, or drop it if no such match exists. -That is, tunneled traffic routed to a given peer cannot return through -another peer of the same -.Nm -interface. -This ensures that peers cannot spoof one another's traffic. -.It Handshake -Two peers handshake to mutually authenticate each other and to -establish a shared series of secret ephemeral encryption keys. -Any peer may initiate a handshake. -Handshakes occur only when there is traffic to send, and recur every -two minutes during transfers. -.It Connectionless -Due to the handshake behavior, there is no connected or disconnected -state. -.El -.Ss Keys -Private keys for WireGuard can be generated from any sufficiently -secure random source. -The Curve25519 keys and the pre-shared keys are both 32 bytes -long and are commonly encoded in base64 for ease of use. -.Pp -Keys can be generated with -.Xr openssl 1 -as follows: -.Pp -.Dl $ openssl rand -base64 32 -.Pp -Although a valid Curve25519 key must have 5 bits set to -specific values, this is done by the interface and so it -will accept any random 32-byte base64 string. -.Pp -When an interface has a private key set with -.Cm private-key , -the corresponding -public key is shown in the status output of the interface: -.Bd -literal -offset indent -# ifconfig wg0 | grep public-key - public-key: 7lWtsDdqaGB3EY9WNxRN3hVaHMtu1zXw71+bOjNOVUw= -.Ed -.Sh EXAMPLES -Create a -.Nm -interface and set a random private key. -.Bd -literal -offset indent -# ifconfig wg0 create listen-port 54321 private-key `openssl rand -base64 32` -.Ed -.Pp -Retrieve the associated public key from a -.Nm -interface. -.Bd -literal -offset indent -$ ifconfig wg0 | awk '/public-key/ { print $2 }' -.Ed -.Pp -Connect to a specific endpoint using its public key and set the allowed IP address. -.Bd -literal -offset indent -# ifconfig wg0 peer public-key '7lWtsDdqaGB3EY9WNxRN3hVaHMtu1zXw71+bOjNOVUw=' endpoint 10.0.1.100:54321 allowed-ips 192.168.2.100/32 -.Ed -.Pp -Remove a peer. -.Bd -literal -offset indent -# ifconfig wg0 -peer public-key '7lWtsDdqaGB3EY9WNxRN3hVaHMtu1zXw71+bOjNOVUw=' -.Ed -.Sh DIAGNOSTICS -The -.Nm -interface supports runtime debugging, which can be enabled with: -.Pp -.D1 Ic ifconfig Cm wg Ns Ar N Cm debug -.Pp -Some common error messages include: -.Bl -diag -.It "Handshake for peer X did not complete after 5 seconds, retrying" -Peer X did not reply to our initiation packet, for example because: -.Bl -bullet -.It -The peer does not have the local interface configured as a peer. -Peers must be able to mutually authenticate each other. -.It -The peer endpoint IP address is incorrectly configured. -.It -There are firewall rules preventing communication between hosts. -.El -.It "Invalid handshake initiation" -The incoming handshake packet could not be processed. -This is likely due to the local interface not containing -the correct public key for the peer. -.It "Invalid initiation MAC" -The incoming handshake initiation packet had an invalid MAC. -This is likely because the initiation sender has the wrong public key -for the handshake receiver. -.It "Packet has unallowed src IP from peer X" -After decryption, an incoming data packet has a source IP address that -is not assigned to the allowed IPs of Peer X.
-.El -.Sh SEE ALSO -.Xr inet 4 , -.Xr ip 4 , -.Xr netintro 4 , -.Xr ipf 5 , -.Xr pf.conf 5 , -.Xr ifconfig 8 , -.Xr ipfw 8 -.Rs -.%T WireGuard whitepaper -.%U https://www.wireguard.com/papers/wireguard.pdf -.Re -.Sh HISTORY -The -.Nm -device driver first appeared in -.Fx 13.0 . -.Sh AUTHORS -The -.Nm -device driver was written by -.An Jason A. Donenfeld Aq Mt Jason@zx2c4.com , -.An Matt Dunwoodie Aq Mt ncon@nconroy.net , -and -.An Kyle Evans Aq Mt kevans@FreeBSD.org . -.Pp -This manual page was written by -.An Gordon Bergling Aq Mt gbe@FreeBSD.org -and is based on the -.Ox -manual page written by -.An David Gwynne Aq Mt dlg@openbsd.org . diff --git a/sys/dev/if_wg/crypto.c b/sys/dev/if_wg/crypto.c deleted file mode 100644 index f28585429272..000000000000 --- a/sys/dev/if_wg/crypto.c +++ /dev/null @@ -1,1705 +0,0 @@ -/* - * Copyright (C) 2015-2021 Jason A. Donenfeld . All Rights Reserved. - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#include -#include -#include - -#include "crypto.h" - -#ifndef ARRAY_SIZE -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) -#endif -#ifndef noinline -#define noinline __attribute__((noinline)) -#endif -#ifndef __aligned -#define __aligned(x) __attribute__((aligned(x))) -#endif -#ifndef DIV_ROUND_UP -#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) -#endif - -#define le32_to_cpup(a) le32toh(*(a)) -#define le64_to_cpup(a) le64toh(*(a)) -#define cpu_to_le32(a) htole32(a) -#define cpu_to_le64(a) htole64(a) - -static inline uint32_t get_unaligned_le32(const uint8_t *a) -{ - uint32_t l; - __builtin_memcpy(&l, a, sizeof(l)); - return le32_to_cpup(&l); -} -static inline uint64_t get_unaligned_le64(const uint8_t *a) -{ - uint64_t l; - __builtin_memcpy(&l, a, sizeof(l)); - return le64_to_cpup(&l); -} -static inline void put_unaligned_le32(uint32_t s, uint8_t *d) -{ - uint32_t l = cpu_to_le32(s); - __builtin_memcpy(d, &l, sizeof(l)); -} -static inline void cpu_to_le32_array(uint32_t *buf, unsigned int words) -{ - while (words--) { - *buf = cpu_to_le32(*buf); - ++buf; - } -} -static inline void le32_to_cpu_array(uint32_t *buf, unsigned int words) -{ - while (words--) { - *buf = le32_to_cpup(buf); - ++buf; - } -} - -static inline uint32_t rol32(uint32_t word, unsigned int shift) -{ - return (word << (shift & 31)) | (word >> ((-shift) & 31)); -} -static inline uint32_t ror32(uint32_t word, unsigned int shift) -{ - return (word >> (shift & 31)) | (word << ((-shift) & 31)); -} - -static void xor_cpy(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, - size_t len) -{ - size_t i; - - for (i = 0; i < len; ++i) - dst[i] = src1[i] ^ src2[i]; -} - -#define QUARTER_ROUND(x, a, b, c, d) ( \ - x[a] += x[b], \ - x[d] = rol32((x[d] ^ x[a]), 16), \ - x[c] += x[d], \ - x[b] = rol32((x[b] ^ x[c]), 12), \ - x[a] += x[b], \ - x[d] = rol32((x[d] ^ x[a]), 8), \ - x[c] += x[d], \ - x[b] = rol32((x[b]
^ x[c]), 7) \ -) - -#define C(i, j) (i * 4 + j) - -#define DOUBLE_ROUND(x) ( \ - /* Column Round */ \ - QUARTER_ROUND(x, C(0, 0), C(1, 0), C(2, 0), C(3, 0)), \ - QUARTER_ROUND(x, C(0, 1), C(1, 1), C(2, 1), C(3, 1)), \ - QUARTER_ROUND(x, C(0, 2), C(1, 2), C(2, 2), C(3, 2)), \ - QUARTER_ROUND(x, C(0, 3), C(1, 3), C(2, 3), C(3, 3)), \ - /* Diagonal Round */ \ - QUARTER_ROUND(x, C(0, 0), C(1, 1), C(2, 2), C(3, 3)), \ - QUARTER_ROUND(x, C(0, 1), C(1, 2), C(2, 3), C(3, 0)), \ - QUARTER_ROUND(x, C(0, 2), C(1, 3), C(2, 0), C(3, 1)), \ - QUARTER_ROUND(x, C(0, 3), C(1, 0), C(2, 1), C(3, 2)) \ -) - -#define TWENTY_ROUNDS(x) ( \ - DOUBLE_ROUND(x), \ - DOUBLE_ROUND(x), \ - DOUBLE_ROUND(x), \ - DOUBLE_ROUND(x), \ - DOUBLE_ROUND(x), \ - DOUBLE_ROUND(x), \ - DOUBLE_ROUND(x), \ - DOUBLE_ROUND(x), \ - DOUBLE_ROUND(x), \ - DOUBLE_ROUND(x) \ -) - -enum chacha20_lengths { - CHACHA20_NONCE_SIZE = 16, - CHACHA20_KEY_SIZE = 32, - CHACHA20_KEY_WORDS = CHACHA20_KEY_SIZE / sizeof(uint32_t), - CHACHA20_BLOCK_SIZE = 64, - CHACHA20_BLOCK_WORDS = CHACHA20_BLOCK_SIZE / sizeof(uint32_t), - HCHACHA20_NONCE_SIZE = CHACHA20_NONCE_SIZE, - HCHACHA20_KEY_SIZE = CHACHA20_KEY_SIZE -}; - -enum chacha20_constants { /* expand 32-byte k */ - CHACHA20_CONSTANT_EXPA = 0x61707865U, - CHACHA20_CONSTANT_ND_3 = 0x3320646eU, - CHACHA20_CONSTANT_2_BY = 0x79622d32U, - CHACHA20_CONSTANT_TE_K = 0x6b206574U -}; - -struct chacha20_ctx { - union { - uint32_t state[16]; - struct { - uint32_t constant[4]; - uint32_t key[8]; - uint32_t counter[4]; - }; - }; -}; - -static void chacha20_init(struct chacha20_ctx *ctx, - const uint8_t key[CHACHA20_KEY_SIZE], - const uint64_t nonce) -{ - ctx->constant[0] = CHACHA20_CONSTANT_EXPA; - ctx->constant[1] = CHACHA20_CONSTANT_ND_3; - ctx->constant[2] = CHACHA20_CONSTANT_2_BY; - ctx->constant[3] = CHACHA20_CONSTANT_TE_K; - ctx->key[0] = get_unaligned_le32(key + 0); - ctx->key[1] = get_unaligned_le32(key + 4); - ctx->key[2] = get_unaligned_le32(key + 8); - ctx->key[3] = get_unaligned_le32(key + 12); - ctx->key[4] = get_unaligned_le32(key + 16); - ctx->key[5] = get_unaligned_le32(key + 20); - ctx->key[6] = get_unaligned_le32(key + 24); - ctx->key[7] = get_unaligned_le32(key + 28); - ctx->counter[0] = 0; - ctx->counter[1] = 0; - ctx->counter[2] = nonce & 0xffffffffU; - ctx->counter[3] = nonce >> 32; -} - -static void chacha20_block(struct chacha20_ctx *ctx, uint32_t *stream) -{ - uint32_t x[CHACHA20_BLOCK_WORDS]; - int i; - - for (i = 0; i < ARRAY_SIZE(x); ++i) - x[i] = ctx->state[i]; - - TWENTY_ROUNDS(x); - - for (i = 0; i < ARRAY_SIZE(x); ++i) - stream[i] = cpu_to_le32(x[i] + ctx->state[i]); - - ctx->counter[0] += 1; -} - -static void chacha20(struct chacha20_ctx *ctx, uint8_t *out, const uint8_t *in, - uint32_t len) -{ - uint32_t buf[CHACHA20_BLOCK_WORDS]; - - while (len >= CHACHA20_BLOCK_SIZE) { - chacha20_block(ctx, buf); - xor_cpy(out, in, (uint8_t *)buf, CHACHA20_BLOCK_SIZE); - len -= CHACHA20_BLOCK_SIZE; - out += CHACHA20_BLOCK_SIZE; - in += CHACHA20_BLOCK_SIZE; - } - if (len) { - chacha20_block(ctx, buf); - xor_cpy(out, in, (uint8_t *)buf, len); - } -} - -static void hchacha20(uint32_t derived_key[CHACHA20_KEY_WORDS], - const uint8_t nonce[HCHACHA20_NONCE_SIZE], - const uint8_t key[HCHACHA20_KEY_SIZE]) -{ - uint32_t x[] = { CHACHA20_CONSTANT_EXPA, - CHACHA20_CONSTANT_ND_3, - CHACHA20_CONSTANT_2_BY, - CHACHA20_CONSTANT_TE_K, - get_unaligned_le32(key + 0), - get_unaligned_le32(key + 4), - get_unaligned_le32(key + 8), - get_unaligned_le32(key + 12), - get_unaligned_le32(key + 16), - get_unaligned_le32(key + 
20), - get_unaligned_le32(key + 24), - get_unaligned_le32(key + 28), - get_unaligned_le32(nonce + 0), - get_unaligned_le32(nonce + 4), - get_unaligned_le32(nonce + 8), - get_unaligned_le32(nonce + 12) - }; - - TWENTY_ROUNDS(x); - - memcpy(derived_key + 0, x + 0, sizeof(uint32_t) * 4); - memcpy(derived_key + 4, x + 12, sizeof(uint32_t) * 4); -} - -enum poly1305_lengths { - POLY1305_BLOCK_SIZE = 16, - POLY1305_KEY_SIZE = 32, - POLY1305_MAC_SIZE = 16 -}; - -struct poly1305_internal { - uint32_t h[5]; - uint32_t r[5]; - uint32_t s[4]; -}; - -struct poly1305_ctx { - struct poly1305_internal state; - uint32_t nonce[4]; - uint8_t data[POLY1305_BLOCK_SIZE]; - size_t num; -}; - -static void poly1305_init_core(struct poly1305_internal *st, - const uint8_t key[16]) -{ - /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ - st->r[0] = (get_unaligned_le32(&key[0])) & 0x3ffffff; - st->r[1] = (get_unaligned_le32(&key[3]) >> 2) & 0x3ffff03; - st->r[2] = (get_unaligned_le32(&key[6]) >> 4) & 0x3ffc0ff; - st->r[3] = (get_unaligned_le32(&key[9]) >> 6) & 0x3f03fff; - st->r[4] = (get_unaligned_le32(&key[12]) >> 8) & 0x00fffff; - - /* s = 5*r */ - st->s[0] = st->r[1] * 5; - st->s[1] = st->r[2] * 5; - st->s[2] = st->r[3] * 5; - st->s[3] = st->r[4] * 5; - - /* h = 0 */ - st->h[0] = 0; - st->h[1] = 0; - st->h[2] = 0; - st->h[3] = 0; - st->h[4] = 0; -} - -static void poly1305_blocks_core(struct poly1305_internal *st, - const uint8_t *input, size_t len, - const uint32_t padbit) -{ - const uint32_t hibit = padbit << 24; - uint32_t r0, r1, r2, r3, r4; - uint32_t s1, s2, s3, s4; - uint32_t h0, h1, h2, h3, h4; - uint64_t d0, d1, d2, d3, d4; - uint32_t c; - - r0 = st->r[0]; - r1 = st->r[1]; - r2 = st->r[2]; - r3 = st->r[3]; - r4 = st->r[4]; - - s1 = st->s[0]; - s2 = st->s[1]; - s3 = st->s[2]; - s4 = st->s[3]; - - h0 = st->h[0]; - h1 = st->h[1]; - h2 = st->h[2]; - h3 = st->h[3]; - h4 = st->h[4]; - - while (len >= POLY1305_BLOCK_SIZE) { - /* h += m[i] */ - h0 += (get_unaligned_le32(&input[0])) & 0x3ffffff; - h1 += (get_unaligned_le32(&input[3]) >> 2) & 0x3ffffff; - h2 += (get_unaligned_le32(&input[6]) >> 4) & 0x3ffffff; - h3 += (get_unaligned_le32(&input[9]) >> 6) & 0x3ffffff; - h4 += (get_unaligned_le32(&input[12]) >> 8) | hibit; - - /* h *= r */ - d0 = ((uint64_t)h0 * r0) + ((uint64_t)h1 * s4) + - ((uint64_t)h2 * s3) + ((uint64_t)h3 * s2) + - ((uint64_t)h4 * s1); - d1 = ((uint64_t)h0 * r1) + ((uint64_t)h1 * r0) + - ((uint64_t)h2 * s4) + ((uint64_t)h3 * s3) + - ((uint64_t)h4 * s2); - d2 = ((uint64_t)h0 * r2) + ((uint64_t)h1 * r1) + - ((uint64_t)h2 * r0) + ((uint64_t)h3 * s4) + - ((uint64_t)h4 * s3); - d3 = ((uint64_t)h0 * r3) + ((uint64_t)h1 * r2) + - ((uint64_t)h2 * r1) + ((uint64_t)h3 * r0) + - ((uint64_t)h4 * s4); - d4 = ((uint64_t)h0 * r4) + ((uint64_t)h1 * r3) + - ((uint64_t)h2 * r2) + ((uint64_t)h3 * r1) + - ((uint64_t)h4 * r0); - - /* (partial) h %= p */ - c = (uint32_t)(d0 >> 26); - h0 = (uint32_t)d0 & 0x3ffffff; - d1 += c; - c = (uint32_t)(d1 >> 26); - h1 = (uint32_t)d1 & 0x3ffffff; - d2 += c; - c = (uint32_t)(d2 >> 26); - h2 = (uint32_t)d2 & 0x3ffffff; - d3 += c; - c = (uint32_t)(d3 >> 26); - h3 = (uint32_t)d3 & 0x3ffffff; - d4 += c; - c = (uint32_t)(d4 >> 26); - h4 = (uint32_t)d4 & 0x3ffffff; - h0 += c * 5; - c = (h0 >> 26); - h0 = h0 & 0x3ffffff; - h1 += c; - - input += POLY1305_BLOCK_SIZE; - len -= POLY1305_BLOCK_SIZE; - } - - st->h[0] = h0; - st->h[1] = h1; - st->h[2] = h2; - st->h[3] = h3; - st->h[4] = h4; -} - -static void poly1305_emit_core(struct poly1305_internal *st, uint8_t mac[16], - const uint32_t nonce[4]) 
-{ - uint32_t h0, h1, h2, h3, h4, c; - uint32_t g0, g1, g2, g3, g4; - uint64_t f; - uint32_t mask; - - /* fully carry h */ - h0 = st->h[0]; - h1 = st->h[1]; - h2 = st->h[2]; - h3 = st->h[3]; - h4 = st->h[4]; - - c = h1 >> 26; - h1 = h1 & 0x3ffffff; - h2 += c; - c = h2 >> 26; - h2 = h2 & 0x3ffffff; - h3 += c; - c = h3 >> 26; - h3 = h3 & 0x3ffffff; - h4 += c; - c = h4 >> 26; - h4 = h4 & 0x3ffffff; - h0 += c * 5; - c = h0 >> 26; - h0 = h0 & 0x3ffffff; - h1 += c; - - /* compute h + -p */ - g0 = h0 + 5; - c = g0 >> 26; - g0 &= 0x3ffffff; - g1 = h1 + c; - c = g1 >> 26; - g1 &= 0x3ffffff; - g2 = h2 + c; - c = g2 >> 26; - g2 &= 0x3ffffff; - g3 = h3 + c; - c = g3 >> 26; - g3 &= 0x3ffffff; - g4 = h4 + c - (1UL << 26); - - /* select h if h < p, or h + -p if h >= p */ - mask = (g4 >> ((sizeof(uint32_t) * 8) - 1)) - 1; - g0 &= mask; - g1 &= mask; - g2 &= mask; - g3 &= mask; - g4 &= mask; - mask = ~mask; - - h0 = (h0 & mask) | g0; - h1 = (h1 & mask) | g1; - h2 = (h2 & mask) | g2; - h3 = (h3 & mask) | g3; - h4 = (h4 & mask) | g4; - - /* h = h % (2^128) */ - h0 = ((h0) | (h1 << 26)) & 0xffffffff; - h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff; - h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff; - h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff; - - /* mac = (h + nonce) % (2^128) */ - f = (uint64_t)h0 + nonce[0]; - h0 = (uint32_t)f; - f = (uint64_t)h1 + nonce[1] + (f >> 32); - h1 = (uint32_t)f; - f = (uint64_t)h2 + nonce[2] + (f >> 32); - h2 = (uint32_t)f; - f = (uint64_t)h3 + nonce[3] + (f >> 32); - h3 = (uint32_t)f; - - put_unaligned_le32(h0, &mac[0]); - put_unaligned_le32(h1, &mac[4]); - put_unaligned_le32(h2, &mac[8]); - put_unaligned_le32(h3, &mac[12]); -} - -static void poly1305_init(struct poly1305_ctx *ctx, - const uint8_t key[POLY1305_KEY_SIZE]) -{ - ctx->nonce[0] = get_unaligned_le32(&key[16]); - ctx->nonce[1] = get_unaligned_le32(&key[20]); - ctx->nonce[2] = get_unaligned_le32(&key[24]); - ctx->nonce[3] = get_unaligned_le32(&key[28]); - - poly1305_init_core(&ctx->state, key); - - ctx->num = 0; -} - -static void poly1305_update(struct poly1305_ctx *ctx, const uint8_t *input, - size_t len) -{ - const size_t num = ctx->num; - size_t rem; - - if (num) { - rem = POLY1305_BLOCK_SIZE - num; - if (len < rem) { - memcpy(ctx->data + num, input, len); - ctx->num = num + len; - return; - } - memcpy(ctx->data + num, input, rem); - poly1305_blocks_core(&ctx->state, ctx->data, - POLY1305_BLOCK_SIZE, 1); - input += rem; - len -= rem; - } - - rem = len % POLY1305_BLOCK_SIZE; - len -= rem; - - if (len >= POLY1305_BLOCK_SIZE) { - poly1305_blocks_core(&ctx->state, input, len, 1); - input += len; - } - - if (rem) - memcpy(ctx->data, input, rem); - - ctx->num = rem; -} - -static void poly1305_final(struct poly1305_ctx *ctx, - uint8_t mac[POLY1305_MAC_SIZE]) -{ - size_t num = ctx->num; - - if (num) { - ctx->data[num++] = 1; - while (num < POLY1305_BLOCK_SIZE) - ctx->data[num++] = 0; - poly1305_blocks_core(&ctx->state, ctx->data, - POLY1305_BLOCK_SIZE, 0); - } - - poly1305_emit_core(&ctx->state, mac, ctx->nonce); - - explicit_bzero(ctx, sizeof(*ctx)); -} - - -static const uint8_t pad0[16] = { 0 }; - -void -chacha20poly1305_encrypt(uint8_t *dst, const uint8_t *src, const size_t src_len, - const uint8_t *ad, const size_t ad_len, - const uint64_t nonce, - const uint8_t key[CHACHA20POLY1305_KEY_SIZE]) -{ - struct poly1305_ctx poly1305_state; - struct chacha20_ctx chacha20_state; - union { - uint8_t block0[POLY1305_KEY_SIZE]; - uint64_t lens[2]; - } b = { { 0 } }; - - chacha20_init(&chacha20_state, key, nonce); - 
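/* The first keystream block (counter 0) is used as the one-time - * Poly1305 key; generating it advances the block counter, so the - * payload encryption below starts at counter 1, per the - * ChaCha20-Poly1305 construction. */ -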
chacha20(&chacha20_state, b.block0, b.block0, sizeof(b.block0)); - poly1305_init(&poly1305_state, b.block0); - - poly1305_update(&poly1305_state, ad, ad_len); - poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf); - - chacha20(&chacha20_state, dst, src, src_len); - - poly1305_update(&poly1305_state, dst, src_len); - poly1305_update(&poly1305_state, pad0, (0x10 - src_len) & 0xf); - - b.lens[0] = cpu_to_le64(ad_len); - b.lens[1] = cpu_to_le64(src_len); - poly1305_update(&poly1305_state, (uint8_t *)b.lens, sizeof(b.lens)); - - poly1305_final(&poly1305_state, dst + src_len); - - explicit_bzero(&chacha20_state, sizeof(chacha20_state)); - explicit_bzero(&b, sizeof(b)); -} - -bool -chacha20poly1305_decrypt(uint8_t *dst, const uint8_t *src, const size_t src_len, - const uint8_t *ad, const size_t ad_len, - const uint64_t nonce, - const uint8_t key[CHACHA20POLY1305_KEY_SIZE]) -{ - struct poly1305_ctx poly1305_state; - struct chacha20_ctx chacha20_state; - bool ret; - size_t dst_len; - union { - uint8_t block0[POLY1305_KEY_SIZE]; - uint8_t mac[POLY1305_MAC_SIZE]; - uint64_t lens[2]; - } b = { { 0 } }; - - if (src_len < POLY1305_MAC_SIZE) - return false; - - chacha20_init(&chacha20_state, key, nonce); - chacha20(&chacha20_state, b.block0, b.block0, sizeof(b.block0)); - poly1305_init(&poly1305_state, b.block0); - - poly1305_update(&poly1305_state, ad, ad_len); - poly1305_update(&poly1305_state, pad0, (0x10 - ad_len) & 0xf); - - dst_len = src_len - POLY1305_MAC_SIZE; - poly1305_update(&poly1305_state, src, dst_len); - poly1305_update(&poly1305_state, pad0, (0x10 - dst_len) & 0xf); - - b.lens[0] = cpu_to_le64(ad_len); - b.lens[1] = cpu_to_le64(dst_len); - poly1305_update(&poly1305_state, (uint8_t *)b.lens, sizeof(b.lens)); - - poly1305_final(&poly1305_state, b.mac); - - ret = timingsafe_bcmp(b.mac, src + dst_len, POLY1305_MAC_SIZE) == 0; - if (ret) - chacha20(&chacha20_state, dst, src, dst_len); - - explicit_bzero(&chacha20_state, sizeof(chacha20_state)); - explicit_bzero(&b, sizeof(b)); - - return ret; -} - -void -xchacha20poly1305_encrypt(uint8_t *dst, const uint8_t *src, - const size_t src_len, const uint8_t *ad, - const size_t ad_len, - const uint8_t nonce[XCHACHA20POLY1305_NONCE_SIZE], - const uint8_t key[CHACHA20POLY1305_KEY_SIZE]) -{ - uint32_t derived_key[CHACHA20_KEY_WORDS]; - - hchacha20(derived_key, nonce, key); - cpu_to_le32_array(derived_key, ARRAY_SIZE(derived_key)); - chacha20poly1305_encrypt(dst, src, src_len, ad, ad_len, - get_unaligned_le64(nonce + 16), - (uint8_t *)derived_key); - explicit_bzero(derived_key, CHACHA20POLY1305_KEY_SIZE); -} - -bool -xchacha20poly1305_decrypt(uint8_t *dst, const uint8_t *src, - const size_t src_len, const uint8_t *ad, - const size_t ad_len, - const uint8_t nonce[XCHACHA20POLY1305_NONCE_SIZE], - const uint8_t key[CHACHA20POLY1305_KEY_SIZE]) -{ - bool ret; - uint32_t derived_key[CHACHA20_KEY_WORDS]; - - hchacha20(derived_key, nonce, key); - cpu_to_le32_array(derived_key, ARRAY_SIZE(derived_key)); - ret = chacha20poly1305_decrypt(dst, src, src_len, ad, ad_len, - get_unaligned_le64(nonce + 16), - (uint8_t *)derived_key); - explicit_bzero(derived_key, CHACHA20POLY1305_KEY_SIZE); - return ret; -} - - -static const uint32_t blake2s_iv[8] = { - 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, - 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL -}; - -static const uint8_t blake2s_sigma[10][16] = { - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - { 11, 8, 12, 0, 5, 2, 15, 
13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, - { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, - { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, - { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, -}; - -static inline void blake2s_set_lastblock(struct blake2s_state *state) -{ - state->f[0] = -1; -} - -static inline void blake2s_increment_counter(struct blake2s_state *state, - const uint32_t inc) -{ - state->t[0] += inc; - state->t[1] += (state->t[0] < inc); -} - -static inline void blake2s_init_param(struct blake2s_state *state, - const uint32_t param) -{ - int i; - - memset(state, 0, sizeof(*state)); - for (i = 0; i < 8; ++i) - state->h[i] = blake2s_iv[i]; - state->h[0] ^= param; -} - -void blake2s_init(struct blake2s_state *state, const size_t outlen) -{ - blake2s_init_param(state, 0x01010000 | outlen); - state->outlen = outlen; -} - -void blake2s_init_key(struct blake2s_state *state, const size_t outlen, - const uint8_t *key, const size_t keylen) -{ - uint8_t block[BLAKE2S_BLOCK_SIZE] = { 0 }; - - blake2s_init_param(state, 0x01010000 | keylen << 8 | outlen); - state->outlen = outlen; - memcpy(block, key, keylen); - blake2s_update(state, block, BLAKE2S_BLOCK_SIZE); - explicit_bzero(block, BLAKE2S_BLOCK_SIZE); -} - -static inline void blake2s_compress(struct blake2s_state *state, - const uint8_t *block, size_t nblocks, - const uint32_t inc) -{ - uint32_t m[16]; - uint32_t v[16]; - int i; - - while (nblocks > 0) { - blake2s_increment_counter(state, inc); - memcpy(m, block, BLAKE2S_BLOCK_SIZE); - le32_to_cpu_array(m, ARRAY_SIZE(m)); - memcpy(v, state->h, 32); - v[ 8] = blake2s_iv[0]; - v[ 9] = blake2s_iv[1]; - v[10] = blake2s_iv[2]; - v[11] = blake2s_iv[3]; - v[12] = blake2s_iv[4] ^ state->t[0]; - v[13] = blake2s_iv[5] ^ state->t[1]; - v[14] = blake2s_iv[6] ^ state->f[0]; - v[15] = blake2s_iv[7] ^ state->f[1]; - -#define G(r, i, a, b, c, d) do { \ - a += b + m[blake2s_sigma[r][2 * i + 0]]; \ - d = ror32(d ^ a, 16); \ - c += d; \ - b = ror32(b ^ c, 12); \ - a += b + m[blake2s_sigma[r][2 * i + 1]]; \ - d = ror32(d ^ a, 8); \ - c += d; \ - b = ror32(b ^ c, 7); \ -} while (0) - -#define ROUND(r) do { \ - G(r, 0, v[0], v[ 4], v[ 8], v[12]); \ - G(r, 1, v[1], v[ 5], v[ 9], v[13]); \ - G(r, 2, v[2], v[ 6], v[10], v[14]); \ - G(r, 3, v[3], v[ 7], v[11], v[15]); \ - G(r, 4, v[0], v[ 5], v[10], v[15]); \ - G(r, 5, v[1], v[ 6], v[11], v[12]); \ - G(r, 6, v[2], v[ 7], v[ 8], v[13]); \ - G(r, 7, v[3], v[ 4], v[ 9], v[14]); \ -} while (0) - ROUND(0); - ROUND(1); - ROUND(2); - ROUND(3); - ROUND(4); - ROUND(5); - ROUND(6); - ROUND(7); - ROUND(8); - ROUND(9); - -#undef G -#undef ROUND - - for (i = 0; i < 8; ++i) - state->h[i] ^= v[i] ^ v[i + 8]; - - block += BLAKE2S_BLOCK_SIZE; - --nblocks; - } -} - -void blake2s_update(struct blake2s_state *state, const uint8_t *in, size_t inlen) -{ - const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen; - - if (!inlen) - return; - if (inlen > fill) { - memcpy(state->buf + state->buflen, in, fill); - blake2s_compress(state, state->buf, 1, BLAKE2S_BLOCK_SIZE); - state->buflen = 0; - in += fill; - inlen -= fill; - } - if (inlen > BLAKE2S_BLOCK_SIZE) { - const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE); - /* Hash one less (full) block than strictly possible */ - blake2s_compress(state, in, nblocks - 1, 
BLAKE2S_BLOCK_SIZE); - in += BLAKE2S_BLOCK_SIZE * (nblocks - 1); - inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1); - } - memcpy(state->buf + state->buflen, in, inlen); - state->buflen += inlen; -} - -void blake2s_final(struct blake2s_state *state, uint8_t *out) -{ - blake2s_set_lastblock(state); - memset(state->buf + state->buflen, 0, - BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */ - blake2s_compress(state, state->buf, 1, state->buflen); - cpu_to_le32_array(state->h, ARRAY_SIZE(state->h)); - memcpy(out, state->h, state->outlen); - explicit_bzero(state, sizeof(*state)); -} - -void blake2s(uint8_t *out, const uint8_t *in, const uint8_t *key, - const size_t outlen, const size_t inlen, const size_t keylen) -{ - struct blake2s_state state; - - if (keylen) - blake2s_init_key(&state, outlen, key, keylen); - else - blake2s_init(&state, outlen); - - blake2s_update(&state, in, inlen); - blake2s_final(&state, out); -} - -void blake2s_hmac(uint8_t *out, const uint8_t *in, const uint8_t *key, const size_t outlen, - const size_t inlen, const size_t keylen) -{ - struct blake2s_state state; - uint8_t x_key[BLAKE2S_BLOCK_SIZE] __aligned(sizeof(uint32_t)) = { 0 }; - uint8_t i_hash[BLAKE2S_HASH_SIZE] __aligned(sizeof(uint32_t)); - int i; - - if (keylen > BLAKE2S_BLOCK_SIZE) { - blake2s_init(&state, BLAKE2S_HASH_SIZE); - blake2s_update(&state, key, keylen); - blake2s_final(&state, x_key); - } else - memcpy(x_key, key, keylen); - - for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i) - x_key[i] ^= 0x36; - - blake2s_init(&state, BLAKE2S_HASH_SIZE); - blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE); - blake2s_update(&state, in, inlen); - blake2s_final(&state, i_hash); - - for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i) - x_key[i] ^= 0x5c ^ 0x36; - - blake2s_init(&state, BLAKE2S_HASH_SIZE); - blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE); - blake2s_update(&state, i_hash, BLAKE2S_HASH_SIZE); - blake2s_final(&state, i_hash); - - memcpy(out, i_hash, outlen); - explicit_bzero(x_key, BLAKE2S_BLOCK_SIZE); - explicit_bzero(i_hash, BLAKE2S_HASH_SIZE); -} - - -/* Below here is fiat's implementation of x25519. - * - * Copyright (C) 2015-2016 The fiat-crypto Authors. - * Copyright (C) 2018-2021 Jason A. Donenfeld . All Rights Reserved. - * - * This is a machine-generated formally verified implementation of Curve25519 - * ECDH from: . Though originally - * machine generated, it has been tweaked to be suitable for use in the kernel. - * It is optimized for 32-bit machines and machines that cannot work efficiently - * with 128-bit integer types. - */ - -/* fe means field element. Here the field is \Z/(2^255-19). An element t, - * entries t[0]...t[9], represents the integer t[0]+2^26 t[1]+2^51 t[2]+2^77 - * t[3]+2^102 t[4]+...+2^230 t[9]. - * fe limbs are bounded by 1.125*2^26,1.125*2^25,1.125*2^26,1.125*2^25,etc. - * Multiplication and carrying produce fe from fe_loose. - */ -typedef struct fe { uint32_t v[10]; } fe; - -/* fe_loose limbs are bounded by 3.375*2^26,3.375*2^25,3.375*2^26,3.375*2^25,etc - * Addition and subtraction produce fe_loose from (fe, fe). - */ -typedef struct fe_loose { uint32_t v[10]; } fe_loose; - -static inline void fe_frombytes_impl(uint32_t h[10], const uint8_t *s) -{ - /* Ignores top bit of s. 
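The 255-bit little-endian value is unpacked into ten limbs - * of alternating 26 and 25 bits (radix 2^25.5), matching the fe - * representation described above.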
*/ - uint32_t a0 = get_unaligned_le32(s); - uint32_t a1 = get_unaligned_le32(s+4); - uint32_t a2 = get_unaligned_le32(s+8); - uint32_t a3 = get_unaligned_le32(s+12); - uint32_t a4 = get_unaligned_le32(s+16); - uint32_t a5 = get_unaligned_le32(s+20); - uint32_t a6 = get_unaligned_le32(s+24); - uint32_t a7 = get_unaligned_le32(s+28); - h[0] = a0&((1<<26)-1); /* 26 used, 32-26 left. 26 */ - h[1] = (a0>>26) | ((a1&((1<<19)-1))<< 6); /* (32-26) + 19 = 6+19 = 25 */ - h[2] = (a1>>19) | ((a2&((1<<13)-1))<<13); /* (32-19) + 13 = 13+13 = 26 */ - h[3] = (a2>>13) | ((a3&((1<< 6)-1))<<19); /* (32-13) + 6 = 19+ 6 = 25 */ - h[4] = (a3>> 6); /* (32- 6) = 26 */ - h[5] = a4&((1<<25)-1); /* 25 */ - h[6] = (a4>>25) | ((a5&((1<<19)-1))<< 7); /* (32-25) + 19 = 7+19 = 26 */ - h[7] = (a5>>19) | ((a6&((1<<12)-1))<<13); /* (32-19) + 12 = 13+12 = 25 */ - h[8] = (a6>>12) | ((a7&((1<< 6)-1))<<20); /* (32-12) + 6 = 20+ 6 = 26 */ - h[9] = (a7>> 6)&((1<<25)-1); /* 25 */ -} - -static inline void fe_frombytes(fe *h, const uint8_t *s) -{ - fe_frombytes_impl(h->v, s); -} - -static inline uint8_t /*bool*/ -addcarryx_u25(uint8_t /*bool*/ c, uint32_t a, uint32_t b, uint32_t *low) -{ - /* This function extracts 25 bits of result and 1 bit of carry - * (26 total), so a 32-bit intermediate is sufficient. - */ - uint32_t x = a + b + c; - *low = x & ((1 << 25) - 1); - return (x >> 25) & 1; -} - -static inline uint8_t /*bool*/ -addcarryx_u26(uint8_t /*bool*/ c, uint32_t a, uint32_t b, uint32_t *low) -{ - /* This function extracts 26 bits of result and 1 bit of carry - * (27 total), so a 32-bit intermediate is sufficient. - */ - uint32_t x = a + b + c; - *low = x & ((1 << 26) - 1); - return (x >> 26) & 1; -} - -static inline uint8_t /*bool*/ -subborrow_u25(uint8_t /*bool*/ c, uint32_t a, uint32_t b, uint32_t *low) -{ - /* This function extracts 25 bits of result and 1 bit of borrow - * (26 total), so a 32-bit intermediate is sufficient. - */ - uint32_t x = a - b - c; - *low = x & ((1 << 25) - 1); - return x >> 31; -} - -static inline uint8_t /*bool*/ -subborrow_u26(uint8_t /*bool*/ c, uint32_t a, uint32_t b, uint32_t *low) -{ - /* This function extracts 26 bits of result and 1 bit of borrow - *(27 total), so a 32-bit intermediate is sufficient. 
- */ - uint32_t x = a - b - c; - *low = x & ((1 << 26) - 1); - return x >> 31; -} - -static inline uint32_t cmovznz32(uint32_t t, uint32_t z, uint32_t nz) -{ - t = -!!t; /* all set if nonzero, 0 if 0 */ - return (t&nz) | ((~t)&z); -} - -static inline void fe_freeze(uint32_t out[10], const uint32_t in1[10]) -{ - const uint32_t x17 = in1[9]; - const uint32_t x18 = in1[8]; - const uint32_t x16 = in1[7]; - const uint32_t x14 = in1[6]; - const uint32_t x12 = in1[5]; - const uint32_t x10 = in1[4]; - const uint32_t x8 = in1[3]; - const uint32_t x6 = in1[2]; - const uint32_t x4 = in1[1]; - const uint32_t x2 = in1[0]; - uint32_t x20; uint8_t/*bool*/ x21 = subborrow_u26(0x0, x2, 0x3ffffed, &x20); - uint32_t x23; uint8_t/*bool*/ x24 = subborrow_u25(x21, x4, 0x1ffffff, &x23); - uint32_t x26; uint8_t/*bool*/ x27 = subborrow_u26(x24, x6, 0x3ffffff, &x26); - uint32_t x29; uint8_t/*bool*/ x30 = subborrow_u25(x27, x8, 0x1ffffff, &x29); - uint32_t x32; uint8_t/*bool*/ x33 = subborrow_u26(x30, x10, 0x3ffffff, &x32); - uint32_t x35; uint8_t/*bool*/ x36 = subborrow_u25(x33, x12, 0x1ffffff, &x35); - uint32_t x38; uint8_t/*bool*/ x39 = subborrow_u26(x36, x14, 0x3ffffff, &x38); - uint32_t x41; uint8_t/*bool*/ x42 = subborrow_u25(x39, x16, 0x1ffffff, &x41); - uint32_t x44; uint8_t/*bool*/ x45 = subborrow_u26(x42, x18, 0x3ffffff, &x44); - uint32_t x47; uint8_t/*bool*/ x48 = subborrow_u25(x45, x17, 0x1ffffff, &x47); - uint32_t x49 = cmovznz32(x48, 0x0, 0xffffffff); - uint32_t x50 = (x49 & 0x3ffffed); - uint32_t x52; uint8_t/*bool*/ x53 = addcarryx_u26(0x0, x20, x50, &x52); - uint32_t x54 = (x49 & 0x1ffffff); - uint32_t x56; uint8_t/*bool*/ x57 = addcarryx_u25(x53, x23, x54, &x56); - uint32_t x58 = (x49 & 0x3ffffff); - uint32_t x60; uint8_t/*bool*/ x61 = addcarryx_u26(x57, x26, x58, &x60); - uint32_t x62 = (x49 & 0x1ffffff); - uint32_t x64; uint8_t/*bool*/ x65 = addcarryx_u25(x61, x29, x62, &x64); - uint32_t x66 = (x49 & 0x3ffffff); - uint32_t x68; uint8_t/*bool*/ x69 = addcarryx_u26(x65, x32, x66, &x68); - uint32_t x70 = (x49 & 0x1ffffff); - uint32_t x72; uint8_t/*bool*/ x73 = addcarryx_u25(x69, x35, x70, &x72); - uint32_t x74 = (x49 & 0x3ffffff); - uint32_t x76; uint8_t/*bool*/ x77 = addcarryx_u26(x73, x38, x74, &x76); - uint32_t x78 = (x49 & 0x1ffffff); - uint32_t x80; uint8_t/*bool*/ x81 = addcarryx_u25(x77, x41, x78, &x80); - uint32_t x82 = (x49 & 0x3ffffff); - uint32_t x84; uint8_t/*bool*/ x85 = addcarryx_u26(x81, x44, x82, &x84); - uint32_t x86 = (x49 & 0x1ffffff); - uint32_t x88; addcarryx_u25(x85, x47, x86, &x88); - out[0] = x52; - out[1] = x56; - out[2] = x60; - out[3] = x64; - out[4] = x68; - out[5] = x72; - out[6] = x76; - out[7] = x80; - out[8] = x84; - out[9] = x88; -} - -static inline void fe_tobytes(uint8_t s[32], const fe *f) -{ - uint32_t h[10]; - fe_freeze(h, f->v); - s[0] = h[0] >> 0; - s[1] = h[0] >> 8; - s[2] = h[0] >> 16; - s[3] = (h[0] >> 24) | (h[1] << 2); - s[4] = h[1] >> 6; - s[5] = h[1] >> 14; - s[6] = (h[1] >> 22) | (h[2] << 3); - s[7] = h[2] >> 5; - s[8] = h[2] >> 13; - s[9] = (h[2] >> 21) | (h[3] << 5); - s[10] = h[3] >> 3; - s[11] = h[3] >> 11; - s[12] = (h[3] >> 19) | (h[4] << 6); - s[13] = h[4] >> 2; - s[14] = h[4] >> 10; - s[15] = h[4] >> 18; - s[16] = h[5] >> 0; - s[17] = h[5] >> 8; - s[18] = h[5] >> 16; - s[19] = (h[5] >> 24) | (h[6] << 1); - s[20] = h[6] >> 7; - s[21] = h[6] >> 15; - s[22] = (h[6] >> 23) | (h[7] << 3); - s[23] = h[7] >> 5; - s[24] = h[7] >> 13; - s[25] = (h[7] >> 21) | (h[8] << 4); - s[26] = h[8] >> 4; - s[27] = h[8] >> 12; - s[28] = (h[8] >> 20) | (h[9] << 6); - 
s[29] = h[9] >> 2; - s[30] = h[9] >> 10; - s[31] = h[9] >> 18; -} - -/* h = f */ -static inline void fe_copy(fe *h, const fe *f) -{ - memmove(h, f, sizeof(uint32_t) * 10); -} - -static inline void fe_copy_lt(fe_loose *h, const fe *f) -{ - memmove(h, f, sizeof(uint32_t) * 10); -} - -/* h = 0 */ -static inline void fe_0(fe *h) -{ - memset(h, 0, sizeof(uint32_t) * 10); -} - -/* h = 1 */ -static inline void fe_1(fe *h) -{ - memset(h, 0, sizeof(uint32_t) * 10); - h->v[0] = 1; -} - -static void fe_add_impl(uint32_t out[10], const uint32_t in1[10], const uint32_t in2[10]) -{ - const uint32_t x20 = in1[9]; - const uint32_t x21 = in1[8]; - const uint32_t x19 = in1[7]; - const uint32_t x17 = in1[6]; - const uint32_t x15 = in1[5]; - const uint32_t x13 = in1[4]; - const uint32_t x11 = in1[3]; - const uint32_t x9 = in1[2]; - const uint32_t x7 = in1[1]; - const uint32_t x5 = in1[0]; - const uint32_t x38 = in2[9]; - const uint32_t x39 = in2[8]; - const uint32_t x37 = in2[7]; - const uint32_t x35 = in2[6]; - const uint32_t x33 = in2[5]; - const uint32_t x31 = in2[4]; - const uint32_t x29 = in2[3]; - const uint32_t x27 = in2[2]; - const uint32_t x25 = in2[1]; - const uint32_t x23 = in2[0]; - out[0] = (x5 + x23); - out[1] = (x7 + x25); - out[2] = (x9 + x27); - out[3] = (x11 + x29); - out[4] = (x13 + x31); - out[5] = (x15 + x33); - out[6] = (x17 + x35); - out[7] = (x19 + x37); - out[8] = (x21 + x39); - out[9] = (x20 + x38); -} - -/* h = f + g - * Can overlap h with f or g. - */ -static inline void fe_add(fe_loose *h, const fe *f, const fe *g) -{ - fe_add_impl(h->v, f->v, g->v); -} - -static void fe_sub_impl(uint32_t out[10], const uint32_t in1[10], const uint32_t in2[10]) -{ - const uint32_t x20 = in1[9]; - const uint32_t x21 = in1[8]; - const uint32_t x19 = in1[7]; - const uint32_t x17 = in1[6]; - const uint32_t x15 = in1[5]; - const uint32_t x13 = in1[4]; - const uint32_t x11 = in1[3]; - const uint32_t x9 = in1[2]; - const uint32_t x7 = in1[1]; - const uint32_t x5 = in1[0]; - const uint32_t x38 = in2[9]; - const uint32_t x39 = in2[8]; - const uint32_t x37 = in2[7]; - const uint32_t x35 = in2[6]; - const uint32_t x33 = in2[5]; - const uint32_t x31 = in2[4]; - const uint32_t x29 = in2[3]; - const uint32_t x27 = in2[2]; - const uint32_t x25 = in2[1]; - const uint32_t x23 = in2[0]; - out[0] = ((0x7ffffda + x5) - x23); - out[1] = ((0x3fffffe + x7) - x25); - out[2] = ((0x7fffffe + x9) - x27); - out[3] = ((0x3fffffe + x11) - x29); - out[4] = ((0x7fffffe + x13) - x31); - out[5] = ((0x3fffffe + x15) - x33); - out[6] = ((0x7fffffe + x17) - x35); - out[7] = ((0x3fffffe + x19) - x37); - out[8] = ((0x7fffffe + x21) - x39); - out[9] = ((0x3fffffe + x20) - x38); -} - -/* h = f - g - * Can overlap h with f or g. 
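- * To avoid limb underflow, fe_sub_impl above adds 2*p limb-wise (the - * 0x7ffffda, 0x3fffffe, 0x7fffffe constants) before subtracting, so the - * result satisfies only the looser fe_loose bounds and is carried back - * down by the next multiplication or squaring.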
- */ -static inline void fe_sub(fe_loose *h, const fe *f, const fe *g) -{ - fe_sub_impl(h->v, f->v, g->v); -} - -static void fe_mul_impl(uint32_t out[10], const uint32_t in1[10], const uint32_t in2[10]) -{ - const uint32_t x20 = in1[9]; - const uint32_t x21 = in1[8]; - const uint32_t x19 = in1[7]; - const uint32_t x17 = in1[6]; - const uint32_t x15 = in1[5]; - const uint32_t x13 = in1[4]; - const uint32_t x11 = in1[3]; - const uint32_t x9 = in1[2]; - const uint32_t x7 = in1[1]; - const uint32_t x5 = in1[0]; - const uint32_t x38 = in2[9]; - const uint32_t x39 = in2[8]; - const uint32_t x37 = in2[7]; - const uint32_t x35 = in2[6]; - const uint32_t x33 = in2[5]; - const uint32_t x31 = in2[4]; - const uint32_t x29 = in2[3]; - const uint32_t x27 = in2[2]; - const uint32_t x25 = in2[1]; - const uint32_t x23 = in2[0]; - uint64_t x40 = ((uint64_t)x23 * x5); - uint64_t x41 = (((uint64_t)x23 * x7) + ((uint64_t)x25 * x5)); - uint64_t x42 = ((((uint64_t)(0x2 * x25) * x7) + ((uint64_t)x23 * x9)) + ((uint64_t)x27 * x5)); - uint64_t x43 = (((((uint64_t)x25 * x9) + ((uint64_t)x27 * x7)) + ((uint64_t)x23 * x11)) + ((uint64_t)x29 * x5)); - uint64_t x44 = (((((uint64_t)x27 * x9) + (0x2 * (((uint64_t)x25 * x11) + ((uint64_t)x29 * x7)))) + ((uint64_t)x23 * x13)) + ((uint64_t)x31 * x5)); - uint64_t x45 = (((((((uint64_t)x27 * x11) + ((uint64_t)x29 * x9)) + ((uint64_t)x25 * x13)) + ((uint64_t)x31 * x7)) + ((uint64_t)x23 * x15)) + ((uint64_t)x33 * x5)); - uint64_t x46 = (((((0x2 * ((((uint64_t)x29 * x11) + ((uint64_t)x25 * x15)) + ((uint64_t)x33 * x7))) + ((uint64_t)x27 * x13)) + ((uint64_t)x31 * x9)) + ((uint64_t)x23 * x17)) + ((uint64_t)x35 * x5)); - uint64_t x47 = (((((((((uint64_t)x29 * x13) + ((uint64_t)x31 * x11)) + ((uint64_t)x27 * x15)) + ((uint64_t)x33 * x9)) + ((uint64_t)x25 * x17)) + ((uint64_t)x35 * x7)) + ((uint64_t)x23 * x19)) + ((uint64_t)x37 * x5)); - uint64_t x48 = (((((((uint64_t)x31 * x13) + (0x2 * (((((uint64_t)x29 * x15) + ((uint64_t)x33 * x11)) + ((uint64_t)x25 * x19)) + ((uint64_t)x37 * x7)))) + ((uint64_t)x27 * x17)) + ((uint64_t)x35 * x9)) + ((uint64_t)x23 * x21)) + ((uint64_t)x39 * x5)); - uint64_t x49 = (((((((((((uint64_t)x31 * x15) + ((uint64_t)x33 * x13)) + ((uint64_t)x29 * x17)) + ((uint64_t)x35 * x11)) + ((uint64_t)x27 * x19)) + ((uint64_t)x37 * x9)) + ((uint64_t)x25 * x21)) + ((uint64_t)x39 * x7)) + ((uint64_t)x23 * x20)) + ((uint64_t)x38 * x5)); - uint64_t x50 = (((((0x2 * ((((((uint64_t)x33 * x15) + ((uint64_t)x29 * x19)) + ((uint64_t)x37 * x11)) + ((uint64_t)x25 * x20)) + ((uint64_t)x38 * x7))) + ((uint64_t)x31 * x17)) + ((uint64_t)x35 * x13)) + ((uint64_t)x27 * x21)) + ((uint64_t)x39 * x9)); - uint64_t x51 = (((((((((uint64_t)x33 * x17) + ((uint64_t)x35 * x15)) + ((uint64_t)x31 * x19)) + ((uint64_t)x37 * x13)) + ((uint64_t)x29 * x21)) + ((uint64_t)x39 * x11)) + ((uint64_t)x27 * x20)) + ((uint64_t)x38 * x9)); - uint64_t x52 = (((((uint64_t)x35 * x17) + (0x2 * (((((uint64_t)x33 * x19) + ((uint64_t)x37 * x15)) + ((uint64_t)x29 * x20)) + ((uint64_t)x38 * x11)))) + ((uint64_t)x31 * x21)) + ((uint64_t)x39 * x13)); - uint64_t x53 = (((((((uint64_t)x35 * x19) + ((uint64_t)x37 * x17)) + ((uint64_t)x33 * x21)) + ((uint64_t)x39 * x15)) + ((uint64_t)x31 * x20)) + ((uint64_t)x38 * x13)); - uint64_t x54 = (((0x2 * ((((uint64_t)x37 * x19) + ((uint64_t)x33 * x20)) + ((uint64_t)x38 * x15))) + ((uint64_t)x35 * x21)) + ((uint64_t)x39 * x17)); - uint64_t x55 = (((((uint64_t)x37 * x21) + ((uint64_t)x39 * x19)) + ((uint64_t)x35 * x20)) + ((uint64_t)x38 * x17)); - uint64_t x56 = (((uint64_t)x39 * 
x21) + (0x2 * (((uint64_t)x37 * x20) + ((uint64_t)x38 * x19)))); - uint64_t x57 = (((uint64_t)x39 * x20) + ((uint64_t)x38 * x21)); - uint64_t x58 = ((uint64_t)(0x2 * x38) * x20); - uint64_t x59 = (x48 + (x58 << 0x4)); - uint64_t x60 = (x59 + (x58 << 0x1)); - uint64_t x61 = (x60 + x58); - uint64_t x62 = (x47 + (x57 << 0x4)); - uint64_t x63 = (x62 + (x57 << 0x1)); - uint64_t x64 = (x63 + x57); - uint64_t x65 = (x46 + (x56 << 0x4)); - uint64_t x66 = (x65 + (x56 << 0x1)); - uint64_t x67 = (x66 + x56); - uint64_t x68 = (x45 + (x55 << 0x4)); - uint64_t x69 = (x68 + (x55 << 0x1)); - uint64_t x70 = (x69 + x55); - uint64_t x71 = (x44 + (x54 << 0x4)); - uint64_t x72 = (x71 + (x54 << 0x1)); - uint64_t x73 = (x72 + x54); - uint64_t x74 = (x43 + (x53 << 0x4)); - uint64_t x75 = (x74 + (x53 << 0x1)); - uint64_t x76 = (x75 + x53); - uint64_t x77 = (x42 + (x52 << 0x4)); - uint64_t x78 = (x77 + (x52 << 0x1)); - uint64_t x79 = (x78 + x52); - uint64_t x80 = (x41 + (x51 << 0x4)); - uint64_t x81 = (x80 + (x51 << 0x1)); - uint64_t x82 = (x81 + x51); - uint64_t x83 = (x40 + (x50 << 0x4)); - uint64_t x84 = (x83 + (x50 << 0x1)); - uint64_t x85 = (x84 + x50); - uint64_t x86 = (x85 >> 0x1a); - uint32_t x87 = ((uint32_t)x85 & 0x3ffffff); - uint64_t x88 = (x86 + x82); - uint64_t x89 = (x88 >> 0x19); - uint32_t x90 = ((uint32_t)x88 & 0x1ffffff); - uint64_t x91 = (x89 + x79); - uint64_t x92 = (x91 >> 0x1a); - uint32_t x93 = ((uint32_t)x91 & 0x3ffffff); - uint64_t x94 = (x92 + x76); - uint64_t x95 = (x94 >> 0x19); - uint32_t x96 = ((uint32_t)x94 & 0x1ffffff); - uint64_t x97 = (x95 + x73); - uint64_t x98 = (x97 >> 0x1a); - uint32_t x99 = ((uint32_t)x97 & 0x3ffffff); - uint64_t x100 = (x98 + x70); - uint64_t x101 = (x100 >> 0x19); - uint32_t x102 = ((uint32_t)x100 & 0x1ffffff); - uint64_t x103 = (x101 + x67); - uint64_t x104 = (x103 >> 0x1a); - uint32_t x105 = ((uint32_t)x103 & 0x3ffffff); - uint64_t x106 = (x104 + x64); - uint64_t x107 = (x106 >> 0x19); - uint32_t x108 = ((uint32_t)x106 & 0x1ffffff); - uint64_t x109 = (x107 + x61); - uint64_t x110 = (x109 >> 0x1a); - uint32_t x111 = ((uint32_t)x109 & 0x3ffffff); - uint64_t x112 = (x110 + x49); - uint64_t x113 = (x112 >> 0x19); - uint32_t x114 = ((uint32_t)x112 & 0x1ffffff); - uint64_t x115 = (x87 + (0x13 * x113)); - uint32_t x116 = (uint32_t) (x115 >> 0x1a); - uint32_t x117 = ((uint32_t)x115 & 0x3ffffff); - uint32_t x118 = (x116 + x90); - uint32_t x119 = (x118 >> 0x19); - uint32_t x120 = (x118 & 0x1ffffff); - out[0] = x117; - out[1] = x120; - out[2] = (x119 + x93); - out[3] = x96; - out[4] = x99; - out[5] = x102; - out[6] = x105; - out[7] = x108; - out[8] = x111; - out[9] = x114; -} - -static inline void fe_mul_ttt(fe *h, const fe *f, const fe *g) -{ - fe_mul_impl(h->v, f->v, g->v); -} - -static inline void fe_mul_tlt(fe *h, const fe_loose *f, const fe *g) -{ - fe_mul_impl(h->v, f->v, g->v); -} - -static inline void -fe_mul_tll(fe *h, const fe_loose *f, const fe_loose *g) -{ - fe_mul_impl(h->v, f->v, g->v); -} - -static void fe_sqr_impl(uint32_t out[10], const uint32_t in1[10]) -{ - const uint32_t x17 = in1[9]; - const uint32_t x18 = in1[8]; - const uint32_t x16 = in1[7]; - const uint32_t x14 = in1[6]; - const uint32_t x12 = in1[5]; - const uint32_t x10 = in1[4]; - const uint32_t x8 = in1[3]; - const uint32_t x6 = in1[2]; - const uint32_t x4 = in1[1]; - const uint32_t x2 = in1[0]; - uint64_t x19 = ((uint64_t)x2 * x2); - uint64_t x20 = ((uint64_t)(0x2 * x2) * x4); - uint64_t x21 = (0x2 * (((uint64_t)x4 * x4) + ((uint64_t)x2 * x6))); - uint64_t x22 = (0x2 * (((uint64_t)x4 * 
x6) + ((uint64_t)x2 * x8))); - uint64_t x23 = ((((uint64_t)x6 * x6) + ((uint64_t)(0x4 * x4) * x8)) + ((uint64_t)(0x2 * x2) * x10)); - uint64_t x24 = (0x2 * ((((uint64_t)x6 * x8) + ((uint64_t)x4 * x10)) + ((uint64_t)x2 * x12))); - uint64_t x25 = (0x2 * (((((uint64_t)x8 * x8) + ((uint64_t)x6 * x10)) + ((uint64_t)x2 * x14)) + ((uint64_t)(0x2 * x4) * x12))); - uint64_t x26 = (0x2 * (((((uint64_t)x8 * x10) + ((uint64_t)x6 * x12)) + ((uint64_t)x4 * x14)) + ((uint64_t)x2 * x16))); - uint64_t x27 = (((uint64_t)x10 * x10) + (0x2 * ((((uint64_t)x6 * x14) + ((uint64_t)x2 * x18)) + (0x2 * (((uint64_t)x4 * x16) + ((uint64_t)x8 * x12)))))); - uint64_t x28 = (0x2 * ((((((uint64_t)x10 * x12) + ((uint64_t)x8 * x14)) + ((uint64_t)x6 * x16)) + ((uint64_t)x4 * x18)) + ((uint64_t)x2 * x17))); - uint64_t x29 = (0x2 * (((((uint64_t)x12 * x12) + ((uint64_t)x10 * x14)) + ((uint64_t)x6 * x18)) + (0x2 * (((uint64_t)x8 * x16) + ((uint64_t)x4 * x17))))); - uint64_t x30 = (0x2 * (((((uint64_t)x12 * x14) + ((uint64_t)x10 * x16)) + ((uint64_t)x8 * x18)) + ((uint64_t)x6 * x17))); - uint64_t x31 = (((uint64_t)x14 * x14) + (0x2 * (((uint64_t)x10 * x18) + (0x2 * (((uint64_t)x12 * x16) + ((uint64_t)x8 * x17)))))); - uint64_t x32 = (0x2 * ((((uint64_t)x14 * x16) + ((uint64_t)x12 * x18)) + ((uint64_t)x10 * x17))); - uint64_t x33 = (0x2 * ((((uint64_t)x16 * x16) + ((uint64_t)x14 * x18)) + ((uint64_t)(0x2 * x12) * x17))); - uint64_t x34 = (0x2 * (((uint64_t)x16 * x18) + ((uint64_t)x14 * x17))); - uint64_t x35 = (((uint64_t)x18 * x18) + ((uint64_t)(0x4 * x16) * x17)); - uint64_t x36 = ((uint64_t)(0x2 * x18) * x17); - uint64_t x37 = ((uint64_t)(0x2 * x17) * x17); - uint64_t x38 = (x27 + (x37 << 0x4)); - uint64_t x39 = (x38 + (x37 << 0x1)); - uint64_t x40 = (x39 + x37); - uint64_t x41 = (x26 + (x36 << 0x4)); - uint64_t x42 = (x41 + (x36 << 0x1)); - uint64_t x43 = (x42 + x36); - uint64_t x44 = (x25 + (x35 << 0x4)); - uint64_t x45 = (x44 + (x35 << 0x1)); - uint64_t x46 = (x45 + x35); - uint64_t x47 = (x24 + (x34 << 0x4)); - uint64_t x48 = (x47 + (x34 << 0x1)); - uint64_t x49 = (x48 + x34); - uint64_t x50 = (x23 + (x33 << 0x4)); - uint64_t x51 = (x50 + (x33 << 0x1)); - uint64_t x52 = (x51 + x33); - uint64_t x53 = (x22 + (x32 << 0x4)); - uint64_t x54 = (x53 + (x32 << 0x1)); - uint64_t x55 = (x54 + x32); - uint64_t x56 = (x21 + (x31 << 0x4)); - uint64_t x57 = (x56 + (x31 << 0x1)); - uint64_t x58 = (x57 + x31); - uint64_t x59 = (x20 + (x30 << 0x4)); - uint64_t x60 = (x59 + (x30 << 0x1)); - uint64_t x61 = (x60 + x30); - uint64_t x62 = (x19 + (x29 << 0x4)); - uint64_t x63 = (x62 + (x29 << 0x1)); - uint64_t x64 = (x63 + x29); - uint64_t x65 = (x64 >> 0x1a); - uint32_t x66 = ((uint32_t)x64 & 0x3ffffff); - uint64_t x67 = (x65 + x61); - uint64_t x68 = (x67 >> 0x19); - uint32_t x69 = ((uint32_t)x67 & 0x1ffffff); - uint64_t x70 = (x68 + x58); - uint64_t x71 = (x70 >> 0x1a); - uint32_t x72 = ((uint32_t)x70 & 0x3ffffff); - uint64_t x73 = (x71 + x55); - uint64_t x74 = (x73 >> 0x19); - uint32_t x75 = ((uint32_t)x73 & 0x1ffffff); - uint64_t x76 = (x74 + x52); - uint64_t x77 = (x76 >> 0x1a); - uint32_t x78 = ((uint32_t)x76 & 0x3ffffff); - uint64_t x79 = (x77 + x49); - uint64_t x80 = (x79 >> 0x19); - uint32_t x81 = ((uint32_t)x79 & 0x1ffffff); - uint64_t x82 = (x80 + x46); - uint64_t x83 = (x82 >> 0x1a); - uint32_t x84 = ((uint32_t)x82 & 0x3ffffff); - uint64_t x85 = (x83 + x43); - uint64_t x86 = (x85 >> 0x19); - uint32_t x87 = ((uint32_t)x85 & 0x1ffffff); - uint64_t x88 = (x86 + x40); - uint64_t x89 = (x88 >> 0x1a); - uint32_t x90 = ((uint32_t)x88 & 
0x3ffffff); - uint64_t x91 = (x89 + x28); - uint64_t x92 = (x91 >> 0x19); - uint32_t x93 = ((uint32_t)x91 & 0x1ffffff); - uint64_t x94 = (x66 + (0x13 * x92)); - uint32_t x95 = (uint32_t) (x94 >> 0x1a); - uint32_t x96 = ((uint32_t)x94 & 0x3ffffff); - uint32_t x97 = (x95 + x69); - uint32_t x98 = (x97 >> 0x19); - uint32_t x99 = (x97 & 0x1ffffff); - out[0] = x96; - out[1] = x99; - out[2] = (x98 + x72); - out[3] = x75; - out[4] = x78; - out[5] = x81; - out[6] = x84; - out[7] = x87; - out[8] = x90; - out[9] = x93; -} - -static inline void fe_sq_tl(fe *h, const fe_loose *f) -{ - fe_sqr_impl(h->v, f->v); -} - -static inline void fe_sq_tt(fe *h, const fe *f) -{ - fe_sqr_impl(h->v, f->v); -} - -static inline void fe_loose_invert(fe *out, const fe_loose *z) -{ - fe t0; - fe t1; - fe t2; - fe t3; - int i; - - fe_sq_tl(&t0, z); - fe_sq_tt(&t1, &t0); - for (i = 1; i < 2; ++i) - fe_sq_tt(&t1, &t1); - fe_mul_tlt(&t1, z, &t1); - fe_mul_ttt(&t0, &t0, &t1); - fe_sq_tt(&t2, &t0); - fe_mul_ttt(&t1, &t1, &t2); - fe_sq_tt(&t2, &t1); - for (i = 1; i < 5; ++i) - fe_sq_tt(&t2, &t2); - fe_mul_ttt(&t1, &t2, &t1); - fe_sq_tt(&t2, &t1); - for (i = 1; i < 10; ++i) - fe_sq_tt(&t2, &t2); - fe_mul_ttt(&t2, &t2, &t1); - fe_sq_tt(&t3, &t2); - for (i = 1; i < 20; ++i) - fe_sq_tt(&t3, &t3); - fe_mul_ttt(&t2, &t3, &t2); - fe_sq_tt(&t2, &t2); - for (i = 1; i < 10; ++i) - fe_sq_tt(&t2, &t2); - fe_mul_ttt(&t1, &t2, &t1); - fe_sq_tt(&t2, &t1); - for (i = 1; i < 50; ++i) - fe_sq_tt(&t2, &t2); - fe_mul_ttt(&t2, &t2, &t1); - fe_sq_tt(&t3, &t2); - for (i = 1; i < 100; ++i) - fe_sq_tt(&t3, &t3); - fe_mul_ttt(&t2, &t3, &t2); - fe_sq_tt(&t2, &t2); - for (i = 1; i < 50; ++i) - fe_sq_tt(&t2, &t2); - fe_mul_ttt(&t1, &t2, &t1); - fe_sq_tt(&t1, &t1); - for (i = 1; i < 5; ++i) - fe_sq_tt(&t1, &t1); - fe_mul_ttt(out, &t1, &t0); -} - -static inline void fe_invert(fe *out, const fe *z) -{ - fe_loose l; - fe_copy_lt(&l, z); - fe_loose_invert(out, &l); -} - -/* Replace (f,g) with (g,f) if b == 1; - * replace (f,g) with (f,g) if b == 0. 
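- * The swap is branchless: b is stretched into an all-ones or all-zeros - * mask and applied with XOR, so secret data never selects a branch.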
- * - * Preconditions: b in {0,1} - */ -static inline void fe_cswap(fe *f, fe *g, unsigned int b) -{ - unsigned i; - b = 0 - b; - for (i = 0; i < 10; i++) { - uint32_t x = f->v[i] ^ g->v[i]; - x &= b; - f->v[i] ^= x; - g->v[i] ^= x; - } -} - -/* NOTE: based on fiat-crypto fe_mul, edited for in2=121666, 0, 0.*/ -static inline void fe_mul_121666_impl(uint32_t out[10], const uint32_t in1[10]) -{ - const uint32_t x20 = in1[9]; - const uint32_t x21 = in1[8]; - const uint32_t x19 = in1[7]; - const uint32_t x17 = in1[6]; - const uint32_t x15 = in1[5]; - const uint32_t x13 = in1[4]; - const uint32_t x11 = in1[3]; - const uint32_t x9 = in1[2]; - const uint32_t x7 = in1[1]; - const uint32_t x5 = in1[0]; - const uint32_t x38 = 0; - const uint32_t x39 = 0; - const uint32_t x37 = 0; - const uint32_t x35 = 0; - const uint32_t x33 = 0; - const uint32_t x31 = 0; - const uint32_t x29 = 0; - const uint32_t x27 = 0; - const uint32_t x25 = 0; - const uint32_t x23 = 121666; - uint64_t x40 = ((uint64_t)x23 * x5); - uint64_t x41 = (((uint64_t)x23 * x7) + ((uint64_t)x25 * x5)); - uint64_t x42 = ((((uint64_t)(0x2 * x25) * x7) + ((uint64_t)x23 * x9)) + ((uint64_t)x27 * x5)); - uint64_t x43 = (((((uint64_t)x25 * x9) + ((uint64_t)x27 * x7)) + ((uint64_t)x23 * x11)) + ((uint64_t)x29 * x5)); - uint64_t x44 = (((((uint64_t)x27 * x9) + (0x2 * (((uint64_t)x25 * x11) + ((uint64_t)x29 * x7)))) + ((uint64_t)x23 * x13)) + ((uint64_t)x31 * x5)); - uint64_t x45 = (((((((uint64_t)x27 * x11) + ((uint64_t)x29 * x9)) + ((uint64_t)x25 * x13)) + ((uint64_t)x31 * x7)) + ((uint64_t)x23 * x15)) + ((uint64_t)x33 * x5)); - uint64_t x46 = (((((0x2 * ((((uint64_t)x29 * x11) + ((uint64_t)x25 * x15)) + ((uint64_t)x33 * x7))) + ((uint64_t)x27 * x13)) + ((uint64_t)x31 * x9)) + ((uint64_t)x23 * x17)) + ((uint64_t)x35 * x5)); - uint64_t x47 = (((((((((uint64_t)x29 * x13) + ((uint64_t)x31 * x11)) + ((uint64_t)x27 * x15)) + ((uint64_t)x33 * x9)) + ((uint64_t)x25 * x17)) + ((uint64_t)x35 * x7)) + ((uint64_t)x23 * x19)) + ((uint64_t)x37 * x5)); - uint64_t x48 = (((((((uint64_t)x31 * x13) + (0x2 * (((((uint64_t)x29 * x15) + ((uint64_t)x33 * x11)) + ((uint64_t)x25 * x19)) + ((uint64_t)x37 * x7)))) + ((uint64_t)x27 * x17)) + ((uint64_t)x35 * x9)) + ((uint64_t)x23 * x21)) + ((uint64_t)x39 * x5)); - uint64_t x49 = (((((((((((uint64_t)x31 * x15) + ((uint64_t)x33 * x13)) + ((uint64_t)x29 * x17)) + ((uint64_t)x35 * x11)) + ((uint64_t)x27 * x19)) + ((uint64_t)x37 * x9)) + ((uint64_t)x25 * x21)) + ((uint64_t)x39 * x7)) + ((uint64_t)x23 * x20)) + ((uint64_t)x38 * x5)); - uint64_t x50 = (((((0x2 * ((((((uint64_t)x33 * x15) + ((uint64_t)x29 * x19)) + ((uint64_t)x37 * x11)) + ((uint64_t)x25 * x20)) + ((uint64_t)x38 * x7))) + ((uint64_t)x31 * x17)) + ((uint64_t)x35 * x13)) + ((uint64_t)x27 * x21)) + ((uint64_t)x39 * x9)); - uint64_t x51 = (((((((((uint64_t)x33 * x17) + ((uint64_t)x35 * x15)) + ((uint64_t)x31 * x19)) + ((uint64_t)x37 * x13)) + ((uint64_t)x29 * x21)) + ((uint64_t)x39 * x11)) + ((uint64_t)x27 * x20)) + ((uint64_t)x38 * x9)); - uint64_t x52 = (((((uint64_t)x35 * x17) + (0x2 * (((((uint64_t)x33 * x19) + ((uint64_t)x37 * x15)) + ((uint64_t)x29 * x20)) + ((uint64_t)x38 * x11)))) + ((uint64_t)x31 * x21)) + ((uint64_t)x39 * x13)); - uint64_t x53 = (((((((uint64_t)x35 * x19) + ((uint64_t)x37 * x17)) + ((uint64_t)x33 * x21)) + ((uint64_t)x39 * x15)) + ((uint64_t)x31 * x20)) + ((uint64_t)x38 * x13)); - uint64_t x54 = (((0x2 * ((((uint64_t)x37 * x19) + ((uint64_t)x33 * x20)) + ((uint64_t)x38 * x15))) + ((uint64_t)x35 * x21)) + ((uint64_t)x39 * x17)); - uint64_t 
x55 = (((((uint64_t)x37 * x21) + ((uint64_t)x39 * x19)) + ((uint64_t)x35 * x20)) + ((uint64_t)x38 * x17)); - uint64_t x56 = (((uint64_t)x39 * x21) + (0x2 * (((uint64_t)x37 * x20) + ((uint64_t)x38 * x19)))); - uint64_t x57 = (((uint64_t)x39 * x20) + ((uint64_t)x38 * x21)); - uint64_t x58 = ((uint64_t)(0x2 * x38) * x20); - uint64_t x59 = (x48 + (x58 << 0x4)); - uint64_t x60 = (x59 + (x58 << 0x1)); - uint64_t x61 = (x60 + x58); - uint64_t x62 = (x47 + (x57 << 0x4)); - uint64_t x63 = (x62 + (x57 << 0x1)); - uint64_t x64 = (x63 + x57); - uint64_t x65 = (x46 + (x56 << 0x4)); - uint64_t x66 = (x65 + (x56 << 0x1)); - uint64_t x67 = (x66 + x56); - uint64_t x68 = (x45 + (x55 << 0x4)); - uint64_t x69 = (x68 + (x55 << 0x1)); - uint64_t x70 = (x69 + x55); - uint64_t x71 = (x44 + (x54 << 0x4)); - uint64_t x72 = (x71 + (x54 << 0x1)); - uint64_t x73 = (x72 + x54); - uint64_t x74 = (x43 + (x53 << 0x4)); - uint64_t x75 = (x74 + (x53 << 0x1)); - uint64_t x76 = (x75 + x53); - uint64_t x77 = (x42 + (x52 << 0x4)); - uint64_t x78 = (x77 + (x52 << 0x1)); - uint64_t x79 = (x78 + x52); - uint64_t x80 = (x41 + (x51 << 0x4)); - uint64_t x81 = (x80 + (x51 << 0x1)); - uint64_t x82 = (x81 + x51); - uint64_t x83 = (x40 + (x50 << 0x4)); - uint64_t x84 = (x83 + (x50 << 0x1)); - uint64_t x85 = (x84 + x50); - uint64_t x86 = (x85 >> 0x1a); - uint32_t x87 = ((uint32_t)x85 & 0x3ffffff); - uint64_t x88 = (x86 + x82); - uint64_t x89 = (x88 >> 0x19); - uint32_t x90 = ((uint32_t)x88 & 0x1ffffff); - uint64_t x91 = (x89 + x79); - uint64_t x92 = (x91 >> 0x1a); - uint32_t x93 = ((uint32_t)x91 & 0x3ffffff); - uint64_t x94 = (x92 + x76); - uint64_t x95 = (x94 >> 0x19); - uint32_t x96 = ((uint32_t)x94 & 0x1ffffff); - uint64_t x97 = (x95 + x73); - uint64_t x98 = (x97 >> 0x1a); - uint32_t x99 = ((uint32_t)x97 & 0x3ffffff); - uint64_t x100 = (x98 + x70); - uint64_t x101 = (x100 >> 0x19); - uint32_t x102 = ((uint32_t)x100 & 0x1ffffff); - uint64_t x103 = (x101 + x67); - uint64_t x104 = (x103 >> 0x1a); - uint32_t x105 = ((uint32_t)x103 & 0x3ffffff); - uint64_t x106 = (x104 + x64); - uint64_t x107 = (x106 >> 0x19); - uint32_t x108 = ((uint32_t)x106 & 0x1ffffff); - uint64_t x109 = (x107 + x61); - uint64_t x110 = (x109 >> 0x1a); - uint32_t x111 = ((uint32_t)x109 & 0x3ffffff); - uint64_t x112 = (x110 + x49); - uint64_t x113 = (x112 >> 0x19); - uint32_t x114 = ((uint32_t)x112 & 0x1ffffff); - uint64_t x115 = (x87 + (0x13 * x113)); - uint32_t x116 = (uint32_t) (x115 >> 0x1a); - uint32_t x117 = ((uint32_t)x115 & 0x3ffffff); - uint32_t x118 = (x116 + x90); - uint32_t x119 = (x118 >> 0x19); - uint32_t x120 = (x118 & 0x1ffffff); - out[0] = x117; - out[1] = x120; - out[2] = (x119 + x93); - out[3] = x96; - out[4] = x99; - out[5] = x102; - out[6] = x105; - out[7] = x108; - out[8] = x111; - out[9] = x114; -} - -static inline void fe_mul121666(fe *h, const fe_loose *f) -{ - fe_mul_121666_impl(h->v, f->v); -} - -static const uint8_t curve25519_null_point[CURVE25519_KEY_SIZE]; - -bool curve25519(uint8_t out[CURVE25519_KEY_SIZE], - const uint8_t scalar[CURVE25519_KEY_SIZE], - const uint8_t point[CURVE25519_KEY_SIZE]) -{ - fe x1, x2, z2, x3, z3; - fe_loose x2l, z2l, x3l; - unsigned swap = 0; - int pos; - uint8_t e[32]; - - memcpy(e, scalar, 32); - curve25519_clamp_secret(e); - - /* The following implementation was transcribed to Coq and proven to - * correspond to unary scalar multiplication in affine coordinates given - * that x1 != 0 is the x coordinate of some point on the curve. 
It was - * also checked in Coq that doing a ladderstep with x1 = x3 = 0 gives - * z2' = z3' = 0, and z2 = z3 = 0 gives z2' = z3' = 0. The statement was - * quantified over the underlying field, so it applies to Curve25519 - * itself and the quadratic twist of Curve25519. It was not proven in - * Coq that prime-field arithmetic correctly simulates extension-field - * arithmetic on prime-field values. The decoding of the byte array - * representation of e was not considered. - * - * Specification of Montgomery curves in affine coordinates: - * - * - * Proof that these form a group that is isomorphic to a Weierstrass - * curve: - * - * - * Coq transcription and correctness proof of the loop - * (where scalarbits=255): - * - * - * preconditions: 0 <= e < 2^255 (not necessarily e < order), - * fe_invert(0) = 0 - */ - fe_frombytes(&x1, point); - fe_1(&x2); - fe_0(&z2); - fe_copy(&x3, &x1); - fe_1(&z3); - - for (pos = 254; pos >= 0; --pos) { - fe tmp0, tmp1; - fe_loose tmp0l, tmp1l; - /* loop invariant as of right before the test, for the case - * where x1 != 0: - * pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3 - * is nonzero - * let r := e >> (pos+1) in the following equalities of - * projective points: - * to_xz (r*P) === if swap then (x3, z3) else (x2, z2) - * to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3) - * x1 is the nonzero x coordinate of the nonzero - * point (r*P-(r+1)*P) - */ - unsigned b = 1 & (e[pos / 8] >> (pos & 7)); - swap ^= b; - fe_cswap(&x2, &x3, swap); - fe_cswap(&z2, &z3, swap); - swap = b; - /* Coq transcription of ladderstep formula (called from - * transcribed loop): - * - * - * x1 != 0 - * x1 = 0 - */ - fe_sub(&tmp0l, &x3, &z3); - fe_sub(&tmp1l, &x2, &z2); - fe_add(&x2l, &x2, &z2); - fe_add(&z2l, &x3, &z3); - fe_mul_tll(&z3, &tmp0l, &x2l); - fe_mul_tll(&z2, &z2l, &tmp1l); - fe_sq_tl(&tmp0, &tmp1l); - fe_sq_tl(&tmp1, &x2l); - fe_add(&x3l, &z3, &z2); - fe_sub(&z2l, &z3, &z2); - fe_mul_ttt(&x2, &tmp1, &tmp0); - fe_sub(&tmp1l, &tmp1, &tmp0); - fe_sq_tl(&z2, &z2l); - fe_mul121666(&z3, &tmp1l); - fe_sq_tl(&x3, &x3l); - fe_add(&tmp0l, &tmp0, &z3); - fe_mul_ttt(&z3, &x1, &z2); - fe_mul_tll(&z2, &tmp1l, &tmp0l); - } - /* here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3) - * else (x2, z2) - */ - fe_cswap(&x2, &x3, swap); - fe_cswap(&z2, &z3, swap); - - fe_invert(&z2, &z2); - fe_mul_ttt(&x2, &x2, &z2); - fe_tobytes(out, &x2); - - explicit_bzero(&x1, sizeof(x1)); - explicit_bzero(&x2, sizeof(x2)); - explicit_bzero(&z2, sizeof(z2)); - explicit_bzero(&x3, sizeof(x3)); - explicit_bzero(&z3, sizeof(z3)); - explicit_bzero(&x2l, sizeof(x2l)); - explicit_bzero(&z2l, sizeof(z2l)); - explicit_bzero(&x3l, sizeof(x3l)); - explicit_bzero(&e, sizeof(e)); - - return timingsafe_bcmp(out, curve25519_null_point, CURVE25519_KEY_SIZE) != 0; -} diff --git a/sys/dev/if_wg/crypto.h b/sys/dev/if_wg/crypto.h deleted file mode 100644 index 6e045c2fe0bf..000000000000 --- a/sys/dev/if_wg/crypto.h +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (C) 2015-2021 Jason A. Donenfeld . All Rights Reserved. - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#ifndef _WG_CRYPTO -#define _WG_CRYPTO - -#include - -enum chacha20poly1305_lengths { - XCHACHA20POLY1305_NONCE_SIZE = 24, - CHACHA20POLY1305_KEY_SIZE = 32, - CHACHA20POLY1305_AUTHTAG_SIZE = 16 -}; - -void -chacha20poly1305_encrypt(uint8_t *dst, const uint8_t *src, const size_t src_len, - const uint8_t *ad, const size_t ad_len, - const uint64_t nonce, - const uint8_t key[CHACHA20POLY1305_KEY_SIZE]); - -bool -chacha20poly1305_decrypt(uint8_t *dst, const uint8_t *src, const size_t src_len, - const uint8_t *ad, const size_t ad_len, - const uint64_t nonce, - const uint8_t key[CHACHA20POLY1305_KEY_SIZE]); - -void -xchacha20poly1305_encrypt(uint8_t *dst, const uint8_t *src, - const size_t src_len, const uint8_t *ad, - const size_t ad_len, - const uint8_t nonce[XCHACHA20POLY1305_NONCE_SIZE], - const uint8_t key[CHACHA20POLY1305_KEY_SIZE]); - -bool -xchacha20poly1305_decrypt(uint8_t *dst, const uint8_t *src, - const size_t src_len, const uint8_t *ad, - const size_t ad_len, - const uint8_t nonce[XCHACHA20POLY1305_NONCE_SIZE], - const uint8_t key[CHACHA20POLY1305_KEY_SIZE]); - - -enum blake2s_lengths { - BLAKE2S_BLOCK_SIZE = 64, - BLAKE2S_HASH_SIZE = 32, - BLAKE2S_KEY_SIZE = 32 -}; - -struct blake2s_state { - uint32_t h[8]; - uint32_t t[2]; - uint32_t f[2]; - uint8_t buf[BLAKE2S_BLOCK_SIZE]; - unsigned int buflen; - unsigned int outlen; -}; - -void blake2s_init(struct blake2s_state *state, const size_t outlen); - -void blake2s_init_key(struct blake2s_state *state, const size_t outlen, - const uint8_t *key, const size_t keylen); - -void blake2s_update(struct blake2s_state *state, const uint8_t *in, size_t inlen); - -void blake2s_final(struct blake2s_state *state, uint8_t *out); - -void blake2s(uint8_t *out, const uint8_t *in, const uint8_t *key, - const size_t outlen, const size_t inlen, const size_t keylen); - -void blake2s_hmac(uint8_t *out, const uint8_t *in, const uint8_t *key, - const size_t outlen, const size_t inlen, const size_t keylen); - -enum curve25519_lengths { - CURVE25519_KEY_SIZE = 32 -}; - -bool curve25519(uint8_t mypublic[static CURVE25519_KEY_SIZE], - const uint8_t secret[static CURVE25519_KEY_SIZE], - const uint8_t basepoint[static CURVE25519_KEY_SIZE]); - -static inline bool -curve25519_generate_public(uint8_t pub[static CURVE25519_KEY_SIZE], - const uint8_t secret[static CURVE25519_KEY_SIZE]) -{ - static const uint8_t basepoint[CURVE25519_KEY_SIZE] = { 9 }; - - return curve25519(pub, secret, basepoint); -} - -static inline void curve25519_clamp_secret(uint8_t secret[static CURVE25519_KEY_SIZE]) -{ - secret[0] &= 248; - secret[31] = (secret[31] & 127) | 64; -} - -static inline void curve25519_generate_secret(uint8_t secret[CURVE25519_KEY_SIZE]) -{ - arc4random_buf(secret, CURVE25519_KEY_SIZE); - curve25519_clamp_secret(secret); -} - -#endif diff --git a/sys/dev/if_wg/if_wg.c b/sys/dev/if_wg/if_wg.c deleted file mode 100644 index 8c11cc58a3bb..000000000000 --- a/sys/dev/if_wg/if_wg.c +++ /dev/null @@ -1,3462 +0,0 @@ -/* - * Copyright (C) 2015-2021 Jason A. Donenfeld . All Rights Reserved. 
- * Copyright (C) 2019-2021 Matt Dunwoodie - * Copyright (c) 2019-2020 Rubicon Communications, LLC (Netgate) - * Copyright (c) 2021 Kyle Evans - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -/* TODO audit imports */ -#include "opt_inet.h" -#include "opt_inet6.h" - -#include -__FBSDID("$FreeBSD$"); - -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "support.h" -#include "wg_noise.h" -#include "wg_cookie.h" -#include "if_wg.h" - -/* It'd be nice to use IF_MAXMTU, but that means more complicated mbuf allocations, - * so instead just do the biggest mbuf we can easily allocate minus the usual maximum - * IPv6 overhead of 80 bytes. If somebody wants bigger frames, we can revisit this. */ -#define MAX_MTU (MJUM16BYTES - 80) - -#define DEFAULT_MTU 1420 - -#define MAX_STAGED_PKT 128 -#define MAX_QUEUED_PKT 1024 -#define MAX_QUEUED_PKT_MASK (MAX_QUEUED_PKT - 1) - -#define MAX_QUEUED_HANDSHAKES 4096 - -#define HASHTABLE_PEER_SIZE (1 << 11) -#define HASHTABLE_INDEX_SIZE (1 << 13) -#define MAX_PEERS_PER_IFACE (1 << 20) - -#define REKEY_TIMEOUT 5 -#define REKEY_TIMEOUT_JITTER 334 /* 1/3 sec, round for arc4random_uniform */ -#define KEEPALIVE_TIMEOUT 10 -#define MAX_TIMER_HANDSHAKES (90 / REKEY_TIMEOUT) -#define NEW_HANDSHAKE_TIMEOUT (REKEY_TIMEOUT + KEEPALIVE_TIMEOUT) -#define UNDERLOAD_TIMEOUT 1 - -#define DPRINTF(sc, ...) 
if (wireguard_debug) if_printf(sc->sc_ifp, ##__VA_ARGS__) - -/* First byte indicating packet type on the wire */ -#define WG_PKT_INITIATION htole32(1) -#define WG_PKT_RESPONSE htole32(2) -#define WG_PKT_COOKIE htole32(3) -#define WG_PKT_DATA htole32(4) - -#define WG_PKT_WITH_PADDING(n) (((n) + (16-1)) & (~(16-1))) -#define WG_KEY_SIZE 32 - -struct wg_pkt_initiation { - uint32_t t; - uint32_t s_idx; - uint8_t ue[NOISE_PUBLIC_KEY_LEN]; - uint8_t es[NOISE_PUBLIC_KEY_LEN + NOISE_AUTHTAG_LEN]; - uint8_t ets[NOISE_TIMESTAMP_LEN + NOISE_AUTHTAG_LEN]; - struct cookie_macs m; -}; - -struct wg_pkt_response { - uint32_t t; - uint32_t s_idx; - uint32_t r_idx; - uint8_t ue[NOISE_PUBLIC_KEY_LEN]; - uint8_t en[0 + NOISE_AUTHTAG_LEN]; - struct cookie_macs m; -}; - -struct wg_pkt_cookie { - uint32_t t; - uint32_t r_idx; - uint8_t nonce[COOKIE_NONCE_SIZE]; - uint8_t ec[COOKIE_ENCRYPTED_SIZE]; -}; - -struct wg_pkt_data { - uint32_t t; - uint32_t r_idx; - uint8_t nonce[sizeof(uint64_t)]; - uint8_t buf[]; -}; - -struct wg_endpoint { - union { - struct sockaddr r_sa; - struct sockaddr_in r_sin; -#ifdef INET6 - struct sockaddr_in6 r_sin6; -#endif - } e_remote; - union { - struct in_addr l_in; -#ifdef INET6 - struct in6_pktinfo l_pktinfo6; -#define l_in6 l_pktinfo6.ipi6_addr -#endif - } e_local; -}; - -struct wg_tag { - struct m_tag t_tag; - struct wg_endpoint t_endpoint; - struct wg_peer *t_peer; - struct mbuf *t_mbuf; - int t_done; - int t_mtu; -}; - -struct wg_index { - LIST_ENTRY(wg_index) i_entry; - SLIST_ENTRY(wg_index) i_unused_entry; - uint32_t i_key; - struct noise_remote *i_value; -}; - -struct wg_timers { - /* t_lock is for blocking wg_timers_event_* when setting t_disabled. */ - struct rwlock t_lock; - - int t_disabled; - int t_need_another_keepalive; - uint16_t t_persistent_keepalive_interval; - struct callout t_new_handshake; - struct callout t_send_keepalive; - struct callout t_retry_handshake; - struct callout t_zero_key_material; - struct callout t_persistent_keepalive; - - struct mtx t_handshake_mtx; - struct timespec t_handshake_last_sent; - struct timespec t_handshake_complete; - volatile int t_handshake_retries; -}; - -struct wg_aip { - struct radix_node r_nodes[2]; - CK_LIST_ENTRY(wg_aip) r_entry; - struct sockaddr_storage r_addr; - struct sockaddr_storage r_mask; - struct wg_peer *r_peer; -}; - -struct wg_queue { - struct mtx q_mtx; - struct mbufq q; -}; - -struct wg_peer { - CK_LIST_ENTRY(wg_peer) p_hash_entry; - CK_LIST_ENTRY(wg_peer) p_entry; - uint64_t p_id; - struct wg_softc *p_sc; - - struct noise_remote p_remote; - struct cookie_maker p_cookie; - struct wg_timers p_timers; - - struct rwlock p_endpoint_lock; - struct wg_endpoint p_endpoint; - - SLIST_HEAD(,wg_index) p_unused_index; - struct wg_index p_index[3]; - - struct wg_queue p_stage_queue; - struct wg_queue p_encap_queue; - struct wg_queue p_decap_queue; - - struct grouptask p_clear_secrets; - struct grouptask p_send_initiation; - struct grouptask p_send_keepalive; - struct grouptask p_send; - struct grouptask p_recv; - - counter_u64_t p_tx_bytes; - counter_u64_t p_rx_bytes; - - CK_LIST_HEAD(, wg_aip) p_aips; - struct mtx p_lock; - struct epoch_context p_ctx; -}; - -enum route_direction { - /* TODO OpenBSD doesn't use IN/OUT, instead passes the address buffer - * directly to route_lookup. 
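WG_PKT_WITH_PADDING above rounds a payload length up to the next multiple of 16, which works with a single mask only because 16 is a power of two: adding 15 carries into the next block unless the length is already aligned, and the mask then clears the low four bits. A quick check of the edge cases, with the macro copied from the definition above:

#include <assert.h>

#define WG_PKT_WITH_PADDING(n) (((n) + (16-1)) & (~(16-1)))

int
main(void)
{
	assert(WG_PKT_WITH_PADDING(0) == 0);	/* keepalives stay empty */
	assert(WG_PKT_WITH_PADDING(1) == 16);
	assert(WG_PKT_WITH_PADDING(16) == 16);
	assert(WG_PKT_WITH_PADDING(1420) == 1424);
	return (0);
}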
*/ - IN, - OUT, -}; - -struct wg_aip_table { - size_t t_count; - struct radix_node_head *t_ip; - struct radix_node_head *t_ip6; -}; - -struct wg_allowedip { - uint16_t family; - union { - struct in_addr ip4; - struct in6_addr ip6; - }; - uint8_t cidr; -}; - -struct wg_hashtable { - struct mtx h_mtx; - SIPHASH_KEY h_secret; - CK_LIST_HEAD(, wg_peer) h_peers_list; - CK_LIST_HEAD(, wg_peer) *h_peers; - u_long h_peers_mask; - size_t h_num_peers; -}; - -struct wg_socket { - struct mtx so_mtx; - struct socket *so_so4; - struct socket *so_so6; - uint32_t so_user_cookie; - in_port_t so_port; -}; - -struct wg_softc { - LIST_ENTRY(wg_softc) sc_entry; - struct ifnet *sc_ifp; - int sc_flags; - - struct ucred *sc_ucred; - struct wg_socket sc_socket; - struct wg_hashtable sc_hashtable; - struct wg_aip_table sc_aips; - - struct mbufq sc_handshake_queue; - struct grouptask sc_handshake; - - struct noise_local sc_local; - struct cookie_checker sc_cookie; - - struct buf_ring *sc_encap_ring; - struct buf_ring *sc_decap_ring; - - struct grouptask *sc_encrypt; - struct grouptask *sc_decrypt; - - struct rwlock sc_index_lock; - LIST_HEAD(,wg_index) *sc_index; - u_long sc_index_mask; - - struct sx sc_lock; - volatile u_int sc_peer_count; -}; - -#define WGF_DYING 0x0001 - -/* TODO the following defines are freebsd specific, we should see what is - * necessary and cleanup from there (i suspect a lot can be junked). */ - -#ifndef ENOKEY -#define ENOKEY ENOTCAPABLE -#endif - -#if __FreeBSD_version > 1300000 -typedef void timeout_t (void *); -#endif - -#define GROUPTASK_DRAIN(gtask) \ - gtaskqueue_drain((gtask)->gt_taskqueue, &(gtask)->gt_task) - -#define MTAG_WIREGUARD 0xBEAD -#define M_ENQUEUED M_PROTO1 - -static int clone_count; -static uma_zone_t ratelimit_zone; -static int wireguard_debug; -static volatile unsigned long peer_counter = 0; -static const char wgname[] = "wg"; -static unsigned wg_osd_jail_slot; - -static struct sx wg_sx; -SX_SYSINIT(wg_sx, &wg_sx, "wg_sx"); - -static LIST_HEAD(, wg_softc) wg_list = LIST_HEAD_INITIALIZER(wg_list); - -SYSCTL_NODE(_net, OID_AUTO, wg, CTLFLAG_RW, 0, "WireGuard"); -SYSCTL_INT(_net_wg, OID_AUTO, debug, CTLFLAG_RWTUN, &wireguard_debug, 0, - "enable debug logging"); - -TASKQGROUP_DECLARE(if_io_tqg); - -MALLOC_DEFINE(M_WG, "WG", "wireguard"); -VNET_DEFINE_STATIC(struct if_clone *, wg_cloner); - - -#define V_wg_cloner VNET(wg_cloner) -#define WG_CAPS IFCAP_LINKSTATE -#define ph_family PH_loc.eight[5] - -struct wg_timespec64 { - uint64_t tv_sec; - uint64_t tv_nsec; -}; - -struct wg_peer_export { - struct sockaddr_storage endpoint; - struct timespec last_handshake; - uint8_t public_key[WG_KEY_SIZE]; - uint8_t preshared_key[NOISE_SYMMETRIC_KEY_LEN]; - size_t endpoint_sz; - struct wg_allowedip *aip; - uint64_t rx_bytes; - uint64_t tx_bytes; - int aip_count; - uint16_t persistent_keepalive; -}; - -static struct wg_tag *wg_tag_get(struct mbuf *); -static struct wg_endpoint *wg_mbuf_endpoint_get(struct mbuf *); -static int wg_socket_init(struct wg_softc *, in_port_t); -static int wg_socket_bind(struct socket *, struct socket *, in_port_t *); -static void wg_socket_set(struct wg_softc *, struct socket *, struct socket *); -static void wg_socket_uninit(struct wg_softc *); -static void wg_socket_set_cookie(struct wg_softc *, uint32_t); -static int wg_send(struct wg_softc *, struct wg_endpoint *, struct mbuf *); -static void wg_timers_event_data_sent(struct wg_timers *); -static void wg_timers_event_data_received(struct wg_timers *); -static void 
wg_timers_event_any_authenticated_packet_sent(struct wg_timers *); -static void wg_timers_event_any_authenticated_packet_received(struct wg_timers *); -static void wg_timers_event_any_authenticated_packet_traversal(struct wg_timers *); -static void wg_timers_event_handshake_initiated(struct wg_timers *); -static void wg_timers_event_handshake_responded(struct wg_timers *); -static void wg_timers_event_handshake_complete(struct wg_timers *); -static void wg_timers_event_session_derived(struct wg_timers *); -static void wg_timers_event_want_initiation(struct wg_timers *); -static void wg_timers_event_reset_handshake_last_sent(struct wg_timers *); -static void wg_timers_run_send_initiation(struct wg_timers *, int); -static void wg_timers_run_retry_handshake(struct wg_timers *); -static void wg_timers_run_send_keepalive(struct wg_timers *); -static void wg_timers_run_new_handshake(struct wg_timers *); -static void wg_timers_run_zero_key_material(struct wg_timers *); -static void wg_timers_run_persistent_keepalive(struct wg_timers *); -static void wg_timers_init(struct wg_timers *); -static void wg_timers_enable(struct wg_timers *); -static void wg_timers_disable(struct wg_timers *); -static void wg_timers_set_persistent_keepalive(struct wg_timers *, uint16_t); -static void wg_timers_get_last_handshake(struct wg_timers *, struct timespec *); -static int wg_timers_expired_handshake_last_sent(struct wg_timers *); -static int wg_timers_check_handshake_last_sent(struct wg_timers *); -static void wg_queue_init(struct wg_queue *, const char *); -static void wg_queue_deinit(struct wg_queue *); -static void wg_queue_purge(struct wg_queue *); -static struct mbuf *wg_queue_dequeue(struct wg_queue *, struct wg_tag **); -static int wg_queue_len(struct wg_queue *); -static int wg_queue_in(struct wg_peer *, struct mbuf *); -static void wg_queue_out(struct wg_peer *); -static void wg_queue_stage(struct wg_peer *, struct mbuf *); -static int wg_aip_init(struct wg_aip_table *); -static void wg_aip_destroy(struct wg_aip_table *); -static void wg_aip_populate_aip4(struct wg_aip *, const struct in_addr *, uint8_t); -static void wg_aip_populate_aip6(struct wg_aip *, const struct in6_addr *, uint8_t); -static int wg_aip_add(struct wg_aip_table *, struct wg_peer *, const struct wg_allowedip *); -static int wg_peer_remove(struct radix_node *, void *); -static void wg_peer_remove_all(struct wg_softc *); -static int wg_aip_delete(struct wg_aip_table *, struct wg_peer *); -static struct wg_peer *wg_aip_lookup(struct wg_aip_table *, struct mbuf *, enum route_direction); -static void wg_hashtable_init(struct wg_hashtable *); -static void wg_hashtable_destroy(struct wg_hashtable *); -static void wg_hashtable_peer_insert(struct wg_hashtable *, struct wg_peer *); -static struct wg_peer *wg_peer_lookup(struct wg_softc *, const uint8_t [32]); -static void wg_hashtable_peer_remove(struct wg_hashtable *, struct wg_peer *); -static int wg_cookie_validate_packet(struct cookie_checker *, struct mbuf *, int); -static struct wg_peer *wg_peer_alloc(struct wg_softc *); -static void wg_peer_free_deferred(epoch_context_t); -static void wg_peer_destroy(struct wg_peer *); -static void wg_peer_send_buf(struct wg_peer *, uint8_t *, size_t); -static void wg_send_initiation(struct wg_peer *); -static void wg_send_response(struct wg_peer *); -static void wg_send_cookie(struct wg_softc *, struct cookie_macs *, uint32_t, struct mbuf *); -static void wg_peer_set_endpoint_from_tag(struct wg_peer *, struct wg_tag *); -static void 
wg_peer_clear_src(struct wg_peer *); -static void wg_peer_get_endpoint(struct wg_peer *, struct wg_endpoint *); -static void wg_deliver_out(struct wg_peer *); -static void wg_deliver_in(struct wg_peer *); -static void wg_send_buf(struct wg_softc *, struct wg_endpoint *, uint8_t *, size_t); -static void wg_send_keepalive(struct wg_peer *); -static void wg_handshake(struct wg_softc *, struct mbuf *); -static void wg_encap(struct wg_softc *, struct mbuf *); -static void wg_decap(struct wg_softc *, struct mbuf *); -static void wg_softc_handshake_receive(struct wg_softc *); -static void wg_softc_decrypt(struct wg_softc *); -static void wg_softc_encrypt(struct wg_softc *); -static struct noise_remote *wg_remote_get(struct wg_softc *, uint8_t [NOISE_PUBLIC_KEY_LEN]); -static uint32_t wg_index_set(struct wg_softc *, struct noise_remote *); -static struct noise_remote *wg_index_get(struct wg_softc *, uint32_t); -static void wg_index_drop(struct wg_softc *, uint32_t); -static int wg_update_endpoint_addrs(struct wg_endpoint *, const struct sockaddr *, struct ifnet *); -static void wg_input(struct mbuf *, int, struct inpcb *, const struct sockaddr *, void *); -static void wg_encrypt_dispatch(struct wg_softc *); -static void wg_decrypt_dispatch(struct wg_softc *); -static void crypto_taskq_setup(struct wg_softc *); -static void crypto_taskq_destroy(struct wg_softc *); -static int wg_clone_create(struct if_clone *, int, caddr_t); -static void wg_qflush(struct ifnet *); -static int wg_transmit(struct ifnet *, struct mbuf *); -static int wg_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); -static void wg_clone_destroy(struct ifnet *); -static int wg_peer_to_export(struct wg_peer *, struct wg_peer_export *); -static bool wgc_privileged(struct wg_softc *); -static int wgc_get(struct wg_softc *, struct wg_data_io *); -static int wgc_set(struct wg_softc *, struct wg_data_io *); -static int wg_up(struct wg_softc *); -static void wg_down(struct wg_softc *); -static void wg_reassign(struct ifnet *, struct vnet *, char *unused); -static void wg_init(void *); -static int wg_ioctl(struct ifnet *, u_long, caddr_t); -static void vnet_wg_init(const void *); -static void vnet_wg_uninit(const void *); -static void wg_module_init(void); -static void wg_module_deinit(void); - -/* TODO Peer */ -static struct wg_peer * -wg_peer_alloc(struct wg_softc *sc) -{ - struct wg_peer *peer; - - sx_assert(&sc->sc_lock, SX_XLOCKED); - - peer = malloc(sizeof(*peer), M_WG, M_WAITOK|M_ZERO); - peer->p_sc = sc; - peer->p_id = peer_counter++; - CK_LIST_INIT(&peer->p_aips); - - rw_init(&peer->p_endpoint_lock, "wg_peer_endpoint"); - wg_queue_init(&peer->p_stage_queue, "stageq"); - wg_queue_init(&peer->p_encap_queue, "txq"); - wg_queue_init(&peer->p_decap_queue, "rxq"); - - GROUPTASK_INIT(&peer->p_send_initiation, 0, (gtask_fn_t *)wg_send_initiation, peer); - taskqgroup_attach(qgroup_if_io_tqg, &peer->p_send_initiation, peer, NULL, NULL, "wg initiation"); - GROUPTASK_INIT(&peer->p_send_keepalive, 0, (gtask_fn_t *)wg_send_keepalive, peer); - taskqgroup_attach(qgroup_if_io_tqg, &peer->p_send_keepalive, peer, NULL, NULL, "wg keepalive"); - GROUPTASK_INIT(&peer->p_clear_secrets, 0, (gtask_fn_t *)noise_remote_clear, &peer->p_remote); - taskqgroup_attach(qgroup_if_io_tqg, &peer->p_clear_secrets, - &peer->p_remote, NULL, NULL, "wg clear secrets"); - - GROUPTASK_INIT(&peer->p_send, 0, (gtask_fn_t *)wg_deliver_out, peer); - taskqgroup_attach(qgroup_if_io_tqg, &peer->p_send, peer, NULL, NULL, "wg send"); - 
GROUPTASK_INIT(&peer->p_recv, 0, (gtask_fn_t *)wg_deliver_in, peer); - taskqgroup_attach(qgroup_if_io_tqg, &peer->p_recv, peer, NULL, NULL, "wg recv"); - - wg_timers_init(&peer->p_timers); - - peer->p_tx_bytes = counter_u64_alloc(M_WAITOK); - peer->p_rx_bytes = counter_u64_alloc(M_WAITOK); - - SLIST_INIT(&peer->p_unused_index); - SLIST_INSERT_HEAD(&peer->p_unused_index, &peer->p_index[0], - i_unused_entry); - SLIST_INSERT_HEAD(&peer->p_unused_index, &peer->p_index[1], - i_unused_entry); - SLIST_INSERT_HEAD(&peer->p_unused_index, &peer->p_index[2], - i_unused_entry); - - return (peer); -} - -#define WG_HASHTABLE_PEER_FOREACH(peer, i, ht) \ - for (i = 0; i < HASHTABLE_PEER_SIZE; i++) \ - LIST_FOREACH(peer, &(ht)->h_peers[i], p_hash_entry) -#define WG_HASHTABLE_PEER_FOREACH_SAFE(peer, i, ht, tpeer) \ - for (i = 0; i < HASHTABLE_PEER_SIZE; i++) \ - CK_LIST_FOREACH_SAFE(peer, &(ht)->h_peers[i], p_hash_entry, tpeer) -static void -wg_hashtable_init(struct wg_hashtable *ht) -{ - mtx_init(&ht->h_mtx, "hash lock", NULL, MTX_DEF); - arc4random_buf(&ht->h_secret, sizeof(ht->h_secret)); - ht->h_num_peers = 0; - ht->h_peers = hashinit(HASHTABLE_PEER_SIZE, M_DEVBUF, - &ht->h_peers_mask); -} - -static void -wg_hashtable_destroy(struct wg_hashtable *ht) -{ - MPASS(ht->h_num_peers == 0); - mtx_destroy(&ht->h_mtx); - hashdestroy(ht->h_peers, M_DEVBUF, ht->h_peers_mask); -} - -static void -wg_hashtable_peer_insert(struct wg_hashtable *ht, struct wg_peer *peer) -{ - uint64_t key; - - key = siphash24(&ht->h_secret, peer->p_remote.r_public, - sizeof(peer->p_remote.r_public)); - - mtx_lock(&ht->h_mtx); - ht->h_num_peers++; - CK_LIST_INSERT_HEAD(&ht->h_peers[key & ht->h_peers_mask], peer, p_hash_entry); - CK_LIST_INSERT_HEAD(&ht->h_peers_list, peer, p_entry); - mtx_unlock(&ht->h_mtx); -} - -static struct wg_peer * -wg_peer_lookup(struct wg_softc *sc, - const uint8_t pubkey[WG_KEY_SIZE]) -{ - struct wg_hashtable *ht = &sc->sc_hashtable; - uint64_t key; - struct wg_peer *i = NULL; - - key = siphash24(&ht->h_secret, pubkey, WG_KEY_SIZE); - - mtx_lock(&ht->h_mtx); - CK_LIST_FOREACH(i, &ht->h_peers[key & ht->h_peers_mask], p_hash_entry) { - if (timingsafe_bcmp(i->p_remote.r_public, pubkey, - WG_KEY_SIZE) == 0) - break; - } - mtx_unlock(&ht->h_mtx); - - return i; -} - -static void -wg_hashtable_peer_remove(struct wg_hashtable *ht, struct wg_peer *peer) -{ - mtx_lock(&ht->h_mtx); - ht->h_num_peers--; - CK_LIST_REMOVE(peer, p_hash_entry); - CK_LIST_REMOVE(peer, p_entry); - mtx_unlock(&ht->h_mtx); -} - -static void -wg_peer_free_deferred(epoch_context_t ctx) -{ - struct wg_peer *peer = __containerof(ctx, struct wg_peer, p_ctx); - counter_u64_free(peer->p_tx_bytes); - counter_u64_free(peer->p_rx_bytes); - rw_destroy(&peer->p_timers.t_lock); - rw_destroy(&peer->p_endpoint_lock); - free(peer, M_WG); -} - -static void -wg_peer_destroy(struct wg_peer *peer) -{ - /* Callers should already have called: - * wg_hashtable_peer_remove(&sc->sc_hashtable, peer); - */ - wg_aip_delete(&peer->p_sc->sc_aips, peer); - MPASS(CK_LIST_EMPTY(&peer->p_aips)); - - /* We disable all timers, so we can't call the following tasks. 
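wg_peer_lookup above derives each bucket as siphash24(h_secret, pubkey) & h_peers_mask, so bucket placement is unpredictable to anyone without the per-interface secret, and the mask trick again requires a power-of-two table (HASHTABLE_PEER_SIZE is 1 << 11). A sketch of the same bucketing with a stand-in mixer; toy_hash is illustrative only, not SipHash, and offers none of its keyed-PRF guarantees:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define TABLE_SIZE	(1u << 11)	/* HASHTABLE_PEER_SIZE */
#define TABLE_MASK	(TABLE_SIZE - 1)

/* Stand-in for the keyed siphash24(); illustration only. */
static uint64_t
toy_hash(uint64_t secret, const uint8_t *key, size_t len)
{
	uint64_t h = secret ^ 0x9e3779b97f4a7c15ULL;

	for (size_t i = 0; i < len; i++) {
		h ^= key[i];
		h *= 0x100000001b3ULL;
	}
	return (h);
}

int
main(void)
{
	uint8_t pubkey[32] = { 9 };

	printf("bucket %llu of %u\n",
	    (unsigned long long)(toy_hash(0x1234, pubkey,
	    sizeof(pubkey)) & TABLE_MASK), TABLE_SIZE);
	return (0);
}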
*/ - wg_timers_disable(&peer->p_timers); - - /* Ensure the tasks have finished running */ - GROUPTASK_DRAIN(&peer->p_clear_secrets); - GROUPTASK_DRAIN(&peer->p_send_initiation); - GROUPTASK_DRAIN(&peer->p_send_keepalive); - GROUPTASK_DRAIN(&peer->p_recv); - GROUPTASK_DRAIN(&peer->p_send); - - taskqgroup_detach(qgroup_if_io_tqg, &peer->p_clear_secrets); - taskqgroup_detach(qgroup_if_io_tqg, &peer->p_send_initiation); - taskqgroup_detach(qgroup_if_io_tqg, &peer->p_send_keepalive); - taskqgroup_detach(qgroup_if_io_tqg, &peer->p_recv); - taskqgroup_detach(qgroup_if_io_tqg, &peer->p_send); - - wg_queue_deinit(&peer->p_decap_queue); - wg_queue_deinit(&peer->p_encap_queue); - wg_queue_deinit(&peer->p_stage_queue); - - /* Final cleanup */ - --peer->p_sc->sc_peer_count; - noise_remote_clear(&peer->p_remote); - DPRINTF(peer->p_sc, "Peer %llu destroyed\n", (unsigned long long)peer->p_id); - NET_EPOCH_CALL(wg_peer_free_deferred, &peer->p_ctx); -} - -static void -wg_peer_set_endpoint_from_tag(struct wg_peer *peer, struct wg_tag *t) -{ - struct wg_endpoint *e = &t->t_endpoint; - - MPASS(e->e_remote.r_sa.sa_family != 0); - if (memcmp(e, &peer->p_endpoint, sizeof(*e)) == 0) - return; - - peer->p_endpoint = *e; -} - -static void -wg_peer_clear_src(struct wg_peer *peer) -{ - rw_rlock(&peer->p_endpoint_lock); - bzero(&peer->p_endpoint.e_local, sizeof(peer->p_endpoint.e_local)); - rw_runlock(&peer->p_endpoint_lock); -} - -static void -wg_peer_get_endpoint(struct wg_peer *p, struct wg_endpoint *e) -{ - memcpy(e, &p->p_endpoint, sizeof(*e)); -} - -/* Allowed IP */ -static int -wg_aip_init(struct wg_aip_table *tbl) -{ - int rc; - - tbl->t_count = 0; - rc = rn_inithead((void **)&tbl->t_ip, - offsetof(struct sockaddr_in, sin_addr) * NBBY); - - if (rc == 0) - return (ENOMEM); - RADIX_NODE_HEAD_LOCK_INIT(tbl->t_ip); -#ifdef INET6 - rc = rn_inithead((void **)&tbl->t_ip6, - offsetof(struct sockaddr_in6, sin6_addr) * NBBY); - if (rc == 0) { - free(tbl->t_ip, M_RTABLE); - return (ENOMEM); - } - RADIX_NODE_HEAD_LOCK_INIT(tbl->t_ip6); -#endif - return (0); -} - -static void -wg_aip_destroy(struct wg_aip_table *tbl) -{ - RADIX_NODE_HEAD_DESTROY(tbl->t_ip); - free(tbl->t_ip, M_RTABLE); -#ifdef INET6 - RADIX_NODE_HEAD_DESTROY(tbl->t_ip6); - free(tbl->t_ip6, M_RTABLE); -#endif -} - -static void -wg_aip_populate_aip4(struct wg_aip *aip, const struct in_addr *addr, - uint8_t mask) -{ - struct sockaddr_in *raddr, *rmask; - uint8_t *p; - unsigned int i; - - raddr = (struct sockaddr_in *)&aip->r_addr; - rmask = (struct sockaddr_in *)&aip->r_mask; - - raddr->sin_len = sizeof(*raddr); - raddr->sin_family = AF_INET; - raddr->sin_addr = *addr; - - rmask->sin_len = sizeof(*rmask); - p = (uint8_t *)&rmask->sin_addr.s_addr; - for (i = 0; i < mask / NBBY; i++) - p[i] = 0xff; - if ((mask % NBBY) != 0) - p[i] = (0xff00 >> (mask % NBBY)) & 0xff; - raddr->sin_addr.s_addr &= rmask->sin_addr.s_addr; -} - -static void -wg_aip_populate_aip6(struct wg_aip *aip, const struct in6_addr *addr, - uint8_t mask) -{ - struct sockaddr_in6 *raddr, *rmask; - - raddr = (struct sockaddr_in6 *)&aip->r_addr; - rmask = (struct sockaddr_in6 *)&aip->r_mask; - - raddr->sin6_len = sizeof(*raddr); - raddr->sin6_family = AF_INET6; - raddr->sin6_addr = *addr; - - rmask->sin6_len = sizeof(*rmask); - in6_prefixlen2mask(&rmask->sin6_addr, mask); - for (int i = 0; i < 4; ++i) - raddr->sin6_addr.__u6_addr.__u6_addr32[i] &= rmask->sin6_addr.__u6_addr.__u6_addr32[i]; -} - -/* wg_aip_take assumes that the caller guarantees the allowed-ip exists. 
*/ -static void -wg_aip_take(struct radix_node_head *root, struct wg_peer *peer, - struct wg_aip *route) -{ - struct radix_node *node; - struct wg_peer *ppeer; - - RADIX_NODE_HEAD_LOCK_ASSERT(root); - - node = root->rnh_lookup(&route->r_addr, &route->r_mask, - &root->rh); - MPASS(node != NULL); - - route = (struct wg_aip *)node; - ppeer = route->r_peer; - if (ppeer != peer) { - route->r_peer = peer; - - CK_LIST_REMOVE(route, r_entry); - CK_LIST_INSERT_HEAD(&peer->p_aips, route, r_entry); - } -} - -static int -wg_aip_add(struct wg_aip_table *tbl, struct wg_peer *peer, - const struct wg_allowedip *aip) -{ - struct radix_node *node; - struct radix_node_head *root; - struct wg_aip *route; - sa_family_t family; - bool needfree = false; - - family = aip->family; - if (family != AF_INET && family != AF_INET6) { - return (EINVAL); - } - - route = malloc(sizeof(*route), M_WG, M_WAITOK|M_ZERO); - switch (family) { - case AF_INET: - root = tbl->t_ip; - - wg_aip_populate_aip4(route, &aip->ip4, aip->cidr); - break; - case AF_INET6: - root = tbl->t_ip6; - - wg_aip_populate_aip6(route, &aip->ip6, aip->cidr); - break; - } - - route->r_peer = peer; - - RADIX_NODE_HEAD_LOCK(root); - node = root->rnh_addaddr(&route->r_addr, &route->r_mask, &root->rh, - route->r_nodes); - if (node == route->r_nodes) { - tbl->t_count++; - CK_LIST_INSERT_HEAD(&peer->p_aips, route, r_entry); - } else { - needfree = true; - wg_aip_take(root, peer, route); - } - RADIX_NODE_HEAD_UNLOCK(root); - if (needfree) { - free(route, M_WG); - } - return (0); -} - -static struct wg_peer * -wg_aip_lookup(struct wg_aip_table *tbl, struct mbuf *m, - enum route_direction dir) -{ - RADIX_NODE_HEAD_RLOCK_TRACKER; - struct ip *iphdr; - struct ip6_hdr *ip6hdr; - struct radix_node_head *root; - struct radix_node *node; - struct wg_peer *peer = NULL; - struct sockaddr_in sin; - struct sockaddr_in6 sin6; - void *addr; - int version; - - NET_EPOCH_ASSERT(); - iphdr = mtod(m, struct ip *); - version = iphdr->ip_v; - - if (__predict_false(dir != IN && dir != OUT)) - return NULL; - - if (version == 4) { - root = tbl->t_ip; - memset(&sin, 0, sizeof(sin)); - sin.sin_len = sizeof(struct sockaddr_in); - if (dir == IN) - sin.sin_addr = iphdr->ip_src; - else - sin.sin_addr = iphdr->ip_dst; - addr = &sin; - } else if (version == 6) { - ip6hdr = mtod(m, struct ip6_hdr *); - memset(&sin6, 0, sizeof(sin6)); - sin6.sin6_len = sizeof(struct sockaddr_in6); - - root = tbl->t_ip6; - if (dir == IN) - addr = &ip6hdr->ip6_src; - else - addr = &ip6hdr->ip6_dst; - memcpy(&sin6.sin6_addr, addr, sizeof(sin6.sin6_addr)); - addr = &sin6; - } else { - return (NULL); - } - RADIX_NODE_HEAD_RLOCK(root); - if ((node = root->rnh_matchaddr(addr, &root->rh)) != NULL) { - peer = ((struct wg_aip *) node)->r_peer; - } - RADIX_NODE_HEAD_RUNLOCK(root); - return (peer); -} - -struct peer_del_arg { - struct radix_node_head * pda_head; - struct wg_peer *pda_peer; - struct wg_aip_table *pda_tbl; -}; - -static int -wg_peer_remove(struct radix_node *rn, void *arg) -{ - struct peer_del_arg *pda = arg; - struct wg_peer *peer = pda->pda_peer; - struct radix_node_head * rnh = pda->pda_head; - struct wg_aip_table *tbl = pda->pda_tbl; - struct wg_aip *route = (struct wg_aip *)rn; - struct radix_node *x; - - if (route->r_peer != peer) - return (0); - x = (struct radix_node *)rnh->rnh_deladdr(&route->r_addr, - &route->r_mask, &rnh->rh); - if (x != NULL) { - tbl->t_count--; - CK_LIST_REMOVE(route, r_entry); - free(route, M_WG); - } - return (0); -} - -static void -wg_peer_remove_all(struct wg_softc *sc) -{ - 
struct wg_peer *peer, *tpeer; - - sx_assert(&sc->sc_lock, SX_XLOCKED); - - CK_LIST_FOREACH_SAFE(peer, &sc->sc_hashtable.h_peers_list, - p_entry, tpeer) { - wg_hashtable_peer_remove(&sc->sc_hashtable, peer); - wg_peer_destroy(peer); - } -} - -static int -wg_aip_delete(struct wg_aip_table *tbl, struct wg_peer *peer) -{ - struct peer_del_arg pda; - - pda.pda_peer = peer; - pda.pda_tbl = tbl; - RADIX_NODE_HEAD_LOCK(tbl->t_ip); - pda.pda_head = tbl->t_ip; - rn_walktree(&tbl->t_ip->rh, wg_peer_remove, &pda); - RADIX_NODE_HEAD_UNLOCK(tbl->t_ip); - - RADIX_NODE_HEAD_LOCK(tbl->t_ip6); - pda.pda_head = tbl->t_ip6; - rn_walktree(&tbl->t_ip6->rh, wg_peer_remove, &pda); - RADIX_NODE_HEAD_UNLOCK(tbl->t_ip6); - return (0); -} - -static int -wg_socket_init(struct wg_softc *sc, in_port_t port) -{ - struct thread *td; - struct ucred *cred; - struct socket *so4, *so6; - int rc; - - sx_assert(&sc->sc_lock, SX_XLOCKED); - - so4 = so6 = NULL; - td = curthread; - if ((cred = sc->sc_ucred) == NULL) - return (EBUSY); - - /* - * For socket creation, we use the creds of the thread that created the - * tunnel rather than the current thread to maintain the semantics that - * WireGuard has on Linux with network namespaces -- that the sockets - * are created in their home vnet so that they can be configured and - * functionally attached to a foreign vnet as the jail's only interface - * to the network. - */ - rc = socreate(AF_INET, &so4, SOCK_DGRAM, IPPROTO_UDP, cred, td); - if (rc != 0) - goto out; - - rc = udp_set_kernel_tunneling(so4, wg_input, NULL, sc); - /* - * udp_set_kernel_tunneling can only fail if there is already a tunneling function set. - * This should never happen with a new socket. - */ - MPASS(rc == 0); - - rc = socreate(AF_INET6, &so6, SOCK_DGRAM, IPPROTO_UDP, cred, td); - if (rc != 0) - goto out; - rc = udp_set_kernel_tunneling(so6, wg_input, NULL, sc); - MPASS(rc == 0); - - so4->so_user_cookie = so6->so_user_cookie = sc->sc_socket.so_user_cookie; - - rc = wg_socket_bind(so4, so6, &port); - if (rc == 0) { - sc->sc_socket.so_port = port; - wg_socket_set(sc, so4, so6); - } -out: - if (rc != 0) { - if (so4 != NULL) - soclose(so4); - if (so6 != NULL) - soclose(so6); - } - return (rc); -} - -static void wg_socket_set_cookie(struct wg_softc *sc, uint32_t user_cookie) -{ - struct wg_socket *so = &sc->sc_socket; - - sx_assert(&sc->sc_lock, SX_XLOCKED); - - so->so_user_cookie = user_cookie; - if (so->so_so4) - so->so_so4->so_user_cookie = user_cookie; - if (so->so_so6) - so->so_so6->so_user_cookie = user_cookie; -} - -static void -wg_socket_uninit(struct wg_softc *sc) -{ - wg_socket_set(sc, NULL, NULL); -} - -static void -wg_socket_set(struct wg_softc *sc, struct socket *new_so4, struct socket *new_so6) -{ - struct wg_socket *so = &sc->sc_socket; - struct socket *so4, *so6; - - sx_assert(&sc->sc_lock, SX_XLOCKED); - - so4 = atomic_load_ptr(&so->so_so4); - so6 = atomic_load_ptr(&so->so_so6); - atomic_store_ptr(&so->so_so4, new_so4); - atomic_store_ptr(&so->so_so6, new_so6); - - if (!so4 && !so6) - return; - NET_EPOCH_WAIT(); - if (so4) - soclose(so4); - if (so6) - soclose(so6); -} - -union wg_sockaddr { - struct sockaddr sa; - struct sockaddr_in in4; - struct sockaddr_in6 in6; -}; - -static int -wg_socket_bind(struct socket *so4, struct socket *so6, in_port_t *requested_port) -{ - int rc; - struct thread *td; - union wg_sockaddr laddr; - struct sockaddr_in *sin; - struct sockaddr_in6 *sin6; - in_port_t port = *requested_port; - - td = curthread; - bzero(&laddr, sizeof(laddr)); - sin = &laddr.in4; - sin->sin_len 
= sizeof(laddr.in4); - sin->sin_family = AF_INET; - sin->sin_port = htons(port); - sin->sin_addr = (struct in_addr) { 0 }; - - if ((rc = sobind(so4, &laddr.sa, td)) != 0) - return (rc); - - if (port == 0) { - rc = sogetsockaddr(so4, (struct sockaddr **)&sin); - if (rc != 0) - return (rc); - port = ntohs(sin->sin_port); - free(sin, M_SONAME); - } - - sin6 = &laddr.in6; - sin6->sin6_len = sizeof(laddr.in6); - sin6->sin6_family = AF_INET6; - sin6->sin6_port = htons(port); - sin6->sin6_addr = (struct in6_addr) { .s6_addr = { 0 } }; - rc = sobind(so6, &laddr.sa, td); - if (rc != 0) - return (rc); - *requested_port = port; - return (0); -} - -static int -wg_send(struct wg_softc *sc, struct wg_endpoint *e, struct mbuf *m) -{ - struct epoch_tracker et; - struct sockaddr *sa; - struct wg_socket *so = &sc->sc_socket; - struct socket *so4, *so6; - struct mbuf *control = NULL; - int ret = 0; - size_t len = m->m_pkthdr.len; - - /* Get local control address before locking */ - if (e->e_remote.r_sa.sa_family == AF_INET) { - if (e->e_local.l_in.s_addr != INADDR_ANY) - control = sbcreatecontrol((caddr_t)&e->e_local.l_in, - sizeof(struct in_addr), IP_SENDSRCADDR, - IPPROTO_IP); -#ifdef INET6 - } else if (e->e_remote.r_sa.sa_family == AF_INET6) { - if (!IN6_IS_ADDR_UNSPECIFIED(&e->e_local.l_in6)) - control = sbcreatecontrol((caddr_t)&e->e_local.l_pktinfo6, - sizeof(struct in6_pktinfo), IPV6_PKTINFO, - IPPROTO_IPV6); -#endif - } else { - m_freem(m); - return (EAFNOSUPPORT); - } - - /* Get remote address */ - sa = &e->e_remote.r_sa; - - NET_EPOCH_ENTER(et); - so4 = atomic_load_ptr(&so->so_so4); - so6 = atomic_load_ptr(&so->so_so6); - if (e->e_remote.r_sa.sa_family == AF_INET && so4 != NULL) - ret = sosend(so4, sa, NULL, m, control, 0, curthread); - else if (e->e_remote.r_sa.sa_family == AF_INET6 && so6 != NULL) - ret = sosend(so6, sa, NULL, m, control, 0, curthread); - else { - ret = ENOTCONN; - m_freem(control); - m_freem(m); - } - NET_EPOCH_EXIT(et); - if (ret == 0) { - if_inc_counter(sc->sc_ifp, IFCOUNTER_OPACKETS, 1); - if_inc_counter(sc->sc_ifp, IFCOUNTER_OBYTES, len); - } - return (ret); -} - -static void -wg_send_buf(struct wg_softc *sc, struct wg_endpoint *e, uint8_t *buf, - size_t len) -{ - struct mbuf *m; - int ret = 0; - -retry: - m = m_gethdr(M_WAITOK, MT_DATA); - m->m_len = 0; - m_copyback(m, 0, len, buf); - - if (ret == 0) { - ret = wg_send(sc, e, m); - /* Retry if we couldn't bind to e->e_local */ - if (ret == EADDRNOTAVAIL) { - bzero(&e->e_local, sizeof(e->e_local)); - goto retry; - } - } else { - ret = wg_send(sc, e, m); - } - if (ret) - DPRINTF(sc, "Unable to send packet: %d\n", ret); -} - -/* TODO Tag */ -static struct wg_tag * -wg_tag_get(struct mbuf *m) -{ - struct m_tag *tag; - - tag = m_tag_find(m, MTAG_WIREGUARD, NULL); - if (tag == NULL) { - tag = m_tag_get(MTAG_WIREGUARD, sizeof(struct wg_tag), M_NOWAIT|M_ZERO); - m_tag_prepend(m, tag); - MPASS(!SLIST_EMPTY(&m->m_pkthdr.tags)); - MPASS(m_tag_locate(m, MTAG_ABI_COMPAT, MTAG_WIREGUARD, NULL) == tag); - } - return (struct wg_tag *)tag; -} - -static struct wg_endpoint * -wg_mbuf_endpoint_get(struct mbuf *m) -{ - struct wg_tag *hdr; - - if ((hdr = wg_tag_get(m)) == NULL) - return (NULL); - - return (&hdr->t_endpoint); -} - -/* Timers */ -static void -wg_timers_init(struct wg_timers *t) -{ - bzero(t, sizeof(*t)); - - t->t_disabled = 1; - rw_init(&t->t_lock, "wg peer timers"); - callout_init(&t->t_retry_handshake, true); - callout_init(&t->t_send_keepalive, true); - callout_init(&t->t_new_handshake, true); - 
callout_init(&t->t_zero_key_material, true); - callout_init(&t->t_persistent_keepalive, true); -} - -static void -wg_timers_enable(struct wg_timers *t) -{ - rw_wlock(&t->t_lock); - t->t_disabled = 0; - rw_wunlock(&t->t_lock); - wg_timers_run_persistent_keepalive(t); -} - -static void -wg_timers_disable(struct wg_timers *t) -{ - rw_wlock(&t->t_lock); - t->t_disabled = 1; - t->t_need_another_keepalive = 0; - rw_wunlock(&t->t_lock); - - callout_stop(&t->t_retry_handshake); - callout_stop(&t->t_send_keepalive); - callout_stop(&t->t_new_handshake); - callout_stop(&t->t_zero_key_material); - callout_stop(&t->t_persistent_keepalive); -} - -static void -wg_timers_set_persistent_keepalive(struct wg_timers *t, uint16_t interval) -{ - rw_rlock(&t->t_lock); - if (!t->t_disabled) { - t->t_persistent_keepalive_interval = interval; - wg_timers_run_persistent_keepalive(t); - } - rw_runlock(&t->t_lock); -} - -static void -wg_timers_get_last_handshake(struct wg_timers *t, struct timespec *time) -{ - rw_rlock(&t->t_lock); - time->tv_sec = t->t_handshake_complete.tv_sec; - time->tv_nsec = t->t_handshake_complete.tv_nsec; - rw_runlock(&t->t_lock); -} - -static int -wg_timers_expired_handshake_last_sent(struct wg_timers *t) -{ - struct timespec uptime; - struct timespec expire = { .tv_sec = REKEY_TIMEOUT, .tv_nsec = 0 }; - - getnanouptime(&uptime); - timespecadd(&t->t_handshake_last_sent, &expire, &expire); - return timespeccmp(&uptime, &expire, >) ? ETIMEDOUT : 0; -} - -static int -wg_timers_check_handshake_last_sent(struct wg_timers *t) -{ - int ret; - - rw_wlock(&t->t_lock); - if ((ret = wg_timers_expired_handshake_last_sent(t)) == ETIMEDOUT) - getnanouptime(&t->t_handshake_last_sent); - rw_wunlock(&t->t_lock); - return (ret); -} - -/* Should be called after an authenticated data packet is sent. */ -static void -wg_timers_event_data_sent(struct wg_timers *t) -{ - rw_rlock(&t->t_lock); - if (!t->t_disabled && !callout_pending(&t->t_new_handshake)) - callout_reset(&t->t_new_handshake, MSEC_2_TICKS( - NEW_HANDSHAKE_TIMEOUT * 1000 + - arc4random_uniform(REKEY_TIMEOUT_JITTER)), - (timeout_t *)wg_timers_run_new_handshake, t); - rw_runlock(&t->t_lock); -} - -/* Should be called after an authenticated data packet is received. */ -static void -wg_timers_event_data_received(struct wg_timers *t) -{ - rw_rlock(&t->t_lock); - if (!t->t_disabled) { - if (!callout_pending(&t->t_send_keepalive)) { - callout_reset(&t->t_send_keepalive, - MSEC_2_TICKS(KEEPALIVE_TIMEOUT * 1000), - (timeout_t *)wg_timers_run_send_keepalive, t); - } else { - t->t_need_another_keepalive = 1; - } - } - rw_runlock(&t->t_lock); -} - -/* - * Should be called after any type of authenticated packet is sent, whether - * keepalive, data, or handshake. - */ -static void -wg_timers_event_any_authenticated_packet_sent(struct wg_timers *t) -{ - callout_stop(&t->t_send_keepalive); -} - -/* - * Should be called after any type of authenticated packet is received, whether - * keepalive, data, or handshake. - */ -static void -wg_timers_event_any_authenticated_packet_received(struct wg_timers *t) -{ - callout_stop(&t->t_new_handshake); -} - -/* - * Should be called before a packet with authentication, whether - * keepalive, data, or handshake is sent, or after one is received. 
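wg_timers_expired_handshake_last_sent above gates initiations on the REKEY_TIMEOUT window using the kernel's getnanouptime/timespecadd/timespeccmp helpers. A userspace analogue of the same check, assuming POSIX clock_gettime with CLOCK_MONOTONIC in place of the kernel uptime clock:

#include <time.h>

#define REKEY_TIMEOUT	5	/* seconds, as defined above */

static int
handshake_expired(const struct timespec *last_sent)
{
	struct timespec now, expire = *last_sent;

	clock_gettime(CLOCK_MONOTONIC, &now);
	expire.tv_sec += REKEY_TIMEOUT;		/* timespecadd(last, {5,0}) */
	if (now.tv_sec != expire.tv_sec)	/* timespeccmp(&now, &expire, >) */
		return (now.tv_sec > expire.tv_sec);
	return (now.tv_nsec > expire.tv_nsec);
}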
- */ -static void -wg_timers_event_any_authenticated_packet_traversal(struct wg_timers *t) -{ - rw_rlock(&t->t_lock); - if (!t->t_disabled && t->t_persistent_keepalive_interval > 0) - callout_reset(&t->t_persistent_keepalive, - MSEC_2_TICKS(t->t_persistent_keepalive_interval * 1000), - (timeout_t *)wg_timers_run_persistent_keepalive, t); - rw_runlock(&t->t_lock); -} - -/* Should be called after a handshake initiation message is sent. */ -static void -wg_timers_event_handshake_initiated(struct wg_timers *t) -{ - rw_rlock(&t->t_lock); - if (!t->t_disabled) - callout_reset(&t->t_retry_handshake, MSEC_2_TICKS( - REKEY_TIMEOUT * 1000 + - arc4random_uniform(REKEY_TIMEOUT_JITTER)), - (timeout_t *)wg_timers_run_retry_handshake, t); - rw_runlock(&t->t_lock); -} - -static void -wg_timers_event_handshake_responded(struct wg_timers *t) -{ - rw_wlock(&t->t_lock); - getnanouptime(&t->t_handshake_last_sent); - rw_wunlock(&t->t_lock); -} - -/* - * Should be called after a handshake response message is received and processed - * or when getting key confirmation via the first data message. - */ -static void -wg_timers_event_handshake_complete(struct wg_timers *t) -{ - rw_wlock(&t->t_lock); - if (!t->t_disabled) { - callout_stop(&t->t_retry_handshake); - t->t_handshake_retries = 0; - getnanotime(&t->t_handshake_complete); - wg_timers_run_send_keepalive(t); - } - rw_wunlock(&t->t_lock); -} - -/* - * Should be called after an ephemeral key is created, which is before sending a - * handshake response or after receiving a handshake response. - */ -static void -wg_timers_event_session_derived(struct wg_timers *t) -{ - rw_rlock(&t->t_lock); - if (!t->t_disabled) { - callout_reset(&t->t_zero_key_material, - MSEC_2_TICKS(REJECT_AFTER_TIME * 3 * 1000), - (timeout_t *)wg_timers_run_zero_key_material, t); - } - rw_runlock(&t->t_lock); -} - -static void -wg_timers_event_want_initiation(struct wg_timers *t) -{ - rw_rlock(&t->t_lock); - if (!t->t_disabled) - wg_timers_run_send_initiation(t, 0); - rw_runlock(&t->t_lock); -} - -static void -wg_timers_event_reset_handshake_last_sent(struct wg_timers *t) -{ - rw_wlock(&t->t_lock); - t->t_handshake_last_sent.tv_sec -= (REKEY_TIMEOUT + 1); - rw_wunlock(&t->t_lock); -} - -static void -wg_timers_run_send_initiation(struct wg_timers *t, int is_retry) -{ - struct wg_peer *peer = __containerof(t, struct wg_peer, p_timers); - if (!is_retry) - t->t_handshake_retries = 0; - if (wg_timers_expired_handshake_last_sent(t) == ETIMEDOUT) - GROUPTASK_ENQUEUE(&peer->p_send_initiation); -} - -static void -wg_timers_run_retry_handshake(struct wg_timers *t) -{ - struct wg_peer *peer = __containerof(t, struct wg_peer, p_timers); - - rw_wlock(&t->t_lock); - if (t->t_handshake_retries <= MAX_TIMER_HANDSHAKES) { - t->t_handshake_retries++; - rw_wunlock(&t->t_lock); - - DPRINTF(peer->p_sc, "Handshake for peer %llu did not complete " - "after %d seconds, retrying (try %d)\n", - (unsigned long long)peer->p_id, - REKEY_TIMEOUT, t->t_handshake_retries + 1); - wg_peer_clear_src(peer); - wg_timers_run_send_initiation(t, 1); - } else { - rw_wunlock(&t->t_lock); - - DPRINTF(peer->p_sc, "Handshake for peer %llu did not complete " - "after %d retries, giving up\n", - (unsigned long long) peer->p_id, MAX_TIMER_HANDSHAKES + 2); - - callout_stop(&t->t_send_keepalive); - wg_queue_purge(&peer->p_stage_queue); - if (!callout_pending(&t->t_zero_key_material)) - callout_reset(&t->t_zero_key_material, - MSEC_2_TICKS(REJECT_AFTER_TIME * 3 * 1000), - (timeout_t *)wg_timers_run_zero_key_material, t); - } -} - -static void 
-wg_timers_run_send_keepalive(struct wg_timers *t) -{ - struct wg_peer *peer = __containerof(t, struct wg_peer, p_timers); - - GROUPTASK_ENQUEUE(&peer->p_send_keepalive); - if (t->t_need_another_keepalive) { - t->t_need_another_keepalive = 0; - callout_reset(&t->t_send_keepalive, - MSEC_2_TICKS(KEEPALIVE_TIMEOUT * 1000), - (timeout_t *)wg_timers_run_send_keepalive, t); - } -} - -static void -wg_timers_run_new_handshake(struct wg_timers *t) -{ - struct wg_peer *peer = __containerof(t, struct wg_peer, p_timers); - - DPRINTF(peer->p_sc, "Retrying handshake with peer %llu because we " - "stopped hearing back after %d seconds\n", - (unsigned long long)peer->p_id, NEW_HANDSHAKE_TIMEOUT); - wg_peer_clear_src(peer); - - wg_timers_run_send_initiation(t, 0); -} - -static void -wg_timers_run_zero_key_material(struct wg_timers *t) -{ - struct wg_peer *peer = __containerof(t, struct wg_peer, p_timers); - - DPRINTF(peer->p_sc, "Zeroing out all keys for peer %llu, since we " - "haven't received a new one in %d seconds\n", - (unsigned long long)peer->p_id, REJECT_AFTER_TIME * 3); - GROUPTASK_ENQUEUE(&peer->p_clear_secrets); -} - -static void -wg_timers_run_persistent_keepalive(struct wg_timers *t) -{ - struct wg_peer *peer = __containerof(t, struct wg_peer, p_timers); - - if (t->t_persistent_keepalive_interval != 0) - GROUPTASK_ENQUEUE(&peer->p_send_keepalive); -} - -/* TODO Handshake */ -static void -wg_peer_send_buf(struct wg_peer *peer, uint8_t *buf, size_t len) -{ - struct wg_endpoint endpoint; - - counter_u64_add(peer->p_tx_bytes, len); - wg_timers_event_any_authenticated_packet_traversal(&peer->p_timers); - wg_timers_event_any_authenticated_packet_sent(&peer->p_timers); - wg_peer_get_endpoint(peer, &endpoint); - wg_send_buf(peer->p_sc, &endpoint, buf, len); -} - -static void -wg_send_initiation(struct wg_peer *peer) -{ - struct wg_pkt_initiation pkt; - struct epoch_tracker et; - - if (wg_timers_check_handshake_last_sent(&peer->p_timers) != ETIMEDOUT) - return; - DPRINTF(peer->p_sc, "Sending handshake initiation to peer %llu\n", - (unsigned long long)peer->p_id); - - NET_EPOCH_ENTER(et); - if (noise_create_initiation(&peer->p_remote, &pkt.s_idx, pkt.ue, - pkt.es, pkt.ets) != 0) - goto out; - pkt.t = WG_PKT_INITIATION; - cookie_maker_mac(&peer->p_cookie, &pkt.m, &pkt, - sizeof(pkt)-sizeof(pkt.m)); - wg_peer_send_buf(peer, (uint8_t *)&pkt, sizeof(pkt)); - wg_timers_event_handshake_initiated(&peer->p_timers); -out: - NET_EPOCH_EXIT(et); -} - -static void -wg_send_response(struct wg_peer *peer) -{ - struct wg_pkt_response pkt; - struct epoch_tracker et; - - NET_EPOCH_ENTER(et); - - DPRINTF(peer->p_sc, "Sending handshake response to peer %llu\n", - (unsigned long long)peer->p_id); - - if (noise_create_response(&peer->p_remote, &pkt.s_idx, &pkt.r_idx, - pkt.ue, pkt.en) != 0) - goto out; - if (noise_remote_begin_session(&peer->p_remote) != 0) - goto out; - - wg_timers_event_session_derived(&peer->p_timers); - pkt.t = WG_PKT_RESPONSE; - cookie_maker_mac(&peer->p_cookie, &pkt.m, &pkt, - sizeof(pkt)-sizeof(pkt.m)); - wg_timers_event_handshake_responded(&peer->p_timers); - wg_peer_send_buf(peer, (uint8_t*)&pkt, sizeof(pkt)); -out: - NET_EPOCH_EXIT(et); -} - -static void -wg_send_cookie(struct wg_softc *sc, struct cookie_macs *cm, uint32_t idx, - struct mbuf *m) -{ - struct wg_pkt_cookie pkt; - struct wg_endpoint *e; - - DPRINTF(sc, "Sending cookie response for denied handshake message\n"); - - pkt.t = WG_PKT_COOKIE; - pkt.r_idx = idx; - - e = wg_mbuf_endpoint_get(m); - 
cookie_checker_create_payload(&sc->sc_cookie, cm, pkt.nonce, - pkt.ec, &e->e_remote.r_sa); - wg_send_buf(sc, e, (uint8_t *)&pkt, sizeof(pkt)); -} - -static void -wg_send_keepalive(struct wg_peer *peer) -{ - struct mbuf *m = NULL; - struct wg_tag *t; - struct epoch_tracker et; - - if (wg_queue_len(&peer->p_stage_queue) != 0) { - NET_EPOCH_ENTER(et); - goto send; - } - if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) - return; - if ((t = wg_tag_get(m)) == NULL) { - m_freem(m); - return; - } - t->t_peer = peer; - t->t_mbuf = NULL; - t->t_done = 0; - t->t_mtu = 0; /* MTU == 0 OK for keepalive */ - - NET_EPOCH_ENTER(et); - wg_queue_stage(peer, m); -send: - wg_queue_out(peer); - NET_EPOCH_EXIT(et); -} - -static int -wg_cookie_validate_packet(struct cookie_checker *checker, struct mbuf *m, - int under_load) -{ - struct wg_pkt_initiation *init; - struct wg_pkt_response *resp; - struct cookie_macs *macs; - struct wg_endpoint *e; - int type, size; - void *data; - - type = *mtod(m, uint32_t *); - data = m->m_data; - e = wg_mbuf_endpoint_get(m); - if (type == WG_PKT_INITIATION) { - init = mtod(m, struct wg_pkt_initiation *); - macs = &init->m; - size = sizeof(*init) - sizeof(*macs); - } else if (type == WG_PKT_RESPONSE) { - resp = mtod(m, struct wg_pkt_response *); - macs = &resp->m; - size = sizeof(*resp) - sizeof(*macs); - } else - return 0; - - return (cookie_checker_validate_macs(checker, macs, data, size, - under_load, &e->e_remote.r_sa)); -} - - -static void -wg_handshake(struct wg_softc *sc, struct mbuf *m) -{ - struct wg_pkt_initiation *init; - struct wg_pkt_response *resp; - struct noise_remote *remote; - struct wg_pkt_cookie *cook; - struct wg_peer *peer; - struct wg_tag *t; - - /* This is global, so that our load calculation applies to the whole - * system. We don't care about races with it at all. 
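wg_cookie_validate_packet above hands cookie_checker_validate_macs everything up to, but not including, the trailing struct cookie_macs, selecting that span as sizeof(*msg) - sizeof(*macs). The arithmetic only works because the MACs are the last member and the layout carries no tail padding. A standalone check of that invariant, assuming the usual WireGuard/Noise sizes (32-byte keys, 16-byte tags and MACs, 12-byte timestamp), which are not all visible in this diff:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct macs { uint8_t mac1[16], mac2[16]; };

struct init_msg {			/* mirrors wg_pkt_initiation above */
	uint32_t	t;
	uint32_t	s_idx;
	uint8_t		ue[32];
	uint8_t		es[32 + 16];
	uint8_t		ets[12 + 16];
	struct macs	m;		/* MACs always come last */
};

int
main(void)
{
	/* the validated region ends exactly where the MACs begin */
	assert(sizeof(struct init_msg) - sizeof(struct macs) ==
	    offsetof(struct init_msg, m));
	return (0);
}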
- */ - static struct timeval wg_last_underload; - static const struct timeval underload_interval = { UNDERLOAD_TIMEOUT, 0 }; - bool packet_needs_cookie = false; - int underload, res; - - underload = mbufq_len(&sc->sc_handshake_queue) >= - MAX_QUEUED_HANDSHAKES / 8; - if (underload) - getmicrouptime(&wg_last_underload); - else if (wg_last_underload.tv_sec != 0) { - if (!ratecheck(&wg_last_underload, &underload_interval)) - underload = 1; - else - bzero(&wg_last_underload, sizeof(wg_last_underload)); - } - - res = wg_cookie_validate_packet(&sc->sc_cookie, m, underload); - - if (res && res != EAGAIN) { - printf("validate_packet got %d\n", res); - goto free; - } - if (res == EINVAL) { - DPRINTF(sc, "Invalid initiation MAC\n"); - goto free; - } else if (res == ECONNREFUSED) { - DPRINTF(sc, "Handshake ratelimited\n"); - goto free; - } else if (res == EAGAIN) { - packet_needs_cookie = true; - } else if (res != 0) { - DPRINTF(sc, "Unexpected handshake ratelimit response: %d\n", res); - goto free; - } - - t = wg_tag_get(m); - switch (*mtod(m, uint32_t *)) { - case WG_PKT_INITIATION: - init = mtod(m, struct wg_pkt_initiation *); - - if (packet_needs_cookie) { - wg_send_cookie(sc, &init->m, init->s_idx, m); - goto free; - } - if (noise_consume_initiation(&sc->sc_local, &remote, - init->s_idx, init->ue, init->es, init->ets) != 0) { - DPRINTF(sc, "Invalid handshake initiation"); - goto free; - } - - peer = __containerof(remote, struct wg_peer, p_remote); - DPRINTF(sc, "Receiving handshake initiation from peer %llu\n", - (unsigned long long)peer->p_id); - counter_u64_add(peer->p_rx_bytes, sizeof(*init)); - if_inc_counter(sc->sc_ifp, IFCOUNTER_IPACKETS, 1); - if_inc_counter(sc->sc_ifp, IFCOUNTER_IBYTES, sizeof(*init)); - wg_peer_set_endpoint_from_tag(peer, t); - wg_send_response(peer); - break; - case WG_PKT_RESPONSE: - resp = mtod(m, struct wg_pkt_response *); - - if (packet_needs_cookie) { - wg_send_cookie(sc, &resp->m, resp->s_idx, m); - goto free; - } - - if ((remote = wg_index_get(sc, resp->r_idx)) == NULL) { - DPRINTF(sc, "Unknown handshake response\n"); - goto free; - } - peer = __containerof(remote, struct wg_peer, p_remote); - if (noise_consume_response(remote, resp->s_idx, resp->r_idx, - resp->ue, resp->en) != 0) { - DPRINTF(sc, "Invalid handshake response\n"); - goto free; - } - - DPRINTF(sc, "Receiving handshake response from peer %llu\n", - (unsigned long long)peer->p_id); - counter_u64_add(peer->p_rx_bytes, sizeof(*resp)); - if_inc_counter(sc->sc_ifp, IFCOUNTER_IPACKETS, 1); - if_inc_counter(sc->sc_ifp, IFCOUNTER_IBYTES, sizeof(*resp)); - wg_peer_set_endpoint_from_tag(peer, t); - if (noise_remote_begin_session(&peer->p_remote) == 0) { - wg_timers_event_session_derived(&peer->p_timers); - wg_timers_event_handshake_complete(&peer->p_timers); - } - break; - case WG_PKT_COOKIE: - cook = mtod(m, struct wg_pkt_cookie *); - - if ((remote = wg_index_get(sc, cook->r_idx)) == NULL) { - DPRINTF(sc, "Unknown cookie index\n"); - goto free; - } - - peer = __containerof(remote, struct wg_peer, p_remote); - - if (cookie_maker_consume_payload(&peer->p_cookie, - cook->nonce, cook->ec) != 0) { - DPRINTF(sc, "Could not decrypt cookie response\n"); - goto free; - } - - DPRINTF(sc, "Receiving cookie response\n"); - goto free; - default: - goto free; - } - MPASS(peer != NULL); - wg_timers_event_any_authenticated_packet_received(&peer->p_timers); - wg_timers_event_any_authenticated_packet_traversal(&peer->p_timers); - -free: - m_freem(m); -} - -static void -wg_softc_handshake_receive(struct wg_softc *sc) -{ - 
struct mbuf *m; - - while ((m = mbufq_dequeue(&sc->sc_handshake_queue)) != NULL) - wg_handshake(sc, m); -} - -/* TODO Encrypt */ -static void -wg_encap(struct wg_softc *sc, struct mbuf *m) -{ - struct wg_pkt_data *data; - size_t padding_len, plaintext_len, out_len; - struct mbuf *mc; - struct wg_peer *peer; - struct wg_tag *t; - uint64_t nonce; - int res, allocation_order; - - NET_EPOCH_ASSERT(); - t = wg_tag_get(m); - peer = t->t_peer; - - plaintext_len = MIN(WG_PKT_WITH_PADDING(m->m_pkthdr.len), t->t_mtu); - padding_len = plaintext_len - m->m_pkthdr.len; - out_len = sizeof(struct wg_pkt_data) + plaintext_len + NOISE_AUTHTAG_LEN; - - if (out_len <= MCLBYTES) - allocation_order = MCLBYTES; - else if (out_len <= MJUMPAGESIZE) - allocation_order = MJUMPAGESIZE; - else if (out_len <= MJUM9BYTES) - allocation_order = MJUM9BYTES; - else if (out_len <= MJUM16BYTES) - allocation_order = MJUM16BYTES; - else - goto error; - - if ((mc = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, allocation_order)) == NULL) - goto error; - - data = mtod(mc, struct wg_pkt_data *); - m_copydata(m, 0, m->m_pkthdr.len, data->buf); - bzero(data->buf + m->m_pkthdr.len, padding_len); - - data->t = WG_PKT_DATA; - - res = noise_remote_encrypt(&peer->p_remote, &data->r_idx, &nonce, - data->buf, plaintext_len); - nonce = htole64(nonce); /* Wire format is little endian. */ - memcpy(data->nonce, &nonce, sizeof(data->nonce)); - - if (__predict_false(res)) { - if (res == EINVAL) { - wg_timers_event_want_initiation(&peer->p_timers); - m_freem(mc); - goto error; - } else if (res == ESTALE) { - wg_timers_event_want_initiation(&peer->p_timers); - } else { - m_freem(mc); - goto error; - } - } - - /* A packet with length 0 is a keepalive packet */ - if (m->m_pkthdr.len == 0) - DPRINTF(sc, "Sending keepalive packet to peer %llu\n", - (unsigned long long)peer->p_id); - /* - * Set the correct output value here since it will be copied - * when we move the pkthdr in send. - */ - mc->m_len = mc->m_pkthdr.len = out_len; - mc->m_flags &= ~(M_MCAST | M_BCAST); - - t->t_mbuf = mc; - error: - /* XXX membar ? */ - t->t_done = 1; - GROUPTASK_ENQUEUE(&peer->p_send); -} - -static void -wg_decap(struct wg_softc *sc, struct mbuf *m) -{ - struct wg_pkt_data *data; - struct wg_peer *peer, *routed_peer; - struct wg_tag *t; - size_t plaintext_len; - uint8_t version; - uint64_t nonce; - int res; - - NET_EPOCH_ASSERT(); - data = mtod(m, struct wg_pkt_data *); - plaintext_len = m->m_pkthdr.len - sizeof(struct wg_pkt_data); - - t = wg_tag_get(m); - peer = t->t_peer; - - memcpy(&nonce, data->nonce, sizeof(nonce)); - nonce = le64toh(nonce); /* Wire format is little endian. 
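wg_encap above sizes its output as data header + padded plaintext + NOISE_AUTHTAG_LEN, then grabs the smallest mbuf cluster that fits by falling through four fixed FreeBSD cluster sizes. The same selection in isolation; the constants are the typical amd64 values and are an assumption here, since the diff only names them:

#include <stddef.h>

#define MCLBYTES	2048		/* typical amd64 values */
#define MJUMPAGESIZE	4096
#define MJUM9BYTES	(9 * 1024)
#define MJUM16BYTES	(16 * 1024)

static int
cluster_for(size_t out_len)
{
	if (out_len <= MCLBYTES)
		return (MCLBYTES);
	if (out_len <= MJUMPAGESIZE)
		return (MJUMPAGESIZE);
	if (out_len <= MJUM9BYTES)
		return (MJUM9BYTES);
	if (out_len <= MJUM16BYTES)
		return (MJUM16BYTES);
	return (-1);		/* oversized: wg_encap drops the packet */
}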
*/ - - res = noise_remote_decrypt(&peer->p_remote, data->r_idx, nonce, - data->buf, plaintext_len); - - if (__predict_false(res)) { - if (res == EINVAL) { - goto error; - } else if (res == ECONNRESET) { - wg_timers_event_handshake_complete(&peer->p_timers); - } else if (res == ESTALE) { - wg_timers_event_want_initiation(&peer->p_timers); - } else { - panic("unexpected response: %d\n", res); - } - } - wg_peer_set_endpoint_from_tag(peer, t); - - /* Remove the data header, and crypto mac tail from the packet */ - m_adj(m, sizeof(struct wg_pkt_data)); - m_adj(m, -NOISE_AUTHTAG_LEN); - - /* A packet with length 0 is a keepalive packet */ - if (m->m_pkthdr.len == 0) { - DPRINTF(peer->p_sc, "Receiving keepalive packet from peer " - "%llu\n", (unsigned long long)peer->p_id); - goto done; - } - - version = mtod(m, struct ip *)->ip_v; - if (!((version == 4 && m->m_pkthdr.len >= sizeof(struct ip)) || - (version == 6 && m->m_pkthdr.len >= sizeof(struct ip6_hdr)))) { - DPRINTF(peer->p_sc, "Packet is neither ipv4 nor ipv6 from peer " - "%llu\n", (unsigned long long)peer->p_id); - goto error; - } - - routed_peer = wg_aip_lookup(&peer->p_sc->sc_aips, m, IN); - if (routed_peer != peer) { - DPRINTF(peer->p_sc, "Packet has unallowed src IP from peer " - "%llu\n", (unsigned long long)peer->p_id); - goto error; - } - -done: - t->t_mbuf = m; -error: - t->t_done = 1; - GROUPTASK_ENQUEUE(&peer->p_recv); -} - -static void -wg_softc_decrypt(struct wg_softc *sc) -{ - struct epoch_tracker et; - struct mbuf *m; - - NET_EPOCH_ENTER(et); - while ((m = buf_ring_dequeue_mc(sc->sc_decap_ring)) != NULL) - wg_decap(sc, m); - NET_EPOCH_EXIT(et); -} - -static void -wg_softc_encrypt(struct wg_softc *sc) -{ - struct mbuf *m; - struct epoch_tracker et; - - NET_EPOCH_ENTER(et); - while ((m = buf_ring_dequeue_mc(sc->sc_encap_ring)) != NULL) - wg_encap(sc, m); - NET_EPOCH_EXIT(et); -} - -static void -wg_encrypt_dispatch(struct wg_softc *sc) -{ - for (int i = 0; i < mp_ncpus; i++) { - if (sc->sc_encrypt[i].gt_task.ta_flags & TASK_ENQUEUED) - continue; - GROUPTASK_ENQUEUE(&sc->sc_encrypt[i]); - } -} - -static void -wg_decrypt_dispatch(struct wg_softc *sc) -{ - for (int i = 0; i < mp_ncpus; i++) { - if (sc->sc_decrypt[i].gt_task.ta_flags & TASK_ENQUEUED) - continue; - GROUPTASK_ENQUEUE(&sc->sc_decrypt[i]); - } -} - -static void -wg_deliver_out(struct wg_peer *peer) -{ - struct epoch_tracker et; - struct wg_tag *t; - struct mbuf *m; - struct wg_endpoint endpoint; - size_t len; - int ret; - - NET_EPOCH_ENTER(et); - if (peer->p_sc->sc_ifp->if_link_state == LINK_STATE_DOWN) - goto done; - - wg_peer_get_endpoint(peer, &endpoint); - - while ((m = wg_queue_dequeue(&peer->p_encap_queue, &t)) != NULL) { - /* t_mbuf will contain the encrypted packet */ - if (t->t_mbuf == NULL) { - if_inc_counter(peer->p_sc->sc_ifp, IFCOUNTER_OERRORS, 1); - m_freem(m); - continue; - } - len = t->t_mbuf->m_pkthdr.len; - ret = wg_send(peer->p_sc, &endpoint, t->t_mbuf); - - if (ret == 0) { - wg_timers_event_any_authenticated_packet_traversal( - &peer->p_timers); - wg_timers_event_any_authenticated_packet_sent( - &peer->p_timers); - - if (m->m_pkthdr.len != 0) - wg_timers_event_data_sent(&peer->p_timers); - counter_u64_add(peer->p_tx_bytes, len); - } else if (ret == EADDRNOTAVAIL) { - wg_peer_clear_src(peer); - wg_peer_get_endpoint(peer, &endpoint); - } - m_freem(m); - } -done: - NET_EPOCH_EXIT(et); -} - -static void -wg_deliver_in(struct wg_peer *peer) -{ - struct mbuf *m; - struct ifnet *ifp; - struct wg_softc *sc; - struct epoch_tracker et; - struct wg_tag *t; - 
uint32_t af; - int version; - - NET_EPOCH_ENTER(et); - sc = peer->p_sc; - ifp = sc->sc_ifp; - - while ((m = wg_queue_dequeue(&peer->p_decap_queue, &t)) != NULL) { - /* t_mbuf will contain the encrypted packet */ - if (t->t_mbuf == NULL) { - if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); - m_freem(m); - continue; - } - MPASS(m == t->t_mbuf); - - wg_timers_event_any_authenticated_packet_received( - &peer->p_timers); - wg_timers_event_any_authenticated_packet_traversal( - &peer->p_timers); - - counter_u64_add(peer->p_rx_bytes, m->m_pkthdr.len + sizeof(struct wg_pkt_data) + NOISE_AUTHTAG_LEN); - if_inc_counter(sc->sc_ifp, IFCOUNTER_IPACKETS, 1); - if_inc_counter(sc->sc_ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len + sizeof(struct wg_pkt_data) + NOISE_AUTHTAG_LEN); - - if (m->m_pkthdr.len == 0) { - m_freem(m); - continue; - } - - m->m_flags &= ~(M_MCAST | M_BCAST); - m->m_pkthdr.rcvif = ifp; - version = mtod(m, struct ip *)->ip_v; - if (version == IPVERSION) { - af = AF_INET; - BPF_MTAP2(ifp, &af, sizeof(af), m); - CURVNET_SET(ifp->if_vnet); - ip_input(m); - CURVNET_RESTORE(); - } else if (version == 6) { - af = AF_INET6; - BPF_MTAP2(ifp, &af, sizeof(af), m); - CURVNET_SET(ifp->if_vnet); - ip6_input(m); - CURVNET_RESTORE(); - } else - m_freem(m); - - wg_timers_event_data_received(&peer->p_timers); - } - NET_EPOCH_EXIT(et); -} - -static int -wg_queue_in(struct wg_peer *peer, struct mbuf *m) -{ - struct buf_ring *parallel = peer->p_sc->sc_decap_ring; - struct wg_queue *serial = &peer->p_decap_queue; - struct wg_tag *t; - int rc; - - MPASS(wg_tag_get(m) != NULL); - - mtx_lock(&serial->q_mtx); - if ((rc = mbufq_enqueue(&serial->q, m)) == ENOBUFS) { - m_freem(m); - if_inc_counter(peer->p_sc->sc_ifp, IFCOUNTER_OQDROPS, 1); - } else { - m->m_flags |= M_ENQUEUED; - rc = buf_ring_enqueue(parallel, m); - if (rc == ENOBUFS) { - t = wg_tag_get(m); - t->t_done = 1; - } - } - mtx_unlock(&serial->q_mtx); - return (rc); -} - -static void -wg_queue_stage(struct wg_peer *peer, struct mbuf *m) -{ - struct wg_queue *q = &peer->p_stage_queue; - mtx_lock(&q->q_mtx); - STAILQ_INSERT_TAIL(&q->q.mq_head, m, m_stailqpkt); - q->q.mq_len++; - while (mbufq_full(&q->q)) { - m = mbufq_dequeue(&q->q); - if (m) { - m_freem(m); - if_inc_counter(peer->p_sc->sc_ifp, IFCOUNTER_OQDROPS, 1); - } - } - mtx_unlock(&q->q_mtx); -} - -static void -wg_queue_out(struct wg_peer *peer) -{ - struct buf_ring *parallel = peer->p_sc->sc_encap_ring; - struct wg_queue *serial = &peer->p_encap_queue; - struct wg_tag *t; - struct mbufq staged; - struct mbuf *m; - - if (noise_remote_ready(&peer->p_remote) != 0) { - if (wg_queue_len(&peer->p_stage_queue)) - wg_timers_event_want_initiation(&peer->p_timers); - return; - } - - /* We first "steal" the staged queue to a local queue, so that we can do these - * remaining operations without having to hold the staged queue mutex. 
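wg_queue_out above "steals" the whole staged queue into a local list while holding the mutex and only then iterates, so the per-packet work (tagging, ring enqueue) never runs under the stage lock. The same pattern reduced to a pthread sketch, with pkt and queue as stand-in types:

#include <pthread.h>
#include <stddef.h>

struct pkt {
	struct pkt	*next;
};

struct queue {
	pthread_mutex_t	 mtx;
	struct pkt	*head;
};

static struct pkt *
queue_steal(struct queue *q)
{
	struct pkt *stolen;

	pthread_mutex_lock(&q->mtx);
	stolen = q->head;	/* detach the entire list at once */
	q->head = NULL;
	pthread_mutex_unlock(&q->mtx);
	return (stolen);	/* caller drains it without the lock */
}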
*/ - STAILQ_INIT(&staged.mq_head); - mtx_lock(&peer->p_stage_queue.q_mtx); - STAILQ_SWAP(&staged.mq_head, &peer->p_stage_queue.q.mq_head, mbuf); - staged.mq_len = peer->p_stage_queue.q.mq_len; - peer->p_stage_queue.q.mq_len = 0; - staged.mq_maxlen = peer->p_stage_queue.q.mq_maxlen; - mtx_unlock(&peer->p_stage_queue.q_mtx); - - while ((m = mbufq_dequeue(&staged)) != NULL) { - if ((t = wg_tag_get(m)) == NULL) { - m_freem(m); - continue; - } - t->t_peer = peer; - mtx_lock(&serial->q_mtx); - if (mbufq_enqueue(&serial->q, m) != 0) { - m_freem(m); - if_inc_counter(peer->p_sc->sc_ifp, IFCOUNTER_OQDROPS, 1); - } else { - m->m_flags |= M_ENQUEUED; - if (buf_ring_enqueue(parallel, m)) { - t = wg_tag_get(m); - t->t_done = 1; - } - } - mtx_unlock(&serial->q_mtx); - } - wg_encrypt_dispatch(peer->p_sc); -} - -static struct mbuf * -wg_queue_dequeue(struct wg_queue *q, struct wg_tag **t) -{ - struct mbuf *m_, *m; - - m = NULL; - mtx_lock(&q->q_mtx); - m_ = mbufq_first(&q->q); - if (m_ != NULL && (*t = wg_tag_get(m_))->t_done) { - m = mbufq_dequeue(&q->q); - m->m_flags &= ~M_ENQUEUED; - } - mtx_unlock(&q->q_mtx); - return (m); -} - -static int -wg_queue_len(struct wg_queue *q) -{ - /* This access races. We might consider adding locking here. */ - return (mbufq_len(&q->q)); -} - -static void -wg_queue_init(struct wg_queue *q, const char *name) -{ - mtx_init(&q->q_mtx, name, NULL, MTX_DEF); - mbufq_init(&q->q, MAX_QUEUED_PKT); -} - -static void -wg_queue_deinit(struct wg_queue *q) -{ - wg_queue_purge(q); - mtx_destroy(&q->q_mtx); -} - -static void -wg_queue_purge(struct wg_queue *q) -{ - mtx_lock(&q->q_mtx); - mbufq_drain(&q->q); - mtx_unlock(&q->q_mtx); -} - -/* TODO Indexes */ -static struct noise_remote * -wg_remote_get(struct wg_softc *sc, uint8_t public[NOISE_PUBLIC_KEY_LEN]) -{ - struct wg_peer *peer; - - if ((peer = wg_peer_lookup(sc, public)) == NULL) - return (NULL); - return (&peer->p_remote); -} - -static uint32_t -wg_index_set(struct wg_softc *sc, struct noise_remote *remote) -{ - struct wg_index *index, *iter; - struct wg_peer *peer; - uint32_t key; - - /* We can modify this without a lock as wg_index_set, wg_index_drop are - * guaranteed to be serialised (per remote). */ - peer = __containerof(remote, struct wg_peer, p_remote); - index = SLIST_FIRST(&peer->p_unused_index); - MPASS(index != NULL); - SLIST_REMOVE_HEAD(&peer->p_unused_index, i_unused_entry); - - index->i_value = remote; - - rw_wlock(&sc->sc_index_lock); -assign_id: - key = index->i_key = arc4random(); - key &= sc->sc_index_mask; - LIST_FOREACH(iter, &sc->sc_index[key], i_entry) - if (iter->i_key == index->i_key) - goto assign_id; - - LIST_INSERT_HEAD(&sc->sc_index[key], index, i_entry); - - rw_wunlock(&sc->sc_index_lock); - - /* Likewise, no need to lock for index here. 
*/ - return index->i_key; -} - -static struct noise_remote * -wg_index_get(struct wg_softc *sc, uint32_t key0) -{ - struct wg_index *iter; - struct noise_remote *remote = NULL; - uint32_t key = key0 & sc->sc_index_mask; - - rw_enter_read(&sc->sc_index_lock); - LIST_FOREACH(iter, &sc->sc_index[key], i_entry) - if (iter->i_key == key0) { - remote = iter->i_value; - break; - } - rw_exit_read(&sc->sc_index_lock); - return remote; -} - -static void -wg_index_drop(struct wg_softc *sc, uint32_t key0) -{ - struct wg_index *iter; - struct wg_peer *peer = NULL; - uint32_t key = key0 & sc->sc_index_mask; - - rw_enter_write(&sc->sc_index_lock); - LIST_FOREACH(iter, &sc->sc_index[key], i_entry) - if (iter->i_key == key0) { - LIST_REMOVE(iter, i_entry); - break; - } - rw_exit_write(&sc->sc_index_lock); - - if (iter == NULL) - return; - - /* We expect a peer */ - peer = __containerof(iter->i_value, struct wg_peer, p_remote); - MPASS(peer != NULL); - SLIST_INSERT_HEAD(&peer->p_unused_index, iter, i_unused_entry); -} - -static int -wg_update_endpoint_addrs(struct wg_endpoint *e, const struct sockaddr *srcsa, - struct ifnet *rcvif) -{ - const struct sockaddr_in *sa4; -#ifdef INET6 - const struct sockaddr_in6 *sa6; -#endif - int ret = 0; - - /* - * UDP passes a 2-element sockaddr array: first element is the - * source addr/port, second the destination addr/port. - */ - if (srcsa->sa_family == AF_INET) { - sa4 = (const struct sockaddr_in *)srcsa; - e->e_remote.r_sin = sa4[0]; - e->e_local.l_in = sa4[1].sin_addr; -#ifdef INET6 - } else if (srcsa->sa_family == AF_INET6) { - sa6 = (const struct sockaddr_in6 *)srcsa; - e->e_remote.r_sin6 = sa6[0]; - e->e_local.l_in6 = sa6[1].sin6_addr; -#endif - } else { - ret = EAFNOSUPPORT; - } - - return (ret); -} - -static void -wg_input(struct mbuf *m0, int offset, struct inpcb *inpcb, - const struct sockaddr *srcsa, void *_sc) -{ - struct wg_pkt_data *pkt_data; - struct wg_endpoint *e; - struct wg_softc *sc = _sc; - struct mbuf *m; - int pktlen, pkttype; - struct noise_remote *remote; - struct wg_tag *t; - void *data; - - /* Caller provided us with srcsa, no need for this header. */ - m_adj(m0, offset + sizeof(struct udphdr)); - - /* - * Ensure mbuf has at least enough contiguous data to peel off our - * headers at the beginning. 
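One detail of wg_index_get() and wg_index_drop() above is worth spelling out: sc_index_mask is a power of two minus one, so the low bits of the receiver index select a hash chain and the full 32-bit value disambiguates within it. A toy illustration with hypothetical types and no locking:

	#include <sys/queue.h>
	#include <stdint.h>

	struct entry {
		LIST_ENTRY(entry)	 link;
		uint32_t		 key;	// full 32-bit receiver index
		void			*value;
	};
	LIST_HEAD(bucket, entry);

	static void *
	index_lookup(struct bucket *table, uint32_t mask, uint32_t key)
	{
		struct entry *it;

		LIST_FOREACH(it, &table[key & mask], link)	// masked bits pick the chain
			if (it->key == key)			// full compare within it
				return (it->value);
		return (NULL);
	}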
- */ - if ((m = m_defrag(m0, M_NOWAIT)) == NULL) { - m_freem(m0); - return; - } - data = mtod(m, void *); - pkttype = *(uint32_t*)data; - t = wg_tag_get(m); - if (t == NULL) { - goto free; - } - e = wg_mbuf_endpoint_get(m); - - if (wg_update_endpoint_addrs(e, srcsa, m->m_pkthdr.rcvif)) { - goto free; - } - - pktlen = m->m_pkthdr.len; - - if ((pktlen == sizeof(struct wg_pkt_initiation) && - pkttype == WG_PKT_INITIATION) || - (pktlen == sizeof(struct wg_pkt_response) && - pkttype == WG_PKT_RESPONSE) || - (pktlen == sizeof(struct wg_pkt_cookie) && - pkttype == WG_PKT_COOKIE)) { - if (mbufq_enqueue(&sc->sc_handshake_queue, m) == 0) { - GROUPTASK_ENQUEUE(&sc->sc_handshake); - } else { - DPRINTF(sc, "Dropping handshake packet\n"); - m_freem(m); - } - } else if (pktlen >= sizeof(struct wg_pkt_data) + NOISE_AUTHTAG_LEN - && pkttype == WG_PKT_DATA) { - - pkt_data = data; - remote = wg_index_get(sc, pkt_data->r_idx); - if (remote == NULL) { - if_inc_counter(sc->sc_ifp, IFCOUNTER_IERRORS, 1); - m_freem(m); - } else if (buf_ring_count(sc->sc_decap_ring) > MAX_QUEUED_PKT) { - if_inc_counter(sc->sc_ifp, IFCOUNTER_IQDROPS, 1); - m_freem(m); - } else { - t->t_peer = __containerof(remote, struct wg_peer, - p_remote); - t->t_mbuf = NULL; - t->t_done = 0; - - wg_queue_in(t->t_peer, m); - wg_decrypt_dispatch(sc); - } - } else { -free: - m_freem(m); - } -} - -static int -wg_transmit(struct ifnet *ifp, struct mbuf *m) -{ - struct wg_softc *sc; - sa_family_t family; - struct epoch_tracker et; - struct wg_peer *peer; - struct wg_tag *t; - uint32_t af; - int rc; - - /* - * Work around lifetime issue in the ipv6 mld code. - */ - if (__predict_false(ifp->if_flags & IFF_DYING)) - return (ENXIO); - - rc = 0; - sc = ifp->if_softc; - if ((t = wg_tag_get(m)) == NULL) { - rc = ENOBUFS; - goto early_out; - } - af = m->m_pkthdr.ph_family; - BPF_MTAP2(ifp, &af, sizeof(af), m); - - NET_EPOCH_ENTER(et); - peer = wg_aip_lookup(&sc->sc_aips, m, OUT); - if (__predict_false(peer == NULL)) { - rc = ENOKEY; - goto err; - } - - family = peer->p_endpoint.e_remote.r_sa.sa_family; - if (__predict_false(family != AF_INET && family != AF_INET6)) { - DPRINTF(sc, "No valid endpoint has been configured or " - "discovered for peer %llu\n", (unsigned long long)peer->p_id); - - rc = EHOSTUNREACH; - goto err; - } - t->t_peer = peer; - t->t_mbuf = NULL; - t->t_done = 0; - t->t_mtu = ifp->if_mtu; - - wg_queue_stage(peer, m); - wg_queue_out(peer); - NET_EPOCH_EXIT(et); - return (rc); -err: - NET_EPOCH_EXIT(et); -early_out: - if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1); - /* TODO: send ICMP unreachable */ - m_free(m); - return (rc); -} - -static int -wg_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa, struct route *rt) -{ - m->m_pkthdr.ph_family = sa->sa_family; - return (wg_transmit(ifp, m)); -} - -static int -wg_peer_add(struct wg_softc *sc, const nvlist_t *nvl) -{ - uint8_t public[WG_KEY_SIZE]; - const void *pub_key; - const struct sockaddr *endpoint; - int err; - size_t size; - struct wg_peer *peer = NULL; - bool need_insert = false; - - sx_assert(&sc->sc_lock, SX_XLOCKED); - - if (!nvlist_exists_binary(nvl, "public-key")) { - return (EINVAL); - } - pub_key = nvlist_get_binary(nvl, "public-key", &size); - if (size != WG_KEY_SIZE) { - return (EINVAL); - } - if (noise_local_keys(&sc->sc_local, public, NULL) == 0 && - bcmp(public, pub_key, WG_KEY_SIZE) == 0) { - return (0); // Silently ignored; not actually a failure. 
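wg_input() above accepts a handshake message only when its leading 4-byte type and its exact length agree; anything else must look like a data packet or is dropped. The same gate, restated compactly and assuming the driver's WG_PKT_* constants and message structs:

	static bool
	wg_pkt_is_handshake(uint32_t type, int len)
	{
		switch (type) {
		case WG_PKT_INITIATION:
			return (len == sizeof(struct wg_pkt_initiation));
		case WG_PKT_RESPONSE:
			return (len == sizeof(struct wg_pkt_response));
		case WG_PKT_COOKIE:
			return (len == sizeof(struct wg_pkt_cookie));
		default:
			// data packets are variable-sized:
			// len >= sizeof(struct wg_pkt_data) + NOISE_AUTHTAG_LEN
			return (false);
		}
	}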
- } - peer = wg_peer_lookup(sc, pub_key); - if (nvlist_exists_bool(nvl, "remove") && - nvlist_get_bool(nvl, "remove")) { - if (peer != NULL) { - wg_hashtable_peer_remove(&sc->sc_hashtable, peer); - wg_peer_destroy(peer); - } - return (0); - } - if (nvlist_exists_bool(nvl, "replace-allowedips") && - nvlist_get_bool(nvl, "replace-allowedips") && - peer != NULL) { - - wg_aip_delete(&peer->p_sc->sc_aips, peer); - } - if (peer == NULL) { - if (sc->sc_peer_count >= MAX_PEERS_PER_IFACE) - return (E2BIG); - sc->sc_peer_count++; - - need_insert = true; - peer = wg_peer_alloc(sc); - MPASS(peer != NULL); - noise_remote_init(&peer->p_remote, pub_key, &sc->sc_local); - cookie_maker_init(&peer->p_cookie, pub_key); - } - if (nvlist_exists_binary(nvl, "endpoint")) { - endpoint = nvlist_get_binary(nvl, "endpoint", &size); - if (size > sizeof(peer->p_endpoint.e_remote)) { - err = EINVAL; - goto out; - } - memcpy(&peer->p_endpoint.e_remote, endpoint, size); - } - if (nvlist_exists_binary(nvl, "preshared-key")) { - const void *key; - - key = nvlist_get_binary(nvl, "preshared-key", &size); - if (size != WG_KEY_SIZE) { - err = EINVAL; - goto out; - } - noise_remote_set_psk(&peer->p_remote, key); - } - if (nvlist_exists_number(nvl, "persistent-keepalive-interval")) { - uint64_t pki = nvlist_get_number(nvl, "persistent-keepalive-interval"); - if (pki > UINT16_MAX) { - err = EINVAL; - goto out; - } - wg_timers_set_persistent_keepalive(&peer->p_timers, pki); - } - if (nvlist_exists_nvlist_array(nvl, "allowed-ips")) { - const void *binary; - uint64_t cidr; - const nvlist_t * const * aipl; - struct wg_allowedip aip; - size_t allowedip_count; - - aipl = nvlist_get_nvlist_array(nvl, "allowed-ips", - &allowedip_count); - for (size_t idx = 0; idx < allowedip_count; idx++) { - if (!nvlist_exists_number(aipl[idx], "cidr")) - continue; - cidr = nvlist_get_number(aipl[idx], "cidr"); - if (nvlist_exists_binary(aipl[idx], "ipv4")) { - binary = nvlist_get_binary(aipl[idx], "ipv4", &size); - if (binary == NULL || cidr > 32 || size != sizeof(aip.ip4)) { - err = EINVAL; - goto out; - } - aip.family = AF_INET; - memcpy(&aip.ip4, binary, sizeof(aip.ip4)); - } else if (nvlist_exists_binary(aipl[idx], "ipv6")) { - binary = nvlist_get_binary(aipl[idx], "ipv6", &size); - if (binary == NULL || cidr > 128 || size != sizeof(aip.ip6)) { - err = EINVAL; - goto out; - } - aip.family = AF_INET6; - memcpy(&aip.ip6, binary, sizeof(aip.ip6)); - } else { - continue; - } - aip.cidr = cidr; - - if ((err = wg_aip_add(&sc->sc_aips, peer, &aip)) != 0) { - goto out; - } - } - } - if (need_insert) { - wg_hashtable_peer_insert(&sc->sc_hashtable, peer); - if (sc->sc_ifp->if_link_state == LINK_STATE_UP) - wg_timers_enable(&peer->p_timers); - } - return (0); - -out: - if (need_insert) /* If we fail, only destroy if it was new. 
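Since wg_peer_add() above defines the nvlist schema a configurator must produce, a hedged userspace sketch of building one peer with a single IPv4 allowed-ip via libnv may help. The helper name is hypothetical; the key names are exactly those parsed above; error handling is omitted.

	#include <sys/nv.h>
	#include <netinet/in.h>
	#include <stdint.h>

	static nvlist_t *
	make_peer_nvl(const uint8_t pubkey[32], struct in_addr ip4, uint64_t cidr)
	{
		nvlist_t *peer, *aip;

		peer = nvlist_create(0);
		aip = nvlist_create(0);

		nvlist_add_binary(peer, "public-key", pubkey, 32);	// WG_KEY_SIZE
		nvlist_add_number(peer, "persistent-keepalive-interval", 25);

		nvlist_add_number(aip, "cidr", cidr);		// must be <= 32 for v4
		nvlist_add_binary(aip, "ipv4", &ip4, sizeof(ip4));

		// nvlist_add_nvlist_array() copies its elements
		nvlist_add_nvlist_array(peer, "allowed-ips",
		    (const nvlist_t *const *)&aip, 1);
		nvlist_destroy(aip);
		return (peer);
	}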
*/ - wg_peer_destroy(peer); - return (err); -} - -static int -wgc_set(struct wg_softc *sc, struct wg_data_io *wgd) -{ - uint8_t public[WG_KEY_SIZE], private[WG_KEY_SIZE]; - struct ifnet *ifp; - void *nvlpacked; - nvlist_t *nvl; - ssize_t size; - int err; - - ifp = sc->sc_ifp; - if (wgd->wgd_size == 0 || wgd->wgd_data == NULL) - return (EFAULT); - - sx_xlock(&sc->sc_lock); - - nvlpacked = malloc(wgd->wgd_size, M_TEMP, M_WAITOK); - err = copyin(wgd->wgd_data, nvlpacked, wgd->wgd_size); - if (err) - goto out; - nvl = nvlist_unpack(nvlpacked, wgd->wgd_size, 0); - if (nvl == NULL) { - err = EBADMSG; - goto out; - } - if (nvlist_exists_bool(nvl, "replace-peers") && - nvlist_get_bool(nvl, "replace-peers")) - wg_peer_remove_all(sc); - if (nvlist_exists_number(nvl, "listen-port")) { - uint64_t new_port = nvlist_get_number(nvl, "listen-port"); - if (new_port > UINT16_MAX) { - err = EINVAL; - goto out; - } - if (new_port != sc->sc_socket.so_port) { - if ((ifp->if_drv_flags & IFF_DRV_RUNNING) != 0) { - if ((err = wg_socket_init(sc, new_port)) != 0) - goto out; - } else - sc->sc_socket.so_port = new_port; - } - } - if (nvlist_exists_binary(nvl, "private-key")) { - const void *key = nvlist_get_binary(nvl, "private-key", &size); - if (size != WG_KEY_SIZE) { - err = EINVAL; - goto out; - } - - if (noise_local_keys(&sc->sc_local, NULL, private) != 0 || - timingsafe_bcmp(private, key, WG_KEY_SIZE) != 0) { - struct noise_local *local; - struct wg_peer *peer; - struct wg_hashtable *ht = &sc->sc_hashtable; - bool has_identity; - - if (curve25519_generate_public(public, key)) { - /* Peer conflict: remove conflicting peer. */ - if ((peer = wg_peer_lookup(sc, public)) != - NULL) { - wg_hashtable_peer_remove(ht, peer); - wg_peer_destroy(peer); - } - } - - /* - * Set the private key and invalidate all existing - * handshakes. - */ - local = &sc->sc_local; - noise_local_lock_identity(local); - /* Note: we might be removing the private key. */ - has_identity = noise_local_set_private(local, key) == 0; - mtx_lock(&ht->h_mtx); - CK_LIST_FOREACH(peer, &ht->h_peers_list, p_entry) { - noise_remote_precompute(&peer->p_remote); - wg_timers_event_reset_handshake_last_sent( - &peer->p_timers); - noise_remote_expire_current(&peer->p_remote); - } - mtx_unlock(&ht->h_mtx); - cookie_checker_update(&sc->sc_cookie, - has_identity ? public : NULL); - noise_local_unlock_identity(local); - } - } - if (nvlist_exists_number(nvl, "user-cookie")) { - uint64_t user_cookie = nvlist_get_number(nvl, "user-cookie"); - if (user_cookie > UINT32_MAX) { - err = EINVAL; - goto out; - } - wg_socket_set_cookie(sc, user_cookie); - } - if (nvlist_exists_nvlist_array(nvl, "peers")) { - size_t peercount; - const nvlist_t * const*nvl_peers; - - nvl_peers = nvlist_get_nvlist_array(nvl, "peers", &peercount); - for (int i = 0; i < peercount; i++) { - err = wg_peer_add(sc, nvl_peers[i]); - if (err != 0) - goto out; - } - } - - nvlist_destroy(nvl); -out: - free(nvlpacked, M_TEMP); - sx_xunlock(&sc->sc_lock); - return (err); -} - -static unsigned int -in_mask2len(struct in_addr *mask) -{ - unsigned int x, y; - uint8_t *p; - - p = (uint8_t *)mask; - for (x = 0; x < sizeof(*mask); x++) { - if (p[x] != 0xff) - break; - } - y = 0; - if (x < sizeof(*mask)) { - for (y = 0; y < NBBY; y++) { - if ((p[x] & (0x80 >> y)) == 0) - break; - } - } - return x * NBBY + y; -} - -static int -wg_peer_to_export(struct wg_peer *peer, struct wg_peer_export *exp) -{ - struct wg_endpoint *ep; - struct wg_aip *rt; - struct noise_remote *remote; - int i; - - /* Non-sleepable context. 
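For completeness, the userspace half of the contract that wgc_set() above implements: pack the nvlist and hand it to the kernel through the SIOCSWG ioctl in a struct wg_data_io (both declared in if_wg.h, removed further down in this diff). A minimal sketch with a hypothetical helper name:

	#include <sys/ioctl.h>
	#include <sys/nv.h>
	#include <stdlib.h>
	#include <string.h>
	// plus if_wg.h for struct wg_data_io and SIOCSWG

	static int
	wg_set(int s, const char *ifname, const nvlist_t *nvl)
	{
		struct wg_data_io wgd;
		int ret;

		memset(&wgd, 0, sizeof(wgd));
		strlcpy(wgd.wgd_name, ifname, sizeof(wgd.wgd_name));
		wgd.wgd_data = nvlist_pack(nvl, &wgd.wgd_size);	// kernel copyin()s this
		if (wgd.wgd_data == NULL)
			return (-1);
		ret = ioctl(s, SIOCSWG, &wgd);			// s: any datagram socket
		free(wgd.wgd_data);
		return (ret);
	}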
*/ - NET_EPOCH_ASSERT(); - - bzero(&exp->endpoint, sizeof(exp->endpoint)); - remote = &peer->p_remote; - ep = &peer->p_endpoint; - if (ep->e_remote.r_sa.sa_family != 0) { - exp->endpoint_sz = (ep->e_remote.r_sa.sa_family == AF_INET) ? - sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); - - memcpy(&exp->endpoint, &ep->e_remote, exp->endpoint_sz); - } - - /* We always export it. */ - (void)noise_remote_keys(remote, exp->public_key, exp->preshared_key); - exp->persistent_keepalive = - peer->p_timers.t_persistent_keepalive_interval; - wg_timers_get_last_handshake(&peer->p_timers, &exp->last_handshake); - exp->rx_bytes = counter_u64_fetch(peer->p_rx_bytes); - exp->tx_bytes = counter_u64_fetch(peer->p_tx_bytes); - - exp->aip_count = 0; - CK_LIST_FOREACH(rt, &peer->p_aips, r_entry) { - exp->aip_count++; - } - - /* Early success; no allowed-ips to copy out. */ - if (exp->aip_count == 0) - return (0); - - exp->aip = malloc(exp->aip_count * sizeof(*exp->aip), M_TEMP, M_NOWAIT); - if (exp->aip == NULL) - return (ENOMEM); - - i = 0; - CK_LIST_FOREACH(rt, &peer->p_aips, r_entry) { - exp->aip[i].family = rt->r_addr.ss_family; - if (exp->aip[i].family == AF_INET) { - struct sockaddr_in *sin = - (struct sockaddr_in *)&rt->r_addr; - - exp->aip[i].ip4 = sin->sin_addr; - - sin = (struct sockaddr_in *)&rt->r_mask; - exp->aip[i].cidr = in_mask2len(&sin->sin_addr); - } else if (exp->aip[i].family == AF_INET6) { - struct sockaddr_in6 *sin6 = - (struct sockaddr_in6 *)&rt->r_addr; - - exp->aip[i].ip6 = sin6->sin6_addr; - - sin6 = (struct sockaddr_in6 *)&rt->r_mask; - exp->aip[i].cidr = in6_mask2len(&sin6->sin6_addr, NULL); - } - i++; - if (i == exp->aip_count) - break; - } - - /* Again, AllowedIPs might have shrank; update it. */ - exp->aip_count = i; - - return (0); -} - -static nvlist_t * -wg_peer_export_to_nvl(struct wg_softc *sc, struct wg_peer_export *exp) -{ - struct wg_timespec64 ts64; - nvlist_t *nvl, **nvl_aips; - size_t i; - uint16_t family; - - nvl_aips = NULL; - if ((nvl = nvlist_create(0)) == NULL) - return (NULL); - - nvlist_add_binary(nvl, "public-key", exp->public_key, - sizeof(exp->public_key)); - if (wgc_privileged(sc)) - nvlist_add_binary(nvl, "preshared-key", exp->preshared_key, - sizeof(exp->preshared_key)); - if (exp->endpoint_sz != 0) - nvlist_add_binary(nvl, "endpoint", &exp->endpoint, - exp->endpoint_sz); - - if (exp->aip_count != 0) { - nvl_aips = mallocarray(exp->aip_count, sizeof(*nvl_aips), - M_WG, M_WAITOK | M_ZERO); - } - - for (i = 0; i < exp->aip_count; i++) { - nvl_aips[i] = nvlist_create(0); - if (nvl_aips[i] == NULL) - goto err; - family = exp->aip[i].family; - nvlist_add_number(nvl_aips[i], "cidr", exp->aip[i].cidr); - if (family == AF_INET) - nvlist_add_binary(nvl_aips[i], "ipv4", - &exp->aip[i].ip4, sizeof(exp->aip[i].ip4)); - else if (family == AF_INET6) - nvlist_add_binary(nvl_aips[i], "ipv6", - &exp->aip[i].ip6, sizeof(exp->aip[i].ip6)); - } - - if (i != 0) { - nvlist_add_nvlist_array(nvl, "allowed-ips", - (const nvlist_t *const *)nvl_aips, i); - } - - for (i = 0; i < exp->aip_count; ++i) - nvlist_destroy(nvl_aips[i]); - - free(nvl_aips, M_WG); - nvl_aips = NULL; - - ts64.tv_sec = exp->last_handshake.tv_sec; - ts64.tv_nsec = exp->last_handshake.tv_nsec; - nvlist_add_binary(nvl, "last-handshake-time", &ts64, sizeof(ts64)); - - if (exp->persistent_keepalive != 0) - nvlist_add_number(nvl, "persistent-keepalive-interval", - exp->persistent_keepalive); - - if (exp->rx_bytes != 0) - nvlist_add_number(nvl, "rx-bytes", exp->rx_bytes); - if (exp->tx_bytes != 0) - 
nvlist_add_number(nvl, "tx-bytes", exp->tx_bytes); - - return (nvl); -err: - for (i = 0; i < exp->aip_count && nvl_aips[i] != NULL; i++) { - nvlist_destroy(nvl_aips[i]); - } - - free(nvl_aips, M_WG); - nvlist_destroy(nvl); - return (NULL); -} - -static int -wg_marshal_peers(struct wg_softc *sc, nvlist_t **nvlp, nvlist_t ***nvl_arrayp, int *peer_countp) -{ - struct wg_peer *peer; - int err, i, peer_count; - nvlist_t *nvl, **nvl_array; - struct epoch_tracker et; - struct wg_peer_export *wpe; - - nvl = NULL; - nvl_array = NULL; - if (nvl_arrayp) - *nvl_arrayp = NULL; - if (nvlp) - *nvlp = NULL; - if (peer_countp) - *peer_countp = 0; - peer_count = sc->sc_hashtable.h_num_peers; - if (peer_count == 0) { - return (ENOENT); - } - - if (nvlp && (nvl = nvlist_create(0)) == NULL) - return (ENOMEM); - - err = i = 0; - nvl_array = malloc(peer_count*sizeof(void*), M_TEMP, M_WAITOK | M_ZERO); - wpe = malloc(peer_count*sizeof(*wpe), M_TEMP, M_WAITOK | M_ZERO); - - NET_EPOCH_ENTER(et); - CK_LIST_FOREACH(peer, &sc->sc_hashtable.h_peers_list, p_entry) { - if ((err = wg_peer_to_export(peer, &wpe[i])) != 0) { - break; - } - - i++; - if (i == peer_count) - break; - } - NET_EPOCH_EXIT(et); - - if (err != 0) - goto out; - - /* Update the peer count, in case we found fewer entries. */ - *peer_countp = peer_count = i; - if (peer_count == 0) { - err = ENOENT; - goto out; - } - - for (i = 0; i < peer_count; i++) { - int idx; - - /* - * Peers are added to the list in reverse order, effectively, - * because it's simpler/quicker to add at the head every time. - * - * Export them in reverse order. No worries if we fail mid-way - * through, the cleanup below will DTRT. - */ - idx = peer_count - i - 1; - nvl_array[idx] = wg_peer_export_to_nvl(sc, &wpe[i]); - if (nvl_array[idx] == NULL) { - break; - } - } - - if (i < peer_count) { - /* Error! */ - *peer_countp = 0; - err = ENOMEM; - } else if (nvl) { - nvlist_add_nvlist_array(nvl, "peers", - (const nvlist_t * const *)nvl_array, peer_count); - if ((err = nvlist_error(nvl))) { - goto out; - } - *nvlp = nvl; - } - *nvl_arrayp = nvl_array; - out: - if (err != 0) { - /* Note that nvl_array is populated in reverse order. 
*/ - for (i = 0; i < peer_count; i++) { - nvlist_destroy(nvl_array[i]); - } - - free(nvl_array, M_TEMP); - if (nvl != NULL) - nvlist_destroy(nvl); - } - - for (i = 0; i < peer_count; i++) - free(wpe[i].aip, M_TEMP); - free(wpe, M_TEMP); - return (err); -} - -static int -wgc_get(struct wg_softc *sc, struct wg_data_io *wgd) -{ - nvlist_t *nvl, **nvl_array; - void *packed; - size_t size; - int peer_count, err; - - nvl = nvlist_create(0); - if (nvl == NULL) - return (ENOMEM); - - sx_slock(&sc->sc_lock); - - err = 0; - packed = NULL; - if (sc->sc_socket.so_port != 0) - nvlist_add_number(nvl, "listen-port", sc->sc_socket.so_port); - if (sc->sc_socket.so_user_cookie != 0) - nvlist_add_number(nvl, "user-cookie", sc->sc_socket.so_user_cookie); - if (sc->sc_local.l_has_identity) { - nvlist_add_binary(nvl, "public-key", sc->sc_local.l_public, WG_KEY_SIZE); - if (wgc_privileged(sc)) - nvlist_add_binary(nvl, "private-key", sc->sc_local.l_private, WG_KEY_SIZE); - } - if (sc->sc_hashtable.h_num_peers > 0) { - err = wg_marshal_peers(sc, NULL, &nvl_array, &peer_count); - if (err) - goto out_nvl; - nvlist_add_nvlist_array(nvl, "peers", - (const nvlist_t * const *)nvl_array, peer_count); - } - packed = nvlist_pack(nvl, &size); - if (packed == NULL) { - err = ENOMEM; - goto out_nvl; - } - if (wgd->wgd_size == 0) { - wgd->wgd_size = size; - goto out_packed; - } - if (wgd->wgd_size < size) { - err = ENOSPC; - goto out_packed; - } - if (wgd->wgd_data == NULL) { - err = EFAULT; - goto out_packed; - } - err = copyout(packed, wgd->wgd_data, size); - wgd->wgd_size = size; - -out_packed: - free(packed, M_NVLIST); -out_nvl: - nvlist_destroy(nvl); - sx_sunlock(&sc->sc_lock); - return (err); -} - -static int -wg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) -{ - struct wg_data_io *wgd = (struct wg_data_io *)data; - struct ifreq *ifr = (struct ifreq *)data; - struct wg_softc *sc = ifp->if_softc; - int ret = 0; - - switch (cmd) { - case SIOCSWG: - ret = priv_check(curthread, PRIV_NET_WG); - if (ret == 0) - ret = wgc_set(sc, wgd); - break; - case SIOCGWG: - ret = wgc_get(sc, wgd); - break; - /* Interface IOCTLs */ - case SIOCSIFADDR: - /* - * This differs from *BSD norms, but is more uniform with how - * WireGuard behaves elsewhere. - */ - break; - case SIOCSIFFLAGS: - if ((ifp->if_flags & IFF_UP) != 0) - ret = wg_up(sc); - else - wg_down(sc); - break; - case SIOCSIFMTU: - if (ifr->ifr_mtu <= 0 || ifr->ifr_mtu > MAX_MTU) - ret = EINVAL; - else - ifp->if_mtu = ifr->ifr_mtu; - break; - case SIOCADDMULTI: - case SIOCDELMULTI: - break; - default: - ret = ENOTTY; - } - - return ret; -} - -static int -wg_up(struct wg_softc *sc) -{ - struct wg_hashtable *ht = &sc->sc_hashtable; - struct ifnet *ifp = sc->sc_ifp; - struct wg_peer *peer; - int rc = EBUSY; - - sx_xlock(&sc->sc_lock); - /* Jail's being removed, no more wg_up(). */ - if ((sc->sc_flags & WGF_DYING) != 0) - goto out; - - /* Silent success if we're already running. 
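wgc_get() above deliberately supports a size probe: when wgd_size is 0 it reports the packed size and returns without copying anything out. The conventional userspace read is therefore two ioctls, sketched under the same assumptions as the set-side example:

	static nvlist_t *
	wg_get(int s, const char *ifname)
	{
		struct wg_data_io wgd;
		nvlist_t *nvl = NULL;

		memset(&wgd, 0, sizeof(wgd));
		strlcpy(wgd.wgd_name, ifname, sizeof(wgd.wgd_name));
		if (ioctl(s, SIOCGWG, &wgd) != 0)	// probe: only fills wgd_size
			return (NULL);
		if ((wgd.wgd_data = malloc(wgd.wgd_size)) == NULL)
			return (NULL);
		if (ioctl(s, SIOCGWG, &wgd) == 0)	// real transfer
			nvl = nvlist_unpack(wgd.wgd_data, wgd.wgd_size, 0);
		free(wgd.wgd_data);
		return (nvl);
	}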
*/ - rc = 0; - if (ifp->if_drv_flags & IFF_DRV_RUNNING) - goto out; - ifp->if_drv_flags |= IFF_DRV_RUNNING; - - rc = wg_socket_init(sc, sc->sc_socket.so_port); - if (rc == 0) { - mtx_lock(&ht->h_mtx); - CK_LIST_FOREACH(peer, &ht->h_peers_list, p_entry) { - wg_timers_enable(&peer->p_timers); - wg_queue_out(peer); - } - mtx_unlock(&ht->h_mtx); - - if_link_state_change(sc->sc_ifp, LINK_STATE_UP); - } else { - ifp->if_drv_flags &= ~IFF_DRV_RUNNING; - } -out: - sx_xunlock(&sc->sc_lock); - return (rc); -} - -static void -wg_down(struct wg_softc *sc) -{ - struct wg_hashtable *ht = &sc->sc_hashtable; - struct ifnet *ifp = sc->sc_ifp; - struct wg_peer *peer; - - sx_xlock(&sc->sc_lock); - if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { - sx_xunlock(&sc->sc_lock); - return; - } - ifp->if_drv_flags &= ~IFF_DRV_RUNNING; - - mtx_lock(&ht->h_mtx); - CK_LIST_FOREACH(peer, &ht->h_peers_list, p_entry) { - wg_queue_purge(&peer->p_stage_queue); - wg_timers_disable(&peer->p_timers); - } - mtx_unlock(&ht->h_mtx); - - mbufq_drain(&sc->sc_handshake_queue); - - mtx_lock(&ht->h_mtx); - CK_LIST_FOREACH(peer, &ht->h_peers_list, p_entry) { - noise_remote_clear(&peer->p_remote); - wg_timers_event_reset_handshake_last_sent(&peer->p_timers); - } - mtx_unlock(&ht->h_mtx); - - if_link_state_change(sc->sc_ifp, LINK_STATE_DOWN); - wg_socket_uninit(sc); - - sx_xunlock(&sc->sc_lock); -} - -static void -crypto_taskq_setup(struct wg_softc *sc) -{ - - sc->sc_encrypt = malloc(sizeof(struct grouptask)*mp_ncpus, M_WG, M_WAITOK); - sc->sc_decrypt = malloc(sizeof(struct grouptask)*mp_ncpus, M_WG, M_WAITOK); - - for (int i = 0; i < mp_ncpus; i++) { - GROUPTASK_INIT(&sc->sc_encrypt[i], 0, - (gtask_fn_t *)wg_softc_encrypt, sc); - taskqgroup_attach_cpu(qgroup_if_io_tqg, &sc->sc_encrypt[i], sc, i, NULL, NULL, "wg encrypt"); - GROUPTASK_INIT(&sc->sc_decrypt[i], 0, - (gtask_fn_t *)wg_softc_decrypt, sc); - taskqgroup_attach_cpu(qgroup_if_io_tqg, &sc->sc_decrypt[i], sc, i, NULL, NULL, "wg decrypt"); - } -} - -static void -crypto_taskq_destroy(struct wg_softc *sc) -{ - for (int i = 0; i < mp_ncpus; i++) { - taskqgroup_detach(qgroup_if_io_tqg, &sc->sc_encrypt[i]); - taskqgroup_detach(qgroup_if_io_tqg, &sc->sc_decrypt[i]); - } - free(sc->sc_encrypt, M_WG); - free(sc->sc_decrypt, M_WG); -} - -static int -wg_clone_create(struct if_clone *ifc, int unit, caddr_t params) -{ - struct wg_softc *sc; - struct ifnet *ifp; - struct noise_upcall noise_upcall; - - sc = malloc(sizeof(*sc), M_WG, M_WAITOK | M_ZERO); - sc->sc_ucred = crhold(curthread->td_ucred); - ifp = sc->sc_ifp = if_alloc(IFT_WIREGUARD); - ifp->if_softc = sc; - if_initname(ifp, wgname, unit); - - noise_upcall.u_arg = sc; - noise_upcall.u_remote_get = - (struct noise_remote *(*)(void *, uint8_t *))wg_remote_get; - noise_upcall.u_index_set = - (uint32_t (*)(void *, struct noise_remote *))wg_index_set; - noise_upcall.u_index_drop = - (void (*)(void *, uint32_t))wg_index_drop; - noise_local_init(&sc->sc_local, &noise_upcall); - cookie_checker_init(&sc->sc_cookie, ratelimit_zone); - - sc->sc_socket.so_port = 0; - - atomic_add_int(&clone_count, 1); - ifp->if_capabilities = ifp->if_capenable = WG_CAPS; - - mbufq_init(&sc->sc_handshake_queue, MAX_QUEUED_HANDSHAKES); - sx_init(&sc->sc_lock, "wg softc lock"); - rw_init(&sc->sc_index_lock, "wg index lock"); - sc->sc_peer_count = 0; - sc->sc_encap_ring = buf_ring_alloc(MAX_QUEUED_PKT, M_WG, M_WAITOK, NULL); - sc->sc_decap_ring = buf_ring_alloc(MAX_QUEUED_PKT, M_WG, M_WAITOK, NULL); - GROUPTASK_INIT(&sc->sc_handshake, 0, - (gtask_fn_t 
*)wg_softc_handshake_receive, sc); - taskqgroup_attach(qgroup_if_io_tqg, &sc->sc_handshake, sc, NULL, NULL, "wg tx initiation"); - crypto_taskq_setup(sc); - - wg_hashtable_init(&sc->sc_hashtable); - sc->sc_index = hashinit(HASHTABLE_INDEX_SIZE, M_DEVBUF, &sc->sc_index_mask); - wg_aip_init(&sc->sc_aips); - - if_setmtu(ifp, ETHERMTU - 80); - ifp->if_flags = IFF_BROADCAST | IFF_MULTICAST | IFF_NOARP; - ifp->if_init = wg_init; - ifp->if_reassign = wg_reassign; - ifp->if_qflush = wg_qflush; - ifp->if_transmit = wg_transmit; - ifp->if_output = wg_output; - ifp->if_ioctl = wg_ioctl; - - if_attach(ifp); - bpfattach(ifp, DLT_NULL, sizeof(uint32_t)); - - sx_xlock(&wg_sx); - LIST_INSERT_HEAD(&wg_list, sc, sc_entry); - sx_xunlock(&wg_sx); - - return 0; -} - -static void -wg_clone_destroy(struct ifnet *ifp) -{ - struct wg_softc *sc = ifp->if_softc; - struct ucred *cred; - - sx_xlock(&wg_sx); - sx_xlock(&sc->sc_lock); - sc->sc_flags |= WGF_DYING; - cred = sc->sc_ucred; - sc->sc_ucred = NULL; - sx_xunlock(&sc->sc_lock); - LIST_REMOVE(sc, sc_entry); - sx_xunlock(&wg_sx); - - if_link_state_change(sc->sc_ifp, LINK_STATE_DOWN); - - sx_xlock(&sc->sc_lock); - wg_socket_uninit(sc); - sx_xunlock(&sc->sc_lock); - - /* - * No guarantees that all traffic have passed until the epoch has - * elapsed with the socket closed. - */ - NET_EPOCH_WAIT(); - - taskqgroup_drain_all(qgroup_if_io_tqg); - sx_xlock(&sc->sc_lock); - wg_peer_remove_all(sc); - epoch_drain_callbacks(net_epoch_preempt); - sx_xunlock(&sc->sc_lock); - sx_destroy(&sc->sc_lock); - rw_destroy(&sc->sc_index_lock); - taskqgroup_detach(qgroup_if_io_tqg, &sc->sc_handshake); - crypto_taskq_destroy(sc); - buf_ring_free(sc->sc_encap_ring, M_WG); - buf_ring_free(sc->sc_decap_ring, M_WG); - - wg_aip_destroy(&sc->sc_aips); - wg_hashtable_destroy(&sc->sc_hashtable); - - if (cred != NULL) - crfree(cred); - if_detach(sc->sc_ifp); - if_free(sc->sc_ifp); - /* Ensure any local/private keys are cleaned up */ - explicit_bzero(sc, sizeof(*sc)); - free(sc, M_WG); - - atomic_add_int(&clone_count, -1); -} - -static void -wg_qflush(struct ifnet *ifp __unused) -{ -} - -/* - * Privileged information (private-key, preshared-key) are only exported for - * root and jailed root by default. - */ -static bool -wgc_privileged(struct wg_softc *sc) -{ - struct thread *td; - - td = curthread; - return (priv_check(td, PRIV_NET_WG) == 0); -} - -static void -wg_reassign(struct ifnet *ifp, struct vnet *new_vnet __unused, - char *unused __unused) -{ - struct wg_softc *sc; - - sc = ifp->if_softc; - wg_down(sc); -} - -static void -wg_init(void *xsc) -{ - struct wg_softc *sc; - - sc = xsc; - wg_up(sc); -} - -static void -vnet_wg_init(const void *unused __unused) -{ - - V_wg_cloner = if_clone_simple(wgname, wg_clone_create, wg_clone_destroy, - 0); -} -VNET_SYSINIT(vnet_wg_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, - vnet_wg_init, NULL); - -static void -vnet_wg_uninit(const void *unused __unused) -{ - - if_clone_detach(V_wg_cloner); -} -VNET_SYSUNINIT(vnet_wg_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, - vnet_wg_uninit, NULL); - -static int -wg_prison_remove(void *obj, void *data __unused) -{ - const struct prison *pr = obj; - struct wg_softc *sc; - struct ucred *cred; - bool dying; - - /* - * Do a pass through all if_wg interfaces and release creds on any from - * the jail that are supposed to be going away. This will, in turn, let - * the jail die so that we don't end up with Schrödinger's jail. 
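A note on if_setmtu(ifp, ETHERMTU - 80) in wg_clone_create() above: 80 bytes is WireGuard's worst-case per-packet overhead, taking the larger IPv6 outer header. The breakdown below assumes the standard WireGuard data-header layout, which this hunk does not itself show:

	40  outer IPv6 header (20 for IPv4)
	 8  UDP header
	16  struct wg_pkt_data header: 4 type + 4 receiver index + 8 counter
	16  Poly1305 authentication tag (NOISE_AUTHTAG_LEN)
	--
	80  bytes total, hence ETHERMTU - 80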
- */ - sx_slock(&wg_sx); - LIST_FOREACH(sc, &wg_list, sc_entry) { - cred = NULL; - - sx_xlock(&sc->sc_lock); - dying = (sc->sc_flags & WGF_DYING) != 0; - if (!dying && sc->sc_ucred != NULL && - sc->sc_ucred->cr_prison == pr) { - /* Home jail is going away. */ - cred = sc->sc_ucred; - sc->sc_ucred = NULL; - - sc->sc_flags |= WGF_DYING; - } - - /* - * If this is our foreign vnet going away, we'll also down the - * link and kill the socket because traffic needs to stop. Any - * address will be revoked in the rehoming process. - */ - if (cred != NULL || (!dying && - sc->sc_ifp->if_vnet == pr->pr_vnet)) { - if_link_state_change(sc->sc_ifp, LINK_STATE_DOWN); - /* Have to kill the sockets, as they also hold refs. */ - wg_socket_uninit(sc); - } - - sx_xunlock(&sc->sc_lock); - - if (cred != NULL) { - CURVNET_SET(sc->sc_ifp->if_vnet); - if_purgeaddrs(sc->sc_ifp); - CURVNET_RESTORE(); - crfree(cred); - } - } - sx_sunlock(&wg_sx); - - return (0); -} - -static void -wg_module_init(void) -{ - osd_method_t methods[PR_MAXMETHOD] = { - [PR_METHOD_REMOVE] = wg_prison_remove, - }; - - ratelimit_zone = uma_zcreate("wg ratelimit", sizeof(struct ratelimit), - NULL, NULL, NULL, NULL, 0, 0); - wg_osd_jail_slot = osd_jail_register(NULL, methods); -} - -static void -wg_module_deinit(void) -{ - - uma_zdestroy(ratelimit_zone); - osd_jail_deregister(wg_osd_jail_slot); - - MPASS(LIST_EMPTY(&wg_list)); -} - -static int -wg_module_event_handler(module_t mod, int what, void *arg) -{ - - switch (what) { - case MOD_LOAD: - wg_module_init(); - break; - case MOD_UNLOAD: - if (atomic_load_int(&clone_count) == 0) - wg_module_deinit(); - else - return (EBUSY); - break; - default: - return (EOPNOTSUPP); - } - return (0); -} - -static moduledata_t wg_moduledata = { - "wg", - wg_module_event_handler, - NULL -}; - -DECLARE_MODULE(wg, wg_moduledata, SI_SUB_PSEUDO, SI_ORDER_ANY); -MODULE_VERSION(wg, 1); -MODULE_DEPEND(wg, crypto, 1, 1, 1); diff --git a/sys/dev/if_wg/if_wg.h b/sys/dev/if_wg/if_wg.h deleted file mode 100644 index 2a100456d406..000000000000 --- a/sys/dev/if_wg/if_wg.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2019 Matt Dunwoodie - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - * - * $FreeBSD$ - */ - -#ifndef __IF_WG_H__ -#define __IF_WG_H__ - -#include -#include - -struct wg_data_io { - char wgd_name[IFNAMSIZ]; - void *wgd_data; - size_t wgd_size; -}; - -#define WG_KEY_SIZE 32 - -#define SIOCSWG _IOWR('i', 210, struct wg_data_io) -#define SIOCGWG _IOWR('i', 211, struct wg_data_io) - -#endif /* __IF_WG_H__ */ diff --git a/sys/dev/if_wg/support.h b/sys/dev/if_wg/support.h deleted file mode 100644 index 412806b465af..000000000000 --- a/sys/dev/if_wg/support.h +++ /dev/null @@ -1,56 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Copyright (C) 2021 Jason A. Donenfeld . All Rights Reserved. 
- * Copyright (C) 2021 Matt Dunwoodie - */ - -#ifndef _WG_SUPPORT -#define _WG_SUPPORT - -#include -#include -#include -#include -#include -#include -#include -#include - -/* TODO the following is openbsd compat defines to allow us to copy the wg_* - * files from openbsd (almost) verbatim. this will greatly increase maintenance - * across the platforms. it should be moved to it's own file. the only thing - * we're missing from this is struct pool (freebsd: uma_zone_t), which isn't a - * show stopper, but is something worth considering in the future. - * - md */ - -#define rw_assert_wrlock(x) rw_assert(x, RA_WLOCKED) -#define rw_enter_write rw_wlock -#define rw_exit_write rw_wunlock -#define rw_enter_read rw_rlock -#define rw_exit_read rw_runlock -#define rw_exit rw_unlock - -#define RW_DOWNGRADE 1 -#define rw_enter(x, y) do { \ - CTASSERT(y == RW_DOWNGRADE); \ - rw_downgrade(x); \ -} while (0) - -MALLOC_DECLARE(M_WG); - -#include -typedef struct { - uint64_t k0; - uint64_t k1; -} SIPHASH_KEY; - -static inline uint64_t -siphash24(const SIPHASH_KEY *key, const void *src, size_t len) -{ - SIPHASH_CTX ctx; - - return (SipHashX(&ctx, 2, 4, (const uint8_t *)key, src, len)); -} -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) - -#endif diff --git a/sys/dev/if_wg/wg_cookie.c b/sys/dev/if_wg/wg_cookie.c deleted file mode 100644 index c56b2fb2e75e..000000000000 --- a/sys/dev/if_wg/wg_cookie.c +++ /dev/null @@ -1,438 +0,0 @@ -/* - * Copyright (C) 2015-2020 Jason A. Donenfeld . All Rights Reserved. - * Copyright (C) 2019-2020 Matt Dunwoodie - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
- */ - -#include -#include -#include -#include -#include /* Because systm doesn't include M_NOWAIT, M_DEVBUF */ -#include - -#include "support.h" -#include "wg_cookie.h" - -static void cookie_precompute_key(uint8_t *, - const uint8_t[COOKIE_INPUT_SIZE], const char *); -static void cookie_macs_mac1(struct cookie_macs *, const void *, size_t, - const uint8_t[COOKIE_KEY_SIZE]); -static void cookie_macs_mac2(struct cookie_macs *, const void *, size_t, - const uint8_t[COOKIE_COOKIE_SIZE]); -static int cookie_timer_expired(struct timespec *, time_t, long); -static void cookie_checker_make_cookie(struct cookie_checker *, - uint8_t[COOKIE_COOKIE_SIZE], struct sockaddr *); -static int ratelimit_init(struct ratelimit *, uma_zone_t); -static void ratelimit_deinit(struct ratelimit *); -static void ratelimit_gc(struct ratelimit *, int); -static int ratelimit_allow(struct ratelimit *, struct sockaddr *); - -/* Public Functions */ -void -cookie_maker_init(struct cookie_maker *cp, const uint8_t key[COOKIE_INPUT_SIZE]) -{ - bzero(cp, sizeof(*cp)); - cookie_precompute_key(cp->cp_mac1_key, key, COOKIE_MAC1_KEY_LABEL); - cookie_precompute_key(cp->cp_cookie_key, key, COOKIE_COOKIE_KEY_LABEL); - rw_init(&cp->cp_lock, "cookie_maker"); -} - -int -cookie_checker_init(struct cookie_checker *cc, uma_zone_t zone) -{ - int res; - bzero(cc, sizeof(*cc)); - - rw_init(&cc->cc_key_lock, "cookie_checker_key"); - rw_init(&cc->cc_secret_lock, "cookie_checker_secret"); - - if ((res = ratelimit_init(&cc->cc_ratelimit_v4, zone)) != 0) - return res; -#ifdef INET6 - if ((res = ratelimit_init(&cc->cc_ratelimit_v6, zone)) != 0) { - ratelimit_deinit(&cc->cc_ratelimit_v4); - return res; - } -#endif - return 0; -} - -void -cookie_checker_update(struct cookie_checker *cc, - const uint8_t key[COOKIE_INPUT_SIZE]) -{ - rw_enter_write(&cc->cc_key_lock); - if (key) { - cookie_precompute_key(cc->cc_mac1_key, key, COOKIE_MAC1_KEY_LABEL); - cookie_precompute_key(cc->cc_cookie_key, key, COOKIE_COOKIE_KEY_LABEL); - } else { - bzero(cc->cc_mac1_key, sizeof(cc->cc_mac1_key)); - bzero(cc->cc_cookie_key, sizeof(cc->cc_cookie_key)); - } - rw_exit_write(&cc->cc_key_lock); -} - -void -cookie_checker_deinit(struct cookie_checker *cc) -{ - ratelimit_deinit(&cc->cc_ratelimit_v4); -#ifdef INET6 - ratelimit_deinit(&cc->cc_ratelimit_v6); -#endif -} - -void -cookie_checker_create_payload(struct cookie_checker *cc, - struct cookie_macs *cm, uint8_t nonce[COOKIE_NONCE_SIZE], - uint8_t ecookie[COOKIE_ENCRYPTED_SIZE], struct sockaddr *sa) -{ - uint8_t cookie[COOKIE_COOKIE_SIZE]; - - cookie_checker_make_cookie(cc, cookie, sa); - arc4random_buf(nonce, COOKIE_NONCE_SIZE); - - rw_enter_read(&cc->cc_key_lock); - xchacha20poly1305_encrypt(ecookie, cookie, COOKIE_COOKIE_SIZE, - cm->mac1, COOKIE_MAC_SIZE, nonce, cc->cc_cookie_key); - rw_exit_read(&cc->cc_key_lock); - - explicit_bzero(cookie, sizeof(cookie)); -} - -int -cookie_maker_consume_payload(struct cookie_maker *cp, - uint8_t nonce[COOKIE_NONCE_SIZE], uint8_t ecookie[COOKIE_ENCRYPTED_SIZE]) -{ - int ret = 0; - uint8_t cookie[COOKIE_COOKIE_SIZE]; - - rw_enter_write(&cp->cp_lock); - - if (cp->cp_mac1_valid == 0) { - ret = ETIMEDOUT; - goto error; - } - - if (xchacha20poly1305_decrypt(cookie, ecookie, COOKIE_ENCRYPTED_SIZE, - cp->cp_mac1_last, COOKIE_MAC_SIZE, nonce, cp->cp_cookie_key) == 0) { - ret = EINVAL; - goto error; - } - - memcpy(cp->cp_cookie, cookie, COOKIE_COOKIE_SIZE); - getnanouptime(&cp->cp_birthdate); - cp->cp_mac1_valid = 0; - -error: - rw_exit_write(&cp->cp_lock); - return ret; -} - -void 
-cookie_maker_mac(struct cookie_maker *cp, struct cookie_macs *cm, void *buf, - size_t len) -{ - rw_enter_read(&cp->cp_lock); - - cookie_macs_mac1(cm, buf, len, cp->cp_mac1_key); - - memcpy(cp->cp_mac1_last, cm->mac1, COOKIE_MAC_SIZE); - cp->cp_mac1_valid = 1; - - if (!cookie_timer_expired(&cp->cp_birthdate, - COOKIE_SECRET_MAX_AGE - COOKIE_SECRET_LATENCY, 0)) - cookie_macs_mac2(cm, buf, len, cp->cp_cookie); - else - bzero(cm->mac2, COOKIE_MAC_SIZE); - - rw_exit_read(&cp->cp_lock); -} - -int -cookie_checker_validate_macs(struct cookie_checker *cc, struct cookie_macs *cm, - void *buf, size_t len, int busy, struct sockaddr *sa) -{ - struct cookie_macs our_cm; - uint8_t cookie[COOKIE_COOKIE_SIZE]; - - /* Validate incoming MACs */ - rw_enter_read(&cc->cc_key_lock); - cookie_macs_mac1(&our_cm, buf, len, cc->cc_mac1_key); - rw_exit_read(&cc->cc_key_lock); - - /* If mac1 is invalid, we want to drop the packet */ - if (timingsafe_bcmp(our_cm.mac1, cm->mac1, COOKIE_MAC_SIZE) != 0) - return EINVAL; - - if (busy != 0) { - cookie_checker_make_cookie(cc, cookie, sa); - cookie_macs_mac2(&our_cm, buf, len, cookie); - - /* If the mac2 is invalid, we want to send a cookie response */ - if (timingsafe_bcmp(our_cm.mac2, cm->mac2, COOKIE_MAC_SIZE) != 0) - return EAGAIN; - - /* If the mac2 is valid, we may want to rate limit the peer. - * ratelimit_allow will return either 0 or ECONNREFUSED, - * implying there is no ratelimiting, or we should ratelimit - * (refuse) respectively. */ - if (sa->sa_family == AF_INET) - return ratelimit_allow(&cc->cc_ratelimit_v4, sa); -#ifdef INET6 - else if (sa->sa_family == AF_INET6) - return ratelimit_allow(&cc->cc_ratelimit_v6, sa); -#endif - else - return EAFNOSUPPORT; - } - return 0; -} - -/* Private functions */ -static void -cookie_precompute_key(uint8_t *key, const uint8_t input[COOKIE_INPUT_SIZE], - const char *label) -{ - struct blake2s_state blake; - - blake2s_init(&blake, COOKIE_KEY_SIZE); - blake2s_update(&blake, label, strlen(label)); - blake2s_update(&blake, input, COOKIE_INPUT_SIZE); - /* TODO we shouldn't need to provide outlen to _final. we can align - * this with openbsd after fixing the blake library. */ - blake2s_final(&blake, key); -} - -static void -cookie_macs_mac1(struct cookie_macs *cm, const void *buf, size_t len, - const uint8_t key[COOKIE_KEY_SIZE]) -{ - struct blake2s_state state; - blake2s_init_key(&state, COOKIE_MAC_SIZE, key, COOKIE_KEY_SIZE); - blake2s_update(&state, buf, len); - blake2s_final(&state, cm->mac1); -} - -static void -cookie_macs_mac2(struct cookie_macs *cm, const void *buf, size_t len, - const uint8_t key[COOKIE_COOKIE_SIZE]) -{ - struct blake2s_state state; - blake2s_init_key(&state, COOKIE_MAC_SIZE, key, COOKIE_COOKIE_SIZE); - blake2s_update(&state, buf, len); - blake2s_update(&state, cm->mac1, COOKIE_MAC_SIZE); - blake2s_final(&state, cm->mac2); -} - -static int -cookie_timer_expired(struct timespec *birthdate, time_t sec, long nsec) -{ - struct timespec uptime; - struct timespec expire = { .tv_sec = sec, .tv_nsec = nsec }; - - if (birthdate->tv_sec == 0 && birthdate->tv_nsec == 0) - return ETIMEDOUT; - - getnanouptime(&uptime); - timespecadd(birthdate, &expire, &expire); - return timespeccmp(&uptime, &expire, >) ?
ETIMEDOUT : 0; -} - -static void -cookie_checker_make_cookie(struct cookie_checker *cc, - uint8_t cookie[COOKIE_COOKIE_SIZE], struct sockaddr *sa) -{ - struct blake2s_state state; - - rw_enter_write(&cc->cc_secret_lock); - if (cookie_timer_expired(&cc->cc_secret_birthdate, - COOKIE_SECRET_MAX_AGE, 0)) { - arc4random_buf(cc->cc_secret, COOKIE_SECRET_SIZE); - getnanouptime(&cc->cc_secret_birthdate); - } - blake2s_init_key(&state, COOKIE_COOKIE_SIZE, cc->cc_secret, - COOKIE_SECRET_SIZE); - rw_exit_write(&cc->cc_secret_lock); - - if (sa->sa_family == AF_INET) { - blake2s_update(&state, (uint8_t *)&satosin(sa)->sin_addr, - sizeof(struct in_addr)); - blake2s_update(&state, (uint8_t *)&satosin(sa)->sin_port, - sizeof(in_port_t)); - blake2s_final(&state, cookie); -#ifdef INET6 - } else if (sa->sa_family == AF_INET6) { - blake2s_update(&state, (uint8_t *)&satosin6(sa)->sin6_addr, - sizeof(struct in6_addr)); - blake2s_update(&state, (uint8_t *)&satosin6(sa)->sin6_port, - sizeof(in_port_t)); - blake2s_final(&state, cookie); -#endif - } else { - arc4random_buf(cookie, COOKIE_COOKIE_SIZE); - } -} - -static int -ratelimit_init(struct ratelimit *rl, uma_zone_t zone) -{ - rw_init(&rl->rl_lock, "ratelimit_lock"); - arc4random_buf(&rl->rl_secret, sizeof(rl->rl_secret)); - rl->rl_table = hashinit_flags(RATELIMIT_SIZE, M_DEVBUF, - &rl->rl_table_mask, M_NOWAIT); - rl->rl_zone = zone; - rl->rl_table_num = 0; - return rl->rl_table == NULL ? ENOBUFS : 0; -} - -static void -ratelimit_deinit(struct ratelimit *rl) -{ - rw_enter_write(&rl->rl_lock); - ratelimit_gc(rl, 1); - hashdestroy(rl->rl_table, M_DEVBUF, rl->rl_table_mask); - rw_exit_write(&rl->rl_lock); -} - -static void -ratelimit_gc(struct ratelimit *rl, int force) -{ - size_t i; - struct ratelimit_entry *r, *tr; - struct timespec expiry; - - rw_assert_wrlock(&rl->rl_lock); - - if (force) { - for (i = 0; i < RATELIMIT_SIZE; i++) { - LIST_FOREACH_SAFE(r, &rl->rl_table[i], r_entry, tr) { - rl->rl_table_num--; - LIST_REMOVE(r, r_entry); - uma_zfree(rl->rl_zone, r); - } - } - return; - } - - if ((cookie_timer_expired(&rl->rl_last_gc, ELEMENT_TIMEOUT, 0) && - rl->rl_table_num > 0)) { - getnanouptime(&rl->rl_last_gc); - getnanouptime(&expiry); - expiry.tv_sec -= ELEMENT_TIMEOUT; - - for (i = 0; i < RATELIMIT_SIZE; i++) { - LIST_FOREACH_SAFE(r, &rl->rl_table[i], r_entry, tr) { - if (timespeccmp(&r->r_last_time, &expiry, <)) { - rl->rl_table_num--; - LIST_REMOVE(r, r_entry); - uma_zfree(rl->rl_zone, r); - } - } - } - } -} - -static int -ratelimit_allow(struct ratelimit *rl, struct sockaddr *sa) -{ - uint64_t key, tokens; - struct timespec diff; - struct ratelimit_entry *r; - int ret = ECONNREFUSED; - - if (sa->sa_family == AF_INET) - /* TODO siphash24 is the FreeBSD siphash, OK? */ - key = siphash24(&rl->rl_secret, &satosin(sa)->sin_addr, - IPV4_MASK_SIZE); -#ifdef INET6 - else if (sa->sa_family == AF_INET6) - key = siphash24(&rl->rl_secret, &satosin6(sa)->sin6_addr, - IPV6_MASK_SIZE); -#endif - else - return ret; - - rw_enter_write(&rl->rl_lock); - - LIST_FOREACH(r, &rl->rl_table[key & rl->rl_table_mask], r_entry) { - if (r->r_af != sa->sa_family) - continue; - - if (r->r_af == AF_INET && bcmp(&r->r_in, - &satosin(sa)->sin_addr, IPV4_MASK_SIZE) != 0) - continue; - -#ifdef INET6 - if (r->r_af == AF_INET6 && bcmp(&r->r_in6, - &satosin6(sa)->sin6_addr, IPV6_MASK_SIZE) != 0) - continue; -#endif - - /* If we get to here, we've found an entry for the endpoint. 
- * We apply standard token bucket, by calculating the time - * elapsed since our last_time, adding that, ensuring that we - * cap the tokens at TOKEN_MAX. If the endpoint has no tokens - * left (that is tokens < INITIATION_COST) then we block the - * request, otherwise we subtract INITIATION_COST and - * return OK. With the defaults in wg_cookie.h - * (INITIATIONS_PER_SECOND 20, INITIATIONS_BURSTABLE 5), a peer - * may burst 5 initiations and then sustain 20 per second. */ - diff = r->r_last_time; - getnanouptime(&r->r_last_time); - timespecsub(&r->r_last_time, &diff, &diff); - - tokens = r->r_tokens + diff.tv_sec * NSEC_PER_SEC + diff.tv_nsec; - - if (tokens > TOKEN_MAX) - tokens = TOKEN_MAX; - - if (tokens >= INITIATION_COST) { - r->r_tokens = tokens - INITIATION_COST; - goto ok; - } else { - r->r_tokens = tokens; - goto error; - } - } - - /* If we get to here, we didn't have an entry for the endpoint. */ - ratelimit_gc(rl, 0); - - /* Hard limit on number of entries */ - if (rl->rl_table_num >= RATELIMIT_SIZE_MAX) - goto error; - - /* Goto error if out of memory */ - if ((r = uma_zalloc(rl->rl_zone, M_NOWAIT)) == NULL) - goto error; - - rl->rl_table_num++; - - /* Insert entry into the hashtable and ensure it's initialised */ - LIST_INSERT_HEAD(&rl->rl_table[key & rl->rl_table_mask], r, r_entry); - r->r_af = sa->sa_family; - if (r->r_af == AF_INET) - memcpy(&r->r_in, &satosin(sa)->sin_addr, IPV4_MASK_SIZE); -#ifdef INET6 - else if (r->r_af == AF_INET6) - memcpy(&r->r_in6, &satosin6(sa)->sin6_addr, IPV6_MASK_SIZE); -#endif - - getnanouptime(&r->r_last_time); - r->r_tokens = TOKEN_MAX - INITIATION_COST; -ok: - ret = 0; -error: - rw_exit_write(&rl->rl_lock); - return ret; -} diff --git a/sys/dev/if_wg/wg_cookie.h b/sys/dev/if_wg/wg_cookie.h deleted file mode 100644 index 699f3ebf40c1..000000000000 --- a/sys/dev/if_wg/wg_cookie.h +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (C) 2015-2020 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. - * Copyright (C) 2019-2020 Matt Dunwoodie <ncon@noconroy.net> - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */ - -#ifndef __COOKIE_H__ -#define __COOKIE_H__ - -#include -#include -#include -#include - -#include - -#include "crypto.h" - -#define COOKIE_MAC_SIZE 16 -#define COOKIE_KEY_SIZE 32 -#define COOKIE_NONCE_SIZE XCHACHA20POLY1305_NONCE_SIZE -#define COOKIE_COOKIE_SIZE 16 -#define COOKIE_SECRET_SIZE 32 -#define COOKIE_INPUT_SIZE 32 -#define COOKIE_ENCRYPTED_SIZE (COOKIE_COOKIE_SIZE + COOKIE_MAC_SIZE) - -#define COOKIE_MAC1_KEY_LABEL "mac1----" -#define COOKIE_COOKIE_KEY_LABEL "cookie--" -#define COOKIE_SECRET_MAX_AGE 120 -#define COOKIE_SECRET_LATENCY 5 - -/* Constants for initiation rate limiting */ -#define RATELIMIT_SIZE (1 << 13) -#define RATELIMIT_SIZE_MAX (RATELIMIT_SIZE * 8) -#define NSEC_PER_SEC 1000000000LL -#define INITIATIONS_PER_SECOND 20 -#define INITIATIONS_BURSTABLE 5 -#define INITIATION_COST (NSEC_PER_SEC / INITIATIONS_PER_SECOND) -#define TOKEN_MAX (INITIATION_COST * INITIATIONS_BURSTABLE) -#define ELEMENT_TIMEOUT 1 -#define IPV4_MASK_SIZE 4 /* Use all 4 bytes of IPv4 address */ -#define IPV6_MASK_SIZE 8 /* Use top 8 bytes (/64) of IPv6 address */ - -struct cookie_macs { - uint8_t mac1[COOKIE_MAC_SIZE]; - uint8_t mac2[COOKIE_MAC_SIZE]; -}; - -struct ratelimit_entry { - LIST_ENTRY(ratelimit_entry) r_entry; - sa_family_t r_af; - union { - struct in_addr r_in; -#ifdef INET6 - struct in6_addr r_in6; -#endif - }; - struct timespec r_last_time; /* nanouptime */ - uint64_t r_tokens; -}; - -struct ratelimit { - SIPHASH_KEY rl_secret; - uma_zone_t rl_zone; - - struct rwlock rl_lock; - LIST_HEAD(, ratelimit_entry) *rl_table; - u_long rl_table_mask; - size_t rl_table_num; - struct timespec rl_last_gc; /* nanouptime */ -}; - -struct cookie_maker { - uint8_t cp_mac1_key[COOKIE_KEY_SIZE]; - uint8_t cp_cookie_key[COOKIE_KEY_SIZE]; - - struct rwlock cp_lock; - uint8_t cp_cookie[COOKIE_COOKIE_SIZE]; - struct timespec cp_birthdate; /* nanouptime */ - int cp_mac1_valid; - uint8_t cp_mac1_last[COOKIE_MAC_SIZE]; -}; - -struct cookie_checker { - struct ratelimit cc_ratelimit_v4; -#ifdef INET6 - struct ratelimit cc_ratelimit_v6; -#endif - - struct rwlock cc_key_lock; - uint8_t cc_mac1_key[COOKIE_KEY_SIZE]; - uint8_t cc_cookie_key[COOKIE_KEY_SIZE]; - - struct rwlock cc_secret_lock; - struct timespec cc_secret_birthdate; /* nanouptime */ - uint8_t cc_secret[COOKIE_SECRET_SIZE]; -}; - -void cookie_maker_init(struct cookie_maker *, const uint8_t[COOKIE_INPUT_SIZE]); -int cookie_checker_init(struct cookie_checker *, uma_zone_t); -void cookie_checker_update(struct cookie_checker *, - const uint8_t[COOKIE_INPUT_SIZE]); -void cookie_checker_deinit(struct cookie_checker *); -void cookie_checker_create_payload(struct cookie_checker *, - struct cookie_macs *cm, uint8_t[COOKIE_NONCE_SIZE], - uint8_t [COOKIE_ENCRYPTED_SIZE], struct sockaddr *); -int cookie_maker_consume_payload(struct cookie_maker *, - uint8_t[COOKIE_NONCE_SIZE], uint8_t[COOKIE_ENCRYPTED_SIZE]); -void cookie_maker_mac(struct cookie_maker *, struct cookie_macs *, - void *, size_t); -int cookie_checker_validate_macs(struct cookie_checker *, - struct cookie_macs *, void *, size_t, int, struct sockaddr *); - -#endif /* __COOKIE_H__ */ diff --git a/sys/dev/if_wg/wg_noise.c b/sys/dev/if_wg/wg_noise.c deleted file mode 100644 index ae527eb99bde..000000000000 --- a/sys/dev/if_wg/wg_noise.c +++ /dev/null @@ -1,963 +0,0 @@ -/* - * Copyright (C) 2015-2020 Jason A. Donenfeld . All Rights Reserved. 
- * Copyright (C) 2019-2020 Matt Dunwoodie - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#include -#include -#include -#include - -#include "support.h" -#include "wg_noise.h" - -/* Private functions */ -static struct noise_keypair * - noise_remote_keypair_allocate(struct noise_remote *); -static void - noise_remote_keypair_free(struct noise_remote *, - struct noise_keypair *); -static uint32_t noise_remote_handshake_index_get(struct noise_remote *); -static void noise_remote_handshake_index_drop(struct noise_remote *); - -static uint64_t noise_counter_send(struct noise_counter *); -static int noise_counter_recv(struct noise_counter *, uint64_t); - -static void noise_kdf(uint8_t *, uint8_t *, uint8_t *, const uint8_t *, - size_t, size_t, size_t, size_t, - const uint8_t [NOISE_HASH_LEN]); -static int noise_mix_dh( - uint8_t [NOISE_HASH_LEN], - uint8_t [NOISE_SYMMETRIC_KEY_LEN], - const uint8_t [NOISE_PUBLIC_KEY_LEN], - const uint8_t [NOISE_PUBLIC_KEY_LEN]); -static int noise_mix_ss( - uint8_t ck[NOISE_HASH_LEN], - uint8_t key[NOISE_SYMMETRIC_KEY_LEN], - const uint8_t ss[NOISE_PUBLIC_KEY_LEN]); -static void noise_mix_hash( - uint8_t [NOISE_HASH_LEN], - const uint8_t *, - size_t); -static void noise_mix_psk( - uint8_t [NOISE_HASH_LEN], - uint8_t [NOISE_HASH_LEN], - uint8_t [NOISE_SYMMETRIC_KEY_LEN], - const uint8_t [NOISE_SYMMETRIC_KEY_LEN]); -static void noise_param_init( - uint8_t [NOISE_HASH_LEN], - uint8_t [NOISE_HASH_LEN], - const uint8_t [NOISE_PUBLIC_KEY_LEN]); - -static void noise_msg_encrypt(uint8_t *, const uint8_t *, size_t, - uint8_t [NOISE_SYMMETRIC_KEY_LEN], - uint8_t [NOISE_HASH_LEN]); -static int noise_msg_decrypt(uint8_t *, const uint8_t *, size_t, - uint8_t [NOISE_SYMMETRIC_KEY_LEN], - uint8_t [NOISE_HASH_LEN]); -static void noise_msg_ephemeral( - uint8_t [NOISE_HASH_LEN], - uint8_t [NOISE_HASH_LEN], - const uint8_t src[NOISE_PUBLIC_KEY_LEN]); - -static void noise_tai64n_now(uint8_t [NOISE_TIMESTAMP_LEN]); -static int noise_timer_expired(struct timespec *, time_t, long); - -/* Set/Get noise parameters */ -void -noise_local_init(struct noise_local *l, struct noise_upcall *upcall) -{ - bzero(l, sizeof(*l)); - rw_init(&l->l_identity_lock, "noise_local_identity"); - l->l_upcall = *upcall; -} - -void -noise_local_lock_identity(struct noise_local *l) -{ - rw_enter_write(&l->l_identity_lock); -} - -void -noise_local_unlock_identity(struct noise_local *l) -{ - rw_exit_write(&l->l_identity_lock); -} - -int -noise_local_set_private(struct noise_local *l, - const uint8_t private[NOISE_PUBLIC_KEY_LEN]) -{ - rw_assert_wrlock(&l->l_identity_lock); - - memcpy(l->l_private, private, NOISE_PUBLIC_KEY_LEN); - curve25519_clamp_secret(l->l_private); - l->l_has_identity = curve25519_generate_public(l->l_public, private); - - return l->l_has_identity ? 
0 : ENXIO; -} - -int -noise_local_keys(struct noise_local *l, uint8_t public[NOISE_PUBLIC_KEY_LEN], - uint8_t private[NOISE_PUBLIC_KEY_LEN]) -{ - int ret = 0; - rw_enter_read(&l->l_identity_lock); - if (l->l_has_identity) { - if (public != NULL) - memcpy(public, l->l_public, NOISE_PUBLIC_KEY_LEN); - if (private != NULL) - memcpy(private, l->l_private, NOISE_PUBLIC_KEY_LEN); - } else { - ret = ENXIO; - } - rw_exit_read(&l->l_identity_lock); - return ret; -} - -void -noise_remote_init(struct noise_remote *r, - const uint8_t public[NOISE_PUBLIC_KEY_LEN], struct noise_local *l) -{ - bzero(r, sizeof(*r)); - memcpy(r->r_public, public, NOISE_PUBLIC_KEY_LEN); - rw_init(&r->r_handshake_lock, "noise_handshake"); - rw_init(&r->r_keypair_lock, "noise_keypair"); - - SLIST_INSERT_HEAD(&r->r_unused_keypairs, &r->r_keypair[0], kp_entry); - SLIST_INSERT_HEAD(&r->r_unused_keypairs, &r->r_keypair[1], kp_entry); - SLIST_INSERT_HEAD(&r->r_unused_keypairs, &r->r_keypair[2], kp_entry); - - KASSERT(l != NULL, ("must provide local")); - r->r_local = l; - - rw_enter_write(&l->l_identity_lock); - noise_remote_precompute(r); - rw_exit_write(&l->l_identity_lock); -} - -int -noise_remote_set_psk(struct noise_remote *r, - const uint8_t psk[NOISE_SYMMETRIC_KEY_LEN]) -{ - int same; - rw_enter_write(&r->r_handshake_lock); - same = !timingsafe_bcmp(r->r_psk, psk, NOISE_SYMMETRIC_KEY_LEN); - if (!same) { - memcpy(r->r_psk, psk, NOISE_SYMMETRIC_KEY_LEN); - } - rw_exit_write(&r->r_handshake_lock); - return same ? EEXIST : 0; -} - -int -noise_remote_keys(struct noise_remote *r, uint8_t public[NOISE_PUBLIC_KEY_LEN], - uint8_t psk[NOISE_SYMMETRIC_KEY_LEN]) -{ - static uint8_t null_psk[NOISE_SYMMETRIC_KEY_LEN]; - int ret; - - if (public != NULL) - memcpy(public, r->r_public, NOISE_PUBLIC_KEY_LEN); - - rw_enter_read(&r->r_handshake_lock); - if (psk != NULL) - memcpy(psk, r->r_psk, NOISE_SYMMETRIC_KEY_LEN); - ret = timingsafe_bcmp(r->r_psk, null_psk, NOISE_SYMMETRIC_KEY_LEN); - rw_exit_read(&r->r_handshake_lock); - - /* If r_psk != null_psk return 0, else ENOENT (no psk) */ - return ret ? 
0 : ENOENT; -} - -void -noise_remote_precompute(struct noise_remote *r) -{ - struct noise_local *l = r->r_local; - rw_assert_wrlock(&l->l_identity_lock); - if (!l->l_has_identity) - bzero(r->r_ss, NOISE_PUBLIC_KEY_LEN); - else if (!curve25519(r->r_ss, l->l_private, r->r_public)) - bzero(r->r_ss, NOISE_PUBLIC_KEY_LEN); - - rw_enter_write(&r->r_handshake_lock); - noise_remote_handshake_index_drop(r); - explicit_bzero(&r->r_handshake, sizeof(r->r_handshake)); - rw_exit_write(&r->r_handshake_lock); -} - -/* Handshake functions */ -int -noise_create_initiation(struct noise_remote *r, uint32_t *s_idx, - uint8_t ue[NOISE_PUBLIC_KEY_LEN], - uint8_t es[NOISE_PUBLIC_KEY_LEN + NOISE_AUTHTAG_LEN], - uint8_t ets[NOISE_TIMESTAMP_LEN + NOISE_AUTHTAG_LEN]) -{ - struct noise_handshake *hs = &r->r_handshake; - struct noise_local *l = r->r_local; - uint8_t key[NOISE_SYMMETRIC_KEY_LEN]; - int ret = EINVAL; - - rw_enter_read(&l->l_identity_lock); - rw_enter_write(&r->r_handshake_lock); - if (!l->l_has_identity) - goto error; - noise_param_init(hs->hs_ck, hs->hs_hash, r->r_public); - - /* e */ - curve25519_generate_secret(hs->hs_e); - if (curve25519_generate_public(ue, hs->hs_e) == 0) - goto error; - noise_msg_ephemeral(hs->hs_ck, hs->hs_hash, ue); - - /* es */ - if (noise_mix_dh(hs->hs_ck, key, hs->hs_e, r->r_public) != 0) - goto error; - - /* s */ - noise_msg_encrypt(es, l->l_public, - NOISE_PUBLIC_KEY_LEN, key, hs->hs_hash); - - /* ss */ - if (noise_mix_ss(hs->hs_ck, key, r->r_ss) != 0) - goto error; - - /* {t} */ - noise_tai64n_now(ets); - noise_msg_encrypt(ets, ets, - NOISE_TIMESTAMP_LEN, key, hs->hs_hash); - - noise_remote_handshake_index_drop(r); - hs->hs_state = CREATED_INITIATION; - hs->hs_local_index = noise_remote_handshake_index_get(r); - *s_idx = hs->hs_local_index; - ret = 0; -error: - rw_exit_write(&r->r_handshake_lock); - rw_exit_read(&l->l_identity_lock); - explicit_bzero(key, NOISE_SYMMETRIC_KEY_LEN); - return ret; -} - -int -noise_consume_initiation(struct noise_local *l, struct noise_remote **rp, - uint32_t s_idx, uint8_t ue[NOISE_PUBLIC_KEY_LEN], - uint8_t es[NOISE_PUBLIC_KEY_LEN + NOISE_AUTHTAG_LEN], - uint8_t ets[NOISE_TIMESTAMP_LEN + NOISE_AUTHTAG_LEN]) -{ - struct noise_remote *r; - struct noise_handshake hs; - uint8_t key[NOISE_SYMMETRIC_KEY_LEN]; - uint8_t r_public[NOISE_PUBLIC_KEY_LEN]; - uint8_t timestamp[NOISE_TIMESTAMP_LEN]; - int ret = EINVAL; - - rw_enter_read(&l->l_identity_lock); - if (!l->l_has_identity) - goto error; - noise_param_init(hs.hs_ck, hs.hs_hash, l->l_public); - - /* e */ - noise_msg_ephemeral(hs.hs_ck, hs.hs_hash, ue); - - /* es */ - if (noise_mix_dh(hs.hs_ck, key, l->l_private, ue) != 0) - goto error; - - /* s */ - if (noise_msg_decrypt(r_public, es, - NOISE_PUBLIC_KEY_LEN + NOISE_AUTHTAG_LEN, key, hs.hs_hash) != 0) - goto error; - - /* Lookup the remote we received from */ - if ((r = l->l_upcall.u_remote_get(l->l_upcall.u_arg, r_public)) == NULL) - goto error; - - /* ss */ - if (noise_mix_ss(hs.hs_ck, key, r->r_ss) != 0) - goto error; - - /* {t} */ - if (noise_msg_decrypt(timestamp, ets, - NOISE_TIMESTAMP_LEN + NOISE_AUTHTAG_LEN, key, hs.hs_hash) != 0) - goto error; - - hs.hs_state = CONSUMED_INITIATION; - hs.hs_local_index = 0; - hs.hs_remote_index = s_idx; - memcpy(hs.hs_e, ue, NOISE_PUBLIC_KEY_LEN); - - /* We have successfully computed the same results, now we ensure that - * this is not an initiation replay, or a flood attack */ - rw_enter_write(&r->r_handshake_lock); - - /* Replay */ - if (memcmp(timestamp, r->r_timestamp, NOISE_TIMESTAMP_LEN) > 0) - 
memcpy(r->r_timestamp, timestamp, NOISE_TIMESTAMP_LEN); - else - goto error_set; - /* Flood attack */ - if (noise_timer_expired(&r->r_last_init, 0, REJECT_INTERVAL)) - getnanouptime(&r->r_last_init); - else - goto error_set; - - /* Ok, we're happy to accept this initiation now */ - noise_remote_handshake_index_drop(r); - r->r_handshake = hs; - *rp = r; - ret = 0; -error_set: - rw_exit_write(&r->r_handshake_lock); -error: - rw_exit_read(&l->l_identity_lock); - explicit_bzero(key, NOISE_SYMMETRIC_KEY_LEN); - explicit_bzero(&hs, sizeof(hs)); - return ret; -} - -int -noise_create_response(struct noise_remote *r, uint32_t *s_idx, uint32_t *r_idx, - uint8_t ue[NOISE_PUBLIC_KEY_LEN], uint8_t en[0 + NOISE_AUTHTAG_LEN]) -{ - struct noise_handshake *hs = &r->r_handshake; - uint8_t key[NOISE_SYMMETRIC_KEY_LEN]; - uint8_t e[NOISE_PUBLIC_KEY_LEN]; - int ret = EINVAL; - - rw_enter_read(&r->r_local->l_identity_lock); - rw_enter_write(&r->r_handshake_lock); - - if (hs->hs_state != CONSUMED_INITIATION) - goto error; - - /* e */ - curve25519_generate_secret(e); - if (curve25519_generate_public(ue, e) == 0) - goto error; - noise_msg_ephemeral(hs->hs_ck, hs->hs_hash, ue); - - /* ee */ - if (noise_mix_dh(hs->hs_ck, NULL, e, hs->hs_e) != 0) - goto error; - - /* se */ - if (noise_mix_dh(hs->hs_ck, NULL, e, r->r_public) != 0) - goto error; - - /* psk */ - noise_mix_psk(hs->hs_ck, hs->hs_hash, key, r->r_psk); - - /* {} */ - noise_msg_encrypt(en, NULL, 0, key, hs->hs_hash); - - hs->hs_state = CREATED_RESPONSE; - hs->hs_local_index = noise_remote_handshake_index_get(r); - *r_idx = hs->hs_remote_index; - *s_idx = hs->hs_local_index; - ret = 0; -error: - rw_exit_write(&r->r_handshake_lock); - rw_exit_read(&r->r_local->l_identity_lock); - explicit_bzero(key, NOISE_SYMMETRIC_KEY_LEN); - explicit_bzero(e, NOISE_PUBLIC_KEY_LEN); - return ret; -} - -int -noise_consume_response(struct noise_remote *r, uint32_t s_idx, uint32_t r_idx, - uint8_t ue[NOISE_PUBLIC_KEY_LEN], uint8_t en[0 + NOISE_AUTHTAG_LEN]) -{ - struct noise_local *l = r->r_local; - struct noise_handshake hs; - uint8_t key[NOISE_SYMMETRIC_KEY_LEN]; - uint8_t preshared_key[NOISE_PUBLIC_KEY_LEN]; - int ret = EINVAL; - - rw_enter_read(&l->l_identity_lock); - if (!l->l_has_identity) - goto error; - - rw_enter_read(&r->r_handshake_lock); - hs = r->r_handshake; - memcpy(preshared_key, r->r_psk, NOISE_SYMMETRIC_KEY_LEN); - rw_exit_read(&r->r_handshake_lock); - - if (hs.hs_state != CREATED_INITIATION || - hs.hs_local_index != r_idx) - goto error; - - /* e */ - noise_msg_ephemeral(hs.hs_ck, hs.hs_hash, ue); - - /* ee */ - if (noise_mix_dh(hs.hs_ck, NULL, hs.hs_e, ue) != 0) - goto error; - - /* se */ - if (noise_mix_dh(hs.hs_ck, NULL, l->l_private, ue) != 0) - goto error; - - /* psk */ - noise_mix_psk(hs.hs_ck, hs.hs_hash, key, preshared_key); - - /* {} */ - if (noise_msg_decrypt(NULL, en, - 0 + NOISE_AUTHTAG_LEN, key, hs.hs_hash) != 0) - goto error; - - hs.hs_remote_index = s_idx; - - rw_enter_write(&r->r_handshake_lock); - if (r->r_handshake.hs_state == hs.hs_state && - r->r_handshake.hs_local_index == hs.hs_local_index) { - r->r_handshake = hs; - r->r_handshake.hs_state = CONSUMED_RESPONSE; - ret = 0; - } - rw_exit_write(&r->r_handshake_lock); -error: - rw_exit_read(&l->l_identity_lock); - explicit_bzero(&hs, sizeof(hs)); - explicit_bzero(key, NOISE_SYMMETRIC_KEY_LEN); - return ret; -} - -int -noise_remote_begin_session(struct noise_remote *r) -{ - struct noise_handshake *hs = &r->r_handshake; - struct noise_keypair kp, *next, *current, *previous; - - 
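- /* Illustrative aside (sketch only, not a real helper): both sides
-  * derive the same two keys from the final chaining key,
-  * (T1, T2) = KDF2(hs_ck), and the only asymmetry below is naming:
-  * the initiator takes kp_send = T1, kp_recv = T2, while the
-  * responder takes kp_recv = T1, kp_send = T2. */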
rw_enter_write(&r->r_handshake_lock); - - /* We now derive the keypair from the handshake */ - if (hs->hs_state == CONSUMED_RESPONSE) { - kp.kp_is_initiator = 1; - noise_kdf(kp.kp_send, kp.kp_recv, NULL, NULL, - NOISE_SYMMETRIC_KEY_LEN, NOISE_SYMMETRIC_KEY_LEN, 0, 0, - hs->hs_ck); - } else if (hs->hs_state == CREATED_RESPONSE) { - kp.kp_is_initiator = 0; - noise_kdf(kp.kp_recv, kp.kp_send, NULL, NULL, - NOISE_SYMMETRIC_KEY_LEN, NOISE_SYMMETRIC_KEY_LEN, 0, 0, - hs->hs_ck); - } else { - rw_exit_write(&r->r_handshake_lock); - return EINVAL; - } - - kp.kp_valid = 1; - kp.kp_local_index = hs->hs_local_index; - kp.kp_remote_index = hs->hs_remote_index; - getnanouptime(&kp.kp_birthdate); - bzero(&kp.kp_ctr, sizeof(kp.kp_ctr)); - rw_init(&kp.kp_ctr.c_lock, "noise_counter"); - - /* Now we need to add_new_keypair */ - rw_enter_write(&r->r_keypair_lock); - next = r->r_next; - current = r->r_current; - previous = r->r_previous; - - if (kp.kp_is_initiator) { - if (next != NULL) { - r->r_next = NULL; - r->r_previous = next; - noise_remote_keypair_free(r, current); - } else { - r->r_previous = current; - } - - noise_remote_keypair_free(r, previous); - - r->r_current = noise_remote_keypair_allocate(r); - *r->r_current = kp; - } else { - noise_remote_keypair_free(r, next); - r->r_previous = NULL; - noise_remote_keypair_free(r, previous); - - r->r_next = noise_remote_keypair_allocate(r); - *r->r_next = kp; - } - rw_exit_write(&r->r_keypair_lock); - - explicit_bzero(&r->r_handshake, sizeof(r->r_handshake)); - rw_exit_write(&r->r_handshake_lock); - - explicit_bzero(&kp, sizeof(kp)); - return 0; -} - -void -noise_remote_clear(struct noise_remote *r) -{ - rw_enter_write(&r->r_handshake_lock); - noise_remote_handshake_index_drop(r); - explicit_bzero(&r->r_handshake, sizeof(r->r_handshake)); - rw_exit_write(&r->r_handshake_lock); - - rw_enter_write(&r->r_keypair_lock); - noise_remote_keypair_free(r, r->r_next); - noise_remote_keypair_free(r, r->r_current); - noise_remote_keypair_free(r, r->r_previous); - r->r_next = NULL; - r->r_current = NULL; - r->r_previous = NULL; - rw_exit_write(&r->r_keypair_lock); -} - -void -noise_remote_expire_current(struct noise_remote *r) -{ - rw_enter_write(&r->r_keypair_lock); - if (r->r_next != NULL) - r->r_next->kp_valid = 0; - if (r->r_current != NULL) - r->r_current->kp_valid = 0; - rw_exit_write(&r->r_keypair_lock); -} - -int -noise_remote_ready(struct noise_remote *r) -{ - struct noise_keypair *kp; - int ret; - - rw_enter_read(&r->r_keypair_lock); - /* kp_ctr isn't locked here, we're happy to accept a racy read. */ - if ((kp = r->r_current) == NULL || - !kp->kp_valid || - noise_timer_expired(&kp->kp_birthdate, REJECT_AFTER_TIME, 0) || - kp->kp_ctr.c_recv >= REJECT_AFTER_MESSAGES || - kp->kp_ctr.c_send >= REJECT_AFTER_MESSAGES) - ret = EINVAL; - else - ret = 0; - rw_exit_read(&r->r_keypair_lock); - return ret; -} - -int -noise_remote_encrypt(struct noise_remote *r, uint32_t *r_idx, uint64_t *nonce, - uint8_t *buf, size_t buflen) -{ - struct noise_keypair *kp; - int ret = EINVAL; - - rw_enter_read(&r->r_keypair_lock); - if ((kp = r->r_current) == NULL) - goto error; - - /* We confirm that our values are within our tolerances. We want: - * - a valid keypair - * - our keypair to be less than REJECT_AFTER_TIME seconds old - * - our receive counter to be less than REJECT_AFTER_MESSAGES - * - our send counter to be less than REJECT_AFTER_MESSAGES - * - * kp_ctr isn't locked here, we're happy to accept a racy read. 
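- *
- * Condensed as a sketch (usable() is illustrative shorthand, not a
- * real function in this file):
- *
- *   usable(kp) = kp->kp_valid &&
- *       age(kp) < REJECT_AFTER_TIME &&
- *       kp->kp_ctr.c_recv < REJECT_AFTER_MESSAGES &&
- *       kp->kp_ctr.c_send < REJECT_AFTER_MESSAGES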
*/ - if (!kp->kp_valid || - noise_timer_expired(&kp->kp_birthdate, REJECT_AFTER_TIME, 0) || - kp->kp_ctr.c_recv >= REJECT_AFTER_MESSAGES || - ((*nonce = noise_counter_send(&kp->kp_ctr)) > REJECT_AFTER_MESSAGES)) - goto error; - - /* We encrypt into the same buffer, so the caller must ensure that buf - * has NOISE_AUTHTAG_LEN bytes to store the MAC. The nonce and index - * are passed back out to the caller through the provided data pointer. */ - *r_idx = kp->kp_remote_index; - chacha20poly1305_encrypt(buf, buf, buflen, - NULL, 0, *nonce, kp->kp_send); - - /* If our values are still within tolerances, but we are approaching - * the tolerances, we notify the caller with ESTALE that they should - * establish a new keypair. The current keypair can continue to be used - * until the tolerances are hit. We notify if: - * - our send counter is valid and not less than REKEY_AFTER_MESSAGES - * - we're the initiator and our keypair is older than - * REKEY_AFTER_TIME seconds */ - ret = ESTALE; - if ((kp->kp_valid && *nonce >= REKEY_AFTER_MESSAGES) || - (kp->kp_is_initiator && - noise_timer_expired(&kp->kp_birthdate, REKEY_AFTER_TIME, 0))) - goto error; - - ret = 0; -error: - rw_exit_read(&r->r_keypair_lock); - return ret; -} - -int -noise_remote_decrypt(struct noise_remote *r, uint32_t r_idx, uint64_t nonce, - uint8_t *buf, size_t buflen) -{ - struct noise_keypair *kp; - int ret = EINVAL; - - /* We retrieve the keypair corresponding to the provided index. We - * attempt the current keypair first as that is most likely. We also - * want to make sure that the keypair is valid as it would be - * catastrophic to decrypt against a zero'ed keypair. */ - rw_enter_read(&r->r_keypair_lock); - - if (r->r_current != NULL && r->r_current->kp_local_index == r_idx) { - kp = r->r_current; - } else if (r->r_previous != NULL && r->r_previous->kp_local_index == r_idx) { - kp = r->r_previous; - } else if (r->r_next != NULL && r->r_next->kp_local_index == r_idx) { - kp = r->r_next; - } else { - goto error; - } - - /* We confirm that our values are within our tolerances. These values - * are the same as the encrypt routine. - * - * kp_ctr isn't locked here, we're happy to accept a racy read. */ - if (noise_timer_expired(&kp->kp_birthdate, REJECT_AFTER_TIME, 0) || - kp->kp_ctr.c_recv >= REJECT_AFTER_MESSAGES) - goto error; - - /* Decrypt, then validate the counter. We don't want to validate the - * counter before decrypting as we do not know the message is authentic - * prior to decryption. */ - if (chacha20poly1305_decrypt(buf, buf, buflen, - NULL, 0, nonce, kp->kp_recv) == 0) - goto error; - - if (noise_counter_recv(&kp->kp_ctr, nonce) != 0) - goto error; - - /* If we've received the handshake confirming data packet then move the - * next keypair into current. If we do slide the next keypair in, then - * we skip the REKEY_AFTER_TIME_RECV check. This is safe to do as a - * data packet can't confirm a session that we are an INITIATOR of. */ - if (kp == r->r_next) { - rw_exit_read(&r->r_keypair_lock); - rw_enter_write(&r->r_keypair_lock); - if (kp == r->r_next && kp->kp_local_index == r_idx) { - noise_remote_keypair_free(r, r->r_previous); - r->r_previous = r->r_current; - r->r_current = r->r_next; - r->r_next = NULL; - - ret = ECONNRESET; - goto error; - } - rw_enter(&r->r_keypair_lock, RW_DOWNGRADE); - } - - /* Similar to when we encrypt, we want to notify the caller when we - * are approaching our tolerances. We notify if: - * - we're the initiator and the current keypair is older than - * REKEY_AFTER_TIME_RECV seconds. 
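- *
- * With the constants from wg_noise.h this yields a worked timeline
- * for the initiator: start rekeying on send traffic at 120s
- * (REKEY_AFTER_TIME), rekey on receive-only traffic from 165s
- * (REKEY_AFTER_TIME_RECV), and refuse the keypair outright at 180s
- * (REJECT_AFTER_TIME).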
*/ - ret = ESTALE; - kp = r->r_current; - if (kp != NULL && - kp->kp_valid && - kp->kp_is_initiator && - noise_timer_expired(&kp->kp_birthdate, REKEY_AFTER_TIME_RECV, 0)) - goto error; - - ret = 0; - -error: - rw_exit(&r->r_keypair_lock); - return ret; -} - -/* Private functions - these should not be called outside this file under any - * circumstances. */ -static struct noise_keypair * -noise_remote_keypair_allocate(struct noise_remote *r) -{ - struct noise_keypair *kp; - kp = SLIST_FIRST(&r->r_unused_keypairs); - SLIST_REMOVE_HEAD(&r->r_unused_keypairs, kp_entry); - return kp; -} - -static void -noise_remote_keypair_free(struct noise_remote *r, struct noise_keypair *kp) -{ - struct noise_upcall *u = &r->r_local->l_upcall; - if (kp != NULL) { - SLIST_INSERT_HEAD(&r->r_unused_keypairs, kp, kp_entry); - u->u_index_drop(u->u_arg, kp->kp_local_index); - bzero(kp->kp_send, sizeof(kp->kp_send)); - bzero(kp->kp_recv, sizeof(kp->kp_recv)); - } -} - -static uint32_t -noise_remote_handshake_index_get(struct noise_remote *r) -{ - struct noise_upcall *u = &r->r_local->l_upcall; - return u->u_index_set(u->u_arg, r); -} - -static void -noise_remote_handshake_index_drop(struct noise_remote *r) -{ - struct noise_handshake *hs = &r->r_handshake; - struct noise_upcall *u = &r->r_local->l_upcall; - rw_assert_wrlock(&r->r_handshake_lock); - if (hs->hs_state != HS_ZEROED) - u->u_index_drop(u->u_arg, hs->hs_local_index); -} - -static uint64_t -noise_counter_send(struct noise_counter *ctr) -{ - uint64_t ret; - rw_enter_write(&ctr->c_lock); - ret = ctr->c_send++; - rw_exit_write(&ctr->c_lock); - return ret; -} - -static int -noise_counter_recv(struct noise_counter *ctr, uint64_t recv) -{ - uint64_t i, top, index_recv, index_ctr; - unsigned long bit; - int ret = EEXIST; - - rw_enter_write(&ctr->c_lock); - - /* Check that the recv counter is valid */ - if (ctr->c_recv >= REJECT_AFTER_MESSAGES || - recv >= REJECT_AFTER_MESSAGES) - goto error; - - /* If the packet is out of the window, invalid */ - if (recv + COUNTER_WINDOW_SIZE < ctr->c_recv) - goto error; - - /* If the new counter is ahead of the current counter, we'll need to - * zero out the bitmap that has previously been used */ - index_recv = recv / COUNTER_BITS; - index_ctr = ctr->c_recv / COUNTER_BITS; - - if (recv > ctr->c_recv) { - top = MIN(index_recv - index_ctr, COUNTER_NUM); - for (i = 1; i <= top; i++) - ctr->c_backtrack[ - (i + index_ctr) & (COUNTER_NUM - 1)] = 0; - ctr->c_recv = recv; - } - - index_recv %= COUNTER_NUM; - bit = 1ul << (recv % COUNTER_BITS); - - if (ctr->c_backtrack[index_recv] & bit) - goto error; - - ctr->c_backtrack[index_recv] |= bit; - - ret = 0; -error: - rw_exit_write(&ctr->c_lock); - return ret; -} - -static void -noise_kdf(uint8_t *a, uint8_t *b, uint8_t *c, const uint8_t *x, - size_t a_len, size_t b_len, size_t c_len, size_t x_len, - const uint8_t ck[NOISE_HASH_LEN]) -{ - uint8_t out[BLAKE2S_HASH_SIZE + 1]; - uint8_t sec[BLAKE2S_HASH_SIZE]; - -#ifdef DIAGNOSTIC - MPASS(a_len <= BLAKE2S_HASH_SIZE && b_len <= BLAKE2S_HASH_SIZE && - c_len <= BLAKE2S_HASH_SIZE); - MPASS(!(b || b_len || c || c_len) || (a && a_len)); - MPASS(!(c || c_len) || (b && b_len)); -#endif - - /* Extract entropy from "x" into sec */ - blake2s_hmac(sec, x, ck, BLAKE2S_HASH_SIZE, x_len, NOISE_HASH_LEN); - - if (a == NULL || a_len == 0) - goto out; - - /* Expand first key: key = sec, data = 0x1 */ - out[0] = 1; - blake2s_hmac(out, out, sec, BLAKE2S_HASH_SIZE, 1, BLAKE2S_HASH_SIZE); - memcpy(a, out, a_len); - - if (b == NULL || b_len == 0) - goto out; - - /* 
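- * (For reference, the expand steps here follow the HKDF pattern:
- * T1 = HMAC(sec, 0x1), T2 = HMAC(sec, T1 || 0x2),
- * T3 = HMAC(sec, T2 || 0x3), each output key being a prefix of its
- * Ti.)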
Expand second key: key = sec, data = "a" || 0x2 */ - out[BLAKE2S_HASH_SIZE] = 2; - blake2s_hmac(out, out, sec, BLAKE2S_HASH_SIZE, BLAKE2S_HASH_SIZE + 1, - BLAKE2S_HASH_SIZE); - memcpy(b, out, b_len); - - if (c == NULL || c_len == 0) - goto out; - - /* Expand third key: key = sec, data = "b" || 0x3 */ - out[BLAKE2S_HASH_SIZE] = 3; - blake2s_hmac(out, out, sec, BLAKE2S_HASH_SIZE, BLAKE2S_HASH_SIZE + 1, - BLAKE2S_HASH_SIZE); - memcpy(c, out, c_len); - -out: - /* Clear sensitive data from stack */ - explicit_bzero(sec, BLAKE2S_HASH_SIZE); - explicit_bzero(out, BLAKE2S_HASH_SIZE + 1); -} - -static int -noise_mix_dh(uint8_t ck[NOISE_HASH_LEN], uint8_t key[NOISE_SYMMETRIC_KEY_LEN], - const uint8_t private[NOISE_PUBLIC_KEY_LEN], - const uint8_t public[NOISE_PUBLIC_KEY_LEN]) -{ - uint8_t dh[NOISE_PUBLIC_KEY_LEN]; - - if (!curve25519(dh, private, public)) - return EINVAL; - noise_kdf(ck, key, NULL, dh, - NOISE_HASH_LEN, NOISE_SYMMETRIC_KEY_LEN, 0, NOISE_PUBLIC_KEY_LEN, ck); - explicit_bzero(dh, NOISE_PUBLIC_KEY_LEN); - return 0; -} - -static int -noise_mix_ss(uint8_t ck[NOISE_HASH_LEN], uint8_t key[NOISE_SYMMETRIC_KEY_LEN], - const uint8_t ss[NOISE_PUBLIC_KEY_LEN]) -{ - static uint8_t null_point[NOISE_PUBLIC_KEY_LEN]; - if (timingsafe_bcmp(ss, null_point, NOISE_PUBLIC_KEY_LEN) == 0) - return ENOENT; - noise_kdf(ck, key, NULL, ss, - NOISE_HASH_LEN, NOISE_SYMMETRIC_KEY_LEN, 0, NOISE_PUBLIC_KEY_LEN, ck); - return 0; -} - -static void -noise_mix_hash(uint8_t hash[NOISE_HASH_LEN], const uint8_t *src, - size_t src_len) -{ - struct blake2s_state blake; - - blake2s_init(&blake, NOISE_HASH_LEN); - blake2s_update(&blake, hash, NOISE_HASH_LEN); - blake2s_update(&blake, src, src_len); - blake2s_final(&blake, hash); -} - -static void -noise_mix_psk(uint8_t ck[NOISE_HASH_LEN], uint8_t hash[NOISE_HASH_LEN], - uint8_t key[NOISE_SYMMETRIC_KEY_LEN], - const uint8_t psk[NOISE_SYMMETRIC_KEY_LEN]) -{ - uint8_t tmp[NOISE_HASH_LEN]; - - noise_kdf(ck, tmp, key, psk, - NOISE_HASH_LEN, NOISE_HASH_LEN, NOISE_SYMMETRIC_KEY_LEN, - NOISE_SYMMETRIC_KEY_LEN, ck); - noise_mix_hash(hash, tmp, NOISE_HASH_LEN); - explicit_bzero(tmp, NOISE_HASH_LEN); -} - -static void -noise_param_init(uint8_t ck[NOISE_HASH_LEN], uint8_t hash[NOISE_HASH_LEN], - const uint8_t s[NOISE_PUBLIC_KEY_LEN]) -{ - struct blake2s_state blake; - - blake2s(ck, (uint8_t *)NOISE_HANDSHAKE_NAME, NULL, - NOISE_HASH_LEN, strlen(NOISE_HANDSHAKE_NAME), 0); - blake2s_init(&blake, NOISE_HASH_LEN); - blake2s_update(&blake, ck, NOISE_HASH_LEN); - blake2s_update(&blake, (uint8_t *)NOISE_IDENTIFIER_NAME, - strlen(NOISE_IDENTIFIER_NAME)); - blake2s_final(&blake, hash); - - noise_mix_hash(hash, s, NOISE_PUBLIC_KEY_LEN); -} - -static void -noise_msg_encrypt(uint8_t *dst, const uint8_t *src, size_t src_len, - uint8_t key[NOISE_SYMMETRIC_KEY_LEN], uint8_t hash[NOISE_HASH_LEN]) -{ - /* Nonce always zero for Noise_IK */ - chacha20poly1305_encrypt(dst, src, src_len, - hash, NOISE_HASH_LEN, 0, key); - noise_mix_hash(hash, dst, src_len + NOISE_AUTHTAG_LEN); -} - -static int -noise_msg_decrypt(uint8_t *dst, const uint8_t *src, size_t src_len, - uint8_t key[NOISE_SYMMETRIC_KEY_LEN], uint8_t hash[NOISE_HASH_LEN]) -{ - /* Nonce always zero for Noise_IK */ - if (!chacha20poly1305_decrypt(dst, src, src_len, - hash, NOISE_HASH_LEN, 0, key)) - return EINVAL; - noise_mix_hash(hash, src, src_len); - return 0; -} - -static void -noise_msg_ephemeral(uint8_t ck[NOISE_HASH_LEN], uint8_t hash[NOISE_HASH_LEN], - const uint8_t src[NOISE_PUBLIC_KEY_LEN]) -{ - noise_mix_hash(hash, src, 
NOISE_PUBLIC_KEY_LEN); - noise_kdf(ck, NULL, NULL, src, NOISE_HASH_LEN, 0, 0, - NOISE_PUBLIC_KEY_LEN, ck); -} - -static void -noise_tai64n_now(uint8_t output[NOISE_TIMESTAMP_LEN]) -{ - struct timespec time; - uint64_t sec; - uint32_t nsec; - - getnanotime(&time); - - /* Round down the nsec counter to limit precise timing leak. */ - time.tv_nsec &= REJECT_INTERVAL_MASK; - - /* https://cr.yp.to/libtai/tai64.html */ - sec = htobe64(0x400000000000000aULL + time.tv_sec); - nsec = htobe32(time.tv_nsec); - - /* memcpy to output buffer, assuming output could be unaligned. */ - memcpy(output, &sec, sizeof(sec)); - memcpy(output + sizeof(sec), &nsec, sizeof(nsec)); -} - -static int -noise_timer_expired(struct timespec *birthdate, time_t sec, long nsec) -{ - struct timespec uptime; - struct timespec expire = { .tv_sec = sec, .tv_nsec = nsec }; - - /* We don't really worry about a zeroed birthdate, to avoid the extra - * check on every encrypt/decrypt. This does mean that r_last_init - * check may fail if getnanouptime is < REJECT_INTERVAL from 0. */ - - getnanouptime(&uptime); - timespecadd(birthdate, &expire, &expire); - return timespeccmp(&uptime, &expire, >) ? ETIMEDOUT : 0; -} diff --git a/sys/dev/if_wg/wg_noise.h b/sys/dev/if_wg/wg_noise.h deleted file mode 100644 index 95617ae9bfef..000000000000 --- a/sys/dev/if_wg/wg_noise.h +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Copyright (C) 2015-2020 Jason A. Donenfeld . All Rights Reserved. - * Copyright (C) 2019-2020 Matt Dunwoodie - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
- */ - -#ifndef __NOISE_H__ -#define __NOISE_H__ - -#include -#include -#include - -#include "crypto.h" - -#define NOISE_PUBLIC_KEY_LEN CURVE25519_KEY_SIZE -#define NOISE_SYMMETRIC_KEY_LEN CHACHA20POLY1305_KEY_SIZE -#define NOISE_TIMESTAMP_LEN (sizeof(uint64_t) + sizeof(uint32_t)) -#define NOISE_AUTHTAG_LEN CHACHA20POLY1305_AUTHTAG_SIZE -#define NOISE_HASH_LEN BLAKE2S_HASH_SIZE - -/* Protocol string constants */ -#define NOISE_HANDSHAKE_NAME "Noise_IKpsk2_25519_ChaChaPoly_BLAKE2s" -#define NOISE_IDENTIFIER_NAME "WireGuard v1 zx2c4 Jason@zx2c4.com" - -/* Constants for the counter */ -#define COUNTER_BITS_TOTAL 8192 -#define COUNTER_BITS (sizeof(unsigned long) * 8) -#define COUNTER_NUM (COUNTER_BITS_TOTAL / COUNTER_BITS) -#define COUNTER_WINDOW_SIZE (COUNTER_BITS_TOTAL - COUNTER_BITS) - -/* Constants for the keypair */ -#define REKEY_AFTER_MESSAGES (1ull << 60) -#define REJECT_AFTER_MESSAGES (UINT64_MAX - COUNTER_WINDOW_SIZE - 1) -#define REKEY_AFTER_TIME 120 -#define REKEY_AFTER_TIME_RECV 165 -#define REJECT_AFTER_TIME 180 -#define REJECT_INTERVAL (1000000000 / 50) /* fifty times per sec */ -/* 24 = floor(log2(REJECT_INTERVAL)) */ -#define REJECT_INTERVAL_MASK (~((1ull<<24)-1)) - -enum noise_state_hs { - HS_ZEROED = 0, - CREATED_INITIATION, - CONSUMED_INITIATION, - CREATED_RESPONSE, - CONSUMED_RESPONSE, -}; - -struct noise_handshake { - enum noise_state_hs hs_state; - uint32_t hs_local_index; - uint32_t hs_remote_index; - uint8_t hs_e[NOISE_PUBLIC_KEY_LEN]; - uint8_t hs_hash[NOISE_HASH_LEN]; - uint8_t hs_ck[NOISE_HASH_LEN]; -}; - -struct noise_counter { - struct rwlock c_lock; - uint64_t c_send; - uint64_t c_recv; - unsigned long c_backtrack[COUNTER_NUM]; -}; - -struct noise_keypair { - SLIST_ENTRY(noise_keypair) kp_entry; - int kp_valid; - int kp_is_initiator; - uint32_t kp_local_index; - uint32_t kp_remote_index; - uint8_t kp_send[NOISE_SYMMETRIC_KEY_LEN]; - uint8_t kp_recv[NOISE_SYMMETRIC_KEY_LEN]; - struct timespec kp_birthdate; /* nanouptime */ - struct noise_counter kp_ctr; -}; - -struct noise_remote { - uint8_t r_public[NOISE_PUBLIC_KEY_LEN]; - struct noise_local *r_local; - uint8_t r_ss[NOISE_PUBLIC_KEY_LEN]; - - struct rwlock r_handshake_lock; - struct noise_handshake r_handshake; - uint8_t r_psk[NOISE_SYMMETRIC_KEY_LEN]; - uint8_t r_timestamp[NOISE_TIMESTAMP_LEN]; - struct timespec r_last_init; /* nanouptime */ - - struct rwlock r_keypair_lock; - SLIST_HEAD(,noise_keypair) r_unused_keypairs; - struct noise_keypair *r_next, *r_current, *r_previous; - struct noise_keypair r_keypair[3]; /* 3: next, current, previous. 
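- * A keypair created as responder waits in r_next until the first
- * data packet confirms the session, at which point
- * noise_remote_decrypt() slides it into r_current (and the old
- * current into r_previous); one created as initiator is installed
- * directly as r_current by noise_remote_begin_session().  The
- * r_unused_keypairs list simply recycles these three slots.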
*/ - -}; - -struct noise_local { - struct rwlock l_identity_lock; - int l_has_identity; - uint8_t l_public[NOISE_PUBLIC_KEY_LEN]; - uint8_t l_private[NOISE_PUBLIC_KEY_LEN]; - - struct noise_upcall { - void *u_arg; - struct noise_remote * - (*u_remote_get)(void *, uint8_t[NOISE_PUBLIC_KEY_LEN]); - uint32_t - (*u_index_set)(void *, struct noise_remote *); - void (*u_index_drop)(void *, uint32_t); - } l_upcall; -}; - -/* Set/Get noise parameters */ -void noise_local_init(struct noise_local *, struct noise_upcall *); -void noise_local_lock_identity(struct noise_local *); -void noise_local_unlock_identity(struct noise_local *); -int noise_local_set_private(struct noise_local *, - const uint8_t[NOISE_PUBLIC_KEY_LEN]); -int noise_local_keys(struct noise_local *, uint8_t[NOISE_PUBLIC_KEY_LEN], - uint8_t[NOISE_PUBLIC_KEY_LEN]); - -void noise_remote_init(struct noise_remote *, - const uint8_t[NOISE_PUBLIC_KEY_LEN], struct noise_local *); -int noise_remote_set_psk(struct noise_remote *, - const uint8_t[NOISE_SYMMETRIC_KEY_LEN]); -int noise_remote_keys(struct noise_remote *, uint8_t[NOISE_PUBLIC_KEY_LEN], - uint8_t[NOISE_SYMMETRIC_KEY_LEN]); - -/* Should be called anytime noise_local_set_private is called */ -void noise_remote_precompute(struct noise_remote *); - -/* Cryptographic functions */ -int noise_create_initiation( - struct noise_remote *, - uint32_t *s_idx, - uint8_t ue[NOISE_PUBLIC_KEY_LEN], - uint8_t es[NOISE_PUBLIC_KEY_LEN + NOISE_AUTHTAG_LEN], - uint8_t ets[NOISE_TIMESTAMP_LEN + NOISE_AUTHTAG_LEN]); - -int noise_consume_initiation( - struct noise_local *, - struct noise_remote **, - uint32_t s_idx, - uint8_t ue[NOISE_PUBLIC_KEY_LEN], - uint8_t es[NOISE_PUBLIC_KEY_LEN + NOISE_AUTHTAG_LEN], - uint8_t ets[NOISE_TIMESTAMP_LEN + NOISE_AUTHTAG_LEN]); - -int noise_create_response( - struct noise_remote *, - uint32_t *s_idx, - uint32_t *r_idx, - uint8_t ue[NOISE_PUBLIC_KEY_LEN], - uint8_t en[0 + NOISE_AUTHTAG_LEN]); - -int noise_consume_response( - struct noise_remote *, - uint32_t s_idx, - uint32_t r_idx, - uint8_t ue[NOISE_PUBLIC_KEY_LEN], - uint8_t en[0 + NOISE_AUTHTAG_LEN]); - -int noise_remote_begin_session(struct noise_remote *); -void noise_remote_clear(struct noise_remote *); -void noise_remote_expire_current(struct noise_remote *); - -int noise_remote_ready(struct noise_remote *); - -int noise_remote_encrypt( - struct noise_remote *, - uint32_t *r_idx, - uint64_t *nonce, - uint8_t *buf, - size_t buflen); -int noise_remote_decrypt( - struct noise_remote *, - uint32_t r_idx, - uint64_t nonce, - uint8_t *buf, - size_t buflen); - -#endif /* __NOISE_H__ */ diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c index 30499dce729c..b5c8f6ebf9be 100644 --- a/sys/kern/kern_jail.c +++ b/sys/kern/kern_jail.c @@ -1,4596 +1,4595 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1999 Poul-Henning Kamp. * Copyright (c) 2008 Bjoern A. Zeeb. * Copyright (c) 2009 James Gritton. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif /* DDB */ #include #define DEFAULT_HOSTUUID "00000000-0000-0000-0000-000000000000" #define PRISON0_HOSTUUID_MODULE "hostuuid" MALLOC_DEFINE(M_PRISON, "prison", "Prison structures"); static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures"); /* Keep struct prison prison0 and some code in kern_jail_set() readable. */ #ifdef INET #ifdef INET6 #define _PR_IP_SADDRSEL PR_IP4_SADDRSEL|PR_IP6_SADDRSEL #else #define _PR_IP_SADDRSEL PR_IP4_SADDRSEL #endif #else /* !INET */ #ifdef INET6 #define _PR_IP_SADDRSEL PR_IP6_SADDRSEL #else #define _PR_IP_SADDRSEL 0 #endif #endif /* prison0 describes what is "real" about the system. */ struct prison prison0 = { .pr_id = 0, .pr_name = "0", .pr_ref = 1, .pr_uref = 1, .pr_path = "/", .pr_securelevel = -1, .pr_devfs_rsnum = 0, .pr_state = PRISON_STATE_ALIVE, .pr_childmax = JAIL_MAX, .pr_hostuuid = DEFAULT_HOSTUUID, .pr_children = LIST_HEAD_INITIALIZER(prison0.pr_children), #ifdef VIMAGE .pr_flags = PR_HOST|PR_VNET|_PR_IP_SADDRSEL, #else .pr_flags = PR_HOST|_PR_IP_SADDRSEL, #endif .pr_allow = PR_ALLOW_ALL_STATIC, }; MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF); struct bool_flags { const char *name; const char *noname; volatile u_int flag; }; struct jailsys_flags { const char *name; unsigned disable; unsigned new; }; /* allprison, allprison_racct and lastprid are protected by allprison_lock. 
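* Readers take the lock shared and list modifiers take it exclusive;
* the usual pattern is a TAILQ_FOREACH over allprison under the lock,
* as kern_jail_set() below does (exclusively) when probing for a jid.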
*/ struct sx allprison_lock; SX_SYSINIT(allprison_lock, &allprison_lock, "allprison"); struct prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison); LIST_HEAD(, prison_racct) allprison_racct; int lastprid = 0; static int get_next_prid(struct prison **insprp); static int do_jail_attach(struct thread *td, struct prison *pr, int drflags); static void prison_complete(void *context, int pending); static void prison_deref(struct prison *pr, int flags); static void prison_deref_kill(struct prison *pr, struct prisonlist *freeprison); static int prison_lock_xlock(struct prison *pr, int flags); static void prison_free_not_last(struct prison *pr); static void prison_proc_free_not_last(struct prison *pr); static void prison_set_allow_locked(struct prison *pr, unsigned flag, int enable); static char *prison_path(struct prison *pr1, struct prison *pr2); #ifdef RACCT static void prison_racct_attach(struct prison *pr); static void prison_racct_modify(struct prison *pr); static void prison_racct_detach(struct prison *pr); #endif /* Flags for prison_deref */ #define PD_DEREF 0x01 /* Decrement pr_ref */ #define PD_DEUREF 0x02 /* Decrement pr_uref */ #define PD_KILL 0x04 /* Remove jail, kill processes, etc */ #define PD_LOCKED 0x10 /* pr_mtx is held */ #define PD_LIST_SLOCKED 0x20 /* allprison_lock is held shared */ #define PD_LIST_XLOCKED 0x40 /* allprison_lock is held exclusive */ #define PD_OP_FLAGS 0x07 /* Operation flags */ #define PD_LOCK_FLAGS 0x70 /* Lock status flags */ /* * Parameter names corresponding to PR_* flag values. Size values are for kvm * as we cannot figure out the size of a sparse array, or an array without a * terminating entry. */ static struct bool_flags pr_flag_bool[] = { {"persist", "nopersist", PR_PERSIST}, #ifdef INET {"ip4.saddrsel", "ip4.nosaddrsel", PR_IP4_SADDRSEL}, #endif #ifdef INET6 {"ip6.saddrsel", "ip6.nosaddrsel", PR_IP6_SADDRSEL}, #endif }; const size_t pr_flag_bool_size = sizeof(pr_flag_bool); static struct jailsys_flags pr_flag_jailsys[] = { {"host", 0, PR_HOST}, #ifdef VIMAGE {"vnet", 0, PR_VNET}, #endif #ifdef INET {"ip4", PR_IP4_USER, PR_IP4_USER}, #endif #ifdef INET6 {"ip6", PR_IP6_USER, PR_IP6_USER}, #endif }; const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys); /* * Make this array full-size so dynamic parameters can be added. * It is protected by prison0.mtx, but lockless reading is allowed * with an atomic check of the flag values. 
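*
* Concretely, the loops in this file iterate
*
*   for (bf = pr_flag_allow;
*       bf < pr_flag_allow + nitems(pr_flag_allow) &&
*       atomic_load_int(&bf->flag) != 0; bf++)
*
* so a zero flag acts as the terminator, and a dynamically added
* entry only becomes visible once its flag value is written.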
*/ static struct bool_flags pr_flag_allow[NBBY * NBPW] = { {"allow.set_hostname", "allow.noset_hostname", PR_ALLOW_SET_HOSTNAME}, {"allow.sysvipc", "allow.nosysvipc", PR_ALLOW_SYSVIPC}, {"allow.raw_sockets", "allow.noraw_sockets", PR_ALLOW_RAW_SOCKETS}, {"allow.chflags", "allow.nochflags", PR_ALLOW_CHFLAGS}, {"allow.mount", "allow.nomount", PR_ALLOW_MOUNT}, {"allow.quotas", "allow.noquotas", PR_ALLOW_QUOTAS}, {"allow.socket_af", "allow.nosocket_af", PR_ALLOW_SOCKET_AF}, {"allow.mlock", "allow.nomlock", PR_ALLOW_MLOCK}, {"allow.reserved_ports", "allow.noreserved_ports", PR_ALLOW_RESERVED_PORTS}, {"allow.read_msgbuf", "allow.noread_msgbuf", PR_ALLOW_READ_MSGBUF}, {"allow.unprivileged_proc_debug", "allow.nounprivileged_proc_debug", PR_ALLOW_UNPRIV_DEBUG}, {"allow.suser", "allow.nosuser", PR_ALLOW_SUSER}, }; static unsigned pr_allow_all = PR_ALLOW_ALL_STATIC; const size_t pr_flag_allow_size = sizeof(pr_flag_allow); #define JAIL_DEFAULT_ALLOW (PR_ALLOW_SET_HOSTNAME | \ PR_ALLOW_RESERVED_PORTS | \ PR_ALLOW_UNPRIV_DEBUG | \ PR_ALLOW_SUSER) #define JAIL_DEFAULT_ENFORCE_STATFS 2 #define JAIL_DEFAULT_DEVFS_RSNUM 0 static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW; static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS; static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM; #if defined(INET) || defined(INET6) static unsigned jail_max_af_ips = 255; #endif /* * Initialize the parts of prison0 that can't be static-initialized with * constants. This is called from proc0_init() after creating thread0 cpuset. */ void prison0_init(void) { uint8_t *file, *data; size_t size; prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset); prison0.pr_osreldate = osreldate; strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease)); /* If we have a preloaded hostuuid, use it. */ file = preload_search_by_type(PRISON0_HOSTUUID_MODULE); if (file != NULL) { data = preload_fetch_addr(file); size = preload_fetch_size(file); if (data != NULL) { /* * The preloaded data may include trailing whitespace, almost * certainly a newline; skip over any whitespace or * non-printable characters to be safe. */ while (size > 0 && data[size - 1] <= 0x20) { data[size--] = '\0'; } if (validate_uuid(data, size, NULL, 0) == 0) { (void)strlcpy(prison0.pr_hostuuid, data, size + 1); } else if (bootverbose) { printf("hostuuid: preload data malformed: '%s'", data); } } } if (bootverbose) printf("hostuuid: using %s\n", prison0.pr_hostuuid); } /* * struct jail_args { * struct jail *jail; * }; */ int sys_jail(struct thread *td, struct jail_args *uap) { uint32_t version; int error; struct jail j; error = copyin(uap->jail, &version, sizeof(uint32_t)); if (error) return (error); switch (version) { case 0: { struct jail_v0 j0; /* FreeBSD single IPv4 jails. */ bzero(&j, sizeof(struct jail)); error = copyin(uap->jail, &j0, sizeof(struct jail_v0)); if (error) return (error); j.version = j0.version; j.path = j0.path; j.hostname = j0.hostname; j.ip4s = htonl(j0.ip_number); /* jail_v0 is host order */ break; } case 1: /* * Version 1 was used by multi-IPv4 jail implementations * that never made it into the official kernel. */ return (EINVAL); case 2: /* JAIL_API_VERSION */ /* FreeBSD multi-IPv4/IPv6,noIP jails. */ error = copyin(uap->jail, &j, sizeof(struct jail)); if (error) return (error); break; default: /* Sci-Fi jails are not supported, sorry. 
*/ return (EINVAL); } return (kern_jail(td, &j)); } int kern_jail(struct thread *td, struct jail *j) { struct iovec optiov[2 * (4 + nitems(pr_flag_allow) #ifdef INET + 1 #endif #ifdef INET6 + 1 #endif )]; struct uio opt; char *u_path, *u_hostname, *u_name; struct bool_flags *bf; #ifdef INET uint32_t ip4s; struct in_addr *u_ip4; #endif #ifdef INET6 struct in6_addr *u_ip6; #endif size_t tmplen; int error, enforce_statfs; bzero(&optiov, sizeof(optiov)); opt.uio_iov = optiov; opt.uio_iovcnt = 0; opt.uio_offset = -1; opt.uio_resid = -1; opt.uio_segflg = UIO_SYSSPACE; opt.uio_rw = UIO_READ; opt.uio_td = td; /* Set permissions for top-level jails from sysctls. */ if (!jailed(td->td_ucred)) { for (bf = pr_flag_allow; bf < pr_flag_allow + nitems(pr_flag_allow) && atomic_load_int(&bf->flag) != 0; bf++) { optiov[opt.uio_iovcnt].iov_base = __DECONST(char *, (jail_default_allow & bf->flag) ? bf->name : bf->noname); optiov[opt.uio_iovcnt].iov_len = strlen(optiov[opt.uio_iovcnt].iov_base) + 1; opt.uio_iovcnt += 2; } optiov[opt.uio_iovcnt].iov_base = "enforce_statfs"; optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs"); opt.uio_iovcnt++; enforce_statfs = jail_default_enforce_statfs; optiov[opt.uio_iovcnt].iov_base = &enforce_statfs; optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs); opt.uio_iovcnt++; } tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN; #ifdef INET ip4s = (j->version == 0) ? 1 : j->ip4s; if (ip4s > jail_max_af_ips) return (EINVAL); tmplen += ip4s * sizeof(struct in_addr); #else if (j->ip4s > 0) return (EINVAL); #endif #ifdef INET6 if (j->ip6s > jail_max_af_ips) return (EINVAL); tmplen += j->ip6s * sizeof(struct in6_addr); #else if (j->ip6s > 0) return (EINVAL); #endif u_path = malloc(tmplen, M_TEMP, M_WAITOK); u_hostname = u_path + MAXPATHLEN; u_name = u_hostname + MAXHOSTNAMELEN; #ifdef INET u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN); #endif #ifdef INET6 #ifdef INET u_ip6 = (struct in6_addr *)(u_ip4 + ip4s); #else u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN); #endif #endif optiov[opt.uio_iovcnt].iov_base = "path"; optiov[opt.uio_iovcnt].iov_len = sizeof("path"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_path; error = copyinstr(j->path, u_path, MAXPATHLEN, &optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = "host.hostname"; optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_hostname; error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN, &optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; if (j->jailname != NULL) { optiov[opt.uio_iovcnt].iov_base = "name"; optiov[opt.uio_iovcnt].iov_len = sizeof("name"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_name; error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN, &optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; } #ifdef INET optiov[opt.uio_iovcnt].iov_base = "ip4.addr"; optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_ip4; optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr); if (j->version == 0) u_ip4->s_addr = j->ip4s; else { error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } } opt.uio_iovcnt++; #endif #ifdef INET6 optiov[opt.uio_iovcnt].iov_base = "ip6.addr"; optiov[opt.uio_iovcnt].iov_len = 
sizeof("ip6.addr"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_ip6; optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr); error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; #endif KASSERT(opt.uio_iovcnt <= nitems(optiov), ("kern_jail: too many iovecs (%d)", opt.uio_iovcnt)); error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH); free(u_path, M_TEMP); return (error); } /* * struct jail_set_args { * struct iovec *iovp; * unsigned int iovcnt; * int flags; * }; */ int sys_jail_set(struct thread *td, struct jail_set_args *uap) { struct uio *auio; int error; /* Check that we have an even number of iovecs. */ if (uap->iovcnt & 1) return (EINVAL); error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_jail_set(td, auio, uap->flags); free(auio, M_IOV); return (error); } int kern_jail_set(struct thread *td, struct uio *optuio, int flags) { struct nameidata nd; #ifdef INET struct in_addr *ip4; #endif #ifdef INET6 struct in6_addr *ip6; #endif struct vfsopt *opt; struct vfsoptlist *opts; struct prison *pr, *deadpr, *inspr, *mypr, *ppr, *tpr; struct vnode *root; char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid; char *g_path, *osrelstr; struct bool_flags *bf; struct jailsys_flags *jsf; #if defined(INET) || defined(INET6) struct prison *tppr; void *op; #endif unsigned long hid; size_t namelen, onamelen, pnamelen; int born, created, cuflags, descend, drflags, enforce; int error, errmsg_len, errmsg_pos; int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel; int jid, jsys, len, level; int childmax, osreldt, rsnum, slevel; #if defined(INET) || defined(INET6) int ii, ij; #endif #ifdef INET int ip4s, redo_ip4; #endif #ifdef INET6 int ip6s, redo_ip6; #endif uint64_t pr_allow, ch_allow, pr_flags, ch_flags; uint64_t pr_allow_diff; unsigned tallow; char numbuf[12]; error = priv_check(td, PRIV_JAIL_SET); if (!error && (flags & JAIL_ATTACH)) error = priv_check(td, PRIV_JAIL_ATTACH); if (error) return (error); mypr = td->td_ucred->cr_prison; if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0) return (EPERM); if (flags & ~JAIL_SET_MASK) return (EINVAL); /* * Check all the parameters before committing to anything. Not all * errors can be caught early, but we may as well try. Also, this * takes care of some expensive stuff (path lookup) before getting * the allprison lock. * * XXX Jails are not filesystems, and jail parameters are not mount * options. But it makes more sense to re-use the vfsopt code * than duplicate it under a different name. 
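*
* The visible consequence for callers is that jail_set(2) consumes
* name/value iovec pairs (hence the evenness check on iovcnt in
* sys_jail_set() above).  A minimal userland sketch, assuming just
* the documented jail_set(2) interface:
*
*   struct iovec iov[4] = {
*       { "name", sizeof("name") }, { "myjail", sizeof("myjail") },
*       { "persist", sizeof("persist") }, { NULL, 0 },
*   };
*   int jid = jail_set(iov, 4, JAIL_CREATE);
*
* where a boolean parameter like "persist" takes an empty value.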
*/ error = vfs_buildopts(optuio, &opts); if (error) return (error); #ifdef INET ip4 = NULL; #endif #ifdef INET6 ip6 = NULL; #endif g_path = NULL; cuflags = flags & (JAIL_CREATE | JAIL_UPDATE); if (!cuflags) { error = EINVAL; vfs_opterror(opts, "no valid operation (create or update)"); goto done_errmsg; } error = vfs_copyopt(opts, "jid", &jid, sizeof(jid)); if (error == ENOENT) jid = 0; else if (error != 0) goto done_free; error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel)); if (error == ENOENT) gotslevel = 0; else if (error != 0) goto done_free; else gotslevel = 1; error = vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax)); if (error == ENOENT) gotchildmax = 0; else if (error != 0) goto done_free; else gotchildmax = 1; error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce)); if (error == ENOENT) gotenforce = 0; else if (error != 0) goto done_free; else if (enforce < 0 || enforce > 2) { error = EINVAL; goto done_free; } else gotenforce = 1; error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum)); if (error == ENOENT) gotrsnum = 0; else if (error != 0) goto done_free; else gotrsnum = 1; pr_flags = ch_flags = 0; for (bf = pr_flag_bool; bf < pr_flag_bool + nitems(pr_flag_bool); bf++) { vfs_flagopt(opts, bf->name, &pr_flags, bf->flag); vfs_flagopt(opts, bf->noname, &ch_flags, bf->flag); } ch_flags |= pr_flags; for (jsf = pr_flag_jailsys; jsf < pr_flag_jailsys + nitems(pr_flag_jailsys); jsf++) { error = vfs_copyopt(opts, jsf->name, &jsys, sizeof(jsys)); if (error == ENOENT) continue; if (error != 0) goto done_free; switch (jsys) { case JAIL_SYS_DISABLE: if (!jsf->disable) { error = EINVAL; goto done_free; } pr_flags |= jsf->disable; break; case JAIL_SYS_NEW: pr_flags |= jsf->new; break; case JAIL_SYS_INHERIT: break; default: error = EINVAL; goto done_free; } ch_flags |= jsf->new | jsf->disable; } if ((flags & (JAIL_CREATE | JAIL_ATTACH)) == JAIL_CREATE && !(pr_flags & PR_PERSIST)) { error = EINVAL; vfs_opterror(opts, "new jail must persist or attach"); goto done_errmsg; } #ifdef VIMAGE if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) { error = EINVAL; vfs_opterror(opts, "vnet cannot be changed after creation"); goto done_errmsg; } #endif #ifdef INET if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) { error = EINVAL; vfs_opterror(opts, "ip4 cannot be changed after creation"); goto done_errmsg; } #endif #ifdef INET6 if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) { error = EINVAL; vfs_opterror(opts, "ip6 cannot be changed after creation"); goto done_errmsg; } #endif pr_allow = ch_allow = 0; for (bf = pr_flag_allow; bf < pr_flag_allow + nitems(pr_flag_allow) && atomic_load_int(&bf->flag) != 0; bf++) { vfs_flagopt(opts, bf->name, &pr_allow, bf->flag); vfs_flagopt(opts, bf->noname, &ch_allow, bf->flag); } ch_allow |= pr_allow; error = vfs_getopt(opts, "name", (void **)&name, &len); if (error == ENOENT) name = NULL; else if (error != 0) goto done_free; else { if (len == 0 || name[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > MAXHOSTNAMELEN) { error = ENAMETOOLONG; goto done_free; } } error = vfs_getopt(opts, "host.hostname", (void **)&host, &len); if (error == ENOENT) host = NULL; else if (error != 0) goto done_free; else { ch_flags |= PR_HOST; pr_flags |= PR_HOST; if (len == 0 || host[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > MAXHOSTNAMELEN) { error = ENAMETOOLONG; goto done_free; } } error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len); if (error == ENOENT) domain = NULL; else if 
(error != 0) goto done_free; else { ch_flags |= PR_HOST; pr_flags |= PR_HOST; if (len == 0 || domain[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > MAXHOSTNAMELEN) { error = ENAMETOOLONG; goto done_free; } } error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len); if (error == ENOENT) uuid = NULL; else if (error != 0) goto done_free; else { ch_flags |= PR_HOST; pr_flags |= PR_HOST; if (len == 0 || uuid[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > HOSTUUIDLEN) { error = ENAMETOOLONG; goto done_free; } } #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { uint32_t hid32; error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32)); hid = hid32; } else #endif error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid)); if (error == ENOENT) gothid = 0; else if (error != 0) goto done_free; else { gothid = 1; ch_flags |= PR_HOST; pr_flags |= PR_HOST; } #ifdef INET error = vfs_getopt(opts, "ip4.addr", &op, &ip4s); if (error == ENOENT) ip4s = 0; else if (error != 0) goto done_free; else if (ip4s & (sizeof(*ip4) - 1)) { error = EINVAL; goto done_free; } else { ch_flags |= PR_IP4_USER; pr_flags |= PR_IP4_USER; if (ip4s > 0) { ip4s /= sizeof(*ip4); if (ip4s > jail_max_af_ips) { error = EINVAL; vfs_opterror(opts, "too many IPv4 addresses"); goto done_errmsg; } ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK); bcopy(op, ip4, ip4s * sizeof(*ip4)); /* * IP addresses are all sorted but ip[0] to preserve * the primary IP address as given from userland. * This special IP is used for unbound outgoing * connections as well as for "loopback" traffic in case * source address selection cannot find a more fitting * address to connect from. */ if (ip4s > 1) qsort(ip4 + 1, ip4s - 1, sizeof(*ip4), prison_qcmp_v4); /* * Check for duplicate addresses and do some simple * zero and broadcast checks. If users give other bogus * addresses it is their problem. * * We do not have to care about byte order for these * checks so we will do them in NBO. 
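*
* (Concretely: INADDR_ANY is all-zero bits and INADDR_BROADCAST is
* all-one bits, so both patterns read the same in either byte order,
* and pure equality comparisons never depend on byte order at all.)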
*/ for (ii = 0; ii < ip4s; ii++) { if (ip4[ii].s_addr == INADDR_ANY || ip4[ii].s_addr == INADDR_BROADCAST) { error = EINVAL; goto done_free; } if ((ii+1) < ip4s && (ip4[0].s_addr == ip4[ii+1].s_addr || ip4[ii].s_addr == ip4[ii+1].s_addr)) { error = EINVAL; goto done_free; } } } } #endif #ifdef INET6 error = vfs_getopt(opts, "ip6.addr", &op, &ip6s); if (error == ENOENT) ip6s = 0; else if (error != 0) goto done_free; else if (ip6s & (sizeof(*ip6) - 1)) { error = EINVAL; goto done_free; } else { ch_flags |= PR_IP6_USER; pr_flags |= PR_IP6_USER; if (ip6s > 0) { ip6s /= sizeof(*ip6); if (ip6s > jail_max_af_ips) { error = EINVAL; vfs_opterror(opts, "too many IPv6 addresses"); goto done_errmsg; } ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK); bcopy(op, ip6, ip6s * sizeof(*ip6)); if (ip6s > 1) qsort(ip6 + 1, ip6s - 1, sizeof(*ip6), prison_qcmp_v6); for (ii = 0; ii < ip6s; ii++) { if (IN6_IS_ADDR_UNSPECIFIED(&ip6[ii])) { error = EINVAL; goto done_free; } if ((ii+1) < ip6s && (IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) || IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1]))) { error = EINVAL; goto done_free; } } } } #endif #if defined(VIMAGE) && (defined(INET) || defined(INET6)) if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) { error = EINVAL; vfs_opterror(opts, "vnet jails cannot have IP address restrictions"); goto done_errmsg; } #endif error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len); if (error == ENOENT) osrelstr = NULL; else if (error != 0) goto done_free; else { if (flags & JAIL_UPDATE) { error = EINVAL; vfs_opterror(opts, "osrelease cannot be changed after creation"); goto done_errmsg; } if (len == 0 || osrelstr[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len >= OSRELEASELEN) { error = ENAMETOOLONG; vfs_opterror(opts, "osrelease string must be 1-%d bytes long", OSRELEASELEN - 1); goto done_errmsg; } } error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt)); if (error == ENOENT) osreldt = 0; else if (error != 0) goto done_free; else { if (flags & JAIL_UPDATE) { error = EINVAL; vfs_opterror(opts, "osreldate cannot be changed after creation"); goto done_errmsg; } if (osreldt == 0) { error = EINVAL; vfs_opterror(opts, "osreldate cannot be 0"); goto done_errmsg; } } root = NULL; error = vfs_getopt(opts, "path", (void **)&path, &len); if (error == ENOENT) path = NULL; else if (error != 0) goto done_free; else { if (flags & JAIL_UPDATE) { error = EINVAL; vfs_opterror(opts, "path cannot be changed after creation"); goto done_errmsg; } if (len == 0 || path[len - 1] != '\0') { error = EINVAL; goto done_free; } NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path, td); error = namei(&nd); if (error) goto done_free; root = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); strlcpy(g_path, path, MAXPATHLEN); error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN); if (error == 0) { path = g_path; } else { /* exit on other errors */ goto done_free; } if (root->v_type != VDIR) { error = ENOTDIR; vput(root); goto done_free; } VOP_UNLOCK(root); } /* * Find the specified jail, or at least its parent. * This abuses the file error codes ENOENT and EEXIST. */ pr = NULL; inspr = NULL; if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) { namelc = strrchr(name, '.'); jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10); if (*p != '\0') jid = 0; } sx_xlock(&allprison_lock); drflags = PD_LIST_XLOCKED; ppr = mypr; if (!prison_isalive(ppr)) { /* This jail is dying. This process will surely follow. 
*/ error = EAGAIN; goto done_deref; } if (jid != 0) { if (jid < 0) { error = EINVAL; vfs_opterror(opts, "negative jid"); goto done_deref; } /* * See if a requested jid already exists. Keep track of * where it can be inserted later. */ TAILQ_FOREACH(inspr, &allprison, pr_list) { if (inspr->pr_id < jid) continue; if (inspr->pr_id > jid) break; pr = inspr; mtx_lock(&pr->pr_mtx); drflags |= PD_LOCKED; inspr = NULL; break; } if (pr != NULL) { /* Create: jid must not exist. */ if (cuflags == JAIL_CREATE) { /* * Even creators that cannot see the jail will * get EEXIST. */ error = EEXIST; vfs_opterror(opts, "jail %d already exists", jid); goto done_deref; } if (!prison_ischild(mypr, pr)) { /* * Updaters get ENOENT if they cannot see the * jail. This is true even for CREATE | UPDATE, * which normally cannot give this error. */ error = ENOENT; vfs_opterror(opts, "jail %d not found", jid); goto done_deref; } ppr = pr->pr_parent; if (!prison_isalive(ppr)) { error = ENOENT; vfs_opterror(opts, "jail %d is dying", ppr->pr_id); goto done_deref; } if (!prison_isalive(pr)) { if (!(flags & JAIL_DYING)) { error = ENOENT; vfs_opterror(opts, "jail %d is dying", jid); goto done_deref; } if ((flags & JAIL_ATTACH) || (pr_flags & PR_PERSIST)) { /* * A dying jail might be resurrected * (via attach or persist), but first * it must determine if another jail * has claimed its name. Accomplish * this by implicitly re-setting the * name. */ if (name == NULL) name = prison_name(mypr, pr); } } } else { /* Update: jid must exist. */ if (cuflags == JAIL_UPDATE) { error = ENOENT; vfs_opterror(opts, "jail %d not found", jid); goto done_deref; } } } /* * If the caller provided a name, look for a jail by that name. * This has different semantics for creates and updates keyed by jid * (where the name must not already exist in a different jail), * and updates keyed by the name itself (where the name must exist * because that is the jail being updated). */ namelc = NULL; if (name != NULL) { namelc = strrchr(name, '.'); if (namelc == NULL) namelc = name; else { /* * This is a hierarchical name. Split it into the * parent and child names, and make sure the parent * exists or matches an already found jail. */ if (pr != NULL) { if (strncmp(name, ppr->pr_name, namelc - name) || ppr->pr_name[namelc - name] != '\0') { error = EINVAL; vfs_opterror(opts, "cannot change jail's parent"); goto done_deref; } } else { *namelc = '\0'; ppr = prison_find_name(mypr, name); if (ppr == NULL) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); goto done_deref; } mtx_unlock(&ppr->pr_mtx); if (!prison_isalive(ppr)) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" is dying", name); goto done_deref; } *namelc = '.'; } namelc++; } if (namelc[0] != '\0') { pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1; deadpr = NULL; FOREACH_PRISON_CHILD(ppr, tpr) { if (tpr != pr && !strcmp(tpr->pr_name + pnamelen, namelc)) { if (prison_isalive(tpr)) { if (pr == NULL && cuflags != JAIL_CREATE) { /* * Use this jail * for updates. */ pr = tpr; mtx_lock(&pr->pr_mtx); drflags |= PD_LOCKED; break; } /* * Create, or update(jid): * name must not exist in an * active sibling jail. */ error = EEXIST; vfs_opterror(opts, "jail \"%s\" already exists", name); goto done_deref; } if (pr == NULL && cuflags != JAIL_CREATE) { deadpr = tpr; } } } /* If no active jail is found, use a dying one. 
*/ if (deadpr != NULL && pr == NULL) { if (flags & JAIL_DYING) { pr = deadpr; mtx_lock(&pr->pr_mtx); drflags |= PD_LOCKED; } else if (cuflags == JAIL_UPDATE) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" is dying", name); goto done_deref; } } /* Update: name must exist if no jid. */ else if (cuflags == JAIL_UPDATE && pr == NULL) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); goto done_deref; } } } /* Update: must provide a jid or name. */ else if (cuflags == JAIL_UPDATE && pr == NULL) { error = ENOENT; vfs_opterror(opts, "update specified no jail"); goto done_deref; } /* If there's no prison to update, create a new one and link it in. */ created = pr == NULL; if (created) { for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent) if (tpr->pr_childcount >= tpr->pr_childmax) { error = EPERM; vfs_opterror(opts, "prison limit exceeded"); goto done_deref; } if (jid == 0 && (jid = get_next_prid(&inspr)) == 0) { error = EAGAIN; vfs_opterror(opts, "no available jail IDs"); goto done_deref; } pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO); pr->pr_state = PRISON_STATE_INVALID; refcount_init(&pr->pr_ref, 1); refcount_init(&pr->pr_uref, 0); drflags |= PD_DEREF; LIST_INIT(&pr->pr_children); mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK); TASK_INIT(&pr->pr_task, 0, prison_complete, pr); pr->pr_id = jid; if (inspr != NULL) TAILQ_INSERT_BEFORE(inspr, pr, pr_list); else TAILQ_INSERT_TAIL(&allprison, pr, pr_list); pr->pr_parent = ppr; prison_hold(ppr); prison_proc_hold(ppr); LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling); for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent) tpr->pr_childcount++; /* Set some default values, and inherit some from the parent. */ if (namelc == NULL) namelc = ""; if (path == NULL) { path = "/"; root = mypr->pr_root; vref(root); } strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN); pr->pr_flags |= PR_HOST; #if defined(INET) || defined(INET6) #ifdef VIMAGE if (!(pr_flags & PR_VNET)) #endif { #ifdef INET if (!(ch_flags & PR_IP4_USER)) pr->pr_flags |= PR_IP4 | PR_IP4_USER; else if (!(pr_flags & PR_IP4_USER)) { pr->pr_flags |= ppr->pr_flags & PR_IP4; if (ppr->pr_ip4 != NULL) { pr->pr_ip4s = ppr->pr_ip4s; pr->pr_ip4 = malloc(pr->pr_ip4s * sizeof(struct in_addr), M_PRISON, M_WAITOK); bcopy(ppr->pr_ip4, pr->pr_ip4, pr->pr_ip4s * sizeof(*pr->pr_ip4)); } } #endif #ifdef INET6 if (!(ch_flags & PR_IP6_USER)) pr->pr_flags |= PR_IP6 | PR_IP6_USER; else if (!(pr_flags & PR_IP6_USER)) { pr->pr_flags |= ppr->pr_flags & PR_IP6; if (ppr->pr_ip6 != NULL) { pr->pr_ip6s = ppr->pr_ip6s; pr->pr_ip6 = malloc(pr->pr_ip6s * sizeof(struct in6_addr), M_PRISON, M_WAITOK); bcopy(ppr->pr_ip6, pr->pr_ip6, pr->pr_ip6s * sizeof(*pr->pr_ip6)); } } #endif } #endif /* Source address selection is always on by default. */ pr->pr_flags |= _PR_IP_SADDRSEL; pr->pr_securelevel = ppr->pr_securelevel; pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow; pr->pr_enforce_statfs = jail_default_enforce_statfs; pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum; pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate; if (osrelstr == NULL) strlcpy(pr->pr_osrelease, ppr->pr_osrelease, sizeof(pr->pr_osrelease)); else strlcpy(pr->pr_osrelease, osrelstr, sizeof(pr->pr_osrelease)); #ifdef VIMAGE /* Allocate a new vnet if specified. */ pr->pr_vnet = (pr_flags & PR_VNET) ? vnet_alloc() : ppr->pr_vnet; #endif /* * Allocate a dedicated cpuset for each jail. * Unlike other initial settings, this may return an error. 
*/ error = cpuset_create_root(ppr, &pr->pr_cpuset); if (error) goto done_deref; mtx_lock(&pr->pr_mtx); drflags |= PD_LOCKED; } else { /* * Grab a reference for existing prisons, to ensure they * continue to exist for the duration of the call. */ prison_hold(pr); drflags |= PD_DEREF; #if defined(VIMAGE) && (defined(INET) || defined(INET6)) if ((pr->pr_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) { error = EINVAL; vfs_opterror(opts, "vnet jails cannot have IP address restrictions"); goto done_deref; } #endif #ifdef INET if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) { error = EINVAL; vfs_opterror(opts, "ip4 cannot be changed after creation"); goto done_deref; } #endif #ifdef INET6 if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) { error = EINVAL; vfs_opterror(opts, "ip6 cannot be changed after creation"); goto done_deref; } #endif } /* Do final error checking before setting anything. */ if (gotslevel) { if (slevel < ppr->pr_securelevel) { error = EPERM; goto done_deref; } } if (gotchildmax) { if (childmax >= ppr->pr_childmax) { error = EPERM; goto done_deref; } } if (gotenforce) { if (enforce < ppr->pr_enforce_statfs) { error = EPERM; goto done_deref; } } if (gotrsnum) { /* * devfs_rsnum is a uint16_t */ if (rsnum < 0 || rsnum > 65535) { error = EINVAL; goto done_deref; } /* * Nested jails always inherit parent's devfs ruleset */ if (jailed(td->td_ucred)) { if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) { error = EPERM; goto done_deref; } else rsnum = ppr->pr_devfs_rsnum; } } #ifdef INET if (ip4s > 0) { if (ppr->pr_flags & PR_IP4) { /* * Make sure the new set of IP addresses is a * subset of the parent's list. Don't worry * about the parent being unlocked, as any * setting is done with allprison_lock held. */ for (ij = 0; ij < ppr->pr_ip4s; ij++) if (ip4[0].s_addr == ppr->pr_ip4[ij].s_addr) break; if (ij == ppr->pr_ip4s) { error = EPERM; goto done_deref; } if (ip4s > 1) { for (ii = ij = 1; ii < ip4s; ii++) { if (ip4[ii].s_addr == ppr->pr_ip4[0].s_addr) continue; for (; ij < ppr->pr_ip4s; ij++) if (ip4[ii].s_addr == ppr->pr_ip4[ij].s_addr) break; if (ij == ppr->pr_ip4s) break; } if (ij == ppr->pr_ip4s) { error = EPERM; goto done_deref; } } } /* * Check for conflicting IP addresses. We permit them * if there is no more than one IP on each jail. If * there is a duplicate on a jail with more than one * IP, stop checking and return an error. */ #ifdef VIMAGE for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent) if (tppr->pr_flags & PR_VNET) break; #else tppr = &prison0; #endif FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) { if (tpr == pr || #ifdef VIMAGE (tpr != tppr && (tpr->pr_flags & PR_VNET)) || #endif !prison_isalive(tpr)) { descend = 0; continue; } if (!(tpr->pr_flags & PR_IP4_USER)) continue; descend = 0; if (tpr->pr_ip4 == NULL || (ip4s == 1 && tpr->pr_ip4s == 1)) continue; for (ii = 0; ii < ip4s; ii++) { if (prison_check_ip4_locked(tpr, &ip4[ii]) == 0) { error = EADDRINUSE; vfs_opterror(opts, "IPv4 addresses clash"); goto done_deref; } } } } #endif #ifdef INET6 if (ip6s > 0) { if (ppr->pr_flags & PR_IP6) { /* * Make sure the new set of IP addresses is a * subset of the parent's list.
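* Both lists keep the primary address first and the remainder sorted, so a single forward pass of the ij cursor suffices; e.g. a parent list {A, B, D} accepts a child list {B, D} but rejects {B, C}.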
*/ for (ij = 0; ij < ppr->pr_ip6s; ij++) if (IN6_ARE_ADDR_EQUAL(&ip6[0], &ppr->pr_ip6[ij])) break; if (ij == ppr->pr_ip6s) { error = EPERM; goto done_deref; } if (ip6s > 1) { for (ii = ij = 1; ii < ip6s; ii++) { if (IN6_ARE_ADDR_EQUAL(&ip6[ii], &ppr->pr_ip6[0])) continue; for (; ij < ppr->pr_ip6s; ij++) if (IN6_ARE_ADDR_EQUAL( &ip6[ii], &ppr->pr_ip6[ij])) break; if (ij == ppr->pr_ip6s) break; } if (ij == ppr->pr_ip6s) { error = EPERM; goto done_deref; } } } /* Check for conflicting IP addresses. */ #ifdef VIMAGE for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent) if (tppr->pr_flags & PR_VNET) break; #else tppr = &prison0; #endif FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) { if (tpr == pr || #ifdef VIMAGE (tpr != tppr && (tpr->pr_flags & PR_VNET)) || #endif !prison_isalive(tpr)) { descend = 0; continue; } if (!(tpr->pr_flags & PR_IP6_USER)) continue; descend = 0; if (tpr->pr_ip6 == NULL || (ip6s == 1 && tpr->pr_ip6s == 1)) continue; for (ii = 0; ii < ip6s; ii++) { if (prison_check_ip6_locked(tpr, &ip6[ii]) == 0) { error = EADDRINUSE; vfs_opterror(opts, "IPv6 addresses clash"); goto done_deref; } } } } #endif onamelen = namelen = 0; if (namelc != NULL) { /* Give a default name of the jid. Also allow the name to be * explicitly the jid - but not any other number, and only in * normal form (no leading zero/etc). */ if (namelc[0] == '\0') snprintf(namelc = numbuf, sizeof(numbuf), "%d", jid); else if ((strtoul(namelc, &p, 10) != jid || namelc[0] < '1' || namelc[0] > '9') && *p == '\0') { error = EINVAL; vfs_opterror(opts, "name cannot be numeric (unless it is the jid)"); goto done_deref; } /* * Make sure the name isn't too long for the prison or its * children. */ pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1; onamelen = strlen(pr->pr_name + pnamelen); namelen = strlen(namelc); if (pnamelen + namelen + 1 > sizeof(pr->pr_name)) { error = ENAMETOOLONG; goto done_deref; } FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { if (strlen(tpr->pr_name) + (namelen - onamelen) >= sizeof(pr->pr_name)) { error = ENAMETOOLONG; goto done_deref; } } } pr_allow_diff = pr_allow & ~ppr->pr_allow; if (pr_allow_diff & ~PR_ALLOW_DIFFERENCES) { error = EPERM; goto done_deref; } /* * Let modules check their parameters. This requires unlocking and * then re-locking the prison, but this is still a valid state as long * as allprison_lock remains xlocked. */ mtx_unlock(&pr->pr_mtx); drflags &= ~PD_LOCKED; error = osd_jail_call(pr, PR_METHOD_CHECK, opts); if (error != 0) goto done_deref; mtx_lock(&pr->pr_mtx); drflags |= PD_LOCKED; /* At this point, all valid parameters should have been noted. */ TAILQ_FOREACH(opt, opts, link) { if (!opt->seen && strcmp(opt->name, "errmsg")) { error = EINVAL; vfs_opterror(opts, "unknown parameter: %s", opt->name); goto done_deref; } } /* Set the parameters of the prison. 
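* For reference, a minimal userland sketch of driving this path with jail_set(2) (illustrative only; error handling is omitted and the name and path values are made up):
*
*	struct iovec iov[6];
*	int jid;
*
*	iov[0].iov_base = "name";    iov[0].iov_len = sizeof("name");
*	iov[1].iov_base = "myjail";  iov[1].iov_len = sizeof("myjail");
*	iov[2].iov_base = "path";    iov[2].iov_len = sizeof("path");
*	iov[3].iov_base = "/";       iov[3].iov_len = sizeof("/");
*	iov[4].iov_base = "persist"; iov[4].iov_len = sizeof("persist");
*	iov[5].iov_base = NULL;      iov[5].iov_len = 0;
*	jid = jail_set(iov, 6, JAIL_CREATE);
*
* The string lengths include the terminating NUL, and a boolean parameter like "persist" takes a NULL value of zero length.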
*/ #ifdef INET redo_ip4 = 0; if (pr_flags & PR_IP4_USER) { pr->pr_flags |= PR_IP4; free(pr->pr_ip4, M_PRISON); pr->pr_ip4s = ip4s; pr->pr_ip4 = ip4; ip4 = NULL; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (prison_restrict_ip4(tpr, NULL)) { redo_ip4 = 1; descend = 0; } } } #endif #ifdef INET6 redo_ip6 = 0; if (pr_flags & PR_IP6_USER) { pr->pr_flags |= PR_IP6; free(pr->pr_ip6, M_PRISON); pr->pr_ip6s = ip6s; pr->pr_ip6 = ip6; ip6 = NULL; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (prison_restrict_ip6(tpr, NULL)) { redo_ip6 = 1; descend = 0; } } } #endif if (gotslevel) { pr->pr_securelevel = slevel; /* Set all child jails to be at least this level. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) if (tpr->pr_securelevel < slevel) tpr->pr_securelevel = slevel; } if (gotchildmax) { pr->pr_childmax = childmax; /* Set all child jails to under this limit. */ FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level) if (tpr->pr_childmax > childmax - level) tpr->pr_childmax = childmax > level ? childmax - level : 0; } if (gotenforce) { pr->pr_enforce_statfs = enforce; /* Pass this restriction on to the children. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) if (tpr->pr_enforce_statfs < enforce) tpr->pr_enforce_statfs = enforce; } if (gotrsnum) { pr->pr_devfs_rsnum = rsnum; /* Pass this restriction on to the children. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) tpr->pr_devfs_rsnum = rsnum; } if (namelc != NULL) { if (ppr == &prison0) strlcpy(pr->pr_name, namelc, sizeof(pr->pr_name)); else snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s", ppr->pr_name, namelc); /* Change this component of child names. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen, strlen(tpr->pr_name + onamelen) + 1); bcopy(pr->pr_name, tpr->pr_name, namelen); } } if (path != NULL) { /* Try to keep a real-rooted full pathname. */ strlcpy(pr->pr_path, path, sizeof(pr->pr_path)); pr->pr_root = root; root = NULL; } if (PR_HOST & ch_flags & ~pr_flags) { if (pr->pr_flags & PR_HOST) { /* * Copy the parent's host info. As with pr_ip4 above, * the lack of a lock on the parent is not a problem; * it is always set with allprison_lock at least * shared, and is held exclusively here. */ strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname, sizeof(pr->pr_hostname)); strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname, sizeof(pr->pr_domainname)); strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid, sizeof(pr->pr_hostuuid)); pr->pr_hostid = pr->pr_parent->pr_hostid; } } else if (host != NULL || domain != NULL || uuid != NULL || gothid) { /* Set this prison, and any descendants without PR_HOST. 
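* (Descendants that set their own host info carry PR_HOST; clearing descend below prunes each such subtree from the update.)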
*/ if (host != NULL) strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname)); if (domain != NULL) strlcpy(pr->pr_domainname, domain, sizeof(pr->pr_domainname)); if (uuid != NULL) strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid)); if (gothid) pr->pr_hostid = hid; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { if (tpr->pr_flags & PR_HOST) descend = 0; else { if (host != NULL) strlcpy(tpr->pr_hostname, pr->pr_hostname, sizeof(tpr->pr_hostname)); if (domain != NULL) strlcpy(tpr->pr_domainname, pr->pr_domainname, sizeof(tpr->pr_domainname)); if (uuid != NULL) strlcpy(tpr->pr_hostuuid, pr->pr_hostuuid, sizeof(tpr->pr_hostuuid)); if (gothid) tpr->pr_hostid = hid; } } } pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow; if ((tallow = ch_allow & ~pr_allow)) prison_set_allow_locked(pr, tallow, 0); /* * Persistent prisons get an extra reference, and prisons losing their * persist flag lose that reference. */ born = !prison_isalive(pr); if (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags)) { if (pr_flags & PR_PERSIST) { prison_hold(pr); /* * This may make a dead prison alive again, but wait * to label it as such until after OSD calls have had * a chance to run (and perhaps to fail). */ refcount_acquire(&pr->pr_uref); } else { drflags |= PD_DEUREF; prison_free_not_last(pr); } } pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags; mtx_unlock(&pr->pr_mtx); drflags &= ~PD_LOCKED; /* * Any errors past this point will need to de-persist newly created * prisons, as well as call remove methods. */ if (born) drflags |= PD_KILL; #ifdef RACCT if (racct_enable && created) prison_racct_attach(pr); #endif /* Locks may have prevented a complete restriction of child IP * addresses. If so, allocate some more memory and try again. */ #ifdef INET while (redo_ip4) { ip4s = pr->pr_ip4s; ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK); mtx_lock(&pr->pr_mtx); redo_ip4 = 0; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (prison_restrict_ip4(tpr, ip4)) { if (ip4 != NULL) ip4 = NULL; else redo_ip4 = 1; } } mtx_unlock(&pr->pr_mtx); } #endif #ifdef INET6 while (redo_ip6) { ip6s = pr->pr_ip6s; ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK); mtx_lock(&pr->pr_mtx); redo_ip6 = 0; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (prison_restrict_ip6(tpr, ip6)) { if (ip6 != NULL) ip6 = NULL; else redo_ip6 = 1; } } mtx_unlock(&pr->pr_mtx); } #endif /* Let the modules do their work. */ if (born) { error = osd_jail_call(pr, PR_METHOD_CREATE, opts); if (error) goto done_deref; } error = osd_jail_call(pr, PR_METHOD_SET, opts); if (error) goto done_deref; /* * A new prison is now ready to be seen; either it has gained a user * reference via persistence, or is about to gain one via attachment. */ if (born) { drflags = prison_lock_xlock(pr, drflags); pr->pr_state = PRISON_STATE_ALIVE; } /* Attach this process to the prison if requested. 
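* Passing JAIL_CREATE | JAIL_ATTACH to jail_set(2) reaches this code, creating and entering the jail in a single system call rather than via a separate jail_attach(2) step.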
*/ if (flags & JAIL_ATTACH) { error = do_jail_attach(td, pr, prison_lock_xlock(pr, drflags & PD_LOCK_FLAGS)); drflags &= ~(PD_LOCKED | PD_LIST_XLOCKED); if (error) { vfs_opterror(opts, "attach failed"); goto done_deref; } } #ifdef RACCT if (racct_enable && !created) { if (drflags & PD_LOCKED) { mtx_unlock(&pr->pr_mtx); drflags &= ~PD_LOCKED; } if (drflags & PD_LIST_XLOCKED) { sx_xunlock(&allprison_lock); drflags &= ~PD_LIST_XLOCKED; } prison_racct_modify(pr); } #endif drflags &= ~PD_KILL; td->td_retval[0] = pr->pr_id; done_deref: /* Release any temporary prison holds and/or locks. */ if (pr != NULL) prison_deref(pr, drflags); else if (drflags & PD_LIST_SLOCKED) sx_sunlock(&allprison_lock); else if (drflags & PD_LIST_XLOCKED) sx_xunlock(&allprison_lock); if (root != NULL) vrele(root); done_errmsg: if (error) { /* Write the error message back to userspace. */ if (vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len) == 0 && errmsg_len > 0) { errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1; if (optuio->uio_segflg == UIO_SYSSPACE) bcopy(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); else copyout(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); } } done_free: #ifdef INET free(ip4, M_PRISON); #endif #ifdef INET6 free(ip6, M_PRISON); #endif if (g_path != NULL) free(g_path, M_TEMP); vfs_freeopts(opts); return (error); } /* * Find the next available prison ID. Return the ID on success, or zero * on failure. Also set a pointer to the allprison list entry the prison * should be inserted before. */ static int get_next_prid(struct prison **insprp) { struct prison *inspr; int jid, maxid; jid = lastprid % JAIL_MAX + 1; if (TAILQ_EMPTY(&allprison) || TAILQ_LAST(&allprison, prisonlist)->pr_id < jid) { /* * A common case is for all jails to be implicitly numbered, * which means they'll go on the end of the list, at least * for the first JAIL_MAX times. */ inspr = NULL; } else { /* * Take two passes through the allprison list: first starting * with the proposed jid, then ending with it. */ for (maxid = JAIL_MAX; maxid != 0; ) { TAILQ_FOREACH(inspr, &allprison, pr_list) { if (inspr->pr_id < jid) continue; if (inspr->pr_id > jid) { /* Found an opening. */ maxid = 0; break; } if (++jid > maxid) { if (lastprid == maxid || lastprid == 0) { /* * The entire legal range * has been traversed */ return 0; } /* Try again from the start. */ jid = 1; maxid = lastprid; break; } } if (inspr == NULL) { /* Found room at the end of the list. */ break; } } } *insprp = inspr; lastprid = jid; return (jid); } /* * struct jail_get_args { * struct iovec *iovp; * unsigned int iovcnt; * int flags; * }; */ int sys_jail_get(struct thread *td, struct jail_get_args *uap) { struct uio *auio; int error; /* Check that we have an even number of iovecs. */ if (uap->iovcnt & 1) return (EINVAL); error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_jail_get(td, auio, uap->flags); if (error == 0) error = copyout(auio->uio_iov, uap->iovp, uap->iovcnt * sizeof (struct iovec)); free(auio, M_IOV); return (error); } int kern_jail_get(struct thread *td, struct uio *optuio, int flags) { struct bool_flags *bf; struct jailsys_flags *jsf; struct prison *pr, *mypr; struct vfsopt *opt; struct vfsoptlist *opts; char *errmsg, *name; int drflags, error, errmsg_len, errmsg_pos, i, jid, len, pos; unsigned f; if (flags & ~JAIL_GET_MASK) return (EINVAL); /* Get the parameter list. 
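* The "lastjid" parameter handled below supports the usual enumeration idiom from userland, sketched here (jls(8) works along these lines):
*
*	struct iovec iov[2];
*	int jid, lastjid = 0;
*
*	for (;;) {
*		iov[0].iov_base = "lastjid";
*		iov[0].iov_len = sizeof("lastjid");
*		iov[1].iov_base = &lastjid;
*		iov[1].iov_len = sizeof(lastjid);
*		jid = jail_get(iov, 2, 0);
*		if (jid == -1)
*			break;		/* ENOENT once every jail was seen */
*		lastjid = jid;
*	}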
*/ error = vfs_buildopts(optuio, &opts); if (error) return (error); errmsg_pos = vfs_getopt_pos(opts, "errmsg"); mypr = td->td_ucred->cr_prison; pr = NULL; /* * Find the prison specified by one of: lastjid, jid, name. */ sx_slock(&allprison_lock); drflags = PD_LIST_SLOCKED; error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid)); if (error == 0) { TAILQ_FOREACH(pr, &allprison, pr_list) { if (pr->pr_id > jid && ((flags & JAIL_DYING) || prison_isalive(pr)) && prison_ischild(mypr, pr)) { mtx_lock(&pr->pr_mtx); drflags |= PD_LOCKED; goto found_prison; } } error = ENOENT; vfs_opterror(opts, "no jail after %d", jid); goto done; } else if (error != ENOENT) goto done; error = vfs_copyopt(opts, "jid", &jid, sizeof(jid)); if (error == 0) { if (jid != 0) { pr = prison_find_child(mypr, jid); if (pr != NULL) { drflags |= PD_LOCKED; if (!(prison_isalive(pr) || (flags & JAIL_DYING))) { error = ENOENT; vfs_opterror(opts, "jail %d is dying", jid); goto done; } goto found_prison; } error = ENOENT; vfs_opterror(opts, "jail %d not found", jid); goto done; } } else if (error != ENOENT) goto done; error = vfs_getopt(opts, "name", (void **)&name, &len); if (error == 0) { if (len == 0 || name[len - 1] != '\0') { error = EINVAL; goto done; } pr = prison_find_name(mypr, name); if (pr != NULL) { drflags |= PD_LOCKED; if (!(prison_isalive(pr) || (flags & JAIL_DYING))) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" is dying", name); goto done; } goto found_prison; } error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); goto done; } else if (error != ENOENT) goto done; vfs_opterror(opts, "no jail specified"); error = ENOENT; goto done; found_prison: /* Get the parameters of the prison. */ prison_hold(pr); drflags |= PD_DEREF; td->td_retval[0] = pr->pr_id; error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id)); if (error != 0 && error != ENOENT) goto done; i = (pr->pr_parent == mypr) ? 
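/* The parent is reported as jid 0 when it is the caller's own prison, matching the "jails see themselves as 0" convention of prison_name(). */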
0 : pr->pr_parent->pr_id; error = vfs_setopt(opts, "parent", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopts(opts, "name", prison_name(mypr, pr)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id, sizeof(pr->pr_cpuset->cs_id)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopts(opts, "path", prison_path(mypr, pr)); if (error != 0 && error != ENOENT) goto done; #ifdef INET error = vfs_setopt_part(opts, "ip4.addr", pr->pr_ip4, pr->pr_ip4s * sizeof(*pr->pr_ip4)); if (error != 0 && error != ENOENT) goto done; #endif #ifdef INET6 error = vfs_setopt_part(opts, "ip6.addr", pr->pr_ip6, pr->pr_ip6s * sizeof(*pr->pr_ip6)); if (error != 0 && error != ENOENT) goto done; #endif error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel, sizeof(pr->pr_securelevel)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopt(opts, "children.cur", &pr->pr_childcount, sizeof(pr->pr_childcount)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopt(opts, "children.max", &pr->pr_childmax, sizeof(pr->pr_childmax)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopts(opts, "host.hostname", pr->pr_hostname); if (error != 0 && error != ENOENT) goto done; error = vfs_setopts(opts, "host.domainname", pr->pr_domainname); if (error != 0 && error != ENOENT) goto done; error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid); if (error != 0 && error != ENOENT) goto done; #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { uint32_t hid32 = pr->pr_hostid; error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32)); } else #endif error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid, sizeof(pr->pr_hostid)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs, sizeof(pr->pr_enforce_statfs)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum, sizeof(pr->pr_devfs_rsnum)); if (error != 0 && error != ENOENT) goto done; for (bf = pr_flag_bool; bf < pr_flag_bool + nitems(pr_flag_bool); bf++) { i = (pr->pr_flags & bf->flag) ? 1 : 0; error = vfs_setopt(opts, bf->name, &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; i = !i; error = vfs_setopt(opts, bf->noname, &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; } for (jsf = pr_flag_jailsys; jsf < pr_flag_jailsys + nitems(pr_flag_jailsys); jsf++) { f = pr->pr_flags & (jsf->disable | jsf->new); i = (f != 0 && f == jsf->disable) ? JAIL_SYS_DISABLE : (f == jsf->new) ? JAIL_SYS_NEW : JAIL_SYS_INHERIT; error = vfs_setopt(opts, jsf->name, &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; } for (bf = pr_flag_allow; bf < pr_flag_allow + nitems(pr_flag_allow) && atomic_load_int(&bf->flag) != 0; bf++) { i = (pr->pr_allow & bf->flag) ? 
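/* Each boolean flag is reported twice: once under its own name and once inverted under its "no" counterpart. */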
1 : 0; error = vfs_setopt(opts, bf->name, &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; i = !i; error = vfs_setopt(opts, bf->noname, &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; } i = !prison_isalive(pr); error = vfs_setopt(opts, "dying", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; i = !i; error = vfs_setopt(opts, "nodying", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate, sizeof(pr->pr_osreldate)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopts(opts, "osrelease", pr->pr_osrelease); if (error != 0 && error != ENOENT) goto done; /* Get the module parameters. */ mtx_unlock(&pr->pr_mtx); drflags &= ~PD_LOCKED; error = osd_jail_call(pr, PR_METHOD_GET, opts); if (error) goto done; prison_deref(pr, drflags); pr = NULL; drflags = 0; /* By now, all parameters should have been noted. */ TAILQ_FOREACH(opt, opts, link) { if (!opt->seen && strcmp(opt->name, "errmsg")) { error = EINVAL; vfs_opterror(opts, "unknown parameter: %s", opt->name); goto done; } } /* Write the fetched parameters back to userspace. */ error = 0; TAILQ_FOREACH(opt, opts, link) { if (opt->pos >= 0 && opt->pos != errmsg_pos) { pos = 2 * opt->pos + 1; optuio->uio_iov[pos].iov_len = opt->len; if (opt->value != NULL) { if (optuio->uio_segflg == UIO_SYSSPACE) { bcopy(opt->value, optuio->uio_iov[pos].iov_base, opt->len); } else { error = copyout(opt->value, optuio->uio_iov[pos].iov_base, opt->len); if (error) break; } } } } done: /* Release any temporary prison holds and/or locks. */ if (pr != NULL) prison_deref(pr, drflags); else if (drflags & PD_LIST_SLOCKED) sx_sunlock(&allprison_lock); if (error && errmsg_pos >= 0) { /* Write the error message back to userspace. */ vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len); errmsg_pos = 2 * errmsg_pos + 1; if (errmsg_len > 0) { if (optuio->uio_segflg == UIO_SYSSPACE) bcopy(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); else copyout(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); } } vfs_freeopts(opts); return (error); } /* * struct jail_remove_args { * int jid; * }; */ int sys_jail_remove(struct thread *td, struct jail_remove_args *uap) { struct prison *pr; int error; error = priv_check(td, PRIV_JAIL_REMOVE); if (error) return (error); sx_xlock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, uap->jid); if (pr == NULL) { sx_xunlock(&allprison_lock); return (EINVAL); } if (!prison_isalive(pr)) { /* Silently ignore already-dying prisons. */ mtx_unlock(&pr->pr_mtx); sx_xunlock(&allprison_lock); return (0); } prison_deref(pr, PD_KILL | PD_LOCKED | PD_LIST_XLOCKED); return (0); } /* * struct jail_attach_args { * int jid; * }; */ int sys_jail_attach(struct thread *td, struct jail_attach_args *uap) { struct prison *pr; int error; error = priv_check(td, PRIV_JAIL_ATTACH); if (error) return (error); sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, uap->jid); if (pr == NULL) { sx_sunlock(&allprison_lock); return (EINVAL); } /* Do not allow a process to attach to a prison that is not alive. 
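* From userland, the attach step alone is simply (sketch):
*
*	if (jail_attach(jid) == -1)
*		err(1, "jail_attach");
*
* after which the process is chrooted to the prison's root and its credential points at the target prison.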
*/ if (!prison_isalive(pr)) { mtx_unlock(&pr->pr_mtx); sx_sunlock(&allprison_lock); return (EINVAL); } return (do_jail_attach(td, pr, PD_LOCKED | PD_LIST_SLOCKED)); } static int do_jail_attach(struct thread *td, struct prison *pr, int drflags) { struct proc *p; struct ucred *newcred, *oldcred; int error; mtx_assert(&pr->pr_mtx, MA_OWNED); sx_assert(&allprison_lock, SX_LOCKED); drflags &= PD_LOCK_FLAGS; /* * XXX: Note that there is a slight race here if two threads * in the same privileged process attempt to attach to two * different jails at the same time. It is important for * user processes not to do this, or they might end up with * a process root from one prison, but attached to the jail * of another. */ prison_hold(pr); refcount_acquire(&pr->pr_uref); drflags |= PD_DEREF | PD_DEUREF; mtx_unlock(&pr->pr_mtx); drflags &= ~PD_LOCKED; /* Let modules do whatever they need to prepare for attaching. */ error = osd_jail_call(pr, PR_METHOD_ATTACH, td); if (error) { prison_deref(pr, drflags); return (error); } sx_unlock(&allprison_lock); drflags &= ~(PD_LIST_SLOCKED | PD_LIST_XLOCKED); /* * Reparent the newly attached process to this jail. */ p = td->td_proc; error = cpuset_setproc_update_set(p, pr->pr_cpuset); if (error) goto e_revert_osd; vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY); if ((error = change_dir(pr->pr_root, td)) != 0) goto e_unlock; #ifdef MAC if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root))) goto e_unlock; #endif VOP_UNLOCK(pr->pr_root); if ((error = pwd_chroot_chdir(td, pr->pr_root))) goto e_revert_osd; newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); newcred->cr_prison = pr; proc_set_cred(p, newcred); setsugid(p); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); crhold(newcred); #endif PROC_UNLOCK(p); #ifdef RCTL rctl_proc_ucred_changed(p, newcred); crfree(newcred); #endif prison_deref(oldcred->cr_prison, drflags); crfree(oldcred); /* * If the prison was killed while changing credentials, die along * with it. */ if (!prison_isalive(pr)) { PROC_LOCK(p); kern_psignal(p, SIGKILL); PROC_UNLOCK(p); } return (0); e_unlock: VOP_UNLOCK(pr->pr_root); e_revert_osd: /* Tell modules this thread is still in its old jail after all. */ sx_slock(&allprison_lock); drflags |= PD_LIST_SLOCKED; (void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td); prison_deref(pr, drflags); return (error); } /* * Returns a locked prison instance, or NULL on failure. */ struct prison * prison_find(int prid) { struct prison *pr; sx_assert(&allprison_lock, SX_LOCKED); TAILQ_FOREACH(pr, &allprison, pr_list) { if (pr->pr_id < prid) continue; if (pr->pr_id > prid) break; KASSERT(prison_isvalid(pr), ("Found invalid prison %p", pr)); mtx_lock(&pr->pr_mtx); return (pr); } return (NULL); } /* * Find a prison that is a descendant of mypr. Returns a locked prison or NULL. */ struct prison * prison_find_child(struct prison *mypr, int prid) { struct prison *pr; int descend; sx_assert(&allprison_lock, SX_LOCKED); FOREACH_PRISON_DESCENDANT(mypr, pr, descend) { if (pr->pr_id == prid) { KASSERT(prison_isvalid(pr), ("Found invalid prison %p", pr)); mtx_lock(&pr->pr_mtx); return (pr); } } return (NULL); } /* * Look for the name relative to mypr. Returns a locked prison or NULL. */ struct prison * prison_find_name(struct prison *mypr, const char *name) { struct prison *pr, *deadpr; size_t mylen; int descend; sx_assert(&allprison_lock, SX_LOCKED); mylen = (mypr == &prison0) ? 
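/* prison0 has the empty name, so there is no "parent." prefix to skip. */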
0 : strlen(mypr->pr_name) + 1; deadpr = NULL; FOREACH_PRISON_DESCENDANT(mypr, pr, descend) { if (!strcmp(pr->pr_name + mylen, name)) { KASSERT(prison_isvalid(pr), ("Found invalid prison %p", pr)); if (prison_isalive(pr)) { mtx_lock(&pr->pr_mtx); return (pr); } deadpr = pr; } } /* There was no valid prison - perhaps there was a dying one. */ if (deadpr != NULL) mtx_lock(&deadpr->pr_mtx); return (deadpr); } /* * See if a prison has the specific flag set. The prison should be locked, * unless checking for flags that are only set at jail creation (such as * PR_IP4 and PR_IP6), or when only a single bit is examined, without regard * to any other prison data. */ int prison_flag(struct ucred *cred, unsigned flag) { return (cred->cr_prison->pr_flags & flag); } int prison_allow(struct ucred *cred, unsigned flag) { return ((cred->cr_prison->pr_allow & flag) != 0); } /* * Hold a prison reference, by incrementing pr_ref. It is generally * an error to hold a prison that does not already have a reference. * A prison record will remain valid as long as it has at least one * reference, and will not be removed as long as either the prison * mutex or the allprison lock is held (allprison_lock may be shared). */ void prison_hold_locked(struct prison *pr) { /* Locking is no longer required. */ prison_hold(pr); } void prison_hold(struct prison *pr) { #ifdef INVARIANTS int was_valid = refcount_acquire_if_not_zero(&pr->pr_ref); KASSERT(was_valid, ("Trying to hold dead prison %p (jid=%d).", pr, pr->pr_id)); #else refcount_acquire(&pr->pr_ref); #endif } /* * Remove a prison reference. If that was the last reference, the * prison will be removed (at a later time). */ void prison_free_locked(struct prison *pr) { mtx_assert(&pr->pr_mtx, MA_OWNED); /* * Locking is no longer required, but unlock because the caller * expects it. */ mtx_unlock(&pr->pr_mtx); prison_free(pr); } void prison_free(struct prison *pr) { KASSERT(refcount_load(&pr->pr_ref) > 0, ("Trying to free dead prison %p (jid=%d).", pr, pr->pr_id)); if (!refcount_release_if_not_last(&pr->pr_ref)) { /* * Don't remove the last reference in this context, * in case there are locks held. */ taskqueue_enqueue(taskqueue_thread, &pr->pr_task); } } static void prison_free_not_last(struct prison *pr) { #ifdef INVARIANTS int lastref; KASSERT(refcount_load(&pr->pr_ref) > 0, ("Trying to free dead prison %p (jid=%d).", pr, pr->pr_id)); lastref = refcount_release(&pr->pr_ref); KASSERT(!lastref, ("prison_free_not_last freed last ref on prison %p (jid=%d).", pr, pr->pr_id)); #else refcount_release(&pr->pr_ref); #endif } /* * Hold a prison for user visibility, by incrementing pr_uref. * It is generally an error to hold a prison that isn't already * user-visible, except through the jail system calls. It is also * an error to hold an invalid prison. A prison record will remain * alive as long as it has at least one user reference, and will not * be set to the dying state until the prison mutex and allprison_lock * are both freed. */ void prison_proc_hold(struct prison *pr) { #ifdef INVARIANTS int was_alive = refcount_acquire_if_not_zero(&pr->pr_uref); KASSERT(was_alive, ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id)); #else refcount_acquire(&pr->pr_uref); #endif } /* * Remove a prison user reference. If it was the last reference, the * prison will be considered "dying", and may be removed once all of * its references are dropped. */ void prison_proc_free(struct prison *pr) { /* * Locking is only required when releasing the last reference.
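* (Typically the last user reference goes away when the final process in a non-persistent jail exits; the task queued below then moves the prison to the dying state.)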
* This ensures that a locked prison will remain alive * until it is unlocked. */ KASSERT(refcount_load(&pr->pr_uref) > 0, ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id)); if (!refcount_release_if_not_last(&pr->pr_uref)) { /* * Don't remove the last user reference in this context, * which is expected to be a process that is not only locked, * but also half dead. Add a reference so any calls to * prison_free() won't re-submit the task. */ prison_hold(pr); mtx_lock(&pr->pr_mtx); KASSERT(!(pr->pr_flags & PR_COMPLETE_PROC), ("Redundant last reference in prison_proc_free (jid=%d)", pr->pr_id)); pr->pr_flags |= PR_COMPLETE_PROC; mtx_unlock(&pr->pr_mtx); taskqueue_enqueue(taskqueue_thread, &pr->pr_task); } } static void prison_proc_free_not_last(struct prison *pr) { #ifdef INVARIANTS int lastref; KASSERT(refcount_load(&pr->pr_uref) > 0, ("Trying to free dead prison %p (jid=%d).", pr, pr->pr_id)); lastref = refcount_release(&pr->pr_uref); KASSERT(!lastref, ("prison_proc_free_not_last freed last uref on prison %p (jid=%d).", pr, pr->pr_id)); #else refcount_release(&pr->pr_uref); #endif } /* * Complete a call to either prison_free or prison_proc_free. */ static void prison_complete(void *context, int pending) { struct prison *pr = context; int drflags; /* * This could be called to release the last reference, or the last * user reference (plus the reference held in prison_proc_free). */ drflags = prison_lock_xlock(pr, PD_DEREF); if (pr->pr_flags & PR_COMPLETE_PROC) { pr->pr_flags &= ~PR_COMPLETE_PROC; drflags |= PD_DEUREF; } prison_deref(pr, drflags); } /* * Remove a prison reference and/or user reference (usually). * This assumes context that allows sleeping (for allprison_lock), * with no non-sleeping locks held, except perhaps the prison itself. * If there are no more references, release and delist the prison. * On completion, the prison lock and the allprison lock are both * unlocked. */ static void prison_deref(struct prison *pr, int flags) { struct prisonlist freeprison; struct prison *killpr, *rpr, *ppr, *tpr; struct proc *p; killpr = NULL; TAILQ_INIT(&freeprison); /* * Release this prison as requested, which may cause its parent * to be released, and then maybe its grandparent, etc. */ for (;;) { if (flags & PD_KILL) { /* Kill the prison and its descendants. */ KASSERT(pr != &prison0, ("prison_deref trying to kill prison0")); if (!(flags & PD_DEREF)) { prison_hold(pr); flags |= PD_DEREF; } flags = prison_lock_xlock(pr, flags); prison_deref_kill(pr, &freeprison); } if (flags & PD_DEUREF) { /* Drop a user reference. */ KASSERT(refcount_load(&pr->pr_uref) > 0, ("prison_deref PD_DEUREF on a dead prison (jid=%d)", pr->pr_id)); if (!refcount_release_if_not_last(&pr->pr_uref)) { if (!(flags & PD_DEREF)) { prison_hold(pr); flags |= PD_DEREF; } flags = prison_lock_xlock(pr, flags); if (refcount_release(&pr->pr_uref) && pr->pr_state == PRISON_STATE_ALIVE) { /* * When the last user reference goes, * this becomes a dying prison. */ KASSERT( refcount_load(&prison0.pr_uref) > 0, ("prison0 pr_uref=0")); pr->pr_state = PRISON_STATE_DYING; mtx_unlock(&pr->pr_mtx); flags &= ~PD_LOCKED; (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); } } } if (flags & PD_KILL) { /* * Any remaining user references are probably processes * that need to be killed, either in this prison or its * descendants. */ if (refcount_load(&pr->pr_uref) > 0) killpr = pr; /* Make sure the parent prison doesn't get killed. */ flags &= ~PD_KILL; } if (flags & PD_DEREF) { /* Drop a reference.
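* (When the drop below unlinks a prison, the loop continues into its parent with PD_DEREF | PD_DEUREF set, releasing the references the child held on it.)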
*/ KASSERT(refcount_load(&pr->pr_ref) > 0, ("prison_deref PD_DEREF on a dead prison (jid=%d)", pr->pr_id)); if (!refcount_release_if_not_last(&pr->pr_ref)) { flags = prison_lock_xlock(pr, flags); if (refcount_release(&pr->pr_ref)) { /* * When the last reference goes, * unlink the prison and set it aside. */ KASSERT( refcount_load(&pr->pr_uref) == 0, ("prison_deref: last ref, " "but still has %d urefs (jid=%d)", pr->pr_uref, pr->pr_id)); KASSERT( refcount_load(&prison0.pr_ref) != 0, ("prison0 pr_ref=0")); pr->pr_state = PRISON_STATE_INVALID; TAILQ_REMOVE(&allprison, pr, pr_list); LIST_REMOVE(pr, pr_sibling); TAILQ_INSERT_TAIL(&freeprison, pr, pr_list); for (ppr = pr->pr_parent; ppr != NULL; ppr = ppr->pr_parent) ppr->pr_childcount--; /* * Removing a prison frees references * from its parent. */ mtx_unlock(&pr->pr_mtx); flags &= ~PD_LOCKED; pr = pr->pr_parent; flags |= PD_DEREF | PD_DEUREF; continue; } } } break; } /* Release all the prison locks. */ if (flags & PD_LOCKED) mtx_unlock(&pr->pr_mtx); if (flags & PD_LIST_SLOCKED) sx_sunlock(&allprison_lock); else if (flags & PD_LIST_XLOCKED) sx_xunlock(&allprison_lock); /* Kill any processes attached to a killed prison. */ if (killpr != NULL) { sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state != PRS_NEW && p->p_ucred != NULL) { for (ppr = p->p_ucred->cr_prison; ppr != &prison0; ppr = ppr->pr_parent) if (ppr == killpr) { kern_psignal(p, SIGKILL); break; } } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); } /* * Finish removing any unreferenced prisons, which couldn't happen * while allprison_lock was held (to avoid a LOR on vrele). */ TAILQ_FOREACH_SAFE(rpr, &freeprison, pr_list, tpr) { #ifdef VIMAGE if (rpr->pr_vnet != rpr->pr_parent->pr_vnet) vnet_destroy(rpr->pr_vnet); #endif if (rpr->pr_root != NULL) vrele(rpr->pr_root); mtx_destroy(&rpr->pr_mtx); #ifdef INET free(rpr->pr_ip4, M_PRISON); #endif #ifdef INET6 free(rpr->pr_ip6, M_PRISON); #endif if (rpr->pr_cpuset != NULL) cpuset_rel(rpr->pr_cpuset); osd_jail_exit(rpr); #ifdef RACCT if (racct_enable) prison_racct_detach(rpr); #endif TAILQ_REMOVE(&freeprison, rpr, pr_list); free(rpr, M_PRISON); } } /* * Kill the prison and its descendants. Mark them as dying, clear the * persist flag, and call module remove methods. */ static void prison_deref_kill(struct prison *pr, struct prisonlist *freeprison) { struct prison *cpr, *ppr, *rpr; bool descend; /* * Unlike the descendants, the target prison can be killed * even if it is currently dying. This is useful for failed * creation in jail_set(2). */ KASSERT(refcount_load(&pr->pr_ref) > 0, ("Trying to kill dead prison %p (jid=%d).", pr, pr->pr_id)); refcount_acquire(&pr->pr_uref); pr->pr_state = PRISON_STATE_DYING; mtx_unlock(&pr->pr_mtx); rpr = NULL; FOREACH_PRISON_DESCENDANT_PRE_POST(pr, cpr, descend) { if (descend) { if (!prison_isalive(cpr)) { descend = false; continue; } prison_hold(cpr); prison_proc_hold(cpr); mtx_lock(&cpr->pr_mtx); cpr->pr_state = PRISON_STATE_DYING; cpr->pr_flags |= PR_REMOVE; mtx_unlock(&cpr->pr_mtx); continue; } if (!(cpr->pr_flags & PR_REMOVE)) continue; (void)osd_jail_call(cpr, PR_METHOD_REMOVE, NULL); mtx_lock(&cpr->pr_mtx); cpr->pr_flags &= ~PR_REMOVE; if (cpr->pr_flags & PR_PERSIST) { cpr->pr_flags &= ~PR_PERSIST; prison_proc_free_not_last(cpr); prison_free_not_last(cpr); } (void)refcount_release(&cpr->pr_uref); if (refcount_release(&cpr->pr_ref)) { /* * When the last reference goes, unlink the prison * and set it aside for prison_deref() to handle. 
* Delay unlinking the sibling list to keep the loop * safe. */ if (rpr != NULL) LIST_REMOVE(rpr, pr_sibling); rpr = cpr; rpr->pr_state = PRISON_STATE_INVALID; TAILQ_REMOVE(&allprison, rpr, pr_list); TAILQ_INSERT_TAIL(freeprison, rpr, pr_list); /* * Removing a prison frees references from its parent. */ ppr = rpr->pr_parent; prison_proc_free_not_last(ppr); prison_free_not_last(ppr); for (; ppr != NULL; ppr = ppr->pr_parent) ppr->pr_childcount--; } mtx_unlock(&cpr->pr_mtx); } if (rpr != NULL) LIST_REMOVE(rpr, pr_sibling); (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); mtx_lock(&pr->pr_mtx); if (pr->pr_flags & PR_PERSIST) { pr->pr_flags &= ~PR_PERSIST; prison_proc_free_not_last(pr); prison_free_not_last(pr); } (void)refcount_release(&pr->pr_uref); } /* * Given the current locking state in the flags, make sure allprison_lock * is held exclusive, and the prison is locked. Return flags indicating * the new state. */ static int prison_lock_xlock(struct prison *pr, int flags) { if (!(flags & PD_LIST_XLOCKED)) { /* * Get allprison_lock, which may be an upgrade, * and may require unlocking the prison. */ if (flags & PD_LOCKED) { mtx_unlock(&pr->pr_mtx); flags &= ~PD_LOCKED; } if (flags & PD_LIST_SLOCKED) { if (!sx_try_upgrade(&allprison_lock)) { sx_sunlock(&allprison_lock); sx_xlock(&allprison_lock); } flags &= ~PD_LIST_SLOCKED; } else sx_xlock(&allprison_lock); flags |= PD_LIST_XLOCKED; } if (!(flags & PD_LOCKED)) { /* Lock the prison mutex. */ mtx_lock(&pr->pr_mtx); flags |= PD_LOCKED; } return flags; } /* * Set or clear a permission bit in the pr_allow field, passing restrictions * (cleared permission) down to child jails. */ void prison_set_allow(struct ucred *cred, unsigned flag, int enable) { struct prison *pr; pr = cred->cr_prison; sx_slock(&allprison_lock); mtx_lock(&pr->pr_mtx); prison_set_allow_locked(pr, flag, enable); mtx_unlock(&pr->pr_mtx); sx_sunlock(&allprison_lock); } static void prison_set_allow_locked(struct prison *pr, unsigned flag, int enable) { struct prison *cpr; int descend; if (enable != 0) pr->pr_allow |= flag; else { pr->pr_allow &= ~flag; FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend) cpr->pr_allow &= ~flag; } } /* * Check if a jail supports the given address family. * * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT * if not. */ int prison_check_af(struct ucred *cred, int af) { struct prison *pr; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); pr = cred->cr_prison; #ifdef VIMAGE /* Prisons with their own network stack are not limited. */ if (prison_owns_vnet(cred)) return (0); #endif error = 0; switch (af) { #ifdef INET case AF_INET: if (pr->pr_flags & PR_IP4) { mtx_lock(&pr->pr_mtx); if ((pr->pr_flags & PR_IP4) && pr->pr_ip4 == NULL) error = EAFNOSUPPORT; mtx_unlock(&pr->pr_mtx); } break; #endif #ifdef INET6 case AF_INET6: if (pr->pr_flags & PR_IP6) { mtx_lock(&pr->pr_mtx); if ((pr->pr_flags & PR_IP6) && pr->pr_ip6 == NULL) error = EAFNOSUPPORT; mtx_unlock(&pr->pr_mtx); } break; #endif case AF_LOCAL: case AF_ROUTE: break; default: if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF)) error = EAFNOSUPPORT; } return (error); } /* * Check if given address belongs to the jail referenced by cred (wrapper to * prison_check_ip[46]). * * Returns 0 if jail doesn't restrict the address family or if address belongs * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if * the jail doesn't allow the address family. IPv4 addresses are passed in NBO.
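* Used, for example, by the routing socket's interface-list sysctl to filter which interface addresses a jailed process is shown; a non-zero return hides the address.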
*/ int prison_if(struct ucred *cred, const struct sockaddr *sa) { #ifdef INET const struct sockaddr_in *sai; #endif #ifdef INET6 const struct sockaddr_in6 *sai6; #endif int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(sa != NULL, ("%s: sa is NULL", __func__)); #ifdef VIMAGE if (prison_owns_vnet(cred)) return (0); #endif error = 0; switch (sa->sa_family) { #ifdef INET case AF_INET: sai = (const struct sockaddr_in *)sa; error = prison_check_ip4(cred, &sai->sin_addr); break; #endif #ifdef INET6 case AF_INET6: sai6 = (const struct sockaddr_in6 *)sa; error = prison_check_ip6(cred, &sai6->sin6_addr); break; #endif default: if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF)) error = EAFNOSUPPORT; } return (error); } /* * Return 0 if jails permit p1 to frob p2, otherwise ESRCH. */ int prison_check(struct ucred *cred1, struct ucred *cred2) { return ((cred1->cr_prison == cred2->cr_prison || prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH); } /* * Return 1 if p2 is a child of p1, otherwise 0. */ int prison_ischild(struct prison *pr1, struct prison *pr2) { for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent) if (pr1 == pr2) return (1); return (0); } /* * Return true if the prison is currently alive. A prison is alive if it * holds user references and it isn't being removed. */ bool prison_isalive(struct prison *pr) { if (__predict_false(pr->pr_state != PRISON_STATE_ALIVE)) return (false); return (true); } /* * Return true if the prison is currently valid. A prison is valid if it has * been fully created, and is not being destroyed. Note that dying prisons * are still considered valid. Invalid prisons won't be found under normal * circumstances, as they're only put in that state by functions that have * an exclusive hold on allprison_lock. */ bool prison_isvalid(struct prison *pr) { if (__predict_false(pr->pr_state == PRISON_STATE_INVALID)) return (false); if (__predict_false(refcount_load(&pr->pr_ref) == 0)) return (false); return (true); } /* * Return 1 if the passed credential is in a jail and that jail does not * have its own virtual network stack, otherwise 0. */ int jailed_without_vnet(struct ucred *cred) { if (!jailed(cred)) return (0); #ifdef VIMAGE if (prison_owns_vnet(cred)) return (0); #endif return (1); } /* * Return the correct hostname (domainname, et al) for the passed credential. */ void getcredhostname(struct ucred *cred, char *buf, size_t size) { struct prison *pr; /* * A NULL credential can be used to shortcut to the physical * system's hostname. */ pr = (cred != NULL) ? 
cred->cr_prison : &prison0; mtx_lock(&pr->pr_mtx); strlcpy(buf, pr->pr_hostname, size); mtx_unlock(&pr->pr_mtx); } void getcreddomainname(struct ucred *cred, char *buf, size_t size) { mtx_lock(&cred->cr_prison->pr_mtx); strlcpy(buf, cred->cr_prison->pr_domainname, size); mtx_unlock(&cred->cr_prison->pr_mtx); } void getcredhostuuid(struct ucred *cred, char *buf, size_t size) { mtx_lock(&cred->cr_prison->pr_mtx); strlcpy(buf, cred->cr_prison->pr_hostuuid, size); mtx_unlock(&cred->cr_prison->pr_mtx); } void getcredhostid(struct ucred *cred, unsigned long *hostid) { mtx_lock(&cred->cr_prison->pr_mtx); *hostid = cred->cr_prison->pr_hostid; mtx_unlock(&cred->cr_prison->pr_mtx); } void getjailname(struct ucred *cred, char *name, size_t len) { mtx_lock(&cred->cr_prison->pr_mtx); strlcpy(name, cred->cr_prison->pr_name, len); mtx_unlock(&cred->cr_prison->pr_mtx); } #ifdef VIMAGE /* * Determine whether the prison represented by cred owns * its vnet rather than having it inherited. * * Returns 1 if the prison owns the vnet, 0 otherwise. */ int prison_owns_vnet(struct ucred *cred) { /* * vnets cannot be added/removed after jail creation, * so no need to lock here. */ return (cred->cr_prison->pr_flags & PR_VNET ? 1 : 0); } #endif /* * Determine whether the subject represented by cred can "see" the * status of a mount point. * Returns: 0 for permitted, ENOENT otherwise. * XXX: This function should be called cr_canseemount() and should be * placed in kern_prot.c. */ int prison_canseemount(struct ucred *cred, struct mount *mp) { struct prison *pr; struct statfs *sp; size_t len; pr = cred->cr_prison; if (pr->pr_enforce_statfs == 0) return (0); if (pr->pr_root->v_mount == mp) return (0); if (pr->pr_enforce_statfs == 2) return (ENOENT); /* * If the jail's chroot directory is set to "/", we should be able to * see all mount points from inside the jail. * This is an ugly check, but it is the only situation where the * jail's directory ends with '/'. */ if (strcmp(pr->pr_path, "/") == 0) return (0); len = strlen(pr->pr_path); sp = &mp->mnt_stat; if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0) return (ENOENT); /* * Be sure that we don't have a situation where the jail's root * directory is "/some/path" and the mount point is "/some/pathpath". */ if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/') return (ENOENT); return (0); } void prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp) { char jpath[MAXPATHLEN]; struct prison *pr; size_t len; pr = cred->cr_prison; if (pr->pr_enforce_statfs == 0) return; if (prison_canseemount(cred, mp) != 0) { bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); strlcpy(sp->f_mntonname, "[restricted]", sizeof(sp->f_mntonname)); return; } if (pr->pr_root->v_mount == mp) { /* * Clear current buffer data, so we are sure nothing from * the valid path is left there. */ bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); *sp->f_mntonname = '/'; return; } /* * If the jail's chroot directory is set to "/", we should be able to * see all mount points from inside the jail. */ if (strcmp(pr->pr_path, "/") == 0) return; len = strlen(pr->pr_path); strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath)); /* * Clear current buffer data, so we are sure nothing from * the valid path is left there. */ bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); if (*jpath == '\0') { /* Should never happen. */ *sp->f_mntonname = '/'; } else { strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname)); } } /* * Check whether a specific privilege is granted within jail.
We * have a specific list of accepted privileges; the rest are denied. */ int prison_priv_check(struct ucred *cred, int priv) { struct prison *pr; int error; /* * Some policies have custom handlers. This routine should not be * called for them. See priv_check_cred(). */ switch (priv) { case PRIV_VFS_LOOKUP: case PRIV_VFS_GENERATION: KASSERT(0, ("prison_priv_check instead of a custom handler " "called for %d\n", priv)); } if (!jailed(cred)) return (0); #ifdef VIMAGE /* * Privileges specific to prisons with a virtual network stack. * There might be a duplicate entry here in case the privilege * is only granted conditionally in the legacy jail case. */ switch (priv) { #ifdef notyet /* * NFS-specific privileges. */ case PRIV_NFS_DAEMON: case PRIV_NFS_LOCKD: #endif /* * Network stack privileges. */ case PRIV_NET_BRIDGE: case PRIV_NET_GRE: case PRIV_NET_BPF: case PRIV_NET_RAW: /* Dup, cond. in legacy jail case. */ case PRIV_NET_ROUTE: case PRIV_NET_TAP: case PRIV_NET_SETIFMTU: case PRIV_NET_SETIFFLAGS: case PRIV_NET_SETIFCAP: case PRIV_NET_SETIFDESCR: case PRIV_NET_SETIFNAME: case PRIV_NET_SETIFMETRIC: case PRIV_NET_SETIFPHYS: case PRIV_NET_SETIFMAC: case PRIV_NET_SETLANPCP: case PRIV_NET_ADDMULTI: case PRIV_NET_DELMULTI: case PRIV_NET_HWIOCTL: case PRIV_NET_SETLLADDR: case PRIV_NET_ADDIFGROUP: case PRIV_NET_DELIFGROUP: case PRIV_NET_IFCREATE: case PRIV_NET_IFDESTROY: case PRIV_NET_ADDIFADDR: case PRIV_NET_DELIFADDR: case PRIV_NET_LAGG: case PRIV_NET_GIF: case PRIV_NET_SETIFVNET: case PRIV_NET_SETIFFIB: - case PRIV_NET_WG: /* * 802.11-related privileges. */ case PRIV_NET80211_VAP_GETKEY: case PRIV_NET80211_VAP_MANAGE: #ifdef notyet /* * ATM privileges. */ case PRIV_NETATM_CFG: case PRIV_NETATM_ADD: case PRIV_NETATM_DEL: case PRIV_NETATM_SET: /* * Bluetooth privileges. */ case PRIV_NETBLUETOOTH_RAW: #endif /* * Netgraph and netgraph module privileges. */ case PRIV_NETGRAPH_CONTROL: #ifdef notyet case PRIV_NETGRAPH_TTY: #endif /* * IPv4 and IPv6 privileges. */ case PRIV_NETINET_IPFW: case PRIV_NETINET_DIVERT: case PRIV_NETINET_PF: case PRIV_NETINET_DUMMYNET: case PRIV_NETINET_CARP: case PRIV_NETINET_MROUTE: case PRIV_NETINET_RAW: case PRIV_NETINET_ADDRCTRL6: case PRIV_NETINET_ND6: case PRIV_NETINET_SCOPE6: case PRIV_NETINET_ALIFETIME6: case PRIV_NETINET_IPSEC: case PRIV_NETINET_BINDANY: #ifdef notyet /* * NCP privileges. */ case PRIV_NETNCP: /* * SMB privileges. */ case PRIV_NETSMB: #endif /* * No default: or deny here. * In case of no permit, fall through to the next switch(). */ if (cred->cr_prison->pr_flags & PR_VNET) return (0); } #endif /* VIMAGE */ switch (priv) { /* * Allow ktrace privileges for root in jail. */ case PRIV_KTRACE: #if 0 /* * Allow jailed processes to configure audit identity and * submit audit records (login, etc). In the future we may * want to further refine the relationship between audit and * jail. */ case PRIV_AUDIT_GETAUDIT: case PRIV_AUDIT_SETAUDIT: case PRIV_AUDIT_SUBMIT: #endif /* * Allow jailed processes to manipulate process UNIX * credentials in any way they see fit. */ case PRIV_CRED_SETUID: case PRIV_CRED_SETEUID: case PRIV_CRED_SETGID: case PRIV_CRED_SETEGID: case PRIV_CRED_SETGROUPS: case PRIV_CRED_SETREUID: case PRIV_CRED_SETREGID: case PRIV_CRED_SETRESUID: case PRIV_CRED_SETRESGID: /* * Jail implements visibility constraints already, so allow * jailed root to override uid/gid-based constraints. */ case PRIV_SEEOTHERGIDS: case PRIV_SEEOTHERUIDS: /* * Jail implements inter-process debugging limits already, so * allow jailed root various debugging privileges.
*/ case PRIV_DEBUG_DIFFCRED: case PRIV_DEBUG_SUGID: case PRIV_DEBUG_UNPRIV: /* * Allow jail to set various resource limits and login * properties, and for now, exceed process resource limits. */ case PRIV_PROC_LIMIT: case PRIV_PROC_SETLOGIN: case PRIV_PROC_SETRLIMIT: /* * System V and POSIX IPC privileges are granted in jail. */ case PRIV_IPC_READ: case PRIV_IPC_WRITE: case PRIV_IPC_ADMIN: case PRIV_IPC_MSGSIZE: case PRIV_MQ_ADMIN: /* * Jail operations within a jail work on child jails. */ case PRIV_JAIL_ATTACH: case PRIV_JAIL_SET: case PRIV_JAIL_REMOVE: /* * Jail implements its own inter-process limits, so allow * root processes in jail to change scheduling on other * processes in the same jail. Likewise for signalling. */ case PRIV_SCHED_DIFFCRED: case PRIV_SCHED_CPUSET: case PRIV_SIGNAL_DIFFCRED: case PRIV_SIGNAL_SUGID: /* * Allow jailed processes to write to sysctls marked as jail * writable. */ case PRIV_SYSCTL_WRITEJAIL: /* * Allow root in jail to manage a variety of quota * properties. These should likely be conditional on a * configuration option. */ case PRIV_VFS_GETQUOTA: case PRIV_VFS_SETQUOTA: /* * Since jail relies on chroot() to implement file system * protections, grant many VFS privileges to root in jail. * Be careful to exclude mount-related and NFS-related * privileges. */ case PRIV_VFS_READ: case PRIV_VFS_WRITE: case PRIV_VFS_ADMIN: case PRIV_VFS_EXEC: case PRIV_VFS_BLOCKRESERVE: /* XXXRW: Slightly surprising. */ case PRIV_VFS_CHFLAGS_DEV: case PRIV_VFS_CHOWN: case PRIV_VFS_CHROOT: case PRIV_VFS_RETAINSUGID: case PRIV_VFS_FCHROOT: case PRIV_VFS_LINK: case PRIV_VFS_SETGID: case PRIV_VFS_STAT: case PRIV_VFS_STICKYFILE: /* * As in the non-jail case, non-root users are expected to be * able to read kernel/physical memory (provided /dev/[k]mem * exists in the jail and they have permission to access it). */ case PRIV_KMEM_READ: return (0); /* * Depending on the global setting, allow the privilege of * setting system flags. */ case PRIV_VFS_SYSFLAGS: if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS) return (0); else return (EPERM); /* * Depending on the global setting, allow the privilege of * mounting/unmounting file systems. */ case PRIV_VFS_MOUNT: case PRIV_VFS_UNMOUNT: case PRIV_VFS_MOUNT_NONUSER: case PRIV_VFS_MOUNT_OWNER: pr = cred->cr_prison; prison_lock(pr); if (pr->pr_allow & PR_ALLOW_MOUNT && pr->pr_enforce_statfs < 2) error = 0; else error = EPERM; prison_unlock(pr); return (error); /* * Jails should hold no disposition on the PRIV_VFS_READ_DIR * policy. priv_check_cred will not specifically allow it, and * we may want a MAC policy to allow it. */ case PRIV_VFS_READ_DIR: return (0); /* * Conditionally allow locking (unlocking) physical pages * in memory. */ case PRIV_VM_MLOCK: case PRIV_VM_MUNLOCK: if (cred->cr_prison->pr_allow & PR_ALLOW_MLOCK) return (0); else return (EPERM); /* * Conditionally allow jailed root to bind reserved ports. */ case PRIV_NETINET_RESERVEDPORT: if (cred->cr_prison->pr_allow & PR_ALLOW_RESERVED_PORTS) return (0); else return (EPERM); /* * Allow jailed root to reuse in-use ports. */ case PRIV_NETINET_REUSEPORT: return (0); /* * Allow jailed root to set certain IPv4/6 (option) headers. */ case PRIV_NETINET_SETHDROPTS: return (0); /* * Conditionally allow creating raw sockets in jail. */ case PRIV_NETINET_RAW: if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS) return (0); else return (EPERM); /* * Since jail implements its own visibility limits on netstat * sysctls, allow getcred. This allows identd to work in * jail.
*/ case PRIV_NETINET_GETCRED: return (0); /* * Allow jailed root to set loginclass. */ case PRIV_PROC_SETLOGINCLASS: return (0); /* * Do not allow a process inside a jail to read the kernel * message buffer unless explicitly permitted. */ case PRIV_MSGBUF: if (cred->cr_prison->pr_allow & PR_ALLOW_READ_MSGBUF) return (0); return (EPERM); default: /* * In all remaining cases, deny the privilege request. This * includes almost all network privileges and many system * configuration privileges. */ return (EPERM); } } /* * Return the part of pr2's name that is relative to pr1, or the whole name * if it does not directly follow. */ char * prison_name(struct prison *pr1, struct prison *pr2) { char *name; /* Jails see themselves as "0" (if they see themselves at all). */ if (pr1 == pr2) return "0"; name = pr2->pr_name; if (prison_ischild(pr1, pr2)) { /* * pr1 isn't locked (and allprison_lock may not be either) * so its length can't be counted on. But the number of dots * can be counted on - and counted. */ for (; pr1 != &prison0; pr1 = pr1->pr_parent) name = strchr(name, '.') + 1; } return (name); } /* * Return the part of pr2's path that is relative to pr1, or the whole path * if it does not directly follow. */ static char * prison_path(struct prison *pr1, struct prison *pr2) { char *path1, *path2; int len1; path1 = pr1->pr_path; path2 = pr2->pr_path; if (!strcmp(path1, "/")) return (path2); len1 = strlen(path1); if (strncmp(path1, path2, len1)) return (path2); if (path2[len1] == '\0') return "/"; if (path2[len1] == '/') return (path2 + len1); return (path2); } /* * Jail-related sysctls. */ static SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Jails"); static int sysctl_jail_list(SYSCTL_HANDLER_ARGS) { struct xprison *xp; struct prison *pr, *cpr; #ifdef INET struct in_addr *ip4 = NULL; int ip4s = 0; #endif #ifdef INET6 struct in6_addr *ip6 = NULL; int ip6s = 0; #endif int descend, error; xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK); pr = req->td->td_ucred->cr_prison; error = 0; sx_slock(&allprison_lock); FOREACH_PRISON_DESCENDANT(pr, cpr, descend) { #if defined(INET) || defined(INET6) again: #endif mtx_lock(&cpr->pr_mtx); #ifdef INET if (cpr->pr_ip4s > 0) { if (ip4s < cpr->pr_ip4s) { ip4s = cpr->pr_ip4s; mtx_unlock(&cpr->pr_mtx); ip4 = realloc(ip4, ip4s * sizeof(struct in_addr), M_TEMP, M_WAITOK); goto again; } bcopy(cpr->pr_ip4, ip4, cpr->pr_ip4s * sizeof(struct in_addr)); } #endif #ifdef INET6 if (cpr->pr_ip6s > 0) { if (ip6s < cpr->pr_ip6s) { ip6s = cpr->pr_ip6s; mtx_unlock(&cpr->pr_mtx); ip6 = realloc(ip6, ip6s * sizeof(struct in6_addr), M_TEMP, M_WAITOK); goto again; } bcopy(cpr->pr_ip6, ip6, cpr->pr_ip6s * sizeof(struct in6_addr)); } #endif bzero(xp, sizeof(*xp)); xp->pr_version = XPRISON_VERSION; xp->pr_id = cpr->pr_id; xp->pr_state = cpr->pr_state; strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path)); strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host)); strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name)); #ifdef INET xp->pr_ip4s = cpr->pr_ip4s; #endif #ifdef INET6 xp->pr_ip6s = cpr->pr_ip6s; #endif mtx_unlock(&cpr->pr_mtx); error = SYSCTL_OUT(req, xp, sizeof(*xp)); if (error) break; #ifdef INET if (xp->pr_ip4s > 0) { error = SYSCTL_OUT(req, ip4, xp->pr_ip4s * sizeof(struct in_addr)); if (error) break; } #endif #ifdef INET6 if (xp->pr_ip6s > 0) { error = SYSCTL_OUT(req, ip6, xp->pr_ip6s * sizeof(struct in6_addr)); if (error) break; } #endif } sx_sunlock(&allprison_lock); free(xp, M_TEMP); #ifdef INET free(ip4, M_TEMP); #endif #ifdef INET6
free(ip6, M_TEMP); #endif return (error); } SYSCTL_OID(_security_jail, OID_AUTO, list, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_list, "S", "List of active jails"); static int sysctl_jail_jailed(SYSCTL_HANDLER_ARGS) { int error, injail; injail = jailed(req->td->td_ucred); error = SYSCTL_OUT(req, &injail, sizeof(injail)); return (error); } SYSCTL_PROC(_security_jail, OID_AUTO, jailed, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_jailed, "I", "Process in jail?"); static int sysctl_jail_vnet(SYSCTL_HANDLER_ARGS) { int error, havevnet; #ifdef VIMAGE struct ucred *cred = req->td->td_ucred; havevnet = jailed(cred) && prison_owns_vnet(cred); #else havevnet = 0; #endif error = SYSCTL_OUT(req, &havevnet, sizeof(havevnet)); return (error); } SYSCTL_PROC(_security_jail, OID_AUTO, vnet, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_vnet, "I", "Jail owns vnet?"); #if defined(INET) || defined(INET6) SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW, &jail_max_af_ips, 0, "Number of IP addresses a jail may have at most per address family (deprecated)"); #endif /* * Default parameters for jail(2) compatibility. For historical reasons, * the sysctl names have varying similarity to the parameter names. Prisons * just see their own parameters, and can't change them. */ static int sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS) { int error, i; /* Get the current flag value, and convert it to a boolean. */ if (req->td->td_ucred->cr_prison == &prison0) { mtx_lock(&prison0.pr_mtx); i = (jail_default_allow & arg2) != 0; mtx_unlock(&prison0.pr_mtx); } else i = prison_allow(req->td->td_ucred, arg2); if (arg1 != NULL) i = !i; error = sysctl_handle_int(oidp, &i, 0, req); if (error || !req->newptr) return (error); i = i ? arg2 : 0; if (arg1 != NULL) i ^= arg2; /* * The sysctls don't have CTLFLAGS_PRISON, so assume prison0 * for writing. 
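 *
 * For illustration (assumed typical usage, not part of this change):
 * the deprecated globals backed by this handler are flipped from
 * userland with sysctl(8), e.g.
 *
 *	sysctl security.jail.chflags_allowed=1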
*/ mtx_lock(&prison0.pr_mtx); jail_default_allow = (jail_default_allow & ~arg2) | i; mtx_unlock(&prison0.pr_mtx); return (0); } SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I", "Processes in jail can set their hostnames (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I", "Processes in jail are limited to creating UNIX/IP/route sockets only (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I", "Processes in jail can use System V IPC primitives (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I", "Prison root can create raw sockets (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I", "Processes in jail can alter system file flags (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I", "Processes in jail can mount/unmount jail-friendly file systems (deprecated)"); static int sysctl_jail_default_level(SYSCTL_HANDLER_ARGS) { struct prison *pr; int level, error; pr = req->td->td_ucred->cr_prison; level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2); error = sysctl_handle_int(oidp, &level, 0, req); if (error || !req->newptr) return (error); *(int *)arg1 = level; return (0); } SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs), sysctl_jail_default_level, "I", "Processes in jail cannot see all mounted file systems (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum), sysctl_jail_default_level, "I", "Ruleset for the devfs filesystem in jail (deprecated)"); /* * Nodes to describe jail parameters. Maximum length of string parameters * is returned in the string itself, and the other parameters exist merely * to make themselves and their types known. */ SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Jail parameters"); int sysctl_jail_param(SYSCTL_HANDLER_ARGS) { int i; long l; size_t s; char numbuf[12]; switch (oidp->oid_kind & CTLTYPE) { case CTLTYPE_LONG: case CTLTYPE_ULONG: l = 0; #ifdef SCTL_MASK32 if (!(req->flags & SCTL_MASK32)) #endif return (SYSCTL_OUT(req, &l, sizeof(l))); case CTLTYPE_INT: case CTLTYPE_UINT: i = 0; return (SYSCTL_OUT(req, &i, sizeof(i))); case CTLTYPE_STRING: snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2); return (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req)); case CTLTYPE_STRUCT: s = (size_t)arg2; return (SYSCTL_OUT(req, &s, sizeof(s))); } return (0); } /* * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at * jail creation time but cannot be changed in an existing jail. 
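 *
 * For example (illustrative only): "path" below is CTLFLAG_RDTUN, so
 * it can be supplied when a jail is created but not updated afterwards:
 *
 *	jail -c name=test path=/jails/test persist
 *	jail -m name=test path=/elsewhere	<- expected to be rejected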
*/ SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID"); SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID"); SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name"); SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path"); SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW, "I", "Jail secure level"); SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail value for kern.osreldate and uname -K"); SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN, "Jail value for kern.osrelease and uname -r"); SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW, "I", "Jail cannot see all mounted file systems"); SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW, "I", "Ruleset for in-jail devfs mounts"); SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail persistence"); #ifdef VIMAGE SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN, "E,jailsys", "Virtual network stack"); #endif SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD, "B", "Jail is in the process of shutting down"); SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails"); SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD, "I", "Current number of child jails"); SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW, "I", "Maximum number of child jails"); SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info"); SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail hostname"); SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail NIS domainname"); SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN, "Jail host UUID"); SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW, "LU", "Jail host ID"); SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset"); SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID"); #ifdef INET SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN, "Jail IPv4 address virtualization"); SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr), "S,in_addr,a", "Jail IPv4 addresses"); SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW, "B", "Do (not) use IPv4 source address selection rather than the " "primary jail IPv4 address."); #endif #ifdef INET6 SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN, "Jail IPv6 address virtualization"); SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr), "S,in6_addr,a", "Jail IPv6 addresses"); SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW, "B", "Do (not) use IPv6 source address selection rather than the " "primary jail IPv6 address."); #endif SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags"); SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may set hostname"); SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may use SYSV IPC"); SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may create raw sockets"); SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may alter system file flags"); SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may set file quotas"); SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route"); SYSCTL_JAIL_PARAM(_allow, mlock, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may lock (unlock) physical pages in memory"); SYSCTL_JAIL_PARAM(_allow, reserved_ports, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail 
may bind sockets to reserved ports"); SYSCTL_JAIL_PARAM(_allow, read_msgbuf, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may read the kernel message buffer"); SYSCTL_JAIL_PARAM(_allow, unprivileged_proc_debug, CTLTYPE_INT | CTLFLAG_RW, "B", "Unprivileged processes may use process debugging facilities"); SYSCTL_JAIL_PARAM(_allow, suser, CTLTYPE_INT | CTLFLAG_RW, "B", "Processes in jail with uid 0 have privilege"); SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags"); SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount/unmount jail-friendly file systems in general"); /* * Add a dynamic parameter allow.<name>, or allow.<prefix>.<name>. Return * its associated bit in the pr_allow bitmask, or zero if the parameter was * not created. */ unsigned prison_add_allow(const char *prefix, const char *name, const char *prefix_descr, const char *descr) { struct bool_flags *bf; struct sysctl_oid *parent; char *allow_name, *allow_noname, *allowed; #ifndef NO_SYSCTL_DESCR char *descr_deprecated; #endif u_int allow_flag; if (prefix ? asprintf(&allow_name, M_PRISON, "allow.%s.%s", prefix, name) < 0 || asprintf(&allow_noname, M_PRISON, "allow.%s.no%s", prefix, name) < 0 : asprintf(&allow_name, M_PRISON, "allow.%s", name) < 0 || asprintf(&allow_noname, M_PRISON, "allow.no%s", name) < 0) { free(allow_name, M_PRISON); return 0; } /* * See if this parameter has already been added, i.e. a module was * previously loaded/unloaded. */ mtx_lock(&prison0.pr_mtx); for (bf = pr_flag_allow; bf < pr_flag_allow + nitems(pr_flag_allow) && atomic_load_int(&bf->flag) != 0; bf++) { if (strcmp(bf->name, allow_name) == 0) { allow_flag = bf->flag; goto no_add; } } /* * Find a free bit in pr_allow_all, failing if there are none * (which shouldn't happen as long as we keep track of how many * potential dynamic flags exist). */ for (allow_flag = 1;; allow_flag <<= 1) { if (allow_flag == 0) goto no_add; if ((pr_allow_all & allow_flag) == 0) break; } /* Note the parameter in the next open slot in pr_flag_allow. */ for (bf = pr_flag_allow; ; bf++) { if (bf == pr_flag_allow + nitems(pr_flag_allow)) { /* This should never happen, but is not fatal. */ allow_flag = 0; goto no_add; } if (atomic_load_int(&bf->flag) == 0) break; } bf->name = allow_name; bf->noname = allow_noname; pr_allow_all |= allow_flag; /* * prison0 always has permission for the new parameter. * Other jails must have it granted to them. */ prison0.pr_allow |= allow_flag; /* The flag indicates a valid entry, so make sure it is set last. */ atomic_store_rel_int(&bf->flag, allow_flag); mtx_unlock(&prison0.pr_mtx); /* * Create sysctls for the parameter, and the back-compat global * permission. */ parent = prefix ? SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(&sysctl___security_jail_param_allow), OID_AUTO, prefix, CTLFLAG_MPSAFE, 0, prefix_descr) : &sysctl___security_jail_param_allow; (void)SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(parent), OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_param, "B", descr); if ((prefix ?
asprintf(&allowed, M_TEMP, "%s_%s_allowed", prefix, name) : asprintf(&allowed, M_TEMP, "%s_allowed", name)) >= 0) { #ifndef NO_SYSCTL_DESCR (void)asprintf(&descr_deprecated, M_TEMP, "%s (deprecated)", descr); #endif (void)SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(&sysctl___security_jail), OID_AUTO, allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, allow_flag, sysctl_jail_default_allow, "I", descr_deprecated); #ifndef NO_SYSCTL_DESCR free(descr_deprecated, M_TEMP); #endif free(allowed, M_TEMP); } return allow_flag; no_add: mtx_unlock(&prison0.pr_mtx); free(allow_name, M_PRISON); free(allow_noname, M_PRISON); return allow_flag; } /* * The VFS system will register jail-aware filesystems here. They each get * a parameter allow.mount.xxxfs and a flag to check when a jailed user * attempts to mount. */ void prison_add_vfs(struct vfsconf *vfsp) { #ifdef NO_SYSCTL_DESCR vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name, NULL, NULL); #else char *descr; (void)asprintf(&descr, M_TEMP, "Jail may mount the %s file system", vfsp->vfc_name); vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name, NULL, descr); free(descr, M_TEMP); #endif } #ifdef RACCT void prison_racct_foreach(void (*callback)(struct racct *racct, void *arg2, void *arg3), void (*pre)(void), void (*post)(void), void *arg2, void *arg3) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_slock(&allprison_lock); if (pre != NULL) (pre)(); LIST_FOREACH(prr, &allprison_racct, prr_next) (callback)(prr->prr_racct, arg2, arg3); if (post != NULL) (post)(); sx_sunlock(&allprison_lock); } static struct prison_racct * prison_racct_find_locked(const char *name) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN) return (NULL); LIST_FOREACH(prr, &allprison_racct, prr_next) { if (strcmp(name, prr->prr_name) != 0) continue; /* Found prison_racct with a matching name? */ prison_racct_hold(prr); return (prr); } /* Add new prison_racct. */ prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK); racct_create(&prr->prr_racct); strcpy(prr->prr_name, name); refcount_init(&prr->prr_refcount, 1); LIST_INSERT_HEAD(&allprison_racct, prr, prr_next); return (prr); } struct prison_racct * prison_racct_find(const char *name) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_xlock(&allprison_lock); prr = prison_racct_find_locked(name); sx_xunlock(&allprison_lock); return (prr); } void prison_racct_hold(struct prison_racct *prr) { ASSERT_RACCT_ENABLED(); refcount_acquire(&prr->prr_refcount); } static void prison_racct_free_locked(struct prison_racct *prr) { ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); if (refcount_release(&prr->prr_refcount)) { racct_destroy(&prr->prr_racct); LIST_REMOVE(prr, prr_next); free(prr, M_PRISON_RACCT); } } void prison_racct_free(struct prison_racct *prr) { ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_UNLOCKED); if (refcount_release_if_not_last(&prr->prr_refcount)) return; sx_xlock(&allprison_lock); prison_racct_free_locked(prr); sx_xunlock(&allprison_lock); } static void prison_racct_attach(struct prison *pr) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); prr = prison_racct_find_locked(pr->pr_name); KASSERT(prr != NULL, ("cannot find prison_racct")); pr->pr_prison_racct = prr; } /* * Handle jail renaming. From the racct point of view, renaming means * moving from one prison_racct to another. 
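 *
 * (Illustrative note: the racct_move() call below transfers the
 * accumulated usage from the old prison_racct to the new one, so
 * accounting totals survive the rename.)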
*/ static void prison_racct_modify(struct prison *pr) { #ifdef RCTL struct proc *p; struct ucred *cred; #endif struct prison_racct *oldprr; ASSERT_RACCT_ENABLED(); sx_slock(&allproc_lock); sx_xlock(&allprison_lock); if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) { sx_xunlock(&allprison_lock); sx_sunlock(&allproc_lock); return; } oldprr = pr->pr_prison_racct; pr->pr_prison_racct = NULL; prison_racct_attach(pr); /* * Move resource utilisation records. */ racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct); #ifdef RCTL /* * Force rctl to reattach rules to processes. */ FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); cred = crhold(p->p_ucred); PROC_UNLOCK(p); rctl_proc_ucred_changed(p, cred); crfree(cred); } #endif sx_sunlock(&allproc_lock); prison_racct_free_locked(oldprr); sx_xunlock(&allprison_lock); } static void prison_racct_detach(struct prison *pr) { ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_UNLOCKED); if (pr->pr_prison_racct == NULL) return; prison_racct_free(pr->pr_prison_racct); pr->pr_prison_racct = NULL; } #endif /* RACCT */ #ifdef DDB static void db_show_prison(struct prison *pr) { struct bool_flags *bf; struct jailsys_flags *jsf; #if defined(INET) || defined(INET6) int ii; #endif unsigned f; #ifdef INET char ip4buf[INET_ADDRSTRLEN]; #endif #ifdef INET6 char ip6buf[INET6_ADDRSTRLEN]; #endif db_printf("prison %p:\n", pr); db_printf(" jid = %d\n", pr->pr_id); db_printf(" name = %s\n", pr->pr_name); db_printf(" parent = %p\n", pr->pr_parent); db_printf(" ref = %d\n", pr->pr_ref); db_printf(" uref = %d\n", pr->pr_uref); db_printf(" state = %s\n", pr->pr_state == PRISON_STATE_ALIVE ? "alive" : pr->pr_state == PRISON_STATE_DYING ? "dying" : "invalid"); db_printf(" path = %s\n", pr->pr_path); db_printf(" cpuset = %d\n", pr->pr_cpuset ? pr->pr_cpuset->cs_id : -1); #ifdef VIMAGE db_printf(" vnet = %p\n", pr->pr_vnet); #endif db_printf(" root = %p\n", pr->pr_root); db_printf(" securelevel = %d\n", pr->pr_securelevel); db_printf(" devfs_rsnum = %d\n", pr->pr_devfs_rsnum); db_printf(" children.max = %d\n", pr->pr_childmax); db_printf(" children.cur = %d\n", pr->pr_childcount); db_printf(" child = %p\n", LIST_FIRST(&pr->pr_children)); db_printf(" sibling = %p\n", LIST_NEXT(pr, pr_sibling)); db_printf(" flags = 0x%x", pr->pr_flags); for (bf = pr_flag_bool; bf < pr_flag_bool + nitems(pr_flag_bool); bf++) if (pr->pr_flags & bf->flag) db_printf(" %s", bf->name); for (jsf = pr_flag_jailsys; jsf < pr_flag_jailsys + nitems(pr_flag_jailsys); jsf++) { f = pr->pr_flags & (jsf->disable | jsf->new); db_printf(" %-16s= %s\n", jsf->name, (f != 0 && f == jsf->disable) ? "disable" : (f == jsf->new) ? "new" : "inherit"); } db_printf(" allow = 0x%x", pr->pr_allow); for (bf = pr_flag_allow; bf < pr_flag_allow + nitems(pr_flag_allow) && atomic_load_int(&bf->flag) != 0; bf++) if (pr->pr_allow & bf->flag) db_printf(" %s", bf->name); db_printf("\n"); db_printf(" enforce_statfs = %d\n", pr->pr_enforce_statfs); db_printf(" host.hostname = %s\n", pr->pr_hostname); db_printf(" host.domainname = %s\n", pr->pr_domainname); db_printf(" host.hostuuid = %s\n", pr->pr_hostuuid); db_printf(" host.hostid = %lu\n", pr->pr_hostid); #ifdef INET db_printf(" ip4s = %d\n", pr->pr_ip4s); for (ii = 0; ii < pr->pr_ip4s; ii++) db_printf(" %s %s\n", ii == 0 ? "ip4.addr =" : " ", inet_ntoa_r(pr->pr_ip4[ii], ip4buf)); #endif #ifdef INET6 db_printf(" ip6s = %d\n", pr->pr_ip6s); for (ii = 0; ii < pr->pr_ip6s; ii++) db_printf(" %s %s\n", ii == 0 ? 
"ip6.addr =" : " ", ip6_sprintf(ip6buf, &pr->pr_ip6[ii])); #endif } DB_SHOW_COMMAND(prison, db_show_prison_command) { struct prison *pr; if (!have_addr) { /* * Show all prisons in the list, and prison0 which is not * listed. */ db_show_prison(&prison0); if (!db_pager_quit) { TAILQ_FOREACH(pr, &allprison, pr_list) { db_show_prison(pr); if (db_pager_quit) break; } } return; } if (addr == 0) pr = &prison0; else { /* Look for a prison with the ID and with references. */ TAILQ_FOREACH(pr, &allprison, pr_list) if (pr->pr_id == addr && pr->pr_ref > 0) break; if (pr == NULL) /* Look again, without requiring a reference. */ TAILQ_FOREACH(pr, &allprison, pr_list) if (pr->pr_id == addr) break; if (pr == NULL) /* Assume address points to a valid prison. */ pr = (struct prison *)addr; } db_show_prison(pr); } #endif /* DDB */ diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index eb748928cd91..7f06b51cf096 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -1,4426 +1,4415 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. * Copyright (c) 2004 The FreeBSD Foundation * Copyright (c) 2004-2008 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 */ /* * Comments on the socket life cycle: * * soalloc() sets of socket layer state for a socket, called only by * socreate() and sonewconn(). Socket layer private. * * sodealloc() tears down socket layer state for a socket, called only by * sofree() and sonewconn(). Socket layer private. * * pru_attach() associates protocol layer state with an allocated socket; * called only once, may fail, aborting socket allocation. This is called * from socreate() and sonewconn(). Socket layer private. * * pru_detach() disassociates protocol layer state from an attached socket, * and will be called exactly once for sockets in which pru_attach() has * been successfully called. If pru_attach() returned an error, * pru_detach() will not be called. Socket layer private. 
* * pru_abort() and pru_close() notify the protocol layer that the last * consumer of a socket is starting to tear down the socket, and that the * protocol should terminate the connection. Historically, pru_abort() also * detached protocol state from the socket state, but this is no longer the * case. * * socreate() creates a socket and attaches protocol state. This is a public * interface that may be used by socket layer consumers to create new * sockets. * * sonewconn() creates a socket and attaches protocol state. This is a * public interface that may be used by protocols to create new sockets when * a new connection is received and will be available for accept() on a * listen socket. * * soclose() destroys a socket after possibly waiting for it to disconnect. * This is a public interface that socket consumers should use to close and * release a socket when done with it. * * soabort() destroys a socket without waiting for it to disconnect (used * only for incoming connections that are already partially or fully * connected). This is used internally by the socket layer when clearing * listen socket queues (due to overflow or close on the listen socket), but * is also a public interface protocols may use to abort connections in * their incomplete listen queues should they no longer be required. Sockets * placed in completed connection listen queues should not be aborted for * reasons described in the comment above the soclose() implementation. This * is not a general purpose close routine, and except in the specific * circumstances described here, should not be used. * * sofree() will free a socket and its protocol state if all references on * the socket have been released, and is the public interface to attempt to * free a socket when a reference is removed. This is a socket layer private * interface. * * NOTE: In addition to socreate() and soclose(), which provide a single * socket reference to the consumer to be managed as required, there are two * calls to explicitly manage socket references, soref(), and sorele(). * Currently, these are generally required only when transitioning a socket * from a listen queue to a file descriptor, in order to prevent garbage * collection of the socket at an untimely moment. For a number of reasons, * these interfaces are not preferred, and should be avoided. * * NOTE: With regard to VNETs the general rule is that callers do not set * curvnet. Exceptions to this rule include soabort(), sodisconnect(), * sofree() (and with that sorele(), sotryfree()), as well as sonewconn() * and sorflush(), which are usually called from a pre-set VNET context. * sopoll() currently does not need a VNET context to be set. 
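 *
 * Illustrative consumer-side usage of the public interfaces above (a
 * sketch only; error handling elided):
 *
 *	struct socket *so;
 *	error = socreate(AF_INET, &so, SOCK_STREAM, 0, cred, td);
 *	if (error == 0)
 *		error = soclose(so);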
*/ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_kern_tls.h" #include "opt_sctp.h" #include #include #include #include #include #include #include #include #include #include #include /* for struct knote */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_FREEBSD32 #include #include #include #endif static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags); static void so_rdknl_lock(void *); static void so_rdknl_unlock(void *); static void so_rdknl_assert_lock(void *, int); static void so_wrknl_lock(void *); static void so_wrknl_unlock(void *); static void so_wrknl_assert_lock(void *, int); static void filt_sordetach(struct knote *kn); static int filt_soread(struct knote *kn, long hint); static void filt_sowdetach(struct knote *kn); static int filt_sowrite(struct knote *kn, long hint); static int filt_soempty(struct knote *kn, long hint); static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id); fo_kqfilter_t soo_kqfilter; static struct filterops soread_filtops = { .f_isfd = 1, .f_detach = filt_sordetach, .f_event = filt_soread, }; static struct filterops sowrite_filtops = { .f_isfd = 1, .f_detach = filt_sowdetach, .f_event = filt_sowrite, }; static struct filterops soempty_filtops = { .f_isfd = 1, .f_detach = filt_sowdetach, .f_event = filt_soempty, }; so_gen_t so_gencnt; /* generation count for sockets */ MALLOC_DEFINE(M_SONAME, "soname", "socket name"); MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); #define VNET_SO_ASSERT(so) \ VNET_ASSERT(curvnet != NULL, \ ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so))); VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]); #define V_socket_hhh VNET(socket_hhh) /* * Limit on the number of connections in the listen queue waiting * for accept(2). * NB: The original sysctl somaxconn is still available but hidden * to prevent confusion about the actual purpose of this number. */ static u_int somaxconn = SOMAXCONN; static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS) { int error; int val; val = somaxconn; error = sysctl_handle_int(oidp, &val, 0, req); if (error || !req->newptr ) return (error); /* * The purpose of the UINT_MAX / 3 limit, is so that the formula * 3 * so_qlimit / 2 * below, will not overflow. */ if (val < 1 || val > UINT_MAX / 3) return (EINVAL); somaxconn = val; return (0); } SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 0, sizeof(int), sysctl_somaxconn, "I", "Maximum listen socket pending connection accept queue size"); SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_NEEDGIANT, 0, sizeof(int), sysctl_somaxconn, "I", "Maximum listen socket pending connection accept queue size (compat)"); static int numopensockets; SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD, &numopensockets, 0, "Number of open sockets"); /* * accept_mtx locks down per-socket fields relating to accept queues. See * socketvar.h for an annotation of the protected fields of struct socket. */ struct mtx accept_mtx; MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF); /* * so_global_mtx protects so_gencnt, numopensockets, and the per-socket * so_gencnt field. 
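 *
 * (Illustrative: numopensockets is exported read-only above, so the
 * current count can be inspected with "sysctl kern.ipc.numopensockets".)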
*/ static struct mtx so_global_mtx; MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF); /* * General IPC sysctl name space, used by sockets and a variety of other IPC * types. */ SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IPC"); /* * Initialize the socket subsystem and set up the socket * memory allocator. */ static uma_zone_t socket_zone; int maxsockets; static void socket_zone_change(void *tag) { maxsockets = uma_zone_set_max(socket_zone, maxsockets); } static void socket_hhook_register(int subtype) { if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype, &V_socket_hhh[subtype], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register hook\n", __func__); } static void socket_hhook_deregister(int subtype) { if (hhook_head_deregister(V_socket_hhh[subtype]) != 0) printf("%s: WARNING: unable to deregister hook\n", __func__); } static void socket_init(void *tag) { socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); maxsockets = uma_zone_set_max(socket_zone, maxsockets); uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached"); EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL, EVENTHANDLER_PRI_FIRST); } SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL); static void socket_vnet_init(const void *unused __unused) { int i; /* We expect a contiguous range */ for (i = 0; i <= HHOOK_SOCKET_LAST; i++) socket_hhook_register(i); } VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_vnet_init, NULL); static void socket_vnet_uninit(const void *unused __unused) { int i; for (i = 0; i <= HHOOK_SOCKET_LAST; i++) socket_hhook_deregister(i); } VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_vnet_uninit, NULL); /* * Initialise maxsockets. This SYSINIT must be run after * tunable_mbinit(). */ static void init_maxsockets(void *ignored) { TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets); maxsockets = imax(maxsockets, maxfiles); } SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL); /* * Sysctl to get and set the maximum global sockets limit. Notify protocols * of the change so that they can update their dependent limits as required. */ static int sysctl_maxsockets(SYSCTL_HANDLER_ARGS) { int error, newmaxsockets; newmaxsockets = maxsockets; error = sysctl_handle_int(oidp, &newmaxsockets, 0, req); if (error == 0 && req->newptr) { if (newmaxsockets > maxsockets && newmaxsockets <= maxfiles) { maxsockets = newmaxsockets; EVENTHANDLER_INVOKE(maxsockets_change); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &maxsockets, 0, sysctl_maxsockets, "IU", "Maximum number of sockets available"); /* * Socket operation routines. These routines are called by the routines in * sys_socket.c or from a system process, and implement the semantics of * socket operations by switching out to the protocol specific routines. */ /* * Get a socket structure from our zone, and initialize it. Note that it * would probably be better to allocate socket and PCB at the same time, but * I'm not convinced that all the protocols can be easily modified to do * this. * * soalloc() returns a socket with a ref count of 0. 
*/ static struct socket * soalloc(struct vnet *vnet) { struct socket *so; so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO); if (so == NULL) return (NULL); #ifdef MAC if (mac_socket_init(so, M_NOWAIT) != 0) { uma_zfree(socket_zone, so); return (NULL); } #endif if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) { uma_zfree(socket_zone, so); return (NULL); } /* * The socket locking protocol allows locking two sockets at a time, * however, the first one must be a listening socket. WITNESS lacks * a feature to change class of an existing lock, so we use DUPOK. */ mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK); SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd"); SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv"); so->so_rcv.sb_sel = &so->so_rdsel; so->so_snd.sb_sel = &so->so_wrsel; sx_init(&so->so_snd.sb_sx, "so_snd_sx"); sx_init(&so->so_rcv.sb_sx, "so_rcv_sx"); TAILQ_INIT(&so->so_snd.sb_aiojobq); TAILQ_INIT(&so->so_rcv.sb_aiojobq); TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so); TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so); #ifdef VIMAGE VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p", __func__, __LINE__, so)); so->so_vnet = vnet; #endif /* We shouldn't need the so_global_mtx */ if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) { /* Do we need more comprehensive error returns? */ uma_zfree(socket_zone, so); return (NULL); } mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; ++numopensockets; #ifdef VIMAGE vnet->vnet_sockcnt++; #endif mtx_unlock(&so_global_mtx); return (so); } /* * Free the storage associated with a socket at the socket layer, tear down * locks, labels, etc. All protocol state is assumed already to have been * torn down (and possibly never set up) by the caller. */ static void sodealloc(struct socket *so) { KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count)); KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL")); mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; --numopensockets; /* Could be below, but faster here. */ #ifdef VIMAGE VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p", __func__, __LINE__, so)); so->so_vnet->vnet_sockcnt--; #endif mtx_unlock(&so_global_mtx); #ifdef MAC mac_socket_destroy(so); #endif hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE); crfree(so->so_cred); khelp_destroy_osd(&so->osd); if (SOLISTENING(so)) { if (so->sol_accept_filter != NULL) accept_filt_setopt(so, NULL); } else { if (so->so_rcv.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY); if (so->so_snd.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); sx_destroy(&so->so_snd.sb_sx); sx_destroy(&so->so_rcv.sb_sx); SOCKBUF_LOCK_DESTROY(&so->so_snd); SOCKBUF_LOCK_DESTROY(&so->so_rcv); } mtx_destroy(&so->so_lock); uma_zfree(socket_zone, so); } /* * socreate returns a socket with a ref count of 1. The socket should be * closed with soclose(). */ int socreate(int dom, struct socket **aso, int type, int proto, struct ucred *cred, struct thread *td) { struct protosw *prp; struct socket *so; int error; if (proto) prp = pffindproto(dom, proto, type); else prp = pffindtype(dom, type); if (prp == NULL) { /* No support for domain. */ if (pffinddomain(dom) == NULL) return (EAFNOSUPPORT); /* No support for socket type.
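 Reaching this point with
 * proto == 0 and a nonzero type means the domain exists but lacks the
 * requested socket type, hence EPROTOTYPE rather than EPROTONOSUPPORT.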
*/ if (proto == 0 && type != 0) return (EPROTOTYPE); return (EPROTONOSUPPORT); } if (prp->pr_usrreqs->pru_attach == NULL || prp->pr_usrreqs->pru_attach == pru_attach_notsupp) return (EPROTONOSUPPORT); if (prison_check_af(cred, prp->pr_domain->dom_family) != 0) return (EPROTONOSUPPORT); if (prp->pr_type != type) return (EPROTOTYPE); so = soalloc(CRED_TO_VNET(cred)); if (so == NULL) return (ENOBUFS); so->so_type = type; so->so_cred = crhold(cred); if ((prp->pr_domain->dom_family == PF_INET) || (prp->pr_domain->dom_family == PF_INET6) || (prp->pr_domain->dom_family == PF_ROUTE)) so->so_fibnum = td->td_proc->p_fibnum; else so->so_fibnum = 0; so->so_proto = prp; #ifdef MAC mac_socket_create(cred, so); #endif knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_lock); knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_lock); /* * Auto-sizing of socket buffers is managed by the protocols and * the appropriate flags must be set in the pru_attach function. */ CURVNET_SET(so->so_vnet); error = (*prp->pr_usrreqs->pru_attach)(so, proto, td); CURVNET_RESTORE(); if (error) { sodealloc(so); return (error); } soref(so); *aso = so; return (0); } #ifdef REGRESSION static int regression_sonewconn_earlytest = 1; SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW, ®ression_sonewconn_earlytest, 0, "Perform early sonewconn limit test"); #endif static struct timeval overinterval = { 60, 0 }; SYSCTL_TIMEVAL_SEC(_kern_ipc, OID_AUTO, sooverinterval, CTLFLAG_RW, &overinterval, "Delay in seconds between warnings for listen socket overflows"); /* * When an attempt at a new connection is noted on a socket which accepts * connections, sonewconn is called. If the connection is possible (subject * to space constraints, etc.) then we allocate a new structure, properly * linked into the data structure of the original socket, and return this. * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED. * * Note: the ref count on the socket is 0 on return. */ struct socket * sonewconn(struct socket *head, int connstatus) { struct sbuf descrsb; struct socket *so; int len, overcount; u_int qlen; const char localprefix[] = "local:"; char descrbuf[SUNPATHLEN + sizeof(localprefix)]; #if defined(INET6) char addrbuf[INET6_ADDRSTRLEN]; #elif defined(INET) char addrbuf[INET_ADDRSTRLEN]; #endif bool dolog, over; SOLISTEN_LOCK(head); over = (head->sol_qlen > 3 * head->sol_qlimit / 2); #ifdef REGRESSION if (regression_sonewconn_earlytest && over) { #else if (over) { #endif head->sol_overcount++; dolog = !!ratecheck(&head->sol_lastover, &overinterval); /* * If we're going to log, copy the overflow count and queue * length from the listen socket before dropping the lock. * Also, reset the overflow count. */ if (dolog) { overcount = head->sol_overcount; head->sol_overcount = 0; qlen = head->sol_qlen; } SOLISTEN_UNLOCK(head); if (dolog) { /* * Try to print something descriptive about the * socket for the error message. 
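 *
 * For orientation, a minimal sbuf(9) sketch of the pattern used below
 * with descrsb (illustrative; buf, addr and port are hypothetical):
 *
 *	char buf[64];
 *	struct sbuf sb;
 *
 *	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
 *	sbuf_printf(&sb, "[%s]:%hu", addr, port);
 *	sbuf_finish(&sb);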
*/ sbuf_new(&descrsb, descrbuf, sizeof(descrbuf), SBUF_FIXEDLEN); switch (head->so_proto->pr_domain->dom_family) { #if defined(INET) || defined(INET6) #ifdef INET case AF_INET: #endif #ifdef INET6 case AF_INET6: if (head->so_proto->pr_domain->dom_family == AF_INET6 || (sotoinpcb(head)->inp_inc.inc_flags & INC_ISIPV6)) { ip6_sprintf(addrbuf, &sotoinpcb(head)->inp_inc.inc6_laddr); sbuf_printf(&descrsb, "[%s]", addrbuf); } else #endif { #ifdef INET inet_ntoa_r( sotoinpcb(head)->inp_inc.inc_laddr, addrbuf); sbuf_cat(&descrsb, addrbuf); #endif } sbuf_printf(&descrsb, ":%hu (proto %u)", ntohs(sotoinpcb(head)->inp_inc.inc_lport), head->so_proto->pr_protocol); break; #endif /* INET || INET6 */ case AF_UNIX: sbuf_cat(&descrsb, localprefix); if (sotounpcb(head)->unp_addr != NULL) len = sotounpcb(head)->unp_addr->sun_len - offsetof(struct sockaddr_un, sun_path); else len = 0; if (len > 0) sbuf_bcat(&descrsb, sotounpcb(head)->unp_addr->sun_path, len); else sbuf_cat(&descrsb, "(unknown)"); break; } /* * If we can't print something more specific, at least * print the domain name. */ if (sbuf_finish(&descrsb) != 0 || sbuf_len(&descrsb) <= 0) { sbuf_clear(&descrsb); sbuf_cat(&descrsb, head->so_proto->pr_domain->dom_name ?: "unknown"); sbuf_finish(&descrsb); } KASSERT(sbuf_len(&descrsb) > 0, ("%s: sbuf creation failed", __func__)); log(LOG_DEBUG, "%s: pcb %p (%s): Listen queue overflow: " "%i already in queue awaiting acceptance " "(%d occurrences)\n", __func__, head->so_pcb, sbuf_data(&descrsb), qlen, overcount); sbuf_delete(&descrsb); overcount = 0; } return (NULL); } SOLISTEN_UNLOCK(head); VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL", __func__, head)); so = soalloc(head->so_vnet); if (so == NULL) { log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " "limit reached or out of memory\n", __func__, head->so_pcb); return (NULL); } so->so_listen = head; so->so_type = head->so_type; so->so_options = head->so_options & ~SO_ACCEPTCONN; so->so_linger = head->so_linger; so->so_state = head->so_state | SS_NOFDREF; so->so_fibnum = head->so_fibnum; so->so_proto = head->so_proto; so->so_cred = crhold(head->so_cred); #ifdef MAC mac_socket_newconn(head, so); #endif knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_lock); knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_lock); VNET_SO_ASSERT(head); if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", __func__, head->so_pcb); return (NULL); } if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", __func__, head->so_pcb); return (NULL); } so->so_rcv.sb_lowat = head->sol_sbrcv_lowat; so->so_snd.sb_lowat = head->sol_sbsnd_lowat; so->so_rcv.sb_timeo = head->sol_sbrcv_timeo; so->so_snd.sb_timeo = head->sol_sbsnd_timeo; so->so_rcv.sb_flags |= head->sol_sbrcv_flags & SB_AUTOSIZE; so->so_snd.sb_flags |= head->sol_sbsnd_flags & SB_AUTOSIZE; SOLISTEN_LOCK(head); if (head->sol_accept_filter != NULL) connstatus = 0; so->so_state |= connstatus; soref(head); /* A socket on (in)complete queue refs head. */ if (connstatus) { TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); so->so_qstate = SQ_COMP; head->sol_qlen++; solisten_wakeup(head); /* unlocks */ } else { /* * Keep removing sockets from the head until there's room for * us to insert on the tail. 
In pre-locking revisions, this * was a simple if(), but as we could be racing with other * threads and soabort() requires dropping locks, we must * loop waiting for the condition to be true. */ while (head->sol_incqlen > head->sol_qlimit) { struct socket *sp; sp = TAILQ_FIRST(&head->sol_incomp); TAILQ_REMOVE(&head->sol_incomp, sp, so_list); head->sol_incqlen--; SOCK_LOCK(sp); sp->so_qstate = SQ_NONE; sp->so_listen = NULL; SOCK_UNLOCK(sp); sorele(head); /* does SOLISTEN_UNLOCK, head stays */ soabort(sp); SOLISTEN_LOCK(head); } TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list); so->so_qstate = SQ_INCOMP; head->sol_incqlen++; SOLISTEN_UNLOCK(head); } return (so); } #if defined(SCTP) || defined(SCTP_SUPPORT) /* * Socket part of sctp_peeloff(). Detach a new socket from an * association. The new socket is returned with a reference. */ struct socket * sopeeloff(struct socket *head) { struct socket *so; VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", __func__, __LINE__, head)); so = soalloc(head->so_vnet); if (so == NULL) { log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " "limit reached or out of memory\n", __func__, head->so_pcb); return (NULL); } so->so_type = head->so_type; so->so_options = head->so_options; so->so_linger = head->so_linger; so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED; so->so_fibnum = head->so_fibnum; so->so_proto = head->so_proto; so->so_cred = crhold(head->so_cred); #ifdef MAC mac_socket_newconn(head, so); #endif knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, so_rdknl_assert_lock); knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, so_wrknl_assert_lock); VNET_SO_ASSERT(head); if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", __func__, head->so_pcb); return (NULL); } if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", __func__, head->so_pcb); return (NULL); } so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; so->so_snd.sb_lowat = head->so_snd.sb_lowat; so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; so->so_snd.sb_timeo = head->so_snd.sb_timeo; so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; soref(so); return (so); } #endif /* SCTP */ -int -sogetsockaddr(struct socket *so, struct sockaddr **nam) -{ - int error; - - CURVNET_SET(so->so_vnet); - error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, nam); - CURVNET_RESTORE(); - return (error); -} - int sobind(struct socket *so, struct sockaddr *nam, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td); CURVNET_RESTORE(); return (error); } int sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_bindat)(fd, so, nam, td); CURVNET_RESTORE(); return (error); } /* * solisten() transitions a socket from a non-listening state to a listening * state, but can also be used to update the listen queue depth on an * existing listen socket. The protocol will call back into the sockets * layer using solisten_proto_check() and solisten_proto() to check and set * socket-layer listen state. Call backs are used so that the protocol can * acquire both protocol and socket layer locks in whatever order is required * by the protocol. 
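 *
 * A sketch of the expected callback sequence (illustrative; the
 * protocol's own locking is elided):
 *
 *	SOCK_LOCK(so);
 *	error = solisten_proto_check(so);
 *	if (error == 0)
 *		solisten_proto(so, backlog);
 *	SOCK_UNLOCK(so);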
* * Protocol implementors are advised to hold the socket lock across * the socket-layer test and set to avoid races at the socket layer. */ int solisten(struct socket *so, int backlog, struct thread *td) { int error; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td); CURVNET_RESTORE(); return (error); } int solisten_proto_check(struct socket *so) { SOCK_LOCK_ASSERT(so); if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) return (EINVAL); return (0); } void solisten_proto(struct socket *so, int backlog) { int sbrcv_lowat, sbsnd_lowat; u_int sbrcv_hiwat, sbsnd_hiwat; short sbrcv_flags, sbsnd_flags; sbintime_t sbrcv_timeo, sbsnd_timeo; SOCK_LOCK_ASSERT(so); if (SOLISTENING(so)) goto listening; /* * Change this socket to listening state. */ sbrcv_lowat = so->so_rcv.sb_lowat; sbsnd_lowat = so->so_snd.sb_lowat; sbrcv_hiwat = so->so_rcv.sb_hiwat; sbsnd_hiwat = so->so_snd.sb_hiwat; sbrcv_flags = so->so_rcv.sb_flags; sbsnd_flags = so->so_snd.sb_flags; sbrcv_timeo = so->so_rcv.sb_timeo; sbsnd_timeo = so->so_snd.sb_timeo; sbdestroy(&so->so_snd, so); sbdestroy(&so->so_rcv, so); sx_destroy(&so->so_snd.sb_sx); sx_destroy(&so->so_rcv.sb_sx); SOCKBUF_LOCK_DESTROY(&so->so_snd); SOCKBUF_LOCK_DESTROY(&so->so_rcv); #ifdef INVARIANTS bzero(&so->so_rcv, sizeof(struct socket) - offsetof(struct socket, so_rcv)); #endif so->sol_sbrcv_lowat = sbrcv_lowat; so->sol_sbsnd_lowat = sbsnd_lowat; so->sol_sbrcv_hiwat = sbrcv_hiwat; so->sol_sbsnd_hiwat = sbsnd_hiwat; so->sol_sbrcv_flags = sbrcv_flags; so->sol_sbsnd_flags = sbsnd_flags; so->sol_sbrcv_timeo = sbrcv_timeo; so->sol_sbsnd_timeo = sbsnd_timeo; so->sol_qlen = so->sol_incqlen = 0; TAILQ_INIT(&so->sol_incomp); TAILQ_INIT(&so->sol_comp); so->sol_accept_filter = NULL; so->sol_accept_filter_arg = NULL; so->sol_accept_filter_str = NULL; so->sol_upcall = NULL; so->sol_upcallarg = NULL; so->so_options |= SO_ACCEPTCONN; listening: if (backlog < 0 || backlog > somaxconn) backlog = somaxconn; so->sol_qlimit = backlog; } /* * Wakeup listeners/subsystems once we have a complete connection. * Enters with lock, returns unlocked. */ void solisten_wakeup(struct socket *sol) { if (sol->sol_upcall != NULL) (void)sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT); else { selwakeuppri(&sol->so_rdsel, PSOCK); KNOTE_LOCKED(&sol->so_rdsel.si_note, 0); } SOLISTEN_UNLOCK(sol); wakeup_one(&sol->sol_comp); if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL) pgsigio(&sol->so_sigio, SIGIO, 0); } /* * Return a single connection off a listening socket queue. The main * consumer of the function is kern_accept4(). Some modules that do their * own accept management also use the function. * * Listening socket must be locked on entry and is returned unlocked on * return. * The flags argument is a set of accept4(2) flags and ACCEPT4_INHERIT.
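 *
 * Illustrative call pattern (modeled on kern_accept4(); error handling
 * elided):
 *
 *	SOLISTEN_LOCK(head);
 *	error = solisten_dequeue(head, &so, flags);
 *
 * after which head is unlocked and, on success, so carries a reference.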
*/ int solisten_dequeue(struct socket *head, struct socket **ret, int flags) { struct socket *so; int error; SOLISTEN_LOCK_ASSERT(head); while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) && head->so_error == 0) { error = msleep(&head->sol_comp, &head->so_lock, PSOCK | PCATCH, "accept", 0); if (error != 0) { SOLISTEN_UNLOCK(head); return (error); } } if (head->so_error) { error = head->so_error; head->so_error = 0; } else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp)) error = EWOULDBLOCK; else error = 0; if (error) { SOLISTEN_UNLOCK(head); return (error); } so = TAILQ_FIRST(&head->sol_comp); SOCK_LOCK(so); KASSERT(so->so_qstate == SQ_COMP, ("%s: so %p not SQ_COMP", __func__, so)); soref(so); head->sol_qlen--; so->so_qstate = SQ_NONE; so->so_listen = NULL; TAILQ_REMOVE(&head->sol_comp, so, so_list); if (flags & ACCEPT4_INHERIT) so->so_state |= (head->so_state & SS_NBIO); else so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; SOCK_UNLOCK(so); sorele(head); *ret = so; return (0); } /* * Evaluate the reference count and named references on a socket; if no * references remain, free it. This should be called whenever a reference is * released, such as in sorele(), but also when named reference flags are * cleared in socket or protocol code. * * sofree() will free the socket if: * * - There are no outstanding file descriptor references or related consumers * (so_count == 0). * * - The socket has been closed by user space, if ever open (SS_NOFDREF). * * - The protocol does not have an outstanding strong reference on the socket * (SS_PROTOREF). * * - The socket is not in a completed connection queue, so a process has been * notified that it is present. If it is removed, the user process may * block in accept() despite select() saying the socket was ready. */ void sofree(struct socket *so) { struct protosw *pr = so->so_proto; SOCK_LOCK_ASSERT(so); if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 || (so->so_state & SS_PROTOREF) || (so->so_qstate == SQ_COMP)) { SOCK_UNLOCK(so); return; } if (!SOLISTENING(so) && so->so_qstate == SQ_INCOMP) { struct socket *sol; sol = so->so_listen; KASSERT(sol, ("%s: so %p on incomp of NULL", __func__, so)); /* * To solve the race between close of a listening socket and * a socket on its incomplete queue, we need to lock both. * The order is first listening socket, then regular. * Since we have neither SS_NOFDREF nor SS_PROTOREF, this * function and the listening socket are the only pointers * to so. To preserve so and sol, we reference both and then * relock. * After the relock the socket may not move to so_comp since it * doesn't have a PCB already, but it may be removed from * so_incomp. If that happens, we share responsibility for * freeing the socket, but soclose() has already removed * it from the queue. */ soref(sol); soref(so); SOCK_UNLOCK(so); SOLISTEN_LOCK(sol); SOCK_LOCK(so); if (so->so_qstate == SQ_INCOMP) { KASSERT(so->so_listen == sol, ("%s: so %p migrated out of sol %p", __func__, so, sol)); TAILQ_REMOVE(&sol->sol_incomp, so, so_list); sol->sol_incqlen--; /* This is guaranteed not to be the last.
*/ refcount_release(&sol->so_count); so->so_qstate = SQ_NONE; so->so_listen = NULL; } else KASSERT(so->so_listen == NULL, ("%s: so %p not on (in)comp with so_listen", __func__, so)); sorele(sol); KASSERT(so->so_count == 1, ("%s: so %p count %u", __func__, so, so->so_count)); so->so_count = 0; } if (SOLISTENING(so)) so->so_error = ECONNABORTED; SOCK_UNLOCK(so); if (so->so_dtor != NULL) so->so_dtor(so); VNET_SO_ASSERT(so); if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) (*pr->pr_domain->dom_dispose)(so); if (pr->pr_usrreqs->pru_detach != NULL) (*pr->pr_usrreqs->pru_detach)(so); /* * From this point on, we assume that no other references to this * socket exist anywhere else in the stack. Therefore, no locks need * to be acquired or held. * * We used to do a lot of socket buffer and socket locking here, as * well as invoke sorflush() and perform wakeups. The direct calls to * dom_dispose() and sbdestroy() are an inlining of what was * necessary from sorflush(). * * Notice that the socket buffer and kqueue state are torn down * before calling pru_detach. This means that protocols should not * assume they can perform socket wakeups, etc, in their detach code. */ if (!SOLISTENING(so)) { sbdestroy(&so->so_snd, so); sbdestroy(&so->so_rcv, so); } seldrain(&so->so_rdsel); seldrain(&so->so_wrsel); knlist_destroy(&so->so_rdsel.si_note); knlist_destroy(&so->so_wrsel.si_note); sodealloc(so); } /* * Close a socket on last file table reference removal. Initiate disconnect * if connected. Free socket when disconnect complete. * * This function will sorele() the socket. Note that soclose() may be called * prior to the ref count reaching zero. The actual socket structure will * not be freed until the ref count reaches zero. */ int soclose(struct socket *so) { struct accept_queue lqueue; bool listening; int error = 0; KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter")); CURVNET_SET(so->so_vnet); funsetown(&so->so_sigio); if (so->so_state & SS_ISCONNECTED) { if ((so->so_state & SS_ISDISCONNECTING) == 0) { error = sodisconnect(so); if (error) { if (error == ENOTCONN) error = 0; goto drop; } } if ((so->so_options & SO_LINGER) != 0 && so->so_linger != 0) { if ((so->so_state & SS_ISDISCONNECTING) && (so->so_state & SS_NBIO)) goto drop; while (so->so_state & SS_ISCONNECTED) { error = tsleep(&so->so_timeo, PSOCK | PCATCH, "soclos", so->so_linger * hz); if (error) break; } } } drop: if (so->so_proto->pr_usrreqs->pru_close != NULL) (*so->so_proto->pr_usrreqs->pru_close)(so); SOCK_LOCK(so); if ((listening = (so->so_options & SO_ACCEPTCONN))) { struct socket *sp; TAILQ_INIT(&lqueue); TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list); TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list); so->sol_qlen = so->sol_incqlen = 0; TAILQ_FOREACH(sp, &lqueue, so_list) { SOCK_LOCK(sp); sp->so_qstate = SQ_NONE; sp->so_listen = NULL; SOCK_UNLOCK(sp); /* Guaranteed not to be the last. */ refcount_release(&so->so_count); } } KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF")); so->so_state |= SS_NOFDREF; sorele(so); if (listening) { struct socket *sp, *tsp; TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp) { SOCK_LOCK(sp); if (sp->so_count == 0) { SOCK_UNLOCK(sp); soabort(sp); } else /* sp is now in sofree() */ SOCK_UNLOCK(sp); } } CURVNET_RESTORE(); return (error); } /* * soabort() is used to abruptly tear down a connection, such as when a * resource limit is reached (listen queue depth exceeded), or if a listen * socket is closed while there are sockets waiting to be accepted.
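 *
 * (Illustrative: the incomplete-queue drain in sonewconn() above shows
 * the expected call pattern: remove the socket from its queue, drop the
 * listen lock, then soabort() it.)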
* * This interface is tricky, because it is called on an unreferenced socket, * and must be called only by a thread that has actually removed the socket * from the listen queue it was on, or races with other threads are risked. * * This interface will call into the protocol code, so must not be called * with any socket locks held. Protocols do call it while holding their own * recursible protocol mutexes, but this is something that should be subject * to review in the future. */ void soabort(struct socket *so) { /* * In as much as is possible, assert that no references to this * socket are held. This is not quite the same as asserting that the * current thread is responsible for arranging for no references, but * is as close as we can get for now. */ KASSERT(so->so_count == 0, ("soabort: so_count")); KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF")); KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF")); VNET_SO_ASSERT(so); if (so->so_proto->pr_usrreqs->pru_abort != NULL) (*so->so_proto->pr_usrreqs->pru_abort)(so); SOCK_LOCK(so); sofree(so); } int soaccept(struct socket *so, struct sockaddr **nam) { int error; SOCK_LOCK(so); KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF")); so->so_state &= ~SS_NOFDREF; SOCK_UNLOCK(so); CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); CURVNET_RESTORE(); return (error); } int soconnect(struct socket *so, struct sockaddr *nam, struct thread *td) { return (soconnectat(AT_FDCWD, so, nam, td)); } int soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { int error; if (so->so_options & SO_ACCEPTCONN) return (EOPNOTSUPP); CURVNET_SET(so->so_vnet); /* * If protocol is connection-based, can only connect once. * Otherwise, if connected, try to disconnect first. This allows * user to disconnect by connecting to, e.g., a null address. */ if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && ((so->so_proto->pr_flags & PR_CONNREQUIRED) || (error = sodisconnect(so)))) { error = EISCONN; } else { /* * Prevent accumulated error from previous connection from * biting us. */ so->so_error = 0; if (fd == AT_FDCWD) { error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td); } else { error = (*so->so_proto->pr_usrreqs->pru_connectat)(fd, so, nam, td); } } CURVNET_RESTORE(); return (error); } int soconnect2(struct socket *so1, struct socket *so2) { int error; CURVNET_SET(so1->so_vnet); error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2); CURVNET_RESTORE(); return (error); } int sodisconnect(struct socket *so) { int error; if ((so->so_state & SS_ISCONNECTED) == 0) return (ENOTCONN); if (so->so_state & SS_ISDISCONNECTING) return (EALREADY); VNET_SO_ASSERT(so); error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so); return (error); } #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT) int sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { long space; ssize_t resid; int clen = 0, error, dontroute; KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM")); KASSERT(so->so_proto->pr_flags & PR_ATOMIC, ("sosend_dgram: !PR_ATOMIC")); if (uio != NULL) resid = uio->uio_resid; else resid = top->m_pkthdr.len; /* * In theory resid should be unsigned. However, space must be * signed, as it might be less than 0 if we over-committed, and we * must use a signed comparison of space and resid. 
On the other * hand, a negative resid causes us to loop sending 0-length * segments to the protocol. */ if (resid < 0) { error = EINVAL; goto out; } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0; if (td != NULL) td->td_ru.ru_msgsnd++; if (control != NULL) clen = control->m_len; SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(&so->so_snd); error = EPIPE; goto out; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto out; } if ((so->so_state & SS_ISCONNECTED) == 0) { /* * `sendto' and `sendmsg' is allowed on a connection-based * socket if it supports implied connect. Return ENOTCONN if * not connected and no address is supplied. */ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { if ((so->so_state & SS_ISCONFIRMING) == 0 && !(resid == 0 && clen != 0)) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto out; } } else if (addr == NULL) { if (so->so_proto->pr_flags & PR_CONNREQUIRED) error = ENOTCONN; else error = EDESTADDRREQ; SOCKBUF_UNLOCK(&so->so_snd); goto out; } } /* * Do we need MSG_OOB support in SOCK_DGRAM? Signs here may be a * problem and need fixing. */ space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; space -= clen; SOCKBUF_UNLOCK(&so->so_snd); if (resid > space) { error = EMSGSIZE; goto out; } if (uio == NULL) { resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; } else { /* * Copy the data from userland into a mbuf chain. * If no data is to be copied in, a single empty mbuf * is returned. */ top = m_uiotombuf(uio, M_WAITOK, space, max_hdr, (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0))); if (top == NULL) { error = EFAULT; /* only possible error */ goto out; } space -= resid - uio->uio_resid; resid = uio->uio_resid; } KASSERT(resid == 0, ("sosend_dgram: resid != 0")); /* * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock * than with. */ if (dontroute) { SOCK_LOCK(so); so->so_options |= SO_DONTROUTE; SOCK_UNLOCK(so); } /* * XXX all the SBS_CANTSENDMORE checks previously done could be out * of date. We could have received a reset packet in an interrupt or * maybe we slept while doing page faults in uiomove() etc. We could * probably recheck again inside the locking protection here, but * there are probably other places that this also happens. We must * rethink this. */ VNET_SO_ASSERT(so); error = (*so->so_proto->pr_usrreqs->pru_send)(so, (flags & MSG_OOB) ? PRUS_OOB : /* * If the user set MSG_EOF, the protocol understands this flag and * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND. */ ((flags & MSG_EOF) && (so->so_proto->pr_flags & PR_IMPLOPCL) && (resid <= 0)) ? PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME */ (flags & MSG_MORETOCOME) || (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, top, addr, control, td); if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; SOCK_UNLOCK(so); } clen = 0; control = NULL; top = NULL; out: if (top != NULL) m_freem(top); if (control != NULL) m_freem(control); return (error); } /* * Send on a socket. If send must go all at once and message is larger than * send buffering, then hard error. Lock against other senders. If must go * all at once and not enough room now, then inform user that this would * block and do nothing. Otherwise, if nonblocking, send as much as * possible. The data to be sent is described by "uio" if nonzero, otherwise * by the mbuf chain "top" (which must be null if uio is not). 
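 *
 * As a userland illustration of this contract (a hypothetical sketch,
 * not part of this file; fd, buf and len are assumed to exist):
 *
 *	ssize_t n = send(fd, buf, len, MSG_DONTWAIT);
 *	if (n == -1 && errno == EMSGSIZE)
 *		;	// atomic send larger than the buffer: hard error
 *	else if (n == -1 && errno == EWOULDBLOCK)
 *		;	// no room right now and we were told not to block
 *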
Data provided * in mbuf chain must be small enough to send all at once. * * Returns nonzero on error, timeout or signal; callers must check for short * counts if EINTR/ERESTART are returned. Data and control buffers are freed * on return. */ int sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { long space; ssize_t resid; int clen = 0, error, dontroute; int atomic = sosendallatonce(so) || top; int pru_flag; #ifdef KERN_TLS struct ktls_session *tls; int tls_enq_cnt, tls_pruflag; uint8_t tls_rtype; tls = NULL; tls_rtype = TLS_RLTYPE_APP; #endif if (uio != NULL) resid = uio->uio_resid; else if ((top->m_flags & M_PKTHDR) != 0) resid = top->m_pkthdr.len; else resid = m_length(top, NULL); /* * In theory resid should be unsigned. However, space must be * signed, as it might be less than 0 if we over-committed, and we * must use a signed comparison of space and resid. On the other * hand, a negative resid causes us to loop sending 0-length * segments to the protocol. * * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM * type sockets since that's an error. */ if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { error = EINVAL; goto out; } dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && (so->so_proto->pr_flags & PR_ATOMIC); if (td != NULL) td->td_ru.ru_msgsnd++; if (control != NULL) clen = control->m_len; error = sblock(&so->so_snd, SBLOCKWAIT(flags)); if (error) goto out; #ifdef KERN_TLS tls_pruflag = 0; tls = ktls_hold(so->so_snd.sb_tls_info); if (tls != NULL) { if (tls->mode == TCP_TLS_MODE_SW) tls_pruflag = PRUS_NOTREADY; if (control != NULL) { struct cmsghdr *cm = mtod(control, struct cmsghdr *); if (clen >= sizeof(*cm) && cm->cmsg_type == TLS_SET_RECORD_TYPE) { tls_rtype = *((uint8_t *)CMSG_DATA(cm)); clen = 0; m_freem(control); control = NULL; atomic = 1; } } } #endif restart: do { SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(&so->so_snd); error = EPIPE; goto release; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto release; } if ((so->so_state & SS_ISCONNECTED) == 0) { /* * `sendto' and `sendmsg' is allowed on a connection- * based socket if it supports implied connect. * Return ENOTCONN if not connected and no address is * supplied. 
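 *
 * Seen from userland (illustrative sketch; tcp_fd and udp_fd are
 * assumed descriptors):
 *
 *	send(tcp_fd, "x", 1, 0);	// -1/ENOTCONN before connect(2)
 *	send(udp_fd, "x", 1, 0);	// -1/EDESTADDRREQ with no peer set
 *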
*/ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { if ((so->so_state & SS_ISCONFIRMING) == 0 && !(resid == 0 && clen != 0)) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto release; } } else if (addr == NULL) { SOCKBUF_UNLOCK(&so->so_snd); if (so->so_proto->pr_flags & PR_CONNREQUIRED) error = ENOTCONN; else error = EDESTADDRREQ; goto release; } } space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; if ((atomic && resid > so->so_snd.sb_hiwat) || clen > so->so_snd.sb_hiwat) { SOCKBUF_UNLOCK(&so->so_snd); error = EMSGSIZE; goto release; } if (space < resid + clen && (atomic || space < so->so_snd.sb_lowat || space < clen)) { if ((so->so_state & SS_NBIO) || (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) { SOCKBUF_UNLOCK(&so->so_snd); error = EWOULDBLOCK; goto release; } error = sbwait(&so->so_snd); SOCKBUF_UNLOCK(&so->so_snd); if (error) goto release; goto restart; } SOCKBUF_UNLOCK(&so->so_snd); space -= clen; do { if (uio == NULL) { resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; #ifdef KERN_TLS if (tls != NULL) { ktls_frame(top, tls, &tls_enq_cnt, tls_rtype); tls_rtype = TLS_RLTYPE_APP; } #endif } else { /* * Copy the data from userland into a mbuf * chain. If resid is 0, which can happen * only if we have control to send, then * a single empty mbuf is returned. This * is a workaround to prevent protocol send * methods from panicking. */ #ifdef KERN_TLS if (tls != NULL) { top = m_uiotombuf(uio, M_WAITOK, space, tls->params.max_frame_len, M_EXTPG | ((flags & MSG_EOR) ? M_EOR : 0)); if (top != NULL) { ktls_frame(top, tls, &tls_enq_cnt, tls_rtype); } tls_rtype = TLS_RLTYPE_APP; } else #endif top = m_uiotombuf(uio, M_WAITOK, space, (atomic ? max_hdr : 0), (atomic ? M_PKTHDR : 0) | ((flags & MSG_EOR) ? M_EOR : 0)); if (top == NULL) { error = EFAULT; /* only possible error */ goto release; } space -= resid - uio->uio_resid; resid = uio->uio_resid; } if (dontroute) { SOCK_LOCK(so); so->so_options |= SO_DONTROUTE; SOCK_UNLOCK(so); } /* * XXX all the SBS_CANTSENDMORE checks previously * done could be out of date. We could have received * a reset packet in an interrupt or maybe we slept * while doing page faults in uiomove() etc. We * could probably recheck again inside the locking * protection here, but there are probably other * places that this also happens. We must rethink * this. */ VNET_SO_ASSERT(so); pru_flag = (flags & MSG_OOB) ? PRUS_OOB : /* * If the user set MSG_EOF, the protocol understands * this flag and nothing left to send then use * PRU_SEND_EOF instead of PRU_SEND. */ ((flags & MSG_EOF) && (so->so_proto->pr_flags & PR_IMPLOPCL) && (resid <= 0)) ? PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME. */ (flags & MSG_MORETOCOME) || (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0; #ifdef KERN_TLS pru_flag |= tls_pruflag; #endif error = (*so->so_proto->pr_usrreqs->pru_send)(so, pru_flag, top, addr, control, td); if (dontroute) { SOCK_LOCK(so); so->so_options &= ~SO_DONTROUTE; SOCK_UNLOCK(so); } #ifdef KERN_TLS if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) { /* * Note that error is intentionally * ignored. * * Like sendfile(), we rely on the * completion routine (pru_ready()) * to free the mbufs in the event that * pru_send() encountered an error and * did not append them to the sockbuf.
*/ soref(so); ktls_enqueue(top, so, tls_enq_cnt); } #endif clen = 0; control = NULL; top = NULL; if (error) goto release; } while (resid && space > 0); } while (resid); release: sbunlock(&so->so_snd); out: #ifdef KERN_TLS if (tls != NULL) ktls_free(tls); #endif if (top != NULL) m_freem(top); if (control != NULL) m_freem(control); return (error); } int sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { int error; CURVNET_SET(so->so_vnet); if (!SOLISTENING(so)) error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top, control, flags, td); else { m_freem(top); m_freem(control); error = ENOTCONN; } CURVNET_RESTORE(); return (error); } /* * The part of soreceive() that implements reading non-inline out-of-band * data from a socket. For more complete comments, see soreceive(), from * which this code originated. * * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is * unable to return an mbuf chain to the caller. */ static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) { struct protosw *pr = so->so_proto; struct mbuf *m; int error; KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); VNET_SO_ASSERT(so); m = m_get(M_WAITOK, MT_DATA); error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); if (error) goto bad; do { error = uiomove(mtod(m, void *), (int) min(uio->uio_resid, m->m_len), uio); m = m_free(m); } while (uio->uio_resid && error == 0 && m); bad: if (m != NULL) m_freem(m); return (error); } /* * Following replacement or removal of the first mbuf on the first mbuf chain * of a socket buffer, push necessary state changes back into the socket * buffer so that other consumers see the values consistently. 'nextrecord' * is the callers locally stored value of the original value of * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes. * NOTE: 'nextrecord' may be NULL. */ static __inline void sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord) { SOCKBUF_LOCK_ASSERT(sb); /* * First, update for the new value of nextrecord. If necessary, make * it the first record. */ if (sb->sb_mb != NULL) sb->sb_mb->m_nextpkt = nextrecord; else sb->sb_mb = nextrecord; /* * Now update any dependent socket buffer fields to reflect the new * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the * addition of a second clause that takes care of the case where * sb_mb has been updated, but remains the last record. */ if (sb->sb_mb == NULL) { sb->sb_mbtail = NULL; sb->sb_lastrecord = NULL; } else if (sb->sb_mb->m_nextpkt == NULL) sb->sb_lastrecord = sb->sb_mb; } /* * Implement receive operations on a socket. We depend on the way that * records are added to the sockbuf by sbappend. In particular, each record * (mbufs linked through m_next) must begin with an address if the protocol * so specifies, followed by an optional mbuf or mbufs containing ancillary * data, and then zero or more mbufs of data. In order to allow parallelism * between network receive and copying to user space, as well as avoid * sleeping with a mutex held, we release the socket buffer mutex during the * user space copy. Although the sockbuf is locked, new data may still be * appended, and thus we must maintain consistency of the sockbuf during that * time. * * The caller may receive the data as a single mbuf chain by supplying an * mbuf **mp0 for use in returning the chain. The uio is then used only for * the count in uio_resid. 
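 *
 * A hypothetical in-kernel caller that wants the chain itself might
 * look like this (sketch only; so, len and error are assumed to exist):
 *
 *	struct uio auio;
 *	struct mbuf *mp = NULL;
 *
 *	bzero(&auio, sizeof(auio));
 *	auio.uio_resid = len;
 *	error = soreceive(so, NULL, &auio, &mp, NULL, NULL);
 *	// on success, up to len bytes come back in 'mp', not copied out
 *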
*/ int soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct mbuf *m, **mp; int flags, error, offset; ssize_t len; struct protosw *pr = so->so_proto; struct mbuf *nextrecord; int moff, type = 0; ssize_t orig_resid = uio->uio_resid; mp = mp0; if (psa != NULL) *psa = NULL; if (controlp != NULL) *controlp = NULL; if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; if (flags & MSG_OOB) return (soreceive_rcvoob(so, uio, flags)); if (mp != NULL) *mp = NULL; if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING) && uio->uio_resid) { VNET_SO_ASSERT(so); (*pr->pr_usrreqs->pru_rcvd)(so, 0); } error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); if (error) return (error); restart: SOCKBUF_LOCK(&so->so_rcv); m = so->so_rcv.sb_mb; /* * If we have less data than requested, block awaiting more (subject * to any timeout) if: * 1. the current count is less than the low water mark, or * 2. MSG_DONTWAIT is not set */ if (m == NULL || (((flags & MSG_DONTWAIT) == 0 && sbavail(&so->so_rcv) < uio->uio_resid) && sbavail(&so->so_rcv) < so->so_rcv.sb_lowat && m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) { KASSERT(m != NULL || !sbavail(&so->so_rcv), ("receive: m == %p sbavail == %u", m, sbavail(&so->so_rcv))); if (so->so_error) { if (m != NULL) goto dontblock; error = so->so_error; if ((flags & MSG_PEEK) == 0) so->so_error = 0; SOCKBUF_UNLOCK(&so->so_rcv); goto release; } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { if (m != NULL) goto dontblock; #ifdef KERN_TLS else if (so->so_rcv.sb_tlsdcc == 0 && so->so_rcv.sb_tlscc == 0) { #else else { #endif SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } for (; m != NULL; m = m->m_next) if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { m = so->so_rcv.sb_mb; goto dontblock; } if ((so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED | SS_ISDISCONNECTING | SS_ISDISCONNECTED)) == 0 && (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) { SOCKBUF_UNLOCK(&so->so_rcv); error = ENOTCONN; goto release; } if (uio->uio_resid == 0) { SOCKBUF_UNLOCK(&so->so_rcv); goto release; } if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) { SOCKBUF_UNLOCK(&so->so_rcv); error = EWOULDBLOCK; goto release; } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); error = sbwait(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); if (error) goto release; goto restart; } dontblock: /* * From this point onward, we maintain 'nextrecord' as a cache of the * pointer to the next record in the socket buffer. We must keep the * various socket buffer pointers and local stack versions of the * pointers in sync, pushing out modifications before dropping the * socket buffer mutex, and re-reading them when picking it up. * * Otherwise, we will race with the network stack appending new data * or records onto the socket buffer by using inconsistent/stale * versions of the field, possibly resulting in socket buffer * corruption. * * By holding the high-level sblock(), we prevent simultaneous * readers from pulling off the front of the socket buffer. 
*/ SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb")); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); nextrecord = m->m_nextpkt; if (pr->pr_flags & PR_ADDR) { KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type)); orig_resid = 0; if (psa != NULL) *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_NOWAIT); if (flags & MSG_PEEK) { m = m->m_next; } else { sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; sockbuf_pushsync(&so->so_rcv, nextrecord); } } /* * Process one or more MT_CONTROL mbufs present before any data mbufs * in the first mbuf chain on the socket buffer. If MSG_PEEK, we * just copy the data; if !MSG_PEEK, we call into the protocol to * perform externalization (or freeing if controlp == NULL). */ if (m != NULL && m->m_type == MT_CONTROL) { struct mbuf *cm = NULL, *cmn; struct mbuf **cme = &cm; #ifdef KERN_TLS struct cmsghdr *cmsg; struct tls_get_record tgr; /* * For MSG_TLSAPPDATA, check for a non-application data * record. If found, return ENXIO without removing * it from the receive queue. This allows a subsequent * call without MSG_TLSAPPDATA to receive it. * Note that, for TLS, there should only be a single * control mbuf with the TLS_GET_RECORD message in it. */ if (flags & MSG_TLSAPPDATA) { cmsg = mtod(m, struct cmsghdr *); if (cmsg->cmsg_type == TLS_GET_RECORD && cmsg->cmsg_len == CMSG_LEN(sizeof(tgr))) { memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr)); /* This will need to change for TLS 1.3. */ if (tgr.tls_type != TLS_RLTYPE_APP) { SOCKBUF_UNLOCK(&so->so_rcv); error = ENXIO; goto release; } } } #endif do { if (flags & MSG_PEEK) { if (controlp != NULL) { *controlp = m_copym(m, 0, m->m_len, M_NOWAIT); controlp = &(*controlp)->m_next; } m = m->m_next; } else { sbfree(&so->so_rcv, m); so->so_rcv.sb_mb = m->m_next; m->m_next = NULL; *cme = m; cme = &(*cme)->m_next; m = so->so_rcv.sb_mb; } } while (m != NULL && m->m_type == MT_CONTROL); if ((flags & MSG_PEEK) == 0) sockbuf_pushsync(&so->so_rcv, nextrecord); while (cm != NULL) { cmn = cm->m_next; cm->m_next = NULL; if (pr->pr_domain->dom_externalize != NULL) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); error = (*pr->pr_domain->dom_externalize) (cm, controlp, flags); SOCKBUF_LOCK(&so->so_rcv); } else if (controlp != NULL) *controlp = cm; else m_freem(cm); if (controlp != NULL) { orig_resid = 0; while (*controlp != NULL) controlp = &(*controlp)->m_next; } cm = cmn; } if (m != NULL) nextrecord = so->so_rcv.sb_mb->m_nextpkt; else nextrecord = so->so_rcv.sb_mb; orig_resid = 0; } if (m != NULL) { if ((flags & MSG_PEEK) == 0) { KASSERT(m->m_nextpkt == nextrecord, ("soreceive: post-control, nextrecord !sync")); if (nextrecord == NULL) { KASSERT(so->so_rcv.sb_mb == m, ("soreceive: post-control, sb_mb!=m")); KASSERT(so->so_rcv.sb_lastrecord == m, ("soreceive: post-control, lastrecord!=m")); } } type = m->m_type; if (type == MT_OOBDATA) flags |= MSG_OOB; } else { if ((flags & MSG_PEEK) == 0) { KASSERT(so->so_rcv.sb_mb == nextrecord, ("soreceive: sb_mb != nextrecord")); if (so->so_rcv.sb_mb == NULL) { KASSERT(so->so_rcv.sb_lastrecord == NULL, ("soreceive: sb_lastrecord != NULL")); } } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * Now continue to read any data mbufs off of the head of the socket * buffer until the read request is satisfied.
Note that 'type' is * used to store the type of any mbuf reads that have happened so far * such that soreceive() can stop reading if the type changes, which * causes soreceive() to return only one of regular data and inline * out-of-band data in a single socket receive operation. */ moff = 0; offset = 0; while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0 && error == 0) { /* * If the type of mbuf has changed since the last mbuf * examined ('type'), end the receive operation. */ SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) { if (type != m->m_type) break; } else if (type == MT_OOBDATA) break; else KASSERT(m->m_type == MT_DATA, ("m->m_type == %d", m->m_type)); so->so_rcv.sb_state &= ~SBS_RCVATMARK; len = uio->uio_resid; if (so->so_oobmark && len > so->so_oobmark - offset) len = so->so_oobmark - offset; if (len > m->m_len - moff) len = m->m_len - moff; /* * If mp is set, just pass back the mbufs. Otherwise copy * them out via the uio, then free. Sockbuf must be * consistent here (points to current mbuf, it points to next * record) when we drop priority; we must note any additions * to the sockbuf when we block interrupts again. */ if (mp == NULL) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); if ((m->m_flags & M_EXTPG) != 0) error = m_unmappedtouio(m, moff, uio, (int)len); else error = uiomove(mtod(m, char *) + moff, (int)len, uio); SOCKBUF_LOCK(&so->so_rcv); if (error) { /* * The MT_SONAME mbuf has already been removed * from the record, so it is necessary to * remove the data mbufs, if any, to preserve * the invariant in the case of PR_ADDR that * requires MT_SONAME mbufs at the head of * each record. */ if (pr->pr_flags & PR_ATOMIC && ((flags & MSG_PEEK) == 0)) (void)sbdroprecord_locked(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } else uio->uio_resid -= len; SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (len == m->m_len - moff) { if (m->m_flags & M_EOR) flags |= MSG_EOR; if (flags & MSG_PEEK) { m = m->m_next; moff = 0; } else { nextrecord = m->m_nextpkt; sbfree(&so->so_rcv, m); if (mp != NULL) { m->m_nextpkt = NULL; *mp = m; mp = &m->m_next; so->so_rcv.sb_mb = m = m->m_next; *mp = NULL; } else { so->so_rcv.sb_mb = m_free(m); m = so->so_rcv.sb_mb; } sockbuf_pushsync(&so->so_rcv, nextrecord); SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); } } else { if (flags & MSG_PEEK) moff += len; else { if (mp != NULL) { if (flags & MSG_DONTWAIT) { *mp = m_copym(m, 0, len, M_NOWAIT); if (*mp == NULL) { /* * m_copym() couldn't * allocate an mbuf. * Adjust uio_resid back * (it was adjusted * down by len bytes, * which we didn't end * up "copying" over). */ uio->uio_resid += len; break; } } else { SOCKBUF_UNLOCK(&so->so_rcv); *mp = m_copym(m, 0, len, M_WAITOK); SOCKBUF_LOCK(&so->so_rcv); } } sbcut_locked(&so->so_rcv, len); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_oobmark) { if ((flags & MSG_PEEK) == 0) { so->so_oobmark -= len; if (so->so_oobmark == 0) { so->so_rcv.sb_state |= SBS_RCVATMARK; break; } } else { offset += len; if (offset == so->so_oobmark) break; } } if (flags & MSG_EOR) break; /* * If the MSG_WAITALL flag is set (for non-atomic socket), we * must not quit until "uio->uio_resid == 0" or an error * termination. If a signal/timeout occurs, return with a * short count but without error. Keep sockbuf locked * against other readers. 
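 *
 * The userland-visible effect (illustrative sketch; fd, buf and want
 * are assumed): the call below returns only once 'want' bytes have
 * arrived, on EOF, or on a signal/timeout, in which case a short count
 * comes back without error:
 *
 *	ssize_t n = recv(fd, buf, want, MSG_WAITALL);
 *	if (n > 0 && (size_t)n < want)
 *		;	// interrupted or timed out: short count, no error
 *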
*/ while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 && !sosendallatonce(so) && nextrecord == NULL) { SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE) break; /* * Notify the protocol that some data has been * drained before blocking. */ if (pr->pr_flags & PR_WANTRCVD) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); (*pr->pr_usrreqs->pru_rcvd)(so, flags); SOCKBUF_LOCK(&so->so_rcv); } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * We could receive some data while we were notifying * the protocol. Skip blocking in this case. */ if (so->so_rcv.sb_mb == NULL) { error = sbwait(&so->so_rcv); if (error) { SOCKBUF_UNLOCK(&so->so_rcv); goto release; } } m = so->so_rcv.sb_mb; if (m != NULL) nextrecord = m->m_nextpkt; } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (m != NULL && pr->pr_flags & PR_ATOMIC) { flags |= MSG_TRUNC; if ((flags & MSG_PEEK) == 0) (void) sbdroprecord_locked(&so->so_rcv); } if ((flags & MSG_PEEK) == 0) { if (m == NULL) { /* * First part is an inline SB_EMPTY_FIXUP(). Second * part makes sure sb_lastrecord is up-to-date if * there is still data in the socket buffer. */ so->so_rcv.sb_mb = nextrecord; if (so->so_rcv.sb_mb == NULL) { so->so_rcv.sb_mbtail = NULL; so->so_rcv.sb_lastrecord = NULL; } else if (nextrecord->m_nextpkt == NULL) so->so_rcv.sb_lastrecord = nextrecord; } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); /* * If soreceive() is being done from the socket callback, * then we don't need to generate an ACK to the peer to * update the window, since the ACK will be generated on * return to TCP. */ if (!(flags & MSG_SOCALLBCK) && (pr->pr_flags & PR_WANTRCVD)) { SOCKBUF_UNLOCK(&so->so_rcv); VNET_SO_ASSERT(so); (*pr->pr_usrreqs->pru_rcvd)(so, flags); SOCKBUF_LOCK(&so->so_rcv); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (orig_resid == uio->uio_resid && orig_resid && (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) { SOCKBUF_UNLOCK(&so->so_rcv); goto restart; } SOCKBUF_UNLOCK(&so->so_rcv); if (flagsp != NULL) *flagsp |= flags; release: sbunlock(&so->so_rcv); return (error); } /* * Optimized version of soreceive() for stream (TCP) sockets. */ int soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { int len = 0, error = 0, flags, oresid; struct sockbuf *sb; struct mbuf *m, *n = NULL; /* We only do stream sockets. */ if (so->so_type != SOCK_STREAM) return (EINVAL); if (psa != NULL) *psa = NULL; if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; if (controlp != NULL) *controlp = NULL; if (flags & MSG_OOB) return (soreceive_rcvoob(so, uio, flags)); if (mp0 != NULL) *mp0 = NULL; sb = &so->so_rcv; #ifdef KERN_TLS /* * KTLS stores TLS records as records with a control message to * describe the framing. * * We check once here before acquiring locks to optimize the * common case. */ if (sb->sb_tls_info != NULL) return (soreceive_generic(so, psa, uio, mp0, controlp, flagsp)); #endif /* Prevent other readers from entering the socket. */ error = sblock(sb, SBLOCKWAIT(flags)); if (error) return (error); SOCKBUF_LOCK(sb); #ifdef KERN_TLS if (sb->sb_tls_info != NULL) { SOCKBUF_UNLOCK(sb); sbunlock(sb); return (soreceive_generic(so, psa, uio, mp0, controlp, flagsp)); } #endif /* Easy one, no space to copyout anything. */ if (uio->uio_resid == 0) { error = EINVAL; goto out; } oresid = uio->uio_resid; /* We will never ever get anything unless we are or were connected.
*/ if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { error = ENOTCONN; goto out; } restart: SOCKBUF_LOCK_ASSERT(&so->so_rcv); /* Abort if socket has reported problems. */ if (so->so_error) { if (sbavail(sb) > 0) goto deliver; if (oresid > uio->uio_resid) goto out; error = so->so_error; if (!(flags & MSG_PEEK)) so->so_error = 0; goto out; } /* Door is closed. Deliver what is left, if any. */ if (sb->sb_state & SBS_CANTRCVMORE) { if (sbavail(sb) > 0) goto deliver; else goto out; } /* Socket buffer is empty and we shall not block. */ if (sbavail(sb) == 0 && ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { error = EAGAIN; goto out; } /* Socket buffer got some data that we shall deliver now. */ if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) && ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)) || sbavail(sb) >= sb->sb_lowat || sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat) ) { goto deliver; } /* On MSG_WAITALL we must wait until all data or error arrives. */ if ((flags & MSG_WAITALL) && (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat)) goto deliver; /* * Wait and block until (more) data comes in. * NB: Drops the sockbuf lock during wait. */ error = sbwait(sb); if (error) goto out; goto restart; deliver: SOCKBUF_LOCK_ASSERT(&so->so_rcv); KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__)); KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); /* Statistics. */ if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; /* Fill uio until full or current end of socket buffer is reached. */ len = min(uio->uio_resid, sbavail(sb)); if (mp0 != NULL) { /* Dequeue as many mbufs as possible. */ if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { if (*mp0 == NULL) *mp0 = sb->sb_mb; else m_cat(*mp0, sb->sb_mb); for (m = sb->sb_mb; m != NULL && m->m_len <= len; m = m->m_next) { KASSERT(!(m->m_flags & M_NOTAVAIL), ("%s: m %p not available", __func__, m)); len -= m->m_len; uio->uio_resid -= m->m_len; sbfree(sb, m); n = m; } n->m_next = NULL; sb->sb_mb = m; sb->sb_lastrecord = sb->sb_mb; if (sb->sb_mb == NULL) SB_EMPTY_FIXUP(sb); } /* Copy the remainder. */ if (len > 0) { KASSERT(sb->sb_mb != NULL, ("%s: len > 0 && sb->sb_mb empty", __func__)); m = m_copym(sb->sb_mb, 0, len, M_NOWAIT); if (m == NULL) len = 0; /* Don't flush data from sockbuf. */ else uio->uio_resid -= len; if (*mp0 != NULL) m_cat(*mp0, m); else *mp0 = m; if (*mp0 == NULL) { error = ENOBUFS; goto out; } } } else { /* NB: Must unlock socket buffer as uiomove may sleep. */ SOCKBUF_UNLOCK(sb); error = m_mbuftouio(uio, sb->sb_mb, len); SOCKBUF_LOCK(sb); if (error) goto out; } SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); /* * Remove the delivered data from the socket buffer unless we * were only peeking. */ if (!(flags & MSG_PEEK)) { if (len > 0) sbdrop_locked(sb, len); /* Notify protocol that we drained some data. */ if ((so->so_proto->pr_flags & PR_WANTRCVD) && (((flags & MSG_WAITALL) && uio->uio_resid > 0) || !(flags & MSG_SOCALLBCK))) { SOCKBUF_UNLOCK(sb); VNET_SO_ASSERT(so); (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags); SOCKBUF_LOCK(sb); } } /* * For MSG_WAITALL we may have to loop again and wait for * more data to come in. */ if ((flags & MSG_WAITALL) && uio->uio_resid > 0) goto restart; out: SOCKBUF_LOCK_ASSERT(sb); SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); SOCKBUF_UNLOCK(sb); sbunlock(sb); return (error); } /* * Optimized version of soreceive() for simple datagram cases from userspace. 
* Unlike in the stream case, we're able to drop a datagram if copyout() * fails, and because we handle datagrams atomically, we don't need to use a * sleep lock to prevent I/O interlacing. */ int soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { struct mbuf *m, *m2; int flags, error; ssize_t len; struct protosw *pr = so->so_proto; struct mbuf *nextrecord; if (psa != NULL) *psa = NULL; if (controlp != NULL) *controlp = NULL; if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; /* * For any complicated cases, fall back to the full * soreceive_generic(). */ if (mp0 != NULL || (flags & MSG_PEEK) || (flags & MSG_OOB)) return (soreceive_generic(so, psa, uio, mp0, controlp, flagsp)); /* * Enforce restrictions on use. */ KASSERT((pr->pr_flags & PR_WANTRCVD) == 0, ("soreceive_dgram: wantrcvd")); KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic")); KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0, ("soreceive_dgram: SBS_RCVATMARK")); KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0, ("soreceive_dgram: P_CONNREQUIRED")); /* * Loop blocking while waiting for a datagram. */ SOCKBUF_LOCK(&so->so_rcv); while ((m = so->so_rcv.sb_mb) == NULL) { KASSERT(sbavail(&so->so_rcv) == 0, ("soreceive_dgram: sb_mb NULL but sbavail %u", sbavail(&so->so_rcv))); if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_rcv); return (error); } if (so->so_rcv.sb_state & SBS_CANTRCVMORE || uio->uio_resid == 0) { SOCKBUF_UNLOCK(&so->so_rcv); return (0); } if ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO))) { SOCKBUF_UNLOCK(&so->so_rcv); return (EWOULDBLOCK); } SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); error = sbwait(&so->so_rcv); if (error) { SOCKBUF_UNLOCK(&so->so_rcv); return (error); } } SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); nextrecord = m->m_nextpkt; if (nextrecord == NULL) { KASSERT(so->so_rcv.sb_lastrecord == m, ("soreceive_dgram: lastrecord != m")); } KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord, ("soreceive_dgram: m_nextpkt != nextrecord")); /* * Pull 'm' and its chain off the front of the packet queue. */ so->so_rcv.sb_mb = NULL; sockbuf_pushsync(&so->so_rcv, nextrecord); /* * Walk 'm's chain and free that many bytes from the socket buffer. */ for (m2 = m; m2 != NULL; m2 = m2->m_next) sbfree(&so->so_rcv, m2); /* * Do a few last checks before we let go of the lock. */ SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); if (pr->pr_flags & PR_ADDR) { KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type)); if (psa != NULL) *psa = sodupsockaddr(mtod(m, struct sockaddr *), M_NOWAIT); m = m_free(m); } if (m == NULL) { /* XXXRW: Can this happen? */ return (0); } /* * Packet to copyout() is now in 'm' and it is disconnected from the * queue. * * Process one or more MT_CONTROL mbufs present before any data mbufs * in the first mbuf chain on the socket buffer. We call into the * protocol to perform externalization (or freeing if controlp == * NULL). In some cases there can be only MT_CONTROL mbufs without * MT_DATA mbufs. 
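 *
 * From userland such a record surfaces purely as ancillary data
 * (illustrative sketch, not part of this file; fd is an assumed
 * descriptor):
 *
 *	struct msghdr msg;
 *	char cbuf[CMSG_SPACE(sizeof(int))];
 *
 *	bzero(&msg, sizeof(msg));
 *	msg.msg_control = cbuf;
 *	msg.msg_controllen = sizeof(cbuf);
 *	recvmsg(fd, &msg, 0);	// may return e.g. SCM_RIGHTS and no data
 *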
*/ if (m->m_type == MT_CONTROL) { struct mbuf *cm = NULL, *cmn; struct mbuf **cme = &cm; do { m2 = m->m_next; m->m_next = NULL; *cme = m; cme = &(*cme)->m_next; m = m2; } while (m != NULL && m->m_type == MT_CONTROL); while (cm != NULL) { cmn = cm->m_next; cm->m_next = NULL; if (pr->pr_domain->dom_externalize != NULL) { error = (*pr->pr_domain->dom_externalize) (cm, controlp, flags); } else if (controlp != NULL) *controlp = cm; else m_freem(cm); if (controlp != NULL) { while (*controlp != NULL) controlp = &(*controlp)->m_next; } cm = cmn; } } KASSERT(m == NULL || m->m_type == MT_DATA, ("soreceive_dgram: !data")); while (m != NULL && uio->uio_resid > 0) { len = uio->uio_resid; if (len > m->m_len) len = m->m_len; error = uiomove(mtod(m, char *), (int)len, uio); if (error) { m_freem(m); return (error); } if (len == m->m_len) m = m_free(m); else { m->m_data += len; m->m_len -= len; } } if (m != NULL) { flags |= MSG_TRUNC; m_freem(m); } if (flagsp != NULL) *flagsp |= flags; return (0); } int soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { int error; CURVNET_SET(so->so_vnet); if (!SOLISTENING(so)) error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0, controlp, flagsp)); else error = ENOTCONN; CURVNET_RESTORE(); return (error); } int soshutdown(struct socket *so, int how) { struct protosw *pr = so->so_proto; int error, soerror_enotconn; if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR)) return (EINVAL); soerror_enotconn = 0; if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) { /* * POSIX mandates us to return ENOTCONN when shutdown(2) is * invoked on a datagram socket, however historically we would * actually tear the socket down. This is known to be leveraged * by some applications to unblock a process waiting in * recvXXX(2) by another process that it shares that socket * with. Try to meet both backward-compatibility and POSIX * requirements by forcing ENOTCONN but still asking the * protocol to perform pru_shutdown(). */ if (so->so_type != SOCK_DGRAM && !SOLISTENING(so)) return (ENOTCONN); soerror_enotconn = 1; } if (SOLISTENING(so)) { if (how != SHUT_WR) { SOLISTEN_LOCK(so); so->so_error = ECONNABORTED; solisten_wakeup(so); /* unlocks so */ } goto done; } CURVNET_SET(so->so_vnet); if (pr->pr_usrreqs->pru_flush != NULL) (*pr->pr_usrreqs->pru_flush)(so, how); if (how != SHUT_WR) sorflush(so); if (how != SHUT_RD) { error = (*pr->pr_usrreqs->pru_shutdown)(so); wakeup(&so->so_timeo); CURVNET_RESTORE(); return ((error == 0 && soerror_enotconn) ? ENOTCONN : error); } wakeup(&so->so_timeo); CURVNET_RESTORE(); done: return (soerror_enotconn ? ENOTCONN : 0); } void sorflush(struct socket *so) { struct sockbuf *sb = &so->so_rcv; struct protosw *pr = so->so_proto; struct socket aso; VNET_SO_ASSERT(so); /* * In order to avoid calling dom_dispose with the socket buffer mutex * held, and in order to generally avoid holding the lock for a long * time, we make a copy of the socket buffer and clear the original * (except locks, state). The new socket buffer copy won't have * initialized locks so we can only call routines that won't use or * assert those locks. * * Dislodge threads currently blocked in receive and wait to acquire * a lock against other simultaneous readers before clearing the * socket buffer. Don't let our acquire be interrupted by a signal * despite any existing socket disposition on interruptible waiting.
*/ socantrcvmore(so); (void) sblock(sb, SBL_WAIT | SBL_NOINTR); /* * Invalidate/clear most of the sockbuf structure, but leave selinfo * and mutex data unchanged. */ SOCKBUF_LOCK(sb); bzero(&aso, sizeof(aso)); aso.so_pcb = so->so_pcb; bcopy(&sb->sb_startzero, &aso.so_rcv.sb_startzero, sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); bzero(&sb->sb_startzero, sizeof(*sb) - offsetof(struct sockbuf, sb_startzero)); SOCKBUF_UNLOCK(sb); sbunlock(sb); /* * Dispose of special rights and flush the copied socket. Don't call * any unsafe routines (that rely on locks being initialized) on aso. */ if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) (*pr->pr_domain->dom_dispose)(&aso); sbrelease_internal(&aso.so_rcv, so); } /* * Wrapper for Socket established helper hook. * Parameters: socket, context of the hook point, hook id. */ static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id) { struct socket_hhook_data hhook_data = { .so = so, .hctx = hctx, .m = NULL, .status = 0 }; CURVNET_SET(so->so_vnet); HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd); CURVNET_RESTORE(); /* Ugly but needed, since hhooks return void for now */ return (hhook_data.status); } /* * Perhaps this routine, and sooptcopyout(), below, ought to come in an * additional variant to handle the case where the option value needs to be * some kind of integer, but not a specific size. In addition to their use * here, these functions are also called by the protocol-level pr_ctloutput() * routines. */ int sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) { size_t valsize; /* * If the user gives us more than we wanted, we ignore it, but if we * don't get the minimum length the caller wants, we return EINVAL. * On success, sopt->sopt_valsize is set to however much we actually * retrieved. */ if ((valsize = sopt->sopt_valsize) < minlen) return EINVAL; if (valsize > len) sopt->sopt_valsize = valsize = len; if (sopt->sopt_td != NULL) return (copyin(sopt->sopt_val, buf, valsize)); bcopy(sopt->sopt_val, buf, valsize); return (0); } /* * Kernel version of setsockopt(2). 
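 *
 * A minimal in-kernel use might look like this (sketch only; 'so' and
 * 'error' are assumed to exist, error handling elided):
 *
 *	int on = 1;
 *
 *	error = so_setsockopt(so, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
 *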
* * XXX: optlen is size_t, not socklen_t */ int so_setsockopt(struct socket *so, int level, int optname, void *optval, size_t optlen) { struct sockopt sopt; sopt.sopt_level = level; sopt.sopt_name = optname; sopt.sopt_dir = SOPT_SET; sopt.sopt_val = optval; sopt.sopt_valsize = optlen; sopt.sopt_td = NULL; return (sosetopt(so, &sopt)); } int sosetopt(struct socket *so, struct sockopt *sopt) { int error, optval; struct linger l; struct timeval tv; sbintime_t val; uint32_t val32; #ifdef MAC struct mac extmac; #endif CURVNET_SET(so->so_vnet); error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto->pr_ctloutput != NULL) error = (*so->so_proto->pr_ctloutput)(so, sopt); else error = ENOPROTOOPT; } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: error = accept_filt_setopt(so, sopt); if (error) goto bad; break; case SO_LINGER: error = sooptcopyin(sopt, &l, sizeof l, sizeof l); if (error) goto bad; if (l.l_linger < 0 || l.l_linger > USHRT_MAX || l.l_linger > (INT_MAX / hz)) { error = EDOM; goto bad; } SOCK_LOCK(so); so->so_linger = l.l_linger; if (l.l_onoff) so->so_options |= SO_LINGER; else so->so_options &= ~SO_LINGER; SOCK_UNLOCK(so); break; case SO_DEBUG: case SO_KEEPALIVE: case SO_DONTROUTE: case SO_USELOOPBACK: case SO_BROADCAST: case SO_REUSEADDR: case SO_REUSEPORT: case SO_REUSEPORT_LB: case SO_OOBINLINE: case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: case SO_NO_DDP: case SO_NO_OFFLOAD: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; SOCK_LOCK(so); if (optval) so->so_options |= sopt->sopt_name; else so->so_options &= ~sopt->sopt_name; SOCK_UNLOCK(so); break; case SO_SETFIB: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; if (optval < 0 || optval >= rt_numfibs) { error = EINVAL; goto bad; } if (((so->so_proto->pr_domain->dom_family == PF_INET) || (so->so_proto->pr_domain->dom_family == PF_INET6) || (so->so_proto->pr_domain->dom_family == PF_ROUTE))) so->so_fibnum = optval; else so->so_fibnum = 0; break; case SO_USER_COOKIE: error = sooptcopyin(sopt, &val32, sizeof val32, sizeof val32); if (error) goto bad; so->so_user_cookie = val32; break; case SO_SNDBUF: case SO_RCVBUF: case SO_SNDLOWAT: case SO_RCVLOWAT: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; /* * Values < 1 make no sense for any of these options, * so disallow them. 
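 *
 * E.g., from userland (illustrative sketch; fd is an assumed
 * descriptor), this is rejected with EINVAL:
 *
 *	int sz = 0;
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &sz, sizeof(sz));
 *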
*/ if (optval < 1) { error = EINVAL; goto bad; } error = sbsetopt(so, sopt->sopt_name, optval); break; case SO_SNDTIMEO: case SO_RCVTIMEO: #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { struct timeval32 tv32; error = sooptcopyin(sopt, &tv32, sizeof tv32, sizeof tv32); CP(tv32, tv, tv_sec); CP(tv32, tv, tv_usec); } else #endif error = sooptcopyin(sopt, &tv, sizeof tv, sizeof tv); if (error) goto bad; if (tv.tv_sec < 0 || tv.tv_usec < 0 || tv.tv_usec >= 1000000) { error = EDOM; goto bad; } if (tv.tv_sec > INT32_MAX) val = SBT_MAX; else val = tvtosbt(tv); switch (sopt->sopt_name) { case SO_SNDTIMEO: so->so_snd.sb_timeo = val; break; case SO_RCVTIMEO: so->so_rcv.sb_timeo = val; break; } break; case SO_LABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof extmac, sizeof extmac); if (error) goto bad; error = mac_setsockopt_label(sopt->sopt_td->td_ucred, so, &extmac); #else error = EOPNOTSUPP; #endif break; case SO_TS_CLOCK: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) goto bad; if (optval < 0 || optval > SO_TS_CLOCK_MAX) { error = EINVAL; goto bad; } so->so_ts_clock = optval; break; case SO_MAX_PACING_RATE: error = sooptcopyin(sopt, &val32, sizeof(val32), sizeof(val32)); if (error) goto bad; so->so_max_pacing_rate = val32; break; default: if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) error = hhook_run_socket(so, sopt, HHOOK_SOCKET_OPT); else error = ENOPROTOOPT; break; } if (error == 0 && so->so_proto->pr_ctloutput != NULL) (void)(*so->so_proto->pr_ctloutput)(so, sopt); } bad: CURVNET_RESTORE(); return (error); } /* * Helper routine for getsockopt. */ int sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) { int error; size_t valsize; error = 0; /* * Documented get behavior is that we always return a value, possibly * truncated to fit in the user's buffer. Traditional behavior is * that we always tell the user precisely how much we copied, rather * than something useful like the total amount we had available for * her. Note that this interface is not idempotent; the entire * answer must be generated ahead of time. 
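 *
 * The userland consequence (illustrative sketch): getsockopt(2) with a
 * buffer that is too small succeeds and reports back the truncated
 * size rather than the full size:
 *
 *	char small[2];
 *	socklen_t optlen = sizeof(small);
 *
 *	getsockopt(fd, SOL_SOCKET, SO_LINGER, small, &optlen);
 *	// optlen is now 2, not sizeof(struct linger)
 *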
*/ valsize = min(len, sopt->sopt_valsize); sopt->sopt_valsize = valsize; if (sopt->sopt_val != NULL) { if (sopt->sopt_td != NULL) error = copyout(buf, sopt->sopt_val, valsize); else bcopy(buf, sopt->sopt_val, valsize); } return (error); } int sogetopt(struct socket *so, struct sockopt *sopt) { int error, optval; struct linger l; struct timeval tv; #ifdef MAC struct mac extmac; #endif CURVNET_SET(so->so_vnet); error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto->pr_ctloutput != NULL) error = (*so->so_proto->pr_ctloutput)(so, sopt); else error = ENOPROTOOPT; CURVNET_RESTORE(); return (error); } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: error = accept_filt_getopt(so, sopt); break; case SO_LINGER: SOCK_LOCK(so); l.l_onoff = so->so_options & SO_LINGER; l.l_linger = so->so_linger; SOCK_UNLOCK(so); error = sooptcopyout(sopt, &l, sizeof l); break; case SO_USELOOPBACK: case SO_DONTROUTE: case SO_DEBUG: case SO_KEEPALIVE: case SO_REUSEADDR: case SO_REUSEPORT: case SO_REUSEPORT_LB: case SO_BROADCAST: case SO_OOBINLINE: case SO_ACCEPTCONN: case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: case SO_NO_DDP: case SO_NO_OFFLOAD: optval = so->so_options & sopt->sopt_name; integer: error = sooptcopyout(sopt, &optval, sizeof optval); break; case SO_DOMAIN: optval = so->so_proto->pr_domain->dom_family; goto integer; case SO_TYPE: optval = so->so_type; goto integer; case SO_PROTOCOL: optval = so->so_proto->pr_protocol; goto integer; case SO_ERROR: SOCK_LOCK(so); optval = so->so_error; so->so_error = 0; SOCK_UNLOCK(so); goto integer; case SO_SNDBUF: optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat : so->so_snd.sb_hiwat; goto integer; case SO_RCVBUF: optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat : so->so_rcv.sb_hiwat; goto integer; case SO_SNDLOWAT: optval = SOLISTENING(so) ? so->sol_sbsnd_lowat : so->so_snd.sb_lowat; goto integer; case SO_RCVLOWAT: optval = SOLISTENING(so) ? so->sol_sbrcv_lowat : so->so_rcv.sb_lowat; goto integer; case SO_SNDTIMEO: case SO_RCVTIMEO: tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ? so->so_snd.sb_timeo : so->so_rcv.sb_timeo); #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { struct timeval32 tv32; CP(tv, tv32, tv_sec); CP(tv, tv32, tv_usec); error = sooptcopyout(sopt, &tv32, sizeof tv32); } else #endif error = sooptcopyout(sopt, &tv, sizeof tv); break; case SO_LABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof(extmac), sizeof(extmac)); if (error) goto bad; error = mac_getsockopt_label(sopt->sopt_td->td_ucred, so, &extmac); if (error) goto bad; error = sooptcopyout(sopt, &extmac, sizeof extmac); #else error = EOPNOTSUPP; #endif break; case SO_PEERLABEL: #ifdef MAC error = sooptcopyin(sopt, &extmac, sizeof(extmac), sizeof(extmac)); if (error) goto bad; error = mac_getsockopt_peerlabel( sopt->sopt_td->td_ucred, so, &extmac); if (error) goto bad; error = sooptcopyout(sopt, &extmac, sizeof extmac); #else error = EOPNOTSUPP; #endif break; case SO_LISTENQLIMIT: optval = SOLISTENING(so) ? so->sol_qlimit : 0; goto integer; case SO_LISTENQLEN: optval = SOLISTENING(so) ? so->sol_qlen : 0; goto integer; case SO_LISTENINCQLEN: optval = SOLISTENING(so) ? 
so->sol_incqlen : 0; goto integer; case SO_TS_CLOCK: optval = so->so_ts_clock; goto integer; case SO_MAX_PACING_RATE: optval = so->so_max_pacing_rate; goto integer; default: if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0) error = hhook_run_socket(so, sopt, HHOOK_SOCKET_OPT); else error = ENOPROTOOPT; break; } } #ifdef MAC bad: #endif CURVNET_RESTORE(); return (error); } int soopt_getm(struct sockopt *sopt, struct mbuf **mp) { struct mbuf *m, *m_prev; int sopt_size = sopt->sopt_valsize; MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) return ENOBUFS; if (sopt_size > MLEN) { MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); return ENOBUFS; } m->m_len = min(MCLBYTES, sopt_size); } else { m->m_len = min(MLEN, sopt_size); } sopt_size -= m->m_len; *mp = m; m_prev = m; while (sopt_size) { MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) { m_freem(*mp); return ENOBUFS; } if (sopt_size > MLEN) { MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK : M_NOWAIT); if ((m->m_flags & M_EXT) == 0) { m_freem(m); m_freem(*mp); return ENOBUFS; } m->m_len = min(MCLBYTES, sopt_size); } else { m->m_len = min(MLEN, sopt_size); } sopt_size -= m->m_len; m_prev->m_next = m; m_prev = m; } return (0); } int soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) { struct mbuf *m0 = m; if (sopt->sopt_val == NULL) return (0); while (m != NULL && sopt->sopt_valsize >= m->m_len) { if (sopt->sopt_td != NULL) { int error; error = copyin(sopt->sopt_val, mtod(m, char *), m->m_len); if (error != 0) { m_freem(m0); return(error); } } else bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); sopt->sopt_valsize -= m->m_len; sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; m = m->m_next; } if (m != NULL) /* should have been allocated with enough space by ip6_sooptmcopyin() */ panic("ip6_sooptmcopyin"); return (0); } int soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) { struct mbuf *m0 = m; size_t valsize = 0; if (sopt->sopt_val == NULL) return (0); while (m != NULL && sopt->sopt_valsize >= m->m_len) { if (sopt->sopt_td != NULL) { int error; error = copyout(mtod(m, char *), sopt->sopt_val, m->m_len); if (error != 0) { m_freem(m0); return(error); } } else bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); sopt->sopt_valsize -= m->m_len; sopt->sopt_val = (char *)sopt->sopt_val + m->m_len; valsize += m->m_len; m = m->m_next; } if (m != NULL) { /* a large enough soopt buffer should be given from userland */ m_freem(m0); return(EINVAL); } sopt->sopt_valsize = valsize; return (0); } /* * sohasoutofband(): protocol notifies socket layer of the arrival of new * out-of-band data, which will then notify socket consumers. */ void sohasoutofband(struct socket *so) { if (so->so_sigio != NULL) pgsigio(&so->so_sigio, SIGURG, 0); selwakeuppri(&so->so_rdsel, PSOCK); } int sopoll(struct socket *so, int events, struct ucred *active_cred, struct thread *td) { /* * We do not need to set or assert curvnet as long as everyone uses * sopoll_generic().
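 *
 * For reference, the userland-visible behavior implemented by
 * sopoll_generic() below (illustrative sketch; fd is an assumed
 * descriptor): once both directions have been shut down, POLLHUP is
 * reported alongside readability:
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *
 *	poll(&pfd, 1, -1);
 *	// pfd.revents may contain POLLIN | POLLHUP after peer close
 *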
*/ return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred, td)); } int sopoll_generic(struct socket *so, int events, struct ucred *active_cred, struct thread *td) { int revents; SOCK_LOCK(so); if (SOLISTENING(so)) { if (!(events & (POLLIN | POLLRDNORM))) revents = 0; else if (!TAILQ_EMPTY(&so->sol_comp)) revents = events & (POLLIN | POLLRDNORM); else if ((events & POLLINIGNEOF) == 0 && so->so_error) revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP; else { selrecord(td, &so->so_rdsel); revents = 0; } } else { revents = 0; SOCKBUF_LOCK(&so->so_snd); SOCKBUF_LOCK(&so->so_rcv); if (events & (POLLIN | POLLRDNORM)) if (soreadabledata(so)) revents |= events & (POLLIN | POLLRDNORM); if (events & (POLLOUT | POLLWRNORM)) if (sowriteable(so)) revents |= events & (POLLOUT | POLLWRNORM); if (events & (POLLPRI | POLLRDBAND)) if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK)) revents |= events & (POLLPRI | POLLRDBAND); if ((events & POLLINIGNEOF) == 0) { if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { revents |= events & (POLLIN | POLLRDNORM); if (so->so_snd.sb_state & SBS_CANTSENDMORE) revents |= POLLHUP; } } if (revents == 0) { if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { selrecord(td, &so->so_rdsel); so->so_rcv.sb_flags |= SB_SEL; } if (events & (POLLOUT | POLLWRNORM)) { selrecord(td, &so->so_wrsel); so->so_snd.sb_flags |= SB_SEL; } } SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_snd); } SOCK_UNLOCK(so); return (revents); } int soo_kqfilter(struct file *fp, struct knote *kn) { struct socket *so = kn->kn_fp->f_data; struct sockbuf *sb; struct knlist *knl; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &soread_filtops; knl = &so->so_rdsel.si_note; sb = &so->so_rcv; break; case EVFILT_WRITE: kn->kn_fop = &sowrite_filtops; knl = &so->so_wrsel.si_note; sb = &so->so_snd; break; case EVFILT_EMPTY: kn->kn_fop = &soempty_filtops; knl = &so->so_wrsel.si_note; sb = &so->so_snd; break; default: return (EINVAL); } SOCK_LOCK(so); if (SOLISTENING(so)) { knlist_add(knl, kn, 1); } else { SOCKBUF_LOCK(sb); knlist_add(knl, kn, 1); sb->sb_flags |= SB_KNOTE; SOCKBUF_UNLOCK(sb); } SOCK_UNLOCK(so); return (0); } /* * Some routines that return EOPNOTSUPP for entry points that are not * supported by a protocol. Fill in as needed. 
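 *
 * A protocol that does not implement a given entry point can wire the
 * matching stub into its pr_usrreqs (hypothetical fragment; the foo_*
 * names are invented):
 *
 *	static struct pr_usrreqs foo_usrreqs = {
 *		.pru_attach =	foo_attach,
 *		.pru_send =	foo_send,
 *		.pru_rcvoob =	pru_rcvoob_notsupp,
 *		.pru_connect2 =	pru_connect2_notsupp,
 *	};
 *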
*/ int pru_accept_notsupp(struct socket *so, struct sockaddr **nam) { return EOPNOTSUPP; } int pru_aio_queue_notsupp(struct socket *so, struct kaiocb *job) { return EOPNOTSUPP; } int pru_attach_notsupp(struct socket *so, int proto, struct thread *td) { return EOPNOTSUPP; } int pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_bindat_notsupp(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_connectat_notsupp(int fd, struct socket *so, struct sockaddr *nam, struct thread *td) { return EOPNOTSUPP; } int pru_connect2_notsupp(struct socket *so1, struct socket *so2) { return EOPNOTSUPP; } int pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { return EOPNOTSUPP; } int pru_disconnect_notsupp(struct socket *so) { return EOPNOTSUPP; } int pru_listen_notsupp(struct socket *so, int backlog, struct thread *td) { return EOPNOTSUPP; } int pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam) { return EOPNOTSUPP; } int pru_rcvd_notsupp(struct socket *so, int flags) { return EOPNOTSUPP; } int pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags) { return EOPNOTSUPP; } int pru_send_notsupp(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { return EOPNOTSUPP; } int pru_ready_notsupp(struct socket *so, struct mbuf *m, int count) { return (EOPNOTSUPP); } /* * This isn't really a ``null'' operation, but it's the default one and * doesn't do anything destructive. */ int pru_sense_null(struct socket *so, struct stat *sb) { sb->st_blksize = so->so_snd.sb_hiwat; return 0; } int pru_shutdown_notsupp(struct socket *so) { return EOPNOTSUPP; } int pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam) { return EOPNOTSUPP; } int pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { return EOPNOTSUPP; } int pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { return EOPNOTSUPP; } int pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred, struct thread *td) { return EOPNOTSUPP; } static void filt_sordetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; so_rdknl_lock(so); knlist_remove(&so->so_rdsel.si_note, kn, 1); if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note)) so->so_rcv.sb_flags &= ~SB_KNOTE; so_rdknl_unlock(so); } /*ARGSUSED*/ static int filt_soread(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) { SOCK_LOCK_ASSERT(so); kn->kn_data = so->sol_qlen; if (so->so_error) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } return (!TAILQ_EMPTY(&so->sol_comp)); } SOCKBUF_LOCK_ASSERT(&so->so_rcv); kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl; if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } else if (so->so_error) /* temporary udp error */ return (1); if (kn->kn_sfflags & NOTE_LOWAT) { if (kn->kn_data >= kn->kn_sdata) return (1); } else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat) return (1); /* This hook returning non-zero indicates an event, not error */ return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD)); } 
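/*
 * Userland counterpart of filt_soread() above (an illustrative sketch,
 * not part of this file; kq and fd are assumed to exist): registering
 * EVFILT_READ with NOTE_LOWAT makes the event fire only once at least
 * that many bytes are buffered, mirroring the kn_sdata check above.
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, NOTE_LOWAT, 4096, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 */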
static void filt_sowdetach(struct knote *kn) { struct socket *so = kn->kn_fp->f_data; so_wrknl_lock(so); knlist_remove(&so->so_wrsel.si_note, kn, 1); if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note)) so->so_snd.sb_flags &= ~SB_KNOTE; so_wrknl_unlock(so); } /*ARGSUSED*/ static int filt_sowrite(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) return (0); SOCKBUF_LOCK_ASSERT(&so->so_snd); kn->kn_data = sbspace(&so->so_snd); hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; return (1); } else if (so->so_error) /* temporary udp error */ return (1); else if (((so->so_state & SS_ISCONNECTED) == 0) && (so->so_proto->pr_flags & PR_CONNREQUIRED)) return (0); else if (kn->kn_sfflags & NOTE_LOWAT) return (kn->kn_data >= kn->kn_sdata); else return (kn->kn_data >= so->so_snd.sb_lowat); } static int filt_soempty(struct knote *kn, long hint) { struct socket *so; so = kn->kn_fp->f_data; if (SOLISTENING(so)) return (1); SOCKBUF_LOCK_ASSERT(&so->so_snd); kn->kn_data = sbused(&so->so_snd); if (kn->kn_data == 0) return (1); else return (0); } int socheckuid(struct socket *so, uid_t uid) { if (so == NULL) return (EPERM); if (so->so_cred->cr_uid != uid) return (EPERM); return (0); } /* * These functions are used by protocols to notify the socket layer (and its * consumers) of state changes in the sockets driven by protocol-side events. */ /* * Procedures to manipulate state flags of socket and do appropriate wakeups. * * Normal sequence from the active (originating) side is that * soisconnecting() is called during processing of connect() call, resulting * in an eventual call to soisconnected() if/when the connection is * established. When the connection is torn down soisdisconnecting() is * called during processing of disconnect() call, and soisdisconnected() is * called when the connection to the peer is totally severed. The semantics * of these routines are such that connectionless protocols can call * soisconnected() and soisdisconnected() only, bypassing the in-progress * calls when setting up a ``connection'' takes no time. * * From the passive side, a socket is created with two queues of sockets: * so_incomp for connections in progress and so_comp for connections already * made and awaiting user acceptance. As a protocol is preparing incoming * connections, it creates a socket structure queued on so_incomp by calling * sonewconn(). When the connection is established, soisconnected() is * called, and transfers the socket structure to so_comp, making it available * to accept(). * * If a socket is closed with sockets on either so_incomp or so_comp, these * sockets are dropped. * * If higher-level protocols are implemented in the kernel, the wakeups done * here will sometimes cause software-interrupt process scheduling. */ void soisconnecting(struct socket *so) { SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISCONNECTING; SOCK_UNLOCK(so); } void soisconnected(struct socket *so) { SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); so->so_state |= SS_ISCONNECTED; if (so->so_qstate == SQ_INCOMP) { struct socket *head = so->so_listen; int ret; KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so)); /* * Promoting a socket from incomplete queue to complete, we * need to go through reverse order of locking. 
We first do * trylock, and if that doesn't succeed, we go the hard way * leaving a reference and rechecking consistency after proper * locking. */ if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) { soref(head); SOCK_UNLOCK(so); SOLISTEN_LOCK(head); SOCK_LOCK(so); if (__predict_false(head != so->so_listen)) { /* * The socket went off the listen queue, * should be lost race to close(2) of sol. * The socket is about to soabort(). */ SOCK_UNLOCK(so); sorele(head); return; } /* Not the last one, as so holds a ref. */ refcount_release(&head->so_count); } again: if ((so->so_options & SO_ACCEPTFILTER) == 0) { TAILQ_REMOVE(&head->sol_incomp, so, so_list); head->sol_incqlen--; TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); head->sol_qlen++; so->so_qstate = SQ_COMP; SOCK_UNLOCK(so); solisten_wakeup(head); /* unlocks */ } else { SOCKBUF_LOCK(&so->so_rcv); soupcall_set(so, SO_RCV, head->sol_accept_filter->accf_callback, head->sol_accept_filter_arg); so->so_options &= ~SO_ACCEPTFILTER; ret = head->sol_accept_filter->accf_callback(so, head->sol_accept_filter_arg, M_NOWAIT); if (ret == SU_ISCONNECTED) { soupcall_clear(so, SO_RCV); SOCKBUF_UNLOCK(&so->so_rcv); goto again; } SOCKBUF_UNLOCK(&so->so_rcv); SOCK_UNLOCK(so); SOLISTEN_UNLOCK(head); } return; } SOCK_UNLOCK(so); wakeup(&so->so_timeo); sorwakeup(so); sowwakeup(so); } void soisdisconnecting(struct socket *so) { SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTING; so->so_state |= SS_ISDISCONNECTING; if (!SOLISTENING(so)) { SOCKBUF_LOCK(&so->so_rcv); socantrcvmore_locked(so); SOCKBUF_LOCK(&so->so_snd); socantsendmore_locked(so); } SOCK_UNLOCK(so); wakeup(&so->so_timeo); } void soisdisconnected(struct socket *so) { SOCK_LOCK(so); /* * There is at least one reader of so_state that does not * acquire socket lock, namely soreceive_generic(). Ensure * that it never sees all flags that track connection status * cleared, by ordering the update with a barrier semantic of * our release thread fence. */ so->so_state |= SS_ISDISCONNECTED; atomic_thread_fence_rel(); so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); if (!SOLISTENING(so)) { SOCK_UNLOCK(so); SOCKBUF_LOCK(&so->so_rcv); socantrcvmore_locked(so); SOCKBUF_LOCK(&so->so_snd); sbdrop_locked(&so->so_snd, sbused(&so->so_snd)); socantsendmore_locked(so); } else SOCK_UNLOCK(so); wakeup(&so->so_timeo); } /* * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. */ struct sockaddr * sodupsockaddr(const struct sockaddr *sa, int mflags) { struct sockaddr *sa2; sa2 = malloc(sa->sa_len, M_SONAME, mflags); if (sa2) bcopy(sa, sa2, sa->sa_len); return sa2; } /* * Register per-socket destructor. */ void sodtor_set(struct socket *so, so_dtor_t *func) { SOCK_LOCK_ASSERT(so); so->so_dtor = func; } /* * Register per-socket buffer upcalls. 
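* Once registered, the upcall is invoked from the sowakeup() path whenever
* the buffer sees activity, for as long as SB_UPCALL stays set on it.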
*/ void soupcall_set(struct socket *so, int which, so_upcall_t func, void *arg) { struct sockbuf *sb; KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); switch (which) { case SO_RCV: sb = &so->so_rcv; break; case SO_SND: sb = &so->so_snd; break; default: panic("soupcall_set: bad which"); } SOCKBUF_LOCK_ASSERT(sb); sb->sb_upcall = func; sb->sb_upcallarg = arg; sb->sb_flags |= SB_UPCALL; } void soupcall_clear(struct socket *so, int which) { struct sockbuf *sb; KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); switch (which) { case SO_RCV: sb = &so->so_rcv; break; case SO_SND: sb = &so->so_snd; break; default: panic("soupcall_clear: bad which"); } SOCKBUF_LOCK_ASSERT(sb); KASSERT(sb->sb_upcall != NULL, ("%s: so %p no upcall to clear", __func__, so)); sb->sb_upcall = NULL; sb->sb_upcallarg = NULL; sb->sb_flags &= ~SB_UPCALL; } void solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg) { SOLISTEN_LOCK_ASSERT(so); so->sol_upcall = func; so->sol_upcallarg = arg; } static void so_rdknl_lock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_LOCK(so); else SOCKBUF_LOCK(&so->so_rcv); } static void so_rdknl_unlock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_UNLOCK(so); else SOCKBUF_UNLOCK(&so->so_rcv); } static void so_rdknl_assert_lock(void *arg, int what) { struct socket *so = arg; if (what == LA_LOCKED) { if (SOLISTENING(so)) SOCK_LOCK_ASSERT(so); else SOCKBUF_LOCK_ASSERT(&so->so_rcv); } else { if (SOLISTENING(so)) SOCK_UNLOCK_ASSERT(so); else SOCKBUF_UNLOCK_ASSERT(&so->so_rcv); } } static void so_wrknl_lock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_LOCK(so); else SOCKBUF_LOCK(&so->so_snd); } static void so_wrknl_unlock(void *arg) { struct socket *so = arg; if (SOLISTENING(so)) SOCK_UNLOCK(so); else SOCKBUF_UNLOCK(&so->so_snd); } static void so_wrknl_assert_lock(void *arg, int what) { struct socket *so = arg; if (what == LA_LOCKED) { if (SOLISTENING(so)) SOCK_LOCK_ASSERT(so); else SOCKBUF_LOCK_ASSERT(&so->so_snd); } else { if (SOLISTENING(so)) SOCK_UNLOCK_ASSERT(so); else SOCKBUF_UNLOCK_ASSERT(&so->so_snd); } } /* * Create an external-format (``xsocket'') structure using the information in * the kernel-format socket structure pointed to by so. This is done to * reduce the spew of irrelevant information over this interface, to isolate * user code from changes in the kernel structure, and potentially to provide * information-hiding if we decide that some of this information should be * hidden from users. */ void sotoxsocket(struct socket *so, struct xsocket *xso) { bzero(xso, sizeof(*xso)); xso->xso_len = sizeof *xso; xso->xso_so = (uintptr_t)so; xso->so_type = so->so_type; xso->so_options = so->so_options; xso->so_linger = so->so_linger; xso->so_state = so->so_state; xso->so_pcb = (uintptr_t)so->so_pcb; xso->xso_protocol = so->so_proto->pr_protocol; xso->xso_family = so->so_proto->pr_domain->dom_family; xso->so_timeo = so->so_timeo; xso->so_error = so->so_error; xso->so_uid = so->so_cred->cr_uid; xso->so_pgid = so->so_sigio ? 
so->so_sigio->sio_pgid : 0; if (SOLISTENING(so)) { xso->so_qlen = so->sol_qlen; xso->so_incqlen = so->sol_incqlen; xso->so_qlimit = so->sol_qlimit; xso->so_oobmark = 0; } else { xso->so_state |= so->so_qstate; xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0; xso->so_oobmark = so->so_oobmark; sbtoxsockbuf(&so->so_snd, &xso->so_snd); sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); } } struct sockbuf * so_sockbuf_rcv(struct socket *so) { return (&so->so_rcv); } struct sockbuf * so_sockbuf_snd(struct socket *so) { return (&so->so_snd); } int so_state_get(const struct socket *so) { return (so->so_state); } void so_state_set(struct socket *so, int val) { so->so_state = val; } int so_options_get(const struct socket *so) { return (so->so_options); } void so_options_set(struct socket *so, int val) { so->so_options = val; } int so_error_get(const struct socket *so) { return (so->so_error); } void so_error_set(struct socket *so, int val) { so->so_error = val; } int so_linger_get(const struct socket *so) { return (so->so_linger); } void so_linger_set(struct socket *so, int val) { KASSERT(val >= 0 && val <= USHRT_MAX && val <= (INT_MAX / hz), ("%s: val %d out of range", __func__, val)); so->so_linger = val; } struct protosw * so_protosw_get(const struct socket *so) { return (so->so_proto); } void so_protosw_set(struct socket *so, struct protosw *val) { so->so_proto = val; } void so_sorwakeup(struct socket *so) { sorwakeup(so); } void so_sowwakeup(struct socket *so) { sowwakeup(so); } void so_sorwakeup_locked(struct socket *so) { sorwakeup_locked(so); } void so_sowwakeup_locked(struct socket *so) { sowwakeup_locked(so); } void so_lock(struct socket *so) { SOCK_LOCK(so); } void so_unlock(struct socket *so) { SOCK_UNLOCK(so); } diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c index b9c2630561cb..23fae343924a 100644 --- a/sys/kern/uipc_syscalls.c +++ b/sys/kern/uipc_syscalls.c @@ -1,1630 +1,1632 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #ifdef COMPAT_FREEBSD32 #include #endif #include #include #include static int sendit(struct thread *td, int s, struct msghdr *mp, int flags); static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp); static int accept1(struct thread *td, int s, struct sockaddr *uname, socklen_t *anamelen, int flags); static int getsockname1(struct thread *td, struct getsockname_args *uap, int compat); static int getpeername1(struct thread *td, struct getpeername_args *uap, int compat); static int sockargs(struct mbuf **, char *, socklen_t, int); /* * Convert a user file descriptor to a kernel file entry and check if required * capability rights are present. * If required copy of current set of capability rights is returned. * A reference on the file entry is held upon returning. */ int getsock_cap(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp, u_int *fflagp, struct filecaps *havecapsp) { struct file *fp; int error; error = fget_cap(td, fd, rightsp, &fp, havecapsp); if (error != 0) return (error); if (fp->f_type != DTYPE_SOCKET) { fdrop(fp, td); if (havecapsp != NULL) filecaps_free(havecapsp); return (ENOTSOCK); } if (fflagp != NULL) *fflagp = fp->f_flag; *fpp = fp; return (0); } /* * System call interface to the socket abstraction. */ #if defined(COMPAT_43) #define COMPAT_OLDSOCK #endif int sys_socket(struct thread *td, struct socket_args *uap) { return (kern_socket(td, uap->domain, uap->type, uap->protocol)); } int kern_socket(struct thread *td, int domain, int type, int protocol) { struct socket *so; struct file *fp; int fd, error, oflag, fflag; AUDIT_ARG_SOCKET(domain, type, protocol); oflag = 0; fflag = 0; if ((type & SOCK_CLOEXEC) != 0) { type &= ~SOCK_CLOEXEC; oflag |= O_CLOEXEC; } if ((type & SOCK_NONBLOCK) != 0) { type &= ~SOCK_NONBLOCK; fflag |= FNONBLOCK; } #ifdef MAC error = mac_socket_check_create(td->td_ucred, domain, type, protocol); if (error != 0) return (error); #endif error = falloc(td, &fp, &fd, oflag); if (error != 0) return (error); /* An extra reference on `fp' has been held for us by falloc(). 
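* It keeps `fp' alive across socreate(); on failure fdclose() backs the new
* descriptor out of the file table, and in both cases the extra hold is
* released by the fdrop() below.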
*/ error = socreate(domain, &so, type, protocol, td->td_ucred, td); if (error != 0) { fdclose(td, fp, fd); } else { finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops); if ((fflag & FNONBLOCK) != 0) (void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td); td->td_retval[0] = fd; } fdrop(fp, td); return (error); } int sys_bind(struct thread *td, struct bind_args *uap) { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_bindat(td, AT_FDCWD, uap->s, sa); free(sa, M_SONAME); } return (error); } int kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) { struct socket *so; struct file *fp; int error; #ifdef CAPABILITY_MODE if (IN_CAPABILITY_MODE(td) && (dirfd == AT_FDCWD)) return (ECAPMODE); #endif AUDIT_ARG_FD(fd); AUDIT_ARG_SOCKADDR(td, dirfd, sa); error = getsock_cap(td, fd, &cap_bind_rights, &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif #ifdef MAC error = mac_socket_check_bind(td->td_ucred, so, sa); if (error == 0) { #endif if (dirfd == AT_FDCWD) error = sobind(so, sa, td); else error = sobindat(dirfd, so, sa, td); #ifdef MAC } #endif fdrop(fp, td); return (error); } int sys_bindat(struct thread *td, struct bindat_args *uap) { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_bindat(td, uap->fd, uap->s, sa); free(sa, M_SONAME); } return (error); } int sys_listen(struct thread *td, struct listen_args *uap) { return (kern_listen(td, uap->s, uap->backlog)); } int kern_listen(struct thread *td, int s, int backlog) { struct socket *so; struct file *fp; int error; AUDIT_ARG_FD(s); error = getsock_cap(td, s, &cap_listen_rights, &fp, NULL, NULL); if (error == 0) { so = fp->f_data; #ifdef MAC error = mac_socket_check_listen(td->td_ucred, so); if (error == 0) #endif error = solisten(so, backlog, td); fdrop(fp, td); } return (error); } /* * accept1() */ static int accept1(td, s, uname, anamelen, flags) struct thread *td; int s; struct sockaddr *uname; socklen_t *anamelen; int flags; { struct sockaddr *name; socklen_t namelen; struct file *fp; int error; if (uname == NULL) return (kern_accept4(td, s, NULL, NULL, flags, NULL)); error = copyin(anamelen, &namelen, sizeof (namelen)); if (error != 0) return (error); error = kern_accept4(td, s, &name, &namelen, flags, &fp); if (error != 0) return (error); if (error == 0 && uname != NULL) { #ifdef COMPAT_OLDSOCK if (SV_PROC_FLAG(td->td_proc, SV_AOUT) && (flags & ACCEPT4_COMPAT) != 0) ((struct osockaddr *)name)->sa_family = name->sa_family; #endif error = copyout(name, uname, namelen); } if (error == 0) error = copyout(&namelen, anamelen, sizeof(namelen)); if (error != 0) fdclose(td, fp, td->td_retval[0]); fdrop(fp, td); free(name, M_SONAME); return (error); } int kern_accept(struct thread *td, int s, struct sockaddr **name, socklen_t *namelen, struct file **fp) { return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp)); } int kern_accept4(struct thread *td, int s, struct sockaddr **name, socklen_t *namelen, int flags, struct file **fp) { struct file *headfp, *nfp = NULL; struct sockaddr *sa = NULL; struct socket *head, *so; struct filecaps fcaps; u_int fflag; pid_t pgid; int error, fd, tmp; if (name != NULL) *name = NULL; AUDIT_ARG_FD(s); error = getsock_cap(td, s, &cap_accept_rights, &headfp, &fflag, &fcaps); if (error != 0) return (error); head = headfp->f_data; if ((head->so_options & SO_ACCEPTCONN) == 0) { 
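/*
 * accept(2) on a socket that was never put into the listening state
 * with listen(2) ends up here and fails with EINVAL.
 */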
error = EINVAL; goto done; } #ifdef MAC error = mac_socket_check_accept(td->td_ucred, head); if (error != 0) goto done; #endif error = falloc_caps(td, &nfp, &fd, (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0, &fcaps); if (error != 0) goto done; SOCK_LOCK(head); if (!SOLISTENING(head)) { SOCK_UNLOCK(head); error = EINVAL; goto noconnection; } error = solisten_dequeue(head, &so, flags); if (error != 0) goto noconnection; /* An extra reference on `nfp' has been held for us by falloc(). */ td->td_retval[0] = fd; /* Connection has been removed from the listen queue. */ KNOTE_UNLOCKED(&head->so_rdsel.si_note, 0); if (flags & ACCEPT4_INHERIT) { pgid = fgetown(&head->so_sigio); if (pgid != 0) fsetown(pgid, &so->so_sigio); } else { fflag &= ~(FNONBLOCK | FASYNC); if (flags & SOCK_NONBLOCK) fflag |= FNONBLOCK; } finit(nfp, fflag, DTYPE_SOCKET, so, &socketops); /* Sync socket nonblocking/async state with file flags */ tmp = fflag & FNONBLOCK; (void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td); tmp = fflag & FASYNC; (void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td); error = soaccept(so, &sa); if (error != 0) goto noconnection; if (sa == NULL) { if (name) *namelen = 0; goto done; } AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa); if (name) { /* check sa_len before it is destroyed */ if (*namelen > sa->sa_len) *namelen = sa->sa_len; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif *name = sa; sa = NULL; } noconnection: free(sa, M_SONAME); /* * close the new descriptor, assuming someone hasn't ripped it * out from under us. */ if (error != 0) fdclose(td, nfp, fd); /* * Release explicitly held references before returning. We return * a reference on nfp to the caller on success if they request it. */ done: if (nfp == NULL) filecaps_free(&fcaps); if (fp != NULL) { if (error == 0) { *fp = nfp; nfp = NULL; } else *fp = NULL; } if (nfp != NULL) fdrop(nfp, td); fdrop(headfp, td); return (error); } int sys_accept(td, uap) struct thread *td; struct accept_args *uap; { return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT)); } int sys_accept4(td, uap) struct thread *td; struct accept4_args *uap; { if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return (EINVAL); return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags)); } #ifdef COMPAT_OLDSOCK int oaccept(struct thread *td, struct oaccept_args *uap) { return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT | ACCEPT4_COMPAT)); } #endif /* COMPAT_OLDSOCK */ int sys_connect(struct thread *td, struct connect_args *uap) { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_connectat(td, AT_FDCWD, uap->s, sa); free(sa, M_SONAME); } return (error); } int kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa) { struct socket *so; struct file *fp; int error, interrupted = 0; #ifdef CAPABILITY_MODE if (IN_CAPABILITY_MODE(td) && (dirfd == AT_FDCWD)) return (ECAPMODE); #endif AUDIT_ARG_FD(fd); AUDIT_ARG_SOCKADDR(td, dirfd, sa); error = getsock_cap(td, fd, &cap_connect_rights, &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; if (so->so_state & SS_ISCONNECTING) { error = EALREADY; goto done1; } #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(sa); #endif #ifdef MAC error = mac_socket_check_connect(td->td_ucred, so, sa); if (error != 0) goto bad; #endif if (dirfd == AT_FDCWD) error = soconnect(so, sa, td); else error = soconnectat(dirfd, so, sa, td); if (error != 0) goto bad; if ((so->so_state & SS_NBIO) && (so->so_state & 
SS_ISCONNECTING)) { error = EINPROGRESS; goto done1; } SOCK_LOCK(so); while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { error = msleep(&so->so_timeo, &so->so_lock, PSOCK | PCATCH, "connec", 0); if (error != 0) { if (error == EINTR || error == ERESTART) interrupted = 1; break; } } if (error == 0) { error = so->so_error; so->so_error = 0; } SOCK_UNLOCK(so); bad: if (!interrupted) so->so_state &= ~SS_ISCONNECTING; if (error == ERESTART) error = EINTR; done1: fdrop(fp, td); return (error); } int sys_connectat(struct thread *td, struct connectat_args *uap) { struct sockaddr *sa; int error; error = getsockaddr(&sa, uap->name, uap->namelen); if (error == 0) { error = kern_connectat(td, uap->fd, uap->s, sa); free(sa, M_SONAME); } return (error); } int kern_socketpair(struct thread *td, int domain, int type, int protocol, int *rsv) { struct file *fp1, *fp2; struct socket *so1, *so2; int fd, error, oflag, fflag; AUDIT_ARG_SOCKET(domain, type, protocol); oflag = 0; fflag = 0; if ((type & SOCK_CLOEXEC) != 0) { type &= ~SOCK_CLOEXEC; oflag |= O_CLOEXEC; } if ((type & SOCK_NONBLOCK) != 0) { type &= ~SOCK_NONBLOCK; fflag |= FNONBLOCK; } #ifdef MAC /* We might want to have a separate check for socket pairs. */ error = mac_socket_check_create(td->td_ucred, domain, type, protocol); if (error != 0) return (error); #endif error = socreate(domain, &so1, type, protocol, td->td_ucred, td); if (error != 0) return (error); error = socreate(domain, &so2, type, protocol, td->td_ucred, td); if (error != 0) goto free1; /* On success extra reference to `fp1' and 'fp2' is set by falloc. */ error = falloc(td, &fp1, &fd, oflag); if (error != 0) goto free2; rsv[0] = fd; fp1->f_data = so1; /* so1 already has ref count */ error = falloc(td, &fp2, &fd, oflag); if (error != 0) goto free3; fp2->f_data = so2; /* so2 already has ref count */ rsv[1] = fd; error = soconnect2(so1, so2); if (error != 0) goto free4; if (type == SOCK_DGRAM) { /* * Datagram socket connection is asymmetric. */ error = soconnect2(so2, so1); if (error != 0) goto free4; } else if (so1->so_proto->pr_flags & PR_CONNREQUIRED) { struct unpcb *unp, *unp2; unp = sotounpcb(so1); unp2 = sotounpcb(so2); /* * No need to lock the unps, because the sockets are brand-new. 
* No other threads can be using them yet */ unp_copy_peercred(td, unp, unp2, unp); } finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data, &socketops); finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data, &socketops); if ((fflag & FNONBLOCK) != 0) { (void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td); (void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td); } fdrop(fp1, td); fdrop(fp2, td); return (0); free4: fdclose(td, fp2, rsv[1]); fdrop(fp2, td); free3: fdclose(td, fp1, rsv[0]); fdrop(fp1, td); free2: if (so2 != NULL) (void)soclose(so2); free1: if (so1 != NULL) (void)soclose(so1); return (error); } int sys_socketpair(struct thread *td, struct socketpair_args *uap) { int error, sv[2]; error = kern_socketpair(td, uap->domain, uap->type, uap->protocol, sv); if (error != 0) return (error); error = copyout(sv, uap->rsv, 2 * sizeof(int)); if (error != 0) { (void)kern_close(td, sv[0]); (void)kern_close(td, sv[1]); } return (error); } static int sendit(struct thread *td, int s, struct msghdr *mp, int flags) { struct mbuf *control; struct sockaddr *to; int error; #ifdef CAPABILITY_MODE if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL)) return (ECAPMODE); #endif if (mp->msg_name != NULL) { error = getsockaddr(&to, mp->msg_name, mp->msg_namelen); if (error != 0) { to = NULL; goto bad; } mp->msg_name = to; } else { to = NULL; } if (mp->msg_control) { if (mp->msg_controllen < sizeof(struct cmsghdr) #ifdef COMPAT_OLDSOCK && (mp->msg_flags != MSG_COMPAT || !SV_PROC_FLAG(td->td_proc, SV_AOUT)) #endif ) { error = EINVAL; goto bad; } error = sockargs(&control, mp->msg_control, mp->msg_controllen, MT_CONTROL); if (error != 0) goto bad; #ifdef COMPAT_OLDSOCK if (mp->msg_flags == MSG_COMPAT && SV_PROC_FLAG(td->td_proc, SV_AOUT)) { struct cmsghdr *cm; M_PREPEND(control, sizeof(*cm), M_WAITOK); cm = mtod(control, struct cmsghdr *); cm->cmsg_len = control->m_len; cm->cmsg_level = SOL_SOCKET; cm->cmsg_type = SCM_RIGHTS; } #endif } else { control = NULL; } error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE); bad: free(to, M_SONAME); return (error); } int kern_sendit(struct thread *td, int s, struct msghdr *mp, int flags, struct mbuf *control, enum uio_seg segflg) { struct file *fp; struct uio auio; struct iovec *iov; struct socket *so; cap_rights_t *rights; #ifdef KTRACE struct uio *ktruio = NULL; #endif ssize_t len; int i, error; AUDIT_ARG_FD(s); rights = &cap_send_rights; if (mp->msg_name != NULL) { AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name); rights = &cap_send_connect_rights; } error = getsock_cap(td, s, rights, &fp, NULL, NULL); if (error != 0) { m_freem(control); return (error); } so = (struct socket *)fp->f_data; #ifdef KTRACE if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(mp->msg_name); #endif #ifdef MAC if (mp->msg_name != NULL) { error = mac_socket_check_connect(td->td_ucred, so, mp->msg_name); if (error != 0) { m_freem(control); goto bad; } } error = mac_socket_check_send(td->td_ucred, so); if (error != 0) { m_freem(control); goto bad; } #endif auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = segflg; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { error = EINVAL; m_freem(control); goto bad; } } #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif len = auio.uio_resid; error = sosend(so, mp->msg_name, &auio, 0, control, flags, td); if (error 
!= 0) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; /* Generation of SIGPIPE can be controlled per socket */ if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) && !(flags & MSG_NOSIGNAL)) { PROC_LOCK(td->td_proc); tdsignal(td, SIGPIPE); PROC_UNLOCK(td->td_proc); } } if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = td->td_retval[0]; ktrgenio(s, UIO_WRITE, ktruio, error); } #endif bad: fdrop(fp, td); return (error); } int sys_sendto(struct thread *td, struct sendto_args *uap) { struct msghdr msg; struct iovec aiov; msg.msg_name = __DECONST(void *, uap->to); msg.msg_namelen = uap->tolen; msg.msg_iov = &aiov; msg.msg_iovlen = 1; msg.msg_control = 0; #ifdef COMPAT_OLDSOCK if (SV_PROC_FLAG(td->td_proc, SV_AOUT)) msg.msg_flags = 0; #endif aiov.iov_base = __DECONST(void *, uap->buf); aiov.iov_len = uap->len; return (sendit(td, uap->s, &msg, uap->flags)); } #ifdef COMPAT_OLDSOCK int osend(struct thread *td, struct osend_args *uap) { struct msghdr msg; struct iovec aiov; msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = __DECONST(void *, uap->buf); aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = 0; return (sendit(td, uap->s, &msg, uap->flags)); } int osendmsg(struct thread *td, struct osendmsg_args *uap) { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_iov = iov; msg.msg_flags = MSG_COMPAT; error = sendit(td, uap->s, &msg, uap->flags); free(iov, M_IOV); return (error); } #endif int sys_sendmsg(struct thread *td, struct sendmsg_args *uap) { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (msg)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_iov = iov; #ifdef COMPAT_OLDSOCK if (SV_PROC_FLAG(td->td_proc, SV_AOUT)) msg.msg_flags = 0; #endif error = sendit(td, uap->s, &msg, uap->flags); free(iov, M_IOV); return (error); } int kern_recvit(struct thread *td, int s, struct msghdr *mp, enum uio_seg fromseg, struct mbuf **controlp) { struct uio auio; struct iovec *iov; struct mbuf *control, *m; caddr_t ctlbuf; struct file *fp; struct socket *so; struct sockaddr *fromsa = NULL; #ifdef KTRACE struct uio *ktruio = NULL; #endif ssize_t len; int error, i; if (controlp != NULL) *controlp = NULL; AUDIT_ARG_FD(s); error = getsock_cap(td, s, &cap_recv_rights, &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; #ifdef MAC error = mac_socket_check_receive(td->td_ucred, so); if (error != 0) { fdrop(fp, td); return (error); } #endif auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { fdrop(fp, td); return (EINVAL); } } #ifdef KTRACE if (KTRPOINT(td, KTR_GENIO)) ktruio = cloneuio(&auio); #endif control = NULL; len = auio.uio_resid; error = soreceive(so, &fromsa, &auio, NULL, (mp->msg_control || controlp) ? 
&control : NULL, &mp->msg_flags); if (error != 0) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } if (fromsa != NULL) AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa); #ifdef KTRACE if (ktruio != NULL) { ktruio->uio_resid = len - auio.uio_resid; ktrgenio(s, UIO_READ, ktruio, error); } #endif if (error != 0) goto out; td->td_retval[0] = len - auio.uio_resid; if (mp->msg_name) { len = mp->msg_namelen; if (len <= 0 || fromsa == NULL) len = 0; else { /* save sa_len before it is destroyed by MSG_COMPAT */ len = MIN(len, fromsa->sa_len); #ifdef COMPAT_OLDSOCK if ((mp->msg_flags & MSG_COMPAT) != 0 && SV_PROC_FLAG(td->td_proc, SV_AOUT)) ((struct osockaddr *)fromsa)->sa_family = fromsa->sa_family; #endif if (fromseg == UIO_USERSPACE) { error = copyout(fromsa, mp->msg_name, (unsigned)len); if (error != 0) goto out; } else bcopy(fromsa, mp->msg_name, len); } mp->msg_namelen = len; } if (mp->msg_control && controlp == NULL) { #ifdef COMPAT_OLDSOCK /* * We assume that old recvmsg calls won't receive access * rights and other control info, esp. as control info * is always optional and those options didn't exist in 4.3. * If we receive rights, trim the cmsghdr; anything else * is tossed. */ if (control && (mp->msg_flags & MSG_COMPAT) != 0 && SV_PROC_FLAG(td->td_proc, SV_AOUT)) { if (mtod(control, struct cmsghdr *)->cmsg_level != SOL_SOCKET || mtod(control, struct cmsghdr *)->cmsg_type != SCM_RIGHTS) { mp->msg_controllen = 0; goto out; } control->m_len -= sizeof (struct cmsghdr); control->m_data += sizeof (struct cmsghdr); } #endif ctlbuf = mp->msg_control; len = mp->msg_controllen; mp->msg_controllen = 0; for (m = control; m != NULL && len >= m->m_len; m = m->m_next) { if ((error = copyout(mtod(m, caddr_t), ctlbuf, m->m_len)) != 0) goto out; ctlbuf += m->m_len; len -= m->m_len; mp->msg_controllen += m->m_len; } if (m != NULL) { mp->msg_flags |= MSG_CTRUNC; m_dispose_extcontrolm(m); } } out: fdrop(fp, td); #ifdef KTRACE if (fromsa && KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(fromsa); #endif free(fromsa, M_SONAME); if (error == 0 && controlp != NULL) *controlp = control; else if (control != NULL) { if (error != 0) m_dispose_extcontrolm(control); m_freem(control); } return (error); } static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp) { int error; error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL); if (error != 0) return (error); if (namelenp != NULL) { error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t)); #ifdef COMPAT_OLDSOCK if ((mp->msg_flags & MSG_COMPAT) != 0 && SV_PROC_FLAG(td->td_proc, SV_AOUT)) error = 0; /* old recvfrom didn't check */ #endif } return (error); } int sys_recvfrom(struct thread *td, struct recvfrom_args *uap) { struct msghdr msg; struct iovec aiov; int error; if (uap->fromlenaddr) { error = copyin(uap->fromlenaddr, &msg.msg_namelen, sizeof (msg.msg_namelen)); if (error != 0) goto done2; } else { msg.msg_namelen = 0; } msg.msg_name = uap->from; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = uap->flags; error = recvit(td, uap->s, &msg, uap->fromlenaddr); done2: return (error); } #ifdef COMPAT_OLDSOCK int orecvfrom(struct thread *td, struct recvfrom_args *uap) { uap->flags |= MSG_COMPAT; return (sys_recvfrom(td, uap)); } #endif #ifdef COMPAT_OLDSOCK int orecv(struct thread *td, struct orecv_args *uap) { struct msghdr msg; struct iovec aiov; msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen = 1; 
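/*
 * orecv() is the old 4.3BSD recv(2): it wraps the caller's buffer in a
 * single-element iovec and funnels into the common recvit() path with
 * no source address and no control data.
 */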
aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = uap->flags; return (recvit(td, uap->s, &msg, NULL)); } /* * Old recvmsg. This code takes advantage of the fact that the old msghdr * overlays the new one, missing only the flags, and with the (old) access * rights where the control fields are now. */ int orecvmsg(struct thread *td, struct orecvmsg_args *uap) { struct msghdr msg; struct iovec *iov; int error; error = copyin(uap->msg, &msg, sizeof (struct omsghdr)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_flags = uap->flags | MSG_COMPAT; msg.msg_iov = iov; error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen); if (msg.msg_controllen && error == 0) error = copyout(&msg.msg_controllen, &uap->msg->msg_accrightslen, sizeof (int)); free(iov, M_IOV); return (error); } #endif int sys_recvmsg(struct thread *td, struct recvmsg_args *uap) { struct msghdr msg; struct iovec *uiov, *iov; int error; error = copyin(uap->msg, &msg, sizeof (msg)); if (error != 0) return (error); error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); if (error != 0) return (error); msg.msg_flags = uap->flags; #ifdef COMPAT_OLDSOCK if (SV_PROC_FLAG(td->td_proc, SV_AOUT)) msg.msg_flags &= ~MSG_COMPAT; #endif uiov = msg.msg_iov; msg.msg_iov = iov; error = recvit(td, uap->s, &msg, NULL); if (error == 0) { msg.msg_iov = uiov; error = copyout(&msg, uap->msg, sizeof(msg)); } free(iov, M_IOV); return (error); } int sys_shutdown(struct thread *td, struct shutdown_args *uap) { return (kern_shutdown(td, uap->s, uap->how)); } int kern_shutdown(struct thread *td, int s, int how) { struct socket *so; struct file *fp; int error; AUDIT_ARG_FD(s); error = getsock_cap(td, s, &cap_shutdown_rights, &fp, NULL, NULL); if (error == 0) { so = fp->f_data; error = soshutdown(so, how); /* * Previous versions did not return ENOTCONN, but 0 in * case the socket was not connected. Some important * programs like syslogd up to r279016, 2015-02-19, * still depend on this behavior. 
*/ if (error == ENOTCONN && td->td_proc->p_osrel < P_OSREL_SHUTDOWN_ENOTCONN) error = 0; fdrop(fp, td); } return (error); } int sys_setsockopt(struct thread *td, struct setsockopt_args *uap) { return (kern_setsockopt(td, uap->s, uap->level, uap->name, uap->val, UIO_USERSPACE, uap->valsize)); } int kern_setsockopt(struct thread *td, int s, int level, int name, const void *val, enum uio_seg valseg, socklen_t valsize) { struct socket *so; struct file *fp; struct sockopt sopt; int error; if (val == NULL && valsize != 0) return (EFAULT); if ((int)valsize < 0) return (EINVAL); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = level; sopt.sopt_name = name; sopt.sopt_val = __DECONST(void *, val); sopt.sopt_valsize = valsize; switch (valseg) { case UIO_USERSPACE: sopt.sopt_td = td; break; case UIO_SYSSPACE: sopt.sopt_td = NULL; break; default: panic("kern_setsockopt called with bad valseg"); } AUDIT_ARG_FD(s); error = getsock_cap(td, s, &cap_setsockopt_rights, &fp, NULL, NULL); if (error == 0) { so = fp->f_data; error = sosetopt(so, &sopt); fdrop(fp, td); } return(error); } int sys_getsockopt(struct thread *td, struct getsockopt_args *uap) { socklen_t valsize; int error; if (uap->val) { error = copyin(uap->avalsize, &valsize, sizeof (valsize)); if (error != 0) return (error); } error = kern_getsockopt(td, uap->s, uap->level, uap->name, uap->val, UIO_USERSPACE, &valsize); if (error == 0) error = copyout(&valsize, uap->avalsize, sizeof (valsize)); return (error); } /* * Kernel version of getsockopt. * optval can be a userland or kernel pointer. optlen is always a kernel pointer. */ int kern_getsockopt(struct thread *td, int s, int level, int name, void *val, enum uio_seg valseg, socklen_t *valsize) { struct socket *so; struct file *fp; struct sockopt sopt; int error; if (val == NULL) *valsize = 0; if ((int)*valsize < 0) return (EINVAL); sopt.sopt_dir = SOPT_GET; sopt.sopt_level = level; sopt.sopt_name = name; sopt.sopt_val = val; sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */ switch (valseg) { case UIO_USERSPACE: sopt.sopt_td = td; break; case UIO_SYSSPACE: sopt.sopt_td = NULL; break; default: panic("kern_getsockopt called with bad valseg"); } AUDIT_ARG_FD(s); error = getsock_cap(td, s, &cap_getsockopt_rights, &fp, NULL, NULL); if (error == 0) { so = fp->f_data; error = sogetopt(so, &sopt); *valsize = sopt.sopt_valsize; fdrop(fp, td); } return (error); } /* * getsockname1() - Get socket name.
*/ static int getsockname1(struct thread *td, struct getsockname_args *uap, int compat) { struct sockaddr *sa; socklen_t len; int error; error = copyin(uap->alen, &len, sizeof(len)); if (error != 0) return (error); error = kern_getsockname(td, uap->fdes, &sa, &len); if (error != 0) return (error); if (len != 0) { #ifdef COMPAT_OLDSOCK if (compat && SV_PROC_FLAG(td->td_proc, SV_AOUT)) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, uap->asa, (u_int)len); } free(sa, M_SONAME); if (error == 0) error = copyout(&len, uap->alen, sizeof(len)); return (error); } int kern_getsockname(struct thread *td, int fd, struct sockaddr **sa, socklen_t *alen) { struct socket *so; struct file *fp; socklen_t len; int error; AUDIT_ARG_FD(fd); error = getsock_cap(td, fd, &cap_getsockname_rights, &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; *sa = NULL; - error = sogetsockaddr(so, sa); + CURVNET_SET(so->so_vnet); + error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa); + CURVNET_RESTORE(); if (error != 0) goto bad; if (*sa == NULL) len = 0; else len = MIN(*alen, (*sa)->sa_len); *alen = len; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(*sa); #endif bad: fdrop(fp, td); if (error != 0 && *sa != NULL) { free(*sa, M_SONAME); *sa = NULL; } return (error); } int sys_getsockname(struct thread *td, struct getsockname_args *uap) { return (getsockname1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK int ogetsockname(struct thread *td, struct getsockname_args *uap) { return (getsockname1(td, uap, 1)); } #endif /* COMPAT_OLDSOCK */ /* * getpeername1() - Get name of peer for connected socket. */ static int getpeername1(struct thread *td, struct getpeername_args *uap, int compat) { struct sockaddr *sa; socklen_t len; int error; error = copyin(uap->alen, &len, sizeof (len)); if (error != 0) return (error); error = kern_getpeername(td, uap->fdes, &sa, &len); if (error != 0) return (error); if (len != 0) { #ifdef COMPAT_OLDSOCK if (compat && SV_PROC_FLAG(td->td_proc, SV_AOUT)) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, uap->asa, (u_int)len); } free(sa, M_SONAME); if (error == 0) error = copyout(&len, uap->alen, sizeof(len)); return (error); } int kern_getpeername(struct thread *td, int fd, struct sockaddr **sa, socklen_t *alen) { struct socket *so; struct file *fp; socklen_t len; int error; AUDIT_ARG_FD(fd); error = getsock_cap(td, fd, &cap_getpeername_rights, &fp, NULL, NULL); if (error != 0) return (error); so = fp->f_data; if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) { error = ENOTCONN; goto done; } *sa = NULL; CURVNET_SET(so->so_vnet); error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa); CURVNET_RESTORE(); if (error != 0) goto bad; if (*sa == NULL) len = 0; else len = MIN(*alen, (*sa)->sa_len); *alen = len; #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktrsockaddr(*sa); #endif bad: if (error != 0 && *sa != NULL) { free(*sa, M_SONAME); *sa = NULL; } done: fdrop(fp, td); return (error); } int sys_getpeername(struct thread *td, struct getpeername_args *uap) { return (getpeername1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK int ogetpeername(struct thread *td, struct ogetpeername_args *uap) { /* XXX uap should have type `getpeername_args *' to begin with. 
*/ return (getpeername1(td, (struct getpeername_args *)uap, 1)); } #endif /* COMPAT_OLDSOCK */ static int sockargs(struct mbuf **mp, char *buf, socklen_t buflen, int type) { struct sockaddr *sa; struct mbuf *m; int error; if (buflen > MLEN) { #ifdef COMPAT_OLDSOCK if (type == MT_SONAME && buflen <= 112 && SV_CURPROC_FLAG(SV_AOUT)) buflen = MLEN; /* unix domain compat. hack */ else #endif if (buflen > MCLBYTES) return (EINVAL); } m = m_get2(buflen, M_WAITOK, type, 0); m->m_len = buflen; error = copyin(buf, mtod(m, void *), buflen); if (error != 0) (void) m_free(m); else { *mp = m; if (type == MT_SONAME) { sa = mtod(m, struct sockaddr *); #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX && SV_CURPROC_FLAG(SV_AOUT)) sa->sa_family = sa->sa_len; #endif sa->sa_len = buflen; } } return (error); } int getsockaddr(struct sockaddr **namp, const struct sockaddr *uaddr, size_t len) { struct sockaddr *sa; int error; if (len > SOCK_MAXADDRLEN) return (ENAMETOOLONG); if (len < offsetof(struct sockaddr, sa_data[0])) return (EINVAL); sa = malloc(len, M_SONAME, M_WAITOK); error = copyin(uaddr, sa, len); if (error != 0) { free(sa, M_SONAME); } else { #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX && SV_CURPROC_FLAG(SV_AOUT)) sa->sa_family = sa->sa_len; #endif sa->sa_len = len; *namp = sa; } return (error); } /* * Dispose of externalized rights from an SCM_RIGHTS message. This function * should be used in error or truncation cases to avoid leaking file descriptors * into the recipient's (the current thread's) table. */ void m_dispose_extcontrolm(struct mbuf *m) { struct cmsghdr *cm; struct file *fp; struct thread *td; socklen_t clen, datalen; int error, fd, *fds, nfd; td = curthread; for (; m != NULL; m = m->m_next) { if (m->m_type != MT_EXTCONTROL) continue; cm = mtod(m, struct cmsghdr *); clen = m->m_len; while (clen > 0) { if (clen < sizeof(*cm)) panic("%s: truncated mbuf %p", __func__, m); datalen = CMSG_SPACE(cm->cmsg_len - CMSG_SPACE(0)); if (clen < datalen) panic("%s: truncated mbuf %p", __func__, m); if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) { fds = (int *)CMSG_DATA(cm); nfd = (cm->cmsg_len - CMSG_SPACE(0)) / sizeof(int); while (nfd-- > 0) { fd = *fds++; error = fget(td, fd, &cap_no_rights, &fp); if (error == 0) { fdclose(td, fp, fd); fdrop(fp, td); } } } clen -= datalen; cm = (struct cmsghdr *)((uint8_t *)cm + datalen); } m_chtype(m, MT_CONTROL); } } diff --git a/sys/modules/Makefile b/sys/modules/Makefile index 32919674901b..0195ce5064d2 100644 --- a/sys/modules/Makefile +++ b/sys/modules/Makefile @@ -1,847 +1,846 @@ # $FreeBSD$ SYSDIR?=${SRCTOP}/sys .include "${SYSDIR}/conf/kern.opts.mk" SUBDIR_PARALLEL= # Modules that include binary-only blobs of microcode should be selectable by # MK_SOURCELESS_UCODE option (see below). 
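# For example (hypothetical invocations), the override knobs handled below
# can trim the module list from the command line:
#	make -C sys/modules MODULES_OVERRIDE="ipfw mac_biba"
#	make buildkernel WITHOUT_MODULES="zfs"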
.include "${SYSDIR}/conf/config.mk" .if defined(MODULES_OVERRIDE) && !defined(ALL_MODULES) SUBDIR=${MODULES_OVERRIDE} .else SUBDIR= \ ${_3dfx} \ ${_3dfx_linux} \ ${_aac} \ ${_aacraid} \ accf_data \ accf_dns \ accf_http \ acl_nfs4 \ acl_posix1e \ ${_acpi} \ ae \ ${_aesni} \ age \ ${_agp} \ ahci \ aic7xxx \ alc \ ale \ alq \ ${_amd_ecc_inject} \ ${_amdgpio} \ ${_amdsbwd} \ ${_amdsmn} \ ${_amdtemp} \ amr \ ${_an} \ ${_aout} \ ${_arcmsr} \ ${_allwinner} \ ${_armv8crypto} \ ${_asmc} \ ata \ ath \ ath_dfs \ ath_hal \ ath_hal_ar5210 \ ath_hal_ar5211 \ ath_hal_ar5212 \ ath_hal_ar5416 \ ath_hal_ar9300 \ ath_main \ ath_rate \ ath_pci \ ${_autofs} \ axgbe \ backlight \ ${_bce} \ ${_bcm283x_clkman} \ ${_bcm283x_pwm} \ bfe \ bge \ bhnd \ ${_bxe} \ ${_bios} \ ${_blake2} \ bnxt \ bridgestp \ bwi \ bwn \ ${_bytgpio} \ ${_chvgpio} \ cam \ ${_cardbus} \ ${_carp} \ cas \ ${_cbb} \ cc \ ${_ccp} \ cd9660 \ cd9660_iconv \ ${_ce} \ ${_cfi} \ ${_chromebook_platform} \ ${_ciss} \ cloudabi \ ${_cloudabi32} \ ${_cloudabi64} \ ${_coretemp} \ ${_cp} \ ${_cpsw} \ ${_cpuctl} \ ${_cpufreq} \ ${_crypto} \ ${_cryptodev} \ ctl \ ${_cxgb} \ ${_cxgbe} \ dc \ dcons \ dcons_crom \ ${_dpdk_lpm4} \ ${_dpdk_lpm6} \ ${_dpms} \ dummynet \ ${_dwwdt} \ ${_efirt} \ ${_em} \ ${_ena} \ esp \ ${_et} \ evdev \ ${_exca} \ ext2fs \ fdc \ fdescfs \ ${_ffec} \ filemon \ firewire \ firmware \ ${_ftwd} \ fusefs \ ${_fxp} \ gem \ geom \ ${_glxiic} \ ${_glxsb} \ gpio \ hid \ hifn \ ${_hpt27xx} \ ${_hptiop} \ ${_hptmv} \ ${_hptnr} \ ${_hptrr} \ hwpmc \ ${_hwpmc_mips24k} \ ${_hwpmc_mips74k} \ ${_hyperv} \ i2c \ ${_iavf} \ ${_ibcore} \ ${_ichwd} \ ${_ice} \ ${_ice_ddp} \ ${_ida} \ if_bridge \ if_disc \ if_edsc \ ${_if_enc} \ if_epair \ ${_if_gif} \ ${_if_gre} \ ${_if_me} \ if_infiniband \ if_lagg \ ${_if_stf} \ if_tuntap \ if_vlan \ if_vxlan \ - ${_if_wg} \ iflib \ ${_iir} \ imgact_binmisc \ ${_intelspi} \ ${_io} \ ${_ioat} \ ${_ipoib} \ ${_ipdivert} \ ${_ipfilter} \ ${_ipfw} \ ipfw_nat \ ${_ipfw_nat64} \ ${_ipfw_nptv6} \ ${_ipfw_pmod} \ ${_ipmi} \ ip6_mroute_mod \ ip_mroute_mod \ ${_ips} \ ${_ipsec} \ ${_ipw} \ ${_ipwfw} \ ${_isci} \ ${_iser} \ isp \ ${_ispfw} \ ${_itwd} \ ${_iwi} \ ${_iwifw} \ ${_iwm} \ ${_iwmfw} \ ${_iwn} \ ${_iwnfw} \ ${_ix} \ ${_ixv} \ ${_ixl} \ jme \ kbdmux \ kgssapi \ kgssapi_krb5 \ khelp \ krpc \ ksyms \ ${_ktls_ocf} \ le \ lge \ libalias \ libiconv \ libmchain \ lindebugfs \ linuxkpi \ ${_lio} \ lpt \ mac_biba \ mac_bsdextended \ mac_ifoff \ mac_lomac \ mac_mls \ mac_none \ mac_ntpd \ mac_partition \ mac_portacl \ mac_seeotheruids \ mac_stub \ mac_test \ ${_malo} \ md \ mdio \ mem \ mfi \ mii \ mlx \ mlxfw \ ${_mlx4} \ ${_mlx4ib} \ ${_mlx4en} \ ${_mlx5} \ ${_mlx5en} \ ${_mlx5ib} \ ${_mly} \ mmc \ mmcsd \ ${_mpr} \ ${_mps} \ mpt \ mqueue \ mrsas \ msdosfs \ msdosfs_iconv \ msk \ ${_mthca} \ mvs \ mwl \ ${_mwlfw} \ mxge \ my \ ${_nctgpio} \ ${_netgraph} \ ${_nfe} \ nfscl \ nfscommon \ nfsd \ nfslockd \ nfssvc \ nge \ nmdm \ nullfs \ ${_ntb} \ ${_nvd} \ ${_nvdimm} \ ${_nvme} \ ${_nvram} \ oce \ ${_ocs_fc} \ ${_ossl} \ otus \ ${_otusfw} \ ow \ ${_padlock} \ ${_padlock_rng} \ ${_pccard} \ ${_pchtherm} \ ${_pcfclock} \ ${_pf} \ ${_pflog} \ ${_pfsync} \ plip \ ${_pms} \ ppbus \ ppc \ ppi \ pps \ procfs \ proto \ pseudofs \ ${_pst} \ pty \ puc \ pwm \ ${_qat} \ ${_qatfw} \ ${_qlxge} \ ${_qlxgb} \ ${_qlxgbe} \ ${_qlnx} \ ral \ ${_ralfw} \ ${_random_fortuna} \ ${_random_other} \ rc4 \ ${_rdma} \ ${_rdrand_rng} \ re \ rl \ ${_rockchip} \ rtsx \ rtwn \ rtwn_pci \ rtwn_usb \ ${_rtwnfw} \ ${_s3} \ ${_safe} \ safexcel \ ${_sbni} \ scc \ 
${_sctp} \ sdhci \ ${_sdhci_acpi} \ sdhci_pci \ sdio \ sem \ send \ ${_sfxge} \ sge \ ${_sgx} \ ${_sgx_linux} \ siftr \ siis \ sis \ sk \ ${_smartpqi} \ smbfs \ snp \ sound \ ${_speaker} \ spi \ ${_splash} \ ${_sppp} \ ste \ stge \ ${_sume} \ ${_superio} \ ${_sym} \ ${_syscons} \ sysvipc \ tcp \ ${_ti} \ tmpfs \ ${_toecore} \ ${_tpm} \ ${_twa} \ twe \ tws \ uart \ udf \ udf_iconv \ ufs \ uinput \ unionfs \ usb \ ${_vesa} \ virtio \ vge \ ${_viawd} \ videomode \ vkbd \ ${_vmd} \ ${_vmm} \ ${_vmware} \ vr \ vte \ ${_wbwd} \ wlan \ wlan_acl \ wlan_amrr \ wlan_ccmp \ wlan_rssadapt \ wlan_tkip \ wlan_wep \ wlan_xauth \ ${_wpi} \ ${_wpifw} \ ${_x86bios} \ xdr \ xl \ xz \ zlib .if ${MK_AUTOFS} != "no" || defined(ALL_MODULES) _autofs= autofs .endif .if ${MK_CDDL} != "no" || defined(ALL_MODULES) .if (${MACHINE_CPUARCH} != "arm" || ${MACHINE_ARCH:Marmv[67]*} != "") && \ ${MACHINE_CPUARCH} != "mips" .if ${KERN_OPTS:MKDTRACE_HOOKS} SUBDIR+= dtrace .endif .endif SUBDIR+= opensolaris .endif .if ${MK_CRYPT} != "no" || defined(ALL_MODULES) .if exists(${SRCTOP}/sys/opencrypto) _crypto= crypto _cryptodev= cryptodev _random_fortuna=random_fortuna _random_other= random_other _ktls_ocf= ktls_ocf .endif .endif .if ${MK_CUSE} != "no" || defined(ALL_MODULES) SUBDIR+= cuse .endif .if ${MK_EFI} != "no" .if ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "amd64" _efirt= efirt .endif .endif .if (${MK_INET_SUPPORT} != "no" || ${MK_INET6_SUPPORT} != "no") || \ defined(ALL_MODULES) _carp= carp _toecore= toecore _if_enc= if_enc _if_gif= if_gif _if_gre= if_gre _ipfw_pmod= ipfw_pmod .if ${KERN_OPTS:MIPSEC_SUPPORT} && !${KERN_OPTS:MIPSEC} _ipsec= ipsec .endif .if ${KERN_OPTS:MSCTP_SUPPORT} || ${KERN_OPTS:MSCTP} _sctp= sctp .endif .endif .if (${MK_INET_SUPPORT} != "no" && ${MK_INET6_SUPPORT} != "no") || \ defined(ALL_MODULES) _if_stf= if_stf .endif .if ${MK_INET_SUPPORT} != "no" || defined(ALL_MODULES) _if_wg= if_wg .endif .if ${MK_INET_SUPPORT} != "no" || defined(ALL_MODULES) _if_me= if_me _ipdivert= ipdivert _ipfw= ipfw .if ${MK_INET6_SUPPORT} != "no" || defined(ALL_MODULES) _ipfw_nat64= ipfw_nat64 .endif .endif .if ${MK_INET6_SUPPORT} != "no" || defined(ALL_MODULES) _ipfw_nptv6= ipfw_nptv6 .endif .if ${MK_IPFILTER} != "no" || defined(ALL_MODULES) _ipfilter= ipfilter .endif .if ${MK_INET_SUPPORT} != "no" && ${KERN_OPTS:MFIB_ALGO} _dpdk_lpm4= dpdk_lpm4 .endif .if ${MK_INET6_SUPPORT} != "no" && ${KERN_OPTS:MFIB_ALGO} _dpdk_lpm6= dpdk_lpm6 .endif .if ${MK_ISCSI} != "no" || defined(ALL_MODULES) SUBDIR+= cfiscsi SUBDIR+= iscsi SUBDIR+= iscsi_initiator .endif .if !empty(OPT_FDT) SUBDIR+= fdt .endif # Linuxulator .if ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "amd64" || \ ${MACHINE_CPUARCH} == "i386" SUBDIR+= linprocfs SUBDIR+= linsysfs .endif .if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386" SUBDIR+= linux .endif .if ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "amd64" SUBDIR+= linux64 SUBDIR+= linux_common .endif .if ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "amd64" || \ ${MACHINE_CPUARCH} == "i386" _ena= ena .if ${MK_OFED} != "no" || defined(ALL_MODULES) _ibcore= ibcore _ipoib= ipoib _iser= iser .endif _ipmi= ipmi _mlx4= mlx4 _mlx5= mlx5 .if (${MK_INET_SUPPORT} != "no" && ${MK_INET6_SUPPORT} != "no") || \ defined(ALL_MODULES) _mlx4en= mlx4en _mlx5en= mlx5en .endif .if ${MK_OFED} != "no" || defined(ALL_MODULES) _mthca= mthca _mlx4ib= mlx4ib _mlx5ib= mlx5ib .endif _ossl= ossl _vmware= vmware .endif .if ${MK_NETGRAPH} != "no" || defined(ALL_MODULES) _netgraph= 
netgraph .endif .if (${MK_PF} != "no" && (${MK_INET_SUPPORT} != "no" || \ ${MK_INET6_SUPPORT} != "no")) || defined(ALL_MODULES) _pf= pf _pflog= pflog .if ${MK_INET_SUPPORT} != "no" _pfsync= pfsync .endif .endif .if ${MK_SOURCELESS_UCODE} != "no" _bce= bce _fxp= fxp _ispfw= ispfw _ti= ti .if ${MACHINE_CPUARCH} != "mips" _mwlfw= mwlfw _otusfw= otusfw _ralfw= ralfw _rtwnfw= rtwnfw .endif .endif .if ${MK_SOURCELESS_UCODE} != "no" && ${MACHINE_CPUARCH} != "arm" && \ ${MACHINE_CPUARCH} != "mips" && \ ${MACHINE_ARCH} != "powerpc" && ${MACHINE_ARCH} != "powerpcspe" && \ ${MACHINE_CPUARCH} != "riscv" _cxgbe= cxgbe .endif .if ${MACHINE_ARCH} == "amd64" || ${MACHINE_ARCH} == "arm64" _ice= ice .if ${MK_SOURCELESS_UCODE} != "no" _ice_ddp= ice_ddp .endif .endif # These rely on 64bit atomics .if ${MACHINE_ARCH} != "powerpc" && ${MACHINE_ARCH} != "powerpcspe" && \ ${MACHINE_CPUARCH} != "mips" _mps= mps _mpr= mpr .endif .if ${MK_TESTS} != "no" || defined(ALL_MODULES) SUBDIR+= tests .endif .if ${MK_ZFS} != "no" || (defined(ALL_MODULES) && ${MACHINE_CPUARCH} != "powerpc") SUBDIR+= zfs .endif .if (${MACHINE_CPUARCH} == "mips" && ${MACHINE_ARCH:Mmips64} == "") _hwpmc_mips24k= hwpmc_mips24k _hwpmc_mips74k= hwpmc_mips74k .endif .if ${MACHINE_CPUARCH} != "aarch64" && ${MACHINE_CPUARCH} != "arm" && \ ${MACHINE_CPUARCH} != "mips" && ${MACHINE_CPUARCH} != "powerpc" && \ ${MACHINE_CPUARCH} != "riscv" _syscons= syscons .endif .if ${MACHINE_CPUARCH} != "mips" # no BUS_SPACE_UNSPECIFIED # No barrier instruction support (specific to this driver) _sym= sym # intr_disable() is a macro, causes problems .if ${MK_SOURCELESS_UCODE} != "no" _cxgb= cxgb .endif .endif .if ${MACHINE_CPUARCH} == "aarch64" _allwinner= allwinner _armv8crypto= armv8crypto _dwwdt= dwwdt _em= em _rockchip= rockchip .endif .if ${MACHINE_CPUARCH} == "i386" || ${MACHINE_CPUARCH} == "amd64" _agp= agp _an= an _aout= aout _bios= bios .if ${MK_SOURCELESS_UCODE} != "no" _bxe= bxe .endif _cardbus= cardbus _cbb= cbb _cpuctl= cpuctl _cpufreq= cpufreq _dpms= dpms _em= em _et= et _ftwd= ftwd _exca= exca _io= io _itwd= itwd _ix= ix _ixv= ixv .if ${MK_SOURCELESS_UCODE} != "no" _lio= lio .endif _nctgpio= nctgpio _ntb= ntb _ocs_fc= ocs_fc _pccard= pccard _qat= qat _qatfw= qatfw .if ${MK_OFED} != "no" || defined(ALL_MODULES) _rdma= rdma .endif _safe= safe _speaker= speaker _splash= splash _sppp= sppp _wbwd= wbwd _aac= aac _aacraid= aacraid _acpi= acpi .if ${MK_CRYPT} != "no" || defined(ALL_MODULES) _aesni= aesni .endif _amd_ecc_inject=amd_ecc_inject _amdsbwd= amdsbwd _amdsmn= amdsmn _amdtemp= amdtemp _arcmsr= arcmsr _asmc= asmc .if ${MK_CRYPT} != "no" || defined(ALL_MODULES) _blake2= blake2 .endif _bytgpio= bytgpio _chvgpio= chvgpio _ciss= ciss _chromebook_platform= chromebook_platform _coretemp= coretemp .if ${MK_SOURCELESS_HOST} != "no" && empty(KCSAN_ENABLED) _hpt27xx= hpt27xx .endif _hptiop= hptiop .if ${MK_SOURCELESS_HOST} != "no" && empty(KCSAN_ENABLED) _hptmv= hptmv _hptnr= hptnr _hptrr= hptrr .endif _hyperv= hyperv _ichwd= ichwd _ida= ida _iir= iir _intelspi= intelspi _ips= ips _isci= isci _ipw= ipw _iwi= iwi _iwm= iwm _iwn= iwn .if ${MK_SOURCELESS_UCODE} != "no" _ipwfw= ipwfw _iwifw= iwifw _iwmfw= iwmfw _iwnfw= iwnfw .endif _mly= mly _nfe= nfe _nvd= nvd _nvme= nvme _nvram= nvram .if ${MK_CRYPT} != "no" || defined(ALL_MODULES) _padlock= padlock _padlock_rng= padlock_rng _rdrand_rng= rdrand_rng .endif _pchtherm = pchtherm _s3= s3 _sdhci_acpi= sdhci_acpi _superio= superio _tpm= tpm _twa= twa _vesa= vesa _viawd= viawd _wpi= wpi .if ${MK_SOURCELESS_UCODE} != "no" 
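# wpi(4) firmware is a binary-only microcode image, hence gated by
# MK_SOURCELESS_UCODE (see the comment at the top of this Makefile).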
_wpifw= wpifw .endif _x86bios= x86bios .endif .if ${MACHINE_CPUARCH} == "amd64" _amdgpio= amdgpio _ccp= ccp _iavf= iavf _ioat= ioat _ixl= ixl _nvdimm= nvdimm _pms= pms _qlxge= qlxge _qlxgb= qlxgb _sume= sume _vmd= vmd .if ${MK_SOURCELESS_UCODE} != "no" _qlxgbe= qlxgbe _qlnx= qlnx .endif _sfxge= sfxge _sgx= sgx _sgx_linux= sgx_linux _smartpqi= smartpqi .if ${MK_BHYVE} != "no" || defined(ALL_MODULES) .if ${KERN_OPTS:MSMP} _vmm= vmm .endif .endif .endif .if ${MACHINE_CPUARCH} == "i386" # XXX some of these can move to the general case when de-i386'ed # XXX some of these can move now, but are untested on other architectures. _3dfx= 3dfx _3dfx_linux= 3dfx_linux .if ${MK_SOURCELESS_HOST} != "no" _ce= ce .endif .if ${MK_SOURCELESS_HOST} != "no" _cp= cp .endif _glxiic= glxiic _glxsb= glxsb _pcfclock= pcfclock _pst= pst _sbni= sbni .endif .if ${MACHINE_ARCH} == "armv7" _cfi= cfi _cpsw= cpsw .endif .if ${MACHINE_CPUARCH} == "powerpc" _aacraid= aacraid _agp= agp _an= an _cardbus= cardbus _cbb= cbb _cfi= cfi _cpufreq= cpufreq _exca= exca _ffec= ffec _nvd= nvd _nvme= nvme _pccard= pccard .endif .if ${MACHINE_ARCH:Mpowerpc64*} != "" _ipmi= ipmi _ixl= ixl _nvram= opal_nvram .endif .if ${MACHINE_CPUARCH} == "powerpc" && ${MACHINE_ARCH} != "powerpcspe" # Don't build powermac_nvram for powerpcspe, it's never supported. _nvram+= powermac_nvram .endif .if (${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "amd64" || \ ${MACHINE_ARCH:Marmv[67]*} != "" || ${MACHINE_CPUARCH} == "i386") _cloudabi32= cloudabi32 .endif .if ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "amd64" _cloudabi64= cloudabi64 .endif .endif .if ${MACHINE_ARCH:Marmv[67]*} != "" || ${MACHINE_CPUARCH} == "aarch64" _bcm283x_clkman= bcm283x_clkman _bcm283x_pwm= bcm283x_pwm .endif .if !(${COMPILER_TYPE} == "clang" && ${COMPILER_VERSION} < 110000) # LLVM 10 crashes when building if_malo_pci.c, fixed in LLVM11: # https://bugs.llvm.org/show_bug.cgi?id=44351 _malo= malo .endif SUBDIR+=${MODULES_EXTRA} .for reject in ${WITHOUT_MODULES} SUBDIR:= ${SUBDIR:N${reject}} .endfor # Calling kldxref(8) for each module is expensive. .if !defined(NO_XREF) .MAKEFLAGS+= -DNO_XREF afterinstall: .PHONY @if type kldxref >/dev/null 2>&1; then \ ${ECHO} ${KLDXREF_CMD} ${DESTDIR}${KMODDIR}; \ ${KLDXREF_CMD} ${DESTDIR}${KMODDIR}; \ fi .endif SUBDIR:= ${SUBDIR:u:O} .include diff --git a/sys/modules/if_wg/Makefile b/sys/modules/if_wg/Makefile deleted file mode 100644 index 851c55673738..000000000000 --- a/sys/modules/if_wg/Makefile +++ /dev/null @@ -1,12 +0,0 @@ -# $FreeBSD$ - - -KMOD= if_wg - -.PATH: ${SRCTOP}/sys/dev/if_wg - -SRCS= opt_inet.h opt_inet6.h device_if.h bus_if.h ifdi_if.h - -SRCS+= if_wg.c wg_noise.c wg_cookie.c crypto.c - -.include diff --git a/sys/net/if_types.h b/sys/net/if_types.h index 030f773d5654..1103d5f90928 100644 --- a/sys/net/if_types.h +++ b/sys/net/if_types.h @@ -1,277 +1,276 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)if_types.h	8.3 (Berkeley) 4/28/95
 * $FreeBSD$
 * $NetBSD: if_types.h,v 1.16 2000/04/19 06:30:53 itojun Exp $
 */

#ifndef _NET_IF_TYPES_H_
#define _NET_IF_TYPES_H_

/*
 * Interface types for benefit of parsing media address headers.
 * This list is derived from the SNMP list of ifTypes, originally
 * documented in RFC1573, now maintained as:
 *
 *	http://www.iana.org/assignments/smi-numbers
 */

typedef enum {
	IFT_OTHER = 0x1,		/* none of the following */
	IFT_1822 = 0x2,			/* old-style arpanet imp */
	IFT_HDH1822 = 0x3,		/* HDH arpanet imp */
	IFT_X25DDN = 0x4,		/* x25 to imp */
	IFT_X25 = 0x5,			/* PDN X25 interface (RFC877) */
	IFT_ETHER = 0x6,		/* Ethernet CSMA/CD */
	IFT_ISO88023 = 0x7,		/* CSMA/CD */
	IFT_ISO88024 = 0x8,		/* Token Bus */
	IFT_ISO88025 = 0x9,		/* Token Ring */
	IFT_ISO88026 = 0xa,		/* MAN */
	IFT_STARLAN = 0xb,
	IFT_P10 = 0xc,			/* Proteon 10MBit ring */
	IFT_P80 = 0xd,			/* Proteon 80MBit ring */
	IFT_HY = 0xe,			/* Hyperchannel */
	IFT_FDDI = 0xf,
	IFT_LAPB = 0x10,
	IFT_SDLC = 0x11,
	IFT_T1 = 0x12,
	IFT_CEPT = 0x13,		/* E1 - european T1 */
	IFT_ISDNBASIC = 0x14,
	IFT_ISDNPRIMARY = 0x15,
	IFT_PTPSERIAL = 0x16,		/* Proprietary PTP serial */
	IFT_PPP = 0x17,			/* RFC 1331 */
	IFT_LOOP = 0x18,		/* loopback */
	IFT_EON = 0x19,			/* ISO over IP */
	IFT_XETHER = 0x1a,		/* obsolete 3MB experimental ethernet */
	IFT_NSIP = 0x1b,		/* XNS over IP */
	IFT_SLIP = 0x1c,		/* IP over generic TTY */
	IFT_ULTRA = 0x1d,		/* Ultra Technologies */
	IFT_DS3 = 0x1e,			/* Generic T3 */
	IFT_SIP = 0x1f,			/* SMDS */
	IFT_FRELAY = 0x20,		/* Frame Relay DTE only */
	IFT_RS232 = 0x21,
	IFT_PARA = 0x22,		/* parallel-port */
	IFT_ARCNET = 0x23,
	IFT_ARCNETPLUS = 0x24,
	IFT_ATM = 0x25,			/* ATM cells */
	IFT_MIOX25 = 0x26,
	IFT_SONET = 0x27,		/* SONET or SDH */
	IFT_X25PLE = 0x28,
	IFT_ISO88022LLC = 0x29,
	IFT_LOCALTALK = 0x2a,
	IFT_SMDSDXI = 0x2b,
	IFT_FRELAYDCE = 0x2c,		/* Frame Relay DCE */
	IFT_V35 = 0x2d,
	IFT_HSSI = 0x2e,
	IFT_HIPPI = 0x2f,
	IFT_MODEM = 0x30,		/* Generic Modem */
	IFT_AAL5 = 0x31,		/* AAL5 over ATM */
	IFT_SONETPATH = 0x32,
	IFT_SONETVT = 0x33,
	IFT_SMDSICIP = 0x34,		/* SMDS InterCarrier Interface */
	IFT_PROPVIRTUAL = 0x35,		/* Proprietary Virtual/internal */
	IFT_PROPMUX = 0x36,		/* Proprietary Multiplexing */
	IFT_IEEE80212 = 0x37,		/* 100BaseVG */
	IFT_FIBRECHANNEL = 0x38,	/* Fibre Channel */
	IFT_HIPPIINTERFACE = 0x39,	/* HIPPI interfaces */
	IFT_FRAMERELAYINTERCONNECT = 0x3a, /* Obsolete; use either 0x20 or 0x2c */
	IFT_AFLANE8023 = 0x3b,		/* ATM Emulated LAN for 802.3 */
	IFT_AFLANE8025 = 0x3c,		/* ATM Emulated LAN for 802.5 */
	IFT_CCTEMUL = 0x3d,		/* ATM Emulated circuit */
	IFT_FASTETHER = 0x3e,		/* Fast Ethernet (100BaseT) */
	IFT_ISDN = 0x3f,		/* ISDN and X.25 */
	IFT_V11 = 0x40,			/* CCITT V.11/X.21 */
	IFT_V36 = 0x41,			/* CCITT V.36 */
	IFT_G703AT64K = 0x42,		/* CCITT G703 at 64Kbps */
	IFT_G703AT2MB = 0x43,		/* Obsolete, see DS1-MIB */
	IFT_QLLC = 0x44,		/* SNA QLLC */
	IFT_FASTETHERFX = 0x45,		/* Fast Ethernet (100BaseFX) */
	IFT_CHANNEL = 0x46,		/* channel */
	IFT_IEEE80211 = 0x47,		/* radio spread spectrum (unused) */
	IFT_IBM370PARCHAN = 0x48,	/* IBM System 360/370 OEMI Channel */
	IFT_ESCON = 0x49,		/* IBM Enterprise Systems Connection */
	IFT_DLSW = 0x4a,		/* Data Link Switching */
	IFT_ISDNS = 0x4b,		/* ISDN S/T interface */
	IFT_ISDNU = 0x4c,		/* ISDN U interface */
	IFT_LAPD = 0x4d,		/* Link Access Protocol D */
	IFT_IPSWITCH = 0x4e,		/* IP Switching Objects */
	IFT_RSRB = 0x4f,		/* Remote Source Route Bridging */
	IFT_ATMLOGICAL = 0x50,		/* ATM Logical Port */
	IFT_DS0 = 0x51,			/* Digital Signal Level 0 */
	IFT_DS0BUNDLE = 0x52,		/* group of ds0s on the same ds1 */
	IFT_BSC = 0x53,			/* Bisynchronous Protocol */
	IFT_ASYNC = 0x54,		/* Asynchronous Protocol */
	IFT_CNR = 0x55,			/* Combat Net Radio */
	IFT_ISO88025DTR = 0x56,		/* ISO 802.5r DTR */
	IFT_EPLRS = 0x57,		/* Ext Pos Loc Report Sys */
	IFT_ARAP = 0x58,		/* Appletalk Remote Access Protocol */
	IFT_PROPCNLS = 0x59,		/* Proprietary Connectionless Protocol*/
	IFT_HOSTPAD = 0x5a,		/* CCITT-ITU X.29 PAD Protocol */
	IFT_TERMPAD = 0x5b,		/* CCITT-ITU X.3 PAD Facility */
	IFT_FRAMERELAYMPI = 0x5c,	/* Multiproto Interconnect over FR */
	IFT_X213 = 0x5d,		/* CCITT-ITU X213 */
	IFT_ADSL = 0x5e,		/* Asymmetric Digital Subscriber Loop */
	IFT_RADSL = 0x5f,		/* Rate-Adapt. Digital Subscriber Loop*/
	IFT_SDSL = 0x60,		/* Symmetric Digital Subscriber Loop */
	IFT_VDSL = 0x61,		/* Very H-Speed Digital Subscrib. Loop*/
	IFT_ISO88025CRFPINT = 0x62,	/* ISO 802.5 CRFP */
	IFT_MYRINET = 0x63,		/* Myricom Myrinet */
	IFT_VOICEEM = 0x64,		/* voice recEive and transMit */
	IFT_VOICEFXO = 0x65,		/* voice Foreign Exchange Office */
	IFT_VOICEFXS = 0x66,		/* voice Foreign Exchange Station */
	IFT_VOICEENCAP = 0x67,		/* voice encapsulation */
	IFT_VOICEOVERIP = 0x68,		/* voice over IP encapsulation */
	IFT_ATMDXI = 0x69,		/* ATM DXI */
	IFT_ATMFUNI = 0x6a,		/* ATM FUNI */
	IFT_ATMIMA = 0x6b,		/* ATM IMA */
	IFT_PPPMULTILINKBUNDLE = 0x6c,	/* PPP Multilink Bundle */
	IFT_IPOVERCDLC = 0x6d,		/* IBM ipOverCdlc */
	IFT_IPOVERCLAW = 0x6e,		/* IBM Common Link Access to Workstn */
	IFT_STACKTOSTACK = 0x6f,	/* IBM stackToStack */
	IFT_VIRTUALIPADDRESS = 0x70,	/* IBM VIPA */
	IFT_MPC = 0x71,			/* IBM multi-protocol channel support */
	IFT_IPOVERATM = 0x72,		/* IBM ipOverAtm */
	IFT_ISO88025FIBER = 0x73,	/* ISO 802.5j Fiber Token Ring */
	IFT_TDLC = 0x74,		/* IBM twinaxial data link control */
	IFT_GIGABITETHERNET = 0x75,	/* Gigabit Ethernet */
	IFT_HDLC = 0x76,		/* HDLC */
	IFT_LAPF = 0x77,		/* LAP F */
	IFT_V37 = 0x78,			/* V.37 */
	IFT_X25MLP = 0x79,		/* Multi-Link Protocol */
	IFT_X25HUNTGROUP = 0x7a,	/* X25 Hunt Group */
	IFT_TRANSPHDLC = 0x7b,		/* Transp HDLC */
	IFT_INTERLEAVE = 0x7c,		/* Interleave channel */
	IFT_FAST = 0x7d,		/* Fast channel */
	IFT_IP = 0x7e,			/* IP (for APPN HPR in IP networks) */
	IFT_DOCSCABLEMACLAYER = 0x7f,	/* CATV Mac Layer */
	IFT_DOCSCABLEDOWNSTREAM = 0x80,	/* CATV Downstream interface */
	IFT_DOCSCABLEUPSTREAM = 0x81,	/* CATV Upstream interface */
	IFT_A12MPPSWITCH = 0x82,	/* Avalon Parallel Processor */
	IFT_TUNNEL = 0x83,		/* Encapsulation interface */
	IFT_COFFEE = 0x84,		/* coffee pot */
	IFT_CES = 0x85,			/* Circuit Emulation Service */
	IFT_ATMSUBINTERFACE = 0x86,	/* (x) ATM Sub Interface */
	IFT_L2VLAN = 0x87,		/* Layer 2 Virtual LAN using 802.1Q */
	IFT_L3IPVLAN = 0x88,		/* Layer 3 Virtual LAN - IP Protocol */
	IFT_L3IPXVLAN = 0x89,		/* Layer 3 Virtual LAN - IPX Prot. */
	IFT_DIGITALPOWERLINE = 0x8a,	/* IP over Power Lines */
	IFT_MEDIAMAILOVERIP = 0x8b,	/* (xxx) Multimedia Mail over IP */
	IFT_DTM = 0x8c,			/* Dynamic synchronous Transfer Mode */
	IFT_DCN = 0x8d,			/* Data Communications Network */
	IFT_IPFORWARD = 0x8e,		/* IP Forwarding Interface */
	IFT_MSDSL = 0x8f,		/* Multi-rate Symmetric DSL */
	IFT_IEEE1394 = 0x90,		/* IEEE1394 High Performance SerialBus*/
	IFT_IFGSN = 0x91,		/* HIPPI-6400 */
	IFT_DVBRCCMACLAYER = 0x92,	/* DVB-RCC MAC Layer */
	IFT_DVBRCCDOWNSTREAM = 0x93,	/* DVB-RCC Downstream Channel */
	IFT_DVBRCCUPSTREAM = 0x94,	/* DVB-RCC Upstream Channel */
	IFT_ATMVIRTUAL = 0x95,		/* ATM Virtual Interface */
	IFT_MPLSTUNNEL = 0x96,		/* MPLS Tunnel Virtual Interface */
	IFT_SRP = 0x97,			/* Spatial Reuse Protocol */
	IFT_VOICEOVERATM = 0x98,	/* Voice over ATM */
	IFT_VOICEOVERFRAMERELAY = 0x99,	/* Voice Over Frame Relay */
	IFT_IDSL = 0x9a,		/* Digital Subscriber Loop over ISDN */
	IFT_COMPOSITELINK = 0x9b,	/* Avici Composite Link Interface */
	IFT_SS7SIGLINK = 0x9c,		/* SS7 Signaling Link */
	IFT_PROPWIRELESSP2P = 0x9d,	/* Prop. P2P wireless interface */
	IFT_FRFORWARD = 0x9e,		/* Frame forward Interface */
	IFT_RFC1483 = 0x9f,		/* Multiprotocol over ATM AAL5 */
	IFT_USB = 0xa0,			/* USB Interface */
	IFT_IEEE8023ADLAG = 0xa1,	/* IEEE 802.3ad Link Aggregate*/
	IFT_BGPPOLICYACCOUNTING = 0xa2,	/* BGP Policy Accounting */
	IFT_FRF16MFRBUNDLE = 0xa3,	/* FRF.16 Multilink Frame Relay*/
	IFT_H323GATEKEEPER = 0xa4,	/* H323 Gatekeeper */
	IFT_H323PROXY = 0xa5,		/* H323 Voice and Video Proxy */
	IFT_MPLS = 0xa6,		/* MPLS */
	IFT_MFSIGLINK = 0xa7,		/* Multi-frequency signaling link */
	IFT_HDSL2 = 0xa8,		/* High Bit-Rate DSL, 2nd gen. */
	IFT_SHDSL = 0xa9,		/* Multirate HDSL2 */
	IFT_DS1FDL = 0xaa,		/* Facility Data Link (4Kbps) on a DS1*/
	IFT_POS = 0xab,			/* Packet over SONET/SDH Interface */
	IFT_DVBASILN = 0xac,		/* DVB-ASI Input */
	IFT_DVBASIOUT = 0xad,		/* DVB-ASI Output */
	IFT_PLC = 0xae,			/* Power Line Communications */
	IFT_NFAS = 0xaf,		/* Non-Facility Associated Signaling */
	IFT_TR008 = 0xb0,		/* TR008 */
	IFT_GR303RDT = 0xb1,		/* Remote Digital Terminal */
	IFT_GR303IDT = 0xb2,		/* Integrated Digital Terminal */
	IFT_ISUP = 0xb3,		/* ISUP */
	IFT_PROPDOCSWIRELESSMACLAYER = 0xb4,	/* prop/Wireless MAC Layer */
	IFT_PROPDOCSWIRELESSDOWNSTREAM = 0xb5,	/* prop/Wireless Downstream */
	IFT_PROPDOCSWIRELESSUPSTREAM = 0xb6,	/* prop/Wireless Upstream */
	IFT_HIPERLAN2 = 0xb7,		/* HIPERLAN Type 2 Radio Interface */
	IFT_PROPBWAP2MP = 0xb8,		/* PropBroadbandWirelessAccess P2MP*/
	IFT_SONETOVERHEADCHANNEL = 0xb9,	/* SONET Overhead Channel */
	IFT_DIGITALWRAPPEROVERHEADCHANNEL = 0xba, /* Digital Wrapper Overhead */
	IFT_AAL2 = 0xbb,		/* ATM adaptation layer 2 */
	IFT_RADIOMAC = 0xbc,		/* MAC layer over radio links */
	IFT_ATMRADIO = 0xbd,		/* ATM over radio links */
	IFT_IMT = 0xbe,			/* Inter-Machine Trunks */
	IFT_MVL = 0xbf,			/* Multiple Virtual Lines DSL */
	IFT_REACHDSL = 0xc0,		/* Long Reach DSL */
	IFT_FRDLCIENDPT = 0xc1,		/* Frame Relay DLCI End Point */
	IFT_ATMVCIENDPT = 0xc2,		/* ATM VCI End Point */
	IFT_OPTICALCHANNEL = 0xc3,	/* Optical Channel */
	IFT_OPTICALTRANSPORT = 0xc4,	/* Optical Transport */
	IFT_INFINIBAND = 0xc7,		/* Infiniband */
	IFT_INFINIBANDLAG = 0xc8,	/* Infiniband Link Aggregate */
	IFT_BRIDGE = 0xd1,		/* Transparent bridge interface */
	IFT_STF = 0xd7,			/* 6to4 interface */

	/*
	 * Not based on IANA assignments. Conflicting with IANA assignments.
	 * We should probably make them negative.
	 * This requires changes to struct if_data.
	 */
	IFT_GIF = 0xf0,			/* Generic tunnel interface */
	IFT_PVC = 0xf1,			/* Unused */
	IFT_ENC = 0xf4,			/* Encapsulating interface */
	IFT_PFLOG = 0xf6,		/* PF packet filter logging */
	IFT_PFSYNC = 0xf7,		/* PF packet filter synchronization */
-	IFT_WIREGUARD = 0xf8,		/* WireGuard tunnel */
} ifType;

/*
 * Some (broken) software uses #ifdef IFT_TYPE to check whether
 * an operating system supports a certain interface type. Lack of the
 * #define leads to that piece of functionality being compiled out.
 */
#ifndef BURN_BRIDGES
#define	IFT_BRIDGE	IFT_BRIDGE
#define	IFT_PPP		IFT_PPP
#define	IFT_PROPVIRTUAL	IFT_PROPVIRTUAL
#define	IFT_L2VLAN	IFT_L2VLAN
#define	IFT_L3IPVLAN	IFT_L3IPVLAN
#define	IFT_IEEE1394	IFT_IEEE1394
#define	IFT_INFINIBAND	IFT_INFINIBAND
#endif

#endif /* !_NET_IF_TYPES_H_ */
diff --git a/sys/netinet6/nd6.c b/sys/netinet6/nd6.c
index 7937749c1299..62f0ac733a23 100644
--- a/sys/netinet6/nd6.c
+++ b/sys/netinet6/nd6.c
@@ -1,2662 +1,2662 @@
/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the project nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
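 *
 * An aside on the BURN_BRIDGES block in if_types.h above, as a hedged,
 * standalone illustration (the EX_* names are hypothetical and not part
 * of this change): enum constants are invisible to the preprocessor, so
 * each macro is defined to expand to itself, which keeps "#ifdef IFT_FOO"
 * probes in third-party software working while C code still sees the enum.
 *
 *	#include <stdio.h>
 *
 *	typedef enum { EX_IFT_PPP = 0x17, EX_IFT_BRIDGE = 0xd1 } ex_ifType;
 *	#define EX_IFT_BRIDGE EX_IFT_BRIDGE	// makes #ifdef succeed
 *
 *	int
 *	main(void)
 *	{
 *	#ifdef EX_IFT_BRIDGE
 *		printf("bridge known: %#x\n", (unsigned)EX_IFT_BRIDGE);
 *	#endif
 *	#ifndef EX_IFT_PPP	// no self-define: cpp cannot see it
 *		printf("still usable in C: %#x\n", (unsigned)EX_IFT_PPP);
 *	#endif
 *		return (0);
 *	}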
* * $KAME: nd6.c,v 1.144 2001/05/24 07:44:00 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_route.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ND6_SLOWTIMER_INTERVAL (60 * 60) /* 1 hour */ #define ND6_RECALC_REACHTM_INTERVAL (60 * 120) /* 2 hours */ #define SIN6(s) ((const struct sockaddr_in6 *)(s)) MALLOC_DEFINE(M_IP6NDP, "ip6ndp", "IPv6 Neighbor Discovery"); /* timer values */ VNET_DEFINE(int, nd6_prune) = 1; /* walk list every 1 seconds */ VNET_DEFINE(int, nd6_delay) = 5; /* delay first probe time 5 second */ VNET_DEFINE(int, nd6_umaxtries) = 3; /* maximum unicast query */ VNET_DEFINE(int, nd6_mmaxtries) = 3; /* maximum multicast query */ VNET_DEFINE(int, nd6_useloopback) = 1; /* use loopback interface for * local traffic */ VNET_DEFINE(int, nd6_gctimer) = (60 * 60 * 24); /* 1 day: garbage * collection timer */ /* preventing too many loops in ND option parsing */ VNET_DEFINE_STATIC(int, nd6_maxndopt) = 10; /* max # of ND options allowed */ VNET_DEFINE(int, nd6_maxnudhint) = 0; /* max # of subsequent upper * layer hints */ VNET_DEFINE_STATIC(int, nd6_maxqueuelen) = 16; /* max pkts cached in unresolved * ND entries */ #define V_nd6_maxndopt VNET(nd6_maxndopt) #define V_nd6_maxqueuelen VNET(nd6_maxqueuelen) #ifdef ND6_DEBUG VNET_DEFINE(int, nd6_debug) = 1; #else VNET_DEFINE(int, nd6_debug) = 0; #endif static eventhandler_tag lle_event_eh, iflladdr_event_eh, ifnet_link_event_eh; VNET_DEFINE(struct nd_prhead, nd_prefix); VNET_DEFINE(struct rwlock, nd6_lock); VNET_DEFINE(uint64_t, nd6_list_genid); VNET_DEFINE(struct mtx, nd6_onlink_mtx); VNET_DEFINE(int, nd6_recalc_reachtm_interval) = ND6_RECALC_REACHTM_INTERVAL; #define V_nd6_recalc_reachtm_interval VNET(nd6_recalc_reachtm_interval) int (*send_sendso_input_hook)(struct mbuf *, struct ifnet *, int, int); static int nd6_is_new_addr_neighbor(const struct sockaddr_in6 *, struct ifnet *); static void nd6_setmtu0(struct ifnet *, struct nd_ifinfo *); static void nd6_slowtimo(void *); static int regen_tmpaddr(struct in6_ifaddr *); static void nd6_free(struct llentry **, int); static void nd6_free_redirect(const struct llentry *); static void nd6_llinfo_timer(void *); static void nd6_llinfo_settimer_locked(struct llentry *, long); static void clear_llinfo_pqueue(struct llentry *); static int nd6_resolve_slow(struct ifnet *, int, struct mbuf *, const struct sockaddr_in6 *, u_char *, uint32_t *, struct llentry **); static int nd6_need_cache(struct ifnet *); VNET_DEFINE_STATIC(struct callout, nd6_slowtimo_ch); #define V_nd6_slowtimo_ch VNET(nd6_slowtimo_ch) VNET_DEFINE_STATIC(struct callout, nd6_timer_ch); #define V_nd6_timer_ch VNET(nd6_timer_ch) SYSCTL_DECL(_net_inet6_icmp6); static void nd6_lle_event(void *arg __unused, struct llentry *lle, int evt) { struct rt_addrinfo rtinfo; struct sockaddr_in6 dst; struct sockaddr_dl gw; struct ifnet *ifp; int type; int fibnum; LLE_WLOCK_ASSERT(lle); if (lltable_get_af(lle->lle_tbl) != AF_INET6) return; switch (evt) { case LLENTRY_RESOLVED: type = RTM_ADD; KASSERT(lle->la_flags & LLE_VALID, ("%s: %p resolved but not valid?", __func__, lle)); break; case LLENTRY_EXPIRED: type = RTM_DELETE; break; default: return; } ifp = 
lltable_get_ifp(lle->lle_tbl); bzero(&dst, sizeof(dst)); bzero(&gw, sizeof(gw)); bzero(&rtinfo, sizeof(rtinfo)); lltable_fill_sa_entry(lle, (struct sockaddr *)&dst); dst.sin6_scope_id = in6_getscopezone(ifp, in6_addrscope(&dst.sin6_addr)); gw.sdl_len = sizeof(struct sockaddr_dl); gw.sdl_family = AF_LINK; gw.sdl_alen = ifp->if_addrlen; gw.sdl_index = ifp->if_index; gw.sdl_type = ifp->if_type; if (evt == LLENTRY_RESOLVED) bcopy(lle->ll_addr, gw.sdl_data, ifp->if_addrlen); rtinfo.rti_info[RTAX_DST] = (struct sockaddr *)&dst; rtinfo.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&gw; rtinfo.rti_addrs = RTA_DST | RTA_GATEWAY; fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS : ifp->if_fib; rt_missmsg_fib(type, &rtinfo, RTF_HOST | RTF_LLDATA | ( type == RTM_ADD ? RTF_UP: 0), 0, fibnum); } /* * A handler for interface link layer address change event. */ static void nd6_iflladdr(void *arg __unused, struct ifnet *ifp) { if (ifp->if_afdata[AF_INET6] == NULL) return; lltable_update_ifaddr(LLTABLE6(ifp)); } void nd6_init(void) { mtx_init(&V_nd6_onlink_mtx, "nd6 onlink", NULL, MTX_DEF); rw_init(&V_nd6_lock, "nd6 list"); LIST_INIT(&V_nd_prefix); nd6_defrouter_init(); /* Start timers. */ callout_init(&V_nd6_slowtimo_ch, 0); callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz, nd6_slowtimo, curvnet); callout_init(&V_nd6_timer_ch, 0); callout_reset(&V_nd6_timer_ch, hz, nd6_timer, curvnet); nd6_dad_init(); if (IS_DEFAULT_VNET(curvnet)) { lle_event_eh = EVENTHANDLER_REGISTER(lle_event, nd6_lle_event, NULL, EVENTHANDLER_PRI_ANY); iflladdr_event_eh = EVENTHANDLER_REGISTER(iflladdr_event, nd6_iflladdr, NULL, EVENTHANDLER_PRI_ANY); ifnet_link_event_eh = EVENTHANDLER_REGISTER(ifnet_link_event, nd6_ifnet_link_event, NULL, EVENTHANDLER_PRI_ANY); } } #ifdef VIMAGE void nd6_destroy() { callout_drain(&V_nd6_slowtimo_ch); callout_drain(&V_nd6_timer_ch); if (IS_DEFAULT_VNET(curvnet)) { EVENTHANDLER_DEREGISTER(ifnet_link_event, ifnet_link_event_eh); EVENTHANDLER_DEREGISTER(lle_event, lle_event_eh); EVENTHANDLER_DEREGISTER(iflladdr_event, iflladdr_event_eh); } rw_destroy(&V_nd6_lock); mtx_destroy(&V_nd6_onlink_mtx); } #endif struct nd_ifinfo * nd6_ifattach(struct ifnet *ifp) { struct nd_ifinfo *nd; nd = malloc(sizeof(*nd), M_IP6NDP, M_WAITOK | M_ZERO); nd->initialized = 1; nd->chlim = IPV6_DEFHLIM; nd->basereachable = REACHABLE_TIME; nd->reachable = ND_COMPUTE_RTIME(nd->basereachable); nd->retrans = RETRANS_TIMER; nd->flags = ND6_IFF_PERFORMNUD; /* Set IPv6 disabled on all interfaces but loopback by default. */ if ((ifp->if_flags & IFF_LOOPBACK) == 0) nd->flags |= ND6_IFF_IFDISABLED; /* A loopback interface always has ND6_IFF_AUTO_LINKLOCAL. * XXXHRS: Clear ND6_IFF_AUTO_LINKLOCAL on an IFT_BRIDGE interface by * default regardless of the V_ip6_auto_linklocal configuration to * give a reasonable default behavior. */ - if ((V_ip6_auto_linklocal && ifp->if_type != IFT_BRIDGE && - ifp->if_type != IFT_WIREGUARD) || (ifp->if_flags & IFF_LOOPBACK)) + if ((V_ip6_auto_linklocal && ifp->if_type != IFT_BRIDGE) || + (ifp->if_flags & IFF_LOOPBACK)) nd->flags |= ND6_IFF_AUTO_LINKLOCAL; /* * A loopback interface does not need to accept RTADV. * XXXHRS: Clear ND6_IFF_ACCEPT_RTADV on an IFT_BRIDGE interface by * default regardless of the V_ip6_accept_rtadv configuration to * prevent the interface from accepting RA messages arrived * on one of the member interfaces with ND6_IFF_ACCEPT_RTADV. 
*/ if (V_ip6_accept_rtadv && !(ifp->if_flags & IFF_LOOPBACK) && (ifp->if_type != IFT_BRIDGE)) { nd->flags |= ND6_IFF_ACCEPT_RTADV; /* If we globally accept rtadv, assume IPv6 on. */ nd->flags &= ~ND6_IFF_IFDISABLED; } if (V_ip6_no_radr && !(ifp->if_flags & IFF_LOOPBACK)) nd->flags |= ND6_IFF_NO_RADR; /* XXX: we cannot call nd6_setmtu since ifp is not fully initialized */ nd6_setmtu0(ifp, nd); return nd; } void nd6_ifdetach(struct ifnet *ifp, struct nd_ifinfo *nd) { struct epoch_tracker et; struct ifaddr *ifa, *next; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, next) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; /* stop DAD processing */ nd6_dad_stop(ifa); } NET_EPOCH_EXIT(et); free(nd, M_IP6NDP); } /* * Reset ND level link MTU. This function is called when the physical MTU * changes, which means we might have to adjust the ND level MTU. */ void nd6_setmtu(struct ifnet *ifp) { if (ifp->if_afdata[AF_INET6] == NULL) return; nd6_setmtu0(ifp, ND_IFINFO(ifp)); } /* XXX todo: do not maintain copy of ifp->if_mtu in ndi->maxmtu */ void nd6_setmtu0(struct ifnet *ifp, struct nd_ifinfo *ndi) { u_int32_t omaxmtu; omaxmtu = ndi->maxmtu; ndi->maxmtu = ifp->if_mtu; /* * Decreasing the interface MTU under IPV6 minimum MTU may cause * undesirable situation. We thus notify the operator of the change * explicitly. The check for omaxmtu is necessary to restrict the * log to the case of changing the MTU, not initializing it. */ if (omaxmtu >= IPV6_MMTU && ndi->maxmtu < IPV6_MMTU) { log(LOG_NOTICE, "nd6_setmtu0: " "new link MTU on %s (%lu) is too small for IPv6\n", if_name(ifp), (unsigned long)ndi->maxmtu); } if (ndi->maxmtu > V_in6_maxmtu) in6_setmaxmtu(); /* check all interfaces just in case */ } void nd6_option_init(void *opt, int icmp6len, union nd_opts *ndopts) { bzero(ndopts, sizeof(*ndopts)); ndopts->nd_opts_search = (struct nd_opt_hdr *)opt; ndopts->nd_opts_last = (struct nd_opt_hdr *)(((u_char *)opt) + icmp6len); if (icmp6len == 0) { ndopts->nd_opts_done = 1; ndopts->nd_opts_search = NULL; } } /* * Take one ND option. */ struct nd_opt_hdr * nd6_option(union nd_opts *ndopts) { struct nd_opt_hdr *nd_opt; int olen; KASSERT(ndopts != NULL, ("%s: ndopts == NULL", __func__)); KASSERT(ndopts->nd_opts_last != NULL, ("%s: uninitialized ndopts", __func__)); if (ndopts->nd_opts_search == NULL) return NULL; if (ndopts->nd_opts_done) return NULL; nd_opt = ndopts->nd_opts_search; /* make sure nd_opt_len is inside the buffer */ if ((caddr_t)&nd_opt->nd_opt_len >= (caddr_t)ndopts->nd_opts_last) { bzero(ndopts, sizeof(*ndopts)); return NULL; } olen = nd_opt->nd_opt_len << 3; if (olen == 0) { /* * Message validation requires that all included * options have a length that is greater than zero. */ bzero(ndopts, sizeof(*ndopts)); return NULL; } ndopts->nd_opts_search = (struct nd_opt_hdr *)((caddr_t)nd_opt + olen); if (ndopts->nd_opts_search > ndopts->nd_opts_last) { /* option overruns the end of buffer, invalid */ bzero(ndopts, sizeof(*ndopts)); return NULL; } else if (ndopts->nd_opts_search == ndopts->nd_opts_last) { /* reached the end of options chain */ ndopts->nd_opts_done = 1; ndopts->nd_opts_search = NULL; } return nd_opt; } /* * Parse multiple ND options. * This function is much easier to use, for ND routines that do not need * multiple options of the same type. 
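 *
 * As an aside, a self-contained sketch (hypothetical ex_* names) of the
 * bounds rule that nd6_option() enforces above: nd_opt_len counts
 * 8-octet units, so a zero length is invalid and "len << 3" advances
 * the cursor.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	struct ex_opt_hdr {
 *		uint8_t type;
 *		uint8_t len;	// in units of 8 octets (RFC 4861)
 *	};
 *
 *	// Return the option at *cur and advance, or NULL when invalid/done.
 *	static const struct ex_opt_hdr *
 *	ex_next_opt(const uint8_t **cur, const uint8_t *end)
 *	{
 *		const struct ex_opt_hdr *opt;
 *		size_t olen;
 *
 *		if (*cur == NULL || (size_t)(end - *cur) < sizeof(*opt))
 *			return (NULL);
 *		opt = (const struct ex_opt_hdr *)*cur;
 *		olen = (size_t)opt->len << 3;
 *		if (olen == 0 || olen > (size_t)(end - *cur))
 *			return (NULL);	// zero-length or overruns buffer
 *		*cur += olen;
 *		return (opt);
 *	}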
*/ int nd6_options(union nd_opts *ndopts) { struct nd_opt_hdr *nd_opt; int i = 0; KASSERT(ndopts != NULL, ("%s: ndopts == NULL", __func__)); KASSERT(ndopts->nd_opts_last != NULL, ("%s: uninitialized ndopts", __func__)); if (ndopts->nd_opts_search == NULL) return 0; while (1) { nd_opt = nd6_option(ndopts); if (nd_opt == NULL && ndopts->nd_opts_last == NULL) { /* * Message validation requires that all included * options have a length that is greater than zero. */ ICMP6STAT_INC(icp6s_nd_badopt); bzero(ndopts, sizeof(*ndopts)); return -1; } if (nd_opt == NULL) goto skip1; switch (nd_opt->nd_opt_type) { case ND_OPT_SOURCE_LINKADDR: case ND_OPT_TARGET_LINKADDR: case ND_OPT_MTU: case ND_OPT_REDIRECTED_HEADER: case ND_OPT_NONCE: if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) { nd6log((LOG_INFO, "duplicated ND6 option found (type=%d)\n", nd_opt->nd_opt_type)); /* XXX bark? */ } else { ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } break; case ND_OPT_PREFIX_INFORMATION: if (ndopts->nd_opt_array[nd_opt->nd_opt_type] == 0) { ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } ndopts->nd_opts_pi_end = (struct nd_opt_prefix_info *)nd_opt; break; /* What about ND_OPT_ROUTE_INFO? RFC 4191 */ case ND_OPT_RDNSS: /* RFC 6106 */ case ND_OPT_DNSSL: /* RFC 6106 */ /* * Silently ignore options we know and do not care about * in the kernel. */ break; default: /* * Unknown options must be silently ignored, * to accommodate future extension to the protocol. */ nd6log((LOG_DEBUG, "nd6_options: unsupported option %d - " "option ignored\n", nd_opt->nd_opt_type)); } skip1: i++; if (i > V_nd6_maxndopt) { ICMP6STAT_INC(icp6s_nd_toomanyopt); nd6log((LOG_INFO, "too many loop in nd opt\n")); break; } if (ndopts->nd_opts_done) break; } return 0; } /* * ND6 timer routine to handle ND6 entries */ static void nd6_llinfo_settimer_locked(struct llentry *ln, long tick) { int canceled; LLE_WLOCK_ASSERT(ln); if (tick < 0) { ln->la_expire = 0; ln->ln_ntick = 0; canceled = callout_stop(&ln->lle_timer); } else { ln->la_expire = time_uptime + tick / hz; LLE_ADDREF(ln); if (tick > INT_MAX) { ln->ln_ntick = tick - INT_MAX; canceled = callout_reset(&ln->lle_timer, INT_MAX, nd6_llinfo_timer, ln); } else { ln->ln_ntick = 0; canceled = callout_reset(&ln->lle_timer, tick, nd6_llinfo_timer, ln); } } if (canceled > 0) LLE_REMREF(ln); } /* * Gets source address of the first packet in hold queue * and stores it in @src. * Returns pointer to @src (if hold queue is not empty) or NULL. * * Set noinline to be dtrace-friendly */ static __noinline struct in6_addr * nd6_llinfo_get_holdsrc(struct llentry *ln, struct in6_addr *src) { struct ip6_hdr hdr; struct mbuf *m; if (ln->la_hold == NULL) return (NULL); /* * assume every packet in la_hold has the same IP header */ m = ln->la_hold; if (sizeof(hdr) > m->m_len) return (NULL); m_copydata(m, 0, sizeof(hdr), (caddr_t)&hdr); *src = hdr.ip6_src; return (src); } /* * Checks if we need to switch from STALE state. * * RFC 4861 requires switching from STALE to DELAY state * on first packet matching entry, waiting V_nd6_delay and * transition to PROBE state (if upper layer confirmation was * not received). * * This code performs a bit differently: * On packet hit we don't change state (but desired state * can be guessed by control plane). However, after V_nd6_delay * seconds code will transition to PROBE state (so DELAY state * is kinda skipped in most situations). 
 *
 * Typically, V_nd6_gctimer is bigger than V_nd6_delay, so
 * we perform the following upon entering STALE state:
 *
 * 1) Arm timer to run each V_nd6_delay seconds to make sure that
 *    if packet was transmitted at the start of given interval, we
 *    would be able to switch to PROBE state in V_nd6_delay seconds
 *    as user expects.
 *
 * 2) Reschedule timer until original V_nd6_gctimer expires keeping
 *    lle in STALE state (remaining timer value stored in lle_remtime).
 *
 * 3) Reschedule timer if packet was transmitted less than V_nd6_delay
 *    seconds ago.
 *
 * Returns non-zero value if the entry is still STALE (storing
 * the next timer interval in @pdelay).
 *
 * Returns zero value if original timer expired or we need to switch to
 * PROBE (store that in @do_switch variable).
 */
static int
nd6_is_stale(struct llentry *lle, long *pdelay, int *do_switch)
{
	int nd_delay, nd_gctimer, r_skip_req;
	time_t lle_hittime;
	long delay;

	*do_switch = 0;
	nd_gctimer = V_nd6_gctimer;
	nd_delay = V_nd6_delay;

	LLE_REQ_LOCK(lle);
	r_skip_req = lle->r_skip_req;
	lle_hittime = lle->lle_hittime;
	LLE_REQ_UNLOCK(lle);

	if (r_skip_req > 0) {
		/*
		 * Nonzero r_skip_req value was set upon entering
		 * STALE state. Since value was not changed, no
		 * packets were passed using this lle. Ask for
		 * timer reschedule and keep STALE state.
		 */
		delay = (long)(MIN(nd_gctimer, nd_delay));
		delay *= hz;
		if (lle->lle_remtime > delay)
			lle->lle_remtime -= delay;
		else {
			delay = lle->lle_remtime;
			lle->lle_remtime = 0;
		}
		if (delay == 0) {
			/*
			 * The original nd6_gctimer timeout ended,
			 * no more rescheduling.
			 */
			return (0);
		}
		*pdelay = delay;
		return (1);
	}

	/*
	 * Packet received. Verify timestamp
	 */
	delay = (long)(time_uptime - lle_hittime);
	if (delay < nd_delay) {
		/*
		 * V_nd6_delay still not passed since the first
		 * hit in STALE state.
		 * Reschedule timer and return.
		 */
		*pdelay = (long)(nd_delay - delay) * hz;
		return (1);
	}

	/* Request switching to probe */
	*do_switch = 1;
	return (0);
}

/*
 * Switch @lle state to new state optionally arming timers.
 *
 * Set noinline to be dtrace-friendly
 */
__noinline void
nd6_llinfo_setstate(struct llentry *lle, int newstate)
{
	struct ifnet *ifp;
	int nd_gctimer, nd_delay;
	long delay, remtime;

	delay = 0;
	remtime = 0;

	switch (newstate) {
	case ND6_LLINFO_INCOMPLETE:
		ifp = lle->lle_tbl->llt_ifp;
		delay = (long)ND_IFINFO(ifp)->retrans * hz / 1000;
		break;
	case ND6_LLINFO_REACHABLE:
		if (!ND6_LLINFO_PERMANENT(lle)) {
			ifp = lle->lle_tbl->llt_ifp;
			delay = (long)ND_IFINFO(ifp)->reachable * hz;
		}
		break;
	case ND6_LLINFO_STALE:
		/*
		 * Notify fast path that we want to know if any packet
		 * is transmitted by setting r_skip_req.
		 */
		LLE_REQ_LOCK(lle);
		lle->r_skip_req = 1;
		LLE_REQ_UNLOCK(lle);
		nd_delay = V_nd6_delay;
		nd_gctimer = V_nd6_gctimer;

		delay = (long)(MIN(nd_gctimer, nd_delay)) * hz;
		remtime = (long)nd_gctimer * hz - delay;
		break;
	case ND6_LLINFO_DELAY:
		lle->la_asked = 0;
		delay = (long)V_nd6_delay * hz;
		break;
	}

	if (delay > 0)
		nd6_llinfo_settimer_locked(lle, delay);

	lle->lle_remtime = remtime;
	lle->ln_state = newstate;
}

/*
 * Timer-dependent part of nd state machine.
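 *
 * As an aside, the STALE rescheduling arithmetic from nd6_is_stale()
 * above in isolation (hypothetical names, intervals in seconds rather
 * than ticks): the timer fires every min(nd6_gctimer, nd6_delay)
 * seconds while the remaining nd6_gctimer budget is carried in
 * *remtime; a zero return means the GC budget is exhausted.
 *
 *	#define EX_MIN(a, b)	((a) < (b) ? (a) : (b))
 *
 *	static long
 *	ex_stale_next_tick(long *remtime, long nd6_delay, long nd6_gctimer)
 *	{
 *		long delay = EX_MIN(nd6_gctimer, nd6_delay);
 *
 *		if (*remtime > delay)
 *			*remtime -= delay;
 *		else {
 *			delay = *remtime;	// budget tail, then 0
 *			*remtime = 0;
 *		}
 *		return (delay);
 *	}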
 *
 * Set noinline to be dtrace-friendly
 */
static __noinline void
nd6_llinfo_timer(void *arg)
{
	struct epoch_tracker et;
	struct llentry *ln;
	struct in6_addr *dst, *pdst, *psrc, src;
	struct ifnet *ifp;
	struct nd_ifinfo *ndi;
	int do_switch, send_ns;
	long delay;

	KASSERT(arg != NULL, ("%s: arg NULL", __func__));
	ln = (struct llentry *)arg;
	ifp = lltable_get_ifp(ln->lle_tbl);
	CURVNET_SET(ifp->if_vnet);
	ND6_RLOCK();
	LLE_WLOCK(ln);
	if (callout_pending(&ln->lle_timer)) {
		/*
		 * Here we are a bit odd in the treatment of
		 * active/pending. If the pending bit is set, it got
		 * rescheduled before I ran. The active
		 * bit we ignore, since if it was stopped
		 * in ll_tablefree() and was currently running
		 * it would have returned 0 so the code would
		 * not have deleted it since the callout could
		 * not be stopped so we want to go through
		 * with the delete here now. If the callout
		 * was restarted, the pending bit will be back on and
		 * we just want to bail since the callout_reset would
		 * return 1 and our reference would have been removed
		 * by nd6_llinfo_settimer_locked above since canceled
		 * would have been 1.
		 */
		LLE_WUNLOCK(ln);
		ND6_RUNLOCK();
		CURVNET_RESTORE();
		return;
	}
	NET_EPOCH_ENTER(et);
	ndi = ND_IFINFO(ifp);
	send_ns = 0;
	dst = &ln->r_l3addr.addr6;
	pdst = dst;

	if (ln->ln_ntick > 0) {
		if (ln->ln_ntick > INT_MAX) {
			ln->ln_ntick -= INT_MAX;
			nd6_llinfo_settimer_locked(ln, INT_MAX);
		} else {
			ln->ln_ntick = 0;
			nd6_llinfo_settimer_locked(ln, ln->ln_ntick);
		}
		goto done;
	}

	if (ln->la_flags & LLE_STATIC) {
		goto done;
	}

	if (ln->la_flags & LLE_DELETED) {
		nd6_free(&ln, 0);
		goto done;
	}

	switch (ln->ln_state) {
	case ND6_LLINFO_INCOMPLETE:
		if (ln->la_asked < V_nd6_mmaxtries) {
			ln->la_asked++;
			send_ns = 1;
			/* Send NS to multicast address */
			pdst = NULL;
		} else {
			struct mbuf *m = ln->la_hold;
			if (m) {
				struct mbuf *m0;

				/*
				 * assuming every packet in la_hold has the
				 * same IP header. Send error after unlock.
				 */
				m0 = m->m_nextpkt;
				m->m_nextpkt = NULL;
				ln->la_hold = m0;
				clear_llinfo_pqueue(ln);
			}
			nd6_free(&ln, 0);
			if (m != NULL) {
				struct mbuf *n = m;

				/*
				 * if there are any unmapped mbufs, we
				 * must free them, rather than using
				 * them for an ICMP, as they cannot be
				 * checksummed.
				 */
				while ((n = n->m_next) != NULL) {
					if (n->m_flags & M_EXTPG)
						break;
				}
				if (n != NULL) {
					m_freem(m);
					m = NULL;
				} else {
					icmp6_error2(m, ICMP6_DST_UNREACH,
					    ICMP6_DST_UNREACH_ADDR, 0, ifp);
				}
			}
		}
		break;
	case ND6_LLINFO_REACHABLE:
		if (!ND6_LLINFO_PERMANENT(ln))
			nd6_llinfo_setstate(ln, ND6_LLINFO_STALE);
		break;

	case ND6_LLINFO_STALE:
		if (nd6_is_stale(ln, &delay, &do_switch) != 0) {
			/*
			 * No packet has used this entry and GC timeout
			 * has not been passed. Reschedule timer and
			 * return.
			 */
			nd6_llinfo_settimer_locked(ln, delay);
			break;
		}

		if (do_switch == 0) {
			/*
			 * GC timer has ended and entry hasn't been used.
			 * Run Garbage collector (RFC 4861, 5.3)
			 */
			if (!ND6_LLINFO_PERMANENT(ln))
				nd6_free(&ln, 1);
			break;
		}

		/* Entry has been used AND delay timer has ended.
*/ /* FALLTHROUGH */ case ND6_LLINFO_DELAY: if (ndi && (ndi->flags & ND6_IFF_PERFORMNUD) != 0) { /* We need NUD */ ln->la_asked = 1; nd6_llinfo_setstate(ln, ND6_LLINFO_PROBE); send_ns = 1; } else nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); /* XXX */ break; case ND6_LLINFO_PROBE: if (ln->la_asked < V_nd6_umaxtries) { ln->la_asked++; send_ns = 1; } else { nd6_free(&ln, 0); } break; default: panic("%s: paths in a dark night can be confusing: %d", __func__, ln->ln_state); } done: if (ln != NULL) ND6_RUNLOCK(); if (send_ns != 0) { nd6_llinfo_settimer_locked(ln, (long)ndi->retrans * hz / 1000); psrc = nd6_llinfo_get_holdsrc(ln, &src); LLE_FREE_LOCKED(ln); ln = NULL; nd6_ns_output(ifp, psrc, pdst, dst, NULL); } if (ln != NULL) LLE_FREE_LOCKED(ln); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); } /* * ND6 timer routine to expire default route list and prefix list */ void nd6_timer(void *arg) { CURVNET_SET((struct vnet *) arg); struct epoch_tracker et; struct nd_prhead prl; struct nd_prefix *pr, *npr; struct ifnet *ifp; struct in6_ifaddr *ia6, *nia6; uint64_t genid; LIST_INIT(&prl); NET_EPOCH_ENTER(et); nd6_defrouter_timer(); /* * expire interface addresses. * in the past the loop was inside prefix expiry processing. * However, from a stricter speci-confrmance standpoint, we should * rather separate address lifetimes and prefix lifetimes. * * XXXRW: in6_ifaddrhead locking. */ addrloop: CK_STAILQ_FOREACH_SAFE(ia6, &V_in6_ifaddrhead, ia_link, nia6) { /* check address lifetime */ if (IFA6_IS_INVALID(ia6)) { int regen = 0; /* * If the expiring address is temporary, try * regenerating a new one. This would be useful when * we suspended a laptop PC, then turned it on after a * period that could invalidate all temporary * addresses. Although we may have to restart the * loop (see below), it must be after purging the * address. Otherwise, we'd see an infinite loop of * regeneration. */ if (V_ip6_use_tempaddr && (ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0) { if (regen_tmpaddr(ia6) == 0) regen = 1; } in6_purgeaddr(&ia6->ia_ifa); if (regen) goto addrloop; /* XXX: see below */ } else if (IFA6_IS_DEPRECATED(ia6)) { int oldflags = ia6->ia6_flags; ia6->ia6_flags |= IN6_IFF_DEPRECATED; /* * If a temporary address has just become deprecated, * regenerate a new one if possible. */ if (V_ip6_use_tempaddr && (ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && (oldflags & IN6_IFF_DEPRECATED) == 0) { if (regen_tmpaddr(ia6) == 0) { /* * A new temporary address is * generated. * XXX: this means the address chain * has changed while we are still in * the loop. Although the change * would not cause disaster (because * it's not a deletion, but an * addition,) we'd rather restart the * loop just for safety. Or does this * significantly reduce performance?? */ goto addrloop; } } } else if ((ia6->ia6_flags & IN6_IFF_TENTATIVE) != 0) { /* * Schedule DAD for a tentative address. This happens * if the interface was down or not running * when the address was configured. */ int delay; delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz); nd6_dad_start((struct ifaddr *)ia6, delay); } else { /* * Check status of the interface. If it is down, * mark the address as tentative for future DAD. */ ifp = ia6->ia_ifp; if ((ND_IFINFO(ifp)->flags & ND6_IFF_NO_DAD) == 0 && ((ifp->if_flags & IFF_UP) == 0 || (ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) != 0)){ ia6->ia6_flags &= ~IN6_IFF_DUPLICATED; ia6->ia6_flags |= IN6_IFF_TENTATIVE; } /* * A new RA might have made a deprecated address * preferred. 
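 *
 * An aside: the lifetime tests behind IFA6_IS_DEPRECATED() and
 * IFA6_IS_INVALID() modeled standalone (assumed field names, not the
 * kernel structures). An address becomes deprecated once its preferred
 * lifetime passes and invalid once its valid lifetime passes, both
 * measured against time_uptime.
 *
 *	#include <stdbool.h>
 *	#include <stdint.h>
 *
 *	#define EX_INFINITE_LIFETIME 0xffffffffU
 *
 *	struct ex_lifetime {
 *		uint64_t updatetime;	// time_uptime at last update
 *		uint32_t pltime;	// preferred lifetime, seconds
 *		uint32_t vltime;	// valid lifetime, seconds
 *	};
 *
 *	static bool
 *	ex_is_deprecated(const struct ex_lifetime *lt, uint64_t now)
 *	{
 *		return (lt->pltime != EX_INFINITE_LIFETIME &&
 *		    now - lt->updatetime > lt->pltime);
 *	}
 *
 *	static bool
 *	ex_is_invalid(const struct ex_lifetime *lt, uint64_t now)
 *	{
 *		return (lt->vltime != EX_INFINITE_LIFETIME &&
 *		    now - lt->updatetime > lt->vltime);
 *	}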
*/ ia6->ia6_flags &= ~IN6_IFF_DEPRECATED; } } NET_EPOCH_EXIT(et); ND6_WLOCK(); restart: LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, npr) { /* * Expire prefixes. Since the pltime is only used for * autoconfigured addresses, pltime processing for prefixes is * not necessary. * * Only unlink after all derived addresses have expired. This * may not occur until two hours after the prefix has expired * per RFC 4862. If the prefix expires before its derived * addresses, mark it off-link. This will be done automatically * after unlinking if no address references remain. */ if (pr->ndpr_vltime == ND6_INFINITE_LIFETIME || time_uptime - pr->ndpr_lastupdate <= pr->ndpr_vltime) continue; if (pr->ndpr_addrcnt == 0) { nd6_prefix_unlink(pr, &prl); continue; } if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0) { genid = V_nd6_list_genid; nd6_prefix_ref(pr); ND6_WUNLOCK(); ND6_ONLINK_LOCK(); (void)nd6_prefix_offlink(pr); ND6_ONLINK_UNLOCK(); ND6_WLOCK(); nd6_prefix_rele(pr); if (genid != V_nd6_list_genid) goto restart; } } ND6_WUNLOCK(); while ((pr = LIST_FIRST(&prl)) != NULL) { LIST_REMOVE(pr, ndpr_entry); nd6_prefix_del(pr); } callout_reset(&V_nd6_timer_ch, V_nd6_prune * hz, nd6_timer, curvnet); CURVNET_RESTORE(); } /* * ia6 - deprecated/invalidated temporary address */ static int regen_tmpaddr(struct in6_ifaddr *ia6) { struct ifaddr *ifa; struct ifnet *ifp; struct in6_ifaddr *public_ifa6 = NULL; NET_EPOCH_ASSERT(); ifp = ia6->ia_ifa.ifa_ifp; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in6_ifaddr *it6; if (ifa->ifa_addr->sa_family != AF_INET6) continue; it6 = (struct in6_ifaddr *)ifa; /* ignore no autoconf addresses. */ if ((it6->ia6_flags & IN6_IFF_AUTOCONF) == 0) continue; /* ignore autoconf addresses with different prefixes. */ if (it6->ia6_ndpr == NULL || it6->ia6_ndpr != ia6->ia6_ndpr) continue; /* * Now we are looking at an autoconf address with the same * prefix as ours. If the address is temporary and is still * preferred, do not create another one. It would be rare, but * could happen, for example, when we resume a laptop PC after * a long period. */ if ((it6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && !IFA6_IS_DEPRECATED(it6)) { public_ifa6 = NULL; break; } /* * This is a public autoconf address that has the same prefix * as ours. If it is preferred, keep it. We can't break the * loop here, because there may be a still-preferred temporary * address with the prefix. */ if (!IFA6_IS_DEPRECATED(it6)) public_ifa6 = it6; } if (public_ifa6 != NULL) ifa_ref(&public_ifa6->ia_ifa); if (public_ifa6 != NULL) { int e; if ((e = in6_tmpifadd(public_ifa6, 0, 0)) != 0) { ifa_free(&public_ifa6->ia_ifa); log(LOG_NOTICE, "regen_tmpaddr: failed to create a new" " tmp addr,errno=%d\n", e); return (-1); } ifa_free(&public_ifa6->ia_ifa); return (0); } return (-1); } /* * Remove prefix and default router list entries corresponding to ifp. Neighbor * cache entries are freed in in6_domifdetach(). */ void nd6_purge(struct ifnet *ifp) { struct nd_prhead prl; struct nd_prefix *pr, *npr; LIST_INIT(&prl); /* Purge default router list entries toward ifp. */ nd6_defrouter_purge(ifp); ND6_WLOCK(); /* * Remove prefixes on ifp. We should have already removed addresses on * this interface, so no addresses should be referencing these prefixes. */ LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, npr) { if (pr->ndpr_ifp == ifp) nd6_prefix_unlink(pr, &prl); } ND6_WUNLOCK(); /* Delete the unlinked prefix objects. 
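 *
 * The unlink-then-free pattern used here (and in nd6_timer() above),
 * sketched standalone with hypothetical names: dead entries are moved
 * to a private list while the lock is held and destroyed only after it
 * is dropped, keeping lock hold times short.
 *
 *	#include <pthread.h>
 *	#include <stdlib.h>
 *
 *	struct ex_pfx { struct ex_pfx *next; int dead; };
 *
 *	static pthread_mutex_t ex_lock = PTHREAD_MUTEX_INITIALIZER;
 *	static struct ex_pfx *ex_list;
 *
 *	static void
 *	ex_purge_dead(void)
 *	{
 *		struct ex_pfx *reap = NULL, **pp, *p;
 *
 *		pthread_mutex_lock(&ex_lock);
 *		for (pp = &ex_list; (p = *pp) != NULL; ) {
 *			if (p->dead) {
 *				*pp = p->next;	// unlink under the lock
 *				p->next = reap;
 *				reap = p;
 *			} else
 *				pp = &p->next;
 *		}
 *		pthread_mutex_unlock(&ex_lock);
 *		while ((p = reap) != NULL) {	// free after unlocking
 *			reap = p->next;
 *			free(p);
 *		}
 *	}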
*/ while ((pr = LIST_FIRST(&prl)) != NULL) { LIST_REMOVE(pr, ndpr_entry); nd6_prefix_del(pr); } /* cancel default outgoing interface setting */ if (V_nd6_defifindex == ifp->if_index) nd6_setdefaultiface(0); if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) { /* Refresh default router list. */ defrouter_select_fib(ifp->if_fib); } } /* * the caller acquires and releases the lock on the lltbls * Returns the llentry locked */ struct llentry * nd6_lookup(const struct in6_addr *addr6, int flags, struct ifnet *ifp) { struct sockaddr_in6 sin6; struct llentry *ln; bzero(&sin6, sizeof(sin6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_family = AF_INET6; sin6.sin6_addr = *addr6; IF_AFDATA_LOCK_ASSERT(ifp); ln = lla_lookup(LLTABLE6(ifp), flags, (struct sockaddr *)&sin6); return (ln); } static struct llentry * nd6_alloc(const struct in6_addr *addr6, int flags, struct ifnet *ifp) { struct sockaddr_in6 sin6; struct llentry *ln; bzero(&sin6, sizeof(sin6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_family = AF_INET6; sin6.sin6_addr = *addr6; ln = lltable_alloc_entry(LLTABLE6(ifp), 0, (struct sockaddr *)&sin6); if (ln != NULL) ln->ln_state = ND6_LLINFO_NOSTATE; return (ln); } /* * Test whether a given IPv6 address is a neighbor or not, ignoring * the actual neighbor cache. The neighbor cache is ignored in order * to not reenter the routing code from within itself. */ static int nd6_is_new_addr_neighbor(const struct sockaddr_in6 *addr, struct ifnet *ifp) { struct nd_prefix *pr; struct ifaddr *ifa; struct rt_addrinfo info; struct sockaddr_in6 rt_key; const struct sockaddr *dst6; uint64_t genid; int error, fibnum; /* * A link-local address is always a neighbor. * XXX: a link does not necessarily specify a single interface. */ if (IN6_IS_ADDR_LINKLOCAL(&addr->sin6_addr)) { struct sockaddr_in6 sin6_copy; u_int32_t zone; /* * We need sin6_copy since sa6_recoverscope() may modify the * content (XXX). */ sin6_copy = *addr; if (sa6_recoverscope(&sin6_copy)) return (0); /* XXX: should be impossible */ if (in6_setscope(&sin6_copy.sin6_addr, ifp, &zone)) return (0); if (sin6_copy.sin6_scope_id == zone) return (1); else return (0); } bzero(&rt_key, sizeof(rt_key)); bzero(&info, sizeof(info)); info.rti_info[RTAX_DST] = (struct sockaddr *)&rt_key; /* * If the address matches one of our addresses, * it should be a neighbor. * If the address matches one of our on-link prefixes, it should be a * neighbor. */ ND6_RLOCK(); restart: LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { if (pr->ndpr_ifp != ifp) continue; if ((pr->ndpr_stateflags & NDPRF_ONLINK) == 0) { dst6 = (const struct sockaddr *)&pr->ndpr_prefix; /* * We only need to check all FIBs if add_addr_allfibs * is unset. If set, checking any FIB will suffice. */ fibnum = V_rt_add_addr_allfibs ? rt_numfibs - 1 : 0; for (; fibnum < rt_numfibs; fibnum++) { genid = V_nd6_list_genid; ND6_RUNLOCK(); /* * Restore length field before * retrying lookup */ rt_key.sin6_len = sizeof(rt_key); error = rib_lookup_info(fibnum, dst6, 0, 0, &info); ND6_RLOCK(); if (genid != V_nd6_list_genid) goto restart; if (error == 0) break; } if (error != 0) continue; /* * This is the case where multiple interfaces * have the same prefix, but only one is installed * into the routing table and that prefix entry * is not the one being examined here. 
*/ if (!IN6_ARE_ADDR_EQUAL(&pr->ndpr_prefix.sin6_addr, &rt_key.sin6_addr)) continue; } if (IN6_ARE_MASKED_ADDR_EQUAL(&pr->ndpr_prefix.sin6_addr, &addr->sin6_addr, &pr->ndpr_mask)) { ND6_RUNLOCK(); return (1); } } ND6_RUNLOCK(); /* * If the address is assigned on the node of the other side of * a p2p interface, the address should be a neighbor. */ if (ifp->if_flags & IFF_POINTOPOINT) { struct epoch_tracker et; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sin6_family) continue; if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { NET_EPOCH_EXIT(et); return 1; } } NET_EPOCH_EXIT(et); } /* * If the default router list is empty, all addresses are regarded * as on-link, and thus, as a neighbor. */ if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV && nd6_defrouter_list_empty() && V_nd6_defifindex == ifp->if_index) { return (1); } return (0); } /* * Detect if a given IPv6 address identifies a neighbor on a given link. * XXX: should take care of the destination of a p2p link? */ int nd6_is_addr_neighbor(const struct sockaddr_in6 *addr, struct ifnet *ifp) { struct llentry *lle; int rc = 0; NET_EPOCH_ASSERT(); IF_AFDATA_UNLOCK_ASSERT(ifp); if (nd6_is_new_addr_neighbor(addr, ifp)) return (1); /* * Even if the address matches none of our addresses, it might be * in the neighbor cache. */ if ((lle = nd6_lookup(&addr->sin6_addr, 0, ifp)) != NULL) { LLE_RUNLOCK(lle); rc = 1; } return (rc); } /* * Free an nd6 llinfo entry. * Since the function would cause significant changes in the kernel, DO NOT * make it global, unless you have a strong reason for the change, and are sure * that the change is safe. * * Set noinline to be dtrace-friendly */ static __noinline void nd6_free(struct llentry **lnp, int gc) { struct ifnet *ifp; struct llentry *ln; struct nd_defrouter *dr; ln = *lnp; *lnp = NULL; LLE_WLOCK_ASSERT(ln); ND6_RLOCK_ASSERT(); ifp = lltable_get_ifp(ln->lle_tbl); if ((ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) != 0) dr = defrouter_lookup_locked(&ln->r_l3addr.addr6, ifp); else dr = NULL; ND6_RUNLOCK(); if ((ln->la_flags & LLE_DELETED) == 0) EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_EXPIRED); /* * we used to have pfctlinput(PRC_HOSTDEAD) here. * even though it is not harmful, it was not really necessary. */ /* cancel timer */ nd6_llinfo_settimer_locked(ln, -1); if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) { if (dr != NULL && dr->expire && ln->ln_state == ND6_LLINFO_STALE && gc) { /* * If the reason for the deletion is just garbage * collection, and the neighbor is an active default * router, do not delete it. Instead, reset the GC * timer using the router's lifetime. * Simply deleting the entry would affect default * router selection, which is not necessarily a good * thing, especially when we're using router preference * values. * XXX: the check for ln_state would be redundant, * but we intentionally keep it just in case. */ if (dr->expire > time_uptime) nd6_llinfo_settimer_locked(ln, (dr->expire - time_uptime) * hz); else nd6_llinfo_settimer_locked(ln, (long)V_nd6_gctimer * hz); LLE_REMREF(ln); LLE_WUNLOCK(ln); defrouter_rele(dr); return; } if (dr) { /* * Unreachablity of a router might affect the default * router selection and on-link detection of advertised * prefixes. */ /* * Temporarily fake the state to choose a new default * router and to perform on-link determination of * prefixes correctly. * Below the state will be set correctly, * or the entry itself will be deleted. 
*/ ln->ln_state = ND6_LLINFO_INCOMPLETE; } if (ln->ln_router || dr) { /* * We need to unlock to avoid a LOR with rt6_flush() with the * rnh and for the calls to pfxlist_onlink_check() and * defrouter_select_fib() in the block further down for calls * into nd6_lookup(). We still hold a ref. */ LLE_WUNLOCK(ln); /* * rt6_flush must be called whether or not the neighbor * is in the Default Router List. * See a corresponding comment in nd6_na_input(). */ rt6_flush(&ln->r_l3addr.addr6, ifp); } if (dr) { /* * Since defrouter_select_fib() does not affect the * on-link determination and MIP6 needs the check * before the default router selection, we perform * the check now. */ pfxlist_onlink_check(); /* * Refresh default router list. */ defrouter_select_fib(dr->ifp->if_fib); } /* * If this entry was added by an on-link redirect, remove the * corresponding host route. */ if (ln->la_flags & LLE_REDIRECT) nd6_free_redirect(ln); if (ln->ln_router || dr) LLE_WLOCK(ln); } /* * Save to unlock. We still hold an extra reference and will not * free(9) in llentry_free() if someone else holds one as well. */ LLE_WUNLOCK(ln); IF_AFDATA_LOCK(ifp); LLE_WLOCK(ln); /* Guard against race with other llentry_free(). */ if (ln->la_flags & LLE_LINKED) { /* Remove callout reference */ LLE_REMREF(ln); lltable_unlink_entry(ln->lle_tbl, ln); } IF_AFDATA_UNLOCK(ifp); llentry_free(ln); if (dr != NULL) defrouter_rele(dr); } static int nd6_isdynrte(const struct rtentry *rt, const struct nhop_object *nh, void *xap) { if (nh->nh_flags & NHF_REDIRECT) return (1); return (0); } /* * Remove the rtentry for the given llentry, * both of which were installed by a redirect. */ static void nd6_free_redirect(const struct llentry *ln) { int fibnum; struct sockaddr_in6 sin6; struct rt_addrinfo info; struct rib_cmd_info rc; struct epoch_tracker et; lltable_fill_sa_entry(ln, (struct sockaddr *)&sin6); memset(&info, 0, sizeof(info)); info.rti_info[RTAX_DST] = (struct sockaddr *)&sin6; info.rti_filter = nd6_isdynrte; NET_EPOCH_ENTER(et); for (fibnum = 0; fibnum < rt_numfibs; fibnum++) rib_action(fibnum, RTM_DELETE, &info, &rc); NET_EPOCH_EXIT(et); } /* * Updates status of the default router route. */ static void check_release_defrouter(struct rib_cmd_info *rc, void *_cbdata) { struct nd_defrouter *dr; struct nhop_object *nh; nh = rc->rc_nh_old; if ((nh != NULL) && (nh->nh_flags & NHF_DEFAULT)) { dr = defrouter_lookup(&nh->gw6_sa.sin6_addr, nh->nh_ifp); if (dr != NULL) { dr->installed = 0; defrouter_rele(dr); } } } void nd6_subscription_cb(struct rib_head *rnh, struct rib_cmd_info *rc, void *arg) { #ifdef ROUTE_MPATH rib_decompose_notification(rc, check_release_defrouter, NULL); #else check_release_defrouter(rc, NULL); #endif } int nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) { struct in6_ndireq *ndi = (struct in6_ndireq *)data; struct in6_nbrinfo *nbi = (struct in6_nbrinfo *)data; struct in6_ndifreq *ndif = (struct in6_ndifreq *)data; struct epoch_tracker et; int error = 0; if (ifp->if_afdata[AF_INET6] == NULL) return (EPFNOSUPPORT); switch (cmd) { case OSIOCGIFINFO_IN6: #define ND ndi->ndi /* XXX: old ndp(8) assumes a positive value for linkmtu. 
*/ bzero(&ND, sizeof(ND)); ND.linkmtu = IN6_LINKMTU(ifp); ND.maxmtu = ND_IFINFO(ifp)->maxmtu; ND.basereachable = ND_IFINFO(ifp)->basereachable; ND.reachable = ND_IFINFO(ifp)->reachable; ND.retrans = ND_IFINFO(ifp)->retrans; ND.flags = ND_IFINFO(ifp)->flags; ND.recalctm = ND_IFINFO(ifp)->recalctm; ND.chlim = ND_IFINFO(ifp)->chlim; break; case SIOCGIFINFO_IN6: ND = *ND_IFINFO(ifp); break; case SIOCSIFINFO_IN6: /* * used to change host variables from userland. * intended for a use on router to reflect RA configurations. */ /* 0 means 'unspecified' */ if (ND.linkmtu != 0) { if (ND.linkmtu < IPV6_MMTU || ND.linkmtu > IN6_LINKMTU(ifp)) { error = EINVAL; break; } ND_IFINFO(ifp)->linkmtu = ND.linkmtu; } if (ND.basereachable != 0) { int obasereachable = ND_IFINFO(ifp)->basereachable; ND_IFINFO(ifp)->basereachable = ND.basereachable; if (ND.basereachable != obasereachable) ND_IFINFO(ifp)->reachable = ND_COMPUTE_RTIME(ND.basereachable); } if (ND.retrans != 0) ND_IFINFO(ifp)->retrans = ND.retrans; if (ND.chlim != 0) ND_IFINFO(ifp)->chlim = ND.chlim; /* FALLTHROUGH */ case SIOCSIFINFO_FLAGS: { struct ifaddr *ifa; struct in6_ifaddr *ia; if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) && !(ND.flags & ND6_IFF_IFDISABLED)) { /* ifdisabled 1->0 transision */ /* * If the interface is marked as ND6_IFF_IFDISABLED and * has an link-local address with IN6_IFF_DUPLICATED, * do not clear ND6_IFF_IFDISABLED. * See RFC 4862, Section 5.4.5. */ NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if ((ia->ia6_flags & IN6_IFF_DUPLICATED) && IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) break; } NET_EPOCH_EXIT(et); if (ifa != NULL) { /* LLA is duplicated. */ ND.flags |= ND6_IFF_IFDISABLED; log(LOG_ERR, "Cannot enable an interface" " with a link-local address marked" " duplicate.\n"); } else { ND_IFINFO(ifp)->flags &= ~ND6_IFF_IFDISABLED; if (ifp->if_flags & IFF_UP) in6_if_up(ifp); } } else if (!(ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) && (ND.flags & ND6_IFF_IFDISABLED)) { /* ifdisabled 0->1 transision */ /* Mark all IPv6 address as tentative. */ ND_IFINFO(ifp)->flags |= ND6_IFF_IFDISABLED; if (V_ip6_dad_count > 0 && (ND_IFINFO(ifp)->flags & ND6_IFF_NO_DAD) == 0) { NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; ia->ia6_flags |= IN6_IFF_TENTATIVE; } NET_EPOCH_EXIT(et); } } if (ND.flags & ND6_IFF_AUTO_LINKLOCAL) { if (!(ND_IFINFO(ifp)->flags & ND6_IFF_AUTO_LINKLOCAL)) { /* auto_linklocal 0->1 transision */ /* If no link-local address on ifp, configure */ ND_IFINFO(ifp)->flags |= ND6_IFF_AUTO_LINKLOCAL; in6_ifattach(ifp, NULL); } else if (!(ND.flags & ND6_IFF_IFDISABLED) && ifp->if_flags & IFF_UP) { /* * When the IF already has * ND6_IFF_AUTO_LINKLOCAL, no link-local * address is assigned, and IFF_UP, try to * assign one. */ NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if (IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) break; } NET_EPOCH_EXIT(et); if (ifa != NULL) /* No LLA is configured. */ in6_ifattach(ifp, NULL); } } ND_IFINFO(ifp)->flags = ND.flags; break; } #undef ND case SIOCSNDFLUSH_IN6: /* XXX: the ioctl name is confusing... 
*/ /* sync kernel routing table with the default router list */ defrouter_reset(); defrouter_select_fib(RT_ALL_FIBS); break; case SIOCSPFXFLUSH_IN6: { /* flush all the prefix advertised by routers */ struct in6_ifaddr *ia, *ia_next; struct nd_prefix *pr, *next; struct nd_prhead prl; LIST_INIT(&prl); ND6_WLOCK(); LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, next) { if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) continue; /* XXX */ nd6_prefix_unlink(pr, &prl); } ND6_WUNLOCK(); while ((pr = LIST_FIRST(&prl)) != NULL) { LIST_REMOVE(pr, ndpr_entry); /* XXXRW: in6_ifaddrhead locking. */ CK_STAILQ_FOREACH_SAFE(ia, &V_in6_ifaddrhead, ia_link, ia_next) { if ((ia->ia6_flags & IN6_IFF_AUTOCONF) == 0) continue; if (ia->ia6_ndpr == pr) in6_purgeaddr(&ia->ia_ifa); } nd6_prefix_del(pr); } break; } case SIOCSRTRFLUSH_IN6: { /* flush all the default routers */ defrouter_reset(); nd6_defrouter_flush_all(); defrouter_select_fib(RT_ALL_FIBS); break; } case SIOCGNBRINFO_IN6: { struct llentry *ln; struct in6_addr nb_addr = nbi->addr; /* make local for safety */ if ((error = in6_setscope(&nb_addr, ifp, NULL)) != 0) return (error); NET_EPOCH_ENTER(et); ln = nd6_lookup(&nb_addr, 0, ifp); NET_EPOCH_EXIT(et); if (ln == NULL) { error = EINVAL; break; } nbi->state = ln->ln_state; nbi->asked = ln->la_asked; nbi->isrouter = ln->ln_router; if (ln->la_expire == 0) nbi->expire = 0; else nbi->expire = ln->la_expire + ln->lle_remtime / hz + (time_second - time_uptime); LLE_RUNLOCK(ln); break; } case SIOCGDEFIFACE_IN6: /* XXX: should be implemented as a sysctl? */ ndif->ifindex = V_nd6_defifindex; break; case SIOCSDEFIFACE_IN6: /* XXX: should be implemented as a sysctl? */ return (nd6_setdefaultiface(ndif->ifindex)); } return (error); } /* * Calculates new isRouter value based on provided parameters and * returns it. */ static int nd6_is_router(int type, int code, int is_new, int old_addr, int new_addr, int ln_router) { /* * ICMP6 type dependent behavior. * * NS: clear IsRouter if new entry * RS: clear IsRouter * RA: set IsRouter if there's lladdr * redir: clear IsRouter if new entry * * RA case, (1): * The spec says that we must set IsRouter in the following cases: * - If lladdr exist, set IsRouter. This means (1-5). * - If it is old entry (!newentry), set IsRouter. This means (7). * So, based on the spec, in (1-5) and (7) cases we must set IsRouter. * A quetion arises for (1) case. (1) case has no lladdr in the * neighbor cache, this is similar to (6). * This case is rare but we figured that we MUST NOT set IsRouter. * * is_new old_addr new_addr NS RS RA redir * D R * 0 n n (1) c ? s * 0 y n (2) c s s * 0 n y (3) c s s * 0 y y (4) c s s * 0 y y (5) c s s * 1 -- n (6) c c c s * 1 -- y (7) c c s c s * * (c=clear s=set) */ switch (type & 0xff) { case ND_NEIGHBOR_SOLICIT: /* * New entry must have is_router flag cleared. */ if (is_new) /* (6-7) */ ln_router = 0; break; case ND_REDIRECT: /* * If the icmp is a redirect to a better router, always set the * is_router flag. Otherwise, if the entry is newly created, * clear the flag. [RFC 2461, sec 8.3] */ if (code == ND_REDIRECT_ROUTER) ln_router = 1; else { if (is_new) /* (6-7) */ ln_router = 0; } break; case ND_ROUTER_SOLICIT: /* * is_router flag must always be cleared. */ ln_router = 0; break; case ND_ROUTER_ADVERT: /* * Mark an entry with lladdr as a router. 
*/ if ((!is_new && (old_addr || new_addr)) || /* (2-5) */ (is_new && new_addr)) { /* (7) */ ln_router = 1; } break; } return (ln_router); } /* * Create neighbor cache entry and cache link-layer address, * on reception of inbound ND6 packets. (RS/RA/NS/redirect) * * type - ICMP6 type * code - type dependent information * */ void nd6_cache_lladdr(struct ifnet *ifp, struct in6_addr *from, char *lladdr, int lladdrlen, int type, int code) { struct llentry *ln = NULL, *ln_tmp; int is_newentry; int do_update; int olladdr; int llchange; int flags; uint16_t router = 0; struct sockaddr_in6 sin6; struct mbuf *chain = NULL; u_char linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize; int lladdr_off; NET_EPOCH_ASSERT(); IF_AFDATA_UNLOCK_ASSERT(ifp); KASSERT(ifp != NULL, ("%s: ifp == NULL", __func__)); KASSERT(from != NULL, ("%s: from == NULL", __func__)); /* nothing must be updated for unspecified address */ if (IN6_IS_ADDR_UNSPECIFIED(from)) return; /* * Validation about ifp->if_addrlen and lladdrlen must be done in * the caller. * * XXX If the link does not have link-layer adderss, what should * we do? (ifp->if_addrlen == 0) * Spec says nothing in sections for RA, RS and NA. There's small * description on it in NS section (RFC 2461 7.2.3). */ flags = lladdr ? LLE_EXCLUSIVE : 0; ln = nd6_lookup(from, flags, ifp); is_newentry = 0; if (ln == NULL) { flags |= LLE_EXCLUSIVE; ln = nd6_alloc(from, 0, ifp); if (ln == NULL) return; /* * Since we already know all the data for the new entry, * fill it before insertion. */ if (lladdr != NULL) { linkhdrsize = sizeof(linkhdr); if (lltable_calc_llheader(ifp, AF_INET6, lladdr, linkhdr, &linkhdrsize, &lladdr_off) != 0) return; lltable_set_entry_addr(ifp, ln, linkhdr, linkhdrsize, lladdr_off); } IF_AFDATA_WLOCK(ifp); LLE_WLOCK(ln); /* Prefer any existing lle over newly-created one */ ln_tmp = nd6_lookup(from, LLE_EXCLUSIVE, ifp); if (ln_tmp == NULL) lltable_link_entry(LLTABLE6(ifp), ln); IF_AFDATA_WUNLOCK(ifp); if (ln_tmp == NULL) { /* No existing lle, mark as new entry (6,7) */ is_newentry = 1; if (lladdr != NULL) { /* (7) */ nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED); } } else { lltable_free_entry(LLTABLE6(ifp), ln); ln = ln_tmp; ln_tmp = NULL; } } /* do nothing if static ndp is set */ if ((ln->la_flags & LLE_STATIC)) { if (flags & LLE_EXCLUSIVE) LLE_WUNLOCK(ln); else LLE_RUNLOCK(ln); return; } olladdr = (ln->la_flags & LLE_VALID) ? 1 : 0; if (olladdr && lladdr) { llchange = bcmp(lladdr, ln->ll_addr, ifp->if_addrlen); } else if (!olladdr && lladdr) llchange = 1; else llchange = 0; /* * newentry olladdr lladdr llchange (*=record) * 0 n n -- (1) * 0 y n -- (2) * 0 n y y (3) * STALE * 0 y y n (4) * * 0 y y y (5) * STALE * 1 -- n -- (6) NOSTATE(= PASSIVE) * 1 -- y -- (7) * STALE */ do_update = 0; if (is_newentry == 0 && llchange != 0) { do_update = 1; /* (3,5) */ /* * Record source link-layer address * XXX is it dependent to ifp->if_type? */ linkhdrsize = sizeof(linkhdr); if (lltable_calc_llheader(ifp, AF_INET6, lladdr, linkhdr, &linkhdrsize, &lladdr_off) != 0) return; if (lltable_try_set_entry_addr(ifp, ln, linkhdr, linkhdrsize, lladdr_off) == 0) { /* Entry was deleted */ return; } nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED); if (ln->la_hold != NULL) nd6_grab_holdchain(ln, &chain, &sin6); } /* Calculates new router status */ router = nd6_is_router(type, code, is_newentry, olladdr, lladdr != NULL ? 
1 : 0, ln->ln_router); ln->ln_router = router; /* Mark non-router redirects with special flag */ if ((type & 0xFF) == ND_REDIRECT && code != ND_REDIRECT_ROUTER) ln->la_flags |= LLE_REDIRECT; if (flags & LLE_EXCLUSIVE) LLE_WUNLOCK(ln); else LLE_RUNLOCK(ln); if (chain != NULL) nd6_flush_holdchain(ifp, chain, &sin6); /* * When the link-layer address of a router changes, select the * best router again. In particular, when the neighbor entry is newly * created, it might affect the selection policy. * Question: can we restrict the first condition to the "is_newentry" * case? * XXX: when we hear an RA from a new router with the link-layer * address option, defrouter_select_fib() is called twice, since * defrtrlist_update called the function as well. However, I believe * we can compromise the overhead, since it only happens the first * time. * XXX: although defrouter_select_fib() should not have a bad effect * for those are not autoconfigured hosts, we explicitly avoid such * cases for safety. */ if ((do_update || is_newentry) && router && ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) { /* * guaranteed recursion */ defrouter_select_fib(ifp->if_fib); } } static void nd6_slowtimo(void *arg) { struct epoch_tracker et; CURVNET_SET((struct vnet *) arg); struct nd_ifinfo *nd6if; struct ifnet *ifp; callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz, nd6_slowtimo, curvnet); NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (ifp->if_afdata[AF_INET6] == NULL) continue; nd6if = ND_IFINFO(ifp); if (nd6if->basereachable && /* already initialized */ (nd6if->recalctm -= ND6_SLOWTIMER_INTERVAL) <= 0) { /* * Since reachable time rarely changes by router * advertisements, we SHOULD insure that a new random * value gets recomputed at least once every few hours. * (RFC 2461, 6.3.4) */ nd6if->recalctm = V_nd6_recalc_reachtm_interval; nd6if->reachable = ND_COMPUTE_RTIME(nd6if->basereachable); } } NET_EPOCH_EXIT(et); CURVNET_RESTORE(); } void nd6_grab_holdchain(struct llentry *ln, struct mbuf **chain, struct sockaddr_in6 *sin6) { LLE_WLOCK_ASSERT(ln); *chain = ln->la_hold; ln->la_hold = NULL; lltable_fill_sa_entry(ln, (struct sockaddr *)sin6); if (ln->ln_state == ND6_LLINFO_STALE) { /* * The first time we send a packet to a * neighbor whose entry is STALE, we have * to change the state to DELAY and a sets * a timer to expire in DELAY_FIRST_PROBE_TIME * seconds to ensure do neighbor unreachability * detection on expiration. * (RFC 2461 7.3.3) */ nd6_llinfo_setstate(ln, ND6_LLINFO_DELAY); } } int nd6_output_ifp(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m, struct sockaddr_in6 *dst, struct route *ro) { int error; int ip6len; struct ip6_hdr *ip6; struct m_tag *mtag; #ifdef MAC mac_netinet6_nd6_send(ifp, m); #endif /* * If called from nd6_ns_output() (NS), nd6_na_output() (NA), * icmp6_redirect_output() (REDIRECT) or from rip6_output() (RS, RA * as handled by rtsol and rtadvd), mbufs will be tagged for SeND * to be diverted to user space. When re-injected into the kernel, * send_output() will directly dispatch them to the outgoing interface. */ if (send_sendso_input_hook != NULL) { mtag = m_tag_find(m, PACKET_TAG_ND_OUTGOING, NULL); if (mtag != NULL) { ip6 = mtod(m, struct ip6_hdr *); ip6len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen); /* Use the SEND socket */ error = send_sendso_input_hook(m, ifp, SND_OUT, ip6len); /* -1 == no app on SEND socket */ if (error == 0 || error != -1) return (error); } } m_clrprotoflags(m); /* Avoid confusing lower layers. 
*/ IP_PROBE(send, NULL, NULL, mtod(m, struct ip6_hdr *), ifp, NULL, mtod(m, struct ip6_hdr *)); if ((ifp->if_flags & IFF_LOOPBACK) == 0) origifp = ifp; error = (*ifp->if_output)(origifp, m, (struct sockaddr *)dst, ro); return (error); } /* * Look up the link header for the @sa_dst address. Stores the found * data in the @desten buffer. A copy of the lle ln_flags can also be * saved in @pflags if @pflags is non-NULL. * * If the destination LLE does not exist or an lle state modification * is required, the "slow" version is called. * * Return values: * - 0 on success (address copied to buffer). * - EWOULDBLOCK (no local error, but the address is still unresolved) * - other errors (alloc failure, etc) */ int nd6_resolve(struct ifnet *ifp, int is_gw, struct mbuf *m, const struct sockaddr *sa_dst, u_char *desten, uint32_t *pflags, struct llentry **plle) { struct llentry *ln = NULL; const struct sockaddr_in6 *dst6; NET_EPOCH_ASSERT(); if (pflags != NULL) *pflags = 0; dst6 = (const struct sockaddr_in6 *)sa_dst; /* discard the packet if IPv6 operation is disabled on the interface */ if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED)) { m_freem(m); return (ENETDOWN); /* better error? */ } if (m != NULL && m->m_flags & M_MCAST) { switch (ifp->if_type) { case IFT_ETHER: case IFT_L2VLAN: case IFT_BRIDGE: ETHER_MAP_IPV6_MULTICAST(&dst6->sin6_addr, desten); return (0); default: m_freem(m); return (EAFNOSUPPORT); } } ln = nd6_lookup(&dst6->sin6_addr, plle ? LLE_EXCLUSIVE : LLE_UNLOCKED, ifp); if (ln != NULL && (ln->r_flags & RLLE_VALID) != 0) { /* Entry found, let's copy lle info */ bcopy(ln->r_linkdata, desten, ln->r_hdrlen); if (pflags != NULL) *pflags = LLE_VALID | (ln->r_flags & RLLE_IFADDR); /* Check if we have a feedback request from the nd6 timer */ if (ln->r_skip_req != 0) { LLE_REQ_LOCK(ln); ln->r_skip_req = 0; /* Notify that entry was used */ ln->lle_hittime = time_uptime; LLE_REQ_UNLOCK(ln); } if (plle) { LLE_ADDREF(ln); *plle = ln; LLE_WUNLOCK(ln); } return (0); } else if (plle && ln) LLE_WUNLOCK(ln); return (nd6_resolve_slow(ifp, 0, m, dst6, desten, pflags, plle)); } /* * Do L2 address resolution for the @sa_dst address. Stores the found * address in the @desten buffer. A copy of the lle ln_flags can also be * saved in @pflags if @pflags is non-NULL. * * Heavy version. * The function assumes that the destination LLE does not exist, * or is invalid or stale, so the LLE_EXCLUSIVE lock needs to be acquired. * * Set noinline to be dtrace-friendly */ static __noinline int nd6_resolve_slow(struct ifnet *ifp, int flags, struct mbuf *m, const struct sockaddr_in6 *dst, u_char *desten, uint32_t *pflags, struct llentry **plle) { struct llentry *lle = NULL, *lle_tmp; struct in6_addr *psrc, src; int send_ns, ll_len; char *lladdr; NET_EPOCH_ASSERT(); /* * Address resolution or Neighbor Unreachability Detection * for the next hop. * At this point, the destination of the packet must be a unicast * or an anycast address (i.e., not a multicast). */ if (lle == NULL) { lle = nd6_lookup(&dst->sin6_addr, LLE_EXCLUSIVE, ifp); if ((lle == NULL) && nd6_is_addr_neighbor(dst, ifp)) { /* * Since nd6_is_addr_neighbor() internally calls nd6_lookup(), * the condition below is not very efficient. But we believe * it is tolerable, because this should be a rare case.
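
For orientation, here is a hedged sketch of how an L2 output path might consume nd6_resolve() as defined above; the wrapper function is invented for illustration, while nd6_resolve() and LLE_MAX_LINKHDR come from this file. The key contract is that EWOULDBLOCK is not a failure: the mbuf has been queued on the llentry pending neighbor discovery, so the caller must neither free nor retransmit it.

static int
example_ip6_transmit(struct ifnet *ifp, struct mbuf *m,
    const struct sockaddr_in6 *dst6)
{
	u_char linkhdr[LLE_MAX_LINKHDR];	/* filled in by nd6_resolve() */
	uint32_t pflags;
	int error;

	error = nd6_resolve(ifp, 0, m, (const struct sockaddr *)dst6,
	    linkhdr, &pflags, NULL);
	if (error == EWOULDBLOCK)
		return (0);	/* resolution in progress; m is now held by the lle */
	if (error != 0)
		return (error);	/* m was already freed on the error paths */
	/* linkhdr now holds the prebuilt L2 header; prepend it and transmit. */
	return (0);
}
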
*/ lle = nd6_alloc(&dst->sin6_addr, 0, ifp); if (lle == NULL) { char ip6buf[INET6_ADDRSTRLEN]; log(LOG_DEBUG, "nd6_output: can't allocate llinfo for %s " "(ln=%p)\n", ip6_sprintf(ip6buf, &dst->sin6_addr), lle); m_freem(m); return (ENOBUFS); } IF_AFDATA_WLOCK(ifp); LLE_WLOCK(lle); /* Prefer any existing entry over the newly-created one */ lle_tmp = nd6_lookup(&dst->sin6_addr, LLE_EXCLUSIVE, ifp); if (lle_tmp == NULL) lltable_link_entry(LLTABLE6(ifp), lle); IF_AFDATA_WUNLOCK(ifp); if (lle_tmp != NULL) { lltable_free_entry(LLTABLE6(ifp), lle); lle = lle_tmp; lle_tmp = NULL; } } } if (lle == NULL) { m_freem(m); return (ENOBUFS); } LLE_WLOCK_ASSERT(lle); /* * The first time we send a packet to a neighbor whose entry is * STALE, we have to change the state to DELAY and set a timer to * expire in DELAY_FIRST_PROBE_TIME seconds so that * neighbor unreachability detection is performed on expiration. * (RFC 2461 7.3.3) */ if (lle->ln_state == ND6_LLINFO_STALE) nd6_llinfo_setstate(lle, ND6_LLINFO_DELAY); /* * If the neighbor cache entry has a state other than INCOMPLETE * (i.e. its link-layer address is already resolved), just * send the packet. */ if (lle->ln_state > ND6_LLINFO_INCOMPLETE) { if (flags & LLE_ADDRONLY) { lladdr = lle->ll_addr; ll_len = ifp->if_addrlen; } else { lladdr = lle->r_linkdata; ll_len = lle->r_hdrlen; } bcopy(lladdr, desten, ll_len); if (pflags != NULL) *pflags = lle->la_flags; if (plle) { LLE_ADDREF(lle); *plle = lle; } LLE_WUNLOCK(lle); return (0); } /* * There is a neighbor cache entry, but no Ethernet address * response yet. Append this latest packet to the end of the * packet queue held on the entry. When it exceeds nd6_maxqueuelen, * the oldest packet in the queue will be removed. */ if (lle->la_hold != NULL) { struct mbuf *m_hold; int i; i = 0; for (m_hold = lle->la_hold; m_hold; m_hold = m_hold->m_nextpkt){ i++; if (m_hold->m_nextpkt == NULL) { m_hold->m_nextpkt = m; break; } } while (i >= V_nd6_maxqueuelen) { m_hold = lle->la_hold; lle->la_hold = lle->la_hold->m_nextpkt; m_freem(m_hold); i--; } } else { lle->la_hold = m; } /* * If there has been no NS for the neighbor after entering the * INCOMPLETE state, send the first solicitation. * Note that for a newly-created lle la_asked will be 0, * so we will transition from the ND6_LLINFO_NOSTATE to the * ND6_LLINFO_INCOMPLETE state here. */ psrc = NULL; send_ns = 0; if (lle->la_asked == 0) { lle->la_asked++; send_ns = 1; psrc = nd6_llinfo_get_holdsrc(lle, &src); nd6_llinfo_setstate(lle, ND6_LLINFO_INCOMPLETE); } LLE_WUNLOCK(lle); if (send_ns != 0) nd6_ns_output(ifp, psrc, NULL, &dst->sin6_addr, NULL); return (EWOULDBLOCK); } /* * Do L2 address resolution for the @sa_dst address. Stores the found * address in the @desten buffer. A copy of the lle ln_flags can also be * saved in @pflags if @pflags is non-NULL. * * Return values: * - 0 on success (address copied to buffer).
* - EWOULDBLOCK (no local error, but the address is still unresolved) * - other errors (alloc failure, etc) */ int nd6_resolve_addr(struct ifnet *ifp, int flags, const struct sockaddr *dst, char *desten, uint32_t *pflags) { int error; flags |= LLE_ADDRONLY; error = nd6_resolve_slow(ifp, flags, NULL, (const struct sockaddr_in6 *)dst, desten, pflags, NULL); return (error); } int nd6_flush_holdchain(struct ifnet *ifp, struct mbuf *chain, struct sockaddr_in6 *dst) { struct mbuf *m, *m_head; int error = 0; m_head = chain; while (m_head) { m = m_head; m_head = m_head->m_nextpkt; m->m_nextpkt = NULL; error = nd6_output_ifp(ifp, ifp, m, dst, NULL); } /* * XXX * note that intermediate errors are blindly ignored */ return (error); } static int nd6_need_cache(struct ifnet *ifp) { /* * XXX: we currently do not make a neighbor cache on any interface * other than Ethernet and GIF. * * RFC2893 says: * - unidirectional tunnels need no ND */ switch (ifp->if_type) { case IFT_ETHER: case IFT_IEEE1394: case IFT_L2VLAN: case IFT_INFINIBAND: case IFT_BRIDGE: case IFT_PROPVIRTUAL: return (1); default: return (0); } } /* * Add a permanent ND6 link-layer record for the given * interface address. * * Very similar to IPv4 arp_ifinit(), but: * 1) IPv6 DAD is performed in a different place * 2) It is called by the IPv6 protocol stack, in contrast to * arp_ifinit(), which is typically called in the SIOCSIFADDR * driver ioctl handler. * */ int nd6_add_ifa_lle(struct in6_ifaddr *ia) { struct ifnet *ifp; struct llentry *ln, *ln_tmp; struct sockaddr *dst; ifp = ia->ia_ifa.ifa_ifp; if (nd6_need_cache(ifp) == 0) return (0); dst = (struct sockaddr *)&ia->ia_addr; ln = lltable_alloc_entry(LLTABLE6(ifp), LLE_IFADDR, dst); if (ln == NULL) return (ENOBUFS); IF_AFDATA_WLOCK(ifp); LLE_WLOCK(ln); /* Unlink any existing entry */ ln_tmp = lla_lookup(LLTABLE6(ifp), LLE_EXCLUSIVE, dst); if (ln_tmp != NULL) lltable_unlink_entry(LLTABLE6(ifp), ln_tmp); lltable_link_entry(LLTABLE6(ifp), ln); IF_AFDATA_WUNLOCK(ifp); if (ln_tmp != NULL) EVENTHANDLER_INVOKE(lle_event, ln_tmp, LLENTRY_EXPIRED); EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED); LLE_WUNLOCK(ln); if (ln_tmp != NULL) llentry_free(ln_tmp); return (0); } /* * Removes either all lle entries for the given @ia, or the lle * corresponding to the @ia address.
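
nd6_resolve_slow() and nd6_cache_lladdr() above share an insertion idiom with nd6_add_ifa_lle(): allocate the entry without holding the table lock, redo the lookup under the lock, and then either keep the existing entry (discarding the loser of the race) or, for LLE_IFADDR records, unlink the old one. A generic userland sketch of the prefer-existing variant follows, with invented names and a pthread mutex standing in for the kernel locks:

#include <pthread.h>
#include <stdlib.h>

struct entry {
	int key;
	struct entry *next;
};

static struct entry *table_head;
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

static struct entry *
table_lookup_locked(int key)
{
	struct entry *e;

	for (e = table_head; e != NULL; e = e->next)
		if (e->key == key)
			return (e);
	return (NULL);
}

/* Prefer-existing insertion: allocate unlocked, re-check under the lock. */
static struct entry *
table_obtain(int key)
{
	struct entry *e, *cur;

	e = calloc(1, sizeof(*e));	/* may block; no lock held */
	if (e == NULL)
		return (NULL);
	e->key = key;

	pthread_mutex_lock(&table_lock);
	cur = table_lookup_locked(key);	/* did a racing thread insert it? */
	if (cur == NULL) {
		e->next = table_head;	/* we won the race: publish ours */
		table_head = e;
	}
	pthread_mutex_unlock(&table_lock);

	if (cur != NULL) {
		free(e);		/* we lost: keep the existing entry */
		e = cur;
	}
	return (e);
}
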
*/ void nd6_rem_ifa_lle(struct in6_ifaddr *ia, int all) { struct sockaddr_in6 mask, addr; struct sockaddr *saddr, *smask; struct ifnet *ifp; ifp = ia->ia_ifa.ifa_ifp; memcpy(&addr, &ia->ia_addr, sizeof(ia->ia_addr)); memcpy(&mask, &ia->ia_prefixmask, sizeof(ia->ia_prefixmask)); saddr = (struct sockaddr *)&addr; smask = (struct sockaddr *)&mask; if (all != 0) lltable_prefix_free(AF_INET6, saddr, smask, LLE_STATIC); else lltable_delete_addr(LLTABLE6(ifp), LLE_IFADDR, saddr); } static void clear_llinfo_pqueue(struct llentry *ln) { struct mbuf *m_hold, *m_hold_next; for (m_hold = ln->la_hold; m_hold; m_hold = m_hold_next) { m_hold_next = m_hold->m_nextpkt; m_freem(m_hold); } ln->la_hold = NULL; } static int nd6_sysctl_prlist(SYSCTL_HANDLER_ARGS) { struct in6_prefix p; struct sockaddr_in6 s6; struct nd_prefix *pr; struct nd_pfxrouter *pfr; time_t maxexpire; int error; char ip6buf[INET6_ADDRSTRLEN]; if (req->newptr) return (EPERM); error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); bzero(&p, sizeof(p)); p.origin = PR_ORIG_RA; bzero(&s6, sizeof(s6)); s6.sin6_family = AF_INET6; s6.sin6_len = sizeof(s6); ND6_RLOCK(); LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) { p.prefix = pr->ndpr_prefix; if (sa6_recoverscope(&p.prefix)) { log(LOG_ERR, "scope error in prefix list (%s)\n", ip6_sprintf(ip6buf, &p.prefix.sin6_addr)); /* XXX: press on... */ } p.raflags = pr->ndpr_raf; p.prefixlen = pr->ndpr_plen; p.vltime = pr->ndpr_vltime; p.pltime = pr->ndpr_pltime; p.if_index = pr->ndpr_ifp->if_index; if (pr->ndpr_vltime == ND6_INFINITE_LIFETIME) p.expire = 0; else { /* * XXX: we assume time_t is signed. Setting all bits * but the sign bit yields the maximum positive * time_t, used below to guard against overflow. */ maxexpire = (-1) & ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1)); if (pr->ndpr_vltime < maxexpire - pr->ndpr_lastupdate) p.expire = pr->ndpr_lastupdate + pr->ndpr_vltime + (time_second - time_uptime); else p.expire = maxexpire; } p.refcnt = pr->ndpr_addrcnt; p.flags = pr->ndpr_stateflags; p.advrtrs = 0; LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry) p.advrtrs++; error = SYSCTL_OUT(req, &p, sizeof(p)); if (error != 0) break; LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry) { s6.sin6_addr = pfr->router->rtaddr; if (sa6_recoverscope(&s6)) log(LOG_ERR, "scope error in prefix list (%s)\n", ip6_sprintf(ip6buf, &pfr->router->rtaddr)); error = SYSCTL_OUT(req, &s6, sizeof(s6)); if (error != 0) goto out; } } out: ND6_RUNLOCK(); return (error); } SYSCTL_PROC(_net_inet6_icmp6, ICMPV6CTL_ND6_PRLIST, nd6_prlist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, nd6_sysctl_prlist, "S,in6_prefix", "NDP prefix list"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MAXQLEN, nd6_maxqueuelen, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_maxqueuelen), 1, ""); SYSCTL_INT(_net_inet6_icmp6, OID_AUTO, nd6_gctimer, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_gctimer), (60 * 60 * 24), ""); diff --git a/sys/sys/priv.h b/sys/sys/priv.h index 9d8a3204add5..7ef54782a60d 100644 --- a/sys/sys/priv.h +++ b/sys/sys/priv.h @@ -1,545 +1,544 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006 nCircle Network Security, Inc. * All rights reserved. * * This software was developed by Robert N. M. Watson for the TrustedBSD * Project under contract to nCircle Network Security, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2.
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR, NCIRCLE NETWORK SECURITY, * INC., OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ /* * Privilege checking interface for BSD kernel. */ #ifndef _SYS_PRIV_H_ #define _SYS_PRIV_H_ /* * Privilege list, sorted loosely by kernel subsystem. * * Think carefully before adding or reusing one of these privileges -- are * there existing instances referring to the same privilege? Third party * vendors may request the assignment of privileges to be used in loadable * modules. Particular numeric privilege assignments are part of the * loadable kernel module ABI, and should not be changed across minor * releases. * * When adding a new privilege, remember to determine if it's appropriate * for use in jail, and update the privilege switch in prison_priv_check() * in kern_jail.c as necessary. */ /* * Track beginning of privilege list. */ #define _PRIV_LOWEST 1 /* * The remaining privileges typically correspond to one or a small * number of specific privilege checks, and have (relatively) precise * meanings. They are loosely sorted into a set of base system * privileges, such as the ability to reboot, and then loosely by * subsystem, indicated by a subsystem name. */ #define _PRIV_ROOT 1 /* Removed. */ #define PRIV_ACCT 2 /* Manage process accounting. */ #define PRIV_MAXFILES 3 /* Exceed system open files limit. */ #define PRIV_MAXPROC 4 /* Exceed system processes limit. */ #define PRIV_KTRACE 5 /* Set/clear KTRFAC_ROOT on ktrace. */ #define PRIV_SETDUMPER 6 /* Configure dump device. */ #define PRIV_REBOOT 8 /* Can reboot system. */ #define PRIV_SWAPON 9 /* Can swapon(). */ #define PRIV_SWAPOFF 10 /* Can swapoff(). */ #define PRIV_MSGBUF 11 /* Can read kernel message buffer. */ #define PRIV_IO 12 /* Can perform low-level I/O. */ #define PRIV_KEYBOARD 13 /* Reprogram keyboard. */ #define PRIV_DRIVER 14 /* Low-level driver privilege. */ #define PRIV_ADJTIME 15 /* Set time adjustment. */ #define PRIV_NTP_ADJTIME 16 /* Set NTP time adjustment. */ #define PRIV_CLOCK_SETTIME 17 /* Can call clock_settime. */ #define PRIV_SETTIMEOFDAY 18 /* Can call settimeofday. */ #define _PRIV_SETHOSTID 19 /* Removed. */ #define _PRIV_SETDOMAINNAME 20 /* Removed. */ /* * Audit subsystem privileges. */ #define PRIV_AUDIT_CONTROL 40 /* Can configure audit. */ #define PRIV_AUDIT_FAILSTOP 41 /* Can run during audit fail stop. */ #define PRIV_AUDIT_GETAUDIT 42 /* Can get proc audit properties. */ #define PRIV_AUDIT_SETAUDIT 43 /* Can set proc audit properties. */ #define PRIV_AUDIT_SUBMIT 44 /* Can submit an audit record. */ /* * Credential management privileges. */ #define PRIV_CRED_SETUID 50 /* setuid. 
*/ #define PRIV_CRED_SETEUID 51 /* seteuid to !ruid and !svuid. */ #define PRIV_CRED_SETGID 52 /* setgid. */ #define PRIV_CRED_SETEGID 53 /* setegid to !rgid and !svgid. */ #define PRIV_CRED_SETGROUPS 54 /* Set process additional groups. */ #define PRIV_CRED_SETREUID 55 /* setreuid. */ #define PRIV_CRED_SETREGID 56 /* setregid. */ #define PRIV_CRED_SETRESUID 57 /* setresuid. */ #define PRIV_CRED_SETRESGID 58 /* setresgid. */ #define PRIV_SEEOTHERGIDS 59 /* Exempt bsd.seeothergids. */ #define PRIV_SEEOTHERUIDS 60 /* Exempt bsd.seeotheruids. */ /* * Debugging privileges. */ #define PRIV_DEBUG_DIFFCRED 80 /* Exempt debugging other users. */ #define PRIV_DEBUG_SUGID 81 /* Exempt debugging setuid proc. */ #define PRIV_DEBUG_UNPRIV 82 /* Exempt unprivileged debug limit. */ #define PRIV_DEBUG_DENIED 83 /* Exempt P2_NOTRACE. */ /* * DTrace privileges. */ #define PRIV_DTRACE_KERNEL 90 /* Allow use of DTrace on the kernel. */ #define PRIV_DTRACE_PROC 91 /* Allow attaching DTrace to process. */ #define PRIV_DTRACE_USER 92 /* Process may submit DTrace events. */ /* * Firmware privileges. */ #define PRIV_FIRMWARE_LOAD 100 /* Can load firmware. */ /* * Jail privileges. */ #define PRIV_JAIL_ATTACH 110 /* Attach to a jail. */ #define PRIV_JAIL_SET 111 /* Set jail parameters. */ #define PRIV_JAIL_REMOVE 112 /* Remove a jail. */ /* * Kernel environment privileges. */ #define PRIV_KENV_SET 120 /* Set kernel env. variables. */ #define PRIV_KENV_UNSET 121 /* Unset kernel env. variables. */ /* * Loadable kernel module privileges. */ #define PRIV_KLD_LOAD 130 /* Load a kernel module. */ #define PRIV_KLD_UNLOAD 131 /* Unload a kernel module. */ /* * Privileges associated with the MAC Framework and specific MAC policy * modules. */ #define PRIV_MAC_PARTITION 140 /* Privilege in mac_partition policy. */ #define PRIV_MAC_PRIVS 141 /* Privilege in the mac_privs policy. */ /* * Process-related privileges. */ #define PRIV_PROC_LIMIT 160 /* Exceed user process limit. */ #define PRIV_PROC_SETLOGIN 161 /* Can call setlogin. */ #define PRIV_PROC_SETRLIMIT 162 /* Can raise resource limits. */ #define PRIV_PROC_SETLOGINCLASS 163 /* Can call setloginclass(2). */ /* * System V IPC privileges. */ #define PRIV_IPC_READ 170 /* Can override IPC read perm. */ #define PRIV_IPC_WRITE 171 /* Can override IPC write perm. */ #define PRIV_IPC_ADMIN 172 /* Can override IPC owner-only perm. */ #define PRIV_IPC_MSGSIZE 173 /* Exempt IPC message queue limit. */ /* * POSIX message queue privileges. */ #define PRIV_MQ_ADMIN 180 /* Can override msgq owner-only perm. */ /* * Performance monitoring counter privileges. */ #define PRIV_PMC_MANAGE 190 /* Can administer PMC. */ #define PRIV_PMC_SYSTEM 191 /* Can allocate a system-wide PMC. */ /* * Scheduling privileges. */ #define PRIV_SCHED_DIFFCRED 200 /* Exempt scheduling other users. */ #define PRIV_SCHED_SETPRIORITY 201 /* Can set lower nice value for proc. */ #define PRIV_SCHED_RTPRIO 202 /* Can set real time scheduling. */ #define PRIV_SCHED_SETPOLICY 203 /* Can set scheduler policy. */ #define PRIV_SCHED_SET 204 /* Can set thread scheduler. */ #define PRIV_SCHED_SETPARAM 205 /* Can set thread scheduler params. */ #define PRIV_SCHED_CPUSET 206 /* Can manipulate cpusets. */ #define PRIV_SCHED_CPUSET_INTR 207 /* Can adjust IRQ to CPU binding. */ /* * POSIX semaphore privileges. */ #define PRIV_SEM_WRITE 220 /* Can override sem write perm. */ /* * Signal privileges. */ #define PRIV_SIGNAL_DIFFCRED 230 /* Exempt signalling other users.
*/ #define PRIV_SIGNAL_SUGID 231 /* Non-conserv signal setuid proc. */ /* * Sysctl privileges. */ #define PRIV_SYSCTL_DEBUG 240 /* Can invoke sysctl.debug. */ #define PRIV_SYSCTL_WRITE 241 /* Can write sysctls. */ #define PRIV_SYSCTL_WRITEJAIL 242 /* Can write sysctls, jail permitted. */ /* * TTY privileges. */ #define PRIV_TTY_CONSOLE 250 /* Set console to tty. */ #define PRIV_TTY_DRAINWAIT 251 /* Set tty drain wait time. */ #define PRIV_TTY_DTRWAIT 252 /* Set DTR wait on tty. */ #define PRIV_TTY_EXCLUSIVE 253 /* Override tty exclusive flag. */ #define _PRIV_TTY_PRISON 254 /* Removed. */ #define PRIV_TTY_STI 255 /* Simulate input on another tty. */ #define PRIV_TTY_SETA 256 /* Set tty termios structure. */ /* * UFS-specific privileges. */ #define PRIV_UFS_EXTATTRCTL 270 /* Can configure EAs on UFS1. */ #define PRIV_UFS_QUOTAOFF 271 /* quotaoff(). */ #define PRIV_UFS_QUOTAON 272 /* quotaon(). */ #define PRIV_UFS_SETUSE 273 /* setuse(). */ /* * ZFS-specific privileges. */ #define PRIV_ZFS_POOL_CONFIG 280 /* Can configure ZFS pools. */ #define PRIV_ZFS_INJECT 281 /* Can inject faults in the ZFS fault injection framework. */ #define PRIV_ZFS_JAIL 282 /* Can attach/detach ZFS file systems to/from jails. */ /* * NFS-specific privileges. */ #define PRIV_NFS_DAEMON 290 /* Can become the NFS daemon. */ #define PRIV_NFS_LOCKD 291 /* Can become NFS lock daemon. */ /* * VFS privileges. */ #define PRIV_VFS_READ 310 /* Override vnode DAC read perm. */ #define PRIV_VFS_WRITE 311 /* Override vnode DAC write perm. */ #define PRIV_VFS_ADMIN 312 /* Override vnode DAC admin perm. */ #define PRIV_VFS_EXEC 313 /* Override vnode DAC exec perm. */ #define PRIV_VFS_LOOKUP 314 /* Override vnode DAC lookup perm. */ #define PRIV_VFS_BLOCKRESERVE 315 /* Can use free block reserve. */ #define PRIV_VFS_CHFLAGS_DEV 316 /* Can chflags() a device node. */ #define PRIV_VFS_CHOWN 317 /* Can set user; group to non-member. */ #define PRIV_VFS_CHROOT 318 /* chroot(). */ #define PRIV_VFS_RETAINSUGID 319 /* Can retain sugid bits on change. */ #define PRIV_VFS_EXCEEDQUOTA 320 /* Exempt from quota restrictions. */ #define PRIV_VFS_EXTATTR_SYSTEM 321 /* Operate on system EA namespace. */ #define PRIV_VFS_FCHROOT 322 /* fchroot(). */ #define PRIV_VFS_FHOPEN 323 /* Can fhopen(). */ #define PRIV_VFS_FHSTAT 324 /* Can fhstat(). */ #define PRIV_VFS_FHSTATFS 325 /* Can fhstatfs(). */ #define PRIV_VFS_GENERATION 326 /* stat() returns generation number. */ #define PRIV_VFS_GETFH 327 /* Can retrieve file handles. */ #define PRIV_VFS_GETQUOTA 328 /* getquota(). */ #define PRIV_VFS_LINK 329 /* bsd.hardlink_check_uid */ #define PRIV_VFS_MKNOD_BAD 330 /* Was: mknod() can mark bad inodes. */ #define PRIV_VFS_MKNOD_DEV 331 /* Can mknod() to create dev nodes. */ #define PRIV_VFS_MKNOD_WHT 332 /* Can mknod() to create whiteout. */ #define PRIV_VFS_MOUNT 333 /* Can mount(). */ #define PRIV_VFS_MOUNT_OWNER 334 /* Can manage other users' file systems. */ #define PRIV_VFS_MOUNT_EXPORTED 335 /* Can set MNT_EXPORTED on mount. */ #define PRIV_VFS_MOUNT_PERM 336 /* Override dev node perms at mount. */ #define PRIV_VFS_MOUNT_SUIDDIR 337 /* Can set MNT_SUIDDIR on mount. */ #define PRIV_VFS_MOUNT_NONUSER 338 /* Can perform a non-user mount. */ #define PRIV_VFS_SETGID 339 /* Can setgid if not in group. */ #define PRIV_VFS_SETQUOTA 340 /* setquota(). */ #define PRIV_VFS_STICKYFILE 341 /* Can set sticky bit on file. */ #define PRIV_VFS_SYSFLAGS 342 /* Can modify system flags. */ #define PRIV_VFS_UNMOUNT 343 /* Can unmount(). 
*/ #define PRIV_VFS_STAT 344 /* Override vnode MAC stat perm. */ #define PRIV_VFS_READ_DIR 345 /* Can read(2) a dirfd, needs sysctl. */ /* * Virtual memory privileges. */ #define PRIV_VM_MADV_PROTECT 360 /* Can set MADV_PROTECT. */ #define PRIV_VM_MLOCK 361 /* Can mlock(), mlockall(). */ #define PRIV_VM_MUNLOCK 362 /* Can munlock(), munlockall(). */ #define PRIV_VM_SWAP_NOQUOTA 363 /* * Can override the global * swap reservation limits. */ #define PRIV_VM_SWAP_NORLIMIT 364 /* * Can override the per-uid * swap reservation limits. */ /* * Device file system privileges. */ #define PRIV_DEVFS_RULE 370 /* Can manage devfs rules. */ #define PRIV_DEVFS_SYMLINK 371 /* Can create symlinks in devfs. */ /* * Random number generator privileges. */ #define PRIV_RANDOM_RESEED 380 /* Closing /dev/random reseeds. */ /* * Network stack privileges. */ #define PRIV_NET_BRIDGE 390 /* Administer bridge. */ #define PRIV_NET_GRE 391 /* Administer GRE. */ #define _PRIV_NET_PPP 392 /* Removed. */ #define _PRIV_NET_SLIP 393 /* Removed. */ #define PRIV_NET_BPF 394 /* Monitor BPF. */ #define PRIV_NET_RAW 395 /* Open raw socket. */ #define PRIV_NET_ROUTE 396 /* Administer routing. */ #define PRIV_NET_TAP 397 /* Can open tap device. */ #define PRIV_NET_SETIFMTU 398 /* Set interface MTU. */ #define PRIV_NET_SETIFFLAGS 399 /* Set interface flags. */ #define PRIV_NET_SETIFCAP 400 /* Set interface capabilities. */ #define PRIV_NET_SETIFNAME 401 /* Set interface name. */ #define PRIV_NET_SETIFMETRIC 402 /* Set interface metrics. */ #define PRIV_NET_SETIFPHYS 403 /* Set interface physical layer prop. */ #define PRIV_NET_SETIFMAC 404 /* Set interface MAC label. */ #define PRIV_NET_ADDMULTI 405 /* Add multicast addr. to ifnet. */ #define PRIV_NET_DELMULTI 406 /* Delete multicast addr. from ifnet. */ #define PRIV_NET_HWIOCTL 407 /* Issue hardware ioctl on ifnet. */ #define PRIV_NET_SETLLADDR 408 /* Set interface link-level address. */ #define PRIV_NET_ADDIFGROUP 409 /* Add new interface group. */ #define PRIV_NET_DELIFGROUP 410 /* Delete interface group. */ #define PRIV_NET_IFCREATE 411 /* Create cloned interface. */ #define PRIV_NET_IFDESTROY 412 /* Destroy cloned interface. */ #define PRIV_NET_ADDIFADDR 413 /* Add protocol addr to interface. */ #define PRIV_NET_DELIFADDR 414 /* Delete protocol addr on interface. */ #define PRIV_NET_LAGG 415 /* Administer lagg interface. */ #define PRIV_NET_GIF 416 /* Administer gif interface. */ #define PRIV_NET_SETIFVNET 417 /* Move interface to vnet. */ #define PRIV_NET_SETIFDESCR 418 /* Set interface description. */ #define PRIV_NET_SETIFFIB 419 /* Set interface fib. */ #define PRIV_NET_VXLAN 420 /* Administer vxlan. */ #define PRIV_NET_SETLANPCP 421 /* Set LAN priority. */ #define PRIV_NET_SETVLANPCP PRIV_NET_SETLANPCP /* Alias Set VLAN priority */ -#define PRIV_NET_WG 422 /* Administrate if_wg. */ /* * 802.11-related privileges. */ #define PRIV_NET80211_VAP_GETKEY 440 /* Query VAP 802.11 keys. */ #define PRIV_NET80211_VAP_MANAGE 441 /* Administer 802.11 VAP */ #define PRIV_NET80211_VAP_SETMAC 442 /* Set VAP MAC address */ #define PRIV_NET80211_CREATE_VAP 443 /* Create a new VAP */ /* * Placeholder for AppleTalk privileges, not supported anymore. */ #define _PRIV_NETATALK_RESERVEDPORT 450 /* Bind low port number. */ /* * ATM privileges. */ #define PRIV_NETATM_CFG 460 #define PRIV_NETATM_ADD 461 #define PRIV_NETATM_DEL 462 #define PRIV_NETATM_SET 463 /* * Bluetooth privileges. */ #define PRIV_NETBLUETOOTH_RAW 470 /* Open raw bluetooth socket. 
*/ /* * Netgraph and netgraph module privileges. */ #define PRIV_NETGRAPH_CONTROL 480 /* Open netgraph control socket. */ #define PRIV_NETGRAPH_TTY 481 /* Configure tty for netgraph. */ /* * IPv4 and IPv6 privileges. */ #define PRIV_NETINET_RESERVEDPORT 490 /* Bind low port number. */ #define PRIV_NETINET_IPFW 491 /* Administer IPFW firewall. */ #define PRIV_NETINET_DIVERT 492 /* Open IP divert socket. */ #define PRIV_NETINET_PF 493 /* Administer pf firewall. */ #define PRIV_NETINET_DUMMYNET 494 /* Administer DUMMYNET. */ #define PRIV_NETINET_CARP 495 /* Administer CARP. */ #define PRIV_NETINET_MROUTE 496 /* Administer multicast routing. */ #define PRIV_NETINET_RAW 497 /* Open netinet raw socket. */ #define PRIV_NETINET_GETCRED 498 /* Query netinet pcb credentials. */ #define PRIV_NETINET_ADDRCTRL6 499 /* Administer IPv6 address scopes. */ #define PRIV_NETINET_ND6 500 /* Administer IPv6 neighbor disc. */ #define PRIV_NETINET_SCOPE6 501 /* Administer IPv6 address scopes. */ #define PRIV_NETINET_ALIFETIME6 502 /* Administer IPv6 address lifetimes. */ #define PRIV_NETINET_IPSEC 503 /* Administer IPSEC. */ #define PRIV_NETINET_REUSEPORT 504 /* Allow [rapid] port/address reuse. */ #define PRIV_NETINET_SETHDROPTS 505 /* Set certain IPv4/6 header options. */ #define PRIV_NETINET_BINDANY 506 /* Allow bind to any address. */ #define PRIV_NETINET_HASHKEY 507 /* Get and set hash keys for IPv4/6. */ /* * Placeholders for IPX/SPX privileges, not supported any more. */ #define _PRIV_NETIPX_RESERVEDPORT 520 /* Bind low port number. */ #define _PRIV_NETIPX_RAW 521 /* Open netipx raw socket. */ /* * NCP privileges. */ #define PRIV_NETNCP 530 /* Use another user's connection. */ /* * SMB privileges. */ #define PRIV_NETSMB 540 /* Use another user's connection. */ /* * VM86 privileges. */ #define PRIV_VM86_INTCALL 550 /* Allow invoking vm86 int handlers. */ /* * Set of reserved privilege values, which will be allocated to code as * needed, in order to avoid renumbering later privileges due to insertion. */ #define _PRIV_RESERVED0 560 #define _PRIV_RESERVED1 561 #define _PRIV_RESERVED2 562 #define _PRIV_RESERVED3 563 #define _PRIV_RESERVED4 564 #define _PRIV_RESERVED5 565 #define _PRIV_RESERVED6 566 #define _PRIV_RESERVED7 567 #define _PRIV_RESERVED8 568 #define _PRIV_RESERVED9 569 #define _PRIV_RESERVED10 570 #define _PRIV_RESERVED11 571 #define _PRIV_RESERVED12 572 #define _PRIV_RESERVED13 573 #define _PRIV_RESERVED14 574 #define _PRIV_RESERVED15 575 /* * Define a set of valid privilege numbers that can be used by loadable * modules that don't yet have privilege reservations. Ideally, these should * not be used, since their meaning is opaque to any policies that are aware * of specific privileges, such as jail, and as such may be arbitrarily * denied. */ #define PRIV_MODULE0 600 #define PRIV_MODULE1 601 #define PRIV_MODULE2 602 #define PRIV_MODULE3 603 #define PRIV_MODULE4 604 #define PRIV_MODULE5 605 #define PRIV_MODULE6 606 #define PRIV_MODULE7 607 #define PRIV_MODULE8 608 #define PRIV_MODULE9 609 #define PRIV_MODULE10 610 #define PRIV_MODULE11 611 #define PRIV_MODULE12 612 #define PRIV_MODULE13 613 #define PRIV_MODULE14 614 #define PRIV_MODULE15 615 /* * DDB(4) privileges. */ #define PRIV_DDB_CAPTURE 620 /* Allow reading of DDB capture log. */ /* * Arla/nnpfs privileges. */ #define PRIV_NNPFS_DEBUG 630 /* Perform ARLA_VIOC_NNPFSDEBUG. */ /* * cpuctl(4) privileges. */ #define PRIV_CPUCTL_WRMSR 640 /* Write model-specific register. */ #define PRIV_CPUCTL_UPDATE 641 /* Update cpu microcode.
*/ /* * Capi4BSD privileges. */ #define PRIV_C4B_RESET_CTLR 650 /* Load firmware, reset controller. */ #define PRIV_C4B_TRACE 651 /* Unrestricted CAPI message tracing. */ /* * OpenAFS privileges. */ #define PRIV_AFS_ADMIN 660 /* Can change AFS client settings. */ #define PRIV_AFS_DAEMON 661 /* Can become the AFS daemon. */ /* * Resource Limits privileges. */ #define PRIV_RCTL_GET_RACCT 670 #define PRIV_RCTL_GET_RULES 671 #define PRIV_RCTL_GET_LIMITS 672 #define PRIV_RCTL_ADD_RULE 673 #define PRIV_RCTL_REMOVE_RULE 674 /* * mem(4) privileges. */ #define PRIV_KMEM_READ 680 /* Open mem/kmem for reading. */ #define PRIV_KMEM_WRITE 681 /* Open mem/kmem for writing. */ /* * Track end of privilege list. */ #define _PRIV_HIGHEST 682 /* * Validate that a named privilege is known by the privilege system. Invalid * privileges presented to the privilege system by a priv_check interface * will result in a panic. This is only approximate due to sparse allocation * of the privilege space. */ #define PRIV_VALID(x) ((x) > _PRIV_LOWEST && (x) < _PRIV_HIGHEST) #ifdef _KERNEL /* * Privilege check interfaces, modeled after historic suser() interfaces, but * with the addition of a specific privilege name. No flags are currently * defined for the API. Historically, flags specified whether to check against * the real uid instead of the effective uid, and whether or not the check * should be allowed in jail. */ struct thread; struct ucred; int priv_check(struct thread *td, int priv); int priv_check_cred(struct ucred *cred, int priv); int priv_check_cred_vfs_lookup(struct ucred *cred); int priv_check_cred_vfs_lookup_nomac(struct ucred *cred); int priv_check_cred_vfs_generation(struct ucred *cred); #endif #endif /* !_SYS_PRIV_H_ */ diff --git a/sys/sys/socketvar.h b/sys/sys/socketvar.h index a5bb8a2587ea..295a1cf3d37f 100644 --- a/sys/sys/socketvar.h +++ b/sys/sys/socketvar.h @@ -1,548 +1,547 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
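
As a usage illustration for the check interfaces declared above: a typical consumer calls priv_check() with one specific privilege and fails closed. The device ioctl handler below is invented for the example; only priv_check() and PRIV_DRIVER come from this header.

static int
example_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag,
    struct thread *td)
{
	int error;

	/* Refuse the low-level operation without the named privilege. */
	error = priv_check(td, PRIV_DRIVER);
	if (error != 0)
		return (error);		/* typically EPERM */
	/* ... perform the privileged work ... */
	return (0);
}
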
* * @(#)socketvar.h 8.3 (Berkeley) 2/19/95 * * $FreeBSD$ */ #ifndef _SYS_SOCKETVAR_H_ #define _SYS_SOCKETVAR_H_ /* * Socket generation count type. Also used in xinpcb, xtcpcb, xunpcb. */ typedef uint64_t so_gen_t; #if defined(_KERNEL) || defined(_WANT_SOCKET) #include /* for TAILQ macros */ #include /* for struct selinfo */ #include #include #include #include #include #ifdef _KERNEL #include #include #endif struct vnet; /* * Kernel structure per socket. * Contains send and receive buffer queues, * handle on protocol and pointer to protocol * private data and error information. */ typedef int so_upcall_t(struct socket *, void *, int); typedef void so_dtor_t(struct socket *); struct socket; enum socket_qstate { SQ_NONE = 0, SQ_INCOMP = 0x0800, /* on sol_incomp */ SQ_COMP = 0x1000, /* on sol_comp */ }; /*- * Locking key to struct socket: * (a) constant after allocation, no locking required. * (b) locked by SOCK_LOCK(so). * (cr) locked by SOCKBUF_LOCK(&so->so_rcv). * (cs) locked by SOCKBUF_LOCK(&so->so_snd). * (e) locked by SOLISTEN_LOCK() of corresponding listening socket. * (f) not locked since integer reads/writes are atomic. * (g) used only as a sleep/wakeup address, no value. * (h) locked by global mutex so_global_mtx. * (k) locked by KTLS workqueue mutex */ TAILQ_HEAD(accept_queue, socket); struct socket { struct mtx so_lock; volatile u_int so_count; /* (b / refcount) */ struct selinfo so_rdsel; /* (b/cr) for so_rcv/so_comp */ struct selinfo so_wrsel; /* (b/cs) for so_snd */ short so_type; /* (a) generic type, see socket.h */ int so_options; /* (b) from socket call, see socket.h */ short so_linger; /* time to linger close(2) */ short so_state; /* (b) internal state flags SS_* */ void *so_pcb; /* protocol control block */ struct vnet *so_vnet; /* (a) network stack instance */ struct protosw *so_proto; /* (a) protocol handle */ short so_timeo; /* (g) connection timeout */ u_short so_error; /* (f) error affecting connection */ struct sigio *so_sigio; /* [sg] information for async I/O or out of band data (SIGURG) */ struct ucred *so_cred; /* (a) user credentials */ struct label *so_label; /* (b) MAC label for socket */ /* NB: generation count must not be first. */ so_gen_t so_gencnt; /* (h) generation count */ void *so_emuldata; /* (b) private data for emulators */ so_dtor_t *so_dtor; /* (b) optional destructor */ struct osd osd; /* Object Specific extensions */ /* * so_fibnum, so_user_cookie and friends can be used to attach * some user-specified metadata to a socket, which then can be * used by the kernel for various actions. * so_user_cookie is used by ipfw/dummynet. */ int so_fibnum; /* routing domain for this socket */ uint32_t so_user_cookie; int so_ts_clock; /* type of the clock used for timestamps */ uint32_t so_max_pacing_rate; /* (f) TX rate limit in bytes/s */ union { /* Regular (data flow) socket. */ struct { /* (cr, cs) Receive and send buffers. */ struct sockbuf so_rcv, so_snd; /* (e) Our place on accept queue. */ TAILQ_ENTRY(socket) so_list; struct socket *so_listen; /* (b) */ enum socket_qstate so_qstate; /* (b) */ /* (b) cached MAC label for peer */ struct label *so_peerlabel; u_long so_oobmark; /* chars to oob mark */ /* (k) Our place on KTLS RX work queue. */ STAILQ_ENTRY(socket) so_ktls_rx_list; }; /* * Listening socket, where accepts occur, is so_listen in all * subsidiary sockets. If so_listen is NULL, socket is not * related to an accept. 
For a listening socket itself * sol_incomp queues partially completed connections, while * sol_comp is a queue of connections ready to be accepted. * If a connection is aborted and it has so_listen set, then * it has to be pulled out of either sol_incomp or sol_comp. * We allow connections to queue up based on current queue * lengths and limit on number of queued connections for this * socket. */ struct { /* (e) queue of partial unaccepted connections */ struct accept_queue sol_incomp; /* (e) queue of complete unaccepted connections */ struct accept_queue sol_comp; u_int sol_qlen; /* (e) sol_comp length */ u_int sol_incqlen; /* (e) sol_incomp length */ u_int sol_qlimit; /* (e) queue limit */ /* accept_filter(9) optional data */ struct accept_filter *sol_accept_filter; void *sol_accept_filter_arg; /* saved filter args */ char *sol_accept_filter_str; /* saved user args */ /* Optional upcall, for kernel socket. */ so_upcall_t *sol_upcall; /* (e) */ void *sol_upcallarg; /* (e) */ /* Socket buffer parameters, to be copied to * dataflow sockets, accepted from this one. */ int sol_sbrcv_lowat; int sol_sbsnd_lowat; u_int sol_sbrcv_hiwat; u_int sol_sbsnd_hiwat; short sol_sbrcv_flags; short sol_sbsnd_flags; sbintime_t sol_sbrcv_timeo; sbintime_t sol_sbsnd_timeo; /* Information tracking listen queue overflows. */ struct timeval sol_lastover; /* (e) */ int sol_overcount; /* (e) */ }; }; }; #endif /* defined(_KERNEL) || defined(_WANT_SOCKET) */ /* * Socket state bits. * * Historically, these bits were all kept in the so_state field. * They are now split into separate, lock-specific fields. * so_state maintains basic socket state protected by the socket lock. * so_qstate holds information about the socket accept queues. * Each socket buffer also has a state field holding information * relevant to that socket buffer (can't send, rcv). * Many fields will be read without locks to improve performance and avoid * lock order issues. However, this approach must be used with caution. */ #define SS_NOFDREF 0x0001 /* no file table ref any more */ #define SS_ISCONNECTED 0x0002 /* socket connected to a peer */ #define SS_ISCONNECTING 0x0004 /* in process of connecting to peer */ #define SS_ISDISCONNECTING 0x0008 /* in process of disconnecting */ #define SS_NBIO 0x0100 /* non-blocking ops */ #define SS_ASYNC 0x0200 /* async i/o notify */ #define SS_ISCONFIRMING 0x0400 /* deciding to accept connection req */ #define SS_ISDISCONNECTED 0x2000 /* socket disconnected from peer */ /* * Protocols can mark a socket as SS_PROTOREF to indicate that, following * pru_detach, they still want the socket to persist, and will free it * themselves when they are done. Protocols should only ever call sofree() * following setting this flag in pru_detach(), and never otherwise, as * sofree() bypasses socket reference counting. 
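
Tying the locking key to the state bits above, a hedged sketch (the helper is invented for illustration): so_state is keyed (b), so a correct reader takes SOCK_LOCK(), defined just below, around the test, whereas a field keyed (f) such as so_error may be read without any lock.

static int
example_sock_is_connected(struct socket *so)
{
	int connected;

	SOCK_LOCK(so);		/* so_state is (b): needs the socket lock */
	connected = (so->so_state & SS_ISCONNECTED) != 0;
	SOCK_UNLOCK(so);
	return (connected);
}
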
*/ #define SS_PROTOREF 0x4000 /* strong protocol reference */ #ifdef _KERNEL #define SOCK_MTX(so) &(so)->so_lock #define SOCK_LOCK(so) mtx_lock(&(so)->so_lock) #define SOCK_OWNED(so) mtx_owned(&(so)->so_lock) #define SOCK_UNLOCK(so) mtx_unlock(&(so)->so_lock) #define SOCK_LOCK_ASSERT(so) mtx_assert(&(so)->so_lock, MA_OWNED) #define SOCK_UNLOCK_ASSERT(so) mtx_assert(&(so)->so_lock, MA_NOTOWNED) #define SOLISTENING(sol) (((sol)->so_options & SO_ACCEPTCONN) != 0) #define SOLISTEN_LOCK(sol) do { \ mtx_lock(&(sol)->so_lock); \ KASSERT(SOLISTENING(sol), \ ("%s: %p not listening", __func__, (sol))); \ } while (0) #define SOLISTEN_TRYLOCK(sol) mtx_trylock(&(sol)->so_lock) #define SOLISTEN_UNLOCK(sol) do { \ KASSERT(SOLISTENING(sol), \ ("%s: %p not listening", __func__, (sol))); \ mtx_unlock(&(sol)->so_lock); \ } while (0) #define SOLISTEN_LOCK_ASSERT(sol) do { \ mtx_assert(&(sol)->so_lock, MA_OWNED); \ KASSERT(SOLISTENING(sol), \ ("%s: %p not listening", __func__, (sol))); \ } while (0) /* * Macros for sockets and socket buffering. */ /* * Flags to sblock(). */ #define SBL_WAIT 0x00000001 /* Wait if not immediately available. */ #define SBL_NOINTR 0x00000002 /* Force non-interruptible sleep. */ #define SBL_VALID (SBL_WAIT | SBL_NOINTR) /* * Do we need to notify the other side when I/O is possible? */ #define sb_notify(sb) (((sb)->sb_flags & (SB_WAIT | SB_SEL | SB_ASYNC | \ SB_UPCALL | SB_AIO | SB_KNOTE)) != 0) /* do we have to send all at once on a socket? */ #define sosendallatonce(so) \ ((so)->so_proto->pr_flags & PR_ATOMIC) /* can we read something from so? */ #define soreadabledata(so) \ (sbavail(&(so)->so_rcv) >= (so)->so_rcv.sb_lowat || (so)->so_error) #define soreadable(so) \ (soreadabledata(so) || ((so)->so_rcv.sb_state & SBS_CANTRCVMORE)) /* can we write something to so? */ #define sowriteable(so) \ ((sbspace(&(so)->so_snd) >= (so)->so_snd.sb_lowat && \ (((so)->so_state&SS_ISCONNECTED) || \ ((so)->so_proto->pr_flags&PR_CONNREQUIRED)==0)) || \ ((so)->so_snd.sb_state & SBS_CANTSENDMORE) || \ (so)->so_error) /* * soref()/sorele() ref-count the socket structure. * soref() may be called without owning the socket lock, but in that case a * caller must own something that holds the socket, and so_count must not be 0. * Note that you must still explicitly close the socket, but the last ref * count will free the structure. */ #define soref(so) refcount_acquire(&(so)->so_count) #define sorele(so) do { \ SOCK_LOCK_ASSERT(so); \ if (refcount_release(&(so)->so_count)) \ sofree(so); \ else \ SOCK_UNLOCK(so); \ } while (0) /* * In sorwakeup() and sowwakeup(), acquire the socket buffer lock to * avoid a non-atomic test-and-wakeup. However, sowakeup is * responsible for releasing the lock if it is called. We unlock only * if we don't call into sowakeup. If any code is introduced that * directly invokes the underlying sowakeup() primitives, it must * maintain the same semantics.
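
A short usage sketch for the reference-count macros above; the function is invented for illustration. soref() may be taken without the socket lock while something else is known to pin the socket, and sorele() must be called with the lock held: it either drops the lock or calls sofree() on the final release.

static void
example_socket_handoff(struct socket *so)
{
	soref(so);	/* legal unlocked: caller knows so_count > 0 here */
	/* ... pass the socket to another subsystem and use it ... */
	SOCK_LOCK(so);
	sorele(so);	/* unlocks the socket, or frees it on the last ref */
}
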
*/ #define sorwakeup_locked(so) do { \ SOCKBUF_LOCK_ASSERT(&(so)->so_rcv); \ if (sb_notify(&(so)->so_rcv)) \ sowakeup((so), &(so)->so_rcv); \ else \ SOCKBUF_UNLOCK(&(so)->so_rcv); \ } while (0) #define sorwakeup(so) do { \ SOCKBUF_LOCK(&(so)->so_rcv); \ sorwakeup_locked(so); \ } while (0) #define sowwakeup_locked(so) do { \ SOCKBUF_LOCK_ASSERT(&(so)->so_snd); \ if (sb_notify(&(so)->so_snd)) \ sowakeup((so), &(so)->so_snd); \ else \ SOCKBUF_UNLOCK(&(so)->so_snd); \ } while (0) #define sowwakeup(so) do { \ SOCKBUF_LOCK(&(so)->so_snd); \ sowwakeup_locked(so); \ } while (0) struct accept_filter { char accf_name[16]; int (*accf_callback) (struct socket *so, void *arg, int waitflag); void * (*accf_create) (struct socket *so, char *arg); void (*accf_destroy) (struct socket *so); SLIST_ENTRY(accept_filter) accf_next; }; #define ACCEPT_FILTER_DEFINE(modname, filtname, cb, create, destroy, ver) \ static struct accept_filter modname##_filter = { \ .accf_name = filtname, \ .accf_callback = cb, \ .accf_create = create, \ .accf_destroy = destroy, \ }; \ static moduledata_t modname##_mod = { \ .name = __XSTRING(modname), \ .evhand = accept_filt_generic_mod_event, \ .priv = &modname##_filter, \ }; \ DECLARE_MODULE(modname, modname##_mod, SI_SUB_DRIVERS, \ SI_ORDER_MIDDLE); \ MODULE_VERSION(modname, ver) #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_ACCF); MALLOC_DECLARE(M_PCB); MALLOC_DECLARE(M_SONAME); #endif /* * Socket specific helper hook point identifiers * Do not leave holes in the sequence, hook registration is a loop. */ #define HHOOK_SOCKET_OPT 0 #define HHOOK_SOCKET_CREATE 1 #define HHOOK_SOCKET_RCV 2 #define HHOOK_SOCKET_SND 3 #define HHOOK_FILT_SOREAD 4 #define HHOOK_FILT_SOWRITE 5 #define HHOOK_SOCKET_CLOSE 6 #define HHOOK_SOCKET_LAST HHOOK_SOCKET_CLOSE struct socket_hhook_data { struct socket *so; struct mbuf *m; void *hctx; /* hook point specific data*/ int status; }; extern int maxsockets; extern u_long sb_max; extern so_gen_t so_gencnt; struct file; struct filecaps; struct filedesc; struct mbuf; struct sockaddr; struct ucred; struct uio; /* 'which' values for socket upcalls. */ #define SO_RCV 1 #define SO_SND 2 /* Return values for socket upcalls. 
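
To make the ACCEPT_FILTER_DEFINE() glue above concrete, here is a hedged sketch modeled on the in-tree accf_data(9) filter; the "exampleready" name and the callback body are invented. The callback keeps a connection on the listen queue (SU_OK) until the receive buffer has data, then lets accept(2) complete (SU_ISCONNECTED); the SU_* values are defined just below.

static int
example_accf_callback(struct socket *so, void *arg, int waitflag)
{

	if (sbavail(&so->so_rcv) == 0)
		return (SU_OK);		/* not ready: keep it queued */
	return (SU_ISCONNECTED);	/* data arrived: complete the accept */
}

ACCEPT_FILTER_DEFINE(accf_example, "exampleready",
    example_accf_callback, NULL, NULL, 1);
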
*/ #define SU_OK 0 #define SU_ISCONNECTED 1 /* * From uipc_socket and friends */ int getsockaddr(struct sockaddr **namp, const struct sockaddr *uaddr, size_t len); int getsock_cap(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp, u_int *fflagp, struct filecaps *havecaps); void soabort(struct socket *so); int soaccept(struct socket *so, struct sockaddr **nam); void soaio_enqueue(struct task *task); void soaio_rcv(void *context, int pending); void soaio_snd(void *context, int pending); int socheckuid(struct socket *so, uid_t uid); -int sogetsockaddr(struct socket *so, struct sockaddr **nam); int sobind(struct socket *so, struct sockaddr *nam, struct thread *td); int sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td); int soclose(struct socket *so); int soconnect(struct socket *so, struct sockaddr *nam, struct thread *td); int soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td); int soconnect2(struct socket *so1, struct socket *so2); int socreate(int dom, struct socket **aso, int type, int proto, struct ucred *cred, struct thread *td); int sodisconnect(struct socket *so); void sodtor_set(struct socket *, so_dtor_t *); struct sockaddr *sodupsockaddr(const struct sockaddr *sa, int mflags); void sofree(struct socket *so); void sohasoutofband(struct socket *so); int solisten(struct socket *so, int backlog, struct thread *td); void solisten_proto(struct socket *so, int backlog); int solisten_proto_check(struct socket *so); int solisten_dequeue(struct socket *, struct socket **, int); struct socket * sonewconn(struct socket *head, int connstatus); struct socket * sopeeloff(struct socket *); int sopoll(struct socket *so, int events, struct ucred *active_cred, struct thread *td); int sopoll_generic(struct socket *so, int events, struct ucred *active_cred, struct thread *td); int soreceive(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); int soreceive_stream(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); int soreceive_dgram(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); int soreceive_generic(struct socket *so, struct sockaddr **paddr, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); int soreserve(struct socket *so, u_long sndcc, u_long rcvcc); void sorflush(struct socket *so); int sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td); int sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td); int sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td); int soshutdown(struct socket *so, int how); void soupcall_clear(struct socket *, int); void soupcall_set(struct socket *, int, so_upcall_t, void *); void solisten_upcall_set(struct socket *, so_upcall_t, void *); void sowakeup(struct socket *so, struct sockbuf *sb); void sowakeup_aio(struct socket *so, struct sockbuf *sb); void solisten_wakeup(struct socket *); int selsocket(struct socket *so, int events, struct timeval *tv, struct thread *td); void soisconnected(struct socket *so); void soisconnecting(struct socket *so); void soisdisconnected(struct socket *so); void soisdisconnecting(struct 
socket *so); void socantrcvmore(struct socket *so); void socantrcvmore_locked(struct socket *so); void socantsendmore(struct socket *so); void socantsendmore_locked(struct socket *so); /* * Accept filter functions (duh). */ int accept_filt_add(struct accept_filter *filt); int accept_filt_del(char *name); struct accept_filter *accept_filt_get(char *name); #ifdef ACCEPT_FILTER_MOD #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet_accf); #endif int accept_filt_generic_mod_event(module_t mod, int event, void *data); #endif #endif /* _KERNEL */ /* * Structure to export socket from kernel to utilities, via sysctl(3). */ struct xsocket { ksize_t xso_len; /* length of this structure */ kvaddr_t xso_so; /* kernel address of struct socket */ kvaddr_t so_pcb; /* kernel address of struct inpcb */ uint64_t so_oobmark; int64_t so_spare64[8]; int32_t xso_protocol; int32_t xso_family; uint32_t so_qlen; uint32_t so_incqlen; uint32_t so_qlimit; pid_t so_pgid; uid_t so_uid; int32_t so_spare32[8]; int16_t so_type; int16_t so_options; int16_t so_linger; int16_t so_state; int16_t so_timeo; uint16_t so_error; struct xsockbuf { uint32_t sb_cc; uint32_t sb_hiwat; uint32_t sb_mbcnt; uint32_t sb_mcnt; uint32_t sb_ccnt; uint32_t sb_mbmax; int32_t sb_lowat; int32_t sb_timeo; int16_t sb_flags; } so_rcv, so_snd; }; #ifdef _KERNEL void sotoxsocket(struct socket *so, struct xsocket *xso); void sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb); #endif /* * Socket buffer state bits. Exported via libprocstat(3). */ #define SBS_CANTSENDMORE 0x0010 /* can't send more data to peer */ #define SBS_CANTRCVMORE 0x0020 /* can't receive more data from peer */ #define SBS_RCVATMARK 0x0040 /* at mark on input */ #endif /* !_SYS_SOCKETVAR_H_ */ diff --git a/tests/sys/netinet/Makefile b/tests/sys/netinet/Makefile index 95cd044f7031..56a1cf877135 100644 --- a/tests/sys/netinet/Makefile +++ b/tests/sys/netinet/Makefile @@ -1,32 +1,24 @@ # $FreeBSD$ PACKAGE= tests TESTSDIR= ${TESTSBASE}/sys/netinet BINDIR= ${TESTSDIR} ATF_TESTS_C= ip_reass_test \ so_reuseport_lb_test \ socket_afinet \ tcp_connect_port_test -ATF_TESTS_SH= carp fibs \ - fibs_test \ - redirect \ - divert \ - forward \ - output \ - lpm \ - arp \ - if_wg_test +ATF_TESTS_SH= carp fibs fibs_test redirect divert forward output lpm arp TEST_METADATA.output+= required_programs="python" PROGS= udp_dontroute tcp_user_cookie ${PACKAGE}FILES+= redirect.py ${PACKAGE}FILESMODE_redirect.py=0555 MAN= .include diff --git a/tests/sys/netinet/if_wg_test.sh b/tests/sys/netinet/if_wg_test.sh deleted file mode 100644 index b0ab70108cf4..000000000000 --- a/tests/sys/netinet/if_wg_test.sh +++ /dev/null @@ -1,188 +0,0 @@ -# $FreeBSD$ -# -# SPDX-License-Identifier: BSD-2-Clause-FreeBSD -# -# Copyright (c) 2021 The FreeBSD Foundation -# -# This software was developed by Mark Johnston under sponsorship -# from the FreeBSD Foundation. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# 1. Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# 2. Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. 
-# -# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -# SUCH DAMAGE. - -. $(atf_get_srcdir)/../common/vnet.subr - -atf_test_case "wg_basic" "cleanup" -wg_basic_head() -{ - atf_set descr 'Create a wg(4) tunnel over an epair and pass traffic between jails' - atf_set require.user root -} - -wg_basic_body() -{ - local epair pri1 pri2 pub1 pub2 wg1 wg2 - local endpoint1 endpoint2 tunnel1 tunnel2 - - kldload -n if_wg - - pri1=$(openssl rand -base64 32) - pri2=$(openssl rand -base64 32) - - endpoint1=192.168.2.1 - endpoint2=192.168.2.2 - tunnel1=169.254.0.1 - tunnel2=169.254.0.2 - - epair=$(vnet_mkepair) - - vnet_init - - vnet_mkjail wgtest1 ${epair}a - vnet_mkjail wgtest2 ${epair}b - - # Workaround for PR 254212. - jexec wgtest1 ifconfig lo0 up - jexec wgtest2 ifconfig lo0 up - - jexec wgtest1 ifconfig ${epair}a $endpoint1 up - jexec wgtest2 ifconfig ${epair}b $endpoint2 up - - wg1=$(jexec wgtest1 ifconfig wg create listen-port 12345 private-key "$pri1") - pub1=$(jexec wgtest1 ifconfig $wg1 | awk '/public-key:/ {print $2}') - wg2=$(jexec wgtest2 ifconfig wg create listen-port 12345 private-key "$pri2") - pub2=$(jexec wgtest2 ifconfig $wg2 | awk '/public-key:/ {print $2}') - - atf_check -s exit:0 -o ignore \ - jexec wgtest1 ifconfig $wg1 peer public-key "$pub2" \ - endpoint ${endpoint2}:12345 allowed-ips ${tunnel2}/32 - atf_check -s exit:0 \ - jexec wgtest1 ifconfig $wg1 inet $tunnel1 up - - atf_check -s exit:0 -o ignore \ - jexec wgtest2 ifconfig $wg2 peer public-key "$pub1" \ - endpoint ${endpoint1}:12345 allowed-ips ${tunnel1}/32 - atf_check -s exit:0 \ - jexec wgtest2 ifconfig $wg2 inet $tunnel2 up - - # Generous timeout since the handshake takes some time. - atf_check -s exit:0 -o ignore jexec wgtest1 ping -o -t 5 -i 0.25 $tunnel2 - atf_check -s exit:0 -o ignore jexec wgtest2 ping -o -t 5 -i 0.25 $tunnel1 -} - -wg_basic_cleanup() -{ - vnet_cleanup -} - -# The kernel is expected to silently ignore any attempt to add a peer with a -# public key identical to the host's.
-atf_test_case "wg_key_peerdev_shared" "cleanup" -wg_key_peerdev_shared_head() -{ - atf_set descr 'Create a wg(4) interface with a shared pubkey between device and a peer' - atf_set require.user root -} - -wg_key_peerdev_shared_body() -{ - local epair pri1 pub1 wg1 - local endpoint1 tunnel1 - - kldload -n if_wg - - pri1=$(openssl rand -base64 32) - - endpoint1=192.168.2.1 - tunnel1=169.254.0.1 - - vnet_mkjail wgtest1 - - wg1=$(jexec wgtest1 ifconfig wg create listen-port 12345 private-key "$pri1") - pub1=$(jexec wgtest1 ifconfig $wg1 | awk '/public-key:/ {print $2}') - - atf_check -s exit:0 \ - jexec wgtest1 ifconfig ${wg1} peer public-key "${pub1}" \ - allowed-ips "${tunnel1}/32" - - atf_check -o empty jexec wgtest1 ifconfig ${wg1} peers -} - -wg_key_peerdev_shared_cleanup() -{ - vnet_cleanup -} - -# When a wg(8) interface has a private key reassigned that corresponds to the -# public key already on a peer, the kernel is expected to deconfigure the peer -# to resolve the conflict. -atf_test_case "wg_key_peerdev_makeshared" "cleanup" -wg_key_peerdev_makeshared_head() -{ - atf_set descr 'Create a wg(4) interface and assign peer key to device' - atf_set require.progs wg -} - -wg_key_peerdev_makeshared_body() -{ - local epair pri1 pub1 pri2 wg1 wg2 - local endpoint1 tunnel1 - - kldload -n if_wg - - pri1=$(openssl rand -base64 32) - pri2=$(openssl rand -base64 32) - - endpoint1=192.168.2.1 - tunnel1=169.254.0.1 - - vnet_mkjail wgtest1 - - wg1=$(jexec wgtest1 ifconfig wg create listen-port 12345 private-key "$pri1") - pub1=$(jexec wgtest1 ifconfig $wg1 | awk '/public-key:/ {print $2}') - - wg2=$(jexec wgtest1 ifconfig wg create listen-port 12345 private-key "$pri2") - - atf_check -s exit:0 -o ignore \ - jexec wgtest1 ifconfig ${wg2} peer public-key "${pub1}" \ - allowed-ips "${tunnel1}/32" - - atf_check -o not-empty jexec wgtest1 ifconfig ${wg2} peers - - jexec wgtest1 sh -c "echo '${pri1}' > pri1" - - atf_check -s exit:0 \ - jexec wgtest1 wg set ${wg2} private-key pri1 - - atf_check -o empty jexec wgtest1 ifconfig ${wg2} peers -} - -wg_key_peerdev_makeshared_cleanup() -{ - vnet_cleanup -} - -atf_init_test_cases() -{ - atf_add_test_case "wg_basic" - atf_add_test_case "wg_key_peerdev_shared" - atf_add_test_case "wg_key_peerdev_makeshared" -}