Index: head/sys/amd64/include/vdso.h =================================================================== --- head/sys/amd64/include/vdso.h (nonexistent) +++ head/sys/amd64/include/vdso.h (revision 237433) @@ -0,0 +1,6 @@ +/*- + * This file is in the public domain. + */ +/* $FreeBSD$ */ + +#include Property changes on: head/sys/amd64/include/vdso.h ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: head/sys/arm/include/vdso.h =================================================================== --- head/sys/arm/include/vdso.h (nonexistent) +++ head/sys/arm/include/vdso.h (revision 237433) @@ -0,0 +1,34 @@ +/*- + * Copyright 2012 Konstantin Belousov . + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _ARM_VDSO_H +#define _ARM_VDSO_H + +#define VDSO_TIMEHANDS_MD \ + uint32_t th_res[8]; + +#endif Property changes on: head/sys/arm/include/vdso.h ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: head/sys/conf/files.arm =================================================================== --- head/sys/conf/files.arm (revision 237432) +++ head/sys/conf/files.arm (revision 237433) @@ -1,86 +1,87 @@ # $FreeBSD$ crypto/blowfish/bf_enc.c optional crypto | ipsec crypto/des/des_enc.c optional crypto | ipsec | netsmb arm/arm/autoconf.c standard arm/arm/bcopy_page.S standard arm/arm/bcopyinout.S standard arm/arm/blockio.S standard arm/arm/bootconfig.c standard arm/arm/bus_space_asm_generic.S standard arm/arm/busdma_machdep.c standard arm/arm/copystr.S standard arm/arm/cpufunc.c standard arm/arm/cpufunc_asm.S standard arm/arm/cpufunc_asm_armv4.S standard arm/arm/db_disasm.c optional ddb arm/arm/db_interface.c optional ddb arm/arm/db_trace.c optional ddb arm/arm/disassem.c optional ddb arm/arm/dump_machdep.c standard arm/arm/elf_machdep.c standard arm/arm/exception.S standard arm/arm/fiq.c standard arm/arm/fiq_subr.S standard arm/arm/fusu.S standard arm/arm/gdb_machdep.c optional gdb arm/arm/identcpu.c standard arm/arm/in_cksum.c optional inet | inet6 arm/arm/in_cksum_arm.S optional inet | inet6 arm/arm/intr.c standard arm/arm/locore.S standard no-obj arm/arm/machdep.c standard arm/arm/mem.c optional mem arm/arm/minidump_machdep.c optional mem arm/arm/nexus.c standard arm/arm/pmap.c standard arm/arm/setcpsr.S standard arm/arm/setstack.s standard arm/arm/stack_machdep.c optional ddb | stack arm/arm/support.S standard arm/arm/swtch.S standard arm/arm/sys_machdep.c standard arm/arm/trap.c standard arm/arm/uio_machdep.c standard arm/arm/undefined.c standard arm/arm/vectors.S standard arm/arm/vm_machdep.c standard arm/fpe-arm/armfpe_glue.S optional armfpe arm/fpe-arm/armfpe_init.c optional armfpe arm/fpe-arm/armfpe.S optional armfpe cddl/compat/opensolaris/kern/opensolaris_atomic.c optional zfs compile-with "${ZFS_C}" dev/hwpmc/hwpmc_arm.c optional hwpmc dev/ofw/openfirm.c optional fdt dev/ofw/openfirmio.c optional fdt dev/ofw/ofw_bus_if.m optional fdt dev/ofw/ofw_if.m optional fdt dev/ofw/ofw_bus_subr.c optional fdt dev/ofw/ofw_fdt.c optional fdt geom/geom_bsd.c optional geom_bsd geom/geom_bsd_enc.c optional geom_bsd geom/geom_mbr.c optional geom_mbr geom/geom_mbr_enc.c optional geom_mbr libkern/arm/divsi3.S standard libkern/arm/ffs.S standard libkern/arm/muldi3.c standard libkern/ashldi3.c standard libkern/ashrdi3.c standard libkern/divdi3.c standard libkern/ffsl.c standard libkern/fls.c standard libkern/flsl.c standard libkern/lshrdi3.c standard libkern/memchr.c optional fdt libkern/moddi3.c standard libkern/qdivrem.c standard libkern/ucmpdi2.c standard libkern/udivdi3.c standard libkern/umoddi3.c standard #XXX: We can't use these versions, as strcmp.c is included conf/files #libkern/arm/strcmp.S standard #libkern/arm/strncmp.S standard # +kern/subr_dummy_vdso_tc.c standard board_id.h standard \ dependency "$S/arm/conf/genboardid.awk $S/arm/conf/mach-types" \ compile-with "${AWK} -f $S/arm/conf/genboardid.awk $S/arm/conf/mach-types > board_id.h" \ no-obj no-implicit-rule before-depend \ clean "board_id.h" Index: head/sys/conf/files.ia64 =================================================================== --- head/sys/conf/files.ia64 (revision 237432) +++ head/sys/conf/files.ia64 (revision 237433) @@ -1,140 +1,141 @@ # This file tells config what files go into building a kernel, # files marked standard are always included. # # $FreeBSD$ # # The long compile-with and dependency lines are required because of # limitations in config: backslash-newline doesn't work in strings, and # dependency lines other than the first are silently ignored. # # font8x16.o optional std8x16font \ compile-with "uudecode < /usr/share/syscons/fonts/${STD8X16FONT}-8x16.fnt && file2c 'unsigned char font_16[16*256] = {' '};' < ${STD8X16FONT}-8x16 > font8x16.c && ${CC} -c ${CFLAGS} font8x16.c" \ no-implicit-rule before-depend \ clean "${STD8X16FONT}-8x16 font8x16.c" # atkbdmap.h optional atkbd_dflt_keymap \ compile-with "/usr/sbin/kbdcontrol -L ${ATKBD_DFLT_KEYMAP} | sed -e 's/^static keymap_t.* = /static keymap_t key_map = /' -e 's/^static accentmap_t.* = /static accentmap_t accent_map = /' > atkbdmap.h" \ no-obj no-implicit-rule before-depend \ clean "atkbdmap.h" # font.h optional sc_dflt_font \ compile-with "uudecode < /usr/share/syscons/fonts/${SC_DFLT_FONT}-8x16.fnt && file2c 'static u_char dflt_font_16[16*256] = {' '};' < ${SC_DFLT_FONT}-8x16 > font.h && uudecode < /usr/share/syscons/fonts/${SC_DFLT_FONT}-8x14.fnt && file2c 'static u_char dflt_font_14[14*256] = {' '};' < ${SC_DFLT_FONT}-8x14 >> font.h && uudecode < /usr/share/syscons/fonts/${SC_DFLT_FONT}-8x8.fnt && file2c 'static u_char dflt_font_8[8*256] = {' '};' < ${SC_DFLT_FONT}-8x8 >> font.h" \ no-obj no-implicit-rule before-depend \ clean "font.h ${SC_DFLT_FONT}-8x14 ${SC_DFLT_FONT}-8x16 ${SC_DFLT_FONT}-8x8" # ukbdmap.h optional ukbd_dflt_keymap \ compile-with "/usr/sbin/kbdcontrol -L ${UKBD_DFLT_KEYMAP} | sed -e 's/^static keymap_t.* = /static keymap_t key_map = /' -e 's/^static accentmap_t.* = /static accentmap_t accent_map = /' > ukbdmap.h" \ no-obj no-implicit-rule before-depend \ clean "ukbdmap.h" # cddl/contrib/opensolaris/common/atomic/ia64/opensolaris_atomic.S optional zfs compile-with "${ZFS_S}" compat/freebsd32/freebsd32_ioctl.c optional compat_freebsd32 compat/freebsd32/freebsd32_misc.c optional compat_freebsd32 compat/freebsd32/freebsd32_syscalls.c optional compat_freebsd32 compat/freebsd32/freebsd32_sysent.c optional compat_freebsd32 compat/ia32/ia32_sysvec.c optional compat_freebsd32 contrib/ia64/libuwx/src/uwx_bstream.c standard contrib/ia64/libuwx/src/uwx_context.c standard contrib/ia64/libuwx/src/uwx_env.c standard contrib/ia64/libuwx/src/uwx_scoreboard.c standard contrib/ia64/libuwx/src/uwx_step.c standard contrib/ia64/libuwx/src/uwx_str.c standard contrib/ia64/libuwx/src/uwx_swap.c standard contrib/ia64/libuwx/src/uwx_trace.c standard contrib/ia64/libuwx/src/uwx_uinfo.c standard contrib/ia64/libuwx/src/uwx_utable.c standard crypto/blowfish/bf_enc.c optional crypto | ipsec crypto/des/des_enc.c optional crypto | ipsec | netsmb dev/atkbdc/atkbd.c optional atkbd atkbdc dev/atkbdc/atkbd_atkbdc.c optional atkbd atkbdc dev/atkbdc/atkbdc.c optional atkbdc dev/atkbdc/atkbdc_isa.c optional atkbdc isa dev/atkbdc/atkbdc_subr.c optional atkbdc dev/atkbdc/psm.c optional psm atkbdc dev/fb/fb.c optional fb | vga dev/fb/vga.c optional vga dev/hwpmc/hwpmc_ia64.c optional hwpmc dev/io/iodev.c optional io dev/kbd/kbd.c optional atkbd | sc | ukbd dev/syscons/scterm-teken.c optional sc dev/syscons/scvgarndr.c optional sc vga dev/syscons/scvtb.c optional sc dev/uart/uart_cpu_ia64.c optional uart dev/acpica/acpi_if.m standard ia64/acpica/OsdEnvironment.c optional acpi ia64/acpica/acpi_machdep.c optional acpi ia64/acpica/acpi_wakeup.c optional acpi ia64/acpica/madt.c optional acpi ia64/disasm/disasm_decode.c standard ia64/disasm/disasm_extract.c standard ia64/disasm/disasm_format.c standard ia64/ia32/ia32_misc.c optional compat_freebsd32 ia64/ia32/ia32_reg.c optional compat_freebsd32 ia64/ia32/ia32_signal.c optional compat_freebsd32 ia64/ia32/ia32_trap.c optional compat_freebsd32 ia64/ia64/autoconf.c standard ia64/ia64/bus_machdep.c standard ia64/ia64/busdma_machdep.c standard ia64/ia64/clock.c standard ia64/ia64/context.S standard ia64/ia64/db_machdep.c optional ddb ia64/ia64/dump_machdep.c standard ia64/ia64/efi.c standard ia64/ia64/elf_machdep.c standard ia64/ia64/emulate.c standard ia64/ia64/exception.S standard ia64/ia64/gdb_machdep.c optional gdb ia64/ia64/highfp.c standard ia64/ia64/in_cksum.c optional inet | inet6 ia64/ia64/interrupt.c standard ia64/ia64/iodev_machdep.c optional io ia64/ia64/locore.S standard no-obj ia64/ia64/machdep.c standard ia64/ia64/mca.c standard ia64/ia64/mem.c optional mem ia64/ia64/mp_locore.S optional smp ia64/ia64/mp_machdep.c optional smp ia64/ia64/nexus.c standard ia64/ia64/pal.S standard ia64/ia64/physical.S standard ia64/ia64/pmap.c standard ia64/ia64/ptrace_machdep.c standard ia64/ia64/sal.c standard ia64/ia64/sapic.c standard ia64/ia64/setjmp.S standard ia64/ia64/ssc.c optional ski ia64/ia64/sscdisk.c optional ski ia64/ia64/stack_machdep.c optional ddb | stack ia64/ia64/support.S standard ia64/ia64/sys_machdep.c standard ia64/ia64/syscall.S standard ia64/ia64/trap.c standard ia64/ia64/uio_machdep.c standard ia64/ia64/uma_machdep.c standard ia64/ia64/unaligned.c standard ia64/ia64/unwind.c standard ia64/ia64/vm_machdep.c standard ia64/isa/isa.c optional isa ia64/isa/isa_dma.c optional isa ia64/pci/pci_cfgreg.c optional pci isa/syscons_isa.c optional sc isa/vga_isa.c optional vga kern/imgact_elf32.c optional compat_freebsd32 kern/kern_clocksource.c standard libkern/bcmp.c standard libkern/ffsl.c standard libkern/fls.c standard libkern/flsl.c standard libkern/ia64/__divdi3.S standard libkern/ia64/__divsi3.S standard libkern/ia64/__moddi3.S standard libkern/ia64/__modsi3.S standard libkern/ia64/__udivdi3.S standard libkern/ia64/__udivsi3.S standard libkern/ia64/__umoddi3.S standard libkern/ia64/__umodsi3.S standard libkern/ia64/bswap16.S standard libkern/ia64/bswap32.S standard libkern/memmove.c standard libkern/memset.c standard +kern/subr_dummy_vdso_tc.c standard Index: head/sys/conf/files.mips =================================================================== --- head/sys/conf/files.mips (revision 237432) +++ head/sys/conf/files.mips (revision 237433) @@ -1,123 +1,124 @@ # This file tells config what files go into building a kernel, # files marked standard are always included. # # Copyright (c) 2001, 2004-2005, Juniper Networks, Inc. # All rights reserved. # JNPR: files.mips,v 1.11 2007/08/09 12:25:35 katta # # ---------------------------------------------------------------------- # Phase 2 # ---------------------------------------------------------------------- # This file tells config what files go into building a kernel, # files marked standard are always included. # # Copyright (c) 2001, 2004-2005, Juniper Networks, Inc. # All rights reserved. # JNPR: files.mips,v 1.11 2007/08/09 12:25:35 katta # $FreeBSD$ # # ---------------------------------------------------------------------- # Phase 2 # ---------------------------------------------------------------------- mips/mips/machdep.c standard mips/mips/mp_machdep.c optional smp mips/mips/mpboot.S optional smp # ---------------------------------------------------------------------- # Phase 3 # ---------------------------------------------------------------------- mips/mips/autoconf.c standard mips/mips/cpu.c standard mips/mips/elf_machdep.c standard mips/mips/exception.S standard mips/mips/gdb_machdep.c standard mips/mips/pmap.c standard mips/mips/trap.c standard mips/mips/vm_machdep.c standard # ---------------------------------------------------------------------- # Phase 4 # ---------------------------------------------------------------------- # ---------------------------------------------------------------------- # Phase 5 # ---------------------------------------------------------------------- mips/mips/fp.S standard mips/mips/pm_machdep.c standard mips/mips/swtch.S standard mips/mips/tlb.c standard mips/mips/bus_space_generic.c standard mips/mips/busdma_machdep.c standard mips/mips/cache.c standard mips/mips/cache_mipsNN.c standard mips/mips/db_disasm.c optional ddb mips/mips/db_interface.c optional ddb mips/mips/db_trace.c optional ddb mips/mips/dump_machdep.c standard mips/mips/in_cksum.c optional inet | inet6 mips/mips/locore.S standard no-obj mips/mips/minidump_machdep.c standard mips/mips/mem.c optional mem mips/mips/libkern_machdep.c standard mips/mips/nexus.c standard mips/mips/stack_machdep.c optional ddb | stack mips/mips/support.S standard mips/mips/sys_machdep.c standard mips/mips/swtch.S standard mips/mips/uio_machdep.c standard mips/mips/uma_machdep.c standard crypto/blowfish/bf_enc.c optional crypto | ipsec crypto/des/des_enc.c optional crypto | ipsec | netsmb geom/geom_bsd.c optional geom_bsd geom/geom_bsd_enc.c optional geom_bsd geom/geom_mbr.c optional geom_mbr geom/geom_mbr_enc.c optional geom_mbr libkern/ashldi3.c standard libkern/ashrdi3.c standard libkern/cmpdi2.c standard libkern/ffsl.c standard libkern/fls.c standard libkern/flsl.c standard libkern/lshrdi3.c standard libkern/memchr.c optional fdt libkern/memmove.c standard libkern/ucmpdi2.c standard #XXX: We can't use these versions, as strcmp.c is included conf/files #libkern/mips/strcmp.S standard #libkern/mips/strncmp.S standard cddl/compat/opensolaris/kern/opensolaris_atomic.c optional zfs compile-with "${ZFS_C}" compat/freebsd32/freebsd32_ioctl.c optional compat_freebsd32 compat/freebsd32/freebsd32_misc.c optional compat_freebsd32 compat/freebsd32/freebsd32_syscalls.c optional compat_freebsd32 compat/freebsd32/freebsd32_sysent.c optional compat_freebsd32 kern/imgact_elf32.c optional compat_freebsd32 +kern/subr_dummy_vdso_tc.c standard mips/mips/freebsd32_machdep.c optional compat_freebsd32 kern/kern_clocksource.c standard kern/link_elf_obj.c standard dev/cfe/cfe_api.c optional cfe dev/cfe/cfe_console.c optional cfe_console dev/cfe/cfe_env.c optional cfe_env #dev/cfe/cfe_resource.c optional cfe # not yet needed dev/siba/siba.c optional siba dev/siba/siba_cc.c optional siba dev/siba/siba_core.c optional siba dev/siba/siba_pcib.c optional siba pci #mips/sentry5/siba_mips.c optional siba # not yet dev/hwpmc/hwpmc_mips.c optional hwpmc dev/rt/if_rt.c optional rt dev/nvram2env/nvram2env.c optional nvram2env dev/ofw/openfirm.c optional fdt dev/ofw/openfirmio.c optional fdt dev/ofw/ofw_bus_if.m optional fdt dev/ofw/ofw_if.m optional fdt dev/ofw/ofw_bus_subr.c optional fdt dev/ofw/ofw_fdt.c optional fdt dev/fdt/fdt_mips.c optional fdt Index: head/sys/conf/files.powerpc =================================================================== --- head/sys/conf/files.powerpc (revision 237432) +++ head/sys/conf/files.powerpc (revision 237433) @@ -1,232 +1,233 @@ # This file tells config what files go into building a kernel, # files marked standard are always included. # # $FreeBSD$ # # The long compile-with and dependency lines are required because of # limitations in config: backslash-newline doesn't work in strings, and # dependency lines other than the first are silently ignored. # # font.h optional sc \ compile-with "uudecode < /usr/share/syscons/fonts/${SC_DFLT_FONT}-8x16.fnt && file2c 'u_char dflt_font_16[16*256] = {' '};' < ${SC_DFLT_FONT}-8x16 > font.h && uudecode < /usr/share/syscons/fonts/${SC_DFLT_FONT}-8x14.fnt && file2c 'u_char dflt_font_14[14*256] = {' '};' < ${SC_DFLT_FONT}-8x14 >> font.h && uudecode < /usr/share/syscons/fonts/${SC_DFLT_FONT}-8x8.fnt && file2c 'u_char dflt_font_8[8*256] = {' '};' < ${SC_DFLT_FONT}-8x8 >> font.h" \ no-obj no-implicit-rule before-depend \ clean "font.h ${SC_DFLT_FONT}-8x14 ${SC_DFLT_FONT}-8x16 ${SC_DFLT_FONT}-8x8" # # There is only an asm version on ppc64. cddl/compat/opensolaris/kern/opensolaris_atomic.c optional zfs powerpc compile-with "${ZFS_C}" cddl/contrib/opensolaris/common/atomic/powerpc64/opensolaris_atomic.S optional zfs powerpc64 compile-with "${ZFS_S}" crypto/blowfish/bf_enc.c optional crypto | ipsec crypto/des/des_enc.c optional crypto | ipsec | netsmb dev/bm/if_bm.c optional bm powermac dev/adb/adb_bus.c optional adb dev/adb/adb_kbd.c optional adb dev/adb/adb_mouse.c optional adb dev/adb/adb_hb_if.m optional adb dev/adb/adb_if.m optional adb dev/adb/adb_buttons.c optional adb dev/agp/agp_apple.c optional agp powermac dev/cfi/cfi_bus_fdt.c optional cfi fdt dev/fb/fb.c optional sc dev/fdt/fdt_powerpc.c optional fdt dev/hwpmc/hwpmc_powerpc.c optional hwpmc dev/iicbus/ad7417.c optional ad7417 powermac dev/iicbus/ds1775.c optional ds1775 powermac dev/iicbus/max6690.c optional max6690 powermac dev/kbd/kbd.c optional sc dev/ofw/openfirm.c optional aim | fdt dev/ofw/openfirmio.c optional aim | fdt dev/ofw/ofw_bus_if.m optional aim | fdt dev/ofw/ofw_if.m optional aim | fdt dev/ofw/ofw_bus_subr.c optional aim | fdt dev/ofw/ofw_console.c optional aim dev/ofw/ofw_disk.c optional ofwd aim dev/ofw/ofw_fdt.c optional fdt dev/ofw/ofw_iicbus.c optional iicbus aim dev/ofw/ofw_standard.c optional aim powerpc dev/powermac_nvram/powermac_nvram.c optional powermac_nvram powermac dev/quicc/quicc_bfe_fdt.c optional quicc mpc85xx dev/scc/scc_bfe_macio.c optional scc powermac dev/sec/sec.c optional sec mpc85xx dev/sound/macio/aoa.c optional snd_davbus | snd_ai2s powermac dev/sound/macio/davbus.c optional snd_davbus powermac dev/sound/macio/i2s.c optional snd_ai2s powermac dev/sound/macio/snapper.c optional snd_ai2s iicbus powermac dev/sound/macio/tumbler.c optional snd_ai2s iicbus powermac dev/syscons/scgfbrndr.c optional sc dev/syscons/scterm-teken.c optional sc dev/syscons/scvtb.c optional sc dev/tsec/if_tsec.c optional tsec dev/tsec/if_tsec_fdt.c optional tsec fdt dev/uart/uart_cpu_powerpc.c optional uart aim kern/kern_clocksource.c standard +kern/subr_dummy_vdso_tc.c standard kern/syscalls.c optional ktr libkern/ashldi3.c optional powerpc libkern/ashrdi3.c optional powerpc libkern/bcmp.c standard libkern/cmpdi2.c optional powerpc libkern/divdi3.c optional powerpc libkern/ffs.c standard libkern/ffsl.c standard libkern/fls.c standard libkern/flsl.c standard libkern/lshrdi3.c optional powerpc libkern/memchr.c optional fdt libkern/memmove.c standard libkern/memset.c standard libkern/moddi3.c optional powerpc libkern/qdivrem.c optional powerpc libkern/ucmpdi2.c optional powerpc libkern/udivdi3.c optional powerpc libkern/umoddi3.c optional powerpc powerpc/aim/clock.c optional aim powerpc/aim/copyinout.c optional aim powerpc/aim/interrupt.c optional aim powerpc/aim/locore.S optional aim no-obj powerpc/aim/machdep.c optional aim powerpc/aim/mmu_oea.c optional aim powerpc powerpc/aim/mmu_oea64.c optional aim powerpc/aim/moea64_if.m optional aim powerpc/aim/moea64_native.c optional aim powerpc/aim/mp_cpudep.c optional aim smp powerpc/aim/nexus.c optional aim powerpc/aim/slb.c optional aim powerpc64 powerpc/aim/swtch32.S optional aim powerpc powerpc/aim/swtch64.S optional aim powerpc64 powerpc/aim/trap.c optional aim powerpc/aim/uma_machdep.c optional aim powerpc/aim/vm_machdep.c optional aim powerpc/booke/clock.c optional booke powerpc/booke/copyinout.c optional booke powerpc/booke/interrupt.c optional booke powerpc/booke/locore.S optional booke no-obj powerpc/booke/machdep.c optional booke powerpc/booke/machdep_e500.c optional booke_e500 powerpc/booke/mp_cpudep.c optional booke smp powerpc/booke/platform_bare.c optional mpc85xx powerpc/booke/pmap.c optional booke powerpc/booke/swtch.S optional booke powerpc/booke/trap.c optional booke powerpc/booke/vm_machdep.c optional booke powerpc/cpufreq/dfs.c optional cpufreq powerpc/cpufreq/pcr.c optional cpufreq aim powerpc/fpu/fpu_add.c optional fpu_emu powerpc powerpc/fpu/fpu_compare.c optional fpu_emu powerpc powerpc/fpu/fpu_div.c optional fpu_emu powerpc powerpc/fpu/fpu_emu.c optional fpu_emu powerpc powerpc/fpu/fpu_explode.c optional fpu_emu powerpc powerpc/fpu/fpu_implode.c optional fpu_emu powerpc powerpc/fpu/fpu_mul.c optional fpu_emu powerpc powerpc/fpu/fpu_sqrt.c optional fpu_emu powerpc powerpc/fpu/fpu_subr.c optional fpu_emu powerpc powerpc/mambo/mambocall.S optional mambo powerpc/mambo/mambo.c optional mambo powerpc/mambo/mambo_console.c optional mambo powerpc/mambo/mambo_disk.c optional mambo powerpc/mambo/mambo_openpic.c optional mambo powerpc/mpc85xx/atpic.c optional mpc85xx isa powerpc/mpc85xx/ds1553_bus_fdt.c optional ds1553 fdt powerpc/mpc85xx/ds1553_core.c optional ds1553 powerpc/mpc85xx/i2c.c optional iicbus fdt powerpc/mpc85xx/isa.c optional mpc85xx isa powerpc/mpc85xx/lbc.c optional mpc85xx powerpc/mpc85xx/mpc85xx.c optional mpc85xx powerpc/mpc85xx/nexus.c optional mpc85xx powerpc/mpc85xx/pci_fdt.c optional pci mpc85xx powerpc/ofw/ofw_cpu.c optional aim powerpc/ofw/ofw_machdep.c optional aim powerpc/ofw/ofw_pci.c optional pci aim powerpc/ofw/ofw_pcibus.c optional pci aim powerpc/ofw/ofw_pcib_pci.c optional pci aim powerpc/ofw/ofw_real.c optional aim powerpc/ofw/ofw_syscons.c optional sc aim powerpc/ofw/ofwcall32.S optional aim powerpc powerpc/ofw/ofwcall64.S optional aim powerpc64 powerpc/ofw/ofwmagic.S optional aim powerpc/ofw/rtas.c optional aim powerpc/powermac/ata_kauai.c optional powermac ata | powermac atamacio powerpc/powermac/ata_macio.c optional powermac ata | powermac atamacio powerpc/powermac/ata_dbdma.c optional powermac ata | powermac atamacio powerpc/powermac/atibl.c optional powermac atibl powerpc/powermac/cuda.c optional powermac cuda powerpc/powermac/cpcht.c optional powermac pci powerpc/powermac/dbdma.c optional powermac pci powerpc/powermac/fcu.c optional powermac fcu powerpc/powermac/grackle.c optional powermac pci powerpc/powermac/hrowpic.c optional powermac pci powerpc/powermac/kiic.c optional powermac kiic powerpc/powermac/macgpio.c optional powermac pci powerpc/powermac/macio.c optional powermac pci powerpc/powermac/openpic_macio.c optional powermac pci powerpc/powermac/platform_powermac.c optional powermac powerpc/powermac/powermac_thermal.c optional powermac powerpc/powermac/pswitch.c optional powermac pswitch powerpc/powermac/pmu.c optional powermac pmu powerpc/powermac/smu.c optional powermac smu powerpc/powermac/smusat.c optional powermac smu powerpc/powermac/uninorth.c optional powermac powerpc/powermac/uninorthpci.c optional powermac pci powerpc/powermac/vcoregpio.c optional powermac powerpc/powermac/windtunnel.c optional powermac windtunnel powerpc/powerpc/altivec.c optional aim powerpc/powerpc/autoconf.c standard powerpc/powerpc/bcopy.c standard powerpc/powerpc/bus_machdep.c standard powerpc/powerpc/busdma_machdep.c standard powerpc/powerpc/copystr.c standard powerpc/powerpc/cpu.c standard powerpc/powerpc/db_disasm.c optional ddb powerpc/powerpc/db_hwwatch.c optional ddb powerpc/powerpc/db_interface.c optional ddb powerpc/powerpc/db_trace.c optional ddb powerpc/powerpc/dump_machdep.c standard powerpc/powerpc/elf32_machdep.c optional powerpc | compat_freebsd32 powerpc/powerpc/elf64_machdep.c optional powerpc64 powerpc/powerpc/exec_machdep.c standard powerpc/powerpc/fpu.c optional aim powerpc/powerpc/fuswintr.c standard powerpc/powerpc/gdb_machdep.c optional gdb powerpc/powerpc/in_cksum.c optional inet | inet6 powerpc/powerpc/intr_machdep.c standard powerpc/powerpc/iommu_if.m standard powerpc/powerpc/mem.c optional mem powerpc/powerpc/mmu_if.m standard powerpc/powerpc/mp_machdep.c optional smp powerpc/powerpc/openpic.c standard powerpc/powerpc/openpic_fdt.c optional fdt powerpc/powerpc/pic_if.m standard powerpc/powerpc/pmap_dispatch.c standard powerpc/powerpc/platform.c standard powerpc/powerpc/platform_if.m standard powerpc/powerpc/sc_machdep.c optional sc powerpc/powerpc/setjmp.S standard powerpc/powerpc/sigcode32.S optional powerpc | compat_freebsd32 powerpc/powerpc/sigcode64.S optional powerpc64 powerpc/powerpc/stack_machdep.c optional ddb | stack powerpc/powerpc/suswintr.c standard powerpc/powerpc/syncicache.c standard powerpc/powerpc/sys_machdep.c standard powerpc/powerpc/uio_machdep.c standard powerpc/ps3/ehci_ps3.c optional ps3 ehci powerpc/ps3/ohci_ps3.c optional ps3 ohci powerpc/ps3/if_glc.c optional ps3 glc powerpc/ps3/mmu_ps3.c optional ps3 powerpc/ps3/platform_ps3.c optional ps3 powerpc/ps3/ps3bus.c optional ps3 powerpc/ps3/ps3cdrom.c optional ps3 scbus powerpc/ps3/ps3disk.c optional ps3 powerpc/ps3/ps3pic.c optional ps3 powerpc/ps3/ps3_syscons.c optional ps3 sc powerpc/ps3/ps3-hvcall.S optional ps3 sc powerpc/psim/iobus.c optional psim powerpc/psim/ata_iobus.c optional ata psim powerpc/psim/openpic_iobus.c optional psim powerpc/psim/uart_iobus.c optional uart psim compat/freebsd32/freebsd32_ioctl.c optional compat_freebsd32 compat/freebsd32/freebsd32_misc.c optional compat_freebsd32 compat/freebsd32/freebsd32_syscalls.c optional compat_freebsd32 compat/freebsd32/freebsd32_sysent.c optional compat_freebsd32 kern/imgact_elf32.c optional compat_freebsd32 Index: head/sys/conf/files.sparc64 =================================================================== --- head/sys/conf/files.sparc64 (revision 237432) +++ head/sys/conf/files.sparc64 (revision 237433) @@ -1,145 +1,146 @@ # This file tells config what files go into building a kernel, # files marked standard are always included. # # $FreeBSD$ # # The long compile-with and dependency lines are required because of # limitations in config: backslash-newline doesn't work in strings, and # dependency lines other than the first are silently ignored. # atkbdmap.h optional atkbd_dflt_keymap \ compile-with "/usr/sbin/kbdcontrol -L ${ATKBD_DFLT_KEYMAP} | sed -e 's/^static keymap_t.* = /static keymap_t key_map = /' -e 's/^static accentmap_t.* = /static accentmap_t accent_map = /' > atkbdmap.h" \ no-obj no-implicit-rule before-depend \ clean "atkbdmap.h" # sunkbdmap.h optional sunkbd_dflt_keymap \ compile-with "/usr/sbin/kbdcontrol -L ${SUNKBD_DFLT_KEYMAP} | sed -e 's/^static keymap_t.* = /static keymap_t key_map = /' -e 's/^static accentmap_t.* = /static accentmap_t accent_map = /' > sunkbdmap.h" \ no-obj no-implicit-rule before-depend \ clean "sunkbdmap.h" # ukbdmap.h optional ukbd_dflt_keymap \ compile-with "/usr/sbin/kbdcontrol -L ${UKBD_DFLT_KEYMAP} | sed -e 's/^static keymap_t.* = /static keymap_t key_map = /' -e 's/^static accentmap_t.* = /static accentmap_t accent_map = /' > ukbdmap.h" \ no-obj no-implicit-rule before-depend \ clean "ukbdmap.h" # cddl/contrib/opensolaris/common/atomic/sparc64/opensolaris_atomic.S optional zfs compile-with "${ZFS_S}" crypto/blowfish/bf_enc.c optional crypto | ipsec crypto/des/des_enc.c optional crypto | ipsec | netsmb dev/atkbdc/atkbd.c optional atkbd atkbdc dev/atkbdc/atkbd_atkbdc.c optional atkbd atkbdc dev/atkbdc/atkbdc.c optional atkbdc dev/atkbdc/atkbdc_ebus.c optional atkbdc ebus dev/atkbdc/atkbdc_isa.c optional atkbdc isa dev/atkbdc/atkbdc_subr.c optional atkbdc dev/atkbdc/psm.c optional psm atkbdc dev/auxio/auxio.c optional auxio sbus | auxio ebus dev/esp/esp_sbus.c optional esp sbus dev/fb/creator.c optional creator sc dev/fb/fb.c optional sc dev/fb/gallant12x22.c optional sc dev/fb/machfb.c optional machfb sc dev/hwpmc/hwpmc_sparc64.c optional hwpmc dev/kbd/kbd.c optional atkbd | sc | ukbd dev/le/if_le_lebuffer.c optional le sbus dev/le/if_le_ledma.c optional le sbus dev/le/lebuffer_sbus.c optional le sbus dev/ofw/ofw_bus_if.m standard dev/ofw/ofw_bus_subr.c standard dev/ofw/ofw_console.c optional ofw_console dev/ofw/ofw_if.m standard dev/ofw/ofw_standard.c standard dev/ofw/openfirm.c standard dev/ofw/openfirmio.c standard dev/ofw/openpromio.c standard dev/pcf/envctrl.c optional pcf ebus dev/pcf/pcf_ebus.c optional pcf ebus dev/sound/sbus/cs4231.c optional snd_audiocs ebus | \ snd_audiocs sbus dev/syscons/scgfbrndr.c optional sc dev/syscons/scterm-teken.c optional sc dev/syscons/scvtb.c optional sc dev/uart/uart_cpu_sparc64.c optional uart dev/uart/uart_kbd_sun.c optional uart sc kern/kern_clocksource.c standard +kern/subr_dummy_vdso_tc.c standard kern/syscalls.c optional ktr libkern/ffs.c standard libkern/ffsl.c standard libkern/fls.c standard libkern/flsl.c standard libkern/memmove.c standard sparc64/central/central.c optional central sparc64/ebus/ebus.c optional ebus sparc64/ebus/epic.c optional epic ebus sparc64/fhc/clkbrd.c optional fhc sparc64/fhc/fhc.c optional fhc sparc64/isa/isa.c optional isa sparc64/isa/isa_dma.c optional isa sparc64/isa/ofw_isa.c optional ebus | isa sparc64/pci/apb.c optional pci sparc64/pci/fire.c optional pci sparc64/pci/ofw_pcib.c optional pci sparc64/pci/ofw_pcib_subr.c optional pci sparc64/pci/ofw_pcibus.c optional pci sparc64/pci/ofw_pci_if.m optional pci sparc64/pci/psycho.c optional pci sparc64/pci/sbbc.c optional sbbc uart sparc64/pci/schizo.c optional pci sparc64/sbus/dma_sbus.c optional sbus sparc64/sbus/sbus.c optional sbus sparc64/sbus/lsi64854.c optional sbus sparc64/sparc64/ata_machdep.c optional ada | atadisk | da sparc64/sparc64/autoconf.c standard sparc64/sparc64/bus_machdep.c standard sparc64/sparc64/cache.c standard sparc64/sparc64/cam_machdep.c optional scbus sparc64/sparc64/cheetah.c standard sparc64/sparc64/clock.c standard sparc64/sparc64/counter.c standard sparc64/sparc64/db_disasm.c optional ddb sparc64/sparc64/db_interface.c optional ddb sparc64/sparc64/db_trace.c optional ddb sparc64/sparc64/db_hwwatch.c optional ddb sparc64/sparc64/dump_machdep.c standard sparc64/sparc64/elf_machdep.c standard sparc64/sparc64/exception.S standard no-obj \ compile-with "${NORMAL_S} -mcpu=ultrasparc" sparc64/sparc64/eeprom.c optional eeprom ebus | eeprom fhc | \ eeprom sbus sparc64/sparc64/gdb_machdep.c optional gdb sparc64/sparc64/identcpu.c standard sparc64/sparc64/in_cksum.c optional inet | inet6 sparc64/sparc64/interrupt.S standard no-obj \ compile-with "${NORMAL_S} -mcpu=ultrasparc" sparc64/sparc64/intr_machdep.c standard sparc64/sparc64/iommu.c standard sparc64/sparc64/jbusppm.c standard sparc64/sparc64/locore.S standard no-obj sparc64/sparc64/machdep.c standard sparc64/sparc64/mem.c optional mem sparc64/sparc64/mp_exception.S optional smp \ compile-with "${NORMAL_S} -mcpu=ultrasparc" sparc64/sparc64/mp_locore.S optional smp sparc64/sparc64/mp_machdep.c optional smp sparc64/sparc64/nexus.c standard sparc64/sparc64/ofw_machdep.c standard sparc64/sparc64/pmap.c standard sparc64/sparc64/prof_machdep.c optional profiling-routine sparc64/sparc64/rtc.c optional rtc ebus | rtc isa sparc64/sparc64/rwindow.c standard sparc64/sparc64/sc_machdep.c optional sc sparc64/sparc64/schppm.c standard sparc64/sparc64/spitfire.c standard sparc64/sparc64/ssm.c standard sparc64/sparc64/stack_machdep.c optional ddb | stack sparc64/sparc64/support.S standard \ compile-with "${NORMAL_S} -mcpu=ultrasparc" sparc64/sparc64/sys_machdep.c standard sparc64/sparc64/swtch.S standard sparc64/sparc64/tick.c standard sparc64/sparc64/tlb.c standard sparc64/sparc64/trap.c standard sparc64/sparc64/tsb.c standard sparc64/sparc64/uio_machdep.c standard sparc64/sparc64/upa.c optional creator sparc64/sparc64/vm_machdep.c standard sparc64/sparc64/zeus.c standard Index: head/sys/i386/include/vdso.h =================================================================== --- head/sys/i386/include/vdso.h (nonexistent) +++ head/sys/i386/include/vdso.h (revision 237433) @@ -0,0 +1,6 @@ +/*- + * This file is in the public domain. + */ +/* $FreeBSD$ */ + +#include Property changes on: head/sys/i386/include/vdso.h ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: head/sys/ia64/include/vdso.h =================================================================== --- head/sys/ia64/include/vdso.h (nonexistent) +++ head/sys/ia64/include/vdso.h (revision 237433) @@ -0,0 +1,41 @@ +/*- + * Copyright 2012 Konstantin Belousov . + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IA64_VDSO_H +#define _IA64_VDSO_H + +#define VDSO_TIMEHANDS_MD \ + uint32_t th_res[8]; + +#ifdef _KERNEL +#ifdef COMPAT_FREEBSD32 + +#define VDSO_TIMEHANDS_MD32 VDSO_TIMEHANDS_MD + +#endif +#endif +#endif Property changes on: head/sys/ia64/include/vdso.h ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: head/sys/kern/imgact_elf.c =================================================================== --- head/sys/kern/imgact_elf.c (revision 237432) +++ head/sys/kern/imgact_elf.c (revision 237433) @@ -1,1711 +1,1715 @@ /*- * Copyright (c) 2000 David O'Brien * Copyright (c) 1995-1996 Søren Schmidt * Copyright (c) 1996 Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_compat.h" #include "opt_core.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define OLD_EI_BRAND 8 static int __elfN(check_header)(const Elf_Ehdr *hdr); static Elf_Brandinfo *__elfN(get_brandinfo)(struct image_params *imgp, const char *interp, int32_t *osrel); static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr, u_long *entry, size_t pagesize); static int __elfN(load_section)(struct image_params *imgp, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot, size_t pagesize); static int __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp); static boolean_t __elfN(freebsd_trans_osrel)(const Elf_Note *note, int32_t *osrel); static boolean_t kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel); static boolean_t __elfN(check_note)(struct image_params *imgp, Elf_Brandnote *checknote, int32_t *osrel); static vm_prot_t __elfN(trans_prot)(Elf_Word); static Elf_Word __elfN(untrans_prot)(vm_prot_t); SYSCTL_NODE(_kern, OID_AUTO, __CONCAT(elf, __ELF_WORD_SIZE), CTLFLAG_RW, 0, ""); #ifdef COMPRESS_USER_CORES static int compress_core(gzFile, char *, char *, unsigned int, struct thread * td); #define CORE_BUF_SIZE (16 * 1024) #endif int __elfN(fallback_brand) = -1; SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, fallback_brand, CTLFLAG_RW, &__elfN(fallback_brand), 0, __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) " brand of last resort"); TUNABLE_INT("kern.elf" __XSTRING(__ELF_WORD_SIZE) ".fallback_brand", &__elfN(fallback_brand)); static int elf_legacy_coredump = 0; SYSCTL_INT(_debug, OID_AUTO, __elfN(legacy_coredump), CTLFLAG_RW, &elf_legacy_coredump, 0, ""); int __elfN(nxstack) = #if defined(__amd64__) || defined(__powerpc64__) /* both 64 and 32 bit */ 1; #else 0; #endif SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, nxstack, CTLFLAG_RW, &__elfN(nxstack), 0, __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) ": enable non-executable stack"); #if __ELF_WORD_SIZE == 32 #if defined(__amd64__) || defined(__ia64__) int i386_read_exec = 0; SYSCTL_INT(_kern_elf32, OID_AUTO, read_exec, CTLFLAG_RW, &i386_read_exec, 0, "enable execution from readable segments"); #endif #endif static Elf_Brandinfo *elf_brand_list[MAX_BRANDS]; #define trunc_page_ps(va, ps) ((va) & ~(ps - 1)) #define round_page_ps(va, ps) (((va) + (ps - 1)) & ~(ps - 1)) #define aligned(a, t) (trunc_page_ps((u_long)(a), sizeof(t)) == (u_long)(a)) static const char FREEBSD_ABI_VENDOR[] = "FreeBSD"; Elf_Brandnote __elfN(freebsd_brandnote) = { .hdr.n_namesz = sizeof(FREEBSD_ABI_VENDOR), .hdr.n_descsz = sizeof(int32_t), .hdr.n_type = 1, .vendor = FREEBSD_ABI_VENDOR, .flags = BN_TRANSLATE_OSREL, .trans_osrel = __elfN(freebsd_trans_osrel) }; static boolean_t __elfN(freebsd_trans_osrel)(const Elf_Note *note, int32_t *osrel) { uintptr_t p; p = (uintptr_t)(note + 1); p += roundup2(note->n_namesz, sizeof(Elf32_Addr)); *osrel = *(const int32_t *)(p); return (TRUE); } static const char GNU_ABI_VENDOR[] = "GNU"; static int GNU_KFREEBSD_ABI_DESC = 3; Elf_Brandnote __elfN(kfreebsd_brandnote) = { .hdr.n_namesz = sizeof(GNU_ABI_VENDOR), .hdr.n_descsz = 16, /* XXX at least 16 */ .hdr.n_type = 1, .vendor = GNU_ABI_VENDOR, .flags = BN_TRANSLATE_OSREL, .trans_osrel = kfreebsd_trans_osrel }; static boolean_t kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel) { const Elf32_Word *desc; uintptr_t p; p = (uintptr_t)(note + 1); p += roundup2(note->n_namesz, sizeof(Elf32_Addr)); desc = (const Elf32_Word *)p; if (desc[0] != GNU_KFREEBSD_ABI_DESC) return (FALSE); /* * Debian GNU/kFreeBSD embed the earliest compatible kernel version * (__FreeBSD_version: Rxx) in the LSB way. */ *osrel = desc[1] * 100000 + desc[2] * 1000 + desc[3]; return (TRUE); } int __elfN(insert_brand_entry)(Elf_Brandinfo *entry) { int i; for (i = 0; i < MAX_BRANDS; i++) { if (elf_brand_list[i] == NULL) { elf_brand_list[i] = entry; break; } } if (i == MAX_BRANDS) { printf("WARNING: %s: could not insert brandinfo entry: %p\n", __func__, entry); return (-1); } return (0); } int __elfN(remove_brand_entry)(Elf_Brandinfo *entry) { int i; for (i = 0; i < MAX_BRANDS; i++) { if (elf_brand_list[i] == entry) { elf_brand_list[i] = NULL; break; } } if (i == MAX_BRANDS) return (-1); return (0); } int __elfN(brand_inuse)(Elf_Brandinfo *entry) { struct proc *p; int rval = FALSE; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_sysent == entry->sysvec) { rval = TRUE; break; } } sx_sunlock(&allproc_lock); return (rval); } static Elf_Brandinfo * __elfN(get_brandinfo)(struct image_params *imgp, const char *interp, int32_t *osrel) { const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header; Elf_Brandinfo *bi; boolean_t ret; int i; /* * We support four types of branding -- (1) the ELF EI_OSABI field * that SCO added to the ELF spec, (2) FreeBSD 3.x's traditional string * branding w/in the ELF header, (3) path of the `interp_path' * field, and (4) the ".note.ABI-tag" ELF section. */ /* Look for an ".note.ABI-tag" ELF section */ for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi == NULL) continue; if (hdr->e_machine == bi->machine && (bi->flags & (BI_BRAND_NOTE|BI_BRAND_NOTE_MANDATORY)) != 0) { ret = __elfN(check_note)(imgp, bi->brand_note, osrel); if (ret) return (bi); } } /* If the executable has a brand, search for it in the brand list. */ for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi == NULL || bi->flags & BI_BRAND_NOTE_MANDATORY) continue; if (hdr->e_machine == bi->machine && (hdr->e_ident[EI_OSABI] == bi->brand || strncmp((const char *)&hdr->e_ident[OLD_EI_BRAND], bi->compat_3_brand, strlen(bi->compat_3_brand)) == 0)) return (bi); } /* Lacking a known brand, search for a recognized interpreter. */ if (interp != NULL) { for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi == NULL || bi->flags & BI_BRAND_NOTE_MANDATORY) continue; if (hdr->e_machine == bi->machine && strcmp(interp, bi->interp_path) == 0) return (bi); } } /* Lacking a recognized interpreter, try the default brand */ for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi == NULL || bi->flags & BI_BRAND_NOTE_MANDATORY) continue; if (hdr->e_machine == bi->machine && __elfN(fallback_brand) == bi->brand) return (bi); } return (NULL); } static int __elfN(check_header)(const Elf_Ehdr *hdr) { Elf_Brandinfo *bi; int i; if (!IS_ELF(*hdr) || hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || hdr->e_ident[EI_DATA] != ELF_TARG_DATA || hdr->e_ident[EI_VERSION] != EV_CURRENT || hdr->e_phentsize != sizeof(Elf_Phdr) || hdr->e_version != ELF_TARG_VER) return (ENOEXEC); /* * Make sure we have at least one brand for this machine. */ for (i = 0; i < MAX_BRANDS; i++) { bi = elf_brand_list[i]; if (bi != NULL && bi->machine == hdr->e_machine) break; } if (i == MAX_BRANDS) return (ENOEXEC); return (0); } static int __elfN(map_partial)(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t start, vm_offset_t end, vm_prot_t prot) { struct sf_buf *sf; int error; vm_offset_t off; /* * Create the page if it doesn't exist yet. Ignore errors. */ vm_map_lock(map); vm_map_insert(map, NULL, 0, trunc_page(start), round_page(end), VM_PROT_ALL, VM_PROT_ALL, 0); vm_map_unlock(map); /* * Find the page from the underlying object. */ if (object) { sf = vm_imgact_map_page(object, offset); if (sf == NULL) return (KERN_FAILURE); off = offset - trunc_page(offset); error = copyout((caddr_t)sf_buf_kva(sf) + off, (caddr_t)start, end - start); vm_imgact_unmap_page(sf); if (error) { return (KERN_FAILURE); } } return (KERN_SUCCESS); } static int __elfN(map_insert)(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t start, vm_offset_t end, vm_prot_t prot, int cow) { struct sf_buf *sf; vm_offset_t off; vm_size_t sz; int error, rv; if (start != trunc_page(start)) { rv = __elfN(map_partial)(map, object, offset, start, round_page(start), prot); if (rv) return (rv); offset += round_page(start) - start; start = round_page(start); } if (end != round_page(end)) { rv = __elfN(map_partial)(map, object, offset + trunc_page(end) - start, trunc_page(end), end, prot); if (rv) return (rv); end = trunc_page(end); } if (end > start) { if (offset & PAGE_MASK) { /* * The mapping is not page aligned. This means we have * to copy the data. Sigh. */ rv = vm_map_find(map, NULL, 0, &start, end - start, FALSE, prot | VM_PROT_WRITE, VM_PROT_ALL, 0); if (rv) return (rv); if (object == NULL) return (KERN_SUCCESS); for (; start < end; start += sz) { sf = vm_imgact_map_page(object, offset); if (sf == NULL) return (KERN_FAILURE); off = offset - trunc_page(offset); sz = end - start; if (sz > PAGE_SIZE - off) sz = PAGE_SIZE - off; error = copyout((caddr_t)sf_buf_kva(sf) + off, (caddr_t)start, sz); vm_imgact_unmap_page(sf); if (error) { return (KERN_FAILURE); } offset += sz; } rv = KERN_SUCCESS; } else { vm_object_reference(object); vm_map_lock(map); rv = vm_map_insert(map, object, offset, start, end, prot, VM_PROT_ALL, cow); vm_map_unlock(map); if (rv != KERN_SUCCESS) vm_object_deallocate(object); } return (rv); } else { return (KERN_SUCCESS); } } static int __elfN(load_section)(struct image_params *imgp, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot, size_t pagesize) { struct sf_buf *sf; size_t map_len; vm_map_t map; vm_object_t object; vm_offset_t map_addr; int error, rv, cow; size_t copy_len; vm_offset_t file_addr; /* * It's necessary to fail if the filsz + offset taken from the * header is greater than the actual file pager object's size. * If we were to allow this, then the vm_map_find() below would * walk right off the end of the file object and into the ether. * * While I'm here, might as well check for something else that * is invalid: filsz cannot be greater than memsz. */ if ((off_t)filsz + offset > imgp->attr->va_size || filsz > memsz) { uprintf("elf_load_section: truncated ELF file\n"); return (ENOEXEC); } object = imgp->object; map = &imgp->proc->p_vmspace->vm_map; map_addr = trunc_page_ps((vm_offset_t)vmaddr, pagesize); file_addr = trunc_page_ps(offset, pagesize); /* * We have two choices. We can either clear the data in the last page * of an oversized mapping, or we can start the anon mapping a page * early and copy the initialized data into that first page. We * choose the second.. */ if (memsz > filsz) map_len = trunc_page_ps(offset + filsz, pagesize) - file_addr; else map_len = round_page_ps(offset + filsz, pagesize) - file_addr; if (map_len != 0) { /* cow flags: don't dump readonly sections in core */ cow = MAP_COPY_ON_WRITE | MAP_PREFAULT | (prot & VM_PROT_WRITE ? 0 : MAP_DISABLE_COREDUMP); rv = __elfN(map_insert)(map, object, file_addr, /* file offset */ map_addr, /* virtual start */ map_addr + map_len,/* virtual end */ prot, cow); if (rv != KERN_SUCCESS) return (EINVAL); /* we can stop now if we've covered it all */ if (memsz == filsz) { return (0); } } /* * We have to get the remaining bit of the file into the first part * of the oversized map segment. This is normally because the .data * segment in the file is extended to provide bss. It's a neat idea * to try and save a page, but it's a pain in the behind to implement. */ copy_len = (offset + filsz) - trunc_page_ps(offset + filsz, pagesize); map_addr = trunc_page_ps((vm_offset_t)vmaddr + filsz, pagesize); map_len = round_page_ps((vm_offset_t)vmaddr + memsz, pagesize) - map_addr; /* This had damn well better be true! */ if (map_len != 0) { rv = __elfN(map_insert)(map, NULL, 0, map_addr, map_addr + map_len, VM_PROT_ALL, 0); if (rv != KERN_SUCCESS) { return (EINVAL); } } if (copy_len != 0) { vm_offset_t off; sf = vm_imgact_map_page(object, offset + filsz); if (sf == NULL) return (EIO); /* send the page fragment to user space */ off = trunc_page_ps(offset + filsz, pagesize) - trunc_page(offset + filsz); error = copyout((caddr_t)sf_buf_kva(sf) + off, (caddr_t)map_addr, copy_len); vm_imgact_unmap_page(sf); if (error) { return (error); } } /* * set it to the specified protection. * XXX had better undo the damage from pasting over the cracks here! */ vm_map_protect(map, trunc_page(map_addr), round_page(map_addr + map_len), prot, FALSE); return (0); } /* * Load the file "file" into memory. It may be either a shared object * or an executable. * * The "addr" reference parameter is in/out. On entry, it specifies * the address where a shared object should be loaded. If the file is * an executable, this value is ignored. On exit, "addr" specifies * where the file was actually loaded. * * The "entry" reference parameter is out only. On exit, it specifies * the entry point for the loaded file. */ static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr, u_long *entry, size_t pagesize) { struct { struct nameidata nd; struct vattr attr; struct image_params image_params; } *tempdata; const Elf_Ehdr *hdr = NULL; const Elf_Phdr *phdr = NULL; struct nameidata *nd; struct vattr *attr; struct image_params *imgp; vm_prot_t prot; u_long rbase; u_long base_addr = 0; int vfslocked, error, i, numsegs; #ifdef CAPABILITY_MODE /* * XXXJA: This check can go away once we are sufficiently confident * that the checks in namei() are correct. */ if (IN_CAPABILITY_MODE(curthread)) return (ECAPMODE); #endif tempdata = malloc(sizeof(*tempdata), M_TEMP, M_WAITOK); nd = &tempdata->nd; attr = &tempdata->attr; imgp = &tempdata->image_params; /* * Initialize part of the common data */ imgp->proc = p; imgp->attr = attr; imgp->firstpage = NULL; imgp->image_header = NULL; imgp->object = NULL; imgp->execlabel = NULL; NDINIT(nd, LOOKUP, MPSAFE|LOCKLEAF|FOLLOW, UIO_SYSSPACE, file, curthread); vfslocked = 0; if ((error = namei(nd)) != 0) { nd->ni_vp = NULL; goto fail; } vfslocked = NDHASGIANT(nd); NDFREE(nd, NDF_ONLY_PNBUF); imgp->vp = nd->ni_vp; /* * Check permissions, modes, uid, etc on the file, and "open" it. */ error = exec_check_permissions(imgp); if (error) goto fail; error = exec_map_first_page(imgp); if (error) goto fail; /* * Also make certain that the interpreter stays the same, so set * its VV_TEXT flag, too. */ nd->ni_vp->v_vflag |= VV_TEXT; imgp->object = nd->ni_vp->v_object; hdr = (const Elf_Ehdr *)imgp->image_header; if ((error = __elfN(check_header)(hdr)) != 0) goto fail; if (hdr->e_type == ET_DYN) rbase = *addr; else if (hdr->e_type == ET_EXEC) rbase = 0; else { error = ENOEXEC; goto fail; } /* Only support headers that fit within first page for now */ /* (multiplication of two Elf_Half fields will not overflow) */ if ((hdr->e_phoff > PAGE_SIZE) || (hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE - hdr->e_phoff) { error = ENOEXEC; goto fail; } phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff); if (!aligned(phdr, Elf_Addr)) { error = ENOEXEC; goto fail; } for (i = 0, numsegs = 0; i < hdr->e_phnum; i++) { if (phdr[i].p_type == PT_LOAD && phdr[i].p_memsz != 0) { /* Loadable segment */ prot = __elfN(trans_prot)(phdr[i].p_flags); error = __elfN(load_section)(imgp, phdr[i].p_offset, (caddr_t)(uintptr_t)phdr[i].p_vaddr + rbase, phdr[i].p_memsz, phdr[i].p_filesz, prot, pagesize); if (error != 0) goto fail; /* * Establish the base address if this is the * first segment. */ if (numsegs == 0) base_addr = trunc_page(phdr[i].p_vaddr + rbase); numsegs++; } } *addr = base_addr; *entry = (unsigned long)hdr->e_entry + rbase; fail: if (imgp->firstpage) exec_unmap_first_page(imgp); if (nd->ni_vp) vput(nd->ni_vp); VFS_UNLOCK_GIANT(vfslocked); free(tempdata, M_TEMP); return (error); } static int __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp) { const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header; const Elf_Phdr *phdr; Elf_Auxargs *elf_auxargs; struct vmspace *vmspace; vm_prot_t prot; u_long text_size = 0, data_size = 0, total_size = 0; u_long text_addr = 0, data_addr = 0; u_long seg_size, seg_addr; u_long addr, baddr, et_dyn_addr, entry = 0, proghdr = 0; int32_t osrel = 0; int error = 0, i, n; const char *interp = NULL, *newinterp = NULL; Elf_Brandinfo *brand_info; char *path; struct sysentvec *sv; /* * Do we have a valid ELF header ? * * Only allow ET_EXEC & ET_DYN here, reject ET_DYN later * if particular brand doesn't support it. */ if (__elfN(check_header)(hdr) != 0 || (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN)) return (-1); /* * From here on down, we return an errno, not -1, as we've * detected an ELF file. */ if ((hdr->e_phoff > PAGE_SIZE) || (hdr->e_phoff + hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE) { /* Only support headers in first page for now */ return (ENOEXEC); } phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff); if (!aligned(phdr, Elf_Addr)) return (ENOEXEC); n = 0; baddr = 0; for (i = 0; i < hdr->e_phnum; i++) { switch (phdr[i].p_type) { case PT_LOAD: if (n == 0) baddr = phdr[i].p_vaddr; n++; break; case PT_INTERP: /* Path to interpreter */ if (phdr[i].p_filesz > MAXPATHLEN || phdr[i].p_offset + phdr[i].p_filesz > PAGE_SIZE) return (ENOEXEC); interp = imgp->image_header + phdr[i].p_offset; break; case PT_GNU_STACK: if (__elfN(nxstack)) imgp->stack_prot = __elfN(trans_prot)(phdr[i].p_flags); break; } } brand_info = __elfN(get_brandinfo)(imgp, interp, &osrel); if (brand_info == NULL) { uprintf("ELF binary type \"%u\" not known.\n", hdr->e_ident[EI_OSABI]); return (ENOEXEC); } if (hdr->e_type == ET_DYN) { if ((brand_info->flags & BI_CAN_EXEC_DYN) == 0) return (ENOEXEC); /* * Honour the base load address from the dso if it is * non-zero for some reason. */ if (baddr == 0) et_dyn_addr = ET_DYN_LOAD_ADDR; else et_dyn_addr = 0; } else et_dyn_addr = 0; sv = brand_info->sysvec; if (interp != NULL && brand_info->interp_newpath != NULL) newinterp = brand_info->interp_newpath; /* * Avoid a possible deadlock if the current address space is destroyed * and that address space maps the locked vnode. In the common case, * the locked vnode's v_usecount is decremented but remains greater * than zero. Consequently, the vnode lock is not needed by vrele(). * However, in cases where the vnode lock is external, such as nullfs, * v_usecount may become zero. * * The VV_TEXT flag prevents modifications to the executable while * the vnode is unlocked. */ VOP_UNLOCK(imgp->vp, 0); error = exec_new_vmspace(imgp, sv); imgp->proc->p_sysent = sv; vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); if (error) return (error); for (i = 0; i < hdr->e_phnum; i++) { switch (phdr[i].p_type) { case PT_LOAD: /* Loadable segment */ if (phdr[i].p_memsz == 0) break; prot = __elfN(trans_prot)(phdr[i].p_flags); #if defined(__ia64__) && __ELF_WORD_SIZE == 32 && defined(IA32_ME_HARDER) /* * Some x86 binaries assume read == executable, * notably the M3 runtime and therefore cvsup */ if (prot & VM_PROT_READ) prot |= VM_PROT_EXECUTE; #endif error = __elfN(load_section)(imgp, phdr[i].p_offset, (caddr_t)(uintptr_t)phdr[i].p_vaddr + et_dyn_addr, phdr[i].p_memsz, phdr[i].p_filesz, prot, sv->sv_pagesize); if (error != 0) return (error); /* * If this segment contains the program headers, * remember their virtual address for the AT_PHDR * aux entry. Static binaries don't usually include * a PT_PHDR entry. */ if (phdr[i].p_offset == 0 && hdr->e_phoff + hdr->e_phnum * hdr->e_phentsize <= phdr[i].p_filesz) proghdr = phdr[i].p_vaddr + hdr->e_phoff + et_dyn_addr; seg_addr = trunc_page(phdr[i].p_vaddr + et_dyn_addr); seg_size = round_page(phdr[i].p_memsz + phdr[i].p_vaddr + et_dyn_addr - seg_addr); /* * Make the largest executable segment the official * text segment and all others data. * * Note that obreak() assumes that data_addr + * data_size == end of data load area, and the ELF * file format expects segments to be sorted by * address. If multiple data segments exist, the * last one will be used. */ if (phdr[i].p_flags & PF_X && text_size < seg_size) { text_size = seg_size; text_addr = seg_addr; } else { data_size = seg_size; data_addr = seg_addr; } total_size += seg_size; break; case PT_PHDR: /* Program header table info */ proghdr = phdr[i].p_vaddr + et_dyn_addr; break; default: break; } } if (data_addr == 0 && data_size == 0) { data_addr = text_addr; data_size = text_size; } entry = (u_long)hdr->e_entry + et_dyn_addr; /* * Check limits. It should be safe to check the * limits after loading the segments since we do * not actually fault in all the segments pages. */ PROC_LOCK(imgp->proc); if (data_size > lim_cur(imgp->proc, RLIMIT_DATA) || text_size > maxtsiz || total_size > lim_cur(imgp->proc, RLIMIT_VMEM) || racct_set(imgp->proc, RACCT_DATA, data_size) != 0 || racct_set(imgp->proc, RACCT_VMEM, total_size) != 0) { PROC_UNLOCK(imgp->proc); return (ENOMEM); } vmspace = imgp->proc->p_vmspace; vmspace->vm_tsize = text_size >> PAGE_SHIFT; vmspace->vm_taddr = (caddr_t)(uintptr_t)text_addr; vmspace->vm_dsize = data_size >> PAGE_SHIFT; vmspace->vm_daddr = (caddr_t)(uintptr_t)data_addr; /* * We load the dynamic linker where a userland call * to mmap(0, ...) would put it. The rationale behind this * calculation is that it leaves room for the heap to grow to * its maximum allowed size. */ addr = round_page((vm_offset_t)vmspace->vm_daddr + lim_max(imgp->proc, RLIMIT_DATA)); PROC_UNLOCK(imgp->proc); imgp->entry_addr = entry; if (interp != NULL) { int have_interp = FALSE; VOP_UNLOCK(imgp->vp, 0); if (brand_info->emul_path != NULL && brand_info->emul_path[0] != '\0') { path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); snprintf(path, MAXPATHLEN, "%s%s", brand_info->emul_path, interp); error = __elfN(load_file)(imgp->proc, path, &addr, &imgp->entry_addr, sv->sv_pagesize); free(path, M_TEMP); if (error == 0) have_interp = TRUE; } if (!have_interp && newinterp != NULL) { error = __elfN(load_file)(imgp->proc, newinterp, &addr, &imgp->entry_addr, sv->sv_pagesize); if (error == 0) have_interp = TRUE; } if (!have_interp) { error = __elfN(load_file)(imgp->proc, interp, &addr, &imgp->entry_addr, sv->sv_pagesize); } vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY); if (error != 0) { uprintf("ELF interpreter %s not found\n", interp); return (error); } } else addr = et_dyn_addr; /* * Construct auxargs table (used by the fixup routine) */ elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_WAITOK); elf_auxargs->execfd = -1; elf_auxargs->phdr = proghdr; elf_auxargs->phent = hdr->e_phentsize; elf_auxargs->phnum = hdr->e_phnum; elf_auxargs->pagesz = PAGE_SIZE; elf_auxargs->base = addr; elf_auxargs->flags = 0; elf_auxargs->entry = entry; imgp->auxargs = elf_auxargs; imgp->interpreted = 0; imgp->reloc_base = addr; imgp->proc->p_osrel = osrel; return (error); } #define suword __CONCAT(suword, __ELF_WORD_SIZE) int __elfN(freebsd_fixup)(register_t **stack_base, struct image_params *imgp) { Elf_Auxargs *args = (Elf_Auxargs *)imgp->auxargs; Elf_Addr *base; Elf_Addr *pos; base = (Elf_Addr *)*stack_base; pos = base + (imgp->args->argc + imgp->args->envc + 2); if (args->execfd != -1) AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); AUXARGS_ENTRY(pos, AT_PHENT, args->phent); AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); AUXARGS_ENTRY(pos, AT_BASE, args->base); if (imgp->execpathp != 0) AUXARGS_ENTRY(pos, AT_EXECPATH, imgp->execpathp); AUXARGS_ENTRY(pos, AT_OSRELDATE, osreldate); if (imgp->canary != 0) { AUXARGS_ENTRY(pos, AT_CANARY, imgp->canary); AUXARGS_ENTRY(pos, AT_CANARYLEN, imgp->canarylen); } AUXARGS_ENTRY(pos, AT_NCPUS, mp_ncpus); if (imgp->pagesizes != 0) { AUXARGS_ENTRY(pos, AT_PAGESIZES, imgp->pagesizes); AUXARGS_ENTRY(pos, AT_PAGESIZESLEN, imgp->pagesizeslen); } + if (imgp->sysent->sv_timekeep_base != 0) { + AUXARGS_ENTRY(pos, AT_TIMEKEEP, + imgp->sysent->sv_timekeep_base); + } AUXARGS_ENTRY(pos, AT_STACKPROT, imgp->sysent->sv_shared_page_obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot : imgp->sysent->sv_stackprot); AUXARGS_ENTRY(pos, AT_NULL, 0); free(imgp->auxargs, M_TEMP); imgp->auxargs = NULL; base--; suword(base, (long)imgp->args->argc); *stack_base = (register_t *)base; return (0); } /* * Code for generating ELF core dumps. */ typedef void (*segment_callback)(vm_map_entry_t, void *); /* Closure for cb_put_phdr(). */ struct phdr_closure { Elf_Phdr *phdr; /* Program header to fill in */ Elf_Off offset; /* Offset of segment in core file */ }; /* Closure for cb_size_segment(). */ struct sseg_closure { int count; /* Count of writable segments. */ size_t size; /* Total size of all writable segments. */ }; static void cb_put_phdr(vm_map_entry_t, void *); static void cb_size_segment(vm_map_entry_t, void *); static void each_writable_segment(struct thread *, segment_callback, void *); static int __elfN(corehdr)(struct thread *, struct vnode *, struct ucred *, int, void *, size_t, gzFile); static void __elfN(puthdr)(struct thread *, void *, size_t *, int); static void __elfN(putnote)(void *, size_t *, const char *, int, const void *, size_t); #ifdef COMPRESS_USER_CORES extern int compress_user_cores; extern int compress_user_cores_gzlevel; #endif static int core_output(struct vnode *vp, void *base, size_t len, off_t offset, struct ucred *active_cred, struct ucred *file_cred, struct thread *td, char *core_buf, gzFile gzfile) { int error; if (gzfile) { #ifdef COMPRESS_USER_CORES error = compress_core(gzfile, base, core_buf, len, td); #else panic("shouldn't be here"); #endif } else { error = vn_rdwr_inchunks(UIO_WRITE, vp, base, len, offset, UIO_USERSPACE, IO_UNIT | IO_DIRECT, active_cred, file_cred, NULL, td); } return (error); } int __elfN(coredump)(struct thread *td, struct vnode *vp, off_t limit, int flags) { struct ucred *cred = td->td_ucred; int error = 0; struct sseg_closure seginfo; void *hdr; size_t hdrsize; gzFile gzfile = Z_NULL; char *core_buf = NULL; #ifdef COMPRESS_USER_CORES char gzopen_flags[8]; char *p; int doing_compress = flags & IMGACT_CORE_COMPRESS; #endif hdr = NULL; #ifdef COMPRESS_USER_CORES if (doing_compress) { p = gzopen_flags; *p++ = 'w'; if (compress_user_cores_gzlevel >= 0 && compress_user_cores_gzlevel <= 9) *p++ = '0' + compress_user_cores_gzlevel; *p = 0; gzfile = gz_open("", gzopen_flags, vp); if (gzfile == Z_NULL) { error = EFAULT; goto done; } core_buf = malloc(CORE_BUF_SIZE, M_TEMP, M_WAITOK | M_ZERO); if (!core_buf) { error = ENOMEM; goto done; } } #endif /* Size the program segments. */ seginfo.count = 0; seginfo.size = 0; each_writable_segment(td, cb_size_segment, &seginfo); /* * Calculate the size of the core file header area by making * a dry run of generating it. Nothing is written, but the * size is calculated. */ hdrsize = 0; __elfN(puthdr)(td, (void *)NULL, &hdrsize, seginfo.count); #ifdef RACCT PROC_LOCK(td->td_proc); error = racct_add(td->td_proc, RACCT_CORE, hdrsize + seginfo.size); PROC_UNLOCK(td->td_proc); if (error != 0) { error = EFAULT; goto done; } #endif if (hdrsize + seginfo.size >= limit) { error = EFAULT; goto done; } /* * Allocate memory for building the header, fill it up, * and write it out. */ hdr = malloc(hdrsize, M_TEMP, M_WAITOK); if (hdr == NULL) { error = EINVAL; goto done; } error = __elfN(corehdr)(td, vp, cred, seginfo.count, hdr, hdrsize, gzfile); /* Write the contents of all of the writable segments. */ if (error == 0) { Elf_Phdr *php; off_t offset; int i; php = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr)) + 1; offset = hdrsize; for (i = 0; i < seginfo.count; i++) { error = core_output(vp, (caddr_t)(uintptr_t)php->p_vaddr, php->p_filesz, offset, cred, NOCRED, curthread, core_buf, gzfile); if (error != 0) break; offset += php->p_filesz; php++; } } if (error) { log(LOG_WARNING, "Failed to write core file for process %s (error %d)\n", curproc->p_comm, error); } done: #ifdef COMPRESS_USER_CORES if (core_buf) free(core_buf, M_TEMP); if (gzfile) gzclose(gzfile); #endif free(hdr, M_TEMP); return (error); } /* * A callback for each_writable_segment() to write out the segment's * program header entry. */ static void cb_put_phdr(entry, closure) vm_map_entry_t entry; void *closure; { struct phdr_closure *phc = (struct phdr_closure *)closure; Elf_Phdr *phdr = phc->phdr; phc->offset = round_page(phc->offset); phdr->p_type = PT_LOAD; phdr->p_offset = phc->offset; phdr->p_vaddr = entry->start; phdr->p_paddr = 0; phdr->p_filesz = phdr->p_memsz = entry->end - entry->start; phdr->p_align = PAGE_SIZE; phdr->p_flags = __elfN(untrans_prot)(entry->protection); phc->offset += phdr->p_filesz; phc->phdr++; } /* * A callback for each_writable_segment() to gather information about * the number of segments and their total size. */ static void cb_size_segment(entry, closure) vm_map_entry_t entry; void *closure; { struct sseg_closure *ssc = (struct sseg_closure *)closure; ssc->count++; ssc->size += entry->end - entry->start; } /* * For each writable segment in the process's memory map, call the given * function with a pointer to the map entry and some arbitrary * caller-supplied data. */ static void each_writable_segment(td, func, closure) struct thread *td; segment_callback func; void *closure; { struct proc *p = td->td_proc; vm_map_t map = &p->p_vmspace->vm_map; vm_map_entry_t entry; vm_object_t backing_object, object; boolean_t ignore_entry; vm_map_lock_read(map); for (entry = map->header.next; entry != &map->header; entry = entry->next) { /* * Don't dump inaccessible mappings, deal with legacy * coredump mode. * * Note that read-only segments related to the elf binary * are marked MAP_ENTRY_NOCOREDUMP now so we no longer * need to arbitrarily ignore such segments. */ if (elf_legacy_coredump) { if ((entry->protection & VM_PROT_RW) != VM_PROT_RW) continue; } else { if ((entry->protection & VM_PROT_ALL) == 0) continue; } /* * Dont include memory segment in the coredump if * MAP_NOCORE is set in mmap(2) or MADV_NOCORE in * madvise(2). Do not dump submaps (i.e. parts of the * kernel map). */ if (entry->eflags & (MAP_ENTRY_NOCOREDUMP|MAP_ENTRY_IS_SUB_MAP)) continue; if ((object = entry->object.vm_object) == NULL) continue; /* Ignore memory-mapped devices and such things. */ VM_OBJECT_LOCK(object); while ((backing_object = object->backing_object) != NULL) { VM_OBJECT_LOCK(backing_object); VM_OBJECT_UNLOCK(object); object = backing_object; } ignore_entry = object->type != OBJT_DEFAULT && object->type != OBJT_SWAP && object->type != OBJT_VNODE; VM_OBJECT_UNLOCK(object); if (ignore_entry) continue; (*func)(entry, closure); } vm_map_unlock_read(map); } /* * Write the core file header to the file, including padding up to * the page boundary. */ static int __elfN(corehdr)(td, vp, cred, numsegs, hdr, hdrsize, gzfile) struct thread *td; struct vnode *vp; struct ucred *cred; int numsegs; size_t hdrsize; void *hdr; gzFile gzfile; { size_t off; /* Fill in the header. */ bzero(hdr, hdrsize); off = 0; __elfN(puthdr)(td, hdr, &off, numsegs); if (!gzfile) { /* Write it to the core file. */ return (vn_rdwr_inchunks(UIO_WRITE, vp, hdr, hdrsize, (off_t)0, UIO_SYSSPACE, IO_UNIT | IO_DIRECT, cred, NOCRED, NULL, td)); } else { #ifdef COMPRESS_USER_CORES if (gzwrite(gzfile, hdr, hdrsize) != hdrsize) { log(LOG_WARNING, "Failed to compress core file header for process" " %s.\n", curproc->p_comm); return (EFAULT); } else { return (0); } #else panic("shouldn't be here"); #endif } } #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 #include typedef struct prstatus32 elf_prstatus_t; typedef struct prpsinfo32 elf_prpsinfo_t; typedef struct fpreg32 elf_prfpregset_t; typedef struct fpreg32 elf_fpregset_t; typedef struct reg32 elf_gregset_t; typedef struct thrmisc32 elf_thrmisc_t; #else typedef prstatus_t elf_prstatus_t; typedef prpsinfo_t elf_prpsinfo_t; typedef prfpregset_t elf_prfpregset_t; typedef prfpregset_t elf_fpregset_t; typedef gregset_t elf_gregset_t; typedef thrmisc_t elf_thrmisc_t; #endif static void __elfN(puthdr)(struct thread *td, void *dst, size_t *off, int numsegs) { struct { elf_prstatus_t status; elf_prfpregset_t fpregset; elf_prpsinfo_t psinfo; elf_thrmisc_t thrmisc; } *tempdata; elf_prstatus_t *status; elf_prfpregset_t *fpregset; elf_prpsinfo_t *psinfo; elf_thrmisc_t *thrmisc; struct proc *p; struct thread *thr; size_t ehoff, noteoff, notesz, phoff; p = td->td_proc; ehoff = *off; *off += sizeof(Elf_Ehdr); phoff = *off; *off += (numsegs + 1) * sizeof(Elf_Phdr); noteoff = *off; /* * Don't allocate space for the notes if we're just calculating * the size of the header. We also don't collect the data. */ if (dst != NULL) { tempdata = malloc(sizeof(*tempdata), M_TEMP, M_ZERO|M_WAITOK); status = &tempdata->status; fpregset = &tempdata->fpregset; psinfo = &tempdata->psinfo; thrmisc = &tempdata->thrmisc; } else { tempdata = NULL; status = NULL; fpregset = NULL; psinfo = NULL; thrmisc = NULL; } if (dst != NULL) { psinfo->pr_version = PRPSINFO_VERSION; psinfo->pr_psinfosz = sizeof(elf_prpsinfo_t); strlcpy(psinfo->pr_fname, p->p_comm, sizeof(psinfo->pr_fname)); /* * XXX - We don't fill in the command line arguments properly * yet. */ strlcpy(psinfo->pr_psargs, p->p_comm, sizeof(psinfo->pr_psargs)); } __elfN(putnote)(dst, off, "FreeBSD", NT_PRPSINFO, psinfo, sizeof *psinfo); /* * To have the debugger select the right thread (LWP) as the initial * thread, we dump the state of the thread passed to us in td first. * This is the thread that causes the core dump and thus likely to * be the right thread one wants to have selected in the debugger. */ thr = td; while (thr != NULL) { if (dst != NULL) { status->pr_version = PRSTATUS_VERSION; status->pr_statussz = sizeof(elf_prstatus_t); status->pr_gregsetsz = sizeof(elf_gregset_t); status->pr_fpregsetsz = sizeof(elf_fpregset_t); status->pr_osreldate = osreldate; status->pr_cursig = p->p_sig; status->pr_pid = thr->td_tid; #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 fill_regs32(thr, &status->pr_reg); fill_fpregs32(thr, fpregset); #else fill_regs(thr, &status->pr_reg); fill_fpregs(thr, fpregset); #endif memset(&thrmisc->_pad, 0, sizeof (thrmisc->_pad)); strcpy(thrmisc->pr_tname, thr->td_name); } __elfN(putnote)(dst, off, "FreeBSD", NT_PRSTATUS, status, sizeof *status); __elfN(putnote)(dst, off, "FreeBSD", NT_FPREGSET, fpregset, sizeof *fpregset); __elfN(putnote)(dst, off, "FreeBSD", NT_THRMISC, thrmisc, sizeof *thrmisc); /* * Allow for MD specific notes, as well as any MD * specific preparations for writing MI notes. */ __elfN(dump_thread)(thr, dst, off); thr = (thr == td) ? TAILQ_FIRST(&p->p_threads) : TAILQ_NEXT(thr, td_plist); if (thr == td) thr = TAILQ_NEXT(thr, td_plist); } notesz = *off - noteoff; if (dst != NULL) free(tempdata, M_TEMP); /* Align up to a page boundary for the program segments. */ *off = round_page(*off); if (dst != NULL) { Elf_Ehdr *ehdr; Elf_Phdr *phdr; struct phdr_closure phc; /* * Fill in the ELF header. */ ehdr = (Elf_Ehdr *)((char *)dst + ehoff); ehdr->e_ident[EI_MAG0] = ELFMAG0; ehdr->e_ident[EI_MAG1] = ELFMAG1; ehdr->e_ident[EI_MAG2] = ELFMAG2; ehdr->e_ident[EI_MAG3] = ELFMAG3; ehdr->e_ident[EI_CLASS] = ELF_CLASS; ehdr->e_ident[EI_DATA] = ELF_DATA; ehdr->e_ident[EI_VERSION] = EV_CURRENT; ehdr->e_ident[EI_OSABI] = ELFOSABI_FREEBSD; ehdr->e_ident[EI_ABIVERSION] = 0; ehdr->e_ident[EI_PAD] = 0; ehdr->e_type = ET_CORE; #if defined(COMPAT_FREEBSD32) && __ELF_WORD_SIZE == 32 ehdr->e_machine = ELF_ARCH32; #else ehdr->e_machine = ELF_ARCH; #endif ehdr->e_version = EV_CURRENT; ehdr->e_entry = 0; ehdr->e_phoff = phoff; ehdr->e_flags = 0; ehdr->e_ehsize = sizeof(Elf_Ehdr); ehdr->e_phentsize = sizeof(Elf_Phdr); ehdr->e_phnum = numsegs + 1; ehdr->e_shentsize = sizeof(Elf_Shdr); ehdr->e_shnum = 0; ehdr->e_shstrndx = SHN_UNDEF; /* * Fill in the program header entries. */ phdr = (Elf_Phdr *)((char *)dst + phoff); /* The note segement. */ phdr->p_type = PT_NOTE; phdr->p_offset = noteoff; phdr->p_vaddr = 0; phdr->p_paddr = 0; phdr->p_filesz = notesz; phdr->p_memsz = 0; phdr->p_flags = 0; phdr->p_align = 0; phdr++; /* All the writable segments from the program. */ phc.phdr = phdr; phc.offset = *off; each_writable_segment(td, cb_put_phdr, &phc); } } static void __elfN(putnote)(void *dst, size_t *off, const char *name, int type, const void *desc, size_t descsz) { Elf_Note note; note.n_namesz = strlen(name) + 1; note.n_descsz = descsz; note.n_type = type; if (dst != NULL) bcopy(¬e, (char *)dst + *off, sizeof note); *off += sizeof note; if (dst != NULL) bcopy(name, (char *)dst + *off, note.n_namesz); *off += roundup2(note.n_namesz, sizeof(Elf_Size)); if (dst != NULL) bcopy(desc, (char *)dst + *off, note.n_descsz); *off += roundup2(note.n_descsz, sizeof(Elf_Size)); } static boolean_t __elfN(parse_notes)(struct image_params *imgp, Elf_Brandnote *checknote, int32_t *osrel, const Elf_Phdr *pnote) { const Elf_Note *note, *note0, *note_end; const char *note_name; int i; if (pnote == NULL || pnote->p_offset >= PAGE_SIZE || pnote->p_offset + pnote->p_filesz >= PAGE_SIZE) return (FALSE); note = note0 = (const Elf_Note *)(imgp->image_header + pnote->p_offset); note_end = (const Elf_Note *)(imgp->image_header + pnote->p_offset + pnote->p_filesz); for (i = 0; i < 100 && note >= note0 && note < note_end; i++) { if (!aligned(note, Elf32_Addr)) return (FALSE); if (note->n_namesz != checknote->hdr.n_namesz || note->n_descsz != checknote->hdr.n_descsz || note->n_type != checknote->hdr.n_type) goto nextnote; note_name = (const char *)(note + 1); if (strncmp(checknote->vendor, note_name, checknote->hdr.n_namesz) != 0) goto nextnote; /* * Fetch the osreldate for binary * from the ELF OSABI-note if necessary. */ if ((checknote->flags & BN_TRANSLATE_OSREL) != 0 && checknote->trans_osrel != NULL) return (checknote->trans_osrel(note, osrel)); return (TRUE); nextnote: note = (const Elf_Note *)((const char *)(note + 1) + roundup2(note->n_namesz, sizeof(Elf32_Addr)) + roundup2(note->n_descsz, sizeof(Elf32_Addr))); } return (FALSE); } /* * Try to find the appropriate ABI-note section for checknote, * fetch the osreldate for binary from the ELF OSABI-note. Only the * first page of the image is searched, the same as for headers. */ static boolean_t __elfN(check_note)(struct image_params *imgp, Elf_Brandnote *checknote, int32_t *osrel) { const Elf_Phdr *phdr; const Elf_Ehdr *hdr; int i; hdr = (const Elf_Ehdr *)imgp->image_header; phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff); for (i = 0; i < hdr->e_phnum; i++) { if (phdr[i].p_type == PT_NOTE && __elfN(parse_notes)(imgp, checknote, osrel, &phdr[i])) return (TRUE); } return (FALSE); } /* * Tell kern_execve.c about it, with a little help from the linker. */ static struct execsw __elfN(execsw) = { __CONCAT(exec_, __elfN(imgact)), __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) }; EXEC_SET(__CONCAT(elf, __ELF_WORD_SIZE), __elfN(execsw)); #ifdef COMPRESS_USER_CORES /* * Compress and write out a core segment for a user process. * * 'inbuf' is the starting address of a VM segment in the process' address * space that is to be compressed and written out to the core file. 'dest_buf' * is a buffer in the kernel's address space. The segment is copied from * 'inbuf' to 'dest_buf' first before being processed by the compression * routine gzwrite(). This copying is necessary because the content of the VM * segment may change between the compression pass and the crc-computation pass * in gzwrite(). This is because realtime threads may preempt the UNIX kernel. */ static int compress_core (gzFile file, char *inbuf, char *dest_buf, unsigned int len, struct thread *td) { int len_compressed; int error = 0; unsigned int chunk_len; while (len) { chunk_len = (len > CORE_BUF_SIZE) ? CORE_BUF_SIZE : len; copyin(inbuf, dest_buf, chunk_len); len_compressed = gzwrite(file, dest_buf, chunk_len); EVENTHANDLER_INVOKE(app_coredump_progress, td, len_compressed); if ((unsigned int)len_compressed != chunk_len) { log(LOG_WARNING, "compress_core: length mismatch (0x%x returned, " "0x%x expected)\n", len_compressed, chunk_len); EVENTHANDLER_INVOKE(app_coredump_error, td, "compress_core: length mismatch %x -> %x", chunk_len, len_compressed); error = EFAULT; break; } inbuf += chunk_len; len -= chunk_len; maybe_yield(); } return (error); } #endif /* COMPRESS_USER_CORES */ static vm_prot_t __elfN(trans_prot)(Elf_Word flags) { vm_prot_t prot; prot = 0; if (flags & PF_X) prot |= VM_PROT_EXECUTE; if (flags & PF_W) prot |= VM_PROT_WRITE; if (flags & PF_R) prot |= VM_PROT_READ; #if __ELF_WORD_SIZE == 32 #if defined(__amd64__) || defined(__ia64__) if (i386_read_exec && (flags & PF_R)) prot |= VM_PROT_EXECUTE; #endif #endif return (prot); } static Elf_Word __elfN(untrans_prot)(vm_prot_t prot) { Elf_Word flags; flags = 0; if (prot & VM_PROT_EXECUTE) flags |= PF_X; if (prot & VM_PROT_READ) flags |= PF_R; if (prot & VM_PROT_WRITE) flags |= PF_W; return (flags); } Index: head/sys/kern/kern_exec.c =================================================================== --- head/sys/kern/kern_exec.c (revision 237432) +++ head/sys/kern/kern_exec.c (revision 237433) @@ -1,1623 +1,1716 @@ /*- * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" +#include "opt_compat.h" #include "opt_hwpmc_hooks.h" #include "opt_kdtrace.h" #include "opt_ktrace.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include #ifdef KDTRACE_HOOKS #include dtrace_execexit_func_t dtrace_fasttrap_exec; #endif SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE(proc, kernel, , exec, exec); SDT_PROBE_ARGTYPE(proc, kernel, , exec, 0, "char *"); SDT_PROBE_DEFINE(proc, kernel, , exec_failure, exec-failure); SDT_PROBE_ARGTYPE(proc, kernel, , exec_failure, 0, "int"); SDT_PROBE_DEFINE(proc, kernel, , exec_success, exec-success); SDT_PROBE_ARGTYPE(proc, kernel, , exec_success, 0, "char *"); MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments"); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS); static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS); static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS); static int do_execve(struct thread *td, struct image_args *args, struct mac *mac_p); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD, NULL, 0, sysctl_kern_ps_strings, "LU", ""); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD| CTLFLAG_CAPRD, NULL, 0, sysctl_kern_usrstack, "LU", ""); SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD, NULL, 0, sysctl_kern_stackprot, "I", ""); u_long ps_arg_cache_limit = PAGE_SIZE / 16; SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, &ps_arg_cache_limit, 0, ""); static int map_at_zero = 0; TUNABLE_INT("security.bsd.map_at_zero", &map_at_zero); SYSCTL_INT(_security_bsd, OID_AUTO, map_at_zero, CTLFLAG_RW, &map_at_zero, 0, "Permit processes to map an object at virtual address 0."); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS) { struct proc *p; int error; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val; val = (unsigned int)p->p_sysent->sv_psstrings; error = SYSCTL_OUT(req, &val, sizeof(val)); } else #endif error = SYSCTL_OUT(req, &p->p_sysent->sv_psstrings, sizeof(p->p_sysent->sv_psstrings)); return error; } static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS) { struct proc *p; int error; p = curproc; #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { unsigned int val; val = (unsigned int)p->p_sysent->sv_usrstack; error = SYSCTL_OUT(req, &val, sizeof(val)); } else #endif error = SYSCTL_OUT(req, &p->p_sysent->sv_usrstack, sizeof(p->p_sysent->sv_usrstack)); return error; } static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS) { struct proc *p; p = curproc; return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot, sizeof(p->p_sysent->sv_stackprot))); } /* * Each of the items is a pointer to a `const struct execsw', hence the * double pointer here. */ static const struct execsw **execsw; #ifndef _SYS_SYSPROTO_H_ struct execve_args { char *fname; char **argv; char **envv; }; #endif int sys_execve(td, uap) struct thread *td; struct execve_args /* { char *fname; char **argv; char **envv; } */ *uap; { int error; struct image_args args; error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, NULL); return (error); } #ifndef _SYS_SYSPROTO_H_ struct fexecve_args { int fd; char **argv; char **envv; } #endif int sys_fexecve(struct thread *td, struct fexecve_args *uap) { int error; struct image_args args; error = exec_copyin_args(&args, NULL, UIO_SYSSPACE, uap->argv, uap->envv); if (error == 0) { args.fd = uap->fd; error = kern_execve(td, &args, NULL); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct __mac_execve_args { char *fname; char **argv; char **envv; struct mac *mac_p; }; #endif int sys___mac_execve(td, uap) struct thread *td; struct __mac_execve_args /* { char *fname; char **argv; char **envv; struct mac *mac_p; } */ *uap; { #ifdef MAC int error; struct image_args args; error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE, uap->argv, uap->envv); if (error == 0) error = kern_execve(td, &args, uap->mac_p); return (error); #else return (ENOSYS); #endif } /* * XXX: kern_execve has the astonishing property of not always returning to * the caller. If sufficiently bad things happen during the call to * do_execve(), it can end up calling exit1(); as a result, callers must * avoid doing anything which they might need to undo (e.g., allocating * memory). */ int kern_execve(td, args, mac_p) struct thread *td; struct image_args *args; struct mac *mac_p; { struct proc *p = td->td_proc; int error; AUDIT_ARG_ARGV(args->begin_argv, args->argc, args->begin_envv - args->begin_argv); AUDIT_ARG_ENVV(args->begin_envv, args->envc, args->endp - args->begin_envv); if (p->p_flag & P_HADTHREADS) { PROC_LOCK(p); if (thread_single(SINGLE_BOUNDARY)) { PROC_UNLOCK(p); exec_free_args(args); return (ERESTART); /* Try again later. */ } PROC_UNLOCK(p); } error = do_execve(td, args, mac_p); if (p->p_flag & P_HADTHREADS) { PROC_LOCK(p); /* * If success, we upgrade to SINGLE_EXIT state to * force other threads to suicide. */ if (error == 0) thread_single(SINGLE_EXIT); else thread_single_end(); PROC_UNLOCK(p); } return (error); } /* * In-kernel implementation of execve(). All arguments are assumed to be * userspace pointers from the passed thread. */ static int do_execve(td, args, mac_p) struct thread *td; struct image_args *args; struct mac *mac_p; { struct proc *p = td->td_proc; struct nameidata nd; struct ucred *newcred = NULL, *oldcred; struct uidinfo *euip; register_t *stack_base; int error, i; struct image_params image_params, *imgp; struct vattr attr; int (*img_first)(struct image_params *); struct pargs *oldargs = NULL, *newargs = NULL; struct sigacts *oldsigacts, *newsigacts; #ifdef KTRACE struct vnode *tracevp = NULL; struct ucred *tracecred = NULL; #endif struct vnode *textvp = NULL, *binvp = NULL; int credential_changing; int vfslocked; int textset; #ifdef MAC struct label *interpvplabel = NULL; int will_transition; #endif #ifdef HWPMC_HOOKS struct pmckern_procexec pe; #endif static const char fexecv_proc_title[] = "(fexecv)"; vfslocked = 0; imgp = &image_params; /* * Lock the process and set the P_INEXEC flag to indicate that * it should be left alone until we're done here. This is * necessary to avoid race conditions - e.g. in ptrace() - * that might allow a local user to illicitly obtain elevated * privileges. */ PROC_LOCK(p); KASSERT((p->p_flag & P_INEXEC) == 0, ("%s(): process already has P_INEXEC flag", __func__)); p->p_flag |= P_INEXEC; PROC_UNLOCK(p); /* * Initialize part of the common data */ imgp->proc = p; imgp->execlabel = NULL; imgp->attr = &attr; imgp->entry_addr = 0; imgp->reloc_base = 0; imgp->vmspace_destroyed = 0; imgp->interpreted = 0; imgp->opened = 0; imgp->interpreter_name = NULL; imgp->auxargs = NULL; imgp->vp = NULL; imgp->object = NULL; imgp->firstpage = NULL; imgp->ps_strings = 0; imgp->auxarg_size = 0; imgp->args = args; imgp->execpath = imgp->freepath = NULL; imgp->execpathp = 0; imgp->canary = 0; imgp->canarylen = 0; imgp->pagesizes = 0; imgp->pagesizeslen = 0; imgp->stack_prot = 0; #ifdef MAC error = mac_execve_enter(imgp, mac_p); if (error) goto exec_fail; #endif imgp->image_header = NULL; /* * Translate the file name. namei() returns a vnode pointer * in ni_vp amoung other things. * * XXXAUDIT: It would be desirable to also audit the name of the * interpreter if this is an interpreted binary. */ if (args->fname != NULL) { NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME | MPSAFE | AUDITVNODE1, UIO_SYSSPACE, args->fname, td); } SDT_PROBE(proc, kernel, , exec, args->fname, 0, 0, 0, 0 ); interpret: if (args->fname != NULL) { #ifdef CAPABILITY_MODE /* * While capability mode can't reach this point via direct * path arguments to execve(), we also don't allow * interpreters to be used in capability mode (for now). * Catch indirect lookups and return a permissions error. */ if (IN_CAPABILITY_MODE(td)) { error = ECAPMODE; goto exec_fail; } #endif error = namei(&nd); if (error) goto exec_fail; vfslocked = NDHASGIANT(&nd); binvp = nd.ni_vp; imgp->vp = binvp; } else { AUDIT_ARG_FD(args->fd); /* * Some might argue that CAP_READ and/or CAP_MMAP should also * be required here; such arguments will be entertained. */ error = fgetvp_read(td, args->fd, CAP_FEXECVE, &binvp); if (error) goto exec_fail; vfslocked = VFS_LOCK_GIANT(binvp->v_mount); vn_lock(binvp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(binvp); imgp->vp = binvp; } /* * Check file permissions (also 'opens' file) */ error = exec_check_permissions(imgp); if (error) goto exec_fail_dealloc; imgp->object = imgp->vp->v_object; if (imgp->object != NULL) vm_object_reference(imgp->object); /* * Set VV_TEXT now so no one can write to the executable while we're * activating it. * * Remember if this was set before and unset it in case this is not * actually an executable image. */ textset = imgp->vp->v_vflag & VV_TEXT; ASSERT_VOP_ELOCKED(imgp->vp, "vv_text"); imgp->vp->v_vflag |= VV_TEXT; error = exec_map_first_page(imgp); if (error) goto exec_fail_dealloc; imgp->proc->p_osrel = 0; /* * If the current process has a special image activator it * wants to try first, call it. For example, emulating shell * scripts differently. */ error = -1; if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL) error = img_first(imgp); /* * Loop through the list of image activators, calling each one. * An activator returns -1 if there is no match, 0 on success, * and an error otherwise. */ for (i = 0; error == -1 && execsw[i]; ++i) { if (execsw[i]->ex_imgact == NULL || execsw[i]->ex_imgact == img_first) { continue; } error = (*execsw[i]->ex_imgact)(imgp); } if (error) { if (error == -1) { if (textset == 0) { ASSERT_VOP_ELOCKED(imgp->vp, "vv_text"); imgp->vp->v_vflag &= ~VV_TEXT; } error = ENOEXEC; } goto exec_fail_dealloc; } /* * Special interpreter operation, cleanup and loop up to try to * activate the interpreter. */ if (imgp->interpreted) { exec_unmap_first_page(imgp); /* * VV_TEXT needs to be unset for scripts. There is a short * period before we determine that something is a script where * VV_TEXT will be set. The vnode lock is held over this * entire period so nothing should illegitimately be blocked. */ imgp->vp->v_vflag &= ~VV_TEXT; /* free name buffer and old vnode */ if (args->fname != NULL) NDFREE(&nd, NDF_ONLY_PNBUF); #ifdef MAC mac_execve_interpreter_enter(binvp, &interpvplabel); #endif if (imgp->opened) { VOP_CLOSE(binvp, FREAD, td->td_ucred, td); imgp->opened = 0; } vput(binvp); vm_object_deallocate(imgp->object); imgp->object = NULL; VFS_UNLOCK_GIANT(vfslocked); vfslocked = 0; /* set new name to that of the interpreter */ NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME | MPSAFE, UIO_SYSSPACE, imgp->interpreter_name, td); args->fname = imgp->interpreter_name; goto interpret; } /* * NB: We unlock the vnode here because it is believed that none * of the sv_copyout_strings/sv_fixup operations require the vnode. */ VOP_UNLOCK(imgp->vp, 0); /* * Do the best to calculate the full path to the image file. */ if (imgp->auxargs != NULL && ((args->fname != NULL && args->fname[0] == '/') || vn_fullpath(td, imgp->vp, &imgp->execpath, &imgp->freepath) != 0)) imgp->execpath = args->fname; /* * Copy out strings (args and env) and initialize stack base */ if (p->p_sysent->sv_copyout_strings) stack_base = (*p->p_sysent->sv_copyout_strings)(imgp); else stack_base = exec_copyout_strings(imgp); /* * If custom stack fixup routine present for this process * let it do the stack setup. * Else stuff argument count as first item on stack */ if (p->p_sysent->sv_fixup != NULL) (*p->p_sysent->sv_fixup)(&stack_base, imgp); else suword(--stack_base, imgp->args->argc); /* * For security and other reasons, the file descriptor table cannot * be shared after an exec. */ fdunshare(p, td); /* * Malloc things before we need locks. */ newcred = crget(); euip = uifind(attr.va_uid); i = imgp->args->begin_envv - imgp->args->begin_argv; /* Cache arguments if they fit inside our allowance */ if (ps_arg_cache_limit >= i + sizeof(struct pargs)) { newargs = pargs_alloc(i); bcopy(imgp->args->begin_argv, newargs->ar_args, i); } /* close files on exec */ fdcloseexec(td); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); /* Get a reference to the vnode prior to locking the proc */ VREF(binvp); /* * For security and other reasons, signal handlers cannot * be shared after an exec. The new process gets a copy of the old * handlers. In execsigs(), the new process will have its signals * reset. */ PROC_LOCK(p); oldcred = crcopysafe(p, newcred); if (sigacts_shared(p->p_sigacts)) { oldsigacts = p->p_sigacts; PROC_UNLOCK(p); newsigacts = sigacts_alloc(); sigacts_copy(newsigacts, oldsigacts); PROC_LOCK(p); p->p_sigacts = newsigacts; } else oldsigacts = NULL; /* Stop profiling */ stopprofclock(p); /* reset caught signals */ execsigs(p); /* name this process - nameiexec(p, ndp) */ bzero(p->p_comm, sizeof(p->p_comm)); if (args->fname) bcopy(nd.ni_cnd.cn_nameptr, p->p_comm, min(nd.ni_cnd.cn_namelen, MAXCOMLEN)); else if (vn_commname(binvp, p->p_comm, sizeof(p->p_comm)) != 0) bcopy(fexecv_proc_title, p->p_comm, sizeof(fexecv_proc_title)); bcopy(p->p_comm, td->td_name, sizeof(td->td_name)); #ifdef KTR sched_clear_tdname(td); #endif /* * mark as execed, wakeup the process that vforked (if any) and tell * it that it now has its own resources back */ p->p_flag |= P_EXEC; if (p->p_pptr && (p->p_flag & P_PPWAIT)) { p->p_flag &= ~P_PPWAIT; cv_broadcast(&p->p_pwait); } /* * Implement image setuid/setgid. * * Don't honor setuid/setgid if the filesystem prohibits it or if * the process is being traced. * * We disable setuid/setgid/etc in compatibility mode on the basis * that most setugid applications are not written with that * environment in mind, and will therefore almost certainly operate * incorrectly. In principle there's no reason that setugid * applications might not be useful in capability mode, so we may want * to reconsider this conservative design choice in the future. * * XXXMAC: For the time being, use NOSUID to also prohibit * transitions on the file system. */ credential_changing = 0; credential_changing |= (attr.va_mode & S_ISUID) && oldcred->cr_uid != attr.va_uid; credential_changing |= (attr.va_mode & S_ISGID) && oldcred->cr_gid != attr.va_gid; #ifdef MAC will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp, interpvplabel, imgp); credential_changing |= will_transition; #endif if (credential_changing && #ifdef CAPABILITY_MODE ((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) && #endif (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 && (p->p_flag & P_TRACED) == 0) { /* * Turn off syscall tracing for set-id programs, except for * root. Record any set-id flags first to make sure that * we do not regain any tracing during a possible block. */ setsugid(p); #ifdef KTRACE if (priv_check_cred(oldcred, PRIV_DEBUG_DIFFCRED, 0)) ktrprocexec(p, &tracecred, &tracevp); #endif /* * Close any file descriptors 0..2 that reference procfs, * then make sure file descriptors 0..2 are in use. * * setugidsafety() may call closef() and then pfind() * which may grab the process lock. * fdcheckstd() may call falloc() which may block to * allocate memory, so temporarily drop the process lock. */ PROC_UNLOCK(p); VOP_UNLOCK(imgp->vp, 0); setugidsafety(td); error = fdcheckstd(td); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); if (error != 0) goto done1; PROC_LOCK(p); /* * Set the new credentials. */ if (attr.va_mode & S_ISUID) change_euid(newcred, euip); if (attr.va_mode & S_ISGID) change_egid(newcred, attr.va_gid); #ifdef MAC if (will_transition) { mac_vnode_execve_transition(oldcred, newcred, imgp->vp, interpvplabel, imgp); } #endif /* * Implement correct POSIX saved-id behavior. * * XXXMAC: Note that the current logic will save the * uid and gid if a MAC domain transition occurs, even * though maybe it shouldn't. */ change_svuid(newcred, newcred->cr_uid); change_svgid(newcred, newcred->cr_gid); p->p_ucred = newcred; newcred = NULL; } else { if (oldcred->cr_uid == oldcred->cr_ruid && oldcred->cr_gid == oldcred->cr_rgid) p->p_flag &= ~P_SUGID; /* * Implement correct POSIX saved-id behavior. * * XXX: It's not clear that the existing behavior is * POSIX-compliant. A number of sources indicate that the * saved uid/gid should only be updated if the new ruid is * not equal to the old ruid, or the new euid is not equal * to the old euid and the new euid is not equal to the old * ruid. The FreeBSD code always updates the saved uid/gid. * Also, this code uses the new (replaced) euid and egid as * the source, which may or may not be the right ones to use. */ if (oldcred->cr_svuid != oldcred->cr_uid || oldcred->cr_svgid != oldcred->cr_gid) { change_svuid(newcred, newcred->cr_uid); change_svgid(newcred, newcred->cr_gid); p->p_ucred = newcred; newcred = NULL; } } /* * Store the vp for use in procfs. This vnode was referenced prior * to locking the proc lock. */ textvp = p->p_textvp; p->p_textvp = binvp; #ifdef KDTRACE_HOOKS /* * Tell the DTrace fasttrap provider about the exec if it * has declared an interest. */ if (dtrace_fasttrap_exec) dtrace_fasttrap_exec(p); #endif /* * Notify others that we exec'd, and clear the P_INEXEC flag * as we're now a bona fide freshly-execed process. */ KNOTE_LOCKED(&p->p_klist, NOTE_EXEC); p->p_flag &= ~P_INEXEC; /* clear "fork but no exec" flag, as we _are_ execing */ p->p_acflag &= ~AFORK; /* * Free any previous argument cache and replace it with * the new argument cache, if any. */ oldargs = p->p_args; p->p_args = newargs; newargs = NULL; #ifdef HWPMC_HOOKS /* * Check if system-wide sampling is in effect or if the * current process is using PMCs. If so, do exec() time * processing. This processing needs to happen AFTER the * P_INEXEC flag is cleared. * * The proc lock needs to be released before taking the PMC * SX. */ if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) { PROC_UNLOCK(p); VOP_UNLOCK(imgp->vp, 0); pe.pm_credentialschanged = credential_changing; pe.pm_entryaddr = imgp->entry_addr; PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe); vn_lock(imgp->vp, LK_SHARED | LK_RETRY); } else PROC_UNLOCK(p); #else /* !HWPMC_HOOKS */ PROC_UNLOCK(p); #endif /* Set values passed into the program in registers. */ if (p->p_sysent->sv_setregs) (*p->p_sysent->sv_setregs)(td, imgp, (u_long)(uintptr_t)stack_base); else exec_setregs(td, imgp, (u_long)(uintptr_t)stack_base); vfs_mark_atime(imgp->vp, td->td_ucred); SDT_PROBE(proc, kernel, , exec_success, args->fname, 0, 0, 0, 0); done1: /* * Free any resources malloc'd earlier that we didn't use. */ uifree(euip); if (newcred == NULL) crfree(oldcred); else crfree(newcred); VOP_UNLOCK(imgp->vp, 0); /* * Handle deferred decrement of ref counts. */ if (textvp != NULL) { int tvfslocked; tvfslocked = VFS_LOCK_GIANT(textvp->v_mount); vrele(textvp); VFS_UNLOCK_GIANT(tvfslocked); } if (binvp && error != 0) vrele(binvp); #ifdef KTRACE if (tracevp != NULL) { int tvfslocked; tvfslocked = VFS_LOCK_GIANT(tracevp->v_mount); vrele(tracevp); VFS_UNLOCK_GIANT(tvfslocked); } if (tracecred != NULL) crfree(tracecred); #endif vn_lock(imgp->vp, LK_SHARED | LK_RETRY); pargs_drop(oldargs); pargs_drop(newargs); if (oldsigacts != NULL) sigacts_free(oldsigacts); exec_fail_dealloc: /* * free various allocated resources */ if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); if (imgp->vp != NULL) { if (args->fname) NDFREE(&nd, NDF_ONLY_PNBUF); if (imgp->opened) VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td); vput(imgp->vp); } if (imgp->object != NULL) vm_object_deallocate(imgp->object); free(imgp->freepath, M_TEMP); if (error == 0) { PROC_LOCK(p); td->td_dbgflags |= TDB_EXEC; PROC_UNLOCK(p); /* * Stop the process here if its stop event mask has * the S_EXEC bit set. */ STOPEVENT(p, S_EXEC, 0); goto done2; } exec_fail: /* we're done here, clear P_INEXEC */ PROC_LOCK(p); p->p_flag &= ~P_INEXEC; PROC_UNLOCK(p); SDT_PROBE(proc, kernel, , exec_failure, error, 0, 0, 0, 0); done2: #ifdef MAC mac_execve_exit(imgp); mac_execve_interpreter_exit(interpvplabel); #endif VFS_UNLOCK_GIANT(vfslocked); exec_free_args(args); if (error && imgp->vmspace_destroyed) { /* sorry, no more process anymore. exit gracefully */ exit1(td, W_EXITCODE(0, SIGABRT)); /* NOT REACHED */ } #ifdef KTRACE if (error == 0) ktrprocctor(p); #endif return (error); } int exec_map_first_page(imgp) struct image_params *imgp; { int rv, i; int initial_pagein; vm_page_t ma[VM_INITIAL_PAGEIN]; vm_object_t object; if (imgp->firstpage != NULL) exec_unmap_first_page(imgp); object = imgp->vp->v_object; if (object == NULL) return (EACCES); VM_OBJECT_LOCK(object); #if VM_NRESERVLEVEL > 0 if ((object->flags & OBJ_COLORED) == 0) { object->flags |= OBJ_COLORED; object->pg_color = 0; } #endif ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); if (ma[0]->valid != VM_PAGE_BITS_ALL) { initial_pagein = VM_INITIAL_PAGEIN; if (initial_pagein > object->size) initial_pagein = object->size; for (i = 1; i < initial_pagein; i++) { if ((ma[i] = vm_page_next(ma[i - 1])) != NULL) { if (ma[i]->valid) break; if ((ma[i]->oflags & VPO_BUSY) || ma[i]->busy) break; vm_page_busy(ma[i]); } else { ma[i] = vm_page_alloc(object, i, VM_ALLOC_NORMAL | VM_ALLOC_IFNOTCACHED); if (ma[i] == NULL) break; } } initial_pagein = i; rv = vm_pager_get_pages(object, ma, initial_pagein, 0); ma[0] = vm_page_lookup(object, 0); if ((rv != VM_PAGER_OK) || (ma[0] == NULL)) { if (ma[0] != NULL) { vm_page_lock(ma[0]); vm_page_free(ma[0]); vm_page_unlock(ma[0]); } VM_OBJECT_UNLOCK(object); return (EIO); } } vm_page_lock(ma[0]); vm_page_hold(ma[0]); vm_page_unlock(ma[0]); vm_page_wakeup(ma[0]); VM_OBJECT_UNLOCK(object); imgp->firstpage = sf_buf_alloc(ma[0], 0); imgp->image_header = (char *)sf_buf_kva(imgp->firstpage); return (0); } void exec_unmap_first_page(imgp) struct image_params *imgp; { vm_page_t m; if (imgp->firstpage != NULL) { m = sf_buf_page(imgp->firstpage); sf_buf_free(imgp->firstpage); imgp->firstpage = NULL; vm_page_lock(m); vm_page_unhold(m); vm_page_unlock(m); } } /* * Destroy old address space, and allocate a new stack * The new stack is only SGROWSIZ large because it is grown * automatically in trap.c. */ int exec_new_vmspace(imgp, sv) struct image_params *imgp; struct sysentvec *sv; { int error; struct proc *p = imgp->proc; struct vmspace *vmspace = p->p_vmspace; vm_object_t obj; vm_offset_t sv_minuser, stack_addr; vm_map_t map; u_long ssiz; imgp->vmspace_destroyed = 1; imgp->sysent = sv; /* May be called with Giant held */ EVENTHANDLER_INVOKE(process_exec, p, imgp); /* * Blow away entire process VM, if address space not shared, * otherwise, create a new VM space so that other threads are * not disrupted */ map = &vmspace->vm_map; if (map_at_zero) sv_minuser = sv->sv_minuser; else sv_minuser = MAX(sv->sv_minuser, PAGE_SIZE); if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv_minuser && vm_map_max(map) == sv->sv_maxuser) { shmexit(vmspace); pmap_remove_pages(vmspace_pmap(vmspace)); vm_map_remove(map, vm_map_min(map), vm_map_max(map)); } else { error = vmspace_exec(p, sv_minuser, sv->sv_maxuser); if (error) return (error); vmspace = p->p_vmspace; map = &vmspace->vm_map; } /* Map a shared page */ obj = sv->sv_shared_page_obj; if (obj != NULL) { vm_object_reference(obj); error = vm_map_fixed(map, obj, 0, sv->sv_shared_page_base, sv->sv_shared_page_len, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_ALL, MAP_COPY_ON_WRITE | MAP_ACC_NO_CHARGE); if (error) { vm_object_deallocate(obj); return (error); } } /* Allocate a new stack */ if (sv->sv_maxssiz != NULL) ssiz = *sv->sv_maxssiz; else ssiz = maxssiz; stack_addr = sv->sv_usrstack - ssiz; error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz, obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot : sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN); if (error) return (error); #ifdef __ia64__ /* Allocate a new register stack */ stack_addr = IA64_BACKINGSTORE; error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz, sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_UP); if (error) return (error); #endif /* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the * VM_STACK case, but they are still used to monitor the size of the * process stack so we can check the stack rlimit. */ vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT; vmspace->vm_maxsaddr = (char *)sv->sv_usrstack - ssiz; return (0); } /* * Copy out argument and environment strings from the old process address * space into the temporary string buffer. */ int exec_copyin_args(struct image_args *args, char *fname, enum uio_seg segflg, char **argv, char **envv) { char *argp, *envp; int error; size_t length; bzero(args, sizeof(*args)); if (argv == NULL) return (EFAULT); /* * Allocate demand-paged memory for the file name, argument, and * environment strings. */ error = exec_alloc_args(args); if (error != 0) return (error); /* * Copy the file name. */ if (fname != NULL) { args->fname = args->buf; error = (segflg == UIO_SYSSPACE) ? copystr(fname, args->fname, PATH_MAX, &length) : copyinstr(fname, args->fname, PATH_MAX, &length); if (error != 0) goto err_exit; } else length = 0; args->begin_argv = args->buf + length; args->endp = args->begin_argv; args->stringspace = ARG_MAX; /* * extract arguments first */ while ((argp = (caddr_t) (intptr_t) fuword(argv++))) { if (argp == (caddr_t) -1) { error = EFAULT; goto err_exit; } if ((error = copyinstr(argp, args->endp, args->stringspace, &length))) { if (error == ENAMETOOLONG) error = E2BIG; goto err_exit; } args->stringspace -= length; args->endp += length; args->argc++; } args->begin_envv = args->endp; /* * extract environment strings */ if (envv) { while ((envp = (caddr_t)(intptr_t)fuword(envv++))) { if (envp == (caddr_t)-1) { error = EFAULT; goto err_exit; } if ((error = copyinstr(envp, args->endp, args->stringspace, &length))) { if (error == ENAMETOOLONG) error = E2BIG; goto err_exit; } args->stringspace -= length; args->endp += length; args->envc++; } } return (0); err_exit: exec_free_args(args); return (error); } /* * Allocate temporary demand-paged, zero-filled memory for the file name, * argument, and environment strings. Returns zero if the allocation succeeds * and ENOMEM otherwise. */ int exec_alloc_args(struct image_args *args) { args->buf = (char *)kmem_alloc_wait(exec_map, PATH_MAX + ARG_MAX); return (args->buf != NULL ? 0 : ENOMEM); } void exec_free_args(struct image_args *args) { if (args->buf != NULL) { kmem_free_wakeup(exec_map, (vm_offset_t)args->buf, PATH_MAX + ARG_MAX); args->buf = NULL; } if (args->fname_buf != NULL) { free(args->fname_buf, M_TEMP); args->fname_buf = NULL; } } /* * Copy strings out to the new process address space, constructing new arg * and env vector tables. Return a pointer to the base so that it can be used * as the initial stack pointer. */ register_t * exec_copyout_strings(imgp) struct image_params *imgp; { int argc, envc; char **vectp; char *stringp, *destp; register_t *stack_base; struct ps_strings *arginfo; struct proc *p; size_t execpath_len; int szsigcode, szps; char canary[sizeof(long) * 8]; szps = sizeof(pagesizes[0]) * MAXPAGESIZES; /* * Calculate string base and vector table pointers. * Also deal with signal trampoline code for this exec type. */ if (imgp->execpath != NULL && imgp->auxargs != NULL) execpath_len = strlen(imgp->execpath) + 1; else execpath_len = 0; p = imgp->proc; szsigcode = 0; arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; if (p->p_sysent->sv_sigcode_base == 0) { if (p->p_sysent->sv_szsigcode != NULL) szsigcode = *(p->p_sysent->sv_szsigcode); } destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE - roundup(execpath_len, sizeof(char *)) - roundup(sizeof(canary), sizeof(char *)) - roundup(szps, sizeof(char *)) - roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *)); /* * install sigcode */ if (szsigcode != 0) copyout(p->p_sysent->sv_sigcode, ((caddr_t)arginfo - szsigcode), szsigcode); /* * Copy the image path for the rtld. */ if (execpath_len != 0) { imgp->execpathp = (uintptr_t)arginfo - szsigcode - execpath_len; copyout(imgp->execpath, (void *)imgp->execpathp, execpath_len); } /* * Prepare the canary for SSP. */ arc4rand(canary, sizeof(canary), 0); imgp->canary = (uintptr_t)arginfo - szsigcode - execpath_len - sizeof(canary); copyout(canary, (void *)imgp->canary, sizeof(canary)); imgp->canarylen = sizeof(canary); /* * Prepare the pagesizes array. */ imgp->pagesizes = (uintptr_t)arginfo - szsigcode - execpath_len - roundup(sizeof(canary), sizeof(char *)) - szps; copyout(pagesizes, (void *)imgp->pagesizes, szps); imgp->pagesizeslen = szps; /* * If we have a valid auxargs ptr, prepare some room * on the stack. */ if (imgp->auxargs) { /* * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for * lower compatibility. */ imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size : (AT_COUNT * 2); /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets,and imgp->auxarg_size is room * for argument of Runtime loader. */ vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2 + imgp->auxarg_size) * sizeof(char *)); } else { /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets */ vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2) * sizeof(char *)); } /* * vectp also becomes our initial stack base */ stack_base = (register_t *)vectp; stringp = imgp->args->begin_argv; argc = imgp->args->argc; envc = imgp->args->envc; /* * Copy out strings - arguments and environment. */ copyout(stringp, destp, ARG_MAX - imgp->args->stringspace); /* * Fill in "ps_strings" struct for ps, w, etc. */ suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp); suword32(&arginfo->ps_nargvstr, argc); /* * Fill in argument portion of vector table. */ for (; argc > 0; --argc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* a null vector table pointer separates the argp's from the envp's */ suword(vectp++, 0); suword(&arginfo->ps_envstr, (long)(intptr_t)vectp); suword32(&arginfo->ps_nenvstr, envc); /* * Fill in environment portion of vector table. */ for (; envc > 0; --envc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* end of vector table is a null pointer */ suword(vectp, 0); return (stack_base); } /* * Check permissions of file to execute. * Called with imgp->vp locked. * Return 0 for success or error code on failure. */ int exec_check_permissions(imgp) struct image_params *imgp; { struct vnode *vp = imgp->vp; struct vattr *attr = imgp->attr; struct thread *td; int error; td = curthread; /* Get file attributes */ error = VOP_GETATTR(vp, attr, td->td_ucred); if (error) return (error); #ifdef MAC error = mac_vnode_check_exec(td->td_ucred, imgp->vp, imgp); if (error) return (error); #endif /* * 1) Check if file execution is disabled for the filesystem that * this file resides on. * 2) Ensure that at least one execute bit is on. Otherwise, a * privileged user will always succeed, and we don't want this * to happen unless the file really is executable. * 3) Ensure that the file is a regular file. */ if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || (attr->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0 || (attr->va_type != VREG)) return (EACCES); /* * Zero length files can't be exec'd */ if (attr->va_size == 0) return (ENOEXEC); /* * Check for execute permission to file based on current credentials. */ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); if (error) return (error); /* * Check number of open-for-writes on the file and deny execution * if there are any. */ if (vp->v_writecount) return (ETXTBSY); /* * Call filesystem specific open routine (which does nothing in the * general case). */ error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL); if (error == 0) imgp->opened = 1; return (error); } /* * Exec handler registration */ int exec_register(execsw_arg) const struct execsw *execsw_arg; { const struct execsw **es, **xs, **newexecsw; int count = 2; /* New slot and trailing NULL */ if (execsw) for (es = execsw; *es; es++) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); if (newexecsw == NULL) return (ENOMEM); xs = newexecsw; if (execsw) for (es = execsw; *es; es++) *xs++ = *es; *xs++ = execsw_arg; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } int exec_unregister(execsw_arg) const struct execsw *execsw_arg; { const struct execsw **es, **xs, **newexecsw; int count = 1; if (execsw == NULL) panic("unregister with no handlers left?\n"); for (es = execsw; *es; es++) { if (*es == execsw_arg) break; } if (*es == NULL) return (ENOENT); for (es = execsw; *es; es++) if (*es != execsw_arg) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); if (newexecsw == NULL) return (ENOMEM); xs = newexecsw; for (es = execsw; *es; es++) if (*es != execsw_arg) *xs++ = *es; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } static struct sx shared_page_alloc_sx; static vm_object_t shared_page_obj; static int shared_page_free; struct sf_buf * shared_page_write_start(int base) { vm_page_t m; struct sf_buf *s; VM_OBJECT_LOCK(shared_page_obj); m = vm_page_grab(shared_page_obj, OFF_TO_IDX(base), VM_ALLOC_RETRY); VM_OBJECT_UNLOCK(shared_page_obj); s = sf_buf_alloc(m, SFB_DEFAULT); return (s); } void shared_page_write_end(struct sf_buf *sf) { vm_page_t m; m = sf_buf_page(sf); sf_buf_free(sf); VM_OBJECT_LOCK(shared_page_obj); vm_page_wakeup(m); VM_OBJECT_UNLOCK(shared_page_obj); } void shared_page_write(int base, int size, const void *data) { struct sf_buf *sf; vm_offset_t sk; sf = shared_page_write_start(base); sk = sf_buf_kva(sf); bcopy(data, (void *)(sk + (base & PAGE_MASK)), size); shared_page_write_end(sf); } static int shared_page_alloc_locked(int size, int align) { int res; res = roundup(shared_page_free, align); if (res + size >= IDX_TO_OFF(shared_page_obj->size)) res = -1; else shared_page_free = res + size; return (res); } int shared_page_alloc(int size, int align) { int res; sx_xlock(&shared_page_alloc_sx); res = shared_page_alloc_locked(size, align); sx_xunlock(&shared_page_alloc_sx); return (res); } int shared_page_fill(int size, int align, const void *data) { int res; sx_xlock(&shared_page_alloc_sx); res = shared_page_alloc_locked(size, align); if (res != -1) shared_page_write(res, size, data); sx_xunlock(&shared_page_alloc_sx); return (res); } static void shared_page_init(void *dummy __unused) { vm_page_t m; sx_init(&shared_page_alloc_sx, "shpsx"); shared_page_obj = vm_pager_allocate(OBJT_PHYS, 0, PAGE_SIZE, VM_PROT_DEFAULT, 0, NULL); VM_OBJECT_LOCK(shared_page_obj); m = vm_page_grab(shared_page_obj, 0, VM_ALLOC_RETRY | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO); m->valid = VM_PAGE_BITS_ALL; VM_OBJECT_UNLOCK(shared_page_obj); } SYSINIT(shp, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t)shared_page_init, NULL); +static void +timehands_update(void *arg) +{ + struct sysentvec *sv; + struct sf_buf *sf; + struct vdso_timehands th; + struct vdso_timekeep *tk; + uint32_t enabled, idx; + + sv = arg; + sx_xlock(&shared_page_alloc_sx); + enabled = tc_fill_vdso_timehands(&th); + sf = shared_page_write_start(sv->sv_timekeep_off); + tk = (void *)(sf_buf_kva(sf) + (sv->sv_timekeep_off & PAGE_MASK)); + idx = sv->sv_timekeep_curr; + atomic_store_rel_32(&tk->tk_th[idx].th_gen, 0); + if (++idx >= VDSO_TH_NUM) + idx = 0; + sv->sv_timekeep_curr = idx; + if (++sv->sv_timekeep_gen == 0) + sv->sv_timekeep_gen = 1; + th.th_gen = 0; + if (enabled) + tk->tk_th[idx] = th; + tk->tk_enabled = enabled; + atomic_store_rel_32(&tk->tk_th[idx].th_gen, sv->sv_timekeep_gen); + tk->tk_current = idx; + shared_page_write_end(sf); + sx_xunlock(&shared_page_alloc_sx); +} + +#ifdef COMPAT_FREEBSD32 +static void +timehands_update32(void *arg) +{ + struct sysentvec *sv; + struct sf_buf *sf; + struct vdso_timekeep32 *tk; + struct vdso_timehands32 th; + uint32_t enabled, idx; + + sv = arg; + sx_xlock(&shared_page_alloc_sx); + enabled = tc_fill_vdso_timehands32(&th); + sf = shared_page_write_start(sv->sv_timekeep_off); + tk = (void *)(sf_buf_kva(sf) + (sv->sv_timekeep_off & PAGE_MASK)); + idx = sv->sv_timekeep_curr; + atomic_store_rel_32(&tk->tk_th[idx].th_gen, 0); + if (++idx >= VDSO_TH_NUM) + idx = 0; + sv->sv_timekeep_curr = idx; + if (++sv->sv_timekeep_gen == 0) + sv->sv_timekeep_gen = 1; + th.th_gen = 0; + if (enabled) + tk->tk_th[idx] = th; + tk->tk_enabled = enabled; + atomic_store_rel_32(&tk->tk_th[idx].th_gen, sv->sv_timekeep_gen); + tk->tk_current = idx; + shared_page_write_end(sf); + sx_xunlock(&shared_page_alloc_sx); +} +#endif + void exec_sysvec_init(void *param) { struct sysentvec *sv; + int tk_base; + uint32_t tk_ver; sv = (struct sysentvec *)param; if ((sv->sv_flags & SV_SHP) == 0) return; sv->sv_shared_page_obj = shared_page_obj; sv->sv_sigcode_base = sv->sv_shared_page_base + shared_page_fill(*(sv->sv_szsigcode), 16, sv->sv_sigcode); + tk_ver = VDSO_TK_VER_CURR; +#ifdef COMPAT_FREEBSD32 + if ((sv->sv_flags & SV_ILP32) != 0) { + tk_base = shared_page_alloc(sizeof(struct vdso_timekeep32) + + sizeof(struct vdso_timehands32) * VDSO_TH_NUM, 16); + KASSERT(tk_base != -1, ("tk_base -1 for 32bit")); + EVENTHANDLER_REGISTER(tc_windup, timehands_update32, sv, + EVENTHANDLER_PRI_ANY); + shared_page_write(tk_base + offsetof(struct vdso_timekeep32, + tk_ver), sizeof(uint32_t), &tk_ver); + } else { +#endif + tk_base = shared_page_alloc(sizeof(struct vdso_timekeep) + + sizeof(struct vdso_timehands) * VDSO_TH_NUM, 16); + KASSERT(tk_base != -1, ("tk_base -1 for native")); + EVENTHANDLER_REGISTER(tc_windup, timehands_update, sv, + EVENTHANDLER_PRI_ANY); + shared_page_write(tk_base + offsetof(struct vdso_timekeep, + tk_ver), sizeof(uint32_t), &tk_ver); +#ifdef COMPAT_FREEBSD32 + } +#endif + sv->sv_timekeep_base = sv->sv_shared_page_base + tk_base; + sv->sv_timekeep_off = tk_base; + EVENTHANDLER_INVOKE(tc_windup); } Index: head/sys/kern/kern_tc.c =================================================================== --- head/sys/kern/kern_tc.c (revision 237432) +++ head/sys/kern/kern_tc.c (revision 237433) @@ -1,1846 +1,1930 @@ /*- * ---------------------------------------------------------------------------- * "THE BEER-WARE LICENSE" (Revision 42): * wrote this file. As long as you retain this notice you * can do whatever you want with this stuff. If we meet some day, and you think * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp * ---------------------------------------------------------------------------- * * Copyright (c) 2011 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Julien Ridoux at the University * of Melbourne under sponsorship from the FreeBSD Foundation. */ #include __FBSDID("$FreeBSD$"); +#include "opt_compat.h" #include "opt_ntp.h" #include "opt_ffclock.h" #include #include #ifdef FFCLOCK #include #include #endif #include #include #include #include #include +#include #include #include +#include /* * A large step happens on boot. This constant detects such steps. * It is relatively small so that ntp_update_second gets called enough * in the typical 'missed a couple of seconds' case, but doesn't loop * forever when the time step is large. */ #define LARGE_STEP 200 /* * Implement a dummy timecounter which we can use until we get a real one * in the air. This allows the console and other early stuff to use * time services. */ static u_int dummy_get_timecount(struct timecounter *tc) { static u_int now; return (++now); } static struct timecounter dummy_timecounter = { dummy_get_timecount, 0, ~0u, 1000000, "dummy", -1000000 }; struct timehands { /* These fields must be initialized by the driver. */ struct timecounter *th_counter; int64_t th_adjustment; uint64_t th_scale; u_int th_offset_count; struct bintime th_offset; struct timeval th_microtime; struct timespec th_nanotime; /* Fields not to be copied in tc_windup start with th_generation. */ volatile u_int th_generation; struct timehands *th_next; }; static struct timehands th0; static struct timehands th9 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th0}; static struct timehands th8 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th9}; static struct timehands th7 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th8}; static struct timehands th6 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th7}; static struct timehands th5 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th6}; static struct timehands th4 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th5}; static struct timehands th3 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th4}; static struct timehands th2 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th3}; static struct timehands th1 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th2}; static struct timehands th0 = { &dummy_timecounter, 0, (uint64_t)-1 / 1000000, 0, {1, 0}, {0, 0}, {0, 0}, 1, &th1 }; static struct timehands *volatile timehands = &th0; struct timecounter *timecounter = &dummy_timecounter; static struct timecounter *timecounters = &dummy_timecounter; int tc_min_ticktock_freq = 1; time_t time_second = 1; time_t time_uptime = 1; struct bintime boottimebin; struct timeval boottime; static int sysctl_kern_boottime(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_kern, KERN_BOOTTIME, boottime, CTLTYPE_STRUCT|CTLFLAG_RD, NULL, 0, sysctl_kern_boottime, "S,timeval", "System boottime"); SYSCTL_NODE(_kern, OID_AUTO, timecounter, CTLFLAG_RW, 0, ""); static SYSCTL_NODE(_kern_timecounter, OID_AUTO, tc, CTLFLAG_RW, 0, ""); static int timestepwarnings; SYSCTL_INT(_kern_timecounter, OID_AUTO, stepwarnings, CTLFLAG_RW, ×tepwarnings, 0, "Log time steps"); static void tc_windup(void); +static void tc_windup_push_vdso(void *ctx, int pending); static void cpu_tick_calibrate(int); +static struct task tc_windup_push_vdso_task = TASK_INITIALIZER(0, + tc_windup_push_vdso, 0); + static int sysctl_kern_boottime(SYSCTL_HANDLER_ARGS) { #ifndef __mips__ #ifdef SCTL_MASK32 int tv[2]; if (req->flags & SCTL_MASK32) { tv[0] = boottime.tv_sec; tv[1] = boottime.tv_usec; return SYSCTL_OUT(req, tv, sizeof(tv)); } else #endif #endif return SYSCTL_OUT(req, &boottime, sizeof(boottime)); } static int sysctl_kern_timecounter_get(SYSCTL_HANDLER_ARGS) { u_int ncount; struct timecounter *tc = arg1; ncount = tc->tc_get_timecount(tc); return sysctl_handle_int(oidp, &ncount, 0, req); } static int sysctl_kern_timecounter_freq(SYSCTL_HANDLER_ARGS) { uint64_t freq; struct timecounter *tc = arg1; freq = tc->tc_frequency; return sysctl_handle_64(oidp, &freq, 0, req); } /* * Return the difference between the timehands' counter value now and what * was when we copied it to the timehands' offset_count. */ static __inline u_int tc_delta(struct timehands *th) { struct timecounter *tc; tc = th->th_counter; return ((tc->tc_get_timecount(tc) - th->th_offset_count) & tc->tc_counter_mask); } /* * Functions for reading the time. We have to loop until we are sure that * the timehands that we operated on was not updated under our feet. See * the comment in for a description of these 12 functions. */ #ifdef FFCLOCK void fbclock_binuptime(struct bintime *bt) { struct timehands *th; unsigned int gen; do { th = timehands; gen = th->th_generation; *bt = th->th_offset; bintime_addx(bt, th->th_scale * tc_delta(th)); } while (gen == 0 || gen != th->th_generation); } void fbclock_nanouptime(struct timespec *tsp) { struct bintime bt; fbclock_binuptime(&bt); bintime2timespec(&bt, tsp); } void fbclock_microuptime(struct timeval *tvp) { struct bintime bt; fbclock_binuptime(&bt); bintime2timeval(&bt, tvp); } void fbclock_bintime(struct bintime *bt) { fbclock_binuptime(bt); bintime_add(bt, &boottimebin); } void fbclock_nanotime(struct timespec *tsp) { struct bintime bt; fbclock_bintime(&bt); bintime2timespec(&bt, tsp); } void fbclock_microtime(struct timeval *tvp) { struct bintime bt; fbclock_bintime(&bt); bintime2timeval(&bt, tvp); } void fbclock_getbinuptime(struct bintime *bt) { struct timehands *th; unsigned int gen; do { th = timehands; gen = th->th_generation; *bt = th->th_offset; } while (gen == 0 || gen != th->th_generation); } void fbclock_getnanouptime(struct timespec *tsp) { struct timehands *th; unsigned int gen; do { th = timehands; gen = th->th_generation; bintime2timespec(&th->th_offset, tsp); } while (gen == 0 || gen != th->th_generation); } void fbclock_getmicrouptime(struct timeval *tvp) { struct timehands *th; unsigned int gen; do { th = timehands; gen = th->th_generation; bintime2timeval(&th->th_offset, tvp); } while (gen == 0 || gen != th->th_generation); } void fbclock_getbintime(struct bintime *bt) { struct timehands *th; unsigned int gen; do { th = timehands; gen = th->th_generation; *bt = th->th_offset; } while (gen == 0 || gen != th->th_generation); bintime_add(bt, &boottimebin); } void fbclock_getnanotime(struct timespec *tsp) { struct timehands *th; unsigned int gen; do { th = timehands; gen = th->th_generation; *tsp = th->th_nanotime; } while (gen == 0 || gen != th->th_generation); } void fbclock_getmicrotime(struct timeval *tvp) { struct timehands *th; unsigned int gen; do { th = timehands; gen = th->th_generation; *tvp = th->th_microtime; } while (gen == 0 || gen != th->th_generation); } #else /* !FFCLOCK */ void binuptime(struct bintime *bt) { struct timehands *th; u_int gen; do { th = timehands; gen = th->th_generation; *bt = th->th_offset; bintime_addx(bt, th->th_scale * tc_delta(th)); } while (gen == 0 || gen != th->th_generation); } void nanouptime(struct timespec *tsp) { struct bintime bt; binuptime(&bt); bintime2timespec(&bt, tsp); } void microuptime(struct timeval *tvp) { struct bintime bt; binuptime(&bt); bintime2timeval(&bt, tvp); } void bintime(struct bintime *bt) { binuptime(bt); bintime_add(bt, &boottimebin); } void nanotime(struct timespec *tsp) { struct bintime bt; bintime(&bt); bintime2timespec(&bt, tsp); } void microtime(struct timeval *tvp) { struct bintime bt; bintime(&bt); bintime2timeval(&bt, tvp); } void getbinuptime(struct bintime *bt) { struct timehands *th; u_int gen; do { th = timehands; gen = th->th_generation; *bt = th->th_offset; } while (gen == 0 || gen != th->th_generation); } void getnanouptime(struct timespec *tsp) { struct timehands *th; u_int gen; do { th = timehands; gen = th->th_generation; bintime2timespec(&th->th_offset, tsp); } while (gen == 0 || gen != th->th_generation); } void getmicrouptime(struct timeval *tvp) { struct timehands *th; u_int gen; do { th = timehands; gen = th->th_generation; bintime2timeval(&th->th_offset, tvp); } while (gen == 0 || gen != th->th_generation); } void getbintime(struct bintime *bt) { struct timehands *th; u_int gen; do { th = timehands; gen = th->th_generation; *bt = th->th_offset; } while (gen == 0 || gen != th->th_generation); bintime_add(bt, &boottimebin); } void getnanotime(struct timespec *tsp) { struct timehands *th; u_int gen; do { th = timehands; gen = th->th_generation; *tsp = th->th_nanotime; } while (gen == 0 || gen != th->th_generation); } void getmicrotime(struct timeval *tvp) { struct timehands *th; u_int gen; do { th = timehands; gen = th->th_generation; *tvp = th->th_microtime; } while (gen == 0 || gen != th->th_generation); } #endif /* FFCLOCK */ #ifdef FFCLOCK /* * Support for feed-forward synchronization algorithms. This is heavily inspired * by the timehands mechanism but kept independent from it. *_windup() functions * have some connection to avoid accessing the timecounter hardware more than * necessary. */ /* Feed-forward clock estimates kept updated by the synchronization daemon. */ struct ffclock_estimate ffclock_estimate; struct bintime ffclock_boottime; /* Feed-forward boot time estimate. */ uint32_t ffclock_status; /* Feed-forward clock status. */ int8_t ffclock_updated; /* New estimates are available. */ struct mtx ffclock_mtx; /* Mutex on ffclock_estimate. */ struct fftimehands { struct ffclock_estimate cest; struct bintime tick_time; struct bintime tick_time_lerp; ffcounter tick_ffcount; uint64_t period_lerp; volatile uint8_t gen; struct fftimehands *next; }; #define NUM_ELEMENTS(x) (sizeof(x) / sizeof(*x)) static struct fftimehands ffth[10]; static struct fftimehands *volatile fftimehands = ffth; static void ffclock_init(void) { struct fftimehands *cur; struct fftimehands *last; memset(ffth, 0, sizeof(ffth)); last = ffth + NUM_ELEMENTS(ffth) - 1; for (cur = ffth; cur < last; cur++) cur->next = cur + 1; last->next = ffth; ffclock_updated = 0; ffclock_status = FFCLOCK_STA_UNSYNC; mtx_init(&ffclock_mtx, "ffclock lock", NULL, MTX_DEF); } /* * Reset the feed-forward clock estimates. Called from inittodr() to get things * kick started and uses the timecounter nominal frequency as a first period * estimate. Note: this function may be called several time just after boot. * Note: this is the only function that sets the value of boot time for the * monotonic (i.e. uptime) version of the feed-forward clock. */ void ffclock_reset_clock(struct timespec *ts) { struct timecounter *tc; struct ffclock_estimate cest; tc = timehands->th_counter; memset(&cest, 0, sizeof(struct ffclock_estimate)); timespec2bintime(ts, &ffclock_boottime); timespec2bintime(ts, &(cest.update_time)); ffclock_read_counter(&cest.update_ffcount); cest.leapsec_next = 0; cest.period = ((1ULL << 63) / tc->tc_frequency) << 1; cest.errb_abs = 0; cest.errb_rate = 0; cest.status = FFCLOCK_STA_UNSYNC; cest.leapsec_total = 0; cest.leapsec = 0; mtx_lock(&ffclock_mtx); bcopy(&cest, &ffclock_estimate, sizeof(struct ffclock_estimate)); ffclock_updated = INT8_MAX; mtx_unlock(&ffclock_mtx); printf("ffclock reset: %s (%llu Hz), time = %ld.%09lu\n", tc->tc_name, (unsigned long long)tc->tc_frequency, (long)ts->tv_sec, (unsigned long)ts->tv_nsec); } /* * Sub-routine to convert a time interval measured in RAW counter units to time * in seconds stored in bintime format. * NOTE: bintime_mul requires u_int, but the value of the ffcounter may be * larger than the max value of u_int (on 32 bit architecture). Loop to consume * extra cycles. */ static void ffclock_convert_delta(ffcounter ffdelta, uint64_t period, struct bintime *bt) { struct bintime bt2; ffcounter delta, delta_max; delta_max = (1ULL << (8 * sizeof(unsigned int))) - 1; bintime_clear(bt); do { if (ffdelta > delta_max) delta = delta_max; else delta = ffdelta; bt2.sec = 0; bt2.frac = period; bintime_mul(&bt2, (unsigned int)delta); bintime_add(bt, &bt2); ffdelta -= delta; } while (ffdelta > 0); } /* * Update the fftimehands. * Push the tick ffcount and time(s) forward based on current clock estimate. * The conversion from ffcounter to bintime relies on the difference clock * principle, whose accuracy relies on computing small time intervals. If a new * clock estimate has been passed by the synchronisation daemon, make it * current, and compute the linear interpolation for monotonic time if needed. */ static void ffclock_windup(unsigned int delta) { struct ffclock_estimate *cest; struct fftimehands *ffth; struct bintime bt, gap_lerp; ffcounter ffdelta; uint64_t frac; unsigned int polling; uint8_t forward_jump, ogen; /* * Pick the next timehand, copy current ffclock estimates and move tick * times and counter forward. */ forward_jump = 0; ffth = fftimehands->next; ogen = ffth->gen; ffth->gen = 0; cest = &ffth->cest; bcopy(&fftimehands->cest, cest, sizeof(struct ffclock_estimate)); ffdelta = (ffcounter)delta; ffth->period_lerp = fftimehands->period_lerp; ffth->tick_time = fftimehands->tick_time; ffclock_convert_delta(ffdelta, cest->period, &bt); bintime_add(&ffth->tick_time, &bt); ffth->tick_time_lerp = fftimehands->tick_time_lerp; ffclock_convert_delta(ffdelta, ffth->period_lerp, &bt); bintime_add(&ffth->tick_time_lerp, &bt); ffth->tick_ffcount = fftimehands->tick_ffcount + ffdelta; /* * Assess the status of the clock, if the last update is too old, it is * likely the synchronisation daemon is dead and the clock is free * running. */ if (ffclock_updated == 0) { ffdelta = ffth->tick_ffcount - cest->update_ffcount; ffclock_convert_delta(ffdelta, cest->period, &bt); if (bt.sec > 2 * FFCLOCK_SKM_SCALE) ffclock_status |= FFCLOCK_STA_UNSYNC; } /* * If available, grab updated clock estimates and make them current. * Recompute time at this tick using the updated estimates. The clock * estimates passed the feed-forward synchronisation daemon may result * in time conversion that is not monotonically increasing (just after * the update). time_lerp is a particular linear interpolation over the * synchronisation algo polling period that ensures monotonicity for the * clock ids requesting it. */ if (ffclock_updated > 0) { bcopy(&ffclock_estimate, cest, sizeof(struct ffclock_estimate)); ffdelta = ffth->tick_ffcount - cest->update_ffcount; ffth->tick_time = cest->update_time; ffclock_convert_delta(ffdelta, cest->period, &bt); bintime_add(&ffth->tick_time, &bt); /* ffclock_reset sets ffclock_updated to INT8_MAX */ if (ffclock_updated == INT8_MAX) ffth->tick_time_lerp = ffth->tick_time; if (bintime_cmp(&ffth->tick_time, &ffth->tick_time_lerp, >)) forward_jump = 1; else forward_jump = 0; bintime_clear(&gap_lerp); if (forward_jump) { gap_lerp = ffth->tick_time; bintime_sub(&gap_lerp, &ffth->tick_time_lerp); } else { gap_lerp = ffth->tick_time_lerp; bintime_sub(&gap_lerp, &ffth->tick_time); } /* * The reset from the RTC clock may be far from accurate, and * reducing the gap between real time and interpolated time * could take a very long time if the interpolated clock insists * on strict monotonicity. The clock is reset under very strict * conditions (kernel time is known to be wrong and * synchronization daemon has been restarted recently. * ffclock_boottime absorbs the jump to ensure boot time is * correct and uptime functions stay consistent. */ if (((ffclock_status & FFCLOCK_STA_UNSYNC) == FFCLOCK_STA_UNSYNC) && ((cest->status & FFCLOCK_STA_UNSYNC) == 0) && ((cest->status & FFCLOCK_STA_WARMUP) == FFCLOCK_STA_WARMUP)) { if (forward_jump) bintime_add(&ffclock_boottime, &gap_lerp); else bintime_sub(&ffclock_boottime, &gap_lerp); ffth->tick_time_lerp = ffth->tick_time; bintime_clear(&gap_lerp); } ffclock_status = cest->status; ffth->period_lerp = cest->period; /* * Compute corrected period used for the linear interpolation of * time. The rate of linear interpolation is capped to 5000PPM * (5ms/s). */ if (bintime_isset(&gap_lerp)) { ffdelta = cest->update_ffcount; ffdelta -= fftimehands->cest.update_ffcount; ffclock_convert_delta(ffdelta, cest->period, &bt); polling = bt.sec; bt.sec = 0; bt.frac = 5000000 * (uint64_t)18446744073LL; bintime_mul(&bt, polling); if (bintime_cmp(&gap_lerp, &bt, >)) gap_lerp = bt; /* Approximate 1 sec by 1-(1/2^64) to ease arithmetic */ frac = 0; if (gap_lerp.sec > 0) { frac -= 1; frac /= ffdelta / gap_lerp.sec; } frac += gap_lerp.frac / ffdelta; if (forward_jump) ffth->period_lerp += frac; else ffth->period_lerp -= frac; } ffclock_updated = 0; } if (++ogen == 0) ogen = 1; ffth->gen = ogen; fftimehands = ffth; } /* * Adjust the fftimehands when the timecounter is changed. Stating the obvious, * the old and new hardware counter cannot be read simultaneously. tc_windup() * does read the two counters 'back to back', but a few cycles are effectively * lost, and not accumulated in tick_ffcount. This is a fairly radical * operation for a feed-forward synchronization daemon, and it is its job to not * pushing irrelevant data to the kernel. Because there is no locking here, * simply force to ignore pending or next update to give daemon a chance to * realize the counter has changed. */ static void ffclock_change_tc(struct timehands *th) { struct fftimehands *ffth; struct ffclock_estimate *cest; struct timecounter *tc; uint8_t ogen; tc = th->th_counter; ffth = fftimehands->next; ogen = ffth->gen; ffth->gen = 0; cest = &ffth->cest; bcopy(&(fftimehands->cest), cest, sizeof(struct ffclock_estimate)); cest->period = ((1ULL << 63) / tc->tc_frequency ) << 1; cest->errb_abs = 0; cest->errb_rate = 0; cest->status |= FFCLOCK_STA_UNSYNC; ffth->tick_ffcount = fftimehands->tick_ffcount; ffth->tick_time_lerp = fftimehands->tick_time_lerp; ffth->tick_time = fftimehands->tick_time; ffth->period_lerp = cest->period; /* Do not lock but ignore next update from synchronization daemon. */ ffclock_updated--; if (++ogen == 0) ogen = 1; ffth->gen = ogen; fftimehands = ffth; } /* * Retrieve feed-forward counter and time of last kernel tick. */ void ffclock_last_tick(ffcounter *ffcount, struct bintime *bt, uint32_t flags) { struct fftimehands *ffth; uint8_t gen; /* * No locking but check generation has not changed. Also need to make * sure ffdelta is positive, i.e. ffcount > tick_ffcount. */ do { ffth = fftimehands; gen = ffth->gen; if ((flags & FFCLOCK_LERP) == FFCLOCK_LERP) *bt = ffth->tick_time_lerp; else *bt = ffth->tick_time; *ffcount = ffth->tick_ffcount; } while (gen == 0 || gen != ffth->gen); } /* * Absolute clock conversion. Low level function to convert ffcounter to * bintime. The ffcounter is converted using the current ffclock period estimate * or the "interpolated period" to ensure monotonicity. * NOTE: this conversion may have been deferred, and the clock updated since the * hardware counter has been read. */ void ffclock_convert_abs(ffcounter ffcount, struct bintime *bt, uint32_t flags) { struct fftimehands *ffth; struct bintime bt2; ffcounter ffdelta; uint8_t gen; /* * No locking but check generation has not changed. Also need to make * sure ffdelta is positive, i.e. ffcount > tick_ffcount. */ do { ffth = fftimehands; gen = ffth->gen; if (ffcount > ffth->tick_ffcount) ffdelta = ffcount - ffth->tick_ffcount; else ffdelta = ffth->tick_ffcount - ffcount; if ((flags & FFCLOCK_LERP) == FFCLOCK_LERP) { *bt = ffth->tick_time_lerp; ffclock_convert_delta(ffdelta, ffth->period_lerp, &bt2); } else { *bt = ffth->tick_time; ffclock_convert_delta(ffdelta, ffth->cest.period, &bt2); } if (ffcount > ffth->tick_ffcount) bintime_add(bt, &bt2); else bintime_sub(bt, &bt2); } while (gen == 0 || gen != ffth->gen); } /* * Difference clock conversion. * Low level function to Convert a time interval measured in RAW counter units * into bintime. The difference clock allows measuring small intervals much more * reliably than the absolute clock. */ void ffclock_convert_diff(ffcounter ffdelta, struct bintime *bt) { struct fftimehands *ffth; uint8_t gen; /* No locking but check generation has not changed. */ do { ffth = fftimehands; gen = ffth->gen; ffclock_convert_delta(ffdelta, ffth->cest.period, bt); } while (gen == 0 || gen != ffth->gen); } /* * Access to current ffcounter value. */ void ffclock_read_counter(ffcounter *ffcount) { struct timehands *th; struct fftimehands *ffth; unsigned int gen, delta; /* * ffclock_windup() called from tc_windup(), safe to rely on * th->th_generation only, for correct delta and ffcounter. */ do { th = timehands; gen = th->th_generation; ffth = fftimehands; delta = tc_delta(th); *ffcount = ffth->tick_ffcount; } while (gen == 0 || gen != th->th_generation); *ffcount += delta; } void binuptime(struct bintime *bt) { binuptime_fromclock(bt, sysclock_active); } void nanouptime(struct timespec *tsp) { nanouptime_fromclock(tsp, sysclock_active); } void microuptime(struct timeval *tvp) { microuptime_fromclock(tvp, sysclock_active); } void bintime(struct bintime *bt) { bintime_fromclock(bt, sysclock_active); } void nanotime(struct timespec *tsp) { nanotime_fromclock(tsp, sysclock_active); } void microtime(struct timeval *tvp) { microtime_fromclock(tvp, sysclock_active); } void getbinuptime(struct bintime *bt) { getbinuptime_fromclock(bt, sysclock_active); } void getnanouptime(struct timespec *tsp) { getnanouptime_fromclock(tsp, sysclock_active); } void getmicrouptime(struct timeval *tvp) { getmicrouptime_fromclock(tvp, sysclock_active); } void getbintime(struct bintime *bt) { getbintime_fromclock(bt, sysclock_active); } void getnanotime(struct timespec *tsp) { getnanotime_fromclock(tsp, sysclock_active); } void getmicrotime(struct timeval *tvp) { getmicrouptime_fromclock(tvp, sysclock_active); } #endif /* FFCLOCK */ /* * System clock currently providing time to the system. Modifiable via sysctl * when the FFCLOCK option is defined. */ int sysclock_active = SYSCLOCK_FBCK; /* Internal NTP status and error estimates. */ extern int time_status; extern long time_esterror; /* * Take a snapshot of sysclock data which can be used to compare system clocks * and generate timestamps after the fact. */ void sysclock_getsnapshot(struct sysclock_snap *clock_snap, int fast) { struct fbclock_info *fbi; struct timehands *th; struct bintime bt; unsigned int delta, gen; #ifdef FFCLOCK ffcounter ffcount; struct fftimehands *ffth; struct ffclock_info *ffi; struct ffclock_estimate cest; ffi = &clock_snap->ff_info; #endif fbi = &clock_snap->fb_info; delta = 0; do { th = timehands; gen = th->th_generation; fbi->th_scale = th->th_scale; fbi->tick_time = th->th_offset; #ifdef FFCLOCK ffth = fftimehands; ffi->tick_time = ffth->tick_time_lerp; ffi->tick_time_lerp = ffth->tick_time_lerp; ffi->period = ffth->cest.period; ffi->period_lerp = ffth->period_lerp; clock_snap->ffcount = ffth->tick_ffcount; cest = ffth->cest; #endif if (!fast) delta = tc_delta(th); } while (gen == 0 || gen != th->th_generation); clock_snap->delta = delta; clock_snap->sysclock_active = sysclock_active; /* Record feedback clock status and error. */ clock_snap->fb_info.status = time_status; /* XXX: Very crude estimate of feedback clock error. */ bt.sec = time_esterror / 1000000; bt.frac = ((time_esterror - bt.sec) * 1000000) * (uint64_t)18446744073709ULL; clock_snap->fb_info.error = bt; #ifdef FFCLOCK if (!fast) clock_snap->ffcount += delta; /* Record feed-forward clock leap second adjustment. */ ffi->leapsec_adjustment = cest.leapsec_total; if (clock_snap->ffcount > cest.leapsec_next) ffi->leapsec_adjustment -= cest.leapsec; /* Record feed-forward clock status and error. */ clock_snap->ff_info.status = cest.status; ffcount = clock_snap->ffcount - cest.update_ffcount; ffclock_convert_delta(ffcount, cest.period, &bt); /* 18446744073709 = int(2^64/1e12), err_bound_rate in [ps/s]. */ bintime_mul(&bt, cest.errb_rate * (uint64_t)18446744073709ULL); /* 18446744073 = int(2^64 / 1e9), since err_abs in [ns]. */ bintime_addx(&bt, cest.errb_abs * (uint64_t)18446744073ULL); clock_snap->ff_info.error = bt; #endif } /* * Convert a sysclock snapshot into a struct bintime based on the specified * clock source and flags. */ int sysclock_snap2bintime(struct sysclock_snap *cs, struct bintime *bt, int whichclock, uint32_t flags) { #ifdef FFCLOCK struct bintime bt2; uint64_t period; #endif switch (whichclock) { case SYSCLOCK_FBCK: *bt = cs->fb_info.tick_time; /* If snapshot was created with !fast, delta will be >0. */ if (cs->delta > 0) bintime_addx(bt, cs->fb_info.th_scale * cs->delta); if ((flags & FBCLOCK_UPTIME) == 0) bintime_add(bt, &boottimebin); break; #ifdef FFCLOCK case SYSCLOCK_FFWD: if (flags & FFCLOCK_LERP) { *bt = cs->ff_info.tick_time_lerp; period = cs->ff_info.period_lerp; } else { *bt = cs->ff_info.tick_time; period = cs->ff_info.period; } /* If snapshot was created with !fast, delta will be >0. */ if (cs->delta > 0) { ffclock_convert_delta(cs->delta, period, &bt2); bintime_add(bt, &bt2); } /* Leap second adjustment. */ if (flags & FFCLOCK_LEAPSEC) bt->sec -= cs->ff_info.leapsec_adjustment; /* Boot time adjustment, for uptime/monotonic clocks. */ if (flags & FFCLOCK_UPTIME) bintime_sub(bt, &ffclock_boottime); break; #endif default: return (EINVAL); break; } return (0); } /* * Initialize a new timecounter and possibly use it. */ void tc_init(struct timecounter *tc) { u_int u; struct sysctl_oid *tc_root; u = tc->tc_frequency / tc->tc_counter_mask; /* XXX: We need some margin here, 10% is a guess */ u *= 11; u /= 10; if (u > hz && tc->tc_quality >= 0) { tc->tc_quality = -2000; if (bootverbose) { printf("Timecounter \"%s\" frequency %ju Hz", tc->tc_name, (uintmax_t)tc->tc_frequency); printf(" -- Insufficient hz, needs at least %u\n", u); } } else if (tc->tc_quality >= 0 || bootverbose) { printf("Timecounter \"%s\" frequency %ju Hz quality %d\n", tc->tc_name, (uintmax_t)tc->tc_frequency, tc->tc_quality); } tc->tc_next = timecounters; timecounters = tc; /* * Set up sysctl tree for this counter. */ tc_root = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_kern_timecounter_tc), OID_AUTO, tc->tc_name, CTLFLAG_RW, 0, "timecounter description"); SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO, "mask", CTLFLAG_RD, &(tc->tc_counter_mask), 0, "mask for implemented bits"); SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO, "counter", CTLTYPE_UINT | CTLFLAG_RD, tc, sizeof(*tc), sysctl_kern_timecounter_get, "IU", "current timecounter value"); SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO, "frequency", CTLTYPE_U64 | CTLFLAG_RD, tc, sizeof(*tc), sysctl_kern_timecounter_freq, "QU", "timecounter frequency"); SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO, "quality", CTLFLAG_RD, &(tc->tc_quality), 0, "goodness of time counter"); /* * Never automatically use a timecounter with negative quality. * Even though we run on the dummy counter, switching here may be * worse since this timecounter may not be monotonous. */ if (tc->tc_quality < 0) return; if (tc->tc_quality < timecounter->tc_quality) return; if (tc->tc_quality == timecounter->tc_quality && tc->tc_frequency < timecounter->tc_frequency) return; (void)tc->tc_get_timecount(tc); (void)tc->tc_get_timecount(tc); timecounter = tc; } /* Report the frequency of the current timecounter. */ uint64_t tc_getfrequency(void) { return (timehands->th_counter->tc_frequency); } /* * Step our concept of UTC. This is done by modifying our estimate of * when we booted. * XXX: not locked. */ void tc_setclock(struct timespec *ts) { struct timespec tbef, taft; struct bintime bt, bt2; cpu_tick_calibrate(1); nanotime(&tbef); timespec2bintime(ts, &bt); binuptime(&bt2); bintime_sub(&bt, &bt2); bintime_add(&bt2, &boottimebin); boottimebin = bt; bintime2timeval(&bt, &boottime); /* XXX fiddle all the little crinkly bits around the fiords... */ tc_windup(); nanotime(&taft); if (timestepwarnings) { log(LOG_INFO, "Time stepped from %jd.%09ld to %jd.%09ld (%jd.%09ld)\n", (intmax_t)tbef.tv_sec, tbef.tv_nsec, (intmax_t)taft.tv_sec, taft.tv_nsec, (intmax_t)ts->tv_sec, ts->tv_nsec); } cpu_tick_calibrate(1); } /* * Initialize the next struct timehands in the ring and make * it the active timehands. Along the way we might switch to a different * timecounter and/or do seconds processing in NTP. Slightly magic. */ static void tc_windup(void) { struct bintime bt; struct timehands *th, *tho; uint64_t scale; u_int delta, ncount, ogen; int i; time_t t; /* * Make the next timehands a copy of the current one, but do not * overwrite the generation or next pointer. While we update * the contents, the generation must be zero. */ tho = timehands; th = tho->th_next; ogen = th->th_generation; th->th_generation = 0; bcopy(tho, th, offsetof(struct timehands, th_generation)); /* * Capture a timecounter delta on the current timecounter and if * changing timecounters, a counter value from the new timecounter. * Update the offset fields accordingly. */ delta = tc_delta(th); if (th->th_counter != timecounter) ncount = timecounter->tc_get_timecount(timecounter); else ncount = 0; #ifdef FFCLOCK ffclock_windup(delta); #endif th->th_offset_count += delta; th->th_offset_count &= th->th_counter->tc_counter_mask; while (delta > th->th_counter->tc_frequency) { /* Eat complete unadjusted seconds. */ delta -= th->th_counter->tc_frequency; th->th_offset.sec++; } if ((delta > th->th_counter->tc_frequency / 2) && (th->th_scale * delta < ((uint64_t)1 << 63))) { /* The product th_scale * delta just barely overflows. */ th->th_offset.sec++; } bintime_addx(&th->th_offset, th->th_scale * delta); /* * Hardware latching timecounters may not generate interrupts on * PPS events, so instead we poll them. There is a finite risk that * the hardware might capture a count which is later than the one we * got above, and therefore possibly in the next NTP second which might * have a different rate than the current NTP second. It doesn't * matter in practice. */ if (tho->th_counter->tc_poll_pps) tho->th_counter->tc_poll_pps(tho->th_counter); /* * Deal with NTP second processing. The for loop normally * iterates at most once, but in extreme situations it might * keep NTP sane if timeouts are not run for several seconds. * At boot, the time step can be large when the TOD hardware * has been read, so on really large steps, we call * ntp_update_second only twice. We need to call it twice in * case we missed a leap second. */ bt = th->th_offset; bintime_add(&bt, &boottimebin); i = bt.sec - tho->th_microtime.tv_sec; if (i > LARGE_STEP) i = 2; for (; i > 0; i--) { t = bt.sec; ntp_update_second(&th->th_adjustment, &bt.sec); if (bt.sec != t) boottimebin.sec += bt.sec - t; } /* Update the UTC timestamps used by the get*() functions. */ /* XXX shouldn't do this here. Should force non-`get' versions. */ bintime2timeval(&bt, &th->th_microtime); bintime2timespec(&bt, &th->th_nanotime); /* Now is a good time to change timecounters. */ if (th->th_counter != timecounter) { #ifndef __arm__ if ((timecounter->tc_flags & TC_FLAGS_C3STOP) != 0) cpu_disable_deep_sleep++; if ((th->th_counter->tc_flags & TC_FLAGS_C3STOP) != 0) cpu_disable_deep_sleep--; #endif th->th_counter = timecounter; th->th_offset_count = ncount; tc_min_ticktock_freq = max(1, timecounter->tc_frequency / (((uint64_t)timecounter->tc_counter_mask + 1) / 3)); #ifdef FFCLOCK ffclock_change_tc(th); #endif } /*- * Recalculate the scaling factor. We want the number of 1/2^64 * fractions of a second per period of the hardware counter, taking * into account the th_adjustment factor which the NTP PLL/adjtime(2) * processing provides us with. * * The th_adjustment is nanoseconds per second with 32 bit binary * fraction and we want 64 bit binary fraction of second: * * x = a * 2^32 / 10^9 = a * 4.294967296 * * The range of th_adjustment is +/- 5000PPM so inside a 64bit int * we can only multiply by about 850 without overflowing, that * leaves no suitably precise fractions for multiply before divide. * * Divide before multiply with a fraction of 2199/512 results in a * systematic undercompensation of 10PPM of th_adjustment. On a * 5000PPM adjustment this is a 0.05PPM error. This is acceptable. * * We happily sacrifice the lowest of the 64 bits of our result * to the goddess of code clarity. * */ scale = (uint64_t)1 << 63; scale += (th->th_adjustment / 1024) * 2199; scale /= th->th_counter->tc_frequency; th->th_scale = scale * 2; /* * Now that the struct timehands is again consistent, set the new * generation number, making sure to not make it zero. */ if (++ogen == 0) ogen = 1; th->th_generation = ogen; /* Go live with the new struct timehands. */ #ifdef FFCLOCK switch (sysclock_active) { case SYSCLOCK_FBCK: #endif time_second = th->th_microtime.tv_sec; time_uptime = th->th_offset.sec; #ifdef FFCLOCK break; case SYSCLOCK_FFWD: time_second = fftimehands->tick_time_lerp.sec; time_uptime = fftimehands->tick_time_lerp.sec - ffclock_boottime.sec; break; } #endif timehands = th; + taskqueue_enqueue_fast(taskqueue_fast, &tc_windup_push_vdso_task); } /* Report or change the active timecounter hardware. */ static int sysctl_kern_timecounter_hardware(SYSCTL_HANDLER_ARGS) { char newname[32]; struct timecounter *newtc, *tc; int error; tc = timecounter; strlcpy(newname, tc->tc_name, sizeof(newname)); error = sysctl_handle_string(oidp, &newname[0], sizeof(newname), req); if (error != 0 || req->newptr == NULL || strcmp(newname, tc->tc_name) == 0) return (error); for (newtc = timecounters; newtc != NULL; newtc = newtc->tc_next) { if (strcmp(newname, newtc->tc_name) != 0) continue; /* Warm up new timecounter. */ (void)newtc->tc_get_timecount(newtc); (void)newtc->tc_get_timecount(newtc); timecounter = newtc; + EVENTHANDLER_INVOKE(tc_windup); return (0); } return (EINVAL); } SYSCTL_PROC(_kern_timecounter, OID_AUTO, hardware, CTLTYPE_STRING | CTLFLAG_RW, 0, 0, sysctl_kern_timecounter_hardware, "A", "Timecounter hardware selected"); /* Report or change the active timecounter hardware. */ static int sysctl_kern_timecounter_choice(SYSCTL_HANDLER_ARGS) { char buf[32], *spc; struct timecounter *tc; int error; spc = ""; error = 0; for (tc = timecounters; error == 0 && tc != NULL; tc = tc->tc_next) { sprintf(buf, "%s%s(%d)", spc, tc->tc_name, tc->tc_quality); error = SYSCTL_OUT(req, buf, strlen(buf)); spc = " "; } return (error); } SYSCTL_PROC(_kern_timecounter, OID_AUTO, choice, CTLTYPE_STRING | CTLFLAG_RD, 0, 0, sysctl_kern_timecounter_choice, "A", "Timecounter hardware detected"); /* * RFC 2783 PPS-API implementation. */ int pps_ioctl(u_long cmd, caddr_t data, struct pps_state *pps) { pps_params_t *app; struct pps_fetch_args *fapi; #ifdef FFCLOCK struct pps_fetch_ffc_args *fapi_ffc; #endif #ifdef PPS_SYNC struct pps_kcbind_args *kapi; #endif KASSERT(pps != NULL, ("NULL pps pointer in pps_ioctl")); switch (cmd) { case PPS_IOC_CREATE: return (0); case PPS_IOC_DESTROY: return (0); case PPS_IOC_SETPARAMS: app = (pps_params_t *)data; if (app->mode & ~pps->ppscap) return (EINVAL); #ifdef FFCLOCK /* Ensure only a single clock is selected for ffc timestamp. */ if ((app->mode & PPS_TSCLK_MASK) == PPS_TSCLK_MASK) return (EINVAL); #endif pps->ppsparam = *app; return (0); case PPS_IOC_GETPARAMS: app = (pps_params_t *)data; *app = pps->ppsparam; app->api_version = PPS_API_VERS_1; return (0); case PPS_IOC_GETCAP: *(int*)data = pps->ppscap; return (0); case PPS_IOC_FETCH: fapi = (struct pps_fetch_args *)data; if (fapi->tsformat && fapi->tsformat != PPS_TSFMT_TSPEC) return (EINVAL); if (fapi->timeout.tv_sec || fapi->timeout.tv_nsec) return (EOPNOTSUPP); pps->ppsinfo.current_mode = pps->ppsparam.mode; fapi->pps_info_buf = pps->ppsinfo; return (0); #ifdef FFCLOCK case PPS_IOC_FETCH_FFCOUNTER: fapi_ffc = (struct pps_fetch_ffc_args *)data; if (fapi_ffc->tsformat && fapi_ffc->tsformat != PPS_TSFMT_TSPEC) return (EINVAL); if (fapi_ffc->timeout.tv_sec || fapi_ffc->timeout.tv_nsec) return (EOPNOTSUPP); pps->ppsinfo_ffc.current_mode = pps->ppsparam.mode; fapi_ffc->pps_info_buf_ffc = pps->ppsinfo_ffc; /* Overwrite timestamps if feedback clock selected. */ switch (pps->ppsparam.mode & PPS_TSCLK_MASK) { case PPS_TSCLK_FBCK: fapi_ffc->pps_info_buf_ffc.assert_timestamp = pps->ppsinfo.assert_timestamp; fapi_ffc->pps_info_buf_ffc.clear_timestamp = pps->ppsinfo.clear_timestamp; break; case PPS_TSCLK_FFWD: break; default: break; } return (0); #endif /* FFCLOCK */ case PPS_IOC_KCBIND: #ifdef PPS_SYNC kapi = (struct pps_kcbind_args *)data; /* XXX Only root should be able to do this */ if (kapi->tsformat && kapi->tsformat != PPS_TSFMT_TSPEC) return (EINVAL); if (kapi->kernel_consumer != PPS_KC_HARDPPS) return (EINVAL); if (kapi->edge & ~pps->ppscap) return (EINVAL); pps->kcmode = kapi->edge; return (0); #else return (EOPNOTSUPP); #endif default: return (ENOIOCTL); } } void pps_init(struct pps_state *pps) { pps->ppscap |= PPS_TSFMT_TSPEC; if (pps->ppscap & PPS_CAPTUREASSERT) pps->ppscap |= PPS_OFFSETASSERT; if (pps->ppscap & PPS_CAPTURECLEAR) pps->ppscap |= PPS_OFFSETCLEAR; #ifdef FFCLOCK pps->ppscap |= PPS_TSCLK_MASK; #endif } void pps_capture(struct pps_state *pps) { struct timehands *th; KASSERT(pps != NULL, ("NULL pps pointer in pps_capture")); th = timehands; pps->capgen = th->th_generation; pps->capth = th; #ifdef FFCLOCK pps->capffth = fftimehands; #endif pps->capcount = th->th_counter->tc_get_timecount(th->th_counter); if (pps->capgen != th->th_generation) pps->capgen = 0; } void pps_event(struct pps_state *pps, int event) { struct bintime bt; struct timespec ts, *tsp, *osp; u_int tcount, *pcount; int foff, fhard; pps_seq_t *pseq; #ifdef FFCLOCK struct timespec *tsp_ffc; pps_seq_t *pseq_ffc; ffcounter *ffcount; #endif KASSERT(pps != NULL, ("NULL pps pointer in pps_event")); /* If the timecounter was wound up underneath us, bail out. */ if (pps->capgen == 0 || pps->capgen != pps->capth->th_generation) return; /* Things would be easier with arrays. */ if (event == PPS_CAPTUREASSERT) { tsp = &pps->ppsinfo.assert_timestamp; osp = &pps->ppsparam.assert_offset; foff = pps->ppsparam.mode & PPS_OFFSETASSERT; fhard = pps->kcmode & PPS_CAPTUREASSERT; pcount = &pps->ppscount[0]; pseq = &pps->ppsinfo.assert_sequence; #ifdef FFCLOCK ffcount = &pps->ppsinfo_ffc.assert_ffcount; tsp_ffc = &pps->ppsinfo_ffc.assert_timestamp; pseq_ffc = &pps->ppsinfo_ffc.assert_sequence; #endif } else { tsp = &pps->ppsinfo.clear_timestamp; osp = &pps->ppsparam.clear_offset; foff = pps->ppsparam.mode & PPS_OFFSETCLEAR; fhard = pps->kcmode & PPS_CAPTURECLEAR; pcount = &pps->ppscount[1]; pseq = &pps->ppsinfo.clear_sequence; #ifdef FFCLOCK ffcount = &pps->ppsinfo_ffc.clear_ffcount; tsp_ffc = &pps->ppsinfo_ffc.clear_timestamp; pseq_ffc = &pps->ppsinfo_ffc.clear_sequence; #endif } /* * If the timecounter changed, we cannot compare the count values, so * we have to drop the rest of the PPS-stuff until the next event. */ if (pps->ppstc != pps->capth->th_counter) { pps->ppstc = pps->capth->th_counter; *pcount = pps->capcount; pps->ppscount[2] = pps->capcount; return; } /* Convert the count to a timespec. */ tcount = pps->capcount - pps->capth->th_offset_count; tcount &= pps->capth->th_counter->tc_counter_mask; bt = pps->capth->th_offset; bintime_addx(&bt, pps->capth->th_scale * tcount); bintime_add(&bt, &boottimebin); bintime2timespec(&bt, &ts); /* If the timecounter was wound up underneath us, bail out. */ if (pps->capgen != pps->capth->th_generation) return; *pcount = pps->capcount; (*pseq)++; *tsp = ts; if (foff) { timespecadd(tsp, osp); if (tsp->tv_nsec < 0) { tsp->tv_nsec += 1000000000; tsp->tv_sec -= 1; } } #ifdef FFCLOCK *ffcount = pps->capffth->tick_ffcount + tcount; bt = pps->capffth->tick_time; ffclock_convert_delta(tcount, pps->capffth->cest.period, &bt); bintime_add(&bt, &pps->capffth->tick_time); bintime2timespec(&bt, &ts); (*pseq_ffc)++; *tsp_ffc = ts; #endif #ifdef PPS_SYNC if (fhard) { uint64_t scale; /* * Feed the NTP PLL/FLL. * The FLL wants to know how many (hardware) nanoseconds * elapsed since the previous event. */ tcount = pps->capcount - pps->ppscount[2]; pps->ppscount[2] = pps->capcount; tcount &= pps->capth->th_counter->tc_counter_mask; scale = (uint64_t)1 << 63; scale /= pps->capth->th_counter->tc_frequency; scale *= 2; bt.sec = 0; bt.frac = 0; bintime_addx(&bt, scale * tcount); bintime2timespec(&bt, &ts); hardpps(tsp, ts.tv_nsec + 1000000000 * ts.tv_sec); } #endif } /* * Timecounters need to be updated every so often to prevent the hardware * counter from overflowing. Updating also recalculates the cached values * used by the get*() family of functions, so their precision depends on * the update frequency. */ static int tc_tick; SYSCTL_INT(_kern_timecounter, OID_AUTO, tick, CTLFLAG_RD, &tc_tick, 0, "Approximate number of hardclock ticks in a millisecond"); void tc_ticktock(int cnt) { static int count; count += cnt; if (count < tc_tick) return; count = 0; tc_windup(); } static void inittimecounter(void *dummy) { u_int p; /* * Set the initial timeout to * max(1, ). * People should probably not use the sysctl to set the timeout * to smaller than its inital value, since that value is the * smallest reasonable one. If they want better timestamps they * should use the non-"get"* functions. */ if (hz > 1000) tc_tick = (hz + 500) / 1000; else tc_tick = 1; p = (tc_tick * 1000000) / hz; printf("Timecounters tick every %d.%03u msec\n", p / 1000, p % 1000); #ifdef FFCLOCK ffclock_init(); #endif /* warm up new timecounter (again) and get rolling. */ (void)timecounter->tc_get_timecount(timecounter); (void)timecounter->tc_get_timecount(timecounter); tc_windup(); } SYSINIT(timecounter, SI_SUB_CLOCKS, SI_ORDER_SECOND, inittimecounter, NULL); /* Cpu tick handling -------------------------------------------------*/ static int cpu_tick_variable; static uint64_t cpu_tick_frequency; static uint64_t tc_cpu_ticks(void) { static uint64_t base; static unsigned last; unsigned u; struct timecounter *tc; tc = timehands->th_counter; u = tc->tc_get_timecount(tc) & tc->tc_counter_mask; if (u < last) base += (uint64_t)tc->tc_counter_mask + 1; last = u; return (u + base); } void cpu_tick_calibration(void) { static time_t last_calib; if (time_uptime != last_calib && !(time_uptime & 0xf)) { cpu_tick_calibrate(0); last_calib = time_uptime; } } /* * This function gets called every 16 seconds on only one designated * CPU in the system from hardclock() via cpu_tick_calibration()(). * * Whenever the real time clock is stepped we get called with reset=1 * to make sure we handle suspend/resume and similar events correctly. */ static void cpu_tick_calibrate(int reset) { static uint64_t c_last; uint64_t c_this, c_delta; static struct bintime t_last; struct bintime t_this, t_delta; uint32_t divi; if (reset) { /* The clock was stepped, abort & reset */ t_last.sec = 0; return; } /* we don't calibrate fixed rate cputicks */ if (!cpu_tick_variable) return; getbinuptime(&t_this); c_this = cpu_ticks(); if (t_last.sec != 0) { c_delta = c_this - c_last; t_delta = t_this; bintime_sub(&t_delta, &t_last); /* * Headroom: * 2^(64-20) / 16[s] = * 2^(44) / 16[s] = * 17.592.186.044.416 / 16 = * 1.099.511.627.776 [Hz] */ divi = t_delta.sec << 20; divi |= t_delta.frac >> (64 - 20); c_delta <<= 20; c_delta /= divi; if (c_delta > cpu_tick_frequency) { if (0 && bootverbose) printf("cpu_tick increased to %ju Hz\n", c_delta); cpu_tick_frequency = c_delta; } } c_last = c_this; t_last = t_this; } void set_cputicker(cpu_tick_f *func, uint64_t freq, unsigned var) { if (func == NULL) { cpu_ticks = tc_cpu_ticks; } else { cpu_tick_frequency = freq; cpu_tick_variable = var; cpu_ticks = func; } } uint64_t cpu_tickrate(void) { if (cpu_ticks == tc_cpu_ticks) return (tc_getfrequency()); return (cpu_tick_frequency); } /* * We need to be slightly careful converting cputicks to microseconds. * There is plenty of margin in 64 bits of microseconds (half a million * years) and in 64 bits at 4 GHz (146 years), but if we do a multiply * before divide conversion (to retain precision) we find that the * margin shrinks to 1.5 hours (one millionth of 146y). * With a three prong approach we never lose significant bits, no * matter what the cputick rate and length of timeinterval is. */ uint64_t cputick2usec(uint64_t tick) { if (tick > 18446744073709551LL) /* floor(2^64 / 1000) */ return (tick / (cpu_tickrate() / 1000000LL)); else if (tick > 18446744073709LL) /* floor(2^64 / 1000000) */ return ((tick * 1000LL) / (cpu_tickrate() / 1000LL)); else return ((tick * 1000000LL) / cpu_tickrate()); } cpu_tick_f *cpu_ticks = tc_cpu_ticks; + +static int vdso_th_enable = 1; +static int +sysctl_fast_gettime(SYSCTL_HANDLER_ARGS) +{ + int old_vdso_th_enable, error; + + old_vdso_th_enable = vdso_th_enable; + error = sysctl_handle_int(oidp, &old_vdso_th_enable, 0, req); + if (error != 0) + return (error); + vdso_th_enable = old_vdso_th_enable; + EVENTHANDLER_INVOKE(tc_windup); + return (0); +} +SYSCTL_PROC(_kern_timecounter, OID_AUTO, fast_gettime, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, + NULL, 0, sysctl_fast_gettime, "I", "Enable fast time of day"); + +uint32_t +tc_fill_vdso_timehands(struct vdso_timehands *vdso_th) +{ + struct timehands *th; + uint32_t enabled; + int gen; + + do { + th = timehands; + gen = th->th_generation; + vdso_th->th_algo = VDSO_TH_ALGO_1; + vdso_th->th_scale = th->th_scale; + vdso_th->th_offset_count = th->th_offset_count; + vdso_th->th_counter_mask = th->th_counter->tc_counter_mask; + vdso_th->th_offset = th->th_offset; + vdso_th->th_boottime = boottimebin; + enabled = cpu_fill_vdso_timehands(vdso_th); + } while (gen == 0 || timehands->th_generation != gen); + if (!vdso_th_enable) + enabled = 0; + return (enabled); +} + +#ifdef COMPAT_FREEBSD32 +uint32_t +tc_fill_vdso_timehands32(struct vdso_timehands32 *vdso_th32) +{ + struct timehands *th; + uint32_t enabled; + int gen; + + do { + th = timehands; + gen = th->th_generation; + vdso_th32->th_algo = VDSO_TH_ALGO_1; + *(uint64_t *)&vdso_th32->th_scale[0] = th->th_scale; + vdso_th32->th_offset_count = th->th_offset_count; + vdso_th32->th_counter_mask = th->th_counter->tc_counter_mask; + vdso_th32->th_offset.sec = th->th_offset.sec; + *(uint64_t *)&vdso_th32->th_offset.frac[0] = th->th_offset.frac; + vdso_th32->th_boottime.sec = boottimebin.sec; + *(uint64_t *)&vdso_th32->th_boottime.frac[0] = boottimebin.frac; + enabled = cpu_fill_vdso_timehands32(vdso_th32); + } while (gen == 0 || timehands->th_generation != gen); + if (!vdso_th_enable) + enabled = 0; + return (enabled); +} +#endif + +static void +tc_windup_push_vdso(void *ctx, int pending) +{ + + EVENTHANDLER_INVOKE(tc_windup); +} Index: head/sys/kern/subr_dummy_vdso_tc.c =================================================================== --- head/sys/kern/subr_dummy_vdso_tc.c (nonexistent) +++ head/sys/kern/subr_dummy_vdso_tc.c (revision 237433) @@ -0,0 +1,49 @@ +/*- + * Copyright 2012 Konstantin Belousov . + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_compat.h" + +#include +#include + +uint32_t +cpu_fill_vdso_timehands(struct vdso_timehands *vdso_th) +{ + + return (0); +} + +#ifdef COMPAT_FREEBSD32 +uint32_t +cpu_fill_vdso_timehands32(struct vdso_timehands32 *vdso_th32) +{ + + return (0); +} +#endif Property changes on: head/sys/kern/subr_dummy_vdso_tc.c ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: head/sys/mips/include/vdso.h =================================================================== --- head/sys/mips/include/vdso.h (nonexistent) +++ head/sys/mips/include/vdso.h (revision 237433) @@ -0,0 +1,41 @@ +/*- + * Copyright 2012 Konstantin Belousov . + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _MIPS_VDSO_H +#define _MIPS_VDSO_H + +#define VDSO_TIMEHANDS_MD \ + uint32_t th_res[8]; + +#ifdef _KERNEL +#ifdef COMPAT_FREEBSD32 + +#define VDSO_TIMEHANDS_MD32 VDSO_TIMEHANDS_MD + +#endif +#endif +#endif Property changes on: head/sys/mips/include/vdso.h ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: head/sys/pc98/include/vdso.h =================================================================== --- head/sys/pc98/include/vdso.h (nonexistent) +++ head/sys/pc98/include/vdso.h (revision 237433) @@ -0,0 +1,6 @@ +/*- + * This file is in the public domain. + */ +/* $FreeBSD$ */ + +#include Property changes on: head/sys/pc98/include/vdso.h ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: head/sys/powerpc/include/vdso.h =================================================================== --- head/sys/powerpc/include/vdso.h (nonexistent) +++ head/sys/powerpc/include/vdso.h (revision 237433) @@ -0,0 +1,41 @@ +/*- + * Copyright 2012 Konstantin Belousov . + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _POWERPC_VDSO_H +#define _POWERPC_VDSO_H + +#define VDSO_TIMEHANDS_MD \ + uint32_t th_res[8]; + +#ifdef _KERNEL +#ifdef COMPAT_FREEBSD32 + +#define VDSO_TIMEHANDS_MD32 VDSO_TIMEHANDS_MD + +#endif +#endif +#endif Property changes on: head/sys/powerpc/include/vdso.h ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: head/sys/sparc64/include/vdso.h =================================================================== --- head/sys/sparc64/include/vdso.h (nonexistent) +++ head/sys/sparc64/include/vdso.h (revision 237433) @@ -0,0 +1,34 @@ +/*- + * Copyright 2012 Konstantin Belousov . + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SPARC64_VDSO_H +#define _SPARC64_VDSO_H + +#define VDSO_TIMEHANDS_MD \ + uint32_t th_res[8]; + +#endif Property changes on: head/sys/sparc64/include/vdso.h ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: head/sys/sys/sysent.h =================================================================== --- head/sys/sys/sysent.h (revision 237432) +++ head/sys/sys/sysent.h (revision 237433) @@ -1,273 +1,277 @@ /*- * Copyright (c) 1982, 1988, 1991 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_SYSENT_H_ #define _SYS_SYSENT_H_ #include struct rlimit; struct sysent; struct thread; struct ksiginfo; typedef int sy_call_t(struct thread *, void *); /* Used by the machine dependent syscall() code. */ typedef void (*systrace_probe_func_t)(u_int32_t, int, struct sysent *, void *, int); /* * Used by loaded syscalls to convert arguments to a DTrace array * of 64-bit arguments. */ typedef void (*systrace_args_func_t)(int, void *, u_int64_t *, int *); extern systrace_probe_func_t systrace_probe_func; struct sysent { /* system call table */ int sy_narg; /* number of arguments */ sy_call_t *sy_call; /* implementing function */ au_event_t sy_auevent; /* audit event associated with syscall */ systrace_args_func_t sy_systrace_args_func; /* optional argument conversion function. */ u_int32_t sy_entry; /* DTrace entry ID for systrace. */ u_int32_t sy_return; /* DTrace return ID for systrace. */ u_int32_t sy_flags; /* General flags for system calls. */ u_int32_t sy_thrcnt; }; /* * A system call is permitted in capability mode. */ #define SYF_CAPENABLED 0x00000001 #define SY_THR_FLAGMASK 0x7 #define SY_THR_STATIC 0x1 #define SY_THR_DRAINING 0x2 #define SY_THR_ABSENT 0x4 #define SY_THR_INCR 0x8 struct image_params; struct __sigset; struct syscall_args; struct trapframe; struct vnode; struct sysentvec { int sv_size; /* number of entries */ struct sysent *sv_table; /* pointer to sysent */ u_int sv_mask; /* optional mask to index */ int sv_sigsize; /* size of signal translation table */ int *sv_sigtbl; /* signal translation table */ int sv_errsize; /* size of errno translation table */ int *sv_errtbl; /* errno translation table */ int (*sv_transtrap)(int, int); /* translate trap-to-signal mapping */ int (*sv_fixup)(register_t **, struct image_params *); /* stack fixup function */ void (*sv_sendsig)(void (*)(int), struct ksiginfo *, struct __sigset *); /* send signal */ char *sv_sigcode; /* start of sigtramp code */ int *sv_szsigcode; /* size of sigtramp code */ void (*sv_prepsyscall)(struct trapframe *, int *, u_int *, caddr_t *); char *sv_name; /* name of binary type */ int (*sv_coredump)(struct thread *, struct vnode *, off_t, int); /* function to dump core, or NULL */ int (*sv_imgact_try)(struct image_params *); int sv_minsigstksz; /* minimum signal stack size */ int sv_pagesize; /* pagesize */ vm_offset_t sv_minuser; /* VM_MIN_ADDRESS */ vm_offset_t sv_maxuser; /* VM_MAXUSER_ADDRESS */ vm_offset_t sv_usrstack; /* USRSTACK */ vm_offset_t sv_psstrings; /* PS_STRINGS */ int sv_stackprot; /* vm protection for stack */ register_t *(*sv_copyout_strings)(struct image_params *); void (*sv_setregs)(struct thread *, struct image_params *, u_long); void (*sv_fixlimit)(struct rlimit *, int); u_long *sv_maxssiz; u_int sv_flags; void (*sv_set_syscall_retval)(struct thread *, int); int (*sv_fetch_syscall_args)(struct thread *, struct syscall_args *); const char **sv_syscallnames; vm_offset_t sv_shared_page_base; vm_offset_t sv_shared_page_len; vm_offset_t sv_sigcode_base; + vm_offset_t sv_timekeep_base; + int sv_timekeep_off; + int sv_timekeep_curr; + uint32_t sv_timekeep_gen; void *sv_shared_page_obj; void (*sv_schedtail)(struct thread *); }; #define SV_ILP32 0x000100 #define SV_LP64 0x000200 #define SV_IA32 0x004000 #define SV_AOUT 0x008000 #define SV_SHP 0x010000 #define SV_ABI_MASK 0xff #define SV_PROC_FLAG(p, x) ((p)->p_sysent->sv_flags & (x)) #define SV_PROC_ABI(p) ((p)->p_sysent->sv_flags & SV_ABI_MASK) #define SV_CURPROC_FLAG(x) SV_PROC_FLAG(curproc, x) #define SV_CURPROC_ABI() SV_PROC_ABI(curproc) /* same as ELFOSABI_XXX, to prevent header pollution */ #define SV_ABI_LINUX 3 #define SV_ABI_FREEBSD 9 #define SV_ABI_UNDEF 255 #ifdef _KERNEL extern struct sysentvec aout_sysvec; extern struct sysentvec elf_freebsd_sysvec; extern struct sysentvec null_sysvec; extern struct sysent sysent[]; extern const char *syscallnames[]; #if defined(__amd64__) || defined(__ia64__) extern int i386_read_exec; #endif #define NO_SYSCALL (-1) struct module; struct syscall_module_data { int (*chainevh)(struct module *, int, void *); /* next handler */ void *chainarg; /* arg for next event handler */ int *offset; /* offset into sysent */ struct sysent *new_sysent; /* new sysent */ struct sysent old_sysent; /* old sysent */ }; #define MAKE_SYSENT(syscallname) \ static struct sysent syscallname##_sysent = { \ (sizeof(struct syscallname ## _args ) \ / sizeof(register_t)), \ (sy_call_t *)& sys_##syscallname, \ SYS_AUE_##syscallname \ } #define MAKE_SYSENT_COMPAT(syscallname) \ static struct sysent syscallname##_sysent = { \ (sizeof(struct syscallname ## _args ) \ / sizeof(register_t)), \ (sy_call_t *)& syscallname, \ SYS_AUE_##syscallname \ } #define SYSCALL_MODULE(name, offset, new_sysent, evh, arg) \ static struct syscall_module_data name##_syscall_mod = { \ evh, arg, offset, new_sysent, { 0, NULL, AUE_NULL } \ }; \ \ static moduledata_t name##_mod = { \ "sys/" #name, \ syscall_module_handler, \ &name##_syscall_mod \ }; \ DECLARE_MODULE(name, name##_mod, SI_SUB_SYSCALLS, SI_ORDER_MIDDLE) #define SYSCALL_MODULE_HELPER(syscallname) \ static int syscallname##_syscall = SYS_##syscallname; \ MAKE_SYSENT(syscallname); \ SYSCALL_MODULE(syscallname, \ & syscallname##_syscall, & syscallname##_sysent, \ NULL, NULL) #define SYSCALL_MODULE_PRESENT(syscallname) \ (sysent[SYS_##syscallname].sy_call != (sy_call_t *)lkmnosys && \ sysent[SYS_##syscallname].sy_call != (sy_call_t *)lkmressys) /* * Syscall registration helpers with resource allocation handling. */ struct syscall_helper_data { struct sysent new_sysent; struct sysent old_sysent; int syscall_no; int registered; }; #define SYSCALL_INIT_HELPER(syscallname) { \ .new_sysent = { \ .sy_narg = (sizeof(struct syscallname ## _args ) \ / sizeof(register_t)), \ .sy_call = (sy_call_t *)& sys_ ## syscallname, \ .sy_auevent = SYS_AUE_##syscallname \ }, \ .syscall_no = SYS_##syscallname \ } #define SYSCALL_INIT_HELPER_COMPAT(syscallname) { \ .new_sysent = { \ .sy_narg = (sizeof(struct syscallname ## _args ) \ / sizeof(register_t)), \ .sy_call = (sy_call_t *)& syscallname, \ .sy_auevent = SYS_AUE_##syscallname \ }, \ .syscall_no = SYS_##syscallname \ } #define SYSCALL_INIT_LAST { \ .syscall_no = NO_SYSCALL \ } int syscall_register(int *offset, struct sysent *new_sysent, struct sysent *old_sysent); int syscall_deregister(int *offset, struct sysent *old_sysent); int syscall_module_handler(struct module *mod, int what, void *arg); int syscall_helper_register(struct syscall_helper_data *sd); int syscall_helper_unregister(struct syscall_helper_data *sd); struct proc; const char *syscallname(struct proc *p, u_int code); /* Special purpose system call functions. */ struct nosys_args; int lkmnosys(struct thread *, struct nosys_args *); int lkmressys(struct thread *, struct nosys_args *); int syscall_thread_enter(struct thread *td, struct sysent *se); void syscall_thread_exit(struct thread *td, struct sysent *se); struct sf_buf; int shared_page_alloc(int size, int align); int shared_page_fill(int size, int align, const void *data); void shared_page_write(int base, int size, const void *data); void exec_sysvec_init(void *param); struct sf_buf *shared_page_write_start(int base); void shared_page_write_end(struct sf_buf *sf); #define INIT_SYSENTVEC(name, sv) \ SYSINIT(name, SI_SUB_EXEC, SI_ORDER_ANY, \ (sysinit_cfunc_t)exec_sysvec_init, sv); #endif /* _KERNEL */ #endif /* !_SYS_SYSENT_H_ */ Index: head/sys/sys/vdso.h =================================================================== --- head/sys/sys/vdso.h (nonexistent) +++ head/sys/sys/vdso.h (revision 237433) @@ -0,0 +1,124 @@ +/*- + * Copyright 2012 Konstantin Belousov . + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_VDSO_H +#define _SYS_VDSO_H + +#include +#include +#include + +struct vdso_timehands { + uint32_t th_algo; + uint32_t th_gen; + uint64_t th_scale; + uint32_t th_offset_count; + uint32_t th_counter_mask; + struct bintime th_offset; + struct bintime th_boottime; + VDSO_TIMEHANDS_MD +}; + +struct vdso_timekeep { + uint32_t tk_ver; + uint32_t tk_enabled; + uint32_t tk_current; + struct vdso_timehands tk_th[]; +}; + +#define VDSO_TK_CURRENT_BUSY 0xffffffff +#define VDSO_TK_VER_1 0x1 +#define VDSO_TK_VER_CURR VDSO_TK_VER_1 +#define VDSO_TH_ALGO_1 0x1 + +#ifndef _KERNEL + +struct timespec; +struct timeval; +struct timezone; + +int __vdso_clock_gettime(clockid_t clock_id, struct timespec *ts); +#pragma weak __vdso_clock_gettime + +int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz); +#pragma weak __vdso_gettimeofday + +u_int __vdso_gettc(const struct vdso_timehands *vdso_th); +#pragma weak __vdso_gettc + +#endif + +#ifdef _KERNEL + +uint32_t tc_fill_vdso_timehands(struct vdso_timehands *vdso_th); + +/* + * The cpu_fill_vdso_timehands() function should fill MD-part of the + * struct vdso_timehands, which is both machine- and + * timecounter-depended. The return value should be 1 if fast + * userspace timecounter is enabled by hardware, and 0 otherwise. The + * global sysctl enable override is handled by machine-independed code + * after cpu_fill_vdso_timehands() call is made. + */ +uint32_t cpu_fill_vdso_timehands(struct vdso_timehands *vdso_th); + +typedef void (*tc_windup_fn)(void *); +EVENTHANDLER_DECLARE(tc_windup, tc_windup_fn); + +#define VDSO_TH_NUM 4 + +#ifdef COMPAT_FREEBSD32 +struct bintime32 { + uint32_t sec; + uint32_t frac[2]; +}; + +struct vdso_timehands32 { + uint32_t th_algo; + uint32_t th_gen; + uint32_t th_scale[2]; + uint32_t th_offset_count; + uint32_t th_counter_mask; + struct bintime32 th_offset; + struct bintime32 th_boottime; + VDSO_TIMEHANDS_MD32 +}; + +struct vdso_timekeep32 { + uint32_t tk_ver; + uint32_t tk_enabled; + uint32_t tk_current; + struct vdso_timehands32 tk_th[]; +}; + +uint32_t tc_fill_vdso_timehands32(struct vdso_timehands32 *vdso_th32); +uint32_t cpu_fill_vdso_timehands32(struct vdso_timehands32 *vdso_th32); + +#endif +#endif + +#endif Property changes on: head/sys/sys/vdso.h ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: head/sys/x86/include/vdso.h =================================================================== --- head/sys/x86/include/vdso.h (nonexistent) +++ head/sys/x86/include/vdso.h (revision 237433) @@ -0,0 +1,42 @@ +/*- + * Copyright 2012 Konstantin Belousov . + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _X86_VDSO_H +#define _X86_VDSO_H + +#define VDSO_TIMEHANDS_MD \ + uint32_t th_x86_shift; \ + uint32_t th_res[7]; + +#ifdef _KERNEL +#ifdef COMPAT_FREEBSD32 + +#define VDSO_TIMEHANDS_MD32 VDSO_TIMEHANDS_MD + +#endif +#endif +#endif Property changes on: head/sys/x86/include/vdso.h ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: head/sys/x86/x86/tsc.c =================================================================== --- head/sys/x86/x86/tsc.c (revision 237432) +++ head/sys/x86/x86/tsc.c (revision 237433) @@ -1,606 +1,628 @@ /*- * Copyright (c) 1998-2003 Poul-Henning Kamp * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); +#include "opt_compat.h" #include "opt_clock.h" #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include "cpufreq_if.h" uint64_t tsc_freq; int tsc_is_invariant; int tsc_perf_stat; static eventhandler_tag tsc_levels_tag, tsc_pre_tag, tsc_post_tag; SYSCTL_INT(_kern_timecounter, OID_AUTO, invariant_tsc, CTLFLAG_RDTUN, &tsc_is_invariant, 0, "Indicates whether the TSC is P-state invariant"); TUNABLE_INT("kern.timecounter.invariant_tsc", &tsc_is_invariant); #ifdef SMP static int smp_tsc; SYSCTL_INT(_kern_timecounter, OID_AUTO, smp_tsc, CTLFLAG_RDTUN, &smp_tsc, 0, "Indicates whether the TSC is safe to use in SMP mode"); TUNABLE_INT("kern.timecounter.smp_tsc", &smp_tsc); #endif static int tsc_disabled; SYSCTL_INT(_machdep, OID_AUTO, disable_tsc, CTLFLAG_RDTUN, &tsc_disabled, 0, "Disable x86 Time Stamp Counter"); TUNABLE_INT("machdep.disable_tsc", &tsc_disabled); static int tsc_skip_calibration; SYSCTL_INT(_machdep, OID_AUTO, disable_tsc_calibration, CTLFLAG_RDTUN, &tsc_skip_calibration, 0, "Disable TSC frequency calibration"); TUNABLE_INT("machdep.disable_tsc_calibration", &tsc_skip_calibration); static void tsc_freq_changed(void *arg, const struct cf_level *level, int status); static void tsc_freq_changing(void *arg, const struct cf_level *level, int *status); static unsigned tsc_get_timecount(struct timecounter *tc); static unsigned tsc_get_timecount_low(struct timecounter *tc); static void tsc_levels_changed(void *arg, int unit); static struct timecounter tsc_timecounter = { tsc_get_timecount, /* get_timecount */ 0, /* no poll_pps */ ~0u, /* counter_mask */ 0, /* frequency */ "TSC", /* name */ 800, /* quality (adjusted in code) */ }; #define VMW_HVMAGIC 0x564d5868 #define VMW_HVPORT 0x5658 #define VMW_HVCMD_GETVERSION 10 #define VMW_HVCMD_GETHZ 45 static __inline void vmware_hvcall(u_int cmd, u_int *p) { __asm __volatile("inl %w3, %0" : "=a" (p[0]), "=b" (p[1]), "=c" (p[2]), "=d" (p[3]) : "0" (VMW_HVMAGIC), "1" (UINT_MAX), "2" (cmd), "3" (VMW_HVPORT) : "memory"); } static int tsc_freq_vmware(void) { char hv_sig[13]; u_int regs[4]; char *p; u_int hv_high; int i; /* * [RFC] CPUID usage for interaction between Hypervisors and Linux. * http://lkml.org/lkml/2008/10/1/246 * * KB1009458: Mechanisms to determine if software is running in * a VMware virtual machine * http://kb.vmware.com/kb/1009458 */ hv_high = 0; if ((cpu_feature2 & CPUID2_HV) != 0) { do_cpuid(0x40000000, regs); hv_high = regs[0]; for (i = 1, p = hv_sig; i < 4; i++, p += sizeof(regs) / 4) memcpy(p, ®s[i], sizeof(regs[i])); *p = '\0'; if (bootverbose) { /* * HV vendor ID string * ------------+-------------- * KVM "KVMKVMKVM" * Microsoft "Microsoft Hv" * VMware "VMwareVMware" * Xen "XenVMMXenVMM" */ printf("Hypervisor: Origin = \"%s\"\n", hv_sig); } if (strncmp(hv_sig, "VMwareVMware", 12) != 0) return (0); } else { p = getenv("smbios.system.serial"); if (p == NULL) return (0); if (strncmp(p, "VMware-", 7) != 0 && strncmp(p, "VMW", 3) != 0) { freeenv(p); return (0); } freeenv(p); vmware_hvcall(VMW_HVCMD_GETVERSION, regs); if (regs[1] != VMW_HVMAGIC) return (0); } if (hv_high >= 0x40000010) { do_cpuid(0x40000010, regs); tsc_freq = regs[0] * 1000; } else { vmware_hvcall(VMW_HVCMD_GETHZ, regs); if (regs[1] != UINT_MAX) tsc_freq = regs[0] | ((uint64_t)regs[1] << 32); } tsc_is_invariant = 1; return (1); } static void tsc_freq_intel(void) { char brand[48]; u_int regs[4]; uint64_t freq; char *p; u_int i; /* * Intel Processor Identification and the CPUID Instruction * Application Note 485. * http://www.intel.com/assets/pdf/appnote/241618.pdf */ if (cpu_exthigh >= 0x80000004) { p = brand; for (i = 0x80000002; i < 0x80000005; i++) { do_cpuid(i, regs); memcpy(p, regs, sizeof(regs)); p += sizeof(regs); } p = NULL; for (i = 0; i < sizeof(brand) - 1; i++) if (brand[i] == 'H' && brand[i + 1] == 'z') p = brand + i; if (p != NULL) { p -= 5; switch (p[4]) { case 'M': i = 1; break; case 'G': i = 1000; break; case 'T': i = 1000000; break; default: return; } #define C2D(c) ((c) - '0') if (p[1] == '.') { freq = C2D(p[0]) * 1000; freq += C2D(p[2]) * 100; freq += C2D(p[3]) * 10; freq *= i * 1000; } else { freq = C2D(p[0]) * 1000; freq += C2D(p[1]) * 100; freq += C2D(p[2]) * 10; freq += C2D(p[3]); freq *= i * 1000000; } #undef C2D tsc_freq = freq; } } } static void probe_tsc_freq(void) { u_int regs[4]; uint64_t tsc1, tsc2; if (cpu_high >= 6) { do_cpuid(6, regs); if ((regs[2] & CPUID_PERF_STAT) != 0) { /* * XXX Some emulators expose host CPUID without actual * support for these MSRs. We must test whether they * really work. */ wrmsr(MSR_MPERF, 0); wrmsr(MSR_APERF, 0); DELAY(10); if (rdmsr(MSR_MPERF) > 0 && rdmsr(MSR_APERF) > 0) tsc_perf_stat = 1; } } if (tsc_freq_vmware()) return; switch (cpu_vendor_id) { case CPU_VENDOR_AMD: if ((amd_pminfo & AMDPM_TSC_INVARIANT) != 0 || (vm_guest == VM_GUEST_NO && CPUID_TO_FAMILY(cpu_id) >= 0x10)) tsc_is_invariant = 1; break; case CPU_VENDOR_INTEL: if ((amd_pminfo & AMDPM_TSC_INVARIANT) != 0 || (vm_guest == VM_GUEST_NO && ((CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) >= 0xe) || (CPUID_TO_FAMILY(cpu_id) == 0xf && CPUID_TO_MODEL(cpu_id) >= 0x3)))) tsc_is_invariant = 1; break; case CPU_VENDOR_CENTAUR: if (vm_guest == VM_GUEST_NO && CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) >= 0xf && (rdmsr(0x1203) & 0x100000000ULL) == 0) tsc_is_invariant = 1; break; } if (tsc_skip_calibration) { if (cpu_vendor_id == CPU_VENDOR_INTEL) tsc_freq_intel(); return; } if (bootverbose) printf("Calibrating TSC clock ... "); tsc1 = rdtsc(); DELAY(1000000); tsc2 = rdtsc(); tsc_freq = tsc2 - tsc1; if (bootverbose) printf("TSC clock: %ju Hz\n", (intmax_t)tsc_freq); } void init_TSC(void) { if ((cpu_feature & CPUID_TSC) == 0 || tsc_disabled) return; probe_tsc_freq(); /* * Inform CPU accounting about our boot-time clock rate. This will * be updated if someone loads a cpufreq driver after boot that * discovers a new max frequency. */ if (tsc_freq != 0) set_cputicker(rdtsc, tsc_freq, !tsc_is_invariant); if (tsc_is_invariant) return; /* Register to find out about changes in CPU frequency. */ tsc_pre_tag = EVENTHANDLER_REGISTER(cpufreq_pre_change, tsc_freq_changing, NULL, EVENTHANDLER_PRI_FIRST); tsc_post_tag = EVENTHANDLER_REGISTER(cpufreq_post_change, tsc_freq_changed, NULL, EVENTHANDLER_PRI_FIRST); tsc_levels_tag = EVENTHANDLER_REGISTER(cpufreq_levels_changed, tsc_levels_changed, NULL, EVENTHANDLER_PRI_ANY); } #ifdef SMP #define TSC_READ(x) \ static void \ tsc_read_##x(void *arg) \ { \ uint32_t *tsc = arg; \ u_int cpu = PCPU_GET(cpuid); \ \ tsc[cpu * 3 + x] = rdtsc32(); \ } TSC_READ(0) TSC_READ(1) TSC_READ(2) #undef TSC_READ #define N 1000 static void comp_smp_tsc(void *arg) { uint32_t *tsc; int32_t d1, d2; u_int cpu = PCPU_GET(cpuid); u_int i, j, size; size = (mp_maxid + 1) * 3; for (i = 0, tsc = arg; i < N; i++, tsc += size) CPU_FOREACH(j) { if (j == cpu) continue; d1 = tsc[cpu * 3 + 1] - tsc[j * 3]; d2 = tsc[cpu * 3 + 2] - tsc[j * 3 + 1]; if (d1 <= 0 || d2 <= 0) { smp_tsc = 0; return; } } } static int test_smp_tsc(void) { uint32_t *data, *tsc; u_int i, size; if (!smp_tsc && !tsc_is_invariant) return (-100); size = (mp_maxid + 1) * 3; data = malloc(sizeof(*data) * size * N, M_TEMP, M_WAITOK); for (i = 0, tsc = data; i < N; i++, tsc += size) smp_rendezvous(tsc_read_0, tsc_read_1, tsc_read_2, tsc); smp_tsc = 1; /* XXX */ smp_rendezvous(smp_no_rendevous_barrier, comp_smp_tsc, smp_no_rendevous_barrier, data); free(data, M_TEMP); if (bootverbose) printf("SMP: %sed TSC synchronization test\n", smp_tsc ? "pass" : "fail"); if (smp_tsc && tsc_is_invariant) { switch (cpu_vendor_id) { case CPU_VENDOR_AMD: /* * Starting with Family 15h processors, TSC clock * source is in the north bridge. Check whether * we have a single-socket/multi-core platform. * XXX Need more work for complex cases. */ if (CPUID_TO_FAMILY(cpu_id) < 0x15 || (amd_feature2 & AMDID2_CMP) == 0 || smp_cpus > (cpu_procinfo2 & AMDID_CMP_CORES) + 1) break; return (1000); case CPU_VENDOR_INTEL: /* * XXX Assume Intel platforms have synchronized TSCs. */ return (1000); } return (800); } return (-100); } #undef N #endif /* SMP */ static void init_TSC_tc(void) { uint64_t max_freq; int shift; if ((cpu_feature & CPUID_TSC) == 0 || tsc_disabled) return; /* * Limit timecounter frequency to fit in an int and prevent it from * overflowing too fast. */ max_freq = UINT_MAX; /* * We can not use the TSC if we support APM. Precise timekeeping * on an APM'ed machine is at best a fools pursuit, since * any and all of the time spent in various SMM code can't * be reliably accounted for. Reading the RTC is your only * source of reliable time info. The i8254 loses too, of course, * but we need to have some kind of time... * We don't know at this point whether APM is going to be used * or not, nor when it might be activated. Play it safe. */ if (power_pm_get_type() == POWER_PM_TYPE_APM) { tsc_timecounter.tc_quality = -1000; if (bootverbose) printf("TSC timecounter disabled: APM enabled.\n"); goto init; } /* * We cannot use the TSC if it stops incrementing in deep sleep. * Currently only Intel CPUs are known for this problem unless * the invariant TSC bit is set. */ if (cpu_can_deep_sleep && cpu_vendor_id == CPU_VENDOR_INTEL && (amd_pminfo & AMDPM_TSC_INVARIANT) == 0) { tsc_timecounter.tc_quality = -1000; tsc_timecounter.tc_flags |= TC_FLAGS_C3STOP; if (bootverbose) printf("TSC timecounter disabled: C3 enabled.\n"); goto init; } #ifdef SMP /* * We can not use the TSC in SMP mode unless the TSCs on all CPUs are * synchronized. If the user is sure that the system has synchronized * TSCs, set kern.timecounter.smp_tsc tunable to a non-zero value. * We also limit the frequency even lower to avoid "temporal anomalies" * as much as possible. The TSC seems unreliable in virtualized SMP * environments, so it is set to a negative quality in those cases. */ if (smp_cpus > 1) { if (vm_guest != 0) { tsc_timecounter.tc_quality = -100; } else { tsc_timecounter.tc_quality = test_smp_tsc(); max_freq >>= 8; } } else #endif if (tsc_is_invariant) tsc_timecounter.tc_quality = 1000; init: for (shift = 0; shift < 31 && (tsc_freq >> shift) > max_freq; shift++) ; if (shift > 0) { tsc_timecounter.tc_get_timecount = tsc_get_timecount_low; tsc_timecounter.tc_name = "TSC-low"; if (bootverbose) printf("TSC timecounter discards lower %d bit(s)\n", shift); } if (tsc_freq != 0) { tsc_timecounter.tc_frequency = tsc_freq >> shift; tsc_timecounter.tc_priv = (void *)(intptr_t)shift; tc_init(&tsc_timecounter); } } SYSINIT(tsc_tc, SI_SUB_SMP, SI_ORDER_ANY, init_TSC_tc, NULL); /* * When cpufreq levels change, find out about the (new) max frequency. We * use this to update CPU accounting in case it got a lower estimate at boot. */ static void tsc_levels_changed(void *arg, int unit) { device_t cf_dev; struct cf_level *levels; int count, error; uint64_t max_freq; /* Only use values from the first CPU, assuming all are equal. */ if (unit != 0) return; /* Find the appropriate cpufreq device instance. */ cf_dev = devclass_get_device(devclass_find("cpufreq"), unit); if (cf_dev == NULL) { printf("tsc_levels_changed() called but no cpufreq device?\n"); return; } /* Get settings from the device and find the max frequency. */ count = 64; levels = malloc(count * sizeof(*levels), M_TEMP, M_NOWAIT); if (levels == NULL) return; error = CPUFREQ_LEVELS(cf_dev, levels, &count); if (error == 0 && count != 0) { max_freq = (uint64_t)levels[0].total_set.freq * 1000000; set_cputicker(rdtsc, max_freq, 1); } else printf("tsc_levels_changed: no max freq found\n"); free(levels, M_TEMP); } /* * If the TSC timecounter is in use, veto the pending change. It may be * possible in the future to handle a dynamically-changing timecounter rate. */ static void tsc_freq_changing(void *arg, const struct cf_level *level, int *status) { if (*status != 0 || timecounter != &tsc_timecounter) return; printf("timecounter TSC must not be in use when " "changing frequencies; change denied\n"); *status = EBUSY; } /* Update TSC freq with the value indicated by the caller. */ static void tsc_freq_changed(void *arg, const struct cf_level *level, int status) { uint64_t freq; /* If there was an error during the transition, don't do anything. */ if (tsc_disabled || status != 0) return; /* Total setting for this level gives the new frequency in MHz. */ freq = (uint64_t)level->total_set.freq * 1000000; atomic_store_rel_64(&tsc_freq, freq); tsc_timecounter.tc_frequency = freq >> (int)(intptr_t)tsc_timecounter.tc_priv; } static int sysctl_machdep_tsc_freq(SYSCTL_HANDLER_ARGS) { int error; uint64_t freq; freq = atomic_load_acq_64(&tsc_freq); if (freq == 0) return (EOPNOTSUPP); error = sysctl_handle_64(oidp, &freq, 0, req); if (error == 0 && req->newptr != NULL) { atomic_store_rel_64(&tsc_freq, freq); atomic_store_rel_64(&tsc_timecounter.tc_frequency, freq >> (int)(intptr_t)tsc_timecounter.tc_priv); } return (error); } SYSCTL_PROC(_machdep, OID_AUTO, tsc_freq, CTLTYPE_U64 | CTLFLAG_RW, 0, 0, sysctl_machdep_tsc_freq, "QU", "Time Stamp Counter frequency"); static u_int tsc_get_timecount(struct timecounter *tc __unused) { return (rdtsc32()); } static u_int tsc_get_timecount_low(struct timecounter *tc) { uint32_t rv; __asm __volatile("rdtsc; shrd %%cl, %%edx, %0" : "=a" (rv) : "c" ((int)(intptr_t)tc->tc_priv) : "edx"); return (rv); } + +uint32_t +cpu_fill_vdso_timehands(struct vdso_timehands *vdso_th) +{ + + vdso_th->th_x86_shift = (int)(intptr_t)timecounter->tc_priv; + bzero(vdso_th->th_res, sizeof(vdso_th->th_res)); + return (timecounter == &tsc_timecounter); +} + +#ifdef COMPAT_FREEBSD32 +uint32_t +cpu_fill_vdso_timehands32(struct vdso_timehands32 *vdso_th32) +{ + + vdso_th32->th_x86_shift = (int)(intptr_t)timecounter->tc_priv; + bzero(vdso_th32->th_res, sizeof(vdso_th32->th_res)); + return (timecounter == &tsc_timecounter); +} +#endif