Index: head/lib/libc/amd64/sys/__vdso_gettc.c =================================================================== --- head/lib/libc/amd64/sys/__vdso_gettc.c (revision 286283) +++ head/lib/libc/amd64/sys/__vdso_gettc.c (revision 286284) @@ -1,60 +1,70 @@ /*- * Copyright (c) 2012 Konstantin Belousov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include "libc_private.h" static u_int __vdso_gettc_low(const struct vdso_timehands *th) { - uint32_t rv; + u_int rv; - __asm __volatile("rdtsc; shrd %%cl, %%edx, %0" + __asm __volatile("lfence; rdtsc; shrd %%cl, %%edx, %0" : "=a" (rv) : "c" (th->th_x86_shift) : "edx"); return (rv); } +static u_int +__vdso_rdtsc32(void) +{ + u_int rv; + + __asm __volatile("lfence;rdtsc" : "=a" (rv) : : "edx"); + return (rv); +} + #pragma weak __vdso_gettc u_int __vdso_gettc(const struct vdso_timehands *th) { - return (th->th_x86_shift > 0 ? __vdso_gettc_low(th) : rdtsc32()); + return (th->th_x86_shift > 0 ? __vdso_gettc_low(th) : + __vdso_rdtsc32()); } #pragma weak __vdso_gettimekeep int __vdso_gettimekeep(struct vdso_timekeep **tk) { return (_elf_aux_info(AT_TIMEKEEP, tk, sizeof(*tk))); } Index: head/lib/libc/i386/sys/__vdso_gettc.c =================================================================== --- head/lib/libc/i386/sys/__vdso_gettc.c (revision 286283) +++ head/lib/libc/i386/sys/__vdso_gettc.c (revision 286284) @@ -1,60 +1,114 @@ /*- * Copyright (c) 2012 Konstantin Belousov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include +#include #include "libc_private.h" +static int lfence_works = -1; + +static int +get_lfence_usage(void) +{ + u_int cpuid_supported, p[4]; + + if (lfence_works == -1) { + __asm __volatile( + " pushfl\n" + " popl %%eax\n" + " movl %%eax,%%ecx\n" + " xorl $0x200000,%%eax\n" + " pushl %%eax\n" + " popfl\n" + " pushfl\n" + " popl %%eax\n" + " xorl %%eax,%%ecx\n" + " je 1f\n" + " movl $1,%0\n" + " jmp 2f\n" + "1: movl $0,%0\n" + "2:\n" + : "=r" (cpuid_supported) : : "eax", "ecx"); + if (cpuid_supported) { + __asm __volatile( + " pushl %%ebx\n" + " cpuid\n" + " movl %%ebx,%1\n" + " popl %%ebx\n" + : "=a" (p[0]), "=r" (p[1]), "=c" (p[2]), "=d" (p[3]) + : "0" (0x1)); + lfence_works = (p[3] & CPUID_SSE2) != 0; + } else + lfence_works = 0; + } + return (lfence_works); +} + static u_int __vdso_gettc_low(const struct vdso_timehands *th) { - uint32_t rv; + u_int rv; + if (get_lfence_usage() == 1) + lfence(); __asm __volatile("rdtsc; shrd %%cl, %%edx, %0" : "=a" (rv) : "c" (th->th_x86_shift) : "edx"); return (rv); } +static u_int +__vdso_rdtsc32(void) +{ + u_int rv; + + if (get_lfence_usage() == 1) + lfence(); + rv = rdtsc32(); + return (rv); +} + #pragma weak __vdso_gettc u_int __vdso_gettc(const struct vdso_timehands *th) { - return (th->th_x86_shift > 0 ? __vdso_gettc_low(th) : rdtsc32()); + return (th->th_x86_shift > 0 ? 
__vdso_gettc_low(th) : + __vdso_rdtsc32()); } #pragma weak __vdso_gettimekeep int __vdso_gettimekeep(struct vdso_timekeep **tk) { return (_elf_aux_info(AT_TIMEKEEP, tk, sizeof(*tk))); } Index: head/lib/libc/sys/__vdso_gettimeofday.c =================================================================== --- head/lib/libc/sys/__vdso_gettimeofday.c (revision 286283) +++ head/lib/libc/sys/__vdso_gettimeofday.c (revision 286284) @@ -1,144 +1,147 @@ /*- * Copyright (c) 2012 Konstantin Belousov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include "libc_private.h" static u_int tc_delta(const struct vdso_timehands *th) { return ((__vdso_gettc(th) - th->th_offset_count) & th->th_counter_mask); } +/* + * Calculate the absolute or boot-relative time from the + * machine-specific fast timecounter and the published timehands + * structure read from the shared page. + * + * The lockless reading scheme is similar to the one used to read the + * in-kernel timehands, see sys/kern/kern_tc.c:binuptime(). This code + * is based on the kernel implementation. + */ static int binuptime(struct bintime *bt, struct vdso_timekeep *tk, int abs) { struct vdso_timehands *th; uint32_t curr, gen; do { if (!tk->tk_enabled) return (ENOSYS); - /* - * XXXKIB. The load of tk->tk_current should use - * atomic_load_acq_32 to provide load barrier. But - * since tk points to r/o mapped page, x86 - * implementation of atomic_load_acq faults. - */ - curr = tk->tk_current; - rmb(); + curr = atomic_load_acq_32(&tk->tk_current); th = &tk->tk_th[curr]; if (th->th_algo != VDSO_TH_ALGO_1) return (ENOSYS); - gen = th->th_gen; + gen = atomic_load_acq_32(&th->th_gen); *bt = th->th_offset; bintime_addx(bt, th->th_scale * tc_delta(th)); if (abs) bintime_add(bt, &th->th_boottime); /* - * Barrier for load of both tk->tk_current and th->th_gen. + * Ensure that the loads of the timehands fields above + * complete before tk_current and th_gen are re-read. 
*/ - rmb(); + atomic_thread_fence_acq(); } while (curr != tk->tk_current || gen == 0 || gen != th->th_gen); return (0); } static struct vdso_timekeep *tk; #pragma weak __vdso_gettimeofday int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) { struct bintime bt; int error; if (tz != NULL) return (ENOSYS); if (tk == NULL) { error = __vdso_gettimekeep(&tk); if (error != 0 || tk == NULL) return (ENOSYS); } if (tk->tk_ver != VDSO_TK_VER_CURR) return (ENOSYS); error = binuptime(&bt, tk, 1); if (error != 0) return (error); bintime2timeval(&bt, tv); return (0); } #pragma weak __vdso_clock_gettime int __vdso_clock_gettime(clockid_t clock_id, struct timespec *ts) { struct bintime bt; int abs, error; if (tk == NULL) { error = _elf_aux_info(AT_TIMEKEEP, &tk, sizeof(tk)); if (error != 0 || tk == NULL) return (ENOSYS); } if (tk->tk_ver != VDSO_TK_VER_CURR) return (ENOSYS); switch (clock_id) { case CLOCK_REALTIME: case CLOCK_REALTIME_PRECISE: case CLOCK_REALTIME_FAST: case CLOCK_SECOND: abs = 1; break; case CLOCK_MONOTONIC: case CLOCK_MONOTONIC_PRECISE: case CLOCK_MONOTONIC_FAST: case CLOCK_UPTIME: case CLOCK_UPTIME_PRECISE: case CLOCK_UPTIME_FAST: abs = 0; break; default: return (ENOSYS); } error = binuptime(&bt, tk, abs); if (error != 0) return (error); bintime2timespec(&bt, ts); if (clock_id == CLOCK_SECOND) ts->tv_nsec = 0; return (0); } Index: head/sys/kern/kern_sharedpage.c =================================================================== --- head/sys/kern/kern_sharedpage.c (revision 286283) +++ head/sys/kern/kern_sharedpage.c (revision 286284) @@ -1,239 +1,255 @@ /*- * Copyright (c) 2010, 2012 Konstantin Belousov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct sx shared_page_alloc_sx; static vm_object_t shared_page_obj; static int shared_page_free; char *shared_page_mapping; void shared_page_write(int base, int size, const void *data) { bcopy(data, shared_page_mapping + base, size); } static int shared_page_alloc_locked(int size, int align) { int res; res = roundup(shared_page_free, align); if (res + size >= IDX_TO_OFF(shared_page_obj->size)) res = -1; else shared_page_free = res + size; return (res); } int shared_page_alloc(int size, int align) { int res; sx_xlock(&shared_page_alloc_sx); res = shared_page_alloc_locked(size, align); sx_xunlock(&shared_page_alloc_sx); return (res); } int shared_page_fill(int size, int align, const void *data) { int res; sx_xlock(&shared_page_alloc_sx); res = shared_page_alloc_locked(size, align); 
if (res != -1) shared_page_write(res, size, data); sx_xunlock(&shared_page_alloc_sx); return (res); } static void shared_page_init(void *dummy __unused) { vm_page_t m; vm_offset_t addr; sx_init(&shared_page_alloc_sx, "shpsx"); shared_page_obj = vm_pager_allocate(OBJT_PHYS, 0, PAGE_SIZE, VM_PROT_DEFAULT, 0, NULL); VM_OBJECT_WLOCK(shared_page_obj); m = vm_page_grab(shared_page_obj, 0, VM_ALLOC_NOBUSY | VM_ALLOC_ZERO); m->valid = VM_PAGE_BITS_ALL; VM_OBJECT_WUNLOCK(shared_page_obj); addr = kva_alloc(PAGE_SIZE); pmap_qenter(addr, &m, 1); shared_page_mapping = (char *)addr; } SYSINIT(shp, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t)shared_page_init, NULL); +/* + * Push the timehands update to the shared page. + * + * The lockless update scheme is similar to the one used to update the + * in-kernel timehands, see sys/kern/kern_tc.c:tc_windup() (which + * calls us after the timehands are updated). + */ static void timehands_update(struct sysentvec *sv) { struct vdso_timehands th; struct vdso_timekeep *tk; uint32_t enabled, idx; enabled = tc_fill_vdso_timehands(&th); - tk = (struct vdso_timekeep *)(shared_page_mapping + - sv->sv_timekeep_off); + th.th_gen = 0; idx = sv->sv_timekeep_curr; - atomic_store_rel_32(&tk->tk_th[idx].th_gen, 0); if (++idx >= VDSO_TH_NUM) idx = 0; sv->sv_timekeep_curr = idx; if (++sv->sv_timekeep_gen == 0) sv->sv_timekeep_gen = 1; - th.th_gen = 0; + + tk = (struct vdso_timekeep *)(shared_page_mapping + + sv->sv_timekeep_off); + tk->tk_th[idx].th_gen = 0; + atomic_thread_fence_rel(); if (enabled) tk->tk_th[idx] = th; - tk->tk_enabled = enabled; atomic_store_rel_32(&tk->tk_th[idx].th_gen, sv->sv_timekeep_gen); - tk->tk_current = idx; + atomic_store_rel_32(&tk->tk_current, idx); + + /* + * The ordering of the assignment to tk_enabled relative to + * the update of the vdso_timehands is not important. 
+ */ + tk->tk_enabled = enabled; } #ifdef COMPAT_FREEBSD32 static void timehands_update32(struct sysentvec *sv) { - struct vdso_timekeep32 *tk; struct vdso_timehands32 th; + struct vdso_timekeep32 *tk; uint32_t enabled, idx; enabled = tc_fill_vdso_timehands32(&th); - tk = (struct vdso_timekeep32 *)(shared_page_mapping + - sv->sv_timekeep_off); + th.th_gen = 0; idx = sv->sv_timekeep_curr; - atomic_store_rel_32(&tk->tk_th[idx].th_gen, 0); if (++idx >= VDSO_TH_NUM) idx = 0; sv->sv_timekeep_curr = idx; if (++sv->sv_timekeep_gen == 0) sv->sv_timekeep_gen = 1; - th.th_gen = 0; + + tk = (struct vdso_timekeep32 *)(shared_page_mapping + + sv->sv_timekeep_off); + tk->tk_th[idx].th_gen = 0; + atomic_thread_fence_rel(); if (enabled) tk->tk_th[idx] = th; - tk->tk_enabled = enabled; atomic_store_rel_32(&tk->tk_th[idx].th_gen, sv->sv_timekeep_gen); - tk->tk_current = idx; + atomic_store_rel_32(&tk->tk_current, idx); + tk->tk_enabled = enabled; } #endif /* * This is hackish, but easiest way to avoid creating list structures * that needs to be iterated over from the hardclock interrupt * context. 
*/ static struct sysentvec *host_sysentvec; #ifdef COMPAT_FREEBSD32 static struct sysentvec *compat32_sysentvec; #endif void timekeep_push_vdso(void) { if (host_sysentvec != NULL && host_sysentvec->sv_timekeep_base != 0) timehands_update(host_sysentvec); #ifdef COMPAT_FREEBSD32 if (compat32_sysentvec != NULL && compat32_sysentvec->sv_timekeep_base != 0) timehands_update32(compat32_sysentvec); #endif } void exec_sysvec_init(void *param) { struct sysentvec *sv; int tk_base; uint32_t tk_ver; sv = (struct sysentvec *)param; if ((sv->sv_flags & SV_SHP) == 0) return; sv->sv_shared_page_obj = shared_page_obj; sv->sv_sigcode_base = sv->sv_shared_page_base + shared_page_fill(*(sv->sv_szsigcode), 16, sv->sv_sigcode); if ((sv->sv_flags & SV_ABI_MASK) != SV_ABI_FREEBSD) return; tk_ver = VDSO_TK_VER_CURR; #ifdef COMPAT_FREEBSD32 if ((sv->sv_flags & SV_ILP32) != 0) { tk_base = shared_page_alloc(sizeof(struct vdso_timekeep32) + sizeof(struct vdso_timehands32) * VDSO_TH_NUM, 16); KASSERT(tk_base != -1, ("tk_base -1 for 32bit")); shared_page_write(tk_base + offsetof(struct vdso_timekeep32, tk_ver), sizeof(uint32_t), &tk_ver); KASSERT(compat32_sysentvec == 0, ("Native compat32 already registered")); compat32_sysentvec = sv; } else { #endif tk_base = shared_page_alloc(sizeof(struct vdso_timekeep) + sizeof(struct vdso_timehands) * VDSO_TH_NUM, 16); KASSERT(tk_base != -1, ("tk_base -1 for native")); shared_page_write(tk_base + offsetof(struct vdso_timekeep, tk_ver), sizeof(uint32_t), &tk_ver); KASSERT(host_sysentvec == 0, ("Native already registered")); host_sysentvec = sv; #ifdef COMPAT_FREEBSD32 } #endif sv->sv_timekeep_base = sv->sv_shared_page_base + tk_base; sv->sv_timekeep_off = tk_base; timekeep_push_vdso(); }