Index: sys/amd64/amd64/trap.c
===================================================================
--- sys/amd64/amd64/trap.c
+++ sys/amd64/amd64/trap.c
@@ -999,6 +999,7 @@
 		regcnt--;
 	}
 
+	syscall_read_barrier();
 	if (sa->code >= p->p_sysent->sv_size)
 		sa->callp = &p->p_sysent->sv_table[0];
 	else
@@ -1041,6 +1042,7 @@
 	    sa->code >= p->p_sysent->sv_size))
 		return (cpu_fetch_syscall_args_fallback(td, sa));
 
+	syscall_read_barrier();
 	sa->callp = &p->p_sysent->sv_table[sa->code];
 	sa->narg = sa->callp->sy_narg;
 	KASSERT(sa->narg <= nitems(sa->args), ("Too many syscall arguments!"));
Index: sys/arm/arm/syscall.c
===================================================================
--- sys/arm/arm/syscall.c
+++ sys/arm/arm/syscall.c
@@ -118,6 +118,7 @@
 		ap += 2;
 	}
 	p = td->td_proc;
+	syscall_read_barrier();
 	if (sa->code >= p->p_sysent->sv_size)
 		sa->callp = &p->p_sysent->sv_table[0];
 	else
Index: sys/arm64/arm64/trap.c
===================================================================
--- sys/arm64/arm64/trap.c
+++ sys/arm64/arm64/trap.c
@@ -136,6 +136,7 @@
 		nap--;
 	}
 
+	syscall_read_barrier();
 	if (sa->code >= p->p_sysent->sv_size)
 		sa->callp = &p->p_sysent->sv_table[0];
 	else
Index: sys/i386/i386/trap.c
===================================================================
--- sys/i386/i386/trap.c
+++ sys/i386/i386/trap.c
@@ -1085,6 +1085,7 @@
 		params += sizeof(quad_t);
 	}
 
+	syscall_read_barrier();
 	if (sa->code >= p->p_sysent->sv_size)
 		sa->callp = &p->p_sysent->sv_table[0];
 	else
Index: sys/kern/kern_syscalls.c
===================================================================
--- sys/kern/kern_syscalls.c
+++ sys/kern/kern_syscalls.c
@@ -41,6 +41,7 @@
 #include
 #include
 #include
+#include
 #include
 
 /*
@@ -64,44 +65,93 @@
 }
 
 static void
-syscall_thread_drain(struct sysent *se)
+syscall_wait_thread(struct thread *td, struct sysent *se)
 {
-	u_int32_t cnt, oldcnt;
-
-	do {
-		oldcnt = se->sy_thrcnt;
-		KASSERT((oldcnt & SY_THR_STATIC) == 0,
-		    ("drain on static syscall"));
-		cnt = oldcnt | SY_THR_DRAINING;
-	} while (atomic_cmpset_acq_32(&se->sy_thrcnt, oldcnt, cnt) == 0);
-	while (atomic_cmpset_32(&se->sy_thrcnt, SY_THR_DRAINING,
-	    SY_THR_ABSENT) == 0)
+
+	while ((struct sysent *)atomic_load_ptr(&td->td_sa.callp) == se)
 		pause("scdrn", hz/2);
 }
 
-int
-_syscall_thread_enter(struct thread *td, struct sysent *se)
+static int
+syscall_wait(struct proc *p, struct sysent *se, struct sx *lock)
 {
-	u_int32_t cnt, oldcnt;
-
-	do {
-		oldcnt = se->sy_thrcnt;
-		if ((oldcnt & (SY_THR_DRAINING | SY_THR_ABSENT)) != 0)
-			return (ENOSYS);
-		cnt = oldcnt + SY_THR_INCR;
-	} while (atomic_cmpset_acq_32(&se->sy_thrcnt, oldcnt, cnt) == 0);
+	struct thread *td;
 
+	PROC_LOCK(p);
+	FOREACH_THREAD_IN_PROC(p, td) {
+		if (td->td_sa.callp != se)
+			continue;
+		sx_sunlock(lock);
+		PROC_UNLOCK(p);
+		syscall_wait_thread(td, se);
+		return (1);
+	}
+	PROC_UNLOCK(p);
 	return (0);
 }
 
-void
-_syscall_thread_exit(struct thread *td, struct sysent *se)
+/*
+ * Observe all threads not executing the passed syscall.
+ *
+ * We are called when the func pointer is replaced with a dummy.
+ *
+ * All code preparing syscall execution uses syscall_read_barrier before
+ * reading sysent. cpus_fence_seq_cst below both publishes our update and
+ * synchronizes against aforementioned syscall_read_barrier consumers. This
+ * guarantees there will be no new threads executing the old syscall code (but
+ * there may be new threads executing the newly installed ENOSYS handler).
+ *
+ * From then on we only have to observe all threads executing a different
+ * sysent (or no sysent in the first place).
+ *
+ * Note no effort is made to nudge forward a thread blocked on the syscall.
+ */
+static void
+syscall_drain(struct sysent *se, int code)
+{
+	struct proc *p;
+	int i, j;
+
+	cpus_fence_seq_cst();
+
+	for (i = 0; i < pidhashlock + 1; i++) {
+loop_unlocked:
+		sx_slock(&pidhashtbl_lock[i]);
+		for (j = i; j <= pidhash; j += pidhashlock + 1) {
+			LIST_FOREACH(p, &pidhashtbl[j], p_hash) {
+				if (p->p_state == PRS_NEW)
+					continue;
+				if (syscall_wait(p, se, &pidhashtbl_lock[i]))
+					/*
+					 * Note this is likely to revisit
+					 * the same process, which is fine.
+					 * This avoids inserting a marker to
+					 * make sure we did not miss anything.
+					 */
+					goto loop_unlocked;
+			}
+		}
+		sx_sunlock(&pidhashtbl_lock[i]);
+	}
+}
+
+static void
+syscall_sysent_replace(struct sysent *se, struct sysent *old,
+    const struct sysent *new)
 {
-	u_int32_t cnt, oldcnt;
 
-	do {
-		oldcnt = se->sy_thrcnt;
-		cnt = oldcnt - SY_THR_INCR;
-	} while (atomic_cmpset_rel_32(&se->sy_thrcnt, oldcnt, cnt) == 0);
+	if (old != NULL)
+		*old = *se;
+
+	atomic_store_int(&se->sy_narg, new->sy_narg);
+	atomic_store_16(&se->sy_auevent, new->sy_auevent);
+	atomic_store_ptr((void *)&se->sy_systrace_args_func,
+	    (uintptr_t)new->sy_systrace_args_func);
+	atomic_store_int(&se->sy_entry, new->sy_entry);
+	atomic_store_int(&se->sy_return, new->sy_return);
+	atomic_store_int(&se->sy_flags, new->sy_flags);
+	atomic_store_int(&se->sy_thrflags, new->sy_thrflags);
+	atomic_thread_fence_rel();
+	atomic_store_ptr((void *)&se->sy_call, (uintptr_t)new->sy_call);
 }
 
 int
@@ -129,12 +179,9 @@
 		return (EEXIST);
 	}
 
-	KASSERT(sysents[*offset].sy_thrcnt == SY_THR_ABSENT,
+	KASSERT(sysents[*offset].sy_thrflags == SY_THR_ABSENT,
 	    ("dynamic syscall is not protected"));
 
-	*old_sysent = sysents[*offset];
-	new_sysent->sy_thrcnt = SY_THR_ABSENT;
-	sysents[*offset] = *new_sysent;
-	atomic_store_rel_32(&sysents[*offset].sy_thrcnt, flags);
+	syscall_sysent_replace(&sysents[*offset], old_sysent, new_sysent);
 	return (0);
 }
 
@@ -148,10 +195,11 @@
 		return (0); /* XXX? */
 	se = &sysents[offset];
 
-	if ((se->sy_thrcnt & SY_THR_STATIC) != 0)
+	if ((se->sy_thrflags & SY_THR_STATIC) != 0)
 		return (EINVAL);
 
-	syscall_thread_drain(se);
-	sysents[offset] = *old_sysent;
+	syscall_sysent_replace(se, NULL, old_sysent);
+	syscall_drain(se, offset);
+
 	return (0);
 }
Index: sys/kern/kern_thr.c
===================================================================
--- sys/kern/kern_thr.c
+++ sys/kern/kern_thr.c
@@ -375,6 +375,7 @@
 #ifdef AUDIT
 	AUDIT_SYSCALL_EXIT(0, td);
 #endif
+	syscall_exit(td);
 
 	PROC_SLOCK(p);
 	thread_stopped(p);
Index: sys/kern/subr_syscall.c
===================================================================
--- sys/kern/subr_syscall.c
+++ sys/kern/subr_syscall.c
@@ -125,12 +125,6 @@
 	}
 #endif
 
-	error = syscall_thread_enter(td, sa->callp);
-	if (error != 0) {
-		td->td_errno = error;
-		goto retval;
-	}
-
 #ifdef KDTRACE_HOOKS
 	/* Give the syscall:::entry DTrace probe a chance to fire. */
 	if (__predict_false(systrace_enabled && sa->callp->sy_entry != 0))
@@ -161,7 +155,6 @@
 		(*systrace_probe_func)(sa, SYSTRACE_RETURN, error ?
 		    -1 : td->td_retval[0]);
 #endif
-	syscall_thread_exit(td, sa->callp);
 
  retval:
 	KTR_STOP4(KTR_SYSC, "syscall", syscallname(p, sa->code),
Index: sys/kern/subr_trap.c
===================================================================
--- sys/kern/subr_trap.c
+++ sys/kern/subr_trap.c
@@ -137,6 +137,8 @@
 	td_softdep_cleanup(td);
 	MPASS(td->td_su == NULL);
 
+	syscall_exit(td);
+
 	/*
 	 * If this thread tickled GEOM, we need to wait for the giggling to
 	 * stop before we return to userland
Index: sys/mips/mips/trap.c
===================================================================
--- sys/mips/mips/trap.c
+++ sys/mips/mips/trap.c
@@ -438,6 +438,7 @@
 		printf("SYSCALL #%d pid:%u\n", sa->code, td->td_proc->p_pid);
 #endif
 
+	syscall_read_barrier();
 	se = td->td_proc->p_sysent;
 	/*
 	 * XXX
Index: sys/powerpc/powerpc/trap.c
===================================================================
--- sys/powerpc/powerpc/trap.c
+++ sys/powerpc/powerpc/trap.c
@@ -652,6 +652,7 @@
 		}
 	}
 
+	syscall_read_barrier();
 	if (sa->code >= p->p_sysent->sv_size)
 		sa->callp = &p->p_sysent->sv_table[0];
 	else
Index: sys/riscv/riscv/trap.c
===================================================================
--- sys/riscv/riscv/trap.c
+++ sys/riscv/riscv/trap.c
@@ -112,6 +112,7 @@
 		nap--;
 	}
 
+	syscall_read_barrier();
 	if (sa->code >= p->p_sysent->sv_size)
 		sa->callp = &p->p_sysent->sv_table[0];
 	else
Index: sys/sys/sysent.h
===================================================================
--- sys/sys/sysent.h
+++ sys/sys/sysent.h
@@ -58,6 +58,9 @@
 #endif
 extern systrace_probe_func_t systrace_probe_func;
 
+/*
+ * Make sure to update syscall_sysent_replace when modifying this structure.
+ */
 struct sysent {			/* system call table */
 	int	sy_narg;	/* number of arguments */
 	sy_call_t *sy_call;	/* implementing function */
@@ -67,7 +70,7 @@
 	u_int32_t sy_entry;	/* DTrace entry ID for systrace. */
 	u_int32_t sy_return;	/* DTrace return ID for systrace. */
 	u_int32_t sy_flags;	/* General flags for system calls. */
-	u_int32_t sy_thrcnt;
+	u_int32_t sy_thrflags;
 };
 
 /*
@@ -75,11 +78,9 @@
  */
 #define	SYF_CAPENABLED	0x00000001
 
-#define	SY_THR_FLAGMASK	0x7
+#define	SY_THR_FLAGMASK	0x3
 #define	SY_THR_STATIC	0x1
-#define	SY_THR_DRAINING	0x2
-#define	SY_THR_ABSENT	0x4
-#define	SY_THR_INCR	0x8
+#define	SY_THR_ABSENT	0x2
 
 #ifdef KLD_MODULE
 #define	SY_THR_STATIC_KLD	0
@@ -190,7 +191,7 @@
 	.sy_entry = 0,						\
 	.sy_return = 0,						\
 	.sy_flags = 0,						\
-	.sy_thrcnt = 0						\
+	.sy_thrflags = 0					\
 }
 
 #define	MAKE_SYSENT(syscallname)				\
@@ -288,26 +289,11 @@
 int	lkmnosys(struct thread *, struct nosys_args *);
 int	lkmressys(struct thread *, struct nosys_args *);
 
-int	_syscall_thread_enter(struct thread *td, struct sysent *se);
-void	_syscall_thread_exit(struct thread *td, struct sysent *se);
-
-static inline int
-syscall_thread_enter(struct thread *td, struct sysent *se)
-{
-
-	if (__predict_true((se->sy_thrcnt & SY_THR_STATIC) != 0))
-		return (0);
-	return (_syscall_thread_enter(td, se));
-}
-
-static inline void
-syscall_thread_exit(struct thread *td, struct sysent *se)
-{
-
-	if (__predict_true((se->sy_thrcnt & SY_THR_STATIC) != 0))
-		return;
-	_syscall_thread_exit(td, se);
-}
+/*
+ * See syscall_drain().
+ */
+#define	syscall_read_barrier()	__compiler_membar()
+#define	syscall_exit(td)	do { (td)->td_sa.callp = NULL; } while (0)
 
 int shared_page_alloc(int size, int align);
 int shared_page_fill(int size, int align, const void *data);
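
Notes, not part of the patch: what follows is an illustrative user-space sketch of the drain protocol, under stated assumptions. The scheme is an asymmetric fence: the writer installs the dummy handler, runs cpus_fence_seq_cst() (a full fence executed on every CPU, which is why readers get away with a plain __compiler_membar()), then waits out any thread whose td_sa.callp still points at the victim sysent. All names below (ent_sketch, thr_sketch, reader, drain_one, old_handler, enosys_stub) are invented for this sketch; C11 has no asymmetric fence, so both sides use seq_cst fences here; and the reader-side fence sits in the textbook store-fence-load position that the syscall_drain() comment reasons from, not necessarily the exact spot the patch places syscall_read_barrier().

#include <sched.h>
#include <stdatomic.h>
#include <stddef.h>

struct ent_sketch {
	_Atomic(int (*)(void)) sy_call;		/* stands in for sysent.sy_call */
};

struct thr_sketch {
	_Atomic(struct ent_sketch *) callp;	/* stands in for td_sa.callp */
};

static int
old_handler(void)
{

	return (0);
}

static int
enosys_stub(void)
{

	return (78);				/* ENOSYS */
}

static struct ent_sketch se = { old_handler };

/*
 * Reader: analogue of cpu_fetch_syscall_args() + the syscall body +
 * syscall_exit().  Publish callp, fence, then load and call the handler.
 */
static int
reader(struct thr_sketch *td, struct ent_sketch *e)
{
	int (*call)(void);
	int error;

	atomic_store_explicit(&td->callp, e, memory_order_relaxed);
	/* Kernel: syscall_read_barrier(), upgraded by the writer's IPI fence. */
	atomic_thread_fence(memory_order_seq_cst);
	call = atomic_load_explicit(&e->sy_call, memory_order_relaxed);
	error = call();
	/* Kernel: syscall_exit(td). */
	atomic_store_explicit(&td->callp, NULL, memory_order_release);
	return (error);
}

/*
 * Writer: analogue of syscall_sysent_replace() + syscall_drain(), reduced
 * to a single thread instead of the pidhash walk.
 */
static void
drain_one(struct ent_sketch *e, struct thr_sketch *td)
{

	atomic_store_explicit(&e->sy_call, enosys_stub, memory_order_relaxed);
	/* Kernel: cpus_fence_seq_cst(). */
	atomic_thread_fence(memory_order_seq_cst);
	/*
	 * The paired fences order callp stores against sy_call loads:
	 * either the reader's callp store is visible below, or its sy_call
	 * load already observed enosys_stub.  Once callp moves off e, no
	 * thread can still be running the old handler; later entries can
	 * only pick up the stub.
	 */
	while (atomic_load_explicit(&td->callp, memory_order_acquire) == e)
		sched_yield();			/* kernel: pause("scdrn", hz/2) */
}

int
main(void)
{
	struct thr_sketch td = { NULL };

	reader(&td, &se);		/* runs old_handler() */
	drain_one(&se, &td);		/* returns once td has left the old handler */
	return (reader(&td, &se));	/* now runs enosys_stub(): 78 */
}

The trade-off against the removed sy_thrcnt scheme falls out of the sketch: syscall_thread_enter()/_exit() charged every dynamic syscall two atomic read-modify-writes on shared state, while here syscall entry is atomic-free and the entire cost (an IPI broadcast plus a walk of the pid hash) lands on the rare deregistration path.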