Index: head/include/pthread.h =================================================================== --- head/include/pthread.h (revision 300042) +++ head/include/pthread.h (revision 300043) @@ -1,339 +1,349 @@ /* * Copyright (c) 1993, 1994 by Chris Provenzano, proven@mit.edu * Copyright (c) 1995-1998 by John Birrell * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Chris Provenzano. * 4. The name of Chris Provenzano may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY CHRIS PROVENZANO ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL CHRIS PROVENZANO BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _PTHREAD_H_ #define _PTHREAD_H_ /* * Header files. */ #include #include #include #include #include #include #include /* * Run-time invariant values: */ #define PTHREAD_DESTRUCTOR_ITERATIONS 4 #define PTHREAD_KEYS_MAX 256 #define PTHREAD_STACK_MIN __MINSIGSTKSZ #define PTHREAD_THREADS_MAX __ULONG_MAX #define PTHREAD_BARRIER_SERIAL_THREAD -1 /* * Flags for threads and thread attributes. */ #define PTHREAD_DETACHED 0x1 #define PTHREAD_SCOPE_SYSTEM 0x2 #define PTHREAD_INHERIT_SCHED 0x4 #define PTHREAD_NOFLOAT 0x8 #define PTHREAD_CREATE_DETACHED PTHREAD_DETACHED #define PTHREAD_CREATE_JOINABLE 0 #define PTHREAD_SCOPE_PROCESS 0 #define PTHREAD_EXPLICIT_SCHED 0 /* * Values for process shared/private attributes. */ #define PTHREAD_PROCESS_PRIVATE 0 #define PTHREAD_PROCESS_SHARED 1 /* * Flags for cancelling threads */ #define PTHREAD_CANCEL_ENABLE 0 #define PTHREAD_CANCEL_DISABLE 1 #define PTHREAD_CANCEL_DEFERRED 0 #define PTHREAD_CANCEL_ASYNCHRONOUS 2 #define PTHREAD_CANCELED ((void *) 1) /* * Flags for once initialization. */ #define PTHREAD_NEEDS_INIT 0 #define PTHREAD_DONE_INIT 1 /* * Static once initialization values. */ #define PTHREAD_ONCE_INIT { PTHREAD_NEEDS_INIT, NULL } /* * Static initialization values. */ #define PTHREAD_MUTEX_INITIALIZER NULL #define PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP ((pthread_mutex_t)1) #define PTHREAD_COND_INITIALIZER NULL #define PTHREAD_RWLOCK_INITIALIZER NULL /* * Default attribute arguments (draft 4, deprecated). */ #ifndef PTHREAD_KERNEL #define pthread_condattr_default NULL #define pthread_mutexattr_default NULL #define pthread_attr_default NULL #endif #define PTHREAD_PRIO_NONE 0 #define PTHREAD_PRIO_INHERIT 1 #define PTHREAD_PRIO_PROTECT 2 /* * Mutex types (Single UNIX Specification, Version 2, 1997). * * Note that a mutex attribute with one of the following types: * * PTHREAD_MUTEX_NORMAL * PTHREAD_MUTEX_RECURSIVE * * will deviate from POSIX specified semantics. */ enum pthread_mutextype { PTHREAD_MUTEX_ERRORCHECK = 1, /* Default POSIX mutex */ PTHREAD_MUTEX_RECURSIVE = 2, /* Recursive mutex */ PTHREAD_MUTEX_NORMAL = 3, /* No error checking */ PTHREAD_MUTEX_ADAPTIVE_NP = 4, /* Adaptive mutex, spins briefly before blocking on lock */ PTHREAD_MUTEX_TYPE_MAX }; #define PTHREAD_MUTEX_DEFAULT PTHREAD_MUTEX_ERRORCHECK +#define PTHREAD_MUTEX_STALLED 0 +#define PTHREAD_MUTEX_ROBUST 1 + struct _pthread_cleanup_info { __uintptr_t pthread_cleanup_pad[8]; }; /* * Thread function prototype definitions: */ __BEGIN_DECLS int pthread_atfork(void (*)(void), void (*)(void), void (*)(void)); int pthread_attr_destroy(pthread_attr_t *) __nonnull(1); int pthread_attr_getstack(const pthread_attr_t * __restrict, void ** __restrict, size_t * __restrict) __nonnull_all; int pthread_attr_getstacksize(const pthread_attr_t *, size_t *) __nonnull_all; int pthread_attr_getguardsize(const pthread_attr_t *, size_t *); int pthread_attr_getstackaddr(const pthread_attr_t *, void **); int pthread_attr_getdetachstate(const pthread_attr_t *, int *) __nonnull_all; int pthread_attr_init(pthread_attr_t *) __nonnull(1); int pthread_attr_setstacksize(pthread_attr_t *, size_t) __nonnull(1); int pthread_attr_setguardsize(pthread_attr_t *, size_t) __nonnull(1); int pthread_attr_setstack(pthread_attr_t *, void *, size_t) __nonnull(1); int pthread_attr_setstackaddr(pthread_attr_t *, void *); int pthread_attr_setdetachstate(pthread_attr_t *, int) __nonnull(1); int pthread_barrier_destroy(pthread_barrier_t *); int pthread_barrier_init(pthread_barrier_t *, const pthread_barrierattr_t *, unsigned); int pthread_barrier_wait(pthread_barrier_t *); int pthread_barrierattr_destroy(pthread_barrierattr_t *); int pthread_barrierattr_getpshared(const pthread_barrierattr_t *, int *); int pthread_barrierattr_init(pthread_barrierattr_t *) __nonnull(1); int pthread_barrierattr_setpshared(pthread_barrierattr_t *, int); #define pthread_cleanup_push(cleanup_routine, cleanup_arg) \ { \ struct _pthread_cleanup_info __cleanup_info__; \ __pthread_cleanup_push_imp(cleanup_routine, cleanup_arg,\ &__cleanup_info__); \ { #define pthread_cleanup_pop(execute) \ (void)0; \ } \ __pthread_cleanup_pop_imp(execute); \ } int pthread_condattr_destroy(pthread_condattr_t *) __nonnull(1); int pthread_condattr_getclock(const pthread_condattr_t *, clockid_t *) __nonnull_all; int pthread_condattr_getpshared(const pthread_condattr_t *, int *) __nonnull_all; int pthread_condattr_init(pthread_condattr_t *) __nonnull(1); int pthread_condattr_setclock(pthread_condattr_t *, clockid_t) __nonnull(1); int pthread_condattr_setpshared(pthread_condattr_t *, int) __nonnull(1); int pthread_cond_broadcast(pthread_cond_t *) __nonnull(1); int pthread_cond_destroy(pthread_cond_t *) __nonnull(1); int pthread_cond_init(pthread_cond_t *, const pthread_condattr_t *) __nonnull(1); int pthread_cond_signal(pthread_cond_t *) __nonnull(1); int pthread_cond_timedwait(pthread_cond_t *, pthread_mutex_t *__mutex, const struct timespec *) __nonnull_all __requires_exclusive(*__mutex); int pthread_cond_wait(pthread_cond_t *, pthread_mutex_t *__mutex) __nonnull_all __requires_exclusive(*__mutex); int pthread_create(pthread_t *, const pthread_attr_t *, void *(*) (void *), void *) __nonnull(1) __nonnull(3); int pthread_detach(pthread_t); int pthread_equal(pthread_t, pthread_t); void pthread_exit(void *) __dead2; void *pthread_getspecific(pthread_key_t); int pthread_getcpuclockid(pthread_t, clockid_t *) __nonnull(2); int pthread_join(pthread_t, void **); int pthread_key_create(pthread_key_t *, void (*) (void *)) __nonnull(1); int pthread_key_delete(pthread_key_t); int pthread_mutexattr_init(pthread_mutexattr_t *) __nonnull(1); int pthread_mutexattr_destroy(pthread_mutexattr_t *) __nonnull(1); int pthread_mutexattr_getpshared(const pthread_mutexattr_t *, int *) __nonnull_all; int pthread_mutexattr_gettype(pthread_mutexattr_t *, int *) __nonnull_all; int pthread_mutexattr_settype(pthread_mutexattr_t *, int) __nonnull(1); int pthread_mutexattr_setpshared(pthread_mutexattr_t *, int) __nonnull(1); +int pthread_mutex_consistent(pthread_mutex_t *__mutex) + __nonnull(1) __requires_exclusive(*__mutex); int pthread_mutex_destroy(pthread_mutex_t *__mutex) __nonnull(1) __requires_unlocked(*__mutex); int pthread_mutex_init(pthread_mutex_t *__mutex, const pthread_mutexattr_t *) __nonnull(1) __requires_unlocked(*__mutex); int pthread_mutex_lock(pthread_mutex_t *__mutex) __nonnull(1) __locks_exclusive(*__mutex); int pthread_mutex_trylock(pthread_mutex_t *__mutex) __nonnull(1) __trylocks_exclusive(0, *__mutex); int pthread_mutex_timedlock(pthread_mutex_t *__mutex, const struct timespec *) __nonnull_all __trylocks_exclusive(0, *__mutex); int pthread_mutex_unlock(pthread_mutex_t *__mutex) __nonnull(1) __unlocks(*__mutex); int pthread_once(pthread_once_t *, void (*) (void)) __nonnull_all; int pthread_rwlock_destroy(pthread_rwlock_t *__rwlock) __nonnull(1) __requires_unlocked(*__rwlock); int pthread_rwlock_init(pthread_rwlock_t *__rwlock, const pthread_rwlockattr_t *) __nonnull(1) __requires_unlocked(*__rwlock); int pthread_rwlock_rdlock(pthread_rwlock_t *__rwlock) __nonnull(1) __locks_shared(*__rwlock); int pthread_rwlock_timedrdlock(pthread_rwlock_t *__rwlock, const struct timespec *) __nonnull_all __trylocks_shared(0, *__rwlock); int pthread_rwlock_timedwrlock(pthread_rwlock_t *__rwlock, const struct timespec *) __nonnull_all __trylocks_exclusive(0, *__rwlock); int pthread_rwlock_tryrdlock(pthread_rwlock_t *__rwlock) __nonnull(1) __trylocks_shared(0, *__rwlock); int pthread_rwlock_trywrlock(pthread_rwlock_t *__rwlock) __nonnull(1) __trylocks_exclusive(0, *__rwlock); int pthread_rwlock_unlock(pthread_rwlock_t *__rwlock) __nonnull(1) __unlocks(*__rwlock); int pthread_rwlock_wrlock(pthread_rwlock_t *__rwlock) __nonnull(1) __locks_exclusive(*__rwlock); int pthread_rwlockattr_destroy(pthread_rwlockattr_t *) __nonnull(1); int pthread_rwlockattr_getkind_np(const pthread_rwlockattr_t *, int *); int pthread_rwlockattr_getpshared(const pthread_rwlockattr_t *, int *) __nonnull_all; int pthread_rwlockattr_init(pthread_rwlockattr_t *) __nonnull(1); int pthread_rwlockattr_setkind_np(pthread_rwlockattr_t *, int); int pthread_rwlockattr_setpshared(pthread_rwlockattr_t *, int) __nonnull(1); pthread_t pthread_self(void); int pthread_setspecific(pthread_key_t, const void *); int pthread_spin_init(pthread_spinlock_t *__spin, int) __requires_unlocked(*__spin); int pthread_spin_destroy(pthread_spinlock_t *__spin) __requires_unlocked(*__spin); int pthread_spin_lock(pthread_spinlock_t *__spin) __locks_exclusive(*__spin); int pthread_spin_trylock(pthread_spinlock_t *__spin) __trylocks_exclusive(0, *__spin); int pthread_spin_unlock(pthread_spinlock_t *__spin) __unlocks(*__spin); int pthread_cancel(pthread_t); int pthread_setcancelstate(int, int *); int pthread_setcanceltype(int, int *); void pthread_testcancel(void); #if __BSD_VISIBLE int pthread_getprio(pthread_t); int pthread_setprio(pthread_t, int); void pthread_yield(void); #endif int pthread_mutexattr_getprioceiling(pthread_mutexattr_t *, int *); int pthread_mutexattr_setprioceiling(pthread_mutexattr_t *, int); int pthread_mutex_getprioceiling(pthread_mutex_t *, int *); int pthread_mutex_setprioceiling(pthread_mutex_t *, int, int *); int pthread_mutexattr_getprotocol(pthread_mutexattr_t *, int *); int pthread_mutexattr_setprotocol(pthread_mutexattr_t *, int); + +int pthread_mutexattr_getrobust(pthread_mutexattr_t *__restrict, + int *__restrict) __nonnull_all; +int pthread_mutexattr_setrobust(pthread_mutexattr_t *, int) + __nonnull(1); int pthread_attr_getinheritsched(const pthread_attr_t *, int *); int pthread_attr_getschedparam(const pthread_attr_t *, struct sched_param *) __nonnull_all; int pthread_attr_getschedpolicy(const pthread_attr_t *, int *) __nonnull_all; int pthread_attr_getscope(const pthread_attr_t *, int *) __nonnull_all; int pthread_attr_setinheritsched(pthread_attr_t *, int); int pthread_attr_setschedparam(pthread_attr_t *, const struct sched_param *) __nonnull(1) __nonnull(2); int pthread_attr_setschedpolicy(pthread_attr_t *, int) __nonnull(1); int pthread_attr_setscope(pthread_attr_t *, int) __nonnull(1); int pthread_getschedparam(pthread_t pthread, int *, struct sched_param *) __nonnull(2) __nonnull(3); int pthread_setschedparam(pthread_t, int, const struct sched_param *) __nonnull(3); #if __XSI_VISIBLE int pthread_getconcurrency(void); int pthread_setconcurrency(int); #endif void __pthread_cleanup_push_imp(void (*)(void *), void *, struct _pthread_cleanup_info *); void __pthread_cleanup_pop_imp(int); __END_DECLS #endif Index: head/lib/libc/gen/Symbol.map =================================================================== --- head/lib/libc/gen/Symbol.map (revision 300042) +++ head/lib/libc/gen/Symbol.map (revision 300043) @@ -1,543 +1,546 @@ /* * $FreeBSD$ */ FBSD_1.0 { __xuname; pthread_atfork; pthread_attr_destroy; pthread_attr_getdetachstate; pthread_attr_getguardsize; pthread_attr_getinheritsched; pthread_attr_getschedparam; pthread_attr_getschedpolicy; pthread_attr_getscope; pthread_attr_getstackaddr; pthread_attr_getstacksize; pthread_attr_init; pthread_attr_setdetachstate; pthread_attr_setguardsize; pthread_attr_setinheritsched; pthread_attr_setschedparam; pthread_attr_setschedpolicy; pthread_attr_setscope; pthread_attr_setstackaddr; pthread_attr_setstacksize; pthread_cancel; pthread_cleanup_pop; pthread_cleanup_push; pthread_cond_broadcast; pthread_cond_destroy; pthread_cond_init; pthread_cond_signal; pthread_cond_timedwait; pthread_cond_wait; pthread_detach; pthread_equal; pthread_exit; pthread_getspecific; pthread_join; pthread_key_create; pthread_key_delete; pthread_kill; pthread_main_np; pthread_mutex_destroy; pthread_mutex_init; pthread_mutex_lock; pthread_mutex_trylock; pthread_mutex_unlock; pthread_mutexattr_destroy; pthread_mutexattr_init; pthread_mutexattr_settype; pthread_once; pthread_rwlock_destroy; pthread_rwlock_init; pthread_rwlock_rdlock; pthread_rwlock_tryrdlock; pthread_rwlock_trywrlock; pthread_rwlock_unlock; pthread_rwlock_wrlock; pthread_self; pthread_setcancelstate; pthread_setcanceltype; pthread_setspecific; pthread_sigmask; pthread_testcancel; alarm; arc4random; arc4random_addrandom; arc4random_stir; __assert; basename; check_utility_compat; clock; closedir; confstr; encrypt; des_setkey; des_cipher; setkey; ctermid; ctermid_r; daemon; devname; devname_r; dirname; getdiskbyname; dladdr; dlclose; dlerror; dlfunc; dllockinit; dlopen; dlsym; dlvsym; dlinfo; dl_iterate_phdr; drand48; erand48; err_set_file; err_set_exit; err; verr; errc; verrc; errx; verrx; warn; vwarn; warnc; vwarnc; warnx; vwarnx; sys_errlist; sys_nerr; errno; execl; execle; execlp; execv; execvp; execvP; fmtcheck; fmtmsg; fnmatch; __fpclassifyf; __fpclassifyd; __fpclassifyl; frexp; setfstab; getfstab; getfsent; getfsspec; getfsfile; setfsent; endfsent; ftok; ftw; glob; globfree; getbootfile; getbsize; cgetset; cgetcap; cgetent; cgetmatch; cgetfirst; cgetclose; cgetnext; cgetstr; cgetustr; cgetnum; getcwd; getdomainname; setgrent; setgroupent; endgrent; getgrent_r; getgrnam_r; getgrgid_r; getgrnam; getgrgid; getgrent; /* * Why are __gr_parse_entry() and __gr_match_entry() not static in * gen/getgrent.c? */ getgrouplist; gethostname; getloadavg; getlogin; getlogin_r; getmntinfo; setnetgrent; getnetgrent; endnetgrent; innetgr; getosreldate; getpagesize; getpeereid; _getprogname; getprogname; setpwent; setpassent; endpwent; getpwent_r; getpwnam_r; getpwuid_r; getpwnam; getpwuid; getpwent; getttynam; getttyent; setttyent; endttyent; isdialuptty; isnettty; getusershell; endusershell; setusershell; getvfsbyname; __isnan; isnan; __isnanf; isnanf; __isinf; isinf; __isinff; __isinfl; isatty; initgroups; jrand48; lcong48; ldexp; lockf; lrand48; modf; mrand48; nftw; nice; nlist; nrand48; opendir; pause; posix_madvise; popen; pclose; psignal; raise; readdir; readdir_r; readpassphrase; getpass; rewinddir; scandir; alphasort; seed48; seekdir; user_from_uid; group_from_gid; setdomainname; sethostname; longjmperror; getmode; setmode; setproctitle; setprogname; siginterrupt; sys_signame; sys_siglist; sys_nsig; signal; sigaddset; sigdelset; sigemptyset; sigfillset; sigismember; sleep; srand48; fstatvfs; statvfs; sl_init; sl_add; sl_free; sl_find; fflagstostr; strtofflags; sysconf; sysctl; sysctlbyname; sysctlnametomib; syslog; vsyslog; openlog; closelog; setlogmask; ttyname_r; ttyname; timezone; times; time; telldir; tcgetattr; tcsetattr; tcsetpgrp; tcgetpgrp; cfgetospeed; cfgetispeed; cfsetospeed; cfsetispeed; cfsetspeed; cfmakeraw; tcsendbreak; _init_tls; __tls_get_addr; tcdrain; tcflush; tcflow; ualarm; ulimit; uname; strunvis; strunvisx; usleep; utime; valloc; vis; strvis; strvisx; wait; wait3; waitpid; wordexp; wordfree; }; FBSD_1.1 { arc4random_buf; arc4random_uniform; fdevname; fdevname_r; fdopendir; feature_present; fts_children; fts_close; fts_get_clientptr; fts_get_stream; fts_open; fts_read; fts_set; fts_set_clientptr; posix_spawn; posix_spawn_file_actions_addclose; posix_spawn_file_actions_adddup2; posix_spawn_file_actions_addopen; posix_spawn_file_actions_destroy; posix_spawn_file_actions_init; posix_spawnattr_destroy; posix_spawnattr_getflags; posix_spawnattr_getpgroup; posix_spawnattr_getschedparam; posix_spawnattr_getschedpolicy; posix_spawnattr_getsigdefault; posix_spawnattr_getsigmask; posix_spawnattr_init; posix_spawnattr_setflags; posix_spawnattr_setpgroup; posix_spawnattr_setschedparam; posix_spawnattr_setschedpolicy; posix_spawnattr_setsigdefault; posix_spawnattr_setsigmask; posix_spawnp; semctl; tcgetsid; tcsetsid; __pthread_cleanup_pop_imp; __pthread_cleanup_push_imp; }; FBSD_1.2 { basename_r; cfmakesane; endutxent; getpagesizes; getutxent; getutxid; getutxline; getutxuser; pututxline; sem_close; sem_destroy; sem_getvalue; sem_init; sem_open; sem_post; sem_timedwait; sem_trywait; sem_unlink; sem_wait; setutxdb; setutxent; }; FBSD_1.3 { clock_getcpuclockid; dirfd; dup3; fdclosedir; fdlopen; __FreeBSD_libc_enter_restricted_mode; getcontextx; gid_from_group; nvis; pwcache_userdb; pwcache_groupdb; snvis; strenvisx; strnunvis; strnunvisx; strnvis; strnvisx; strsenvisx; strsnvis; strsnvisx; strsvis; strsvisx; svis; uid_from_user; unvis; waitid; }; FBSD_1.4 { + pthread_mutex_consistent; + pthread_mutexattr_getrobust; + pthread_mutexattr_setrobust; scandir_b; }; FBSDprivate_1.0 { /* needed by thread libraries */ __thr_jtable; _pthread_atfork; _pthread_attr_destroy; _pthread_attr_getdetachstate; _pthread_attr_getguardsize; _pthread_attr_getinheritsched; _pthread_attr_getschedparam; _pthread_attr_getschedpolicy; _pthread_attr_getscope; _pthread_attr_getstackaddr; _pthread_attr_getstacksize; _pthread_attr_init; _pthread_attr_setdetachstate; _pthread_attr_setguardsize; _pthread_attr_setinheritsched; _pthread_attr_setschedparam; _pthread_attr_setschedpolicy; _pthread_attr_setscope; _pthread_attr_setstackaddr; _pthread_attr_setstacksize; _pthread_cancel; _pthread_cancel_enter; _pthread_cancel_leave; _pthread_cleanup_pop; _pthread_cleanup_push; _pthread_cond_broadcast; _pthread_cond_destroy; _pthread_cond_init; _pthread_cond_signal; _pthread_cond_timedwait; _pthread_cond_wait; _pthread_detach; _pthread_equal; _pthread_exit; _pthread_getspecific; _pthread_join; _pthread_key_create; _pthread_key_delete; _pthread_kill; _pthread_main_np; _pthread_mutex_destroy; _pthread_mutex_init_calloc_cb; _pthread_mutex_init; _pthread_mutex_lock; _pthread_mutex_trylock; _pthread_mutex_unlock; _pthread_mutexattr_destroy; _pthread_mutexattr_init; _pthread_mutexattr_settype; _pthread_once; _pthread_rwlock_destroy; _pthread_rwlock_init; _pthread_rwlock_rdlock; _pthread_rwlock_tryrdlock; _pthread_rwlock_trywrlock; _pthread_rwlock_unlock; _pthread_rwlock_wrlock; _pthread_self; _pthread_setcancelstate; _pthread_setcanceltype; _pthread_setspecific; _pthread_sigmask; _pthread_testcancel; _spinlock; _spinlock_debug; _spinunlock; _rtld_addr_phdr; _rtld_atfork_pre; _rtld_atfork_post; _rtld_error; /* for private use */ _rtld_get_stack_prot; _rtld_is_dlopened; _rtld_thread_init; /* for private use */ __elf_phdr_match_addr; _err; _warn; __fmtcheck; /* __pw_match_entry; */ /* __pw_parse_entry; */ __fdnlist; /* used by libkvm */ /* __aout_fdnlist; */ /* __elf_is_okay__; */ /* __elf_fdnlist; */ __opendir2; __pause; _pause; __pw_scan; /* Used by (at least) libutil */ __raise; _raise; __sleep; _sleep; _rtld_allocate_tls; _rtld_free_tls; #if defined(i386) ___libc_tls_get_addr; /* x86 only */ #endif __libc_tls_get_addr; __tcdrain; _tcdrain; __usleep; _usleep; __wait; _wait; __waitpid; _waitpid; _libc_sem_init_compat; _libc_sem_destroy_compat; _libc_sem_open_compat; _libc_sem_close_compat; _libc_sem_unlink_compat; _libc_sem_wait_compat; _libc_sem_trywait_compat; _libc_sem_timedwait_compat; _libc_sem_post_compat; _libc_sem_getvalue_compat; __libc_tcdrain; __elf_aux_vector; __pthread_map_stacks_exec; __fillcontextx; __fillcontextx2; __getcontextx_size; }; Index: head/lib/libc/gen/_pthread_stubs.c =================================================================== --- head/lib/libc/gen/_pthread_stubs.c (revision 300042) +++ head/lib/libc/gen/_pthread_stubs.c (revision 300043) @@ -1,319 +1,327 @@ /* * Copyright (c) 2001 Daniel Eischen . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY DANIEL EISCHEN AND CONTRIBUTORS ``AS IS'' * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include "libc_private.h" /* * Weak symbols: All libc internal usage of these functions should * use the weak symbol versions (_pthread_XXX). If libpthread is * linked, it will override these functions with (non-weak) routines. * The _pthread_XXX functions are provided solely for internal libc * usage to avoid unwanted cancellation points and to differentiate * between application locks and libc locks (threads holding the * latter can't be allowed to exit/terminate). */ /* Define a null pthread structure just to satisfy _pthread_self. */ struct pthread { }; static struct pthread main_thread; static int stub_main(void); static void *stub_null(void); static struct pthread *stub_self(void); static int stub_zero(void); static int stub_fail(void); static int stub_true(void); static void stub_exit(void); #define PJT_DUAL_ENTRY(entry) \ (pthread_func_t)entry, (pthread_func_t)entry pthread_func_entry_t __thr_jtable[PJT_MAX] = { {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_ATFORK */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_ATTR_DESTROY */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_ATTR_GETDETACHSTATE */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_ATTR_GETGUARDSIZE */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_ATTR_GETINHERITSCHED */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_ATTR_GETSCHEDPARAM */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_ATTR_GETSCHEDPOLICY */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_ATTR_GETSCOPE */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_ATTR_GETSTACKADDR */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_ATTR_GETSTACKSIZE */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_ATTR_INIT */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_ATTR_SETDETACHSTATE */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_ATTR_SETGUARDSIZE */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_ATTR_SETINHERITSCHED */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_ATTR_SETSCHEDPARAM */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_ATTR_SETSCHEDPOLICY */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_ATTR_SETSCOPE */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_ATTR_SETSTACKADDR */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_ATTR_SETSTACKSIZE */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_CANCEL */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_CLEANUP_POP */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_CLEANUP_PUSH */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_COND_BROADCAST */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_COND_DESTROY */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_COND_INIT */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_COND_SIGNAL */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_COND_TIMEDWAIT */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_COND_WAIT */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_DETACH */ {PJT_DUAL_ENTRY(stub_true)}, /* PJT_EQUAL */ {PJT_DUAL_ENTRY(stub_exit)}, /* PJT_EXIT */ {PJT_DUAL_ENTRY(stub_null)}, /* PJT_GETSPECIFIC */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_JOIN */ {PJT_DUAL_ENTRY(stub_fail)}, /* PJT_KEY_CREATE */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_KEY_DELETE */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_KILL */ {PJT_DUAL_ENTRY(stub_main)}, /* PJT_MAIN_NP */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_MUTEXATTR_DESTROY */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_MUTEXATTR_INIT */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_MUTEXATTR_SETTYPE */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_MUTEX_DESTROY */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_MUTEX_INIT */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_MUTEX_LOCK */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_MUTEX_TRYLOCK */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_MUTEX_UNLOCK */ {PJT_DUAL_ENTRY(stub_fail)}, /* PJT_ONCE */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_RWLOCK_DESTROY */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_RWLOCK_INIT */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_RWLOCK_RDLOCK */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_RWLOCK_TRYRDLOCK */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_RWLOCK_TRYWRLOCK */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_RWLOCK_UNLOCK */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_RWLOCK_WRLOCK */ {PJT_DUAL_ENTRY(stub_self)}, /* PJT_SELF */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_SETCANCELSTATE */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_SETCANCELTYPE */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_SETSPECIFIC */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_SIGMASK */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_TESTCANCEL */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_CLEANUP_POP_IMP */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_CLEANUP_PUSH_IMP */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_CANCEL_ENTER */ {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_CANCEL_LEAVE */ + {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_MUTEX_CONSISTENT */ + {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_MUTEXATTR_GETROBUST */ + {PJT_DUAL_ENTRY(stub_zero)}, /* PJT_MUTEXATTR_SETROBUST */ }; /* * Weak aliases for exported (pthread_*) and internal (_pthread_*) routines. */ #define WEAK_REF(sym, alias) __weak_reference(sym, alias) #define FUNC_TYPE(name) __CONCAT(name, _func_t) #define FUNC_INT(name) __CONCAT(name, _int) #define FUNC_EXP(name) __CONCAT(name, _exp) #define STUB_FUNC(name, idx, ret) \ static ret FUNC_EXP(name)(void) __used; \ static ret FUNC_INT(name)(void) __used; \ WEAK_REF(FUNC_EXP(name), name); \ WEAK_REF(FUNC_INT(name), __CONCAT(_, name)); \ typedef ret (*FUNC_TYPE(name))(void); \ static ret FUNC_EXP(name)(void) \ { \ FUNC_TYPE(name) func; \ func = (FUNC_TYPE(name))__thr_jtable[idx][0]; \ return (func()); \ } \ static ret FUNC_INT(name)(void) \ { \ FUNC_TYPE(name) func; \ func = (FUNC_TYPE(name))__thr_jtable[idx][1]; \ return (func()); \ } #define STUB_FUNC1(name, idx, ret, p0_type) \ static ret FUNC_EXP(name)(p0_type) __used; \ static ret FUNC_INT(name)(p0_type) __used; \ WEAK_REF(FUNC_EXP(name), name); \ WEAK_REF(FUNC_INT(name), __CONCAT(_, name)); \ typedef ret (*FUNC_TYPE(name))(p0_type); \ static ret FUNC_EXP(name)(p0_type p0) \ { \ FUNC_TYPE(name) func; \ func = (FUNC_TYPE(name))__thr_jtable[idx][0]; \ return (func(p0)); \ } \ static ret FUNC_INT(name)(p0_type p0) \ { \ FUNC_TYPE(name) func; \ func = (FUNC_TYPE(name))__thr_jtable[idx][1]; \ return (func(p0)); \ } #define STUB_FUNC2(name, idx, ret, p0_type, p1_type) \ static ret FUNC_EXP(name)(p0_type, p1_type) __used; \ static ret FUNC_INT(name)(p0_type, p1_type) __used; \ WEAK_REF(FUNC_EXP(name), name); \ WEAK_REF(FUNC_INT(name), __CONCAT(_, name)); \ typedef ret (*FUNC_TYPE(name))(p0_type, p1_type); \ static ret FUNC_EXP(name)(p0_type p0, p1_type p1) \ { \ FUNC_TYPE(name) func; \ func = (FUNC_TYPE(name))__thr_jtable[idx][0]; \ return (func(p0, p1)); \ } \ static ret FUNC_INT(name)(p0_type p0, p1_type p1) \ { \ FUNC_TYPE(name) func; \ func = (FUNC_TYPE(name))__thr_jtable[idx][1]; \ return (func(p0, p1)); \ } #define STUB_FUNC3(name, idx, ret, p0_type, p1_type, p2_type) \ static ret FUNC_EXP(name)(p0_type, p1_type, p2_type) __used; \ static ret FUNC_INT(name)(p0_type, p1_type, p2_type) __used; \ WEAK_REF(FUNC_EXP(name), name); \ WEAK_REF(FUNC_INT(name), __CONCAT(_, name)); \ typedef ret (*FUNC_TYPE(name))(p0_type, p1_type, p2_type); \ static ret FUNC_EXP(name)(p0_type p0, p1_type p1, p2_type p2) \ { \ FUNC_TYPE(name) func; \ func = (FUNC_TYPE(name))__thr_jtable[idx][0]; \ return (func(p0, p1, p2)); \ } \ static ret FUNC_INT(name)(p0_type p0, p1_type p1, p2_type p2) \ { \ FUNC_TYPE(name) func; \ func = (FUNC_TYPE(name))__thr_jtable[idx][1]; \ return (func(p0, p1, p2)); \ } STUB_FUNC1(pthread_cond_broadcast, PJT_COND_BROADCAST, int, void *) STUB_FUNC1(pthread_cond_destroy, PJT_COND_DESTROY, int, void *) STUB_FUNC2(pthread_cond_init, PJT_COND_INIT, int, void *, void *) STUB_FUNC1(pthread_cond_signal, PJT_COND_SIGNAL, int, void *) STUB_FUNC2(pthread_cond_wait, PJT_COND_WAIT, int, void *, void *) STUB_FUNC1(pthread_getspecific, PJT_GETSPECIFIC, void *, pthread_key_t) STUB_FUNC2(pthread_key_create, PJT_KEY_CREATE, int, void *, void *) STUB_FUNC1(pthread_key_delete, PJT_KEY_DELETE, int, pthread_key_t) STUB_FUNC(pthread_main_np, PJT_MAIN_NP, int) STUB_FUNC1(pthread_mutex_destroy, PJT_MUTEX_DESTROY, int, void *) STUB_FUNC2(pthread_mutex_init, PJT_MUTEX_INIT, int, void *, void *) STUB_FUNC1(pthread_mutex_lock, PJT_MUTEX_LOCK, int, void *) STUB_FUNC1(pthread_mutex_trylock, PJT_MUTEX_TRYLOCK, int, void *) STUB_FUNC1(pthread_mutex_unlock, PJT_MUTEX_UNLOCK, int, void *) +STUB_FUNC1(pthread_mutex_consistent, PJT_MUTEX_CONSISTENT, int, void *) STUB_FUNC1(pthread_mutexattr_destroy, PJT_MUTEXATTR_DESTROY, int, void *) STUB_FUNC1(pthread_mutexattr_init, PJT_MUTEXATTR_INIT, int, void *) STUB_FUNC2(pthread_mutexattr_settype, PJT_MUTEXATTR_SETTYPE, int, void *, int) +STUB_FUNC2(pthread_mutexattr_getrobust, PJT_MUTEXATTR_GETROBUST, int, void *, + int *) +STUB_FUNC2(pthread_mutexattr_setrobust, PJT_MUTEXATTR_SETROBUST, int, void *, + int) STUB_FUNC2(pthread_once, PJT_ONCE, int, void *, void *) STUB_FUNC1(pthread_rwlock_destroy, PJT_RWLOCK_DESTROY, int, void *) STUB_FUNC2(pthread_rwlock_init, PJT_RWLOCK_INIT, int, void *, void *) STUB_FUNC1(pthread_rwlock_rdlock, PJT_RWLOCK_RDLOCK, int, void *) STUB_FUNC1(pthread_rwlock_tryrdlock, PJT_RWLOCK_TRYRDLOCK, int, void *) STUB_FUNC1(pthread_rwlock_trywrlock, PJT_RWLOCK_TRYWRLOCK, int, void *) STUB_FUNC1(pthread_rwlock_unlock, PJT_RWLOCK_UNLOCK, int, void *) STUB_FUNC1(pthread_rwlock_wrlock, PJT_RWLOCK_WRLOCK, int, void *) STUB_FUNC(pthread_self, PJT_SELF, pthread_t) STUB_FUNC2(pthread_setspecific, PJT_SETSPECIFIC, int, pthread_key_t, void *) STUB_FUNC3(pthread_sigmask, PJT_SIGMASK, int, int, void *, void *) STUB_FUNC3(pthread_atfork, PJT_ATFORK, int, void *, void *, void*) STUB_FUNC1(pthread_attr_destroy, PJT_ATTR_DESTROY, int, void *); STUB_FUNC2(pthread_attr_getdetachstate, PJT_ATTR_GETDETACHSTATE, int, void *, void *) STUB_FUNC2(pthread_attr_getguardsize, PJT_ATTR_GETGUARDSIZE, int, void *, void *) STUB_FUNC2(pthread_attr_getstackaddr, PJT_ATTR_GETSTACKADDR, int, void *, void *) STUB_FUNC2(pthread_attr_getstacksize, PJT_ATTR_GETSTACKSIZE, int, void *, void *) STUB_FUNC2(pthread_attr_getinheritsched, PJT_ATTR_GETINHERITSCHED, int, void *, void *) STUB_FUNC2(pthread_attr_getschedparam, PJT_ATTR_GETSCHEDPARAM, int, void *, void *) STUB_FUNC2(pthread_attr_getschedpolicy, PJT_ATTR_GETSCHEDPOLICY, int, void *, void *) STUB_FUNC2(pthread_attr_getscope, PJT_ATTR_GETSCOPE, int, void *, void *) STUB_FUNC1(pthread_attr_init, PJT_ATTR_INIT, int, void *) STUB_FUNC2(pthread_attr_setdetachstate, PJT_ATTR_SETDETACHSTATE, int, void *, int) STUB_FUNC2(pthread_attr_setguardsize, PJT_ATTR_SETGUARDSIZE, int, void *, size_t) STUB_FUNC2(pthread_attr_setstackaddr, PJT_ATTR_SETSTACKADDR, int, void *, void *) STUB_FUNC2(pthread_attr_setstacksize, PJT_ATTR_SETSTACKSIZE, int, void *, size_t) STUB_FUNC2(pthread_attr_setinheritsched, PJT_ATTR_SETINHERITSCHED, int, void *, int) STUB_FUNC2(pthread_attr_setschedparam, PJT_ATTR_SETSCHEDPARAM, int, void *, void *) STUB_FUNC2(pthread_attr_setschedpolicy, PJT_ATTR_SETSCHEDPOLICY, int, void *, int) STUB_FUNC2(pthread_attr_setscope, PJT_ATTR_SETSCOPE, int, void *, int) STUB_FUNC1(pthread_cancel, PJT_CANCEL, int, void *) STUB_FUNC1(pthread_cleanup_pop, PJT_CLEANUP_POP, int, int) STUB_FUNC2(pthread_cleanup_push, PJT_CLEANUP_PUSH, void, void *, void *) STUB_FUNC3(pthread_cond_timedwait, PJT_COND_TIMEDWAIT, int, void *, void *, void *) STUB_FUNC1(pthread_detach, PJT_DETACH, int, void *) STUB_FUNC2(pthread_equal, PJT_EQUAL, int, void *, void *) STUB_FUNC1(pthread_exit, PJT_EXIT, void, void *) STUB_FUNC2(pthread_join, PJT_JOIN, int, void *, void *) STUB_FUNC2(pthread_kill, PJT_KILL, int, void *, int) STUB_FUNC2(pthread_setcancelstate, PJT_SETCANCELSTATE, int, int, void *) STUB_FUNC2(pthread_setcanceltype, PJT_SETCANCELTYPE, int, int, void *) STUB_FUNC(pthread_testcancel, PJT_TESTCANCEL, void) STUB_FUNC1(__pthread_cleanup_pop_imp, PJT_CLEANUP_POP_IMP, int, int) STUB_FUNC2(__pthread_cleanup_push_imp, PJT_CLEANUP_PUSH_IMP, void, void*, void *); STUB_FUNC1(_pthread_cancel_enter, PJT_CANCEL_ENTER, int, int) STUB_FUNC1(_pthread_cancel_leave, PJT_CANCEL_LEAVE, int, int) static int stub_zero(void) { return (0); } static void * stub_null(void) { return (NULL); } static struct pthread * stub_self(void) { return (&main_thread); } static int stub_fail(void) { return (ENOSYS); } static int stub_main(void) { return (-1); } static int stub_true(void) { return (1); } static void stub_exit(void) { exit(0); } Index: head/lib/libc/include/libc_private.h =================================================================== --- head/lib/libc/include/libc_private.h (revision 300042) +++ head/lib/libc/include/libc_private.h (revision 300043) @@ -1,390 +1,393 @@ /* * Copyright (c) 1998 John Birrell . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ * * Private definitions for libc, libc_r and libpthread. * */ #ifndef _LIBC_PRIVATE_H_ #define _LIBC_PRIVATE_H_ #include #include /* * This global flag is non-zero when a process has created one * or more threads. It is used to avoid calling locking functions * when they are not required. */ extern int __isthreaded; /* * Elf_Auxinfo *__elf_aux_vector, the pointer to the ELF aux vector * provided by kernel. Either set for us by rtld, or found at runtime * on stack for static binaries. * * Type is void to avoid polluting whole libc with ELF types. */ extern void *__elf_aux_vector; /* * libc should use libc_dlopen internally, which respects a global * flag where loading of new shared objects can be restricted. */ void *libc_dlopen(const char *, int); /* * For dynamic linker. */ void _rtld_error(const char *fmt, ...); /* * File lock contention is difficult to diagnose without knowing * where locks were set. Allow a debug library to be built which * records the source file and line number of each lock call. */ #ifdef _FLOCK_DEBUG #define _FLOCKFILE(x) _flockfile_debug(x, __FILE__, __LINE__) #else #define _FLOCKFILE(x) _flockfile(x) #endif /* * Macros for locking and unlocking FILEs. These test if the * process is threaded to avoid locking when not required. */ #define FLOCKFILE(fp) if (__isthreaded) _FLOCKFILE(fp) #define FUNLOCKFILE(fp) if (__isthreaded) _funlockfile(fp) struct _spinlock; extern struct _spinlock __stdio_thread_lock __hidden; #define STDIO_THREAD_LOCK() \ do { \ if (__isthreaded) \ _SPINLOCK(&__stdio_thread_lock); \ } while (0) #define STDIO_THREAD_UNLOCK() \ do { \ if (__isthreaded) \ _SPINUNLOCK(&__stdio_thread_lock); \ } while (0) void __libc_spinlock_stub(struct _spinlock *); void __libc_spinunlock_stub(struct _spinlock *); /* * Indexes into the pthread jump table. * * Warning! If you change this type, you must also change the threads * libraries that reference it (libc_r, libpthread). */ typedef enum { PJT_ATFORK, PJT_ATTR_DESTROY, PJT_ATTR_GETDETACHSTATE, PJT_ATTR_GETGUARDSIZE, PJT_ATTR_GETINHERITSCHED, PJT_ATTR_GETSCHEDPARAM, PJT_ATTR_GETSCHEDPOLICY, PJT_ATTR_GETSCOPE, PJT_ATTR_GETSTACKADDR, PJT_ATTR_GETSTACKSIZE, PJT_ATTR_INIT, PJT_ATTR_SETDETACHSTATE, PJT_ATTR_SETGUARDSIZE, PJT_ATTR_SETINHERITSCHED, PJT_ATTR_SETSCHEDPARAM, PJT_ATTR_SETSCHEDPOLICY, PJT_ATTR_SETSCOPE, PJT_ATTR_SETSTACKADDR, PJT_ATTR_SETSTACKSIZE, PJT_CANCEL, PJT_CLEANUP_POP, PJT_CLEANUP_PUSH, PJT_COND_BROADCAST, PJT_COND_DESTROY, PJT_COND_INIT, PJT_COND_SIGNAL, PJT_COND_TIMEDWAIT, PJT_COND_WAIT, PJT_DETACH, PJT_EQUAL, PJT_EXIT, PJT_GETSPECIFIC, PJT_JOIN, PJT_KEY_CREATE, PJT_KEY_DELETE, PJT_KILL, PJT_MAIN_NP, PJT_MUTEXATTR_DESTROY, PJT_MUTEXATTR_INIT, PJT_MUTEXATTR_SETTYPE, PJT_MUTEX_DESTROY, PJT_MUTEX_INIT, PJT_MUTEX_LOCK, PJT_MUTEX_TRYLOCK, PJT_MUTEX_UNLOCK, PJT_ONCE, PJT_RWLOCK_DESTROY, PJT_RWLOCK_INIT, PJT_RWLOCK_RDLOCK, PJT_RWLOCK_TRYRDLOCK, PJT_RWLOCK_TRYWRLOCK, PJT_RWLOCK_UNLOCK, PJT_RWLOCK_WRLOCK, PJT_SELF, PJT_SETCANCELSTATE, PJT_SETCANCELTYPE, PJT_SETSPECIFIC, PJT_SIGMASK, PJT_TESTCANCEL, PJT_CLEANUP_POP_IMP, PJT_CLEANUP_PUSH_IMP, PJT_CANCEL_ENTER, PJT_CANCEL_LEAVE, + PJT_MUTEX_CONSISTENT, + PJT_MUTEXATTR_GETROBUST, + PJT_MUTEXATTR_SETROBUST, PJT_MAX } pjt_index_t; typedef int (*pthread_func_t)(void); typedef pthread_func_t pthread_func_entry_t[2]; extern pthread_func_entry_t __thr_jtable[]; void __set_error_selector(int *(*arg)(void)); int _pthread_mutex_init_calloc_cb_stub(pthread_mutex_t *mutex, void *(calloc_cb)(__size_t, __size_t)); typedef int (*interpos_func_t)(void); interpos_func_t *__libc_interposing_slot(int interposno); extern interpos_func_t __libc_interposing[] __hidden; enum { INTERPOS_accept, INTERPOS_accept4, INTERPOS_aio_suspend, INTERPOS_close, INTERPOS_connect, INTERPOS_fcntl, INTERPOS_fsync, INTERPOS_fork, INTERPOS_msync, INTERPOS_nanosleep, INTERPOS_openat, INTERPOS_poll, INTERPOS_pselect, INTERPOS_recvfrom, INTERPOS_recvmsg, INTERPOS_select, INTERPOS_sendmsg, INTERPOS_sendto, INTERPOS_setcontext, INTERPOS_sigaction, INTERPOS_sigprocmask, INTERPOS_sigsuspend, INTERPOS_sigwait, INTERPOS_sigtimedwait, INTERPOS_sigwaitinfo, INTERPOS_swapcontext, INTERPOS_system, INTERPOS_tcdrain, INTERPOS_read, INTERPOS_readv, INTERPOS_wait4, INTERPOS_write, INTERPOS_writev, INTERPOS__pthread_mutex_init_calloc_cb, INTERPOS_spinlock, INTERPOS_spinunlock, INTERPOS_kevent, INTERPOS_wait6, INTERPOS_ppoll, INTERPOS_map_stacks_exec, INTERPOS_MAX }; /* * yplib internal interfaces */ #ifdef YP int _yp_check(char **); #endif /* * Initialise TLS for static programs */ void _init_tls(void); /* * Provides pthread_once()-like functionality for both single-threaded * and multi-threaded applications. */ int _once(pthread_once_t *, void (*)(void)); /* * Set the TLS thread pointer */ void _set_tp(void *tp); /* * This is a pointer in the C run-time startup code. It is used * by getprogname() and setprogname(). */ extern const char *__progname; /* * This function is used by the threading libraries to notify malloc that a * thread is exiting. */ void _malloc_thread_cleanup(void); /* * These functions are used by the threading libraries in order to protect * malloc across fork(). */ void _malloc_prefork(void); void _malloc_postfork(void); void _malloc_first_thread(void); /* * Function to clean up streams, called from abort() and exit(). */ extern void (*__cleanup)(void) __hidden; /* * Get kern.osreldate to detect ABI revisions. Explicitly * ignores value of $OSVERSION and caches result. */ int __getosreldate(void); #include #include struct aiocb; struct fd_set; struct iovec; struct kevent; struct msghdr; struct pollfd; struct rusage; struct sigaction; struct sockaddr; struct timespec; struct timeval; struct timezone; struct __siginfo; struct __ucontext; struct __wrusage; enum idtype; int __sys_aio_suspend(const struct aiocb * const[], int, const struct timespec *); int __sys_accept(int, struct sockaddr *, __socklen_t *); int __sys_accept4(int, struct sockaddr *, __socklen_t *, int); int __sys_clock_gettime(__clockid_t, struct timespec *ts); int __sys_close(int); int __sys_connect(int, const struct sockaddr *, __socklen_t); int __sys_fcntl(int, int, ...); int __sys_fsync(int); __pid_t __sys_fork(void); int __sys_ftruncate(int, __off_t); int __sys_gettimeofday(struct timeval *, struct timezone *); int __sys_kevent(int, const struct kevent *, int, struct kevent *, int, const struct timespec *); __off_t __sys_lseek(int, __off_t, int); void *__sys_mmap(void *, __size_t, int, int, int, __off_t); int __sys_msync(void *, __size_t, int); int __sys_nanosleep(const struct timespec *, struct timespec *); int __sys_open(const char *, int, ...); int __sys_openat(int, const char *, int, ...); int __sys_pselect(int, struct fd_set *, struct fd_set *, struct fd_set *, const struct timespec *, const __sigset_t *); int __sys_poll(struct pollfd *, unsigned, int); int __sys_ppoll(struct pollfd *, unsigned, const struct timespec *, const __sigset_t *); __ssize_t __sys_pread(int, void *, __size_t, __off_t); __ssize_t __sys_pwrite(int, const void *, __size_t, __off_t); __ssize_t __sys_read(int, void *, __size_t); __ssize_t __sys_readv(int, const struct iovec *, int); __ssize_t __sys_recv(int, void *, __size_t, int); __ssize_t __sys_recvfrom(int, void *, __size_t, int, struct sockaddr *, __socklen_t *); __ssize_t __sys_recvmsg(int, struct msghdr *, int); int __sys_select(int, struct fd_set *, struct fd_set *, struct fd_set *, struct timeval *); __ssize_t __sys_sendmsg(int, const struct msghdr *, int); __ssize_t __sys_sendto(int, const void *, __size_t, int, const struct sockaddr *, __socklen_t); int __sys_setcontext(const struct __ucontext *); int __sys_sigaction(int, const struct sigaction *, struct sigaction *); int __sys_sigprocmask(int, const __sigset_t *, __sigset_t *); int __sys_sigsuspend(const __sigset_t *); int __sys_sigtimedwait(const __sigset_t *, struct __siginfo *, const struct timespec *); int __sys_sigwait(const __sigset_t *, int *); int __sys_sigwaitinfo(const __sigset_t *, struct __siginfo *); int __sys_swapcontext(struct __ucontext *, const struct __ucontext *); int __sys_thr_kill(long, int); int __sys_thr_self(long *); int __sys_truncate(const char *, __off_t); __pid_t __sys_wait4(__pid_t, int *, int, struct rusage *); __pid_t __sys_wait6(enum idtype, __id_t, int *, int, struct __wrusage *, struct __siginfo *); __ssize_t __sys_write(int, const void *, __size_t); __ssize_t __sys_writev(int, const struct iovec *, int); int __libc_sigaction(int, const struct sigaction *, struct sigaction *) __hidden; int __libc_sigprocmask(int, const __sigset_t *, __sigset_t *) __hidden; int __libc_sigsuspend(const __sigset_t *) __hidden; int __libc_sigwait(const __sigset_t * __restrict, int * restrict sig); int __libc_system(const char *); int __libc_tcdrain(int); int __fcntl_compat(int fd, int cmd, ...); int __sys_futimens(int fd, const struct timespec *times) __hidden; int __sys_utimensat(int fd, const char *path, const struct timespec *times, int flag) __hidden; /* execve() with PATH processing to implement posix_spawnp() */ int _execvpe(const char *, char * const *, char * const *); int _elf_aux_info(int aux, void *buf, int buflen); struct dl_phdr_info; int __elf_phdr_match_addr(struct dl_phdr_info *, void *); void __init_elf_aux_vector(void); void __libc_map_stacks_exec(void); void _pthread_cancel_enter(int); void _pthread_cancel_leave(int); #endif /* _LIBC_PRIVATE_H_ */ Index: head/lib/libthr/pthread.map =================================================================== --- head/lib/libthr/pthread.map (revision 300042) +++ head/lib/libthr/pthread.map (revision 300043) @@ -1,317 +1,323 @@ /* * $FreeBSD$ */ /* * Use the same naming scheme as libc. */ FBSD_1.0 { pthread_atfork; pthread_barrier_destroy; pthread_barrier_init; pthread_barrier_wait; pthread_barrierattr_destroy; pthread_barrierattr_getpshared; pthread_barrierattr_init; pthread_barrierattr_setpshared; pthread_attr_destroy; pthread_attr_get_np; pthread_attr_getdetachstate; pthread_attr_getguardsize; pthread_attr_getinheritsched; pthread_attr_getschedparam; pthread_attr_getschedpolicy; pthread_attr_getscope; pthread_attr_getstack; pthread_attr_getstackaddr; pthread_attr_getstacksize; pthread_attr_init; pthread_attr_setcreatesuspend_np; pthread_attr_setdetachstate; pthread_attr_setguardsize; pthread_attr_setinheritsched; pthread_attr_setschedparam; pthread_attr_setschedpolicy; pthread_attr_setscope; pthread_attr_setstack; pthread_attr_setstackaddr; pthread_attr_setstacksize; pthread_cancel; pthread_cleanup_pop; pthread_cleanup_push; pthread_cond_broadcast; pthread_cond_destroy; pthread_cond_init; pthread_cond_signal; pthread_cond_timedwait; pthread_cond_wait; pthread_condattr_destroy; pthread_condattr_getclock; pthread_condattr_getpshared; pthread_condattr_init; pthread_condattr_setclock; pthread_condattr_setpshared; pthread_create; pthread_detach; pthread_equal; pthread_exit; pthread_getconcurrency; pthread_getprio; pthread_getschedparam; pthread_getspecific; pthread_join; pthread_key_create; pthread_key_delete; pthread_kill; pthread_main_np; pthread_multi_np; pthread_mutex_destroy; pthread_mutex_getprioceiling; pthread_mutex_init; pthread_mutex_lock; pthread_mutex_setprioceiling; pthread_mutex_timedlock; pthread_mutex_trylock; pthread_mutex_unlock; pthread_mutexattr_destroy; pthread_mutexattr_getkind_np; pthread_mutexattr_getprioceiling; pthread_mutexattr_getpshared; pthread_mutexattr_getprotocol; pthread_mutexattr_gettype; pthread_mutexattr_init; pthread_mutexattr_setkind_np; pthread_mutexattr_setprioceiling; pthread_mutexattr_setprotocol; pthread_mutexattr_setpshared; pthread_mutexattr_settype; pthread_once; pthread_resume_all_np; pthread_resume_np; pthread_rwlock_destroy; pthread_rwlock_init; pthread_rwlock_rdlock; pthread_rwlock_timedrdlock; pthread_rwlock_timedwrlock; pthread_rwlock_tryrdlock; pthread_rwlock_trywrlock; pthread_rwlock_unlock; pthread_rwlock_wrlock; pthread_rwlockattr_destroy; pthread_rwlockattr_getpshared; pthread_rwlockattr_init; pthread_rwlockattr_setpshared; pthread_set_name_np; pthread_self; pthread_setcancelstate; pthread_setcanceltype; pthread_setconcurrency; pthread_setprio; pthread_setschedparam; pthread_setspecific; pthread_sigmask; pthread_single_np; pthread_spin_destroy; pthread_spin_init; pthread_spin_lock; pthread_spin_trylock; pthread_spin_unlock; pthread_suspend_all_np; pthread_suspend_np; pthread_switch_add_np; pthread_switch_delete_np; pthread_testcancel; pthread_timedjoin_np; pthread_yield; }; /* * List the private interfaces reserved for use in FreeBSD libraries. * These are not part of our application ABI. */ FBSDprivate_1.0 { __pthread_cond_timedwait; __pthread_cond_wait; __pthread_cxa_finalize; __pthread_mutex_init; __pthread_mutex_lock; __pthread_mutex_timedlock; __pthread_mutex_trylock; _pthread_atfork; _pthread_barrier_destroy; _pthread_barrier_init; _pthread_barrier_wait; _pthread_barrierattr_destroy; _pthread_barrierattr_getpshared; _pthread_barrierattr_init; _pthread_barrierattr_setpshared; _pthread_attr_destroy; _pthread_attr_get_np; _pthread_attr_getaffinity_np; _pthread_attr_getdetachstate; _pthread_attr_getguardsize; _pthread_attr_getinheritsched; _pthread_attr_getschedparam; _pthread_attr_getschedpolicy; _pthread_attr_getscope; _pthread_attr_getstack; _pthread_attr_getstackaddr; _pthread_attr_getstacksize; _pthread_attr_init; _pthread_attr_setaffinity_np; _pthread_attr_setcreatesuspend_np; _pthread_attr_setdetachstate; _pthread_attr_setguardsize; _pthread_attr_setinheritsched; _pthread_attr_setschedparam; _pthread_attr_setschedpolicy; _pthread_attr_setscope; _pthread_attr_setstack; _pthread_attr_setstackaddr; _pthread_attr_setstacksize; _pthread_cancel; _pthread_cancel_enter; _pthread_cancel_leave; _pthread_cleanup_pop; _pthread_cleanup_push; _pthread_cond_broadcast; _pthread_cond_destroy; _pthread_cond_init; _pthread_cond_signal; _pthread_cond_timedwait; _pthread_cond_wait; _pthread_condattr_destroy; _pthread_condattr_getclock; _pthread_condattr_getpshared; _pthread_condattr_init; _pthread_condattr_setclock; _pthread_condattr_setpshared; _pthread_create; _pthread_detach; _pthread_equal; _pthread_exit; _pthread_getaffinity_np; _pthread_getconcurrency; _pthread_getcpuclockid; _pthread_getprio; _pthread_getschedparam; _pthread_getspecific; _pthread_getthreadid_np; _pthread_join; _pthread_key_create; _pthread_key_delete; _pthread_kill; _pthread_main_np; _pthread_multi_np; _pthread_mutex_destroy; _pthread_mutex_getprioceiling; _pthread_mutex_getspinloops_np; _pthread_mutex_getyieldloops_np; _pthread_mutex_init; _pthread_mutex_init_calloc_cb; _pthread_mutex_isowned_np; _pthread_mutex_lock; _pthread_mutex_setprioceiling; _pthread_mutex_setspinloops_np; _pthread_mutex_setyieldloops_np; _pthread_mutex_timedlock; _pthread_mutex_trylock; _pthread_mutex_unlock; _pthread_mutexattr_destroy; _pthread_mutexattr_getkind_np; _pthread_mutexattr_getprioceiling; _pthread_mutexattr_getprotocol; _pthread_mutexattr_getpshared; _pthread_mutexattr_gettype; _pthread_mutexattr_init; _pthread_mutexattr_setkind_np; _pthread_mutexattr_setprioceiling; _pthread_mutexattr_setprotocol; _pthread_mutexattr_setpshared; _pthread_mutexattr_settype; _pthread_once; _pthread_resume_all_np; _pthread_resume_np; _pthread_rwlock_destroy; _pthread_rwlock_init; _pthread_rwlock_rdlock; _pthread_rwlock_timedrdlock; _pthread_rwlock_timedwrlock; _pthread_rwlock_tryrdlock; _pthread_rwlock_trywrlock; _pthread_rwlock_unlock; _pthread_rwlock_wrlock; _pthread_rwlockattr_destroy; _pthread_rwlockattr_getpshared; _pthread_rwlockattr_init; _pthread_rwlockattr_setpshared; _pthread_self; _pthread_set_name_np; _pthread_setaffinity_np; _pthread_setcancelstate; _pthread_setcanceltype; _pthread_setconcurrency; _pthread_setprio; _pthread_setschedparam; _pthread_setspecific; _pthread_sigmask; _pthread_single_np; _pthread_spin_destroy; _pthread_spin_init; _pthread_spin_lock; _pthread_spin_trylock; _pthread_spin_unlock; _pthread_suspend_all_np; _pthread_suspend_np; _pthread_switch_add_np; _pthread_switch_delete_np; _pthread_testcancel; _pthread_timedjoin_np; _pthread_yield; /* Debugger needs these. */ _libthr_debug; _thread_active_threads; _thread_bp_create; _thread_bp_death; _thread_event_mask; _thread_keytable; _thread_last_event; _thread_list; _thread_max_keys; _thread_off_attr_flags; _thread_off_dtv; _thread_off_event_buf; _thread_off_event_mask; _thread_off_key_allocated; _thread_off_key_destructor; _thread_off_linkmap; _thread_off_next; _thread_off_report_events; _thread_off_state; _thread_off_tcb; _thread_off_tid; _thread_off_tlsindex; _thread_size_key; _thread_state_running; _thread_state_zoombie; }; FBSD_1.1 { __pthread_cleanup_pop_imp; __pthread_cleanup_push_imp; pthread_attr_getaffinity_np; pthread_attr_setaffinity_np; pthread_getaffinity_np; pthread_getcpuclockid; pthread_setaffinity_np; pthread_mutex_getspinloops_np; pthread_mutex_getyieldloops_np; pthread_mutex_isowned_np; pthread_mutex_setspinloops_np; pthread_mutex_setyieldloops_np; }; FBSD_1.2 { pthread_getthreadid_np; }; + +FBSD_1.4 { + pthread_mutex_consistent; + pthread_mutexattr_getrobust; + pthread_mutexattr_setrobust; +}; Index: head/lib/libthr/thread/thr_cond.c =================================================================== --- head/lib/libthr/thread/thr_cond.c (revision 300042) +++ head/lib/libthr/thread/thr_cond.c (revision 300043) @@ -1,523 +1,533 @@ /* * Copyright (c) 2005 David Xu * Copyright (c) 2015 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "namespace.h" #include #include #include #include #include #include "un-namespace.h" #include "thr_private.h" _Static_assert(sizeof(struct pthread_cond) <= PAGE_SIZE, "pthread_cond too large"); /* * Prototypes */ int __pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex); int __pthread_cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex, const struct timespec * abstime); static int cond_init(pthread_cond_t *cond, const pthread_condattr_t *attr); static int cond_wait_common(pthread_cond_t *cond, pthread_mutex_t *mutex, const struct timespec *abstime, int cancel); static int cond_signal_common(pthread_cond_t *cond); static int cond_broadcast_common(pthread_cond_t *cond); /* * Double underscore versions are cancellation points. Single underscore * versions are not and are provided for libc internal usage (which * shouldn't introduce cancellation points). */ __weak_reference(__pthread_cond_wait, pthread_cond_wait); __weak_reference(__pthread_cond_timedwait, pthread_cond_timedwait); __weak_reference(_pthread_cond_init, pthread_cond_init); __weak_reference(_pthread_cond_destroy, pthread_cond_destroy); __weak_reference(_pthread_cond_signal, pthread_cond_signal); __weak_reference(_pthread_cond_broadcast, pthread_cond_broadcast); #define CV_PSHARED(cvp) (((cvp)->__flags & USYNC_PROCESS_SHARED) != 0) static void cond_init_body(struct pthread_cond *cvp, const struct pthread_cond_attr *cattr) { if (cattr == NULL) { cvp->__clock_id = CLOCK_REALTIME; } else { if (cattr->c_pshared) cvp->__flags |= USYNC_PROCESS_SHARED; cvp->__clock_id = cattr->c_clockid; } } static int cond_init(pthread_cond_t *cond, const pthread_condattr_t *cond_attr) { struct pthread_cond *cvp; const struct pthread_cond_attr *cattr; int pshared; cattr = cond_attr != NULL ? *cond_attr : NULL; if (cattr == NULL || cattr->c_pshared == PTHREAD_PROCESS_PRIVATE) { pshared = 0; cvp = calloc(1, sizeof(struct pthread_cond)); if (cvp == NULL) return (ENOMEM); } else { pshared = 1; cvp = __thr_pshared_offpage(cond, 1); if (cvp == NULL) return (EFAULT); } /* * Initialise the condition variable structure: */ cond_init_body(cvp, cattr); *cond = pshared ? THR_PSHARED_PTR : cvp; return (0); } static int init_static(struct pthread *thread, pthread_cond_t *cond) { int ret; THR_LOCK_ACQUIRE(thread, &_cond_static_lock); if (*cond == NULL) ret = cond_init(cond, NULL); else ret = 0; THR_LOCK_RELEASE(thread, &_cond_static_lock); return (ret); } #define CHECK_AND_INIT_COND \ if (*cond == THR_PSHARED_PTR) { \ cvp = __thr_pshared_offpage(cond, 0); \ if (cvp == NULL) \ return (EINVAL); \ } else if (__predict_false((cvp = (*cond)) <= THR_COND_DESTROYED)) { \ if (cvp == THR_COND_INITIALIZER) { \ int ret; \ ret = init_static(_get_curthread(), cond); \ if (ret) \ return (ret); \ } else if (cvp == THR_COND_DESTROYED) { \ return (EINVAL); \ } \ cvp = *cond; \ } int _pthread_cond_init(pthread_cond_t *cond, const pthread_condattr_t *cond_attr) { *cond = NULL; return (cond_init(cond, cond_attr)); } int _pthread_cond_destroy(pthread_cond_t *cond) { struct pthread_cond *cvp; int error; error = 0; if (*cond == THR_PSHARED_PTR) { cvp = __thr_pshared_offpage(cond, 0); if (cvp != NULL) __thr_pshared_destroy(cond); *cond = THR_COND_DESTROYED; } else if ((cvp = *cond) == THR_COND_INITIALIZER) { /* nothing */ } else if (cvp == THR_COND_DESTROYED) { error = EINVAL; } else { cvp = *cond; *cond = THR_COND_DESTROYED; free(cvp); } return (error); } /* * Cancellation behavior: * Thread may be canceled at start, if thread is canceled, it means it * did not get a wakeup from pthread_cond_signal(), otherwise, it is * not canceled. * Thread cancellation never cause wakeup from pthread_cond_signal() * to be lost. */ static int cond_wait_kernel(struct pthread_cond *cvp, struct pthread_mutex *mp, - const struct timespec *abstime, int cancel) + const struct timespec *abstime, int cancel) { - struct pthread *curthread = _get_curthread(); - int recurse; - int error, error2 = 0; + struct pthread *curthread; + int error, error2, recurse, robust; + curthread = _get_curthread(); + robust = _mutex_enter_robust(curthread, mp); + error = _mutex_cv_detach(mp, &recurse); - if (error != 0) + if (error != 0) { + if (robust) + _mutex_leave_robust(curthread, mp); return (error); + } - if (cancel) { + if (cancel) _thr_cancel_enter2(curthread, 0); - error = _thr_ucond_wait((struct ucond *)&cvp->__has_kern_waiters, - (struct umutex *)&mp->m_lock, abstime, - CVWAIT_ABSTIME|CVWAIT_CLOCKID); + error = _thr_ucond_wait((struct ucond *)&cvp->__has_kern_waiters, + (struct umutex *)&mp->m_lock, abstime, CVWAIT_ABSTIME | + CVWAIT_CLOCKID); + if (cancel) _thr_cancel_leave(curthread, 0); - } else { - error = _thr_ucond_wait((struct ucond *)&cvp->__has_kern_waiters, - (struct umutex *)&mp->m_lock, abstime, - CVWAIT_ABSTIME|CVWAIT_CLOCKID); - } /* * Note that PP mutex and ROBUST mutex may return * interesting error codes. */ if (error == 0) { - error2 = _mutex_cv_lock(mp, recurse); + error2 = _mutex_cv_lock(mp, recurse, true); } else if (error == EINTR || error == ETIMEDOUT) { - error2 = _mutex_cv_lock(mp, recurse); + error2 = _mutex_cv_lock(mp, recurse, true); + /* + * Do not do cancellation on EOWNERDEAD there. The + * cancellation cleanup handler will use the protected + * state and unlock the mutex without making the state + * consistent and the state will be unrecoverable. + */ if (error2 == 0 && cancel) _thr_testcancel(curthread); + if (error == EINTR) error = 0; } else { /* We know that it didn't unlock the mutex. */ - error2 = _mutex_cv_attach(mp, recurse); - if (error2 == 0 && cancel) + _mutex_cv_attach(mp, recurse); + if (cancel) _thr_testcancel(curthread); + error2 = 0; } + if (robust) + _mutex_leave_robust(curthread, mp); return (error2 != 0 ? error2 : error); } /* * Thread waits in userland queue whenever possible, when thread * is signaled or broadcasted, it is removed from the queue, and * is saved in curthread's defer_waiters[] buffer, but won't be * woken up until mutex is unlocked. */ static int cond_wait_user(struct pthread_cond *cvp, struct pthread_mutex *mp, - const struct timespec *abstime, int cancel) + const struct timespec *abstime, int cancel) { - struct pthread *curthread = _get_curthread(); + struct pthread *curthread; struct sleepqueue *sq; - int recurse; - int error; - int defered; + int deferred, error, error2, recurse; + curthread = _get_curthread(); if (curthread->wchan != NULL) PANIC("thread was already on queue."); if (cancel) _thr_testcancel(curthread); _sleepq_lock(cvp); /* * set __has_user_waiters before unlocking mutex, this allows * us to check it without locking in pthread_cond_signal(). */ cvp->__has_user_waiters = 1; - defered = 0; - (void)_mutex_cv_unlock(mp, &recurse, &defered); + deferred = 0; + (void)_mutex_cv_unlock(mp, &recurse, &deferred); curthread->mutex_obj = mp; _sleepq_add(cvp, curthread); for(;;) { _thr_clear_wake(curthread); _sleepq_unlock(cvp); - if (defered) { - defered = 0; + if (deferred) { + deferred = 0; if ((mp->m_lock.m_owner & UMUTEX_CONTESTED) == 0) - (void)_umtx_op_err(&mp->m_lock, UMTX_OP_MUTEX_WAKE2, - mp->m_lock.m_flags, 0, 0); + (void)_umtx_op_err(&mp->m_lock, + UMTX_OP_MUTEX_WAKE2, mp->m_lock.m_flags, + 0, 0); } if (curthread->nwaiter_defer > 0) { _thr_wake_all(curthread->defer_waiters, - curthread->nwaiter_defer); + curthread->nwaiter_defer); curthread->nwaiter_defer = 0; } - if (cancel) { + if (cancel) _thr_cancel_enter2(curthread, 0); - error = _thr_sleep(curthread, cvp->__clock_id, abstime); + error = _thr_sleep(curthread, cvp->__clock_id, abstime); + if (cancel) _thr_cancel_leave(curthread, 0); - } else { - error = _thr_sleep(curthread, cvp->__clock_id, abstime); - } _sleepq_lock(cvp); if (curthread->wchan == NULL) { error = 0; break; } else if (cancel && SHOULD_CANCEL(curthread)) { sq = _sleepq_lookup(cvp); - cvp->__has_user_waiters = - _sleepq_remove(sq, curthread); + cvp->__has_user_waiters = _sleepq_remove(sq, curthread); _sleepq_unlock(cvp); curthread->mutex_obj = NULL; - _mutex_cv_lock(mp, recurse); + error2 = _mutex_cv_lock(mp, recurse, false); if (!THR_IN_CRITICAL(curthread)) _pthread_exit(PTHREAD_CANCELED); else /* this should not happen */ - return (0); + return (error2); } else if (error == ETIMEDOUT) { sq = _sleepq_lookup(cvp); cvp->__has_user_waiters = - _sleepq_remove(sq, curthread); + _sleepq_remove(sq, curthread); break; } } _sleepq_unlock(cvp); curthread->mutex_obj = NULL; - _mutex_cv_lock(mp, recurse); + error2 = _mutex_cv_lock(mp, recurse, false); + if (error == 0) + error = error2; return (error); } static int cond_wait_common(pthread_cond_t *cond, pthread_mutex_t *mutex, const struct timespec *abstime, int cancel) { struct pthread *curthread = _get_curthread(); struct pthread_cond *cvp; struct pthread_mutex *mp; int error; CHECK_AND_INIT_COND if (*mutex == THR_PSHARED_PTR) { mp = __thr_pshared_offpage(mutex, 0); if (mp == NULL) return (EINVAL); } else { mp = *mutex; } if ((error = _mutex_owned(curthread, mp)) != 0) return (error); if (curthread->attr.sched_policy != SCHED_OTHER || - (mp->m_lock.m_flags & (UMUTEX_PRIO_PROTECT|UMUTEX_PRIO_INHERIT| - USYNC_PROCESS_SHARED)) != 0 || + (mp->m_lock.m_flags & (UMUTEX_PRIO_PROTECT | UMUTEX_PRIO_INHERIT | + USYNC_PROCESS_SHARED)) != 0 || (cvp->__flags & USYNC_PROCESS_SHARED) != 0) - return cond_wait_kernel(cvp, mp, abstime, cancel); + return (cond_wait_kernel(cvp, mp, abstime, cancel)); else - return cond_wait_user(cvp, mp, abstime, cancel); + return (cond_wait_user(cvp, mp, abstime, cancel)); } int _pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex) { return (cond_wait_common(cond, mutex, NULL, 0)); } int __pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex) { return (cond_wait_common(cond, mutex, NULL, 1)); } int _pthread_cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex, const struct timespec * abstime) { if (abstime == NULL || abstime->tv_sec < 0 || abstime->tv_nsec < 0 || abstime->tv_nsec >= 1000000000) return (EINVAL); return (cond_wait_common(cond, mutex, abstime, 0)); } int __pthread_cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex, const struct timespec *abstime) { if (abstime == NULL || abstime->tv_sec < 0 || abstime->tv_nsec < 0 || abstime->tv_nsec >= 1000000000) return (EINVAL); return (cond_wait_common(cond, mutex, abstime, 1)); } static int cond_signal_common(pthread_cond_t *cond) { struct pthread *curthread = _get_curthread(); struct pthread *td; struct pthread_cond *cvp; struct pthread_mutex *mp; struct sleepqueue *sq; int *waddr; int pshared; /* * If the condition variable is statically initialized, perform dynamic * initialization. */ CHECK_AND_INIT_COND pshared = CV_PSHARED(cvp); _thr_ucond_signal((struct ucond *)&cvp->__has_kern_waiters); if (pshared || cvp->__has_user_waiters == 0) return (0); curthread = _get_curthread(); waddr = NULL; _sleepq_lock(cvp); sq = _sleepq_lookup(cvp); if (sq == NULL) { _sleepq_unlock(cvp); return (0); } td = _sleepq_first(sq); mp = td->mutex_obj; cvp->__has_user_waiters = _sleepq_remove(sq, td); - if (mp->m_owner == TID(curthread)) { + if (PMUTEX_OWNER_ID(mp) == TID(curthread)) { if (curthread->nwaiter_defer >= MAX_DEFER_WAITERS) { _thr_wake_all(curthread->defer_waiters, - curthread->nwaiter_defer); + curthread->nwaiter_defer); curthread->nwaiter_defer = 0; } curthread->defer_waiters[curthread->nwaiter_defer++] = - &td->wake_addr->value; - mp->m_flags |= PMUTEX_FLAG_DEFERED; + &td->wake_addr->value; + mp->m_flags |= PMUTEX_FLAG_DEFERRED; } else { waddr = &td->wake_addr->value; } _sleepq_unlock(cvp); if (waddr != NULL) _thr_set_wake(waddr); return (0); } struct broadcast_arg { struct pthread *curthread; unsigned int *waddrs[MAX_DEFER_WAITERS]; int count; }; static void drop_cb(struct pthread *td, void *arg) { struct broadcast_arg *ba = arg; struct pthread_mutex *mp; struct pthread *curthread = ba->curthread; mp = td->mutex_obj; - if (mp->m_owner == TID(curthread)) { + if (PMUTEX_OWNER_ID(mp) == TID(curthread)) { if (curthread->nwaiter_defer >= MAX_DEFER_WAITERS) { _thr_wake_all(curthread->defer_waiters, - curthread->nwaiter_defer); + curthread->nwaiter_defer); curthread->nwaiter_defer = 0; } curthread->defer_waiters[curthread->nwaiter_defer++] = - &td->wake_addr->value; - mp->m_flags |= PMUTEX_FLAG_DEFERED; + &td->wake_addr->value; + mp->m_flags |= PMUTEX_FLAG_DEFERRED; } else { if (ba->count >= MAX_DEFER_WAITERS) { _thr_wake_all(ba->waddrs, ba->count); ba->count = 0; } ba->waddrs[ba->count++] = &td->wake_addr->value; } } static int cond_broadcast_common(pthread_cond_t *cond) { int pshared; struct pthread_cond *cvp; struct sleepqueue *sq; struct broadcast_arg ba; /* * If the condition variable is statically initialized, perform dynamic * initialization. */ CHECK_AND_INIT_COND pshared = CV_PSHARED(cvp); _thr_ucond_broadcast((struct ucond *)&cvp->__has_kern_waiters); if (pshared || cvp->__has_user_waiters == 0) return (0); ba.curthread = _get_curthread(); ba.count = 0; _sleepq_lock(cvp); sq = _sleepq_lookup(cvp); if (sq == NULL) { _sleepq_unlock(cvp); return (0); } _sleepq_drop(sq, drop_cb, &ba); cvp->__has_user_waiters = 0; _sleepq_unlock(cvp); if (ba.count > 0) _thr_wake_all(ba.waddrs, ba.count); return (0); } int _pthread_cond_signal(pthread_cond_t * cond) { return (cond_signal_common(cond)); } int _pthread_cond_broadcast(pthread_cond_t * cond) { return (cond_broadcast_common(cond)); } Index: head/lib/libthr/thread/thr_init.c =================================================================== --- head/lib/libthr/thread/thr_init.c (revision 300042) +++ head/lib/libthr/thread/thr_init.c (revision 300043) @@ -1,491 +1,496 @@ /* * Copyright (c) 2003 Daniel M. Eischen * Copyright (c) 1995-1998 John Birrell * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by John Birrell. * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "namespace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "un-namespace.h" #include "libc_private.h" #include "thr_private.h" char *_usrstack; struct pthread *_thr_initial; int _libthr_debug; int _thread_event_mask; struct pthread *_thread_last_event; pthreadlist _thread_list = TAILQ_HEAD_INITIALIZER(_thread_list); pthreadlist _thread_gc_list = TAILQ_HEAD_INITIALIZER(_thread_gc_list); int _thread_active_threads = 1; atfork_head _thr_atfork_list = TAILQ_HEAD_INITIALIZER(_thr_atfork_list); struct urwlock _thr_atfork_lock = DEFAULT_URWLOCK; struct pthread_prio _thr_priorities[3] = { {RTP_PRIO_MIN, RTP_PRIO_MAX, 0}, /* FIFO */ {0, 0, 63}, /* OTHER */ {RTP_PRIO_MIN, RTP_PRIO_MAX, 0} /* RR */ }; struct pthread_attr _pthread_attr_default = { .sched_policy = SCHED_OTHER, .sched_inherit = PTHREAD_INHERIT_SCHED, .prio = 0, .suspend = THR_CREATE_RUNNING, .flags = PTHREAD_SCOPE_SYSTEM, .stackaddr_attr = NULL, .stacksize_attr = THR_STACK_DEFAULT, .guardsize_attr = 0, .cpusetsize = 0, .cpuset = NULL }; struct pthread_mutex_attr _pthread_mutexattr_default = { .m_type = PTHREAD_MUTEX_DEFAULT, .m_protocol = PTHREAD_PRIO_NONE, .m_ceiling = 0, .m_pshared = PTHREAD_PROCESS_PRIVATE, + .m_robust = PTHREAD_MUTEX_STALLED, }; struct pthread_mutex_attr _pthread_mutexattr_adaptive_default = { .m_type = PTHREAD_MUTEX_ADAPTIVE_NP, .m_protocol = PTHREAD_PRIO_NONE, .m_ceiling = 0, .m_pshared = PTHREAD_PROCESS_PRIVATE, + .m_robust = PTHREAD_MUTEX_STALLED, }; /* Default condition variable attributes: */ struct pthread_cond_attr _pthread_condattr_default = { .c_pshared = PTHREAD_PROCESS_PRIVATE, .c_clockid = CLOCK_REALTIME }; int _thr_is_smp = 0; size_t _thr_guard_default; size_t _thr_stack_default = THR_STACK_DEFAULT; size_t _thr_stack_initial = THR_STACK_INITIAL; int _thr_page_size; int _thr_spinloops; int _thr_yieldloops; int _thr_queuefifo = 4; int _gc_count; struct umutex _mutex_static_lock = DEFAULT_UMUTEX; struct umutex _cond_static_lock = DEFAULT_UMUTEX; struct umutex _rwlock_static_lock = DEFAULT_UMUTEX; struct umutex _keytable_lock = DEFAULT_UMUTEX; struct urwlock _thr_list_lock = DEFAULT_URWLOCK; struct umutex _thr_event_lock = DEFAULT_UMUTEX; struct umutex _suspend_all_lock = DEFAULT_UMUTEX; struct pthread *_single_thread; int _suspend_all_cycle; int _suspend_all_waiters; int __pthread_cond_wait(pthread_cond_t *, pthread_mutex_t *); int __pthread_mutex_lock(pthread_mutex_t *); int __pthread_mutex_trylock(pthread_mutex_t *); void _thread_init_hack(void) __attribute__ ((constructor)); static void init_private(void); static void init_main_thread(struct pthread *thread); /* * All weak references used within libc should be in this table. * This is so that static libraries will work. */ STATIC_LIB_REQUIRE(_fork); STATIC_LIB_REQUIRE(_pthread_getspecific); STATIC_LIB_REQUIRE(_pthread_key_create); STATIC_LIB_REQUIRE(_pthread_key_delete); STATIC_LIB_REQUIRE(_pthread_mutex_destroy); STATIC_LIB_REQUIRE(_pthread_mutex_init); STATIC_LIB_REQUIRE(_pthread_mutex_lock); STATIC_LIB_REQUIRE(_pthread_mutex_trylock); STATIC_LIB_REQUIRE(_pthread_mutex_unlock); STATIC_LIB_REQUIRE(_pthread_mutexattr_init); STATIC_LIB_REQUIRE(_pthread_mutexattr_destroy); STATIC_LIB_REQUIRE(_pthread_mutexattr_settype); STATIC_LIB_REQUIRE(_pthread_once); STATIC_LIB_REQUIRE(_pthread_setspecific); STATIC_LIB_REQUIRE(_raise); STATIC_LIB_REQUIRE(_sem_destroy); STATIC_LIB_REQUIRE(_sem_getvalue); STATIC_LIB_REQUIRE(_sem_init); STATIC_LIB_REQUIRE(_sem_post); STATIC_LIB_REQUIRE(_sem_timedwait); STATIC_LIB_REQUIRE(_sem_trywait); STATIC_LIB_REQUIRE(_sem_wait); STATIC_LIB_REQUIRE(_sigaction); STATIC_LIB_REQUIRE(_sigprocmask); STATIC_LIB_REQUIRE(_sigsuspend); STATIC_LIB_REQUIRE(_sigtimedwait); STATIC_LIB_REQUIRE(_sigwait); STATIC_LIB_REQUIRE(_sigwaitinfo); STATIC_LIB_REQUIRE(_spinlock); STATIC_LIB_REQUIRE(_spinlock_debug); STATIC_LIB_REQUIRE(_spinunlock); STATIC_LIB_REQUIRE(_thread_init_hack); /* * These are needed when linking statically. All references within * libgcc (and in the future libc) to these routines are weak, but * if they are not (strongly) referenced by the application or other * libraries, then the actual functions will not be loaded. */ STATIC_LIB_REQUIRE(_pthread_once); STATIC_LIB_REQUIRE(_pthread_key_create); STATIC_LIB_REQUIRE(_pthread_key_delete); STATIC_LIB_REQUIRE(_pthread_getspecific); STATIC_LIB_REQUIRE(_pthread_setspecific); STATIC_LIB_REQUIRE(_pthread_mutex_init); STATIC_LIB_REQUIRE(_pthread_mutex_destroy); STATIC_LIB_REQUIRE(_pthread_mutex_lock); STATIC_LIB_REQUIRE(_pthread_mutex_trylock); STATIC_LIB_REQUIRE(_pthread_mutex_unlock); STATIC_LIB_REQUIRE(_pthread_create); /* Pull in all symbols required by libthread_db */ STATIC_LIB_REQUIRE(_thread_state_running); #define DUAL_ENTRY(entry) \ (pthread_func_t)entry, (pthread_func_t)entry static pthread_func_t jmp_table[][2] = { {DUAL_ENTRY(_pthread_atfork)}, /* PJT_ATFORK */ {DUAL_ENTRY(_pthread_attr_destroy)}, /* PJT_ATTR_DESTROY */ {DUAL_ENTRY(_pthread_attr_getdetachstate)}, /* PJT_ATTR_GETDETACHSTATE */ {DUAL_ENTRY(_pthread_attr_getguardsize)}, /* PJT_ATTR_GETGUARDSIZE */ {DUAL_ENTRY(_pthread_attr_getinheritsched)}, /* PJT_ATTR_GETINHERITSCHED */ {DUAL_ENTRY(_pthread_attr_getschedparam)}, /* PJT_ATTR_GETSCHEDPARAM */ {DUAL_ENTRY(_pthread_attr_getschedpolicy)}, /* PJT_ATTR_GETSCHEDPOLICY */ {DUAL_ENTRY(_pthread_attr_getscope)}, /* PJT_ATTR_GETSCOPE */ {DUAL_ENTRY(_pthread_attr_getstackaddr)}, /* PJT_ATTR_GETSTACKADDR */ {DUAL_ENTRY(_pthread_attr_getstacksize)}, /* PJT_ATTR_GETSTACKSIZE */ {DUAL_ENTRY(_pthread_attr_init)}, /* PJT_ATTR_INIT */ {DUAL_ENTRY(_pthread_attr_setdetachstate)}, /* PJT_ATTR_SETDETACHSTATE */ {DUAL_ENTRY(_pthread_attr_setguardsize)}, /* PJT_ATTR_SETGUARDSIZE */ {DUAL_ENTRY(_pthread_attr_setinheritsched)}, /* PJT_ATTR_SETINHERITSCHED */ {DUAL_ENTRY(_pthread_attr_setschedparam)}, /* PJT_ATTR_SETSCHEDPARAM */ {DUAL_ENTRY(_pthread_attr_setschedpolicy)}, /* PJT_ATTR_SETSCHEDPOLICY */ {DUAL_ENTRY(_pthread_attr_setscope)}, /* PJT_ATTR_SETSCOPE */ {DUAL_ENTRY(_pthread_attr_setstackaddr)}, /* PJT_ATTR_SETSTACKADDR */ {DUAL_ENTRY(_pthread_attr_setstacksize)}, /* PJT_ATTR_SETSTACKSIZE */ {DUAL_ENTRY(_pthread_cancel)}, /* PJT_CANCEL */ {DUAL_ENTRY(_pthread_cleanup_pop)}, /* PJT_CLEANUP_POP */ {DUAL_ENTRY(_pthread_cleanup_push)}, /* PJT_CLEANUP_PUSH */ {DUAL_ENTRY(_pthread_cond_broadcast)}, /* PJT_COND_BROADCAST */ {DUAL_ENTRY(_pthread_cond_destroy)}, /* PJT_COND_DESTROY */ {DUAL_ENTRY(_pthread_cond_init)}, /* PJT_COND_INIT */ {DUAL_ENTRY(_pthread_cond_signal)}, /* PJT_COND_SIGNAL */ {DUAL_ENTRY(_pthread_cond_timedwait)}, /* PJT_COND_TIMEDWAIT */ {(pthread_func_t)__pthread_cond_wait, (pthread_func_t)_pthread_cond_wait}, /* PJT_COND_WAIT */ {DUAL_ENTRY(_pthread_detach)}, /* PJT_DETACH */ {DUAL_ENTRY(_pthread_equal)}, /* PJT_EQUAL */ {DUAL_ENTRY(_pthread_exit)}, /* PJT_EXIT */ {DUAL_ENTRY(_pthread_getspecific)}, /* PJT_GETSPECIFIC */ {DUAL_ENTRY(_pthread_join)}, /* PJT_JOIN */ {DUAL_ENTRY(_pthread_key_create)}, /* PJT_KEY_CREATE */ {DUAL_ENTRY(_pthread_key_delete)}, /* PJT_KEY_DELETE*/ {DUAL_ENTRY(_pthread_kill)}, /* PJT_KILL */ {DUAL_ENTRY(_pthread_main_np)}, /* PJT_MAIN_NP */ {DUAL_ENTRY(_pthread_mutexattr_destroy)}, /* PJT_MUTEXATTR_DESTROY */ {DUAL_ENTRY(_pthread_mutexattr_init)}, /* PJT_MUTEXATTR_INIT */ {DUAL_ENTRY(_pthread_mutexattr_settype)}, /* PJT_MUTEXATTR_SETTYPE */ {DUAL_ENTRY(_pthread_mutex_destroy)}, /* PJT_MUTEX_DESTROY */ {DUAL_ENTRY(_pthread_mutex_init)}, /* PJT_MUTEX_INIT */ {(pthread_func_t)__pthread_mutex_lock, (pthread_func_t)_pthread_mutex_lock}, /* PJT_MUTEX_LOCK */ {(pthread_func_t)__pthread_mutex_trylock, (pthread_func_t)_pthread_mutex_trylock},/* PJT_MUTEX_TRYLOCK */ {DUAL_ENTRY(_pthread_mutex_unlock)}, /* PJT_MUTEX_UNLOCK */ {DUAL_ENTRY(_pthread_once)}, /* PJT_ONCE */ {DUAL_ENTRY(_pthread_rwlock_destroy)}, /* PJT_RWLOCK_DESTROY */ {DUAL_ENTRY(_pthread_rwlock_init)}, /* PJT_RWLOCK_INIT */ {DUAL_ENTRY(_pthread_rwlock_rdlock)}, /* PJT_RWLOCK_RDLOCK */ {DUAL_ENTRY(_pthread_rwlock_tryrdlock)},/* PJT_RWLOCK_TRYRDLOCK */ {DUAL_ENTRY(_pthread_rwlock_trywrlock)},/* PJT_RWLOCK_TRYWRLOCK */ {DUAL_ENTRY(_pthread_rwlock_unlock)}, /* PJT_RWLOCK_UNLOCK */ {DUAL_ENTRY(_pthread_rwlock_wrlock)}, /* PJT_RWLOCK_WRLOCK */ {DUAL_ENTRY(_pthread_self)}, /* PJT_SELF */ {DUAL_ENTRY(_pthread_setcancelstate)}, /* PJT_SETCANCELSTATE */ {DUAL_ENTRY(_pthread_setcanceltype)}, /* PJT_SETCANCELTYPE */ {DUAL_ENTRY(_pthread_setspecific)}, /* PJT_SETSPECIFIC */ {DUAL_ENTRY(_pthread_sigmask)}, /* PJT_SIGMASK */ {DUAL_ENTRY(_pthread_testcancel)}, /* PJT_TESTCANCEL */ {DUAL_ENTRY(__pthread_cleanup_pop_imp)},/* PJT_CLEANUP_POP_IMP */ {DUAL_ENTRY(__pthread_cleanup_push_imp)},/* PJT_CLEANUP_PUSH_IMP */ {DUAL_ENTRY(_pthread_cancel_enter)}, /* PJT_CANCEL_ENTER */ - {DUAL_ENTRY(_pthread_cancel_leave)} /* PJT_CANCEL_LEAVE */ + {DUAL_ENTRY(_pthread_cancel_leave)}, /* PJT_CANCEL_LEAVE */ + {DUAL_ENTRY(_pthread_mutex_consistent)},/* PJT_MUTEX_CONSISTENT */ + {DUAL_ENTRY(_pthread_mutexattr_getrobust)},/* PJT_MUTEXATTR_GETROBUST */ + {DUAL_ENTRY(_pthread_mutexattr_setrobust)},/* PJT_MUTEXATTR_SETROBUST */ }; static int init_once = 0; /* * For the shared version of the threads library, the above is sufficient. * But for the archive version of the library, we need a little bit more. * Namely, we must arrange for this particular module to be pulled in from * the archive library at link time. To accomplish that, we define and * initialize a variable, "_thread_autoinit_dummy_decl". This variable is * referenced (as an extern) from libc/stdlib/exit.c. This will always * create a need for this module, ensuring that it is present in the * executable. */ extern int _thread_autoinit_dummy_decl; int _thread_autoinit_dummy_decl = 0; void _thread_init_hack(void) { _libpthread_init(NULL); } /* * Threaded process initialization. * * This is only called under two conditions: * * 1) Some thread routines have detected that the library hasn't yet * been initialized (_thr_initial == NULL && curthread == NULL), or * * 2) An explicit call to reinitialize after a fork (indicated * by curthread != NULL) */ void _libpthread_init(struct pthread *curthread) { int first, dlopened; /* Check if this function has already been called: */ - if ((_thr_initial != NULL) && (curthread == NULL)) + if (_thr_initial != NULL && curthread == NULL) /* Only initialize the threaded application once. */ return; /* * Check the size of the jump table to make sure it is preset * with the correct number of entries. */ - if (sizeof(jmp_table) != (sizeof(pthread_func_t) * PJT_MAX * 2)) + if (sizeof(jmp_table) != sizeof(pthread_func_t) * PJT_MAX * 2) PANIC("Thread jump table not properly initialized"); memcpy(__thr_jtable, jmp_table, sizeof(jmp_table)); __thr_interpose_libc(); /* Initialize pthread private data. */ init_private(); /* Set the initial thread. */ if (curthread == NULL) { first = 1; /* Create and initialize the initial thread. */ curthread = _thr_alloc(NULL); if (curthread == NULL) PANIC("Can't allocate initial thread"); init_main_thread(curthread); } else { first = 0; } /* * Add the thread to the thread list queue. */ THR_LIST_ADD(curthread); _thread_active_threads = 1; /* Setup the thread specific data */ _tcb_set(curthread->tcb); if (first) { _thr_initial = curthread; dlopened = _rtld_is_dlopened(&_thread_autoinit_dummy_decl) != 0; _thr_signal_init(dlopened); if (_thread_event_mask & TD_CREATE) _thr_report_creation(curthread, curthread); /* * Always use our rtld lock implementation. * It is faster because it postpones signal handlers * instead of calling sigprocmask(2). */ _thr_rtld_init(); } } /* * This function and pthread_create() do a lot of the same things. * It'd be nice to consolidate the common stuff in one place. */ static void init_main_thread(struct pthread *thread) { struct sched_param sched_param; int i; /* Setup the thread attributes. */ thr_self(&thread->tid); thread->attr = _pthread_attr_default; /* * Set up the thread stack. * * Create a red zone below the main stack. All other stacks * are constrained to a maximum size by the parameters * passed to mmap(), but this stack is only limited by * resource limits, so this stack needs an explicitly mapped * red zone to protect the thread stack that is just beyond. */ if (mmap(_usrstack - _thr_stack_initial - _thr_guard_default, _thr_guard_default, 0, MAP_ANON, -1, 0) == MAP_FAILED) PANIC("Cannot allocate red zone for initial thread"); /* * Mark the stack as an application supplied stack so that it * isn't deallocated. * * XXX - I'm not sure it would hurt anything to deallocate * the main thread stack because deallocation doesn't * actually free() it; it just puts it in the free * stack queue for later reuse. */ thread->attr.stackaddr_attr = _usrstack - _thr_stack_initial; thread->attr.stacksize_attr = _thr_stack_initial; thread->attr.guardsize_attr = _thr_guard_default; thread->attr.flags |= THR_STACK_USER; /* * Write a magic value to the thread structure * to help identify valid ones: */ thread->magic = THR_MAGIC; thread->cancel_enable = 1; thread->cancel_async = 0; /* Initialize the mutex queues */ for (i = 0; i < TMQ_NITEMS; i++) TAILQ_INIT(&thread->mq[i]); thread->state = PS_RUNNING; _thr_getscheduler(thread->tid, &thread->attr.sched_policy, &sched_param); thread->attr.prio = sched_param.sched_priority; #ifdef _PTHREAD_FORCED_UNWIND thread->unwind_stackend = _usrstack; #endif /* Others cleared to zero by thr_alloc() */ } static void init_private(void) { struct rlimit rlim; size_t len; int mib[2]; char *env, *env_bigstack, *env_splitstack; _thr_umutex_init(&_mutex_static_lock); _thr_umutex_init(&_cond_static_lock); _thr_umutex_init(&_rwlock_static_lock); _thr_umutex_init(&_keytable_lock); _thr_urwlock_init(&_thr_atfork_lock); _thr_umutex_init(&_thr_event_lock); _thr_umutex_init(&_suspend_all_lock); _thr_once_init(); _thr_spinlock_init(); _thr_list_init(); _thr_wake_addr_init(); _sleepq_init(); _single_thread = NULL; _suspend_all_waiters = 0; /* * Avoid reinitializing some things if they don't need to be, * e.g. after a fork(). */ if (init_once == 0) { __thr_pshared_init(); /* Find the stack top */ mib[0] = CTL_KERN; mib[1] = KERN_USRSTACK; len = sizeof (_usrstack); if (sysctl(mib, 2, &_usrstack, &len, NULL, 0) == -1) PANIC("Cannot get kern.usrstack from sysctl"); env_bigstack = getenv("LIBPTHREAD_BIGSTACK_MAIN"); env_splitstack = getenv("LIBPTHREAD_SPLITSTACK_MAIN"); if (env_bigstack != NULL || env_splitstack == NULL) { if (getrlimit(RLIMIT_STACK, &rlim) == -1) PANIC("Cannot get stack rlimit"); _thr_stack_initial = rlim.rlim_cur; } len = sizeof(_thr_is_smp); sysctlbyname("kern.smp.cpus", &_thr_is_smp, &len, NULL, 0); _thr_is_smp = (_thr_is_smp > 1); _thr_page_size = getpagesize(); _thr_guard_default = _thr_page_size; _pthread_attr_default.guardsize_attr = _thr_guard_default; _pthread_attr_default.stacksize_attr = _thr_stack_default; env = getenv("LIBPTHREAD_SPINLOOPS"); if (env) _thr_spinloops = atoi(env); env = getenv("LIBPTHREAD_YIELDLOOPS"); if (env) _thr_yieldloops = atoi(env); env = getenv("LIBPTHREAD_QUEUE_FIFO"); if (env) _thr_queuefifo = atoi(env); TAILQ_INIT(&_thr_atfork_list); } init_once = 1; } Index: head/lib/libthr/thread/thr_mutex.c =================================================================== --- head/lib/libthr/thread/thr_mutex.c (revision 300042) +++ head/lib/libthr/thread/thr_mutex.c (revision 300043) @@ -1,1026 +1,1200 @@ /* * Copyright (c) 1995 John Birrell . * Copyright (c) 2006 David Xu . - * Copyright (c) 2015 The FreeBSD Foundation + * Copyright (c) 2015, 2016 The FreeBSD Foundation * * All rights reserved. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by John Birrell. * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); -#include #include "namespace.h" #include #include #include #include #include #include #include #include "un-namespace.h" #include "thr_private.h" _Static_assert(sizeof(struct pthread_mutex) <= PAGE_SIZE, "pthread_mutex is too large for off-page"); /* * For adaptive mutexes, how many times to spin doing trylock2 * before entering the kernel to block */ #define MUTEX_ADAPTIVE_SPINS 2000 /* * Prototypes */ +int __pthread_mutex_consistent(pthread_mutex_t *mutex); int __pthread_mutex_init(pthread_mutex_t *mutex, const pthread_mutexattr_t *mutex_attr); int __pthread_mutex_trylock(pthread_mutex_t *mutex); int __pthread_mutex_lock(pthread_mutex_t *mutex); int __pthread_mutex_timedlock(pthread_mutex_t *mutex, const struct timespec *abstime); int _pthread_mutex_init_calloc_cb(pthread_mutex_t *mutex, void *(calloc_cb)(size_t, size_t)); int _pthread_mutex_getspinloops_np(pthread_mutex_t *mutex, int *count); int _pthread_mutex_setspinloops_np(pthread_mutex_t *mutex, int count); int __pthread_mutex_setspinloops_np(pthread_mutex_t *mutex, int count); int _pthread_mutex_setyieldloops_np(pthread_mutex_t *mutex, int count); int _pthread_mutex_getyieldloops_np(pthread_mutex_t *mutex, int *count); int __pthread_mutex_setyieldloops_np(pthread_mutex_t *mutex, int count); static int mutex_self_trylock(pthread_mutex_t); static int mutex_self_lock(pthread_mutex_t, const struct timespec *abstime); -static int mutex_unlock_common(struct pthread_mutex *, int, int *); +static int mutex_unlock_common(struct pthread_mutex *, bool, int *); static int mutex_lock_sleep(struct pthread *, pthread_mutex_t, const struct timespec *); +static void mutex_init_robust(struct pthread *curthread); +static int mutex_qidx(struct pthread_mutex *m); +static bool is_robust_mutex(struct pthread_mutex *m); +static bool is_pshared_mutex(struct pthread_mutex *m); __weak_reference(__pthread_mutex_init, pthread_mutex_init); __strong_reference(__pthread_mutex_init, _pthread_mutex_init); __weak_reference(__pthread_mutex_lock, pthread_mutex_lock); __strong_reference(__pthread_mutex_lock, _pthread_mutex_lock); __weak_reference(__pthread_mutex_timedlock, pthread_mutex_timedlock); __strong_reference(__pthread_mutex_timedlock, _pthread_mutex_timedlock); __weak_reference(__pthread_mutex_trylock, pthread_mutex_trylock); __strong_reference(__pthread_mutex_trylock, _pthread_mutex_trylock); +__weak_reference(_pthread_mutex_consistent, pthread_mutex_consistent); +__strong_reference(_pthread_mutex_consistent, __pthread_mutex_consistent); /* Single underscore versions provided for libc internal usage: */ /* No difference between libc and application usage of these: */ __weak_reference(_pthread_mutex_destroy, pthread_mutex_destroy); __weak_reference(_pthread_mutex_unlock, pthread_mutex_unlock); __weak_reference(_pthread_mutex_getprioceiling, pthread_mutex_getprioceiling); __weak_reference(_pthread_mutex_setprioceiling, pthread_mutex_setprioceiling); __weak_reference(__pthread_mutex_setspinloops_np, pthread_mutex_setspinloops_np); __strong_reference(__pthread_mutex_setspinloops_np, _pthread_mutex_setspinloops_np); __weak_reference(_pthread_mutex_getspinloops_np, pthread_mutex_getspinloops_np); __weak_reference(__pthread_mutex_setyieldloops_np, pthread_mutex_setyieldloops_np); __strong_reference(__pthread_mutex_setyieldloops_np, _pthread_mutex_setyieldloops_np); __weak_reference(_pthread_mutex_getyieldloops_np, pthread_mutex_getyieldloops_np); __weak_reference(_pthread_mutex_isowned_np, pthread_mutex_isowned_np); static void mutex_init_link(struct pthread_mutex *m) { #if defined(_PTHREADS_INVARIANTS) m->m_qe.tqe_prev = NULL; m->m_qe.tqe_next = NULL; m->m_pqe.tqe_prev = NULL; m->m_pqe.tqe_next = NULL; #endif } static void -mutex_assert_is_owned(struct pthread_mutex *m) +mutex_assert_is_owned(struct pthread_mutex *m __unused) { #if defined(_PTHREADS_INVARIANTS) if (__predict_false(m->m_qe.tqe_prev == NULL)) { char msg[128]; snprintf(msg, sizeof(msg), - "mutex %p own %#x %#x is not on list %p %p", - m, m->m_lock.m_owner, m->m_owner, m->m_qe.tqe_prev, - m->m_qe.tqe_next); + "mutex %p own %#x is not on list %p %p", + m, m->m_lock.m_owner, m->m_qe.tqe_prev, m->m_qe.tqe_next); PANIC(msg); } #endif } static void -mutex_assert_not_owned(struct pthread_mutex *m) +mutex_assert_not_owned(struct pthread *curthread __unused, + struct pthread_mutex *m __unused) { #if defined(_PTHREADS_INVARIANTS) if (__predict_false(m->m_qe.tqe_prev != NULL || m->m_qe.tqe_next != NULL)) { char msg[128]; snprintf(msg, sizeof(msg), - "mutex %p own %#x %#x is on list %p %p", - m, m->m_lock.m_owner, m->m_owner, m->m_qe.tqe_prev, - m->m_qe.tqe_next); + "mutex %p own %#x is on list %p %p", + m, m->m_lock.m_owner, m->m_qe.tqe_prev, m->m_qe.tqe_next); PANIC(msg); } + if (__predict_false(is_robust_mutex(m) && + (m->m_lock.m_rb_lnk != 0 || m->m_rb_prev != NULL || + (is_pshared_mutex(m) && curthread->robust_list == + (uintptr_t)&m->m_lock) || + (!is_pshared_mutex(m) && curthread->priv_robust_list == + (uintptr_t)&m->m_lock)))) { + char msg[128]; + snprintf(msg, sizeof(msg), + "mutex %p own %#x is on robust linkage %p %p head %p phead %p", + m, m->m_lock.m_owner, (void *)m->m_lock.m_rb_lnk, + m->m_rb_prev, (void *)curthread->robust_list, + (void *)curthread->priv_robust_list); + PANIC(msg); + } #endif } -static int +static bool is_pshared_mutex(struct pthread_mutex *m) { return ((m->m_lock.m_flags & USYNC_PROCESS_SHARED) != 0); } +static bool +is_robust_mutex(struct pthread_mutex *m) +{ + + return ((m->m_lock.m_flags & UMUTEX_ROBUST) != 0); +} + +int +_mutex_enter_robust(struct pthread *curthread, struct pthread_mutex *m) +{ + +#if defined(_PTHREADS_INVARIANTS) + if (__predict_false(curthread->inact_mtx != 0)) + PANIC("inact_mtx enter"); +#endif + if (!is_robust_mutex(m)) + return (0); + + mutex_init_robust(curthread); + curthread->inact_mtx = (uintptr_t)&m->m_lock; + return (1); +} + +void +_mutex_leave_robust(struct pthread *curthread, struct pthread_mutex *m __unused) +{ + +#if defined(_PTHREADS_INVARIANTS) + if (__predict_false(curthread->inact_mtx != (uintptr_t)&m->m_lock)) + PANIC("inact_mtx leave"); +#endif + curthread->inact_mtx = 0; +} + static int mutex_check_attr(const struct pthread_mutex_attr *attr) { if (attr->m_type < PTHREAD_MUTEX_ERRORCHECK || attr->m_type >= PTHREAD_MUTEX_TYPE_MAX) return (EINVAL); if (attr->m_protocol < PTHREAD_PRIO_NONE || attr->m_protocol > PTHREAD_PRIO_PROTECT) return (EINVAL); return (0); } static void +mutex_init_robust(struct pthread *curthread) +{ + struct umtx_robust_lists_params rb; + + if (curthread == NULL) + curthread = _get_curthread(); + if (curthread->robust_inited) + return; + rb.robust_list_offset = (uintptr_t)&curthread->robust_list; + rb.robust_priv_list_offset = (uintptr_t)&curthread->priv_robust_list; + rb.robust_inact_offset = (uintptr_t)&curthread->inact_mtx; + _umtx_op(NULL, UMTX_OP_ROBUST_LISTS, sizeof(rb), &rb, NULL); + curthread->robust_inited = 1; +} + +static void mutex_init_body(struct pthread_mutex *pmutex, const struct pthread_mutex_attr *attr) { pmutex->m_flags = attr->m_type; - pmutex->m_owner = 0; pmutex->m_count = 0; pmutex->m_spinloops = 0; pmutex->m_yieldloops = 0; mutex_init_link(pmutex); switch (attr->m_protocol) { case PTHREAD_PRIO_NONE: pmutex->m_lock.m_owner = UMUTEX_UNOWNED; pmutex->m_lock.m_flags = 0; break; case PTHREAD_PRIO_INHERIT: pmutex->m_lock.m_owner = UMUTEX_UNOWNED; pmutex->m_lock.m_flags = UMUTEX_PRIO_INHERIT; break; case PTHREAD_PRIO_PROTECT: pmutex->m_lock.m_owner = UMUTEX_CONTESTED; pmutex->m_lock.m_flags = UMUTEX_PRIO_PROTECT; pmutex->m_lock.m_ceilings[0] = attr->m_ceiling; break; } if (attr->m_pshared == PTHREAD_PROCESS_SHARED) pmutex->m_lock.m_flags |= USYNC_PROCESS_SHARED; - + if (attr->m_robust == PTHREAD_MUTEX_ROBUST) { + mutex_init_robust(NULL); + pmutex->m_lock.m_flags |= UMUTEX_ROBUST; + } if (PMUTEX_TYPE(pmutex->m_flags) == PTHREAD_MUTEX_ADAPTIVE_NP) { pmutex->m_spinloops = _thr_spinloops ? _thr_spinloops: MUTEX_ADAPTIVE_SPINS; pmutex->m_yieldloops = _thr_yieldloops; } } static int mutex_init(pthread_mutex_t *mutex, const struct pthread_mutex_attr *mutex_attr, void *(calloc_cb)(size_t, size_t)) { const struct pthread_mutex_attr *attr; struct pthread_mutex *pmutex; int error; if (mutex_attr == NULL) { attr = &_pthread_mutexattr_default; } else { attr = mutex_attr; error = mutex_check_attr(attr); if (error != 0) return (error); } if ((pmutex = (pthread_mutex_t) calloc_cb(1, sizeof(struct pthread_mutex))) == NULL) return (ENOMEM); mutex_init_body(pmutex, attr); *mutex = pmutex; return (0); } static int init_static(struct pthread *thread, pthread_mutex_t *mutex) { int ret; THR_LOCK_ACQUIRE(thread, &_mutex_static_lock); if (*mutex == THR_MUTEX_INITIALIZER) ret = mutex_init(mutex, &_pthread_mutexattr_default, calloc); else if (*mutex == THR_ADAPTIVE_MUTEX_INITIALIZER) ret = mutex_init(mutex, &_pthread_mutexattr_adaptive_default, calloc); else ret = 0; THR_LOCK_RELEASE(thread, &_mutex_static_lock); return (ret); } static void set_inherited_priority(struct pthread *curthread, struct pthread_mutex *m) { struct pthread_mutex *m2; - m2 = TAILQ_LAST(&curthread->mq[TMQ_NORM_PP], mutex_queue); + m2 = TAILQ_LAST(&curthread->mq[mutex_qidx(m)], mutex_queue); if (m2 != NULL) m->m_lock.m_ceilings[1] = m2->m_lock.m_ceilings[0]; else m->m_lock.m_ceilings[1] = -1; } static void shared_mutex_init(struct pthread_mutex *pmtx, const struct pthread_mutex_attr *mutex_attr) { static const struct pthread_mutex_attr foobar_mutex_attr = { .m_type = PTHREAD_MUTEX_DEFAULT, .m_protocol = PTHREAD_PRIO_NONE, .m_ceiling = 0, - .m_pshared = PTHREAD_PROCESS_SHARED + .m_pshared = PTHREAD_PROCESS_SHARED, + .m_robust = PTHREAD_MUTEX_STALLED, }; bool done; /* * Hack to allow multiple pthread_mutex_init() calls on the * same process-shared mutex. We rely on kernel allocating * zeroed offpage for the mutex, i.e. the * PMUTEX_INITSTAGE_ALLOC value must be zero. */ for (done = false; !done;) { switch (pmtx->m_ps) { case PMUTEX_INITSTAGE_DONE: atomic_thread_fence_acq(); done = true; break; case PMUTEX_INITSTAGE_ALLOC: if (atomic_cmpset_int(&pmtx->m_ps, PMUTEX_INITSTAGE_ALLOC, PMUTEX_INITSTAGE_BUSY)) { if (mutex_attr == NULL) mutex_attr = &foobar_mutex_attr; mutex_init_body(pmtx, mutex_attr); atomic_store_rel_int(&pmtx->m_ps, PMUTEX_INITSTAGE_DONE); done = true; } break; case PMUTEX_INITSTAGE_BUSY: _pthread_yield(); break; default: PANIC("corrupted offpage"); break; } } } int __pthread_mutex_init(pthread_mutex_t *mutex, const pthread_mutexattr_t *mutex_attr) { struct pthread_mutex *pmtx; int ret; if (mutex_attr != NULL) { ret = mutex_check_attr(*mutex_attr); if (ret != 0) return (ret); } if (mutex_attr == NULL || (*mutex_attr)->m_pshared == PTHREAD_PROCESS_PRIVATE) { return (mutex_init(mutex, mutex_attr ? *mutex_attr : NULL, - calloc)); + calloc)); } pmtx = __thr_pshared_offpage(mutex, 1); if (pmtx == NULL) return (EFAULT); *mutex = THR_PSHARED_PTR; shared_mutex_init(pmtx, *mutex_attr); return (0); } /* This function is used internally by malloc. */ int _pthread_mutex_init_calloc_cb(pthread_mutex_t *mutex, void *(calloc_cb)(size_t, size_t)) { static const struct pthread_mutex_attr attr = { .m_type = PTHREAD_MUTEX_NORMAL, .m_protocol = PTHREAD_PRIO_NONE, .m_ceiling = 0, .m_pshared = PTHREAD_PROCESS_PRIVATE, + .m_robust = PTHREAD_MUTEX_STALLED, }; int ret; ret = mutex_init(mutex, &attr, calloc_cb); if (ret == 0) (*mutex)->m_flags |= PMUTEX_FLAG_PRIVATE; return (ret); } /* * Fix mutex ownership for child process. * * Process private mutex ownership is transmitted from the forking * thread to the child process. * * Process shared mutex should not be inherited because owner is * forking thread which is in parent process, they are removed from * the owned mutex list. */ static void queue_fork(struct pthread *curthread, struct mutex_queue *q, struct mutex_queue *qp, uint bit) { struct pthread_mutex *m; TAILQ_INIT(q); TAILQ_FOREACH(m, qp, m_pqe) { TAILQ_INSERT_TAIL(q, m, m_qe); m->m_lock.m_owner = TID(curthread) | bit; - m->m_owner = TID(curthread); } } void _mutex_fork(struct pthread *curthread) { queue_fork(curthread, &curthread->mq[TMQ_NORM], &curthread->mq[TMQ_NORM_PRIV], 0); queue_fork(curthread, &curthread->mq[TMQ_NORM_PP], &curthread->mq[TMQ_NORM_PP_PRIV], UMUTEX_CONTESTED); + queue_fork(curthread, &curthread->mq[TMQ_ROBUST_PP], + &curthread->mq[TMQ_ROBUST_PP_PRIV], UMUTEX_CONTESTED); + curthread->robust_list = 0; } int _pthread_mutex_destroy(pthread_mutex_t *mutex) { pthread_mutex_t m, m1; int ret; m = *mutex; if (m < THR_MUTEX_DESTROYED) { ret = 0; } else if (m == THR_MUTEX_DESTROYED) { ret = EINVAL; } else { if (m == THR_PSHARED_PTR) { m1 = __thr_pshared_offpage(mutex, 0); if (m1 != NULL) { - mutex_assert_not_owned(m1); + mutex_assert_not_owned(_get_curthread(), m1); __thr_pshared_destroy(mutex); } *mutex = THR_MUTEX_DESTROYED; return (0); } - if (m->m_owner != 0) { + if (PMUTEX_OWNER_ID(m) != 0 && + (uint32_t)m->m_lock.m_owner != UMUTEX_RB_NOTRECOV) { ret = EBUSY; } else { *mutex = THR_MUTEX_DESTROYED; - mutex_assert_not_owned(m); + mutex_assert_not_owned(_get_curthread(), m); free(m); ret = 0; } } return (ret); } static int mutex_qidx(struct pthread_mutex *m) { if ((m->m_lock.m_flags & UMUTEX_PRIO_PROTECT) == 0) return (TMQ_NORM); - return (TMQ_NORM_PP); + return (is_robust_mutex(m) ? TMQ_ROBUST_PP : TMQ_NORM_PP); } +/* + * Both enqueue_mutex() and dequeue_mutex() operate on the + * thread-private linkage of the locked mutexes and on the robust + * linkage. + * + * Robust list, as seen by kernel, must be consistent even in the case + * of thread termination at arbitrary moment. Since either enqueue or + * dequeue for list walked by kernel consists of rewriting a single + * forward pointer, it is safe. On the other hand, rewrite of the + * back pointer is not atomic WRT the forward one, but kernel does not + * care. + */ static void -enqueue_mutex(struct pthread *curthread, struct pthread_mutex *m) +enqueue_mutex(struct pthread *curthread, struct pthread_mutex *m, + int error) { + struct pthread_mutex *m1; + uintptr_t *rl; int qidx; - m->m_owner = TID(curthread); /* Add to the list of owned mutexes: */ - mutex_assert_not_owned(m); + if (error != EOWNERDEAD) + mutex_assert_not_owned(curthread, m); qidx = mutex_qidx(m); TAILQ_INSERT_TAIL(&curthread->mq[qidx], m, m_qe); if (!is_pshared_mutex(m)) TAILQ_INSERT_TAIL(&curthread->mq[qidx + 1], m, m_pqe); + if (is_robust_mutex(m)) { + rl = is_pshared_mutex(m) ? &curthread->robust_list : + &curthread->priv_robust_list; + m->m_rb_prev = NULL; + if (*rl != 0) { + m1 = __containerof((void *)*rl, + struct pthread_mutex, m_lock); + m->m_lock.m_rb_lnk = (uintptr_t)&m1->m_lock; + m1->m_rb_prev = m; + } else { + m1 = NULL; + m->m_lock.m_rb_lnk = 0; + } + *rl = (uintptr_t)&m->m_lock; + } } static void dequeue_mutex(struct pthread *curthread, struct pthread_mutex *m) { + struct pthread_mutex *mp, *mn; int qidx; - m->m_owner = 0; mutex_assert_is_owned(m); qidx = mutex_qidx(m); + if (is_robust_mutex(m)) { + mp = m->m_rb_prev; + if (mp == NULL) { + if (is_pshared_mutex(m)) { + curthread->robust_list = m->m_lock.m_rb_lnk; + } else { + curthread->priv_robust_list = + m->m_lock.m_rb_lnk; + } + } else { + mp->m_lock.m_rb_lnk = m->m_lock.m_rb_lnk; + } + if (m->m_lock.m_rb_lnk != 0) { + mn = __containerof((void *)m->m_lock.m_rb_lnk, + struct pthread_mutex, m_lock); + mn->m_rb_prev = m->m_rb_prev; + } + m->m_lock.m_rb_lnk = 0; + m->m_rb_prev = NULL; + } TAILQ_REMOVE(&curthread->mq[qidx], m, m_qe); if (!is_pshared_mutex(m)) TAILQ_REMOVE(&curthread->mq[qidx + 1], m, m_pqe); if ((m->m_lock.m_flags & UMUTEX_PRIO_PROTECT) != 0) set_inherited_priority(curthread, m); mutex_init_link(m); } static int check_and_init_mutex(pthread_mutex_t *mutex, struct pthread_mutex **m) { int ret; *m = *mutex; ret = 0; if (*m == THR_PSHARED_PTR) { *m = __thr_pshared_offpage(mutex, 0); if (*m == NULL) ret = EINVAL; else shared_mutex_init(*m, NULL); } else if (__predict_false(*m <= THR_MUTEX_DESTROYED)) { if (*m == THR_MUTEX_DESTROYED) { ret = EINVAL; } else { ret = init_static(_get_curthread(), mutex); if (ret == 0) *m = *mutex; } } return (ret); } int __pthread_mutex_trylock(pthread_mutex_t *mutex) { struct pthread *curthread; struct pthread_mutex *m; uint32_t id; - int ret; + int ret, robust; ret = check_and_init_mutex(mutex, &m); if (ret != 0) return (ret); curthread = _get_curthread(); id = TID(curthread); if (m->m_flags & PMUTEX_FLAG_PRIVATE) THR_CRITICAL_ENTER(curthread); + robust = _mutex_enter_robust(curthread, m); ret = _thr_umutex_trylock(&m->m_lock, id); - if (__predict_true(ret == 0)) { - enqueue_mutex(curthread, m); - } else if (m->m_owner == id) { + if (__predict_true(ret == 0) || ret == EOWNERDEAD) { + enqueue_mutex(curthread, m, ret); + if (ret == EOWNERDEAD) + m->m_lock.m_flags |= UMUTEX_NONCONSISTENT; + } else if (PMUTEX_OWNER_ID(m) == id) { ret = mutex_self_trylock(m); } /* else {} */ - if (ret && (m->m_flags & PMUTEX_FLAG_PRIVATE)) + if (robust) + _mutex_leave_robust(curthread, m); + if ((ret == 0 || ret == EOWNERDEAD) && + (m->m_flags & PMUTEX_FLAG_PRIVATE) != 0) THR_CRITICAL_LEAVE(curthread); return (ret); } static int mutex_lock_sleep(struct pthread *curthread, struct pthread_mutex *m, - const struct timespec *abstime) + const struct timespec *abstime) { - uint32_t id, owner; - int count; - int ret; + uint32_t id, owner; + int count, ret; id = TID(curthread); - if (m->m_owner == id) + if (PMUTEX_OWNER_ID(m) == id) return (mutex_self_lock(m, abstime)); /* * For adaptive mutexes, spin for a bit in the expectation * that if the application requests this mutex type then * the lock is likely to be released quickly and it is * faster than entering the kernel */ - if (__predict_false( - (m->m_lock.m_flags & - (UMUTEX_PRIO_PROTECT | UMUTEX_PRIO_INHERIT)) != 0)) - goto sleep_in_kernel; + if (__predict_false((m->m_lock.m_flags & (UMUTEX_PRIO_PROTECT | + UMUTEX_PRIO_INHERIT | UMUTEX_ROBUST | UMUTEX_NONCONSISTENT)) != 0)) + goto sleep_in_kernel; if (!_thr_is_smp) goto yield_loop; count = m->m_spinloops; while (count--) { owner = m->m_lock.m_owner; if ((owner & ~UMUTEX_CONTESTED) == 0) { - if (atomic_cmpset_acq_32(&m->m_lock.m_owner, owner, id|owner)) { + if (atomic_cmpset_acq_32(&m->m_lock.m_owner, owner, + id | owner)) { ret = 0; goto done; } } CPU_SPINWAIT; } yield_loop: count = m->m_yieldloops; while (count--) { _sched_yield(); owner = m->m_lock.m_owner; if ((owner & ~UMUTEX_CONTESTED) == 0) { - if (atomic_cmpset_acq_32(&m->m_lock.m_owner, owner, id|owner)) { + if (atomic_cmpset_acq_32(&m->m_lock.m_owner, owner, + id | owner)) { ret = 0; goto done; } } } sleep_in_kernel: - if (abstime == NULL) { + if (abstime == NULL) ret = __thr_umutex_lock(&m->m_lock, id); - } else if (__predict_false( - abstime->tv_nsec < 0 || - abstime->tv_nsec >= 1000000000)) { + else if (__predict_false(abstime->tv_nsec < 0 || + abstime->tv_nsec >= 1000000000)) ret = EINVAL; - } else { + else ret = __thr_umutex_timedlock(&m->m_lock, id, abstime); - } done: - if (ret == 0) - enqueue_mutex(curthread, m); - + if (ret == 0 || ret == EOWNERDEAD) { + enqueue_mutex(curthread, m, ret); + if (ret == EOWNERDEAD) + m->m_lock.m_flags |= UMUTEX_NONCONSISTENT; + } return (ret); } static inline int -mutex_lock_common(struct pthread_mutex *m, - const struct timespec *abstime, int cvattach) +mutex_lock_common(struct pthread_mutex *m, const struct timespec *abstime, + bool cvattach, bool rb_onlist) { - struct pthread *curthread = _get_curthread(); - int ret; + struct pthread *curthread; + int ret, robust; + curthread = _get_curthread(); if (!cvattach && m->m_flags & PMUTEX_FLAG_PRIVATE) THR_CRITICAL_ENTER(curthread); - if (_thr_umutex_trylock2(&m->m_lock, TID(curthread)) == 0) { - enqueue_mutex(curthread, m); - ret = 0; + if (!rb_onlist) + robust = _mutex_enter_robust(curthread, m); + ret = _thr_umutex_trylock2(&m->m_lock, TID(curthread)); + if (ret == 0 || ret == EOWNERDEAD) { + enqueue_mutex(curthread, m, ret); + if (ret == EOWNERDEAD) + m->m_lock.m_flags |= UMUTEX_NONCONSISTENT; } else { ret = mutex_lock_sleep(curthread, m, abstime); } - if (ret && (m->m_flags & PMUTEX_FLAG_PRIVATE) && !cvattach) + if (!rb_onlist && robust) + _mutex_leave_robust(curthread, m); + if (ret != 0 && ret != EOWNERDEAD && + (m->m_flags & PMUTEX_FLAG_PRIVATE) != 0 && !cvattach) THR_CRITICAL_LEAVE(curthread); return (ret); } int __pthread_mutex_lock(pthread_mutex_t *mutex) { struct pthread_mutex *m; int ret; _thr_check_init(); ret = check_and_init_mutex(mutex, &m); if (ret == 0) - ret = mutex_lock_common(m, NULL, 0); + ret = mutex_lock_common(m, NULL, false, false); return (ret); } int __pthread_mutex_timedlock(pthread_mutex_t *mutex, const struct timespec *abstime) { struct pthread_mutex *m; int ret; _thr_check_init(); ret = check_and_init_mutex(mutex, &m); if (ret == 0) - ret = mutex_lock_common(m, abstime, 0); + ret = mutex_lock_common(m, abstime, false, false); return (ret); } int _pthread_mutex_unlock(pthread_mutex_t *mutex) { struct pthread_mutex *mp; if (*mutex == THR_PSHARED_PTR) { mp = __thr_pshared_offpage(mutex, 0); if (mp == NULL) return (EINVAL); shared_mutex_init(mp, NULL); } else { mp = *mutex; } - return (mutex_unlock_common(mp, 0, NULL)); + return (mutex_unlock_common(mp, false, NULL)); } int -_mutex_cv_lock(struct pthread_mutex *m, int count) +_mutex_cv_lock(struct pthread_mutex *m, int count, bool rb_onlist) { - int error; + int error; - error = mutex_lock_common(m, NULL, 1); - if (error == 0) + error = mutex_lock_common(m, NULL, true, rb_onlist); + if (error == 0 || error == EOWNERDEAD) m->m_count = count; return (error); } int _mutex_cv_unlock(struct pthread_mutex *m, int *count, int *defer) { /* * Clear the count in case this is a recursive mutex. */ *count = m->m_count; m->m_count = 0; - (void)mutex_unlock_common(m, 1, defer); + (void)mutex_unlock_common(m, true, defer); return (0); } int _mutex_cv_attach(struct pthread_mutex *m, int count) { - struct pthread *curthread = _get_curthread(); + struct pthread *curthread; - enqueue_mutex(curthread, m); + curthread = _get_curthread(); + enqueue_mutex(curthread, m, 0); m->m_count = count; return (0); } int _mutex_cv_detach(struct pthread_mutex *mp, int *recurse) { - struct pthread *curthread = _get_curthread(); - int defered; - int error; + struct pthread *curthread; + int deferred, error; + curthread = _get_curthread(); if ((error = _mutex_owned(curthread, mp)) != 0) - return (error); + return (error); /* * Clear the count in case this is a recursive mutex. */ *recurse = mp->m_count; mp->m_count = 0; dequeue_mutex(curthread, mp); /* Will this happen in real-world ? */ - if ((mp->m_flags & PMUTEX_FLAG_DEFERED) != 0) { - defered = 1; - mp->m_flags &= ~PMUTEX_FLAG_DEFERED; + if ((mp->m_flags & PMUTEX_FLAG_DEFERRED) != 0) { + deferred = 1; + mp->m_flags &= ~PMUTEX_FLAG_DEFERRED; } else - defered = 0; + deferred = 0; - if (defered) { + if (deferred) { _thr_wake_all(curthread->defer_waiters, - curthread->nwaiter_defer); + curthread->nwaiter_defer); curthread->nwaiter_defer = 0; } return (0); } static int mutex_self_trylock(struct pthread_mutex *m) { - int ret; + int ret; switch (PMUTEX_TYPE(m->m_flags)) { case PTHREAD_MUTEX_ERRORCHECK: case PTHREAD_MUTEX_NORMAL: case PTHREAD_MUTEX_ADAPTIVE_NP: ret = EBUSY; break; case PTHREAD_MUTEX_RECURSIVE: /* Increment the lock count: */ if (m->m_count + 1 > 0) { m->m_count++; ret = 0; } else ret = EAGAIN; break; default: /* Trap invalid mutex types; */ ret = EINVAL; } return (ret); } static int mutex_self_lock(struct pthread_mutex *m, const struct timespec *abstime) { struct timespec ts1, ts2; - int ret; + int ret; switch (PMUTEX_TYPE(m->m_flags)) { case PTHREAD_MUTEX_ERRORCHECK: case PTHREAD_MUTEX_ADAPTIVE_NP: if (abstime) { if (abstime->tv_sec < 0 || abstime->tv_nsec < 0 || abstime->tv_nsec >= 1000000000) { ret = EINVAL; } else { clock_gettime(CLOCK_REALTIME, &ts1); TIMESPEC_SUB(&ts2, abstime, &ts1); __sys_nanosleep(&ts2, NULL); ret = ETIMEDOUT; } } else { /* * POSIX specifies that mutexes should return * EDEADLK if a recursive lock is detected. */ ret = EDEADLK; } break; case PTHREAD_MUTEX_NORMAL: /* * What SS2 define as a 'normal' mutex. Intentionally * deadlock on attempts to get a lock you already own. */ ret = 0; if (abstime) { if (abstime->tv_sec < 0 || abstime->tv_nsec < 0 || abstime->tv_nsec >= 1000000000) { ret = EINVAL; } else { clock_gettime(CLOCK_REALTIME, &ts1); TIMESPEC_SUB(&ts2, abstime, &ts1); __sys_nanosleep(&ts2, NULL); ret = ETIMEDOUT; } } else { ts1.tv_sec = 30; ts1.tv_nsec = 0; for (;;) __sys_nanosleep(&ts1, NULL); } break; case PTHREAD_MUTEX_RECURSIVE: /* Increment the lock count: */ if (m->m_count + 1 > 0) { m->m_count++; ret = 0; } else ret = EAGAIN; break; default: /* Trap invalid mutex types; */ ret = EINVAL; } return (ret); } static int -mutex_unlock_common(struct pthread_mutex *m, int cv, int *mtx_defer) +mutex_unlock_common(struct pthread_mutex *m, bool cv, int *mtx_defer) { - struct pthread *curthread = _get_curthread(); + struct pthread *curthread; uint32_t id; - int defered, error; + int deferred, error, robust; if (__predict_false(m <= THR_MUTEX_DESTROYED)) { if (m == THR_MUTEX_DESTROYED) return (EINVAL); return (EPERM); } + curthread = _get_curthread(); id = TID(curthread); /* * Check if the running thread is not the owner of the mutex. */ - if (__predict_false(m->m_owner != id)) + if (__predict_false(PMUTEX_OWNER_ID(m) != id)) return (EPERM); error = 0; - if (__predict_false( - PMUTEX_TYPE(m->m_flags) == PTHREAD_MUTEX_RECURSIVE && - m->m_count > 0)) { + if (__predict_false(PMUTEX_TYPE(m->m_flags) == + PTHREAD_MUTEX_RECURSIVE && m->m_count > 0)) { m->m_count--; } else { - if ((m->m_flags & PMUTEX_FLAG_DEFERED) != 0) { - defered = 1; - m->m_flags &= ~PMUTEX_FLAG_DEFERED; + if ((m->m_flags & PMUTEX_FLAG_DEFERRED) != 0) { + deferred = 1; + m->m_flags &= ~PMUTEX_FLAG_DEFERRED; } else - defered = 0; + deferred = 0; + robust = _mutex_enter_robust(curthread, m); dequeue_mutex(curthread, m); error = _thr_umutex_unlock2(&m->m_lock, id, mtx_defer); - - if (mtx_defer == NULL && defered) { - _thr_wake_all(curthread->defer_waiters, - curthread->nwaiter_defer); - curthread->nwaiter_defer = 0; + if (deferred) { + if (mtx_defer == NULL) { + _thr_wake_all(curthread->defer_waiters, + curthread->nwaiter_defer); + curthread->nwaiter_defer = 0; + } else + *mtx_defer = 1; } + if (robust) + _mutex_leave_robust(curthread, m); } if (!cv && m->m_flags & PMUTEX_FLAG_PRIVATE) THR_CRITICAL_LEAVE(curthread); return (error); } int _pthread_mutex_getprioceiling(pthread_mutex_t *mutex, int *prioceiling) { struct pthread_mutex *m; if (*mutex == THR_PSHARED_PTR) { m = __thr_pshared_offpage(mutex, 0); if (m == NULL) return (EINVAL); shared_mutex_init(m, NULL); } else { m = *mutex; if (m <= THR_MUTEX_DESTROYED) return (EINVAL); } if ((m->m_lock.m_flags & UMUTEX_PRIO_PROTECT) == 0) return (EINVAL); *prioceiling = m->m_lock.m_ceilings[0]; return (0); } int _pthread_mutex_setprioceiling(pthread_mutex_t *mutex, int ceiling, int *old_ceiling) { struct pthread *curthread; struct pthread_mutex *m, *m1, *m2; struct mutex_queue *q, *qp; - int ret; + int qidx, ret; if (*mutex == THR_PSHARED_PTR) { m = __thr_pshared_offpage(mutex, 0); if (m == NULL) return (EINVAL); shared_mutex_init(m, NULL); } else { m = *mutex; if (m <= THR_MUTEX_DESTROYED) return (EINVAL); } if ((m->m_lock.m_flags & UMUTEX_PRIO_PROTECT) == 0) return (EINVAL); ret = __thr_umutex_set_ceiling(&m->m_lock, ceiling, old_ceiling); if (ret != 0) return (ret); curthread = _get_curthread(); - if (m->m_owner == TID(curthread)) { + if (PMUTEX_OWNER_ID(m) == TID(curthread)) { mutex_assert_is_owned(m); m1 = TAILQ_PREV(m, mutex_queue, m_qe); m2 = TAILQ_NEXT(m, m_qe); if ((m1 != NULL && m1->m_lock.m_ceilings[0] > (u_int)ceiling) || (m2 != NULL && m2->m_lock.m_ceilings[0] < (u_int)ceiling)) { - q = &curthread->mq[TMQ_NORM_PP]; - qp = &curthread->mq[TMQ_NORM_PP_PRIV]; + qidx = mutex_qidx(m); + q = &curthread->mq[qidx]; + qp = &curthread->mq[qidx + 1]; TAILQ_REMOVE(q, m, m_qe); if (!is_pshared_mutex(m)) TAILQ_REMOVE(qp, m, m_pqe); TAILQ_FOREACH(m2, q, m_qe) { if (m2->m_lock.m_ceilings[0] > (u_int)ceiling) { TAILQ_INSERT_BEFORE(m2, m, m_qe); if (!is_pshared_mutex(m)) { while (m2 != NULL && is_pshared_mutex(m2)) { m2 = TAILQ_PREV(m2, mutex_queue, m_qe); } if (m2 == NULL) { TAILQ_INSERT_HEAD(qp, m, m_pqe); } else { TAILQ_INSERT_BEFORE(m2, m, m_pqe); } } return (0); } } TAILQ_INSERT_TAIL(q, m, m_qe); if (!is_pshared_mutex(m)) TAILQ_INSERT_TAIL(qp, m, m_pqe); } } return (0); } int _pthread_mutex_getspinloops_np(pthread_mutex_t *mutex, int *count) { struct pthread_mutex *m; int ret; ret = check_and_init_mutex(mutex, &m); if (ret == 0) *count = m->m_spinloops; return (ret); } int __pthread_mutex_setspinloops_np(pthread_mutex_t *mutex, int count) { struct pthread_mutex *m; int ret; ret = check_and_init_mutex(mutex, &m); if (ret == 0) m->m_spinloops = count; return (ret); } int _pthread_mutex_getyieldloops_np(pthread_mutex_t *mutex, int *count) { struct pthread_mutex *m; int ret; ret = check_and_init_mutex(mutex, &m); if (ret == 0) *count = m->m_yieldloops; return (ret); } int __pthread_mutex_setyieldloops_np(pthread_mutex_t *mutex, int count) { struct pthread_mutex *m; int ret; ret = check_and_init_mutex(mutex, &m); if (ret == 0) m->m_yieldloops = count; return (0); } int _pthread_mutex_isowned_np(pthread_mutex_t *mutex) { struct pthread_mutex *m; if (*mutex == THR_PSHARED_PTR) { m = __thr_pshared_offpage(mutex, 0); if (m == NULL) return (0); shared_mutex_init(m, NULL); } else { m = *mutex; if (m <= THR_MUTEX_DESTROYED) return (0); } - return (m->m_owner == TID(_get_curthread())); + return (PMUTEX_OWNER_ID(m) == TID(_get_curthread())); } int _mutex_owned(struct pthread *curthread, const struct pthread_mutex *mp) { + if (__predict_false(mp <= THR_MUTEX_DESTROYED)) { if (mp == THR_MUTEX_DESTROYED) return (EINVAL); return (EPERM); } - if (mp->m_owner != TID(curthread)) + if (PMUTEX_OWNER_ID(mp) != TID(curthread)) return (EPERM); return (0); +} + +int +_pthread_mutex_consistent(pthread_mutex_t *mutex) +{ + struct pthread_mutex *m; + struct pthread *curthread; + + if (*mutex == THR_PSHARED_PTR) { + m = __thr_pshared_offpage(mutex, 0); + if (m == NULL) + return (EINVAL); + shared_mutex_init(m, NULL); + } else { + m = *mutex; + if (m <= THR_MUTEX_DESTROYED) + return (EINVAL); + } + curthread = _get_curthread(); + if ((m->m_lock.m_flags & (UMUTEX_ROBUST | UMUTEX_NONCONSISTENT)) != + (UMUTEX_ROBUST | UMUTEX_NONCONSISTENT)) + return (EINVAL); + if (PMUTEX_OWNER_ID(m) != TID(curthread)) + return (EPERM); + m->m_lock.m_flags &= ~UMUTEX_NONCONSISTENT; + return (0); } Index: head/lib/libthr/thread/thr_mutexattr.c =================================================================== --- head/lib/libthr/thread/thr_mutexattr.c (revision 300042) +++ head/lib/libthr/thread/thr_mutexattr.c (revision 300043) @@ -1,253 +1,290 @@ /* * Copyright (c) 1996 Jeffrey Hsu . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Copyright (c) 1997 John Birrell . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include "namespace.h" #include #include #include #include #include #include "un-namespace.h" #include "thr_private.h" __weak_reference(_pthread_mutexattr_init, pthread_mutexattr_init); __weak_reference(_pthread_mutexattr_setkind_np, pthread_mutexattr_setkind_np); __weak_reference(_pthread_mutexattr_getkind_np, pthread_mutexattr_getkind_np); __weak_reference(_pthread_mutexattr_gettype, pthread_mutexattr_gettype); __weak_reference(_pthread_mutexattr_settype, pthread_mutexattr_settype); __weak_reference(_pthread_mutexattr_destroy, pthread_mutexattr_destroy); __weak_reference(_pthread_mutexattr_getpshared, pthread_mutexattr_getpshared); __weak_reference(_pthread_mutexattr_setpshared, pthread_mutexattr_setpshared); __weak_reference(_pthread_mutexattr_getprotocol, pthread_mutexattr_getprotocol); __weak_reference(_pthread_mutexattr_setprotocol, pthread_mutexattr_setprotocol); -__weak_reference(_pthread_mutexattr_getprioceiling, pthread_mutexattr_getprioceiling); -__weak_reference(_pthread_mutexattr_setprioceiling, pthread_mutexattr_setprioceiling); +__weak_reference(_pthread_mutexattr_getprioceiling, + pthread_mutexattr_getprioceiling); +__weak_reference(_pthread_mutexattr_setprioceiling, + pthread_mutexattr_setprioceiling); +__weak_reference(_pthread_mutexattr_getrobust, pthread_mutexattr_getrobust); +__weak_reference(_pthread_mutexattr_setrobust, pthread_mutexattr_setrobust); int _pthread_mutexattr_init(pthread_mutexattr_t *attr) { int ret; pthread_mutexattr_t pattr; if ((pattr = (pthread_mutexattr_t) malloc(sizeof(struct pthread_mutex_attr))) == NULL) { ret = ENOMEM; } else { memcpy(pattr, &_pthread_mutexattr_default, sizeof(struct pthread_mutex_attr)); *attr = pattr; ret = 0; } return (ret); } int _pthread_mutexattr_setkind_np(pthread_mutexattr_t *attr, int kind) { int ret; if (attr == NULL || *attr == NULL) { errno = EINVAL; ret = -1; } else { (*attr)->m_type = kind; ret = 0; } return(ret); } int _pthread_mutexattr_getkind_np(pthread_mutexattr_t attr) { int ret; + if (attr == NULL) { errno = EINVAL; ret = -1; } else { ret = attr->m_type; } - return(ret); + return (ret); } int _pthread_mutexattr_settype(pthread_mutexattr_t *attr, int type) { int ret; + if (attr == NULL || *attr == NULL || type >= PTHREAD_MUTEX_TYPE_MAX) { ret = EINVAL; } else { (*attr)->m_type = type; ret = 0; } - return(ret); + return (ret); } int _pthread_mutexattr_gettype(pthread_mutexattr_t *attr, int *type) { int ret; if (attr == NULL || *attr == NULL || (*attr)->m_type >= PTHREAD_MUTEX_TYPE_MAX) { ret = EINVAL; } else { *type = (*attr)->m_type; ret = 0; } - return ret; + return (ret); } int _pthread_mutexattr_destroy(pthread_mutexattr_t *attr) { int ret; if (attr == NULL || *attr == NULL) { ret = EINVAL; } else { free(*attr); *attr = NULL; ret = 0; } - return(ret); + return (ret); } int _pthread_mutexattr_getpshared(const pthread_mutexattr_t *attr, int *pshared) { if (attr == NULL || *attr == NULL) return (EINVAL); *pshared = (*attr)->m_pshared; return (0); } int _pthread_mutexattr_setpshared(pthread_mutexattr_t *attr, int pshared) { if (attr == NULL || *attr == NULL || (pshared != PTHREAD_PROCESS_PRIVATE && pshared != PTHREAD_PROCESS_SHARED)) return (EINVAL); (*attr)->m_pshared = pshared; return (0); } int _pthread_mutexattr_getprotocol(pthread_mutexattr_t *mattr, int *protocol) { int ret = 0; - if ((mattr == NULL) || (*mattr == NULL)) + if (mattr == NULL || *mattr == NULL) ret = EINVAL; else *protocol = (*mattr)->m_protocol; - return(ret); + return (ret); } int _pthread_mutexattr_setprotocol(pthread_mutexattr_t *mattr, int protocol) { int ret = 0; - if ((mattr == NULL) || (*mattr == NULL) || - (protocol < PTHREAD_PRIO_NONE) || (protocol > PTHREAD_PRIO_PROTECT)) + if (mattr == NULL || *mattr == NULL || + protocol < PTHREAD_PRIO_NONE || protocol > PTHREAD_PRIO_PROTECT) ret = EINVAL; else { (*mattr)->m_protocol = protocol; (*mattr)->m_ceiling = THR_MAX_RR_PRIORITY; } - return(ret); + return (ret); } int _pthread_mutexattr_getprioceiling(pthread_mutexattr_t *mattr, int *prioceiling) { int ret = 0; - if ((mattr == NULL) || (*mattr == NULL)) + if (mattr == NULL || *mattr == NULL) ret = EINVAL; else if ((*mattr)->m_protocol != PTHREAD_PRIO_PROTECT) ret = EINVAL; else *prioceiling = (*mattr)->m_ceiling; - return(ret); + return (ret); } int _pthread_mutexattr_setprioceiling(pthread_mutexattr_t *mattr, int prioceiling) { int ret = 0; - if ((mattr == NULL) || (*mattr == NULL)) + if (mattr == NULL || *mattr == NULL) ret = EINVAL; else if ((*mattr)->m_protocol != PTHREAD_PRIO_PROTECT) ret = EINVAL; else (*mattr)->m_ceiling = prioceiling; - return(ret); + return (ret); +} + +int +_pthread_mutexattr_getrobust(pthread_mutexattr_t *mattr, int *robust) +{ + int ret; + + if (mattr == NULL || *mattr == NULL) { + ret = EINVAL; + } else { + ret = 0; + *robust = (*mattr)->m_robust; + } + return (ret); +} + +int +_pthread_mutexattr_setrobust(pthread_mutexattr_t *mattr, int robust) +{ + int ret; + + if (mattr == NULL || *mattr == NULL) { + ret = EINVAL; + } else if (robust != PTHREAD_MUTEX_STALLED && + robust != PTHREAD_MUTEX_ROBUST) { + ret = EINVAL; + } else { + ret = 0; + (*mattr)->m_robust = robust; + } + return (ret); } Index: head/lib/libthr/thread/thr_private.h =================================================================== --- head/lib/libthr/thread/thr_private.h (revision 300042) +++ head/lib/libthr/thread/thr_private.h (revision 300043) @@ -1,967 +1,988 @@ /* * Copyright (C) 2005 Daniel M. Eischen * Copyright (c) 2005 David Xu * Copyright (c) 1995-1998 John Birrell . * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _THR_PRIVATE_H #define _THR_PRIVATE_H /* * Include files. */ #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #define SYM_FB10(sym) __CONCAT(sym, _fb10) #define SYM_FBP10(sym) __CONCAT(sym, _fbp10) #define WEAK_REF(sym, alias) __weak_reference(sym, alias) #define SYM_COMPAT(sym, impl, ver) __sym_compat(sym, impl, ver) #define SYM_DEFAULT(sym, impl, ver) __sym_default(sym, impl, ver) #define FB10_COMPAT(func, sym) \ WEAK_REF(func, SYM_FB10(sym)); \ SYM_COMPAT(sym, SYM_FB10(sym), FBSD_1.0) #define FB10_COMPAT_PRIVATE(func, sym) \ WEAK_REF(func, SYM_FBP10(sym)); \ SYM_DEFAULT(sym, SYM_FBP10(sym), FBSDprivate_1.0) #include "pthread_md.h" #include "thr_umtx.h" #include "thread_db.h" #ifdef _PTHREAD_FORCED_UNWIND #define _BSD_SOURCE #include #endif typedef TAILQ_HEAD(pthreadlist, pthread) pthreadlist; typedef TAILQ_HEAD(atfork_head, pthread_atfork) atfork_head; TAILQ_HEAD(mutex_queue, pthread_mutex); /* Signal to do cancellation */ #define SIGCANCEL SIGTHR /* * Kernel fatal error handler macro. */ #define PANIC(string) _thread_exit(__FILE__,__LINE__,string) /* Output debug messages like this: */ #define stdout_debug(args...) _thread_printf(STDOUT_FILENO, ##args) #define stderr_debug(args...) _thread_printf(STDERR_FILENO, ##args) #ifdef _PTHREADS_INVARIANTS #define THR_ASSERT(cond, msg) do { \ if (__predict_false(!(cond))) \ PANIC(msg); \ } while (0) #else #define THR_ASSERT(cond, msg) #endif #ifdef PIC # define STATIC_LIB_REQUIRE(name) #else # define STATIC_LIB_REQUIRE(name) __asm (".globl " #name) #endif #define TIMESPEC_ADD(dst, src, val) \ do { \ (dst)->tv_sec = (src)->tv_sec + (val)->tv_sec; \ (dst)->tv_nsec = (src)->tv_nsec + (val)->tv_nsec; \ if ((dst)->tv_nsec >= 1000000000) { \ (dst)->tv_sec++; \ (dst)->tv_nsec -= 1000000000; \ } \ } while (0) #define TIMESPEC_SUB(dst, src, val) \ do { \ (dst)->tv_sec = (src)->tv_sec - (val)->tv_sec; \ (dst)->tv_nsec = (src)->tv_nsec - (val)->tv_nsec; \ if ((dst)->tv_nsec < 0) { \ (dst)->tv_sec--; \ (dst)->tv_nsec += 1000000000; \ } \ } while (0) /* Magic cookie set for shared pthread locks and cv's pointers */ #define THR_PSHARED_PTR \ ((void *)(uintptr_t)((1ULL << (NBBY * sizeof(long) - 1)) | 1)) /* XXX These values should be same as those defined in pthread.h */ #define THR_MUTEX_INITIALIZER ((struct pthread_mutex *)NULL) #define THR_ADAPTIVE_MUTEX_INITIALIZER ((struct pthread_mutex *)1) #define THR_MUTEX_DESTROYED ((struct pthread_mutex *)2) #define THR_COND_INITIALIZER ((struct pthread_cond *)NULL) #define THR_COND_DESTROYED ((struct pthread_cond *)1) #define THR_RWLOCK_INITIALIZER ((struct pthread_rwlock *)NULL) #define THR_RWLOCK_DESTROYED ((struct pthread_rwlock *)1) #define PMUTEX_FLAG_TYPE_MASK 0x0ff #define PMUTEX_FLAG_PRIVATE 0x100 -#define PMUTEX_FLAG_DEFERED 0x200 +#define PMUTEX_FLAG_DEFERRED 0x200 #define PMUTEX_TYPE(mtxflags) ((mtxflags) & PMUTEX_FLAG_TYPE_MASK) +#define PMUTEX_OWNER_ID(m) ((m)->m_lock.m_owner & ~UMUTEX_CONTESTED) + #define MAX_DEFER_WAITERS 50 /* * Values for pthread_mutex m_ps indicator. */ #define PMUTEX_INITSTAGE_ALLOC 0 #define PMUTEX_INITSTAGE_BUSY 1 #define PMUTEX_INITSTAGE_DONE 2 struct pthread_mutex { /* * Lock for accesses to this structure. */ struct umutex m_lock; int m_flags; - uint32_t m_owner; int m_count; int m_spinloops; int m_yieldloops; int m_ps; /* pshared init stage */ /* * Link for all mutexes a thread currently owns, of the same * prio type. */ TAILQ_ENTRY(pthread_mutex) m_qe; /* Link for all private mutexes a thread currently owns. */ TAILQ_ENTRY(pthread_mutex) m_pqe; + struct pthread_mutex *m_rb_prev; }; struct pthread_mutex_attr { enum pthread_mutextype m_type; int m_protocol; int m_ceiling; int m_pshared; + int m_robust; }; #define PTHREAD_MUTEXATTR_STATIC_INITIALIZER \ - { PTHREAD_MUTEX_DEFAULT, PTHREAD_PRIO_NONE, 0, MUTEX_FLAGS_PRIVATE } + { PTHREAD_MUTEX_DEFAULT, PTHREAD_PRIO_NONE, 0, MUTEX_FLAGS_PRIVATE, \ + PTHREAD_MUTEX_STALLED } struct pthread_cond { __uint32_t __has_user_waiters; __uint32_t __has_kern_waiters; __uint32_t __flags; __uint32_t __clock_id; }; struct pthread_cond_attr { int c_pshared; int c_clockid; }; struct pthread_barrier { struct umutex b_lock; struct ucond b_cv; int64_t b_cycle; int b_count; int b_waiters; int b_refcount; int b_destroying; }; struct pthread_barrierattr { int pshared; }; struct pthread_spinlock { struct umutex s_lock; }; /* * Flags for condition variables. */ #define COND_FLAGS_PRIVATE 0x01 #define COND_FLAGS_INITED 0x02 #define COND_FLAGS_BUSY 0x04 /* * Cleanup definitions. */ struct pthread_cleanup { struct pthread_cleanup *prev; void (*routine)(void *); void *routine_arg; int onheap; }; #define THR_CLEANUP_PUSH(td, func, arg) { \ struct pthread_cleanup __cup; \ \ __cup.routine = func; \ __cup.routine_arg = arg; \ __cup.onheap = 0; \ __cup.prev = (td)->cleanup; \ (td)->cleanup = &__cup; #define THR_CLEANUP_POP(td, exec) \ (td)->cleanup = __cup.prev; \ if ((exec) != 0) \ __cup.routine(__cup.routine_arg); \ } struct pthread_atfork { TAILQ_ENTRY(pthread_atfork) qe; void (*prepare)(void); void (*parent)(void); void (*child)(void); }; struct pthread_attr { #define pthread_attr_start_copy sched_policy int sched_policy; int sched_inherit; int prio; int suspend; #define THR_STACK_USER 0x100 /* 0xFF reserved for */ int flags; void *stackaddr_attr; size_t stacksize_attr; size_t guardsize_attr; #define pthread_attr_end_copy cpuset cpuset_t *cpuset; size_t cpusetsize; }; struct wake_addr { struct wake_addr *link; unsigned int value; char pad[12]; }; struct sleepqueue { TAILQ_HEAD(, pthread) sq_blocked; SLIST_HEAD(, sleepqueue) sq_freeq; LIST_ENTRY(sleepqueue) sq_hash; SLIST_ENTRY(sleepqueue) sq_flink; void *sq_wchan; int sq_type; }; /* * Thread creation state attributes. */ #define THR_CREATE_RUNNING 0 #define THR_CREATE_SUSPENDED 1 /* * Miscellaneous definitions. */ #define THR_STACK_DEFAULT (sizeof(void *) / 4 * 1024 * 1024) /* * Maximum size of initial thread's stack. This perhaps deserves to be larger * than the stacks of other threads, since many applications are likely to run * almost entirely on this stack. */ #define THR_STACK_INITIAL (THR_STACK_DEFAULT * 2) /* * Define priorities returned by kernel. */ #define THR_MIN_PRIORITY (_thr_priorities[SCHED_OTHER-1].pri_min) #define THR_MAX_PRIORITY (_thr_priorities[SCHED_OTHER-1].pri_max) #define THR_DEF_PRIORITY (_thr_priorities[SCHED_OTHER-1].pri_default) #define THR_MIN_RR_PRIORITY (_thr_priorities[SCHED_RR-1].pri_min) #define THR_MAX_RR_PRIORITY (_thr_priorities[SCHED_RR-1].pri_max) #define THR_DEF_RR_PRIORITY (_thr_priorities[SCHED_RR-1].pri_default) /* XXX The SCHED_FIFO should have same priority range as SCHED_RR */ #define THR_MIN_FIFO_PRIORITY (_thr_priorities[SCHED_FIFO_1].pri_min) #define THR_MAX_FIFO_PRIORITY (_thr_priorities[SCHED_FIFO-1].pri_max) #define THR_DEF_FIFO_PRIORITY (_thr_priorities[SCHED_FIFO-1].pri_default) struct pthread_prio { int pri_min; int pri_max; int pri_default; }; struct pthread_rwlockattr { int pshared; }; struct pthread_rwlock { struct urwlock lock; uint32_t owner; }; /* * Thread states. */ enum pthread_state { PS_RUNNING, PS_DEAD }; struct pthread_specific_elem { const void *data; int seqno; }; struct pthread_key { volatile int allocated; int seqno; void (*destructor)(void *); }; /* * lwpid_t is 32bit but kernel thr API exports tid as long type * to preserve the ABI for M:N model in very early date (r131431). */ #define TID(thread) ((uint32_t) ((thread)->tid)) /* * Thread structure. */ struct pthread { #define _pthread_startzero tid /* Kernel thread id. */ long tid; #define TID_TERMINATED 1 /* * Lock for accesses to this thread structure. */ struct umutex lock; /* Internal condition variable cycle number. */ uint32_t cycle; /* How many low level locks the thread held. */ int locklevel; /* * Set to non-zero when this thread has entered a critical * region. We allow for recursive entries into critical regions. */ int critical_count; /* Signal blocked counter. */ int sigblock; /* Queue entry for list of all threads. */ TAILQ_ENTRY(pthread) tle; /* link for all threads in process */ /* Queue entry for GC lists. */ TAILQ_ENTRY(pthread) gcle; /* Hash queue entry. */ LIST_ENTRY(pthread) hle; /* Sleep queue entry */ TAILQ_ENTRY(pthread) wle; /* Threads reference count. */ int refcount; /* * Thread start routine, argument, stack pointer and thread * attributes. */ void *(*start_routine)(void *); void *arg; struct pthread_attr attr; #define SHOULD_CANCEL(thr) \ ((thr)->cancel_pending && (thr)->cancel_enable && \ (thr)->no_cancel == 0) /* Cancellation is enabled */ int cancel_enable; /* Cancellation request is pending */ int cancel_pending; /* Thread is at cancellation point */ int cancel_point; /* Cancellation is temporarily disabled */ int no_cancel; /* Asynchronouse cancellation is enabled */ int cancel_async; /* Cancellation is in progress */ int cancelling; /* Thread temporary signal mask. */ sigset_t sigmask; /* Thread should unblock SIGCANCEL. */ int unblock_sigcancel; /* In sigsuspend state */ int in_sigsuspend; /* deferred signal info */ siginfo_t deferred_siginfo; /* signal mask to restore. */ sigset_t deferred_sigmask; /* the sigaction should be used for deferred signal. */ struct sigaction deferred_sigact; /* deferred signal delivery is performed, do not reenter. */ int deferred_run; /* Force new thread to exit. */ int force_exit; /* Thread state: */ enum pthread_state state; /* * Error variable used instead of errno. The function __error() * returns a pointer to this. */ int error; /* * The joiner is the thread that is joining to this thread. The * join status keeps track of a join operation to another thread. */ struct pthread *joiner; /* Miscellaneous flags; only set with scheduling lock held. */ int flags; #define THR_FLAGS_PRIVATE 0x0001 #define THR_FLAGS_NEED_SUSPEND 0x0002 /* thread should be suspended */ #define THR_FLAGS_SUSPENDED 0x0004 /* thread is suspended */ #define THR_FLAGS_DETACHED 0x0008 /* thread is detached */ /* Thread list flags; only set with thread list lock held. */ int tlflags; #define TLFLAGS_GC_SAFE 0x0001 /* thread safe for cleaning */ #define TLFLAGS_IN_TDLIST 0x0002 /* thread in all thread list */ #define TLFLAGS_IN_GCLIST 0x0004 /* thread in gc list */ /* * Queues of the owned mutexes. Private queue must have index * + 1 of the corresponding full queue. */ #define TMQ_NORM 0 /* NORMAL or PRIO_INHERIT normal */ #define TMQ_NORM_PRIV 1 /* NORMAL or PRIO_INHERIT normal priv */ #define TMQ_NORM_PP 2 /* PRIO_PROTECT normal mutexes */ #define TMQ_NORM_PP_PRIV 3 /* PRIO_PROTECT normal priv */ -#define TMQ_NITEMS 4 +#define TMQ_ROBUST_PP 4 /* PRIO_PROTECT robust mutexes */ +#define TMQ_ROBUST_PP_PRIV 5 /* PRIO_PROTECT robust priv */ +#define TMQ_NITEMS 6 struct mutex_queue mq[TMQ_NITEMS]; void *ret; struct pthread_specific_elem *specific; int specific_data_count; /* Number rwlocks rdlocks held. */ int rdlock_count; /* * Current locks bitmap for rtld. */ int rtld_bits; /* Thread control block */ struct tcb *tcb; /* Cleanup handlers Link List */ struct pthread_cleanup *cleanup; #ifdef _PTHREAD_FORCED_UNWIND struct _Unwind_Exception ex; void *unwind_stackend; int unwind_disabled; #endif /* * Magic value to help recognize a valid thread structure * from an invalid one: */ #define THR_MAGIC ((u_int32_t) 0xd09ba115) u_int32_t magic; /* Enable event reporting */ int report_events; /* Event mask */ int event_mask; /* Event */ td_event_msg_t event_buf; /* Wait channel */ void *wchan; /* Referenced mutex. */ struct pthread_mutex *mutex_obj; /* Thread will sleep. */ int will_sleep; /* Number of threads deferred. */ int nwaiter_defer; + int robust_inited; + uintptr_t robust_list; + uintptr_t priv_robust_list; + uintptr_t inact_mtx; + /* Deferred threads from pthread_cond_signal. */ unsigned int *defer_waiters[MAX_DEFER_WAITERS]; #define _pthread_endzero wake_addr struct wake_addr *wake_addr; #define WAKE_ADDR(td) ((td)->wake_addr) /* Sleep queue */ struct sleepqueue *sleepqueue; }; #define THR_SHOULD_GC(thrd) \ ((thrd)->refcount == 0 && (thrd)->state == PS_DEAD && \ ((thrd)->flags & THR_FLAGS_DETACHED) != 0) #define THR_IN_CRITICAL(thrd) \ (((thrd)->locklevel > 0) || \ ((thrd)->critical_count > 0)) #define THR_CRITICAL_ENTER(thrd) \ (thrd)->critical_count++ #define THR_CRITICAL_LEAVE(thrd) \ do { \ (thrd)->critical_count--; \ _thr_ast(thrd); \ } while (0) #define THR_UMUTEX_TRYLOCK(thrd, lck) \ _thr_umutex_trylock((lck), TID(thrd)) #define THR_UMUTEX_LOCK(thrd, lck) \ _thr_umutex_lock((lck), TID(thrd)) #define THR_UMUTEX_TIMEDLOCK(thrd, lck, timo) \ _thr_umutex_timedlock((lck), TID(thrd), (timo)) #define THR_UMUTEX_UNLOCK(thrd, lck) \ _thr_umutex_unlock((lck), TID(thrd)) #define THR_LOCK_ACQUIRE(thrd, lck) \ do { \ (thrd)->locklevel++; \ _thr_umutex_lock(lck, TID(thrd)); \ } while (0) #define THR_LOCK_ACQUIRE_SPIN(thrd, lck) \ do { \ (thrd)->locklevel++; \ _thr_umutex_lock_spin(lck, TID(thrd)); \ } while (0) #ifdef _PTHREADS_INVARIANTS #define THR_ASSERT_LOCKLEVEL(thrd) \ do { \ if (__predict_false((thrd)->locklevel <= 0)) \ _thr_assert_lock_level(); \ } while (0) #else #define THR_ASSERT_LOCKLEVEL(thrd) #endif #define THR_LOCK_RELEASE(thrd, lck) \ do { \ THR_ASSERT_LOCKLEVEL(thrd); \ _thr_umutex_unlock((lck), TID(thrd)); \ (thrd)->locklevel--; \ _thr_ast(thrd); \ } while (0) #define THR_LOCK(curthrd) THR_LOCK_ACQUIRE(curthrd, &(curthrd)->lock) #define THR_UNLOCK(curthrd) THR_LOCK_RELEASE(curthrd, &(curthrd)->lock) #define THR_THREAD_LOCK(curthrd, thr) THR_LOCK_ACQUIRE(curthrd, &(thr)->lock) #define THR_THREAD_UNLOCK(curthrd, thr) THR_LOCK_RELEASE(curthrd, &(thr)->lock) #define THREAD_LIST_RDLOCK(curthrd) \ do { \ (curthrd)->locklevel++; \ _thr_rwl_rdlock(&_thr_list_lock); \ } while (0) #define THREAD_LIST_WRLOCK(curthrd) \ do { \ (curthrd)->locklevel++; \ _thr_rwl_wrlock(&_thr_list_lock); \ } while (0) #define THREAD_LIST_UNLOCK(curthrd) \ do { \ _thr_rwl_unlock(&_thr_list_lock); \ (curthrd)->locklevel--; \ _thr_ast(curthrd); \ } while (0) /* * Macros to insert/remove threads to the all thread list and * the gc list. */ #define THR_LIST_ADD(thrd) do { \ if (((thrd)->tlflags & TLFLAGS_IN_TDLIST) == 0) { \ TAILQ_INSERT_HEAD(&_thread_list, thrd, tle); \ _thr_hash_add(thrd); \ (thrd)->tlflags |= TLFLAGS_IN_TDLIST; \ } \ } while (0) #define THR_LIST_REMOVE(thrd) do { \ if (((thrd)->tlflags & TLFLAGS_IN_TDLIST) != 0) { \ TAILQ_REMOVE(&_thread_list, thrd, tle); \ _thr_hash_remove(thrd); \ (thrd)->tlflags &= ~TLFLAGS_IN_TDLIST; \ } \ } while (0) #define THR_GCLIST_ADD(thrd) do { \ if (((thrd)->tlflags & TLFLAGS_IN_GCLIST) == 0) { \ TAILQ_INSERT_HEAD(&_thread_gc_list, thrd, gcle);\ (thrd)->tlflags |= TLFLAGS_IN_GCLIST; \ _gc_count++; \ } \ } while (0) #define THR_GCLIST_REMOVE(thrd) do { \ if (((thrd)->tlflags & TLFLAGS_IN_GCLIST) != 0) { \ TAILQ_REMOVE(&_thread_gc_list, thrd, gcle); \ (thrd)->tlflags &= ~TLFLAGS_IN_GCLIST; \ _gc_count--; \ } \ } while (0) #define THR_REF_ADD(curthread, pthread) { \ THR_CRITICAL_ENTER(curthread); \ pthread->refcount++; \ } while (0) #define THR_REF_DEL(curthread, pthread) { \ pthread->refcount--; \ THR_CRITICAL_LEAVE(curthread); \ } while (0) #define GC_NEEDED() (_gc_count >= 5) #define SHOULD_REPORT_EVENT(curthr, e) \ (curthr->report_events && \ (((curthr)->event_mask | _thread_event_mask ) & e) != 0) extern int __isthreaded; /* * Global variables for the pthread kernel. */ extern char *_usrstack __hidden; extern struct pthread *_thr_initial __hidden; /* For debugger */ extern int _libthr_debug; extern int _thread_event_mask; extern struct pthread *_thread_last_event; /* List of all threads: */ extern pthreadlist _thread_list; /* List of threads needing GC: */ extern pthreadlist _thread_gc_list __hidden; extern int _thread_active_threads; extern atfork_head _thr_atfork_list __hidden; extern struct urwlock _thr_atfork_lock __hidden; /* Default thread attributes: */ extern struct pthread_attr _pthread_attr_default __hidden; /* Default mutex attributes: */ extern struct pthread_mutex_attr _pthread_mutexattr_default __hidden; extern struct pthread_mutex_attr _pthread_mutexattr_adaptive_default __hidden; /* Default condition variable attributes: */ extern struct pthread_cond_attr _pthread_condattr_default __hidden; extern struct pthread_prio _thr_priorities[] __hidden; extern int _thr_is_smp __hidden; extern size_t _thr_guard_default __hidden; extern size_t _thr_stack_default __hidden; extern size_t _thr_stack_initial __hidden; extern int _thr_page_size __hidden; extern int _thr_spinloops __hidden; extern int _thr_yieldloops __hidden; extern int _thr_queuefifo __hidden; /* Garbage thread count. */ extern int _gc_count __hidden; extern struct umutex _mutex_static_lock __hidden; extern struct umutex _cond_static_lock __hidden; extern struct umutex _rwlock_static_lock __hidden; extern struct umutex _keytable_lock __hidden; extern struct urwlock _thr_list_lock __hidden; extern struct umutex _thr_event_lock __hidden; extern struct umutex _suspend_all_lock __hidden; extern int _suspend_all_waiters __hidden; extern int _suspend_all_cycle __hidden; extern struct pthread *_single_thread __hidden; /* * Function prototype definitions. */ __BEGIN_DECLS int _thr_setthreaded(int) __hidden; -int _mutex_cv_lock(struct pthread_mutex *, int) __hidden; +int _mutex_cv_lock(struct pthread_mutex *, int, bool) __hidden; int _mutex_cv_unlock(struct pthread_mutex *, int *, int *) __hidden; int _mutex_cv_attach(struct pthread_mutex *, int) __hidden; int _mutex_cv_detach(struct pthread_mutex *, int *) __hidden; int _mutex_owned(struct pthread *, const struct pthread_mutex *) __hidden; int _mutex_reinit(pthread_mutex_t *) __hidden; void _mutex_fork(struct pthread *curthread) __hidden; +int _mutex_enter_robust(struct pthread *curthread, struct pthread_mutex *m) + __hidden; +void _mutex_leave_robust(struct pthread *curthread, struct pthread_mutex *m) + __hidden; void _libpthread_init(struct pthread *) __hidden; struct pthread *_thr_alloc(struct pthread *) __hidden; void _thread_exit(const char *, int, const char *) __hidden __dead2; int _thr_ref_add(struct pthread *, struct pthread *, int) __hidden; void _thr_ref_delete(struct pthread *, struct pthread *) __hidden; void _thr_ref_delete_unlocked(struct pthread *, struct pthread *) __hidden; int _thr_find_thread(struct pthread *, struct pthread *, int) __hidden; void _thr_rtld_init(void) __hidden; void _thr_rtld_postfork_child(void) __hidden; int _thr_stack_alloc(struct pthread_attr *) __hidden; void _thr_stack_free(struct pthread_attr *) __hidden; void _thr_free(struct pthread *, struct pthread *) __hidden; void _thr_gc(struct pthread *) __hidden; void _thread_cleanupspecific(void) __hidden; void _thread_printf(int, const char *, ...) __hidden; void _thr_spinlock_init(void) __hidden; void _thr_cancel_enter(struct pthread *) __hidden; void _thr_cancel_enter2(struct pthread *, int) __hidden; void _thr_cancel_leave(struct pthread *, int) __hidden; void _thr_testcancel(struct pthread *) __hidden; void _thr_signal_block(struct pthread *) __hidden; void _thr_signal_unblock(struct pthread *) __hidden; void _thr_signal_init(int) __hidden; void _thr_signal_deinit(void) __hidden; int _thr_send_sig(struct pthread *, int sig) __hidden; void _thr_list_init(void) __hidden; void _thr_hash_add(struct pthread *) __hidden; void _thr_hash_remove(struct pthread *) __hidden; struct pthread *_thr_hash_find(struct pthread *) __hidden; void _thr_link(struct pthread *, struct pthread *) __hidden; void _thr_unlink(struct pthread *, struct pthread *) __hidden; void _thr_assert_lock_level(void) __hidden __dead2; void _thr_ast(struct pthread *) __hidden; void _thr_once_init(void) __hidden; void _thr_report_creation(struct pthread *curthread, struct pthread *newthread) __hidden; void _thr_report_death(struct pthread *curthread) __hidden; int _thr_getscheduler(lwpid_t, int *, struct sched_param *) __hidden; int _thr_setscheduler(lwpid_t, int, const struct sched_param *) __hidden; void _thr_signal_prefork(void) __hidden; void _thr_signal_postfork(void) __hidden; void _thr_signal_postfork_child(void) __hidden; void _thr_suspend_all_lock(struct pthread *) __hidden; void _thr_suspend_all_unlock(struct pthread *) __hidden; void _thr_try_gc(struct pthread *, struct pthread *) __hidden; int _rtp_to_schedparam(const struct rtprio *rtp, int *policy, struct sched_param *param) __hidden; int _schedparam_to_rtp(int policy, const struct sched_param *param, struct rtprio *rtp) __hidden; void _thread_bp_create(void); void _thread_bp_death(void); int _sched_yield(void); void _pthread_cleanup_push(void (*)(void *), void *); void _pthread_cleanup_pop(int); void _pthread_exit_mask(void *status, sigset_t *mask) __dead2 __hidden; void _pthread_cancel_enter(int maycancel); void _pthread_cancel_leave(int maycancel); +int _pthread_mutex_consistent(pthread_mutex_t *) __nonnull(1); +int _pthread_mutexattr_getrobust(pthread_mutexattr_t *__restrict, + int *__restrict) __nonnull_all; +int _pthread_mutexattr_setrobust(pthread_mutexattr_t *, int) + __nonnull(1); /* #include */ #ifdef _SYS_FCNTL_H_ int __sys_fcntl(int, int, ...); int __sys_openat(int, const char *, int, ...); #endif /* #include */ #ifdef _SIGNAL_H_ int __sys_kill(pid_t, int); int __sys_sigaction(int, const struct sigaction *, struct sigaction *); int __sys_sigpending(sigset_t *); int __sys_sigprocmask(int, const sigset_t *, sigset_t *); int __sys_sigsuspend(const sigset_t *); int __sys_sigreturn(const ucontext_t *); int __sys_sigaltstack(const struct sigaltstack *, struct sigaltstack *); int __sys_sigwait(const sigset_t *, int *); int __sys_sigtimedwait(const sigset_t *, siginfo_t *, const struct timespec *); int __sys_sigwaitinfo(const sigset_t *set, siginfo_t *info); #endif /* #include */ #ifdef _TIME_H_ int __sys_nanosleep(const struct timespec *, struct timespec *); #endif /* #include */ #ifdef _SYS_UCONTEXT_H_ int __sys_setcontext(const ucontext_t *ucp); int __sys_swapcontext(ucontext_t *oucp, const ucontext_t *ucp); #endif /* #include */ #ifdef _UNISTD_H_ int __sys_close(int); int __sys_fork(void); pid_t __sys_getpid(void); ssize_t __sys_read(int, void *, size_t); void __sys_exit(int); #endif static inline int _thr_isthreaded(void) { return (__isthreaded != 0); } static inline int _thr_is_inited(void) { return (_thr_initial != NULL); } static inline void _thr_check_init(void) { if (_thr_initial == NULL) _libpthread_init(NULL); } struct wake_addr *_thr_alloc_wake_addr(void); void _thr_release_wake_addr(struct wake_addr *); int _thr_sleep(struct pthread *, int, const struct timespec *); void _thr_wake_addr_init(void) __hidden; static inline void _thr_clear_wake(struct pthread *td) { td->wake_addr->value = 0; } static inline int _thr_is_woken(struct pthread *td) { return td->wake_addr->value != 0; } static inline void _thr_set_wake(unsigned int *waddr) { *waddr = 1; _thr_umtx_wake(waddr, INT_MAX, 0); } void _thr_wake_all(unsigned int *waddrs[], int) __hidden; static inline struct pthread * _sleepq_first(struct sleepqueue *sq) { return TAILQ_FIRST(&sq->sq_blocked); } void _sleepq_init(void) __hidden; struct sleepqueue *_sleepq_alloc(void) __hidden; void _sleepq_free(struct sleepqueue *) __hidden; void _sleepq_lock(void *) __hidden; void _sleepq_unlock(void *) __hidden; struct sleepqueue *_sleepq_lookup(void *) __hidden; void _sleepq_add(void *, struct pthread *) __hidden; int _sleepq_remove(struct sleepqueue *, struct pthread *) __hidden; void _sleepq_drop(struct sleepqueue *, void (*cb)(struct pthread *, void *arg), void *) __hidden; int _pthread_mutex_init_calloc_cb(pthread_mutex_t *mutex, void *(calloc_cb)(size_t, size_t)); struct dl_phdr_info; void __pthread_cxa_finalize(struct dl_phdr_info *phdr_info); void _thr_tsd_unload(struct dl_phdr_info *phdr_info) __hidden; void _thr_sigact_unload(struct dl_phdr_info *phdr_info) __hidden; void _thr_stack_fix_protection(struct pthread *thrd); int *__error_threaded(void) __hidden; void __thr_interpose_libc(void) __hidden; pid_t __thr_fork(void); int __thr_setcontext(const ucontext_t *ucp); int __thr_sigaction(int sig, const struct sigaction *act, struct sigaction *oact) __hidden; int __thr_sigprocmask(int how, const sigset_t *set, sigset_t *oset); int __thr_sigsuspend(const sigset_t * set); int __thr_sigtimedwait(const sigset_t *set, siginfo_t *info, const struct timespec * timeout); int __thr_sigwait(const sigset_t *set, int *sig); int __thr_sigwaitinfo(const sigset_t *set, siginfo_t *info); int __thr_swapcontext(ucontext_t *oucp, const ucontext_t *ucp); void __thr_map_stacks_exec(void); struct _spinlock; void __thr_spinunlock(struct _spinlock *lck); void __thr_spinlock(struct _spinlock *lck); struct tcb *_tcb_ctor(struct pthread *, int); void _tcb_dtor(struct tcb *); void __thr_pshared_init(void) __hidden; void *__thr_pshared_offpage(void *key, int doalloc) __hidden; void __thr_pshared_destroy(void *key) __hidden; void __thr_pshared_atfork_pre(void) __hidden; void __thr_pshared_atfork_post(void) __hidden; __END_DECLS #endif /* !_THR_PRIVATE_H */ Index: head/lib/libthr/thread/thr_umtx.c =================================================================== --- head/lib/libthr/thread/thr_umtx.c (revision 300042) +++ head/lib/libthr/thread/thr_umtx.c (revision 300043) @@ -1,343 +1,374 @@ /* * Copyright (c) 2005 David Xu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "thr_private.h" #include "thr_umtx.h" #ifndef HAS__UMTX_OP_ERR int _umtx_op_err(void *obj, int op, u_long val, void *uaddr, void *uaddr2) { + if (_umtx_op(obj, op, val, uaddr, uaddr2) == -1) return (errno); return (0); } #endif void _thr_umutex_init(struct umutex *mtx) { static const struct umutex default_mtx = DEFAULT_UMUTEX; *mtx = default_mtx; } void _thr_urwlock_init(struct urwlock *rwl) { static const struct urwlock default_rwl = DEFAULT_URWLOCK; *rwl = default_rwl; } int __thr_umutex_lock(struct umutex *mtx, uint32_t id) { uint32_t owner; - if ((mtx->m_flags & (UMUTEX_PRIO_PROTECT | UMUTEX_PRIO_INHERIT)) == 0) { - for (;;) { - /* wait in kernel */ - _umtx_op_err(mtx, UMTX_OP_MUTEX_WAIT, 0, 0, 0); + if ((mtx->m_flags & (UMUTEX_PRIO_PROTECT | UMUTEX_PRIO_INHERIT)) != 0) + return (_umtx_op_err(mtx, UMTX_OP_MUTEX_LOCK, 0, 0, 0)); - owner = mtx->m_owner; - if ((owner & ~UMUTEX_CONTESTED) == 0 && - atomic_cmpset_acq_32(&mtx->m_owner, owner, id|owner)) - return (0); - } - } + for (;;) { + owner = mtx->m_owner; + if ((owner & ~UMUTEX_CONTESTED) == 0 && + atomic_cmpset_acq_32(&mtx->m_owner, owner, id | owner)) + return (0); + if (owner == UMUTEX_RB_OWNERDEAD && + atomic_cmpset_acq_32(&mtx->m_owner, owner, + id | UMUTEX_CONTESTED)) + return (EOWNERDEAD); + if (owner == UMUTEX_RB_NOTRECOV) + return (ENOTRECOVERABLE); - return _umtx_op_err(mtx, UMTX_OP_MUTEX_LOCK, 0, 0, 0); + /* wait in kernel */ + _umtx_op_err(mtx, UMTX_OP_MUTEX_WAIT, 0, 0, 0); + } } #define SPINLOOPS 1000 int __thr_umutex_lock_spin(struct umutex *mtx, uint32_t id) { uint32_t owner; + int count; if (!_thr_is_smp) - return __thr_umutex_lock(mtx, id); + return (__thr_umutex_lock(mtx, id)); + if ((mtx->m_flags & (UMUTEX_PRIO_PROTECT | UMUTEX_PRIO_INHERIT)) != 0) + return (_umtx_op_err(mtx, UMTX_OP_MUTEX_LOCK, 0, 0, 0)); - if ((mtx->m_flags & (UMUTEX_PRIO_PROTECT | UMUTEX_PRIO_INHERIT)) == 0) { - for (;;) { - int count = SPINLOOPS; - while (count--) { - owner = mtx->m_owner; - if ((owner & ~UMUTEX_CONTESTED) == 0) { - if (atomic_cmpset_acq_32( - &mtx->m_owner, - owner, id|owner)) { - return (0); - } - } - CPU_SPINWAIT; - } - - /* wait in kernel */ - _umtx_op_err(mtx, UMTX_OP_MUTEX_WAIT, 0, 0, 0); + for (;;) { + count = SPINLOOPS; + while (count--) { + owner = mtx->m_owner; + if ((owner & ~UMUTEX_CONTESTED) == 0 && + atomic_cmpset_acq_32(&mtx->m_owner, owner, + id | owner)) + return (0); + if (__predict_false(owner == UMUTEX_RB_OWNERDEAD) && + atomic_cmpset_acq_32(&mtx->m_owner, owner, + id | UMUTEX_CONTESTED)) + return (EOWNERDEAD); + if (__predict_false(owner == UMUTEX_RB_NOTRECOV)) + return (ENOTRECOVERABLE); + CPU_SPINWAIT; } - } - return _umtx_op_err(mtx, UMTX_OP_MUTEX_LOCK, 0, 0, 0); + /* wait in kernel */ + _umtx_op_err(mtx, UMTX_OP_MUTEX_WAIT, 0, 0, 0); + } } int __thr_umutex_timedlock(struct umutex *mtx, uint32_t id, const struct timespec *abstime) { struct _umtx_time *tm_p, timeout; size_t tm_size; uint32_t owner; int ret; if (abstime == NULL) { tm_p = NULL; tm_size = 0; } else { timeout._clockid = CLOCK_REALTIME; timeout._flags = UMTX_ABSTIME; timeout._timeout = *abstime; tm_p = &timeout; tm_size = sizeof(timeout); } for (;;) { - if ((mtx->m_flags & (UMUTEX_PRIO_PROTECT | UMUTEX_PRIO_INHERIT)) == 0) { - - /* wait in kernel */ - ret = _umtx_op_err(mtx, UMTX_OP_MUTEX_WAIT, 0, - (void *)tm_size, __DECONST(void *, tm_p)); - - /* now try to lock it */ + if ((mtx->m_flags & (UMUTEX_PRIO_PROTECT | + UMUTEX_PRIO_INHERIT)) == 0) { + /* try to lock it */ owner = mtx->m_owner; if ((owner & ~UMUTEX_CONTESTED) == 0 && - atomic_cmpset_acq_32(&mtx->m_owner, owner, id|owner)) + atomic_cmpset_acq_32(&mtx->m_owner, owner, + id | owner)) return (0); + if (__predict_false(owner == UMUTEX_RB_OWNERDEAD) && + atomic_cmpset_acq_32(&mtx->m_owner, owner, + id | UMUTEX_CONTESTED)) + return (EOWNERDEAD); + if (__predict_false(owner == UMUTEX_RB_NOTRECOV)) + return (ENOTRECOVERABLE); + /* wait in kernel */ + ret = _umtx_op_err(mtx, UMTX_OP_MUTEX_WAIT, 0, + (void *)tm_size, __DECONST(void *, tm_p)); } else { ret = _umtx_op_err(mtx, UMTX_OP_MUTEX_LOCK, 0, - (void *)tm_size, __DECONST(void *, tm_p)); - if (ret == 0) + (void *)tm_size, __DECONST(void *, tm_p)); + if (ret == 0 || ret == EOWNERDEAD || + ret == ENOTRECOVERABLE) break; } if (ret == ETIMEDOUT) break; } return (ret); } int __thr_umutex_unlock(struct umutex *mtx, uint32_t id) { - return _umtx_op_err(mtx, UMTX_OP_MUTEX_UNLOCK, 0, 0, 0); + + return (_umtx_op_err(mtx, UMTX_OP_MUTEX_UNLOCK, 0, 0, 0)); } int __thr_umutex_trylock(struct umutex *mtx) { - return _umtx_op_err(mtx, UMTX_OP_MUTEX_TRYLOCK, 0, 0, 0); + + return (_umtx_op_err(mtx, UMTX_OP_MUTEX_TRYLOCK, 0, 0, 0)); } int __thr_umutex_set_ceiling(struct umutex *mtx, uint32_t ceiling, - uint32_t *oldceiling) + uint32_t *oldceiling) { - return _umtx_op_err(mtx, UMTX_OP_SET_CEILING, ceiling, oldceiling, 0); + + return (_umtx_op_err(mtx, UMTX_OP_SET_CEILING, ceiling, oldceiling, 0)); } int _thr_umtx_wait(volatile long *mtx, long id, const struct timespec *timeout) { + if (timeout && (timeout->tv_sec < 0 || (timeout->tv_sec == 0 && - timeout->tv_nsec <= 0))) + timeout->tv_nsec <= 0))) return (ETIMEDOUT); - return _umtx_op_err(__DEVOLATILE(void *, mtx), UMTX_OP_WAIT, id, 0, - __DECONST(void*, timeout)); + return (_umtx_op_err(__DEVOLATILE(void *, mtx), UMTX_OP_WAIT, id, 0, + __DECONST(void*, timeout))); } int -_thr_umtx_wait_uint(volatile u_int *mtx, u_int id, const struct timespec *timeout, int shared) +_thr_umtx_wait_uint(volatile u_int *mtx, u_int id, + const struct timespec *timeout, int shared) { + if (timeout && (timeout->tv_sec < 0 || (timeout->tv_sec == 0 && - timeout->tv_nsec <= 0))) + timeout->tv_nsec <= 0))) return (ETIMEDOUT); - return _umtx_op_err(__DEVOLATILE(void *, mtx), - shared ? UMTX_OP_WAIT_UINT : UMTX_OP_WAIT_UINT_PRIVATE, id, 0, - __DECONST(void*, timeout)); + return (_umtx_op_err(__DEVOLATILE(void *, mtx), shared ? + UMTX_OP_WAIT_UINT : UMTX_OP_WAIT_UINT_PRIVATE, id, 0, + __DECONST(void*, timeout))); } int _thr_umtx_timedwait_uint(volatile u_int *mtx, u_int id, int clockid, - const struct timespec *abstime, int shared) + const struct timespec *abstime, int shared) { struct _umtx_time *tm_p, timeout; size_t tm_size; if (abstime == NULL) { tm_p = NULL; tm_size = 0; } else { timeout._clockid = clockid; timeout._flags = UMTX_ABSTIME; timeout._timeout = *abstime; tm_p = &timeout; tm_size = sizeof(timeout); } - return _umtx_op_err(__DEVOLATILE(void *, mtx), - shared ? UMTX_OP_WAIT_UINT : UMTX_OP_WAIT_UINT_PRIVATE, id, - (void *)tm_size, __DECONST(void *, tm_p)); + return (_umtx_op_err(__DEVOLATILE(void *, mtx), shared ? + UMTX_OP_WAIT_UINT : UMTX_OP_WAIT_UINT_PRIVATE, id, + (void *)tm_size, __DECONST(void *, tm_p))); } int _thr_umtx_wake(volatile void *mtx, int nr_wakeup, int shared) { - return _umtx_op_err(__DEVOLATILE(void *, mtx), shared ? UMTX_OP_WAKE : UMTX_OP_WAKE_PRIVATE, - nr_wakeup, 0, 0); + + return (_umtx_op_err(__DEVOLATILE(void *, mtx), shared ? + UMTX_OP_WAKE : UMTX_OP_WAKE_PRIVATE, nr_wakeup, 0, 0)); } void _thr_ucond_init(struct ucond *cv) { + bzero(cv, sizeof(struct ucond)); } int _thr_ucond_wait(struct ucond *cv, struct umutex *m, const struct timespec *timeout, int flags) { + struct pthread *curthread; + if (timeout && (timeout->tv_sec < 0 || (timeout->tv_sec == 0 && timeout->tv_nsec <= 0))) { - struct pthread *curthread = _get_curthread(); + curthread = _get_curthread(); _thr_umutex_unlock(m, TID(curthread)); return (ETIMEDOUT); } - return _umtx_op_err(cv, UMTX_OP_CV_WAIT, flags, - m, __DECONST(void*, timeout)); + return (_umtx_op_err(cv, UMTX_OP_CV_WAIT, flags, m, + __DECONST(void*, timeout))); } int _thr_ucond_signal(struct ucond *cv) { + if (!cv->c_has_waiters) return (0); - return _umtx_op_err(cv, UMTX_OP_CV_SIGNAL, 0, NULL, NULL); + return (_umtx_op_err(cv, UMTX_OP_CV_SIGNAL, 0, NULL, NULL)); } int _thr_ucond_broadcast(struct ucond *cv) { + if (!cv->c_has_waiters) return (0); - return _umtx_op_err(cv, UMTX_OP_CV_BROADCAST, 0, NULL, NULL); + return (_umtx_op_err(cv, UMTX_OP_CV_BROADCAST, 0, NULL, NULL)); } int __thr_rwlock_rdlock(struct urwlock *rwlock, int flags, const struct timespec *tsp) { struct _umtx_time timeout, *tm_p; size_t tm_size; if (tsp == NULL) { tm_p = NULL; tm_size = 0; } else { timeout._timeout = *tsp; timeout._flags = UMTX_ABSTIME; timeout._clockid = CLOCK_REALTIME; tm_p = &timeout; tm_size = sizeof(timeout); } - return _umtx_op_err(rwlock, UMTX_OP_RW_RDLOCK, flags, (void *)tm_size, tm_p); + return (_umtx_op_err(rwlock, UMTX_OP_RW_RDLOCK, flags, + (void *)tm_size, tm_p)); } int __thr_rwlock_wrlock(struct urwlock *rwlock, const struct timespec *tsp) { struct _umtx_time timeout, *tm_p; size_t tm_size; if (tsp == NULL) { tm_p = NULL; tm_size = 0; } else { timeout._timeout = *tsp; timeout._flags = UMTX_ABSTIME; timeout._clockid = CLOCK_REALTIME; tm_p = &timeout; tm_size = sizeof(timeout); } - return _umtx_op_err(rwlock, UMTX_OP_RW_WRLOCK, 0, (void *)tm_size, tm_p); + return (_umtx_op_err(rwlock, UMTX_OP_RW_WRLOCK, 0, (void *)tm_size, + tm_p)); } int __thr_rwlock_unlock(struct urwlock *rwlock) { - return _umtx_op_err(rwlock, UMTX_OP_RW_UNLOCK, 0, NULL, NULL); + + return (_umtx_op_err(rwlock, UMTX_OP_RW_UNLOCK, 0, NULL, NULL)); } void _thr_rwl_rdlock(struct urwlock *rwlock) { int ret; for (;;) { if (_thr_rwlock_tryrdlock(rwlock, URWLOCK_PREFER_READER) == 0) return; ret = __thr_rwlock_rdlock(rwlock, URWLOCK_PREFER_READER, NULL); if (ret == 0) return; if (ret != EINTR) PANIC("rdlock error"); } } void _thr_rwl_wrlock(struct urwlock *rwlock) { int ret; for (;;) { if (_thr_rwlock_trywrlock(rwlock) == 0) return; ret = __thr_rwlock_wrlock(rwlock, NULL); if (ret == 0) return; if (ret != EINTR) PANIC("wrlock error"); } } void _thr_rwl_unlock(struct urwlock *rwlock) { + if (_thr_rwlock_unlock(rwlock)) PANIC("unlock error"); } Index: head/lib/libthr/thread/thr_umtx.h =================================================================== --- head/lib/libthr/thread/thr_umtx.h (revision 300042) +++ head/lib/libthr/thread/thr_umtx.h (revision 300043) @@ -1,234 +1,270 @@ /*- * Copyright (c) 2005 David Xu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _THR_FBSD_UMTX_H_ #define _THR_FBSD_UMTX_H_ #include #include -#define DEFAULT_UMUTEX {0,0,{0,0},{0,0,0,0}} +#ifdef __LP64__ +#define DEFAULT_UMUTEX {0,0,{0,0},0,{0,0}} +#else +#define DEFAULT_UMUTEX {0,0,{0,0},0,0,{0,0}} +#endif #define DEFAULT_URWLOCK {0,0,0,0,{0,0,0,0}} int _umtx_op_err(void *, int op, u_long, void *, void *) __hidden; int __thr_umutex_lock(struct umutex *mtx, uint32_t id) __hidden; int __thr_umutex_lock_spin(struct umutex *mtx, uint32_t id) __hidden; int __thr_umutex_timedlock(struct umutex *mtx, uint32_t id, const struct timespec *timeout) __hidden; int __thr_umutex_unlock(struct umutex *mtx, uint32_t id) __hidden; int __thr_umutex_trylock(struct umutex *mtx) __hidden; int __thr_umutex_set_ceiling(struct umutex *mtx, uint32_t ceiling, uint32_t *oldceiling) __hidden; void _thr_umutex_init(struct umutex *mtx) __hidden; void _thr_urwlock_init(struct urwlock *rwl) __hidden; int _thr_umtx_wait(volatile long *mtx, long exp, const struct timespec *timeout) __hidden; int _thr_umtx_wait_uint(volatile u_int *mtx, u_int exp, const struct timespec *timeout, int shared) __hidden; int _thr_umtx_timedwait_uint(volatile u_int *mtx, u_int exp, int clockid, const struct timespec *timeout, int shared) __hidden; int _thr_umtx_wake(volatile void *mtx, int count, int shared) __hidden; int _thr_ucond_wait(struct ucond *cv, struct umutex *m, const struct timespec *timeout, int flags) __hidden; void _thr_ucond_init(struct ucond *cv) __hidden; int _thr_ucond_signal(struct ucond *cv) __hidden; int _thr_ucond_broadcast(struct ucond *cv) __hidden; int __thr_rwlock_rdlock(struct urwlock *rwlock, int flags, const struct timespec *tsp) __hidden; int __thr_rwlock_wrlock(struct urwlock *rwlock, const struct timespec *tsp) __hidden; int __thr_rwlock_unlock(struct urwlock *rwlock) __hidden; /* Internal used only */ void _thr_rwl_rdlock(struct urwlock *rwlock) __hidden; void _thr_rwl_wrlock(struct urwlock *rwlock) __hidden; void _thr_rwl_unlock(struct urwlock *rwlock) __hidden; static inline int _thr_umutex_trylock(struct umutex *mtx, uint32_t id) { - if (atomic_cmpset_acq_32(&mtx->m_owner, UMUTEX_UNOWNED, id)) - return (0); - if ((mtx->m_flags & UMUTEX_PRIO_PROTECT) == 0) - return (EBUSY); - return (__thr_umutex_trylock(mtx)); + + if (atomic_cmpset_acq_32(&mtx->m_owner, UMUTEX_UNOWNED, id)) + return (0); + if (__predict_false((uint32_t)mtx->m_owner == UMUTEX_RB_OWNERDEAD) && + atomic_cmpset_acq_32(&mtx->m_owner, UMUTEX_RB_OWNERDEAD, + id | UMUTEX_CONTESTED)) + return (EOWNERDEAD); + if (__predict_false((uint32_t)mtx->m_owner == UMUTEX_RB_NOTRECOV)) + return (ENOTRECOVERABLE); + if ((mtx->m_flags & UMUTEX_PRIO_PROTECT) == 0) + return (EBUSY); + return (__thr_umutex_trylock(mtx)); } static inline int _thr_umutex_trylock2(struct umutex *mtx, uint32_t id) { - if (atomic_cmpset_acq_32(&mtx->m_owner, UMUTEX_UNOWNED, id) != 0) - return (0); - if ((uint32_t)mtx->m_owner == UMUTEX_CONTESTED && - __predict_true((mtx->m_flags & (UMUTEX_PRIO_PROTECT | UMUTEX_PRIO_INHERIT)) == 0)) - if (atomic_cmpset_acq_32(&mtx->m_owner, UMUTEX_CONTESTED, id | UMUTEX_CONTESTED)) + + if (atomic_cmpset_acq_32(&mtx->m_owner, UMUTEX_UNOWNED, id) != 0) return (0); - return (EBUSY); + if ((uint32_t)mtx->m_owner == UMUTEX_CONTESTED && + __predict_true((mtx->m_flags & (UMUTEX_PRIO_PROTECT | + UMUTEX_PRIO_INHERIT)) == 0) && + atomic_cmpset_acq_32(&mtx->m_owner, UMUTEX_CONTESTED, + id | UMUTEX_CONTESTED)) + return (0); + if (__predict_false((uint32_t)mtx->m_owner == UMUTEX_RB_OWNERDEAD) && + atomic_cmpset_acq_32(&mtx->m_owner, UMUTEX_RB_OWNERDEAD, + id | UMUTEX_CONTESTED)) + return (EOWNERDEAD); + if (__predict_false((uint32_t)mtx->m_owner == UMUTEX_RB_NOTRECOV)) + return (ENOTRECOVERABLE); + return (EBUSY); } static inline int _thr_umutex_lock(struct umutex *mtx, uint32_t id) { - if (_thr_umutex_trylock2(mtx, id) == 0) - return (0); - return (__thr_umutex_lock(mtx, id)); + + if (_thr_umutex_trylock2(mtx, id) == 0) + return (0); + return (__thr_umutex_lock(mtx, id)); } static inline int _thr_umutex_lock_spin(struct umutex *mtx, uint32_t id) { - if (_thr_umutex_trylock2(mtx, id) == 0) - return (0); - return (__thr_umutex_lock_spin(mtx, id)); + + if (_thr_umutex_trylock2(mtx, id) == 0) + return (0); + return (__thr_umutex_lock_spin(mtx, id)); } static inline int _thr_umutex_timedlock(struct umutex *mtx, uint32_t id, - const struct timespec *timeout) + const struct timespec *timeout) { - if (_thr_umutex_trylock2(mtx, id) == 0) - return (0); - return (__thr_umutex_timedlock(mtx, id, timeout)); + + if (_thr_umutex_trylock2(mtx, id) == 0) + return (0); + return (__thr_umutex_timedlock(mtx, id, timeout)); } static inline int _thr_umutex_unlock2(struct umutex *mtx, uint32_t id, int *defer) { - uint32_t flags = mtx->m_flags; + uint32_t flags, owner; + bool noncst; - if ((flags & (UMUTEX_PRIO_PROTECT | UMUTEX_PRIO_INHERIT)) == 0) { - uint32_t owner; - do { - owner = mtx->m_owner; - if (__predict_false((owner & ~UMUTEX_CONTESTED) != id)) - return (EPERM); - } while (__predict_false(!atomic_cmpset_rel_32(&mtx->m_owner, - owner, UMUTEX_UNOWNED))); - if ((owner & UMUTEX_CONTESTED)) { - if (defer == NULL) - (void)_umtx_op_err(mtx, UMTX_OP_MUTEX_WAKE2, flags, 0, 0); - else - *defer = 1; - } - return (0); + flags = mtx->m_flags; + noncst = (flags & UMUTEX_NONCONSISTENT) != 0; + + if ((flags & (UMUTEX_PRIO_PROTECT | UMUTEX_PRIO_INHERIT)) != 0) { + if (atomic_cmpset_rel_32(&mtx->m_owner, id, noncst ? + UMUTEX_RB_NOTRECOV : UMUTEX_UNOWNED)) + return (0); + return (__thr_umutex_unlock(mtx, id)); } - if (atomic_cmpset_rel_32(&mtx->m_owner, id, UMUTEX_UNOWNED)) - return (0); - return (__thr_umutex_unlock(mtx, id)); + + do { + owner = mtx->m_owner; + if (__predict_false((owner & ~UMUTEX_CONTESTED) != id)) + return (EPERM); + } while (__predict_false(!atomic_cmpset_rel_32(&mtx->m_owner, owner, + noncst ? UMUTEX_RB_NOTRECOV : UMUTEX_UNOWNED))); + if ((owner & UMUTEX_CONTESTED) != 0) { + if (defer == NULL || noncst) + (void)_umtx_op_err(mtx, UMTX_OP_MUTEX_WAKE2, + flags, 0, 0); + else + *defer = 1; + } + return (0); } static inline int _thr_umutex_unlock(struct umutex *mtx, uint32_t id) { - return _thr_umutex_unlock2(mtx, id, NULL); + + return (_thr_umutex_unlock2(mtx, id, NULL)); } static inline int _thr_rwlock_tryrdlock(struct urwlock *rwlock, int flags) { - int32_t state; - int32_t wrflags; + int32_t state, wrflags; - if (flags & URWLOCK_PREFER_READER || rwlock->rw_flags & URWLOCK_PREFER_READER) + if ((flags & URWLOCK_PREFER_READER) != 0 || + (rwlock->rw_flags & URWLOCK_PREFER_READER) != 0) wrflags = URWLOCK_WRITE_OWNER; else wrflags = URWLOCK_WRITE_OWNER | URWLOCK_WRITE_WAITERS; state = rwlock->rw_state; while (!(state & wrflags)) { - if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) + if (__predict_false(URWLOCK_READER_COUNT(state) == + URWLOCK_MAX_READERS)) return (EAGAIN); if (atomic_cmpset_acq_32(&rwlock->rw_state, state, state + 1)) return (0); state = rwlock->rw_state; } return (EBUSY); } static inline int _thr_rwlock_trywrlock(struct urwlock *rwlock) { int32_t state; state = rwlock->rw_state; - while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) { - if (atomic_cmpset_acq_32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER)) + while ((state & URWLOCK_WRITE_OWNER) == 0 && + URWLOCK_READER_COUNT(state) == 0) { + if (atomic_cmpset_acq_32(&rwlock->rw_state, state, + state | URWLOCK_WRITE_OWNER)) return (0); state = rwlock->rw_state; } return (EBUSY); } static inline int _thr_rwlock_rdlock(struct urwlock *rwlock, int flags, struct timespec *tsp) { + if (_thr_rwlock_tryrdlock(rwlock, flags) == 0) return (0); return (__thr_rwlock_rdlock(rwlock, flags, tsp)); } static inline int _thr_rwlock_wrlock(struct urwlock *rwlock, struct timespec *tsp) { + if (_thr_rwlock_trywrlock(rwlock) == 0) return (0); return (__thr_rwlock_wrlock(rwlock, tsp)); } static inline int _thr_rwlock_unlock(struct urwlock *rwlock) { int32_t state; state = rwlock->rw_state; - if (state & URWLOCK_WRITE_OWNER) { - if (atomic_cmpset_rel_32(&rwlock->rw_state, URWLOCK_WRITE_OWNER, 0)) + if ((state & URWLOCK_WRITE_OWNER) != 0) { + if (atomic_cmpset_rel_32(&rwlock->rw_state, + URWLOCK_WRITE_OWNER, 0)) return (0); } else { for (;;) { if (__predict_false(URWLOCK_READER_COUNT(state) == 0)) return (EPERM); if (!((state & (URWLOCK_WRITE_WAITERS | - URWLOCK_READ_WAITERS)) && + URWLOCK_READ_WAITERS)) != 0 && URWLOCK_READER_COUNT(state) == 1)) { if (atomic_cmpset_rel_32(&rwlock->rw_state, - state, state-1)) + state, state - 1)) return (0); state = rwlock->rw_state; } else { break; } } } return (__thr_rwlock_unlock(rwlock)); } #endif Index: head/share/man/man3/Makefile =================================================================== --- head/share/man/man3/Makefile (revision 300042) +++ head/share/man/man3/Makefile (revision 300043) @@ -1,334 +1,337 @@ # @(#)Makefile 8.2 (Berkeley) 12/13/93 # $FreeBSD$ .include PACKAGE=runtime-manuals MAN= assert.3 \ ATOMIC_VAR_INIT.3 \ bitstring.3 \ end.3 \ fpgetround.3 \ intro.3 \ makedev.3 \ offsetof.3 \ ${PTHREAD_MAN} \ queue.3 \ siginfo.3 \ stdarg.3 \ sysexits.3 \ tgmath.3 \ timeradd.3 \ tree.3 MLINKS= ATOMIC_VAR_INIT.3 atomic_compare_exchange_strong.3 \ ATOMIC_VAR_INIT.3 atomic_compare_exchange_strong_explicit.3 \ ATOMIC_VAR_INIT.3 atomic_compare_exchange_weak.3 \ ATOMIC_VAR_INIT.3 atomic_compare_exchange_weak_explicit.3 \ ATOMIC_VAR_INIT.3 atomic_exchange.3 \ ATOMIC_VAR_INIT.3 atomic_exchange_explicit.3 \ ATOMIC_VAR_INIT.3 atomic_fetch_add.3 \ ATOMIC_VAR_INIT.3 atomic_fetch_add_explicit.3 \ ATOMIC_VAR_INIT.3 atomic_fetch_and.3 \ ATOMIC_VAR_INIT.3 atomic_fetch_and_explicit.3 \ ATOMIC_VAR_INIT.3 atomic_fetch_or.3 \ ATOMIC_VAR_INIT.3 atomic_fetch_or_explicit.3 \ ATOMIC_VAR_INIT.3 atomic_fetch_sub.3 \ ATOMIC_VAR_INIT.3 atomic_fetch_sub_explicit.3 \ ATOMIC_VAR_INIT.3 atomic_fetch_xor.3 \ ATOMIC_VAR_INIT.3 atomic_fetch_xor_explicit.3 \ ATOMIC_VAR_INIT.3 atomic_init.3 \ ATOMIC_VAR_INIT.3 atomic_is_lock_free.3 \ ATOMIC_VAR_INIT.3 atomic_load.3 \ ATOMIC_VAR_INIT.3 atomic_load_explicit.3 \ ATOMIC_VAR_INIT.3 atomic_store.3 \ ATOMIC_VAR_INIT.3 atomic_store_explicit.3 MLINKS+= bitstring.3 bit_alloc.3 \ bitstring.3 bit_clear.3 \ bitstring.3 bit_decl.3 \ bitstring.3 bit_ffc.3 \ bitstring.3 bit_ffs.3 \ bitstring.3 bit_nclear.3 \ bitstring.3 bit_nset.3 \ bitstring.3 bit_set.3 \ bitstring.3 bitstr_size.3 \ bitstring.3 bit_test.3 MLINKS+= end.3 edata.3 \ end.3 etext.3 MLINKS+= fpgetround.3 fpgetmask.3 \ fpgetround.3 fpgetprec.3 \ fpgetround.3 fpgetsticky.3 \ fpgetround.3 fpresetsticky.3 \ fpgetround.3 fpsetmask.3 \ fpgetround.3 fpsetprec.3 \ fpgetround.3 fpsetround.3 MLINKS+= makedev.3 major.3 \ makedev.3 minor.3 MLINKS+= ${PTHREAD_MLINKS} MLINKS+= queue.3 LIST_CLASS_ENTRY.3 \ queue.3 LIST_CLASS_HEAD.3 \ queue.3 LIST_EMPTY.3 \ queue.3 LIST_ENTRY.3 \ queue.3 LIST_FIRST.3 \ queue.3 LIST_FOREACH.3 \ queue.3 LIST_FOREACH_FROM.3 \ queue.3 LIST_FOREACH_FROM_SAFE.3 \ queue.3 LIST_FOREACH_SAFE.3 \ queue.3 LIST_HEAD.3 \ queue.3 LIST_HEAD_INITIALIZER.3 \ queue.3 LIST_INIT.3 \ queue.3 LIST_INSERT_AFTER.3 \ queue.3 LIST_INSERT_BEFORE.3 \ queue.3 LIST_INSERT_HEAD.3 \ queue.3 LIST_NEXT.3 \ queue.3 LIST_PREV.3 \ queue.3 LIST_REMOVE.3 \ queue.3 LIST_SWAP.3 \ queue.3 SLIST_CLASS_ENTRY.3 \ queue.3 SLIST_CLASS_HEAD.3 \ queue.3 SLIST_EMPTY.3 \ queue.3 SLIST_ENTRY.3 \ queue.3 SLIST_FIRST.3 \ queue.3 SLIST_FOREACH.3 \ queue.3 SLIST_FOREACH_FROM.3 \ queue.3 SLIST_FOREACH_FROM_SAFE.3 \ queue.3 SLIST_FOREACH_SAFE.3 \ queue.3 SLIST_HEAD.3 \ queue.3 SLIST_HEAD_INITIALIZER.3 \ queue.3 SLIST_INIT.3 \ queue.3 SLIST_INSERT_AFTER.3 \ queue.3 SLIST_INSERT_HEAD.3 \ queue.3 SLIST_NEXT.3 \ queue.3 SLIST_REMOVE.3 \ queue.3 SLIST_REMOVE_AFTER.3 \ queue.3 SLIST_REMOVE_HEAD.3 \ queue.3 SLIST_SWAP.3 \ queue.3 STAILQ_CLASS_ENTRY.3 \ queue.3 STAILQ_CLASS_HEAD.3 \ queue.3 STAILQ_CONCAT.3 \ queue.3 STAILQ_EMPTY.3 \ queue.3 STAILQ_ENTRY.3 \ queue.3 STAILQ_FIRST.3 \ queue.3 STAILQ_FOREACH.3 \ queue.3 STAILQ_FOREACH_FROM.3 \ queue.3 STAILQ_FOREACH_FROM_SAFE.3 \ queue.3 STAILQ_FOREACH_SAFE.3 \ queue.3 STAILQ_HEAD.3 \ queue.3 STAILQ_HEAD_INITIALIZER.3 \ queue.3 STAILQ_INIT.3 \ queue.3 STAILQ_INSERT_AFTER.3 \ queue.3 STAILQ_INSERT_HEAD.3 \ queue.3 STAILQ_INSERT_TAIL.3 \ queue.3 STAILQ_LAST.3 \ queue.3 STAILQ_NEXT.3 \ queue.3 STAILQ_REMOVE.3 \ queue.3 STAILQ_REMOVE_AFTER.3 \ queue.3 STAILQ_REMOVE_HEAD.3 \ queue.3 STAILQ_SWAP.3 \ queue.3 TAILQ_CLASS_ENTRY.3 \ queue.3 TAILQ_CLASS_HEAD.3 \ queue.3 TAILQ_CONCAT.3 \ queue.3 TAILQ_EMPTY.3 \ queue.3 TAILQ_ENTRY.3 \ queue.3 TAILQ_FIRST.3 \ queue.3 TAILQ_FOREACH.3 \ queue.3 TAILQ_FOREACH_FROM.3 \ queue.3 TAILQ_FOREACH_FROM_SAFE.3 \ queue.3 TAILQ_FOREACH_REVERSE.3 \ queue.3 TAILQ_FOREACH_REVERSE_FROM.3 \ queue.3 TAILQ_FOREACH_REVERSE_FROM_SAFE.3 \ queue.3 TAILQ_FOREACH_REVERSE_SAFE.3 \ queue.3 TAILQ_FOREACH_SAFE.3 \ queue.3 TAILQ_HEAD.3 \ queue.3 TAILQ_HEAD_INITIALIZER.3 \ queue.3 TAILQ_INIT.3 \ queue.3 TAILQ_INSERT_AFTER.3 \ queue.3 TAILQ_INSERT_BEFORE.3 \ queue.3 TAILQ_INSERT_HEAD.3 \ queue.3 TAILQ_INSERT_TAIL.3 \ queue.3 TAILQ_LAST.3 \ queue.3 TAILQ_NEXT.3 \ queue.3 TAILQ_PREV.3 \ queue.3 TAILQ_REMOVE.3 \ queue.3 TAILQ_SWAP.3 MLINKS+= stdarg.3 va_arg.3 \ stdarg.3 va_copy.3 \ stdarg.3 va_end.3 \ stdarg.3 varargs.3 \ stdarg.3 va_start.3 MLINKS+= timeradd.3 timerclear.3 \ timeradd.3 timercmp.3 \ timeradd.3 timerisset.3 \ timeradd.3 timersub.3 MLINKS+= tree.3 RB_EMPTY.3 \ tree.3 RB_ENTRY.3 \ tree.3 RB_FIND.3 \ tree.3 RB_FOREACH.3 \ tree.3 RB_FOREACH_REVERSE.3 \ tree.3 RB_GENERATE.3 \ tree.3 RB_GENERATE_STATIC.3 \ tree.3 RB_HEAD.3 \ tree.3 RB_INIT.3 \ tree.3 RB_INITIALIZER.3 \ tree.3 RB_INSERT.3 \ tree.3 RB_LEFT.3 \ tree.3 RB_MAX.3 \ tree.3 RB_MIN.3 \ tree.3 RB_NEXT.3 \ tree.3 RB_NFIND.3 \ tree.3 RB_PARENT.3 \ tree.3 RB_PREV.3 \ tree.3 RB_PROTOTYPE.3 \ tree.3 RB_PROTOTYPE_STATIC.3 \ tree.3 RB_REMOVE.3 \ tree.3 RB_RIGHT.3 \ tree.3 RB_ROOT.3 \ tree.3 SPLAY_EMPTY.3 \ tree.3 SPLAY_ENTRY.3 \ tree.3 SPLAY_FIND.3 \ tree.3 SPLAY_FOREACH.3 \ tree.3 SPLAY_GENERATE.3 \ tree.3 SPLAY_HEAD.3 \ tree.3 SPLAY_INIT.3 \ tree.3 SPLAY_INITIALIZER.3 \ tree.3 SPLAY_INSERT.3 \ tree.3 SPLAY_LEFT.3 \ tree.3 SPLAY_MAX.3 \ tree.3 SPLAY_MIN.3 \ tree.3 SPLAY_NEXT.3 \ tree.3 SPLAY_PROTOTYPE.3 \ tree.3 SPLAY_REMOVE.3 \ tree.3 SPLAY_RIGHT.3 \ tree.3 SPLAY_ROOT.3 .if ${MK_LIBTHR} != "no" PTHREAD_MAN= pthread.3 \ pthread_affinity_np.3 \ pthread_atfork.3 \ pthread_attr.3 \ pthread_attr_affinity_np.3 \ pthread_attr_get_np.3 \ pthread_attr_setcreatesuspend_np.3 \ pthread_barrierattr.3 \ pthread_barrier_destroy.3 \ pthread_cancel.3 \ pthread_cleanup_pop.3 \ pthread_cleanup_push.3 \ pthread_condattr.3 \ pthread_cond_broadcast.3 \ pthread_cond_destroy.3 \ pthread_cond_init.3 \ pthread_cond_signal.3 \ pthread_cond_timedwait.3 \ pthread_cond_wait.3 \ pthread_create.3 \ pthread_detach.3 \ pthread_equal.3 \ pthread_exit.3 \ pthread_getconcurrency.3 \ pthread_getcpuclockid.3 \ pthread_getspecific.3 \ pthread_getthreadid_np.3 \ pthread_join.3 \ pthread_key_create.3 \ pthread_key_delete.3 \ pthread_kill.3 \ pthread_main_np.3 \ pthread_multi_np.3 \ pthread_mutexattr.3 \ pthread_mutexattr_getkind_np.3 \ + pthread_mutex_consistent.3 \ pthread_mutex_destroy.3 \ pthread_mutex_init.3 \ pthread_mutex_lock.3 \ pthread_mutex_timedlock.3 \ pthread_mutex_trylock.3 \ pthread_mutex_unlock.3 \ pthread_once.3 \ pthread_resume_all_np.3 \ pthread_resume_np.3 \ pthread_rwlockattr_destroy.3 \ pthread_rwlockattr_getpshared.3 \ pthread_rwlockattr_init.3 \ pthread_rwlockattr_setpshared.3 \ pthread_rwlock_destroy.3 \ pthread_rwlock_init.3 \ pthread_rwlock_rdlock.3 \ pthread_rwlock_timedrdlock.3 \ pthread_rwlock_timedwrlock.3 \ pthread_rwlock_unlock.3 \ pthread_rwlock_wrlock.3 \ pthread_schedparam.3 \ pthread_self.3 \ pthread_set_name_np.3 \ pthread_setspecific.3 \ pthread_sigmask.3 \ pthread_spin_init.3 \ pthread_spin_lock.3 \ pthread_suspend_all_np.3 \ pthread_suspend_np.3 \ pthread_switch_add_np.3 \ pthread_testcancel.3 \ pthread_yield.3 PTHREAD_MLINKS= pthread_affinity_np.3 pthread_getaffinity_np.3 \ pthread_affinity_np.3 pthread_setaffinity_np.3 PTHREAD_MLINKS+=pthread_attr.3 pthread_attr_destroy.3 \ pthread_attr.3 pthread_attr_getdetachstate.3 \ pthread_attr.3 pthread_attr_getguardsize.3 \ pthread_attr.3 pthread_attr_getinheritsched.3 \ pthread_attr.3 pthread_attr_getschedparam.3 \ pthread_attr.3 pthread_attr_getschedpolicy.3 \ pthread_attr.3 pthread_attr_getscope.3 \ pthread_attr.3 pthread_attr_getstack.3 \ pthread_attr.3 pthread_attr_getstackaddr.3 \ pthread_attr.3 pthread_attr_getstacksize.3 \ pthread_attr.3 pthread_attr_init.3 \ pthread_attr.3 pthread_attr_setdetachstate.3 \ pthread_attr.3 pthread_attr_setguardsize.3 \ pthread_attr.3 pthread_attr_setinheritsched.3 \ pthread_attr.3 pthread_attr_setschedparam.3 \ pthread_attr.3 pthread_attr_setschedpolicy.3 \ pthread_attr.3 pthread_attr_setscope.3 \ pthread_attr.3 pthread_attr_setstack.3 \ pthread_attr.3 pthread_attr_setstackaddr.3 \ pthread_attr.3 pthread_attr_setstacksize.3 PTHREAD_MLINKS+=pthread_attr_affinity_np.3 pthread_attr_getaffinity_np.3 \ pthread_attr_affinity_np.3 pthread_attr_setaffinity_np.3 PTHREAD_MLINKS+=pthread_barrierattr.3 pthread_barrierattr_destroy.3 \ pthread_barrierattr.3 pthread_barrierattr_getpshared.3 \ pthread_barrierattr.3 pthread_barrierattr_init.3 \ pthread_barrierattr.3 pthread_barrierattr_setpshared.3 PTHREAD_MLINKS+=pthread_barrier_destroy.3 pthread_barrier_init.3 \ pthread_barrier_destroy.3 pthread_barrier_wait.3 PTHREAD_MLINKS+=pthread_condattr.3 pthread_condattr_destroy.3 \ pthread_condattr.3 pthread_condattr_init.3 \ pthread_condattr.3 pthread_condattr_getclock.3 \ pthread_condattr.3 pthread_condattr_setclock.3 \ pthread_condattr.3 pthread_condattr_getpshared.3 \ pthread_condattr.3 pthread_condattr_setpshared.3 PTHREAD_MLINKS+=pthread_getconcurrency.3 pthread_setconcurrency.3 PTHREAD_MLINKS+=pthread_multi_np.3 pthread_single_np.3 PTHREAD_MLINKS+=pthread_mutexattr.3 pthread_mutexattr_destroy.3 \ pthread_mutexattr.3 pthread_mutexattr_getprioceiling.3 \ pthread_mutexattr.3 pthread_mutexattr_getprotocol.3 \ + pthread_mutexattr.3 pthread_mutexattr_getrobust.3 \ pthread_mutexattr.3 pthread_mutexattr_gettype.3 \ pthread_mutexattr.3 pthread_mutexattr_init.3 \ pthread_mutexattr.3 pthread_mutexattr_setprioceiling.3 \ pthread_mutexattr.3 pthread_mutexattr_setprotocol.3 \ + pthread_mutexattr.3 pthread_mutexattr_setrobust.3 \ pthread_mutexattr.3 pthread_mutexattr_settype.3 PTHREAD_MLINKS+=pthread_mutexattr_getkind_np.3 pthread_mutexattr_setkind_np.3 PTHREAD_MLINKS+=pthread_rwlock_rdlock.3 pthread_rwlock_tryrdlock.3 PTHREAD_MLINKS+=pthread_rwlock_wrlock.3 pthread_rwlock_trywrlock.3 PTHREAD_MLINKS+=pthread_schedparam.3 pthread_getschedparam.3 \ pthread_schedparam.3 pthread_setschedparam.3 PTHREAD_MLINKS+=pthread_spin_init.3 pthread_spin_destroy.3 \ pthread_spin_lock.3 pthread_spin_trylock.3 \ pthread_spin_lock.3 pthread_spin_unlock.3 PTHREAD_MLINKS+=pthread_switch_add_np.3 pthread_switch_delete_np.3 PTHREAD_MLINKS+=pthread_testcancel.3 pthread_setcancelstate.3 \ pthread_testcancel.3 pthread_setcanceltype.3 PTHREAD_MLINKS+=pthread_join.3 pthread_timedjoin_np.3 .endif .include Index: head/share/man/man3/pthread_cond_wait.3 =================================================================== --- head/share/man/man3/pthread_cond_wait.3 (revision 300042) +++ head/share/man/man3/pthread_cond_wait.3 (revision 300043) @@ -1,89 +1,101 @@ .\" Copyright (c) 1997 Brian Cully .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" 3. Neither the name of the author nor the names of any co-contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. .\" .\" THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" -.Dd February 16, 2006 +.Dd April 29, 2016 .Dt PTHREAD_COND_WAIT 3 .Os .Sh NAME .Nm pthread_cond_wait .Nd wait on a condition variable .Sh LIBRARY .Lb libpthread .Sh SYNOPSIS .In pthread.h .Ft int .Fn pthread_cond_wait "pthread_cond_t *cond" "pthread_mutex_t *mutex" .Sh DESCRIPTION The .Fn pthread_cond_wait function atomically blocks the current thread waiting on the condition variable specified by .Fa cond , and releases the mutex specified by .Fa mutex . The waiting thread unblocks only after another thread calls .Xr pthread_cond_signal 3 , or .Xr pthread_cond_broadcast 3 with the same condition variable, and the current thread reacquires the lock on .Fa mutex . .Sh RETURN VALUES If successful, the .Fn pthread_cond_wait function will return zero. Otherwise an error number will be returned to indicate the error. .Sh ERRORS The .Fn pthread_cond_wait function will fail if: .Bl -tag -width Er .It Bq Er EINVAL The value specified by .Fa cond or the value specified by .Fa mutex is invalid. .It Bq Er EPERM The specified .Fa mutex was not locked by the calling thread. +.It Bq Er EOWNERDEAD +The argument +.Fa mutex +points to a robust mutex and the previous owning thread terminated +while holding the mutex lock. +The lock was granted to the caller and it is up to the new owner +to make the state consistent. +.It Bq Er ENOTRECOVERABLE +The state protected by the +.Fa mutex +is not recoverable. .El .Sh SEE ALSO .Xr pthread_cond_broadcast 3 , .Xr pthread_cond_destroy 3 , .Xr pthread_cond_init 3 , .Xr pthread_cond_signal 3 , -.Xr pthread_cond_timedwait 3 +.Xr pthread_cond_timedwait 3 , +.Xr pthread_mutex_consistent 3 .Sh STANDARDS The .Fn pthread_cond_wait function conforms to .St -p1003.1-96 . Index: head/share/man/man3/pthread_mutex_consistent.3 =================================================================== --- head/share/man/man3/pthread_mutex_consistent.3 (nonexistent) +++ head/share/man/man3/pthread_mutex_consistent.3 (revision 300043) @@ -0,0 +1,94 @@ +.\" Copyright (c) 2016 The FreeBSD Foundation, Inc. +.\" All rights reserved. +.\" +.\" This documentation was written by +.\" Konstantin Belousov under sponsorship +.\" from the FreeBSD Foundation. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd May 8, 2016 +.Dt PTHREAD_MUTEX_CONSISTENT 3 +.Os +.Sh NAME +.Nm pthread_mutex_consistent +.Nd mark state protected by robust mutex as consistent +.Sh LIBRARY +.Lb libpthread +.Sh SYNOPSIS +.In pthread.h +.Ft int +.Fn pthread_mutex_consistent "pthread_mutex_t *mutex" +.Sh DESCRIPTION +If the thread owning a robust mutex terminates while holding the +mutex, the mutex becomes inconsistent and the next thread that +acquires the mutex lock is notified of the state by the return value +.Er EOWNERDEAD . +In this case, the mutex does not become normally usable again until +the state is marked consistent. +.Pp +The +.Fn pthread_mutex_consistent , +when called with the +.Fa mutex +argument, which points to the initialized robust mutex in an +inconsistent state, marks the by mutex as consistent again. +The consequent unlock of the mutex, by either +.Fn pthread_mutex_unlock +or other methods, allows other contenders to lock the mutex. +.Pp +If the mutex in the inconsistent state is not marked consistent +by the call to +.Fn pthread_mutex_consistent +before unlock, +further attempts to lock the +.Fa mutex +result in the +.Er ENOTRECOVERABLE +condition reported by the locking functions. +.Sh RETURN VALUES +If successful, +.Fn pthread_mutex_consistent +will return zero, otherwise an error number will be returned to +indicate the error. +.Sh ERRORS +The +.Fn pthread_mutex_lock +function will fail if: +.Bl -tag -width Er +.It Bq Er EINVAL +The mutex pointed to by the +.Fa mutex +argument is not robust, or is not in the inconsistent state. +.El +.Sh SEE ALSO +.Xr pthread_mutexattr_setrobust 3 , +.Xr pthread_mutex_init 3 , +.Xr pthread_mutex_lock 3 , +.Xr pthread_mutex_unlock 3 +.Sh STANDARDS +The +.Fn pthread_mutex_lock +function conforms to +.St -susv4 . Property changes on: head/share/man/man3/pthread_mutex_consistent.3 ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: head/share/man/man3/pthread_mutex_lock.3 =================================================================== --- head/share/man/man3/pthread_mutex_lock.3 (revision 300042) +++ head/share/man/man3/pthread_mutex_lock.3 (revision 300043) @@ -1,76 +1,88 @@ .\" Copyright (c) 1997 Brian Cully .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" 3. Neither the name of the author nor the names of any co-contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. .\" .\" THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" -.Dd January 31, 2006 +.Dd April 29, 2016 .Dt PTHREAD_MUTEX_LOCK 3 .Os .Sh NAME .Nm pthread_mutex_lock .Nd lock a mutex .Sh LIBRARY .Lb libpthread .Sh SYNOPSIS .In pthread.h .Ft int .Fn pthread_mutex_lock "pthread_mutex_t *mutex" .Sh DESCRIPTION The .Fn pthread_mutex_lock function locks .Fa mutex . If the mutex is already locked, the calling thread will block until the mutex becomes available. .Sh RETURN VALUES If successful, .Fn pthread_mutex_lock will return zero, otherwise an error number will be returned to indicate the error. .Sh ERRORS The .Fn pthread_mutex_lock function will fail if: -.Bl -tag -width Er +.Bl -tag -width "Er ENOTRECOVERABLE" .It Bq Er EINVAL The value specified by .Fa mutex is invalid. .It Bq Er EDEADLK A deadlock would occur if the thread blocked waiting for .Fa mutex . +.It Bq Er EOWNERDEAD +The argument +.Fa mutex +points to a robust mutex and the previous owning thread terminated +while holding the mutex lock. +The lock was granted to the caller and it is up to the new owner +to make the state consistent. +.It Bq Er ENOTRECOVERABLE +The state protected by the +.Fa mutex +is not recoverable. .El .Sh SEE ALSO +.Xr pthread_mutex_consistent 3 , .Xr pthread_mutex_destroy 3 , .Xr pthread_mutex_init 3 , .Xr pthread_mutex_trylock 3 , .Xr pthread_mutex_unlock 3 .Sh STANDARDS The .Fn pthread_mutex_lock function conforms to .St -p1003.1-96 . Index: head/share/man/man3/pthread_mutex_timedlock.3 =================================================================== --- head/share/man/man3/pthread_mutex_timedlock.3 (revision 300042) +++ head/share/man/man3/pthread_mutex_timedlock.3 (revision 300043) @@ -1,103 +1,115 @@ .\" Copyright (c) 2003 Michael Telahun Makonnen .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" -.Dd December 30, 2003 +.Dd April 29, 2016 .Dt PTHREAD_MUTEX_TIMEDLOCK 3 .Os .Sh NAME .Nm pthread_mutex_timedlock .Nd lock a mutex without blocking indefinitely .Sh LIBRARY .Lb libpthread .Sh SYNOPSIS .In pthread.h .In time.h .Ft int .Fn pthread_mutex_timedlock "pthread_mutex_t *mutex" "const struct timespec *abs_timeout" .Sh DESCRIPTION The .Fn pthread_mutex_timedlock function will lock .Fa mutex . If it is already locked the calling thread will block until the mutex becomes available or the timeout, specified by abs_timeout, expires. The time of the timeout is an absolute time and is not relative to the current time. .Sh RETURN VALUES If successful, .Fn pthread_mutex_timedlock will return zero, otherwise an error number will be returned to indicate the error. .Sh ERRORS The .Fn pthread_mutex_timedlock function will fail if: .Bl -tag -width Er -.It Bq Er EINVAL +.It Bq "Er ENOTRECOVERABLE" The .Fa mutex was created with the protocol attribute having the value PTHREAD_PRIO_PROTECT and the calling thread's priority is higher than the mutex's current priority ceiling. .It Bq Er EINVAL The process or thread would have blocked, and .Fa abs_timeout specified a nanosecond value less than zero or greater than or equal to 1 billion. .It Bq Er EINVAL The .Fa mutex parameter is invalid. .It Bq Er ETIMEDOUT The .Fa mutex could not be locked before the timeout expired. .It Bq Er EAGAIN The .Fa mutex could not be acquired because the maximum number of recursive locks for the .Fa mutex has been exceeded. .It Bq Er EDEADLK The current thread already owns the .Fa mutex . +.It Bq Er EOWNERDEAD +The argument +.Fa mutex +points to a robust mutex and the previous owning thread terminated +while holding the mutex lock. +The lock was granted to the caller and it is up to the new owner +to make the state consistent. +.It Bq Er ENOTRECOVERABLE +The state protected by the +.Fa mutex +is not recoverable. .El .Sh SEE ALSO +.Xr pthread_mutex_consistent 3 , .Xr pthread_mutex_destroy 3 , .Xr pthread_mutex_init 3 , .Xr pthread_mutex_lock 3 , .Xr pthread_mutex_trylock 3 , .Xr pthread_mutex_unlock 3 .Sh STANDARDS The .Fn pthread_mutex_timedlock function is expected to conform to .St -p1003.1-96 . Index: head/share/man/man3/pthread_mutex_trylock.3 =================================================================== --- head/share/man/man3/pthread_mutex_trylock.3 (revision 300042) +++ head/share/man/man3/pthread_mutex_trylock.3 (revision 300043) @@ -1,77 +1,89 @@ .\" Copyright (c) 1997 Brian Cully .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" 3. Neither the name of the author nor the names of any co-contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. .\" .\" THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" -.Dd July 30, 1998 +.Dd April 29, 2016 .Dt PTHREAD_MUTEX_TRYLOCK 3 .Os .Sh NAME .Nm pthread_mutex_trylock .Nd attempt to lock a mutex without blocking .Sh LIBRARY .Lb libpthread .Sh SYNOPSIS .In pthread.h .Ft int .Fn pthread_mutex_trylock "pthread_mutex_t *mutex" .Sh DESCRIPTION The .Fn pthread_mutex_trylock function locks .Fa mutex . If the mutex is already locked, .Fn pthread_mutex_trylock will not block waiting for the mutex, but will return an error condition. .Sh RETURN VALUES If successful, .Fn pthread_mutex_trylock will return zero, otherwise an error number will be returned to indicate the error. .Sh ERRORS The .Fn pthread_mutex_trylock function will fail if: -.Bl -tag -width Er +.Bl -tag -width "Er ENOTRECOVERABLE" .It Bq Er EINVAL The value specified by .Fa mutex is invalid. .It Bq Er EBUSY .Fa Mutex is already locked. +.It Bq Er EOWNERDEAD +The argument +.Fa mutex +points to a robust mutex and the previous owning thread terminated +while holding the mutex lock. +The lock was granted to the caller and it is up to the new owner +to make the state consistent. +.It Bq Er ENOTRECOVERABLE +The state protected by the +.Fa mutex +is not recoverable. .El .Sh SEE ALSO +.Xr pthread_mutex_consistent 3 , .Xr pthread_mutex_destroy 3 , .Xr pthread_mutex_init 3 , .Xr pthread_mutex_lock 3 , .Xr pthread_mutex_unlock 3 .Sh STANDARDS The .Fn pthread_mutex_trylock function conforms to .St -p1003.1-96 . Index: head/share/man/man3/pthread_mutex_unlock.3 =================================================================== --- head/share/man/man3/pthread_mutex_unlock.3 (revision 300042) +++ head/share/man/man3/pthread_mutex_unlock.3 (revision 300043) @@ -1,76 +1,87 @@ .\" Copyright (c) 1997 Brian Cully .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" 3. Neither the name of the author nor the names of any co-contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. .\" .\" THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" -.Dd July 30, 1998 +.Dd April 29, 2016 .Dt PTHREAD_MUTEX_UNLOCK 3 .Os .Sh NAME .Nm pthread_mutex_unlock .Nd unlock a mutex .Sh LIBRARY .Lb libpthread .Sh SYNOPSIS .In pthread.h .Ft int .Fn pthread_mutex_unlock "pthread_mutex_t *mutex" .Sh DESCRIPTION If the current thread holds the lock on .Fa mutex , then the .Fn pthread_mutex_unlock function unlocks .Fa mutex . +.Pp +If the argument pointed by the +.Fa mutex +is a robust mutex in the inconsistent state, and the call to +.Fn pthread_mutex_consistent +function was not done prior to unlocking, further locking attempts on +the mutex +.Fa mutex +are denied and locking functions return +.Er ENOTRECOVERABLE +error. .Sh RETURN VALUES If successful, .Fn pthread_mutex_unlock will return zero, otherwise an error number will be returned to indicate the error. .Sh ERRORS The .Fn pthread_mutex_unlock function will fail if: .Bl -tag -width Er .It Bq Er EINVAL The value specified by .Fa mutex is invalid. .It Bq Er EPERM The current thread does not hold a lock on .Fa mutex . .El .Sh SEE ALSO .Xr pthread_mutex_destroy 3 , .Xr pthread_mutex_init 3 , .Xr pthread_mutex_lock 3 , .Xr pthread_mutex_trylock 3 .Sh STANDARDS The .Fn pthread_mutex_unlock function conforms to .St -p1003.1-96 . Index: head/share/man/man3/pthread_mutexattr.3 =================================================================== --- head/share/man/man3/pthread_mutexattr.3 (revision 300042) +++ head/share/man/man3/pthread_mutexattr.3 (revision 300043) @@ -1,187 +1,219 @@ .\" Copyright (C) 2000 Jason Evans . .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice(s), this list of conditions and the following disclaimer as .\" the first lines of this file unmodified other than the possible .\" addition of one or more copyright notices. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice(s), this list of conditions and the following disclaimer in .\" the documentation and/or other materials provided with the .\" distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY .\" EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR .\" PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE .\" LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR .\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF .\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR .\" BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, .\" WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE .\" OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, .\" EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .\" .\" $FreeBSD$ -.Dd May 1, 2000 +.Dd April 29, 2016 .Dt PTHREAD_MUTEXATTR 3 .Os .Sh NAME .Nm pthread_mutexattr_init , .Nm pthread_mutexattr_destroy , .Nm pthread_mutexattr_setprioceiling , .Nm pthread_mutexattr_getprioceiling , .Nm pthread_mutexattr_setprotocol , .Nm pthread_mutexattr_getprotocol , +.Nm pthread_mutexattr_setrobust , +.Nm pthread_mutexattr_getrobust , .Nm pthread_mutexattr_settype , .Nm pthread_mutexattr_gettype .Nd mutex attribute operations .Sh LIBRARY .Lb libpthread .Sh SYNOPSIS .In pthread.h .Ft int .Fn pthread_mutexattr_init "pthread_mutexattr_t *attr" .Ft int .Fn pthread_mutexattr_destroy "pthread_mutexattr_t *attr" .Ft int .Fn pthread_mutexattr_setprioceiling "pthread_mutexattr_t *attr" "int prioceiling" .Ft int .Fn pthread_mutexattr_getprioceiling "pthread_mutexattr_t *attr" "int *prioceiling" .Ft int .Fn pthread_mutexattr_setprotocol "pthread_mutexattr_t *attr" "int protocol" .Ft int .Fn pthread_mutexattr_getprotocol "pthread_mutexattr_t *attr" "int *protocol" .Ft int +.Fn pthread_mutexattr_setrobust "pthread_mutexattr_t *attr" "int robust" +.Ft int +.Fn pthread_mutexattr_getrobust "pthread_mutexattr_t *attr" "int *robust" +.Ft int .Fn pthread_mutexattr_settype "pthread_mutexattr_t *attr" "int type" .Ft int .Fn pthread_mutexattr_gettype "pthread_mutexattr_t *attr" "int *type" .Sh DESCRIPTION Mutex attributes are used to specify parameters to .Fn pthread_mutex_init . One attribute object can be used in multiple calls to .Fn pthread_mutex_init , with or without modifications between calls. .Pp The .Fn pthread_mutexattr_init function initializes .Fa attr with all the default mutex attributes. .Pp The .Fn pthread_mutexattr_destroy function destroys .Fa attr . .Pp The .Fn pthread_mutexattr_set* functions set the attribute that corresponds to each function name. .Pp The .Fn pthread_mutexattr_get* functions copy the value of the attribute that corresponds to each function name to the location pointed to by the second function parameter. .Sh RETURN VALUES If successful, these functions return 0. Otherwise, an error number is returned to indicate the error. .Sh ERRORS The .Fn pthread_mutexattr_init function will fail if: .Bl -tag -width Er .It Bq Er ENOMEM Out of memory. .El .Pp The .Fn pthread_mutexattr_destroy function will fail if: .Bl -tag -width Er .It Bq Er EINVAL Invalid value for .Fa attr . .El .Pp The .Fn pthread_mutexattr_setprioceiling function will fail if: .Bl -tag -width Er .It Bq Er EINVAL Invalid value for .Fa attr , or invalid value for .Fa prioceiling . .El .Pp The .Fn pthread_mutexattr_getprioceiling function will fail if: .Bl -tag -width Er .It Bq Er EINVAL Invalid value for .Fa attr . .El .Pp The .Fn pthread_mutexattr_setprotocol function will fail if: .Bl -tag -width Er .It Bq Er EINVAL Invalid value for .Fa attr , or invalid value for .Fa protocol . .El .Pp The .Fn pthread_mutexattr_getprotocol function will fail if: .Bl -tag -width Er .It Bq Er EINVAL Invalid value for .Fa attr . .El .Pp The .Fn pthread_mutexattr_settype function will fail if: .Bl -tag -width Er .It Bq Er EINVAL Invalid value for .Fa attr , or invalid value for .Fa type . .El .Pp The .Fn pthread_mutexattr_gettype function will fail if: .Bl -tag -width Er .It Bq Er EINVAL Invalid value for .Fa attr . .El +.Pp +The +.Fn pthread_mutexattr_setrobust +function will fail if: +.Bl -tag -width Er +.It Bq Er EINVAL +Invalid value for +.Fa attr , +or invalid value for +.Fa robust . +.El +.Pp +The +.Fn pthread_mutexattr_getrobust +function will fail if: +.Bl -tag -width Er +.It Bq Er EINVAL +Invalid value for +.Fa attr . +.El .Sh SEE ALSO .Xr pthread_mutex_init 3 .Sh STANDARDS The .Fn pthread_mutexattr_init and .Fn pthread_mutexattr_destroy functions conform to .St -p1003.1-96 .Pp The .Fn pthread_mutexattr_setprioceiling , .Fn pthread_mutexattr_getprioceiling , .Fn pthread_mutexattr_setprotocol , .Fn pthread_mutexattr_getprotocol , .Fn pthread_mutexattr_settype , and .Fn pthread_mutexattr_gettype functions conform to -.St -susv2 +.St -susv2 . +The +.Fn pthread_mutexattr_setrobust +and +.Fn pthread_mutexattr_getrobust +functions conform to +.St -susv4 . Index: head/sys/compat/cloudabi/cloudabi_thread.c =================================================================== --- head/sys/compat/cloudabi/cloudabi_thread.c (revision 300042) +++ head/sys/compat/cloudabi/cloudabi_thread.c (revision 300043) @@ -1,74 +1,77 @@ /*- * Copyright (c) 2015 Nuxi, https://nuxi.nl/ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include +#include #include #include int cloudabi_sys_thread_exit(struct thread *td, struct cloudabi_sys_thread_exit_args *uap) { struct cloudabi_sys_lock_unlock_args cloudabi_sys_lock_unlock_args = { .lock = uap->lock, .scope = uap->scope, }; + + umtx_thread_exit(td); /* Wake up joining thread. */ cloudabi_sys_lock_unlock(td, &cloudabi_sys_lock_unlock_args); /* * Attempt to terminate the thread. Terminate the process if * it's the last thread. */ kern_thr_exit(td); exit1(td, 0, 0); /* NOTREACHED */ } int cloudabi_sys_thread_tcb_set(struct thread *td, struct cloudabi_sys_thread_tcb_set_args *uap) { return (cpu_set_user_tls(td, uap->tcb)); } int cloudabi_sys_thread_yield(struct thread *td, struct cloudabi_sys_thread_yield_args *uap) { sched_relinquish(td); return (0); } Index: head/sys/compat/linux/linux_fork.c =================================================================== --- head/sys/compat/linux/linux_fork.c (revision 300042) +++ head/sys/compat/linux/linux_fork.c (revision 300043) @@ -1,482 +1,485 @@ /*- * Copyright (c) 2004 Tim J. Robbins * Copyright (c) 2002 Doug Rabson * Copyright (c) 2000 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include #include #include int linux_fork(struct thread *td, struct linux_fork_args *args) { struct fork_req fr; int error; struct proc *p2; struct thread *td2; #ifdef DEBUG if (ldebug(fork)) printf(ARGS(fork, "")); #endif bzero(&fr, sizeof(fr)); fr.fr_flags = RFFDG | RFPROC | RFSTOPPED; fr.fr_procp = &p2; if ((error = fork1(td, &fr)) != 0) return (error); td2 = FIRST_THREAD_IN_PROC(p2); linux_proc_init(td, td2, 0); td->td_retval[0] = p2->p_pid; /* * Make this runnable after we are finished with it. */ thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); thread_unlock(td2); return (0); } int linux_vfork(struct thread *td, struct linux_vfork_args *args) { struct fork_req fr; int error; struct proc *p2; struct thread *td2; #ifdef DEBUG if (ldebug(vfork)) printf(ARGS(vfork, "")); #endif bzero(&fr, sizeof(fr)); fr.fr_flags = RFFDG | RFPROC | RFMEM | RFPPWAIT | RFSTOPPED; fr.fr_procp = &p2; if ((error = fork1(td, &fr)) != 0) return (error); td2 = FIRST_THREAD_IN_PROC(p2); linux_proc_init(td, td2, 0); td->td_retval[0] = p2->p_pid; /* * Make this runnable after we are finished with it. */ thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); thread_unlock(td2); return (0); } static int linux_clone_proc(struct thread *td, struct linux_clone_args *args) { struct fork_req fr; int error, ff = RFPROC | RFSTOPPED; struct proc *p2; struct thread *td2; int exit_signal; struct linux_emuldata *em; #ifdef DEBUG if (ldebug(clone)) { printf(ARGS(clone, "flags %x, stack %p, parent tid: %p, " "child tid: %p"), (unsigned)args->flags, args->stack, args->parent_tidptr, args->child_tidptr); } #endif exit_signal = args->flags & 0x000000ff; if (LINUX_SIG_VALID(exit_signal)) { exit_signal = linux_to_bsd_signal(exit_signal); } else if (exit_signal != 0) return (EINVAL); if (args->flags & LINUX_CLONE_VM) ff |= RFMEM; if (args->flags & LINUX_CLONE_SIGHAND) ff |= RFSIGSHARE; /* * XXX: In Linux, sharing of fs info (chroot/cwd/umask) * and open files is independent. In FreeBSD, its in one * structure but in reality it does not cause any problems * because both of these flags are usually set together. */ if (!(args->flags & (LINUX_CLONE_FILES | LINUX_CLONE_FS))) ff |= RFFDG; if (args->flags & LINUX_CLONE_PARENT_SETTID) if (args->parent_tidptr == NULL) return (EINVAL); if (args->flags & LINUX_CLONE_VFORK) ff |= RFPPWAIT; bzero(&fr, sizeof(fr)); fr.fr_flags = ff; fr.fr_procp = &p2; error = fork1(td, &fr); if (error) return (error); td2 = FIRST_THREAD_IN_PROC(p2); /* create the emuldata */ linux_proc_init(td, td2, args->flags); em = em_find(td2); KASSERT(em != NULL, ("clone_proc: emuldata not found.\n")); if (args->flags & LINUX_CLONE_CHILD_SETTID) em->child_set_tid = args->child_tidptr; else em->child_set_tid = NULL; if (args->flags & LINUX_CLONE_CHILD_CLEARTID) em->child_clear_tid = args->child_tidptr; else em->child_clear_tid = NULL; if (args->flags & LINUX_CLONE_PARENT_SETTID) { error = copyout(&p2->p_pid, args->parent_tidptr, sizeof(p2->p_pid)); if (error) printf(LMSG("copyout failed!")); } PROC_LOCK(p2); p2->p_sigparent = exit_signal; PROC_UNLOCK(p2); /* * In a case of stack = NULL, we are supposed to COW calling process * stack. This is what normal fork() does, so we just keep tf_rsp arg * intact. */ linux_set_upcall_kse(td2, PTROUT(args->stack)); if (args->flags & LINUX_CLONE_SETTLS) linux_set_cloned_tls(td2, args->tls); /* * If CLONE_PARENT is set, then the parent of the new process will be * the same as that of the calling process. */ if (args->flags & LINUX_CLONE_PARENT) { sx_xlock(&proctree_lock); PROC_LOCK(p2); proc_reparent(p2, td->td_proc->p_pptr); PROC_UNLOCK(p2); sx_xunlock(&proctree_lock); } #ifdef DEBUG if (ldebug(clone)) printf(LMSG("clone: successful rfork to %d, " "stack %p sig = %d"), (int)p2->p_pid, args->stack, exit_signal); #endif /* * Make this runnable after we are finished with it. */ thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); thread_unlock(td2); td->td_retval[0] = p2->p_pid; return (0); } static int linux_clone_thread(struct thread *td, struct linux_clone_args *args) { struct linux_emuldata *em; struct thread *newtd; struct proc *p; int error; #ifdef DEBUG if (ldebug(clone)) { printf(ARGS(clone, "thread: flags %x, stack %p, parent tid: %p, " "child tid: %p"), (unsigned)args->flags, args->stack, args->parent_tidptr, args->child_tidptr); } #endif LINUX_CTR4(clone_thread, "thread(%d) flags %x ptid %p ctid %p", td->td_tid, (unsigned)args->flags, args->parent_tidptr, args->child_tidptr); if (args->flags & LINUX_CLONE_PARENT_SETTID) if (args->parent_tidptr == NULL) return (EINVAL); /* Threads should be created with own stack */ if (args->stack == NULL) return (EINVAL); p = td->td_proc; #ifdef RACCT if (racct_enable) { PROC_LOCK(p); error = racct_add(p, RACCT_NTHR, 1); PROC_UNLOCK(p); if (error != 0) return (EPROCLIM); } #endif /* Initialize our td */ error = kern_thr_alloc(p, 0, &newtd); if (error) goto fail; cpu_set_upcall(newtd, td); bzero(&newtd->td_startzero, __rangeof(struct thread, td_startzero, td_endzero)); bcopy(&td->td_startcopy, &newtd->td_startcopy, __rangeof(struct thread, td_startcopy, td_endcopy)); newtd->td_proc = p; thread_cow_get(newtd, td); /* create the emuldata */ linux_proc_init(td, newtd, args->flags); em = em_find(newtd); KASSERT(em != NULL, ("clone_thread: emuldata not found.\n")); if (args->flags & LINUX_CLONE_SETTLS) linux_set_cloned_tls(newtd, args->tls); if (args->flags & LINUX_CLONE_CHILD_SETTID) em->child_set_tid = args->child_tidptr; else em->child_set_tid = NULL; if (args->flags & LINUX_CLONE_CHILD_CLEARTID) em->child_clear_tid = args->child_tidptr; else em->child_clear_tid = NULL; cpu_thread_clean(newtd); linux_set_upcall_kse(newtd, PTROUT(args->stack)); PROC_LOCK(p); p->p_flag |= P_HADTHREADS; bcopy(p->p_comm, newtd->td_name, sizeof(newtd->td_name)); if (args->flags & LINUX_CLONE_PARENT) thread_link(newtd, p->p_pptr); else thread_link(newtd, p); thread_lock(td); /* let the scheduler know about these things. */ sched_fork_thread(td, newtd); thread_unlock(td); if (P_SHOULDSTOP(p)) newtd->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK; PROC_UNLOCK(p); tidhash_add(newtd); #ifdef DEBUG if (ldebug(clone)) printf(ARGS(clone, "successful clone to %d, stack %p"), (int)newtd->td_tid, args->stack); #endif LINUX_CTR2(clone_thread, "thread(%d) successful clone to %d", td->td_tid, newtd->td_tid); if (args->flags & LINUX_CLONE_PARENT_SETTID) { error = copyout(&newtd->td_tid, args->parent_tidptr, sizeof(newtd->td_tid)); if (error) printf(LMSG("clone_thread: copyout failed!")); } /* * Make this runnable after we are finished with it. */ thread_lock(newtd); TD_SET_CAN_RUN(newtd); sched_add(newtd, SRQ_BORING); thread_unlock(newtd); td->td_retval[0] = newtd->td_tid; return (0); fail: #ifdef RACCT if (racct_enable) { PROC_LOCK(p); racct_sub(p, RACCT_NTHR, 1); PROC_UNLOCK(p); } #endif return (error); } int linux_clone(struct thread *td, struct linux_clone_args *args) { if (args->flags & LINUX_CLONE_THREAD) return (linux_clone_thread(td, args)); else return (linux_clone_proc(td, args)); } int linux_exit(struct thread *td, struct linux_exit_args *args) { struct linux_emuldata *em; em = em_find(td); KASSERT(em != NULL, ("exit: emuldata not found.\n")); LINUX_CTR2(exit, "thread(%d) (%d)", em->em_tid, args->rval); + + umtx_thread_exit(td); linux_thread_detach(td); /* * XXX. When the last two threads of a process * exit via pthread_exit() try thr_exit() first. */ kern_thr_exit(td); exit1(td, args->rval, 0); /* NOTREACHED */ } int linux_set_tid_address(struct thread *td, struct linux_set_tid_address_args *args) { struct linux_emuldata *em; em = em_find(td); KASSERT(em != NULL, ("set_tid_address: emuldata not found.\n")); em->child_clear_tid = args->tidptr; td->td_retval[0] = em->em_tid; LINUX_CTR3(set_tid_address, "tidptr(%d) %p, returns %d", em->em_tid, args->tidptr, td->td_retval[0]); return (0); } void linux_thread_detach(struct thread *td) { struct linux_sys_futex_args cup; struct linux_emuldata *em; int *child_clear_tid; int error; em = em_find(td); KASSERT(em != NULL, ("thread_detach: emuldata not found.\n")); LINUX_CTR1(thread_detach, "thread(%d)", em->em_tid); release_futexes(td, em); child_clear_tid = em->child_clear_tid; if (child_clear_tid != NULL) { LINUX_CTR2(thread_detach, "thread(%d) %p", em->em_tid, child_clear_tid); error = suword32(child_clear_tid, 0); if (error != 0) return; cup.uaddr = child_clear_tid; cup.op = LINUX_FUTEX_WAKE; cup.val = 1; /* wake one */ cup.timeout = NULL; cup.uaddr2 = NULL; cup.val3 = 0; error = linux_sys_futex(td, &cup); /* * this cannot happen at the moment and if this happens it * probably means there is a user space bug */ if (error != 0) linux_msg(td, "futex stuff in thread_detach failed."); } } Index: head/sys/kern/kern_exit.c =================================================================== --- head/sys/kern/kern_exit.c (revision 300042) +++ head/sys/kern/kern_exit.c (revision 300043) @@ -1,1327 +1,1329 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_exit.c 8.7 (Berkeley) 2/12/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for acct_process() function prototype */ #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #include #ifdef KDTRACE_HOOKS #include dtrace_execexit_func_t dtrace_fasttrap_exit; #endif SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE1(proc, , , exit, "int"); /* Hook for NFS teardown procedure. */ void (*nlminfo_release_p)(struct proc *p); struct proc * proc_realparent(struct proc *child) { struct proc *p, *parent; sx_assert(&proctree_lock, SX_LOCKED); if ((child->p_treeflag & P_TREE_ORPHANED) == 0) { if (child->p_oppid == 0 || child->p_pptr->p_pid == child->p_oppid) parent = child->p_pptr; else parent = initproc; return (parent); } for (p = child; (p->p_treeflag & P_TREE_FIRST_ORPHAN) == 0;) { /* Cannot use LIST_PREV(), since the list head is not known. */ p = __containerof(p->p_orphan.le_prev, struct proc, p_orphan.le_next); KASSERT((p->p_treeflag & P_TREE_ORPHANED) != 0, ("missing P_ORPHAN %p", p)); } parent = __containerof(p->p_orphan.le_prev, struct proc, p_orphans.lh_first); return (parent); } void reaper_abandon_children(struct proc *p, bool exiting) { struct proc *p1, *p2, *ptmp; sx_assert(&proctree_lock, SX_LOCKED); KASSERT(p != initproc, ("reaper_abandon_children for initproc")); if ((p->p_treeflag & P_TREE_REAPER) == 0) return; p1 = p->p_reaper; LIST_FOREACH_SAFE(p2, &p->p_reaplist, p_reapsibling, ptmp) { LIST_REMOVE(p2, p_reapsibling); p2->p_reaper = p1; p2->p_reapsubtree = p->p_reapsubtree; LIST_INSERT_HEAD(&p1->p_reaplist, p2, p_reapsibling); if (exiting && p2->p_pptr == p) { PROC_LOCK(p2); proc_reparent(p2, p1); PROC_UNLOCK(p2); } } KASSERT(LIST_EMPTY(&p->p_reaplist), ("p_reaplist not empty")); p->p_treeflag &= ~P_TREE_REAPER; } static void clear_orphan(struct proc *p) { struct proc *p1; sx_assert(&proctree_lock, SA_XLOCKED); if ((p->p_treeflag & P_TREE_ORPHANED) == 0) return; if ((p->p_treeflag & P_TREE_FIRST_ORPHAN) != 0) { p1 = LIST_NEXT(p, p_orphan); if (p1 != NULL) p1->p_treeflag |= P_TREE_FIRST_ORPHAN; p->p_treeflag &= ~P_TREE_FIRST_ORPHAN; } LIST_REMOVE(p, p_orphan); p->p_treeflag &= ~P_TREE_ORPHANED; } /* * exit -- death of process. */ void sys_sys_exit(struct thread *td, struct sys_exit_args *uap) { exit1(td, uap->rval, 0); /* NOTREACHED */ } /* * Exit: deallocate address space and other resources, change proc state to * zombie, and unlink proc from allproc and parent's lists. Save exit status * and rusage for wait(). Check for child processes and orphan them. */ void exit1(struct thread *td, int rval, int signo) { struct proc *p, *nq, *q, *t; struct thread *tdt; mtx_assert(&Giant, MA_NOTOWNED); KASSERT(rval == 0 || signo == 0, ("exit1 rv %d sig %d", rval, signo)); p = td->td_proc; /* * XXX in case we're rebooting we just let init die in order to * work around an unsolved stack overflow seen very late during * shutdown on sparc64 when the gmirror worker process exists. */ if (p == initproc && rebooting == 0) { printf("init died (signal %d, exit %d)\n", signo, rval); panic("Going nowhere without my init!"); } /* * Deref SU mp, since the thread does not return to userspace. */ if (softdep_ast_cleanup != NULL) softdep_ast_cleanup(); /* * MUST abort all other threads before proceeding past here. */ PROC_LOCK(p); /* * First check if some other thread or external request got * here before us. If so, act appropriately: exit or suspend. * We must ensure that stop requests are handled before we set * P_WEXIT. */ thread_suspend_check(0); while (p->p_flag & P_HADTHREADS) { /* * Kill off the other threads. This requires * some co-operation from other parts of the kernel * so it may not be instantaneous. With this state set * any thread entering the kernel from userspace will * thread_exit() in trap(). Any thread attempting to * sleep will return immediately with EINTR or EWOULDBLOCK * which will hopefully force them to back out to userland * freeing resources as they go. Any thread attempting * to return to userland will thread_exit() from userret(). * thread_exit() will unsuspend us when the last of the * other threads exits. * If there is already a thread singler after resumption, * calling thread_single will fail; in that case, we just * re-check all suspension request, the thread should * either be suspended there or exit. */ if (!thread_single(p, SINGLE_EXIT)) /* * All other activity in this process is now * stopped. Threading support has been turned * off. */ break; /* * Recheck for new stop or suspend requests which * might appear while process lock was dropped in * thread_single(). */ thread_suspend_check(0); } KASSERT(p->p_numthreads == 1, ("exit1: proc %p exiting with %d threads", p, p->p_numthreads)); racct_sub(p, RACCT_NTHR, 1); /* Let event handler change exit status */ p->p_xexit = rval; p->p_xsig = signo; /* * Wakeup anyone in procfs' PIOCWAIT. They should have a hold * on our vmspace, so we should block below until they have * released their reference to us. Note that if they have * requested S_EXIT stops we will block here until they ack * via PIOCCONT. */ _STOPEVENT(p, S_EXIT, 0); /* * Ignore any pending request to stop due to a stop signal. * Once P_WEXIT is set, future requests will be ignored as * well. */ p->p_flag &= ~P_STOPPED_SIG; KASSERT(!P_SHOULDSTOP(p), ("exiting process is stopped")); /* * Note that we are exiting and do another wakeup of anyone in * PIOCWAIT in case they aren't listening for S_EXIT stops or * decided to wait again after we told them we are exiting. */ p->p_flag |= P_WEXIT; wakeup(&p->p_stype); /* * Wait for any processes that have a hold on our vmspace to * release their reference. */ while (p->p_lock > 0) msleep(&p->p_lock, &p->p_mtx, PWAIT, "exithold", 0); PROC_UNLOCK(p); /* Drain the limit callout while we don't have the proc locked */ callout_drain(&p->p_limco); #ifdef AUDIT /* * The Sun BSM exit token contains two components: an exit status as * passed to exit(), and a return value to indicate what sort of exit * it was. The exit status is WEXITSTATUS(rv), but it's not clear * what the return value is. */ AUDIT_ARG_EXIT(rval, 0); AUDIT_SYSCALL_EXIT(0, td); #endif /* Are we a task leader with peers? */ if (p->p_peers != NULL && p == p->p_leader) { mtx_lock(&ppeers_lock); q = p->p_peers; while (q != NULL) { PROC_LOCK(q); kern_psignal(q, SIGKILL); PROC_UNLOCK(q); q = q->p_peers; } while (p->p_peers != NULL) msleep(p, &ppeers_lock, PWAIT, "exit1", 0); mtx_unlock(&ppeers_lock); } /* * Check if any loadable modules need anything done at process exit. * E.g. SYSV IPC stuff. * Event handler could change exit status. * XXX what if one of these generates an error? */ EVENTHANDLER_INVOKE(process_exit, p); /* * If parent is waiting for us to exit or exec, * P_PPWAIT is set; we will wakeup the parent below. */ PROC_LOCK(p); stopprofclock(p); p->p_flag &= ~(P_TRACED | P_PPWAIT | P_PPTRACE); /* * Stop the real interval timer. If the handler is currently * executing, prevent it from rearming itself and let it finish. */ if (timevalisset(&p->p_realtimer.it_value) && callout_stop(&p->p_itcallout) == 0) { timevalclear(&p->p_realtimer.it_interval); msleep(&p->p_itcallout, &p->p_mtx, PWAIT, "ritwait", 0); KASSERT(!timevalisset(&p->p_realtimer.it_value), ("realtime timer is still armed")); } + PROC_UNLOCK(p); + umtx_thread_exit(td); + /* * Reset any sigio structures pointing to us as a result of * F_SETOWN with our pid. */ funsetownlst(&p->p_sigiolst); /* * If this process has an nlminfo data area (for lockd), release it */ if (nlminfo_release_p != NULL && p->p_nlminfo != NULL) (*nlminfo_release_p)(p); /* * Close open files and release open-file table. * This may block! */ fdescfree(td); /* * If this thread tickled GEOM, we need to wait for the giggling to * stop before we return to userland */ if (td->td_pflags & TDP_GEOM) g_waitidle(); /* * Remove ourself from our leader's peer list and wake our leader. */ if (p->p_leader->p_peers != NULL) { mtx_lock(&ppeers_lock); if (p->p_leader->p_peers != NULL) { q = p->p_leader; while (q->p_peers != p) q = q->p_peers; q->p_peers = p->p_peers; wakeup(p->p_leader); } mtx_unlock(&ppeers_lock); } vmspace_exit(td); killjobc(); (void)acct_process(td); #ifdef KTRACE ktrprocexit(td); #endif /* * Release reference to text vnode */ if (p->p_textvp != NULL) { vrele(p->p_textvp); p->p_textvp = NULL; } /* * Release our limits structure. */ lim_free(p->p_limit); p->p_limit = NULL; tidhash_remove(td); /* * Remove proc from allproc queue and pidhash chain. * Place onto zombproc. Unlink from parent's child list. */ sx_xlock(&allproc_lock); LIST_REMOVE(p, p_list); LIST_INSERT_HEAD(&zombproc, p, p_list); LIST_REMOVE(p, p_hash); sx_xunlock(&allproc_lock); /* * Call machine-dependent code to release any * machine-dependent resources other than the address space. * The address space is released by "vmspace_exitfree(p)" in * vm_waitproc(). */ cpu_exit(td); WITNESS_WARN(WARN_PANIC, NULL, "process (pid %d) exiting", p->p_pid); /* * Reparent all children processes: * - traced ones to the original parent (or init if we are that parent) * - the rest to init */ sx_xlock(&proctree_lock); q = LIST_FIRST(&p->p_children); if (q != NULL) /* only need this if any child is S_ZOMB */ wakeup(q->p_reaper); for (; q != NULL; q = nq) { nq = LIST_NEXT(q, p_sibling); PROC_LOCK(q); q->p_sigparent = SIGCHLD; if (!(q->p_flag & P_TRACED)) { proc_reparent(q, q->p_reaper); } else { /* * Traced processes are killed since their existence * means someone is screwing up. */ t = proc_realparent(q); if (t == p) { proc_reparent(q, q->p_reaper); } else { PROC_LOCK(t); proc_reparent(q, t); PROC_UNLOCK(t); } /* * Since q was found on our children list, the * proc_reparent() call moved q to the orphan * list due to present P_TRACED flag. Clear * orphan link for q now while q is locked. */ clear_orphan(q); q->p_flag &= ~(P_TRACED | P_STOPPED_TRACE); FOREACH_THREAD_IN_PROC(q, tdt) tdt->td_dbgflags &= ~TDB_SUSPEND; kern_psignal(q, SIGKILL); } PROC_UNLOCK(q); } /* * Also get rid of our orphans. */ while ((q = LIST_FIRST(&p->p_orphans)) != NULL) { PROC_LOCK(q); CTR2(KTR_PTRACE, "exit: pid %d, clearing orphan %d", p->p_pid, q->p_pid); clear_orphan(q); PROC_UNLOCK(q); } /* Save exit status. */ PROC_LOCK(p); p->p_xthread = td; /* Tell the prison that we are gone. */ prison_proc_free(p->p_ucred->cr_prison); #ifdef KDTRACE_HOOKS /* * Tell the DTrace fasttrap provider about the exit if it * has declared an interest. */ if (dtrace_fasttrap_exit) dtrace_fasttrap_exit(p); #endif /* * Notify interested parties of our demise. */ KNOTE_LOCKED(&p->p_klist, NOTE_EXIT); #ifdef KDTRACE_HOOKS int reason = CLD_EXITED; if (WCOREDUMP(signo)) reason = CLD_DUMPED; else if (WIFSIGNALED(signo)) reason = CLD_KILLED; SDT_PROBE1(proc, , , exit, reason); #endif /* * Just delete all entries in the p_klist. At this point we won't * report any more events, and there are nasty race conditions that * can beat us if we don't. */ knlist_clear(&p->p_klist, 1); /* * If this is a process with a descriptor, we may not need to deliver * a signal to the parent. proctree_lock is held over * procdesc_exit() to serialize concurrent calls to close() and * exit(). */ if (p->p_procdesc == NULL || procdesc_exit(p)) { /* * Notify parent that we're gone. If parent has the * PS_NOCLDWAIT flag set, or if the handler is set to SIG_IGN, * notify process 1 instead (and hope it will handle this * situation). */ PROC_LOCK(p->p_pptr); mtx_lock(&p->p_pptr->p_sigacts->ps_mtx); if (p->p_pptr->p_sigacts->ps_flag & (PS_NOCLDWAIT | PS_CLDSIGIGN)) { struct proc *pp; mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx); pp = p->p_pptr; PROC_UNLOCK(pp); proc_reparent(p, p->p_reaper); p->p_sigparent = SIGCHLD; PROC_LOCK(p->p_pptr); /* * Notify parent, so in case he was wait(2)ing or * executing waitpid(2) with our pid, he will * continue. */ wakeup(pp); } else mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx); if (p->p_pptr == p->p_reaper || p->p_pptr == initproc) childproc_exited(p); else if (p->p_sigparent != 0) { if (p->p_sigparent == SIGCHLD) childproc_exited(p); else /* LINUX thread */ kern_psignal(p->p_pptr, p->p_sigparent); } } else PROC_LOCK(p->p_pptr); sx_xunlock(&proctree_lock); /* * The state PRS_ZOMBIE prevents other proesses from sending * signal to the process, to avoid memory leak, we free memory * for signal queue at the time when the state is set. */ sigqueue_flush(&p->p_sigqueue); sigqueue_flush(&td->td_sigqueue); /* * We have to wait until after acquiring all locks before * changing p_state. We need to avoid all possible context * switches (including ones from blocking on a mutex) while * marked as a zombie. We also have to set the zombie state * before we release the parent process' proc lock to avoid * a lost wakeup. So, we first call wakeup, then we grab the * sched lock, update the state, and release the parent process' * proc lock. */ wakeup(p->p_pptr); cv_broadcast(&p->p_pwait); sched_exit(p->p_pptr, td); - umtx_thread_exit(td); PROC_SLOCK(p); p->p_state = PRS_ZOMBIE; PROC_UNLOCK(p->p_pptr); /* * Hopefully no one will try to deliver a signal to the process this * late in the game. */ knlist_destroy(&p->p_klist); /* * Save our children's rusage information in our exit rusage. */ PROC_STATLOCK(p); ruadd(&p->p_ru, &p->p_rux, &p->p_stats->p_cru, &p->p_crux); PROC_STATUNLOCK(p); /* * Make sure the scheduler takes this thread out of its tables etc. * This will also release this thread's reference to the ucred. * Other thread parts to release include pcb bits and such. */ thread_exit(); } #ifndef _SYS_SYSPROTO_H_ struct abort2_args { char *why; int nargs; void **args; }; #endif int sys_abort2(struct thread *td, struct abort2_args *uap) { struct proc *p = td->td_proc; struct sbuf *sb; void *uargs[16]; int error, i, sig; /* * Do it right now so we can log either proper call of abort2(), or * note, that invalid argument was passed. 512 is big enough to * handle 16 arguments' descriptions with additional comments. */ sb = sbuf_new(NULL, NULL, 512, SBUF_FIXEDLEN); sbuf_clear(sb); sbuf_printf(sb, "%s(pid %d uid %d) aborted: ", p->p_comm, p->p_pid, td->td_ucred->cr_uid); /* * Since we can't return from abort2(), send SIGKILL in cases, where * abort2() was called improperly */ sig = SIGKILL; /* Prevent from DoSes from user-space. */ if (uap->nargs < 0 || uap->nargs > 16) goto out; if (uap->nargs > 0) { if (uap->args == NULL) goto out; error = copyin(uap->args, uargs, uap->nargs * sizeof(void *)); if (error != 0) goto out; } /* * Limit size of 'reason' string to 128. Will fit even when * maximal number of arguments was chosen to be logged. */ if (uap->why != NULL) { error = sbuf_copyin(sb, uap->why, 128); if (error < 0) goto out; } else { sbuf_printf(sb, "(null)"); } if (uap->nargs > 0) { sbuf_printf(sb, "("); for (i = 0;i < uap->nargs; i++) sbuf_printf(sb, "%s%p", i == 0 ? "" : ", ", uargs[i]); sbuf_printf(sb, ")"); } /* * Final stage: arguments were proper, string has been * successfully copied from userspace, and copying pointers * from user-space succeed. */ sig = SIGABRT; out: if (sig == SIGKILL) { sbuf_trim(sb); sbuf_printf(sb, " (Reason text inaccessible)"); } sbuf_cat(sb, "\n"); sbuf_finish(sb); log(LOG_INFO, "%s", sbuf_data(sb)); sbuf_delete(sb); exit1(td, 0, sig); return (0); } #ifdef COMPAT_43 /* * The dirty work is handled by kern_wait(). */ int owait(struct thread *td, struct owait_args *uap __unused) { int error, status; error = kern_wait(td, WAIT_ANY, &status, 0, NULL); if (error == 0) td->td_retval[1] = status; return (error); } #endif /* COMPAT_43 */ /* * The dirty work is handled by kern_wait(). */ int sys_wait4(struct thread *td, struct wait4_args *uap) { struct rusage ru, *rup; int error, status; if (uap->rusage != NULL) rup = &ru; else rup = NULL; error = kern_wait(td, uap->pid, &status, uap->options, rup); if (uap->status != NULL && error == 0) error = copyout(&status, uap->status, sizeof(status)); if (uap->rusage != NULL && error == 0) error = copyout(&ru, uap->rusage, sizeof(struct rusage)); return (error); } int sys_wait6(struct thread *td, struct wait6_args *uap) { struct __wrusage wru, *wrup; siginfo_t si, *sip; idtype_t idtype; id_t id; int error, status; idtype = uap->idtype; id = uap->id; if (uap->wrusage != NULL) wrup = &wru; else wrup = NULL; if (uap->info != NULL) { sip = &si; bzero(sip, sizeof(*sip)); } else sip = NULL; /* * We expect all callers of wait6() to know about WEXITED and * WTRAPPED. */ error = kern_wait6(td, idtype, id, &status, uap->options, wrup, sip); if (uap->status != NULL && error == 0) error = copyout(&status, uap->status, sizeof(status)); if (uap->wrusage != NULL && error == 0) error = copyout(&wru, uap->wrusage, sizeof(wru)); if (uap->info != NULL && error == 0) error = copyout(&si, uap->info, sizeof(si)); return (error); } /* * Reap the remains of a zombie process and optionally return status and * rusage. Asserts and will release both the proctree_lock and the process * lock as part of its work. */ void proc_reap(struct thread *td, struct proc *p, int *status, int options) { struct proc *q, *t; sx_assert(&proctree_lock, SA_XLOCKED); PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); KASSERT(p->p_state == PRS_ZOMBIE, ("proc_reap: !PRS_ZOMBIE")); q = td->td_proc; PROC_SUNLOCK(p); if (status) *status = KW_EXITCODE(p->p_xexit, p->p_xsig); if (options & WNOWAIT) { /* * Only poll, returning the status. Caller does not wish to * release the proc struct just yet. */ PROC_UNLOCK(p); sx_xunlock(&proctree_lock); return; } PROC_LOCK(q); sigqueue_take(p->p_ksi); PROC_UNLOCK(q); /* * If we got the child via a ptrace 'attach', we need to give it back * to the old parent. */ if (p->p_oppid != 0 && p->p_oppid != p->p_pptr->p_pid) { PROC_UNLOCK(p); t = proc_realparent(p); PROC_LOCK(t); PROC_LOCK(p); CTR2(KTR_PTRACE, "wait: traced child %d moved back to parent %d", p->p_pid, t->p_pid); proc_reparent(p, t); p->p_oppid = 0; PROC_UNLOCK(p); pksignal(t, SIGCHLD, p->p_ksi); wakeup(t); cv_broadcast(&p->p_pwait); PROC_UNLOCK(t); sx_xunlock(&proctree_lock); return; } p->p_oppid = 0; PROC_UNLOCK(p); /* * Remove other references to this process to ensure we have an * exclusive reference. */ sx_xlock(&allproc_lock); LIST_REMOVE(p, p_list); /* off zombproc */ sx_xunlock(&allproc_lock); LIST_REMOVE(p, p_sibling); reaper_abandon_children(p, true); LIST_REMOVE(p, p_reapsibling); PROC_LOCK(p); clear_orphan(p); PROC_UNLOCK(p); leavepgrp(p); if (p->p_procdesc != NULL) procdesc_reap(p); sx_xunlock(&proctree_lock); /* * Removal from allproc list and process group list paired with * PROC_LOCK which was executed during that time should guarantee * nothing can reach this process anymore. As such further locking * is unnecessary. */ p->p_xexit = p->p_xsig = 0; /* XXX: why? */ PROC_LOCK(q); ruadd(&q->p_stats->p_cru, &q->p_crux, &p->p_ru, &p->p_rux); PROC_UNLOCK(q); /* * Decrement the count of procs running with this uid. */ (void)chgproccnt(p->p_ucred->cr_ruidinfo, -1, 0); /* * Destroy resource accounting information associated with the process. */ #ifdef RACCT if (racct_enable) { PROC_LOCK(p); racct_sub(p, RACCT_NPROC, 1); PROC_UNLOCK(p); } #endif racct_proc_exit(p); /* * Free credentials, arguments, and sigacts. */ crfree(p->p_ucred); proc_set_cred(p, NULL); pargs_drop(p->p_args); p->p_args = NULL; sigacts_free(p->p_sigacts); p->p_sigacts = NULL; /* * Do any thread-system specific cleanups. */ thread_wait(p); /* * Give vm and machine-dependent layer a chance to free anything that * cpu_exit couldn't release while still running in process context. */ vm_waitproc(p); #ifdef MAC mac_proc_destroy(p); #endif /* * Free any domain policy that's still hiding around. */ vm_domain_policy_cleanup(&p->p_vm_dom_policy); KASSERT(FIRST_THREAD_IN_PROC(p), ("proc_reap: no residual thread!")); uma_zfree(proc_zone, p); atomic_add_int(&nprocs, -1); } static int proc_to_reap(struct thread *td, struct proc *p, idtype_t idtype, id_t id, int *status, int options, struct __wrusage *wrusage, siginfo_t *siginfo, int check_only) { struct rusage *rup; sx_assert(&proctree_lock, SA_XLOCKED); PROC_LOCK(p); switch (idtype) { case P_ALL: if (p->p_procdesc != NULL) { PROC_UNLOCK(p); return (0); } break; case P_PID: if (p->p_pid != (pid_t)id) { PROC_UNLOCK(p); return (0); } break; case P_PGID: if (p->p_pgid != (pid_t)id) { PROC_UNLOCK(p); return (0); } break; case P_SID: if (p->p_session->s_sid != (pid_t)id) { PROC_UNLOCK(p); return (0); } break; case P_UID: if (p->p_ucred->cr_uid != (uid_t)id) { PROC_UNLOCK(p); return (0); } break; case P_GID: if (p->p_ucred->cr_gid != (gid_t)id) { PROC_UNLOCK(p); return (0); } break; case P_JAILID: if (p->p_ucred->cr_prison->pr_id != (int)id) { PROC_UNLOCK(p); return (0); } break; /* * It seems that the thread structures get zeroed out * at process exit. This makes it impossible to * support P_SETID, P_CID or P_CPUID. */ default: PROC_UNLOCK(p); return (0); } if (p_canwait(td, p)) { PROC_UNLOCK(p); return (0); } if (((options & WEXITED) == 0) && (p->p_state == PRS_ZOMBIE)) { PROC_UNLOCK(p); return (0); } /* * This special case handles a kthread spawned by linux_clone * (see linux_misc.c). The linux_wait4 and linux_waitpid * functions need to be able to distinguish between waiting * on a process and waiting on a thread. It is a thread if * p_sigparent is not SIGCHLD, and the WLINUXCLONE option * signifies we want to wait for threads and not processes. */ if ((p->p_sigparent != SIGCHLD) ^ ((options & WLINUXCLONE) != 0)) { PROC_UNLOCK(p); return (0); } if (siginfo != NULL) { bzero(siginfo, sizeof(*siginfo)); siginfo->si_errno = 0; /* * SUSv4 requires that the si_signo value is always * SIGCHLD. Obey it despite the rfork(2) interface * allows to request other signal for child exit * notification. */ siginfo->si_signo = SIGCHLD; /* * This is still a rough estimate. We will fix the * cases TRAPPED, STOPPED, and CONTINUED later. */ if (WCOREDUMP(p->p_xsig)) { siginfo->si_code = CLD_DUMPED; siginfo->si_status = WTERMSIG(p->p_xsig); } else if (WIFSIGNALED(p->p_xsig)) { siginfo->si_code = CLD_KILLED; siginfo->si_status = WTERMSIG(p->p_xsig); } else { siginfo->si_code = CLD_EXITED; siginfo->si_status = p->p_xexit; } siginfo->si_pid = p->p_pid; siginfo->si_uid = p->p_ucred->cr_uid; /* * The si_addr field would be useful additional * detail, but apparently the PC value may be lost * when we reach this point. bzero() above sets * siginfo->si_addr to NULL. */ } /* * There should be no reason to limit resources usage info to * exited processes only. A snapshot about any resources used * by a stopped process may be exactly what is needed. */ if (wrusage != NULL) { rup = &wrusage->wru_self; *rup = p->p_ru; PROC_STATLOCK(p); calcru(p, &rup->ru_utime, &rup->ru_stime); PROC_STATUNLOCK(p); rup = &wrusage->wru_children; *rup = p->p_stats->p_cru; calccru(p, &rup->ru_utime, &rup->ru_stime); } if (p->p_state == PRS_ZOMBIE && !check_only) { PROC_SLOCK(p); proc_reap(td, p, status, options); return (-1); } PROC_UNLOCK(p); return (1); } int kern_wait(struct thread *td, pid_t pid, int *status, int options, struct rusage *rusage) { struct __wrusage wru, *wrup; idtype_t idtype; id_t id; int ret; /* * Translate the special pid values into the (idtype, pid) * pair for kern_wait6. The WAIT_MYPGRP case is handled by * kern_wait6() on its own. */ if (pid == WAIT_ANY) { idtype = P_ALL; id = 0; } else if (pid < 0) { idtype = P_PGID; id = (id_t)-pid; } else { idtype = P_PID; id = (id_t)pid; } if (rusage != NULL) wrup = &wru; else wrup = NULL; /* * For backward compatibility we implicitly add flags WEXITED * and WTRAPPED here. */ options |= WEXITED | WTRAPPED; ret = kern_wait6(td, idtype, id, status, options, wrup, NULL); if (rusage != NULL) *rusage = wru.wru_self; return (ret); } int kern_wait6(struct thread *td, idtype_t idtype, id_t id, int *status, int options, struct __wrusage *wrusage, siginfo_t *siginfo) { struct proc *p, *q; pid_t pid; int error, nfound, ret; AUDIT_ARG_VALUE((int)idtype); /* XXX - This is likely wrong! */ AUDIT_ARG_PID((pid_t)id); /* XXX - This may be wrong! */ AUDIT_ARG_VALUE(options); q = td->td_proc; if ((pid_t)id == WAIT_MYPGRP && (idtype == P_PID || idtype == P_PGID)) { PROC_LOCK(q); id = (id_t)q->p_pgid; PROC_UNLOCK(q); idtype = P_PGID; } /* If we don't know the option, just return. */ if ((options & ~(WUNTRACED | WNOHANG | WCONTINUED | WNOWAIT | WEXITED | WTRAPPED | WLINUXCLONE)) != 0) return (EINVAL); if ((options & (WEXITED | WUNTRACED | WCONTINUED | WTRAPPED)) == 0) { /* * We will be unable to find any matching processes, * because there are no known events to look for. * Prefer to return error instead of blocking * indefinitely. */ return (EINVAL); } loop: if (q->p_flag & P_STATCHILD) { PROC_LOCK(q); q->p_flag &= ~P_STATCHILD; PROC_UNLOCK(q); } nfound = 0; sx_xlock(&proctree_lock); LIST_FOREACH(p, &q->p_children, p_sibling) { pid = p->p_pid; ret = proc_to_reap(td, p, idtype, id, status, options, wrusage, siginfo, 0); if (ret == 0) continue; else if (ret == 1) nfound++; else { td->td_retval[0] = pid; return (0); } PROC_LOCK(p); PROC_SLOCK(p); if ((options & WTRAPPED) != 0 && (p->p_flag & P_TRACED) != 0 && (p->p_flag & (P_STOPPED_TRACE | P_STOPPED_SIG)) != 0 && (p->p_suspcount == p->p_numthreads) && ((p->p_flag & P_WAITED) == 0)) { PROC_SUNLOCK(p); if ((options & WNOWAIT) == 0) p->p_flag |= P_WAITED; sx_xunlock(&proctree_lock); if (status != NULL) *status = W_STOPCODE(p->p_xsig); if (siginfo != NULL) { siginfo->si_status = p->p_xsig; siginfo->si_code = CLD_TRAPPED; } if ((options & WNOWAIT) == 0) { PROC_LOCK(q); sigqueue_take(p->p_ksi); PROC_UNLOCK(q); } CTR4(KTR_PTRACE, "wait: returning trapped pid %d status %#x (xstat %d) xthread %d", p->p_pid, W_STOPCODE(p->p_xsig), p->p_xsig, p->p_xthread != NULL ? p->p_xthread->td_tid : -1); PROC_UNLOCK(p); td->td_retval[0] = pid; return (0); } if ((options & WUNTRACED) != 0 && (p->p_flag & P_STOPPED_SIG) != 0 && (p->p_suspcount == p->p_numthreads) && ((p->p_flag & P_WAITED) == 0)) { PROC_SUNLOCK(p); if ((options & WNOWAIT) == 0) p->p_flag |= P_WAITED; sx_xunlock(&proctree_lock); if (status != NULL) *status = W_STOPCODE(p->p_xsig); if (siginfo != NULL) { siginfo->si_status = p->p_xsig; siginfo->si_code = CLD_STOPPED; } if ((options & WNOWAIT) == 0) { PROC_LOCK(q); sigqueue_take(p->p_ksi); PROC_UNLOCK(q); } PROC_UNLOCK(p); td->td_retval[0] = pid; return (0); } PROC_SUNLOCK(p); if ((options & WCONTINUED) != 0 && (p->p_flag & P_CONTINUED) != 0) { sx_xunlock(&proctree_lock); if ((options & WNOWAIT) == 0) { p->p_flag &= ~P_CONTINUED; PROC_LOCK(q); sigqueue_take(p->p_ksi); PROC_UNLOCK(q); } PROC_UNLOCK(p); if (status != NULL) *status = SIGCONT; if (siginfo != NULL) { siginfo->si_status = SIGCONT; siginfo->si_code = CLD_CONTINUED; } td->td_retval[0] = pid; return (0); } PROC_UNLOCK(p); } /* * Look in the orphans list too, to allow the parent to * collect it's child exit status even if child is being * debugged. * * Debugger detaches from the parent upon successful * switch-over from parent to child. At this point due to * re-parenting the parent loses the child to debugger and a * wait4(2) call would report that it has no children to wait * for. By maintaining a list of orphans we allow the parent * to successfully wait until the child becomes a zombie. */ if (nfound == 0) { LIST_FOREACH(p, &q->p_orphans, p_orphan) { ret = proc_to_reap(td, p, idtype, id, NULL, options, NULL, NULL, 1); if (ret != 0) { KASSERT(ret != -1, ("reaped an orphan (pid %d)", (int)td->td_retval[0])); nfound++; break; } } } if (nfound == 0) { sx_xunlock(&proctree_lock); return (ECHILD); } if (options & WNOHANG) { sx_xunlock(&proctree_lock); td->td_retval[0] = 0; return (0); } PROC_LOCK(q); sx_xunlock(&proctree_lock); if (q->p_flag & P_STATCHILD) { q->p_flag &= ~P_STATCHILD; error = 0; } else error = msleep(q, &q->p_mtx, PWAIT | PCATCH, "wait", 0); PROC_UNLOCK(q); if (error) return (error); goto loop; } /* * Make process 'parent' the new parent of process 'child'. * Must be called with an exclusive hold of proctree lock. */ void proc_reparent(struct proc *child, struct proc *parent) { sx_assert(&proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(child, MA_OWNED); if (child->p_pptr == parent) return; PROC_LOCK(child->p_pptr); sigqueue_take(child->p_ksi); PROC_UNLOCK(child->p_pptr); LIST_REMOVE(child, p_sibling); LIST_INSERT_HEAD(&parent->p_children, child, p_sibling); clear_orphan(child); if (child->p_flag & P_TRACED) { if (LIST_EMPTY(&child->p_pptr->p_orphans)) { child->p_treeflag |= P_TREE_FIRST_ORPHAN; LIST_INSERT_HEAD(&child->p_pptr->p_orphans, child, p_orphan); } else { LIST_INSERT_AFTER(LIST_FIRST(&child->p_pptr->p_orphans), child, p_orphan); } child->p_treeflag |= P_TREE_ORPHANED; } child->p_pptr = parent; } Index: head/sys/kern/kern_thr.c =================================================================== --- head/sys/kern/kern_thr.c (revision 300042) +++ head/sys/kern/kern_thr.c (revision 300043) @@ -1,610 +1,611 @@ /*- * Copyright (c) 2003, Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_posix.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static SYSCTL_NODE(_kern, OID_AUTO, threads, CTLFLAG_RW, 0, "thread allocation"); static int max_threads_per_proc = 1500; SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_per_proc, CTLFLAG_RW, &max_threads_per_proc, 0, "Limit on threads per proc"); static int max_threads_hits; SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_hits, CTLFLAG_RD, &max_threads_hits, 0, "kern.threads.max_threads_per_proc hit count"); #ifdef COMPAT_FREEBSD32 static inline int suword_lwpid(void *addr, lwpid_t lwpid) { int error; if (SV_CURPROC_FLAG(SV_LP64)) error = suword(addr, lwpid); else error = suword32(addr, lwpid); return (error); } #else #define suword_lwpid suword #endif /* * System call interface. */ struct thr_create_initthr_args { ucontext_t ctx; long *tid; }; static int thr_create_initthr(struct thread *td, void *thunk) { struct thr_create_initthr_args *args; /* Copy out the child tid. */ args = thunk; if (args->tid != NULL && suword_lwpid(args->tid, td->td_tid)) return (EFAULT); return (set_mcontext(td, &args->ctx.uc_mcontext)); } int sys_thr_create(struct thread *td, struct thr_create_args *uap) /* ucontext_t *ctx, long *id, int flags */ { struct thr_create_initthr_args args; int error; if ((error = copyin(uap->ctx, &args.ctx, sizeof(args.ctx)))) return (error); args.tid = uap->id; return (thread_create(td, NULL, thr_create_initthr, &args)); } int sys_thr_new(struct thread *td, struct thr_new_args *uap) /* struct thr_param * */ { struct thr_param param; int error; if (uap->param_size < 0 || uap->param_size > sizeof(param)) return (EINVAL); bzero(¶m, sizeof(param)); if ((error = copyin(uap->param, ¶m, uap->param_size))) return (error); return (kern_thr_new(td, ¶m)); } static int thr_new_initthr(struct thread *td, void *thunk) { stack_t stack; struct thr_param *param; /* * Here we copy out tid to two places, one for child and one * for parent, because pthread can create a detached thread, * if parent wants to safely access child tid, it has to provide * its storage, because child thread may exit quickly and * memory is freed before parent thread can access it. */ param = thunk; if ((param->child_tid != NULL && suword_lwpid(param->child_tid, td->td_tid)) || (param->parent_tid != NULL && suword_lwpid(param->parent_tid, td->td_tid))) return (EFAULT); /* Set up our machine context. */ stack.ss_sp = param->stack_base; stack.ss_size = param->stack_size; /* Set upcall address to user thread entry function. */ cpu_set_upcall_kse(td, param->start_func, param->arg, &stack); /* Setup user TLS address and TLS pointer register. */ return (cpu_set_user_tls(td, param->tls_base)); } int kern_thr_new(struct thread *td, struct thr_param *param) { struct rtprio rtp, *rtpp; int error; rtpp = NULL; if (param->rtp != 0) { error = copyin(param->rtp, &rtp, sizeof(struct rtprio)); if (error) return (error); rtpp = &rtp; } return (thread_create(td, rtpp, thr_new_initthr, param)); } int thread_create(struct thread *td, struct rtprio *rtp, int (*initialize_thread)(struct thread *, void *), void *thunk) { struct thread *newtd; struct proc *p; int error; p = td->td_proc; if (rtp != NULL) { switch(rtp->type) { case RTP_PRIO_REALTIME: case RTP_PRIO_FIFO: /* Only root can set scheduler policy */ if (priv_check(td, PRIV_SCHED_SETPOLICY) != 0) return (EPERM); if (rtp->prio > RTP_PRIO_MAX) return (EINVAL); break; case RTP_PRIO_NORMAL: rtp->prio = 0; break; default: return (EINVAL); } } #ifdef RACCT if (racct_enable) { PROC_LOCK(p); error = racct_add(p, RACCT_NTHR, 1); PROC_UNLOCK(p); if (error != 0) return (EPROCLIM); } #endif /* Initialize our td */ error = kern_thr_alloc(p, 0, &newtd); if (error) goto fail; cpu_set_upcall(newtd, td); bzero(&newtd->td_startzero, __rangeof(struct thread, td_startzero, td_endzero)); bcopy(&td->td_startcopy, &newtd->td_startcopy, __rangeof(struct thread, td_startcopy, td_endcopy)); newtd->td_proc = td->td_proc; thread_cow_get(newtd, td); error = initialize_thread(newtd, thunk); if (error != 0) { thread_cow_free(newtd); thread_free(newtd); goto fail; } PROC_LOCK(p); p->p_flag |= P_HADTHREADS; thread_link(newtd, p); bcopy(p->p_comm, newtd->td_name, sizeof(newtd->td_name)); thread_lock(td); /* let the scheduler know about these things. */ sched_fork_thread(td, newtd); thread_unlock(td); if (P_SHOULDSTOP(p)) newtd->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK; if (p->p_flag2 & P2_LWP_EVENTS) newtd->td_dbgflags |= TDB_BORN; /* * Copy the existing thread VM policy into the new thread. */ vm_domain_policy_localcopy(&newtd->td_vm_dom_policy, &td->td_vm_dom_policy); PROC_UNLOCK(p); tidhash_add(newtd); thread_lock(newtd); if (rtp != NULL) { if (!(td->td_pri_class == PRI_TIMESHARE && rtp->type == RTP_PRIO_NORMAL)) { rtp_to_pri(rtp, newtd); sched_prio(newtd, newtd->td_user_pri); } /* ignore timesharing class */ } TD_SET_CAN_RUN(newtd); sched_add(newtd, SRQ_BORING); thread_unlock(newtd); return (0); fail: #ifdef RACCT if (racct_enable) { PROC_LOCK(p); racct_sub(p, RACCT_NTHR, 1); PROC_UNLOCK(p); } #endif return (error); } int sys_thr_self(struct thread *td, struct thr_self_args *uap) /* long *id */ { int error; error = suword_lwpid(uap->id, (unsigned)td->td_tid); if (error == -1) return (EFAULT); return (0); } int sys_thr_exit(struct thread *td, struct thr_exit_args *uap) /* long *state */ { + umtx_thread_exit(td); + /* Signal userland that it can free the stack. */ if ((void *)uap->state != NULL) { suword_lwpid(uap->state, 1); kern_umtx_wake(td, uap->state, INT_MAX, 0); } return (kern_thr_exit(td)); } int kern_thr_exit(struct thread *td) { struct proc *p; p = td->td_proc; /* * If all of the threads in a process call this routine to * exit (e.g. all threads call pthread_exit()), exactly one * thread should return to the caller to terminate the process * instead of the thread. * * Checking p_numthreads alone is not sufficient since threads * might be committed to terminating while the PROC_LOCK is * dropped in either ptracestop() or while removing this thread * from the tidhash. Instead, the p_pendingexits field holds * the count of threads in either of those states and a thread * is considered the "last" thread if all of the other threads * in a process are already terminating. */ PROC_LOCK(p); if (p->p_numthreads == p->p_pendingexits + 1) { /* * Ignore attempts to shut down last thread in the * proc. This will actually call _exit(2) in the * usermode trampoline when it returns. */ PROC_UNLOCK(p); return (0); } p->p_pendingexits++; td->td_dbgflags |= TDB_EXIT; if (p->p_flag & P_TRACED && p->p_flag2 & P2_LWP_EVENTS) ptracestop(td, SIGTRAP); PROC_UNLOCK(p); tidhash_remove(td); PROC_LOCK(p); p->p_pendingexits--; /* * The check above should prevent all other threads from this * process from exiting while the PROC_LOCK is dropped, so * there must be at least one other thread other than the * current thread. */ KASSERT(p->p_numthreads > 1, ("too few threads")); racct_sub(p, RACCT_NTHR, 1); tdsigcleanup(td); - umtx_thread_exit(td); PROC_SLOCK(p); thread_stopped(p); thread_exit(); /* NOTREACHED */ } int sys_thr_kill(struct thread *td, struct thr_kill_args *uap) /* long id, int sig */ { ksiginfo_t ksi; struct thread *ttd; struct proc *p; int error; p = td->td_proc; ksiginfo_init(&ksi); ksi.ksi_signo = uap->sig; ksi.ksi_code = SI_LWP; ksi.ksi_pid = p->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; if (uap->id == -1) { if (uap->sig != 0 && !_SIG_VALID(uap->sig)) { error = EINVAL; } else { error = ESRCH; PROC_LOCK(p); FOREACH_THREAD_IN_PROC(p, ttd) { if (ttd != td) { error = 0; if (uap->sig == 0) break; tdksignal(ttd, uap->sig, &ksi); } } PROC_UNLOCK(p); } } else { error = 0; ttd = tdfind((lwpid_t)uap->id, p->p_pid); if (ttd == NULL) return (ESRCH); if (uap->sig == 0) ; else if (!_SIG_VALID(uap->sig)) error = EINVAL; else tdksignal(ttd, uap->sig, &ksi); PROC_UNLOCK(ttd->td_proc); } return (error); } int sys_thr_kill2(struct thread *td, struct thr_kill2_args *uap) /* pid_t pid, long id, int sig */ { ksiginfo_t ksi; struct thread *ttd; struct proc *p; int error; AUDIT_ARG_SIGNUM(uap->sig); ksiginfo_init(&ksi); ksi.ksi_signo = uap->sig; ksi.ksi_code = SI_LWP; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; if (uap->id == -1) { if ((p = pfind(uap->pid)) == NULL) return (ESRCH); AUDIT_ARG_PROCESS(p); error = p_cansignal(td, p, uap->sig); if (error) { PROC_UNLOCK(p); return (error); } if (uap->sig != 0 && !_SIG_VALID(uap->sig)) { error = EINVAL; } else { error = ESRCH; FOREACH_THREAD_IN_PROC(p, ttd) { if (ttd != td) { error = 0; if (uap->sig == 0) break; tdksignal(ttd, uap->sig, &ksi); } } } PROC_UNLOCK(p); } else { ttd = tdfind((lwpid_t)uap->id, uap->pid); if (ttd == NULL) return (ESRCH); p = ttd->td_proc; AUDIT_ARG_PROCESS(p); error = p_cansignal(td, p, uap->sig); if (uap->sig == 0) ; else if (!_SIG_VALID(uap->sig)) error = EINVAL; else tdksignal(ttd, uap->sig, &ksi); PROC_UNLOCK(p); } return (error); } int sys_thr_suspend(struct thread *td, struct thr_suspend_args *uap) /* const struct timespec *timeout */ { struct timespec ts, *tsp; int error; tsp = NULL; if (uap->timeout != NULL) { error = umtx_copyin_timeout(uap->timeout, &ts); if (error != 0) return (error); tsp = &ts; } return (kern_thr_suspend(td, tsp)); } int kern_thr_suspend(struct thread *td, struct timespec *tsp) { struct proc *p = td->td_proc; struct timeval tv; int error = 0; int timo = 0; if (td->td_pflags & TDP_WAKEUP) { td->td_pflags &= ~TDP_WAKEUP; return (0); } if (tsp != NULL) { if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) error = EWOULDBLOCK; else { TIMESPEC_TO_TIMEVAL(&tv, tsp); timo = tvtohz(&tv); } } PROC_LOCK(p); if (error == 0 && (td->td_flags & TDF_THRWAKEUP) == 0) error = msleep((void *)td, &p->p_mtx, PCATCH, "lthr", timo); if (td->td_flags & TDF_THRWAKEUP) { thread_lock(td); td->td_flags &= ~TDF_THRWAKEUP; thread_unlock(td); PROC_UNLOCK(p); return (0); } PROC_UNLOCK(p); if (error == EWOULDBLOCK) error = ETIMEDOUT; else if (error == ERESTART) { if (timo != 0) error = EINTR; } return (error); } int sys_thr_wake(struct thread *td, struct thr_wake_args *uap) /* long id */ { struct proc *p; struct thread *ttd; if (uap->id == td->td_tid) { td->td_pflags |= TDP_WAKEUP; return (0); } p = td->td_proc; ttd = tdfind((lwpid_t)uap->id, p->p_pid); if (ttd == NULL) return (ESRCH); thread_lock(ttd); ttd->td_flags |= TDF_THRWAKEUP; thread_unlock(ttd); wakeup((void *)ttd); PROC_UNLOCK(p); return (0); } int sys_thr_set_name(struct thread *td, struct thr_set_name_args *uap) { struct proc *p; char name[MAXCOMLEN + 1]; struct thread *ttd; int error; error = 0; name[0] = '\0'; if (uap->name != NULL) { error = copyinstr(uap->name, name, sizeof(name), NULL); if (error) return (error); } p = td->td_proc; ttd = tdfind((lwpid_t)uap->id, p->p_pid); if (ttd == NULL) return (ESRCH); strcpy(ttd->td_name, name); #ifdef KTR sched_clear_tdname(ttd); #endif PROC_UNLOCK(p); return (error); } int kern_thr_alloc(struct proc *p, int pages, struct thread **ntd) { /* Have race condition but it is cheap. */ if (p->p_numthreads >= max_threads_per_proc) { ++max_threads_hits; return (EPROCLIM); } *ntd = thread_alloc(pages); if (*ntd == NULL) return (ENOMEM); return (0); } Index: head/sys/kern/kern_thread.c =================================================================== --- head/sys/kern/kern_thread.c (revision 300042) +++ head/sys/kern/kern_thread.c (revision 300043) @@ -1,1205 +1,1206 @@ /*- * Copyright (C) 2001 Julian Elischer . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ #include "opt_witness.h" #include "opt_hwpmc_hooks.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include #include #include #include SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE(proc, , , lwp__exit); /* * thread related storage. */ static uma_zone_t thread_zone; TAILQ_HEAD(, thread) zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads); static struct mtx zombie_lock; MTX_SYSINIT(zombie_lock, &zombie_lock, "zombie lock", MTX_SPIN); static void thread_zombie(struct thread *); static int thread_unsuspend_one(struct thread *td, struct proc *p, bool boundary); #define TID_BUFFER_SIZE 1024 struct mtx tid_lock; static struct unrhdr *tid_unrhdr; static lwpid_t tid_buffer[TID_BUFFER_SIZE]; static int tid_head, tid_tail; static MALLOC_DEFINE(M_TIDHASH, "tidhash", "thread hash"); struct tidhashhead *tidhashtbl; u_long tidhash; struct rwlock tidhash_lock; static lwpid_t tid_alloc(void) { lwpid_t tid; tid = alloc_unr(tid_unrhdr); if (tid != -1) return (tid); mtx_lock(&tid_lock); if (tid_head == tid_tail) { mtx_unlock(&tid_lock); return (-1); } tid = tid_buffer[tid_head]; tid_head = (tid_head + 1) % TID_BUFFER_SIZE; mtx_unlock(&tid_lock); return (tid); } static void tid_free(lwpid_t tid) { lwpid_t tmp_tid = -1; mtx_lock(&tid_lock); if ((tid_tail + 1) % TID_BUFFER_SIZE == tid_head) { tmp_tid = tid_buffer[tid_head]; tid_head = (tid_head + 1) % TID_BUFFER_SIZE; } tid_buffer[tid_tail] = tid; tid_tail = (tid_tail + 1) % TID_BUFFER_SIZE; mtx_unlock(&tid_lock); if (tmp_tid != -1) free_unr(tid_unrhdr, tmp_tid); } /* * Prepare a thread for use. */ static int thread_ctor(void *mem, int size, void *arg, int flags) { struct thread *td; td = (struct thread *)mem; td->td_state = TDS_INACTIVE; td->td_oncpu = NOCPU; td->td_tid = tid_alloc(); /* * Note that td_critnest begins life as 1 because the thread is not * running and is thereby implicitly waiting to be on the receiving * end of a context switch. */ td->td_critnest = 1; td->td_lend_user_pri = PRI_MAX; EVENTHANDLER_INVOKE(thread_ctor, td); #ifdef AUDIT audit_thread_alloc(td); #endif umtx_thread_alloc(td); return (0); } /* * Reclaim a thread after use. */ static void thread_dtor(void *mem, int size, void *arg) { struct thread *td; td = (struct thread *)mem; #ifdef INVARIANTS /* Verify that this thread is in a safe state to free. */ switch (td->td_state) { case TDS_INHIBITED: case TDS_RUNNING: case TDS_CAN_RUN: case TDS_RUNQ: /* * We must never unlink a thread that is in one of * these states, because it is currently active. */ panic("bad state for thread unlinking"); /* NOTREACHED */ case TDS_INACTIVE: break; default: panic("bad thread state"); /* NOTREACHED */ } #endif #ifdef AUDIT audit_thread_free(td); #endif /* Free all OSD associated to this thread. */ osd_thread_exit(td); EVENTHANDLER_INVOKE(thread_dtor, td); tid_free(td->td_tid); } /* * Initialize type-stable parts of a thread (when newly created). */ static int thread_init(void *mem, int size, int flags) { struct thread *td; td = (struct thread *)mem; td->td_sleepqueue = sleepq_alloc(); td->td_turnstile = turnstile_alloc(); td->td_rlqe = NULL; EVENTHANDLER_INVOKE(thread_init, td); td->td_sched = (struct td_sched *)&td[1]; umtx_thread_init(td); td->td_kstack = 0; td->td_sel = NULL; return (0); } /* * Tear down type-stable parts of a thread (just before being discarded). */ static void thread_fini(void *mem, int size) { struct thread *td; td = (struct thread *)mem; EVENTHANDLER_INVOKE(thread_fini, td); rlqentry_free(td->td_rlqe); turnstile_free(td->td_turnstile); sleepq_free(td->td_sleepqueue); umtx_thread_fini(td); seltdfini(td); } /* * For a newly created process, * link up all the structures and its initial threads etc. * called from: * {arch}/{arch}/machdep.c {arch}_init(), init386() etc. * proc_dtor() (should go away) * proc_init() */ void proc_linkup0(struct proc *p, struct thread *td) { TAILQ_INIT(&p->p_threads); /* all threads in proc */ proc_linkup(p, td); } void proc_linkup(struct proc *p, struct thread *td) { sigqueue_init(&p->p_sigqueue, p); p->p_ksi = ksiginfo_alloc(1); if (p->p_ksi != NULL) { /* XXX p_ksi may be null if ksiginfo zone is not ready */ p->p_ksi->ksi_flags = KSI_EXT | KSI_INS; } LIST_INIT(&p->p_mqnotifier); p->p_numthreads = 0; thread_link(td, p); } /* * Initialize global thread allocation resources. */ void threadinit(void) { mtx_init(&tid_lock, "TID lock", NULL, MTX_DEF); /* * pid_max cannot be greater than PID_MAX. * leave one number for thread0. */ tid_unrhdr = new_unrhdr(PID_MAX + 2, INT_MAX, &tid_lock); thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(), thread_ctor, thread_dtor, thread_init, thread_fini, 16 - 1, UMA_ZONE_NOFREE); tidhashtbl = hashinit(maxproc / 2, M_TIDHASH, &tidhash); rw_init(&tidhash_lock, "tidhash"); } /* * Place an unused thread on the zombie list. * Use the slpq as that must be unused by now. */ void thread_zombie(struct thread *td) { mtx_lock_spin(&zombie_lock); TAILQ_INSERT_HEAD(&zombie_threads, td, td_slpq); mtx_unlock_spin(&zombie_lock); } /* * Release a thread that has exited after cpu_throw(). */ void thread_stash(struct thread *td) { atomic_subtract_rel_int(&td->td_proc->p_exitthreads, 1); thread_zombie(td); } /* * Reap zombie resources. */ void thread_reap(void) { struct thread *td_first, *td_next; /* * Don't even bother to lock if none at this instant, * we really don't care about the next instant.. */ if (!TAILQ_EMPTY(&zombie_threads)) { mtx_lock_spin(&zombie_lock); td_first = TAILQ_FIRST(&zombie_threads); if (td_first) TAILQ_INIT(&zombie_threads); mtx_unlock_spin(&zombie_lock); while (td_first) { td_next = TAILQ_NEXT(td_first, td_slpq); thread_cow_free(td_first); thread_free(td_first); td_first = td_next; } } } /* * Allocate a thread. */ struct thread * thread_alloc(int pages) { struct thread *td; thread_reap(); /* check if any zombies to get */ td = (struct thread *)uma_zalloc(thread_zone, M_WAITOK); KASSERT(td->td_kstack == 0, ("thread_alloc got thread with kstack")); if (!vm_thread_new(td, pages)) { uma_zfree(thread_zone, td); return (NULL); } cpu_thread_alloc(td); vm_domain_policy_init(&td->td_vm_dom_policy); return (td); } int thread_alloc_stack(struct thread *td, int pages) { KASSERT(td->td_kstack == 0, ("thread_alloc_stack called on a thread with kstack")); if (!vm_thread_new(td, pages)) return (0); cpu_thread_alloc(td); return (1); } /* * Deallocate a thread. */ void thread_free(struct thread *td) { lock_profile_thread_exit(td); if (td->td_cpuset) cpuset_rel(td->td_cpuset); td->td_cpuset = NULL; cpu_thread_free(td); if (td->td_kstack != 0) vm_thread_dispose(td); vm_domain_policy_cleanup(&td->td_vm_dom_policy); uma_zfree(thread_zone, td); } void thread_cow_get_proc(struct thread *newtd, struct proc *p) { PROC_LOCK_ASSERT(p, MA_OWNED); newtd->td_ucred = crhold(p->p_ucred); newtd->td_limit = lim_hold(p->p_limit); newtd->td_cowgen = p->p_cowgen; } void thread_cow_get(struct thread *newtd, struct thread *td) { newtd->td_ucred = crhold(td->td_ucred); newtd->td_limit = lim_hold(td->td_limit); newtd->td_cowgen = td->td_cowgen; } void thread_cow_free(struct thread *td) { if (td->td_ucred != NULL) crfree(td->td_ucred); if (td->td_limit != NULL) lim_free(td->td_limit); } void thread_cow_update(struct thread *td) { struct proc *p; struct ucred *oldcred; struct plimit *oldlimit; p = td->td_proc; oldcred = NULL; oldlimit = NULL; PROC_LOCK(p); if (td->td_ucred != p->p_ucred) { oldcred = td->td_ucred; td->td_ucred = crhold(p->p_ucred); } if (td->td_limit != p->p_limit) { oldlimit = td->td_limit; td->td_limit = lim_hold(p->p_limit); } td->td_cowgen = p->p_cowgen; PROC_UNLOCK(p); if (oldcred != NULL) crfree(oldcred); if (oldlimit != NULL) lim_free(oldlimit); } /* * Discard the current thread and exit from its context. * Always called with scheduler locked. * * Because we can't free a thread while we're operating under its context, * push the current thread into our CPU's deadthread holder. This means * we needn't worry about someone else grabbing our context before we * do a cpu_throw(). */ void thread_exit(void) { uint64_t runtime, new_switchtime; struct thread *td; struct thread *td2; struct proc *p; int wakeup_swapper; td = curthread; p = td->td_proc; PROC_SLOCK_ASSERT(p, MA_OWNED); mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(p != NULL, ("thread exiting without a process")); CTR3(KTR_PROC, "thread_exit: thread %p (pid %ld, %s)", td, (long)p->p_pid, td->td_name); KASSERT(TAILQ_EMPTY(&td->td_sigqueue.sq_list), ("signal pending")); #ifdef AUDIT AUDIT_SYSCALL_EXIT(0, td); #endif /* * drop FPU & debug register state storage, or any other * architecture specific resources that * would not be on a new untouched process. */ cpu_thread_exit(td); /* XXXSMP */ /* * The last thread is left attached to the process * So that the whole bundle gets recycled. Skip * all this stuff if we never had threads. * EXIT clears all sign of other threads when * it goes to single threading, so the last thread always * takes the short path. */ if (p->p_flag & P_HADTHREADS) { if (p->p_numthreads > 1) { atomic_add_int(&td->td_proc->p_exitthreads, 1); thread_unlink(td); td2 = FIRST_THREAD_IN_PROC(p); sched_exit_thread(td2, td); /* * The test below is NOT true if we are the * sole exiting thread. P_STOPPED_SINGLE is unset * in exit1() after it is the only survivor. */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); thread_unlock(p->p_singlethread); if (wakeup_swapper) kick_proc0(); } } PCPU_SET(deadthread, td); } else { /* * The last thread is exiting.. but not through exit() */ panic ("thread_exit: Last thread exiting on its own"); } } #ifdef HWPMC_HOOKS /* * If this thread is part of a process that is being tracked by hwpmc(4), * inform the module of the thread's impending exit. */ if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); #endif PROC_UNLOCK(p); PROC_STATLOCK(p); thread_lock(td); PROC_SUNLOCK(p); /* Do the same timestamp bookkeeping that mi_switch() would do. */ new_switchtime = cpu_ticks(); runtime = new_switchtime - PCPU_GET(switchtime); td->td_runtime += runtime; td->td_incruntime += runtime; PCPU_SET(switchtime, new_switchtime); PCPU_SET(switchticks, ticks); PCPU_INC(cnt.v_swtch); /* Save our resource usage in our process. */ td->td_ru.ru_nvcsw++; ruxagg(p, td); rucollect(&p->p_ru, &td->td_ru); PROC_STATUNLOCK(p); td->td_state = TDS_INACTIVE; #ifdef WITNESS witness_thread_exit(td); #endif CTR1(KTR_PROC, "thread_exit: cpu_throw() thread %p", td); sched_throw(td); panic("I'm a teapot!"); /* NOTREACHED */ } /* * Do any thread specific cleanups that may be needed in wait() * called with Giant, proc and schedlock not held. */ void thread_wait(struct proc *p) { struct thread *td; mtx_assert(&Giant, MA_NOTOWNED); KASSERT(p->p_numthreads == 1, ("multiple threads in thread_wait()")); KASSERT(p->p_exitthreads == 0, ("p_exitthreads leaking")); td = FIRST_THREAD_IN_PROC(p); /* Lock the last thread so we spin until it exits cpu_throw(). */ thread_lock(td); thread_unlock(td); lock_profile_thread_exit(td); cpuset_rel(td->td_cpuset); td->td_cpuset = NULL; cpu_thread_clean(td); thread_cow_free(td); thread_reap(); /* check for zombie threads etc. */ } /* * Link a thread to a process. * set up anything that needs to be initialized for it to * be used by the process. */ void thread_link(struct thread *td, struct proc *p) { /* * XXX This can't be enabled because it's called for proc0 before * its lock has been created. * PROC_LOCK_ASSERT(p, MA_OWNED); */ td->td_state = TDS_INACTIVE; td->td_proc = p; td->td_flags = TDF_INMEM; LIST_INIT(&td->td_contested); LIST_INIT(&td->td_lprof[0]); LIST_INIT(&td->td_lprof[1]); sigqueue_init(&td->td_sigqueue, p); callout_init(&td->td_slpcallout, 1); TAILQ_INSERT_TAIL(&p->p_threads, td, td_plist); p->p_numthreads++; } /* * Called from: * thread_exit() */ void thread_unlink(struct thread *td) { struct proc *p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); TAILQ_REMOVE(&p->p_threads, td, td_plist); p->p_numthreads--; /* could clear a few other things here */ /* Must NOT clear links to proc! */ } static int calc_remaining(struct proc *p, int mode) { int remaining; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); if (mode == SINGLE_EXIT) remaining = p->p_numthreads; else if (mode == SINGLE_BOUNDARY) remaining = p->p_numthreads - p->p_boundary_count; else if (mode == SINGLE_NO_EXIT || mode == SINGLE_ALLPROC) remaining = p->p_numthreads - p->p_suspcount; else panic("calc_remaining: wrong mode %d", mode); return (remaining); } static int remain_for_mode(int mode) { return (mode == SINGLE_ALLPROC ? 0 : 1); } static int weed_inhib(int mode, struct thread *td2, struct proc *p) { int wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td2, MA_OWNED); wakeup_swapper = 0; switch (mode) { case SINGLE_EXIT: if (TD_IS_SUSPENDED(td2)) wakeup_swapper |= thread_unsuspend_one(td2, p, true); if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0) wakeup_swapper |= sleepq_abort(td2, EINTR); break; case SINGLE_BOUNDARY: if (TD_IS_SUSPENDED(td2) && (td2->td_flags & TDF_BOUNDARY) == 0) wakeup_swapper |= thread_unsuspend_one(td2, p, false); if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0) wakeup_swapper |= sleepq_abort(td2, ERESTART); break; case SINGLE_NO_EXIT: if (TD_IS_SUSPENDED(td2) && (td2->td_flags & TDF_BOUNDARY) == 0) wakeup_swapper |= thread_unsuspend_one(td2, p, false); if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0) wakeup_swapper |= sleepq_abort(td2, ERESTART); break; case SINGLE_ALLPROC: /* * ALLPROC suspend tries to avoid spurious EINTR for * threads sleeping interruptable, by suspending the * thread directly, similarly to sig_suspend_threads(). * Since such sleep is not performed at the user * boundary, TDF_BOUNDARY flag is not set, and TDF_ALLPROCSUSP * is used to avoid immediate un-suspend. */ if (TD_IS_SUSPENDED(td2) && (td2->td_flags & (TDF_BOUNDARY | TDF_ALLPROCSUSP)) == 0) wakeup_swapper |= thread_unsuspend_one(td2, p, false); if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0) { if ((td2->td_flags & TDF_SBDRY) == 0) { thread_suspend_one(td2); td2->td_flags |= TDF_ALLPROCSUSP; } else { wakeup_swapper |= sleepq_abort(td2, ERESTART); } } break; } return (wakeup_swapper); } /* * Enforce single-threading. * * Returns 1 if the caller must abort (another thread is waiting to * exit the process or similar). Process is locked! * Returns 0 when you are successfully the only thread running. * A process has successfully single threaded in the suspend mode when * There are no threads in user mode. Threads in the kernel must be * allowed to continue until they get to the user boundary. They may even * copy out their return values and data before suspending. They may however be * accelerated in reaching the user boundary as we will wake up * any sleeping threads that are interruptable. (PCATCH). */ int thread_single(struct proc *p, int mode) { struct thread *td; struct thread *td2; int remaining, wakeup_swapper; td = curthread; KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY || mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT, ("invalid mode %d", mode)); /* * If allowing non-ALLPROC singlethreading for non-curproc * callers, calc_remaining() and remain_for_mode() should be * adjusted to also account for td->td_proc != p. For now * this is not implemented because it is not used. */ KASSERT((mode == SINGLE_ALLPROC && td->td_proc != p) || (mode != SINGLE_ALLPROC && td->td_proc == p), ("mode %d proc %p curproc %p", mode, p, td->td_proc)); mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); if ((p->p_flag & P_HADTHREADS) == 0 && mode != SINGLE_ALLPROC) return (0); /* Is someone already single threading? */ if (p->p_singlethread != NULL && p->p_singlethread != td) return (1); if (mode == SINGLE_EXIT) { p->p_flag |= P_SINGLE_EXIT; p->p_flag &= ~P_SINGLE_BOUNDARY; } else { p->p_flag &= ~P_SINGLE_EXIT; if (mode == SINGLE_BOUNDARY) p->p_flag |= P_SINGLE_BOUNDARY; else p->p_flag &= ~P_SINGLE_BOUNDARY; } if (mode == SINGLE_ALLPROC) p->p_flag |= P_TOTAL_STOP; p->p_flag |= P_STOPPED_SINGLE; PROC_SLOCK(p); p->p_singlethread = td; remaining = calc_remaining(p, mode); while (remaining != remain_for_mode(mode)) { if (P_SHOULDSTOP(p) != P_STOPPED_SINGLE) goto stopme; wakeup_swapper = 0; FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) continue; thread_lock(td2); td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK; if (TD_IS_INHIBITED(td2)) { wakeup_swapper |= weed_inhib(mode, td2, p); #ifdef SMP } else if (TD_IS_RUNNING(td2) && td != td2) { forward_signal(td2); #endif } thread_unlock(td2); } if (wakeup_swapper) kick_proc0(); remaining = calc_remaining(p, mode); /* * Maybe we suspended some threads.. was it enough? */ if (remaining == remain_for_mode(mode)) break; stopme: /* * Wake us up when everyone else has suspended. * In the mean time we suspend as well. */ thread_suspend_switch(td, p); remaining = calc_remaining(p, mode); } if (mode == SINGLE_EXIT) { /* * Convert the process to an unthreaded process. The * SINGLE_EXIT is called by exit1() or execve(), in * both cases other threads must be retired. */ KASSERT(p->p_numthreads == 1, ("Unthreading with >1 threads")); p->p_singlethread = NULL; p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_HADTHREADS); /* * Wait for any remaining threads to exit cpu_throw(). */ while (p->p_exitthreads != 0) { PROC_SUNLOCK(p); PROC_UNLOCK(p); sched_relinquish(td); PROC_LOCK(p); PROC_SLOCK(p); } } else if (mode == SINGLE_BOUNDARY) { /* * Wait until all suspended threads are removed from * the processors. The thread_suspend_check() * increments p_boundary_count while it is still * running, which makes it possible for the execve() * to destroy vmspace while our other threads are * still using the address space. * * We lock the thread, which is only allowed to * succeed after context switch code finished using * the address space. */ FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) continue; thread_lock(td2); KASSERT((td2->td_flags & TDF_BOUNDARY) != 0, ("td %p not on boundary", td2)); KASSERT(TD_IS_SUSPENDED(td2), ("td %p is not suspended", td2)); thread_unlock(td2); } } PROC_SUNLOCK(p); return (0); } bool thread_suspend_check_needed(void) { struct proc *p; struct thread *td; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); return (P_SHOULDSTOP(p) || ((p->p_flag & P_TRACED) != 0 && (td->td_dbgflags & TDB_SUSPEND) != 0)); } /* * Called in from locations that can safely check to see * whether we have to suspend or at least throttle for a * single-thread event (e.g. fork). * * Such locations include userret(). * If the "return_instead" argument is non zero, the thread must be able to * accept 0 (caller may continue), or 1 (caller must abort) as a result. * * The 'return_instead' argument tells the function if it may do a * thread_exit() or suspend, or whether the caller must abort and back * out instead. * * If the thread that set the single_threading request has set the * P_SINGLE_EXIT bit in the process flags then this call will never return * if 'return_instead' is false, but will exit. * * P_SINGLE_EXIT | return_instead == 0| return_instead != 0 *---------------+--------------------+--------------------- * 0 | returns 0 | returns 0 or 1 * | when ST ends | immediately *---------------+--------------------+--------------------- * 1 | thread exits | returns 1 * | | immediately * 0 = thread_exit() or suspension ok, * other = return error instead of stopping the thread. * * While a full suspension is under effect, even a single threading * thread would be suspended if it made this call (but it shouldn't). * This call should only be made from places where * thread_exit() would be safe as that may be the outcome unless * return_instead is set. */ int thread_suspend_check(int return_instead) { struct thread *td; struct proc *p; int wakeup_swapper; td = curthread; p = td->td_proc; mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); while (thread_suspend_check_needed()) { if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { KASSERT(p->p_singlethread != NULL, ("singlethread not set")); /* * The only suspension in action is a * single-threading. Single threader need not stop. * XXX Should be safe to access unlocked * as it can only be set to be true by us. */ if (p->p_singlethread == td) return (0); /* Exempt from stopping. */ } if ((p->p_flag & P_SINGLE_EXIT) && return_instead) return (EINTR); /* Should we goto user boundary if we didn't come from there? */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE && (p->p_flag & P_SINGLE_BOUNDARY) && return_instead) return (ERESTART); /* * Ignore suspend requests if they are deferred. */ if ((td->td_flags & TDF_SBDRY) != 0) { KASSERT(return_instead, ("TDF_SBDRY set for unsafe thread_suspend_check")); return (0); } /* * If the process is waiting for us to exit, * this thread should just suicide. * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE. */ if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) { PROC_UNLOCK(p); /* * Allow Linux emulation layer to do some work * before thread suicide. */ if (__predict_false(p->p_sysent->sv_thread_detach != NULL)) (p->p_sysent->sv_thread_detach)(td); + umtx_thread_exit(td); kern_thr_exit(td); panic("stopped thread did not exit"); } PROC_SLOCK(p); thread_stopped(p); if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount + 1) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); thread_unlock(p->p_singlethread); if (wakeup_swapper) kick_proc0(); } } PROC_UNLOCK(p); thread_lock(td); /* * When a thread suspends, it just * gets taken off all queues. */ thread_suspend_one(td); if (return_instead == 0) { p->p_boundary_count++; td->td_flags |= TDF_BOUNDARY; } PROC_SUNLOCK(p); mi_switch(SW_INVOL | SWT_SUSPEND, NULL); thread_unlock(td); PROC_LOCK(p); } return (0); } void thread_suspend_switch(struct thread *td, struct proc *p) { KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); /* * We implement thread_suspend_one in stages here to avoid * dropping the proc lock while the thread lock is owned. */ if (p == td->td_proc) { thread_stopped(p); p->p_suspcount++; } PROC_UNLOCK(p); thread_lock(td); td->td_flags &= ~TDF_NEEDSUSPCHK; TD_SET_SUSPENDED(td); sched_sleep(td, 0); PROC_SUNLOCK(p); DROP_GIANT(); mi_switch(SW_VOL | SWT_SUSPEND, NULL); thread_unlock(td); PICKUP_GIANT(); PROC_LOCK(p); PROC_SLOCK(p); } void thread_suspend_one(struct thread *td) { struct proc *p; p = td->td_proc; PROC_SLOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); p->p_suspcount++; td->td_flags &= ~TDF_NEEDSUSPCHK; TD_SET_SUSPENDED(td); sched_sleep(td, 0); } static int thread_unsuspend_one(struct thread *td, struct proc *p, bool boundary) { THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(TD_IS_SUSPENDED(td), ("Thread not suspended")); TD_CLR_SUSPENDED(td); td->td_flags &= ~TDF_ALLPROCSUSP; if (td->td_proc == p) { PROC_SLOCK_ASSERT(p, MA_OWNED); p->p_suspcount--; if (boundary && (td->td_flags & TDF_BOUNDARY) != 0) { td->td_flags &= ~TDF_BOUNDARY; p->p_boundary_count--; } } return (setrunnable(td)); } /* * Allow all threads blocked by single threading to continue running. */ void thread_unsuspend(struct proc *p) { struct thread *td; int wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); wakeup_swapper = 0; if (!P_SHOULDSTOP(p)) { FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (TD_IS_SUSPENDED(td)) { wakeup_swapper |= thread_unsuspend_one(td, p, true); } thread_unlock(td); } } else if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE && p->p_numthreads == p->p_suspcount) { /* * Stopping everything also did the job for the single * threading request. Now we've downgraded to single-threaded, * let it continue. */ if (p->p_singlethread->td_proc == p) { thread_lock(p->p_singlethread); wakeup_swapper = thread_unsuspend_one( p->p_singlethread, p, false); thread_unlock(p->p_singlethread); } } if (wakeup_swapper) kick_proc0(); } /* * End the single threading mode.. */ void thread_single_end(struct proc *p, int mode) { struct thread *td; int wakeup_swapper; KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY || mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT, ("invalid mode %d", mode)); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT((mode == SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) != 0) || (mode != SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) == 0), ("mode %d does not match P_TOTAL_STOP", mode)); KASSERT(mode == SINGLE_ALLPROC || p->p_singlethread == curthread, ("thread_single_end from other thread %p %p", curthread, p->p_singlethread)); KASSERT(mode != SINGLE_BOUNDARY || (p->p_flag & P_SINGLE_BOUNDARY) != 0, ("mis-matched SINGLE_BOUNDARY flags %x", p->p_flag)); p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_SINGLE_BOUNDARY | P_TOTAL_STOP); PROC_SLOCK(p); p->p_singlethread = NULL; wakeup_swapper = 0; /* * If there are other threads they may now run, * unless of course there is a blanket 'stop order' * on the process. The single threader must be allowed * to continue however as this is a bad place to stop. */ if (p->p_numthreads != remain_for_mode(mode) && !P_SHOULDSTOP(p)) { FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (TD_IS_SUSPENDED(td)) { wakeup_swapper |= thread_unsuspend_one(td, p, mode == SINGLE_BOUNDARY); } thread_unlock(td); } } KASSERT(mode != SINGLE_BOUNDARY || p->p_boundary_count == 0, ("inconsistent boundary count %d", p->p_boundary_count)); PROC_SUNLOCK(p); if (wakeup_swapper) kick_proc0(); } struct thread * thread_find(struct proc *p, lwpid_t tid) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); FOREACH_THREAD_IN_PROC(p, td) { if (td->td_tid == tid) break; } return (td); } /* Locate a thread by number; return with proc lock held. */ struct thread * tdfind(lwpid_t tid, pid_t pid) { #define RUN_THRESH 16 struct thread *td; int run = 0; rw_rlock(&tidhash_lock); LIST_FOREACH(td, TIDHASH(tid), td_hash) { if (td->td_tid == tid) { if (pid != -1 && td->td_proc->p_pid != pid) { td = NULL; break; } PROC_LOCK(td->td_proc); if (td->td_proc->p_state == PRS_NEW) { PROC_UNLOCK(td->td_proc); td = NULL; break; } if (run > RUN_THRESH) { if (rw_try_upgrade(&tidhash_lock)) { LIST_REMOVE(td, td_hash); LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash); rw_wunlock(&tidhash_lock); return (td); } } break; } run++; } rw_runlock(&tidhash_lock); return (td); } void tidhash_add(struct thread *td) { rw_wlock(&tidhash_lock); LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash); rw_wunlock(&tidhash_lock); } void tidhash_remove(struct thread *td) { rw_wlock(&tidhash_lock); LIST_REMOVE(td, td_hash); rw_wunlock(&tidhash_lock); } Index: head/sys/kern/kern_umtx.c =================================================================== --- head/sys/kern/kern_umtx.c (revision 300042) +++ head/sys/kern/kern_umtx.c (revision 300043) @@ -1,4178 +1,4494 @@ /*- - * Copyright (c) 2015 The FreeBSD Foundation + * Copyright (c) 2015, 2016 The FreeBSD Foundation * Copyright (c) 2004, David Xu * Copyright (c) 2002, Jeffrey Roberson * All rights reserved. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_umtx_profiling.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_FREEBSD32 #include #endif #define _UMUTEX_TRY 1 #define _UMUTEX_WAIT 2 #ifdef UMTX_PROFILING #define UPROF_PERC_BIGGER(w, f, sw, sf) \ (((w) > (sw)) || ((w) == (sw) && (f) > (sf))) #endif /* Priority inheritance mutex info. */ struct umtx_pi { /* Owner thread */ struct thread *pi_owner; /* Reference count */ int pi_refcount; /* List entry to link umtx holding by thread */ TAILQ_ENTRY(umtx_pi) pi_link; /* List entry in hash */ TAILQ_ENTRY(umtx_pi) pi_hashlink; /* List for waiters */ TAILQ_HEAD(,umtx_q) pi_blocked; /* Identify a userland lock object */ struct umtx_key pi_key; }; /* A userland synchronous object user. */ struct umtx_q { /* Linked list for the hash. */ TAILQ_ENTRY(umtx_q) uq_link; /* Umtx key. */ struct umtx_key uq_key; /* Umtx flags. */ int uq_flags; #define UQF_UMTXQ 0x0001 /* The thread waits on. */ struct thread *uq_thread; /* * Blocked on PI mutex. read can use chain lock * or umtx_lock, write must have both chain lock and * umtx_lock being hold. */ struct umtx_pi *uq_pi_blocked; /* On blocked list */ TAILQ_ENTRY(umtx_q) uq_lockq; /* Thread contending with us */ TAILQ_HEAD(,umtx_pi) uq_pi_contested; /* Inherited priority from PP mutex */ u_char uq_inherited_pri; /* Spare queue ready to be reused */ struct umtxq_queue *uq_spare_queue; /* The queue we on */ struct umtxq_queue *uq_cur_queue; }; TAILQ_HEAD(umtxq_head, umtx_q); /* Per-key wait-queue */ struct umtxq_queue { struct umtxq_head head; struct umtx_key key; LIST_ENTRY(umtxq_queue) link; int length; }; LIST_HEAD(umtxq_list, umtxq_queue); /* Userland lock object's wait-queue chain */ struct umtxq_chain { /* Lock for this chain. */ struct mtx uc_lock; /* List of sleep queues. */ struct umtxq_list uc_queue[2]; #define UMTX_SHARED_QUEUE 0 #define UMTX_EXCLUSIVE_QUEUE 1 LIST_HEAD(, umtxq_queue) uc_spare_queue; /* Busy flag */ char uc_busy; /* Chain lock waiters */ int uc_waiters; /* All PI in the list */ TAILQ_HEAD(,umtx_pi) uc_pi_list; #ifdef UMTX_PROFILING u_int length; u_int max_length; #endif }; #define UMTXQ_LOCKED_ASSERT(uc) mtx_assert(&(uc)->uc_lock, MA_OWNED) /* * Don't propagate time-sharing priority, there is a security reason, * a user can simply introduce PI-mutex, let thread A lock the mutex, * and let another thread B block on the mutex, because B is * sleeping, its priority will be boosted, this causes A's priority to * be boosted via priority propagating too and will never be lowered even * if it is using 100%CPU, this is unfair to other processes. */ #define UPRI(td) (((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\ (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\ PRI_MAX_TIMESHARE : (td)->td_user_pri) #define GOLDEN_RATIO_PRIME 2654404609U #define UMTX_CHAINS 512 #define UMTX_SHIFTS (__WORD_BIT - 9) #define GET_SHARE(flags) \ (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE) #define BUSY_SPINS 200 struct abs_timeout { int clockid; struct timespec cur; struct timespec end; }; +#ifdef COMPAT_FREEBSD32 +struct umutex32 { + volatile __lwpid_t m_owner; /* Owner of the mutex */ + __uint32_t m_flags; /* Flags of the mutex */ + __uint32_t m_ceilings[2]; /* Priority protect ceiling */ + __uint32_t m_rb_lnk; /* Robust linkage */ + __uint32_t m_pad; + __uint32_t m_spare[2]; +}; + +_Static_assert(sizeof(struct umutex) == sizeof(struct umutex32), "umutex32"); +_Static_assert(__offsetof(struct umutex, m_spare[0]) == + __offsetof(struct umutex32, m_spare[0]), "m_spare32"); +#endif + +int umtx_shm_vnobj_persistent = 0; +SYSCTL_INT(_kern_ipc, OID_AUTO, umtx_vnode_persistent, CTLFLAG_RWTUN, + &umtx_shm_vnobj_persistent, 0, + "False forces destruction of umtx attached to file, on last close"); +static int umtx_max_rb = 1000; +SYSCTL_INT(_kern_ipc, OID_AUTO, umtx_max_robust, CTLFLAG_RWTUN, + &umtx_max_rb, 0, + ""); + static uma_zone_t umtx_pi_zone; static struct umtxq_chain umtxq_chains[2][UMTX_CHAINS]; static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory"); static int umtx_pi_allocated; static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug"); SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD, &umtx_pi_allocated, 0, "Allocated umtx_pi"); +static int umtx_verbose_rb = 1; +SYSCTL_INT(_debug_umtx, OID_AUTO, robust_faults_verbose, CTLFLAG_RWTUN, + &umtx_verbose_rb, 0, + ""); #ifdef UMTX_PROFILING static long max_length; SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length"); static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats"); #endif static void umtx_shm_init(void); static void umtxq_sysinit(void *); static void umtxq_hash(struct umtx_key *key); static struct umtxq_chain *umtxq_getchain(struct umtx_key *key); static void umtxq_lock(struct umtx_key *key); static void umtxq_unlock(struct umtx_key *key); static void umtxq_busy(struct umtx_key *key); static void umtxq_unbusy(struct umtx_key *key); static void umtxq_insert_queue(struct umtx_q *uq, int q); static void umtxq_remove_queue(struct umtx_q *uq, int q); static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *); static int umtxq_count(struct umtx_key *key); static struct umtx_pi *umtx_pi_alloc(int); static void umtx_pi_free(struct umtx_pi *pi); -static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags); +static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags, + bool rb); static void umtx_thread_cleanup(struct thread *td); static void umtx_exec_hook(void *arg __unused, struct proc *p __unused, - struct image_params *imgp __unused); + struct image_params *imgp __unused); SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL); #define umtxq_signal(key, nwake) umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE) #define umtxq_insert(uq) umtxq_insert_queue((uq), UMTX_SHARED_QUEUE) #define umtxq_remove(uq) umtxq_remove_queue((uq), UMTX_SHARED_QUEUE) static struct mtx umtx_lock; #ifdef UMTX_PROFILING static void umtx_init_profiling(void) { struct sysctl_oid *chain_oid; char chain_name[10]; int i; for (i = 0; i < UMTX_CHAINS; ++i) { snprintf(chain_name, sizeof(chain_name), "%d", i); chain_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO, chain_name, CTLFLAG_RD, NULL, "umtx hash stats"); SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO, "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL); SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO, "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL); } } static int sysctl_debug_umtx_chains_peaks(SYSCTL_HANDLER_ARGS) { char buf[512]; struct sbuf sb; struct umtxq_chain *uc; u_int fract, i, j, tot, whole; u_int sf0, sf1, sf2, sf3, sf4; u_int si0, si1, si2, si3, si4; u_int sw0, sw1, sw2, sw3, sw4; sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); for (i = 0; i < 2; i++) { tot = 0; for (j = 0; j < UMTX_CHAINS; ++j) { uc = &umtxq_chains[i][j]; mtx_lock(&uc->uc_lock); tot += uc->max_length; mtx_unlock(&uc->uc_lock); } if (tot == 0) sbuf_printf(&sb, "%u) Empty ", i); else { sf0 = sf1 = sf2 = sf3 = sf4 = 0; si0 = si1 = si2 = si3 = si4 = 0; sw0 = sw1 = sw2 = sw3 = sw4 = 0; for (j = 0; j < UMTX_CHAINS; j++) { uc = &umtxq_chains[i][j]; mtx_lock(&uc->uc_lock); whole = uc->max_length * 100; mtx_unlock(&uc->uc_lock); fract = (whole % tot) * 100; if (UPROF_PERC_BIGGER(whole, fract, sw0, sf0)) { sf0 = fract; si0 = j; sw0 = whole; } else if (UPROF_PERC_BIGGER(whole, fract, sw1, sf1)) { sf1 = fract; si1 = j; sw1 = whole; } else if (UPROF_PERC_BIGGER(whole, fract, sw2, sf2)) { sf2 = fract; si2 = j; sw2 = whole; } else if (UPROF_PERC_BIGGER(whole, fract, sw3, sf3)) { sf3 = fract; si3 = j; sw3 = whole; } else if (UPROF_PERC_BIGGER(whole, fract, sw4, sf4)) { sf4 = fract; si4 = j; sw4 = whole; } } sbuf_printf(&sb, "queue %u:\n", i); sbuf_printf(&sb, "1st: %u.%u%% idx: %u\n", sw0 / tot, sf0 / tot, si0); sbuf_printf(&sb, "2nd: %u.%u%% idx: %u\n", sw1 / tot, sf1 / tot, si1); sbuf_printf(&sb, "3rd: %u.%u%% idx: %u\n", sw2 / tot, sf2 / tot, si2); sbuf_printf(&sb, "4th: %u.%u%% idx: %u\n", sw3 / tot, sf3 / tot, si3); sbuf_printf(&sb, "5th: %u.%u%% idx: %u\n", sw4 / tot, sf4 / tot, si4); } } sbuf_trim(&sb); sbuf_finish(&sb); sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); sbuf_delete(&sb); return (0); } static int sysctl_debug_umtx_chains_clear(SYSCTL_HANDLER_ARGS) { struct umtxq_chain *uc; u_int i, j; int clear, error; clear = 0; error = sysctl_handle_int(oidp, &clear, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (clear != 0) { for (i = 0; i < 2; ++i) { for (j = 0; j < UMTX_CHAINS; ++j) { uc = &umtxq_chains[i][j]; mtx_lock(&uc->uc_lock); uc->length = 0; uc->max_length = 0; mtx_unlock(&uc->uc_lock); } } } return (0); } SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, clear, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_debug_umtx_chains_clear, "I", "Clear umtx chains statistics"); SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, peaks, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_debug_umtx_chains_peaks, "A", "Highest peaks in chains max length"); #endif static void umtxq_sysinit(void *arg __unused) { int i, j; umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); for (i = 0; i < 2; ++i) { for (j = 0; j < UMTX_CHAINS; ++j) { mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL, MTX_DEF | MTX_DUPOK); LIST_INIT(&umtxq_chains[i][j].uc_queue[0]); LIST_INIT(&umtxq_chains[i][j].uc_queue[1]); LIST_INIT(&umtxq_chains[i][j].uc_spare_queue); TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list); umtxq_chains[i][j].uc_busy = 0; umtxq_chains[i][j].uc_waiters = 0; #ifdef UMTX_PROFILING umtxq_chains[i][j].length = 0; umtxq_chains[i][j].max_length = 0; #endif } } #ifdef UMTX_PROFILING umtx_init_profiling(); #endif mtx_init(&umtx_lock, "umtx lock", NULL, MTX_DEF); EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL, EVENTHANDLER_PRI_ANY); umtx_shm_init(); } struct umtx_q * umtxq_alloc(void) { struct umtx_q *uq; uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO); - uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO); + uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, + M_WAITOK | M_ZERO); TAILQ_INIT(&uq->uq_spare_queue->head); TAILQ_INIT(&uq->uq_pi_contested); uq->uq_inherited_pri = PRI_MAX; return (uq); } void umtxq_free(struct umtx_q *uq) { + MPASS(uq->uq_spare_queue != NULL); free(uq->uq_spare_queue, M_UMTX); free(uq, M_UMTX); } static inline void umtxq_hash(struct umtx_key *key) { - unsigned n = (uintptr_t)key->info.both.a + key->info.both.b; + unsigned n; + + n = (uintptr_t)key->info.both.a + key->info.both.b; key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS; } static inline struct umtxq_chain * umtxq_getchain(struct umtx_key *key) { + if (key->type <= TYPE_SEM) return (&umtxq_chains[1][key->hash]); return (&umtxq_chains[0][key->hash]); } /* * Lock a chain. */ static inline void umtxq_lock(struct umtx_key *key) { struct umtxq_chain *uc; uc = umtxq_getchain(key); mtx_lock(&uc->uc_lock); } /* * Unlock a chain. */ static inline void umtxq_unlock(struct umtx_key *key) { struct umtxq_chain *uc; uc = umtxq_getchain(key); mtx_unlock(&uc->uc_lock); } /* * Set chain to busy state when following operation * may be blocked (kernel mutex can not be used). */ static inline void umtxq_busy(struct umtx_key *key) { struct umtxq_chain *uc; uc = umtxq_getchain(key); mtx_assert(&uc->uc_lock, MA_OWNED); if (uc->uc_busy) { #ifdef SMP if (smp_cpus > 1) { int count = BUSY_SPINS; if (count > 0) { umtxq_unlock(key); while (uc->uc_busy && --count > 0) cpu_spinwait(); umtxq_lock(key); } } #endif while (uc->uc_busy) { uc->uc_waiters++; msleep(uc, &uc->uc_lock, 0, "umtxqb", 0); uc->uc_waiters--; } } uc->uc_busy = 1; } /* * Unbusy a chain. */ static inline void umtxq_unbusy(struct umtx_key *key) { struct umtxq_chain *uc; uc = umtxq_getchain(key); mtx_assert(&uc->uc_lock, MA_OWNED); KASSERT(uc->uc_busy != 0, ("not busy")); uc->uc_busy = 0; if (uc->uc_waiters) wakeup_one(uc); } static inline void umtxq_unbusy_unlocked(struct umtx_key *key) { umtxq_lock(key); umtxq_unbusy(key); umtxq_unlock(key); } static struct umtxq_queue * umtxq_queue_lookup(struct umtx_key *key, int q) { struct umtxq_queue *uh; struct umtxq_chain *uc; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); LIST_FOREACH(uh, &uc->uc_queue[q], link) { if (umtx_key_match(&uh->key, key)) return (uh); } return (NULL); } static inline void umtxq_insert_queue(struct umtx_q *uq, int q) { struct umtxq_queue *uh; struct umtxq_chain *uc; uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue")); uh = umtxq_queue_lookup(&uq->uq_key, q); if (uh != NULL) { LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link); } else { uh = uq->uq_spare_queue; uh->key = uq->uq_key; LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link); #ifdef UMTX_PROFILING uc->length++; if (uc->length > uc->max_length) { uc->max_length = uc->length; if (uc->max_length > max_length) max_length = uc->max_length; } #endif } uq->uq_spare_queue = NULL; TAILQ_INSERT_TAIL(&uh->head, uq, uq_link); uh->length++; uq->uq_flags |= UQF_UMTXQ; uq->uq_cur_queue = uh; return; } static inline void umtxq_remove_queue(struct umtx_q *uq, int q) { struct umtxq_chain *uc; struct umtxq_queue *uh; uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); if (uq->uq_flags & UQF_UMTXQ) { uh = uq->uq_cur_queue; TAILQ_REMOVE(&uh->head, uq, uq_link); uh->length--; uq->uq_flags &= ~UQF_UMTXQ; if (TAILQ_EMPTY(&uh->head)) { KASSERT(uh->length == 0, ("inconsistent umtxq_queue length")); #ifdef UMTX_PROFILING uc->length--; #endif LIST_REMOVE(uh, link); } else { uh = LIST_FIRST(&uc->uc_spare_queue); KASSERT(uh != NULL, ("uc_spare_queue is empty")); LIST_REMOVE(uh, link); } uq->uq_spare_queue = uh; uq->uq_cur_queue = NULL; } } /* * Check if there are multiple waiters */ static int umtxq_count(struct umtx_key *key) { struct umtxq_chain *uc; struct umtxq_queue *uh; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE); if (uh != NULL) return (uh->length); return (0); } /* * Check if there are multiple PI waiters and returns first * waiter. */ static int umtxq_count_pi(struct umtx_key *key, struct umtx_q **first) { struct umtxq_chain *uc; struct umtxq_queue *uh; *first = NULL; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE); if (uh != NULL) { *first = TAILQ_FIRST(&uh->head); return (uh->length); } return (0); } static int umtxq_check_susp(struct thread *td) { struct proc *p; int error; /* * The check for TDF_NEEDSUSPCHK is racy, but it is enough to * eventually break the lockstep loop. */ if ((td->td_flags & TDF_NEEDSUSPCHK) == 0) return (0); error = 0; p = td->td_proc; PROC_LOCK(p); if (P_SHOULDSTOP(p) || ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND))) { if (p->p_flag & P_SINGLE_EXIT) error = EINTR; else error = ERESTART; } PROC_UNLOCK(p); return (error); } /* * Wake up threads waiting on an userland object. */ static int umtxq_signal_queue(struct umtx_key *key, int n_wake, int q) { struct umtxq_chain *uc; struct umtxq_queue *uh; struct umtx_q *uq; int ret; ret = 0; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); uh = umtxq_queue_lookup(key, q); if (uh != NULL) { while ((uq = TAILQ_FIRST(&uh->head)) != NULL) { umtxq_remove_queue(uq, q); wakeup(uq); if (++ret >= n_wake) return (ret); } } return (ret); } /* * Wake up specified thread. */ static inline void umtxq_signal_thread(struct umtx_q *uq) { struct umtxq_chain *uc; uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); umtxq_remove(uq); wakeup(uq); } static inline int tstohz(const struct timespec *tsp) { struct timeval tv; TIMESPEC_TO_TIMEVAL(&tv, tsp); return tvtohz(&tv); } static void abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute, const struct timespec *timeout) { timo->clockid = clockid; if (!absolute) { kern_clock_gettime(curthread, clockid, &timo->end); timo->cur = timo->end; timespecadd(&timo->end, timeout); } else { timo->end = *timeout; kern_clock_gettime(curthread, clockid, &timo->cur); } } static void abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime) { abs_timeout_init(timo, umtxtime->_clockid, - (umtxtime->_flags & UMTX_ABSTIME) != 0, - &umtxtime->_timeout); + (umtxtime->_flags & UMTX_ABSTIME) != 0, &umtxtime->_timeout); } static inline void abs_timeout_update(struct abs_timeout *timo) { + kern_clock_gettime(curthread, timo->clockid, &timo->cur); } static int abs_timeout_gethz(struct abs_timeout *timo) { struct timespec tts; if (timespeccmp(&timo->end, &timo->cur, <=)) return (-1); tts = timo->end; timespecsub(&tts, &timo->cur); return (tstohz(&tts)); } +static uint32_t +umtx_unlock_val(uint32_t flags, bool rb) +{ + + if (rb) + return (UMUTEX_RB_OWNERDEAD); + else if ((flags & UMUTEX_NONCONSISTENT) != 0) + return (UMUTEX_RB_NOTRECOV); + else + return (UMUTEX_UNOWNED); + +} + /* * Put thread into sleep state, before sleeping, check if * thread was removed from umtx queue. */ static inline int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *abstime) { struct umtxq_chain *uc; int error, timo; uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); for (;;) { if (!(uq->uq_flags & UQF_UMTXQ)) return (0); if (abstime != NULL) { timo = abs_timeout_gethz(abstime); if (timo < 0) return (ETIMEDOUT); } else timo = 0; error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo); if (error != EWOULDBLOCK) { umtxq_lock(&uq->uq_key); break; } if (abstime != NULL) abs_timeout_update(abstime); umtxq_lock(&uq->uq_key); } return (error); } /* * Convert userspace address into unique logical address. */ int umtx_key_get(const void *addr, int type, int share, struct umtx_key *key) { struct thread *td = curthread; vm_map_t map; vm_map_entry_t entry; vm_pindex_t pindex; vm_prot_t prot; boolean_t wired; key->type = type; if (share == THREAD_SHARE) { key->shared = 0; key->info.private.vs = td->td_proc->p_vmspace; key->info.private.addr = (uintptr_t)addr; } else { MPASS(share == PROCESS_SHARE || share == AUTO_SHARE); map = &td->td_proc->p_vmspace->vm_map; if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE, &entry, &key->info.shared.object, &pindex, &prot, &wired) != KERN_SUCCESS) { return (EFAULT); } if ((share == PROCESS_SHARE) || (share == AUTO_SHARE && VM_INHERIT_SHARE == entry->inheritance)) { key->shared = 1; key->info.shared.offset = (vm_offset_t)addr - entry->start + entry->offset; vm_object_reference(key->info.shared.object); } else { key->shared = 0; key->info.private.vs = td->td_proc->p_vmspace; key->info.private.addr = (uintptr_t)addr; } vm_map_lookup_done(map, entry); } umtxq_hash(key); return (0); } /* * Release key. */ void umtx_key_release(struct umtx_key *key) { if (key->shared) vm_object_deallocate(key->info.shared.object); } /* * Fetch and compare value, sleep on the address if value is not changed. */ static int do_wait(struct thread *td, void *addr, u_long id, - struct _umtx_time *timeout, int compat32, int is_private) + struct _umtx_time *timeout, int compat32, int is_private) { struct abs_timeout timo; struct umtx_q *uq; u_long tmp; uint32_t tmp32; int error = 0; uq = td->td_umtxq; if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT, is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); umtxq_lock(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); if (compat32 == 0) { error = fueword(addr, &tmp); if (error != 0) error = EFAULT; } else { error = fueword32(addr, &tmp32); if (error == 0) tmp = tmp32; else error = EFAULT; } umtxq_lock(&uq->uq_key); if (error == 0) { if (tmp == id) error = umtxq_sleep(uq, "uwait", timeout == NULL ? NULL : &timo); if ((uq->uq_flags & UQF_UMTXQ) == 0) error = 0; else umtxq_remove(uq); } else if ((uq->uq_flags & UQF_UMTXQ) != 0) { umtxq_remove(uq); } umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); if (error == ERESTART) error = EINTR; return (error); } /* * Wake up threads sleeping on the specified address. */ int kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private) { struct umtx_key key; int ret; if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT, - is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0) + is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0) return (ret); umtxq_lock(&key); umtxq_signal(&key, n_wake); umtxq_unlock(&key); umtx_key_release(&key); return (0); } /* * Lock PTHREAD_PRIO_NONE protocol POSIX mutex. */ static int do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, - struct _umtx_time *timeout, int mode) + struct _umtx_time *timeout, int mode) { struct abs_timeout timo; struct umtx_q *uq; uint32_t owner, old, id; int error, rv; id = td->td_tid; uq = td->td_umtxq; error = 0; if (timeout != NULL) abs_timeout_init2(&timo, timeout); /* * Care must be exercised when dealing with umtx structure. It * can fault on any access. */ for (;;) { rv = fueword32(&m->m_owner, &owner); if (rv == -1) return (EFAULT); if (mode == _UMUTEX_WAIT) { - if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED) + if (owner == UMUTEX_UNOWNED || + owner == UMUTEX_CONTESTED || + owner == UMUTEX_RB_OWNERDEAD || + owner == UMUTEX_RB_NOTRECOV) return (0); } else { /* - * Try the uncontested case. This should be done in userland. + * Robust mutex terminated. Kernel duty is to + * return EOWNERDEAD to the userspace. The + * umutex.m_flags UMUTEX_NONCONSISTENT is set + * by the common userspace code. */ + if (owner == UMUTEX_RB_OWNERDEAD) { + rv = casueword32(&m->m_owner, + UMUTEX_RB_OWNERDEAD, &owner, + id | UMUTEX_CONTESTED); + if (rv == -1) + return (EFAULT); + if (owner == UMUTEX_RB_OWNERDEAD) + return (EOWNERDEAD); /* success */ + rv = umtxq_check_susp(td); + if (rv != 0) + return (rv); + continue; + } + if (owner == UMUTEX_RB_NOTRECOV) + return (ENOTRECOVERABLE); + + + /* + * Try the uncontested case. This should be + * done in userland. + */ rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id); /* The address was invalid. */ if (rv == -1) return (EFAULT); /* The acquire succeeded. */ if (owner == UMUTEX_UNOWNED) return (0); - /* If no one owns it but it is contested try to acquire it. */ + /* + * If no one owns it but it is contested try + * to acquire it. + */ if (owner == UMUTEX_CONTESTED) { rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) return (EFAULT); if (owner == UMUTEX_CONTESTED) return (0); rv = umtxq_check_susp(td); if (rv != 0) return (rv); - /* If this failed the lock has changed, restart. */ + /* + * If this failed the lock has + * changed, restart. + */ continue; } } if (mode == _UMUTEX_TRY) return (EBUSY); /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) return (error); if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); /* * Set the contested bit so that a release in user space * knows to use the system call for unlock. If this fails * either some one else has acquired the lock or it has been * released. */ rv = casueword32(&m->m_owner, owner, &old, owner | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) { umtxq_lock(&uq->uq_key); umtxq_remove(uq); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (EFAULT); } /* * We set the contested bit, sleep. Otherwise the lock changed * and we need to retry or we lost a race to the thread * unlocking the umtx. */ umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); if (old == owner) error = umtxq_sleep(uq, "umtxn", timeout == NULL ? NULL : &timo); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); if (error == 0) error = umtxq_check_susp(td); } return (0); } /* * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex. */ static int -do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags) +do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags, bool rb) { struct umtx_key key; - uint32_t owner, old, id; - int error; - int count; + uint32_t owner, old, id, newlock; + int error, count; id = td->td_tid; /* * Make sure we own this mtx. */ error = fueword32(&m->m_owner, &owner); if (error == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != id) return (EPERM); + newlock = umtx_unlock_val(flags, rb); if ((owner & UMUTEX_CONTESTED) == 0) { - error = casueword32(&m->m_owner, owner, &old, UMUTEX_UNOWNED); + error = casueword32(&m->m_owner, owner, &old, newlock); if (error == -1) return (EFAULT); if (old == owner) return (0); owner = old; } /* We should only ever be in here for contested locks */ if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count(&key); umtxq_unlock(&key); /* * When unlocking the umtx, it must be marked as unowned if * there is zero or one thread only waiting for it. * Otherwise, it must be marked as contested. */ - error = casueword32(&m->m_owner, owner, &old, - count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED); + if (count > 1) + newlock |= UMUTEX_CONTESTED; + error = casueword32(&m->m_owner, owner, &old, newlock); umtxq_lock(&key); - umtxq_signal(&key,1); + umtxq_signal(&key, 1); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); if (error == -1) return (EFAULT); if (old != owner) return (EINVAL); return (0); } /* * Check if the mutex is available and wake up a waiter, * only for simple mutex. */ static int do_wake_umutex(struct thread *td, struct umutex *m) { struct umtx_key key; uint32_t owner; uint32_t flags; int error; int count; error = fueword32(&m->m_owner, &owner); if (error == -1) return (EFAULT); - if ((owner & ~UMUTEX_CONTESTED) != 0) + if ((owner & ~UMUTEX_CONTESTED) != 0 && owner != UMUTEX_RB_OWNERDEAD && + owner != UMUTEX_RB_NOTRECOV) return (0); error = fueword32(&m->m_flags, &flags); if (error == -1) return (EFAULT); /* We should only ever be in here for contested locks */ if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count(&key); umtxq_unlock(&key); - if (count <= 1) { + if (count <= 1 && owner != UMUTEX_RB_OWNERDEAD && + owner != UMUTEX_RB_NOTRECOV) { error = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner, UMUTEX_UNOWNED); if (error == -1) error = EFAULT; } umtxq_lock(&key); - if (error == 0 && count != 0 && (owner & ~UMUTEX_CONTESTED) == 0) + if (error == 0 && count != 0 && ((owner & ~UMUTEX_CONTESTED) == 0 || + owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV)) umtxq_signal(&key, 1); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } /* * Check if the mutex has waiters and tries to fix contention bit. */ static int do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags) { struct umtx_key key; uint32_t owner, old; int type; int error; int count; - switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) { + switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT | + UMUTEX_ROBUST)) { case 0: + case UMUTEX_ROBUST: type = TYPE_NORMAL_UMUTEX; break; case UMUTEX_PRIO_INHERIT: type = TYPE_PI_UMUTEX; break; + case (UMUTEX_PRIO_INHERIT | UMUTEX_ROBUST): + type = TYPE_PI_ROBUST_UMUTEX; + break; case UMUTEX_PRIO_PROTECT: type = TYPE_PP_UMUTEX; break; + case (UMUTEX_PRIO_PROTECT | UMUTEX_ROBUST): + type = TYPE_PP_ROBUST_UMUTEX; + break; default: return (EINVAL); } - if ((error = umtx_key_get(m, type, GET_SHARE(flags), - &key)) != 0) + if ((error = umtx_key_get(m, type, GET_SHARE(flags), &key)) != 0) return (error); owner = 0; umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count(&key); umtxq_unlock(&key); /* * Only repair contention bit if there is a waiter, this means the mutex * is still being referenced by userland code, otherwise don't update * any memory. */ if (count > 1) { error = fueword32(&m->m_owner, &owner); if (error == -1) error = EFAULT; while (error == 0 && (owner & UMUTEX_CONTESTED) == 0) { error = casueword32(&m->m_owner, owner, &old, owner | UMUTEX_CONTESTED); if (error == -1) { error = EFAULT; break; } if (old == owner) break; owner = old; error = umtxq_check_susp(td); if (error != 0) break; } } else if (count == 1) { error = fueword32(&m->m_owner, &owner); if (error == -1) error = EFAULT; while (error == 0 && (owner & ~UMUTEX_CONTESTED) != 0 && - (owner & UMUTEX_CONTESTED) == 0) { + (owner & UMUTEX_CONTESTED) == 0) { error = casueword32(&m->m_owner, owner, &old, owner | UMUTEX_CONTESTED); if (error == -1) { error = EFAULT; break; } if (old == owner) break; owner = old; error = umtxq_check_susp(td); if (error != 0) break; } } umtxq_lock(&key); if (error == EFAULT) { umtxq_signal(&key, INT_MAX); - } else if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0) + } else if (count != 0 && ((owner & ~UMUTEX_CONTESTED) == 0 || + owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV)) umtxq_signal(&key, 1); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } static inline struct umtx_pi * umtx_pi_alloc(int flags) { struct umtx_pi *pi; pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags); TAILQ_INIT(&pi->pi_blocked); atomic_add_int(&umtx_pi_allocated, 1); return (pi); } static inline void umtx_pi_free(struct umtx_pi *pi) { uma_zfree(umtx_pi_zone, pi); atomic_add_int(&umtx_pi_allocated, -1); } /* * Adjust the thread's position on a pi_state after its priority has been * changed. */ static int umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td) { struct umtx_q *uq, *uq1, *uq2; struct thread *td1; mtx_assert(&umtx_lock, MA_OWNED); if (pi == NULL) return (0); uq = td->td_umtxq; /* * Check if the thread needs to be moved on the blocked chain. * It needs to be moved if either its priority is lower than * the previous thread or higher than the next thread. */ uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq); uq2 = TAILQ_NEXT(uq, uq_lockq); if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) || (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) { /* * Remove thread from blocked chain and determine where * it should be moved to. */ TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq); TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) { td1 = uq1->uq_thread; MPASS(td1->td_proc->p_magic == P_MAGIC); if (UPRI(td1) > UPRI(td)) break; } if (uq1 == NULL) TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq); else TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq); } return (1); } static struct umtx_pi * umtx_pi_next(struct umtx_pi *pi) { struct umtx_q *uq_owner; if (pi->pi_owner == NULL) return (NULL); uq_owner = pi->pi_owner->td_umtxq; if (uq_owner == NULL) return (NULL); return (uq_owner->uq_pi_blocked); } /* * Floyd's Cycle-Finding Algorithm. */ static bool umtx_pi_check_loop(struct umtx_pi *pi) { struct umtx_pi *pi1; /* fast iterator */ mtx_assert(&umtx_lock, MA_OWNED); if (pi == NULL) return (false); pi1 = pi; for (;;) { pi = umtx_pi_next(pi); if (pi == NULL) break; pi1 = umtx_pi_next(pi1); if (pi1 == NULL) break; pi1 = umtx_pi_next(pi1); if (pi1 == NULL) break; if (pi == pi1) return (true); } return (false); } /* * Propagate priority when a thread is blocked on POSIX * PI mutex. */ static void umtx_propagate_priority(struct thread *td) { struct umtx_q *uq; struct umtx_pi *pi; int pri; mtx_assert(&umtx_lock, MA_OWNED); pri = UPRI(td); uq = td->td_umtxq; pi = uq->uq_pi_blocked; if (pi == NULL) return; if (umtx_pi_check_loop(pi)) return; for (;;) { td = pi->pi_owner; if (td == NULL || td == curthread) return; MPASS(td->td_proc != NULL); MPASS(td->td_proc->p_magic == P_MAGIC); thread_lock(td); if (td->td_lend_user_pri > pri) sched_lend_user_prio(td, pri); else { thread_unlock(td); break; } thread_unlock(td); /* * Pick up the lock that td is blocked on. */ uq = td->td_umtxq; pi = uq->uq_pi_blocked; if (pi == NULL) break; /* Resort td on the list if needed. */ umtx_pi_adjust_thread(pi, td); } } /* * Unpropagate priority for a PI mutex when a thread blocked on * it is interrupted by signal or resumed by others. */ static void umtx_repropagate_priority(struct umtx_pi *pi) { struct umtx_q *uq, *uq_owner; struct umtx_pi *pi2; int pri; mtx_assert(&umtx_lock, MA_OWNED); if (umtx_pi_check_loop(pi)) return; while (pi != NULL && pi->pi_owner != NULL) { pri = PRI_MAX; uq_owner = pi->pi_owner->td_umtxq; TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) { uq = TAILQ_FIRST(&pi2->pi_blocked); if (uq != NULL) { if (pri > UPRI(uq->uq_thread)) pri = UPRI(uq->uq_thread); } } if (pri > uq_owner->uq_inherited_pri) pri = uq_owner->uq_inherited_pri; thread_lock(pi->pi_owner); sched_lend_user_prio(pi->pi_owner, pri); thread_unlock(pi->pi_owner); if ((pi = uq_owner->uq_pi_blocked) != NULL) umtx_pi_adjust_thread(pi, uq_owner->uq_thread); } } /* * Insert a PI mutex into owned list. */ static void umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner) { struct umtx_q *uq_owner; uq_owner = owner->td_umtxq; mtx_assert(&umtx_lock, MA_OWNED); if (pi->pi_owner != NULL) panic("pi_owner != NULL"); pi->pi_owner = owner; TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link); } /* * Disown a PI mutex, and remove it from the owned list. */ static void umtx_pi_disown(struct umtx_pi *pi) { mtx_assert(&umtx_lock, MA_OWNED); TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested, pi, pi_link); pi->pi_owner = NULL; } /* * Claim ownership of a PI mutex. */ static int umtx_pi_claim(struct umtx_pi *pi, struct thread *owner) { struct umtx_q *uq; + int pri; mtx_lock(&umtx_lock); if (pi->pi_owner == owner) { mtx_unlock(&umtx_lock); return (0); } if (pi->pi_owner != NULL) { /* * userland may have already messed the mutex, sigh. */ mtx_unlock(&umtx_lock); return (EPERM); } umtx_pi_setowner(pi, owner); uq = TAILQ_FIRST(&pi->pi_blocked); if (uq != NULL) { - int pri; - pri = UPRI(uq->uq_thread); thread_lock(owner); if (pri < UPRI(owner)) sched_lend_user_prio(owner, pri); thread_unlock(owner); } mtx_unlock(&umtx_lock); return (0); } /* * Adjust a thread's order position in its blocked PI mutex, * this may result new priority propagating process. */ void umtx_pi_adjust(struct thread *td, u_char oldpri) { struct umtx_q *uq; struct umtx_pi *pi; uq = td->td_umtxq; mtx_lock(&umtx_lock); /* * Pick up the lock that td is blocked on. */ pi = uq->uq_pi_blocked; if (pi != NULL) { umtx_pi_adjust_thread(pi, td); umtx_repropagate_priority(pi); } mtx_unlock(&umtx_lock); } /* * Sleep on a PI mutex. */ static int -umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi, - uint32_t owner, const char *wmesg, struct abs_timeout *timo) +umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi, uint32_t owner, + const char *wmesg, struct abs_timeout *timo, bool shared) { struct umtxq_chain *uc; struct thread *td, *td1; struct umtx_q *uq1; - int pri; - int error = 0; + int error, pri; + error = 0; td = uq->uq_thread; KASSERT(td == curthread, ("inconsistent uq_thread")); uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); KASSERT(uc->uc_busy != 0, ("umtx chain is not busy")); umtxq_insert(uq); mtx_lock(&umtx_lock); if (pi->pi_owner == NULL) { mtx_unlock(&umtx_lock); - /* XXX Only look up thread in current process. */ - td1 = tdfind(owner, curproc->p_pid); + td1 = tdfind(owner, shared ? -1 : td->td_proc->p_pid); mtx_lock(&umtx_lock); if (td1 != NULL) { if (pi->pi_owner == NULL) umtx_pi_setowner(pi, td1); PROC_UNLOCK(td1->td_proc); } } TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) { pri = UPRI(uq1->uq_thread); if (pri > UPRI(td)) break; } if (uq1 != NULL) TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq); else TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq); uq->uq_pi_blocked = pi; thread_lock(td); td->td_flags |= TDF_UPIBLOCKED; thread_unlock(td); umtx_propagate_priority(td); mtx_unlock(&umtx_lock); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, wmesg, timo); umtxq_remove(uq); mtx_lock(&umtx_lock); uq->uq_pi_blocked = NULL; thread_lock(td); td->td_flags &= ~TDF_UPIBLOCKED; thread_unlock(td); TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq); umtx_repropagate_priority(pi); mtx_unlock(&umtx_lock); umtxq_unlock(&uq->uq_key); return (error); } /* * Add reference count for a PI mutex. */ static void umtx_pi_ref(struct umtx_pi *pi) { struct umtxq_chain *uc; uc = umtxq_getchain(&pi->pi_key); UMTXQ_LOCKED_ASSERT(uc); pi->pi_refcount++; } /* * Decrease reference count for a PI mutex, if the counter * is decreased to zero, its memory space is freed. */ static void umtx_pi_unref(struct umtx_pi *pi) { struct umtxq_chain *uc; uc = umtxq_getchain(&pi->pi_key); UMTXQ_LOCKED_ASSERT(uc); KASSERT(pi->pi_refcount > 0, ("invalid reference count")); if (--pi->pi_refcount == 0) { mtx_lock(&umtx_lock); if (pi->pi_owner != NULL) umtx_pi_disown(pi); KASSERT(TAILQ_EMPTY(&pi->pi_blocked), ("blocked queue not empty")); mtx_unlock(&umtx_lock); TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink); umtx_pi_free(pi); } } /* * Find a PI mutex in hash table. */ static struct umtx_pi * umtx_pi_lookup(struct umtx_key *key) { struct umtxq_chain *uc; struct umtx_pi *pi; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) { if (umtx_key_match(&pi->pi_key, key)) { return (pi); } } return (NULL); } /* * Insert a PI mutex into hash table. */ static inline void umtx_pi_insert(struct umtx_pi *pi) { struct umtxq_chain *uc; uc = umtxq_getchain(&pi->pi_key); UMTXQ_LOCKED_ASSERT(uc); TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink); } /* * Lock a PI mutex. */ static int do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, struct _umtx_time *timeout, int try) { struct abs_timeout timo; struct umtx_q *uq; struct umtx_pi *pi, *new_pi; - uint32_t id, owner, old; + uint32_t id, old_owner, owner, old; int error, rv; id = td->td_tid; uq = td->td_umtxq; - if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags), + if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ? + TYPE_PI_ROBUST_UMUTEX : TYPE_PI_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); umtxq_lock(&uq->uq_key); pi = umtx_pi_lookup(&uq->uq_key); if (pi == NULL) { new_pi = umtx_pi_alloc(M_NOWAIT); if (new_pi == NULL) { umtxq_unlock(&uq->uq_key); new_pi = umtx_pi_alloc(M_WAITOK); umtxq_lock(&uq->uq_key); pi = umtx_pi_lookup(&uq->uq_key); if (pi != NULL) { umtx_pi_free(new_pi); new_pi = NULL; } } if (new_pi != NULL) { new_pi->pi_key = uq->uq_key; umtx_pi_insert(new_pi); pi = new_pi; } } umtx_pi_ref(pi); umtxq_unlock(&uq->uq_key); /* * Care must be exercised when dealing with umtx structure. It * can fault on any access. */ for (;;) { /* * Try the uncontested case. This should be done in userland. */ rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id); /* The address was invalid. */ if (rv == -1) { error = EFAULT; break; } /* The acquire succeeded. */ if (owner == UMUTEX_UNOWNED) { error = 0; break; } + if (owner == UMUTEX_RB_NOTRECOV) { + error = ENOTRECOVERABLE; + break; + } + /* If no one owns it but it is contested try to acquire it. */ - if (owner == UMUTEX_CONTESTED) { - rv = casueword32(&m->m_owner, - UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED); + if (owner == UMUTEX_CONTESTED || owner == UMUTEX_RB_OWNERDEAD) { + old_owner = owner; + rv = casueword32(&m->m_owner, owner, &owner, + id | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) { error = EFAULT; break; } - if (owner == UMUTEX_CONTESTED) { + if (owner == old_owner) { umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); error = umtx_pi_claim(pi, td); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); if (error != 0) { /* * Since we're going to return an * error, restore the m_owner to its * previous, unowned state to avoid * compounding the problem. */ (void)casuword32(&m->m_owner, id | UMUTEX_CONTESTED, - UMUTEX_CONTESTED); + old_owner); } + if (error == 0 && + old_owner == UMUTEX_RB_OWNERDEAD) + error = EOWNERDEAD; break; } error = umtxq_check_susp(td); if (error != 0) break; /* If this failed the lock has changed, restart. */ continue; } if ((owner & ~UMUTEX_CONTESTED) == id) { error = EDEADLK; break; } if (try != 0) { error = EBUSY; break; } /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) break; umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); /* * Set the contested bit so that a release in user space * knows to use the system call for unlock. If this fails * either some one else has acquired the lock or it has been * released. */ - rv = casueword32(&m->m_owner, owner, &old, - owner | UMUTEX_CONTESTED); + rv = casueword32(&m->m_owner, owner, &old, owner | + UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } umtxq_lock(&uq->uq_key); /* * We set the contested bit, sleep. Otherwise the lock changed * and we need to retry or we lost a race to the thread - * unlocking the umtx. + * unlocking the umtx. Note that the UMUTEX_RB_OWNERDEAD + * value for owner is impossible there. */ if (old == owner) { - error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED, - "umtxpi", timeout == NULL ? NULL : &timo); + error = umtxq_sleep_pi(uq, pi, + owner & ~UMUTEX_CONTESTED, + "umtxpi", timeout == NULL ? NULL : &timo, + (flags & USYNC_PROCESS_SHARED) != 0); if (error != 0) continue; } else { umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); } error = umtxq_check_susp(td); if (error != 0) break; } umtxq_lock(&uq->uq_key); umtx_pi_unref(pi); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Unlock a PI mutex. */ static int -do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags) +do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags, bool rb) { struct umtx_key key; struct umtx_q *uq_first, *uq_first2, *uq_me; struct umtx_pi *pi, *pi2; - uint32_t owner, old, id; - int error; - int count; - int pri; + uint32_t id, new_owner, old, owner; + int count, error, pri; id = td->td_tid; /* * Make sure we own this mtx. */ error = fueword32(&m->m_owner, &owner); if (error == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != id) return (EPERM); + new_owner = umtx_unlock_val(flags, rb); + /* This should be done in userland */ if ((owner & UMUTEX_CONTESTED) == 0) { - error = casueword32(&m->m_owner, owner, &old, UMUTEX_UNOWNED); + error = casueword32(&m->m_owner, owner, &old, new_owner); if (error == -1) return (EFAULT); if (old == owner) return (0); owner = old; } /* We should only ever be in here for contested locks */ - if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags), + if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ? + TYPE_PI_ROBUST_UMUTEX : TYPE_PI_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count_pi(&key, &uq_first); if (uq_first != NULL) { mtx_lock(&umtx_lock); pi = uq_first->uq_pi_blocked; KASSERT(pi != NULL, ("pi == NULL?")); - if (pi->pi_owner != td) { + if (pi->pi_owner != td && !(rb && pi->pi_owner == NULL)) { mtx_unlock(&umtx_lock); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); /* userland messed the mutex */ return (EPERM); } uq_me = td->td_umtxq; - umtx_pi_disown(pi); + if (pi->pi_owner == td) + umtx_pi_disown(pi); /* get highest priority thread which is still sleeping. */ uq_first = TAILQ_FIRST(&pi->pi_blocked); while (uq_first != NULL && - (uq_first->uq_flags & UQF_UMTXQ) == 0) { + (uq_first->uq_flags & UQF_UMTXQ) == 0) { uq_first = TAILQ_NEXT(uq_first, uq_lockq); } pri = PRI_MAX; TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) { uq_first2 = TAILQ_FIRST(&pi2->pi_blocked); if (uq_first2 != NULL) { if (pri > UPRI(uq_first2->uq_thread)) pri = UPRI(uq_first2->uq_thread); } } thread_lock(td); sched_lend_user_prio(td, pri); thread_unlock(td); mtx_unlock(&umtx_lock); if (uq_first) umtxq_signal_thread(uq_first); } else { pi = umtx_pi_lookup(&key); /* * A umtx_pi can exist if a signal or timeout removed the * last waiter from the umtxq, but there is still * a thread in do_lock_pi() holding the umtx_pi. */ if (pi != NULL) { /* * The umtx_pi can be unowned, such as when a thread * has just entered do_lock_pi(), allocated the * umtx_pi, and unlocked the umtxq. * If the current thread owns it, it must disown it. */ mtx_lock(&umtx_lock); if (pi->pi_owner == td) umtx_pi_disown(pi); mtx_unlock(&umtx_lock); } } umtxq_unlock(&key); /* * When unlocking the umtx, it must be marked as unowned if * there is zero or one thread only waiting for it. * Otherwise, it must be marked as contested. */ - error = casueword32(&m->m_owner, owner, &old, - count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED); + if (count > 1) + new_owner |= UMUTEX_CONTESTED; + error = casueword32(&m->m_owner, owner, &old, new_owner); + umtxq_unbusy_unlocked(&key); umtx_key_release(&key); if (error == -1) return (EFAULT); if (old != owner) return (EINVAL); return (0); } /* * Lock a PP mutex. */ static int do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, struct _umtx_time *timeout, int try) { struct abs_timeout timo; struct umtx_q *uq, *uq2; struct umtx_pi *pi; uint32_t ceiling; uint32_t owner, id; int error, pri, old_inherited_pri, su, rv; id = td->td_tid; uq = td->td_umtxq; - if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags), + if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ? + TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0); for (;;) { old_inherited_pri = uq->uq_inherited_pri; umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); rv = fueword32(&m->m_ceilings[0], &ceiling); if (rv == -1) { error = EFAULT; goto out; } ceiling = RTP_PRIO_MAX - ceiling; if (ceiling > RTP_PRIO_MAX) { error = EINVAL; goto out; } mtx_lock(&umtx_lock); if (UPRI(td) < PRI_MIN_REALTIME + ceiling) { mtx_unlock(&umtx_lock); error = EINVAL; goto out; } if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) { uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling; thread_lock(td); if (uq->uq_inherited_pri < UPRI(td)) sched_lend_user_prio(td, uq->uq_inherited_pri); thread_unlock(td); } mtx_unlock(&umtx_lock); - rv = casueword32(&m->m_owner, - UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED); + rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner, + id | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) { error = EFAULT; break; } if (owner == UMUTEX_CONTESTED) { error = 0; break; + } else if (owner == UMUTEX_RB_OWNERDEAD) { + rv = casueword32(&m->m_owner, UMUTEX_RB_OWNERDEAD, + &owner, id | UMUTEX_CONTESTED); + if (rv == -1) { + error = EFAULT; + break; + } + if (owner == UMUTEX_RB_OWNERDEAD) { + error = EOWNERDEAD; /* success */ + break; + } + error = 0; + } else if (owner == UMUTEX_RB_NOTRECOV) { + error = ENOTRECOVERABLE; + break; } if (try != 0) { error = EBUSY; break; } /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) break; umtxq_lock(&uq->uq_key); umtxq_insert(uq); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "umtxpp", timeout == NULL ? NULL : &timo); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); mtx_lock(&umtx_lock); uq->uq_inherited_pri = old_inherited_pri; pri = PRI_MAX; TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) { uq2 = TAILQ_FIRST(&pi->pi_blocked); if (uq2 != NULL) { if (pri > UPRI(uq2->uq_thread)) pri = UPRI(uq2->uq_thread); } } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; thread_lock(td); sched_lend_user_prio(td, pri); thread_unlock(td); mtx_unlock(&umtx_lock); } - if (error != 0) { + if (error != 0 && error != EOWNERDEAD) { mtx_lock(&umtx_lock); uq->uq_inherited_pri = old_inherited_pri; pri = PRI_MAX; TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) { uq2 = TAILQ_FIRST(&pi->pi_blocked); if (uq2 != NULL) { if (pri > UPRI(uq2->uq_thread)) pri = UPRI(uq2->uq_thread); } } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; thread_lock(td); sched_lend_user_prio(td, pri); thread_unlock(td); mtx_unlock(&umtx_lock); } out: umtxq_unbusy_unlocked(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Unlock a PP mutex. */ static int -do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags) +do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags, bool rb) { struct umtx_key key; struct umtx_q *uq, *uq2; struct umtx_pi *pi; - uint32_t owner, id; - uint32_t rceiling; + uint32_t id, owner, rceiling; int error, pri, new_inherited_pri, su; id = td->td_tid; uq = td->td_umtxq; su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0); /* * Make sure we own this mtx. */ error = fueword32(&m->m_owner, &owner); if (error == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != id) return (EPERM); error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t)); if (error != 0) return (error); if (rceiling == -1) new_inherited_pri = PRI_MAX; else { rceiling = RTP_PRIO_MAX - rceiling; if (rceiling > RTP_PRIO_MAX) return (EINVAL); new_inherited_pri = PRI_MIN_REALTIME + rceiling; } - if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags), + if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ? + TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); umtxq_unlock(&key); /* * For priority protected mutex, always set unlocked state * to UMUTEX_CONTESTED, so that userland always enters kernel * to lock the mutex, it is necessary because thread priority * has to be adjusted for such mutex. */ - error = suword32(&m->m_owner, UMUTEX_CONTESTED); + error = suword32(&m->m_owner, umtx_unlock_val(flags, rb) | + UMUTEX_CONTESTED); umtxq_lock(&key); if (error == 0) umtxq_signal(&key, 1); umtxq_unbusy(&key); umtxq_unlock(&key); if (error == -1) error = EFAULT; else { mtx_lock(&umtx_lock); if (su != 0) uq->uq_inherited_pri = new_inherited_pri; pri = PRI_MAX; TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) { uq2 = TAILQ_FIRST(&pi->pi_blocked); if (uq2 != NULL) { if (pri > UPRI(uq2->uq_thread)) pri = UPRI(uq2->uq_thread); } } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; thread_lock(td); sched_lend_user_prio(td, pri); thread_unlock(td); mtx_unlock(&umtx_lock); } umtx_key_release(&key); return (error); } static int do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling, - uint32_t *old_ceiling) + uint32_t *old_ceiling) { struct umtx_q *uq; - uint32_t save_ceiling; - uint32_t owner, id; - uint32_t flags; - int error, rv; + uint32_t flags, id, owner, save_ceiling; + int error, rv, rv1; error = fueword32(&m->m_flags, &flags); if (error == -1) return (EFAULT); if ((flags & UMUTEX_PRIO_PROTECT) == 0) return (EINVAL); if (ceiling > RTP_PRIO_MAX) return (EINVAL); id = td->td_tid; uq = td->td_umtxq; - if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags), - &uq->uq_key)) != 0) + if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ? + TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags), + &uq->uq_key)) != 0) return (error); for (;;) { umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); rv = fueword32(&m->m_ceilings[0], &save_ceiling); if (rv == -1) { error = EFAULT; break; } - rv = casueword32(&m->m_owner, - UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED); + rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner, + id | UMUTEX_CONTESTED); if (rv == -1) { error = EFAULT; break; } if (owner == UMUTEX_CONTESTED) { - suword32(&m->m_ceilings[0], ceiling); - suword32(&m->m_owner, UMUTEX_CONTESTED); - error = 0; + rv = suword32(&m->m_ceilings[0], ceiling); + rv1 = suword32(&m->m_owner, UMUTEX_CONTESTED); + error = (rv == 0 && rv1 == 0) ? 0: EFAULT; break; } if ((owner & ~UMUTEX_CONTESTED) == id) { - suword32(&m->m_ceilings[0], ceiling); - error = 0; + rv = suword32(&m->m_ceilings[0], ceiling); + error = rv == 0 ? 0 : EFAULT; break; } + if (owner == UMUTEX_RB_OWNERDEAD) { + error = EOWNERDEAD; + break; + } else if (owner == UMUTEX_RB_NOTRECOV) { + error = ENOTRECOVERABLE; + break; + } + /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) break; /* * We set the contested bit, sleep. Otherwise the lock changed * and we need to retry or we lost a race to the thread * unlocking the umtx. */ umtxq_lock(&uq->uq_key); umtxq_insert(uq); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "umtxpp", NULL); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); } umtxq_lock(&uq->uq_key); if (error == 0) umtxq_signal(&uq->uq_key, INT_MAX); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); - if (error == 0 && old_ceiling != NULL) - suword32(old_ceiling, save_ceiling); + if (error == 0 && old_ceiling != NULL) { + rv = suword32(old_ceiling, save_ceiling); + error = rv == 0 ? 0 : EFAULT; + } return (error); } /* * Lock a userland POSIX mutex. */ static int do_lock_umutex(struct thread *td, struct umutex *m, struct _umtx_time *timeout, int mode) { uint32_t flags; int error; error = fueword32(&m->m_flags, &flags); if (error == -1) return (EFAULT); - switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) { + switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) { case 0: error = do_lock_normal(td, m, flags, timeout, mode); break; case UMUTEX_PRIO_INHERIT: error = do_lock_pi(td, m, flags, timeout, mode); break; case UMUTEX_PRIO_PROTECT: error = do_lock_pp(td, m, flags, timeout, mode); break; default: return (EINVAL); } if (timeout == NULL) { if (error == EINTR && mode != _UMUTEX_WAIT) error = ERESTART; } else { /* Timed-locking is not restarted. */ if (error == ERESTART) error = EINTR; } return (error); } /* * Unlock a userland POSIX mutex. */ static int -do_unlock_umutex(struct thread *td, struct umutex *m) +do_unlock_umutex(struct thread *td, struct umutex *m, bool rb) { uint32_t flags; int error; error = fueword32(&m->m_flags, &flags); if (error == -1) return (EFAULT); - switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) { + switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) { case 0: - return (do_unlock_normal(td, m, flags)); + return (do_unlock_normal(td, m, flags, rb)); case UMUTEX_PRIO_INHERIT: - return (do_unlock_pi(td, m, flags)); + return (do_unlock_pi(td, m, flags, rb)); case UMUTEX_PRIO_PROTECT: - return (do_unlock_pp(td, m, flags)); + return (do_unlock_pp(td, m, flags, rb)); } return (EINVAL); } static int do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m, - struct timespec *timeout, u_long wflags) + struct timespec *timeout, u_long wflags) { struct abs_timeout timo; struct umtx_q *uq; uint32_t flags, clockid, hasw; int error; uq = td->td_umtxq; error = fueword32(&cv->c_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if ((wflags & CVWAIT_CLOCKID) != 0) { error = fueword32(&cv->c_clockid, &clockid); if (error == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } if (clockid < CLOCK_REALTIME || clockid >= CLOCK_THREAD_CPUTIME_ID) { /* hmm, only HW clock id will work. */ umtx_key_release(&uq->uq_key); return (EINVAL); } } else { clockid = CLOCK_REALTIME; } umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); /* * Set c_has_waiters to 1 before releasing user mutex, also * don't modify cache line when unnecessary. */ error = fueword32(&cv->c_has_waiters, &hasw); if (error == 0 && hasw == 0) suword32(&cv->c_has_waiters, 1); umtxq_unbusy_unlocked(&uq->uq_key); - error = do_unlock_umutex(td, m); + error = do_unlock_umutex(td, m, false); if (timeout != NULL) - abs_timeout_init(&timo, clockid, ((wflags & CVWAIT_ABSTIME) != 0), - timeout); + abs_timeout_init(&timo, clockid, (wflags & CVWAIT_ABSTIME) != 0, + timeout); umtxq_lock(&uq->uq_key); if (error == 0) { error = umtxq_sleep(uq, "ucond", timeout == NULL ? NULL : &timo); } if ((uq->uq_flags & UQF_UMTXQ) == 0) error = 0; else { /* * This must be timeout,interrupted by signal or * surprious wakeup, clear c_has_waiter flag when * necessary. */ umtxq_busy(&uq->uq_key); if ((uq->uq_flags & UQF_UMTXQ) != 0) { int oldlen = uq->uq_cur_queue->length; umtxq_remove(uq); if (oldlen == 1) { umtxq_unlock(&uq->uq_key); suword32(&cv->c_has_waiters, 0); umtxq_lock(&uq->uq_key); } } umtxq_unbusy(&uq->uq_key); if (error == ERESTART) error = EINTR; } umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Signal a userland condition variable. */ static int do_cv_signal(struct thread *td, struct ucond *cv) { struct umtx_key key; int error, cnt, nwake; uint32_t flags; error = fueword32(&cv->c_flags, &flags); if (error == -1) return (EFAULT); if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); cnt = umtxq_count(&key); nwake = umtxq_signal(&key, 1); if (cnt <= nwake) { umtxq_unlock(&key); error = suword32(&cv->c_has_waiters, 0); if (error == -1) error = EFAULT; umtxq_lock(&key); } umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } static int do_cv_broadcast(struct thread *td, struct ucond *cv) { struct umtx_key key; int error; uint32_t flags; error = fueword32(&cv->c_flags, &flags); if (error == -1) return (EFAULT); if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); umtxq_signal(&key, INT_MAX); umtxq_unlock(&key); error = suword32(&cv->c_has_waiters, 0); if (error == -1) error = EFAULT; umtxq_unbusy_unlocked(&key); umtx_key_release(&key); return (error); } static int do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, struct _umtx_time *timeout) { struct abs_timeout timo; struct umtx_q *uq; uint32_t flags, wrflags; int32_t state, oldstate; int32_t blocked_readers; int error, rv; uq = td->td_umtxq; error = fueword32(&rwlock->rw_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); wrflags = URWLOCK_WRITE_OWNER; if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER)) wrflags |= URWLOCK_WRITE_WAITERS; for (;;) { rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } /* try to lock it */ while (!(state & wrflags)) { if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) { umtx_key_release(&uq->uq_key); return (EAGAIN); } rv = casueword32(&rwlock->rw_state, state, &oldstate, state + 1); if (rv == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } if (oldstate == state) { umtx_key_release(&uq->uq_key); return (0); } error = umtxq_check_susp(td); if (error != 0) break; state = oldstate; } if (error) break; /* grab monitor lock */ umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); /* * re-read the state, in case it changed between the try-lock above * and the check below */ rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) error = EFAULT; /* set read contention bit */ while (error == 0 && (state & wrflags) && !(state & URWLOCK_READ_WAITERS)) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state | URWLOCK_READ_WAITERS); if (rv == -1) { error = EFAULT; break; } if (oldstate == state) goto sleep; state = oldstate; error = umtxq_check_susp(td); if (error != 0) break; } if (error != 0) { umtxq_unbusy_unlocked(&uq->uq_key); break; } /* state is changed while setting flags, restart */ if (!(state & wrflags)) { umtxq_unbusy_unlocked(&uq->uq_key); error = umtxq_check_susp(td); if (error != 0) break; continue; } sleep: /* contention bit is set, before sleeping, increase read waiter count */ rv = fueword32(&rwlock->rw_blocked_readers, &blocked_readers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } suword32(&rwlock->rw_blocked_readers, blocked_readers+1); while (state & wrflags) { umtxq_lock(&uq->uq_key); umtxq_insert(uq); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "urdlck", timeout == NULL ? NULL : &timo); umtxq_busy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); if (error) break; rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { error = EFAULT; break; } } /* decrease read waiter count, and may clear read contention bit */ rv = fueword32(&rwlock->rw_blocked_readers, &blocked_readers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } suword32(&rwlock->rw_blocked_readers, blocked_readers-1); if (blocked_readers == 1) { rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) error = EFAULT; while (error == 0) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state & ~URWLOCK_READ_WAITERS); if (rv == -1) { error = EFAULT; break; } if (oldstate == state) break; state = oldstate; error = umtxq_check_susp(td); } } umtxq_unbusy_unlocked(&uq->uq_key); if (error != 0) break; } umtx_key_release(&uq->uq_key); if (error == ERESTART) error = EINTR; return (error); } static int do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout) { struct abs_timeout timo; struct umtx_q *uq; uint32_t flags; int32_t state, oldstate; int32_t blocked_writers; int32_t blocked_readers; int error, rv; uq = td->td_umtxq; error = fueword32(&rwlock->rw_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); blocked_readers = 0; for (;;) { rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state | URWLOCK_WRITE_OWNER); if (rv == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } if (oldstate == state) { umtx_key_release(&uq->uq_key); return (0); } state = oldstate; error = umtxq_check_susp(td); if (error != 0) break; } if (error) { if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) && blocked_readers != 0) { umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); } break; } /* grab monitor lock */ umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); /* * re-read the state, in case it changed between the try-lock above * and the check below */ rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) error = EFAULT; while (error == 0 && ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) && (state & URWLOCK_WRITE_WAITERS) == 0) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state | URWLOCK_WRITE_WAITERS); if (rv == -1) { error = EFAULT; break; } if (oldstate == state) goto sleep; state = oldstate; error = umtxq_check_susp(td); if (error != 0) break; } if (error != 0) { umtxq_unbusy_unlocked(&uq->uq_key); break; } if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) { umtxq_unbusy_unlocked(&uq->uq_key); error = umtxq_check_susp(td); if (error != 0) break; continue; } sleep: rv = fueword32(&rwlock->rw_blocked_writers, &blocked_writers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } suword32(&rwlock->rw_blocked_writers, blocked_writers+1); while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) { umtxq_lock(&uq->uq_key); umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "uwrlck", timeout == NULL ? NULL : &timo); umtxq_busy(&uq->uq_key); umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE); umtxq_unlock(&uq->uq_key); if (error) break; rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { error = EFAULT; break; } } rv = fueword32(&rwlock->rw_blocked_writers, &blocked_writers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } suword32(&rwlock->rw_blocked_writers, blocked_writers-1); if (blocked_writers == 1) { rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } for (;;) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state & ~URWLOCK_WRITE_WAITERS); if (rv == -1) { error = EFAULT; break; } if (oldstate == state) break; state = oldstate; error = umtxq_check_susp(td); /* * We are leaving the URWLOCK_WRITE_WAITERS * behind, but this should not harm the * correctness. */ if (error != 0) break; } rv = fueword32(&rwlock->rw_blocked_readers, &blocked_readers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } } else blocked_readers = 0; umtxq_unbusy_unlocked(&uq->uq_key); } umtx_key_release(&uq->uq_key); if (error == ERESTART) error = EINTR; return (error); } static int do_rw_unlock(struct thread *td, struct urwlock *rwlock) { struct umtx_q *uq; uint32_t flags; int32_t state, oldstate; int error, rv, q, count; uq = td->td_umtxq; error = fueword32(&rwlock->rw_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); error = fueword32(&rwlock->rw_state, &state); if (error == -1) { error = EFAULT; goto out; } if (state & URWLOCK_WRITE_OWNER) { for (;;) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state & ~URWLOCK_WRITE_OWNER); if (rv == -1) { error = EFAULT; goto out; } if (oldstate != state) { state = oldstate; if (!(oldstate & URWLOCK_WRITE_OWNER)) { error = EPERM; goto out; } error = umtxq_check_susp(td); if (error != 0) goto out; } else break; } } else if (URWLOCK_READER_COUNT(state) != 0) { for (;;) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state - 1); if (rv == -1) { error = EFAULT; goto out; } if (oldstate != state) { state = oldstate; if (URWLOCK_READER_COUNT(oldstate) == 0) { error = EPERM; goto out; } error = umtxq_check_susp(td); if (error != 0) goto out; } else break; } } else { error = EPERM; goto out; } count = 0; if (!(flags & URWLOCK_PREFER_READER)) { if (state & URWLOCK_WRITE_WAITERS) { count = 1; q = UMTX_EXCLUSIVE_QUEUE; } else if (state & URWLOCK_READ_WAITERS) { count = INT_MAX; q = UMTX_SHARED_QUEUE; } } else { if (state & URWLOCK_READ_WAITERS) { count = INT_MAX; q = UMTX_SHARED_QUEUE; } else if (state & URWLOCK_WRITE_WAITERS) { count = 1; q = UMTX_EXCLUSIVE_QUEUE; } } if (count) { umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_signal_queue(&uq->uq_key, count, q); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); } out: umtx_key_release(&uq->uq_key); return (error); } #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10) static int do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout) { struct abs_timeout timo; struct umtx_q *uq; uint32_t flags, count, count1; int error, rv; uq = td->td_umtxq; error = fueword32(&sem->_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); rv = casueword32(&sem->_has_waiters, 0, &count1, 1); if (rv == 0) rv = fueword32(&sem->_count, &count); if (rv == -1 || count != 0) { umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (rv == -1 ? EFAULT : 0); } umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo); if ((uq->uq_flags & UQF_UMTXQ) == 0) error = 0; else { umtxq_remove(uq); /* A relative timeout cannot be restarted. */ if (error == ERESTART && timeout != NULL && (timeout->_flags & UMTX_ABSTIME) == 0) error = EINTR; } umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Signal a userland semaphore. */ static int do_sem_wake(struct thread *td, struct _usem *sem) { struct umtx_key key; int error, cnt; uint32_t flags; error = fueword32(&sem->_flags, &flags); if (error == -1) return (EFAULT); if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); cnt = umtxq_count(&key); if (cnt > 0) { umtxq_signal(&key, 1); /* * Check if count is greater than 0, this means the memory is * still being referenced by user code, so we can safely * update _has_waiters flag. */ if (cnt == 1) { umtxq_unlock(&key); error = suword32(&sem->_has_waiters, 0); umtxq_lock(&key); if (error == -1) error = EFAULT; } } umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } #endif static int do_sem2_wait(struct thread *td, struct _usem2 *sem, struct _umtx_time *timeout) { struct abs_timeout timo; struct umtx_q *uq; uint32_t count, flags; int error, rv; uq = td->td_umtxq; flags = fuword32(&sem->_flags); error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); rv = fueword32(&sem->_count, &count); if (rv == -1) { umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (EFAULT); } for (;;) { if (USEM_COUNT(count) != 0) { umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (0); } if (count == USEM_HAS_WAITERS) break; rv = casueword32(&sem->_count, 0, &count, USEM_HAS_WAITERS); if (rv == -1) { umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (EFAULT); } if (count == 0) break; } umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo); if ((uq->uq_flags & UQF_UMTXQ) == 0) error = 0; else { umtxq_remove(uq); /* A relative timeout cannot be restarted. */ if (error == ERESTART && timeout != NULL && (timeout->_flags & UMTX_ABSTIME) == 0) error = EINTR; } umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Signal a userland semaphore. */ static int do_sem2_wake(struct thread *td, struct _usem2 *sem) { struct umtx_key key; int error, cnt, rv; uint32_t count, flags; rv = fueword32(&sem->_flags, &flags); if (rv == -1) return (EFAULT); if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); cnt = umtxq_count(&key); if (cnt > 0) { umtxq_signal(&key, 1); /* * If this was the last sleeping thread, clear the waiters * flag in _count. */ if (cnt == 1) { umtxq_unlock(&key); rv = fueword32(&sem->_count, &count); while (rv != -1 && count & USEM_HAS_WAITERS) rv = casueword32(&sem->_count, count, &count, count & ~USEM_HAS_WAITERS); if (rv == -1) error = EFAULT; umtxq_lock(&key); } } umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } inline int umtx_copyin_timeout(const void *addr, struct timespec *tsp) { int error; error = copyin(addr, tsp, sizeof(struct timespec)); if (error == 0) { if (tsp->tv_sec < 0 || tsp->tv_nsec >= 1000000000 || tsp->tv_nsec < 0) error = EINVAL; } return (error); } static inline int umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp) { int error; if (size <= sizeof(struct timespec)) { tp->_clockid = CLOCK_REALTIME; tp->_flags = 0; error = copyin(addr, &tp->_timeout, sizeof(struct timespec)); } else error = copyin(addr, tp, sizeof(struct _umtx_time)); if (error != 0) return (error); if (tp->_timeout.tv_sec < 0 || tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0) return (EINVAL); return (0); } static int __umtx_op_unimpl(struct thread *td, struct _umtx_op_args *uap) { return (EOPNOTSUPP); } static int __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout, *tm_p; int error; if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } - return do_wait(td, uap->obj, uap->val, tm_p, 0, 0); + return (do_wait(td, uap->obj, uap->val, tm_p, 0, 0)); } static int __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout, *tm_p; int error; if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } - return do_wait(td, uap->obj, uap->val, tm_p, 1, 0); + return (do_wait(td, uap->obj, uap->val, tm_p, 1, 0)); } static int __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } - return do_wait(td, uap->obj, uap->val, tm_p, 1, 1); + return (do_wait(td, uap->obj, uap->val, tm_p, 1, 1)); } static int __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap) { + return (kern_umtx_wake(td, uap->obj, uap->val, 0)); } #define BATCH_SIZE 128 static int __umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap) { - int count = uap->val; - void *uaddrs[BATCH_SIZE]; - char **upp = (char **)uap->obj; - int tocopy; - int error = 0; - int i, pos = 0; + char *uaddrs[BATCH_SIZE], **upp; + int count, error, i, pos, tocopy; - while (count > 0) { - tocopy = count; - if (tocopy > BATCH_SIZE) - tocopy = BATCH_SIZE; - error = copyin(upp+pos, uaddrs, tocopy * sizeof(char *)); + upp = (char **)uap->obj; + error = 0; + for (count = uap->val, pos = 0; count > 0; count -= tocopy, + pos += tocopy) { + tocopy = MIN(count, BATCH_SIZE); + error = copyin(upp + pos, uaddrs, tocopy * sizeof(char *)); if (error != 0) break; for (i = 0; i < tocopy; ++i) kern_umtx_wake(td, uaddrs[i], INT_MAX, 1); - count -= tocopy; - pos += tocopy; + maybe_yield(); } return (error); } static int __umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap) { + return (kern_umtx_wake(td, uap->obj, uap->val, 1)); } static int __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } - return do_lock_umutex(td, uap->obj, tm_p, 0); + return (do_lock_umutex(td, uap->obj, tm_p, 0)); } static int __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap) { - return do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY); + + return (do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY)); } static int __umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } - return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT); + return (do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT)); } static int __umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap) { - return do_wake_umutex(td, uap->obj); + + return (do_wake_umutex(td, uap->obj)); } static int __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap) { - return do_unlock_umutex(td, uap->obj); + + return (do_unlock_umutex(td, uap->obj, false)); } static int __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap) { - return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1); + + return (do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1)); } static int __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap) { struct timespec *ts, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) ts = NULL; else { error = umtx_copyin_timeout(uap->uaddr2, &timeout); if (error != 0) return (error); ts = &timeout; } return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val)); } static int __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap) { - return do_cv_signal(td, uap->obj); + + return (do_cv_signal(td, uap->obj)); } static int __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap) { - return do_cv_broadcast(td, uap->obj); + + return (do_cv_broadcast(td, uap->obj)); } static int __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) { error = do_rw_rdlock(td, uap->obj, uap->val, 0); } else { error = umtx_copyin_umtx_time(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); error = do_rw_rdlock(td, uap->obj, uap->val, &timeout); } return (error); } static int __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) { error = do_rw_wrlock(td, uap->obj, 0); } else { error = umtx_copyin_umtx_time(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); error = do_rw_wrlock(td, uap->obj, &timeout); } return (error); } static int __umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap) { - return do_rw_unlock(td, uap->obj); + + return (do_rw_unlock(td, uap->obj)); } #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10) static int __umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_sem_wait(td, uap->obj, tm_p)); } static int __umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap) { return (do_sem_wake(td, uap->obj)); } #endif static int __umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap) { return (do_wake2_umutex(td, uap->obj, uap->val)); } static int __umtx_op_sem2_wait(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_sem2_wait(td, uap->obj, tm_p)); } static int __umtx_op_sem2_wake(struct thread *td, struct _umtx_op_args *uap) { return (do_sem2_wake(td, uap->obj)); } #define USHM_OBJ_UMTX(o) \ ((struct umtx_shm_obj_list *)(&(o)->umtx_data)) #define USHMF_REG_LINKED 0x0001 #define USHMF_OBJ_LINKED 0x0002 struct umtx_shm_reg { TAILQ_ENTRY(umtx_shm_reg) ushm_reg_link; LIST_ENTRY(umtx_shm_reg) ushm_obj_link; struct umtx_key ushm_key; struct ucred *ushm_cred; struct shmfd *ushm_obj; u_int ushm_refcnt; u_int ushm_flags; }; LIST_HEAD(umtx_shm_obj_list, umtx_shm_reg); TAILQ_HEAD(umtx_shm_reg_head, umtx_shm_reg); static uma_zone_t umtx_shm_reg_zone; static struct umtx_shm_reg_head umtx_shm_registry[UMTX_CHAINS]; static struct mtx umtx_shm_lock; static struct umtx_shm_reg_head umtx_shm_reg_delfree = TAILQ_HEAD_INITIALIZER(umtx_shm_reg_delfree); static void umtx_shm_free_reg(struct umtx_shm_reg *reg); static void umtx_shm_reg_delfree_tq(void *context __unused, int pending __unused) { struct umtx_shm_reg_head d; struct umtx_shm_reg *reg, *reg1; TAILQ_INIT(&d); mtx_lock(&umtx_shm_lock); TAILQ_CONCAT(&d, &umtx_shm_reg_delfree, ushm_reg_link); mtx_unlock(&umtx_shm_lock); TAILQ_FOREACH_SAFE(reg, &d, ushm_reg_link, reg1) { TAILQ_REMOVE(&d, reg, ushm_reg_link); umtx_shm_free_reg(reg); } } static struct task umtx_shm_reg_delfree_task = TASK_INITIALIZER(0, umtx_shm_reg_delfree_tq, NULL); static struct umtx_shm_reg * umtx_shm_find_reg_locked(const struct umtx_key *key) { struct umtx_shm_reg *reg; struct umtx_shm_reg_head *reg_head; KASSERT(key->shared, ("umtx_p_find_rg: private key")); mtx_assert(&umtx_shm_lock, MA_OWNED); reg_head = &umtx_shm_registry[key->hash]; TAILQ_FOREACH(reg, reg_head, ushm_reg_link) { KASSERT(reg->ushm_key.shared, ("non-shared key on reg %p %d", reg, reg->ushm_key.shared)); if (reg->ushm_key.info.shared.object == key->info.shared.object && reg->ushm_key.info.shared.offset == key->info.shared.offset) { KASSERT(reg->ushm_key.type == TYPE_SHM, ("TYPE_USHM")); KASSERT(reg->ushm_refcnt > 0, ("reg %p refcnt 0 onlist", reg)); KASSERT((reg->ushm_flags & USHMF_REG_LINKED) != 0, ("reg %p not linked", reg)); reg->ushm_refcnt++; return (reg); } } return (NULL); } static struct umtx_shm_reg * umtx_shm_find_reg(const struct umtx_key *key) { struct umtx_shm_reg *reg; mtx_lock(&umtx_shm_lock); reg = umtx_shm_find_reg_locked(key); mtx_unlock(&umtx_shm_lock); return (reg); } static void umtx_shm_free_reg(struct umtx_shm_reg *reg) { chgumtxcnt(reg->ushm_cred->cr_ruidinfo, -1, 0); crfree(reg->ushm_cred); shm_drop(reg->ushm_obj); uma_zfree(umtx_shm_reg_zone, reg); } static bool umtx_shm_unref_reg_locked(struct umtx_shm_reg *reg, bool force) { bool res; mtx_assert(&umtx_shm_lock, MA_OWNED); KASSERT(reg->ushm_refcnt > 0, ("ushm_reg %p refcnt 0", reg)); reg->ushm_refcnt--; res = reg->ushm_refcnt == 0; if (res || force) { if ((reg->ushm_flags & USHMF_REG_LINKED) != 0) { TAILQ_REMOVE(&umtx_shm_registry[reg->ushm_key.hash], reg, ushm_reg_link); reg->ushm_flags &= ~USHMF_REG_LINKED; } if ((reg->ushm_flags & USHMF_OBJ_LINKED) != 0) { LIST_REMOVE(reg, ushm_obj_link); reg->ushm_flags &= ~USHMF_OBJ_LINKED; } } return (res); } static void umtx_shm_unref_reg(struct umtx_shm_reg *reg, bool force) { vm_object_t object; bool dofree; if (force) { object = reg->ushm_obj->shm_object; VM_OBJECT_WLOCK(object); object->flags |= OBJ_UMTXDEAD; VM_OBJECT_WUNLOCK(object); } mtx_lock(&umtx_shm_lock); dofree = umtx_shm_unref_reg_locked(reg, force); mtx_unlock(&umtx_shm_lock); if (dofree) umtx_shm_free_reg(reg); } void umtx_shm_object_init(vm_object_t object) { LIST_INIT(USHM_OBJ_UMTX(object)); } void umtx_shm_object_terminated(vm_object_t object) { struct umtx_shm_reg *reg, *reg1; bool dofree; dofree = false; mtx_lock(&umtx_shm_lock); LIST_FOREACH_SAFE(reg, USHM_OBJ_UMTX(object), ushm_obj_link, reg1) { if (umtx_shm_unref_reg_locked(reg, true)) { TAILQ_INSERT_TAIL(&umtx_shm_reg_delfree, reg, ushm_reg_link); dofree = true; } } mtx_unlock(&umtx_shm_lock); if (dofree) taskqueue_enqueue(taskqueue_thread, &umtx_shm_reg_delfree_task); } static int umtx_shm_create_reg(struct thread *td, const struct umtx_key *key, struct umtx_shm_reg **res) { struct umtx_shm_reg *reg, *reg1; struct ucred *cred; int error; reg = umtx_shm_find_reg(key); if (reg != NULL) { *res = reg; return (0); } cred = td->td_ucred; if (!chgumtxcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_UMTXP))) return (ENOMEM); reg = uma_zalloc(umtx_shm_reg_zone, M_WAITOK | M_ZERO); reg->ushm_refcnt = 1; bcopy(key, ®->ushm_key, sizeof(*key)); reg->ushm_obj = shm_alloc(td->td_ucred, O_RDWR); reg->ushm_cred = crhold(cred); error = shm_dotruncate(reg->ushm_obj, PAGE_SIZE); if (error != 0) { umtx_shm_free_reg(reg); return (error); } mtx_lock(&umtx_shm_lock); reg1 = umtx_shm_find_reg_locked(key); if (reg1 != NULL) { mtx_unlock(&umtx_shm_lock); umtx_shm_free_reg(reg); *res = reg1; return (0); } reg->ushm_refcnt++; TAILQ_INSERT_TAIL(&umtx_shm_registry[key->hash], reg, ushm_reg_link); LIST_INSERT_HEAD(USHM_OBJ_UMTX(key->info.shared.object), reg, ushm_obj_link); reg->ushm_flags = USHMF_REG_LINKED | USHMF_OBJ_LINKED; mtx_unlock(&umtx_shm_lock); *res = reg; return (0); } static int umtx_shm_alive(struct thread *td, void *addr) { vm_map_t map; vm_map_entry_t entry; vm_object_t object; vm_pindex_t pindex; vm_prot_t prot; int res, ret; boolean_t wired; map = &td->td_proc->p_vmspace->vm_map; res = vm_map_lookup(&map, (uintptr_t)addr, VM_PROT_READ, &entry, &object, &pindex, &prot, &wired); if (res != KERN_SUCCESS) return (EFAULT); if (object == NULL) ret = EINVAL; else ret = (object->flags & OBJ_UMTXDEAD) != 0 ? ENOTTY : 0; vm_map_lookup_done(map, entry); return (ret); } static void umtx_shm_init(void) { int i; umtx_shm_reg_zone = uma_zcreate("umtx_shm", sizeof(struct umtx_shm_reg), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mtx_init(&umtx_shm_lock, "umtxshm", NULL, MTX_DEF); for (i = 0; i < nitems(umtx_shm_registry); i++) TAILQ_INIT(&umtx_shm_registry[i]); } static int umtx_shm(struct thread *td, void *addr, u_int flags) { struct umtx_key key; struct umtx_shm_reg *reg; struct file *fp; int error, fd; if (__bitcount(flags & (UMTX_SHM_CREAT | UMTX_SHM_LOOKUP | UMTX_SHM_DESTROY| UMTX_SHM_ALIVE)) != 1) return (EINVAL); if ((flags & UMTX_SHM_ALIVE) != 0) return (umtx_shm_alive(td, addr)); error = umtx_key_get(addr, TYPE_SHM, PROCESS_SHARE, &key); if (error != 0) return (error); KASSERT(key.shared == 1, ("non-shared key")); if ((flags & UMTX_SHM_CREAT) != 0) { error = umtx_shm_create_reg(td, &key, ®); } else { reg = umtx_shm_find_reg(&key); if (reg == NULL) error = ESRCH; } umtx_key_release(&key); if (error != 0) return (error); KASSERT(reg != NULL, ("no reg")); if ((flags & UMTX_SHM_DESTROY) != 0) { umtx_shm_unref_reg(reg, true); } else { #if 0 #ifdef MAC error = mac_posixshm_check_open(td->td_ucred, reg->ushm_obj, FFLAGS(O_RDWR)); if (error == 0) #endif error = shm_access(reg->ushm_obj, td->td_ucred, FFLAGS(O_RDWR)); if (error == 0) #endif error = falloc_caps(td, &fp, &fd, O_CLOEXEC, NULL); if (error == 0) { shm_hold(reg->ushm_obj); finit(fp, FFLAGS(O_RDWR), DTYPE_SHM, reg->ushm_obj, &shm_ops); td->td_retval[0] = fd; fdrop(fp, td); } } umtx_shm_unref_reg(reg, false); return (error); } static int __umtx_op_shm(struct thread *td, struct _umtx_op_args *uap) { return (umtx_shm(td, uap->uaddr1, uap->val)); } +static int +umtx_robust_lists(struct thread *td, struct umtx_robust_lists_params *rbp) +{ + + td->td_rb_list = rbp->robust_list_offset; + td->td_rbp_list = rbp->robust_priv_list_offset; + td->td_rb_inact = rbp->robust_inact_offset; + return (0); +} + +static int +__umtx_op_robust_lists(struct thread *td, struct _umtx_op_args *uap) +{ + struct umtx_robust_lists_params rb; + int error; + + if (uap->val > sizeof(rb)) + return (EINVAL); + bzero(&rb, sizeof(rb)); + error = copyin(uap->uaddr1, &rb, uap->val); + if (error != 0) + return (error); + return (umtx_robust_lists(td, &rb)); +} + typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap); static const _umtx_op_func op_table[] = { [UMTX_OP_RESERVED0] = __umtx_op_unimpl, [UMTX_OP_RESERVED1] = __umtx_op_unimpl, [UMTX_OP_WAIT] = __umtx_op_wait, [UMTX_OP_WAKE] = __umtx_op_wake, [UMTX_OP_MUTEX_TRYLOCK] = __umtx_op_trylock_umutex, [UMTX_OP_MUTEX_LOCK] = __umtx_op_lock_umutex, [UMTX_OP_MUTEX_UNLOCK] = __umtx_op_unlock_umutex, [UMTX_OP_SET_CEILING] = __umtx_op_set_ceiling, [UMTX_OP_CV_WAIT] = __umtx_op_cv_wait, [UMTX_OP_CV_SIGNAL] = __umtx_op_cv_signal, [UMTX_OP_CV_BROADCAST] = __umtx_op_cv_broadcast, [UMTX_OP_WAIT_UINT] = __umtx_op_wait_uint, [UMTX_OP_RW_RDLOCK] = __umtx_op_rw_rdlock, [UMTX_OP_RW_WRLOCK] = __umtx_op_rw_wrlock, [UMTX_OP_RW_UNLOCK] = __umtx_op_rw_unlock, [UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private, [UMTX_OP_WAKE_PRIVATE] = __umtx_op_wake_private, [UMTX_OP_MUTEX_WAIT] = __umtx_op_wait_umutex, [UMTX_OP_MUTEX_WAKE] = __umtx_op_wake_umutex, #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10) [UMTX_OP_SEM_WAIT] = __umtx_op_sem_wait, [UMTX_OP_SEM_WAKE] = __umtx_op_sem_wake, #else [UMTX_OP_SEM_WAIT] = __umtx_op_unimpl, [UMTX_OP_SEM_WAKE] = __umtx_op_unimpl, #endif [UMTX_OP_NWAKE_PRIVATE] = __umtx_op_nwake_private, [UMTX_OP_MUTEX_WAKE2] = __umtx_op_wake2_umutex, [UMTX_OP_SEM2_WAIT] = __umtx_op_sem2_wait, [UMTX_OP_SEM2_WAKE] = __umtx_op_sem2_wake, [UMTX_OP_SHM] = __umtx_op_shm, + [UMTX_OP_ROBUST_LISTS] = __umtx_op_robust_lists, }; int sys__umtx_op(struct thread *td, struct _umtx_op_args *uap) { if ((unsigned)uap->op < nitems(op_table)) return (*op_table[uap->op])(td, uap); return (EINVAL); } #ifdef COMPAT_FREEBSD32 struct timespec32 { int32_t tv_sec; int32_t tv_nsec; }; struct umtx_time32 { struct timespec32 timeout; uint32_t flags; uint32_t clockid; }; static inline int umtx_copyin_timeout32(void *addr, struct timespec *tsp) { struct timespec32 ts32; int error; error = copyin(addr, &ts32, sizeof(struct timespec32)); if (error == 0) { if (ts32.tv_sec < 0 || ts32.tv_nsec >= 1000000000 || ts32.tv_nsec < 0) error = EINVAL; else { tsp->tv_sec = ts32.tv_sec; tsp->tv_nsec = ts32.tv_nsec; } } return (error); } static inline int umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp) { struct umtx_time32 t32; int error; t32.clockid = CLOCK_REALTIME; t32.flags = 0; if (size <= sizeof(struct timespec32)) error = copyin(addr, &t32.timeout, sizeof(struct timespec32)); else error = copyin(addr, &t32, sizeof(struct umtx_time32)); if (error != 0) return (error); if (t32.timeout.tv_sec < 0 || t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0) return (EINVAL); tp->_timeout.tv_sec = t32.timeout.tv_sec; tp->_timeout.tv_nsec = t32.timeout.tv_nsec; tp->_flags = t32.flags; tp->_clockid = t32.clockid; return (0); } static int __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time32(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } - return do_wait(td, uap->obj, uap->val, tm_p, 1, 0); + return (do_wait(td, uap->obj, uap->val, tm_p, 1, 0)); } static int __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } - return do_lock_umutex(td, uap->obj, tm_p, 0); + return (do_lock_umutex(td, uap->obj, tm_p, 0)); } static int __umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time32(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } - return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT); + return (do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT)); } static int __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap) { struct timespec *ts, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) ts = NULL; else { error = umtx_copyin_timeout32(uap->uaddr2, &timeout); if (error != 0) return (error); ts = &timeout; } return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val)); } static int __umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) { error = do_rw_rdlock(td, uap->obj, uap->val, 0); } else { error = umtx_copyin_umtx_time32(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); error = do_rw_rdlock(td, uap->obj, uap->val, &timeout); } return (error); } static int __umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) { error = do_rw_wrlock(td, uap->obj, 0); } else { error = umtx_copyin_umtx_time32(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); error = do_rw_wrlock(td, uap->obj, &timeout); } return (error); } static int __umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time32( uap->uaddr2, (size_t)uap->uaddr1,&timeout); if (error != 0) return (error); tm_p = &timeout; } - return do_wait(td, uap->obj, uap->val, tm_p, 1, 1); + return (do_wait(td, uap->obj, uap->val, tm_p, 1, 1)); } #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10) static int __umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time32(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_sem_wait(td, uap->obj, tm_p)); } #endif static int __umtx_op_sem2_wait_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time32(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_sem2_wait(td, uap->obj, tm_p)); } static int __umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap) { - int count = uap->val; - uint32_t uaddrs[BATCH_SIZE]; - uint32_t **upp = (uint32_t **)uap->obj; - int tocopy; - int error = 0; - int i, pos = 0; + uint32_t uaddrs[BATCH_SIZE], **upp; + int count, error, i, pos, tocopy; - while (count > 0) { - tocopy = count; - if (tocopy > BATCH_SIZE) - tocopy = BATCH_SIZE; - error = copyin(upp+pos, uaddrs, tocopy * sizeof(uint32_t)); + upp = (uint32_t **)uap->obj; + error = 0; + for (count = uap->val, pos = 0; count > 0; count -= tocopy, + pos += tocopy) { + tocopy = MIN(count, BATCH_SIZE); + error = copyin(upp + pos, uaddrs, tocopy * sizeof(uint32_t)); if (error != 0) break; for (i = 0; i < tocopy; ++i) kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i], - INT_MAX, 1); - count -= tocopy; - pos += tocopy; + INT_MAX, 1); + maybe_yield(); } return (error); } +struct umtx_robust_lists_params_compat32 { + uint32_t robust_list_offset; + uint32_t robust_priv_list_offset; + uint32_t robust_inact_offset; +}; + +static int +__umtx_op_robust_lists_compat32(struct thread *td, struct _umtx_op_args *uap) +{ + struct umtx_robust_lists_params rb; + struct umtx_robust_lists_params_compat32 rb32; + int error; + + if (uap->val > sizeof(rb32)) + return (EINVAL); + bzero(&rb, sizeof(rb)); + bzero(&rb32, sizeof(rb32)); + error = copyin(uap->uaddr1, &rb32, uap->val); + if (error != 0) + return (error); + rb.robust_list_offset = rb32.robust_list_offset; + rb.robust_priv_list_offset = rb32.robust_priv_list_offset; + rb.robust_inact_offset = rb32.robust_inact_offset; + return (umtx_robust_lists(td, &rb)); +} + static const _umtx_op_func op_table_compat32[] = { [UMTX_OP_RESERVED0] = __umtx_op_unimpl, [UMTX_OP_RESERVED1] = __umtx_op_unimpl, - [UMTX_OP_WAIT] = __umtx_op_wait_compat32, - [UMTX_OP_WAKE] = __umtx_op_wake, + [UMTX_OP_WAIT] = __umtx_op_wait_compat32, + [UMTX_OP_WAKE] = __umtx_op_wake, [UMTX_OP_MUTEX_TRYLOCK] = __umtx_op_trylock_umutex, [UMTX_OP_MUTEX_LOCK] = __umtx_op_lock_umutex_compat32, [UMTX_OP_MUTEX_UNLOCK] = __umtx_op_unlock_umutex, [UMTX_OP_SET_CEILING] = __umtx_op_set_ceiling, [UMTX_OP_CV_WAIT] = __umtx_op_cv_wait_compat32, [UMTX_OP_CV_SIGNAL] = __umtx_op_cv_signal, [UMTX_OP_CV_BROADCAST] = __umtx_op_cv_broadcast, [UMTX_OP_WAIT_UINT] = __umtx_op_wait_compat32, [UMTX_OP_RW_RDLOCK] = __umtx_op_rw_rdlock_compat32, [UMTX_OP_RW_WRLOCK] = __umtx_op_rw_wrlock_compat32, [UMTX_OP_RW_UNLOCK] = __umtx_op_rw_unlock, [UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private_compat32, [UMTX_OP_WAKE_PRIVATE] = __umtx_op_wake_private, [UMTX_OP_MUTEX_WAIT] = __umtx_op_wait_umutex_compat32, [UMTX_OP_MUTEX_WAKE] = __umtx_op_wake_umutex, #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10) [UMTX_OP_SEM_WAIT] = __umtx_op_sem_wait_compat32, [UMTX_OP_SEM_WAKE] = __umtx_op_sem_wake, #else [UMTX_OP_SEM_WAIT] = __umtx_op_unimpl, [UMTX_OP_SEM_WAKE] = __umtx_op_unimpl, #endif [UMTX_OP_NWAKE_PRIVATE] = __umtx_op_nwake_private32, [UMTX_OP_MUTEX_WAKE2] = __umtx_op_wake2_umutex, [UMTX_OP_SEM2_WAIT] = __umtx_op_sem2_wait_compat32, [UMTX_OP_SEM2_WAKE] = __umtx_op_sem2_wake, [UMTX_OP_SHM] = __umtx_op_shm, + [UMTX_OP_ROBUST_LISTS] = __umtx_op_robust_lists_compat32, }; int freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap) { if ((unsigned)uap->op < nitems(op_table_compat32)) { return (*op_table_compat32[uap->op])(td, (struct _umtx_op_args *)uap); } return (EINVAL); } #endif void umtx_thread_init(struct thread *td) { + td->td_umtxq = umtxq_alloc(); td->td_umtxq->uq_thread = td; } void umtx_thread_fini(struct thread *td) { + umtxq_free(td->td_umtxq); } /* * It will be called when new thread is created, e.g fork(). */ void umtx_thread_alloc(struct thread *td) { struct umtx_q *uq; uq = td->td_umtxq; uq->uq_inherited_pri = PRI_MAX; KASSERT(uq->uq_flags == 0, ("uq_flags != 0")); KASSERT(uq->uq_thread == td, ("uq_thread != td")); KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL")); KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty")); } /* * exec() hook. + * + * Clear robust lists for all process' threads, not delaying the + * cleanup to thread_exit hook, since the relevant address space is + * destroyed right now. */ static void -umtx_exec_hook(void *arg __unused, struct proc *p __unused, - struct image_params *imgp __unused) +umtx_exec_hook(void *arg __unused, struct proc *p, + struct image_params *imgp __unused) { - umtx_thread_cleanup(curthread); + struct thread *td; + + KASSERT(p == curproc, ("need curproc")); + PROC_LOCK(p); + KASSERT((p->p_flag & P_HADTHREADS) == 0 || + (p->p_flag & P_STOPPED_SINGLE) != 0, + ("curproc must be single-threaded")); + FOREACH_THREAD_IN_PROC(p, td) { + KASSERT(td == curthread || + ((td->td_flags & TDF_BOUNDARY) != 0 && TD_IS_SUSPENDED(td)), + ("running thread %p %p", p, td)); + PROC_UNLOCK(p); + umtx_thread_cleanup(td); + PROC_LOCK(p); + td->td_rb_list = td->td_rbp_list = td->td_rb_inact = 0; + } + PROC_UNLOCK(p); } /* * thread_exit() hook. */ void umtx_thread_exit(struct thread *td) { + umtx_thread_cleanup(td); } +static int +umtx_read_uptr(struct thread *td, uintptr_t ptr, uintptr_t *res) +{ + u_long res1; +#ifdef COMPAT_FREEBSD32 + uint32_t res32; +#endif + int error; + +#ifdef COMPAT_FREEBSD32 + if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { + error = fueword32((void *)ptr, &res32); + if (error == 0) + res1 = res32; + } else +#endif + { + error = fueword((void *)ptr, &res1); + } + if (error == 0) + *res = res1; + else + error = EFAULT; + return (error); +} + +static void +umtx_read_rb_list(struct thread *td, struct umutex *m, uintptr_t *rb_list) +{ +#ifdef COMPAT_FREEBSD32 + struct umutex32 m32; + + if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { + memcpy(&m32, m, sizeof(m32)); + *rb_list = m32.m_rb_lnk; + } else +#endif + *rb_list = m->m_rb_lnk; +} + +static int +umtx_handle_rb(struct thread *td, uintptr_t rbp, uintptr_t *rb_list, bool inact) +{ + struct umutex m; + int error; + + KASSERT(td->td_proc == curproc, ("need current vmspace")); + error = copyin((void *)rbp, &m, sizeof(m)); + if (error != 0) + return (error); + if (rb_list != NULL) + umtx_read_rb_list(td, &m, rb_list); + if ((m.m_flags & UMUTEX_ROBUST) == 0) + return (EINVAL); + if ((m.m_owner & ~UMUTEX_CONTESTED) != td->td_tid) + /* inact is cleared after unlock, allow the inconsistency */ + return (inact ? 0 : EINVAL); + return (do_unlock_umutex(td, (struct umutex *)rbp, true)); +} + +static void +umtx_cleanup_rb_list(struct thread *td, uintptr_t rb_list, uintptr_t *rb_inact, + const char *name) +{ + int error, i; + uintptr_t rbp; + bool inact; + + if (rb_list == 0) + return; + error = umtx_read_uptr(td, rb_list, &rbp); + for (i = 0; error == 0 && rbp != 0 && i < umtx_max_rb; i++) { + if (rbp == *rb_inact) { + inact = true; + *rb_inact = 0; + } else + inact = false; + error = umtx_handle_rb(td, rbp, &rbp, inact); + } + if (i == umtx_max_rb && umtx_verbose_rb) { + uprintf("comm %s pid %d: reached umtx %smax rb %d\n", + td->td_proc->p_comm, td->td_proc->p_pid, name, umtx_max_rb); + } + if (error != 0 && umtx_verbose_rb) { + uprintf("comm %s pid %d: handling %srb error %d\n", + td->td_proc->p_comm, td->td_proc->p_pid, name, error); + } +} + /* - * clean up umtx data. + * Clean up umtx data. */ static void umtx_thread_cleanup(struct thread *td) { struct umtx_q *uq; struct umtx_pi *pi; + uintptr_t rb_inact; - if ((uq = td->td_umtxq) == NULL) - return; - - mtx_lock(&umtx_lock); - uq->uq_inherited_pri = PRI_MAX; - while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) { - pi->pi_owner = NULL; - TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link); + /* + * Disown pi mutexes. + */ + uq = td->td_umtxq; + if (uq != NULL) { + mtx_lock(&umtx_lock); + uq->uq_inherited_pri = PRI_MAX; + while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) { + pi->pi_owner = NULL; + TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link); + } + mtx_unlock(&umtx_lock); + thread_lock(td); + sched_lend_user_prio(td, PRI_MAX); + thread_unlock(td); } - mtx_unlock(&umtx_lock); - thread_lock(td); - sched_lend_user_prio(td, PRI_MAX); - thread_unlock(td); + + /* + * Handle terminated robust mutexes. Must be done after + * robust pi disown, otherwise unlock could see unowned + * entries. + */ + rb_inact = td->td_rb_inact; + if (rb_inact != 0) + (void)umtx_read_uptr(td, rb_inact, &rb_inact); + umtx_cleanup_rb_list(td, td->td_rb_list, &rb_inact, ""); + umtx_cleanup_rb_list(td, td->td_rbp_list, &rb_inact, "priv "); + if (rb_inact != 0) + (void)umtx_handle_rb(td, rb_inact, NULL, true); } Index: head/sys/sys/_umtx.h =================================================================== --- head/sys/sys/_umtx.h (revision 300042) +++ head/sys/sys/_umtx.h (revision 300043) @@ -1,75 +1,79 @@ /*- * Copyright (c) 2010, David Xu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ * */ #ifndef _SYS__UMTX_H_ #define _SYS__UMTX_H_ #include #include struct umutex { volatile __lwpid_t m_owner; /* Owner of the mutex */ __uint32_t m_flags; /* Flags of the mutex */ __uint32_t m_ceilings[2]; /* Priority protect ceiling */ - __uint32_t m_spare[4]; + __uintptr_t m_rb_lnk; /* Robust linkage */ +#ifndef __LP64__ + __uint32_t m_pad; +#endif + __uint32_t m_spare[2]; }; struct ucond { volatile __uint32_t c_has_waiters; /* Has waiters in kernel */ __uint32_t c_flags; /* Flags of the condition variable */ __uint32_t c_clockid; /* Clock id */ __uint32_t c_spare[1]; /* Spare space */ }; struct urwlock { volatile __int32_t rw_state; __uint32_t rw_flags; __uint32_t rw_blocked_readers; __uint32_t rw_blocked_writers; __uint32_t rw_spare[4]; }; struct _usem { volatile __uint32_t _has_waiters; volatile __uint32_t _count; __uint32_t _flags; }; struct _usem2 { volatile __uint32_t _count; /* Waiters flag in high bit. */ __uint32_t _flags; }; struct _umtx_time { struct timespec _timeout; __uint32_t _flags; __uint32_t _clockid; }; #endif /* !_SYS__UMTX_H_ */ Index: head/sys/sys/proc.h =================================================================== --- head/sys/sys/proc.h (revision 300042) +++ head/sys/sys/proc.h (revision 300043) @@ -1,1067 +1,1070 @@ /*- * Copyright (c) 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)proc.h 8.15 (Berkeley) 5/19/95 * $FreeBSD$ */ #ifndef _SYS_PROC_H_ #define _SYS_PROC_H_ #include /* For struct callout. */ #include /* For struct klist. */ #include #ifndef _KERNEL #include #endif #include #include #include #include #include #include #include /* XXX. */ #include #include #include #include #include #ifndef _KERNEL #include /* For structs itimerval, timeval. */ #else #include #endif #include #include #include #include /* Machine-dependent proc substruct. */ /* * One structure allocated per session. * * List of locks * (m) locked by s_mtx mtx * (e) locked by proctree_lock sx * (c) const until freeing */ struct session { u_int s_count; /* Ref cnt; pgrps in session - atomic. */ struct proc *s_leader; /* (m + e) Session leader. */ struct vnode *s_ttyvp; /* (m) Vnode of controlling tty. */ struct cdev_priv *s_ttydp; /* (m) Device of controlling tty. */ struct tty *s_ttyp; /* (e) Controlling tty. */ pid_t s_sid; /* (c) Session ID. */ /* (m) Setlogin() name: */ char s_login[roundup(MAXLOGNAME, sizeof(long))]; struct mtx s_mtx; /* Mutex to protect members. */ }; /* * One structure allocated per process group. * * List of locks * (m) locked by pg_mtx mtx * (e) locked by proctree_lock sx * (c) const until freeing */ struct pgrp { LIST_ENTRY(pgrp) pg_hash; /* (e) Hash chain. */ LIST_HEAD(, proc) pg_members; /* (m + e) Pointer to pgrp members. */ struct session *pg_session; /* (c) Pointer to session. */ struct sigiolst pg_sigiolst; /* (m) List of sigio sources. */ pid_t pg_id; /* (c) Process group id. */ int pg_jobc; /* (m) Job control process count. */ struct mtx pg_mtx; /* Mutex to protect members */ }; /* * pargs, used to hold a copy of the command line, if it had a sane length. */ struct pargs { u_int ar_ref; /* Reference count. */ u_int ar_length; /* Length. */ u_char ar_args[1]; /* Arguments. */ }; /*- * Description of a process. * * This structure contains the information needed to manage a thread of * control, known in UN*X as a process; it has references to substructures * containing descriptions of things that the process uses, but may share * with related processes. The process structure and the substructures * are always addressable except for those marked "(CPU)" below, * which might be addressable only on a processor on which the process * is running. * * Below is a key of locks used to protect each member of struct proc. The * lock is indicated by a reference to a specific character in parens in the * associated comment. * * - not yet protected * a - only touched by curproc or parent during fork/wait * b - created at fork, never changes * (exception aiods switch vmspaces, but they are also * marked 'P_SYSTEM' so hopefully it will be left alone) * c - locked by proc mtx * d - locked by allproc_lock lock * e - locked by proctree_lock lock * f - session mtx * g - process group mtx * h - callout_lock mtx * i - by curproc or the master session mtx * j - locked by proc slock * k - only accessed by curthread * k*- only accessed by curthread and from an interrupt * l - the attaching proc or attaching proc parent * m - Giant * n - not locked, lazy * o - ktrace lock * q - td_contested lock * r - p_peers lock * t - thread lock * u - process stat lock * w - process timer lock * x - created at fork, only changes during single threading in exec * y - created at first aio, doesn't change until exit or exec at which * point we are single-threaded and only curthread changes it * z - zombie threads lock * * If the locking key specifies two identifiers (for example, p_pptr) then * either lock is sufficient for read access, but both locks must be held * for write access. */ struct cpuset; struct filecaps; struct filemon; struct kaioinfo; struct kaudit_record; struct kdtrace_proc; struct kdtrace_thread; struct mqueue_notifier; struct nlminfo; struct p_sched; struct proc; struct procdesc; struct racct; struct sbuf; struct sleepqueue; struct syscall_args; struct td_sched; struct thread; struct trapframe; struct turnstile; /* * XXX: Does this belong in resource.h or resourcevar.h instead? * Resource usage extension. The times in rusage structs in the kernel are * never up to date. The actual times are kept as runtimes and tick counts * (with control info in the "previous" times), and are converted when * userland asks for rusage info. Backwards compatibility prevents putting * this directly in the user-visible rusage struct. * * Locking for p_rux: (cu) means (u) for p_rux and (c) for p_crux. * Locking for td_rux: (t) for all fields. */ struct rusage_ext { uint64_t rux_runtime; /* (cu) Real time. */ uint64_t rux_uticks; /* (cu) Statclock hits in user mode. */ uint64_t rux_sticks; /* (cu) Statclock hits in sys mode. */ uint64_t rux_iticks; /* (cu) Statclock hits in intr mode. */ uint64_t rux_uu; /* (c) Previous user time in usec. */ uint64_t rux_su; /* (c) Previous sys time in usec. */ uint64_t rux_tu; /* (c) Previous total time in usec. */ }; /* * Kernel runnable context (thread). * This is what is put to sleep and reactivated. * Thread context. Processes may have multiple threads. */ struct thread { struct mtx *volatile td_lock; /* replaces sched lock */ struct proc *td_proc; /* (*) Associated process. */ TAILQ_ENTRY(thread) td_plist; /* (*) All threads in this proc. */ TAILQ_ENTRY(thread) td_runq; /* (t) Run queue. */ TAILQ_ENTRY(thread) td_slpq; /* (t) Sleep queue. */ TAILQ_ENTRY(thread) td_lockq; /* (t) Lock queue. */ LIST_ENTRY(thread) td_hash; /* (d) Hash chain. */ struct cpuset *td_cpuset; /* (t) CPU affinity mask. */ struct seltd *td_sel; /* Select queue/channel. */ struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */ struct turnstile *td_turnstile; /* (k) Associated turnstile. */ struct rl_q_entry *td_rlqe; /* (k) Associated range lock entry. */ struct umtx_q *td_umtxq; /* (c?) Link for when we're blocked. */ struct vm_domain_policy td_vm_dom_policy; /* (c) current numa domain policy */ lwpid_t td_tid; /* (b) Thread ID. */ sigqueue_t td_sigqueue; /* (c) Sigs arrived, not delivered. */ #define td_siglist td_sigqueue.sq_signals u_char td_lend_user_pri; /* (t) Lend user pri. */ /* Cleared during fork1() */ #define td_startzero td_flags int td_flags; /* (t) TDF_* flags. */ int td_inhibitors; /* (t) Why can not run. */ int td_pflags; /* (k) Private thread (TDP_*) flags. */ int td_dupfd; /* (k) Ret value from fdopen. XXX */ int td_sqqueue; /* (t) Sleepqueue queue blocked on. */ void *td_wchan; /* (t) Sleep address. */ const char *td_wmesg; /* (t) Reason for sleep. */ volatile u_char td_owepreempt; /* (k*) Preempt on last critical_exit */ u_char td_tsqueue; /* (t) Turnstile queue blocked on. */ short td_locks; /* (k) Debug: count of non-spin locks */ short td_rw_rlocks; /* (k) Count of rwlock read locks. */ short td_lk_slocks; /* (k) Count of lockmgr shared locks. */ short td_stopsched; /* (k) Scheduler stopped. */ struct turnstile *td_blocked; /* (t) Lock thread is blocked on. */ const char *td_lockname; /* (t) Name of lock blocked on. */ LIST_HEAD(, turnstile) td_contested; /* (q) Contested locks. */ struct lock_list_entry *td_sleeplocks; /* (k) Held sleep locks. */ int td_intr_nesting_level; /* (k) Interrupt recursion. */ int td_pinned; /* (k) Temporary cpu pin count. */ struct ucred *td_ucred; /* (k) Reference to credentials. */ struct plimit *td_limit; /* (k) Resource limits. */ int td_slptick; /* (t) Time at sleep. */ int td_blktick; /* (t) Time spent blocked. */ int td_swvoltick; /* (t) Time at last SW_VOL switch. */ int td_swinvoltick; /* (t) Time at last SW_INVOL switch. */ u_int td_cow; /* (*) Number of copy-on-write faults */ struct rusage td_ru; /* (t) rusage information. */ struct rusage_ext td_rux; /* (t) Internal rusage information. */ uint64_t td_incruntime; /* (t) Cpu ticks to transfer to proc. */ uint64_t td_runtime; /* (t) How many cpu ticks we've run. */ u_int td_pticks; /* (t) Statclock hits for profiling */ u_int td_sticks; /* (t) Statclock hits in system mode. */ u_int td_iticks; /* (t) Statclock hits in intr mode. */ u_int td_uticks; /* (t) Statclock hits in user mode. */ int td_intrval; /* (t) Return value for sleepq. */ sigset_t td_oldsigmask; /* (k) Saved mask from pre sigpause. */ volatile u_int td_generation; /* (k) For detection of preemption */ stack_t td_sigstk; /* (k) Stack ptr and on-stack flag. */ int td_xsig; /* (c) Signal for ptrace */ u_long td_profil_addr; /* (k) Temporary addr until AST. */ u_int td_profil_ticks; /* (k) Temporary ticks until AST. */ char td_name[MAXCOMLEN + 1]; /* (*) Thread name. */ struct file *td_fpop; /* (k) file referencing cdev under op */ int td_dbgflags; /* (c) Userland debugger flags */ struct ksiginfo td_dbgksi; /* (c) ksi reflected to debugger. */ int td_ng_outbound; /* (k) Thread entered ng from above. */ struct osd td_osd; /* (k) Object specific data. */ struct vm_map_entry *td_map_def_user; /* (k) Deferred entries. */ pid_t td_dbg_forked; /* (c) Child pid for debugger. */ u_int td_vp_reserv; /* (k) Count of reserved vnodes. */ int td_no_sleeping; /* (k) Sleeping disabled count. */ int td_dom_rr_idx; /* (k) RR Numa domain selection. */ void *td_su; /* (k) FFS SU private */ + uintptr_t td_rb_list; /* (k) Robust list head. */ + uintptr_t td_rbp_list; /* (k) Robust priv list head. */ + uintptr_t td_rb_inact; /* (k) Current in-action mutex loc. */ #define td_endzero td_sigmask /* Copied during fork1() or create_thread(). */ #define td_startcopy td_endzero sigset_t td_sigmask; /* (c) Current signal mask. */ u_char td_rqindex; /* (t) Run queue index. */ u_char td_base_pri; /* (t) Thread base kernel priority. */ u_char td_priority; /* (t) Thread active priority. */ u_char td_pri_class; /* (t) Scheduling class. */ u_char td_user_pri; /* (t) User pri from estcpu and nice. */ u_char td_base_user_pri; /* (t) Base user pri */ u_int td_dbg_sc_code; /* (c) Syscall code to debugger. */ u_int td_dbg_sc_narg; /* (c) Syscall arg count to debugger.*/ #define td_endcopy td_pcb /* * Fields that must be manually set in fork1() or create_thread() * or already have been set in the allocator, constructor, etc. */ struct pcb *td_pcb; /* (k) Kernel VA of pcb and kstack. */ enum { TDS_INACTIVE = 0x0, TDS_INHIBITED, TDS_CAN_RUN, TDS_RUNQ, TDS_RUNNING } td_state; /* (t) thread state */ union { register_t tdu_retval[2]; off_t tdu_off; } td_uretoff; /* (k) Syscall aux returns. */ #define td_retval td_uretoff.tdu_retval u_int td_cowgen; /* (k) Generation of COW pointers. */ struct callout td_slpcallout; /* (h) Callout for sleep. */ struct trapframe *td_frame; /* (k) */ struct vm_object *td_kstack_obj;/* (a) Kstack object. */ vm_offset_t td_kstack; /* (a) Kernel VA of kstack. */ int td_kstack_pages; /* (a) Size of the kstack. */ volatile u_int td_critnest; /* (k*) Critical section nest level. */ struct mdthread td_md; /* (k) Any machine-dependent fields. */ struct td_sched *td_sched; /* (*) Scheduler-specific data. */ struct kaudit_record *td_ar; /* (k) Active audit record, if any. */ struct lpohead td_lprof[2]; /* (a) lock profiling objects. */ struct kdtrace_thread *td_dtrace; /* (*) DTrace-specific data. */ int td_errno; /* Error returned by last syscall. */ struct vnet *td_vnet; /* (k) Effective vnet. */ const char *td_vnet_lpush; /* (k) Debugging vnet push / pop. */ struct trapframe *td_intr_frame;/* (k) Frame of the current irq */ struct proc *td_rfppwait_p; /* (k) The vforked child */ struct vm_page **td_ma; /* (k) uio pages held */ int td_ma_cnt; /* (k) size of *td_ma */ void *td_emuldata; /* Emulator state data */ int td_lastcpu; /* (t) Last cpu we were on. */ int td_oncpu; /* (t) Which cpu we are on. */ }; struct mtx *thread_lock_block(struct thread *); void thread_lock_unblock(struct thread *, struct mtx *); void thread_lock_set(struct thread *, struct mtx *); #define THREAD_LOCK_ASSERT(td, type) \ do { \ struct mtx *__m = (td)->td_lock; \ if (__m != &blocked_lock) \ mtx_assert(__m, (type)); \ } while (0) #ifdef INVARIANTS #define THREAD_LOCKPTR_ASSERT(td, lock) \ do { \ struct mtx *__m = (td)->td_lock; \ KASSERT((__m == &blocked_lock || __m == (lock)), \ ("Thread %p lock %p does not match %p", td, __m, (lock))); \ } while (0) #define TD_LOCKS_INC(td) ((td)->td_locks++) #define TD_LOCKS_DEC(td) ((td)->td_locks--) #else #define THREAD_LOCKPTR_ASSERT(td, lock) #define TD_LOCKS_INC(td) #define TD_LOCKS_DEC(td) #endif /* * Flags kept in td_flags: * To change these you MUST have the scheduler lock. */ #define TDF_BORROWING 0x00000001 /* Thread is borrowing pri from another. */ #define TDF_INPANIC 0x00000002 /* Caused a panic, let it drive crashdump. */ #define TDF_INMEM 0x00000004 /* Thread's stack is in memory. */ #define TDF_SINTR 0x00000008 /* Sleep is interruptible. */ #define TDF_TIMEOUT 0x00000010 /* Timing out during sleep. */ #define TDF_IDLETD 0x00000020 /* This is a per-CPU idle thread. */ #define TDF_CANSWAP 0x00000040 /* Thread can be swapped. */ #define TDF_SLEEPABORT 0x00000080 /* sleepq_abort was called. */ #define TDF_KTH_SUSP 0x00000100 /* kthread is suspended */ #define TDF_ALLPROCSUSP 0x00000200 /* suspended by SINGLE_ALLPROC */ #define TDF_BOUNDARY 0x00000400 /* Thread suspended at user boundary */ #define TDF_ASTPENDING 0x00000800 /* Thread has some asynchronous events. */ #define TDF_TIMOFAIL 0x00001000 /* Timeout from sleep after we were awake. */ #define TDF_SBDRY 0x00002000 /* Stop only on usermode boundary. */ #define TDF_UPIBLOCKED 0x00004000 /* Thread blocked on user PI mutex. */ #define TDF_NEEDSUSPCHK 0x00008000 /* Thread may need to suspend. */ #define TDF_NEEDRESCHED 0x00010000 /* Thread needs to yield. */ #define TDF_NEEDSIGCHK 0x00020000 /* Thread may need signal delivery. */ #define TDF_NOLOAD 0x00040000 /* Ignore during load avg calculations. */ #define TDF_UNUSED19 0x00080000 /* --available-- */ #define TDF_THRWAKEUP 0x00100000 /* Libthr thread must not suspend itself. */ #define TDF_UNUSED21 0x00200000 /* --available-- */ #define TDF_SWAPINREQ 0x00400000 /* Swapin request due to wakeup. */ #define TDF_UNUSED23 0x00800000 /* --available-- */ #define TDF_SCHED0 0x01000000 /* Reserved for scheduler private use */ #define TDF_SCHED1 0x02000000 /* Reserved for scheduler private use */ #define TDF_SCHED2 0x04000000 /* Reserved for scheduler private use */ #define TDF_SCHED3 0x08000000 /* Reserved for scheduler private use */ #define TDF_ALRMPEND 0x10000000 /* Pending SIGVTALRM needs to be posted. */ #define TDF_PROFPEND 0x20000000 /* Pending SIGPROF needs to be posted. */ #define TDF_MACPEND 0x40000000 /* AST-based MAC event pending. */ /* Userland debug flags */ #define TDB_SUSPEND 0x00000001 /* Thread is suspended by debugger */ #define TDB_XSIG 0x00000002 /* Thread is exchanging signal under trace */ #define TDB_USERWR 0x00000004 /* Debugger modified memory or registers */ #define TDB_SCE 0x00000008 /* Thread performs syscall enter */ #define TDB_SCX 0x00000010 /* Thread performs syscall exit */ #define TDB_EXEC 0x00000020 /* TDB_SCX from exec(2) family */ #define TDB_FORK 0x00000040 /* TDB_SCX from fork(2) that created new process */ #define TDB_STOPATFORK 0x00000080 /* Stop at the return from fork (child only) */ #define TDB_CHILD 0x00000100 /* New child indicator for ptrace() */ #define TDB_BORN 0x00000200 /* New LWP indicator for ptrace() */ #define TDB_EXIT 0x00000400 /* Exiting LWP indicator for ptrace() */ /* * "Private" flags kept in td_pflags: * These are only written by curthread and thus need no locking. */ #define TDP_OLDMASK 0x00000001 /* Need to restore mask after suspend. */ #define TDP_INKTR 0x00000002 /* Thread is currently in KTR code. */ #define TDP_INKTRACE 0x00000004 /* Thread is currently in KTRACE code. */ #define TDP_BUFNEED 0x00000008 /* Do not recurse into the buf flush */ #define TDP_COWINPROGRESS 0x00000010 /* Snapshot copy-on-write in progress. */ #define TDP_ALTSTACK 0x00000020 /* Have alternate signal stack. */ #define TDP_DEADLKTREAT 0x00000040 /* Lock acquisition - deadlock treatment. */ #define TDP_NOFAULTING 0x00000080 /* Do not handle page faults. */ #define TDP_UNUSED9 0x00000100 /* --available-- */ #define TDP_OWEUPC 0x00000200 /* Call addupc() at next AST. */ #define TDP_ITHREAD 0x00000400 /* Thread is an interrupt thread. */ #define TDP_SYNCIO 0x00000800 /* Local override, disable async i/o. */ #define TDP_SCHED1 0x00001000 /* Reserved for scheduler private use */ #define TDP_SCHED2 0x00002000 /* Reserved for scheduler private use */ #define TDP_SCHED3 0x00004000 /* Reserved for scheduler private use */ #define TDP_SCHED4 0x00008000 /* Reserved for scheduler private use */ #define TDP_GEOM 0x00010000 /* Settle GEOM before finishing syscall */ #define TDP_SOFTDEP 0x00020000 /* Stuck processing softdep worklist */ #define TDP_NORUNNINGBUF 0x00040000 /* Ignore runningbufspace check */ #define TDP_WAKEUP 0x00080000 /* Don't sleep in umtx cond_wait */ #define TDP_INBDFLUSH 0x00100000 /* Already in BO_BDFLUSH, do not recurse */ #define TDP_KTHREAD 0x00200000 /* This is an official kernel thread */ #define TDP_CALLCHAIN 0x00400000 /* Capture thread's callchain */ #define TDP_IGNSUSP 0x00800000 /* Permission to ignore the MNTK_SUSPEND* */ #define TDP_AUDITREC 0x01000000 /* Audit record pending on thread */ #define TDP_RFPPWAIT 0x02000000 /* Handle RFPPWAIT on syscall exit */ #define TDP_RESETSPUR 0x04000000 /* Reset spurious page fault history. */ #define TDP_NERRNO 0x08000000 /* Last errno is already in td_errno */ #define TDP_UIOHELD 0x10000000 /* Current uio has pages held in td_ma */ #define TDP_FORKING 0x20000000 /* Thread is being created through fork() */ #define TDP_EXECVMSPC 0x40000000 /* Execve destroyed old vmspace */ /* * Reasons that the current thread can not be run yet. * More than one may apply. */ #define TDI_SUSPENDED 0x0001 /* On suspension queue. */ #define TDI_SLEEPING 0x0002 /* Actually asleep! (tricky). */ #define TDI_SWAPPED 0x0004 /* Stack not in mem. Bad juju if run. */ #define TDI_LOCK 0x0008 /* Stopped on a lock. */ #define TDI_IWAIT 0x0010 /* Awaiting interrupt. */ #define TD_IS_SLEEPING(td) ((td)->td_inhibitors & TDI_SLEEPING) #define TD_ON_SLEEPQ(td) ((td)->td_wchan != NULL) #define TD_IS_SUSPENDED(td) ((td)->td_inhibitors & TDI_SUSPENDED) #define TD_IS_SWAPPED(td) ((td)->td_inhibitors & TDI_SWAPPED) #define TD_ON_LOCK(td) ((td)->td_inhibitors & TDI_LOCK) #define TD_AWAITING_INTR(td) ((td)->td_inhibitors & TDI_IWAIT) #define TD_IS_RUNNING(td) ((td)->td_state == TDS_RUNNING) #define TD_ON_RUNQ(td) ((td)->td_state == TDS_RUNQ) #define TD_CAN_RUN(td) ((td)->td_state == TDS_CAN_RUN) #define TD_IS_INHIBITED(td) ((td)->td_state == TDS_INHIBITED) #define TD_ON_UPILOCK(td) ((td)->td_flags & TDF_UPIBLOCKED) #define TD_IS_IDLETHREAD(td) ((td)->td_flags & TDF_IDLETD) #define TD_SET_INHIB(td, inhib) do { \ (td)->td_state = TDS_INHIBITED; \ (td)->td_inhibitors |= (inhib); \ } while (0) #define TD_CLR_INHIB(td, inhib) do { \ if (((td)->td_inhibitors & (inhib)) && \ (((td)->td_inhibitors &= ~(inhib)) == 0)) \ (td)->td_state = TDS_CAN_RUN; \ } while (0) #define TD_SET_SLEEPING(td) TD_SET_INHIB((td), TDI_SLEEPING) #define TD_SET_SWAPPED(td) TD_SET_INHIB((td), TDI_SWAPPED) #define TD_SET_LOCK(td) TD_SET_INHIB((td), TDI_LOCK) #define TD_SET_SUSPENDED(td) TD_SET_INHIB((td), TDI_SUSPENDED) #define TD_SET_IWAIT(td) TD_SET_INHIB((td), TDI_IWAIT) #define TD_SET_EXITING(td) TD_SET_INHIB((td), TDI_EXITING) #define TD_CLR_SLEEPING(td) TD_CLR_INHIB((td), TDI_SLEEPING) #define TD_CLR_SWAPPED(td) TD_CLR_INHIB((td), TDI_SWAPPED) #define TD_CLR_LOCK(td) TD_CLR_INHIB((td), TDI_LOCK) #define TD_CLR_SUSPENDED(td) TD_CLR_INHIB((td), TDI_SUSPENDED) #define TD_CLR_IWAIT(td) TD_CLR_INHIB((td), TDI_IWAIT) #define TD_SET_RUNNING(td) (td)->td_state = TDS_RUNNING #define TD_SET_RUNQ(td) (td)->td_state = TDS_RUNQ #define TD_SET_CAN_RUN(td) (td)->td_state = TDS_CAN_RUN /* * Process structure. */ struct proc { LIST_ENTRY(proc) p_list; /* (d) List of all processes. */ TAILQ_HEAD(, thread) p_threads; /* (c) all threads. */ struct mtx p_slock; /* process spin lock */ struct ucred *p_ucred; /* (c) Process owner's identity. */ struct filedesc *p_fd; /* (b) Open files. */ struct filedesc_to_leader *p_fdtol; /* (b) Tracking node */ struct pstats *p_stats; /* (b) Accounting/statistics (CPU). */ struct plimit *p_limit; /* (c) Resource limits. */ struct callout p_limco; /* (c) Limit callout handle */ struct sigacts *p_sigacts; /* (x) Signal actions, state (CPU). */ int p_flag; /* (c) P_* flags. */ int p_flag2; /* (c) P2_* flags. */ enum { PRS_NEW = 0, /* In creation */ PRS_NORMAL, /* threads can be run. */ PRS_ZOMBIE } p_state; /* (j/c) Process status. */ pid_t p_pid; /* (b) Process identifier. */ LIST_ENTRY(proc) p_hash; /* (d) Hash chain. */ LIST_ENTRY(proc) p_pglist; /* (g + e) List of processes in pgrp. */ struct proc *p_pptr; /* (c + e) Pointer to parent process. */ LIST_ENTRY(proc) p_sibling; /* (e) List of sibling processes. */ LIST_HEAD(, proc) p_children; /* (e) Pointer to list of children. */ struct proc *p_reaper; /* (e) My reaper. */ LIST_HEAD(, proc) p_reaplist; /* (e) List of my descendants (if I am reaper). */ LIST_ENTRY(proc) p_reapsibling; /* (e) List of siblings - descendants of the same reaper. */ struct mtx p_mtx; /* (n) Lock for this struct. */ struct mtx p_statmtx; /* Lock for the stats */ struct mtx p_itimmtx; /* Lock for the virt/prof timers */ struct mtx p_profmtx; /* Lock for the profiling */ struct ksiginfo *p_ksi; /* Locked by parent proc lock */ sigqueue_t p_sigqueue; /* (c) Sigs not delivered to a td. */ #define p_siglist p_sigqueue.sq_signals /* The following fields are all zeroed upon creation in fork. */ #define p_startzero p_oppid pid_t p_oppid; /* (c + e) Save ppid in ptrace. XXX */ struct vmspace *p_vmspace; /* (b) Address space. */ u_int p_swtick; /* (c) Tick when swapped in or out. */ u_int p_cowgen; /* (c) Generation of COW pointers. */ struct itimerval p_realtimer; /* (c) Alarm timer. */ struct rusage p_ru; /* (a) Exit information. */ struct rusage_ext p_rux; /* (cu) Internal resource usage. */ struct rusage_ext p_crux; /* (c) Internal child resource usage. */ int p_profthreads; /* (c) Num threads in addupc_task. */ volatile int p_exitthreads; /* (j) Number of threads exiting */ int p_traceflag; /* (o) Kernel trace points. */ struct vnode *p_tracevp; /* (c + o) Trace to vnode. */ struct ucred *p_tracecred; /* (o) Credentials to trace with. */ struct vnode *p_textvp; /* (b) Vnode of executable. */ u_int p_lock; /* (c) Proclock (prevent swap) count. */ struct sigiolst p_sigiolst; /* (c) List of sigio sources. */ int p_sigparent; /* (c) Signal to parent on exit. */ int p_sig; /* (n) For core dump/debugger XXX. */ u_long p_code; /* (n) For core dump/debugger XXX. */ u_int p_stops; /* (c) Stop event bitmask. */ u_int p_stype; /* (c) Stop event type. */ char p_step; /* (c) Process is stopped. */ u_char p_pfsflags; /* (c) Procfs flags. */ struct nlminfo *p_nlminfo; /* (?) Only used by/for lockd. */ struct kaioinfo *p_aioinfo; /* (y) ASYNC I/O info. */ struct thread *p_singlethread;/* (c + j) If single threading this is it */ int p_suspcount; /* (j) Num threads in suspended mode. */ struct thread *p_xthread; /* (c) Trap thread */ int p_boundary_count;/* (j) Num threads at user boundary */ int p_pendingcnt; /* how many signals are pending */ struct itimers *p_itimers; /* (c) POSIX interval timers. */ struct procdesc *p_procdesc; /* (e) Process descriptor, if any. */ u_int p_treeflag; /* (e) P_TREE flags */ int p_pendingexits; /* (c) Count of pending thread exits. */ struct filemon *p_filemon; /* (c) filemon-specific data. */ /* End area that is zeroed on creation. */ #define p_endzero p_magic /* The following fields are all copied upon creation in fork. */ #define p_startcopy p_endzero u_int p_magic; /* (b) Magic number. */ int p_osrel; /* (x) osreldate for the binary (from ELF note, if any) */ char p_comm[MAXCOMLEN + 1]; /* (b) Process name. */ struct sysentvec *p_sysent; /* (b) Syscall dispatch info. */ struct pargs *p_args; /* (c) Process arguments. */ rlim_t p_cpulimit; /* (c) Current CPU limit in seconds. */ signed char p_nice; /* (c) Process "nice" value. */ int p_fibnum; /* in this routing domain XXX MRT */ pid_t p_reapsubtree; /* (e) Pid of the direct child of the reaper which spawned our subtree. */ u_int p_xexit; /* (c) Exit code. */ u_int p_xsig; /* (c) Stop/kill sig. */ /* End area that is copied on creation. */ #define p_endcopy p_xsig struct pgrp *p_pgrp; /* (c + e) Pointer to process group. */ struct knlist p_klist; /* (c) Knotes attached to this proc. */ int p_numthreads; /* (c) Number of threads. */ struct mdproc p_md; /* Any machine-dependent fields. */ struct callout p_itcallout; /* (h + c) Interval timer callout. */ u_short p_acflag; /* (c) Accounting flags. */ struct proc *p_peers; /* (r) */ struct proc *p_leader; /* (b) */ void *p_emuldata; /* (c) Emulator state data. */ struct label *p_label; /* (*) Proc (not subject) MAC label. */ struct p_sched *p_sched; /* (*) Scheduler-specific data. */ STAILQ_HEAD(, ktr_request) p_ktr; /* (o) KTR event queue. */ LIST_HEAD(, mqueue_notifier) p_mqnotifier; /* (c) mqueue notifiers.*/ struct kdtrace_proc *p_dtrace; /* (*) DTrace-specific data. */ struct cv p_pwait; /* (*) wait cv for exit/exec. */ struct cv p_dbgwait; /* (*) wait cv for debugger attach after fork. */ uint64_t p_prev_runtime; /* (c) Resource usage accounting. */ struct racct *p_racct; /* (b) Resource accounting. */ int p_throttled; /* (c) Flag for racct pcpu throttling */ struct vm_domain_policy p_vm_dom_policy; /* (c) process default VM domain, or -1 */ /* * An orphan is the child that has beed re-parented to the * debugger as a result of attaching to it. Need to keep * track of them for parent to be able to collect the exit * status of what used to be children. */ LIST_ENTRY(proc) p_orphan; /* (e) List of orphan processes. */ LIST_HEAD(, proc) p_orphans; /* (e) Pointer to list of orphans. */ }; #define p_session p_pgrp->pg_session #define p_pgid p_pgrp->pg_id #define NOCPU (-1) /* For when we aren't on a CPU. */ #define NOCPU_OLD (255) #define MAXCPU_OLD (254) #define PROC_SLOCK(p) mtx_lock_spin(&(p)->p_slock) #define PROC_SUNLOCK(p) mtx_unlock_spin(&(p)->p_slock) #define PROC_SLOCK_ASSERT(p, type) mtx_assert(&(p)->p_slock, (type)) #define PROC_STATLOCK(p) mtx_lock_spin(&(p)->p_statmtx) #define PROC_STATUNLOCK(p) mtx_unlock_spin(&(p)->p_statmtx) #define PROC_STATLOCK_ASSERT(p, type) mtx_assert(&(p)->p_statmtx, (type)) #define PROC_ITIMLOCK(p) mtx_lock_spin(&(p)->p_itimmtx) #define PROC_ITIMUNLOCK(p) mtx_unlock_spin(&(p)->p_itimmtx) #define PROC_ITIMLOCK_ASSERT(p, type) mtx_assert(&(p)->p_itimmtx, (type)) #define PROC_PROFLOCK(p) mtx_lock_spin(&(p)->p_profmtx) #define PROC_PROFUNLOCK(p) mtx_unlock_spin(&(p)->p_profmtx) #define PROC_PROFLOCK_ASSERT(p, type) mtx_assert(&(p)->p_profmtx, (type)) /* These flags are kept in p_flag. */ #define P_ADVLOCK 0x00001 /* Process may hold a POSIX advisory lock. */ #define P_CONTROLT 0x00002 /* Has a controlling terminal. */ #define P_KPROC 0x00004 /* Kernel process. */ #define P_FOLLOWFORK 0x00008 /* Attach parent debugger to children. */ #define P_PPWAIT 0x00010 /* Parent is waiting for child to exec/exit. */ #define P_PROFIL 0x00020 /* Has started profiling. */ #define P_STOPPROF 0x00040 /* Has thread requesting to stop profiling. */ #define P_HADTHREADS 0x00080 /* Has had threads (no cleanup shortcuts) */ #define P_SUGID 0x00100 /* Had set id privileges since last exec. */ #define P_SYSTEM 0x00200 /* System proc: no sigs, stats or swapping. */ #define P_SINGLE_EXIT 0x00400 /* Threads suspending should exit, not wait. */ #define P_TRACED 0x00800 /* Debugged process being traced. */ #define P_WAITED 0x01000 /* Someone is waiting for us. */ #define P_WEXIT 0x02000 /* Working on exiting. */ #define P_EXEC 0x04000 /* Process called exec. */ #define P_WKILLED 0x08000 /* Killed, go to kernel/user boundary ASAP. */ #define P_CONTINUED 0x10000 /* Proc has continued from a stopped state. */ #define P_STOPPED_SIG 0x20000 /* Stopped due to SIGSTOP/SIGTSTP. */ #define P_STOPPED_TRACE 0x40000 /* Stopped because of tracing. */ #define P_STOPPED_SINGLE 0x80000 /* Only 1 thread can continue (not to user). */ #define P_PROTECTED 0x100000 /* Do not kill on memory overcommit. */ #define P_SIGEVENT 0x200000 /* Process pending signals changed. */ #define P_SINGLE_BOUNDARY 0x400000 /* Threads should suspend at user boundary. */ #define P_HWPMC 0x800000 /* Process is using HWPMCs */ #define P_JAILED 0x1000000 /* Process is in jail. */ #define P_TOTAL_STOP 0x2000000 /* Stopped in stop_all_proc. */ #define P_INEXEC 0x4000000 /* Process is in execve(). */ #define P_STATCHILD 0x8000000 /* Child process stopped or exited. */ #define P_INMEM 0x10000000 /* Loaded into memory. */ #define P_SWAPPINGOUT 0x20000000 /* Process is being swapped out. */ #define P_SWAPPINGIN 0x40000000 /* Process is being swapped in. */ #define P_PPTRACE 0x80000000 /* PT_TRACEME by vforked child. */ #define P_STOPPED (P_STOPPED_SIG|P_STOPPED_SINGLE|P_STOPPED_TRACE) #define P_SHOULDSTOP(p) ((p)->p_flag & P_STOPPED) #define P_KILLED(p) ((p)->p_flag & P_WKILLED) /* These flags are kept in p_flag2. */ #define P2_INHERIT_PROTECTED 0x00000001 /* New children get P_PROTECTED. */ #define P2_NOTRACE 0x00000002 /* No ptrace(2) attach or coredumps. */ #define P2_NOTRACE_EXEC 0x00000004 /* Keep P2_NOPTRACE on exec(2). */ #define P2_AST_SU 0x00000008 /* Handles SU ast for kthreads. */ #define P2_LWP_EVENTS 0x00000010 /* Report LWP events via ptrace(2). */ /* Flags protected by proctree_lock, kept in p_treeflags. */ #define P_TREE_ORPHANED 0x00000001 /* Reparented, on orphan list */ #define P_TREE_FIRST_ORPHAN 0x00000002 /* First element of orphan list */ #define P_TREE_REAPER 0x00000004 /* Reaper of subtree */ /* * These were process status values (p_stat), now they are only used in * legacy conversion code. */ #define SIDL 1 /* Process being created by fork. */ #define SRUN 2 /* Currently runnable. */ #define SSLEEP 3 /* Sleeping on an address. */ #define SSTOP 4 /* Process debugging or suspension. */ #define SZOMB 5 /* Awaiting collection by parent. */ #define SWAIT 6 /* Waiting for interrupt. */ #define SLOCK 7 /* Blocked on a lock. */ #define P_MAGIC 0xbeefface #ifdef _KERNEL /* Types and flags for mi_switch(). */ #define SW_TYPE_MASK 0xff /* First 8 bits are switch type */ #define SWT_NONE 0 /* Unspecified switch. */ #define SWT_PREEMPT 1 /* Switching due to preemption. */ #define SWT_OWEPREEMPT 2 /* Switching due to opepreempt. */ #define SWT_TURNSTILE 3 /* Turnstile contention. */ #define SWT_SLEEPQ 4 /* Sleepq wait. */ #define SWT_SLEEPQTIMO 5 /* Sleepq timeout wait. */ #define SWT_RELINQUISH 6 /* yield call. */ #define SWT_NEEDRESCHED 7 /* NEEDRESCHED was set. */ #define SWT_IDLE 8 /* Switching from the idle thread. */ #define SWT_IWAIT 9 /* Waiting for interrupts. */ #define SWT_SUSPEND 10 /* Thread suspended. */ #define SWT_REMOTEPREEMPT 11 /* Remote processor preempted. */ #define SWT_REMOTEWAKEIDLE 12 /* Remote processor preempted idle. */ #define SWT_COUNT 13 /* Number of switch types. */ /* Flags */ #define SW_VOL 0x0100 /* Voluntary switch. */ #define SW_INVOL 0x0200 /* Involuntary switch. */ #define SW_PREEMPT 0x0400 /* The invol switch is a preemption */ /* How values for thread_single(). */ #define SINGLE_NO_EXIT 0 #define SINGLE_EXIT 1 #define SINGLE_BOUNDARY 2 #define SINGLE_ALLPROC 3 #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_PARGS); MALLOC_DECLARE(M_PGRP); MALLOC_DECLARE(M_SESSION); MALLOC_DECLARE(M_SUBPROC); #endif #define FOREACH_PROC_IN_SYSTEM(p) \ LIST_FOREACH((p), &allproc, p_list) #define FOREACH_THREAD_IN_PROC(p, td) \ TAILQ_FOREACH((td), &(p)->p_threads, td_plist) #define FIRST_THREAD_IN_PROC(p) TAILQ_FIRST(&(p)->p_threads) /* * We use process IDs <= pid_max <= PID_MAX; PID_MAX + 1 must also fit * in a pid_t, as it is used to represent "no process group". */ #define PID_MAX 99999 #define NO_PID 100000 extern pid_t pid_max; #define SESS_LEADER(p) ((p)->p_session->s_leader == (p)) #define STOPEVENT(p, e, v) do { \ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, \ "checking stopevent %d", (e)); \ if ((p)->p_stops & (e)) { \ PROC_LOCK(p); \ stopevent((p), (e), (v)); \ PROC_UNLOCK(p); \ } \ } while (0) #define _STOPEVENT(p, e, v) do { \ PROC_LOCK_ASSERT(p, MA_OWNED); \ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, \ "checking stopevent %d", (e)); \ if ((p)->p_stops & (e)) \ stopevent((p), (e), (v)); \ } while (0) /* Lock and unlock a process. */ #define PROC_LOCK(p) mtx_lock(&(p)->p_mtx) #define PROC_TRYLOCK(p) mtx_trylock(&(p)->p_mtx) #define PROC_UNLOCK(p) mtx_unlock(&(p)->p_mtx) #define PROC_LOCKED(p) mtx_owned(&(p)->p_mtx) #define PROC_LOCK_ASSERT(p, type) mtx_assert(&(p)->p_mtx, (type)) /* Lock and unlock a process group. */ #define PGRP_LOCK(pg) mtx_lock(&(pg)->pg_mtx) #define PGRP_UNLOCK(pg) mtx_unlock(&(pg)->pg_mtx) #define PGRP_LOCKED(pg) mtx_owned(&(pg)->pg_mtx) #define PGRP_LOCK_ASSERT(pg, type) mtx_assert(&(pg)->pg_mtx, (type)) #define PGRP_LOCK_PGSIGNAL(pg) do { \ if ((pg) != NULL) \ PGRP_LOCK(pg); \ } while (0) #define PGRP_UNLOCK_PGSIGNAL(pg) do { \ if ((pg) != NULL) \ PGRP_UNLOCK(pg); \ } while (0) /* Lock and unlock a session. */ #define SESS_LOCK(s) mtx_lock(&(s)->s_mtx) #define SESS_UNLOCK(s) mtx_unlock(&(s)->s_mtx) #define SESS_LOCKED(s) mtx_owned(&(s)->s_mtx) #define SESS_LOCK_ASSERT(s, type) mtx_assert(&(s)->s_mtx, (type)) /* Hold process U-area in memory, normally for ptrace/procfs work. */ #define PHOLD(p) do { \ PROC_LOCK(p); \ _PHOLD(p); \ PROC_UNLOCK(p); \ } while (0) #define _PHOLD(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ KASSERT(!((p)->p_flag & P_WEXIT) || (p) == curproc, \ ("PHOLD of exiting process %p", p)); \ (p)->p_lock++; \ if (((p)->p_flag & P_INMEM) == 0) \ faultin((p)); \ } while (0) #define PROC_ASSERT_HELD(p) do { \ KASSERT((p)->p_lock > 0, ("process %p not held", p)); \ } while (0) #define PRELE(p) do { \ PROC_LOCK((p)); \ _PRELE((p)); \ PROC_UNLOCK((p)); \ } while (0) #define _PRELE(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ PROC_ASSERT_HELD(p); \ (--(p)->p_lock); \ if (((p)->p_flag & P_WEXIT) && (p)->p_lock == 0) \ wakeup(&(p)->p_lock); \ } while (0) #define PROC_ASSERT_NOT_HELD(p) do { \ KASSERT((p)->p_lock == 0, ("process %p held", p)); \ } while (0) #define PROC_UPDATE_COW(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ (p)->p_cowgen++; \ } while (0) /* Check whether a thread is safe to be swapped out. */ #define thread_safetoswapout(td) ((td)->td_flags & TDF_CANSWAP) /* Control whether or not it is safe for curthread to sleep. */ #define THREAD_NO_SLEEPING() ((curthread)->td_no_sleeping++) #define THREAD_SLEEPING_OK() ((curthread)->td_no_sleeping--) #define THREAD_CAN_SLEEP() ((curthread)->td_no_sleeping == 0) #define PIDHASH(pid) (&pidhashtbl[(pid) & pidhash]) extern LIST_HEAD(pidhashhead, proc) *pidhashtbl; extern u_long pidhash; #define TIDHASH(tid) (&tidhashtbl[(tid) & tidhash]) extern LIST_HEAD(tidhashhead, thread) *tidhashtbl; extern u_long tidhash; extern struct rwlock tidhash_lock; #define PGRPHASH(pgid) (&pgrphashtbl[(pgid) & pgrphash]) extern LIST_HEAD(pgrphashhead, pgrp) *pgrphashtbl; extern u_long pgrphash; extern struct sx allproc_lock; extern int allproc_gen; extern struct sx proctree_lock; extern struct mtx ppeers_lock; extern struct proc proc0; /* Process slot for swapper. */ extern struct thread thread0; /* Primary thread in proc0. */ extern struct vmspace vmspace0; /* VM space for proc0. */ extern int hogticks; /* Limit on kernel cpu hogs. */ extern int lastpid; extern int nprocs, maxproc; /* Current and max number of procs. */ extern int maxprocperuid; /* Max procs per uid. */ extern u_long ps_arg_cache_limit; LIST_HEAD(proclist, proc); TAILQ_HEAD(procqueue, proc); TAILQ_HEAD(threadqueue, thread); extern struct proclist allproc; /* List of all processes. */ extern struct proclist zombproc; /* List of zombie processes. */ extern struct proc *initproc, *pageproc; /* Process slots for init, pager. */ extern struct uma_zone *proc_zone; struct proc *pfind(pid_t); /* Find process by id. */ struct proc *pfind_locked(pid_t pid); struct pgrp *pgfind(pid_t); /* Find process group by id. */ struct proc *zpfind(pid_t); /* Find zombie process by id. */ struct fork_req { int fr_flags; int fr_pages; int *fr_pidp; struct proc **fr_procp; int *fr_pd_fd; int fr_pd_flags; struct filecaps *fr_pd_fcaps; }; /* * pget() flags. */ #define PGET_HOLD 0x00001 /* Hold the process. */ #define PGET_CANSEE 0x00002 /* Check against p_cansee(). */ #define PGET_CANDEBUG 0x00004 /* Check against p_candebug(). */ #define PGET_ISCURRENT 0x00008 /* Check that the found process is current. */ #define PGET_NOTWEXIT 0x00010 /* Check that the process is not in P_WEXIT. */ #define PGET_NOTINEXEC 0x00020 /* Check that the process is not in P_INEXEC. */ #define PGET_NOTID 0x00040 /* Do not assume tid if pid > PID_MAX. */ #define PGET_WANTREAD (PGET_HOLD | PGET_CANDEBUG | PGET_NOTWEXIT) int pget(pid_t pid, int flags, struct proc **pp); void ast(struct trapframe *framep); struct thread *choosethread(void); int cr_cansignal(struct ucred *cred, struct proc *proc, int signum); int enterpgrp(struct proc *p, pid_t pgid, struct pgrp *pgrp, struct session *sess); int enterthispgrp(struct proc *p, struct pgrp *pgrp); void faultin(struct proc *p); void fixjobc(struct proc *p, struct pgrp *pgrp, int entering); int fork1(struct thread *, struct fork_req *); void fork_exit(void (*)(void *, struct trapframe *), void *, struct trapframe *); void fork_return(struct thread *, struct trapframe *); int inferior(struct proc *p); void kern_yield(int); void kick_proc0(void); void killjobc(void); int leavepgrp(struct proc *p); int maybe_preempt(struct thread *td); void maybe_yield(void); void mi_switch(int flags, struct thread *newtd); int p_candebug(struct thread *td, struct proc *p); int p_cansee(struct thread *td, struct proc *p); int p_cansched(struct thread *td, struct proc *p); int p_cansignal(struct thread *td, struct proc *p, int signum); int p_canwait(struct thread *td, struct proc *p); struct pargs *pargs_alloc(int len); void pargs_drop(struct pargs *pa); void pargs_hold(struct pargs *pa); int proc_getargv(struct thread *td, struct proc *p, struct sbuf *sb); int proc_getauxv(struct thread *td, struct proc *p, struct sbuf *sb); int proc_getenvv(struct thread *td, struct proc *p, struct sbuf *sb); void procinit(void); void proc_linkup0(struct proc *p, struct thread *td); void proc_linkup(struct proc *p, struct thread *td); struct proc *proc_realparent(struct proc *child); void proc_reap(struct thread *td, struct proc *p, int *status, int options); void proc_reparent(struct proc *child, struct proc *newparent); struct pstats *pstats_alloc(void); void pstats_fork(struct pstats *src, struct pstats *dst); void pstats_free(struct pstats *ps); void reaper_abandon_children(struct proc *p, bool exiting); int securelevel_ge(struct ucred *cr, int level); int securelevel_gt(struct ucred *cr, int level); void sess_hold(struct session *); void sess_release(struct session *); int setrunnable(struct thread *); void setsugid(struct proc *p); int should_yield(void); int sigonstack(size_t sp); void stopevent(struct proc *, u_int, u_int); struct thread *tdfind(lwpid_t, pid_t); void threadinit(void); void tidhash_add(struct thread *); void tidhash_remove(struct thread *); void cpu_idle(int); int cpu_idle_wakeup(int); extern void (*cpu_idle_hook)(sbintime_t); /* Hook to machdep CPU idler. */ void cpu_switch(struct thread *, struct thread *, struct mtx *); void cpu_throw(struct thread *, struct thread *) __dead2; void unsleep(struct thread *); void userret(struct thread *, struct trapframe *); void cpu_exit(struct thread *); void exit1(struct thread *, int, int) __dead2; int cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa); void cpu_fork(struct thread *, struct proc *, struct thread *, int); void cpu_set_fork_handler(struct thread *, void (*)(void *), void *); void cpu_set_syscall_retval(struct thread *, int); void cpu_set_upcall(struct thread *td, struct thread *td0); void cpu_set_upcall_kse(struct thread *, void (*)(void *), void *, stack_t *); int cpu_set_user_tls(struct thread *, void *tls_base); void cpu_thread_alloc(struct thread *); void cpu_thread_clean(struct thread *); void cpu_thread_exit(struct thread *); void cpu_thread_free(struct thread *); void cpu_thread_swapin(struct thread *); void cpu_thread_swapout(struct thread *); struct thread *thread_alloc(int pages); int thread_alloc_stack(struct thread *, int pages); void thread_cow_get_proc(struct thread *newtd, struct proc *p); void thread_cow_get(struct thread *newtd, struct thread *td); void thread_cow_free(struct thread *td); void thread_cow_update(struct thread *td); int thread_create(struct thread *td, struct rtprio *rtp, int (*initialize_thread)(struct thread *, void *), void *thunk); void thread_exit(void) __dead2; void thread_free(struct thread *td); void thread_link(struct thread *td, struct proc *p); void thread_reap(void); int thread_single(struct proc *p, int how); void thread_single_end(struct proc *p, int how); void thread_stash(struct thread *td); void thread_stopped(struct proc *p); void childproc_stopped(struct proc *child, int reason); void childproc_continued(struct proc *child); void childproc_exited(struct proc *child); int thread_suspend_check(int how); bool thread_suspend_check_needed(void); void thread_suspend_switch(struct thread *, struct proc *p); void thread_suspend_one(struct thread *td); void thread_unlink(struct thread *td); void thread_unsuspend(struct proc *p); void thread_wait(struct proc *p); struct thread *thread_find(struct proc *p, lwpid_t tid); void stop_all_proc(void); void resume_all_proc(void); static __inline int curthread_pflags_set(int flags) { struct thread *td; int save; td = curthread; save = ~flags | (td->td_pflags & flags); td->td_pflags |= flags; return (save); } static __inline void curthread_pflags_restore(int save) { curthread->td_pflags &= save; } #endif /* _KERNEL */ #endif /* !_SYS_PROC_H_ */ Index: head/sys/sys/umtx.h =================================================================== --- head/sys/sys/umtx.h (revision 300042) +++ head/sys/sys/umtx.h (revision 300043) @@ -1,174 +1,196 @@ /*- * Copyright (c) 2002, Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ * */ #ifndef _SYS_UMTX_H_ #define _SYS_UMTX_H_ #include +/* Common lock flags */ #define USYNC_PROCESS_SHARED 0x0001 /* Process shared sync objs */ -#define UMUTEX_UNOWNED 0x0 -#define UMUTEX_CONTESTED 0x80000000U - +/* umutex flags */ #define UMUTEX_PRIO_INHERIT 0x0004 /* Priority inherited mutex */ #define UMUTEX_PRIO_PROTECT 0x0008 /* Priority protect mutex */ +#define UMUTEX_ROBUST 0x0010 /* Robust mutex */ +#define UMUTEX_NONCONSISTENT 0x0020 /* Robust locked but not consistent */ +/* + * The umutex.m_lock values and bits. The m_owner is the word which + * serves as the lock. Its high bit is the contention indicator and + * rest of bits records the owner TID. TIDs values start with PID_MAX + * + 2 and end by INT32_MAX. The low range [1..PID_MAX] is guaranteed + * to be useable as the special markers. + */ +#define UMUTEX_UNOWNED 0x0 +#define UMUTEX_CONTESTED 0x80000000U +#define UMUTEX_RB_OWNERDEAD (UMUTEX_CONTESTED | 0x10) +#define UMUTEX_RB_NOTRECOV (UMUTEX_CONTESTED | 0x11) + /* urwlock flags */ #define URWLOCK_PREFER_READER 0x0002 #define URWLOCK_WRITE_OWNER 0x80000000U #define URWLOCK_WRITE_WAITERS 0x40000000U #define URWLOCK_READ_WAITERS 0x20000000U #define URWLOCK_MAX_READERS 0x1fffffffU #define URWLOCK_READER_COUNT(c) ((c) & URWLOCK_MAX_READERS) /* _usem flags */ #define SEM_NAMED 0x0002 /* _usem2 count field */ #define USEM_HAS_WAITERS 0x80000000U #define USEM_MAX_COUNT 0x7fffffffU #define USEM_COUNT(c) ((c) & USEM_MAX_COUNT) /* op code for _umtx_op */ #define UMTX_OP_RESERVED0 0 #define UMTX_OP_RESERVED1 1 #define UMTX_OP_WAIT 2 #define UMTX_OP_WAKE 3 #define UMTX_OP_MUTEX_TRYLOCK 4 #define UMTX_OP_MUTEX_LOCK 5 #define UMTX_OP_MUTEX_UNLOCK 6 #define UMTX_OP_SET_CEILING 7 #define UMTX_OP_CV_WAIT 8 #define UMTX_OP_CV_SIGNAL 9 #define UMTX_OP_CV_BROADCAST 10 #define UMTX_OP_WAIT_UINT 11 #define UMTX_OP_RW_RDLOCK 12 #define UMTX_OP_RW_WRLOCK 13 #define UMTX_OP_RW_UNLOCK 14 #define UMTX_OP_WAIT_UINT_PRIVATE 15 #define UMTX_OP_WAKE_PRIVATE 16 #define UMTX_OP_MUTEX_WAIT 17 #define UMTX_OP_MUTEX_WAKE 18 /* deprecated */ #define UMTX_OP_SEM_WAIT 19 /* deprecated */ #define UMTX_OP_SEM_WAKE 20 /* deprecated */ #define UMTX_OP_NWAKE_PRIVATE 21 #define UMTX_OP_MUTEX_WAKE2 22 #define UMTX_OP_SEM2_WAIT 23 #define UMTX_OP_SEM2_WAKE 24 #define UMTX_OP_SHM 25 +#define UMTX_OP_ROBUST_LISTS 26 /* Flags for UMTX_OP_CV_WAIT */ #define CVWAIT_CHECK_UNPARKING 0x01 #define CVWAIT_ABSTIME 0x02 #define CVWAIT_CLOCKID 0x04 #define UMTX_ABSTIME 0x01 #define UMTX_CHECK_UNPARKING CVWAIT_CHECK_UNPARKING /* Flags for UMTX_OP_SHM */ #define UMTX_SHM_CREAT 0x0001 #define UMTX_SHM_LOOKUP 0x0002 #define UMTX_SHM_DESTROY 0x0004 #define UMTX_SHM_ALIVE 0x0008 +struct umtx_robust_lists_params { + uintptr_t robust_list_offset; + uintptr_t robust_priv_list_offset; + uintptr_t robust_inact_offset; +}; + #ifndef _KERNEL int _umtx_op(void *obj, int op, u_long val, void *uaddr, void *uaddr2); #else /* * The umtx_key structure is used by both the Linux futex code and the * umtx implementation to map userland addresses to unique keys. */ enum { TYPE_SIMPLE_WAIT, TYPE_CV, TYPE_SEM, TYPE_SIMPLE_LOCK, TYPE_NORMAL_UMUTEX, TYPE_PI_UMUTEX, TYPE_PP_UMUTEX, TYPE_RWLOCK, TYPE_FUTEX, TYPE_SHM, + TYPE_PI_ROBUST_UMUTEX, + TYPE_PP_ROBUST_UMUTEX, }; /* Key to represent a unique userland synchronous object */ struct umtx_key { int hash; int type; int shared; union { struct { struct vm_object *object; uintptr_t offset; } shared; struct { struct vmspace *vs; uintptr_t addr; } private; struct { void *a; uintptr_t b; } both; } info; }; #define THREAD_SHARE 0 #define PROCESS_SHARE 1 #define AUTO_SHARE 2 struct thread; static inline int umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2) { return (k1->type == k2->type && k1->info.both.a == k2->info.both.a && k1->info.both.b == k2->info.both.b); } int umtx_copyin_timeout(const void *, struct timespec *); int umtx_key_get(const void *, int, int, struct umtx_key *); void umtx_key_release(struct umtx_key *); struct umtx_q *umtxq_alloc(void); void umtxq_free(struct umtx_q *); int kern_umtx_wake(struct thread *, void *, int, int); void umtx_pi_adjust(struct thread *, u_char); void umtx_thread_init(struct thread *); void umtx_thread_fini(struct thread *); void umtx_thread_alloc(struct thread *); void umtx_thread_exit(struct thread *); #endif /* !_KERNEL */ #endif /* !_SYS_UMTX_H_ */ Index: head/sys/vm/vm_object.c =================================================================== --- head/sys/vm/vm_object.c (revision 300042) +++ head/sys/vm/vm_object.c (revision 300043) @@ -1,2632 +1,2632 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_object.c 8.5 (Berkeley) 3/22/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Virtual memory object module. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include /* for curproc, pageproc */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int old_msync; SYSCTL_INT(_vm, OID_AUTO, old_msync, CTLFLAG_RW, &old_msync, 0, "Use old (insecure) msync behavior"); static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags, int flags, boolean_t *clearobjflags, boolean_t *eio); static boolean_t vm_object_page_remove_write(vm_page_t p, int flags, boolean_t *clearobjflags); static void vm_object_qcollapse(vm_object_t object); static void vm_object_vndeallocate(vm_object_t object); /* * Virtual memory objects maintain the actual data * associated with allocated virtual memory. A given * page of memory exists within exactly one object. * * An object is only deallocated when all "references" * are given up. Only one "reference" to a given * region of an object should be writeable. * * Associated with each object is a list of all resident * memory pages belonging to that object; this list is * maintained by the "vm_page" module, and locked by the object's * lock. * * Each object also records a "pager" routine which is * used to retrieve (and store) pages to the proper backing * storage. In addition, objects may be backed by other * objects from which they were virtual-copied. * * The only items within the object structure which are * modified after time of creation are: * reference count locked by object's lock * pager routine locked by object's lock * */ struct object_q vm_object_list; struct mtx vm_object_list_mtx; /* lock for object list and count */ struct vm_object kernel_object_store; struct vm_object kmem_object_store; static SYSCTL_NODE(_vm_stats, OID_AUTO, object, CTLFLAG_RD, 0, "VM object stats"); static long object_collapses; SYSCTL_LONG(_vm_stats_object, OID_AUTO, collapses, CTLFLAG_RD, &object_collapses, 0, "VM object collapses"); static long object_bypasses; SYSCTL_LONG(_vm_stats_object, OID_AUTO, bypasses, CTLFLAG_RD, &object_bypasses, 0, "VM object bypasses"); static uma_zone_t obj_zone; static int vm_object_zinit(void *mem, int size, int flags); #ifdef INVARIANTS static void vm_object_zdtor(void *mem, int size, void *arg); static void vm_object_zdtor(void *mem, int size, void *arg) { vm_object_t object; object = (vm_object_t)mem; KASSERT(object->ref_count == 0, ("object %p ref_count = %d", object, object->ref_count)); KASSERT(TAILQ_EMPTY(&object->memq), ("object %p has resident pages in its memq", object)); KASSERT(vm_radix_is_empty(&object->rtree), ("object %p has resident pages in its trie", object)); #if VM_NRESERVLEVEL > 0 KASSERT(LIST_EMPTY(&object->rvq), ("object %p has reservations", object)); #endif KASSERT(vm_object_cache_is_empty(object), ("object %p has cached pages", object)); KASSERT(object->paging_in_progress == 0, ("object %p paging_in_progress = %d", object, object->paging_in_progress)); KASSERT(object->resident_page_count == 0, ("object %p resident_page_count = %d", object, object->resident_page_count)); KASSERT(object->shadow_count == 0, ("object %p shadow_count = %d", object, object->shadow_count)); KASSERT(object->type == OBJT_DEAD, ("object %p has non-dead type %d", object, object->type)); } #endif static int vm_object_zinit(void *mem, int size, int flags) { vm_object_t object; object = (vm_object_t)mem; rw_init_flags(&object->lock, "vm object", RW_DUPOK | RW_NEW); /* These are true for any object that has been freed */ object->type = OBJT_DEAD; object->ref_count = 0; object->rtree.rt_root = 0; object->rtree.rt_flags = 0; object->paging_in_progress = 0; object->resident_page_count = 0; object->shadow_count = 0; object->cache.rt_root = 0; object->cache.rt_flags = 0; mtx_lock(&vm_object_list_mtx); TAILQ_INSERT_TAIL(&vm_object_list, object, object_list); mtx_unlock(&vm_object_list_mtx); return (0); } static void _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object) { TAILQ_INIT(&object->memq); LIST_INIT(&object->shadow_head); object->type = type; switch (type) { case OBJT_DEAD: panic("_vm_object_allocate: can't create OBJT_DEAD"); case OBJT_DEFAULT: case OBJT_SWAP: object->flags = OBJ_ONEMAPPING; break; case OBJT_DEVICE: case OBJT_SG: object->flags = OBJ_FICTITIOUS | OBJ_UNMANAGED; break; case OBJT_MGTDEVICE: object->flags = OBJ_FICTITIOUS; break; case OBJT_PHYS: object->flags = OBJ_UNMANAGED; break; case OBJT_VNODE: object->flags = 0; break; default: panic("_vm_object_allocate: type %d is undefined", type); } object->size = size; object->generation = 1; object->ref_count = 1; object->memattr = VM_MEMATTR_DEFAULT; object->cred = NULL; object->charge = 0; object->handle = NULL; object->backing_object = NULL; object->backing_object_offset = (vm_ooffset_t) 0; #if VM_NRESERVLEVEL > 0 LIST_INIT(&object->rvq); #endif umtx_shm_object_init(object); } /* * vm_object_init: * * Initialize the VM objects module. */ void vm_object_init(void) { TAILQ_INIT(&vm_object_list); mtx_init(&vm_object_list_mtx, "vm object_list", NULL, MTX_DEF); rw_init(&kernel_object->lock, "kernel vm object"); _vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), kernel_object); #if VM_NRESERVLEVEL > 0 kernel_object->flags |= OBJ_COLORED; kernel_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS); #endif rw_init(&kmem_object->lock, "kmem vm object"); _vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), kmem_object); #if VM_NRESERVLEVEL > 0 kmem_object->flags |= OBJ_COLORED; kmem_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS); #endif /* * The lock portion of struct vm_object must be type stable due * to vm_pageout_fallback_object_lock locking a vm object * without holding any references to it. */ obj_zone = uma_zcreate("VM OBJECT", sizeof (struct vm_object), NULL, #ifdef INVARIANTS vm_object_zdtor, #else NULL, #endif vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); vm_radix_init(); } void vm_object_clear_flag(vm_object_t object, u_short bits) { VM_OBJECT_ASSERT_WLOCKED(object); object->flags &= ~bits; } /* * Sets the default memory attribute for the specified object. Pages * that are allocated to this object are by default assigned this memory * attribute. * * Presently, this function must be called before any pages are allocated * to the object. In the future, this requirement may be relaxed for * "default" and "swap" objects. */ int vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr) { VM_OBJECT_ASSERT_WLOCKED(object); switch (object->type) { case OBJT_DEFAULT: case OBJT_DEVICE: case OBJT_MGTDEVICE: case OBJT_PHYS: case OBJT_SG: case OBJT_SWAP: case OBJT_VNODE: if (!TAILQ_EMPTY(&object->memq)) return (KERN_FAILURE); break; case OBJT_DEAD: return (KERN_INVALID_ARGUMENT); default: panic("vm_object_set_memattr: object %p is of undefined type", object); } object->memattr = memattr; return (KERN_SUCCESS); } void vm_object_pip_add(vm_object_t object, short i) { VM_OBJECT_ASSERT_WLOCKED(object); object->paging_in_progress += i; } void vm_object_pip_subtract(vm_object_t object, short i) { VM_OBJECT_ASSERT_WLOCKED(object); object->paging_in_progress -= i; } void vm_object_pip_wakeup(vm_object_t object) { VM_OBJECT_ASSERT_WLOCKED(object); object->paging_in_progress--; if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) { vm_object_clear_flag(object, OBJ_PIPWNT); wakeup(object); } } void vm_object_pip_wakeupn(vm_object_t object, short i) { VM_OBJECT_ASSERT_WLOCKED(object); if (i) object->paging_in_progress -= i; if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) { vm_object_clear_flag(object, OBJ_PIPWNT); wakeup(object); } } void vm_object_pip_wait(vm_object_t object, char *waitid) { VM_OBJECT_ASSERT_WLOCKED(object); while (object->paging_in_progress) { object->flags |= OBJ_PIPWNT; VM_OBJECT_SLEEP(object, object, PVM, waitid, 0); } } /* * vm_object_allocate: * * Returns a new object with the given size. */ vm_object_t vm_object_allocate(objtype_t type, vm_pindex_t size) { vm_object_t object; object = (vm_object_t)uma_zalloc(obj_zone, M_WAITOK); _vm_object_allocate(type, size, object); return (object); } /* * vm_object_reference: * * Gets another reference to the given object. Note: OBJ_DEAD * objects can be referenced during final cleaning. */ void vm_object_reference(vm_object_t object) { if (object == NULL) return; VM_OBJECT_WLOCK(object); vm_object_reference_locked(object); VM_OBJECT_WUNLOCK(object); } /* * vm_object_reference_locked: * * Gets another reference to the given object. * * The object must be locked. */ void vm_object_reference_locked(vm_object_t object) { struct vnode *vp; VM_OBJECT_ASSERT_WLOCKED(object); object->ref_count++; if (object->type == OBJT_VNODE) { vp = object->handle; vref(vp); } } /* * Handle deallocating an object of type OBJT_VNODE. */ static void vm_object_vndeallocate(vm_object_t object) { struct vnode *vp = (struct vnode *) object->handle; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_VNODE, ("vm_object_vndeallocate: not a vnode object")); KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp")); #ifdef INVARIANTS if (object->ref_count == 0) { vprint("vm_object_vndeallocate", vp); panic("vm_object_vndeallocate: bad object reference count"); } #endif - if (object->ref_count == 1) + if (!umtx_shm_vnobj_persistent && object->ref_count == 1) umtx_shm_object_terminated(object); /* * The test for text of vp vnode does not need a bypass to * reach right VV_TEXT there, since it is obtained from * object->handle. */ if (object->ref_count > 1 || (vp->v_vflag & VV_TEXT) == 0) { object->ref_count--; VM_OBJECT_WUNLOCK(object); /* vrele may need the vnode lock. */ vrele(vp); } else { vhold(vp); VM_OBJECT_WUNLOCK(object); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vdrop(vp); VM_OBJECT_WLOCK(object); object->ref_count--; if (object->type == OBJT_DEAD) { VM_OBJECT_WUNLOCK(object); VOP_UNLOCK(vp, 0); } else { if (object->ref_count == 0) VOP_UNSET_TEXT(vp); VM_OBJECT_WUNLOCK(object); vput(vp); } } } /* * vm_object_deallocate: * * Release a reference to the specified object, * gained either through a vm_object_allocate * or a vm_object_reference call. When all references * are gone, storage associated with this object * may be relinquished. * * No object may be locked. */ void vm_object_deallocate(vm_object_t object) { vm_object_t temp; struct vnode *vp; while (object != NULL) { VM_OBJECT_WLOCK(object); if (object->type == OBJT_VNODE) { vm_object_vndeallocate(object); return; } KASSERT(object->ref_count != 0, ("vm_object_deallocate: object deallocated too many times: %d", object->type)); /* * If the reference count goes to 0 we start calling * vm_object_terminate() on the object chain. * A ref count of 1 may be a special case depending on the * shadow count being 0 or 1. */ object->ref_count--; if (object->ref_count > 1) { VM_OBJECT_WUNLOCK(object); return; } else if (object->ref_count == 1) { if (object->type == OBJT_SWAP && (object->flags & OBJ_TMPFS) != 0) { vp = object->un_pager.swp.swp_tmpfs; vhold(vp); VM_OBJECT_WUNLOCK(object); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VM_OBJECT_WLOCK(object); if (object->type == OBJT_DEAD || object->ref_count != 1) { VM_OBJECT_WUNLOCK(object); VOP_UNLOCK(vp, 0); vdrop(vp); return; } if ((object->flags & OBJ_TMPFS) != 0) VOP_UNSET_TEXT(vp); VOP_UNLOCK(vp, 0); vdrop(vp); } if (object->shadow_count == 0 && object->handle == NULL && (object->type == OBJT_DEFAULT || (object->type == OBJT_SWAP && (object->flags & OBJ_TMPFS_NODE) == 0))) { vm_object_set_flag(object, OBJ_ONEMAPPING); } else if ((object->shadow_count == 1) && (object->handle == NULL) && (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) { vm_object_t robject; robject = LIST_FIRST(&object->shadow_head); KASSERT(robject != NULL, ("vm_object_deallocate: ref_count: %d, shadow_count: %d", object->ref_count, object->shadow_count)); KASSERT((robject->flags & OBJ_TMPFS_NODE) == 0, ("shadowed tmpfs v_object %p", object)); if (!VM_OBJECT_TRYWLOCK(robject)) { /* * Avoid a potential deadlock. */ object->ref_count++; VM_OBJECT_WUNLOCK(object); /* * More likely than not the thread * holding robject's lock has lower * priority than the current thread. * Let the lower priority thread run. */ pause("vmo_de", 1); continue; } /* * Collapse object into its shadow unless its * shadow is dead. In that case, object will * be deallocated by the thread that is * deallocating its shadow. */ if ((robject->flags & OBJ_DEAD) == 0 && (robject->handle == NULL) && (robject->type == OBJT_DEFAULT || robject->type == OBJT_SWAP)) { robject->ref_count++; retry: if (robject->paging_in_progress) { VM_OBJECT_WUNLOCK(object); vm_object_pip_wait(robject, "objde1"); temp = robject->backing_object; if (object == temp) { VM_OBJECT_WLOCK(object); goto retry; } } else if (object->paging_in_progress) { VM_OBJECT_WUNLOCK(robject); object->flags |= OBJ_PIPWNT; VM_OBJECT_SLEEP(object, object, PDROP | PVM, "objde2", 0); VM_OBJECT_WLOCK(robject); temp = robject->backing_object; if (object == temp) { VM_OBJECT_WLOCK(object); goto retry; } } else VM_OBJECT_WUNLOCK(object); if (robject->ref_count == 1) { robject->ref_count--; object = robject; goto doterm; } object = robject; vm_object_collapse(object); VM_OBJECT_WUNLOCK(object); continue; } VM_OBJECT_WUNLOCK(robject); } VM_OBJECT_WUNLOCK(object); return; } doterm: umtx_shm_object_terminated(object); temp = object->backing_object; if (temp != NULL) { KASSERT((object->flags & OBJ_TMPFS_NODE) == 0, ("shadowed tmpfs v_object 2 %p", object)); VM_OBJECT_WLOCK(temp); LIST_REMOVE(object, shadow_list); temp->shadow_count--; VM_OBJECT_WUNLOCK(temp); object->backing_object = NULL; } /* * Don't double-terminate, we could be in a termination * recursion due to the terminate having to sync data * to disk. */ if ((object->flags & OBJ_DEAD) == 0) vm_object_terminate(object); else VM_OBJECT_WUNLOCK(object); object = temp; } } /* * vm_object_destroy removes the object from the global object list * and frees the space for the object. */ void vm_object_destroy(vm_object_t object) { /* * Release the allocation charge. */ if (object->cred != NULL) { swap_release_by_cred(object->charge, object->cred); object->charge = 0; crfree(object->cred); object->cred = NULL; } /* * Free the space for the object. */ uma_zfree(obj_zone, object); } /* * vm_object_terminate actually destroys the specified object, freeing * up all previously used resources. * * The object must be locked. * This routine may block. */ void vm_object_terminate(vm_object_t object) { vm_page_t p, p_next; VM_OBJECT_ASSERT_WLOCKED(object); /* * Make sure no one uses us. */ vm_object_set_flag(object, OBJ_DEAD); /* * wait for the pageout daemon to be done with the object */ vm_object_pip_wait(object, "objtrm"); KASSERT(!object->paging_in_progress, ("vm_object_terminate: pageout in progress")); /* * Clean and free the pages, as appropriate. All references to the * object are gone, so we don't need to lock it. */ if (object->type == OBJT_VNODE) { struct vnode *vp = (struct vnode *)object->handle; /* * Clean pages and flush buffers. */ vm_object_page_clean(object, 0, 0, OBJPC_SYNC); VM_OBJECT_WUNLOCK(object); vinvalbuf(vp, V_SAVE, 0, 0); VM_OBJECT_WLOCK(object); } KASSERT(object->ref_count == 0, ("vm_object_terminate: object with references, ref_count=%d", object->ref_count)); /* * Free any remaining pageable pages. This also removes them from the * paging queues. However, don't free wired pages, just remove them * from the object. Rather than incrementally removing each page from * the object, the page and object are reset to any empty state. */ TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) { vm_page_assert_unbusied(p); vm_page_lock(p); /* * Optimize the page's removal from the object by resetting * its "object" field. Specifically, if the page is not * wired, then the effect of this assignment is that * vm_page_free()'s call to vm_page_remove() will return * immediately without modifying the page or the object. */ p->object = NULL; if (p->wire_count == 0) { vm_page_free(p); PCPU_INC(cnt.v_pfree); } vm_page_unlock(p); } /* * If the object contained any pages, then reset it to an empty state. * None of the object's fields, including "resident_page_count", were * modified by the preceding loop. */ if (object->resident_page_count != 0) { vm_radix_reclaim_allnodes(&object->rtree); TAILQ_INIT(&object->memq); object->resident_page_count = 0; if (object->type == OBJT_VNODE) vdrop(object->handle); } #if VM_NRESERVLEVEL > 0 if (__predict_false(!LIST_EMPTY(&object->rvq))) vm_reserv_break_all(object); #endif if (__predict_false(!vm_object_cache_is_empty(object))) vm_page_cache_free(object, 0, 0); KASSERT(object->cred == NULL || object->type == OBJT_DEFAULT || object->type == OBJT_SWAP, ("%s: non-swap obj %p has cred", __func__, object)); /* * Let the pager know object is dead. */ vm_pager_deallocate(object); VM_OBJECT_WUNLOCK(object); vm_object_destroy(object); } /* * Make the page read-only so that we can clear the object flags. However, if * this is a nosync mmap then the object is likely to stay dirty so do not * mess with the page and do not clear the object flags. Returns TRUE if the * page should be flushed, and FALSE otherwise. */ static boolean_t vm_object_page_remove_write(vm_page_t p, int flags, boolean_t *clearobjflags) { /* * If we have been asked to skip nosync pages and this is a * nosync page, skip it. Note that the object flags were not * cleared in this case so we do not have to set them. */ if ((flags & OBJPC_NOSYNC) != 0 && (p->oflags & VPO_NOSYNC) != 0) { *clearobjflags = FALSE; return (FALSE); } else { pmap_remove_write(p); return (p->dirty != 0); } } /* * vm_object_page_clean * * Clean all dirty pages in the specified range of object. Leaves page * on whatever queue it is currently on. If NOSYNC is set then do not * write out pages with VPO_NOSYNC set (originally comes from MAP_NOSYNC), * leaving the object dirty. * * When stuffing pages asynchronously, allow clustering. XXX we need a * synchronous clustering mode implementation. * * Odd semantics: if start == end, we clean everything. * * The object must be locked. * * Returns FALSE if some page from the range was not written, as * reported by the pager, and TRUE otherwise. */ boolean_t vm_object_page_clean(vm_object_t object, vm_ooffset_t start, vm_ooffset_t end, int flags) { vm_page_t np, p; vm_pindex_t pi, tend, tstart; int curgeneration, n, pagerflags; boolean_t clearobjflags, eio, res; VM_OBJECT_ASSERT_WLOCKED(object); /* * The OBJ_MIGHTBEDIRTY flag is only set for OBJT_VNODE * objects. The check below prevents the function from * operating on non-vnode objects. */ if ((object->flags & OBJ_MIGHTBEDIRTY) == 0 || object->resident_page_count == 0) return (TRUE); pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) != 0 ? VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK; pagerflags |= (flags & OBJPC_INVAL) != 0 ? VM_PAGER_PUT_INVAL : 0; tstart = OFF_TO_IDX(start); tend = (end == 0) ? object->size : OFF_TO_IDX(end + PAGE_MASK); clearobjflags = tstart == 0 && tend >= object->size; res = TRUE; rescan: curgeneration = object->generation; for (p = vm_page_find_least(object, tstart); p != NULL; p = np) { pi = p->pindex; if (pi >= tend) break; np = TAILQ_NEXT(p, listq); if (p->valid == 0) continue; if (vm_page_sleep_if_busy(p, "vpcwai")) { if (object->generation != curgeneration) { if ((flags & OBJPC_SYNC) != 0) goto rescan; else clearobjflags = FALSE; } np = vm_page_find_least(object, pi); continue; } if (!vm_object_page_remove_write(p, flags, &clearobjflags)) continue; n = vm_object_page_collect_flush(object, p, pagerflags, flags, &clearobjflags, &eio); if (eio) { res = FALSE; clearobjflags = FALSE; } if (object->generation != curgeneration) { if ((flags & OBJPC_SYNC) != 0) goto rescan; else clearobjflags = FALSE; } /* * If the VOP_PUTPAGES() did a truncated write, so * that even the first page of the run is not fully * written, vm_pageout_flush() returns 0 as the run * length. Since the condition that caused truncated * write may be permanent, e.g. exhausted free space, * accepting n == 0 would cause an infinite loop. * * Forwarding the iterator leaves the unwritten page * behind, but there is not much we can do there if * filesystem refuses to write it. */ if (n == 0) { n = 1; clearobjflags = FALSE; } np = vm_page_find_least(object, pi + n); } #if 0 VOP_FSYNC(vp, (pagerflags & VM_PAGER_PUT_SYNC) ? MNT_WAIT : 0); #endif if (clearobjflags) vm_object_clear_flag(object, OBJ_MIGHTBEDIRTY); return (res); } static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags, int flags, boolean_t *clearobjflags, boolean_t *eio) { vm_page_t ma[vm_pageout_page_count], p_first, tp; int count, i, mreq, runlen; vm_page_lock_assert(p, MA_NOTOWNED); VM_OBJECT_ASSERT_WLOCKED(object); count = 1; mreq = 0; for (tp = p; count < vm_pageout_page_count; count++) { tp = vm_page_next(tp); if (tp == NULL || vm_page_busied(tp)) break; if (!vm_object_page_remove_write(tp, flags, clearobjflags)) break; } for (p_first = p; count < vm_pageout_page_count; count++) { tp = vm_page_prev(p_first); if (tp == NULL || vm_page_busied(tp)) break; if (!vm_object_page_remove_write(tp, flags, clearobjflags)) break; p_first = tp; mreq++; } for (tp = p_first, i = 0; i < count; tp = TAILQ_NEXT(tp, listq), i++) ma[i] = tp; vm_pageout_flush(ma, count, pagerflags, mreq, &runlen, eio); return (runlen); } /* * Note that there is absolutely no sense in writing out * anonymous objects, so we track down the vnode object * to write out. * We invalidate (remove) all pages from the address space * for semantic correctness. * * If the backing object is a device object with unmanaged pages, then any * mappings to the specified range of pages must be removed before this * function is called. * * Note: certain anonymous maps, such as MAP_NOSYNC maps, * may start out with a NULL object. */ boolean_t vm_object_sync(vm_object_t object, vm_ooffset_t offset, vm_size_t size, boolean_t syncio, boolean_t invalidate) { vm_object_t backing_object; struct vnode *vp; struct mount *mp; int error, flags, fsync_after; boolean_t res; if (object == NULL) return (TRUE); res = TRUE; error = 0; VM_OBJECT_WLOCK(object); while ((backing_object = object->backing_object) != NULL) { VM_OBJECT_WLOCK(backing_object); offset += object->backing_object_offset; VM_OBJECT_WUNLOCK(object); object = backing_object; if (object->size < OFF_TO_IDX(offset + size)) size = IDX_TO_OFF(object->size) - offset; } /* * Flush pages if writing is allowed, invalidate them * if invalidation requested. Pages undergoing I/O * will be ignored by vm_object_page_remove(). * * We cannot lock the vnode and then wait for paging * to complete without deadlocking against vm_fault. * Instead we simply call vm_object_page_remove() and * allow it to block internally on a page-by-page * basis when it encounters pages undergoing async * I/O. */ if (object->type == OBJT_VNODE && (object->flags & OBJ_MIGHTBEDIRTY) != 0) { vp = object->handle; VM_OBJECT_WUNLOCK(object); (void) vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (syncio && !invalidate && offset == 0 && OFF_TO_IDX(size) == object->size) { /* * If syncing the whole mapping of the file, * it is faster to schedule all the writes in * async mode, also allowing the clustering, * and then wait for i/o to complete. */ flags = 0; fsync_after = TRUE; } else { flags = (syncio || invalidate) ? OBJPC_SYNC : 0; flags |= invalidate ? (OBJPC_SYNC | OBJPC_INVAL) : 0; fsync_after = FALSE; } VM_OBJECT_WLOCK(object); res = vm_object_page_clean(object, offset, offset + size, flags); VM_OBJECT_WUNLOCK(object); if (fsync_after) error = VOP_FSYNC(vp, MNT_WAIT, curthread); VOP_UNLOCK(vp, 0); vn_finished_write(mp); if (error != 0) res = FALSE; VM_OBJECT_WLOCK(object); } if ((object->type == OBJT_VNODE || object->type == OBJT_DEVICE) && invalidate) { if (object->type == OBJT_DEVICE) /* * The option OBJPR_NOTMAPPED must be passed here * because vm_object_page_remove() cannot remove * unmanaged mappings. */ flags = OBJPR_NOTMAPPED; else if (old_msync) flags = 0; else flags = OBJPR_CLEANONLY; vm_object_page_remove(object, OFF_TO_IDX(offset), OFF_TO_IDX(offset + size + PAGE_MASK), flags); } VM_OBJECT_WUNLOCK(object); return (res); } /* * vm_object_madvise: * * Implements the madvise function at the object/page level. * * MADV_WILLNEED (any object) * * Activate the specified pages if they are resident. * * MADV_DONTNEED (any object) * * Deactivate the specified pages if they are resident. * * MADV_FREE (OBJT_DEFAULT/OBJT_SWAP objects, * OBJ_ONEMAPPING only) * * Deactivate and clean the specified pages if they are * resident. This permits the process to reuse the pages * without faulting or the kernel to reclaim the pages * without I/O. */ void vm_object_madvise(vm_object_t object, vm_pindex_t pindex, vm_pindex_t end, int advise) { vm_pindex_t tpindex; vm_object_t backing_object, tobject; vm_page_t m; if (object == NULL) return; VM_OBJECT_WLOCK(object); /* * Locate and adjust resident pages */ for (; pindex < end; pindex += 1) { relookup: tobject = object; tpindex = pindex; shadowlookup: /* * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages * and those pages must be OBJ_ONEMAPPING. */ if (advise == MADV_FREE) { if ((tobject->type != OBJT_DEFAULT && tobject->type != OBJT_SWAP) || (tobject->flags & OBJ_ONEMAPPING) == 0) { goto unlock_tobject; } } else if ((tobject->flags & OBJ_UNMANAGED) != 0) goto unlock_tobject; m = vm_page_lookup(tobject, tpindex); if (m == NULL && advise == MADV_WILLNEED) { /* * If the page is cached, reactivate it. */ m = vm_page_alloc(tobject, tpindex, VM_ALLOC_IFCACHED | VM_ALLOC_NOBUSY); } if (m == NULL) { /* * There may be swap even if there is no backing page */ if (advise == MADV_FREE && tobject->type == OBJT_SWAP) swap_pager_freespace(tobject, tpindex, 1); /* * next object */ backing_object = tobject->backing_object; if (backing_object == NULL) goto unlock_tobject; VM_OBJECT_WLOCK(backing_object); tpindex += OFF_TO_IDX(tobject->backing_object_offset); if (tobject != object) VM_OBJECT_WUNLOCK(tobject); tobject = backing_object; goto shadowlookup; } else if (m->valid != VM_PAGE_BITS_ALL) goto unlock_tobject; /* * If the page is not in a normal state, skip it. */ vm_page_lock(m); if (m->hold_count != 0 || m->wire_count != 0) { vm_page_unlock(m); goto unlock_tobject; } KASSERT((m->flags & PG_FICTITIOUS) == 0, ("vm_object_madvise: page %p is fictitious", m)); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("vm_object_madvise: page %p is not managed", m)); if (vm_page_busied(m)) { if (advise == MADV_WILLNEED) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_aflag_set(m, PGA_REFERENCED); } if (object != tobject) VM_OBJECT_WUNLOCK(object); VM_OBJECT_WUNLOCK(tobject); vm_page_busy_sleep(m, "madvpo"); VM_OBJECT_WLOCK(object); goto relookup; } if (advise == MADV_WILLNEED) { vm_page_activate(m); } else { vm_page_advise(m, advise); } vm_page_unlock(m); if (advise == MADV_FREE && tobject->type == OBJT_SWAP) swap_pager_freespace(tobject, tpindex, 1); unlock_tobject: if (tobject != object) VM_OBJECT_WUNLOCK(tobject); } VM_OBJECT_WUNLOCK(object); } /* * vm_object_shadow: * * Create a new object which is backed by the * specified existing object range. The source * object reference is deallocated. * * The new object and offset into that object * are returned in the source parameters. */ void vm_object_shadow( vm_object_t *object, /* IN/OUT */ vm_ooffset_t *offset, /* IN/OUT */ vm_size_t length) { vm_object_t source; vm_object_t result; source = *object; /* * Don't create the new object if the old object isn't shared. */ if (source != NULL) { VM_OBJECT_WLOCK(source); if (source->ref_count == 1 && source->handle == NULL && (source->type == OBJT_DEFAULT || source->type == OBJT_SWAP)) { VM_OBJECT_WUNLOCK(source); return; } VM_OBJECT_WUNLOCK(source); } /* * Allocate a new object with the given length. */ result = vm_object_allocate(OBJT_DEFAULT, atop(length)); /* * The new object shadows the source object, adding a reference to it. * Our caller changes his reference to point to the new object, * removing a reference to the source object. Net result: no change * of reference count. * * Try to optimize the result object's page color when shadowing * in order to maintain page coloring consistency in the combined * shadowed object. */ result->backing_object = source; /* * Store the offset into the source object, and fix up the offset into * the new object. */ result->backing_object_offset = *offset; if (source != NULL) { VM_OBJECT_WLOCK(source); LIST_INSERT_HEAD(&source->shadow_head, result, shadow_list); source->shadow_count++; #if VM_NRESERVLEVEL > 0 result->flags |= source->flags & OBJ_COLORED; result->pg_color = (source->pg_color + OFF_TO_IDX(*offset)) & ((1 << (VM_NFREEORDER - 1)) - 1); #endif VM_OBJECT_WUNLOCK(source); } /* * Return the new things */ *offset = 0; *object = result; } /* * vm_object_split: * * Split the pages in a map entry into a new object. This affords * easier removal of unused pages, and keeps object inheritance from * being a negative impact on memory usage. */ void vm_object_split(vm_map_entry_t entry) { vm_page_t m, m_next; vm_object_t orig_object, new_object, source; vm_pindex_t idx, offidxstart; vm_size_t size; orig_object = entry->object.vm_object; if (orig_object->type != OBJT_DEFAULT && orig_object->type != OBJT_SWAP) return; if (orig_object->ref_count <= 1) return; VM_OBJECT_WUNLOCK(orig_object); offidxstart = OFF_TO_IDX(entry->offset); size = atop(entry->end - entry->start); /* * If swap_pager_copy() is later called, it will convert new_object * into a swap object. */ new_object = vm_object_allocate(OBJT_DEFAULT, size); /* * At this point, the new object is still private, so the order in * which the original and new objects are locked does not matter. */ VM_OBJECT_WLOCK(new_object); VM_OBJECT_WLOCK(orig_object); source = orig_object->backing_object; if (source != NULL) { VM_OBJECT_WLOCK(source); if ((source->flags & OBJ_DEAD) != 0) { VM_OBJECT_WUNLOCK(source); VM_OBJECT_WUNLOCK(orig_object); VM_OBJECT_WUNLOCK(new_object); vm_object_deallocate(new_object); VM_OBJECT_WLOCK(orig_object); return; } LIST_INSERT_HEAD(&source->shadow_head, new_object, shadow_list); source->shadow_count++; vm_object_reference_locked(source); /* for new_object */ vm_object_clear_flag(source, OBJ_ONEMAPPING); VM_OBJECT_WUNLOCK(source); new_object->backing_object_offset = orig_object->backing_object_offset + entry->offset; new_object->backing_object = source; } if (orig_object->cred != NULL) { new_object->cred = orig_object->cred; crhold(orig_object->cred); new_object->charge = ptoa(size); KASSERT(orig_object->charge >= ptoa(size), ("orig_object->charge < 0")); orig_object->charge -= ptoa(size); } retry: m = vm_page_find_least(orig_object, offidxstart); for (; m != NULL && (idx = m->pindex - offidxstart) < size; m = m_next) { m_next = TAILQ_NEXT(m, listq); /* * We must wait for pending I/O to complete before we can * rename the page. * * We do not have to VM_PROT_NONE the page as mappings should * not be changed by this operation. */ if (vm_page_busied(m)) { VM_OBJECT_WUNLOCK(new_object); vm_page_lock(m); VM_OBJECT_WUNLOCK(orig_object); vm_page_busy_sleep(m, "spltwt"); VM_OBJECT_WLOCK(orig_object); VM_OBJECT_WLOCK(new_object); goto retry; } /* vm_page_rename() will handle dirty and cache. */ if (vm_page_rename(m, new_object, idx)) { VM_OBJECT_WUNLOCK(new_object); VM_OBJECT_WUNLOCK(orig_object); VM_WAIT; VM_OBJECT_WLOCK(orig_object); VM_OBJECT_WLOCK(new_object); goto retry; } #if VM_NRESERVLEVEL > 0 /* * If some of the reservation's allocated pages remain with * the original object, then transferring the reservation to * the new object is neither particularly beneficial nor * particularly harmful as compared to leaving the reservation * with the original object. If, however, all of the * reservation's allocated pages are transferred to the new * object, then transferring the reservation is typically * beneficial. Determining which of these two cases applies * would be more costly than unconditionally renaming the * reservation. */ vm_reserv_rename(m, new_object, orig_object, offidxstart); #endif if (orig_object->type == OBJT_SWAP) vm_page_xbusy(m); } if (orig_object->type == OBJT_SWAP) { /* * swap_pager_copy() can sleep, in which case the orig_object's * and new_object's locks are released and reacquired. */ swap_pager_copy(orig_object, new_object, offidxstart, 0); TAILQ_FOREACH(m, &new_object->memq, listq) vm_page_xunbusy(m); /* * Transfer any cached pages from orig_object to new_object. * If swap_pager_copy() found swapped out pages within the * specified range of orig_object, then it changed * new_object's type to OBJT_SWAP when it transferred those * pages to new_object. Otherwise, new_object's type * should still be OBJT_DEFAULT and orig_object should not * contain any cached pages within the specified range. */ if (__predict_false(!vm_object_cache_is_empty(orig_object))) vm_page_cache_transfer(orig_object, offidxstart, new_object); } VM_OBJECT_WUNLOCK(orig_object); VM_OBJECT_WUNLOCK(new_object); entry->object.vm_object = new_object; entry->offset = 0LL; vm_object_deallocate(orig_object); VM_OBJECT_WLOCK(new_object); } #define OBSC_COLLAPSE_NOWAIT 0x0002 #define OBSC_COLLAPSE_WAIT 0x0004 static vm_page_t vm_object_collapse_scan_wait(vm_object_t object, vm_page_t p, vm_page_t next, int op) { vm_object_t backing_object; VM_OBJECT_ASSERT_WLOCKED(object); backing_object = object->backing_object; VM_OBJECT_ASSERT_WLOCKED(backing_object); KASSERT(p == NULL || vm_page_busied(p), ("unbusy page %p", p)); KASSERT(p == NULL || p->object == object || p->object == backing_object, ("invalid ownership %p %p %p", p, object, backing_object)); if ((op & OBSC_COLLAPSE_NOWAIT) != 0) return (next); if (p != NULL) vm_page_lock(p); VM_OBJECT_WUNLOCK(object); VM_OBJECT_WUNLOCK(backing_object); if (p == NULL) VM_WAIT; else vm_page_busy_sleep(p, "vmocol"); VM_OBJECT_WLOCK(object); VM_OBJECT_WLOCK(backing_object); return (TAILQ_FIRST(&backing_object->memq)); } static bool vm_object_scan_all_shadowed(vm_object_t object) { vm_object_t backing_object; vm_page_t p, pp; vm_pindex_t backing_offset_index, new_pindex; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(object->backing_object); backing_object = object->backing_object; /* * Initial conditions: * * We do not want to have to test for the existence of cache or swap * pages in the backing object. XXX but with the new swapper this * would be pretty easy to do. */ if (backing_object->type != OBJT_DEFAULT) return (false); backing_offset_index = OFF_TO_IDX(object->backing_object_offset); for (p = TAILQ_FIRST(&backing_object->memq); p != NULL; p = TAILQ_NEXT(p, listq)) { new_pindex = p->pindex - backing_offset_index; /* * Ignore pages outside the parent object's range and outside * the parent object's mapping of the backing object. */ if (p->pindex < backing_offset_index || new_pindex >= object->size) continue; /* * See if the parent has the page or if the parent's object * pager has the page. If the parent has the page but the page * is not valid, the parent's object pager must have the page. * * If this fails, the parent does not completely shadow the * object and we might as well give up now. */ pp = vm_page_lookup(object, new_pindex); if ((pp == NULL || pp->valid == 0) && !vm_pager_has_page(object, new_pindex, NULL, NULL)) return (false); } return (true); } static bool vm_object_collapse_scan(vm_object_t object, int op) { vm_object_t backing_object; vm_page_t next, p, pp; vm_pindex_t backing_offset_index, new_pindex; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(object->backing_object); backing_object = object->backing_object; backing_offset_index = OFF_TO_IDX(object->backing_object_offset); /* * Initial conditions */ if ((op & OBSC_COLLAPSE_WAIT) != 0) vm_object_set_flag(backing_object, OBJ_DEAD); /* * Our scan */ for (p = TAILQ_FIRST(&backing_object->memq); p != NULL; p = next) { next = TAILQ_NEXT(p, listq); new_pindex = p->pindex - backing_offset_index; /* * Check for busy page */ if (vm_page_busied(p)) { next = vm_object_collapse_scan_wait(object, p, next, op); continue; } KASSERT(p->object == backing_object, ("vm_object_collapse_scan: object mismatch")); if (p->pindex < backing_offset_index || new_pindex >= object->size) { if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, p->pindex, 1); /* * Page is out of the parent object's range, we can * simply destroy it. */ vm_page_lock(p); KASSERT(!pmap_page_is_mapped(p), ("freeing mapped page %p", p)); if (p->wire_count == 0) vm_page_free(p); else vm_page_remove(p); vm_page_unlock(p); continue; } pp = vm_page_lookup(object, new_pindex); if (pp != NULL && vm_page_busied(pp)) { /* * The page in the parent is busy and possibly not * (yet) valid. Until its state is finalized by the * busy bit owner, we can't tell whether it shadows the * original page. Therefore, we must either skip it * and the original (backing_object) page or wait for * its state to be finalized. * * This is due to a race with vm_fault() where we must * unbusy the original (backing_obj) page before we can * (re)lock the parent. Hence we can get here. */ next = vm_object_collapse_scan_wait(object, pp, next, op); continue; } KASSERT(pp == NULL || pp->valid != 0, ("unbusy invalid page %p", pp)); if (pp != NULL || vm_pager_has_page(object, new_pindex, NULL, NULL)) { /* * The page already exists in the parent OR swap exists * for this location in the parent. Leave the parent's * page alone. Destroy the original page from the * backing object. */ if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, p->pindex, 1); vm_page_lock(p); KASSERT(!pmap_page_is_mapped(p), ("freeing mapped page %p", p)); if (p->wire_count == 0) vm_page_free(p); else vm_page_remove(p); vm_page_unlock(p); continue; } /* * Page does not exist in parent, rename the page from the * backing object to the main object. * * If the page was mapped to a process, it can remain mapped * through the rename. vm_page_rename() will handle dirty and * cache. */ if (vm_page_rename(p, object, new_pindex)) { next = vm_object_collapse_scan_wait(object, NULL, next, op); continue; } /* Use the old pindex to free the right page. */ if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, new_pindex + backing_offset_index, 1); #if VM_NRESERVLEVEL > 0 /* * Rename the reservation. */ vm_reserv_rename(p, object, backing_object, backing_offset_index); #endif } return (true); } /* * this version of collapse allows the operation to occur earlier and * when paging_in_progress is true for an object... This is not a complete * operation, but should plug 99.9% of the rest of the leaks. */ static void vm_object_qcollapse(vm_object_t object) { vm_object_t backing_object = object->backing_object; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(backing_object); if (backing_object->ref_count != 1) return; vm_object_collapse_scan(object, OBSC_COLLAPSE_NOWAIT); } /* * vm_object_collapse: * * Collapse an object with the object backing it. * Pages in the backing object are moved into the * parent, and the backing object is deallocated. */ void vm_object_collapse(vm_object_t object) { VM_OBJECT_ASSERT_WLOCKED(object); while (TRUE) { vm_object_t backing_object; /* * Verify that the conditions are right for collapse: * * The object exists and the backing object exists. */ if ((backing_object = object->backing_object) == NULL) break; /* * we check the backing object first, because it is most likely * not collapsable. */ VM_OBJECT_WLOCK(backing_object); if (backing_object->handle != NULL || (backing_object->type != OBJT_DEFAULT && backing_object->type != OBJT_SWAP) || (backing_object->flags & OBJ_DEAD) || object->handle != NULL || (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP) || (object->flags & OBJ_DEAD)) { VM_OBJECT_WUNLOCK(backing_object); break; } if ( object->paging_in_progress != 0 || backing_object->paging_in_progress != 0 ) { vm_object_qcollapse(object); VM_OBJECT_WUNLOCK(backing_object); break; } /* * We know that we can either collapse the backing object (if * the parent is the only reference to it) or (perhaps) have * the parent bypass the object if the parent happens to shadow * all the resident pages in the entire backing object. * * This is ignoring pager-backed pages such as swap pages. * vm_object_collapse_scan fails the shadowing test in this * case. */ if (backing_object->ref_count == 1) { /* * If there is exactly one reference to the backing * object, we can collapse it into the parent. */ vm_object_collapse_scan(object, OBSC_COLLAPSE_WAIT); #if VM_NRESERVLEVEL > 0 /* * Break any reservations from backing_object. */ if (__predict_false(!LIST_EMPTY(&backing_object->rvq))) vm_reserv_break_all(backing_object); #endif /* * Move the pager from backing_object to object. */ if (backing_object->type == OBJT_SWAP) { /* * swap_pager_copy() can sleep, in which case * the backing_object's and object's locks are * released and reacquired. * Since swap_pager_copy() is being asked to * destroy the source, it will change the * backing_object's type to OBJT_DEFAULT. */ swap_pager_copy( backing_object, object, OFF_TO_IDX(object->backing_object_offset), TRUE); /* * Free any cached pages from backing_object. */ if (__predict_false( !vm_object_cache_is_empty(backing_object))) vm_page_cache_free(backing_object, 0, 0); } /* * Object now shadows whatever backing_object did. * Note that the reference to * backing_object->backing_object moves from within * backing_object to within object. */ LIST_REMOVE(object, shadow_list); backing_object->shadow_count--; if (backing_object->backing_object) { VM_OBJECT_WLOCK(backing_object->backing_object); LIST_REMOVE(backing_object, shadow_list); LIST_INSERT_HEAD( &backing_object->backing_object->shadow_head, object, shadow_list); /* * The shadow_count has not changed. */ VM_OBJECT_WUNLOCK(backing_object->backing_object); } object->backing_object = backing_object->backing_object; object->backing_object_offset += backing_object->backing_object_offset; /* * Discard backing_object. * * Since the backing object has no pages, no pager left, * and no object references within it, all that is * necessary is to dispose of it. */ KASSERT(backing_object->ref_count == 1, ( "backing_object %p was somehow re-referenced during collapse!", backing_object)); backing_object->type = OBJT_DEAD; backing_object->ref_count = 0; VM_OBJECT_WUNLOCK(backing_object); vm_object_destroy(backing_object); object_collapses++; } else { vm_object_t new_backing_object; /* * If we do not entirely shadow the backing object, * there is nothing we can do so we give up. */ if (object->resident_page_count != object->size && !vm_object_scan_all_shadowed(object)) { VM_OBJECT_WUNLOCK(backing_object); break; } /* * Make the parent shadow the next object in the * chain. Deallocating backing_object will not remove * it, since its reference count is at least 2. */ LIST_REMOVE(object, shadow_list); backing_object->shadow_count--; new_backing_object = backing_object->backing_object; if ((object->backing_object = new_backing_object) != NULL) { VM_OBJECT_WLOCK(new_backing_object); LIST_INSERT_HEAD( &new_backing_object->shadow_head, object, shadow_list ); new_backing_object->shadow_count++; vm_object_reference_locked(new_backing_object); VM_OBJECT_WUNLOCK(new_backing_object); object->backing_object_offset += backing_object->backing_object_offset; } /* * Drop the reference count on backing_object. Since * its ref_count was at least 2, it will not vanish. */ backing_object->ref_count--; VM_OBJECT_WUNLOCK(backing_object); object_bypasses++; } /* * Try again with this object's new backing object. */ } } /* * vm_object_page_remove: * * For the given object, either frees or invalidates each of the * specified pages. In general, a page is freed. However, if a page is * wired for any reason other than the existence of a managed, wired * mapping, then it may be invalidated but not removed from the object. * Pages are specified by the given range ["start", "end") and the option * OBJPR_CLEANONLY. As a special case, if "end" is zero, then the range * extends from "start" to the end of the object. If the option * OBJPR_CLEANONLY is specified, then only the non-dirty pages within the * specified range are affected. If the option OBJPR_NOTMAPPED is * specified, then the pages within the specified range must have no * mappings. Otherwise, if this option is not specified, any mappings to * the specified pages are removed before the pages are freed or * invalidated. * * In general, this operation should only be performed on objects that * contain managed pages. There are, however, two exceptions. First, it * is performed on the kernel and kmem objects by vm_map_entry_delete(). * Second, it is used by msync(..., MS_INVALIDATE) to invalidate device- * backed pages. In both of these cases, the option OBJPR_CLEANONLY must * not be specified and the option OBJPR_NOTMAPPED must be specified. * * The object must be locked. */ void vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int options) { vm_page_t p, next; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_UNMANAGED) == 0 || (options & (OBJPR_CLEANONLY | OBJPR_NOTMAPPED)) == OBJPR_NOTMAPPED, ("vm_object_page_remove: illegal options for object %p", object)); if (object->resident_page_count == 0) goto skipmemq; vm_object_pip_add(object, 1); again: p = vm_page_find_least(object, start); /* * Here, the variable "p" is either (1) the page with the least pindex * greater than or equal to the parameter "start" or (2) NULL. */ for (; p != NULL && (p->pindex < end || end == 0); p = next) { next = TAILQ_NEXT(p, listq); /* * If the page is wired for any reason besides the existence * of managed, wired mappings, then it cannot be freed. For * example, fictitious pages, which represent device memory, * are inherently wired and cannot be freed. They can, * however, be invalidated if the option OBJPR_CLEANONLY is * not specified. */ vm_page_lock(p); if (vm_page_xbusied(p)) { VM_OBJECT_WUNLOCK(object); vm_page_busy_sleep(p, "vmopax"); VM_OBJECT_WLOCK(object); goto again; } if (p->wire_count != 0) { if ((options & OBJPR_NOTMAPPED) == 0) pmap_remove_all(p); if ((options & OBJPR_CLEANONLY) == 0) { p->valid = 0; vm_page_undirty(p); } goto next; } if (vm_page_busied(p)) { VM_OBJECT_WUNLOCK(object); vm_page_busy_sleep(p, "vmopar"); VM_OBJECT_WLOCK(object); goto again; } KASSERT((p->flags & PG_FICTITIOUS) == 0, ("vm_object_page_remove: page %p is fictitious", p)); if ((options & OBJPR_CLEANONLY) != 0 && p->valid != 0) { if ((options & OBJPR_NOTMAPPED) == 0) pmap_remove_write(p); if (p->dirty) goto next; } if ((options & OBJPR_NOTMAPPED) == 0) pmap_remove_all(p); vm_page_free(p); next: vm_page_unlock(p); } vm_object_pip_wakeup(object); skipmemq: if (__predict_false(!vm_object_cache_is_empty(object))) vm_page_cache_free(object, start, end); } /* * vm_object_page_noreuse: * * For the given object, attempt to move the specified pages to * the head of the inactive queue. This bypasses regular LRU * operation and allows the pages to be reused quickly under memory * pressure. If a page is wired for any reason, then it will not * be queued. Pages are specified by the range ["start", "end"). * As a special case, if "end" is zero, then the range extends from * "start" to the end of the object. * * This operation should only be performed on objects that * contain non-fictitious, managed pages. * * The object must be locked. */ void vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, vm_pindex_t end) { struct mtx *mtx, *new_mtx; vm_page_t p, next; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0, ("vm_object_page_noreuse: illegal object %p", object)); if (object->resident_page_count == 0) return; p = vm_page_find_least(object, start); /* * Here, the variable "p" is either (1) the page with the least pindex * greater than or equal to the parameter "start" or (2) NULL. */ mtx = NULL; for (; p != NULL && (p->pindex < end || end == 0); p = next) { next = TAILQ_NEXT(p, listq); /* * Avoid releasing and reacquiring the same page lock. */ new_mtx = vm_page_lockptr(p); if (mtx != new_mtx) { if (mtx != NULL) mtx_unlock(mtx); mtx = new_mtx; mtx_lock(mtx); } vm_page_deactivate_noreuse(p); } if (mtx != NULL) mtx_unlock(mtx); } /* * Populate the specified range of the object with valid pages. Returns * TRUE if the range is successfully populated and FALSE otherwise. * * Note: This function should be optimized to pass a larger array of * pages to vm_pager_get_pages() before it is applied to a non- * OBJT_DEVICE object. * * The object must be locked. */ boolean_t vm_object_populate(vm_object_t object, vm_pindex_t start, vm_pindex_t end) { vm_page_t m; vm_pindex_t pindex; int rv; VM_OBJECT_ASSERT_WLOCKED(object); for (pindex = start; pindex < end; pindex++) { m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL); if (m->valid != VM_PAGE_BITS_ALL) { rv = vm_pager_get_pages(object, &m, 1, NULL, NULL); if (rv != VM_PAGER_OK) { vm_page_lock(m); vm_page_free(m); vm_page_unlock(m); break; } } /* * Keep "m" busy because a subsequent iteration may unlock * the object. */ } if (pindex > start) { m = vm_page_lookup(object, start); while (m != NULL && m->pindex < pindex) { vm_page_xunbusy(m); m = TAILQ_NEXT(m, listq); } } return (pindex == end); } /* * Routine: vm_object_coalesce * Function: Coalesces two objects backing up adjoining * regions of memory into a single object. * * returns TRUE if objects were combined. * * NOTE: Only works at the moment if the second object is NULL - * if it's not, which object do we lock first? * * Parameters: * prev_object First object to coalesce * prev_offset Offset into prev_object * prev_size Size of reference to prev_object * next_size Size of reference to the second object * reserved Indicator that extension region has * swap accounted for * * Conditions: * The object must *not* be locked. */ boolean_t vm_object_coalesce(vm_object_t prev_object, vm_ooffset_t prev_offset, vm_size_t prev_size, vm_size_t next_size, boolean_t reserved) { vm_pindex_t next_pindex; if (prev_object == NULL) return (TRUE); VM_OBJECT_WLOCK(prev_object); if ((prev_object->type != OBJT_DEFAULT && prev_object->type != OBJT_SWAP) || (prev_object->flags & OBJ_TMPFS_NODE) != 0) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } /* * Try to collapse the object first */ vm_object_collapse(prev_object); /* * Can't coalesce if: . more than one reference . paged out . shadows * another object . has a copy elsewhere (any of which mean that the * pages not mapped to prev_entry may be in use anyway) */ if (prev_object->backing_object != NULL) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } prev_size >>= PAGE_SHIFT; next_size >>= PAGE_SHIFT; next_pindex = OFF_TO_IDX(prev_offset) + prev_size; if ((prev_object->ref_count > 1) && (prev_object->size != next_pindex)) { VM_OBJECT_WUNLOCK(prev_object); return (FALSE); } /* * Account for the charge. */ if (prev_object->cred != NULL) { /* * If prev_object was charged, then this mapping, * although not charged now, may become writable * later. Non-NULL cred in the object would prevent * swap reservation during enabling of the write * access, so reserve swap now. Failed reservation * cause allocation of the separate object for the map * entry, and swap reservation for this entry is * managed in appropriate time. */ if (!reserved && !swap_reserve_by_cred(ptoa(next_size), prev_object->cred)) { return (FALSE); } prev_object->charge += ptoa(next_size); } /* * Remove any pages that may still be in the object from a previous * deallocation. */ if (next_pindex < prev_object->size) { vm_object_page_remove(prev_object, next_pindex, next_pindex + next_size, 0); if (prev_object->type == OBJT_SWAP) swap_pager_freespace(prev_object, next_pindex, next_size); #if 0 if (prev_object->cred != NULL) { KASSERT(prev_object->charge >= ptoa(prev_object->size - next_pindex), ("object %p overcharged 1 %jx %jx", prev_object, (uintmax_t)next_pindex, (uintmax_t)next_size)); prev_object->charge -= ptoa(prev_object->size - next_pindex); } #endif } /* * Extend the object if necessary. */ if (next_pindex + next_size > prev_object->size) prev_object->size = next_pindex + next_size; VM_OBJECT_WUNLOCK(prev_object); return (TRUE); } void vm_object_set_writeable_dirty(vm_object_t object) { VM_OBJECT_ASSERT_WLOCKED(object); if (object->type != OBJT_VNODE) { if ((object->flags & OBJ_TMPFS_NODE) != 0) { KASSERT(object->type == OBJT_SWAP, ("non-swap tmpfs")); vm_object_set_flag(object, OBJ_TMPFS_DIRTY); } return; } object->generation++; if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) return; vm_object_set_flag(object, OBJ_MIGHTBEDIRTY); } /* * vm_object_unwire: * * For each page offset within the specified range of the given object, * find the highest-level page in the shadow chain and unwire it. A page * must exist at every page offset, and the highest-level page must be * wired. */ void vm_object_unwire(vm_object_t object, vm_ooffset_t offset, vm_size_t length, uint8_t queue) { vm_object_t tobject; vm_page_t m, tm; vm_pindex_t end_pindex, pindex, tpindex; int depth, locked_depth; KASSERT((offset & PAGE_MASK) == 0, ("vm_object_unwire: offset is not page aligned")); KASSERT((length & PAGE_MASK) == 0, ("vm_object_unwire: length is not a multiple of PAGE_SIZE")); /* The wired count of a fictitious page never changes. */ if ((object->flags & OBJ_FICTITIOUS) != 0) return; pindex = OFF_TO_IDX(offset); end_pindex = pindex + atop(length); locked_depth = 1; VM_OBJECT_RLOCK(object); m = vm_page_find_least(object, pindex); while (pindex < end_pindex) { if (m == NULL || pindex < m->pindex) { /* * The first object in the shadow chain doesn't * contain a page at the current index. Therefore, * the page must exist in a backing object. */ tobject = object; tpindex = pindex; depth = 0; do { tpindex += OFF_TO_IDX(tobject->backing_object_offset); tobject = tobject->backing_object; KASSERT(tobject != NULL, ("vm_object_unwire: missing page")); if ((tobject->flags & OBJ_FICTITIOUS) != 0) goto next_page; depth++; if (depth == locked_depth) { locked_depth++; VM_OBJECT_RLOCK(tobject); } } while ((tm = vm_page_lookup(tobject, tpindex)) == NULL); } else { tm = m; m = TAILQ_NEXT(m, listq); } vm_page_lock(tm); vm_page_unwire(tm, queue); vm_page_unlock(tm); next_page: pindex++; } /* Release the accumulated object locks. */ for (depth = 0; depth < locked_depth; depth++) { tobject = object->backing_object; VM_OBJECT_RUNLOCK(object); object = tobject; } } struct vnode * vm_object_vnode(vm_object_t object) { VM_OBJECT_ASSERT_LOCKED(object); if (object->type == OBJT_VNODE) return (object->handle); if (object->type == OBJT_SWAP && (object->flags & OBJ_TMPFS) != 0) return (object->un_pager.swp.swp_tmpfs); return (NULL); } static int sysctl_vm_object_list(SYSCTL_HANDLER_ARGS) { struct kinfo_vmobject kvo; char *fullpath, *freepath; struct vnode *vp; struct vattr va; vm_object_t obj; vm_page_t m; int count, error; if (req->oldptr == NULL) { /* * If an old buffer has not been provided, generate an * estimate of the space needed for a subsequent call. */ mtx_lock(&vm_object_list_mtx); count = 0; TAILQ_FOREACH(obj, &vm_object_list, object_list) { if (obj->type == OBJT_DEAD) continue; count++; } mtx_unlock(&vm_object_list_mtx); return (SYSCTL_OUT(req, NULL, sizeof(struct kinfo_vmobject) * count * 11 / 10)); } error = 0; /* * VM objects are type stable and are never removed from the * list once added. This allows us to safely read obj->object_list * after reacquiring the VM object lock. */ mtx_lock(&vm_object_list_mtx); TAILQ_FOREACH(obj, &vm_object_list, object_list) { if (obj->type == OBJT_DEAD) continue; VM_OBJECT_RLOCK(obj); if (obj->type == OBJT_DEAD) { VM_OBJECT_RUNLOCK(obj); continue; } mtx_unlock(&vm_object_list_mtx); kvo.kvo_size = ptoa(obj->size); kvo.kvo_resident = obj->resident_page_count; kvo.kvo_ref_count = obj->ref_count; kvo.kvo_shadow_count = obj->shadow_count; kvo.kvo_memattr = obj->memattr; kvo.kvo_active = 0; kvo.kvo_inactive = 0; TAILQ_FOREACH(m, &obj->memq, listq) { /* * A page may belong to the object but be * dequeued and set to PQ_NONE while the * object lock is not held. This makes the * reads of m->queue below racy, and we do not * count pages set to PQ_NONE. However, this * sysctl is only meant to give an * approximation of the system anyway. */ if (m->queue == PQ_ACTIVE) kvo.kvo_active++; else if (m->queue == PQ_INACTIVE) kvo.kvo_inactive++; } kvo.kvo_vn_fileid = 0; kvo.kvo_vn_fsid = 0; freepath = NULL; fullpath = ""; vp = NULL; switch (obj->type) { case OBJT_DEFAULT: kvo.kvo_type = KVME_TYPE_DEFAULT; break; case OBJT_VNODE: kvo.kvo_type = KVME_TYPE_VNODE; vp = obj->handle; vref(vp); break; case OBJT_SWAP: kvo.kvo_type = KVME_TYPE_SWAP; break; case OBJT_DEVICE: kvo.kvo_type = KVME_TYPE_DEVICE; break; case OBJT_PHYS: kvo.kvo_type = KVME_TYPE_PHYS; break; case OBJT_DEAD: kvo.kvo_type = KVME_TYPE_DEAD; break; case OBJT_SG: kvo.kvo_type = KVME_TYPE_SG; break; case OBJT_MGTDEVICE: kvo.kvo_type = KVME_TYPE_MGTDEVICE; break; default: kvo.kvo_type = KVME_TYPE_UNKNOWN; break; } VM_OBJECT_RUNLOCK(obj); if (vp != NULL) { vn_fullpath(curthread, vp, &fullpath, &freepath); vn_lock(vp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(vp, &va, curthread->td_ucred) == 0) { kvo.kvo_vn_fileid = va.va_fileid; kvo.kvo_vn_fsid = va.va_fsid; } vput(vp); } strlcpy(kvo.kvo_path, fullpath, sizeof(kvo.kvo_path)); if (freepath != NULL) free(freepath, M_TEMP); /* Pack record size down */ kvo.kvo_structsize = offsetof(struct kinfo_vmobject, kvo_path) + strlen(kvo.kvo_path) + 1; kvo.kvo_structsize = roundup(kvo.kvo_structsize, sizeof(uint64_t)); error = SYSCTL_OUT(req, &kvo, kvo.kvo_structsize); mtx_lock(&vm_object_list_mtx); if (error) break; } mtx_unlock(&vm_object_list_mtx); return (error); } SYSCTL_PROC(_vm, OID_AUTO, objects, CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_object_list, "S,kinfo_vmobject", "List of VM objects"); #include "opt_ddb.h" #ifdef DDB #include #include #include static int _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry) { vm_map_t tmpm; vm_map_entry_t tmpe; vm_object_t obj; int entcount; if (map == 0) return 0; if (entry == 0) { tmpe = map->header.next; entcount = map->nentries; while (entcount-- && (tmpe != &map->header)) { if (_vm_object_in_map(map, object, tmpe)) { return 1; } tmpe = tmpe->next; } } else if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { tmpm = entry->object.sub_map; tmpe = tmpm->header.next; entcount = tmpm->nentries; while (entcount-- && tmpe != &tmpm->header) { if (_vm_object_in_map(tmpm, object, tmpe)) { return 1; } tmpe = tmpe->next; } } else if ((obj = entry->object.vm_object) != NULL) { for (; obj; obj = obj->backing_object) if (obj == object) { return 1; } } return 0; } static int vm_object_in_map(vm_object_t object) { struct proc *p; /* sx_slock(&allproc_lock); */ FOREACH_PROC_IN_SYSTEM(p) { if (!p->p_vmspace /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */) continue; if (_vm_object_in_map(&p->p_vmspace->vm_map, object, 0)) { /* sx_sunlock(&allproc_lock); */ return 1; } } /* sx_sunlock(&allproc_lock); */ if (_vm_object_in_map(kernel_map, object, 0)) return 1; return 0; } DB_SHOW_COMMAND(vmochk, vm_object_check) { vm_object_t object; /* * make sure that internal objs are in a map somewhere * and none have zero ref counts. */ TAILQ_FOREACH(object, &vm_object_list, object_list) { if (object->handle == NULL && (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) { if (object->ref_count == 0) { db_printf("vmochk: internal obj has zero ref count: %ld\n", (long)object->size); } if (!vm_object_in_map(object)) { db_printf( "vmochk: internal obj is not in a map: " "ref: %d, size: %lu: 0x%lx, backing_object: %p\n", object->ref_count, (u_long)object->size, (u_long)object->size, (void *)object->backing_object); } } } } /* * vm_object_print: [ debug ] */ DB_SHOW_COMMAND(object, vm_object_print_static) { /* XXX convert args. */ vm_object_t object = (vm_object_t)addr; boolean_t full = have_addr; vm_page_t p; /* XXX count is an (unused) arg. Avoid shadowing it. */ #define count was_count int count; if (object == NULL) return; db_iprintf( "Object %p: type=%d, size=0x%jx, res=%d, ref=%d, flags=0x%x ruid %d charge %jx\n", object, (int)object->type, (uintmax_t)object->size, object->resident_page_count, object->ref_count, object->flags, object->cred ? object->cred->cr_ruid : -1, (uintmax_t)object->charge); db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%jx\n", object->shadow_count, object->backing_object ? object->backing_object->ref_count : 0, object->backing_object, (uintmax_t)object->backing_object_offset); if (!full) return; db_indent += 2; count = 0; TAILQ_FOREACH(p, &object->memq, listq) { if (count == 0) db_iprintf("memory:="); else if (count == 6) { db_printf("\n"); db_iprintf(" ..."); count = 0; } else db_printf(","); count++; db_printf("(off=0x%jx,page=0x%jx)", (uintmax_t)p->pindex, (uintmax_t)VM_PAGE_TO_PHYS(p)); } if (count != 0) db_printf("\n"); db_indent -= 2; } /* XXX. */ #undef count /* XXX need this non-static entry for calling from vm_map_print. */ void vm_object_print( /* db_expr_t */ long addr, boolean_t have_addr, /* db_expr_t */ long count, char *modif) { vm_object_print_static(addr, have_addr, count, modif); } DB_SHOW_COMMAND(vmopag, vm_object_print_pages) { vm_object_t object; vm_pindex_t fidx; vm_paddr_t pa; vm_page_t m, prev_m; int rcount, nl, c; nl = 0; TAILQ_FOREACH(object, &vm_object_list, object_list) { db_printf("new object: %p\n", (void *)object); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; rcount = 0; fidx = 0; pa = -1; TAILQ_FOREACH(m, &object->memq, listq) { if (m->pindex > 128) break; if ((prev_m = TAILQ_PREV(m, pglist, listq)) != NULL && prev_m->pindex + 1 != m->pindex) { if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; rcount = 0; } } if (rcount && (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) { ++rcount; continue; } if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; } fidx = m->pindex; pa = VM_PAGE_TO_PHYS(m); rcount = 1; } if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; } } } #endif /* DDB */ Index: head/sys/vm/vm_object.h =================================================================== --- head/sys/vm/vm_object.h (revision 300042) +++ head/sys/vm/vm_object.h (revision 300043) @@ -1,334 +1,335 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_object.h 8.3 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * * $FreeBSD$ */ /* * Virtual memory object module definitions. */ #ifndef _VM_OBJECT_ #define _VM_OBJECT_ #include #include #include #include #include /* * Types defined: * * vm_object_t Virtual memory object. * * The root of cached pages pool is protected by both the per-object lock * and the free pages queue mutex. * On insert in the cache radix trie, the per-object lock is expected * to be already held and the free pages queue mutex will be * acquired during the operation too. * On remove and lookup from the cache radix trie, only the free * pages queue mutex is expected to be locked. * These rules allow for reliably checking for the presence of cached * pages with only the per-object lock held, thereby reducing contention * for the free pages queue mutex. * * List of locks * (c) const until freed * (o) per-object lock * (f) free pages queue mutex * */ struct vm_object { struct rwlock lock; TAILQ_ENTRY(vm_object) object_list; /* list of all objects */ LIST_HEAD(, vm_object) shadow_head; /* objects that this is a shadow for */ LIST_ENTRY(vm_object) shadow_list; /* chain of shadow objects */ TAILQ_HEAD(respgs, vm_page) memq; /* list of resident pages */ struct vm_radix rtree; /* root of the resident page radix trie*/ vm_pindex_t size; /* Object size */ int generation; /* generation ID */ int ref_count; /* How many refs?? */ int shadow_count; /* how many objects that this is a shadow for */ vm_memattr_t memattr; /* default memory attribute for pages */ objtype_t type; /* type of pager */ u_short flags; /* see below */ u_short pg_color; /* (c) color of first page in obj */ u_int paging_in_progress; /* Paging (in or out) so don't collapse or destroy */ int resident_page_count; /* number of resident pages */ struct vm_object *backing_object; /* object that I'm a shadow of */ vm_ooffset_t backing_object_offset;/* Offset in backing object */ TAILQ_ENTRY(vm_object) pager_object_list; /* list of all objects of this pager type */ LIST_HEAD(, vm_reserv) rvq; /* list of reservations */ struct vm_radix cache; /* (o + f) root of the cache page radix trie */ void *handle; union { /* * VNode pager * * vnp_size - current size of file */ struct { off_t vnp_size; vm_ooffset_t writemappings; } vnp; /* * Device pager * * devp_pglist - list of allocated pages */ struct { TAILQ_HEAD(, vm_page) devp_pglist; struct cdev_pager_ops *ops; struct cdev *dev; } devp; /* * SG pager * * sgp_pglist - list of allocated pages */ struct { TAILQ_HEAD(, vm_page) sgp_pglist; } sgp; /* * Swap pager * * swp_tmpfs - back-pointer to the tmpfs vnode, * if any, which uses the vm object * as backing store. The handle * cannot be reused for linking, * because the vnode can be * reclaimed and recreated, making * the handle changed and hash-chain * invalid. * * swp_bcount - number of swap 'swblock' metablocks, each * contains up to 16 swapblk assignments. * see vm/swap_pager.h */ struct { void *swp_tmpfs; int swp_bcount; } swp; } un_pager; struct ucred *cred; vm_ooffset_t charge; void *umtx_data; }; /* * Flags */ #define OBJ_FICTITIOUS 0x0001 /* (c) contains fictitious pages */ #define OBJ_UNMANAGED 0x0002 /* (c) contains unmanaged pages */ #define OBJ_ACTIVE 0x0004 /* active objects */ #define OBJ_DEAD 0x0008 /* dead objects (during rundown) */ #define OBJ_NOSPLIT 0x0010 /* dont split this object */ #define OBJ_UMTXDEAD 0x0020 /* umtx pshared was terminated */ #define OBJ_PIPWNT 0x0040 /* paging in progress wanted */ #define OBJ_MIGHTBEDIRTY 0x0100 /* object might be dirty, only for vnode */ #define OBJ_TMPFS_NODE 0x0200 /* object belongs to tmpfs VREG node */ #define OBJ_TMPFS_DIRTY 0x0400 /* dirty tmpfs obj */ #define OBJ_COLORED 0x1000 /* pg_color is defined */ #define OBJ_ONEMAPPING 0x2000 /* One USE (a single, non-forked) mapping flag */ #define OBJ_DISCONNECTWNT 0x4000 /* disconnect from vnode wanted */ #define OBJ_TMPFS 0x8000 /* has tmpfs vnode allocated */ #define IDX_TO_OFF(idx) (((vm_ooffset_t)(idx)) << PAGE_SHIFT) #define OFF_TO_IDX(off) ((vm_pindex_t)(((vm_ooffset_t)(off)) >> PAGE_SHIFT)) #ifdef _KERNEL #define OBJPC_SYNC 0x1 /* sync I/O */ #define OBJPC_INVAL 0x2 /* invalidate */ #define OBJPC_NOSYNC 0x4 /* skip if VPO_NOSYNC */ /* * The following options are supported by vm_object_page_remove(). */ #define OBJPR_CLEANONLY 0x1 /* Don't remove dirty pages. */ #define OBJPR_NOTMAPPED 0x2 /* Don't unmap pages. */ TAILQ_HEAD(object_q, vm_object); extern struct object_q vm_object_list; /* list of allocated objects */ extern struct mtx vm_object_list_mtx; /* lock for object list and count */ extern struct vm_object kernel_object_store; extern struct vm_object kmem_object_store; #define kernel_object (&kernel_object_store) #define kmem_object (&kmem_object_store) #define VM_OBJECT_ASSERT_LOCKED(object) \ rw_assert(&(object)->lock, RA_LOCKED) #define VM_OBJECT_ASSERT_RLOCKED(object) \ rw_assert(&(object)->lock, RA_RLOCKED) #define VM_OBJECT_ASSERT_WLOCKED(object) \ rw_assert(&(object)->lock, RA_WLOCKED) #define VM_OBJECT_ASSERT_UNLOCKED(object) \ rw_assert(&(object)->lock, RA_UNLOCKED) #define VM_OBJECT_LOCK_DOWNGRADE(object) \ rw_downgrade(&(object)->lock) #define VM_OBJECT_RLOCK(object) \ rw_rlock(&(object)->lock) #define VM_OBJECT_RUNLOCK(object) \ rw_runlock(&(object)->lock) #define VM_OBJECT_SLEEP(object, wchan, pri, wmesg, timo) \ rw_sleep((wchan), &(object)->lock, (pri), (wmesg), (timo)) #define VM_OBJECT_TRYRLOCK(object) \ rw_try_rlock(&(object)->lock) #define VM_OBJECT_TRYWLOCK(object) \ rw_try_wlock(&(object)->lock) #define VM_OBJECT_TRYUPGRADE(object) \ rw_try_upgrade(&(object)->lock) #define VM_OBJECT_WLOCK(object) \ rw_wlock(&(object)->lock) #define VM_OBJECT_WOWNED(object) \ rw_wowned(&(object)->lock) #define VM_OBJECT_WUNLOCK(object) \ rw_wunlock(&(object)->lock) /* * The object must be locked or thread private. */ static __inline void vm_object_set_flag(vm_object_t object, u_short bits) { object->flags |= bits; } /* * Conditionally set the object's color, which (1) enables the allocation * of physical memory reservations for anonymous objects and larger-than- * superpage-sized named objects and (2) determines the first page offset * within the object at which a reservation may be allocated. In other * words, the color determines the alignment of the object with respect * to the largest superpage boundary. When mapping named objects, like * files or POSIX shared memory objects, the color should be set to zero * before a virtual address is selected for the mapping. In contrast, * for anonymous objects, the color may be set after the virtual address * is selected. * * The object must be locked. */ static __inline void vm_object_color(vm_object_t object, u_short color) { if ((object->flags & OBJ_COLORED) == 0) { object->pg_color = color; object->flags |= OBJ_COLORED; } } void vm_object_clear_flag(vm_object_t object, u_short bits); void vm_object_pip_add(vm_object_t object, short i); void vm_object_pip_subtract(vm_object_t object, short i); void vm_object_pip_wakeup(vm_object_t object); void vm_object_pip_wakeupn(vm_object_t object, short i); void vm_object_pip_wait(vm_object_t object, char *waitid); static __inline boolean_t vm_object_cache_is_empty(vm_object_t object) { return (vm_radix_is_empty(&object->cache)); } void umtx_shm_object_init(vm_object_t object); void umtx_shm_object_terminated(vm_object_t object); +extern int umtx_shm_vnobj_persistent; vm_object_t vm_object_allocate (objtype_t, vm_pindex_t); boolean_t vm_object_coalesce(vm_object_t, vm_ooffset_t, vm_size_t, vm_size_t, boolean_t); void vm_object_collapse (vm_object_t); void vm_object_deallocate (vm_object_t); void vm_object_destroy (vm_object_t); void vm_object_terminate (vm_object_t); void vm_object_set_writeable_dirty (vm_object_t); void vm_object_init (void); void vm_object_madvise(vm_object_t, vm_pindex_t, vm_pindex_t, int); boolean_t vm_object_page_clean(vm_object_t object, vm_ooffset_t start, vm_ooffset_t end, int flags); void vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, vm_pindex_t end); void vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int options); boolean_t vm_object_populate(vm_object_t, vm_pindex_t, vm_pindex_t); void vm_object_print(long addr, boolean_t have_addr, long count, char *modif); void vm_object_reference (vm_object_t); void vm_object_reference_locked(vm_object_t); int vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr); void vm_object_shadow (vm_object_t *, vm_ooffset_t *, vm_size_t); void vm_object_split(vm_map_entry_t); boolean_t vm_object_sync(vm_object_t, vm_ooffset_t, vm_size_t, boolean_t, boolean_t); void vm_object_unwire(vm_object_t object, vm_ooffset_t offset, vm_size_t length, uint8_t queue); struct vnode *vm_object_vnode(vm_object_t object); #endif /* _KERNEL */ #endif /* _VM_OBJECT_ */ Index: head/sys/vm/vnode_pager.c =================================================================== --- head/sys/vm/vnode_pager.c (revision 300042) +++ head/sys/vm/vnode_pager.c (revision 300043) @@ -1,1371 +1,1372 @@ /*- * Copyright (c) 1990 University of Utah. * Copyright (c) 1991 The Regents of the University of California. * All rights reserved. * Copyright (c) 1993, 1994 John S. Dyson * Copyright (c) 1995, David Greenman * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vnode_pager.c 7.5 (Berkeley) 4/20/91 */ /* * Page to/from files (vnodes). */ /* * TODO: * Implement VOP_GETPAGES/PUTPAGES interface for filesystems. Will * greatly re-simplify the vnode_pager. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int vnode_pager_addr(struct vnode *vp, vm_ooffset_t address, daddr_t *rtaddress, int *run); static int vnode_pager_input_smlfs(vm_object_t object, vm_page_t m); static int vnode_pager_input_old(vm_object_t object, vm_page_t m); static void vnode_pager_dealloc(vm_object_t); static int vnode_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *); static int vnode_pager_getpages_async(vm_object_t, vm_page_t *, int, int *, int *, vop_getpages_iodone_t, void *); static void vnode_pager_putpages(vm_object_t, vm_page_t *, int, int, int *); static boolean_t vnode_pager_haspage(vm_object_t, vm_pindex_t, int *, int *); static vm_object_t vnode_pager_alloc(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t, struct ucred *cred); static int vnode_pager_generic_getpages_done(struct buf *); static void vnode_pager_generic_getpages_done_async(struct buf *); struct pagerops vnodepagerops = { .pgo_alloc = vnode_pager_alloc, .pgo_dealloc = vnode_pager_dealloc, .pgo_getpages = vnode_pager_getpages, .pgo_getpages_async = vnode_pager_getpages_async, .pgo_putpages = vnode_pager_putpages, .pgo_haspage = vnode_pager_haspage, }; int vnode_pbuf_freecnt; int vnode_async_pbuf_freecnt; /* Create the VM system backing object for this vnode */ int vnode_create_vobject(struct vnode *vp, off_t isize, struct thread *td) { vm_object_t object; vm_ooffset_t size = isize; struct vattr va; if (!vn_isdisk(vp, NULL) && vn_canvmio(vp) == FALSE) return (0); while ((object = vp->v_object) != NULL) { VM_OBJECT_WLOCK(object); if (!(object->flags & OBJ_DEAD)) { VM_OBJECT_WUNLOCK(object); return (0); } VOP_UNLOCK(vp, 0); vm_object_set_flag(object, OBJ_DISCONNECTWNT); VM_OBJECT_SLEEP(object, object, PDROP | PVM, "vodead", 0); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } if (size == 0) { if (vn_isdisk(vp, NULL)) { size = IDX_TO_OFF(INT_MAX); } else { if (VOP_GETATTR(vp, &va, td->td_ucred)) return (0); size = va.va_size; } } object = vnode_pager_alloc(vp, size, 0, 0, td->td_ucred); /* * Dereference the reference we just created. This assumes * that the object is associated with the vp. */ VM_OBJECT_WLOCK(object); object->ref_count--; VM_OBJECT_WUNLOCK(object); vrele(vp); KASSERT(vp->v_object != NULL, ("vnode_create_vobject: NULL object")); return (0); } void vnode_destroy_vobject(struct vnode *vp) { struct vm_object *obj; obj = vp->v_object; if (obj == NULL) return; ASSERT_VOP_ELOCKED(vp, "vnode_destroy_vobject"); VM_OBJECT_WLOCK(obj); + umtx_shm_object_terminated(obj); if (obj->ref_count == 0) { /* * don't double-terminate the object */ if ((obj->flags & OBJ_DEAD) == 0) vm_object_terminate(obj); else VM_OBJECT_WUNLOCK(obj); } else { /* * Woe to the process that tries to page now :-). */ vm_pager_deallocate(obj); VM_OBJECT_WUNLOCK(obj); } vp->v_object = NULL; } /* * Allocate (or lookup) pager for a vnode. * Handle is a vnode pointer. * * MPSAFE */ vm_object_t vnode_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset, struct ucred *cred) { vm_object_t object; struct vnode *vp; /* * Pageout to vnode, no can do yet. */ if (handle == NULL) return (NULL); vp = (struct vnode *) handle; /* * If the object is being terminated, wait for it to * go away. */ retry: while ((object = vp->v_object) != NULL) { VM_OBJECT_WLOCK(object); if ((object->flags & OBJ_DEAD) == 0) break; vm_object_set_flag(object, OBJ_DISCONNECTWNT); VM_OBJECT_SLEEP(object, object, PDROP | PVM, "vadead", 0); } KASSERT(vp->v_usecount != 0, ("vnode_pager_alloc: no vnode reference")); if (object == NULL) { /* * Add an object of the appropriate size */ object = vm_object_allocate(OBJT_VNODE, OFF_TO_IDX(round_page(size))); object->un_pager.vnp.vnp_size = size; object->un_pager.vnp.writemappings = 0; object->handle = handle; VI_LOCK(vp); if (vp->v_object != NULL) { /* * Object has been created while we were sleeping */ VI_UNLOCK(vp); VM_OBJECT_WLOCK(object); KASSERT(object->ref_count == 1, ("leaked ref %p %d", object, object->ref_count)); object->type = OBJT_DEAD; object->ref_count = 0; VM_OBJECT_WUNLOCK(object); vm_object_destroy(object); goto retry; } vp->v_object = object; VI_UNLOCK(vp); } else { object->ref_count++; #if VM_NRESERVLEVEL > 0 vm_object_color(object, 0); #endif VM_OBJECT_WUNLOCK(object); } vref(vp); return (object); } /* * The object must be locked. */ static void vnode_pager_dealloc(vm_object_t object) { struct vnode *vp; int refs; vp = object->handle; if (vp == NULL) panic("vnode_pager_dealloc: pager already dealloced"); VM_OBJECT_ASSERT_WLOCKED(object); vm_object_pip_wait(object, "vnpdea"); refs = object->ref_count; object->handle = NULL; object->type = OBJT_DEAD; if (object->flags & OBJ_DISCONNECTWNT) { vm_object_clear_flag(object, OBJ_DISCONNECTWNT); wakeup(object); } ASSERT_VOP_ELOCKED(vp, "vnode_pager_dealloc"); if (object->un_pager.vnp.writemappings > 0) { object->un_pager.vnp.writemappings = 0; VOP_ADD_WRITECOUNT(vp, -1); CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d", __func__, vp, vp->v_writecount); } vp->v_object = NULL; VOP_UNSET_TEXT(vp); VM_OBJECT_WUNLOCK(object); while (refs-- > 0) vunref(vp); VM_OBJECT_WLOCK(object); } static boolean_t vnode_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after) { struct vnode *vp = object->handle; daddr_t bn; int err; daddr_t reqblock; int poff; int bsize; int pagesperblock, blocksperpage; VM_OBJECT_ASSERT_WLOCKED(object); /* * If no vp or vp is doomed or marked transparent to VM, we do not * have the page. */ if (vp == NULL || vp->v_iflag & VI_DOOMED) return FALSE; /* * If the offset is beyond end of file we do * not have the page. */ if (IDX_TO_OFF(pindex) >= object->un_pager.vnp.vnp_size) return FALSE; bsize = vp->v_mount->mnt_stat.f_iosize; pagesperblock = bsize / PAGE_SIZE; blocksperpage = 0; if (pagesperblock > 0) { reqblock = pindex / pagesperblock; } else { blocksperpage = (PAGE_SIZE / bsize); reqblock = pindex * blocksperpage; } VM_OBJECT_WUNLOCK(object); err = VOP_BMAP(vp, reqblock, NULL, &bn, after, before); VM_OBJECT_WLOCK(object); if (err) return TRUE; if (bn == -1) return FALSE; if (pagesperblock > 0) { poff = pindex - (reqblock * pagesperblock); if (before) { *before *= pagesperblock; *before += poff; } if (after) { /* * The BMAP vop can report a partial block in the * 'after', but must not report blocks after EOF. * Assert the latter, and truncate 'after' in case * of the former. */ KASSERT((reqblock + *after) * pagesperblock < roundup2(object->size, pagesperblock), ("%s: reqblock %jd after %d size %ju", __func__, (intmax_t )reqblock, *after, (uintmax_t )object->size)); *after *= pagesperblock; *after += pagesperblock - (poff + 1); if (pindex + *after >= object->size) *after = object->size - 1 - pindex; } } else { if (before) { *before /= blocksperpage; } if (after) { *after /= blocksperpage; } } return TRUE; } /* * Lets the VM system know about a change in size for a file. * We adjust our own internal size and flush any cached pages in * the associated object that are affected by the size change. * * Note: this routine may be invoked as a result of a pager put * operation (possibly at object termination time), so we must be careful. */ void vnode_pager_setsize(struct vnode *vp, vm_ooffset_t nsize) { vm_object_t object; vm_page_t m; vm_pindex_t nobjsize; if ((object = vp->v_object) == NULL) return; /* ASSERT_VOP_ELOCKED(vp, "vnode_pager_setsize and not locked vnode"); */ VM_OBJECT_WLOCK(object); if (object->type == OBJT_DEAD) { VM_OBJECT_WUNLOCK(object); return; } KASSERT(object->type == OBJT_VNODE, ("not vnode-backed object %p", object)); if (nsize == object->un_pager.vnp.vnp_size) { /* * Hasn't changed size */ VM_OBJECT_WUNLOCK(object); return; } nobjsize = OFF_TO_IDX(nsize + PAGE_MASK); if (nsize < object->un_pager.vnp.vnp_size) { /* * File has shrunk. Toss any cached pages beyond the new EOF. */ if (nobjsize < object->size) vm_object_page_remove(object, nobjsize, object->size, 0); /* * this gets rid of garbage at the end of a page that is now * only partially backed by the vnode. * * XXX for some reason (I don't know yet), if we take a * completely invalid page and mark it partially valid * it can screw up NFS reads, so we don't allow the case. */ if ((nsize & PAGE_MASK) && (m = vm_page_lookup(object, OFF_TO_IDX(nsize))) != NULL && m->valid != 0) { int base = (int)nsize & PAGE_MASK; int size = PAGE_SIZE - base; /* * Clear out partial-page garbage in case * the page has been mapped. */ pmap_zero_page_area(m, base, size); /* * Update the valid bits to reflect the blocks that * have been zeroed. Some of these valid bits may * have already been set. */ vm_page_set_valid_range(m, base, size); /* * Round "base" to the next block boundary so that the * dirty bit for a partially zeroed block is not * cleared. */ base = roundup2(base, DEV_BSIZE); /* * Clear out partial-page dirty bits. * * note that we do not clear out the valid * bits. This would prevent bogus_page * replacement from working properly. */ vm_page_clear_dirty(m, base, PAGE_SIZE - base); } else if ((nsize & PAGE_MASK) && vm_page_is_cached(object, OFF_TO_IDX(nsize))) { vm_page_cache_free(object, OFF_TO_IDX(nsize), nobjsize); } } object->un_pager.vnp.vnp_size = nsize; object->size = nobjsize; VM_OBJECT_WUNLOCK(object); } /* * calculate the linear (byte) disk address of specified virtual * file address */ static int vnode_pager_addr(struct vnode *vp, vm_ooffset_t address, daddr_t *rtaddress, int *run) { int bsize; int err; daddr_t vblock; daddr_t voffset; if (address < 0) return -1; if (vp->v_iflag & VI_DOOMED) return -1; bsize = vp->v_mount->mnt_stat.f_iosize; vblock = address / bsize; voffset = address % bsize; err = VOP_BMAP(vp, vblock, NULL, rtaddress, run, NULL); if (err == 0) { if (*rtaddress != -1) *rtaddress += voffset / DEV_BSIZE; if (run) { *run += 1; *run *= bsize/PAGE_SIZE; *run -= voffset/PAGE_SIZE; } } return (err); } /* * small block filesystem vnode pager input */ static int vnode_pager_input_smlfs(vm_object_t object, vm_page_t m) { struct vnode *vp; struct bufobj *bo; struct buf *bp; struct sf_buf *sf; daddr_t fileaddr; vm_offset_t bsize; vm_page_bits_t bits; int error, i; error = 0; vp = object->handle; if (vp->v_iflag & VI_DOOMED) return VM_PAGER_BAD; bsize = vp->v_mount->mnt_stat.f_iosize; VOP_BMAP(vp, 0, &bo, 0, NULL, NULL); sf = sf_buf_alloc(m, 0); for (i = 0; i < PAGE_SIZE / bsize; i++) { vm_ooffset_t address; bits = vm_page_bits(i * bsize, bsize); if (m->valid & bits) continue; address = IDX_TO_OFF(m->pindex) + i * bsize; if (address >= object->un_pager.vnp.vnp_size) { fileaddr = -1; } else { error = vnode_pager_addr(vp, address, &fileaddr, NULL); if (error) break; } if (fileaddr != -1) { bp = getpbuf(&vnode_pbuf_freecnt); /* build a minimal buffer header */ bp->b_iocmd = BIO_READ; bp->b_iodone = bdone; KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred")); KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred")); bp->b_rcred = crhold(curthread->td_ucred); bp->b_wcred = crhold(curthread->td_ucred); bp->b_data = (caddr_t)sf_buf_kva(sf) + i * bsize; bp->b_blkno = fileaddr; pbgetbo(bo, bp); bp->b_vp = vp; bp->b_bcount = bsize; bp->b_bufsize = bsize; bp->b_runningbufspace = bp->b_bufsize; atomic_add_long(&runningbufspace, bp->b_runningbufspace); /* do the input */ bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); bwait(bp, PVM, "vnsrd"); if ((bp->b_ioflags & BIO_ERROR) != 0) error = EIO; /* * free the buffer header back to the swap buffer pool */ bp->b_vp = NULL; pbrelbo(bp); relpbuf(bp, &vnode_pbuf_freecnt); if (error) break; } else bzero((caddr_t)sf_buf_kva(sf) + i * bsize, bsize); KASSERT((m->dirty & bits) == 0, ("vnode_pager_input_smlfs: page %p is dirty", m)); VM_OBJECT_WLOCK(object); m->valid |= bits; VM_OBJECT_WUNLOCK(object); } sf_buf_free(sf); if (error) { return VM_PAGER_ERROR; } return VM_PAGER_OK; } /* * old style vnode pager input routine */ static int vnode_pager_input_old(vm_object_t object, vm_page_t m) { struct uio auio; struct iovec aiov; int error; int size; struct sf_buf *sf; struct vnode *vp; VM_OBJECT_ASSERT_WLOCKED(object); error = 0; /* * Return failure if beyond current EOF */ if (IDX_TO_OFF(m->pindex) >= object->un_pager.vnp.vnp_size) { return VM_PAGER_BAD; } else { size = PAGE_SIZE; if (IDX_TO_OFF(m->pindex) + size > object->un_pager.vnp.vnp_size) size = object->un_pager.vnp.vnp_size - IDX_TO_OFF(m->pindex); vp = object->handle; VM_OBJECT_WUNLOCK(object); /* * Allocate a kernel virtual address and initialize so that * we can use VOP_READ/WRITE routines. */ sf = sf_buf_alloc(m, 0); aiov.iov_base = (caddr_t)sf_buf_kva(sf); aiov.iov_len = size; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = IDX_TO_OFF(m->pindex); auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_resid = size; auio.uio_td = curthread; error = VOP_READ(vp, &auio, 0, curthread->td_ucred); if (!error) { int count = size - auio.uio_resid; if (count == 0) error = EINVAL; else if (count != PAGE_SIZE) bzero((caddr_t)sf_buf_kva(sf) + count, PAGE_SIZE - count); } sf_buf_free(sf); VM_OBJECT_WLOCK(object); } KASSERT(m->dirty == 0, ("vnode_pager_input_old: page %p is dirty", m)); if (!error) m->valid = VM_PAGE_BITS_ALL; return error ? VM_PAGER_ERROR : VM_PAGER_OK; } /* * generic vnode pager input routine */ /* * Local media VFS's that do not implement their own VOP_GETPAGES * should have their VOP_GETPAGES call to vnode_pager_generic_getpages() * to implement the previous behaviour. * * All other FS's should use the bypass to get to the local media * backing vp's VOP_GETPAGES. */ static int vnode_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind, int *rahead) { struct vnode *vp; int rtval; vp = object->handle; VM_OBJECT_WUNLOCK(object); rtval = VOP_GETPAGES(vp, m, count, rbehind, rahead); KASSERT(rtval != EOPNOTSUPP, ("vnode_pager: FS getpages not implemented\n")); VM_OBJECT_WLOCK(object); return rtval; } static int vnode_pager_getpages_async(vm_object_t object, vm_page_t *m, int count, int *rbehind, int *rahead, vop_getpages_iodone_t iodone, void *arg) { struct vnode *vp; int rtval; vp = object->handle; VM_OBJECT_WUNLOCK(object); rtval = VOP_GETPAGES_ASYNC(vp, m, count, rbehind, rahead, iodone, arg); KASSERT(rtval != EOPNOTSUPP, ("vnode_pager: FS getpages_async not implemented\n")); VM_OBJECT_WLOCK(object); return (rtval); } /* * The implementation of VOP_GETPAGES() and VOP_GETPAGES_ASYNC() for * local filesystems, where partially valid pages can only occur at * the end of file. */ int vnode_pager_local_getpages(struct vop_getpages_args *ap) { return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, NULL, NULL)); } int vnode_pager_local_getpages_async(struct vop_getpages_async_args *ap) { return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg)); } /* * This is now called from local media FS's to operate against their * own vnodes if they fail to implement VOP_GETPAGES. */ int vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int count, int *a_rbehind, int *a_rahead, vop_getpages_iodone_t iodone, void *arg) { vm_object_t object; struct bufobj *bo; struct buf *bp; off_t foff; int bsize, pagesperblock, *freecnt; int error, before, after, rbehind, rahead, poff, i; int bytecount, secmask; KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, ("%s does not support devices", __func__)); if (vp->v_iflag & VI_DOOMED) return (VM_PAGER_BAD); object = vp->v_object; foff = IDX_TO_OFF(m[0]->pindex); bsize = vp->v_mount->mnt_stat.f_iosize; pagesperblock = bsize / PAGE_SIZE; KASSERT(foff < object->un_pager.vnp.vnp_size, ("%s: page %p offset beyond vp %p size", __func__, m[0], vp)); KASSERT(count <= sizeof(bp->b_pages), ("%s: requested %d pages", __func__, count)); /* * The last page has valid blocks. Invalid part can only * exist at the end of file, and the page is made fully valid * by zeroing in vm_pager_get_pages(). */ if (m[count - 1]->valid != 0 && --count == 0) { if (iodone != NULL) iodone(arg, m, 1, 0); return (VM_PAGER_OK); } /* * Synchronous and asynchronous paging operations use different * free pbuf counters. This is done to avoid asynchronous requests * to consume all pbufs. * Allocate the pbuf at the very beginning of the function, so that * if we are low on certain kind of pbufs don't even proceed to BMAP, * but sleep. */ freecnt = iodone != NULL ? &vnode_async_pbuf_freecnt : &vnode_pbuf_freecnt; bp = getpbuf(freecnt); /* * Get the underlying device blocks for the file with VOP_BMAP(). * If the file system doesn't support VOP_BMAP, use old way of * getting pages via VOP_READ. */ error = VOP_BMAP(vp, foff / bsize, &bo, &bp->b_blkno, &after, &before); if (error == EOPNOTSUPP) { relpbuf(bp, freecnt); VM_OBJECT_WLOCK(object); for (i = 0; i < count; i++) { PCPU_INC(cnt.v_vnodein); PCPU_INC(cnt.v_vnodepgsin); error = vnode_pager_input_old(object, m[i]); if (error) break; } VM_OBJECT_WUNLOCK(object); return (error); } else if (error != 0) { relpbuf(bp, freecnt); return (VM_PAGER_ERROR); } /* * If the file system supports BMAP, but blocksize is smaller * than a page size, then use special small filesystem code. */ if (pagesperblock == 0) { relpbuf(bp, freecnt); for (i = 0; i < count; i++) { PCPU_INC(cnt.v_vnodein); PCPU_INC(cnt.v_vnodepgsin); error = vnode_pager_input_smlfs(object, m[i]); if (error) break; } return (error); } /* * A sparse file can be encountered only for a single page request, * which may not be preceded by call to vm_pager_haspage(). */ if (bp->b_blkno == -1) { KASSERT(count == 1, ("%s: array[%d] request to a sparse file %p", __func__, count, vp)); relpbuf(bp, freecnt); pmap_zero_page(m[0]); KASSERT(m[0]->dirty == 0, ("%s: page %p is dirty", __func__, m[0])); VM_OBJECT_WLOCK(object); m[0]->valid = VM_PAGE_BITS_ALL; VM_OBJECT_WUNLOCK(object); return (VM_PAGER_OK); } bp->b_blkno += (foff % bsize) / DEV_BSIZE; /* Recalculate blocks available after/before to pages. */ poff = (foff % bsize) / PAGE_SIZE; before *= pagesperblock; before += poff; after *= pagesperblock; after += pagesperblock - (poff + 1); if (m[0]->pindex + after >= object->size) after = object->size - 1 - m[0]->pindex; KASSERT(count <= after + 1, ("%s: %d pages asked, can do only %d", __func__, count, after + 1)); after -= count - 1; /* Trim requested rbehind/rahead to possible values. */ rbehind = a_rbehind ? *a_rbehind : 0; rahead = a_rahead ? *a_rahead : 0; rbehind = min(rbehind, before); rbehind = min(rbehind, m[0]->pindex); rahead = min(rahead, after); rahead = min(rahead, object->size - m[count - 1]->pindex); KASSERT(rbehind + rahead + count <= sizeof(bp->b_pages), ("%s: behind %d ahead %d count %d", __func__, rbehind, rahead, count)); /* * Fill in the bp->b_pages[] array with requested and optional * read behind or read ahead pages. Read behind pages are looked * up in a backward direction, down to a first cached page. Same * for read ahead pages, but there is no need to shift the array * in case of encountering a cached page. */ i = bp->b_npages = 0; if (rbehind) { vm_pindex_t startpindex, tpindex; vm_page_t p; VM_OBJECT_WLOCK(object); startpindex = m[0]->pindex - rbehind; if ((p = TAILQ_PREV(m[0], pglist, listq)) != NULL && p->pindex >= startpindex) startpindex = p->pindex + 1; /* tpindex is unsigned; beware of numeric underflow. */ for (tpindex = m[0]->pindex - 1; tpindex >= startpindex && tpindex < m[0]->pindex; tpindex--, i++) { p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL | VM_ALLOC_IFNOTCACHED); if (p == NULL) { /* Shift the array. */ for (int j = 0; j < i; j++) bp->b_pages[j] = bp->b_pages[j + tpindex + 1 - startpindex]; break; } bp->b_pages[tpindex - startpindex] = p; } bp->b_pgbefore = i; bp->b_npages += i; bp->b_blkno -= IDX_TO_OFF(i) / DEV_BSIZE; } else bp->b_pgbefore = 0; /* Requested pages. */ for (int j = 0; j < count; j++, i++) bp->b_pages[i] = m[j]; bp->b_npages += count; if (rahead) { vm_pindex_t endpindex, tpindex; vm_page_t p; if (!VM_OBJECT_WOWNED(object)) VM_OBJECT_WLOCK(object); endpindex = m[count - 1]->pindex + rahead + 1; if ((p = TAILQ_NEXT(m[count - 1], listq)) != NULL && p->pindex < endpindex) endpindex = p->pindex; if (endpindex > object->size) endpindex = object->size; for (tpindex = m[count - 1]->pindex + 1; tpindex < endpindex; i++, tpindex++) { p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL | VM_ALLOC_IFNOTCACHED); if (p == NULL) break; bp->b_pages[i] = p; } bp->b_pgafter = i - bp->b_npages; bp->b_npages = i; } else bp->b_pgafter = 0; if (VM_OBJECT_WOWNED(object)) VM_OBJECT_WUNLOCK(object); /* Report back actual behind/ahead read. */ if (a_rbehind) *a_rbehind = bp->b_pgbefore; if (a_rahead) *a_rahead = bp->b_pgafter; KASSERT(bp->b_npages <= sizeof(bp->b_pages), ("%s: buf %p overflowed", __func__, bp)); /* * Recalculate first offset and bytecount with regards to read behind. * Truncate bytecount to vnode real size and round up physical size * for real devices. */ foff = IDX_TO_OFF(bp->b_pages[0]->pindex); bytecount = bp->b_npages << PAGE_SHIFT; if ((foff + bytecount) > object->un_pager.vnp.vnp_size) bytecount = object->un_pager.vnp.vnp_size - foff; secmask = bo->bo_bsize - 1; KASSERT(secmask < PAGE_SIZE && secmask > 0, ("%s: sector size %d too large", __func__, secmask + 1)); bytecount = (bytecount + secmask) & ~secmask; /* * And map the pages to be read into the kva, if the filesystem * requires mapped buffers. */ if ((vp->v_mount->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0 && unmapped_buf_allowed) { bp->b_data = unmapped_buf; bp->b_offset = 0; } else { bp->b_data = bp->b_kvabase; pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); } /* Build a minimal buffer header. */ bp->b_iocmd = BIO_READ; KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred")); KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred")); bp->b_rcred = crhold(curthread->td_ucred); bp->b_wcred = crhold(curthread->td_ucred); pbgetbo(bo, bp); bp->b_vp = vp; bp->b_bcount = bp->b_bufsize = bp->b_runningbufspace = bytecount; bp->b_iooffset = dbtob(bp->b_blkno); atomic_add_long(&runningbufspace, bp->b_runningbufspace); PCPU_INC(cnt.v_vnodein); PCPU_ADD(cnt.v_vnodepgsin, bp->b_npages); if (iodone != NULL) { /* async */ bp->b_pgiodone = iodone; bp->b_caller1 = arg; bp->b_iodone = vnode_pager_generic_getpages_done_async; bp->b_flags |= B_ASYNC; BUF_KERNPROC(bp); bstrategy(bp); return (VM_PAGER_OK); } else { bp->b_iodone = bdone; bstrategy(bp); bwait(bp, PVM, "vnread"); error = vnode_pager_generic_getpages_done(bp); for (i = 0; i < bp->b_npages; i++) bp->b_pages[i] = NULL; bp->b_vp = NULL; pbrelbo(bp); relpbuf(bp, &vnode_pbuf_freecnt); return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK); } } static void vnode_pager_generic_getpages_done_async(struct buf *bp) { int error; error = vnode_pager_generic_getpages_done(bp); /* Run the iodone upon the requested range. */ bp->b_pgiodone(bp->b_caller1, bp->b_pages + bp->b_pgbefore, bp->b_npages - bp->b_pgbefore - bp->b_pgafter, error); for (int i = 0; i < bp->b_npages; i++) bp->b_pages[i] = NULL; bp->b_vp = NULL; pbrelbo(bp); relpbuf(bp, &vnode_async_pbuf_freecnt); } static int vnode_pager_generic_getpages_done(struct buf *bp) { vm_object_t object; off_t tfoff, nextoff; int i, error; error = (bp->b_ioflags & BIO_ERROR) != 0 ? EIO : 0; object = bp->b_vp->v_object; if (error == 0 && bp->b_bcount != bp->b_npages * PAGE_SIZE) { if (!buf_mapped(bp)) { bp->b_data = bp->b_kvabase; pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages); } bzero(bp->b_data + bp->b_bcount, PAGE_SIZE * bp->b_npages - bp->b_bcount); } if (buf_mapped(bp)) { pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages); bp->b_data = unmapped_buf; } VM_OBJECT_WLOCK(object); for (i = 0, tfoff = IDX_TO_OFF(bp->b_pages[0]->pindex); i < bp->b_npages; i++, tfoff = nextoff) { vm_page_t mt; nextoff = tfoff + PAGE_SIZE; mt = bp->b_pages[i]; if (nextoff <= object->un_pager.vnp.vnp_size) { /* * Read filled up entire page. */ mt->valid = VM_PAGE_BITS_ALL; KASSERT(mt->dirty == 0, ("%s: page %p is dirty", __func__, mt)); KASSERT(!pmap_page_is_mapped(mt), ("%s: page %p is mapped", __func__, mt)); } else { /* * Read did not fill up entire page. * * Currently we do not set the entire page valid, * we just try to clear the piece that we couldn't * read. */ vm_page_set_valid_range(mt, 0, object->un_pager.vnp.vnp_size - tfoff); KASSERT((mt->dirty & vm_page_bits(0, object->un_pager.vnp.vnp_size - tfoff)) == 0, ("%s: page %p is dirty", __func__, mt)); } if (i < bp->b_pgbefore || i >= bp->b_npages - bp->b_pgafter) vm_page_readahead_finish(mt); } VM_OBJECT_WUNLOCK(object); if (error != 0) printf("%s: I/O read error %d\n", __func__, error); return (error); } /* * EOPNOTSUPP is no longer legal. For local media VFS's that do not * implement their own VOP_PUTPAGES, their VOP_PUTPAGES should call to * vnode_pager_generic_putpages() to implement the previous behaviour. * * All other FS's should use the bypass to get to the local media * backing vp's VOP_PUTPAGES. */ static void vnode_pager_putpages(vm_object_t object, vm_page_t *m, int count, int flags, int *rtvals) { int rtval; struct vnode *vp; int bytes = count * PAGE_SIZE; /* * Force synchronous operation if we are extremely low on memory * to prevent a low-memory deadlock. VOP operations often need to * allocate more memory to initiate the I/O ( i.e. do a BMAP * operation ). The swapper handles the case by limiting the amount * of asynchronous I/O, but that sort of solution doesn't scale well * for the vnode pager without a lot of work. * * Also, the backing vnode's iodone routine may not wake the pageout * daemon up. This should be probably be addressed XXX. */ if (vm_cnt.v_free_count + vm_cnt.v_cache_count < vm_cnt.v_pageout_free_min) flags |= VM_PAGER_PUT_SYNC; /* * Call device-specific putpages function */ vp = object->handle; VM_OBJECT_WUNLOCK(object); rtval = VOP_PUTPAGES(vp, m, bytes, flags, rtvals); KASSERT(rtval != EOPNOTSUPP, ("vnode_pager: stale FS putpages\n")); VM_OBJECT_WLOCK(object); } /* * This is now called from local media FS's to operate against their * own vnodes if they fail to implement VOP_PUTPAGES. * * This is typically called indirectly via the pageout daemon and * clustering has already typically occurred, so in general we ask the * underlying filesystem to write the data out asynchronously rather * then delayed. */ int vnode_pager_generic_putpages(struct vnode *vp, vm_page_t *ma, int bytecount, int flags, int *rtvals) { int i; vm_object_t object; vm_page_t m; int count; int maxsize, ncount; vm_ooffset_t poffset; struct uio auio; struct iovec aiov; int error; int ioflags; int ppscheck = 0; static struct timeval lastfail; static int curfail; object = vp->v_object; count = bytecount / PAGE_SIZE; for (i = 0; i < count; i++) rtvals[i] = VM_PAGER_ERROR; if ((int64_t)ma[0]->pindex < 0) { printf("vnode_pager_putpages: attempt to write meta-data!!! -- 0x%lx(%lx)\n", (long)ma[0]->pindex, (u_long)ma[0]->dirty); rtvals[0] = VM_PAGER_BAD; return VM_PAGER_BAD; } maxsize = count * PAGE_SIZE; ncount = count; poffset = IDX_TO_OFF(ma[0]->pindex); /* * If the page-aligned write is larger then the actual file we * have to invalidate pages occurring beyond the file EOF. However, * there is an edge case where a file may not be page-aligned where * the last page is partially invalid. In this case the filesystem * may not properly clear the dirty bits for the entire page (which * could be VM_PAGE_BITS_ALL due to the page having been mmap()d). * With the page locked we are free to fix-up the dirty bits here. * * We do not under any circumstances truncate the valid bits, as * this will screw up bogus page replacement. */ VM_OBJECT_WLOCK(object); if (maxsize + poffset > object->un_pager.vnp.vnp_size) { if (object->un_pager.vnp.vnp_size > poffset) { int pgoff; maxsize = object->un_pager.vnp.vnp_size - poffset; ncount = btoc(maxsize); if ((pgoff = (int)maxsize & PAGE_MASK) != 0) { /* * If the object is locked and the following * conditions hold, then the page's dirty * field cannot be concurrently changed by a * pmap operation. */ m = ma[ncount - 1]; vm_page_assert_sbusied(m); KASSERT(!pmap_page_is_write_mapped(m), ("vnode_pager_generic_putpages: page %p is not read-only", m)); vm_page_clear_dirty(m, pgoff, PAGE_SIZE - pgoff); } } else { maxsize = 0; ncount = 0; } if (ncount < count) { for (i = ncount; i < count; i++) { rtvals[i] = VM_PAGER_BAD; } } } VM_OBJECT_WUNLOCK(object); /* * pageouts are already clustered, use IO_ASYNC to force a bawrite() * rather then a bdwrite() to prevent paging I/O from saturating * the buffer cache. Dummy-up the sequential heuristic to cause * large ranges to cluster. If neither IO_SYNC or IO_ASYNC is set, * the system decides how to cluster. */ ioflags = IO_VMIO; if (flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL)) ioflags |= IO_SYNC; else if ((flags & VM_PAGER_CLUSTER_OK) == 0) ioflags |= IO_ASYNC; ioflags |= (flags & VM_PAGER_PUT_INVAL) ? IO_INVAL: 0; ioflags |= IO_SEQMAX << IO_SEQSHIFT; aiov.iov_base = (caddr_t) 0; aiov.iov_len = maxsize; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = poffset; auio.uio_segflg = UIO_NOCOPY; auio.uio_rw = UIO_WRITE; auio.uio_resid = maxsize; auio.uio_td = (struct thread *) 0; error = VOP_WRITE(vp, &auio, ioflags, curthread->td_ucred); PCPU_INC(cnt.v_vnodeout); PCPU_ADD(cnt.v_vnodepgsout, ncount); if (error) { if ((ppscheck = ppsratecheck(&lastfail, &curfail, 1))) printf("vnode_pager_putpages: I/O error %d\n", error); } if (auio.uio_resid) { if (ppscheck || ppsratecheck(&lastfail, &curfail, 1)) printf("vnode_pager_putpages: residual I/O %zd at %lu\n", auio.uio_resid, (u_long)ma[0]->pindex); } for (i = 0; i < ncount; i++) { rtvals[i] = VM_PAGER_OK; } return rtvals[0]; } void vnode_pager_undirty_pages(vm_page_t *ma, int *rtvals, int written) { vm_object_t obj; int i, pos; if (written == 0) return; obj = ma[0]->object; VM_OBJECT_WLOCK(obj); for (i = 0, pos = 0; pos < written; i++, pos += PAGE_SIZE) { if (pos < trunc_page(written)) { rtvals[i] = VM_PAGER_OK; vm_page_undirty(ma[i]); } else { /* Partially written page. */ rtvals[i] = VM_PAGER_AGAIN; vm_page_clear_dirty(ma[i], 0, written & PAGE_MASK); } } VM_OBJECT_WUNLOCK(obj); } void vnode_pager_update_writecount(vm_object_t object, vm_offset_t start, vm_offset_t end) { struct vnode *vp; vm_ooffset_t old_wm; VM_OBJECT_WLOCK(object); if (object->type != OBJT_VNODE) { VM_OBJECT_WUNLOCK(object); return; } old_wm = object->un_pager.vnp.writemappings; object->un_pager.vnp.writemappings += (vm_ooffset_t)end - start; vp = object->handle; if (old_wm == 0 && object->un_pager.vnp.writemappings != 0) { ASSERT_VOP_ELOCKED(vp, "v_writecount inc"); VOP_ADD_WRITECOUNT(vp, 1); CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d", __func__, vp, vp->v_writecount); } else if (old_wm != 0 && object->un_pager.vnp.writemappings == 0) { ASSERT_VOP_ELOCKED(vp, "v_writecount dec"); VOP_ADD_WRITECOUNT(vp, -1); CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d", __func__, vp, vp->v_writecount); } VM_OBJECT_WUNLOCK(object); } void vnode_pager_release_writecount(vm_object_t object, vm_offset_t start, vm_offset_t end) { struct vnode *vp; struct mount *mp; vm_offset_t inc; VM_OBJECT_WLOCK(object); /* * First, recheck the object type to account for the race when * the vnode is reclaimed. */ if (object->type != OBJT_VNODE) { VM_OBJECT_WUNLOCK(object); return; } /* * Optimize for the case when writemappings is not going to * zero. */ inc = end - start; if (object->un_pager.vnp.writemappings != inc) { object->un_pager.vnp.writemappings -= inc; VM_OBJECT_WUNLOCK(object); return; } vp = object->handle; vhold(vp); VM_OBJECT_WUNLOCK(object); mp = NULL; vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* * Decrement the object's writemappings, by swapping the start * and end arguments for vnode_pager_update_writecount(). If * there was not a race with vnode reclaimation, then the * vnode's v_writecount is decremented. */ vnode_pager_update_writecount(object, end, start); VOP_UNLOCK(vp, 0); vdrop(vp); if (mp != NULL) vn_finished_write(mp); }